scripts/last-map-probs
author Martin C. Frith
Fri Jun 02 18:40:29 2017 +0900 (2017-06-02)
changeset 863 6a4915d5b5cb
parent 479 e707702f53f8
child 937 8a4fadbe6080
permissions -rwxr-xr-x
last-dotplot: get bp-per-pixel faster
Martin@61
     1
#! /usr/bin/env python
Martin@61
     2
Martin@411
     3
# Copyright 2010, 2011, 2012, 2014 Martin C. Frith
Martin@148
     4
Martin@148
     5
# Read query-genome alignments: write them along with the probability
Martin@148
     6
# that each alignment is not the true mapping of its query.  These
Martin@148
     7
# probabilities make the risky assumption that one of the alignments
Martin@148
     8
# reported for each query is correct.
Martin@61
     9
Martin@94
    10
import sys, os, fileinput, math, optparse, signal
Martin@94
    11
Martin@247
    12
def logsum(numbers):
Martin@247
    13
    """Adds numbers, in log space, to avoid overflow."""
Martin@247
    14
    m = max(numbers)
Martin@247
    15
    s = sum(math.exp(i - m) for i in numbers)
Martin@247
    16
    return m + math.log(s)
Martin@61
    17
Martin@246
    18
def mafScore(aLine):
Martin@246
    19
    for word in aLine.split():
Martin@148
    20
        if word.startswith("score="):
Martin@148
    21
            return float(word[6:])
Martin@148
    22
    raise Exception("found an alignment without a score")
Martin@148
    23
Martin@160
    24
def namesAndScores(lines):
Martin@160
    25
    queryNames = []
Martin@160
    26
    scores = []
Martin@160
    27
    for line in lines:
Martin@243
    28
        if line[0] == "a":  # faster than line.startswith("a")
Martin@246
    29
            s = mafScore(line)
Martin@160
    30
            scores.append(s)
Martin@160
    31
            sLineCount = 0
Martin@243
    32
        elif line[0] == "s":
Martin@160
    33
            sLineCount += 1
Martin@245
    34
            if sLineCount == 2: queryNames.append(line.split(None, 2)[1])
Martin@160
    35
        elif line[0].isdigit():  # we have an alignment in tabular format
Martin@245
    36
            w = line.split(None, 7)
Martin@160
    37
            scores.append(float(w[0]))
Martin@160
    38
            queryNames.append(w[6])
Martin@160
    39
    return queryNames, scores
Martin@160
    40
Martin@160
    41
def scoreTotals(queryNames, scores, temperature):
Martin@247
    42
    queryLists = {}
Martin@160
    43
    for n, s in zip(queryNames, scores):
Martin@247
    44
        queryLists.setdefault(n, []).append(s / temperature)
Martin@247
    45
    return dict((k, logsum(v)) for k, v in queryLists.iteritems())
Martin@160
    46
Martin@160
    47
def writeOneBatch(lines, queryNames, scores, denominators, opts, temperature):
Martin@160
    48
    isWanted = True
Martin@160
    49
    i = 0
Martin@160
    50
    for line in lines:
Martin@243
    51
        if line[0] == "a":
Martin@160
    52
            s = scores[i]
Martin@242
    53
            p = 1.0 - math.exp(s / temperature - denominators[queryNames[i]])
Martin@160
    54
            i += 1
Martin@160
    55
            if s < opts.score or p > opts.mismap:
Martin@160
    56
                isWanted = False
Martin@160
    57
            else:
Martin@247
    58
                newLineEnd = " mismap=%.3g\n" % p
Martin@160
    59
                line = line.rstrip() + newLineEnd
Martin@160
    60
        elif line[0].isdigit():  # we have an alignment in tabular format
Martin@160
    61
            s = scores[i]
Martin@242
    62
            p = 1.0 - math.exp(s / temperature - denominators[queryNames[i]])
Martin@160
    63
            i += 1
Martin@160
    64
            if s < opts.score or p > opts.mismap: continue
Martin@568
    65
            newLineEnd = "\tmismap=%.3g\n" % p
Martin@160
    66
            line = line.rstrip() + newLineEnd
Martin@160
    67
        if isWanted: print line,
Martin@160
    68
        if line.isspace(): isWanted = True  # reset at end of maf paragraph
Martin@160
    69
Martin@246
    70
def doOneBatch(lines, opts, temperature):
Martin@160
    71
    queryNames, scores = namesAndScores(lines)
Martin@160
    72
    denominators = scoreTotals(queryNames, scores, temperature)
Martin@160
    73
    writeOneBatch(lines, queryNames, scores, denominators, opts, temperature)
Martin@160
    74
Martin@244
    75
def readHeaderOrDie(lines):
Martin@411
    76
    t = 0.0
Martin@411
    77
    e = -1
Martin@244
    78
    for line in lines:
Martin@244
    79
        if line.startswith("#") or line.isspace():
Martin@244
    80
            print line,
Martin@244
    81
            for i in line.split():
Martin@411
    82
                if i.startswith("t="): t = float(i[2:])
Martin@411
    83
                elif i.startswith("e="): e = int(i[2:])
Martin@411
    84
            if t > 0 and e >= 0: break
Martin@244
    85
        else:
Martin@411
    86
            raise Exception("I need a header with t= and e=")
Martin@411
    87
    return t, e
Martin@244
    88
Martin@148
    89
def lastMapProbs(opts, args):
Martin@244
    90
    f = fileinput.input(args)
Martin@411
    91
    temperature, e = readHeaderOrDie(f)
Martin@411
    92
    if opts.score < 0: opts.score = e + round(temperature * math.log(1000))
Martin@160
    93
    lines = []
Martin@148
    94
Martin@244
    95
    for line in f:
Martin@160
    96
        if line.startswith("# batch"):
Martin@246
    97
            doOneBatch(lines, opts, temperature)
Martin@160
    98
            lines = []
Martin@160
    99
        lines.append(line)
Martin@246
   100
    doOneBatch(lines, opts, temperature)
Martin@148
   101
Martin@148
   102
if __name__ == "__main__":
Martin@148
   103
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)  # avoid silly error message
Martin@148
   104
Martin@148
   105
    usage = """
Martin@148
   106
  %prog --help
Martin@148
   107
  %prog [options] lastal-alignments"""
Martin@148
   108
Martin@148
   109
    description = "Calculate a mismap probability for each alignment.  This is the probability that the alignment does not reflect the origin of the query sequence, assuming that one reported alignment does reflect the origin of each query."
Martin@148
   110
Martin@148
   111
    op = optparse.OptionParser(usage=usage, description=description)
Martin@148
   112
    op.add_option("-m", "--mismap", type="float", default=0.01, metavar="M",
Martin@148
   113
                  help="don't write alignments with mismap probability > M (default: %default)")
Martin@411
   114
    op.add_option("-s", "--score", type="float", default=-1, metavar="S",
Martin@411
   115
                  help="don't write alignments with score < S (default: e+t*ln[1000])")
Martin@148
   116
    (opts, args) = op.parse_args()
Martin@162
   117
    if not args and sys.stdin.isatty():
Martin@162
   118
        op.print_help()
Martin@162
   119
        op.exit()
Martin@148
   120
Martin@148
   121
    try: lastMapProbs(opts, args)
Martin@148
   122
    except KeyboardInterrupt: pass  # avoid silly error message
Martin@148
   123
    except Exception, e:
Martin@148
   124
        prog = os.path.basename(sys.argv[0])
Martin@148
   125
        sys.exit(prog + ": error: " + str(e))