scripts/last-map-probs
author Martin C. Frith
Fri Jun 02 18:40:29 2017 +0900 (2017-06-02)
changeset 863 6a4915d5b5cb
parent 479 e707702f53f8
child 937 8a4fadbe6080
permissions -rwxr-xr-x
last-dotplot: get bp-per-pixel faster
     1 #! /usr/bin/env python
     2 
     3 # Copyright 2010, 2011, 2012, 2014 Martin C. Frith
     4 
     5 # Read query-genome alignments: write them along with the probability
     6 # that each alignment is not the true mapping of its query.  These
     7 # probabilities make the risky assumption that one of the alignments
     8 # reported for each query is correct.
     9 
    10 import sys, os, fileinput, math, optparse, signal
    11 
    12 def logsum(numbers):
    13     """Adds numbers, in log space, to avoid overflow."""
    14     m = max(numbers)
    15     s = sum(math.exp(i - m) for i in numbers)
    16     return m + math.log(s)
    17 
    18 def mafScore(aLine):
    19     for word in aLine.split():
    20         if word.startswith("score="):
    21             return float(word[6:])
    22     raise Exception("found an alignment without a score")
    23 
    24 def namesAndScores(lines):
    25     queryNames = []
    26     scores = []
    27     for line in lines:
    28         if line[0] == "a":  # faster than line.startswith("a")
    29             s = mafScore(line)
    30             scores.append(s)
    31             sLineCount = 0
    32         elif line[0] == "s":
    33             sLineCount += 1
    34             if sLineCount == 2: queryNames.append(line.split(None, 2)[1])
    35         elif line[0].isdigit():  # we have an alignment in tabular format
    36             w = line.split(None, 7)
    37             scores.append(float(w[0]))
    38             queryNames.append(w[6])
    39     return queryNames, scores
    40 
    41 def scoreTotals(queryNames, scores, temperature):
    42     queryLists = {}
    43     for n, s in zip(queryNames, scores):
    44         queryLists.setdefault(n, []).append(s / temperature)
    45     return dict((k, logsum(v)) for k, v in queryLists.iteritems())
    46 
    47 def writeOneBatch(lines, queryNames, scores, denominators, opts, temperature):
    48     isWanted = True
    49     i = 0
    50     for line in lines:
    51         if line[0] == "a":
    52             s = scores[i]
    53             p = 1.0 - math.exp(s / temperature - denominators[queryNames[i]])
    54             i += 1
    55             if s < opts.score or p > opts.mismap:
    56                 isWanted = False
    57             else:
    58                 newLineEnd = " mismap=%.3g\n" % p
    59                 line = line.rstrip() + newLineEnd
    60         elif line[0].isdigit():  # we have an alignment in tabular format
    61             s = scores[i]
    62             p = 1.0 - math.exp(s / temperature - denominators[queryNames[i]])
    63             i += 1
    64             if s < opts.score or p > opts.mismap: continue
    65             newLineEnd = "\tmismap=%.3g\n" % p
    66             line = line.rstrip() + newLineEnd
    67         if isWanted: print line,
    68         if line.isspace(): isWanted = True  # reset at end of maf paragraph
    69 
    70 def doOneBatch(lines, opts, temperature):
    71     queryNames, scores = namesAndScores(lines)
    72     denominators = scoreTotals(queryNames, scores, temperature)
    73     writeOneBatch(lines, queryNames, scores, denominators, opts, temperature)
    74 
    75 def readHeaderOrDie(lines):
    76     t = 0.0
    77     e = -1
    78     for line in lines:
    79         if line.startswith("#") or line.isspace():
    80             print line,
    81             for i in line.split():
    82                 if i.startswith("t="): t = float(i[2:])
    83                 elif i.startswith("e="): e = int(i[2:])
    84             if t > 0 and e >= 0: break
    85         else:
    86             raise Exception("I need a header with t= and e=")
    87     return t, e
    88 
    89 def lastMapProbs(opts, args):
    90     f = fileinput.input(args)
    91     temperature, e = readHeaderOrDie(f)
    92     if opts.score < 0: opts.score = e + round(temperature * math.log(1000))
    93     lines = []
    94 
    95     for line in f:
    96         if line.startswith("# batch"):
    97             doOneBatch(lines, opts, temperature)
    98             lines = []
    99         lines.append(line)
   100     doOneBatch(lines, opts, temperature)
   101 
   102 if __name__ == "__main__":
   103     signal.signal(signal.SIGPIPE, signal.SIG_DFL)  # avoid silly error message
   104 
   105     usage = """
   106   %prog --help
   107   %prog [options] lastal-alignments"""
   108 
   109     description = "Calculate a mismap probability for each alignment.  This is the probability that the alignment does not reflect the origin of the query sequence, assuming that one reported alignment does reflect the origin of each query."
   110 
   111     op = optparse.OptionParser(usage=usage, description=description)
   112     op.add_option("-m", "--mismap", type="float", default=0.01, metavar="M",
   113                   help="don't write alignments with mismap probability > M (default: %default)")
   114     op.add_option("-s", "--score", type="float", default=-1, metavar="S",
   115                   help="don't write alignments with score < S (default: e+t*ln[1000])")
   116     (opts, args) = op.parse_args()
   117     if not args and sys.stdin.isatty():
   118         op.print_help()
   119         op.exit()
   120 
   121     try: lastMapProbs(opts, args)
   122     except KeyboardInterrupt: pass  # avoid silly error message
   123     except Exception, e:
   124         prog = os.path.basename(sys.argv[0])
   125         sys.exit(prog + ": error: " + str(e))