scripts/last-postmask
author Martin C. Frith
Thu Jun 15 14:58:18 2017 +0900 (2017-06-15)
changeset 867 f9ec9c71d72e
parent 760 5e99b6e235e0
child 875 592295375eb1
permissions -rwxr-xr-x
Make last-train read sequence file "-" from stdin
     1 #! /usr/bin/env python
     2 
     3 # Copyright 2014 Martin C. Frith
     4 
     5 # Read MAF-format alignments, and write those that have a segment with
     6 # score >= threshold, with gentle masking of lowercase letters.  There
     7 # must be a lastal header with score parameters.
     8 
     9 # Gentle masking is described in: MC Frith, PLoS One 2011;6(12):e28819
    10 # "Gentle masking of low-complexity sequences improves homology search"
    11 
    12 # Limitations: doesn't (yet) handle sequence quality data,
    13 # frameshifts, or generalized affine gaps.
    14 
    15 import itertools, optparse, os, signal, sys
    16 
    17 def getScoreMatrix(rowHeads, colHeads, matrix, deleteCost, insertCost):
    18     defaultScore = min(map(min, matrix))
    19     scoreMatrix = [[defaultScore for i in range(128)] for j in range(128)]
    20     for i, x in enumerate(rowHeads):
    21         for j, y in enumerate(colHeads):
    22             xu = ord(x.upper())
    23             xl = ord(x.lower())
    24             yu = ord(y.upper())
    25             yl = ord(y.lower())
    26             score = matrix[i][j]
    27             maskScore = min(score, 0)
    28             scoreMatrix[xu][yu] = score
    29             scoreMatrix[xu][yl] = maskScore
    30             scoreMatrix[xl][yu] = maskScore
    31             scoreMatrix[xl][yl] = maskScore
    32     for i in range(128):
    33         scoreMatrix[i][ord("-")] = -deleteCost
    34         scoreMatrix[ord("-")][i] = -insertCost
    35     return scoreMatrix
    36 
    37 def isGoodAlignment(seqs, scoreMatrix, delOpenCost, insOpenCost, minScore):
    38     """Does the alignment have a segment with score >= minScore?"""
    39     r, q = seqs
    40     score = 0
    41     xOld = " "
    42     yOld = " "
    43     for x, y in itertools.izip(r, q):
    44         score += scoreMatrix[ord(x)][ord(y)]
    45         if score >= minScore: return True
    46         if x == "-" and xOld != "-": score -= insOpenCost
    47         if y == "-" and yOld != "-": score -= delOpenCost
    48         if score < 0: score = 0
    49         xOld = x
    50         yOld = y
    51     return False
    52 
    53 def printIfGood(maf, seqs, scoreMatrix, delOpenCost, insOpenCost, minScore):
    54     if isGoodAlignment(seqs, scoreMatrix, delOpenCost, insOpenCost, minScore):
    55         for line in maf:
    56             print line,
    57         print
    58 
    59 def doOneFile(lines):
    60     scoreMatrix = []
    61     maf = []
    62     seqs = []
    63 
    64     for line in lines:
    65         if line[0] == "#":
    66             print line,
    67             w = line.split()
    68             for i in w:
    69                 if i.startswith("a="): aDel = int(i[2:])
    70                 if i.startswith("b="): bDel = int(i[2:])
    71                 if i.startswith("A="): aIns = int(i[2:])
    72                 if i.startswith("B="): bIns = int(i[2:])
    73                 if i.startswith("e="): minScore = int(i[2:])
    74             if len(w) > 1 and max(map(len, w)) == 1:
    75                 colHeads = w[1:]
    76                 rowHeads = []
    77                 matrix = []
    78             elif len(w) > 2 and len(w[1]) == 1:
    79                 rowHeads.append(w[1])
    80                 matrix.append(map(int, w[2:]))
    81         elif line.isspace():
    82             if seqs: printIfGood(maf, seqs, scoreMatrix, aDel, aIns, minScore)
    83             maf = []
    84             seqs = []
    85         else:
    86             if not scoreMatrix:
    87                 scoreMatrix = getScoreMatrix(rowHeads, colHeads, matrix,
    88                                              bDel, bIns)
    89             maf.append(line)
    90             if line[0] == "s": seqs.append(line.split()[6])
    91     if seqs: printIfGood(maf, seqs, scoreMatrix, aDel, aIns, minScore)
    92 
    93 def lastPostmask(args):
    94     if args:
    95         for i in args:
    96             if i == "-":
    97                 doOneFile(sys.stdin)
    98             else:
    99                 with open(i) as f:
   100                     doOneFile(f)
   101     else:
   102         doOneFile(sys.stdin)
   103 
   104 if __name__ == "__main__":
   105     signal.signal(signal.SIGPIPE, signal.SIG_DFL)  # avoid silly error message
   106 
   107     usage = "%prog in.maf > out.maf"
   108     description = "Get alignments that have a segment with score >= threshold, with gentle masking of lowercase letters."
   109     op = optparse.OptionParser(usage=usage, description=description)
   110     (opts, args) = op.parse_args()
   111 
   112     try: lastPostmask(args)
   113     except KeyboardInterrupt: pass  # avoid silly error message
   114     except Exception, e:
   115         prog = os.path.basename(sys.argv[0])
   116         sys.exit(prog + ": error: " + str(e))