3 # Copyright 2014 Martin C. Frith
5 # Read MAF-format alignments, and write those that have a segment with
6 # score >= threshold, with gentle masking of lowercase letters. There
7 # must be a lastal header with score parameters.
9 # Gentle masking is described in: MC Frith, PLoS One 2011;6(12):e28819
10 # "Gentle masking of low-complexity sequences improves homology search"
12 # Limitations: doesn't (yet) handle sequence quality data,
13 # frameshifts, or generalized affine gaps.
15 import itertools, optparse, os, signal, sys
17 def getScoreMatrix(rowHeads, colHeads, matrix, deleteCost, insertCost):
18 defaultScore = min(map(min, matrix))
19 scoreMatrix = [[defaultScore for i in range(128)] for j in range(128)]
20 for i, x in enumerate(rowHeads):
21 for j, y in enumerate(colHeads):
27 maskScore = min(score, 0)
28 scoreMatrix[xu][yu] = score
29 scoreMatrix[xu][yl] = maskScore
30 scoreMatrix[xl][yu] = maskScore
31 scoreMatrix[xl][yl] = maskScore
33 scoreMatrix[i][ord("-")] = -deleteCost
34 scoreMatrix[ord("-")][i] = -insertCost
37 def isGoodAlignment(seqs, scoreMatrix, delOpenCost, insOpenCost, minScore):
38 """Does the alignment have a segment with score >= minScore?"""
43 for x, y in itertools.izip(r, q):
44 score += scoreMatrix[ord(x)][ord(y)]
45 if score >= minScore: return True
46 if x == "-" and xOld != "-": score -= insOpenCost
47 if y == "-" and yOld != "-": score -= delOpenCost
48 if score < 0: score = 0
53 def printIfGood(maf, seqs, scoreMatrix, delOpenCost, insOpenCost, minScore):
54 if isGoodAlignment(seqs, scoreMatrix, delOpenCost, insOpenCost, minScore):
69 if i.startswith("a="): aDel = int(i[2:])
70 if i.startswith("b="): bDel = int(i[2:])
71 if i.startswith("A="): aIns = int(i[2:])
72 if i.startswith("B="): bIns = int(i[2:])
73 if i.startswith("e="): minScore = int(i[2:])
74 if len(w) > 1 and max(map(len, w)) == 1:
78 elif len(w) > 2 and len(w[1]) == 1:
80 matrix.append(map(int, w[2:]))
82 if seqs: printIfGood(maf, seqs, scoreMatrix, aDel, aIns, minScore)
87 scoreMatrix = getScoreMatrix(rowHeads, colHeads, matrix,
90 if line[0] == "s": seqs.append(line.split()[6])
91 if seqs: printIfGood(maf, seqs, scoreMatrix, aDel, aIns, minScore)
93 def lastPostmask(args):
104 if __name__ == "__main__":
105 signal.signal(signal.SIGPIPE, signal.SIG_DFL) # avoid silly error message
107 usage = "%prog in.maf > out.maf"
108 description = "Get alignments that have a segment with score >= threshold, with gentle masking of lowercase letters."
109 op = optparse.OptionParser(usage=usage, description=description)
110 (opts, args) = op.parse_args()
112 try: lastPostmask(args)
113 except KeyboardInterrupt: pass # avoid silly error message
115 prog = os.path.basename(sys.argv[0])
116 sys.exit(prog + ": error: " + str(e))