scripts/last-postmask
author Martin C. Frith
Fri Jun 02 18:40:29 2017 +0900 (2017-06-02)
changeset 863 6a4915d5b5cb
parent 480 aec8ba9a675a
child 867 f9ec9c71d72e
permissions -rwxr-xr-x
last-dotplot: get bp-per-pixel faster
Martin@480
     1
#! /usr/bin/env python
Martin@480
     2
Martin@480
     3
# Copyright 2014 Martin C. Frith
Martin@480
     4
Martin@480
     5
# Read MAF-format alignments, and write those that have a segment with
Martin@480
     6
# score >= threshold, with gentle masking of lowercase letters.  There
Martin@480
     7
# must be a lastal header with score parameters.
Martin@480
     8
Martin@480
     9
# Gentle masking is described in: MC Frith, PLoS One 2011;6(12):e28819
Martin@480
    10
# "Gentle masking of low-complexity sequences improves homology search"
Martin@480
    11
Martin@480
    12
# Limitations: doesn't (yet) handle sequence quality data,
Martin@480
    13
# frameshifts, or generalized affine gaps.
Martin@480
    14
Martin@760
    15
import itertools, optparse, os, signal, sys
Martin@480
    16
Martin@480
    17
def getScoreMatrix(rowHeads, colHeads, matrix, deleteCost, insertCost):
Martin@480
    18
    defaultScore = min(map(min, matrix))
Martin@480
    19
    scoreMatrix = [[defaultScore for i in range(128)] for j in range(128)]
Martin@480
    20
    for i, x in enumerate(rowHeads):
Martin@480
    21
        for j, y in enumerate(colHeads):
Martin@480
    22
            xu = ord(x.upper())
Martin@480
    23
            xl = ord(x.lower())
Martin@480
    24
            yu = ord(y.upper())
Martin@480
    25
            yl = ord(y.lower())
Martin@480
    26
            score = matrix[i][j]
Martin@480
    27
            maskScore = min(score, 0)
Martin@480
    28
            scoreMatrix[xu][yu] = score
Martin@480
    29
            scoreMatrix[xu][yl] = maskScore
Martin@480
    30
            scoreMatrix[xl][yu] = maskScore
Martin@480
    31
            scoreMatrix[xl][yl] = maskScore
Martin@480
    32
    for i in range(128):
Martin@480
    33
        scoreMatrix[i][ord("-")] = -deleteCost
Martin@480
    34
        scoreMatrix[ord("-")][i] = -insertCost
Martin@480
    35
    return scoreMatrix
Martin@480
    36
Martin@480
    37
def isGoodAlignment(seqs, scoreMatrix, delOpenCost, insOpenCost, minScore):
Martin@480
    38
    """Does the alignment have a segment with score >= minScore?"""
Martin@480
    39
    r, q = seqs
Martin@480
    40
    score = 0
Martin@480
    41
    xOld = " "
Martin@480
    42
    yOld = " "
Martin@480
    43
    for x, y in itertools.izip(r, q):
Martin@480
    44
        score += scoreMatrix[ord(x)][ord(y)]
Martin@480
    45
        if score >= minScore: return True
Martin@480
    46
        if x == "-" and xOld != "-": score -= insOpenCost
Martin@480
    47
        if y == "-" and yOld != "-": score -= delOpenCost
Martin@480
    48
        if score < 0: score = 0
Martin@480
    49
        xOld = x
Martin@480
    50
        yOld = y
Martin@480
    51
    return False
Martin@480
    52
Martin@480
    53
def printIfGood(maf, seqs, scoreMatrix, delOpenCost, insOpenCost, minScore):
Martin@480
    54
    if isGoodAlignment(seqs, scoreMatrix, delOpenCost, insOpenCost, minScore):
Martin@480
    55
        for line in maf:
Martin@480
    56
            print line,
Martin@480
    57
        print
Martin@480
    58
Martin@760
    59
def doOneFile(file):
Martin@480
    60
    scoreMatrix = []
Martin@480
    61
    maf = []
Martin@480
    62
    seqs = []
Martin@480
    63
Martin@760
    64
    for line in file:
Martin@480
    65
        if line[0] == "#":
Martin@480
    66
            print line,
Martin@480
    67
            w = line.split()
Martin@480
    68
            for i in w:
Martin@480
    69
                if i.startswith("a="): aDel = int(i[2:])
Martin@480
    70
                if i.startswith("b="): bDel = int(i[2:])
Martin@480
    71
                if i.startswith("A="): aIns = int(i[2:])
Martin@480
    72
                if i.startswith("B="): bIns = int(i[2:])
Martin@480
    73
                if i.startswith("e="): minScore = int(i[2:])
Martin@480
    74
            if len(w) > 1 and max(map(len, w)) == 1:
Martin@480
    75
                colHeads = w[1:]
Martin@480
    76
                rowHeads = []
Martin@480
    77
                matrix = []
Martin@480
    78
            elif len(w) > 2 and len(w[1]) == 1:
Martin@480
    79
                rowHeads.append(w[1])
Martin@480
    80
                matrix.append(map(int, w[2:]))
Martin@480
    81
        elif line.isspace():
Martin@480
    82
            if seqs: printIfGood(maf, seqs, scoreMatrix, aDel, aIns, minScore)
Martin@480
    83
            maf = []
Martin@480
    84
            seqs = []
Martin@480
    85
        else:
Martin@480
    86
            if not scoreMatrix:
Martin@480
    87
                scoreMatrix = getScoreMatrix(rowHeads, colHeads, matrix,
Martin@480
    88
                                             bDel, bIns)
Martin@480
    89
            maf.append(line)
Martin@480
    90
            if line[0] == "s": seqs.append(line.split()[6])
Martin@480
    91
    if seqs: printIfGood(maf, seqs, scoreMatrix, aDel, aIns, minScore)
Martin@480
    92
Martin@760
    93
def lastPostmask(args):
Martin@760
    94
    if args:
Martin@760
    95
        for i in args:
Martin@760
    96
            if i == "-":
Martin@760
    97
                doOneFile(sys.stdin)
Martin@760
    98
            else:
Martin@760
    99
                with open(i) as file:
Martin@760
   100
                    doOneFile(file)
Martin@760
   101
    else:
Martin@760
   102
        doOneFile(sys.stdin)
Martin@760
   103
Martin@480
   104
if __name__ == "__main__":
Martin@480
   105
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)  # avoid silly error message
Martin@480
   106
Martin@480
   107
    usage = "%prog in.maf > out.maf"
Martin@480
   108
    description = "Get alignments that have a segment with score >= threshold, with gentle masking of lowercase letters."
Martin@480
   109
    op = optparse.OptionParser(usage=usage, description=description)
Martin@480
   110
    (opts, args) = op.parse_args()
Martin@480
   111
Martin@480
   112
    try: lastPostmask(args)
Martin@480
   113
    except KeyboardInterrupt: pass  # avoid silly error message
Martin@480
   114
    except Exception, e:
Martin@480
   115
        prog = os.path.basename(sys.argv[0])
Martin@480
   116
        sys.exit(prog + ": error: " + str(e))