scripts/maf-swap
author Martin C. Frith
Fri Jun 02 18:40:29 2017 +0900 (2017-06-02)
changeset 863 6a4915d5b5cb
parent 177 67a953e00e55
child 936 b1c09fdd12fe
permissions -rwxr-xr-x
last-dotplot: get bp-per-pixel faster
Martin@1
     1
#! /usr/bin/env python
Martin@1
     2
Martin@144
     3
# Read MAF-format alignments, and write them, after moving the Nth
Martin@144
     4
# sequence to the top in each alignment.
Martin@1
     5
Martin@144
     6
# Before writing, if the top sequence would be on the - strand, then
Martin@144
     7
# flip all the strands.  But don't do this if the top sequence is
Martin@144
     8
# translated DNA.
Martin@94
     9
Martin@144
    10
# Seems to work with Python 2.x, x>=4
Martin@47
    11
Martin@144
    12
import fileinput, itertools, optparse, os, signal, string, sys
Martin@144
    13
Martin@144
    14
def filterComments(lines):
Martin@144
    15
    for i in lines:
Martin@144
    16
        if i.startswith("#"): print i,
Martin@144
    17
        else: yield i
Martin@144
    18
Martin@144
    19
def mafInput(lines):
Martin@144
    20
    for k, v in itertools.groupby(lines, str.isspace):
Martin@144
    21
        if not k: yield list(v)
Martin@144
    22
Martin@144
    23
def indexOfNthSequence(mafLines, n):
Martin@144
    24
    for i, line in enumerate(mafLines):
Martin@144
    25
        if line.startswith("s"):
Martin@144
    26
            if n == 1: return i
Martin@144
    27
            n -= 1
Martin@144
    28
    raise Exception("encountered an alignment with too few sequences")
Martin@1
    29
Martin@147
    30
def rangeOfNthSequence(mafLines, n):
Martin@147
    31
    """Get the range of lines associated with the Nth sequence."""
Martin@147
    32
    start = indexOfNthSequence(mafLines, n)
Martin@147
    33
    stop = start + 1
Martin@147
    34
    while stop < len(mafLines):
Martin@147
    35
        line = mafLines[stop]
Martin@147
    36
        if not (line.startswith("q") or line.startswith("i")): break
Martin@147
    37
        stop += 1
Martin@147
    38
    return start, stop
Martin@147
    39
Martin@1
    40
complement = string.maketrans('ACGTNSWRYKMBDHVacgtnswrykmbdhv',
Martin@1
    41
                              'TGCANSWYRMKVHDBtgcanswyrmkvhdb')
Martin@144
    42
# doesn't handle "U" in RNA sequences
Martin@1
    43
def revcomp(seq):
Martin@1
    44
    return seq[::-1].translate(complement)
Martin@1
    45
Martin@144
    46
def flippedMafS(words):
Martin@144
    47
    alnStart = int(words[2])
Martin@144
    48
    alnSize = int(words[3])
Martin@144
    49
    strand = words[4]
Martin@144
    50
    seqSize = int(words[5])
Martin@144
    51
    alnString = words[6]
Martin@144
    52
    newStart = seqSize - alnStart - alnSize
Martin@144
    53
    if strand == "-": newStrand = "+"
Martin@144
    54
    else:             newStrand = "-"
Martin@144
    55
    newString = revcomp(alnString)
Martin@144
    56
    out = words[0], words[1], newStart, alnSize, newStrand, seqSize, newString
Martin@144
    57
    return map(str, out)
Martin@1
    58
Martin@144
    59
def flippedMafP(words):
Martin@177
    60
    flippedString = words[1][::-1]
Martin@177
    61
    return words[:1] + [flippedString]
Martin@1
    62
Martin@144
    63
def flippedMafQ(words):
Martin@144
    64
    qualityString = words[2]
Martin@144
    65
    flippedString = qualityString[::-1]
Martin@144
    66
    return words[:2] + [flippedString]
Martin@57
    67
Martin@144
    68
def flippedMafLine(mafLine):
Martin@144
    69
    words = mafLine.split()
Martin@144
    70
    if   words[0] == "s": return flippedMafS(words)
Martin@144
    71
    elif words[0] == "p": return flippedMafP(words)
Martin@144
    72
    elif words[0] == "q": return flippedMafQ(words)
Martin@144
    73
    else: return words
Martin@18
    74
Martin@144
    75
def maxlen(s):
Martin@144
    76
    return max(map(len, s))
Martin@18
    77
Martin@144
    78
def sLineFieldWidths(mafLines):
Martin@144
    79
    sLines = (i for i in mafLines if i[0] == "s")
Martin@144
    80
    sColumns = zip(*sLines)
Martin@144
    81
    return map(maxlen, sColumns)
Martin@47
    82
Martin@144
    83
def joinedMafS(words, fieldWidths):
Martin@144
    84
    formatParams = itertools.chain(*zip(fieldWidths, words))
Martin@144
    85
    return "%*s %-*s %*s %*s %*s %*s %*s\n" % tuple(formatParams)
Martin@1
    86
Martin@144
    87
def joinedMafLine(words, fieldWidths):
Martin@144
    88
    if words[0] == "s":
Martin@144
    89
        return joinedMafS(words, fieldWidths)
Martin@144
    90
    elif words[0] == "q":
Martin@144
    91
        words = words[:2] + [""] * 4 + words[2:]
Martin@144
    92
        return joinedMafS(words, fieldWidths)
Martin@177
    93
    elif words[0] == "p":
Martin@177
    94
        words = words[:1] + [""] * 5 + words[1:]
Martin@177
    95
        return joinedMafS(words, fieldWidths)
Martin@144
    96
    else:
Martin@144
    97
        return " ".join(words) + "\n"
Martin@1
    98
Martin@144
    99
def flippedMaf(mafLines):
Martin@144
   100
    flippedLines = map(flippedMafLine, mafLines)
Martin@144
   101
    fieldWidths = sLineFieldWidths(flippedLines)
Martin@144
   102
    return (joinedMafLine(i, fieldWidths) for i in flippedLines)
Martin@1
   103
Martin@144
   104
def isCanonicalStrand(mafLine):
Martin@144
   105
    words = mafLine.split()
Martin@144
   106
    strand = words[4]
Martin@144
   107
    if strand == "+": return True
Martin@144
   108
    alnString = words[6]
Martin@144
   109
    if "/" in alnString or "\\" in alnString: return True  # frameshifts
Martin@144
   110
    alnSize = int(words[3])
Martin@144
   111
    gapCount = alnString.count("-")
Martin@144
   112
    if len(alnString) - gapCount < alnSize: return True  # translated DNA
Martin@144
   113
    return False
Martin@144
   114
Martin@144
   115
def mafSwap(opts, args):
Martin@144
   116
    inputLines = fileinput.input(args)
Martin@144
   117
    for mafLines in mafInput(filterComments(inputLines)):
Martin@147
   118
        start, stop = rangeOfNthSequence(mafLines, opts.n)
Martin@147
   119
        mafLines[1:stop] = mafLines[start:stop] + mafLines[1:start]
Martin@147
   120
        if not isCanonicalStrand(mafLines[1]):
Martin@144
   121
            mafLines = flippedMaf(mafLines)
Martin@144
   122
        for i in mafLines: print i,
Martin@144
   123
        print  # blank line after each alignment
Martin@144
   124
Martin@144
   125
if __name__ == "__main__":
Martin@144
   126
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)  # avoid silly error message
Martin@144
   127
Martin@144
   128
    usage = "%prog [options] my-alignments.maf"
Martin@144
   129
    description = "Change the order of sequences in MAF-format alignments."
Martin@144
   130
    op = optparse.OptionParser(usage=usage, description=description)
Martin@144
   131
    op.add_option("-n", type="int", default=2,
Martin@144
   132
                  help="move the Nth sequence to the top (default: %default)")
Martin@144
   133
    (opts, args) = op.parse_args()
Martin@144
   134
    if opts.n < 1: op.error("option -n: should be >= 1")
Martin@144
   135
Martin@144
   136
    try: mafSwap(opts, args)
Martin@144
   137
    except KeyboardInterrupt: pass  # avoid silly error message
Martin@144
   138
    except Exception, e:
Martin@144
   139
        prog = os.path.basename(sys.argv[0])
Martin@144
   140
        sys.exit(prog + ": error: " + str(e))