scripts/maf-join
author Martin C. Frith
Fri Jun 02 18:40:29 2017 +0900 (2017-06-02)
changeset 863 6a4915d5b5cb
parent 177 67a953e00e55
child 936 b1c09fdd12fe
permissions -rwxr-xr-x
last-dotplot: get bp-per-pixel faster
Martin@47
     1
#! /usr/bin/env python
Martin@47
     2
Martin@177
     3
# Copyright 2009, 2010, 2011 Martin C. Frith
Martin@47
     4
Martin@47
     5
# Join two or more sets of MAF-format multiple alignments into bigger
Martin@47
     6
# multiple alignments.  The 'join field' is the top genome, which
Martin@47
     7
# should be the same for each input.  Each input should be sorted by
Martin@47
     8
# position in the top genome.
Martin@47
     9
Martin@47
    10
# WARNING: Alignment columns with a gap in the top genome are joined
Martin@47
    11
# arbitrarily!!!
Martin@47
    12
Martin@94
    13
import sys, os, fileinput, optparse, signal
Martin@94
    14
Martin@94
    15
signal.signal(signal.SIGPIPE, signal.SIG_DFL)  # stop spurious error message
Martin@47
    16
Martin@47
    17
class peekable:  # Adapted from Python Cookbook 2nd edition
Martin@47
    18
    """An iterator that supports a peek operation."""
Martin@47
    19
    def __init__(self, iterable):
Martin@47
    20
        self.it = iter(iterable)
Martin@47
    21
        self.cache = []
Martin@47
    22
    def __iter__(self):
Martin@47
    23
        return self
Martin@47
    24
    def next(self):
Martin@47
    25
        if self.cache: return self.cache.pop()
Martin@47
    26
        else: return self.it.next()
Martin@47
    27
    def peek(self):
Martin@47
    28
        if not self.cache: self.cache.append(self.it.next())
Martin@47
    29
        return self.cache[0]
Martin@47
    30
Martin@47
    31
def maxLen(things): return max(map(len, things))
Martin@47
    32
Martin@47
    33
class MafBlock:
Martin@47
    34
    def __init__(self, chr, beg, end, strand, chrSize, seq, prob):
Martin@47
    35
        self.chr = chr  # chromosome names
Martin@47
    36
        self.beg = beg  # alignment begin coordinates
Martin@47
    37
        self.end = end  # alignment end coordinates
Martin@47
    38
        self.strand = strand
Martin@47
    39
        self.chrSize = chrSize  # chromosome sizes
Martin@47
    40
        self.seq = seq  # aligned sequences, including gaps
Martin@47
    41
        self.prob = prob  # probabilities (may be empty)
Martin@47
    42
Martin@47
    43
    def __nonzero__(self):
Martin@47
    44
        return len(self.seq) > 0
Martin@47
    45
Martin@47
    46
    def __cmp__(self, other):
Martin@47
    47
        return cmp(self.chr[:1] + self.beg[:1], other.chr[:1] + other.beg[:1])
Martin@47
    48
Martin@47
    49
    def before(self, other):
Martin@47
    50
        return (self.chr[0], self.end[0]) <= (other.chr[0], other.beg[0])
Martin@47
    51
Martin@47
    52
    def after(self, other):
Martin@47
    53
        return (self.chr[0], self.beg[0]) >= (other.chr[0], other.end[0])
Martin@47
    54
Martin@47
    55
    def addLine(self, line):
Martin@47
    56
        words = line.split()
Martin@47
    57
        if line.startswith('s'):
Martin@47
    58
            self.chr.append(words[1])
Martin@47
    59
            self.beg.append(int(words[2]))
Martin@47
    60
            self.end.append(int(words[2]) + int(words[3]))
Martin@47
    61
            self.strand.append(words[4])
Martin@47
    62
            self.chrSize.append(words[5])
Martin@47
    63
            self.seq.append(list(words[6]))
Martin@47
    64
        elif line.startswith('p'):
Martin@177
    65
            self.prob.append(words[1])
Martin@47
    66
Martin@47
    67
    def write(self):
Martin@47
    68
        beg = map(str, self.beg)
Martin@47
    69
        size = [str(e-b) for b, e in zip(self.beg, self.end)]
Martin@47
    70
        seq = [''.join(i) for i in self.seq]
Martin@47
    71
        columns = self.chr, beg, size, self.strand, self.chrSize, seq
Martin@47
    72
        widths = map(maxLen, columns)
Martin@47
    73
        print 'a'
Martin@47
    74
        for row in zip(*columns):
Martin@47
    75
            widthsAndFields = zip(widths, row)
Martin@47
    76
            field0 = "%-*s" % widthsAndFields[0]  # left-justify
Martin@47
    77
            fields = ["%*s" % i for i in widthsAndFields[1:]]  # right-justify
Martin@47
    78
            print 's', field0, ' '.join(fields)
Martin@177
    79
        pad = ' '.join(' ' * i for i in widths[:-1])
Martin@47
    80
        for i in self.prob:
Martin@177
    81
            print 'p', pad, i
Martin@47
    82
        print  # blank line afterwards
Martin@47
    83
Martin@47
    84
def topSeqBeg(maf): return maf.beg[0]
Martin@47
    85
def topSeqEnd(maf): return maf.end[0]
Martin@47
    86
def emptyMaf(): return MafBlock([], [], [], [], [], [], [])
Martin@47
    87
Martin@47
    88
def joinOnFirstItem(x, y):
Martin@47
    89
    if x[0] != y[0]:
Martin@47
    90
        raise ValueError('join fields not equal:\n'+str(x[0])+'\n'+str(y[0]))
Martin@47
    91
    return x + y[1:]
Martin@47
    92
Martin@47
    93
def mafEasyJoin(x, y):
Martin@47
    94
    '''Join two MAF blocks on the top sequence.'''
Martin@47
    95
    xJoin = zip(x.chr, x.beg, x.end, x.strand, x.chrSize, x.seq)
Martin@47
    96
    yJoin = zip(y.chr, y.beg, y.end, y.strand, y.chrSize, y.seq)
Martin@47
    97
    joined = joinOnFirstItem(xJoin, yJoin)
Martin@47
    98
    chr, beg, end, strand, chrSize, seq = zip(*joined)
Martin@47
    99
    prob = x.prob + y.prob
Martin@47
   100
    return MafBlock(chr, beg, end, strand, chrSize, seq, prob)
Martin@47
   101
Martin@47
   102
def countNonGaps(s): return len(s) - s.count('-')
Martin@47
   103
Martin@47
   104
def nthNonGap(s, n):
Martin@47
   105
    '''Get the start position of the n-th non-gap.'''
Martin@47
   106
    for i, x in enumerate(s):
Martin@47
   107
        if x != '-':
Martin@47
   108
            if n == 0: return i
Martin@47
   109
            n -= 1
Martin@47
   110
    raise ValueError('non-gap not found')
Martin@47
   111
Martin@47
   112
def nthLastNonGap(s, n):
Martin@47
   113
    '''Get the end position of the n-th last non-gap.'''
Martin@47
   114
    return len(s) - nthNonGap(s[::-1], n)
Martin@47
   115
Martin@47
   116
def mafSlice(maf, alnBeg, alnEnd):
Martin@47
   117
    '''Return a slice of a MAF block, using coordinates in the alignment.'''
Martin@47
   118
    beg = [b + countNonGaps(s[:alnBeg]) for b, s in zip(maf.beg, maf.seq)]
Martin@47
   119
    end = [e - countNonGaps(s[alnEnd:]) for e, s in zip(maf.end, maf.seq)]
Martin@47
   120
    seq = [i[alnBeg:alnEnd] for i in maf.seq]
Martin@47
   121
    prob = [i[alnBeg:alnEnd] for i in maf.prob]
Martin@47
   122
    return MafBlock(maf.chr, beg, end, maf.strand, maf.chrSize, seq, prob)
Martin@47
   123
Martin@47
   124
def mafSliceTopSeq(maf, newTopSeqBeg, newTopSeqEnd):
Martin@47
   125
    '''Return a slice of a MAF block, using coordinates in the top sequence.'''
Martin@47
   126
    lettersFromBeg = newTopSeqBeg - topSeqBeg(maf)
Martin@47
   127
    lettersFromEnd = topSeqEnd(maf) - newTopSeqEnd
Martin@47
   128
    alnBeg = nthNonGap(maf.seq[0], lettersFromBeg)
Martin@47
   129
    alnEnd = nthLastNonGap(maf.seq[0], lettersFromEnd)
Martin@47
   130
    return mafSlice(maf, alnBeg, alnEnd)
Martin@47
   131
Martin@47
   132
def jumpGaps(sequence, index):
Martin@47
   133
    '''Return the next index of the sequence where there is a non-gap.'''
Martin@47
   134
    nextIndex = index
Martin@47
   135
    while sequence[nextIndex] == '-': nextIndex += 1
Martin@47
   136
    return nextIndex
Martin@47
   137
Martin@47
   138
def gapsToAdd(sequences):
Martin@47
   139
    '''Return new gaps and their positions, needed to align the non-gaps.'''
Martin@47
   140
    gapInfo = [[] for i in sequences]
Martin@47
   141
    gapBeg = [0 for i in sequences]
Martin@47
   142
    try:
Martin@47
   143
        while True:
Martin@47
   144
            gapEnd = [jumpGaps(s, p) for s, p in zip(sequences, gapBeg)]
Martin@47
   145
            gapSize = [e-b for b, e in zip(gapBeg, gapEnd)]
Martin@47
   146
            maxGapSize = max(gapSize)
Martin@47
   147
            for s, e, i in zip(gapSize, gapEnd, gapInfo):
Martin@47
   148
                if s < maxGapSize:
Martin@177
   149
                    newGap = maxGapSize - s
Martin@47
   150
                    i.append((newGap, e))
Martin@47
   151
            gapBeg = [e+1 for e in gapEnd]
Martin@47
   152
    except IndexError: return gapInfo
Martin@47
   153
Martin@177
   154
def chunksAndGaps(s, gapsAndPositions, oneGap):
Martin@47
   155
    '''Yield chunks of "s" interspersed with gaps at given positions.'''
Martin@47
   156
    oldPosition = 0
Martin@177
   157
    for gapLen, position in gapsAndPositions:
Martin@47
   158
        yield s[oldPosition:position]
Martin@177
   159
        yield oneGap * gapLen
Martin@47
   160
        oldPosition = position
Martin@47
   161
    yield s[oldPosition:]
Martin@47
   162
Martin@47
   163
def mafAddGaps(maf, gapsAndPositions):
Martin@47
   164
    '''Add the given gaps at the given positions to a MAF block.'''
Martin@177
   165
    maf.seq = [sum(chunksAndGaps(i, gapsAndPositions, ['-']), [])
Martin@177
   166
               for i in maf.seq]
Martin@177
   167
    maf.prob = [''.join(chunksAndGaps(i, gapsAndPositions, '~'))
Martin@177
   168
                for i in maf.prob]
Martin@47
   169
Martin@47
   170
def mafJoin(mafs):
Martin@47
   171
    '''Intersect and join overlapping MAF blocks.'''
Martin@47
   172
    newTopSeqBeg = max(map(topSeqBeg, mafs))
Martin@47
   173
    newTopSeqEnd = min(map(topSeqEnd, mafs))
Martin@47
   174
    mafs = [mafSliceTopSeq(i, newTopSeqBeg, newTopSeqEnd) for i in mafs]
Martin@47
   175
    topSeqs = [i.seq[0] for i in mafs]
Martin@47
   176
    gapInfo = gapsToAdd(topSeqs)
Martin@47
   177
    for maf, gapsAndPositions in zip(mafs, gapInfo):
Martin@47
   178
        mafAddGaps(maf, gapsAndPositions)
Martin@47
   179
    return reduce(mafEasyJoin, mafs)
Martin@47
   180
Martin@47
   181
def mafInput(lines):
Martin@47
   182
    '''Read lines and yield MAF blocks.'''
Martin@47
   183
    maf = emptyMaf()
Martin@47
   184
    for line in lines:
Martin@47
   185
        if line.isspace():
Martin@47
   186
            if maf: yield maf
Martin@47
   187
            maf = emptyMaf()
Martin@47
   188
        else:
Martin@47
   189
            maf.addLine(line)
Martin@47
   190
    if maf: yield maf
Martin@47
   191
Martin@47
   192
def sortedMafInput(lines):
Martin@47
   193
    '''Read lines and yield MAF blocks, checking that they are in order.'''
Martin@47
   194
    old = emptyMaf()
Martin@47
   195
    for maf in mafInput(lines):
Martin@47
   196
        if maf < old: sys.exit(progName + ": MAF blocks not sorted properly")
Martin@47
   197
        yield maf
Martin@47
   198
        old = maf
Martin@47
   199
Martin@47
   200
def allOverlaps(sequences, beg, end):
Martin@47
   201
    '''Yield all combinations of MAF blocks that overlap in the top genome.'''
Martin@47
   202
    assert beg < end
Martin@47
   203
    if not sequences:
Martin@47
   204
        yield ()
Martin@47
   205
        return
Martin@47
   206
    for i in sequences[0]:
Martin@47
   207
        if topSeqEnd(i) <= beg: continue
Martin@47
   208
        if topSeqBeg(i) >= end: break  # assumes they're sorted by position
Martin@47
   209
        newBeg = max(beg, topSeqBeg(i))
Martin@47
   210
        newEnd = min(end, topSeqEnd(i))
Martin@47
   211
        for j in allOverlaps(sequences[1:], newBeg, newEnd):
Martin@47
   212
            yield (i,) + j
Martin@47
   213
Martin@47
   214
def nextWindow(window, input, referenceMaf):
Martin@47
   215
    '''Yield "relevant" MAF blocks, based on overlap with referenceMaf.'''
Martin@47
   216
    for maf in window:
Martin@47
   217
        if not maf.before(referenceMaf): yield maf
Martin@47
   218
    try:
Martin@47
   219
        while True:
Martin@47
   220
            maf = input.peek()
Martin@47
   221
            if maf.after(referenceMaf): break
Martin@47
   222
            maf = input.next()
Martin@47
   223
            if not maf.before(referenceMaf): yield maf
Martin@47
   224
    except StopIteration: pass
Martin@47
   225
Martin@47
   226
def overlappingMafs(sortedMafInputs):
Martin@47
   227
    '''Yield all combinations of MAF blocks that overlap in the top genome.'''
Martin@47
   228
    if not sortedMafInputs: return
Martin@47
   229
    head, tail = sortedMafInputs[0], sortedMafInputs[1:]
Martin@47
   230
    windows = [[] for t in tail]
Martin@47
   231
    for h in head:  # iterate over MAF blocks in the first input
Martin@47
   232
        windows = [list(nextWindow(w, t, h)) for w, t in zip(windows, tail)]
Martin@47
   233
        for i in allOverlaps(windows, topSeqBeg(h), topSeqEnd(h)):
Martin@47
   234
            yield (h,) + i
Martin@47
   235
Martin@67
   236
op = optparse.OptionParser(usage="%prog sorted-file1.maf sorted-file2.maf ...")
Martin@67
   237
(opts, args) = op.parse_args()
Martin@67
   238
Martin@47
   239
progName = os.path.basename(sys.argv[0])
Martin@47
   240
Martin@67
   241
inputs = [peekable(sortedMafInput(fileinput.input(i))) for i in args]
Martin@47
   242
Martin@47
   243
for mafs in overlappingMafs(inputs):
Martin@47
   244
    mafJoin(mafs).write()