scripts/maf-join
author Martin C. Frith
Fri Jun 02 18:40:29 2017 +0900 (2017-06-02)
changeset 863 6a4915d5b5cb
parent 177 67a953e00e55
child 936 b1c09fdd12fe
permissions -rwxr-xr-x
last-dotplot: get bp-per-pixel faster
     1 #! /usr/bin/env python
     2 
     3 # Copyright 2009, 2010, 2011 Martin C. Frith
     4 
     5 # Join two or more sets of MAF-format multiple alignments into bigger
     6 # multiple alignments.  The 'join field' is the top genome, which
     7 # should be the same for each input.  Each input should be sorted by
     8 # position in the top genome.
     9 
    10 # WARNING: Alignment columns with a gap in the top genome are joined
    11 # arbitrarily!!!
    12 
    13 import sys, os, fileinput, optparse, signal
    14 
    15 signal.signal(signal.SIGPIPE, signal.SIG_DFL)  # stop spurious error message
    16 
    17 class peekable:  # Adapted from Python Cookbook 2nd edition
    18     """An iterator that supports a peek operation."""
    19     def __init__(self, iterable):
    20         self.it = iter(iterable)
    21         self.cache = []
    22     def __iter__(self):
    23         return self
    24     def next(self):
    25         if self.cache: return self.cache.pop()
    26         else: return self.it.next()
    27     def peek(self):
    28         if not self.cache: self.cache.append(self.it.next())
    29         return self.cache[0]
    30 
    31 def maxLen(things): return max(map(len, things))
    32 
    33 class MafBlock:
    34     def __init__(self, chr, beg, end, strand, chrSize, seq, prob):
    35         self.chr = chr  # chromosome names
    36         self.beg = beg  # alignment begin coordinates
    37         self.end = end  # alignment end coordinates
    38         self.strand = strand
    39         self.chrSize = chrSize  # chromosome sizes
    40         self.seq = seq  # aligned sequences, including gaps
    41         self.prob = prob  # probabilities (may be empty)
    42 
    43     def __nonzero__(self):
    44         return len(self.seq) > 0
    45 
    46     def __cmp__(self, other):
    47         return cmp(self.chr[:1] + self.beg[:1], other.chr[:1] + other.beg[:1])
    48 
    49     def before(self, other):
    50         return (self.chr[0], self.end[0]) <= (other.chr[0], other.beg[0])
    51 
    52     def after(self, other):
    53         return (self.chr[0], self.beg[0]) >= (other.chr[0], other.end[0])
    54 
    55     def addLine(self, line):
    56         words = line.split()
    57         if line.startswith('s'):
    58             self.chr.append(words[1])
    59             self.beg.append(int(words[2]))
    60             self.end.append(int(words[2]) + int(words[3]))
    61             self.strand.append(words[4])
    62             self.chrSize.append(words[5])
    63             self.seq.append(list(words[6]))
    64         elif line.startswith('p'):
    65             self.prob.append(words[1])
    66 
    67     def write(self):
    68         beg = map(str, self.beg)
    69         size = [str(e-b) for b, e in zip(self.beg, self.end)]
    70         seq = [''.join(i) for i in self.seq]
    71         columns = self.chr, beg, size, self.strand, self.chrSize, seq
    72         widths = map(maxLen, columns)
    73         print 'a'
    74         for row in zip(*columns):
    75             widthsAndFields = zip(widths, row)
    76             field0 = "%-*s" % widthsAndFields[0]  # left-justify
    77             fields = ["%*s" % i for i in widthsAndFields[1:]]  # right-justify
    78             print 's', field0, ' '.join(fields)
    79         pad = ' '.join(' ' * i for i in widths[:-1])
    80         for i in self.prob:
    81             print 'p', pad, i
    82         print  # blank line afterwards
    83 
    84 def topSeqBeg(maf): return maf.beg[0]
    85 def topSeqEnd(maf): return maf.end[0]
    86 def emptyMaf(): return MafBlock([], [], [], [], [], [], [])
    87 
    88 def joinOnFirstItem(x, y):
    89     if x[0] != y[0]:
    90         raise ValueError('join fields not equal:\n'+str(x[0])+'\n'+str(y[0]))
    91     return x + y[1:]
    92 
    93 def mafEasyJoin(x, y):
    94     '''Join two MAF blocks on the top sequence.'''
    95     xJoin = zip(x.chr, x.beg, x.end, x.strand, x.chrSize, x.seq)
    96     yJoin = zip(y.chr, y.beg, y.end, y.strand, y.chrSize, y.seq)
    97     joined = joinOnFirstItem(xJoin, yJoin)
    98     chr, beg, end, strand, chrSize, seq = zip(*joined)
    99     prob = x.prob + y.prob
   100     return MafBlock(chr, beg, end, strand, chrSize, seq, prob)
   101 
   102 def countNonGaps(s): return len(s) - s.count('-')
   103 
   104 def nthNonGap(s, n):
   105     '''Get the start position of the n-th non-gap.'''
   106     for i, x in enumerate(s):
   107         if x != '-':
   108             if n == 0: return i
   109             n -= 1
   110     raise ValueError('non-gap not found')
   111 
   112 def nthLastNonGap(s, n):
   113     '''Get the end position of the n-th last non-gap.'''
   114     return len(s) - nthNonGap(s[::-1], n)
   115 
   116 def mafSlice(maf, alnBeg, alnEnd):
   117     '''Return a slice of a MAF block, using coordinates in the alignment.'''
   118     beg = [b + countNonGaps(s[:alnBeg]) for b, s in zip(maf.beg, maf.seq)]
   119     end = [e - countNonGaps(s[alnEnd:]) for e, s in zip(maf.end, maf.seq)]
   120     seq = [i[alnBeg:alnEnd] for i in maf.seq]
   121     prob = [i[alnBeg:alnEnd] for i in maf.prob]
   122     return MafBlock(maf.chr, beg, end, maf.strand, maf.chrSize, seq, prob)
   123 
   124 def mafSliceTopSeq(maf, newTopSeqBeg, newTopSeqEnd):
   125     '''Return a slice of a MAF block, using coordinates in the top sequence.'''
   126     lettersFromBeg = newTopSeqBeg - topSeqBeg(maf)
   127     lettersFromEnd = topSeqEnd(maf) - newTopSeqEnd
   128     alnBeg = nthNonGap(maf.seq[0], lettersFromBeg)
   129     alnEnd = nthLastNonGap(maf.seq[0], lettersFromEnd)
   130     return mafSlice(maf, alnBeg, alnEnd)
   131 
   132 def jumpGaps(sequence, index):
   133     '''Return the next index of the sequence where there is a non-gap.'''
   134     nextIndex = index
   135     while sequence[nextIndex] == '-': nextIndex += 1
   136     return nextIndex
   137 
   138 def gapsToAdd(sequences):
   139     '''Return new gaps and their positions, needed to align the non-gaps.'''
   140     gapInfo = [[] for i in sequences]
   141     gapBeg = [0 for i in sequences]
   142     try:
   143         while True:
   144             gapEnd = [jumpGaps(s, p) for s, p in zip(sequences, gapBeg)]
   145             gapSize = [e-b for b, e in zip(gapBeg, gapEnd)]
   146             maxGapSize = max(gapSize)
   147             for s, e, i in zip(gapSize, gapEnd, gapInfo):
   148                 if s < maxGapSize:
   149                     newGap = maxGapSize - s
   150                     i.append((newGap, e))
   151             gapBeg = [e+1 for e in gapEnd]
   152     except IndexError: return gapInfo
   153 
   154 def chunksAndGaps(s, gapsAndPositions, oneGap):
   155     '''Yield chunks of "s" interspersed with gaps at given positions.'''
   156     oldPosition = 0
   157     for gapLen, position in gapsAndPositions:
   158         yield s[oldPosition:position]
   159         yield oneGap * gapLen
   160         oldPosition = position
   161     yield s[oldPosition:]
   162 
   163 def mafAddGaps(maf, gapsAndPositions):
   164     '''Add the given gaps at the given positions to a MAF block.'''
   165     maf.seq = [sum(chunksAndGaps(i, gapsAndPositions, ['-']), [])
   166                for i in maf.seq]
   167     maf.prob = [''.join(chunksAndGaps(i, gapsAndPositions, '~'))
   168                 for i in maf.prob]
   169 
   170 def mafJoin(mafs):
   171     '''Intersect and join overlapping MAF blocks.'''
   172     newTopSeqBeg = max(map(topSeqBeg, mafs))
   173     newTopSeqEnd = min(map(topSeqEnd, mafs))
   174     mafs = [mafSliceTopSeq(i, newTopSeqBeg, newTopSeqEnd) for i in mafs]
   175     topSeqs = [i.seq[0] for i in mafs]
   176     gapInfo = gapsToAdd(topSeqs)
   177     for maf, gapsAndPositions in zip(mafs, gapInfo):
   178         mafAddGaps(maf, gapsAndPositions)
   179     return reduce(mafEasyJoin, mafs)
   180 
   181 def mafInput(lines):
   182     '''Read lines and yield MAF blocks.'''
   183     maf = emptyMaf()
   184     for line in lines:
   185         if line.isspace():
   186             if maf: yield maf
   187             maf = emptyMaf()
   188         else:
   189             maf.addLine(line)
   190     if maf: yield maf
   191 
   192 def sortedMafInput(lines):
   193     '''Read lines and yield MAF blocks, checking that they are in order.'''
   194     old = emptyMaf()
   195     for maf in mafInput(lines):
   196         if maf < old: sys.exit(progName + ": MAF blocks not sorted properly")
   197         yield maf
   198         old = maf
   199 
   200 def allOverlaps(sequences, beg, end):
   201     '''Yield all combinations of MAF blocks that overlap in the top genome.'''
   202     assert beg < end
   203     if not sequences:
   204         yield ()
   205         return
   206     for i in sequences[0]:
   207         if topSeqEnd(i) <= beg: continue
   208         if topSeqBeg(i) >= end: break  # assumes they're sorted by position
   209         newBeg = max(beg, topSeqBeg(i))
   210         newEnd = min(end, topSeqEnd(i))
   211         for j in allOverlaps(sequences[1:], newBeg, newEnd):
   212             yield (i,) + j
   213 
   214 def nextWindow(window, input, referenceMaf):
   215     '''Yield "relevant" MAF blocks, based on overlap with referenceMaf.'''
   216     for maf in window:
   217         if not maf.before(referenceMaf): yield maf
   218     try:
   219         while True:
   220             maf = input.peek()
   221             if maf.after(referenceMaf): break
   222             maf = input.next()
   223             if not maf.before(referenceMaf): yield maf
   224     except StopIteration: pass
   225 
   226 def overlappingMafs(sortedMafInputs):
   227     '''Yield all combinations of MAF blocks that overlap in the top genome.'''
   228     if not sortedMafInputs: return
   229     head, tail = sortedMafInputs[0], sortedMafInputs[1:]
   230     windows = [[] for t in tail]
   231     for h in head:  # iterate over MAF blocks in the first input
   232         windows = [list(nextWindow(w, t, h)) for w, t in zip(windows, tail)]
   233         for i in allOverlaps(windows, topSeqBeg(h), topSeqEnd(h)):
   234             yield (h,) + i
   235 
   236 op = optparse.OptionParser(usage="%prog sorted-file1.maf sorted-file2.maf ...")
   237 (opts, args) = op.parse_args()
   238 
   239 progName = os.path.basename(sys.argv[0])
   240 
   241 inputs = [peekable(sortedMafInput(fileinput.input(i))) for i in args]
   242 
   243 for mafs in overlappingMafs(inputs):
   244     mafJoin(mafs).write()