scripts/maf-convert
author Martin C. Frith
Fri Jun 02 18:40:29 2017 +0900 (2017-06-02)
changeset 863 6a4915d5b5cb
parent 831 c5591feba7a2
child 875 592295375eb1
permissions -rwxr-xr-x
last-dotplot: get bp-per-pixel faster
     1 #! /usr/bin/env python
     2 # Copyright 2010, 2011, 2013, 2014 Martin C. Frith
     3 # Read MAF-format alignments: write them in other formats.
     4 # Seems to work with Python 2.x, x>=6
     5 
     6 # By "MAF" we mean "multiple alignment format" described in the UCSC
     7 # Genome FAQ, not e.g. "MIRA assembly format".
     8 
     9 from itertools import *
    10 import math, optparse, os, signal, sys
    11 
    12 def maxlen(s):
    13     return max(map(len, s))
    14 
    15 def pairOrDie(sLines, formatName):
    16     if len(sLines) != 2:
    17         e = "for %s, each alignment must have 2 sequences" % formatName
    18         raise Exception(e)
    19     return sLines
    20 
    21 def isMatch(alignmentColumn):
    22     # No special treatment of ambiguous bases/residues: same as NCBI BLAST.
    23     first = alignmentColumn[0].upper()
    24     for i in alignmentColumn[1:]:
    25         if i.upper() != first: return False
    26     return True
    27 
    28 def gapRunCount(row):
    29     """Get the number of runs of gap characters."""
    30     return sum(k == "-" for k, v in groupby(row))
    31 
    32 def alignmentRowsFromColumns(columns):
    33     return imap(''.join, izip(*columns))
    34 
    35 def symbolSize(symbol, letterSize):
    36     if symbol == "\\": return 1
    37     if symbol == "/": return -1
    38     return letterSize
    39 
    40 def insertSize(row, letterSize):
    41     """Get the length of sequence included in the row."""
    42     return (len(row) - row.count("-")) * letterSize - 4 * row.count("/") - 2 * row.count("\\")
    43 
    44 def matchAndInsertSizes(alignmentColumns, letterSizes):
    45     """Get sizes of gapless blocks, and of the inserts between them."""
    46     letterSizeA, letterSizeB = letterSizes
    47     delSize = insSize = subSize = 0
    48     for x, y in alignmentColumns:
    49         if x == "-":
    50             if subSize:
    51                 if delSize or insSize: yield str(delSize) + ":" + str(insSize)
    52                 yield str(subSize)
    53                 delSize = insSize = subSize = 0
    54             insSize += symbolSize(y, letterSizeB)
    55         elif y == "-":
    56             if subSize:
    57                 if delSize or insSize: yield str(delSize) + ":" + str(insSize)
    58                 yield str(subSize)
    59                 delSize = insSize = subSize = 0
    60             delSize += symbolSize(x, letterSizeA)
    61         else:
    62             subSize += 1
    63     if delSize or insSize: yield str(delSize) + ":" + str(insSize)
    64     if subSize: yield str(subSize)
    65 
    66 ##### Routines for reading MAF format: #####
    67 
    68 def updateEvalueParameters(opts, line):
    69     for field in line.split():
    70         try:
    71             k, v = field.split("=")
    72             x = float(v)
    73             if k == "lambda":
    74                 opts.bitScoreA = x / math.log(2)
    75             if k == "K":
    76                 opts.bitScoreB = math.log(x, 2)
    77         except ValueError:
    78             pass
    79 
    80 def scoreAndEvalue(aLine):
    81     score = evalue = None
    82     for i in aLine.split():
    83         if i.startswith("score="):
    84             score = i[6:]
    85         elif i.startswith("E="):
    86             evalue = i[2:]
    87     return score, evalue
    88 
    89 def mafInput(opts, lines):
    90     aLine = ""
    91     sLines = []
    92     qLines = []
    93     pLines = []
    94     for line in lines:
    95         if line[0] == "s":
    96             junk, seqName, beg, span, strand, seqLen, row = line.split()
    97             beg = int(beg)
    98             span = int(span)
    99             seqLen = int(seqLen)
   100             if "\\" in row or "/" in row or len(row) - row.count("-") < span:
   101                 letterSize = 3
   102             else:
   103                 letterSize = 1
   104             fields = seqName, seqLen, strand, letterSize, beg, beg + span, row
   105             sLines.append(fields)
   106         elif line[0] == "a":
   107             aLine = line
   108         elif line[0] == "q":
   109             qLines.append(line)
   110         elif line[0] == "p":
   111             pLines.append(line)
   112         elif line.isspace():
   113             if sLines: yield aLine, sLines, qLines, pLines
   114             aLine = ""
   115             sLines = []
   116             qLines = []
   117             pLines = []
   118         elif line[0] == "#":
   119             updateEvalueParameters(opts, line)
   120             if opts.isKeepComments:
   121                 print line,
   122     if sLines: yield aLine, sLines, qLines, pLines
   123 
   124 def isJoinable(opts, oldMaf, newMaf):
   125     x = oldMaf[1]
   126     y = newMaf[1]
   127     if x[-1][2] == "-":
   128         x, y = y, x
   129     return all(i[:4] == j[:4] and i[5] <= j[4] and j[4] - i[5] <= opts.join
   130                for i, j in zip(x, y))
   131 
   132 def fixOrder(mafs):
   133     sLines = mafs[0][1]
   134     if sLines[-1][2] == "-":
   135         mafs.reverse()
   136 
   137 def mafGroupInput(opts, lines):
   138     x = []
   139     for i in mafInput(opts, lines):
   140         if x and not isJoinable(opts, x[-1], i):
   141             fixOrder(x)
   142             yield x
   143             x = []
   144         x.append(i)
   145     if x:
   146         fixOrder(x)
   147         yield x
   148 
   149 ##### Routines for converting to AXT format: #####
   150 
   151 axtCounter = count()
   152 
   153 def writeAxt(maf):
   154     aLine, sLines, qLines, pLines = maf
   155 
   156     if sLines[0][2] != "+":
   157         raise Exception("for AXT, the 1st strand in each alignment must be +")
   158 
   159     # Convert to AXT's 1-based coordinates:
   160     ranges = [(i[0], str(i[4] + 1), str(i[5]), i[2]) for i in sLines]
   161 
   162     head, body = ranges[0], ranges[1:]
   163 
   164     outWords = [str(axtCounter.next())]
   165     outWords.extend(head[:3])
   166     for i in body:
   167         outWords.extend(i)
   168 
   169     score, evalue = scoreAndEvalue(aLine)
   170     if score:
   171         outWords.append(score)
   172 
   173     print " ".join(outWords)
   174     for i in sLines:
   175         print i[6]
   176     print  # print a blank line at the end
   177 
   178 def mafConvertToAxt(opts, lines):
   179     for maf in mafInput(opts, lines):
   180         writeAxt(maf)
   181 
   182 ##### Routines for converting to tabular format: #####
   183 
   184 def writeTab(maf):
   185     aLine, sLines, qLines, pLines = maf
   186 
   187     score = "0"
   188     endWords = []
   189     for i in aLine.split():
   190         if   i.startswith("score="):
   191             score = i[6:]
   192         elif len(i) > 1:
   193             endWords.append(i)
   194 
   195     outWords = [score]
   196 
   197     for seqName, seqLen, strand, letterSize, beg, end, row in sLines:
   198         x = seqName, str(beg), str(end - beg), strand, str(seqLen)
   199         outWords.extend(x)
   200 
   201     letterSizes = [i[3] for i in sLines]
   202     rows = [i[6] for i in sLines]
   203     alignmentColumns = izip(*rows)
   204     gapWord = ",".join(matchAndInsertSizes(alignmentColumns, letterSizes))
   205     outWords.append(gapWord)
   206 
   207     print "\t".join(outWords + endWords)
   208 
   209 def mafConvertToTab(opts, lines):
   210     for maf in mafInput(opts, lines):
   211         writeTab(maf)
   212 
   213 ##### Routines for converting to PSL format: #####
   214 
   215 def pslBlocks(opts, mafs, outCounts):
   216     """Get sizes and start coordinates of gapless blocks in an alignment."""
   217     # repMatches is always zero
   218     # for proteins, nCount is always zero, because that's what BLATv34 does
   219     normalBases = "ACGTU"
   220     matches = mismatches = repMatches = nCount = 0
   221 
   222     for maf in mafs:
   223         sLines = maf[1]
   224         fieldsA, fieldsB = pairOrDie(sLines, "PSL")
   225         letterSizeA, begA, endA, rowA = fieldsA[3:7]
   226         letterSizeB, begB, endB, rowB = fieldsB[3:7]
   227 
   228         size = 0
   229         for x, y in izip(rowA.upper(), rowB.upper()):
   230             if x == "-":
   231                 if size:
   232                     yield size, begA, begB
   233                     begA += size * letterSizeA
   234                     begB += size * letterSizeB
   235                     size = 0
   236                 begB += symbolSize(y, letterSizeB)
   237             elif y == "-":
   238                 if size:
   239                     yield size, begA, begB
   240                     begA += size * letterSizeA
   241                     begB += size * letterSizeB
   242                     size = 0
   243                 begA += symbolSize(x, letterSizeA)
   244             else:
   245                 size += 1
   246                 if x in normalBases and y in normalBases or opts.protein:
   247                     if x == y:
   248                         matches += 1
   249                     else:
   250                         mismatches += 1
   251                 else:
   252                     nCount += 1
   253         if size:
   254             yield size, begA, begB
   255 
   256     outCounts[0:4] = matches, mismatches, repMatches, nCount
   257 
   258 def pslNumInserts(blocks, letterSizeA, letterSizeB):
   259     numInsertA = numInsertB = 0
   260     for i, x in enumerate(blocks):
   261         size, begA, begB = x
   262         if i:
   263             if begA > endA:
   264                 numInsertA += 1
   265             if begB > endB:
   266                 numInsertB += 1
   267         endA = begA + size * letterSizeA
   268         endB = begB + size * letterSizeB
   269     return numInsertA, numInsertB
   270 
   271 def pslCommaString(things):
   272     # UCSC software seems to prefer a trailing comma
   273     return ",".join(map(str, things)) + ","
   274 
   275 def pslEnds(seqLen, strand, beg, end):
   276     if strand == "-":
   277         return seqLen - end, seqLen - beg
   278     return beg, end
   279 
   280 def writePsl(opts, mafs):
   281     matchCounts = [0] * 4
   282     blocks = list(pslBlocks(opts, mafs, matchCounts))
   283     matches, mismatches, repMatches, nCount = matchCounts
   284     numGaplessColumns = sum(matchCounts)
   285 
   286     if not blocks:
   287         return
   288 
   289     fieldsA, fieldsB = mafs[0][1]
   290     headSize, headBegA, headBegB = blocks[0]
   291     tailSize, tailBegA, tailBegB = blocks[-1]
   292 
   293     seqNameA, seqLenA, strandA, letterSizeA = fieldsA[0:4]
   294     begA, endA = pslEnds(seqLenA, strandA, headBegA, tailBegA + tailSize)
   295     baseInsertA = endA - begA - numGaplessColumns * letterSizeA
   296 
   297     seqNameB, seqLenB, strandB, letterSizeB = fieldsB[0:4]
   298     begB, endB = pslEnds(seqLenB, strandB, headBegB, tailBegB + tailSize)
   299     baseInsertB = endB - begB - numGaplessColumns * letterSizeB
   300 
   301     numInsertA, numInsertB = pslNumInserts(blocks, letterSizeA, letterSizeB)
   302 
   303     strand = strandB
   304     if letterSizeA > 1 or letterSizeB > 1:
   305         strand += strandA
   306     elif strandA != "+":
   307         raise Exception("for non-translated PSL, the 1st strand in each alignment must be +")
   308 
   309     blockCount = len(blocks)
   310     blockSizes, blockStartsA, blockStartsB = map(pslCommaString, zip(*blocks))
   311 
   312     outWords = (matches, mismatches, repMatches, nCount,
   313                 numInsertB, baseInsertB, numInsertA, baseInsertA, strand,
   314                 seqNameB, seqLenB, begB, endB, seqNameA, seqLenA, begA, endA,
   315                 blockCount, blockSizes, blockStartsB, blockStartsA)
   316 
   317     print "\t".join(map(str, outWords))
   318 
   319 def mafConvertToPsl(opts, lines):
   320     if opts.join:
   321         for i in mafGroupInput(opts, lines):
   322             writePsl(opts, i)
   323     else:
   324         for i in mafInput(opts, lines):
   325             writePsl(opts, [i])
   326 
   327 ##### Routines for converting to SAM format: #####
   328 
   329 def readGroupId(readGroupItems):
   330     for i in readGroupItems:
   331         if i.startswith("ID:"):
   332             return i[3:]
   333     raise Exception("readgroup must include ID")
   334 
   335 def readSequenceLengths(fileNames):
   336     """Read name & length of topmost sequence in each maf block."""
   337     for i in fileNames:
   338         with open(i) as f:
   339             fields = None
   340             for line in f:
   341                 if fields:
   342                     if line.isspace():
   343                         fields = None
   344                 else:
   345                     if line[0] == "s":
   346                         fields = line.split()
   347                         yield fields[1], fields[5]
   348 
   349 def naturalSortKey(s):
   350     """Return a key that sorts strings in "natural" order."""
   351     return [(str, int)[k]("".join(v)) for k, v in groupby(s, str.isdigit)]
   352 
   353 def karyotypicSortKey(s):
   354     """Attempt to sort chromosomes in GATK's ridiculous order."""
   355     if s == "chrM": return []
   356     if s == "MT": return ["~"]
   357     return naturalSortKey(s)
   358 
   359 def copyDictFile(lines):
   360     for line in lines:
   361         if line.startswith("@SQ"):
   362             sys.stdout.write(line)
   363         elif not line[0] == "@":
   364             break
   365 
   366 def writeSamHeader(opts, fileNames):
   367     print "@HD\tVN:1.3\tSO:unsorted"
   368 
   369     if opts.dictionary:
   370         sequenceLengths = dict(readSequenceLengths(fileNames))
   371         for k in sorted(sequenceLengths, key=karyotypicSortKey):
   372             print "@SQ\tSN:%s\tLN:%s" % (k, sequenceLengths[k])
   373 
   374     if opts.dictfile:
   375         if opts.dictfile == "-":
   376             copyDictFile(sys.stdin)
   377         else:
   378             with open(opts.dictfile) as f:
   379                 copyDictFile(f)
   380 
   381     if opts.readgroup:
   382         print "@RG\t" + "\t".join(opts.readgroup.split())
   383 
   384 mapqMissing = "255"
   385 mapqMaximum = "254"
   386 mapqMaximumNum = float(mapqMaximum)
   387 
   388 def mapqFromProb(probString):
   389     try: p = float(probString)
   390     except ValueError: raise Exception("bad probability: " + probString)
   391     if p < 0 or p > 1: raise Exception("bad probability: " + probString)
   392     if p == 0: return mapqMaximum
   393     phred = -10 * math.log(p, 10)
   394     if phred >= mapqMaximumNum: return mapqMaximum
   395     return str(int(round(phred)))
   396 
   397 def cigarParts(beg, alignmentColumns, end):
   398     if beg: yield str(beg) + "H"
   399 
   400     # (doesn't handle translated alignments)
   401     # uses "read-ahead" technique, aiming to be as fast as possible:
   402     isActive = True
   403     for x, y in alignmentColumns: break
   404     else: isActive = False
   405     while isActive:
   406         size = 1
   407         if x == y:  # xxx assumes no gap-gap columns, ignores ambiguous bases
   408             for x, y in alignmentColumns:
   409                 if x != y: break
   410                 size += 1
   411             else: isActive = False
   412             yield str(size) + "="
   413         elif x == "-":
   414             for x, y in alignmentColumns:
   415                 if x != "-": break
   416                 size += 1
   417             else: isActive = False
   418             yield str(size) + "I"
   419         elif y == "-":
   420             for x, y in alignmentColumns:
   421                 if y != "-": break
   422                 size += 1
   423             else: isActive = False
   424             yield str(size) + "D"
   425         else:
   426             for x, y in alignmentColumns:
   427                 if x == y or x == "-" or y == "-": break
   428                 size += 1
   429             else: isActive = False
   430             yield str(size) + "X"
   431 
   432     if end: yield str(end) + "H"
   433 
   434 def writeSam(readGroup, maf):
   435     aLine, sLines, qLines, pLines = maf
   436     fieldsA, fieldsB = pairOrDie(sLines, "SAM")
   437     seqNameA, seqLenA, strandA, letterSizeA, begA, endA, rowA = fieldsA
   438     seqNameB, seqLenB, strandB, letterSizeB, begB, endB, rowB = fieldsB
   439 
   440     if letterSizeA > 1 or letterSizeB > 1:
   441         raise Exception("this looks like translated DNA - can't convert to SAM format")
   442 
   443     if strandA != "+":
   444         raise Exception("for SAM, the 1st strand in each alignment must be +")
   445 
   446     score = None
   447     evalue = None
   448     mapq = mapqMissing
   449     for i in aLine.split():
   450         if i.startswith("score="):
   451             v = i[6:]
   452             if v.isdigit(): score = "AS:i:" + v  # it must be an integer
   453         elif i.startswith("E="):
   454             evalue = "EV:Z:" + i[2:]
   455         elif i.startswith("mismap="):
   456             mapq = mapqFromProb(i[7:])
   457 
   458     pos = str(begA + 1)  # convert to 1-based coordinate
   459 
   460     alignmentColumns = zip(rowA.upper(), rowB.upper())
   461 
   462     revBegB = seqLenB - endB
   463     cigar = "".join(cigarParts(begB, iter(alignmentColumns), revBegB))
   464 
   465     seq = rowB.translate(None, "-")
   466 
   467     qual = "*"
   468     if qLines:
   469         qFields = qLines[-1].split()
   470         if qFields[1] == seqNameB:
   471             qual = ''.join(j for i, j in izip(rowB, qFields[2]) if i != "-")
   472 
   473     # It's hard to get all the pair info, so this is very
   474     # incomplete, but hopefully good enough.
   475     # I'm not sure whether to add 2 and/or 8 to flag.
   476     if seqNameB.endswith("/1"):
   477         seqNameB = seqNameB[:-2]
   478         if strandB == "+": flag = "99"  # 1 + 2 + 32 + 64
   479         else:              flag = "83"  # 1 + 2 + 16 + 64
   480     elif seqNameB.endswith("/2"):
   481         seqNameB = seqNameB[:-2]
   482         if strandB == "+": flag = "163"  # 1 + 2 + 32 + 128
   483         else:              flag = "147"  # 1 + 2 + 16 + 128
   484     else:
   485         if strandB == "+": flag = "0"
   486         else:              flag = "16"
   487 
   488     editDistance = sum(x != y for x, y in alignmentColumns)
   489     # no special treatment of ambiguous bases: might be a minor bug
   490     editDistance = "NM:i:" + str(editDistance)
   491 
   492     out = [seqNameB, flag, seqNameA, pos, mapq, cigar, "*\t0\t0", seq, qual]
   493     out.append(editDistance)
   494     if score: out.append(score)
   495     if evalue: out.append(evalue)
   496     if readGroup: out.append(readGroup)
   497     print "\t".join(out)
   498 
   499 def mafConvertToSam(opts, lines):
   500     readGroup = ""
   501     if opts.readgroup:
   502         readGroup = "RG:Z:" + readGroupId(opts.readgroup.split())
   503     for maf in mafInput(opts, lines):
   504         writeSam(readGroup, maf)
   505 
   506 ##### Routines for converting to BLAST-like format: #####
   507 
   508 def pairwiseMatchSymbol(alignmentColumn):
   509     if isMatch(alignmentColumn):
   510         return "|"
   511     else:
   512         return " "
   513 
   514 def strandText(strand):
   515     if strand == "+":
   516         return "Plus"
   517     else:
   518         return "Minus"
   519 
   520 def blastBegCoordinate(zeroBasedCoordinate, strand, seqLen):
   521     if strand == "+":
   522         return str(zeroBasedCoordinate + 1)
   523     else:
   524         return str(seqLen - zeroBasedCoordinate)
   525 
   526 def blastEndCoordinate(zeroBasedCoordinate, strand, seqLen):
   527     if strand == "+":
   528         return str(zeroBasedCoordinate)
   529     else:
   530         return str(seqLen - zeroBasedCoordinate + 1)
   531 
   532 def nextCoordinate(coordinate, row, letterSize):
   533     return coordinate + insertSize(row, letterSize)
   534 
   535 def chunker(things, chunkSize):
   536     for i in range(0, len(things), chunkSize):
   537         yield things[i:i+chunkSize]
   538 
   539 def blastChunker(sLines, lineSize, alignmentColumns):
   540     seqLens = [i[1] for i in sLines]
   541     strands = [i[2] for i in sLines]
   542     letterSizes = [i[3] for i in sLines]
   543     coords = [i[4] for i in sLines]
   544     for chunkCols in chunker(alignmentColumns, lineSize):
   545         chunkRows = list(alignmentRowsFromColumns(chunkCols))
   546         begs = map(blastBegCoordinate, coords, strands, seqLens)
   547         coords = map(nextCoordinate, coords, chunkRows, letterSizes)
   548         ends = map(blastEndCoordinate, coords, strands, seqLens)
   549         yield chunkCols, chunkRows, begs, ends
   550 
   551 def writeBlast(opts, maf, oldQueryName):
   552     aLine, sLines, qLines, pLines = maf
   553     fieldsA, fieldsB = pairOrDie(sLines, "Blast")
   554     seqNameA, seqLenA, strandA, letterSizeA, begA, endA, rowA = fieldsA
   555     seqNameB, seqLenB, strandB, letterSizeB, begB, endB, rowB = fieldsB
   556 
   557     if seqNameB != oldQueryName:
   558         print "Query= " + seqNameB
   559         print "         (%s letters)" % seqLenB
   560         print
   561 
   562     print ">" + seqNameA
   563     print "          Length = %s" % seqLenA
   564     print
   565 
   566     score, evalue = scoreAndEvalue(aLine)
   567 
   568     if score and opts.bitScoreA is not None and opts.bitScoreB is not None:
   569         bitScore = opts.bitScoreA * float(score) - opts.bitScoreB
   570         scoreLine = " Score = %.3g bits (%s)" % (bitScore, score)
   571     else:
   572         scoreLine = " Score = %s" % score
   573 
   574     if evalue:
   575         scoreLine += ", Expect = %s" % evalue
   576 
   577     print scoreLine
   578 
   579     alignmentColumns = zip(rowA, rowB)
   580 
   581     alnSize = len(alignmentColumns)
   582     matches = sum(x.upper() == y.upper() for x, y in alignmentColumns)
   583     matchPercent = 100 * matches // alnSize  # round down, like BLAST
   584     identLine = " Identities = %s/%s (%s%%)" % (matches, alnSize, matchPercent)
   585     gaps = rowA.count("-") + rowB.count("-")
   586     if gaps:
   587         gapPercent = 100 * gaps // alnSize  # round down, like BLAST
   588         identLine += ", Gaps = %s/%s (%s%%)" % (gaps, alnSize, gapPercent)
   589     print identLine
   590 
   591     print " Strand = %s / %s" % (strandText(strandB), strandText(strandA))
   592     print
   593 
   594     for chunk in blastChunker(sLines, opts.linesize, alignmentColumns):
   595         cols, rows, begs, ends = chunk
   596         begWidth = maxlen(begs)
   597         matchSymbols = ''.join(map(pairwiseMatchSymbol, cols))
   598         print "Query: %-*s %s %s" % (begWidth, begs[1], rows[1], ends[1])
   599         print "       %-*s %s"    % (begWidth, " ", matchSymbols)
   600         print "Sbjct: %-*s %s %s" % (begWidth, begs[0], rows[0], ends[0])
   601         print
   602 
   603 def mafConvertToBlast(opts, lines):
   604     oldQueryName = ""
   605     for maf in mafInput(opts, lines):
   606         writeBlast(opts, maf, oldQueryName)
   607         sLines = maf[1]
   608         oldQueryName = sLines[1][0]
   609 
   610 def blastDataFromMafFields(fields):
   611     seqName, seqLen, strand, letterSize, beg, end, row = fields
   612     if strand == "+":
   613         beg += 1
   614     else:
   615         beg = seqLen - beg
   616         end = seqLen - end + 1
   617     return seqName, str(beg), str(end), row.upper()
   618 
   619 def writeBlastTab(opts, maf):
   620     aLine, sLines, qLines, pLines = maf
   621     fieldsA, fieldsB = pairOrDie(sLines, "BlastTab")
   622     seqNameA, begA, endA, rowA = blastDataFromMafFields(fieldsA)
   623     seqNameB, begB, endB, rowB = blastDataFromMafFields(fieldsB)
   624 
   625     alignmentColumns = zip(rowA, rowB)
   626     alnSize = len(alignmentColumns)
   627     matches = sum(x == y for x, y in alignmentColumns)
   628     matchPercent = "%.2f" % (100.0 * matches / alnSize)
   629     mismatches = alnSize - matches - rowA.count("-") - rowB.count("-")
   630     gapOpens = gapRunCount(rowA) + gapRunCount(rowB)
   631 
   632     out = [seqNameB, seqNameA, matchPercent, str(alnSize), str(mismatches),
   633            str(gapOpens), begB, endB, begA, endA]
   634 
   635     score, evalue = scoreAndEvalue(aLine)
   636     if evalue:
   637         out.append(evalue)
   638         if score and opts.bitScoreA is not None and opts.bitScoreB is not None:
   639             bitScore = opts.bitScoreA * float(score) - opts.bitScoreB
   640             out.append("%.3g" % bitScore)
   641 
   642     print "\t".join(out)
   643 
   644 def mafConvertToBlastTab(opts, lines):
   645     for maf in mafInput(opts, lines):
   646         writeBlastTab(opts, maf)
   647 
   648 ##### Routines for converting to HTML format: #####
   649 
   650 def writeHtmlHeader():
   651     print '''
   652 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
   653  "http://www.w3.org/TR/html4/strict.dtd">
   654 <html lang="en"><head>
   655 <meta http-equiv="Content-type" content="text/html; charset=UTF-8">
   656 <title>Reliable Alignments</title>
   657 <style type="text/css">
   658 /* Try to force monospace, working around browser insanity: */
   659 pre {font-family: "Courier New", monospace, serif; font-size: 0.8125em}
   660 .a {background-color: #3333FF}
   661 .b {background-color: #9933FF}
   662 .c {background-color: #FF66CC}
   663 .d {background-color: #FF3333}
   664 .e {background-color: #FF9933}
   665 .f {background-color: #FFFF00}
   666 .key {display:inline; margin-right:2em}
   667 </style>
   668 </head><body>
   669 
   670 <div style="line-height:1">
   671 <pre class="key"><span class="a">  </span> prob &gt; 0.999</pre>
   672 <pre class="key"><span class="b">  </span> prob &gt; 0.99 </pre>
   673 <pre class="key"><span class="c">  </span> prob &gt; 0.95 </pre>
   674 <pre class="key"><span class="d">  </span> prob &gt; 0.9  </pre>
   675 <pre class="key"><span class="e">  </span> prob &gt; 0.5  </pre>
   676 <pre class="key"><span class="f">  </span> prob &le; 0.5  </pre>
   677 </div>
   678 '''
   679 
   680 def probabilityClass(probabilityColumn):
   681     p = ord(min(probabilityColumn)) - 33
   682     if   p >= 30: return 'a'
   683     elif p >= 20: return 'b'
   684     elif p >= 13: return 'c'
   685     elif p >= 10: return 'd'
   686     elif p >=  3: return 'e'
   687     else: return 'f'
   688 
   689 def identicalRuns(s):
   690     """Yield (item, start, end) for each run of identical items in s."""
   691     beg = 0
   692     for k, v in groupby(s):
   693         end = beg + len(list(v))
   694         yield k, beg, end
   695         beg = end
   696 
   697 def htmlSpan(text, classRun):
   698     key, beg, end = classRun
   699     textbit = text[beg:end]
   700     if key: return '<span class="%s">%s</span>' % (key, textbit)
   701     else: return textbit
   702 
   703 def multipleMatchSymbol(alignmentColumn):
   704     if isMatch(alignmentColumn): return "*"
   705     else: return " "
   706 
   707 def writeHtml(opts, maf):
   708     aLine, sLines, qLines, pLines = maf
   709 
   710     scoreLine = "Alignment"
   711     score, evalue = scoreAndEvalue(aLine)
   712     if score:
   713         scoreLine += " score=" + score
   714         if evalue:
   715             scoreLine += ", expect=" + evalue
   716     print "<h3>%s:</h3>" % scoreLine
   717 
   718     if pLines:
   719         probRows = [i.split()[1] for i in pLines]
   720         probCols = izip(*probRows)
   721         classes = imap(probabilityClass, probCols)
   722     else:
   723         classes = repeat(None)
   724 
   725     seqNames = [i[0] for i in sLines]
   726     nameWidth = maxlen(seqNames)
   727     rows = [i[6] for i in sLines]
   728     alignmentColumns = zip(*rows)
   729 
   730     print '<pre>'
   731     for chunk in blastChunker(sLines, opts.linesize, alignmentColumns):
   732         cols, rows, begs, ends = chunk
   733         begWidth = maxlen(begs)
   734         endWidth = maxlen(ends)
   735         matchSymbols = ''.join(map(multipleMatchSymbol, cols))
   736         classChunk = islice(classes, opts.linesize)
   737         classRuns = list(identicalRuns(classChunk))
   738         for n, b, r, e in zip(seqNames, begs, rows, ends):
   739             spans = [htmlSpan(r, i) for i in classRuns]
   740             spans = ''.join(spans)
   741             formatParams = nameWidth, n, begWidth, b, spans, endWidth, e
   742             print '%-*s %*s %s %*s' % formatParams
   743         print ' ' * nameWidth, ' ' * begWidth, matchSymbols
   744         print
   745     print '</pre>'
   746 
   747 def mafConvertToHtml(opts, lines):
   748     for maf in mafInput(opts, lines):
   749         writeHtml(opts, maf)
   750 
   751 ##### Main program: #####
   752 
   753 def isFormat(myString, myFormat):
   754     return myFormat.startswith(myString)
   755 
   756 def mafConvertOneFile(opts, formatName, lines):
   757     if   isFormat(formatName, "axt"):
   758         mafConvertToAxt(opts, lines)
   759     elif isFormat(formatName, "blast"):
   760         mafConvertToBlast(opts, lines)
   761     elif isFormat(formatName, "blasttab"):
   762         mafConvertToBlastTab(opts, lines)
   763     elif isFormat(formatName, "html"):
   764         mafConvertToHtml(opts, lines)
   765     elif isFormat(formatName, "psl"):
   766         mafConvertToPsl(opts, lines)
   767     elif isFormat(formatName, "sam"):
   768         mafConvertToSam(opts, lines)
   769     elif isFormat(formatName, "tabular"):
   770         mafConvertToTab(opts, lines)
   771     else:
   772         raise Exception("unknown format: " + formatName)
   773 
   774 def mafConvert(opts, args):
   775     formatName = args[0].lower()
   776     fileNames = args[1:]
   777 
   778     opts.isKeepComments = False
   779     opts.bitScoreA = None
   780     opts.bitScoreB = None
   781 
   782     if not opts.noheader:
   783         if isFormat(formatName, "html"):
   784             writeHtmlHeader()
   785         if isFormat(formatName, "sam"):
   786             writeSamHeader(opts, fileNames)
   787         if isFormat(formatName, "tabular"):
   788             opts.isKeepComments = True
   789 
   790     if fileNames:
   791         for i in fileNames:
   792             if i == "-":
   793                 mafConvertOneFile(opts, formatName, sys.stdin)
   794             else:
   795                 with open(i) as f:
   796                     mafConvertOneFile(opts, formatName, f)
   797     else:
   798         mafConvertOneFile(opts, formatName, sys.stdin)
   799 
   800     if not opts.noheader:
   801         if isFormat(formatName, "html"):
   802             print "</body></html>"
   803 
   804 if __name__ == "__main__":
   805     signal.signal(signal.SIGPIPE, signal.SIG_DFL)  # avoid silly error message
   806 
   807     usage = """
   808   %prog --help
   809   %prog axt mafFile(s)
   810   %prog blast mafFile(s)
   811   %prog blasttab mafFile(s)
   812   %prog html mafFile(s)
   813   %prog psl mafFile(s)
   814   %prog sam mafFile(s)
   815   %prog tab mafFile(s)"""
   816 
   817     description = "Read MAF-format alignments & write them in another format."
   818 
   819     op = optparse.OptionParser(usage=usage, description=description)
   820     op.add_option("-p", "--protein", action="store_true",
   821                   help="assume protein alignments, for psl match counts")
   822     op.add_option("-j", "--join", type="float", metavar="N",
   823                   help="join co-linear alignments separated by <= N letters")
   824     op.add_option("-n", "--noheader", action="store_true",
   825                   help="omit any header lines from the output")
   826     op.add_option("-d", "--dictionary", action="store_true",
   827                   help="include dictionary of sequence lengths in sam format")
   828     op.add_option("-f", "--dictfile",
   829                   help="get sequence dictionary from DICTFILE")
   830     op.add_option("-r", "--readgroup",
   831                   help="read group info for sam format")
   832     op.add_option("-l", "--linesize", type="int", default=60, #metavar="CHARS",
   833                   help="line length for blast and html formats (default: %default)")
   834     opts, args = op.parse_args()
   835     if opts.linesize <= 0: op.error("option -l: should be >= 1")
   836     if opts.dictionary and opts.dictfile: op.error("can't use both -d and -f")
   837     if len(args) < 1: op.error("I need a format-name and some MAF alignments")
   838     if opts.dictionary and (len(args) == 1 or "-" in args[1:]):
   839         op.error("need file (not pipe) with option -d")
   840 
   841     try: mafConvert(opts, args)
   842     except Exception, e:
   843         prog = os.path.basename(sys.argv[0])
   844         sys.exit(prog + ": error: " + str(e))