scripts/maf-convert
author Martin C. Frith
Fri Jun 02 18:40:29 2017 +0900 (2017-06-02)
changeset 863 6a4915d5b5cb
parent 831 c5591feba7a2
child 875 592295375eb1
permissions -rwxr-xr-x
last-dotplot: get bp-per-pixel faster
Martin@101
     1
#! /usr/bin/env python
Martin@413
     2
# Copyright 2010, 2011, 2013, 2014 Martin C. Frith
Martin@140
     3
# Read MAF-format alignments: write them in other formats.
Martin@775
     4
# Seems to work with Python 2.x, x>=6
Martin@101
     5
Martin@150
     6
# By "MAF" we mean "multiple alignment format" described in the UCSC
Martin@150
     7
# Genome FAQ, not e.g. "MIRA assembly format".
Martin@150
     8
Martin@140
     9
from itertools import *
Martin@789
    10
import math, optparse, os, signal, sys
Martin@101
    11
Martin@140
    12
def maxlen(s):
Martin@140
    13
    return max(map(len, s))
Martin@101
    14
Martin@790
    15
def pairOrDie(sLines, formatName):
Martin@790
    16
    if len(sLines) != 2:
Martin@790
    17
        e = "for %s, each alignment must have 2 sequences" % formatName
Martin@790
    18
        raise Exception(e)
Martin@790
    19
    return sLines
Martin@790
    20
Martin@140
    21
def isMatch(alignmentColumn):
Martin@148
    22
    # No special treatment of ambiguous bases/residues: same as NCBI BLAST.
Martin@140
    23
    first = alignmentColumn[0].upper()
Martin@140
    24
    for i in alignmentColumn[1:]:
Martin@140
    25
        if i.upper() != first: return False
Martin@140
    26
    return True
Martin@140
    27
Martin@776
    28
def gapRunCount(row):
Martin@776
    29
    """Get the number of runs of gap characters."""
Martin@776
    30
    return sum(k == "-" for k, v in groupby(row))
Martin@776
    31
Martin@777
    32
def alignmentRowsFromColumns(columns):
Martin@777
    33
    return imap(''.join, izip(*columns))
Martin@777
    34
Martin@773
    35
def symbolSize(symbol, letterSize):
Martin@773
    36
    if symbol == "\\": return 1
Martin@773
    37
    if symbol == "/": return -1
Martin@773
    38
    return letterSize
Martin@773
    39
Martin@776
    40
def insertSize(row, letterSize):
Martin@776
    41
    """Get the length of sequence included in the row."""
Martin@776
    42
    return (len(row) - row.count("-")) * letterSize - 4 * row.count("/") - 2 * row.count("\\")
Martin@773
    43
Martin@140
    44
def matchAndInsertSizes(alignmentColumns, letterSizes):
Martin@140
    45
    """Get sizes of gapless blocks, and of the inserts between them."""
Martin@854
    46
    letterSizeA, letterSizeB = letterSizes
Martin@854
    47
    delSize = insSize = subSize = 0
Martin@854
    48
    for x, y in alignmentColumns:
Martin@854
    49
        if x == "-":
Martin@854
    50
            if subSize:
Martin@854
    51
                if delSize or insSize: yield str(delSize) + ":" + str(insSize)
Martin@854
    52
                yield str(subSize)
Martin@854
    53
                delSize = insSize = subSize = 0
Martin@854
    54
            insSize += symbolSize(y, letterSizeB)
Martin@854
    55
        elif y == "-":
Martin@854
    56
            if subSize:
Martin@854
    57
                if delSize or insSize: yield str(delSize) + ":" + str(insSize)
Martin@854
    58
                yield str(subSize)
Martin@854
    59
                delSize = insSize = subSize = 0
Martin@854
    60
            delSize += symbolSize(x, letterSizeA)
Martin@140
    61
        else:
Martin@854
    62
            subSize += 1
Martin@854
    63
    if delSize or insSize: yield str(delSize) + ":" + str(insSize)
Martin@854
    64
    if subSize: yield str(subSize)
Martin@101
    65
Martin@783
    66
##### Routines for reading MAF format: #####
Martin@783
    67
Martin@783
    68
def updateEvalueParameters(opts, line):
Martin@783
    69
    for field in line.split():
Martin@783
    70
        try:
Martin@783
    71
            k, v = field.split("=")
Martin@783
    72
            x = float(v)
Martin@783
    73
            if k == "lambda":
Martin@783
    74
                opts.bitScoreA = x / math.log(2)
Martin@783
    75
            if k == "K":
Martin@783
    76
                opts.bitScoreB = math.log(x, 2)
Martin@783
    77
        except ValueError:
Martin@783
    78
            pass
Martin@783
    79
Martin@785
    80
def scoreAndEvalue(aLine):
Martin@785
    81
    score = evalue = None
Martin@785
    82
    for i in aLine.split():
Martin@785
    83
        if i.startswith("score="):
Martin@785
    84
            score = i[6:]
Martin@785
    85
        elif i.startswith("E="):
Martin@785
    86
            evalue = i[2:]
Martin@785
    87
    return score, evalue
Martin@785
    88
Martin@774
    89
def mafInput(opts, lines):
Martin@784
    90
    aLine = ""
Martin@783
    91
    sLines = []
Martin@783
    92
    qLines = []
Martin@783
    93
    pLines = []
Martin@783
    94
    for line in lines:
Martin@785
    95
        if line[0] == "s":
Martin@787
    96
            junk, seqName, beg, span, strand, seqLen, row = line.split()
Martin@787
    97
            beg = int(beg)
Martin@787
    98
            span = int(span)
Martin@787
    99
            seqLen = int(seqLen)
Martin@788
   100
            if "\\" in row or "/" in row or len(row) - row.count("-") < span:
Martin@788
   101
                letterSize = 3
Martin@788
   102
            else:
Martin@788
   103
                letterSize = 1
Martin@797
   104
            fields = seqName, seqLen, strand, letterSize, beg, beg + span, row
Martin@785
   105
            sLines.append(fields)
Martin@785
   106
        elif line[0] == "a":
Martin@785
   107
            aLine = line
Martin@785
   108
        elif line[0] == "q":
Martin@785
   109
            qLines.append(line)
Martin@785
   110
        elif line[0] == "p":
Martin@785
   111
            pLines.append(line)
Martin@785
   112
        elif line.isspace():
Martin@784
   113
            if sLines: yield aLine, sLines, qLines, pLines
Martin@784
   114
            aLine = ""
Martin@783
   115
            sLines = []
Martin@783
   116
            qLines = []
Martin@783
   117
            pLines = []
Martin@783
   118
        elif line[0] == "#":
Martin@783
   119
            updateEvalueParameters(opts, line)
Martin@783
   120
            if opts.isKeepComments:
Martin@783
   121
                print line,
Martin@784
   122
    if sLines: yield aLine, sLines, qLines, pLines
Martin@773
   123
Martin@798
   124
def isJoinable(opts, oldMaf, newMaf):
Martin@798
   125
    x = oldMaf[1]
Martin@798
   126
    y = newMaf[1]
Martin@798
   127
    if x[-1][2] == "-":
Martin@798
   128
        x, y = y, x
Martin@798
   129
    return all(i[:4] == j[:4] and i[5] <= j[4] and j[4] - i[5] <= opts.join
Martin@798
   130
               for i, j in zip(x, y))
Martin@798
   131
Martin@798
   132
def fixOrder(mafs):
Martin@798
   133
    sLines = mafs[0][1]
Martin@798
   134
    if sLines[-1][2] == "-":
Martin@798
   135
        mafs.reverse()
Martin@798
   136
Martin@798
   137
def mafGroupInput(opts, lines):
Martin@798
   138
    x = []
Martin@798
   139
    for i in mafInput(opts, lines):
Martin@798
   140
        if x and not isJoinable(opts, x[-1], i):
Martin@798
   141
            fixOrder(x)
Martin@798
   142
            yield x
Martin@798
   143
            x = []
Martin@798
   144
        x.append(i)
Martin@798
   145
    if x:
Martin@798
   146
        fixOrder(x)
Martin@798
   147
        yield x
Martin@798
   148
Martin@101
   149
##### Routines for converting to AXT format: #####
Martin@101
   150
Martin@140
   151
axtCounter = count()
Martin@101
   152
Martin@101
   153
def writeAxt(maf):
Martin@789
   154
    aLine, sLines, qLines, pLines = maf
Martin@789
   155
Martin@789
   156
    if sLines[0][2] != "+":
Martin@101
   157
        raise Exception("for AXT, the 1st strand in each alignment must be +")
Martin@140
   158
Martin@140
   159
    # Convert to AXT's 1-based coordinates:
Martin@797
   160
    ranges = [(i[0], str(i[4] + 1), str(i[5]), i[2]) for i in sLines]
Martin@140
   161
Martin@779
   162
    head, body = ranges[0], ranges[1:]
Martin@140
   163
Martin@779
   164
    outWords = [str(axtCounter.next())]
Martin@140
   165
    outWords.extend(head[:3])
Martin@779
   166
    for i in body:
Martin@779
   167
        outWords.extend(i)
Martin@779
   168
Martin@789
   169
    score, evalue = scoreAndEvalue(aLine)
Martin@786
   170
    if score:
Martin@786
   171
        outWords.append(score)
Martin@140
   172
Martin@790
   173
    print " ".join(outWords)
Martin@789
   174
    for i in sLines:
Martin@789
   175
        print i[6]
Martin@101
   176
    print  # print a blank line at the end
Martin@101
   177
Martin@778
   178
def mafConvertToAxt(opts, lines):
Martin@778
   179
    for maf in mafInput(opts, lines):
Martin@789
   180
        writeAxt(maf)
Martin@778
   181
Martin@101
   182
##### Routines for converting to tabular format: #####
Martin@101
   183
Martin@140
   184
def writeTab(maf):
Martin@488
   185
    aLine, sLines, qLines, pLines = maf
Martin@488
   186
Martin@488
   187
    score = "0"
Martin@568
   188
    endWords = []
Martin@784
   189
    for i in aLine.split():
Martin@779
   190
        if   i.startswith("score="):
Martin@779
   191
            score = i[6:]
Martin@779
   192
        elif len(i) > 1:
Martin@779
   193
            endWords.append(i)
Martin@488
   194
Martin@779
   195
    outWords = [score]
Martin@101
   196
Martin@797
   197
    for seqName, seqLen, strand, letterSize, beg, end, row in sLines:
Martin@797
   198
        x = seqName, str(beg), str(end - beg), strand, str(seqLen)
Martin@787
   199
        outWords.extend(x)
Martin@101
   200
Martin@788
   201
    letterSizes = [i[3] for i in sLines]
Martin@779
   202
    rows = [i[6] for i in sLines]
Martin@797
   203
    alignmentColumns = izip(*rows)
Martin@797
   204
    gapWord = ",".join(matchAndInsertSizes(alignmentColumns, letterSizes))
Martin@140
   205
    outWords.append(gapWord)
Martin@101
   206
Martin@568
   207
    print "\t".join(outWords + endWords)
Martin@140
   208
Martin@778
   209
def mafConvertToTab(opts, lines):
Martin@778
   210
    for maf in mafInput(opts, lines):
Martin@778
   211
        writeTab(maf)
Martin@778
   212
Martin@140
   213
##### Routines for converting to PSL format: #####
Martin@140
   214
Martin@798
   215
def pslBlocks(opts, mafs, outCounts):
Martin@140
   216
    """Get sizes and start coordinates of gapless blocks in an alignment."""
Martin@795
   217
    # repMatches is always zero
Martin@795
   218
    # for proteins, nCount is always zero, because that's what BLATv34 does
Martin@795
   219
    normalBases = "ACGTU"
Martin@795
   220
    matches = mismatches = repMatches = nCount = 0
Martin@795
   221
Martin@798
   222
    for maf in mafs:
Martin@798
   223
        sLines = maf[1]
Martin@798
   224
        fieldsA, fieldsB = pairOrDie(sLines, "PSL")
Martin@798
   225
        letterSizeA, begA, endA, rowA = fieldsA[3:7]
Martin@798
   226
        letterSizeB, begB, endB, rowB = fieldsB[3:7]
Martin@794
   227
Martin@798
   228
        size = 0
Martin@798
   229
        for x, y in izip(rowA.upper(), rowB.upper()):
Martin@798
   230
            if x == "-":
Martin@798
   231
                if size:
Martin@798
   232
                    yield size, begA, begB
Martin@798
   233
                    begA += size * letterSizeA
Martin@798
   234
                    begB += size * letterSizeB
Martin@798
   235
                    size = 0
Martin@798
   236
                begB += symbolSize(y, letterSizeB)
Martin@798
   237
            elif y == "-":
Martin@798
   238
                if size:
Martin@798
   239
                    yield size, begA, begB
Martin@798
   240
                    begA += size * letterSizeA
Martin@798
   241
                    begB += size * letterSizeB
Martin@798
   242
                    size = 0
Martin@798
   243
                begA += symbolSize(x, letterSizeA)
Martin@798
   244
            else:
Martin@798
   245
                size += 1
Martin@798
   246
                if x in normalBases and y in normalBases or opts.protein:
Martin@798
   247
                    if x == y:
Martin@798
   248
                        matches += 1
Martin@798
   249
                    else:
Martin@798
   250
                        mismatches += 1
Martin@795
   251
                else:
Martin@798
   252
                    nCount += 1
Martin@798
   253
        if size:
Martin@798
   254
            yield size, begA, begB
Martin@140
   255
Martin@795
   256
    outCounts[0:4] = matches, mismatches, repMatches, nCount
Martin@795
   257
Martin@792
   258
def pslNumInserts(blocks, letterSizeA, letterSizeB):
Martin@792
   259
    numInsertA = numInsertB = 0
Martin@792
   260
    for i, x in enumerate(blocks):
Martin@792
   261
        size, begA, begB = x
Martin@792
   262
        if i:
Martin@792
   263
            if begA > endA:
Martin@792
   264
                numInsertA += 1
Martin@792
   265
            if begB > endB:
Martin@792
   266
                numInsertB += 1
Martin@792
   267
        endA = begA + size * letterSizeA
Martin@792
   268
        endB = begB + size * letterSizeB
Martin@792
   269
    return numInsertA, numInsertB
Martin@792
   270
Martin@140
   271
def pslCommaString(things):
Martin@140
   272
    # UCSC software seems to prefer a trailing comma
Martin@781
   273
    return ",".join(map(str, things)) + ","
Martin@140
   274
Martin@831
   275
def pslEnds(seqLen, strand, beg, end):
Martin@794
   276
    if strand == "-":
Martin@831
   277
        return seqLen - end, seqLen - beg
Martin@831
   278
    return beg, end
Martin@140
   279
Martin@798
   280
def writePsl(opts, mafs):
Martin@795
   281
    matchCounts = [0] * 4
Martin@798
   282
    blocks = list(pslBlocks(opts, mafs, matchCounts))
Martin@148
   283
    matches, mismatches, repMatches, nCount = matchCounts
Martin@148
   284
    numGaplessColumns = sum(matchCounts)
Martin@140
   285
Martin@831
   286
    if not blocks:
Martin@831
   287
        return
Martin@794
   288
Martin@831
   289
    fieldsA, fieldsB = mafs[0][1]
Martin@831
   290
    headSize, headBegA, headBegB = blocks[0]
Martin@831
   291
    tailSize, tailBegA, tailBegB = blocks[-1]
Martin@831
   292
Martin@831
   293
    seqNameA, seqLenA, strandA, letterSizeA = fieldsA[0:4]
Martin@831
   294
    begA, endA = pslEnds(seqLenA, strandA, headBegA, tailBegA + tailSize)
Martin@796
   295
    baseInsertA = endA - begA - numGaplessColumns * letterSizeA
Martin@796
   296
Martin@831
   297
    seqNameB, seqLenB, strandB, letterSizeB = fieldsB[0:4]
Martin@831
   298
    begB, endB = pslEnds(seqLenB, strandB, headBegB, tailBegB + tailSize)
Martin@796
   299
    baseInsertB = endB - begB - numGaplessColumns * letterSizeB
Martin@140
   300
Martin@792
   301
    numInsertA, numInsertB = pslNumInserts(blocks, letterSizeA, letterSizeB)
Martin@140
   302
Martin@792
   303
    strand = strandB
Martin@792
   304
    if letterSizeA > 1 or letterSizeB > 1:
Martin@792
   305
        strand += strandA
Martin@792
   306
    elif strandA != "+":
Martin@140
   307
        raise Exception("for non-translated PSL, the 1st strand in each alignment must be +")
Martin@140
   308
Martin@140
   309
    blockCount = len(blocks)
Martin@782
   310
    blockSizes, blockStartsA, blockStartsB = map(pslCommaString, zip(*blocks))
Martin@140
   311
Martin@140
   312
    outWords = (matches, mismatches, repMatches, nCount,
Martin@782
   313
                numInsertB, baseInsertB, numInsertA, baseInsertA, strand,
Martin@782
   314
                seqNameB, seqLenB, begB, endB, seqNameA, seqLenA, begA, endA,
Martin@782
   315
                blockCount, blockSizes, blockStartsB, blockStartsA)
Martin@781
   316
Martin@781
   317
    print "\t".join(map(str, outWords))
Martin@140
   318
Martin@778
   319
def mafConvertToPsl(opts, lines):
Martin@798
   320
    if opts.join:
Martin@798
   321
        for i in mafGroupInput(opts, lines):
Martin@798
   322
            writePsl(opts, i)
Martin@798
   323
    else:
Martin@798
   324
        for i in mafInput(opts, lines):
Martin@798
   325
            writePsl(opts, [i])
Martin@778
   326
Martin@148
   327
##### Routines for converting to SAM format: #####
Martin@148
   328
Martin@285
   329
def readGroupId(readGroupItems):
Martin@285
   330
    for i in readGroupItems:
Martin@285
   331
        if i.startswith("ID:"):
Martin@285
   332
            return i[3:]
Martin@285
   333
    raise Exception("readgroup must include ID")
Martin@285
   334
Martin@775
   335
def readSequenceLengths(fileNames):
Martin@155
   336
    """Read name & length of topmost sequence in each maf block."""
Martin@775
   337
    for i in fileNames:
Martin@775
   338
        with open(i) as f:
Martin@775
   339
            fields = None
Martin@775
   340
            for line in f:
Martin@775
   341
                if fields:
Martin@775
   342
                    if line.isspace():
Martin@775
   343
                        fields = None
Martin@775
   344
                else:
Martin@775
   345
                    if line[0] == "s":
Martin@775
   346
                        fields = line.split()
Martin@775
   347
                        yield fields[1], fields[5]
Martin@155
   348
Martin@284
   349
def naturalSortKey(s):
Martin@284
   350
    """Return a key that sorts strings in "natural" order."""
Martin@284
   351
    return [(str, int)[k]("".join(v)) for k, v in groupby(s, str.isdigit)]
Martin@284
   352
Martin@284
   353
def karyotypicSortKey(s):
Martin@284
   354
    """Attempt to sort chromosomes in GATK's ridiculous order."""
Martin@284
   355
    if s == "chrM": return []
Martin@284
   356
    if s == "MT": return ["~"]
Martin@284
   357
    return naturalSortKey(s)
Martin@284
   358
Martin@780
   359
def copyDictFile(lines):
Martin@780
   360
    for line in lines:
Martin@780
   361
        if line.startswith("@SQ"):
Martin@780
   362
            sys.stdout.write(line)
Martin@780
   363
        elif not line[0] == "@":
Martin@780
   364
            break
Martin@780
   365
Martin@780
   366
def writeSamHeader(opts, fileNames):
Martin@284
   367
    print "@HD\tVN:1.3\tSO:unsorted"
Martin@780
   368
Martin@780
   369
    if opts.dictionary:
Martin@780
   370
        sequenceLengths = dict(readSequenceLengths(fileNames))
Martin@780
   371
        for k in sorted(sequenceLengths, key=karyotypicSortKey):
Martin@780
   372
            print "@SQ\tSN:%s\tLN:%s" % (k, sequenceLengths[k])
Martin@780
   373
Martin@780
   374
    if opts.dictfile:
Martin@780
   375
        if opts.dictfile == "-":
Martin@780
   376
            copyDictFile(sys.stdin)
Martin@780
   377
        else:
Martin@780
   378
            with open(opts.dictfile) as f:
Martin@780
   379
                copyDictFile(f)
Martin@780
   380
Martin@780
   381
    if opts.readgroup:
Martin@780
   382
        print "@RG\t" + "\t".join(opts.readgroup.split())
Martin@155
   383
Martin@417
   384
mapqMissing = "255"
Martin@417
   385
mapqMaximum = "254"
Martin@417
   386
mapqMaximumNum = float(mapqMaximum)
Martin@148
   387
Martin@148
   388
def mapqFromProb(probString):
Martin@148
   389
    try: p = float(probString)
Martin@148
   390
    except ValueError: raise Exception("bad probability: " + probString)
Martin@148
   391
    if p < 0 or p > 1: raise Exception("bad probability: " + probString)
Martin@148
   392
    if p == 0: return mapqMaximum
Martin@148
   393
    phred = -10 * math.log(p, 10)
Martin@417
   394
    if phred >= mapqMaximumNum: return mapqMaximum
Martin@417
   395
    return str(int(round(phred)))
Martin@148
   396
Martin@484
   397
def cigarParts(beg, alignmentColumns, end):
Martin@484
   398
    if beg: yield str(beg) + "H"
Martin@148
   399
Martin@148
   400
    # (doesn't handle translated alignments)
Martin@718
   401
    # uses "read-ahead" technique, aiming to be as fast as possible:
Martin@484
   402
    isActive = True
Martin@484
   403
    for x, y in alignmentColumns: break
Martin@484
   404
    else: isActive = False
Martin@484
   405
    while isActive:
Martin@484
   406
        size = 1
Martin@718
   407
        if x == y:  # xxx assumes no gap-gap columns, ignores ambiguous bases
Martin@718
   408
            for x, y in alignmentColumns:
Martin@718
   409
                if x != y: break
Martin@718
   410
                size += 1
Martin@718
   411
            else: isActive = False
Martin@718
   412
            yield str(size) + "="
Martin@718
   413
        elif x == "-":
Martin@484
   414
            for x, y in alignmentColumns:
Martin@484
   415
                if x != "-": break
Martin@484
   416
                size += 1
Martin@484
   417
            else: isActive = False
Martin@484
   418
            yield str(size) + "I"
Martin@484
   419
        elif y == "-":
Martin@484
   420
            for x, y in alignmentColumns:
Martin@484
   421
                if y != "-": break
Martin@484
   422
                size += 1
Martin@484
   423
            else: isActive = False
Martin@484
   424
            yield str(size) + "D"
Martin@484
   425
        else:
Martin@484
   426
            for x, y in alignmentColumns:
Martin@718
   427
                if x == y or x == "-" or y == "-": break
Martin@484
   428
                size += 1
Martin@484
   429
            else: isActive = False
Martin@718
   430
            yield str(size) + "X"
Martin@484
   431
Martin@484
   432
    if end: yield str(end) + "H"
Martin@148
   433
Martin@781
   434
def writeSam(readGroup, maf):
Martin@487
   435
    aLine, sLines, qLines, pLines = maf
Martin@790
   436
    fieldsA, fieldsB = pairOrDie(sLines, "SAM")
Martin@797
   437
    seqNameA, seqLenA, strandA, letterSizeA, begA, endA, rowA = fieldsA
Martin@797
   438
    seqNameB, seqLenB, strandB, letterSizeB, begB, endB, rowB = fieldsB
Martin@783
   439
Martin@788
   440
    if letterSizeA > 1 or letterSizeB > 1:
Martin@784
   441
        raise Exception("this looks like translated DNA - can't convert to SAM format")
Martin@784
   442
Martin@783
   443
    if strandA != "+":
Martin@783
   444
        raise Exception("for SAM, the 1st strand in each alignment must be +")
Martin@783
   445
Martin@487
   446
    score = None
Martin@487
   447
    evalue = None
Martin@487
   448
    mapq = mapqMissing
Martin@784
   449
    for i in aLine.split():
Martin@487
   450
        if i.startswith("score="):
Martin@487
   451
            v = i[6:]
Martin@487
   452
            if v.isdigit(): score = "AS:i:" + v  # it must be an integer
Martin@569
   453
        elif i.startswith("E="):
Martin@569
   454
            evalue = "EV:Z:" + i[2:]
Martin@487
   455
        elif i.startswith("mismap="):
Martin@487
   456
            mapq = mapqFromProb(i[7:])
Martin@148
   457
Martin@787
   458
    pos = str(begA + 1)  # convert to 1-based coordinate
Martin@148
   459
Martin@782
   460
    alignmentColumns = zip(rowA.upper(), rowB.upper())
Martin@148
   461
Martin@797
   462
    revBegB = seqLenB - endB
Martin@782
   463
    cigar = "".join(cigarParts(begB, iter(alignmentColumns), revBegB))
Martin@148
   464
Martin@782
   465
    seq = rowB.translate(None, "-")
Martin@148
   466
Martin@783
   467
    qual = "*"
Martin@783
   468
    if qLines:
Martin@784
   469
        qFields = qLines[-1].split()
Martin@783
   470
        if qFields[1] == seqNameB:
Martin@783
   471
            qual = ''.join(j for i, j in izip(rowB, qFields[2]) if i != "-")
Martin@148
   472
Martin@717
   473
    # It's hard to get all the pair info, so this is very
Martin@717
   474
    # incomplete, but hopefully good enough.
Martin@717
   475
    # I'm not sure whether to add 2 and/or 8 to flag.
Martin@782
   476
    if seqNameB.endswith("/1"):
Martin@782
   477
        seqNameB = seqNameB[:-2]
Martin@782
   478
        if strandB == "+": flag = "99"  # 1 + 2 + 32 + 64
Martin@717
   479
        else:              flag = "83"  # 1 + 2 + 16 + 64
Martin@782
   480
    elif seqNameB.endswith("/2"):
Martin@782
   481
        seqNameB = seqNameB[:-2]
Martin@782
   482
        if strandB == "+": flag = "163"  # 1 + 2 + 32 + 128
Martin@717
   483
        else:              flag = "147"  # 1 + 2 + 16 + 128
Martin@717
   484
    else:
Martin@782
   485
        if strandB == "+": flag = "0"
Martin@717
   486
        else:              flag = "16"
Martin@283
   487
Martin@783
   488
    editDistance = sum(x != y for x, y in alignmentColumns)
Martin@717
   489
    # no special treatment of ambiguous bases: might be a minor bug
Martin@717
   490
    editDistance = "NM:i:" + str(editDistance)
Martin@148
   491
Martin@782
   492
    out = [seqNameB, flag, seqNameA, pos, mapq, cigar, "*\t0\t0", seq, qual]
Martin@781
   493
    out.append(editDistance)
Martin@781
   494
    if score: out.append(score)
Martin@781
   495
    if evalue: out.append(evalue)
Martin@781
   496
    if readGroup: out.append(readGroup)
Martin@781
   497
    print "\t".join(out)
Martin@148
   498
Martin@778
   499
def mafConvertToSam(opts, lines):
Martin@778
   500
    readGroup = ""
Martin@778
   501
    if opts.readgroup:
Martin@778
   502
        readGroup = "RG:Z:" + readGroupId(opts.readgroup.split())
Martin@778
   503
    for maf in mafInput(opts, lines):
Martin@781
   504
        writeSam(readGroup, maf)
Martin@778
   505
Martin@140
   506
##### Routines for converting to BLAST-like format: #####
Martin@140
   507
Martin@140
   508
def pairwiseMatchSymbol(alignmentColumn):
Martin@792
   509
    if isMatch(alignmentColumn):
Martin@792
   510
        return "|"
Martin@792
   511
    else:
Martin@792
   512
        return " "
Martin@140
   513
Martin@140
   514
def strandText(strand):
Martin@792
   515
    if strand == "+":
Martin@792
   516
        return "Plus"
Martin@792
   517
    else:
Martin@792
   518
        return "Minus"
Martin@140
   519
Martin@777
   520
def blastBegCoordinate(zeroBasedCoordinate, strand, seqLen):
Martin@777
   521
    if strand == "+":
Martin@777
   522
        return str(zeroBasedCoordinate + 1)
Martin@777
   523
    else:
Martin@777
   524
        return str(seqLen - zeroBasedCoordinate)
Martin@777
   525
Martin@777
   526
def blastEndCoordinate(zeroBasedCoordinate, strand, seqLen):
Martin@777
   527
    if strand == "+":
Martin@777
   528
        return str(zeroBasedCoordinate)
Martin@777
   529
    else:
Martin@777
   530
        return str(seqLen - zeroBasedCoordinate + 1)
Martin@777
   531
Martin@777
   532
def nextCoordinate(coordinate, row, letterSize):
Martin@777
   533
    return coordinate + insertSize(row, letterSize)
Martin@140
   534
Martin@140
   535
def chunker(things, chunkSize):
Martin@140
   536
    for i in range(0, len(things), chunkSize):
Martin@140
   537
        yield things[i:i+chunkSize]
Martin@140
   538
Martin@791
   539
def blastChunker(sLines, lineSize, alignmentColumns):
Martin@791
   540
    seqLens = [i[1] for i in sLines]
Martin@791
   541
    strands = [i[2] for i in sLines]
Martin@791
   542
    letterSizes = [i[3] for i in sLines]
Martin@791
   543
    coords = [i[4] for i in sLines]
Martin@140
   544
    for chunkCols in chunker(alignmentColumns, lineSize):
Martin@777
   545
        chunkRows = list(alignmentRowsFromColumns(chunkCols))
Martin@791
   546
        begs = map(blastBegCoordinate, coords, strands, seqLens)
Martin@777
   547
        coords = map(nextCoordinate, coords, chunkRows, letterSizes)
Martin@791
   548
        ends = map(blastEndCoordinate, coords, strands, seqLens)
Martin@777
   549
        yield chunkCols, chunkRows, begs, ends
Martin@140
   550
Martin@792
   551
def writeBlast(opts, maf, oldQueryName):
Martin@791
   552
    aLine, sLines, qLines, pLines = maf
Martin@791
   553
    fieldsA, fieldsB = pairOrDie(sLines, "Blast")
Martin@797
   554
    seqNameA, seqLenA, strandA, letterSizeA, begA, endA, rowA = fieldsA
Martin@797
   555
    seqNameB, seqLenB, strandB, letterSizeB, begB, endB, rowB = fieldsB
Martin@140
   556
Martin@791
   557
    if seqNameB != oldQueryName:
Martin@791
   558
        print "Query= " + seqNameB
Martin@791
   559
        print "         (%s letters)" % seqLenB
Martin@322
   560
        print
Martin@322
   561
Martin@791
   562
    print ">" + seqNameA
Martin@791
   563
    print "          Length = %s" % seqLenA
Martin@140
   564
    print
Martin@140
   565
Martin@791
   566
    score, evalue = scoreAndEvalue(aLine)
Martin@786
   567
Martin@786
   568
    if score and opts.bitScoreA is not None and opts.bitScoreB is not None:
Martin@571
   569
        bitScore = opts.bitScoreA * float(score) - opts.bitScoreB
Martin@571
   570
        scoreLine = " Score = %.3g bits (%s)" % (bitScore, score)
Martin@571
   571
    else:
Martin@571
   572
        scoreLine = " Score = %s" % score
Martin@786
   573
Martin@786
   574
    if evalue:
Martin@786
   575
        scoreLine += ", Expect = %s" % evalue
Martin@786
   576
Martin@140
   577
    print scoreLine
Martin@140
   578
Martin@791
   579
    alignmentColumns = zip(rowA, rowB)
Martin@140
   580
Martin@140
   581
    alnSize = len(alignmentColumns)
Martin@785
   582
    matches = sum(x.upper() == y.upper() for x, y in alignmentColumns)
Martin@140
   583
    matchPercent = 100 * matches // alnSize  # round down, like BLAST
Martin@140
   584
    identLine = " Identities = %s/%s (%s%%)" % (matches, alnSize, matchPercent)
Martin@792
   585
    gaps = rowA.count("-") + rowB.count("-")
Martin@777
   586
    if gaps:
Martin@777
   587
        gapPercent = 100 * gaps // alnSize  # round down, like BLAST
Martin@777
   588
        identLine += ", Gaps = %s/%s (%s%%)" % (gaps, alnSize, gapPercent)
Martin@140
   589
    print identLine
Martin@140
   590
Martin@791
   591
    print " Strand = %s / %s" % (strandText(strandB), strandText(strandA))
Martin@140
   592
    print
Martin@140
   593
Martin@791
   594
    for chunk in blastChunker(sLines, opts.linesize, alignmentColumns):
Martin@777
   595
        cols, rows, begs, ends = chunk
Martin@777
   596
        begWidth = maxlen(begs)
Martin@777
   597
        matchSymbols = ''.join(map(pairwiseMatchSymbol, cols))
Martin@777
   598
        print "Query: %-*s %s %s" % (begWidth, begs[1], rows[1], ends[1])
Martin@777
   599
        print "       %-*s %s"    % (begWidth, " ", matchSymbols)
Martin@777
   600
        print "Sbjct: %-*s %s %s" % (begWidth, begs[0], rows[0], ends[0])
Martin@140
   601
        print
Martin@140
   602
Martin@778
   603
def mafConvertToBlast(opts, lines):
Martin@778
   604
    oldQueryName = ""
Martin@778
   605
    for maf in mafInput(opts, lines):
Martin@792
   606
        writeBlast(opts, maf, oldQueryName)
Martin@791
   607
        sLines = maf[1]
Martin@791
   608
        oldQueryName = sLines[1][0]
Martin@778
   609
Martin@789
   610
def blastDataFromMafFields(fields):
Martin@797
   611
    seqName, seqLen, strand, letterSize, beg, end, row = fields
Martin@789
   612
    if strand == "+":
Martin@785
   613
        beg += 1
Martin@654
   614
    else:
Martin@785
   615
        beg = seqLen - beg
Martin@785
   616
        end = seqLen - end + 1
Martin@792
   617
    return seqName, str(beg), str(end), row.upper()
Martin@654
   618
Martin@781
   619
def writeBlastTab(opts, maf):
Martin@572
   620
    aLine, sLines, qLines, pLines = maf
Martin@790
   621
    fieldsA, fieldsB = pairOrDie(sLines, "BlastTab")
Martin@789
   622
    seqNameA, begA, endA, rowA = blastDataFromMafFields(fieldsA)
Martin@789
   623
    seqNameB, begB, endB, rowB = blastDataFromMafFields(fieldsB)
Martin@572
   624
Martin@785
   625
    alignmentColumns = zip(rowA, rowB)
Martin@572
   626
    alnSize = len(alignmentColumns)
Martin@789
   627
    matches = sum(x == y for x, y in alignmentColumns)
Martin@572
   628
    matchPercent = "%.2f" % (100.0 * matches / alnSize)
Martin@792
   629
    mismatches = alnSize - matches - rowA.count("-") - rowB.count("-")
Martin@785
   630
    gapOpens = gapRunCount(rowA) + gapRunCount(rowB)
Martin@572
   631
Martin@792
   632
    out = [seqNameB, seqNameA, matchPercent, str(alnSize), str(mismatches),
Martin@792
   633
           str(gapOpens), begB, endB, begA, endA]
Martin@785
   634
Martin@785
   635
    score, evalue = scoreAndEvalue(aLine)
Martin@572
   636
    if evalue:
Martin@572
   637
        out.append(evalue)
Martin@572
   638
        if score and opts.bitScoreA is not None and opts.bitScoreB is not None:
Martin@572
   639
            bitScore = opts.bitScoreA * float(score) - opts.bitScoreB
Martin@572
   640
            out.append("%.3g" % bitScore)
Martin@785
   641
Martin@792
   642
    print "\t".join(out)
Martin@572
   643
Martin@778
   644
def mafConvertToBlastTab(opts, lines):
Martin@778
   645
    for maf in mafInput(opts, lines):
Martin@781
   646
        writeBlastTab(opts, maf)
Martin@778
   647
Martin@140
   648
##### Routines for converting to HTML format: #####
Martin@140
   649
Martin@140
   650
def writeHtmlHeader():
Martin@140
   651
    print '''
Martin@140
   652
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
Martin@140
   653
 "http://www.w3.org/TR/html4/strict.dtd">
Martin@140
   654
<html lang="en"><head>
Martin@140
   655
<meta http-equiv="Content-type" content="text/html; charset=UTF-8">
Martin@140
   656
<title>Reliable Alignments</title>
Martin@140
   657
<style type="text/css">
Martin@171
   658
/* Try to force monospace, working around browser insanity: */
Martin@171
   659
pre {font-family: "Courier New", monospace, serif; font-size: 0.8125em}
Martin@140
   660
.a {background-color: #3333FF}
Martin@140
   661
.b {background-color: #9933FF}
Martin@140
   662
.c {background-color: #FF66CC}
Martin@140
   663
.d {background-color: #FF3333}
Martin@140
   664
.e {background-color: #FF9933}
Martin@140
   665
.f {background-color: #FFFF00}
Martin@140
   666
.key {display:inline; margin-right:2em}
Martin@140
   667
</style>
Martin@140
   668
</head><body>
Martin@140
   669
Martin@140
   670
<div style="line-height:1">
Martin@140
   671
<pre class="key"><span class="a">  </span> prob &gt; 0.999</pre>
Martin@140
   672
<pre class="key"><span class="b">  </span> prob &gt; 0.99 </pre>
Martin@140
   673
<pre class="key"><span class="c">  </span> prob &gt; 0.95 </pre>
Martin@140
   674
<pre class="key"><span class="d">  </span> prob &gt; 0.9  </pre>
Martin@140
   675
<pre class="key"><span class="e">  </span> prob &gt; 0.5  </pre>
Martin@140
   676
<pre class="key"><span class="f">  </span> prob &le; 0.5  </pre>
Martin@140
   677
</div>
Martin@140
   678
'''
Martin@140
   679
Martin@140
   680
def probabilityClass(probabilityColumn):
Martin@177
   681
    p = ord(min(probabilityColumn)) - 33
Martin@177
   682
    if   p >= 30: return 'a'
Martin@177
   683
    elif p >= 20: return 'b'
Martin@177
   684
    elif p >= 13: return 'c'
Martin@177
   685
    elif p >= 10: return 'd'
Martin@177
   686
    elif p >=  3: return 'e'
Martin@140
   687
    else: return 'f'
Martin@140
   688
Martin@140
   689
def identicalRuns(s):
Martin@140
   690
    """Yield (item, start, end) for each run of identical items in s."""
Martin@140
   691
    beg = 0
Martin@140
   692
    for k, v in groupby(s):
Martin@140
   693
        end = beg + len(list(v))
Martin@140
   694
        yield k, beg, end
Martin@140
   695
        beg = end
Martin@140
   696
Martin@140
   697
def htmlSpan(text, classRun):
Martin@140
   698
    key, beg, end = classRun
Martin@140
   699
    textbit = text[beg:end]
Martin@140
   700
    if key: return '<span class="%s">%s</span>' % (key, textbit)
Martin@140
   701
    else: return textbit
Martin@140
   702
Martin@140
   703
def multipleMatchSymbol(alignmentColumn):
Martin@140
   704
    if isMatch(alignmentColumn): return "*"
Martin@140
   705
    else: return " "
Martin@140
   706
Martin@777
   707
def writeHtml(opts, maf):
Martin@791
   708
    aLine, sLines, qLines, pLines = maf
Martin@791
   709
Martin@140
   710
    scoreLine = "Alignment"
Martin@791
   711
    score, evalue = scoreAndEvalue(aLine)
Martin@786
   712
    if score:
Martin@786
   713
        scoreLine += " score=" + score
Martin@786
   714
        if evalue:
Martin@786
   715
            scoreLine += ", expect=" + evalue
Martin@140
   716
    print "<h3>%s:</h3>" % scoreLine
Martin@140
   717
Martin@791
   718
    if pLines:
Martin@791
   719
        probRows = [i.split()[1] for i in pLines]
Martin@177
   720
        probCols = izip(*probRows)
Martin@140
   721
        classes = imap(probabilityClass, probCols)
Martin@140
   722
    else:
Martin@140
   723
        classes = repeat(None)
Martin@140
   724
Martin@791
   725
    seqNames = [i[0] for i in sLines]
Martin@791
   726
    nameWidth = maxlen(seqNames)
Martin@791
   727
    rows = [i[6] for i in sLines]
Martin@791
   728
    alignmentColumns = zip(*rows)
Martin@140
   729
Martin@140
   730
    print '<pre>'
Martin@791
   731
    for chunk in blastChunker(sLines, opts.linesize, alignmentColumns):
Martin@777
   732
        cols, rows, begs, ends = chunk
Martin@777
   733
        begWidth = maxlen(begs)
Martin@140
   734
        endWidth = maxlen(ends)
Martin@777
   735
        matchSymbols = ''.join(map(multipleMatchSymbol, cols))
Martin@777
   736
        classChunk = islice(classes, opts.linesize)
Martin@140
   737
        classRuns = list(identicalRuns(classChunk))
Martin@791
   738
        for n, b, r, e in zip(seqNames, begs, rows, ends):
Martin@140
   739
            spans = [htmlSpan(r, i) for i in classRuns]
Martin@140
   740
            spans = ''.join(spans)
Martin@777
   741
            formatParams = nameWidth, n, begWidth, b, spans, endWidth, e
Martin@140
   742
            print '%-*s %*s %s %*s' % formatParams
Martin@777
   743
        print ' ' * nameWidth, ' ' * begWidth, matchSymbols
Martin@140
   744
        print
Martin@140
   745
    print '</pre>'
Martin@101
   746
Martin@778
   747
def mafConvertToHtml(opts, lines):
Martin@778
   748
    for maf in mafInput(opts, lines):
Martin@791
   749
        writeHtml(opts, maf)
Martin@778
   750
Martin@773
   751
##### Main program: #####
Martin@101
   752
Martin@140
   753
def isFormat(myString, myFormat):
Martin@140
   754
    return myFormat.startswith(myString)
Martin@101
   755
Martin@778
   756
def mafConvertOneFile(opts, formatName, lines):
Martin@778
   757
    if   isFormat(formatName, "axt"):
Martin@778
   758
        mafConvertToAxt(opts, lines)
Martin@778
   759
    elif isFormat(formatName, "blast"):
Martin@778
   760
        mafConvertToBlast(opts, lines)
Martin@778
   761
    elif isFormat(formatName, "blasttab"):
Martin@778
   762
        mafConvertToBlastTab(opts, lines)
Martin@778
   763
    elif isFormat(formatName, "html"):
Martin@778
   764
        mafConvertToHtml(opts, lines)
Martin@778
   765
    elif isFormat(formatName, "psl"):
Martin@778
   766
        mafConvertToPsl(opts, lines)
Martin@778
   767
    elif isFormat(formatName, "sam"):
Martin@778
   768
        mafConvertToSam(opts, lines)
Martin@778
   769
    elif isFormat(formatName, "tabular"):
Martin@778
   770
        mafConvertToTab(opts, lines)
Martin@778
   771
    else:
Martin@778
   772
        raise Exception("unknown format: " + formatName)
Martin@778
   773
Martin@140
   774
def mafConvert(opts, args):
Martin@779
   775
    formatName = args[0].lower()
Martin@775
   776
    fileNames = args[1:]
Martin@774
   777
Martin@774
   778
    opts.isKeepComments = False
Martin@774
   779
    opts.bitScoreA = None
Martin@774
   780
    opts.bitScoreB = None
Martin@774
   781
Martin@774
   782
    if not opts.noheader:
Martin@779
   783
        if isFormat(formatName, "html"):
Martin@774
   784
            writeHtmlHeader()
Martin@780
   785
        if isFormat(formatName, "sam"):
Martin@780
   786
            writeSamHeader(opts, fileNames)
Martin@779
   787
        if isFormat(formatName, "tabular"):
Martin@774
   788
            opts.isKeepComments = True
Martin@774
   789
Martin@779
   790
    if fileNames:
Martin@779
   791
        for i in fileNames:
Martin@779
   792
            if i == "-":
Martin@779
   793
                mafConvertOneFile(opts, formatName, sys.stdin)
Martin@779
   794
            else:
Martin@779
   795
                with open(i) as f:
Martin@779
   796
                    mafConvertOneFile(opts, formatName, f)
Martin@779
   797
    else:
Martin@779
   798
        mafConvertOneFile(opts, formatName, sys.stdin)
Martin@774
   799
Martin@774
   800
    if not opts.noheader:
Martin@779
   801
        if isFormat(formatName, "html"):
Martin@774
   802
            print "</body></html>"
Martin@101
   803
Martin@101
   804
if __name__ == "__main__":
Martin@101
   805
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)  # avoid silly error message
Martin@101
   806
Martin@101
   807
    usage = """
Martin@148
   808
  %prog --help
Martin@291
   809
  %prog axt mafFile(s)
Martin@291
   810
  %prog blast mafFile(s)
Martin@572
   811
  %prog blasttab mafFile(s)
Martin@291
   812
  %prog html mafFile(s)
Martin@291
   813
  %prog psl mafFile(s)
Martin@291
   814
  %prog sam mafFile(s)
Martin@291
   815
  %prog tab mafFile(s)"""
Martin@101
   816
Martin@148
   817
    description = "Read MAF-format alignments & write them in another format."
Martin@148
   818
Martin@148
   819
    op = optparse.OptionParser(usage=usage, description=description)
Martin@148
   820
    op.add_option("-p", "--protein", action="store_true",
Martin@148
   821
                  help="assume protein alignments, for psl match counts")
Martin@798
   822
    op.add_option("-j", "--join", type="float", metavar="N",
Martin@798
   823
                  help="join co-linear alignments separated by <= N letters")
Martin@413
   824
    op.add_option("-n", "--noheader", action="store_true",
Martin@413
   825
                  help="omit any header lines from the output")
Martin@155
   826
    op.add_option("-d", "--dictionary", action="store_true",
Martin@155
   827
                  help="include dictionary of sequence lengths in sam format")
Martin@286
   828
    op.add_option("-f", "--dictfile",
Martin@286
   829
                  help="get sequence dictionary from DICTFILE")
Martin@285
   830
    op.add_option("-r", "--readgroup",
Martin@285
   831
                  help="read group info for sam format")
Martin@140
   832
    op.add_option("-l", "--linesize", type="int", default=60, #metavar="CHARS",
Martin@140
   833
                  help="line length for blast and html formats (default: %default)")
Martin@775
   834
    opts, args = op.parse_args()
Martin@140
   835
    if opts.linesize <= 0: op.error("option -l: should be >= 1")
Martin@286
   836
    if opts.dictionary and opts.dictfile: op.error("can't use both -d and -f")
Martin@291
   837
    if len(args) < 1: op.error("I need a format-name and some MAF alignments")
Martin@291
   838
    if opts.dictionary and (len(args) == 1 or "-" in args[1:]):
Martin@291
   839
        op.error("need file (not pipe) with option -d")
Martin@101
   840
Martin@140
   841
    try: mafConvert(opts, args)
Martin@102
   842
    except Exception, e:
Martin@101
   843
        prog = os.path.basename(sys.argv[0])
Martin@101
   844
        sys.exit(prog + ": error: " + str(e))