scripts/last-dotplot
author Martin C. Frith
Mon Dec 10 08:23:09 2018 +0900 (24 months ago)
changeset 961 ed0fb9b1eb40
parent 957 c54672c8892e
child 980 27c26371226b
permissions -rwxr-xr-x
Add last-dotplot --maxseqs option
Martin@1
     1
#! /usr/bin/env python
Martin@1
     2
Martin@272
     3
# Read pair-wise alignments in MAF or LAST tabular format: write an
Martin@272
     4
# "Oxford grid", a.k.a. dotplot.
Martin@1
     5
Martin@1
     6
# TODO: Currently, pixels with zero aligned nt-pairs are white, and
Martin@1
     7
# pixels with one or more aligned nt-pairs are black.  This can look
Martin@1
     8
# too crowded for large genome alignments.  I tried shading each pixel
Martin@1
     9
# according to the number of aligned nt-pairs within it, but the
Martin@1
    10
# result is too faint.  How can this be done better?
Martin@1
    11
Martin@916
    12
import collections
Martin@914
    13
import functools
Martin@875
    14
import gzip
Martin@896
    15
from fnmatch import fnmatchcase
Martin@961
    16
import logging
Martin@906
    17
from operator import itemgetter
Martin@903
    18
import subprocess
Martin@896
    19
import itertools, optparse, os, re, sys
Martin@475
    20
Martin@475
    21
# Try to make PIL/PILLOW work:
Martin@938
    22
try:
Martin@938
    23
    from PIL import Image, ImageDraw, ImageFont, ImageColor
Martin@938
    24
except ImportError:
Martin@938
    25
    import Image, ImageDraw, ImageFont, ImageColor
Martin@938
    26
Martin@938
    27
try:
Martin@938
    28
    from future_builtins import zip
Martin@938
    29
except ImportError:
Martin@938
    30
    pass
Martin@1
    31
Martin@844
    32
def myOpen(fileName):  # faster than fileinput
Martin@904
    33
    if fileName is None:
Martin@904
    34
        return []
Martin@844
    35
    if fileName == "-":
Martin@844
    36
        return sys.stdin
Martin@875
    37
    if fileName.endswith(".gz"):
Martin@957
    38
        return gzip.open(fileName, "rt")  # xxx dubious for Python2
Martin@844
    39
    return open(fileName)
Martin@844
    40
Martin@911
    41
def groupByFirstItem(things):
Martin@911
    42
    for k, v in itertools.groupby(things, itemgetter(0)):
Martin@911
    43
        yield k, [i[1:] for i in v]
Martin@911
    44
Martin@908
    45
def croppedBlocks(blocks, ranges1, ranges2):
Martin@908
    46
    headBeg1, headBeg2, headSize = blocks[0]
Martin@908
    47
    for r1 in ranges1:
Martin@908
    48
        for r2 in ranges2:
Martin@908
    49
            cropBeg1, cropEnd1 = r1
Martin@908
    50
            if headBeg1 < 0:
Martin@908
    51
                cropBeg1, cropEnd1 = -cropEnd1, -cropBeg1
Martin@908
    52
            cropBeg2, cropEnd2 = r2
Martin@908
    53
            if headBeg2 < 0:
Martin@908
    54
                cropBeg2, cropEnd2 = -cropEnd2, -cropBeg2
Martin@908
    55
            for beg1, beg2, size in blocks:
Martin@908
    56
                b1 = max(cropBeg1, beg1)
Martin@908
    57
                e1 = min(cropEnd1, beg1 + size)
Martin@908
    58
                if b1 >= e1: continue
Martin@908
    59
                offset = beg2 - beg1
Martin@908
    60
                b2 = max(cropBeg2, b1 + offset)
Martin@908
    61
                e2 = min(cropEnd2, e1 + offset)
Martin@908
    62
                if b2 >= e2: continue
Martin@908
    63
                yield b2 - offset, b2, e2 - b2
Martin@840
    64
Martin@482
    65
def tabBlocks(beg1, beg2, blocks):
Martin@482
    66
    '''Get the gapless blocks of an alignment, from LAST tabular format.'''
Martin@482
    67
    for i in blocks.split(","):
Martin@482
    68
        if ":" in i:
Martin@482
    69
            x, y = i.split(":")
Martin@482
    70
            beg1 += int(x)
Martin@482
    71
            beg2 += int(y)
Martin@482
    72
        else:
Martin@482
    73
            size = int(i)
Martin@482
    74
            yield beg1, beg2, size
Martin@482
    75
            beg1 += size
Martin@482
    76
            beg2 += size
Martin@272
    77
Martin@482
    78
def mafBlocks(beg1, beg2, seq1, seq2):
Martin@482
    79
    '''Get the gapless blocks of an alignment, from MAF format.'''
Martin@482
    80
    size = 0
Martin@938
    81
    for x, y in zip(seq1, seq2):
Martin@482
    82
        if x == "-":
Martin@482
    83
            if size:
Martin@482
    84
                yield beg1, beg2, size
Martin@482
    85
                beg1 += size
Martin@482
    86
                beg2 += size
Martin@482
    87
                size = 0
Martin@482
    88
            beg2 += 1
Martin@482
    89
        elif y == "-":
Martin@482
    90
            if size:
Martin@482
    91
                yield beg1, beg2, size
Martin@482
    92
                beg1 += size
Martin@482
    93
                beg2 += size
Martin@482
    94
                size = 0
Martin@482
    95
            beg1 += 1
Martin@272
    96
        else:
Martin@482
    97
            size += 1
Martin@482
    98
    if size: yield beg1, beg2, size
Martin@272
    99
Martin@482
   100
def alignmentInput(lines):
Martin@482
   101
    '''Get alignments and sequence lengths, from MAF or tabular format.'''
Martin@482
   102
    mafCount = 0
Martin@272
   103
    for line in lines:
Martin@272
   104
        w = line.split()
Martin@272
   105
        if line[0].isdigit():  # tabular format
Martin@482
   106
            chr1, beg1, seqlen1 = w[1], int(w[2]), int(w[5])
Martin@482
   107
            if w[4] == "-": beg1 -= seqlen1
Martin@482
   108
            chr2, beg2, seqlen2 = w[6], int(w[7]), int(w[10])
Martin@482
   109
            if w[9] == "-": beg2 -= seqlen2
Martin@847
   110
            blocks = tabBlocks(beg1, beg2, w[11])
Martin@482
   111
            yield chr1, seqlen1, chr2, seqlen2, blocks
Martin@272
   112
        elif line[0] == "s":  # MAF format
Martin@482
   113
            if mafCount == 0:
Martin@482
   114
                chr1, beg1, seqlen1, seq1 = w[1], int(w[2]), int(w[5]), w[6]
Martin@482
   115
                if w[4] == "-": beg1 -= seqlen1
Martin@482
   116
                mafCount = 1
Martin@482
   117
            else:
Martin@482
   118
                chr2, beg2, seqlen2, seq2 = w[1], int(w[2]), int(w[5]), w[6]
Martin@482
   119
                if w[4] == "-": beg2 -= seqlen2
Martin@847
   120
                blocks = mafBlocks(beg1, beg2, seq1, seq2)
Martin@482
   121
                yield chr1, seqlen1, chr2, seqlen2, blocks
Martin@482
   122
                mafCount = 0
Martin@272
   123
Martin@897
   124
def seqRequestFromText(text):
Martin@840
   125
    if ":" in text:
Martin@840
   126
        pattern, interval = text.rsplit(":", 1)
Martin@840
   127
        if "-" in interval:
Martin@840
   128
            beg, end = interval.rsplit("-", 1)
Martin@840
   129
            return pattern, int(beg), int(end)  # beg may be negative
Martin@840
   130
    return text, 0, sys.maxsize
Martin@840
   131
Martin@909
   132
def rangesFromSeqName(seqRequests, name, seqLen):
Martin@908
   133
    if seqRequests:
Martin@908
   134
        base = name.split(".")[-1]  # allow for names like hg19.chr7
Martin@908
   135
        for pat, beg, end in seqRequests:
Martin@908
   136
            if fnmatchcase(name, pat) or fnmatchcase(base, pat):
Martin@909
   137
                yield max(beg, 0), min(end, seqLen)
Martin@908
   138
    else:
Martin@909
   139
        yield 0, seqLen
Martin@651
   140
Martin@909
   141
def updateSeqs(coverDict, seqRanges, seqName, ranges, coveredRange):
Martin@909
   142
    beg, end = coveredRange
Martin@909
   143
    if beg < 0:
Martin@909
   144
        coveredRange = -end, -beg
Martin@909
   145
    if seqName in coverDict:
Martin@909
   146
        coverDict[seqName].append(coveredRange)
Martin@839
   147
    else:
Martin@909
   148
        coverDict[seqName] = [coveredRange]
Martin@909
   149
        for beg, end in ranges:
Martin@909
   150
            r = seqName, beg, end
Martin@909
   151
            seqRanges.append(r)
Martin@839
   152
Martin@651
   153
def readAlignments(fileName, opts):
Martin@839
   154
    '''Get alignments and sequence limits, from MAF or tabular format.'''
Martin@937
   155
    seqRequests1 = [seqRequestFromText(i) for i in opts.seq1]
Martin@937
   156
    seqRequests2 = [seqRequestFromText(i) for i in opts.seq2]
Martin@840
   157
Martin@482
   158
    alignments = []
Martin@909
   159
    seqRanges1 = []
Martin@909
   160
    seqRanges2 = []
Martin@909
   161
    coverDict1 = {}
Martin@909
   162
    coverDict2 = {}
Martin@844
   163
    lines = myOpen(fileName)
Martin@838
   164
    for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
Martin@909
   165
        ranges1 = sorted(rangesFromSeqName(seqRequests1, seqName1, seqLen1))
Martin@909
   166
        if not ranges1: continue
Martin@909
   167
        ranges2 = sorted(rangesFromSeqName(seqRequests2, seqName2, seqLen2))
Martin@909
   168
        if not ranges2: continue
Martin@909
   169
        b = list(croppedBlocks(list(blocks), ranges1, ranges2))
Martin@840
   170
        if not b: continue
Martin@840
   171
        aln = seqName1, seqName2, b
Martin@482
   172
        alignments.append(aln)
Martin@909
   173
        coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
Martin@909
   174
        updateSeqs(coverDict1, seqRanges1, seqName1, ranges1, coveredRange1)
Martin@909
   175
        coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
Martin@909
   176
        updateSeqs(coverDict2, seqRanges2, seqName2, ranges2, coveredRange2)
Martin@909
   177
    return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
Martin@1
   178
Martin@911
   179
def nameAndRangesFromDict(cropDict, seqName):
Martin@911
   180
    if seqName in cropDict:
Martin@911
   181
        return seqName, cropDict[seqName]
Martin@911
   182
    n = seqName.split(".")[-1]
Martin@911
   183
    if n in cropDict:
Martin@911
   184
        return n, cropDict[n]
Martin@911
   185
    return seqName, []
Martin@911
   186
Martin@911
   187
def rangesForSecondaryAlignments(primaryRanges, seqLen):
Martin@911
   188
    if primaryRanges:
Martin@911
   189
        return primaryRanges
Martin@911
   190
    return [(0, seqLen)]
Martin@911
   191
Martin@911
   192
def readSecondaryAlignments(opts, cropRanges1, cropRanges2):
Martin@911
   193
    cropDict1 = dict(groupByFirstItem(cropRanges1))
Martin@911
   194
    cropDict2 = dict(groupByFirstItem(cropRanges2))
Martin@911
   195
Martin@911
   196
    alignments = []
Martin@911
   197
    seqRanges1 = []
Martin@911
   198
    seqRanges2 = []
Martin@911
   199
    coverDict1 = {}
Martin@911
   200
    coverDict2 = {}
Martin@911
   201
    lines = myOpen(opts.alignments)
Martin@911
   202
    for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
Martin@911
   203
        seqName1, ranges1 = nameAndRangesFromDict(cropDict1, seqName1)
Martin@911
   204
        seqName2, ranges2 = nameAndRangesFromDict(cropDict2, seqName2)
Martin@911
   205
        if not ranges1 and not ranges2:
Martin@911
   206
            continue
Martin@911
   207
        r1 = rangesForSecondaryAlignments(ranges1, seqLen1)
Martin@911
   208
        r2 = rangesForSecondaryAlignments(ranges2, seqLen2)
Martin@911
   209
        b = list(croppedBlocks(list(blocks), r1, r2))
Martin@911
   210
        if not b: continue
Martin@911
   211
        aln = seqName1, seqName2, b
Martin@911
   212
        alignments.append(aln)
Martin@911
   213
        if not ranges1:
Martin@911
   214
            coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
Martin@911
   215
            updateSeqs(coverDict1, seqRanges1, seqName1, r1, coveredRange1)
Martin@911
   216
        if not ranges2:
Martin@911
   217
            coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
Martin@911
   218
            updateSeqs(coverDict2, seqRanges2, seqName2, r2, coveredRange2)
Martin@911
   219
    return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
Martin@911
   220
Martin@910
   221
def twoValuesFromOption(text, separator):
Martin@910
   222
    if separator in text:
Martin@910
   223
        return text.split(separator)
Martin@910
   224
    return text, text
Martin@910
   225
Martin@910
   226
def mergedRanges(ranges):
Martin@910
   227
    oldBeg, maxEnd = ranges[0]
Martin@910
   228
    for beg, end in ranges:
Martin@910
   229
        if beg > maxEnd:
Martin@910
   230
            yield oldBeg, maxEnd
Martin@910
   231
            oldBeg = beg
Martin@910
   232
            maxEnd = end
Martin@910
   233
        elif end > maxEnd:
Martin@910
   234
            maxEnd = end
Martin@910
   235
    yield oldBeg, maxEnd
Martin@910
   236
Martin@910
   237
def mergedRangesPerSeq(coverDict):
Martin@941
   238
    for k, v in coverDict.items():
Martin@910
   239
        v.sort()
Martin@910
   240
        yield k, list(mergedRanges(v))
Martin@910
   241
Martin@910
   242
def coveredLength(mergedCoverDict):
Martin@941
   243
    return sum(sum(e - b for b, e in v) for v in mergedCoverDict.values())
Martin@910
   244
Martin@910
   245
def trimmed(seqRanges, coverDict, minAlignedBases, maxGapFrac, endPad, midPad):
Martin@910
   246
    maxEndGapFrac, maxMidGapFrac = twoValuesFromOption(maxGapFrac, ",")
Martin@910
   247
    maxEndGap = max(float(maxEndGapFrac) * minAlignedBases, endPad * 1.0)
Martin@910
   248
    maxMidGap = max(float(maxMidGapFrac) * minAlignedBases, midPad * 2.0)
Martin@910
   249
Martin@910
   250
    for seqName, rangeBeg, rangeEnd in seqRanges:
Martin@910
   251
        seqBlocks = coverDict[seqName]
Martin@910
   252
        blocks = [i for i in seqBlocks if i[0] < rangeEnd and i[1] > rangeBeg]
Martin@910
   253
        if blocks[0][0] - rangeBeg > maxEndGap:
Martin@910
   254
            rangeBeg = blocks[0][0] - endPad
Martin@910
   255
        for j, y in enumerate(blocks):
Martin@910
   256
            if j:
Martin@910
   257
                x = blocks[j - 1]
Martin@910
   258
                if y[0] - x[1] > maxMidGap:
Martin@910
   259
                    yield seqName, rangeBeg, x[1] + midPad
Martin@910
   260
                    rangeBeg = y[0] - midPad
Martin@910
   261
        if rangeEnd - blocks[-1][1] > maxEndGap:
Martin@910
   262
            rangeEnd = blocks[-1][1] + endPad
Martin@910
   263
        yield seqName, rangeBeg, rangeEnd
Martin@910
   264
Martin@916
   265
def rangesWithStrandInfo(seqRanges, strandOpt, alignments, seqIndex):
Martin@916
   266
    if strandOpt == "1":
Martin@916
   267
        forwardMinusReverse = collections.defaultdict(int)
Martin@916
   268
        for i in alignments:
Martin@916
   269
            blocks = i[2]
Martin@916
   270
            beg1, beg2, size = blocks[0]
Martin@916
   271
            numOfAlignedLetterPairs = sum(i[2] for i in blocks)
Martin@916
   272
            if (beg1 < 0) != (beg2 < 0):  # opposite-strand alignment
Martin@916
   273
                numOfAlignedLetterPairs *= -1
Martin@916
   274
            forwardMinusReverse[i[seqIndex]] += numOfAlignedLetterPairs
Martin@916
   275
    strandNum = 0
Martin@916
   276
    for seqName, beg, end in seqRanges:
Martin@916
   277
        if strandOpt == "1":
Martin@916
   278
            strandNum = 1 if forwardMinusReverse[seqName] >= 0 else 2
Martin@916
   279
        yield seqName, beg, end, strandNum
Martin@916
   280
Martin@1
   281
def natural_sort_key(my_string):
Martin@1
   282
    '''Return a sort key for "natural" ordering, e.g. chr9 < chr10.'''
Martin@1
   283
    parts = re.split(r'(\d+)', my_string)
Martin@1
   284
    parts[1::2] = map(int, parts[1::2])
Martin@1
   285
    return parts
Martin@1
   286
Martin@907
   287
def nameKey(oneSeqRanges):
Martin@907
   288
    return natural_sort_key(oneSeqRanges[0][0])
Martin@907
   289
Martin@907
   290
def sizeKey(oneSeqRanges):
Martin@925
   291
    return sum(b - e for n, b, e, s in oneSeqRanges), nameKey(oneSeqRanges)
Martin@907
   292
Martin@914
   293
def alignmentKey(seqNamesToLists, oneSeqRanges):
Martin@914
   294
    seqName = oneSeqRanges[0][0]
Martin@914
   295
    alignmentsOfThisSequence = seqNamesToLists[seqName]
Martin@914
   296
    numOfAlignedLetterPairs = sum(i[3] for i in alignmentsOfThisSequence)
Martin@914
   297
    toMiddle = numOfAlignedLetterPairs // 2
Martin@914
   298
    for i in alignmentsOfThisSequence:
Martin@914
   299
        toMiddle -= i[3]
Martin@914
   300
        if toMiddle < 0:
Martin@914
   301
            return i[1:3]  # sequence-rank and "position" of this alignment
Martin@914
   302
Martin@916
   303
def rankAndFlipPerSeq(seqRanges):
Martin@916
   304
    rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
Martin@916
   305
    for rank, group in enumerate(rangesGroupedBySeqName):
Martin@916
   306
        seqName, ranges = group
Martin@916
   307
        strandNum = next(ranges)[3]
Martin@916
   308
        flip = 1 if strandNum < 2 else -1
Martin@916
   309
        yield seqName, (rank, flip)
Martin@916
   310
Martin@916
   311
def alignmentSortData(alignments, seqIndex, otherNamesToRanksAndFlips):
Martin@915
   312
    otherIndex = 1 - seqIndex
Martin@915
   313
    for i in alignments:
Martin@915
   314
        blocks = i[2]
Martin@916
   315
        otherRank, otherFlip = otherNamesToRanksAndFlips[i[otherIndex]]
Martin@916
   316
        otherPos = otherFlip * abs(blocks[0][otherIndex] +
Martin@916
   317
                                   blocks[-1][otherIndex] + blocks[-1][2])
Martin@914
   318
        numOfAlignedLetterPairs = sum(i[2] for i in blocks)
Martin@915
   319
        yield i[seqIndex], otherRank, otherPos, numOfAlignedLetterPairs
Martin@914
   320
Martin@915
   321
def mySortedRanges(seqRanges, sortOpt, seqIndex, alignments, otherRanges):
Martin@913
   322
    rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
Martin@913
   323
    g = [list(ranges) for seqName, ranges in rangesGroupedBySeqName]
Martin@916
   324
    for i in g:
Martin@916
   325
        if i[0][3] > 1:
Martin@916
   326
            i.reverse()
Martin@907
   327
    if sortOpt == "1":
Martin@907
   328
        g.sort(key=nameKey)
Martin@907
   329
    if sortOpt == "2":
Martin@907
   330
        g.sort(key=sizeKey)
Martin@914
   331
    if sortOpt == "3":
Martin@916
   332
        otherNamesToRanksAndFlips = dict(rankAndFlipPerSeq(otherRanges))
Martin@915
   333
        alns = sorted(alignmentSortData(alignments, seqIndex,
Martin@916
   334
                                        otherNamesToRanksAndFlips))
Martin@914
   335
        alnsGroupedBySeqName = itertools.groupby(alns, itemgetter(0))
Martin@914
   336
        seqNamesToLists = dict((k, list(v)) for k, v in alnsGroupedBySeqName)
Martin@914
   337
        g.sort(key=functools.partial(alignmentKey, seqNamesToLists))
Martin@907
   338
    return [j for i in g for j in i]
Martin@907
   339
Martin@914
   340
def allSortedRanges(opts, alignments, alignmentsB,
Martin@914
   341
                    seqRanges1, seqRangesB1, seqRanges2, seqRangesB2):
Martin@916
   342
    o1, oB1 = twoValuesFromOption(opts.strands1, ":")
Martin@916
   343
    o2, oB2 = twoValuesFromOption(opts.strands2, ":")
Martin@916
   344
    if o1 == "1" and o2 == "1":
Martin@916
   345
        raise Exception("the strand options have circular dependency")
Martin@916
   346
    seqRanges1 = list(rangesWithStrandInfo(seqRanges1, o1, alignments, 0))
Martin@916
   347
    seqRanges2 = list(rangesWithStrandInfo(seqRanges2, o2, alignments, 1))
Martin@916
   348
    seqRangesB1 = list(rangesWithStrandInfo(seqRangesB1, oB1, alignmentsB, 0))
Martin@916
   349
    seqRangesB2 = list(rangesWithStrandInfo(seqRangesB2, oB2, alignmentsB, 1))
Martin@916
   350
Martin@914
   351
    o1, oB1 = twoValuesFromOption(opts.sort1, ":")
Martin@914
   352
    o2, oB2 = twoValuesFromOption(opts.sort2, ":")
Martin@914
   353
    if o1 == "3" and o2 == "3":
Martin@914
   354
        raise Exception("the sort options have circular dependency")
Martin@914
   355
    if o1 != "3":
Martin@914
   356
        s1 = mySortedRanges(seqRanges1, o1, None, None, None)
Martin@914
   357
    if o2 != "3":
Martin@914
   358
        s2 = mySortedRanges(seqRanges2, o2, None, None, None)
Martin@914
   359
    if o1 == "3":
Martin@915
   360
        s1 = mySortedRanges(seqRanges1, o1, 0, alignments, s2)
Martin@914
   361
    if o2 == "3":
Martin@915
   362
        s2 = mySortedRanges(seqRanges2, o2, 1, alignments, s1)
Martin@915
   363
    t1 = mySortedRanges(seqRangesB1, oB1, 0, alignmentsB, s2)
Martin@915
   364
    t2 = mySortedRanges(seqRangesB2, oB2, 1, alignmentsB, s1)
Martin@914
   365
    return s1 + t1, s2 + t2
Martin@911
   366
Martin@898
   367
def prettyNum(n):
Martin@898
   368
    t = str(n)
Martin@898
   369
    groups = []
Martin@898
   370
    while t:
Martin@898
   371
        groups.append(t[-3:])
Martin@898
   372
        t = t[:-3]
Martin@904
   373
    return ",".join(reversed(groups))
Martin@898
   374
Martin@846
   375
def sizeText(size):
Martin@846
   376
    suffixes = "bp", "kb", "Mb", "Gb"
Martin@846
   377
    for i, x in enumerate(suffixes):
Martin@846
   378
        j = 10 ** (i * 3)
Martin@846
   379
        if size < j * 10:
Martin@846
   380
            return "%.2g" % (1.0 * size / j) + x
Martin@846
   381
        if size < j * 1000 or i == len(suffixes) - 1:
Martin@846
   382
            return "%.0f" % (1.0 * size / j) + x
Martin@846
   383
Martin@898
   384
def labelText(seqRange, labelOpt):
Martin@916
   385
    seqName, beg, end, strandNum = seqRange
Martin@898
   386
    if labelOpt == 1:
Martin@898
   387
        return seqName + ": " + sizeText(end - beg)
Martin@898
   388
    if labelOpt == 2:
Martin@904
   389
        return seqName + ":" + prettyNum(beg) + ": " + sizeText(end - beg)
Martin@898
   390
    if labelOpt == 3:
Martin@904
   391
        return seqName + ":" + prettyNum(beg) + "-" + prettyNum(end)
Martin@898
   392
    return seqName
Martin@846
   393
Martin@899
   394
def rangeLabels(seqRanges, labelOpt, font, fontsize, image_mode, textRot):
Martin@899
   395
    if fontsize:
Martin@899
   396
        image_size = 1, 1
Martin@899
   397
        im = Image.new(image_mode, image_size)
Martin@899
   398
        draw = ImageDraw.Draw(im)
Martin@899
   399
    x = y = 0
Martin@899
   400
    for r in seqRanges:
Martin@899
   401
        text = labelText(r, labelOpt)
Martin@899
   402
        if fontsize:
Martin@899
   403
            x, y = draw.textsize(text, font=font)
Martin@899
   404
            if textRot:
Martin@899
   405
                x, y = y, x
Martin@916
   406
        yield text, x, y, r[3]
Martin@899
   407
Martin@914
   408
def dataFromRanges(sortedRanges, font, fontSize, imageMode, labelOpt, textRot):
Martin@916
   409
    for seqName, rangeBeg, rangeEnd, strandNum in sortedRanges:
Martin@916
   410
        out = [seqName, str(rangeBeg), str(rangeEnd)]
Martin@916
   411
        if strandNum > 0:
Martin@916
   412
            out.append(".+-"[strandNum])
Martin@961
   413
        logging.info("\t".join(out))
Martin@961
   414
    logging.info("")
Martin@916
   415
    rangeSizes = [e - b for n, b, e, s in sortedRanges]
Martin@913
   416
    labs = list(rangeLabels(sortedRanges, labelOpt, font, fontSize,
Martin@913
   417
                            imageMode, textRot))
Martin@907
   418
    margin = max(i[2] for i in labs)
Martin@878
   419
    # xxx the margin may be too big, because some labels may get omitted
Martin@914
   420
    return rangeSizes, labs, margin
Martin@1
   421
Martin@1
   422
def div_ceil(x, y):
Martin@1
   423
    '''Return x / y rounded up.'''
Martin@1
   424
    q, r = divmod(x, y)
Martin@1
   425
    return q + (r != 0)
Martin@1
   426
Martin@896
   427
def get_bp_per_pix(rangeSizes, pixTweenRanges, maxPixels):
Martin@1
   428
    '''Get the minimum bp-per-pixel that fits in the size limit.'''
Martin@961
   429
    logging.info("choosing bp per pixel...")
Martin@896
   430
    numOfRanges = len(rangeSizes)
Martin@896
   431
    maxPixelsInRanges = maxPixels - pixTweenRanges * (numOfRanges - 1)
Martin@896
   432
    if maxPixelsInRanges < numOfRanges:
Martin@649
   433
        raise Exception("can't fit the image: too many sequences?")
Martin@896
   434
    negLimit = -maxPixelsInRanges
Martin@896
   435
    negBpPerPix = sum(rangeSizes) // negLimit
Martin@863
   436
    while True:
Martin@896
   437
        if sum(i // negBpPerPix for i in rangeSizes) >= negLimit:
Martin@863
   438
            return -negBpPerPix
Martin@863
   439
        negBpPerPix -= 1
Martin@1
   440
Martin@900
   441
def getRangePixBegs(rangePixLens, pixTweenRanges, margin):
Martin@900
   442
    '''Get the start pixel for each range.'''
Martin@900
   443
    rangePixBegs = []
Martin@896
   444
    pix_tot = margin - pixTweenRanges
Martin@900
   445
    for i in rangePixLens:
Martin@896
   446
        pix_tot += pixTweenRanges
Martin@900
   447
        rangePixBegs.append(pix_tot)
Martin@1
   448
        pix_tot += i
Martin@900
   449
    return rangePixBegs
Martin@1
   450
Martin@896
   451
def pixelData(rangeSizes, bp_per_pix, pixTweenRanges, margin):
Martin@900
   452
    '''Return pixel information about the ranges.'''
Martin@900
   453
    rangePixLens = [div_ceil(i, bp_per_pix) for i in rangeSizes]
Martin@900
   454
    rangePixBegs = getRangePixBegs(rangePixLens, pixTweenRanges, margin)
Martin@900
   455
    tot_pix = rangePixBegs[-1] + rangePixLens[-1]
Martin@900
   456
    return rangePixBegs, rangePixLens, tot_pix
Martin@1
   457
Martin@835
   458
def drawLineForward(hits, width, bp_per_pix, beg1, beg2, size):
Martin@639
   459
    while True:
Martin@639
   460
        q1, r1 = divmod(beg1, bp_per_pix)
Martin@639
   461
        q2, r2 = divmod(beg2, bp_per_pix)
Martin@835
   462
        hits[q2 * width + q1] |= 1
Martin@639
   463
        next_pix = min(bp_per_pix - r1, bp_per_pix - r2)
Martin@639
   464
        if next_pix >= size: break
Martin@639
   465
        beg1 += next_pix
Martin@639
   466
        beg2 += next_pix
Martin@639
   467
        size -= next_pix
Martin@639
   468
Martin@835
   469
def drawLineReverse(hits, width, bp_per_pix, beg1, beg2, size):
Martin@639
   470
    while True:
Martin@639
   471
        q1, r1 = divmod(beg1, bp_per_pix)
Martin@639
   472
        q2, r2 = divmod(beg2, bp_per_pix)
Martin@835
   473
        hits[q2 * width + q1] |= 2
Martin@639
   474
        next_pix = min(bp_per_pix - r1, r2 + 1)
Martin@639
   475
        if next_pix >= size: break
Martin@639
   476
        beg1 += next_pix
Martin@639
   477
        beg2 -= next_pix
Martin@639
   478
        size -= next_pix
Martin@639
   479
Martin@916
   480
def strandAndOrigin(ranges, beg, size):
Martin@916
   481
    isReverseStrand = (beg < 0)
Martin@916
   482
    if isReverseStrand:
Martin@905
   483
        beg = -(beg + size)
Martin@916
   484
    for rangeBeg, rangeEnd, isReverseRange, origin in ranges:
Martin@926
   485
        if rangeEnd > beg:  # assumes the ranges are sorted
Martin@916
   486
            return (isReverseStrand != isReverseRange), origin
Martin@905
   487
Martin@905
   488
def alignmentPixels(width, height, alignments, bp_per_pix,
Martin@905
   489
                    rangeDict1, rangeDict2):
Martin@640
   490
    hits = [0] * (width * height)  # the image data
Martin@640
   491
    for seq1, seq2, blocks in alignments:
Martin@905
   492
        beg1, beg2, size = blocks[0]
Martin@916
   493
        isReverse1, ori1 = strandAndOrigin(rangeDict1[seq1], beg1, size)
Martin@916
   494
        isReverse2, ori2 = strandAndOrigin(rangeDict2[seq2], beg2, size)
Martin@640
   495
        for beg1, beg2, size in blocks:
Martin@916
   496
            if isReverse1:
Martin@640
   497
                beg1 = -(beg1 + size)
Martin@640
   498
                beg2 = -(beg2 + size)
Martin@916
   499
            if isReverse1 == isReverse2:
Martin@835
   500
                drawLineForward(hits, width, bp_per_pix,
Martin@915
   501
                                ori1 + beg1, ori2 + beg2, size)
Martin@640
   502
            else:
Martin@835
   503
                drawLineReverse(hits, width, bp_per_pix,
Martin@915
   504
                                ori1 + beg1, ori2 - beg2 - 1, size)
Martin@640
   505
    return hits
Martin@1
   506
Martin@650
   507
def expandedSeqDict(seqDict):
Martin@650
   508
    '''Allow lookup by short sequence names, e.g. chr7 as well as hg19.chr7.'''
Martin@861
   509
    newDict = seqDict.copy()
Martin@650
   510
    for name, x in seqDict.items():
Martin@861
   511
        if "." in name:
Martin@861
   512
            base = name.split(".")[-1]
Martin@861
   513
            if base in newDict:  # an ambiguous case was found:
Martin@861
   514
                return seqDict   # so give up completely
Martin@861
   515
            newDict[base] = x
Martin@650
   516
    return newDict
Martin@650
   517
Martin@905
   518
def readBed(fileName, rangeDict):
Martin@845
   519
    for line in myOpen(fileName):
Martin@845
   520
        w = line.split()
Martin@847
   521
        if not w: continue
Martin@845
   522
        seqName = w[0]
Martin@905
   523
        if seqName not in rangeDict: continue
Martin@845
   524
        beg = int(w[1])
Martin@845
   525
        end = int(w[2])
Martin@857
   526
        layer = 900
Martin@895
   527
        color = "#fbf"
Martin@858
   528
        if len(w) > 4:
Martin@858
   529
            if w[4] != ".":
Martin@858
   530
                layer = float(w[4])
Martin@858
   531
            if len(w) > 5:
Martin@858
   532
                if len(w) > 8 and w[8].count(",") == 2:
Martin@858
   533
                    color = "rgb(" + w[8] + ")"
Martin@916
   534
                else:
Martin@916
   535
                    strand = w[5]
Martin@916
   536
                    isRev = rangeDict[seqName][0][2]
Martin@916
   537
                    if strand == "+" and not isRev or strand == "-" and isRev:
Martin@916
   538
                        color = "#ffe8e8"
Martin@916
   539
                    if strand == "-" and not isRev or strand == "+" and isRev:
Martin@916
   540
                        color = "#e8e8ff"
Martin@859
   541
        yield layer, color, seqName, beg, end
Martin@845
   542
Martin@860
   543
def commaSeparatedInts(text):
Martin@860
   544
    return map(int, text.rstrip(",").split(","))
Martin@860
   545
Martin@905
   546
def readGenePred(opts, fileName, rangeDict):
Martin@860
   547
    for line in myOpen(fileName):
Martin@860
   548
        fields = line.split()
Martin@860
   549
        if not fields: continue
Martin@860
   550
        if fields[2] not in "+-": fields = fields[1:]
Martin@860
   551
        seqName = fields[1]
Martin@905
   552
        if seqName not in rangeDict: continue
Martin@860
   553
        #strand = fields[2]
Martin@860
   554
        cdsBeg = int(fields[5])
Martin@860
   555
        cdsEnd = int(fields[6])
Martin@860
   556
        exonBegs = commaSeparatedInts(fields[8])
Martin@860
   557
        exonEnds = commaSeparatedInts(fields[9])
Martin@860
   558
        for beg, end in zip(exonBegs, exonEnds):
Martin@860
   559
            yield 300, opts.exon_color, seqName, beg, end
Martin@860
   560
            b = max(beg, cdsBeg)
Martin@860
   561
            e = min(end, cdsEnd)
Martin@860
   562
            if b < e: yield 400, opts.cds_color, seqName, b, e
Martin@860
   563
Martin@905
   564
def readRmsk(fileName, rangeDict):
Martin@860
   565
    for line in myOpen(fileName):
Martin@860
   566
        fields = line.split()
Martin@860
   567
        if len(fields) == 17:  # rmsk.txt
Martin@860
   568
            seqName = fields[5]
Martin@905
   569
            if seqName not in rangeDict: continue  # do this ASAP for speed
Martin@860
   570
            beg = int(fields[6])
Martin@860
   571
            end = int(fields[7])
Martin@860
   572
            strand = fields[9]
Martin@860
   573
            repeatClass = fields[11]
Martin@860
   574
        elif len(fields) == 15:  # .out
Martin@860
   575
            seqName = fields[4]
Martin@905
   576
            if seqName not in rangeDict: continue
Martin@860
   577
            beg = int(fields[5]) - 1
Martin@860
   578
            end = int(fields[6])
Martin@860
   579
            strand = fields[8]
Martin@860
   580
            repeatClass = fields[10]
Martin@860
   581
        else:
Martin@860
   582
            continue
Martin@860
   583
        if repeatClass in ("Low_complexity", "Simple_repeat"):
Martin@895
   584
            yield 200, "#fbf", seqName, beg, end
Martin@916
   585
        elif (strand == "+") != rangeDict[seqName][0][2]:
Martin@895
   586
            yield 100, "#ffe8e8", seqName, beg, end
Martin@860
   587
        else:
Martin@895
   588
            yield 100, "#e8e8ff", seqName, beg, end
Martin@860
   589
Martin@650
   590
def isExtraFirstGapField(fields):
Martin@650
   591
    return fields[4].isdigit()
Martin@650
   592
Martin@905
   593
def readGaps(opts, fileName, rangeDict):
Martin@650
   594
    '''Read locations of unsequenced gaps, from an agp or gap file.'''
Martin@844
   595
    for line in myOpen(fileName):
Martin@650
   596
        w = line.split()
Martin@650
   597
        if not w or w[0][0] == "#": continue
Martin@650
   598
        if isExtraFirstGapField(w): w = w[1:]
Martin@650
   599
        if w[4] not in "NU": continue
Martin@650
   600
        seqName = w[0]
Martin@905
   601
        if seqName not in rangeDict: continue
Martin@650
   602
        end = int(w[2])
Martin@650
   603
        beg = end - int(w[5])  # zero-based coordinate
Martin@857
   604
        if w[7] == "yes":
Martin@859
   605
            yield 3000, opts.bridged_color, seqName, beg, end
Martin@857
   606
        else:
Martin@859
   607
            yield 2000, opts.unbridged_color, seqName, beg, end
Martin@650
   608
Martin@905
   609
def bedBoxes(beds, rangeDict, margin, edge, isTop, bpPerPix):
Martin@897
   610
    for layer, color, seqName, bedBeg, bedEnd in beds:
Martin@916
   611
        for rangeBeg, rangeEnd, isReverseRange, origin in rangeDict[seqName]:
Martin@905
   612
            beg = max(bedBeg, rangeBeg)
Martin@905
   613
            end = min(bedEnd, rangeEnd)
Martin@905
   614
            if beg >= end: continue
Martin@916
   615
            if isReverseRange:
Martin@916
   616
                beg, end = -end, -beg
Martin@905
   617
            if layer <= 1000:
Martin@905
   618
                # include partly-covered pixels
Martin@905
   619
                b = (origin + beg) // bpPerPix
Martin@905
   620
                e = div_ceil(origin + end, bpPerPix)
Martin@905
   621
            else:
Martin@905
   622
                # exclude partly-covered pixels
Martin@905
   623
                b = div_ceil(origin + beg, bpPerPix)
Martin@905
   624
                e = (origin + end) // bpPerPix
Martin@905
   625
                if e <= b: continue
Martin@916
   626
                if bedEnd >= rangeEnd:  # include partly-covered end pixels
Martin@916
   627
                    if isReverseRange:
Martin@916
   628
                        b = (origin + beg) // bpPerPix
Martin@916
   629
                    else:
Martin@916
   630
                        e = div_ceil(origin + end, bpPerPix)
Martin@905
   631
            if isTop:
Martin@905
   632
                box = b, margin, e, edge
Martin@905
   633
            else:
Martin@905
   634
                box = margin, b, edge, e
Martin@905
   635
            yield layer, color, box
Martin@845
   636
Martin@857
   637
def drawAnnotations(im, boxes):
Martin@857
   638
    # xxx use partial transparency for different-color overlaps?
Martin@857
   639
    for layer, color, box in boxes:
Martin@650
   640
        im.paste(color, box)
Martin@650
   641
Martin@901
   642
def placedLabels(labels, rangePixBegs, rangePixLens, beg, end):
Martin@901
   643
    '''Return axis labels with endpoint & sort-order information.'''
Martin@901
   644
    maxWidth = end - beg
Martin@901
   645
    for i, j, k in zip(labels, rangePixBegs, rangePixLens):
Martin@916
   646
        text, textWidth, textHeight, strandNum = i
Martin@901
   647
        if textWidth > maxWidth:
Martin@901
   648
            continue
Martin@901
   649
        labelBeg = j + (k - textWidth) // 2
Martin@901
   650
        labelEnd = labelBeg + textWidth
Martin@901
   651
        sortKey = textWidth - k
Martin@901
   652
        if labelBeg < beg:
Martin@904
   653
            sortKey += maxWidth * (beg - labelBeg)
Martin@901
   654
            labelBeg = beg
Martin@901
   655
            labelEnd = beg + textWidth
Martin@901
   656
        if labelEnd > end:
Martin@904
   657
            sortKey += maxWidth * (labelEnd - end)
Martin@901
   658
            labelEnd = end
Martin@901
   659
            labelBeg = end - textWidth
Martin@916
   660
        yield sortKey, labelBeg, labelEnd, text, textHeight, strandNum
Martin@1
   661
Martin@897
   662
def nonoverlappingLabels(labels, minPixTweenLabels):
Martin@1
   663
    '''Get a subset of non-overlapping axis labels, greedily.'''
Martin@897
   664
    out = []
Martin@1
   665
    for i in labels:
Martin@897
   666
        beg = i[1] - minPixTweenLabels
Martin@897
   667
        end = i[2] + minPixTweenLabels
Martin@897
   668
        if all(j[2] <= beg or j[1] >= end for j in out):
Martin@897
   669
            out.append(i)
Martin@897
   670
    return out
Martin@1
   671
Martin@901
   672
def axisImage(labels, rangePixBegs, rangePixLens, textRot,
Martin@895
   673
              textAln, font, image_mode, opts):
Martin@1
   674
    '''Make an image of axis labels.'''
Martin@900
   675
    beg = rangePixBegs[0]
Martin@900
   676
    end = rangePixBegs[-1] + rangePixLens[-1]
Martin@901
   677
    margin = max(i[2] for i in labels)
Martin@901
   678
    labels = sorted(placedLabels(labels, rangePixBegs, rangePixLens, beg, end))
Martin@878
   679
    minPixTweenLabels = 0 if textRot else opts.label_space
Martin@897
   680
    labels = nonoverlappingLabels(labels, minPixTweenLabels)
Martin@897
   681
    image_size = (margin, end) if textRot else (end, margin)
Martin@895
   682
    im = Image.new(image_mode, image_size, opts.margin_color)
Martin@1
   683
    draw = ImageDraw.Draw(im)
Martin@916
   684
    for sortKey, labelBeg, labelEnd, text, textHeight, strandNum in labels:
Martin@915
   685
        base = margin - textHeight if textAln else 0
Martin@915
   686
        position = (base, labelBeg) if textRot else (labelBeg, base)
Martin@916
   687
        fill = ("black", opts.forwardcolor, opts.reversecolor)[strandNum]
Martin@916
   688
        draw.text(position, text, font=font, fill=fill)
Martin@1
   689
    return im
Martin@1
   690
Martin@916
   691
def rangesWithOrigins(sortedRanges, rangePixBegs, rangePixLens, bpPerPix):
Martin@916
   692
    for i, j, k in zip(sortedRanges, rangePixBegs, rangePixLens):
Martin@916
   693
        seqName, rangeBeg, rangeEnd, strandNum = i
Martin@916
   694
        isReverseRange = (strandNum > 1)
Martin@916
   695
        if isReverseRange:
Martin@916
   696
            origin = bpPerPix * (j + k) + rangeBeg
Martin@916
   697
        else:
Martin@916
   698
            origin = bpPerPix * j - rangeBeg
Martin@916
   699
        yield seqName, (rangeBeg, rangeEnd, isReverseRange, origin)
Martin@906
   700
Martin@916
   701
def rangesPerSeq(sortedRanges, rangePixBegs, rangePixLens, bpPerPix):
Martin@916
   702
    a = rangesWithOrigins(sortedRanges, rangePixBegs, rangePixLens, bpPerPix)
Martin@906
   703
    for k, v in itertools.groupby(a, itemgetter(0)):
Martin@926
   704
        yield k, sorted(i[1] for i in v)
Martin@836
   705
Martin@903
   706
def getFont(opts):
Martin@903
   707
    if opts.fontfile:
Martin@903
   708
        return ImageFont.truetype(opts.fontfile, opts.fontsize)
Martin@903
   709
    fileNames = []
Martin@903
   710
    try:
Martin@903
   711
        x = ["fc-match", "-f%{file}", "arial"]
Martin@942
   712
        p = subprocess.Popen(x, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
Martin@942
   713
                             universal_newlines=True)
Martin@903
   714
        out, err = p.communicate()
Martin@903
   715
        fileNames.append(out)
Martin@903
   716
    except OSError as e:
Martin@961
   717
        logging.info("fc-match error: " + str(e))
Martin@903
   718
    fileNames.append("/Library/Fonts/Arial.ttf")  # for Mac
Martin@903
   719
    for i in fileNames:
Martin@903
   720
        try:
Martin@903
   721
            font = ImageFont.truetype(i, opts.fontsize)
Martin@961
   722
            logging.info("font: " + i)
Martin@903
   723
            return font
Martin@903
   724
        except IOError as e:
Martin@961
   725
            logging.info("font load error: " + str(e))
Martin@903
   726
    return ImageFont.load_default()
Martin@903
   727
Martin@961
   728
def sequenceSizesAndNames(seqRanges):
Martin@961
   729
    for seqName, ranges in itertools.groupby(seqRanges, itemgetter(0)):
Martin@961
   730
        size = sum(e - b for n, b, e in ranges)
Martin@961
   731
        yield size, seqName
Martin@961
   732
Martin@961
   733
def biggestSequences(seqRanges, maxNumOfSequences):
Martin@961
   734
    s = sorted(sequenceSizesAndNames(seqRanges), reverse=True)
Martin@961
   735
    if len(s) > maxNumOfSequences:
Martin@961
   736
        logging.warning("too many sequences - discarding the smallest ones")
Martin@961
   737
        s = s[:maxNumOfSequences]
Martin@961
   738
    return set(i[1] for i in s)
Martin@961
   739
Martin@961
   740
def remainingSequenceRanges(seqRanges, alignments, seqIndex):
Martin@961
   741
    remainingSequences = set(i[seqIndex] for i in alignments)
Martin@961
   742
    return [i for i in seqRanges if i[0] in remainingSequences]
Martin@961
   743
Martin@648
   744
def lastDotplot(opts, args):
Martin@961
   745
    logLevel = logging.INFO if opts.verbose else logging.WARNING
Martin@961
   746
    logging.basicConfig(format="%(filename)s: %(message)s", level=logLevel)
Martin@961
   747
Martin@903
   748
    font = getFont(opts)
Martin@643
   749
    image_mode = 'RGB'
Martin@643
   750
    forward_color = ImageColor.getcolor(opts.forwardcolor, image_mode)
Martin@643
   751
    reverse_color = ImageColor.getcolor(opts.reversecolor, image_mode)
Martin@643
   752
    zipped_colors = zip(forward_color, reverse_color)
Martin@643
   753
    overlap_color = tuple([(i + j) // 2 for i, j in zipped_colors])
Martin@641
   754
Martin@911
   755
    maxGap1, maxGapB1 = twoValuesFromOption(opts.max_gap1, ":")
Martin@911
   756
    maxGap2, maxGapB2 = twoValuesFromOption(opts.max_gap2, ":")
Martin@911
   757
Martin@961
   758
    logging.info("reading alignments...")
Martin@904
   759
    alnData = readAlignments(args[0], opts)
Martin@909
   760
    alignments, seqRanges1, coverDict1, seqRanges2, coverDict2 = alnData
Martin@649
   761
    if not alignments: raise Exception("there are no alignments")
Martin@961
   762
    logging.info("cutting...")
Martin@910
   763
    coverDict1 = dict(mergedRangesPerSeq(coverDict1))
Martin@910
   764
    coverDict2 = dict(mergedRangesPerSeq(coverDict2))
Martin@910
   765
    minAlignedBases = min(coveredLength(coverDict1), coveredLength(coverDict2))
Martin@910
   766
    pad = int(opts.pad * minAlignedBases)
Martin@910
   767
    cutRanges1 = list(trimmed(seqRanges1, coverDict1, minAlignedBases,
Martin@911
   768
                              maxGap1, pad, pad))
Martin@910
   769
    cutRanges2 = list(trimmed(seqRanges2, coverDict2, minAlignedBases,
Martin@911
   770
                              maxGap2, pad, pad))
Martin@911
   771
Martin@961
   772
    biggestSeqs1 = biggestSequences(cutRanges1, opts.maxseqs)
Martin@961
   773
    cutRanges1 = [i for i in cutRanges1 if i[0] in biggestSeqs1]
Martin@961
   774
    alignments = [i for i in alignments if i[0] in biggestSeqs1]
Martin@961
   775
    cutRanges2 = remainingSequenceRanges(cutRanges2, alignments, 1)
Martin@961
   776
Martin@961
   777
    biggestSeqs2 = biggestSequences(cutRanges2, opts.maxseqs)
Martin@961
   778
    cutRanges2 = [i for i in cutRanges2 if i[0] in biggestSeqs2]
Martin@961
   779
    alignments = [i for i in alignments if i[1] in biggestSeqs2]
Martin@961
   780
    cutRanges1 = remainingSequenceRanges(cutRanges1, alignments, 0)
Martin@961
   781
Martin@961
   782
    logging.info("reading secondary alignments...")
Martin@911
   783
    alnDataB = readSecondaryAlignments(opts, cutRanges1, cutRanges2)
Martin@911
   784
    alignmentsB, seqRangesB1, coverDictB1, seqRangesB2, coverDictB2 = alnDataB
Martin@961
   785
    logging.info("cutting...")
Martin@911
   786
    coverDictB1 = dict(mergedRangesPerSeq(coverDictB1))
Martin@911
   787
    coverDictB2 = dict(mergedRangesPerSeq(coverDictB2))
Martin@911
   788
    cutRangesB1 = trimmed(seqRangesB1, coverDictB1, minAlignedBases,
Martin@911
   789
                          maxGapB1, 0, 0)
Martin@911
   790
    cutRangesB2 = trimmed(seqRangesB2, coverDictB2, minAlignedBases,
Martin@911
   791
                          maxGapB2, 0, 0)
Martin@641
   792
Martin@961
   793
    logging.info("sorting...")
Martin@914
   794
    sortOut = allSortedRanges(opts, alignments, alignmentsB,
Martin@914
   795
                              cutRanges1, cutRangesB1, cutRanges2, cutRangesB2)
Martin@914
   796
    sortedRanges1, sortedRanges2 = sortOut
Martin@914
   797
Martin@878
   798
    textRot1 = "vertical".startswith(opts.rot1)
Martin@914
   799
    i1 = dataFromRanges(sortedRanges1, font,
Martin@908
   800
                        opts.fontsize, image_mode, opts.labels1, textRot1)
Martin@914
   801
    rangeSizes1, labelData1, tMargin = i1
Martin@846
   802
Martin@878
   803
    textRot2 = "horizontal".startswith(opts.rot2)
Martin@914
   804
    i2 = dataFromRanges(sortedRanges2, font,
Martin@908
   805
                        opts.fontsize, image_mode, opts.labels2, textRot2)
Martin@914
   806
    rangeSizes2, labelData2, lMargin = i2
Martin@641
   807
Martin@896
   808
    maxPixels1 = opts.width  - lMargin
Martin@896
   809
    maxPixels2 = opts.height - tMargin
Martin@897
   810
    bpPerPix1 = get_bp_per_pix(rangeSizes1, opts.border_pixels, maxPixels1)
Martin@897
   811
    bpPerPix2 = get_bp_per_pix(rangeSizes2, opts.border_pixels, maxPixels2)
Martin@855
   812
    bpPerPix = max(bpPerPix1, bpPerPix2)
Martin@961
   813
    logging.info("bp per pixel = " + str(bpPerPix))
Martin@641
   814
Martin@900
   815
    p1 = pixelData(rangeSizes1, bpPerPix, opts.border_pixels, lMargin)
Martin@900
   816
    rangePixBegs1, rangePixLens1, width = p1
Martin@916
   817
    rangeDict1 = dict(rangesPerSeq(sortedRanges1, rangePixBegs1,
Martin@916
   818
                                   rangePixLens1, bpPerPix))
Martin@900
   819
Martin@900
   820
    p2 = pixelData(rangeSizes2, bpPerPix, opts.border_pixels, tMargin)
Martin@900
   821
    rangePixBegs2, rangePixLens2, height = p2
Martin@916
   822
    rangeDict2 = dict(rangesPerSeq(sortedRanges2, rangePixBegs2,
Martin@916
   823
                                   rangePixLens2, bpPerPix))
Martin@900
   824
Martin@961
   825
    logging.info("width:  " + str(width))
Martin@961
   826
    logging.info("height: " + str(height))
Martin@839
   827
Martin@961
   828
    logging.info("processing alignments...")
Martin@911
   829
    hits = alignmentPixels(width, height, alignments + alignmentsB, bpPerPix,
Martin@905
   830
                           rangeDict1, rangeDict2)
Martin@641
   831
Martin@961
   832
    logging.info("reading annotations...")
Martin@134
   833
Martin@905
   834
    rangeDict1 = expandedSeqDict(rangeDict1)
Martin@905
   835
    beds1 = itertools.chain(readBed(opts.bed1, rangeDict1),
Martin@905
   836
                            readRmsk(opts.rmsk1, rangeDict1),
Martin@905
   837
                            readGenePred(opts, opts.genePred1, rangeDict1),
Martin@905
   838
                            readGaps(opts, opts.gap1, rangeDict1))
Martin@905
   839
    b1 = bedBoxes(beds1, rangeDict1, tMargin, height, True, bpPerPix)
Martin@845
   840
Martin@905
   841
    rangeDict2 = expandedSeqDict(rangeDict2)
Martin@905
   842
    beds2 = itertools.chain(readBed(opts.bed2, rangeDict2),
Martin@905
   843
                            readRmsk(opts.rmsk2, rangeDict2),
Martin@905
   844
                            readGenePred(opts, opts.genePred2, rangeDict2),
Martin@905
   845
                            readGaps(opts, opts.gap2, rangeDict2))
Martin@905
   846
    b2 = bedBoxes(beds2, rangeDict2, lMargin, width, False, bpPerPix)
Martin@857
   847
Martin@857
   848
    boxes = sorted(itertools.chain(b1, b2))
Martin@904
   849
Martin@961
   850
    logging.info("drawing...")
Martin@904
   851
Martin@904
   852
    image_size = width, height
Martin@904
   853
    im = Image.new(image_mode, image_size, opts.background_color)
Martin@904
   854
Martin@857
   855
    drawAnnotations(im, boxes)
Martin@650
   856
Martin@643
   857
    for i in range(height):
Martin@643
   858
        for j in range(width):
Martin@643
   859
            store_value = hits[i * width + j]
Martin@643
   860
            xy = j, i
Martin@643
   861
            if   store_value == 1: im.putpixel(xy, forward_color)
Martin@643
   862
            elif store_value == 2: im.putpixel(xy, reverse_color)
Martin@643
   863
            elif store_value == 3: im.putpixel(xy, overlap_color)
Martin@95
   864
Martin@643
   865
    if opts.fontsize != 0:
Martin@900
   866
        axis1 = axisImage(labelData1, rangePixBegs1, rangePixLens1,
Martin@895
   867
                          textRot1, False, font, image_mode, opts)
Martin@878
   868
        if textRot1:
Martin@878
   869
            axis1 = axis1.transpose(Image.ROTATE_90)
Martin@900
   870
        axis2 = axisImage(labelData2, rangePixBegs2, rangePixLens2,
Martin@895
   871
                          textRot2, textRot2, font, image_mode, opts)
Martin@878
   872
        if not textRot2:
Martin@878
   873
            axis2 = axis2.transpose(Image.ROTATE_270)
Martin@643
   874
        im.paste(axis1, (0, 0))
Martin@643
   875
        im.paste(axis2, (0, 0))
Martin@1
   876
Martin@900
   877
    for i in rangePixBegs1[1:]:
Martin@877
   878
        box = i - opts.border_pixels, tMargin, i, height
Martin@852
   879
        im.paste(opts.border_color, box)
Martin@1
   880
Martin@900
   881
    for i in rangePixBegs2[1:]:
Martin@877
   882
        box = lMargin, i - opts.border_pixels, width, i
Martin@852
   883
        im.paste(opts.border_color, box)
Martin@1
   884
Martin@643
   885
    im.save(args[1])
Martin@648
   886
Martin@648
   887
if __name__ == "__main__":
Martin@649
   888
    usage = """%prog --help
Martin@649
   889
   or: %prog [options] maf-or-tab-alignments dotplot.png
Martin@649
   890
   or: %prog [options] maf-or-tab-alignments dotplot.gif
Martin@649
   891
   or: ..."""
Martin@649
   892
    description = "Draw a dotplot of pair-wise sequence alignments in MAF or tabular format."
Martin@649
   893
    op = optparse.OptionParser(usage=usage, description=description)
Martin@866
   894
    op.add_option("-v", "--verbose", action="count",
Martin@866
   895
                  help="show progress messages & data about the plot")
Martin@961
   896
    # Replace "width" & "height" with a single "length" option?
Martin@961
   897
    op.add_option("-x", "--width", metavar="INT", type="int", default=1000,
Martin@961
   898
                  help="maximum width in pixels (default: %default)")
Martin@961
   899
    op.add_option("-y", "--height", metavar="INT", type="int", default=1000,
Martin@961
   900
                  help="maximum height in pixels (default: %default)")
Martin@961
   901
    op.add_option("-m", "--maxseqs", type="int", default=100, metavar="M",
Martin@961
   902
                  help="maximum number of horizontal or vertical sequences "
Martin@961
   903
                  "(default=%default)")
Martin@651
   904
    op.add_option("-1", "--seq1", metavar="PATTERN", action="append",
Martin@840
   905
                  default=[],
Martin@651
   906
                  help="which sequences to show from the 1st genome")
Martin@651
   907
    op.add_option("-2", "--seq2", metavar="PATTERN", action="append",
Martin@840
   908
                  default=[],
Martin@651
   909
                  help="which sequences to show from the 2nd genome")
Martin@649
   910
    op.add_option("-c", "--forwardcolor", metavar="COLOR", default="red",
Martin@649
   911
                  help="color for forward alignments (default: %default)")
Martin@649
   912
    op.add_option("-r", "--reversecolor", metavar="COLOR", default="blue",
Martin@649
   913
                  help="color for reverse alignments (default: %default)")
Martin@911
   914
    op.add_option("--alignments", metavar="FILE", help="secondary alignments")
Martin@907
   915
    op.add_option("--sort1", default="1", metavar="N",
Martin@851
   916
                  help="genome1 sequence order: 0=input order, 1=name order, "
Martin@914
   917
                  "2=length order, 3=alignment order (default=%default)")
Martin@907
   918
    op.add_option("--sort2", default="1", metavar="N",
Martin@851
   919
                  help="genome2 sequence order: 0=input order, 1=name order, "
Martin@914
   920
                  "2=length order, 3=alignment order (default=%default)")
Martin@916
   921
    op.add_option("--strands1", default="0", metavar="N", help=
Martin@916
   922
                  "genome1 sequence orientation: 0=forward orientation, "
Martin@916
   923
                  "1=alignment orientation (default=%default)")
Martin@916
   924
    op.add_option("--strands2", default="0", metavar="N", help=
Martin@916
   925
                  "genome2 sequence orientation: 0=forward orientation, "
Martin@916
   926
                  "1=alignment orientation (default=%default)")
Martin@910
   927
    op.add_option("--max-gap1", metavar="FRAC", default="0.5,2", help=
Martin@910
   928
                  "maximum unaligned (end,mid) gap in genome1: "
Martin@910
   929
                  "fraction of aligned length (default=%default)")
Martin@910
   930
    op.add_option("--max-gap2", metavar="FRAC", default="0.5,2", help=
Martin@910
   931
                  "maximum unaligned (end,mid) gap in genome2: "
Martin@910
   932
                  "fraction of aligned length (default=%default)")
Martin@910
   933
    op.add_option("--pad", metavar="FRAC", type="float", default=0.04, help=
Martin@910
   934
                  "pad length when cutting unaligned gaps: "
Martin@910
   935
                  "fraction of aligned length (default=%default)")
Martin@852
   936
    op.add_option("--border-pixels", metavar="INT", type="int", default=1,
Martin@852
   937
                  help="number of pixels between sequences (default=%default)")
Martin@895
   938
    op.add_option("--border-color", metavar="COLOR", default="black",
Martin@852
   939
                  help="color for pixels between sequences (default=%default)")
Martin@911
   940
    # --break-color and/or --break-pixels for intra-sequence breaks?
Martin@895
   941
    op.add_option("--margin-color", metavar="COLOR", default="#dcdcdc",
Martin@895
   942
                  help="margin color")
Martin@846
   943
Martin@850
   944
    og = optparse.OptionGroup(op, "Text options")
Martin@850
   945
    og.add_option("-f", "--fontfile", metavar="FILE",
Martin@850
   946
                  help="TrueType or OpenType font file")
Martin@903
   947
    og.add_option("-s", "--fontsize", metavar="SIZE", type="int", default=14,
Martin@850
   948
                  help="TrueType or OpenType font size (default: %default)")
Martin@898
   949
    og.add_option("--labels1", type="int", default=0, metavar="N", help=
Martin@898
   950
                  "genome1 labels: 0=name, 1=name:length, "
Martin@898
   951
                  "2=name:start:length, 3=name:start-end (default=%default)")
Martin@898
   952
    og.add_option("--labels2", type="int", default=0, metavar="N", help=
Martin@898
   953
                  "genome2 labels: 0=name, 1=name:length, "
Martin@898
   954
                  "2=name:start:length, 3=name:start-end (default=%default)")
Martin@878
   955
    og.add_option("--rot1", metavar="ROT", default="h",
Martin@878
   956
                  help="text rotation for the 1st genome (default=%default)")
Martin@878
   957
    og.add_option("--rot2", metavar="ROT", default="v",
Martin@878
   958
                  help="text rotation for the 2nd genome (default=%default)")
Martin@850
   959
    op.add_option_group(og)
Martin@850
   960
Martin@860
   961
    og = optparse.OptionGroup(op, "Annotation options")
Martin@860
   962
    og.add_option("--bed1", metavar="FILE",
Martin@860
   963
                  help="read genome1 annotations from BED file")
Martin@860
   964
    og.add_option("--bed2", metavar="FILE",
Martin@860
   965
                  help="read genome2 annotations from BED file")
Martin@860
   966
    og.add_option("--rmsk1", metavar="FILE", help="read genome1 repeats from "
Martin@860
   967
                  "RepeatMasker .out or rmsk.txt file")
Martin@860
   968
    og.add_option("--rmsk2", metavar="FILE", help="read genome2 repeats from "
Martin@860
   969
                  "RepeatMasker .out or rmsk.txt file")
Martin@860
   970
    op.add_option_group(og)
Martin@860
   971
Martin@860
   972
    og = optparse.OptionGroup(op, "Gene options")
Martin@860
   973
    og.add_option("--genePred1", metavar="FILE",
Martin@860
   974
                  help="read genome1 genes from genePred file")
Martin@860
   975
    og.add_option("--genePred2", metavar="FILE",
Martin@860
   976
                  help="read genome2 genes from genePred file")
Martin@895
   977
    og.add_option("--exon-color", metavar="COLOR", default="PaleGreen",
Martin@860
   978
                  help="color for exons (default=%default)")
Martin@895
   979
    og.add_option("--cds-color", metavar="COLOR", default="LimeGreen",
Martin@860
   980
                  help="color for protein-coding regions (default=%default)")
Martin@860
   981
    op.add_option_group(og)
Martin@860
   982
Martin@650
   983
    og = optparse.OptionGroup(op, "Unsequenced gap options")
Martin@650
   984
    og.add_option("--gap1", metavar="FILE",
Martin@650
   985
                  help="read genome1 unsequenced gaps from agp or gap file")
Martin@650
   986
    og.add_option("--gap2", metavar="FILE",
Martin@650
   987
                  help="read genome2 unsequenced gaps from agp or gap file")
Martin@650
   988
    og.add_option("--bridged-color", metavar="COLOR", default="yellow",
Martin@650
   989
                  help="color for bridged gaps (default: %default)")
Martin@895
   990
    og.add_option("--unbridged-color", metavar="COLOR", default="orange",
Martin@650
   991
                  help="color for unbridged gaps (default: %default)")
Martin@650
   992
    op.add_option_group(og)
Martin@648
   993
    (opts, args) = op.parse_args()
Martin@648
   994
    if len(args) != 2: op.error("2 arguments needed")
Martin@648
   995
Martin@648
   996
    opts.background_color = "white"
Martin@648
   997
    opts.label_space = 5     # minimum number of pixels between axis labels
Martin@648
   998
Martin@649
   999
    try: lastDotplot(opts, args)
Martin@649
  1000
    except KeyboardInterrupt: pass  # avoid silly error message
Martin@903
  1001
    except Exception as e:
Martin@649
  1002
        prog = os.path.basename(sys.argv[0])
Martin@649
  1003
        sys.exit(prog + ": error: " + str(e))