scripts/last-dotplot
author Martin C. Frith
Mon Mar 05 17:43:20 2018 +0900 (2018-03-05)
changeset 926 bc08832db1a1
parent 925 e1861f956f60
child 937 8a4fadbe6080
permissions -rwxr-xr-x
Fix dotplot --strands bug with cut sequences
     1 #! /usr/bin/env python
     2 
     3 # Read pair-wise alignments in MAF or LAST tabular format: write an
     4 # "Oxford grid", a.k.a. dotplot.
     5 
     6 # TODO: Currently, pixels with zero aligned nt-pairs are white, and
     7 # pixels with one or more aligned nt-pairs are black.  This can look
     8 # too crowded for large genome alignments.  I tried shading each pixel
     9 # according to the number of aligned nt-pairs within it, but the
    10 # result is too faint.  How can this be done better?
    11 
    12 import collections
    13 import functools
    14 import gzip
    15 from fnmatch import fnmatchcase
    16 from operator import itemgetter
    17 import subprocess
    18 import itertools, optparse, os, re, sys
    19 
    20 # Try to make PIL/PILLOW work:
    21 try: from PIL import Image, ImageDraw, ImageFont, ImageColor
    22 except ImportError: import Image, ImageDraw, ImageFont, ImageColor
    23 
    24 def myOpen(fileName):  # faster than fileinput
    25     if fileName is None:
    26         return []
    27     if fileName == "-":
    28         return sys.stdin
    29     if fileName.endswith(".gz"):
    30         return gzip.open(fileName)
    31     return open(fileName)
    32 
    33 def warn(message):
    34     if opts.verbose:
    35         prog = os.path.basename(sys.argv[0])
    36         sys.stderr.write(prog + ": " + message + "\n")
    37 
    38 def groupByFirstItem(things):
    39     for k, v in itertools.groupby(things, itemgetter(0)):
    40         yield k, [i[1:] for i in v]
    41 
    42 def croppedBlocks(blocks, ranges1, ranges2):
    43     headBeg1, headBeg2, headSize = blocks[0]
    44     for r1 in ranges1:
    45         for r2 in ranges2:
    46             cropBeg1, cropEnd1 = r1
    47             if headBeg1 < 0:
    48                 cropBeg1, cropEnd1 = -cropEnd1, -cropBeg1
    49             cropBeg2, cropEnd2 = r2
    50             if headBeg2 < 0:
    51                 cropBeg2, cropEnd2 = -cropEnd2, -cropBeg2
    52             for beg1, beg2, size in blocks:
    53                 b1 = max(cropBeg1, beg1)
    54                 e1 = min(cropEnd1, beg1 + size)
    55                 if b1 >= e1: continue
    56                 offset = beg2 - beg1
    57                 b2 = max(cropBeg2, b1 + offset)
    58                 e2 = min(cropEnd2, e1 + offset)
    59                 if b2 >= e2: continue
    60                 yield b2 - offset, b2, e2 - b2
    61 
    62 def tabBlocks(beg1, beg2, blocks):
    63     '''Get the gapless blocks of an alignment, from LAST tabular format.'''
    64     for i in blocks.split(","):
    65         if ":" in i:
    66             x, y = i.split(":")
    67             beg1 += int(x)
    68             beg2 += int(y)
    69         else:
    70             size = int(i)
    71             yield beg1, beg2, size
    72             beg1 += size
    73             beg2 += size
    74 
    75 def mafBlocks(beg1, beg2, seq1, seq2):
    76     '''Get the gapless blocks of an alignment, from MAF format.'''
    77     size = 0
    78     for x, y in itertools.izip(seq1, seq2):
    79         if x == "-":
    80             if size:
    81                 yield beg1, beg2, size
    82                 beg1 += size
    83                 beg2 += size
    84                 size = 0
    85             beg2 += 1
    86         elif y == "-":
    87             if size:
    88                 yield beg1, beg2, size
    89                 beg1 += size
    90                 beg2 += size
    91                 size = 0
    92             beg1 += 1
    93         else:
    94             size += 1
    95     if size: yield beg1, beg2, size
    96 
    97 def alignmentInput(lines):
    98     '''Get alignments and sequence lengths, from MAF or tabular format.'''
    99     mafCount = 0
   100     for line in lines:
   101         w = line.split()
   102         if line[0].isdigit():  # tabular format
   103             chr1, beg1, seqlen1 = w[1], int(w[2]), int(w[5])
   104             if w[4] == "-": beg1 -= seqlen1
   105             chr2, beg2, seqlen2 = w[6], int(w[7]), int(w[10])
   106             if w[9] == "-": beg2 -= seqlen2
   107             blocks = tabBlocks(beg1, beg2, w[11])
   108             yield chr1, seqlen1, chr2, seqlen2, blocks
   109         elif line[0] == "s":  # MAF format
   110             if mafCount == 0:
   111                 chr1, beg1, seqlen1, seq1 = w[1], int(w[2]), int(w[5]), w[6]
   112                 if w[4] == "-": beg1 -= seqlen1
   113                 mafCount = 1
   114             else:
   115                 chr2, beg2, seqlen2, seq2 = w[1], int(w[2]), int(w[5]), w[6]
   116                 if w[4] == "-": beg2 -= seqlen2
   117                 blocks = mafBlocks(beg1, beg2, seq1, seq2)
   118                 yield chr1, seqlen1, chr2, seqlen2, blocks
   119                 mafCount = 0
   120 
   121 def seqRequestFromText(text):
   122     if ":" in text:
   123         pattern, interval = text.rsplit(":", 1)
   124         if "-" in interval:
   125             beg, end = interval.rsplit("-", 1)
   126             return pattern, int(beg), int(end)  # beg may be negative
   127     return text, 0, sys.maxsize
   128 
   129 def rangesFromSeqName(seqRequests, name, seqLen):
   130     if seqRequests:
   131         base = name.split(".")[-1]  # allow for names like hg19.chr7
   132         for pat, beg, end in seqRequests:
   133             if fnmatchcase(name, pat) or fnmatchcase(base, pat):
   134                 yield max(beg, 0), min(end, seqLen)
   135     else:
   136         yield 0, seqLen
   137 
   138 def updateSeqs(coverDict, seqRanges, seqName, ranges, coveredRange):
   139     beg, end = coveredRange
   140     if beg < 0:
   141         coveredRange = -end, -beg
   142     if seqName in coverDict:
   143         coverDict[seqName].append(coveredRange)
   144     else:
   145         coverDict[seqName] = [coveredRange]
   146         for beg, end in ranges:
   147             r = seqName, beg, end
   148             seqRanges.append(r)
   149 
   150 def readAlignments(fileName, opts):
   151     '''Get alignments and sequence limits, from MAF or tabular format.'''
   152     seqRequests1 = map(seqRequestFromText, opts.seq1)
   153     seqRequests2 = map(seqRequestFromText, opts.seq2)
   154 
   155     alignments = []
   156     seqRanges1 = []
   157     seqRanges2 = []
   158     coverDict1 = {}
   159     coverDict2 = {}
   160     lines = myOpen(fileName)
   161     for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
   162         ranges1 = sorted(rangesFromSeqName(seqRequests1, seqName1, seqLen1))
   163         if not ranges1: continue
   164         ranges2 = sorted(rangesFromSeqName(seqRequests2, seqName2, seqLen2))
   165         if not ranges2: continue
   166         b = list(croppedBlocks(list(blocks), ranges1, ranges2))
   167         if not b: continue
   168         aln = seqName1, seqName2, b
   169         alignments.append(aln)
   170         coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
   171         updateSeqs(coverDict1, seqRanges1, seqName1, ranges1, coveredRange1)
   172         coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
   173         updateSeqs(coverDict2, seqRanges2, seqName2, ranges2, coveredRange2)
   174     return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
   175 
   176 def nameAndRangesFromDict(cropDict, seqName):
   177     if seqName in cropDict:
   178         return seqName, cropDict[seqName]
   179     n = seqName.split(".")[-1]
   180     if n in cropDict:
   181         return n, cropDict[n]
   182     return seqName, []
   183 
   184 def rangesForSecondaryAlignments(primaryRanges, seqLen):
   185     if primaryRanges:
   186         return primaryRanges
   187     return [(0, seqLen)]
   188 
   189 def readSecondaryAlignments(opts, cropRanges1, cropRanges2):
   190     cropDict1 = dict(groupByFirstItem(cropRanges1))
   191     cropDict2 = dict(groupByFirstItem(cropRanges2))
   192 
   193     alignments = []
   194     seqRanges1 = []
   195     seqRanges2 = []
   196     coverDict1 = {}
   197     coverDict2 = {}
   198     lines = myOpen(opts.alignments)
   199     for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
   200         seqName1, ranges1 = nameAndRangesFromDict(cropDict1, seqName1)
   201         seqName2, ranges2 = nameAndRangesFromDict(cropDict2, seqName2)
   202         if not ranges1 and not ranges2:
   203             continue
   204         r1 = rangesForSecondaryAlignments(ranges1, seqLen1)
   205         r2 = rangesForSecondaryAlignments(ranges2, seqLen2)
   206         b = list(croppedBlocks(list(blocks), r1, r2))
   207         if not b: continue
   208         aln = seqName1, seqName2, b
   209         alignments.append(aln)
   210         if not ranges1:
   211             coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
   212             updateSeqs(coverDict1, seqRanges1, seqName1, r1, coveredRange1)
   213         if not ranges2:
   214             coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
   215             updateSeqs(coverDict2, seqRanges2, seqName2, r2, coveredRange2)
   216     return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
   217 
   218 def twoValuesFromOption(text, separator):
   219     if separator in text:
   220         return text.split(separator)
   221     return text, text
   222 
   223 def mergedRanges(ranges):
   224     oldBeg, maxEnd = ranges[0]
   225     for beg, end in ranges:
   226         if beg > maxEnd:
   227             yield oldBeg, maxEnd
   228             oldBeg = beg
   229             maxEnd = end
   230         elif end > maxEnd:
   231             maxEnd = end
   232     yield oldBeg, maxEnd
   233 
   234 def mergedRangesPerSeq(coverDict):
   235     for k, v in coverDict.iteritems():
   236         v.sort()
   237         yield k, list(mergedRanges(v))
   238 
   239 def coveredLength(mergedCoverDict):
   240     return sum(sum(e - b for b, e in v) for v in mergedCoverDict.itervalues())
   241 
   242 def trimmed(seqRanges, coverDict, minAlignedBases, maxGapFrac, endPad, midPad):
   243     maxEndGapFrac, maxMidGapFrac = twoValuesFromOption(maxGapFrac, ",")
   244     maxEndGap = max(float(maxEndGapFrac) * minAlignedBases, endPad * 1.0)
   245     maxMidGap = max(float(maxMidGapFrac) * minAlignedBases, midPad * 2.0)
   246 
   247     for seqName, rangeBeg, rangeEnd in seqRanges:
   248         seqBlocks = coverDict[seqName]
   249         blocks = [i for i in seqBlocks if i[0] < rangeEnd and i[1] > rangeBeg]
   250         if blocks[0][0] - rangeBeg > maxEndGap:
   251             rangeBeg = blocks[0][0] - endPad
   252         for j, y in enumerate(blocks):
   253             if j:
   254                 x = blocks[j - 1]
   255                 if y[0] - x[1] > maxMidGap:
   256                     yield seqName, rangeBeg, x[1] + midPad
   257                     rangeBeg = y[0] - midPad
   258         if rangeEnd - blocks[-1][1] > maxEndGap:
   259             rangeEnd = blocks[-1][1] + endPad
   260         yield seqName, rangeBeg, rangeEnd
   261 
   262 def rangesWithStrandInfo(seqRanges, strandOpt, alignments, seqIndex):
   263     if strandOpt == "1":
   264         forwardMinusReverse = collections.defaultdict(int)
   265         for i in alignments:
   266             blocks = i[2]
   267             beg1, beg2, size = blocks[0]
   268             numOfAlignedLetterPairs = sum(i[2] for i in blocks)
   269             if (beg1 < 0) != (beg2 < 0):  # opposite-strand alignment
   270                 numOfAlignedLetterPairs *= -1
   271             forwardMinusReverse[i[seqIndex]] += numOfAlignedLetterPairs
   272     strandNum = 0
   273     for seqName, beg, end in seqRanges:
   274         if strandOpt == "1":
   275             strandNum = 1 if forwardMinusReverse[seqName] >= 0 else 2
   276         yield seqName, beg, end, strandNum
   277 
   278 def natural_sort_key(my_string):
   279     '''Return a sort key for "natural" ordering, e.g. chr9 < chr10.'''
   280     parts = re.split(r'(\d+)', my_string)
   281     parts[1::2] = map(int, parts[1::2])
   282     return parts
   283 
   284 def nameKey(oneSeqRanges):
   285     return natural_sort_key(oneSeqRanges[0][0])
   286 
   287 def sizeKey(oneSeqRanges):
   288     return sum(b - e for n, b, e, s in oneSeqRanges), nameKey(oneSeqRanges)
   289 
   290 def alignmentKey(seqNamesToLists, oneSeqRanges):
   291     seqName = oneSeqRanges[0][0]
   292     alignmentsOfThisSequence = seqNamesToLists[seqName]
   293     numOfAlignedLetterPairs = sum(i[3] for i in alignmentsOfThisSequence)
   294     toMiddle = numOfAlignedLetterPairs // 2
   295     for i in alignmentsOfThisSequence:
   296         toMiddle -= i[3]
   297         if toMiddle < 0:
   298             return i[1:3]  # sequence-rank and "position" of this alignment
   299 
   300 def rankAndFlipPerSeq(seqRanges):
   301     rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
   302     for rank, group in enumerate(rangesGroupedBySeqName):
   303         seqName, ranges = group
   304         strandNum = next(ranges)[3]
   305         flip = 1 if strandNum < 2 else -1
   306         yield seqName, (rank, flip)
   307 
   308 def alignmentSortData(alignments, seqIndex, otherNamesToRanksAndFlips):
   309     otherIndex = 1 - seqIndex
   310     for i in alignments:
   311         blocks = i[2]
   312         otherRank, otherFlip = otherNamesToRanksAndFlips[i[otherIndex]]
   313         otherPos = otherFlip * abs(blocks[0][otherIndex] +
   314                                    blocks[-1][otherIndex] + blocks[-1][2])
   315         numOfAlignedLetterPairs = sum(i[2] for i in blocks)
   316         yield i[seqIndex], otherRank, otherPos, numOfAlignedLetterPairs
   317 
   318 def mySortedRanges(seqRanges, sortOpt, seqIndex, alignments, otherRanges):
   319     rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
   320     g = [list(ranges) for seqName, ranges in rangesGroupedBySeqName]
   321     for i in g:
   322         if i[0][3] > 1:
   323             i.reverse()
   324     if sortOpt == "1":
   325         g.sort(key=nameKey)
   326     if sortOpt == "2":
   327         g.sort(key=sizeKey)
   328     if sortOpt == "3":
   329         otherNamesToRanksAndFlips = dict(rankAndFlipPerSeq(otherRanges))
   330         alns = sorted(alignmentSortData(alignments, seqIndex,
   331                                         otherNamesToRanksAndFlips))
   332         alnsGroupedBySeqName = itertools.groupby(alns, itemgetter(0))
   333         seqNamesToLists = dict((k, list(v)) for k, v in alnsGroupedBySeqName)
   334         g.sort(key=functools.partial(alignmentKey, seqNamesToLists))
   335     return [j for i in g for j in i]
   336 
   337 def allSortedRanges(opts, alignments, alignmentsB,
   338                     seqRanges1, seqRangesB1, seqRanges2, seqRangesB2):
   339     o1, oB1 = twoValuesFromOption(opts.strands1, ":")
   340     o2, oB2 = twoValuesFromOption(opts.strands2, ":")
   341     if o1 == "1" and o2 == "1":
   342         raise Exception("the strand options have circular dependency")
   343     seqRanges1 = list(rangesWithStrandInfo(seqRanges1, o1, alignments, 0))
   344     seqRanges2 = list(rangesWithStrandInfo(seqRanges2, o2, alignments, 1))
   345     seqRangesB1 = list(rangesWithStrandInfo(seqRangesB1, oB1, alignmentsB, 0))
   346     seqRangesB2 = list(rangesWithStrandInfo(seqRangesB2, oB2, alignmentsB, 1))
   347 
   348     o1, oB1 = twoValuesFromOption(opts.sort1, ":")
   349     o2, oB2 = twoValuesFromOption(opts.sort2, ":")
   350     if o1 == "3" and o2 == "3":
   351         raise Exception("the sort options have circular dependency")
   352     if o1 != "3":
   353         s1 = mySortedRanges(seqRanges1, o1, None, None, None)
   354     if o2 != "3":
   355         s2 = mySortedRanges(seqRanges2, o2, None, None, None)
   356     if o1 == "3":
   357         s1 = mySortedRanges(seqRanges1, o1, 0, alignments, s2)
   358     if o2 == "3":
   359         s2 = mySortedRanges(seqRanges2, o2, 1, alignments, s1)
   360     t1 = mySortedRanges(seqRangesB1, oB1, 0, alignmentsB, s2)
   361     t2 = mySortedRanges(seqRangesB2, oB2, 1, alignmentsB, s1)
   362     return s1 + t1, s2 + t2
   363 
   364 def prettyNum(n):
   365     t = str(n)
   366     groups = []
   367     while t:
   368         groups.append(t[-3:])
   369         t = t[:-3]
   370     return ",".join(reversed(groups))
   371 
   372 def sizeText(size):
   373     suffixes = "bp", "kb", "Mb", "Gb"
   374     for i, x in enumerate(suffixes):
   375         j = 10 ** (i * 3)
   376         if size < j * 10:
   377             return "%.2g" % (1.0 * size / j) + x
   378         if size < j * 1000 or i == len(suffixes) - 1:
   379             return "%.0f" % (1.0 * size / j) + x
   380 
   381 def labelText(seqRange, labelOpt):
   382     seqName, beg, end, strandNum = seqRange
   383     if labelOpt == 1:
   384         return seqName + ": " + sizeText(end - beg)
   385     if labelOpt == 2:
   386         return seqName + ":" + prettyNum(beg) + ": " + sizeText(end - beg)
   387     if labelOpt == 3:
   388         return seqName + ":" + prettyNum(beg) + "-" + prettyNum(end)
   389     return seqName
   390 
   391 def rangeLabels(seqRanges, labelOpt, font, fontsize, image_mode, textRot):
   392     if fontsize:
   393         image_size = 1, 1
   394         im = Image.new(image_mode, image_size)
   395         draw = ImageDraw.Draw(im)
   396     x = y = 0
   397     for r in seqRanges:
   398         text = labelText(r, labelOpt)
   399         if fontsize:
   400             x, y = draw.textsize(text, font=font)
   401             if textRot:
   402                 x, y = y, x
   403         yield text, x, y, r[3]
   404 
   405 def dataFromRanges(sortedRanges, font, fontSize, imageMode, labelOpt, textRot):
   406     for seqName, rangeBeg, rangeEnd, strandNum in sortedRanges:
   407         out = [seqName, str(rangeBeg), str(rangeEnd)]
   408         if strandNum > 0:
   409             out.append(".+-"[strandNum])
   410         warn("\t".join(out))
   411     warn("")
   412     rangeSizes = [e - b for n, b, e, s in sortedRanges]
   413     labs = list(rangeLabels(sortedRanges, labelOpt, font, fontSize,
   414                             imageMode, textRot))
   415     margin = max(i[2] for i in labs)
   416     # xxx the margin may be too big, because some labels may get omitted
   417     return rangeSizes, labs, margin
   418 
   419 def div_ceil(x, y):
   420     '''Return x / y rounded up.'''
   421     q, r = divmod(x, y)
   422     return q + (r != 0)
   423 
   424 def get_bp_per_pix(rangeSizes, pixTweenRanges, maxPixels):
   425     '''Get the minimum bp-per-pixel that fits in the size limit.'''
   426     warn("choosing bp per pixel...")
   427     numOfRanges = len(rangeSizes)
   428     maxPixelsInRanges = maxPixels - pixTweenRanges * (numOfRanges - 1)
   429     if maxPixelsInRanges < numOfRanges:
   430         raise Exception("can't fit the image: too many sequences?")
   431     negLimit = -maxPixelsInRanges
   432     negBpPerPix = sum(rangeSizes) // negLimit
   433     while True:
   434         if sum(i // negBpPerPix for i in rangeSizes) >= negLimit:
   435             return -negBpPerPix
   436         negBpPerPix -= 1
   437 
   438 def getRangePixBegs(rangePixLens, pixTweenRanges, margin):
   439     '''Get the start pixel for each range.'''
   440     rangePixBegs = []
   441     pix_tot = margin - pixTweenRanges
   442     for i in rangePixLens:
   443         pix_tot += pixTweenRanges
   444         rangePixBegs.append(pix_tot)
   445         pix_tot += i
   446     return rangePixBegs
   447 
   448 def pixelData(rangeSizes, bp_per_pix, pixTweenRanges, margin):
   449     '''Return pixel information about the ranges.'''
   450     rangePixLens = [div_ceil(i, bp_per_pix) for i in rangeSizes]
   451     rangePixBegs = getRangePixBegs(rangePixLens, pixTweenRanges, margin)
   452     tot_pix = rangePixBegs[-1] + rangePixLens[-1]
   453     return rangePixBegs, rangePixLens, tot_pix
   454 
   455 def drawLineForward(hits, width, bp_per_pix, beg1, beg2, size):
   456     while True:
   457         q1, r1 = divmod(beg1, bp_per_pix)
   458         q2, r2 = divmod(beg2, bp_per_pix)
   459         hits[q2 * width + q1] |= 1
   460         next_pix = min(bp_per_pix - r1, bp_per_pix - r2)
   461         if next_pix >= size: break
   462         beg1 += next_pix
   463         beg2 += next_pix
   464         size -= next_pix
   465 
   466 def drawLineReverse(hits, width, bp_per_pix, beg1, beg2, size):
   467     while True:
   468         q1, r1 = divmod(beg1, bp_per_pix)
   469         q2, r2 = divmod(beg2, bp_per_pix)
   470         hits[q2 * width + q1] |= 2
   471         next_pix = min(bp_per_pix - r1, r2 + 1)
   472         if next_pix >= size: break
   473         beg1 += next_pix
   474         beg2 -= next_pix
   475         size -= next_pix
   476 
   477 def strandAndOrigin(ranges, beg, size):
   478     isReverseStrand = (beg < 0)
   479     if isReverseStrand:
   480         beg = -(beg + size)
   481     for rangeBeg, rangeEnd, isReverseRange, origin in ranges:
   482         if rangeEnd > beg:  # assumes the ranges are sorted
   483             return (isReverseStrand != isReverseRange), origin
   484 
   485 def alignmentPixels(width, height, alignments, bp_per_pix,
   486                     rangeDict1, rangeDict2):
   487     hits = [0] * (width * height)  # the image data
   488     for seq1, seq2, blocks in alignments:
   489         beg1, beg2, size = blocks[0]
   490         isReverse1, ori1 = strandAndOrigin(rangeDict1[seq1], beg1, size)
   491         isReverse2, ori2 = strandAndOrigin(rangeDict2[seq2], beg2, size)
   492         for beg1, beg2, size in blocks:
   493             if isReverse1:
   494                 beg1 = -(beg1 + size)
   495                 beg2 = -(beg2 + size)
   496             if isReverse1 == isReverse2:
   497                 drawLineForward(hits, width, bp_per_pix,
   498                                 ori1 + beg1, ori2 + beg2, size)
   499             else:
   500                 drawLineReverse(hits, width, bp_per_pix,
   501                                 ori1 + beg1, ori2 - beg2 - 1, size)
   502     return hits
   503 
   504 def expandedSeqDict(seqDict):
   505     '''Allow lookup by short sequence names, e.g. chr7 as well as hg19.chr7.'''
   506     newDict = seqDict.copy()
   507     for name, x in seqDict.items():
   508         if "." in name:
   509             base = name.split(".")[-1]
   510             if base in newDict:  # an ambiguous case was found:
   511                 return seqDict   # so give up completely
   512             newDict[base] = x
   513     return newDict
   514 
   515 def readBed(fileName, rangeDict):
   516     for line in myOpen(fileName):
   517         w = line.split()
   518         if not w: continue
   519         seqName = w[0]
   520         if seqName not in rangeDict: continue
   521         beg = int(w[1])
   522         end = int(w[2])
   523         layer = 900
   524         color = "#fbf"
   525         if len(w) > 4:
   526             if w[4] != ".":
   527                 layer = float(w[4])
   528             if len(w) > 5:
   529                 if len(w) > 8 and w[8].count(",") == 2:
   530                     color = "rgb(" + w[8] + ")"
   531                 else:
   532                     strand = w[5]
   533                     isRev = rangeDict[seqName][0][2]
   534                     if strand == "+" and not isRev or strand == "-" and isRev:
   535                         color = "#ffe8e8"
   536                     if strand == "-" and not isRev or strand == "+" and isRev:
   537                         color = "#e8e8ff"
   538         yield layer, color, seqName, beg, end
   539 
   540 def commaSeparatedInts(text):
   541     return map(int, text.rstrip(",").split(","))
   542 
   543 def readGenePred(opts, fileName, rangeDict):
   544     for line in myOpen(fileName):
   545         fields = line.split()
   546         if not fields: continue
   547         if fields[2] not in "+-": fields = fields[1:]
   548         seqName = fields[1]
   549         if seqName not in rangeDict: continue
   550         #strand = fields[2]
   551         cdsBeg = int(fields[5])
   552         cdsEnd = int(fields[6])
   553         exonBegs = commaSeparatedInts(fields[8])
   554         exonEnds = commaSeparatedInts(fields[9])
   555         for beg, end in zip(exonBegs, exonEnds):
   556             yield 300, opts.exon_color, seqName, beg, end
   557             b = max(beg, cdsBeg)
   558             e = min(end, cdsEnd)
   559             if b < e: yield 400, opts.cds_color, seqName, b, e
   560 
   561 def readRmsk(fileName, rangeDict):
   562     for line in myOpen(fileName):
   563         fields = line.split()
   564         if len(fields) == 17:  # rmsk.txt
   565             seqName = fields[5]
   566             if seqName not in rangeDict: continue  # do this ASAP for speed
   567             beg = int(fields[6])
   568             end = int(fields[7])
   569             strand = fields[9]
   570             repeatClass = fields[11]
   571         elif len(fields) == 15:  # .out
   572             seqName = fields[4]
   573             if seqName not in rangeDict: continue
   574             beg = int(fields[5]) - 1
   575             end = int(fields[6])
   576             strand = fields[8]
   577             repeatClass = fields[10]
   578         else:
   579             continue
   580         if repeatClass in ("Low_complexity", "Simple_repeat"):
   581             yield 200, "#fbf", seqName, beg, end
   582         elif (strand == "+") != rangeDict[seqName][0][2]:
   583             yield 100, "#ffe8e8", seqName, beg, end
   584         else:
   585             yield 100, "#e8e8ff", seqName, beg, end
   586 
   587 def isExtraFirstGapField(fields):
   588     return fields[4].isdigit()
   589 
   590 def readGaps(opts, fileName, rangeDict):
   591     '''Read locations of unsequenced gaps, from an agp or gap file.'''
   592     for line in myOpen(fileName):
   593         w = line.split()
   594         if not w or w[0][0] == "#": continue
   595         if isExtraFirstGapField(w): w = w[1:]
   596         if w[4] not in "NU": continue
   597         seqName = w[0]
   598         if seqName not in rangeDict: continue
   599         end = int(w[2])
   600         beg = end - int(w[5])  # zero-based coordinate
   601         if w[7] == "yes":
   602             yield 3000, opts.bridged_color, seqName, beg, end
   603         else:
   604             yield 2000, opts.unbridged_color, seqName, beg, end
   605 
   606 def bedBoxes(beds, rangeDict, margin, edge, isTop, bpPerPix):
   607     for layer, color, seqName, bedBeg, bedEnd in beds:
   608         for rangeBeg, rangeEnd, isReverseRange, origin in rangeDict[seqName]:
   609             beg = max(bedBeg, rangeBeg)
   610             end = min(bedEnd, rangeEnd)
   611             if beg >= end: continue
   612             if isReverseRange:
   613                 beg, end = -end, -beg
   614             if layer <= 1000:
   615                 # include partly-covered pixels
   616                 b = (origin + beg) // bpPerPix
   617                 e = div_ceil(origin + end, bpPerPix)
   618             else:
   619                 # exclude partly-covered pixels
   620                 b = div_ceil(origin + beg, bpPerPix)
   621                 e = (origin + end) // bpPerPix
   622                 if e <= b: continue
   623                 if bedEnd >= rangeEnd:  # include partly-covered end pixels
   624                     if isReverseRange:
   625                         b = (origin + beg) // bpPerPix
   626                     else:
   627                         e = div_ceil(origin + end, bpPerPix)
   628             if isTop:
   629                 box = b, margin, e, edge
   630             else:
   631                 box = margin, b, edge, e
   632             yield layer, color, box
   633 
   634 def drawAnnotations(im, boxes):
   635     # xxx use partial transparency for different-color overlaps?
   636     for layer, color, box in boxes:
   637         im.paste(color, box)
   638 
   639 def placedLabels(labels, rangePixBegs, rangePixLens, beg, end):
   640     '''Return axis labels with endpoint & sort-order information.'''
   641     maxWidth = end - beg
   642     for i, j, k in zip(labels, rangePixBegs, rangePixLens):
   643         text, textWidth, textHeight, strandNum = i
   644         if textWidth > maxWidth:
   645             continue
   646         labelBeg = j + (k - textWidth) // 2
   647         labelEnd = labelBeg + textWidth
   648         sortKey = textWidth - k
   649         if labelBeg < beg:
   650             sortKey += maxWidth * (beg - labelBeg)
   651             labelBeg = beg
   652             labelEnd = beg + textWidth
   653         if labelEnd > end:
   654             sortKey += maxWidth * (labelEnd - end)
   655             labelEnd = end
   656             labelBeg = end - textWidth
   657         yield sortKey, labelBeg, labelEnd, text, textHeight, strandNum
   658 
   659 def nonoverlappingLabels(labels, minPixTweenLabels):
   660     '''Get a subset of non-overlapping axis labels, greedily.'''
   661     out = []
   662     for i in labels:
   663         beg = i[1] - minPixTweenLabels
   664         end = i[2] + minPixTweenLabels
   665         if all(j[2] <= beg or j[1] >= end for j in out):
   666             out.append(i)
   667     return out
   668 
   669 def axisImage(labels, rangePixBegs, rangePixLens, textRot,
   670               textAln, font, image_mode, opts):
   671     '''Make an image of axis labels.'''
   672     beg = rangePixBegs[0]
   673     end = rangePixBegs[-1] + rangePixLens[-1]
   674     margin = max(i[2] for i in labels)
   675     labels = sorted(placedLabels(labels, rangePixBegs, rangePixLens, beg, end))
   676     minPixTweenLabels = 0 if textRot else opts.label_space
   677     labels = nonoverlappingLabels(labels, minPixTweenLabels)
   678     image_size = (margin, end) if textRot else (end, margin)
   679     im = Image.new(image_mode, image_size, opts.margin_color)
   680     draw = ImageDraw.Draw(im)
   681     for sortKey, labelBeg, labelEnd, text, textHeight, strandNum in labels:
   682         base = margin - textHeight if textAln else 0
   683         position = (base, labelBeg) if textRot else (labelBeg, base)
   684         fill = ("black", opts.forwardcolor, opts.reversecolor)[strandNum]
   685         draw.text(position, text, font=font, fill=fill)
   686     return im
   687 
   688 def rangesWithOrigins(sortedRanges, rangePixBegs, rangePixLens, bpPerPix):
   689     for i, j, k in zip(sortedRanges, rangePixBegs, rangePixLens):
   690         seqName, rangeBeg, rangeEnd, strandNum = i
   691         isReverseRange = (strandNum > 1)
   692         if isReverseRange:
   693             origin = bpPerPix * (j + k) + rangeBeg
   694         else:
   695             origin = bpPerPix * j - rangeBeg
   696         yield seqName, (rangeBeg, rangeEnd, isReverseRange, origin)
   697 
   698 def rangesPerSeq(sortedRanges, rangePixBegs, rangePixLens, bpPerPix):
   699     a = rangesWithOrigins(sortedRanges, rangePixBegs, rangePixLens, bpPerPix)
   700     for k, v in itertools.groupby(a, itemgetter(0)):
   701         yield k, sorted(i[1] for i in v)
   702 
   703 def getFont(opts):
   704     if opts.fontfile:
   705         return ImageFont.truetype(opts.fontfile, opts.fontsize)
   706     fileNames = []
   707     try:
   708         x = ["fc-match", "-f%{file}", "arial"]
   709         p = subprocess.Popen(x, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
   710         out, err = p.communicate()
   711         fileNames.append(out)
   712     except OSError as e:
   713         warn("fc-match error: " + str(e))
   714     fileNames.append("/Library/Fonts/Arial.ttf")  # for Mac
   715     for i in fileNames:
   716         try:
   717             font = ImageFont.truetype(i, opts.fontsize)
   718             warn("font: " + i)
   719             return font
   720         except IOError as e:
   721             warn("font load error: " + str(e))
   722     return ImageFont.load_default()
   723 
   724 def lastDotplot(opts, args):
   725     font = getFont(opts)
   726     image_mode = 'RGB'
   727     forward_color = ImageColor.getcolor(opts.forwardcolor, image_mode)
   728     reverse_color = ImageColor.getcolor(opts.reversecolor, image_mode)
   729     zipped_colors = zip(forward_color, reverse_color)
   730     overlap_color = tuple([(i + j) // 2 for i, j in zipped_colors])
   731 
   732     maxGap1, maxGapB1 = twoValuesFromOption(opts.max_gap1, ":")
   733     maxGap2, maxGapB2 = twoValuesFromOption(opts.max_gap2, ":")
   734 
   735     warn("reading alignments...")
   736     alnData = readAlignments(args[0], opts)
   737     alignments, seqRanges1, coverDict1, seqRanges2, coverDict2 = alnData
   738     if not alignments: raise Exception("there are no alignments")
   739     warn("cutting...")
   740     coverDict1 = dict(mergedRangesPerSeq(coverDict1))
   741     coverDict2 = dict(mergedRangesPerSeq(coverDict2))
   742     minAlignedBases = min(coveredLength(coverDict1), coveredLength(coverDict2))
   743     pad = int(opts.pad * minAlignedBases)
   744     cutRanges1 = list(trimmed(seqRanges1, coverDict1, minAlignedBases,
   745                               maxGap1, pad, pad))
   746     cutRanges2 = list(trimmed(seqRanges2, coverDict2, minAlignedBases,
   747                               maxGap2, pad, pad))
   748 
   749     warn("reading secondary alignments...")
   750     alnDataB = readSecondaryAlignments(opts, cutRanges1, cutRanges2)
   751     alignmentsB, seqRangesB1, coverDictB1, seqRangesB2, coverDictB2 = alnDataB
   752     warn("cutting...")
   753     coverDictB1 = dict(mergedRangesPerSeq(coverDictB1))
   754     coverDictB2 = dict(mergedRangesPerSeq(coverDictB2))
   755     cutRangesB1 = trimmed(seqRangesB1, coverDictB1, minAlignedBases,
   756                           maxGapB1, 0, 0)
   757     cutRangesB2 = trimmed(seqRangesB2, coverDictB2, minAlignedBases,
   758                           maxGapB2, 0, 0)
   759 
   760     warn("sorting...")
   761     sortOut = allSortedRanges(opts, alignments, alignmentsB,
   762                               cutRanges1, cutRangesB1, cutRanges2, cutRangesB2)
   763     sortedRanges1, sortedRanges2 = sortOut
   764 
   765     textRot1 = "vertical".startswith(opts.rot1)
   766     i1 = dataFromRanges(sortedRanges1, font,
   767                         opts.fontsize, image_mode, opts.labels1, textRot1)
   768     rangeSizes1, labelData1, tMargin = i1
   769 
   770     textRot2 = "horizontal".startswith(opts.rot2)
   771     i2 = dataFromRanges(sortedRanges2, font,
   772                         opts.fontsize, image_mode, opts.labels2, textRot2)
   773     rangeSizes2, labelData2, lMargin = i2
   774 
   775     maxPixels1 = opts.width  - lMargin
   776     maxPixels2 = opts.height - tMargin
   777     bpPerPix1 = get_bp_per_pix(rangeSizes1, opts.border_pixels, maxPixels1)
   778     bpPerPix2 = get_bp_per_pix(rangeSizes2, opts.border_pixels, maxPixels2)
   779     bpPerPix = max(bpPerPix1, bpPerPix2)
   780     warn("bp per pixel = " + str(bpPerPix))
   781 
   782     p1 = pixelData(rangeSizes1, bpPerPix, opts.border_pixels, lMargin)
   783     rangePixBegs1, rangePixLens1, width = p1
   784     rangeDict1 = dict(rangesPerSeq(sortedRanges1, rangePixBegs1,
   785                                    rangePixLens1, bpPerPix))
   786 
   787     p2 = pixelData(rangeSizes2, bpPerPix, opts.border_pixels, tMargin)
   788     rangePixBegs2, rangePixLens2, height = p2
   789     rangeDict2 = dict(rangesPerSeq(sortedRanges2, rangePixBegs2,
   790                                    rangePixLens2, bpPerPix))
   791 
   792     warn("width:  " + str(width))
   793     warn("height: " + str(height))
   794 
   795     warn("processing alignments...")
   796     hits = alignmentPixels(width, height, alignments + alignmentsB, bpPerPix,
   797                            rangeDict1, rangeDict2)
   798 
   799     warn("reading annotations...")
   800 
   801     rangeDict1 = expandedSeqDict(rangeDict1)
   802     beds1 = itertools.chain(readBed(opts.bed1, rangeDict1),
   803                             readRmsk(opts.rmsk1, rangeDict1),
   804                             readGenePred(opts, opts.genePred1, rangeDict1),
   805                             readGaps(opts, opts.gap1, rangeDict1))
   806     b1 = bedBoxes(beds1, rangeDict1, tMargin, height, True, bpPerPix)
   807 
   808     rangeDict2 = expandedSeqDict(rangeDict2)
   809     beds2 = itertools.chain(readBed(opts.bed2, rangeDict2),
   810                             readRmsk(opts.rmsk2, rangeDict2),
   811                             readGenePred(opts, opts.genePred2, rangeDict2),
   812                             readGaps(opts, opts.gap2, rangeDict2))
   813     b2 = bedBoxes(beds2, rangeDict2, lMargin, width, False, bpPerPix)
   814 
   815     boxes = sorted(itertools.chain(b1, b2))
   816 
   817     warn("drawing...")
   818 
   819     image_size = width, height
   820     im = Image.new(image_mode, image_size, opts.background_color)
   821 
   822     drawAnnotations(im, boxes)
   823 
   824     for i in range(height):
   825         for j in range(width):
   826             store_value = hits[i * width + j]
   827             xy = j, i
   828             if   store_value == 1: im.putpixel(xy, forward_color)
   829             elif store_value == 2: im.putpixel(xy, reverse_color)
   830             elif store_value == 3: im.putpixel(xy, overlap_color)
   831 
   832     if opts.fontsize != 0:
   833         axis1 = axisImage(labelData1, rangePixBegs1, rangePixLens1,
   834                           textRot1, False, font, image_mode, opts)
   835         if textRot1:
   836             axis1 = axis1.transpose(Image.ROTATE_90)
   837         axis2 = axisImage(labelData2, rangePixBegs2, rangePixLens2,
   838                           textRot2, textRot2, font, image_mode, opts)
   839         if not textRot2:
   840             axis2 = axis2.transpose(Image.ROTATE_270)
   841         im.paste(axis1, (0, 0))
   842         im.paste(axis2, (0, 0))
   843 
   844     for i in rangePixBegs1[1:]:
   845         box = i - opts.border_pixels, tMargin, i, height
   846         im.paste(opts.border_color, box)
   847 
   848     for i in rangePixBegs2[1:]:
   849         box = lMargin, i - opts.border_pixels, width, i
   850         im.paste(opts.border_color, box)
   851 
   852     im.save(args[1])
   853 
   854 if __name__ == "__main__":
   855     usage = """%prog --help
   856    or: %prog [options] maf-or-tab-alignments dotplot.png
   857    or: %prog [options] maf-or-tab-alignments dotplot.gif
   858    or: ..."""
   859     description = "Draw a dotplot of pair-wise sequence alignments in MAF or tabular format."
   860     op = optparse.OptionParser(usage=usage, description=description)
   861     op.add_option("-v", "--verbose", action="count",
   862                   help="show progress messages & data about the plot")
   863     op.add_option("-1", "--seq1", metavar="PATTERN", action="append",
   864                   default=[],
   865                   help="which sequences to show from the 1st genome")
   866     op.add_option("-2", "--seq2", metavar="PATTERN", action="append",
   867                   default=[],
   868                   help="which sequences to show from the 2nd genome")
   869     # Replace "width" & "height" with a single "length" option?
   870     op.add_option("-x", "--width", type="int", default=1000,
   871                   help="maximum width in pixels (default: %default)")
   872     op.add_option("-y", "--height", type="int", default=1000,
   873                   help="maximum height in pixels (default: %default)")
   874     op.add_option("-c", "--forwardcolor", metavar="COLOR", default="red",
   875                   help="color for forward alignments (default: %default)")
   876     op.add_option("-r", "--reversecolor", metavar="COLOR", default="blue",
   877                   help="color for reverse alignments (default: %default)")
   878     op.add_option("--alignments", metavar="FILE", help="secondary alignments")
   879     op.add_option("--sort1", default="1", metavar="N",
   880                   help="genome1 sequence order: 0=input order, 1=name order, "
   881                   "2=length order, 3=alignment order (default=%default)")
   882     op.add_option("--sort2", default="1", metavar="N",
   883                   help="genome2 sequence order: 0=input order, 1=name order, "
   884                   "2=length order, 3=alignment order (default=%default)")
   885     op.add_option("--strands1", default="0", metavar="N", help=
   886                   "genome1 sequence orientation: 0=forward orientation, "
   887                   "1=alignment orientation (default=%default)")
   888     op.add_option("--strands2", default="0", metavar="N", help=
   889                   "genome2 sequence orientation: 0=forward orientation, "
   890                   "1=alignment orientation (default=%default)")
   891     op.add_option("--max-gap1", metavar="FRAC", default="0.5,2", help=
   892                   "maximum unaligned (end,mid) gap in genome1: "
   893                   "fraction of aligned length (default=%default)")
   894     op.add_option("--max-gap2", metavar="FRAC", default="0.5,2", help=
   895                   "maximum unaligned (end,mid) gap in genome2: "
   896                   "fraction of aligned length (default=%default)")
   897     op.add_option("--pad", metavar="FRAC", type="float", default=0.04, help=
   898                   "pad length when cutting unaligned gaps: "
   899                   "fraction of aligned length (default=%default)")
   900     op.add_option("--border-pixels", metavar="INT", type="int", default=1,
   901                   help="number of pixels between sequences (default=%default)")
   902     op.add_option("--border-color", metavar="COLOR", default="black",
   903                   help="color for pixels between sequences (default=%default)")
   904     # --break-color and/or --break-pixels for intra-sequence breaks?
   905     op.add_option("--margin-color", metavar="COLOR", default="#dcdcdc",
   906                   help="margin color")
   907 
   908     og = optparse.OptionGroup(op, "Text options")
   909     og.add_option("-f", "--fontfile", metavar="FILE",
   910                   help="TrueType or OpenType font file")
   911     og.add_option("-s", "--fontsize", metavar="SIZE", type="int", default=14,
   912                   help="TrueType or OpenType font size (default: %default)")
   913     og.add_option("--labels1", type="int", default=0, metavar="N", help=
   914                   "genome1 labels: 0=name, 1=name:length, "
   915                   "2=name:start:length, 3=name:start-end (default=%default)")
   916     og.add_option("--labels2", type="int", default=0, metavar="N", help=
   917                   "genome2 labels: 0=name, 1=name:length, "
   918                   "2=name:start:length, 3=name:start-end (default=%default)")
   919     og.add_option("--rot1", metavar="ROT", default="h",
   920                   help="text rotation for the 1st genome (default=%default)")
   921     og.add_option("--rot2", metavar="ROT", default="v",
   922                   help="text rotation for the 2nd genome (default=%default)")
   923     op.add_option_group(og)
   924 
   925     og = optparse.OptionGroup(op, "Annotation options")
   926     og.add_option("--bed1", metavar="FILE",
   927                   help="read genome1 annotations from BED file")
   928     og.add_option("--bed2", metavar="FILE",
   929                   help="read genome2 annotations from BED file")
   930     og.add_option("--rmsk1", metavar="FILE", help="read genome1 repeats from "
   931                   "RepeatMasker .out or rmsk.txt file")
   932     og.add_option("--rmsk2", metavar="FILE", help="read genome2 repeats from "
   933                   "RepeatMasker .out or rmsk.txt file")
   934     op.add_option_group(og)
   935 
   936     og = optparse.OptionGroup(op, "Gene options")
   937     og.add_option("--genePred1", metavar="FILE",
   938                   help="read genome1 genes from genePred file")
   939     og.add_option("--genePred2", metavar="FILE",
   940                   help="read genome2 genes from genePred file")
   941     og.add_option("--exon-color", metavar="COLOR", default="PaleGreen",
   942                   help="color for exons (default=%default)")
   943     og.add_option("--cds-color", metavar="COLOR", default="LimeGreen",
   944                   help="color for protein-coding regions (default=%default)")
   945     op.add_option_group(og)
   946 
   947     og = optparse.OptionGroup(op, "Unsequenced gap options")
   948     og.add_option("--gap1", metavar="FILE",
   949                   help="read genome1 unsequenced gaps from agp or gap file")
   950     og.add_option("--gap2", metavar="FILE",
   951                   help="read genome2 unsequenced gaps from agp or gap file")
   952     og.add_option("--bridged-color", metavar="COLOR", default="yellow",
   953                   help="color for bridged gaps (default: %default)")
   954     og.add_option("--unbridged-color", metavar="COLOR", default="orange",
   955                   help="color for unbridged gaps (default: %default)")
   956     op.add_option_group(og)
   957     (opts, args) = op.parse_args()
   958     if len(args) != 2: op.error("2 arguments needed")
   959 
   960     opts.background_color = "white"
   961     opts.label_space = 5     # minimum number of pixels between axis labels
   962 
   963     try: lastDotplot(opts, args)
   964     except KeyboardInterrupt: pass  # avoid silly error message
   965     except Exception as e:
   966         prog = os.path.basename(sys.argv[0])
   967         sys.exit(prog + ": error: " + str(e))