scripts/last-dotplot
author Martin C. Frith
Thu Dec 21 16:22:52 2017 +0900 (2017-12-21)
changeset 915 6079fabb49a5
parent 914 f4ca19d126e3
child 916 ffe68ba3d865
permissions -rwxr-xr-x
Refactor last-dotplot
     1 #! /usr/bin/env python
     2 
     3 # Read pair-wise alignments in MAF or LAST tabular format: write an
     4 # "Oxford grid", a.k.a. dotplot.
     5 
     6 # TODO: Currently, pixels with zero aligned nt-pairs are white, and
     7 # pixels with one or more aligned nt-pairs are black.  This can look
     8 # too crowded for large genome alignments.  I tried shading each pixel
     9 # according to the number of aligned nt-pairs within it, but the
    10 # result is too faint.  How can this be done better?
    11 
    12 import functools
    13 import gzip
    14 from fnmatch import fnmatchcase
    15 from operator import itemgetter
    16 import subprocess
    17 import itertools, optparse, os, re, sys
    18 
    19 # Try to make PIL/PILLOW work:
    20 try: from PIL import Image, ImageDraw, ImageFont, ImageColor
    21 except ImportError: import Image, ImageDraw, ImageFont, ImageColor
    22 
    23 def myOpen(fileName):  # faster than fileinput
    24     if fileName is None:
    25         return []
    26     if fileName == "-":
    27         return sys.stdin
    28     if fileName.endswith(".gz"):
    29         return gzip.open(fileName)
    30     return open(fileName)
    31 
    32 def warn(message):
    33     if opts.verbose:
    34         prog = os.path.basename(sys.argv[0])
    35         sys.stderr.write(prog + ": " + message + "\n")
    36 
    37 def groupByFirstItem(things):
    38     for k, v in itertools.groupby(things, itemgetter(0)):
    39         yield k, [i[1:] for i in v]
    40 
    41 def croppedBlocks(blocks, ranges1, ranges2):
    42     headBeg1, headBeg2, headSize = blocks[0]
    43     for r1 in ranges1:
    44         for r2 in ranges2:
    45             cropBeg1, cropEnd1 = r1
    46             if headBeg1 < 0:
    47                 cropBeg1, cropEnd1 = -cropEnd1, -cropBeg1
    48             cropBeg2, cropEnd2 = r2
    49             if headBeg2 < 0:
    50                 cropBeg2, cropEnd2 = -cropEnd2, -cropBeg2
    51             for beg1, beg2, size in blocks:
    52                 b1 = max(cropBeg1, beg1)
    53                 e1 = min(cropEnd1, beg1 + size)
    54                 if b1 >= e1: continue
    55                 offset = beg2 - beg1
    56                 b2 = max(cropBeg2, b1 + offset)
    57                 e2 = min(cropEnd2, e1 + offset)
    58                 if b2 >= e2: continue
    59                 yield b2 - offset, b2, e2 - b2
    60 
    61 def tabBlocks(beg1, beg2, blocks):
    62     '''Get the gapless blocks of an alignment, from LAST tabular format.'''
    63     for i in blocks.split(","):
    64         if ":" in i:
    65             x, y = i.split(":")
    66             beg1 += int(x)
    67             beg2 += int(y)
    68         else:
    69             size = int(i)
    70             yield beg1, beg2, size
    71             beg1 += size
    72             beg2 += size
    73 
    74 def mafBlocks(beg1, beg2, seq1, seq2):
    75     '''Get the gapless blocks of an alignment, from MAF format.'''
    76     size = 0
    77     for x, y in itertools.izip(seq1, seq2):
    78         if x == "-":
    79             if size:
    80                 yield beg1, beg2, size
    81                 beg1 += size
    82                 beg2 += size
    83                 size = 0
    84             beg2 += 1
    85         elif y == "-":
    86             if size:
    87                 yield beg1, beg2, size
    88                 beg1 += size
    89                 beg2 += size
    90                 size = 0
    91             beg1 += 1
    92         else:
    93             size += 1
    94     if size: yield beg1, beg2, size
    95 
    96 def alignmentInput(lines):
    97     '''Get alignments and sequence lengths, from MAF or tabular format.'''
    98     mafCount = 0
    99     for line in lines:
   100         w = line.split()
   101         if line[0].isdigit():  # tabular format
   102             chr1, beg1, seqlen1 = w[1], int(w[2]), int(w[5])
   103             if w[4] == "-": beg1 -= seqlen1
   104             chr2, beg2, seqlen2 = w[6], int(w[7]), int(w[10])
   105             if w[9] == "-": beg2 -= seqlen2
   106             blocks = tabBlocks(beg1, beg2, w[11])
   107             yield chr1, seqlen1, chr2, seqlen2, blocks
   108         elif line[0] == "s":  # MAF format
   109             if mafCount == 0:
   110                 chr1, beg1, seqlen1, seq1 = w[1], int(w[2]), int(w[5]), w[6]
   111                 if w[4] == "-": beg1 -= seqlen1
   112                 mafCount = 1
   113             else:
   114                 chr2, beg2, seqlen2, seq2 = w[1], int(w[2]), int(w[5]), w[6]
   115                 if w[4] == "-": beg2 -= seqlen2
   116                 blocks = mafBlocks(beg1, beg2, seq1, seq2)
   117                 yield chr1, seqlen1, chr2, seqlen2, blocks
   118                 mafCount = 0
   119 
   120 def seqRequestFromText(text):
   121     if ":" in text:
   122         pattern, interval = text.rsplit(":", 1)
   123         if "-" in interval:
   124             beg, end = interval.rsplit("-", 1)
   125             return pattern, int(beg), int(end)  # beg may be negative
   126     return text, 0, sys.maxsize
   127 
   128 def rangesFromSeqName(seqRequests, name, seqLen):
   129     if seqRequests:
   130         base = name.split(".")[-1]  # allow for names like hg19.chr7
   131         for pat, beg, end in seqRequests:
   132             if fnmatchcase(name, pat) or fnmatchcase(base, pat):
   133                 yield max(beg, 0), min(end, seqLen)
   134     else:
   135         yield 0, seqLen
   136 
   137 def updateSeqs(coverDict, seqRanges, seqName, ranges, coveredRange):
   138     beg, end = coveredRange
   139     if beg < 0:
   140         coveredRange = -end, -beg
   141     if seqName in coverDict:
   142         coverDict[seqName].append(coveredRange)
   143     else:
   144         coverDict[seqName] = [coveredRange]
   145         for beg, end in ranges:
   146             r = seqName, beg, end
   147             seqRanges.append(r)
   148 
   149 def readAlignments(fileName, opts):
   150     '''Get alignments and sequence limits, from MAF or tabular format.'''
   151     seqRequests1 = map(seqRequestFromText, opts.seq1)
   152     seqRequests2 = map(seqRequestFromText, opts.seq2)
   153 
   154     alignments = []
   155     seqRanges1 = []
   156     seqRanges2 = []
   157     coverDict1 = {}
   158     coverDict2 = {}
   159     lines = myOpen(fileName)
   160     for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
   161         ranges1 = sorted(rangesFromSeqName(seqRequests1, seqName1, seqLen1))
   162         if not ranges1: continue
   163         ranges2 = sorted(rangesFromSeqName(seqRequests2, seqName2, seqLen2))
   164         if not ranges2: continue
   165         b = list(croppedBlocks(list(blocks), ranges1, ranges2))
   166         if not b: continue
   167         aln = seqName1, seqName2, b
   168         alignments.append(aln)
   169         coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
   170         updateSeqs(coverDict1, seqRanges1, seqName1, ranges1, coveredRange1)
   171         coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
   172         updateSeqs(coverDict2, seqRanges2, seqName2, ranges2, coveredRange2)
   173     return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
   174 
   175 def nameAndRangesFromDict(cropDict, seqName):
   176     if seqName in cropDict:
   177         return seqName, cropDict[seqName]
   178     n = seqName.split(".")[-1]
   179     if n in cropDict:
   180         return n, cropDict[n]
   181     return seqName, []
   182 
   183 def rangesForSecondaryAlignments(primaryRanges, seqLen):
   184     if primaryRanges:
   185         return primaryRanges
   186     return [(0, seqLen)]
   187 
   188 def readSecondaryAlignments(opts, cropRanges1, cropRanges2):
   189     cropDict1 = dict(groupByFirstItem(cropRanges1))
   190     cropDict2 = dict(groupByFirstItem(cropRanges2))
   191 
   192     alignments = []
   193     seqRanges1 = []
   194     seqRanges2 = []
   195     coverDict1 = {}
   196     coverDict2 = {}
   197     lines = myOpen(opts.alignments)
   198     for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
   199         seqName1, ranges1 = nameAndRangesFromDict(cropDict1, seqName1)
   200         seqName2, ranges2 = nameAndRangesFromDict(cropDict2, seqName2)
   201         if not ranges1 and not ranges2:
   202             continue
   203         r1 = rangesForSecondaryAlignments(ranges1, seqLen1)
   204         r2 = rangesForSecondaryAlignments(ranges2, seqLen2)
   205         b = list(croppedBlocks(list(blocks), r1, r2))
   206         if not b: continue
   207         aln = seqName1, seqName2, b
   208         alignments.append(aln)
   209         if not ranges1:
   210             coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
   211             updateSeqs(coverDict1, seqRanges1, seqName1, r1, coveredRange1)
   212         if not ranges2:
   213             coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
   214             updateSeqs(coverDict2, seqRanges2, seqName2, r2, coveredRange2)
   215     return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
   216 
   217 def twoValuesFromOption(text, separator):
   218     if separator in text:
   219         return text.split(separator)
   220     return text, text
   221 
   222 def mergedRanges(ranges):
   223     oldBeg, maxEnd = ranges[0]
   224     for beg, end in ranges:
   225         if beg > maxEnd:
   226             yield oldBeg, maxEnd
   227             oldBeg = beg
   228             maxEnd = end
   229         elif end > maxEnd:
   230             maxEnd = end
   231     yield oldBeg, maxEnd
   232 
   233 def mergedRangesPerSeq(coverDict):
   234     for k, v in coverDict.iteritems():
   235         v.sort()
   236         yield k, list(mergedRanges(v))
   237 
   238 def coveredLength(mergedCoverDict):
   239     return sum(sum(e - b for b, e in v) for v in mergedCoverDict.itervalues())
   240 
   241 def trimmed(seqRanges, coverDict, minAlignedBases, maxGapFrac, endPad, midPad):
   242     maxEndGapFrac, maxMidGapFrac = twoValuesFromOption(maxGapFrac, ",")
   243     maxEndGap = max(float(maxEndGapFrac) * minAlignedBases, endPad * 1.0)
   244     maxMidGap = max(float(maxMidGapFrac) * minAlignedBases, midPad * 2.0)
   245 
   246     for seqName, rangeBeg, rangeEnd in seqRanges:
   247         seqBlocks = coverDict[seqName]
   248         blocks = [i for i in seqBlocks if i[0] < rangeEnd and i[1] > rangeBeg]
   249         if blocks[0][0] - rangeBeg > maxEndGap:
   250             rangeBeg = blocks[0][0] - endPad
   251         for j, y in enumerate(blocks):
   252             if j:
   253                 x = blocks[j - 1]
   254                 if y[0] - x[1] > maxMidGap:
   255                     yield seqName, rangeBeg, x[1] + midPad
   256                     rangeBeg = y[0] - midPad
   257         if rangeEnd - blocks[-1][1] > maxEndGap:
   258             rangeEnd = blocks[-1][1] + endPad
   259         yield seqName, rangeBeg, rangeEnd
   260 
   261 def natural_sort_key(my_string):
   262     '''Return a sort key for "natural" ordering, e.g. chr9 < chr10.'''
   263     parts = re.split(r'(\d+)', my_string)
   264     parts[1::2] = map(int, parts[1::2])
   265     return parts
   266 
   267 def nameKey(oneSeqRanges):
   268     return natural_sort_key(oneSeqRanges[0][0])
   269 
   270 def sizeKey(oneSeqRanges):
   271     return sum(b - e for n, b, e in oneSeqRanges), nameKey(oneSeqRanges)
   272 
   273 def alignmentKey(seqNamesToLists, oneSeqRanges):
   274     seqName = oneSeqRanges[0][0]
   275     alignmentsOfThisSequence = seqNamesToLists[seqName]
   276     numOfAlignedLetterPairs = sum(i[3] for i in alignmentsOfThisSequence)
   277     toMiddle = numOfAlignedLetterPairs // 2
   278     for i in alignmentsOfThisSequence:
   279         toMiddle -= i[3]
   280         if toMiddle < 0:
   281             return i[1:3]  # sequence-rank and "position" of this alignment
   282 
   283 def alignmentSortData(alignments, seqIndex, otherNamesToRanks):
   284     otherIndex = 1 - seqIndex
   285     for i in alignments:
   286         blocks = i[2]
   287         otherRank = otherNamesToRanks[i[otherIndex]]
   288         otherPos = abs(blocks[0][otherIndex] +
   289                        blocks[-1][otherIndex] + blocks[-1][2])
   290         numOfAlignedLetterPairs = sum(i[2] for i in blocks)
   291         yield i[seqIndex], otherRank, otherPos, numOfAlignedLetterPairs
   292 
   293 def mySortedRanges(seqRanges, sortOpt, seqIndex, alignments, otherRanges):
   294     rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
   295     g = [list(ranges) for seqName, ranges in rangesGroupedBySeqName]
   296     if sortOpt == "1":
   297         g.sort(key=nameKey)
   298     if sortOpt == "2":
   299         g.sort(key=sizeKey)
   300     if sortOpt == "3":
   301         otherNameGroups = itertools.groupby(i[0] for i in otherRanges)
   302         ranksAndNames = enumerate(i[0] for i in otherNameGroups)
   303         otherNamesToRanks = dict((n, r) for r, n in ranksAndNames)
   304         alns = sorted(alignmentSortData(alignments, seqIndex,
   305                                         otherNamesToRanks))
   306         alnsGroupedBySeqName = itertools.groupby(alns, itemgetter(0))
   307         seqNamesToLists = dict((k, list(v)) for k, v in alnsGroupedBySeqName)
   308         g.sort(key=functools.partial(alignmentKey, seqNamesToLists))
   309     return [j for i in g for j in i]
   310 
   311 def allSortedRanges(opts, alignments, alignmentsB,
   312                     seqRanges1, seqRangesB1, seqRanges2, seqRangesB2):
   313     o1, oB1 = twoValuesFromOption(opts.sort1, ":")
   314     o2, oB2 = twoValuesFromOption(opts.sort2, ":")
   315     if o1 == "3" and o2 == "3":
   316         raise Exception("the sort options have circular dependency")
   317     if o1 != "3":
   318         s1 = mySortedRanges(seqRanges1, o1, None, None, None)
   319     if o2 != "3":
   320         s2 = mySortedRanges(seqRanges2, o2, None, None, None)
   321     if o1 == "3":
   322         s1 = mySortedRanges(seqRanges1, o1, 0, alignments, s2)
   323     if o2 == "3":
   324         s2 = mySortedRanges(seqRanges2, o2, 1, alignments, s1)
   325     t1 = mySortedRanges(seqRangesB1, oB1, 0, alignmentsB, s2)
   326     t2 = mySortedRanges(seqRangesB2, oB2, 1, alignmentsB, s1)
   327     return s1 + t1, s2 + t2
   328 
   329 def prettyNum(n):
   330     t = str(n)
   331     groups = []
   332     while t:
   333         groups.append(t[-3:])
   334         t = t[:-3]
   335     return ",".join(reversed(groups))
   336 
   337 def sizeText(size):
   338     suffixes = "bp", "kb", "Mb", "Gb"
   339     for i, x in enumerate(suffixes):
   340         j = 10 ** (i * 3)
   341         if size < j * 10:
   342             return "%.2g" % (1.0 * size / j) + x
   343         if size < j * 1000 or i == len(suffixes) - 1:
   344             return "%.0f" % (1.0 * size / j) + x
   345 
   346 def labelText(seqRange, labelOpt):
   347     seqName, beg, end = seqRange
   348     if labelOpt == 1:
   349         return seqName + ": " + sizeText(end - beg)
   350     if labelOpt == 2:
   351         return seqName + ":" + prettyNum(beg) + ": " + sizeText(end - beg)
   352     if labelOpt == 3:
   353         return seqName + ":" + prettyNum(beg) + "-" + prettyNum(end)
   354     return seqName
   355 
   356 def rangeLabels(seqRanges, labelOpt, font, fontsize, image_mode, textRot):
   357     if fontsize:
   358         image_size = 1, 1
   359         im = Image.new(image_mode, image_size)
   360         draw = ImageDraw.Draw(im)
   361     x = y = 0
   362     for r in seqRanges:
   363         text = labelText(r, labelOpt)
   364         if fontsize:
   365             x, y = draw.textsize(text, font=font)
   366             if textRot:
   367                 x, y = y, x
   368         yield text, x, y
   369 
   370 def dataFromRanges(sortedRanges, font, fontSize, imageMode, labelOpt, textRot):
   371     for i in sortedRanges:
   372         warn("\t".join(map(str, i)))
   373     warn("")
   374     rangeSizes = [e - b for n, b, e in sortedRanges]
   375     labs = list(rangeLabels(sortedRanges, labelOpt, font, fontSize,
   376                             imageMode, textRot))
   377     margin = max(i[2] for i in labs)
   378     # xxx the margin may be too big, because some labels may get omitted
   379     return rangeSizes, labs, margin
   380 
   381 def div_ceil(x, y):
   382     '''Return x / y rounded up.'''
   383     q, r = divmod(x, y)
   384     return q + (r != 0)
   385 
   386 def get_bp_per_pix(rangeSizes, pixTweenRanges, maxPixels):
   387     '''Get the minimum bp-per-pixel that fits in the size limit.'''
   388     warn("choosing bp per pixel...")
   389     numOfRanges = len(rangeSizes)
   390     maxPixelsInRanges = maxPixels - pixTweenRanges * (numOfRanges - 1)
   391     if maxPixelsInRanges < numOfRanges:
   392         raise Exception("can't fit the image: too many sequences?")
   393     negLimit = -maxPixelsInRanges
   394     negBpPerPix = sum(rangeSizes) // negLimit
   395     while True:
   396         if sum(i // negBpPerPix for i in rangeSizes) >= negLimit:
   397             return -negBpPerPix
   398         negBpPerPix -= 1
   399 
   400 def getRangePixBegs(rangePixLens, pixTweenRanges, margin):
   401     '''Get the start pixel for each range.'''
   402     rangePixBegs = []
   403     pix_tot = margin - pixTweenRanges
   404     for i in rangePixLens:
   405         pix_tot += pixTweenRanges
   406         rangePixBegs.append(pix_tot)
   407         pix_tot += i
   408     return rangePixBegs
   409 
   410 def pixelData(rangeSizes, bp_per_pix, pixTweenRanges, margin):
   411     '''Return pixel information about the ranges.'''
   412     rangePixLens = [div_ceil(i, bp_per_pix) for i in rangeSizes]
   413     rangePixBegs = getRangePixBegs(rangePixLens, pixTweenRanges, margin)
   414     tot_pix = rangePixBegs[-1] + rangePixLens[-1]
   415     return rangePixBegs, rangePixLens, tot_pix
   416 
   417 def drawLineForward(hits, width, bp_per_pix, beg1, beg2, size):
   418     while True:
   419         q1, r1 = divmod(beg1, bp_per_pix)
   420         q2, r2 = divmod(beg2, bp_per_pix)
   421         hits[q2 * width + q1] |= 1
   422         next_pix = min(bp_per_pix - r1, bp_per_pix - r2)
   423         if next_pix >= size: break
   424         beg1 += next_pix
   425         beg2 += next_pix
   426         size -= next_pix
   427 
   428 def drawLineReverse(hits, width, bp_per_pix, beg1, beg2, size):
   429     while True:
   430         q1, r1 = divmod(beg1, bp_per_pix)
   431         q2, r2 = divmod(beg2, bp_per_pix)
   432         hits[q2 * width + q1] |= 2
   433         next_pix = min(bp_per_pix - r1, r2 + 1)
   434         if next_pix >= size: break
   435         beg1 += next_pix
   436         beg2 -= next_pix
   437         size -= next_pix
   438 
   439 def findOrigin(ranges, beg, size):
   440     if beg < 0:
   441         beg = -(beg + size)
   442     for rangeBeg, rangeEnd, origin in ranges:
   443         if rangeEnd > beg:
   444             return origin
   445 
   446 def alignmentPixels(width, height, alignments, bp_per_pix,
   447                     rangeDict1, rangeDict2):
   448     hits = [0] * (width * height)  # the image data
   449     for seq1, seq2, blocks in alignments:
   450         beg1, beg2, size = blocks[0]
   451         ori1 = findOrigin(rangeDict1[seq1], beg1, size)
   452         ori2 = findOrigin(rangeDict2[seq2], beg2, size)
   453         for beg1, beg2, size in blocks:
   454             if beg1 < 0:
   455                 beg1 = -(beg1 + size)
   456                 beg2 = -(beg2 + size)
   457             if beg2 >= 0:
   458                 drawLineForward(hits, width, bp_per_pix,
   459                                 ori1 + beg1, ori2 + beg2, size)
   460             else:
   461                 drawLineReverse(hits, width, bp_per_pix,
   462                                 ori1 + beg1, ori2 - beg2 - 1, size)
   463     return hits
   464 
   465 def expandedSeqDict(seqDict):
   466     '''Allow lookup by short sequence names, e.g. chr7 as well as hg19.chr7.'''
   467     newDict = seqDict.copy()
   468     for name, x in seqDict.items():
   469         if "." in name:
   470             base = name.split(".")[-1]
   471             if base in newDict:  # an ambiguous case was found:
   472                 return seqDict   # so give up completely
   473             newDict[base] = x
   474     return newDict
   475 
   476 def readBed(fileName, rangeDict):
   477     for line in myOpen(fileName):
   478         w = line.split()
   479         if not w: continue
   480         seqName = w[0]
   481         if seqName not in rangeDict: continue
   482         beg = int(w[1])
   483         end = int(w[2])
   484         layer = 900
   485         color = "#fbf"
   486         if len(w) > 4:
   487             if w[4] != ".":
   488                 layer = float(w[4])
   489             if len(w) > 5:
   490                 if len(w) > 8 and w[8].count(",") == 2:
   491                     color = "rgb(" + w[8] + ")"
   492                 elif w[5] == "+":
   493                     color = "#ffe8e8"
   494                 elif w[5] == "-":
   495                     color = "#e8e8ff"
   496         yield layer, color, seqName, beg, end
   497 
   498 def commaSeparatedInts(text):
   499     return map(int, text.rstrip(",").split(","))
   500 
   501 def readGenePred(opts, fileName, rangeDict):
   502     for line in myOpen(fileName):
   503         fields = line.split()
   504         if not fields: continue
   505         if fields[2] not in "+-": fields = fields[1:]
   506         seqName = fields[1]
   507         if seqName not in rangeDict: continue
   508         #strand = fields[2]
   509         cdsBeg = int(fields[5])
   510         cdsEnd = int(fields[6])
   511         exonBegs = commaSeparatedInts(fields[8])
   512         exonEnds = commaSeparatedInts(fields[9])
   513         for beg, end in zip(exonBegs, exonEnds):
   514             yield 300, opts.exon_color, seqName, beg, end
   515             b = max(beg, cdsBeg)
   516             e = min(end, cdsEnd)
   517             if b < e: yield 400, opts.cds_color, seqName, b, e
   518 
   519 def readRmsk(fileName, rangeDict):
   520     for line in myOpen(fileName):
   521         fields = line.split()
   522         if len(fields) == 17:  # rmsk.txt
   523             seqName = fields[5]
   524             if seqName not in rangeDict: continue  # do this ASAP for speed
   525             beg = int(fields[6])
   526             end = int(fields[7])
   527             strand = fields[9]
   528             repeatClass = fields[11]
   529         elif len(fields) == 15:  # .out
   530             seqName = fields[4]
   531             if seqName not in rangeDict: continue
   532             beg = int(fields[5]) - 1
   533             end = int(fields[6])
   534             strand = fields[8]
   535             repeatClass = fields[10]
   536         else:
   537             continue
   538         if repeatClass in ("Low_complexity", "Simple_repeat"):
   539             yield 200, "#fbf", seqName, beg, end
   540         elif strand == "+":
   541             yield 100, "#ffe8e8", seqName, beg, end
   542         else:
   543             yield 100, "#e8e8ff", seqName, beg, end
   544 
   545 def isExtraFirstGapField(fields):
   546     return fields[4].isdigit()
   547 
   548 def readGaps(opts, fileName, rangeDict):
   549     '''Read locations of unsequenced gaps, from an agp or gap file.'''
   550     for line in myOpen(fileName):
   551         w = line.split()
   552         if not w or w[0][0] == "#": continue
   553         if isExtraFirstGapField(w): w = w[1:]
   554         if w[4] not in "NU": continue
   555         seqName = w[0]
   556         if seqName not in rangeDict: continue
   557         end = int(w[2])
   558         beg = end - int(w[5])  # zero-based coordinate
   559         if w[7] == "yes":
   560             yield 3000, opts.bridged_color, seqName, beg, end
   561         else:
   562             yield 2000, opts.unbridged_color, seqName, beg, end
   563 
   564 def bedBoxes(beds, rangeDict, margin, edge, isTop, bpPerPix):
   565     for layer, color, seqName, bedBeg, bedEnd in beds:
   566         for rangeBeg, rangeEnd, origin in rangeDict[seqName]:
   567             beg = max(bedBeg, rangeBeg)
   568             end = min(bedEnd, rangeEnd)
   569             if beg >= end: continue
   570             if layer <= 1000:
   571                 # include partly-covered pixels
   572                 b = (origin + beg) // bpPerPix
   573                 e = div_ceil(origin + end, bpPerPix)
   574             else:
   575                 # exclude partly-covered pixels
   576                 b = div_ceil(origin + beg, bpPerPix)
   577                 e = (origin + end) // bpPerPix
   578                 if e <= b: continue
   579                 if end == rangeEnd:  # include partly-covered end pixels
   580                     e = div_ceil(origin + end, bpPerPix)
   581             if isTop:
   582                 box = b, margin, e, edge
   583             else:
   584                 box = margin, b, edge, e
   585             yield layer, color, box
   586 
   587 def drawAnnotations(im, boxes):
   588     # xxx use partial transparency for different-color overlaps?
   589     for layer, color, box in boxes:
   590         im.paste(color, box)
   591 
   592 def placedLabels(labels, rangePixBegs, rangePixLens, beg, end):
   593     '''Return axis labels with endpoint & sort-order information.'''
   594     maxWidth = end - beg
   595     for i, j, k in zip(labels, rangePixBegs, rangePixLens):
   596         text, textWidth, textHeight = i
   597         if textWidth > maxWidth:
   598             continue
   599         labelBeg = j + (k - textWidth) // 2
   600         labelEnd = labelBeg + textWidth
   601         sortKey = textWidth - k
   602         if labelBeg < beg:
   603             sortKey += maxWidth * (beg - labelBeg)
   604             labelBeg = beg
   605             labelEnd = beg + textWidth
   606         if labelEnd > end:
   607             sortKey += maxWidth * (labelEnd - end)
   608             labelEnd = end
   609             labelBeg = end - textWidth
   610         yield sortKey, labelBeg, labelEnd, text, textHeight
   611 
   612 def nonoverlappingLabels(labels, minPixTweenLabels):
   613     '''Get a subset of non-overlapping axis labels, greedily.'''
   614     out = []
   615     for i in labels:
   616         beg = i[1] - minPixTweenLabels
   617         end = i[2] + minPixTweenLabels
   618         if all(j[2] <= beg or j[1] >= end for j in out):
   619             out.append(i)
   620     return out
   621 
   622 def axisImage(labels, rangePixBegs, rangePixLens, textRot,
   623               textAln, font, image_mode, opts):
   624     '''Make an image of axis labels.'''
   625     beg = rangePixBegs[0]
   626     end = rangePixBegs[-1] + rangePixLens[-1]
   627     margin = max(i[2] for i in labels)
   628     labels = sorted(placedLabels(labels, rangePixBegs, rangePixLens, beg, end))
   629     minPixTweenLabels = 0 if textRot else opts.label_space
   630     labels = nonoverlappingLabels(labels, minPixTweenLabels)
   631     image_size = (margin, end) if textRot else (end, margin)
   632     im = Image.new(image_mode, image_size, opts.margin_color)
   633     draw = ImageDraw.Draw(im)
   634     for sortKey, labelBeg, labelEnd, text, textHeight in labels:
   635         base = margin - textHeight if textAln else 0
   636         position = (base, labelBeg) if textRot else (labelBeg, base)
   637         draw.text(position, text, font=font, fill=opts.text_color)
   638     return im
   639 
   640 def rangesWithOrigins(sortedRanges, rangePixBegs, bpPerPix):
   641     for i, j in zip(sortedRanges, rangePixBegs):
   642         seqName, rangeBeg, rangeEnd = i
   643         origin = bpPerPix * j - rangeBeg
   644         yield seqName, (rangeBeg, rangeEnd, origin)
   645 
   646 def rangesPerSeq(sortedRanges, rangePixBegs, bpPerPix):
   647     a = rangesWithOrigins(sortedRanges, rangePixBegs, bpPerPix)
   648     for k, v in itertools.groupby(a, itemgetter(0)):
   649         yield k, [i[1] for i in v]
   650 
   651 def getFont(opts):
   652     if opts.fontfile:
   653         return ImageFont.truetype(opts.fontfile, opts.fontsize)
   654     fileNames = []
   655     try:
   656         x = ["fc-match", "-f%{file}", "arial"]
   657         p = subprocess.Popen(x, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
   658         out, err = p.communicate()
   659         fileNames.append(out)
   660     except OSError as e:
   661         warn("fc-match error: " + str(e))
   662     fileNames.append("/Library/Fonts/Arial.ttf")  # for Mac
   663     for i in fileNames:
   664         try:
   665             font = ImageFont.truetype(i, opts.fontsize)
   666             warn("font: " + i)
   667             return font
   668         except IOError as e:
   669             warn("font load error: " + str(e))
   670     return ImageFont.load_default()
   671 
   672 def lastDotplot(opts, args):
   673     font = getFont(opts)
   674     image_mode = 'RGB'
   675     forward_color = ImageColor.getcolor(opts.forwardcolor, image_mode)
   676     reverse_color = ImageColor.getcolor(opts.reversecolor, image_mode)
   677     zipped_colors = zip(forward_color, reverse_color)
   678     overlap_color = tuple([(i + j) // 2 for i, j in zipped_colors])
   679 
   680     maxGap1, maxGapB1 = twoValuesFromOption(opts.max_gap1, ":")
   681     maxGap2, maxGapB2 = twoValuesFromOption(opts.max_gap2, ":")
   682 
   683     warn("reading alignments...")
   684     alnData = readAlignments(args[0], opts)
   685     alignments, seqRanges1, coverDict1, seqRanges2, coverDict2 = alnData
   686     if not alignments: raise Exception("there are no alignments")
   687     warn("cutting...")
   688     coverDict1 = dict(mergedRangesPerSeq(coverDict1))
   689     coverDict2 = dict(mergedRangesPerSeq(coverDict2))
   690     minAlignedBases = min(coveredLength(coverDict1), coveredLength(coverDict2))
   691     pad = int(opts.pad * minAlignedBases)
   692     cutRanges1 = list(trimmed(seqRanges1, coverDict1, minAlignedBases,
   693                               maxGap1, pad, pad))
   694     cutRanges2 = list(trimmed(seqRanges2, coverDict2, minAlignedBases,
   695                               maxGap2, pad, pad))
   696 
   697     warn("reading secondary alignments...")
   698     alnDataB = readSecondaryAlignments(opts, cutRanges1, cutRanges2)
   699     alignmentsB, seqRangesB1, coverDictB1, seqRangesB2, coverDictB2 = alnDataB
   700     warn("cutting...")
   701     coverDictB1 = dict(mergedRangesPerSeq(coverDictB1))
   702     coverDictB2 = dict(mergedRangesPerSeq(coverDictB2))
   703     cutRangesB1 = trimmed(seqRangesB1, coverDictB1, minAlignedBases,
   704                           maxGapB1, 0, 0)
   705     cutRangesB2 = trimmed(seqRangesB2, coverDictB2, minAlignedBases,
   706                           maxGapB2, 0, 0)
   707 
   708     warn("sorting...")
   709     sortOut = allSortedRanges(opts, alignments, alignmentsB,
   710                               cutRanges1, cutRangesB1, cutRanges2, cutRangesB2)
   711     sortedRanges1, sortedRanges2 = sortOut
   712 
   713     textRot1 = "vertical".startswith(opts.rot1)
   714     i1 = dataFromRanges(sortedRanges1, font,
   715                         opts.fontsize, image_mode, opts.labels1, textRot1)
   716     rangeSizes1, labelData1, tMargin = i1
   717 
   718     textRot2 = "horizontal".startswith(opts.rot2)
   719     i2 = dataFromRanges(sortedRanges2, font,
   720                         opts.fontsize, image_mode, opts.labels2, textRot2)
   721     rangeSizes2, labelData2, lMargin = i2
   722 
   723     maxPixels1 = opts.width  - lMargin
   724     maxPixels2 = opts.height - tMargin
   725     bpPerPix1 = get_bp_per_pix(rangeSizes1, opts.border_pixels, maxPixels1)
   726     bpPerPix2 = get_bp_per_pix(rangeSizes2, opts.border_pixels, maxPixels2)
   727     bpPerPix = max(bpPerPix1, bpPerPix2)
   728     warn("bp per pixel = " + str(bpPerPix))
   729 
   730     p1 = pixelData(rangeSizes1, bpPerPix, opts.border_pixels, lMargin)
   731     rangePixBegs1, rangePixLens1, width = p1
   732     rangeDict1 = dict(rangesPerSeq(sortedRanges1, rangePixBegs1, bpPerPix))
   733 
   734     p2 = pixelData(rangeSizes2, bpPerPix, opts.border_pixels, tMargin)
   735     rangePixBegs2, rangePixLens2, height = p2
   736     rangeDict2 = dict(rangesPerSeq(sortedRanges2, rangePixBegs2, bpPerPix))
   737 
   738     warn("width:  " + str(width))
   739     warn("height: " + str(height))
   740 
   741     warn("processing alignments...")
   742     hits = alignmentPixels(width, height, alignments + alignmentsB, bpPerPix,
   743                            rangeDict1, rangeDict2)
   744 
   745     warn("reading annotations...")
   746 
   747     rangeDict1 = expandedSeqDict(rangeDict1)
   748     beds1 = itertools.chain(readBed(opts.bed1, rangeDict1),
   749                             readRmsk(opts.rmsk1, rangeDict1),
   750                             readGenePred(opts, opts.genePred1, rangeDict1),
   751                             readGaps(opts, opts.gap1, rangeDict1))
   752     b1 = bedBoxes(beds1, rangeDict1, tMargin, height, True, bpPerPix)
   753 
   754     rangeDict2 = expandedSeqDict(rangeDict2)
   755     beds2 = itertools.chain(readBed(opts.bed2, rangeDict2),
   756                             readRmsk(opts.rmsk2, rangeDict2),
   757                             readGenePred(opts, opts.genePred2, rangeDict2),
   758                             readGaps(opts, opts.gap2, rangeDict2))
   759     b2 = bedBoxes(beds2, rangeDict2, lMargin, width, False, bpPerPix)
   760 
   761     boxes = sorted(itertools.chain(b1, b2))
   762 
   763     warn("drawing...")
   764 
   765     image_size = width, height
   766     im = Image.new(image_mode, image_size, opts.background_color)
   767 
   768     drawAnnotations(im, boxes)
   769 
   770     for i in range(height):
   771         for j in range(width):
   772             store_value = hits[i * width + j]
   773             xy = j, i
   774             if   store_value == 1: im.putpixel(xy, forward_color)
   775             elif store_value == 2: im.putpixel(xy, reverse_color)
   776             elif store_value == 3: im.putpixel(xy, overlap_color)
   777 
   778     if opts.fontsize != 0:
   779         axis1 = axisImage(labelData1, rangePixBegs1, rangePixLens1,
   780                           textRot1, False, font, image_mode, opts)
   781         if textRot1:
   782             axis1 = axis1.transpose(Image.ROTATE_90)
   783         axis2 = axisImage(labelData2, rangePixBegs2, rangePixLens2,
   784                           textRot2, textRot2, font, image_mode, opts)
   785         if not textRot2:
   786             axis2 = axis2.transpose(Image.ROTATE_270)
   787         im.paste(axis1, (0, 0))
   788         im.paste(axis2, (0, 0))
   789 
   790     for i in rangePixBegs1[1:]:
   791         box = i - opts.border_pixels, tMargin, i, height
   792         im.paste(opts.border_color, box)
   793 
   794     for i in rangePixBegs2[1:]:
   795         box = lMargin, i - opts.border_pixels, width, i
   796         im.paste(opts.border_color, box)
   797 
   798     im.save(args[1])
   799 
   800 if __name__ == "__main__":
   801     usage = """%prog --help
   802    or: %prog [options] maf-or-tab-alignments dotplot.png
   803    or: %prog [options] maf-or-tab-alignments dotplot.gif
   804    or: ..."""
   805     description = "Draw a dotplot of pair-wise sequence alignments in MAF or tabular format."
   806     op = optparse.OptionParser(usage=usage, description=description)
   807     op.add_option("-v", "--verbose", action="count",
   808                   help="show progress messages & data about the plot")
   809     op.add_option("-1", "--seq1", metavar="PATTERN", action="append",
   810                   default=[],
   811                   help="which sequences to show from the 1st genome")
   812     op.add_option("-2", "--seq2", metavar="PATTERN", action="append",
   813                   default=[],
   814                   help="which sequences to show from the 2nd genome")
   815     # Replace "width" & "height" with a single "length" option?
   816     op.add_option("-x", "--width", type="int", default=1000,
   817                   help="maximum width in pixels (default: %default)")
   818     op.add_option("-y", "--height", type="int", default=1000,
   819                   help="maximum height in pixels (default: %default)")
   820     op.add_option("-c", "--forwardcolor", metavar="COLOR", default="red",
   821                   help="color for forward alignments (default: %default)")
   822     op.add_option("-r", "--reversecolor", metavar="COLOR", default="blue",
   823                   help="color for reverse alignments (default: %default)")
   824     op.add_option("--alignments", metavar="FILE", help="secondary alignments")
   825     op.add_option("--sort1", default="1", metavar="N",
   826                   help="genome1 sequence order: 0=input order, 1=name order, "
   827                   "2=length order, 3=alignment order (default=%default)")
   828     op.add_option("--sort2", default="1", metavar="N",
   829                   help="genome2 sequence order: 0=input order, 1=name order, "
   830                   "2=length order, 3=alignment order (default=%default)")
   831     op.add_option("--max-gap1", metavar="FRAC", default="0.5,2", help=
   832                   "maximum unaligned (end,mid) gap in genome1: "
   833                   "fraction of aligned length (default=%default)")
   834     op.add_option("--max-gap2", metavar="FRAC", default="0.5,2", help=
   835                   "maximum unaligned (end,mid) gap in genome2: "
   836                   "fraction of aligned length (default=%default)")
   837     op.add_option("--pad", metavar="FRAC", type="float", default=0.04, help=
   838                   "pad length when cutting unaligned gaps: "
   839                   "fraction of aligned length (default=%default)")
   840     op.add_option("--border-pixels", metavar="INT", type="int", default=1,
   841                   help="number of pixels between sequences (default=%default)")
   842     op.add_option("--border-color", metavar="COLOR", default="black",
   843                   help="color for pixels between sequences (default=%default)")
   844     # --break-color and/or --break-pixels for intra-sequence breaks?
   845     op.add_option("--margin-color", metavar="COLOR", default="#dcdcdc",
   846                   help="margin color")
   847 
   848     og = optparse.OptionGroup(op, "Text options")
   849     og.add_option("-f", "--fontfile", metavar="FILE",
   850                   help="TrueType or OpenType font file")
   851     og.add_option("-s", "--fontsize", metavar="SIZE", type="int", default=14,
   852                   help="TrueType or OpenType font size (default: %default)")
   853     og.add_option("--labels1", type="int", default=0, metavar="N", help=
   854                   "genome1 labels: 0=name, 1=name:length, "
   855                   "2=name:start:length, 3=name:start-end (default=%default)")
   856     og.add_option("--labels2", type="int", default=0, metavar="N", help=
   857                   "genome2 labels: 0=name, 1=name:length, "
   858                   "2=name:start:length, 3=name:start-end (default=%default)")
   859     og.add_option("--rot1", metavar="ROT", default="h",
   860                   help="text rotation for the 1st genome (default=%default)")
   861     og.add_option("--rot2", metavar="ROT", default="v",
   862                   help="text rotation for the 2nd genome (default=%default)")
   863     op.add_option_group(og)
   864 
   865     og = optparse.OptionGroup(op, "Annotation options")
   866     og.add_option("--bed1", metavar="FILE",
   867                   help="read genome1 annotations from BED file")
   868     og.add_option("--bed2", metavar="FILE",
   869                   help="read genome2 annotations from BED file")
   870     og.add_option("--rmsk1", metavar="FILE", help="read genome1 repeats from "
   871                   "RepeatMasker .out or rmsk.txt file")
   872     og.add_option("--rmsk2", metavar="FILE", help="read genome2 repeats from "
   873                   "RepeatMasker .out or rmsk.txt file")
   874     op.add_option_group(og)
   875 
   876     og = optparse.OptionGroup(op, "Gene options")
   877     og.add_option("--genePred1", metavar="FILE",
   878                   help="read genome1 genes from genePred file")
   879     og.add_option("--genePred2", metavar="FILE",
   880                   help="read genome2 genes from genePred file")
   881     og.add_option("--exon-color", metavar="COLOR", default="PaleGreen",
   882                   help="color for exons (default=%default)")
   883     og.add_option("--cds-color", metavar="COLOR", default="LimeGreen",
   884                   help="color for protein-coding regions (default=%default)")
   885     op.add_option_group(og)
   886 
   887     og = optparse.OptionGroup(op, "Unsequenced gap options")
   888     og.add_option("--gap1", metavar="FILE",
   889                   help="read genome1 unsequenced gaps from agp or gap file")
   890     og.add_option("--gap2", metavar="FILE",
   891                   help="read genome2 unsequenced gaps from agp or gap file")
   892     og.add_option("--bridged-color", metavar="COLOR", default="yellow",
   893                   help="color for bridged gaps (default: %default)")
   894     og.add_option("--unbridged-color", metavar="COLOR", default="orange",
   895                   help="color for unbridged gaps (default: %default)")
   896     op.add_option_group(og)
   897     (opts, args) = op.parse_args()
   898     if len(args) != 2: op.error("2 arguments needed")
   899 
   900     opts.text_color = "black"
   901     opts.background_color = "white"
   902     opts.label_space = 5     # minimum number of pixels between axis labels
   903 
   904     try: lastDotplot(opts, args)
   905     except KeyboardInterrupt: pass  # avoid silly error message
   906     except Exception as e:
   907         prog = os.path.basename(sys.argv[0])
   908         sys.exit(prog + ": error: " + str(e))