scripts/last-dotplot
author Martin C. Frith
Tue Dec 05 13:41:22 2017 +0900 (2017-12-05)
changeset 913 e87abc7ae9c6
parent 912 8834139fa8a8
child 914 f4ca19d126e3
permissions -rwxr-xr-x
Refactor last-dotplot
     1 #! /usr/bin/env python
     2 
     3 # Read pair-wise alignments in MAF or LAST tabular format: write an
     4 # "Oxford grid", a.k.a. dotplot.
     5 
     6 # TODO: Currently, pixels with zero aligned nt-pairs are white, and
     7 # pixels with one or more aligned nt-pairs are black.  This can look
     8 # too crowded for large genome alignments.  I tried shading each pixel
     9 # according to the number of aligned nt-pairs within it, but the
    10 # result is too faint.  How can this be done better?
    11 
    12 import gzip
    13 from fnmatch import fnmatchcase
    14 from operator import itemgetter
    15 import subprocess
    16 import itertools, optparse, os, re, sys
    17 
    18 # Try to make PIL/PILLOW work:
    19 try: from PIL import Image, ImageDraw, ImageFont, ImageColor
    20 except ImportError: import Image, ImageDraw, ImageFont, ImageColor
    21 
    22 def myOpen(fileName):  # faster than fileinput
    23     if fileName is None:
    24         return []
    25     if fileName == "-":
    26         return sys.stdin
    27     if fileName.endswith(".gz"):
    28         return gzip.open(fileName)
    29     return open(fileName)
    30 
    31 def warn(message):
    32     if opts.verbose:
    33         prog = os.path.basename(sys.argv[0])
    34         sys.stderr.write(prog + ": " + message + "\n")
    35 
    36 def groupByFirstItem(things):
    37     for k, v in itertools.groupby(things, itemgetter(0)):
    38         yield k, [i[1:] for i in v]
    39 
    40 def croppedBlocks(blocks, ranges1, ranges2):
    41     headBeg1, headBeg2, headSize = blocks[0]
    42     for r1 in ranges1:
    43         for r2 in ranges2:
    44             cropBeg1, cropEnd1 = r1
    45             if headBeg1 < 0:
    46                 cropBeg1, cropEnd1 = -cropEnd1, -cropBeg1
    47             cropBeg2, cropEnd2 = r2
    48             if headBeg2 < 0:
    49                 cropBeg2, cropEnd2 = -cropEnd2, -cropBeg2
    50             for beg1, beg2, size in blocks:
    51                 b1 = max(cropBeg1, beg1)
    52                 e1 = min(cropEnd1, beg1 + size)
    53                 if b1 >= e1: continue
    54                 offset = beg2 - beg1
    55                 b2 = max(cropBeg2, b1 + offset)
    56                 e2 = min(cropEnd2, e1 + offset)
    57                 if b2 >= e2: continue
    58                 yield b2 - offset, b2, e2 - b2
    59 
    60 def tabBlocks(beg1, beg2, blocks):
    61     '''Get the gapless blocks of an alignment, from LAST tabular format.'''
    62     for i in blocks.split(","):
    63         if ":" in i:
    64             x, y = i.split(":")
    65             beg1 += int(x)
    66             beg2 += int(y)
    67         else:
    68             size = int(i)
    69             yield beg1, beg2, size
    70             beg1 += size
    71             beg2 += size
    72 
    73 def mafBlocks(beg1, beg2, seq1, seq2):
    74     '''Get the gapless blocks of an alignment, from MAF format.'''
    75     size = 0
    76     for x, y in itertools.izip(seq1, seq2):
    77         if x == "-":
    78             if size:
    79                 yield beg1, beg2, size
    80                 beg1 += size
    81                 beg2 += size
    82                 size = 0
    83             beg2 += 1
    84         elif y == "-":
    85             if size:
    86                 yield beg1, beg2, size
    87                 beg1 += size
    88                 beg2 += size
    89                 size = 0
    90             beg1 += 1
    91         else:
    92             size += 1
    93     if size: yield beg1, beg2, size
    94 
    95 def alignmentInput(lines):
    96     '''Get alignments and sequence lengths, from MAF or tabular format.'''
    97     mafCount = 0
    98     for line in lines:
    99         w = line.split()
   100         if line[0].isdigit():  # tabular format
   101             chr1, beg1, seqlen1 = w[1], int(w[2]), int(w[5])
   102             if w[4] == "-": beg1 -= seqlen1
   103             chr2, beg2, seqlen2 = w[6], int(w[7]), int(w[10])
   104             if w[9] == "-": beg2 -= seqlen2
   105             blocks = tabBlocks(beg1, beg2, w[11])
   106             yield chr1, seqlen1, chr2, seqlen2, blocks
   107         elif line[0] == "s":  # MAF format
   108             if mafCount == 0:
   109                 chr1, beg1, seqlen1, seq1 = w[1], int(w[2]), int(w[5]), w[6]
   110                 if w[4] == "-": beg1 -= seqlen1
   111                 mafCount = 1
   112             else:
   113                 chr2, beg2, seqlen2, seq2 = w[1], int(w[2]), int(w[5]), w[6]
   114                 if w[4] == "-": beg2 -= seqlen2
   115                 blocks = mafBlocks(beg1, beg2, seq1, seq2)
   116                 yield chr1, seqlen1, chr2, seqlen2, blocks
   117                 mafCount = 0
   118 
   119 def seqRequestFromText(text):
   120     if ":" in text:
   121         pattern, interval = text.rsplit(":", 1)
   122         if "-" in interval:
   123             beg, end = interval.rsplit("-", 1)
   124             return pattern, int(beg), int(end)  # beg may be negative
   125     return text, 0, sys.maxsize
   126 
   127 def rangesFromSeqName(seqRequests, name, seqLen):
   128     if seqRequests:
   129         base = name.split(".")[-1]  # allow for names like hg19.chr7
   130         for pat, beg, end in seqRequests:
   131             if fnmatchcase(name, pat) or fnmatchcase(base, pat):
   132                 yield max(beg, 0), min(end, seqLen)
   133     else:
   134         yield 0, seqLen
   135 
   136 def updateSeqs(coverDict, seqRanges, seqName, ranges, coveredRange):
   137     beg, end = coveredRange
   138     if beg < 0:
   139         coveredRange = -end, -beg
   140     if seqName in coverDict:
   141         coverDict[seqName].append(coveredRange)
   142     else:
   143         coverDict[seqName] = [coveredRange]
   144         for beg, end in ranges:
   145             r = seqName, beg, end
   146             seqRanges.append(r)
   147 
   148 def readAlignments(fileName, opts):
   149     '''Get alignments and sequence limits, from MAF or tabular format.'''
   150     seqRequests1 = map(seqRequestFromText, opts.seq1)
   151     seqRequests2 = map(seqRequestFromText, opts.seq2)
   152 
   153     alignments = []
   154     seqRanges1 = []
   155     seqRanges2 = []
   156     coverDict1 = {}
   157     coverDict2 = {}
   158     lines = myOpen(fileName)
   159     for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
   160         ranges1 = sorted(rangesFromSeqName(seqRequests1, seqName1, seqLen1))
   161         if not ranges1: continue
   162         ranges2 = sorted(rangesFromSeqName(seqRequests2, seqName2, seqLen2))
   163         if not ranges2: continue
   164         b = list(croppedBlocks(list(blocks), ranges1, ranges2))
   165         if not b: continue
   166         aln = seqName1, seqName2, b
   167         alignments.append(aln)
   168         coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
   169         updateSeqs(coverDict1, seqRanges1, seqName1, ranges1, coveredRange1)
   170         coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
   171         updateSeqs(coverDict2, seqRanges2, seqName2, ranges2, coveredRange2)
   172     return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
   173 
   174 def nameAndRangesFromDict(cropDict, seqName):
   175     if seqName in cropDict:
   176         return seqName, cropDict[seqName]
   177     n = seqName.split(".")[-1]
   178     if n in cropDict:
   179         return n, cropDict[n]
   180     return seqName, []
   181 
   182 def rangesForSecondaryAlignments(primaryRanges, seqLen):
   183     if primaryRanges:
   184         return primaryRanges
   185     return [(0, seqLen)]
   186 
   187 def readSecondaryAlignments(opts, cropRanges1, cropRanges2):
   188     cropDict1 = dict(groupByFirstItem(cropRanges1))
   189     cropDict2 = dict(groupByFirstItem(cropRanges2))
   190 
   191     alignments = []
   192     seqRanges1 = []
   193     seqRanges2 = []
   194     coverDict1 = {}
   195     coverDict2 = {}
   196     lines = myOpen(opts.alignments)
   197     for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
   198         seqName1, ranges1 = nameAndRangesFromDict(cropDict1, seqName1)
   199         seqName2, ranges2 = nameAndRangesFromDict(cropDict2, seqName2)
   200         if not ranges1 and not ranges2:
   201             continue
   202         r1 = rangesForSecondaryAlignments(ranges1, seqLen1)
   203         r2 = rangesForSecondaryAlignments(ranges2, seqLen2)
   204         b = list(croppedBlocks(list(blocks), r1, r2))
   205         if not b: continue
   206         aln = seqName1, seqName2, b
   207         alignments.append(aln)
   208         if not ranges1:
   209             coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
   210             updateSeqs(coverDict1, seqRanges1, seqName1, r1, coveredRange1)
   211         if not ranges2:
   212             coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
   213             updateSeqs(coverDict2, seqRanges2, seqName2, r2, coveredRange2)
   214     return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
   215 
   216 def twoValuesFromOption(text, separator):
   217     if separator in text:
   218         return text.split(separator)
   219     return text, text
   220 
   221 def mergedRanges(ranges):
   222     oldBeg, maxEnd = ranges[0]
   223     for beg, end in ranges:
   224         if beg > maxEnd:
   225             yield oldBeg, maxEnd
   226             oldBeg = beg
   227             maxEnd = end
   228         elif end > maxEnd:
   229             maxEnd = end
   230     yield oldBeg, maxEnd
   231 
   232 def mergedRangesPerSeq(coverDict):
   233     for k, v in coverDict.iteritems():
   234         v.sort()
   235         yield k, list(mergedRanges(v))
   236 
   237 def coveredLength(mergedCoverDict):
   238     return sum(sum(e - b for b, e in v) for v in mergedCoverDict.itervalues())
   239 
   240 def trimmed(seqRanges, coverDict, minAlignedBases, maxGapFrac, endPad, midPad):
   241     maxEndGapFrac, maxMidGapFrac = twoValuesFromOption(maxGapFrac, ",")
   242     maxEndGap = max(float(maxEndGapFrac) * minAlignedBases, endPad * 1.0)
   243     maxMidGap = max(float(maxMidGapFrac) * minAlignedBases, midPad * 2.0)
   244 
   245     for seqName, rangeBeg, rangeEnd in seqRanges:
   246         seqBlocks = coverDict[seqName]
   247         blocks = [i for i in seqBlocks if i[0] < rangeEnd and i[1] > rangeBeg]
   248         if blocks[0][0] - rangeBeg > maxEndGap:
   249             rangeBeg = blocks[0][0] - endPad
   250         for j, y in enumerate(blocks):
   251             if j:
   252                 x = blocks[j - 1]
   253                 if y[0] - x[1] > maxMidGap:
   254                     yield seqName, rangeBeg, x[1] + midPad
   255                     rangeBeg = y[0] - midPad
   256         if rangeEnd - blocks[-1][1] > maxEndGap:
   257             rangeEnd = blocks[-1][1] + endPad
   258         yield seqName, rangeBeg, rangeEnd
   259 
   260 def natural_sort_key(my_string):
   261     '''Return a sort key for "natural" ordering, e.g. chr9 < chr10.'''
   262     parts = re.split(r'(\d+)', my_string)
   263     parts[1::2] = map(int, parts[1::2])
   264     return parts
   265 
   266 def nameKey(oneSeqRanges):
   267     return natural_sort_key(oneSeqRanges[0][0])
   268 
   269 def sizeKey(oneSeqRanges):
   270     return sum(b - e for n, b, e in oneSeqRanges), nameKey(oneSeqRanges)
   271 
   272 def mySortedRanges(seqRanges, sortOpt):
   273     rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
   274     g = [list(ranges) for seqName, ranges in rangesGroupedBySeqName]
   275     if sortOpt == "1":
   276         g.sort(key=nameKey)
   277     if sortOpt == "2":
   278         g.sort(key=sizeKey)
   279     return [j for i in g for j in i]
   280 
   281 def allSortedRanges(seqRanges, seqRangesB, sortOpt):
   282     x, y = twoValuesFromOption(sortOpt, ":")
   283     return mySortedRanges(seqRanges, x) + mySortedRanges(seqRangesB, y)
   284 
   285 def prettyNum(n):
   286     t = str(n)
   287     groups = []
   288     while t:
   289         groups.append(t[-3:])
   290         t = t[:-3]
   291     return ",".join(reversed(groups))
   292 
   293 def sizeText(size):
   294     suffixes = "bp", "kb", "Mb", "Gb"
   295     for i, x in enumerate(suffixes):
   296         j = 10 ** (i * 3)
   297         if size < j * 10:
   298             return "%.2g" % (1.0 * size / j) + x
   299         if size < j * 1000 or i == len(suffixes) - 1:
   300             return "%.0f" % (1.0 * size / j) + x
   301 
   302 def labelText(seqRange, labelOpt):
   303     seqName, beg, end = seqRange
   304     if labelOpt == 1:
   305         return seqName + ": " + sizeText(end - beg)
   306     if labelOpt == 2:
   307         return seqName + ":" + prettyNum(beg) + ": " + sizeText(end - beg)
   308     if labelOpt == 3:
   309         return seqName + ":" + prettyNum(beg) + "-" + prettyNum(end)
   310     return seqName
   311 
   312 def rangeLabels(seqRanges, labelOpt, font, fontsize, image_mode, textRot):
   313     if fontsize:
   314         image_size = 1, 1
   315         im = Image.new(image_mode, image_size)
   316         draw = ImageDraw.Draw(im)
   317     x = y = 0
   318     for r in seqRanges:
   319         text = labelText(r, labelOpt)
   320         if fontsize:
   321             x, y = draw.textsize(text, font=font)
   322             if textRot:
   323                 x, y = y, x
   324         yield text, x, y
   325 
   326 def dataFromRanges(cutRanges, cutRangesB, sortOpt, font,
   327                    fontSize, imageMode, labelOpt, textRot):
   328     sortedRanges = allSortedRanges(cutRanges, cutRangesB, sortOpt)
   329     for i in sortedRanges:
   330         warn("\t".join(map(str, i)))
   331     warn("")
   332     rangeSizes = [e - b for n, b, e in sortedRanges]
   333     labs = list(rangeLabels(sortedRanges, labelOpt, font, fontSize,
   334                             imageMode, textRot))
   335     margin = max(i[2] for i in labs)
   336     # xxx the margin may be too big, because some labels may get omitted
   337     return sortedRanges, rangeSizes, labs, margin
   338 
   339 def div_ceil(x, y):
   340     '''Return x / y rounded up.'''
   341     q, r = divmod(x, y)
   342     return q + (r != 0)
   343 
   344 def get_bp_per_pix(rangeSizes, pixTweenRanges, maxPixels):
   345     '''Get the minimum bp-per-pixel that fits in the size limit.'''
   346     warn("choosing bp per pixel...")
   347     numOfRanges = len(rangeSizes)
   348     maxPixelsInRanges = maxPixels - pixTweenRanges * (numOfRanges - 1)
   349     if maxPixelsInRanges < numOfRanges:
   350         raise Exception("can't fit the image: too many sequences?")
   351     negLimit = -maxPixelsInRanges
   352     negBpPerPix = sum(rangeSizes) // negLimit
   353     while True:
   354         if sum(i // negBpPerPix for i in rangeSizes) >= negLimit:
   355             return -negBpPerPix
   356         negBpPerPix -= 1
   357 
   358 def getRangePixBegs(rangePixLens, pixTweenRanges, margin):
   359     '''Get the start pixel for each range.'''
   360     rangePixBegs = []
   361     pix_tot = margin - pixTweenRanges
   362     for i in rangePixLens:
   363         pix_tot += pixTweenRanges
   364         rangePixBegs.append(pix_tot)
   365         pix_tot += i
   366     return rangePixBegs
   367 
   368 def pixelData(rangeSizes, bp_per_pix, pixTweenRanges, margin):
   369     '''Return pixel information about the ranges.'''
   370     rangePixLens = [div_ceil(i, bp_per_pix) for i in rangeSizes]
   371     rangePixBegs = getRangePixBegs(rangePixLens, pixTweenRanges, margin)
   372     tot_pix = rangePixBegs[-1] + rangePixLens[-1]
   373     return rangePixBegs, rangePixLens, tot_pix
   374 
   375 def drawLineForward(hits, width, bp_per_pix, beg1, beg2, size):
   376     while True:
   377         q1, r1 = divmod(beg1, bp_per_pix)
   378         q2, r2 = divmod(beg2, bp_per_pix)
   379         hits[q2 * width + q1] |= 1
   380         next_pix = min(bp_per_pix - r1, bp_per_pix - r2)
   381         if next_pix >= size: break
   382         beg1 += next_pix
   383         beg2 += next_pix
   384         size -= next_pix
   385 
   386 def drawLineReverse(hits, width, bp_per_pix, beg1, beg2, size):
   387     beg2 = -1 - beg2
   388     while True:
   389         q1, r1 = divmod(beg1, bp_per_pix)
   390         q2, r2 = divmod(beg2, bp_per_pix)
   391         hits[q2 * width + q1] |= 2
   392         next_pix = min(bp_per_pix - r1, r2 + 1)
   393         if next_pix >= size: break
   394         beg1 += next_pix
   395         beg2 -= next_pix
   396         size -= next_pix
   397 
   398 def findOrigin(ranges, beg, size):
   399     if beg < 0:
   400         beg = -(beg + size)
   401     for rangeBeg, rangeEnd, origin in ranges:
   402         if rangeEnd > beg:
   403             return origin
   404 
   405 def alignmentPixels(width, height, alignments, bp_per_pix,
   406                     rangeDict1, rangeDict2):
   407     hits = [0] * (width * height)  # the image data
   408     for seq1, seq2, blocks in alignments:
   409         beg1, beg2, size = blocks[0]
   410         ori1 = findOrigin(rangeDict1[seq1], beg1, size)
   411         ori2 = findOrigin(rangeDict2[seq2], beg2, size)
   412         for beg1, beg2, size in blocks:
   413             if beg1 < 0:
   414                 beg1 = -(beg1 + size)
   415                 beg2 = -(beg2 + size)
   416             if beg2 >= 0:
   417                 drawLineForward(hits, width, bp_per_pix,
   418                                 beg1 + ori1, beg2 + ori2, size)
   419             else:
   420                 drawLineReverse(hits, width, bp_per_pix,
   421                                 beg1 + ori1, beg2 - ori2, size)
   422     return hits
   423 
   424 def expandedSeqDict(seqDict):
   425     '''Allow lookup by short sequence names, e.g. chr7 as well as hg19.chr7.'''
   426     newDict = seqDict.copy()
   427     for name, x in seqDict.items():
   428         if "." in name:
   429             base = name.split(".")[-1]
   430             if base in newDict:  # an ambiguous case was found:
   431                 return seqDict   # so give up completely
   432             newDict[base] = x
   433     return newDict
   434 
   435 def readBed(fileName, rangeDict):
   436     for line in myOpen(fileName):
   437         w = line.split()
   438         if not w: continue
   439         seqName = w[0]
   440         if seqName not in rangeDict: continue
   441         beg = int(w[1])
   442         end = int(w[2])
   443         layer = 900
   444         color = "#fbf"
   445         if len(w) > 4:
   446             if w[4] != ".":
   447                 layer = float(w[4])
   448             if len(w) > 5:
   449                 if len(w) > 8 and w[8].count(",") == 2:
   450                     color = "rgb(" + w[8] + ")"
   451                 elif w[5] == "+":
   452                     color = "#ffe8e8"
   453                 elif w[5] == "-":
   454                     color = "#e8e8ff"
   455         yield layer, color, seqName, beg, end
   456 
   457 def commaSeparatedInts(text):
   458     return map(int, text.rstrip(",").split(","))
   459 
   460 def readGenePred(opts, fileName, rangeDict):
   461     for line in myOpen(fileName):
   462         fields = line.split()
   463         if not fields: continue
   464         if fields[2] not in "+-": fields = fields[1:]
   465         seqName = fields[1]
   466         if seqName not in rangeDict: continue
   467         #strand = fields[2]
   468         cdsBeg = int(fields[5])
   469         cdsEnd = int(fields[6])
   470         exonBegs = commaSeparatedInts(fields[8])
   471         exonEnds = commaSeparatedInts(fields[9])
   472         for beg, end in zip(exonBegs, exonEnds):
   473             yield 300, opts.exon_color, seqName, beg, end
   474             b = max(beg, cdsBeg)
   475             e = min(end, cdsEnd)
   476             if b < e: yield 400, opts.cds_color, seqName, b, e
   477 
   478 def readRmsk(fileName, rangeDict):
   479     for line in myOpen(fileName):
   480         fields = line.split()
   481         if len(fields) == 17:  # rmsk.txt
   482             seqName = fields[5]
   483             if seqName not in rangeDict: continue  # do this ASAP for speed
   484             beg = int(fields[6])
   485             end = int(fields[7])
   486             strand = fields[9]
   487             repeatClass = fields[11]
   488         elif len(fields) == 15:  # .out
   489             seqName = fields[4]
   490             if seqName not in rangeDict: continue
   491             beg = int(fields[5]) - 1
   492             end = int(fields[6])
   493             strand = fields[8]
   494             repeatClass = fields[10]
   495         else:
   496             continue
   497         if repeatClass in ("Low_complexity", "Simple_repeat"):
   498             yield 200, "#fbf", seqName, beg, end
   499         elif strand == "+":
   500             yield 100, "#ffe8e8", seqName, beg, end
   501         else:
   502             yield 100, "#e8e8ff", seqName, beg, end
   503 
   504 def isExtraFirstGapField(fields):
   505     return fields[4].isdigit()
   506 
   507 def readGaps(opts, fileName, rangeDict):
   508     '''Read locations of unsequenced gaps, from an agp or gap file.'''
   509     for line in myOpen(fileName):
   510         w = line.split()
   511         if not w or w[0][0] == "#": continue
   512         if isExtraFirstGapField(w): w = w[1:]
   513         if w[4] not in "NU": continue
   514         seqName = w[0]
   515         if seqName not in rangeDict: continue
   516         end = int(w[2])
   517         beg = end - int(w[5])  # zero-based coordinate
   518         if w[7] == "yes":
   519             yield 3000, opts.bridged_color, seqName, beg, end
   520         else:
   521             yield 2000, opts.unbridged_color, seqName, beg, end
   522 
   523 def bedBoxes(beds, rangeDict, margin, edge, isTop, bpPerPix):
   524     for layer, color, seqName, bedBeg, bedEnd in beds:
   525         for rangeBeg, rangeEnd, origin in rangeDict[seqName]:
   526             beg = max(bedBeg, rangeBeg)
   527             end = min(bedEnd, rangeEnd)
   528             if beg >= end: continue
   529             if layer <= 1000:
   530                 # include partly-covered pixels
   531                 b = (origin + beg) // bpPerPix
   532                 e = div_ceil(origin + end, bpPerPix)
   533             else:
   534                 # exclude partly-covered pixels
   535                 b = div_ceil(origin + beg, bpPerPix)
   536                 e = (origin + end) // bpPerPix
   537                 if e <= b: continue
   538                 if end == rangeEnd:  # include partly-covered end pixels
   539                     e = div_ceil(origin + end, bpPerPix)
   540             if isTop:
   541                 box = b, margin, e, edge
   542             else:
   543                 box = margin, b, edge, e
   544             yield layer, color, box
   545 
   546 def drawAnnotations(im, boxes):
   547     # xxx use partial transparency for different-color overlaps?
   548     for layer, color, box in boxes:
   549         im.paste(color, box)
   550 
   551 def placedLabels(labels, rangePixBegs, rangePixLens, beg, end):
   552     '''Return axis labels with endpoint & sort-order information.'''
   553     maxWidth = end - beg
   554     for i, j, k in zip(labels, rangePixBegs, rangePixLens):
   555         text, textWidth, textHeight = i
   556         if textWidth > maxWidth:
   557             continue
   558         labelBeg = j + (k - textWidth) // 2
   559         labelEnd = labelBeg + textWidth
   560         sortKey = textWidth - k
   561         if labelBeg < beg:
   562             sortKey += maxWidth * (beg - labelBeg)
   563             labelBeg = beg
   564             labelEnd = beg + textWidth
   565         if labelEnd > end:
   566             sortKey += maxWidth * (labelEnd - end)
   567             labelEnd = end
   568             labelBeg = end - textWidth
   569         yield sortKey, labelBeg, labelEnd, text, textHeight
   570 
   571 def nonoverlappingLabels(labels, minPixTweenLabels):
   572     '''Get a subset of non-overlapping axis labels, greedily.'''
   573     out = []
   574     for i in labels:
   575         beg = i[1] - minPixTweenLabels
   576         end = i[2] + minPixTweenLabels
   577         if all(j[2] <= beg or j[1] >= end for j in out):
   578             out.append(i)
   579     return out
   580 
   581 def axisImage(labels, rangePixBegs, rangePixLens, textRot,
   582               textAln, font, image_mode, opts):
   583     '''Make an image of axis labels.'''
   584     beg = rangePixBegs[0]
   585     end = rangePixBegs[-1] + rangePixLens[-1]
   586     margin = max(i[2] for i in labels)
   587     labels = sorted(placedLabels(labels, rangePixBegs, rangePixLens, beg, end))
   588     minPixTweenLabels = 0 if textRot else opts.label_space
   589     labels = nonoverlappingLabels(labels, minPixTweenLabels)
   590     image_size = (margin, end) if textRot else (end, margin)
   591     im = Image.new(image_mode, image_size, opts.margin_color)
   592     draw = ImageDraw.Draw(im)
   593     for i in labels:
   594         base = margin - i[4] if textAln else 0
   595         position = (base, i[1]) if textRot else (i[1], base)
   596         draw.text(position, i[3], font=font, fill=opts.text_color)
   597     return im
   598 
   599 def rangesWithOrigins(sortedRanges, rangePixBegs, bpPerPix):
   600     for i, j in zip(sortedRanges, rangePixBegs):
   601         seqName, rangeBeg, rangeEnd = i
   602         origin = bpPerPix * j - rangeBeg
   603         yield seqName, (rangeBeg, rangeEnd, origin)
   604 
   605 def rangesPerSeq(sortedRanges, rangePixBegs, bpPerPix):
   606     a = rangesWithOrigins(sortedRanges, rangePixBegs, bpPerPix)
   607     for k, v in itertools.groupby(a, itemgetter(0)):
   608         yield k, [i[1] for i in v]
   609 
   610 def getFont(opts):
   611     if opts.fontfile:
   612         return ImageFont.truetype(opts.fontfile, opts.fontsize)
   613     fileNames = []
   614     try:
   615         x = ["fc-match", "-f%{file}", "arial"]
   616         p = subprocess.Popen(x, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
   617         out, err = p.communicate()
   618         fileNames.append(out)
   619     except OSError as e:
   620         warn("fc-match error: " + str(e))
   621     fileNames.append("/Library/Fonts/Arial.ttf")  # for Mac
   622     for i in fileNames:
   623         try:
   624             font = ImageFont.truetype(i, opts.fontsize)
   625             warn("font: " + i)
   626             return font
   627         except IOError as e:
   628             warn("font load error: " + str(e))
   629     return ImageFont.load_default()
   630 
   631 def lastDotplot(opts, args):
   632     font = getFont(opts)
   633     image_mode = 'RGB'
   634     forward_color = ImageColor.getcolor(opts.forwardcolor, image_mode)
   635     reverse_color = ImageColor.getcolor(opts.reversecolor, image_mode)
   636     zipped_colors = zip(forward_color, reverse_color)
   637     overlap_color = tuple([(i + j) // 2 for i, j in zipped_colors])
   638 
   639     maxGap1, maxGapB1 = twoValuesFromOption(opts.max_gap1, ":")
   640     maxGap2, maxGapB2 = twoValuesFromOption(opts.max_gap2, ":")
   641 
   642     warn("reading alignments...")
   643     alnData = readAlignments(args[0], opts)
   644     alignments, seqRanges1, coverDict1, seqRanges2, coverDict2 = alnData
   645     if not alignments: raise Exception("there are no alignments")
   646     warn("cutting...")
   647     coverDict1 = dict(mergedRangesPerSeq(coverDict1))
   648     coverDict2 = dict(mergedRangesPerSeq(coverDict2))
   649     minAlignedBases = min(coveredLength(coverDict1), coveredLength(coverDict2))
   650     pad = int(opts.pad * minAlignedBases)
   651     cutRanges1 = list(trimmed(seqRanges1, coverDict1, minAlignedBases,
   652                               maxGap1, pad, pad))
   653     cutRanges2 = list(trimmed(seqRanges2, coverDict2, minAlignedBases,
   654                               maxGap2, pad, pad))
   655 
   656     warn("reading secondary alignments...")
   657     alnDataB = readSecondaryAlignments(opts, cutRanges1, cutRanges2)
   658     alignmentsB, seqRangesB1, coverDictB1, seqRangesB2, coverDictB2 = alnDataB
   659     warn("cutting...")
   660     coverDictB1 = dict(mergedRangesPerSeq(coverDictB1))
   661     coverDictB2 = dict(mergedRangesPerSeq(coverDictB2))
   662     cutRangesB1 = trimmed(seqRangesB1, coverDictB1, minAlignedBases,
   663                           maxGapB1, 0, 0)
   664     cutRangesB2 = trimmed(seqRangesB2, coverDictB2, minAlignedBases,
   665                           maxGapB2, 0, 0)
   666 
   667     textRot1 = "vertical".startswith(opts.rot1)
   668     i1 = dataFromRanges(cutRanges1, cutRangesB1, opts.sort1, font,
   669                         opts.fontsize, image_mode, opts.labels1, textRot1)
   670     sortedRanges1, rangeSizes1, labelData1, tMargin = i1
   671 
   672     textRot2 = "horizontal".startswith(opts.rot2)
   673     i2 = dataFromRanges(cutRanges2, cutRangesB2, opts.sort2, font,
   674                         opts.fontsize, image_mode, opts.labels2, textRot2)
   675     sortedRanges2, rangeSizes2, labelData2, lMargin = i2
   676 
   677     maxPixels1 = opts.width  - lMargin
   678     maxPixels2 = opts.height - tMargin
   679     bpPerPix1 = get_bp_per_pix(rangeSizes1, opts.border_pixels, maxPixels1)
   680     bpPerPix2 = get_bp_per_pix(rangeSizes2, opts.border_pixels, maxPixels2)
   681     bpPerPix = max(bpPerPix1, bpPerPix2)
   682     warn("bp per pixel = " + str(bpPerPix))
   683 
   684     p1 = pixelData(rangeSizes1, bpPerPix, opts.border_pixels, lMargin)
   685     rangePixBegs1, rangePixLens1, width = p1
   686     rangeDict1 = dict(rangesPerSeq(sortedRanges1, rangePixBegs1, bpPerPix))
   687 
   688     p2 = pixelData(rangeSizes2, bpPerPix, opts.border_pixels, tMargin)
   689     rangePixBegs2, rangePixLens2, height = p2
   690     rangeDict2 = dict(rangesPerSeq(sortedRanges2, rangePixBegs2, bpPerPix))
   691 
   692     warn("width:  " + str(width))
   693     warn("height: " + str(height))
   694 
   695     warn("processing alignments...")
   696     hits = alignmentPixels(width, height, alignments + alignmentsB, bpPerPix,
   697                            rangeDict1, rangeDict2)
   698 
   699     warn("reading annotations...")
   700 
   701     rangeDict1 = expandedSeqDict(rangeDict1)
   702     beds1 = itertools.chain(readBed(opts.bed1, rangeDict1),
   703                             readRmsk(opts.rmsk1, rangeDict1),
   704                             readGenePred(opts, opts.genePred1, rangeDict1),
   705                             readGaps(opts, opts.gap1, rangeDict1))
   706     b1 = bedBoxes(beds1, rangeDict1, tMargin, height, True, bpPerPix)
   707 
   708     rangeDict2 = expandedSeqDict(rangeDict2)
   709     beds2 = itertools.chain(readBed(opts.bed2, rangeDict2),
   710                             readRmsk(opts.rmsk2, rangeDict2),
   711                             readGenePred(opts, opts.genePred2, rangeDict2),
   712                             readGaps(opts, opts.gap2, rangeDict2))
   713     b2 = bedBoxes(beds2, rangeDict2, lMargin, width, False, bpPerPix)
   714 
   715     boxes = sorted(itertools.chain(b1, b2))
   716 
   717     warn("drawing...")
   718 
   719     image_size = width, height
   720     im = Image.new(image_mode, image_size, opts.background_color)
   721 
   722     drawAnnotations(im, boxes)
   723 
   724     for i in range(height):
   725         for j in range(width):
   726             store_value = hits[i * width + j]
   727             xy = j, i
   728             if   store_value == 1: im.putpixel(xy, forward_color)
   729             elif store_value == 2: im.putpixel(xy, reverse_color)
   730             elif store_value == 3: im.putpixel(xy, overlap_color)
   731 
   732     if opts.fontsize != 0:
   733         axis1 = axisImage(labelData1, rangePixBegs1, rangePixLens1,
   734                           textRot1, False, font, image_mode, opts)
   735         if textRot1:
   736             axis1 = axis1.transpose(Image.ROTATE_90)
   737         axis2 = axisImage(labelData2, rangePixBegs2, rangePixLens2,
   738                           textRot2, textRot2, font, image_mode, opts)
   739         if not textRot2:
   740             axis2 = axis2.transpose(Image.ROTATE_270)
   741         im.paste(axis1, (0, 0))
   742         im.paste(axis2, (0, 0))
   743 
   744     for i in rangePixBegs1[1:]:
   745         box = i - opts.border_pixels, tMargin, i, height
   746         im.paste(opts.border_color, box)
   747 
   748     for i in rangePixBegs2[1:]:
   749         box = lMargin, i - opts.border_pixels, width, i
   750         im.paste(opts.border_color, box)
   751 
   752     im.save(args[1])
   753 
   754 if __name__ == "__main__":
   755     usage = """%prog --help
   756    or: %prog [options] maf-or-tab-alignments dotplot.png
   757    or: %prog [options] maf-or-tab-alignments dotplot.gif
   758    or: ..."""
   759     description = "Draw a dotplot of pair-wise sequence alignments in MAF or tabular format."
   760     op = optparse.OptionParser(usage=usage, description=description)
   761     op.add_option("-v", "--verbose", action="count",
   762                   help="show progress messages & data about the plot")
   763     op.add_option("-1", "--seq1", metavar="PATTERN", action="append",
   764                   default=[],
   765                   help="which sequences to show from the 1st genome")
   766     op.add_option("-2", "--seq2", metavar="PATTERN", action="append",
   767                   default=[],
   768                   help="which sequences to show from the 2nd genome")
   769     # Replace "width" & "height" with a single "length" option?
   770     op.add_option("-x", "--width", type="int", default=1000,
   771                   help="maximum width in pixels (default: %default)")
   772     op.add_option("-y", "--height", type="int", default=1000,
   773                   help="maximum height in pixels (default: %default)")
   774     op.add_option("-c", "--forwardcolor", metavar="COLOR", default="red",
   775                   help="color for forward alignments (default: %default)")
   776     op.add_option("-r", "--reversecolor", metavar="COLOR", default="blue",
   777                   help="color for reverse alignments (default: %default)")
   778     op.add_option("--alignments", metavar="FILE", help="secondary alignments")
   779     op.add_option("--sort1", default="1", metavar="N",
   780                   help="genome1 sequence order: 0=input order, 1=name order, "
   781                   "2=length order (default=%default)")
   782     op.add_option("--sort2", default="1", metavar="N",
   783                   help="genome2 sequence order: 0=input order, 1=name order, "
   784                   "2=length order (default=%default)")
   785     op.add_option("--max-gap1", metavar="FRAC", default="0.5,2", help=
   786                   "maximum unaligned (end,mid) gap in genome1: "
   787                   "fraction of aligned length (default=%default)")
   788     op.add_option("--max-gap2", metavar="FRAC", default="0.5,2", help=
   789                   "maximum unaligned (end,mid) gap in genome2: "
   790                   "fraction of aligned length (default=%default)")
   791     op.add_option("--pad", metavar="FRAC", type="float", default=0.04, help=
   792                   "pad length when cutting unaligned gaps: "
   793                   "fraction of aligned length (default=%default)")
   794     op.add_option("--border-pixels", metavar="INT", type="int", default=1,
   795                   help="number of pixels between sequences (default=%default)")
   796     op.add_option("--border-color", metavar="COLOR", default="black",
   797                   help="color for pixels between sequences (default=%default)")
   798     # --break-color and/or --break-pixels for intra-sequence breaks?
   799     op.add_option("--margin-color", metavar="COLOR", default="#dcdcdc",
   800                   help="margin color")
   801 
   802     og = optparse.OptionGroup(op, "Text options")
   803     og.add_option("-f", "--fontfile", metavar="FILE",
   804                   help="TrueType or OpenType font file")
   805     og.add_option("-s", "--fontsize", metavar="SIZE", type="int", default=14,
   806                   help="TrueType or OpenType font size (default: %default)")
   807     og.add_option("--labels1", type="int", default=0, metavar="N", help=
   808                   "genome1 labels: 0=name, 1=name:length, "
   809                   "2=name:start:length, 3=name:start-end (default=%default)")
   810     og.add_option("--labels2", type="int", default=0, metavar="N", help=
   811                   "genome2 labels: 0=name, 1=name:length, "
   812                   "2=name:start:length, 3=name:start-end (default=%default)")
   813     og.add_option("--rot1", metavar="ROT", default="h",
   814                   help="text rotation for the 1st genome (default=%default)")
   815     og.add_option("--rot2", metavar="ROT", default="v",
   816                   help="text rotation for the 2nd genome (default=%default)")
   817     op.add_option_group(og)
   818 
   819     og = optparse.OptionGroup(op, "Annotation options")
   820     og.add_option("--bed1", metavar="FILE",
   821                   help="read genome1 annotations from BED file")
   822     og.add_option("--bed2", metavar="FILE",
   823                   help="read genome2 annotations from BED file")
   824     og.add_option("--rmsk1", metavar="FILE", help="read genome1 repeats from "
   825                   "RepeatMasker .out or rmsk.txt file")
   826     og.add_option("--rmsk2", metavar="FILE", help="read genome2 repeats from "
   827                   "RepeatMasker .out or rmsk.txt file")
   828     op.add_option_group(og)
   829 
   830     og = optparse.OptionGroup(op, "Gene options")
   831     og.add_option("--genePred1", metavar="FILE",
   832                   help="read genome1 genes from genePred file")
   833     og.add_option("--genePred2", metavar="FILE",
   834                   help="read genome2 genes from genePred file")
   835     og.add_option("--exon-color", metavar="COLOR", default="PaleGreen",
   836                   help="color for exons (default=%default)")
   837     og.add_option("--cds-color", metavar="COLOR", default="LimeGreen",
   838                   help="color for protein-coding regions (default=%default)")
   839     op.add_option_group(og)
   840 
   841     og = optparse.OptionGroup(op, "Unsequenced gap options")
   842     og.add_option("--gap1", metavar="FILE",
   843                   help="read genome1 unsequenced gaps from agp or gap file")
   844     og.add_option("--gap2", metavar="FILE",
   845                   help="read genome2 unsequenced gaps from agp or gap file")
   846     og.add_option("--bridged-color", metavar="COLOR", default="yellow",
   847                   help="color for bridged gaps (default: %default)")
   848     og.add_option("--unbridged-color", metavar="COLOR", default="orange",
   849                   help="color for unbridged gaps (default: %default)")
   850     op.add_option_group(og)
   851     (opts, args) = op.parse_args()
   852     if len(args) != 2: op.error("2 arguments needed")
   853 
   854     opts.text_color = "black"
   855     opts.background_color = "white"
   856     opts.label_space = 5     # minimum number of pixels between axis labels
   857 
   858     try: lastDotplot(opts, args)
   859     except KeyboardInterrupt: pass  # avoid silly error message
   860     except Exception as e:
   861         prog = os.path.basename(sys.argv[0])
   862         sys.exit(prog + ": error: " + str(e))