scripts/last-dotplot
author Martin C. Frith
Wed Nov 08 17:59:10 2017 +0900 (2017-11-08)
changeset 911 4080f3f6f73d
parent 910 42653029d900
child 912 8834139fa8a8
permissions -rwxr-xr-x
Add last-dotplot secondary alignments option
     1 #! /usr/bin/env python
     2 
     3 # Read pair-wise alignments in MAF or LAST tabular format: write an
     4 # "Oxford grid", a.k.a. dotplot.
     5 
     6 # TODO: Currently, pixels with zero aligned nt-pairs are white, and
     7 # pixels with one or more aligned nt-pairs are black.  This can look
     8 # too crowded for large genome alignments.  I tried shading each pixel
     9 # according to the number of aligned nt-pairs within it, but the
    10 # result is too faint.  How can this be done better?
    11 
    12 import gzip
    13 from fnmatch import fnmatchcase
    14 from operator import itemgetter
    15 import subprocess
    16 import itertools, optparse, os, re, sys
    17 
    18 # Try to make PIL/PILLOW work:
    19 try: from PIL import Image, ImageDraw, ImageFont, ImageColor
    20 except ImportError: import Image, ImageDraw, ImageFont, ImageColor
    21 
    22 def myOpen(fileName):  # faster than fileinput
    23     if fileName is None:
    24         return []
    25     if fileName == "-":
    26         return sys.stdin
    27     if fileName.endswith(".gz"):
    28         return gzip.open(fileName)
    29     return open(fileName)
    30 
    31 def warn(message):
    32     if opts.verbose:
    33         prog = os.path.basename(sys.argv[0])
    34         sys.stderr.write(prog + ": " + message + "\n")
    35 
    36 def groupByFirstItem(things):
    37     for k, v in itertools.groupby(things, itemgetter(0)):
    38         yield k, [i[1:] for i in v]
    39 
    40 def croppedBlocks(blocks, ranges1, ranges2):
    41     headBeg1, headBeg2, headSize = blocks[0]
    42     for r1 in ranges1:
    43         for r2 in ranges2:
    44             cropBeg1, cropEnd1 = r1
    45             if headBeg1 < 0:
    46                 cropBeg1, cropEnd1 = -cropEnd1, -cropBeg1
    47             cropBeg2, cropEnd2 = r2
    48             if headBeg2 < 0:
    49                 cropBeg2, cropEnd2 = -cropEnd2, -cropBeg2
    50             for beg1, beg2, size in blocks:
    51                 b1 = max(cropBeg1, beg1)
    52                 e1 = min(cropEnd1, beg1 + size)
    53                 if b1 >= e1: continue
    54                 offset = beg2 - beg1
    55                 b2 = max(cropBeg2, b1 + offset)
    56                 e2 = min(cropEnd2, e1 + offset)
    57                 if b2 >= e2: continue
    58                 yield b2 - offset, b2, e2 - b2
    59 
    60 def tabBlocks(beg1, beg2, blocks):
    61     '''Get the gapless blocks of an alignment, from LAST tabular format.'''
    62     for i in blocks.split(","):
    63         if ":" in i:
    64             x, y = i.split(":")
    65             beg1 += int(x)
    66             beg2 += int(y)
    67         else:
    68             size = int(i)
    69             yield beg1, beg2, size
    70             beg1 += size
    71             beg2 += size
    72 
    73 def mafBlocks(beg1, beg2, seq1, seq2):
    74     '''Get the gapless blocks of an alignment, from MAF format.'''
    75     size = 0
    76     for x, y in itertools.izip(seq1, seq2):
    77         if x == "-":
    78             if size:
    79                 yield beg1, beg2, size
    80                 beg1 += size
    81                 beg2 += size
    82                 size = 0
    83             beg2 += 1
    84         elif y == "-":
    85             if size:
    86                 yield beg1, beg2, size
    87                 beg1 += size
    88                 beg2 += size
    89                 size = 0
    90             beg1 += 1
    91         else:
    92             size += 1
    93     if size: yield beg1, beg2, size
    94 
    95 def alignmentInput(lines):
    96     '''Get alignments and sequence lengths, from MAF or tabular format.'''
    97     mafCount = 0
    98     for line in lines:
    99         w = line.split()
   100         if line[0].isdigit():  # tabular format
   101             chr1, beg1, seqlen1 = w[1], int(w[2]), int(w[5])
   102             if w[4] == "-": beg1 -= seqlen1
   103             chr2, beg2, seqlen2 = w[6], int(w[7]), int(w[10])
   104             if w[9] == "-": beg2 -= seqlen2
   105             blocks = tabBlocks(beg1, beg2, w[11])
   106             yield chr1, seqlen1, chr2, seqlen2, blocks
   107         elif line[0] == "s":  # MAF format
   108             if mafCount == 0:
   109                 chr1, beg1, seqlen1, seq1 = w[1], int(w[2]), int(w[5]), w[6]
   110                 if w[4] == "-": beg1 -= seqlen1
   111                 mafCount = 1
   112             else:
   113                 chr2, beg2, seqlen2, seq2 = w[1], int(w[2]), int(w[5]), w[6]
   114                 if w[4] == "-": beg2 -= seqlen2
   115                 blocks = mafBlocks(beg1, beg2, seq1, seq2)
   116                 yield chr1, seqlen1, chr2, seqlen2, blocks
   117                 mafCount = 0
   118 
   119 def seqRequestFromText(text):
   120     if ":" in text:
   121         pattern, interval = text.rsplit(":", 1)
   122         if "-" in interval:
   123             beg, end = interval.rsplit("-", 1)
   124             return pattern, int(beg), int(end)  # beg may be negative
   125     return text, 0, sys.maxsize
   126 
   127 def rangesFromSeqName(seqRequests, name, seqLen):
   128     if seqRequests:
   129         base = name.split(".")[-1]  # allow for names like hg19.chr7
   130         for pat, beg, end in seqRequests:
   131             if fnmatchcase(name, pat) or fnmatchcase(base, pat):
   132                 yield max(beg, 0), min(end, seqLen)
   133     else:
   134         yield 0, seqLen
   135 
   136 def updateSeqs(coverDict, seqRanges, seqName, ranges, coveredRange):
   137     beg, end = coveredRange
   138     if beg < 0:
   139         coveredRange = -end, -beg
   140     if seqName in coverDict:
   141         coverDict[seqName].append(coveredRange)
   142     else:
   143         coverDict[seqName] = [coveredRange]
   144         for beg, end in ranges:
   145             r = seqName, beg, end
   146             seqRanges.append(r)
   147 
   148 def readAlignments(fileName, opts):
   149     '''Get alignments and sequence limits, from MAF or tabular format.'''
   150     seqRequests1 = map(seqRequestFromText, opts.seq1)
   151     seqRequests2 = map(seqRequestFromText, opts.seq2)
   152 
   153     alignments = []
   154     seqRanges1 = []
   155     seqRanges2 = []
   156     coverDict1 = {}
   157     coverDict2 = {}
   158     lines = myOpen(fileName)
   159     for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
   160         ranges1 = sorted(rangesFromSeqName(seqRequests1, seqName1, seqLen1))
   161         if not ranges1: continue
   162         ranges2 = sorted(rangesFromSeqName(seqRequests2, seqName2, seqLen2))
   163         if not ranges2: continue
   164         b = list(croppedBlocks(list(blocks), ranges1, ranges2))
   165         if not b: continue
   166         aln = seqName1, seqName2, b
   167         alignments.append(aln)
   168         coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
   169         updateSeqs(coverDict1, seqRanges1, seqName1, ranges1, coveredRange1)
   170         coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
   171         updateSeqs(coverDict2, seqRanges2, seqName2, ranges2, coveredRange2)
   172     return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
   173 
   174 def nameAndRangesFromDict(cropDict, seqName):
   175     if seqName in cropDict:
   176         return seqName, cropDict[seqName]
   177     n = seqName.split(".")[-1]
   178     if n in cropDict:
   179         return n, cropDict[n]
   180     return seqName, []
   181 
   182 def rangesForSecondaryAlignments(primaryRanges, seqLen):
   183     if primaryRanges:
   184         return primaryRanges
   185     return [(0, seqLen)]
   186 
   187 def readSecondaryAlignments(opts, cropRanges1, cropRanges2):
   188     cropDict1 = dict(groupByFirstItem(cropRanges1))
   189     cropDict2 = dict(groupByFirstItem(cropRanges2))
   190 
   191     alignments = []
   192     seqRanges1 = []
   193     seqRanges2 = []
   194     coverDict1 = {}
   195     coverDict2 = {}
   196     lines = myOpen(opts.alignments)
   197     for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
   198         seqName1, ranges1 = nameAndRangesFromDict(cropDict1, seqName1)
   199         seqName2, ranges2 = nameAndRangesFromDict(cropDict2, seqName2)
   200         if not ranges1 and not ranges2:
   201             continue
   202         r1 = rangesForSecondaryAlignments(ranges1, seqLen1)
   203         r2 = rangesForSecondaryAlignments(ranges2, seqLen2)
   204         b = list(croppedBlocks(list(blocks), r1, r2))
   205         if not b: continue
   206         aln = seqName1, seqName2, b
   207         alignments.append(aln)
   208         if not ranges1:
   209             coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
   210             updateSeqs(coverDict1, seqRanges1, seqName1, r1, coveredRange1)
   211         if not ranges2:
   212             coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
   213             updateSeqs(coverDict2, seqRanges2, seqName2, r2, coveredRange2)
   214     return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
   215 
   216 def twoValuesFromOption(text, separator):
   217     if separator in text:
   218         return text.split(separator)
   219     return text, text
   220 
   221 def mergedRanges(ranges):
   222     oldBeg, maxEnd = ranges[0]
   223     for beg, end in ranges:
   224         if beg > maxEnd:
   225             yield oldBeg, maxEnd
   226             oldBeg = beg
   227             maxEnd = end
   228         elif end > maxEnd:
   229             maxEnd = end
   230     yield oldBeg, maxEnd
   231 
   232 def mergedRangesPerSeq(coverDict):
   233     for k, v in coverDict.iteritems():
   234         v.sort()
   235         yield k, list(mergedRanges(v))
   236 
   237 def coveredLength(mergedCoverDict):
   238     return sum(sum(e - b for b, e in v) for v in mergedCoverDict.itervalues())
   239 
   240 def trimmed(seqRanges, coverDict, minAlignedBases, maxGapFrac, endPad, midPad):
   241     maxEndGapFrac, maxMidGapFrac = twoValuesFromOption(maxGapFrac, ",")
   242     maxEndGap = max(float(maxEndGapFrac) * minAlignedBases, endPad * 1.0)
   243     maxMidGap = max(float(maxMidGapFrac) * minAlignedBases, midPad * 2.0)
   244 
   245     for seqName, rangeBeg, rangeEnd in seqRanges:
   246         seqBlocks = coverDict[seqName]
   247         blocks = [i for i in seqBlocks if i[0] < rangeEnd and i[1] > rangeBeg]
   248         if blocks[0][0] - rangeBeg > maxEndGap:
   249             rangeBeg = blocks[0][0] - endPad
   250         for j, y in enumerate(blocks):
   251             if j:
   252                 x = blocks[j - 1]
   253                 if y[0] - x[1] > maxMidGap:
   254                     yield seqName, rangeBeg, x[1] + midPad
   255                     rangeBeg = y[0] - midPad
   256         if rangeEnd - blocks[-1][1] > maxEndGap:
   257             rangeEnd = blocks[-1][1] + endPad
   258         yield seqName, rangeBeg, rangeEnd
   259 
   260 def natural_sort_key(my_string):
   261     '''Return a sort key for "natural" ordering, e.g. chr9 < chr10.'''
   262     parts = re.split(r'(\d+)', my_string)
   263     parts[1::2] = map(int, parts[1::2])
   264     return parts
   265 
   266 def nameKey(oneSeqRanges):
   267     return natural_sort_key(oneSeqRanges[0][0])
   268 
   269 def sizeKey(oneSeqRanges):
   270     return sum(b - e for n, b, e in oneSeqRanges), nameKey(oneSeqRanges)
   271 
   272 def getSortedRanges(seqRanges, sortOpt):
   273     g = [list(v) for k, v in itertools.groupby(seqRanges, itemgetter(0))]
   274     if sortOpt == "1":
   275         g.sort(key=nameKey)
   276     if sortOpt == "2":
   277         g.sort(key=sizeKey)
   278     return [j for i in g for j in i]
   279 
   280 def allSortedRanges(seqRanges, seqRangesB, sortOpt):
   281     x, y = twoValuesFromOption(sortOpt, ":")
   282     return getSortedRanges(seqRanges, x) + getSortedRanges(seqRangesB, y)
   283 
   284 def prettyNum(n):
   285     t = str(n)
   286     groups = []
   287     while t:
   288         groups.append(t[-3:])
   289         t = t[:-3]
   290     return ",".join(reversed(groups))
   291 
   292 def sizeText(size):
   293     suffixes = "bp", "kb", "Mb", "Gb"
   294     for i, x in enumerate(suffixes):
   295         j = 10 ** (i * 3)
   296         if size < j * 10:
   297             return "%.2g" % (1.0 * size / j) + x
   298         if size < j * 1000 or i == len(suffixes) - 1:
   299             return "%.0f" % (1.0 * size / j) + x
   300 
   301 def labelText(seqRange, labelOpt):
   302     seqName, beg, end = seqRange
   303     if labelOpt == 1:
   304         return seqName + ": " + sizeText(end - beg)
   305     if labelOpt == 2:
   306         return seqName + ":" + prettyNum(beg) + ": " + sizeText(end - beg)
   307     if labelOpt == 3:
   308         return seqName + ":" + prettyNum(beg) + "-" + prettyNum(end)
   309     return seqName
   310 
   311 def rangeLabels(seqRanges, labelOpt, font, fontsize, image_mode, textRot):
   312     if fontsize:
   313         image_size = 1, 1
   314         im = Image.new(image_mode, image_size)
   315         draw = ImageDraw.Draw(im)
   316     x = y = 0
   317     for r in seqRanges:
   318         text = labelText(r, labelOpt)
   319         if fontsize:
   320             x, y = draw.textsize(text, font=font)
   321             if textRot:
   322                 x, y = y, x
   323         yield text, x, y
   324 
   325 def dataFromRanges(cutRanges, cutRangesB, sortOpt, font,
   326                    fontsize, image_mode, labelOpt, textRot):
   327     s = allSortedRanges(cutRanges, cutRangesB, sortOpt)
   328     for i in s:
   329         warn("\t".join(map(str, i)))
   330     warn("")
   331     rangeSizes = [e - b for n, b, e in s]
   332     labs = list(rangeLabels(s, labelOpt, font, fontsize, image_mode, textRot))
   333     margin = max(i[2] for i in labs)
   334     # xxx the margin may be too big, because some labels may get omitted
   335     return s, rangeSizes, labs, margin
   336 
   337 def div_ceil(x, y):
   338     '''Return x / y rounded up.'''
   339     q, r = divmod(x, y)
   340     return q + (r != 0)
   341 
   342 def get_bp_per_pix(rangeSizes, pixTweenRanges, maxPixels):
   343     '''Get the minimum bp-per-pixel that fits in the size limit.'''
   344     warn("choosing bp per pixel...")
   345     numOfRanges = len(rangeSizes)
   346     maxPixelsInRanges = maxPixels - pixTweenRanges * (numOfRanges - 1)
   347     if maxPixelsInRanges < numOfRanges:
   348         raise Exception("can't fit the image: too many sequences?")
   349     negLimit = -maxPixelsInRanges
   350     negBpPerPix = sum(rangeSizes) // negLimit
   351     while True:
   352         if sum(i // negBpPerPix for i in rangeSizes) >= negLimit:
   353             return -negBpPerPix
   354         negBpPerPix -= 1
   355 
   356 def getRangePixBegs(rangePixLens, pixTweenRanges, margin):
   357     '''Get the start pixel for each range.'''
   358     rangePixBegs = []
   359     pix_tot = margin - pixTweenRanges
   360     for i in rangePixLens:
   361         pix_tot += pixTweenRanges
   362         rangePixBegs.append(pix_tot)
   363         pix_tot += i
   364     return rangePixBegs
   365 
   366 def pixelData(rangeSizes, bp_per_pix, pixTweenRanges, margin):
   367     '''Return pixel information about the ranges.'''
   368     rangePixLens = [div_ceil(i, bp_per_pix) for i in rangeSizes]
   369     rangePixBegs = getRangePixBegs(rangePixLens, pixTweenRanges, margin)
   370     tot_pix = rangePixBegs[-1] + rangePixLens[-1]
   371     return rangePixBegs, rangePixLens, tot_pix
   372 
   373 def drawLineForward(hits, width, bp_per_pix, beg1, beg2, size):
   374     while True:
   375         q1, r1 = divmod(beg1, bp_per_pix)
   376         q2, r2 = divmod(beg2, bp_per_pix)
   377         hits[q2 * width + q1] |= 1
   378         next_pix = min(bp_per_pix - r1, bp_per_pix - r2)
   379         if next_pix >= size: break
   380         beg1 += next_pix
   381         beg2 += next_pix
   382         size -= next_pix
   383 
   384 def drawLineReverse(hits, width, bp_per_pix, beg1, beg2, size):
   385     beg2 = -1 - beg2
   386     while True:
   387         q1, r1 = divmod(beg1, bp_per_pix)
   388         q2, r2 = divmod(beg2, bp_per_pix)
   389         hits[q2 * width + q1] |= 2
   390         next_pix = min(bp_per_pix - r1, r2 + 1)
   391         if next_pix >= size: break
   392         beg1 += next_pix
   393         beg2 -= next_pix
   394         size -= next_pix
   395 
   396 def findOrigin(ranges, beg, size):
   397     if beg < 0:
   398         beg = -(beg + size)
   399     for rangeBeg, rangeEnd, origin in ranges:
   400         if rangeEnd > beg:
   401             return origin
   402 
   403 def alignmentPixels(width, height, alignments, bp_per_pix,
   404                     rangeDict1, rangeDict2):
   405     hits = [0] * (width * height)  # the image data
   406     for seq1, seq2, blocks in alignments:
   407         beg1, beg2, size = blocks[0]
   408         ori1 = findOrigin(rangeDict1[seq1], beg1, size)
   409         ori2 = findOrigin(rangeDict2[seq2], beg2, size)
   410         for beg1, beg2, size in blocks:
   411             if beg1 < 0:
   412                 beg1 = -(beg1 + size)
   413                 beg2 = -(beg2 + size)
   414             if beg2 >= 0:
   415                 drawLineForward(hits, width, bp_per_pix,
   416                                 beg1 + ori1, beg2 + ori2, size)
   417             else:
   418                 drawLineReverse(hits, width, bp_per_pix,
   419                                 beg1 + ori1, beg2 - ori2, size)
   420     return hits
   421 
   422 def expandedSeqDict(seqDict):
   423     '''Allow lookup by short sequence names, e.g. chr7 as well as hg19.chr7.'''
   424     newDict = seqDict.copy()
   425     for name, x in seqDict.items():
   426         if "." in name:
   427             base = name.split(".")[-1]
   428             if base in newDict:  # an ambiguous case was found:
   429                 return seqDict   # so give up completely
   430             newDict[base] = x
   431     return newDict
   432 
   433 def readBed(fileName, rangeDict):
   434     for line in myOpen(fileName):
   435         w = line.split()
   436         if not w: continue
   437         seqName = w[0]
   438         if seqName not in rangeDict: continue
   439         beg = int(w[1])
   440         end = int(w[2])
   441         layer = 900
   442         color = "#fbf"
   443         if len(w) > 4:
   444             if w[4] != ".":
   445                 layer = float(w[4])
   446             if len(w) > 5:
   447                 if len(w) > 8 and w[8].count(",") == 2:
   448                     color = "rgb(" + w[8] + ")"
   449                 elif w[5] == "+":
   450                     color = "#ffe8e8"
   451                 elif w[5] == "-":
   452                     color = "#e8e8ff"
   453         yield layer, color, seqName, beg, end
   454 
   455 def commaSeparatedInts(text):
   456     return map(int, text.rstrip(",").split(","))
   457 
   458 def readGenePred(opts, fileName, rangeDict):
   459     for line in myOpen(fileName):
   460         fields = line.split()
   461         if not fields: continue
   462         if fields[2] not in "+-": fields = fields[1:]
   463         seqName = fields[1]
   464         if seqName not in rangeDict: continue
   465         #strand = fields[2]
   466         cdsBeg = int(fields[5])
   467         cdsEnd = int(fields[6])
   468         exonBegs = commaSeparatedInts(fields[8])
   469         exonEnds = commaSeparatedInts(fields[9])
   470         for beg, end in zip(exonBegs, exonEnds):
   471             yield 300, opts.exon_color, seqName, beg, end
   472             b = max(beg, cdsBeg)
   473             e = min(end, cdsEnd)
   474             if b < e: yield 400, opts.cds_color, seqName, b, e
   475 
   476 def readRmsk(fileName, rangeDict):
   477     for line in myOpen(fileName):
   478         fields = line.split()
   479         if len(fields) == 17:  # rmsk.txt
   480             seqName = fields[5]
   481             if seqName not in rangeDict: continue  # do this ASAP for speed
   482             beg = int(fields[6])
   483             end = int(fields[7])
   484             strand = fields[9]
   485             repeatClass = fields[11]
   486         elif len(fields) == 15:  # .out
   487             seqName = fields[4]
   488             if seqName not in rangeDict: continue
   489             beg = int(fields[5]) - 1
   490             end = int(fields[6])
   491             strand = fields[8]
   492             repeatClass = fields[10]
   493         else:
   494             continue
   495         if repeatClass in ("Low_complexity", "Simple_repeat"):
   496             yield 200, "#fbf", seqName, beg, end
   497         elif strand == "+":
   498             yield 100, "#ffe8e8", seqName, beg, end
   499         else:
   500             yield 100, "#e8e8ff", seqName, beg, end
   501 
   502 def isExtraFirstGapField(fields):
   503     return fields[4].isdigit()
   504 
   505 def readGaps(opts, fileName, rangeDict):
   506     '''Read locations of unsequenced gaps, from an agp or gap file.'''
   507     for line in myOpen(fileName):
   508         w = line.split()
   509         if not w or w[0][0] == "#": continue
   510         if isExtraFirstGapField(w): w = w[1:]
   511         if w[4] not in "NU": continue
   512         seqName = w[0]
   513         if seqName not in rangeDict: continue
   514         end = int(w[2])
   515         beg = end - int(w[5])  # zero-based coordinate
   516         if w[7] == "yes":
   517             yield 3000, opts.bridged_color, seqName, beg, end
   518         else:
   519             yield 2000, opts.unbridged_color, seqName, beg, end
   520 
   521 def bedBoxes(beds, rangeDict, margin, edge, isTop, bpPerPix):
   522     for layer, color, seqName, bedBeg, bedEnd in beds:
   523         for rangeBeg, rangeEnd, origin in rangeDict[seqName]:
   524             beg = max(bedBeg, rangeBeg)
   525             end = min(bedEnd, rangeEnd)
   526             if beg >= end: continue
   527             if layer <= 1000:
   528                 # include partly-covered pixels
   529                 b = (origin + beg) // bpPerPix
   530                 e = div_ceil(origin + end, bpPerPix)
   531             else:
   532                 # exclude partly-covered pixels
   533                 b = div_ceil(origin + beg, bpPerPix)
   534                 e = (origin + end) // bpPerPix
   535                 if e <= b: continue
   536             if isTop:
   537                 box = b, margin, e, edge
   538             else:
   539                 box = margin, b, edge, e
   540             yield layer, color, box
   541 
   542 def drawAnnotations(im, boxes):
   543     # xxx use partial transparency for different-color overlaps?
   544     for layer, color, box in boxes:
   545         im.paste(color, box)
   546 
   547 def placedLabels(labels, rangePixBegs, rangePixLens, beg, end):
   548     '''Return axis labels with endpoint & sort-order information.'''
   549     maxWidth = end - beg
   550     for i, j, k in zip(labels, rangePixBegs, rangePixLens):
   551         text, textWidth, textHeight = i
   552         if textWidth > maxWidth:
   553             continue
   554         labelBeg = j + (k - textWidth) // 2
   555         labelEnd = labelBeg + textWidth
   556         sortKey = textWidth - k
   557         if labelBeg < beg:
   558             sortKey += maxWidth * (beg - labelBeg)
   559             labelBeg = beg
   560             labelEnd = beg + textWidth
   561         if labelEnd > end:
   562             sortKey += maxWidth * (labelEnd - end)
   563             labelEnd = end
   564             labelBeg = end - textWidth
   565         yield sortKey, labelBeg, labelEnd, text, textHeight
   566 
   567 def nonoverlappingLabels(labels, minPixTweenLabels):
   568     '''Get a subset of non-overlapping axis labels, greedily.'''
   569     out = []
   570     for i in labels:
   571         beg = i[1] - minPixTweenLabels
   572         end = i[2] + minPixTweenLabels
   573         if all(j[2] <= beg or j[1] >= end for j in out):
   574             out.append(i)
   575     return out
   576 
   577 def axisImage(labels, rangePixBegs, rangePixLens, textRot,
   578               textAln, font, image_mode, opts):
   579     '''Make an image of axis labels.'''
   580     beg = rangePixBegs[0]
   581     end = rangePixBegs[-1] + rangePixLens[-1]
   582     margin = max(i[2] for i in labels)
   583     labels = sorted(placedLabels(labels, rangePixBegs, rangePixLens, beg, end))
   584     minPixTweenLabels = 0 if textRot else opts.label_space
   585     labels = nonoverlappingLabels(labels, minPixTweenLabels)
   586     image_size = (margin, end) if textRot else (end, margin)
   587     im = Image.new(image_mode, image_size, opts.margin_color)
   588     draw = ImageDraw.Draw(im)
   589     for i in labels:
   590         base = margin - i[4] if textAln else 0
   591         position = (base, i[1]) if textRot else (i[1], base)
   592         draw.text(position, i[3], font=font, fill=opts.text_color)
   593     return im
   594 
   595 def rangesWithOrigins(sortedRanges, rangePixBegs, bpPerPix):
   596     for i, j in zip(sortedRanges, rangePixBegs):
   597         seqName, rangeBeg, rangeEnd = i
   598         origin = bpPerPix * j - rangeBeg
   599         yield seqName, (rangeBeg, rangeEnd, origin)
   600 
   601 def rangesPerSeq(sortedRanges, rangePixBegs, bpPerPix):
   602     a = rangesWithOrigins(sortedRanges, rangePixBegs, bpPerPix)
   603     for k, v in itertools.groupby(a, itemgetter(0)):
   604         yield k, [i[1] for i in v]
   605 
   606 def getFont(opts):
   607     if opts.fontfile:
   608         return ImageFont.truetype(opts.fontfile, opts.fontsize)
   609     fileNames = []
   610     try:
   611         x = ["fc-match", "-f%{file}", "arial"]
   612         p = subprocess.Popen(x, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
   613         out, err = p.communicate()
   614         fileNames.append(out)
   615     except OSError as e:
   616         warn("fc-match error: " + str(e))
   617     fileNames.append("/Library/Fonts/Arial.ttf")  # for Mac
   618     for i in fileNames:
   619         try:
   620             font = ImageFont.truetype(i, opts.fontsize)
   621             warn("font: " + i)
   622             return font
   623         except IOError as e:
   624             warn("font load error: " + str(e))
   625     return ImageFont.load_default()
   626 
   627 def lastDotplot(opts, args):
   628     font = getFont(opts)
   629     image_mode = 'RGB'
   630     forward_color = ImageColor.getcolor(opts.forwardcolor, image_mode)
   631     reverse_color = ImageColor.getcolor(opts.reversecolor, image_mode)
   632     zipped_colors = zip(forward_color, reverse_color)
   633     overlap_color = tuple([(i + j) // 2 for i, j in zipped_colors])
   634 
   635     maxGap1, maxGapB1 = twoValuesFromOption(opts.max_gap1, ":")
   636     maxGap2, maxGapB2 = twoValuesFromOption(opts.max_gap2, ":")
   637 
   638     warn("reading alignments...")
   639     alnData = readAlignments(args[0], opts)
   640     alignments, seqRanges1, coverDict1, seqRanges2, coverDict2 = alnData
   641     if not alignments: raise Exception("there are no alignments")
   642     warn("cutting...")
   643     coverDict1 = dict(mergedRangesPerSeq(coverDict1))
   644     coverDict2 = dict(mergedRangesPerSeq(coverDict2))
   645     minAlignedBases = min(coveredLength(coverDict1), coveredLength(coverDict2))
   646     pad = int(opts.pad * minAlignedBases)
   647     cutRanges1 = list(trimmed(seqRanges1, coverDict1, minAlignedBases,
   648                               maxGap1, pad, pad))
   649     cutRanges2 = list(trimmed(seqRanges2, coverDict2, minAlignedBases,
   650                               maxGap2, pad, pad))
   651 
   652     warn("reading secondary alignments...")
   653     alnDataB = readSecondaryAlignments(opts, cutRanges1, cutRanges2)
   654     alignmentsB, seqRangesB1, coverDictB1, seqRangesB2, coverDictB2 = alnDataB
   655     warn("cutting...")
   656     coverDictB1 = dict(mergedRangesPerSeq(coverDictB1))
   657     coverDictB2 = dict(mergedRangesPerSeq(coverDictB2))
   658     cutRangesB1 = trimmed(seqRangesB1, coverDictB1, minAlignedBases,
   659                           maxGapB1, 0, 0)
   660     cutRangesB2 = trimmed(seqRangesB2, coverDictB2, minAlignedBases,
   661                           maxGapB2, 0, 0)
   662 
   663     textRot1 = "vertical".startswith(opts.rot1)
   664     i1 = dataFromRanges(cutRanges1, cutRangesB1, opts.sort1, font,
   665                         opts.fontsize, image_mode, opts.labels1, textRot1)
   666     sortedRanges1, rangeSizes1, labelData1, tMargin = i1
   667 
   668     textRot2 = "horizontal".startswith(opts.rot2)
   669     i2 = dataFromRanges(cutRanges2, cutRangesB2, opts.sort2, font,
   670                         opts.fontsize, image_mode, opts.labels2, textRot2)
   671     sortedRanges2, rangeSizes2, labelData2, lMargin = i2
   672 
   673     maxPixels1 = opts.width  - lMargin
   674     maxPixels2 = opts.height - tMargin
   675     bpPerPix1 = get_bp_per_pix(rangeSizes1, opts.border_pixels, maxPixels1)
   676     bpPerPix2 = get_bp_per_pix(rangeSizes2, opts.border_pixels, maxPixels2)
   677     bpPerPix = max(bpPerPix1, bpPerPix2)
   678     warn("bp per pixel = " + str(bpPerPix))
   679 
   680     p1 = pixelData(rangeSizes1, bpPerPix, opts.border_pixels, lMargin)
   681     rangePixBegs1, rangePixLens1, width = p1
   682     rangeDict1 = dict(rangesPerSeq(sortedRanges1, rangePixBegs1, bpPerPix))
   683 
   684     p2 = pixelData(rangeSizes2, bpPerPix, opts.border_pixels, tMargin)
   685     rangePixBegs2, rangePixLens2, height = p2
   686     rangeDict2 = dict(rangesPerSeq(sortedRanges2, rangePixBegs2, bpPerPix))
   687 
   688     warn("width:  " + str(width))
   689     warn("height: " + str(height))
   690 
   691     warn("processing alignments...")
   692     hits = alignmentPixels(width, height, alignments + alignmentsB, bpPerPix,
   693                            rangeDict1, rangeDict2)
   694 
   695     warn("reading annotations...")
   696 
   697     rangeDict1 = expandedSeqDict(rangeDict1)
   698     beds1 = itertools.chain(readBed(opts.bed1, rangeDict1),
   699                             readRmsk(opts.rmsk1, rangeDict1),
   700                             readGenePred(opts, opts.genePred1, rangeDict1),
   701                             readGaps(opts, opts.gap1, rangeDict1))
   702     b1 = bedBoxes(beds1, rangeDict1, tMargin, height, True, bpPerPix)
   703 
   704     rangeDict2 = expandedSeqDict(rangeDict2)
   705     beds2 = itertools.chain(readBed(opts.bed2, rangeDict2),
   706                             readRmsk(opts.rmsk2, rangeDict2),
   707                             readGenePred(opts, opts.genePred2, rangeDict2),
   708                             readGaps(opts, opts.gap2, rangeDict2))
   709     b2 = bedBoxes(beds2, rangeDict2, lMargin, width, False, bpPerPix)
   710 
   711     boxes = sorted(itertools.chain(b1, b2))
   712 
   713     warn("drawing...")
   714 
   715     image_size = width, height
   716     im = Image.new(image_mode, image_size, opts.background_color)
   717 
   718     drawAnnotations(im, boxes)
   719 
   720     for i in range(height):
   721         for j in range(width):
   722             store_value = hits[i * width + j]
   723             xy = j, i
   724             if   store_value == 1: im.putpixel(xy, forward_color)
   725             elif store_value == 2: im.putpixel(xy, reverse_color)
   726             elif store_value == 3: im.putpixel(xy, overlap_color)
   727 
   728     if opts.fontsize != 0:
   729         axis1 = axisImage(labelData1, rangePixBegs1, rangePixLens1,
   730                           textRot1, False, font, image_mode, opts)
   731         if textRot1:
   732             axis1 = axis1.transpose(Image.ROTATE_90)
   733         axis2 = axisImage(labelData2, rangePixBegs2, rangePixLens2,
   734                           textRot2, textRot2, font, image_mode, opts)
   735         if not textRot2:
   736             axis2 = axis2.transpose(Image.ROTATE_270)
   737         im.paste(axis1, (0, 0))
   738         im.paste(axis2, (0, 0))
   739 
   740     for i in rangePixBegs1[1:]:
   741         box = i - opts.border_pixels, tMargin, i, height
   742         im.paste(opts.border_color, box)
   743 
   744     for i in rangePixBegs2[1:]:
   745         box = lMargin, i - opts.border_pixels, width, i
   746         im.paste(opts.border_color, box)
   747 
   748     im.save(args[1])
   749 
   750 if __name__ == "__main__":
   751     usage = """%prog --help
   752    or: %prog [options] maf-or-tab-alignments dotplot.png
   753    or: %prog [options] maf-or-tab-alignments dotplot.gif
   754    or: ..."""
   755     description = "Draw a dotplot of pair-wise sequence alignments in MAF or tabular format."
   756     op = optparse.OptionParser(usage=usage, description=description)
   757     op.add_option("-v", "--verbose", action="count",
   758                   help="show progress messages & data about the plot")
   759     op.add_option("-1", "--seq1", metavar="PATTERN", action="append",
   760                   default=[],
   761                   help="which sequences to show from the 1st genome")
   762     op.add_option("-2", "--seq2", metavar="PATTERN", action="append",
   763                   default=[],
   764                   help="which sequences to show from the 2nd genome")
   765     # Replace "width" & "height" with a single "length" option?
   766     op.add_option("-x", "--width", type="int", default=1000,
   767                   help="maximum width in pixels (default: %default)")
   768     op.add_option("-y", "--height", type="int", default=1000,
   769                   help="maximum height in pixels (default: %default)")
   770     op.add_option("-c", "--forwardcolor", metavar="COLOR", default="red",
   771                   help="color for forward alignments (default: %default)")
   772     op.add_option("-r", "--reversecolor", metavar="COLOR", default="blue",
   773                   help="color for reverse alignments (default: %default)")
   774     op.add_option("--alignments", metavar="FILE", help="secondary alignments")
   775     op.add_option("--sort1", default="1", metavar="N",
   776                   help="genome1 sequence order: 0=input order, 1=name order, "
   777                   "2=length order (default=%default)")
   778     op.add_option("--sort2", default="1", metavar="N",
   779                   help="genome2 sequence order: 0=input order, 1=name order, "
   780                   "2=length order (default=%default)")
   781     op.add_option("--max-gap1", metavar="FRAC", default="0.5,2", help=
   782                   "maximum unaligned (end,mid) gap in genome1: "
   783                   "fraction of aligned length (default=%default)")
   784     op.add_option("--max-gap2", metavar="FRAC", default="0.5,2", help=
   785                   "maximum unaligned (end,mid) gap in genome2: "
   786                   "fraction of aligned length (default=%default)")
   787     op.add_option("--pad", metavar="FRAC", type="float", default=0.04, help=
   788                   "pad length when cutting unaligned gaps: "
   789                   "fraction of aligned length (default=%default)")
   790     op.add_option("--border-pixels", metavar="INT", type="int", default=1,
   791                   help="number of pixels between sequences (default=%default)")
   792     op.add_option("--border-color", metavar="COLOR", default="black",
   793                   help="color for pixels between sequences (default=%default)")
   794     # --break-color and/or --break-pixels for intra-sequence breaks?
   795     op.add_option("--margin-color", metavar="COLOR", default="#dcdcdc",
   796                   help="margin color")
   797 
   798     og = optparse.OptionGroup(op, "Text options")
   799     og.add_option("-f", "--fontfile", metavar="FILE",
   800                   help="TrueType or OpenType font file")
   801     og.add_option("-s", "--fontsize", metavar="SIZE", type="int", default=14,
   802                   help="TrueType or OpenType font size (default: %default)")
   803     og.add_option("--labels1", type="int", default=0, metavar="N", help=
   804                   "genome1 labels: 0=name, 1=name:length, "
   805                   "2=name:start:length, 3=name:start-end (default=%default)")
   806     og.add_option("--labels2", type="int", default=0, metavar="N", help=
   807                   "genome2 labels: 0=name, 1=name:length, "
   808                   "2=name:start:length, 3=name:start-end (default=%default)")
   809     og.add_option("--rot1", metavar="ROT", default="h",
   810                   help="text rotation for the 1st genome (default=%default)")
   811     og.add_option("--rot2", metavar="ROT", default="v",
   812                   help="text rotation for the 2nd genome (default=%default)")
   813     op.add_option_group(og)
   814 
   815     og = optparse.OptionGroup(op, "Annotation options")
   816     og.add_option("--bed1", metavar="FILE",
   817                   help="read genome1 annotations from BED file")
   818     og.add_option("--bed2", metavar="FILE",
   819                   help="read genome2 annotations from BED file")
   820     og.add_option("--rmsk1", metavar="FILE", help="read genome1 repeats from "
   821                   "RepeatMasker .out or rmsk.txt file")
   822     og.add_option("--rmsk2", metavar="FILE", help="read genome2 repeats from "
   823                   "RepeatMasker .out or rmsk.txt file")
   824     op.add_option_group(og)
   825 
   826     og = optparse.OptionGroup(op, "Gene options")
   827     og.add_option("--genePred1", metavar="FILE",
   828                   help="read genome1 genes from genePred file")
   829     og.add_option("--genePred2", metavar="FILE",
   830                   help="read genome2 genes from genePred file")
   831     og.add_option("--exon-color", metavar="COLOR", default="PaleGreen",
   832                   help="color for exons (default=%default)")
   833     og.add_option("--cds-color", metavar="COLOR", default="LimeGreen",
   834                   help="color for protein-coding regions (default=%default)")
   835     op.add_option_group(og)
   836 
   837     og = optparse.OptionGroup(op, "Unsequenced gap options")
   838     og.add_option("--gap1", metavar="FILE",
   839                   help="read genome1 unsequenced gaps from agp or gap file")
   840     og.add_option("--gap2", metavar="FILE",
   841                   help="read genome2 unsequenced gaps from agp or gap file")
   842     og.add_option("--bridged-color", metavar="COLOR", default="yellow",
   843                   help="color for bridged gaps (default: %default)")
   844     og.add_option("--unbridged-color", metavar="COLOR", default="orange",
   845                   help="color for unbridged gaps (default: %default)")
   846     op.add_option_group(og)
   847     (opts, args) = op.parse_args()
   848     if len(args) != 2: op.error("2 arguments needed")
   849 
   850     opts.text_color = "black"
   851     opts.background_color = "white"
   852     opts.label_space = 5     # minimum number of pixels between axis labels
   853 
   854     try: lastDotplot(opts, args)
   855     except KeyboardInterrupt: pass  # avoid silly error message
   856     except Exception as e:
   857         prog = os.path.basename(sys.argv[0])
   858         sys.exit(prog + ": error: " + str(e))