scripts/last-dotplot
author Martin C. Frith
Wed Nov 08 18:14:51 2017 +0900 (2017-11-08)
changeset 912 8834139fa8a8
parent 911 4080f3f6f73d
child 913 e87abc7ae9c6
permissions -rwxr-xr-x
Tweak last-dotplot range-end pixels
     1 #! /usr/bin/env python
     2 
     3 # Read pair-wise alignments in MAF or LAST tabular format: write an
     4 # "Oxford grid", a.k.a. dotplot.
     5 
     6 # TODO: Currently, pixels with zero aligned nt-pairs are white, and
     7 # pixels with one or more aligned nt-pairs are black.  This can look
     8 # too crowded for large genome alignments.  I tried shading each pixel
     9 # according to the number of aligned nt-pairs within it, but the
    10 # result is too faint.  How can this be done better?
    11 
    12 import gzip
    13 from fnmatch import fnmatchcase
    14 from operator import itemgetter
    15 import subprocess
    16 import itertools, optparse, os, re, sys
    17 
    18 # Try to make PIL/PILLOW work:
    19 try: from PIL import Image, ImageDraw, ImageFont, ImageColor
    20 except ImportError: import Image, ImageDraw, ImageFont, ImageColor
    21 
    22 def myOpen(fileName):  # faster than fileinput
    23     if fileName is None:
    24         return []
    25     if fileName == "-":
    26         return sys.stdin
    27     if fileName.endswith(".gz"):
    28         return gzip.open(fileName)
    29     return open(fileName)
    30 
    31 def warn(message):
    32     if opts.verbose:
    33         prog = os.path.basename(sys.argv[0])
    34         sys.stderr.write(prog + ": " + message + "\n")
    35 
    36 def groupByFirstItem(things):
    37     for k, v in itertools.groupby(things, itemgetter(0)):
    38         yield k, [i[1:] for i in v]
    39 
    40 def croppedBlocks(blocks, ranges1, ranges2):
    41     headBeg1, headBeg2, headSize = blocks[0]
    42     for r1 in ranges1:
    43         for r2 in ranges2:
    44             cropBeg1, cropEnd1 = r1
    45             if headBeg1 < 0:
    46                 cropBeg1, cropEnd1 = -cropEnd1, -cropBeg1
    47             cropBeg2, cropEnd2 = r2
    48             if headBeg2 < 0:
    49                 cropBeg2, cropEnd2 = -cropEnd2, -cropBeg2
    50             for beg1, beg2, size in blocks:
    51                 b1 = max(cropBeg1, beg1)
    52                 e1 = min(cropEnd1, beg1 + size)
    53                 if b1 >= e1: continue
    54                 offset = beg2 - beg1
    55                 b2 = max(cropBeg2, b1 + offset)
    56                 e2 = min(cropEnd2, e1 + offset)
    57                 if b2 >= e2: continue
    58                 yield b2 - offset, b2, e2 - b2
    59 
    60 def tabBlocks(beg1, beg2, blocks):
    61     '''Get the gapless blocks of an alignment, from LAST tabular format.'''
    62     for i in blocks.split(","):
    63         if ":" in i:
    64             x, y = i.split(":")
    65             beg1 += int(x)
    66             beg2 += int(y)
    67         else:
    68             size = int(i)
    69             yield beg1, beg2, size
    70             beg1 += size
    71             beg2 += size
    72 
    73 def mafBlocks(beg1, beg2, seq1, seq2):
    74     '''Get the gapless blocks of an alignment, from MAF format.'''
    75     size = 0
    76     for x, y in itertools.izip(seq1, seq2):
    77         if x == "-":
    78             if size:
    79                 yield beg1, beg2, size
    80                 beg1 += size
    81                 beg2 += size
    82                 size = 0
    83             beg2 += 1
    84         elif y == "-":
    85             if size:
    86                 yield beg1, beg2, size
    87                 beg1 += size
    88                 beg2 += size
    89                 size = 0
    90             beg1 += 1
    91         else:
    92             size += 1
    93     if size: yield beg1, beg2, size
    94 
    95 def alignmentInput(lines):
    96     '''Get alignments and sequence lengths, from MAF or tabular format.'''
    97     mafCount = 0
    98     for line in lines:
    99         w = line.split()
   100         if line[0].isdigit():  # tabular format
   101             chr1, beg1, seqlen1 = w[1], int(w[2]), int(w[5])
   102             if w[4] == "-": beg1 -= seqlen1
   103             chr2, beg2, seqlen2 = w[6], int(w[7]), int(w[10])
   104             if w[9] == "-": beg2 -= seqlen2
   105             blocks = tabBlocks(beg1, beg2, w[11])
   106             yield chr1, seqlen1, chr2, seqlen2, blocks
   107         elif line[0] == "s":  # MAF format
   108             if mafCount == 0:
   109                 chr1, beg1, seqlen1, seq1 = w[1], int(w[2]), int(w[5]), w[6]
   110                 if w[4] == "-": beg1 -= seqlen1
   111                 mafCount = 1
   112             else:
   113                 chr2, beg2, seqlen2, seq2 = w[1], int(w[2]), int(w[5]), w[6]
   114                 if w[4] == "-": beg2 -= seqlen2
   115                 blocks = mafBlocks(beg1, beg2, seq1, seq2)
   116                 yield chr1, seqlen1, chr2, seqlen2, blocks
   117                 mafCount = 0
   118 
   119 def seqRequestFromText(text):
   120     if ":" in text:
   121         pattern, interval = text.rsplit(":", 1)
   122         if "-" in interval:
   123             beg, end = interval.rsplit("-", 1)
   124             return pattern, int(beg), int(end)  # beg may be negative
   125     return text, 0, sys.maxsize
   126 
   127 def rangesFromSeqName(seqRequests, name, seqLen):
   128     if seqRequests:
   129         base = name.split(".")[-1]  # allow for names like hg19.chr7
   130         for pat, beg, end in seqRequests:
   131             if fnmatchcase(name, pat) or fnmatchcase(base, pat):
   132                 yield max(beg, 0), min(end, seqLen)
   133     else:
   134         yield 0, seqLen
   135 
   136 def updateSeqs(coverDict, seqRanges, seqName, ranges, coveredRange):
   137     beg, end = coveredRange
   138     if beg < 0:
   139         coveredRange = -end, -beg
   140     if seqName in coverDict:
   141         coverDict[seqName].append(coveredRange)
   142     else:
   143         coverDict[seqName] = [coveredRange]
   144         for beg, end in ranges:
   145             r = seqName, beg, end
   146             seqRanges.append(r)
   147 
   148 def readAlignments(fileName, opts):
   149     '''Get alignments and sequence limits, from MAF or tabular format.'''
   150     seqRequests1 = map(seqRequestFromText, opts.seq1)
   151     seqRequests2 = map(seqRequestFromText, opts.seq2)
   152 
   153     alignments = []
   154     seqRanges1 = []
   155     seqRanges2 = []
   156     coverDict1 = {}
   157     coverDict2 = {}
   158     lines = myOpen(fileName)
   159     for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
   160         ranges1 = sorted(rangesFromSeqName(seqRequests1, seqName1, seqLen1))
   161         if not ranges1: continue
   162         ranges2 = sorted(rangesFromSeqName(seqRequests2, seqName2, seqLen2))
   163         if not ranges2: continue
   164         b = list(croppedBlocks(list(blocks), ranges1, ranges2))
   165         if not b: continue
   166         aln = seqName1, seqName2, b
   167         alignments.append(aln)
   168         coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
   169         updateSeqs(coverDict1, seqRanges1, seqName1, ranges1, coveredRange1)
   170         coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
   171         updateSeqs(coverDict2, seqRanges2, seqName2, ranges2, coveredRange2)
   172     return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
   173 
   174 def nameAndRangesFromDict(cropDict, seqName):
   175     if seqName in cropDict:
   176         return seqName, cropDict[seqName]
   177     n = seqName.split(".")[-1]
   178     if n in cropDict:
   179         return n, cropDict[n]
   180     return seqName, []
   181 
   182 def rangesForSecondaryAlignments(primaryRanges, seqLen):
   183     if primaryRanges:
   184         return primaryRanges
   185     return [(0, seqLen)]
   186 
   187 def readSecondaryAlignments(opts, cropRanges1, cropRanges2):
   188     cropDict1 = dict(groupByFirstItem(cropRanges1))
   189     cropDict2 = dict(groupByFirstItem(cropRanges2))
   190 
   191     alignments = []
   192     seqRanges1 = []
   193     seqRanges2 = []
   194     coverDict1 = {}
   195     coverDict2 = {}
   196     lines = myOpen(opts.alignments)
   197     for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
   198         seqName1, ranges1 = nameAndRangesFromDict(cropDict1, seqName1)
   199         seqName2, ranges2 = nameAndRangesFromDict(cropDict2, seqName2)
   200         if not ranges1 and not ranges2:
   201             continue
   202         r1 = rangesForSecondaryAlignments(ranges1, seqLen1)
   203         r2 = rangesForSecondaryAlignments(ranges2, seqLen2)
   204         b = list(croppedBlocks(list(blocks), r1, r2))
   205         if not b: continue
   206         aln = seqName1, seqName2, b
   207         alignments.append(aln)
   208         if not ranges1:
   209             coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
   210             updateSeqs(coverDict1, seqRanges1, seqName1, r1, coveredRange1)
   211         if not ranges2:
   212             coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
   213             updateSeqs(coverDict2, seqRanges2, seqName2, r2, coveredRange2)
   214     return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
   215 
   216 def twoValuesFromOption(text, separator):
   217     if separator in text:
   218         return text.split(separator)
   219     return text, text
   220 
   221 def mergedRanges(ranges):
   222     oldBeg, maxEnd = ranges[0]
   223     for beg, end in ranges:
   224         if beg > maxEnd:
   225             yield oldBeg, maxEnd
   226             oldBeg = beg
   227             maxEnd = end
   228         elif end > maxEnd:
   229             maxEnd = end
   230     yield oldBeg, maxEnd
   231 
   232 def mergedRangesPerSeq(coverDict):
   233     for k, v in coverDict.iteritems():
   234         v.sort()
   235         yield k, list(mergedRanges(v))
   236 
   237 def coveredLength(mergedCoverDict):
   238     return sum(sum(e - b for b, e in v) for v in mergedCoverDict.itervalues())
   239 
   240 def trimmed(seqRanges, coverDict, minAlignedBases, maxGapFrac, endPad, midPad):
   241     maxEndGapFrac, maxMidGapFrac = twoValuesFromOption(maxGapFrac, ",")
   242     maxEndGap = max(float(maxEndGapFrac) * minAlignedBases, endPad * 1.0)
   243     maxMidGap = max(float(maxMidGapFrac) * minAlignedBases, midPad * 2.0)
   244 
   245     for seqName, rangeBeg, rangeEnd in seqRanges:
   246         seqBlocks = coverDict[seqName]
   247         blocks = [i for i in seqBlocks if i[0] < rangeEnd and i[1] > rangeBeg]
   248         if blocks[0][0] - rangeBeg > maxEndGap:
   249             rangeBeg = blocks[0][0] - endPad
   250         for j, y in enumerate(blocks):
   251             if j:
   252                 x = blocks[j - 1]
   253                 if y[0] - x[1] > maxMidGap:
   254                     yield seqName, rangeBeg, x[1] + midPad
   255                     rangeBeg = y[0] - midPad
   256         if rangeEnd - blocks[-1][1] > maxEndGap:
   257             rangeEnd = blocks[-1][1] + endPad
   258         yield seqName, rangeBeg, rangeEnd
   259 
   260 def natural_sort_key(my_string):
   261     '''Return a sort key for "natural" ordering, e.g. chr9 < chr10.'''
   262     parts = re.split(r'(\d+)', my_string)
   263     parts[1::2] = map(int, parts[1::2])
   264     return parts
   265 
   266 def nameKey(oneSeqRanges):
   267     return natural_sort_key(oneSeqRanges[0][0])
   268 
   269 def sizeKey(oneSeqRanges):
   270     return sum(b - e for n, b, e in oneSeqRanges), nameKey(oneSeqRanges)
   271 
   272 def getSortedRanges(seqRanges, sortOpt):
   273     g = [list(v) for k, v in itertools.groupby(seqRanges, itemgetter(0))]
   274     if sortOpt == "1":
   275         g.sort(key=nameKey)
   276     if sortOpt == "2":
   277         g.sort(key=sizeKey)
   278     return [j for i in g for j in i]
   279 
   280 def allSortedRanges(seqRanges, seqRangesB, sortOpt):
   281     x, y = twoValuesFromOption(sortOpt, ":")
   282     return getSortedRanges(seqRanges, x) + getSortedRanges(seqRangesB, y)
   283 
   284 def prettyNum(n):
   285     t = str(n)
   286     groups = []
   287     while t:
   288         groups.append(t[-3:])
   289         t = t[:-3]
   290     return ",".join(reversed(groups))
   291 
   292 def sizeText(size):
   293     suffixes = "bp", "kb", "Mb", "Gb"
   294     for i, x in enumerate(suffixes):
   295         j = 10 ** (i * 3)
   296         if size < j * 10:
   297             return "%.2g" % (1.0 * size / j) + x
   298         if size < j * 1000 or i == len(suffixes) - 1:
   299             return "%.0f" % (1.0 * size / j) + x
   300 
   301 def labelText(seqRange, labelOpt):
   302     seqName, beg, end = seqRange
   303     if labelOpt == 1:
   304         return seqName + ": " + sizeText(end - beg)
   305     if labelOpt == 2:
   306         return seqName + ":" + prettyNum(beg) + ": " + sizeText(end - beg)
   307     if labelOpt == 3:
   308         return seqName + ":" + prettyNum(beg) + "-" + prettyNum(end)
   309     return seqName
   310 
   311 def rangeLabels(seqRanges, labelOpt, font, fontsize, image_mode, textRot):
   312     if fontsize:
   313         image_size = 1, 1
   314         im = Image.new(image_mode, image_size)
   315         draw = ImageDraw.Draw(im)
   316     x = y = 0
   317     for r in seqRanges:
   318         text = labelText(r, labelOpt)
   319         if fontsize:
   320             x, y = draw.textsize(text, font=font)
   321             if textRot:
   322                 x, y = y, x
   323         yield text, x, y
   324 
   325 def dataFromRanges(cutRanges, cutRangesB, sortOpt, font,
   326                    fontsize, image_mode, labelOpt, textRot):
   327     s = allSortedRanges(cutRanges, cutRangesB, sortOpt)
   328     for i in s:
   329         warn("\t".join(map(str, i)))
   330     warn("")
   331     rangeSizes = [e - b for n, b, e in s]
   332     labs = list(rangeLabels(s, labelOpt, font, fontsize, image_mode, textRot))
   333     margin = max(i[2] for i in labs)
   334     # xxx the margin may be too big, because some labels may get omitted
   335     return s, rangeSizes, labs, margin
   336 
   337 def div_ceil(x, y):
   338     '''Return x / y rounded up.'''
   339     q, r = divmod(x, y)
   340     return q + (r != 0)
   341 
   342 def get_bp_per_pix(rangeSizes, pixTweenRanges, maxPixels):
   343     '''Get the minimum bp-per-pixel that fits in the size limit.'''
   344     warn("choosing bp per pixel...")
   345     numOfRanges = len(rangeSizes)
   346     maxPixelsInRanges = maxPixels - pixTweenRanges * (numOfRanges - 1)
   347     if maxPixelsInRanges < numOfRanges:
   348         raise Exception("can't fit the image: too many sequences?")
   349     negLimit = -maxPixelsInRanges
   350     negBpPerPix = sum(rangeSizes) // negLimit
   351     while True:
   352         if sum(i // negBpPerPix for i in rangeSizes) >= negLimit:
   353             return -negBpPerPix
   354         negBpPerPix -= 1
   355 
   356 def getRangePixBegs(rangePixLens, pixTweenRanges, margin):
   357     '''Get the start pixel for each range.'''
   358     rangePixBegs = []
   359     pix_tot = margin - pixTweenRanges
   360     for i in rangePixLens:
   361         pix_tot += pixTweenRanges
   362         rangePixBegs.append(pix_tot)
   363         pix_tot += i
   364     return rangePixBegs
   365 
   366 def pixelData(rangeSizes, bp_per_pix, pixTweenRanges, margin):
   367     '''Return pixel information about the ranges.'''
   368     rangePixLens = [div_ceil(i, bp_per_pix) for i in rangeSizes]
   369     rangePixBegs = getRangePixBegs(rangePixLens, pixTweenRanges, margin)
   370     tot_pix = rangePixBegs[-1] + rangePixLens[-1]
   371     return rangePixBegs, rangePixLens, tot_pix
   372 
   373 def drawLineForward(hits, width, bp_per_pix, beg1, beg2, size):
   374     while True:
   375         q1, r1 = divmod(beg1, bp_per_pix)
   376         q2, r2 = divmod(beg2, bp_per_pix)
   377         hits[q2 * width + q1] |= 1
   378         next_pix = min(bp_per_pix - r1, bp_per_pix - r2)
   379         if next_pix >= size: break
   380         beg1 += next_pix
   381         beg2 += next_pix
   382         size -= next_pix
   383 
   384 def drawLineReverse(hits, width, bp_per_pix, beg1, beg2, size):
   385     beg2 = -1 - beg2
   386     while True:
   387         q1, r1 = divmod(beg1, bp_per_pix)
   388         q2, r2 = divmod(beg2, bp_per_pix)
   389         hits[q2 * width + q1] |= 2
   390         next_pix = min(bp_per_pix - r1, r2 + 1)
   391         if next_pix >= size: break
   392         beg1 += next_pix
   393         beg2 -= next_pix
   394         size -= next_pix
   395 
   396 def findOrigin(ranges, beg, size):
   397     if beg < 0:
   398         beg = -(beg + size)
   399     for rangeBeg, rangeEnd, origin in ranges:
   400         if rangeEnd > beg:
   401             return origin
   402 
   403 def alignmentPixels(width, height, alignments, bp_per_pix,
   404                     rangeDict1, rangeDict2):
   405     hits = [0] * (width * height)  # the image data
   406     for seq1, seq2, blocks in alignments:
   407         beg1, beg2, size = blocks[0]
   408         ori1 = findOrigin(rangeDict1[seq1], beg1, size)
   409         ori2 = findOrigin(rangeDict2[seq2], beg2, size)
   410         for beg1, beg2, size in blocks:
   411             if beg1 < 0:
   412                 beg1 = -(beg1 + size)
   413                 beg2 = -(beg2 + size)
   414             if beg2 >= 0:
   415                 drawLineForward(hits, width, bp_per_pix,
   416                                 beg1 + ori1, beg2 + ori2, size)
   417             else:
   418                 drawLineReverse(hits, width, bp_per_pix,
   419                                 beg1 + ori1, beg2 - ori2, size)
   420     return hits
   421 
   422 def expandedSeqDict(seqDict):
   423     '''Allow lookup by short sequence names, e.g. chr7 as well as hg19.chr7.'''
   424     newDict = seqDict.copy()
   425     for name, x in seqDict.items():
   426         if "." in name:
   427             base = name.split(".")[-1]
   428             if base in newDict:  # an ambiguous case was found:
   429                 return seqDict   # so give up completely
   430             newDict[base] = x
   431     return newDict
   432 
   433 def readBed(fileName, rangeDict):
   434     for line in myOpen(fileName):
   435         w = line.split()
   436         if not w: continue
   437         seqName = w[0]
   438         if seqName not in rangeDict: continue
   439         beg = int(w[1])
   440         end = int(w[2])
   441         layer = 900
   442         color = "#fbf"
   443         if len(w) > 4:
   444             if w[4] != ".":
   445                 layer = float(w[4])
   446             if len(w) > 5:
   447                 if len(w) > 8 and w[8].count(",") == 2:
   448                     color = "rgb(" + w[8] + ")"
   449                 elif w[5] == "+":
   450                     color = "#ffe8e8"
   451                 elif w[5] == "-":
   452                     color = "#e8e8ff"
   453         yield layer, color, seqName, beg, end
   454 
   455 def commaSeparatedInts(text):
   456     return map(int, text.rstrip(",").split(","))
   457 
   458 def readGenePred(opts, fileName, rangeDict):
   459     for line in myOpen(fileName):
   460         fields = line.split()
   461         if not fields: continue
   462         if fields[2] not in "+-": fields = fields[1:]
   463         seqName = fields[1]
   464         if seqName not in rangeDict: continue
   465         #strand = fields[2]
   466         cdsBeg = int(fields[5])
   467         cdsEnd = int(fields[6])
   468         exonBegs = commaSeparatedInts(fields[8])
   469         exonEnds = commaSeparatedInts(fields[9])
   470         for beg, end in zip(exonBegs, exonEnds):
   471             yield 300, opts.exon_color, seqName, beg, end
   472             b = max(beg, cdsBeg)
   473             e = min(end, cdsEnd)
   474             if b < e: yield 400, opts.cds_color, seqName, b, e
   475 
   476 def readRmsk(fileName, rangeDict):
   477     for line in myOpen(fileName):
   478         fields = line.split()
   479         if len(fields) == 17:  # rmsk.txt
   480             seqName = fields[5]
   481             if seqName not in rangeDict: continue  # do this ASAP for speed
   482             beg = int(fields[6])
   483             end = int(fields[7])
   484             strand = fields[9]
   485             repeatClass = fields[11]
   486         elif len(fields) == 15:  # .out
   487             seqName = fields[4]
   488             if seqName not in rangeDict: continue
   489             beg = int(fields[5]) - 1
   490             end = int(fields[6])
   491             strand = fields[8]
   492             repeatClass = fields[10]
   493         else:
   494             continue
   495         if repeatClass in ("Low_complexity", "Simple_repeat"):
   496             yield 200, "#fbf", seqName, beg, end
   497         elif strand == "+":
   498             yield 100, "#ffe8e8", seqName, beg, end
   499         else:
   500             yield 100, "#e8e8ff", seqName, beg, end
   501 
   502 def isExtraFirstGapField(fields):
   503     return fields[4].isdigit()
   504 
   505 def readGaps(opts, fileName, rangeDict):
   506     '''Read locations of unsequenced gaps, from an agp or gap file.'''
   507     for line in myOpen(fileName):
   508         w = line.split()
   509         if not w or w[0][0] == "#": continue
   510         if isExtraFirstGapField(w): w = w[1:]
   511         if w[4] not in "NU": continue
   512         seqName = w[0]
   513         if seqName not in rangeDict: continue
   514         end = int(w[2])
   515         beg = end - int(w[5])  # zero-based coordinate
   516         if w[7] == "yes":
   517             yield 3000, opts.bridged_color, seqName, beg, end
   518         else:
   519             yield 2000, opts.unbridged_color, seqName, beg, end
   520 
   521 def bedBoxes(beds, rangeDict, margin, edge, isTop, bpPerPix):
   522     for layer, color, seqName, bedBeg, bedEnd in beds:
   523         for rangeBeg, rangeEnd, origin in rangeDict[seqName]:
   524             beg = max(bedBeg, rangeBeg)
   525             end = min(bedEnd, rangeEnd)
   526             if beg >= end: continue
   527             if layer <= 1000:
   528                 # include partly-covered pixels
   529                 b = (origin + beg) // bpPerPix
   530                 e = div_ceil(origin + end, bpPerPix)
   531             else:
   532                 # exclude partly-covered pixels
   533                 b = div_ceil(origin + beg, bpPerPix)
   534                 e = (origin + end) // bpPerPix
   535                 if e <= b: continue
   536                 if end == rangeEnd:  # include partly-covered end pixels
   537                     e = div_ceil(origin + end, bpPerPix)
   538             if isTop:
   539                 box = b, margin, e, edge
   540             else:
   541                 box = margin, b, edge, e
   542             yield layer, color, box
   543 
   544 def drawAnnotations(im, boxes):
   545     # xxx use partial transparency for different-color overlaps?
   546     for layer, color, box in boxes:
   547         im.paste(color, box)
   548 
   549 def placedLabels(labels, rangePixBegs, rangePixLens, beg, end):
   550     '''Return axis labels with endpoint & sort-order information.'''
   551     maxWidth = end - beg
   552     for i, j, k in zip(labels, rangePixBegs, rangePixLens):
   553         text, textWidth, textHeight = i
   554         if textWidth > maxWidth:
   555             continue
   556         labelBeg = j + (k - textWidth) // 2
   557         labelEnd = labelBeg + textWidth
   558         sortKey = textWidth - k
   559         if labelBeg < beg:
   560             sortKey += maxWidth * (beg - labelBeg)
   561             labelBeg = beg
   562             labelEnd = beg + textWidth
   563         if labelEnd > end:
   564             sortKey += maxWidth * (labelEnd - end)
   565             labelEnd = end
   566             labelBeg = end - textWidth
   567         yield sortKey, labelBeg, labelEnd, text, textHeight
   568 
   569 def nonoverlappingLabels(labels, minPixTweenLabels):
   570     '''Get a subset of non-overlapping axis labels, greedily.'''
   571     out = []
   572     for i in labels:
   573         beg = i[1] - minPixTweenLabels
   574         end = i[2] + minPixTweenLabels
   575         if all(j[2] <= beg or j[1] >= end for j in out):
   576             out.append(i)
   577     return out
   578 
   579 def axisImage(labels, rangePixBegs, rangePixLens, textRot,
   580               textAln, font, image_mode, opts):
   581     '''Make an image of axis labels.'''
   582     beg = rangePixBegs[0]
   583     end = rangePixBegs[-1] + rangePixLens[-1]
   584     margin = max(i[2] for i in labels)
   585     labels = sorted(placedLabels(labels, rangePixBegs, rangePixLens, beg, end))
   586     minPixTweenLabels = 0 if textRot else opts.label_space
   587     labels = nonoverlappingLabels(labels, minPixTweenLabels)
   588     image_size = (margin, end) if textRot else (end, margin)
   589     im = Image.new(image_mode, image_size, opts.margin_color)
   590     draw = ImageDraw.Draw(im)
   591     for i in labels:
   592         base = margin - i[4] if textAln else 0
   593         position = (base, i[1]) if textRot else (i[1], base)
   594         draw.text(position, i[3], font=font, fill=opts.text_color)
   595     return im
   596 
   597 def rangesWithOrigins(sortedRanges, rangePixBegs, bpPerPix):
   598     for i, j in zip(sortedRanges, rangePixBegs):
   599         seqName, rangeBeg, rangeEnd = i
   600         origin = bpPerPix * j - rangeBeg
   601         yield seqName, (rangeBeg, rangeEnd, origin)
   602 
   603 def rangesPerSeq(sortedRanges, rangePixBegs, bpPerPix):
   604     a = rangesWithOrigins(sortedRanges, rangePixBegs, bpPerPix)
   605     for k, v in itertools.groupby(a, itemgetter(0)):
   606         yield k, [i[1] for i in v]
   607 
   608 def getFont(opts):
   609     if opts.fontfile:
   610         return ImageFont.truetype(opts.fontfile, opts.fontsize)
   611     fileNames = []
   612     try:
   613         x = ["fc-match", "-f%{file}", "arial"]
   614         p = subprocess.Popen(x, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
   615         out, err = p.communicate()
   616         fileNames.append(out)
   617     except OSError as e:
   618         warn("fc-match error: " + str(e))
   619     fileNames.append("/Library/Fonts/Arial.ttf")  # for Mac
   620     for i in fileNames:
   621         try:
   622             font = ImageFont.truetype(i, opts.fontsize)
   623             warn("font: " + i)
   624             return font
   625         except IOError as e:
   626             warn("font load error: " + str(e))
   627     return ImageFont.load_default()
   628 
   629 def lastDotplot(opts, args):
   630     font = getFont(opts)
   631     image_mode = 'RGB'
   632     forward_color = ImageColor.getcolor(opts.forwardcolor, image_mode)
   633     reverse_color = ImageColor.getcolor(opts.reversecolor, image_mode)
   634     zipped_colors = zip(forward_color, reverse_color)
   635     overlap_color = tuple([(i + j) // 2 for i, j in zipped_colors])
   636 
   637     maxGap1, maxGapB1 = twoValuesFromOption(opts.max_gap1, ":")
   638     maxGap2, maxGapB2 = twoValuesFromOption(opts.max_gap2, ":")
   639 
   640     warn("reading alignments...")
   641     alnData = readAlignments(args[0], opts)
   642     alignments, seqRanges1, coverDict1, seqRanges2, coverDict2 = alnData
   643     if not alignments: raise Exception("there are no alignments")
   644     warn("cutting...")
   645     coverDict1 = dict(mergedRangesPerSeq(coverDict1))
   646     coverDict2 = dict(mergedRangesPerSeq(coverDict2))
   647     minAlignedBases = min(coveredLength(coverDict1), coveredLength(coverDict2))
   648     pad = int(opts.pad * minAlignedBases)
   649     cutRanges1 = list(trimmed(seqRanges1, coverDict1, minAlignedBases,
   650                               maxGap1, pad, pad))
   651     cutRanges2 = list(trimmed(seqRanges2, coverDict2, minAlignedBases,
   652                               maxGap2, pad, pad))
   653 
   654     warn("reading secondary alignments...")
   655     alnDataB = readSecondaryAlignments(opts, cutRanges1, cutRanges2)
   656     alignmentsB, seqRangesB1, coverDictB1, seqRangesB2, coverDictB2 = alnDataB
   657     warn("cutting...")
   658     coverDictB1 = dict(mergedRangesPerSeq(coverDictB1))
   659     coverDictB2 = dict(mergedRangesPerSeq(coverDictB2))
   660     cutRangesB1 = trimmed(seqRangesB1, coverDictB1, minAlignedBases,
   661                           maxGapB1, 0, 0)
   662     cutRangesB2 = trimmed(seqRangesB2, coverDictB2, minAlignedBases,
   663                           maxGapB2, 0, 0)
   664 
   665     textRot1 = "vertical".startswith(opts.rot1)
   666     i1 = dataFromRanges(cutRanges1, cutRangesB1, opts.sort1, font,
   667                         opts.fontsize, image_mode, opts.labels1, textRot1)
   668     sortedRanges1, rangeSizes1, labelData1, tMargin = i1
   669 
   670     textRot2 = "horizontal".startswith(opts.rot2)
   671     i2 = dataFromRanges(cutRanges2, cutRangesB2, opts.sort2, font,
   672                         opts.fontsize, image_mode, opts.labels2, textRot2)
   673     sortedRanges2, rangeSizes2, labelData2, lMargin = i2
   674 
   675     maxPixels1 = opts.width  - lMargin
   676     maxPixels2 = opts.height - tMargin
   677     bpPerPix1 = get_bp_per_pix(rangeSizes1, opts.border_pixels, maxPixels1)
   678     bpPerPix2 = get_bp_per_pix(rangeSizes2, opts.border_pixels, maxPixels2)
   679     bpPerPix = max(bpPerPix1, bpPerPix2)
   680     warn("bp per pixel = " + str(bpPerPix))
   681 
   682     p1 = pixelData(rangeSizes1, bpPerPix, opts.border_pixels, lMargin)
   683     rangePixBegs1, rangePixLens1, width = p1
   684     rangeDict1 = dict(rangesPerSeq(sortedRanges1, rangePixBegs1, bpPerPix))
   685 
   686     p2 = pixelData(rangeSizes2, bpPerPix, opts.border_pixels, tMargin)
   687     rangePixBegs2, rangePixLens2, height = p2
   688     rangeDict2 = dict(rangesPerSeq(sortedRanges2, rangePixBegs2, bpPerPix))
   689 
   690     warn("width:  " + str(width))
   691     warn("height: " + str(height))
   692 
   693     warn("processing alignments...")
   694     hits = alignmentPixels(width, height, alignments + alignmentsB, bpPerPix,
   695                            rangeDict1, rangeDict2)
   696 
   697     warn("reading annotations...")
   698 
   699     rangeDict1 = expandedSeqDict(rangeDict1)
   700     beds1 = itertools.chain(readBed(opts.bed1, rangeDict1),
   701                             readRmsk(opts.rmsk1, rangeDict1),
   702                             readGenePred(opts, opts.genePred1, rangeDict1),
   703                             readGaps(opts, opts.gap1, rangeDict1))
   704     b1 = bedBoxes(beds1, rangeDict1, tMargin, height, True, bpPerPix)
   705 
   706     rangeDict2 = expandedSeqDict(rangeDict2)
   707     beds2 = itertools.chain(readBed(opts.bed2, rangeDict2),
   708                             readRmsk(opts.rmsk2, rangeDict2),
   709                             readGenePred(opts, opts.genePred2, rangeDict2),
   710                             readGaps(opts, opts.gap2, rangeDict2))
   711     b2 = bedBoxes(beds2, rangeDict2, lMargin, width, False, bpPerPix)
   712 
   713     boxes = sorted(itertools.chain(b1, b2))
   714 
   715     warn("drawing...")
   716 
   717     image_size = width, height
   718     im = Image.new(image_mode, image_size, opts.background_color)
   719 
   720     drawAnnotations(im, boxes)
   721 
   722     for i in range(height):
   723         for j in range(width):
   724             store_value = hits[i * width + j]
   725             xy = j, i
   726             if   store_value == 1: im.putpixel(xy, forward_color)
   727             elif store_value == 2: im.putpixel(xy, reverse_color)
   728             elif store_value == 3: im.putpixel(xy, overlap_color)
   729 
   730     if opts.fontsize != 0:
   731         axis1 = axisImage(labelData1, rangePixBegs1, rangePixLens1,
   732                           textRot1, False, font, image_mode, opts)
   733         if textRot1:
   734             axis1 = axis1.transpose(Image.ROTATE_90)
   735         axis2 = axisImage(labelData2, rangePixBegs2, rangePixLens2,
   736                           textRot2, textRot2, font, image_mode, opts)
   737         if not textRot2:
   738             axis2 = axis2.transpose(Image.ROTATE_270)
   739         im.paste(axis1, (0, 0))
   740         im.paste(axis2, (0, 0))
   741 
   742     for i in rangePixBegs1[1:]:
   743         box = i - opts.border_pixels, tMargin, i, height
   744         im.paste(opts.border_color, box)
   745 
   746     for i in rangePixBegs2[1:]:
   747         box = lMargin, i - opts.border_pixels, width, i
   748         im.paste(opts.border_color, box)
   749 
   750     im.save(args[1])
   751 
   752 if __name__ == "__main__":
   753     usage = """%prog --help
   754    or: %prog [options] maf-or-tab-alignments dotplot.png
   755    or: %prog [options] maf-or-tab-alignments dotplot.gif
   756    or: ..."""
   757     description = "Draw a dotplot of pair-wise sequence alignments in MAF or tabular format."
   758     op = optparse.OptionParser(usage=usage, description=description)
   759     op.add_option("-v", "--verbose", action="count",
   760                   help="show progress messages & data about the plot")
   761     op.add_option("-1", "--seq1", metavar="PATTERN", action="append",
   762                   default=[],
   763                   help="which sequences to show from the 1st genome")
   764     op.add_option("-2", "--seq2", metavar="PATTERN", action="append",
   765                   default=[],
   766                   help="which sequences to show from the 2nd genome")
   767     # Replace "width" & "height" with a single "length" option?
   768     op.add_option("-x", "--width", type="int", default=1000,
   769                   help="maximum width in pixels (default: %default)")
   770     op.add_option("-y", "--height", type="int", default=1000,
   771                   help="maximum height in pixels (default: %default)")
   772     op.add_option("-c", "--forwardcolor", metavar="COLOR", default="red",
   773                   help="color for forward alignments (default: %default)")
   774     op.add_option("-r", "--reversecolor", metavar="COLOR", default="blue",
   775                   help="color for reverse alignments (default: %default)")
   776     op.add_option("--alignments", metavar="FILE", help="secondary alignments")
   777     op.add_option("--sort1", default="1", metavar="N",
   778                   help="genome1 sequence order: 0=input order, 1=name order, "
   779                   "2=length order (default=%default)")
   780     op.add_option("--sort2", default="1", metavar="N",
   781                   help="genome2 sequence order: 0=input order, 1=name order, "
   782                   "2=length order (default=%default)")
   783     op.add_option("--max-gap1", metavar="FRAC", default="0.5,2", help=
   784                   "maximum unaligned (end,mid) gap in genome1: "
   785                   "fraction of aligned length (default=%default)")
   786     op.add_option("--max-gap2", metavar="FRAC", default="0.5,2", help=
   787                   "maximum unaligned (end,mid) gap in genome2: "
   788                   "fraction of aligned length (default=%default)")
   789     op.add_option("--pad", metavar="FRAC", type="float", default=0.04, help=
   790                   "pad length when cutting unaligned gaps: "
   791                   "fraction of aligned length (default=%default)")
   792     op.add_option("--border-pixels", metavar="INT", type="int", default=1,
   793                   help="number of pixels between sequences (default=%default)")
   794     op.add_option("--border-color", metavar="COLOR", default="black",
   795                   help="color for pixels between sequences (default=%default)")
   796     # --break-color and/or --break-pixels for intra-sequence breaks?
   797     op.add_option("--margin-color", metavar="COLOR", default="#dcdcdc",
   798                   help="margin color")
   799 
   800     og = optparse.OptionGroup(op, "Text options")
   801     og.add_option("-f", "--fontfile", metavar="FILE",
   802                   help="TrueType or OpenType font file")
   803     og.add_option("-s", "--fontsize", metavar="SIZE", type="int", default=14,
   804                   help="TrueType or OpenType font size (default: %default)")
   805     og.add_option("--labels1", type="int", default=0, metavar="N", help=
   806                   "genome1 labels: 0=name, 1=name:length, "
   807                   "2=name:start:length, 3=name:start-end (default=%default)")
   808     og.add_option("--labels2", type="int", default=0, metavar="N", help=
   809                   "genome2 labels: 0=name, 1=name:length, "
   810                   "2=name:start:length, 3=name:start-end (default=%default)")
   811     og.add_option("--rot1", metavar="ROT", default="h",
   812                   help="text rotation for the 1st genome (default=%default)")
   813     og.add_option("--rot2", metavar="ROT", default="v",
   814                   help="text rotation for the 2nd genome (default=%default)")
   815     op.add_option_group(og)
   816 
   817     og = optparse.OptionGroup(op, "Annotation options")
   818     og.add_option("--bed1", metavar="FILE",
   819                   help="read genome1 annotations from BED file")
   820     og.add_option("--bed2", metavar="FILE",
   821                   help="read genome2 annotations from BED file")
   822     og.add_option("--rmsk1", metavar="FILE", help="read genome1 repeats from "
   823                   "RepeatMasker .out or rmsk.txt file")
   824     og.add_option("--rmsk2", metavar="FILE", help="read genome2 repeats from "
   825                   "RepeatMasker .out or rmsk.txt file")
   826     op.add_option_group(og)
   827 
   828     og = optparse.OptionGroup(op, "Gene options")
   829     og.add_option("--genePred1", metavar="FILE",
   830                   help="read genome1 genes from genePred file")
   831     og.add_option("--genePred2", metavar="FILE",
   832                   help="read genome2 genes from genePred file")
   833     og.add_option("--exon-color", metavar="COLOR", default="PaleGreen",
   834                   help="color for exons (default=%default)")
   835     og.add_option("--cds-color", metavar="COLOR", default="LimeGreen",
   836                   help="color for protein-coding regions (default=%default)")
   837     op.add_option_group(og)
   838 
   839     og = optparse.OptionGroup(op, "Unsequenced gap options")
   840     og.add_option("--gap1", metavar="FILE",
   841                   help="read genome1 unsequenced gaps from agp or gap file")
   842     og.add_option("--gap2", metavar="FILE",
   843                   help="read genome2 unsequenced gaps from agp or gap file")
   844     og.add_option("--bridged-color", metavar="COLOR", default="yellow",
   845                   help="color for bridged gaps (default: %default)")
   846     og.add_option("--unbridged-color", metavar="COLOR", default="orange",
   847                   help="color for unbridged gaps (default: %default)")
   848     op.add_option_group(og)
   849     (opts, args) = op.parse_args()
   850     if len(args) != 2: op.error("2 arguments needed")
   851 
   852     opts.text_color = "black"
   853     opts.background_color = "white"
   854     opts.label_space = 5     # minimum number of pixels between axis labels
   855 
   856     try: lastDotplot(opts, args)
   857     except KeyboardInterrupt: pass  # avoid silly error message
   858     except Exception as e:
   859         prog = os.path.basename(sys.argv[0])
   860         sys.exit(prog + ": error: " + str(e))