scripts/last-dotplot
author Martin C. Frith
Wed Nov 08 17:02:56 2017 +0900 (2017-11-08)
changeset 910 42653029d900
parent 909 92dfad507286
child 911 4080f3f6f73d
permissions -rwxr-xr-x
Add gap-cutting options to last-dotplot
     1 #! /usr/bin/env python
     2 
     3 # Read pair-wise alignments in MAF or LAST tabular format: write an
     4 # "Oxford grid", a.k.a. dotplot.
     5 
     6 # TODO: Currently, pixels with zero aligned nt-pairs are white, and
     7 # pixels with one or more aligned nt-pairs are black.  This can look
     8 # too crowded for large genome alignments.  I tried shading each pixel
     9 # according to the number of aligned nt-pairs within it, but the
    10 # result is too faint.  How can this be done better?
    11 
    12 import gzip
    13 from fnmatch import fnmatchcase
    14 from operator import itemgetter
    15 import subprocess
    16 import itertools, optparse, os, re, sys
    17 
    18 # Try to make PIL/PILLOW work:
    19 try: from PIL import Image, ImageDraw, ImageFont, ImageColor
    20 except ImportError: import Image, ImageDraw, ImageFont, ImageColor
    21 
    22 def myOpen(fileName):  # faster than fileinput
    23     if fileName is None:
    24         return []
    25     if fileName == "-":
    26         return sys.stdin
    27     if fileName.endswith(".gz"):
    28         return gzip.open(fileName)
    29     return open(fileName)
    30 
    31 def warn(message):
    32     if opts.verbose:
    33         prog = os.path.basename(sys.argv[0])
    34         sys.stderr.write(prog + ": " + message + "\n")
    35 
    36 def croppedBlocks(blocks, ranges1, ranges2):
    37     headBeg1, headBeg2, headSize = blocks[0]
    38     for r1 in ranges1:
    39         for r2 in ranges2:
    40             cropBeg1, cropEnd1 = r1
    41             if headBeg1 < 0:
    42                 cropBeg1, cropEnd1 = -cropEnd1, -cropBeg1
    43             cropBeg2, cropEnd2 = r2
    44             if headBeg2 < 0:
    45                 cropBeg2, cropEnd2 = -cropEnd2, -cropBeg2
    46             for beg1, beg2, size in blocks:
    47                 b1 = max(cropBeg1, beg1)
    48                 e1 = min(cropEnd1, beg1 + size)
    49                 if b1 >= e1: continue
    50                 offset = beg2 - beg1
    51                 b2 = max(cropBeg2, b1 + offset)
    52                 e2 = min(cropEnd2, e1 + offset)
    53                 if b2 >= e2: continue
    54                 yield b2 - offset, b2, e2 - b2
    55 
    56 def tabBlocks(beg1, beg2, blocks):
    57     '''Get the gapless blocks of an alignment, from LAST tabular format.'''
    58     for i in blocks.split(","):
    59         if ":" in i:
    60             x, y = i.split(":")
    61             beg1 += int(x)
    62             beg2 += int(y)
    63         else:
    64             size = int(i)
    65             yield beg1, beg2, size
    66             beg1 += size
    67             beg2 += size
    68 
    69 def mafBlocks(beg1, beg2, seq1, seq2):
    70     '''Get the gapless blocks of an alignment, from MAF format.'''
    71     size = 0
    72     for x, y in itertools.izip(seq1, seq2):
    73         if x == "-":
    74             if size:
    75                 yield beg1, beg2, size
    76                 beg1 += size
    77                 beg2 += size
    78                 size = 0
    79             beg2 += 1
    80         elif y == "-":
    81             if size:
    82                 yield beg1, beg2, size
    83                 beg1 += size
    84                 beg2 += size
    85                 size = 0
    86             beg1 += 1
    87         else:
    88             size += 1
    89     if size: yield beg1, beg2, size
    90 
    91 def alignmentInput(lines):
    92     '''Get alignments and sequence lengths, from MAF or tabular format.'''
    93     mafCount = 0
    94     for line in lines:
    95         w = line.split()
    96         if line[0].isdigit():  # tabular format
    97             chr1, beg1, seqlen1 = w[1], int(w[2]), int(w[5])
    98             if w[4] == "-": beg1 -= seqlen1
    99             chr2, beg2, seqlen2 = w[6], int(w[7]), int(w[10])
   100             if w[9] == "-": beg2 -= seqlen2
   101             blocks = tabBlocks(beg1, beg2, w[11])
   102             yield chr1, seqlen1, chr2, seqlen2, blocks
   103         elif line[0] == "s":  # MAF format
   104             if mafCount == 0:
   105                 chr1, beg1, seqlen1, seq1 = w[1], int(w[2]), int(w[5]), w[6]
   106                 if w[4] == "-": beg1 -= seqlen1
   107                 mafCount = 1
   108             else:
   109                 chr2, beg2, seqlen2, seq2 = w[1], int(w[2]), int(w[5]), w[6]
   110                 if w[4] == "-": beg2 -= seqlen2
   111                 blocks = mafBlocks(beg1, beg2, seq1, seq2)
   112                 yield chr1, seqlen1, chr2, seqlen2, blocks
   113                 mafCount = 0
   114 
   115 def seqRequestFromText(text):
   116     if ":" in text:
   117         pattern, interval = text.rsplit(":", 1)
   118         if "-" in interval:
   119             beg, end = interval.rsplit("-", 1)
   120             return pattern, int(beg), int(end)  # beg may be negative
   121     return text, 0, sys.maxsize
   122 
   123 def rangesFromSeqName(seqRequests, name, seqLen):
   124     if seqRequests:
   125         base = name.split(".")[-1]  # allow for names like hg19.chr7
   126         for pat, beg, end in seqRequests:
   127             if fnmatchcase(name, pat) or fnmatchcase(base, pat):
   128                 yield max(beg, 0), min(end, seqLen)
   129     else:
   130         yield 0, seqLen
   131 
   132 def updateSeqs(coverDict, seqRanges, seqName, ranges, coveredRange):
   133     beg, end = coveredRange
   134     if beg < 0:
   135         coveredRange = -end, -beg
   136     if seqName in coverDict:
   137         coverDict[seqName].append(coveredRange)
   138     else:
   139         coverDict[seqName] = [coveredRange]
   140         for beg, end in ranges:
   141             r = seqName, beg, end
   142             seqRanges.append(r)
   143 
   144 def readAlignments(fileName, opts):
   145     '''Get alignments and sequence limits, from MAF or tabular format.'''
   146     seqRequests1 = map(seqRequestFromText, opts.seq1)
   147     seqRequests2 = map(seqRequestFromText, opts.seq2)
   148 
   149     alignments = []
   150     seqRanges1 = []
   151     seqRanges2 = []
   152     coverDict1 = {}
   153     coverDict2 = {}
   154     lines = myOpen(fileName)
   155     for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
   156         ranges1 = sorted(rangesFromSeqName(seqRequests1, seqName1, seqLen1))
   157         if not ranges1: continue
   158         ranges2 = sorted(rangesFromSeqName(seqRequests2, seqName2, seqLen2))
   159         if not ranges2: continue
   160         b = list(croppedBlocks(list(blocks), ranges1, ranges2))
   161         if not b: continue
   162         aln = seqName1, seqName2, b
   163         alignments.append(aln)
   164         coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
   165         updateSeqs(coverDict1, seqRanges1, seqName1, ranges1, coveredRange1)
   166         coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
   167         updateSeqs(coverDict2, seqRanges2, seqName2, ranges2, coveredRange2)
   168     return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
   169 
   170 def twoValuesFromOption(text, separator):
   171     if separator in text:
   172         return text.split(separator)
   173     return text, text
   174 
   175 def mergedRanges(ranges):
   176     oldBeg, maxEnd = ranges[0]
   177     for beg, end in ranges:
   178         if beg > maxEnd:
   179             yield oldBeg, maxEnd
   180             oldBeg = beg
   181             maxEnd = end
   182         elif end > maxEnd:
   183             maxEnd = end
   184     yield oldBeg, maxEnd
   185 
   186 def mergedRangesPerSeq(coverDict):
   187     for k, v in coverDict.iteritems():
   188         v.sort()
   189         yield k, list(mergedRanges(v))
   190 
   191 def coveredLength(mergedCoverDict):
   192     return sum(sum(e - b for b, e in v) for v in mergedCoverDict.itervalues())
   193 
   194 def trimmed(seqRanges, coverDict, minAlignedBases, maxGapFrac, endPad, midPad):
   195     maxEndGapFrac, maxMidGapFrac = twoValuesFromOption(maxGapFrac, ",")
   196     maxEndGap = max(float(maxEndGapFrac) * minAlignedBases, endPad * 1.0)
   197     maxMidGap = max(float(maxMidGapFrac) * minAlignedBases, midPad * 2.0)
   198 
   199     for seqName, rangeBeg, rangeEnd in seqRanges:
   200         seqBlocks = coverDict[seqName]
   201         blocks = [i for i in seqBlocks if i[0] < rangeEnd and i[1] > rangeBeg]
   202         if blocks[0][0] - rangeBeg > maxEndGap:
   203             rangeBeg = blocks[0][0] - endPad
   204         for j, y in enumerate(blocks):
   205             if j:
   206                 x = blocks[j - 1]
   207                 if y[0] - x[1] > maxMidGap:
   208                     yield seqName, rangeBeg, x[1] + midPad
   209                     rangeBeg = y[0] - midPad
   210         if rangeEnd - blocks[-1][1] > maxEndGap:
   211             rangeEnd = blocks[-1][1] + endPad
   212         yield seqName, rangeBeg, rangeEnd
   213 
   214 def natural_sort_key(my_string):
   215     '''Return a sort key for "natural" ordering, e.g. chr9 < chr10.'''
   216     parts = re.split(r'(\d+)', my_string)
   217     parts[1::2] = map(int, parts[1::2])
   218     return parts
   219 
   220 def nameKey(oneSeqRanges):
   221     return natural_sort_key(oneSeqRanges[0][0])
   222 
   223 def sizeKey(oneSeqRanges):
   224     return sum(b - e for n, b, e in oneSeqRanges), nameKey(oneSeqRanges)
   225 
   226 def getSortedRanges(seqRanges, sortOpt):
   227     g = [list(v) for k, v in itertools.groupby(seqRanges, itemgetter(0))]
   228     if sortOpt == "1":
   229         g.sort(key=nameKey)
   230     if sortOpt == "2":
   231         g.sort(key=sizeKey)
   232     return [j for i in g for j in i]
   233 
   234 def prettyNum(n):
   235     t = str(n)
   236     groups = []
   237     while t:
   238         groups.append(t[-3:])
   239         t = t[:-3]
   240     return ",".join(reversed(groups))
   241 
   242 def sizeText(size):
   243     suffixes = "bp", "kb", "Mb", "Gb"
   244     for i, x in enumerate(suffixes):
   245         j = 10 ** (i * 3)
   246         if size < j * 10:
   247             return "%.2g" % (1.0 * size / j) + x
   248         if size < j * 1000 or i == len(suffixes) - 1:
   249             return "%.0f" % (1.0 * size / j) + x
   250 
   251 def labelText(seqRange, labelOpt):
   252     seqName, beg, end = seqRange
   253     if labelOpt == 1:
   254         return seqName + ": " + sizeText(end - beg)
   255     if labelOpt == 2:
   256         return seqName + ":" + prettyNum(beg) + ": " + sizeText(end - beg)
   257     if labelOpt == 3:
   258         return seqName + ":" + prettyNum(beg) + "-" + prettyNum(end)
   259     return seqName
   260 
   261 def rangeLabels(seqRanges, labelOpt, font, fontsize, image_mode, textRot):
   262     if fontsize:
   263         image_size = 1, 1
   264         im = Image.new(image_mode, image_size)
   265         draw = ImageDraw.Draw(im)
   266     x = y = 0
   267     for r in seqRanges:
   268         text = labelText(r, labelOpt)
   269         if fontsize:
   270             x, y = draw.textsize(text, font=font)
   271             if textRot:
   272                 x, y = y, x
   273         yield text, x, y
   274 
   275 def dataFromRanges(seqRanges, sortOpt, font,
   276                    fontsize, image_mode, labelOpt, textRot):
   277     s = getSortedRanges(seqRanges, sortOpt)
   278     for i in s:
   279         warn("\t".join(map(str, i)))
   280     warn("")
   281     rangeSizes = [e - b for n, b, e in s]
   282     labs = list(rangeLabels(s, labelOpt, font, fontsize, image_mode, textRot))
   283     margin = max(i[2] for i in labs)
   284     # xxx the margin may be too big, because some labels may get omitted
   285     return s, rangeSizes, labs, margin
   286 
   287 def div_ceil(x, y):
   288     '''Return x / y rounded up.'''
   289     q, r = divmod(x, y)
   290     return q + (r != 0)
   291 
   292 def get_bp_per_pix(rangeSizes, pixTweenRanges, maxPixels):
   293     '''Get the minimum bp-per-pixel that fits in the size limit.'''
   294     warn("choosing bp per pixel...")
   295     numOfRanges = len(rangeSizes)
   296     maxPixelsInRanges = maxPixels - pixTweenRanges * (numOfRanges - 1)
   297     if maxPixelsInRanges < numOfRanges:
   298         raise Exception("can't fit the image: too many sequences?")
   299     negLimit = -maxPixelsInRanges
   300     negBpPerPix = sum(rangeSizes) // negLimit
   301     while True:
   302         if sum(i // negBpPerPix for i in rangeSizes) >= negLimit:
   303             return -negBpPerPix
   304         negBpPerPix -= 1
   305 
   306 def getRangePixBegs(rangePixLens, pixTweenRanges, margin):
   307     '''Get the start pixel for each range.'''
   308     rangePixBegs = []
   309     pix_tot = margin - pixTweenRanges
   310     for i in rangePixLens:
   311         pix_tot += pixTweenRanges
   312         rangePixBegs.append(pix_tot)
   313         pix_tot += i
   314     return rangePixBegs
   315 
   316 def pixelData(rangeSizes, bp_per_pix, pixTweenRanges, margin):
   317     '''Return pixel information about the ranges.'''
   318     rangePixLens = [div_ceil(i, bp_per_pix) for i in rangeSizes]
   319     rangePixBegs = getRangePixBegs(rangePixLens, pixTweenRanges, margin)
   320     tot_pix = rangePixBegs[-1] + rangePixLens[-1]
   321     return rangePixBegs, rangePixLens, tot_pix
   322 
   323 def drawLineForward(hits, width, bp_per_pix, beg1, beg2, size):
   324     while True:
   325         q1, r1 = divmod(beg1, bp_per_pix)
   326         q2, r2 = divmod(beg2, bp_per_pix)
   327         hits[q2 * width + q1] |= 1
   328         next_pix = min(bp_per_pix - r1, bp_per_pix - r2)
   329         if next_pix >= size: break
   330         beg1 += next_pix
   331         beg2 += next_pix
   332         size -= next_pix
   333 
   334 def drawLineReverse(hits, width, bp_per_pix, beg1, beg2, size):
   335     beg2 = -1 - beg2
   336     while True:
   337         q1, r1 = divmod(beg1, bp_per_pix)
   338         q2, r2 = divmod(beg2, bp_per_pix)
   339         hits[q2 * width + q1] |= 2
   340         next_pix = min(bp_per_pix - r1, r2 + 1)
   341         if next_pix >= size: break
   342         beg1 += next_pix
   343         beg2 -= next_pix
   344         size -= next_pix
   345 
   346 def findOrigin(ranges, beg, size):
   347     if beg < 0:
   348         beg = -(beg + size)
   349     for rangeBeg, rangeEnd, origin in ranges:
   350         if rangeEnd > beg:
   351             return origin
   352 
   353 def alignmentPixels(width, height, alignments, bp_per_pix,
   354                     rangeDict1, rangeDict2):
   355     hits = [0] * (width * height)  # the image data
   356     for seq1, seq2, blocks in alignments:
   357         beg1, beg2, size = blocks[0]
   358         ori1 = findOrigin(rangeDict1[seq1], beg1, size)
   359         ori2 = findOrigin(rangeDict2[seq2], beg2, size)
   360         for beg1, beg2, size in blocks:
   361             if beg1 < 0:
   362                 beg1 = -(beg1 + size)
   363                 beg2 = -(beg2 + size)
   364             if beg2 >= 0:
   365                 drawLineForward(hits, width, bp_per_pix,
   366                                 beg1 + ori1, beg2 + ori2, size)
   367             else:
   368                 drawLineReverse(hits, width, bp_per_pix,
   369                                 beg1 + ori1, beg2 - ori2, size)
   370     return hits
   371 
   372 def expandedSeqDict(seqDict):
   373     '''Allow lookup by short sequence names, e.g. chr7 as well as hg19.chr7.'''
   374     newDict = seqDict.copy()
   375     for name, x in seqDict.items():
   376         if "." in name:
   377             base = name.split(".")[-1]
   378             if base in newDict:  # an ambiguous case was found:
   379                 return seqDict   # so give up completely
   380             newDict[base] = x
   381     return newDict
   382 
   383 def readBed(fileName, rangeDict):
   384     for line in myOpen(fileName):
   385         w = line.split()
   386         if not w: continue
   387         seqName = w[0]
   388         if seqName not in rangeDict: continue
   389         beg = int(w[1])
   390         end = int(w[2])
   391         layer = 900
   392         color = "#fbf"
   393         if len(w) > 4:
   394             if w[4] != ".":
   395                 layer = float(w[4])
   396             if len(w) > 5:
   397                 if len(w) > 8 and w[8].count(",") == 2:
   398                     color = "rgb(" + w[8] + ")"
   399                 elif w[5] == "+":
   400                     color = "#ffe8e8"
   401                 elif w[5] == "-":
   402                     color = "#e8e8ff"
   403         yield layer, color, seqName, beg, end
   404 
   405 def commaSeparatedInts(text):
   406     return map(int, text.rstrip(",").split(","))
   407 
   408 def readGenePred(opts, fileName, rangeDict):
   409     for line in myOpen(fileName):
   410         fields = line.split()
   411         if not fields: continue
   412         if fields[2] not in "+-": fields = fields[1:]
   413         seqName = fields[1]
   414         if seqName not in rangeDict: continue
   415         #strand = fields[2]
   416         cdsBeg = int(fields[5])
   417         cdsEnd = int(fields[6])
   418         exonBegs = commaSeparatedInts(fields[8])
   419         exonEnds = commaSeparatedInts(fields[9])
   420         for beg, end in zip(exonBegs, exonEnds):
   421             yield 300, opts.exon_color, seqName, beg, end
   422             b = max(beg, cdsBeg)
   423             e = min(end, cdsEnd)
   424             if b < e: yield 400, opts.cds_color, seqName, b, e
   425 
   426 def readRmsk(fileName, rangeDict):
   427     for line in myOpen(fileName):
   428         fields = line.split()
   429         if len(fields) == 17:  # rmsk.txt
   430             seqName = fields[5]
   431             if seqName not in rangeDict: continue  # do this ASAP for speed
   432             beg = int(fields[6])
   433             end = int(fields[7])
   434             strand = fields[9]
   435             repeatClass = fields[11]
   436         elif len(fields) == 15:  # .out
   437             seqName = fields[4]
   438             if seqName not in rangeDict: continue
   439             beg = int(fields[5]) - 1
   440             end = int(fields[6])
   441             strand = fields[8]
   442             repeatClass = fields[10]
   443         else:
   444             continue
   445         if repeatClass in ("Low_complexity", "Simple_repeat"):
   446             yield 200, "#fbf", seqName, beg, end
   447         elif strand == "+":
   448             yield 100, "#ffe8e8", seqName, beg, end
   449         else:
   450             yield 100, "#e8e8ff", seqName, beg, end
   451 
   452 def isExtraFirstGapField(fields):
   453     return fields[4].isdigit()
   454 
   455 def readGaps(opts, fileName, rangeDict):
   456     '''Read locations of unsequenced gaps, from an agp or gap file.'''
   457     for line in myOpen(fileName):
   458         w = line.split()
   459         if not w or w[0][0] == "#": continue
   460         if isExtraFirstGapField(w): w = w[1:]
   461         if w[4] not in "NU": continue
   462         seqName = w[0]
   463         if seqName not in rangeDict: continue
   464         end = int(w[2])
   465         beg = end - int(w[5])  # zero-based coordinate
   466         if w[7] == "yes":
   467             yield 3000, opts.bridged_color, seqName, beg, end
   468         else:
   469             yield 2000, opts.unbridged_color, seqName, beg, end
   470 
   471 def bedBoxes(beds, rangeDict, margin, edge, isTop, bpPerPix):
   472     for layer, color, seqName, bedBeg, bedEnd in beds:
   473         for rangeBeg, rangeEnd, origin in rangeDict[seqName]:
   474             beg = max(bedBeg, rangeBeg)
   475             end = min(bedEnd, rangeEnd)
   476             if beg >= end: continue
   477             if layer <= 1000:
   478                 # include partly-covered pixels
   479                 b = (origin + beg) // bpPerPix
   480                 e = div_ceil(origin + end, bpPerPix)
   481             else:
   482                 # exclude partly-covered pixels
   483                 b = div_ceil(origin + beg, bpPerPix)
   484                 e = (origin + end) // bpPerPix
   485                 if e <= b: continue
   486             if isTop:
   487                 box = b, margin, e, edge
   488             else:
   489                 box = margin, b, edge, e
   490             yield layer, color, box
   491 
   492 def drawAnnotations(im, boxes):
   493     # xxx use partial transparency for different-color overlaps?
   494     for layer, color, box in boxes:
   495         im.paste(color, box)
   496 
   497 def placedLabels(labels, rangePixBegs, rangePixLens, beg, end):
   498     '''Return axis labels with endpoint & sort-order information.'''
   499     maxWidth = end - beg
   500     for i, j, k in zip(labels, rangePixBegs, rangePixLens):
   501         text, textWidth, textHeight = i
   502         if textWidth > maxWidth:
   503             continue
   504         labelBeg = j + (k - textWidth) // 2
   505         labelEnd = labelBeg + textWidth
   506         sortKey = textWidth - k
   507         if labelBeg < beg:
   508             sortKey += maxWidth * (beg - labelBeg)
   509             labelBeg = beg
   510             labelEnd = beg + textWidth
   511         if labelEnd > end:
   512             sortKey += maxWidth * (labelEnd - end)
   513             labelEnd = end
   514             labelBeg = end - textWidth
   515         yield sortKey, labelBeg, labelEnd, text, textHeight
   516 
   517 def nonoverlappingLabels(labels, minPixTweenLabels):
   518     '''Get a subset of non-overlapping axis labels, greedily.'''
   519     out = []
   520     for i in labels:
   521         beg = i[1] - minPixTweenLabels
   522         end = i[2] + minPixTweenLabels
   523         if all(j[2] <= beg or j[1] >= end for j in out):
   524             out.append(i)
   525     return out
   526 
   527 def axisImage(labels, rangePixBegs, rangePixLens, textRot,
   528               textAln, font, image_mode, opts):
   529     '''Make an image of axis labels.'''
   530     beg = rangePixBegs[0]
   531     end = rangePixBegs[-1] + rangePixLens[-1]
   532     margin = max(i[2] for i in labels)
   533     labels = sorted(placedLabels(labels, rangePixBegs, rangePixLens, beg, end))
   534     minPixTweenLabels = 0 if textRot else opts.label_space
   535     labels = nonoverlappingLabels(labels, minPixTweenLabels)
   536     image_size = (margin, end) if textRot else (end, margin)
   537     im = Image.new(image_mode, image_size, opts.margin_color)
   538     draw = ImageDraw.Draw(im)
   539     for i in labels:
   540         base = margin - i[4] if textAln else 0
   541         position = (base, i[1]) if textRot else (i[1], base)
   542         draw.text(position, i[3], font=font, fill=opts.text_color)
   543     return im
   544 
   545 def rangesWithOrigins(sortedRanges, rangePixBegs, bpPerPix):
   546     for i, j in zip(sortedRanges, rangePixBegs):
   547         seqName, rangeBeg, rangeEnd = i
   548         origin = bpPerPix * j - rangeBeg
   549         yield seqName, (rangeBeg, rangeEnd, origin)
   550 
   551 def rangesPerSeq(sortedRanges, rangePixBegs, bpPerPix):
   552     a = rangesWithOrigins(sortedRanges, rangePixBegs, bpPerPix)
   553     for k, v in itertools.groupby(a, itemgetter(0)):
   554         yield k, [i[1] for i in v]
   555 
   556 def getFont(opts):
   557     if opts.fontfile:
   558         return ImageFont.truetype(opts.fontfile, opts.fontsize)
   559     fileNames = []
   560     try:
   561         x = ["fc-match", "-f%{file}", "arial"]
   562         p = subprocess.Popen(x, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
   563         out, err = p.communicate()
   564         fileNames.append(out)
   565     except OSError as e:
   566         warn("fc-match error: " + str(e))
   567     fileNames.append("/Library/Fonts/Arial.ttf")  # for Mac
   568     for i in fileNames:
   569         try:
   570             font = ImageFont.truetype(i, opts.fontsize)
   571             warn("font: " + i)
   572             return font
   573         except IOError as e:
   574             warn("font load error: " + str(e))
   575     return ImageFont.load_default()
   576 
   577 def lastDotplot(opts, args):
   578     font = getFont(opts)
   579     image_mode = 'RGB'
   580     forward_color = ImageColor.getcolor(opts.forwardcolor, image_mode)
   581     reverse_color = ImageColor.getcolor(opts.reversecolor, image_mode)
   582     zipped_colors = zip(forward_color, reverse_color)
   583     overlap_color = tuple([(i + j) // 2 for i, j in zipped_colors])
   584 
   585     warn("reading alignments...")
   586     alnData = readAlignments(args[0], opts)
   587     alignments, seqRanges1, coverDict1, seqRanges2, coverDict2 = alnData
   588     if not alignments: raise Exception("there are no alignments")
   589     warn("cutting...")
   590     coverDict1 = dict(mergedRangesPerSeq(coverDict1))
   591     coverDict2 = dict(mergedRangesPerSeq(coverDict2))
   592     minAlignedBases = min(coveredLength(coverDict1), coveredLength(coverDict2))
   593     pad = int(opts.pad * minAlignedBases)
   594     cutRanges1 = list(trimmed(seqRanges1, coverDict1, minAlignedBases,
   595                               opts.max_gap1, pad, pad))
   596     cutRanges2 = list(trimmed(seqRanges2, coverDict2, minAlignedBases,
   597                               opts.max_gap2, pad, pad))
   598 
   599     textRot1 = "vertical".startswith(opts.rot1)
   600     i1 = dataFromRanges(cutRanges1, opts.sort1, font,
   601                         opts.fontsize, image_mode, opts.labels1, textRot1)
   602     sortedRanges1, rangeSizes1, labelData1, tMargin = i1
   603 
   604     textRot2 = "horizontal".startswith(opts.rot2)
   605     i2 = dataFromRanges(cutRanges2, opts.sort2, font,
   606                         opts.fontsize, image_mode, opts.labels2, textRot2)
   607     sortedRanges2, rangeSizes2, labelData2, lMargin = i2
   608 
   609     maxPixels1 = opts.width  - lMargin
   610     maxPixels2 = opts.height - tMargin
   611     bpPerPix1 = get_bp_per_pix(rangeSizes1, opts.border_pixels, maxPixels1)
   612     bpPerPix2 = get_bp_per_pix(rangeSizes2, opts.border_pixels, maxPixels2)
   613     bpPerPix = max(bpPerPix1, bpPerPix2)
   614     warn("bp per pixel = " + str(bpPerPix))
   615 
   616     p1 = pixelData(rangeSizes1, bpPerPix, opts.border_pixels, lMargin)
   617     rangePixBegs1, rangePixLens1, width = p1
   618     rangeDict1 = dict(rangesPerSeq(sortedRanges1, rangePixBegs1, bpPerPix))
   619 
   620     p2 = pixelData(rangeSizes2, bpPerPix, opts.border_pixels, tMargin)
   621     rangePixBegs2, rangePixLens2, height = p2
   622     rangeDict2 = dict(rangesPerSeq(sortedRanges2, rangePixBegs2, bpPerPix))
   623 
   624     warn("width:  " + str(width))
   625     warn("height: " + str(height))
   626 
   627     warn("processing alignments...")
   628     hits = alignmentPixels(width, height, alignments, bpPerPix,
   629                            rangeDict1, rangeDict2)
   630 
   631     warn("reading annotations...")
   632 
   633     rangeDict1 = expandedSeqDict(rangeDict1)
   634     beds1 = itertools.chain(readBed(opts.bed1, rangeDict1),
   635                             readRmsk(opts.rmsk1, rangeDict1),
   636                             readGenePred(opts, opts.genePred1, rangeDict1),
   637                             readGaps(opts, opts.gap1, rangeDict1))
   638     b1 = bedBoxes(beds1, rangeDict1, tMargin, height, True, bpPerPix)
   639 
   640     rangeDict2 = expandedSeqDict(rangeDict2)
   641     beds2 = itertools.chain(readBed(opts.bed2, rangeDict2),
   642                             readRmsk(opts.rmsk2, rangeDict2),
   643                             readGenePred(opts, opts.genePred2, rangeDict2),
   644                             readGaps(opts, opts.gap2, rangeDict2))
   645     b2 = bedBoxes(beds2, rangeDict2, lMargin, width, False, bpPerPix)
   646 
   647     boxes = sorted(itertools.chain(b1, b2))
   648 
   649     warn("drawing...")
   650 
   651     image_size = width, height
   652     im = Image.new(image_mode, image_size, opts.background_color)
   653 
   654     drawAnnotations(im, boxes)
   655 
   656     for i in range(height):
   657         for j in range(width):
   658             store_value = hits[i * width + j]
   659             xy = j, i
   660             if   store_value == 1: im.putpixel(xy, forward_color)
   661             elif store_value == 2: im.putpixel(xy, reverse_color)
   662             elif store_value == 3: im.putpixel(xy, overlap_color)
   663 
   664     if opts.fontsize != 0:
   665         axis1 = axisImage(labelData1, rangePixBegs1, rangePixLens1,
   666                           textRot1, False, font, image_mode, opts)
   667         if textRot1:
   668             axis1 = axis1.transpose(Image.ROTATE_90)
   669         axis2 = axisImage(labelData2, rangePixBegs2, rangePixLens2,
   670                           textRot2, textRot2, font, image_mode, opts)
   671         if not textRot2:
   672             axis2 = axis2.transpose(Image.ROTATE_270)
   673         im.paste(axis1, (0, 0))
   674         im.paste(axis2, (0, 0))
   675 
   676     for i in rangePixBegs1[1:]:
   677         box = i - opts.border_pixels, tMargin, i, height
   678         im.paste(opts.border_color, box)
   679 
   680     for i in rangePixBegs2[1:]:
   681         box = lMargin, i - opts.border_pixels, width, i
   682         im.paste(opts.border_color, box)
   683 
   684     im.save(args[1])
   685 
   686 if __name__ == "__main__":
   687     usage = """%prog --help
   688    or: %prog [options] maf-or-tab-alignments dotplot.png
   689    or: %prog [options] maf-or-tab-alignments dotplot.gif
   690    or: ..."""
   691     description = "Draw a dotplot of pair-wise sequence alignments in MAF or tabular format."
   692     op = optparse.OptionParser(usage=usage, description=description)
   693     op.add_option("-v", "--verbose", action="count",
   694                   help="show progress messages & data about the plot")
   695     op.add_option("-1", "--seq1", metavar="PATTERN", action="append",
   696                   default=[],
   697                   help="which sequences to show from the 1st genome")
   698     op.add_option("-2", "--seq2", metavar="PATTERN", action="append",
   699                   default=[],
   700                   help="which sequences to show from the 2nd genome")
   701     # Replace "width" & "height" with a single "length" option?
   702     op.add_option("-x", "--width", type="int", default=1000,
   703                   help="maximum width in pixels (default: %default)")
   704     op.add_option("-y", "--height", type="int", default=1000,
   705                   help="maximum height in pixels (default: %default)")
   706     op.add_option("-c", "--forwardcolor", metavar="COLOR", default="red",
   707                   help="color for forward alignments (default: %default)")
   708     op.add_option("-r", "--reversecolor", metavar="COLOR", default="blue",
   709                   help="color for reverse alignments (default: %default)")
   710     op.add_option("--sort1", default="1", metavar="N",
   711                   help="genome1 sequence order: 0=input order, 1=name order, "
   712                   "2=length order (default=%default)")
   713     op.add_option("--sort2", default="1", metavar="N",
   714                   help="genome2 sequence order: 0=input order, 1=name order, "
   715                   "2=length order (default=%default)")
   716     op.add_option("--max-gap1", metavar="FRAC", default="0.5,2", help=
   717                   "maximum unaligned (end,mid) gap in genome1: "
   718                   "fraction of aligned length (default=%default)")
   719     op.add_option("--max-gap2", metavar="FRAC", default="0.5,2", help=
   720                   "maximum unaligned (end,mid) gap in genome2: "
   721                   "fraction of aligned length (default=%default)")
   722     op.add_option("--pad", metavar="FRAC", type="float", default=0.04, help=
   723                   "pad length when cutting unaligned gaps: "
   724                   "fraction of aligned length (default=%default)")
   725     op.add_option("--border-pixels", metavar="INT", type="int", default=1,
   726                   help="number of pixels between sequences (default=%default)")
   727     op.add_option("--border-color", metavar="COLOR", default="black",
   728                   help="color for pixels between sequences (default=%default)")
   729     op.add_option("--margin-color", metavar="COLOR", default="#dcdcdc",
   730                   help="margin color")
   731 
   732     og = optparse.OptionGroup(op, "Text options")
   733     og.add_option("-f", "--fontfile", metavar="FILE",
   734                   help="TrueType or OpenType font file")
   735     og.add_option("-s", "--fontsize", metavar="SIZE", type="int", default=14,
   736                   help="TrueType or OpenType font size (default: %default)")
   737     og.add_option("--labels1", type="int", default=0, metavar="N", help=
   738                   "genome1 labels: 0=name, 1=name:length, "
   739                   "2=name:start:length, 3=name:start-end (default=%default)")
   740     og.add_option("--labels2", type="int", default=0, metavar="N", help=
   741                   "genome2 labels: 0=name, 1=name:length, "
   742                   "2=name:start:length, 3=name:start-end (default=%default)")
   743     og.add_option("--rot1", metavar="ROT", default="h",
   744                   help="text rotation for the 1st genome (default=%default)")
   745     og.add_option("--rot2", metavar="ROT", default="v",
   746                   help="text rotation for the 2nd genome (default=%default)")
   747     op.add_option_group(og)
   748 
   749     og = optparse.OptionGroup(op, "Annotation options")
   750     og.add_option("--bed1", metavar="FILE",
   751                   help="read genome1 annotations from BED file")
   752     og.add_option("--bed2", metavar="FILE",
   753                   help="read genome2 annotations from BED file")
   754     og.add_option("--rmsk1", metavar="FILE", help="read genome1 repeats from "
   755                   "RepeatMasker .out or rmsk.txt file")
   756     og.add_option("--rmsk2", metavar="FILE", help="read genome2 repeats from "
   757                   "RepeatMasker .out or rmsk.txt file")
   758     op.add_option_group(og)
   759 
   760     og = optparse.OptionGroup(op, "Gene options")
   761     og.add_option("--genePred1", metavar="FILE",
   762                   help="read genome1 genes from genePred file")
   763     og.add_option("--genePred2", metavar="FILE",
   764                   help="read genome2 genes from genePred file")
   765     og.add_option("--exon-color", metavar="COLOR", default="PaleGreen",
   766                   help="color for exons (default=%default)")
   767     og.add_option("--cds-color", metavar="COLOR", default="LimeGreen",
   768                   help="color for protein-coding regions (default=%default)")
   769     op.add_option_group(og)
   770 
   771     og = optparse.OptionGroup(op, "Unsequenced gap options")
   772     og.add_option("--gap1", metavar="FILE",
   773                   help="read genome1 unsequenced gaps from agp or gap file")
   774     og.add_option("--gap2", metavar="FILE",
   775                   help="read genome2 unsequenced gaps from agp or gap file")
   776     og.add_option("--bridged-color", metavar="COLOR", default="yellow",
   777                   help="color for bridged gaps (default: %default)")
   778     og.add_option("--unbridged-color", metavar="COLOR", default="orange",
   779                   help="color for unbridged gaps (default: %default)")
   780     op.add_option_group(og)
   781     (opts, args) = op.parse_args()
   782     if len(args) != 2: op.error("2 arguments needed")
   783 
   784     opts.text_color = "black"
   785     opts.background_color = "white"
   786     opts.label_space = 5     # minimum number of pixels between axis labels
   787 
   788     try: lastDotplot(opts, args)
   789     except KeyboardInterrupt: pass  # avoid silly error message
   790     except Exception as e:
   791         prog = os.path.basename(sys.argv[0])
   792         sys.exit(prog + ": error: " + str(e))