scripts/last-dotplot
author Martin C. Frith
Tue Dec 05 17:57:24 2017 +0900 (2017-12-05)
changeset 914 f4ca19d126e3
parent 913 e87abc7ae9c6
child 915 6079fabb49a5
permissions -rwxr-xr-x
last-dotplot: add alignment-order options
     1 #! /usr/bin/env python
     2 
     3 # Read pair-wise alignments in MAF or LAST tabular format: write an
     4 # "Oxford grid", a.k.a. dotplot.
     5 
     6 # TODO: Currently, pixels with zero aligned nt-pairs are white, and
     7 # pixels with one or more aligned nt-pairs are black.  This can look
     8 # too crowded for large genome alignments.  I tried shading each pixel
     9 # according to the number of aligned nt-pairs within it, but the
    10 # result is too faint.  How can this be done better?
    11 
    12 import functools
    13 import gzip
    14 from fnmatch import fnmatchcase
    15 from operator import itemgetter
    16 import subprocess
    17 import itertools, optparse, os, re, sys
    18 
    19 # Try to make PIL/PILLOW work:
    20 try: from PIL import Image, ImageDraw, ImageFont, ImageColor
    21 except ImportError: import Image, ImageDraw, ImageFont, ImageColor
    22 
    23 def myOpen(fileName):  # faster than fileinput
    24     if fileName is None:
    25         return []
    26     if fileName == "-":
    27         return sys.stdin
    28     if fileName.endswith(".gz"):
    29         return gzip.open(fileName)
    30     return open(fileName)
    31 
    32 def warn(message):
    33     if opts.verbose:
    34         prog = os.path.basename(sys.argv[0])
    35         sys.stderr.write(prog + ": " + message + "\n")
    36 
    37 def groupByFirstItem(things):
    38     for k, v in itertools.groupby(things, itemgetter(0)):
    39         yield k, [i[1:] for i in v]
    40 
    41 def croppedBlocks(blocks, ranges1, ranges2):
    42     headBeg1, headBeg2, headSize = blocks[0]
    43     for r1 in ranges1:
    44         for r2 in ranges2:
    45             cropBeg1, cropEnd1 = r1
    46             if headBeg1 < 0:
    47                 cropBeg1, cropEnd1 = -cropEnd1, -cropBeg1
    48             cropBeg2, cropEnd2 = r2
    49             if headBeg2 < 0:
    50                 cropBeg2, cropEnd2 = -cropEnd2, -cropBeg2
    51             for beg1, beg2, size in blocks:
    52                 b1 = max(cropBeg1, beg1)
    53                 e1 = min(cropEnd1, beg1 + size)
    54                 if b1 >= e1: continue
    55                 offset = beg2 - beg1
    56                 b2 = max(cropBeg2, b1 + offset)
    57                 e2 = min(cropEnd2, e1 + offset)
    58                 if b2 >= e2: continue
    59                 yield b2 - offset, b2, e2 - b2
    60 
    61 def tabBlocks(beg1, beg2, blocks):
    62     '''Get the gapless blocks of an alignment, from LAST tabular format.'''
    63     for i in blocks.split(","):
    64         if ":" in i:
    65             x, y = i.split(":")
    66             beg1 += int(x)
    67             beg2 += int(y)
    68         else:
    69             size = int(i)
    70             yield beg1, beg2, size
    71             beg1 += size
    72             beg2 += size
    73 
    74 def mafBlocks(beg1, beg2, seq1, seq2):
    75     '''Get the gapless blocks of an alignment, from MAF format.'''
    76     size = 0
    77     for x, y in itertools.izip(seq1, seq2):
    78         if x == "-":
    79             if size:
    80                 yield beg1, beg2, size
    81                 beg1 += size
    82                 beg2 += size
    83                 size = 0
    84             beg2 += 1
    85         elif y == "-":
    86             if size:
    87                 yield beg1, beg2, size
    88                 beg1 += size
    89                 beg2 += size
    90                 size = 0
    91             beg1 += 1
    92         else:
    93             size += 1
    94     if size: yield beg1, beg2, size
    95 
    96 def alignmentInput(lines):
    97     '''Get alignments and sequence lengths, from MAF or tabular format.'''
    98     mafCount = 0
    99     for line in lines:
   100         w = line.split()
   101         if line[0].isdigit():  # tabular format
   102             chr1, beg1, seqlen1 = w[1], int(w[2]), int(w[5])
   103             if w[4] == "-": beg1 -= seqlen1
   104             chr2, beg2, seqlen2 = w[6], int(w[7]), int(w[10])
   105             if w[9] == "-": beg2 -= seqlen2
   106             blocks = tabBlocks(beg1, beg2, w[11])
   107             yield chr1, seqlen1, chr2, seqlen2, blocks
   108         elif line[0] == "s":  # MAF format
   109             if mafCount == 0:
   110                 chr1, beg1, seqlen1, seq1 = w[1], int(w[2]), int(w[5]), w[6]
   111                 if w[4] == "-": beg1 -= seqlen1
   112                 mafCount = 1
   113             else:
   114                 chr2, beg2, seqlen2, seq2 = w[1], int(w[2]), int(w[5]), w[6]
   115                 if w[4] == "-": beg2 -= seqlen2
   116                 blocks = mafBlocks(beg1, beg2, seq1, seq2)
   117                 yield chr1, seqlen1, chr2, seqlen2, blocks
   118                 mafCount = 0
   119 
   120 def seqRequestFromText(text):
   121     if ":" in text:
   122         pattern, interval = text.rsplit(":", 1)
   123         if "-" in interval:
   124             beg, end = interval.rsplit("-", 1)
   125             return pattern, int(beg), int(end)  # beg may be negative
   126     return text, 0, sys.maxsize
   127 
   128 def rangesFromSeqName(seqRequests, name, seqLen):
   129     if seqRequests:
   130         base = name.split(".")[-1]  # allow for names like hg19.chr7
   131         for pat, beg, end in seqRequests:
   132             if fnmatchcase(name, pat) or fnmatchcase(base, pat):
   133                 yield max(beg, 0), min(end, seqLen)
   134     else:
   135         yield 0, seqLen
   136 
   137 def updateSeqs(coverDict, seqRanges, seqName, ranges, coveredRange):
   138     beg, end = coveredRange
   139     if beg < 0:
   140         coveredRange = -end, -beg
   141     if seqName in coverDict:
   142         coverDict[seqName].append(coveredRange)
   143     else:
   144         coverDict[seqName] = [coveredRange]
   145         for beg, end in ranges:
   146             r = seqName, beg, end
   147             seqRanges.append(r)
   148 
   149 def readAlignments(fileName, opts):
   150     '''Get alignments and sequence limits, from MAF or tabular format.'''
   151     seqRequests1 = map(seqRequestFromText, opts.seq1)
   152     seqRequests2 = map(seqRequestFromText, opts.seq2)
   153 
   154     alignments = []
   155     seqRanges1 = []
   156     seqRanges2 = []
   157     coverDict1 = {}
   158     coverDict2 = {}
   159     lines = myOpen(fileName)
   160     for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
   161         ranges1 = sorted(rangesFromSeqName(seqRequests1, seqName1, seqLen1))
   162         if not ranges1: continue
   163         ranges2 = sorted(rangesFromSeqName(seqRequests2, seqName2, seqLen2))
   164         if not ranges2: continue
   165         b = list(croppedBlocks(list(blocks), ranges1, ranges2))
   166         if not b: continue
   167         aln = seqName1, seqName2, b
   168         alignments.append(aln)
   169         coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
   170         updateSeqs(coverDict1, seqRanges1, seqName1, ranges1, coveredRange1)
   171         coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
   172         updateSeqs(coverDict2, seqRanges2, seqName2, ranges2, coveredRange2)
   173     return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
   174 
   175 def nameAndRangesFromDict(cropDict, seqName):
   176     if seqName in cropDict:
   177         return seqName, cropDict[seqName]
   178     n = seqName.split(".")[-1]
   179     if n in cropDict:
   180         return n, cropDict[n]
   181     return seqName, []
   182 
   183 def rangesForSecondaryAlignments(primaryRanges, seqLen):
   184     if primaryRanges:
   185         return primaryRanges
   186     return [(0, seqLen)]
   187 
   188 def readSecondaryAlignments(opts, cropRanges1, cropRanges2):
   189     cropDict1 = dict(groupByFirstItem(cropRanges1))
   190     cropDict2 = dict(groupByFirstItem(cropRanges2))
   191 
   192     alignments = []
   193     seqRanges1 = []
   194     seqRanges2 = []
   195     coverDict1 = {}
   196     coverDict2 = {}
   197     lines = myOpen(opts.alignments)
   198     for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
   199         seqName1, ranges1 = nameAndRangesFromDict(cropDict1, seqName1)
   200         seqName2, ranges2 = nameAndRangesFromDict(cropDict2, seqName2)
   201         if not ranges1 and not ranges2:
   202             continue
   203         r1 = rangesForSecondaryAlignments(ranges1, seqLen1)
   204         r2 = rangesForSecondaryAlignments(ranges2, seqLen2)
   205         b = list(croppedBlocks(list(blocks), r1, r2))
   206         if not b: continue
   207         aln = seqName1, seqName2, b
   208         alignments.append(aln)
   209         if not ranges1:
   210             coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
   211             updateSeqs(coverDict1, seqRanges1, seqName1, r1, coveredRange1)
   212         if not ranges2:
   213             coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
   214             updateSeqs(coverDict2, seqRanges2, seqName2, r2, coveredRange2)
   215     return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
   216 
   217 def twoValuesFromOption(text, separator):
   218     if separator in text:
   219         return text.split(separator)
   220     return text, text
   221 
   222 def mergedRanges(ranges):
   223     oldBeg, maxEnd = ranges[0]
   224     for beg, end in ranges:
   225         if beg > maxEnd:
   226             yield oldBeg, maxEnd
   227             oldBeg = beg
   228             maxEnd = end
   229         elif end > maxEnd:
   230             maxEnd = end
   231     yield oldBeg, maxEnd
   232 
   233 def mergedRangesPerSeq(coverDict):
   234     for k, v in coverDict.iteritems():
   235         v.sort()
   236         yield k, list(mergedRanges(v))
   237 
   238 def coveredLength(mergedCoverDict):
   239     return sum(sum(e - b for b, e in v) for v in mergedCoverDict.itervalues())
   240 
   241 def trimmed(seqRanges, coverDict, minAlignedBases, maxGapFrac, endPad, midPad):
   242     maxEndGapFrac, maxMidGapFrac = twoValuesFromOption(maxGapFrac, ",")
   243     maxEndGap = max(float(maxEndGapFrac) * minAlignedBases, endPad * 1.0)
   244     maxMidGap = max(float(maxMidGapFrac) * minAlignedBases, midPad * 2.0)
   245 
   246     for seqName, rangeBeg, rangeEnd in seqRanges:
   247         seqBlocks = coverDict[seqName]
   248         blocks = [i for i in seqBlocks if i[0] < rangeEnd and i[1] > rangeBeg]
   249         if blocks[0][0] - rangeBeg > maxEndGap:
   250             rangeBeg = blocks[0][0] - endPad
   251         for j, y in enumerate(blocks):
   252             if j:
   253                 x = blocks[j - 1]
   254                 if y[0] - x[1] > maxMidGap:
   255                     yield seqName, rangeBeg, x[1] + midPad
   256                     rangeBeg = y[0] - midPad
   257         if rangeEnd - blocks[-1][1] > maxEndGap:
   258             rangeEnd = blocks[-1][1] + endPad
   259         yield seqName, rangeBeg, rangeEnd
   260 
   261 def natural_sort_key(my_string):
   262     '''Return a sort key for "natural" ordering, e.g. chr9 < chr10.'''
   263     parts = re.split(r'(\d+)', my_string)
   264     parts[1::2] = map(int, parts[1::2])
   265     return parts
   266 
   267 def nameKey(oneSeqRanges):
   268     return natural_sort_key(oneSeqRanges[0][0])
   269 
   270 def sizeKey(oneSeqRanges):
   271     return sum(b - e for n, b, e in oneSeqRanges), nameKey(oneSeqRanges)
   272 
   273 def alignmentKey(seqNamesToLists, oneSeqRanges):
   274     seqName = oneSeqRanges[0][0]
   275     alignmentsOfThisSequence = seqNamesToLists[seqName]
   276     numOfAlignedLetterPairs = sum(i[3] for i in alignmentsOfThisSequence)
   277     toMiddle = numOfAlignedLetterPairs // 2
   278     for i in alignmentsOfThisSequence:
   279         toMiddle -= i[3]
   280         if toMiddle < 0:
   281             return i[1:3]  # sequence-rank and "position" of this alignment
   282 
   283 def alignmentSortData1(alignments, seqNamesToRanks2):
   284     for seqName1, seqName2, blocks in alignments:
   285         seqRank2 = seqNamesToRanks2[seqName2]
   286         pos2 = abs(blocks[0][1] + blocks[-1][1] + blocks[-1][2])
   287         numOfAlignedLetterPairs = sum(i[2] for i in blocks)
   288         yield seqName1, seqRank2, pos2, numOfAlignedLetterPairs
   289 
   290 def alignmentSortData2(alignments, seqNamesToRanks1):
   291     for seqName1, seqName2, blocks in alignments:
   292         seqRank1 = seqNamesToRanks1[seqName1]
   293         pos1 = abs(blocks[0][0] + blocks[-1][0] + blocks[-1][2])
   294         numOfAlignedLetterPairs = sum(i[2] for i in blocks)
   295         yield seqName2, seqRank1, pos1, numOfAlignedLetterPairs
   296 
   297 def mySortedRanges(seqRanges, sortOpt,
   298                    alignmentSortDataFunc, alignments, otherRanges):
   299     rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
   300     g = [list(ranges) for seqName, ranges in rangesGroupedBySeqName]
   301     if sortOpt == "1":
   302         g.sort(key=nameKey)
   303     if sortOpt == "2":
   304         g.sort(key=sizeKey)
   305     if sortOpt == "3":
   306         otherNameGroups = itertools.groupby(i[0] for i in otherRanges)
   307         ranksAndNames = enumerate(i[0] for i in otherNameGroups)
   308         otherNamesToRanks = dict((n, r) for r, n in ranksAndNames)
   309         alns = sorted(alignmentSortDataFunc(alignments, otherNamesToRanks))
   310         alnsGroupedBySeqName = itertools.groupby(alns, itemgetter(0))
   311         seqNamesToLists = dict((k, list(v)) for k, v in alnsGroupedBySeqName)
   312         g.sort(key=functools.partial(alignmentKey, seqNamesToLists))
   313     return [j for i in g for j in i]
   314 
   315 def allSortedRanges(opts, alignments, alignmentsB,
   316                     seqRanges1, seqRangesB1, seqRanges2, seqRangesB2):
   317     o1, oB1 = twoValuesFromOption(opts.sort1, ":")
   318     o2, oB2 = twoValuesFromOption(opts.sort2, ":")
   319     if o1 == "3" and o2 == "3":
   320         raise Exception("the sort options have circular dependency")
   321     if o1 != "3":
   322         s1 = mySortedRanges(seqRanges1, o1, None, None, None)
   323     if o2 != "3":
   324         s2 = mySortedRanges(seqRanges2, o2, None, None, None)
   325     if o1 == "3":
   326         s1 = mySortedRanges(seqRanges1, o1, alignmentSortData1, alignments, s2)
   327     if o2 == "3":
   328         s2 = mySortedRanges(seqRanges2, o2, alignmentSortData2, alignments, s1)
   329     t1 = mySortedRanges(seqRangesB1, oB1, alignmentSortData1, alignmentsB, s2)
   330     t2 = mySortedRanges(seqRangesB2, oB2, alignmentSortData2, alignmentsB, s1)
   331     return s1 + t1, s2 + t2
   332 
   333 def prettyNum(n):
   334     t = str(n)
   335     groups = []
   336     while t:
   337         groups.append(t[-3:])
   338         t = t[:-3]
   339     return ",".join(reversed(groups))
   340 
   341 def sizeText(size):
   342     suffixes = "bp", "kb", "Mb", "Gb"
   343     for i, x in enumerate(suffixes):
   344         j = 10 ** (i * 3)
   345         if size < j * 10:
   346             return "%.2g" % (1.0 * size / j) + x
   347         if size < j * 1000 or i == len(suffixes) - 1:
   348             return "%.0f" % (1.0 * size / j) + x
   349 
   350 def labelText(seqRange, labelOpt):
   351     seqName, beg, end = seqRange
   352     if labelOpt == 1:
   353         return seqName + ": " + sizeText(end - beg)
   354     if labelOpt == 2:
   355         return seqName + ":" + prettyNum(beg) + ": " + sizeText(end - beg)
   356     if labelOpt == 3:
   357         return seqName + ":" + prettyNum(beg) + "-" + prettyNum(end)
   358     return seqName
   359 
   360 def rangeLabels(seqRanges, labelOpt, font, fontsize, image_mode, textRot):
   361     if fontsize:
   362         image_size = 1, 1
   363         im = Image.new(image_mode, image_size)
   364         draw = ImageDraw.Draw(im)
   365     x = y = 0
   366     for r in seqRanges:
   367         text = labelText(r, labelOpt)
   368         if fontsize:
   369             x, y = draw.textsize(text, font=font)
   370             if textRot:
   371                 x, y = y, x
   372         yield text, x, y
   373 
   374 def dataFromRanges(sortedRanges, font, fontSize, imageMode, labelOpt, textRot):
   375     for i in sortedRanges:
   376         warn("\t".join(map(str, i)))
   377     warn("")
   378     rangeSizes = [e - b for n, b, e in sortedRanges]
   379     labs = list(rangeLabels(sortedRanges, labelOpt, font, fontSize,
   380                             imageMode, textRot))
   381     margin = max(i[2] for i in labs)
   382     # xxx the margin may be too big, because some labels may get omitted
   383     return rangeSizes, labs, margin
   384 
   385 def div_ceil(x, y):
   386     '''Return x / y rounded up.'''
   387     q, r = divmod(x, y)
   388     return q + (r != 0)
   389 
   390 def get_bp_per_pix(rangeSizes, pixTweenRanges, maxPixels):
   391     '''Get the minimum bp-per-pixel that fits in the size limit.'''
   392     warn("choosing bp per pixel...")
   393     numOfRanges = len(rangeSizes)
   394     maxPixelsInRanges = maxPixels - pixTweenRanges * (numOfRanges - 1)
   395     if maxPixelsInRanges < numOfRanges:
   396         raise Exception("can't fit the image: too many sequences?")
   397     negLimit = -maxPixelsInRanges
   398     negBpPerPix = sum(rangeSizes) // negLimit
   399     while True:
   400         if sum(i // negBpPerPix for i in rangeSizes) >= negLimit:
   401             return -negBpPerPix
   402         negBpPerPix -= 1
   403 
   404 def getRangePixBegs(rangePixLens, pixTweenRanges, margin):
   405     '''Get the start pixel for each range.'''
   406     rangePixBegs = []
   407     pix_tot = margin - pixTweenRanges
   408     for i in rangePixLens:
   409         pix_tot += pixTweenRanges
   410         rangePixBegs.append(pix_tot)
   411         pix_tot += i
   412     return rangePixBegs
   413 
   414 def pixelData(rangeSizes, bp_per_pix, pixTweenRanges, margin):
   415     '''Return pixel information about the ranges.'''
   416     rangePixLens = [div_ceil(i, bp_per_pix) for i in rangeSizes]
   417     rangePixBegs = getRangePixBegs(rangePixLens, pixTweenRanges, margin)
   418     tot_pix = rangePixBegs[-1] + rangePixLens[-1]
   419     return rangePixBegs, rangePixLens, tot_pix
   420 
   421 def drawLineForward(hits, width, bp_per_pix, beg1, beg2, size):
   422     while True:
   423         q1, r1 = divmod(beg1, bp_per_pix)
   424         q2, r2 = divmod(beg2, bp_per_pix)
   425         hits[q2 * width + q1] |= 1
   426         next_pix = min(bp_per_pix - r1, bp_per_pix - r2)
   427         if next_pix >= size: break
   428         beg1 += next_pix
   429         beg2 += next_pix
   430         size -= next_pix
   431 
   432 def drawLineReverse(hits, width, bp_per_pix, beg1, beg2, size):
   433     beg2 = -1 - beg2
   434     while True:
   435         q1, r1 = divmod(beg1, bp_per_pix)
   436         q2, r2 = divmod(beg2, bp_per_pix)
   437         hits[q2 * width + q1] |= 2
   438         next_pix = min(bp_per_pix - r1, r2 + 1)
   439         if next_pix >= size: break
   440         beg1 += next_pix
   441         beg2 -= next_pix
   442         size -= next_pix
   443 
   444 def findOrigin(ranges, beg, size):
   445     if beg < 0:
   446         beg = -(beg + size)
   447     for rangeBeg, rangeEnd, origin in ranges:
   448         if rangeEnd > beg:
   449             return origin
   450 
   451 def alignmentPixels(width, height, alignments, bp_per_pix,
   452                     rangeDict1, rangeDict2):
   453     hits = [0] * (width * height)  # the image data
   454     for seq1, seq2, blocks in alignments:
   455         beg1, beg2, size = blocks[0]
   456         ori1 = findOrigin(rangeDict1[seq1], beg1, size)
   457         ori2 = findOrigin(rangeDict2[seq2], beg2, size)
   458         for beg1, beg2, size in blocks:
   459             if beg1 < 0:
   460                 beg1 = -(beg1 + size)
   461                 beg2 = -(beg2 + size)
   462             if beg2 >= 0:
   463                 drawLineForward(hits, width, bp_per_pix,
   464                                 beg1 + ori1, beg2 + ori2, size)
   465             else:
   466                 drawLineReverse(hits, width, bp_per_pix,
   467                                 beg1 + ori1, beg2 - ori2, size)
   468     return hits
   469 
   470 def expandedSeqDict(seqDict):
   471     '''Allow lookup by short sequence names, e.g. chr7 as well as hg19.chr7.'''
   472     newDict = seqDict.copy()
   473     for name, x in seqDict.items():
   474         if "." in name:
   475             base = name.split(".")[-1]
   476             if base in newDict:  # an ambiguous case was found:
   477                 return seqDict   # so give up completely
   478             newDict[base] = x
   479     return newDict
   480 
   481 def readBed(fileName, rangeDict):
   482     for line in myOpen(fileName):
   483         w = line.split()
   484         if not w: continue
   485         seqName = w[0]
   486         if seqName not in rangeDict: continue
   487         beg = int(w[1])
   488         end = int(w[2])
   489         layer = 900
   490         color = "#fbf"
   491         if len(w) > 4:
   492             if w[4] != ".":
   493                 layer = float(w[4])
   494             if len(w) > 5:
   495                 if len(w) > 8 and w[8].count(",") == 2:
   496                     color = "rgb(" + w[8] + ")"
   497                 elif w[5] == "+":
   498                     color = "#ffe8e8"
   499                 elif w[5] == "-":
   500                     color = "#e8e8ff"
   501         yield layer, color, seqName, beg, end
   502 
   503 def commaSeparatedInts(text):
   504     return map(int, text.rstrip(",").split(","))
   505 
   506 def readGenePred(opts, fileName, rangeDict):
   507     for line in myOpen(fileName):
   508         fields = line.split()
   509         if not fields: continue
   510         if fields[2] not in "+-": fields = fields[1:]
   511         seqName = fields[1]
   512         if seqName not in rangeDict: continue
   513         #strand = fields[2]
   514         cdsBeg = int(fields[5])
   515         cdsEnd = int(fields[6])
   516         exonBegs = commaSeparatedInts(fields[8])
   517         exonEnds = commaSeparatedInts(fields[9])
   518         for beg, end in zip(exonBegs, exonEnds):
   519             yield 300, opts.exon_color, seqName, beg, end
   520             b = max(beg, cdsBeg)
   521             e = min(end, cdsEnd)
   522             if b < e: yield 400, opts.cds_color, seqName, b, e
   523 
   524 def readRmsk(fileName, rangeDict):
   525     for line in myOpen(fileName):
   526         fields = line.split()
   527         if len(fields) == 17:  # rmsk.txt
   528             seqName = fields[5]
   529             if seqName not in rangeDict: continue  # do this ASAP for speed
   530             beg = int(fields[6])
   531             end = int(fields[7])
   532             strand = fields[9]
   533             repeatClass = fields[11]
   534         elif len(fields) == 15:  # .out
   535             seqName = fields[4]
   536             if seqName not in rangeDict: continue
   537             beg = int(fields[5]) - 1
   538             end = int(fields[6])
   539             strand = fields[8]
   540             repeatClass = fields[10]
   541         else:
   542             continue
   543         if repeatClass in ("Low_complexity", "Simple_repeat"):
   544             yield 200, "#fbf", seqName, beg, end
   545         elif strand == "+":
   546             yield 100, "#ffe8e8", seqName, beg, end
   547         else:
   548             yield 100, "#e8e8ff", seqName, beg, end
   549 
   550 def isExtraFirstGapField(fields):
   551     return fields[4].isdigit()
   552 
   553 def readGaps(opts, fileName, rangeDict):
   554     '''Read locations of unsequenced gaps, from an agp or gap file.'''
   555     for line in myOpen(fileName):
   556         w = line.split()
   557         if not w or w[0][0] == "#": continue
   558         if isExtraFirstGapField(w): w = w[1:]
   559         if w[4] not in "NU": continue
   560         seqName = w[0]
   561         if seqName not in rangeDict: continue
   562         end = int(w[2])
   563         beg = end - int(w[5])  # zero-based coordinate
   564         if w[7] == "yes":
   565             yield 3000, opts.bridged_color, seqName, beg, end
   566         else:
   567             yield 2000, opts.unbridged_color, seqName, beg, end
   568 
   569 def bedBoxes(beds, rangeDict, margin, edge, isTop, bpPerPix):
   570     for layer, color, seqName, bedBeg, bedEnd in beds:
   571         for rangeBeg, rangeEnd, origin in rangeDict[seqName]:
   572             beg = max(bedBeg, rangeBeg)
   573             end = min(bedEnd, rangeEnd)
   574             if beg >= end: continue
   575             if layer <= 1000:
   576                 # include partly-covered pixels
   577                 b = (origin + beg) // bpPerPix
   578                 e = div_ceil(origin + end, bpPerPix)
   579             else:
   580                 # exclude partly-covered pixels
   581                 b = div_ceil(origin + beg, bpPerPix)
   582                 e = (origin + end) // bpPerPix
   583                 if e <= b: continue
   584                 if end == rangeEnd:  # include partly-covered end pixels
   585                     e = div_ceil(origin + end, bpPerPix)
   586             if isTop:
   587                 box = b, margin, e, edge
   588             else:
   589                 box = margin, b, edge, e
   590             yield layer, color, box
   591 
   592 def drawAnnotations(im, boxes):
   593     # xxx use partial transparency for different-color overlaps?
   594     for layer, color, box in boxes:
   595         im.paste(color, box)
   596 
   597 def placedLabels(labels, rangePixBegs, rangePixLens, beg, end):
   598     '''Return axis labels with endpoint & sort-order information.'''
   599     maxWidth = end - beg
   600     for i, j, k in zip(labels, rangePixBegs, rangePixLens):
   601         text, textWidth, textHeight = i
   602         if textWidth > maxWidth:
   603             continue
   604         labelBeg = j + (k - textWidth) // 2
   605         labelEnd = labelBeg + textWidth
   606         sortKey = textWidth - k
   607         if labelBeg < beg:
   608             sortKey += maxWidth * (beg - labelBeg)
   609             labelBeg = beg
   610             labelEnd = beg + textWidth
   611         if labelEnd > end:
   612             sortKey += maxWidth * (labelEnd - end)
   613             labelEnd = end
   614             labelBeg = end - textWidth
   615         yield sortKey, labelBeg, labelEnd, text, textHeight
   616 
   617 def nonoverlappingLabels(labels, minPixTweenLabels):
   618     '''Get a subset of non-overlapping axis labels, greedily.'''
   619     out = []
   620     for i in labels:
   621         beg = i[1] - minPixTweenLabels
   622         end = i[2] + minPixTweenLabels
   623         if all(j[2] <= beg or j[1] >= end for j in out):
   624             out.append(i)
   625     return out
   626 
   627 def axisImage(labels, rangePixBegs, rangePixLens, textRot,
   628               textAln, font, image_mode, opts):
   629     '''Make an image of axis labels.'''
   630     beg = rangePixBegs[0]
   631     end = rangePixBegs[-1] + rangePixLens[-1]
   632     margin = max(i[2] for i in labels)
   633     labels = sorted(placedLabels(labels, rangePixBegs, rangePixLens, beg, end))
   634     minPixTweenLabels = 0 if textRot else opts.label_space
   635     labels = nonoverlappingLabels(labels, minPixTweenLabels)
   636     image_size = (margin, end) if textRot else (end, margin)
   637     im = Image.new(image_mode, image_size, opts.margin_color)
   638     draw = ImageDraw.Draw(im)
   639     for i in labels:
   640         base = margin - i[4] if textAln else 0
   641         position = (base, i[1]) if textRot else (i[1], base)
   642         draw.text(position, i[3], font=font, fill=opts.text_color)
   643     return im
   644 
   645 def rangesWithOrigins(sortedRanges, rangePixBegs, bpPerPix):
   646     for i, j in zip(sortedRanges, rangePixBegs):
   647         seqName, rangeBeg, rangeEnd = i
   648         origin = bpPerPix * j - rangeBeg
   649         yield seqName, (rangeBeg, rangeEnd, origin)
   650 
   651 def rangesPerSeq(sortedRanges, rangePixBegs, bpPerPix):
   652     a = rangesWithOrigins(sortedRanges, rangePixBegs, bpPerPix)
   653     for k, v in itertools.groupby(a, itemgetter(0)):
   654         yield k, [i[1] for i in v]
   655 
   656 def getFont(opts):
   657     if opts.fontfile:
   658         return ImageFont.truetype(opts.fontfile, opts.fontsize)
   659     fileNames = []
   660     try:
   661         x = ["fc-match", "-f%{file}", "arial"]
   662         p = subprocess.Popen(x, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
   663         out, err = p.communicate()
   664         fileNames.append(out)
   665     except OSError as e:
   666         warn("fc-match error: " + str(e))
   667     fileNames.append("/Library/Fonts/Arial.ttf")  # for Mac
   668     for i in fileNames:
   669         try:
   670             font = ImageFont.truetype(i, opts.fontsize)
   671             warn("font: " + i)
   672             return font
   673         except IOError as e:
   674             warn("font load error: " + str(e))
   675     return ImageFont.load_default()
   676 
   677 def lastDotplot(opts, args):
   678     font = getFont(opts)
   679     image_mode = 'RGB'
   680     forward_color = ImageColor.getcolor(opts.forwardcolor, image_mode)
   681     reverse_color = ImageColor.getcolor(opts.reversecolor, image_mode)
   682     zipped_colors = zip(forward_color, reverse_color)
   683     overlap_color = tuple([(i + j) // 2 for i, j in zipped_colors])
   684 
   685     maxGap1, maxGapB1 = twoValuesFromOption(opts.max_gap1, ":")
   686     maxGap2, maxGapB2 = twoValuesFromOption(opts.max_gap2, ":")
   687 
   688     warn("reading alignments...")
   689     alnData = readAlignments(args[0], opts)
   690     alignments, seqRanges1, coverDict1, seqRanges2, coverDict2 = alnData
   691     if not alignments: raise Exception("there are no alignments")
   692     warn("cutting...")
   693     coverDict1 = dict(mergedRangesPerSeq(coverDict1))
   694     coverDict2 = dict(mergedRangesPerSeq(coverDict2))
   695     minAlignedBases = min(coveredLength(coverDict1), coveredLength(coverDict2))
   696     pad = int(opts.pad * minAlignedBases)
   697     cutRanges1 = list(trimmed(seqRanges1, coverDict1, minAlignedBases,
   698                               maxGap1, pad, pad))
   699     cutRanges2 = list(trimmed(seqRanges2, coverDict2, minAlignedBases,
   700                               maxGap2, pad, pad))
   701 
   702     warn("reading secondary alignments...")
   703     alnDataB = readSecondaryAlignments(opts, cutRanges1, cutRanges2)
   704     alignmentsB, seqRangesB1, coverDictB1, seqRangesB2, coverDictB2 = alnDataB
   705     warn("cutting...")
   706     coverDictB1 = dict(mergedRangesPerSeq(coverDictB1))
   707     coverDictB2 = dict(mergedRangesPerSeq(coverDictB2))
   708     cutRangesB1 = trimmed(seqRangesB1, coverDictB1, minAlignedBases,
   709                           maxGapB1, 0, 0)
   710     cutRangesB2 = trimmed(seqRangesB2, coverDictB2, minAlignedBases,
   711                           maxGapB2, 0, 0)
   712 
   713     warn("sorting...")
   714     sortOut = allSortedRanges(opts, alignments, alignmentsB,
   715                               cutRanges1, cutRangesB1, cutRanges2, cutRangesB2)
   716     sortedRanges1, sortedRanges2 = sortOut
   717 
   718     textRot1 = "vertical".startswith(opts.rot1)
   719     i1 = dataFromRanges(sortedRanges1, font,
   720                         opts.fontsize, image_mode, opts.labels1, textRot1)
   721     rangeSizes1, labelData1, tMargin = i1
   722 
   723     textRot2 = "horizontal".startswith(opts.rot2)
   724     i2 = dataFromRanges(sortedRanges2, font,
   725                         opts.fontsize, image_mode, opts.labels2, textRot2)
   726     rangeSizes2, labelData2, lMargin = i2
   727 
   728     maxPixels1 = opts.width  - lMargin
   729     maxPixels2 = opts.height - tMargin
   730     bpPerPix1 = get_bp_per_pix(rangeSizes1, opts.border_pixels, maxPixels1)
   731     bpPerPix2 = get_bp_per_pix(rangeSizes2, opts.border_pixels, maxPixels2)
   732     bpPerPix = max(bpPerPix1, bpPerPix2)
   733     warn("bp per pixel = " + str(bpPerPix))
   734 
   735     p1 = pixelData(rangeSizes1, bpPerPix, opts.border_pixels, lMargin)
   736     rangePixBegs1, rangePixLens1, width = p1
   737     rangeDict1 = dict(rangesPerSeq(sortedRanges1, rangePixBegs1, bpPerPix))
   738 
   739     p2 = pixelData(rangeSizes2, bpPerPix, opts.border_pixels, tMargin)
   740     rangePixBegs2, rangePixLens2, height = p2
   741     rangeDict2 = dict(rangesPerSeq(sortedRanges2, rangePixBegs2, bpPerPix))
   742 
   743     warn("width:  " + str(width))
   744     warn("height: " + str(height))
   745 
   746     warn("processing alignments...")
   747     hits = alignmentPixels(width, height, alignments + alignmentsB, bpPerPix,
   748                            rangeDict1, rangeDict2)
   749 
   750     warn("reading annotations...")
   751 
   752     rangeDict1 = expandedSeqDict(rangeDict1)
   753     beds1 = itertools.chain(readBed(opts.bed1, rangeDict1),
   754                             readRmsk(opts.rmsk1, rangeDict1),
   755                             readGenePred(opts, opts.genePred1, rangeDict1),
   756                             readGaps(opts, opts.gap1, rangeDict1))
   757     b1 = bedBoxes(beds1, rangeDict1, tMargin, height, True, bpPerPix)
   758 
   759     rangeDict2 = expandedSeqDict(rangeDict2)
   760     beds2 = itertools.chain(readBed(opts.bed2, rangeDict2),
   761                             readRmsk(opts.rmsk2, rangeDict2),
   762                             readGenePred(opts, opts.genePred2, rangeDict2),
   763                             readGaps(opts, opts.gap2, rangeDict2))
   764     b2 = bedBoxes(beds2, rangeDict2, lMargin, width, False, bpPerPix)
   765 
   766     boxes = sorted(itertools.chain(b1, b2))
   767 
   768     warn("drawing...")
   769 
   770     image_size = width, height
   771     im = Image.new(image_mode, image_size, opts.background_color)
   772 
   773     drawAnnotations(im, boxes)
   774 
   775     for i in range(height):
   776         for j in range(width):
   777             store_value = hits[i * width + j]
   778             xy = j, i
   779             if   store_value == 1: im.putpixel(xy, forward_color)
   780             elif store_value == 2: im.putpixel(xy, reverse_color)
   781             elif store_value == 3: im.putpixel(xy, overlap_color)
   782 
   783     if opts.fontsize != 0:
   784         axis1 = axisImage(labelData1, rangePixBegs1, rangePixLens1,
   785                           textRot1, False, font, image_mode, opts)
   786         if textRot1:
   787             axis1 = axis1.transpose(Image.ROTATE_90)
   788         axis2 = axisImage(labelData2, rangePixBegs2, rangePixLens2,
   789                           textRot2, textRot2, font, image_mode, opts)
   790         if not textRot2:
   791             axis2 = axis2.transpose(Image.ROTATE_270)
   792         im.paste(axis1, (0, 0))
   793         im.paste(axis2, (0, 0))
   794 
   795     for i in rangePixBegs1[1:]:
   796         box = i - opts.border_pixels, tMargin, i, height
   797         im.paste(opts.border_color, box)
   798 
   799     for i in rangePixBegs2[1:]:
   800         box = lMargin, i - opts.border_pixels, width, i
   801         im.paste(opts.border_color, box)
   802 
   803     im.save(args[1])
   804 
   805 if __name__ == "__main__":
   806     usage = """%prog --help
   807    or: %prog [options] maf-or-tab-alignments dotplot.png
   808    or: %prog [options] maf-or-tab-alignments dotplot.gif
   809    or: ..."""
   810     description = "Draw a dotplot of pair-wise sequence alignments in MAF or tabular format."
   811     op = optparse.OptionParser(usage=usage, description=description)
   812     op.add_option("-v", "--verbose", action="count",
   813                   help="show progress messages & data about the plot")
   814     op.add_option("-1", "--seq1", metavar="PATTERN", action="append",
   815                   default=[],
   816                   help="which sequences to show from the 1st genome")
   817     op.add_option("-2", "--seq2", metavar="PATTERN", action="append",
   818                   default=[],
   819                   help="which sequences to show from the 2nd genome")
   820     # Replace "width" & "height" with a single "length" option?
   821     op.add_option("-x", "--width", type="int", default=1000,
   822                   help="maximum width in pixels (default: %default)")
   823     op.add_option("-y", "--height", type="int", default=1000,
   824                   help="maximum height in pixels (default: %default)")
   825     op.add_option("-c", "--forwardcolor", metavar="COLOR", default="red",
   826                   help="color for forward alignments (default: %default)")
   827     op.add_option("-r", "--reversecolor", metavar="COLOR", default="blue",
   828                   help="color for reverse alignments (default: %default)")
   829     op.add_option("--alignments", metavar="FILE", help="secondary alignments")
   830     op.add_option("--sort1", default="1", metavar="N",
   831                   help="genome1 sequence order: 0=input order, 1=name order, "
   832                   "2=length order, 3=alignment order (default=%default)")
   833     op.add_option("--sort2", default="1", metavar="N",
   834                   help="genome2 sequence order: 0=input order, 1=name order, "
   835                   "2=length order, 3=alignment order (default=%default)")
   836     op.add_option("--max-gap1", metavar="FRAC", default="0.5,2", help=
   837                   "maximum unaligned (end,mid) gap in genome1: "
   838                   "fraction of aligned length (default=%default)")
   839     op.add_option("--max-gap2", metavar="FRAC", default="0.5,2", help=
   840                   "maximum unaligned (end,mid) gap in genome2: "
   841                   "fraction of aligned length (default=%default)")
   842     op.add_option("--pad", metavar="FRAC", type="float", default=0.04, help=
   843                   "pad length when cutting unaligned gaps: "
   844                   "fraction of aligned length (default=%default)")
   845     op.add_option("--border-pixels", metavar="INT", type="int", default=1,
   846                   help="number of pixels between sequences (default=%default)")
   847     op.add_option("--border-color", metavar="COLOR", default="black",
   848                   help="color for pixels between sequences (default=%default)")
   849     # --break-color and/or --break-pixels for intra-sequence breaks?
   850     op.add_option("--margin-color", metavar="COLOR", default="#dcdcdc",
   851                   help="margin color")
   852 
   853     og = optparse.OptionGroup(op, "Text options")
   854     og.add_option("-f", "--fontfile", metavar="FILE",
   855                   help="TrueType or OpenType font file")
   856     og.add_option("-s", "--fontsize", metavar="SIZE", type="int", default=14,
   857                   help="TrueType or OpenType font size (default: %default)")
   858     og.add_option("--labels1", type="int", default=0, metavar="N", help=
   859                   "genome1 labels: 0=name, 1=name:length, "
   860                   "2=name:start:length, 3=name:start-end (default=%default)")
   861     og.add_option("--labels2", type="int", default=0, metavar="N", help=
   862                   "genome2 labels: 0=name, 1=name:length, "
   863                   "2=name:start:length, 3=name:start-end (default=%default)")
   864     og.add_option("--rot1", metavar="ROT", default="h",
   865                   help="text rotation for the 1st genome (default=%default)")
   866     og.add_option("--rot2", metavar="ROT", default="v",
   867                   help="text rotation for the 2nd genome (default=%default)")
   868     op.add_option_group(og)
   869 
   870     og = optparse.OptionGroup(op, "Annotation options")
   871     og.add_option("--bed1", metavar="FILE",
   872                   help="read genome1 annotations from BED file")
   873     og.add_option("--bed2", metavar="FILE",
   874                   help="read genome2 annotations from BED file")
   875     og.add_option("--rmsk1", metavar="FILE", help="read genome1 repeats from "
   876                   "RepeatMasker .out or rmsk.txt file")
   877     og.add_option("--rmsk2", metavar="FILE", help="read genome2 repeats from "
   878                   "RepeatMasker .out or rmsk.txt file")
   879     op.add_option_group(og)
   880 
   881     og = optparse.OptionGroup(op, "Gene options")
   882     og.add_option("--genePred1", metavar="FILE",
   883                   help="read genome1 genes from genePred file")
   884     og.add_option("--genePred2", metavar="FILE",
   885                   help="read genome2 genes from genePred file")
   886     og.add_option("--exon-color", metavar="COLOR", default="PaleGreen",
   887                   help="color for exons (default=%default)")
   888     og.add_option("--cds-color", metavar="COLOR", default="LimeGreen",
   889                   help="color for protein-coding regions (default=%default)")
   890     op.add_option_group(og)
   891 
   892     og = optparse.OptionGroup(op, "Unsequenced gap options")
   893     og.add_option("--gap1", metavar="FILE",
   894                   help="read genome1 unsequenced gaps from agp or gap file")
   895     og.add_option("--gap2", metavar="FILE",
   896                   help="read genome2 unsequenced gaps from agp or gap file")
   897     og.add_option("--bridged-color", metavar="COLOR", default="yellow",
   898                   help="color for bridged gaps (default: %default)")
   899     og.add_option("--unbridged-color", metavar="COLOR", default="orange",
   900                   help="color for unbridged gaps (default: %default)")
   901     op.add_option_group(og)
   902     (opts, args) = op.parse_args()
   903     if len(args) != 2: op.error("2 arguments needed")
   904 
   905     opts.text_color = "black"
   906     opts.background_color = "white"
   907     opts.label_space = 5     # minimum number of pixels between axis labels
   908 
   909     try: lastDotplot(opts, args)
   910     except KeyboardInterrupt: pass  # avoid silly error message
   911     except Exception as e:
   912         prog = os.path.basename(sys.argv[0])
   913         sys.exit(prog + ": error: " + str(e))