scripts/last-dotplot
author Martin C. Frith
Mon Oct 29 08:23:59 2018 +0900 (2018-10-29)
changeset 957 c54672c8892e
parent 942 82b69208c878
child 961 ed0fb9b1eb40
permissions -rwxr-xr-x
Fix gz reading for Python3
     1 #! /usr/bin/env python
     2 
     3 # Read pair-wise alignments in MAF or LAST tabular format: write an
     4 # "Oxford grid", a.k.a. dotplot.
     5 
     6 # TODO: Currently, pixels with zero aligned nt-pairs are white, and
     7 # pixels with one or more aligned nt-pairs are black.  This can look
     8 # too crowded for large genome alignments.  I tried shading each pixel
     9 # according to the number of aligned nt-pairs within it, but the
    10 # result is too faint.  How can this be done better?
    11 
    12 import collections
    13 import functools
    14 import gzip
    15 from fnmatch import fnmatchcase
    16 from operator import itemgetter
    17 import subprocess
    18 import itertools, optparse, os, re, sys
    19 
    20 # Try to make PIL/PILLOW work:
    21 try:
    22     from PIL import Image, ImageDraw, ImageFont, ImageColor
    23 except ImportError:
    24     import Image, ImageDraw, ImageFont, ImageColor
    25 
    26 try:
    27     from future_builtins import zip
    28 except ImportError:
    29     pass
    30 
    31 def myOpen(fileName):  # faster than fileinput
    32     if fileName is None:
    33         return []
    34     if fileName == "-":
    35         return sys.stdin
    36     if fileName.endswith(".gz"):
    37         return gzip.open(fileName, "rt")  # xxx dubious for Python2
    38     return open(fileName)
    39 
    40 def warn(message):
    41     if opts.verbose:
    42         prog = os.path.basename(sys.argv[0])
    43         sys.stderr.write(prog + ": " + message + "\n")
    44 
    45 def groupByFirstItem(things):
    46     for k, v in itertools.groupby(things, itemgetter(0)):
    47         yield k, [i[1:] for i in v]
    48 
    49 def croppedBlocks(blocks, ranges1, ranges2):
    50     headBeg1, headBeg2, headSize = blocks[0]
    51     for r1 in ranges1:
    52         for r2 in ranges2:
    53             cropBeg1, cropEnd1 = r1
    54             if headBeg1 < 0:
    55                 cropBeg1, cropEnd1 = -cropEnd1, -cropBeg1
    56             cropBeg2, cropEnd2 = r2
    57             if headBeg2 < 0:
    58                 cropBeg2, cropEnd2 = -cropEnd2, -cropBeg2
    59             for beg1, beg2, size in blocks:
    60                 b1 = max(cropBeg1, beg1)
    61                 e1 = min(cropEnd1, beg1 + size)
    62                 if b1 >= e1: continue
    63                 offset = beg2 - beg1
    64                 b2 = max(cropBeg2, b1 + offset)
    65                 e2 = min(cropEnd2, e1 + offset)
    66                 if b2 >= e2: continue
    67                 yield b2 - offset, b2, e2 - b2
    68 
    69 def tabBlocks(beg1, beg2, blocks):
    70     '''Get the gapless blocks of an alignment, from LAST tabular format.'''
    71     for i in blocks.split(","):
    72         if ":" in i:
    73             x, y = i.split(":")
    74             beg1 += int(x)
    75             beg2 += int(y)
    76         else:
    77             size = int(i)
    78             yield beg1, beg2, size
    79             beg1 += size
    80             beg2 += size
    81 
    82 def mafBlocks(beg1, beg2, seq1, seq2):
    83     '''Get the gapless blocks of an alignment, from MAF format.'''
    84     size = 0
    85     for x, y in zip(seq1, seq2):
    86         if x == "-":
    87             if size:
    88                 yield beg1, beg2, size
    89                 beg1 += size
    90                 beg2 += size
    91                 size = 0
    92             beg2 += 1
    93         elif y == "-":
    94             if size:
    95                 yield beg1, beg2, size
    96                 beg1 += size
    97                 beg2 += size
    98                 size = 0
    99             beg1 += 1
   100         else:
   101             size += 1
   102     if size: yield beg1, beg2, size
   103 
   104 def alignmentInput(lines):
   105     '''Get alignments and sequence lengths, from MAF or tabular format.'''
   106     mafCount = 0
   107     for line in lines:
   108         w = line.split()
   109         if line[0].isdigit():  # tabular format
   110             chr1, beg1, seqlen1 = w[1], int(w[2]), int(w[5])
   111             if w[4] == "-": beg1 -= seqlen1
   112             chr2, beg2, seqlen2 = w[6], int(w[7]), int(w[10])
   113             if w[9] == "-": beg2 -= seqlen2
   114             blocks = tabBlocks(beg1, beg2, w[11])
   115             yield chr1, seqlen1, chr2, seqlen2, blocks
   116         elif line[0] == "s":  # MAF format
   117             if mafCount == 0:
   118                 chr1, beg1, seqlen1, seq1 = w[1], int(w[2]), int(w[5]), w[6]
   119                 if w[4] == "-": beg1 -= seqlen1
   120                 mafCount = 1
   121             else:
   122                 chr2, beg2, seqlen2, seq2 = w[1], int(w[2]), int(w[5]), w[6]
   123                 if w[4] == "-": beg2 -= seqlen2
   124                 blocks = mafBlocks(beg1, beg2, seq1, seq2)
   125                 yield chr1, seqlen1, chr2, seqlen2, blocks
   126                 mafCount = 0
   127 
   128 def seqRequestFromText(text):
   129     if ":" in text:
   130         pattern, interval = text.rsplit(":", 1)
   131         if "-" in interval:
   132             beg, end = interval.rsplit("-", 1)
   133             return pattern, int(beg), int(end)  # beg may be negative
   134     return text, 0, sys.maxsize
   135 
   136 def rangesFromSeqName(seqRequests, name, seqLen):
   137     if seqRequests:
   138         base = name.split(".")[-1]  # allow for names like hg19.chr7
   139         for pat, beg, end in seqRequests:
   140             if fnmatchcase(name, pat) or fnmatchcase(base, pat):
   141                 yield max(beg, 0), min(end, seqLen)
   142     else:
   143         yield 0, seqLen
   144 
   145 def updateSeqs(coverDict, seqRanges, seqName, ranges, coveredRange):
   146     beg, end = coveredRange
   147     if beg < 0:
   148         coveredRange = -end, -beg
   149     if seqName in coverDict:
   150         coverDict[seqName].append(coveredRange)
   151     else:
   152         coverDict[seqName] = [coveredRange]
   153         for beg, end in ranges:
   154             r = seqName, beg, end
   155             seqRanges.append(r)
   156 
   157 def readAlignments(fileName, opts):
   158     '''Get alignments and sequence limits, from MAF or tabular format.'''
   159     seqRequests1 = [seqRequestFromText(i) for i in opts.seq1]
   160     seqRequests2 = [seqRequestFromText(i) for i in opts.seq2]
   161 
   162     alignments = []
   163     seqRanges1 = []
   164     seqRanges2 = []
   165     coverDict1 = {}
   166     coverDict2 = {}
   167     lines = myOpen(fileName)
   168     for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
   169         ranges1 = sorted(rangesFromSeqName(seqRequests1, seqName1, seqLen1))
   170         if not ranges1: continue
   171         ranges2 = sorted(rangesFromSeqName(seqRequests2, seqName2, seqLen2))
   172         if not ranges2: continue
   173         b = list(croppedBlocks(list(blocks), ranges1, ranges2))
   174         if not b: continue
   175         aln = seqName1, seqName2, b
   176         alignments.append(aln)
   177         coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
   178         updateSeqs(coverDict1, seqRanges1, seqName1, ranges1, coveredRange1)
   179         coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
   180         updateSeqs(coverDict2, seqRanges2, seqName2, ranges2, coveredRange2)
   181     return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
   182 
   183 def nameAndRangesFromDict(cropDict, seqName):
   184     if seqName in cropDict:
   185         return seqName, cropDict[seqName]
   186     n = seqName.split(".")[-1]
   187     if n in cropDict:
   188         return n, cropDict[n]
   189     return seqName, []
   190 
   191 def rangesForSecondaryAlignments(primaryRanges, seqLen):
   192     if primaryRanges:
   193         return primaryRanges
   194     return [(0, seqLen)]
   195 
   196 def readSecondaryAlignments(opts, cropRanges1, cropRanges2):
   197     cropDict1 = dict(groupByFirstItem(cropRanges1))
   198     cropDict2 = dict(groupByFirstItem(cropRanges2))
   199 
   200     alignments = []
   201     seqRanges1 = []
   202     seqRanges2 = []
   203     coverDict1 = {}
   204     coverDict2 = {}
   205     lines = myOpen(opts.alignments)
   206     for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
   207         seqName1, ranges1 = nameAndRangesFromDict(cropDict1, seqName1)
   208         seqName2, ranges2 = nameAndRangesFromDict(cropDict2, seqName2)
   209         if not ranges1 and not ranges2:
   210             continue
   211         r1 = rangesForSecondaryAlignments(ranges1, seqLen1)
   212         r2 = rangesForSecondaryAlignments(ranges2, seqLen2)
   213         b = list(croppedBlocks(list(blocks), r1, r2))
   214         if not b: continue
   215         aln = seqName1, seqName2, b
   216         alignments.append(aln)
   217         if not ranges1:
   218             coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
   219             updateSeqs(coverDict1, seqRanges1, seqName1, r1, coveredRange1)
   220         if not ranges2:
   221             coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
   222             updateSeqs(coverDict2, seqRanges2, seqName2, r2, coveredRange2)
   223     return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
   224 
   225 def twoValuesFromOption(text, separator):
   226     if separator in text:
   227         return text.split(separator)
   228     return text, text
   229 
   230 def mergedRanges(ranges):
   231     oldBeg, maxEnd = ranges[0]
   232     for beg, end in ranges:
   233         if beg > maxEnd:
   234             yield oldBeg, maxEnd
   235             oldBeg = beg
   236             maxEnd = end
   237         elif end > maxEnd:
   238             maxEnd = end
   239     yield oldBeg, maxEnd
   240 
   241 def mergedRangesPerSeq(coverDict):
   242     for k, v in coverDict.items():
   243         v.sort()
   244         yield k, list(mergedRanges(v))
   245 
   246 def coveredLength(mergedCoverDict):
   247     return sum(sum(e - b for b, e in v) for v in mergedCoverDict.values())
   248 
   249 def trimmed(seqRanges, coverDict, minAlignedBases, maxGapFrac, endPad, midPad):
   250     maxEndGapFrac, maxMidGapFrac = twoValuesFromOption(maxGapFrac, ",")
   251     maxEndGap = max(float(maxEndGapFrac) * minAlignedBases, endPad * 1.0)
   252     maxMidGap = max(float(maxMidGapFrac) * minAlignedBases, midPad * 2.0)
   253 
   254     for seqName, rangeBeg, rangeEnd in seqRanges:
   255         seqBlocks = coverDict[seqName]
   256         blocks = [i for i in seqBlocks if i[0] < rangeEnd and i[1] > rangeBeg]
   257         if blocks[0][0] - rangeBeg > maxEndGap:
   258             rangeBeg = blocks[0][0] - endPad
   259         for j, y in enumerate(blocks):
   260             if j:
   261                 x = blocks[j - 1]
   262                 if y[0] - x[1] > maxMidGap:
   263                     yield seqName, rangeBeg, x[1] + midPad
   264                     rangeBeg = y[0] - midPad
   265         if rangeEnd - blocks[-1][1] > maxEndGap:
   266             rangeEnd = blocks[-1][1] + endPad
   267         yield seqName, rangeBeg, rangeEnd
   268 
   269 def rangesWithStrandInfo(seqRanges, strandOpt, alignments, seqIndex):
   270     if strandOpt == "1":
   271         forwardMinusReverse = collections.defaultdict(int)
   272         for i in alignments:
   273             blocks = i[2]
   274             beg1, beg2, size = blocks[0]
   275             numOfAlignedLetterPairs = sum(i[2] for i in blocks)
   276             if (beg1 < 0) != (beg2 < 0):  # opposite-strand alignment
   277                 numOfAlignedLetterPairs *= -1
   278             forwardMinusReverse[i[seqIndex]] += numOfAlignedLetterPairs
   279     strandNum = 0
   280     for seqName, beg, end in seqRanges:
   281         if strandOpt == "1":
   282             strandNum = 1 if forwardMinusReverse[seqName] >= 0 else 2
   283         yield seqName, beg, end, strandNum
   284 
   285 def natural_sort_key(my_string):
   286     '''Return a sort key for "natural" ordering, e.g. chr9 < chr10.'''
   287     parts = re.split(r'(\d+)', my_string)
   288     parts[1::2] = map(int, parts[1::2])
   289     return parts
   290 
   291 def nameKey(oneSeqRanges):
   292     return natural_sort_key(oneSeqRanges[0][0])
   293 
   294 def sizeKey(oneSeqRanges):
   295     return sum(b - e for n, b, e, s in oneSeqRanges), nameKey(oneSeqRanges)
   296 
   297 def alignmentKey(seqNamesToLists, oneSeqRanges):
   298     seqName = oneSeqRanges[0][0]
   299     alignmentsOfThisSequence = seqNamesToLists[seqName]
   300     numOfAlignedLetterPairs = sum(i[3] for i in alignmentsOfThisSequence)
   301     toMiddle = numOfAlignedLetterPairs // 2
   302     for i in alignmentsOfThisSequence:
   303         toMiddle -= i[3]
   304         if toMiddle < 0:
   305             return i[1:3]  # sequence-rank and "position" of this alignment
   306 
   307 def rankAndFlipPerSeq(seqRanges):
   308     rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
   309     for rank, group in enumerate(rangesGroupedBySeqName):
   310         seqName, ranges = group
   311         strandNum = next(ranges)[3]
   312         flip = 1 if strandNum < 2 else -1
   313         yield seqName, (rank, flip)
   314 
   315 def alignmentSortData(alignments, seqIndex, otherNamesToRanksAndFlips):
   316     otherIndex = 1 - seqIndex
   317     for i in alignments:
   318         blocks = i[2]
   319         otherRank, otherFlip = otherNamesToRanksAndFlips[i[otherIndex]]
   320         otherPos = otherFlip * abs(blocks[0][otherIndex] +
   321                                    blocks[-1][otherIndex] + blocks[-1][2])
   322         numOfAlignedLetterPairs = sum(i[2] for i in blocks)
   323         yield i[seqIndex], otherRank, otherPos, numOfAlignedLetterPairs
   324 
   325 def mySortedRanges(seqRanges, sortOpt, seqIndex, alignments, otherRanges):
   326     rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
   327     g = [list(ranges) for seqName, ranges in rangesGroupedBySeqName]
   328     for i in g:
   329         if i[0][3] > 1:
   330             i.reverse()
   331     if sortOpt == "1":
   332         g.sort(key=nameKey)
   333     if sortOpt == "2":
   334         g.sort(key=sizeKey)
   335     if sortOpt == "3":
   336         otherNamesToRanksAndFlips = dict(rankAndFlipPerSeq(otherRanges))
   337         alns = sorted(alignmentSortData(alignments, seqIndex,
   338                                         otherNamesToRanksAndFlips))
   339         alnsGroupedBySeqName = itertools.groupby(alns, itemgetter(0))
   340         seqNamesToLists = dict((k, list(v)) for k, v in alnsGroupedBySeqName)
   341         g.sort(key=functools.partial(alignmentKey, seqNamesToLists))
   342     return [j for i in g for j in i]
   343 
   344 def allSortedRanges(opts, alignments, alignmentsB,
   345                     seqRanges1, seqRangesB1, seqRanges2, seqRangesB2):
   346     o1, oB1 = twoValuesFromOption(opts.strands1, ":")
   347     o2, oB2 = twoValuesFromOption(opts.strands2, ":")
   348     if o1 == "1" and o2 == "1":
   349         raise Exception("the strand options have circular dependency")
   350     seqRanges1 = list(rangesWithStrandInfo(seqRanges1, o1, alignments, 0))
   351     seqRanges2 = list(rangesWithStrandInfo(seqRanges2, o2, alignments, 1))
   352     seqRangesB1 = list(rangesWithStrandInfo(seqRangesB1, oB1, alignmentsB, 0))
   353     seqRangesB2 = list(rangesWithStrandInfo(seqRangesB2, oB2, alignmentsB, 1))
   354 
   355     o1, oB1 = twoValuesFromOption(opts.sort1, ":")
   356     o2, oB2 = twoValuesFromOption(opts.sort2, ":")
   357     if o1 == "3" and o2 == "3":
   358         raise Exception("the sort options have circular dependency")
   359     if o1 != "3":
   360         s1 = mySortedRanges(seqRanges1, o1, None, None, None)
   361     if o2 != "3":
   362         s2 = mySortedRanges(seqRanges2, o2, None, None, None)
   363     if o1 == "3":
   364         s1 = mySortedRanges(seqRanges1, o1, 0, alignments, s2)
   365     if o2 == "3":
   366         s2 = mySortedRanges(seqRanges2, o2, 1, alignments, s1)
   367     t1 = mySortedRanges(seqRangesB1, oB1, 0, alignmentsB, s2)
   368     t2 = mySortedRanges(seqRangesB2, oB2, 1, alignmentsB, s1)
   369     return s1 + t1, s2 + t2
   370 
   371 def prettyNum(n):
   372     t = str(n)
   373     groups = []
   374     while t:
   375         groups.append(t[-3:])
   376         t = t[:-3]
   377     return ",".join(reversed(groups))
   378 
   379 def sizeText(size):
   380     suffixes = "bp", "kb", "Mb", "Gb"
   381     for i, x in enumerate(suffixes):
   382         j = 10 ** (i * 3)
   383         if size < j * 10:
   384             return "%.2g" % (1.0 * size / j) + x
   385         if size < j * 1000 or i == len(suffixes) - 1:
   386             return "%.0f" % (1.0 * size / j) + x
   387 
   388 def labelText(seqRange, labelOpt):
   389     seqName, beg, end, strandNum = seqRange
   390     if labelOpt == 1:
   391         return seqName + ": " + sizeText(end - beg)
   392     if labelOpt == 2:
   393         return seqName + ":" + prettyNum(beg) + ": " + sizeText(end - beg)
   394     if labelOpt == 3:
   395         return seqName + ":" + prettyNum(beg) + "-" + prettyNum(end)
   396     return seqName
   397 
   398 def rangeLabels(seqRanges, labelOpt, font, fontsize, image_mode, textRot):
   399     if fontsize:
   400         image_size = 1, 1
   401         im = Image.new(image_mode, image_size)
   402         draw = ImageDraw.Draw(im)
   403     x = y = 0
   404     for r in seqRanges:
   405         text = labelText(r, labelOpt)
   406         if fontsize:
   407             x, y = draw.textsize(text, font=font)
   408             if textRot:
   409                 x, y = y, x
   410         yield text, x, y, r[3]
   411 
   412 def dataFromRanges(sortedRanges, font, fontSize, imageMode, labelOpt, textRot):
   413     for seqName, rangeBeg, rangeEnd, strandNum in sortedRanges:
   414         out = [seqName, str(rangeBeg), str(rangeEnd)]
   415         if strandNum > 0:
   416             out.append(".+-"[strandNum])
   417         warn("\t".join(out))
   418     warn("")
   419     rangeSizes = [e - b for n, b, e, s in sortedRanges]
   420     labs = list(rangeLabels(sortedRanges, labelOpt, font, fontSize,
   421                             imageMode, textRot))
   422     margin = max(i[2] for i in labs)
   423     # xxx the margin may be too big, because some labels may get omitted
   424     return rangeSizes, labs, margin
   425 
   426 def div_ceil(x, y):
   427     '''Return x / y rounded up.'''
   428     q, r = divmod(x, y)
   429     return q + (r != 0)
   430 
   431 def get_bp_per_pix(rangeSizes, pixTweenRanges, maxPixels):
   432     '''Get the minimum bp-per-pixel that fits in the size limit.'''
   433     warn("choosing bp per pixel...")
   434     numOfRanges = len(rangeSizes)
   435     maxPixelsInRanges = maxPixels - pixTweenRanges * (numOfRanges - 1)
   436     if maxPixelsInRanges < numOfRanges:
   437         raise Exception("can't fit the image: too many sequences?")
   438     negLimit = -maxPixelsInRanges
   439     negBpPerPix = sum(rangeSizes) // negLimit
   440     while True:
   441         if sum(i // negBpPerPix for i in rangeSizes) >= negLimit:
   442             return -negBpPerPix
   443         negBpPerPix -= 1
   444 
   445 def getRangePixBegs(rangePixLens, pixTweenRanges, margin):
   446     '''Get the start pixel for each range.'''
   447     rangePixBegs = []
   448     pix_tot = margin - pixTweenRanges
   449     for i in rangePixLens:
   450         pix_tot += pixTweenRanges
   451         rangePixBegs.append(pix_tot)
   452         pix_tot += i
   453     return rangePixBegs
   454 
   455 def pixelData(rangeSizes, bp_per_pix, pixTweenRanges, margin):
   456     '''Return pixel information about the ranges.'''
   457     rangePixLens = [div_ceil(i, bp_per_pix) for i in rangeSizes]
   458     rangePixBegs = getRangePixBegs(rangePixLens, pixTweenRanges, margin)
   459     tot_pix = rangePixBegs[-1] + rangePixLens[-1]
   460     return rangePixBegs, rangePixLens, tot_pix
   461 
   462 def drawLineForward(hits, width, bp_per_pix, beg1, beg2, size):
   463     while True:
   464         q1, r1 = divmod(beg1, bp_per_pix)
   465         q2, r2 = divmod(beg2, bp_per_pix)
   466         hits[q2 * width + q1] |= 1
   467         next_pix = min(bp_per_pix - r1, bp_per_pix - r2)
   468         if next_pix >= size: break
   469         beg1 += next_pix
   470         beg2 += next_pix
   471         size -= next_pix
   472 
   473 def drawLineReverse(hits, width, bp_per_pix, beg1, beg2, size):
   474     while True:
   475         q1, r1 = divmod(beg1, bp_per_pix)
   476         q2, r2 = divmod(beg2, bp_per_pix)
   477         hits[q2 * width + q1] |= 2
   478         next_pix = min(bp_per_pix - r1, r2 + 1)
   479         if next_pix >= size: break
   480         beg1 += next_pix
   481         beg2 -= next_pix
   482         size -= next_pix
   483 
   484 def strandAndOrigin(ranges, beg, size):
   485     isReverseStrand = (beg < 0)
   486     if isReverseStrand:
   487         beg = -(beg + size)
   488     for rangeBeg, rangeEnd, isReverseRange, origin in ranges:
   489         if rangeEnd > beg:  # assumes the ranges are sorted
   490             return (isReverseStrand != isReverseRange), origin
   491 
   492 def alignmentPixels(width, height, alignments, bp_per_pix,
   493                     rangeDict1, rangeDict2):
   494     hits = [0] * (width * height)  # the image data
   495     for seq1, seq2, blocks in alignments:
   496         beg1, beg2, size = blocks[0]
   497         isReverse1, ori1 = strandAndOrigin(rangeDict1[seq1], beg1, size)
   498         isReverse2, ori2 = strandAndOrigin(rangeDict2[seq2], beg2, size)
   499         for beg1, beg2, size in blocks:
   500             if isReverse1:
   501                 beg1 = -(beg1 + size)
   502                 beg2 = -(beg2 + size)
   503             if isReverse1 == isReverse2:
   504                 drawLineForward(hits, width, bp_per_pix,
   505                                 ori1 + beg1, ori2 + beg2, size)
   506             else:
   507                 drawLineReverse(hits, width, bp_per_pix,
   508                                 ori1 + beg1, ori2 - beg2 - 1, size)
   509     return hits
   510 
   511 def expandedSeqDict(seqDict):
   512     '''Allow lookup by short sequence names, e.g. chr7 as well as hg19.chr7.'''
   513     newDict = seqDict.copy()
   514     for name, x in seqDict.items():
   515         if "." in name:
   516             base = name.split(".")[-1]
   517             if base in newDict:  # an ambiguous case was found:
   518                 return seqDict   # so give up completely
   519             newDict[base] = x
   520     return newDict
   521 
   522 def readBed(fileName, rangeDict):
   523     for line in myOpen(fileName):
   524         w = line.split()
   525         if not w: continue
   526         seqName = w[0]
   527         if seqName not in rangeDict: continue
   528         beg = int(w[1])
   529         end = int(w[2])
   530         layer = 900
   531         color = "#fbf"
   532         if len(w) > 4:
   533             if w[4] != ".":
   534                 layer = float(w[4])
   535             if len(w) > 5:
   536                 if len(w) > 8 and w[8].count(",") == 2:
   537                     color = "rgb(" + w[8] + ")"
   538                 else:
   539                     strand = w[5]
   540                     isRev = rangeDict[seqName][0][2]
   541                     if strand == "+" and not isRev or strand == "-" and isRev:
   542                         color = "#ffe8e8"
   543                     if strand == "-" and not isRev or strand == "+" and isRev:
   544                         color = "#e8e8ff"
   545         yield layer, color, seqName, beg, end
   546 
   547 def commaSeparatedInts(text):
   548     return map(int, text.rstrip(",").split(","))
   549 
   550 def readGenePred(opts, fileName, rangeDict):
   551     for line in myOpen(fileName):
   552         fields = line.split()
   553         if not fields: continue
   554         if fields[2] not in "+-": fields = fields[1:]
   555         seqName = fields[1]
   556         if seqName not in rangeDict: continue
   557         #strand = fields[2]
   558         cdsBeg = int(fields[5])
   559         cdsEnd = int(fields[6])
   560         exonBegs = commaSeparatedInts(fields[8])
   561         exonEnds = commaSeparatedInts(fields[9])
   562         for beg, end in zip(exonBegs, exonEnds):
   563             yield 300, opts.exon_color, seqName, beg, end
   564             b = max(beg, cdsBeg)
   565             e = min(end, cdsEnd)
   566             if b < e: yield 400, opts.cds_color, seqName, b, e
   567 
   568 def readRmsk(fileName, rangeDict):
   569     for line in myOpen(fileName):
   570         fields = line.split()
   571         if len(fields) == 17:  # rmsk.txt
   572             seqName = fields[5]
   573             if seqName not in rangeDict: continue  # do this ASAP for speed
   574             beg = int(fields[6])
   575             end = int(fields[7])
   576             strand = fields[9]
   577             repeatClass = fields[11]
   578         elif len(fields) == 15:  # .out
   579             seqName = fields[4]
   580             if seqName not in rangeDict: continue
   581             beg = int(fields[5]) - 1
   582             end = int(fields[6])
   583             strand = fields[8]
   584             repeatClass = fields[10]
   585         else:
   586             continue
   587         if repeatClass in ("Low_complexity", "Simple_repeat"):
   588             yield 200, "#fbf", seqName, beg, end
   589         elif (strand == "+") != rangeDict[seqName][0][2]:
   590             yield 100, "#ffe8e8", seqName, beg, end
   591         else:
   592             yield 100, "#e8e8ff", seqName, beg, end
   593 
   594 def isExtraFirstGapField(fields):
   595     return fields[4].isdigit()
   596 
   597 def readGaps(opts, fileName, rangeDict):
   598     '''Read locations of unsequenced gaps, from an agp or gap file.'''
   599     for line in myOpen(fileName):
   600         w = line.split()
   601         if not w or w[0][0] == "#": continue
   602         if isExtraFirstGapField(w): w = w[1:]
   603         if w[4] not in "NU": continue
   604         seqName = w[0]
   605         if seqName not in rangeDict: continue
   606         end = int(w[2])
   607         beg = end - int(w[5])  # zero-based coordinate
   608         if w[7] == "yes":
   609             yield 3000, opts.bridged_color, seqName, beg, end
   610         else:
   611             yield 2000, opts.unbridged_color, seqName, beg, end
   612 
   613 def bedBoxes(beds, rangeDict, margin, edge, isTop, bpPerPix):
   614     for layer, color, seqName, bedBeg, bedEnd in beds:
   615         for rangeBeg, rangeEnd, isReverseRange, origin in rangeDict[seqName]:
   616             beg = max(bedBeg, rangeBeg)
   617             end = min(bedEnd, rangeEnd)
   618             if beg >= end: continue
   619             if isReverseRange:
   620                 beg, end = -end, -beg
   621             if layer <= 1000:
   622                 # include partly-covered pixels
   623                 b = (origin + beg) // bpPerPix
   624                 e = div_ceil(origin + end, bpPerPix)
   625             else:
   626                 # exclude partly-covered pixels
   627                 b = div_ceil(origin + beg, bpPerPix)
   628                 e = (origin + end) // bpPerPix
   629                 if e <= b: continue
   630                 if bedEnd >= rangeEnd:  # include partly-covered end pixels
   631                     if isReverseRange:
   632                         b = (origin + beg) // bpPerPix
   633                     else:
   634                         e = div_ceil(origin + end, bpPerPix)
   635             if isTop:
   636                 box = b, margin, e, edge
   637             else:
   638                 box = margin, b, edge, e
   639             yield layer, color, box
   640 
   641 def drawAnnotations(im, boxes):
   642     # xxx use partial transparency for different-color overlaps?
   643     for layer, color, box in boxes:
   644         im.paste(color, box)
   645 
   646 def placedLabels(labels, rangePixBegs, rangePixLens, beg, end):
   647     '''Return axis labels with endpoint & sort-order information.'''
   648     maxWidth = end - beg
   649     for i, j, k in zip(labels, rangePixBegs, rangePixLens):
   650         text, textWidth, textHeight, strandNum = i
   651         if textWidth > maxWidth:
   652             continue
   653         labelBeg = j + (k - textWidth) // 2
   654         labelEnd = labelBeg + textWidth
   655         sortKey = textWidth - k
   656         if labelBeg < beg:
   657             sortKey += maxWidth * (beg - labelBeg)
   658             labelBeg = beg
   659             labelEnd = beg + textWidth
   660         if labelEnd > end:
   661             sortKey += maxWidth * (labelEnd - end)
   662             labelEnd = end
   663             labelBeg = end - textWidth
   664         yield sortKey, labelBeg, labelEnd, text, textHeight, strandNum
   665 
   666 def nonoverlappingLabels(labels, minPixTweenLabels):
   667     '''Get a subset of non-overlapping axis labels, greedily.'''
   668     out = []
   669     for i in labels:
   670         beg = i[1] - minPixTweenLabels
   671         end = i[2] + minPixTweenLabels
   672         if all(j[2] <= beg or j[1] >= end for j in out):
   673             out.append(i)
   674     return out
   675 
   676 def axisImage(labels, rangePixBegs, rangePixLens, textRot,
   677               textAln, font, image_mode, opts):
   678     '''Make an image of axis labels.'''
   679     beg = rangePixBegs[0]
   680     end = rangePixBegs[-1] + rangePixLens[-1]
   681     margin = max(i[2] for i in labels)
   682     labels = sorted(placedLabels(labels, rangePixBegs, rangePixLens, beg, end))
   683     minPixTweenLabels = 0 if textRot else opts.label_space
   684     labels = nonoverlappingLabels(labels, minPixTweenLabels)
   685     image_size = (margin, end) if textRot else (end, margin)
   686     im = Image.new(image_mode, image_size, opts.margin_color)
   687     draw = ImageDraw.Draw(im)
   688     for sortKey, labelBeg, labelEnd, text, textHeight, strandNum in labels:
   689         base = margin - textHeight if textAln else 0
   690         position = (base, labelBeg) if textRot else (labelBeg, base)
   691         fill = ("black", opts.forwardcolor, opts.reversecolor)[strandNum]
   692         draw.text(position, text, font=font, fill=fill)
   693     return im
   694 
   695 def rangesWithOrigins(sortedRanges, rangePixBegs, rangePixLens, bpPerPix):
   696     for i, j, k in zip(sortedRanges, rangePixBegs, rangePixLens):
   697         seqName, rangeBeg, rangeEnd, strandNum = i
   698         isReverseRange = (strandNum > 1)
   699         if isReverseRange:
   700             origin = bpPerPix * (j + k) + rangeBeg
   701         else:
   702             origin = bpPerPix * j - rangeBeg
   703         yield seqName, (rangeBeg, rangeEnd, isReverseRange, origin)
   704 
   705 def rangesPerSeq(sortedRanges, rangePixBegs, rangePixLens, bpPerPix):
   706     a = rangesWithOrigins(sortedRanges, rangePixBegs, rangePixLens, bpPerPix)
   707     for k, v in itertools.groupby(a, itemgetter(0)):
   708         yield k, sorted(i[1] for i in v)
   709 
   710 def getFont(opts):
   711     if opts.fontfile:
   712         return ImageFont.truetype(opts.fontfile, opts.fontsize)
   713     fileNames = []
   714     try:
   715         x = ["fc-match", "-f%{file}", "arial"]
   716         p = subprocess.Popen(x, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
   717                              universal_newlines=True)
   718         out, err = p.communicate()
   719         fileNames.append(out)
   720     except OSError as e:
   721         warn("fc-match error: " + str(e))
   722     fileNames.append("/Library/Fonts/Arial.ttf")  # for Mac
   723     for i in fileNames:
   724         try:
   725             font = ImageFont.truetype(i, opts.fontsize)
   726             warn("font: " + i)
   727             return font
   728         except IOError as e:
   729             warn("font load error: " + str(e))
   730     return ImageFont.load_default()
   731 
   732 def lastDotplot(opts, args):
   733     font = getFont(opts)
   734     image_mode = 'RGB'
   735     forward_color = ImageColor.getcolor(opts.forwardcolor, image_mode)
   736     reverse_color = ImageColor.getcolor(opts.reversecolor, image_mode)
   737     zipped_colors = zip(forward_color, reverse_color)
   738     overlap_color = tuple([(i + j) // 2 for i, j in zipped_colors])
   739 
   740     maxGap1, maxGapB1 = twoValuesFromOption(opts.max_gap1, ":")
   741     maxGap2, maxGapB2 = twoValuesFromOption(opts.max_gap2, ":")
   742 
   743     warn("reading alignments...")
   744     alnData = readAlignments(args[0], opts)
   745     alignments, seqRanges1, coverDict1, seqRanges2, coverDict2 = alnData
   746     if not alignments: raise Exception("there are no alignments")
   747     warn("cutting...")
   748     coverDict1 = dict(mergedRangesPerSeq(coverDict1))
   749     coverDict2 = dict(mergedRangesPerSeq(coverDict2))
   750     minAlignedBases = min(coveredLength(coverDict1), coveredLength(coverDict2))
   751     pad = int(opts.pad * minAlignedBases)
   752     cutRanges1 = list(trimmed(seqRanges1, coverDict1, minAlignedBases,
   753                               maxGap1, pad, pad))
   754     cutRanges2 = list(trimmed(seqRanges2, coverDict2, minAlignedBases,
   755                               maxGap2, pad, pad))
   756 
   757     warn("reading secondary alignments...")
   758     alnDataB = readSecondaryAlignments(opts, cutRanges1, cutRanges2)
   759     alignmentsB, seqRangesB1, coverDictB1, seqRangesB2, coverDictB2 = alnDataB
   760     warn("cutting...")
   761     coverDictB1 = dict(mergedRangesPerSeq(coverDictB1))
   762     coverDictB2 = dict(mergedRangesPerSeq(coverDictB2))
   763     cutRangesB1 = trimmed(seqRangesB1, coverDictB1, minAlignedBases,
   764                           maxGapB1, 0, 0)
   765     cutRangesB2 = trimmed(seqRangesB2, coverDictB2, minAlignedBases,
   766                           maxGapB2, 0, 0)
   767 
   768     warn("sorting...")
   769     sortOut = allSortedRanges(opts, alignments, alignmentsB,
   770                               cutRanges1, cutRangesB1, cutRanges2, cutRangesB2)
   771     sortedRanges1, sortedRanges2 = sortOut
   772 
   773     textRot1 = "vertical".startswith(opts.rot1)
   774     i1 = dataFromRanges(sortedRanges1, font,
   775                         opts.fontsize, image_mode, opts.labels1, textRot1)
   776     rangeSizes1, labelData1, tMargin = i1
   777 
   778     textRot2 = "horizontal".startswith(opts.rot2)
   779     i2 = dataFromRanges(sortedRanges2, font,
   780                         opts.fontsize, image_mode, opts.labels2, textRot2)
   781     rangeSizes2, labelData2, lMargin = i2
   782 
   783     maxPixels1 = opts.width  - lMargin
   784     maxPixels2 = opts.height - tMargin
   785     bpPerPix1 = get_bp_per_pix(rangeSizes1, opts.border_pixels, maxPixels1)
   786     bpPerPix2 = get_bp_per_pix(rangeSizes2, opts.border_pixels, maxPixels2)
   787     bpPerPix = max(bpPerPix1, bpPerPix2)
   788     warn("bp per pixel = " + str(bpPerPix))
   789 
   790     p1 = pixelData(rangeSizes1, bpPerPix, opts.border_pixels, lMargin)
   791     rangePixBegs1, rangePixLens1, width = p1
   792     rangeDict1 = dict(rangesPerSeq(sortedRanges1, rangePixBegs1,
   793                                    rangePixLens1, bpPerPix))
   794 
   795     p2 = pixelData(rangeSizes2, bpPerPix, opts.border_pixels, tMargin)
   796     rangePixBegs2, rangePixLens2, height = p2
   797     rangeDict2 = dict(rangesPerSeq(sortedRanges2, rangePixBegs2,
   798                                    rangePixLens2, bpPerPix))
   799 
   800     warn("width:  " + str(width))
   801     warn("height: " + str(height))
   802 
   803     warn("processing alignments...")
   804     hits = alignmentPixels(width, height, alignments + alignmentsB, bpPerPix,
   805                            rangeDict1, rangeDict2)
   806 
   807     warn("reading annotations...")
   808 
   809     rangeDict1 = expandedSeqDict(rangeDict1)
   810     beds1 = itertools.chain(readBed(opts.bed1, rangeDict1),
   811                             readRmsk(opts.rmsk1, rangeDict1),
   812                             readGenePred(opts, opts.genePred1, rangeDict1),
   813                             readGaps(opts, opts.gap1, rangeDict1))
   814     b1 = bedBoxes(beds1, rangeDict1, tMargin, height, True, bpPerPix)
   815 
   816     rangeDict2 = expandedSeqDict(rangeDict2)
   817     beds2 = itertools.chain(readBed(opts.bed2, rangeDict2),
   818                             readRmsk(opts.rmsk2, rangeDict2),
   819                             readGenePred(opts, opts.genePred2, rangeDict2),
   820                             readGaps(opts, opts.gap2, rangeDict2))
   821     b2 = bedBoxes(beds2, rangeDict2, lMargin, width, False, bpPerPix)
   822 
   823     boxes = sorted(itertools.chain(b1, b2))
   824 
   825     warn("drawing...")
   826 
   827     image_size = width, height
   828     im = Image.new(image_mode, image_size, opts.background_color)
   829 
   830     drawAnnotations(im, boxes)
   831 
   832     for i in range(height):
   833         for j in range(width):
   834             store_value = hits[i * width + j]
   835             xy = j, i
   836             if   store_value == 1: im.putpixel(xy, forward_color)
   837             elif store_value == 2: im.putpixel(xy, reverse_color)
   838             elif store_value == 3: im.putpixel(xy, overlap_color)
   839 
   840     if opts.fontsize != 0:
   841         axis1 = axisImage(labelData1, rangePixBegs1, rangePixLens1,
   842                           textRot1, False, font, image_mode, opts)
   843         if textRot1:
   844             axis1 = axis1.transpose(Image.ROTATE_90)
   845         axis2 = axisImage(labelData2, rangePixBegs2, rangePixLens2,
   846                           textRot2, textRot2, font, image_mode, opts)
   847         if not textRot2:
   848             axis2 = axis2.transpose(Image.ROTATE_270)
   849         im.paste(axis1, (0, 0))
   850         im.paste(axis2, (0, 0))
   851 
   852     for i in rangePixBegs1[1:]:
   853         box = i - opts.border_pixels, tMargin, i, height
   854         im.paste(opts.border_color, box)
   855 
   856     for i in rangePixBegs2[1:]:
   857         box = lMargin, i - opts.border_pixels, width, i
   858         im.paste(opts.border_color, box)
   859 
   860     im.save(args[1])
   861 
   862 if __name__ == "__main__":
   863     usage = """%prog --help
   864    or: %prog [options] maf-or-tab-alignments dotplot.png
   865    or: %prog [options] maf-or-tab-alignments dotplot.gif
   866    or: ..."""
   867     description = "Draw a dotplot of pair-wise sequence alignments in MAF or tabular format."
   868     op = optparse.OptionParser(usage=usage, description=description)
   869     op.add_option("-v", "--verbose", action="count",
   870                   help="show progress messages & data about the plot")
   871     op.add_option("-1", "--seq1", metavar="PATTERN", action="append",
   872                   default=[],
   873                   help="which sequences to show from the 1st genome")
   874     op.add_option("-2", "--seq2", metavar="PATTERN", action="append",
   875                   default=[],
   876                   help="which sequences to show from the 2nd genome")
   877     # Replace "width" & "height" with a single "length" option?
   878     op.add_option("-x", "--width", type="int", default=1000,
   879                   help="maximum width in pixels (default: %default)")
   880     op.add_option("-y", "--height", type="int", default=1000,
   881                   help="maximum height in pixels (default: %default)")
   882     op.add_option("-c", "--forwardcolor", metavar="COLOR", default="red",
   883                   help="color for forward alignments (default: %default)")
   884     op.add_option("-r", "--reversecolor", metavar="COLOR", default="blue",
   885                   help="color for reverse alignments (default: %default)")
   886     op.add_option("--alignments", metavar="FILE", help="secondary alignments")
   887     op.add_option("--sort1", default="1", metavar="N",
   888                   help="genome1 sequence order: 0=input order, 1=name order, "
   889                   "2=length order, 3=alignment order (default=%default)")
   890     op.add_option("--sort2", default="1", metavar="N",
   891                   help="genome2 sequence order: 0=input order, 1=name order, "
   892                   "2=length order, 3=alignment order (default=%default)")
   893     op.add_option("--strands1", default="0", metavar="N", help=
   894                   "genome1 sequence orientation: 0=forward orientation, "
   895                   "1=alignment orientation (default=%default)")
   896     op.add_option("--strands2", default="0", metavar="N", help=
   897                   "genome2 sequence orientation: 0=forward orientation, "
   898                   "1=alignment orientation (default=%default)")
   899     op.add_option("--max-gap1", metavar="FRAC", default="0.5,2", help=
   900                   "maximum unaligned (end,mid) gap in genome1: "
   901                   "fraction of aligned length (default=%default)")
   902     op.add_option("--max-gap2", metavar="FRAC", default="0.5,2", help=
   903                   "maximum unaligned (end,mid) gap in genome2: "
   904                   "fraction of aligned length (default=%default)")
   905     op.add_option("--pad", metavar="FRAC", type="float", default=0.04, help=
   906                   "pad length when cutting unaligned gaps: "
   907                   "fraction of aligned length (default=%default)")
   908     op.add_option("--border-pixels", metavar="INT", type="int", default=1,
   909                   help="number of pixels between sequences (default=%default)")
   910     op.add_option("--border-color", metavar="COLOR", default="black",
   911                   help="color for pixels between sequences (default=%default)")
   912     # --break-color and/or --break-pixels for intra-sequence breaks?
   913     op.add_option("--margin-color", metavar="COLOR", default="#dcdcdc",
   914                   help="margin color")
   915 
   916     og = optparse.OptionGroup(op, "Text options")
   917     og.add_option("-f", "--fontfile", metavar="FILE",
   918                   help="TrueType or OpenType font file")
   919     og.add_option("-s", "--fontsize", metavar="SIZE", type="int", default=14,
   920                   help="TrueType or OpenType font size (default: %default)")
   921     og.add_option("--labels1", type="int", default=0, metavar="N", help=
   922                   "genome1 labels: 0=name, 1=name:length, "
   923                   "2=name:start:length, 3=name:start-end (default=%default)")
   924     og.add_option("--labels2", type="int", default=0, metavar="N", help=
   925                   "genome2 labels: 0=name, 1=name:length, "
   926                   "2=name:start:length, 3=name:start-end (default=%default)")
   927     og.add_option("--rot1", metavar="ROT", default="h",
   928                   help="text rotation for the 1st genome (default=%default)")
   929     og.add_option("--rot2", metavar="ROT", default="v",
   930                   help="text rotation for the 2nd genome (default=%default)")
   931     op.add_option_group(og)
   932 
   933     og = optparse.OptionGroup(op, "Annotation options")
   934     og.add_option("--bed1", metavar="FILE",
   935                   help="read genome1 annotations from BED file")
   936     og.add_option("--bed2", metavar="FILE",
   937                   help="read genome2 annotations from BED file")
   938     og.add_option("--rmsk1", metavar="FILE", help="read genome1 repeats from "
   939                   "RepeatMasker .out or rmsk.txt file")
   940     og.add_option("--rmsk2", metavar="FILE", help="read genome2 repeats from "
   941                   "RepeatMasker .out or rmsk.txt file")
   942     op.add_option_group(og)
   943 
   944     og = optparse.OptionGroup(op, "Gene options")
   945     og.add_option("--genePred1", metavar="FILE",
   946                   help="read genome1 genes from genePred file")
   947     og.add_option("--genePred2", metavar="FILE",
   948                   help="read genome2 genes from genePred file")
   949     og.add_option("--exon-color", metavar="COLOR", default="PaleGreen",
   950                   help="color for exons (default=%default)")
   951     og.add_option("--cds-color", metavar="COLOR", default="LimeGreen",
   952                   help="color for protein-coding regions (default=%default)")
   953     op.add_option_group(og)
   954 
   955     og = optparse.OptionGroup(op, "Unsequenced gap options")
   956     og.add_option("--gap1", metavar="FILE",
   957                   help="read genome1 unsequenced gaps from agp or gap file")
   958     og.add_option("--gap2", metavar="FILE",
   959                   help="read genome2 unsequenced gaps from agp or gap file")
   960     og.add_option("--bridged-color", metavar="COLOR", default="yellow",
   961                   help="color for bridged gaps (default: %default)")
   962     og.add_option("--unbridged-color", metavar="COLOR", default="orange",
   963                   help="color for unbridged gaps (default: %default)")
   964     op.add_option_group(og)
   965     (opts, args) = op.parse_args()
   966     if len(args) != 2: op.error("2 arguments needed")
   967 
   968     opts.background_color = "white"
   969     opts.label_space = 5     # minimum number of pixels between axis labels
   970 
   971     try: lastDotplot(opts, args)
   972     except KeyboardInterrupt: pass  # avoid silly error message
   973     except Exception as e:
   974         prog = os.path.basename(sys.argv[0])
   975         sys.exit(prog + ": error: " + str(e))