scripts/last-dotplot
author Martin C. Frith
Mon Dec 10 08:23:09 2018 +0900 (2018-12-10)
changeset 961 ed0fb9b1eb40
parent 957 c54672c8892e
child 980 27c26371226b
permissions -rwxr-xr-x
Add last-dotplot --maxseqs option
     1 #! /usr/bin/env python
     2 
     3 # Read pair-wise alignments in MAF or LAST tabular format: write an
     4 # "Oxford grid", a.k.a. dotplot.
     5 
     6 # TODO: Currently, pixels with zero aligned nt-pairs are white, and
     7 # pixels with one or more aligned nt-pairs are black.  This can look
     8 # too crowded for large genome alignments.  I tried shading each pixel
     9 # according to the number of aligned nt-pairs within it, but the
    10 # result is too faint.  How can this be done better?
    11 
    12 import collections
    13 import functools
    14 import gzip
    15 from fnmatch import fnmatchcase
    16 import logging
    17 from operator import itemgetter
    18 import subprocess
    19 import itertools, optparse, os, re, sys
    20 
    21 # Try to make PIL/PILLOW work:
    22 try:
    23     from PIL import Image, ImageDraw, ImageFont, ImageColor
    24 except ImportError:
    25     import Image, ImageDraw, ImageFont, ImageColor
    26 
    27 try:
    28     from future_builtins import zip
    29 except ImportError:
    30     pass
    31 
    32 def myOpen(fileName):  # faster than fileinput
    33     if fileName is None:
    34         return []
    35     if fileName == "-":
    36         return sys.stdin
    37     if fileName.endswith(".gz"):
    38         return gzip.open(fileName, "rt")  # xxx dubious for Python2
    39     return open(fileName)
    40 
    41 def groupByFirstItem(things):
    42     for k, v in itertools.groupby(things, itemgetter(0)):
    43         yield k, [i[1:] for i in v]
    44 
    45 def croppedBlocks(blocks, ranges1, ranges2):
    46     headBeg1, headBeg2, headSize = blocks[0]
    47     for r1 in ranges1:
    48         for r2 in ranges2:
    49             cropBeg1, cropEnd1 = r1
    50             if headBeg1 < 0:
    51                 cropBeg1, cropEnd1 = -cropEnd1, -cropBeg1
    52             cropBeg2, cropEnd2 = r2
    53             if headBeg2 < 0:
    54                 cropBeg2, cropEnd2 = -cropEnd2, -cropBeg2
    55             for beg1, beg2, size in blocks:
    56                 b1 = max(cropBeg1, beg1)
    57                 e1 = min(cropEnd1, beg1 + size)
    58                 if b1 >= e1: continue
    59                 offset = beg2 - beg1
    60                 b2 = max(cropBeg2, b1 + offset)
    61                 e2 = min(cropEnd2, e1 + offset)
    62                 if b2 >= e2: continue
    63                 yield b2 - offset, b2, e2 - b2
    64 
    65 def tabBlocks(beg1, beg2, blocks):
    66     '''Get the gapless blocks of an alignment, from LAST tabular format.'''
    67     for i in blocks.split(","):
    68         if ":" in i:
    69             x, y = i.split(":")
    70             beg1 += int(x)
    71             beg2 += int(y)
    72         else:
    73             size = int(i)
    74             yield beg1, beg2, size
    75             beg1 += size
    76             beg2 += size
    77 
    78 def mafBlocks(beg1, beg2, seq1, seq2):
    79     '''Get the gapless blocks of an alignment, from MAF format.'''
    80     size = 0
    81     for x, y in zip(seq1, seq2):
    82         if x == "-":
    83             if size:
    84                 yield beg1, beg2, size
    85                 beg1 += size
    86                 beg2 += size
    87                 size = 0
    88             beg2 += 1
    89         elif y == "-":
    90             if size:
    91                 yield beg1, beg2, size
    92                 beg1 += size
    93                 beg2 += size
    94                 size = 0
    95             beg1 += 1
    96         else:
    97             size += 1
    98     if size: yield beg1, beg2, size
    99 
   100 def alignmentInput(lines):
   101     '''Get alignments and sequence lengths, from MAF or tabular format.'''
   102     mafCount = 0
   103     for line in lines:
   104         w = line.split()
   105         if line[0].isdigit():  # tabular format
   106             chr1, beg1, seqlen1 = w[1], int(w[2]), int(w[5])
   107             if w[4] == "-": beg1 -= seqlen1
   108             chr2, beg2, seqlen2 = w[6], int(w[7]), int(w[10])
   109             if w[9] == "-": beg2 -= seqlen2
   110             blocks = tabBlocks(beg1, beg2, w[11])
   111             yield chr1, seqlen1, chr2, seqlen2, blocks
   112         elif line[0] == "s":  # MAF format
   113             if mafCount == 0:
   114                 chr1, beg1, seqlen1, seq1 = w[1], int(w[2]), int(w[5]), w[6]
   115                 if w[4] == "-": beg1 -= seqlen1
   116                 mafCount = 1
   117             else:
   118                 chr2, beg2, seqlen2, seq2 = w[1], int(w[2]), int(w[5]), w[6]
   119                 if w[4] == "-": beg2 -= seqlen2
   120                 blocks = mafBlocks(beg1, beg2, seq1, seq2)
   121                 yield chr1, seqlen1, chr2, seqlen2, blocks
   122                 mafCount = 0
   123 
   124 def seqRequestFromText(text):
   125     if ":" in text:
   126         pattern, interval = text.rsplit(":", 1)
   127         if "-" in interval:
   128             beg, end = interval.rsplit("-", 1)
   129             return pattern, int(beg), int(end)  # beg may be negative
   130     return text, 0, sys.maxsize
   131 
   132 def rangesFromSeqName(seqRequests, name, seqLen):
   133     if seqRequests:
   134         base = name.split(".")[-1]  # allow for names like hg19.chr7
   135         for pat, beg, end in seqRequests:
   136             if fnmatchcase(name, pat) or fnmatchcase(base, pat):
   137                 yield max(beg, 0), min(end, seqLen)
   138     else:
   139         yield 0, seqLen
   140 
   141 def updateSeqs(coverDict, seqRanges, seqName, ranges, coveredRange):
   142     beg, end = coveredRange
   143     if beg < 0:
   144         coveredRange = -end, -beg
   145     if seqName in coverDict:
   146         coverDict[seqName].append(coveredRange)
   147     else:
   148         coverDict[seqName] = [coveredRange]
   149         for beg, end in ranges:
   150             r = seqName, beg, end
   151             seqRanges.append(r)
   152 
   153 def readAlignments(fileName, opts):
   154     '''Get alignments and sequence limits, from MAF or tabular format.'''
   155     seqRequests1 = [seqRequestFromText(i) for i in opts.seq1]
   156     seqRequests2 = [seqRequestFromText(i) for i in opts.seq2]
   157 
   158     alignments = []
   159     seqRanges1 = []
   160     seqRanges2 = []
   161     coverDict1 = {}
   162     coverDict2 = {}
   163     lines = myOpen(fileName)
   164     for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
   165         ranges1 = sorted(rangesFromSeqName(seqRequests1, seqName1, seqLen1))
   166         if not ranges1: continue
   167         ranges2 = sorted(rangesFromSeqName(seqRequests2, seqName2, seqLen2))
   168         if not ranges2: continue
   169         b = list(croppedBlocks(list(blocks), ranges1, ranges2))
   170         if not b: continue
   171         aln = seqName1, seqName2, b
   172         alignments.append(aln)
   173         coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
   174         updateSeqs(coverDict1, seqRanges1, seqName1, ranges1, coveredRange1)
   175         coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
   176         updateSeqs(coverDict2, seqRanges2, seqName2, ranges2, coveredRange2)
   177     return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
   178 
   179 def nameAndRangesFromDict(cropDict, seqName):
   180     if seqName in cropDict:
   181         return seqName, cropDict[seqName]
   182     n = seqName.split(".")[-1]
   183     if n in cropDict:
   184         return n, cropDict[n]
   185     return seqName, []
   186 
   187 def rangesForSecondaryAlignments(primaryRanges, seqLen):
   188     if primaryRanges:
   189         return primaryRanges
   190     return [(0, seqLen)]
   191 
   192 def readSecondaryAlignments(opts, cropRanges1, cropRanges2):
   193     cropDict1 = dict(groupByFirstItem(cropRanges1))
   194     cropDict2 = dict(groupByFirstItem(cropRanges2))
   195 
   196     alignments = []
   197     seqRanges1 = []
   198     seqRanges2 = []
   199     coverDict1 = {}
   200     coverDict2 = {}
   201     lines = myOpen(opts.alignments)
   202     for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
   203         seqName1, ranges1 = nameAndRangesFromDict(cropDict1, seqName1)
   204         seqName2, ranges2 = nameAndRangesFromDict(cropDict2, seqName2)
   205         if not ranges1 and not ranges2:
   206             continue
   207         r1 = rangesForSecondaryAlignments(ranges1, seqLen1)
   208         r2 = rangesForSecondaryAlignments(ranges2, seqLen2)
   209         b = list(croppedBlocks(list(blocks), r1, r2))
   210         if not b: continue
   211         aln = seqName1, seqName2, b
   212         alignments.append(aln)
   213         if not ranges1:
   214             coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
   215             updateSeqs(coverDict1, seqRanges1, seqName1, r1, coveredRange1)
   216         if not ranges2:
   217             coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
   218             updateSeqs(coverDict2, seqRanges2, seqName2, r2, coveredRange2)
   219     return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
   220 
   221 def twoValuesFromOption(text, separator):
   222     if separator in text:
   223         return text.split(separator)
   224     return text, text
   225 
   226 def mergedRanges(ranges):
   227     oldBeg, maxEnd = ranges[0]
   228     for beg, end in ranges:
   229         if beg > maxEnd:
   230             yield oldBeg, maxEnd
   231             oldBeg = beg
   232             maxEnd = end
   233         elif end > maxEnd:
   234             maxEnd = end
   235     yield oldBeg, maxEnd
   236 
   237 def mergedRangesPerSeq(coverDict):
   238     for k, v in coverDict.items():
   239         v.sort()
   240         yield k, list(mergedRanges(v))
   241 
   242 def coveredLength(mergedCoverDict):
   243     return sum(sum(e - b for b, e in v) for v in mergedCoverDict.values())
   244 
   245 def trimmed(seqRanges, coverDict, minAlignedBases, maxGapFrac, endPad, midPad):
   246     maxEndGapFrac, maxMidGapFrac = twoValuesFromOption(maxGapFrac, ",")
   247     maxEndGap = max(float(maxEndGapFrac) * minAlignedBases, endPad * 1.0)
   248     maxMidGap = max(float(maxMidGapFrac) * minAlignedBases, midPad * 2.0)
   249 
   250     for seqName, rangeBeg, rangeEnd in seqRanges:
   251         seqBlocks = coverDict[seqName]
   252         blocks = [i for i in seqBlocks if i[0] < rangeEnd and i[1] > rangeBeg]
   253         if blocks[0][0] - rangeBeg > maxEndGap:
   254             rangeBeg = blocks[0][0] - endPad
   255         for j, y in enumerate(blocks):
   256             if j:
   257                 x = blocks[j - 1]
   258                 if y[0] - x[1] > maxMidGap:
   259                     yield seqName, rangeBeg, x[1] + midPad
   260                     rangeBeg = y[0] - midPad
   261         if rangeEnd - blocks[-1][1] > maxEndGap:
   262             rangeEnd = blocks[-1][1] + endPad
   263         yield seqName, rangeBeg, rangeEnd
   264 
   265 def rangesWithStrandInfo(seqRanges, strandOpt, alignments, seqIndex):
   266     if strandOpt == "1":
   267         forwardMinusReverse = collections.defaultdict(int)
   268         for i in alignments:
   269             blocks = i[2]
   270             beg1, beg2, size = blocks[0]
   271             numOfAlignedLetterPairs = sum(i[2] for i in blocks)
   272             if (beg1 < 0) != (beg2 < 0):  # opposite-strand alignment
   273                 numOfAlignedLetterPairs *= -1
   274             forwardMinusReverse[i[seqIndex]] += numOfAlignedLetterPairs
   275     strandNum = 0
   276     for seqName, beg, end in seqRanges:
   277         if strandOpt == "1":
   278             strandNum = 1 if forwardMinusReverse[seqName] >= 0 else 2
   279         yield seqName, beg, end, strandNum
   280 
   281 def natural_sort_key(my_string):
   282     '''Return a sort key for "natural" ordering, e.g. chr9 < chr10.'''
   283     parts = re.split(r'(\d+)', my_string)
   284     parts[1::2] = map(int, parts[1::2])
   285     return parts
   286 
   287 def nameKey(oneSeqRanges):
   288     return natural_sort_key(oneSeqRanges[0][0])
   289 
   290 def sizeKey(oneSeqRanges):
   291     return sum(b - e for n, b, e, s in oneSeqRanges), nameKey(oneSeqRanges)
   292 
   293 def alignmentKey(seqNamesToLists, oneSeqRanges):
   294     seqName = oneSeqRanges[0][0]
   295     alignmentsOfThisSequence = seqNamesToLists[seqName]
   296     numOfAlignedLetterPairs = sum(i[3] for i in alignmentsOfThisSequence)
   297     toMiddle = numOfAlignedLetterPairs // 2
   298     for i in alignmentsOfThisSequence:
   299         toMiddle -= i[3]
   300         if toMiddle < 0:
   301             return i[1:3]  # sequence-rank and "position" of this alignment
   302 
   303 def rankAndFlipPerSeq(seqRanges):
   304     rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
   305     for rank, group in enumerate(rangesGroupedBySeqName):
   306         seqName, ranges = group
   307         strandNum = next(ranges)[3]
   308         flip = 1 if strandNum < 2 else -1
   309         yield seqName, (rank, flip)
   310 
   311 def alignmentSortData(alignments, seqIndex, otherNamesToRanksAndFlips):
   312     otherIndex = 1 - seqIndex
   313     for i in alignments:
   314         blocks = i[2]
   315         otherRank, otherFlip = otherNamesToRanksAndFlips[i[otherIndex]]
   316         otherPos = otherFlip * abs(blocks[0][otherIndex] +
   317                                    blocks[-1][otherIndex] + blocks[-1][2])
   318         numOfAlignedLetterPairs = sum(i[2] for i in blocks)
   319         yield i[seqIndex], otherRank, otherPos, numOfAlignedLetterPairs
   320 
   321 def mySortedRanges(seqRanges, sortOpt, seqIndex, alignments, otherRanges):
   322     rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
   323     g = [list(ranges) for seqName, ranges in rangesGroupedBySeqName]
   324     for i in g:
   325         if i[0][3] > 1:
   326             i.reverse()
   327     if sortOpt == "1":
   328         g.sort(key=nameKey)
   329     if sortOpt == "2":
   330         g.sort(key=sizeKey)
   331     if sortOpt == "3":
   332         otherNamesToRanksAndFlips = dict(rankAndFlipPerSeq(otherRanges))
   333         alns = sorted(alignmentSortData(alignments, seqIndex,
   334                                         otherNamesToRanksAndFlips))
   335         alnsGroupedBySeqName = itertools.groupby(alns, itemgetter(0))
   336         seqNamesToLists = dict((k, list(v)) for k, v in alnsGroupedBySeqName)
   337         g.sort(key=functools.partial(alignmentKey, seqNamesToLists))
   338     return [j for i in g for j in i]
   339 
   340 def allSortedRanges(opts, alignments, alignmentsB,
   341                     seqRanges1, seqRangesB1, seqRanges2, seqRangesB2):
   342     o1, oB1 = twoValuesFromOption(opts.strands1, ":")
   343     o2, oB2 = twoValuesFromOption(opts.strands2, ":")
   344     if o1 == "1" and o2 == "1":
   345         raise Exception("the strand options have circular dependency")
   346     seqRanges1 = list(rangesWithStrandInfo(seqRanges1, o1, alignments, 0))
   347     seqRanges2 = list(rangesWithStrandInfo(seqRanges2, o2, alignments, 1))
   348     seqRangesB1 = list(rangesWithStrandInfo(seqRangesB1, oB1, alignmentsB, 0))
   349     seqRangesB2 = list(rangesWithStrandInfo(seqRangesB2, oB2, alignmentsB, 1))
   350 
   351     o1, oB1 = twoValuesFromOption(opts.sort1, ":")
   352     o2, oB2 = twoValuesFromOption(opts.sort2, ":")
   353     if o1 == "3" and o2 == "3":
   354         raise Exception("the sort options have circular dependency")
   355     if o1 != "3":
   356         s1 = mySortedRanges(seqRanges1, o1, None, None, None)
   357     if o2 != "3":
   358         s2 = mySortedRanges(seqRanges2, o2, None, None, None)
   359     if o1 == "3":
   360         s1 = mySortedRanges(seqRanges1, o1, 0, alignments, s2)
   361     if o2 == "3":
   362         s2 = mySortedRanges(seqRanges2, o2, 1, alignments, s1)
   363     t1 = mySortedRanges(seqRangesB1, oB1, 0, alignmentsB, s2)
   364     t2 = mySortedRanges(seqRangesB2, oB2, 1, alignmentsB, s1)
   365     return s1 + t1, s2 + t2
   366 
   367 def prettyNum(n):
   368     t = str(n)
   369     groups = []
   370     while t:
   371         groups.append(t[-3:])
   372         t = t[:-3]
   373     return ",".join(reversed(groups))
   374 
   375 def sizeText(size):
   376     suffixes = "bp", "kb", "Mb", "Gb"
   377     for i, x in enumerate(suffixes):
   378         j = 10 ** (i * 3)
   379         if size < j * 10:
   380             return "%.2g" % (1.0 * size / j) + x
   381         if size < j * 1000 or i == len(suffixes) - 1:
   382             return "%.0f" % (1.0 * size / j) + x
   383 
   384 def labelText(seqRange, labelOpt):
   385     seqName, beg, end, strandNum = seqRange
   386     if labelOpt == 1:
   387         return seqName + ": " + sizeText(end - beg)
   388     if labelOpt == 2:
   389         return seqName + ":" + prettyNum(beg) + ": " + sizeText(end - beg)
   390     if labelOpt == 3:
   391         return seqName + ":" + prettyNum(beg) + "-" + prettyNum(end)
   392     return seqName
   393 
   394 def rangeLabels(seqRanges, labelOpt, font, fontsize, image_mode, textRot):
   395     if fontsize:
   396         image_size = 1, 1
   397         im = Image.new(image_mode, image_size)
   398         draw = ImageDraw.Draw(im)
   399     x = y = 0
   400     for r in seqRanges:
   401         text = labelText(r, labelOpt)
   402         if fontsize:
   403             x, y = draw.textsize(text, font=font)
   404             if textRot:
   405                 x, y = y, x
   406         yield text, x, y, r[3]
   407 
   408 def dataFromRanges(sortedRanges, font, fontSize, imageMode, labelOpt, textRot):
   409     for seqName, rangeBeg, rangeEnd, strandNum in sortedRanges:
   410         out = [seqName, str(rangeBeg), str(rangeEnd)]
   411         if strandNum > 0:
   412             out.append(".+-"[strandNum])
   413         logging.info("\t".join(out))
   414     logging.info("")
   415     rangeSizes = [e - b for n, b, e, s in sortedRanges]
   416     labs = list(rangeLabels(sortedRanges, labelOpt, font, fontSize,
   417                             imageMode, textRot))
   418     margin = max(i[2] for i in labs)
   419     # xxx the margin may be too big, because some labels may get omitted
   420     return rangeSizes, labs, margin
   421 
   422 def div_ceil(x, y):
   423     '''Return x / y rounded up.'''
   424     q, r = divmod(x, y)
   425     return q + (r != 0)
   426 
   427 def get_bp_per_pix(rangeSizes, pixTweenRanges, maxPixels):
   428     '''Get the minimum bp-per-pixel that fits in the size limit.'''
   429     logging.info("choosing bp per pixel...")
   430     numOfRanges = len(rangeSizes)
   431     maxPixelsInRanges = maxPixels - pixTweenRanges * (numOfRanges - 1)
   432     if maxPixelsInRanges < numOfRanges:
   433         raise Exception("can't fit the image: too many sequences?")
   434     negLimit = -maxPixelsInRanges
   435     negBpPerPix = sum(rangeSizes) // negLimit
   436     while True:
   437         if sum(i // negBpPerPix for i in rangeSizes) >= negLimit:
   438             return -negBpPerPix
   439         negBpPerPix -= 1
   440 
   441 def getRangePixBegs(rangePixLens, pixTweenRanges, margin):
   442     '''Get the start pixel for each range.'''
   443     rangePixBegs = []
   444     pix_tot = margin - pixTweenRanges
   445     for i in rangePixLens:
   446         pix_tot += pixTweenRanges
   447         rangePixBegs.append(pix_tot)
   448         pix_tot += i
   449     return rangePixBegs
   450 
   451 def pixelData(rangeSizes, bp_per_pix, pixTweenRanges, margin):
   452     '''Return pixel information about the ranges.'''
   453     rangePixLens = [div_ceil(i, bp_per_pix) for i in rangeSizes]
   454     rangePixBegs = getRangePixBegs(rangePixLens, pixTweenRanges, margin)
   455     tot_pix = rangePixBegs[-1] + rangePixLens[-1]
   456     return rangePixBegs, rangePixLens, tot_pix
   457 
   458 def drawLineForward(hits, width, bp_per_pix, beg1, beg2, size):
   459     while True:
   460         q1, r1 = divmod(beg1, bp_per_pix)
   461         q2, r2 = divmod(beg2, bp_per_pix)
   462         hits[q2 * width + q1] |= 1
   463         next_pix = min(bp_per_pix - r1, bp_per_pix - r2)
   464         if next_pix >= size: break
   465         beg1 += next_pix
   466         beg2 += next_pix
   467         size -= next_pix
   468 
   469 def drawLineReverse(hits, width, bp_per_pix, beg1, beg2, size):
   470     while True:
   471         q1, r1 = divmod(beg1, bp_per_pix)
   472         q2, r2 = divmod(beg2, bp_per_pix)
   473         hits[q2 * width + q1] |= 2
   474         next_pix = min(bp_per_pix - r1, r2 + 1)
   475         if next_pix >= size: break
   476         beg1 += next_pix
   477         beg2 -= next_pix
   478         size -= next_pix
   479 
   480 def strandAndOrigin(ranges, beg, size):
   481     isReverseStrand = (beg < 0)
   482     if isReverseStrand:
   483         beg = -(beg + size)
   484     for rangeBeg, rangeEnd, isReverseRange, origin in ranges:
   485         if rangeEnd > beg:  # assumes the ranges are sorted
   486             return (isReverseStrand != isReverseRange), origin
   487 
   488 def alignmentPixels(width, height, alignments, bp_per_pix,
   489                     rangeDict1, rangeDict2):
   490     hits = [0] * (width * height)  # the image data
   491     for seq1, seq2, blocks in alignments:
   492         beg1, beg2, size = blocks[0]
   493         isReverse1, ori1 = strandAndOrigin(rangeDict1[seq1], beg1, size)
   494         isReverse2, ori2 = strandAndOrigin(rangeDict2[seq2], beg2, size)
   495         for beg1, beg2, size in blocks:
   496             if isReverse1:
   497                 beg1 = -(beg1 + size)
   498                 beg2 = -(beg2 + size)
   499             if isReverse1 == isReverse2:
   500                 drawLineForward(hits, width, bp_per_pix,
   501                                 ori1 + beg1, ori2 + beg2, size)
   502             else:
   503                 drawLineReverse(hits, width, bp_per_pix,
   504                                 ori1 + beg1, ori2 - beg2 - 1, size)
   505     return hits
   506 
   507 def expandedSeqDict(seqDict):
   508     '''Allow lookup by short sequence names, e.g. chr7 as well as hg19.chr7.'''
   509     newDict = seqDict.copy()
   510     for name, x in seqDict.items():
   511         if "." in name:
   512             base = name.split(".")[-1]
   513             if base in newDict:  # an ambiguous case was found:
   514                 return seqDict   # so give up completely
   515             newDict[base] = x
   516     return newDict
   517 
   518 def readBed(fileName, rangeDict):
   519     for line in myOpen(fileName):
   520         w = line.split()
   521         if not w: continue
   522         seqName = w[0]
   523         if seqName not in rangeDict: continue
   524         beg = int(w[1])
   525         end = int(w[2])
   526         layer = 900
   527         color = "#fbf"
   528         if len(w) > 4:
   529             if w[4] != ".":
   530                 layer = float(w[4])
   531             if len(w) > 5:
   532                 if len(w) > 8 and w[8].count(",") == 2:
   533                     color = "rgb(" + w[8] + ")"
   534                 else:
   535                     strand = w[5]
   536                     isRev = rangeDict[seqName][0][2]
   537                     if strand == "+" and not isRev or strand == "-" and isRev:
   538                         color = "#ffe8e8"
   539                     if strand == "-" and not isRev or strand == "+" and isRev:
   540                         color = "#e8e8ff"
   541         yield layer, color, seqName, beg, end
   542 
   543 def commaSeparatedInts(text):
   544     return map(int, text.rstrip(",").split(","))
   545 
   546 def readGenePred(opts, fileName, rangeDict):
   547     for line in myOpen(fileName):
   548         fields = line.split()
   549         if not fields: continue
   550         if fields[2] not in "+-": fields = fields[1:]
   551         seqName = fields[1]
   552         if seqName not in rangeDict: continue
   553         #strand = fields[2]
   554         cdsBeg = int(fields[5])
   555         cdsEnd = int(fields[6])
   556         exonBegs = commaSeparatedInts(fields[8])
   557         exonEnds = commaSeparatedInts(fields[9])
   558         for beg, end in zip(exonBegs, exonEnds):
   559             yield 300, opts.exon_color, seqName, beg, end
   560             b = max(beg, cdsBeg)
   561             e = min(end, cdsEnd)
   562             if b < e: yield 400, opts.cds_color, seqName, b, e
   563 
   564 def readRmsk(fileName, rangeDict):
   565     for line in myOpen(fileName):
   566         fields = line.split()
   567         if len(fields) == 17:  # rmsk.txt
   568             seqName = fields[5]
   569             if seqName not in rangeDict: continue  # do this ASAP for speed
   570             beg = int(fields[6])
   571             end = int(fields[7])
   572             strand = fields[9]
   573             repeatClass = fields[11]
   574         elif len(fields) == 15:  # .out
   575             seqName = fields[4]
   576             if seqName not in rangeDict: continue
   577             beg = int(fields[5]) - 1
   578             end = int(fields[6])
   579             strand = fields[8]
   580             repeatClass = fields[10]
   581         else:
   582             continue
   583         if repeatClass in ("Low_complexity", "Simple_repeat"):
   584             yield 200, "#fbf", seqName, beg, end
   585         elif (strand == "+") != rangeDict[seqName][0][2]:
   586             yield 100, "#ffe8e8", seqName, beg, end
   587         else:
   588             yield 100, "#e8e8ff", seqName, beg, end
   589 
   590 def isExtraFirstGapField(fields):
   591     return fields[4].isdigit()
   592 
   593 def readGaps(opts, fileName, rangeDict):
   594     '''Read locations of unsequenced gaps, from an agp or gap file.'''
   595     for line in myOpen(fileName):
   596         w = line.split()
   597         if not w or w[0][0] == "#": continue
   598         if isExtraFirstGapField(w): w = w[1:]
   599         if w[4] not in "NU": continue
   600         seqName = w[0]
   601         if seqName not in rangeDict: continue
   602         end = int(w[2])
   603         beg = end - int(w[5])  # zero-based coordinate
   604         if w[7] == "yes":
   605             yield 3000, opts.bridged_color, seqName, beg, end
   606         else:
   607             yield 2000, opts.unbridged_color, seqName, beg, end
   608 
   609 def bedBoxes(beds, rangeDict, margin, edge, isTop, bpPerPix):
   610     for layer, color, seqName, bedBeg, bedEnd in beds:
   611         for rangeBeg, rangeEnd, isReverseRange, origin in rangeDict[seqName]:
   612             beg = max(bedBeg, rangeBeg)
   613             end = min(bedEnd, rangeEnd)
   614             if beg >= end: continue
   615             if isReverseRange:
   616                 beg, end = -end, -beg
   617             if layer <= 1000:
   618                 # include partly-covered pixels
   619                 b = (origin + beg) // bpPerPix
   620                 e = div_ceil(origin + end, bpPerPix)
   621             else:
   622                 # exclude partly-covered pixels
   623                 b = div_ceil(origin + beg, bpPerPix)
   624                 e = (origin + end) // bpPerPix
   625                 if e <= b: continue
   626                 if bedEnd >= rangeEnd:  # include partly-covered end pixels
   627                     if isReverseRange:
   628                         b = (origin + beg) // bpPerPix
   629                     else:
   630                         e = div_ceil(origin + end, bpPerPix)
   631             if isTop:
   632                 box = b, margin, e, edge
   633             else:
   634                 box = margin, b, edge, e
   635             yield layer, color, box
   636 
   637 def drawAnnotations(im, boxes):
   638     # xxx use partial transparency for different-color overlaps?
   639     for layer, color, box in boxes:
   640         im.paste(color, box)
   641 
   642 def placedLabels(labels, rangePixBegs, rangePixLens, beg, end):
   643     '''Return axis labels with endpoint & sort-order information.'''
   644     maxWidth = end - beg
   645     for i, j, k in zip(labels, rangePixBegs, rangePixLens):
   646         text, textWidth, textHeight, strandNum = i
   647         if textWidth > maxWidth:
   648             continue
   649         labelBeg = j + (k - textWidth) // 2
   650         labelEnd = labelBeg + textWidth
   651         sortKey = textWidth - k
   652         if labelBeg < beg:
   653             sortKey += maxWidth * (beg - labelBeg)
   654             labelBeg = beg
   655             labelEnd = beg + textWidth
   656         if labelEnd > end:
   657             sortKey += maxWidth * (labelEnd - end)
   658             labelEnd = end
   659             labelBeg = end - textWidth
   660         yield sortKey, labelBeg, labelEnd, text, textHeight, strandNum
   661 
   662 def nonoverlappingLabels(labels, minPixTweenLabels):
   663     '''Get a subset of non-overlapping axis labels, greedily.'''
   664     out = []
   665     for i in labels:
   666         beg = i[1] - minPixTweenLabels
   667         end = i[2] + minPixTweenLabels
   668         if all(j[2] <= beg or j[1] >= end for j in out):
   669             out.append(i)
   670     return out
   671 
   672 def axisImage(labels, rangePixBegs, rangePixLens, textRot,
   673               textAln, font, image_mode, opts):
   674     '''Make an image of axis labels.'''
   675     beg = rangePixBegs[0]
   676     end = rangePixBegs[-1] + rangePixLens[-1]
   677     margin = max(i[2] for i in labels)
   678     labels = sorted(placedLabels(labels, rangePixBegs, rangePixLens, beg, end))
   679     minPixTweenLabels = 0 if textRot else opts.label_space
   680     labels = nonoverlappingLabels(labels, minPixTweenLabels)
   681     image_size = (margin, end) if textRot else (end, margin)
   682     im = Image.new(image_mode, image_size, opts.margin_color)
   683     draw = ImageDraw.Draw(im)
   684     for sortKey, labelBeg, labelEnd, text, textHeight, strandNum in labels:
   685         base = margin - textHeight if textAln else 0
   686         position = (base, labelBeg) if textRot else (labelBeg, base)
   687         fill = ("black", opts.forwardcolor, opts.reversecolor)[strandNum]
   688         draw.text(position, text, font=font, fill=fill)
   689     return im
   690 
   691 def rangesWithOrigins(sortedRanges, rangePixBegs, rangePixLens, bpPerPix):
   692     for i, j, k in zip(sortedRanges, rangePixBegs, rangePixLens):
   693         seqName, rangeBeg, rangeEnd, strandNum = i
   694         isReverseRange = (strandNum > 1)
   695         if isReverseRange:
   696             origin = bpPerPix * (j + k) + rangeBeg
   697         else:
   698             origin = bpPerPix * j - rangeBeg
   699         yield seqName, (rangeBeg, rangeEnd, isReverseRange, origin)
   700 
   701 def rangesPerSeq(sortedRanges, rangePixBegs, rangePixLens, bpPerPix):
   702     a = rangesWithOrigins(sortedRanges, rangePixBegs, rangePixLens, bpPerPix)
   703     for k, v in itertools.groupby(a, itemgetter(0)):
   704         yield k, sorted(i[1] for i in v)
   705 
   706 def getFont(opts):
   707     if opts.fontfile:
   708         return ImageFont.truetype(opts.fontfile, opts.fontsize)
   709     fileNames = []
   710     try:
   711         x = ["fc-match", "-f%{file}", "arial"]
   712         p = subprocess.Popen(x, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
   713                              universal_newlines=True)
   714         out, err = p.communicate()
   715         fileNames.append(out)
   716     except OSError as e:
   717         logging.info("fc-match error: " + str(e))
   718     fileNames.append("/Library/Fonts/Arial.ttf")  # for Mac
   719     for i in fileNames:
   720         try:
   721             font = ImageFont.truetype(i, opts.fontsize)
   722             logging.info("font: " + i)
   723             return font
   724         except IOError as e:
   725             logging.info("font load error: " + str(e))
   726     return ImageFont.load_default()
   727 
   728 def sequenceSizesAndNames(seqRanges):
   729     for seqName, ranges in itertools.groupby(seqRanges, itemgetter(0)):
   730         size = sum(e - b for n, b, e in ranges)
   731         yield size, seqName
   732 
   733 def biggestSequences(seqRanges, maxNumOfSequences):
   734     s = sorted(sequenceSizesAndNames(seqRanges), reverse=True)
   735     if len(s) > maxNumOfSequences:
   736         logging.warning("too many sequences - discarding the smallest ones")
   737         s = s[:maxNumOfSequences]
   738     return set(i[1] for i in s)
   739 
   740 def remainingSequenceRanges(seqRanges, alignments, seqIndex):
   741     remainingSequences = set(i[seqIndex] for i in alignments)
   742     return [i for i in seqRanges if i[0] in remainingSequences]
   743 
   744 def lastDotplot(opts, args):
   745     logLevel = logging.INFO if opts.verbose else logging.WARNING
   746     logging.basicConfig(format="%(filename)s: %(message)s", level=logLevel)
   747 
   748     font = getFont(opts)
   749     image_mode = 'RGB'
   750     forward_color = ImageColor.getcolor(opts.forwardcolor, image_mode)
   751     reverse_color = ImageColor.getcolor(opts.reversecolor, image_mode)
   752     zipped_colors = zip(forward_color, reverse_color)
   753     overlap_color = tuple([(i + j) // 2 for i, j in zipped_colors])
   754 
   755     maxGap1, maxGapB1 = twoValuesFromOption(opts.max_gap1, ":")
   756     maxGap2, maxGapB2 = twoValuesFromOption(opts.max_gap2, ":")
   757 
   758     logging.info("reading alignments...")
   759     alnData = readAlignments(args[0], opts)
   760     alignments, seqRanges1, coverDict1, seqRanges2, coverDict2 = alnData
   761     if not alignments: raise Exception("there are no alignments")
   762     logging.info("cutting...")
   763     coverDict1 = dict(mergedRangesPerSeq(coverDict1))
   764     coverDict2 = dict(mergedRangesPerSeq(coverDict2))
   765     minAlignedBases = min(coveredLength(coverDict1), coveredLength(coverDict2))
   766     pad = int(opts.pad * minAlignedBases)
   767     cutRanges1 = list(trimmed(seqRanges1, coverDict1, minAlignedBases,
   768                               maxGap1, pad, pad))
   769     cutRanges2 = list(trimmed(seqRanges2, coverDict2, minAlignedBases,
   770                               maxGap2, pad, pad))
   771 
   772     biggestSeqs1 = biggestSequences(cutRanges1, opts.maxseqs)
   773     cutRanges1 = [i for i in cutRanges1 if i[0] in biggestSeqs1]
   774     alignments = [i for i in alignments if i[0] in biggestSeqs1]
   775     cutRanges2 = remainingSequenceRanges(cutRanges2, alignments, 1)
   776 
   777     biggestSeqs2 = biggestSequences(cutRanges2, opts.maxseqs)
   778     cutRanges2 = [i for i in cutRanges2 if i[0] in biggestSeqs2]
   779     alignments = [i for i in alignments if i[1] in biggestSeqs2]
   780     cutRanges1 = remainingSequenceRanges(cutRanges1, alignments, 0)
   781 
   782     logging.info("reading secondary alignments...")
   783     alnDataB = readSecondaryAlignments(opts, cutRanges1, cutRanges2)
   784     alignmentsB, seqRangesB1, coverDictB1, seqRangesB2, coverDictB2 = alnDataB
   785     logging.info("cutting...")
   786     coverDictB1 = dict(mergedRangesPerSeq(coverDictB1))
   787     coverDictB2 = dict(mergedRangesPerSeq(coverDictB2))
   788     cutRangesB1 = trimmed(seqRangesB1, coverDictB1, minAlignedBases,
   789                           maxGapB1, 0, 0)
   790     cutRangesB2 = trimmed(seqRangesB2, coverDictB2, minAlignedBases,
   791                           maxGapB2, 0, 0)
   792 
   793     logging.info("sorting...")
   794     sortOut = allSortedRanges(opts, alignments, alignmentsB,
   795                               cutRanges1, cutRangesB1, cutRanges2, cutRangesB2)
   796     sortedRanges1, sortedRanges2 = sortOut
   797 
   798     textRot1 = "vertical".startswith(opts.rot1)
   799     i1 = dataFromRanges(sortedRanges1, font,
   800                         opts.fontsize, image_mode, opts.labels1, textRot1)
   801     rangeSizes1, labelData1, tMargin = i1
   802 
   803     textRot2 = "horizontal".startswith(opts.rot2)
   804     i2 = dataFromRanges(sortedRanges2, font,
   805                         opts.fontsize, image_mode, opts.labels2, textRot2)
   806     rangeSizes2, labelData2, lMargin = i2
   807 
   808     maxPixels1 = opts.width  - lMargin
   809     maxPixels2 = opts.height - tMargin
   810     bpPerPix1 = get_bp_per_pix(rangeSizes1, opts.border_pixels, maxPixels1)
   811     bpPerPix2 = get_bp_per_pix(rangeSizes2, opts.border_pixels, maxPixels2)
   812     bpPerPix = max(bpPerPix1, bpPerPix2)
   813     logging.info("bp per pixel = " + str(bpPerPix))
   814 
   815     p1 = pixelData(rangeSizes1, bpPerPix, opts.border_pixels, lMargin)
   816     rangePixBegs1, rangePixLens1, width = p1
   817     rangeDict1 = dict(rangesPerSeq(sortedRanges1, rangePixBegs1,
   818                                    rangePixLens1, bpPerPix))
   819 
   820     p2 = pixelData(rangeSizes2, bpPerPix, opts.border_pixels, tMargin)
   821     rangePixBegs2, rangePixLens2, height = p2
   822     rangeDict2 = dict(rangesPerSeq(sortedRanges2, rangePixBegs2,
   823                                    rangePixLens2, bpPerPix))
   824 
   825     logging.info("width:  " + str(width))
   826     logging.info("height: " + str(height))
   827 
   828     logging.info("processing alignments...")
   829     hits = alignmentPixels(width, height, alignments + alignmentsB, bpPerPix,
   830                            rangeDict1, rangeDict2)
   831 
   832     logging.info("reading annotations...")
   833 
   834     rangeDict1 = expandedSeqDict(rangeDict1)
   835     beds1 = itertools.chain(readBed(opts.bed1, rangeDict1),
   836                             readRmsk(opts.rmsk1, rangeDict1),
   837                             readGenePred(opts, opts.genePred1, rangeDict1),
   838                             readGaps(opts, opts.gap1, rangeDict1))
   839     b1 = bedBoxes(beds1, rangeDict1, tMargin, height, True, bpPerPix)
   840 
   841     rangeDict2 = expandedSeqDict(rangeDict2)
   842     beds2 = itertools.chain(readBed(opts.bed2, rangeDict2),
   843                             readRmsk(opts.rmsk2, rangeDict2),
   844                             readGenePred(opts, opts.genePred2, rangeDict2),
   845                             readGaps(opts, opts.gap2, rangeDict2))
   846     b2 = bedBoxes(beds2, rangeDict2, lMargin, width, False, bpPerPix)
   847 
   848     boxes = sorted(itertools.chain(b1, b2))
   849 
   850     logging.info("drawing...")
   851 
   852     image_size = width, height
   853     im = Image.new(image_mode, image_size, opts.background_color)
   854 
   855     drawAnnotations(im, boxes)
   856 
   857     for i in range(height):
   858         for j in range(width):
   859             store_value = hits[i * width + j]
   860             xy = j, i
   861             if   store_value == 1: im.putpixel(xy, forward_color)
   862             elif store_value == 2: im.putpixel(xy, reverse_color)
   863             elif store_value == 3: im.putpixel(xy, overlap_color)
   864 
   865     if opts.fontsize != 0:
   866         axis1 = axisImage(labelData1, rangePixBegs1, rangePixLens1,
   867                           textRot1, False, font, image_mode, opts)
   868         if textRot1:
   869             axis1 = axis1.transpose(Image.ROTATE_90)
   870         axis2 = axisImage(labelData2, rangePixBegs2, rangePixLens2,
   871                           textRot2, textRot2, font, image_mode, opts)
   872         if not textRot2:
   873             axis2 = axis2.transpose(Image.ROTATE_270)
   874         im.paste(axis1, (0, 0))
   875         im.paste(axis2, (0, 0))
   876 
   877     for i in rangePixBegs1[1:]:
   878         box = i - opts.border_pixels, tMargin, i, height
   879         im.paste(opts.border_color, box)
   880 
   881     for i in rangePixBegs2[1:]:
   882         box = lMargin, i - opts.border_pixels, width, i
   883         im.paste(opts.border_color, box)
   884 
   885     im.save(args[1])
   886 
   887 if __name__ == "__main__":
   888     usage = """%prog --help
   889    or: %prog [options] maf-or-tab-alignments dotplot.png
   890    or: %prog [options] maf-or-tab-alignments dotplot.gif
   891    or: ..."""
   892     description = "Draw a dotplot of pair-wise sequence alignments in MAF or tabular format."
   893     op = optparse.OptionParser(usage=usage, description=description)
   894     op.add_option("-v", "--verbose", action="count",
   895                   help="show progress messages & data about the plot")
   896     # Replace "width" & "height" with a single "length" option?
   897     op.add_option("-x", "--width", metavar="INT", type="int", default=1000,
   898                   help="maximum width in pixels (default: %default)")
   899     op.add_option("-y", "--height", metavar="INT", type="int", default=1000,
   900                   help="maximum height in pixels (default: %default)")
   901     op.add_option("-m", "--maxseqs", type="int", default=100, metavar="M",
   902                   help="maximum number of horizontal or vertical sequences "
   903                   "(default=%default)")
   904     op.add_option("-1", "--seq1", metavar="PATTERN", action="append",
   905                   default=[],
   906                   help="which sequences to show from the 1st genome")
   907     op.add_option("-2", "--seq2", metavar="PATTERN", action="append",
   908                   default=[],
   909                   help="which sequences to show from the 2nd genome")
   910     op.add_option("-c", "--forwardcolor", metavar="COLOR", default="red",
   911                   help="color for forward alignments (default: %default)")
   912     op.add_option("-r", "--reversecolor", metavar="COLOR", default="blue",
   913                   help="color for reverse alignments (default: %default)")
   914     op.add_option("--alignments", metavar="FILE", help="secondary alignments")
   915     op.add_option("--sort1", default="1", metavar="N",
   916                   help="genome1 sequence order: 0=input order, 1=name order, "
   917                   "2=length order, 3=alignment order (default=%default)")
   918     op.add_option("--sort2", default="1", metavar="N",
   919                   help="genome2 sequence order: 0=input order, 1=name order, "
   920                   "2=length order, 3=alignment order (default=%default)")
   921     op.add_option("--strands1", default="0", metavar="N", help=
   922                   "genome1 sequence orientation: 0=forward orientation, "
   923                   "1=alignment orientation (default=%default)")
   924     op.add_option("--strands2", default="0", metavar="N", help=
   925                   "genome2 sequence orientation: 0=forward orientation, "
   926                   "1=alignment orientation (default=%default)")
   927     op.add_option("--max-gap1", metavar="FRAC", default="0.5,2", help=
   928                   "maximum unaligned (end,mid) gap in genome1: "
   929                   "fraction of aligned length (default=%default)")
   930     op.add_option("--max-gap2", metavar="FRAC", default="0.5,2", help=
   931                   "maximum unaligned (end,mid) gap in genome2: "
   932                   "fraction of aligned length (default=%default)")
   933     op.add_option("--pad", metavar="FRAC", type="float", default=0.04, help=
   934                   "pad length when cutting unaligned gaps: "
   935                   "fraction of aligned length (default=%default)")
   936     op.add_option("--border-pixels", metavar="INT", type="int", default=1,
   937                   help="number of pixels between sequences (default=%default)")
   938     op.add_option("--border-color", metavar="COLOR", default="black",
   939                   help="color for pixels between sequences (default=%default)")
   940     # --break-color and/or --break-pixels for intra-sequence breaks?
   941     op.add_option("--margin-color", metavar="COLOR", default="#dcdcdc",
   942                   help="margin color")
   943 
   944     og = optparse.OptionGroup(op, "Text options")
   945     og.add_option("-f", "--fontfile", metavar="FILE",
   946                   help="TrueType or OpenType font file")
   947     og.add_option("-s", "--fontsize", metavar="SIZE", type="int", default=14,
   948                   help="TrueType or OpenType font size (default: %default)")
   949     og.add_option("--labels1", type="int", default=0, metavar="N", help=
   950                   "genome1 labels: 0=name, 1=name:length, "
   951                   "2=name:start:length, 3=name:start-end (default=%default)")
   952     og.add_option("--labels2", type="int", default=0, metavar="N", help=
   953                   "genome2 labels: 0=name, 1=name:length, "
   954                   "2=name:start:length, 3=name:start-end (default=%default)")
   955     og.add_option("--rot1", metavar="ROT", default="h",
   956                   help="text rotation for the 1st genome (default=%default)")
   957     og.add_option("--rot2", metavar="ROT", default="v",
   958                   help="text rotation for the 2nd genome (default=%default)")
   959     op.add_option_group(og)
   960 
   961     og = optparse.OptionGroup(op, "Annotation options")
   962     og.add_option("--bed1", metavar="FILE",
   963                   help="read genome1 annotations from BED file")
   964     og.add_option("--bed2", metavar="FILE",
   965                   help="read genome2 annotations from BED file")
   966     og.add_option("--rmsk1", metavar="FILE", help="read genome1 repeats from "
   967                   "RepeatMasker .out or rmsk.txt file")
   968     og.add_option("--rmsk2", metavar="FILE", help="read genome2 repeats from "
   969                   "RepeatMasker .out or rmsk.txt file")
   970     op.add_option_group(og)
   971 
   972     og = optparse.OptionGroup(op, "Gene options")
   973     og.add_option("--genePred1", metavar="FILE",
   974                   help="read genome1 genes from genePred file")
   975     og.add_option("--genePred2", metavar="FILE",
   976                   help="read genome2 genes from genePred file")
   977     og.add_option("--exon-color", metavar="COLOR", default="PaleGreen",
   978                   help="color for exons (default=%default)")
   979     og.add_option("--cds-color", metavar="COLOR", default="LimeGreen",
   980                   help="color for protein-coding regions (default=%default)")
   981     op.add_option_group(og)
   982 
   983     og = optparse.OptionGroup(op, "Unsequenced gap options")
   984     og.add_option("--gap1", metavar="FILE",
   985                   help="read genome1 unsequenced gaps from agp or gap file")
   986     og.add_option("--gap2", metavar="FILE",
   987                   help="read genome2 unsequenced gaps from agp or gap file")
   988     og.add_option("--bridged-color", metavar="COLOR", default="yellow",
   989                   help="color for bridged gaps (default: %default)")
   990     og.add_option("--unbridged-color", metavar="COLOR", default="orange",
   991                   help="color for unbridged gaps (default: %default)")
   992     op.add_option_group(og)
   993     (opts, args) = op.parse_args()
   994     if len(args) != 2: op.error("2 arguments needed")
   995 
   996     opts.background_color = "white"
   997     opts.label_space = 5     # minimum number of pixels between axis labels
   998 
   999     try: lastDotplot(opts, args)
  1000     except KeyboardInterrupt: pass  # avoid silly error message
  1001     except Exception as e:
  1002         prog = os.path.basename(sys.argv[0])
  1003         sys.exit(prog + ": error: " + str(e))