scripts/last-dotplot
author Martin C. Frith
Mon May 28 14:53:43 2018 +0900 (2018-05-28)
changeset 941 f3f1ec72542f
parent 938 65969d4d464d
child 942 82b69208c878
permissions -rwxr-xr-x
Python3-ify: dotplot, train, convert, swap
     1 #! /usr/bin/env python
     2 
     3 # Read pair-wise alignments in MAF or LAST tabular format: write an
     4 # "Oxford grid", a.k.a. dotplot.
     5 
     6 # TODO: Currently, pixels with zero aligned nt-pairs are white, and
     7 # pixels with one or more aligned nt-pairs are black.  This can look
     8 # too crowded for large genome alignments.  I tried shading each pixel
     9 # according to the number of aligned nt-pairs within it, but the
    10 # result is too faint.  How can this be done better?
    11 
    12 import collections
    13 import functools
    14 import gzip
    15 from fnmatch import fnmatchcase
    16 from operator import itemgetter
    17 import subprocess
    18 import itertools, optparse, os, re, sys
    19 
    20 # Try to make PIL/PILLOW work:
    21 try:
    22     from PIL import Image, ImageDraw, ImageFont, ImageColor
    23 except ImportError:
    24     import Image, ImageDraw, ImageFont, ImageColor
    25 
    26 try:
    27     from future_builtins import zip
    28 except ImportError:
    29     pass
    30 
    31 def myOpen(fileName):  # faster than fileinput
    32     if fileName is None:
    33         return []
    34     if fileName == "-":
    35         return sys.stdin
    36     if fileName.endswith(".gz"):
    37         return gzip.open(fileName)
    38     return open(fileName)
    39 
    40 def warn(message):
    41     if opts.verbose:
    42         prog = os.path.basename(sys.argv[0])
    43         sys.stderr.write(prog + ": " + message + "\n")
    44 
    45 def groupByFirstItem(things):
    46     for k, v in itertools.groupby(things, itemgetter(0)):
    47         yield k, [i[1:] for i in v]
    48 
    49 def croppedBlocks(blocks, ranges1, ranges2):
    50     headBeg1, headBeg2, headSize = blocks[0]
    51     for r1 in ranges1:
    52         for r2 in ranges2:
    53             cropBeg1, cropEnd1 = r1
    54             if headBeg1 < 0:
    55                 cropBeg1, cropEnd1 = -cropEnd1, -cropBeg1
    56             cropBeg2, cropEnd2 = r2
    57             if headBeg2 < 0:
    58                 cropBeg2, cropEnd2 = -cropEnd2, -cropBeg2
    59             for beg1, beg2, size in blocks:
    60                 b1 = max(cropBeg1, beg1)
    61                 e1 = min(cropEnd1, beg1 + size)
    62                 if b1 >= e1: continue
    63                 offset = beg2 - beg1
    64                 b2 = max(cropBeg2, b1 + offset)
    65                 e2 = min(cropEnd2, e1 + offset)
    66                 if b2 >= e2: continue
    67                 yield b2 - offset, b2, e2 - b2
    68 
    69 def tabBlocks(beg1, beg2, blocks):
    70     '''Get the gapless blocks of an alignment, from LAST tabular format.'''
    71     for i in blocks.split(","):
    72         if ":" in i:
    73             x, y = i.split(":")
    74             beg1 += int(x)
    75             beg2 += int(y)
    76         else:
    77             size = int(i)
    78             yield beg1, beg2, size
    79             beg1 += size
    80             beg2 += size
    81 
    82 def mafBlocks(beg1, beg2, seq1, seq2):
    83     '''Get the gapless blocks of an alignment, from MAF format.'''
    84     size = 0
    85     for x, y in zip(seq1, seq2):
    86         if x == "-":
    87             if size:
    88                 yield beg1, beg2, size
    89                 beg1 += size
    90                 beg2 += size
    91                 size = 0
    92             beg2 += 1
    93         elif y == "-":
    94             if size:
    95                 yield beg1, beg2, size
    96                 beg1 += size
    97                 beg2 += size
    98                 size = 0
    99             beg1 += 1
   100         else:
   101             size += 1
   102     if size: yield beg1, beg2, size
   103 
   104 def alignmentInput(lines):
   105     '''Get alignments and sequence lengths, from MAF or tabular format.'''
   106     mafCount = 0
   107     for line in lines:
   108         w = line.split()
   109         if line[0].isdigit():  # tabular format
   110             chr1, beg1, seqlen1 = w[1], int(w[2]), int(w[5])
   111             if w[4] == "-": beg1 -= seqlen1
   112             chr2, beg2, seqlen2 = w[6], int(w[7]), int(w[10])
   113             if w[9] == "-": beg2 -= seqlen2
   114             blocks = tabBlocks(beg1, beg2, w[11])
   115             yield chr1, seqlen1, chr2, seqlen2, blocks
   116         elif line[0] == "s":  # MAF format
   117             if mafCount == 0:
   118                 chr1, beg1, seqlen1, seq1 = w[1], int(w[2]), int(w[5]), w[6]
   119                 if w[4] == "-": beg1 -= seqlen1
   120                 mafCount = 1
   121             else:
   122                 chr2, beg2, seqlen2, seq2 = w[1], int(w[2]), int(w[5]), w[6]
   123                 if w[4] == "-": beg2 -= seqlen2
   124                 blocks = mafBlocks(beg1, beg2, seq1, seq2)
   125                 yield chr1, seqlen1, chr2, seqlen2, blocks
   126                 mafCount = 0
   127 
   128 def seqRequestFromText(text):
   129     if ":" in text:
   130         pattern, interval = text.rsplit(":", 1)
   131         if "-" in interval:
   132             beg, end = interval.rsplit("-", 1)
   133             return pattern, int(beg), int(end)  # beg may be negative
   134     return text, 0, sys.maxsize
   135 
   136 def rangesFromSeqName(seqRequests, name, seqLen):
   137     if seqRequests:
   138         base = name.split(".")[-1]  # allow for names like hg19.chr7
   139         for pat, beg, end in seqRequests:
   140             if fnmatchcase(name, pat) or fnmatchcase(base, pat):
   141                 yield max(beg, 0), min(end, seqLen)
   142     else:
   143         yield 0, seqLen
   144 
   145 def updateSeqs(coverDict, seqRanges, seqName, ranges, coveredRange):
   146     beg, end = coveredRange
   147     if beg < 0:
   148         coveredRange = -end, -beg
   149     if seqName in coverDict:
   150         coverDict[seqName].append(coveredRange)
   151     else:
   152         coverDict[seqName] = [coveredRange]
   153         for beg, end in ranges:
   154             r = seqName, beg, end
   155             seqRanges.append(r)
   156 
   157 def readAlignments(fileName, opts):
   158     '''Get alignments and sequence limits, from MAF or tabular format.'''
   159     seqRequests1 = [seqRequestFromText(i) for i in opts.seq1]
   160     seqRequests2 = [seqRequestFromText(i) for i in opts.seq2]
   161 
   162     alignments = []
   163     seqRanges1 = []
   164     seqRanges2 = []
   165     coverDict1 = {}
   166     coverDict2 = {}
   167     lines = myOpen(fileName)
   168     for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
   169         ranges1 = sorted(rangesFromSeqName(seqRequests1, seqName1, seqLen1))
   170         if not ranges1: continue
   171         ranges2 = sorted(rangesFromSeqName(seqRequests2, seqName2, seqLen2))
   172         if not ranges2: continue
   173         b = list(croppedBlocks(list(blocks), ranges1, ranges2))
   174         if not b: continue
   175         aln = seqName1, seqName2, b
   176         alignments.append(aln)
   177         coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
   178         updateSeqs(coverDict1, seqRanges1, seqName1, ranges1, coveredRange1)
   179         coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
   180         updateSeqs(coverDict2, seqRanges2, seqName2, ranges2, coveredRange2)
   181     return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
   182 
   183 def nameAndRangesFromDict(cropDict, seqName):
   184     if seqName in cropDict:
   185         return seqName, cropDict[seqName]
   186     n = seqName.split(".")[-1]
   187     if n in cropDict:
   188         return n, cropDict[n]
   189     return seqName, []
   190 
   191 def rangesForSecondaryAlignments(primaryRanges, seqLen):
   192     if primaryRanges:
   193         return primaryRanges
   194     return [(0, seqLen)]
   195 
   196 def readSecondaryAlignments(opts, cropRanges1, cropRanges2):
   197     cropDict1 = dict(groupByFirstItem(cropRanges1))
   198     cropDict2 = dict(groupByFirstItem(cropRanges2))
   199 
   200     alignments = []
   201     seqRanges1 = []
   202     seqRanges2 = []
   203     coverDict1 = {}
   204     coverDict2 = {}
   205     lines = myOpen(opts.alignments)
   206     for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
   207         seqName1, ranges1 = nameAndRangesFromDict(cropDict1, seqName1)
   208         seqName2, ranges2 = nameAndRangesFromDict(cropDict2, seqName2)
   209         if not ranges1 and not ranges2:
   210             continue
   211         r1 = rangesForSecondaryAlignments(ranges1, seqLen1)
   212         r2 = rangesForSecondaryAlignments(ranges2, seqLen2)
   213         b = list(croppedBlocks(list(blocks), r1, r2))
   214         if not b: continue
   215         aln = seqName1, seqName2, b
   216         alignments.append(aln)
   217         if not ranges1:
   218             coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
   219             updateSeqs(coverDict1, seqRanges1, seqName1, r1, coveredRange1)
   220         if not ranges2:
   221             coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
   222             updateSeqs(coverDict2, seqRanges2, seqName2, r2, coveredRange2)
   223     return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
   224 
   225 def twoValuesFromOption(text, separator):
   226     if separator in text:
   227         return text.split(separator)
   228     return text, text
   229 
   230 def mergedRanges(ranges):
   231     oldBeg, maxEnd = ranges[0]
   232     for beg, end in ranges:
   233         if beg > maxEnd:
   234             yield oldBeg, maxEnd
   235             oldBeg = beg
   236             maxEnd = end
   237         elif end > maxEnd:
   238             maxEnd = end
   239     yield oldBeg, maxEnd
   240 
   241 def mergedRangesPerSeq(coverDict):
   242     for k, v in coverDict.items():
   243         v.sort()
   244         yield k, list(mergedRanges(v))
   245 
   246 def coveredLength(mergedCoverDict):
   247     return sum(sum(e - b for b, e in v) for v in mergedCoverDict.values())
   248 
   249 def trimmed(seqRanges, coverDict, minAlignedBases, maxGapFrac, endPad, midPad):
   250     maxEndGapFrac, maxMidGapFrac = twoValuesFromOption(maxGapFrac, ",")
   251     maxEndGap = max(float(maxEndGapFrac) * minAlignedBases, endPad * 1.0)
   252     maxMidGap = max(float(maxMidGapFrac) * minAlignedBases, midPad * 2.0)
   253 
   254     for seqName, rangeBeg, rangeEnd in seqRanges:
   255         seqBlocks = coverDict[seqName]
   256         blocks = [i for i in seqBlocks if i[0] < rangeEnd and i[1] > rangeBeg]
   257         if blocks[0][0] - rangeBeg > maxEndGap:
   258             rangeBeg = blocks[0][0] - endPad
   259         for j, y in enumerate(blocks):
   260             if j:
   261                 x = blocks[j - 1]
   262                 if y[0] - x[1] > maxMidGap:
   263                     yield seqName, rangeBeg, x[1] + midPad
   264                     rangeBeg = y[0] - midPad
   265         if rangeEnd - blocks[-1][1] > maxEndGap:
   266             rangeEnd = blocks[-1][1] + endPad
   267         yield seqName, rangeBeg, rangeEnd
   268 
   269 def rangesWithStrandInfo(seqRanges, strandOpt, alignments, seqIndex):
   270     if strandOpt == "1":
   271         forwardMinusReverse = collections.defaultdict(int)
   272         for i in alignments:
   273             blocks = i[2]
   274             beg1, beg2, size = blocks[0]
   275             numOfAlignedLetterPairs = sum(i[2] for i in blocks)
   276             if (beg1 < 0) != (beg2 < 0):  # opposite-strand alignment
   277                 numOfAlignedLetterPairs *= -1
   278             forwardMinusReverse[i[seqIndex]] += numOfAlignedLetterPairs
   279     strandNum = 0
   280     for seqName, beg, end in seqRanges:
   281         if strandOpt == "1":
   282             strandNum = 1 if forwardMinusReverse[seqName] >= 0 else 2
   283         yield seqName, beg, end, strandNum
   284 
   285 def natural_sort_key(my_string):
   286     '''Return a sort key for "natural" ordering, e.g. chr9 < chr10.'''
   287     parts = re.split(r'(\d+)', my_string)
   288     parts[1::2] = map(int, parts[1::2])
   289     return parts
   290 
   291 def nameKey(oneSeqRanges):
   292     return natural_sort_key(oneSeqRanges[0][0])
   293 
   294 def sizeKey(oneSeqRanges):
   295     return sum(b - e for n, b, e, s in oneSeqRanges), nameKey(oneSeqRanges)
   296 
   297 def alignmentKey(seqNamesToLists, oneSeqRanges):
   298     seqName = oneSeqRanges[0][0]
   299     alignmentsOfThisSequence = seqNamesToLists[seqName]
   300     numOfAlignedLetterPairs = sum(i[3] for i in alignmentsOfThisSequence)
   301     toMiddle = numOfAlignedLetterPairs // 2
   302     for i in alignmentsOfThisSequence:
   303         toMiddle -= i[3]
   304         if toMiddle < 0:
   305             return i[1:3]  # sequence-rank and "position" of this alignment
   306 
   307 def rankAndFlipPerSeq(seqRanges):
   308     rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
   309     for rank, group in enumerate(rangesGroupedBySeqName):
   310         seqName, ranges = group
   311         strandNum = next(ranges)[3]
   312         flip = 1 if strandNum < 2 else -1
   313         yield seqName, (rank, flip)
   314 
   315 def alignmentSortData(alignments, seqIndex, otherNamesToRanksAndFlips):
   316     otherIndex = 1 - seqIndex
   317     for i in alignments:
   318         blocks = i[2]
   319         otherRank, otherFlip = otherNamesToRanksAndFlips[i[otherIndex]]
   320         otherPos = otherFlip * abs(blocks[0][otherIndex] +
   321                                    blocks[-1][otherIndex] + blocks[-1][2])
   322         numOfAlignedLetterPairs = sum(i[2] for i in blocks)
   323         yield i[seqIndex], otherRank, otherPos, numOfAlignedLetterPairs
   324 
   325 def mySortedRanges(seqRanges, sortOpt, seqIndex, alignments, otherRanges):
   326     rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
   327     g = [list(ranges) for seqName, ranges in rangesGroupedBySeqName]
   328     for i in g:
   329         if i[0][3] > 1:
   330             i.reverse()
   331     if sortOpt == "1":
   332         g.sort(key=nameKey)
   333     if sortOpt == "2":
   334         g.sort(key=sizeKey)
   335     if sortOpt == "3":
   336         otherNamesToRanksAndFlips = dict(rankAndFlipPerSeq(otherRanges))
   337         alns = sorted(alignmentSortData(alignments, seqIndex,
   338                                         otherNamesToRanksAndFlips))
   339         alnsGroupedBySeqName = itertools.groupby(alns, itemgetter(0))
   340         seqNamesToLists = dict((k, list(v)) for k, v in alnsGroupedBySeqName)
   341         g.sort(key=functools.partial(alignmentKey, seqNamesToLists))
   342     return [j for i in g for j in i]
   343 
   344 def allSortedRanges(opts, alignments, alignmentsB,
   345                     seqRanges1, seqRangesB1, seqRanges2, seqRangesB2):
   346     o1, oB1 = twoValuesFromOption(opts.strands1, ":")
   347     o2, oB2 = twoValuesFromOption(opts.strands2, ":")
   348     if o1 == "1" and o2 == "1":
   349         raise Exception("the strand options have circular dependency")
   350     seqRanges1 = list(rangesWithStrandInfo(seqRanges1, o1, alignments, 0))
   351     seqRanges2 = list(rangesWithStrandInfo(seqRanges2, o2, alignments, 1))
   352     seqRangesB1 = list(rangesWithStrandInfo(seqRangesB1, oB1, alignmentsB, 0))
   353     seqRangesB2 = list(rangesWithStrandInfo(seqRangesB2, oB2, alignmentsB, 1))
   354 
   355     o1, oB1 = twoValuesFromOption(opts.sort1, ":")
   356     o2, oB2 = twoValuesFromOption(opts.sort2, ":")
   357     if o1 == "3" and o2 == "3":
   358         raise Exception("the sort options have circular dependency")
   359     if o1 != "3":
   360         s1 = mySortedRanges(seqRanges1, o1, None, None, None)
   361     if o2 != "3":
   362         s2 = mySortedRanges(seqRanges2, o2, None, None, None)
   363     if o1 == "3":
   364         s1 = mySortedRanges(seqRanges1, o1, 0, alignments, s2)
   365     if o2 == "3":
   366         s2 = mySortedRanges(seqRanges2, o2, 1, alignments, s1)
   367     t1 = mySortedRanges(seqRangesB1, oB1, 0, alignmentsB, s2)
   368     t2 = mySortedRanges(seqRangesB2, oB2, 1, alignmentsB, s1)
   369     return s1 + t1, s2 + t2
   370 
   371 def prettyNum(n):
   372     t = str(n)
   373     groups = []
   374     while t:
   375         groups.append(t[-3:])
   376         t = t[:-3]
   377     return ",".join(reversed(groups))
   378 
   379 def sizeText(size):
   380     suffixes = "bp", "kb", "Mb", "Gb"
   381     for i, x in enumerate(suffixes):
   382         j = 10 ** (i * 3)
   383         if size < j * 10:
   384             return "%.2g" % (1.0 * size / j) + x
   385         if size < j * 1000 or i == len(suffixes) - 1:
   386             return "%.0f" % (1.0 * size / j) + x
   387 
   388 def labelText(seqRange, labelOpt):
   389     seqName, beg, end, strandNum = seqRange
   390     if labelOpt == 1:
   391         return seqName + ": " + sizeText(end - beg)
   392     if labelOpt == 2:
   393         return seqName + ":" + prettyNum(beg) + ": " + sizeText(end - beg)
   394     if labelOpt == 3:
   395         return seqName + ":" + prettyNum(beg) + "-" + prettyNum(end)
   396     return seqName
   397 
   398 def rangeLabels(seqRanges, labelOpt, font, fontsize, image_mode, textRot):
   399     if fontsize:
   400         image_size = 1, 1
   401         im = Image.new(image_mode, image_size)
   402         draw = ImageDraw.Draw(im)
   403     x = y = 0
   404     for r in seqRanges:
   405         text = labelText(r, labelOpt)
   406         if fontsize:
   407             x, y = draw.textsize(text, font=font)
   408             if textRot:
   409                 x, y = y, x
   410         yield text, x, y, r[3]
   411 
   412 def dataFromRanges(sortedRanges, font, fontSize, imageMode, labelOpt, textRot):
   413     for seqName, rangeBeg, rangeEnd, strandNum in sortedRanges:
   414         out = [seqName, str(rangeBeg), str(rangeEnd)]
   415         if strandNum > 0:
   416             out.append(".+-"[strandNum])
   417         warn("\t".join(out))
   418     warn("")
   419     rangeSizes = [e - b for n, b, e, s in sortedRanges]
   420     labs = list(rangeLabels(sortedRanges, labelOpt, font, fontSize,
   421                             imageMode, textRot))
   422     margin = max(i[2] for i in labs)
   423     # xxx the margin may be too big, because some labels may get omitted
   424     return rangeSizes, labs, margin
   425 
   426 def div_ceil(x, y):
   427     '''Return x / y rounded up.'''
   428     q, r = divmod(x, y)
   429     return q + (r != 0)
   430 
   431 def get_bp_per_pix(rangeSizes, pixTweenRanges, maxPixels):
   432     '''Get the minimum bp-per-pixel that fits in the size limit.'''
   433     warn("choosing bp per pixel...")
   434     numOfRanges = len(rangeSizes)
   435     maxPixelsInRanges = maxPixels - pixTweenRanges * (numOfRanges - 1)
   436     if maxPixelsInRanges < numOfRanges:
   437         raise Exception("can't fit the image: too many sequences?")
   438     negLimit = -maxPixelsInRanges
   439     negBpPerPix = sum(rangeSizes) // negLimit
   440     while True:
   441         if sum(i // negBpPerPix for i in rangeSizes) >= negLimit:
   442             return -negBpPerPix
   443         negBpPerPix -= 1
   444 
   445 def getRangePixBegs(rangePixLens, pixTweenRanges, margin):
   446     '''Get the start pixel for each range.'''
   447     rangePixBegs = []
   448     pix_tot = margin - pixTweenRanges
   449     for i in rangePixLens:
   450         pix_tot += pixTweenRanges
   451         rangePixBegs.append(pix_tot)
   452         pix_tot += i
   453     return rangePixBegs
   454 
   455 def pixelData(rangeSizes, bp_per_pix, pixTweenRanges, margin):
   456     '''Return pixel information about the ranges.'''
   457     rangePixLens = [div_ceil(i, bp_per_pix) for i in rangeSizes]
   458     rangePixBegs = getRangePixBegs(rangePixLens, pixTweenRanges, margin)
   459     tot_pix = rangePixBegs[-1] + rangePixLens[-1]
   460     return rangePixBegs, rangePixLens, tot_pix
   461 
   462 def drawLineForward(hits, width, bp_per_pix, beg1, beg2, size):
   463     while True:
   464         q1, r1 = divmod(beg1, bp_per_pix)
   465         q2, r2 = divmod(beg2, bp_per_pix)
   466         hits[q2 * width + q1] |= 1
   467         next_pix = min(bp_per_pix - r1, bp_per_pix - r2)
   468         if next_pix >= size: break
   469         beg1 += next_pix
   470         beg2 += next_pix
   471         size -= next_pix
   472 
   473 def drawLineReverse(hits, width, bp_per_pix, beg1, beg2, size):
   474     while True:
   475         q1, r1 = divmod(beg1, bp_per_pix)
   476         q2, r2 = divmod(beg2, bp_per_pix)
   477         hits[q2 * width + q1] |= 2
   478         next_pix = min(bp_per_pix - r1, r2 + 1)
   479         if next_pix >= size: break
   480         beg1 += next_pix
   481         beg2 -= next_pix
   482         size -= next_pix
   483 
   484 def strandAndOrigin(ranges, beg, size):
   485     isReverseStrand = (beg < 0)
   486     if isReverseStrand:
   487         beg = -(beg + size)
   488     for rangeBeg, rangeEnd, isReverseRange, origin in ranges:
   489         if rangeEnd > beg:  # assumes the ranges are sorted
   490             return (isReverseStrand != isReverseRange), origin
   491 
   492 def alignmentPixels(width, height, alignments, bp_per_pix,
   493                     rangeDict1, rangeDict2):
   494     hits = [0] * (width * height)  # the image data
   495     for seq1, seq2, blocks in alignments:
   496         beg1, beg2, size = blocks[0]
   497         isReverse1, ori1 = strandAndOrigin(rangeDict1[seq1], beg1, size)
   498         isReverse2, ori2 = strandAndOrigin(rangeDict2[seq2], beg2, size)
   499         for beg1, beg2, size in blocks:
   500             if isReverse1:
   501                 beg1 = -(beg1 + size)
   502                 beg2 = -(beg2 + size)
   503             if isReverse1 == isReverse2:
   504                 drawLineForward(hits, width, bp_per_pix,
   505                                 ori1 + beg1, ori2 + beg2, size)
   506             else:
   507                 drawLineReverse(hits, width, bp_per_pix,
   508                                 ori1 + beg1, ori2 - beg2 - 1, size)
   509     return hits
   510 
   511 def expandedSeqDict(seqDict):
   512     '''Allow lookup by short sequence names, e.g. chr7 as well as hg19.chr7.'''
   513     newDict = seqDict.copy()
   514     for name, x in seqDict.items():
   515         if "." in name:
   516             base = name.split(".")[-1]
   517             if base in newDict:  # an ambiguous case was found:
   518                 return seqDict   # so give up completely
   519             newDict[base] = x
   520     return newDict
   521 
   522 def readBed(fileName, rangeDict):
   523     for line in myOpen(fileName):
   524         w = line.split()
   525         if not w: continue
   526         seqName = w[0]
   527         if seqName not in rangeDict: continue
   528         beg = int(w[1])
   529         end = int(w[2])
   530         layer = 900
   531         color = "#fbf"
   532         if len(w) > 4:
   533             if w[4] != ".":
   534                 layer = float(w[4])
   535             if len(w) > 5:
   536                 if len(w) > 8 and w[8].count(",") == 2:
   537                     color = "rgb(" + w[8] + ")"
   538                 else:
   539                     strand = w[5]
   540                     isRev = rangeDict[seqName][0][2]
   541                     if strand == "+" and not isRev or strand == "-" and isRev:
   542                         color = "#ffe8e8"
   543                     if strand == "-" and not isRev or strand == "+" and isRev:
   544                         color = "#e8e8ff"
   545         yield layer, color, seqName, beg, end
   546 
   547 def commaSeparatedInts(text):
   548     return map(int, text.rstrip(",").split(","))
   549 
   550 def readGenePred(opts, fileName, rangeDict):
   551     for line in myOpen(fileName):
   552         fields = line.split()
   553         if not fields: continue
   554         if fields[2] not in "+-": fields = fields[1:]
   555         seqName = fields[1]
   556         if seqName not in rangeDict: continue
   557         #strand = fields[2]
   558         cdsBeg = int(fields[5])
   559         cdsEnd = int(fields[6])
   560         exonBegs = commaSeparatedInts(fields[8])
   561         exonEnds = commaSeparatedInts(fields[9])
   562         for beg, end in zip(exonBegs, exonEnds):
   563             yield 300, opts.exon_color, seqName, beg, end
   564             b = max(beg, cdsBeg)
   565             e = min(end, cdsEnd)
   566             if b < e: yield 400, opts.cds_color, seqName, b, e
   567 
   568 def readRmsk(fileName, rangeDict):
   569     for line in myOpen(fileName):
   570         fields = line.split()
   571         if len(fields) == 17:  # rmsk.txt
   572             seqName = fields[5]
   573             if seqName not in rangeDict: continue  # do this ASAP for speed
   574             beg = int(fields[6])
   575             end = int(fields[7])
   576             strand = fields[9]
   577             repeatClass = fields[11]
   578         elif len(fields) == 15:  # .out
   579             seqName = fields[4]
   580             if seqName not in rangeDict: continue
   581             beg = int(fields[5]) - 1
   582             end = int(fields[6])
   583             strand = fields[8]
   584             repeatClass = fields[10]
   585         else:
   586             continue
   587         if repeatClass in ("Low_complexity", "Simple_repeat"):
   588             yield 200, "#fbf", seqName, beg, end
   589         elif (strand == "+") != rangeDict[seqName][0][2]:
   590             yield 100, "#ffe8e8", seqName, beg, end
   591         else:
   592             yield 100, "#e8e8ff", seqName, beg, end
   593 
   594 def isExtraFirstGapField(fields):
   595     return fields[4].isdigit()
   596 
   597 def readGaps(opts, fileName, rangeDict):
   598     '''Read locations of unsequenced gaps, from an agp or gap file.'''
   599     for line in myOpen(fileName):
   600         w = line.split()
   601         if not w or w[0][0] == "#": continue
   602         if isExtraFirstGapField(w): w = w[1:]
   603         if w[4] not in "NU": continue
   604         seqName = w[0]
   605         if seqName not in rangeDict: continue
   606         end = int(w[2])
   607         beg = end - int(w[5])  # zero-based coordinate
   608         if w[7] == "yes":
   609             yield 3000, opts.bridged_color, seqName, beg, end
   610         else:
   611             yield 2000, opts.unbridged_color, seqName, beg, end
   612 
   613 def bedBoxes(beds, rangeDict, margin, edge, isTop, bpPerPix):
   614     for layer, color, seqName, bedBeg, bedEnd in beds:
   615         for rangeBeg, rangeEnd, isReverseRange, origin in rangeDict[seqName]:
   616             beg = max(bedBeg, rangeBeg)
   617             end = min(bedEnd, rangeEnd)
   618             if beg >= end: continue
   619             if isReverseRange:
   620                 beg, end = -end, -beg
   621             if layer <= 1000:
   622                 # include partly-covered pixels
   623                 b = (origin + beg) // bpPerPix
   624                 e = div_ceil(origin + end, bpPerPix)
   625             else:
   626                 # exclude partly-covered pixels
   627                 b = div_ceil(origin + beg, bpPerPix)
   628                 e = (origin + end) // bpPerPix
   629                 if e <= b: continue
   630                 if bedEnd >= rangeEnd:  # include partly-covered end pixels
   631                     if isReverseRange:
   632                         b = (origin + beg) // bpPerPix
   633                     else:
   634                         e = div_ceil(origin + end, bpPerPix)
   635             if isTop:
   636                 box = b, margin, e, edge
   637             else:
   638                 box = margin, b, edge, e
   639             yield layer, color, box
   640 
   641 def drawAnnotations(im, boxes):
   642     # xxx use partial transparency for different-color overlaps?
   643     for layer, color, box in boxes:
   644         im.paste(color, box)
   645 
   646 def placedLabels(labels, rangePixBegs, rangePixLens, beg, end):
   647     '''Return axis labels with endpoint & sort-order information.'''
   648     maxWidth = end - beg
   649     for i, j, k in zip(labels, rangePixBegs, rangePixLens):
   650         text, textWidth, textHeight, strandNum = i
   651         if textWidth > maxWidth:
   652             continue
   653         labelBeg = j + (k - textWidth) // 2
   654         labelEnd = labelBeg + textWidth
   655         sortKey = textWidth - k
   656         if labelBeg < beg:
   657             sortKey += maxWidth * (beg - labelBeg)
   658             labelBeg = beg
   659             labelEnd = beg + textWidth
   660         if labelEnd > end:
   661             sortKey += maxWidth * (labelEnd - end)
   662             labelEnd = end
   663             labelBeg = end - textWidth
   664         yield sortKey, labelBeg, labelEnd, text, textHeight, strandNum
   665 
   666 def nonoverlappingLabels(labels, minPixTweenLabels):
   667     '''Get a subset of non-overlapping axis labels, greedily.'''
   668     out = []
   669     for i in labels:
   670         beg = i[1] - minPixTweenLabels
   671         end = i[2] + minPixTweenLabels
   672         if all(j[2] <= beg or j[1] >= end for j in out):
   673             out.append(i)
   674     return out
   675 
   676 def axisImage(labels, rangePixBegs, rangePixLens, textRot,
   677               textAln, font, image_mode, opts):
   678     '''Make an image of axis labels.'''
   679     beg = rangePixBegs[0]
   680     end = rangePixBegs[-1] + rangePixLens[-1]
   681     margin = max(i[2] for i in labels)
   682     labels = sorted(placedLabels(labels, rangePixBegs, rangePixLens, beg, end))
   683     minPixTweenLabels = 0 if textRot else opts.label_space
   684     labels = nonoverlappingLabels(labels, minPixTweenLabels)
   685     image_size = (margin, end) if textRot else (end, margin)
   686     im = Image.new(image_mode, image_size, opts.margin_color)
   687     draw = ImageDraw.Draw(im)
   688     for sortKey, labelBeg, labelEnd, text, textHeight, strandNum in labels:
   689         base = margin - textHeight if textAln else 0
   690         position = (base, labelBeg) if textRot else (labelBeg, base)
   691         fill = ("black", opts.forwardcolor, opts.reversecolor)[strandNum]
   692         draw.text(position, text, font=font, fill=fill)
   693     return im
   694 
   695 def rangesWithOrigins(sortedRanges, rangePixBegs, rangePixLens, bpPerPix):
   696     for i, j, k in zip(sortedRanges, rangePixBegs, rangePixLens):
   697         seqName, rangeBeg, rangeEnd, strandNum = i
   698         isReverseRange = (strandNum > 1)
   699         if isReverseRange:
   700             origin = bpPerPix * (j + k) + rangeBeg
   701         else:
   702             origin = bpPerPix * j - rangeBeg
   703         yield seqName, (rangeBeg, rangeEnd, isReverseRange, origin)
   704 
   705 def rangesPerSeq(sortedRanges, rangePixBegs, rangePixLens, bpPerPix):
   706     a = rangesWithOrigins(sortedRanges, rangePixBegs, rangePixLens, bpPerPix)
   707     for k, v in itertools.groupby(a, itemgetter(0)):
   708         yield k, sorted(i[1] for i in v)
   709 
   710 def getFont(opts):
   711     if opts.fontfile:
   712         return ImageFont.truetype(opts.fontfile, opts.fontsize)
   713     fileNames = []
   714     try:
   715         x = ["fc-match", "-f%{file}", "arial"]
   716         p = subprocess.Popen(x, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
   717         out, err = p.communicate()
   718         fileNames.append(out)
   719     except OSError as e:
   720         warn("fc-match error: " + str(e))
   721     fileNames.append("/Library/Fonts/Arial.ttf")  # for Mac
   722     for i in fileNames:
   723         try:
   724             font = ImageFont.truetype(i, opts.fontsize)
   725             warn("font: " + i)
   726             return font
   727         except IOError as e:
   728             warn("font load error: " + str(e))
   729     return ImageFont.load_default()
   730 
   731 def lastDotplot(opts, args):
   732     font = getFont(opts)
   733     image_mode = 'RGB'
   734     forward_color = ImageColor.getcolor(opts.forwardcolor, image_mode)
   735     reverse_color = ImageColor.getcolor(opts.reversecolor, image_mode)
   736     zipped_colors = zip(forward_color, reverse_color)
   737     overlap_color = tuple([(i + j) // 2 for i, j in zipped_colors])
   738 
   739     maxGap1, maxGapB1 = twoValuesFromOption(opts.max_gap1, ":")
   740     maxGap2, maxGapB2 = twoValuesFromOption(opts.max_gap2, ":")
   741 
   742     warn("reading alignments...")
   743     alnData = readAlignments(args[0], opts)
   744     alignments, seqRanges1, coverDict1, seqRanges2, coverDict2 = alnData
   745     if not alignments: raise Exception("there are no alignments")
   746     warn("cutting...")
   747     coverDict1 = dict(mergedRangesPerSeq(coverDict1))
   748     coverDict2 = dict(mergedRangesPerSeq(coverDict2))
   749     minAlignedBases = min(coveredLength(coverDict1), coveredLength(coverDict2))
   750     pad = int(opts.pad * minAlignedBases)
   751     cutRanges1 = list(trimmed(seqRanges1, coverDict1, minAlignedBases,
   752                               maxGap1, pad, pad))
   753     cutRanges2 = list(trimmed(seqRanges2, coverDict2, minAlignedBases,
   754                               maxGap2, pad, pad))
   755 
   756     warn("reading secondary alignments...")
   757     alnDataB = readSecondaryAlignments(opts, cutRanges1, cutRanges2)
   758     alignmentsB, seqRangesB1, coverDictB1, seqRangesB2, coverDictB2 = alnDataB
   759     warn("cutting...")
   760     coverDictB1 = dict(mergedRangesPerSeq(coverDictB1))
   761     coverDictB2 = dict(mergedRangesPerSeq(coverDictB2))
   762     cutRangesB1 = trimmed(seqRangesB1, coverDictB1, minAlignedBases,
   763                           maxGapB1, 0, 0)
   764     cutRangesB2 = trimmed(seqRangesB2, coverDictB2, minAlignedBases,
   765                           maxGapB2, 0, 0)
   766 
   767     warn("sorting...")
   768     sortOut = allSortedRanges(opts, alignments, alignmentsB,
   769                               cutRanges1, cutRangesB1, cutRanges2, cutRangesB2)
   770     sortedRanges1, sortedRanges2 = sortOut
   771 
   772     textRot1 = "vertical".startswith(opts.rot1)
   773     i1 = dataFromRanges(sortedRanges1, font,
   774                         opts.fontsize, image_mode, opts.labels1, textRot1)
   775     rangeSizes1, labelData1, tMargin = i1
   776 
   777     textRot2 = "horizontal".startswith(opts.rot2)
   778     i2 = dataFromRanges(sortedRanges2, font,
   779                         opts.fontsize, image_mode, opts.labels2, textRot2)
   780     rangeSizes2, labelData2, lMargin = i2
   781 
   782     maxPixels1 = opts.width  - lMargin
   783     maxPixels2 = opts.height - tMargin
   784     bpPerPix1 = get_bp_per_pix(rangeSizes1, opts.border_pixels, maxPixels1)
   785     bpPerPix2 = get_bp_per_pix(rangeSizes2, opts.border_pixels, maxPixels2)
   786     bpPerPix = max(bpPerPix1, bpPerPix2)
   787     warn("bp per pixel = " + str(bpPerPix))
   788 
   789     p1 = pixelData(rangeSizes1, bpPerPix, opts.border_pixels, lMargin)
   790     rangePixBegs1, rangePixLens1, width = p1
   791     rangeDict1 = dict(rangesPerSeq(sortedRanges1, rangePixBegs1,
   792                                    rangePixLens1, bpPerPix))
   793 
   794     p2 = pixelData(rangeSizes2, bpPerPix, opts.border_pixels, tMargin)
   795     rangePixBegs2, rangePixLens2, height = p2
   796     rangeDict2 = dict(rangesPerSeq(sortedRanges2, rangePixBegs2,
   797                                    rangePixLens2, bpPerPix))
   798 
   799     warn("width:  " + str(width))
   800     warn("height: " + str(height))
   801 
   802     warn("processing alignments...")
   803     hits = alignmentPixels(width, height, alignments + alignmentsB, bpPerPix,
   804                            rangeDict1, rangeDict2)
   805 
   806     warn("reading annotations...")
   807 
   808     rangeDict1 = expandedSeqDict(rangeDict1)
   809     beds1 = itertools.chain(readBed(opts.bed1, rangeDict1),
   810                             readRmsk(opts.rmsk1, rangeDict1),
   811                             readGenePred(opts, opts.genePred1, rangeDict1),
   812                             readGaps(opts, opts.gap1, rangeDict1))
   813     b1 = bedBoxes(beds1, rangeDict1, tMargin, height, True, bpPerPix)
   814 
   815     rangeDict2 = expandedSeqDict(rangeDict2)
   816     beds2 = itertools.chain(readBed(opts.bed2, rangeDict2),
   817                             readRmsk(opts.rmsk2, rangeDict2),
   818                             readGenePred(opts, opts.genePred2, rangeDict2),
   819                             readGaps(opts, opts.gap2, rangeDict2))
   820     b2 = bedBoxes(beds2, rangeDict2, lMargin, width, False, bpPerPix)
   821 
   822     boxes = sorted(itertools.chain(b1, b2))
   823 
   824     warn("drawing...")
   825 
   826     image_size = width, height
   827     im = Image.new(image_mode, image_size, opts.background_color)
   828 
   829     drawAnnotations(im, boxes)
   830 
   831     for i in range(height):
   832         for j in range(width):
   833             store_value = hits[i * width + j]
   834             xy = j, i
   835             if   store_value == 1: im.putpixel(xy, forward_color)
   836             elif store_value == 2: im.putpixel(xy, reverse_color)
   837             elif store_value == 3: im.putpixel(xy, overlap_color)
   838 
   839     if opts.fontsize != 0:
   840         axis1 = axisImage(labelData1, rangePixBegs1, rangePixLens1,
   841                           textRot1, False, font, image_mode, opts)
   842         if textRot1:
   843             axis1 = axis1.transpose(Image.ROTATE_90)
   844         axis2 = axisImage(labelData2, rangePixBegs2, rangePixLens2,
   845                           textRot2, textRot2, font, image_mode, opts)
   846         if not textRot2:
   847             axis2 = axis2.transpose(Image.ROTATE_270)
   848         im.paste(axis1, (0, 0))
   849         im.paste(axis2, (0, 0))
   850 
   851     for i in rangePixBegs1[1:]:
   852         box = i - opts.border_pixels, tMargin, i, height
   853         im.paste(opts.border_color, box)
   854 
   855     for i in rangePixBegs2[1:]:
   856         box = lMargin, i - opts.border_pixels, width, i
   857         im.paste(opts.border_color, box)
   858 
   859     im.save(args[1])
   860 
   861 if __name__ == "__main__":
   862     usage = """%prog --help
   863    or: %prog [options] maf-or-tab-alignments dotplot.png
   864    or: %prog [options] maf-or-tab-alignments dotplot.gif
   865    or: ..."""
   866     description = "Draw a dotplot of pair-wise sequence alignments in MAF or tabular format."
   867     op = optparse.OptionParser(usage=usage, description=description)
   868     op.add_option("-v", "--verbose", action="count",
   869                   help="show progress messages & data about the plot")
   870     op.add_option("-1", "--seq1", metavar="PATTERN", action="append",
   871                   default=[],
   872                   help="which sequences to show from the 1st genome")
   873     op.add_option("-2", "--seq2", metavar="PATTERN", action="append",
   874                   default=[],
   875                   help="which sequences to show from the 2nd genome")
   876     # Replace "width" & "height" with a single "length" option?
   877     op.add_option("-x", "--width", type="int", default=1000,
   878                   help="maximum width in pixels (default: %default)")
   879     op.add_option("-y", "--height", type="int", default=1000,
   880                   help="maximum height in pixels (default: %default)")
   881     op.add_option("-c", "--forwardcolor", metavar="COLOR", default="red",
   882                   help="color for forward alignments (default: %default)")
   883     op.add_option("-r", "--reversecolor", metavar="COLOR", default="blue",
   884                   help="color for reverse alignments (default: %default)")
   885     op.add_option("--alignments", metavar="FILE", help="secondary alignments")
   886     op.add_option("--sort1", default="1", metavar="N",
   887                   help="genome1 sequence order: 0=input order, 1=name order, "
   888                   "2=length order, 3=alignment order (default=%default)")
   889     op.add_option("--sort2", default="1", metavar="N",
   890                   help="genome2 sequence order: 0=input order, 1=name order, "
   891                   "2=length order, 3=alignment order (default=%default)")
   892     op.add_option("--strands1", default="0", metavar="N", help=
   893                   "genome1 sequence orientation: 0=forward orientation, "
   894                   "1=alignment orientation (default=%default)")
   895     op.add_option("--strands2", default="0", metavar="N", help=
   896                   "genome2 sequence orientation: 0=forward orientation, "
   897                   "1=alignment orientation (default=%default)")
   898     op.add_option("--max-gap1", metavar="FRAC", default="0.5,2", help=
   899                   "maximum unaligned (end,mid) gap in genome1: "
   900                   "fraction of aligned length (default=%default)")
   901     op.add_option("--max-gap2", metavar="FRAC", default="0.5,2", help=
   902                   "maximum unaligned (end,mid) gap in genome2: "
   903                   "fraction of aligned length (default=%default)")
   904     op.add_option("--pad", metavar="FRAC", type="float", default=0.04, help=
   905                   "pad length when cutting unaligned gaps: "
   906                   "fraction of aligned length (default=%default)")
   907     op.add_option("--border-pixels", metavar="INT", type="int", default=1,
   908                   help="number of pixels between sequences (default=%default)")
   909     op.add_option("--border-color", metavar="COLOR", default="black",
   910                   help="color for pixels between sequences (default=%default)")
   911     # --break-color and/or --break-pixels for intra-sequence breaks?
   912     op.add_option("--margin-color", metavar="COLOR", default="#dcdcdc",
   913                   help="margin color")
   914 
   915     og = optparse.OptionGroup(op, "Text options")
   916     og.add_option("-f", "--fontfile", metavar="FILE",
   917                   help="TrueType or OpenType font file")
   918     og.add_option("-s", "--fontsize", metavar="SIZE", type="int", default=14,
   919                   help="TrueType or OpenType font size (default: %default)")
   920     og.add_option("--labels1", type="int", default=0, metavar="N", help=
   921                   "genome1 labels: 0=name, 1=name:length, "
   922                   "2=name:start:length, 3=name:start-end (default=%default)")
   923     og.add_option("--labels2", type="int", default=0, metavar="N", help=
   924                   "genome2 labels: 0=name, 1=name:length, "
   925                   "2=name:start:length, 3=name:start-end (default=%default)")
   926     og.add_option("--rot1", metavar="ROT", default="h",
   927                   help="text rotation for the 1st genome (default=%default)")
   928     og.add_option("--rot2", metavar="ROT", default="v",
   929                   help="text rotation for the 2nd genome (default=%default)")
   930     op.add_option_group(og)
   931 
   932     og = optparse.OptionGroup(op, "Annotation options")
   933     og.add_option("--bed1", metavar="FILE",
   934                   help="read genome1 annotations from BED file")
   935     og.add_option("--bed2", metavar="FILE",
   936                   help="read genome2 annotations from BED file")
   937     og.add_option("--rmsk1", metavar="FILE", help="read genome1 repeats from "
   938                   "RepeatMasker .out or rmsk.txt file")
   939     og.add_option("--rmsk2", metavar="FILE", help="read genome2 repeats from "
   940                   "RepeatMasker .out or rmsk.txt file")
   941     op.add_option_group(og)
   942 
   943     og = optparse.OptionGroup(op, "Gene options")
   944     og.add_option("--genePred1", metavar="FILE",
   945                   help="read genome1 genes from genePred file")
   946     og.add_option("--genePred2", metavar="FILE",
   947                   help="read genome2 genes from genePred file")
   948     og.add_option("--exon-color", metavar="COLOR", default="PaleGreen",
   949                   help="color for exons (default=%default)")
   950     og.add_option("--cds-color", metavar="COLOR", default="LimeGreen",
   951                   help="color for protein-coding regions (default=%default)")
   952     op.add_option_group(og)
   953 
   954     og = optparse.OptionGroup(op, "Unsequenced gap options")
   955     og.add_option("--gap1", metavar="FILE",
   956                   help="read genome1 unsequenced gaps from agp or gap file")
   957     og.add_option("--gap2", metavar="FILE",
   958                   help="read genome2 unsequenced gaps from agp or gap file")
   959     og.add_option("--bridged-color", metavar="COLOR", default="yellow",
   960                   help="color for bridged gaps (default: %default)")
   961     og.add_option("--unbridged-color", metavar="COLOR", default="orange",
   962                   help="color for unbridged gaps (default: %default)")
   963     op.add_option_group(og)
   964     (opts, args) = op.parse_args()
   965     if len(args) != 2: op.error("2 arguments needed")
   966 
   967     opts.background_color = "white"
   968     opts.label_space = 5     # minimum number of pixels between axis labels
   969 
   970     try: lastDotplot(opts, args)
   971     except KeyboardInterrupt: pass  # avoid silly error message
   972     except Exception as e:
   973         prog = os.path.basename(sys.argv[0])
   974         sys.exit(prog + ": error: " + str(e))