3 # Read pair-wise alignments in MAF or LAST tabular format: write an
4 # "Oxford grid", a.k.a. dotplot.
6 # TODO: Currently, pixels with zero aligned nt-pairs are white, and
7 # pixels with one or more aligned nt-pairs are black. This can look
8 # too crowded for large genome alignments. I tried shading each pixel
9 # according to the number of aligned nt-pairs within it, but the
10 # result is too faint. How can this be done better?
15 from fnmatch import fnmatchcase
17 from operator import itemgetter
19 import itertools, optparse, os, re, sys
21 # Try to make PIL/PILLOW work:
23 from PIL import Image, ImageDraw, ImageFont, ImageColor
25 import Image, ImageDraw, ImageFont, ImageColor
28 from future_builtins import zip
32 def myOpen(fileName): # faster than fileinput
37 if fileName.endswith(".gz"):
38 return gzip.open(fileName, "rt") # xxx dubious for Python2
41 def groupByFirstItem(things):
42 for k, v in itertools.groupby(things, itemgetter(0)):
43 yield k, [i[1:] for i in v]
45 def croppedBlocks(blocks, ranges1, ranges2):
46 headBeg1, headBeg2, headSize = blocks[0]
49 cropBeg1, cropEnd1 = r1
51 cropBeg1, cropEnd1 = -cropEnd1, -cropBeg1
52 cropBeg2, cropEnd2 = r2
54 cropBeg2, cropEnd2 = -cropEnd2, -cropBeg2
55 for beg1, beg2, size in blocks:
56 b1 = max(cropBeg1, beg1)
57 e1 = min(cropEnd1, beg1 + size)
60 b2 = max(cropBeg2, b1 + offset)
61 e2 = min(cropEnd2, e1 + offset)
63 yield b2 - offset, b2, e2 - b2
65 def tabBlocks(beg1, beg2, blocks):
66 '''Get the gapless blocks of an alignment, from LAST tabular format.'''
67 for i in blocks.split(","):
74 yield beg1, beg2, size
78 def mafBlocks(beg1, beg2, seq1, seq2):
79 '''Get the gapless blocks of an alignment, from MAF format.'''
81 for x, y in zip(seq1, seq2):
84 yield beg1, beg2, size
91 yield beg1, beg2, size
98 if size: yield beg1, beg2, size
100 def alignmentFromSegment(qrySeqName, qrySeqLen, segment):
101 refSeqLen = sys.maxsize # XXX
102 refSeqName, refSeqBeg, qrySeqBeg, size = segment
103 block = refSeqBeg, qrySeqBeg, size
104 return refSeqName, refSeqLen, qrySeqName, qrySeqLen, [block]
106 def alignmentInput(lines):
107 '''Get alignments and sequence lengths, from MAF or tabular format.'''
117 yield alignmentFromSegment(qrySeqName, qrySeqLen, i)
121 elif len(w) == 2 and qrySeqName and w[1].isdigit():
122 qrySeqLen += int(w[1])
123 elif len(w) == 3 and qrySeqName and w[1].isdigit() and w[2].isdigit():
124 refSeqName, refSeqBeg, refSeqEnd = w[0], int(w[1]), int(w[2])
125 size = abs(refSeqEnd - refSeqBeg)
126 if refSeqBeg > refSeqEnd:
127 refSeqBeg = -refSeqBeg
128 segments.append((refSeqName, refSeqBeg, qrySeqLen, size))
130 elif line[0].isdigit(): # tabular format
131 chr1, beg1, seqlen1 = w[1], int(w[2]), int(w[5])
132 if w[4] == "-": beg1 -= seqlen1
133 chr2, beg2, seqlen2 = w[6], int(w[7]), int(w[10])
134 if w[9] == "-": beg2 -= seqlen2
135 blocks = tabBlocks(beg1, beg2, w[11])
136 yield chr1, seqlen1, chr2, seqlen2, blocks
137 elif line[0] == "s": # MAF format
139 chr1, beg1, seqlen1, seq1 = w[1], int(w[2]), int(w[5]), w[6]
140 if w[4] == "-": beg1 -= seqlen1
143 chr2, beg2, seqlen2, seq2 = w[1], int(w[2]), int(w[5]), w[6]
144 if w[4] == "-": beg2 -= seqlen2
145 blocks = mafBlocks(beg1, beg2, seq1, seq2)
146 yield chr1, seqlen1, chr2, seqlen2, blocks
149 yield alignmentFromSegment(qrySeqName, qrySeqLen, i)
151 def seqRequestFromText(text):
153 pattern, interval = text.rsplit(":", 1)
155 beg, end = interval.rsplit("-", 1)
156 return pattern, int(beg), int(end) # beg may be negative
157 return text, 0, sys.maxsize
159 def rangesFromSeqName(seqRequests, name, seqLen):
161 base = name.split(".")[-1] # allow for names like hg19.chr7
162 for pat, beg, end in seqRequests:
163 if fnmatchcase(name, pat) or fnmatchcase(base, pat):
164 yield max(beg, 0), min(end, seqLen)
168 def updateSeqs(coverDict, seqRanges, seqName, ranges, coveredRange):
169 beg, end = coveredRange
171 coveredRange = -end, -beg
172 if seqName in coverDict:
173 coverDict[seqName].append(coveredRange)
175 coverDict[seqName] = [coveredRange]
176 for beg, end in ranges:
177 r = seqName, beg, end
180 def readAlignments(fileName, opts):
181 '''Get alignments and sequence limits, from MAF or tabular format.'''
182 seqRequests1 = [seqRequestFromText(i) for i in opts.seq1]
183 seqRequests2 = [seqRequestFromText(i) for i in opts.seq2]
190 lines = myOpen(fileName)
191 for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
192 ranges1 = sorted(rangesFromSeqName(seqRequests1, seqName1, seqLen1))
193 if not ranges1: continue
194 ranges2 = sorted(rangesFromSeqName(seqRequests2, seqName2, seqLen2))
195 if not ranges2: continue
196 b = list(croppedBlocks(list(blocks), ranges1, ranges2))
198 aln = seqName1, seqName2, b
199 alignments.append(aln)
200 coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
201 updateSeqs(coverDict1, seqRanges1, seqName1, ranges1, coveredRange1)
202 coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
203 updateSeqs(coverDict2, seqRanges2, seqName2, ranges2, coveredRange2)
204 return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
206 def nameAndRangesFromDict(cropDict, seqName):
207 if seqName in cropDict:
208 return seqName, cropDict[seqName]
209 n = seqName.split(".")[-1]
211 return n, cropDict[n]
214 def rangesForSecondaryAlignments(primaryRanges, seqLen):
219 def readSecondaryAlignments(opts, cropRanges1, cropRanges2):
220 cropDict1 = dict(groupByFirstItem(cropRanges1))
221 cropDict2 = dict(groupByFirstItem(cropRanges2))
228 lines = myOpen(opts.alignments)
229 for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
230 seqName1, ranges1 = nameAndRangesFromDict(cropDict1, seqName1)
231 seqName2, ranges2 = nameAndRangesFromDict(cropDict2, seqName2)
232 if not ranges1 and not ranges2:
234 r1 = rangesForSecondaryAlignments(ranges1, seqLen1)
235 r2 = rangesForSecondaryAlignments(ranges2, seqLen2)
236 b = list(croppedBlocks(list(blocks), r1, r2))
238 aln = seqName1, seqName2, b
239 alignments.append(aln)
241 coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
242 updateSeqs(coverDict1, seqRanges1, seqName1, r1, coveredRange1)
244 coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
245 updateSeqs(coverDict2, seqRanges2, seqName2, r2, coveredRange2)
246 return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
248 def twoValuesFromOption(text, separator):
249 if separator in text:
250 return text.split(separator)
253 def mergedRanges(ranges):
254 oldBeg, maxEnd = ranges[0]
255 for beg, end in ranges:
264 def mergedRangesPerSeq(coverDict):
265 for k, v in coverDict.items():
267 yield k, list(mergedRanges(v))
269 def coveredLength(mergedCoverDict):
270 return sum(sum(e - b for b, e in v) for v in mergedCoverDict.values())
272 def trimmed(seqRanges, coverDict, minAlignedBases, maxGapFrac, endPad, midPad):
273 maxEndGapFrac, maxMidGapFrac = twoValuesFromOption(maxGapFrac, ",")
274 maxEndGap = max(float(maxEndGapFrac) * minAlignedBases, endPad * 1.0)
275 maxMidGap = max(float(maxMidGapFrac) * minAlignedBases, midPad * 2.0)
277 for seqName, rangeBeg, rangeEnd in seqRanges:
278 seqBlocks = coverDict[seqName]
279 blocks = [i for i in seqBlocks if i[0] < rangeEnd and i[1] > rangeBeg]
280 if blocks[0][0] - rangeBeg > maxEndGap:
281 rangeBeg = blocks[0][0] - endPad
282 for j, y in enumerate(blocks):
285 if y[0] - x[1] > maxMidGap:
286 yield seqName, rangeBeg, x[1] + midPad
287 rangeBeg = y[0] - midPad
288 if rangeEnd - blocks[-1][1] > maxEndGap:
289 rangeEnd = blocks[-1][1] + endPad
290 yield seqName, rangeBeg, rangeEnd
292 def rangesWithStrandInfo(seqRanges, strandOpt, alignments, seqIndex):
294 forwardMinusReverse = collections.defaultdict(int)
297 beg1, beg2, size = blocks[0]
298 numOfAlignedLetterPairs = sum(i[2] for i in blocks)
299 if (beg1 < 0) != (beg2 < 0): # opposite-strand alignment
300 numOfAlignedLetterPairs *= -1
301 forwardMinusReverse[i[seqIndex]] += numOfAlignedLetterPairs
303 for seqName, beg, end in seqRanges:
305 strandNum = 1 if forwardMinusReverse[seqName] >= 0 else 2
306 yield seqName, beg, end, strandNum
308 def natural_sort_key(my_string):
309 '''Return a sort key for "natural" ordering, e.g. chr9 < chr10.'''
310 parts = re.split(r'(\d+)', my_string)
311 parts[1::2] = map(int, parts[1::2])
314 def nameKey(oneSeqRanges):
315 return natural_sort_key(oneSeqRanges[0][0])
317 def sizeKey(oneSeqRanges):
318 return sum(b - e for n, b, e, s in oneSeqRanges), nameKey(oneSeqRanges)
320 def alignmentKey(seqNamesToLists, oneSeqRanges):
321 seqName = oneSeqRanges[0][0]
322 alignmentsOfThisSequence = seqNamesToLists[seqName]
323 numOfAlignedLetterPairs = sum(i[3] for i in alignmentsOfThisSequence)
324 toMiddle = numOfAlignedLetterPairs // 2
325 for i in alignmentsOfThisSequence:
328 return i[1:3] # sequence-rank and "position" of this alignment
330 def rankAndFlipPerSeq(seqRanges):
331 rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
332 for rank, group in enumerate(rangesGroupedBySeqName):
333 seqName, ranges = group
334 strandNum = next(ranges)[3]
335 flip = 1 if strandNum < 2 else -1
336 yield seqName, (rank, flip)
338 def alignmentSortData(alignments, seqIndex, otherNamesToRanksAndFlips):
339 otherIndex = 1 - seqIndex
342 otherRank, otherFlip = otherNamesToRanksAndFlips[i[otherIndex]]
343 otherPos = otherFlip * abs(blocks[0][otherIndex] +
344 blocks[-1][otherIndex] + blocks[-1][2])
345 numOfAlignedLetterPairs = sum(i[2] for i in blocks)
346 yield i[seqIndex], otherRank, otherPos, numOfAlignedLetterPairs
348 def mySortedRanges(seqRanges, sortOpt, seqIndex, alignments, otherRanges):
349 rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
350 g = [list(ranges) for seqName, ranges in rangesGroupedBySeqName]
359 otherNamesToRanksAndFlips = dict(rankAndFlipPerSeq(otherRanges))
360 alns = sorted(alignmentSortData(alignments, seqIndex,
361 otherNamesToRanksAndFlips))
362 alnsGroupedBySeqName = itertools.groupby(alns, itemgetter(0))
363 seqNamesToLists = dict((k, list(v)) for k, v in alnsGroupedBySeqName)
364 g.sort(key=functools.partial(alignmentKey, seqNamesToLists))
365 return [j for i in g for j in i]
367 def allSortedRanges(opts, alignments, alignmentsB,
368 seqRanges1, seqRangesB1, seqRanges2, seqRangesB2):
369 o1, oB1 = twoValuesFromOption(opts.strands1, ":")
370 o2, oB2 = twoValuesFromOption(opts.strands2, ":")
371 if o1 == "1" and o2 == "1":
372 raise Exception("the strand options have circular dependency")
373 seqRanges1 = list(rangesWithStrandInfo(seqRanges1, o1, alignments, 0))
374 seqRanges2 = list(rangesWithStrandInfo(seqRanges2, o2, alignments, 1))
375 seqRangesB1 = list(rangesWithStrandInfo(seqRangesB1, oB1, alignmentsB, 0))
376 seqRangesB2 = list(rangesWithStrandInfo(seqRangesB2, oB2, alignmentsB, 1))
378 o1, oB1 = twoValuesFromOption(opts.sort1, ":")
379 o2, oB2 = twoValuesFromOption(opts.sort2, ":")
380 if o1 == "3" and o2 == "3":
381 raise Exception("the sort options have circular dependency")
383 s1 = mySortedRanges(seqRanges1, o1, None, None, None)
385 s2 = mySortedRanges(seqRanges2, o2, None, None, None)
387 s1 = mySortedRanges(seqRanges1, o1, 0, alignments, s2)
389 s2 = mySortedRanges(seqRanges2, o2, 1, alignments, s1)
390 t1 = mySortedRanges(seqRangesB1, oB1, 0, alignmentsB, s2)
391 t2 = mySortedRanges(seqRangesB2, oB2, 1, alignmentsB, s1)
392 return s1 + t1, s2 + t2
398 groups.append(t[-3:])
400 return ",".join(reversed(groups))
403 suffixes = "bp", "kb", "Mb", "Gb"
404 for i, x in enumerate(suffixes):
407 return "%.2g" % (1.0 * size / j) + x
408 if size < j * 1000 or i == len(suffixes) - 1:
409 return "%.0f" % (1.0 * size / j) + x
411 def labelText(seqRange, labelOpt):
412 seqName, beg, end, strandNum = seqRange
414 return seqName + ": " + sizeText(end - beg)
416 return seqName + ":" + prettyNum(beg) + ": " + sizeText(end - beg)
418 return seqName + ":" + prettyNum(beg) + "-" + prettyNum(end)
421 def rangeLabels(seqRanges, labelOpt, font, fontsize, image_mode, textRot):
424 im = Image.new(image_mode, image_size)
425 draw = ImageDraw.Draw(im)
428 text = labelText(r, labelOpt)
430 x, y = draw.textsize(text, font=font)
433 yield text, x, y, r[3]
435 def dataFromRanges(sortedRanges, font, fontSize, imageMode, labelOpt, textRot):
436 for seqName, rangeBeg, rangeEnd, strandNum in sortedRanges:
437 out = [seqName, str(rangeBeg), str(rangeEnd)]
439 out.append(".+-"[strandNum])
440 logging.info("\t".join(out))
442 rangeSizes = [e - b for n, b, e, s in sortedRanges]
443 labs = list(rangeLabels(sortedRanges, labelOpt, font, fontSize,
445 margin = max(i[2] for i in labs)
446 # xxx the margin may be too big, because some labels may get omitted
447 return rangeSizes, labs, margin
450 '''Return x / y rounded up.'''
454 def get_bp_per_pix(rangeSizes, pixTweenRanges, maxPixels):
455 '''Get the minimum bp-per-pixel that fits in the size limit.'''
456 logging.info("choosing bp per pixel...")
457 numOfRanges = len(rangeSizes)
458 maxPixelsInRanges = maxPixels - pixTweenRanges * (numOfRanges - 1)
459 if maxPixelsInRanges < numOfRanges:
460 raise Exception("can't fit the image: too many sequences?")
461 negLimit = -maxPixelsInRanges
462 negBpPerPix = sum(rangeSizes) // negLimit
464 if sum(i // negBpPerPix for i in rangeSizes) >= negLimit:
468 def getRangePixBegs(rangePixLens, pixTweenRanges, margin):
469 '''Get the start pixel for each range.'''
471 pix_tot = margin - pixTweenRanges
472 for i in rangePixLens:
473 pix_tot += pixTweenRanges
474 rangePixBegs.append(pix_tot)
478 def pixelData(rangeSizes, bp_per_pix, pixTweenRanges, margin):
479 '''Return pixel information about the ranges.'''
480 rangePixLens = [div_ceil(i, bp_per_pix) for i in rangeSizes]
481 rangePixBegs = getRangePixBegs(rangePixLens, pixTweenRanges, margin)
482 tot_pix = rangePixBegs[-1] + rangePixLens[-1]
483 return rangePixBegs, rangePixLens, tot_pix
485 def drawLineForward(hits, width, bp_per_pix, beg1, beg2, size):
487 q1, r1 = divmod(beg1, bp_per_pix)
488 q2, r2 = divmod(beg2, bp_per_pix)
489 hits[q2 * width + q1] |= 1
490 next_pix = min(bp_per_pix - r1, bp_per_pix - r2)
491 if next_pix >= size: break
496 def drawLineReverse(hits, width, bp_per_pix, beg1, beg2, size):
498 q1, r1 = divmod(beg1, bp_per_pix)
499 q2, r2 = divmod(beg2, bp_per_pix)
500 hits[q2 * width + q1] |= 2
501 next_pix = min(bp_per_pix - r1, r2 + 1)
502 if next_pix >= size: break
507 def strandAndOrigin(ranges, beg, size):
508 isReverseStrand = (beg < 0)
511 for rangeBeg, rangeEnd, isReverseRange, origin in ranges:
512 if rangeEnd > beg: # assumes the ranges are sorted
513 return (isReverseStrand != isReverseRange), origin
515 def alignmentPixels(width, height, alignments, bp_per_pix,
516 rangeDict1, rangeDict2):
517 hits = [0] * (width * height) # the image data
518 for seq1, seq2, blocks in alignments:
519 beg1, beg2, size = blocks[0]
520 isReverse1, ori1 = strandAndOrigin(rangeDict1[seq1], beg1, size)
521 isReverse2, ori2 = strandAndOrigin(rangeDict2[seq2], beg2, size)
522 for beg1, beg2, size in blocks:
524 beg1 = -(beg1 + size)
525 beg2 = -(beg2 + size)
526 if isReverse1 == isReverse2:
527 drawLineForward(hits, width, bp_per_pix,
528 ori1 + beg1, ori2 + beg2, size)
530 drawLineReverse(hits, width, bp_per_pix,
531 ori1 + beg1, ori2 - beg2 - 1, size)
534 def orientedBlocks(alignments, seqIndex):
535 otherIndex = 1 - seqIndex
537 seq1, seq2, blocks = a
541 b = -(beg1 + size), -(beg2 + size), size
542 yield a[seqIndex], b[seqIndex], a[otherIndex], b[otherIndex], size
544 def drawJoins(im, alignments, bpPerPix, seqIndex, rangeDict1, rangeDict2):
545 blocks = orientedBlocks(alignments, seqIndex)
547 for seq1, beg1, seq2, beg2, size in sorted(blocks):
548 isReverse1, ori1 = strandAndOrigin(rangeDict1[seq1], beg1, size)
549 isReverse2, ori2 = strandAndOrigin(rangeDict2[seq2], beg2, size)
550 end1 = beg1 + size - 1
551 end2 = beg2 + size - 1
558 newPix1 = (ori1 + beg1) // bpPerPix
559 newPix2 = (ori2 + beg2) // bpPerPix
561 lowerPix2 = min(oldPix2, newPix2)
562 upperPix2 = max(oldPix2, newPix2)
563 midPix1 = (oldPix1 + newPix1) // 2
565 midPix1 = (oldPix1 + newPix1 + 1) // 2
566 oldPix1, newPix1 = newPix1, oldPix1
567 if upperPix2 - lowerPix2 > 1 and oldPix1 <= newPix1 <= oldPix1 + 1:
569 box = midPix1, lowerPix2, midPix1 + 1, upperPix2 + 1
571 box = lowerPix2, midPix1, upperPix2 + 1, midPix1 + 1
572 im.paste("lightgray", box)
573 oldPix1 = (ori1 + end1) // bpPerPix
574 oldPix2 = (ori2 + end2) // bpPerPix
577 def expandedSeqDict(seqDict):
578 '''Allow lookup by short sequence names, e.g. chr7 as well as hg19.chr7.'''
579 newDict = seqDict.copy()
580 for name, x in seqDict.items():
582 base = name.split(".")[-1]
583 if base in newDict: # an ambiguous case was found:
584 return seqDict # so give up completely
588 def readBed(fileName, rangeDict):
589 for line in myOpen(fileName):
593 if seqName not in rangeDict: continue
602 if len(w) > 8 and w[8].count(",") == 2:
603 color = "rgb(" + w[8] + ")"
606 isRev = rangeDict[seqName][0][2]
607 if strand == "+" and not isRev or strand == "-" and isRev:
609 if strand == "-" and not isRev or strand == "+" and isRev:
611 yield layer, color, seqName, beg, end
613 def commaSeparatedInts(text):
614 return map(int, text.rstrip(",").split(","))
616 def readGenePred(opts, fileName, rangeDict):
617 for line in myOpen(fileName):
618 fields = line.split()
619 if not fields: continue
620 if fields[2] not in "+-": fields = fields[1:]
622 if seqName not in rangeDict: continue
624 cdsBeg = int(fields[5])
625 cdsEnd = int(fields[6])
626 exonBegs = commaSeparatedInts(fields[8])
627 exonEnds = commaSeparatedInts(fields[9])
628 for beg, end in zip(exonBegs, exonEnds):
629 yield 300, opts.exon_color, seqName, beg, end
632 if b < e: yield 400, opts.cds_color, seqName, b, e
634 def readRmsk(fileName, rangeDict):
635 for line in myOpen(fileName):
636 fields = line.split()
637 if len(fields) == 17: # rmsk.txt
639 if seqName not in rangeDict: continue # do this ASAP for speed
643 repeatClass = fields[11]
644 elif len(fields) == 15: # .out
646 if seqName not in rangeDict: continue
647 beg = int(fields[5]) - 1
650 repeatClass = fields[10]
653 if repeatClass in ("Low_complexity", "Simple_repeat"):
654 yield 200, "#fbf", seqName, beg, end
655 elif (strand == "+") != rangeDict[seqName][0][2]:
656 yield 100, "#ffe8e8", seqName, beg, end
658 yield 100, "#e8e8ff", seqName, beg, end
660 def isExtraFirstGapField(fields):
661 return fields[4].isdigit()
663 def readGaps(opts, fileName, rangeDict):
664 '''Read locations of unsequenced gaps, from an agp or gap file.'''
665 for line in myOpen(fileName):
667 if not w or w[0][0] == "#": continue
668 if isExtraFirstGapField(w): w = w[1:]
669 if w[4] not in "NU": continue
671 if seqName not in rangeDict: continue
673 beg = end - int(w[5]) # zero-based coordinate
675 yield 3000, opts.bridged_color, seqName, beg, end
677 yield 2000, opts.unbridged_color, seqName, beg, end
679 def bedBoxes(beds, rangeDict, margin, edge, isTop, bpPerPix):
680 for layer, color, seqName, bedBeg, bedEnd in beds:
681 for rangeBeg, rangeEnd, isReverseRange, origin in rangeDict[seqName]:
682 beg = max(bedBeg, rangeBeg)
683 end = min(bedEnd, rangeEnd)
684 if beg >= end: continue
686 beg, end = -end, -beg
688 # include partly-covered pixels
689 b = (origin + beg) // bpPerPix
690 e = div_ceil(origin + end, bpPerPix)
692 # exclude partly-covered pixels
693 b = div_ceil(origin + beg, bpPerPix)
694 e = (origin + end) // bpPerPix
696 if bedEnd >= rangeEnd: # include partly-covered end pixels
698 b = (origin + beg) // bpPerPix
700 e = div_ceil(origin + end, bpPerPix)
702 box = b, margin, e, edge
704 box = margin, b, edge, e
705 yield layer, color, box
707 def drawAnnotations(im, boxes):
708 # xxx use partial transparency for different-color overlaps?
709 for layer, color, box in boxes:
712 def placedLabels(labels, rangePixBegs, rangePixLens, beg, end):
713 '''Return axis labels with endpoint & sort-order information.'''
715 for i, j, k in zip(labels, rangePixBegs, rangePixLens):
716 text, textWidth, textHeight, strandNum = i
717 if textWidth > maxWidth:
719 labelBeg = j + (k - textWidth) // 2
720 labelEnd = labelBeg + textWidth
721 sortKey = textWidth - k
723 sortKey += maxWidth * (beg - labelBeg)
725 labelEnd = beg + textWidth
727 sortKey += maxWidth * (labelEnd - end)
729 labelBeg = end - textWidth
730 yield sortKey, labelBeg, labelEnd, text, textHeight, strandNum
732 def nonoverlappingLabels(labels, minPixTweenLabels):
733 '''Get a subset of non-overlapping axis labels, greedily.'''
736 beg = i[1] - minPixTweenLabels
737 end = i[2] + minPixTweenLabels
738 if all(j[2] <= beg or j[1] >= end for j in out):
742 def axisImage(labels, rangePixBegs, rangePixLens, textRot,
743 textAln, font, image_mode, opts):
744 '''Make an image of axis labels.'''
745 beg = rangePixBegs[0]
746 end = rangePixBegs[-1] + rangePixLens[-1]
747 margin = max(i[2] for i in labels)
748 labels = sorted(placedLabels(labels, rangePixBegs, rangePixLens, beg, end))
749 minPixTweenLabels = 0 if textRot else opts.label_space
750 labels = nonoverlappingLabels(labels, minPixTweenLabels)
751 image_size = (margin, end) if textRot else (end, margin)
752 im = Image.new(image_mode, image_size, opts.margin_color)
753 draw = ImageDraw.Draw(im)
754 for sortKey, labelBeg, labelEnd, text, textHeight, strandNum in labels:
755 base = margin - textHeight if textAln else 0
756 position = (base, labelBeg) if textRot else (labelBeg, base)
757 fill = ("black", opts.forwardcolor, opts.reversecolor)[strandNum]
758 draw.text(position, text, font=font, fill=fill)
761 def rangesWithOrigins(sortedRanges, rangePixBegs, rangePixLens, bpPerPix):
762 for i, j, k in zip(sortedRanges, rangePixBegs, rangePixLens):
763 seqName, rangeBeg, rangeEnd, strandNum = i
764 isReverseRange = (strandNum > 1)
766 origin = bpPerPix * (j + k) + rangeBeg
768 origin = bpPerPix * j - rangeBeg
769 yield seqName, (rangeBeg, rangeEnd, isReverseRange, origin)
771 def rangesPerSeq(sortedRanges, rangePixBegs, rangePixLens, bpPerPix):
772 a = rangesWithOrigins(sortedRanges, rangePixBegs, rangePixLens, bpPerPix)
773 for k, v in itertools.groupby(a, itemgetter(0)):
774 yield k, sorted(i[1] for i in v)
778 return ImageFont.truetype(opts.fontfile, opts.fontsize)
781 x = ["fc-match", "-f%{file}", "arial"]
782 p = subprocess.Popen(x, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
783 universal_newlines=True)
784 out, err = p.communicate()
785 fileNames.append(out)
787 logging.info("fc-match error: " + str(e))
788 fileNames.append("/Library/Fonts/Arial.ttf") # for Mac
791 font = ImageFont.truetype(i, opts.fontsize)
792 logging.info("font: " + i)
795 logging.info("font load error: " + str(e))
796 return ImageFont.load_default()
798 def sequenceSizesAndNames(seqRanges):
799 for seqName, ranges in itertools.groupby(seqRanges, itemgetter(0)):
800 size = sum(e - b for n, b, e in ranges)
803 def biggestSequences(seqRanges, maxNumOfSequences):
804 s = sorted(sequenceSizesAndNames(seqRanges), reverse=True)
805 if len(s) > maxNumOfSequences:
806 logging.warning("too many sequences - discarding the smallest ones")
807 s = s[:maxNumOfSequences]
808 return set(i[1] for i in s)
810 def remainingSequenceRanges(seqRanges, alignments, seqIndex):
811 remainingSequences = set(i[seqIndex] for i in alignments)
812 return [i for i in seqRanges if i[0] in remainingSequences]
814 def lastDotplot(opts, args):
815 logLevel = logging.INFO if opts.verbose else logging.WARNING
816 logging.basicConfig(format="%(filename)s: %(message)s", level=logLevel)
820 forward_color = ImageColor.getcolor(opts.forwardcolor, image_mode)
821 reverse_color = ImageColor.getcolor(opts.reversecolor, image_mode)
822 zipped_colors = zip(forward_color, reverse_color)
823 overlap_color = tuple([(i + j) // 2 for i, j in zipped_colors])
825 maxGap1, maxGapB1 = twoValuesFromOption(opts.max_gap1, ":")
826 maxGap2, maxGapB2 = twoValuesFromOption(opts.max_gap2, ":")
828 logging.info("reading alignments...")
829 alnData = readAlignments(args[0], opts)
830 alignments, seqRanges1, coverDict1, seqRanges2, coverDict2 = alnData
831 if not alignments: raise Exception("there are no alignments")
832 logging.info("cutting...")
833 coverDict1 = dict(mergedRangesPerSeq(coverDict1))
834 coverDict2 = dict(mergedRangesPerSeq(coverDict2))
835 minAlignedBases = min(coveredLength(coverDict1), coveredLength(coverDict2))
836 pad = int(opts.pad * minAlignedBases)
837 cutRanges1 = list(trimmed(seqRanges1, coverDict1, minAlignedBases,
839 cutRanges2 = list(trimmed(seqRanges2, coverDict2, minAlignedBases,
842 biggestSeqs1 = biggestSequences(cutRanges1, opts.maxseqs)
843 cutRanges1 = [i for i in cutRanges1 if i[0] in biggestSeqs1]
844 alignments = [i for i in alignments if i[0] in biggestSeqs1]
845 cutRanges2 = remainingSequenceRanges(cutRanges2, alignments, 1)
847 biggestSeqs2 = biggestSequences(cutRanges2, opts.maxseqs)
848 cutRanges2 = [i for i in cutRanges2 if i[0] in biggestSeqs2]
849 alignments = [i for i in alignments if i[1] in biggestSeqs2]
850 cutRanges1 = remainingSequenceRanges(cutRanges1, alignments, 0)
852 logging.info("reading secondary alignments...")
853 alnDataB = readSecondaryAlignments(opts, cutRanges1, cutRanges2)
854 alignmentsB, seqRangesB1, coverDictB1, seqRangesB2, coverDictB2 = alnDataB
855 logging.info("cutting...")
856 coverDictB1 = dict(mergedRangesPerSeq(coverDictB1))
857 coverDictB2 = dict(mergedRangesPerSeq(coverDictB2))
858 cutRangesB1 = trimmed(seqRangesB1, coverDictB1, minAlignedBases,
860 cutRangesB2 = trimmed(seqRangesB2, coverDictB2, minAlignedBases,
863 logging.info("sorting...")
864 sortOut = allSortedRanges(opts, alignments, alignmentsB,
865 cutRanges1, cutRangesB1, cutRanges2, cutRangesB2)
866 sortedRanges1, sortedRanges2 = sortOut
868 textRot1 = "vertical".startswith(opts.rot1)
869 i1 = dataFromRanges(sortedRanges1, font,
870 opts.fontsize, image_mode, opts.labels1, textRot1)
871 rangeSizes1, labelData1, tMargin = i1
873 textRot2 = "horizontal".startswith(opts.rot2)
874 i2 = dataFromRanges(sortedRanges2, font,
875 opts.fontsize, image_mode, opts.labels2, textRot2)
876 rangeSizes2, labelData2, lMargin = i2
878 maxPixels1 = opts.width - lMargin
879 maxPixels2 = opts.height - tMargin
880 bpPerPix1 = get_bp_per_pix(rangeSizes1, opts.border_pixels, maxPixels1)
881 bpPerPix2 = get_bp_per_pix(rangeSizes2, opts.border_pixels, maxPixels2)
882 bpPerPix = max(bpPerPix1, bpPerPix2)
883 logging.info("bp per pixel = " + str(bpPerPix))
885 p1 = pixelData(rangeSizes1, bpPerPix, opts.border_pixels, lMargin)
886 rangePixBegs1, rangePixLens1, width = p1
887 rangeDict1 = dict(rangesPerSeq(sortedRanges1, rangePixBegs1,
888 rangePixLens1, bpPerPix))
890 p2 = pixelData(rangeSizes2, bpPerPix, opts.border_pixels, tMargin)
891 rangePixBegs2, rangePixLens2, height = p2
892 rangeDict2 = dict(rangesPerSeq(sortedRanges2, rangePixBegs2,
893 rangePixLens2, bpPerPix))
895 logging.info("width: " + str(width))
896 logging.info("height: " + str(height))
898 logging.info("processing alignments...")
899 allAlignments = alignments + alignmentsB
900 hits = alignmentPixels(width, height, allAlignments, bpPerPix,
901 rangeDict1, rangeDict2)
903 logging.info("reading annotations...")
905 rangeDict1 = expandedSeqDict(rangeDict1)
906 beds1 = itertools.chain(readBed(opts.bed1, rangeDict1),
907 readRmsk(opts.rmsk1, rangeDict1),
908 readGenePred(opts, opts.genePred1, rangeDict1),
909 readGaps(opts, opts.gap1, rangeDict1))
910 b1 = bedBoxes(beds1, rangeDict1, tMargin, height, True, bpPerPix)
912 rangeDict2 = expandedSeqDict(rangeDict2)
913 beds2 = itertools.chain(readBed(opts.bed2, rangeDict2),
914 readRmsk(opts.rmsk2, rangeDict2),
915 readGenePred(opts, opts.genePred2, rangeDict2),
916 readGaps(opts, opts.gap2, rangeDict2))
917 b2 = bedBoxes(beds2, rangeDict2, lMargin, width, False, bpPerPix)
919 boxes = sorted(itertools.chain(b1, b2))
921 logging.info("drawing...")
923 image_size = width, height
924 im = Image.new(image_mode, image_size, opts.background_color)
926 drawAnnotations(im, boxes)
928 joinA, joinB = twoValuesFromOption(opts.join, ":")
930 drawJoins(im, alignments, bpPerPix, 0, rangeDict1, rangeDict2)
932 drawJoins(im, alignmentsB, bpPerPix, 0, rangeDict1, rangeDict2)
934 drawJoins(im, alignments, bpPerPix, 1, rangeDict2, rangeDict1)
936 drawJoins(im, alignmentsB, bpPerPix, 1, rangeDict2, rangeDict1)
938 for i in range(height):
939 for j in range(width):
940 store_value = hits[i * width + j]
942 if store_value == 1: im.putpixel(xy, forward_color)
943 elif store_value == 2: im.putpixel(xy, reverse_color)
944 elif store_value == 3: im.putpixel(xy, overlap_color)
946 if opts.fontsize != 0:
947 axis1 = axisImage(labelData1, rangePixBegs1, rangePixLens1,
948 textRot1, False, font, image_mode, opts)
950 axis1 = axis1.transpose(Image.ROTATE_90)
951 axis2 = axisImage(labelData2, rangePixBegs2, rangePixLens2,
952 textRot2, textRot2, font, image_mode, opts)
954 axis2 = axis2.transpose(Image.ROTATE_270)
955 im.paste(axis1, (0, 0))
956 im.paste(axis2, (0, 0))
958 for i in rangePixBegs1[1:]:
959 box = i - opts.border_pixels, tMargin, i, height
960 im.paste(opts.border_color, box)
962 for i in rangePixBegs2[1:]:
963 box = lMargin, i - opts.border_pixels, width, i
964 im.paste(opts.border_color, box)
968 if __name__ == "__main__":
969 usage = """%prog --help
970 or: %prog [options] maf-or-tab-alignments dotplot.png
971 or: %prog [options] maf-or-tab-alignments dotplot.gif
973 description = "Draw a dotplot of pair-wise sequence alignments in MAF or tabular format."
974 op = optparse.OptionParser(usage=usage, description=description)
975 op.add_option("-v", "--verbose", action="count",
976 help="show progress messages & data about the plot")
977 # Replace "width" & "height" with a single "length" option?
978 op.add_option("-x", "--width", metavar="INT", type="int", default=1000,
979 help="maximum width in pixels (default: %default)")
980 op.add_option("-y", "--height", metavar="INT", type="int", default=1000,
981 help="maximum height in pixels (default: %default)")
982 op.add_option("-m", "--maxseqs", type="int", default=100, metavar="M",
983 help="maximum number of horizontal or vertical sequences "
984 "(default=%default)")
985 op.add_option("-1", "--seq1", metavar="PATTERN", action="append",
987 help="which sequences to show from the 1st genome")
988 op.add_option("-2", "--seq2", metavar="PATTERN", action="append",
990 help="which sequences to show from the 2nd genome")
991 op.add_option("-c", "--forwardcolor", metavar="COLOR", default="red",
992 help="color for forward alignments (default: %default)")
993 op.add_option("-r", "--reversecolor", metavar="COLOR", default="blue",
994 help="color for reverse alignments (default: %default)")
995 op.add_option("--alignments", metavar="FILE", help="secondary alignments")
996 op.add_option("--sort1", default="1", metavar="N",
997 help="genome1 sequence order: 0=input order, 1=name order, "
998 "2=length order, 3=alignment order (default=%default)")
999 op.add_option("--sort2", default="1", metavar="N",
1000 help="genome2 sequence order: 0=input order, 1=name order, "
1001 "2=length order, 3=alignment order (default=%default)")
1002 op.add_option("--strands1", default="0", metavar="N", help=
1003 "genome1 sequence orientation: 0=forward orientation, "
1004 "1=alignment orientation (default=%default)")
1005 op.add_option("--strands2", default="0", metavar="N", help=
1006 "genome2 sequence orientation: 0=forward orientation, "
1007 "1=alignment orientation (default=%default)")
1008 op.add_option("--max-gap1", metavar="FRAC", default="0.5,2", help=
1009 "maximum unaligned (end,mid) gap in genome1: "
1010 "fraction of aligned length (default=%default)")
1011 op.add_option("--max-gap2", metavar="FRAC", default="0.5,2", help=
1012 "maximum unaligned (end,mid) gap in genome2: "
1013 "fraction of aligned length (default=%default)")
1014 op.add_option("--pad", metavar="FRAC", type="float", default=0.04, help=
1015 "pad length when cutting unaligned gaps: "
1016 "fraction of aligned length (default=%default)")
1017 op.add_option("-j", "--join", default="0", metavar="N", help=
1018 "join: 0=nothing, 1=alignments adjacent in genome1, "
1019 "2=alignments adjacent in genome2 (default=%default)")
1020 op.add_option("--border-pixels", metavar="INT", type="int", default=1,
1021 help="number of pixels between sequences (default=%default)")
1022 op.add_option("--border-color", metavar="COLOR", default="black",
1023 help="color for pixels between sequences (default=%default)")
1024 # --break-color and/or --break-pixels for intra-sequence breaks?
1025 op.add_option("--margin-color", metavar="COLOR", default="#dcdcdc",
1026 help="margin color")
1028 og = optparse.OptionGroup(op, "Text options")
1029 og.add_option("-f", "--fontfile", metavar="FILE",
1030 help="TrueType or OpenType font file")
1031 og.add_option("-s", "--fontsize", metavar="SIZE", type="int", default=14,
1032 help="TrueType or OpenType font size (default: %default)")
1033 og.add_option("--labels1", type="int", default=0, metavar="N", help=
1034 "genome1 labels: 0=name, 1=name:length, "
1035 "2=name:start:length, 3=name:start-end (default=%default)")
1036 og.add_option("--labels2", type="int", default=0, metavar="N", help=
1037 "genome2 labels: 0=name, 1=name:length, "
1038 "2=name:start:length, 3=name:start-end (default=%default)")
1039 og.add_option("--rot1", metavar="ROT", default="h",
1040 help="text rotation for the 1st genome (default=%default)")
1041 og.add_option("--rot2", metavar="ROT", default="v",
1042 help="text rotation for the 2nd genome (default=%default)")
1043 op.add_option_group(og)
1045 og = optparse.OptionGroup(op, "Annotation options")
1046 og.add_option("--bed1", metavar="FILE",
1047 help="read genome1 annotations from BED file")
1048 og.add_option("--bed2", metavar="FILE",
1049 help="read genome2 annotations from BED file")
1050 og.add_option("--rmsk1", metavar="FILE", help="read genome1 repeats from "
1051 "RepeatMasker .out or rmsk.txt file")
1052 og.add_option("--rmsk2", metavar="FILE", help="read genome2 repeats from "
1053 "RepeatMasker .out or rmsk.txt file")
1054 op.add_option_group(og)
1056 og = optparse.OptionGroup(op, "Gene options")
1057 og.add_option("--genePred1", metavar="FILE",
1058 help="read genome1 genes from genePred file")
1059 og.add_option("--genePred2", metavar="FILE",
1060 help="read genome2 genes from genePred file")
1061 og.add_option("--exon-color", metavar="COLOR", default="PaleGreen",
1062 help="color for exons (default=%default)")
1063 og.add_option("--cds-color", metavar="COLOR", default="LimeGreen",
1064 help="color for protein-coding regions (default=%default)")
1065 op.add_option_group(og)
1067 og = optparse.OptionGroup(op, "Unsequenced gap options")
1068 og.add_option("--gap1", metavar="FILE",
1069 help="read genome1 unsequenced gaps from agp or gap file")
1070 og.add_option("--gap2", metavar="FILE",
1071 help="read genome2 unsequenced gaps from agp or gap file")
1072 og.add_option("--bridged-color", metavar="COLOR", default="yellow",
1073 help="color for bridged gaps (default: %default)")
1074 og.add_option("--unbridged-color", metavar="COLOR", default="orange",
1075 help="color for unbridged gaps (default: %default)")
1076 op.add_option_group(og)
1077 (opts, args) = op.parse_args()
1078 if len(args) != 2: op.error("2 arguments needed")
1080 opts.background_color = "white"
1081 opts.label_space = 5 # minimum number of pixels between axis labels
1083 try: lastDotplot(opts, args)
1084 except KeyboardInterrupt: pass # avoid silly error message
1085 except Exception as e:
1086 prog = os.path.basename(sys.argv[0])
1087 sys.exit(prog + ": error: " + str(e))