3 # Read pair-wise alignments in MAF or LAST tabular format: write an
4 # "Oxford grid", a.k.a. dotplot.
6 # TODO: Currently, pixels with zero aligned nt-pairs are white, and
7 # pixels with one or more aligned nt-pairs are black. This can look
8 # too crowded for large genome alignments. I tried shading each pixel
9 # according to the number of aligned nt-pairs within it, but the
10 # result is too faint. How can this be done better?
15 from fnmatch import fnmatchcase
16 from operator import itemgetter
18 import itertools, optparse, os, re, sys
20 # Try to make PIL/PILLOW work:
22 from PIL import Image, ImageDraw, ImageFont, ImageColor
24 import Image, ImageDraw, ImageFont, ImageColor
27 from future_builtins import zip
31 def myOpen(fileName): # faster than fileinput
36 if fileName.endswith(".gz"):
37 return gzip.open(fileName, "rt") # xxx dubious for Python2
42 prog = os.path.basename(sys.argv[0])
43 sys.stderr.write(prog + ": " + message + "\n")
45 def groupByFirstItem(things):
46 for k, v in itertools.groupby(things, itemgetter(0)):
47 yield k, [i[1:] for i in v]
49 def croppedBlocks(blocks, ranges1, ranges2):
50 headBeg1, headBeg2, headSize = blocks[0]
53 cropBeg1, cropEnd1 = r1
55 cropBeg1, cropEnd1 = -cropEnd1, -cropBeg1
56 cropBeg2, cropEnd2 = r2
58 cropBeg2, cropEnd2 = -cropEnd2, -cropBeg2
59 for beg1, beg2, size in blocks:
60 b1 = max(cropBeg1, beg1)
61 e1 = min(cropEnd1, beg1 + size)
64 b2 = max(cropBeg2, b1 + offset)
65 e2 = min(cropEnd2, e1 + offset)
67 yield b2 - offset, b2, e2 - b2
69 def tabBlocks(beg1, beg2, blocks):
70 '''Get the gapless blocks of an alignment, from LAST tabular format.'''
71 for i in blocks.split(","):
78 yield beg1, beg2, size
82 def mafBlocks(beg1, beg2, seq1, seq2):
83 '''Get the gapless blocks of an alignment, from MAF format.'''
85 for x, y in zip(seq1, seq2):
88 yield beg1, beg2, size
95 yield beg1, beg2, size
102 if size: yield beg1, beg2, size
104 def alignmentInput(lines):
105 '''Get alignments and sequence lengths, from MAF or tabular format.'''
109 if line[0].isdigit(): # tabular format
110 chr1, beg1, seqlen1 = w[1], int(w[2]), int(w[5])
111 if w[4] == "-": beg1 -= seqlen1
112 chr2, beg2, seqlen2 = w[6], int(w[7]), int(w[10])
113 if w[9] == "-": beg2 -= seqlen2
114 blocks = tabBlocks(beg1, beg2, w[11])
115 yield chr1, seqlen1, chr2, seqlen2, blocks
116 elif line[0] == "s": # MAF format
118 chr1, beg1, seqlen1, seq1 = w[1], int(w[2]), int(w[5]), w[6]
119 if w[4] == "-": beg1 -= seqlen1
122 chr2, beg2, seqlen2, seq2 = w[1], int(w[2]), int(w[5]), w[6]
123 if w[4] == "-": beg2 -= seqlen2
124 blocks = mafBlocks(beg1, beg2, seq1, seq2)
125 yield chr1, seqlen1, chr2, seqlen2, blocks
128 def seqRequestFromText(text):
130 pattern, interval = text.rsplit(":", 1)
132 beg, end = interval.rsplit("-", 1)
133 return pattern, int(beg), int(end) # beg may be negative
134 return text, 0, sys.maxsize
136 def rangesFromSeqName(seqRequests, name, seqLen):
138 base = name.split(".")[-1] # allow for names like hg19.chr7
139 for pat, beg, end in seqRequests:
140 if fnmatchcase(name, pat) or fnmatchcase(base, pat):
141 yield max(beg, 0), min(end, seqLen)
145 def updateSeqs(coverDict, seqRanges, seqName, ranges, coveredRange):
146 beg, end = coveredRange
148 coveredRange = -end, -beg
149 if seqName in coverDict:
150 coverDict[seqName].append(coveredRange)
152 coverDict[seqName] = [coveredRange]
153 for beg, end in ranges:
154 r = seqName, beg, end
157 def readAlignments(fileName, opts):
158 '''Get alignments and sequence limits, from MAF or tabular format.'''
159 seqRequests1 = [seqRequestFromText(i) for i in opts.seq1]
160 seqRequests2 = [seqRequestFromText(i) for i in opts.seq2]
167 lines = myOpen(fileName)
168 for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
169 ranges1 = sorted(rangesFromSeqName(seqRequests1, seqName1, seqLen1))
170 if not ranges1: continue
171 ranges2 = sorted(rangesFromSeqName(seqRequests2, seqName2, seqLen2))
172 if not ranges2: continue
173 b = list(croppedBlocks(list(blocks), ranges1, ranges2))
175 aln = seqName1, seqName2, b
176 alignments.append(aln)
177 coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
178 updateSeqs(coverDict1, seqRanges1, seqName1, ranges1, coveredRange1)
179 coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
180 updateSeqs(coverDict2, seqRanges2, seqName2, ranges2, coveredRange2)
181 return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
183 def nameAndRangesFromDict(cropDict, seqName):
184 if seqName in cropDict:
185 return seqName, cropDict[seqName]
186 n = seqName.split(".")[-1]
188 return n, cropDict[n]
191 def rangesForSecondaryAlignments(primaryRanges, seqLen):
196 def readSecondaryAlignments(opts, cropRanges1, cropRanges2):
197 cropDict1 = dict(groupByFirstItem(cropRanges1))
198 cropDict2 = dict(groupByFirstItem(cropRanges2))
205 lines = myOpen(opts.alignments)
206 for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
207 seqName1, ranges1 = nameAndRangesFromDict(cropDict1, seqName1)
208 seqName2, ranges2 = nameAndRangesFromDict(cropDict2, seqName2)
209 if not ranges1 and not ranges2:
211 r1 = rangesForSecondaryAlignments(ranges1, seqLen1)
212 r2 = rangesForSecondaryAlignments(ranges2, seqLen2)
213 b = list(croppedBlocks(list(blocks), r1, r2))
215 aln = seqName1, seqName2, b
216 alignments.append(aln)
218 coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
219 updateSeqs(coverDict1, seqRanges1, seqName1, r1, coveredRange1)
221 coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
222 updateSeqs(coverDict2, seqRanges2, seqName2, r2, coveredRange2)
223 return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
225 def twoValuesFromOption(text, separator):
226 if separator in text:
227 return text.split(separator)
230 def mergedRanges(ranges):
231 oldBeg, maxEnd = ranges[0]
232 for beg, end in ranges:
241 def mergedRangesPerSeq(coverDict):
242 for k, v in coverDict.items():
244 yield k, list(mergedRanges(v))
246 def coveredLength(mergedCoverDict):
247 return sum(sum(e - b for b, e in v) for v in mergedCoverDict.values())
249 def trimmed(seqRanges, coverDict, minAlignedBases, maxGapFrac, endPad, midPad):
250 maxEndGapFrac, maxMidGapFrac = twoValuesFromOption(maxGapFrac, ",")
251 maxEndGap = max(float(maxEndGapFrac) * minAlignedBases, endPad * 1.0)
252 maxMidGap = max(float(maxMidGapFrac) * minAlignedBases, midPad * 2.0)
254 for seqName, rangeBeg, rangeEnd in seqRanges:
255 seqBlocks = coverDict[seqName]
256 blocks = [i for i in seqBlocks if i[0] < rangeEnd and i[1] > rangeBeg]
257 if blocks[0][0] - rangeBeg > maxEndGap:
258 rangeBeg = blocks[0][0] - endPad
259 for j, y in enumerate(blocks):
262 if y[0] - x[1] > maxMidGap:
263 yield seqName, rangeBeg, x[1] + midPad
264 rangeBeg = y[0] - midPad
265 if rangeEnd - blocks[-1][1] > maxEndGap:
266 rangeEnd = blocks[-1][1] + endPad
267 yield seqName, rangeBeg, rangeEnd
269 def rangesWithStrandInfo(seqRanges, strandOpt, alignments, seqIndex):
271 forwardMinusReverse = collections.defaultdict(int)
274 beg1, beg2, size = blocks[0]
275 numOfAlignedLetterPairs = sum(i[2] for i in blocks)
276 if (beg1 < 0) != (beg2 < 0): # opposite-strand alignment
277 numOfAlignedLetterPairs *= -1
278 forwardMinusReverse[i[seqIndex]] += numOfAlignedLetterPairs
280 for seqName, beg, end in seqRanges:
282 strandNum = 1 if forwardMinusReverse[seqName] >= 0 else 2
283 yield seqName, beg, end, strandNum
285 def natural_sort_key(my_string):
286 '''Return a sort key for "natural" ordering, e.g. chr9 < chr10.'''
287 parts = re.split(r'(\d+)', my_string)
288 parts[1::2] = map(int, parts[1::2])
291 def nameKey(oneSeqRanges):
292 return natural_sort_key(oneSeqRanges[0][0])
294 def sizeKey(oneSeqRanges):
295 return sum(b - e for n, b, e, s in oneSeqRanges), nameKey(oneSeqRanges)
297 def alignmentKey(seqNamesToLists, oneSeqRanges):
298 seqName = oneSeqRanges[0][0]
299 alignmentsOfThisSequence = seqNamesToLists[seqName]
300 numOfAlignedLetterPairs = sum(i[3] for i in alignmentsOfThisSequence)
301 toMiddle = numOfAlignedLetterPairs // 2
302 for i in alignmentsOfThisSequence:
305 return i[1:3] # sequence-rank and "position" of this alignment
307 def rankAndFlipPerSeq(seqRanges):
308 rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
309 for rank, group in enumerate(rangesGroupedBySeqName):
310 seqName, ranges = group
311 strandNum = next(ranges)[3]
312 flip = 1 if strandNum < 2 else -1
313 yield seqName, (rank, flip)
315 def alignmentSortData(alignments, seqIndex, otherNamesToRanksAndFlips):
316 otherIndex = 1 - seqIndex
319 otherRank, otherFlip = otherNamesToRanksAndFlips[i[otherIndex]]
320 otherPos = otherFlip * abs(blocks[0][otherIndex] +
321 blocks[-1][otherIndex] + blocks[-1][2])
322 numOfAlignedLetterPairs = sum(i[2] for i in blocks)
323 yield i[seqIndex], otherRank, otherPos, numOfAlignedLetterPairs
325 def mySortedRanges(seqRanges, sortOpt, seqIndex, alignments, otherRanges):
326 rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
327 g = [list(ranges) for seqName, ranges in rangesGroupedBySeqName]
336 otherNamesToRanksAndFlips = dict(rankAndFlipPerSeq(otherRanges))
337 alns = sorted(alignmentSortData(alignments, seqIndex,
338 otherNamesToRanksAndFlips))
339 alnsGroupedBySeqName = itertools.groupby(alns, itemgetter(0))
340 seqNamesToLists = dict((k, list(v)) for k, v in alnsGroupedBySeqName)
341 g.sort(key=functools.partial(alignmentKey, seqNamesToLists))
342 return [j for i in g for j in i]
344 def allSortedRanges(opts, alignments, alignmentsB,
345 seqRanges1, seqRangesB1, seqRanges2, seqRangesB2):
346 o1, oB1 = twoValuesFromOption(opts.strands1, ":")
347 o2, oB2 = twoValuesFromOption(opts.strands2, ":")
348 if o1 == "1" and o2 == "1":
349 raise Exception("the strand options have circular dependency")
350 seqRanges1 = list(rangesWithStrandInfo(seqRanges1, o1, alignments, 0))
351 seqRanges2 = list(rangesWithStrandInfo(seqRanges2, o2, alignments, 1))
352 seqRangesB1 = list(rangesWithStrandInfo(seqRangesB1, oB1, alignmentsB, 0))
353 seqRangesB2 = list(rangesWithStrandInfo(seqRangesB2, oB2, alignmentsB, 1))
355 o1, oB1 = twoValuesFromOption(opts.sort1, ":")
356 o2, oB2 = twoValuesFromOption(opts.sort2, ":")
357 if o1 == "3" and o2 == "3":
358 raise Exception("the sort options have circular dependency")
360 s1 = mySortedRanges(seqRanges1, o1, None, None, None)
362 s2 = mySortedRanges(seqRanges2, o2, None, None, None)
364 s1 = mySortedRanges(seqRanges1, o1, 0, alignments, s2)
366 s2 = mySortedRanges(seqRanges2, o2, 1, alignments, s1)
367 t1 = mySortedRanges(seqRangesB1, oB1, 0, alignmentsB, s2)
368 t2 = mySortedRanges(seqRangesB2, oB2, 1, alignmentsB, s1)
369 return s1 + t1, s2 + t2
375 groups.append(t[-3:])
377 return ",".join(reversed(groups))
380 suffixes = "bp", "kb", "Mb", "Gb"
381 for i, x in enumerate(suffixes):
384 return "%.2g" % (1.0 * size / j) + x
385 if size < j * 1000 or i == len(suffixes) - 1:
386 return "%.0f" % (1.0 * size / j) + x
388 def labelText(seqRange, labelOpt):
389 seqName, beg, end, strandNum = seqRange
391 return seqName + ": " + sizeText(end - beg)
393 return seqName + ":" + prettyNum(beg) + ": " + sizeText(end - beg)
395 return seqName + ":" + prettyNum(beg) + "-" + prettyNum(end)
398 def rangeLabels(seqRanges, labelOpt, font, fontsize, image_mode, textRot):
401 im = Image.new(image_mode, image_size)
402 draw = ImageDraw.Draw(im)
405 text = labelText(r, labelOpt)
407 x, y = draw.textsize(text, font=font)
410 yield text, x, y, r[3]
412 def dataFromRanges(sortedRanges, font, fontSize, imageMode, labelOpt, textRot):
413 for seqName, rangeBeg, rangeEnd, strandNum in sortedRanges:
414 out = [seqName, str(rangeBeg), str(rangeEnd)]
416 out.append(".+-"[strandNum])
419 rangeSizes = [e - b for n, b, e, s in sortedRanges]
420 labs = list(rangeLabels(sortedRanges, labelOpt, font, fontSize,
422 margin = max(i[2] for i in labs)
423 # xxx the margin may be too big, because some labels may get omitted
424 return rangeSizes, labs, margin
427 '''Return x / y rounded up.'''
431 def get_bp_per_pix(rangeSizes, pixTweenRanges, maxPixels):
432 '''Get the minimum bp-per-pixel that fits in the size limit.'''
433 warn("choosing bp per pixel...")
434 numOfRanges = len(rangeSizes)
435 maxPixelsInRanges = maxPixels - pixTweenRanges * (numOfRanges - 1)
436 if maxPixelsInRanges < numOfRanges:
437 raise Exception("can't fit the image: too many sequences?")
438 negLimit = -maxPixelsInRanges
439 negBpPerPix = sum(rangeSizes) // negLimit
441 if sum(i // negBpPerPix for i in rangeSizes) >= negLimit:
445 def getRangePixBegs(rangePixLens, pixTweenRanges, margin):
446 '''Get the start pixel for each range.'''
448 pix_tot = margin - pixTweenRanges
449 for i in rangePixLens:
450 pix_tot += pixTweenRanges
451 rangePixBegs.append(pix_tot)
455 def pixelData(rangeSizes, bp_per_pix, pixTweenRanges, margin):
456 '''Return pixel information about the ranges.'''
457 rangePixLens = [div_ceil(i, bp_per_pix) for i in rangeSizes]
458 rangePixBegs = getRangePixBegs(rangePixLens, pixTweenRanges, margin)
459 tot_pix = rangePixBegs[-1] + rangePixLens[-1]
460 return rangePixBegs, rangePixLens, tot_pix
462 def drawLineForward(hits, width, bp_per_pix, beg1, beg2, size):
464 q1, r1 = divmod(beg1, bp_per_pix)
465 q2, r2 = divmod(beg2, bp_per_pix)
466 hits[q2 * width + q1] |= 1
467 next_pix = min(bp_per_pix - r1, bp_per_pix - r2)
468 if next_pix >= size: break
473 def drawLineReverse(hits, width, bp_per_pix, beg1, beg2, size):
475 q1, r1 = divmod(beg1, bp_per_pix)
476 q2, r2 = divmod(beg2, bp_per_pix)
477 hits[q2 * width + q1] |= 2
478 next_pix = min(bp_per_pix - r1, r2 + 1)
479 if next_pix >= size: break
484 def strandAndOrigin(ranges, beg, size):
485 isReverseStrand = (beg < 0)
488 for rangeBeg, rangeEnd, isReverseRange, origin in ranges:
489 if rangeEnd > beg: # assumes the ranges are sorted
490 return (isReverseStrand != isReverseRange), origin
492 def alignmentPixels(width, height, alignments, bp_per_pix,
493 rangeDict1, rangeDict2):
494 hits = [0] * (width * height) # the image data
495 for seq1, seq2, blocks in alignments:
496 beg1, beg2, size = blocks[0]
497 isReverse1, ori1 = strandAndOrigin(rangeDict1[seq1], beg1, size)
498 isReverse2, ori2 = strandAndOrigin(rangeDict2[seq2], beg2, size)
499 for beg1, beg2, size in blocks:
501 beg1 = -(beg1 + size)
502 beg2 = -(beg2 + size)
503 if isReverse1 == isReverse2:
504 drawLineForward(hits, width, bp_per_pix,
505 ori1 + beg1, ori2 + beg2, size)
507 drawLineReverse(hits, width, bp_per_pix,
508 ori1 + beg1, ori2 - beg2 - 1, size)
511 def expandedSeqDict(seqDict):
512 '''Allow lookup by short sequence names, e.g. chr7 as well as hg19.chr7.'''
513 newDict = seqDict.copy()
514 for name, x in seqDict.items():
516 base = name.split(".")[-1]
517 if base in newDict: # an ambiguous case was found:
518 return seqDict # so give up completely
522 def readBed(fileName, rangeDict):
523 for line in myOpen(fileName):
527 if seqName not in rangeDict: continue
536 if len(w) > 8 and w[8].count(",") == 2:
537 color = "rgb(" + w[8] + ")"
540 isRev = rangeDict[seqName][0][2]
541 if strand == "+" and not isRev or strand == "-" and isRev:
543 if strand == "-" and not isRev or strand == "+" and isRev:
545 yield layer, color, seqName, beg, end
547 def commaSeparatedInts(text):
548 return map(int, text.rstrip(",").split(","))
550 def readGenePred(opts, fileName, rangeDict):
551 for line in myOpen(fileName):
552 fields = line.split()
553 if not fields: continue
554 if fields[2] not in "+-": fields = fields[1:]
556 if seqName not in rangeDict: continue
558 cdsBeg = int(fields[5])
559 cdsEnd = int(fields[6])
560 exonBegs = commaSeparatedInts(fields[8])
561 exonEnds = commaSeparatedInts(fields[9])
562 for beg, end in zip(exonBegs, exonEnds):
563 yield 300, opts.exon_color, seqName, beg, end
566 if b < e: yield 400, opts.cds_color, seqName, b, e
568 def readRmsk(fileName, rangeDict):
569 for line in myOpen(fileName):
570 fields = line.split()
571 if len(fields) == 17: # rmsk.txt
573 if seqName not in rangeDict: continue # do this ASAP for speed
577 repeatClass = fields[11]
578 elif len(fields) == 15: # .out
580 if seqName not in rangeDict: continue
581 beg = int(fields[5]) - 1
584 repeatClass = fields[10]
587 if repeatClass in ("Low_complexity", "Simple_repeat"):
588 yield 200, "#fbf", seqName, beg, end
589 elif (strand == "+") != rangeDict[seqName][0][2]:
590 yield 100, "#ffe8e8", seqName, beg, end
592 yield 100, "#e8e8ff", seqName, beg, end
594 def isExtraFirstGapField(fields):
595 return fields[4].isdigit()
597 def readGaps(opts, fileName, rangeDict):
598 '''Read locations of unsequenced gaps, from an agp or gap file.'''
599 for line in myOpen(fileName):
601 if not w or w[0][0] == "#": continue
602 if isExtraFirstGapField(w): w = w[1:]
603 if w[4] not in "NU": continue
605 if seqName not in rangeDict: continue
607 beg = end - int(w[5]) # zero-based coordinate
609 yield 3000, opts.bridged_color, seqName, beg, end
611 yield 2000, opts.unbridged_color, seqName, beg, end
613 def bedBoxes(beds, rangeDict, margin, edge, isTop, bpPerPix):
614 for layer, color, seqName, bedBeg, bedEnd in beds:
615 for rangeBeg, rangeEnd, isReverseRange, origin in rangeDict[seqName]:
616 beg = max(bedBeg, rangeBeg)
617 end = min(bedEnd, rangeEnd)
618 if beg >= end: continue
620 beg, end = -end, -beg
622 # include partly-covered pixels
623 b = (origin + beg) // bpPerPix
624 e = div_ceil(origin + end, bpPerPix)
626 # exclude partly-covered pixels
627 b = div_ceil(origin + beg, bpPerPix)
628 e = (origin + end) // bpPerPix
630 if bedEnd >= rangeEnd: # include partly-covered end pixels
632 b = (origin + beg) // bpPerPix
634 e = div_ceil(origin + end, bpPerPix)
636 box = b, margin, e, edge
638 box = margin, b, edge, e
639 yield layer, color, box
641 def drawAnnotations(im, boxes):
642 # xxx use partial transparency for different-color overlaps?
643 for layer, color, box in boxes:
646 def placedLabels(labels, rangePixBegs, rangePixLens, beg, end):
647 '''Return axis labels with endpoint & sort-order information.'''
649 for i, j, k in zip(labels, rangePixBegs, rangePixLens):
650 text, textWidth, textHeight, strandNum = i
651 if textWidth > maxWidth:
653 labelBeg = j + (k - textWidth) // 2
654 labelEnd = labelBeg + textWidth
655 sortKey = textWidth - k
657 sortKey += maxWidth * (beg - labelBeg)
659 labelEnd = beg + textWidth
661 sortKey += maxWidth * (labelEnd - end)
663 labelBeg = end - textWidth
664 yield sortKey, labelBeg, labelEnd, text, textHeight, strandNum
666 def nonoverlappingLabels(labels, minPixTweenLabels):
667 '''Get a subset of non-overlapping axis labels, greedily.'''
670 beg = i[1] - minPixTweenLabels
671 end = i[2] + minPixTweenLabels
672 if all(j[2] <= beg or j[1] >= end for j in out):
676 def axisImage(labels, rangePixBegs, rangePixLens, textRot,
677 textAln, font, image_mode, opts):
678 '''Make an image of axis labels.'''
679 beg = rangePixBegs[0]
680 end = rangePixBegs[-1] + rangePixLens[-1]
681 margin = max(i[2] for i in labels)
682 labels = sorted(placedLabels(labels, rangePixBegs, rangePixLens, beg, end))
683 minPixTweenLabels = 0 if textRot else opts.label_space
684 labels = nonoverlappingLabels(labels, minPixTweenLabels)
685 image_size = (margin, end) if textRot else (end, margin)
686 im = Image.new(image_mode, image_size, opts.margin_color)
687 draw = ImageDraw.Draw(im)
688 for sortKey, labelBeg, labelEnd, text, textHeight, strandNum in labels:
689 base = margin - textHeight if textAln else 0
690 position = (base, labelBeg) if textRot else (labelBeg, base)
691 fill = ("black", opts.forwardcolor, opts.reversecolor)[strandNum]
692 draw.text(position, text, font=font, fill=fill)
695 def rangesWithOrigins(sortedRanges, rangePixBegs, rangePixLens, bpPerPix):
696 for i, j, k in zip(sortedRanges, rangePixBegs, rangePixLens):
697 seqName, rangeBeg, rangeEnd, strandNum = i
698 isReverseRange = (strandNum > 1)
700 origin = bpPerPix * (j + k) + rangeBeg
702 origin = bpPerPix * j - rangeBeg
703 yield seqName, (rangeBeg, rangeEnd, isReverseRange, origin)
705 def rangesPerSeq(sortedRanges, rangePixBegs, rangePixLens, bpPerPix):
706 a = rangesWithOrigins(sortedRanges, rangePixBegs, rangePixLens, bpPerPix)
707 for k, v in itertools.groupby(a, itemgetter(0)):
708 yield k, sorted(i[1] for i in v)
712 return ImageFont.truetype(opts.fontfile, opts.fontsize)
715 x = ["fc-match", "-f%{file}", "arial"]
716 p = subprocess.Popen(x, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
717 universal_newlines=True)
718 out, err = p.communicate()
719 fileNames.append(out)
721 warn("fc-match error: " + str(e))
722 fileNames.append("/Library/Fonts/Arial.ttf") # for Mac
725 font = ImageFont.truetype(i, opts.fontsize)
729 warn("font load error: " + str(e))
730 return ImageFont.load_default()
732 def lastDotplot(opts, args):
735 forward_color = ImageColor.getcolor(opts.forwardcolor, image_mode)
736 reverse_color = ImageColor.getcolor(opts.reversecolor, image_mode)
737 zipped_colors = zip(forward_color, reverse_color)
738 overlap_color = tuple([(i + j) // 2 for i, j in zipped_colors])
740 maxGap1, maxGapB1 = twoValuesFromOption(opts.max_gap1, ":")
741 maxGap2, maxGapB2 = twoValuesFromOption(opts.max_gap2, ":")
743 warn("reading alignments...")
744 alnData = readAlignments(args[0], opts)
745 alignments, seqRanges1, coverDict1, seqRanges2, coverDict2 = alnData
746 if not alignments: raise Exception("there are no alignments")
748 coverDict1 = dict(mergedRangesPerSeq(coverDict1))
749 coverDict2 = dict(mergedRangesPerSeq(coverDict2))
750 minAlignedBases = min(coveredLength(coverDict1), coveredLength(coverDict2))
751 pad = int(opts.pad * minAlignedBases)
752 cutRanges1 = list(trimmed(seqRanges1, coverDict1, minAlignedBases,
754 cutRanges2 = list(trimmed(seqRanges2, coverDict2, minAlignedBases,
757 warn("reading secondary alignments...")
758 alnDataB = readSecondaryAlignments(opts, cutRanges1, cutRanges2)
759 alignmentsB, seqRangesB1, coverDictB1, seqRangesB2, coverDictB2 = alnDataB
761 coverDictB1 = dict(mergedRangesPerSeq(coverDictB1))
762 coverDictB2 = dict(mergedRangesPerSeq(coverDictB2))
763 cutRangesB1 = trimmed(seqRangesB1, coverDictB1, minAlignedBases,
765 cutRangesB2 = trimmed(seqRangesB2, coverDictB2, minAlignedBases,
769 sortOut = allSortedRanges(opts, alignments, alignmentsB,
770 cutRanges1, cutRangesB1, cutRanges2, cutRangesB2)
771 sortedRanges1, sortedRanges2 = sortOut
773 textRot1 = "vertical".startswith(opts.rot1)
774 i1 = dataFromRanges(sortedRanges1, font,
775 opts.fontsize, image_mode, opts.labels1, textRot1)
776 rangeSizes1, labelData1, tMargin = i1
778 textRot2 = "horizontal".startswith(opts.rot2)
779 i2 = dataFromRanges(sortedRanges2, font,
780 opts.fontsize, image_mode, opts.labels2, textRot2)
781 rangeSizes2, labelData2, lMargin = i2
783 maxPixels1 = opts.width - lMargin
784 maxPixels2 = opts.height - tMargin
785 bpPerPix1 = get_bp_per_pix(rangeSizes1, opts.border_pixels, maxPixels1)
786 bpPerPix2 = get_bp_per_pix(rangeSizes2, opts.border_pixels, maxPixels2)
787 bpPerPix = max(bpPerPix1, bpPerPix2)
788 warn("bp per pixel = " + str(bpPerPix))
790 p1 = pixelData(rangeSizes1, bpPerPix, opts.border_pixels, lMargin)
791 rangePixBegs1, rangePixLens1, width = p1
792 rangeDict1 = dict(rangesPerSeq(sortedRanges1, rangePixBegs1,
793 rangePixLens1, bpPerPix))
795 p2 = pixelData(rangeSizes2, bpPerPix, opts.border_pixels, tMargin)
796 rangePixBegs2, rangePixLens2, height = p2
797 rangeDict2 = dict(rangesPerSeq(sortedRanges2, rangePixBegs2,
798 rangePixLens2, bpPerPix))
800 warn("width: " + str(width))
801 warn("height: " + str(height))
803 warn("processing alignments...")
804 hits = alignmentPixels(width, height, alignments + alignmentsB, bpPerPix,
805 rangeDict1, rangeDict2)
807 warn("reading annotations...")
809 rangeDict1 = expandedSeqDict(rangeDict1)
810 beds1 = itertools.chain(readBed(opts.bed1, rangeDict1),
811 readRmsk(opts.rmsk1, rangeDict1),
812 readGenePred(opts, opts.genePred1, rangeDict1),
813 readGaps(opts, opts.gap1, rangeDict1))
814 b1 = bedBoxes(beds1, rangeDict1, tMargin, height, True, bpPerPix)
816 rangeDict2 = expandedSeqDict(rangeDict2)
817 beds2 = itertools.chain(readBed(opts.bed2, rangeDict2),
818 readRmsk(opts.rmsk2, rangeDict2),
819 readGenePred(opts, opts.genePred2, rangeDict2),
820 readGaps(opts, opts.gap2, rangeDict2))
821 b2 = bedBoxes(beds2, rangeDict2, lMargin, width, False, bpPerPix)
823 boxes = sorted(itertools.chain(b1, b2))
827 image_size = width, height
828 im = Image.new(image_mode, image_size, opts.background_color)
830 drawAnnotations(im, boxes)
832 for i in range(height):
833 for j in range(width):
834 store_value = hits[i * width + j]
836 if store_value == 1: im.putpixel(xy, forward_color)
837 elif store_value == 2: im.putpixel(xy, reverse_color)
838 elif store_value == 3: im.putpixel(xy, overlap_color)
840 if opts.fontsize != 0:
841 axis1 = axisImage(labelData1, rangePixBegs1, rangePixLens1,
842 textRot1, False, font, image_mode, opts)
844 axis1 = axis1.transpose(Image.ROTATE_90)
845 axis2 = axisImage(labelData2, rangePixBegs2, rangePixLens2,
846 textRot2, textRot2, font, image_mode, opts)
848 axis2 = axis2.transpose(Image.ROTATE_270)
849 im.paste(axis1, (0, 0))
850 im.paste(axis2, (0, 0))
852 for i in rangePixBegs1[1:]:
853 box = i - opts.border_pixels, tMargin, i, height
854 im.paste(opts.border_color, box)
856 for i in rangePixBegs2[1:]:
857 box = lMargin, i - opts.border_pixels, width, i
858 im.paste(opts.border_color, box)
862 if __name__ == "__main__":
863 usage = """%prog --help
864 or: %prog [options] maf-or-tab-alignments dotplot.png
865 or: %prog [options] maf-or-tab-alignments dotplot.gif
867 description = "Draw a dotplot of pair-wise sequence alignments in MAF or tabular format."
868 op = optparse.OptionParser(usage=usage, description=description)
869 op.add_option("-v", "--verbose", action="count",
870 help="show progress messages & data about the plot")
871 op.add_option("-1", "--seq1", metavar="PATTERN", action="append",
873 help="which sequences to show from the 1st genome")
874 op.add_option("-2", "--seq2", metavar="PATTERN", action="append",
876 help="which sequences to show from the 2nd genome")
877 # Replace "width" & "height" with a single "length" option?
878 op.add_option("-x", "--width", type="int", default=1000,
879 help="maximum width in pixels (default: %default)")
880 op.add_option("-y", "--height", type="int", default=1000,
881 help="maximum height in pixels (default: %default)")
882 op.add_option("-c", "--forwardcolor", metavar="COLOR", default="red",
883 help="color for forward alignments (default: %default)")
884 op.add_option("-r", "--reversecolor", metavar="COLOR", default="blue",
885 help="color for reverse alignments (default: %default)")
886 op.add_option("--alignments", metavar="FILE", help="secondary alignments")
887 op.add_option("--sort1", default="1", metavar="N",
888 help="genome1 sequence order: 0=input order, 1=name order, "
889 "2=length order, 3=alignment order (default=%default)")
890 op.add_option("--sort2", default="1", metavar="N",
891 help="genome2 sequence order: 0=input order, 1=name order, "
892 "2=length order, 3=alignment order (default=%default)")
893 op.add_option("--strands1", default="0", metavar="N", help=
894 "genome1 sequence orientation: 0=forward orientation, "
895 "1=alignment orientation (default=%default)")
896 op.add_option("--strands2", default="0", metavar="N", help=
897 "genome2 sequence orientation: 0=forward orientation, "
898 "1=alignment orientation (default=%default)")
899 op.add_option("--max-gap1", metavar="FRAC", default="0.5,2", help=
900 "maximum unaligned (end,mid) gap in genome1: "
901 "fraction of aligned length (default=%default)")
902 op.add_option("--max-gap2", metavar="FRAC", default="0.5,2", help=
903 "maximum unaligned (end,mid) gap in genome2: "
904 "fraction of aligned length (default=%default)")
905 op.add_option("--pad", metavar="FRAC", type="float", default=0.04, help=
906 "pad length when cutting unaligned gaps: "
907 "fraction of aligned length (default=%default)")
908 op.add_option("--border-pixels", metavar="INT", type="int", default=1,
909 help="number of pixels between sequences (default=%default)")
910 op.add_option("--border-color", metavar="COLOR", default="black",
911 help="color for pixels between sequences (default=%default)")
912 # --break-color and/or --break-pixels for intra-sequence breaks?
913 op.add_option("--margin-color", metavar="COLOR", default="#dcdcdc",
916 og = optparse.OptionGroup(op, "Text options")
917 og.add_option("-f", "--fontfile", metavar="FILE",
918 help="TrueType or OpenType font file")
919 og.add_option("-s", "--fontsize", metavar="SIZE", type="int", default=14,
920 help="TrueType or OpenType font size (default: %default)")
921 og.add_option("--labels1", type="int", default=0, metavar="N", help=
922 "genome1 labels: 0=name, 1=name:length, "
923 "2=name:start:length, 3=name:start-end (default=%default)")
924 og.add_option("--labels2", type="int", default=0, metavar="N", help=
925 "genome2 labels: 0=name, 1=name:length, "
926 "2=name:start:length, 3=name:start-end (default=%default)")
927 og.add_option("--rot1", metavar="ROT", default="h",
928 help="text rotation for the 1st genome (default=%default)")
929 og.add_option("--rot2", metavar="ROT", default="v",
930 help="text rotation for the 2nd genome (default=%default)")
931 op.add_option_group(og)
933 og = optparse.OptionGroup(op, "Annotation options")
934 og.add_option("--bed1", metavar="FILE",
935 help="read genome1 annotations from BED file")
936 og.add_option("--bed2", metavar="FILE",
937 help="read genome2 annotations from BED file")
938 og.add_option("--rmsk1", metavar="FILE", help="read genome1 repeats from "
939 "RepeatMasker .out or rmsk.txt file")
940 og.add_option("--rmsk2", metavar="FILE", help="read genome2 repeats from "
941 "RepeatMasker .out or rmsk.txt file")
942 op.add_option_group(og)
944 og = optparse.OptionGroup(op, "Gene options")
945 og.add_option("--genePred1", metavar="FILE",
946 help="read genome1 genes from genePred file")
947 og.add_option("--genePred2", metavar="FILE",
948 help="read genome2 genes from genePred file")
949 og.add_option("--exon-color", metavar="COLOR", default="PaleGreen",
950 help="color for exons (default=%default)")
951 og.add_option("--cds-color", metavar="COLOR", default="LimeGreen",
952 help="color for protein-coding regions (default=%default)")
953 op.add_option_group(og)
955 og = optparse.OptionGroup(op, "Unsequenced gap options")
956 og.add_option("--gap1", metavar="FILE",
957 help="read genome1 unsequenced gaps from agp or gap file")
958 og.add_option("--gap2", metavar="FILE",
959 help="read genome2 unsequenced gaps from agp or gap file")
960 og.add_option("--bridged-color", metavar="COLOR", default="yellow",
961 help="color for bridged gaps (default: %default)")
962 og.add_option("--unbridged-color", metavar="COLOR", default="orange",
963 help="color for unbridged gaps (default: %default)")
964 op.add_option_group(og)
965 (opts, args) = op.parse_args()
966 if len(args) != 2: op.error("2 arguments needed")
968 opts.background_color = "white"
969 opts.label_space = 5 # minimum number of pixels between axis labels
971 try: lastDotplot(opts, args)
972 except KeyboardInterrupt: pass # avoid silly error message
973 except Exception as e:
974 prog = os.path.basename(sys.argv[0])
975 sys.exit(prog + ": error: " + str(e))