3 # Read pair-wise alignments in MAF or LAST tabular format: write an
4 # "Oxford grid", a.k.a. dotplot.
6 # TODO: Currently, pixels with zero aligned nt-pairs are white, and
7 # pixels with one or more aligned nt-pairs are black. This can look
8 # too crowded for large genome alignments. I tried shading each pixel
9 # according to the number of aligned nt-pairs within it, but the
10 # result is too faint. How can this be done better?
14 from fnmatch import fnmatchcase
15 from operator import itemgetter
17 import itertools, optparse, os, re, sys
19 # Try to make PIL/PILLOW work:
20 try: from PIL import Image, ImageDraw, ImageFont, ImageColor
21 except ImportError: import Image, ImageDraw, ImageFont, ImageColor
23 def myOpen(fileName): # faster than fileinput
28 if fileName.endswith(".gz"):
29 return gzip.open(fileName)
34 prog = os.path.basename(sys.argv[0])
35 sys.stderr.write(prog + ": " + message + "\n")
37 def groupByFirstItem(things):
38 for k, v in itertools.groupby(things, itemgetter(0)):
39 yield k, [i[1:] for i in v]
41 def croppedBlocks(blocks, ranges1, ranges2):
42 headBeg1, headBeg2, headSize = blocks[0]
45 cropBeg1, cropEnd1 = r1
47 cropBeg1, cropEnd1 = -cropEnd1, -cropBeg1
48 cropBeg2, cropEnd2 = r2
50 cropBeg2, cropEnd2 = -cropEnd2, -cropBeg2
51 for beg1, beg2, size in blocks:
52 b1 = max(cropBeg1, beg1)
53 e1 = min(cropEnd1, beg1 + size)
56 b2 = max(cropBeg2, b1 + offset)
57 e2 = min(cropEnd2, e1 + offset)
59 yield b2 - offset, b2, e2 - b2
61 def tabBlocks(beg1, beg2, blocks):
62 '''Get the gapless blocks of an alignment, from LAST tabular format.'''
63 for i in blocks.split(","):
70 yield beg1, beg2, size
74 def mafBlocks(beg1, beg2, seq1, seq2):
75 '''Get the gapless blocks of an alignment, from MAF format.'''
77 for x, y in itertools.izip(seq1, seq2):
80 yield beg1, beg2, size
87 yield beg1, beg2, size
94 if size: yield beg1, beg2, size
96 def alignmentInput(lines):
97 '''Get alignments and sequence lengths, from MAF or tabular format.'''
101 if line[0].isdigit(): # tabular format
102 chr1, beg1, seqlen1 = w[1], int(w[2]), int(w[5])
103 if w[4] == "-": beg1 -= seqlen1
104 chr2, beg2, seqlen2 = w[6], int(w[7]), int(w[10])
105 if w[9] == "-": beg2 -= seqlen2
106 blocks = tabBlocks(beg1, beg2, w[11])
107 yield chr1, seqlen1, chr2, seqlen2, blocks
108 elif line[0] == "s": # MAF format
110 chr1, beg1, seqlen1, seq1 = w[1], int(w[2]), int(w[5]), w[6]
111 if w[4] == "-": beg1 -= seqlen1
114 chr2, beg2, seqlen2, seq2 = w[1], int(w[2]), int(w[5]), w[6]
115 if w[4] == "-": beg2 -= seqlen2
116 blocks = mafBlocks(beg1, beg2, seq1, seq2)
117 yield chr1, seqlen1, chr2, seqlen2, blocks
120 def seqRequestFromText(text):
122 pattern, interval = text.rsplit(":", 1)
124 beg, end = interval.rsplit("-", 1)
125 return pattern, int(beg), int(end) # beg may be negative
126 return text, 0, sys.maxsize
128 def rangesFromSeqName(seqRequests, name, seqLen):
130 base = name.split(".")[-1] # allow for names like hg19.chr7
131 for pat, beg, end in seqRequests:
132 if fnmatchcase(name, pat) or fnmatchcase(base, pat):
133 yield max(beg, 0), min(end, seqLen)
137 def updateSeqs(coverDict, seqRanges, seqName, ranges, coveredRange):
138 beg, end = coveredRange
140 coveredRange = -end, -beg
141 if seqName in coverDict:
142 coverDict[seqName].append(coveredRange)
144 coverDict[seqName] = [coveredRange]
145 for beg, end in ranges:
146 r = seqName, beg, end
149 def readAlignments(fileName, opts):
150 '''Get alignments and sequence limits, from MAF or tabular format.'''
151 seqRequests1 = map(seqRequestFromText, opts.seq1)
152 seqRequests2 = map(seqRequestFromText, opts.seq2)
159 lines = myOpen(fileName)
160 for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
161 ranges1 = sorted(rangesFromSeqName(seqRequests1, seqName1, seqLen1))
162 if not ranges1: continue
163 ranges2 = sorted(rangesFromSeqName(seqRequests2, seqName2, seqLen2))
164 if not ranges2: continue
165 b = list(croppedBlocks(list(blocks), ranges1, ranges2))
167 aln = seqName1, seqName2, b
168 alignments.append(aln)
169 coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
170 updateSeqs(coverDict1, seqRanges1, seqName1, ranges1, coveredRange1)
171 coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
172 updateSeqs(coverDict2, seqRanges2, seqName2, ranges2, coveredRange2)
173 return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
175 def nameAndRangesFromDict(cropDict, seqName):
176 if seqName in cropDict:
177 return seqName, cropDict[seqName]
178 n = seqName.split(".")[-1]
180 return n, cropDict[n]
183 def rangesForSecondaryAlignments(primaryRanges, seqLen):
188 def readSecondaryAlignments(opts, cropRanges1, cropRanges2):
189 cropDict1 = dict(groupByFirstItem(cropRanges1))
190 cropDict2 = dict(groupByFirstItem(cropRanges2))
197 lines = myOpen(opts.alignments)
198 for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
199 seqName1, ranges1 = nameAndRangesFromDict(cropDict1, seqName1)
200 seqName2, ranges2 = nameAndRangesFromDict(cropDict2, seqName2)
201 if not ranges1 and not ranges2:
203 r1 = rangesForSecondaryAlignments(ranges1, seqLen1)
204 r2 = rangesForSecondaryAlignments(ranges2, seqLen2)
205 b = list(croppedBlocks(list(blocks), r1, r2))
207 aln = seqName1, seqName2, b
208 alignments.append(aln)
210 coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
211 updateSeqs(coverDict1, seqRanges1, seqName1, r1, coveredRange1)
213 coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
214 updateSeqs(coverDict2, seqRanges2, seqName2, r2, coveredRange2)
215 return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
217 def twoValuesFromOption(text, separator):
218 if separator in text:
219 return text.split(separator)
222 def mergedRanges(ranges):
223 oldBeg, maxEnd = ranges[0]
224 for beg, end in ranges:
233 def mergedRangesPerSeq(coverDict):
234 for k, v in coverDict.iteritems():
236 yield k, list(mergedRanges(v))
238 def coveredLength(mergedCoverDict):
239 return sum(sum(e - b for b, e in v) for v in mergedCoverDict.itervalues())
241 def trimmed(seqRanges, coverDict, minAlignedBases, maxGapFrac, endPad, midPad):
242 maxEndGapFrac, maxMidGapFrac = twoValuesFromOption(maxGapFrac, ",")
243 maxEndGap = max(float(maxEndGapFrac) * minAlignedBases, endPad * 1.0)
244 maxMidGap = max(float(maxMidGapFrac) * minAlignedBases, midPad * 2.0)
246 for seqName, rangeBeg, rangeEnd in seqRanges:
247 seqBlocks = coverDict[seqName]
248 blocks = [i for i in seqBlocks if i[0] < rangeEnd and i[1] > rangeBeg]
249 if blocks[0][0] - rangeBeg > maxEndGap:
250 rangeBeg = blocks[0][0] - endPad
251 for j, y in enumerate(blocks):
254 if y[0] - x[1] > maxMidGap:
255 yield seqName, rangeBeg, x[1] + midPad
256 rangeBeg = y[0] - midPad
257 if rangeEnd - blocks[-1][1] > maxEndGap:
258 rangeEnd = blocks[-1][1] + endPad
259 yield seqName, rangeBeg, rangeEnd
261 def natural_sort_key(my_string):
262 '''Return a sort key for "natural" ordering, e.g. chr9 < chr10.'''
263 parts = re.split(r'(\d+)', my_string)
264 parts[1::2] = map(int, parts[1::2])
267 def nameKey(oneSeqRanges):
268 return natural_sort_key(oneSeqRanges[0][0])
270 def sizeKey(oneSeqRanges):
271 return sum(b - e for n, b, e in oneSeqRanges), nameKey(oneSeqRanges)
273 def alignmentKey(seqNamesToLists, oneSeqRanges):
274 seqName = oneSeqRanges[0][0]
275 alignmentsOfThisSequence = seqNamesToLists[seqName]
276 numOfAlignedLetterPairs = sum(i[3] for i in alignmentsOfThisSequence)
277 toMiddle = numOfAlignedLetterPairs // 2
278 for i in alignmentsOfThisSequence:
281 return i[1:3] # sequence-rank and "position" of this alignment
283 def alignmentSortData1(alignments, seqNamesToRanks2):
284 for seqName1, seqName2, blocks in alignments:
285 seqRank2 = seqNamesToRanks2[seqName2]
286 pos2 = abs(blocks[0][1] + blocks[-1][1] + blocks[-1][2])
287 numOfAlignedLetterPairs = sum(i[2] for i in blocks)
288 yield seqName1, seqRank2, pos2, numOfAlignedLetterPairs
290 def alignmentSortData2(alignments, seqNamesToRanks1):
291 for seqName1, seqName2, blocks in alignments:
292 seqRank1 = seqNamesToRanks1[seqName1]
293 pos1 = abs(blocks[0][0] + blocks[-1][0] + blocks[-1][2])
294 numOfAlignedLetterPairs = sum(i[2] for i in blocks)
295 yield seqName2, seqRank1, pos1, numOfAlignedLetterPairs
297 def mySortedRanges(seqRanges, sortOpt,
298 alignmentSortDataFunc, alignments, otherRanges):
299 rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
300 g = [list(ranges) for seqName, ranges in rangesGroupedBySeqName]
306 otherNameGroups = itertools.groupby(i[0] for i in otherRanges)
307 ranksAndNames = enumerate(i[0] for i in otherNameGroups)
308 otherNamesToRanks = dict((n, r) for r, n in ranksAndNames)
309 alns = sorted(alignmentSortDataFunc(alignments, otherNamesToRanks))
310 alnsGroupedBySeqName = itertools.groupby(alns, itemgetter(0))
311 seqNamesToLists = dict((k, list(v)) for k, v in alnsGroupedBySeqName)
312 g.sort(key=functools.partial(alignmentKey, seqNamesToLists))
313 return [j for i in g for j in i]
315 def allSortedRanges(opts, alignments, alignmentsB,
316 seqRanges1, seqRangesB1, seqRanges2, seqRangesB2):
317 o1, oB1 = twoValuesFromOption(opts.sort1, ":")
318 o2, oB2 = twoValuesFromOption(opts.sort2, ":")
319 if o1 == "3" and o2 == "3":
320 raise Exception("the sort options have circular dependency")
322 s1 = mySortedRanges(seqRanges1, o1, None, None, None)
324 s2 = mySortedRanges(seqRanges2, o2, None, None, None)
326 s1 = mySortedRanges(seqRanges1, o1, alignmentSortData1, alignments, s2)
328 s2 = mySortedRanges(seqRanges2, o2, alignmentSortData2, alignments, s1)
329 t1 = mySortedRanges(seqRangesB1, oB1, alignmentSortData1, alignmentsB, s2)
330 t2 = mySortedRanges(seqRangesB2, oB2, alignmentSortData2, alignmentsB, s1)
331 return s1 + t1, s2 + t2
337 groups.append(t[-3:])
339 return ",".join(reversed(groups))
342 suffixes = "bp", "kb", "Mb", "Gb"
343 for i, x in enumerate(suffixes):
346 return "%.2g" % (1.0 * size / j) + x
347 if size < j * 1000 or i == len(suffixes) - 1:
348 return "%.0f" % (1.0 * size / j) + x
350 def labelText(seqRange, labelOpt):
351 seqName, beg, end = seqRange
353 return seqName + ": " + sizeText(end - beg)
355 return seqName + ":" + prettyNum(beg) + ": " + sizeText(end - beg)
357 return seqName + ":" + prettyNum(beg) + "-" + prettyNum(end)
360 def rangeLabels(seqRanges, labelOpt, font, fontsize, image_mode, textRot):
363 im = Image.new(image_mode, image_size)
364 draw = ImageDraw.Draw(im)
367 text = labelText(r, labelOpt)
369 x, y = draw.textsize(text, font=font)
374 def dataFromRanges(sortedRanges, font, fontSize, imageMode, labelOpt, textRot):
375 for i in sortedRanges:
376 warn("\t".join(map(str, i)))
378 rangeSizes = [e - b for n, b, e in sortedRanges]
379 labs = list(rangeLabels(sortedRanges, labelOpt, font, fontSize,
381 margin = max(i[2] for i in labs)
382 # xxx the margin may be too big, because some labels may get omitted
383 return rangeSizes, labs, margin
386 '''Return x / y rounded up.'''
390 def get_bp_per_pix(rangeSizes, pixTweenRanges, maxPixels):
391 '''Get the minimum bp-per-pixel that fits in the size limit.'''
392 warn("choosing bp per pixel...")
393 numOfRanges = len(rangeSizes)
394 maxPixelsInRanges = maxPixels - pixTweenRanges * (numOfRanges - 1)
395 if maxPixelsInRanges < numOfRanges:
396 raise Exception("can't fit the image: too many sequences?")
397 negLimit = -maxPixelsInRanges
398 negBpPerPix = sum(rangeSizes) // negLimit
400 if sum(i // negBpPerPix for i in rangeSizes) >= negLimit:
404 def getRangePixBegs(rangePixLens, pixTweenRanges, margin):
405 '''Get the start pixel for each range.'''
407 pix_tot = margin - pixTweenRanges
408 for i in rangePixLens:
409 pix_tot += pixTweenRanges
410 rangePixBegs.append(pix_tot)
414 def pixelData(rangeSizes, bp_per_pix, pixTweenRanges, margin):
415 '''Return pixel information about the ranges.'''
416 rangePixLens = [div_ceil(i, bp_per_pix) for i in rangeSizes]
417 rangePixBegs = getRangePixBegs(rangePixLens, pixTweenRanges, margin)
418 tot_pix = rangePixBegs[-1] + rangePixLens[-1]
419 return rangePixBegs, rangePixLens, tot_pix
421 def drawLineForward(hits, width, bp_per_pix, beg1, beg2, size):
423 q1, r1 = divmod(beg1, bp_per_pix)
424 q2, r2 = divmod(beg2, bp_per_pix)
425 hits[q2 * width + q1] |= 1
426 next_pix = min(bp_per_pix - r1, bp_per_pix - r2)
427 if next_pix >= size: break
432 def drawLineReverse(hits, width, bp_per_pix, beg1, beg2, size):
435 q1, r1 = divmod(beg1, bp_per_pix)
436 q2, r2 = divmod(beg2, bp_per_pix)
437 hits[q2 * width + q1] |= 2
438 next_pix = min(bp_per_pix - r1, r2 + 1)
439 if next_pix >= size: break
444 def findOrigin(ranges, beg, size):
447 for rangeBeg, rangeEnd, origin in ranges:
451 def alignmentPixels(width, height, alignments, bp_per_pix,
452 rangeDict1, rangeDict2):
453 hits = [0] * (width * height) # the image data
454 for seq1, seq2, blocks in alignments:
455 beg1, beg2, size = blocks[0]
456 ori1 = findOrigin(rangeDict1[seq1], beg1, size)
457 ori2 = findOrigin(rangeDict2[seq2], beg2, size)
458 for beg1, beg2, size in blocks:
460 beg1 = -(beg1 + size)
461 beg2 = -(beg2 + size)
463 drawLineForward(hits, width, bp_per_pix,
464 beg1 + ori1, beg2 + ori2, size)
466 drawLineReverse(hits, width, bp_per_pix,
467 beg1 + ori1, beg2 - ori2, size)
470 def expandedSeqDict(seqDict):
471 '''Allow lookup by short sequence names, e.g. chr7 as well as hg19.chr7.'''
472 newDict = seqDict.copy()
473 for name, x in seqDict.items():
475 base = name.split(".")[-1]
476 if base in newDict: # an ambiguous case was found:
477 return seqDict # so give up completely
481 def readBed(fileName, rangeDict):
482 for line in myOpen(fileName):
486 if seqName not in rangeDict: continue
495 if len(w) > 8 and w[8].count(",") == 2:
496 color = "rgb(" + w[8] + ")"
501 yield layer, color, seqName, beg, end
503 def commaSeparatedInts(text):
504 return map(int, text.rstrip(",").split(","))
506 def readGenePred(opts, fileName, rangeDict):
507 for line in myOpen(fileName):
508 fields = line.split()
509 if not fields: continue
510 if fields[2] not in "+-": fields = fields[1:]
512 if seqName not in rangeDict: continue
514 cdsBeg = int(fields[5])
515 cdsEnd = int(fields[6])
516 exonBegs = commaSeparatedInts(fields[8])
517 exonEnds = commaSeparatedInts(fields[9])
518 for beg, end in zip(exonBegs, exonEnds):
519 yield 300, opts.exon_color, seqName, beg, end
522 if b < e: yield 400, opts.cds_color, seqName, b, e
524 def readRmsk(fileName, rangeDict):
525 for line in myOpen(fileName):
526 fields = line.split()
527 if len(fields) == 17: # rmsk.txt
529 if seqName not in rangeDict: continue # do this ASAP for speed
533 repeatClass = fields[11]
534 elif len(fields) == 15: # .out
536 if seqName not in rangeDict: continue
537 beg = int(fields[5]) - 1
540 repeatClass = fields[10]
543 if repeatClass in ("Low_complexity", "Simple_repeat"):
544 yield 200, "#fbf", seqName, beg, end
546 yield 100, "#ffe8e8", seqName, beg, end
548 yield 100, "#e8e8ff", seqName, beg, end
550 def isExtraFirstGapField(fields):
551 return fields[4].isdigit()
553 def readGaps(opts, fileName, rangeDict):
554 '''Read locations of unsequenced gaps, from an agp or gap file.'''
555 for line in myOpen(fileName):
557 if not w or w[0][0] == "#": continue
558 if isExtraFirstGapField(w): w = w[1:]
559 if w[4] not in "NU": continue
561 if seqName not in rangeDict: continue
563 beg = end - int(w[5]) # zero-based coordinate
565 yield 3000, opts.bridged_color, seqName, beg, end
567 yield 2000, opts.unbridged_color, seqName, beg, end
569 def bedBoxes(beds, rangeDict, margin, edge, isTop, bpPerPix):
570 for layer, color, seqName, bedBeg, bedEnd in beds:
571 for rangeBeg, rangeEnd, origin in rangeDict[seqName]:
572 beg = max(bedBeg, rangeBeg)
573 end = min(bedEnd, rangeEnd)
574 if beg >= end: continue
576 # include partly-covered pixels
577 b = (origin + beg) // bpPerPix
578 e = div_ceil(origin + end, bpPerPix)
580 # exclude partly-covered pixels
581 b = div_ceil(origin + beg, bpPerPix)
582 e = (origin + end) // bpPerPix
584 if end == rangeEnd: # include partly-covered end pixels
585 e = div_ceil(origin + end, bpPerPix)
587 box = b, margin, e, edge
589 box = margin, b, edge, e
590 yield layer, color, box
592 def drawAnnotations(im, boxes):
593 # xxx use partial transparency for different-color overlaps?
594 for layer, color, box in boxes:
597 def placedLabels(labels, rangePixBegs, rangePixLens, beg, end):
598 '''Return axis labels with endpoint & sort-order information.'''
600 for i, j, k in zip(labels, rangePixBegs, rangePixLens):
601 text, textWidth, textHeight = i
602 if textWidth > maxWidth:
604 labelBeg = j + (k - textWidth) // 2
605 labelEnd = labelBeg + textWidth
606 sortKey = textWidth - k
608 sortKey += maxWidth * (beg - labelBeg)
610 labelEnd = beg + textWidth
612 sortKey += maxWidth * (labelEnd - end)
614 labelBeg = end - textWidth
615 yield sortKey, labelBeg, labelEnd, text, textHeight
617 def nonoverlappingLabels(labels, minPixTweenLabels):
618 '''Get a subset of non-overlapping axis labels, greedily.'''
621 beg = i[1] - minPixTweenLabels
622 end = i[2] + minPixTweenLabels
623 if all(j[2] <= beg or j[1] >= end for j in out):
627 def axisImage(labels, rangePixBegs, rangePixLens, textRot,
628 textAln, font, image_mode, opts):
629 '''Make an image of axis labels.'''
630 beg = rangePixBegs[0]
631 end = rangePixBegs[-1] + rangePixLens[-1]
632 margin = max(i[2] for i in labels)
633 labels = sorted(placedLabels(labels, rangePixBegs, rangePixLens, beg, end))
634 minPixTweenLabels = 0 if textRot else opts.label_space
635 labels = nonoverlappingLabels(labels, minPixTweenLabels)
636 image_size = (margin, end) if textRot else (end, margin)
637 im = Image.new(image_mode, image_size, opts.margin_color)
638 draw = ImageDraw.Draw(im)
640 base = margin - i[4] if textAln else 0
641 position = (base, i[1]) if textRot else (i[1], base)
642 draw.text(position, i[3], font=font, fill=opts.text_color)
645 def rangesWithOrigins(sortedRanges, rangePixBegs, bpPerPix):
646 for i, j in zip(sortedRanges, rangePixBegs):
647 seqName, rangeBeg, rangeEnd = i
648 origin = bpPerPix * j - rangeBeg
649 yield seqName, (rangeBeg, rangeEnd, origin)
651 def rangesPerSeq(sortedRanges, rangePixBegs, bpPerPix):
652 a = rangesWithOrigins(sortedRanges, rangePixBegs, bpPerPix)
653 for k, v in itertools.groupby(a, itemgetter(0)):
654 yield k, [i[1] for i in v]
658 return ImageFont.truetype(opts.fontfile, opts.fontsize)
661 x = ["fc-match", "-f%{file}", "arial"]
662 p = subprocess.Popen(x, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
663 out, err = p.communicate()
664 fileNames.append(out)
666 warn("fc-match error: " + str(e))
667 fileNames.append("/Library/Fonts/Arial.ttf") # for Mac
670 font = ImageFont.truetype(i, opts.fontsize)
674 warn("font load error: " + str(e))
675 return ImageFont.load_default()
677 def lastDotplot(opts, args):
680 forward_color = ImageColor.getcolor(opts.forwardcolor, image_mode)
681 reverse_color = ImageColor.getcolor(opts.reversecolor, image_mode)
682 zipped_colors = zip(forward_color, reverse_color)
683 overlap_color = tuple([(i + j) // 2 for i, j in zipped_colors])
685 maxGap1, maxGapB1 = twoValuesFromOption(opts.max_gap1, ":")
686 maxGap2, maxGapB2 = twoValuesFromOption(opts.max_gap2, ":")
688 warn("reading alignments...")
689 alnData = readAlignments(args[0], opts)
690 alignments, seqRanges1, coverDict1, seqRanges2, coverDict2 = alnData
691 if not alignments: raise Exception("there are no alignments")
693 coverDict1 = dict(mergedRangesPerSeq(coverDict1))
694 coverDict2 = dict(mergedRangesPerSeq(coverDict2))
695 minAlignedBases = min(coveredLength(coverDict1), coveredLength(coverDict2))
696 pad = int(opts.pad * minAlignedBases)
697 cutRanges1 = list(trimmed(seqRanges1, coverDict1, minAlignedBases,
699 cutRanges2 = list(trimmed(seqRanges2, coverDict2, minAlignedBases,
702 warn("reading secondary alignments...")
703 alnDataB = readSecondaryAlignments(opts, cutRanges1, cutRanges2)
704 alignmentsB, seqRangesB1, coverDictB1, seqRangesB2, coverDictB2 = alnDataB
706 coverDictB1 = dict(mergedRangesPerSeq(coverDictB1))
707 coverDictB2 = dict(mergedRangesPerSeq(coverDictB2))
708 cutRangesB1 = trimmed(seqRangesB1, coverDictB1, minAlignedBases,
710 cutRangesB2 = trimmed(seqRangesB2, coverDictB2, minAlignedBases,
714 sortOut = allSortedRanges(opts, alignments, alignmentsB,
715 cutRanges1, cutRangesB1, cutRanges2, cutRangesB2)
716 sortedRanges1, sortedRanges2 = sortOut
718 textRot1 = "vertical".startswith(opts.rot1)
719 i1 = dataFromRanges(sortedRanges1, font,
720 opts.fontsize, image_mode, opts.labels1, textRot1)
721 rangeSizes1, labelData1, tMargin = i1
723 textRot2 = "horizontal".startswith(opts.rot2)
724 i2 = dataFromRanges(sortedRanges2, font,
725 opts.fontsize, image_mode, opts.labels2, textRot2)
726 rangeSizes2, labelData2, lMargin = i2
728 maxPixels1 = opts.width - lMargin
729 maxPixels2 = opts.height - tMargin
730 bpPerPix1 = get_bp_per_pix(rangeSizes1, opts.border_pixels, maxPixels1)
731 bpPerPix2 = get_bp_per_pix(rangeSizes2, opts.border_pixels, maxPixels2)
732 bpPerPix = max(bpPerPix1, bpPerPix2)
733 warn("bp per pixel = " + str(bpPerPix))
735 p1 = pixelData(rangeSizes1, bpPerPix, opts.border_pixels, lMargin)
736 rangePixBegs1, rangePixLens1, width = p1
737 rangeDict1 = dict(rangesPerSeq(sortedRanges1, rangePixBegs1, bpPerPix))
739 p2 = pixelData(rangeSizes2, bpPerPix, opts.border_pixels, tMargin)
740 rangePixBegs2, rangePixLens2, height = p2
741 rangeDict2 = dict(rangesPerSeq(sortedRanges2, rangePixBegs2, bpPerPix))
743 warn("width: " + str(width))
744 warn("height: " + str(height))
746 warn("processing alignments...")
747 hits = alignmentPixels(width, height, alignments + alignmentsB, bpPerPix,
748 rangeDict1, rangeDict2)
750 warn("reading annotations...")
752 rangeDict1 = expandedSeqDict(rangeDict1)
753 beds1 = itertools.chain(readBed(opts.bed1, rangeDict1),
754 readRmsk(opts.rmsk1, rangeDict1),
755 readGenePred(opts, opts.genePred1, rangeDict1),
756 readGaps(opts, opts.gap1, rangeDict1))
757 b1 = bedBoxes(beds1, rangeDict1, tMargin, height, True, bpPerPix)
759 rangeDict2 = expandedSeqDict(rangeDict2)
760 beds2 = itertools.chain(readBed(opts.bed2, rangeDict2),
761 readRmsk(opts.rmsk2, rangeDict2),
762 readGenePred(opts, opts.genePred2, rangeDict2),
763 readGaps(opts, opts.gap2, rangeDict2))
764 b2 = bedBoxes(beds2, rangeDict2, lMargin, width, False, bpPerPix)
766 boxes = sorted(itertools.chain(b1, b2))
770 image_size = width, height
771 im = Image.new(image_mode, image_size, opts.background_color)
773 drawAnnotations(im, boxes)
775 for i in range(height):
776 for j in range(width):
777 store_value = hits[i * width + j]
779 if store_value == 1: im.putpixel(xy, forward_color)
780 elif store_value == 2: im.putpixel(xy, reverse_color)
781 elif store_value == 3: im.putpixel(xy, overlap_color)
783 if opts.fontsize != 0:
784 axis1 = axisImage(labelData1, rangePixBegs1, rangePixLens1,
785 textRot1, False, font, image_mode, opts)
787 axis1 = axis1.transpose(Image.ROTATE_90)
788 axis2 = axisImage(labelData2, rangePixBegs2, rangePixLens2,
789 textRot2, textRot2, font, image_mode, opts)
791 axis2 = axis2.transpose(Image.ROTATE_270)
792 im.paste(axis1, (0, 0))
793 im.paste(axis2, (0, 0))
795 for i in rangePixBegs1[1:]:
796 box = i - opts.border_pixels, tMargin, i, height
797 im.paste(opts.border_color, box)
799 for i in rangePixBegs2[1:]:
800 box = lMargin, i - opts.border_pixels, width, i
801 im.paste(opts.border_color, box)
805 if __name__ == "__main__":
806 usage = """%prog --help
807 or: %prog [options] maf-or-tab-alignments dotplot.png
808 or: %prog [options] maf-or-tab-alignments dotplot.gif
810 description = "Draw a dotplot of pair-wise sequence alignments in MAF or tabular format."
811 op = optparse.OptionParser(usage=usage, description=description)
812 op.add_option("-v", "--verbose", action="count",
813 help="show progress messages & data about the plot")
814 op.add_option("-1", "--seq1", metavar="PATTERN", action="append",
816 help="which sequences to show from the 1st genome")
817 op.add_option("-2", "--seq2", metavar="PATTERN", action="append",
819 help="which sequences to show from the 2nd genome")
820 # Replace "width" & "height" with a single "length" option?
821 op.add_option("-x", "--width", type="int", default=1000,
822 help="maximum width in pixels (default: %default)")
823 op.add_option("-y", "--height", type="int", default=1000,
824 help="maximum height in pixels (default: %default)")
825 op.add_option("-c", "--forwardcolor", metavar="COLOR", default="red",
826 help="color for forward alignments (default: %default)")
827 op.add_option("-r", "--reversecolor", metavar="COLOR", default="blue",
828 help="color for reverse alignments (default: %default)")
829 op.add_option("--alignments", metavar="FILE", help="secondary alignments")
830 op.add_option("--sort1", default="1", metavar="N",
831 help="genome1 sequence order: 0=input order, 1=name order, "
832 "2=length order, 3=alignment order (default=%default)")
833 op.add_option("--sort2", default="1", metavar="N",
834 help="genome2 sequence order: 0=input order, 1=name order, "
835 "2=length order, 3=alignment order (default=%default)")
836 op.add_option("--max-gap1", metavar="FRAC", default="0.5,2", help=
837 "maximum unaligned (end,mid) gap in genome1: "
838 "fraction of aligned length (default=%default)")
839 op.add_option("--max-gap2", metavar="FRAC", default="0.5,2", help=
840 "maximum unaligned (end,mid) gap in genome2: "
841 "fraction of aligned length (default=%default)")
842 op.add_option("--pad", metavar="FRAC", type="float", default=0.04, help=
843 "pad length when cutting unaligned gaps: "
844 "fraction of aligned length (default=%default)")
845 op.add_option("--border-pixels", metavar="INT", type="int", default=1,
846 help="number of pixels between sequences (default=%default)")
847 op.add_option("--border-color", metavar="COLOR", default="black",
848 help="color for pixels between sequences (default=%default)")
849 # --break-color and/or --break-pixels for intra-sequence breaks?
850 op.add_option("--margin-color", metavar="COLOR", default="#dcdcdc",
853 og = optparse.OptionGroup(op, "Text options")
854 og.add_option("-f", "--fontfile", metavar="FILE",
855 help="TrueType or OpenType font file")
856 og.add_option("-s", "--fontsize", metavar="SIZE", type="int", default=14,
857 help="TrueType or OpenType font size (default: %default)")
858 og.add_option("--labels1", type="int", default=0, metavar="N", help=
859 "genome1 labels: 0=name, 1=name:length, "
860 "2=name:start:length, 3=name:start-end (default=%default)")
861 og.add_option("--labels2", type="int", default=0, metavar="N", help=
862 "genome2 labels: 0=name, 1=name:length, "
863 "2=name:start:length, 3=name:start-end (default=%default)")
864 og.add_option("--rot1", metavar="ROT", default="h",
865 help="text rotation for the 1st genome (default=%default)")
866 og.add_option("--rot2", metavar="ROT", default="v",
867 help="text rotation for the 2nd genome (default=%default)")
868 op.add_option_group(og)
870 og = optparse.OptionGroup(op, "Annotation options")
871 og.add_option("--bed1", metavar="FILE",
872 help="read genome1 annotations from BED file")
873 og.add_option("--bed2", metavar="FILE",
874 help="read genome2 annotations from BED file")
875 og.add_option("--rmsk1", metavar="FILE", help="read genome1 repeats from "
876 "RepeatMasker .out or rmsk.txt file")
877 og.add_option("--rmsk2", metavar="FILE", help="read genome2 repeats from "
878 "RepeatMasker .out or rmsk.txt file")
879 op.add_option_group(og)
881 og = optparse.OptionGroup(op, "Gene options")
882 og.add_option("--genePred1", metavar="FILE",
883 help="read genome1 genes from genePred file")
884 og.add_option("--genePred2", metavar="FILE",
885 help="read genome2 genes from genePred file")
886 og.add_option("--exon-color", metavar="COLOR", default="PaleGreen",
887 help="color for exons (default=%default)")
888 og.add_option("--cds-color", metavar="COLOR", default="LimeGreen",
889 help="color for protein-coding regions (default=%default)")
890 op.add_option_group(og)
892 og = optparse.OptionGroup(op, "Unsequenced gap options")
893 og.add_option("--gap1", metavar="FILE",
894 help="read genome1 unsequenced gaps from agp or gap file")
895 og.add_option("--gap2", metavar="FILE",
896 help="read genome2 unsequenced gaps from agp or gap file")
897 og.add_option("--bridged-color", metavar="COLOR", default="yellow",
898 help="color for bridged gaps (default: %default)")
899 og.add_option("--unbridged-color", metavar="COLOR", default="orange",
900 help="color for unbridged gaps (default: %default)")
901 op.add_option_group(og)
902 (opts, args) = op.parse_args()
903 if len(args) != 2: op.error("2 arguments needed")
905 opts.text_color = "black"
906 opts.background_color = "white"
907 opts.label_space = 5 # minimum number of pixels between axis labels
909 try: lastDotplot(opts, args)
910 except KeyboardInterrupt: pass # avoid silly error message
911 except Exception as e:
912 prog = os.path.basename(sys.argv[0])
913 sys.exit(prog + ": error: " + str(e))