3 # Read pair-wise alignments in MAF or LAST tabular format: write an
4 # "Oxford grid", a.k.a. dotplot.
6 # TODO: Currently, pixels with zero aligned nt-pairs are white, and
7 # pixels with one or more aligned nt-pairs are black. This can look
8 # too crowded for large genome alignments. I tried shading each pixel
9 # according to the number of aligned nt-pairs within it, but the
10 # result is too faint. How can this be done better?
15 from fnmatch import fnmatchcase
16 from operator import itemgetter
18 import itertools, optparse, os, re, sys
20 # Try to make PIL/PILLOW work:
21 try: from PIL import Image, ImageDraw, ImageFont, ImageColor
22 except ImportError: import Image, ImageDraw, ImageFont, ImageColor
24 def myOpen(fileName): # faster than fileinput
29 if fileName.endswith(".gz"):
30 return gzip.open(fileName)
35 prog = os.path.basename(sys.argv[0])
36 sys.stderr.write(prog + ": " + message + "\n")
38 def groupByFirstItem(things):
39 for k, v in itertools.groupby(things, itemgetter(0)):
40 yield k, [i[1:] for i in v]
42 def croppedBlocks(blocks, ranges1, ranges2):
43 headBeg1, headBeg2, headSize = blocks[0]
46 cropBeg1, cropEnd1 = r1
48 cropBeg1, cropEnd1 = -cropEnd1, -cropBeg1
49 cropBeg2, cropEnd2 = r2
51 cropBeg2, cropEnd2 = -cropEnd2, -cropBeg2
52 for beg1, beg2, size in blocks:
53 b1 = max(cropBeg1, beg1)
54 e1 = min(cropEnd1, beg1 + size)
57 b2 = max(cropBeg2, b1 + offset)
58 e2 = min(cropEnd2, e1 + offset)
60 yield b2 - offset, b2, e2 - b2
62 def tabBlocks(beg1, beg2, blocks):
63 '''Get the gapless blocks of an alignment, from LAST tabular format.'''
64 for i in blocks.split(","):
71 yield beg1, beg2, size
75 def mafBlocks(beg1, beg2, seq1, seq2):
76 '''Get the gapless blocks of an alignment, from MAF format.'''
78 for x, y in itertools.izip(seq1, seq2):
81 yield beg1, beg2, size
88 yield beg1, beg2, size
95 if size: yield beg1, beg2, size
97 def alignmentInput(lines):
98 '''Get alignments and sequence lengths, from MAF or tabular format.'''
102 if line[0].isdigit(): # tabular format
103 chr1, beg1, seqlen1 = w[1], int(w[2]), int(w[5])
104 if w[4] == "-": beg1 -= seqlen1
105 chr2, beg2, seqlen2 = w[6], int(w[7]), int(w[10])
106 if w[9] == "-": beg2 -= seqlen2
107 blocks = tabBlocks(beg1, beg2, w[11])
108 yield chr1, seqlen1, chr2, seqlen2, blocks
109 elif line[0] == "s": # MAF format
111 chr1, beg1, seqlen1, seq1 = w[1], int(w[2]), int(w[5]), w[6]
112 if w[4] == "-": beg1 -= seqlen1
115 chr2, beg2, seqlen2, seq2 = w[1], int(w[2]), int(w[5]), w[6]
116 if w[4] == "-": beg2 -= seqlen2
117 blocks = mafBlocks(beg1, beg2, seq1, seq2)
118 yield chr1, seqlen1, chr2, seqlen2, blocks
121 def seqRequestFromText(text):
123 pattern, interval = text.rsplit(":", 1)
125 beg, end = interval.rsplit("-", 1)
126 return pattern, int(beg), int(end) # beg may be negative
127 return text, 0, sys.maxsize
129 def rangesFromSeqName(seqRequests, name, seqLen):
131 base = name.split(".")[-1] # allow for names like hg19.chr7
132 for pat, beg, end in seqRequests:
133 if fnmatchcase(name, pat) or fnmatchcase(base, pat):
134 yield max(beg, 0), min(end, seqLen)
138 def updateSeqs(coverDict, seqRanges, seqName, ranges, coveredRange):
139 beg, end = coveredRange
141 coveredRange = -end, -beg
142 if seqName in coverDict:
143 coverDict[seqName].append(coveredRange)
145 coverDict[seqName] = [coveredRange]
146 for beg, end in ranges:
147 r = seqName, beg, end
150 def readAlignments(fileName, opts):
151 '''Get alignments and sequence limits, from MAF or tabular format.'''
152 seqRequests1 = [seqRequestFromText(i) for i in opts.seq1]
153 seqRequests2 = [seqRequestFromText(i) for i in opts.seq2]
160 lines = myOpen(fileName)
161 for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
162 ranges1 = sorted(rangesFromSeqName(seqRequests1, seqName1, seqLen1))
163 if not ranges1: continue
164 ranges2 = sorted(rangesFromSeqName(seqRequests2, seqName2, seqLen2))
165 if not ranges2: continue
166 b = list(croppedBlocks(list(blocks), ranges1, ranges2))
168 aln = seqName1, seqName2, b
169 alignments.append(aln)
170 coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
171 updateSeqs(coverDict1, seqRanges1, seqName1, ranges1, coveredRange1)
172 coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
173 updateSeqs(coverDict2, seqRanges2, seqName2, ranges2, coveredRange2)
174 return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
176 def nameAndRangesFromDict(cropDict, seqName):
177 if seqName in cropDict:
178 return seqName, cropDict[seqName]
179 n = seqName.split(".")[-1]
181 return n, cropDict[n]
184 def rangesForSecondaryAlignments(primaryRanges, seqLen):
189 def readSecondaryAlignments(opts, cropRanges1, cropRanges2):
190 cropDict1 = dict(groupByFirstItem(cropRanges1))
191 cropDict2 = dict(groupByFirstItem(cropRanges2))
198 lines = myOpen(opts.alignments)
199 for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
200 seqName1, ranges1 = nameAndRangesFromDict(cropDict1, seqName1)
201 seqName2, ranges2 = nameAndRangesFromDict(cropDict2, seqName2)
202 if not ranges1 and not ranges2:
204 r1 = rangesForSecondaryAlignments(ranges1, seqLen1)
205 r2 = rangesForSecondaryAlignments(ranges2, seqLen2)
206 b = list(croppedBlocks(list(blocks), r1, r2))
208 aln = seqName1, seqName2, b
209 alignments.append(aln)
211 coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
212 updateSeqs(coverDict1, seqRanges1, seqName1, r1, coveredRange1)
214 coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
215 updateSeqs(coverDict2, seqRanges2, seqName2, r2, coveredRange2)
216 return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
218 def twoValuesFromOption(text, separator):
219 if separator in text:
220 return text.split(separator)
223 def mergedRanges(ranges):
224 oldBeg, maxEnd = ranges[0]
225 for beg, end in ranges:
234 def mergedRangesPerSeq(coverDict):
235 for k, v in coverDict.iteritems():
237 yield k, list(mergedRanges(v))
239 def coveredLength(mergedCoverDict):
240 return sum(sum(e - b for b, e in v) for v in mergedCoverDict.itervalues())
242 def trimmed(seqRanges, coverDict, minAlignedBases, maxGapFrac, endPad, midPad):
243 maxEndGapFrac, maxMidGapFrac = twoValuesFromOption(maxGapFrac, ",")
244 maxEndGap = max(float(maxEndGapFrac) * minAlignedBases, endPad * 1.0)
245 maxMidGap = max(float(maxMidGapFrac) * minAlignedBases, midPad * 2.0)
247 for seqName, rangeBeg, rangeEnd in seqRanges:
248 seqBlocks = coverDict[seqName]
249 blocks = [i for i in seqBlocks if i[0] < rangeEnd and i[1] > rangeBeg]
250 if blocks[0][0] - rangeBeg > maxEndGap:
251 rangeBeg = blocks[0][0] - endPad
252 for j, y in enumerate(blocks):
255 if y[0] - x[1] > maxMidGap:
256 yield seqName, rangeBeg, x[1] + midPad
257 rangeBeg = y[0] - midPad
258 if rangeEnd - blocks[-1][1] > maxEndGap:
259 rangeEnd = blocks[-1][1] + endPad
260 yield seqName, rangeBeg, rangeEnd
262 def rangesWithStrandInfo(seqRanges, strandOpt, alignments, seqIndex):
264 forwardMinusReverse = collections.defaultdict(int)
267 beg1, beg2, size = blocks[0]
268 numOfAlignedLetterPairs = sum(i[2] for i in blocks)
269 if (beg1 < 0) != (beg2 < 0): # opposite-strand alignment
270 numOfAlignedLetterPairs *= -1
271 forwardMinusReverse[i[seqIndex]] += numOfAlignedLetterPairs
273 for seqName, beg, end in seqRanges:
275 strandNum = 1 if forwardMinusReverse[seqName] >= 0 else 2
276 yield seqName, beg, end, strandNum
278 def natural_sort_key(my_string):
279 '''Return a sort key for "natural" ordering, e.g. chr9 < chr10.'''
280 parts = re.split(r'(\d+)', my_string)
281 parts[1::2] = map(int, parts[1::2])
284 def nameKey(oneSeqRanges):
285 return natural_sort_key(oneSeqRanges[0][0])
287 def sizeKey(oneSeqRanges):
288 return sum(b - e for n, b, e, s in oneSeqRanges), nameKey(oneSeqRanges)
290 def alignmentKey(seqNamesToLists, oneSeqRanges):
291 seqName = oneSeqRanges[0][0]
292 alignmentsOfThisSequence = seqNamesToLists[seqName]
293 numOfAlignedLetterPairs = sum(i[3] for i in alignmentsOfThisSequence)
294 toMiddle = numOfAlignedLetterPairs // 2
295 for i in alignmentsOfThisSequence:
298 return i[1:3] # sequence-rank and "position" of this alignment
300 def rankAndFlipPerSeq(seqRanges):
301 rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
302 for rank, group in enumerate(rangesGroupedBySeqName):
303 seqName, ranges = group
304 strandNum = next(ranges)[3]
305 flip = 1 if strandNum < 2 else -1
306 yield seqName, (rank, flip)
308 def alignmentSortData(alignments, seqIndex, otherNamesToRanksAndFlips):
309 otherIndex = 1 - seqIndex
312 otherRank, otherFlip = otherNamesToRanksAndFlips[i[otherIndex]]
313 otherPos = otherFlip * abs(blocks[0][otherIndex] +
314 blocks[-1][otherIndex] + blocks[-1][2])
315 numOfAlignedLetterPairs = sum(i[2] for i in blocks)
316 yield i[seqIndex], otherRank, otherPos, numOfAlignedLetterPairs
318 def mySortedRanges(seqRanges, sortOpt, seqIndex, alignments, otherRanges):
319 rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
320 g = [list(ranges) for seqName, ranges in rangesGroupedBySeqName]
329 otherNamesToRanksAndFlips = dict(rankAndFlipPerSeq(otherRanges))
330 alns = sorted(alignmentSortData(alignments, seqIndex,
331 otherNamesToRanksAndFlips))
332 alnsGroupedBySeqName = itertools.groupby(alns, itemgetter(0))
333 seqNamesToLists = dict((k, list(v)) for k, v in alnsGroupedBySeqName)
334 g.sort(key=functools.partial(alignmentKey, seqNamesToLists))
335 return [j for i in g for j in i]
337 def allSortedRanges(opts, alignments, alignmentsB,
338 seqRanges1, seqRangesB1, seqRanges2, seqRangesB2):
339 o1, oB1 = twoValuesFromOption(opts.strands1, ":")
340 o2, oB2 = twoValuesFromOption(opts.strands2, ":")
341 if o1 == "1" and o2 == "1":
342 raise Exception("the strand options have circular dependency")
343 seqRanges1 = list(rangesWithStrandInfo(seqRanges1, o1, alignments, 0))
344 seqRanges2 = list(rangesWithStrandInfo(seqRanges2, o2, alignments, 1))
345 seqRangesB1 = list(rangesWithStrandInfo(seqRangesB1, oB1, alignmentsB, 0))
346 seqRangesB2 = list(rangesWithStrandInfo(seqRangesB2, oB2, alignmentsB, 1))
348 o1, oB1 = twoValuesFromOption(opts.sort1, ":")
349 o2, oB2 = twoValuesFromOption(opts.sort2, ":")
350 if o1 == "3" and o2 == "3":
351 raise Exception("the sort options have circular dependency")
353 s1 = mySortedRanges(seqRanges1, o1, None, None, None)
355 s2 = mySortedRanges(seqRanges2, o2, None, None, None)
357 s1 = mySortedRanges(seqRanges1, o1, 0, alignments, s2)
359 s2 = mySortedRanges(seqRanges2, o2, 1, alignments, s1)
360 t1 = mySortedRanges(seqRangesB1, oB1, 0, alignmentsB, s2)
361 t2 = mySortedRanges(seqRangesB2, oB2, 1, alignmentsB, s1)
362 return s1 + t1, s2 + t2
368 groups.append(t[-3:])
370 return ",".join(reversed(groups))
373 suffixes = "bp", "kb", "Mb", "Gb"
374 for i, x in enumerate(suffixes):
377 return "%.2g" % (1.0 * size / j) + x
378 if size < j * 1000 or i == len(suffixes) - 1:
379 return "%.0f" % (1.0 * size / j) + x
381 def labelText(seqRange, labelOpt):
382 seqName, beg, end, strandNum = seqRange
384 return seqName + ": " + sizeText(end - beg)
386 return seqName + ":" + prettyNum(beg) + ": " + sizeText(end - beg)
388 return seqName + ":" + prettyNum(beg) + "-" + prettyNum(end)
391 def rangeLabels(seqRanges, labelOpt, font, fontsize, image_mode, textRot):
394 im = Image.new(image_mode, image_size)
395 draw = ImageDraw.Draw(im)
398 text = labelText(r, labelOpt)
400 x, y = draw.textsize(text, font=font)
403 yield text, x, y, r[3]
405 def dataFromRanges(sortedRanges, font, fontSize, imageMode, labelOpt, textRot):
406 for seqName, rangeBeg, rangeEnd, strandNum in sortedRanges:
407 out = [seqName, str(rangeBeg), str(rangeEnd)]
409 out.append(".+-"[strandNum])
412 rangeSizes = [e - b for n, b, e, s in sortedRanges]
413 labs = list(rangeLabels(sortedRanges, labelOpt, font, fontSize,
415 margin = max(i[2] for i in labs)
416 # xxx the margin may be too big, because some labels may get omitted
417 return rangeSizes, labs, margin
420 '''Return x / y rounded up.'''
424 def get_bp_per_pix(rangeSizes, pixTweenRanges, maxPixels):
425 '''Get the minimum bp-per-pixel that fits in the size limit.'''
426 warn("choosing bp per pixel...")
427 numOfRanges = len(rangeSizes)
428 maxPixelsInRanges = maxPixels - pixTweenRanges * (numOfRanges - 1)
429 if maxPixelsInRanges < numOfRanges:
430 raise Exception("can't fit the image: too many sequences?")
431 negLimit = -maxPixelsInRanges
432 negBpPerPix = sum(rangeSizes) // negLimit
434 if sum(i // negBpPerPix for i in rangeSizes) >= negLimit:
438 def getRangePixBegs(rangePixLens, pixTweenRanges, margin):
439 '''Get the start pixel for each range.'''
441 pix_tot = margin - pixTweenRanges
442 for i in rangePixLens:
443 pix_tot += pixTweenRanges
444 rangePixBegs.append(pix_tot)
448 def pixelData(rangeSizes, bp_per_pix, pixTweenRanges, margin):
449 '''Return pixel information about the ranges.'''
450 rangePixLens = [div_ceil(i, bp_per_pix) for i in rangeSizes]
451 rangePixBegs = getRangePixBegs(rangePixLens, pixTweenRanges, margin)
452 tot_pix = rangePixBegs[-1] + rangePixLens[-1]
453 return rangePixBegs, rangePixLens, tot_pix
455 def drawLineForward(hits, width, bp_per_pix, beg1, beg2, size):
457 q1, r1 = divmod(beg1, bp_per_pix)
458 q2, r2 = divmod(beg2, bp_per_pix)
459 hits[q2 * width + q1] |= 1
460 next_pix = min(bp_per_pix - r1, bp_per_pix - r2)
461 if next_pix >= size: break
466 def drawLineReverse(hits, width, bp_per_pix, beg1, beg2, size):
468 q1, r1 = divmod(beg1, bp_per_pix)
469 q2, r2 = divmod(beg2, bp_per_pix)
470 hits[q2 * width + q1] |= 2
471 next_pix = min(bp_per_pix - r1, r2 + 1)
472 if next_pix >= size: break
477 def strandAndOrigin(ranges, beg, size):
478 isReverseStrand = (beg < 0)
481 for rangeBeg, rangeEnd, isReverseRange, origin in ranges:
482 if rangeEnd > beg: # assumes the ranges are sorted
483 return (isReverseStrand != isReverseRange), origin
485 def alignmentPixels(width, height, alignments, bp_per_pix,
486 rangeDict1, rangeDict2):
487 hits = [0] * (width * height) # the image data
488 for seq1, seq2, blocks in alignments:
489 beg1, beg2, size = blocks[0]
490 isReverse1, ori1 = strandAndOrigin(rangeDict1[seq1], beg1, size)
491 isReverse2, ori2 = strandAndOrigin(rangeDict2[seq2], beg2, size)
492 for beg1, beg2, size in blocks:
494 beg1 = -(beg1 + size)
495 beg2 = -(beg2 + size)
496 if isReverse1 == isReverse2:
497 drawLineForward(hits, width, bp_per_pix,
498 ori1 + beg1, ori2 + beg2, size)
500 drawLineReverse(hits, width, bp_per_pix,
501 ori1 + beg1, ori2 - beg2 - 1, size)
504 def expandedSeqDict(seqDict):
505 '''Allow lookup by short sequence names, e.g. chr7 as well as hg19.chr7.'''
506 newDict = seqDict.copy()
507 for name, x in seqDict.items():
509 base = name.split(".")[-1]
510 if base in newDict: # an ambiguous case was found:
511 return seqDict # so give up completely
515 def readBed(fileName, rangeDict):
516 for line in myOpen(fileName):
520 if seqName not in rangeDict: continue
529 if len(w) > 8 and w[8].count(",") == 2:
530 color = "rgb(" + w[8] + ")"
533 isRev = rangeDict[seqName][0][2]
534 if strand == "+" and not isRev or strand == "-" and isRev:
536 if strand == "-" and not isRev or strand == "+" and isRev:
538 yield layer, color, seqName, beg, end
540 def commaSeparatedInts(text):
541 return map(int, text.rstrip(",").split(","))
543 def readGenePred(opts, fileName, rangeDict):
544 for line in myOpen(fileName):
545 fields = line.split()
546 if not fields: continue
547 if fields[2] not in "+-": fields = fields[1:]
549 if seqName not in rangeDict: continue
551 cdsBeg = int(fields[5])
552 cdsEnd = int(fields[6])
553 exonBegs = commaSeparatedInts(fields[8])
554 exonEnds = commaSeparatedInts(fields[9])
555 for beg, end in zip(exonBegs, exonEnds):
556 yield 300, opts.exon_color, seqName, beg, end
559 if b < e: yield 400, opts.cds_color, seqName, b, e
561 def readRmsk(fileName, rangeDict):
562 for line in myOpen(fileName):
563 fields = line.split()
564 if len(fields) == 17: # rmsk.txt
566 if seqName not in rangeDict: continue # do this ASAP for speed
570 repeatClass = fields[11]
571 elif len(fields) == 15: # .out
573 if seqName not in rangeDict: continue
574 beg = int(fields[5]) - 1
577 repeatClass = fields[10]
580 if repeatClass in ("Low_complexity", "Simple_repeat"):
581 yield 200, "#fbf", seqName, beg, end
582 elif (strand == "+") != rangeDict[seqName][0][2]:
583 yield 100, "#ffe8e8", seqName, beg, end
585 yield 100, "#e8e8ff", seqName, beg, end
587 def isExtraFirstGapField(fields):
588 return fields[4].isdigit()
590 def readGaps(opts, fileName, rangeDict):
591 '''Read locations of unsequenced gaps, from an agp or gap file.'''
592 for line in myOpen(fileName):
594 if not w or w[0][0] == "#": continue
595 if isExtraFirstGapField(w): w = w[1:]
596 if w[4] not in "NU": continue
598 if seqName not in rangeDict: continue
600 beg = end - int(w[5]) # zero-based coordinate
602 yield 3000, opts.bridged_color, seqName, beg, end
604 yield 2000, opts.unbridged_color, seqName, beg, end
606 def bedBoxes(beds, rangeDict, margin, edge, isTop, bpPerPix):
607 for layer, color, seqName, bedBeg, bedEnd in beds:
608 for rangeBeg, rangeEnd, isReverseRange, origin in rangeDict[seqName]:
609 beg = max(bedBeg, rangeBeg)
610 end = min(bedEnd, rangeEnd)
611 if beg >= end: continue
613 beg, end = -end, -beg
615 # include partly-covered pixels
616 b = (origin + beg) // bpPerPix
617 e = div_ceil(origin + end, bpPerPix)
619 # exclude partly-covered pixels
620 b = div_ceil(origin + beg, bpPerPix)
621 e = (origin + end) // bpPerPix
623 if bedEnd >= rangeEnd: # include partly-covered end pixels
625 b = (origin + beg) // bpPerPix
627 e = div_ceil(origin + end, bpPerPix)
629 box = b, margin, e, edge
631 box = margin, b, edge, e
632 yield layer, color, box
634 def drawAnnotations(im, boxes):
635 # xxx use partial transparency for different-color overlaps?
636 for layer, color, box in boxes:
639 def placedLabels(labels, rangePixBegs, rangePixLens, beg, end):
640 '''Return axis labels with endpoint & sort-order information.'''
642 for i, j, k in zip(labels, rangePixBegs, rangePixLens):
643 text, textWidth, textHeight, strandNum = i
644 if textWidth > maxWidth:
646 labelBeg = j + (k - textWidth) // 2
647 labelEnd = labelBeg + textWidth
648 sortKey = textWidth - k
650 sortKey += maxWidth * (beg - labelBeg)
652 labelEnd = beg + textWidth
654 sortKey += maxWidth * (labelEnd - end)
656 labelBeg = end - textWidth
657 yield sortKey, labelBeg, labelEnd, text, textHeight, strandNum
659 def nonoverlappingLabels(labels, minPixTweenLabels):
660 '''Get a subset of non-overlapping axis labels, greedily.'''
663 beg = i[1] - minPixTweenLabels
664 end = i[2] + minPixTweenLabels
665 if all(j[2] <= beg or j[1] >= end for j in out):
669 def axisImage(labels, rangePixBegs, rangePixLens, textRot,
670 textAln, font, image_mode, opts):
671 '''Make an image of axis labels.'''
672 beg = rangePixBegs[0]
673 end = rangePixBegs[-1] + rangePixLens[-1]
674 margin = max(i[2] for i in labels)
675 labels = sorted(placedLabels(labels, rangePixBegs, rangePixLens, beg, end))
676 minPixTweenLabels = 0 if textRot else opts.label_space
677 labels = nonoverlappingLabels(labels, minPixTweenLabels)
678 image_size = (margin, end) if textRot else (end, margin)
679 im = Image.new(image_mode, image_size, opts.margin_color)
680 draw = ImageDraw.Draw(im)
681 for sortKey, labelBeg, labelEnd, text, textHeight, strandNum in labels:
682 base = margin - textHeight if textAln else 0
683 position = (base, labelBeg) if textRot else (labelBeg, base)
684 fill = ("black", opts.forwardcolor, opts.reversecolor)[strandNum]
685 draw.text(position, text, font=font, fill=fill)
688 def rangesWithOrigins(sortedRanges, rangePixBegs, rangePixLens, bpPerPix):
689 for i, j, k in zip(sortedRanges, rangePixBegs, rangePixLens):
690 seqName, rangeBeg, rangeEnd, strandNum = i
691 isReverseRange = (strandNum > 1)
693 origin = bpPerPix * (j + k) + rangeBeg
695 origin = bpPerPix * j - rangeBeg
696 yield seqName, (rangeBeg, rangeEnd, isReverseRange, origin)
698 def rangesPerSeq(sortedRanges, rangePixBegs, rangePixLens, bpPerPix):
699 a = rangesWithOrigins(sortedRanges, rangePixBegs, rangePixLens, bpPerPix)
700 for k, v in itertools.groupby(a, itemgetter(0)):
701 yield k, sorted(i[1] for i in v)
705 return ImageFont.truetype(opts.fontfile, opts.fontsize)
708 x = ["fc-match", "-f%{file}", "arial"]
709 p = subprocess.Popen(x, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
710 out, err = p.communicate()
711 fileNames.append(out)
713 warn("fc-match error: " + str(e))
714 fileNames.append("/Library/Fonts/Arial.ttf") # for Mac
717 font = ImageFont.truetype(i, opts.fontsize)
721 warn("font load error: " + str(e))
722 return ImageFont.load_default()
724 def lastDotplot(opts, args):
727 forward_color = ImageColor.getcolor(opts.forwardcolor, image_mode)
728 reverse_color = ImageColor.getcolor(opts.reversecolor, image_mode)
729 zipped_colors = zip(forward_color, reverse_color)
730 overlap_color = tuple([(i + j) // 2 for i, j in zipped_colors])
732 maxGap1, maxGapB1 = twoValuesFromOption(opts.max_gap1, ":")
733 maxGap2, maxGapB2 = twoValuesFromOption(opts.max_gap2, ":")
735 warn("reading alignments...")
736 alnData = readAlignments(args[0], opts)
737 alignments, seqRanges1, coverDict1, seqRanges2, coverDict2 = alnData
738 if not alignments: raise Exception("there are no alignments")
740 coverDict1 = dict(mergedRangesPerSeq(coverDict1))
741 coverDict2 = dict(mergedRangesPerSeq(coverDict2))
742 minAlignedBases = min(coveredLength(coverDict1), coveredLength(coverDict2))
743 pad = int(opts.pad * minAlignedBases)
744 cutRanges1 = list(trimmed(seqRanges1, coverDict1, minAlignedBases,
746 cutRanges2 = list(trimmed(seqRanges2, coverDict2, minAlignedBases,
749 warn("reading secondary alignments...")
750 alnDataB = readSecondaryAlignments(opts, cutRanges1, cutRanges2)
751 alignmentsB, seqRangesB1, coverDictB1, seqRangesB2, coverDictB2 = alnDataB
753 coverDictB1 = dict(mergedRangesPerSeq(coverDictB1))
754 coverDictB2 = dict(mergedRangesPerSeq(coverDictB2))
755 cutRangesB1 = trimmed(seqRangesB1, coverDictB1, minAlignedBases,
757 cutRangesB2 = trimmed(seqRangesB2, coverDictB2, minAlignedBases,
761 sortOut = allSortedRanges(opts, alignments, alignmentsB,
762 cutRanges1, cutRangesB1, cutRanges2, cutRangesB2)
763 sortedRanges1, sortedRanges2 = sortOut
765 textRot1 = "vertical".startswith(opts.rot1)
766 i1 = dataFromRanges(sortedRanges1, font,
767 opts.fontsize, image_mode, opts.labels1, textRot1)
768 rangeSizes1, labelData1, tMargin = i1
770 textRot2 = "horizontal".startswith(opts.rot2)
771 i2 = dataFromRanges(sortedRanges2, font,
772 opts.fontsize, image_mode, opts.labels2, textRot2)
773 rangeSizes2, labelData2, lMargin = i2
775 maxPixels1 = opts.width - lMargin
776 maxPixels2 = opts.height - tMargin
777 bpPerPix1 = get_bp_per_pix(rangeSizes1, opts.border_pixels, maxPixels1)
778 bpPerPix2 = get_bp_per_pix(rangeSizes2, opts.border_pixels, maxPixels2)
779 bpPerPix = max(bpPerPix1, bpPerPix2)
780 warn("bp per pixel = " + str(bpPerPix))
782 p1 = pixelData(rangeSizes1, bpPerPix, opts.border_pixels, lMargin)
783 rangePixBegs1, rangePixLens1, width = p1
784 rangeDict1 = dict(rangesPerSeq(sortedRanges1, rangePixBegs1,
785 rangePixLens1, bpPerPix))
787 p2 = pixelData(rangeSizes2, bpPerPix, opts.border_pixels, tMargin)
788 rangePixBegs2, rangePixLens2, height = p2
789 rangeDict2 = dict(rangesPerSeq(sortedRanges2, rangePixBegs2,
790 rangePixLens2, bpPerPix))
792 warn("width: " + str(width))
793 warn("height: " + str(height))
795 warn("processing alignments...")
796 hits = alignmentPixels(width, height, alignments + alignmentsB, bpPerPix,
797 rangeDict1, rangeDict2)
799 warn("reading annotations...")
801 rangeDict1 = expandedSeqDict(rangeDict1)
802 beds1 = itertools.chain(readBed(opts.bed1, rangeDict1),
803 readRmsk(opts.rmsk1, rangeDict1),
804 readGenePred(opts, opts.genePred1, rangeDict1),
805 readGaps(opts, opts.gap1, rangeDict1))
806 b1 = bedBoxes(beds1, rangeDict1, tMargin, height, True, bpPerPix)
808 rangeDict2 = expandedSeqDict(rangeDict2)
809 beds2 = itertools.chain(readBed(opts.bed2, rangeDict2),
810 readRmsk(opts.rmsk2, rangeDict2),
811 readGenePred(opts, opts.genePred2, rangeDict2),
812 readGaps(opts, opts.gap2, rangeDict2))
813 b2 = bedBoxes(beds2, rangeDict2, lMargin, width, False, bpPerPix)
815 boxes = sorted(itertools.chain(b1, b2))
819 image_size = width, height
820 im = Image.new(image_mode, image_size, opts.background_color)
822 drawAnnotations(im, boxes)
824 for i in range(height):
825 for j in range(width):
826 store_value = hits[i * width + j]
828 if store_value == 1: im.putpixel(xy, forward_color)
829 elif store_value == 2: im.putpixel(xy, reverse_color)
830 elif store_value == 3: im.putpixel(xy, overlap_color)
832 if opts.fontsize != 0:
833 axis1 = axisImage(labelData1, rangePixBegs1, rangePixLens1,
834 textRot1, False, font, image_mode, opts)
836 axis1 = axis1.transpose(Image.ROTATE_90)
837 axis2 = axisImage(labelData2, rangePixBegs2, rangePixLens2,
838 textRot2, textRot2, font, image_mode, opts)
840 axis2 = axis2.transpose(Image.ROTATE_270)
841 im.paste(axis1, (0, 0))
842 im.paste(axis2, (0, 0))
844 for i in rangePixBegs1[1:]:
845 box = i - opts.border_pixels, tMargin, i, height
846 im.paste(opts.border_color, box)
848 for i in rangePixBegs2[1:]:
849 box = lMargin, i - opts.border_pixels, width, i
850 im.paste(opts.border_color, box)
854 if __name__ == "__main__":
855 usage = """%prog --help
856 or: %prog [options] maf-or-tab-alignments dotplot.png
857 or: %prog [options] maf-or-tab-alignments dotplot.gif
859 description = "Draw a dotplot of pair-wise sequence alignments in MAF or tabular format."
860 op = optparse.OptionParser(usage=usage, description=description)
861 op.add_option("-v", "--verbose", action="count",
862 help="show progress messages & data about the plot")
863 op.add_option("-1", "--seq1", metavar="PATTERN", action="append",
865 help="which sequences to show from the 1st genome")
866 op.add_option("-2", "--seq2", metavar="PATTERN", action="append",
868 help="which sequences to show from the 2nd genome")
869 # Replace "width" & "height" with a single "length" option?
870 op.add_option("-x", "--width", type="int", default=1000,
871 help="maximum width in pixels (default: %default)")
872 op.add_option("-y", "--height", type="int", default=1000,
873 help="maximum height in pixels (default: %default)")
874 op.add_option("-c", "--forwardcolor", metavar="COLOR", default="red",
875 help="color for forward alignments (default: %default)")
876 op.add_option("-r", "--reversecolor", metavar="COLOR", default="blue",
877 help="color for reverse alignments (default: %default)")
878 op.add_option("--alignments", metavar="FILE", help="secondary alignments")
879 op.add_option("--sort1", default="1", metavar="N",
880 help="genome1 sequence order: 0=input order, 1=name order, "
881 "2=length order, 3=alignment order (default=%default)")
882 op.add_option("--sort2", default="1", metavar="N",
883 help="genome2 sequence order: 0=input order, 1=name order, "
884 "2=length order, 3=alignment order (default=%default)")
885 op.add_option("--strands1", default="0", metavar="N", help=
886 "genome1 sequence orientation: 0=forward orientation, "
887 "1=alignment orientation (default=%default)")
888 op.add_option("--strands2", default="0", metavar="N", help=
889 "genome2 sequence orientation: 0=forward orientation, "
890 "1=alignment orientation (default=%default)")
891 op.add_option("--max-gap1", metavar="FRAC", default="0.5,2", help=
892 "maximum unaligned (end,mid) gap in genome1: "
893 "fraction of aligned length (default=%default)")
894 op.add_option("--max-gap2", metavar="FRAC", default="0.5,2", help=
895 "maximum unaligned (end,mid) gap in genome2: "
896 "fraction of aligned length (default=%default)")
897 op.add_option("--pad", metavar="FRAC", type="float", default=0.04, help=
898 "pad length when cutting unaligned gaps: "
899 "fraction of aligned length (default=%default)")
900 op.add_option("--border-pixels", metavar="INT", type="int", default=1,
901 help="number of pixels between sequences (default=%default)")
902 op.add_option("--border-color", metavar="COLOR", default="black",
903 help="color for pixels between sequences (default=%default)")
904 # --break-color and/or --break-pixels for intra-sequence breaks?
905 op.add_option("--margin-color", metavar="COLOR", default="#dcdcdc",
908 og = optparse.OptionGroup(op, "Text options")
909 og.add_option("-f", "--fontfile", metavar="FILE",
910 help="TrueType or OpenType font file")
911 og.add_option("-s", "--fontsize", metavar="SIZE", type="int", default=14,
912 help="TrueType or OpenType font size (default: %default)")
913 og.add_option("--labels1", type="int", default=0, metavar="N", help=
914 "genome1 labels: 0=name, 1=name:length, "
915 "2=name:start:length, 3=name:start-end (default=%default)")
916 og.add_option("--labels2", type="int", default=0, metavar="N", help=
917 "genome2 labels: 0=name, 1=name:length, "
918 "2=name:start:length, 3=name:start-end (default=%default)")
919 og.add_option("--rot1", metavar="ROT", default="h",
920 help="text rotation for the 1st genome (default=%default)")
921 og.add_option("--rot2", metavar="ROT", default="v",
922 help="text rotation for the 2nd genome (default=%default)")
923 op.add_option_group(og)
925 og = optparse.OptionGroup(op, "Annotation options")
926 og.add_option("--bed1", metavar="FILE",
927 help="read genome1 annotations from BED file")
928 og.add_option("--bed2", metavar="FILE",
929 help="read genome2 annotations from BED file")
930 og.add_option("--rmsk1", metavar="FILE", help="read genome1 repeats from "
931 "RepeatMasker .out or rmsk.txt file")
932 og.add_option("--rmsk2", metavar="FILE", help="read genome2 repeats from "
933 "RepeatMasker .out or rmsk.txt file")
934 op.add_option_group(og)
936 og = optparse.OptionGroup(op, "Gene options")
937 og.add_option("--genePred1", metavar="FILE",
938 help="read genome1 genes from genePred file")
939 og.add_option("--genePred2", metavar="FILE",
940 help="read genome2 genes from genePred file")
941 og.add_option("--exon-color", metavar="COLOR", default="PaleGreen",
942 help="color for exons (default=%default)")
943 og.add_option("--cds-color", metavar="COLOR", default="LimeGreen",
944 help="color for protein-coding regions (default=%default)")
945 op.add_option_group(og)
947 og = optparse.OptionGroup(op, "Unsequenced gap options")
948 og.add_option("--gap1", metavar="FILE",
949 help="read genome1 unsequenced gaps from agp or gap file")
950 og.add_option("--gap2", metavar="FILE",
951 help="read genome2 unsequenced gaps from agp or gap file")
952 og.add_option("--bridged-color", metavar="COLOR", default="yellow",
953 help="color for bridged gaps (default: %default)")
954 og.add_option("--unbridged-color", metavar="COLOR", default="orange",
955 help="color for unbridged gaps (default: %default)")
956 op.add_option_group(og)
957 (opts, args) = op.parse_args()
958 if len(args) != 2: op.error("2 arguments needed")
960 opts.background_color = "white"
961 opts.label_space = 5 # minimum number of pixels between axis labels
963 try: lastDotplot(opts, args)
964 except KeyboardInterrupt: pass # avoid silly error message
965 except Exception as e:
966 prog = os.path.basename(sys.argv[0])
967 sys.exit(prog + ": error: " + str(e))