2 # Author: Martin C. Frith 2008
3 # SPDX-License-Identifier: GPL-3.0-or-later
5 # Read pair-wise alignments in MAF or LAST tabular format: write an
6 # "Oxford grid", a.k.a. dotplot.
8 # TODO: Currently, pixels with zero aligned nt-pairs are white, and
9 # pixels with one or more aligned nt-pairs are black. This can look
10 # too crowded for large genome alignments. I tried shading each pixel
11 # according to the number of aligned nt-pairs within it, but the
12 # result is too faint. How can this be done better?
17 from fnmatch import fnmatchcase
19 from operator import itemgetter
21 import itertools, optparse, os, re, sys
23 # Try to make PIL/PILLOW work:
25 from PIL import Image, ImageDraw, ImageFont, ImageColor
27 import Image, ImageDraw, ImageFont, ImageColor
30 from future_builtins import zip
34 def myOpen(fileName): # faster than fileinput
39 if fileName.endswith(".gz"):
40 return gzip.open(fileName, "rt") # xxx dubious for Python2
43 def groupByFirstItem(things):
44 for k, v in itertools.groupby(things, itemgetter(0)):
45 yield k, [i[1:] for i in v]
47 def croppedBlocks(blocks, ranges1, ranges2):
48 headBeg1, headBeg2, headSize = blocks[0]
51 cropBeg1, cropEnd1 = r1
53 cropBeg1, cropEnd1 = -cropEnd1, -cropBeg1
54 cropBeg2, cropEnd2 = r2
56 cropBeg2, cropEnd2 = -cropEnd2, -cropBeg2
57 for beg1, beg2, size in blocks:
58 b1 = max(cropBeg1, beg1)
59 e1 = min(cropEnd1, beg1 + size)
62 b2 = max(cropBeg2, b1 + offset)
63 e2 = min(cropEnd2, e1 + offset)
65 yield b2 - offset, b2, e2 - b2
67 def tabBlocks(beg1, beg2, blocks):
68 '''Get the gapless blocks of an alignment, from LAST tabular format.'''
69 for i in blocks.split(","):
76 yield beg1, beg2, size
80 def mafBlocks(beg1, beg2, seq1, seq2):
81 '''Get the gapless blocks of an alignment, from MAF format.'''
83 for x, y in zip(seq1, seq2):
86 yield beg1, beg2, size
93 yield beg1, beg2, size
100 if size: yield beg1, beg2, size
102 def alignmentFromSegment(qrySeqName, qrySeqLen, segment):
103 refSeqLen = sys.maxsize # XXX
104 refSeqName, refSeqBeg, qrySeqBeg, size = segment
105 block = refSeqBeg, qrySeqBeg, size
106 return refSeqName, refSeqLen, qrySeqName, qrySeqLen, [block]
108 def alignmentInput(lines):
109 '''Get alignments and sequence lengths, from MAF or tabular format.'''
119 yield alignmentFromSegment(qrySeqName, qrySeqLen, i)
123 elif len(w) == 2 and qrySeqName and w[1].isdigit():
124 qrySeqLen += int(w[1])
125 elif len(w) == 4 and qrySeqName and w[1].isdigit() and w[3].isdigit():
126 refSeqName, refSeqBeg, refSeqEnd = w[0], int(w[1]), int(w[3])
127 size = abs(refSeqEnd - refSeqBeg)
128 if refSeqBeg > refSeqEnd:
129 refSeqBeg = -refSeqBeg
130 segments.append((refSeqName, refSeqBeg, qrySeqLen, size))
132 elif line[0].isdigit(): # tabular format
133 chr1, beg1, seqlen1 = w[1], int(w[2]), int(w[5])
134 if w[4] == "-": beg1 -= seqlen1
135 chr2, beg2, seqlen2 = w[6], int(w[7]), int(w[10])
136 if w[9] == "-": beg2 -= seqlen2
137 blocks = tabBlocks(beg1, beg2, w[11])
138 yield chr1, seqlen1, chr2, seqlen2, blocks
139 elif line[0] == "s": # MAF format
141 chr1, beg1, seqlen1, seq1 = w[1], int(w[2]), int(w[5]), w[6]
142 if w[4] == "-": beg1 -= seqlen1
145 chr2, beg2, seqlen2, seq2 = w[1], int(w[2]), int(w[5]), w[6]
146 if w[4] == "-": beg2 -= seqlen2
147 blocks = mafBlocks(beg1, beg2, seq1, seq2)
148 yield chr1, seqlen1, chr2, seqlen2, blocks
151 yield alignmentFromSegment(qrySeqName, qrySeqLen, i)
153 def seqRequestFromText(text):
155 pattern, interval = text.rsplit(":", 1)
157 beg, end = interval.rsplit("-", 1)
158 return pattern, int(beg), int(end) # beg may be negative
159 return text, 0, sys.maxsize
161 def rangesFromSeqName(seqRequests, name, seqLen):
163 base = name.split(".")[-1] # allow for names like hg19.chr7
164 for pat, beg, end in seqRequests:
165 if fnmatchcase(name, pat) or fnmatchcase(base, pat):
166 yield max(beg, 0), min(end, seqLen)
170 def updateSeqs(coverDict, seqRanges, seqName, ranges, coveredRange):
171 beg, end = coveredRange
173 coveredRange = -end, -beg
174 if seqName in coverDict:
175 coverDict[seqName].append(coveredRange)
177 coverDict[seqName] = [coveredRange]
178 for beg, end in ranges:
179 r = seqName, beg, end
182 def readAlignments(fileName, opts):
183 '''Get alignments and sequence limits, from MAF or tabular format.'''
184 seqRequests1 = [seqRequestFromText(i) for i in opts.seq1]
185 seqRequests2 = [seqRequestFromText(i) for i in opts.seq2]
192 lines = myOpen(fileName)
193 for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
194 ranges1 = sorted(rangesFromSeqName(seqRequests1, seqName1, seqLen1))
195 if not ranges1: continue
196 ranges2 = sorted(rangesFromSeqName(seqRequests2, seqName2, seqLen2))
197 if not ranges2: continue
198 b = list(croppedBlocks(list(blocks), ranges1, ranges2))
200 aln = seqName1, seqName2, b
201 alignments.append(aln)
202 coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
203 updateSeqs(coverDict1, seqRanges1, seqName1, ranges1, coveredRange1)
204 coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
205 updateSeqs(coverDict2, seqRanges2, seqName2, ranges2, coveredRange2)
206 return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
208 def nameAndRangesFromDict(cropDict, seqName):
209 if seqName in cropDict:
210 return seqName, cropDict[seqName]
211 n = seqName.split(".")[-1]
213 return n, cropDict[n]
216 def rangesForSecondaryAlignments(primaryRanges, seqLen):
221 def readSecondaryAlignments(opts, cropRanges1, cropRanges2):
222 cropDict1 = dict(groupByFirstItem(cropRanges1))
223 cropDict2 = dict(groupByFirstItem(cropRanges2))
230 lines = myOpen(opts.alignments)
231 for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
232 seqName1, ranges1 = nameAndRangesFromDict(cropDict1, seqName1)
233 seqName2, ranges2 = nameAndRangesFromDict(cropDict2, seqName2)
234 if not ranges1 and not ranges2:
236 r1 = rangesForSecondaryAlignments(ranges1, seqLen1)
237 r2 = rangesForSecondaryAlignments(ranges2, seqLen2)
238 b = list(croppedBlocks(list(blocks), r1, r2))
240 aln = seqName1, seqName2, b
241 alignments.append(aln)
243 coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
244 updateSeqs(coverDict1, seqRanges1, seqName1, r1, coveredRange1)
246 coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
247 updateSeqs(coverDict2, seqRanges2, seqName2, r2, coveredRange2)
248 return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
250 def twoValuesFromOption(text, separator):
251 if separator in text:
252 return text.split(separator)
255 def mergedRanges(ranges):
256 oldBeg, maxEnd = ranges[0]
257 for beg, end in ranges:
266 def mergedRangesPerSeq(coverDict):
267 for k, v in coverDict.items():
269 yield k, list(mergedRanges(v))
271 def coveredLength(mergedCoverDict):
272 return sum(sum(e - b for b, e in v) for v in mergedCoverDict.values())
274 def trimmed(seqRanges, coverDict, minAlignedBases, maxGapFrac, endPad, midPad):
275 maxEndGapFrac, maxMidGapFrac = twoValuesFromOption(maxGapFrac, ",")
276 maxEndGap = max(float(maxEndGapFrac) * minAlignedBases, endPad * 1.0)
277 maxMidGap = max(float(maxMidGapFrac) * minAlignedBases, midPad * 2.0)
279 for seqName, rangeBeg, rangeEnd in seqRanges:
280 seqBlocks = coverDict[seqName]
281 blocks = [i for i in seqBlocks if i[0] < rangeEnd and i[1] > rangeBeg]
282 if blocks[0][0] - rangeBeg > maxEndGap:
283 rangeBeg = blocks[0][0] - endPad
284 for j, y in enumerate(blocks):
287 if y[0] - x[1] > maxMidGap:
288 yield seqName, rangeBeg, x[1] + midPad
289 rangeBeg = y[0] - midPad
290 if rangeEnd - blocks[-1][1] > maxEndGap:
291 rangeEnd = blocks[-1][1] + endPad
292 yield seqName, rangeBeg, rangeEnd
294 def rangesWithStrandInfo(seqRanges, strandOpt, alignments, seqIndex):
296 forwardMinusReverse = collections.defaultdict(int)
299 beg1, beg2, size = blocks[0]
300 numOfAlignedLetterPairs = sum(i[2] for i in blocks)
301 if (beg1 < 0) != (beg2 < 0): # opposite-strand alignment
302 numOfAlignedLetterPairs *= -1
303 forwardMinusReverse[i[seqIndex]] += numOfAlignedLetterPairs
305 for seqName, beg, end in seqRanges:
307 strandNum = 1 if forwardMinusReverse[seqName] >= 0 else 2
308 yield seqName, beg, end, strandNum
310 def natural_sort_key(my_string):
311 '''Return a sort key for "natural" ordering, e.g. chr9 < chr10.'''
312 parts = re.split(r'(\d+)', my_string)
313 parts[1::2] = map(int, parts[1::2])
316 def nameKey(oneSeqRanges):
317 return natural_sort_key(oneSeqRanges[0][0])
319 def sizeKey(oneSeqRanges):
320 return sum(b - e for n, b, e, s in oneSeqRanges), nameKey(oneSeqRanges)
322 def alignmentKey(seqNamesToLists, oneSeqRanges):
323 seqName = oneSeqRanges[0][0]
324 alignmentsOfThisSequence = seqNamesToLists[seqName]
325 numOfAlignedLetterPairs = sum(i[3] for i in alignmentsOfThisSequence)
326 toMiddle = numOfAlignedLetterPairs // 2
327 for i in alignmentsOfThisSequence:
330 return i[1:3] # sequence-rank and "position" of this alignment
332 def rankAndFlipPerSeq(seqRanges):
333 rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
334 for rank, group in enumerate(rangesGroupedBySeqName):
335 seqName, ranges = group
336 strandNum = next(ranges)[3]
337 flip = 1 if strandNum < 2 else -1
338 yield seqName, (rank, flip)
340 def alignmentSortData(alignments, seqIndex, otherNamesToRanksAndFlips):
341 otherIndex = 1 - seqIndex
344 otherRank, otherFlip = otherNamesToRanksAndFlips[i[otherIndex]]
345 otherPos = otherFlip * abs(blocks[0][otherIndex] +
346 blocks[-1][otherIndex] + blocks[-1][2])
347 numOfAlignedLetterPairs = sum(i[2] for i in blocks)
348 yield i[seqIndex], otherRank, otherPos, numOfAlignedLetterPairs
350 def mySortedRanges(seqRanges, sortOpt, seqIndex, alignments, otherRanges):
351 rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
352 g = [list(ranges) for seqName, ranges in rangesGroupedBySeqName]
361 otherNamesToRanksAndFlips = dict(rankAndFlipPerSeq(otherRanges))
362 alns = sorted(alignmentSortData(alignments, seqIndex,
363 otherNamesToRanksAndFlips))
364 alnsGroupedBySeqName = itertools.groupby(alns, itemgetter(0))
365 seqNamesToLists = dict((k, list(v)) for k, v in alnsGroupedBySeqName)
366 g.sort(key=functools.partial(alignmentKey, seqNamesToLists))
367 return [j for i in g for j in i]
369 def allSortedRanges(opts, alignments, alignmentsB,
370 seqRanges1, seqRangesB1, seqRanges2, seqRangesB2):
371 o1, oB1 = twoValuesFromOption(opts.strands1, ":")
372 o2, oB2 = twoValuesFromOption(opts.strands2, ":")
373 if o1 == "1" and o2 == "1":
374 raise RuntimeError("the strand options have circular dependency")
375 seqRanges1 = list(rangesWithStrandInfo(seqRanges1, o1, alignments, 0))
376 seqRanges2 = list(rangesWithStrandInfo(seqRanges2, o2, alignments, 1))
377 seqRangesB1 = list(rangesWithStrandInfo(seqRangesB1, oB1, alignmentsB, 0))
378 seqRangesB2 = list(rangesWithStrandInfo(seqRangesB2, oB2, alignmentsB, 1))
380 o1, oB1 = twoValuesFromOption(opts.sort1, ":")
381 o2, oB2 = twoValuesFromOption(opts.sort2, ":")
382 if o1 == "3" and o2 == "3":
383 raise RuntimeError("the sort options have circular dependency")
385 s1 = mySortedRanges(seqRanges1, o1, None, None, None)
387 s2 = mySortedRanges(seqRanges2, o2, None, None, None)
389 s1 = mySortedRanges(seqRanges1, o1, 0, alignments, s2)
391 s2 = mySortedRanges(seqRanges2, o2, 1, alignments, s1)
392 t1 = mySortedRanges(seqRangesB1, oB1, 0, alignmentsB, s2)
393 t2 = mySortedRanges(seqRangesB2, oB2, 1, alignmentsB, s1)
394 return s1 + t1, s2 + t2
396 def sizesPerText(texts, font, textDraw):
399 if textDraw is not None:
400 sizes = textDraw.textsize(t, font=font)
407 groups.append(t[-3:])
409 return ",".join(reversed(groups))
412 suffixes = "bp", "kb", "Mb", "Gb"
413 for i, x in enumerate(suffixes):
416 return "%.2g" % (1.0 * size / j) + x
417 if size < j * 1000 or i == len(suffixes) - 1:
418 return "%.0f" % (1.0 * size / j) + x
420 def labelText(seqRange, labelOpt):
421 seqName, beg, end, strandNum = seqRange
423 return seqName + ": " + sizeText(end - beg)
425 return seqName + ":" + prettyNum(beg) + ": " + sizeText(end - beg)
427 return seqName + ":" + prettyNum(beg) + "-" + prettyNum(end)
430 def rangeLabels(seqRanges, labelOpt, font, textDraw, textRot):
433 text = labelText(r, labelOpt)
434 if textDraw is not None:
435 x, y = textDraw.textsize(text, font=font)
438 yield text, x, y, r[3]
440 def dataFromRanges(sortedRanges, font, textDraw, labelOpt, textRot):
441 for seqName, rangeBeg, rangeEnd, strandNum in sortedRanges:
442 out = [seqName, str(rangeBeg), str(rangeEnd)]
444 out.append(".+-"[strandNum])
445 logging.info("\t".join(out))
447 rangeSizes = [e - b for n, b, e, s in sortedRanges]
448 labs = list(rangeLabels(sortedRanges, labelOpt, font, textDraw, textRot))
449 margin = max(i[2] for i in labs)
450 # xxx the margin may be too big, because some labels may get omitted
451 return rangeSizes, labs, margin
454 '''Return x / y rounded up.'''
458 def get_bp_per_pix(rangeSizes, pixTweenRanges, maxPixels):
459 '''Get the minimum bp-per-pixel that fits in the size limit.'''
460 logging.info("choosing bp per pixel...")
461 numOfRanges = len(rangeSizes)
462 maxPixelsInRanges = maxPixels - pixTweenRanges * (numOfRanges - 1)
463 if maxPixelsInRanges < numOfRanges:
464 raise RuntimeError("can't fit the image: too many sequences?")
465 negLimit = -maxPixelsInRanges
466 negBpPerPix = sum(rangeSizes) // negLimit
468 if sum(i // negBpPerPix for i in rangeSizes) >= negLimit:
472 def getRangePixBegs(rangePixLens, pixTweenRanges, margin):
473 '''Get the start pixel for each range.'''
475 pix_tot = margin - pixTweenRanges
476 for i in rangePixLens:
477 pix_tot += pixTweenRanges
478 rangePixBegs.append(pix_tot)
482 def pixelData(rangeSizes, bp_per_pix, pixTweenRanges, margin):
483 '''Return pixel information about the ranges.'''
484 rangePixLens = [div_ceil(i, bp_per_pix) for i in rangeSizes]
485 rangePixBegs = getRangePixBegs(rangePixLens, pixTweenRanges, margin)
486 tot_pix = rangePixBegs[-1] + rangePixLens[-1]
487 return rangePixBegs, rangePixLens, tot_pix
489 def drawLineForward(hits, width, bp_per_pix, beg1, beg2, size):
491 q1, r1 = divmod(beg1, bp_per_pix)
492 q2, r2 = divmod(beg2, bp_per_pix)
493 hits[q2 * width + q1] |= 1
494 next_pix = min(bp_per_pix - r1, bp_per_pix - r2)
495 if next_pix >= size: break
500 def drawLineReverse(hits, width, bp_per_pix, beg1, beg2, size):
502 q1, r1 = divmod(beg1, bp_per_pix)
503 q2, r2 = divmod(beg2, bp_per_pix)
504 hits[q2 * width + q1] |= 2
505 next_pix = min(bp_per_pix - r1, r2 + 1)
506 if next_pix >= size: break
511 def strandAndOrigin(ranges, beg, size):
512 isReverseStrand = (beg < 0)
515 for rangeBeg, rangeEnd, isReverseRange, origin in ranges:
516 if rangeEnd > beg: # assumes the ranges are sorted
517 return (isReverseStrand != isReverseRange), origin
519 def alignmentPixels(width, height, alignments, bp_per_pix,
520 rangeDict1, rangeDict2):
521 hits = [0] * (width * height) # the image data
522 for seq1, seq2, blocks in alignments:
523 beg1, beg2, size = blocks[0]
524 isReverse1, ori1 = strandAndOrigin(rangeDict1[seq1], beg1, size)
525 isReverse2, ori2 = strandAndOrigin(rangeDict2[seq2], beg2, size)
526 for beg1, beg2, size in blocks:
528 beg1 = -(beg1 + size)
529 beg2 = -(beg2 + size)
530 if isReverse1 == isReverse2:
531 drawLineForward(hits, width, bp_per_pix,
532 ori1 + beg1, ori2 + beg2, size)
534 drawLineReverse(hits, width, bp_per_pix,
535 ori1 + beg1, ori2 - beg2 - 1, size)
538 def orientedBlocks(alignments, seqIndex):
539 otherIndex = 1 - seqIndex
541 seq1, seq2, blocks = a
545 b = -(beg1 + size), -(beg2 + size), size
546 yield a[seqIndex], b[seqIndex], a[otherIndex], b[otherIndex], size
548 def drawJoins(im, alignments, bpPerPix, seqIndex, rangeDict1, rangeDict2):
549 blocks = orientedBlocks(alignments, seqIndex)
551 for seq1, beg1, seq2, beg2, size in sorted(blocks):
552 isReverse1, ori1 = strandAndOrigin(rangeDict1[seq1], beg1, size)
553 isReverse2, ori2 = strandAndOrigin(rangeDict2[seq2], beg2, size)
554 end1 = beg1 + size - 1
555 end2 = beg2 + size - 1
562 newPix1 = (ori1 + beg1) // bpPerPix
563 newPix2 = (ori2 + beg2) // bpPerPix
565 lowerPix2 = min(oldPix2, newPix2)
566 upperPix2 = max(oldPix2, newPix2)
567 midPix1 = (oldPix1 + newPix1) // 2
569 midPix1 = (oldPix1 + newPix1 + 1) // 2
570 oldPix1, newPix1 = newPix1, oldPix1
571 if upperPix2 - lowerPix2 > 1 and oldPix1 <= newPix1 <= oldPix1 + 1:
573 box = midPix1, lowerPix2, midPix1 + 1, upperPix2 + 1
575 box = lowerPix2, midPix1, upperPix2 + 1, midPix1 + 1
576 im.paste("lightgray", box)
577 oldPix1 = (ori1 + end1) // bpPerPix
578 oldPix2 = (ori2 + end2) // bpPerPix
581 def expandedSeqDict(seqDict):
582 '''Allow lookup by short sequence names, e.g. chr7 as well as hg19.chr7.'''
583 newDict = seqDict.copy()
584 for name, x in seqDict.items():
586 base = name.split(".")[-1]
587 if base in newDict: # an ambiguous case was found:
588 return seqDict # so give up completely
592 def readBed(fileName, rangeDict):
593 for line in myOpen(fileName):
596 if not w[1].isdigit():
599 if seqName not in rangeDict: continue
600 seqRanges = rangeDict[seqName]
603 if all(beg >= i[2] or end <= i[1] for i in seqRanges):
605 itemName = w[3] if len(w) > 3 and w[3] != "." else ""
612 if len(w) > 8 and w[8].count(",") == 2:
613 color = "rgb(" + w[8] + ")"
616 isRev = (seqRanges[0][3] > 1)
617 if strand == "+" and not isRev or strand == "-" and isRev:
619 if strand == "-" and not isRev or strand == "+" and isRev:
621 yield layer, color, seqName, beg, end, itemName
623 def commaSeparatedInts(text):
624 return map(int, text.rstrip(",").split(","))
626 def readGenePred(opts, fileName, rangeDict):
627 for line in myOpen(fileName):
628 fields = line.split() # xxx tab ???
629 if not fields: continue
630 geneName = fields[12 if len(fields) > 12 else 0] # XXX ???
631 if fields[2] not in "+-":
634 if seqName not in rangeDict: continue
635 seqRanges = rangeDict[seqName]
637 cdsBeg = int(fields[5])
638 cdsEnd = int(fields[6])
639 exonBegs = commaSeparatedInts(fields[8])
640 exonEnds = commaSeparatedInts(fields[9])
641 for beg, end in zip(exonBegs, exonEnds):
642 if all(beg >= i[2] or end <= i[1] for i in seqRanges):
644 yield 300, opts.exon_color, seqName, beg, end, geneName
647 if b < e: yield 400, opts.cds_color, seqName, b, e, ""
649 def readRmsk(fileName, rangeDict):
650 for line in myOpen(fileName):
651 fields = line.split()
652 if len(fields) == 17: # rmsk.txt
654 if seqName not in rangeDict: continue # do this ASAP for speed
658 repeatName = fields[10]
659 repeatClass = fields[11]
660 elif len(fields) == 15: # .out
662 if seqName not in rangeDict: continue
663 beg = int(fields[5]) - 1
666 repeatName = fields[9]
667 repeatClass = fields[10].split("/")[0]
670 seqRanges = rangeDict[seqName]
671 if all(beg >= i[2] or end <= i[1] for i in seqRanges):
673 if repeatClass in ("Low_complexity", "Simple_repeat", "Satellite"):
674 yield 200, "#fbf", seqName, beg, end, repeatName
675 elif (strand == "+") != (seqRanges[0][3] > 1):
676 yield 100, "#ffe8e8", seqName, beg, end, repeatName
678 yield 100, "#e8e8ff", seqName, beg, end, repeatName
680 def isExtraFirstGapField(fields):
681 return fields[4].isdigit()
683 def readGaps(opts, fileName, rangeDict):
684 '''Read locations of unsequenced gaps, from an agp or gap file.'''
685 for line in myOpen(fileName):
687 if not w or w[0][0] == "#": continue
688 if isExtraFirstGapField(w): w = w[1:]
689 if w[4] not in "NU": continue
691 if seqName not in rangeDict: continue
693 beg = end - int(w[5]) # zero-based coordinate
695 yield 3000, opts.bridged_color, seqName, beg, end, ""
697 yield 2000, opts.unbridged_color, seqName, beg, end, ""
699 def bedBoxes(annots, rangeDict, limit, isTop, bpPerPix):
700 beds, textSizes, margin = annots
701 cover = [(limit, limit)]
702 for layer, color, seqName, bedBeg, bedEnd, name in reversed(beds):
703 textWidth, textHeight = textSizes[name]
704 for rangeBeg, rangeEnd, isReverseRange, origin in rangeDict[seqName]:
705 beg = max(bedBeg, rangeBeg)
706 end = min(bedEnd, rangeEnd)
707 if beg >= end: continue
709 beg, end = -end, -beg
711 # include partly-covered pixels
712 pixBeg = (origin + beg) // bpPerPix
713 pixEnd = div_ceil(origin + end, bpPerPix)
715 # exclude partly-covered pixels
716 pixBeg = div_ceil(origin + beg, bpPerPix)
717 pixEnd = (origin + end) // bpPerPix
718 if pixEnd <= pixBeg: continue
719 if bedEnd >= rangeEnd: # include partly-covered end pixels
721 pixBeg = (origin + beg) // bpPerPix
723 pixEnd = div_ceil(origin + end, bpPerPix)
724 nameBeg = (pixBeg + pixEnd - textHeight) // 2
725 nameEnd = nameBeg + textHeight
727 if name and all(e <= nameBeg or b >= nameEnd for b, e in cover):
728 if textWidth <= margin:
729 cover.append((nameBeg, nameEnd))
731 yield layer, color, isTop, pixBeg, pixEnd, n, nameBeg, textWidth
733 def drawAnnotations(im, boxes, tMargin, bMarginBeg, lMargin, rMarginBeg):
734 # xxx use partial transparency for different-color overlaps?
735 for layer, color, isTop, beg, end, name, nameBeg, nameLen in boxes:
737 box = beg, tMargin, end, bMarginBeg
739 box = lMargin, beg, rMarginBeg, end
742 def placedLabels(labels, rangePixBegs, rangePixLens, beg, end):
743 '''Return axis labels with endpoint & sort-order information.'''
745 for i, j, k in zip(labels, rangePixBegs, rangePixLens):
746 text, textWidth, textHeight, strandNum = i
747 if textWidth > maxWidth:
749 labelBeg = j + (k - textWidth) // 2
750 labelEnd = labelBeg + textWidth
751 sortKey = textWidth - k
753 sortKey += maxWidth * (beg - labelBeg)
755 labelEnd = beg + textWidth
757 sortKey += maxWidth * (labelEnd - end)
759 labelBeg = end - textWidth
760 yield sortKey, labelBeg, labelEnd, text, textHeight, strandNum
762 def nonoverlappingLabels(labels, minPixTweenLabels):
763 '''Get a subset of non-overlapping axis labels, greedily.'''
766 beg = i[1] - minPixTweenLabels
767 end = i[2] + minPixTweenLabels
768 if all(j[2] <= beg or j[1] >= end for j in out):
772 def axisImage(labels, rangePixBegs, rangePixLens, textRot,
773 textAln, font, image_mode, opts):
774 '''Make an image of axis labels.'''
775 beg = rangePixBegs[0]
776 end = rangePixBegs[-1] + rangePixLens[-1]
777 margin = max(i[2] for i in labels)
778 labels = sorted(placedLabels(labels, rangePixBegs, rangePixLens, beg, end))
779 minPixTweenLabels = 0 if textRot else opts.label_space
780 labels = nonoverlappingLabels(labels, minPixTweenLabels)
781 image_size = (margin, end) if textRot else (end, margin)
782 im = Image.new(image_mode, image_size, opts.margin_color)
783 draw = ImageDraw.Draw(im)
784 for sortKey, labelBeg, labelEnd, text, textHeight, strandNum in labels:
785 base = margin - textHeight if textAln else 0
786 position = (base, labelBeg) if textRot else (labelBeg, base)
787 fill = ("black", opts.forwardcolor, opts.reversecolor)[strandNum]
788 draw.text(position, text, font=font, fill=fill)
791 def annoTextImage(opts, image_mode, font, margin, length, boxes, isLeftAlign):
792 image_size = margin, length
793 im = Image.new(image_mode, image_size, opts.margin_color)
794 draw = ImageDraw.Draw(im)
795 for layer, color, isTop, beg, end, name, nameBeg, nameLen in boxes:
796 xPosition = 0 if isLeftAlign else margin - nameLen
797 position = xPosition, nameBeg
798 draw.text(position, name, font=font, fill="black")
801 def rangesPerSeq(sortedRanges):
802 for seqName, group in itertools.groupby(sortedRanges, itemgetter(0)):
803 yield seqName, sorted(group)
805 def rangesWithOrigins(sortedRanges, rangePixBegs, rangePixLens, bpPerPix):
806 for i, j, k in zip(sortedRanges, rangePixBegs, rangePixLens):
807 seqName, rangeBeg, rangeEnd, strandNum = i
808 isReverseRange = (strandNum > 1)
810 origin = bpPerPix * (j + k) + rangeBeg
812 origin = bpPerPix * j - rangeBeg
813 yield seqName, (rangeBeg, rangeEnd, isReverseRange, origin)
815 def rangesAndOriginsPerSeq(sortedRanges, rangePixBegs, rangePixLens, bpPerPix):
816 a = rangesWithOrigins(sortedRanges, rangePixBegs, rangePixLens, bpPerPix)
817 for seqName, group in itertools.groupby(a, itemgetter(0)):
818 yield seqName, sorted(i[1] for i in group)
822 return ImageFont.truetype(opts.fontfile, opts.fontsize)
825 x = ["fc-match", "-f%{file}", "arial"]
826 p = subprocess.Popen(x, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
827 universal_newlines=True)
828 out, err = p.communicate()
829 fileNames.append(out)
831 logging.info("fc-match error: " + str(e))
832 fileNames.append("/Library/Fonts/Arial.ttf") # for Mac
835 font = ImageFont.truetype(i, opts.fontsize)
836 logging.info("font: " + i)
839 logging.info("font load error: " + str(e))
840 return ImageFont.load_default()
842 def sequenceSizesAndNames(seqRanges):
843 for seqName, ranges in itertools.groupby(seqRanges, itemgetter(0)):
844 size = sum(e - b for n, b, e in ranges)
847 def biggestSequences(seqRanges, maxNumOfSequences):
848 s = sorted(sequenceSizesAndNames(seqRanges), reverse=True)
849 if len(s) > maxNumOfSequences:
850 logging.warning("too many sequences - discarding the smallest ones")
851 s = s[:maxNumOfSequences]
852 return set(i[1] for i in s)
854 def remainingSequenceRanges(seqRanges, alignments, seqIndex):
855 remainingSequences = set(i[seqIndex] for i in alignments)
856 return [i for i in seqRanges if i[0] in remainingSequences]
858 def readAnnotations(opts, font, textDraw, sortedRanges, totalLength,
859 bedFile, repFile, geneFile, gapFile):
860 rangeDict = expandedSeqDict(dict(rangesPerSeq(sortedRanges)))
861 annots = sorted(itertools.chain(readBed(bedFile, rangeDict),
862 readRmsk(repFile, rangeDict),
863 readGenePred(opts, geneFile, rangeDict),
864 readGaps(opts, gapFile, rangeDict)))
865 names = set(i[5] for i in annots)
866 textSizes = dict(sizesPerText(names, font, textDraw))
867 maxTextLength = totalLength // 2
868 okLengths = [i[0] for i in textSizes.values() if i[0] <= maxTextLength]
869 margin = max(okLengths) if okLengths else 0
870 return annots, textSizes, margin
872 def lastDotplot(opts, args):
873 logLevel = logging.INFO if opts.verbose else logging.WARNING
874 logging.basicConfig(format="%(filename)s: %(message)s", level=logLevel)
878 forward_color = ImageColor.getcolor(opts.forwardcolor, image_mode)
879 reverse_color = ImageColor.getcolor(opts.reversecolor, image_mode)
880 zipped_colors = zip(forward_color, reverse_color)
881 overlap_color = tuple([(i + j) // 2 for i, j in zipped_colors])
883 maxGap1, maxGapB1 = twoValuesFromOption(opts.max_gap1, ":")
884 maxGap2, maxGapB2 = twoValuesFromOption(opts.max_gap2, ":")
886 logging.info("reading alignments...")
887 alnData = readAlignments(args[0], opts)
888 alignments, seqRanges1, coverDict1, seqRanges2, coverDict2 = alnData
889 if not alignments: raise RuntimeError("there are no alignments")
890 logging.info("cutting...")
891 coverDict1 = dict(mergedRangesPerSeq(coverDict1))
892 coverDict2 = dict(mergedRangesPerSeq(coverDict2))
893 minAlignedBases = min(coveredLength(coverDict1), coveredLength(coverDict2))
894 pad = int(opts.pad * minAlignedBases)
895 cutRanges1 = list(trimmed(seqRanges1, coverDict1, minAlignedBases,
897 cutRanges2 = list(trimmed(seqRanges2, coverDict2, minAlignedBases,
900 biggestSeqs1 = biggestSequences(cutRanges1, opts.maxseqs)
901 cutRanges1 = [i for i in cutRanges1 if i[0] in biggestSeqs1]
902 alignments = [i for i in alignments if i[0] in biggestSeqs1]
903 cutRanges2 = remainingSequenceRanges(cutRanges2, alignments, 1)
905 biggestSeqs2 = biggestSequences(cutRanges2, opts.maxseqs)
906 cutRanges2 = [i for i in cutRanges2 if i[0] in biggestSeqs2]
907 alignments = [i for i in alignments if i[1] in biggestSeqs2]
908 cutRanges1 = remainingSequenceRanges(cutRanges1, alignments, 0)
910 logging.info("reading secondary alignments...")
911 alnDataB = readSecondaryAlignments(opts, cutRanges1, cutRanges2)
912 alignmentsB, seqRangesB1, coverDictB1, seqRangesB2, coverDictB2 = alnDataB
913 logging.info("cutting...")
914 coverDictB1 = dict(mergedRangesPerSeq(coverDictB1))
915 coverDictB2 = dict(mergedRangesPerSeq(coverDictB2))
916 cutRangesB1 = trimmed(seqRangesB1, coverDictB1, minAlignedBases,
918 cutRangesB2 = trimmed(seqRangesB2, coverDictB2, minAlignedBases,
921 logging.info("sorting...")
922 sortOut = allSortedRanges(opts, alignments, alignmentsB,
923 cutRanges1, cutRangesB1, cutRanges2, cutRangesB2)
924 sortedRanges1, sortedRanges2 = sortOut
928 textDraw = ImageDraw.Draw(Image.new(image_mode, (1, 1)))
930 textRot1 = "vertical".startswith(opts.rot1)
931 i1 = dataFromRanges(sortedRanges1, font, textDraw, opts.labels1, textRot1)
932 rangeSizes1, labelData1, tMargin = i1
934 textRot2 = "horizontal".startswith(opts.rot2)
935 i2 = dataFromRanges(sortedRanges2, font, textDraw, opts.labels2, textRot2)
936 rangeSizes2, labelData2, lMargin = i2
938 logging.info("reading annotations...")
940 annots1 = readAnnotations(opts, font, textDraw, sortedRanges1, opts.height,
941 opts.bed1, opts.rmsk1, opts.genePred1, opts.gap1)
942 bMargin = annots1[-1]
944 annots2 = readAnnotations(opts, font, textDraw, sortedRanges2, opts.width,
945 opts.bed2, opts.rmsk2, opts.genePred2, opts.gap2)
946 rMargin = annots2[-1]
948 maxPixels1 = opts.width - lMargin - rMargin
949 maxPixels2 = opts.height - tMargin - bMargin
950 bpPerPix1 = get_bp_per_pix(rangeSizes1, opts.border_pixels, maxPixels1)
951 bpPerPix2 = get_bp_per_pix(rangeSizes2, opts.border_pixels, maxPixels2)
952 bpPerPix = max(bpPerPix1, bpPerPix2)
953 logging.info("bp per pixel = " + str(bpPerPix))
955 p1 = pixelData(rangeSizes1, bpPerPix, opts.border_pixels, lMargin)
956 rangePixBegs1, rangePixLens1, rMarginBeg = p1
957 width = rMarginBeg + rMargin
958 rangeDict1 = dict(rangesAndOriginsPerSeq(sortedRanges1, rangePixBegs1,
959 rangePixLens1, bpPerPix))
961 p2 = pixelData(rangeSizes2, bpPerPix, opts.border_pixels, tMargin)
962 rangePixBegs2, rangePixLens2, bMarginBeg = p2
963 height = bMarginBeg + bMargin
964 rangeDict2 = dict(rangesAndOriginsPerSeq(sortedRanges2, rangePixBegs2,
965 rangePixLens2, bpPerPix))
967 logging.info("width: " + str(width))
968 logging.info("height: " + str(height))
970 logging.info("processing alignments...")
971 allAlignments = alignments + alignmentsB
972 hits = alignmentPixels(width, height, allAlignments, bpPerPix,
973 rangeDict1, rangeDict2)
975 rangeDict1 = expandedSeqDict(rangeDict1)
976 rangeDict2 = expandedSeqDict(rangeDict2)
978 boxes1 = list(bedBoxes(annots1, rangeDict1, rMarginBeg, True, bpPerPix))
979 boxes2 = list(bedBoxes(annots2, rangeDict2, bMarginBeg, False, bpPerPix))
980 boxes = sorted(itertools.chain(boxes1, boxes2))
982 logging.info("drawing...")
984 image_size = width, height
985 im = Image.new(image_mode, image_size, opts.background_color)
987 drawAnnotations(im, boxes, tMargin, bMarginBeg, lMargin, rMarginBeg)
989 joinA, joinB = twoValuesFromOption(opts.join, ":")
991 drawJoins(im, alignments, bpPerPix, 0, rangeDict1, rangeDict2)
993 drawJoins(im, alignmentsB, bpPerPix, 0, rangeDict1, rangeDict2)
995 drawJoins(im, alignments, bpPerPix, 1, rangeDict2, rangeDict1)
997 drawJoins(im, alignmentsB, bpPerPix, 1, rangeDict2, rangeDict1)
999 for i in range(height):
1000 for j in range(width):
1001 store_value = hits[i * width + j]
1003 if store_value == 1: im.putpixel(xy, forward_color)
1004 elif store_value == 2: im.putpixel(xy, reverse_color)
1005 elif store_value == 3: im.putpixel(xy, overlap_color)
1007 if opts.fontsize != 0:
1008 axis1 = axisImage(labelData1, rangePixBegs1, rangePixLens1,
1009 textRot1, False, font, image_mode, opts)
1011 axis1 = axis1.transpose(Image.ROTATE_90)
1012 axis2 = axisImage(labelData2, rangePixBegs2, rangePixLens2,
1013 textRot2, textRot2, font, image_mode, opts)
1015 axis2 = axis2.transpose(Image.ROTATE_270)
1016 im.paste(axis1, (0, 0))
1017 im.paste(axis2, (0, 0))
1019 annoImage1 = annoTextImage(opts, image_mode, font, bMargin, width,
1021 annoImage1 = annoImage1.transpose(Image.ROTATE_90)
1022 annoImage2 = annoTextImage(opts, image_mode, font, rMargin, height,
1024 im.paste(annoImage1, (0, bMarginBeg))
1025 im.paste(annoImage2, (rMarginBeg, 0))
1027 for i in rangePixBegs1[1:]:
1028 box = i - opts.border_pixels, tMargin, i, bMarginBeg
1029 im.paste(opts.border_color, box)
1031 for i in rangePixBegs2[1:]:
1032 box = lMargin, i - opts.border_pixels, rMarginBeg, i
1033 im.paste(opts.border_color, box)
1037 if __name__ == "__main__":
1038 usage = """%prog --help
1039 or: %prog [options] maf-or-tab-alignments dotplot.png
1040 or: %prog [options] maf-or-tab-alignments dotplot.gif
1042 description = "Draw a dotplot of pair-wise sequence alignments in MAF or tabular format."
1043 op = optparse.OptionParser(usage=usage, description=description)
1044 op.add_option("-v", "--verbose", action="count",
1045 help="show progress messages & data about the plot")
1046 # Replace "width" & "height" with a single "length" option?
1047 op.add_option("-x", "--width", metavar="INT", type="int", default=1000,
1048 help="maximum width in pixels (default: %default)")
1049 op.add_option("-y", "--height", metavar="INT", type="int", default=1000,
1050 help="maximum height in pixels (default: %default)")
1051 op.add_option("-m", "--maxseqs", type="int", default=100, metavar="M",
1052 help="maximum number of horizontal or vertical sequences "
1053 "(default=%default)")
1054 op.add_option("-1", "--seq1", metavar="PATTERN", action="append",
1056 help="which sequences to show from the 1st genome")
1057 op.add_option("-2", "--seq2", metavar="PATTERN", action="append",
1059 help="which sequences to show from the 2nd genome")
1060 op.add_option("-c", "--forwardcolor", metavar="COLOR", default="red",
1061 help="color for forward alignments (default: %default)")
1062 op.add_option("-r", "--reversecolor", metavar="COLOR", default="blue",
1063 help="color for reverse alignments (default: %default)")
1064 op.add_option("--alignments", metavar="FILE", help="secondary alignments")
1065 op.add_option("--sort1", default="1", metavar="N",
1066 help="genome1 sequence order: 0=input order, 1=name order, "
1067 "2=length order, 3=alignment order (default=%default)")
1068 op.add_option("--sort2", default="1", metavar="N",
1069 help="genome2 sequence order: 0=input order, 1=name order, "
1070 "2=length order, 3=alignment order (default=%default)")
1071 op.add_option("--strands1", default="0", metavar="N", help=
1072 "genome1 sequence orientation: 0=forward orientation, "
1073 "1=alignment orientation (default=%default)")
1074 op.add_option("--strands2", default="0", metavar="N", help=
1075 "genome2 sequence orientation: 0=forward orientation, "
1076 "1=alignment orientation (default=%default)")
1077 op.add_option("--max-gap1", metavar="FRAC", default="0.5,2", help=
1078 "maximum unaligned (end,mid) gap in genome1: "
1079 "fraction of aligned length (default=%default)")
1080 op.add_option("--max-gap2", metavar="FRAC", default="0.5,2", help=
1081 "maximum unaligned (end,mid) gap in genome2: "
1082 "fraction of aligned length (default=%default)")
1083 op.add_option("--pad", metavar="FRAC", type="float", default=0.04, help=
1084 "pad length when cutting unaligned gaps: "
1085 "fraction of aligned length (default=%default)")
1086 op.add_option("-j", "--join", default="0", metavar="N", help=
1087 "join: 0=nothing, 1=alignments adjacent in genome1, "
1088 "2=alignments adjacent in genome2 (default=%default)")
1089 op.add_option("--border-pixels", metavar="INT", type="int", default=1,
1090 help="number of pixels between sequences (default=%default)")
1091 op.add_option("--border-color", metavar="COLOR", default="black",
1092 help="color for pixels between sequences (default=%default)")
1093 # --break-color and/or --break-pixels for intra-sequence breaks?
1094 op.add_option("--margin-color", metavar="COLOR", default="#dcdcdc",
1095 help="margin color")
1097 og = optparse.OptionGroup(op, "Text options")
1098 og.add_option("-f", "--fontfile", metavar="FILE",
1099 help="TrueType or OpenType font file")
1100 og.add_option("-s", "--fontsize", metavar="SIZE", type="int", default=14,
1101 help="TrueType or OpenType font size (default: %default)")
1102 og.add_option("--labels1", type="int", default=0, metavar="N", help=
1103 "genome1 labels: 0=name, 1=name:length, "
1104 "2=name:start:length, 3=name:start-end (default=%default)")
1105 og.add_option("--labels2", type="int", default=0, metavar="N", help=
1106 "genome2 labels: 0=name, 1=name:length, "
1107 "2=name:start:length, 3=name:start-end (default=%default)")
1108 og.add_option("--rot1", metavar="ROT", default="h",
1109 help="text rotation for the 1st genome (default=%default)")
1110 og.add_option("--rot2", metavar="ROT", default="v",
1111 help="text rotation for the 2nd genome (default=%default)")
1112 op.add_option_group(og)
1114 og = optparse.OptionGroup(op, "Annotation options")
1115 og.add_option("--bed1", metavar="FILE",
1116 help="read genome1 annotations from BED file")
1117 og.add_option("--bed2", metavar="FILE",
1118 help="read genome2 annotations from BED file")
1119 og.add_option("--rmsk1", metavar="FILE", help="read genome1 repeats from "
1120 "RepeatMasker .out or rmsk.txt file")
1121 og.add_option("--rmsk2", metavar="FILE", help="read genome2 repeats from "
1122 "RepeatMasker .out or rmsk.txt file")
1123 op.add_option_group(og)
1125 og = optparse.OptionGroup(op, "Gene options")
1126 og.add_option("--genePred1", metavar="FILE",
1127 help="read genome1 genes from genePred file")
1128 og.add_option("--genePred2", metavar="FILE",
1129 help="read genome2 genes from genePred file")
1130 og.add_option("--exon-color", metavar="COLOR", default="PaleGreen",
1131 help="color for exons (default=%default)")
1132 og.add_option("--cds-color", metavar="COLOR", default="LimeGreen",
1133 help="color for protein-coding regions (default=%default)")
1134 op.add_option_group(og)
1136 og = optparse.OptionGroup(op, "Unsequenced gap options")
1137 og.add_option("--gap1", metavar="FILE",
1138 help="read genome1 unsequenced gaps from agp or gap file")
1139 og.add_option("--gap2", metavar="FILE",
1140 help="read genome2 unsequenced gaps from agp or gap file")
1141 og.add_option("--bridged-color", metavar="COLOR", default="yellow",
1142 help="color for bridged gaps (default: %default)")
1143 og.add_option("--unbridged-color", metavar="COLOR", default="orange",
1144 help="color for unbridged gaps (default: %default)")
1145 op.add_option_group(og)
1146 (opts, args) = op.parse_args()
1147 if len(args) != 2: op.error("2 arguments needed")
1149 opts.background_color = "white"
1150 opts.label_space = 5 # minimum number of pixels between axis labels
1152 try: lastDotplot(opts, args)
1153 except KeyboardInterrupt: pass # avoid silly error message
1154 except RuntimeError as e:
1155 prog = os.path.basename(sys.argv[0])
1156 sys.exit(prog + ": error: " + str(e))