Martin@1: #! /usr/bin/env python Martin@1: Martin@272: # Read pair-wise alignments in MAF or LAST tabular format: write an Martin@272: # "Oxford grid", a.k.a. dotplot. Martin@1: Martin@1: # TODO: Currently, pixels with zero aligned nt-pairs are white, and Martin@1: # pixels with one or more aligned nt-pairs are black. This can look Martin@1: # too crowded for large genome alignments. I tried shading each pixel Martin@1: # according to the number of aligned nt-pairs within it, but the Martin@1: # result is too faint. How can this be done better? Martin@1: Martin@914: import functools Martin@875: import gzip Martin@896: from fnmatch import fnmatchcase Martin@906: from operator import itemgetter Martin@903: import subprocess Martin@896: import itertools, optparse, os, re, sys Martin@475: Martin@475: # Try to make PIL/PILLOW work: Martin@475: try: from PIL import Image, ImageDraw, ImageFont, ImageColor Martin@475: except ImportError: import Image, ImageDraw, ImageFont, ImageColor Martin@1: Martin@844: def myOpen(fileName): # faster than fileinput Martin@904: if fileName is None: Martin@904: return [] Martin@844: if fileName == "-": Martin@844: return sys.stdin Martin@875: if fileName.endswith(".gz"): Martin@875: return gzip.open(fileName) Martin@844: return open(fileName) Martin@844: Martin@644: def warn(message): Martin@866: if opts.verbose: Martin@866: prog = os.path.basename(sys.argv[0]) Martin@866: sys.stderr.write(prog + ": " + message + "\n") Martin@644: Martin@911: def groupByFirstItem(things): Martin@911: for k, v in itertools.groupby(things, itemgetter(0)): Martin@911: yield k, [i[1:] for i in v] Martin@911: Martin@908: def croppedBlocks(blocks, ranges1, ranges2): Martin@908: headBeg1, headBeg2, headSize = blocks[0] Martin@908: for r1 in ranges1: Martin@908: for r2 in ranges2: Martin@908: cropBeg1, cropEnd1 = r1 Martin@908: if headBeg1 < 0: Martin@908: cropBeg1, cropEnd1 = -cropEnd1, -cropBeg1 Martin@908: cropBeg2, cropEnd2 = r2 Martin@908: if headBeg2 < 0: Martin@908: cropBeg2, cropEnd2 = -cropEnd2, -cropBeg2 Martin@908: for beg1, beg2, size in blocks: Martin@908: b1 = max(cropBeg1, beg1) Martin@908: e1 = min(cropEnd1, beg1 + size) Martin@908: if b1 >= e1: continue Martin@908: offset = beg2 - beg1 Martin@908: b2 = max(cropBeg2, b1 + offset) Martin@908: e2 = min(cropEnd2, e1 + offset) Martin@908: if b2 >= e2: continue Martin@908: yield b2 - offset, b2, e2 - b2 Martin@840: Martin@482: def tabBlocks(beg1, beg2, blocks): Martin@482: '''Get the gapless blocks of an alignment, from LAST tabular format.''' Martin@482: for i in blocks.split(","): Martin@482: if ":" in i: Martin@482: x, y = i.split(":") Martin@482: beg1 += int(x) Martin@482: beg2 += int(y) Martin@482: else: Martin@482: size = int(i) Martin@482: yield beg1, beg2, size Martin@482: beg1 += size Martin@482: beg2 += size Martin@272: Martin@482: def mafBlocks(beg1, beg2, seq1, seq2): Martin@482: '''Get the gapless blocks of an alignment, from MAF format.''' Martin@482: size = 0 Martin@482: for x, y in itertools.izip(seq1, seq2): Martin@482: if x == "-": Martin@482: if size: Martin@482: yield beg1, beg2, size Martin@482: beg1 += size Martin@482: beg2 += size Martin@482: size = 0 Martin@482: beg2 += 1 Martin@482: elif y == "-": Martin@482: if size: Martin@482: yield beg1, beg2, size Martin@482: beg1 += size Martin@482: beg2 += size Martin@482: size = 0 Martin@482: beg1 += 1 Martin@272: else: Martin@482: size += 1 Martin@482: if size: yield beg1, beg2, size Martin@272: Martin@482: def alignmentInput(lines): Martin@482: '''Get alignments and sequence lengths, from MAF or tabular format.''' Martin@482: mafCount = 0 Martin@272: for line in lines: Martin@272: w = line.split() Martin@272: if line[0].isdigit(): # tabular format Martin@482: chr1, beg1, seqlen1 = w[1], int(w[2]), int(w[5]) Martin@482: if w[4] == "-": beg1 -= seqlen1 Martin@482: chr2, beg2, seqlen2 = w[6], int(w[7]), int(w[10]) Martin@482: if w[9] == "-": beg2 -= seqlen2 Martin@847: blocks = tabBlocks(beg1, beg2, w[11]) Martin@482: yield chr1, seqlen1, chr2, seqlen2, blocks Martin@272: elif line[0] == "s": # MAF format Martin@482: if mafCount == 0: Martin@482: chr1, beg1, seqlen1, seq1 = w[1], int(w[2]), int(w[5]), w[6] Martin@482: if w[4] == "-": beg1 -= seqlen1 Martin@482: mafCount = 1 Martin@482: else: Martin@482: chr2, beg2, seqlen2, seq2 = w[1], int(w[2]), int(w[5]), w[6] Martin@482: if w[4] == "-": beg2 -= seqlen2 Martin@847: blocks = mafBlocks(beg1, beg2, seq1, seq2) Martin@482: yield chr1, seqlen1, chr2, seqlen2, blocks Martin@482: mafCount = 0 Martin@272: Martin@897: def seqRequestFromText(text): Martin@840: if ":" in text: Martin@840: pattern, interval = text.rsplit(":", 1) Martin@840: if "-" in interval: Martin@840: beg, end = interval.rsplit("-", 1) Martin@840: return pattern, int(beg), int(end) # beg may be negative Martin@840: return text, 0, sys.maxsize Martin@840: Martin@909: def rangesFromSeqName(seqRequests, name, seqLen): Martin@908: if seqRequests: Martin@908: base = name.split(".")[-1] # allow for names like hg19.chr7 Martin@908: for pat, beg, end in seqRequests: Martin@908: if fnmatchcase(name, pat) or fnmatchcase(base, pat): Martin@909: yield max(beg, 0), min(end, seqLen) Martin@908: else: Martin@909: yield 0, seqLen Martin@651: Martin@909: def updateSeqs(coverDict, seqRanges, seqName, ranges, coveredRange): Martin@909: beg, end = coveredRange Martin@909: if beg < 0: Martin@909: coveredRange = -end, -beg Martin@909: if seqName in coverDict: Martin@909: coverDict[seqName].append(coveredRange) Martin@839: else: Martin@909: coverDict[seqName] = [coveredRange] Martin@909: for beg, end in ranges: Martin@909: r = seqName, beg, end Martin@909: seqRanges.append(r) Martin@839: Martin@651: def readAlignments(fileName, opts): Martin@839: '''Get alignments and sequence limits, from MAF or tabular format.''' Martin@897: seqRequests1 = map(seqRequestFromText, opts.seq1) Martin@897: seqRequests2 = map(seqRequestFromText, opts.seq2) Martin@840: Martin@482: alignments = [] Martin@909: seqRanges1 = [] Martin@909: seqRanges2 = [] Martin@909: coverDict1 = {} Martin@909: coverDict2 = {} Martin@844: lines = myOpen(fileName) Martin@838: for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines): Martin@909: ranges1 = sorted(rangesFromSeqName(seqRequests1, seqName1, seqLen1)) Martin@909: if not ranges1: continue Martin@909: ranges2 = sorted(rangesFromSeqName(seqRequests2, seqName2, seqLen2)) Martin@909: if not ranges2: continue Martin@909: b = list(croppedBlocks(list(blocks), ranges1, ranges2)) Martin@840: if not b: continue Martin@840: aln = seqName1, seqName2, b Martin@482: alignments.append(aln) Martin@909: coveredRange1 = b[0][0], b[-1][0] + b[-1][2] Martin@909: updateSeqs(coverDict1, seqRanges1, seqName1, ranges1, coveredRange1) Martin@909: coveredRange2 = b[0][1], b[-1][1] + b[-1][2] Martin@909: updateSeqs(coverDict2, seqRanges2, seqName2, ranges2, coveredRange2) Martin@909: return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2 Martin@1: Martin@911: def nameAndRangesFromDict(cropDict, seqName): Martin@911: if seqName in cropDict: Martin@911: return seqName, cropDict[seqName] Martin@911: n = seqName.split(".")[-1] Martin@911: if n in cropDict: Martin@911: return n, cropDict[n] Martin@911: return seqName, [] Martin@911: Martin@911: def rangesForSecondaryAlignments(primaryRanges, seqLen): Martin@911: if primaryRanges: Martin@911: return primaryRanges Martin@911: return [(0, seqLen)] Martin@911: Martin@911: def readSecondaryAlignments(opts, cropRanges1, cropRanges2): Martin@911: cropDict1 = dict(groupByFirstItem(cropRanges1)) Martin@911: cropDict2 = dict(groupByFirstItem(cropRanges2)) Martin@911: Martin@911: alignments = [] Martin@911: seqRanges1 = [] Martin@911: seqRanges2 = [] Martin@911: coverDict1 = {} Martin@911: coverDict2 = {} Martin@911: lines = myOpen(opts.alignments) Martin@911: for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines): Martin@911: seqName1, ranges1 = nameAndRangesFromDict(cropDict1, seqName1) Martin@911: seqName2, ranges2 = nameAndRangesFromDict(cropDict2, seqName2) Martin@911: if not ranges1 and not ranges2: Martin@911: continue Martin@911: r1 = rangesForSecondaryAlignments(ranges1, seqLen1) Martin@911: r2 = rangesForSecondaryAlignments(ranges2, seqLen2) Martin@911: b = list(croppedBlocks(list(blocks), r1, r2)) Martin@911: if not b: continue Martin@911: aln = seqName1, seqName2, b Martin@911: alignments.append(aln) Martin@911: if not ranges1: Martin@911: coveredRange1 = b[0][0], b[-1][0] + b[-1][2] Martin@911: updateSeqs(coverDict1, seqRanges1, seqName1, r1, coveredRange1) Martin@911: if not ranges2: Martin@911: coveredRange2 = b[0][1], b[-1][1] + b[-1][2] Martin@911: updateSeqs(coverDict2, seqRanges2, seqName2, r2, coveredRange2) Martin@911: return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2 Martin@911: Martin@910: def twoValuesFromOption(text, separator): Martin@910: if separator in text: Martin@910: return text.split(separator) Martin@910: return text, text Martin@910: Martin@910: def mergedRanges(ranges): Martin@910: oldBeg, maxEnd = ranges[0] Martin@910: for beg, end in ranges: Martin@910: if beg > maxEnd: Martin@910: yield oldBeg, maxEnd Martin@910: oldBeg = beg Martin@910: maxEnd = end Martin@910: elif end > maxEnd: Martin@910: maxEnd = end Martin@910: yield oldBeg, maxEnd Martin@910: Martin@910: def mergedRangesPerSeq(coverDict): Martin@910: for k, v in coverDict.iteritems(): Martin@910: v.sort() Martin@910: yield k, list(mergedRanges(v)) Martin@910: Martin@910: def coveredLength(mergedCoverDict): Martin@910: return sum(sum(e - b for b, e in v) for v in mergedCoverDict.itervalues()) Martin@910: Martin@910: def trimmed(seqRanges, coverDict, minAlignedBases, maxGapFrac, endPad, midPad): Martin@910: maxEndGapFrac, maxMidGapFrac = twoValuesFromOption(maxGapFrac, ",") Martin@910: maxEndGap = max(float(maxEndGapFrac) * minAlignedBases, endPad * 1.0) Martin@910: maxMidGap = max(float(maxMidGapFrac) * minAlignedBases, midPad * 2.0) Martin@910: Martin@910: for seqName, rangeBeg, rangeEnd in seqRanges: Martin@910: seqBlocks = coverDict[seqName] Martin@910: blocks = [i for i in seqBlocks if i[0] < rangeEnd and i[1] > rangeBeg] Martin@910: if blocks[0][0] - rangeBeg > maxEndGap: Martin@910: rangeBeg = blocks[0][0] - endPad Martin@910: for j, y in enumerate(blocks): Martin@910: if j: Martin@910: x = blocks[j - 1] Martin@910: if y[0] - x[1] > maxMidGap: Martin@910: yield seqName, rangeBeg, x[1] + midPad Martin@910: rangeBeg = y[0] - midPad Martin@910: if rangeEnd - blocks[-1][1] > maxEndGap: Martin@910: rangeEnd = blocks[-1][1] + endPad Martin@910: yield seqName, rangeBeg, rangeEnd Martin@910: Martin@1: def natural_sort_key(my_string): Martin@1: '''Return a sort key for "natural" ordering, e.g. chr9 < chr10.''' Martin@1: parts = re.split(r'(\d+)', my_string) Martin@1: parts[1::2] = map(int, parts[1::2]) Martin@1: return parts Martin@1: Martin@907: def nameKey(oneSeqRanges): Martin@907: return natural_sort_key(oneSeqRanges[0][0]) Martin@907: Martin@907: def sizeKey(oneSeqRanges): Martin@907: return sum(b - e for n, b, e in oneSeqRanges), nameKey(oneSeqRanges) Martin@907: Martin@914: def alignmentKey(seqNamesToLists, oneSeqRanges): Martin@914: seqName = oneSeqRanges[0][0] Martin@914: alignmentsOfThisSequence = seqNamesToLists[seqName] Martin@914: numOfAlignedLetterPairs = sum(i[3] for i in alignmentsOfThisSequence) Martin@914: toMiddle = numOfAlignedLetterPairs // 2 Martin@914: for i in alignmentsOfThisSequence: Martin@914: toMiddle -= i[3] Martin@914: if toMiddle < 0: Martin@914: return i[1:3] # sequence-rank and "position" of this alignment Martin@914: Martin@914: def alignmentSortData1(alignments, seqNamesToRanks2): Martin@914: for seqName1, seqName2, blocks in alignments: Martin@914: seqRank2 = seqNamesToRanks2[seqName2] Martin@914: pos2 = abs(blocks[0][1] + blocks[-1][1] + blocks[-1][2]) Martin@914: numOfAlignedLetterPairs = sum(i[2] for i in blocks) Martin@914: yield seqName1, seqRank2, pos2, numOfAlignedLetterPairs Martin@914: Martin@914: def alignmentSortData2(alignments, seqNamesToRanks1): Martin@914: for seqName1, seqName2, blocks in alignments: Martin@914: seqRank1 = seqNamesToRanks1[seqName1] Martin@914: pos1 = abs(blocks[0][0] + blocks[-1][0] + blocks[-1][2]) Martin@914: numOfAlignedLetterPairs = sum(i[2] for i in blocks) Martin@914: yield seqName2, seqRank1, pos1, numOfAlignedLetterPairs Martin@914: Martin@914: def mySortedRanges(seqRanges, sortOpt, Martin@914: alignmentSortDataFunc, alignments, otherRanges): Martin@913: rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0)) Martin@913: g = [list(ranges) for seqName, ranges in rangesGroupedBySeqName] Martin@907: if sortOpt == "1": Martin@907: g.sort(key=nameKey) Martin@907: if sortOpt == "2": Martin@907: g.sort(key=sizeKey) Martin@914: if sortOpt == "3": Martin@914: otherNameGroups = itertools.groupby(i[0] for i in otherRanges) Martin@914: ranksAndNames = enumerate(i[0] for i in otherNameGroups) Martin@914: otherNamesToRanks = dict((n, r) for r, n in ranksAndNames) Martin@914: alns = sorted(alignmentSortDataFunc(alignments, otherNamesToRanks)) Martin@914: alnsGroupedBySeqName = itertools.groupby(alns, itemgetter(0)) Martin@914: seqNamesToLists = dict((k, list(v)) for k, v in alnsGroupedBySeqName) Martin@914: g.sort(key=functools.partial(alignmentKey, seqNamesToLists)) Martin@907: return [j for i in g for j in i] Martin@907: Martin@914: def allSortedRanges(opts, alignments, alignmentsB, Martin@914: seqRanges1, seqRangesB1, seqRanges2, seqRangesB2): Martin@914: o1, oB1 = twoValuesFromOption(opts.sort1, ":") Martin@914: o2, oB2 = twoValuesFromOption(opts.sort2, ":") Martin@914: if o1 == "3" and o2 == "3": Martin@914: raise Exception("the sort options have circular dependency") Martin@914: if o1 != "3": Martin@914: s1 = mySortedRanges(seqRanges1, o1, None, None, None) Martin@914: if o2 != "3": Martin@914: s2 = mySortedRanges(seqRanges2, o2, None, None, None) Martin@914: if o1 == "3": Martin@914: s1 = mySortedRanges(seqRanges1, o1, alignmentSortData1, alignments, s2) Martin@914: if o2 == "3": Martin@914: s2 = mySortedRanges(seqRanges2, o2, alignmentSortData2, alignments, s1) Martin@914: t1 = mySortedRanges(seqRangesB1, oB1, alignmentSortData1, alignmentsB, s2) Martin@914: t2 = mySortedRanges(seqRangesB2, oB2, alignmentSortData2, alignmentsB, s1) Martin@914: return s1 + t1, s2 + t2 Martin@911: Martin@898: def prettyNum(n): Martin@898: t = str(n) Martin@898: groups = [] Martin@898: while t: Martin@898: groups.append(t[-3:]) Martin@898: t = t[:-3] Martin@904: return ",".join(reversed(groups)) Martin@898: Martin@846: def sizeText(size): Martin@846: suffixes = "bp", "kb", "Mb", "Gb" Martin@846: for i, x in enumerate(suffixes): Martin@846: j = 10 ** (i * 3) Martin@846: if size < j * 10: Martin@846: return "%.2g" % (1.0 * size / j) + x Martin@846: if size < j * 1000 or i == len(suffixes) - 1: Martin@846: return "%.0f" % (1.0 * size / j) + x Martin@846: Martin@898: def labelText(seqRange, labelOpt): Martin@898: seqName, beg, end = seqRange Martin@898: if labelOpt == 1: Martin@898: return seqName + ": " + sizeText(end - beg) Martin@898: if labelOpt == 2: Martin@904: return seqName + ":" + prettyNum(beg) + ": " + sizeText(end - beg) Martin@898: if labelOpt == 3: Martin@904: return seqName + ":" + prettyNum(beg) + "-" + prettyNum(end) Martin@898: return seqName Martin@846: Martin@899: def rangeLabels(seqRanges, labelOpt, font, fontsize, image_mode, textRot): Martin@899: if fontsize: Martin@899: image_size = 1, 1 Martin@899: im = Image.new(image_mode, image_size) Martin@899: draw = ImageDraw.Draw(im) Martin@899: x = y = 0 Martin@899: for r in seqRanges: Martin@899: text = labelText(r, labelOpt) Martin@899: if fontsize: Martin@899: x, y = draw.textsize(text, font=font) Martin@899: if textRot: Martin@899: x, y = y, x Martin@899: yield text, x, y Martin@899: Martin@914: def dataFromRanges(sortedRanges, font, fontSize, imageMode, labelOpt, textRot): Martin@913: for i in sortedRanges: Martin@907: warn("\t".join(map(str, i))) Martin@866: warn("") Martin@913: rangeSizes = [e - b for n, b, e in sortedRanges] Martin@913: labs = list(rangeLabels(sortedRanges, labelOpt, font, fontSize, Martin@913: imageMode, textRot)) Martin@907: margin = max(i[2] for i in labs) Martin@878: # xxx the margin may be too big, because some labels may get omitted Martin@914: return rangeSizes, labs, margin Martin@1: Martin@1: def div_ceil(x, y): Martin@1: '''Return x / y rounded up.''' Martin@1: q, r = divmod(x, y) Martin@1: return q + (r != 0) Martin@1: Martin@896: def get_bp_per_pix(rangeSizes, pixTweenRanges, maxPixels): Martin@1: '''Get the minimum bp-per-pixel that fits in the size limit.''' Martin@904: warn("choosing bp per pixel...") Martin@896: numOfRanges = len(rangeSizes) Martin@896: maxPixelsInRanges = maxPixels - pixTweenRanges * (numOfRanges - 1) Martin@896: if maxPixelsInRanges < numOfRanges: Martin@649: raise Exception("can't fit the image: too many sequences?") Martin@896: negLimit = -maxPixelsInRanges Martin@896: negBpPerPix = sum(rangeSizes) // negLimit Martin@863: while True: Martin@896: if sum(i // negBpPerPix for i in rangeSizes) >= negLimit: Martin@863: return -negBpPerPix Martin@863: negBpPerPix -= 1 Martin@1: Martin@900: def getRangePixBegs(rangePixLens, pixTweenRanges, margin): Martin@900: '''Get the start pixel for each range.''' Martin@900: rangePixBegs = [] Martin@896: pix_tot = margin - pixTweenRanges Martin@900: for i in rangePixLens: Martin@896: pix_tot += pixTweenRanges Martin@900: rangePixBegs.append(pix_tot) Martin@1: pix_tot += i Martin@900: return rangePixBegs Martin@1: Martin@896: def pixelData(rangeSizes, bp_per_pix, pixTweenRanges, margin): Martin@900: '''Return pixel information about the ranges.''' Martin@900: rangePixLens = [div_ceil(i, bp_per_pix) for i in rangeSizes] Martin@900: rangePixBegs = getRangePixBegs(rangePixLens, pixTweenRanges, margin) Martin@900: tot_pix = rangePixBegs[-1] + rangePixLens[-1] Martin@900: return rangePixBegs, rangePixLens, tot_pix Martin@1: Martin@835: def drawLineForward(hits, width, bp_per_pix, beg1, beg2, size): Martin@639: while True: Martin@639: q1, r1 = divmod(beg1, bp_per_pix) Martin@639: q2, r2 = divmod(beg2, bp_per_pix) Martin@835: hits[q2 * width + q1] |= 1 Martin@639: next_pix = min(bp_per_pix - r1, bp_per_pix - r2) Martin@639: if next_pix >= size: break Martin@639: beg1 += next_pix Martin@639: beg2 += next_pix Martin@639: size -= next_pix Martin@639: Martin@835: def drawLineReverse(hits, width, bp_per_pix, beg1, beg2, size): Martin@639: beg2 = -1 - beg2 Martin@639: while True: Martin@639: q1, r1 = divmod(beg1, bp_per_pix) Martin@639: q2, r2 = divmod(beg2, bp_per_pix) Martin@835: hits[q2 * width + q1] |= 2 Martin@639: next_pix = min(bp_per_pix - r1, r2 + 1) Martin@639: if next_pix >= size: break Martin@639: beg1 += next_pix Martin@639: beg2 -= next_pix Martin@639: size -= next_pix Martin@639: Martin@905: def findOrigin(ranges, beg, size): Martin@905: if beg < 0: Martin@905: beg = -(beg + size) Martin@905: for rangeBeg, rangeEnd, origin in ranges: Martin@905: if rangeEnd > beg: Martin@905: return origin Martin@905: Martin@905: def alignmentPixels(width, height, alignments, bp_per_pix, Martin@905: rangeDict1, rangeDict2): Martin@640: hits = [0] * (width * height) # the image data Martin@640: for seq1, seq2, blocks in alignments: Martin@905: beg1, beg2, size = blocks[0] Martin@905: ori1 = findOrigin(rangeDict1[seq1], beg1, size) Martin@905: ori2 = findOrigin(rangeDict2[seq2], beg2, size) Martin@640: for beg1, beg2, size in blocks: Martin@640: if beg1 < 0: Martin@640: beg1 = -(beg1 + size) Martin@640: beg2 = -(beg2 + size) Martin@640: if beg2 >= 0: Martin@835: drawLineForward(hits, width, bp_per_pix, Martin@835: beg1 + ori1, beg2 + ori2, size) Martin@640: else: Martin@835: drawLineReverse(hits, width, bp_per_pix, Martin@835: beg1 + ori1, beg2 - ori2, size) Martin@640: return hits Martin@1: Martin@650: def expandedSeqDict(seqDict): Martin@650: '''Allow lookup by short sequence names, e.g. chr7 as well as hg19.chr7.''' Martin@861: newDict = seqDict.copy() Martin@650: for name, x in seqDict.items(): Martin@861: if "." in name: Martin@861: base = name.split(".")[-1] Martin@861: if base in newDict: # an ambiguous case was found: Martin@861: return seqDict # so give up completely Martin@861: newDict[base] = x Martin@650: return newDict Martin@650: Martin@905: def readBed(fileName, rangeDict): Martin@845: for line in myOpen(fileName): Martin@845: w = line.split() Martin@847: if not w: continue Martin@845: seqName = w[0] Martin@905: if seqName not in rangeDict: continue Martin@845: beg = int(w[1]) Martin@845: end = int(w[2]) Martin@857: layer = 900 Martin@895: color = "#fbf" Martin@858: if len(w) > 4: Martin@858: if w[4] != ".": Martin@858: layer = float(w[4]) Martin@858: if len(w) > 5: Martin@858: if len(w) > 8 and w[8].count(",") == 2: Martin@858: color = "rgb(" + w[8] + ")" Martin@858: elif w[5] == "+": Martin@895: color = "#ffe8e8" Martin@858: elif w[5] == "-": Martin@895: color = "#e8e8ff" Martin@859: yield layer, color, seqName, beg, end Martin@845: Martin@860: def commaSeparatedInts(text): Martin@860: return map(int, text.rstrip(",").split(",")) Martin@860: Martin@905: def readGenePred(opts, fileName, rangeDict): Martin@860: for line in myOpen(fileName): Martin@860: fields = line.split() Martin@860: if not fields: continue Martin@860: if fields[2] not in "+-": fields = fields[1:] Martin@860: seqName = fields[1] Martin@905: if seqName not in rangeDict: continue Martin@860: #strand = fields[2] Martin@860: cdsBeg = int(fields[5]) Martin@860: cdsEnd = int(fields[6]) Martin@860: exonBegs = commaSeparatedInts(fields[8]) Martin@860: exonEnds = commaSeparatedInts(fields[9]) Martin@860: for beg, end in zip(exonBegs, exonEnds): Martin@860: yield 300, opts.exon_color, seqName, beg, end Martin@860: b = max(beg, cdsBeg) Martin@860: e = min(end, cdsEnd) Martin@860: if b < e: yield 400, opts.cds_color, seqName, b, e Martin@860: Martin@905: def readRmsk(fileName, rangeDict): Martin@860: for line in myOpen(fileName): Martin@860: fields = line.split() Martin@860: if len(fields) == 17: # rmsk.txt Martin@860: seqName = fields[5] Martin@905: if seqName not in rangeDict: continue # do this ASAP for speed Martin@860: beg = int(fields[6]) Martin@860: end = int(fields[7]) Martin@860: strand = fields[9] Martin@860: repeatClass = fields[11] Martin@860: elif len(fields) == 15: # .out Martin@860: seqName = fields[4] Martin@905: if seqName not in rangeDict: continue Martin@860: beg = int(fields[5]) - 1 Martin@860: end = int(fields[6]) Martin@860: strand = fields[8] Martin@860: repeatClass = fields[10] Martin@860: else: Martin@860: continue Martin@860: if repeatClass in ("Low_complexity", "Simple_repeat"): Martin@895: yield 200, "#fbf", seqName, beg, end Martin@860: elif strand == "+": Martin@895: yield 100, "#ffe8e8", seqName, beg, end Martin@860: else: Martin@895: yield 100, "#e8e8ff", seqName, beg, end Martin@860: Martin@650: def isExtraFirstGapField(fields): Martin@650: return fields[4].isdigit() Martin@650: Martin@905: def readGaps(opts, fileName, rangeDict): Martin@650: '''Read locations of unsequenced gaps, from an agp or gap file.''' Martin@844: for line in myOpen(fileName): Martin@650: w = line.split() Martin@650: if not w or w[0][0] == "#": continue Martin@650: if isExtraFirstGapField(w): w = w[1:] Martin@650: if w[4] not in "NU": continue Martin@650: seqName = w[0] Martin@905: if seqName not in rangeDict: continue Martin@650: end = int(w[2]) Martin@650: beg = end - int(w[5]) # zero-based coordinate Martin@857: if w[7] == "yes": Martin@859: yield 3000, opts.bridged_color, seqName, beg, end Martin@857: else: Martin@859: yield 2000, opts.unbridged_color, seqName, beg, end Martin@650: Martin@905: def bedBoxes(beds, rangeDict, margin, edge, isTop, bpPerPix): Martin@897: for layer, color, seqName, bedBeg, bedEnd in beds: Martin@905: for rangeBeg, rangeEnd, origin in rangeDict[seqName]: Martin@905: beg = max(bedBeg, rangeBeg) Martin@905: end = min(bedEnd, rangeEnd) Martin@905: if beg >= end: continue Martin@905: if layer <= 1000: Martin@905: # include partly-covered pixels Martin@905: b = (origin + beg) // bpPerPix Martin@905: e = div_ceil(origin + end, bpPerPix) Martin@905: else: Martin@905: # exclude partly-covered pixels Martin@905: b = div_ceil(origin + beg, bpPerPix) Martin@905: e = (origin + end) // bpPerPix Martin@905: if e <= b: continue Martin@912: if end == rangeEnd: # include partly-covered end pixels Martin@912: e = div_ceil(origin + end, bpPerPix) Martin@905: if isTop: Martin@905: box = b, margin, e, edge Martin@905: else: Martin@905: box = margin, b, edge, e Martin@905: yield layer, color, box Martin@845: Martin@857: def drawAnnotations(im, boxes): Martin@857: # xxx use partial transparency for different-color overlaps? Martin@857: for layer, color, box in boxes: Martin@650: im.paste(color, box) Martin@650: Martin@901: def placedLabels(labels, rangePixBegs, rangePixLens, beg, end): Martin@901: '''Return axis labels with endpoint & sort-order information.''' Martin@901: maxWidth = end - beg Martin@901: for i, j, k in zip(labels, rangePixBegs, rangePixLens): Martin@901: text, textWidth, textHeight = i Martin@901: if textWidth > maxWidth: Martin@901: continue Martin@901: labelBeg = j + (k - textWidth) // 2 Martin@901: labelEnd = labelBeg + textWidth Martin@901: sortKey = textWidth - k Martin@901: if labelBeg < beg: Martin@904: sortKey += maxWidth * (beg - labelBeg) Martin@901: labelBeg = beg Martin@901: labelEnd = beg + textWidth Martin@901: if labelEnd > end: Martin@904: sortKey += maxWidth * (labelEnd - end) Martin@901: labelEnd = end Martin@901: labelBeg = end - textWidth Martin@901: yield sortKey, labelBeg, labelEnd, text, textHeight Martin@1: Martin@897: def nonoverlappingLabels(labels, minPixTweenLabels): Martin@1: '''Get a subset of non-overlapping axis labels, greedily.''' Martin@897: out = [] Martin@1: for i in labels: Martin@897: beg = i[1] - minPixTweenLabels Martin@897: end = i[2] + minPixTweenLabels Martin@897: if all(j[2] <= beg or j[1] >= end for j in out): Martin@897: out.append(i) Martin@897: return out Martin@1: Martin@901: def axisImage(labels, rangePixBegs, rangePixLens, textRot, Martin@895: textAln, font, image_mode, opts): Martin@1: '''Make an image of axis labels.''' Martin@900: beg = rangePixBegs[0] Martin@900: end = rangePixBegs[-1] + rangePixLens[-1] Martin@901: margin = max(i[2] for i in labels) Martin@901: labels = sorted(placedLabels(labels, rangePixBegs, rangePixLens, beg, end)) Martin@878: minPixTweenLabels = 0 if textRot else opts.label_space Martin@897: labels = nonoverlappingLabels(labels, minPixTweenLabels) Martin@897: image_size = (margin, end) if textRot else (end, margin) Martin@895: im = Image.new(image_mode, image_size, opts.margin_color) Martin@1: draw = ImageDraw.Draw(im) Martin@1: for i in labels: Martin@878: base = margin - i[4] if textAln else 0 Martin@878: position = (base, i[1]) if textRot else (i[1], base) Martin@646: draw.text(position, i[3], font=font, fill=opts.text_color) Martin@1: return im Martin@1: Martin@906: def rangesWithOrigins(sortedRanges, rangePixBegs, bpPerPix): Martin@906: for i, j in zip(sortedRanges, rangePixBegs): Martin@906: seqName, rangeBeg, rangeEnd = i Martin@906: origin = bpPerPix * j - rangeBeg Martin@906: yield seqName, (rangeBeg, rangeEnd, origin) Martin@906: Martin@906: def rangesPerSeq(sortedRanges, rangePixBegs, bpPerPix): Martin@906: a = rangesWithOrigins(sortedRanges, rangePixBegs, bpPerPix) Martin@906: for k, v in itertools.groupby(a, itemgetter(0)): Martin@906: yield k, [i[1] for i in v] Martin@836: Martin@903: def getFont(opts): Martin@903: if opts.fontfile: Martin@903: return ImageFont.truetype(opts.fontfile, opts.fontsize) Martin@903: fileNames = [] Martin@903: try: Martin@903: x = ["fc-match", "-f%{file}", "arial"] Martin@903: p = subprocess.Popen(x, stdout=subprocess.PIPE, stderr=subprocess.PIPE) Martin@903: out, err = p.communicate() Martin@903: fileNames.append(out) Martin@903: except OSError as e: Martin@903: warn("fc-match error: " + str(e)) Martin@903: fileNames.append("/Library/Fonts/Arial.ttf") # for Mac Martin@903: for i in fileNames: Martin@903: try: Martin@903: font = ImageFont.truetype(i, opts.fontsize) Martin@903: warn("font: " + i) Martin@903: return font Martin@903: except IOError as e: Martin@903: warn("font load error: " + str(e)) Martin@903: return ImageFont.load_default() Martin@903: Martin@648: def lastDotplot(opts, args): Martin@903: font = getFont(opts) Martin@643: image_mode = 'RGB' Martin@643: forward_color = ImageColor.getcolor(opts.forwardcolor, image_mode) Martin@643: reverse_color = ImageColor.getcolor(opts.reversecolor, image_mode) Martin@643: zipped_colors = zip(forward_color, reverse_color) Martin@643: overlap_color = tuple([(i + j) // 2 for i, j in zipped_colors]) Martin@641: Martin@911: maxGap1, maxGapB1 = twoValuesFromOption(opts.max_gap1, ":") Martin@911: maxGap2, maxGapB2 = twoValuesFromOption(opts.max_gap2, ":") Martin@911: Martin@644: warn("reading alignments...") Martin@904: alnData = readAlignments(args[0], opts) Martin@909: alignments, seqRanges1, coverDict1, seqRanges2, coverDict2 = alnData Martin@649: if not alignments: raise Exception("there are no alignments") Martin@910: warn("cutting...") Martin@910: coverDict1 = dict(mergedRangesPerSeq(coverDict1)) Martin@910: coverDict2 = dict(mergedRangesPerSeq(coverDict2)) Martin@910: minAlignedBases = min(coveredLength(coverDict1), coveredLength(coverDict2)) Martin@910: pad = int(opts.pad * minAlignedBases) Martin@910: cutRanges1 = list(trimmed(seqRanges1, coverDict1, minAlignedBases, Martin@911: maxGap1, pad, pad)) Martin@910: cutRanges2 = list(trimmed(seqRanges2, coverDict2, minAlignedBases, Martin@911: maxGap2, pad, pad)) Martin@911: Martin@911: warn("reading secondary alignments...") Martin@911: alnDataB = readSecondaryAlignments(opts, cutRanges1, cutRanges2) Martin@911: alignmentsB, seqRangesB1, coverDictB1, seqRangesB2, coverDictB2 = alnDataB Martin@911: warn("cutting...") Martin@911: coverDictB1 = dict(mergedRangesPerSeq(coverDictB1)) Martin@911: coverDictB2 = dict(mergedRangesPerSeq(coverDictB2)) Martin@911: cutRangesB1 = trimmed(seqRangesB1, coverDictB1, minAlignedBases, Martin@911: maxGapB1, 0, 0) Martin@911: cutRangesB2 = trimmed(seqRangesB2, coverDictB2, minAlignedBases, Martin@911: maxGapB2, 0, 0) Martin@641: Martin@914: warn("sorting...") Martin@914: sortOut = allSortedRanges(opts, alignments, alignmentsB, Martin@914: cutRanges1, cutRangesB1, cutRanges2, cutRangesB2) Martin@914: sortedRanges1, sortedRanges2 = sortOut Martin@914: Martin@878: textRot1 = "vertical".startswith(opts.rot1) Martin@914: i1 = dataFromRanges(sortedRanges1, font, Martin@908: opts.fontsize, image_mode, opts.labels1, textRot1) Martin@914: rangeSizes1, labelData1, tMargin = i1 Martin@846: Martin@878: textRot2 = "horizontal".startswith(opts.rot2) Martin@914: i2 = dataFromRanges(sortedRanges2, font, Martin@908: opts.fontsize, image_mode, opts.labels2, textRot2) Martin@914: rangeSizes2, labelData2, lMargin = i2 Martin@641: Martin@896: maxPixels1 = opts.width - lMargin Martin@896: maxPixels2 = opts.height - tMargin Martin@897: bpPerPix1 = get_bp_per_pix(rangeSizes1, opts.border_pixels, maxPixels1) Martin@897: bpPerPix2 = get_bp_per_pix(rangeSizes2, opts.border_pixels, maxPixels2) Martin@855: bpPerPix = max(bpPerPix1, bpPerPix2) Martin@855: warn("bp per pixel = " + str(bpPerPix)) Martin@641: Martin@900: p1 = pixelData(rangeSizes1, bpPerPix, opts.border_pixels, lMargin) Martin@900: rangePixBegs1, rangePixLens1, width = p1 Martin@906: rangeDict1 = dict(rangesPerSeq(sortedRanges1, rangePixBegs1, bpPerPix)) Martin@900: Martin@900: p2 = pixelData(rangeSizes2, bpPerPix, opts.border_pixels, tMargin) Martin@900: rangePixBegs2, rangePixLens2, height = p2 Martin@906: rangeDict2 = dict(rangesPerSeq(sortedRanges2, rangePixBegs2, bpPerPix)) Martin@900: Martin@847: warn("width: " + str(width)) Martin@847: warn("height: " + str(height)) Martin@839: Martin@644: warn("processing alignments...") Martin@911: hits = alignmentPixels(width, height, alignments + alignmentsB, bpPerPix, Martin@905: rangeDict1, rangeDict2) Martin@641: Martin@904: warn("reading annotations...") Martin@134: Martin@905: rangeDict1 = expandedSeqDict(rangeDict1) Martin@905: beds1 = itertools.chain(readBed(opts.bed1, rangeDict1), Martin@905: readRmsk(opts.rmsk1, rangeDict1), Martin@905: readGenePred(opts, opts.genePred1, rangeDict1), Martin@905: readGaps(opts, opts.gap1, rangeDict1)) Martin@905: b1 = bedBoxes(beds1, rangeDict1, tMargin, height, True, bpPerPix) Martin@845: Martin@905: rangeDict2 = expandedSeqDict(rangeDict2) Martin@905: beds2 = itertools.chain(readBed(opts.bed2, rangeDict2), Martin@905: readRmsk(opts.rmsk2, rangeDict2), Martin@905: readGenePred(opts, opts.genePred2, rangeDict2), Martin@905: readGaps(opts, opts.gap2, rangeDict2)) Martin@905: b2 = bedBoxes(beds2, rangeDict2, lMargin, width, False, bpPerPix) Martin@857: Martin@857: boxes = sorted(itertools.chain(b1, b2)) Martin@904: Martin@904: warn("drawing...") Martin@904: Martin@904: image_size = width, height Martin@904: im = Image.new(image_mode, image_size, opts.background_color) Martin@904: Martin@857: drawAnnotations(im, boxes) Martin@650: Martin@643: for i in range(height): Martin@643: for j in range(width): Martin@643: store_value = hits[i * width + j] Martin@643: xy = j, i Martin@643: if store_value == 1: im.putpixel(xy, forward_color) Martin@643: elif store_value == 2: im.putpixel(xy, reverse_color) Martin@643: elif store_value == 3: im.putpixel(xy, overlap_color) Martin@95: Martin@643: if opts.fontsize != 0: Martin@900: axis1 = axisImage(labelData1, rangePixBegs1, rangePixLens1, Martin@895: textRot1, False, font, image_mode, opts) Martin@878: if textRot1: Martin@878: axis1 = axis1.transpose(Image.ROTATE_90) Martin@900: axis2 = axisImage(labelData2, rangePixBegs2, rangePixLens2, Martin@895: textRot2, textRot2, font, image_mode, opts) Martin@878: if not textRot2: Martin@878: axis2 = axis2.transpose(Image.ROTATE_270) Martin@643: im.paste(axis1, (0, 0)) Martin@643: im.paste(axis2, (0, 0)) Martin@1: Martin@900: for i in rangePixBegs1[1:]: Martin@877: box = i - opts.border_pixels, tMargin, i, height Martin@852: im.paste(opts.border_color, box) Martin@1: Martin@900: for i in rangePixBegs2[1:]: Martin@877: box = lMargin, i - opts.border_pixels, width, i Martin@852: im.paste(opts.border_color, box) Martin@1: Martin@643: im.save(args[1]) Martin@648: Martin@648: if __name__ == "__main__": Martin@649: usage = """%prog --help Martin@649: or: %prog [options] maf-or-tab-alignments dotplot.png Martin@649: or: %prog [options] maf-or-tab-alignments dotplot.gif Martin@649: or: ...""" Martin@649: description = "Draw a dotplot of pair-wise sequence alignments in MAF or tabular format." Martin@649: op = optparse.OptionParser(usage=usage, description=description) Martin@866: op.add_option("-v", "--verbose", action="count", Martin@866: help="show progress messages & data about the plot") Martin@651: op.add_option("-1", "--seq1", metavar="PATTERN", action="append", Martin@840: default=[], Martin@651: help="which sequences to show from the 1st genome") Martin@651: op.add_option("-2", "--seq2", metavar="PATTERN", action="append", Martin@840: default=[], Martin@651: help="which sequences to show from the 2nd genome") Martin@648: # Replace "width" & "height" with a single "length" option? Martin@648: op.add_option("-x", "--width", type="int", default=1000, Martin@648: help="maximum width in pixels (default: %default)") Martin@648: op.add_option("-y", "--height", type="int", default=1000, Martin@648: help="maximum height in pixels (default: %default)") Martin@649: op.add_option("-c", "--forwardcolor", metavar="COLOR", default="red", Martin@649: help="color for forward alignments (default: %default)") Martin@649: op.add_option("-r", "--reversecolor", metavar="COLOR", default="blue", Martin@649: help="color for reverse alignments (default: %default)") Martin@911: op.add_option("--alignments", metavar="FILE", help="secondary alignments") Martin@907: op.add_option("--sort1", default="1", metavar="N", Martin@851: help="genome1 sequence order: 0=input order, 1=name order, " Martin@914: "2=length order, 3=alignment order (default=%default)") Martin@907: op.add_option("--sort2", default="1", metavar="N", Martin@851: help="genome2 sequence order: 0=input order, 1=name order, " Martin@914: "2=length order, 3=alignment order (default=%default)") Martin@910: op.add_option("--max-gap1", metavar="FRAC", default="0.5,2", help= Martin@910: "maximum unaligned (end,mid) gap in genome1: " Martin@910: "fraction of aligned length (default=%default)") Martin@910: op.add_option("--max-gap2", metavar="FRAC", default="0.5,2", help= Martin@910: "maximum unaligned (end,mid) gap in genome2: " Martin@910: "fraction of aligned length (default=%default)") Martin@910: op.add_option("--pad", metavar="FRAC", type="float", default=0.04, help= Martin@910: "pad length when cutting unaligned gaps: " Martin@910: "fraction of aligned length (default=%default)") Martin@852: op.add_option("--border-pixels", metavar="INT", type="int", default=1, Martin@852: help="number of pixels between sequences (default=%default)") Martin@895: op.add_option("--border-color", metavar="COLOR", default="black", Martin@852: help="color for pixels between sequences (default=%default)") Martin@911: # --break-color and/or --break-pixels for intra-sequence breaks? Martin@895: op.add_option("--margin-color", metavar="COLOR", default="#dcdcdc", Martin@895: help="margin color") Martin@846: Martin@850: og = optparse.OptionGroup(op, "Text options") Martin@850: og.add_option("-f", "--fontfile", metavar="FILE", Martin@850: help="TrueType or OpenType font file") Martin@903: og.add_option("-s", "--fontsize", metavar="SIZE", type="int", default=14, Martin@850: help="TrueType or OpenType font size (default: %default)") Martin@898: og.add_option("--labels1", type="int", default=0, metavar="N", help= Martin@898: "genome1 labels: 0=name, 1=name:length, " Martin@898: "2=name:start:length, 3=name:start-end (default=%default)") Martin@898: og.add_option("--labels2", type="int", default=0, metavar="N", help= Martin@898: "genome2 labels: 0=name, 1=name:length, " Martin@898: "2=name:start:length, 3=name:start-end (default=%default)") Martin@878: og.add_option("--rot1", metavar="ROT", default="h", Martin@878: help="text rotation for the 1st genome (default=%default)") Martin@878: og.add_option("--rot2", metavar="ROT", default="v", Martin@878: help="text rotation for the 2nd genome (default=%default)") Martin@850: op.add_option_group(og) Martin@850: Martin@860: og = optparse.OptionGroup(op, "Annotation options") Martin@860: og.add_option("--bed1", metavar="FILE", Martin@860: help="read genome1 annotations from BED file") Martin@860: og.add_option("--bed2", metavar="FILE", Martin@860: help="read genome2 annotations from BED file") Martin@860: og.add_option("--rmsk1", metavar="FILE", help="read genome1 repeats from " Martin@860: "RepeatMasker .out or rmsk.txt file") Martin@860: og.add_option("--rmsk2", metavar="FILE", help="read genome2 repeats from " Martin@860: "RepeatMasker .out or rmsk.txt file") Martin@860: op.add_option_group(og) Martin@860: Martin@860: og = optparse.OptionGroup(op, "Gene options") Martin@860: og.add_option("--genePred1", metavar="FILE", Martin@860: help="read genome1 genes from genePred file") Martin@860: og.add_option("--genePred2", metavar="FILE", Martin@860: help="read genome2 genes from genePred file") Martin@895: og.add_option("--exon-color", metavar="COLOR", default="PaleGreen", Martin@860: help="color for exons (default=%default)") Martin@895: og.add_option("--cds-color", metavar="COLOR", default="LimeGreen", Martin@860: help="color for protein-coding regions (default=%default)") Martin@860: op.add_option_group(og) Martin@860: Martin@650: og = optparse.OptionGroup(op, "Unsequenced gap options") Martin@650: og.add_option("--gap1", metavar="FILE", Martin@650: help="read genome1 unsequenced gaps from agp or gap file") Martin@650: og.add_option("--gap2", metavar="FILE", Martin@650: help="read genome2 unsequenced gaps from agp or gap file") Martin@650: og.add_option("--bridged-color", metavar="COLOR", default="yellow", Martin@650: help="color for bridged gaps (default: %default)") Martin@895: og.add_option("--unbridged-color", metavar="COLOR", default="orange", Martin@650: help="color for unbridged gaps (default: %default)") Martin@650: op.add_option_group(og) Martin@648: (opts, args) = op.parse_args() Martin@648: if len(args) != 2: op.error("2 arguments needed") Martin@648: Martin@648: opts.text_color = "black" Martin@648: opts.background_color = "white" Martin@648: opts.label_space = 5 # minimum number of pixels between axis labels Martin@648: Martin@649: try: lastDotplot(opts, args) Martin@649: except KeyboardInterrupt: pass # avoid silly error message Martin@903: except Exception as e: Martin@649: prog = os.path.basename(sys.argv[0]) Martin@649: sys.exit(prog + ": error: " + str(e))