scripts/last-dotplot
author Martin C. Frith
Thu Mar 02 11:31:57 2017 +0900 (2017-03-02)
changeset 840 85a72978fb7d
parent 839 bbc6f00e683b
child 844 23de4eb3be1d
permissions -rwxr-xr-x
Enabled sequence ranges for last-dotplot.
     1 #! /usr/bin/env python
     2 
     3 # Read pair-wise alignments in MAF or LAST tabular format: write an
     4 # "Oxford grid", a.k.a. dotplot.
     5 
     6 # TODO: Currently, pixels with zero aligned nt-pairs are white, and
     7 # pixels with one or more aligned nt-pairs are black.  This can look
     8 # too crowded for large genome alignments.  I tried shading each pixel
     9 # according to the number of aligned nt-pairs within it, but the
    10 # result is too faint.  How can this be done better?
    11 
    12 import fileinput, fnmatch, itertools, optparse, os, re, sys
    13 
    14 # Try to make PIL/PILLOW work:
    15 try: from PIL import Image, ImageDraw, ImageFont, ImageColor
    16 except ImportError: import Image, ImageDraw, ImageFont, ImageColor
    17 
    18 def warn(message):
    19     prog = os.path.basename(sys.argv[0])
    20     sys.stderr.write(prog + ": " + message + "\n")
    21 
    22 def croppedBlocks(blocks, range1, range2):
    23     cropBeg1, cropEnd1 = range1
    24     cropBeg2, cropEnd2 = range2
    25     if blocks[0][0] < 0: cropBeg1, cropEnd1 = -cropEnd1, -cropBeg1
    26     if blocks[0][1] < 0: cropBeg2, cropEnd2 = -cropEnd2, -cropBeg2
    27     for beg1, beg2, size in blocks:
    28         b1 = max(cropBeg1, beg1)
    29         e1 = min(cropEnd1, beg1 + size)
    30         if b1 >= e1: continue
    31         offset = beg2 - beg1
    32         b2 = max(cropBeg2, b1 + offset)
    33         e2 = min(cropEnd2, e1 + offset)
    34         if b2 >= e2: continue
    35         yield b2 - offset, b2, e2 - b2
    36 
    37 def tabBlocks(beg1, beg2, blocks):
    38     '''Get the gapless blocks of an alignment, from LAST tabular format.'''
    39     for i in blocks.split(","):
    40         if ":" in i:
    41             x, y = i.split(":")
    42             beg1 += int(x)
    43             beg2 += int(y)
    44         else:
    45             size = int(i)
    46             yield beg1, beg2, size
    47             beg1 += size
    48             beg2 += size
    49 
    50 def mafBlocks(beg1, beg2, seq1, seq2):
    51     '''Get the gapless blocks of an alignment, from MAF format.'''
    52     size = 0
    53     for x, y in itertools.izip(seq1, seq2):
    54         if x == "-":
    55             if size:
    56                 yield beg1, beg2, size
    57                 beg1 += size
    58                 beg2 += size
    59                 size = 0
    60             beg2 += 1
    61         elif y == "-":
    62             if size:
    63                 yield beg1, beg2, size
    64                 beg1 += size
    65                 beg2 += size
    66                 size = 0
    67             beg1 += 1
    68         else:
    69             size += 1
    70     if size: yield beg1, beg2, size
    71 
    72 def alignmentInput(lines):
    73     '''Get alignments and sequence lengths, from MAF or tabular format.'''
    74     mafCount = 0
    75     for line in lines:
    76         w = line.split()
    77         if line[0].isdigit():  # tabular format
    78             chr1, beg1, seqlen1 = w[1], int(w[2]), int(w[5])
    79             if w[4] == "-": beg1 -= seqlen1
    80             chr2, beg2, seqlen2 = w[6], int(w[7]), int(w[10])
    81             if w[9] == "-": beg2 -= seqlen2
    82             blocks = list(tabBlocks(beg1, beg2, w[11]))
    83             yield chr1, seqlen1, chr2, seqlen2, blocks
    84         elif line[0] == "s":  # MAF format
    85             if mafCount == 0:
    86                 chr1, beg1, seqlen1, seq1 = w[1], int(w[2]), int(w[5]), w[6]
    87                 if w[4] == "-": beg1 -= seqlen1
    88                 mafCount = 1
    89             else:
    90                 chr2, beg2, seqlen2, seq2 = w[1], int(w[2]), int(w[5]), w[6]
    91                 if w[4] == "-": beg2 -= seqlen2
    92                 blocks = list(mafBlocks(beg1, beg2, seq1, seq2))
    93                 yield chr1, seqlen1, chr2, seqlen2, blocks
    94                 mafCount = 0
    95 
    96 def seqRangeFromText(text):
    97     if ":" in text:
    98         pattern, interval = text.rsplit(":", 1)
    99         if "-" in interval:
   100             beg, end = interval.rsplit("-", 1)
   101             return pattern, int(beg), int(end)  # beg may be negative
   102     return text, 0, sys.maxsize
   103 
   104 def rangeFromSeqName(seqRanges, name, seqLen):
   105     if not seqRanges: return 0, seqLen
   106     base = name.split(".")[-1]  # allow for names like hg19.chr7
   107     for pat, beg, end in seqRanges:
   108         if fnmatch.fnmatchcase(name, pat) or fnmatch.fnmatchcase(base, pat):
   109             return max(beg, 0), min(end, seqLen)
   110     return 0, 0
   111 
   112 def updateSeqLimits(isTrim, seqLimits, seqName, seqRange, blocks, index):
   113     if isTrim:
   114         beg = blocks[0][index]
   115         end = blocks[-1][index] + blocks[-1][2]
   116         if beg < 0: beg, end = -end, -beg
   117         if seqName in seqLimits:
   118             b, e = seqLimits[seqName]
   119             seqLimits[seqName] = min(b, beg), max(e, end)
   120         else:
   121             seqLimits[seqName] = beg, end
   122     else:
   123         seqLimits[seqName] = seqRange
   124 
   125 def readAlignments(fileName, opts):
   126     '''Get alignments and sequence limits, from MAF or tabular format.'''
   127     seqRanges1 = map(seqRangeFromText, opts.seq1)
   128     seqRanges2 = map(seqRangeFromText, opts.seq2)
   129 
   130     alignments = []
   131     seqLimits1 = {}
   132     seqLimits2 = {}
   133     lines = fileinput.input(fileName)
   134     for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
   135         range1 = rangeFromSeqName(seqRanges1, seqName1, seqLen1)
   136         range2 = rangeFromSeqName(seqRanges2, seqName2, seqLen2)
   137         b = list(croppedBlocks(blocks, range1, range2))
   138         if not b: continue
   139         aln = seqName1, seqName2, b
   140         alignments.append(aln)
   141         updateSeqLimits(opts.trim1, seqLimits1, seqName1, range1, b, 0)
   142         updateSeqLimits(opts.trim2, seqLimits2, seqName2, range2, b, 1)
   143     return alignments, seqLimits1, seqLimits2
   144 
   145 def natural_sort_key(my_string):
   146     '''Return a sort key for "natural" ordering, e.g. chr9 < chr10.'''
   147     parts = re.split(r'(\d+)', my_string)
   148     parts[1::2] = map(int, parts[1::2])
   149     return parts
   150 
   151 def get_text_sizes(my_strings, font, fontsize, image_mode):
   152     '''Get widths & heights, in pixels, of some strings.'''
   153     if fontsize == 0: return [(0, 0) for i in my_strings]
   154     image_size = 1, 1
   155     im = Image.new(image_mode, image_size)
   156     draw = ImageDraw.Draw(im)
   157     return [draw.textsize(i, font=font) for i in my_strings]
   158 
   159 def get_seq_info(seqLimits, font, fontsize, image_mode):
   160     '''Return miscellaneous information about the sequences.'''
   161     seqNames = seqLimits.keys()
   162     seqNames.sort(key=natural_sort_key)
   163     seq_sizes = [seqLimits[i][1] - seqLimits[i][0] for i in seqNames]
   164     name_sizes = get_text_sizes(seqNames, font, fontsize, image_mode)
   165     margin = max(zip(*name_sizes)[1])  # maximum text height
   166     return seqNames, seq_sizes, name_sizes, margin
   167 
   168 def div_ceil(x, y):
   169     '''Return x / y rounded up.'''
   170     q, r = divmod(x, y)
   171     return q + (r != 0)
   172 
   173 def tot_seq_pix(seq_sizes, bp_per_pix):
   174     '''Return the total pixels needed for sequences of the given sizes.'''
   175     return sum([div_ceil(i, bp_per_pix) for i in seq_sizes])
   176 
   177 def get_bp_per_pix(seq_sizes, pix_tween_seqs, pix_limit):
   178     '''Get the minimum bp-per-pixel that fits in the size limit.'''
   179     seq_num = len(seq_sizes)
   180     seq_pix_limit = pix_limit - pix_tween_seqs * (seq_num - 1)
   181     if seq_pix_limit < seq_num:
   182         raise Exception("can't fit the image: too many sequences?")
   183     lower_bound = div_ceil(sum(seq_sizes), seq_pix_limit)
   184     for bp_per_pix in itertools.count(lower_bound):  # slow linear search
   185         if tot_seq_pix(seq_sizes, bp_per_pix) <= seq_pix_limit: break
   186     return bp_per_pix
   187 
   188 def get_seq_starts(seq_pix, pix_tween_seqs, margin):
   189     '''Get the start pixel for each sequence.'''
   190     seq_starts = []
   191     pix_tot = margin - pix_tween_seqs
   192     for i in seq_pix:
   193         pix_tot += pix_tween_seqs
   194         seq_starts.append(pix_tot)
   195         pix_tot += i
   196     return seq_starts
   197 
   198 def get_pix_info(seq_sizes, bp_per_pix, pix_tween_seqs, margin):
   199     '''Return pixel information about the sequences.'''
   200     seq_pix = [div_ceil(i, bp_per_pix) for i in seq_sizes]
   201     seq_starts = get_seq_starts(seq_pix, pix_tween_seqs, margin)
   202     tot_pix = seq_starts[-1] + seq_pix[-1]
   203     return seq_pix, seq_starts, tot_pix
   204 
   205 def drawLineForward(hits, width, bp_per_pix, beg1, beg2, size):
   206     while True:
   207         q1, r1 = divmod(beg1, bp_per_pix)
   208         q2, r2 = divmod(beg2, bp_per_pix)
   209         hits[q2 * width + q1] |= 1
   210         next_pix = min(bp_per_pix - r1, bp_per_pix - r2)
   211         if next_pix >= size: break
   212         beg1 += next_pix
   213         beg2 += next_pix
   214         size -= next_pix
   215 
   216 def drawLineReverse(hits, width, bp_per_pix, beg1, beg2, size):
   217     beg2 = -1 - beg2
   218     while True:
   219         q1, r1 = divmod(beg1, bp_per_pix)
   220         q2, r2 = divmod(beg2, bp_per_pix)
   221         hits[q2 * width + q1] |= 2
   222         next_pix = min(bp_per_pix - r1, r2 + 1)
   223         if next_pix >= size: break
   224         beg1 += next_pix
   225         beg2 -= next_pix
   226         size -= next_pix
   227 
   228 def alignmentPixels(width, height, alignments, bp_per_pix, origins1, origins2):
   229     hits = [0] * (width * height)  # the image data
   230     for seq1, seq2, blocks in alignments:
   231         ori1 = origins1[seq1]
   232         ori2 = origins2[seq2]
   233         for beg1, beg2, size in blocks:
   234             if beg1 < 0:
   235                 beg1 = -(beg1 + size)
   236                 beg2 = -(beg2 + size)
   237             if beg2 >= 0:
   238                 drawLineForward(hits, width, bp_per_pix,
   239                                 beg1 + ori1, beg2 + ori2, size)
   240             else:
   241                 drawLineReverse(hits, width, bp_per_pix,
   242                                 beg1 + ori1, beg2 - ori2, size)
   243     return hits
   244 
   245 def expandedSeqDict(seqDict):
   246     '''Allow lookup by short sequence names, e.g. chr7 as well as hg19.chr7.'''
   247     newDict = {}
   248     for name, x in seqDict.items():
   249         base = name.split(".")[-1]
   250         newDict[name] = x
   251         newDict[base] = x
   252     return newDict
   253 
   254 def isExtraFirstGapField(fields):
   255     return fields[4].isdigit()
   256 
   257 def readGaps(fileName, seqLimits):
   258     '''Read locations of unsequenced gaps, from an agp or gap file.'''
   259     if not fileName: return
   260     seqLimits = expandedSeqDict(seqLimits)
   261     for line in fileinput.input(fileName):
   262         w = line.split()
   263         if not w or w[0][0] == "#": continue
   264         if isExtraFirstGapField(w): w = w[1:]
   265         if w[4] not in "NU": continue
   266         seqName = w[0]
   267         if seqName not in seqLimits: continue
   268         cropBeg, cropEnd = seqLimits[seqName]
   269         end = int(w[2])
   270         beg = end - int(w[5])  # zero-based coordinate
   271         b = max(beg, cropBeg)
   272         e = min(end, cropEnd)
   273         if b >= e: continue
   274         bridgedText = w[7]
   275         yield seqName, b, e, bridgedText
   276 
   277 def drawUnsequencedGaps(im, gaps, origins, margin, limit, isTop, bridgedText,
   278                         bp_per_pix, color):
   279     '''Draw rectangles representing unsequenced gaps.'''
   280     for seqName, beg, end, b in gaps:
   281         if b != bridgedText: continue
   282         ori = origins[seqName]
   283         b = div_ceil(ori + beg, bp_per_pix)  # use fully-covered pixels only
   284         e = (ori + end) // bp_per_pix
   285         if e <= b: continue
   286         if isTop: box = b, margin, e, limit
   287         else:     box = margin, b, limit, e
   288         im.paste(color, box)
   289 
   290 def make_label(text, text_size, range_start, range_size):
   291     '''Return an axis label with endpoint & sort-order information.'''
   292     text_width  = text_size[0]
   293     label_start = range_start + (range_size - text_width) // 2
   294     label_end   = label_start + text_width
   295     sort_key    = text_width - range_size
   296     return sort_key, label_start, label_end, text
   297 
   298 def get_nonoverlapping_labels(labels, label_space):
   299     '''Get a subset of non-overlapping axis labels, greedily.'''
   300     nonoverlapping_labels = []
   301     for i in labels:
   302         if True not in [i[1] < j[2] + label_space and j[1] < i[2] + label_space
   303                         for j in nonoverlapping_labels]:
   304             nonoverlapping_labels.append(i)
   305     return nonoverlapping_labels
   306 
   307 def get_axis_image(seqNames, name_sizes, seq_starts, seq_pix,
   308                    font, image_mode, opts):
   309     '''Make an image of axis labels.'''
   310     min_pos = seq_starts[0]
   311     max_pos = seq_starts[-1] + seq_pix[-1]
   312     height = max(zip(*name_sizes)[1])
   313     labels = [make_label(i, j, k, l) for i, j, k, l in
   314               zip(seqNames, name_sizes, seq_starts, seq_pix)]
   315     labels = [i for i in labels if i[1] >= min_pos and i[2] <= max_pos]
   316     labels.sort()
   317     labels = get_nonoverlapping_labels(labels, opts.label_space)
   318     image_size = max_pos, height
   319     im = Image.new(image_mode, image_size, opts.border_shade)
   320     draw = ImageDraw.Draw(im)
   321     for i in labels:
   322         position = i[1], 0
   323         draw.text(position, i[3], font=font, fill=opts.text_color)
   324     return im
   325 
   326 def seqOrigins(seqNames, seq_starts, seqLimits, bp_per_pix):
   327     for i, j in zip(seqNames, seq_starts):
   328         yield i, bp_per_pix * j - seqLimits[i][0]
   329 
def lastDotplot(opts, args):
    '''Read alignments from args[0] and save a dotplot image to args[1].

    opts is the parsed options object: it supplies the fonts, colors,
    maximum image size, sequence selections, trimming flags and optional
    gap files used below.  Raises Exception if no alignments remain after
    sequence selection.
    '''
    if opts.fontfile:  font = ImageFont.truetype(opts.fontfile, opts.fontsize)
    else:              font = ImageFont.load_default()

    image_mode = 'RGB'
    forward_color = ImageColor.getcolor(opts.forwardcolor, image_mode)
    reverse_color = ImageColor.getcolor(opts.reversecolor, image_mode)
    # pixels hit by both orientations get the per-channel average color:
    zipped_colors = zip(forward_color, reverse_color)
    overlap_color = tuple([(i + j) // 2 for i, j in zipped_colors])

    warn("reading alignments...")
    alignments, seqLimits1, seqLimits2 = readAlignments(args[0], opts)
    warn("done")

    if not alignments: raise Exception("there are no alignments")

    seq_info1 = get_seq_info(seqLimits1, font, opts.fontsize, image_mode)
    seq_info2 = get_seq_info(seqLimits2, font, opts.fontsize, image_mode)
    seqNames1, seq_sizes1, name_sizes1, margin1 = seq_info1
    seqNames2, seq_sizes2, name_sizes2, margin2 = seq_info2

    warn("choosing bp per pixel...")
    pix_limit1 = opts.width  - margin1
    pix_limit2 = opts.height - margin2
    bp_per_pix1 = get_bp_per_pix(seq_sizes1, opts.pix_tween_seqs, pix_limit1)
    bp_per_pix2 = get_bp_per_pix(seq_sizes2, opts.pix_tween_seqs, pix_limit2)
    # use one scale for both axes: the coarser of the two
    bp_per_pix = max(bp_per_pix1, bp_per_pix2)
    warn("bp per pixel = " + str(bp_per_pix))

    seq_pix1, seq_starts1, width  = get_pix_info(seq_sizes1, bp_per_pix,
                                                 opts.pix_tween_seqs, margin1)
    seq_pix2, seq_starts2, height = get_pix_info(seq_sizes2, bp_per_pix,
                                                 opts.pix_tween_seqs, margin2)

    # per-sequence offsets (in bp) that map coordinates to image positions:
    origins1 = dict(seqOrigins(seqNames1, seq_starts1, seqLimits1, bp_per_pix))
    origins2 = dict(seqOrigins(seqNames2, seq_starts2, seqLimits2, bp_per_pix))

    warn("processing alignments...")
    hits = alignmentPixels(width, height, alignments, bp_per_pix,
                           origins1, origins2)
    warn("done")

    image_size = width, height
    im = Image.new(image_mode, image_size, opts.background_color)

    # the gap files may use short sequence names, so allow lookup by those:
    origins1 = expandedSeqDict(origins1)
    origins2 = expandedSeqDict(origins2)
    gaps1 = list(readGaps(opts.gap1, seqLimits1))
    gaps2 = list(readGaps(opts.gap2, seqLimits2))
    # draw bridged gaps first, then unbridged gaps on top:
    drawUnsequencedGaps(im, gaps1, origins1, margin2, height, True, "yes",
                        bp_per_pix, opts.bridged_color)
    drawUnsequencedGaps(im, gaps2, origins2, margin1, width, False, "yes",
                        bp_per_pix, opts.bridged_color)
    drawUnsequencedGaps(im, gaps1, origins1, margin2, height, True, "no",
                        bp_per_pix, opts.unbridged_color)
    drawUnsequencedGaps(im, gaps2, origins2, margin1, width, False, "no",
                        bp_per_pix, opts.unbridged_color)

    # paint the alignment pixels over the gaps; hits is row-major, with
    # 1 = forward only, 2 = reverse only, 3 = both, 0 = leave untouched:
    for i in range(height):
        for j in range(width):
            store_value = hits[i * width + j]
            xy = j, i
            if   store_value == 1: im.putpixel(xy, forward_color)
            elif store_value == 2: im.putpixel(xy, reverse_color)
            elif store_value == 3: im.putpixel(xy, overlap_color)

    # axis labels: both axis images are pasted at the top-left corner,
    # the 2nd one after being rotated
    if opts.fontsize != 0:
        axis1 = get_axis_image(seqNames1, name_sizes1, seq_starts1, seq_pix1,
                               font, image_mode, opts)
        axis2 = get_axis_image(seqNames2, name_sizes2, seq_starts2, seq_pix2,
                               font, image_mode, opts)
        axis2 = axis2.transpose(Image.ROTATE_270)  # !!! bug hotspot
        im.paste(axis1, (0, 0))
        im.paste(axis2, (0, 0))

    # vertical border strips between adjacent sequences of genome 1:
    for i in seq_starts1[1:]:
        box = i - opts.pix_tween_seqs, margin2, i, height
        im.paste(opts.border_shade, box)

    # horizontal border strips between adjacent sequences of genome 2:
    for i in seq_starts2[1:]:
        box = margin1, i - opts.pix_tween_seqs, width, i
        im.paste(opts.border_shade, box)

    im.save(args[1])
   415 
   416 if __name__ == "__main__":
   417     usage = """%prog --help
   418    or: %prog [options] maf-or-tab-alignments dotplot.png
   419    or: %prog [options] maf-or-tab-alignments dotplot.gif
   420    or: ..."""
   421     description = "Draw a dotplot of pair-wise sequence alignments in MAF or tabular format."
   422     op = optparse.OptionParser(usage=usage, description=description)
   423     op.add_option("-1", "--seq1", metavar="PATTERN", action="append",
   424                   default=[],
   425                   help="which sequences to show from the 1st genome")
   426     op.add_option("-2", "--seq2", metavar="PATTERN", action="append",
   427                   default=[],
   428                   help="which sequences to show from the 2nd genome")
   429     # Replace "width" & "height" with a single "length" option?
   430     op.add_option("-x", "--width", type="int", default=1000,
   431                   help="maximum width in pixels (default: %default)")
   432     op.add_option("-y", "--height", type="int", default=1000,
   433                   help="maximum height in pixels (default: %default)")
   434     op.add_option("-f", "--fontfile", metavar="FILE",
   435                   help="TrueType or OpenType font file")
   436     op.add_option("-s", "--fontsize", metavar="SIZE", type="int", default=11,
   437                   help="TrueType or OpenType font size (default: %default)")
   438     op.add_option("-c", "--forwardcolor", metavar="COLOR", default="red",
   439                   help="color for forward alignments (default: %default)")
   440     op.add_option("-r", "--reversecolor", metavar="COLOR", default="blue",
   441                   help="color for reverse alignments (default: %default)")
   442     op.add_option("--trim1", action="store_true",
   443                   help="trim unaligned sequence flanks from the 1st genome")
   444     op.add_option("--trim2", action="store_true",
   445                   help="trim unaligned sequence flanks from the 2nd genome")
   446     og = optparse.OptionGroup(op, "Unsequenced gap options")
   447     og.add_option("--gap1", metavar="FILE",
   448                   help="read genome1 unsequenced gaps from agp or gap file")
   449     og.add_option("--gap2", metavar="FILE",
   450                   help="read genome2 unsequenced gaps from agp or gap file")
   451     og.add_option("--bridged-color", metavar="COLOR", default="yellow",
   452                   help="color for bridged gaps (default: %default)")
   453     og.add_option("--unbridged-color", metavar="COLOR", default="pink",
   454                   help="color for unbridged gaps (default: %default)")
   455     op.add_option_group(og)
   456     (opts, args) = op.parse_args()
   457     if len(args) != 2: op.error("2 arguments needed")
   458 
   459     opts.text_color = "black"
   460     opts.background_color = "white"
   461     opts.pix_tween_seqs = 2  # number of border pixels between sequences
   462     opts.border_shade = 239, 239, 239  # the shade of grey for border pixels
   463     opts.label_space = 5     # minimum number of pixels between axis labels
   464 
   465     try: lastDotplot(opts, args)
   466     except KeyboardInterrupt: pass  # avoid silly error message
   467     except Exception, e:
   468         prog = os.path.basename(sys.argv[0])
   469         sys.exit(prog + ": error: " + str(e))