Made last-dotplot faster for MAF & work with Python 2.4.
authorMartin C. Frith
Mon Sep 29 07:37:32 2014 +0900 (2014-09-29)
changeset 482f5b731ee703e
parent 481 e29f2555e674
child 483 63b30414bb21
Made last-dotplot faster for MAF & work with Python 2.4.
scripts/last-dotplot
     1.1 --- a/scripts/last-dotplot	Wed Sep 24 15:59:01 2014 +0900
     1.2 +++ b/scripts/last-dotplot	Mon Sep 29 07:37:32 2014 +0900
     1.3 @@ -9,7 +9,7 @@
     1.4  # according to the number of aligned nt-pairs within it, but the
     1.5  # result is too faint.  How can this be done better?
     1.6  
     1.7 -import sys, os, re, itertools, optparse
     1.8 +import fileinput, itertools, optparse, os, re, sys
     1.9  
    1.10  # Try to make PIL/PILLOW work:
    1.11  try: from PIL import Image, ImageDraw, ImageFont, ImageColor
    1.12 @@ -53,50 +53,81 @@
    1.13  reverse_color = ImageColor.getcolor(opts.reversecolor, image_mode)
    1.14  overlap_color = tuple([(i+j)//2 for i, j in zip(forward_color, reverse_color)])
    1.15  
    1.16 -def isGapless(alignmentColumn):
    1.17 -    return "-" not in alignmentColumn
    1.18 +def tabBlocks(beg1, beg2, blocks):
    1.19 +    '''Get the gapless blocks of an alignment, from LAST tabular format.'''
    1.20 +    for i in blocks.split(","):
    1.21 +        if ":" in i:
    1.22 +            x, y = i.split(":")
    1.23 +            beg1 += int(x)
    1.24 +            beg2 += int(y)
    1.25 +        else:
    1.26 +            size = int(i)
    1.27 +            yield beg1, beg2, size
    1.28 +            beg1 += size
    1.29 +            beg2 += size
    1.30  
    1.31 -def matchAndInsertLengths(alignmentColumns):
    1.32 -    for k, v in itertools.groupby(alignmentColumns, isGapless):
    1.33 -        if k:
    1.34 -            matchLength = sum(1 for i in v)
    1.35 -            yield str(matchLength)
    1.36 +def mafBlocks(beg1, beg2, seq1, seq2):
    1.37 +    '''Get the gapless blocks of an alignment, from MAF format.'''
    1.38 +    size = 0
    1.39 +    for x, y in itertools.izip(seq1, seq2):
    1.40 +        if x == "-":
    1.41 +            if size:
    1.42 +                yield beg1, beg2, size
    1.43 +                beg1 += size
    1.44 +                beg2 += size
    1.45 +                size = 0
    1.46 +            beg2 += 1
    1.47 +        elif y == "-":
    1.48 +            if size:
    1.49 +                yield beg1, beg2, size
    1.50 +                beg1 += size
    1.51 +                beg2 += size
    1.52 +                size = 0
    1.53 +            beg1 += 1
    1.54          else:
    1.55 -            blockRows = itertools.izip(*v)
    1.56 -            insertLengths = (len(i) - i.count("-") for i in blockRows)
    1.57 -            yield ":".join(map(str, insertLengths))
    1.58 +            size += 1
    1.59 +    if size: yield beg1, beg2, size
    1.60  
    1.61 -def alignmentInput(lines):  # read alignments in either tabular or MAF format
    1.62 +def alignmentInput(lines):
    1.63 +    '''Get alignments and sequence lengths, from MAF or tabular format.'''
    1.64 +    mafCount = 0
    1.65      for line in lines:
    1.66          w = line.split()
    1.67          if line[0].isdigit():  # tabular format
    1.68 -            yield w
    1.69 -        elif line[0] == "a":  # MAF format
    1.70 -            sLines = []
    1.71 +            chr1, beg1, seqlen1 = w[1], int(w[2]), int(w[5])
    1.72 +            if w[4] == "-": beg1 -= seqlen1
    1.73 +            chr2, beg2, seqlen2 = w[6], int(w[7]), int(w[10])
    1.74 +            if w[9] == "-": beg2 -= seqlen2
    1.75 +            blocks = tabBlocks(beg1, beg2, w[11])
    1.76 +            yield chr1, seqlen1, chr2, seqlen2, blocks
    1.77          elif line[0] == "s":  # MAF format
    1.78 -            sLines.append(w)
    1.79 -            if len(sLines) == 2:
    1.80 -                alignmentRows = (i[6] for i in sLines)
    1.81 -                alignmentColumns = itertools.izip(*alignmentRows)
    1.82 -                blocks = ",".join(matchAndInsertLengths(alignmentColumns))
    1.83 -                yield sLines[0][0:6] + sLines[1][1:6] + [blocks]
    1.84 +            if mafCount == 0:
    1.85 +                chr1, beg1, seqlen1, seq1 = w[1], int(w[2]), int(w[5]), w[6]
    1.86 +                if w[4] == "-": beg1 -= seqlen1
    1.87 +                mafCount = 1
    1.88 +            else:
    1.89 +                chr2, beg2, seqlen2, seq2 = w[1], int(w[2]), int(w[5]), w[6]
    1.90 +                if w[4] == "-": beg2 -= seqlen2
    1.91 +                blocks = mafBlocks(beg1, beg2, seq1, seq2)
    1.92 +                yield chr1, seqlen1, chr2, seqlen2, blocks
    1.93 +                mafCount = 0
    1.94  
    1.95 -seq_size_dic1 = {}  # sizes of the first set of sequences
    1.96 -seq_size_dic2 = {}  # sizes of the second set of sequences
    1.97 -alignments = []
    1.98 +def readAlignments(lines):
    1.99 +    '''Get alignments and sequence lengths, from MAF or tabular format.'''
   1.100 +    alignments = []
   1.101 +    seqLengths1 = {}
   1.102 +    seqLengths2 = {}
   1.103 +    for chr1, seqlen1, chr2, seqlen2, blocks in alignmentInput(lines):
   1.104 +        aln = chr1, chr2, blocks
   1.105 +        alignments.append(aln)
   1.106 +        seqLengths1[chr1] = seqlen1
   1.107 +        seqLengths2[chr2] = seqlen2
   1.108 +    return alignments, seqLengths1, seqLengths2
   1.109  
   1.110 -f = open(args[0])
   1.111  sys.stderr.write(my_name + ": reading alignments...\n")
   1.112 -for w in alignmentInput(f):
   1.113 -    seq1, pos1, strand1, size1 = w[1], int(w[2]), w[4], int(w[5])
   1.114 -    seq2, pos2, strand2, size2 = w[6], int(w[7]), w[9], int(w[10])
   1.115 -    blocks = w[11]
   1.116 -    seq_size_dic1[seq1] = size1
   1.117 -    seq_size_dic2[seq2] = size2
   1.118 -    aln = seq1, seq2, pos1, pos2, strand1, strand2, blocks
   1.119 -    alignments.append(aln)
   1.120 +input = fileinput.input(args[0])
   1.121 +alignments, seq_size_dic1, seq_size_dic2 = readAlignments(input)
   1.122  sys.stderr.write(my_name + ": done\n")
   1.123 -f.close()
   1.124  
   1.125  if not alignments:
   1.126      sys.exit(my_name + ": there are no alignments")
   1.127 @@ -177,34 +208,23 @@
   1.128  hits = [0] * (width * height)  # the image data
   1.129  
   1.130  sys.stderr.write(my_name + ": processing alignments...\n")
   1.131 -for aln in alignments:
   1.132 -    seq1, seq2, pos1, pos2, strand1, strand2, blocks = aln
   1.133 -    last1 = seq_size_dic1[seq1] - 1
   1.134 -    last2 = seq_size_dic2[seq2] - 1
   1.135 +for seq1, seq2, blocks in alignments:
   1.136      seq_start1 = seq_start_dic1[seq1]
   1.137      seq_start2 = seq_start_dic2[seq2]
   1.138      my_start = seq_start2 * width + seq_start1
   1.139 -    if strand1 == strand2: store_value = 1
   1.140 -    else:                  store_value = 2
   1.141 -    for i in blocks.split(","):
   1.142 -        if ":" in i:  # it's a gap region: skip over it
   1.143 -            insertLength1, insertLength2 = i.split(":")
   1.144 -            pos1 += int(insertLength1)
   1.145 -            pos2 += int(insertLength2)
   1.146 -        else:  # it's a match region: draw pixels for it
   1.147 -            matchLength = int(i)
   1.148 -            end1 = pos1 + matchLength
   1.149 -            end2 = pos2 + matchLength
   1.150 -            if strand1 == '+': j = xrange(pos1, end1)
   1.151 -            else:              j = xrange(last1 - pos1, last1 - end1, -1)
   1.152 -            if strand2 == '+': k = xrange(pos2, end2)
   1.153 -            else:              k = xrange(last2 - pos2, last2 - end2, -1)
   1.154 -            for real_pos1, real_pos2 in itertools.izip(j, k):
   1.155 -                pix1 = real_pos1 // bp_per_pix
   1.156 -                pix2 = real_pos2 // bp_per_pix
   1.157 -                hits[my_start + pix2 * width + pix1] |= store_value
   1.158 -            pos1 = end1
   1.159 -            pos2 = end2
   1.160 +    for beg1, beg2, size in blocks:
   1.161 +        end1 = beg1 + size
   1.162 +        end2 = beg2 + size
   1.163 +        if beg1 >= 0: j = xrange(beg1, end1)
   1.164 +        else:         j = xrange(-1 - beg1, -1 - end1, -1)
   1.165 +        if beg2 >= 0: k = xrange(beg2, end2)
   1.166 +        else:         k = xrange(-1 - beg2, -1 - end2, -1)
   1.167 +        if beg2 >= 0: store_value = 1
   1.168 +        else:         store_value = 2
   1.169 +        for real_pos1, real_pos2 in itertools.izip(j, k):
   1.170 +            pix1 = real_pos1 // bp_per_pix
   1.171 +            pix2 = real_pos2 // bp_per_pix
   1.172 +            hits[my_start + pix2 * width + pix1] |= store_value
   1.173  sys.stderr.write(my_name + ": done\n")
   1.174  
   1.175  def make_label(text, text_size, range_start, range_size):