Refactor last-dotplot
authorMartin C. Frith
Wed Nov 08 12:17:50 2017 +0900 (2017-11-08)
changeset 9058d52cef22afa
parent 904 c2f50cd4e29d
child 906 21fbf377641f
Refactor last-dotplot
scripts/last-dotplot
     1.1 --- a/scripts/last-dotplot	Wed Nov 08 11:36:01 2017 +0900
     1.2 +++ b/scripts/last-dotplot	Wed Nov 08 12:17:50 2017 +0900
     1.3 @@ -289,11 +289,20 @@
     1.4          beg2 -= next_pix
     1.5          size -= next_pix
     1.6  
     1.7 -def alignmentPixels(width, height, alignments, bp_per_pix, origins1, origins2):
     1.8 +def findOrigin(ranges, beg, size):
     1.9 +    if beg < 0:
    1.10 +        beg = -(beg + size)
    1.11 +    for rangeBeg, rangeEnd, origin in ranges:
    1.12 +        if rangeEnd > beg:
    1.13 +            return origin
    1.14 +
    1.15 +def alignmentPixels(width, height, alignments, bp_per_pix,
    1.16 +                    rangeDict1, rangeDict2):
    1.17      hits = [0] * (width * height)  # the image data
    1.18      for seq1, seq2, blocks in alignments:
    1.19 -        ori1 = origins1[seq1]
    1.20 -        ori2 = origins2[seq2]
    1.21 +        beg1, beg2, size = blocks[0]
    1.22 +        ori1 = findOrigin(rangeDict1[seq1], beg1, size)
    1.23 +        ori2 = findOrigin(rangeDict2[seq2], beg2, size)
    1.24          for beg1, beg2, size in blocks:
    1.25              if beg1 < 0:
    1.26                  beg1 = -(beg1 + size)
    1.27 @@ -317,12 +326,12 @@
    1.28              newDict[base] = x
    1.29      return newDict
    1.30  
    1.31 -def readBed(fileName, seqLimits):
    1.32 +def readBed(fileName, rangeDict):
    1.33      for line in myOpen(fileName):
    1.34          w = line.split()
    1.35          if not w: continue
    1.36          seqName = w[0]
    1.37 -        if seqName not in seqLimits: continue
    1.38 +        if seqName not in rangeDict: continue
    1.39          beg = int(w[1])
    1.40          end = int(w[2])
    1.41          layer = 900
    1.42 @@ -342,13 +351,13 @@
    1.43  def commaSeparatedInts(text):
    1.44      return map(int, text.rstrip(",").split(","))
    1.45  
    1.46 -def readGenePred(opts, fileName, seqLimits):
    1.47 +def readGenePred(opts, fileName, rangeDict):
    1.48      for line in myOpen(fileName):
    1.49          fields = line.split()
    1.50          if not fields: continue
    1.51          if fields[2] not in "+-": fields = fields[1:]
    1.52          seqName = fields[1]
    1.53 -        if seqName not in seqLimits: continue
    1.54 +        if seqName not in rangeDict: continue
    1.55          #strand = fields[2]
    1.56          cdsBeg = int(fields[5])
    1.57          cdsEnd = int(fields[6])
    1.58 @@ -360,19 +369,19 @@
    1.59              e = min(end, cdsEnd)
    1.60              if b < e: yield 400, opts.cds_color, seqName, b, e
    1.61  
    1.62 -def readRmsk(fileName, seqLimits):
    1.63 +def readRmsk(fileName, rangeDict):
    1.64      for line in myOpen(fileName):
    1.65          fields = line.split()
    1.66          if len(fields) == 17:  # rmsk.txt
    1.67              seqName = fields[5]
    1.68 -            if seqName not in seqLimits: continue  # do this ASAP for speed
    1.69 +            if seqName not in rangeDict: continue  # do this ASAP for speed
    1.70              beg = int(fields[6])
    1.71              end = int(fields[7])
    1.72              strand = fields[9]
    1.73              repeatClass = fields[11]
    1.74          elif len(fields) == 15:  # .out
    1.75              seqName = fields[4]
    1.76 -            if seqName not in seqLimits: continue
    1.77 +            if seqName not in rangeDict: continue
    1.78              beg = int(fields[5]) - 1
    1.79              end = int(fields[6])
    1.80              strand = fields[8]
    1.81 @@ -389,7 +398,7 @@
    1.82  def isExtraFirstGapField(fields):
    1.83      return fields[4].isdigit()
    1.84  
    1.85 -def readGaps(opts, fileName, seqLimits):
    1.86 +def readGaps(opts, fileName, rangeDict):
    1.87      '''Read locations of unsequenced gaps, from an agp or gap file.'''
    1.88      for line in myOpen(fileName):
    1.89          w = line.split()
    1.90 @@ -397,7 +406,7 @@
    1.91          if isExtraFirstGapField(w): w = w[1:]
    1.92          if w[4] not in "NU": continue
    1.93          seqName = w[0]
    1.94 -        if seqName not in seqLimits: continue
    1.95 +        if seqName not in rangeDict: continue
    1.96          end = int(w[2])
    1.97          beg = end - int(w[5])  # zero-based coordinate
    1.98          if w[7] == "yes":
    1.99 @@ -405,27 +414,26 @@
   1.100          else:
   1.101              yield 2000, opts.unbridged_color, seqName, beg, end
   1.102  
   1.103 -def bedBoxes(beds, seqLimits, origins, margin, edge, isTop, bpPerPix):
   1.104 +def bedBoxes(beds, rangeDict, margin, edge, isTop, bpPerPix):
   1.105      for layer, color, seqName, bedBeg, bedEnd in beds:
   1.106 -        cropBeg, cropEnd = seqLimits[seqName]
   1.107 -        beg = max(bedBeg, cropBeg)
   1.108 -        end = min(bedEnd, cropEnd)
   1.109 -        if beg >= end: continue
   1.110 -        ori = origins[seqName]
   1.111 -        if layer <= 1000:
   1.112 -            # include partly-covered pixels
   1.113 -            b = (ori + beg) // bpPerPix
   1.114 -            e = div_ceil(ori + end, bpPerPix)
   1.115 -        else:
   1.116 -            # exclude partly-covered pixels
   1.117 -            b = div_ceil(ori + beg, bpPerPix)
   1.118 -            e = (ori + end) // bpPerPix
   1.119 -            if e <= b: continue
   1.120 -        if isTop:
   1.121 -            box = b, margin, e, edge
   1.122 -        else:
   1.123 -            box = margin, b, edge, e
   1.124 -        yield layer, color, box
   1.125 +        for rangeBeg, rangeEnd, origin in rangeDict[seqName]:
   1.126 +            beg = max(bedBeg, rangeBeg)
   1.127 +            end = min(bedEnd, rangeEnd)
   1.128 +            if beg >= end: continue
   1.129 +            if layer <= 1000:
   1.130 +                # include partly-covered pixels
   1.131 +                b = (origin + beg) // bpPerPix
   1.132 +                e = div_ceil(origin + end, bpPerPix)
   1.133 +            else:
   1.134 +                # exclude partly-covered pixels
   1.135 +                b = div_ceil(origin + beg, bpPerPix)
   1.136 +                e = (origin + end) // bpPerPix
   1.137 +                if e <= b: continue
   1.138 +            if isTop:
   1.139 +                box = b, margin, e, edge
   1.140 +            else:
   1.141 +                box = margin, b, edge, e
   1.142 +            yield layer, color, box
   1.143  
   1.144  def drawAnnotations(im, boxes):
   1.145      # xxx use partial transparency for different-color overlaps?
   1.146 @@ -480,9 +488,12 @@
   1.147          draw.text(position, i[3], font=font, fill=opts.text_color)
   1.148      return im
   1.149  
   1.150 -def seqOrigins(seqNames, rangePixBegs, seqLimits, bp_per_pix):
   1.151 +def rangesPerSeq(seqNames, seqLimits, rangePixBegs, bpPerPix):
   1.152      for i, j in zip(seqNames, rangePixBegs):
   1.153 -        yield i, bp_per_pix * j - seqLimits[i][0]
   1.154 +        beg, end = seqLimits[i]
   1.155 +        origin = bpPerPix * j - beg
   1.156 +        r = beg, end, origin
   1.157 +        yield i, [r]
   1.158  
   1.159  def getFont(opts):
   1.160      if opts.fontfile:
   1.161 @@ -538,38 +549,36 @@
   1.162  
   1.163      p1 = pixelData(rangeSizes1, bpPerPix, opts.border_pixels, lMargin)
   1.164      rangePixBegs1, rangePixLens1, width = p1
   1.165 +    rangeDict1 = dict(rangesPerSeq(seqNames1, seqLimits1, rangePixBegs1,
   1.166 +                                   bpPerPix))
   1.167  
   1.168      p2 = pixelData(rangeSizes2, bpPerPix, opts.border_pixels, tMargin)
   1.169      rangePixBegs2, rangePixLens2, height = p2
   1.170 +    rangeDict2 = dict(rangesPerSeq(seqNames2, seqLimits2, rangePixBegs2,
   1.171 +                                   bpPerPix))
   1.172  
   1.173      warn("width:  " + str(width))
   1.174      warn("height: " + str(height))
   1.175  
   1.176 -    origins1 = dict(seqOrigins(seqNames1, rangePixBegs1, seqLimits1, bpPerPix))
   1.177 -    origins2 = dict(seqOrigins(seqNames2, rangePixBegs2, seqLimits2, bpPerPix))
   1.178 -
   1.179      warn("processing alignments...")
   1.180      hits = alignmentPixels(width, height, alignments, bpPerPix,
   1.181 -                           origins1, origins2)
   1.182 +                           rangeDict1, rangeDict2)
   1.183  
   1.184      warn("reading annotations...")
   1.185  
   1.186 -    seqLimits1 = expandedSeqDict(seqLimits1)
   1.187 -    seqLimits2 = expandedSeqDict(seqLimits2)
   1.188 -    origins1 = expandedSeqDict(origins1)
   1.189 -    origins2 = expandedSeqDict(origins2)
   1.190 +    rangeDict1 = expandedSeqDict(rangeDict1)
   1.191 +    beds1 = itertools.chain(readBed(opts.bed1, rangeDict1),
   1.192 +                            readRmsk(opts.rmsk1, rangeDict1),
   1.193 +                            readGenePred(opts, opts.genePred1, rangeDict1),
   1.194 +                            readGaps(opts, opts.gap1, rangeDict1))
   1.195 +    b1 = bedBoxes(beds1, rangeDict1, tMargin, height, True, bpPerPix)
   1.196  
   1.197 -    beds1 = itertools.chain(readBed(opts.bed1, seqLimits1),
   1.198 -                            readRmsk(opts.rmsk1, seqLimits1),
   1.199 -                            readGenePred(opts, opts.genePred1, seqLimits1),
   1.200 -                            readGaps(opts, opts.gap1, seqLimits1))
   1.201 -    b1 = bedBoxes(beds1, seqLimits1, origins1, tMargin, height, True, bpPerPix)
   1.202 -
   1.203 -    beds2 = itertools.chain(readBed(opts.bed2, seqLimits2),
   1.204 -                            readRmsk(opts.rmsk2, seqLimits2),
   1.205 -                            readGenePred(opts, opts.genePred2, seqLimits2),
   1.206 -                            readGaps(opts, opts.gap2, seqLimits2))
   1.207 -    b2 = bedBoxes(beds2, seqLimits2, origins2, lMargin, width, False, bpPerPix)
   1.208 +    rangeDict2 = expandedSeqDict(rangeDict2)
   1.209 +    beds2 = itertools.chain(readBed(opts.bed2, rangeDict2),
   1.210 +                            readRmsk(opts.rmsk2, rangeDict2),
   1.211 +                            readGenePred(opts, opts.genePred2, rangeDict2),
   1.212 +                            readGaps(opts, opts.gap2, rangeDict2))
   1.213 +    b2 = bedBoxes(beds2, rangeDict2, lMargin, width, False, bpPerPix)
   1.214  
   1.215      boxes = sorted(itertools.chain(b1, b2))
   1.216