scripts/last-dotplot
changeset 916 ffe68ba3d865
parent 915 6079fabb49a5
child 925 e1861f956f60
     1.1 --- a/scripts/last-dotplot	Thu Dec 21 16:22:52 2017 +0900
     1.2 +++ b/scripts/last-dotplot	Thu Dec 21 18:02:21 2017 +0900
     1.3 @@ -9,6 +9,7 @@
     1.4  # according to the number of aligned nt-pairs within it, but the
     1.5  # result is too faint.  How can this be done better?
     1.6  
     1.7 +import collections
     1.8  import functools
     1.9  import gzip
    1.10  from fnmatch import fnmatchcase
    1.11 @@ -258,6 +259,22 @@
    1.12              rangeEnd = blocks[-1][1] + endPad
    1.13          yield seqName, rangeBeg, rangeEnd
    1.14  
    1.15 +def rangesWithStrandInfo(seqRanges, strandOpt, alignments, seqIndex):
    1.16 +    if strandOpt == "1":
    1.17 +        forwardMinusReverse = collections.defaultdict(int)
    1.18 +        for i in alignments:
    1.19 +            blocks = i[2]
    1.20 +            beg1, beg2, size = blocks[0]
    1.21 +            numOfAlignedLetterPairs = sum(i[2] for i in blocks)
    1.22 +            if (beg1 < 0) != (beg2 < 0):  # opposite-strand alignment
    1.23 +                numOfAlignedLetterPairs *= -1
    1.24 +            forwardMinusReverse[i[seqIndex]] += numOfAlignedLetterPairs
    1.25 +    strandNum = 0
    1.26 +    for seqName, beg, end in seqRanges:
    1.27 +        if strandOpt == "1":
    1.28 +            strandNum = 1 if forwardMinusReverse[seqName] >= 0 else 2
    1.29 +        yield seqName, beg, end, strandNum
    1.30 +
    1.31  def natural_sort_key(my_string):
    1.32      '''Return a sort key for "natural" ordering, e.g. chr9 < chr10.'''
    1.33      parts = re.split(r'(\d+)', my_string)
    1.34 @@ -280,29 +297,38 @@
    1.35          if toMiddle < 0:
    1.36              return i[1:3]  # sequence-rank and "position" of this alignment
    1.37  
    1.38 -def alignmentSortData(alignments, seqIndex, otherNamesToRanks):
    1.39 +def rankAndFlipPerSeq(seqRanges):
    1.40 +    rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
    1.41 +    for rank, group in enumerate(rangesGroupedBySeqName):
    1.42 +        seqName, ranges = group
    1.43 +        strandNum = next(ranges)[3]
    1.44 +        flip = 1 if strandNum < 2 else -1
    1.45 +        yield seqName, (rank, flip)
    1.46 +
    1.47 +def alignmentSortData(alignments, seqIndex, otherNamesToRanksAndFlips):
    1.48      otherIndex = 1 - seqIndex
    1.49      for i in alignments:
    1.50          blocks = i[2]
    1.51 -        otherRank = otherNamesToRanks[i[otherIndex]]
    1.52 -        otherPos = abs(blocks[0][otherIndex] +
    1.53 -                       blocks[-1][otherIndex] + blocks[-1][2])
    1.54 +        otherRank, otherFlip = otherNamesToRanksAndFlips[i[otherIndex]]
    1.55 +        otherPos = otherFlip * abs(blocks[0][otherIndex] +
    1.56 +                                   blocks[-1][otherIndex] + blocks[-1][2])
    1.57          numOfAlignedLetterPairs = sum(i[2] for i in blocks)
    1.58          yield i[seqIndex], otherRank, otherPos, numOfAlignedLetterPairs
    1.59  
    1.60  def mySortedRanges(seqRanges, sortOpt, seqIndex, alignments, otherRanges):
    1.61      rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
    1.62      g = [list(ranges) for seqName, ranges in rangesGroupedBySeqName]
    1.63 +    for i in g:
    1.64 +        if i[0][3] > 1:
    1.65 +            i.reverse()
    1.66      if sortOpt == "1":
    1.67          g.sort(key=nameKey)
    1.68      if sortOpt == "2":
    1.69          g.sort(key=sizeKey)
    1.70      if sortOpt == "3":
    1.71 -        otherNameGroups = itertools.groupby(i[0] for i in otherRanges)
    1.72 -        ranksAndNames = enumerate(i[0] for i in otherNameGroups)
    1.73 -        otherNamesToRanks = dict((n, r) for r, n in ranksAndNames)
    1.74 +        otherNamesToRanksAndFlips = dict(rankAndFlipPerSeq(otherRanges))
    1.75          alns = sorted(alignmentSortData(alignments, seqIndex,
    1.76 -                                        otherNamesToRanks))
    1.77 +                                        otherNamesToRanksAndFlips))
    1.78          alnsGroupedBySeqName = itertools.groupby(alns, itemgetter(0))
    1.79          seqNamesToLists = dict((k, list(v)) for k, v in alnsGroupedBySeqName)
    1.80          g.sort(key=functools.partial(alignmentKey, seqNamesToLists))
    1.81 @@ -310,6 +336,15 @@
    1.82  
    1.83  def allSortedRanges(opts, alignments, alignmentsB,
    1.84                      seqRanges1, seqRangesB1, seqRanges2, seqRangesB2):
    1.85 +    o1, oB1 = twoValuesFromOption(opts.strands1, ":")
    1.86 +    o2, oB2 = twoValuesFromOption(opts.strands2, ":")
    1.87 +    if o1 == "1" and o2 == "1":
    1.88 +        raise Exception("the strand options have circular dependency")
    1.89 +    seqRanges1 = list(rangesWithStrandInfo(seqRanges1, o1, alignments, 0))
    1.90 +    seqRanges2 = list(rangesWithStrandInfo(seqRanges2, o2, alignments, 1))
    1.91 +    seqRangesB1 = list(rangesWithStrandInfo(seqRangesB1, oB1, alignmentsB, 0))
    1.92 +    seqRangesB2 = list(rangesWithStrandInfo(seqRangesB2, oB2, alignmentsB, 1))
    1.93 +
    1.94      o1, oB1 = twoValuesFromOption(opts.sort1, ":")
    1.95      o2, oB2 = twoValuesFromOption(opts.sort2, ":")
    1.96      if o1 == "3" and o2 == "3":
    1.97 @@ -344,7 +379,7 @@
    1.98              return "%.0f" % (1.0 * size / j) + x
    1.99  
   1.100  def labelText(seqRange, labelOpt):
   1.101 -    seqName, beg, end = seqRange
   1.102 +    seqName, beg, end, strandNum = seqRange
   1.103      if labelOpt == 1:
   1.104          return seqName + ": " + sizeText(end - beg)
   1.105      if labelOpt == 2:
   1.106 @@ -365,13 +400,16 @@
   1.107              x, y = draw.textsize(text, font=font)
   1.108              if textRot:
   1.109                  x, y = y, x
   1.110 -        yield text, x, y
   1.111 +        yield text, x, y, r[3]
   1.112  
   1.113  def dataFromRanges(sortedRanges, font, fontSize, imageMode, labelOpt, textRot):
   1.114 -    for i in sortedRanges:
   1.115 -        warn("\t".join(map(str, i)))
   1.116 +    for seqName, rangeBeg, rangeEnd, strandNum in sortedRanges:
   1.117 +        out = [seqName, str(rangeBeg), str(rangeEnd)]
   1.118 +        if strandNum > 0:
   1.119 +            out.append(".+-"[strandNum])
   1.120 +        warn("\t".join(out))
   1.121      warn("")
   1.122 -    rangeSizes = [e - b for n, b, e in sortedRanges]
   1.123 +    rangeSizes = [e - b for n, b, e, s in sortedRanges]
   1.124      labs = list(rangeLabels(sortedRanges, labelOpt, font, fontSize,
   1.125                              imageMode, textRot))
   1.126      margin = max(i[2] for i in labs)
   1.127 @@ -436,25 +474,26 @@
   1.128          beg2 -= next_pix
   1.129          size -= next_pix
   1.130  
   1.131 -def findOrigin(ranges, beg, size):
   1.132 -    if beg < 0:
   1.133 +def strandAndOrigin(ranges, beg, size):
   1.134 +    isReverseStrand = (beg < 0)
   1.135 +    if isReverseStrand:
   1.136          beg = -(beg + size)
   1.137 -    for rangeBeg, rangeEnd, origin in ranges:
   1.138 +    for rangeBeg, rangeEnd, isReverseRange, origin in ranges:
   1.139          if rangeEnd > beg:
   1.140 -            return origin
   1.141 +            return (isReverseStrand != isReverseRange), origin
   1.142  
   1.143  def alignmentPixels(width, height, alignments, bp_per_pix,
   1.144                      rangeDict1, rangeDict2):
   1.145      hits = [0] * (width * height)  # the image data
   1.146      for seq1, seq2, blocks in alignments:
   1.147          beg1, beg2, size = blocks[0]
   1.148 -        ori1 = findOrigin(rangeDict1[seq1], beg1, size)
   1.149 -        ori2 = findOrigin(rangeDict2[seq2], beg2, size)
   1.150 +        isReverse1, ori1 = strandAndOrigin(rangeDict1[seq1], beg1, size)
   1.151 +        isReverse2, ori2 = strandAndOrigin(rangeDict2[seq2], beg2, size)
   1.152          for beg1, beg2, size in blocks:
   1.153 -            if beg1 < 0:
   1.154 +            if isReverse1:
   1.155                  beg1 = -(beg1 + size)
   1.156                  beg2 = -(beg2 + size)
   1.157 -            if beg2 >= 0:
   1.158 +            if isReverse1 == isReverse2:
   1.159                  drawLineForward(hits, width, bp_per_pix,
   1.160                                  ori1 + beg1, ori2 + beg2, size)
   1.161              else:
   1.162 @@ -489,10 +528,13 @@
   1.163              if len(w) > 5:
   1.164                  if len(w) > 8 and w[8].count(",") == 2:
   1.165                      color = "rgb(" + w[8] + ")"
   1.166 -                elif w[5] == "+":
   1.167 -                    color = "#ffe8e8"
   1.168 -                elif w[5] == "-":
   1.169 -                    color = "#e8e8ff"
   1.170 +                else:
   1.171 +                    strand = w[5]
   1.172 +                    isRev = rangeDict[seqName][0][2]
   1.173 +                    if strand == "+" and not isRev or strand == "-" and isRev:
   1.174 +                        color = "#ffe8e8"
   1.175 +                    if strand == "-" and not isRev or strand == "+" and isRev:
   1.176 +                        color = "#e8e8ff"
   1.177          yield layer, color, seqName, beg, end
   1.178  
   1.179  def commaSeparatedInts(text):
   1.180 @@ -537,7 +579,7 @@
   1.181              continue
   1.182          if repeatClass in ("Low_complexity", "Simple_repeat"):
   1.183              yield 200, "#fbf", seqName, beg, end
   1.184 -        elif strand == "+":
   1.185 +        elif (strand == "+") != rangeDict[seqName][0][2]:
   1.186              yield 100, "#ffe8e8", seqName, beg, end
   1.187          else:
   1.188              yield 100, "#e8e8ff", seqName, beg, end
   1.189 @@ -563,10 +605,12 @@
   1.190  
   1.191  def bedBoxes(beds, rangeDict, margin, edge, isTop, bpPerPix):
   1.192      for layer, color, seqName, bedBeg, bedEnd in beds:
   1.193 -        for rangeBeg, rangeEnd, origin in rangeDict[seqName]:
   1.194 +        for rangeBeg, rangeEnd, isReverseRange, origin in rangeDict[seqName]:
   1.195              beg = max(bedBeg, rangeBeg)
   1.196              end = min(bedEnd, rangeEnd)
   1.197              if beg >= end: continue
   1.198 +            if isReverseRange:
   1.199 +                beg, end = -end, -beg
   1.200              if layer <= 1000:
   1.201                  # include partly-covered pixels
   1.202                  b = (origin + beg) // bpPerPix
   1.203 @@ -576,8 +620,11 @@
   1.204                  b = div_ceil(origin + beg, bpPerPix)
   1.205                  e = (origin + end) // bpPerPix
   1.206                  if e <= b: continue
   1.207 -                if end == rangeEnd:  # include partly-covered end pixels
   1.208 -                    e = div_ceil(origin + end, bpPerPix)
   1.209 +                if bedEnd >= rangeEnd:  # include partly-covered end pixels
   1.210 +                    if isReverseRange:
   1.211 +                        b = (origin + beg) // bpPerPix
   1.212 +                    else:
   1.213 +                        e = div_ceil(origin + end, bpPerPix)
   1.214              if isTop:
   1.215                  box = b, margin, e, edge
   1.216              else:
   1.217 @@ -593,7 +640,7 @@
   1.218      '''Return axis labels with endpoint & sort-order information.'''
   1.219      maxWidth = end - beg
   1.220      for i, j, k in zip(labels, rangePixBegs, rangePixLens):
   1.221 -        text, textWidth, textHeight = i
   1.222 +        text, textWidth, textHeight, strandNum = i
   1.223          if textWidth > maxWidth:
   1.224              continue
   1.225          labelBeg = j + (k - textWidth) // 2
   1.226 @@ -607,7 +654,7 @@
   1.227              sortKey += maxWidth * (labelEnd - end)
   1.228              labelEnd = end
   1.229              labelBeg = end - textWidth
   1.230 -        yield sortKey, labelBeg, labelEnd, text, textHeight
   1.231 +        yield sortKey, labelBeg, labelEnd, text, textHeight, strandNum
   1.232  
   1.233  def nonoverlappingLabels(labels, minPixTweenLabels):
   1.234      '''Get a subset of non-overlapping axis labels, greedily.'''
   1.235 @@ -631,20 +678,25 @@
   1.236      image_size = (margin, end) if textRot else (end, margin)
   1.237      im = Image.new(image_mode, image_size, opts.margin_color)
   1.238      draw = ImageDraw.Draw(im)
   1.239 -    for sortKey, labelBeg, labelEnd, text, textHeight in labels:
   1.240 +    for sortKey, labelBeg, labelEnd, text, textHeight, strandNum in labels:
   1.241          base = margin - textHeight if textAln else 0
   1.242          position = (base, labelBeg) if textRot else (labelBeg, base)
   1.243 -        draw.text(position, text, font=font, fill=opts.text_color)
   1.244 +        fill = ("black", opts.forwardcolor, opts.reversecolor)[strandNum]
   1.245 +        draw.text(position, text, font=font, fill=fill)
   1.246      return im
   1.247  
   1.248 -def rangesWithOrigins(sortedRanges, rangePixBegs, bpPerPix):
   1.249 -    for i, j in zip(sortedRanges, rangePixBegs):
   1.250 -        seqName, rangeBeg, rangeEnd = i
   1.251 -        origin = bpPerPix * j - rangeBeg
   1.252 -        yield seqName, (rangeBeg, rangeEnd, origin)
   1.253 +def rangesWithOrigins(sortedRanges, rangePixBegs, rangePixLens, bpPerPix):
   1.254 +    for i, j, k in zip(sortedRanges, rangePixBegs, rangePixLens):
   1.255 +        seqName, rangeBeg, rangeEnd, strandNum = i
   1.256 +        isReverseRange = (strandNum > 1)
   1.257 +        if isReverseRange:
   1.258 +            origin = bpPerPix * (j + k) + rangeBeg
   1.259 +        else:
   1.260 +            origin = bpPerPix * j - rangeBeg
   1.261 +        yield seqName, (rangeBeg, rangeEnd, isReverseRange, origin)
   1.262  
   1.263 -def rangesPerSeq(sortedRanges, rangePixBegs, bpPerPix):
   1.264 -    a = rangesWithOrigins(sortedRanges, rangePixBegs, bpPerPix)
   1.265 +def rangesPerSeq(sortedRanges, rangePixBegs, rangePixLens, bpPerPix):
   1.266 +    a = rangesWithOrigins(sortedRanges, rangePixBegs, rangePixLens, bpPerPix)
   1.267      for k, v in itertools.groupby(a, itemgetter(0)):
   1.268          yield k, [i[1] for i in v]
   1.269  
   1.270 @@ -729,11 +781,13 @@
   1.271  
   1.272      p1 = pixelData(rangeSizes1, bpPerPix, opts.border_pixels, lMargin)
   1.273      rangePixBegs1, rangePixLens1, width = p1
   1.274 -    rangeDict1 = dict(rangesPerSeq(sortedRanges1, rangePixBegs1, bpPerPix))
   1.275 +    rangeDict1 = dict(rangesPerSeq(sortedRanges1, rangePixBegs1,
   1.276 +                                   rangePixLens1, bpPerPix))
   1.277  
   1.278      p2 = pixelData(rangeSizes2, bpPerPix, opts.border_pixels, tMargin)
   1.279      rangePixBegs2, rangePixLens2, height = p2
   1.280 -    rangeDict2 = dict(rangesPerSeq(sortedRanges2, rangePixBegs2, bpPerPix))
   1.281 +    rangeDict2 = dict(rangesPerSeq(sortedRanges2, rangePixBegs2,
   1.282 +                                   rangePixLens2, bpPerPix))
   1.283  
   1.284      warn("width:  " + str(width))
   1.285      warn("height: " + str(height))
   1.286 @@ -828,6 +882,12 @@
   1.287      op.add_option("--sort2", default="1", metavar="N",
   1.288                    help="genome2 sequence order: 0=input order, 1=name order, "
   1.289                    "2=length order, 3=alignment order (default=%default)")
   1.290 +    op.add_option("--strands1", default="0", metavar="N", help=
   1.291 +                  "genome1 sequence orientation: 0=forward orientation, "
   1.292 +                  "1=alignment orientation (default=%default)")
   1.293 +    op.add_option("--strands2", default="0", metavar="N", help=
   1.294 +                  "genome2 sequence orientation: 0=forward orientation, "
   1.295 +                  "1=alignment orientation (default=%default)")
   1.296      op.add_option("--max-gap1", metavar="FRAC", default="0.5,2", help=
   1.297                    "maximum unaligned (end,mid) gap in genome1: "
   1.298                    "fraction of aligned length (default=%default)")
   1.299 @@ -897,7 +957,6 @@
   1.300      (opts, args) = op.parse_args()
   1.301      if len(args) != 2: op.error("2 arguments needed")
   1.302  
   1.303 -    opts.text_color = "black"
   1.304      opts.background_color = "white"
   1.305      opts.label_space = 5     # minimum number of pixels between axis labels
   1.306