last-dotplot: add strand/orientation options
authorMartin C. Frith
Thu Dec 21 18:02:21 2017 +0900 (2017-12-21)
changeset 916ffe68ba3d865
parent 915 6079fabb49a5
child 917 c83131b68177
last-dotplot: add strand/orientation options
doc/last-dotplot.txt
scripts/last-dotplot
     1.1 --- a/doc/last-dotplot.txt	Thu Dec 21 16:22:52 2017 +0900
     1.2 +++ b/doc/last-dotplot.txt	Thu Dec 21 18:02:21 2017 +0900
     1.3 @@ -99,6 +99,16 @@
     1.4        appearance in the input (0), their names (1), their lengths (2),
     1.5        the left-to-right order of (the midpoints of) their alignments
     1.6        (3).
     1.7 +  --strands1=N
     1.8 +      Put the 1st genome's sequences: in forwards orientation (0), in
     1.9 +      the orientation of most of their aligned bases (1).  In the
    1.10 +      latter case, the labels will be colored (in the same way as the
    1.11 +      alignments) to indicate the sequences' orientations.  You can
    1.12 +      use two colon-separated values for primary and secondary
    1.13 +      alignments.
    1.14 +  --strands2=N
    1.15 +      Put the 2nd genome's sequences: in forwards orientation (0), in
    1.16 +      the orientation of most of their aligned bases (1).
    1.17    --max-gap1=FRAC
    1.18        Maximum unaligned gap in the 1st genome.  For example, if two
    1.19        parts of one DNA read align to widely-separated parts of one
     2.1 --- a/scripts/last-dotplot	Thu Dec 21 16:22:52 2017 +0900
     2.2 +++ b/scripts/last-dotplot	Thu Dec 21 18:02:21 2017 +0900
     2.3 @@ -9,6 +9,7 @@
     2.4  # according to the number of aligned nt-pairs within it, but the
     2.5  # result is too faint.  How can this be done better?
     2.6  
     2.7 +import collections
     2.8  import functools
     2.9  import gzip
    2.10  from fnmatch import fnmatchcase
    2.11 @@ -258,6 +259,22 @@
    2.12              rangeEnd = blocks[-1][1] + endPad
    2.13          yield seqName, rangeBeg, rangeEnd
    2.14  
    2.15 +def rangesWithStrandInfo(seqRanges, strandOpt, alignments, seqIndex):
    2.16 +    if strandOpt == "1":
    2.17 +        forwardMinusReverse = collections.defaultdict(int)
    2.18 +        for i in alignments:
    2.19 +            blocks = i[2]
    2.20 +            beg1, beg2, size = blocks[0]
    2.21 +            numOfAlignedLetterPairs = sum(i[2] for i in blocks)
    2.22 +            if (beg1 < 0) != (beg2 < 0):  # opposite-strand alignment
    2.23 +                numOfAlignedLetterPairs *= -1
    2.24 +            forwardMinusReverse[i[seqIndex]] += numOfAlignedLetterPairs
    2.25 +    strandNum = 0
    2.26 +    for seqName, beg, end in seqRanges:
    2.27 +        if strandOpt == "1":
    2.28 +            strandNum = 1 if forwardMinusReverse[seqName] >= 0 else 2
    2.29 +        yield seqName, beg, end, strandNum
    2.30 +
    2.31  def natural_sort_key(my_string):
    2.32      '''Return a sort key for "natural" ordering, e.g. chr9 < chr10.'''
    2.33      parts = re.split(r'(\d+)', my_string)
    2.34 @@ -280,29 +297,38 @@
    2.35          if toMiddle < 0:
    2.36              return i[1:3]  # sequence-rank and "position" of this alignment
    2.37  
    2.38 -def alignmentSortData(alignments, seqIndex, otherNamesToRanks):
    2.39 +def rankAndFlipPerSeq(seqRanges):
    2.40 +    rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
    2.41 +    for rank, group in enumerate(rangesGroupedBySeqName):
    2.42 +        seqName, ranges = group
    2.43 +        strandNum = next(ranges)[3]
    2.44 +        flip = 1 if strandNum < 2 else -1
    2.45 +        yield seqName, (rank, flip)
    2.46 +
    2.47 +def alignmentSortData(alignments, seqIndex, otherNamesToRanksAndFlips):
    2.48      otherIndex = 1 - seqIndex
    2.49      for i in alignments:
    2.50          blocks = i[2]
    2.51 -        otherRank = otherNamesToRanks[i[otherIndex]]
    2.52 -        otherPos = abs(blocks[0][otherIndex] +
    2.53 -                       blocks[-1][otherIndex] + blocks[-1][2])
    2.54 +        otherRank, otherFlip = otherNamesToRanksAndFlips[i[otherIndex]]
    2.55 +        otherPos = otherFlip * abs(blocks[0][otherIndex] +
    2.56 +                                   blocks[-1][otherIndex] + blocks[-1][2])
    2.57          numOfAlignedLetterPairs = sum(i[2] for i in blocks)
    2.58          yield i[seqIndex], otherRank, otherPos, numOfAlignedLetterPairs
    2.59  
    2.60  def mySortedRanges(seqRanges, sortOpt, seqIndex, alignments, otherRanges):
    2.61      rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
    2.62      g = [list(ranges) for seqName, ranges in rangesGroupedBySeqName]
    2.63 +    for i in g:
    2.64 +        if i[0][3] > 1:
    2.65 +            i.reverse()
    2.66      if sortOpt == "1":
    2.67          g.sort(key=nameKey)
    2.68      if sortOpt == "2":
    2.69          g.sort(key=sizeKey)
    2.70      if sortOpt == "3":
    2.71 -        otherNameGroups = itertools.groupby(i[0] for i in otherRanges)
    2.72 -        ranksAndNames = enumerate(i[0] for i in otherNameGroups)
    2.73 -        otherNamesToRanks = dict((n, r) for r, n in ranksAndNames)
    2.74 +        otherNamesToRanksAndFlips = dict(rankAndFlipPerSeq(otherRanges))
    2.75          alns = sorted(alignmentSortData(alignments, seqIndex,
    2.76 -                                        otherNamesToRanks))
    2.77 +                                        otherNamesToRanksAndFlips))
    2.78          alnsGroupedBySeqName = itertools.groupby(alns, itemgetter(0))
    2.79          seqNamesToLists = dict((k, list(v)) for k, v in alnsGroupedBySeqName)
    2.80          g.sort(key=functools.partial(alignmentKey, seqNamesToLists))
    2.81 @@ -310,6 +336,15 @@
    2.82  
    2.83  def allSortedRanges(opts, alignments, alignmentsB,
    2.84                      seqRanges1, seqRangesB1, seqRanges2, seqRangesB2):
    2.85 +    o1, oB1 = twoValuesFromOption(opts.strands1, ":")
    2.86 +    o2, oB2 = twoValuesFromOption(opts.strands2, ":")
    2.87 +    if o1 == "1" and o2 == "1":
    2.88 +        raise Exception("the strand options have circular dependency")
    2.89 +    seqRanges1 = list(rangesWithStrandInfo(seqRanges1, o1, alignments, 0))
    2.90 +    seqRanges2 = list(rangesWithStrandInfo(seqRanges2, o2, alignments, 1))
    2.91 +    seqRangesB1 = list(rangesWithStrandInfo(seqRangesB1, oB1, alignmentsB, 0))
    2.92 +    seqRangesB2 = list(rangesWithStrandInfo(seqRangesB2, oB2, alignmentsB, 1))
    2.93 +
    2.94      o1, oB1 = twoValuesFromOption(opts.sort1, ":")
    2.95      o2, oB2 = twoValuesFromOption(opts.sort2, ":")
    2.96      if o1 == "3" and o2 == "3":
    2.97 @@ -344,7 +379,7 @@
    2.98              return "%.0f" % (1.0 * size / j) + x
    2.99  
   2.100  def labelText(seqRange, labelOpt):
   2.101 -    seqName, beg, end = seqRange
   2.102 +    seqName, beg, end, strandNum = seqRange
   2.103      if labelOpt == 1:
   2.104          return seqName + ": " + sizeText(end - beg)
   2.105      if labelOpt == 2:
   2.106 @@ -365,13 +400,16 @@
   2.107              x, y = draw.textsize(text, font=font)
   2.108              if textRot:
   2.109                  x, y = y, x
   2.110 -        yield text, x, y
   2.111 +        yield text, x, y, r[3]
   2.112  
   2.113  def dataFromRanges(sortedRanges, font, fontSize, imageMode, labelOpt, textRot):
   2.114 -    for i in sortedRanges:
   2.115 -        warn("\t".join(map(str, i)))
   2.116 +    for seqName, rangeBeg, rangeEnd, strandNum in sortedRanges:
   2.117 +        out = [seqName, str(rangeBeg), str(rangeEnd)]
   2.118 +        if strandNum > 0:
   2.119 +            out.append(".+-"[strandNum])
   2.120 +        warn("\t".join(out))
   2.121      warn("")
   2.122 -    rangeSizes = [e - b for n, b, e in sortedRanges]
   2.123 +    rangeSizes = [e - b for n, b, e, s in sortedRanges]
   2.124      labs = list(rangeLabels(sortedRanges, labelOpt, font, fontSize,
   2.125                              imageMode, textRot))
   2.126      margin = max(i[2] for i in labs)
   2.127 @@ -436,25 +474,26 @@
   2.128          beg2 -= next_pix
   2.129          size -= next_pix
   2.130  
   2.131 -def findOrigin(ranges, beg, size):
   2.132 -    if beg < 0:
   2.133 +def strandAndOrigin(ranges, beg, size):
   2.134 +    isReverseStrand = (beg < 0)
   2.135 +    if isReverseStrand:
   2.136          beg = -(beg + size)
   2.137 -    for rangeBeg, rangeEnd, origin in ranges:
   2.138 +    for rangeBeg, rangeEnd, isReverseRange, origin in ranges:
   2.139          if rangeEnd > beg:
   2.140 -            return origin
   2.141 +            return (isReverseStrand != isReverseRange), origin
   2.142  
   2.143  def alignmentPixels(width, height, alignments, bp_per_pix,
   2.144                      rangeDict1, rangeDict2):
   2.145      hits = [0] * (width * height)  # the image data
   2.146      for seq1, seq2, blocks in alignments:
   2.147          beg1, beg2, size = blocks[0]
   2.148 -        ori1 = findOrigin(rangeDict1[seq1], beg1, size)
   2.149 -        ori2 = findOrigin(rangeDict2[seq2], beg2, size)
   2.150 +        isReverse1, ori1 = strandAndOrigin(rangeDict1[seq1], beg1, size)
   2.151 +        isReverse2, ori2 = strandAndOrigin(rangeDict2[seq2], beg2, size)
   2.152          for beg1, beg2, size in blocks:
   2.153 -            if beg1 < 0:
   2.154 +            if isReverse1:
   2.155                  beg1 = -(beg1 + size)
   2.156                  beg2 = -(beg2 + size)
   2.157 -            if beg2 >= 0:
   2.158 +            if isReverse1 == isReverse2:
   2.159                  drawLineForward(hits, width, bp_per_pix,
   2.160                                  ori1 + beg1, ori2 + beg2, size)
   2.161              else:
   2.162 @@ -489,10 +528,13 @@
   2.163              if len(w) > 5:
   2.164                  if len(w) > 8 and w[8].count(",") == 2:
   2.165                      color = "rgb(" + w[8] + ")"
   2.166 -                elif w[5] == "+":
   2.167 -                    color = "#ffe8e8"
   2.168 -                elif w[5] == "-":
   2.169 -                    color = "#e8e8ff"
   2.170 +                else:
   2.171 +                    strand = w[5]
   2.172 +                    isRev = rangeDict[seqName][0][2]
   2.173 +                    if strand == "+" and not isRev or strand == "-" and isRev:
   2.174 +                        color = "#ffe8e8"
   2.175 +                    if strand == "-" and not isRev or strand == "+" and isRev:
   2.176 +                        color = "#e8e8ff"
   2.177          yield layer, color, seqName, beg, end
   2.178  
   2.179  def commaSeparatedInts(text):
   2.180 @@ -537,7 +579,7 @@
   2.181              continue
   2.182          if repeatClass in ("Low_complexity", "Simple_repeat"):
   2.183              yield 200, "#fbf", seqName, beg, end
   2.184 -        elif strand == "+":
   2.185 +        elif (strand == "+") != rangeDict[seqName][0][2]:
   2.186              yield 100, "#ffe8e8", seqName, beg, end
   2.187          else:
   2.188              yield 100, "#e8e8ff", seqName, beg, end
   2.189 @@ -563,10 +605,12 @@
   2.190  
   2.191  def bedBoxes(beds, rangeDict, margin, edge, isTop, bpPerPix):
   2.192      for layer, color, seqName, bedBeg, bedEnd in beds:
   2.193 -        for rangeBeg, rangeEnd, origin in rangeDict[seqName]:
   2.194 +        for rangeBeg, rangeEnd, isReverseRange, origin in rangeDict[seqName]:
   2.195              beg = max(bedBeg, rangeBeg)
   2.196              end = min(bedEnd, rangeEnd)
   2.197              if beg >= end: continue
   2.198 +            if isReverseRange:
   2.199 +                beg, end = -end, -beg
   2.200              if layer <= 1000:
   2.201                  # include partly-covered pixels
   2.202                  b = (origin + beg) // bpPerPix
   2.203 @@ -576,8 +620,11 @@
   2.204                  b = div_ceil(origin + beg, bpPerPix)
   2.205                  e = (origin + end) // bpPerPix
   2.206                  if e <= b: continue
   2.207 -                if end == rangeEnd:  # include partly-covered end pixels
   2.208 -                    e = div_ceil(origin + end, bpPerPix)
   2.209 +                if bedEnd >= rangeEnd:  # include partly-covered end pixels
   2.210 +                    if isReverseRange:
   2.211 +                        b = (origin + beg) // bpPerPix
   2.212 +                    else:
   2.213 +                        e = div_ceil(origin + end, bpPerPix)
   2.214              if isTop:
   2.215                  box = b, margin, e, edge
   2.216              else:
   2.217 @@ -593,7 +640,7 @@
   2.218      '''Return axis labels with endpoint & sort-order information.'''
   2.219      maxWidth = end - beg
   2.220      for i, j, k in zip(labels, rangePixBegs, rangePixLens):
   2.221 -        text, textWidth, textHeight = i
   2.222 +        text, textWidth, textHeight, strandNum = i
   2.223          if textWidth > maxWidth:
   2.224              continue
   2.225          labelBeg = j + (k - textWidth) // 2
   2.226 @@ -607,7 +654,7 @@
   2.227              sortKey += maxWidth * (labelEnd - end)
   2.228              labelEnd = end
   2.229              labelBeg = end - textWidth
   2.230 -        yield sortKey, labelBeg, labelEnd, text, textHeight
   2.231 +        yield sortKey, labelBeg, labelEnd, text, textHeight, strandNum
   2.232  
   2.233  def nonoverlappingLabels(labels, minPixTweenLabels):
   2.234      '''Get a subset of non-overlapping axis labels, greedily.'''
   2.235 @@ -631,20 +678,25 @@
   2.236      image_size = (margin, end) if textRot else (end, margin)
   2.237      im = Image.new(image_mode, image_size, opts.margin_color)
   2.238      draw = ImageDraw.Draw(im)
   2.239 -    for sortKey, labelBeg, labelEnd, text, textHeight in labels:
   2.240 +    for sortKey, labelBeg, labelEnd, text, textHeight, strandNum in labels:
   2.241          base = margin - textHeight if textAln else 0
   2.242          position = (base, labelBeg) if textRot else (labelBeg, base)
   2.243 -        draw.text(position, text, font=font, fill=opts.text_color)
   2.244 +        fill = ("black", opts.forwardcolor, opts.reversecolor)[strandNum]
   2.245 +        draw.text(position, text, font=font, fill=fill)
   2.246      return im
   2.247  
   2.248 -def rangesWithOrigins(sortedRanges, rangePixBegs, bpPerPix):
   2.249 -    for i, j in zip(sortedRanges, rangePixBegs):
   2.250 -        seqName, rangeBeg, rangeEnd = i
   2.251 -        origin = bpPerPix * j - rangeBeg
   2.252 -        yield seqName, (rangeBeg, rangeEnd, origin)
   2.253 +def rangesWithOrigins(sortedRanges, rangePixBegs, rangePixLens, bpPerPix):
   2.254 +    for i, j, k in zip(sortedRanges, rangePixBegs, rangePixLens):
   2.255 +        seqName, rangeBeg, rangeEnd, strandNum = i
   2.256 +        isReverseRange = (strandNum > 1)
   2.257 +        if isReverseRange:
   2.258 +            origin = bpPerPix * (j + k) + rangeBeg
   2.259 +        else:
   2.260 +            origin = bpPerPix * j - rangeBeg
   2.261 +        yield seqName, (rangeBeg, rangeEnd, isReverseRange, origin)
   2.262  
   2.263 -def rangesPerSeq(sortedRanges, rangePixBegs, bpPerPix):
   2.264 -    a = rangesWithOrigins(sortedRanges, rangePixBegs, bpPerPix)
   2.265 +def rangesPerSeq(sortedRanges, rangePixBegs, rangePixLens, bpPerPix):
   2.266 +    a = rangesWithOrigins(sortedRanges, rangePixBegs, rangePixLens, bpPerPix)
   2.267      for k, v in itertools.groupby(a, itemgetter(0)):
   2.268          yield k, [i[1] for i in v]
   2.269  
   2.270 @@ -729,11 +781,13 @@
   2.271  
   2.272      p1 = pixelData(rangeSizes1, bpPerPix, opts.border_pixels, lMargin)
   2.273      rangePixBegs1, rangePixLens1, width = p1
   2.274 -    rangeDict1 = dict(rangesPerSeq(sortedRanges1, rangePixBegs1, bpPerPix))
   2.275 +    rangeDict1 = dict(rangesPerSeq(sortedRanges1, rangePixBegs1,
   2.276 +                                   rangePixLens1, bpPerPix))
   2.277  
   2.278      p2 = pixelData(rangeSizes2, bpPerPix, opts.border_pixels, tMargin)
   2.279      rangePixBegs2, rangePixLens2, height = p2
   2.280 -    rangeDict2 = dict(rangesPerSeq(sortedRanges2, rangePixBegs2, bpPerPix))
   2.281 +    rangeDict2 = dict(rangesPerSeq(sortedRanges2, rangePixBegs2,
   2.282 +                                   rangePixLens2, bpPerPix))
   2.283  
   2.284      warn("width:  " + str(width))
   2.285      warn("height: " + str(height))
   2.286 @@ -828,6 +882,12 @@
   2.287      op.add_option("--sort2", default="1", metavar="N",
   2.288                    help="genome2 sequence order: 0=input order, 1=name order, "
   2.289                    "2=length order, 3=alignment order (default=%default)")
   2.290 +    op.add_option("--strands1", default="0", metavar="N", help=
   2.291 +                  "genome1 sequence orientation: 0=forward orientation, "
   2.292 +                  "1=alignment orientation (default=%default)")
   2.293 +    op.add_option("--strands2", default="0", metavar="N", help=
   2.294 +                  "genome2 sequence orientation: 0=forward orientation, "
   2.295 +                  "1=alignment orientation (default=%default)")
   2.296      op.add_option("--max-gap1", metavar="FRAC", default="0.5,2", help=
   2.297                    "maximum unaligned (end,mid) gap in genome1: "
   2.298                    "fraction of aligned length (default=%default)")
   2.299 @@ -897,7 +957,6 @@
   2.300      (opts, args) = op.parse_args()
   2.301      if len(args) != 2: op.error("2 arguments needed")
   2.302  
   2.303 -    opts.text_color = "black"
   2.304      opts.background_color = "white"
   2.305      opts.label_space = 5     # minimum number of pixels between axis labels
   2.306