last-dotplot: add alignment-order options
authorMartin C. Frith
Tue Dec 05 17:57:24 2017 +0900 (2017-12-05)
changeset 914f4ca19d126e3
parent 913 e87abc7ae9c6
child 915 6079fabb49a5
last-dotplot: add alignment-order options
doc/last-dotplot.txt
scripts/last-dotplot
     1.1 --- a/doc/last-dotplot.txt	Tue Dec 05 13:41:22 2017 +0900
     1.2 +++ b/doc/last-dotplot.txt	Tue Dec 05 17:57:24 2017 +0900
     1.3 @@ -58,9 +58,8 @@
     1.4  Fonts
     1.5  -----
     1.6  
     1.7 -last-dotplot tries to find a nice font on your computer, but may fail,
     1.8 -in which case it will use an ugly font.  You can specify a font like
     1.9 -this::
    1.10 +last-dotplot tries to find a nice font on your computer, but may fail
    1.11 +and use an ugly font.  You can specify a font like this::
    1.12  
    1.13    last-dotplot -f /usr/share/fonts/liberation/LiberationSans-Regular.ttf alns alns.png
    1.14  
    1.15 @@ -91,12 +90,15 @@
    1.16        secondary alignments that are near the primary alignments.
    1.17    --sort1=N
    1.18        Put the 1st genome's sequences left-to-right in order of: their
    1.19 -      appearance in the input (0), their names (1), their lengths (2).
    1.20 -      You can use two colon-separated values, e.g. "2:1" to specify 2
    1.21 -      for primary and 1 for secondary alignments.
    1.22 +      appearance in the input (0), their names (1), their lengths (2),
    1.23 +      the top-to-bottom order of (the midpoints of) their alignments
    1.24 +      (3).  You can use two colon-separated values, e.g. "2:1" to
    1.25 +      specify 2 for primary and 1 for secondary alignments.
    1.26    --sort2=N
    1.27        Put the 2nd genome's sequences top-to-bottom in order of: their
    1.28 -      appearance in the input (0), their names (1), their lengths (2).
    1.29 +      appearance in the input (0), their names (1), their lengths (2),
    1.30 +      the left-to-right order of (the midpoints of) their alignments
    1.31 +      (3).
    1.32    --max-gap1=FRAC
    1.33        Maximum unaligned gap in the 1st genome.  For example, if two
    1.34        parts of one DNA read align to widely-separated parts of one
     2.1 --- a/scripts/last-dotplot	Tue Dec 05 13:41:22 2017 +0900
     2.2 +++ b/scripts/last-dotplot	Tue Dec 05 17:57:24 2017 +0900
     2.3 @@ -9,6 +9,7 @@
     2.4  # according to the number of aligned nt-pairs within it, but the
     2.5  # result is too faint.  How can this be done better?
     2.6  
     2.7 +import functools
     2.8  import gzip
     2.9  from fnmatch import fnmatchcase
    2.10  from operator import itemgetter
    2.11 @@ -269,18 +270,65 @@
    2.12  def sizeKey(oneSeqRanges):
    2.13      return sum(b - e for n, b, e in oneSeqRanges), nameKey(oneSeqRanges)
    2.14  
    2.15 -def mySortedRanges(seqRanges, sortOpt):
    2.16 +def alignmentKey(seqNamesToLists, oneSeqRanges):
    2.17 +    seqName = oneSeqRanges[0][0]
    2.18 +    alignmentsOfThisSequence = seqNamesToLists[seqName]
    2.19 +    numOfAlignedLetterPairs = sum(i[3] for i in alignmentsOfThisSequence)
    2.20 +    toMiddle = numOfAlignedLetterPairs // 2
    2.21 +    for i in alignmentsOfThisSequence:
    2.22 +        toMiddle -= i[3]
    2.23 +        if toMiddle < 0:
    2.24 +            return i[1:3]  # sequence-rank and "position" of this alignment
    2.25 +
    2.26 +def alignmentSortData1(alignments, seqNamesToRanks2):
    2.27 +    for seqName1, seqName2, blocks in alignments:
    2.28 +        seqRank2 = seqNamesToRanks2[seqName2]
    2.29 +        pos2 = abs(blocks[0][1] + blocks[-1][1] + blocks[-1][2])
    2.30 +        numOfAlignedLetterPairs = sum(i[2] for i in blocks)
    2.31 +        yield seqName1, seqRank2, pos2, numOfAlignedLetterPairs
    2.32 +
    2.33 +def alignmentSortData2(alignments, seqNamesToRanks1):
    2.34 +    for seqName1, seqName2, blocks in alignments:
    2.35 +        seqRank1 = seqNamesToRanks1[seqName1]
    2.36 +        pos1 = abs(blocks[0][0] + blocks[-1][0] + blocks[-1][2])
    2.37 +        numOfAlignedLetterPairs = sum(i[2] for i in blocks)
    2.38 +        yield seqName2, seqRank1, pos1, numOfAlignedLetterPairs
    2.39 +
    2.40 +def mySortedRanges(seqRanges, sortOpt,
    2.41 +                   alignmentSortDataFunc, alignments, otherRanges):
    2.42      rangesGroupedBySeqName = itertools.groupby(seqRanges, itemgetter(0))
    2.43      g = [list(ranges) for seqName, ranges in rangesGroupedBySeqName]
    2.44      if sortOpt == "1":
    2.45          g.sort(key=nameKey)
    2.46      if sortOpt == "2":
    2.47          g.sort(key=sizeKey)
    2.48 +    if sortOpt == "3":
    2.49 +        otherNameGroups = itertools.groupby(i[0] for i in otherRanges)
    2.50 +        ranksAndNames = enumerate(i[0] for i in otherNameGroups)
    2.51 +        otherNamesToRanks = dict((n, r) for r, n in ranksAndNames)
    2.52 +        alns = sorted(alignmentSortDataFunc(alignments, otherNamesToRanks))
    2.53 +        alnsGroupedBySeqName = itertools.groupby(alns, itemgetter(0))
    2.54 +        seqNamesToLists = dict((k, list(v)) for k, v in alnsGroupedBySeqName)
    2.55 +        g.sort(key=functools.partial(alignmentKey, seqNamesToLists))
    2.56      return [j for i in g for j in i]
    2.57  
    2.58 -def allSortedRanges(seqRanges, seqRangesB, sortOpt):
    2.59 -    x, y = twoValuesFromOption(sortOpt, ":")
    2.60 -    return mySortedRanges(seqRanges, x) + mySortedRanges(seqRangesB, y)
    2.61 +def allSortedRanges(opts, alignments, alignmentsB,
    2.62 +                    seqRanges1, seqRangesB1, seqRanges2, seqRangesB2):
    2.63 +    o1, oB1 = twoValuesFromOption(opts.sort1, ":")
    2.64 +    o2, oB2 = twoValuesFromOption(opts.sort2, ":")
    2.65 +    if o1 == "3" and o2 == "3":
    2.66 +        raise Exception("the sort options have circular dependency")
    2.67 +    if o1 != "3":
    2.68 +        s1 = mySortedRanges(seqRanges1, o1, None, None, None)
    2.69 +    if o2 != "3":
    2.70 +        s2 = mySortedRanges(seqRanges2, o2, None, None, None)
    2.71 +    if o1 == "3":
    2.72 +        s1 = mySortedRanges(seqRanges1, o1, alignmentSortData1, alignments, s2)
    2.73 +    if o2 == "3":
    2.74 +        s2 = mySortedRanges(seqRanges2, o2, alignmentSortData2, alignments, s1)
    2.75 +    t1 = mySortedRanges(seqRangesB1, oB1, alignmentSortData1, alignmentsB, s2)
    2.76 +    t2 = mySortedRanges(seqRangesB2, oB2, alignmentSortData2, alignmentsB, s1)
    2.77 +    return s1 + t1, s2 + t2
    2.78  
    2.79  def prettyNum(n):
    2.80      t = str(n)
    2.81 @@ -323,9 +371,7 @@
    2.82                  x, y = y, x
    2.83          yield text, x, y
    2.84  
    2.85 -def dataFromRanges(cutRanges, cutRangesB, sortOpt, font,
    2.86 -                   fontSize, imageMode, labelOpt, textRot):
    2.87 -    sortedRanges = allSortedRanges(cutRanges, cutRangesB, sortOpt)
    2.88 +def dataFromRanges(sortedRanges, font, fontSize, imageMode, labelOpt, textRot):
    2.89      for i in sortedRanges:
    2.90          warn("\t".join(map(str, i)))
    2.91      warn("")
    2.92 @@ -334,7 +380,7 @@
    2.93                              imageMode, textRot))
    2.94      margin = max(i[2] for i in labs)
    2.95      # xxx the margin may be too big, because some labels may get omitted
    2.96 -    return sortedRanges, rangeSizes, labs, margin
    2.97 +    return rangeSizes, labs, margin
    2.98  
    2.99  def div_ceil(x, y):
   2.100      '''Return x / y rounded up.'''
   2.101 @@ -664,15 +710,20 @@
   2.102      cutRangesB2 = trimmed(seqRangesB2, coverDictB2, minAlignedBases,
   2.103                            maxGapB2, 0, 0)
   2.104  
   2.105 +    warn("sorting...")
   2.106 +    sortOut = allSortedRanges(opts, alignments, alignmentsB,
   2.107 +                              cutRanges1, cutRangesB1, cutRanges2, cutRangesB2)
   2.108 +    sortedRanges1, sortedRanges2 = sortOut
   2.109 +
   2.110      textRot1 = "vertical".startswith(opts.rot1)
   2.111 -    i1 = dataFromRanges(cutRanges1, cutRangesB1, opts.sort1, font,
   2.112 +    i1 = dataFromRanges(sortedRanges1, font,
   2.113                          opts.fontsize, image_mode, opts.labels1, textRot1)
   2.114 -    sortedRanges1, rangeSizes1, labelData1, tMargin = i1
   2.115 +    rangeSizes1, labelData1, tMargin = i1
   2.116  
   2.117      textRot2 = "horizontal".startswith(opts.rot2)
   2.118 -    i2 = dataFromRanges(cutRanges2, cutRangesB2, opts.sort2, font,
   2.119 +    i2 = dataFromRanges(sortedRanges2, font,
   2.120                          opts.fontsize, image_mode, opts.labels2, textRot2)
   2.121 -    sortedRanges2, rangeSizes2, labelData2, lMargin = i2
   2.122 +    rangeSizes2, labelData2, lMargin = i2
   2.123  
   2.124      maxPixels1 = opts.width  - lMargin
   2.125      maxPixels2 = opts.height - tMargin
   2.126 @@ -778,10 +829,10 @@
   2.127      op.add_option("--alignments", metavar="FILE", help="secondary alignments")
   2.128      op.add_option("--sort1", default="1", metavar="N",
   2.129                    help="genome1 sequence order: 0=input order, 1=name order, "
   2.130 -                  "2=length order (default=%default)")
   2.131 +                  "2=length order, 3=alignment order (default=%default)")
   2.132      op.add_option("--sort2", default="1", metavar="N",
   2.133                    help="genome2 sequence order: 0=input order, 1=name order, "
   2.134 -                  "2=length order (default=%default)")
   2.135 +                  "2=length order, 3=alignment order (default=%default)")
   2.136      op.add_option("--max-gap1", metavar="FRAC", default="0.5,2", help=
   2.137                    "maximum unaligned (end,mid) gap in genome1: "
   2.138                    "fraction of aligned length (default=%default)")