Add last-dotplot secondary alignments option
authorMartin C. Frith
Wed Nov 08 17:59:10 2017 +0900 (2017-11-08)
changeset 9114080f3f6f73d
parent 910 42653029d900
child 912 8834139fa8a8
Add last-dotplot secondary alignments option
doc/last-dotplot.txt
scripts/last-dotplot
     1.1 --- a/doc/last-dotplot.txt	Wed Nov 08 17:02:56 2017 +0900
     1.2 +++ b/doc/last-dotplot.txt	Wed Nov 08 17:59:10 2017 +0900
     1.3 @@ -83,12 +83,35 @@
     1.4        Color for forward alignments.
     1.5    -r COLOR, --reversecolor=COLOR
     1.6        Color for reverse alignments.
     1.7 +  --alignments=FILE
     1.8 +      Read secondary alignments.  For example: we could use primary
     1.9 +      alignment data with one human DNA read aligned to the human
    1.10 +      genome, and secondary alignment data with the whole chimpanzee
    1.11 +      versus human genomes.  last-dotplot will show the parts of the
    1.12 +      secondary alignments that are near the primary alignments.
    1.13    --sort1=N
    1.14        Put the 1st genome's sequences left-to-right in order of: their
    1.15        appearance in the input (0), their names (1), their lengths (2).
    1.16 +      You can use two colon-separated values, e.g. "2:1" to specify 2
    1.17 +      for primary and 1 for secondary alignments.
    1.18    --sort2=N
    1.19        Put the 2nd genome's sequences top-to-bottom in order of: their
    1.20        appearance in the input (0), their names (1), their lengths (2).
    1.21 +  --max-gap1=FRAC
    1.22 +      Maximum unaligned gap in the 1st genome.  For example, if two
    1.23 +      parts of one DNA read align to widely-separated parts of one
    1.24 +      chromosome, it's probably best to cut the intervening region
    1.25 +      from the dotplot.  FRAC is a fraction of the length of the
    1.26 +      (primary) alignments.  You can specify "inf" to keep all
    1.27 +      unaligned gaps.  You can use two comma-separated values,
    1.28 +      e.g. "0.5,3" to specify 0.5 for end-gaps (unaligned sequence
    1.29 +      ends) and 3 for mid-gaps (between alignments).  You can use two
    1.30 +      colon-separated values (each of which may be comma-separated)
    1.31 +      for primary and secondary alignments.
    1.32 +  --max-gap2=FRAC
    1.33 +      Maximum unaligned gap in the 2nd genome.
    1.34 +  --pad=FRAC
    1.35 +      Length of pad to leave when cutting unaligned gaps.
    1.36    --border-pixels=INT
    1.37        Number of pixels between sequences.
    1.38    --border-color=COLOR
     2.1 --- a/scripts/last-dotplot	Wed Nov 08 17:02:56 2017 +0900
     2.2 +++ b/scripts/last-dotplot	Wed Nov 08 17:59:10 2017 +0900
     2.3 @@ -33,6 +33,10 @@
     2.4          prog = os.path.basename(sys.argv[0])
     2.5          sys.stderr.write(prog + ": " + message + "\n")
     2.6  
     2.7 +def groupByFirstItem(things):
     2.8 +    for k, v in itertools.groupby(things, itemgetter(0)):
     2.9 +        yield k, [i[1:] for i in v]
    2.10 +
    2.11  def croppedBlocks(blocks, ranges1, ranges2):
    2.12      headBeg1, headBeg2, headSize = blocks[0]
    2.13      for r1 in ranges1:
    2.14 @@ -167,6 +171,48 @@
    2.15          updateSeqs(coverDict2, seqRanges2, seqName2, ranges2, coveredRange2)
    2.16      return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
    2.17  
    2.18 +def nameAndRangesFromDict(cropDict, seqName):
    2.19 +    if seqName in cropDict:
    2.20 +        return seqName, cropDict[seqName]
    2.21 +    n = seqName.split(".")[-1]
    2.22 +    if n in cropDict:
    2.23 +        return n, cropDict[n]
    2.24 +    return seqName, []
    2.25 +
    2.26 +def rangesForSecondaryAlignments(primaryRanges, seqLen):
    2.27 +    if primaryRanges:
    2.28 +        return primaryRanges
    2.29 +    return [(0, seqLen)]
    2.30 +
    2.31 +def readSecondaryAlignments(opts, cropRanges1, cropRanges2):
    2.32 +    cropDict1 = dict(groupByFirstItem(cropRanges1))
    2.33 +    cropDict2 = dict(groupByFirstItem(cropRanges2))
    2.34 +
    2.35 +    alignments = []
    2.36 +    seqRanges1 = []
    2.37 +    seqRanges2 = []
    2.38 +    coverDict1 = {}
    2.39 +    coverDict2 = {}
    2.40 +    lines = myOpen(opts.alignments)
    2.41 +    for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
    2.42 +        seqName1, ranges1 = nameAndRangesFromDict(cropDict1, seqName1)
    2.43 +        seqName2, ranges2 = nameAndRangesFromDict(cropDict2, seqName2)
    2.44 +        if not ranges1 and not ranges2:
    2.45 +            continue
    2.46 +        r1 = rangesForSecondaryAlignments(ranges1, seqLen1)
    2.47 +        r2 = rangesForSecondaryAlignments(ranges2, seqLen2)
    2.48 +        b = list(croppedBlocks(list(blocks), r1, r2))
    2.49 +        if not b: continue
    2.50 +        aln = seqName1, seqName2, b
    2.51 +        alignments.append(aln)
    2.52 +        if not ranges1:
    2.53 +            coveredRange1 = b[0][0], b[-1][0] + b[-1][2]
    2.54 +            updateSeqs(coverDict1, seqRanges1, seqName1, r1, coveredRange1)
    2.55 +        if not ranges2:
    2.56 +            coveredRange2 = b[0][1], b[-1][1] + b[-1][2]
    2.57 +            updateSeqs(coverDict2, seqRanges2, seqName2, r2, coveredRange2)
    2.58 +    return alignments, seqRanges1, coverDict1, seqRanges2, coverDict2
    2.59 +
    2.60  def twoValuesFromOption(text, separator):
    2.61      if separator in text:
    2.62          return text.split(separator)
    2.63 @@ -231,6 +277,10 @@
    2.64          g.sort(key=sizeKey)
    2.65      return [j for i in g for j in i]
    2.66  
    2.67 +def allSortedRanges(seqRanges, seqRangesB, sortOpt):
    2.68 +    x, y = twoValuesFromOption(sortOpt, ":")
    2.69 +    return getSortedRanges(seqRanges, x) + getSortedRanges(seqRangesB, y)
    2.70 +
    2.71  def prettyNum(n):
    2.72      t = str(n)
    2.73      groups = []
    2.74 @@ -272,9 +322,9 @@
    2.75                  x, y = y, x
    2.76          yield text, x, y
    2.77  
    2.78 -def dataFromRanges(seqRanges, sortOpt, font,
    2.79 +def dataFromRanges(cutRanges, cutRangesB, sortOpt, font,
    2.80                     fontsize, image_mode, labelOpt, textRot):
    2.81 -    s = getSortedRanges(seqRanges, sortOpt)
    2.82 +    s = allSortedRanges(cutRanges, cutRangesB, sortOpt)
    2.83      for i in s:
    2.84          warn("\t".join(map(str, i)))
    2.85      warn("")
    2.86 @@ -582,6 +632,9 @@
    2.87      zipped_colors = zip(forward_color, reverse_color)
    2.88      overlap_color = tuple([(i + j) // 2 for i, j in zipped_colors])
    2.89  
    2.90 +    maxGap1, maxGapB1 = twoValuesFromOption(opts.max_gap1, ":")
    2.91 +    maxGap2, maxGapB2 = twoValuesFromOption(opts.max_gap2, ":")
    2.92 +
    2.93      warn("reading alignments...")
    2.94      alnData = readAlignments(args[0], opts)
    2.95      alignments, seqRanges1, coverDict1, seqRanges2, coverDict2 = alnData
    2.96 @@ -592,17 +645,28 @@
    2.97      minAlignedBases = min(coveredLength(coverDict1), coveredLength(coverDict2))
    2.98      pad = int(opts.pad * minAlignedBases)
    2.99      cutRanges1 = list(trimmed(seqRanges1, coverDict1, minAlignedBases,
   2.100 -                              opts.max_gap1, pad, pad))
   2.101 +                              maxGap1, pad, pad))
   2.102      cutRanges2 = list(trimmed(seqRanges2, coverDict2, minAlignedBases,
   2.103 -                              opts.max_gap2, pad, pad))
   2.104 +                              maxGap2, pad, pad))
   2.105 +
   2.106 +    warn("reading secondary alignments...")
   2.107 +    alnDataB = readSecondaryAlignments(opts, cutRanges1, cutRanges2)
   2.108 +    alignmentsB, seqRangesB1, coverDictB1, seqRangesB2, coverDictB2 = alnDataB
   2.109 +    warn("cutting...")
   2.110 +    coverDictB1 = dict(mergedRangesPerSeq(coverDictB1))
   2.111 +    coverDictB2 = dict(mergedRangesPerSeq(coverDictB2))
   2.112 +    cutRangesB1 = trimmed(seqRangesB1, coverDictB1, minAlignedBases,
   2.113 +                          maxGapB1, 0, 0)
   2.114 +    cutRangesB2 = trimmed(seqRangesB2, coverDictB2, minAlignedBases,
   2.115 +                          maxGapB2, 0, 0)
   2.116  
   2.117      textRot1 = "vertical".startswith(opts.rot1)
   2.118 -    i1 = dataFromRanges(cutRanges1, opts.sort1, font,
   2.119 +    i1 = dataFromRanges(cutRanges1, cutRangesB1, opts.sort1, font,
   2.120                          opts.fontsize, image_mode, opts.labels1, textRot1)
   2.121      sortedRanges1, rangeSizes1, labelData1, tMargin = i1
   2.122  
   2.123      textRot2 = "horizontal".startswith(opts.rot2)
   2.124 -    i2 = dataFromRanges(cutRanges2, opts.sort2, font,
   2.125 +    i2 = dataFromRanges(cutRanges2, cutRangesB2, opts.sort2, font,
   2.126                          opts.fontsize, image_mode, opts.labels2, textRot2)
   2.127      sortedRanges2, rangeSizes2, labelData2, lMargin = i2
   2.128  
   2.129 @@ -625,7 +689,7 @@
   2.130      warn("height: " + str(height))
   2.131  
   2.132      warn("processing alignments...")
   2.133 -    hits = alignmentPixels(width, height, alignments, bpPerPix,
   2.134 +    hits = alignmentPixels(width, height, alignments + alignmentsB, bpPerPix,
   2.135                             rangeDict1, rangeDict2)
   2.136  
   2.137      warn("reading annotations...")
   2.138 @@ -707,6 +771,7 @@
   2.139                    help="color for forward alignments (default: %default)")
   2.140      op.add_option("-r", "--reversecolor", metavar="COLOR", default="blue",
   2.141                    help="color for reverse alignments (default: %default)")
   2.142 +    op.add_option("--alignments", metavar="FILE", help="secondary alignments")
   2.143      op.add_option("--sort1", default="1", metavar="N",
   2.144                    help="genome1 sequence order: 0=input order, 1=name order, "
   2.145                    "2=length order (default=%default)")
   2.146 @@ -726,6 +791,7 @@
   2.147                    help="number of pixels between sequences (default=%default)")
   2.148      op.add_option("--border-color", metavar="COLOR", default="black",
   2.149                    help="color for pixels between sequences (default=%default)")
   2.150 +    # --break-color and/or --break-pixels for intra-sequence breaks?
   2.151      op.add_option("--margin-color", metavar="COLOR", default="#dcdcdc",
   2.152                    help="margin color")
   2.153