Added trim options to last-dotplot.
authorMartin C. Frith
Thu Mar 02 10:10:51 2017 +0900 (2017-03-02)
changeset 839bbc6f00e683b
parent 838 7b972338fe04
child 840 85a72978fb7d
Added trim options to last-dotplot.
doc/last-dotplot.txt
scripts/last-dotplot
     1.1 --- a/doc/last-dotplot.txt	Thu Mar 02 09:25:14 2017 +0900
     1.2 +++ b/doc/last-dotplot.txt	Thu Mar 02 10:10:51 2017 +0900
     1.3 @@ -54,30 +54,26 @@
     1.4  
     1.5    -h, --help
     1.6        Show a help message, with default option values, and exit.
     1.7 -
     1.8    -1 PATTERN, --seq1=PATTERN
     1.9        Which sequences to show from the 1st genome.
    1.10 -
    1.11    -2 PATTERN, --seq2=PATTERN
    1.12        Which sequences to show from the 2nd genome.
    1.13 -
    1.14    -x WIDTH, --width=WIDTH
    1.15        Maximum width in pixels.
    1.16 -
    1.17    -y HEIGHT, --height=HEIGHT
    1.18        Maximum height in pixels.
    1.19 -
    1.20    -f FILE, --fontfile=FILE
    1.21        TrueType or OpenType font file.
    1.22 -
    1.23    -s SIZE, --fontsize=SIZE
    1.24        TrueType or OpenType font size.
    1.25 -
    1.26    -c COLOR, --forwardcolor=COLOR
    1.27        Color for forward alignments.
    1.28 -
    1.29    -r COLOR, --reversecolor=COLOR
    1.30        Color for reverse alignments.
    1.31 +  --trim1
    1.32 +      Trim unaligned sequence flanks from the 1st genome.
    1.33 +  --trim2
    1.34 +      Trim unaligned sequence flanks from the 2nd genome.
    1.35  
    1.36  Unsequenced gap options
    1.37  ~~~~~~~~~~~~~~~~~~~~~~~
     2.1 --- a/scripts/last-dotplot	Thu Mar 02 09:25:14 2017 +0900
     2.2 +++ b/scripts/last-dotplot	Thu Mar 02 10:10:51 2017 +0900
     2.3 @@ -64,7 +64,7 @@
     2.4              if w[4] == "-": beg1 -= seqlen1
     2.5              chr2, beg2, seqlen2 = w[6], int(w[7]), int(w[10])
     2.6              if w[9] == "-": beg2 -= seqlen2
     2.7 -            blocks = tabBlocks(beg1, beg2, w[11])
     2.8 +            blocks = list(tabBlocks(beg1, beg2, w[11]))
     2.9              yield chr1, seqlen1, chr2, seqlen2, blocks
    2.10          elif line[0] == "s":  # MAF format
    2.11              if mafCount == 0:
    2.12 @@ -74,7 +74,7 @@
    2.13              else:
    2.14                  chr2, beg2, seqlen2, seq2 = w[1], int(w[2]), int(w[5]), w[6]
    2.15                  if w[4] == "-": beg2 -= seqlen2
    2.16 -                blocks = mafBlocks(beg1, beg2, seq1, seq2)
    2.17 +                blocks = list(mafBlocks(beg1, beg2, seq1, seq2))
    2.18                  yield chr1, seqlen1, chr2, seqlen2, blocks
    2.19                  mafCount = 0
    2.20  
    2.21 @@ -86,20 +86,33 @@
    2.22          if fnmatch.fnmatchcase(base, i): return True
    2.23      return False
    2.24  
    2.25 +def updateSeqLimits(isTrim, seqLimits, seqName, seqLen, blocks, index):
    2.26 +    if isTrim:
    2.27 +        beg = blocks[0][index]
    2.28 +        end = blocks[-1][index] + blocks[-1][2]
    2.29 +        if beg < 0: beg, end = -end, -beg
    2.30 +        if seqName in seqLimits:
    2.31 +            b, e = seqLimits[seqName]
    2.32 +            seqLimits[seqName] = min(b, beg), max(e, end)
    2.33 +        else:
    2.34 +            seqLimits[seqName] = beg, end
    2.35 +    else:
    2.36 +        seqLimits[seqName] = 0, seqLen
    2.37 +
    2.38  def readAlignments(fileName, opts):
    2.39 -    '''Get alignments and sequence lengths, from MAF or tabular format.'''
    2.40 +    '''Get alignments and sequence limits, from MAF or tabular format.'''
    2.41      alignments = []
    2.42 -    seqLengths1 = {}
    2.43 -    seqLengths2 = {}
    2.44 +    seqLimits1 = {}
    2.45 +    seqLimits2 = {}
    2.46      lines = fileinput.input(fileName)
    2.47      for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
    2.48          if not isWantedSequenceName(seqName1, opts.seq1): continue
    2.49          if not isWantedSequenceName(seqName2, opts.seq2): continue
    2.50          aln = seqName1, seqName2, blocks
    2.51          alignments.append(aln)
    2.52 -        seqLengths1[seqName1] = seqLen1
    2.53 -        seqLengths2[seqName2] = seqLen2
    2.54 -    return alignments, seqLengths1, seqLengths2
    2.55 +        updateSeqLimits(opts.trim1, seqLimits1, seqName1, seqLen1, blocks, 0)
    2.56 +        updateSeqLimits(opts.trim2, seqLimits2, seqName2, seqLen2, blocks, 1)
    2.57 +    return alignments, seqLimits1, seqLimits2
    2.58  
    2.59  def natural_sort_key(my_string):
    2.60      '''Return a sort key for "natural" ordering, e.g. chr9 < chr10.'''
    2.61 @@ -115,11 +128,11 @@
    2.62      draw = ImageDraw.Draw(im)
    2.63      return [draw.textsize(i, font=font) for i in my_strings]
    2.64  
    2.65 -def get_seq_info(seq_size_dic, font, fontsize, image_mode):
    2.66 +def get_seq_info(seqLimits, font, fontsize, image_mode):
    2.67      '''Return miscellaneous information about the sequences.'''
    2.68 -    seqNames = seq_size_dic.keys()
    2.69 +    seqNames = seqLimits.keys()
    2.70      seqNames.sort(key=natural_sort_key)
    2.71 -    seq_sizes = [seq_size_dic[i] for i in seqNames]
    2.72 +    seq_sizes = [seqLimits[i][1] - seqLimits[i][0] for i in seqNames]
    2.73      name_sizes = get_text_sizes(seqNames, font, fontsize, image_mode)
    2.74      margin = max(zip(*name_sizes)[1])  # maximum text height
    2.75      return seqNames, seq_sizes, name_sizes, margin
    2.76 @@ -213,26 +226,31 @@
    2.77  def isExtraFirstGapField(fields):
    2.78      return fields[4].isdigit()
    2.79  
    2.80 -def readGaps(fileName):
    2.81 +def readGaps(fileName, seqLimits):
    2.82      '''Read locations of unsequenced gaps, from an agp or gap file.'''
    2.83      if not fileName: return
    2.84 +    seqLimits = expandedSeqDict(seqLimits)
    2.85      for line in fileinput.input(fileName):
    2.86          w = line.split()
    2.87          if not w or w[0][0] == "#": continue
    2.88          if isExtraFirstGapField(w): w = w[1:]
    2.89          if w[4] not in "NU": continue
    2.90          seqName = w[0]
    2.91 +        if seqName not in seqLimits: continue
    2.92 +        cropBeg, cropEnd = seqLimits[seqName]
    2.93          end = int(w[2])
    2.94          beg = end - int(w[5])  # zero-based coordinate
    2.95 +        b = max(beg, cropBeg)
    2.96 +        e = min(end, cropEnd)
    2.97 +        if b >= e: continue
    2.98          bridgedText = w[7]
    2.99 -        yield seqName, beg, end, bridgedText
   2.100 +        yield seqName, b, e, bridgedText
   2.101  
   2.102  def drawUnsequencedGaps(im, gaps, origins, margin, limit, isTop, bridgedText,
   2.103                          bp_per_pix, color):
   2.104      '''Draw rectangles representing unsequenced gaps.'''
   2.105      for seqName, beg, end, b in gaps:
   2.106          if b != bridgedText: continue
   2.107 -        if seqName not in origins: continue
   2.108          ori = origins[seqName]
   2.109          b = div_ceil(ori + beg, bp_per_pix)  # use fully-covered pixels only
   2.110          e = (ori + end) // bp_per_pix
   2.111 @@ -277,9 +295,9 @@
   2.112          draw.text(position, i[3], font=font, fill=opts.text_color)
   2.113      return im
   2.114  
   2.115 -def seqOrigins(seqNames, seq_starts, bp_per_pix):
   2.116 +def seqOrigins(seqNames, seq_starts, seqLimits, bp_per_pix):
   2.117      for i, j in zip(seqNames, seq_starts):
   2.118 -        yield i, bp_per_pix * j
   2.119 +        yield i, bp_per_pix * j - seqLimits[i][0]
   2.120  
   2.121  def lastDotplot(opts, args):
   2.122      if opts.fontfile:  font = ImageFont.truetype(opts.fontfile, opts.fontsize)
   2.123 @@ -292,13 +310,13 @@
   2.124      overlap_color = tuple([(i + j) // 2 for i, j in zipped_colors])
   2.125  
   2.126      warn("reading alignments...")
   2.127 -    alignments, seq_size_dic1, seq_size_dic2 = readAlignments(args[0], opts)
   2.128 +    alignments, seqLimits1, seqLimits2 = readAlignments(args[0], opts)
   2.129      warn("done")
   2.130  
   2.131      if not alignments: raise Exception("there are no alignments")
   2.132  
   2.133 -    seq_info1 = get_seq_info(seq_size_dic1, font, opts.fontsize, image_mode)
   2.134 -    seq_info2 = get_seq_info(seq_size_dic2, font, opts.fontsize, image_mode)
   2.135 +    seq_info1 = get_seq_info(seqLimits1, font, opts.fontsize, image_mode)
   2.136 +    seq_info2 = get_seq_info(seqLimits2, font, opts.fontsize, image_mode)
   2.137      seqNames1, seq_sizes1, name_sizes1, margin1 = seq_info1
   2.138      seqNames2, seq_sizes2, name_sizes2, margin2 = seq_info2
   2.139  
   2.140 @@ -314,8 +332,9 @@
   2.141                                                   opts.pix_tween_seqs, margin1)
   2.142      seq_pix2, seq_starts2, height = get_pix_info(seq_sizes2, bp_per_pix,
   2.143                                                   opts.pix_tween_seqs, margin2)
   2.144 -    origins1 = dict(seqOrigins(seqNames1, seq_starts1, bp_per_pix))
   2.145 -    origins2 = dict(seqOrigins(seqNames2, seq_starts2, bp_per_pix))
   2.146 +
   2.147 +    origins1 = dict(seqOrigins(seqNames1, seq_starts1, seqLimits1, bp_per_pix))
   2.148 +    origins2 = dict(seqOrigins(seqNames2, seq_starts2, seqLimits2, bp_per_pix))
   2.149  
   2.150      warn("processing alignments...")
   2.151      hits = alignmentPixels(width, height, alignments, bp_per_pix,
   2.152 @@ -327,8 +346,8 @@
   2.153  
   2.154      origins1 = expandedSeqDict(origins1)
   2.155      origins2 = expandedSeqDict(origins2)
   2.156 -    gaps1 = list(readGaps(opts.gap1))
   2.157 -    gaps2 = list(readGaps(opts.gap2))
   2.158 +    gaps1 = list(readGaps(opts.gap1, seqLimits1))
   2.159 +    gaps2 = list(readGaps(opts.gap2, seqLimits2))
   2.160      # draw bridged gaps first, then unbridged gaps on top:
   2.161      drawUnsequencedGaps(im, gaps1, origins1, margin2, height, True, "yes",
   2.162                          bp_per_pix, opts.bridged_color)
   2.163 @@ -390,6 +409,10 @@
   2.164                    help="color for forward alignments (default: %default)")
   2.165      op.add_option("-r", "--reversecolor", metavar="COLOR", default="blue",
   2.166                    help="color for reverse alignments (default: %default)")
   2.167 +    op.add_option("--trim1", action="store_true",
   2.168 +                  help="trim unaligned sequence flanks from the 1st genome")
   2.169 +    op.add_option("--trim2", action="store_true",
   2.170 +                  help="trim unaligned sequence flanks from the 2nd genome")
   2.171      og = optparse.OptionGroup(op, "Unsequenced gap options")
   2.172      og.add_option("--gap1", metavar="FILE",
   2.173                    help="read genome1 unsequenced gaps from agp or gap file")