scripts/last-dotplot
changeset 839 bbc6f00e683b
parent 838 7b972338fe04
child 840 85a72978fb7d
     1.1 --- a/scripts/last-dotplot	Thu Mar 02 09:25:14 2017 +0900
     1.2 +++ b/scripts/last-dotplot	Thu Mar 02 10:10:51 2017 +0900
     1.3 @@ -64,7 +64,7 @@
     1.4              if w[4] == "-": beg1 -= seqlen1
     1.5              chr2, beg2, seqlen2 = w[6], int(w[7]), int(w[10])
     1.6              if w[9] == "-": beg2 -= seqlen2
     1.7 -            blocks = tabBlocks(beg1, beg2, w[11])
     1.8 +            blocks = list(tabBlocks(beg1, beg2, w[11]))
     1.9              yield chr1, seqlen1, chr2, seqlen2, blocks
    1.10          elif line[0] == "s":  # MAF format
    1.11              if mafCount == 0:
    1.12 @@ -74,7 +74,7 @@
    1.13              else:
    1.14                  chr2, beg2, seqlen2, seq2 = w[1], int(w[2]), int(w[5]), w[6]
    1.15                  if w[4] == "-": beg2 -= seqlen2
    1.16 -                blocks = mafBlocks(beg1, beg2, seq1, seq2)
    1.17 +                blocks = list(mafBlocks(beg1, beg2, seq1, seq2))
    1.18                  yield chr1, seqlen1, chr2, seqlen2, blocks
    1.19                  mafCount = 0
    1.20  
    1.21 @@ -86,20 +86,33 @@
    1.22          if fnmatch.fnmatchcase(base, i): return True
    1.23      return False
    1.24  
    1.25 +def updateSeqLimits(isTrim, seqLimits, seqName, seqLen, blocks, index):
    1.26 +    if isTrim:
    1.27 +        beg = blocks[0][index]
    1.28 +        end = blocks[-1][index] + blocks[-1][2]
    1.29 +        if beg < 0: beg, end = -end, -beg
    1.30 +        if seqName in seqLimits:
    1.31 +            b, e = seqLimits[seqName]
    1.32 +            seqLimits[seqName] = min(b, beg), max(e, end)
    1.33 +        else:
    1.34 +            seqLimits[seqName] = beg, end
    1.35 +    else:
    1.36 +        seqLimits[seqName] = 0, seqLen
    1.37 +
    1.38  def readAlignments(fileName, opts):
    1.39 -    '''Get alignments and sequence lengths, from MAF or tabular format.'''
    1.40 +    '''Get alignments and sequence limits, from MAF or tabular format.'''
    1.41      alignments = []
    1.42 -    seqLengths1 = {}
    1.43 -    seqLengths2 = {}
    1.44 +    seqLimits1 = {}
    1.45 +    seqLimits2 = {}
    1.46      lines = fileinput.input(fileName)
    1.47      for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
    1.48          if not isWantedSequenceName(seqName1, opts.seq1): continue
    1.49          if not isWantedSequenceName(seqName2, opts.seq2): continue
    1.50          aln = seqName1, seqName2, blocks
    1.51          alignments.append(aln)
    1.52 -        seqLengths1[seqName1] = seqLen1
    1.53 -        seqLengths2[seqName2] = seqLen2
    1.54 -    return alignments, seqLengths1, seqLengths2
    1.55 +        updateSeqLimits(opts.trim1, seqLimits1, seqName1, seqLen1, blocks, 0)
    1.56 +        updateSeqLimits(opts.trim2, seqLimits2, seqName2, seqLen2, blocks, 1)
    1.57 +    return alignments, seqLimits1, seqLimits2
    1.58  
    1.59  def natural_sort_key(my_string):
    1.60      '''Return a sort key for "natural" ordering, e.g. chr9 < chr10.'''
    1.61 @@ -115,11 +128,11 @@
    1.62      draw = ImageDraw.Draw(im)
    1.63      return [draw.textsize(i, font=font) for i in my_strings]
    1.64  
    1.65 -def get_seq_info(seq_size_dic, font, fontsize, image_mode):
    1.66 +def get_seq_info(seqLimits, font, fontsize, image_mode):
    1.67      '''Return miscellaneous information about the sequences.'''
    1.68 -    seqNames = seq_size_dic.keys()
    1.69 +    seqNames = seqLimits.keys()
    1.70      seqNames.sort(key=natural_sort_key)
    1.71 -    seq_sizes = [seq_size_dic[i] for i in seqNames]
    1.72 +    seq_sizes = [seqLimits[i][1] - seqLimits[i][0] for i in seqNames]
    1.73      name_sizes = get_text_sizes(seqNames, font, fontsize, image_mode)
    1.74      margin = max(zip(*name_sizes)[1])  # maximum text height
    1.75      return seqNames, seq_sizes, name_sizes, margin
    1.76 @@ -213,26 +226,31 @@
    1.77  def isExtraFirstGapField(fields):
    1.78      return fields[4].isdigit()
    1.79  
    1.80 -def readGaps(fileName):
    1.81 +def readGaps(fileName, seqLimits):
    1.82      '''Read locations of unsequenced gaps, from an agp or gap file.'''
    1.83      if not fileName: return
    1.84 +    seqLimits = expandedSeqDict(seqLimits)
    1.85      for line in fileinput.input(fileName):
    1.86          w = line.split()
    1.87          if not w or w[0][0] == "#": continue
    1.88          if isExtraFirstGapField(w): w = w[1:]
    1.89          if w[4] not in "NU": continue
    1.90          seqName = w[0]
    1.91 +        if seqName not in seqLimits: continue
    1.92 +        cropBeg, cropEnd = seqLimits[seqName]
    1.93          end = int(w[2])
    1.94          beg = end - int(w[5])  # zero-based coordinate
    1.95 +        b = max(beg, cropBeg)
    1.96 +        e = min(end, cropEnd)
    1.97 +        if b >= e: continue
    1.98          bridgedText = w[7]
    1.99 -        yield seqName, beg, end, bridgedText
   1.100 +        yield seqName, b, e, bridgedText
   1.101  
   1.102  def drawUnsequencedGaps(im, gaps, origins, margin, limit, isTop, bridgedText,
   1.103                          bp_per_pix, color):
   1.104      '''Draw rectangles representing unsequenced gaps.'''
   1.105      for seqName, beg, end, b in gaps:
   1.106          if b != bridgedText: continue
   1.107 -        if seqName not in origins: continue
   1.108          ori = origins[seqName]
   1.109          b = div_ceil(ori + beg, bp_per_pix)  # use fully-covered pixels only
   1.110          e = (ori + end) // bp_per_pix
   1.111 @@ -277,9 +295,9 @@
   1.112          draw.text(position, i[3], font=font, fill=opts.text_color)
   1.113      return im
   1.114  
   1.115 -def seqOrigins(seqNames, seq_starts, bp_per_pix):
   1.116 +def seqOrigins(seqNames, seq_starts, seqLimits, bp_per_pix):
   1.117      for i, j in zip(seqNames, seq_starts):
   1.118 -        yield i, bp_per_pix * j
   1.119 +        yield i, bp_per_pix * j - seqLimits[i][0]
   1.120  
   1.121  def lastDotplot(opts, args):
   1.122      if opts.fontfile:  font = ImageFont.truetype(opts.fontfile, opts.fontsize)
   1.123 @@ -292,13 +310,13 @@
   1.124      overlap_color = tuple([(i + j) // 2 for i, j in zipped_colors])
   1.125  
   1.126      warn("reading alignments...")
   1.127 -    alignments, seq_size_dic1, seq_size_dic2 = readAlignments(args[0], opts)
   1.128 +    alignments, seqLimits1, seqLimits2 = readAlignments(args[0], opts)
   1.129      warn("done")
   1.130  
   1.131      if not alignments: raise Exception("there are no alignments")
   1.132  
   1.133 -    seq_info1 = get_seq_info(seq_size_dic1, font, opts.fontsize, image_mode)
   1.134 -    seq_info2 = get_seq_info(seq_size_dic2, font, opts.fontsize, image_mode)
   1.135 +    seq_info1 = get_seq_info(seqLimits1, font, opts.fontsize, image_mode)
   1.136 +    seq_info2 = get_seq_info(seqLimits2, font, opts.fontsize, image_mode)
   1.137      seqNames1, seq_sizes1, name_sizes1, margin1 = seq_info1
   1.138      seqNames2, seq_sizes2, name_sizes2, margin2 = seq_info2
   1.139  
   1.140 @@ -314,8 +332,9 @@
   1.141                                                   opts.pix_tween_seqs, margin1)
   1.142      seq_pix2, seq_starts2, height = get_pix_info(seq_sizes2, bp_per_pix,
   1.143                                                   opts.pix_tween_seqs, margin2)
   1.144 -    origins1 = dict(seqOrigins(seqNames1, seq_starts1, bp_per_pix))
   1.145 -    origins2 = dict(seqOrigins(seqNames2, seq_starts2, bp_per_pix))
   1.146 +
   1.147 +    origins1 = dict(seqOrigins(seqNames1, seq_starts1, seqLimits1, bp_per_pix))
   1.148 +    origins2 = dict(seqOrigins(seqNames2, seq_starts2, seqLimits2, bp_per_pix))
   1.149  
   1.150      warn("processing alignments...")
   1.151      hits = alignmentPixels(width, height, alignments, bp_per_pix,
   1.152 @@ -327,8 +346,8 @@
   1.153  
   1.154      origins1 = expandedSeqDict(origins1)
   1.155      origins2 = expandedSeqDict(origins2)
   1.156 -    gaps1 = list(readGaps(opts.gap1))
   1.157 -    gaps2 = list(readGaps(opts.gap2))
   1.158 +    gaps1 = list(readGaps(opts.gap1, seqLimits1))
   1.159 +    gaps2 = list(readGaps(opts.gap2, seqLimits2))
   1.160      # draw bridged gaps first, then unbridged gaps on top:
   1.161      drawUnsequencedGaps(im, gaps1, origins1, margin2, height, True, "yes",
   1.162                          bp_per_pix, opts.bridged_color)
   1.163 @@ -390,6 +409,10 @@
   1.164                    help="color for forward alignments (default: %default)")
   1.165      op.add_option("-r", "--reversecolor", metavar="COLOR", default="blue",
   1.166                    help="color for reverse alignments (default: %default)")
   1.167 +    op.add_option("--trim1", action="store_true",
   1.168 +                  help="trim unaligned sequence flanks from the 1st genome")
   1.169 +    op.add_option("--trim2", action="store_true",
   1.170 +                  help="trim unaligned sequence flanks from the 2nd genome")
   1.171      og = optparse.OptionGroup(op, "Unsequenced gap options")
   1.172      og.add_option("--gap1", metavar="FILE",
   1.173                    help="read genome1 unsequenced gaps from agp or gap file")