scripts/last-dotplot
changeset 961 ed0fb9b1eb40
parent 957 c54672c8892e
child 980 27c26371226b
     1.1 --- a/scripts/last-dotplot	Mon Oct 29 08:23:59 2018 +0900
     1.2 +++ b/scripts/last-dotplot	Mon Dec 10 08:23:09 2018 +0900
     1.3 @@ -13,6 +13,7 @@
     1.4  import functools
     1.5  import gzip
     1.6  from fnmatch import fnmatchcase
     1.7 +import logging
     1.8  from operator import itemgetter
     1.9  import subprocess
    1.10  import itertools, optparse, os, re, sys
    1.11 @@ -37,11 +38,6 @@
    1.12          return gzip.open(fileName, "rt")  # xxx dubious for Python2
    1.13      return open(fileName)
    1.14  
    1.15 -def warn(message):
    1.16 -    if opts.verbose:
    1.17 -        prog = os.path.basename(sys.argv[0])
    1.18 -        sys.stderr.write(prog + ": " + message + "\n")
    1.19 -
    1.20  def groupByFirstItem(things):
    1.21      for k, v in itertools.groupby(things, itemgetter(0)):
    1.22          yield k, [i[1:] for i in v]
    1.23 @@ -414,8 +410,8 @@
    1.24          out = [seqName, str(rangeBeg), str(rangeEnd)]
    1.25          if strandNum > 0:
    1.26              out.append(".+-"[strandNum])
    1.27 -        warn("\t".join(out))
    1.28 -    warn("")
    1.29 +        logging.info("\t".join(out))
    1.30 +    logging.info("")
    1.31      rangeSizes = [e - b for n, b, e, s in sortedRanges]
    1.32      labs = list(rangeLabels(sortedRanges, labelOpt, font, fontSize,
    1.33                              imageMode, textRot))
    1.34 @@ -430,7 +426,7 @@
    1.35  
    1.36  def get_bp_per_pix(rangeSizes, pixTweenRanges, maxPixels):
    1.37      '''Get the minimum bp-per-pixel that fits in the size limit.'''
    1.38 -    warn("choosing bp per pixel...")
    1.39 +    logging.info("choosing bp per pixel...")
    1.40      numOfRanges = len(rangeSizes)
    1.41      maxPixelsInRanges = maxPixels - pixTweenRanges * (numOfRanges - 1)
    1.42      if maxPixelsInRanges < numOfRanges:
    1.43 @@ -718,18 +714,37 @@
    1.44          out, err = p.communicate()
    1.45          fileNames.append(out)
    1.46      except OSError as e:
    1.47 -        warn("fc-match error: " + str(e))
    1.48 +        logging.info("fc-match error: " + str(e))
    1.49      fileNames.append("/Library/Fonts/Arial.ttf")  # for Mac
    1.50      for i in fileNames:
    1.51          try:
    1.52              font = ImageFont.truetype(i, opts.fontsize)
    1.53 -            warn("font: " + i)
    1.54 +            logging.info("font: " + i)
    1.55              return font
    1.56          except IOError as e:
    1.57 -            warn("font load error: " + str(e))
    1.58 +            logging.info("font load error: " + str(e))
    1.59      return ImageFont.load_default()
    1.60  
    1.61 +def sequenceSizesAndNames(seqRanges):
    1.62 +    for seqName, ranges in itertools.groupby(seqRanges, itemgetter(0)):
    1.63 +        size = sum(e - b for n, b, e in ranges)
    1.64 +        yield size, seqName
    1.65 +
    1.66 +def biggestSequences(seqRanges, maxNumOfSequences):
    1.67 +    s = sorted(sequenceSizesAndNames(seqRanges), reverse=True)
    1.68 +    if len(s) > maxNumOfSequences:
    1.69 +        logging.warning("too many sequences - discarding the smallest ones")
    1.70 +        s = s[:maxNumOfSequences]
    1.71 +    return set(i[1] for i in s)
    1.72 +
    1.73 +def remainingSequenceRanges(seqRanges, alignments, seqIndex):
    1.74 +    remainingSequences = set(i[seqIndex] for i in alignments)
    1.75 +    return [i for i in seqRanges if i[0] in remainingSequences]
    1.76 +
    1.77  def lastDotplot(opts, args):
    1.78 +    logLevel = logging.INFO if opts.verbose else logging.WARNING
    1.79 +    logging.basicConfig(format="%(filename)s: %(message)s", level=logLevel)
    1.80 +
    1.81      font = getFont(opts)
    1.82      image_mode = 'RGB'
    1.83      forward_color = ImageColor.getcolor(opts.forwardcolor, image_mode)
    1.84 @@ -740,11 +755,11 @@
    1.85      maxGap1, maxGapB1 = twoValuesFromOption(opts.max_gap1, ":")
    1.86      maxGap2, maxGapB2 = twoValuesFromOption(opts.max_gap2, ":")
    1.87  
    1.88 -    warn("reading alignments...")
    1.89 +    logging.info("reading alignments...")
    1.90      alnData = readAlignments(args[0], opts)
    1.91      alignments, seqRanges1, coverDict1, seqRanges2, coverDict2 = alnData
    1.92      if not alignments: raise Exception("there are no alignments")
    1.93 -    warn("cutting...")
    1.94 +    logging.info("cutting...")
    1.95      coverDict1 = dict(mergedRangesPerSeq(coverDict1))
    1.96      coverDict2 = dict(mergedRangesPerSeq(coverDict2))
    1.97      minAlignedBases = min(coveredLength(coverDict1), coveredLength(coverDict2))
    1.98 @@ -754,10 +769,20 @@
    1.99      cutRanges2 = list(trimmed(seqRanges2, coverDict2, minAlignedBases,
   1.100                                maxGap2, pad, pad))
   1.101  
   1.102 -    warn("reading secondary alignments...")
   1.103 +    biggestSeqs1 = biggestSequences(cutRanges1, opts.maxseqs)
   1.104 +    cutRanges1 = [i for i in cutRanges1 if i[0] in biggestSeqs1]
   1.105 +    alignments = [i for i in alignments if i[0] in biggestSeqs1]
   1.106 +    cutRanges2 = remainingSequenceRanges(cutRanges2, alignments, 1)
   1.107 +
   1.108 +    biggestSeqs2 = biggestSequences(cutRanges2, opts.maxseqs)
   1.109 +    cutRanges2 = [i for i in cutRanges2 if i[0] in biggestSeqs2]
   1.110 +    alignments = [i for i in alignments if i[1] in biggestSeqs2]
   1.111 +    cutRanges1 = remainingSequenceRanges(cutRanges1, alignments, 0)
   1.112 +
   1.113 +    logging.info("reading secondary alignments...")
   1.114      alnDataB = readSecondaryAlignments(opts, cutRanges1, cutRanges2)
   1.115      alignmentsB, seqRangesB1, coverDictB1, seqRangesB2, coverDictB2 = alnDataB
   1.116 -    warn("cutting...")
   1.117 +    logging.info("cutting...")
   1.118      coverDictB1 = dict(mergedRangesPerSeq(coverDictB1))
   1.119      coverDictB2 = dict(mergedRangesPerSeq(coverDictB2))
   1.120      cutRangesB1 = trimmed(seqRangesB1, coverDictB1, minAlignedBases,
   1.121 @@ -765,7 +790,7 @@
   1.122      cutRangesB2 = trimmed(seqRangesB2, coverDictB2, minAlignedBases,
   1.123                            maxGapB2, 0, 0)
   1.124  
   1.125 -    warn("sorting...")
   1.126 +    logging.info("sorting...")
   1.127      sortOut = allSortedRanges(opts, alignments, alignmentsB,
   1.128                                cutRanges1, cutRangesB1, cutRanges2, cutRangesB2)
   1.129      sortedRanges1, sortedRanges2 = sortOut
   1.130 @@ -785,7 +810,7 @@
   1.131      bpPerPix1 = get_bp_per_pix(rangeSizes1, opts.border_pixels, maxPixels1)
   1.132      bpPerPix2 = get_bp_per_pix(rangeSizes2, opts.border_pixels, maxPixels2)
   1.133      bpPerPix = max(bpPerPix1, bpPerPix2)
   1.134 -    warn("bp per pixel = " + str(bpPerPix))
   1.135 +    logging.info("bp per pixel = " + str(bpPerPix))
   1.136  
   1.137      p1 = pixelData(rangeSizes1, bpPerPix, opts.border_pixels, lMargin)
   1.138      rangePixBegs1, rangePixLens1, width = p1
   1.139 @@ -797,14 +822,14 @@
   1.140      rangeDict2 = dict(rangesPerSeq(sortedRanges2, rangePixBegs2,
   1.141                                     rangePixLens2, bpPerPix))
   1.142  
   1.143 -    warn("width:  " + str(width))
   1.144 -    warn("height: " + str(height))
   1.145 +    logging.info("width:  " + str(width))
   1.146 +    logging.info("height: " + str(height))
   1.147  
   1.148 -    warn("processing alignments...")
   1.149 +    logging.info("processing alignments...")
   1.150      hits = alignmentPixels(width, height, alignments + alignmentsB, bpPerPix,
   1.151                             rangeDict1, rangeDict2)
   1.152  
   1.153 -    warn("reading annotations...")
   1.154 +    logging.info("reading annotations...")
   1.155  
   1.156      rangeDict1 = expandedSeqDict(rangeDict1)
   1.157      beds1 = itertools.chain(readBed(opts.bed1, rangeDict1),
   1.158 @@ -822,7 +847,7 @@
   1.159  
   1.160      boxes = sorted(itertools.chain(b1, b2))
   1.161  
   1.162 -    warn("drawing...")
   1.163 +    logging.info("drawing...")
   1.164  
   1.165      image_size = width, height
   1.166      im = Image.new(image_mode, image_size, opts.background_color)
   1.167 @@ -868,17 +893,20 @@
   1.168      op = optparse.OptionParser(usage=usage, description=description)
   1.169      op.add_option("-v", "--verbose", action="count",
   1.170                    help="show progress messages & data about the plot")
   1.171 +    # Replace "width" & "height" with a single "length" option?
   1.172 +    op.add_option("-x", "--width", metavar="INT", type="int", default=1000,
   1.173 +                  help="maximum width in pixels (default: %default)")
   1.174 +    op.add_option("-y", "--height", metavar="INT", type="int", default=1000,
   1.175 +                  help="maximum height in pixels (default: %default)")
   1.176 +    op.add_option("-m", "--maxseqs", type="int", default=100, metavar="M",
   1.177 +                  help="maximum number of horizontal or vertical sequences "
   1.178 +                  "(default=%default)")
   1.179      op.add_option("-1", "--seq1", metavar="PATTERN", action="append",
   1.180                    default=[],
   1.181                    help="which sequences to show from the 1st genome")
   1.182      op.add_option("-2", "--seq2", metavar="PATTERN", action="append",
   1.183                    default=[],
   1.184                    help="which sequences to show from the 2nd genome")
   1.185 -    # Replace "width" & "height" with a single "length" option?
   1.186 -    op.add_option("-x", "--width", type="int", default=1000,
   1.187 -                  help="maximum width in pixels (default: %default)")
   1.188 -    op.add_option("-y", "--height", type="int", default=1000,
   1.189 -                  help="maximum height in pixels (default: %default)")
   1.190      op.add_option("-c", "--forwardcolor", metavar="COLOR", default="red",
   1.191                    help="color for forward alignments (default: %default)")
   1.192      op.add_option("-r", "--reversecolor", metavar="COLOR", default="blue",