1.1 --- a/doc/last-dotplot.txt Mon Dec 10 07:59:26 2018 +0900
1.2 +++ b/doc/last-dotplot.txt Mon Dec 10 08:23:09 2018 +0900
1.3 @@ -19,50 +19,6 @@
1.4 set of sequences. This document calls a "set of sequences" a
1.5 "genome", though it need not actually be a genome.
1.6
1.7 -Choosing sequences
1.8 -------------------
1.9 -
1.10 -If there are too many sequences, the dotplot will be very cluttered,
1.11 -or the script might give up with an error message. You can exclude
1.12 -sequences with names like "chrUn_random522" like this::
1.13 -
1.14 - last-dotplot -1 'chr[!U]*' -2 'chr[!U]*' alns alns.png
1.15 -
1.16 -Option "-1" selects sequences from the 1st (horizontal) genome, and
1.17 -"-2" selects sequences from the 2nd (vertical) genome. 'chr[!U]*' is
1.18 -a *pattern* that specifies names starting with "chr", followed by any
1.19 -character except U, followed by anything.
1.20 -
1.21 -========== =============================
1.22 -Pattern Meaning
1.23 ----------- -----------------------------
1.24 -``*`` zero or more of any character
1.25 -``?`` any single character
1.26 -``[abc]`` any character in abc
1.27 -``[!abc]`` any character not in abc
1.28 -========== =============================
1.29 -
1.30 -If a sequence name has a dot (e.g. "hg19.chr7"), the pattern is
1.31 -compared to both the whole name and the part after the dot.
1.32 -
1.33 -You can specify more than one pattern, e.g. this gets sequences with
1.34 -names starting in "chr" followed by one or two characters::
1.35 -
1.36 - last-dotplot -1 'chr?' -1 'chr??' alns alns.png
1.37 -
1.38 -You can also specify a sequence range; for example this gets the first
1.39 -1000 bases of chr9::
1.40 -
1.41 - last-dotplot -1 chr9:0-1000 alns alns.png
1.42 -
1.43 -Fonts
1.44 ------
1.45 -
1.46 -last-dotplot tries to find a nice font on your computer, but may fail
1.47 -and use an ugly font. You can specify a font like this::
1.48 -
1.49 - last-dotplot -f /usr/share/fonts/liberation/LiberationSans-Regular.ttf alns alns.png
1.50 -
1.51 Options
1.52 -------
1.53
1.54 @@ -70,14 +26,18 @@
1.55 Show a help message, with default option values, and exit.
1.56 -v, --verbose
1.57 Show progress messages & data about the plot.
1.58 + -x INT, --width=INT
1.59 + Maximum width in pixels.
1.60 + -y INT, --height=INT
1.61 + Maximum height in pixels.
1.62 + -m M, --maxseqs=M
1.63 + Maximum number of horizontal or vertical sequences. If there
1.64 + are >M sequences, the smallest ones (after cutting) will be
1.65 + discarded.
1.66 -1 PATTERN, --seq1=PATTERN
1.67 Which sequences to show from the 1st (horizontal) genome.
1.68 -2 PATTERN, --seq2=PATTERN
1.69 Which sequences to show from the 2nd (vertical) genome.
1.70 - -x WIDTH, --width=WIDTH
1.71 - Maximum width in pixels.
1.72 - -y HEIGHT, --height=HEIGHT
1.73 - Maximum height in pixels.
1.74 -c COLOR, --forwardcolor=COLOR
1.75 Color for forward alignments.
1.76 -r COLOR, --reversecolor=COLOR
1.77 @@ -207,6 +167,54 @@
1.78 An unsequenced gap will be shown only if it covers at least one whole
1.79 pixel.
1.80
1.81 +Choosing sequences
1.82 +------------------
1.83 +
1.84 +For example, you can exclude sequences with names like
1.85 +"chrUn_random522" like this::
1.86 +
1.87 + last-dotplot -1 'chr[!U]*' -2 'chr[!U]*' alns alns.png
1.88 +
1.89 +Option "-1" selects sequences from the 1st (horizontal) genome, and
1.90 +"-2" selects sequences from the 2nd (vertical) genome. 'chr[!U]*' is
1.91 +a *pattern* that specifies names starting with "chr", followed by any
1.92 +character except U, followed by anything.
1.93 +
1.94 +========== =============================
1.95 +Pattern Meaning
1.96 +---------- -----------------------------
1.97 +``*`` zero or more of any character
1.98 +``?`` any single character
1.99 +``[abc]`` any character in abc
1.100 +``[!abc]`` any character not in abc
1.101 +========== =============================
1.102 +
1.103 +If a sequence name has a dot (e.g. "hg19.chr7"), the pattern is
1.104 +compared to both the whole name and the part after the dot.
1.105 +
1.106 +You can specify more than one pattern, e.g. this gets sequences with
1.107 +names starting in "chr" followed by one or two characters::
1.108 +
1.109 + last-dotplot -1 'chr?' -1 'chr??' alns alns.png
1.110 +
1.111 +You can also specify a sequence range; for example this gets the first
1.112 +1000 bases of chr9::
1.113 +
1.114 + last-dotplot -1 chr9:0-1000 alns alns.png
1.115 +
1.116 +Text font
1.117 +---------
1.118 +
1.119 +You can improve the font quality by increasing its size, e.g. to 20
1.120 +points:
1.121 +
1.122 + last-dotplot -s20 my-alignments my-plot.png
1.123 +
1.124 +last-dotplot tries to find a nice font on your computer, but may fail
1.125 +and use an ugly font. You can specify a font like this::
1.126 +
1.127 + last-dotplot -f /usr/share/fonts/liberation/LiberationSans-Regular.ttf alns alns.png
1.128 +
1.129 Colors
1.130 ------
1.131
2.1 --- a/scripts/last-dotplot Mon Dec 10 07:59:26 2018 +0900
2.2 +++ b/scripts/last-dotplot Mon Dec 10 08:23:09 2018 +0900
2.3 @@ -13,6 +13,7 @@
2.4 import functools
2.5 import gzip
2.6 from fnmatch import fnmatchcase
2.7 +import logging
2.8 from operator import itemgetter
2.9 import subprocess
2.10 import itertools, optparse, os, re, sys
2.11 @@ -37,11 +38,6 @@
2.12 return gzip.open(fileName, "rt") # xxx dubious for Python2
2.13 return open(fileName)
2.14
2.15 -def warn(message):
2.16 - if opts.verbose:
2.17 - prog = os.path.basename(sys.argv[0])
2.18 - sys.stderr.write(prog + ": " + message + "\n")
2.19 -
2.20 def groupByFirstItem(things):
2.21 for k, v in itertools.groupby(things, itemgetter(0)):
2.22 yield k, [i[1:] for i in v]
2.23 @@ -414,8 +410,8 @@
2.24 out = [seqName, str(rangeBeg), str(rangeEnd)]
2.25 if strandNum > 0:
2.26 out.append(".+-"[strandNum])
2.27 - warn("\t".join(out))
2.28 - warn("")
2.29 + logging.info("\t".join(out))
2.30 + logging.info("")
2.31 rangeSizes = [e - b for n, b, e, s in sortedRanges]
2.32 labs = list(rangeLabels(sortedRanges, labelOpt, font, fontSize,
2.33 imageMode, textRot))
2.34 @@ -430,7 +426,7 @@
2.35
2.36 def get_bp_per_pix(rangeSizes, pixTweenRanges, maxPixels):
2.37 '''Get the minimum bp-per-pixel that fits in the size limit.'''
2.38 - warn("choosing bp per pixel...")
2.39 + logging.info("choosing bp per pixel...")
2.40 numOfRanges = len(rangeSizes)
2.41 maxPixelsInRanges = maxPixels - pixTweenRanges * (numOfRanges - 1)
2.42 if maxPixelsInRanges < numOfRanges:
2.43 @@ -718,18 +714,37 @@
2.44 out, err = p.communicate()
2.45 fileNames.append(out)
2.46 except OSError as e:
2.47 - warn("fc-match error: " + str(e))
2.48 + logging.info("fc-match error: " + str(e))
2.49 fileNames.append("/Library/Fonts/Arial.ttf") # for Mac
2.50 for i in fileNames:
2.51 try:
2.52 font = ImageFont.truetype(i, opts.fontsize)
2.53 - warn("font: " + i)
2.54 + logging.info("font: " + i)
2.55 return font
2.56 except IOError as e:
2.57 - warn("font load error: " + str(e))
2.58 + logging.info("font load error: " + str(e))
2.59 return ImageFont.load_default()
2.60
2.61 +def sequenceSizesAndNames(seqRanges):
2.62 + for seqName, ranges in itertools.groupby(seqRanges, itemgetter(0)):
2.63 + size = sum(e - b for n, b, e in ranges)
2.64 + yield size, seqName
2.65 +
2.66 +def biggestSequences(seqRanges, maxNumOfSequences):
2.67 + s = sorted(sequenceSizesAndNames(seqRanges), reverse=True)
2.68 + if len(s) > maxNumOfSequences:
2.69 + logging.warning("too many sequences - discarding the smallest ones")
2.70 + s = s[:maxNumOfSequences]
2.71 + return set(i[1] for i in s)
2.72 +
2.73 +def remainingSequenceRanges(seqRanges, alignments, seqIndex):
2.74 + remainingSequences = set(i[seqIndex] for i in alignments)
2.75 + return [i for i in seqRanges if i[0] in remainingSequences]
2.76 +
2.77 def lastDotplot(opts, args):
2.78 + logLevel = logging.INFO if opts.verbose else logging.WARNING
2.79 + logging.basicConfig(format="%(filename)s: %(message)s", level=logLevel)
2.80 +
2.81 font = getFont(opts)
2.82 image_mode = 'RGB'
2.83 forward_color = ImageColor.getcolor(opts.forwardcolor, image_mode)
2.84 @@ -740,11 +755,11 @@
2.85 maxGap1, maxGapB1 = twoValuesFromOption(opts.max_gap1, ":")
2.86 maxGap2, maxGapB2 = twoValuesFromOption(opts.max_gap2, ":")
2.87
2.88 - warn("reading alignments...")
2.89 + logging.info("reading alignments...")
2.90 alnData = readAlignments(args[0], opts)
2.91 alignments, seqRanges1, coverDict1, seqRanges2, coverDict2 = alnData
2.92 if not alignments: raise Exception("there are no alignments")
2.93 - warn("cutting...")
2.94 + logging.info("cutting...")
2.95 coverDict1 = dict(mergedRangesPerSeq(coverDict1))
2.96 coverDict2 = dict(mergedRangesPerSeq(coverDict2))
2.97 minAlignedBases = min(coveredLength(coverDict1), coveredLength(coverDict2))
2.98 @@ -754,10 +769,20 @@
2.99 cutRanges2 = list(trimmed(seqRanges2, coverDict2, minAlignedBases,
2.100 maxGap2, pad, pad))
2.101
2.102 - warn("reading secondary alignments...")
2.103 + biggestSeqs1 = biggestSequences(cutRanges1, opts.maxseqs)
2.104 + cutRanges1 = [i for i in cutRanges1 if i[0] in biggestSeqs1]
2.105 + alignments = [i for i in alignments if i[0] in biggestSeqs1]
2.106 + cutRanges2 = remainingSequenceRanges(cutRanges2, alignments, 1)
2.107 +
2.108 + biggestSeqs2 = biggestSequences(cutRanges2, opts.maxseqs)
2.109 + cutRanges2 = [i for i in cutRanges2 if i[0] in biggestSeqs2]
2.110 + alignments = [i for i in alignments if i[1] in biggestSeqs2]
2.111 + cutRanges1 = remainingSequenceRanges(cutRanges1, alignments, 0)
2.112 +
2.113 + logging.info("reading secondary alignments...")
2.114 alnDataB = readSecondaryAlignments(opts, cutRanges1, cutRanges2)
2.115 alignmentsB, seqRangesB1, coverDictB1, seqRangesB2, coverDictB2 = alnDataB
2.116 - warn("cutting...")
2.117 + logging.info("cutting...")
2.118 coverDictB1 = dict(mergedRangesPerSeq(coverDictB1))
2.119 coverDictB2 = dict(mergedRangesPerSeq(coverDictB2))
2.120 cutRangesB1 = trimmed(seqRangesB1, coverDictB1, minAlignedBases,
2.121 @@ -765,7 +790,7 @@
2.122 cutRangesB2 = trimmed(seqRangesB2, coverDictB2, minAlignedBases,
2.123 maxGapB2, 0, 0)
2.124
2.125 - warn("sorting...")
2.126 + logging.info("sorting...")
2.127 sortOut = allSortedRanges(opts, alignments, alignmentsB,
2.128 cutRanges1, cutRangesB1, cutRanges2, cutRangesB2)
2.129 sortedRanges1, sortedRanges2 = sortOut
2.130 @@ -785,7 +810,7 @@
2.131 bpPerPix1 = get_bp_per_pix(rangeSizes1, opts.border_pixels, maxPixels1)
2.132 bpPerPix2 = get_bp_per_pix(rangeSizes2, opts.border_pixels, maxPixels2)
2.133 bpPerPix = max(bpPerPix1, bpPerPix2)
2.134 - warn("bp per pixel = " + str(bpPerPix))
2.135 + logging.info("bp per pixel = " + str(bpPerPix))
2.136
2.137 p1 = pixelData(rangeSizes1, bpPerPix, opts.border_pixels, lMargin)
2.138 rangePixBegs1, rangePixLens1, width = p1
2.139 @@ -797,14 +822,14 @@
2.140 rangeDict2 = dict(rangesPerSeq(sortedRanges2, rangePixBegs2,
2.141 rangePixLens2, bpPerPix))
2.142
2.143 - warn("width: " + str(width))
2.144 - warn("height: " + str(height))
2.145 + logging.info("width: " + str(width))
2.146 + logging.info("height: " + str(height))
2.147
2.148 - warn("processing alignments...")
2.149 + logging.info("processing alignments...")
2.150 hits = alignmentPixels(width, height, alignments + alignmentsB, bpPerPix,
2.151 rangeDict1, rangeDict2)
2.152
2.153 - warn("reading annotations...")
2.154 + logging.info("reading annotations...")
2.155
2.156 rangeDict1 = expandedSeqDict(rangeDict1)
2.157 beds1 = itertools.chain(readBed(opts.bed1, rangeDict1),
2.158 @@ -822,7 +847,7 @@
2.159
2.160 boxes = sorted(itertools.chain(b1, b2))
2.161
2.162 - warn("drawing...")
2.163 + logging.info("drawing...")
2.164
2.165 image_size = width, height
2.166 im = Image.new(image_mode, image_size, opts.background_color)
2.167 @@ -868,17 +893,20 @@
2.168 op = optparse.OptionParser(usage=usage, description=description)
2.169 op.add_option("-v", "--verbose", action="count",
2.170 help="show progress messages & data about the plot")
2.171 + # Replace "width" & "height" with a single "length" option?
2.172 + op.add_option("-x", "--width", metavar="INT", type="int", default=1000,
2.173 + help="maximum width in pixels (default: %default)")
2.174 + op.add_option("-y", "--height", metavar="INT", type="int", default=1000,
2.175 + help="maximum height in pixels (default: %default)")
2.176 + op.add_option("-m", "--maxseqs", type="int", default=100, metavar="M",
2.177 + help="maximum number of horizontal or vertical sequences "
2.178 + "(default=%default)")
2.179 op.add_option("-1", "--seq1", metavar="PATTERN", action="append",
2.180 default=[],
2.181 help="which sequences to show from the 1st genome")
2.182 op.add_option("-2", "--seq2", metavar="PATTERN", action="append",
2.183 default=[],
2.184 help="which sequences to show from the 2nd genome")
2.185 - # Replace "width" & "height" with a single "length" option?
2.186 - op.add_option("-x", "--width", type="int", default=1000,
2.187 - help="maximum width in pixels (default: %default)")
2.188 - op.add_option("-y", "--height", type="int", default=1000,
2.189 - help="maximum height in pixels (default: %default)")
2.190 op.add_option("-c", "--forwardcolor", metavar="COLOR", default="red",
2.191 help="color for forward alignments (default: %default)")
2.192 op.add_option("-r", "--reversecolor", metavar="COLOR", default="blue",