last-dotplot: add genePred and rmsk options
authorMartin C. Frith
Thu May 18 14:23:58 2017 +0900 (2017-05-18)
changeset 8606e8ff424ce2b
parent 859 1ecdbf2ddb78
child 861 a75b439f0cf2
last-dotplot: add genePred and rmsk options
doc/last-dotplot.txt
scripts/last-dotplot
     1.1 --- a/doc/last-dotplot.txt	Thu May 18 14:06:41 2017 +0900
     1.2 +++ b/doc/last-dotplot.txt	Thu May 18 14:23:58 2017 +0900
     1.3 @@ -85,15 +85,6 @@
     1.4        Number of pixels between sequences.
     1.5    --border-color=COLOR
     1.6        Color for pixels between sequences.
     1.7 -  --bed1=FILE
     1.8 -      Read `BED-format
     1.9 -      <https://genome.ucsc.edu/FAQ/FAQformat.html#format1>`_
    1.10 -      annotations for the 1st genome.  They are drawn as rectangles,
    1.11 -      with coordinates given by the first three BED fields.  The color
    1.12 -      is specified by the RGB field if present, else pale red if the
    1.13 -      strand is "+", pale blue if "-", or pale purple.
    1.14 -  --bed2=FILE
    1.15 -      Read BED-format annotations for the 2nd genome.
    1.16  
    1.17  Text options
    1.18  ~~~~~~~~~~~~
    1.19 @@ -107,6 +98,44 @@
    1.20    --lengths2
    1.21        Show sequence lengths for the 2nd (vertical) genome.
    1.22  
    1.23 +Annotation options
    1.24 +~~~~~~~~~~~~~~~~~~
    1.25 +
    1.26 +These options read annotations of sequence segments, and draw them as
    1.27 +colored horizontal or vertical stripes.  This looks good only if the
    1.28 +annotations are reasonably sparse: e.g. you can't sensibly view 20000
    1.29 +gene annotations in one small dotplot.
    1.30 +
    1.31 +  --bed1=FILE
    1.32 +      Read `BED-format
    1.33 +      <https://genome.ucsc.edu/FAQ/FAQformat.html#format1>`_
    1.34 +      annotations for the 1st genome.  They are drawn as stripes, with
    1.35 +      coordinates given by the first three BED fields.  The color is
    1.36 +      specified by the RGB field if present, else pale red if the
    1.37 +      strand is "+", pale blue if "-", or pale purple.
    1.38 +  --bed2=FILE
    1.39 +      Read BED-format annotations for the 2nd genome.
    1.40 +  --rmsk1=FILE
    1.41 +      Read repeat annotations for the 1st genome, in RepeatMasker .out
    1.42 +      or rmsk.txt format.  The color is pale purple for "low
    1.43 +      complexity" and "simple repeats", else pale red for "+" strand
    1.44 +      and pale blue for "-" strand.
    1.45 +  --rmsk2=FILE
    1.46 +      Read repeat annotations for the 2nd genome.
    1.47 +
    1.48 +Gene options
    1.49 +~~~~~~~~~~~~
    1.50 +
    1.51 +  --genePred1=FILE
    1.52 +      Read gene annotations for the 1st genome in `genePred format
    1.53 +      <https://genome.ucsc.edu/FAQ/FAQformat.html#format9>`_.
    1.54 +  --genePred2=FILE
    1.55 +      Read gene annotations for the 2nd genome.
    1.56 +  --exon-color=COLOR
    1.57 +      Color for exons.
    1.58 +  --cds-color=COLOR
    1.59 +      Color for protein-coding regions.
    1.60 +
    1.61  Unsequenced gap options
    1.62  ~~~~~~~~~~~~~~~~~~~~~~~
    1.63  
    1.64 @@ -124,3 +153,9 @@
    1.65  
    1.66  An unsequenced gap will be shown only if it covers at least one whole
    1.67  pixel.
    1.68 +
    1.69 +Colors
    1.70 +~~~~~~
    1.71 +
    1.72 +Colors can be specified in `various ways described here
    1.73 +<http://effbot.org/imagingbook/imagecolor.htm>`_.
     2.1 --- a/scripts/last-dotplot	Thu May 18 14:06:41 2017 +0900
     2.2 +++ b/scripts/last-dotplot	Thu May 18 14:23:58 2017 +0900
     2.3 @@ -306,6 +306,55 @@
     2.4                      color = "#f4f4ff"
     2.5          yield layer, color, seqName, beg, end
     2.6  
     2.7 +def commaSeparatedInts(text):
     2.8 +    return map(int, text.rstrip(",").split(","))
     2.9 +
    2.10 +def readGenePred(opts, fileName, seqLimits):
    2.11 +    if not fileName: return
    2.12 +    for line in myOpen(fileName):
    2.13 +        fields = line.split()
    2.14 +        if not fields: continue
    2.15 +        if fields[2] not in "+-": fields = fields[1:]
    2.16 +        seqName = fields[1]
    2.17 +        if seqName not in seqLimits: continue
    2.18 +        #strand = fields[2]
    2.19 +        cdsBeg = int(fields[5])
    2.20 +        cdsEnd = int(fields[6])
    2.21 +        exonBegs = commaSeparatedInts(fields[8])
    2.22 +        exonEnds = commaSeparatedInts(fields[9])
    2.23 +        for beg, end in zip(exonBegs, exonEnds):
    2.24 +            yield 300, opts.exon_color, seqName, beg, end
    2.25 +            b = max(beg, cdsBeg)
    2.26 +            e = min(end, cdsEnd)
    2.27 +            if b < e: yield 400, opts.cds_color, seqName, b, e
    2.28 +
    2.29 +def readRmsk(fileName, seqLimits):
    2.30 +    if not fileName: return
    2.31 +    for line in myOpen(fileName):
    2.32 +        fields = line.split()
    2.33 +        if len(fields) == 17:  # rmsk.txt
    2.34 +            seqName = fields[5]
    2.35 +            if seqName not in seqLimits: continue  # do this ASAP for speed
    2.36 +            beg = int(fields[6])
    2.37 +            end = int(fields[7])
    2.38 +            strand = fields[9]
    2.39 +            repeatClass = fields[11]
    2.40 +        elif len(fields) == 15:  # .out
    2.41 +            seqName = fields[4]
    2.42 +            if seqName not in seqLimits: continue
    2.43 +            beg = int(fields[5]) - 1
    2.44 +            end = int(fields[6])
    2.45 +            strand = fields[8]
    2.46 +            repeatClass = fields[10]
    2.47 +        else:
    2.48 +            continue
    2.49 +        if repeatClass in ("Low_complexity", "Simple_repeat"):
    2.50 +            yield 200, "#ffe4ff", seqName, beg, end
    2.51 +        elif strand == "+":
    2.52 +            yield 100, "#fff4f4", seqName, beg, end
    2.53 +        else:
    2.54 +            yield 100, "#f4f4ff", seqName, beg, end
    2.55 +
    2.56  def isExtraFirstGapField(fields):
    2.57      return fields[4].isdigit()
    2.58  
    2.59 @@ -448,10 +497,14 @@
    2.60      origins2 = expandedSeqDict(origins2)
    2.61  
    2.62      beds1 = itertools.chain(readBed(opts.bed1, seqLimits1),
    2.63 +                            readRmsk(opts.rmsk1, seqLimits1),
    2.64 +                            readGenePred(opts, opts.genePred1, seqLimits1),
    2.65                              readGaps(opts, opts.gap1, seqLimits1))
    2.66      b1 = bedBoxes(beds1, seqLimits1, origins1, margin2, height, True, bpPerPix)
    2.67  
    2.68      beds2 = itertools.chain(readBed(opts.bed2, seqLimits2),
    2.69 +                            readRmsk(opts.rmsk2, seqLimits2),
    2.70 +                            readGenePred(opts, opts.genePred2, seqLimits2),
    2.71                              readGaps(opts, opts.gap2, seqLimits2))
    2.72      b2 = bedBoxes(beds2, seqLimits2, origins2, margin1, width, False, bpPerPix)
    2.73  
    2.74 @@ -522,10 +575,6 @@
    2.75      op.add_option("--border-color", metavar="COLOR", default="#dcdcdc",
    2.76                    help="color for pixels between sequences (default=%default)")
    2.77      # xxx --margin-color?
    2.78 -    op.add_option("--bed1", metavar="FILE",
    2.79 -                  help="read genome1 annotations from bed file")
    2.80 -    op.add_option("--bed2", metavar="FILE",
    2.81 -                  help="read genome2 annotations from bed file")
    2.82  
    2.83      og = optparse.OptionGroup(op, "Text options")
    2.84      og.add_option("-f", "--fontfile", metavar="FILE",
    2.85 @@ -538,6 +587,28 @@
    2.86                    help="show sequence lengths for the 2nd (vertical) genome")
    2.87      op.add_option_group(og)
    2.88  
    2.89 +    og = optparse.OptionGroup(op, "Annotation options")
    2.90 +    og.add_option("--bed1", metavar="FILE",
    2.91 +                  help="read genome1 annotations from BED file")
    2.92 +    og.add_option("--bed2", metavar="FILE",
    2.93 +                  help="read genome2 annotations from BED file")
    2.94 +    og.add_option("--rmsk1", metavar="FILE", help="read genome1 repeats from "
    2.95 +                  "RepeatMasker .out or rmsk.txt file")
    2.96 +    og.add_option("--rmsk2", metavar="FILE", help="read genome2 repeats from "
    2.97 +                  "RepeatMasker .out or rmsk.txt file")
    2.98 +    op.add_option_group(og)
    2.99 +
   2.100 +    og = optparse.OptionGroup(op, "Gene options")
   2.101 +    og.add_option("--genePred1", metavar="FILE",
   2.102 +                  help="read genome1 genes from genePred file")
   2.103 +    og.add_option("--genePred2", metavar="FILE",
   2.104 +                  help="read genome2 genes from genePred file")
   2.105 +    og.add_option("--exon-color", metavar="COLOR", default="#dfd",
   2.106 +                  help="color for exons (default=%default)")
   2.107 +    og.add_option("--cds-color", metavar="COLOR", default="#bdb",
   2.108 +                  help="color for protein-coding regions (default=%default)")
   2.109 +    op.add_option_group(og)
   2.110 +
   2.111      og = optparse.OptionGroup(op, "Unsequenced gap options")
   2.112      og.add_option("--gap1", metavar="FILE",
   2.113                    help="read genome1 unsequenced gaps from agp or gap file")