Add maf-convert to gff (with help from C. Plessy)
authorMartin C. Frith
Mon Jun 08 14:28:46 2020 +0900 (23 months ago)
changeset 10626d8d56a74b14
parent 1061 543d36d39ce3
child 1063 bc7b2cdbcb32
Add maf-convert to gff (with help from C. Plessy)
doc/maf-convert.txt
scripts/last-dotplot
scripts/last-train
scripts/maf-convert
scripts/maf-cut
test/maf-convert-test.out
test/maf-convert-test.sh
     1.1 --- a/doc/maf-convert.txt	Thu May 07 08:14:11 2020 +0900
     1.2 +++ b/doc/maf-convert.txt	Mon Jun 08 14:28:46 2020 +0900
     1.3 @@ -3,7 +3,7 @@
     1.4  
     1.5  This script reads alignments in maf_ format, and writes them in
     1.6  another format.  It can write them in these formats: axt_, blast,
     1.7 -blasttab, chain_, html, psl_, sam, tab.  You can use it like this::
     1.8 +blasttab, chain_, gff, html, psl_, sam, tab.  You can use it like this::
     1.9  
    1.10    maf-convert psl my-alignments.maf > my-alignments.psl
    1.11  
    1.12 @@ -36,8 +36,8 @@
    1.13  
    1.14    -j N, --join=N
    1.15           Join neighboring alignments if they are co-linear and
    1.16 -         separated by at most N letters.  This affects psl format
    1.17 -         only.
    1.18 +         separated by at most N letters.  This affects psl and gff
    1.19 +         formats only.
    1.20  
    1.21    -n, --noheader
    1.22           Omit any header lines from the output.  This may be useful if
     2.1 --- a/scripts/last-dotplot	Thu May 07 08:14:11 2020 +0900
     2.2 +++ b/scripts/last-dotplot	Mon Jun 08 14:28:46 2020 +0900
     2.3 @@ -1,4 +1,6 @@
     2.4  #! /usr/bin/env python
     2.5 +# Author: Martin C. Frith 2008
     2.6 +# SPDX-License-Identifier: GPL-3.0-or-later
     2.7  
     2.8  # Read pair-wise alignments in MAF or LAST tabular format: write an
     2.9  # "Oxford grid", a.k.a. dotplot.
     3.1 --- a/scripts/last-train	Thu May 07 08:14:11 2020 +0900
     3.2 +++ b/scripts/last-train	Mon Jun 08 14:28:46 2020 +0900
     3.3 @@ -1,5 +1,6 @@
     3.4  #! /usr/bin/env python
     3.5  # Copyright 2015 Martin C. Frith
     3.6 +# SPDX-License-Identifier: GPL-3.0-or-later
     3.7  
     3.8  # References:
     3.9  # [Fri19] How sequence alignment scores correspond to probability models,
     4.1 --- a/scripts/maf-convert	Thu May 07 08:14:11 2020 +0900
     4.2 +++ b/scripts/maf-convert	Mon Jun 08 14:28:46 2020 +0900
     4.3 @@ -1,6 +1,6 @@
     4.4  #! /usr/bin/env python
     4.5  # Copyright 2010, 2011, 2013, 2014 Martin C. Frith
     4.6 -# Read MAF-format alignments: write them in other formats.
     4.7 +# SPDX-License-Identifier: GPL-3.0-or-later
     4.8  # Seems to work with Python 2.x, x>=6
     4.9  
    4.10  # By "MAF" we mean "multiple alignment format" described in the UCSC
    4.11 @@ -9,8 +9,13 @@
    4.12  from __future__ import print_function
    4.13  
    4.14  from itertools import *
    4.15 +import collections
    4.16  import gzip
    4.17 -import math, optparse, os, signal, sys
    4.18 +import math
    4.19 +import optparse
    4.20 +import os
    4.21 +import signal
    4.22 +import sys
    4.23  
    4.24  try:
    4.25      from future_builtins import map, zip
    4.26 @@ -185,7 +190,7 @@
    4.27      if score:
    4.28          outWords.append(score)
    4.29  
    4.30 -    print(" ".join(outWords))
    4.31 +    print(*outWords)
    4.32      for i in sLines:
    4.33          print(i[6])
    4.34      print()  # print a blank line at the end
    4.35 @@ -243,7 +248,7 @@
    4.36  
    4.37      outWords.append(str(next(axtCounter) + 1))
    4.38  
    4.39 -    print(" ".join(outWords))
    4.40 +    print(*outWords)
    4.41  
    4.42      letterSizes = [i[3] for i in sLines]
    4.43      rows = [i[6] for i in sLines]
    4.44 @@ -360,12 +365,10 @@
    4.45      blockCount = len(blocks)
    4.46      blockSizes, blockStartsA, blockStartsB = map(pslCommaString, zip(*blocks))
    4.47  
    4.48 -    outWords = (matches, mismatches, repMatches, nCount,
    4.49 -                numInsertB, baseInsertB, numInsertA, baseInsertA, strand,
    4.50 -                seqNameB, seqLenB, begB, endB, seqNameA, seqLenA, begA, endA,
    4.51 -                blockCount, blockSizes, blockStartsB, blockStartsA)
    4.52 -
    4.53 -    print("\t".join(map(str, outWords)))
    4.54 +    print(matches, mismatches, repMatches, nCount,
    4.55 +          numInsertB, baseInsertB, numInsertA, baseInsertA, strand,
    4.56 +          seqNameB, seqLenB, begB, endB, seqNameA, seqLenA, begA, endA,
    4.57 +          blockCount, blockSizes, blockStartsB, blockStartsA, sep="\t")
    4.58  
    4.59  def mafConvertToPsl(opts, lines):
    4.60      if opts.join:
    4.61 @@ -679,8 +682,8 @@
    4.62      mismatches = alnSize - matches - rowA.count("-") - rowB.count("-")
    4.63      gapOpens = gapRunCount(rowA) + gapRunCount(rowB)
    4.64  
    4.65 -    out = [seqNameB, seqNameA, matchPercent, str(alnSize), str(mismatches),
    4.66 -           str(gapOpens), begB, endB, begA, endA]
    4.67 +    out = [seqNameB, seqNameA, matchPercent, alnSize, mismatches,
    4.68 +           gapOpens, begB, endB, begA, endA]
    4.69  
    4.70      score, evalue = scoreAndEvalue(aLine)
    4.71      if evalue:
    4.72 @@ -689,12 +692,76 @@
    4.73              bitScore = opts.bitScoreA * float(score) - opts.bitScoreB
    4.74              out.append("%.3g" % bitScore)
    4.75  
    4.76 -    print("\t".join(out))
    4.77 +    print(*out, sep="\t")
    4.78  
    4.79  def mafConvertToBlastTab(opts, lines):
    4.80      for maf in mafInput(opts, lines):
    4.81          writeBlastTab(opts, maf)
    4.82  
    4.83 +##### Routines for converting to GFF format: #####
    4.84 +
    4.85 +def writeGffHeader():
    4.86 +    print("##gff-version 3")
    4.87 +
    4.88 +def gffFromMaf(maf):
    4.89 +    aLine, sLines, qLines, pLines = maf
    4.90 +    fieldsA, fieldsB = pairOrDie(sLines, "GFF")
    4.91 +    seqNameA, seqLenA, strandA, letterSizeA, begA, endA, rowA = fieldsA
    4.92 +    seqNameB, seqLenB, strandB, letterSizeB, begB, endB, rowB = fieldsB
    4.93 +
    4.94 +    score = "."
    4.95 +    for i in aLine.split():
    4.96 +        if i.startswith("score="):
    4.97 +            score = i[6:]
    4.98 +
    4.99 +    if strandA == "-":
   4.100 +        begA, endA = seqLenA - endA, seqLenA - begA
   4.101 +    if strandB == "-":
   4.102 +        begB, endB = seqLenB - endB, seqLenB - begB
   4.103 +    begA += 1
   4.104 +    begB += 1
   4.105 +
   4.106 +    strand = "+" if strandA == strandB else "-"
   4.107 +
   4.108 +    return seqNameA, begA, endA, strand, score, seqNameB, begB, endB
   4.109 +
   4.110 +def writeOneGff(gff, typeOfThing, parentId):
   4.111 +    seqNameA, begA, endA, strand, score, seqNameB, begB, endB = gff
   4.112 +    target = "Target={0} {1} {2}".format(seqNameB, begB, endB)
   4.113 +    name = "Name={0}:{1}-{2}".format(seqNameB, begB, endB)
   4.114 +    attributes = [target, name]
   4.115 +    if parentId:
   4.116 +        attributes.append("Parent=" + parentId)
   4.117 +    print(seqNameA, "maf-convert", typeOfThing, begA, endA, score, strand, ".",
   4.118 +          ";".join(attributes), sep="\t")
   4.119 +
   4.120 +def writeGffGroup(qryNameCounts, mafs):
   4.121 +    gffs = [gffFromMaf(i) for i in mafs]
   4.122 +    seqNameA, begA, endA, strand, score, seqNameB, begB, endB = gffs[0]
   4.123 +    endA = gffs[-1][2]
   4.124 +    if strand == "+":
   4.125 +        endB = gffs[-1][7]
   4.126 +    else:
   4.127 +        begB = gffs[-1][6]
   4.128 +    qryNameCounts[seqNameB] += 1
   4.129 +    parentId = "{0}.{1}".format(seqNameB, qryNameCounts[seqNameB])
   4.130 +    myId = "ID=" + parentId
   4.131 +    name = "Name={0}:{1}-{2}".format(seqNameB, begB, endB)
   4.132 +    attributes = [myId, name]
   4.133 +    print(seqNameA, "maf-convert", "match", begA, endA, ".", strand, ".",
   4.134 +          ";".join(attributes), sep="\t")
   4.135 +    for i in gffs:
   4.136 +        writeOneGff(i, "match_part", parentId)
   4.137 +
   4.138 +def mafConvertToGff(opts, lines):
   4.139 +    qryNameCounts = collections.defaultdict(int)
   4.140 +    if opts.join:
   4.141 +        for i in mafGroupInput(opts, lines):
   4.142 +            writeGffGroup(qryNameCounts, i)
   4.143 +    else:
   4.144 +        for i in mafInput(opts, lines):
   4.145 +            writeOneGff(gffFromMaf(i), "match", None)
   4.146 +
   4.147  ##### Routines for converting to HTML format: #####
   4.148  
   4.149  def writeHtmlHeader():
   4.150 @@ -812,6 +879,8 @@
   4.151          mafConvertToBlastTab(opts, lines)
   4.152      elif isFormat(formatName, "chain"):
   4.153          mafConvertToChain(opts, lines)
   4.154 +    elif isFormat(formatName, "gff"):
   4.155 +        mafConvertToGff(opts, lines)
   4.156      elif isFormat(formatName, "html"):
   4.157          mafConvertToHtml(opts, lines)
   4.158      elif isFormat(formatName, "psl"):
   4.159 @@ -832,6 +901,8 @@
   4.160      opts.bitScoreB = None
   4.161  
   4.162      if not opts.noheader:
   4.163 +        if isFormat(formatName, "gff"):
   4.164 +            writeGffHeader()
   4.165          if isFormat(formatName, "html"):
   4.166              writeHtmlHeader()
   4.167          if isFormat(formatName, "sam"):
   4.168 @@ -859,6 +930,7 @@
   4.169    %prog blast mafFile(s)
   4.170    %prog blasttab mafFile(s)
   4.171    %prog chain mafFile(s)
   4.172 +  %prog gff mafFile(s)
   4.173    %prog html mafFile(s)
   4.174    %prog psl mafFile(s)
   4.175    %prog sam mafFile(s)
     5.1 --- a/scripts/maf-cut	Thu May 07 08:14:11 2020 +0900
     5.2 +++ b/scripts/maf-cut	Mon Jun 08 14:28:46 2020 +0900
     5.3 @@ -1,4 +1,5 @@
     5.4  #! /usr/bin/env python
     5.5 +# Author: Martin C. Frith 2018
     5.6  
     5.7  from __future__ import print_function
     5.8  
     6.1 --- a/test/maf-convert-test.out	Thu May 07 08:14:11 2020 +0900
     6.2 +++ b/test/maf-convert-test.out	Mon Jun 08 14:28:46 2020 +0900
     6.3 @@ -4,6 +4,7 @@
     6.4    maf-convert blast mafFile(s)
     6.5    maf-convert blasttab mafFile(s)
     6.6    maf-convert chain mafFile(s)
     6.7 +  maf-convert gff mafFile(s)
     6.8    maf-convert html mafFile(s)
     6.9    maf-convert psl mafFile(s)
    6.10    maf-convert sam mafFile(s)
    6.11 @@ -8706,6 +8707,81 @@
    6.12  46
    6.13  chain 156 chr3 198022430 + 37456613 37456655 SRR359290.9033 75 + 0 42 196
    6.14  42
    6.15 +##gff-version 3
    6.16 +chr3	maf-convert	match	23607557	23607579	138	+	.	Target=102 674 696;Name=102:674-696
    6.17 +chr14	maf-convert	match	104715673	104715703	126	+	.	Target=102 480 508;Name=102:480-508
    6.18 +chr2	maf-convert	match	240456632	240456656	125	+	.	Target=102 92 118;Name=102:92-118
    6.19 +chr11	maf-convert	match	67464147	67464169	124	+	.	Target=102 477 499;Name=102:477-499
    6.20 +chr1	maf-convert	match	21590631	21590682	121	+	.	Target=102 468 504;Name=102:468-504
    6.21 +chr10	maf-convert	match	122813035	122813060	120	+	.	Target=102 473 499;Name=102:473-499
    6.22 +chr10	maf-convert	match	124281395	124281414	120	+	.	Target=102 619 638;Name=102:619-638
    6.23 +chr1	maf-convert	match	161050906	161050943	119	+	.	Target=102 459 490;Name=102:459-490
    6.24 +chr5	maf-convert	match	93489543	93489564	118	+	.	Target=102 7 28;Name=102:7-28
    6.25 +chr14	maf-convert	match	80332130	80332156	118	+	.	Target=102 641 667;Name=102:641-667
    6.26 +chr15	maf-convert	match	45614750	45614773	118	+	.	Target=102 468 491;Name=102:468-491
    6.27 +chr8	maf-convert	match	76840859	76840909	117	+	.	Target=102 426 465;Name=102:426-465
    6.28 +chr3	maf-convert	match	152862894	152862920	116	+	.	Target=102 88 112;Name=102:88-112
    6.29 +chr1	maf-convert	match	145870018	145870036	114	+	.	Target=102 141 159;Name=102:141-159
    6.30 +chr5	maf-convert	match	4867426	4867444	114	+	.	Target=102 473 491;Name=102:473-491
    6.31 +chr5	maf-convert	match	74376578	74376596	114	+	.	Target=102 11 29;Name=102:11-29
    6.32 +chr7	maf-convert	match	157969524	157969545	114	+	.	Target=102 494 515;Name=102:494-515
    6.33 +chr13	maf-convert	match	113342216	113342234	114	+	.	Target=102 471 489;Name=102:471-489
    6.34 +chr17	maf-convert	match	19079610	19079628	114	+	.	Target=102 471 489;Name=102:471-489
    6.35 +chr17	maf-convert	match	19205792	19205810	114	+	.	Target=102 471 489;Name=102:471-489
    6.36 +chr17	maf-convert	match	27793640	27793658	114	+	.	Target=102 547 565;Name=102:547-565
    6.37 +chr18	maf-convert	match	8561209	8561245	114	+	.	Target=102 460 490;Name=102:460-490
    6.38 +chr21	maf-convert	match	14291181	14291209	114	+	.	Target=102 667 693;Name=102:667-693
    6.39 +chr22	maf-convert	match	23657491	23657509	114	+	.	Target=102 471 489;Name=102:471-489
    6.40 +chrX	maf-convert	match	83302602	83302620	114	+	.	Target=102 681 699;Name=102:681-699
    6.41 +chr5	maf-convert	match	173701200	173701222	113	+	.	Target=102 509 533;Name=102:509-533
    6.42 +chr3	maf-convert	match	127741376	127741402	112	+	.	Target=102 486 508;Name=102:486-508
    6.43 +chr4	maf-convert	match	3839063	3839083	112	+	.	Target=102 434 454;Name=102:434-454
    6.44 +chr18	maf-convert	match	55335093	55335113	112	+	.	Target=102 257 277;Name=102:257-277
    6.45 +chr17	maf-convert	match	28925930	28925955	111	+	.	Target=102 465 494;Name=102:465-494
    6.46 +chr3	maf-convert	match	186865652	186865672	110	+	.	Target=102 443 463;Name=102:443-463
    6.47 +chr5	maf-convert	match	169763616	169763644	110	+	.	Target=102 488 512;Name=102:488-512
    6.48 +chr7	maf-convert	match	65260307	65260327	110	+	.	Target=102 443 463;Name=102:443-463
    6.49 +chr19	maf-convert	match	47072202	47072237	110	+	.	Target=102 372 403;Name=102:372-403
    6.50 +chr20	maf-convert	match	4034148	4034168	110	+	.	Target=102 443 463;Name=102:443-463
    6.51 +chr19	maf-convert	match	1390856	1391052	954	-	.	Target=102 222 410;Name=102:222-410
    6.52 +chr19	maf-convert	match	1388830	1388938	495	-	.	Target=102 402 509;Name=102:402-509
    6.53 +chr19	maf-convert	match	1388524	1388593	321	-	.	Target=102 507 575;Name=102:507-575
    6.54 +chr19	maf-convert	match	1393241	1393331	320	-	.	Target=102 99 178;Name=102:99-178
    6.55 +chr19	maf-convert	match	1395388	1395484	291	-	.	Target=102 4 102;Name=102:4-102
    6.56 +chr19	maf-convert	match	1391117	1391163	237	-	.	Target=102 179 225;Name=102:179-225
    6.57 +chr19	maf-convert	match	1387808	1387847	215	-	.	Target=102 575 616;Name=102:575-616
    6.58 +chr19	maf-convert	match	1383911	1383942	178	-	.	Target=102 614 645;Name=102:614-645
    6.59 +chr7	maf-convert	match	101590380	101590420	133	-	.	Target=102 484 524;Name=102:484-524
    6.60 +chr12	maf-convert	match	71544987	71545025	129	-	.	Target=102 459 490;Name=102:459-490
    6.61 +chr19	maf-convert	match	17615431	17615451	126	-	.	Target=102 626 646;Name=102:626-646
    6.62 +chr19	maf-convert	match	18430560	18430588	124	-	.	Target=102 481 505;Name=102:481-505
    6.63 +chr8	maf-convert	match	98433262	98433291	121	-	.	Target=102 485 509;Name=102:485-509
    6.64 +chr4	maf-convert	match	54783705	54783740	120	-	.	Target=102 459 488;Name=102:459-488
    6.65 +chr3	maf-convert	match	130609409	130609434	119	-	.	Target=102 486 510;Name=102:486-510
    6.66 +chr3	maf-convert	match	37588377	37588414	117	-	.	Target=102 90 126;Name=102:90-126
    6.67 +chr16	maf-convert	match	722484	722513	116	-	.	Target=102 88 113;Name=102:88-113
    6.68 +chr2	maf-convert	match	192414463	192414489	115	-	.	Target=102 501 526;Name=102:501-526
    6.69 +chr7	maf-convert	match	101723246	101723286	115	-	.	Target=102 458 491;Name=102:458-491
    6.70 +chr10	maf-convert	match	126932870	126932910	115	-	.	Target=102 605 637;Name=102:605-637
    6.71 +chr22	maf-convert	match	20726589	20726617	115	-	.	Target=102 69 95;Name=102:69-95
    6.72 +chr22	maf-convert	match	46458801	46458822	115	-	.	Target=102 636 657;Name=102:636-657
    6.73 +chr3	maf-convert	match	89815480	89815498	114	-	.	Target=102 503 521;Name=102:503-521
    6.74 +chr5	maf-convert	match	171574455	171574473	114	-	.	Target=102 457 475;Name=102:457-475
    6.75 +chr6	maf-convert	match	1842832	1842850	114	-	.	Target=102 471 489;Name=102:471-489
    6.76 +chr7	maf-convert	match	100370597	100370615	114	-	.	Target=102 471 489;Name=102:471-489
    6.77 +chr12	maf-convert	match	19357536	19357554	114	-	.	Target=102 489 507;Name=102:489-507
    6.78 +chr17	maf-convert	match	19172451	19172469	114	-	.	Target=102 471 489;Name=102:471-489
    6.79 +chr17	maf-convert	match	29190070	29190088	114	-	.	Target=102 478 496;Name=102:478-496
    6.80 +chr17	maf-convert	match	58670491	58670509	114	-	.	Target=102 471 489;Name=102:471-489
    6.81 +chr20	maf-convert	match	17684745	17684763	114	-	.	Target=102 103 121;Name=102:103-121
    6.82 +chr21	maf-convert	match	43027935	43027953	114	-	.	Target=102 485 503;Name=102:485-503
    6.83 +chrX	maf-convert	match	39928417	39928435	114	-	.	Target=102 471 489;Name=102:471-489
    6.84 +chr3	maf-convert	match	33026625	33026648	112	-	.	Target=102 503 526;Name=102:503-526
    6.85 +chr5	maf-convert	match	177958542	177958564	111	-	.	Target=102 541 563;Name=102:541-563
    6.86 +chr1	maf-convert	match	155997874	155997913	110	-	.	Target=102 486 524;Name=102:486-524
    6.87 +chr6	maf-convert	match	65786102	65786124	110	-	.	Target=102 224 246;Name=102:224-246
    6.88 +chr6	maf-convert	match	170257702	170257727	110	-	.	Target=102 485 508;Name=102:485-508
    6.89 +chr15	maf-convert	match	78230486	78230510	110	-	.	Target=102 510 533;Name=102:510-533
    6.90  
    6.91  <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
    6.92   "http://www.w3.org/TR/html4/strict.dtd">
     7.1 --- a/test/maf-convert-test.sh	Thu May 07 08:14:11 2020 +0900
     7.2 +++ b/test/maf-convert-test.sh	Mon Jun 08 14:28:46 2020 +0900
     7.3 @@ -19,6 +19,7 @@
     7.4      $r blast $maf2
     7.5      $r blasttab $maf2
     7.6      head -n999 $maf1 | $r chain
     7.7 +    $r gff 102.maf
     7.8      $r html -l100 $maf2
     7.9      head -n999 $maf1 | $r -n html
    7.10      head -n999 $maf1 | $r psl
    7.11 @@ -31,5 +32,4 @@
    7.12      $r -d sam $maf1
    7.13      head -n999 $maf1 | $r -n tab
    7.14      head -n999 $maf1 | $r tab
    7.15 -} |
    7.16 -diff maf-convert-test.out -
    7.17 +} | diff -u maf-convert-test.out -