scripts/maf-sort
author Martin C. Frith
Fri Jun 02 18:40:29 2017 +0900 (2017-06-02)
changeset 863 6a4915d5b5cb
parent 159 6c6ac7c8f6ed
permissions -rwxr-xr-x
last-dotplot: get bp-per-pixel faster
     1 #! /bin/sh
     2 
     3 # Sort MAF-format alignments by sequence name, then strand, then start
     4 # position, then end position, of the top sequence.  Also, merge
     5 # identical alignments.  Comment lines starting with "#" are written
     6 # at the top, in unchanged order.  If option "-d" is specified, then
     7 # alignments that appear only once are omitted (like uniq -d).
     8 
     9 # Minor flaws, that do not matter for typical MAF input:
    10 # 1) It might not work if the input includes TABs.
    11 # 2) Preceding whitespace is considered part of the sequence name.  I
    12 # want to use sort -b, but it seems to be broken in different ways for
    13 # different versions of sort!
    14 # 3) Alignments with differences in whitespace are considered
    15 # non-identical.
    16 
    17 # This script uses perl instead of specialized commands like uniq.
    18 # The reason is that, on some systems (e.g. Mac OS X), uniq doesn't
    19 # work with long lines.
    20 
    21 # Make "sort" use a standard ordering:
    22 LC_ALL=C
    23 export LC_ALL
    24 
    25 uniqOpt=1
    26 whichSequence=1
    27 while getopts hdn: opt
    28 do
    29     case $opt in
    30 	h)  cat <<EOF
    31 Usage: $(basename $0) [options] my-alignments.maf
    32 
    33 Options:
    34   -h  show this help message and exit
    35   -d  only print duplicate alignments
    36   -n  sort by the n-th sequence (default: 1)
    37 EOF
    38 	    exit
    39 	    ;;
    40 	d)  uniqOpt=2
    41             ;;
    42 	n)  whichSequence="$OPTARG"
    43 	    ;;
    44     esac
    45 done
    46 shift $((OPTIND - 1))
    47 
    48 baseField=$((6 * $whichSequence))
    49 a=$(($baseField - 4))
    50 a=$a,$a
    51 b=$(($baseField - 1))
    52 b=$b,$b
    53 c=$(($baseField - 3))
    54 c=$c,$c
    55 d=$(($baseField - 2))
    56 d=$d,$d
    57 
    58 # 1) Add digits to "#" lines, so that sorting won't change their order.
    59 # 2) Replace spaces, except in "s" lines.
    60 # 3) Join each alignment into one big line.
    61 perl -pe '
    62 s/^#/sprintf("#%.9d",$c++)/e;
    63 y/ /\a/ unless /^s/;
    64 y/\n/\b/ if /^\w/;
    65 ' "$@" |
    66 
    67 sort -k$a -k$b -k${c}n -k${d}n |  # sort the lines
    68 
    69 # Print only the first (or second) of each run of identical lines:
    70 perl -ne '$c = 0 if $x ne $_; $x = $_; print if ++$c == '$uniqOpt |
    71 
    72 # 1) Remove the digits from "#" lines.
    73 # 2) Restore spaces and newlines.
    74 perl -pe '
    75 s/^#.{9}/#/;
    76 y/\a\b/ \n/;
    77 '