scripts/maf-sort
author Martin C. Frith
Fri Jun 02 18:40:29 2017 +0900 (2017-06-02)
changeset 863 6a4915d5b5cb
parent 159 6c6ac7c8f6ed
permissions -rwxr-xr-x
last-dotplot: get bp-per-pixel faster
Martin@1
     1
#! /bin/sh
Martin@1
     2
Martin@144
     3
# Sort MAF-format alignments by sequence name, then strand, then start
# position, then end position, of the top sequence (or of the n-th
# sequence, if option "-n" is specified).  Also, merge identical
# alignments.  Comment lines starting with "#" are written at the top,
# in unchanged order.  If option "-d" is specified, then alignments
# that appear only once are omitted (like uniq -d).

# Minor flaws that do not matter for typical MAF input:
# 1) It might not work if the input includes TABs.
# 2) Preceding whitespace is considered part of the sequence name.  I
#    want to use sort -b, but it seems to be broken in different ways
#    for different versions of sort!
# 3) Alignments with differences in whitespace are considered
#    non-identical.

# This script uses perl instead of specialized commands like uniq.
# The reason is that, on some systems (e.g. Mac OS X), uniq doesn't
# work with long lines.
Martin@94
    20
Martin@45
    21
# Force the C locale for every command run below, so that "sort" uses
# the same standard ordering whatever the user's environment says:
export LC_ALL=C
Martin@45
    24
Martin@94
    25
# Defaults, which the command-line options below may override:
uniqOpt=1        # print the 1st line of each run of identical alignments
whichSequence=1  # sort by this sequence of each alignment

# Print the help message on standard output.
printUsage() {
    cat <<EOF
Usage: $(basename "$0") [options] my-alignments.maf

Options:
  -h  show this help message and exit
  -d  only print duplicate alignments
  -n  sort by the n-th sequence (default: 1)
EOF
}

# parseOptions ARG...
# Read the options at the start of the given arguments, setting uniqOpt
# and whichSequence accordingly.  getopts leaves OPTIND pointing at the
# first non-option argument.  For -h, this prints the help message and
# exits the script.  For an unknown option, or a -n without a value,
# getopts has already printed a diagnostic: this then prints the usage
# message on standard error and returns 2, instead of silently carrying
# on with the sort.
parseOptions() {
    while getopts hdn: opt
    do
        case $opt in
            h)  printUsage
                exit
                ;;
            d)  uniqOpt=2
                ;;
            n)  whichSequence="$OPTARG"
                ;;
            *)  printUsage >&2
                return 2
                ;;
        esac
    done
}

parseOptions "$@" || exit 2
# Drop the options, leaving only the input file names in "$@":
shift $((OPTIND - 1))
Martin@1
    47
Martin@158
    48
# setSortKeys N
# Set a, b, c, d to the "sort -k" field ranges (each of the form "F,F")
# used for sorting by the N-th sequence of each alignment.  The
# arithmetic assumes that, once an alignment has been joined into one
# line, each sequence occupies 6 blank-separated fields; baseField is
# the last of the 6 fields belonging to sequence N.
#
# N must be a positive decimal integer.  It is checked here because
# pasting an unchecked value into $(( )) goes wrong quietly: "1+1"
# would yield 6*1+1 = 7 rather than 12, "08" is an invalid octal
# constant, and 0 gives negative field numbers that sort rejects with
# an obscure message.  On a bad N this prints a message on standard
# error and returns 1, leaving a, b, c, d untouched.
setSortKeys() {
    seqNum=$1
    case $seqNum in
        ''|*[!0-9]*)
            echo "$(basename "$0"): bad sequence number: $1" >&2
            return 1
            ;;
    esac
    # Strip leading zeros, so the number cannot be read as octal:
    seqNum=${seqNum#"${seqNum%%[!0]*}"}
    if [ -z "$seqNum" ]  # nothing left: the number was zero
    then
        echo "$(basename "$0"): bad sequence number: $1" >&2
        return 1
    fi

    baseField=$((6 * seqNum))
    a=$((baseField - 4))
    a=$a,$a
    b=$((baseField - 1))
    b=$b,$b
    c=$((baseField - 3))
    c=$c,$c
    d=$((baseField - 2))
    d=$d,$d
}

# whichSequence normally comes from option -n; fall back to the
# documented default of 1 if it was never set.
setSortKeys "${whichSequence-1}" || exit 2
Martin@158
    57
Martin@159
    58
# Stage 1: turn each alignment into a single sortable line.
#   - Give every "#" line a 9-digit serial number, so that sorting
#     keeps these lines in their original order.
#   - Replace spaces by BEL characters, except in "s" lines.
#   - Replace the newline by a backspace character on every line that
#     starts with a word character, which glues an alignment's lines
#     together.
perl -pe '
    s/^#/sprintf("#%.9d",$serial++)/e;
    tr/ /\a/ unless /^s/;
    tr/\n/\b/ if /^\w/;
' "$@" |

# Stage 2: sort the joined lines, using keys a and b as text, then
# keys c and d as numbers.
sort -k"$a" -k"$b" -k"${c}n" -k"${d}n" |

# Stage 3: from each run of identical lines, print just the one whose
# position in the run equals uniqOpt (the first, or the second).
perl -ne '
    $run = 0 if $prev ne $_;
    $prev = $_;
    print if ++$run == '"$uniqOpt"';
' |

# Stage 4: undo stage 1.
#   - Strip the serial number from "#" lines.
#   - Turn BEL and backspace characters back into spaces and newlines.
perl -pe '
    s/^#.{9}/#/;
    tr/\a\b/ \n/;
'