scripts/last-train
author Martin C. Frith
Fri Jun 02 18:40:29 2017 +0900 (2017-06-02)
changeset 863 6a4915d5b5cb
parent 848 7c09e0d848f2
child 867 f9ec9c71d72e
permissions -rwxr-xr-x
last-dotplot: get bp-per-pixel faster
Martin@587
     1
#! /usr/bin/env python
Martin@587
     2
# Copyright 2015 Martin C. Frith
Martin@587
     3
Martin@761
     4
import math, optparse, os, random, signal, subprocess, sys, tempfile
Martin@587
     5
Martin@848
     6
def randomSample(things, sampleSize):
Martin@848
     7
    """Randomly get sampleSize things (or all if fewer)."""
Martin@848
     8
    reservoir = []  # "reservoir sampling" algorithm
Martin@848
     9
    for i, x in enumerate(things):
Martin@848
    10
        if i < sampleSize:
Martin@848
    11
            reservoir.append(x)
Martin@848
    12
        else:
Martin@848
    13
            r = random.randrange(i + 1)
Martin@848
    14
            if r < sampleSize:
Martin@848
    15
                reservoir[r] = x
Martin@848
    16
    return reservoir
Martin@848
    17
Martin@763
    18
def writeWords(outFile, words):
Martin@763
    19
    outFile.write(" ".join(words) + "\n")
Martin@587
    20
Martin@761
    21
def seqInput(fileNames):
Martin@761
    22
    for name in fileNames:
Martin@761
    23
        with open(name) as file:
Martin@761
    24
            seqType = 0
Martin@761
    25
            for line in file:
Martin@761
    26
                if seqType == 0:
Martin@761
    27
                    if line[0] == ">":
Martin@761
    28
                        seqType = 1
Martin@761
    29
                        seq = []
Martin@761
    30
                    elif line[0] == "@":
Martin@761
    31
                        seqType = 2
Martin@761
    32
                        lineType = 1
Martin@761
    33
                elif seqType == 1:  # fasta
Martin@761
    34
                    if line[0] == ">":
Martin@761
    35
                        yield "".join(seq), ""
Martin@761
    36
                        seq = []
Martin@761
    37
                    else:
Martin@761
    38
                        seq.append(line.rstrip())
Martin@761
    39
                elif seqType == 2:  # fastq
Martin@761
    40
                    if lineType == 1:
Martin@761
    41
                        seq = line.rstrip()
Martin@761
    42
                    elif lineType == 3:
Martin@761
    43
                        yield seq, line.rstrip()
Martin@761
    44
                    lineType = (lineType + 1) % 4
Martin@761
    45
            if seqType == 1: yield "".join(seq), ""
Martin@761
    46
Martin@848
    47
def isGoodChunk(chunk):
Martin@848
    48
    for i in chunk:
Martin@848
    49
        for j in i[3]:
Martin@848
    50
            if j not in "Nn":
Martin@848
    51
                return True
Martin@848
    52
    return False
Martin@761
    53
Martin@848
    54
def chunkInput(opts, sequences):
Martin@848
    55
    chunkCount = 0
Martin@848
    56
    chunk = []
Martin@848
    57
    wantedLength = opts.sample_length
Martin@848
    58
    for i, x in enumerate(sequences):
Martin@848
    59
        seq, qual = x
Martin@848
    60
        if all(i in "Nn" for i in seq): continue
Martin@848
    61
        seqLength = len(seq)
Martin@848
    62
        beg = 0
Martin@848
    63
        while beg < seqLength:
Martin@848
    64
            length = min(wantedLength, seqLength - beg)
Martin@848
    65
            end = beg + length
Martin@848
    66
            segment = i, beg, end, seq[beg:end], qual[beg:end]
Martin@848
    67
            chunk.append(segment)
Martin@848
    68
            wantedLength -= length
Martin@848
    69
            if not wantedLength:
Martin@848
    70
                if isGoodChunk(chunk):
Martin@848
    71
                    yield chunk
Martin@848
    72
                    chunkCount += 1
Martin@848
    73
                chunk = []
Martin@848
    74
                wantedLength = opts.sample_length
Martin@848
    75
            beg = end
Martin@848
    76
    if chunk and chunkCount < opts.sample_number:
Martin@848
    77
        yield chunk
Martin@848
    78
Martin@848
    79
def writeSegment(outfile, segment):
Martin@848
    80
    if not segment: return
Martin@848
    81
    i, beg, end, seq, qual = segment
Martin@848
    82
    name = str(i) + ":" + str(beg)
Martin@848
    83
    if qual:
Martin@848
    84
        outfile.write("@" + name + "\n")
Martin@848
    85
        outfile.write(seq)
Martin@848
    86
        outfile.write("\n+\n")
Martin@848
    87
        outfile.write(qual)
Martin@848
    88
    else:
Martin@848
    89
        outfile.write(">" + name + "\n")
Martin@848
    90
        outfile.write(seq)
Martin@848
    91
    outfile.write("\n")
Martin@761
    92
Martin@761
    93
def getSeqSample(opts, queryFiles, outfile):
Martin@848
    94
    sequences = seqInput(queryFiles)
Martin@848
    95
    chunks = chunkInput(opts, sequences)
Martin@848
    96
    sample = randomSample(chunks, opts.sample_number)
Martin@848
    97
    sample.sort()
Martin@848
    98
    x = None
Martin@848
    99
    for chunk in sample:
Martin@848
   100
        for y in chunk:
Martin@848
   101
            if x and y[0] == x[0] and y[1] == x[2]:
Martin@848
   102
                x = x[0], x[1], y[2], x[3] + y[3], x[4] + y[4]
Martin@848
   103
            else:
Martin@848
   104
                writeSegment(outfile, x)
Martin@848
   105
                x = y
Martin@848
   106
    writeSegment(outfile, x)
Martin@761
   107
Martin@587
   108
def scaleFromHeader(lines):
Martin@587
   109
    for line in lines:
Martin@587
   110
        for i in line.split():
Martin@587
   111
            if i.startswith("t="):
Martin@587
   112
                return float(i[2:])
Martin@587
   113
    raise Exception("couldn't read the scale")
Martin@587
   114
Martin@587
   115
def scoreMatrixFromHeader(lines):
Martin@587
   116
    matrix = []
Martin@587
   117
    for line in lines:
Martin@587
   118
        w = line.split()
Martin@587
   119
        if len(w) > 2 and len(w[1]) == 1:
Martin@587
   120
            matrix.append(w[1:])
Martin@587
   121
        elif matrix:
Martin@587
   122
            break
Martin@587
   123
    return matrix
Martin@587
   124
Martin@587
   125
def scaledMatrix(matrix, scaleIncrease):
Martin@587
   126
    return matrix[0:1] + [i[0:1] + [int(j) * scaleIncrease for j in i[1:]]
Martin@587
   127
                          for i in matrix[1:]]
Martin@587
   128
Martin@623
   129
def countsFromLastOutput(lines, opts):
Martin@587
   130
    matrix = []
Martin@658
   131
    # use +1 pseudocounts as a kludge to mitigate numerical problems:
Martin@658
   132
    matches = 1.0
Martin@658
   133
    deletes = 2.0  # 1 open + 1 extension
Martin@658
   134
    inserts = 2.0  # 1 open + 1 extension
Martin@658
   135
    delOpens = 1.0
Martin@658
   136
    insOpens = 1.0
Martin@658
   137
    alignments = 0  # no pseudocount here
Martin@587
   138
    for line in lines:
Martin@587
   139
        if line[0] == "s":
Martin@587
   140
            strand = line.split()[4]  # slow?
Martin@587
   141
        if line[0] == "c":
Martin@587
   142
            c = map(float, line.split()[1:])
Martin@587
   143
            if not matrix:
Martin@587
   144
                matrixSize = int(math.sqrt(len(c) - 10))
Martin@658
   145
                matrix = [[1.0] * matrixSize for i in range(matrixSize)]
Martin@587
   146
            identities = sum(c[i * matrixSize + i] for i in range(matrixSize))
Martin@587
   147
            alignmentLength = c[-10] + c[-9] + c[-8]
Martin@623
   148
            if 100 * identities > opts.pid * alignmentLength: continue
Martin@587
   149
            for i in range(matrixSize):
Martin@587
   150
                for j in range(matrixSize):
Martin@633
   151
                    if strand == "+" or opts.S == "0":
Martin@587
   152
                        matrix[i][j]       += c[i * matrixSize + j]
Martin@587
   153
                    else:
Martin@587
   154
                        matrix[-1-i][-1-j] += c[i * matrixSize + j]
Martin@587
   155
            matches += c[-10]
Martin@587
   156
            deletes += c[-9]
Martin@587
   157
            inserts += c[-8]
Martin@587
   158
            delOpens += c[-7]
Martin@587
   159
            insOpens += c[-6]
Martin@587
   160
            alignments += 1
Martin@587
   161
    gapCounts = matches, deletes, inserts, delOpens, insOpens, alignments
Martin@587
   162
    return matrix, gapCounts
Martin@587
   163
Martin@587
   164
def scoreFromProb(scale, prob):
Martin@638
   165
    if prob > 0: logProb = math.log(prob)
Martin@638
   166
    else:        logProb = -800  # exp(-800) is exactly zero, on my computer
Martin@638
   167
    return int(round(scale * logProb))
Martin@587
   168
Martin@587
   169
def costFromProb(scale, prob):
Martin@587
   170
    return -scoreFromProb(scale, prob)
Martin@587
   171
Martin@763
   172
def guessAlphabet(matrixSize):
Martin@763
   173
    if matrixSize ==  4: return "ACGT"
Martin@763
   174
    if matrixSize == 20: return "ACDEFGHIKLMNPQRSTVWY"
Martin@763
   175
    raise Exception("can't handle unusual alphabets")
Martin@763
   176
Martin@763
   177
def matrixWithLetters(matrix):
Martin@763
   178
    alphabet = guessAlphabet(len(matrix))
Martin@763
   179
    return [alphabet] + [[a] + i for a, i in zip(alphabet, matrix)]
Martin@763
   180
Martin@763
   181
def writeMatrixHead(outFile, prefix, alphabet, formatString):
Martin@763
   182
    writeWords(outFile, [prefix + " "] + [formatString % k for k in alphabet])
Martin@763
   183
Martin@763
   184
def writeMatrixBody(outFile, prefix, alphabet, matrix, formatString):
Martin@763
   185
    for i, j in zip(alphabet, matrix):
Martin@763
   186
        writeWords(outFile, [prefix + i] + [formatString % k for k in j])
Martin@763
   187
Martin@763
   188
def writeCountMatrix(outFile, matrix, prefix):
Martin@763
   189
    alphabet = guessAlphabet(len(matrix))
Martin@763
   190
    writeMatrixHead(outFile, prefix, alphabet, "%-14s")
Martin@763
   191
    writeMatrixBody(outFile, prefix, alphabet, matrix, "%-14s")
Martin@763
   192
Martin@763
   193
def writeProbMatrix(outFile, matrix, prefix):
Martin@763
   194
    alphabet = guessAlphabet(len(matrix))
Martin@763
   195
    writeMatrixHead(outFile, prefix, alphabet, "%-14s")
Martin@763
   196
    writeMatrixBody(outFile, prefix, alphabet, matrix, "%-14g")
Martin@763
   197
Martin@763
   198
def writeScoreMatrix(outFile, matrix, prefix):
Martin@763
   199
    alphabet = guessAlphabet(len(matrix))
Martin@763
   200
    writeMatrixHead(outFile, prefix, alphabet, "%6s")
Martin@763
   201
    writeMatrixBody(outFile, prefix, alphabet, matrix, "%6s")
Martin@763
   202
Martin@763
   203
def writeMatrixWithLetters(outFile, matrix, prefix):
Martin@763
   204
    head = matrix[0]
Martin@763
   205
    tail = matrix[1:]
Martin@763
   206
    left = [i[0] for i in tail]
Martin@763
   207
    body = [i[1:] for i in tail]
Martin@763
   208
    writeMatrixHead(outFile, prefix, head, "%6s")
Martin@763
   209
    writeMatrixBody(outFile, prefix, left, body, "%6s")
Martin@763
   210
Martin@587
   211
def matProbsFromCounts(counts, opts):
Martin@587
   212
    r = range(len(counts))
Martin@587
   213
    if opts.revsym:  # add complement (reverse strand) substitutions
Martin@587
   214
        counts = [[counts[i][j] + counts[-1-i][-1-j] for j in r] for i in r]
Martin@587
   215
    if opts.matsym:  # symmetrize the substitution matrix
Martin@587
   216
        counts = [[counts[i][j] + counts[j][i] for j in r] for i in r]
Martin@587
   217
    identities = sum(counts[i][i] for i in r)
Martin@587
   218
    total = sum(map(sum, counts))
Martin@587
   219
    probs = [[j / total for j in i] for i in counts]
Martin@587
   220
Martin@587
   221
    print "# substitution percent identity: %g" % (100 * identities / total)
Martin@587
   222
    print
Martin@764
   223
    print "# count matrix (query letters = columns, reference letters = rows):"
Martin@763
   224
    writeCountMatrix(sys.stdout, counts, "# ")
Martin@587
   225
    print
Martin@764
   226
    print "# probability matrix (query letters = columns, reference letters = rows):"
Martin@763
   227
    writeProbMatrix(sys.stdout, probs, "# ")
Martin@587
   228
    print
Martin@587
   229
Martin@587
   230
    return probs
Martin@587
   231
Martin@587
   232
def gapProbsFromCounts(counts, opts):
Martin@587
   233
    matches, deletes, inserts, delOpens, insOpens, alignments = counts
Martin@587
   234
    if not alignments: raise Exception("no alignments")
Martin@587
   235
    gaps = deletes + inserts
Martin@587
   236
    gapOpens = delOpens + insOpens
Martin@658
   237
    denominator = matches + gapOpens + (alignments + 1)  # +1 pseudocount
Martin@587
   238
    if opts.gapsym:
Martin@587
   239
        delOpenProb = gapOpens / denominator / 2
Martin@587
   240
        insOpenProb = gapOpens / denominator / 2
Martin@658
   241
        delExtendProb = (gaps - gapOpens) / gaps
Martin@658
   242
        insExtendProb = (gaps - gapOpens) / gaps
Martin@587
   243
    else:
Martin@587
   244
        delOpenProb = delOpens / denominator
Martin@587
   245
        insOpenProb = insOpens / denominator
Martin@658
   246
        delExtendProb = (deletes - delOpens) / deletes
Martin@658
   247
        insExtendProb = (inserts - insOpens) / inserts
Martin@587
   248
Martin@587
   249
    print "# aligned letter pairs:", matches
Martin@587
   250
    print "# deletes:", deletes
Martin@587
   251
    print "# inserts:", inserts
Martin@587
   252
    print "# delOpens:", delOpens
Martin@587
   253
    print "# insOpens:", insOpens
Martin@587
   254
    print "# alignments:", alignments
Martin@658
   255
    print "# mean delete size: %g" % (deletes / delOpens)
Martin@658
   256
    print "# mean insert size: %g" % (inserts / insOpens)
Martin@587
   257
    print "# delOpenProb: %g" % delOpenProb
Martin@587
   258
    print "# insOpenProb: %g" % insOpenProb
Martin@587
   259
    print "# delExtendProb: %g" % delExtendProb
Martin@587
   260
    print "# insExtendProb: %g" % insExtendProb
Martin@587
   261
    print
Martin@587
   262
Martin@587
   263
    delCloseProb = 1 - delExtendProb
Martin@587
   264
    insCloseProb = 1 - insExtendProb
Martin@587
   265
    firstDelProb = delOpenProb * delCloseProb
Martin@587
   266
    firstInsProb = insOpenProb * insCloseProb
Martin@752
   267
Martin@752
   268
    # If we define "an alignment" to mean "a set of indistinguishable
Martin@752
   269
    # paths", then:
Martin@587
   270
    #delExtendProb += firstDelProb
Martin@587
   271
    #insExtendProb += firstInsProb
Martin@752
   272
    # Else, this ensures gap existence cost >= 0:
Martin@752
   273
    delExtendProb = max(delExtendProb, firstDelProb)
Martin@752
   274
    insExtendProb = max(insExtendProb, firstInsProb)
Martin@752
   275
Martin@587
   276
    delExistProb = firstDelProb / delExtendProb
Martin@587
   277
    insExistProb = firstInsProb / insExtendProb
Martin@587
   278
Martin@587
   279
    return delExistProb, insExistProb, delExtendProb, insExtendProb
Martin@587
   280
Martin@587
   281
def scoreFromLetterProbs(scale, pairProb, prob1, prob2):
Martin@587
   282
    probRatio = pairProb / (prob1 * prob2)
Martin@587
   283
    return scoreFromProb(scale, probRatio)
Martin@587
   284
Martin@587
   285
def matScoresFromProbs(scale, probs):
Martin@587
   286
    rowProbs = map(sum, probs)
Martin@587
   287
    colProbs = map(sum, zip(*probs))
Martin@587
   288
    return [[scoreFromLetterProbs(scale, j, x, y) for j, y in zip(i, colProbs)]
Martin@587
   289
            for i, x in zip(probs, rowProbs)]
Martin@587
   290
Martin@587
   291
def gapCostsFromProbs(scale, probs):
Martin@587
   292
    delExistProb, insExistProb, delExtendProb, insExtendProb = probs
Martin@587
   293
    delExistCost = costFromProb(scale, delExistProb)
Martin@587
   294
    insExistCost = costFromProb(scale, insExistProb)
Martin@587
   295
    delExtendCost = costFromProb(scale, delExtendProb)
Martin@587
   296
    insExtendCost = costFromProb(scale, insExtendProb)
Martin@587
   297
    if delExtendCost == 0: delExtendCost = 1
Martin@587
   298
    if insExtendCost == 0: insExtendCost = 1
Martin@587
   299
    return delExistCost, insExistCost, delExtendCost, insExtendCost
Martin@587
   300
Martin@587
   301
def writeLine(out, *things):
Martin@587
   302
    out.write(" ".join(map(str, things)) + "\n")
Martin@587
   303
Martin@587
   304
def writeGapCosts(gapCosts, out):
Martin@587
   305
    delExistCost, insExistCost, delExtendCost, insExtendCost = gapCosts
Martin@587
   306
    writeLine(out, "#last -a", delExistCost)
Martin@587
   307
    writeLine(out, "#last -A", insExistCost)
Martin@587
   308
    writeLine(out, "#last -b", delExtendCost)
Martin@587
   309
    writeLine(out, "#last -B", insExtendCost)
Martin@587
   310
Martin@587
   311
def printGapCosts(gapCosts):
Martin@587
   312
    delExistCost, insExistCost, delExtendCost, insExtendCost = gapCosts
Martin@587
   313
    print "# delExistCost:", delExistCost
Martin@587
   314
    print "# insExistCost:", insExistCost
Martin@587
   315
    print "# delExtendCost:", delExtendCost
Martin@587
   316
    print "# insExtendCost:", insExtendCost
Martin@587
   317
    print
Martin@587
   318
Martin@587
   319
def tryToMakeChildProgramsFindable():
Martin@587
   320
    myDir = os.path.dirname(__file__)
Martin@637
   321
    srcDir = os.path.join(myDir, os.pardir, "src")
Martin@637
   322
    # put srcDir first, to avoid getting older versions of LAST:
Martin@637
   323
    os.environ["PATH"] = srcDir + os.pathsep + os.environ["PATH"]
Martin@587
   324
Martin@587
   325
def fixedLastalArgs(opts):
Martin@587
   326
    x = ["lastal", "-j7"]
Martin@587
   327
    if opts.D: x.append("-D" + opts.D)
Martin@587
   328
    if opts.E: x.append("-E" + opts.E)
Martin@636
   329
    if opts.s: x.append("-s" + opts.s)
Martin@633
   330
    if opts.S: x.append("-S" + opts.S)
Martin@853
   331
    if opts.C: x.append("-C" + opts.C)
Martin@587
   332
    if opts.T: x.append("-T" + opts.T)
Martin@636
   333
    if opts.m: x.append("-m" + opts.m)
Martin@694
   334
    if opts.P: x.append("-P" + opts.P)
Martin@587
   335
    if opts.Q: x.append("-Q" + opts.Q)
Martin@587
   336
    return x
Martin@587
   337
Martin@761
   338
def doTraining(opts, args):
Martin@587
   339
    tryToMakeChildProgramsFindable()
Martin@587
   340
    scaleIncrease = 20  # while training, up-scale the scores by this amount
Martin@587
   341
    x = fixedLastalArgs(opts)
Martin@587
   342
    if opts.r: x.append("-r" + opts.r)
Martin@587
   343
    if opts.q: x.append("-q" + opts.q)
Martin@587
   344
    if opts.p: x.append("-p" + opts.p)
Martin@587
   345
    if opts.a: x.append("-a" + opts.a)
Martin@587
   346
    if opts.b: x.append("-b" + opts.b)
Martin@587
   347
    if opts.A: x.append("-A" + opts.A)
Martin@587
   348
    if opts.B: x.append("-B" + opts.B)
Martin@587
   349
    x += args
Martin@587
   350
    y = ["last-split", "-n"]
Martin@587
   351
    p = subprocess.Popen(x, stdout=subprocess.PIPE)
Martin@587
   352
    q = subprocess.Popen(y, stdin=p.stdout, stdout=subprocess.PIPE)
Martin@587
   353
    externalScale = scaleFromHeader(q.stdout)
Martin@587
   354
    internalScale = externalScale * scaleIncrease
Martin@587
   355
    if opts.Q:
Martin@587
   356
        externalMatrix = scoreMatrixFromHeader(q.stdout)
Martin@587
   357
        internalMatrix = scaledMatrix(externalMatrix, scaleIncrease)
Martin@587
   358
    oldParameters = []
Martin@587
   359
Martin@587
   360
    print "# maximum percent identity:", opts.pid
Martin@587
   361
    print "# scale of score parameters:", externalScale
Martin@587
   362
    print "# scale used while training:", internalScale
Martin@587
   363
    print
Martin@587
   364
Martin@587
   365
    while True:
Martin@587
   366
        print "#", " ".join(x)
Martin@587
   367
        print
Martin@762
   368
        sys.stdout.flush()
Martin@623
   369
        matCounts, gapCounts = countsFromLastOutput(q.stdout, opts)
Martin@587
   370
        gapProbs = gapProbsFromCounts(gapCounts, opts)
Martin@587
   371
        gapCosts = gapCostsFromProbs(internalScale, gapProbs)
Martin@587
   372
        printGapCosts(gapCosts)
Martin@587
   373
        if opts.Q:
Martin@587
   374
            if gapCosts in oldParameters: break
Martin@587
   375
            oldParameters.append(gapCosts)
Martin@587
   376
        else:
Martin@587
   377
            matProbs = matProbsFromCounts(matCounts, opts)
Martin@587
   378
            matScores = matScoresFromProbs(internalScale, matProbs)
Martin@764
   379
            print "# score matrix (query letters = columns, reference letters = rows):"
Martin@763
   380
            writeScoreMatrix(sys.stdout, matScores, "# ")
Martin@763
   381
            print
Martin@587
   382
            parameters = gapCosts, matScores
Martin@587
   383
            if parameters in oldParameters: break
Martin@587
   384
            oldParameters.append(parameters)
Martin@587
   385
            internalMatrix = matrixWithLetters(matScores)
Martin@587
   386
        x = fixedLastalArgs(opts)
Martin@587
   387
        x.append("-p-")
Martin@587
   388
        x += args
Martin@587
   389
        p = subprocess.Popen(x, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
Martin@587
   390
        writeGapCosts(gapCosts, p.stdin)
Martin@763
   391
        writeMatrixWithLetters(p.stdin, internalMatrix, "")
Martin@587
   392
        p.stdin.close()
Martin@587
   393
        # in python2.6, the next line must come after p.stdin.close()
Martin@587
   394
        q = subprocess.Popen(y, stdin=p.stdout, stdout=subprocess.PIPE)
Martin@587
   395
Martin@587
   396
    gapCosts = gapCostsFromProbs(externalScale, gapProbs)
Martin@587
   397
    writeGapCosts(gapCosts, sys.stdout)
Martin@636
   398
    if opts.s: writeLine(sys.stdout, "#last -s", opts.s)
Martin@636
   399
    if opts.S: writeLine(sys.stdout, "#last -S", opts.S)
Martin@587
   400
    if not opts.Q:
Martin@587
   401
        matScores = matScoresFromProbs(externalScale, matProbs)
Martin@587
   402
        externalMatrix = matrixWithLetters(matScores)
Martin@764
   403
    print "# score matrix (query letters = columns, reference letters = rows):"
Martin@763
   404
    writeMatrixWithLetters(sys.stdout, externalMatrix, "")
Martin@587
   405
Martin@761
   406
def lastTrain(opts, args):
Martin@848
   407
    if opts.sample_number:
Martin@761
   408
        random.seed(math.pi)
Martin@761
   409
        refName = args[0]
Martin@761
   410
        queryFiles = args[1:]
Martin@761
   411
        try:
Martin@761
   412
            with tempfile.NamedTemporaryFile(delete=False) as f:
Martin@761
   413
                getSeqSample(opts, queryFiles, f)
Martin@761
   414
            doTraining(opts, [refName, f.name])
Martin@761
   415
        finally:
Martin@761
   416
            os.remove(f.name)
Martin@761
   417
    else:
Martin@761
   418
        doTraining(opts, args)
Martin@761
   419
Martin@587
   420
if __name__ == "__main__":
Martin@587
   421
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)  # avoid silly error message
Martin@761
   422
    usage = "%prog [options] lastdb-name sequence-file(s)"
Martin@587
   423
    description = "Try to find suitable score parameters for aligning the given sequences."
Martin@587
   424
    op = optparse.OptionParser(usage=usage, description=description)
Martin@587
   425
    og = optparse.OptionGroup(op, "Training options")
Martin@587
   426
    og.add_option("--revsym", action="store_true",
Martin@587
   427
                  help="force reverse-complement symmetry")
Martin@587
   428
    og.add_option("--matsym", action="store_true",
Martin@587
   429
                  help="force symmetric substitution matrix")
Martin@587
   430
    og.add_option("--gapsym", action="store_true",
Martin@587
   431
                  help="force insertion/deletion symmetry")
Martin@587
   432
    og.add_option("--pid", type="float", default=100, help=
Martin@587
   433
                  "skip alignments with > PID% identity (default: %default)")
Martin@848
   434
    og.add_option("--sample-number", type="int", default=500, metavar="N",
Martin@848
   435
                  help="number of random sequence samples (default: %default)")
Martin@848
   436
    og.add_option("--sample-length", type="int", default=2000, metavar="L",
Martin@848
   437
                  help="length of each sample (default: %default)")
Martin@587
   438
    op.add_option_group(og)
Martin@587
   439
    og = optparse.OptionGroup(op, "Initial parameter options")
Martin@587
   440
    og.add_option("-r", metavar="SCORE",
Martin@587
   441
                  help="match score (default: 6 if Q>0, else 5)")
Martin@587
   442
    og.add_option("-q", metavar="COST",
Martin@587
   443
                  help="mismatch cost (default: 18 if Q>0, else 5)")
Martin@587
   444
    og.add_option("-p", metavar="NAME", help="match/mismatch score matrix")
Martin@587
   445
    og.add_option("-a", metavar="COST",
Martin@587
   446
                  help="gap existence cost (default: 21 if Q>0, else 15)")
Martin@587
   447
    og.add_option("-b", metavar="COST",
Martin@587
   448
                  help="gap extension cost (default: 9 if Q>0, else 3)")
Martin@587
   449
    og.add_option("-A", metavar="COST", help="insertion existence cost")
Martin@587
   450
    og.add_option("-B", metavar="COST", help="insertion extension cost")
Martin@587
   451
    op.add_option_group(og)
Martin@587
   452
    og = optparse.OptionGroup(op, "Alignment options")
Martin@587
   453
    og.add_option("-D", metavar="LENGTH",
Martin@587
   454
                  help="query letters per random alignment (default: 1e6)")
Martin@587
   455
    og.add_option("-E", metavar="EG2",
Martin@587
   456
                  help="maximum expected alignments per square giga")
Martin@636
   457
    og.add_option("-s", metavar="STRAND", help=
Martin@636
   458
                  "0=reverse, 1=forward, 2=both (default: 2 if DNA, else 1)")
Martin@633
   459
    og.add_option("-S", metavar="NUMBER", default="1", help=
Martin@633
   460
                  "score matrix applies to forward strand of: " +
Martin@633
   461
                  "0=reference, 1=query (default: %default)")
Martin@853
   462
    og.add_option("-C", metavar="COUNT", help=
Martin@853
   463
                  "omit gapless alignments in COUNT others with > score-per-length")
Martin@587
   464
    og.add_option("-T", metavar="NUMBER",
Martin@587
   465
                  help="type of alignment: 0=local, 1=overlap (default: 0)")
Martin@636
   466
    og.add_option("-m", metavar="COUNT", help=
Martin@636
   467
                  "maximum initial matches per query position (default: 10)")
Martin@694
   468
    og.add_option("-P", metavar="THREADS",
Martin@694
   469
                  help="number of parallel threads")
Martin@587
   470
    og.add_option("-Q", metavar="NUMBER",
Martin@587
   471
                  help="input format: 0=fasta, 1=fastq-sanger")
Martin@587
   472
    op.add_option_group(og)
Martin@587
   473
    (opts, args) = op.parse_args()
Martin@587
   474
    if len(args) < 2: op.error("I need a lastdb index and query sequences")
Martin@587
   475
    if not opts.p and (not opts.Q or opts.Q == "0"):
Martin@587
   476
        if not opts.r: opts.r = "5"
Martin@587
   477
        if not opts.q: opts.q = "5"
Martin@587
   478
        if not opts.a: opts.a = "15"
Martin@587
   479
        if not opts.b: opts.b = "3"
Martin@587
   480
Martin@587
   481
    try: lastTrain(opts, args)
Martin@587
   482
    except KeyboardInterrupt: pass  # avoid silly error message
Martin@587
   483
    except Exception, e:
Martin@587
   484
        prog = os.path.basename(sys.argv[0])
Martin@587
   485
        sys.exit(prog + ": error: " + str(e))