Make lastal startup even faster
authorMartin C. Frith
Mon Apr 01 10:03:47 2019 +0900 (5 months ago)
changeset 9799b416e560dc3
parent 978 d429589d040e
child 980 27c26371226b
Make lastal startup even faster
src/LambdaCalculator.cc
src/LambdaCalculator.hh
src/TantanMasker.cc
src/lastal.cc
src/makefile
src/mcf_substitution_matrix_stats.cc
src/mcf_substitution_matrix_stats.hh
     1.1 --- a/src/LambdaCalculator.cc	Mon Apr 01 08:29:58 2019 +0900
     1.2 +++ b/src/LambdaCalculator.cc	Mon Apr 01 10:03:47 2019 +0900
     1.3 @@ -2,26 +2,13 @@
     1.4  
     1.5  #include "LambdaCalculator.hh"
     1.6  #include "cbrc_linalg.hh"
     1.7 -#include <vector>
     1.8  #include <cassert>
     1.9 -#include <cstdio>  // sprintf
    1.10 -#include <cstdlib>
    1.11  #include <cfloat>
    1.12  #include <cmath>
    1.13  #include <numeric>
    1.14  //#include <iostream>
    1.15  using namespace std;
    1.16  
    1.17 -static double roundToFewDigits(double x)
    1.18 -{
    1.19 -  // This rounding fixes some inaccuracies, e.g. for DNA with a simple
    1.20 -  // match/mismatch matrix it's likely to make all the probabilities
    1.21 -  // exactly 0.25, as they should be.
    1.22 -  char buffer[32];
    1.23 -  sprintf(buffer, "%g", x);
    1.24 -  return atof(buffer);
    1.25 -}
    1.26 -
    1.27  static bool calculate_inv_sum(double **matrix, int alpha_size, double tau, double* inv_sum, double **tmpMat, double *tmpVec)
    1.28  {
    1.29    for (int i=0; i<alpha_size; i++)
     2.1 --- a/src/LambdaCalculator.hh	Mon Apr 01 08:29:58 2019 +0900
     2.2 +++ b/src/LambdaCalculator.hh	Mon Apr 01 10:03:47 2019 +0900
     2.3 @@ -14,11 +14,23 @@
     2.4  #define LAMBDA_CALCULATOR_HH
     2.5  
     2.6  #include <vector>
     2.7 +#include <stdio.h>  // sprintf
     2.8 +#include <stdlib.h>  // atof
     2.9  
    2.10  namespace cbrc{
    2.11  
    2.12  typedef const int *const_int_ptr;
    2.13  
    2.14 +inline double roundToFewDigits(double x)
    2.15 +{
    2.16 +  // This rounding fixes some inaccuracies, e.g. for DNA with a simple
    2.17 +  // match/mismatch matrix it's likely to make all the probabilities
    2.18 +  // exactly 0.25, as they should be.
    2.19 +  char buffer[32];
    2.20 +  sprintf(buffer, "%g", x);
    2.21 +  return atof(buffer);
    2.22 +}
    2.23 +
    2.24  class LambdaCalculator{
    2.25   public:
    2.26    LambdaCalculator() { setBad(); }
     3.1 --- a/src/TantanMasker.cc	Mon Apr 01 08:29:58 2019 +0900
     3.2 +++ b/src/TantanMasker.cc	Mon Apr 01 10:03:47 2019 +0900
     3.3 @@ -2,7 +2,7 @@
     3.4  
     3.5  #include "TantanMasker.hh"
     3.6  #include "ScoreMatrix.hh"
     3.7 -#include "LambdaCalculator.hh"
     3.8 +#include "mcf_substitution_matrix_stats.hh"
     3.9  #include <algorithm>
    3.10  #include <cassert>
    3.11  #include <cmath>
    3.12 @@ -20,6 +20,7 @@
    3.13  
    3.14    unsigned alphabetSize;
    3.15    ScoreMatrix s;
    3.16 +  const char *matrixName = "";
    3.17  
    3.18    if (isATrichDna) {
    3.19      repeatProb = 0.01;
    3.20 @@ -34,7 +35,8 @@
    3.21    } else if (isProtein) {
    3.22      maxRepeatOffset = 50;
    3.23      alphabetSize = 20;
    3.24 -    s.fromString(ScoreMatrix::stringFromName("BL62"));
    3.25 +    matrixName = "BL62";
    3.26 +    s.fromString(ScoreMatrix::stringFromName(matrixName));
    3.27    } else {
    3.28      alphabetSize = alphabet.size();
    3.29      s.setMatchMismatch(1, 1, alphabet);
    3.30 @@ -45,13 +47,13 @@
    3.31    int *scoreMat[scoreMatrixRowSize];
    3.32    std::copy(s.caseInsensitive, s.caseInsensitive + alphabetSize, scoreMat);
    3.33  
    3.34 -  LambdaCalculator c;
    3.35 -  c.calculate(scoreMat, alphabetSize);
    3.36 -  assert(!c.isBad());
    3.37 +  mcf::SubstitutionMatrixStats stats;
    3.38 +  stats.calcUnbiased(matrixName, scoreMat, alphabetSize);
    3.39 +  assert(!stats.isBad());
    3.40  
    3.41    for (int i = 0; i < scoreMatrixRowSize; ++i)
    3.42      for (int j = 0; j < scoreMatrixRowSize; ++j)
    3.43 -      probMatrix[i][j] = std::exp(c.lambda() * s.caseInsensitive[i][j]);
    3.44 +      probMatrix[i][j] = std::exp(stats.lambda() * s.caseInsensitive[i][j]);
    3.45  }
    3.46  
    3.47  }
     4.1 --- a/src/lastal.cc	Mon Apr 01 08:29:58 2019 +0900
     4.2 +++ b/src/lastal.cc	Mon Apr 01 10:03:47 2019 +0900
     4.3 @@ -95,7 +95,8 @@
     4.4  }
     4.5  
     4.6  // Meaningless for PSSMs, unless they have the same scale as the score matrix
     4.7 -static void calculateSubstitutionScoreMatrixStatistics() {
     4.8 +static void
     4.9 +calculateSubstitutionScoreMatrixStatistics(const std::string &matrixName) {
    4.10    int *scoreMat[scoreMatrixRowSize];
    4.11    // the case-sensitivity of the matrix makes no difference here
    4.12    std::copy(scoreMatrix.caseSensitive,
    4.13 @@ -103,8 +104,9 @@
    4.14  
    4.15    mcf::SubstitutionMatrixStats &stats = fwdMatrices.stats;
    4.16    if (args.temperature < 0) {
    4.17 +    const char *canonicalMatrixName = ScoreMatrix::canonicalName(matrixName);
    4.18      LOG("calculating matrix probabilities...");
    4.19 -    stats.calcUnbiased(scoreMat, alph.size);
    4.20 +    stats.calcUnbiased(canonicalMatrixName, scoreMat, alph.size);
    4.21      if (stats.isBad()) {
    4.22        static const char msg[] =
    4.23  	"can't calculate probabilities: maybe the mismatch costs are too weak";
    4.24 @@ -152,7 +154,7 @@
    4.25  
    4.26    scoreMatrix.init(alph.encode);
    4.27    if (args.outputType > 0) {
    4.28 -    calculateSubstitutionScoreMatrixStatistics();
    4.29 +    calculateSubstitutionScoreMatrixStatistics(matrixName);
    4.30      const mcf::SubstitutionMatrixStats &stats = fwdMatrices.stats;
    4.31      if (!stats.isBad()) {
    4.32        scoreMatrix.addAmbiguousScores(alph.letters == alph.dna,
     5.1 --- a/src/makefile	Mon Apr 01 08:29:58 2019 +0900
     5.2 +++ b/src/makefile	Mon Apr 01 10:03:47 2019 +0900
     5.3 @@ -16,7 +16,8 @@
     5.4  
     5.5  indexObj0 = Alphabet.o CyclicSubsetSeed.o LambdaCalculator.o	\
     5.6  ScoreMatrix.o SubsetMinimizerFinder.o TantanMasker.o fileMap.o	\
     5.7 -cbrc_linalg.o tantan.o LastdbArguments.o
     5.8 +cbrc_linalg.o mcf_substitution_matrix_stats.o tantan.o		\
     5.9 +LastdbArguments.o
    5.10  
    5.11  indexObj4 = MultiSequence.o MultiSequenceQual.o SubsetSuffixArray.o	\
    5.12  SubsetSuffixArraySort.o lastdb.o
    5.13 @@ -201,7 +202,7 @@
    5.14  SubsetSuffixArraySort.o SubsetSuffixArraySort.o8: SubsetSuffixArraySort.cc SubsetSuffixArray.hh \
    5.15   CyclicSubsetSeed.hh VectorOrMmap.hh Mmap.hh fileMap.hh stringify.hh
    5.16  TantanMasker.o TantanMasker.o8: TantanMasker.cc TantanMasker.hh ScoreMatrixRow.hh \
    5.17 - tantan.hh ScoreMatrix.hh LambdaCalculator.hh
    5.18 + tantan.hh ScoreMatrix.hh mcf_substitution_matrix_stats.hh
    5.19  TwoQualityScoreMatrix.o TwoQualityScoreMatrix.o8: TwoQualityScoreMatrix.cc \
    5.20   TwoQualityScoreMatrix.hh mcf_substitution_matrix_stats.hh \
    5.21   ScoreMatrixRow.hh qualityScoreUtil.hh stringify.hh
     6.1 --- a/src/mcf_substitution_matrix_stats.cc	Mon Apr 01 08:29:58 2019 +0900
     6.2 +++ b/src/mcf_substitution_matrix_stats.cc	Mon Apr 01 10:03:47 2019 +0900
     6.3 @@ -8,6 +8,18 @@
     6.4  #include <stdexcept>
     6.5  #include <assert.h>
     6.6  #include <math.h>
     6.7 +#include <string.h>  // strcmp
     6.8 +
     6.9 +#define COUNTOF(a) (sizeof (a) / sizeof *(a))
    6.10 +
    6.11 +const struct {
    6.12 +  const char *name;
    6.13 +  double scale;
    6.14 +} substitutionMatrixScales[] = {
    6.15 +  {"BL62", 3.0861133577701141},
    6.16 +  {"BL80", 2.8253295934982616},
    6.17 +  {"PAM30", 2.8848596855435313},
    6.18 +};
    6.19  
    6.20  static double checkedExp(double lambda, int score) {
    6.21    double y = exp(lambda * score);
    6.22 @@ -38,7 +50,7 @@
    6.23    assert(sum > 0);
    6.24    double bias = 1 / sum;
    6.25    for (unsigned i = 0; i < size; ++i) {
    6.26 -    probs[i] *= bias;
    6.27 +    probs[i] = cbrc::roundToFewDigits(probs[i] * bias);
    6.28    }
    6.29    return bias;
    6.30  }
    6.31 @@ -70,8 +82,16 @@
    6.32    mBias = (bias1 + bias2) / 2;
    6.33  }
    6.34  
    6.35 -void SubstitutionMatrixStats::calcUnbiased(const const_int_ptr *scoreMatrix,
    6.36 +void SubstitutionMatrixStats::calcUnbiased(const char *matrixName,
    6.37 +					   const const_int_ptr *scoreMatrix,
    6.38  					   unsigned size) {
    6.39 +  for (size_t i = 0; i < COUNTOF(substitutionMatrixScales); ++i) {
    6.40 +    if (strcmp(matrixName, substitutionMatrixScales[i].name) == 0) {
    6.41 +      calcFromScale(scoreMatrix, size, substitutionMatrixScales[i].scale);
    6.42 +      return;
    6.43 +    }
    6.44 +  }
    6.45 +
    6.46    cbrc::LambdaCalculator c;
    6.47    c.calculate(scoreMatrix, size);
    6.48    mLambda = c.lambda();
     7.1 --- a/src/mcf_substitution_matrix_stats.hh	Mon Apr 01 08:29:58 2019 +0900
     7.2 +++ b/src/mcf_substitution_matrix_stats.hh	Mon Apr 01 10:03:47 2019 +0900
     7.3 @@ -48,8 +48,10 @@
     7.4    void calcFromScale(const const_int_ptr *scoreMatrix, unsigned size,
     7.5  		     double scale);
     7.6  
     7.7 -  // set bias = 1 and calculate the other values
     7.8 -  void calcUnbiased(const const_int_ptr *scoreMatrix, unsigned size);
     7.9 +  // Set bias = 1 and calculate the other values.
    7.10 +  // matrixName is used only to lookup pre-calculated cases.
    7.11 +  void calcUnbiased(const char *matrixName,
    7.12 +		    const const_int_ptr *scoreMatrix, unsigned size);
    7.13  
    7.14    // Set it to the parameters for aligning the reverse DNA strands.
    7.15    // j = complement[i] means the j-th letter is the complement of the