Make last-split much faster
authorMartin C. Frith
Fri Mar 13 07:38:42 2020 +0900 (2 months ago)
changeset 10566f4ec587fe25
parent 1055 3887935792ab
child 1057 ff73ca674104
Make last-split much faster
src/split/cbrc_split_aligner.cc
src/split/cbrc_split_aligner.hh
     1.1 --- a/src/split/cbrc_split_aligner.cc	Fri Mar 13 06:22:27 2020 +0900
     1.2 +++ b/src/split/cbrc_split_aligner.cc	Fri Mar 13 07:38:42 2020 +0900
     1.3 @@ -346,7 +346,7 @@
     1.4      long scoreFromJump = maxScore + restartScore;
     1.5      for (const unsigned *x = inplayAlnBeg; x < inplayAlnEnd; ++x) {
     1.6        size_t ij = matrixRowOrigins[*x] + j;
     1.7 -      long s = std::max(scoreFromJump, Vmat[ij] + Dmat[ij]) + Amat[ij];
     1.8 +      long s = std::max(scoreFromJump, Vmat[ij] + Smat[ij*2]) + Smat[ij*2+1];
     1.9        Vmat[ij + 1] = s;
    1.10        maxScore = std::max(maxScore, s);
    1.11      }
    1.12 @@ -382,9 +382,9 @@
    1.13  	      s = std::max(s,
    1.14  			   scoreFromSplice(i, j, oldNumInplay, oldInplayPos));
    1.15  	    s += spliceEndScore(ij);
    1.16 -	    s = std::max(s, Vmat[ij] + Dmat[ij]);
    1.17 +	    s = std::max(s, Vmat[ij] + Smat[ij*2]);
    1.18  	    if (alns[i].qstart == j && s < 0) s = 0;
    1.19 -	    s += Amat[ij];
    1.20 +	    s += Smat[ij*2+1];
    1.21  
    1.22  	    Vmat[ij + 1] = s;
    1.23  	    sMax = std::max(sMax, s + spliceBegScore(ij + 1));
    1.24 @@ -435,7 +435,7 @@
    1.25    for (;;) {
    1.26      --j;
    1.27      size_t ij = matrixRowOrigins[i] + j;
    1.28 -    long score = Vmat[ij + 1] - Amat[ij];
    1.29 +    long score = Vmat[ij + 1] - Smat[ij*2+1];
    1.30      if (restartProb <= 0 && alns[i].qstart == j && score == 0) {
    1.31        queryBegs.push_back(j);
    1.32        return;
    1.33 @@ -447,7 +447,7 @@
    1.34      // makes some other kinds of boundary less clean.  What's the best
    1.35      // procedure for tied scores?
    1.36  
    1.37 -    bool isStay = (score == Vmat[ij] + Dmat[ij]);
    1.38 +    bool isStay = (score == Vmat[ij] + Smat[ij*2]);
    1.39      if (isStay && alns[i].isForwardStrand()) continue;
    1.40  
    1.41      long s = score - spliceEndScore(ij);
    1.42 @@ -475,8 +475,8 @@
    1.43    unsigned i = alnNum;
    1.44    for (unsigned j = queryBeg; j < queryEnd; ++j) {
    1.45      size_t ij = matrixRowOrigins[i] + j;
    1.46 -    score += Amat[ij];
    1.47 -    if (j > queryBeg) score += Dmat[ij];
    1.48 +    score += Smat[ij*2+1];
    1.49 +    if (j > queryBeg) score += Smat[ij*2];
    1.50    }
    1.51    return score;
    1.52  }
    1.53 @@ -571,7 +571,8 @@
    1.54      double pSum = 0.0;
    1.55      for (unsigned *x = inplayAlnBeg; x < inplayAlnEnd; ++x) {
    1.56        size_t ij = matrixRowOrigins[*x] + j;
    1.57 -      double p = (probFromJump + Fmat[ij] * Dexp[ij]) * Aexp[ij] * rescale;
    1.58 +      double p =
    1.59 +	(probFromJump + Fmat[ij] * Sexp[ij*2]) * Sexp[ij*2+1] * rescale;
    1.60        Fmat[ij + 1] = p;
    1.61        pSum += p;
    1.62      }
    1.63 @@ -611,9 +612,9 @@
    1.64  	    if (splicePrior > 0.0)
    1.65  	      p += probFromSpliceF(i, j, oldNumInplay, oldInplayPos);
    1.66  	    p *= spliceEndProb(ij);
    1.67 -	    p += Fmat[ij] * Dexp[ij];
    1.68 +	    p += Fmat[ij] * Sexp[ij*2];
    1.69  	    if (alns[i].qstart == j) p += begprob;
    1.70 -	    p = p * Aexp[ij] * rescale;
    1.71 +	    p = p * Sexp[ij*2+1] * rescale;
    1.72  
    1.73  	    Fmat[ij + 1] = p;
    1.74  	    if (alns[i].qend == j+1) zF += p;
    1.75 @@ -656,7 +657,7 @@
    1.76      double pSum = 0.0;
    1.77      for (const unsigned *x = inplayAlnBeg; x < inplayAlnEnd; ++x) {
    1.78        size_t ij = matrixRowOrigins[*x] + j;
    1.79 -      double p = (sumOfProbs + Bmat[ij] * Dexp[ij]) * Aexp[ij - 1] * rescale;
    1.80 +      double p = (sumOfProbs + Bmat[ij] * Sexp[ij*2]) * Sexp[ij*2-1] * rescale;
    1.81        Bmat[ij - 1] = p;
    1.82        pSum += p;
    1.83      }
    1.84 @@ -691,9 +692,9 @@
    1.85  	    if (splicePrior > 0.0)
    1.86  	      p += probFromSpliceB(i, j, oldNumInplay, oldInplayPos);
    1.87  	    p *= spliceBegProb(ij);
    1.88 -	    p += Bmat[ij] * Dexp[ij];
    1.89 +	    p += Bmat[ij] * Sexp[ij*2];
    1.90  	    if (alns[i].qend == j) p += endprob;
    1.91 -	    p = p * Aexp[ij - 1] * rescale;
    1.92 +	    p = p * Sexp[ij*2-1] * rescale;
    1.93  
    1.94  	    Bmat[ij - 1] = p;
    1.95  	    //if (alns[i].qstart == j-1) zB += p;
    1.96 @@ -713,10 +714,10 @@
    1.97    for (unsigned pos = alnBeg; pos < alnEnd; ++pos) {
    1.98      size_t ij = matrixRowOrigins[i] + j;
    1.99      if (alns[i].qalign[pos] == '-') {
   1.100 -      double value = Fmat[ij] * Bmat[ij] * Dexp[ij] * cell(rescales, j);
   1.101 +      double value = Fmat[ij] * Bmat[ij] * Sexp[ij*2] * cell(rescales, j);
   1.102        output.push_back(value);
   1.103      } else {
   1.104 -      double value = Fmat[ij + 1] * Bmat[ij] / Aexp[ij];
   1.105 +      double value = Fmat[ij + 1] * Bmat[ij] / Sexp[ij*2+1];
   1.106        if (value != value) value = 0.0;
   1.107        output.push_back(value);
   1.108        j++;
   1.109 @@ -726,9 +727,9 @@
   1.110  }
   1.111  
   1.112  // The next routine represents affine gap scores in a cunning way.
   1.113 -// Amat holds scores at query bases, and at every base that is aligned
   1.114 +// Aij holds scores at query bases, and at every base that is aligned
   1.115  // to a gap it gets a score of insExistenceScore + insExtensionScore.
   1.116 -// Dmat holds scores between query bases, and between every pair of
   1.117 +// Dij holds scores between query bases, and between every pair of
   1.118  // bases that are both aligned to gaps it gets a score of
   1.119  // -insExistenceScore.  This produces suitable affine gap scores, even
   1.120  // if we jump from one alignment to another in the middle of a gap.
   1.121 @@ -740,18 +741,17 @@
   1.122    const UnsplitAlignment& a = alns[i];
   1.123    const size_t origin = matrixRowOrigins[i];
   1.124  
   1.125 -  int *AmatB = &Amat[origin + dpBeg(i)];
   1.126 -  int *DmatB = &Dmat[origin + dpBeg(i)];
   1.127 -  int *AmatS = &Amat[origin + a.qstart];
   1.128 -  int *AmatE = &Amat[origin + dpEnd(i)];
   1.129 +  int *matBeg = &Smat[(origin + dpBeg(i)) * 2];
   1.130 +  int *alnBeg = &Smat[(origin + a.qstart) * 2];
   1.131 +  int *matEnd = &Smat[(origin + dpEnd(i)) * 2];
   1.132  
   1.133    int delScore = 0;
   1.134    int insCompensationScore = 0;
   1.135  
   1.136    // treat any query letters before the alignment as insertions:
   1.137 -  while (AmatB < AmatS) {
   1.138 -    *AmatB++ = firstInsScore;
   1.139 -    *DmatB++ = delScore + insCompensationScore;
   1.140 +  while (matBeg < alnBeg) {
   1.141 +    *matBeg++ = delScore + insCompensationScore;
   1.142 +    *matBeg++ = firstInsScore;
   1.143      delScore = 0;
   1.144      insCompensationScore = tweenInsScore;
   1.145    }
   1.146 @@ -765,8 +765,8 @@
   1.147      unsigned char y = *qAlign++;
   1.148      int q = qQual ? (*qQual++ - qualityOffset) : (numQualCodes - 1);
   1.149      if (x == '-') {  // gap in reference sequence: insertion
   1.150 -      *AmatB++ = firstInsScore;
   1.151 -      *DmatB++ = delScore + insCompensationScore;
   1.152 +      *matBeg++ = delScore + insCompensationScore;
   1.153 +      *matBeg++ = firstInsScore;
   1.154        delScore = 0;
   1.155        insCompensationScore = tweenInsScore;
   1.156      } else if (y == '-') {  // gap in query sequence: deletion
   1.157 @@ -776,8 +776,8 @@
   1.158      } else {
   1.159        assert(q >= 0);
   1.160        if (q >= numQualCodes) q = numQualCodes - 1;
   1.161 -      *AmatB++ = score_mat[x % 64][y % 64][q];
   1.162 -      *DmatB++ = delScore;
   1.163 +      *matBeg++ = delScore;
   1.164 +      *matBeg++ = score_mat[x % 64][y % 64][q];
   1.165        delScore = 0;
   1.166        insCompensationScore = 0;
   1.167      }
   1.168 @@ -786,14 +786,14 @@
   1.169    }
   1.170  
   1.171    // treat any query letters after the alignment as insertions:
   1.172 -  while (AmatB < AmatE) {
   1.173 -    *AmatB++ = firstInsScore;
   1.174 -    *DmatB++ = delScore + insCompensationScore;
   1.175 +  while (matBeg < matEnd) {
   1.176 +    *matBeg++ = delScore + insCompensationScore;
   1.177 +    *matBeg++ = firstInsScore;
   1.178      delScore = 0;
   1.179      insCompensationScore = tweenInsScore;
   1.180    }
   1.181  
   1.182 -  *DmatB++ = delScore;
   1.183 +  *matBeg++ = delScore;
   1.184  }
   1.185  
   1.186  void SplitAligner::initRbegsAndEnds() {
   1.187 @@ -1072,10 +1072,8 @@
   1.188  }
   1.189  
   1.190  void SplitAligner::initMatricesForOneQuery() {
   1.191 -  resizeMatrix(Amat);
   1.192 -  resizeMatrix(Dmat);
   1.193 -  resizeMatrix(Aexp);
   1.194 -  resizeMatrix(Dexp);
   1.195 +  resizeDoubleMatrix(Smat);
   1.196 +  resizeDoubleMatrix(Sexp);
   1.197  
   1.198    for (unsigned i = 0; i < numAlns; i++) calcBaseScores(i);
   1.199  
   1.200 @@ -1091,8 +1089,7 @@
   1.201      for (unsigned i = 0; i < numAlns; ++i) initSpliceSignals(i);
   1.202    }
   1.203  
   1.204 -  transform(Amat.begin(), Amat.end(), Aexp.begin(), scaledExp);
   1.205 -  transform(Dmat.begin(), Dmat.end(), Dexp.begin(), scaledExp);
   1.206 +  transform(Smat.begin(), Smat.end(), Sexp.begin(), scaledExp);
   1.207    // if x/scale < about -745, then exp(x/scale) will be exactly 0.0
   1.208  }
   1.209  
     2.1 --- a/src/split/cbrc_split_aligner.hh	Fri Mar 13 06:22:27 2020 +0900
     2.2 +++ b/src/split/cbrc_split_aligner.hh	Fri Mar 13 07:38:42 2020 +0900
     2.3 @@ -152,12 +152,22 @@
     2.4      std::vector<unsigned> dpBegs;  // dynamic programming begin coords
     2.5      std::vector<unsigned> dpEnds;  // dynamic programming end coords
     2.6      std::vector<size_t> matrixRowOrigins;  // layout of ragged matrices
     2.7 -    std::vector<int> Amat;  // scores at query bases, for each candidate
     2.8 -    std::vector<int> Dmat;  // scores between query bases, for each candidate
     2.9 +
    2.10 +    std::vector<int> Smat;
    2.11 +    // Smat holds position-specific substitution, insertion, and
    2.12 +    // deletion scores for the candidate alignments of one query
    2.13 +    // sequence to a genome.  These scores, for each candidate
    2.14 +    // alignment i, are called Aij and Dij in [Frith&Kawaguchi 2015].
    2.15 +    // Aij holds scores at query bases, in Smat[i][1,3,5,...,2n-1].
    2.16 +    // Dij holds scores between query bases, in Smat[i][0,2,4,...,2n].
    2.17 +
    2.18      std::vector<long> Vmat;  // DP matrix for Viterbi algorithm
    2.19      std::vector<long> Vvec;  // DP vector for Viterbi algorithm
    2.20 -    std::vector<double> Aexp;
    2.21 -    std::vector<double> Dexp;
    2.22 +
    2.23 +    std::vector<double> Sexp;
    2.24 +    // Sexp holds exp(Smat / t): these values are called A'ij and D'ij
    2.25 +    // in [Frith&Kawaguchi 2015].
    2.26 +
    2.27      std::vector<double> Fmat;  // DP matrix for Forward algorithm
    2.28      std::vector<double> Bmat;  // DP matrix for Backward algorithm
    2.29      std::vector<double> rescales;  // the usual scaling for numerical stability
    2.30 @@ -285,11 +295,19 @@
    2.31      void resizeMatrix(T& m) const {
    2.32        // This reserves size for a ragged matrix, which is actually
    2.33        // stored in a flat vector.  There are numAlns rows, and row i
    2.34 -      // has dpEnd(i) - dpBeg(i) + 1 cells.  (The final cell per row
    2.35 -      // is used in some matrices but not others.)
    2.36 +      // has dpEnd(i) - dpBeg(i) + 1 cells.
    2.37        m.resize(cellsPerDpMatrix());
    2.38      }
    2.39  
    2.40 +    template<typename T>
    2.41 +    void resizeDoubleMatrix(T& m) const {
    2.42 +      // This reserves size for Smat and Sexp, which contain 2
    2.43 +      // interpolated matrices (Aij and Dij).  The final cell per row
    2.44 +      // is never used, because there's one less Aij than Dij per
    2.45 +      // candidate alignment.
    2.46 +      m.resize(cellsPerDpMatrix() * 2);
    2.47 +    }
    2.48 +
    2.49      double probFromSpliceF(unsigned i, unsigned j, unsigned oldNumInplay,
    2.50  			   unsigned& oldInplayPos) const;
    2.51