Refactor
authorMartin C. Frith
Tue Feb 26 23:56:16 2019 +0900 (3 months ago)
changeset 9716189dce68b46
parent 970 2073177a7920
child 972 ae880ec00591
Refactor
src/lastal.cc
     1.1 --- a/src/lastal.cc	Tue Feb 26 17:39:11 2019 +0900
     1.2 +++ b/src/lastal.cc	Tue Feb 26 23:56:16 2019 +0900
     1.3 @@ -52,6 +52,18 @@
     1.4    std::vector<AlignmentText> textAlns;
     1.5  };
     1.6  
     1.7 +struct SubstitutionMatrices {
     1.8 +  int scores[scoreMatrixRowSize][scoreMatrixRowSize];
     1.9 +  int scoresMasked[scoreMatrixRowSize][scoreMatrixRowSize];
    1.10 +  mcf::SubstitutionMatrixStats stats;
    1.11 +  OneQualityScoreMatrix oneQual;
    1.12 +  OneQualityScoreMatrix oneQualMasked;
    1.13 +  OneQualityExpMatrix oneQualExp;
    1.14 +  QualityPssmMaker maker;
    1.15 +  TwoQualityScoreMatrix twoQual;
    1.16 +  TwoQualityScoreMatrix twoQualMasked;
    1.17 +};
    1.18 +
    1.19  namespace {
    1.20    typedef unsigned long long countT;
    1.21  
    1.22 @@ -63,29 +75,15 @@
    1.23    const unsigned maxNumOfIndexes = 16;
    1.24    SubsetSuffixArray suffixArrays[maxNumOfIndexes];
    1.25    ScoreMatrix scoreMatrix;
    1.26 -  int scoreMatrixRev[scoreMatrixRowSize][scoreMatrixRowSize];
    1.27 -  int scoreMatrixRevMasked[scoreMatrixRowSize][scoreMatrixRowSize];
    1.28 +  SubstitutionMatrices fwdMatrices;
    1.29 +  SubstitutionMatrices revMatrices;
    1.30    GeneralizedAffineGapCosts gapCosts;
    1.31    std::vector<LastAligner> aligners;
    1.32 -  mcf::SubstitutionMatrixStats scoreMatrixStats;
    1.33 -  mcf::SubstitutionMatrixStats scoreMatrixStatsRev;
    1.34    LastEvaluer evaluer;
    1.35    MultiSequence query;  // sequence that hasn't been indexed by lastdb
    1.36    MultiSequence text;  // sequence that has been indexed by lastdb
    1.37    std::vector< std::vector<countT> > matchCounts;  // used if outputType == 0
    1.38 -  OneQualityScoreMatrix oneQualityMatrix;
    1.39 -  OneQualityScoreMatrix oneQualityMatrixMasked;
    1.40 -  OneQualityScoreMatrix oneQualityMatrixRev;
    1.41 -  OneQualityScoreMatrix oneQualityMatrixRevMasked;
    1.42 -  OneQualityExpMatrix oneQualityExpMatrix;
    1.43 -  OneQualityExpMatrix oneQualityExpMatrixRev;
    1.44 -  QualityPssmMaker qualityPssmMaker;
    1.45 -  QualityPssmMaker qualityPssmMakerRev;
    1.46    sequenceFormat::Enum referenceFormat = sequenceFormat::fasta;
    1.47 -  TwoQualityScoreMatrix twoQualityMatrix;
    1.48 -  TwoQualityScoreMatrix twoQualityMatrixMasked;
    1.49 -  TwoQualityScoreMatrix twoQualityMatrixRev;
    1.50 -  TwoQualityScoreMatrix twoQualityMatrixRevMasked;
    1.51    int minScoreGapless;
    1.52    int isCaseSensitiveSeeds = -1;  // initialize it to an "error" value
    1.53    unsigned numOfIndexes = 1;  // assume this value, if unspecified
    1.54 @@ -104,10 +102,11 @@
    1.55    std::copy(scoreMatrix.caseSensitive,
    1.56  	    scoreMatrix.caseSensitive + alph.size, scoreMat);
    1.57  
    1.58 +  mcf::SubstitutionMatrixStats &stats = fwdMatrices.stats;
    1.59    if (args.temperature < 0) {
    1.60      LOG("calculating matrix probabilities...");
    1.61 -    scoreMatrixStats.calcUnbiased(scoreMat, alph.size);
    1.62 -    if (scoreMatrixStats.isBad()) {
    1.63 +    stats.calcUnbiased(scoreMat, alph.size);
    1.64 +    if (stats.isBad()) {
    1.65        static const char msg[] =
    1.66  	"can't calculate probabilities: maybe the mismatch costs are too weak";
    1.67        if (isUseQuality(args.inputFormat) || args.outputType > 3) ERR(msg);
    1.68 @@ -115,16 +114,16 @@
    1.69        return;
    1.70      }
    1.71    } else {
    1.72 -    scoreMatrixStats.calcFromScale(scoreMat, alph.size, args.temperature);
    1.73 +    stats.calcFromScale(scoreMat, alph.size, args.temperature);
    1.74    }
    1.75  
    1.76 -  scoreMatrixStatsRev = scoreMatrixStats;
    1.77 -  scoreMatrixStatsRev.flipDnaStrands(alph.complement);
    1.78 +  revMatrices.stats = stats;
    1.79 +  revMatrices.stats.flipDnaStrands(alph.complement);
    1.80  
    1.81 -  const double *p1 = scoreMatrixStats.letterProbs1();
    1.82 -  const double *p2 = scoreMatrixStats.letterProbs2();
    1.83 +  const double *p1 = stats.letterProbs1();
    1.84 +  const double *p2 = stats.letterProbs2();
    1.85  
    1.86 -  LOG( "matrix lambda=" << scoreMatrixStats.lambda() );
    1.87 +  LOG( "matrix lambda=" << stats.lambda() );
    1.88    LOG( "matrix letter frequencies (upper=reference, lower=query):" );
    1.89    if( args.verbosity > 0 ){
    1.90      std::cerr << std::left;
    1.91 @@ -155,10 +154,11 @@
    1.92    scoreMatrix.init(alph.encode);
    1.93    if (args.outputType > 0) {
    1.94      calculateSubstitutionScoreMatrixStatistics();
    1.95 -    if (alph.letters == alph.dna && !scoreMatrixStats.isBad()) {
    1.96 -      scoreMatrix.addAmbiguousScores(alph.encode, scoreMatrixStats.lambda(),
    1.97 -				     scoreMatrixStats.letterProbs1(),
    1.98 -				     scoreMatrixStats.letterProbs2());
    1.99 +    const mcf::SubstitutionMatrixStats &stats = fwdMatrices.stats;
   1.100 +    if (alph.letters == alph.dna && !stats.isBad()) {
   1.101 +      scoreMatrix.addAmbiguousScores(alph.encode, stats.lambda(),
   1.102 +				     stats.letterProbs1(),
   1.103 +				     stats.letterProbs2());
   1.104        scoreMatrix.init(alph.encode);
   1.105      }
   1.106    }
   1.107 @@ -172,8 +172,8 @@
   1.108    // maxScore = std::max(args.maxDropGapped, args.maxDropFinal) + 1;
   1.109  
   1.110    if( args.isQueryStrandMatrix && args.strand != 1 ){
   1.111 -    complementMatrix( scoreMatrix.caseInsensitive, scoreMatrixRev );
   1.112 -    complementMatrix( scoreMatrix.caseSensitive, scoreMatrixRevMasked );
   1.113 +    complementMatrix( scoreMatrix.caseInsensitive, revMatrices.scores );
   1.114 +    complementMatrix( scoreMatrix.caseSensitive, revMatrices.scoresMasked );
   1.115    }
   1.116  }
   1.117  
   1.118 @@ -186,7 +186,7 @@
   1.119  		   "quality data not used for DNA-versus-protein alignment" );
   1.120  
   1.121    const ScoreMatrixRow* m = scoreMatrix.caseSensitive;  // case isn't relevant
   1.122 -  const ScoreMatrixRow* mRev = scoreMatrixRev;
   1.123 +  const ScoreMatrixRow* mRev = revMatrices.scores;
   1.124    bool isMatchMismatch = (args.matrixFile.empty() && args.matchScore > 0);
   1.125    bool isPhred1 = isPhred( referenceFormat );
   1.126    int offset1 = qualityOffset( referenceFormat );
   1.127 @@ -198,61 +198,62 @@
   1.128      if( isUseFastq( args.inputFormat ) ){
   1.129        LOG( "calculating per-quality scores..." );
   1.130        if( args.maskLowercase > 0 )
   1.131 -	oneQualityMatrixMasked.init(m, alph.size, scoreMatrixStats,
   1.132 -				    isPhred2, offset2, toUnmasked, true);
   1.133 +	fwdMatrices.oneQualMasked.init(m, alph.size, fwdMatrices.stats,
   1.134 +				       isPhred2, offset2, toUnmasked, true);
   1.135        if( args.maskLowercase < 3 )
   1.136 -	oneQualityMatrix.init(m, alph.size, scoreMatrixStats,
   1.137 -			      isPhred2, offset2, toUnmasked, false);
   1.138 +	fwdMatrices.oneQual.init(m, alph.size, fwdMatrices.stats,
   1.139 +				 isPhred2, offset2, toUnmasked, false);
   1.140        const OneQualityScoreMatrix &q = (args.maskLowercase < 3) ?
   1.141 -	oneQualityMatrix : oneQualityMatrixMasked;
   1.142 +	fwdMatrices.oneQual : fwdMatrices.oneQualMasked;
   1.143        if( args.outputType > 3 )
   1.144 -        oneQualityExpMatrix.init( q, args.temperature );
   1.145 +        fwdMatrices.oneQualExp.init( q, args.temperature );
   1.146        if( args.verbosity > 0 )
   1.147  	writeOneQualityScoreMatrix( q, alph.letters.c_str(),
   1.148  				    offset2, std::cerr );
   1.149        if( args.isQueryStrandMatrix && args.strand != 1 ){
   1.150  	if( args.maskLowercase > 0 )
   1.151 -	  oneQualityMatrixRevMasked.init(mRev, alph.size, scoreMatrixStatsRev,
   1.152 +	  revMatrices.oneQualMasked.init(mRev, alph.size, revMatrices.stats,
   1.153  					 isPhred2, offset2, toUnmasked, true);
   1.154  	if( args.maskLowercase < 3 )
   1.155 -	  oneQualityMatrixRev.init(mRev, alph.size, scoreMatrixStatsRev,
   1.156 +	  revMatrices.oneQual.init(mRev, alph.size, revMatrices.stats,
   1.157  				   isPhred2, offset2, toUnmasked, false);
   1.158  	const OneQualityScoreMatrix &qRev = (args.maskLowercase < 3) ?
   1.159 -	  oneQualityMatrixRev : oneQualityMatrixRevMasked;
   1.160 +	  revMatrices.oneQual : revMatrices.oneQualMasked;
   1.161  	if( args.outputType > 3 )
   1.162 -	  oneQualityExpMatrixRev.init( qRev, args.temperature );
   1.163 +	  revMatrices.oneQualExp.init( qRev, args.temperature );
   1.164        }
   1.165      }
   1.166      if( isUseQuality(args.inputFormat) ){
   1.167 -      double lambda = scoreMatrixStats.lambda();
   1.168 -      qualityPssmMaker.init(m, alph.size, lambda, isMatchMismatch,
   1.169 -			    args.matchScore, -args.mismatchCost,
   1.170 -			    isPhred2, offset2, toUnmasked);
   1.171 +      fwdMatrices.maker.init(m, alph.size, fwdMatrices.stats.lambda(),
   1.172 +			     isMatchMismatch,
   1.173 +			     args.matchScore, -args.mismatchCost,
   1.174 +			     isPhred2, offset2, toUnmasked);
   1.175        if( args.isQueryStrandMatrix && args.strand != 1 )
   1.176 -	qualityPssmMakerRev.init(mRev, alph.size, lambda, isMatchMismatch,
   1.177 -				 args.matchScore, -args.mismatchCost,
   1.178 -				 isPhred2, offset2, toUnmasked);
   1.179 +	revMatrices.maker.init(mRev, alph.size, revMatrices.stats.lambda(),
   1.180 +			       isMatchMismatch,
   1.181 +			       args.matchScore, -args.mismatchCost,
   1.182 +			       isPhred2, offset2, toUnmasked);
   1.183      }
   1.184    }
   1.185    else{
   1.186      if( isUseFastq( args.inputFormat ) ){
   1.187        if( args.maskLowercase > 0 )
   1.188 -	twoQualityMatrixMasked.init(m, scoreMatrixStats,
   1.189 -				    isPhred1, offset1, isPhred2, offset2,
   1.190 -				    toUnmasked, true, isMatchMismatch);
   1.191 +	fwdMatrices.twoQualMasked.init(m, fwdMatrices.stats,
   1.192 +				       isPhred1, offset1, isPhred2, offset2,
   1.193 +				       toUnmasked, true, isMatchMismatch);
   1.194        if( args.maskLowercase < 3 )
   1.195 -	twoQualityMatrix.init(m, scoreMatrixStats,
   1.196 -			      isPhred1, offset1, isPhred2, offset2,
   1.197 -			      toUnmasked, false, isMatchMismatch);
   1.198 +	fwdMatrices.twoQual.init(m, fwdMatrices.stats,
   1.199 +				 isPhred1, offset1, isPhred2, offset2,
   1.200 +				 toUnmasked, false, isMatchMismatch);
   1.201        if( args.outputType > 3 )
   1.202          ERR( "fastq-versus-fastq column probabilities not implemented" );
   1.203        if( args.isQueryStrandMatrix && args.strand != 1 ){
   1.204  	if( args.maskLowercase > 0 )
   1.205 -	  twoQualityMatrixRevMasked.init(mRev, scoreMatrixStatsRev,
   1.206 +	  revMatrices.twoQualMasked.init(mRev, revMatrices.stats,
   1.207  					 isPhred1, offset1, isPhred2, offset2,
   1.208  					 toUnmasked, true, isMatchMismatch);
   1.209  	if( args.maskLowercase < 3 )
   1.210 -	  twoQualityMatrixRev.init(mRev, scoreMatrixStatsRev,
   1.211 +	  revMatrices.twoQual.init(mRev, revMatrices.stats,
   1.212  				   isPhred1, offset1, isPhred2, offset2,
   1.213  				   toUnmasked, false, isMatchMismatch);
   1.214        }
   1.215 @@ -267,7 +268,8 @@
   1.216  // Calculate statistical parameters for the alignment scoring scheme
   1.217  static void calculateScoreStatistics(const std::string& matrixName,
   1.218  				     countT refLetters) {
   1.219 -  if (scoreMatrixStats.isBad()) return;
   1.220 +  const mcf::SubstitutionMatrixStats &stats = fwdMatrices.stats;
   1.221 +  if (stats.isBad()) return;
   1.222    const char *canonicalMatrixName = ScoreMatrix::canonicalName( matrixName );
   1.223    if (args.temperature > 0 && !matrixName.empty()) canonicalMatrixName = " ";
   1.224    bool isGapped = (args.outputType > 1);
   1.225 @@ -276,8 +278,7 @@
   1.226    try{
   1.227      evaluer.init( canonicalMatrixName, args.matchScore, args.mismatchCost,
   1.228                    alph.letters.c_str(), scoreMatrix.caseSensitive,
   1.229 -		  scoreMatrixStats.letterProbs1(),
   1.230 -		  scoreMatrixStats.letterProbs2(), isGapped,
   1.231 +		  stats.letterProbs1(), stats.letterProbs2(), isGapped,
   1.232                    gapCosts.delExist, gapCosts.delExtend,
   1.233                    gapCosts.insExist, gapCosts.insExtend,
   1.234                    args.frameshiftCost, geneticCode, isStandardGeneticCode,
   1.235 @@ -394,35 +395,35 @@
   1.236    if (strand == '+' || !args.isQueryStrandMatrix)
   1.237      return isMask ? scoreMatrix.caseSensitive : scoreMatrix.caseInsensitive;
   1.238    else
   1.239 -    return isMask ? scoreMatrixRevMasked : scoreMatrixRev;
   1.240 +    return isMask ? revMatrices.scoresMasked : revMatrices.scores;
   1.241  }
   1.242  
   1.243  const OneQualityScoreMatrix &getOneQualityMatrix(char strand, bool isMask) {
   1.244    if (strand == '+' || !args.isQueryStrandMatrix)
   1.245 -    return isMask ? oneQualityMatrixMasked : oneQualityMatrix;
   1.246 +    return isMask ? fwdMatrices.oneQualMasked : fwdMatrices.oneQual;
   1.247    else
   1.248 -    return isMask ? oneQualityMatrixRevMasked : oneQualityMatrixRev;
   1.249 +    return isMask ? revMatrices.oneQualMasked : revMatrices.oneQual;
   1.250  }
   1.251  
   1.252  const TwoQualityScoreMatrix &getTwoQualityMatrix(char strand, bool isMask) {
   1.253    if (strand == '+' || !args.isQueryStrandMatrix)
   1.254 -    return isMask ? twoQualityMatrixMasked : twoQualityMatrix;
   1.255 +    return isMask ? fwdMatrices.twoQualMasked : fwdMatrices.twoQual;
   1.256    else
   1.257 -    return isMask ? twoQualityMatrixRevMasked : twoQualityMatrixRev;
   1.258 +    return isMask ? revMatrices.twoQualMasked : revMatrices.twoQual;
   1.259  }
   1.260  
   1.261  const OneQualityExpMatrix &getOneQualityExpMatrix(char strand) {
   1.262    if (strand == '+' || !args.isQueryStrandMatrix)
   1.263 -    return oneQualityExpMatrix;
   1.264 +    return fwdMatrices.oneQualExp;
   1.265    else
   1.266 -    return oneQualityExpMatrixRev;
   1.267 +    return revMatrices.oneQualExp;
   1.268  }
   1.269  
   1.270  const QualityPssmMaker &getQualityPssmMaker(char strand) {
   1.271    if (strand == '+' || !args.isQueryStrandMatrix)
   1.272 -    return qualityPssmMaker;
   1.273 +    return fwdMatrices.maker;
   1.274    else
   1.275 -    return qualityPssmMakerRev;
   1.276 +    return revMatrices.maker;
   1.277  }
   1.278  
   1.279  static const uchar *getQueryQual(size_t queryNum) {
   1.280 @@ -738,12 +739,12 @@
   1.281        if (args.outputType == 7) {
   1.282  	centroid.setScoreMatrix(dis.m, args.temperature);
   1.283  	bool isFwd = (strand == '+' || !args.isQueryStrandMatrix);
   1.284 -	const mcf::SubstitutionMatrixStats &stats =
   1.285 -	  isFwd ? scoreMatrixStats : scoreMatrixStatsRev;
   1.286 +	const SubstitutionMatrices &matrices =
   1.287 +	  isFwd ? fwdMatrices : revMatrices;
   1.288  	centroid.setLetterProbsPerPosition(alph.size, queryLen, dis.b, dis.j,
   1.289  					   isUseFastq(args.inputFormat),
   1.290 -					   qualityPssmMaker.qualToProbRight(),
   1.291 -					   stats.letterProbs2(),
   1.292 +					   matrices.maker.qualToProbRight(),
   1.293 +					   matrices.stats.letterProbs2(),
   1.294  					   alph.numbersToUppercase);
   1.295        }
   1.296        centroid.setPssm(dis.p, queryLen, args.temperature,
   1.297 @@ -1214,7 +1215,7 @@
   1.298  
   1.299    if( args.outputType > 0 ) calculateScoreStatistics( matrixName, refLetters );
   1.300    int minScore = evaluer.minScore( args.maxEvalue, 1e18 );
   1.301 -  args.setDefaultsFromMatrix( scoreMatrixStats.lambda(), minScore );
   1.302 +  args.setDefaultsFromMatrix( fwdMatrices.stats.lambda(), minScore );
   1.303    minScoreGapless = calcMinScoreGapless(refLetters);
   1.304    if( !isMultiVolume ) args.minScoreGapless = minScoreGapless;
   1.305    if( args.outputType > 0 ) makeQualityScorers();