Allow score matrix with letter frequencies
authorMartin C. Frith
Mon Oct 07 20:30:57 2019 +0900 (6 weeks ago)
changeset 98950bcc51ca59b
parent 988 ca6613dd1f06
child 990 4011a21f96ad
Allow score matrix with letter frequencies
src/ScoreMatrix.cc
src/ScoreMatrix.hh
src/lastal.cc
src/mcf_substitution_matrix_stats.cc
src/mcf_substitution_matrix_stats.hh
     1.1 --- a/src/ScoreMatrix.cc	Mon Oct 07 12:46:14 2019 +0900
     1.2 +++ b/src/ScoreMatrix.cc	Mon Oct 07 20:30:57 2019 +0900
     1.3 @@ -130,6 +130,39 @@
     1.4    }
     1.5  }
     1.6  
     1.7 +static void calcSomeLetterProbs(const std::string &symbols,
     1.8 +				const std::vector<double> &freqs,
     1.9 +				double probs[], unsigned alphabetSizeForProbs,
    1.10 +				const uchar symbolToIndex[]) {
    1.11 +  double sum = 0;
    1.12 +  for (size_t i = 0; i < freqs.size(); ++i) {
    1.13 +    unsigned j = s2i(symbolToIndex, symbols[i]);
    1.14 +    if (j < alphabetSizeForProbs) {
    1.15 +      if (freqs[i] < 0) throw Err("bad score matrix: letter frequency < 0");
    1.16 +      sum += freqs[i];
    1.17 +    }
    1.18 +  }
    1.19 +  if (sum <= 0) throw Err("bad score matrix: no positive letter frequencies");
    1.20 +
    1.21 +  std::fill_n(probs, alphabetSizeForProbs, 0.0);
    1.22 +
    1.23 +  for (size_t i = 0; i < freqs.size(); ++i) {
    1.24 +    unsigned j = s2i(symbolToIndex, symbols[i]);
    1.25 +    if (j < alphabetSizeForProbs) {
    1.26 +      probs[j] = freqs[i] / sum;
    1.27 +    }
    1.28 +  }
    1.29 +}
    1.30 +
    1.31 +void ScoreMatrix::calcLetterProbs(double rowProbs[], double colProbs[],
    1.32 +				  unsigned alphabetSizeForProbs,
    1.33 +				  const uchar symbolToIndex[]) const {
    1.34 +  calcSomeLetterProbs(rowSymbols, rowFrequencies, rowProbs,
    1.35 +		      alphabetSizeForProbs, symbolToIndex);
    1.36 +  calcSomeLetterProbs(colSymbols, colFrequencies, colProbs,
    1.37 +		      alphabetSizeForProbs, symbolToIndex);
    1.38 +}
    1.39 +
    1.40  void ScoreMatrix::writeCommented( std::ostream& stream ) const{
    1.41    int colWidth = colSymbols.size() < 20 ? 3 : 2;
    1.42  
    1.43 @@ -152,35 +185,63 @@
    1.44    std::string tmpRowSymbols;
    1.45    std::string tmpColSymbols;
    1.46    std::vector< std::vector<int> > tmpCells;
    1.47 -  std::string line;
    1.48 +  std::vector<double> tmpRowFreqs;
    1.49 +  std::vector<double> tmpColFreqs;
    1.50 +  std::string line, word;
    1.51  
    1.52 -  while (getline(stream, line)) {
    1.53 +  while (stream) {
    1.54 +    if (!getline(stream, line)) {
    1.55 +      if (stream.eof() && !stream.bad()) stream.clear(std::ios::eofbit);
    1.56 +      break;
    1.57 +    }
    1.58      std::istringstream iss(line);
    1.59 -    char c;
    1.60 -    if (!(iss >> c)) continue;  // skip blank lines
    1.61 +    if (!(iss >> word)) continue;  // skip blank lines
    1.62      if (tmpColSymbols.empty()) {
    1.63 -      if (c == '#') continue;  // skip comment lines at the top
    1.64 +      if (word[0] == '#') continue;  // skip comment lines at the top
    1.65        do {
    1.66 -	tmpColSymbols.push_back(c);
    1.67 -      } while (iss >> c);
    1.68 +	if (word.size() > 1) stream.setstate(std::ios::failbit);
    1.69 +	tmpColSymbols.push_back(word[0]);
    1.70 +      } while (iss >> word);
    1.71      } else {
    1.72 -      tmpRowSymbols.push_back(c);
    1.73 -      tmpCells.resize( tmpCells.size() + 1 );
    1.74 +      std::vector<int> row;
    1.75        int score;
    1.76 -      while( iss >> score ){
    1.77 -	tmpCells.back().push_back(score);
    1.78 +      double freq;
    1.79 +      for (size_t i = 0; i < tmpColSymbols.size(); ++i) {
    1.80 +	iss >> score;
    1.81 +	row.push_back(score);
    1.82        }
    1.83 -      if (tmpCells.back().size() != tmpColSymbols.size()) {
    1.84 -	throw Err("bad score matrix");
    1.85 +      if (word.size() == 1 && iss) {
    1.86 +	tmpRowSymbols.push_back(word[0]);
    1.87 +	tmpCells.push_back(row);
    1.88 +	if (iss >> freq) {
    1.89 +	  tmpRowFreqs.push_back(freq);
    1.90 +	  if (tmpRowFreqs.size() < tmpCells.size()) {
    1.91 +	    stream.setstate(std::ios::failbit);
    1.92 +	  }
    1.93 +	}
    1.94 +      } else {
    1.95 +	std::istringstream iss2(line);
    1.96 +	while (iss2 >> freq) {
    1.97 +	  tmpColFreqs.push_back(freq);
    1.98 +	}
    1.99 +	if (tmpColFreqs.size() > tmpColSymbols.size() || tmpColFreqs.empty()) {
   1.100 +	  stream.setstate(std::ios::failbit);
   1.101 +	}
   1.102 +	break;
   1.103        }
   1.104      }
   1.105    }
   1.106  
   1.107 -  if (stream.eof() && !stream.bad() && !tmpCells.empty()) {
   1.108 -    stream.clear();
   1.109 +  if (tmpCells.empty() || tmpRowFreqs.empty() != tmpColFreqs.empty()) {
   1.110 +    stream.setstate(std::ios::failbit);
   1.111 +  }
   1.112 +
   1.113 +  if (stream) {
   1.114      m.rowSymbols.swap(tmpRowSymbols);
   1.115      m.colSymbols.swap(tmpColSymbols);
   1.116      m.cells.swap(tmpCells);
   1.117 +    m.rowFrequencies.swap(tmpRowFreqs);
   1.118 +    m.colFrequencies.swap(tmpColFreqs);
   1.119    }
   1.120  
   1.121    return stream;
     2.1 --- a/src/ScoreMatrix.hh	Mon Oct 07 12:46:14 2019 +0900
     2.2 +++ b/src/ScoreMatrix.hh	Mon Oct 07 20:30:57 2019 +0900
     2.3 @@ -44,9 +44,19 @@
     2.4  
     2.5    void writeCommented( std::ostream& stream ) const;  // write preceded by "#"
     2.6  
     2.7 +  bool hasLetterFrequencies() const
     2.8 +  { return rowFrequencies.size() && colFrequencies.size(); }
     2.9 +
    2.10 +  // store normalized letter frequencies in rowProbs and colProbs
    2.11 +  void calcLetterProbs(double *rowProbs, double *colProbs,
    2.12 +		       unsigned alphabetSizeForProbs,
    2.13 +		       const uchar symbolToIndex[]) const;
    2.14 +
    2.15    std::string rowSymbols;  // row headings (letters)
    2.16    std::string colSymbols;  // column headings (letters)
    2.17    std::vector< std::vector<int> > cells;  // scores
    2.18 +  std::vector<double> rowFrequencies;
    2.19 +  std::vector<double> colFrequencies;
    2.20    int caseSensitive[ALPHABET_CAPACITY][ALPHABET_CAPACITY];
    2.21    int caseInsensitive[ALPHABET_CAPACITY][ALPHABET_CAPACITY];
    2.22    int minScore;
     3.1 --- a/src/lastal.cc	Mon Oct 07 12:46:14 2019 +0900
     3.2 +++ b/src/lastal.cc	Mon Oct 07 20:30:57 2019 +0900
     3.3 @@ -103,19 +103,31 @@
     3.4  	    scoreMatrix.caseSensitive + alph.size, scoreMat);
     3.5  
     3.6    mcf::SubstitutionMatrixStats &stats = fwdMatrices.stats;
     3.7 -  if (args.temperature < 0) {
     3.8 -    const char *canonicalMatrixName = ScoreMatrix::canonicalName(matrixName);
     3.9 -    LOG("calculating matrix probabilities...");
    3.10 -    stats.calcUnbiased(canonicalMatrixName, scoreMat, alph.size);
    3.11 -    if (stats.isBad()) {
    3.12 -      static const char msg[] =
    3.13 -	"can't calculate probabilities: maybe the mismatch costs are too weak";
    3.14 -      if (isUseQuality(args.inputFormat) || args.outputType > 3) ERR(msg);
    3.15 -      LOG(msg);
    3.16 -      return;
    3.17 +  if (scoreMatrix.hasLetterFrequencies()) {
    3.18 +    double *p1 = stats.sizedLetterProbs1(alph.size);
    3.19 +    double *p2 = stats.sizedLetterProbs2(alph.size);
    3.20 +    scoreMatrix.calcLetterProbs(p1, p2, alph.size, alph.encode);
    3.21 +    if (args.temperature < 0) {
    3.22 +      ERR("not implemented");
    3.23 +    } else {
    3.24 +      stats.calcBias(scoreMat, alph.size, args.temperature);
    3.25 +      LOG("score matrix bias=" << stats.bias());
    3.26      }
    3.27    } else {
    3.28 -    stats.calcFromScale(scoreMat, alph.size, args.temperature);
    3.29 +    if (args.temperature < 0) {
    3.30 +      const char *canonicalMatrixName = ScoreMatrix::canonicalName(matrixName);
    3.31 +      LOG("calculating matrix probabilities...");
    3.32 +      stats.calcUnbiased(canonicalMatrixName, scoreMat, alph.size);
    3.33 +      if (stats.isBad()) {
    3.34 +	static const char msg[] = "can't calculate probabilities: "
    3.35 +	  "maybe the mismatch costs are too weak";
    3.36 +	if (isUseQuality(args.inputFormat) || args.outputType > 3) ERR(msg);
    3.37 +	LOG(msg);
    3.38 +	return;
    3.39 +      }
    3.40 +    } else {
    3.41 +      stats.calcFromScale(scoreMat, alph.size, args.temperature);
    3.42 +    }
    3.43    }
    3.44  
    3.45    revMatrices.stats = stats;
     4.1 --- a/src/mcf_substitution_matrix_stats.cc	Mon Oct 07 12:46:14 2019 +0900
     4.2 +++ b/src/mcf_substitution_matrix_stats.cc	Mon Oct 07 20:30:57 2019 +0900
     4.3 @@ -82,6 +82,20 @@
     4.4    mBias = (bias1 + bias2) / 2;
     4.5  }
     4.6  
     4.7 +void SubstitutionMatrixStats::calcBias(const const_int_ptr *scoreMatrix,
     4.8 +				       unsigned size, double scale) {
     4.9 +  assert(scale > 0);
    4.10 +  mLambda = 1 / scale;
    4.11 +
    4.12 +  mBias = 0;
    4.13 +  for (unsigned i = 0; i < size; ++i) {
    4.14 +    for (unsigned j = 0; j < size; ++j) {
    4.15 +      double e = checkedExp(mLambda, scoreMatrix[i][j]);
    4.16 +      mBias += mLetterProbs1[i] * mLetterProbs2[j] * e;
    4.17 +    }
    4.18 +  }
    4.19 +}
    4.20 +
    4.21  void SubstitutionMatrixStats::calcUnbiased(const char *matrixName,
    4.22  					   const const_int_ptr *scoreMatrix,
    4.23  					   unsigned size) {
     5.1 --- a/src/mcf_substitution_matrix_stats.hh	Mon Oct 07 12:46:14 2019 +0900
     5.2 +++ b/src/mcf_substitution_matrix_stats.hh	Mon Oct 07 20:30:57 2019 +0900
     5.3 @@ -44,6 +44,18 @@
     5.4    const double *letterProbs2() const
     5.5    { return mLetterProbs2.empty() ? 0 : &mLetterProbs2[0]; }
     5.6  
     5.7 +  // set up an array for writing row-letter probabilities into
     5.8 +  double *sizedLetterProbs1(unsigned size)
     5.9 +  { mLetterProbs1.resize(size); return size ? &mLetterProbs1[0] : 0; }
    5.10 +
    5.11 +  // set up an array for writing column-letter probabilities into
    5.12 +  double *sizedLetterProbs2(unsigned size)
    5.13 +  { mLetterProbs2.resize(size); return size ? &mLetterProbs2[0] : 0; }
    5.14 +
    5.15 +  // calculate the bias, given the scale and the letter probabilities
    5.16 +  // (which need not be homogeneous)
    5.17 +  void calcBias(const const_int_ptr *scoreMatrix, unsigned size, double scale);
    5.18 +
    5.19    // set scale/lambda and calculate the other values
    5.20    void calcFromScale(const const_int_ptr *scoreMatrix, unsigned size,
    5.21  		     double scale);