Omit non-alphabet letters for DB seq lengths
authorMartin C. Frith
Tue Oct 08 19:21:37 2019 +0900 (6 weeks ago)
changeset 9917f841cc97f20
parent 990 4011a21f96ad
child 992 e57e547b8235
Omit non-alphabet letters for DB seq lengths
src/lastal.cc
src/lastdb.cc
     1.1 --- a/src/lastal.cc	Tue Oct 08 17:19:10 2019 +0900
     1.2 +++ b/src/lastal.cc	Tue Oct 08 19:21:37 2019 +0900
     1.3 @@ -312,8 +312,8 @@
     1.4      if( word == "version" ) iss >> version;
     1.5      if( word == "alphabet" ) iss >> alph;
     1.6      if( word == "numofsequences" ) iss >> refSequences;
     1.7 -    if( word == "maxsequencelength" ) iss >> refMaxSeqLen;
     1.8      if( word == "numofletters" ) iss >> refLetters;
     1.9 +    if( word == "maxsequenceletters" ) iss >> refMaxSeqLen;
    1.10      if( word == "maxunsortedinterval" ) iss >> minSeedLimit;
    1.11      if( word == "keeplowercase" ) iss >> isKeepRefLowercase;
    1.12      if( word == "tantansetting" ) iss >> refTantanSetting;
     2.1 --- a/src/lastdb.cc	Tue Oct 08 17:19:10 2019 +0900
     2.2 +++ b/src/lastdb.cc	Tue Oct 08 19:21:37 2019 +0900
     2.3 @@ -109,8 +109,8 @@
     2.4      << '\n';
     2.5    f << "alphabet=" << alph << '\n';
     2.6    f << "numofsequences=" << sequenceCount << '\n';
     2.7 -  f << "maxsequencelength=" << maxSeqLen << '\n';
     2.8    f << "numofletters=" << letterTotal << '\n';
     2.9 +  f << "maxsequenceletters=" << maxSeqLen << '\n';
    2.10    f << "letterfreqs=";
    2.11    for( unsigned i = 0; i < letterCounts.size(); ++i ){
    2.12      if( i > 0 ) f << ' ';
    2.13 @@ -179,7 +179,7 @@
    2.14  // Make one database volume, from one batch of sequences
    2.15  void makeVolume(std::vector<CyclicSubsetSeed>& seeds, MultiSequence& multi,
    2.16  		const LastdbArguments& args, const Alphabet& alph,
    2.17 -		std::vector<countT>& letterCounts, size_t& maxSeqLenSeen,
    2.18 +		std::vector<countT>& letterCountsSeen, size_t& maxSeqLenSeen,
    2.19  		const TantanMasker& masker, unsigned numOfThreads,
    2.20  		const std::string& seedText, const std::string& baseName) {
    2.21    size_t numOfIndexes = seeds.size();
    2.22 @@ -187,18 +187,20 @@
    2.23    size_t textLength = multi.seqBeg(numOfSequences);
    2.24    const uchar* seq = multi.seqReader();
    2.25  
    2.26 -  std::vector<countT> letterCountsInThisVolume(alph.size);
    2.27 -  alph.count(seq, seq + textLength, &letterCountsInThisVolume[0]);
    2.28 -  for (unsigned c = 0; c < alph.size; ++c) {
    2.29 -    letterCounts[c] += letterCountsInThisVolume[c];
    2.30 +  std::vector<countT> letterCounts(alph.size);
    2.31 +  size_t maxSeqLen = 0;
    2.32 +  size_t letterTotal = 0;
    2.33 +  for (size_t i = 0; i < numOfSequences; ++i) {
    2.34 +    alph.count(seq + multi.seqBeg(i), seq + multi.seqEnd(i), &letterCounts[0]);
    2.35 +    size_t t = accumulate(letterCounts.begin(), letterCounts.end(), countT(0));
    2.36 +    maxSeqLen = std::max(maxSeqLen, t - letterTotal);
    2.37 +    letterTotal = t;
    2.38    }
    2.39  
    2.40 -  size_t maxSeqLenInThisVolume = 0;
    2.41 -  for (size_t i = 0; i < numOfSequences; ++i) {
    2.42 -    size_t s = multi.seqLen(i);
    2.43 -    maxSeqLenInThisVolume = std::max(maxSeqLenInThisVolume, s);
    2.44 +  for (unsigned c = 0; c < alph.size; ++c) {
    2.45 +    letterCountsSeen[c] += letterCounts[c];
    2.46    }
    2.47 -  maxSeqLenSeen = std::max(maxSeqLenSeen, maxSeqLenInThisVolume);
    2.48 +  maxSeqLenSeen = std::max(maxSeqLenSeen, maxSeqLen);
    2.49  
    2.50    if (args.isCountsOnly) return;
    2.51  
    2.52 @@ -209,7 +211,7 @@
    2.53  
    2.54    LOG( "writing..." );
    2.55    writePrjFile( baseName + ".prj", args, alph, numOfSequences,
    2.56 -		maxSeqLenInThisVolume, letterCountsInThisVolume,
    2.57 +		maxSeqLen, letterCounts,
    2.58  		multi.qualsPerLetter(), -1, numOfIndexes, seedText );
    2.59    multi.toFiles( baseName );
    2.60