From 104684e6430c6d0d3dbaffbe4032237a0b469065 Mon Sep 17 00:00:00 2001 From: Adam Azarchs Date: Wed, 23 Mar 2022 14:01:51 -0700 Subject: [PATCH 1/9] Minor modernizations and optimizations in STAR code. Use std::unique_ptr in more places. Replace c-style arrays with c++ std::array. Use defaulted constructors/destructors where appropriate. Make it easier for the compiler to vectorize comparisons in the inner loop of stitchAlignToTranscript. --- src/lib.rs | 2 +- star-sys/STAR/source/InOutStreams.h | 2 +- star-sys/STAR/source/IncludeDefine.h | 5 +- star-sys/STAR/source/ParameterInfo.h | 37 ++++---- star-sys/STAR/source/Parameters.cpp | 2 +- .../source/ReadAlign_peOverlapMergeMap.cpp | 2 +- .../STAR/source/SoloReadFeature_record.cpp | 3 +- star-sys/STAR/source/TimeFunctions.h | 4 +- star-sys/STAR/source/Transcript.cpp | 18 ++-- star-sys/STAR/source/Transcript.h | 63 ++++++------- .../STAR/source/Transcript_alignScore.cpp | 4 +- .../STAR/source/Transcript_generateCigarP.cpp | 2 +- .../source/Transcript_variationAdjust.cpp | 2 +- .../STAR/source/stitchAlignToTranscript.cpp | 88 ++++++++++++------- star-sys/build.rs | 2 +- 15 files changed, 125 insertions(+), 111 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 09687c3..882ca8a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -60,7 +60,7 @@ impl StarReference { // recover stray CStrings to prevent leaked memory nvec.into_iter().for_each(|ptr| unsafe { - CString::from_raw(ptr); + drop(CString::from_raw(ptr)); }); let inner = InnerStarReference { diff --git a/star-sys/STAR/source/InOutStreams.h b/star-sys/STAR/source/InOutStreams.h index 5d3e845..e530975 100644 --- a/star-sys/STAR/source/InOutStreams.h +++ b/star-sys/STAR/source/InOutStreams.h @@ -8,7 +8,7 @@ template > class NullBuf: public std::basic_streambuf { - inline typename traits::int_type overflow(typename traits::int_type c) { + inline typename traits::int_type overflow(typename traits::int_type c) override { return traits::not_eof(c); } }; diff --git a/star-sys/STAR/source/IncludeDefine.h b/star-sys/STAR/source/IncludeDefine.h index 25a7f0f..5298b3c 100644 --- a/star-sys/STAR/source/IncludeDefine.h +++ b/star-sys/STAR/source/IncludeDefine.h @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include @@ -18,9 +17,9 @@ #include #include #include -#include +#include #include -#include +#include #include "VERSION" diff --git a/star-sys/STAR/source/ParameterInfo.h b/star-sys/STAR/source/ParameterInfo.h index 89fd3c9..d7b873e 100644 --- a/star-sys/STAR/source/ParameterInfo.h +++ b/star-sys/STAR/source/ParameterInfo.h @@ -3,12 +3,17 @@ class ParameterInfoBase { public: - string nameString; //string that identifies parameter + const char* nameString; //string that identifies parameter int inputLevel; //where the parameter was defined int inputLevelAllowed; //at which inpurt level parameter definition is allowed virtual void inputValues(istringstream &streamIn) =0; friend std::ostream& operator<< (std::ostream& o, ParameterInfoBase const& b); - virtual ~ParameterInfoBase() {}; + ParameterInfoBase(const char* nameString, int inputLevel, int inputLevelAllowed); + virtual ~ParameterInfoBase() = default; + ParameterInfoBase(const ParameterInfoBase&) = default; + ParameterInfoBase &operator=(const ParameterInfoBase&) = delete; + ParameterInfoBase(ParameterInfoBase&&) = default; + ParameterInfoBase& operator=(ParameterInfoBase&&) = default; protected: virtual void printValues(std::ostream& o) const = 0; }; @@ -68,20 +73,16 @@ class ParameterInfoScalar : public ParameterInfoBase { parameterType *value; vector allowedValues; - ParameterInfoScalar(int inputLevelIn, int inputLevelAllowedIn, string nameStringIn, parameterType* valueIn) { - nameString=nameStringIn; - inputLevel=inputLevelIn; - inputLevelAllowed=inputLevelAllowedIn; - value=valueIn; - }; + ParameterInfoScalar(int inputLevelIn, int inputLevelAllowedIn, const char* nameStringIn, parameterType* valueIn) + : ParameterInfoBase(nameStringIn, inputLevelIn, inputLevelAllowedIn), + value(valueIn) {}; - void inputValues(istringstream &streamIn) { + void inputValues(istringstream &streamIn) override { *value=inputOneValue (streamIn); }; - ~ParameterInfoScalar() {}; protected: - virtual void printValues(std::ostream& outStr) const { + void printValues(std::ostream& outStr) const override { printOneValue(value, outStr); }; @@ -93,14 +94,11 @@ class ParameterInfoVector : public ParameterInfoBase { vector *value; vector allowedValues; - ParameterInfoVector(int inputLevelIn, int inputLevelAllowedIn, string nameStringIn, vector *valueIn) { - nameString=nameStringIn; - inputLevel=inputLevelIn; - inputLevelAllowed=inputLevelAllowedIn; - value=valueIn; - }; + ParameterInfoVector(int inputLevelIn, int inputLevelAllowedIn, const char* nameStringIn, vector *valueIn) + : ParameterInfoBase(nameStringIn, inputLevelIn, inputLevelAllowedIn), + value(valueIn) {}; - void inputValues(istringstream &streamIn) { + void inputValues(istringstream &streamIn) override { (*value).clear(); while (streamIn.good()) { (*value).push_back(inputOneValue (streamIn)); @@ -108,9 +106,8 @@ class ParameterInfoVector : public ParameterInfoBase { }; }; - ~ParameterInfoVector() {}; protected: - virtual void printValues(std::ostream& outStr) const { + void printValues(std::ostream& outStr) const override { for (int ii=0; ii < (int) (*value).size(); ii++) { printOneValue(&(*value).at(ii),outStr); outStr<<" "; diff --git a/star-sys/STAR/source/Parameters.cpp b/star-sys/STAR/source/Parameters.cpp index 75bea9a..820e651 100755 --- a/star-sys/STAR/source/Parameters.cpp +++ b/star-sys/STAR/source/Parameters.cpp @@ -414,7 +414,7 @@ void Parameters::inputParameters (int argInN, char* argIn[]) {//input parameters for (uint ii=0; iiinputLevel>0) { inOut->logMain << setw(PAR_NAME_PRINT_WIDTH) << parArray[ii]->nameString <<" "<< *(parArray[ii]) << endl; - if (parArray[ii]->nameString != "parametersFiles" ) { + if (strcmp(parArray[ii]->nameString, "parametersFiles")) { clFull << " --" << parArray[ii]->nameString << " " << *(parArray[ii]); }; }; diff --git a/star-sys/STAR/source/ReadAlign_peOverlapMergeMap.cpp b/star-sys/STAR/source/ReadAlign_peOverlapMergeMap.cpp index 5dbe410..757c6d8 100644 --- a/star-sys/STAR/source/ReadAlign_peOverlapMergeMap.cpp +++ b/star-sys/STAR/source/ReadAlign_peOverlapMergeMap.cpp @@ -141,7 +141,7 @@ void ReadAlign::peMergeMates() { return; }; -void Transcript::peOverlapSEtoPE(uint* mateStart, Transcript &t) {//convert alignment from merged-SE to PE +void Transcript::peOverlapSEtoPE(const uint* mateStart, const Transcript &t) {//convert alignment from merged-SE to PE uint mLen[2]; mLen[0]=readLength[t.Str]; diff --git a/star-sys/STAR/source/SoloReadFeature_record.cpp b/star-sys/STAR/source/SoloReadFeature_record.cpp index 87c3960..0723e18 100755 --- a/star-sys/STAR/source/SoloReadFeature_record.cpp +++ b/star-sys/STAR/source/SoloReadFeature_record.cpp @@ -60,8 +60,7 @@ void SoloReadFeature::record(SoloReadBarcode &soloBar, uint nTr, set &re stats.V[stats.nAmbigFeature]++; return; }; - bool sjAnnot; - alignOut->extractSpliceJunctions(readSJs, sjAnnot); + bool sjAnnot = alignOut->extractSpliceJunctions(readSJs); if ( readSJs.empty() || (sjAnnot && readGene.size()==0) ) {//no junctions, or annotated junction buy no gene (i.e. read does not fully match transcript) stats.V[stats.nNoFeature]++; return; diff --git a/star-sys/STAR/source/TimeFunctions.h b/star-sys/STAR/source/TimeFunctions.h index 5512567..d4ea6cc 100644 --- a/star-sys/STAR/source/TimeFunctions.h +++ b/star-sys/STAR/source/TimeFunctions.h @@ -1,9 +1,9 @@ #ifndef TIME_FUNCTIONS_DEF #define TIME_FUNCTIONS_DEF #include -#include +#include string timeMonthDayTime(); -string timeMonthDayTime(time_t &rawTime); +string timeMonthDayTime(std::time_t &rawTime); #endif diff --git a/star-sys/STAR/source/Transcript.cpp b/star-sys/STAR/source/Transcript.cpp index d7f63b5..a7d898c 100644 --- a/star-sys/STAR/source/Transcript.cpp +++ b/star-sys/STAR/source/Transcript.cpp @@ -1,10 +1,5 @@ #include "Transcript.h" -Transcript::Transcript() -{ - reset(); -}; - void Transcript::reset() { extendL=0; @@ -23,9 +18,9 @@ void Transcript::reset() { nGap=0; lGap=0; lDel=0; lIns=0; nDel=0; nIns=0; nUnique=nAnchor=0; -}; +} -void Transcript::add(Transcript *trIn) { +void Transcript::add(const Transcript *trIn) noexcept { maxScore+=trIn->maxScore; nMatch+=trIn->nMatch; nMM+=trIn->nMM; @@ -33,11 +28,11 @@ void Transcript::add(Transcript *trIn) { lDel+=trIn->lDel; nDel+=trIn->nDel; lIns+=trIn->lIns; nIns+=trIn->nIns; nUnique+=trIn->nUnique; -}; +} -void Transcript::extractSpliceJunctions(vector> &sjOut, bool &annotYes) +bool Transcript::extractSpliceJunctions(vector> &sjOut) const { - annotYes=true; + bool annotYes=true; for (uint64 iex=0; iex=0) {//only record junctions, not indels or mate gap array sj; @@ -48,5 +43,6 @@ void Transcript::extractSpliceJunctions(vector> &sjOut, bool &an annotYes=false;//if one of the SJs is unannoated, annotYes=false }; }; -}; + return annotYes; +} diff --git a/star-sys/STAR/source/Transcript.h b/star-sys/STAR/source/Transcript.h index d9c46fe..4807718 100644 --- a/star-sys/STAR/source/Transcript.h +++ b/star-sys/STAR/source/Transcript.h @@ -1,6 +1,8 @@ #ifndef CODE_Transcript #define CODE_Transcript +#include + #include "IncludeDefine.h" #include "Parameters.h" #include "Variation.h" @@ -8,13 +10,13 @@ class Transcript { public: - uint exons[MAX_N_EXONS][EX_SIZE]; //coordinates of all exons: r-start, g-start, length - uint shiftSJ[MAX_N_EXONS][2]; //shift of the SJ coordinates due to genomic micro-repeats - int canonSJ[MAX_N_EXONS]; //canonicity of each junction - uint8 sjAnnot[MAX_N_EXONS]; //anotated or not - uint8 sjStr[MAX_N_EXONS]; //strand of the junction + std::array, MAX_N_EXONS> exons; //coordinates of all exons: r-start, g-start, length + std::array, MAX_N_EXONS> shiftSJ; //shift of the SJ coordinates due to genomic micro-repeats + std::array canonSJ; //canonicity of each junction + std::array sjAnnot; //anotated or not + std::array sjStr; //strand of the junction - uint intronMotifs[3]; + std::array intronMotifs; uint8 sjMotifStrand; uint nExons; //number of exons in the read transcript @@ -29,43 +31,44 @@ class Transcript { int iFrag; //frag number of the transcript, if the the transcript contains only one frag //loci - uint rStart, roStart, rLength, gStart, gLength, cStart; //read, original read, and genomic start/length, chromosome start + uint rStart = 0; //read + uint roStart = 0; //original read + uint rLength = 0; //read length + uint gStart = 0; //genomic start + uint gLength = 0; //genomic length + uint cStart; //chromosome start uint Chr,Str,roStr; //chromosome and strand and original read Strand - bool primaryFlag; + bool primaryFlag = false; - uint nMatch;//min number of matches - uint nMM;//max number of mismatches + uint nMatch = 0;//min number of matches + uint nMM = 0;//max number of mismatches uint mappedLength; //total mapped length, sum of lengths of all blocks(exons) - uint extendL; //extension length - intScore maxScore; //maximum Score + uint extendL = 0; //extension length + intScore maxScore = 0; //maximum Score - uint nGap, lGap; //number of genomic gaps (>alignIntronMin) and their total length - uint nDel; //number of genomic deletions (ie genomic gaps) - uint nIns; //number of (ie read gaps) - uint lDel; //total genomic deletion length - uint lIns; //total genomic insertion length + uint nGap = 0; + uint lGap = 0; //number of genomic gaps (>alignIntronMin) and their total length + uint nDel = 0; //number of genomic deletions (ie genomic gaps) + uint nIns = 0; //number of (ie read gaps) + uint lDel = 0; //total genomic deletion length + uint lIns = 0; //total genomic insertion length - uint nUnique, nAnchor; //number of unique pieces in the alignment, number of anchor pieces in the alignment + uint nUnique = 0; //number of anchor pieces in the alignment + uint nAnchor = 0; //number of unique pieces in the alignment vector varInd; vector varGenCoord, varReadCoord ; vector varAllele; - Transcript(); //resets to 0 void reset(); //reset to 0 - void resetMapG(); // reset map to 0 - void resetMapG(uint); // reset map to 0 for Lread bases - void add(Transcript*); // add - intScore alignScore(char **Read1, char *G, const Parameters &P); - int variationAdjust(const Genome &mapGen, char *R); - string generateCigarP(); //generates CIGAR - void peOverlapSEtoPE(uint* mSta, Transcript &t); - void extractSpliceJunctions(vector> &sjOut, bool &annotYes); - -private: - + void add(const Transcript*) noexcept; // add + intScore alignScore(const char **Read1, const char *G, const Parameters &P); + int variationAdjust(const Genome &mapGen, const char *R); + string generateCigarP() const; //generates CIGAR + void peOverlapSEtoPE(const uint* mSta, const Transcript &t); + bool extractSpliceJunctions(vector> &sjOut) const; }; #endif diff --git a/star-sys/STAR/source/Transcript_alignScore.cpp b/star-sys/STAR/source/Transcript_alignScore.cpp index b6bcfd7..ac2a3c8 100644 --- a/star-sys/STAR/source/Transcript_alignScore.cpp +++ b/star-sys/STAR/source/Transcript_alignScore.cpp @@ -1,11 +1,11 @@ #include #include "Transcript.h" -intScore Transcript::alignScore(char **Read1, char *G, const Parameters &P) {//re-calculates score and number of mismatches +intScore Transcript::alignScore(const char **Read1, const char *G, const Parameters &P) {//re-calculates score and number of mismatches maxScore=0; nMM=0; nMatch=0; - char* R=Read1[roStr==0 ? 0:2]; + const char* R=Read1[roStr==0 ? 0:2]; for (uint iex=0;iexexons[trA->nExons-1][EX_sjA]==sjAB \ - && trA->exons[trA->nExons-1][EX_iFrag]==iFragB && rBstart==rAend+1 && gAend+1exons[trA->nExons-1][EX_L]<=mapGen.sjdbShiftLeft[sjAB]) ) { + auto& penultimateExon = trA->exons[trA->nExons-1]; + if (sjAB!=((uint) -1) && penultimateExon[EX_iFrag]==iFragB \ + && penultimateExon[EX_sjA]==sjAB && rBstart==rAend+1 && gAend+1exons[trA->nExons][EX_L] = L; //new exon length - trA->exons[trA->nExons][EX_R] = rBstart; //new exon r-start - trA->exons[trA->nExons][EX_G] = gBstart; //new exon g-start + auto& exon = trA->exons[trA->nExons]; + exon[EX_R] = rBstart; //new exon r-start + exon[EX_G] = gBstart; //new exon g-start + exon[EX_L] = L; //new exon length trA->canonSJ[trA->nExons-1]=mapGen.sjdbMotif[sjAB]; //mark sj-db trA->shiftSJ[trA->nExons-1][0]=mapGen.sjdbShiftLeft[sjAB]; trA->shiftSJ[trA->nExons-1][1]=mapGen.sjdbShiftRight[sjAB]; @@ -36,7 +38,7 @@ intScore stitchAlignToTranscript(uint rAend, uint gAend, uint rBstart, uint gBst trA->sjAnnot[trA->nExons-1]=0; trA->sjStr[trA->nExons-1]=0; - if (trA->exons[trA->nExons-1][EX_iFrag]==iFragB) {//stitch aligns on the same fragment + if (penultimateExon[EX_iFrag]==iFragB) {//stitch aligns on the same fragment uint gBend=gBstart+L-1; uint rBend=rBstart+L-1; @@ -51,7 +53,7 @@ intScore stitchAlignToTranscript(uint rAend, uint gAend, uint rBstart, uint gBst //check if r-overlapping fully and exit if (rBend<=rAend) return -1000001; - if (gBend<=gAend && trA->exons[trA->nExons-1][EX_iFrag]==iFragB) return -1000002; + if (gBend<=gAend && penultimateExon[EX_iFrag]==iFragB) return -1000002; //shift the B 5' if overlaps A 3' if (rBstart<=rAend) { @@ -60,7 +62,9 @@ intScore stitchAlignToTranscript(uint rAend, uint gAend, uint rBstart, uint gBst L=rBend-rBstart+1; }; - for (uint ii=rBstart;ii<=rBend;ii++) Score+=scoreMatch; //add QS for mapped portions + if (rBend > rBstart) { + Score+=scoreMatch*(rBend-rBstart); //add QS for mapped portions + } int gGap=gBstart-gAend-1; //could be < 0 for insertions int rGap=rBstart-rAend-1;//>0 always since we removed overlap @@ -103,18 +107,24 @@ intScore stitchAlignToTranscript(uint rAend, uint gAend, uint rBstart, uint gBst int Score1=0; int jR1=1; //junction location in R-space + int ex_l = trA->exons[trA->nExons-1][EX_L]; do { // 1. move left, until the score for MM is less than canonical advantage jR1--; - if ( R[rAend+jR1]!=G[gBstart1+jR1] && G[gBstart1+jR1]<4 && R[rAend+jR1]==G[gAend+jR1]) Score1 -= scoreMatch; - } while ( Score1+P.scoreStitchSJshift >= 0 && int(trA->exons[trA->nExons-1][EX_L]) + jR1 > 1);//>=P.alignSJoverhangMin); //also check that we are still within the exon + auto rEnd = R[rAend+jR1]; + auto gStart = G[gBstart1+jR1]; + if ( rEnd!=gStart && gStart<4 && rEnd==G[gAend+jR1]) Score1 -= scoreMatch; + } while ( Score1+P.scoreStitchSJshift >= 0 && ex_l + jR1 > 1);//>=P.alignSJoverhangMin); //also check that we are still within the exon int maxScore2=-999999; Score1=0; int jPen=0; do { // 2. scan to the right to find the best junction locus // ?TODO? if genome base is N, how to score? - if ( R[rAend+jR1]==G[gAend+jR1] && R[rAend+jR1]!=G[gBstart1+jR1] ) Score1+=scoreMatch; - if ( R[rAend+jR1]!=G[gAend+jR1] && R[rAend+jR1]==G[gBstart1+jR1] ) Score1-=scoreMatch; + auto rEnd = R[rAend+jR1]; + auto gEnd = G[gAend+jR1]; + auto gStart = G[gBstart1+jR1]; + if ( rEnd==gEnd && rEnd!=gStart ) Score1+=scoreMatch; + else if ( rEnd!=gEnd && rEnd==gStart ) Score1-=scoreMatch; int jCan1=-1; //this marks Deletion int jPen1=0; @@ -122,20 +132,29 @@ intScore stitchAlignToTranscript(uint rAend, uint gAend, uint rBstart, uint gBst if (Del>=P.alignIntronMin) {//only check intron motif for large gaps= non-Dels //check if the intron is canonical, or semi-canonical - if ( G[gAend+jR1+1]==2 && G[gAend+jR1+2]==3 && G[gBstart1+jR1-1]==0 && G[gBstart1+jR1]==2 ) {//GTAG + // Pack into an aligned array to make it easier for the + // compiler to vectorize the comparison. + array g4 { G[gAend+jR1+1], G[gAend+jR1+2], G[gBstart1+jR1-1], G[gBstart1+jR1] }; + constexpr array GTAG = { 2, 3, 0, 2 }; + constexpr array CTAC = { 1, 3, 0, 1 }; + constexpr array GCAG = { 2, 1, 0, 2 }; + constexpr array CTGC = { 1, 3, 2, 1 }; + constexpr array ATAC = { 0, 3, 0, 1 }; + constexpr array GTAT = { 2, 3, 0, 3 }; + if ( g4 == GTAG ) { jCan1=1; - } else if ( G[gAend+jR1+1]==1 && G[gAend+jR1+2]==3 && G[gBstart1+jR1-1]==0 && G[gBstart1+jR1]==1 ) {//CTAC + } else if ( g4 == CTAC) { jCan1=2; - } else if ( G[gAend+jR1+1]==2 && G[gAend+jR1+2]==1 && G[gBstart1+jR1-1]==0 && G[gBstart1+jR1]==2 ) {//GCAG + } else if ( g4 == GCAG ) { jCan1=3; jPen1=P.scoreGapGCAG; - } else if ( G[gAend+jR1+1]==1 && G[gAend+jR1+2]==3 && G[gBstart1+jR1-1]==2 && G[gBstart1+jR1]==1 ) {//CTGC + } else if ( g4 == CTGC ) { jCan1=4; jPen1=P.scoreGapGCAG; - } else if ( G[gAend+jR1+1]==0 && G[gAend+jR1+2]==3 && G[gBstart1+jR1-1]==0 && G[gBstart1+jR1]==1 ) {//ATAC + } else if ( g4 == ATAC) { jCan1=5; jPen1=P.scoreGapATAC; - } else if ( G[gAend+jR1+1]==2 && G[gAend+jR1+2]==3 && G[gBstart1+jR1-1]==0 && G[gBstart1+jR1]==3 ) {//GTAT + } else if ( g4 == GTAT ) { jCan1=6; jPen1=P.scoreGapATAC; } else { @@ -325,18 +344,20 @@ intScore stitchAlignToTranscript(uint rAend, uint gAend, uint rBstart, uint gBst if (Del==0 && Ins==0) {//no gap => no new exon, extend the boundary of the previous exon trA->exons[trA->nExons-1][EX_L] += rBend-rAend; } else if (Del>0) { //deletion:ca only have Del> or Ins>0 - trA->exons[trA->nExons-1][EX_L] += jR; //correct the previous exon boundary - trA->exons[trA->nExons][EX_L] = rBend-rAend-jR; //new exon length - trA->exons[trA->nExons][EX_R] = rAend+jR+1; //new exon r-start - trA->exons[trA->nExons][EX_G] = gBstart1+jR+1; //new exon g-start + trA->exons[trA->nExons-1][EX_L] += jR; + auto& exon = trA->exons[trA->nExons]; + exon[EX_L] = rBend-rAend-jR; //new exon length + exon[EX_R] = rAend+jR+1; //new exon r-start + exon[EX_G] = gBstart1+jR+1; //new exon g-start trA->nExons++; } else if (Ins>0) { //Ins>0; trA->nIns += nIns; trA->lIns += Ins; trA->exons[trA->nExons-1][EX_L] += jR; //correct the previous exon boundary - trA->exons[trA->nExons][EX_L] = rBend-rAend-jR-Ins; //new exon length - trA->exons[trA->nExons][EX_R] = rAend+jR+Ins+1; //new exon r-start - trA->exons[trA->nExons][EX_G] = gAend+1+jR; //new exon g-start + auto& exon = trA->exons[trA->nExons]; + exon[EX_L] = rBend-rAend-jR-Ins; //new exon length + exon[EX_R] = rAend+jR+Ins+1; //new exon r-start + exon[EX_G] = gAend+1+jR; //new exon g-start trA->canonSJ[trA->nExons-1]=-2; //mark insertion trA->sjAnnot[trA->nExons-1]=0; trA->nExons++; @@ -364,8 +385,6 @@ intScore stitchAlignToTranscript(uint rAend, uint gAend, uint rBstart, uint gBst //>1 //AATATTTGGAACACTTATGTGAAAAATGATTTGTTTTTCTGAAATTTACGTTTCTCTCTGAGTCCTGTAACTGTCC - - trExtend.reset(); if ( extendAlign(R, G, rAend+1, gAend+1, 1, 1, DEF_readSeqLengthMax, trA->nMatch, trA->nMM, outFilterMismatchNmaxTotal, P.outFilterMismatchNoverLmax, \ P.alignEndsType.ext[trA->exons[trA->nExons-1][EX_iFrag]][1], &trExtend) ) { @@ -375,9 +394,10 @@ intScore stitchAlignToTranscript(uint rAend, uint gAend, uint rBstart, uint gBst trA->exons[trA->nExons-1][EX_L] += trExtend.extendL; };// if extendAlign for read A - trA->exons[trA->nExons][EX_R] = rBstart; - trA->exons[trA->nExons][EX_G] = gBstart; - trA->exons[trA->nExons][EX_L] = L; + auto& exon = trA->exons[trA->nExons]; + exon[EX_R] = rBstart; + exon[EX_G] = gBstart; + exon[EX_L] = L; trA->nMatch += L; trExtend.reset(); @@ -389,9 +409,9 @@ intScore stitchAlignToTranscript(uint rAend, uint gAend, uint rBstart, uint gBst trA->add(&trExtend); Score += trExtend.maxScore; - trA->exons[trA->nExons][EX_R] -= trExtend.extendL; - trA->exons[trA->nExons][EX_G] -= trExtend.extendL; - trA->exons[trA->nExons][EX_L] += trExtend.extendL; + exon[EX_R] -= trExtend.extendL; + exon[EX_G] -= trExtend.extendL; + exon[EX_L] += trExtend.extendL; }; //if extendAlign B trA->canonSJ[trA->nExons-1]=-3; //mark different fragments junction diff --git a/star-sys/build.rs b/star-sys/build.rs index 7c70b1a..27ccc27 100644 --- a/star-sys/build.rs +++ b/star-sys/build.rs @@ -123,7 +123,7 @@ fn main() { .cpp_link_stdlib(Some(libcxx())) .define("COMPILATION_TIME_PLACE", "\"build.rs\"") .files(FILES) - .flag("-std=c++11") + .flag("-std=c++17") .flag("-Wall") .flag("-Wextra") .flag("-Werror") From 565ee6738b87b3deec5dacf4f5e7826ff8a8b485 Mon Sep 17 00:00:00 2001 From: Adam Azarchs Date: Wed, 23 Mar 2022 14:06:58 -0700 Subject: [PATCH 2/9] Use -fvisibility=hidden and -ffunction-sections. We're linking statically, and this lets the compiler be better at dead-code elimination. --- star-sys/build.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/star-sys/build.rs b/star-sys/build.rs index 27ccc27..fb55ad1 100644 --- a/star-sys/build.rs +++ b/star-sys/build.rs @@ -127,5 +127,7 @@ fn main() { .flag("-Wall") .flag("-Wextra") .flag("-Werror") + .flag("-fvisibility=hidden") + .flag("-ffunction-sections") .compile("orbit"); } From 5a4dfb395757e580eec9551aef07e1b00241562e Mon Sep 17 00:00:00 2001 From: Adam Azarchs Date: Wed, 23 Mar 2022 14:18:02 -0700 Subject: [PATCH 3/9] Missing edit. --- star-sys/STAR/source/Parameters.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/star-sys/STAR/source/Parameters.cpp b/star-sys/STAR/source/Parameters.cpp index 820e651..731ac4e 100755 --- a/star-sys/STAR/source/Parameters.cpp +++ b/star-sys/STAR/source/Parameters.cpp @@ -14,6 +14,9 @@ #define PAR_NAME_PRINT_WIDTH 30 +ParameterInfoBase::ParameterInfoBase(const char* nameStringIn, int inputLevelIn, int inputLevelAllowedIn) + : nameString(nameStringIn), inputLevel(inputLevelIn), inputLevelAllowed(inputLevelAllowedIn) {} + Parameters::Parameters() {//initalize parameters info inOut = new InOutStreams; From 104344b98539e3e4c3a2e7d00f3f5ada3945fcb8 Mon Sep 17 00:00:00 2001 From: Nigel Delaney Date: Fri, 2 Sep 2022 14:37:35 -0700 Subject: [PATCH 4/9] temp log --- src/lib.rs | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 09687c3..3f3c910 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,6 +8,7 @@ use std::os::raw::c_char; use std::os::raw::c_int; use std::path::Path; use std::sync::Arc; +use std::time::Instant; use anyhow::{format_err, Error}; use rust_htslib::bam; @@ -253,6 +254,7 @@ impl StarAligner { /// Aligns a given read and produces BAM records pub fn align_read(&mut self, name: &[u8], read: &[u8], qual: &[u8]) -> Vec { + let now = Instant::now(); // STAR will throw an error on empty reads - so just construct an empty record. if read.len() == 0 { // Make an unmapped record and return it @@ -261,7 +263,17 @@ impl StarAligner { Self::prepare_fastq(&mut self.fastq1, name, read, qual); align_read_rust(self.aligner, self.fastq1.as_slice(), &mut self.aln_buf).unwrap(); - self.parse_sam_to_records(name) + let record= self.parse_sam_to_records(name); + let new_now = Instant::now(); + let (chr, pos, cigar, cigar_ops) = if record.len() > 0 { + let rec = &record[0]; + (rec.tid().to_string(), rec.pos().to_string(), format!("{}", rec.cigar()), rec.cigar().len().to_string()) + } else { + ("NA".to_string(), "NA".to_string(), "NA".to_string(), "NA".to_string()) + }; + println!("{:?},{:?},{:?},{},{},{},{}", std::str::from_utf8(&read).unwrap(), new_now.duration_since(now), record.len(), chr, pos, cigar, cigar_ops); + record + } /// Aligns a given read and return the resulting SAM string From 9efdf32459875e877969d5415c4327b800fd26b3 Mon Sep 17 00:00:00 2001 From: Nigel Delaney Date: Fri, 2 Sep 2022 15:11:26 -0700 Subject: [PATCH 5/9] Update htslib --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c2d9713..2fa271a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -252,9 +252,9 @@ checksum = "7fe5bd57d1d7414c6b5ed48563a2c855d995ff777729dcd91c369ec7fea395ae" [[package]] name = "rust-htslib" -version = "0.38.2" +version = "0.39.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2aca6626496389f6e015e25433b85e2895ad3644b44de91167d847bf2d8c1a1c" +checksum = "239ef7334dbf59acd56b7a6fa62a525ed7e36d6239a686ed4ff61bc794108e53" dependencies = [ "bio-types", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index eb12309..19ee2e3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,7 +11,7 @@ include = ["wrapper.h", "src/lib.rs", "LICENSE", "README.md"] [dependencies] anyhow = "1" libc = "0.2" -rust-htslib = { version = "0.38.2", default-features = false, features = ["serde_feature"] } +rust-htslib = { version = "0.39.5", default-features = false, features = ["serde_feature"] } star-sys = { version = "0.2", path = "star-sys" } [profile.release] From 782c6a9869b85564aedc58230eb2d88ff04028b4 Mon Sep 17 00:00:00 2001 From: Nigel Delaney Date: Sat, 3 Sep 2022 15:05:17 -0700 Subject: [PATCH 6/9] Optimize by default --- star-sys/build.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/star-sys/build.rs b/star-sys/build.rs index 7c70b1a..830c24f 100644 --- a/star-sys/build.rs +++ b/star-sys/build.rs @@ -127,5 +127,6 @@ fn main() { .flag("-Wall") .flag("-Wextra") .flag("-Werror") + .flag("-O3") .compile("orbit"); } From b34d0ffd4e0538f6de978a2392844ac228172156 Mon Sep 17 00:00:00 2001 From: Adam Azarchs Date: Tue, 6 Sep 2022 16:15:00 -0700 Subject: [PATCH 7/9] Help the vectorizer out. --- star-sys/STAR/source/stitchAlignToTranscript.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/star-sys/STAR/source/stitchAlignToTranscript.cpp b/star-sys/STAR/source/stitchAlignToTranscript.cpp index d8caab5..33d9832 100644 --- a/star-sys/STAR/source/stitchAlignToTranscript.cpp +++ b/star-sys/STAR/source/stitchAlignToTranscript.cpp @@ -5,6 +5,19 @@ #include "binarySearch2.h" // #include "stitchGapIndel.cpp" +namespace std { + +// Specialize array comparison to help the vectorizer out. +// +// Comparing 2 arrays of width 4 should just end up being comparison of 32-bit +// values. The compiler has trouble figuring that out if it goes through the +// default implementation, which uses std::equal. +template<> +inline bool operator==(const array& a, const array& b) { + return (*reinterpret_cast(&a[0])) == (*reinterpret_cast(&b[0])); +} + +} intScore stitchAlignToTranscript(uint rAend, uint gAend, uint rBstart, uint gBstart, uint L, uint iFragB, uint sjAB, const Parameters& P, char* R, const Genome &mapGen, Transcript *trA, const uint outFilterMismatchNmaxTotal) { //stitch together A and B, extend in the gap, returns max score From a0f5d95eabb5c9366fda5c361b1b32127a53633c Mon Sep 17 00:00:00 2001 From: Adam Azarchs Date: Tue, 6 Sep 2022 16:40:40 -0700 Subject: [PATCH 8/9] Rustfmt. --- src/lib.rs | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 9e82d9c..0b7e21c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -263,17 +263,35 @@ impl StarAligner { Self::prepare_fastq(&mut self.fastq1, name, read, qual); align_read_rust(self.aligner, self.fastq1.as_slice(), &mut self.aln_buf).unwrap(); - let record= self.parse_sam_to_records(name); + let record = self.parse_sam_to_records(name); let new_now = Instant::now(); let (chr, pos, cigar, cigar_ops) = if record.len() > 0 { let rec = &record[0]; - (rec.tid().to_string(), rec.pos().to_string(), format!("{}", rec.cigar()), rec.cigar().len().to_string()) + ( + rec.tid().to_string(), + rec.pos().to_string(), + format!("{}", rec.cigar()), + rec.cigar().len().to_string(), + ) } else { - ("NA".to_string(), "NA".to_string(), "NA".to_string(), "NA".to_string()) + ( + "NA".to_string(), + "NA".to_string(), + "NA".to_string(), + "NA".to_string(), + ) }; - println!("{:?},{:?},{:?},{},{},{},{}", std::str::from_utf8(&read).unwrap(), new_now.duration_since(now), record.len(), chr, pos, cigar, cigar_ops); + println!( + "{:?},{:?},{:?},{},{},{},{}", + std::str::from_utf8(&read).unwrap(), + new_now.duration_since(now), + record.len(), + chr, + pos, + cigar, + cigar_ops + ); record - } /// Aligns a given read and return the resulting SAM string From 205a83d6eacb7bfa3be45d9e116656359dbc86cc Mon Sep 17 00:00:00 2001 From: Adam Azarchs Date: Wed, 7 Sep 2022 11:47:19 -0700 Subject: [PATCH 9/9] clippy fix --- src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 0b7e21c..55b4bc5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -265,7 +265,7 @@ impl StarAligner { align_read_rust(self.aligner, self.fastq1.as_slice(), &mut self.aln_buf).unwrap(); let record = self.parse_sam_to_records(name); let new_now = Instant::now(); - let (chr, pos, cigar, cigar_ops) = if record.len() > 0 { + let (chr, pos, cigar, cigar_ops) = if !record.is_empty() { let rec = &record[0]; ( rec.tid().to_string(), @@ -283,7 +283,7 @@ impl StarAligner { }; println!( "{:?},{:?},{:?},{},{},{},{}", - std::str::from_utf8(&read).unwrap(), + std::str::from_utf8(read).unwrap(), new_now.duration_since(now), record.len(), chr,