pax_global_header00006660000000000000000000000064125771266220014524gustar00rootroot0000000000000052 comment=29d34c5acf5b2c7bb3d5a7ee2730f4f2d7a7a522 transrate-tools-1.0.0/000077500000000000000000000000001257712662200146635ustar00rootroot00000000000000transrate-tools-1.0.0/.gitignore000066400000000000000000000001471257712662200166550ustar00rootroot00000000000000CMakeCache.txt CMakeFiles/ CTestTestfile.cmake cmake_install.cmake Makefile src/bam-read src/bam-split transrate-tools-1.0.0/.gitmodules000066400000000000000000000001331257712662200170350ustar00rootroot00000000000000[submodule "bamtools"] path = bamtools url = https://github.com/pezmaster31/bamtools.git transrate-tools-1.0.0/CMakeLists.txt000066400000000000000000000027661257712662200174360ustar00rootroot00000000000000cmake_minimum_required (VERSION 2.8) enable_testing() project (transrate-tools) set(CPACK_PACKAGE_VERSION "1.0.0") set(CPACK_PACKAGE_VERSION_MAJOR "1") set(CPACK_PACKAGE_VERSION_MINOR "0") set(CPACK_PACKAGE_VERSION_PATCH "0") set(CPACK_GENERATOR "TGZ") set(CPACK_SOURCE_GENERATOR "TGZ") set(CPACK_PACKAGE_VENDOR "Hibberd Lab @ University of Cambridge") set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "transrate-tools - alignment parsing for transrate") set(CPACK_PACKAGE_NAME "${CMAKE_PROJECT_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") set(CPACK_SOURCE_PACKAGE_FILE_NAME "${CMAKE_PROJECT_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}-Source") set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/") SET(CMAKE_SHARED_LIBRARY_LINK_C_FLAGS) ## Set the standard required compile flags set (CMAKE_CXX_FLAGS "-fPIC -O3 -DHAVE_ANSI_TERM -DHAVE_SSTREAM -DHAVE_CONFIG_H -Wall -std=c++11") set(CMAKE_FIND_LIBRARY_SUFFIXES ".a") ## # Record this top-level path ## set (GAT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) # Have CMake tell us what it's doing set (CMAKE_VERBOSE_MAKEFILE true) if("${CMAKE_SYSTEM}" MATCHES "Linux") set(CMAKE_EXE_LINKER_FLAGS "-static-libgcc -static-libstdc++") endif("${CMAKE_SYSTEM}" MATCHES "Linux") if(APPLE) find_package (ZLIB REQUIRED) else() find_package (ZLIB REQUIRED STATIC) endif() # Recurse into transrate-tools source directory add_subdirectory ( src ) transrate-tools-1.0.0/README.md000066400000000000000000000026141257712662200161450ustar00rootroot00000000000000transrate-tools =============== Command-line tools used by [transrate](http://github.com/blahah/transrate) for processing bam files. Written in C++ and using Bamtools. ### building transrate-tools uses C++11 features, so you'll need at least g++ 4.8 installed. On OSX you can install the latest gcc with `brew install gcc49`. Note that on OSX you always need to tell `cmake` where to find gcc, using the option `-DCMAKE_CXX_COMPILER=$(which g++-4.9)`. Make sure you clone with submodules: ```bash $ git clone --recursive git@github.com:cboursnell/transrate-tools.git ``` And you'll need cmake installed. Next, build bamtools: on linux: ```bash $ cd bamtools $ mkdir build $ cd build $ cmake .. $ make $ cd ../../ ``` or on OSX: ```bash $ cd bamtools $ mkdir build $ cd build $ cmake -DCMAKE_CXX_COMPILER=$(which g++-4.9) .. $ make $ cd ../../ ``` Then build transrate-tools... on linux: ```bash $ cmake . $ make ``` on OSX: ```bash $ cmake -DCMAKE_CXX_COMPILER=$(which g++-4.9) . $ make ``` The executable is called `bam-read` and will be in the `src` directory. ### bam-read Parse bam files after multi-mapping reads have been assigned and aggregate read mapping information by contig. Currently captured: - name - bases mapped - edit distance - bridges - length - reads mapped - both mapped - proper pair - good - uncovered bases - mean mapq - probability of coverage not being segmented transrate-tools-1.0.0/src/000077500000000000000000000000001257712662200154525ustar00rootroot00000000000000transrate-tools-1.0.0/src/CMakeLists.txt000066400000000000000000000010211257712662200202040ustar00rootroot00000000000000set(BAMREAD_SRCS bam-read.cpp pileup.cpp segmenter.cpp) set(BAMREAD_HEADERS pileup.h segmenter.h) include_directories(${GAT_SOURCE_DIR}/bamtools/include) link_directories(${GAT_SOURCE_DIR}/bamtools/lib) set(LIBRARIES ${GAT_SOURCE_DIR}/bamtools/lib/libbamtools.a ${ZLIB_LIBRARIES}) add_executable(bam-read ${BAMREAD_SRCS} ${BAMREAD_HEADERS}) set_target_properties(bam-read PROPERTIES LINK_SEARCH_START_STATIC 1) set_target_properties(bam-read PROPERTIES LINK_SEARCH_END_STATIC 1) target_link_libraries(bam-read ${LIBRARIES}) transrate-tools-1.0.0/src/bam-read.cpp000066400000000000000000000146471257712662200176420ustar00rootroot00000000000000#include "bam-read.h" int BamRead::estimate_fragment_size(std::string file) { if (!reader.Open(file)) { cerr << "Could not open BAM file" << endl; return 1; } int count = 0; std::string name = ""; std::string prev = ""; int pos1 = -1; int pos2 = -1; int len1 = -1; int len2 = -1; int fragment = 0; // mean fragment length int mean = 0; double mn = -1; double m = -1; double s = 0; bool is_reversed; bool is_mate_reversed; while (reader.GetNextAlignment(alignment) && count < 10000) { if (name != "") { prev = name; pos2 = pos1; len2 = len1; } if (alignment.IsPrimaryAlignment()) { // does this do anything? name = alignment.Name; pos1 = alignment.Position; len1 = alignment.Length; if (prev == name) { if (pos1 >= 0 && pos2 >= 0) { is_reversed = alignment.IsReverseStrand(); is_mate_reversed = alignment.IsMateReverseStrand(); if (!is_reversed && is_mate_reversed) { if (pos1 > alignment.MatePosition) { continue; } } else if (is_reversed && !is_mate_reversed) { if (alignment.MatePosition > pos1) { continue; } } if (pos1 > pos2) { fragment = (pos1 - pos2 + len1); } else { fragment = (pos2 - pos1 + len2); } if (count > 0) { mn = m + (fragment - m)/count; s = s + ((fragment - m) * (fragment - mn)); m = mn; } else { m = fragment; } mean += fragment; ++count; } } } } mean=mean/(double)count; // cout << "count: " << count << " mean:" << mean << endl; s = sqrt(s/(count-1)); // cout << "sd: " << s << endl; realistic_distance = (int)(3 * s + mean); return 0; } int BamRead::load_bam(std::string file) { if (!reader.Open(file)) { cerr << "Could not open BAM file" << endl; return 1; } // load the sam header SamSequenceDictionary dictionary = reader.GetHeader().Sequences; seq_count = dictionary.Size(); array.resize(seq_count); // fill the hash map with initial values std::vector::iterator it = dictionary.Begin(); int len = 0; for (int i = 0; i < seq_count; i++) { if (it[i].HasLength()) { len = atoi(it[i].Length.c_str()); } else { len = 0; } // array[i] = TransratePileup(it[i].Name, len); array[i].setName(it[i].Name); array[i].setLength(len); } // loop through the bam file double scale = 0.65; double seq_true = 0; while (reader.GetNextAlignment(alignment)) { // read must be mapped if (!alignment.IsMapped()) { continue; } refid = alignment.RefID; ++array[refid].reads_mapped; array[refid].addAlignment(alignment); // store edit distance for sequence accuracy calculation if (alignment.HasTag("NM")) { if (alignment.GetTag("NM", nm_tag)) { len = alignment.Length; scale = (len - 35)/(double)len; seq_true = (len - nm_tag)/(double)len; seq_true = (seq_true - scale) * (1 / (1 - scale)); array[refid].p_seq_true += seq_true; } } // count fragments where either or both mates mapped if (alignment.IsFirstMate() || (alignment.IsSecondMate() && !alignment.IsMateMapped())) { ++array[refid].fragments_mapped; } // from now on ignore fragments unless both mates mapped if (!(alignment.IsFirstMate() && alignment.IsMateMapped())) { continue; } ++array[refid].both_mapped; // // count proper pairs, although we have our own definition because // // not all aligners use the same definition if (alignment.IsProperPair()) { ++array[refid].properpair; } // // mates must align to same contig, otherwise we record a bridge if (refid != alignment.MateRefID) { ++array[refid].bridges; continue; } ldist = max(alignment.Position-alignment.MatePosition, alignment.MatePosition-alignment.Position); if (ldist > realistic_distance) { // mates are too far apart continue; } // read orientation must match the generated library // in this case we only test for FR/RF orientation, // that is - we expect mates to be on opposite strands bool is_reversed = alignment.IsReverseStrand(); bool is_mate_reversed = alignment.IsMateReverseStrand(); if (!is_reversed && is_mate_reversed) { // in FR orientation, first read must start // before second read if (alignment.Position < alignment.MatePosition) { array[refid].good++; } } else if (is_reversed && !is_mate_reversed) { // in RF orientation, second read must start // before first read if (alignment.MatePosition < alignment.Position) { array[refid].good++; } } } // end of bam file for (int i = 0; i < seq_count; i++) { array[i].calculateUncoveredBases(); array[i].setPNotSegmented(); } return 0; } int main (int argc, char* argv[]) { BamRead bam; if (argc >= 3) { if (argc == 4) { // user has supplied a segmentation null prior nullprior = atof(argv[3]); } string infile = argv[1]; bam.estimate_fragment_size(infile); bam.load_bam(infile); // open file for writing std::ofstream output; output.open (argv[2]); output << "name,p_seq_true,bridges,length,fragments_mapped," "both_mapped,properpair,good,bases_uncovered," "p_not_segmented" << endl; for (int i = 0; i < bam.seq_count; i++) { output << bam.array[i].name << ","; if (bam.array[i].reads_mapped > 0) { output << bam.array[i].p_seq_true/bam.array[i].reads_mapped << ","; } else { output << "1,"; } output << bam.array[i].bridges << ","; output << bam.array[i].ref_length << ","; output << bam.array[i].fragments_mapped << ","; output << bam.array[i].both_mapped << ","; output << bam.array[i].properpair << ","; output << bam.array[i].good << ","; output << bam.array[i].bases_uncovered << ","; output << bam.array[i].p_not_segmented << endl; } output.close(); return 0; } else { cout << "bam-read version 1.0.0\n" << "Usage:\n" << "bam-read \n\n" << "example:\n" << "bam-read in.bam out.csv 0.95" << endl; return 1; } } transrate-tools-1.0.0/src/bam-read.h000066400000000000000000000007671257712662200173050ustar00rootroot00000000000000#include #include #include #include #include #include "api/BamReader.h" #include "pileup.h" using namespace BamTools; using namespace std; double nullprior = 0.7; class BamRead { public: int estimate_fragment_size(std::string file); int load_bam(std::string); int bar; int seq_count; uint32_t nm_tag; int ldist; int realistic_distance; int refid; BamReader reader; BamAlignment alignment; std::vector array; }; transrate-tools-1.0.0/src/pileup.cpp000066400000000000000000000041161257712662200174560ustar00rootroot00000000000000#include "pileup.h" using namespace BamTools; using namespace std; //constructor TransratePileup::TransratePileup() { ref_length = 100; bases_mapped = 0; p_seq_true = 0.0; bridges = 0; name = "unknown"; reads_mapped = 0; fragments_mapped = 0; both_mapped = 0; properpair = 0; good = 0; bases_uncovered = 0; p_unique = 0; p_not_segmented = 0; coverage.resize(ref_length); } void TransratePileup::setName(std::string _name) { name = _name; } void TransratePileup::setLength(int len) { ref_length = len; coverage.resize(len); } int TransratePileup::getCoverage(int i) { return coverage[i]; } vector TransratePileup::getCoverageArray() { return coverage; } void TransratePileup::calculateUncoveredBases() { for (int i = 0; i < ref_length; ++i) { if (coverage[i]==0) { ++bases_uncovered; } } } void TransratePileup::setPNotSegmented() { vector states(30); int bin_width = (ref_length / 30)+1; // ceil int counter=0; int pos=0; int total=0; for (int i = 0; i < ref_length; ++i) { total += coverage[i]; counter++; if (counter==bin_width || i==ref_length-1) { states[pos] = (int)min(24.0, max(0.0,log2(total/counter))); pos++; counter=0; total=0; } } // Segmenter segmenter(states, nullprior); p_not_segmented = segmenter.prob_k_given_R(0); // prob of 0 change points (1 segment) } void TransratePileup::addAlignment(const BamAlignment& alignment) { int numCigarOps = alignment.CigarData.size(); int pos = alignment.Position; bases_mapped += alignment.Length; for (int i = 0; i < numCigarOps; ++i) { const CigarOp& op = alignment.CigarData.at(i); if (op.Type == 'M') { // increase coverage by 1 for each position that is a match for(p = 0; p < (int)op.Length; ++p) { if (pos < ref_length) { ++coverage[pos]; } ++pos; } } if (op.Type == 'I') { // I means insertion - gaps in reference } if (op.Type == 'D' || op.Type == 'S') { // D means deletion - gaps in read pos += (int)op.Length; } } } transrate-tools-1.0.0/src/pileup.h000066400000000000000000000017671257712662200171340ustar00rootroot00000000000000#include #include #include #include #include "api/BamAlignment.h" #include "segmenter.h" extern double nullprior; using namespace BamTools; using namespace std; class TransratePileup { private: // instance variables int p; vector coverage; public: // constructor TransratePileup(); // instance variables i can't be bothered to write accessor methods for int ref_length; int bases_mapped; double p_seq_true; int bridges; int length; string name; int reads_mapped; int fragments_mapped; int both_mapped; int properpair; int good; int bases_uncovered; double p_unique; double p_not_segmented; // methods void clearCoverage(int rl); void setName(std::string); void setLength(int); int getCoverage(int i); vector getCoverageArray(); void calculateUncoveredBases(); void setPNotSegmented(); void addAlignment(const BamAlignment& alignment); }; transrate-tools-1.0.0/src/segmenter.cpp000066400000000000000000000060411257712662200201500ustar00rootroot00000000000000// make sure to compile with flag: -std=c++11 #include "segmenter.h" using namespace std; Segmenter::Segmenter (vector &seq_) { seq = seq_; nullprior = 0.7; load_states(); } Segmenter::Segmenter (vector &seq_, double nullprior_) { seq = seq_; nullprior = nullprior_; load_states(); } void Segmenter::load_states() { states_.resize(24, 0); for (int i = 0 ; i < seq.size() ; ++i) { if (seq[i]<24) { ++states_[seq[i]]; } else { ++states_[23]; } } pRk_.resize(2, -1.0); } double Segmenter::marginal_likelihood_R() { double sum = 0.0; for (int k = 0; k < 2; ++k) { sum += prior_k(k) * prob_R_given_k(k); } return sum; } double Segmenter::prior_k(int k) { if (k == 0) { return nullprior; } else { return ((1.0 - nullprior) / maxk); } } double Segmenter::prob_R_given_k(int k) { float result = pRk_[k]; if (result == -1.0) { if (k == 0) { result = prob_R_given_zero_k(); } else if (k == 1) { result = prob_R_given_unit_k(); } pRk_[k] = result; } return result; } double Segmenter::prob_R_given_zero_k() { if (pRk_[0] != -1.0) { return pRk_[0]; } double result = prob_R_given_k_rhs(states_, seq.size()); pRk_[0] = result; return result; } double Segmenter::prob_R_given_unit_k() { if (pRk_[1] != -1.0) { return pRk_[1]; } vector lstates = states_; int total = seq.size(); vector > pmat(total, vector(total, 0.0)); for (int i = 0; i < total; ++i) { vector segstates = lstates; for (int j = total - 1; j >= i; --j) { // get the probability of this segment given k=0 double segresult = prob_R_given_k_rhs(segstates, j + 1 - i); // save it for lookup pmat[i][j] = segresult; // decrease the count of the last base in this segment --segstates[seq[j]]; } // decrease the count of the first base in this sequence --lstates[seq[i]]; } // we have a flat prior on observing any given segmentation // to avoid zero priors, we have a lower limit of FLT_MIN double pA = max((1.0 / (double)(total - 1)), (double)FLT_MIN); // now we calculate the probability for k=1 double result = 0.0; for (int i = 0; i < total - 1; ++i) { // probability for the sequence to the left of the change point double left = pmat[0][i]; // probability for the sequence to the right of the change point double right = pmat[i + 1][total - 1]; // probability for this segmentation double product = left * right * pA; result += product; } pRk_[1] = result; return result; } double Segmenter::prob_R_given_k_rhs(vector &states, int length) { int nstates = states.size(); double l = tgamma(nstates); double upper = 1; for (auto p : states) { upper *= tgamma(p + 1); } double lower = tgamma(length + nstates); return l * (upper / lower); } double Segmenter::prob_k_given_R(int k) { double pRk = prob_R_given_k(k); double pk = prior_k(k); double mlR = marginal_likelihood_R(); return pRk * pk / mlR; } transrate-tools-1.0.0/src/segmenter.h000066400000000000000000000013231257712662200176130ustar00rootroot00000000000000#include #include #include /* modf, tgamma */ #include /* FLT_MIN */ using namespace std; class Segmenter { vector seq; const int maxk = 1; double nullprior; vector states_; vector pRk_; public: Segmenter (vector &seq_, double nullprior_); Segmenter (vector &seq_); // sequence states void load_states(); vector states() { return states_; }; // probability calculations double marginal_likelihood_R(); double prior_k(int k); double prob_R_given_k(int k); double prob_R_given_zero_k(); double prob_R_given_unit_k(); double prob_R_given_k_rhs(vector &states, int length); double prob_k_given_R(int k); };