transterm_hp_v2.09/0000775000265600020320000000000011652465416013562 5ustar tilleaadmintransterm_hp_v2.09/ermolaeva-score.h0000664000265600020320000000606711514142021017005 0ustar tilleaadmin/* This file is part of TransTerm v2.0 BETA and is covered by the GNU GPL * License version 2.0. See file LICENSE.txt for more details. */ #ifndef ERMOLAEVA_SCORE_H #define ERMOLAEVA_SCORE_H #include "seq.h" typedef Energy (*HPScoreFcn)(char, char); Energy tail_score(const Term &, Direction); Energy hairpin_score(HPScoreFcn, const Term & ); Energy reverse_pair(char, char); Energy forward_pair(char, char); Energy loop_penalty(int); void set_loop_pen(const string &); //==== simple scores === /* const Energy GC = -3; const Energy AU = -2; const Energy GU = -1; const Energy MM = 4; */ //=== SVM model scores === /* const Energy GC = -1.99; const Energy AU = -0.99; const Energy GU = 1; const Energy MM = 4; const float LOOP_PENALTY[20]={1000,1000,1000, -0.75,0.25,1.24, 1.25, 2.50, 3.75, 5.00, 6.25, 7.50, 8.75, 10.00, 11.25, 12.50, 13.75, 15.00, 16.25, 17.5}; */ //=== SVM model scores === /* const Energy GC = -1; const Energy AU = -0.67; const Energy GU = 0.46; const Energy MM = 1.83; const float LOOP_PENALTY[20]={1000,1000,1000, 0.18, 0.0, 0.21, 0.33, 0.66, 0.99, 1.32, 1.65, 1.98, 2.31, 2.64, 2.97, 3.30, 3.63, 3.96, 4.29 }; */ //=== Ermolaeva et al Scores === extern Energy ENERGY_CUTOFF; extern Energy TAIL_CUTOFF; extern Energy LOOP_PENALTY[20]; // stem energies extern Energy GC; extern Energy AU; extern Energy GU; extern Energy MM; extern Energy GAP; extern Energy MM_OPEN; // stem size limits //const unsigned MAX_STEM = 22; //const unsigned MAX_LOOP = 13; //const int MAX_STEM = 22; //const int MAX_STEM = 200; //const int MAX_LOOP = 13; //const int MAX_HP = 2*MAX_STEM + MAX_LOOP + 2; extern int MIN_STEM; extern int MIN_LOOP; extern int MAX_STEM; extern int MAX_LOOP; extern int MAX_HP; const int REALLY_MAX_HP = 1000; // size of the allocated table // required # of U's in the window 
next to the tail extern int UWINDOW_SIZE; extern int UWINDOW_REQUIRE; // the energy of a hairpin pair on the forward strand inline Energy forward_pair(char ch1, char ch2) { if((ch1 == 'G' && ch2 == 'C') || (ch1 == 'C' && ch2 == 'G')) return GC; if((ch1 == 'T' && ch2 == 'A') || (ch1 == 'A' && ch2 == 'T')) return AU; if((ch1 == 'T' && ch2 == 'G') || (ch1 == 'G' && ch2 == 'T')) return GU; if(ch1 == PADDING_CHAR || ch2 == PADDING_CHAR) return 1000.0; return MM; } // the energy of a hairpin pair on the reverse strand inline Energy reverse_pair(char ch1, char ch2) { if((ch1 == 'G' && ch2 == 'C') || (ch1 == 'C' && ch2 == 'G')) return GC; if((ch1 == 'T' && ch2 == 'A') || (ch1 == 'A' && ch2 == 'T')) return AU; if((ch1 == 'A' && ch2 == 'C') || (ch1 == 'C' && ch2 == 'A')) return GU; if(ch1 == PADDING_CHAR || ch2 == PADDING_CHAR) return 1000.0; return MM; } // calculate the loop penalty inline Energy loop_penalty(int len) { if(len > MAX_LOOP) return 1000.0; // if(len >= sizeof(LOOP_PENALTY)/sizeof(float)) return 1000.0; return LOOP_PENALTY[len]; } #endif transterm_hp_v2.09/2ndscore.cc0000664000265600020320000001622311514142021015571 0ustar tilleaadmin/* This file is part of TransTerm v2.0 BETA and is covered by the GNU GPL * License version 2.0. See file LICENSE.txt for more details. 
* * Author: Carl Kingsford, (c) 2005-2006 */ // 2ndscore will read in a fasta sequence and assign two secondary structure // scores to every position: the score of the best hairpin to the left the // score of the best hairpin on the complement-strand, anchored at the right #include #include #include #include #include #include #include #include #include "seq.h" #include "util.h" #include "ermolaeva-score.h" #include "search.h" bool no_gaps = false; // needed for legacy code bool print_seq = true; // print the hairpin sequence bool print_pos_strand = true; bool print_neg_strand = true; void usage() { cerr << "Usage: 2ndscore in.fasta" << endl; exit(3); } // print out the values for the essential options void print_options(ostream & out) { out << endl << "--gc=" << GC << " " << "--au=" << AU << " " << "--gu=" << GU << " " << "--mm=" << MM << " " << "--gap=" << GAP << endl; out << "--max-len=" << MAX_HP << " " << "--max-loop=" << MAX_LOOP << " " << "--min-loop=" << MIN_LOOP << endl; out << "--loop-penalty="; for(int i = MIN_LOOP; i <= MAX_LOOP; i++) { if(i != MIN_LOOP) out << ","; out << LOOP_PENALTY[i]; } out << endl << endl; } int process_options(int argc, char * argv[]) { const char * OPTIONS = "hS"; enum {GC_OPT, AU_OPT, GU_OPT, MM_OPT, GAP_OPT, MINSTEM_OPT, MINLOOP_OPT, NOPOS_OPT, NONEG_OPT, MAXLEN_OPT, MAXLOOP_OPT, LOOPPEN_OPT }; static struct option long_options[] = { {"gc", 1, 0, GC_OPT}, {"au", 1, 0, AU_OPT}, {"gu", 1, 0, GU_OPT}, {"mm", 1, 0, MM_OPT}, {"gap", 1, 0, GAP_OPT}, {"loop-penalty", 1, 0, LOOPPEN_OPT}, {"min-loop", 1, 0, MINLOOP_OPT}, {"max-len", 1, 0, MAXLEN_OPT}, {"max-loop", 1, 0, MAXLOOP_OPT}, {"no-fwd", 0, 0, NOPOS_OPT}, {"no-rvs", 0, 0, NONEG_OPT}, {"help", 0, 0, 'h'}, {0,0,0,0} }; int len = MAX_HP; int loop = MAX_LOOP; // opterr = 0; // don't print msg --- we'll do it int a; while((a=getopt_long(argc, argv, OPTIONS, long_options, 0)) != -1) { switch(a) { case 'h': usage(); break; // energy function options case GC_OPT: GC = atof(optarg); 
break; case AU_OPT: AU = atof(optarg); break; case GU_OPT: GU = atof(optarg); break; case MM_OPT: MM = atof(optarg); break; case GAP_OPT: GAP = atof(optarg); break; case LOOPPEN_OPT: set_loop_pen(optarg); break; // filtering options case MINSTEM_OPT: MIN_STEM = atoi(optarg); break; case MINLOOP_OPT: MIN_LOOP = atoi(optarg); break; case MAXLEN_OPT: len = atoi(optarg); break; case MAXLOOP_OPT: loop = atoi(optarg); break; case 'S': print_seq = false; break; case NOPOS_OPT: print_pos_strand = false; break; case NONEG_OPT: print_neg_strand = false; break; default: cerr << "Error: unknown option. " << endl; exit(3); } } if(len > REALLY_MAX_HP) { cerr << "Error: must search for hairpins with total length smaller than " << REALLY_MAX_HP << endl; cerr << "(recompile after changing REALLY_MAX_HP to increase)" << endl; exit(3); } if (loop >= len) { cerr << "Error: max-loop must be less than max-len" << endl; exit(3); } // set the global constants MAX_STEM, MAX_LOOP, MAX_HP set_max_len_loop(len, loop); if(MAX_STEM < MIN_STEM || MIN_STEM < 1) { cerr << "Error: min-stem must be <= max-stem and > 0" << endl; exit(3); } if(MAX_LOOP < MIN_LOOP) { cerr << "Error: max-loop must be >= min-loop" << endl; exit(3); } if(optind >= argc) usage(); return optind; } // run every_hairpin_energy() above going both forward and backward void every_hairpin_energy( Seq & seq, vector & fwd_strand, vector & rvs_strand) { every_hairpin_energy(seq, FORWARD, fwd_strand); every_hairpin_energy(seq, REVERSE, rvs_strand); } // output the list of scores to the stream given by out void print_hp_energies( ostream & out, const Seq & seq, vector & scores, Direction dir, int padding) { // format is header line starting with > followed by a description and the // last word on the header line will be FORWARD or REVERSE indicating to // which strand the scores apply out << ">" << seq.name << " " << seq.desc << (dir==FORWARD?" 
FORWARD":" REVERSE") << endl; // XXX: remember that scores.size() = 1 mroe than the actual size b/c the // zero entry is not used for(unsigned i = MAX_HP + 1; i < scores.size() - MAX_HP; i++) { // i is one based: assert(seq.dna[i-1] != PADDING_CHAR); //Energy hpe = (scores[i].stem_len == 0)?10.0:scores[i].hp_energy; int s = seqindex(seq, scores[i].start) - padding; int e = seqindex(seq, scores[i].end) - padding; if(s > 0 && e < (int)scores.size() - padding) { // print the energy (or None if no stem was found) if(scores[i].stem_len == 0) { out << setw(4) << "None" << " "; } else { out << setw(4) << setprecision(4) << scores[i].hp_energy << " "; } // print the coordinates out << setw(7) << s << " .. " << setw(7) << e << " "; // print the hairpin if(print_seq) print_term_seq(out, scores[i]); out << endl; } } } int main(int argc, char * argv[]) { cerr << "2ndscore (" << __DATE__ << ")" << endl; int first_file_index = process_options(argc, argv); print_options(cerr); // holds all the sequences read Genome dna; // read the all given fasta file into the dna variable for(int i = first_file_index; i < argc; i++) { string seq_filename = argv[first_file_index]; ifstream seq_file(seq_filename.c_str()); if(!seq_file) { cerr << "Error: couldn't read file: " << seq_filename << endl; exit(3); } read_seqs(seq_file, dna); } // for every sequence, output the scores for both the positive and // negative strands int i = 0; for(EVERY_CHROM(dna, C)) { vector terms; cerr << ++i << ". 
Seq: " << (*C)->name << " (length " << (*C)->length << ")"; pad_seq(**C, MAX_HP); if (print_pos_strand) { every_hairpin_energy(**C, FORWARD, terms); print_hp_energies(cout, **C, terms, FORWARD, MAX_HP); } if(print_neg_strand) { every_hairpin_energy(**C, REVERSE, terms); print_hp_energies(cout, **C, terms, REVERSE, MAX_HP); } cerr << endl; } } transterm_hp_v2.09/ermolaeva-score.cc0000664000265600020320000000500111514142021017126 0ustar tilleaadmin/* This file is part of TransTerm v2.0 BETA and is covered by the GNU GPL * License version 2.0. See file LICENSE.txt for more details. */ #include #include "transterm.h" #include "ermolaeva-score.h" #include "util.h" Energy MM_OPEN = 0.0; // this must be 0.0 (do not change) // the params from the v1.0 paper (they have not yet been updated) Energy GC = -2.3; Energy AU = -0.9; Energy GU = 1.3; Energy MM = 3.5; Energy GAP = 6.0; Energy LOOP_PENALTY[20]={1000,1000,1000,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17}; /* // mfold-derived parameters Energy GC = -2.1; Energy AU = -1.3; Energy GU = -0.8; //Energy MM = 0.8; Energy MM = 3.5; Energy GAP = 3.9; //Energy GAP = 6; Energy LOOP_PENALTY[20] = {1000,1000,1000, 4.10, 4.90, 4.40, 4.70, 5.00, 5.10, 5.20, 5.30, 5.40, 5.50, 5.60, 5.70, 5.80, 5.80, 5.90, 5.90, 6.00}; */ // format of str: f1,f2,f3,f4,f5 // f1 is the cost of a loop of length MIN_LOOP // theere are fewer terms than needed to get up to MAX_LOOP, then // the last term is repeated so "0,2" would give cost 0 to any loop // of length MIN_LOOP, and cost 2 to any larger loop. // extra terms are ignored void set_loop_pen(const string & str) { for(int i = 0; i < MIN_LOOP;i++) LOOP_PENALTY[i] = 1000; vector pen; split(str, ',', pen); for(int i = MIN_LOOP; i <= MAX_LOOP; i++) { int j = i - MIN_LOOP; LOOP_PENALTY[i] = atof(((unsigned(j) < pen.size())?pen[j]:pen.back()).c_str()); } } // score the hairpin in the terminator t Energy hairpin_score(HPScoreFcn score, const Term & t) { Energy e = loop_penalty(t.loop_len) + ((t.gap != 0) ? 
GAP : 0.0); for(int j = 0; j < t.stem_len; j++) { // adjust for the single gap if we pass it int left, right; left = (t.gap < 0 && j >= abs(t.gap)) ? 1 : 0; right = (t.gap > 0 && j >= abs(t.gap)) ? 1 : 0; e += score(*(t.left_stem_base() + j + left), *(t.right_stem_base() - j - right)); } return e; } // calculate the tail score (at the given end of the Term) Energy tail_score(const Term & ter, Direction dir) { int inc = 1; const char * start = ter.right_stem_base() + 1; char letter = 'T'; if(dir == REVERSE) { inc = -1; start = ter.left_stem_base() - 1; letter = 'A'; } Energy sum = 0.0, prev = 1.0; for(int i = 0; i < 15; i++) { prev *= ((*start == letter) ? 0.9 : 0.6); start += inc; sum += prev; } return -sum; } transterm_hp_v2.09/search.cc0000664000265600020320000006435411514142021015327 0ustar tilleaadmin/* This file is part of TransTerm v2.0 BETA and is covered by the GNU GPL * License version 2.0. See file LICENSE.txt for more details. */ #include #include #include #include #include #include #include "transterm.h" #include "seq.h" #include "ermolaeva-score.h" #include "util.h" int UWINDOW_SIZE = 6; int UWINDOW_REQUIRE = 3; int MIN_STEM = 4; int MIN_LOOP = 3; Energy ENERGY_CUTOFF = -2; Energy TAIL_CUTOFF = -2.5; // MAX_STEM is only used for the version 1 search scheme --- // it should not be changed or used in new code. 
int MAX_STEM = 22; int MAX_LOOP = 13; int MAX_HP = 2*MAX_STEM + MAX_LOOP + 2; void set_max_len_loop(int len, int loop) { MAX_LOOP = loop; MAX_HP = len; assert(MAX_HP < REALLY_MAX_HP); assert(MAX_LOOP < MAX_HP); } // return true if the hp represented by t is a candidate hp bool is_candidate_hp(HPScoreFcn score, const Term & t) { return score(*t.left_stem_base(), *t.right_stem_base()) < AU && score(*t.left_stem_top(), *t.right_stem_top()) < 0 && (score(*(t.left_stem_top() + 1), *(t.right_stem_top() - 1)) > 0 || t.loop_len == MIN_LOOP || t.loop_len == MIN_LOOP + 1); } // Check if there is at least three consecutive nucleotides of appropriate type // that letter among 7 characters after cp inline bool check_tail(char letter, SeqPtr cp) { return (cp[0] == letter && cp[1] == letter && cp[2] == letter) || (cp[1] == letter && cp[2] == letter && cp[3] == letter) || (cp[2] == letter && cp[3] == letter && cp[4] == letter) || (cp[3] == letter && cp[4] == letter && cp[5] == letter) || (cp[4] == letter && cp[5] == letter && cp[6] == letter); } // pair terminators that have the same coordinates going in the opposite // directions. (the 'partner' member points to the other member of the pair) void pair_bidirect(vector & in) { Term * prev = 0; for(EVERY_TERM(in, T)) { if(prev && (*T)->left() == prev->left() && (*T)->right() == prev->right() && (*T)->dir() != prev->dir()) { assert(!(*T)->partner && !prev->partner); (*T)->partner = prev; prev->partner = *T; prev = 0; } else { prev = *T; } } } struct by_ascending_rightend { bool operator()(const Term * t1, const Term * t2) { return t1->right() < t2->right(); } }; void insert_by_rightend(list & pq, Term * term) { list x; x.push_back(term); pq.merge(x, by_ascending_rightend()); } // precond: terminators are sorted by left endpoint void find_same_overlapping(vector & in) { typedef list TermPQ; TermPQ forward_queue, reverse_queue; for(EVERY_TERM(in, T)) { TermPQ * que = ((*T)->dir() == FORWARD) ? 
&forward_queue : &reverse_queue; while(!que->empty() && que->front()->right() <= (*T)->left()) { que->pop_front(); } copy(que->begin(), que->end(), back_inserter((*T)->overlapping)); for(TermPQ::iterator R = que->begin(); R != que->end(); ++R) { (*R)->overlapping.push_back(*T); } insert_by_rightend(*que, *T); } } // precond: terminators are sorted by left endpoint void find_opp_overlapping(vector & in) { typedef list TermPQ; TermPQ forward_queue, reverse_queue; TermPQ *mydirQ, *oppdirQ; // for every terminator in the list sorted by leftend point for(EVERY_TERM(in, T)) { // use direction to decide which queues are which if((*T)->dir() == FORWARD) { mydirQ = &forward_queue; oppdirQ = &reverse_queue; } else { mydirQ = &reverse_queue; oppdirQ = &forward_queue; } // save me in my direction's queue and the global queue insert_by_rightend(*mydirQ, *T); // remove all the guys that we've passed while(!oppdirQ->empty() && oppdirQ->front()->right() <= (*T)->left()) { oppdirQ->pop_front(); } // everyone still in the queue starts before me and ends after my // left end, and thus overlaps me copy(oppdirQ->begin(), oppdirQ->end(), back_inserter((*T)->opp_overlapping)); // everyone that overlaps me, I overlap for(TermPQ::iterator R = oppdirQ->begin(); R != oppdirQ->end(); ++R) { (*R)->opp_overlapping.push_back(*T); } } // this is just a big assertion statement --- it checks that what we // tagged as overlapping above really does for(EVERY_TERM_CONST(in, T)) { for(list::const_iterator R = (*T)->opp_overlapping.begin(); R != (*T)->opp_overlapping.end(); ++R) { // all the opposite overlapping genes must be on the other strand assert((*T)->dir() != (*R)->dir()); // the terminators must overlap assert(hp_overlap(**T, **R)); } } } // handles the dynamic programmign table in a memory efficient way because we // are looking for hairpins of bounded size (= 0; i--) { int ii = idx(i); int ip1 = idx(i+1); if(i+MIN_LOOP-1 < MAX_HP) { int iml = idx(i+MIN_LOOP-1); stbl_idx(ii, iml) = 
loop_penalty(MIN_LOOP); rtbl_idx(ii, iml) = LOOP; } if(i+MIN_LOOP < MAX_HP) { int iml = idx(i+MIN_LOOP); stbl_idx(ii, iml) = loop_penalty(MIN_LOOP+1); rtbl_idx(ii, iml) = LOOP; } for(int j = i + MIN_LOOP+1; j < MAX_HP; j++) { int jj = idx(j); int jm1 = idx(j-1); Reason & rr = rtbl_idx(ii,jj); float & ss = stbl_idx(ii,jj); float y; rr = LOOP; ss = loop_penalty(j-i+1); Energy sij = score(i,j); bool mm_open = sij >= MM && rtbl_idx(ip1, jm1)!=MISMATCH; if((y = sij + stbl_idx(ip1,jm1) + (mm_open?MM_OPEN:0.0)) <= ss) { rr = (sij= 2*MIN_STEM + MIN_LOOP - 1; j--) { // first clause requires at least a weak pairing for first base pair in // the stem if(//dp.score(0,j) < MM && // MM was AU in transterm 1.0 (j == MAX_HP-1 || dp.stbl(0,j) < hpe)) { hpe = dp.stbl(0, j); best_j = j; } } return hpe; } // follow the traceback arrows to get the best terminator with endpoints [cp, // best_j]. Term make_best_term( const Seq & seq, Direction dir, const HPDPTable & dp, int best_j, Energy hpe) { int i,j; i = 0; j = best_j; list gaps; while(dp.rtbl(i, j) != HPDPTable::LOOP) { switch(dp.rtbl(i, j)) { case HPDPTable::LOOP: break; case HPDPTable::I_GAP: i++; gaps.push_back(j); break; case HPDPTable::J_GAP: j--; gaps.push_back(-i); break; case HPDPTable::MATCH: // fall through case HPDPTable::MISMATCH: i++; j--; break; default: cerr << "No value for: " << i << " " << j << endl; assert(false); } } /* char tail_proximal = *ptr_for(i); char tail_distal = *ptr_for(j); if (dp.rtbl(0, best_j) == HPDPTable::I_GAP || dp.rtbl(0, best_j) == HPDPTable::J_GAP || (dir == FORWARD && tail_proximal != ' */ return Term(&seq, dir, dp.ptr_for(best_j), dp.ptr_for(j+1), dp.ptr_for(i-1), dp.ptr_for(0), gaps, hpe); } // look at last added term, if this one dominates, keep the one with the // better tail score. 
Because of intervening overlaping (but non-dominating) // terminators, dominating terms may appear in output void add_greedy_nodominating(const Term & t, vector & terms) { // if last term is inside t, keep the one with the best tail score. if(!terms.empty() && ((t.dir() == FORWARD && terms.back()->left() >= t.left()) || (t.dir() == REVERSE && terms.back()->right() <= t.right()))) { if(t.tail_energy < terms.back()->tail_energy) { *terms.back() = t; } } else { terms.push_back(new Term(t)); } } // keep all overlapping terminators and let the confidence sort things out void add_all_terminators(const Term & t, vector & terms) { // make sure that we extend the tail as far as possible // if((dir == FORWARD && first_pair(t) != "AT") || // (dir == REVERSE && first_pair(t) != "TA")) // { terms.push_back(new Term(t)); // } } // search all previous terms, if this one dominates any, keep the one with the // best tail score. If nooverlaps == true, then we remove all /overlapping/ // (not just dominating) --- this may not output the /best/ set of // non-overlapping hp, however void add_nodominating( const Term & t, vector & terms, bool nooverlaps = false ) { // among the terminators in a domination chain involving t, find the one // with the best tail score (may be t) const Term * best = &t; for(vector::reverse_iterator T = terms.rbegin(); T != terms.rend(); ++T) { if(!hp_overlap(t, **T)) break; if(nooverlaps || dominates(t, **T)) { best = (t.tail_energy <= (*T)->tail_energy) ? &t : *T; } } // if the best one is t, then we have to remove the ones that it // dominates. Otherwise, the best one is already in the list and t is // redundant. 
if(best == &t) { // (note: reverse_iterator was segfaulting for no apparent reason // when we erase() --- thus, we count a forward iterator backward) for(vector::iterator T = terms.end(); T != terms.begin();) { T--; if(!hp_overlap(t, **T)) break; // invalidates T and those T we've already checked if(nooverlaps || dominates(t, **T)) terms.erase(T); } terms.push_back(new Term(t)); } } // return true if there's a run of >= 5 A within 3 of the left side and >= 5 T within 3 of the // right side and the terminators stem is >= 10 bases bool has_bad_tails(const Term & t) { // 01234567 // ...AAAAA // ..AAAAA. // .AAAAA.. // AAAAA... int stem_len = min(t.left_stem_top() - t.left_stem_base(), t.right_stem_base() - t.right_stem_top()); const int REQUIRE_AT = 5; const int LEADING = stem_len / 2 - REQUIRE_AT; // if both stems are at least 10 bases if (stem_len > 12) { // check to see if there is a stretch of As on the left int A = 0; for(SeqPtr cp = t.left_stem_base(); cp < t.left_stem_base() + LEADING + REQUIRE_AT && cp <= t.left_stem_top(); ++cp) { A = (*cp == 'A') ? A+1 : 0; if(A >= REQUIRE_AT) break; } if (A < REQUIRE_AT) return false; // check to see if there is a stretch of Ts on the right int T = 0; for(SeqPtr cp = t.right_stem_base(); cp > t.right_stem_base() - LEADING - REQUIRE_AT && cp >= t.right_stem_top(); --cp) { T = (*cp == 'T') ? T+1 : 0; if(T >= REQUIRE_AT) break; } if (T < REQUIRE_AT) return false; return true; } return false; } // we add t (with scores (H,T)) if there are no terminators that completely dominate // t, where a terminator A completely dominates B if A hairpin is a supersequence of // Bs and H(B) > H(A) and T(B) > T(A). 
void add_non_completely_dominating( const Term & t, vector & terms) { list remove_these; // we assume we're going to keep t bool add_t = true; //if(abs(seqindex(*t.seq, t.right()) - 253468) < 1000) cerr << "XX: " << t << " "; // walk backwards untill we no longer overlap t for(vector::reverse_iterator T = terms.rbegin(); T != terms.rend(); ++T) { if(!hp_overlap(t, **T)) break; // if t dominates one of the prevous ones: if(dominates(t, **T)) { // if there is already something there that t dominates and t looks like // it might be a bad extension, don't add it if(has_bad_tails(t)) return; //if(abs(seqindex(*t.seq, t.right()) - 253468) < 1000) cerr << "good tails "; // if new terminator t dominates this previous terminator and uniformly // has better hp and tail scores: then there is no reason to keep the old one if(t.tail_energy <= (*T)->tail_energy && t.hp_energy <= (*T)->hp_energy) { remove_these.push_back(*T); } // convesely, if the old one is better uniformly better, then there's no // need to keep this new one if(remove_these.empty() && t.tail_energy > (*T)->tail_energy && t.hp_energy > (*T)->hp_energy) { add_t = false; } } } //if(abs(seqindex(*t.seq, t.right()) - 253468) < 1000) cerr << add_t << endl; // below depends on the ordering of the terms in the sequences. In // remove_these they are ordered in REVERSE order so that the first // terminator is the lastest one in terms. 
// if we have to remove some things that t completely domianates if(!remove_these.empty()) { // (note: reverse_iterator was segfaulting for no apparent reason // when we erase() --- thus, we count a forward iterator backward) for(vector::iterator T = terms.end(); T != terms.begin();) { T--; if(!hp_overlap(t, **T)) break; if(*T == remove_these.front()) { // invalidates T and those T we've already checked terms.erase(T); remove_these.pop_front(); } } } if(add_t) terms.push_back(new Term(t)); } // return the number of letter in between cp and last (inclusive) int count_letters(char letter, SeqPtr cp, SeqPtr last) { SeqPtr end = max(cp, last); int count = 0; for(SeqPtr start = min(cp, last); start != end; start++) { if(*start == letter) count++; } if(*end == letter) count++; return count; } // find the best hairpin A overlapping the given terminator on the left for // FORWARD terminators, or the right for REVERSE terminators // ----------.....--------- Term // --------------------- Antiterm // // return true if anything found bool best_overlapping_hairpin( const Term & term, Term & best_term, int & best_overlap) { Direction dir = term.dir(); SeqPtr stem_end, stem_start; if(dir == FORWARD) { stem_end = term.left_stem_top()+1; stem_start = term.left_stem_base(); } else { stem_end = term.right_stem_top()-1; stem_start = term.right_stem_base(); } SeqPtr cp = stem_start - dir * MAX_HP + 2; int best_j = 0, j; Energy best_hpe = 1000, hpe; HPDPTable dp(cp, dir); for(; cp != stem_end; cp += dir) { // cp is a valid starting point for an antiterm if((dir == FORWARD && cp >= stem_start) || (dir == REVERSE && cp <= stem_start)) { dp.update(); if((hpe = find_best_hp(dp, j)) < best_hpe) { best_j = j; best_hpe = hpe; best_overlap = abs(stem_start - cp) + 1; best_term = make_best_term(*term.seq, dir, dp, best_j, hpe); } } dp.rotate(); } return best_j > 0; } // search for terminators along the 'dir' strand using an efficient dynamic // programming algorithm. 
puts the terms into seq.terms. void find_terms_dp(Seq & seq, Direction dir) { Energy hpe; int best_j; SeqPtr cp, end_cp; int tailoffset = 0; char letter = 'T'; if(dir == FORWARD) { cp = seq.left() + MAX_HP; end_cp = seq.right() - 15; tailoffset = 1; letter = 'T'; } else { cp = seq.right() - MAX_HP; end_cp = seq.left() + 15; tailoffset = -7; letter = 'A'; } int in_window = count_letters(letter, cp+1*dir, cp+UWINDOW_SIZE*dir); HPDPTable dp(cp, dir); // for every character that has a plausible tail for (; cp != end_cp; cp += dir) { if(in_window >= UWINDOW_REQUIRE && *cp != letter) //check_tail(letter, cp + tailoffset)) { dp.update(); if((hpe = find_best_hp(dp, best_j)) < ENERGY_CUTOFF) { Term t = make_best_term(seq, dir, dp, best_j, hpe); //t.hp_energy /= t.stem_len; // uncomment to use per-base energies // filter "hairpins" with too small a stem if(t.stem_len >= MIN_STEM) { t.tail_energy = tail_score(t, dir); if(t.tail_energy < TAIL_CUTOFF) { //add_greedy_nodominating(t, seq.terms); //add_all_terminators(t, seq.terms); add_non_completely_dominating(t, seq.terms); } } } } dp.rotate(); // update the sliding window of size 6 if(*(cp+1*dir) == letter) in_window--; if(*(cp+(UWINDOW_SIZE+1)*dir) == letter) in_window++; } } // find the terminators using a efficient dynamic programmign algorithm void find_terms_dp(Seq & seq) { find_terms_dp(seq, FORWARD); find_terms_dp(seq, REVERSE); sort(seq.terms.begin(), seq.terms.end(), region_isleftof); pair_bidirect(seq.terms); find_opp_overlapping(seq.terms); find_same_overlapping(seq.terms); } //============================================================ // 2ndscore.cc code: used to output hp energies for every posn //============================================================ // return a vector scores[] of length = len(seq) where scores[i] == the best // hairpin score for a hairpin anchored at position i. (the positions are // 1-based and entry 0 is not used.) 
void every_hairpin_energy(Seq & seq, Direction dir, vector & scores) { // compute the start and end of the ranges SeqPtr cp, end_cp; if(dir == FORWARD) { cp = seq.left() + MAX_HP; end_cp = seq.right() - MAX_HP; } else { cp = seq.right() - MAX_HP; end_cp = seq.left() + MAX_HP; } // this variable stores the complete dynamic programming table HPDPTable dp(cp, dir); // we assume all energies are < MAX_ENERGY (= 10000) [XXX: this assignment // is needed only b/c we don't yet compute the scores for the very start // and ends of the sequences] scores.resize(seq.length+1); for(; cp != end_cp; cp += dir) { int best_j; // not used, but must be passed to find_best_hp() dp.update(); Energy hpe = find_best_hp(dp, best_j); Term t = make_best_term(seq, dir, dp, best_j, hpe); t.tail_energy = 0.0; scores[seqindex(seq, cp)] = t; dp.rotate(); } } //============================================================ // Version 1.0 Search Scheme //============================================================ // find the terminators going in a particular direction. We split the directions // so that we can bail out of the loop early to save time void find_terms_ermolaeva(Seq & seq, Direction dir) { // Ermolaeva, et al algorithm: brute force. 
for every position, stem // length, gap position, and loop length, check both strands to see if we // get a terminator with good score HPScoreFcn score; int tailoffset; char letter; int start_buf, end_buf; if(dir == FORWARD) { score = &forward_pair; tailoffset = 1; letter = 'T'; start_buf = 2*MAX_STEM + MAX_LOOP + 15 + ((no_gaps)?0:1); end_buf = 15; } else { score = &reverse_pair; tailoffset = -7; letter = 'A'; start_buf = 15; end_buf = 2*MAX_STEM + MAX_LOOP + 15 + ((no_gaps)?0:1); } Term last_term; bool have_last_term = false; for (SeqPtr cp = seq.left() + start_buf; cp < seq.right()-end_buf; cp++) { if(!check_tail(letter, cp + tailoffset)) continue; for(int stem_len = MAX_STEM; stem_len >= MIN_STEM; stem_len--) { // gap is a location of the gap in the stem (maximum one gap per // hairpin is allowed). If gap = 0 than there is no gaps in athe // stem. If gap < 0 than the gap is in the right side of the stem // (so a nucleotide that don't have pair is in the left side. If // gap > 0 than the gap is located in the left side. The absolute // value of the gap is a number of nucleotides located in the stem // before the gap (counting from the end opposite to loop) int min_gap = 0, max_gap = 0; if(!no_gaps) { min_gap = -stem_len+2; max_gap = stem_len-2; } for(int loop_len = MAX_LOOP; loop_len >= MIN_LOOP; loop_len--) { for(int gap = min_gap; gap <= max_gap; gap++) { Term term(&seq, dir, cp, stem_len, loop_len, gap); if(term.left_stem_base() < seq.left()) continue; if(!is_candidate_hp(score, term)) continue; term.hp_energy = hairpin_score(score, term); if(term.hp_energy < ENERGY_CUTOFF) { term.tail_energy = tail_score(term, dir); // these rules for printing are a little strange --- // but they duplicate what Ermolaeva et al. 
use for // their paper if(have_last_term && !hp_overlap(term, last_term)) { if(last_term.tail_energy < TAIL_CUTOFF) { seq.terms.push_back(new Term(last_term)); } last_term = term; have_last_term = true; } else if(!have_last_term || term.hp_energy < last_term.hp_energy) { last_term = term; have_last_term = true; } } } } } } if(last_term.tail_energy < TAIL_CUTOFF) { seq.terms.push_back(new Term(last_term)); } } // find the terminators. will place the term objects into the sequence's // term vector. The list of terms will be softed by their right() endpoint // and their partners will be identified. void find_terms_ermolaeva(Seq & seq) { find_terms_ermolaeva(seq, REVERSE); find_terms_ermolaeva(seq, FORWARD); sort(seq.terms.begin(), seq.terms.end(), region_isleftof); pair_bidirect(seq.terms); } transterm_hp_v2.09/analysis.cc0000664000265600020320000003116111514142021015673 0ustar tilleaadmin/* This file is part of TransTerm v2.0 BETA and is covered by the GNU GPL * License version 2.0. See file LICENSE.txt for more details. 
*/ #include #include #include #include #include #include #include "map-output.h" #include "conf.h" #include "seq.h" #include "util.h" #include "ermolaeva-score.h" #include "transterm.h" class T2THits : public EventResponder { public: T2THits(int ms, Confidence & conf) : _min_gene_span(ms), _conf(conf), _conf_histo(101), _term_histo(101), _comp_histo(101) { init0(); _t2t_count = 0; _good_term_count = 0; } virtual ~T2THits() {} void start(const Seq & seq, Direction dir) { init0(); _dir = dir; } void init0() { _best_conf = 0; _good_region = false; _sense_gene_span = 0; } void enter_intergene(RegionType r, Direction d, const Event & e) { EventResponder::enter_intergene(r, d, e); bool gr = r == TAIL2TAIL && _sense_gene_span >= _min_gene_span; if(gr && !_good_region) _t2t_count++; _good_region = gr; _best_conf = 0; _region_hit = false; } void leave_intergene(RegionType r, Direction d, const Event & e) { EventResponder::leave_intergene(r, d, e); if(r == TAIL2TAIL && _good_region) { if(_region_hit) { assert(_best_conf <= 100); _conf_histo[_best_conf]++; } _good_region = false; } } void leave_gene(const Event & e) { EventResponder::leave_gene(e); if((_dir == FORWARD && e.kind == Event::ForwardGeneEnd) || (_dir == REVERSE && e.kind == Event::ReverseGeneEnd)) { _sense_gene_span++; } else { _sense_gene_span = 0; } } void terminator(const Term * term) { EventResponder::terminator(term); if(_good_region && term->dir() != _dir) { int c = er_confidence(*this, _conf, *term); _best_conf = max(c, _best_conf); _good_term_count++; _region_hit = true; _term_histo[c]++; } } const vector & term_histo() const { return _term_histo; } const vector & histo() const { return _conf_histo; } const vector & comp_histo() const { return _comp_histo; } virtual int t2tregion_count() const { return _t2t_count; } int good_terms() const { return _good_term_count; } protected: Direction _dir; int _min_gene_span; Confidence & _conf; int _sense_gene_span, _best_conf; bool _good_region; bool _region_hit; int 
_t2t_count; int _good_term_count; vector _conf_histo; vector _term_histo; vector _comp_histo; }; // return true if (l1,r1) and (l2,r2) intersect bool interval_intersect(SeqPtr l1, SeqPtr r1, SeqPtr l2, SeqPtr r2) { return (l1> l2 && l1< r2) || (r1> l2 && r1< r2) || (l1< l2 && r1> r2); } bool by_left_side(const Region * a, const Region * b) { return a->left() < b->left(); } // from a small list of terminators, remove those that dominate something // in the list void remove_dominating(vector & terms) { vector out; bool keep_T; for(vector::iterator T = terms.begin(); T != terms.end(); ++T) { // check every term (!= T) to see if T dominates it keep_T = true; for(vector::iterator R = terms.begin(); R != terms.end(); ++R) { if (*R == *T) continue; // if R is dominated by T, we want to keep R (at this stage) and // not T --- the rationale is that since these all have the same // confidence, R probably includes less of the tail pairing if (dominates(**T, **R)) keep_T = false; } if(keep_T) out.push_back(*T); } // copy the new list into the old list terms = out; } /* When scanning in reverse, we're looking for terminators for FORWARD directed genes; when scanning forward, we're looking for terminators for REVERSE directed genes. We scan in the opposite direction so that we can count the # of genes facing the gene of interest. 
*/ class DeHoonT2THits : public T2THits { public: DeHoonT2THits(int ms, Confidence & conf, int rs, ostream & out) : T2THits(ms, conf), _region_size(rs), _out(out), _t2t_region_count(0) { // init list } virtual ~DeHoonT2THits() {} int t2tregion_count() const { return _t2t_region_count; } void start(const Seq & seq, Direction dir) { T2THits::start(seq, dir); _terms.clear(); } void terminator(const Term * term) { // intentailly bypass parent class EventResponder::terminator(term); if(term->dir() != _dir) _terms.push_back(term); } void event(const Event & e) { /* when scanning forward, e.g., looking for terminators for rvs genes, we don't look past starts of genes on the same strand: ----> *|*---> */ EventResponder::event(e); if((_dir == FORWARD && e.kind == Event::ReverseGeneStart) || (_dir == REVERSE && e.kind == Event::ForwardGeneStart)) { _terms.clear(); } } void leave_intergene(RegionType r, Direction d, const Event & e) { if(r == TAIL2TAIL && _good_region) { _t2t_region_count++; // copy the terms that are within the range vector regterms; for(int i = _terms.size()-1; i >= 0; i--) { if((_dir == FORWARD && _terms[i]->right() < e.place - _region_size) || (_dir == REVERSE && _terms[i]->left() > e.place + _region_size)) break; regterms.push_back(_terms[i]); } _good_term_count += regterms.size(); int best_conf = 0, comp_best = 0; // const Term * best = 0; // list of all the terminators that have the best confidence (handles ties) vector best_list; sort(regterms.begin(), regterms.end(), by_left_side); SeqPtr left = 0, right = 0; for(vector::const_iterator T = regterms.begin(); T != regterms.end(); ++T) { // track the best overall int c = er_confidence(*this, _conf, **T); if(c >= best_conf || best_list.empty()) { if(c > best_conf) { best_list.clear(); } best_list.push_back(*T); // best = *T; best_conf = c; } // track the histogram of confidences _term_histo[c]++; // track the best confidence in the current component // note: will be false when left == right == 0 
if(interval_intersect((*T)->left(), (*T)->right(), left, right)) { // we extend the current component right = max((*T)->right(), right); comp_best = max(comp_best, c); } else { if(left != 0 && right != 0) _comp_histo[comp_best]++; // starting a new component left = (*T)->left(); right = (*T)->right(); comp_best = c; } } if(left != 0 && right != 0) _comp_histo[comp_best]++; if(!regterms.empty()) { assert(best_conf <= 100); _conf_histo[best_conf]++; } // given the choice between A and B, if A domaininates B, we // output *B* (since they all have the same confidnece, by // definiation) remove_dominating(best_list); // output all the terminators with the best confidence (each on its own line) for(vector::const_iterator T = best_list.begin(); T != best_list.end(); ++T) { _out << setw(10) << e.reg->name << " " << setw(3) << regterms.size(); //if(!regterms.empty()) //{ SeqPtr tplace = (_dir==REVERSE)?(*T)->left():(*T)->right(); SeqPtr gene_end = (_dir==REVERSE)?e.reg->right():e.reg->left(); _out << " " << setw(3) << best_conf << " " << setw(3) << (gene_end - tplace)*_dir << " " << **T; //} _out << endl; } if(best_list.empty()) { _out << setw(10) << e.reg->name << " " << setw(3) << regterms.size() << " NONE" << endl; } _good_region = false; EventResponder::leave_intergene(r, d, e); } } private: vector _terms; int _region_size; ostream & _out; int _t2t_region_count; }; void t2t_hitanal( ostream & out, const Genome & g, Confidence & conf, int min_span, bool show_all) { DeHoonT2THits hits(min_span, conf, 500 + gene_start_cut, out); for(EVERY_CHROM_CONST(g, C)) { scan_events(**C, hits, gene_start_cut, gene_end_cut); reverse_scan_events(**C, hits, gene_start_cut, gene_end_cut); } int total, hit = 0, numterms = 0, numcomp = 0; total = hits.t2tregion_count(); out << endl << "SUMMARY" << endl << endl; out << total << " putative operon ends contain " << hits.good_terms() << " possible terminators." 
<< endl; out << "Percent operon ends hit by terminators of confidence >= x:" << endl; out << "\t" << "x" << "\t" << "#>=x" << "\t" << "hits>=x" << "\t" << "%hits>=x" << endl; for(int i = 100; i >= 0; i--) { hit += hits.histo()[i]; numterms += hits.term_histo()[i]; numcomp += hits.comp_histo()[i]; if(i % 10 == 0 || show_all) { out << "\t" << i << "\t" << numterms << "\t" << numcomp << "\t" << hit << "\t" << int(float(hit)/total*100+0.5) << ((i==100)?" %":"") << endl; } } out << endl; } class Tail2TailScores : public EventResponder { public: Tail2TailScores(Confidence & conf) : ttscores(102), allscores(102), _t2t(-1), _conf(conf) {} void start(const Seq & seq, Direction dir) { _t2t = -1; } void terminator(const Term * term) { int c = er_confidence(*this, _conf, *term); allscores[c+1]++; if(in_t2t()) { _t2t = max(_t2t, int(c)); } } void leave_intergene(RegionType r, Direction d, const Event & e) { EventResponder::leave_intergene(r, d, e); if(r == TAIL2TAIL) { ttscores[_t2t+1]++; _t2t = -1; } } vector ttscores; vector allscores; private: int _t2t; Confidence & _conf; }; // write the data for a plot to out. For every confidnece, count the # of // terms with confidence >= x, and the number of Tail-to-tail regions hit by // those terms. 
void plot_tthits_vs_terms(ostream & out, Confidence & conf, Genome & g) { Tail2TailScores tts(conf); for(EVERY_CHROM_CONST(g, C)) scan_events(**C, tts, gene_start_cut, gene_end_cut); unsigned long allsum = 0, ttsum = 0; for(int i=101; i>=0; i--) { allsum += tts.allscores[i]; ttsum += tts.ttscores[i]; out << allsum << " " << ttsum << " " << i-1 << endl; } } int count_starts_in_genes(const Seq & seq, Direction dir) { int num_starts = 0; for(EVERY_REGION_CONST(seq.genes, G)) { int in_window = 0; char letter = ((*G)->dir() == dir)?'T':'A'; SeqPtr s; int i; for(i = 0, s = (*G)->left(); s<= (*G)->right() && i < UWINDOW_SIZE; s++, i++) { if(*s == letter) in_window++; } SeqPtr leaving = (*G)->left(); for(; s <= (*G)->right(); s++, leaving++) { if(in_window >= UWINDOW_REQUIRE) num_starts++; if(*s == letter) in_window++; if(*leaving == letter) in_window--; } } return num_starts; } transterm_hp_v2.09/transterm.h0000664000265600020320000000056611514142021015736 0ustar tilleaadmin/* This file is part of TransTerm v2.0 BETA and is covered by the GNU GPL * License version 2.0. See file LICENSE.txt for more details. 
*/ #ifndef TRANSTERM_H #define TRANSTERM_H #include #include #include #include using namespace std; // global options extern bool no_gaps; extern int gene_start_cut; extern int gene_end_cut; #endif transterm_hp_v2.09/Makefile0000664000265600020320000000460011514142021015177 0ustar tilleaadmin# $Id: Makefile,v 1.10 2006/05/27 21:14:03 carlk Exp carlk $ # Makefile for TransTermHP # Usage: "make transterm" to make the main program # "make clean" removes all results of the compilation # "make no_obj" removes all the .o files VER=2.09 # files used for the main transterm program OBJ = seq.o \ distr.o \ util.o \ map-output.o \ gene-reader.o \ search.o \ conf.o \ ermolaeva-score.o \ ermolaeva-oldconf.o \ analysis.o \ anti.o \ transterm.o # files used for the 2ndscore program SECONDARY_OBJ = seq.o \ search.o \ ermolaeva-score.o \ util.o \ 2ndscore.o # files to include in the distribution (.cc and .h are included automatically) OTHERFILES = USAGE.txt \ expterm.dat \ LICENSE.txt \ RELEASE-NOTES.txt \ calibrate.sh \ random_fasta.py \ make_expterm.py \ mfold_rna.sh \ Makefile # These options are for GCC/G++ --- you may have to change them if you # use a different compiler CXXFLAGS = -g -O3 -Wall -pedantic all: transterm 2ndscore transterm: $(OBJ) $(CXX) $(CXXFLAGS) -o $@ $^ 2ndscore: $(SECONDARY_OBJ) $(CXX) $(CXXFLAGS) -o $@ $^ clean: no_obj rm -f transterm 2ndscore core.* no_obj: rm -f *.o distribution: mkdir -p transterm_hp_v$(VER) cp $(OBJ:.o=.cc) $(SECONDARY_OBJ:.o=.cc) *.h $(OTHERFILES) transterm_hp_v$(VER) make -C transterm_hp_v$(VER) all no_obj test: time ./transterm -r expterm.dat -c 0 ../test/*.{fna,ptt} > ../test/tmp.tt -diff ../test/tmp.tt ../test/correct.tt # DO NOT DELETE 2ndscore.o: seq.h util.h ermolaeva-score.h 2ndscore.o: search.h analysis.o: map-output.h seq.h conf.h distr.h util.h ermolaeva-score.h analysis.o: transterm.h anti.o: conf.h distr.h seq.h transterm.h conf.o: transterm.h seq.h distr.h conf.h distr.o: distr.h ermolaeva-oldconf.o: seq.h conf.h 
distr.h util.h ermolaeva-score.o: transterm.h ermolaeva-score.h seq.h gene-reader.o: gene-reader.h seq.h util.h map-output.o: map-output.h seq.h conf.h distr.h util.h transterm.h search.o: transterm.h seq.h ermolaeva-score.h util.h seq.o: seq.h util.h transterm.o: transterm.h seq.h util.h map-output.h transterm.o: conf.h distr.h gene-reader.h analysis.h ermolaeva-score.h util.o: util.h conf.o: distr.h seq.h ermolaeva-score.o: seq.h gene-reader.o: seq.h map-output.o: seq.h conf.h distr.h search.o: seq.h transterm_hp_v2.09/transterm.cc0000664000265600020320000002463411514142021016076 0ustar tilleaadmin/* This file is part of TransTermHP v2.0 BETA and is covered by the GNU GPL * License version 2.0. See file LICENSE.txt for more details. * * Author: Carl Kingsford, (c) 2005-2006 */ #include #include #include #include #include #include #include #include #include "transterm.h" #include "seq.h" #include "util.h" #include "map-output.h" #include "gene-reader.h" #include "analysis.h" #include "ermolaeva-score.h" #include "search.h" void find_terms_dp(Seq & ); void output_anti_terms(ostream &, const Genome &, Confidence &, int); // OPTIONS bool no_gaps = false; int conf_cutoff = 76; bool print_seq = true; bool show_all_t2t_roc = false; bool show_gaps = false; bool only_good_context = true; string t2tperf_file = ""; string tthitfile = ""; string bag_file = ""; string antifile = ""; Confidence * conf; int gene_start_cut = 0; int gene_end_cut = 25; // output the usage information & exit void usage(void) { cerr << "usage: transterm [options] *.fasta *.coords" << endl; cerr << "See the USAGE.txt file for available options" << endl; exit(3); } // print out the values for the essential options void print_options(ostream & out) { out << endl << "--gc=" << GC << " " << "--au=" << AU << " " << "--gu=" << GU << " " << "--mm=" << MM << " " << "--gap=" << GAP << endl; // out << "--max-stem=" << MAX_STEM << " " out << "--max-len=" << MAX_HP << " " << "--min-stem=" << MIN_STEM << " " << 
"--max-loop=" << MAX_LOOP << " " << "--min-loop=" << MIN_LOOP << endl; out << "--uwin-length=" << UWINDOW_SIZE << " " << "--uwin-require=" << UWINDOW_REQUIRE << " " << "--max-hp-score=" << ENERGY_CUTOFF << " " << "--max-tail-score=" << TAIL_CUTOFF << endl; out << "--loop-penalty="; for(int i = MIN_LOOP; i <= MAX_LOOP; i++) { if(i != MIN_LOOP) out << ","; out << LOOP_PENALTY[i]; } out << endl << "--start-cut=" << gene_start_cut << " --end-cut=" << gene_end_cut; out << endl << endl; } // parse the command line int process_options(int argc, char * argv[]) { const char * OPTIONS = "hc:Sr:p:"; enum {GC_OPT, AU_OPT, GU_OPT, MM_OPT, GAP_OPT, MINSTEM_OPT, MINLOOP_OPT, LOOPPEN_OPT, UWINLEN_OPT, UWINREQ_OPT, OVERLAP_OPT, MAXHP_OPT, MAXTAIL_OPT, V1CONF_OPT, RANDCONF_OPT, BAGOUTPUT_OPT, T2TPERF_OPT, ANTITERMS_OPT, SHOW_ALL_T2T_ROC_OPT, SHOW_GAPS_OPT, STARTCUT_OPT, ENDCUT_OPT, MAXLEN_OPT, MAXLOOP_OPT, ALLCONTEXT_OPT, OLDRANDCONF_OPT}; static struct option long_options[] = { {"gc", 1, 0, GC_OPT}, {"au", 1, 0, AU_OPT}, {"gu", 1, 0, GU_OPT}, {"mm", 1, 0, MM_OPT}, {"gap", 1, 0, GAP_OPT}, {"min-stem", 1, 0, MINSTEM_OPT}, {"min-loop", 1, 0, MINLOOP_OPT}, {"max-loop", 1, 0, MAXLOOP_OPT}, {"loop-penalty", 1, 0, LOOPPEN_OPT}, {"uwin-size", 1, 0, UWINLEN_OPT}, {"uwin-require", 1, 0, UWINREQ_OPT}, {"overlap", 1, 0, OVERLAP_OPT}, // NYI {"max-hp-score", 1, 0, MAXHP_OPT}, {"max-tail-score", 1, 0, MAXTAIL_OPT}, {"max-len", 1, 0, MAXLEN_OPT}, {"start-cut", 1, 0, STARTCUT_OPT}, {"end-cut", 1, 0, ENDCUT_OPT}, {"v1-conf", 0, 0, V1CONF_OPT}, {"old-rand-conf", 1, 0, OLDRANDCONF_OPT}, {"rand-conf", 1, 0, 'r'}, {"pval-conf", 1, 0, 'p'}, {"t2t-perf", 1, 0, T2TPERF_OPT}, {"full-t2t-roc", 0, 0, SHOW_ALL_T2T_ROC_OPT}, {"show-gaps", 0, 0, SHOW_GAPS_OPT}, {"antiterms", 1, 0, ANTITERMS_OPT}, {"all-context", 0, 0, ALLCONTEXT_OPT}, {"help", 0, 0, 'h'}, {"min-conf", 1, 0, 'c'}, {"bag-output", 1, 0, BAGOUTPUT_OPT}, {0,0,0,0} }; int len = MAX_HP; int loop = MAX_LOOP; // opterr = 0; // don't print msg --- we'll 
do it int a; while((a=getopt_long(argc, argv, OPTIONS, long_options, 0)) != -1) { switch(a) { case 'h': usage(); break; // energy function options case GC_OPT: GC = atof(optarg); break; case AU_OPT: AU = atof(optarg); break; case GU_OPT: GU = atof(optarg); break; case MM_OPT: MM = atof(optarg); break; case GAP_OPT: GAP = atof(optarg); break; case LOOPPEN_OPT: set_loop_pen(optarg); break; // filtering options case MINSTEM_OPT: MIN_STEM = atoi(optarg); break; case MINLOOP_OPT: MIN_LOOP = atoi(optarg); break; case UWINLEN_OPT: UWINDOW_SIZE = atoi(optarg); break; case UWINREQ_OPT: UWINDOW_REQUIRE = atoi(optarg); break; case MAXHP_OPT: ENERGY_CUTOFF = atof(optarg); break; case MAXTAIL_OPT: TAIL_CUTOFF = atof(optarg); break; case MAXLEN_OPT: len = atoi(optarg); break; case MAXLOOP_OPT: loop = atoi(optarg); break; case OVERLAP_OPT: cerr << "Error: not yet implemented" << endl; exit(3); case STARTCUT_OPT: gene_start_cut = atoi(optarg); break; case ENDCUT_OPT: gene_end_cut = atoi(optarg); break; // confidence options case V1CONF_OPT: conf = new ErmolaevaConfidence(); cerr << "WARNING: USING VERSION 1.0 CONFIDENCE." << endl << " (use -p expterm.dat to use v2.0 confidence)" << endl << "Version 1 confidence scheme exists only for debugging and" << "historical reasons. Use the updated scoring system." 
<< endl; break; case OLDRANDCONF_OPT: conf = new RandomConfidence(optarg); cout << "--rand-conf=" << string(optarg) << endl; break; case 'p': case 'r': conf = new RandomPValueConfidence(optarg); cout << "--pval-conf=" << string(optarg) << endl; break; // output options case BAGOUTPUT_OPT: bag_file = optarg; break; case T2TPERF_OPT: t2tperf_file = optarg; break; case SHOW_ALL_T2T_ROC_OPT: show_all_t2t_roc = true; break; case SHOW_GAPS_OPT: show_gaps = true; break; case ALLCONTEXT_OPT: only_good_context = false; break; case ANTITERMS_OPT: antifile = optarg; break; case 'c': conf_cutoff = atoi(optarg); break; case 'S': print_seq = false; break; default: cerr << "Error: unknown option. " << endl; exit(3); } } if(len > REALLY_MAX_HP) { cerr << "Error: must search for hairpins with total length smaller than " << REALLY_MAX_HP << endl; cerr << "(recompile after changing REALLY_MAX_HP to increase)" << endl; exit(3); } if (loop >= len) { cerr << "Error: max-loop must be less than max-len" << endl; exit(3); } // set the global constants MAX_STEM, MAX_LOOP, MAX_HP set_max_len_loop(len, loop); // by default, use v1 confidence (since we don't know where to find // the data file necessary for rand-conf if(!conf) { cerr << "You must specify a background distribution file with '-p expterm.dat'." 
<< endl; exit(3); } if(MAX_STEM < MIN_STEM || MIN_STEM < 1) { cerr << "Error: min-stem must be <= max-stem and > 0" << endl; exit(3); } if(MAX_LOOP < MIN_LOOP) { cerr << "Error: max-loop must be >= min-loop" << endl; exit(3); } if(optind+1 >= argc) usage(); return optind; } // the main program int main(int argc, char * argv[]) { cout << "TransTermHP v2.08 (built on " << __DATE__ << ")" << endl; time_t start_time = time(NULL); // read commandline options int first_fileindex = process_options(argc, argv); print_options(cout); Genome chroms; // the rest of the args are filenames // NOTE: the seq file must come before the annotation file for(int i = first_fileindex; i < argc; i++) { // if the reader factor can find a reader, then we assume this is an // annotation file GeneReader * reader = gene_reader_factory(argv[i]); if(reader) { if(reader->good()) { reader->read_genes(chroms); delete reader; } else { cerr << "Couldn't read: " << argv[i] << endl; exit(3); } } else { ifstream seq_file(argv[i]); if(!seq_file) { cerr << "Error: couldn't read file: " << argv[i]<< endl; exit(3); } read_seqs(seq_file, chroms); } } sort_genes(chroms); // for every sequence, find the terms for(EVERY_CHROM(chroms, C)) { cerr << "Seq: " << (*C)->name << " (length " << (*C)->length << ", " << (*C)->genes.size() << " genes) "; if((*C)->length <= unsigned(MAX_HP + 15)) { cerr << endl << "Error: input sequences must have length > " << MAX_HP + 15 << endl; exit(3); } find_terms_dp(**C); cerr << endl; } cerr << endl; // analize the found terminators to make the confidence function conf->prepare(chroms); // output summary of the % of T2T regions we hit if(t2tperf_file != "") { ofstream out(t2tperf_file.c_str()); if(!out) { cerr << "Couldn't open output file: " << t2tperf_file << endl; exit(3); } t2t_hitanal(out, chroms, *conf, 2, show_all_t2t_roc); } // output the main map of terminators & genes output_map(cout, chroms, *conf, conf_cutoff, print_seq, only_good_context); #if 0 if(tthitfile != "") { 
ofstream out(tthitfile.c_str()); plot_tthits_vs_terms(out, *conf, chroms); } #endif if(antifile != "") { ofstream out(antifile.c_str()); output_anti_terms(out, chroms, *conf, 90); } if(bag_file != "") { ofstream out(bag_file.c_str()); output_best_term(out, *conf, chroms); } // print out the elapsed time time_t seconds = time(NULL) - start_time; cerr << "Wall clock time = " << seconds << " seconds." << endl; return 0; } transterm_hp_v2.09/mfold_rna.sh0000775000265600020320000000042511514142021016040 0ustar tilleaadmin#!/bin/sh if [ ${#} -lt 2 ] ; then echo "usage: $0 program_name [program options]" > /dev/stderr exit 3 fi prog=$1 shift $prog --gc=-2.1 --au=-1.3 --gu=-0.8 --mm=3.5 --gap=3.9 \ --loop-penalty=4.1,4.9,4.4,4.7,5,5.1,5.2,5.3,5.4,5.5,5.6,5.7,5.8,5.8,5.9,6 $* transterm_hp_v2.09/random_fasta.py0000775000265600020320000000177011514142021016557 0ustar tilleaadmin#!/usr/bin/env python # generate a FASTA file with length random characters import random, sys def random_base(at): if random.random() < at: return "AT"[random.randint(0,1)] else: return "GC"[random.randint(0,1)] def random_fasta(fasta, length, at): fasta = open(fasta, 'w') print >> fasta, ">random uniform length=", length for i in range(1,length+1): fasta.write(random_base(at)) if i % 60 == 0: fasta.write("\n") fasta.close() def random_genes(cfile, length): coords = open(cfile, 'w') i = 1 while i < length: i += random.randint(5, 500) j = i + random.randint(200, 2000) if(j < length): print >> coords, "UNK", i, j, "random" i = j coords.close() # main program: def main(): at, length, file, coordfile = float(sys.argv[1]), int(sys.argv[2]), sys.argv[3], sys.argv[4] random_fasta(file, length, at) random_genes(coordfile, length) if __name__ == '__main__': main() transterm_hp_v2.09/util.cc0000664000265600020320000000254411514142021015030 0ustar tilleaadmin/* This file is part of TransTerm v2.0 BETA and is covered by the GNU GPL * License version 2.0. See file LICENSE.txt for more details. 
*/ #include #include "util.h" // print a status percent void print_status(ostream & out, unsigned long cur, unsigned long max) { ostringstream oss; oss << (int)(100 * ((float)cur) / max) << "%"; out << oss.str(); for(unsigned i = 0; i < oss.str().length(); i++) out << "\b"; } // split a string into fields separated by a single character void split(const string & s, char sep, vector & out) { unsigned i = 0; out.clear(); out.push_back(""); for(unsigned j = 0; j < s.length(); j++) { if(s[j] == sep) { i++; out.push_back(""); } else { out[i] += s[j]; } } } // remove whitespace at the front of s string trim_front(const string & s) { unsigned i; for(i = 0; i #include #include #include #include #include #include #include "transterm.h" #include "seq.h" #include "distr.h" #include "conf.h" // return the min and max values for the given energy kind among the terms // high and low are /cumulative/ over successive calls void energy_range( Term::EnergyKind k, const ConstTermVec & terms, double & low, double & high) { for(EVERY_CTERM_CONST(terms, T)) { low = min(low, (*T)->energy(k)); high = max(high, (*T)->energy(k)); } } // compute the energy distiribution of the given kind of energy void term_energy_dist( Term::EnergyKind k, const ConstTermVec & terms, Distribution & dist) { for(EVERY_CTERM_CONST(terms, T)) { dist.at((*T)->energy(k)) += 1.0; } for(unsigned i = 0; i < dist.size(); i++) { dist[i] /= terms.size(); } } // return the signal to noise distribution Distribution signal_to_noise( Term::EnergyKind k, const ConstTermVec & signal, const ConstTermVec & noise) { const int NUM_BINS = 9; // get the range for the energy values Energy low = 1000, high = -1000; energy_range(k, signal, low, high); energy_range(k, noise, low, high); // compute the individual distributions Distribution sigdist(low, high, NUM_BINS), noidist(low, high, NUM_BINS); term_energy_dist(k, signal, sigdist); term_energy_dist(k, noise, noidist); // divide the noise by the signal Distribution dist(low, high, 
NUM_BINS); for(int i = 0; i < NUM_BINS; i++) { dist[i] = (sigdist[i]>0)?(noidist[i]/sigdist[i]):0; } return dist; } // collect the terminators taht are in (and only in) the given regiontype. The // list can be retrieved with terms() class PureRegionTerms : public EventResponder { public: PureRegionTerms(RegionType r, bool c=true) : _region(r), _codir(c) {} virtual ~PureRegionTerms() {} void terminator(const Term * term) { bool save = false; switch(_region) { // terms in gene regions must not be in any kind of integenic // region and, if user wants codir only, the sense of the // terminator must match some enclosing gene case GENE: save = !in_t2t() && !in_h2t_fwd() && !in_h2t_rvs(); if(save && _codir) { save = (fwd_gene_count() > 0 && term->dir() == FORWARD) || (rvs_gene_count() > 0 && term->dir() == REVERSE); } break; case HEAD2TAIL: save = gene_count() == 0 && (in_h2t_fwd() || in_h2t_rvs()); if(save && _codir) { save = (in_h2t_fwd() && term->dir() == FORWARD) || (in_h2t_rvs() && term->dir() == REVERSE); } break; case TAIL2TAIL: save = gene_count() == 0 && in_t2t(); break; case HEAD2HEAD: save = gene_count() == 0 && in_h2h(); break; } if(save) _tvec.push_back(term); } const ConstTermVec & terms() const { return _tvec; } private: RegionType _region; ConstTermVec _tvec; bool _codir; }; // count the # of AT characters between [start,end] (inclusive). 
// Also return
// the total # of characters
// (both at and len are incremented in place, so they accumulate across calls)
void count_at(SeqPtr start, SeqPtr end, unsigned long & at, unsigned long & len)
{
    for(; start <= end; ++start)
    {
        if(*start == 'A' || *start == 'T') at++;
        len++;
    }
}

// count the AT content of the gene and intergenic regions
class ATContent : public EventResponder
{
public:
    ATContent()
    {
        _gene_at = _gene_len = 0;
        _nongene_at = _nongene_len = 0;
    }
    virtual ~ATContent() {}

    // a scan begins outside any gene, at the left end of the sequence
    void start(const Seq & seq, Direction dir)
    {
        EventResponder::start(seq, dir);
        _nongene_start = seq.left();
        _gene_start = 0;
    }

    void enter_gene(const Event & e)
    {
        EventResponder::enter_gene(e);

        // if we just left a nongene region
        if(gene_count() == 1)
        {
            if(_nongene_start && e.place > _nongene_start)
            {
                count_at(_nongene_start, e.place-1, _nongene_at, _nongene_len);
                // cerr << "NONGENE: " << seqindex(*e.reg->seq, _nongene_start) << " " << seqindex(*e.reg->seq, e.place-1) << endl;
                _nongene_start = 0;
            }
            _gene_start = e.place;
        }
    }

    void leave_gene(const Event & e)
    {
        EventResponder::leave_gene(e);
        // cerr << "LEFT: " << e.reg->name << " at " << seqindex(*e.reg->seq, e.place) << endl;

        // if we just entered a non-gene region
        if(gene_count() == 0)
        {
            if(_gene_start)
            {
                count_at(_gene_start, e.place, _gene_at, _gene_len);
                // cerr << "NONGENE: " << e.reg->name << " " << seqindex(*e.reg->seq, _gene_start) << " " << seqindex(*e.reg->seq, e.place-1) << endl;
                _gene_start = 0;
            }
            _nongene_start = e.place + 1;
        }
    }

    unsigned long gene_at() const { return _gene_at; }
    unsigned long gene_len() const { return _gene_len; }
    unsigned long nongene_at() const { return _nongene_at; }
    unsigned long nongene_len() const { return _nongene_len; }

private:
    // start of the stretch currently being measured; 0 when not inside one
    SeqPtr _gene_start;
    SeqPtr _nongene_start;
    unsigned long _gene_at, _gene_len;
    unsigned long _nongene_at, _nongene_len;
};

// compute the total length of the intergenic regions of the given type
class RegionLength : public EventResponder
{
public:
    RegionLength(RegionType r) : _region(r), _len(0) {}
    virtual ~RegionLength() {}

    void start(const Seq & seq, Direction dir)
    {
        EventResponder::start(seq, dir);
        _ig_count = 0;
        _s = 0;
    }

    void enter_intergene(RegionType r, Direction d, const Event & e)
    {
        EventResponder::enter_intergene(r, d, e);
        if(r == _region)
        {
            // regions of this type may nest; measure from the outermost entry
            _ig_count++;
            if(_ig_count == 1) _s = e.place + 1;
        }
    }

    void leave_intergene(RegionType r, Direction d, const Event & e)
    {
        EventResponder::leave_intergene(r, d, e);
        if(r == _region)
        {
            assert(_ig_count > 0);
            _ig_count--;
            if(_ig_count == 0)
            {
                assert(_s != 0);
                _len += e.place - _s; // correct without the +1
                _s = 0;
            }
        }
    }

    unsigned long length() const { return _len; }

private:
    RegionType _region;
    unsigned long _len;
    SeqPtr _s;
    int _ig_count;
};

// process the genome creating the statistics necessary for score() to assign
// a confidence value.
//
// Sets K, t2t_L, h2t_L, t2t_N, h2t_N and the four signal-to-noise
// distributions, and writes two status lines to cout. If no terminators lie
// inside genes, prepared is set to false and nothing else is computed.
// NOTE(review): the divisors at.gene_len(), at.nongene_len(),
// t2t.terms().size() and h2t.terms().size() are not checked for zero.
void ErmolaevaConfidence::prepare(const Genome & seqs)
{
    PureRegionTerms gene(GENE), h2t(HEAD2TAIL), t2t(TAIL2TAIL, false);
    RegionLength h2t_len(HEAD2TAIL), t2t_len(TAIL2TAIL);
    ATContent at;

    for(EVERY_CHROM_CONST(seqs, C))
    {
        // gene, h2t, and t2t are event responders that maintain
        // a running list of matching terms.
        scan_events(**C, gene, gene_start_cut, gene_end_cut); //100
        scan_events(**C, h2t, gene_start_cut, gene_end_cut); //50
        scan_events(**C, t2t, gene_start_cut, gene_end_cut); //50

        // compute the length of the T2T and H2T regions
        scan_events(**C, h2t_len, gene_start_cut, gene_end_cut); //50
        scan_events(**C, t2t_len, gene_start_cut, gene_end_cut); //50

        // compute the at content of the gene and intergenic regions
        scan_events(**C, at, gene_start_cut, gene_end_cut); //50
    }

    // can't compute confidence if we have no gene_terms
    if(gene.terms().empty())
    {
        prepared = false;
        cout << "warning: no examples in genes; can't compute conf." << endl;
        return;
    }

    // compute K --- correction for AT content
    double at_in, at_not;
    at_in = double(at.gene_at()) / at.gene_len();
    at_not = double(at.nongene_at()) / at.nongene_len();

    K = (840*at_not*at_not - 1215.65*at_not + 448.9593) /
        (840*at_in*at_in - 1215.65*at_in + 448.9593);

    // output status messages
    cout << "Genes: " << at_in << " %AT, "
         << at.gene_len() << " nt, "
         << gene.terms().size() << " terms." << endl;

    cout << "Intergenic: " << at_not << " %AT, "
         << "H2T: " << h2t_len.length() << " nt, "
         << h2t.terms().size() << " terms; "
         << "T2T: " << t2t_len.length() << " nt, "
         << t2t.terms().size() << " terms. " << endl;

    // region length relative to the total gene length
    t2t_L = double(t2t_len.length()) / at.gene_len();
    h2t_L = double(h2t_len.length()) / at.gene_len();

    t2t_hp = signal_to_noise(Term::HAIRPIN, t2t.terms(), gene.terms());
    t2t_tail = signal_to_noise(Term::TAIL, t2t.terms(), gene.terms());

    h2t_hp = signal_to_noise(Term::HAIRPIN, h2t.terms(), gene.terms());
    h2t_tail = signal_to_noise(Term::TAIL, h2t.terms(), gene.terms());

    // ratio of in-gene terminators to in-region terminators
    t2t_N = 2.0 * gene.terms().size() / t2t.terms().size();
    h2t_N = double(gene.terms().size()) / h2t.terms().size();

    prepared = true;
}

// compute the confidence score of the given terminator, which is in a region
// whose type is given by where (gene, head-to-tail or tail-to-tail).
// If the
// terminator has a 'partner' we modify the confidence score
int ErmolaevaConfidence::score(const Term & term, RegionType where) const
{
    int c1 = score_one(term, where);

    // if this term has a partner (going in the other dir), and we're in a
    // TAIL2TAIL region (meaning that a bidirectional terminator makes sense)
    // we modify the confidence to take this into account
    if(term.partner && where == TAIL2TAIL)
    {
        assert(term.partner->dir() != term.dir());
        int c2 = score_one(*term.partner, where);
        if(c1 >= 50 && c2 >= 50)
        {
            // combine the two percentages as 1 - (1-c1)(1-c2), rounded
            c1 = int((1.0 - (1.0 - c1/100.0)*(1.0 - c2/100.0))*100.0 + 0.5);
        }
    }
    return c1;
}

// compute the confidence score of the given terminator, which is in a region
// whose type is given by where (gene, head-to-tail or tail-to-tail)
//
// Returns a percentage: max(1 - N*L*qE*qT*K, 0) * 100, rounded. Returns 0
// when prepare() did not succeed or where is neither TAIL2TAIL nor HEAD2TAIL.
int ErmolaevaConfidence::score_one(const Term & term, RegionType where) const
{
    if(!prepared) return 0;

    const Distribution * taild, *hpd;
    double N, L;

    if(where == TAIL2TAIL)
    {
        taild = &t2t_tail;
        hpd = &t2t_hp;
        N = t2t_N;
        L = t2t_L;
    }
    else if(where == HEAD2TAIL)
    {
        taild = &h2t_tail;
        hpd = &h2t_hp;
        N = h2t_N;
        L = h2t_L;
    }
    else return 0;

    // interpolated noise/signal ratios, clamped at zero
    double qE, qT;
    qE = max(hpd->interp(term.hp_energy), 0.0);
    qT = max(taild->interp(term.tail_energy), 0.0);

    return unsigned(max(1-N*L*qE*qT*K, 0.0) * 100.0 + 0.5);
}

// read the table via the RandomConfidence constructor, then convert it in
// place with sum_exp_table()
RandomPValueConfidence::RandomPValueConfidence(const string & fn)
    : RandomConfidence(fn)
{
    sum_exp_table();
}

// NOTE(review): the result of _emp_at.find(reg) is dereferenced without
// checking for end(); RandomConfidence::prepare() assigns _emp_at entries --
// confirm score() is never called before it.
int RandomPValueConfidence::score(const Term & term, RegionType reg) const
{
    double at = _emp_at.find(reg)->second;
    return histvalue(get_table(at), term, at);
}

// Convert every histogram in _exp_table, in place, first into a 2-d
// cumulative sum and then into a 0..100 score on a log scale relative to
// _sample_size (a cumulative count of 0 is treated as 1).
void RandomPValueConfidence::sum_exp_table()
{
    for(map::iterator H = _exp_table.begin(); H != _exp_table.end(); ++H)
    {
        Histogram2d & hist = H->second;

        for(int i = 0; i < _nbins; i++)
        {
            for(int j = 0; j < _nbins; j++)
            {
                // In general, new_h[i][j] = new_h[i-1, j] + new_h[i,j-1] - new_h[i-1,j-1]
                // edge cases omit some of the terms
                if (i > 0) hist[i][j] += hist[i-1][j];
                if (j > 0) hist[i][j] += hist[i][j-1];
                if (i > 0 && j > 0) hist[i][j] -= hist[i-1][j-1];
            }
        }

        // double total_ex = double(hist[_nbins-1][_nbins-1]);
        //CURRENT: double max_log = log(1.0 / total_ex);
        double max_log = log(1.0 / _sample_size);
        double ss_log = log(double(_sample_size));

        // x is Rgc in the paper
        for(int i = 0; i < _nbins; i++)
        {
            for(int j = 0; j < _nbins; j++)
            {
                // cout << hist[i][j] << " " << total_ex << " " << max_log << " ";
                unsigned long x = max(hist[i][j], (unsigned long)1);
                //CURRENT: hist[i][j] = int(100.0 * log(x / total_ex) / max_log);
                hist[i][j] = int( 100.0 * (log((long double)x) - ss_log) / max_log );
                // cout << hist[i][j] << endl;
                assert(hist[i][j] >= 0 && hist[i][j] <= 100);
                //hist[i][j] = int(100.0 * (1.0 - (hist[i][j] / total_ex)));
            }
        }
    }
}

//============================================================
// Random Confidence Scheme
//============================================================

// debugging aid: dump a 2-d histogram to cout, one row per line
void debug_print_emp_table(RandomConfidence::Histogram2d & T)
{
    for(RandomConfidence::Histogram2d::iterator I = T.begin(); I != T.end(); ++I)
    {
        for(vector::iterator J = I->begin(); J != I->end(); ++J)
        {
            cout << *J << " ";
        }
        cout << endl;
    }
}

// read the given exphist file
RandomConfidence::RandomConfidence(const string & fn) : _prepared(false)
{
    read_exp_table(fn);
}

// create the exp tables for each of the region types
void RandomConfidence::prepare(const Genome & seqs)
{
    PureRegionTerms gene(GENE), h2t(HEAD2TAIL), t2t(TAIL2TAIL, false), h2h(HEAD2HEAD, false);
    RegionLength h2t_len(HEAD2TAIL), t2t_len(TAIL2TAIL), h2h_len(HEAD2HEAD);
    ATContent at;

    for(EVERY_CHROM_CONST(seqs, C))
    {
        // gene, h2t, and t2t are event responders that maintain
        // a running list of matching terms.
scan_events(**C, gene, gene_start_cut, gene_end_cut); // 100 scan_events(**C, h2t, gene_start_cut, gene_end_cut); // 50 scan_events(**C, t2t, gene_start_cut, gene_end_cut); // 50 scan_events(**C, h2h, gene_start_cut, gene_end_cut); // 50 // comptue the length of the T2T and H2T regions scan_events(**C, h2t_len, gene_start_cut, gene_end_cut); // 50 scan_events(**C, t2t_len, gene_start_cut, gene_end_cut); // 50 scan_events(**C, h2h_len, gene_start_cut, gene_end_cut); // 50 // compute the at content of the gene and integenic regions scan_events(**C, at, gene_start_cut, gene_end_cut); // 50 } _emp_at[TAIL2TAIL] = double(at.nongene_at()) / at.nongene_len(); _emp_at[HEAD2HEAD] = _emp_at[HEAD2TAIL] = _emp_at[TAIL2TAIL]; _emp_at[GENE] = double(at.gene_at()) / at.gene_len(); //_emp_at[GENE] = _emp_at[TAIL2TAIL]; // XXX: test using only intergenic AT content _emp_len[TAIL2TAIL] = 2*t2t_len.length(); _emp_len[HEAD2TAIL] = h2t_len.length(); _emp_len[HEAD2HEAD] = 2*h2h_len.length(); _emp_len[GENE] = at.gene_len(); // output status messages cout << "Genes: " << _emp_at[GENE] << " %AT, " << at.gene_len() << " nt, " << gene.terms().size() << " terms." << endl; cout << "Intergenic: " << _emp_at[TAIL2TAIL] << " %AT, " << "H2T: " << h2t_len.length() << " nt, " << h2t.terms().size() << " terms; " << "T2T: " << t2t_len.length() << " nt, " << t2t.terms().size() << " terms; " << "H2H: " << h2h_len.length() << " nt, " << h2h.terms().size() << " terms. 
" << endl; fill_emp_table(GENE, gene.terms()); fill_emp_table(HEAD2TAIL, h2t.terms()); fill_emp_table(TAIL2TAIL, t2t.terms()); fill_emp_table(HEAD2HEAD, h2h.terms()); //debug_print_emp_table(_emp_table[TAIL2TAIL]); _prepared = true; } // return the score for the given terminator assuming its from the region reg int RandomConfidence::score(const Term & term, RegionType reg) const { assert(_prepared); // NOTE: we must use x.find(reg)->second rather than x[reg] since // this function is const and operator[] is not unsigned long len = _emp_len.find(reg)->second; double at = _emp_at.find(reg)->second; double expv = double(histvalue(get_table(at), term, at)); unsigned long empv = histvalue(_emp_table.find(reg)->second, term, at); if(empv == 0) return 0; //cout << expv << " " << _sample_size; expv *= double(len) / _sample_size; #if 0 //cout << "C: " << reg << " " << len << " " << at << " " << expv << " " << empv << " " cout << "C: " << reg << " " << len << " " << term.hp_energy << " " << term.tail_energy << " " << expv << " " << empv << " " << " at=" << get_best_at(at) << " " << unsigned(100.0 * max(1.0 - expv / empv, 0.0)) << endl; #endif return unsigned(100.0 * max(1.0 - expv / empv, 0.0)); } // return teh bin index for a histogram between [low,hi] of n bins, for value // v bins numbered 0 to n -1. 
if v is outside [low,hi], return the extream // bins int RandomConfidence::hbin(double low, double hi, int n, double v) const { if(v < low) return 0; if(v > hi) return n-1; return unsigned((v-low) / ((hi-low)/n)); } // return the value of the 2d histogram appropriate for term t unsigned long RandomConfidence::histvalue(const Histogram2d & hist, const Term & t, double at) const { int ati = get_best_at(at); #if 0 cout << ati << " " << _low_hp.find(ati)->second << " " << _high_hp.find(ati)->second << " " << _low_tail.find(ati)->second << " " << _high_tail.find(ati)->second << " " << t.hp_energy << " " << t.tail_energy << " "; int i = hbin(_low_hp.find(ati)->second, _high_hp.find(ati)->second, _nbins, t.hp_energy); int j = hbin(_low_tail.find(ati)->second, _high_tail.find(ati)->second, _nbins, t.tail_energy); cout << i << " " << j << " | "; #endif return hist[hbin(_low_hp.find(ati)->second, _high_hp.find(ati)->second, _nbins, t.hp_energy)] [hbin(_low_tail.find(ati)->second, _high_tail.find(ati)->second, _nbins, t.tail_energy)]; } // return the value of the 2d histogram appropriate for term t unsigned long & RandomConfidence::histvalue(Histogram2d & hist, const Term & t, double at) { int ati = get_best_at(at); return hist[hbin(_low_hp[ati], _high_hp[ati], _nbins, t.hp_energy)] [hbin(_low_tail[ati], _high_tail[ati], _nbins, t.tail_energy)]; } // given a list of terms that come from reg type reg, construct an emp // 2d histogram void RandomConfidence::fill_emp_table(RegionType reg, const ConstTermVec & terms) { _emp_table[reg].resize(_nbins); for(int i = 0; i < _nbins; i++) _emp_table[reg][i].resize(_nbins); for(ConstTermVec::const_iterator T = terms.begin(); T != terms.end(); ++T) { histvalue(_emp_table[reg], **T, _emp_at[reg])++; } } int RandomConfidence::get_best_at(double at) const { assert(0 < at && at < 1); double best_diff = 1000; int best_ati = -1; for(map::const_iterator H = _exp_table.begin(); H != _exp_table.end(); ++H) { if(fabs(int(H->first) - 100*at) < 
best_diff) { best_diff = fabs(int(H->first) - 100*at); best_ati = H->first; } } assert(best_ati > 0); return best_ati; } // find the histogram table that has an at content closest to at RandomConfidence::Histogram2d & RandomConfidence::get_table(double at) { assert(0 < at && at < 1); //int ati = int(100*at); double best_diff = 1000; Histogram2d * hist = 0; for(map::iterator H = _exp_table.begin(); H != _exp_table.end(); ++H) { if(!hist || fabs(int(H->first) - 100*at) < best_diff) { hist = &H->second; best_diff = fabs(int(H->first) - 100*at); } } return *hist; } // find the histogram table that has an at content closest to at const RandomConfidence::Histogram2d & RandomConfidence::get_table(double at) const { assert(0 < at && at < 1); //int ati = int(100*at); double best_diff = 1000; const Histogram2d * hist = 0; for(map::const_iterator H = _exp_table.begin(); H != _exp_table.end(); ++H) { if(!hist || fabs(int(H->first) - 100*at) < best_diff) { hist = &H->second; best_diff = fabs(int(H->first) - 100*at); } } return *hist; } // read the expterms.dat file given by fn the histogram matrix is written so // that the tail scores go along the ROWS // In the histograms, the organization is: H[hp_score][tail_score] // in the file, rows are tail scores, columns are hp_scores void RandomConfidence::read_exp_table(const string & fn) { ifstream in(fn.c_str()); if(!in.good()) { cerr << "Couldn't open data file: " << fn << endl; exit(3); } in >> _sample_size >> _nbins ; //>> _low_hp >> _high_hp >> _low_tail >> _high_tail; cout << _nbins << " bins; sample size is " << _sample_size << endl; //cout << "hp range = " << _low_hp << " to " << _high_hp // << ". 
tail range = " << _low_tail << " to " << _high_tail << endl; double at; while(in >> at) { if(at < 0.0) break; // a negative AT value means stop int ati = int(at*100 + 0.001); in >> _low_hp[ati] >> _high_hp[ati] >> _low_tail[ati] >> _high_tail[ati]; _exp_table[ati].resize(_nbins); for(int i = 0; i < _nbins; i++) _exp_table[ati][i].resize(_nbins); for(int i = 0; i < _nbins; i++) { // j is the column indx == hp scores for(int j = 0; j < _nbins; j++) { in >> _exp_table[ati][j][i]; } } } } // from inside an event responder, compute the best confidence score for the // regions the event responder is currently in int er_confidence( const EventResponder & er, const Confidence & conf, const Term & term) { int c_h2t, c_t2t, c_gene, c_h2h; c_h2t = c_t2t = c_gene = c_h2h = 0; if(er.in_t2t()) { c_t2t = conf.score(term, TAIL2TAIL); } #if 0 if(er.in_h2t_fwd() && term.dir() == FORWARD || er.in_h2t_rvs() && term.dir() == REVERSE) #endif if(er.in_h2t_fwd() || er.in_h2t_rvs()) { c_h2t = conf.score(term, HEAD2TAIL); } if(er.in_h2h()) { c_h2h = conf.score(term, HEAD2HEAD); } if(er.gene_count() > 0) c_gene = conf.score(term, GENE); return max(max(max(c_h2t, c_t2t), c_gene), c_h2h); } transterm_hp_v2.09/distr.h0000664000265600020320000000370511514142021015042 0ustar tilleaadmin/* This file is part of TransTerm v2.0 BETA and is covered by the GNU GPL * License version 2.0. See file LICENSE.txt for more details. 
*/ #ifndef DISTR_H #define DISTR_H #include #include #include using namespace std; // represents a "distribution" ---- really a binned vector so that float // values go into the appropriate bin struct Distribution { // default constructor makes empty null distribution Distribution() : _min(0), _max(0), bs(0) {} // distributions have a fixed range [mmin, mmax] and a set # of bins // the range is broken into evenly sized bins Distribution(double mmin, double mmax, unsigned bins) : _min(mmin), _max(mmax), bs((_max-_min)/bins), d(bins) {} // the number of bins in the histogram unsigned size() const { return d.size(); } // the 'x' range of values represented by bin i double bin_lower(unsigned i) const { return _min + i*bs; } double bin_upper(unsigned i) const { return min(_max,_min + (i+1)*bs); } // the range of values represented by the histogram (as defined when // created --- does not mean that we've /seen/ a value that low) double low() const { return _min; } double high() const { return _max; } // the bin # (0-based) that the value v would go into unsigned binfor(double v) const; // accessors: [i] gives the value of bin i (0-based) --- if the distr is // not const, can be used as an lvalue: dist[2] += 1.2; at(v) gives the // value of the bin that v would go into (also can be used as an lvalue) double operator[](unsigned i) const { return d[i]; } double & operator[](unsigned i) { return d[i]; } double & at(double v) { return d[binfor(v)]; } double at(double v) const { return d[binfor(v)]; } // interpolate a value for v double interp(double v) const; protected: double _min, _max, bs; vector d; }; ostream & operator<<(ostream &, const Distribution &); #endif transterm_hp_v2.09/search.h0000664000265600020320000000030711514142021015155 0ustar tilleaadmin#ifndef SEARCH_H #include #include "seq.h" using namespace std; void every_hairpin_energy(Seq &, Direction, vector &); int set_max_len_loop(int, int); extern bool no_gaps; #endif 
transterm_hp_v2.09/anti.cc0000664000265600020320000000345511514142021015010 0ustar tilleaadmin
/* This file is part of TransTerm v2.0 BETA and is covered by the GNU GPL
 * License version 2.0. See file LICENSE.txt for more details. */

// The code to search for anti-terminators is currently in development
// and does not work as well as one would hope

// NOTE(review): the two #include lines below have no header name in this
// copy of the file (angle-bracket text appears stripped) -- restore from
// upstream.
#include
#include
#include "conf.h"
#include "seq.h"
#include "transterm.h"

bool best_overlapping_hairpin(const Term &, Term &, int &);

// event responder that, for every terminator whose confidence reaches the
// cutoff, looks for the best hairpin overlapping it and prints the pair
class OutputAntiTerms : public EventResponder
{
public:
    // out: stream the report is written to; conf: confidence scheme used to
    // score terminators; cutoff: minimum confidence a terminator must reach
    OutputAntiTerms(ostream & out, Confidence & conf, int cutoff)
        : _out(out), _conf(conf), _cutoff(cutoff) {}

    virtual ~OutputAntiTerms() {}

    // called for each terminator: if its confidence is at least _cutoff and
    // best_overlapping_hairpin() finds a hairpin with energy below -2.0,
    // print the confidence, the terminator, the hairpin energy, the overlap
    // and the hairpin sequence on one line
    void terminator(const Term * term)
    {
        int c = er_confidence(*this, _conf, *term);
        if(c >= _cutoff)
        {
            Term best_term;
            int overl;
            if(best_overlapping_hairpin(*term, best_term, overl) &&
               best_term.hp_energy < -2.0)
            {
                _out << c << " " << *term << " "
                     << best_term.hp_energy << " " << overl;
                print_term_seq(_out, best_term);
                _out << endl;
            }
        }
    }

private:
    ostream & _out;
    const Confidence & _conf;
    int _cutoff;
};

/*
 3215191 - 3215211 + R conf -5.5 -6.28967 -9.9 overlap
 CCGTTTGTTCCCCGGctgttttttATCAAaaatcagttTTTTTTATTTCTA
 TGACTTGATGACACaaaaaacagccgTTTGTTCCCcggctgttttttATC
 */

// write the anti-terminator report for every chromosome of g to out, using
// conf to score terminators and cutoff as the minimum confidence
void output_anti_terms(
    ostream & out,
    const Genome & g,
    Confidence & conf,
    int cutoff)
{
    OutputAntiTerms oat(out, conf, cutoff);

    out << "Anti-terminators: " << endl;

    for(EVERY_CHROM_CONST(g, C))
    {
        out << "In " << (*C)->name << " :" << endl;
        scan_events(**C, oat, 100, 100); // XXX: think about what good cutoffs are
    }
}
transterm_hp_v2.09/LICENSE.txt0000664000265600020320000004313111514142021015364 0ustar tilleaadmin GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc. 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. 
Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Library General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. 
If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. 
You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. 
If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. 
(This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. 
Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. 
Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. 
Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. 
<one line to give the program's name and a brief idea of what it does.> Copyright (C) <year> <name of author> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Also add information on how to contact you by electronic and paper mail. If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. <signature of Ty Coon>, 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. 
If this is what you want to do, use the GNU Library General Public License instead of this License.
transterm_hp_v2.09/distr.cc0000664000265600020320000000340411514142021015174 0ustar tilleaadmin
/* This file is part of TransTerm v2.0 BETA and is covered by the GNU GPL
 * License version 2.0. See file LICENSE.txt for more details. */

// NOTE(review): the first #include below has no header name in this copy of
// the file (angle-bracket text appears stripped) -- restore from upstream.
#include
#include "distr.h"

// Distribution objects are used by the v1.0 confidence scheme

// output a distribution (surrounded by [])
ostream & operator<<(ostream & out, const Distribution & dist)
{
    out << "[ ";
    for(unsigned i = 0; i< dist.size();i++)
    {
        out << dist[i] << " ";
    }
    out << "] ";
    return out;
}

// the bin # (0-based) that the value v would go into
// values at or below low() map to bin 0, values at or above high() to the
// last bin; the computed index is additionally capped at the last bin
unsigned Distribution::binfor(double v) const
{
    if(v <= low()) return 0;
    if(v >= high()) return size()-1;
    return min(unsigned((v-_min) / bs), size()-1);
}

// interpolate a value for v. Schematically:
//           *              ,
//           |      *       |
//    |      |      |       |
// (-----)(-----)(-----)(-----)
//    a      b      c      d
// if v is in [b,c] (the midpoints of two bins) then the line between the two
// * (the values for those two bins), is computed using the fact that the
// value of that line at x= (b+c)/2 is ([b]+[c])/2 (we use the bin value
// directly for values at the very beginning and end)
//
// NOTE(review): for the first bin, a v at or below the bin's midpoint falls
// through to the second branch and is extrapolated from the next bin rather
// than using the bin value directly; only the upper half of the last bin
// takes the "bin value directly" path -- confirm whether that asymmetry is
// intended.
double Distribution::interp(double v) const
{
    unsigned b = binfor(v);
    unsigned otherb = 0;     // the neighbouring bin used for the line
    bool change = false;     // true when a neighbouring bin is available
    double x = 0.0;          // the edge shared by bins b and otherb

    if(b > 0 && v <= (bin_lower(b) + bin_upper(b))/2.0)
    {
        // lower half of the bin: interpolate towards the previous bin
        otherb = b-1;
        x = bin_lower(b);
        change = true;
    }
    else if(b < size()-1)
    {
        // otherwise interpolate towards the next bin, if there is one
        otherb = b+1;
        x = bin_upper(b);
        change = true;
    }

    double E;
    if(change)
    {
        // the line passes through the average of the two bin values at the
        // shared edge x
        E = (d[otherb] + d[b])/2.0 +
            (v - x) * (d[b] - d[otherb])/(bin_upper(b) - bin_upper(otherb));
    }
    else
    {
        E = d[b];
    }
    return E;
}
transterm_hp_v2.09/gene-reader.cc0000664000265600020320000001034211514142021016224 0ustar tilleaadmin/* This file is part of TransTerm v2.0 BETA and is covered by the GNU GPL * License version 2.0. See file LICENSE.txt for more details. 
*/ #include #include #include #include #include "gene-reader.h" #include "util.h" // return the right kind of annotation reader given teh extn of the filename // if none, return 0 GeneReader * gene_reader_factory(const string & fn) { string extn = fn.substr(fn.rfind('.')+1); if(extn == "ptt") return new PTTReader(fn); if(extn == "coords" || extn == "crd") return new CoordsReader(fn); return 0; } // make a new PTTReader. PTTReader::PTTReader(const string & fn) : _in(fn.c_str()) { unsigned dotpos = fn.rfind('.'); // remove the extn _id = (dotpos == 0)?fn:fn.substr(0, dotpos); // remove leading paths .../ _id = _id.substr(_id.rfind('/')+1); } // read the genes and put them into the genome bool PTTReader::read_genes(Genome & g) { string line; assert(good()); // read the header getline(_in, line); getline(_in, line); getline(_in, line); while(getline(_in, line)) { string loc, strand, pid, gene, syn, code, cog, desc, name; int len; istringstream iss(line); iss >> loc >> strand >> len >> pid >> gene >> syn >> code >> cog; getline(iss, desc); desc = trim_front(desc); if(gene != "-") name = gene; else if(syn != "-") name = syn; else if(pid != "-") name = pid; else name = "UNK"; vector locvec; split(loc, '.', locvec); if(locvec.size() != 3) { cerr << "Bad location format: " << loc << endl; exit(3); } unsigned long start, end; start = atol(locvec[0].c_str()); end = atol(locvec[2].c_str()); // make sure that the coords are given as left..right where left <= right if(end < start) { cerr << "WARNING: TransTerm does not handle genomes with genes that wrap around." 
<< endl; continue; } if(strand == "-") { swap(start, end); } else if(strand != "+") { cerr << "Unknown strand value: " << strand << endl; exit(3); } Seq * s = chrom_for_id(g, _id); if(s) { if(start > s->length || end > s->length || start < 0 || end < 0) { cerr << "Bad gene coordinates: " << start << " - " << end << endl; exit(3); } s->genes.push_back( new Region(name, s, s->dna + start - 1, s->dna + end - 1, desc)); } else { cerr << "Can't find seq for id: " << _id << endl; exit(3); } } return true; } // make a new reader for .coords file CoordsReader::CoordsReader(const string & fn) : _in(fn.c_str()) { } // Read the gene cords file a stream formated as: // gene_name start end chrom_id // where if start > end the gene runs on the other strand. // Start and end are 1-based bool CoordsReader::read_genes(Genome & g) { assert(good()); Seq * s; string line; while(getline(_in, line)) { string name, chid; unsigned long startidx, endidx; istringstream iss(line); chid = ""; iss >> name >> startidx >> endidx >> chid; // if there was no chid, we assume that its b/c we are missing // the gene names. if(chid == "") { istringstream iss(line); iss >> startidx >> endidx >> chid; name = "UNK"; } s = chrom_for_id(g, chid); if(s) { if(startidx > s->length || endidx > s->length || startidx <= 0 || endidx <= 0) { cerr << "Bad gene coordinates: " << startidx << " .. 
" << endidx << endl; exit(3); } s->genes.push_back( new Region(name, s, s->dna + startidx - 1, s->dna + endidx - 1)); } else { cerr << "Unknown chromosome id: " << chid << endl; exit(3); } } return true; } transterm_hp_v2.09/expterm.dat0000664000265600020320000013133411514142021015722 0ustar tilleaadmin20000000 25 0.26 -26.38 -1.9 -5.744018 -2.4001 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 3 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 3 0 1 2 2 2 4 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 2 6 2 4 4 7 6 14 6 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 4 0 0 1 4 5 0 0 0 0 0 0 0 1 0 0 1 0 1 1 0 0 0 2 4 6 4 8 6 15 14 0 0 0 0 0 1 0 0 0 0 0 2 0 2 1 7 2 5 10 13 5 12 25 25 21 0 0 0 0 0 0 0 0 0 0 1 3 2 3 4 7 7 19 17 21 26 31 45 49 64 0 0 0 0 0 0 0 0 0 1 2 0 2 2 2 1 7 8 12 18 18 25 29 46 39 0 0 0 0 1 0 0 0 2 1 0 1 1 3 5 10 13 16 33 41 29 43 63 66 68 0 0 0 0 0 0 0 1 1 3 2 1 6 6 13 16 26 39 62 84 69 119 142 175 159 0 0 0 0 1 0 1 0 4 7 6 12 17 19 28 35 59 108 160 181 179 297 369 385 391 0 0 0 0 0 0 0 1 0 3 4 4 6 8 16 18 45 47 82 119 102 154 184 212 222 0 0 0 1 0 1 0 0 3 7 5 8 15 23 29 42 88 122 177 220 206 302 469 492 490 0 0 0 0 1 1 1 2 3 5 6 7 21 28 53 62 105 155 224 294 306 377 585 649 649 0 0 0 0 1 1 2 2 5 7 4 14 26 40 42 73 110 170 235 332 292 490 615 705 690 0 1 0 0 0 1 1 2 2 5 5 13 21 33 43 61 107 147 224 316 262 367 534 643 591 0 0 0 0 0 0 2 2 6 14 14 21 36 38 70 96 166 230 315 492 381 555 775 930 897 0 0 2 1 0 2 0 2 5 4 5 21 23 39 57 87 110 180 294 337 344 515 637 717 691 0 0 0 0 0 0 0 0 4 6 7 10 11 8 19 24 45 65 117 122 104 165 242 280 212 0.28 -31.15 -1.9 -5.656893 -2.40016 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 3 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 1 3 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 3 3 5 4 10 10 8 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 5 6 8 11 16 13 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 3 8 11 5 11 20 23 0 0 0 0 0 0 0 0 0 0 0 0 1 0 3 4 4 4 8 11 17 32 24 56 47 0 0 0 0 0 0 0 0 0 0 0 0 2 0 1 3 5 14 14 18 29 41 58 61 54 0 0 0 0 0 0 0 1 1 0 0 0 1 1 4 5 11 13 12 24 58 45 85 104 94 0 0 0 0 0 0 0 0 0 0 0 1 0 2 1 5 7 17 20 30 63 67 83 108 120 0 0 0 0 0 0 0 0 1 0 0 0 1 2 3 9 15 21 34 73 92 135 222 255 275 0 0 0 0 0 0 0 0 1 1 2 3 6 15 16 12 35 65 84 157 210 253 359 506 438 0 0 0 0 0 0 0 0 0 0 2 3 2 4 12 10 26 44 61 104 163 172 280 361 376 0 1 0 0 0 0 0 0 0 0 4 1 7 11 16 27 43 65 85 173 286 347 462 618 619 0 0 0 0 0 0 1 0 1 0 4 9 15 17 33 40 67 141 195 348 470 568 893 1104 1141 0 0 0 0 0 1 1 1 0 1 0 3 11 17 13 30 39 78 98 180 243 274 437 544 592 0 0 0 0 0 1 1 0 1 1 4 4 6 12 29 45 55 87 156 272 398 454 703 849 832 0 0 0 0 0 0 0 0 0 4 8 6 8 16 31 50 85 135 206 355 534 588 916 1072 1188 0 0 0 0 0 0 0 2 1 1 0 3 10 11 30 45 68 125 173 296 454 517 747 925 926 0 0 0 0 0 0 0 0 0 1 1 6 2 1 8 4 17 44 53 90 130 171 233 250 295 0.30 -28.076 -1.9 -6.0827516 -2.4001 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 2 0 1 1 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 0 0 0 1 1 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 2 0 3 0 2 5 6 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 2 3 4 5 5 3 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 8 5 7 7 7 9 20 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 4 5 6 4 14 4 22 18 
27 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 5 9 8 13 22 27 29 43 0 0 0 0 0 0 0 1 0 1 0 1 4 2 4 7 9 13 19 30 43 49 64 87 95 0 0 0 0 0 0 0 0 0 0 0 0 1 0 2 4 9 8 24 20 32 39 54 72 85 0 0 0 0 0 0 0 0 0 0 0 0 2 4 6 7 10 16 27 37 70 75 111 130 129 0 0 0 0 0 0 0 1 0 0 0 2 3 7 14 14 34 36 73 86 120 197 250 284 291 0 0 0 0 0 0 0 1 0 3 3 9 7 8 17 23 33 61 114 169 241 277 444 479 526 0 0 0 0 0 1 0 0 0 0 2 2 12 8 19 37 58 88 156 205 305 379 542 672 693 0 1 0 0 0 1 0 2 2 5 1 10 12 16 24 35 69 72 173 182 339 400 618 729 757 0 0 0 0 0 0 0 0 1 1 2 4 13 16 34 55 95 123 256 309 494 569 838 1046 1097 0 1 0 0 1 0 0 0 0 3 5 11 7 9 25 49 49 84 188 215 334 432 637 740 793 0 0 0 0 0 0 0 0 1 1 7 12 14 37 42 68 128 146 290 361 562 718 1090 1189 1331 0 0 0 0 0 1 1 0 2 2 4 5 13 18 33 64 96 125 228 296 458 513 849 992 1059 0 0 0 0 0 0 0 0 3 1 3 5 4 3 10 11 32 43 104 109 194 215 338 355 398 0.32 -33.27 -1.9 -5.9486192 -2.4001 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 3 4 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 3 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 2 1 3 1 7 6 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 2 0 1 3 3 9 16 19 21 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 1 1 3 16 6 15 21 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 2 4 7 8 15 24 32 44 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 6 8 12 10 40 60 53 79 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 4 7 8 18 24 42 59 115 117 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 3 3 6 7 18 44 65 111 146 127 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 7 4 8 23 48 61 90 125 195 217 0 0 0 0 0 0 0 0 0 0 0 0 1 5 3 7 14 22 54 110 163 248 375 546 585 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 9 11 20 40 104 120 233 375 445 561 0 0 0 0 0 0 0 0 1 0 2 0 0 1 6 12 32 40 81 165 246 408 581 824 930 0 0 0 0 0 0 0 0 0 1 0 2 1 4 4 20 29 59 100 181 242 472 
595 911 1064 0 0 0 0 0 0 0 0 1 0 2 0 3 8 17 21 28 67 108 274 351 587 850 1164 1258 0 0 0 1 0 0 0 0 3 0 1 2 3 9 10 19 29 72 87 204 271 482 721 986 1097 0 1 0 0 0 0 0 0 1 0 2 1 2 3 20 22 37 69 151 341 425 715 1093 1457 1643 0 0 0 0 0 0 0 0 1 0 1 3 3 7 8 23 36 49 114 265 360 614 745 1155 1261 0 0 0 0 0 0 0 0 0 0 0 0 2 0 1 9 11 23 41 84 108 208 341 443 493 0.34 -21.504 -1.9 -5.7965198 -2.4001 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 2 1 2 3 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 1 0 0 1 1 2 4 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 3 6 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 2 4 2 2 4 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 3 3 3 4 1 5 10 6 17 21 0 0 0 0 0 1 0 0 0 0 0 0 0 0 2 1 2 2 7 5 8 13 13 11 14 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 3 4 6 4 11 8 16 19 27 23 0 0 0 0 0 0 0 1 0 2 0 0 0 0 2 3 9 9 12 12 23 31 24 39 64 0 0 0 0 0 0 1 0 3 3 1 3 4 2 8 15 21 27 32 24 47 68 64 90 94 0 0 0 0 1 0 0 0 0 1 0 1 1 2 3 9 12 14 13 28 42 48 45 62 83 0 0 0 0 0 0 1 0 0 1 0 4 3 4 14 12 15 21 39 46 52 82 76 108 115 0 1 0 1 0 0 1 3 3 2 4 5 7 11 26 19 34 43 78 73 142 197 145 212 248 0 0 0 0 2 0 1 1 1 5 6 10 19 17 43 71 78 87 168 145 225 374 376 506 579 0 0 0 1 0 1 0 2 2 5 2 9 13 15 17 27 50 48 97 80 163 181 187 252 301 0 0 0 0 0 2 4 1 4 7 3 13 25 22 41 72 99 126 190 197 305 465 400 578 670 0 0 0 2 1 0 2 5 4 5 9 13 22 45 49 98 126 165 269 223 392 580 551 698 872 0 0 0 0 0 5 1 3 6 11 14 15 38 36 64 90 131 143 259 234 408 548 568 708 847 0 0 1 0 0 1 2 3 5 7 7 19 27 31 68 95 113 148 268 287 410 583 510 710 815 0 1 0 1 0 4 2 5 7 12 11 32 27 35 93 115 189 221 360 301 539 746 784 1010 1121 0 0 0 0 0 2 1 6 5 5 16 26 25 32 63 109 147 163 305 247 433 605 680 844 957 0 0 0 0 0 1 2 3 4 5 3 6 7 9 24 31 40 58 89 82 159 216 193 252 287 0.36 -24.26 -1.9 -5.6176554 -2.4001 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 2 1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 2 2 6 2 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 2 3 2 2 1 3 4 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 3 3 11 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 1 0 1 2 4 8 8 15 18 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 3 4 4 8 11 13 21 18 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 5 7 3 9 12 16 13 34 18 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 4 13 7 11 19 16 21 34 32 0 1 0 0 0 0 0 0 0 0 0 1 2 6 4 9 6 20 12 42 52 59 80 119 116 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 2 9 13 21 23 35 37 60 65 0 0 0 0 0 0 1 0 0 0 1 2 0 4 3 10 14 22 18 57 62 88 87 124 142 0 0 0 0 0 0 1 0 0 1 0 0 1 2 4 9 11 27 28 64 68 85 98 151 174 0 0 0 0 0 0 0 0 0 0 2 2 7 7 9 15 37 52 52 139 159 193 240 313 393 0 0 0 0 0 0 0 0 0 0 2 6 5 9 12 27 46 62 66 166 204 274 317 465 499 0 0 0 0 0 0 0 0 1 1 1 4 2 14 15 27 45 88 88 170 246 286 359 501 556 0 0 0 0 0 0 0 1 0 1 0 6 14 10 17 28 44 91 115 219 241 368 393 585 684 0 0 0 0 0 2 1 0 3 4 2 5 12 26 32 64 78 160 191 369 434 663 678 1067 1211 0 0 0 0 0 0 0 0 0 2 4 6 10 8 29 29 57 81 114 255 279 368 411 596 694 0 0 0 0 0 0 0 1 0 5 6 12 12 19 27 68 87 132 146 328 407 584 587 940 970 0 0 0 0 2 0 0 4 2 1 9 8 6 21 34 62 109 133 174 372 492 618 697 1021 1123 0 0 0 0 1 0 1 1 0 5 7 6 12 18 35 75 99 116 188 389 461 605 652 987 1132 0 0 0 0 0 0 0 0 1 0 3 2 3 4 8 7 23 33 42 61 97 151 159 218 243 0.38 -23.836 -1.9 -5.955011 -2.4001 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 2 0 0 3 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 1 1 1 1 2 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 3 4 7 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 2 1 2 5 6 5 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 3 2 5 4 8 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 2 4 8 7 15 12 24 26 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 4 3 7 10 13 21 24 20 0 0 0 0 0 0 0 0 0 0 0 1 1 2 0 5 3 5 9 14 13 29 21 50 50 0 0 0 0 0 0 0 0 0 1 0 1 2 2 1 4 4 10 19 23 25 45 50 75 82 0 0 0 0 0 0 0 0 0 1 1 0 1 2 2 6 6 10 16 29 34 61 72 96 113 0 0 0 0 0 1 0 0 1 0 1 1 2 2 3 5 10 12 39 42 64 85 100 140 166 0 0 0 0 0 0 0 0 1 0 0 3 5 5 4 9 10 23 35 65 70 93 123 202 212 0 0 0 0 0 0 0 0 1 3 1 6 6 8 12 24 39 59 71 126 176 266 306 494 573 0 1 0 0 0 0 0 1 0 0 2 2 2 7 13 28 42 46 100 130 181 243 269 434 507 0 0 0 1 0 0 0 0 0 1 2 7 10 8 16 37 50 97 133 234 283 404 448 746 766 0 0 0 0 0 0 1 1 0 1 5 1 14 14 29 47 56 118 135 259 320 471 486 852 923 0 0 0 1 0 0 0 0 3 3 2 5 13 17 33 40 72 124 160 280 377 546 622 997 1101 0 0 0 0 0 0 0 1 1 5 2 6 15 14 26 48 68 134 164 267 338 472 613 833 957 0 0 0 1 1 2 1 0 2 2 5 10 18 10 30 62 107 175 222 366 523 688 784 1300 1374 0 0 0 0 0 0 0 3 3 1 3 8 12 10 29 58 73 135 178 306 426 553 639 933 1103 0 0 0 0 0 0 0 0 1 1 3 1 3 6 14 19 21 48 54 130 134 214 207 380 388 0.40 -29.984 -1.9 -6.0321896 -2.4001 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 3 3 5 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 5 9 9 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 4 3 10 11 6 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 3 5 9 7 5 12 19 27 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 3 9 12 29 19 41 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 3 7 11 15 39 43 63 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 2 8 11 30 33 50 92 95 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 3 3 5 12 22 34 59 108 119 193 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 2 2 4 18 28 42 73 116 210 213 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 4 8 22 47 75 100 138 211 268 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 4 11 21 34 66 135 209 324 435 575 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 9 14 18 47 80 171 243 372 580 664 0 1 0 0 0 
0 0 0 0 0 0 1 1 1 8 5 20 45 68 141 249 353 567 880 1060 0 0 0 0 0 0 0 0 1 1 1 0 3 3 5 13 26 48 74 171 277 428 633 974 1223 0 0 0 0 0 0 0 0 0 1 0 1 3 2 6 18 26 46 91 186 310 504 761 1157 1409 0 0 0 0 0 0 0 0 0 2 1 0 2 2 6 15 23 57 60 153 282 402 621 924 1122 0 0 0 0 0 0 0 0 0 0 1 2 4 6 5 19 39 47 106 223 421 646 993 1426 1767 0 0 0 0 0 0 0 0 0 0 1 2 3 3 5 11 27 49 88 220 317 532 735 1166 1376 0 0 0 0 0 0 0 0 0 0 0 0 1 1 5 2 11 18 45 57 108 172 253 394 514 0.42 -22.14 -1.9 -6.2625 -2.40003 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 1 1 1 0 4 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 2 3 5 3 6 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 2 0 4 3 3 5 13 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 2 5 6 6 7 9 18 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 2 2 2 8 4 11 22 15 22 35 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 4 11 6 4 11 11 9 24 25 0 0 0 0 0 0 0 0 1 0 0 1 0 0 3 0 1 4 7 15 23 22 32 41 52 0 0 0 0 0 0 0 0 0 0 0 0 2 0 4 3 3 18 21 18 32 51 56 84 107 0 0 0 0 0 0 0 0 0 0 0 1 3 1 4 5 9 22 29 34 42 59 86 100 150 0 0 0 0 0 0 0 0 0 0 1 3 2 5 6 9 14 30 36 33 74 96 116 157 218 0 0 0 0 0 1 0 0 0 0 3 1 2 6 6 9 28 56 64 74 147 188 206 291 364 0 0 0 0 0 1 2 1 1 0 3 3 9 12 23 20 46 85 146 114 225 320 315 566 685 0 0 0 0 0 1 0 0 0 0 1 3 3 13 26 14 30 81 112 124 207 294 305 522 568 0 0 0 0 0 0 0 0 0 1 1 2 9 13 16 28 56 107 173 157 339 402 457 711 887 0 0 0 0 0 1 0 0 2 3 5 10 9 15 41 38 88 169 275 239 519 705 751 1079 1428 0 0 0 0 0 1 0 0 2 2 6 8 10 18 29 37 55 124 187 186 365 467 534 775 1035 0 0 0 0 0 1 0 0 2 2 4 9 10 23 49 59 85 196 280 278 525 697 786 1191 1496 0 1 0 1 0 0 1 1 1 7 6 4 3 19 36 40 66 155 241 210 436 589 622 946 1149 0 0 0 0 0 0 0 0 1 0 1 3 4 7 13 19 26 55 75 71 163 207 208 346 446 0.44 -20.762 -1.9 -6.4775592 
-2.4001 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 2 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 2 0 2 2 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 1 1 0 1 7 2 7 7 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 2 5 5 1 9 8 9 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 1 1 3 7 5 5 9 13 18 8 0 0 0 0 0 0 0 0 0 1 0 0 0 0 3 2 2 5 9 5 11 19 20 33 31 0 0 0 0 0 0 0 0 0 0 0 2 0 2 1 3 4 2 12 10 13 28 17 35 42 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 6 4 10 15 23 21 35 41 65 70 0 0 0 0 0 0 0 0 0 0 1 2 1 7 3 13 11 14 38 38 51 105 87 156 164 0 0 0 0 0 0 1 0 0 0 1 1 3 2 5 8 15 24 31 32 48 104 80 130 149 0 0 0 0 0 0 0 0 2 2 2 0 4 3 7 18 17 21 60 75 88 164 118 245 268 0 0 1 0 0 1 1 0 3 0 5 1 8 13 9 30 46 66 115 120 174 327 254 470 533 0 0 0 1 0 0 0 1 0 5 1 2 5 13 20 26 40 50 103 139 176 310 310 478 531 0 1 0 1 0 0 1 0 0 1 1 3 10 19 18 32 73 104 170 223 289 486 411 781 867 0 0 0 1 1 0 2 1 3 5 8 10 10 12 22 55 97 164 235 290 438 767 613 1182 1186 0 0 0 0 0 1 2 0 2 3 1 6 7 18 31 63 69 107 162 231 279 547 450 815 868 0 1 0 1 0 0 1 2 1 2 5 8 17 25 30 82 106 165 261 309 412 774 626 1141 1263 0 0 1 1 0 0 0 0 2 1 7 11 20 21 29 77 104 179 266 339 413 766 657 1232 1291 0 0 0 0 0 0 1 0 1 0 2 3 6 8 11 28 31 49 82 114 166 277 229 409 446 0.46 -20.656 -1.9 -6.2010264 -2.4001 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 4 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 0 3 5 4 3 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 2 2 2 6 2 5 4 11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 3 7 3 8 7 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 1 1 1 2 6 10 2 5 10 20 19 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 5 6 11 11 26 16 38 56 0 0 0 0 0 0 0 0 0 0 1 0 0 0 2 3 3 5 9 7 18 30 17 33 30 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 3 6 7 10 18 45 42 65 71 0 0 0 0 0 0 0 0 0 0 1 0 2 2 3 6 10 13 28 36 45 84 87 147 178 0 0 0 0 0 0 0 0 0 1 0 0 3 1 5 11 9 15 25 35 42 69 51 116 125 0 0 0 0 0 0 1 0 0 1 0 2 4 5 8 13 15 22 34 38 67 151 94 205 215 0 0 0 0 0 0 0 1 0 2 1 2 8 6 11 23 27 49 82 92 131 249 194 367 399 0 0 0 0 0 0 0 0 0 0 2 4 7 10 16 31 42 68 110 135 186 322 292 579 630 0 0 0 0 0 0 0 0 0 1 2 3 5 6 13 33 40 63 91 120 210 324 277 510 538 0 0 0 0 0 0 0 0 0 1 2 5 4 15 9 44 75 93 126 183 261 439 391 770 779 0 0 0 0 0 0 2 0 0 3 2 8 10 19 25 57 77 144 224 261 394 678 574 1076 1171 0 1 0 1 1 0 2 1 0 2 2 4 7 13 18 39 43 76 154 168 280 466 411 766 885 0 0 0 0 1 1 0 3 2 5 4 17 19 23 33 76 104 145 222 297 406 685 583 1219 1277 0 0 0 0 1 0 0 1 2 2 5 2 7 14 15 61 80 110 192 273 336 553 514 1029 1060 0 0 0 0 0 0 0 0 0 0 2 2 5 7 9 16 23 45 50 83 110 205 192 357 330 0.48 -21.398 -1.9 -6.0444326 -2.4001 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 2 1 1 1 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 0 2 3 5 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 3 4 5 5 8 14 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 4 1 3 2 7 7 13 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 6 3 14 12 15 22 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 2 4 3 9 6 11 19 25 37 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 3 3 11 9 7 18 28 40 33 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 3 6 5 7 11 15 39 39 54 74 0 0 0 0 0 0 0 0 0 0 0 1 0 0 5 3 6 2 17 14 32 52 56 97 94 0 0 0 0 0 0 0 0 0 0 0 0 1 2 1 1 4 13 17 21 44 71 76 122 124 0 0 0 0 0 0 0 0 0 0 0 1 0 1 4 8 7 9 26 39 49 79 113 170 178 0 0 0 0 0 0 0 0 0 0 0 1 1 3 4 9 14 26 31 46 60 97 128 257 263 0 0 0 0 0 0 0 0 0 0 3 0 2 4 6 17 29 41 84 102 114 211 237 464 464 0 0 0 0 0 0 0 0 0 2 0 1 5 6 11 
12 20 41 80 115 147 223 301 480 506 0 0 0 0 0 0 0 0 0 0 1 5 2 3 18 24 45 65 133 163 225 342 441 690 828 0 0 0 0 0 1 0 0 1 2 1 2 7 11 15 27 46 60 138 161 276 419 460 770 849 0 0 0 0 0 0 0 1 0 1 2 4 9 8 9 24 55 76 173 184 316 467 548 928 950 0 0 0 0 0 0 0 0 2 0 1 1 3 7 13 28 38 76 167 201 258 411 466 803 808 0 0 0 0 0 0 0 0 0 1 2 6 6 12 21 40 65 89 210 251 371 602 687 1105 1128 0 0 0 0 0 0 0 0 0 2 1 4 5 10 18 43 68 85 178 208 315 507 579 972 1043 0 1 0 0 0 0 0 0 0 0 0 2 0 2 11 14 18 30 60 63 77 141 182 326 294 0.50 -20.656 -1.9 -6.3148068 -2.4001 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 2 1 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 2 1 2 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 2 1 2 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 0 1 4 5 4 6 7 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 2 2 6 4 7 13 8 21 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 1 4 7 5 9 7 16 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 6 5 6 17 21 23 40 27 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 1 3 4 10 8 8 15 30 36 34 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 4 5 4 10 11 19 42 33 84 68 0 0 0 0 0 0 0 0 0 0 0 0 0 1 6 7 4 10 18 25 37 65 59 128 123 0 0 0 0 0 0 0 0 0 2 1 0 2 1 4 3 15 10 19 35 48 81 63 157 153 0 0 0 0 0 0 1 1 0 0 2 0 1 1 4 4 12 19 29 41 65 92 103 215 236 0 0 0 0 0 0 0 0 0 0 3 0 3 1 2 13 27 28 36 76 102 205 146 359 339 0 0 0 0 0 0 0 2 1 2 0 3 5 7 15 19 35 55 81 124 184 274 300 631 613 0 0 0 0 0 0 0 0 1 2 0 3 6 4 11 21 26 56 83 110 173 289 252 512 529 0 0 0 0 0 0 1 0 1 2 0 3 3 10 16 20 37 77 137 169 263 421 424 802 777 0 0 0 0 0 0 1 0 0 3 3 3 8 12 30 37 62 112 186 251 309 605 543 1117 1152 0 0 0 0 0 2 0 2 2 0 2 3 7 12 24 41 55 84 154 193 301 505 444 889 885 0 1 0 0 1 0 0 1 1 2 3 9 5 11 23 36 60 102 202 252 390 610 532 1131 1149 0 0 0 0 0 0 2 1 0 3 3 1 8 14 10 36 68 108 171 219 309 576 508 957 1023 0 0 0 0 0 0 0 0 0 3 2 1 2 5 8 16 27 34 58 
87 103 197 172 357 332 0.52 -17.582 -1.9 -6.746142 -2.4001 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 4 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 3 1 5 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 2 2 3 3 4 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 3 4 7 6 7 8 0 0 0 0 0 0 0 0 0 0 0 0 1 0 3 0 0 3 4 5 2 7 10 12 14 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 1 6 4 9 11 11 27 11 18 0 0 0 0 0 0 0 1 0 0 0 0 2 1 4 2 1 5 7 12 16 28 33 23 49 0 0 0 0 0 0 0 0 1 1 1 1 3 3 2 3 7 13 14 20 23 25 39 44 61 0 0 0 0 0 0 0 0 0 1 3 2 0 6 3 12 7 17 20 29 49 66 102 77 96 0 0 0 0 0 0 0 0 0 0 0 1 4 4 4 9 14 36 34 38 48 82 122 95 152 0 0 0 0 1 0 1 0 0 1 1 3 4 6 14 17 7 44 40 59 73 130 169 176 212 0 0 0 0 1 0 0 1 1 1 5 2 6 3 12 24 21 67 72 115 119 226 262 276 353 0 1 0 0 0 0 0 0 1 2 5 5 9 18 18 33 21 104 78 168 174 272 389 376 493 0 0 0 0 0 0 0 2 1 2 3 7 5 26 42 32 38 121 128 215 216 372 523 522 653 0 0 0 0 1 1 1 2 1 4 3 10 18 26 32 58 54 167 145 287 272 458 662 561 816 0 0 0 0 0 0 2 1 1 6 6 6 12 25 38 60 57 177 152 285 280 517 666 593 870 0 0 0 1 0 1 1 2 1 2 3 8 22 21 43 80 71 216 154 326 375 632 832 718 1014 0 0 0 0 0 0 1 1 5 8 7 13 15 32 51 75 84 227 208 311 351 593 814 752 1002 0 0 0 0 1 0 0 0 2 0 2 2 6 10 28 27 29 73 77 128 130 228 318 288 364 0.54 -19.066 -1.9 -6.8880124 -2.4001 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 3 0 0 1 2 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 2 1 1 4 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 
0 1 2 1 1 3 6 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 2 3 1 7 9 6 7 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 2 0 1 1 3 4 10 9 14 14 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 3 0 4 3 4 15 12 27 19 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 3 0 1 6 17 14 34 30 56 46 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 5 4 5 5 13 15 29 41 50 66 0 0 0 0 0 0 0 0 1 0 0 1 0 1 2 5 7 14 7 26 24 62 61 101 93 0 0 0 0 0 0 0 0 0 0 0 2 1 6 6 9 12 21 19 53 53 117 118 173 159 0 0 0 0 0 0 0 0 0 0 0 2 3 6 3 8 14 28 28 62 59 134 140 178 186 0 0 0 0 0 0 0 0 0 1 0 1 0 2 8 9 26 43 48 97 97 180 204 302 312 0 0 0 0 0 0 1 1 1 1 2 3 8 11 14 21 45 63 66 201 161 354 393 552 499 0 0 0 0 0 0 1 1 2 1 6 1 3 9 19 30 42 83 98 200 179 345 375 635 584 0 0 0 0 1 0 1 3 1 3 1 3 7 10 26 47 50 91 144 267 219 555 549 838 822 0 0 0 1 1 0 0 0 1 2 1 7 6 18 22 46 45 122 123 287 248 558 546 889 869 0 0 0 0 0 0 1 1 0 3 2 6 7 6 30 55 66 140 154 355 312 625 660 951 962 0 1 0 0 1 0 0 0 2 3 2 7 9 12 14 45 63 138 176 365 324 671 696 1036 994 0 0 0 0 0 0 0 0 1 0 0 1 5 6 7 22 39 40 80 135 125 265 239 422 390 0.56 -16.31 -1.9 -6.052885 -2.40003 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 2 2 1 3 2 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 2 0 2 0 3 0 4 2 6 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 2 0 4 2 3 2 6 4 5 0 0 0 0 0 0 0 0 0 1 1 0 0 0 2 1 1 2 1 3 3 5 8 8 11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 1 0 7 3 4 7 11 4 13 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 4 1 6 6 8 14 11 17 22 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 2 6 12 5 12 22 38 25 25 0 0 0 0 0 0 0 1 0 0 1 0 0 0 5 2 4 3 12 10 17 25 47 44 49 0 0 0 0 0 0 0 1 1 0 0 1 2 1 6 8 6 5 18 20 21 38 44 55 61 0 0 0 0 0 0 0 0 0 2 1 2 3 3 9 9 9 14 34 19 37 52 77 72 86 0 0 0 0 1 0 0 0 0 1 5 0 0 4 13 7 14 19 32 31 57 75 96 83 120 0 0 0 0 0 0 0 0 0 0 1 2 2 7 9 16 23 17 69 36 56 89 142 128 149 0 0 0 0 0 0 0 0 0 2 4 2 2 4 13 14 35 32 64 64 91 148 200 161 179 0 0 0 0 0 0 1 2 1 2 4 5 8 10 19 15 47 48 109 89 138 190 297 284 286 0 0 0 0 0 
0 1 0 2 2 5 5 11 8 28 20 51 38 120 105 133 203 326 278 339 0 0 0 0 0 1 1 2 0 1 2 7 10 12 33 32 58 74 170 145 210 314 477 423 491 0 1 1 0 0 0 2 0 2 5 7 3 9 25 38 40 85 82 212 153 250 346 511 507 578 0 2 2 0 1 1 1 0 1 7 7 5 19 24 50 53 77 97 241 187 261 377 642 516 661 0 0 0 0 0 0 2 4 5 0 5 9 15 27 53 40 65 76 226 145 247 364 558 534 613 0 0 0 2 0 0 0 0 1 3 6 8 14 13 56 53 129 102 253 235 323 446 681 687 757 0 0 0 0 0 1 2 0 1 0 8 9 14 19 44 35 79 88 236 162 267 416 622 596 619 0 0 0 0 0 0 0 0 0 1 4 2 3 7 19 13 34 29 74 61 77 117 149 145 192 0.58 -14.826 -1.9 -6.589262 -2.4001 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 0 0 1 0 2 2 1 5 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 2 1 5 2 1 3 8 4 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 6 2 4 4 11 7 13 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 2 4 5 2 15 6 17 8 10 0 0 0 0 0 0 0 0 0 0 0 0 0 2 4 3 3 4 8 8 17 9 20 11 23 0 0 0 0 0 0 0 0 0 0 1 0 3 2 6 0 6 4 6 18 25 13 27 34 41 0 0 0 0 0 0 0 1 0 0 0 1 2 5 7 4 5 9 14 21 34 9 60 42 43 0 0 0 0 0 0 0 0 0 2 3 1 4 5 2 11 10 11 24 16 50 24 60 55 70 0 0 0 0 0 0 0 2 1 1 1 3 4 10 5 13 16 17 48 53 66 43 127 85 151 0 0 0 0 0 0 1 1 0 1 1 5 2 9 6 19 13 24 49 43 84 64 155 101 143 0 0 0 0 0 1 0 1 1 0 3 6 4 11 11 27 40 25 61 69 153 97 194 139 184 0 0 0 0 1 1 1 4 3 4 3 6 8 16 11 43 49 55 128 97 198 133 331 268 347 0 0 0 0 0 1 0 2 5 4 5 6 8 28 17 41 59 57 154 128 265 152 366 269 400 0 1 0 0 0 1 1 1 4 5 7 20 16 31 24 61 65 97 160 164 353 219 535 368 536 0 0 0 1 0 2 2 3 4 5 9 17 33 58 46 100 84 115 223 231 407 275 656 461 658 0 0 0 0 0 0 1 3 2 11 7 6 27 36 32 78 85 106 215 247 404 253 635 433 573 0 1 1 0 1 3 1 2 2 7 9 18 28 52 40 96 91 122 250 238 462 304 730 518 770 0 0 0 0 1 1 2 3 3 11 10 16 21 42 46 101 89 133 262 267 485 291 704 539 
742 0 1 0 0 1 2 1 2 1 4 1 5 11 19 15 35 22 41 89 99 153 94 255 211 273 0.60 -16.31 -1.9 -6.746142 -2.4001 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 3 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 2 0 0 3 4 3 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 1 5 5 4 4 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 1 2 8 2 6 3 4 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 1 2 2 4 7 6 14 12 16 9 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 1 4 2 13 13 14 21 16 0 0 0 0 0 0 0 0 0 0 0 0 1 1 2 0 4 1 20 4 5 21 26 36 28 0 0 0 0 0 0 0 1 0 0 0 0 0 0 5 3 1 4 16 11 25 35 36 52 53 0 0 0 0 0 0 0 0 0 0 1 0 2 1 5 4 5 13 20 27 15 53 60 77 56 0 0 0 0 0 0 0 0 0 1 0 0 1 6 7 4 11 20 43 37 36 84 128 97 118 0 0 0 1 0 0 1 0 0 0 1 0 2 4 10 5 13 21 53 51 49 88 132 149 140 0 0 0 0 0 0 0 0 0 3 2 2 6 5 5 12 16 26 63 64 65 135 161 194 205 0 0 0 0 1 0 1 0 0 5 6 0 7 13 16 22 28 48 116 94 130 239 300 350 296 0 0 0 0 0 0 0 0 0 1 6 5 6 10 31 17 55 53 162 104 144 257 376 402 374 0 0 0 0 0 1 2 0 2 2 5 7 9 12 27 25 53 82 165 154 193 350 500 601 511 0 1 0 0 0 0 3 3 1 1 5 9 13 24 32 33 92 98 198 143 247 451 567 653 663 0 0 0 0 0 0 1 1 0 2 2 7 15 11 47 38 84 90 244 197 267 463 557 638 592 0 1 0 0 0 0 1 1 0 3 8 7 7 18 44 59 75 109 258 192 287 472 690 774 709 0 0 0 0 0 0 2 0 4 1 11 7 11 16 34 43 73 101 269 214 307 502 681 751 694 0 0 0 0 0 0 0 0 0 2 3 2 5 4 18 17 26 39 113 77 100 189 230 304 233 0.62 -15.78 -1.9 -6.4900142 -2.4001 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 2 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 3 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 2 1 1 1 0 3 3 1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 4 1 1 1 4 5 6 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 
5 3 2 6 3 9 11 0 0 0 0 0 1 0 0 2 0 0 1 0 0 2 1 2 2 1 9 4 8 22 13 8 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 1 2 2 8 5 9 13 8 21 21 0 0 0 0 0 0 0 0 1 0 0 0 2 0 1 2 3 5 5 8 13 22 19 33 19 0 0 0 0 0 0 0 0 1 1 0 0 0 0 2 4 4 11 10 16 17 36 42 54 40 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 7 12 24 17 23 36 38 66 55 0 0 0 0 0 0 1 1 0 0 1 0 0 0 4 6 9 14 24 25 33 55 68 90 76 0 0 0 0 0 0 0 2 0 1 0 3 7 1 12 6 15 32 32 38 71 107 106 181 139 0 0 0 0 0 0 0 0 0 1 2 0 2 3 6 8 21 21 36 49 86 122 117 177 133 0 1 0 0 0 0 2 2 0 0 2 5 4 7 10 10 23 46 51 62 105 144 166 246 208 0 0 0 0 0 0 0 1 1 2 0 3 10 12 19 22 39 69 97 97 155 301 241 414 318 0 0 0 0 0 0 0 0 1 1 2 2 6 8 18 23 40 97 113 107 157 249 276 450 331 0 0 0 0 0 0 1 1 1 4 2 6 10 14 25 42 59 82 161 156 230 404 391 603 502 0 0 1 0 0 1 0 2 2 2 4 6 22 19 26 45 80 115 207 177 280 490 515 730 655 0 0 0 0 0 0 0 1 2 1 7 1 11 17 37 33 58 109 187 147 246 394 399 641 483 0 0 0 1 0 0 1 2 2 0 2 9 14 20 37 42 77 123 233 184 315 511 534 787 640 0 0 0 1 0 0 0 1 1 2 3 2 12 29 24 38 76 121 211 173 251 486 495 744 615 0 0 0 0 0 0 1 0 0 2 2 2 4 9 8 12 22 47 68 58 95 155 184 253 201 0.64 -15.78 -1.9 -6.8520932 -2.40003 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 2 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 0 3 1 0 4 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 7 5 5 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 3 4 5 7 7 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 3 5 3 3 8 9 8 20 17 0 0 0 0 0 0 0 1 0 0 0 0 0 0 2 3 3 2 5 3 8 11 17 27 13 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 3 3 2 10 7 15 29 19 31 15 0 0 0 0 0 0 1 0 0 0 2 0 1 1 2 1 1 9 15 17 27 43 32 68 41 0 0 0 0 0 0 0 0 0 0 0 0 0 2 4 5 7 17 16 23 27 36 53 75 67 0 1 0 0 0 1 0 0 1 0 2 1 5 4 4 7 12 30 33 26 40 82 92 142 90 0 0 0 0 0 0 0 0 1 0 0 0 4 5 5 11 17 27 43 35 68 116 99 195 147 0 0 0 0 0 0 0 0 
0 1 0 3 2 6 11 10 20 31 54 60 72 154 150 256 182 0 0 0 0 0 0 0 0 1 2 0 4 5 9 10 17 31 54 96 70 129 228 224 362 268 0 0 0 0 0 0 2 2 2 2 4 2 8 11 23 23 41 68 115 89 158 300 287 507 354 0 0 0 0 1 0 0 2 3 1 3 5 8 14 19 23 44 78 126 151 215 386 354 615 419 0 0 0 0 0 2 0 1 3 1 0 3 18 16 25 33 61 120 177 160 226 436 428 749 543 0 0 0 0 1 1 1 0 1 1 4 2 12 17 23 37 54 120 201 154 260 476 457 738 565 0 0 0 0 2 1 0 0 1 4 2 2 15 20 19 33 69 136 227 193 294 551 585 912 692 0 1 0 1 0 0 0 2 3 2 8 3 9 13 39 37 68 118 214 189 293 530 505 813 645 0 0 1 0 0 0 1 0 1 1 2 2 2 3 13 9 23 48 68 70 101 204 194 309 229 0.66 -15.568 -1.9 -7.0826114 -2.40003 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 3 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 2 0 1 1 1 2 4 4 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 1 2 2 8 8 5 8 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 0 0 0 4 3 5 8 13 17 12 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 8 5 4 9 22 17 15 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 5 6 7 17 15 31 26 25 0 0 0 0 0 0 0 0 0 0 0 0 1 4 3 1 3 5 10 11 18 28 47 54 40 0 0 0 0 0 0 0 0 0 0 0 2 1 0 1 5 13 7 18 14 30 46 71 57 61 0 0 0 0 0 0 0 0 0 0 0 0 0 2 3 6 9 8 34 20 39 63 90 85 74 0 0 0 0 0 0 0 0 0 0 2 1 3 3 5 13 25 14 48 32 64 91 143 142 152 0 0 0 0 0 0 1 1 0 0 3 1 0 3 3 8 24 25 75 55 93 121 188 199 167 0 0 0 0 1 0 1 0 0 3 2 4 2 13 10 16 45 29 75 70 122 181 287 255 232 0 0 0 0 0 0 0 0 1 0 2 1 6 13 16 24 61 43 116 98 208 289 477 468 376 0 1 0 0 0 0 0 0 1 1 3 1 9 8 19 19 56 50 142 108 223 303 468 472 410 0 1 0 0 0 0 0 1 0 2 4 5 13 9 20 31 96 68 183 140 306 415 589 627 546 0 0 1 1 1 1 1 0 1 4 2 11 14 14 24 32 94 91 198 166 353 460 649 699 600 0 0 0 1 0 1 0 2 1 1 5 5 25 14 34 33 117 112 240 194 388 499 779 714 610 0 0 0 0 0 0 0 0 2 3 1 4 11 15 43 40 
113 90 217 201 334 510 785 741 619 0 0 0 0 0 0 0 1 3 1 0 2 7 5 17 13 43 35 83 59 149 216 336 282 252 0.68 -13.554 -1.9 -7.2722094 -2.4001 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 3 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 4 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 2 1 2 5 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 2 3 3 3 6 8 6 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 2 0 2 1 0 12 4 4 13 10 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 2 1 4 2 9 10 11 14 17 11 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 2 5 12 8 10 23 13 18 26 14 0 0 0 0 0 0 0 0 0 1 0 0 1 0 4 2 5 10 11 12 33 20 27 42 9 0 0 0 0 0 1 0 1 0 0 1 3 4 6 1 12 3 10 18 22 54 26 56 79 49 0 0 0 0 0 0 0 0 1 0 0 0 0 4 1 15 8 22 23 22 63 33 69 97 51 0 0 0 0 0 0 0 0 1 0 0 3 4 11 14 23 20 40 36 41 96 80 100 168 93 0 1 0 0 0 1 0 0 2 1 5 4 4 6 14 20 15 49 36 55 115 110 138 218 104 0 0 0 0 1 1 0 1 3 4 5 2 6 10 22 36 31 71 65 79 157 133 218 325 196 0 0 1 0 0 0 1 0 3 0 4 7 9 19 27 48 25 102 90 129 328 237 321 487 285 0 0 2 0 0 0 1 1 2 2 4 6 14 18 25 45 52 129 124 155 298 275 365 538 280 0 0 1 1 0 1 1 0 5 2 9 10 14 25 47 59 74 172 166 184 432 352 518 747 442 0 0 0 0 1 1 0 4 7 8 10 10 28 26 35 87 75 189 193 246 466 393 587 861 460 0 0 0 1 0 3 3 3 0 4 5 13 13 36 53 79 73 203 178 203 478 384 615 898 465 0 1 0 0 0 1 2 6 3 10 12 14 22 30 62 81 77 205 190 234 532 383 653 890 480 0 0 0 0 0 0 1 2 3 1 0 4 7 11 17 24 28 70 87 103 213 169 251 333 170 0.70 -14.72 -1.9 -6.4828486 -2.4001 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 2 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 2 4 3 3 0 0 0 0 0 0 0 0 0 0 0 0 1 0 2 0 0 1 2 1 4 2 7 12 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 3 0 4 
4 5 7 7 11 4 0 0 0 0 0 0 0 0 0 0 0 1 0 2 1 3 2 3 4 2 11 11 15 16 11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 3 2 8 8 16 6 20 29 23 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 1 0 8 12 9 15 17 23 36 20 0 0 0 0 0 0 0 0 0 0 0 1 2 1 3 1 3 4 6 10 21 13 32 65 36 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 6 6 6 19 21 44 31 63 77 65 0 0 0 0 0 0 0 0 0 0 0 1 0 3 5 1 4 17 12 21 52 36 84 88 65 0 0 0 0 0 0 0 1 1 1 2 2 2 3 1 9 6 11 35 32 95 63 96 126 75 0 0 0 0 0 0 0 0 0 0 1 1 1 7 7 7 15 27 47 56 117 90 141 194 130 0 0 0 0 0 0 0 1 0 0 1 2 1 2 8 16 17 27 48 45 141 81 194 234 161 0 1 0 0 0 1 0 0 1 1 1 3 10 7 12 19 21 33 86 69 200 119 252 336 263 0 0 0 0 0 0 1 0 4 3 1 7 6 11 14 21 43 62 105 112 268 209 331 470 314 0 0 0 0 0 0 0 3 2 1 1 4 5 4 21 37 44 54 119 130 301 223 363 459 326 0 0 0 0 0 0 0 0 0 4 3 6 12 8 26 19 41 90 159 150 402 290 493 664 454 0 0 0 0 0 1 0 2 3 5 4 8 6 15 25 32 60 107 185 193 505 328 583 785 545 0 0 0 0 0 2 2 0 3 1 4 2 7 11 22 40 63 71 142 142 410 271 530 639 466 0 0 0 0 0 1 2 1 2 3 2 5 11 20 30 46 72 100 177 200 487 357 640 791 566 0 0 0 0 0 0 0 0 1 0 4 6 6 16 19 42 64 85 177 174 463 304 578 729 472 0 0 0 0 0 0 0 0 1 1 2 0 1 8 4 10 29 27 67 61 146 112 170 223 154 0.72 -13.448 -1.9 -7.0476766 -2.40016 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 5 4 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 3 1 2 0 2 7 2 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 3 2 3 5 7 9 4 0 0 0 0 0 0 0 0 0 0 0 1 0 0 2 1 1 8 2 3 5 8 8 19 6 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 4 7 2 10 11 12 31 11 0 0 1 0 0 0 0 0 0 0 0 0 1 3 2 2 5 4 8 10 21 13 24 35 26 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 5 3 4 9 9 20 16 26 57 24 0 0 0 0 0 0 0 0 0 1 2 1 4 2 3 2 6 12 12 19 47 40 46 80 35 0 0 0 0 0 0 2 0 2 0 6 0 3 6 7 5 13 17 13 19 58 49 64 112 47 0 0 0 0 1 0 0 0 1 0 0 1 5 3 4 9 17 27 32 37 84 73 101 151 71 0 0 0 0 0 0 0 
0 0 0 5 0 1 8 14 18 26 37 50 51 128 108 116 275 132 0 0 0 0 0 0 0 0 1 3 1 6 10 11 12 21 26 48 57 90 159 120 175 320 153 0 0 1 0 0 0 0 1 0 3 5 2 6 12 19 29 28 62 99 84 253 171 249 408 192 0 0 0 0 0 2 0 1 1 3 4 9 15 14 27 43 59 108 134 126 322 270 394 661 314 0 0 0 0 0 0 0 3 3 0 6 10 12 21 26 44 64 95 137 120 379 276 393 687 328 0 0 0 0 0 0 3 1 5 0 5 11 14 23 29 54 71 149 177 202 421 365 554 919 408 0 0 0 0 1 1 1 2 2 5 6 11 9 27 28 57 79 128 177 203 493 405 522 909 420 0 1 0 0 1 1 1 3 2 6 7 12 16 25 35 80 93 158 208 191 537 427 601 1051 463 0 0 0 0 2 2 2 2 6 7 9 11 23 19 39 67 71 159 185 207 511 406 596 990 442 0 0 0 0 0 0 1 0 3 1 3 5 12 5 18 35 30 50 68 74 184 161 198 404 190 0.74 -15.144 -1.9 -6.8880124 -2.4001 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 4 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 3 0 8 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 3 1 3 8 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 1 1 3 9 1 13 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 5 2 9 8 20 11 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 3 4 3 19 9 22 11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 3 0 3 7 9 27 25 41 20 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 3 6 7 6 13 7 29 29 45 32 0 0 0 0 0 0 0 1 0 0 0 2 0 1 1 3 3 4 17 18 15 53 45 81 36 0 0 0 0 0 0 0 0 0 1 0 0 1 2 7 6 8 14 22 26 25 85 67 126 82 0 0 0 0 0 0 0 1 1 0 2 1 1 1 8 2 6 8 19 30 39 97 88 153 94 0 0 0 0 0 0 0 0 0 0 0 0 0 3 7 5 12 16 37 46 49 174 135 196 127 0 0 0 0 0 0 0 0 0 1 1 2 1 1 6 11 20 20 57 54 66 207 202 330 135 0 0 0 0 0 0 0 0 0 2 0 1 1 0 6 23 21 27 49 76 99 271 207 396 206 0 1 0 0 0 1 0 0 1 1 1 2 4 10 12 20 31 44 61 125 142 354 307 530 290 0 0 0 0 0 1 1 0 0 1 2 4 3 7 27 29 25 54 121 125 153 480 437 639 377 0 0 0 0 0 1 0 1 0 2 3 6 9 8 15 34 52 68 122 143 193 607 491 831 456 0 0 0 0 0 0 0 1 1 4 3 6 8 15 26 43 59 69 155 173 239 683 595 983 578 0 0 0 0 0 0 0 2 2 3 4 6 7 11 25 50 52 87 153 203 239 676 635 969 492 0 0 0 
0 1 2 0 0 0 0 1 7 7 12 31 35 79 97 168 215 266 789 611 1069 569 0 0 0 0 0 0 0 1 0 0 7 6 4 6 30 33 52 78 167 192 239 712 660 1017 531 0 0 0 0 0 0 0 1 0 0 1 2 0 6 10 17 26 25 44 73 100 261 217 368 183 0.76 -15.886 -1.9 -6.8880124 -2.4001 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 2 1 2 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 2 5 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 3 3 11 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 3 2 5 4 16 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 2 5 6 1 15 14 27 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 2 3 6 9 14 11 31 14 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 4 3 6 10 10 21 19 63 19 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 6 2 2 5 17 12 45 25 77 34 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 2 4 6 15 15 19 55 49 108 52 0 0 0 0 0 0 0 0 0 1 0 0 2 1 2 3 3 7 23 22 28 76 63 178 72 0 0 0 0 0 0 0 0 0 0 0 0 3 5 1 6 2 7 34 32 35 90 70 208 101 0 0 0 0 0 1 0 0 0 0 0 2 2 2 5 7 9 18 24 50 61 171 104 335 138 0 0 0 0 0 0 0 0 0 0 1 2 2 1 4 10 13 25 62 65 69 187 178 471 190 0 0 0 0 0 0 0 0 0 0 0 0 3 2 7 15 12 29 69 74 83 224 208 571 237 0 0 0 0 1 0 0 0 0 1 0 2 3 4 13 11 17 31 87 125 122 339 249 746 329 0 0 0 0 0 0 0 0 0 0 1 1 3 5 17 16 28 50 99 141 143 421 329 1007 449 0 0 0 0 0 0 0 0 1 0 1 2 4 9 19 19 23 46 152 173 172 504 382 1192 498 0 0 0 0 0 0 1 0 1 1 0 6 7 10 16 30 42 65 176 199 233 559 441 1404 523 0 0 0 0 1 0 1 0 1 1 3 3 5 14 12 20 34 61 163 204 226 616 499 1383 567 0 1 0 0 0 0 0 1 2 2 2 4 3 14 20 22 53 80 202 219 250 701 507 1586 636 0 0 0 0 1 0 0 1 1 0 4 3 4 8 20 22 45 62 172 234 230 599 488 1503 548 0 0 0 0 0 1 1 1 0 0 1 2 2 7 6 8 17 29 57 71 85 210 179 522 194 0.78 -14.296 -1.9 -7.2722094 -2.4001 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 
2 6 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 4 1 1 4 4 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 4 1 5 3 9 4 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 3 3 6 8 6 20 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 3 6 10 8 16 21 7 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 2 4 5 5 13 15 18 37 16 0 0 0 0 0 0 0 0 0 1 1 0 1 0 2 2 1 2 14 12 16 26 23 52 13 0 0 0 0 0 0 0 1 0 0 0 0 1 2 1 2 3 12 13 22 35 26 37 73 33 0 0 0 0 0 0 0 0 1 0 0 1 3 0 1 1 6 14 21 27 44 56 63 127 58 0 0 0 0 0 0 1 0 2 1 0 1 3 0 2 7 3 31 23 34 81 72 72 197 93 0 0 0 0 0 1 0 0 0 0 0 1 2 0 0 7 7 22 50 49 110 100 118 249 98 0 0 0 0 0 0 0 0 0 1 3 5 1 2 4 12 11 41 54 75 134 132 179 383 167 0 0 0 1 0 0 0 0 0 3 1 1 4 8 9 15 20 46 79 75 188 150 217 490 177 0 0 0 0 0 0 0 0 0 1 1 3 9 11 10 13 29 54 108 109 241 233 319 625 263 0 1 0 0 1 0 0 1 2 0 3 3 4 6 22 23 37 95 152 157 366 388 404 987 385 0 0 0 0 1 1 1 0 4 2 3 5 7 11 13 18 49 117 161 193 420 392 490 1044 426 0 0 1 0 0 1 0 0 0 4 3 3 11 12 24 37 55 149 225 254 550 460 629 1307 530 0 1 0 0 0 1 1 1 3 3 1 7 11 21 30 38 72 146 205 274 604 571 692 1536 558 0 0 0 0 0 0 0 0 2 7 4 7 8 17 22 37 75 159 245 271 630 569 686 1551 563 0 0 0 1 0 0 0 0 5 0 2 4 14 18 24 41 70 155 223 280 605 555 682 1487 497 0 0 0 0 0 0 0 0 0 3 0 2 4 8 6 17 22 58 79 109 219 209 258 613 181 0.80 -18.006 -1.9 -7.2237108 -2.40016 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 3 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 2 2 6 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 3 5 4 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 0 1 8 5 11 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 4 1 6 9 18 9 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 2 0 2 2 3 13 23 40 23 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 2 4 7 11 20 28 54 24 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 2 2 4 5 10 32 40 92 47 0 0 0 0 0 0 0 0 0 0 0 0 2 0 1 3 3 1 3 13 23 46 51 109 70 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 4 5 11 9 40 68 97 166 124 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 
3 4 8 23 27 51 113 134 257 178 0 0 0 0 0 0 0 1 0 0 0 1 0 2 2 3 7 5 36 41 50 124 169 324 217 0 0 0 0 0 0 0 0 0 0 0 2 2 1 5 4 4 9 31 57 93 254 223 592 328 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 13 14 55 46 105 244 304 599 366 0 0 0 0 0 0 0 0 0 0 0 3 2 5 3 5 13 24 62 76 128 355 400 892 483 0 0 0 0 0 0 0 1 0 0 0 3 1 1 3 6 24 29 80 95 221 503 548 1255 717 0 0 0 0 0 0 0 0 0 0 0 1 2 2 12 16 19 31 106 139 204 557 634 1417 770 0 0 0 0 0 1 0 0 0 0 0 1 3 9 5 21 42 27 120 149 247 698 757 1699 971 0 1 0 0 0 0 0 0 0 0 1 1 1 6 9 13 34 39 132 170 306 701 792 1850 949 0 0 0 0 0 0 0 0 0 0 1 0 0 5 15 16 35 53 135 173 299 810 813 1911 1007 0 0 0 0 0 0 0 0 0 2 0 1 1 7 8 17 33 44 133 163 301 731 811 1797 964 0 0 0 0 0 0 0 0 0 0 0 3 1 1 3 4 8 16 42 65 120 267 305 628 347 0.82 -13.766 -1.9 -7.2237186 -2.40003 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 2 2 2 4 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 1 4 2 5 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 1 1 2 3 8 2 3 13 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 2 4 4 8 7 12 5 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 3 1 3 4 8 10 18 31 13 0 0 0 0 0 0 0 0 0 0 0 1 1 2 0 1 4 2 2 8 10 22 15 43 18 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 4 6 6 10 11 20 35 28 78 33 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 5 7 11 10 13 28 34 52 85 36 0 0 0 0 0 1 0 0 0 0 0 1 1 3 4 5 4 12 18 20 44 64 51 154 53 0 0 0 0 0 0 0 0 0 0 1 1 1 1 5 12 9 16 18 24 66 87 87 215 67 0 0 0 0 0 0 0 0 1 0 1 4 2 3 6 10 19 22 30 52 97 110 103 350 118 0 0 0 0 0 0 0 0 0 1 1 2 5 4 4 17 13 26 48 64 129 136 149 397 141 0 0 0 0 1 0 0 0 0 0 2 7 3 9 14 16 36 38 84 110 177 226 223 613 211 0 0 0 0 0 0 0 1 0 2 1 2 6 9 16 23 30 45 98 134 244 292 298 742 271 0 0 0 0 0 2 0 1 2 3 1 3 8 10 14 33 52 54 130 159 298 384 362 1004 365 0 0 0 0 0 1 2 1 2 5 1 9 10 12 30 33 56 82 167 236 402 503 525 1382 464 0 0 0 0 0 0 1 1 4 2 3 8 11 12 31 70 88 86 200 261 440 567 518 1643 479 0 0 0 0 0 0 0 0 2 4 5 8 9 17 41 60 95 
115 227 255 573 676 663 1832 683 0 0 0 0 0 0 0 0 1 4 3 6 22 13 43 62 90 133 274 280 603 710 704 2065 612 0 1 0 0 0 0 2 3 3 2 7 10 21 11 43 72 89 121 258 357 594 718 702 2024 614 0 0 0 0 0 1 1 0 4 1 7 14 14 15 31 77 97 138 230 287 564 688 699 1918 648 0 0 0 0 0 0 0 0 2 2 0 1 5 4 27 28 43 50 93 112 185 272 282 717 200 0.84 -15.144 -1.9 -7.3530492 -2.40003 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 6 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 3 3 5 7 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 2 3 2 4 1 15 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 4 7 9 8 17 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 2 1 2 6 3 16 16 48 19 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 3 4 9 7 33 21 68 19 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 3 4 9 8 15 48 34 76 46 0 0 0 0 0 0 0 0 0 0 0 1 0 0 2 3 4 6 9 16 17 58 44 121 59 0 0 0 0 0 0 0 0 0 0 0 0 2 2 7 2 6 10 13 17 26 96 71 184 68 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 6 10 16 22 30 31 128 95 259 109 0 0 0 0 0 0 0 0 0 1 1 1 1 4 6 4 14 20 40 43 62 203 156 394 124 0 0 0 0 0 0 0 0 0 0 1 1 4 3 6 14 17 26 49 61 95 253 164 560 211 0 0 0 0 0 0 0 0 0 0 0 3 2 4 10 15 20 42 54 70 110 376 254 689 294 0 0 0 0 0 0 0 0 1 0 1 1 5 7 18 22 17 45 60 100 204 497 331 838 338 0 0 0 0 1 0 2 0 2 0 2 2 3 8 14 23 35 54 97 118 203 573 404 1207 470 0 0 0 0 0 0 0 0 1 0 3 3 5 4 15 35 60 83 120 160 311 819 597 1624 627 0 1 0 0 1 0 0 1 0 1 4 5 7 11 16 34 53 93 114 225 337 938 616 1857 688 0 0 0 0 1 0 1 0 0 0 4 8 9 13 22 53 75 93 168 265 368 1145 791 2260 806 0 0 0 0 0 0 0 2 0 2 8 4 9 14 33 70 64 109 202 259 440 1225 891 2412 858 0 0 0 0 0 0 0 0 0 4 4 11 10 11 42 50 62 115 194 254 433 1269 861 2520 832 0 0 0 0 0 1 1 0 1 2 2 6 11 10 32 50 64 116 181 258 378 1165 837 2355 793 0 0 0 0 0 0 0 2 1 1 0 3 3 3 9 14 22 49 69 96 167 470 283 822 302 -1 --gc=-2.3 --au=-0.9 --gu=1.3 --mm=3.5 --gap=6 --max-len=59 --min-stem=4 --max-loop=13 --min-loop=3 --uwin-length=6 
--uwin-require=3 --max-hp-score=-2 --max-tail-score=-2.5 --loop-penalty=1,2,3,4,5,6,7,8,9,10,11 --start-cut=0 --end-cut=25 transterm_hp_v2.09/make_expterm.py0000775000265600020320000001277311514142021016607 0ustar tilleaadmin#!/usr/bin/env python import sys def make_zero_matrix(n): A = [] for i in range(0, n): A.append([0] * n) return A def dxdy(at, num_bins): dx = (max_hp[at] - min_hp[at]) / float(num_bins) dy = (max_tail[at] - min_tail[at]) / float(num_bins) return dx, dy def set_ranges_from_file(termfile, num_bins): global min_hp, max_hp, min_tail, max_tail min_hp = {} max_hp = {} min_tail = {} max_tail = {} infile = open(termfile) for line in infile: s = line[:-1].split() at, hp, tail = s[0], float(s[1]), float(s[2]) if at not in min_hp or hp < min_hp[at]: min_hp[at] = hp if at not in max_hp or hp > max_hp[at]: max_hp[at] = hp if at not in min_tail or tail < min_tail[at]: min_tail[at] = tail if at not in max_tail or tail > max_tail[at]: max_tail[at] = tail # add in a buffer in the range = to the size of 1 bin on the low end # and 0.1 on the high end (the later is for numerical problems) for at in min_hp: # NOTE: b/c we change the ranges here, this dx dy is only an estimate... 
dx, dy = dxdy(at, num_bins) min_hp[at] -= 1.5*dx min_tail[at] -= 1.5*dy max_hp[at] += 0.1 max_tail[at] += 0.1 infile.close() def hist2d_from_file(termfile, num_bins): # hp_range, tail_range): # compute the bin sizes #dx = (hp_range[1] - hp_range[0]) / float(num_bins) #dy = (tail_range[1] - tail_range[0]) / float(num_bins) D = {} infile = open(termfile) for line in infile: s = line[:-1].split() at, hp, tail = s[0], float(s[1]), float(s[2]) # if warn_if_out_of_range(hp, hp_range, "hairpin") or \ # warn_if_out_of_range(tail, tail_range, "tail"): # continue if at not in D: D[at] = make_zero_matrix(num_bins) dx, dy = dxdy(at, num_bins) i = int((hp - min_hp[at])/dx) j = int((tail - min_tail[at])/dy) if i == 0 or j == 0: print >> sys.stderr, at, i, j, hp, tail, min_hp[at], min_tail[at], dx, dy if not (0 <= i < num_bins and 0 <= j < num_bins): print >> sys.stderr, "WARNING: out of range values:", i, j, at, hp, tail print >> sys.stderr, "Ranges=", min_hp[at], max_hp[at], min_tail[at], max_tail[at] continue D[at][i][j] += 1 infile.close() return D warned = {} def warn_if_out_of_range(value, rng, title): if value <= rng[0] or value >= rng[1]: if title not in warned: print >> sys.stderr, "@" * 60 print >> sys.stderr, """WARNING: random %s generated with energy lower than supplied range. Such examples are ignored. 
I suggest you re-run calibrate.sh after changing the lowerbound in the range variable therein.""" % (title) print >> sys.stderr, "Range = ", rng, "Seen = ", value print >> sys.stderr, "@" * 60 warned[title] = True return True return False def print_matrix(A): """Write out the matrix (transposed)""" for j in range(len(A)): for i in range(len(A)): print A[i][j], print def main(): # read the input infile = sys.argv[1] seqlen = int(sys.argv[2]) num_bins = int(sys.argv[3]) #hp_range = (float(sys.argv[4]), float(sys.argv[5])) #tail_range = (float(sys.argv[6]), float(sys.argv[7])) # print the header print seqlen, num_bins # hp_range[0], hp_range[1], tail_range[0], tail_range[1] set_ranges_from_file(infile, num_bins) D = hist2d_from_file(infile, num_bins) #, hp_range, tail_range) # for every at value, compute and print the matrix for at in sorted(D): print at, min_hp[at], max_hp[at], min_tail[at], max_tail[at] print_matrix(D[at]) if __name__ == '__main__': main() def read_random_terms(infile, at): """Returns a 3-tuple: (D, HPRange, TailRange), where D is a dict maping %at values to a list of (hp, tail) tuples, and the ranges are pairs giving the (min, max) seen""" L = [] inf=open(infile) first = True for line in inf: line = line[:-1] at, hp, tail = line.split() at, hp, tail = float(at), float(hp), float(tail) if at not in D: D[at] = [] D[at].append((hp, tail)) # track the global ranges of the hp and tail scores if first: minhp = maxhp = hp mintail = maxtail = tail first = False else: minhp, maxhp = min(minhp, hp), max(maxhp, hp) mintail, maxtail = min(mintail, tail), max(maxtail, tail) inf.close() return (D, (minhp, maxhp), (mintail, maxtail)) def hist2d(terms, num_bins, hp_range, tail_range): """Build a matrix that is the 2d histogram""" # compute the bin sizes dx = (hp_range[1] - hp_range[0]) / float(num_bins) dy = (tail_range[1] - tail_range[0]) / float(num_bins) # make a num_bins by num_bins zero matrix A = [] for i in range(0,num_bins): A.append([0] * num_bins) # fill 
in the matrix total = 0 for i in range(0, num_bins): hp_slice = [(hp, tail) for (hp, tail) in terms if hp >= i * dx + hp_range[0] and hp < (i+1)*dx + hp_range[0]] for j in range(0, num_bins): A[i][j] = len([1 for (hp, tail) in hp_slice if tail >= j * dy + tail_range[0] and tail < (j+1)*dy + tail_range[0]]) total += A[i][j] return A transterm_hp_v2.09/map-output.h0000664000265600020320000000062511514142021016026 0ustar tilleaadmin/* This file is part of TransTerm v2.0 BETA and is covered by the GNU GPL * License version 2.0. See file LICENSE.txt for more details. */ #ifndef MAP_OUTPUT_H #define MAP_OUTPUT_H #include #include "seq.h" #include "conf.h" void output_map(ostream &, const Genome &, Confidence &, int=90, bool=true, bool=true); void output_best_term(ostream &, const Confidence &, const Genome &); #endif transterm_hp_v2.09/gene-reader.h0000664000265600020320000000211511514142021016065 0ustar tilleaadmin/* This file is part of TransTerm v2.0 BETA and is covered by the GNU GPL * License version 2.0. See file LICENSE.txt for more details. 
*/ #ifndef GENE_READER_H #define GENE_READER_H #include #include #include #include "seq.h" using namespace std; // abstract base class of all classes that can read annotation files class GeneReader { public: virtual bool read_genes(Genome &) = 0; virtual ~GeneReader() {}; virtual bool good() = 0; }; // read a .coords file from TIGR CMR class CoordsReader : public GeneReader { public: CoordsReader(const string &); virtual ~CoordsReader() {} bool read_genes(Genome &); bool good() { return _in.good(); } private: ifstream _in; }; // read a .ptt file from GenBank class PTTReader : public GeneReader { public: PTTReader(const string &); virtual ~PTTReader() {} bool read_genes(Genome &); bool good() { return _in.good(); } private: ifstream _in; string _id; }; // return the correct reader class given a filename GeneReader * gene_reader_factory(const string &); #endif transterm_hp_v2.09/RELEASE-NOTES.txt0000664000265600020320000000115111514142021016244 0ustar tilleaadminVersion 2.07 (Released on Jan. 21, 2008) - Fixed bug that causes -max-loop option to not be recognized - Added format of the .bag file to USAGE.txt - Added description of "hack" to handle non-annotated sequence to USAGE.txt Release history not available for versions 2.01 through 2.06 Version 2.0 BETA - first released version of C++ rewrite - new confidence scheme - more informative output - better handling of overlapping genes and terminators - over 10x faster - more general model for gaps in hairpins Known issues: - finding antiterminators does not work transterm_hp_v2.09/util.h0000664000265600020320000000073711514142021014674 0ustar tilleaadmin/* This file is part of TransTerm v2.0 BETA and is covered by the GNU GPL * License version 2.0. See file LICENSE.txt for more details. 
*/ #ifndef UTIL_H #define UTIL_H #include #include #include #include #include using namespace std; void print_status(ostream &, unsigned long, unsigned long); void split(const string &, char, vector &); string center(const string &, int); string trim_front(const string &); #endif transterm_hp_v2.09/calibrate.sh0000775000265600020320000000636211514142021016033 0ustar tilleaadmin#!/bin/sh #============================= # Global Constants #============================= #HP_RANGE="-30 -2" # range of possible hp scores #TAIL_RANGE="-7.15 -2" # range of possible tail scoress NUM_BINS=25 # number of bins in the histogram SAMPLESIZE=20000000 # length of random dna sequence to generate #HP_RANGE="-25 -2" # range of possible hp scores #TAIL_RANGE="-7 -2" # range of possible tail scoress #NUM_BINS=10 # number of bins in the histogram #SAMPLESIZE=200000 # length of random dna sequence to generate # values for %AT that we generate data for: AT="0.26 0.28 0.30 0.32 0.34 0.36 0.38" AT="$AT 0.40 0.42 0.44 0.46 0.48 0.50 0.52 0.54 0.56 0.58" AT="$AT 0.60 0.62 0.64 0.66 0.68 0.70 0.72 0.74 0.76 0.78" AT="$AT 0.80 0.82 0.84" #============================= # Process commandline options #============================= if [ ${#} -lt 1 ] ; then echo "usage: `basename $0` outputfile.dat [transterm options]" > /dev/stderr exit 1 fi output=$1 shift #=========================== # Find helper scripts #=========================== findprog() { if [ -x "./$1" ] ; then echo "./$1" elif [ -x "`which $1`" ] ; then echo "$1" else echo "`basename $0`: $1 must be in the current directory or on your PATH." > /dev/stderr exit 2 fi } RAND=`findprog random_fasta.py` TRANSTERM=`findprog transterm` MAKE_EXPTERM=`findprog make_expterm.py` #============================== # Ensure python is avialable #============================== if [ ! 
-x "`which python`" ] ; then echo "`basename $0`: python must be installed to run calibration scripts" > /dev/stderr exit 3 fi #========================================= # Generate and calibrate on random data #========================================= rm -f random_terms.dat echo "NOTE: warnings about 'using version 1.0 confidence' are expected and OK." > /dev/stderr echo "TransTerm Options = " $* $TRANSTERM $* --v1-conf -S -c 0 --all-context /dev/null /dev/null > /dev/stderr for at in $AT ; do echo "`basename $0`: Running TransTerm on random sequence with %AT = $at" > /dev/stderr #================================ # Generate random DNA data #================================ $RAND $at $SAMPLESIZE "random$at.fasta" "random$at.coords" #================================================== # Run Transterm on that random sequence # output all terminators regardless of confidence # in a striped down format #================================================== $TRANSTERM $* --v1-conf -S -c 0 "random$at.fasta" "random$at.coords" \ | awk '/TERM/ { print at, $9, $10 }' at=$at >> random_terms.dat #================================================= # Clean everything up #================================================= rm random$at.coords random$at.fasta done #=================================== # Make the actual expterms.dat file #=================================== echo "Creating distribution file in $output" > /dev/stderr $MAKE_EXPTERM random_terms.dat $SAMPLESIZE $NUM_BINS > $output echo "-1" >> $output $TRANSTERM $* -S -c 0 /dev/null /dev/null | grep -- "--" >> $output echo "Done. You can run transterm with this background distribution using '-r $output'" transterm_hp_v2.09/analysis.h0000664000265600020320000000057611514142021015543 0ustar tilleaadmin/* This file is part of TransTerm v2.0 BETA and is covered by the GNU GPL * License version 2.0. See file LICENSE.txt for more details. 
*/ #ifndef ANALYSIS_H #define ANALYSIS_H void plot_tthits_vs_terms(ostream &, Confidence &, Genome &); void t2t_hitanal(ostream &, const Genome &, Confidence &, int, bool ); unsigned count_starts_in_genes(const Seq &, Direction dir); #endif transterm_hp_v2.09/map-output.cc0000664000265600020320000002605311514142021016167 0ustar tilleaadmin/* This file is part of TransTerm v2.0 BETA and is covered by the GNU GPL * License version 2.0. See file LICENSE.txt for more details. */ #include #include #include #include #include "map-output.h" #include "conf.h" #include "seq.h" #include "util.h" #include "transterm.h" void output_header(ostream & out) { out << endl; out << "Each terminator entry starts in column 3 and is of the form:" << endl; out << left << setw(11) << " term #" << " " << right << setw(8) << "start" << " - " << left << setw(8) << "end" << " " << "+/-" << " " << " region"; out << right << setw(4) << "conf" << " " << setw(4) << "hp" << " " << setw(8) << "tail" << " | notes"; out << endl; out << "Followed by the sequence of the 5' tail, 5' stem, loop, 3' stem, and 3' tail." << endl; out << "Genes are interspersed, and start the first column." 
<< endl; out << endl << endl; } // output a description of the gene void output_gene(ostream & out, const Region & gene) { out << left << setw(12) << gene.name << " " << right << setw(8) << seqindex(*gene.seq, gene.start) << " - " << left << setw(8) << seqindex(*gene.seq, gene.end) << " " << dir_str(gene.dir()) << " | " << gene.desc << endl; } // output the info about a terminator void output_terminator( ostream & out, const Term & term, int conf, int count, bool print_seq, bool in_fwd_gene, bool in_rvs_gene, bool int2t, bool inh2tforward, bool inh2treverse, bool inh2h) { //if(print_seq) out << endl; ostringstream oss; oss << " TERM " << count; SeqPtr ll, rr; if(term.dir() == FORWARD) { ll = term.left(); rr = term.right(); } else { ll = term.right(); rr = term.left(); } // output the type of region we are considered to be in string loc = ""; if((in_fwd_gene && term.dir() == FORWARD) || (in_rvs_gene && term.dir() == REVERSE)) loc += 'G'; // in a gene, coding strand if((in_fwd_gene && term.dir() == REVERSE) || (in_rvs_gene && term.dir() == FORWARD)) loc += 'g'; // in a gene, noncoding if(int2t) loc += 'T'; if(inh2tforward) loc += ((term.dir() == FORWARD)?'F':'f'); if(inh2treverse) loc += ((term.dir() == REVERSE)?'R':'r'); if(inh2h) loc += "H"; if(loc == "") loc = "N"; // the location & direction out << left << setw(12) << oss.str() << " " << right << setw(8) << seqindex(*term.seq, ll) << " - " << left << setw(8) << seqindex(*term.seq, rr) << " " << dir_str(term.dir()) << " "; out << setw(3) << loc << " "; // the scores out << right << setw(3) << conf << " " << setw(5) << term.hp_energy << " " << setw(8) << term.tail_energy << " | "; // the notes string comma = ""; #if 0 if(term.partner) { out << "bidir"; comma = ", "; } #endif if(term.gap != 0) { out << comma << "gap " << term.gaps.size(); comma = ", "; } if(!term.opp_overlapping.empty()) { out << comma << "opp_overlap"; for(list::const_iterator T = term.opp_overlapping.begin(); T != term.opp_overlapping.end(); ++T) 
{ out << " " << seqindex(*(*T)->seq, (*T)->left()); } comma = ", "; } if(!term.overlapping.empty()) { out << comma << "overlap"; for(list::const_iterator T = term.overlapping.begin(); T != term.overlapping.end(); ++T) { out << " " << seqindex(*(*T)->seq, (*T)->left()); } comma = ", "; } #if 0 // mark terminators that got 0 conf b/c their direction was inconsistent // with the region's direction if(!int2t && !(inh2tforward && inh2treverse) && (inh2tforward && term.dir() == REVERSE || inh2treverse && term.dir() == FORWARD)) out << comma << "wrong dir"; #endif out << endl; if(print_seq) { print_term_seq(out, term); out << endl; } } // used from output_map() in a call to scan_events(), which will send messages // to this EventResponder when scanning a sequence from left to right. class MapOutputer : public EventResponder { public: MapOutputer(ostream & out, Confidence & conf, int co=90, bool print_seq=true, bool only_good_context=false) : EventResponder(), _out(out), _conf(conf), _conf_cutoff(co), _print_seq(print_seq), _only_good_context(only_good_context) {} virtual ~MapOutputer() {} // output the name of the seq and its length void start(const Seq & seq, Direction dir) { EventResponder::start(seq, dir); _out << "SEQUENCE " << seq.name << " " << seq.desc << " (length " << seq.length << ")" << endl << endl; _last_was_term = false; _term_count = 0; } // when we see a terminator, compute its conf given its genomic context // and output it if it passes the cutoff void terminator(const Term * term) { EventResponder::terminator(term); int c = er_confidence(*this, _conf, *term); if(c >= _conf_cutoff && (!_only_good_context || good_context(term))) { output_terminator(_out, *term, c, ++_term_count, _print_seq, fwd_gene_count()>0, rvs_gene_count()>0, in_t2t(), in_h2t_fwd(), in_h2t_rvs(), in_h2h()); _last_was_term = true; } } // for gene /ends/ (either -> or <-) we output the gene. 
void event(const Event & e) { if(e.kind == Event::ForwardGeneEnd || e.kind == Event::ReverseGeneEnd) { //if(_last_was_term && _print_seq) _out << endl; _last_was_term = false; output_gene(_out, *e.reg); } } // output some newlines to end the printout void end() { _out << endl; // if(!_last_was_term || !_print_seq) _out << endl; } bool good_context(const Term * t) { return in_t2t() || (t->dir() == FORWARD && in_h2t_fwd()) || (t->dir() == REVERSE && in_h2t_rvs()); } private: ostream & _out; Confidence & _conf; int _conf_cutoff; bool _print_seq; bool _only_good_context; bool _last_was_term; int _term_count; }; // output to out a map of each chrom seq in genome g void output_map( ostream & out, const Genome & g, Confidence & conf, int conf_cutoff, bool print_seq, bool only_good_context) { MapOutputer mo(out, conf, conf_cutoff, print_seq, only_good_context); output_header(out); for(EVERY_CHROM_CONST(g, C)) scan_events(**C, mo, gene_start_cut, gene_end_cut); } // |-----> ... |---> // |-----> <-----| class BestAfterGene : public EventResponder { public: BestAfterGene(ostream & out, const Confidence & conf) : _out(out), _conf(conf) {} virtual ~BestAfterGene() {} void start(const Seq & seq, Direction dir) { EventResponder::start(seq, dir); _dir = dir; _best_terms.clear(); _best_confs.clear(); _best_dist.clear(); _gene_names.clear(); _name = ""; _gene_end = 0; _in_intergene = false; //_best_term = 0; //_best_conf = 0; } void terminator(const Term * t) { EventResponder::terminator(t); SeqPtr term_pos = (_dir == FORWARD)?t->left():t->right(); // if the terminator is facing the right way, if we have a gene that it // might be terminating, and its near enough to that gene // we continue until we leave the intergenic region or are 500bp from // the end of the gene, whichever is FARTHER bool in_gene = ((_dir==FORWARD)?fwd_gene_count():rvs_gene_count())>0; if(t->dir() == _dir && _name != "" && (abs(term_pos - _gene_end) <= 525) && !in_gene) //|| _in_intergene)) { int c = 
er_confidence(*this, _conf, *t); if(c > _best_confs.back() || _best_terms.back() == 0) { _best_terms.back() = t; _best_confs.back() = c; _best_dist.back() = int(term_pos - _gene_end) * _dir; } } } void leave_gene(const Event & e) { EventResponder::leave_gene(e); if((_dir == FORWARD && e.kind == Event::ForwardGeneEnd) || (_dir == REVERSE && e.kind == Event::ReverseGeneEnd)) { _gene_end = e.place; // we append & + the position of the gene b/c gene names // are not unique ostringstream oss; oss << seqindex(*e.reg->seq, _gene_end); //oss << int(_gene_end); _name = e.reg->name + "&" + oss.str(); _gene_names.push_back(_name); _best_terms.push_back(0); _best_confs.push_back(0); _best_dist.push_back(0); _in_intergene = true; } } void enter_gene(const Event & e) { EventResponder::enter_gene(e); #if 0 _name = ""; _gene_end = 0; #endif // replace the above two lines with the following commented lines // if you want to only stop when a co-directed gene start is encountered if((_dir == FORWARD && e.kind == Event::ForwardGeneStart) || (_dir == REVERSE && e.kind == Event::ReverseGeneStart)) { _name = ""; _gene_end = 0; } else { _in_intergene = false; } } void end() { EventResponder::end(); for(unsigned i = 0; i < _gene_names.size(); i++) { int best_conf = _best_confs[i]; const Term * best_term = _best_terms[i]; string name = _gene_names[i].substr(0,_gene_names[i].rfind('&')); _out << setw(12) << name << " "; // << dir_str(_dir) << " "; if(best_term) { _out << *best_term; #if 0 _out << setw(8) << seqindex(*best_term->seq, best_term->left()) << " - " << setw(8) << seqindex(*best_term->seq, best_term->right()) << " " << setw(3) << best_conf << " "; print_term_seq(_out, *best_term); #endif _out << " " << setw(3) << best_conf << " " << _best_dist[i]; } else { _out << "NONE"; } _out << endl; } } private: Direction _dir; vector _gene_names; vector _best_confs; vector _best_dist; vector _best_terms; SeqPtr _gene_end; string _name; ostream & _out; const Confidence & _conf; bool 
_in_intergene; }; void output_best_term(ostream & out, const Confidence & conf, const Genome & g) { BestAfterGene bag(out, conf); for(EVERY_CHROM_CONST(g, C)) { scan_events(**C, bag, gene_start_cut, gene_end_cut); reverse_scan_events(**C, bag, gene_start_cut, gene_end_cut); } } transterm_hp_v2.09/USAGE.txt0000664000265600020320000003705711514142021015160 0ustar tilleaadminTransTermHP Version 2.07 CONTENTS 0. LICENSE & CREDITS 1. INSTALLATION 2. TRANSTERM USAGE 3. FORMAT OF THE TRANSTERM OUTPUT 4. TRANSTERM COMMAND LINE OPTIONS 5. RECALIBRATING USING DIFFERENT PARAMETERS 6. FORMAT OF THE EXPTERMS.DAT FILE 7. PORTING NOTES 8. 2NDSCORE PROGRAM 9. FORMAT OF .BAG FILES 10. USING TRANSTERM WITHOUT GENOME ANNOTATIONS 0. LICENSE & CREDITS TransTermHP v. 2.0 is a complete rewrite by Carl Kingsford of TransTerm v. 1.0, originally written by Maria D. Ermolaeva. The first TransTermHP was described in the paper: [1] Maria D. Ermolaeva, Hanif G. Khalak, Owen White, Hamilton O. Smith and Steven L. Salzberg. Prediction of Transcription Terminators in Bacterial Genomes. J Mol Biol 301, (1), 27-33 (2000) TransTermHP v 2.0 is free software and is distributed under the GNU Public License. See the file LICENSE.txt included with TransTermHP for complete details. 1. INSTALLATION At present, TransTermHP has only been tested on UNIX-like systems with the GCC/G++ compiler. To compile TransTermHP on such a system, "cd" into the TransTermHP src directory, and type: make clean transterm If there are no errors reported, there should be a "transterm" executable file in the same directory. You can move this executable anyplace that is convenient. To save space, you can type: make no_obj to remove all the .o files that were created during compilation. If you want to use TransTermHP on a non-UNIX-like system, see 'PORTING NOTES' below for some tips. 2. 
TRANSTERM USAGE The standard usage of TransTermHP is: transterm -p expterm.dat seq.fasta annotation.ptt > output.tt Any number of fasta and annotation files can be listed but fasta files should come before annotation files. The type of the file is determined by the extension: .ptt a GenBank ptt annotation file .coords or .crd a simple annotation file Each line of a .coords or .crd file has the format: gene_name start end chrom_id The chrom_id specifies which sequence the annotation should apply to. For a .ptt file, the chrom_id is taken to be the filename with the path and extension removed. A filename with any other extension is assumed to be a fasta file. When processing an annotation for a chromosome with id = ID, the first word of each '>' line of the input sequences is searched for ID. Because there is no good standard for how the '>' line is formatted, several heuristics are tried to find ID in the '>' line. In the order tried, they are: >ID >junk|cmr:ID|junk or junk|ID|junk >junk|gi|ID|junk or >junk|gi|ID.junk|junk >junk:ID The option '-p expterm.dat' uses the newest confidence scheme, where expterm.dat is the path to the file of that name supplied with TransTermHP. If '-p expterm.dat' is omitted, the version 1.0 confidence scheme is used. See section 'TRANSTERM COMMAND LINE OPTIONS' for more detail. 3. FORMAT OF THE TRANSTERM OUTPUT The organism's genes are listed sorted by their end coordinate and terminators are output between them. A terminator entry looks like this: TERM 19 15310 - 15327 - F 99 -12.7 -4.0 |bidir (name) (start - end) (sense)(loc) (conf) (hp) (tail) (notes) where 'conf' is the overall confidence score, 'hp' is the hairpin score, and 'tail' is the tail score. 'Conf' (which ranges from 0 to 100) is what you probably want to use to assess the quality of a terminator. Higher is better. The confidence, hp score, and tail scores are described in the paper cited above.
'Loc' gives the type of region the terminator is in: 'G' = in the interior of a gene (at least 50bp from an end), 'F' = between two +strand genes, 'R' = between two -strand genes, 'T' = between the ends of a +strand gene and a -strand gene, 'H' = between the starts of a +strand gene and a -strand gene, 'N' = none of the above (for the start and end of the DNA) Because of how overlapping genes are handled, these designations are not exclusive. 'G', 'F', or 'R' can also be given in lowercase, indicating that the terminator is on the opposite strand from the region. Unless the --all-context option is given, only candidate terminators that appear to be in an appropriate genome context (e.g. T, F, R) are output. Following the TERM line is the sequence of the hairpin and the 5' and 3' tails, always written 5' to 3'. 4. TRANSTERM COMMAND LINE OPTIONS You can also set how large a hairpin must be to be considered: --min-stem=n Stem must be n nucleotides long --min-loop=n Loop portion of the hairpin must be at least n long You can also set the maximum size of the hairpin that will be found: --max-len=n Total extent of hairpin <= n NT long --max-loop=n The loop portion can be no longer than n The maximum length is the total length for the hairpin portion (2 stems, 1 loop) and does not include the U-tail. It's measured in nucleotides in the input sequence, so because of gaps, the actual structure may be longer than max-len. Max-len must be less than the compiled-in constant REALLY_MAX_HP (which by default is 1000). To increase the size of structures found, recompile after increasing this constant. TransTermHP assigns a score to the hairpin and tail portions of potential terminators. Lower scores are considered better.
Many of the constants used in scoring hairpins can be set from the command line: --gc=f Score of a G-C pair --au=f Score of an A-U pair --gu=f Score of a G-U pair --mm=f Score of any other pair --gap=f Score of a gap in the hairpin The cost of loops of various lengths can be set using: --loop-penalty=f1,f2,f3,f4,f5,...fn where f1 is the cost of a loop of length --min-loop, f2 is the cost of a loop of length --min-loop+1, and so on. If there are too few terms to cover up to max-loop, the last term is repeated. Thus --loop-penalty=0,2 would assign cost 0 to any loop of length min-loop, and 2 to any longer loop (up to max-loop, after which longer loops are given infinite scores). Extra terms are ignored. Note that if you are using the --pval-conf confidence scheme (see below), you must regenerate the expterm.dat file if you change any of the above constants. To weed out any potential terminator with tail or hairpin scores that are too large, you can use the following options: --max-hp-score=f Maximum allowable hairpin score --max-tail-score=f Maximum allowable tail score Terminator hairpins must be adjacent to a "U-rich" region. You can adjust the constants that define what constitutes a U-rich region. Using the options: --uwin-size=s --uwin-require=r requires that there are at least r 'U' nucleotides in the s-nucleotide-long window adjacent to the hairpin. Again, if you change these constants, you should regenerate expterms.dat. Before the main output, TransTermHP will output the values of the above options in a format suitable to be used on the command line. In addition to the tail and hairpin scores, each possible terminator is assigned a confidence --- a value between 0 and 100 that indicates how likely it is that the sequence is a terminator. The scoring scheme needs a background file (supplied with TransTermHP) that is specified using: --pval-conf expterms.dat This will use the distribution in the file expterms.dat as the background.
(You can abbreviate this as "-p expterms.dat".) Though the supplied expterms.dat file is derived from random sequences, any background distribution can be used by supplying your own expterms.dat file. See below for the format of expterms.dat. The values in expterms.dat depend on the scoring constants, definition of u-rich regions, and the maximum allowed tail and hp scores. Thus, if you change any of these constants using the options above, you should regenerate expterms.dat. The main output of TransTermHP is a list of terminators interleaved with a listing of the gene annotations that were provided as input. This output can be customized in a few ways: -S Don't output the terminator sequences --min-conf=n Only output terminators with confidence >= n (can abbreviate this as -c n; default is 76.) Additional analysis output can be obtained with the following options: --bag-output file.bag Output the Best terminator After Gene --t2t-perf file.t2t Output a summary of which tail-to-tail regions have good terminators 5. RECALIBRATING USING DIFFERENT PARAMETERS As mentioned above, if you change any of the basic scoring function and search parameters and are using the version 2.0 confidence scheme (recommended) then you have to recompute the values in the expterm.dat file. If you have python installed this is easy (though perhaps time consuming). You can issue the command: % calibrate.sh newexpterms.dat [OPTIONS TO TRANSTERM] where "[OPTIONS TO TRANSTERM]" are TransTermHP options (discussed above) that set the parameters to what you want them to be. After calibrate.sh finishes, newexpterms.dat will be in the current directory and can serve as an argument to -p when using the same parameters you passed to calibrate.sh. Note that for the newexpterms.dat to be valid, you must supply the same basic parameters to TransTermHP on subsequent runs. TransTerm (or newexpterms.dat) will not remember these parameters for you.
The best way to handle this is to make a shell script wrapper around transterm that always passes in your new parameters. Output formatting parameters do not require regeneration of expterms.dat --- see discussion above for which parameters expterm.dat depends on. 6. FORMAT OF THE EXPTERMS.DAT FILE The 'pval-conf' confidence scheme, selected with the option "--pval-conf expterms.dat" (or '-p expterms.dat') computes the confidence of a terminator with HP energy E and tail energy T as follows. First, the ranges of HP energies and tail energies are evenly divided into bins, and the appropriate bins e and t are found for E and T. Then the confidence is computed as described in [2]. The first line of expterms.dat contains 6 numbers: seqlen num_bins The (low_hp, high_hp) and (low_tail, high_tail) ranges give the bounds on the hairpin and tail scores. The integer num_bins gives the number of equally-sized bins into which those ranges are divided. Seqlen gives the length of the random sequence that was used to generate the data in the rest of the file. Following this line are any number of (at, R, M) triples, where 'at' is the AT content, R is a 4-tuple (low_hp, high_hp, low_tail, high_tail) giving the range of the HP and tail scores observed in random sequences of this AT content, and M is the distribution matrix. These (at, R, M) triples are formatted as follows: at low_hp high_hp low_tail high_tail n11 n12 n13 n14 ... n1,num_bins n21 ... ... n_num_bins,1 ... The mu_r(e,t) term is computed by selecting the matrix with the at value closest to the computed %AT of the region r. If the total length of region r sequence is L_r, then mu_r(e,t) = n_t_e * L_r/seqlen where n_t_e is the entry in the t-th row and e-th column of the selected matrix, and seqlen is the first number in the first line of the file. 7.
PORTING NOTES If you want to run TransTermHP on a non-UNIX-like system, you should take note of the following: * gene-reader.cc assumes that the filename extension separator is "." and the path separator is "/". * getopt_long() is used to process the command line arguments. 8. 2NDSCORE PROGRAM The package also comes with a program '2ndscore' which will find the best hairpin anchored at each position. The basic usage is: 2ndscore in.fasta > out.hairpins For every position in the sequence this will output a line: -0.6 52 .. 62 TTCCTAAAGGTTCCA GCG CAAAA TGC CATAAGCACCACATT (score) (start .. end) (left context) (hairpin) (right context) For positions near the ends of the sequences, the context may be padded with 'x' characters. If no hairpin can be found, the score will be 'None'. Multiple fasta files can be given and multiple sequences can be in each fasta file. The output for each sequence will be separated by a line starting with '>' and containing the FASTA description of the sequence. Because the hairpin scores of the plus-strand and minus-strand may differ (due to GU binding in RNA), by default 2ndscore outputs two sets of hairpins for every sequence: the FORWARD hairpins and the REVERSE hairpins. All the forward hairpins are output first, and are identified by having the word 'FORWARD' at the end of the '>' line preceding them. Similarly, the REVERSE hairpins are listed after a '>' line ending with 'REVERSE'. If you want to search only one or the other strand, you can use: --no-fwd Don't print the FORWARD hairpins --no-rvs Don't print the REVERSE hairpins You can set the energy function used, just as with transterm with the --gc, --au, --gu, --mm, --gap options. The --min-loop, --max-loop, and --max-len options are also supported. 9. FORMAT OF THE .BAG FILES The columns for the .bag files are, in order: 1. gene_name 2. terminator_start 3. terminator_end 4. hairpin_score 5. tail_score 6. terminator_sequence 7.
terminator_confidence: a combination of the hairpin and tail score that takes into account how likely such scores are in a random sequence. This is the main "score" for the terminator and is computed as described in the paper. 8. APPROXIMATE_distance_from_end_of_gene: The *approximate* number of base pairs between the end of the gene and the start of the terminator. This is approximate in several ways: First (and most important), TransTermHP doesn't always use the real gene ends. Depending on the options you give, it may trim some off the ends of genes to handle terminators that partially overlap with genes. Second, where the terminator "begins" isn't that well defined. This field is intended only for a sanity check (terminators reported to be the best near the ends of genes shouldn't be _too far_ from the end of the gene). 10. USING TRANSTERM WITHOUT GENOME ANNOTATIONS TransTermHP uses known gene information for only 3 things: (1) tagging the putative terminators as either "inside genes" or "intergenic," (2) choosing the background GC-content percentage to compute the scores, because genes often have different GC content than the intergenic regions, and (3) producing slightly more readable output. Items (1) and (3) are not really necessary, and (2) has no effect if your genes have about the same GC-content as your intergenic regions. Unfortunately, TransTermHP doesn't yet have a simple option to run without an annotation file (either .ptt or .coords), and requires at least 2 genes to be present. The solution is to create fake, small genes that flank each chromosome. To do this, make a fake.coords file that contains only these two lines: fakegene1 1 2 chrom_id fakegene2 L-1 L chrom_id where L is the length of the input sequence and L-1 is 1 less than the length of the input sequence. "chrom_id" should be the word directly following the ">" in the .fasta file containing your sequence. (If, for example, your .fasta file began with ">seq1", then chrom_id = seq1).
This creates a "fake" annotation with two 1-base-long genes flanking the sequence in a tail-to-tail arrangement: --> <--. TransTermHP can then be run with: transterm -p expterm.dat sequence.fasta fake.coords If the G/C content of your intergenic regions is about the same as your genes, then this won't have too much of an effect on the scores terminators receive. On the other hand, this use of TransTermHP hasn't been tested much at all, so it's hard to vouch for its accuracy. transterm_hp_v2.09/conf.h0000664000265600020320000000532211514142021014637 0ustar tilleaadmin/* This file is part of TransTerm v2.0 BETA and is covered by the GNU GPL * License version 2.0. See file LICENSE.txt for more details. */ #ifndef CONF_H #define CONF_H #include #include "distr.h" #include "seq.h" // Abstract base class for a confidence algorithm. Users must call prepare() // first and then can call score() to get confidence score for a terminator class Confidence { public: virtual void prepare(const Genome &) = 0; virtual int score(const Term &, RegionType) const = 0; virtual ~Confidence() {} }; // compute the confidence as described in the Ermolaeva et al 2000 paper. // Note: the terms and genes must be sorted by their leftmost point before // the call to prepare. class ErmolaevaConfidence : public Confidence { public: ErmolaevaConfidence() : prepared(false) {} virtual ~ErmolaevaConfidence() {} void prepare(const Genome &); int score(const Term &, RegionType) const; protected: double K; double t2t_L, h2t_L; double t2t_N, h2t_N; Distribution h2t_hp, h2t_tail; Distribution t2t_hp, t2t_tail; bool prepared; int score_one(const Term &, RegionType) const; }; // compute the confidence using the background distribution (usually random) // read in from a given file. 
class RandomConfidence : public Confidence { public: RandomConfidence(const string &); virtual ~RandomConfidence() {} void prepare(const Genome &); int score(const Term &, RegionType) const; typedef vector > Histogram2d; protected: Histogram2d & get_table(double); const Histogram2d & get_table(double) const; void read_exp_table(const string &); void fill_emp_table(RegionType, const ConstTermVec &); unsigned long histvalue(const Histogram2d & hist, const Term &, double) const; unsigned long & histvalue(Histogram2d & hist, const Term &, double); int hbin(double, double, int, double) const; int get_best_at(double) const; unsigned long _sample_size; //double _low_hp, _high_hp; //double _low_tail, _high_tail; map _low_hp, _high_hp, _low_tail, _high_tail; int _nbins; map _exp_table; map _emp_table; map _emp_len; map _emp_at; bool _prepared; }; class RandomPValueConfidence : public RandomConfidence { public: RandomPValueConfidence(const string &); virtual ~RandomPValueConfidence() {} virtual int score(const Term &, RegionType) const ; protected: void sum_exp_table(); }; int er_confidence(const EventResponder &, const Confidence &, const Term &); Distribution signal_to_noise(Term::EnergyKind, const ConstTermVec &, const ConstTermVec &); #endif transterm_hp_v2.09/seq.h0000664000265600020320000001766011514142021014512 0ustar tilleaadmin/* This file is part of TransTerm v2.0 BETA and is covered by the GNU GPL * License version 2.0. See file LICENSE.txt for more details. 
*/ #ifndef SEQ_H #define SEQ_H #include #include #include #include #include #include using namespace std; typedef double Energy; enum Direction {REVERSE=-1, BIDIR=0, FORWARD=1}; enum RegionType { GENE, HEAD2TAIL, TAIL2TAIL, HEAD2HEAD }; typedef const char * SeqPtr; class Seq; class Confidence; // Represents a region of a sequence [start,end] (both pointers are inclusive) struct Region { string name; // a description of the region SeqPtr start, end; // pointers into seq at the start & end of the seq const Seq * seq; // seq of which this is a region string desc; Region(const string & n, const Seq * seqq, SeqPtr s, SeqPtr e, const string & d = "") : name(n), start(s), end(e), seq(seqq), desc(d) {} Region() : name(""), start(0), end(0), seq(0) {} virtual ~Region() {} // Direction of the region. Regions always "run" from start to end // If end < start, the dreiction is "REVERSE" virtual Direction dir() const { return (end < start) ? REVERSE : FORWARD; } // the 'left' of the region is always the 5' end of the sequence, // regardless of the direction of the region. The 'right', likewise, is // always the 3' end. SeqPtr left() const { return min(start,end); } SeqPtr right() const { return max(start,end); } int length() const { return abs(start-end) + 1; } }; // represents a terminator region. the 'region' is defined as the hairpin // sequence --- the tail 'region' is not included. 
struct Term : public Region { // terminator geometry int gap; int stem_len, loop_len; list gaps; // terminator scores Energy hp_energy, tail_energy; int conf; // a link to an 'equiv' term const Term * partner; list opp_overlapping, overlapping; Term() : Region(), gap(0), stem_len(0), loop_len(0) { init0(); lst = rst = 0; } Term(const Seq * s, Direction d, SeqPtr base, int sl, int ll, int g) : Region("", s, base - d*(geolength(sl, ll, g) - 1), base), gap(g), stem_len(sl), loop_len(ll), sense(d) { init0(); rst = right() - stem_len + 1 - ((gap>0)?1:0); lst = right_stem_top() - loop_len - 1; } Term(const Seq * s, Direction d, SeqPtr lsb1, SeqPtr lst1, SeqPtr rst1, SeqPtr rsb1, list & glist, Energy hpe = 0) : Region("", s, min(lsb1, rsb1), max(lsb1, rsb1)) { init0(); lst = min(lst1, rst1); rst = max(lst1, rst1); sense = d; hp_energy = hpe; loop_len = rst - lst - 1; stem_len = right() - rst + 1; // for backwards compatability, gap = true if there is a gap gap = (abs(lst - left() + 1) != stem_len) ? 
1 : 0; gaps = glist; } virtual ~Term() { // if(gaps) // { // delete gaps; // gaps = 0; // } } Direction dir() const { return sense; } Direction & dir() { return sense; } // accessors for the energy assigned to this terminator enum EnergyKind {TAIL, HAIRPIN}; Energy energy(EnergyKind k) const { return (k==HAIRPIN)?hp_energy:tail_energy; } // Pointers into the geometry of the terminator SeqPtr left_stem_base() const { return left(); } SeqPtr right_stem_base() const { return right(); } SeqPtr left_stem_top() const { return lst; } SeqPtr right_stem_top() const { return rst; } private: // return the size of the hairpin with the given geometry int geolength(int stem_len, int loop_len, int gap) { return 2*stem_len + loop_len + ((gap!=0)?1:0); } // set the things that shd be 0 to 0 void init0() { hp_energy = tail_energy = 0.0; conf = 0; partner = 0; } Direction sense; SeqPtr lst, rst; }; typedef vector ConstTermVec; // Represents a sequence struct Seq { string name; string desc; unsigned long length; // number of characters in the seq char * dna; // pointer to the sequence data vector terms; // list of Term features vector genes; // list of gene features Seq() : name(""), desc(""), length(0), dna(0) {} ~Seq() { clear(); } void clear(); SeqPtr left() const { return dna; } SeqPtr right() const { return dna + length - 1; } }; typedef vector Genome; // represents an event happening in the sequence. 
struct Event { const Region * reg; // for paired events, extent is the location of the other event SeqPtr place, extent; enum Kind { Terminator, ForwardGeneStart, ForwardGeneEnd, ReverseGeneStart, ReverseGeneEnd } kind; Event() : reg(0), place(0), extent(0) {} Event(const Region * r, SeqPtr p, Kind k, SeqPtr x=0) : reg(r), place(p), extent(x), kind(k) {} }; typedef vector::const_iterator event_iterator; class EventResponder { public: virtual ~EventResponder() {} virtual void start(const Seq & seq, Direction dir) { _fwd_gene = _rvs_gene = 0; } virtual void end() {} virtual void event(const Event & e) {} virtual void terminator(const Term * term) {} // if these are called from the subclass, they'll manage gene_count() virtual void enter_gene(const Event & e); virtual void leave_gene(const Event & e); virtual void enter_intergene(RegionType r, Direction d, const Event & e) { set(r, d, true); } virtual void leave_intergene(RegionType r, Direction d, const Event & e) { set(r, d, false); } friend int er_confidence(const EventResponder &, const Confidence &, const Term &); protected: // you can't create a plain EventResponder --- must subclass EventResponder() { _fwd_gene = _rvs_gene = 0; _t2t = _h2t_fwd = _h2t_rvs = _h2h = false; } bool in_t2t() const { return _t2t; } bool in_h2h() const { return _h2h; } bool in_h2t_fwd() const { return _h2t_fwd; } bool in_h2t_rvs() const { return _h2t_rvs; } int gene_count() const { return _fwd_gene + _rvs_gene; } int rvs_gene_count() const { return _rvs_gene; } int fwd_gene_count() const { return _fwd_gene; } private: void set(RegionType r, Direction d, bool tf); int _fwd_gene, _rvs_gene; bool _t2t, _h2t_fwd, _h2t_rvs, _h2h; }; #define EVERY_CHROM(ch, C) \ Genome::iterator C = ch.begin(); C != ch.end(); ++C #define EVERY_CHROM_CONST(ch, C) \ Genome::const_iterator C = ch.begin(); C != ch.end(); ++C #define EVERY_REGION(vec, R) \ vector::iterator R = vec.begin(); R != vec.end(); ++R #define EVERY_REGION_CONST(vec, R) \ 
vector::const_iterator R = vec.begin(); R != vec.end(); ++R #define EVERY_TERM(vec, T) \ vector::iterator T = vec.begin(); T != vec.end(); ++T #define EVERY_TERM_CONST(vec, T) \ vector::const_iterator T = vec.begin(); T != vec.end(); ++T #define EVERY_CTERM_CONST(vec, T) \ vector::const_iterator T = vec.begin(); T != vec.end(); ++T const char PADDING_CHAR = 'x'; int seqindex(const Seq &, SeqPtr); string subseq(SeqPtr, SeqPtr); void read_seqs(istream &, Genome &); void pad_seqs(Genome &, int); void pad_seq(Seq &, int); bool region_isleftof(const Region *, const Region *); bool hp_overlap(const Term &, const Term &); bool dominates(const Term &, const Term &); string dir_str(Direction); void scan_events(const Seq &, EventResponder &, int, int); void reverse_scan_events(const Seq &, EventResponder &, int, int); void sort_genes(Genome &); Seq * chrom_for_id(Genome &, const string &); void print_term_seq(ostream &, const Term &); ostream & operator<<(ostream &, const Term &); #endif transterm_hp_v2.09/seq.cc0000664000265600020320000005234611514142021014650 0ustar tilleaadmin/* This file is part of TransTerm v2.0 BETA and is covered by the GNU GPL * License version 2.0. See file LICENSE.txt for more details. */ #include #include #include #include #include #include "util.h" #include "seq.h" const size_t INIT_SEQ_ALLOC = 1000000; // free the memory reserved for the sequence data. 
we clear the name // and descriptions too void Seq::clear() { name = desc = ""; if(dna) free(dna); dna = 0; length = 0; terms.clear(); genes.clear(); } // convert a direction into a string string dir_str(Direction dir) { switch(dir) { case FORWARD: return "+"; case REVERSE: return "-"; case BIDIR: return "+/-"; } return ""; } // return the 1-based seqindex for the position cp int seqindex(const Seq & seq, SeqPtr cp) { return cp - seq.left() + 1; } // extract a substring of a character array as a string // inclusive: [cp1, cp2] string subseq(SeqPtr cp1, SeqPtr cp2) { string str = ""; for(; *cp1 && cp1 <= cp2; cp1++) str += *cp1; return str; } // used by read_seq to reallocate the space for the sequence void resize_buffer(Seq & seq, size_t bufsize) { seq.dna = (char*)realloc(seq.dna, bufsize*sizeof(char)); if(!seq.dna) { cerr << "Couldn't allocate enough memory for sequence." << endl; exit(3); } } // Read a single sequence from the input stream (fasta format) // return a ptr to the new sequence object Seq * read_seq(istream & in) { int i; Seq * seq = new Seq(); // find first line starting with > while((i = in.get()) && i != EOF && i != '>') { } if(i == EOF) return false; // read the name while((i = in.get()) && i != EOF && !isspace(i)) { seq->name += (char)i; } if(i == EOF) return false; if(seq->name.length() == 0) { cerr << "Sequence has no name on the '>' fasta line." << endl; exit(3); } // if i == '\n' then there is no description seq->desc = ""; if(i != '\n') { // read the desc while((i = in.get()) && i != EOF && i != '\n') { seq->desc += (char)i; } } if(i == EOF) return false; // allocate initial buffer of 1Mb for seq size_t bufsize = INIT_SEQ_ALLOC; resize_buffer(*seq, bufsize); // read until EOF or '>' unsigned long cc = 0; while((i = in.get()) && i != EOF && i != '>') { // any whitespace is ignored if(isspace(i)) continue; if(i=='>') { cerr << "TransTermHP does not support multiple sequences in a single" << " FASTA file. 
Spilt the sequences into multiple files." << endl; exit(3); } // if we've run out of space, double it if(cc > bufsize) { bufsize *= 2; resize_buffer(*seq, bufsize); } seq->dna[cc++] = (char)toupper(i); } // terminate sequence with '\0' && free unused memory seq->length = cc; seq->dna[cc++] = 0; resize_buffer(*seq, cc); // unget the stopping character if(i != EOF) in.putback(i); return seq; } // read all the sequences in the given file void read_seqs(istream & in, Genome & seqs) { Seq * seq; while((seq = read_seq(in))) { seqs.push_back(seq); } } // add padding ' ' characters to each end of the sequence void pad_seq(Seq & dna, int padding) { char * newseq = (char*)calloc(dna.length + 2*padding, sizeof(char)); memset(newseq, PADDING_CHAR, padding); memcpy(newseq + padding, dna.dna, dna.length); memset(newseq + padding + dna.length, PADDING_CHAR, padding); free(dna.dna); dna.dna = newseq; dna.length += 2 * padding; } // add padding to each of the sequences in the genome void pad_seqs(Genome & chroms, int padding) { for(Genome::iterator C = chroms.begin(); C != chroms.end(); ++C) { pad_seq(**C, padding); } } // for sorting: return true if region t1 starts to the left of t2 bool region_isleftof(const Region * t1, const Region * t2) { return t1->left() < t2->left(); } // return true if the hairpin regions overlap bool hp_overlap(const Term & t1, const Term & t2) { SeqPtr b1 = t1.left_stem_base(); SeqPtr e1 = t1.right_stem_base(); SeqPtr b2 = t2.left_stem_base(); SeqPtr e2 = t2.right_stem_base(); return (b1>=b2 && b1 <= e2) || (e1>=b2 && e1<=e2) || (b1<=b2 && e1>=e2); } // return true if t1 dominates t2 bool dominates(const Term & t1, const Term & t2) { return t1.left() <= t2.left() && t1.right() >= t2.right(); } //====================================================================== // TERMINATOR SEQUENCE OUTPUT //====================================================================== string concat_dir(const string & str1, char ch, Direction dir) { if(dir == REVERSE) { 
return str1 + ch; } else { return ch + str1; } } void popgaps(string & out, list & gaps, int i, Direction dir) { while(!gaps.empty() && abs(gaps.front()) == i) { out = concat_dir(out, '-', dir); gaps.pop_front(); } } bool abs_int_cmp(int a, int b) { return abs(a) < abs(b); } string addgaps(const string & str, const Term & term) { if(term.gaps.size() == 0) return str; #if 0 cout << "ORIG: " << str << endl; for(list::const_iterator G = term.gaps.begin(); G != term.gaps.end(); ++G) { cout << " " << *G; } cout << endl; #endif // copy and sort the list of gaps list gaps(term.gaps); gaps.sort(abs_int_cmp); Direction dir = term.dir(); int starti, endi; if(dir == REVERSE) { starti = 0; endi = str.length()-1; } else { starti = str.length()-1; endi = 0; } // copy every character in the input str over to the output string // possibly inserting gaps as required by the gaps list int reali = 0; string out = ""; for(int i = starti; ; i -= dir) { if(str[i] == ' ') { out = concat_dir(out, ' ', dir); continue; } if(gaps.empty() || gaps.front() <= 0) { popgaps(out, gaps, reali, dir); out = concat_dir(out, str[i], dir); } else { out = concat_dir(out, str[i], dir); popgaps(out, gaps, reali, dir); } reali++; if(i == endi) break; } return out; } string term_seq(const Term & term, bool withgaps) { string lefttail, seq, righttail; lefttail = subseq(term.left_stem_base()-15, term.left_stem_base()-1); seq = subseq(term.left_stem_base(), term.left_stem_top()) + " " + subseq(term.left_stem_top()+1,term.right_stem_top()-1) + " " + subseq(term.right_stem_top(), term.right_stem_base()); righttail = subseq(term.right_stem_base()+1, term.right_stem_base()+15); if(withgaps) seq = addgaps(seq, term); return lefttail + " " + center(seq, 45) + " " + righttail; } // output the tails, stem and loop sequence of a terminator void print_term_seq(ostream & out, const Term & term) { out << " " << term_seq(term, true); #if 0 out << " " << subseq(term.left_stem_base()-15, term.left_stem_base()-1) << " " << 
center(subseq(term.left_stem_base(), term.left_stem_top()) + " " + subseq(term.left_stem_top()+1,term.right_stem_top()-1) + " " + subseq(term.right_stem_top(), term.right_stem_base()), 45) << " " << subseq(term.right_stem_base()+1, term.right_stem_base()+15); #endif } // mostly for debugging, output a terminator to a stream ostream & operator<<(ostream & out, const Term & t) { out << setw(7) << seqindex(*t.seq, t.left()) << " .. " << setw(7) << seqindex(*t.seq, t.right()) << " " << setw(1) << dir_str(t.dir()) << " " << setw(5) << t.hp_energy << " " << setw(9) << t.tail_energy << " "; print_term_seq(out, t); return out; // << endl; } //============================================================================= // Event Functions //============================================================================= // true if event starts a region when scanning to the right bool is_gene_left_end(const Event & e) { return e.kind == Event::ForwardGeneStart || e.kind == Event::ReverseGeneEnd; } bool is_gene_right_end(const Event & e) { return e.kind == Event::ForwardGeneEnd || e.kind == Event::ReverseGeneStart; } bool is_rvs_gene_event(const Event & e) { return e.kind == Event::ReverseGeneStart || e.kind == Event::ReverseGeneEnd; } bool is_fwd_gene_event(const Event & e) { return e.kind == Event::ForwardGeneStart || e.kind == Event::ForwardGeneEnd; } // track the number of fwd and rvs genes that we're in void EventResponder::enter_gene(const Event & e) { if(is_fwd_gene_event(e)) { _fwd_gene++; } else if(is_rvs_gene_event(e)) { _rvs_gene++; } else assert(false); } // track the number of fwd and rvs genes that we're in void EventResponder::leave_gene(const Event & e) { if(is_fwd_gene_event(e)) { assert(_fwd_gene > 0); _fwd_gene--; } else if(is_rvs_gene_event(e)) { assert(_rvs_gene > 0); _rvs_gene--; } else assert(false); } // (private) set the correct flags given the event type seen void EventResponder::set(RegionType r, Direction d, bool tf) { if(r == TAIL2TAIL) { _t2t = tf; 
} else if(r == HEAD2TAIL && d == FORWARD) { _h2t_fwd = tf; } else if(r == HEAD2TAIL && d == REVERSE) { _h2t_rvs = tf; } else if(r == HEAD2HEAD) { _h2h = tf; } } // for sorting by event place bool by_event_location(const Event & e1, const Event & e2) { return e1.place < e2.place; } // for sorting by event place bool reverse_by_event_location(const Event & e1, const Event & e2) { return e1.place > e2.place; } // create an unsorted list of events void populate_events( const Seq & seq, vector & events, int start_cut_in, int end_cut_in, Direction dir) { const int MIN_GENE_SIZE = 50; // never shrink genes smaller than this for(EVERY_REGION_CONST(seq.genes, G)) { SeqPtr start, end; int gene_len; int start_cut, end_cut; // cut = max(0, (signed((*G)->length()) - signed(MIN_GENE_SIZE)) / 2); // cut = min(start_cut_in, cut); // first we make the full start cut (never making the gene less than MIN_GENE_SIZE) // then we make the end cut, never making the gene less than MIN_GENE_SIZE. In other // words, start_cut gets preference over end_cut gene_len = (*G)->length(); start_cut = min(start_cut_in, max(0, gene_len - signed(MIN_GENE_SIZE))); gene_len -= start_cut; end_cut = min(end_cut_in, max(0, gene_len - signed(MIN_GENE_SIZE))); // cout << "CUT: " << (*G)->name << " " << (*G)->length() << " " // << cut << " " << start_cut << " " << end_cut << endl; if((*G)->dir() == FORWARD) { start = (*G)->start + start_cut; end = (*G)->end - end_cut; events.push_back(Event(*G, start, Event::ForwardGeneStart, end)); events.push_back(Event(*G, end, Event::ForwardGeneEnd, start)); } else { start = (*G)->start - start_cut; end = (*G)->end + end_cut; events.push_back(Event(*G, start, Event::ReverseGeneStart, end)); events.push_back(Event(*G, end, Event::ReverseGeneEnd, start)); } } // when scanning in reverse, we use the right() end of terminators for(EVERY_TERM_CONST(seq.terms, T)) { events.push_back(Event(*T, (dir==REVERSE) ? 
((*T)->right()) : ((*T)->left()), Event::Terminator)); } } // return rightmost event of type k between E and end (exclusive) // that can be reached /without/ crossing a full gene. event_iterator rightmost_nocross(event_iterator E, event_iterator end, Event::Kind k) { event_iterator rightmost = end; SeqPtr right = E->reg->seq->right(); for(++E; E != end && E->place <= right; ++E) { if(is_gene_left_end(*E)) right = min(right, E->extent); if(E->kind == k) rightmost = E; } return rightmost; } // return leftmost event of type k between E and end (exclusive) that can be // reached /without/ crossing a full gene. (events must be sorted from right to // left) event_iterator leftmost_nocross(event_iterator E, event_iterator end, Event::Kind k) { event_iterator leftmost = end; SeqPtr left = E->reg->seq->left(); for(++E; E != end && E->place >= left; ++E) { if(is_gene_right_end(*E)) left = max(left, E->extent); if(E->kind == k) leftmost = E; } return leftmost; } // process the events for the sequence left to right void scan_events(const Seq & seq, EventResponder & er, int start_cut, int end_cut) { // create a event 'queue' that we'll process vector events; populate_events(seq, events, start_cut, end_cut, FORWARD); sort(events.begin(), events.end(), by_event_location); // these mark when the regions should end. 
If they equal events.end() then // we aren't in the region of the given type event_iterator h2t_fwd_end = events.end(); event_iterator h2t_rvs_end = events.end(); event_iterator t2t_end = events.end(); event_iterator h2h_end = events.end(); er.start(seq, FORWARD); for(event_iterator E = events.begin(); E != events.end(); ++E) { event_iterator e; er.event(*E); if(E == h2t_fwd_end) { er.leave_intergene(HEAD2TAIL, FORWARD, *h2t_fwd_end); h2t_fwd_end = events.end(); } if(E == h2t_rvs_end) { er.leave_intergene(HEAD2TAIL, REVERSE, *h2t_rvs_end); h2t_rvs_end = events.end(); } if(E == t2t_end) { er.leave_intergene(TAIL2TAIL, BIDIR, *t2t_end); t2t_end = events.end(); } if(E == h2h_end) { er.leave_intergene(HEAD2HEAD, BIDIR, *h2h_end); h2h_end = events.end(); } // enter and leave genes switch(E->kind) { case Event::ForwardGeneStart: er.enter_gene(*E); break; case Event::ReverseGeneEnd: er.enter_gene(*E); break; case Event::ForwardGeneEnd: er.leave_gene(*E); break; case Event::ReverseGeneStart: er.leave_gene(*E); break; default: break; } switch(E->kind) { case Event::Terminator: er.terminator(((Term*)E->reg)); break; case Event::ForwardGeneEnd: // Either a H2T forward or a t2t region can be begun with -> e = rightmost_nocross(E, events.end(), Event::ForwardGeneStart); if(h2t_fwd_end == events.end() && e != events.end()) { er.enter_intergene(HEAD2TAIL, FORWARD, *E); } h2t_fwd_end = e; e = rightmost_nocross(E, events.end(), Event::ReverseGeneEnd); if(t2t_end == events.end() && e != events.end()) { er.enter_intergene(TAIL2TAIL, BIDIR, *E); } t2t_end = e; break; case Event::ReverseGeneStart: // only a H2T reverse or H2H can start at a --| e = rightmost_nocross(E, events.end(), Event::ReverseGeneEnd); if(h2t_rvs_end == events.end() && e != events.end()) { er.enter_intergene(HEAD2TAIL, REVERSE, *E); } h2t_rvs_end = e; // head to heads start with --| end with |-- e = rightmost_nocross(E, events.end(), Event::ForwardGeneStart); if(h2h_end == events.end() && e != events.end()) { 
er.enter_intergene(HEAD2HEAD, BIDIR, *E); } h2h_end = e; break; default: break; } } er.end(); } // process the events for the sequence right to left void reverse_scan_events(const Seq & seq, EventResponder & er, int start_cut, int end_cut) { // create a event 'queue' that we'll process vector events; populate_events(seq, events, start_cut, end_cut, REVERSE); sort(events.begin(), events.end(), reverse_by_event_location); // these mark when the regions should end. If they equal events.end() then // we aren't in the region of the given type event_iterator h2t_fwd_end = events.end(); event_iterator h2t_rvs_end = events.end(); event_iterator t2t_end = events.end(); event_iterator h2h_end = events.end(); er.start(seq, REVERSE); for(event_iterator E = events.begin(); E != events.end(); ++E) { event_iterator e; er.event(*E); if(E == h2t_fwd_end) { er.leave_intergene(HEAD2TAIL, FORWARD, *h2t_fwd_end); h2t_fwd_end = events.end(); } if(E == h2t_rvs_end) { er.leave_intergene(HEAD2TAIL, REVERSE, *h2t_rvs_end); h2t_rvs_end = events.end(); } if(E == t2t_end) { er.leave_intergene(TAIL2TAIL, BIDIR, *t2t_end); t2t_end = events.end(); } if(E == h2h_end) { er.leave_intergene(HEAD2HEAD, BIDIR, *h2h_end); h2h_end = events.end(); } switch(E->kind) { case Event::ForwardGeneEnd: er.enter_gene(*E); break; case Event::ReverseGeneStart: er.enter_gene(*E); break; case Event::ForwardGeneStart: er.leave_gene(*E); break; case Event::ReverseGeneEnd: er.leave_gene(*E); break; default: break; } switch(E->kind) { case Event::Terminator: er.terminator(((Term*)E->reg)); break; case Event::ForwardGeneStart: // Either a H2T forward or a t2t region can be begun with -> e = leftmost_nocross(E, events.end(), Event::ForwardGeneEnd); if(h2t_fwd_end == events.end() && e != events.end()) { er.enter_intergene(HEAD2TAIL, FORWARD, *E); } h2t_fwd_end = e; e = leftmost_nocross(E, events.end(), Event::ReverseGeneStart); if(h2h_end == events.end() && e != events.end()) { er.enter_intergene(HEAD2HEAD, BIDIR, *E); } 
h2h_end = e; break; case Event::ReverseGeneEnd: // only a H2T reverse can start at a --| e = leftmost_nocross(E, events.end(), Event::ReverseGeneStart); if(h2t_rvs_end == events.end() && e != events.end()) { er.enter_intergene(HEAD2TAIL, REVERSE, *E); } h2t_rvs_end = e; e = leftmost_nocross(E, events.end(), Event::ForwardGeneEnd); if(t2t_end == events.end() && e != events.end()) { er.enter_intergene(TAIL2TAIL, BIDIR, *E); } t2t_end = e; break; default: break; } } er.end(); } // given an id, try to find the rigth chromosome (seq) in the genome // b/c of poor consistency (aka no consistency) in the naming schemes, // we try a bunch of heuristics. Seq * chrom_for_id(Genome & g, const string & id) { vector vec; // try exact match first for(EVERY_CHROM(g, C)) { if((*C)->name == id) return *C; } // we also try to find a chrom with junk|cmr:ID|junk for(EVERY_CHROM(g, C)) { split((*C)->name, '|', vec); for(unsigned i = 0; i < vec.size(); ++i) { if(vec[i].substr(0,4) == "cmr:" && vec[i].substr(4) == id) return *C; } } // next we try to find gi|ID or gb|ID or gb|ID.junk for(EVERY_CHROM(g, C)) { split((*C)->name, '|', vec); for(unsigned i = 0; i < vec.size() - 1; ++i) { if((vec[i] == "gi" && vec[i+1] == id) || ((vec[i] == "gb" || vec[i] == "ref") && (vec[i+1] == id || vec[i+1].substr(0,vec[i+1].rfind('.')) == id))) { return *C; } } } // finally, we check the first part of x|y|z to see if x = junk:ID for(EVERY_CHROM(g, C)) { split((*C)->name, '|', vec); if(!vec.empty() && vec[0].substr(vec[0].rfind(':')+1) == id) { return *C; } } return 0; } bool same_coords(const Region * r1, const Region * r2) { return r1->start == r2->start && r1->end == r2->end; } // sort all the genes in the genome by their left end point void sort_genes(Genome & g) { for(EVERY_CHROM(g, C)) { sort((*C)->genes.begin(), (*C)->genes.end(), region_isleftof); // remove any duplicate genes vector::iterator e = unique((*C)->genes.begin(), (*C)->genes.end(), same_coords); (*C)->genes.erase(e, (*C)->genes.end()); 
} } transterm_hp_v2.09/ermolaeva-oldconf.cc0000664000265600020320000002617711514142021017460 0ustar tilleaadmin/* This file is part of TransTerm v2.0 BETA and is covered by the GNU GPL * License version 2.0. See file LICENSE.txt for more details. */ #include #include #include "seq.h" #include "conf.h" #include "util.h" //========================================================================== // Version 1.0's confidence & output scheme (deprecated) // the code below is /not/ efficient /or/ elegant. //========================================================================== class ErmolaevaConfVer1 : public ErmolaevaConfidence { public: ErmolaevaConfVer1() : ErmolaevaConfidence() {} virtual ~ErmolaevaConfVer1() {} void prepare(const Genome &); }; // count the # of at and gc in the string. Counts are added to at and gc void count_atgc(const string & s, unsigned long * at, unsigned long * gc) { for(unsigned i = 0; i < s.length(); i++) { if(s[i] == 'A' || s[i] == 'T') (*at)++; else (*gc)++; } } // the fraction of ingene bases that are A or T // does not count the first or last 100 bases of the gene double at_percent_ingenes(const Genome & seqs, int buf) { string gene; unsigned long at = 0, gc = 0; for(EVERY_CHROM_CONST(seqs, C)) { for(EVERY_REGION_CONST((*C)->genes, G)) { gene = subseq((*G)->left()+buf, (*G)->right()-buf); count_atgc(gene, &at, &gc); } } return ((float)at)/(at+gc); } // assumes that the genes are sorted double at_percent_notgenes(const Genome & seqs, int buf) { string gene; unsigned long at = 0, gc = 0; for(EVERY_CHROM_CONST(seqs, C)) { SeqPtr max_seen_pos = (*C)->left(); for(EVERY_REGION_CONST((*C)->genes, G)) { SeqPtr nextgene = (*G)->left(); string intergene = subseq( max(max_seen_pos-buf, (*C)->left()), nextgene+buf); count_atgc(intergene, &at, &gc); max_seen_pos = (*G)->right(); } } return ((float)at)/(at + gc); } // return a list of tail-to-tail regions void tail_to_tail_regions(const vector & genes, vector & reg) { Region * prev = 0; 
reg.clear(); for(EVERY_REGION_CONST(genes, G)) { // Look for conseqative genes of the form ---> <---. // If they overlap: // ----> // <----- // then the 'tail to tail' region is the overlap region. if(G != genes.begin() && prev->dir() != (*G)->dir() && abs(prev->end - (*G)->end) < abs(prev->start - (*G)->start)) { reg.push_back( new Region(prev->name + "><" + (*G)->name, (*G)->seq, min(prev->end, (*G)->end), max(prev->end, (*G)->end))); } prev = *G; } } // return a list of head-to-tail regions void head_to_tail_regions(const vector & genes, vector & reg) { Region * prev = 0; reg.clear(); for(EVERY_REGION_CONST(genes, G)) { // look for two genes in the same dir: --> --> or <-- <-- if(G != genes.begin() && prev->dir() == (*G)->dir()) { string name; SeqPtr begin, end; // If they overlap, then the region is the overlap region, // going in the same direction as the genes direction if(prev->dir() == FORWARD) { name = prev->name + "->" + (*G)->name; begin = min(prev->end, (*G)->start); end = max(prev->end, (*G)->start); if(begin==end) end++; } else { name = prev->name + "<-" + (*G)->name; begin = max((*G)->end, prev->start); end = min((*G)->end, prev->start); if(begin == end) begin++; } reg.push_back(new Region(name, (*G)->seq, begin, end)); } prev = *G; } } // output new terms that are in the given regions. The terms must be sorted // the terms will be copies of the old terms unsigned long copy_terms_in_regions( const ConstTermVec & terms, const vector & reg, ConstTermVec & out, int cut, bool require_codirect = false) { unsigned long len = 0; unsigned i = 0; for(EVERY_REGION_CONST(reg, R)) { if(abs((*R)->start - (*R)->end) > cut*2) { SeqPtr start, end, reg_right, reg_left; // cut the sequences by the given amount int dir = ((*R)->dir() == FORWARD) ? 
1 : -1; start = (*R)->start + cut * dir; end = (*R)->end - cut * dir; len += abs(start - end); reg_right = max(start, end); reg_left = min(start, end); for(; i < terms.size() && terms[i]->right_stem_base() <= reg_right; i++) { if(terms[i]->left_stem_base() >= reg_left && (!require_codirect || terms[i]->dir() == (*R)->dir())) { Term * t = new Term(*terms[i]); t->name = (*R)->name; out.push_back(t); } } } } return len; } const int GENE_CUT = 100; const int H2T_CUT = -50; const int T2T_CUT = -50; // following two functions used /ONLY/ to wedge old code into the new scheme to // support the version 1.0 way of doing this. Version 1.0 is only supported for // comparison, debuggin, and completness -- do not use these two functions void to_const_term_vec(vector vec, ConstTermVec & out) { out.clear(); copy(vec.begin(), vec.end(), back_inserter(out)); } void force_remove_const(const ConstTermVec & in, vector vec) { for(EVERY_CTERM_CONST(in, T)) { vec.push_back(const_cast(*T)); } } // compute statistics on the genome object to prepare for assessing the // confidence of a terminator with score(). 
This must be called before score // and both genes, and terminators must be sorted by their leftmost point void ErmolaevaConfVer1::prepare(const Genome & seqs) { ConstTermVec gene_terms, h2t_terms, t2t_terms; unsigned long gene_len = 0, h2t_len = 0, t2t_len = 0; // get the terms in tail-to-tail and tail-to-head and gene regions for(EVERY_CHROM_CONST(seqs, C)) { // hack to conver vector to vector ConstTermVec terms; to_const_term_vec((*C)->terms, terms); // "true" means only co-directional terms gene_len += copy_terms_in_regions(terms, (*C)->genes, gene_terms, GENE_CUT, true); vector reg; head_to_tail_regions((*C)->genes, reg); h2t_len += copy_terms_in_regions(terms, reg, h2t_terms, H2T_CUT, true); tail_to_tail_regions((*C)->genes, reg); t2t_len += copy_terms_in_regions(terms, reg, t2t_terms, T2T_CUT); } // can't compute confidence if we have no gene_terms if(gene_terms.empty()) { prepared = false; cout << "warning: no examples in genes; can't compute conf." << endl; return; } // compute K --- correction for AT content double at_in, at_not; at_in = at_percent_ingenes(seqs, 100); at_not = at_percent_notgenes(seqs, 100); K = (840*at_not*at_not - 1215.65*at_not + 448.9593) / (840*at_in*at_in - 1215.65*at_in + 448.9593); cout << "Genes: " << at_in << " %AT, " << gene_len << " nt, " << gene_terms.size() << " terms." << endl; cout << "Intergenic: " << at_not << " %AT, " << "H2T: " << h2t_len << " nt, " << h2t_terms.size() << " terms; " << "T2T: " << t2t_len << " nt, " << t2t_terms.size() << " terms. 
" << endl; t2t_L = double(t2t_len) / gene_len; h2t_L = double(h2t_len) / gene_len; t2t_hp = signal_to_noise(Term::HAIRPIN, t2t_terms, gene_terms); t2t_tail = signal_to_noise(Term::TAIL, t2t_terms, gene_terms); h2t_hp = signal_to_noise(Term::HAIRPIN, h2t_terms, gene_terms); h2t_tail = signal_to_noise(Term::TAIL, h2t_terms, gene_terms); t2t_N = 2.0 * gene_terms.size() / t2t_terms.size(); h2t_N = double(gene_terms.size()) / h2t_terms.size(); prepared = true; } // patch up the scores for terms in out to account for possible bidirected // terminators. (this is a hack to duplicate version 1.0's scheme) void pair_bidirect( ConstTermVec & in, ConstTermVec & out) { const Term * prev = 0; for(EVERY_CTERM_CONST(in, T)) { if(prev && (*T)->left() == prev->left() && (*T)->right() == prev->right()) { Term * t = new Term(*prev); t->conf = int((1.0 - (1.0 - (*T)->conf/100.0)*(1.0 - t->conf/100.0))*100.0 + 0.5); t->dir() = BIDIR; out.push_back(t); prev = 0; } else if(prev) { Term * t = new Term(*prev); out.push_back(t); prev = 0; } else { prev = *T; } } } // called by confidence_ermolaeva() to copy and add the confidence // this is used only to duplicate the version 1.0 scheme void add_confidence( ConstTermVec & out, const ConstTermVec & in, RegionType where, Confidence & conf) { for(EVERY_CTERM_CONST(in, T)) { Term * t = new Term(**T); t->conf = conf.score(**T, where); out.push_back(t); } } // for backwards compatibility, this function will compute something similar to // TransTerm version 1.0 void confidence_ermolaeva(Genome & seqs, ConstTermVec & out) { ErmolaevaConfidence conf; conf.prepare(seqs); ConstTermVec gene_terms, h2t_terms, t2t_terms; vector reg; for(EVERY_CHROM_CONST(seqs, C)) { ConstTermVec terms; to_const_term_vec((*C)->terms, terms); head_to_tail_regions((*C)->genes, reg); copy_terms_in_regions(terms, reg, h2t_terms, H2T_CUT, true); tail_to_tail_regions((*C)->genes, reg); copy_terms_in_regions(terms, reg, t2t_terms, T2T_CUT); } ConstTermVec tmp; add_confidence(tmp, 
t2t_terms, TAIL2TAIL, conf); pair_bidirect(tmp, out); add_confidence(out, h2t_terms, HEAD2TAIL, conf); } // print the terminators void print_terms(ostream & out, const vector & vec) { for(EVERY_TERM_CONST(vec, T)) { Term & ter = **T; out << setw(15) << ((ter.name=="")?"n/a":ter.name) << " " << setw(3) << ter.conf << " " << setw(5) << ter.hp_energy << " " << setw(8) << ter.tail_energy << " " << setw(7) << seqindex(*ter.seq, ter.right_stem_base()) << " " << setw(2) << ter.dir() << " "; out << subseq(ter.left_stem_base()-15, ter.left_stem_base()-1) << " " << center(subseq(ter.left_stem_base(), ter.left_stem_top()) + " " + subseq(ter.left_stem_top()+1,ter.right_stem_top()-1) + " " + subseq(ter.right_stem_top(), ter.right_stem_base()), 45) << " " << subseq(ter.right_stem_base()+1, ter.right_stem_base()+15) << " "; out << ter.gap << " " << ter.seq->name << endl; } }