From 1f5a30b0df77a64fd5c3887b795e1dd0e487b972 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 3 Jan 2012 21:05:31 +0000 Subject: [PATCH] My complete changes on my laptop, with specificity bug fix + ahocorasick + sets git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@393 60f365c0-8329-0410-b2a4-ec073aeeaa1d --- .cproject | 364 +++++--- src/ecoprimer.c | 220 ++++- src/global.mk | 3 +- src/libecoprimer/Makefile | 3 +- src/libecoprimer/PrimerSets.c | 1498 ++++++++++++++++++++++++++++-- src/libecoprimer/PrimerSets.h | 21 +- src/libecoprimer/ahocorasick.c | 479 ++++++++++ src/libecoprimer/ahocorasick.h | 43 + src/libecoprimer/ecoprimer.h | 6 +- src/libecoprimer/filtering.c | 4 +- src/libecoprimer/pairs.c | 20 +- src/libecoprimer/strictprimers.c | 9 +- src/libecoprimer/taxstats.c | 86 +- 13 files changed, 2502 insertions(+), 254 deletions(-) create mode 100755 src/libecoprimer/ahocorasick.c create mode 100755 src/libecoprimer/ahocorasick.h diff --git a/.cproject b/.cproject index 9f894fd..f99343a 100644 --- a/.cproject +++ b/.cproject @@ -1,151 +1,221 @@ - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/ecoprimer.c b/src/ecoprimer.c index b159c5e..7f0cd52 100644 --- a/src/ecoprimer.c +++ b/src/ecoprimer.c @@ -7,6 +7,7 @@ #include "libecoprimer/ecoprimer.h" #include "libecoprimer/PrimerSets.h" +#include "libecoprimer/ahocorasick.h" #include #include #include @@ -25,6 +26,8 @@ static int cmpprintedpairs(const void* p1,const void* p2); //float _Z27calculateMeltingTemperature_ (char * seq1, char * seq2); +pwordcount_t reduce_words_to_debug (pwordcount_t words, poptions_t options); +void print_wordwith_positions (primer_t prm, uint32_t seqdbsize, poptions_t options); void* lib_handle = NULL; float (*calcMelTemp)(char*, char*); @@ -71,12 +74,12 @@ static void PrintHelp() PP "-m : Salt correction method for Tm computation (SANTALUCIA : 1 or OWCZARZY:2, default=1)\n\n"); PP "-a : Salt contentration in M for Tm computation (default 0.05 M)\n\n"); PP "-U : No multi match\n\n"); - PP "-U : Define the [R]eference sequence identifier (must be part of example set)\n\n"); + PP "-R : Define the [R]eference sequence identifier (must be part of example set)\n\n"); PP "-A : Print the list of all identifier of sequences present in the database\n\n"); PP "-f : Remove data mining step during strict primer identification\n\n"); PP "-v : Store statistic file about memory usage during strict primer identification\n\n"); - PP "-p : Print sets of primers\n\n"); - PP "-T : Ignore pairs having specificity below this Threshold\n\n"); + PP "-p : Print sets of primers (may take several minutes after primers have been designed!)\n\n"); + PP "-T : Ignore pairs having specificity below this Threshold\n\n"); PP "\n"); PP "------------------------------------------\n"); PP "Table result description : \n"); @@ -151,6 +154,9 @@ void initoptions(poptions_t options) options->printAC=FALSE; options->print_sets_of_primers = FALSE; options->specificity_threshold = 0.6; + options->links_cnt = 1; + options->max_links_percent = -1; /*graph only those primers having maximum 15% links*/ + options->filter_on_links = FALSE; } void printapair(int32_t index,ppair_t pair, poptions_t options) @@ -165,7 +171,7 @@ void printapair(int32_t index,ppair_t pair, poptions_t options) bool_t good2=pair->p2->good; bool_t goodtmp; bool_t strand; - uint32_t i; + uint32_t i, j; float temp; CNNParams nnparams; @@ -296,6 +302,12 @@ void printapair(int32_t index,ppair_t pair, poptions_t options) else printf("\t\t"); +/* j=0; + for (i=0; idbsize; i++) + if (pair->wellIdentifiedSeqs[i] == 1) + j++; + printf("%d", j);*/ + printf("\n"); } @@ -335,6 +347,7 @@ uint32_t filterandsortpairs(ppair_t* sortedpairs,uint32_t count, poptions_t opti else qfp=0.0; sortedpairs[i]->wellIdentifiedSeqs = NULL; //TR 05/09/10 - wellIdentified needed for primer sets + sortedpairs[i]->coveredSeqs = NULL; //TR 05/09/10 - wellIdentified needed for primer sets sortedpairs[i]->quorumin = q; sortedpairs[i]->quorumout = qfp; sortedpairs[i]->yule = q - qfp; @@ -345,13 +358,13 @@ uint32_t filterandsortpairs(ppair_t* sortedpairs,uint32_t count, poptions_t opti { //TR 05/09/10 - wellIdentified needed for primer sets sortedpairs[j]->wellIdentifiedSeqs = ECOMALLOC(options->dbsize * sizeof(int),"Cannot allocate well_identified_array"); - (void)taxonomycoverage(sortedpairs[j],options); + sortedpairs[j]->coveredSeqs = ECOMALLOC(options->dbsize * sizeof(int),"Cannot allocate well_identified_array"); + (void)taxonomycoverage(sortedpairs[j],options, seqdb, options->dbsize); taxonomyspecificity(sortedpairs[j], seqdb, options->dbsize); //j++; //if specificity less than user provieded threshold (default 60%) then ignore this pair if (sortedpairs[j]->bs >= options->specificity_threshold) j++; - } } @@ -369,7 +382,8 @@ void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy, size_t count; char *taxon[]={"taxon","taxa"}; ecotx_t *current_taxon; - pairset pair_sets; + //pairset pair_sets; + pairset *pset = NULL; //printf("Index\tPrimer1\tPrimer2\tGB\tInexampleCount\tOutexampleCount\tYule\tIntaxaCount\tOuttaxaCount\tCoverage\tSpecificity\tMinAmplifiedLength\tMaxAmplifiedLength\tAvgAmplifiedLength\n"); @@ -388,7 +402,7 @@ void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy, for (i=0;ipaircount;i++,j++) sortedpairs[j]=pl->pairs+i; - + count=filterandsortpairs(sortedpairs,pairs->count,options, seqdb); getThermoProperties(sortedpairs, count, options); @@ -451,15 +465,55 @@ void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy, printf("# DB sequences are considered as linear\n"); printf("# Pairs having specificity less than %0.2f will be ignored\n", options->specificity_threshold); printf("#\n"); + for (i=0;i < count;i++) printapair(i,sortedpairs[i],options); + + if (options->filter_on_links) + { + fprintf (stderr, "Old size: %d, ", count); + count = primers_changeSortedArray (&sortedpairs, count, options); + //count = primers_filterWithGivenLinks (&sortedpairs, count, options); + fprintf (stderr, "New size: %d\n", count); + + if (count == 0) + { + fprintf (stderr, "No pairs passed the links constraints.\n"); + printf ("No pairs passed the links constraints.\n"); + return; + } + + for (i=0;i < count;i++) + printapair(i,sortedpairs[i],options); + } if (options->print_sets_of_primers == TRUE) { - pair_sets = build_primers_set (sortedpairs, count, seqdb, options); + /*pair_sets = build_primers_set (sortedpairs, count, seqdb, options); + printf("Results from Greedy Algorithm and some other possibilities:\n"); some_other_set_possibilities (&pair_sets, sortedpairs, count, seqdb, options); + printf("Results from simulated Anealing:\n"); + sets_by_SimulatedAnealing (&pair_sets, sortedpairs, count, seqdb, options); + printf("Results from Tabu Search:\n"); + sets_by_TabuSearch (&pair_sets, sortedpairs, count, seqdb, options);*/ + //pset = sets_by_BruteForce (sortedpairs, count, seqdb, options); + //if (pset) + /*/{ + printf("Results from simulated Anealing:\n"); + sets_by_SimulatedAnealing (pset, sortedpairs, count, seqdb, options); + printf("Results from Tabu Search:\n"); + sets_by_TabuSearch (pset, sortedpairs, count, seqdb, options); + + if (pset) + { + ECOFREE (pset->set_wellIdentifiedTaxa, "Could not free memory for pair set wi"); + ECOFREE (pset, "Could not free memory for pair"); + } + }*/ + build_and_print_sets (sortedpairs, count, seqdb, options); } + //primers_graph_graphviz (sortedpairs, count, options); } @@ -545,7 +599,7 @@ int main(int argc, char **argv) initoptions(&options); - while ((carg = getopt(argc, argv, "hAfvcUDSpE:d:l:L:e:i:r:R:q:3:s:x:t:O:m:a:T:")) != -1) { + while ((carg = getopt(argc, argv, "hAfvcUDSpbE:d:l:L:e:i:r:R:q:3:s:x:t:O:m:a:T:k:M:")) != -1) { switch (carg) { /* ---------------------------- */ @@ -711,15 +765,31 @@ int main(int argc, char **argv) /* -------------------- */ case 'p': /* print sets of primers */ /* --------------------------------- */ - options.print_sets_of_primers = TRUE; + //options.print_sets_of_primers = TRUE; break; - + + /* --------------------------------- */ + case 'T': /* Ignore pairs having specificity below this Threshold */ /* --------------------------------- */ - case 'T': /* Ignore pairs having specificity below this Threshold */ + sscanf(optarg,"%f",&(options.specificity_threshold)); + break; + /* --------------------------------- */ - sscanf(optarg,"%f",&(options.specificity_threshold)); + case 'M': /* Max link percentage for graph */ + /* --------------------------------- */ + sscanf(optarg,"%f",&(options.max_links_percent)); break; - + + /* --------------------------------- */ + case 'k': /* links count */ + /* --------------------------------- */ + sscanf(optarg,"%d",&(options.links_cnt)); + break; + + case 'b': + options.filter_on_links = TRUE; + break; + case '?': /* bad option */ /* -------------------- */ errflag++; @@ -779,7 +849,11 @@ int main(int argc, char **argv) words = lookforStrictPrimer(seqdb,seqdbsize,insamples,&options); fprintf(stderr,"\n Strict primer count : %d\n",words->size); - + + /*/TR Testing + fprintf(stderr,"\nReducing for debugging\n"); + words = reduce_words_to_debug (words, &options); + ///*/ // options.filtering=FALSE; // words2= lookforStrictPrimer(seqdb,seqdbsize,insamples,&options); // fprintf(stderr,"\n Strict primer count : %d\n",words2->size); @@ -802,7 +876,6 @@ int main(int argc, char **argv) for (i=0; isize); i++) fprintf(stderr," + Primer : %s sequence count : %d\n",ecoUnhashWord(words->words[i],options.primer_length),words->strictcount[i]); - fprintf(stderr,"\nEncoding sequences for fuzzy pattern matching...\n"); for (i=0;istrictcount,"Free strict primer count table"); - primers = lookforAproxPrimer(seqdb,seqdbsize,insamples,words,&options); + if (options.error_max == 0)//aho, if(options.error_max == 0 && 0) old + primers = ahoc_lookforStrictPrimers (seqdb,seqdbsize,insamples,words,&options); + else + primers = lookforAproxPrimer(seqdb,seqdbsize,insamples,words,&options); + + //for (i=0; isize; i++) + // print_wordwith_positions (primers->primers[i], seqdbsize, &options); ECOFREE(words->words,"Free strict primer table"); ECOFREE(words,"Free strict primer structure"); @@ -833,3 +912,108 @@ int main(int argc, char **argv) return 0; } + +#define DEBUG_WORDS_CNT 14 +pwordcount_t reduce_words_to_debug (pwordcount_t words, poptions_t options) +{ + uint32_t i, k; + pwordcount_t new_words; + char *rwrd; + char dwrd[20]; + /*char *strict_words[DEBUG_WORDS_CNT] = {"GAGTCTCTGCACCTATCC", "GCAATCCTGAGCCAAATC", "ACCCCTAACCACAACTCA", + "TCCGAACCGACTGATGTT", "GAAGCTTGGGTGAAACTA", "GGAGAACCAGCTAGCTCT", "GCTGGTTCTCCCCGAAAT", + "TCGATTTGGTACCGCTCT", "AAAGGAGAGAGAGGGATT", "GGATTGCTAATCCGTTGT", "CCCCCATCGTCTCACTGG", + "TGAGGCGCAGCAGTTGAC", "GCGCTACGGCGCTGAAGT", "TTTCCTGGGAGTATGGCA"};*/ + char *strict_words[DEBUG_WORDS_CNT] = {"CTCCGGTCTGAACTCAGA", "TGTTGGATCAGGACATCC", "TAGATAGAAACCGACCTG", + "TGGTGCAGCCGCTATTAA", "AGATAGAAACTGACCTGG", "TGGTGCAGCCGCTATTAA", "CTAATGGTGCAGCCGCTA", + "TAGAAACTGACCTGGATT", "AGATAGAAACCGACCTGG", "ATGGTGCAGCCGCTATTA", "ATAGATAGAAACCGACCT", + "GCCGCTATTAAGGGTTCG", "GGTGCAGCCGCTATTAAG", "TAGAAACTGACCTGGATT"}; + int word_seen[DEBUG_WORDS_CNT]; + + + new_words = ECOMALLOC(sizeof(wordcount_t),"Cannot allocate memory for word count structure"); + new_words->inseqcount = words->inseqcount; + new_words->outseqcount = words->outseqcount; + new_words->size = DEBUG_WORDS_CNT; + new_words->strictcount = ECOMALLOC((new_words->size*sizeof(uint32_t)), "Cannot allocate memory for word count table"); + new_words->words = ECOMALLOC(new_words->size*sizeof(word_t), "I cannot allocate memory for debug words"); + + for (k = 0; k < DEBUG_WORDS_CNT; k++) + word_seen[k] = 0; + + for (i=0; i < words->size; i++) + { + rwrd = ecoUnhashWord(words->words[i],options->primer_length); + strcpy (dwrd, rwrd); + rwrd = ecoUnhashWord(ecoComplementWord(words->words[i],options->primer_length),options->primer_length); + for (k = 0; k < DEBUG_WORDS_CNT; k++) + { + if (strcmp (dwrd, strict_words[k]) == 0) break; + if (strcmp (rwrd, strict_words[k]) == 0) break; + } + + if (k < DEBUG_WORDS_CNT) + { + if (word_seen[k] == 0) + { + new_words->words[k] = words->words[i]; + new_words->strictcount[k] = words->strictcount[i]; + } + word_seen[k]++; + } + } + + fprintf (stderr, "Debug Words Info:\n"); + for (k = 0; k < DEBUG_WORDS_CNT; k++) + fprintf (stderr, "%s:%d\n", strict_words[k], word_seen[k]); + + + //clean input wods; + ECOFREE(words->words,"Clean word table"); + ECOFREE(words->strictcount,"Clean word count table"); + ECOFREE(words,"Clean word structure"); + + return new_words; +} + +void print_wordwith_positions (primer_t prm, uint32_t seqdbsize, poptions_t options) +{ + char *wrd; + uint32_t i, j; + char *twrd = "GCCTGTTTACCAAAAACA"; + + wrd = ecoUnhashWord(prm.word,options->primer_length); + + if (strcmp (twrd, wrd) == 0) + { + printf ("Positions for Word: %s\n", wrd); + for (i=0; i 0) + { + printf ("%d:", i); + if (prm.directCount[i] == 1) + printf ("%d", prm.directPos[i].value); + else + for (j=0; j 0) + { + printf ("%d:", i); + if (prm.reverseCount[i] == 1) + printf ("%d", prm.reversePos[i].value); + else + for (j=0; j +#include +#include //#include "ecoprimer.h" #include "PrimerSets.h" +int TabuList[PRIMERS_IN_SET_COUNT]; +int next_tabu_slot = -1; +int total_pairs = -1; +//int32_t total_wi = -1; + int32_t counttaxon(int32_t taxid); +int find_in_tabu (int index); + +int32_t count_taxons (int32_t taxid) +{ + static int32_t count = 0; + static int32_t slots = 0; + static int32_t *taxon_array = NULL; + int32_t i; + + if (taxid == -1) + { + if (taxon_array) + ECOFREE (taxon_array, "Could not free memory for taxon array"); + taxon_array = NULL; + slots = 0; + count = 0; + } + else + { + for (i = 0; i < count; i++) + { + if (taxid == taxon_array[i]) return count; + } + + if (count == slots) + { + slots += 500; + + if (taxon_array == NULL) + { + taxon_array = (int32_t *) ECOMALLOC(slots*sizeof (int32_t), + "Could not allocate memory for taxon array"); + } + else + taxon_array = (int32_t *) ECOREALLOC(taxon_array, slots*sizeof (int32_t), + "Could not reallocate memory for taxon array"); + } + taxon_array[count] = taxid; + count++; + } + return count; +} float get_set_coverage (pairset *p_set, SetParams *pparams, int32_t pidx_toexclude) { int32_t i, j; float cov; - pprimer_t prmr1; - pprimer_t prmr2; int32_t s_intaxa = 0; + int32_t seqcount; - counttaxon(-1); + //counttaxon(-1); + count_taxons (-1); for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) { if (p_set->set_pairs[i] == -1 || pidx_toexclude == i) continue; - prmr1 = pparams->sortedpairs[p_set->set_pairs[i]]->p1; - prmr2 = pparams->sortedpairs[p_set->set_pairs[i]]->p2; - for (j = 0; j < pparams->options->dbsize; j++) - if ((prmr1->directCount[j] > 0 || prmr1->reverseCount[j] > 0) - && (prmr2->directCount[j] > 0 || prmr2->reverseCount[j] > 0) - && pparams->seqdb[j]->isexample && pparams->seqdb[j]->ranktaxonid > 0) - { - s_intaxa = counttaxon(pparams->seqdb[j]->ranktaxonid); - } + seqcount=pparams->sortedpairs[p_set->set_pairs[i]]->pcr.ampcount; + for (j=0; j < seqcount; j++) + if (pparams->sortedpairs[p_set->set_pairs[i]]->pcr.amplifias[j].sequence->isexample + && pparams->sortedpairs[p_set->set_pairs[i]]->pcr.amplifias[j].sequence->ranktaxonid > 0 ) + s_intaxa = count_taxons(pparams->sortedpairs[p_set->set_pairs[i]]->pcr.amplifias[j].sequence->ranktaxonid); + } //fprintf(stderr, "%d/%d\n", s_intaxa, pparams->options->intaxa); + p_set->set_intaxa = s_intaxa; cov = s_intaxa*1.0/(pparams->options->intaxa*1.0); - + count_taxons (-1); return cov; } @@ -39,52 +86,172 @@ void set_cov_spc (pairset *p_set, SetParams *pparams) int32_t i; int32_t ssp = 0; + count_taxons (-1); for (i = 0; i < pparams->options->dbsize; i++) - if (p_set->set_wellIdentifiedTaxa[i] == 1) ssp++; - - //set specificity - p_set->set_specificity = ssp*1.0/(pparams->options->insamples*1.0); - + if (p_set->set_wellIdentifiedTaxa[i] == 1) + ssp = count_taxons (pparams->seqdb[i]->ranktaxonid); + //set coverage p_set->set_coverage = get_set_coverage (p_set, pparams, -1); + + //set specificity + p_set->set_specificity = ((float)ssp)/((float)p_set->set_intaxa); + p_set->set_wi_cnt = ssp; + count_taxons (-1); } -int ok_to_add (int id, pairset *pair_set) +//currently used only to open dead lock +void tabu_aspiration (pairset *pair_set) { int i; + int count = 0; + + for (i=0; iset_pairs[i] != -1) + count++; + if (TabuList[i] != -1) + count++; + } + + if (count == total_pairs) + for (i=0; iset_pairs[i] == id) return 0; + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + if (pair_set->set_pairs[i] != -1) secnt++; + + //if (secnt == PRIMERS_IN_SET_COUNT) return 0; + if (secnt == 0) return 1; + + lnk_prcnt = 1.0; + if (secnt > pparams->options->links_cnt) + lnk_prcnt = (pparams->options->links_cnt*1.0)/(secnt*1.0); + + //TR 6/2/11: new elements must have some links with atleast one elem of set + if (get_links_distribution (id, pair_set, pparams) < lnk_prcnt) return 0; + + //if in tabu search search tabu list as well + if (next_tabu_slot != -1) + { + //effency_switch is only there to avoid tabu_aspiration execution + //each time + effency_switch++; + if ((effency_switch%5) == 0) + { + effency_switch=1; + tabu_aspiration(pair_set); + } + + i = find_in_tabu (id); + if (i != -1) return 0; + } return 1; } +pairset build_primers_set_greedy_cov (SetParams *params) +{ + pairset pair_set; + int32_t i; + int32_t pset_idx; + int32_t prb_idx; + + memset (&pair_set, 0, sizeof(pairset)); + pair_set.set_wellIdentifiedTaxa = (int *) ECOMALLOC(params->options->dbsize*sizeof (int), + "Could not allocate memory for pair set"); + + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + pair_set.set_pairs[i] = -1; + + pset_idx = 0; + prb_idx = 0; + //add first pair by default, this should be the one having highiest specificty + add_pair_in_set (&pair_set, pset_idx, prb_idx, params); + pset_idx++; + + while (pset_idx < PRIMERS_IN_SET_COUNT) + { + if (pair_set.set_coverage == 1.0) break; + prb_idx = get_next_option_increasing_cov (&pair_set, params); + if (prb_idx == 0) break; + add_pair_in_set (&pair_set, pset_idx, prb_idx, params); + pset_idx++; + } + //get_set_mean_cov_stats (&pair_set, ¶ms); + reset_set_props (&pair_set, params); + + return pair_set; +} + +int32_t get_next_option_increasing_cov (pairset *pair_set, SetParams *pparams) +{ + float ini_set_cov; + float set_cov; + float cov_diff = 0.; + int32_t i, id_slot = -1, max_id = 0; + + //coverage already 1, dont proceed. + if (pair_set->set_coverage == 1.0) return 0; + + //find next available slot in the set + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + if (pair_set->set_pairs[i] == -1) + { + id_slot = i; + break; + } + //set already full + if (id_slot == -1) return 0; + //save original set coverage + ini_set_cov = pair_set->set_coverage; + for (i = 1; i < pparams->sorted_count; i++) + { + if (ok_to_add (i, pair_set, pparams) == 0) continue; + pair_set->set_pairs[id_slot] = i; + set_cov = get_set_coverage (pair_set, pparams, -1); + if ((set_cov - ini_set_cov) > cov_diff) + { + cov_diff = set_cov - ini_set_cov; + max_id = i; + } + } + pair_set->set_pairs[id_slot] = -1; + return max_id; +} + + //1. Add in set the first pair having highiest specificity //2. For the sequences not WI by primers in set, calculate count for each pair equal to number of seqs WI by it //3. Take the pair with highiest such count and see its links with primers already in set, -//4. If no/insufficient links, take next pair +//4. If no/insufficient links, take next pair else add pair in set //5. repeate 3,4 untill pair gets added in set -//6. Repeate 2 to 5 until set complete +//6. Repeate 2 to 5 until set completes -pairset build_primers_set (ppair_t* sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, - poptions_t options) +pairset build_primers_set_greedy_spc (SetParams *params) { - SetParams params; pairset pair_set; int32_t i; int32_t pset_idx; int32_t prb_idx; int *pair_wi_count_sorted_ids; - params.sortedpairs = sortedpairs; - params.sorted_count = sorted_count; - params.seqdb = seqdb; - params.options = options; - memset (&pair_set, 0, sizeof(pairset)); - pair_set.set_wellIdentifiedTaxa = (int *) ECOMALLOC(options->dbsize*sizeof (int), + pair_set.set_wellIdentifiedTaxa = (int *) ECOMALLOC(params->options->dbsize*sizeof (int), "Could not allocate memory for pair set"); - pair_wi_count_sorted_ids = (int *) ECOMALLOC(sorted_count*sizeof (int), + pair_wi_count_sorted_ids = (int *) ECOMALLOC(params->sorted_count*sizeof (int), "Could not allocate memory for pair_wi_count_sorted"); for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) @@ -93,43 +260,33 @@ pairset build_primers_set (ppair_t* sortedpairs, int32_t sorted_count, pecodnadb pset_idx = 0; prb_idx = 0; //add first pair by default, this should be the one having highiest specificty - add_pair_in_set (&pair_set, pset_idx, prb_idx, ¶ms); + add_pair_in_set (&pair_set, pset_idx, prb_idx, params); pset_idx++; while (pset_idx < PRIMERS_IN_SET_COUNT) { //get a sorted list of pair ids with the pair well identifying most of the remaining seqs at top - get_next_pair_options (pair_wi_count_sorted_ids, &pair_set, ¶ms); + get_next_pair_options (pair_wi_count_sorted_ids, &pair_set, params); if (pair_wi_count_sorted_ids[0] == 0) { fprintf (stderr, "No further pair found, total so far %d\n", pset_idx); break; - } - else - fprintf (stderr, "Some counts found\n"); + } - - for (prb_idx = 0; prb_idx < sorted_count; prb_idx++) + for (prb_idx = 0; prb_idx < params->sorted_count; prb_idx++) { if (pair_wi_count_sorted_ids[prb_idx]) - if (ok_to_add (pair_wi_count_sorted_ids[prb_idx], &pair_set)) + if (ok_to_add (pair_wi_count_sorted_ids[prb_idx], &pair_set, params)) { //fprintf (stderr, "Oktoadd\n"); - float ldist = get_links_distribution (pair_wi_count_sorted_ids[prb_idx], &pair_set, ¶ms); - fprintf (stderr, "ldist:%f\n", ldist); - if (ldist >= 0.7) - { - add_pair_in_set (&pair_set, pset_idx, pair_wi_count_sorted_ids[prb_idx], ¶ms); - pset_idx++; - break; - } - else - fprintf (stderr, "ldist:%f\n", ldist); + add_pair_in_set (&pair_set, pset_idx, pair_wi_count_sorted_ids[prb_idx], params); + pset_idx++; } + if (pset_idx == PRIMERS_IN_SET_COUNT) break; } - if (prb_idx == sorted_count) + if (prb_idx == params->sorted_count) { fprintf (stderr, "No further pair found, total so far %d\n", pset_idx); break; @@ -142,8 +299,8 @@ pairset build_primers_set (ppair_t* sortedpairs, int32_t sorted_count, pecodnadb } } - get_set_mean_cov_stats (&pair_set, ¶ms); - + //get_set_mean_cov_stats (&pair_set, ¶ms); + reset_set_props (&pair_set, params); return pair_set; } @@ -253,7 +410,14 @@ void add_pair_in_set (pairset *pair_set, int32_t pset_idx, int32_t prb_idx, SetP int *pwi; int32_t i; + if (prb_idx < 0 || prb_idx >= pparams->sorted_count) return; pair_set->set_pairs[pset_idx] = prb_idx; + +// fprintf (stderr, "%d:", prb_idx); + //fprintf (stderr, "%d:", pparams->sortedpairs[prb_idx]); + //fprintf (stderr, "%d:", pparams->sortedpairs[prb_idx]->wellIdentifiedSeqs); + //fprintf (stderr, "%d:%d, ", i, pair_count[i]); + pwi = pparams->sortedpairs[prb_idx]->wellIdentifiedSeqs; for (i = 0; i < pparams->options->dbsize; i++) if (pwi[i] == 1) @@ -262,6 +426,42 @@ void add_pair_in_set (pairset *pair_set, int32_t pset_idx, int32_t prb_idx, SetP set_cov_spc (pair_set, pparams); } +int isthisset (pairset *pair_set) +{ + int set1[PRIMERS_IN_SET_COUNT]; + int set2[PRIMERS_IN_SET_COUNT]; + int i,j=0,k; + for (i=0; iset_pairs[i] != -1) set2[j++]=pair_set->set_pairs[i]; + } + set1[0]=0;set1[1]=2;set1[2]=7; + if (j==3) + { + for(i=0;i<3;i++) + { + for(k=0;k<3;k++) + if (set2[k]==set1[i])break; + if (k==3)break; + } + if(i==3) return 1;//found + } + + set1[0]=0;set1[1]=2;set1[2]=37; + if (j==3) + { + for(i=0;i<3;i++) + { + for(k=0;k<3;k++) + if (set2[k]==set1[i])break; + if (k==3)break; + } + if(i==3) return 1;//found + } + return 0; +} + void get_set_mean_cov_stats (pairset *pair_set, SetParams *pparams) { int32_t i, j, k; @@ -270,20 +470,44 @@ void get_set_mean_cov_stats (pairset *pair_set, SetParams *pparams) double msum; int *p1wi; int *p2wi; + int dbg=0; + + dbg=isthisset(pair_set); for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) { if (pair_set->set_pairs[i] == -1) continue; p1wi = pparams->sortedpairs[pair_set->set_pairs[i]]->wellIdentifiedSeqs; + + if(dbg) + { + printf ("\n\nWellIdentified for primer pair:%d\n",pair_set->set_pairs[i]); + for (k = 0; k < pparams->options->dbsize; k++) + if(p1wi[k]==1) + printf("%d,",k); + printf("\n"); + } + for (j = i+1; j < PRIMERS_IN_SET_COUNT; j++) { if (pair_set->set_pairs[j] == -1) continue; p2wi = pparams->sortedpairs[pair_set->set_pairs[j]]->wellIdentifiedSeqs; interseq_vals[interseq_cnt] = 0; + if (dbg) + { + printf ("Intersection for %d and %d:\n", pair_set->set_pairs[i], pair_set->set_pairs[j]); + } + for (k = 0; k < pparams->options->dbsize; k++) if (p1wi[k] == 1 && p2wi[k] == 1) + { interseq_vals[interseq_cnt]++; + if(dbg) + printf("%d,",k); + } + if(dbg) + printf("\n"); interseq_cnt++; } } @@ -305,7 +529,83 @@ void get_set_mean_cov_stats (pairset *pair_set, SetParams *pparams) pair_set->set_lcov = msum/interseq_cnt; if (pair_set->set_lcov != 0) - pair_set->set_score = (pair_set->set_coverage*pair_set->set_specificity*pair_set->set_lmean)/pair_set->set_lcov; + //pair_set->set_score = (pair_set->set_coverage*pair_set->set_specificity*pair_set->set_specificity*(sqrt(pair_set->set_lmean)))/sqrt(sqrt(pair_set->set_lcov)); + pair_set->set_score = (pair_set->set_coverage*pair_set->set_specificity*(pair_set->set_lmean))/sqrt(pair_set->set_lcov); +} + +void get_set_mean_cov_normalised_stats (pairset *pair_set, SetParams *pparams) +{ + int32_t i, j, k; + int interseq_vals[PRIMERS_IN_SET_COUNT*PRIMERS_IN_SET_COUNT]; + int interseq_cnt = 0; + double msum; + int *p1wi; + int *p2wi; + int dbg=0; + + dbg=isthisset(pair_set); + + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + { + if (pair_set->set_pairs[i] == -1) continue; + p1wi = pparams->sortedpairs[pair_set->set_pairs[i]]->wellIdentifiedSeqs; + + if(dbg) + { + printf ("\n\nWellIdentified for primer pair:%d\n",pair_set->set_pairs[i]); + for (k = 0; k < pparams->options->dbsize; k++) + if(p1wi[k]==1) + printf("%d,",k); + printf("\n"); + } + + for (j = i+1; j < PRIMERS_IN_SET_COUNT; j++) + { + if (pair_set->set_pairs[j] == -1) continue; + p2wi = pparams->sortedpairs[pair_set->set_pairs[j]]->wellIdentifiedSeqs; + interseq_vals[interseq_cnt] = 0; + + if (dbg) + { + printf ("Intersection for %d and %d:\n", pair_set->set_pairs[i], pair_set->set_pairs[j]); + } + + for (k = 0; k < pparams->options->dbsize; k++) + if (p1wi[k] == 1 && p2wi[k] == 1) + { + interseq_vals[interseq_cnt]++; + if(dbg) + printf("%d,",k); + } + if(dbg) + printf("\n"); + interseq_cnt++; + } + } + + //calculate mean + msum = 0; + pair_set->set_score = 0; + pair_set->set_lmean = 0; + pair_set->set_lcov = -1; + if (interseq_cnt == 0) return; + + for (i = 0; i < interseq_cnt; i++) + msum += interseq_vals[i]; + pair_set->set_lmean = msum/interseq_cnt; + + msum = 0; + for (i = 0; i < interseq_cnt; i++) + msum += (interseq_vals[i] - pair_set->set_lmean)*(interseq_vals[i] - pair_set->set_lmean); + pair_set->set_lcov = msum/interseq_cnt; + + if (pair_set->set_lcov != 0) + { + //normalised links + double nl = pair_set->set_lmean/sqrt (pair_set->set_lcov); + nl = nl/pparams->options->insamples; //max links cannot be more than insample value + pair_set->set_score = pair_set->set_coverage*pair_set->set_specificity*nl; + } } void reset_set_props (pairset *pair_set, SetParams *pparams) @@ -313,6 +613,7 @@ void reset_set_props (pairset *pair_set, SetParams *pparams) int *pwi; int i, j; + memset (pair_set->set_wellIdentifiedTaxa, 0, pparams->options->dbsize*sizeof(int)); for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) { if (pair_set->set_pairs[i] == -1) continue; @@ -323,7 +624,13 @@ void reset_set_props (pairset *pair_set, SetParams *pparams) } set_cov_spc (pair_set, pparams); - get_set_mean_cov_stats (pair_set, pparams); + + //TR 6/2/11: commented following, now score is just product of spc and cov + //get_set_mean_cov_stats (pair_set, pparams); + //get_set_mean_cov_normalised_stats (pair_set, pparams); + //pair_set->set_score = pair_set->set_coverage*pair_set->set_specificity; + pair_set->set_score = pair_set->set_coverage; + //pair_set->set_score = pair_set->set_specificity; } void print_set_info (pairset *pair_set, SetParams *pparams) @@ -331,9 +638,13 @@ void print_set_info (pairset *pair_set, SetParams *pparams) int i; int printed1st = 0; - printf ("%f\t%f\t%f\t%f\t%f\t", pair_set->set_specificity, - pair_set->set_coverage,pair_set->set_lmean, - pair_set->set_lcov,pair_set->set_score); + //TR 6/2/11: commented following, now score is just product of spc and cov + //printf ("%4.3f\t%4.3f\t%4.3f\t%4.3f\t%4.3f\t", pair_set->set_specificity, + // pair_set->set_coverage,pair_set->set_lmean, + // sqrt(pair_set->set_lcov),pair_set->set_score); + + printf ("%4.3f\t%4.3f\t%4.3f\t", pair_set->set_coverage, + pair_set->set_specificity,pair_set->set_score); for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) { @@ -345,6 +656,7 @@ void print_set_info (pairset *pair_set, SetParams *pparams) printf ("%d", pair_set->set_pairs[i]); printed1st = 1; } + printf ("\t%d\t%d", pair_set->set_intaxa, pair_set->set_wi_cnt); printf ("\n"); } @@ -372,7 +684,7 @@ void some_other_set_possibilities (pairset *pair_set, for (j = 0; j < sorted_count; j++) { - if (ok_to_add (j, pair_set)) + if (ok_to_add (j, pair_set, ¶ms)) { tmp_pair_set = *pair_set; tmp_pair_set.set_pairs[i] = j; @@ -386,3 +698,1073 @@ void some_other_set_possibilities (pairset *pair_set, ECOFREE (wi, "Could not free memory for pair set wi"); } +pairset clone_set (pairset *s, int32_t dbsize) +{ + pairset clone; + + clone = *s; + clone.set_wellIdentifiedTaxa = (int *) ECOMALLOC(dbsize*sizeof (int), + "Could not allocate memory for pair set"); + memcpy (clone.set_wellIdentifiedTaxa, s->set_wellIdentifiedTaxa, dbsize*sizeof (int)); + return clone; +} + +void add_in_tabu (int index) +{ + if (next_tabu_slot == -1) return; + + TabuList[next_tabu_slot] = index; + next_tabu_slot++; + if (next_tabu_slot >= PRIMERS_IN_SET_COUNT) next_tabu_slot = 0; +} + +int find_in_tabu (int index) +{ + int i; + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + if (TabuList[i] == index) return i; + + return -1; +} + +//do random changes in the seed set to generate new set +pairset get_neighbor (pairset *set, SetParams *params) +{ + int pinset = 0; + int i, j, id, cnt; + int how_many_to_replace; + pairset nset; + int replace_idx; + + //take the seed set as next neighbour sets + nset = clone_set (set, params->options->dbsize); + //see how many elements are in this set + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + { + if (nset.set_pairs[i] != -1) pinset++; + } + + if (pinset == params->sorted_count) return nset; + + //Randomly get number of elements to be replaced + //with new unused elements + how_many_to_replace = rand ()%pinset + 1; + //replace these many elements in the seed set + for (i = 0; i < how_many_to_replace; i++) + { + do + { + //we wil choose a random unused element as new element + id = rand ()%params->sorted_count; + }while (ok_to_add (id, &nset, params) == 0); + //again choose a random element in the set to replace + replace_idx = rand ()%pinset+1; + cnt = 0; + for (j = 0; j < PRIMERS_IN_SET_COUNT; j++) + { + if (nset.set_pairs[j] != -1) cnt++; + if (cnt == replace_idx) + { + if (next_tabu_slot != -1) + add_in_tabu (nset.set_pairs[j]); + + nset.set_pairs[j] = id; + break; + } + } + } + reset_set_props (&nset, params); + return nset; +} + +//remove a random number of least contributing elements +//from the seed set with random elements from the remaining +pairset get_neighbor4 (pairset *set, SetParams *params) +{ + int pinset = 0; + int i, j, id, id2; + int how_many_to_replace; + pairset nset; + //int replace_idx; + float contribution[PRIMERS_IN_SET_COUNT]; + int usedids[PRIMERS_IN_SET_COUNT]; + float sscore; + float leastContri; + //float lastLeastcontri; + int k, l; + + //take the seed set as next neighbour sets + nset = clone_set (set, params->options->dbsize); + //see how many elements are in this set + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + { + if (nset.set_pairs[i] != -1) pinset++; + } + + if (pinset == params->sorted_count) return nset; + + //Randomly get number of elements to be replaced + //with new unused elements + how_many_to_replace = rand ()%pinset + 1; + + //calculate contribution of each element in the set + sscore = nset.set_score; + //fprintf (stderr, "{%f-", sscore); + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + { + contribution[i] = 10000.0; + if (nset.set_pairs[i] != -1) + { + id = nset.set_pairs[i]; + nset.set_pairs[i] = -1; + reset_set_props (&nset, params); + contribution[i] = sscore - nset.set_score; + //fprintf (stderr, "[%f;%f]", nset.set_score, contribution[i]); + nset.set_pairs[i] = id; + reset_set_props (&nset, params); + //fprintf (stderr, "%f:, ", nset.set_score); + } + usedids[i] = -1; + } + + //lastLeastcontri = 10000.0; + k=0; + //replace these many elements in the seed set + //fprintf (stderr, "} (%d) ", how_many_to_replace); + for (i = 0; i < how_many_to_replace; i++) + { + do + { + //we wil choose a random unused element as new element + id = rand ()%params->sorted_count; + }while (ok_to_add (id, &nset, params) == 0); + + leastContri = 10000.0; + id2 = -1; + for (j = 0; j < PRIMERS_IN_SET_COUNT; j++) + { + if (nset.set_pairs[j] == -1) continue; + for (l = 0; l < k; l++) + if (usedids[l] == j) break; + + if (leastContri > contribution[j] /*&& leastContri >= lastLeastcontri*/ && l == k) + { + leastContri = contribution[j]; + id2 = j; + //fprintf (stderr, "%f:%d, ", leastContri, id2); + } + } + + if (id2 != -1) + { + usedids[k++] = id2; + //lastLeastcontri = leastContri; + if (next_tabu_slot != -1) + add_in_tabu (nset.set_pairs[id2]); + + nset.set_pairs[id2] = id; + } + } + //fprintf (stderr, "\n"); + reset_set_props (&nset, params); + return nset; +} + +//by replacing one element of the set with next from unused +int which_one_to_replace; +int last_replaced_with; +pairset get_neighbor2 (pairset *set, SetParams *params) +{ + int pinset = 0; + int i, j; + pairset nset; + + //take the seed set as next neighbour sets + nset = clone_set (set, params->options->dbsize); + //see how many elements are in this set + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + { + if (nset.set_pairs[i] != -1) pinset++; + } + + if (pinset == params->sorted_count) return nset; + + if (which_one_to_replace == pinset) which_one_to_replace = 0; + if (last_replaced_with == params->sorted_count) last_replaced_with = 0; + + j = -1; + for (i = 0; i < pinset; i++) + { + if (nset.set_pairs[i] != -1) j++; + if (j == which_one_to_replace) break; + } + + for (j = last_replaced_with; j < params->sorted_count; j++) + if (ok_to_add (j, &nset, params) == 1) break; + + if (j < params->sorted_count) + { + if (next_tabu_slot != -1) + add_in_tabu (nset.set_pairs[i]); + + nset.set_pairs[i] = j; + reset_set_props (&nset, params); + } + last_replaced_with++; + which_one_to_replace++; + + return nset; +} + +//replace element having least contribution with the next from the list +pairset get_neighbor3 (pairset *set, SetParams *params) +{ + int pinset = 0; + int i, j, id; + pairset nset; + float least_contribution; + float sscore; + + //take the seed set as next neighbour sets + nset = clone_set (set, params->options->dbsize); + //see how many elements are in this set + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + { + if (nset.set_pairs[i] != -1) pinset++; + } + + if (pinset == params->sorted_count) return nset; + + if (last_replaced_with == params->sorted_count) last_replaced_with = 0; + + sscore = nset.set_score; + which_one_to_replace = -1; + least_contribution = 1000.0; //impossible + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + { + if (nset.set_pairs[i] != -1) + { + id = nset.set_pairs[i]; + nset.set_pairs[i] = -1; + reset_set_props (&nset, params); + if ((sscore - nset.set_score) < least_contribution) + { + which_one_to_replace = i; + least_contribution = sscore - nset.set_score; + } + nset.set_pairs[i] = id; + } + } + + for (j = last_replaced_with; j < params->sorted_count; j++) + if (ok_to_add (j, &nset, params) == 1) break; + + if (j < params->sorted_count && which_one_to_replace != -1) + { + if (next_tabu_slot != -1) + add_in_tabu (nset.set_pairs[which_one_to_replace]); + + nset.set_pairs[which_one_to_replace] = j; + reset_set_props (&nset, params); + } + last_replaced_with++; + + return nset; +} + +float sa_P (float score, float nscore, float tem) +{ + if (nscore >= score) return 0.95; + if (nscore < score && tem > 0.3) return 0.1; + if (nscore < score && tem > 0.1) return 0.0001; + return 0.0; +} + +void extend_set_to (pairset *pair_set, SetParams *params, int extend_to_cnt) +{ + int pinset = 0; + int i, j; + if (extend_to_cnt > PRIMERS_IN_SET_COUNT) return; + + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + if (pair_set->set_pairs[i] != -1) pinset++; + + if (pinset >= extend_to_cnt) return; + + for (i = 0; i < params->sorted_count; i++) + { + if (ok_to_add (i, pair_set, params) == 1) + { + for (j = 0; j < PRIMERS_IN_SET_COUNT; j++) + if (pair_set->set_pairs[j] == -1) break; + add_pair_in_set (pair_set, j, i, params); + reset_set_props (pair_set, params); + pinset++; + if (pinset >= extend_to_cnt) break; + } + } +} + +pairset * extend_set_randomly (pairset *pset, SetParams *params, int extend_to_cnt) +{ + int pinset = 0; + int i = 0; + int id; + pairset *pair_set; + + if (extend_to_cnt > PRIMERS_IN_SET_COUNT) return pset; + + if (pset == NULL) + { + pair_set = (pairset *) ECOMALLOC(sizeof (pairset), + "Could not allocate memory for pair"); + pair_set->set_wellIdentifiedTaxa = (int *) ECOMALLOC(params->options->dbsize*sizeof (int), + "Could not allocate memory for pair set WI"); + + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + pair_set->set_pairs[i] = -1; + } + else + pair_set = pset; + + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + if (pair_set->set_pairs[i] != -1) pinset++; + + if (pinset >= extend_to_cnt) return pair_set; + + i = 0; + if (pinset == 0) + { + id = rand ()%params->sorted_count; + add_pair_in_set (pair_set, i, id, params); + i++; + pinset++; + } + + while (pinset < extend_to_cnt) + { + do + { + //we wil choose a random unused element as new element + id = rand ()%params->sorted_count; + }while (ok_to_add (id, pair_set, params) == 0); + add_pair_in_set (pair_set, i, id, params); + i++; + pinset++; + } + return pair_set; +} + +void set_reduce_to_best (pairset *pair_set, SetParams *params) +{ + int original_members[PRIMERS_IN_SET_COUNT]; + int i; + int mcnt = 0; + float max_score; + int m_to_remove = -1; + int tmem; + + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + { + original_members[i] = pair_set->set_pairs[i]; + if (original_members[i] != -1) mcnt++; + } + + while (mcnt > 3) + { + max_score = pair_set->set_score; + m_to_remove = -1; + + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + { + if (pair_set->set_pairs[i] == -1) continue; + tmem = pair_set->set_pairs[i]; + pair_set->set_pairs[i] = -1; //remove current element temporarily + reset_set_props (pair_set, params); + pair_set->set_pairs[i] = tmem; //restore + + if (max_score <= pair_set->set_score) + { + max_score = pair_set->set_score; + m_to_remove = i; + } + } + + if (m_to_remove != -1) + { + pair_set->set_pairs[m_to_remove] = -1; //remove element + reset_set_props (pair_set, params); + mcnt--; + } + else + { + reset_set_props (pair_set, params); + break; + } + } +} + +float sa_temp (int k, int kmax) +{ + return ((kmax - k)*1.0)/(kmax*1.0); +} + +void sets_by_SimulatedAnealing (pairset *pair_set, + ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options) +{ + SetParams params; + pairset set, bset, nset; + float score, bscore, nscore; + int k = 0, mcnt=0; + int kmax = sorted_count*10; + float max_score = 10000.0; + float min_spc, min_cov; + float max_spc, max_cov; + double randval = 0; + + int32_t count_per_size; + int32_t size_counter = 1; + + srand ( time(NULL) ); + params.sortedpairs = sortedpairs; + params.sorted_count = sorted_count; + params.seqdb = seqdb; + params.options = options; + which_one_to_replace = 0; + last_replaced_with = 0; + next_tabu_slot = -1; //dont use tabu search props + total_pairs = sorted_count; + + if (pair_set == NULL) + { + pair_set = extend_set_randomly (NULL, ¶ms, 3); + printf("\nStart Random seed set for Simulated :\n"); + print_set_info (&pair_set, ¶ms); + } + min_spc = max_spc = pair_set->set_specificity; + min_cov = max_cov = pair_set->set_coverage; + + for (k = 0; k < PRIMERS_IN_SET_COUNT; k++) + { + if (pair_set->set_pairs[k] != -1) mcnt++; + } + count_per_size = kmax/(PRIMERS_IN_SET_COUNT-mcnt); + k = 1; + + set = clone_set (pair_set, options->dbsize); + + /*if (mcnt < 5) + { + printf ("Set before extension:\n"); + print_set_info (&set, ¶ms); + extend_set_to (&set, ¶ms, 5); + printf ("Set after extension:\n"); + print_set_info (&set, ¶ms); + set_reduce_to_best (&set, ¶ms); + printf ("Set after reduction to best size:\n"); + print_set_info (&set, ¶ms); + }*/ + mcnt++; + extend_set_to (&set, ¶ms, mcnt); + + bset = clone_set (&set, options->dbsize); + score = bset.set_score; + bscore = score; + nset.set_wellIdentifiedTaxa = NULL; + + //srand ( time(NULL) ); + + while (k <= kmax && score < max_score) + { + if (k == (size_counter*count_per_size)) + { + size_counter++; + mcnt++; + extend_set_to (&set, ¶ms, mcnt); + } + //nset = get_neighbor (&set, ¶ms); //all random + //nset = get_neighbor2 (&set, ¶ms); //replace next with next available + nset = get_neighbor3 (&set, ¶ms); //replace the one with least contribution with next + //nset = get_neighbor4 (&set, ¶ms); //replace randome no of least contributing elements with random elements in the remaining set + + if (nset.set_specificity < min_spc) + min_spc = nset.set_specificity; + + if (nset.set_specificity > max_spc) + max_spc = nset.set_specificity; + + if (nset.set_coverage < min_cov) + min_cov = nset.set_coverage; + + if (nset.set_coverage > max_cov) + max_cov = nset.set_coverage; + + nscore = nset.set_score; + printf ("Neighbor: "); + print_set_info (&nset, ¶ms); + + if (nscore > bscore) + { + ECOFREE (bset.set_wellIdentifiedTaxa, "Could not free memory for pair set wi"); + bset = clone_set (&nset, options->dbsize); + bscore = nscore; + printf ("best: "); + print_set_info (&nset, ¶ms); + } + + randval = (double)rand()/(double)RAND_MAX; + if (sa_P (score, nscore, sa_temp (k,kmax)) > randval) + { + ECOFREE (set.set_wellIdentifiedTaxa, "Could not free memory for pair set wi"); + set = clone_set (&nset, options->dbsize); + score = nscore; + //which_one_to_replace = 0; + //last_replaced_with = 0; + //printf ("Seed Set: "); + //print_set_info (&set, ¶ms); + } + k++; + ECOFREE (nset.set_wellIdentifiedTaxa, "Could not free memory for pair set wi"); + } + printf ("Minimum specificity: %0.3f, Maximum specificity: %0.3f, range: %0.3f\n",min_spc, max_spc, max_spc-min_spc); + printf ("Minimum coverage: %0.3f, Maximum coverage: %0.3f, range: %0.3f\n",min_cov, max_cov, max_cov-min_cov); +} + + +void sets_by_TabuSearch (pairset *pair_set, + ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options) +{ + SetParams params; + pairset set, bset, nset; + float bscore, nscore; + int k = 0, mcnt=0; + int kmax = sorted_count*10; + float max_score = 1000.0; + float min_spc, min_cov; + float max_spc, max_cov; + + int32_t count_per_size; + int32_t size_counter = 1; + + srand ( time(NULL) ); + params.sortedpairs = sortedpairs; + params.sorted_count = sorted_count; + params.seqdb = seqdb; + params.options = options; + which_one_to_replace = 0; + last_replaced_with = 0; + next_tabu_slot = 0; //use tabu search + total_pairs = sorted_count; + + if (pair_set == NULL) + pair_set = extend_set_randomly (NULL, ¶ms, 3); + min_spc = max_spc = pair_set->set_specificity; + min_cov = max_cov = pair_set->set_coverage; + + for (k = 0; k < PRIMERS_IN_SET_COUNT; k++) + { + if (pair_set->set_pairs[k] != -1) mcnt++; + } + count_per_size = kmax/(PRIMERS_IN_SET_COUNT-mcnt); + + set = clone_set (pair_set, options->dbsize); + /*if (mcnt < 5) + { + printf ("Set before extension:\n"); + print_set_info (&set, ¶ms); + extend_set_to (&set, ¶ms, 5); + printf ("Set after extension:\n"); + print_set_info (&set, ¶ms); + set_reduce_to_best (&set, ¶ms); + printf ("Set after reduction to best size:\n"); + print_set_info (&set, ¶ms); + }*/ + mcnt++; + extend_set_to (&set, ¶ms, mcnt); + + bset = clone_set (&set, options->dbsize); + bscore = bset.set_score; + nset.set_wellIdentifiedTaxa = NULL; + + for (k = 0; k < PRIMERS_IN_SET_COUNT; k++) + TabuList[k] = -1; + + k = 1; + while (k < kmax && bscore < max_score) + { + if (k == (size_counter*count_per_size)) + { + size_counter++; + mcnt++; + extend_set_to (&set, ¶ms, mcnt); + } + + //nset = get_neighbor (&set, ¶ms); //all random + //nset = get_neighbor2 (&set, ¶ms); //replace next with next available + nset = get_neighbor3 (&set, ¶ms); //replace the one with least contribution with next + //nset = get_neighbor4 (&set, ¶ms); //replace randome no of least contributing elements with random elements in the remaining set + + if (nset.set_specificity < min_spc) + min_spc = nset.set_specificity; + + if (nset.set_specificity > max_spc) + max_spc = nset.set_specificity; + + if (nset.set_coverage < min_cov) + min_cov = nset.set_coverage; + + if (nset.set_coverage > max_cov) + max_cov = nset.set_coverage; + + nscore = nset.set_score; + printf ("Neighbor: "); + print_set_info (&nset, ¶ms); + + if (nscore > bscore) + { + ECOFREE (bset.set_wellIdentifiedTaxa, "Could not free memory for pair set wi"); + bset = clone_set (&nset, options->dbsize); + bscore = nscore; + printf ("best: "); + print_set_info (&nset, ¶ms); + } + ECOFREE (set.set_wellIdentifiedTaxa, "Could not free memory for pair set wi"); + set = nset; + k++; + } + printf ("Minimum specificity: %0.3f, Maximum specificity: %0.3f, range: %0.3f\n",min_spc, max_spc, max_spc-min_spc); + printf ("Minimum coverage: %0.3f, Maximum coverage: %0.3f, range: %0.3f\n",min_cov, max_cov, max_cov-min_cov); +} + +pairset *sets_by_BruteForce (ppair_t * sortedpairs, +//void sets_by_BruteForce (ppair_t * sortedpairs, + int32_t sorted_count, pecodnadb_t seqdb, poptions_t options) +{ + SetParams params; + params.sortedpairs = sortedpairs; + params.sorted_count = sorted_count; + params.seqdb = seqdb; + params.options = options; + + pairset set; + pairset *pset = NULL; + if (sorted_count < 3) + { + printf ("Too few primer pairs to find a pair set.\n"); + return NULL; + } + + set.set_wellIdentifiedTaxa = (int *) ECOMALLOC(options->dbsize*sizeof (int), + "Could not allocate memory for pair set WI"); + int32_t set_indeces_array[PRIMERS_IN_SET_COUNT]; + //int start_elements = 3; + int end_elements = 3; + int current_elements = 3; + int32_t i, j; + float maxscore = -1000.0; + int maxcount = 2000; + int counter = 0; + + if (sorted_count <= PRIMERS_IN_SET_COUNT) + end_elements = sorted_count; + + if (end_elements < sorted_count) + { + pset = (pairset *) ECOMALLOC(sizeof (pairset), + "Could not allocate memory for pair"); + pset->set_wellIdentifiedTaxa = (int *) ECOMALLOC(options->dbsize*sizeof (int), + "Could not allocate memory for pair set WI"); + } + + while (current_elements <= end_elements) + { + for (i=0; idbsize*sizeof (int)); + for (i=0; i maxscore) + { + maxscore = set.set_score; + printf ("best: "); + print_set_info (&set, ¶ms); + + ECOFREE (pset->set_wellIdentifiedTaxa, "Could not free memory for pair set wi"); + *pset = clone_set (&set, options->dbsize); + } + else + { + printf ("set: "); + print_set_info (&set, ¶ms); + } + + for (i=current_elements-1; i>0; i--) + { + set_indeces_array[i]++; + if (set_indeces_array[i] == (sorted_count+(i-current_elements+1))) + { + set_indeces_array[i] = set_indeces_array[i-1]+2; + for (j=i+1; j= sorted_count) + { + break; + } + if (i < current_elements) //above loop broken? + { + current_elements++; + break; + } + counter++; + if (counter > maxcount) + break; + } + if (counter > maxcount) + break; + } + return pset; +} + +void build_and_print_sets (ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options) +{ + SetParams params; + pairset pair_set; + pairset *pset; + + params.sortedpairs = sortedpairs; + params.sorted_count = sorted_count; + params.seqdb = seqdb; + params.options = options; + + pair_set = build_primers_set_greedy_spc (¶ms); + printf("Greedy algorithm results based on specificity:\n"); + print_set_info (&pair_set, ¶ms); + if (pair_set.set_wellIdentifiedTaxa) + ECOFREE (pair_set.set_wellIdentifiedTaxa, "Could not free memory for pair set wi"); + + pair_set = build_primers_set_greedy_cov (¶ms); + printf("\nGreedy algorithm results based on coverage:\n"); + print_set_info (&pair_set, ¶ms); + if (pair_set.set_wellIdentifiedTaxa) + ECOFREE (pair_set.set_wellIdentifiedTaxa, "Could not free memory for pair set wi"); + + pset = extend_set_randomly (NULL, ¶ms, 3); + printf("\nStart Random seed set:\n"); + print_set_info (pset, ¶ms); + + printf("\nResults from simulated Anealing:\n"); + sets_by_SimulatedAnealing (pset, sortedpairs, sorted_count, seqdb, options); + printf("\nResults from Tabu Search:\n"); + sets_by_TabuSearch (pset, sortedpairs, sorted_count, seqdb, options); +} + +void primers_graph_graphviz (ppair_t * sortedpairs, + int32_t sorted_count, poptions_t options) +{ + int32_t i, j, k, total_links; + char fileName[100] = "PrimerLinks"; + int *owi; + int *iwi; + int allowedtaxa; + FILE *of; + + srand ( time(NULL) ); + sprintf (fileName, "PrimerLinks_%d.gv", rand ()); + of = fopen (fileName, "w"); + + fprintf (of, "graph primerlinks {\n"); + for (i=0; iwellIdentifiedSeqs; + + for (j=i+1; jwellIdentifiedSeqs; + total_links = 0; + + for (k=0; kdbsize; k++) + if (owi[k] == 1 && iwi[k] == 1) + total_links++; + + //if (total_links > 0 && ((total_links*1.0) <= (options->max_links_percent*options->dbsize))) + //if (total_links > 0 && total_links <= options->max_links_percent) + + if((sortedpairs[i]->intaxa - sortedpairs[i]->notwellidentifiedtaxa) < (sortedpairs[j]->intaxa - sortedpairs[j]->notwellidentifiedtaxa)) + allowedtaxa = (sortedpairs[i]->intaxa - sortedpairs[i]->notwellidentifiedtaxa)/2; + else + allowedtaxa = (sortedpairs[j]->intaxa - sortedpairs[j]->notwellidentifiedtaxa)/2; + + + //if (total_links > 5 && (total_links <= options->max_links_percent || options->max_links_percent==-1)) + // fprintf (of, "\t%d -- %d [label=\"%d: %0.2f: %0.2f\"];\n", i, j, total_links,sortedpairs[i]->bc,sortedpairs[j]->bc ); + + allowedtaxa = options->max_links_percent; + if (total_links > 5 && total_links < allowedtaxa) + fprintf (of, "\t%d -- %d [label=\"%d: %0.2f: %0.2f\"];\n", i, j, total_links,sortedpairs[i]->bc,sortedpairs[j]->bc ); + //fprintf (of, "\t%d\t%d\t%d\n", i, j, total_links); + + //fprintf (of, "\t%d\t%d\t%d\n", i, j, total_links); + //fprintf (of, "\t%d -- %d;\n", i, j, total_links); + } + } + fprintf (of, "}\n"); + fclose (of); +} + +int32_t *addinset (int32_t *set, int32_t i, int32_t j, int32_t* slots, int32_t *index) +{ + int32_t k; + + if (*index == *slots) + { + *slots += 50; + set = ECOREALLOC(set, (*slots)*sizeof (int32_t), + "Could not allocate memory for index set."); + } + + if (i > -1) + { + for (k=0; k<*index; k++) + if (set[k] == i) break; + if (k== *index) + set[(*index)++] = i; + } + + if (j > -1) + { + for (k=0; k<*index; k++) + if (set[k] == j) break; + if (k== *index) + set[(*index)++] = j; + } + + return set; +} + +size_t primers_changeSortedArray (ppair_t ** pairs, + size_t sorted_count, poptions_t options) +{ + int32_t i, j, k, l, total_links; + int *owi; + int *iwi; + int allowedtaxa; + ppair_t *sortedpairs = *pairs; + bool_t passed; + + int32_t *idx_set = NULL; + int32_t slots=50, index=0; + + idx_set = ECOMALLOC(slots*sizeof (int32_t), + "Could not allocate memory for index set."); + + for (i=0; iwellIdentifiedSeqs; + passed = FALSE; + + for (j=0; jwellIdentifiedSeqs; + total_links = 0; + + for (k=0; kdbsize; k++) + if (owi[k] == 1 && iwi[k] == 1) + total_links++; + + //if (total_links > 0 && ((total_links*1.0) <= (options->max_links_percent*options->dbsize))) + //if (total_links > 0 && total_links <= options->max_links_percent) + + if((sortedpairs[i]->intaxa - sortedpairs[i]->notwellidentifiedtaxa) < (sortedpairs[j]->intaxa - sortedpairs[j]->notwellidentifiedtaxa)) + allowedtaxa = (sortedpairs[i]->intaxa - sortedpairs[i]->notwellidentifiedtaxa)/2; + else + allowedtaxa = (sortedpairs[j]->intaxa - sortedpairs[j]->notwellidentifiedtaxa)/2; + + + //if (total_links > 5 && (total_links <= options->max_links_percent || options->max_links_percent==-1)) + // fprintf (of, "\t%d -- %d [label=\"%d: %0.2f: %0.2f\"];\n", i, j, total_links,sortedpairs[i]->bc,sortedpairs[j]->bc ); + + if (options->max_links_percent > 0) + { + allowedtaxa = options->max_links_percent; + if (total_links > allowedtaxa) + passed = TRUE; + break; + } + else + if (!(total_links > 5 && total_links <= allowedtaxa)) + { + //idx_set = addinset (idx_set, i, j, &slots, &index); + passed = TRUE; + break; + } + } + if (passed == TRUE) + idx_set = addinset (idx_set, i, -1, &slots, &index); + } + + i=-1; + for (j=0; jwellIdentifiedSeqs, "Cannot free wi for changing sorted array"); + ECOFREE (sortedpairs[j]->pcr.amplifias, "Cannot free wi for changing sorted array"); + if (i == -1) i = j; + } + else + { + if (i != -1) + sortedpairs[i++] = sortedpairs[j]; + } + } + ECOFREE (idx_set, "Cannot free index set."); + if (i != -1) + { + *pairs = ECOREALLOC (*pairs, i*sizeof(pair_t), "Cannot free wi for changing sorted array"); + } + else i=sorted_count; + return i; +} + + +int32_t *addinset_withLinks (int32_t *set, int32_t i, int32_t* slots, int32_t *index, ppair_t *pairs, poptions_t options) +{ + int32_t j, k, total_links; + int *owi; + int *iwi; + bool_t passed = TRUE; + int allowedtaxa; + + //see if we need to extend the set array + if (*index == *slots) + { + *slots += 50; + set = ECOREALLOC(set, (*slots)*sizeof (int32_t), + "Could not allocate memory for index set."); + } + + //find no of links of current element i with all the elements + //in the set to see that they are within limit + owi = pairs[i]->coveredSeqs; + for (j=0; j<*index; j++) + { + iwi = pairs[set[j]]->coveredSeqs; + total_links = 0; + + for (k=0; kdbsize; k++) + if (owi[k] == 1 && iwi[k] == 1) + total_links++; + + //if((pairs[i]->intaxa - pairs[i]->notwellidentifiedtaxa) < (pairs[set[j]]->intaxa - pairs[set[j]]->notwellidentifiedtaxa)) + // allowedtaxa = (pairs[i]->intaxa - pairs[i]->notwellidentifiedtaxa)/2; + //else + // allowedtaxa = (pairs[set[j]]->intaxa - pairs[set[j]]->notwellidentifiedtaxa)/2; + + if(pairs[i]->intaxa < pairs[set[j]]->intaxa) + allowedtaxa = pairs[i]->intaxa/2; + else + allowedtaxa = pairs[set[j]]->intaxa/2; + + if (!(total_links > 5 && total_links <= allowedtaxa)) + passed = FALSE; + } + + //links respect the limits with all set elements + if (passed) + { + for (k=0; k<*index; k++) + if (set[k] == i) break; + if (k== *index) + set[(*index)++] = i; + } + + return set; +} + +size_t primers_filterWithGivenLinks (ppair_t ** pairs, + size_t sorted_count, poptions_t options) +{ + int32_t i, j, k; + ppair_t *sortedpairs = *pairs; + bool_t passed; + + int32_t *idx_set = NULL; + int32_t slots=50, index=0; + int *cov = ECOMALLOC(options->dbsize*sizeof (int), + "Could not allocate memory for index set."); + + idx_set = ECOMALLOC(slots*sizeof (int32_t), + "Could not allocate memory for index set."); + + for (i=sorted_count-1; i>=0; i--) + { + idx_set = addinset_withLinks (idx_set, i, &slots, &index, sortedpairs, options); + } + + i=-1; + for (j=0; jcoveredSeqs, "Cannot free wi for changing sorted array"); + ECOFREE (sortedpairs[j]->wellIdentifiedSeqs, "Cannot free wi for changing sorted array"); + ECOFREE (sortedpairs[j]->pcr.amplifias, "Cannot free wi for changing sorted array"); + if (i == -1) i = j; + } + else + { + if (i != -1) + sortedpairs[i++] = sortedpairs[j]; + } + } + ECOFREE (idx_set, "Cannot free index set."); + if (i != -1) + { + *pairs = ECOREALLOC (*pairs, i*sizeof(pair_t), "Cannot free wi for changing sorted array"); + } + else i=sorted_count; + + + for (j=0; jdbsize; k++) + if ((*pairs)[j]->coveredSeqs[k] == 1) + cov[k] = 1; + j=0; + for (k=0; kdbsize; k++) + if (cov[k] == 1) + j++; + fprintf (stderr, "\nALL ELEMENTS COVERAGE: (%d/%d) %0.2f\n", j, options->intaxa, j*1.0/options->intaxa); + ECOFREE (cov, "Cannot free cov"); + + return i; +} diff --git a/src/libecoprimer/PrimerSets.h b/src/libecoprimer/PrimerSets.h index 8e46432..44d6294 100644 --- a/src/libecoprimer/PrimerSets.h +++ b/src/libecoprimer/PrimerSets.h @@ -13,6 +13,8 @@ typedef struct { float set_lmean; float set_lcov; float set_score; + int32_t set_intaxa; + int32_t set_wi_cnt; }pairset; typedef struct{ @@ -33,9 +35,24 @@ typedef struct{ void add_pair_in_set (pairset *pair_set, int32_t pset_idx, int32_t prb_idx, SetParams *pparams); void get_next_pair_options (int *pair_wi_count_sorted_ids, pairset *pair_set, SetParams *pparams); float get_links_distribution (int prb_idx, pairset *prob_set, SetParams *pparams); -pairset build_primers_set (ppair_t* sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, - poptions_t options); +pairset build_primers_set_greedy_spc (SetParams *pparams); void get_set_mean_cov_stats (pairset *prob_set, SetParams *pparams); void some_other_set_possibilities (pairset *pair_set, ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options); +void sets_by_SimulatedAnealing (pairset *pair_set, + ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options); +void sets_by_TabuSearch (pairset *pair_set, + ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options); +pairset * sets_by_BruteForce (ppair_t * sortedpairs, + int32_t sorted_count, pecodnadb_t seqdb, poptions_t options); +pairset * extend_set_randomly (pairset *pair_set, SetParams *params, int extend_to_cnt); +void build_and_print_sets (ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options); +int32_t get_next_option_increasing_cov (pairset *pair_set, SetParams *pparams); +void reset_set_props (pairset *pair_set, SetParams *pparams); +void primers_graph_graphviz (ppair_t * sortedpairs, + int32_t sorted_count, poptions_t options); +size_t primers_changeSortedArray (ppair_t ** pairs, + size_t sorted_count, poptions_t options); +size_t primers_filterWithGivenLinks (ppair_t ** pairs, + size_t sorted_count, poptions_t options); #endif diff --git a/src/libecoprimer/ahocorasick.c b/src/libecoprimer/ahocorasick.c new file mode 100755 index 0000000..078be0d --- /dev/null +++ b/src/libecoprimer/ahocorasick.c @@ -0,0 +1,479 @@ +/* + * ahocorasick.h + * + * Created on: 26 march 2011 + * Author: tiayyba + */ +#include +#include "hashencoder.h" +#include "ahocorasick.h" + +void ahoc_graphKeywordTree (aho_state *root); +aho_state *groot = NULL; //just for graph testing + +#define BASEATINDEX(w, l, i) (uint8_t)((((w)&(0x3LLU<<(((l)-(i))*2)))>>(((l)-(i))*2)) & 0x3LLU) + +void ahoc_addOutputElement (aho_state *node, bool_t isdirect, uint32_t idx) +{ + if (!node) return; + if (node->output.count == 0) + node->output.out_set = ECOMALLOC(sizeof(aho_output), + "Cannot allocate memory for aho-corasick state output element"); + else + node->output.out_set = ECOREALLOC(node->output.out_set, (node->output.count+1)*sizeof(aho_output), + "Cannot allocate memory for aho-corasick state output element"); + node->output.out_set[node->output.count].wordidx = idx; + node->output.out_set[node->output.count].isdirect = isdirect; + node->output.count++; +} + +//is the passed output element in the set +bool_t ahoc_isOutputIn (aho_state *node, aho_output ot) +{ + uint32_t i; + + for (i=0; ioutput.count; i++) + if (node->output.out_set[i].isdirect == ot.isdirect && node->output.out_set[i].wordidx == ot.wordidx) return TRUE; + return FALSE; +} + +//take union of output of the two nodes and put in node1 +void ahoc_unionOutputElements (aho_state *node1, aho_state *node2) +{ + uint32_t i; + + for (i=0; ioutput.count; i++) + if (ahoc_isOutputIn (node1, node2->output.out_set[i]) == FALSE) + ahoc_addOutputElement (node1, node2->output.out_set[i].isdirect, node2->output.out_set[i].wordidx); +} + +void ahoc_addKeyword (aho_state *root, word_t w, bool_t isdirect, uint32_t idx, poptions_t options) +{ + uint32_t i; + aho_state *nextnode = root; + uint8_t basecode; + static uint32_t state_id = 0; + + //fprintf (stderr, "%s\n", ecoUnhashWord(w, options->primer_length)); + for (i=1; i<=options->primer_length; i++) + { + basecode = BASEATINDEX (w, options->primer_length, i); + //fprintf (stderr, "%d", basecode); + if (nextnode->next[basecode] == NULL) + { + //add new state + nextnode->next[basecode] = ECOMALLOC(sizeof(aho_state), + "Cannot allocate memory for aho-corasick state"); + nextnode = nextnode->next[basecode]; + //initialize state + nextnode->id = ++state_id; + nextnode->next[0]=nextnode->next[1]=nextnode->next[2]=nextnode->next[3]=NULL; + nextnode->fail = NULL; + nextnode->output.count = 0; + } + else + nextnode = nextnode->next[basecode]; + } + //fprintf (stderr, "\n", basecode); + //new pattern addess so add node ouptup element + ahoc_addOutputElement (nextnode, isdirect, idx); +} + +void ahoc_buildKeywordTree (aho_state *root, pwordcount_t words, poptions_t options) +{ + uint32_t i; + if (!root) return; + + //init root + root->id = 0; + root->next[0]=root->next[1]=root->next[2]=root->next[3]=NULL; + root->fail = NULL; + root->output.count = 0; + + //now add each word as a pattern in the keyword tree + for (i=0; isize; i++) + { + //add direct word + word_t w=WORD(words->words[i]); + ahoc_addKeyword (root, w, TRUE, i, options); + + //add reverse word + w=ecoComplementWord(w,options->primer_length); + ahoc_addKeyword (root, w, FALSE, i, options); + } + + //loop on root if some base has no out going edge from roots + for (i=0; i<4; i++) + if (root->next[i] == NULL) + root->next[i] = root; +} + +void ahoc_enqueue (aho_queue *ahoqueue, aho_state *node) +{ + queue_node *q; + if (node == NULL) return; + + q = ECOMALLOC(sizeof(queue_node), + "Cannot allocate memory for aho-corasick queue node"); + q->state_node = node; + q->next = NULL; + + if (ahoqueue->first == NULL) + { + ahoqueue->first = q; + ahoqueue->last = q; + } + else + { + ahoqueue->last->next = q; + ahoqueue->last = q; + } +} + +aho_state *ahoc_dequeue (aho_queue *ahoqueue) +{ + aho_state *node = NULL; + queue_node *q; + + if (ahoqueue->first == NULL) return node; + q = ahoqueue->first; + ahoqueue->first = q->next; + + node = q->state_node; + ECOFREE (q, "Cannot free memory for aho-corasick queue node"); + return node; +} + +//set fail links and output sets for the keyword tree +void ahoc_updateForFailAndOutput (aho_state *root) +{ + int32_t i; + aho_queue Q; + aho_state *node_r; + aho_state *node_u; + aho_state *node_v; + + //empty queue + Q.first = NULL; + Q.last = NULL; + + //for us alphabet has 4 elements, A=0, C=1, G=2 and T=3 + for (i=0; i<4; i++) + { + if (root->next[i] != root && root->next[i] != NULL) + { + root->next[i]->fail = root; + ahoc_enqueue (&Q, root->next[i]); + } + } + + //while queue not empty + while (Q.first != NULL) + { + node_r = ahoc_dequeue (&Q); + for (i=0; i<4; i++) + { + if (node_r->next[i] != NULL) + { + node_u = node_r->next[i]; + ahoc_enqueue (&Q, node_u); + node_v = node_r->fail; + while (node_v->next[i] == NULL) + node_v = node_v->fail; + node_u->fail = node_v->next[i]; + ahoc_unionOutputElements (node_u, node_u->fail); + } + } + } +} + +void ahoc_freeKeywordTree (aho_state *node) +{ + int i; + for (i=0; i<4; i++) + if (node->next[i]) + ahoc_freeKeywordTree (node->next[i]); + if (node->output.count > 0) + ECOFREE (node->output.out_set, "Free failed for node output"); + ECOFREE (node, "Free failed for node"); +} + +pprimercount_t ahoc_lookforStrictPrimers (pecodnadb_t database, uint32_t seqdbsize,uint32_t exampleCount, + pwordcount_t words,poptions_t options) +{ + aho_state automaton_root; + aho_state *curr_state; + //uint32_t inSequenceQuorum; + uint32_t outSequenceQuorum; + pprimer_t data; + pprimercount_t primers; + uint32_t i, j, k; + int32_t pos; + uint32_t lmax; + char *base; + int8_t code; + uint32_t goodPrimers=0; + static int iii=0; + + + //inSequenceQuorum = (uint32_t)floor((float)exampleCount * options->sensitivity_quorum); + outSequenceQuorum = (uint32_t)floor((float)(seqdbsize-exampleCount) * options->false_positive_quorum); + + //fprintf(stderr," Primers should be at least present in %d/%d example sequences\n",inSequenceQuorum,exampleCount); + fprintf(stderr," Primers should not be present in more than %d/%d counterexample sequences\n",outSequenceQuorum,(seqdbsize-exampleCount)); + + data = ECOMALLOC(words->size * sizeof(primer_t), + "Cannot allocate memory for fuzzy matching results"); + for (i=0; i < words->size; i++) + { + data[i].word=WORD(words->words[i]); + data[i].inexample = 0; + data[i].outexample= 0; + + data[i].directCount=ECOMALLOC(seqdbsize * sizeof(uint32_t), + "Cannot allocate memory for primer position"); + data[i].directPos = ECOMALLOC(seqdbsize * sizeof(poslist_t), + "Cannot allocate memory for primer position"); + data[i].reverseCount=ECOMALLOC(seqdbsize * sizeof(uint32_t), + "Cannot allocate memory for primer position"); + data[i].reversePos = ECOMALLOC(seqdbsize * sizeof(poslist_t), + "Cannot allocate memory for primer position"); + } + + //build keywords automaton + ahoc_buildKeywordTree (&automaton_root, words, options); + //set fail links and output sets + ahoc_updateForFailAndOutput (&automaton_root); + + //debug; print keywordtree in a gv file + //ahoc_graphKeywordTree (&automaton_root); + + //loop on each sequence for its each base and find words + for (i=0; i < seqdbsize; i++) + { + if(database[i]->SQ_length <= options->primer_length) continue; + + lmax = database[i]->SQ_length; + if (!options->circular) + lmax += options->primer_length-1; + curr_state = &automaton_root; + + for (j=0,base=database[i]->SQ; jSQ_length) base=database[i]->SQ; + + //code = encoder[(*base) - 'A']; + code = *base; + //if (iii++ < 30) + // fprintf (stderr, "%d:%d,", *base, code); + if (code < 0 || code > 3) + { + //if error char, start from root for next character + //+forget any incomplete words + curr_state = &automaton_root; + continue; + } + while (curr_state->next[code] == NULL) curr_state = curr_state->fail; + curr_state = curr_state->next[code]; + + //start position of primer is options->primer_length-1 chars back + pos = j-options->primer_length+1; + if (pos < 0) pos = database[i]->SQ_length+pos; + + //set output, if there is some output on this state then + //+all words in the output set complete here, so increment their + //+found properties for current sequence + for (k=0; koutput.count; k++) + { + if (curr_state->output.out_set[k].isdirect) + data[curr_state->output.out_set[k].wordidx].directCount[i]++; + else + data[curr_state->output.out_set[k].wordidx].reverseCount[i]++; + + if (options->no_multi_match) + { + if ((data[curr_state->output.out_set[k].wordidx].directCount[i] + + data[curr_state->output.out_set[k].wordidx].reverseCount[i]) > 1) + //since multimach not allowd, set an indication on 1st seq position that + //+ a multimatch was found, so that this word will be filtered out + //+ and because of first postion we wont have to search the whole array + //+ to find if it voilated nomultimatch constraint for some seq + data[curr_state->output.out_set[k].wordidx].directCount[0] = 2; + else + { + if (curr_state->output.out_set[k].isdirect) + //direct word found on jth position of ith sequence + data[curr_state->output.out_set[k].wordidx].directPos[i].value = (uint32_t)pos; + else + //reverse word found on jth position of ith sequence + data[curr_state->output.out_set[k].wordidx].reversePos[i].value = (uint32_t)pos; + } + } + else + { + //okay multi match allowed + if (curr_state->output.out_set[k].isdirect) + { + if (data[curr_state->output.out_set[k].wordidx].directCount[i] == 1) + data[curr_state->output.out_set[k].wordidx].directPos[i].value = (uint32_t)pos; + else + { + //need to create or extend the positions list + if (data[curr_state->output.out_set[k].wordidx].directCount[i] == 2) + { + //for second element, first was put in .value, so dont forget to copy that in the array too + data[curr_state->output.out_set[k].wordidx].directPos[i].pointer = ECOMALLOC(2 * sizeof(uint32_t), + "Cannot allocate memory for primer position"); + data[curr_state->output.out_set[k].wordidx].directPos[i].pointer[0] = data[curr_state->output.out_set[k].wordidx].directPos[i].value; + data[curr_state->output.out_set[k].wordidx].directPos[i].pointer[1] = (uint32_t)pos; + } + else + { + //for third or greater element + data[curr_state->output.out_set[k].wordidx].directPos[i].pointer = ECOREALLOC(data[curr_state->output.out_set[k].wordidx].directPos[i].pointer, + data[curr_state->output.out_set[k].wordidx].directCount[i] * sizeof(uint32_t), + "Cannot allocate memory for primer position"); + data[curr_state->output.out_set[k].wordidx].directPos[i].pointer[data[curr_state->output.out_set[k].wordidx].directCount[i]-1] = (uint32_t)pos; + } + } + } + else + { + if (data[curr_state->output.out_set[k].wordidx].reverseCount[i] == 1) + data[curr_state->output.out_set[k].wordidx].reversePos[i].value = (uint32_t)pos; + else + { + //need to create or extend the positions list + if (data[curr_state->output.out_set[k].wordidx].reverseCount[i] == 2) + { + //for second element, first was put in .value, so dont forget to copy that in the array too + data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer = ECOMALLOC(2 * sizeof(uint32_t), + "Cannot allocate memory for primer position"); + data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer[0] = data[curr_state->output.out_set[k].wordidx].reversePos[i].value; + data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer[1] = (uint32_t)pos; + } + else + { + //for third or greater element + data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer = ECOREALLOC(data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer, + data[curr_state->output.out_set[k].wordidx].reverseCount[i] * sizeof(uint32_t), + "Cannot allocate memory for primer position"); + data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer[data[curr_state->output.out_set[k].wordidx].reverseCount[i]-1] = (uint32_t)pos; + } + } + } + } + //dont forget to increment inexample or outexample count, but only once for a sequence + if ((data[curr_state->output.out_set[k].wordidx].directCount[i] + + data[curr_state->output.out_set[k].wordidx].reverseCount[i]) == 1) + { + if (database[i]->isexample) + data[curr_state->output.out_set[k].wordidx].inexample++; + else + data[curr_state->output.out_set[k].wordidx].outexample++; + } + } + } + } + + //Only thing that remains is to remove the failed words + for (i=0,j=0; isize; i++) + { + fprintf(stderr,"Primers %5d/%lld analyzed => sequence : %s in %d example and %d counterexample sequences \r", + i+1,words->size,ecoUnhashWord(data[i].word,options->primer_length), + data[i].inexample,data[i].outexample); + + //if (data[i].inexample < inSequenceQuorum || (data[i].directCount[0] == 2 && options->no_multi_match)) + if (data[i].directCount[0] == 2 && options->no_multi_match) + { + //bad word, delete from the array + for (k=0; k 1) + ECOFREE (data[i].directPos[k].pointer, "Cannot free position pointer."); + if (data[i].reverseCount[k] > 1) + ECOFREE (data[i].reversePos[k].pointer, "Cannot free position pointer."); + } + ECOFREE (data[i].directCount, "Cannot free position pointer."); + ECOFREE (data[i].directPos, "Cannot free position pointer."); + ECOFREE (data[i].reverseCount, "Cannot free position pointer."); + ECOFREE (data[i].reversePos, "Cannot free position pointer."); + } + else + { + //data[i].good = data[i].inexample >= inSequenceQuorum && data[i].outexample <= outSequenceQuorum; + data[i].good = data[i].outexample <= outSequenceQuorum; + goodPrimers+=data[i].good? 1:0; + if (j < i) + data[j] = data[i]; + j++; + } + } + fprintf(stderr,"\n\nOn %lld analyzed primers %d respect quorum conditions\n",words->size,goodPrimers); + fprintf(stderr,"Conserved primers for further analysis : %d/%lld\n",j,words->size); + + primers = ECOMALLOC(sizeof(primercount_t),"Cannot allocate memory for primer table"); + primers->primers=ECOREALLOC(data, + j * sizeof(primer_t), + "Cannot reallocate memory for fuzzy matching results"); + primers->size=j; + + //free memory of keyword table + for (i=0; i<4; i++) + if (automaton_root.next[i] != &automaton_root) + ahoc_freeKeywordTree (automaton_root.next[i]); + + return primers; +} + +void ahoc_graphPrintNodesInfo (aho_state *node, FILE* gfile) +{ + uint32_t i; + fprintf (gfile, "\"%d\"[\n", node->id); + fprintf (gfile, "label=\"%d\\n", node->id); + for (i=0; ioutput.count; i++) + fprintf (gfile, "%d%c,", node->output.out_set[i].wordidx, node->output.out_set[i].isdirect?'d':'r'); + fprintf (gfile, "\"\n];\n"); + + for (i=0; i<4; i++) + if (node->next[i] != NULL && node->next[i] != node) + ahoc_graphPrintNodesInfo (node->next[i], gfile); +} + +void ahoc_graphPrintNodesLinks (aho_state *node, FILE* gfile) +{ + uint32_t i; + static int j=0; + + for (i=0; i<4; i++) + if (node->next[i] != NULL && node->next[i] != node) + { + fprintf (gfile, "\"%d\" -> \"%d\" [\n", node->id, node->next[i]->id); + fprintf (gfile, "label=\"%c\"\n];\n", "ACGT"[i]); + } + + if (j++ < 40) + if (node->fail != NULL && node->fail != groot) + { + fprintf (gfile, "\"%d\" -> \"%d\" [\n", node->id, node->fail->id); + fprintf (gfile, "color= \"red\"\n];\n"); + } + + for (i=0; i<4; i++) + if (node->next[i] != NULL && node->next[i] != node) + ahoc_graphPrintNodesLinks (node->next[i], gfile); +} + +void ahoc_graphKeywordTree (aho_state *root) +{ + FILE *gfile; + + groot=root; + gfile = fopen ("keywordtree.gv", "w"); + fprintf (gfile, "digraph keywordtree {\n"); + ahoc_graphPrintNodesInfo (root, gfile); + ahoc_graphPrintNodesLinks (root, gfile); + fprintf (gfile, "}\n"); + fclose(gfile); +} + diff --git a/src/libecoprimer/ahocorasick.h b/src/libecoprimer/ahocorasick.h new file mode 100755 index 0000000..097af4f --- /dev/null +++ b/src/libecoprimer/ahocorasick.h @@ -0,0 +1,43 @@ +/* + * ahocorasick.h + * + * Created on: 26 march 2011 + * Author: tiayyba + */ + +#ifndef H_ahocorasick +#define H_ahocorasick + +#include "ecoprimer.h" + +typedef struct aho_output_t{ + uint32_t wordidx; //index of strict word (dont save the word of 64B) + bool_t isdirect; //we need to find both direct and reverse words so we must know which one is it +}aho_output; + +typedef struct aho_output_count_t{ + uint32_t count; + aho_output *out_set; +}aho_output_count; + +typedef struct aho_state_t{ + int32_t id; + struct aho_state_t *next[4]; //for labels A=0,C=1,G=2 and T=3 + struct aho_state_t *fail; + aho_output_count output; +}aho_state; + +typedef struct queue_node_t { + aho_state *state_node; + struct queue_node_t *next; +}queue_node; + +typedef struct{ + queue_node *first; + queue_node *last; +}aho_queue; + +pprimercount_t ahoc_lookforStrictPrimers (pecodnadb_t database, uint32_t seqdbsize,uint32_t exampleCount, + pwordcount_t words,poptions_t options); +#endif /* H_ahocorasick */ + diff --git a/src/libecoprimer/ecoprimer.h b/src/libecoprimer/ecoprimer.h index b2f61fd..37fefd1 100644 --- a/src/libecoprimer/ecoprimer.h +++ b/src/libecoprimer/ecoprimer.h @@ -176,6 +176,7 @@ typedef struct { int *wellIdentifiedSeqs; //< an array having elements equla to total seqs // values are either 0 or 1, if seq is well identified // its 1 else 0 + int *coveredSeqs; //< an array having elements equal to total seqs, 1 if seq is covered else 0 // these statistics are relative to inexample sequences @@ -291,6 +292,9 @@ typedef struct { PNNParams pnparm; bool_t print_sets_of_primers; float specificity_threshold; + int links_cnt; + float max_links_percent; + bool_t filter_on_links; } options_t, *poptions_t; typedef ecoseq_t **pecodnadb_t; @@ -350,7 +354,7 @@ int32_t getrankdbstats(pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxonomy, poptions_t options); -float taxonomycoverage(ppair_t pair, poptions_t options); +float taxonomycoverage(ppair_t pair, poptions_t options, pecodnadb_t seqdb,uint32_t seqdbsize); char ecoComplementChar(char base); void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize); diff --git a/src/libecoprimer/filtering.c b/src/libecoprimer/filtering.c index a585f9e..24e1387 100644 --- a/src/libecoprimer/filtering.c +++ b/src/libecoprimer/filtering.c @@ -114,6 +114,8 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest, error<<= 1; error&=ERRORMASK(FWORDSIZE); + //code = -1; + //if((*base) >= 'A' && (*base) <= 'Z') code = encoder[(*base) - 'A']; if (code <0) { @@ -154,7 +156,7 @@ int32_t *filteringSeq(pecodnadb_t database, uint32_t seqdbsize, for (i=0;iisexample) + if (database[i]->isexample && database[i]->SQ_length > options->primer_length) { j++; wordscount=ecoFilteringHashSequence(wordscount, diff --git a/src/libecoprimer/pairs.c b/src/libecoprimer/pairs.c index ecb302b..88c1c4b 100644 --- a/src/libecoprimer/pairs.c +++ b/src/libecoprimer/pairs.c @@ -179,7 +179,7 @@ static void buildPrimerPairsForOneSeq(uint32_t seqid, uint32_t i,j,k; uint32_t matchcount=0; pprimermatch_t matches = NULL; - primermatchcount_t seqmatchcount; + //primermatchcount_t seqmatchcount; ppair_t pcurrent; pair_t current; pprimer_t wswp; @@ -189,9 +189,9 @@ static void buildPrimerPairsForOneSeq(uint32_t seqid, //char prmr[50]; //float mtemp; word_t w1, w1a, omask = (0x1L << (options->strict_three_prime*2)) -1; - word_t w2, w2a, wtmp; + word_t w2, w2a;//, wtmp; uint32_t bp1,bp2; - + //prmr[options->primer_length] = '\0'; for (i=0;i < primers->size; i++) @@ -252,16 +252,17 @@ static void buildPrimerPairsForOneSeq(uint32_t seqid, { // For all primers matching the sequence - //for(j=i+1; - // (jprimer_length) < options->lmax); - // j++ - // ) + /*for(j=i+1; + (jprimer_length) < options->lmax); + j++ + )//*/ for (j=i+1; jprimer_length) continue; distance = matches[j].position - matches[i].position - options->primer_length; if (distance >= options->lmax) break; + // For all not too far primers @@ -269,9 +270,7 @@ static void buildPrimerPairsForOneSeq(uint32_t seqid, && (distance > options->lmin) ) { - // If possible primer pair - current.p1 = matches[i].primer; current.asdirect1=matches[i].strand; current.p2 = matches[j].primer; @@ -456,7 +455,6 @@ static void buildPrimerPairsForOneSeq(uint32_t seqid, } } } - pairs->count=paircount; } diff --git a/src/libecoprimer/strictprimers.c b/src/libecoprimer/strictprimers.c index e6e7f4e..d555720 100644 --- a/src/libecoprimer/strictprimers.c +++ b/src/libecoprimer/strictprimers.c @@ -108,10 +108,11 @@ void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circ table->inseqcount++; - - table->strictcount = ECOREALLOC(table->strictcount,buffersize*sizeof(uint32_t), + //fprintf (stderr, "\nOldAddress: %x", table->strictcount); + table->strictcount = ECOREALLOC(table->strictcount,(buffersize+5000)*sizeof(uint32_t), "Cannot allocate memory to extend example word count table"); - + //fprintf (stderr, " NewAddress: %x\n", table->strictcount); + for (i=table->size; i < buffersize; i++) table->strictcount[i]=1; @@ -172,7 +173,7 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize, for (i=0;iisexample) + if (database[i]->isexample && database[i]->SQ_length > options->primer_length) { if (first) diff --git a/src/libecoprimer/taxstats.c b/src/libecoprimer/taxstats.c index cdc97a5..0898303 100644 --- a/src/libecoprimer/taxstats.c +++ b/src/libecoprimer/taxstats.c @@ -6,10 +6,46 @@ */ #include +//void tdestroy (void *root, void (*free_node)(void *nodep)); + #include "ecoprimer.h" static int cmptaxon(const void *t1, const void* t2); +void **tree_root = NULL; +int delete_passes = 0; + +void delete_twalkaction (const void *node, VISIT order, int level) +{ + switch (order) + { + case preorder: + delete_passes++; + break; + case postorder: + delete_passes++; + break; + case endorder: + delete_passes++; + break; + case leaf: + if (tree_root) + tdelete (node, tree_root,cmptaxon); + delete_passes++; + break; + } +} + +void free_tree_nodes (void *tree) +{ + while (1) + { + delete_passes = 0; + twalk (tree, delete_twalkaction); + if (delete_passes <= 1) break; + } +} + static int cmptaxon(const void *t1, const void* t2) { const size_t taxid1=(size_t)t1; @@ -35,7 +71,12 @@ int32_t counttaxon(int32_t taxid) if (taxid==-1) { if (taxontree) + { + tree_root = (void **)&taxontree; + //free_tree_nodes (taxontree); ECOFREE(taxontree,"Free taxon tree"); + tree_root = NULL; + } taxontree=NULL; taxoncount=0; return 0; @@ -97,22 +138,30 @@ int32_t getrankdbstats(pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *tax } -float taxonomycoverage(ppair_t pair, poptions_t options) +float taxonomycoverage(ppair_t pair, poptions_t options, pecodnadb_t seqdb,uint32_t seqdbsize) { int32_t seqcount; int32_t i; int32_t incount=0; int32_t outcount=0; + uint32_t j; + memset (pair->coveredSeqs, 0, seqdbsize*sizeof (int)); seqcount=pair->pcr.ampcount; counttaxon(-1); for (i=0; i < seqcount; i++) if (pair->pcr.amplifias[i].sequence->isexample && pair->pcr.amplifias[i].sequence->ranktaxonid > 0 ) + { incount = counttaxon(pair->pcr.amplifias[i].sequence->ranktaxonid); + for (j=0; jpcr.amplifias[i].sequence == seqdb[j]) + {pair->coveredSeqs[j] = 1; break;} + } + counttaxon(-1); for (i=0; i < seqcount; i++) if (!pair->pcr.amplifias[i].sequence->isexample @@ -145,12 +194,14 @@ static int cmpamp(const void *ampf1, const void* ampf2) { incr = -1; j = pampf1->length - 1; + if (pampf2->strand) { pampf1 = (pamptotaxon_t) ampf2; pampf2 = (pamptotaxon_t) ampf1; chd = 1; } + //j = pampf2->length - 1; should have been here and pampf2 instead of pampf1? } len = (pampf1->length <= pampf2->length)? pampf1->length: pampf2->length; @@ -173,6 +224,7 @@ static int cmpamp(const void *ampf1, const void* ampf2) return 0; }*/ + static int cmpamp(const void *ampf1, const void* ampf2) { int i; @@ -183,10 +235,10 @@ static int cmpamp(const void *ampf1, const void* ampf2) char *ch2; int incr1; int incr2; - + pamptotaxon_t pampf1 = (pamptotaxon_t) ampf1; pamptotaxon_t pampf2 = (pamptotaxon_t) ampf2; - + ch1 = pampf1->amplifia; ch2 = pampf2->amplifia; @@ -218,7 +270,7 @@ static int cmpamp(const void *ampf1, const void* ampf2) if (pampf1->length > pampf2->length) return 1; if (pampf2->length > pampf1->length) return -1; - + return 0; } @@ -242,6 +294,8 @@ void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize) uint32_t i, j; uint32_t ampfindex = 0; int32_t taxid; + uint32_t wellidentifiedcount; + void *ampftree = NULL; pamptotaxon_t pcurrentampf; pamptotaxon_t *ptmp; @@ -278,11 +332,14 @@ void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize) } memset (pair->wellIdentifiedSeqs, 0, seqdbsize*sizeof (int)); - counttaxon(-1); + //counttaxon(-1); for (i = 0; i < ampfindex; i++) { if (ampfwithtaxtree[i].taxoncount > 1) - twalk(ampfwithtaxtree[i].taxontree, twalkaction); + { + //printf ("\nampfwithtaxtree[i].taxoncount: %d\n", ampfwithtaxtree[i].taxoncount); + //twalk(ampfwithtaxtree[i].taxontree, twalkaction); + } //TR 5/9/10 - added code for well identified seqs else if(ampfwithtaxtree[i].taxoncount == 1) /*well identified*/ { @@ -293,6 +350,7 @@ void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize) { for (j = 0; j < seqdbsize; j++) if (seqdb[j]->ranktaxonid == gtxid + && seqdb[j]->isexample &&(pair->p1->directCount[j] > 0 || pair->p1->reverseCount[j] > 0) && (pair->p2->directCount[j] > 0 @@ -303,10 +361,18 @@ void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize) } } } - - pair->notwellidentifiedtaxa = counttaxon(-2); - pair->bs = ((float)pair->intaxa - (float)pair->notwellidentifiedtaxa) / pair->intaxa; - + //printf ("\n"); + counttaxon(-1); + wellidentifiedcount = 0; + for (j = 0; j < seqdbsize; j++) + if (pair->wellIdentifiedSeqs[j] == 1) + counttaxon(seqdb[j]->ranktaxonid); + wellidentifiedcount = counttaxon(-2); + //pair->notwellidentifiedtaxa = counttaxon(-2); + pair->notwellidentifiedtaxa = (pair->intaxa-wellidentifiedcount); //counttaxon(-2); + //pair->bs = ((float)pair->intaxa - (float)pair->notwellidentifiedtaxa) / pair->intaxa; + pair->bs = ((float)wellidentifiedcount) / (float)pair->intaxa; + ECOFREE (ampfwithtaxtree, "Free amplifia table"); }