diff --git a/.cproject b/.cproject
index 9f894fd..f99343a 100644
--- a/.cproject
+++ b/.cproject
@@ -1,151 +1,221 @@
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/ecoprimer.c b/src/ecoprimer.c
index b159c5e..7f0cd52 100644
--- a/src/ecoprimer.c
+++ b/src/ecoprimer.c
@@ -7,6 +7,7 @@
#include "libecoprimer/ecoprimer.h"
#include "libecoprimer/PrimerSets.h"
+#include "libecoprimer/ahocorasick.h"
#include
#include
#include
@@ -25,6 +26,8 @@
static int cmpprintedpairs(const void* p1,const void* p2);
//float _Z27calculateMeltingTemperature_ (char * seq1, char * seq2);
+pwordcount_t reduce_words_to_debug (pwordcount_t words, poptions_t options);
+void print_wordwith_positions (primer_t prm, uint32_t seqdbsize, poptions_t options);
void* lib_handle = NULL;
float (*calcMelTemp)(char*, char*);
@@ -71,12 +74,12 @@ static void PrintHelp()
PP "-m : Salt correction method for Tm computation (SANTALUCIA : 1 or OWCZARZY:2, default=1)\n\n");
PP "-a : Salt contentration in M for Tm computation (default 0.05 M)\n\n");
PP "-U : No multi match\n\n");
- PP "-U : Define the [R]eference sequence identifier (must be part of example set)\n\n");
+ PP "-R : Define the [R]eference sequence identifier (must be part of example set)\n\n");
PP "-A : Print the list of all identifier of sequences present in the database\n\n");
PP "-f : Remove data mining step during strict primer identification\n\n");
PP "-v : Store statistic file about memory usage during strict primer identification\n\n");
- PP "-p : Print sets of primers\n\n");
- PP "-T : Ignore pairs having specificity below this Threshold\n\n");
+ PP "-p : Print sets of primers (may take several minutes after primers have been designed!)\n\n");
+ PP "-T : Ignore pairs having specificity below this Threshold\n\n");
PP "\n");
PP "------------------------------------------\n");
PP "Table result description : \n");
@@ -151,6 +154,9 @@ void initoptions(poptions_t options)
options->printAC=FALSE;
options->print_sets_of_primers = FALSE;
options->specificity_threshold = 0.6;
+ options->links_cnt = 1;
+ options->max_links_percent = -1; /*graph only those primers having maximum 15% links*/
+ options->filter_on_links = FALSE;
}
void printapair(int32_t index,ppair_t pair, poptions_t options)
@@ -165,7 +171,7 @@ void printapair(int32_t index,ppair_t pair, poptions_t options)
bool_t good2=pair->p2->good;
bool_t goodtmp;
bool_t strand;
- uint32_t i;
+ uint32_t i, j;
float temp;
CNNParams nnparams;
@@ -296,6 +302,12 @@ void printapair(int32_t index,ppair_t pair, poptions_t options)
else
printf("\t\t");
+/* j=0;
+ for (i=0; idbsize; i++)
+ if (pair->wellIdentifiedSeqs[i] == 1)
+ j++;
+ printf("%d", j);*/
+
printf("\n");
}
@@ -335,6 +347,7 @@ uint32_t filterandsortpairs(ppair_t* sortedpairs,uint32_t count, poptions_t opti
else qfp=0.0;
sortedpairs[i]->wellIdentifiedSeqs = NULL; //TR 05/09/10 - wellIdentified needed for primer sets
+ sortedpairs[i]->coveredSeqs = NULL; //TR 05/09/10 - wellIdentified needed for primer sets
sortedpairs[i]->quorumin = q;
sortedpairs[i]->quorumout = qfp;
sortedpairs[i]->yule = q - qfp;
@@ -345,13 +358,13 @@ uint32_t filterandsortpairs(ppair_t* sortedpairs,uint32_t count, poptions_t opti
{
//TR 05/09/10 - wellIdentified needed for primer sets
sortedpairs[j]->wellIdentifiedSeqs = ECOMALLOC(options->dbsize * sizeof(int),"Cannot allocate well_identified_array");
- (void)taxonomycoverage(sortedpairs[j],options);
+ sortedpairs[j]->coveredSeqs = ECOMALLOC(options->dbsize * sizeof(int),"Cannot allocate well_identified_array");
+ (void)taxonomycoverage(sortedpairs[j],options, seqdb, options->dbsize);
taxonomyspecificity(sortedpairs[j], seqdb, options->dbsize);
//j++;
//if specificity less than user provieded threshold (default 60%) then ignore this pair
if (sortedpairs[j]->bs >= options->specificity_threshold)
j++;
-
}
}
@@ -369,7 +382,8 @@ void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy,
size_t count;
char *taxon[]={"taxon","taxa"};
ecotx_t *current_taxon;
- pairset pair_sets;
+ //pairset pair_sets;
+ pairset *pset = NULL;
//printf("Index\tPrimer1\tPrimer2\tGB\tInexampleCount\tOutexampleCount\tYule\tIntaxaCount\tOuttaxaCount\tCoverage\tSpecificity\tMinAmplifiedLength\tMaxAmplifiedLength\tAvgAmplifiedLength\n");
@@ -388,7 +402,7 @@ void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy,
for (i=0;ipaircount;i++,j++)
sortedpairs[j]=pl->pairs+i;
-
+
count=filterandsortpairs(sortedpairs,pairs->count,options, seqdb);
getThermoProperties(sortedpairs, count, options);
@@ -451,15 +465,55 @@ void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy,
printf("# DB sequences are considered as linear\n");
printf("# Pairs having specificity less than %0.2f will be ignored\n", options->specificity_threshold);
printf("#\n");
+
for (i=0;i < count;i++)
printapair(i,sortedpairs[i],options);
+
+ if (options->filter_on_links)
+ {
+ fprintf (stderr, "Old size: %d, ", count);
+ count = primers_changeSortedArray (&sortedpairs, count, options);
+ //count = primers_filterWithGivenLinks (&sortedpairs, count, options);
+ fprintf (stderr, "New size: %d\n", count);
+
+ if (count == 0)
+ {
+ fprintf (stderr, "No pairs passed the links constraints.\n");
+ printf ("No pairs passed the links constraints.\n");
+ return;
+ }
+
+ for (i=0;i < count;i++)
+ printapair(i,sortedpairs[i],options);
+ }
if (options->print_sets_of_primers == TRUE)
{
- pair_sets = build_primers_set (sortedpairs, count, seqdb, options);
+ /*pair_sets = build_primers_set (sortedpairs, count, seqdb, options);
+ printf("Results from Greedy Algorithm and some other possibilities:\n");
some_other_set_possibilities (&pair_sets, sortedpairs, count, seqdb, options);
+ printf("Results from simulated Anealing:\n");
+ sets_by_SimulatedAnealing (&pair_sets, sortedpairs, count, seqdb, options);
+ printf("Results from Tabu Search:\n");
+ sets_by_TabuSearch (&pair_sets, sortedpairs, count, seqdb, options);*/
+ //pset = sets_by_BruteForce (sortedpairs, count, seqdb, options);
+ //if (pset)
+ /*/{
+ printf("Results from simulated Anealing:\n");
+ sets_by_SimulatedAnealing (pset, sortedpairs, count, seqdb, options);
+ printf("Results from Tabu Search:\n");
+ sets_by_TabuSearch (pset, sortedpairs, count, seqdb, options);
+
+ if (pset)
+ {
+ ECOFREE (pset->set_wellIdentifiedTaxa, "Could not free memory for pair set wi");
+ ECOFREE (pset, "Could not free memory for pair");
+ }
+ }*/
+ build_and_print_sets (sortedpairs, count, seqdb, options);
}
+ //primers_graph_graphviz (sortedpairs, count, options);
}
@@ -545,7 +599,7 @@ int main(int argc, char **argv)
initoptions(&options);
- while ((carg = getopt(argc, argv, "hAfvcUDSpE:d:l:L:e:i:r:R:q:3:s:x:t:O:m:a:T:")) != -1) {
+ while ((carg = getopt(argc, argv, "hAfvcUDSpbE:d:l:L:e:i:r:R:q:3:s:x:t:O:m:a:T:k:M:")) != -1) {
switch (carg) {
/* ---------------------------- */
@@ -711,15 +765,31 @@ int main(int argc, char **argv)
/* -------------------- */
case 'p': /* print sets of primers */
/* --------------------------------- */
- options.print_sets_of_primers = TRUE;
+ //options.print_sets_of_primers = TRUE;
break;
-
+
+ /* --------------------------------- */
+ case 'T': /* Ignore pairs having specificity below this Threshold */
/* --------------------------------- */
- case 'T': /* Ignore pairs having specificity below this Threshold */
+ sscanf(optarg,"%f",&(options.specificity_threshold));
+ break;
+
/* --------------------------------- */
- sscanf(optarg,"%f",&(options.specificity_threshold));
+ case 'M': /* Max link percentage for graph */
+ /* --------------------------------- */
+ sscanf(optarg,"%f",&(options.max_links_percent));
break;
-
+
+ /* --------------------------------- */
+ case 'k': /* links count */
+ /* --------------------------------- */
+ sscanf(optarg,"%d",&(options.links_cnt));
+ break;
+
+ case 'b':
+ options.filter_on_links = TRUE;
+ break;
+
case '?': /* bad option */
/* -------------------- */
errflag++;
@@ -779,7 +849,11 @@ int main(int argc, char **argv)
words = lookforStrictPrimer(seqdb,seqdbsize,insamples,&options);
fprintf(stderr,"\n Strict primer count : %d\n",words->size);
-
+
+ /*/TR Testing
+ fprintf(stderr,"\nReducing for debugging\n");
+ words = reduce_words_to_debug (words, &options);
+ ///*/
// options.filtering=FALSE;
// words2= lookforStrictPrimer(seqdb,seqdbsize,insamples,&options);
// fprintf(stderr,"\n Strict primer count : %d\n",words2->size);
@@ -802,7 +876,6 @@ int main(int argc, char **argv)
for (i=0; isize); i++)
fprintf(stderr," + Primer : %s sequence count : %d\n",ecoUnhashWord(words->words[i],options.primer_length),words->strictcount[i]);
-
fprintf(stderr,"\nEncoding sequences for fuzzy pattern matching...\n");
for (i=0;istrictcount,"Free strict primer count table");
- primers = lookforAproxPrimer(seqdb,seqdbsize,insamples,words,&options);
+ if (options.error_max == 0)//aho, if(options.error_max == 0 && 0) old
+ primers = ahoc_lookforStrictPrimers (seqdb,seqdbsize,insamples,words,&options);
+ else
+ primers = lookforAproxPrimer(seqdb,seqdbsize,insamples,words,&options);
+
+ //for (i=0; isize; i++)
+ // print_wordwith_positions (primers->primers[i], seqdbsize, &options);
ECOFREE(words->words,"Free strict primer table");
ECOFREE(words,"Free strict primer structure");
@@ -833,3 +912,108 @@ int main(int argc, char **argv)
return 0;
}
+
+#define DEBUG_WORDS_CNT 14
+pwordcount_t reduce_words_to_debug (pwordcount_t words, poptions_t options)
+{
+ uint32_t i, k;
+ pwordcount_t new_words;
+ char *rwrd;
+ char dwrd[20];
+ /*char *strict_words[DEBUG_WORDS_CNT] = {"GAGTCTCTGCACCTATCC", "GCAATCCTGAGCCAAATC", "ACCCCTAACCACAACTCA",
+ "TCCGAACCGACTGATGTT", "GAAGCTTGGGTGAAACTA", "GGAGAACCAGCTAGCTCT", "GCTGGTTCTCCCCGAAAT",
+ "TCGATTTGGTACCGCTCT", "AAAGGAGAGAGAGGGATT", "GGATTGCTAATCCGTTGT", "CCCCCATCGTCTCACTGG",
+ "TGAGGCGCAGCAGTTGAC", "GCGCTACGGCGCTGAAGT", "TTTCCTGGGAGTATGGCA"};*/
+ char *strict_words[DEBUG_WORDS_CNT] = {"CTCCGGTCTGAACTCAGA", "TGTTGGATCAGGACATCC", "TAGATAGAAACCGACCTG",
+ "TGGTGCAGCCGCTATTAA", "AGATAGAAACTGACCTGG", "TGGTGCAGCCGCTATTAA", "CTAATGGTGCAGCCGCTA",
+ "TAGAAACTGACCTGGATT", "AGATAGAAACCGACCTGG", "ATGGTGCAGCCGCTATTA", "ATAGATAGAAACCGACCT",
+ "GCCGCTATTAAGGGTTCG", "GGTGCAGCCGCTATTAAG", "TAGAAACTGACCTGGATT"};
+ int word_seen[DEBUG_WORDS_CNT];
+
+
+ new_words = ECOMALLOC(sizeof(wordcount_t),"Cannot allocate memory for word count structure");
+ new_words->inseqcount = words->inseqcount;
+ new_words->outseqcount = words->outseqcount;
+ new_words->size = DEBUG_WORDS_CNT;
+ new_words->strictcount = ECOMALLOC((new_words->size*sizeof(uint32_t)), "Cannot allocate memory for word count table");
+ new_words->words = ECOMALLOC(new_words->size*sizeof(word_t), "I cannot allocate memory for debug words");
+
+ for (k = 0; k < DEBUG_WORDS_CNT; k++)
+ word_seen[k] = 0;
+
+ for (i=0; i < words->size; i++)
+ {
+ rwrd = ecoUnhashWord(words->words[i],options->primer_length);
+ strcpy (dwrd, rwrd);
+ rwrd = ecoUnhashWord(ecoComplementWord(words->words[i],options->primer_length),options->primer_length);
+ for (k = 0; k < DEBUG_WORDS_CNT; k++)
+ {
+ if (strcmp (dwrd, strict_words[k]) == 0) break;
+ if (strcmp (rwrd, strict_words[k]) == 0) break;
+ }
+
+ if (k < DEBUG_WORDS_CNT)
+ {
+ if (word_seen[k] == 0)
+ {
+ new_words->words[k] = words->words[i];
+ new_words->strictcount[k] = words->strictcount[i];
+ }
+ word_seen[k]++;
+ }
+ }
+
+ fprintf (stderr, "Debug Words Info:\n");
+ for (k = 0; k < DEBUG_WORDS_CNT; k++)
+ fprintf (stderr, "%s:%d\n", strict_words[k], word_seen[k]);
+
+
+ //clean input wods;
+ ECOFREE(words->words,"Clean word table");
+ ECOFREE(words->strictcount,"Clean word count table");
+ ECOFREE(words,"Clean word structure");
+
+ return new_words;
+}
+
+void print_wordwith_positions (primer_t prm, uint32_t seqdbsize, poptions_t options)
+{
+ char *wrd;
+ uint32_t i, j;
+ char *twrd = "GCCTGTTTACCAAAAACA";
+
+ wrd = ecoUnhashWord(prm.word,options->primer_length);
+
+ if (strcmp (twrd, wrd) == 0)
+ {
+ printf ("Positions for Word: %s\n", wrd);
+ for (i=0; i 0)
+ {
+ printf ("%d:", i);
+ if (prm.directCount[i] == 1)
+ printf ("%d", prm.directPos[i].value);
+ else
+ for (j=0; j 0)
+ {
+ printf ("%d:", i);
+ if (prm.reverseCount[i] == 1)
+ printf ("%d", prm.reversePos[i].value);
+ else
+ for (j=0; j
+#include
+#include
//#include "ecoprimer.h"
#include "PrimerSets.h"
+int TabuList[PRIMERS_IN_SET_COUNT];
+int next_tabu_slot = -1;
+int total_pairs = -1;
+//int32_t total_wi = -1;
+
int32_t counttaxon(int32_t taxid);
+int find_in_tabu (int index);
+
+int32_t count_taxons (int32_t taxid)
+{
+ static int32_t count = 0;
+ static int32_t slots = 0;
+ static int32_t *taxon_array = NULL;
+ int32_t i;
+
+ if (taxid == -1)
+ {
+ if (taxon_array)
+ ECOFREE (taxon_array, "Could not free memory for taxon array");
+ taxon_array = NULL;
+ slots = 0;
+ count = 0;
+ }
+ else
+ {
+ for (i = 0; i < count; i++)
+ {
+ if (taxid == taxon_array[i]) return count;
+ }
+
+ if (count == slots)
+ {
+ slots += 500;
+
+ if (taxon_array == NULL)
+ {
+ taxon_array = (int32_t *) ECOMALLOC(slots*sizeof (int32_t),
+ "Could not allocate memory for taxon array");
+ }
+ else
+ taxon_array = (int32_t *) ECOREALLOC(taxon_array, slots*sizeof (int32_t),
+ "Could not reallocate memory for taxon array");
+ }
+ taxon_array[count] = taxid;
+ count++;
+ }
+ return count;
+}
float get_set_coverage (pairset *p_set, SetParams *pparams, int32_t pidx_toexclude)
{
int32_t i, j;
float cov;
- pprimer_t prmr1;
- pprimer_t prmr2;
int32_t s_intaxa = 0;
+ int32_t seqcount;
- counttaxon(-1);
+ //counttaxon(-1);
+ count_taxons (-1);
for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
{
if (p_set->set_pairs[i] == -1 || pidx_toexclude == i) continue;
- prmr1 = pparams->sortedpairs[p_set->set_pairs[i]]->p1;
- prmr2 = pparams->sortedpairs[p_set->set_pairs[i]]->p2;
- for (j = 0; j < pparams->options->dbsize; j++)
- if ((prmr1->directCount[j] > 0 || prmr1->reverseCount[j] > 0)
- && (prmr2->directCount[j] > 0 || prmr2->reverseCount[j] > 0)
- && pparams->seqdb[j]->isexample && pparams->seqdb[j]->ranktaxonid > 0)
- {
- s_intaxa = counttaxon(pparams->seqdb[j]->ranktaxonid);
- }
+ seqcount=pparams->sortedpairs[p_set->set_pairs[i]]->pcr.ampcount;
+ for (j=0; j < seqcount; j++)
+ if (pparams->sortedpairs[p_set->set_pairs[i]]->pcr.amplifias[j].sequence->isexample
+ && pparams->sortedpairs[p_set->set_pairs[i]]->pcr.amplifias[j].sequence->ranktaxonid > 0 )
+ s_intaxa = count_taxons(pparams->sortedpairs[p_set->set_pairs[i]]->pcr.amplifias[j].sequence->ranktaxonid);
+
}
//fprintf(stderr, "%d/%d\n", s_intaxa, pparams->options->intaxa);
+ p_set->set_intaxa = s_intaxa;
cov = s_intaxa*1.0/(pparams->options->intaxa*1.0);
-
+ count_taxons (-1);
return cov;
}
@@ -39,52 +86,172 @@ void set_cov_spc (pairset *p_set, SetParams *pparams)
int32_t i;
int32_t ssp = 0;
+ count_taxons (-1);
for (i = 0; i < pparams->options->dbsize; i++)
- if (p_set->set_wellIdentifiedTaxa[i] == 1) ssp++;
-
- //set specificity
- p_set->set_specificity = ssp*1.0/(pparams->options->insamples*1.0);
-
+ if (p_set->set_wellIdentifiedTaxa[i] == 1)
+ ssp = count_taxons (pparams->seqdb[i]->ranktaxonid);
+
//set coverage
p_set->set_coverage = get_set_coverage (p_set, pparams, -1);
+
+ //set specificity
+ p_set->set_specificity = ((float)ssp)/((float)p_set->set_intaxa);
+ p_set->set_wi_cnt = ssp;
+ count_taxons (-1);
}
-int ok_to_add (int id, pairset *pair_set)
+//currently used only to open dead lock
+void tabu_aspiration (pairset *pair_set)
{
int i;
+ int count = 0;
+
+ for (i=0; iset_pairs[i] != -1)
+ count++;
+ if (TabuList[i] != -1)
+ count++;
+ }
+
+ if (count == total_pairs)
+ for (i=0; iset_pairs[i] == id) return 0;
+ for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
+ if (pair_set->set_pairs[i] != -1) secnt++;
+
+ //if (secnt == PRIMERS_IN_SET_COUNT) return 0;
+ if (secnt == 0) return 1;
+
+ lnk_prcnt = 1.0;
+ if (secnt > pparams->options->links_cnt)
+ lnk_prcnt = (pparams->options->links_cnt*1.0)/(secnt*1.0);
+
+ //TR 6/2/11: new elements must have some links with atleast one elem of set
+ if (get_links_distribution (id, pair_set, pparams) < lnk_prcnt) return 0;
+
+ //if in tabu search search tabu list as well
+ if (next_tabu_slot != -1)
+ {
+ //effency_switch is only there to avoid tabu_aspiration execution
+ //each time
+ effency_switch++;
+ if ((effency_switch%5) == 0)
+ {
+ effency_switch=1;
+ tabu_aspiration(pair_set);
+ }
+
+ i = find_in_tabu (id);
+ if (i != -1) return 0;
+ }
return 1;
}
+pairset build_primers_set_greedy_cov (SetParams *params)
+{
+ pairset pair_set;
+ int32_t i;
+ int32_t pset_idx;
+ int32_t prb_idx;
+
+ memset (&pair_set, 0, sizeof(pairset));
+ pair_set.set_wellIdentifiedTaxa = (int *) ECOMALLOC(params->options->dbsize*sizeof (int),
+ "Could not allocate memory for pair set");
+
+ for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
+ pair_set.set_pairs[i] = -1;
+
+ pset_idx = 0;
+ prb_idx = 0;
+ //add first pair by default, this should be the one having highiest specificty
+ add_pair_in_set (&pair_set, pset_idx, prb_idx, params);
+ pset_idx++;
+
+ while (pset_idx < PRIMERS_IN_SET_COUNT)
+ {
+ if (pair_set.set_coverage == 1.0) break;
+ prb_idx = get_next_option_increasing_cov (&pair_set, params);
+ if (prb_idx == 0) break;
+ add_pair_in_set (&pair_set, pset_idx, prb_idx, params);
+ pset_idx++;
+ }
+ //get_set_mean_cov_stats (&pair_set, ¶ms);
+ reset_set_props (&pair_set, params);
+
+ return pair_set;
+}
+
+int32_t get_next_option_increasing_cov (pairset *pair_set, SetParams *pparams)
+{
+ float ini_set_cov;
+ float set_cov;
+ float cov_diff = 0.;
+ int32_t i, id_slot = -1, max_id = 0;
+
+ //coverage already 1, dont proceed.
+ if (pair_set->set_coverage == 1.0) return 0;
+
+ //find next available slot in the set
+ for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
+ if (pair_set->set_pairs[i] == -1)
+ {
+ id_slot = i;
+ break;
+ }
+ //set already full
+ if (id_slot == -1) return 0;
+ //save original set coverage
+ ini_set_cov = pair_set->set_coverage;
+ for (i = 1; i < pparams->sorted_count; i++)
+ {
+ if (ok_to_add (i, pair_set, pparams) == 0) continue;
+ pair_set->set_pairs[id_slot] = i;
+ set_cov = get_set_coverage (pair_set, pparams, -1);
+ if ((set_cov - ini_set_cov) > cov_diff)
+ {
+ cov_diff = set_cov - ini_set_cov;
+ max_id = i;
+ }
+ }
+ pair_set->set_pairs[id_slot] = -1;
+ return max_id;
+}
+
+
//1. Add in set the first pair having highiest specificity
//2. For the sequences not WI by primers in set, calculate count for each pair equal to number of seqs WI by it
//3. Take the pair with highiest such count and see its links with primers already in set,
-//4. If no/insufficient links, take next pair
+//4. If no/insufficient links, take next pair else add pair in set
//5. repeate 3,4 untill pair gets added in set
-//6. Repeate 2 to 5 until set complete
+//6. Repeate 2 to 5 until set completes
-pairset build_primers_set (ppair_t* sortedpairs, int32_t sorted_count, pecodnadb_t seqdb,
- poptions_t options)
+pairset build_primers_set_greedy_spc (SetParams *params)
{
- SetParams params;
pairset pair_set;
int32_t i;
int32_t pset_idx;
int32_t prb_idx;
int *pair_wi_count_sorted_ids;
- params.sortedpairs = sortedpairs;
- params.sorted_count = sorted_count;
- params.seqdb = seqdb;
- params.options = options;
-
memset (&pair_set, 0, sizeof(pairset));
- pair_set.set_wellIdentifiedTaxa = (int *) ECOMALLOC(options->dbsize*sizeof (int),
+ pair_set.set_wellIdentifiedTaxa = (int *) ECOMALLOC(params->options->dbsize*sizeof (int),
"Could not allocate memory for pair set");
- pair_wi_count_sorted_ids = (int *) ECOMALLOC(sorted_count*sizeof (int),
+ pair_wi_count_sorted_ids = (int *) ECOMALLOC(params->sorted_count*sizeof (int),
"Could not allocate memory for pair_wi_count_sorted");
for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
@@ -93,43 +260,33 @@ pairset build_primers_set (ppair_t* sortedpairs, int32_t sorted_count, pecodnadb
pset_idx = 0;
prb_idx = 0;
//add first pair by default, this should be the one having highiest specificty
- add_pair_in_set (&pair_set, pset_idx, prb_idx, ¶ms);
+ add_pair_in_set (&pair_set, pset_idx, prb_idx, params);
pset_idx++;
while (pset_idx < PRIMERS_IN_SET_COUNT)
{
//get a sorted list of pair ids with the pair well identifying most of the remaining seqs at top
- get_next_pair_options (pair_wi_count_sorted_ids, &pair_set, ¶ms);
+ get_next_pair_options (pair_wi_count_sorted_ids, &pair_set, params);
if (pair_wi_count_sorted_ids[0] == 0)
{
fprintf (stderr, "No further pair found, total so far %d\n", pset_idx);
break;
- }
- else
- fprintf (stderr, "Some counts found\n");
+ }
-
- for (prb_idx = 0; prb_idx < sorted_count; prb_idx++)
+ for (prb_idx = 0; prb_idx < params->sorted_count; prb_idx++)
{
if (pair_wi_count_sorted_ids[prb_idx])
- if (ok_to_add (pair_wi_count_sorted_ids[prb_idx], &pair_set))
+ if (ok_to_add (pair_wi_count_sorted_ids[prb_idx], &pair_set, params))
{
//fprintf (stderr, "Oktoadd\n");
- float ldist = get_links_distribution (pair_wi_count_sorted_ids[prb_idx], &pair_set, ¶ms);
- fprintf (stderr, "ldist:%f\n", ldist);
- if (ldist >= 0.7)
- {
- add_pair_in_set (&pair_set, pset_idx, pair_wi_count_sorted_ids[prb_idx], ¶ms);
- pset_idx++;
- break;
- }
- else
- fprintf (stderr, "ldist:%f\n", ldist);
+ add_pair_in_set (&pair_set, pset_idx, pair_wi_count_sorted_ids[prb_idx], params);
+ pset_idx++;
}
+ if (pset_idx == PRIMERS_IN_SET_COUNT) break;
}
- if (prb_idx == sorted_count)
+ if (prb_idx == params->sorted_count)
{
fprintf (stderr, "No further pair found, total so far %d\n", pset_idx);
break;
@@ -142,8 +299,8 @@ pairset build_primers_set (ppair_t* sortedpairs, int32_t sorted_count, pecodnadb
}
}
- get_set_mean_cov_stats (&pair_set, ¶ms);
-
+ //get_set_mean_cov_stats (&pair_set, ¶ms);
+ reset_set_props (&pair_set, params);
return pair_set;
}
@@ -253,7 +410,14 @@ void add_pair_in_set (pairset *pair_set, int32_t pset_idx, int32_t prb_idx, SetP
int *pwi;
int32_t i;
+ if (prb_idx < 0 || prb_idx >= pparams->sorted_count) return;
pair_set->set_pairs[pset_idx] = prb_idx;
+
+// fprintf (stderr, "%d:", prb_idx);
+ //fprintf (stderr, "%d:", pparams->sortedpairs[prb_idx]);
+ //fprintf (stderr, "%d:", pparams->sortedpairs[prb_idx]->wellIdentifiedSeqs);
+ //fprintf (stderr, "%d:%d, ", i, pair_count[i]);
+
pwi = pparams->sortedpairs[prb_idx]->wellIdentifiedSeqs;
for (i = 0; i < pparams->options->dbsize; i++)
if (pwi[i] == 1)
@@ -262,6 +426,42 @@ void add_pair_in_set (pairset *pair_set, int32_t pset_idx, int32_t prb_idx, SetP
set_cov_spc (pair_set, pparams);
}
+int isthisset (pairset *pair_set)
+{
+ int set1[PRIMERS_IN_SET_COUNT];
+ int set2[PRIMERS_IN_SET_COUNT];
+ int i,j=0,k;
+ for (i=0; iset_pairs[i] != -1) set2[j++]=pair_set->set_pairs[i];
+ }
+ set1[0]=0;set1[1]=2;set1[2]=7;
+ if (j==3)
+ {
+ for(i=0;i<3;i++)
+ {
+ for(k=0;k<3;k++)
+ if (set2[k]==set1[i])break;
+ if (k==3)break;
+ }
+ if(i==3) return 1;//found
+ }
+
+ set1[0]=0;set1[1]=2;set1[2]=37;
+ if (j==3)
+ {
+ for(i=0;i<3;i++)
+ {
+ for(k=0;k<3;k++)
+ if (set2[k]==set1[i])break;
+ if (k==3)break;
+ }
+ if(i==3) return 1;//found
+ }
+ return 0;
+}
+
void get_set_mean_cov_stats (pairset *pair_set, SetParams *pparams)
{
int32_t i, j, k;
@@ -270,20 +470,44 @@ void get_set_mean_cov_stats (pairset *pair_set, SetParams *pparams)
double msum;
int *p1wi;
int *p2wi;
+ int dbg=0;
+
+ dbg=isthisset(pair_set);
for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
{
if (pair_set->set_pairs[i] == -1) continue;
p1wi = pparams->sortedpairs[pair_set->set_pairs[i]]->wellIdentifiedSeqs;
+
+ if(dbg)
+ {
+ printf ("\n\nWellIdentified for primer pair:%d\n",pair_set->set_pairs[i]);
+ for (k = 0; k < pparams->options->dbsize; k++)
+ if(p1wi[k]==1)
+ printf("%d,",k);
+ printf("\n");
+ }
+
for (j = i+1; j < PRIMERS_IN_SET_COUNT; j++)
{
if (pair_set->set_pairs[j] == -1) continue;
p2wi = pparams->sortedpairs[pair_set->set_pairs[j]]->wellIdentifiedSeqs;
interseq_vals[interseq_cnt] = 0;
+ if (dbg)
+ {
+ printf ("Intersection for %d and %d:\n", pair_set->set_pairs[i], pair_set->set_pairs[j]);
+ }
+
for (k = 0; k < pparams->options->dbsize; k++)
if (p1wi[k] == 1 && p2wi[k] == 1)
+ {
interseq_vals[interseq_cnt]++;
+ if(dbg)
+ printf("%d,",k);
+ }
+ if(dbg)
+ printf("\n");
interseq_cnt++;
}
}
@@ -305,7 +529,83 @@ void get_set_mean_cov_stats (pairset *pair_set, SetParams *pparams)
pair_set->set_lcov = msum/interseq_cnt;
if (pair_set->set_lcov != 0)
- pair_set->set_score = (pair_set->set_coverage*pair_set->set_specificity*pair_set->set_lmean)/pair_set->set_lcov;
+ //pair_set->set_score = (pair_set->set_coverage*pair_set->set_specificity*pair_set->set_specificity*(sqrt(pair_set->set_lmean)))/sqrt(sqrt(pair_set->set_lcov));
+ pair_set->set_score = (pair_set->set_coverage*pair_set->set_specificity*(pair_set->set_lmean))/sqrt(pair_set->set_lcov);
+}
+
+void get_set_mean_cov_normalised_stats (pairset *pair_set, SetParams *pparams)
+{
+ int32_t i, j, k;
+ int interseq_vals[PRIMERS_IN_SET_COUNT*PRIMERS_IN_SET_COUNT];
+ int interseq_cnt = 0;
+ double msum;
+ int *p1wi;
+ int *p2wi;
+ int dbg=0;
+
+ dbg=isthisset(pair_set);
+
+ for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
+ {
+ if (pair_set->set_pairs[i] == -1) continue;
+ p1wi = pparams->sortedpairs[pair_set->set_pairs[i]]->wellIdentifiedSeqs;
+
+ if(dbg)
+ {
+ printf ("\n\nWellIdentified for primer pair:%d\n",pair_set->set_pairs[i]);
+ for (k = 0; k < pparams->options->dbsize; k++)
+ if(p1wi[k]==1)
+ printf("%d,",k);
+ printf("\n");
+ }
+
+ for (j = i+1; j < PRIMERS_IN_SET_COUNT; j++)
+ {
+ if (pair_set->set_pairs[j] == -1) continue;
+ p2wi = pparams->sortedpairs[pair_set->set_pairs[j]]->wellIdentifiedSeqs;
+ interseq_vals[interseq_cnt] = 0;
+
+ if (dbg)
+ {
+ printf ("Intersection for %d and %d:\n", pair_set->set_pairs[i], pair_set->set_pairs[j]);
+ }
+
+ for (k = 0; k < pparams->options->dbsize; k++)
+ if (p1wi[k] == 1 && p2wi[k] == 1)
+ {
+ interseq_vals[interseq_cnt]++;
+ if(dbg)
+ printf("%d,",k);
+ }
+ if(dbg)
+ printf("\n");
+ interseq_cnt++;
+ }
+ }
+
+ //calculate mean
+ msum = 0;
+ pair_set->set_score = 0;
+ pair_set->set_lmean = 0;
+ pair_set->set_lcov = -1;
+ if (interseq_cnt == 0) return;
+
+ for (i = 0; i < interseq_cnt; i++)
+ msum += interseq_vals[i];
+ pair_set->set_lmean = msum/interseq_cnt;
+
+ msum = 0;
+ for (i = 0; i < interseq_cnt; i++)
+ msum += (interseq_vals[i] - pair_set->set_lmean)*(interseq_vals[i] - pair_set->set_lmean);
+ pair_set->set_lcov = msum/interseq_cnt;
+
+ if (pair_set->set_lcov != 0)
+ {
+ //normalised links
+ double nl = pair_set->set_lmean/sqrt (pair_set->set_lcov);
+ nl = nl/pparams->options->insamples; //max links cannot be more than insample value
+ pair_set->set_score = pair_set->set_coverage*pair_set->set_specificity*nl;
+ }
}
void reset_set_props (pairset *pair_set, SetParams *pparams)
@@ -313,6 +613,7 @@ void reset_set_props (pairset *pair_set, SetParams *pparams)
int *pwi;
int i, j;
+ memset (pair_set->set_wellIdentifiedTaxa, 0, pparams->options->dbsize*sizeof(int));
for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
{
if (pair_set->set_pairs[i] == -1) continue;
@@ -323,7 +624,13 @@ void reset_set_props (pairset *pair_set, SetParams *pparams)
}
set_cov_spc (pair_set, pparams);
- get_set_mean_cov_stats (pair_set, pparams);
+
+ //TR 6/2/11: commented following, now score is just product of spc and cov
+ //get_set_mean_cov_stats (pair_set, pparams);
+ //get_set_mean_cov_normalised_stats (pair_set, pparams);
+ //pair_set->set_score = pair_set->set_coverage*pair_set->set_specificity;
+ pair_set->set_score = pair_set->set_coverage;
+ //pair_set->set_score = pair_set->set_specificity;
}
void print_set_info (pairset *pair_set, SetParams *pparams)
@@ -331,9 +638,13 @@ void print_set_info (pairset *pair_set, SetParams *pparams)
int i;
int printed1st = 0;
- printf ("%f\t%f\t%f\t%f\t%f\t", pair_set->set_specificity,
- pair_set->set_coverage,pair_set->set_lmean,
- pair_set->set_lcov,pair_set->set_score);
+ //TR 6/2/11: commented following, now score is just product of spc and cov
+ //printf ("%4.3f\t%4.3f\t%4.3f\t%4.3f\t%4.3f\t", pair_set->set_specificity,
+ // pair_set->set_coverage,pair_set->set_lmean,
+ // sqrt(pair_set->set_lcov),pair_set->set_score);
+
+ printf ("%4.3f\t%4.3f\t%4.3f\t", pair_set->set_coverage,
+ pair_set->set_specificity,pair_set->set_score);
for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
{
@@ -345,6 +656,7 @@ void print_set_info (pairset *pair_set, SetParams *pparams)
printf ("%d", pair_set->set_pairs[i]);
printed1st = 1;
}
+ printf ("\t%d\t%d", pair_set->set_intaxa, pair_set->set_wi_cnt);
printf ("\n");
}
@@ -372,7 +684,7 @@ void some_other_set_possibilities (pairset *pair_set,
for (j = 0; j < sorted_count; j++)
{
- if (ok_to_add (j, pair_set))
+ if (ok_to_add (j, pair_set, ¶ms))
{
tmp_pair_set = *pair_set;
tmp_pair_set.set_pairs[i] = j;
@@ -386,3 +698,1073 @@ void some_other_set_possibilities (pairset *pair_set,
ECOFREE (wi, "Could not free memory for pair set wi");
}
+pairset clone_set (pairset *s, int32_t dbsize)
+{
+ pairset clone;
+
+ clone = *s;
+ clone.set_wellIdentifiedTaxa = (int *) ECOMALLOC(dbsize*sizeof (int),
+ "Could not allocate memory for pair set");
+ memcpy (clone.set_wellIdentifiedTaxa, s->set_wellIdentifiedTaxa, dbsize*sizeof (int));
+ return clone;
+}
+
+void add_in_tabu (int index)
+{
+ if (next_tabu_slot == -1) return;
+
+ TabuList[next_tabu_slot] = index;
+ next_tabu_slot++;
+ if (next_tabu_slot >= PRIMERS_IN_SET_COUNT) next_tabu_slot = 0;
+}
+
+int find_in_tabu (int index)
+{
+ int i;
+ for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
+ if (TabuList[i] == index) return i;
+
+ return -1;
+}
+
+//do random changes in the seed set to generate new set
+pairset get_neighbor (pairset *set, SetParams *params)
+{
+ int pinset = 0;
+ int i, j, id, cnt;
+ int how_many_to_replace;
+ pairset nset;
+ int replace_idx;
+
+ //take the seed set as next neighbour sets
+ nset = clone_set (set, params->options->dbsize);
+ //see how many elements are in this set
+ for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
+ {
+ if (nset.set_pairs[i] != -1) pinset++;
+ }
+
+ if (pinset == params->sorted_count) return nset;
+
+ //Randomly get number of elements to be replaced
+ //with new unused elements
+ how_many_to_replace = rand ()%pinset + 1;
+ //replace these many elements in the seed set
+ for (i = 0; i < how_many_to_replace; i++)
+ {
+ do
+ {
+ //we wil choose a random unused element as new element
+ id = rand ()%params->sorted_count;
+ }while (ok_to_add (id, &nset, params) == 0);
+ //again choose a random element in the set to replace
+ replace_idx = rand ()%pinset+1;
+ cnt = 0;
+ for (j = 0; j < PRIMERS_IN_SET_COUNT; j++)
+ {
+ if (nset.set_pairs[j] != -1) cnt++;
+ if (cnt == replace_idx)
+ {
+ if (next_tabu_slot != -1)
+ add_in_tabu (nset.set_pairs[j]);
+
+ nset.set_pairs[j] = id;
+ break;
+ }
+ }
+ }
+ reset_set_props (&nset, params);
+ return nset;
+}
+
+//remove a random number of least contributing elements
+//from the seed set with random elements from the remaining
+pairset get_neighbor4 (pairset *set, SetParams *params)
+{
+ int pinset = 0;
+ int i, j, id, id2;
+ int how_many_to_replace;
+ pairset nset;
+ //int replace_idx;
+ float contribution[PRIMERS_IN_SET_COUNT];
+ int usedids[PRIMERS_IN_SET_COUNT];
+ float sscore;
+ float leastContri;
+ //float lastLeastcontri;
+ int k, l;
+
+ //take the seed set as next neighbour sets
+ nset = clone_set (set, params->options->dbsize);
+ //see how many elements are in this set
+ for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
+ {
+ if (nset.set_pairs[i] != -1) pinset++;
+ }
+
+ if (pinset == params->sorted_count) return nset;
+
+ //Randomly get number of elements to be replaced
+ //with new unused elements
+ how_many_to_replace = rand ()%pinset + 1;
+
+ //calculate contribution of each element in the set
+ sscore = nset.set_score;
+ //fprintf (stderr, "{%f-", sscore);
+ for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
+ {
+ contribution[i] = 10000.0;
+ if (nset.set_pairs[i] != -1)
+ {
+ id = nset.set_pairs[i];
+ nset.set_pairs[i] = -1;
+ reset_set_props (&nset, params);
+ contribution[i] = sscore - nset.set_score;
+ //fprintf (stderr, "[%f;%f]", nset.set_score, contribution[i]);
+ nset.set_pairs[i] = id;
+ reset_set_props (&nset, params);
+ //fprintf (stderr, "%f:, ", nset.set_score);
+ }
+ usedids[i] = -1;
+ }
+
+ //lastLeastcontri = 10000.0;
+ k=0;
+ //replace these many elements in the seed set
+ //fprintf (stderr, "} (%d) ", how_many_to_replace);
+ for (i = 0; i < how_many_to_replace; i++)
+ {
+ do
+ {
+ //we wil choose a random unused element as new element
+ id = rand ()%params->sorted_count;
+ }while (ok_to_add (id, &nset, params) == 0);
+
+ leastContri = 10000.0;
+ id2 = -1;
+ for (j = 0; j < PRIMERS_IN_SET_COUNT; j++)
+ {
+ if (nset.set_pairs[j] == -1) continue;
+ for (l = 0; l < k; l++)
+ if (usedids[l] == j) break;
+
+ if (leastContri > contribution[j] /*&& leastContri >= lastLeastcontri*/ && l == k)
+ {
+ leastContri = contribution[j];
+ id2 = j;
+ //fprintf (stderr, "%f:%d, ", leastContri, id2);
+ }
+ }
+
+ if (id2 != -1)
+ {
+ usedids[k++] = id2;
+ //lastLeastcontri = leastContri;
+ if (next_tabu_slot != -1)
+ add_in_tabu (nset.set_pairs[id2]);
+
+ nset.set_pairs[id2] = id;
+ }
+ }
+ //fprintf (stderr, "\n");
+ reset_set_props (&nset, params);
+ return nset;
+}
+
+//by replacing one element of the set with next from unused
+int which_one_to_replace;
+int last_replaced_with;
+pairset get_neighbor2 (pairset *set, SetParams *params)
+{
+ int pinset = 0;
+ int i, j;
+ pairset nset;
+
+ //take the seed set as next neighbour sets
+ nset = clone_set (set, params->options->dbsize);
+ //see how many elements are in this set
+ for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
+ {
+ if (nset.set_pairs[i] != -1) pinset++;
+ }
+
+ if (pinset == params->sorted_count) return nset;
+
+ if (which_one_to_replace == pinset) which_one_to_replace = 0;
+ if (last_replaced_with == params->sorted_count) last_replaced_with = 0;
+
+ j = -1;
+ for (i = 0; i < pinset; i++)
+ {
+ if (nset.set_pairs[i] != -1) j++;
+ if (j == which_one_to_replace) break;
+ }
+
+ for (j = last_replaced_with; j < params->sorted_count; j++)
+ if (ok_to_add (j, &nset, params) == 1) break;
+
+ if (j < params->sorted_count)
+ {
+ if (next_tabu_slot != -1)
+ add_in_tabu (nset.set_pairs[i]);
+
+ nset.set_pairs[i] = j;
+ reset_set_props (&nset, params);
+ }
+ last_replaced_with++;
+ which_one_to_replace++;
+
+ return nset;
+}
+
+//replace element having least contribution with the next from the list
+pairset get_neighbor3 (pairset *set, SetParams *params)
+{
+ int pinset = 0;
+ int i, j, id;
+ pairset nset;
+ float least_contribution;
+ float sscore;
+
+ //take the seed set as next neighbour sets
+ nset = clone_set (set, params->options->dbsize);
+ //see how many elements are in this set
+ for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
+ {
+ if (nset.set_pairs[i] != -1) pinset++;
+ }
+
+ if (pinset == params->sorted_count) return nset;
+
+ if (last_replaced_with == params->sorted_count) last_replaced_with = 0;
+
+ sscore = nset.set_score;
+ which_one_to_replace = -1;
+ least_contribution = 1000.0; //impossible
+ for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
+ {
+ if (nset.set_pairs[i] != -1)
+ {
+ id = nset.set_pairs[i];
+ nset.set_pairs[i] = -1;
+ reset_set_props (&nset, params);
+ if ((sscore - nset.set_score) < least_contribution)
+ {
+ which_one_to_replace = i;
+ least_contribution = sscore - nset.set_score;
+ }
+ nset.set_pairs[i] = id;
+ }
+ }
+
+ for (j = last_replaced_with; j < params->sorted_count; j++)
+ if (ok_to_add (j, &nset, params) == 1) break;
+
+ if (j < params->sorted_count && which_one_to_replace != -1)
+ {
+ if (next_tabu_slot != -1)
+ add_in_tabu (nset.set_pairs[which_one_to_replace]);
+
+ nset.set_pairs[which_one_to_replace] = j;
+ reset_set_props (&nset, params);
+ }
+ last_replaced_with++;
+
+ return nset;
+}
+
+float sa_P (float score, float nscore, float tem)
+{
+ if (nscore >= score) return 0.95;
+ if (nscore < score && tem > 0.3) return 0.1;
+ if (nscore < score && tem > 0.1) return 0.0001;
+ return 0.0;
+}
+
+void extend_set_to (pairset *pair_set, SetParams *params, int extend_to_cnt)
+{
+ int pinset = 0;
+ int i, j;
+ if (extend_to_cnt > PRIMERS_IN_SET_COUNT) return;
+
+ for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
+ if (pair_set->set_pairs[i] != -1) pinset++;
+
+ if (pinset >= extend_to_cnt) return;
+
+ for (i = 0; i < params->sorted_count; i++)
+ {
+ if (ok_to_add (i, pair_set, params) == 1)
+ {
+ for (j = 0; j < PRIMERS_IN_SET_COUNT; j++)
+ if (pair_set->set_pairs[j] == -1) break;
+ add_pair_in_set (pair_set, j, i, params);
+ reset_set_props (pair_set, params);
+ pinset++;
+ if (pinset >= extend_to_cnt) break;
+ }
+ }
+}
+
+pairset * extend_set_randomly (pairset *pset, SetParams *params, int extend_to_cnt)
+{
+ int pinset = 0;
+ int i = 0;
+ int id;
+ pairset *pair_set;
+
+ if (extend_to_cnt > PRIMERS_IN_SET_COUNT) return pset;
+
+ if (pset == NULL)
+ {
+ pair_set = (pairset *) ECOMALLOC(sizeof (pairset),
+ "Could not allocate memory for pair");
+ pair_set->set_wellIdentifiedTaxa = (int *) ECOMALLOC(params->options->dbsize*sizeof (int),
+ "Could not allocate memory for pair set WI");
+
+ for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
+ pair_set->set_pairs[i] = -1;
+ }
+ else
+ pair_set = pset;
+
+ for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
+ if (pair_set->set_pairs[i] != -1) pinset++;
+
+ if (pinset >= extend_to_cnt) return pair_set;
+
+ i = 0;
+ if (pinset == 0)
+ {
+ id = rand ()%params->sorted_count;
+ add_pair_in_set (pair_set, i, id, params);
+ i++;
+ pinset++;
+ }
+
+ while (pinset < extend_to_cnt)
+ {
+ do
+ {
+ //we wil choose a random unused element as new element
+ id = rand ()%params->sorted_count;
+ }while (ok_to_add (id, pair_set, params) == 0);
+ add_pair_in_set (pair_set, i, id, params);
+ i++;
+ pinset++;
+ }
+ return pair_set;
+}
+
+void set_reduce_to_best (pairset *pair_set, SetParams *params)
+{
+ int original_members[PRIMERS_IN_SET_COUNT];
+ int i;
+ int mcnt = 0;
+ float max_score;
+ int m_to_remove = -1;
+ int tmem;
+
+ for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
+ {
+ original_members[i] = pair_set->set_pairs[i];
+ if (original_members[i] != -1) mcnt++;
+ }
+
+ while (mcnt > 3)
+ {
+ max_score = pair_set->set_score;
+ m_to_remove = -1;
+
+ for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
+ {
+ if (pair_set->set_pairs[i] == -1) continue;
+ tmem = pair_set->set_pairs[i];
+ pair_set->set_pairs[i] = -1; //remove current element temporarily
+ reset_set_props (pair_set, params);
+ pair_set->set_pairs[i] = tmem; //restore
+
+ if (max_score <= pair_set->set_score)
+ {
+ max_score = pair_set->set_score;
+ m_to_remove = i;
+ }
+ }
+
+ if (m_to_remove != -1)
+ {
+ pair_set->set_pairs[m_to_remove] = -1; //remove element
+ reset_set_props (pair_set, params);
+ mcnt--;
+ }
+ else
+ {
+ reset_set_props (pair_set, params);
+ break;
+ }
+ }
+}
+
+float sa_temp (int k, int kmax)
+{
+ return ((kmax - k)*1.0)/(kmax*1.0);
+}
+
+void sets_by_SimulatedAnealing (pairset *pair_set,
+ ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options)
+{
+ SetParams params;
+ pairset set, bset, nset;
+ float score, bscore, nscore;
+ int k = 0, mcnt=0;
+ int kmax = sorted_count*10;
+ float max_score = 10000.0;
+ float min_spc, min_cov;
+ float max_spc, max_cov;
+ double randval = 0;
+
+ int32_t count_per_size;
+ int32_t size_counter = 1;
+
+ srand ( time(NULL) );
+ params.sortedpairs = sortedpairs;
+ params.sorted_count = sorted_count;
+ params.seqdb = seqdb;
+ params.options = options;
+ which_one_to_replace = 0;
+ last_replaced_with = 0;
+ next_tabu_slot = -1; //dont use tabu search props
+ total_pairs = sorted_count;
+
+ if (pair_set == NULL)
+ {
+ pair_set = extend_set_randomly (NULL, ¶ms, 3);
+ printf("\nStart Random seed set for Simulated :\n");
+ print_set_info (&pair_set, ¶ms);
+ }
+ min_spc = max_spc = pair_set->set_specificity;
+ min_cov = max_cov = pair_set->set_coverage;
+
+ for (k = 0; k < PRIMERS_IN_SET_COUNT; k++)
+ {
+ if (pair_set->set_pairs[k] != -1) mcnt++;
+ }
+ count_per_size = kmax/(PRIMERS_IN_SET_COUNT-mcnt);
+ k = 1;
+
+ set = clone_set (pair_set, options->dbsize);
+
+ /*if (mcnt < 5)
+ {
+ printf ("Set before extension:\n");
+ print_set_info (&set, ¶ms);
+ extend_set_to (&set, ¶ms, 5);
+ printf ("Set after extension:\n");
+ print_set_info (&set, ¶ms);
+ set_reduce_to_best (&set, ¶ms);
+ printf ("Set after reduction to best size:\n");
+ print_set_info (&set, ¶ms);
+ }*/
+ mcnt++;
+ extend_set_to (&set, ¶ms, mcnt);
+
+ bset = clone_set (&set, options->dbsize);
+ score = bset.set_score;
+ bscore = score;
+ nset.set_wellIdentifiedTaxa = NULL;
+
+ //srand ( time(NULL) );
+
+ while (k <= kmax && score < max_score)
+ {
+ if (k == (size_counter*count_per_size))
+ {
+ size_counter++;
+ mcnt++;
+ extend_set_to (&set, ¶ms, mcnt);
+ }
+ //nset = get_neighbor (&set, ¶ms); //all random
+ //nset = get_neighbor2 (&set, ¶ms); //replace next with next available
+ nset = get_neighbor3 (&set, ¶ms); //replace the one with least contribution with next
+ //nset = get_neighbor4 (&set, ¶ms); //replace randome no of least contributing elements with random elements in the remaining set
+
+ if (nset.set_specificity < min_spc)
+ min_spc = nset.set_specificity;
+
+ if (nset.set_specificity > max_spc)
+ max_spc = nset.set_specificity;
+
+ if (nset.set_coverage < min_cov)
+ min_cov = nset.set_coverage;
+
+ if (nset.set_coverage > max_cov)
+ max_cov = nset.set_coverage;
+
+ nscore = nset.set_score;
+ printf ("Neighbor: ");
+ print_set_info (&nset, ¶ms);
+
+ if (nscore > bscore)
+ {
+ ECOFREE (bset.set_wellIdentifiedTaxa, "Could not free memory for pair set wi");
+ bset = clone_set (&nset, options->dbsize);
+ bscore = nscore;
+ printf ("best: ");
+ print_set_info (&nset, ¶ms);
+ }
+
+ randval = (double)rand()/(double)RAND_MAX;
+ if (sa_P (score, nscore, sa_temp (k,kmax)) > randval)
+ {
+ ECOFREE (set.set_wellIdentifiedTaxa, "Could not free memory for pair set wi");
+ set = clone_set (&nset, options->dbsize);
+ score = nscore;
+ //which_one_to_replace = 0;
+ //last_replaced_with = 0;
+ //printf ("Seed Set: ");
+ //print_set_info (&set, ¶ms);
+ }
+ k++;
+ ECOFREE (nset.set_wellIdentifiedTaxa, "Could not free memory for pair set wi");
+ }
+ printf ("Minimum specificity: %0.3f, Maximum specificity: %0.3f, range: %0.3f\n",min_spc, max_spc, max_spc-min_spc);
+ printf ("Minimum coverage: %0.3f, Maximum coverage: %0.3f, range: %0.3f\n",min_cov, max_cov, max_cov-min_cov);
+}
+
+
+void sets_by_TabuSearch (pairset *pair_set,
+ ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options)
+{
+ SetParams params;
+ pairset set, bset, nset;
+ float bscore, nscore;
+ int k = 0, mcnt=0;
+ int kmax = sorted_count*10;
+ float max_score = 1000.0;
+ float min_spc, min_cov;
+ float max_spc, max_cov;
+
+ int32_t count_per_size;
+ int32_t size_counter = 1;
+
+ srand ( time(NULL) );
+ params.sortedpairs = sortedpairs;
+ params.sorted_count = sorted_count;
+ params.seqdb = seqdb;
+ params.options = options;
+ which_one_to_replace = 0;
+ last_replaced_with = 0;
+ next_tabu_slot = 0; //use tabu search
+ total_pairs = sorted_count;
+
+ if (pair_set == NULL)
+ pair_set = extend_set_randomly (NULL, ¶ms, 3);
+ min_spc = max_spc = pair_set->set_specificity;
+ min_cov = max_cov = pair_set->set_coverage;
+
+ for (k = 0; k < PRIMERS_IN_SET_COUNT; k++)
+ {
+ if (pair_set->set_pairs[k] != -1) mcnt++;
+ }
+ count_per_size = kmax/(PRIMERS_IN_SET_COUNT-mcnt);
+
+ set = clone_set (pair_set, options->dbsize);
+ /*if (mcnt < 5)
+ {
+ printf ("Set before extension:\n");
+ print_set_info (&set, ¶ms);
+ extend_set_to (&set, ¶ms, 5);
+ printf ("Set after extension:\n");
+ print_set_info (&set, ¶ms);
+ set_reduce_to_best (&set, ¶ms);
+ printf ("Set after reduction to best size:\n");
+ print_set_info (&set, ¶ms);
+ }*/
+ mcnt++;
+ extend_set_to (&set, ¶ms, mcnt);
+
+ bset = clone_set (&set, options->dbsize);
+ bscore = bset.set_score;
+ nset.set_wellIdentifiedTaxa = NULL;
+
+ for (k = 0; k < PRIMERS_IN_SET_COUNT; k++)
+ TabuList[k] = -1;
+
+ k = 1;
+ while (k < kmax && bscore < max_score)
+ {
+ if (k == (size_counter*count_per_size))
+ {
+ size_counter++;
+ mcnt++;
+ extend_set_to (&set, ¶ms, mcnt);
+ }
+
+ //nset = get_neighbor (&set, ¶ms); //all random
+ //nset = get_neighbor2 (&set, ¶ms); //replace next with next available
+ nset = get_neighbor3 (&set, ¶ms); //replace the one with least contribution with next
+ //nset = get_neighbor4 (&set, ¶ms); //replace randome no of least contributing elements with random elements in the remaining set
+
+ if (nset.set_specificity < min_spc)
+ min_spc = nset.set_specificity;
+
+ if (nset.set_specificity > max_spc)
+ max_spc = nset.set_specificity;
+
+ if (nset.set_coverage < min_cov)
+ min_cov = nset.set_coverage;
+
+ if (nset.set_coverage > max_cov)
+ max_cov = nset.set_coverage;
+
+ nscore = nset.set_score;
+ printf ("Neighbor: ");
+ print_set_info (&nset, ¶ms);
+
+ if (nscore > bscore)
+ {
+ ECOFREE (bset.set_wellIdentifiedTaxa, "Could not free memory for pair set wi");
+ bset = clone_set (&nset, options->dbsize);
+ bscore = nscore;
+ printf ("best: ");
+ print_set_info (&nset, ¶ms);
+ }
+ ECOFREE (set.set_wellIdentifiedTaxa, "Could not free memory for pair set wi");
+ set = nset;
+ k++;
+ }
+ printf ("Minimum specificity: %0.3f, Maximum specificity: %0.3f, range: %0.3f\n",min_spc, max_spc, max_spc-min_spc);
+ printf ("Minimum coverage: %0.3f, Maximum coverage: %0.3f, range: %0.3f\n",min_cov, max_cov, max_cov-min_cov);
+}
+
+pairset *sets_by_BruteForce (ppair_t * sortedpairs,
+//void sets_by_BruteForce (ppair_t * sortedpairs,
+ int32_t sorted_count, pecodnadb_t seqdb, poptions_t options)
+{
+ SetParams params;
+ params.sortedpairs = sortedpairs;
+ params.sorted_count = sorted_count;
+ params.seqdb = seqdb;
+ params.options = options;
+
+ pairset set;
+ pairset *pset = NULL;
+ if (sorted_count < 3)
+ {
+ printf ("Too few primer pairs to find a pair set.\n");
+ return NULL;
+ }
+
+ set.set_wellIdentifiedTaxa = (int *) ECOMALLOC(options->dbsize*sizeof (int),
+ "Could not allocate memory for pair set WI");
+ int32_t set_indeces_array[PRIMERS_IN_SET_COUNT];
+ //int start_elements = 3;
+ int end_elements = 3;
+ int current_elements = 3;
+ int32_t i, j;
+ float maxscore = -1000.0;
+ int maxcount = 2000;
+ int counter = 0;
+
+ if (sorted_count <= PRIMERS_IN_SET_COUNT)
+ end_elements = sorted_count;
+
+ if (end_elements < sorted_count)
+ {
+ pset = (pairset *) ECOMALLOC(sizeof (pairset),
+ "Could not allocate memory for pair");
+ pset->set_wellIdentifiedTaxa = (int *) ECOMALLOC(options->dbsize*sizeof (int),
+ "Could not allocate memory for pair set WI");
+ }
+
+ while (current_elements <= end_elements)
+ {
+ for (i=0; idbsize*sizeof (int));
+ for (i=0; i maxscore)
+ {
+ maxscore = set.set_score;
+ printf ("best: ");
+ print_set_info (&set, ¶ms);
+
+ ECOFREE (pset->set_wellIdentifiedTaxa, "Could not free memory for pair set wi");
+ *pset = clone_set (&set, options->dbsize);
+ }
+ else
+ {
+ printf ("set: ");
+ print_set_info (&set, ¶ms);
+ }
+
+ for (i=current_elements-1; i>0; i--)
+ {
+ set_indeces_array[i]++;
+ if (set_indeces_array[i] == (sorted_count+(i-current_elements+1)))
+ {
+ set_indeces_array[i] = set_indeces_array[i-1]+2;
+ for (j=i+1; j= sorted_count)
+ {
+ break;
+ }
+ if (i < current_elements) //above loop broken?
+ {
+ current_elements++;
+ break;
+ }
+ counter++;
+ if (counter > maxcount)
+ break;
+ }
+ if (counter > maxcount)
+ break;
+ }
+ return pset;
+}
+
+void build_and_print_sets (ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options)
+{
+ SetParams params;
+ pairset pair_set;
+ pairset *pset;
+
+ params.sortedpairs = sortedpairs;
+ params.sorted_count = sorted_count;
+ params.seqdb = seqdb;
+ params.options = options;
+
+ pair_set = build_primers_set_greedy_spc (¶ms);
+ printf("Greedy algorithm results based on specificity:\n");
+ print_set_info (&pair_set, ¶ms);
+ if (pair_set.set_wellIdentifiedTaxa)
+ ECOFREE (pair_set.set_wellIdentifiedTaxa, "Could not free memory for pair set wi");
+
+ pair_set = build_primers_set_greedy_cov (¶ms);
+ printf("\nGreedy algorithm results based on coverage:\n");
+ print_set_info (&pair_set, ¶ms);
+ if (pair_set.set_wellIdentifiedTaxa)
+ ECOFREE (pair_set.set_wellIdentifiedTaxa, "Could not free memory for pair set wi");
+
+ pset = extend_set_randomly (NULL, ¶ms, 3);
+ printf("\nStart Random seed set:\n");
+ print_set_info (pset, ¶ms);
+
+ printf("\nResults from simulated Anealing:\n");
+ sets_by_SimulatedAnealing (pset, sortedpairs, sorted_count, seqdb, options);
+ printf("\nResults from Tabu Search:\n");
+ sets_by_TabuSearch (pset, sortedpairs, sorted_count, seqdb, options);
+}
+
+void primers_graph_graphviz (ppair_t * sortedpairs,
+ int32_t sorted_count, poptions_t options)
+{
+ int32_t i, j, k, total_links;
+ char fileName[100] = "PrimerLinks";
+ int *owi;
+ int *iwi;
+ int allowedtaxa;
+ FILE *of;
+
+ srand ( time(NULL) );
+ sprintf (fileName, "PrimerLinks_%d.gv", rand ());
+ of = fopen (fileName, "w");
+
+ fprintf (of, "graph primerlinks {\n");
+ for (i=0; iwellIdentifiedSeqs;
+
+ for (j=i+1; jwellIdentifiedSeqs;
+ total_links = 0;
+
+ for (k=0; kdbsize; k++)
+ if (owi[k] == 1 && iwi[k] == 1)
+ total_links++;
+
+ //if (total_links > 0 && ((total_links*1.0) <= (options->max_links_percent*options->dbsize)))
+ //if (total_links > 0 && total_links <= options->max_links_percent)
+
+ if((sortedpairs[i]->intaxa - sortedpairs[i]->notwellidentifiedtaxa) < (sortedpairs[j]->intaxa - sortedpairs[j]->notwellidentifiedtaxa))
+ allowedtaxa = (sortedpairs[i]->intaxa - sortedpairs[i]->notwellidentifiedtaxa)/2;
+ else
+ allowedtaxa = (sortedpairs[j]->intaxa - sortedpairs[j]->notwellidentifiedtaxa)/2;
+
+
+ //if (total_links > 5 && (total_links <= options->max_links_percent || options->max_links_percent==-1))
+ // fprintf (of, "\t%d -- %d [label=\"%d: %0.2f: %0.2f\"];\n", i, j, total_links,sortedpairs[i]->bc,sortedpairs[j]->bc );
+
+ allowedtaxa = options->max_links_percent;
+ if (total_links > 5 && total_links < allowedtaxa)
+ fprintf (of, "\t%d -- %d [label=\"%d: %0.2f: %0.2f\"];\n", i, j, total_links,sortedpairs[i]->bc,sortedpairs[j]->bc );
+ //fprintf (of, "\t%d\t%d\t%d\n", i, j, total_links);
+
+ //fprintf (of, "\t%d\t%d\t%d\n", i, j, total_links);
+ //fprintf (of, "\t%d -- %d;\n", i, j, total_links);
+ }
+ }
+ fprintf (of, "}\n");
+ fclose (of);
+}
+
+int32_t *addinset (int32_t *set, int32_t i, int32_t j, int32_t* slots, int32_t *index)
+{
+ int32_t k;
+
+ if (*index == *slots)
+ {
+ *slots += 50;
+ set = ECOREALLOC(set, (*slots)*sizeof (int32_t),
+ "Could not allocate memory for index set.");
+ }
+
+ if (i > -1)
+ {
+ for (k=0; k<*index; k++)
+ if (set[k] == i) break;
+ if (k== *index)
+ set[(*index)++] = i;
+ }
+
+ if (j > -1)
+ {
+ for (k=0; k<*index; k++)
+ if (set[k] == j) break;
+ if (k== *index)
+ set[(*index)++] = j;
+ }
+
+ return set;
+}
+
+size_t primers_changeSortedArray (ppair_t ** pairs,
+ size_t sorted_count, poptions_t options)
+{
+ int32_t i, j, k, l, total_links;
+ int *owi;
+ int *iwi;
+ int allowedtaxa;
+ ppair_t *sortedpairs = *pairs;
+ bool_t passed;
+
+ int32_t *idx_set = NULL;
+ int32_t slots=50, index=0;
+
+ idx_set = ECOMALLOC(slots*sizeof (int32_t),
+ "Could not allocate memory for index set.");
+
+ for (i=0; iwellIdentifiedSeqs;
+ passed = FALSE;
+
+ for (j=0; jwellIdentifiedSeqs;
+ total_links = 0;
+
+ for (k=0; kdbsize; k++)
+ if (owi[k] == 1 && iwi[k] == 1)
+ total_links++;
+
+ //if (total_links > 0 && ((total_links*1.0) <= (options->max_links_percent*options->dbsize)))
+ //if (total_links > 0 && total_links <= options->max_links_percent)
+
+ if((sortedpairs[i]->intaxa - sortedpairs[i]->notwellidentifiedtaxa) < (sortedpairs[j]->intaxa - sortedpairs[j]->notwellidentifiedtaxa))
+ allowedtaxa = (sortedpairs[i]->intaxa - sortedpairs[i]->notwellidentifiedtaxa)/2;
+ else
+ allowedtaxa = (sortedpairs[j]->intaxa - sortedpairs[j]->notwellidentifiedtaxa)/2;
+
+
+ //if (total_links > 5 && (total_links <= options->max_links_percent || options->max_links_percent==-1))
+ // fprintf (of, "\t%d -- %d [label=\"%d: %0.2f: %0.2f\"];\n", i, j, total_links,sortedpairs[i]->bc,sortedpairs[j]->bc );
+
+ if (options->max_links_percent > 0)
+ {
+ allowedtaxa = options->max_links_percent;
+ if (total_links > allowedtaxa)
+ passed = TRUE;
+ break;
+ }
+ else
+ if (!(total_links > 5 && total_links <= allowedtaxa))
+ {
+ //idx_set = addinset (idx_set, i, j, &slots, &index);
+ passed = TRUE;
+ break;
+ }
+ }
+ if (passed == TRUE)
+ idx_set = addinset (idx_set, i, -1, &slots, &index);
+ }
+
+ i=-1;
+ for (j=0; jwellIdentifiedSeqs, "Cannot free wi for changing sorted array");
+ ECOFREE (sortedpairs[j]->pcr.amplifias, "Cannot free wi for changing sorted array");
+ if (i == -1) i = j;
+ }
+ else
+ {
+ if (i != -1)
+ sortedpairs[i++] = sortedpairs[j];
+ }
+ }
+ ECOFREE (idx_set, "Cannot free index set.");
+ if (i != -1)
+ {
+ *pairs = ECOREALLOC (*pairs, i*sizeof(pair_t), "Cannot free wi for changing sorted array");
+ }
+ else i=sorted_count;
+ return i;
+}
+
+
+int32_t *addinset_withLinks (int32_t *set, int32_t i, int32_t* slots, int32_t *index, ppair_t *pairs, poptions_t options)
+{
+ int32_t j, k, total_links;
+ int *owi;
+ int *iwi;
+ bool_t passed = TRUE;
+ int allowedtaxa;
+
+ //see if we need to extend the set array
+ if (*index == *slots)
+ {
+ *slots += 50;
+ set = ECOREALLOC(set, (*slots)*sizeof (int32_t),
+ "Could not allocate memory for index set.");
+ }
+
+ //find no of links of current element i with all the elements
+ //in the set to see that they are within limit
+ owi = pairs[i]->coveredSeqs;
+ for (j=0; j<*index; j++)
+ {
+ iwi = pairs[set[j]]->coveredSeqs;
+ total_links = 0;
+
+ for (k=0; kdbsize; k++)
+ if (owi[k] == 1 && iwi[k] == 1)
+ total_links++;
+
+ //if((pairs[i]->intaxa - pairs[i]->notwellidentifiedtaxa) < (pairs[set[j]]->intaxa - pairs[set[j]]->notwellidentifiedtaxa))
+ // allowedtaxa = (pairs[i]->intaxa - pairs[i]->notwellidentifiedtaxa)/2;
+ //else
+ // allowedtaxa = (pairs[set[j]]->intaxa - pairs[set[j]]->notwellidentifiedtaxa)/2;
+
+ if(pairs[i]->intaxa < pairs[set[j]]->intaxa)
+ allowedtaxa = pairs[i]->intaxa/2;
+ else
+ allowedtaxa = pairs[set[j]]->intaxa/2;
+
+ if (!(total_links > 5 && total_links <= allowedtaxa))
+ passed = FALSE;
+ }
+
+ //links respect the limits with all set elements
+ if (passed)
+ {
+ for (k=0; k<*index; k++)
+ if (set[k] == i) break;
+ if (k== *index)
+ set[(*index)++] = i;
+ }
+
+ return set;
+}
+
+size_t primers_filterWithGivenLinks (ppair_t ** pairs,
+ size_t sorted_count, poptions_t options)
+{
+ int32_t i, j, k;
+ ppair_t *sortedpairs = *pairs;
+ bool_t passed;
+
+ int32_t *idx_set = NULL;
+ int32_t slots=50, index=0;
+ int *cov = ECOMALLOC(options->dbsize*sizeof (int),
+ "Could not allocate memory for index set.");
+
+ idx_set = ECOMALLOC(slots*sizeof (int32_t),
+ "Could not allocate memory for index set.");
+
+ for (i=sorted_count-1; i>=0; i--)
+ {
+ idx_set = addinset_withLinks (idx_set, i, &slots, &index, sortedpairs, options);
+ }
+
+ i=-1;
+ for (j=0; jcoveredSeqs, "Cannot free wi for changing sorted array");
+ ECOFREE (sortedpairs[j]->wellIdentifiedSeqs, "Cannot free wi for changing sorted array");
+ ECOFREE (sortedpairs[j]->pcr.amplifias, "Cannot free wi for changing sorted array");
+ if (i == -1) i = j;
+ }
+ else
+ {
+ if (i != -1)
+ sortedpairs[i++] = sortedpairs[j];
+ }
+ }
+ ECOFREE (idx_set, "Cannot free index set.");
+ if (i != -1)
+ {
+ *pairs = ECOREALLOC (*pairs, i*sizeof(pair_t), "Cannot free wi for changing sorted array");
+ }
+ else i=sorted_count;
+
+
+ for (j=0; jdbsize; k++)
+ if ((*pairs)[j]->coveredSeqs[k] == 1)
+ cov[k] = 1;
+ j=0;
+ for (k=0; kdbsize; k++)
+ if (cov[k] == 1)
+ j++;
+ fprintf (stderr, "\nALL ELEMENTS COVERAGE: (%d/%d) %0.2f\n", j, options->intaxa, j*1.0/options->intaxa);
+ ECOFREE (cov, "Cannot free cov");
+
+ return i;
+}
diff --git a/src/libecoprimer/PrimerSets.h b/src/libecoprimer/PrimerSets.h
index 8e46432..44d6294 100644
--- a/src/libecoprimer/PrimerSets.h
+++ b/src/libecoprimer/PrimerSets.h
@@ -13,6 +13,8 @@ typedef struct {
float set_lmean;
float set_lcov;
float set_score;
+ int32_t set_intaxa;
+ int32_t set_wi_cnt;
}pairset;
typedef struct{
@@ -33,9 +35,24 @@ typedef struct{
void add_pair_in_set (pairset *pair_set, int32_t pset_idx, int32_t prb_idx, SetParams *pparams);
void get_next_pair_options (int *pair_wi_count_sorted_ids, pairset *pair_set, SetParams *pparams);
float get_links_distribution (int prb_idx, pairset *prob_set, SetParams *pparams);
-pairset build_primers_set (ppair_t* sortedpairs, int32_t sorted_count, pecodnadb_t seqdb,
- poptions_t options);
+pairset build_primers_set_greedy_spc (SetParams *pparams);
void get_set_mean_cov_stats (pairset *prob_set, SetParams *pparams);
void some_other_set_possibilities (pairset *pair_set,
ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options);
+void sets_by_SimulatedAnealing (pairset *pair_set,
+ ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options);
+void sets_by_TabuSearch (pairset *pair_set,
+ ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options);
+pairset * sets_by_BruteForce (ppair_t * sortedpairs,
+ int32_t sorted_count, pecodnadb_t seqdb, poptions_t options);
+pairset * extend_set_randomly (pairset *pair_set, SetParams *params, int extend_to_cnt);
+void build_and_print_sets (ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options);
+int32_t get_next_option_increasing_cov (pairset *pair_set, SetParams *pparams);
+void reset_set_props (pairset *pair_set, SetParams *pparams);
+void primers_graph_graphviz (ppair_t * sortedpairs,
+ int32_t sorted_count, poptions_t options);
+size_t primers_changeSortedArray (ppair_t ** pairs,
+ size_t sorted_count, poptions_t options);
+size_t primers_filterWithGivenLinks (ppair_t ** pairs,
+ size_t sorted_count, poptions_t options);
#endif
diff --git a/src/libecoprimer/ahocorasick.c b/src/libecoprimer/ahocorasick.c
new file mode 100755
index 0000000..078be0d
--- /dev/null
+++ b/src/libecoprimer/ahocorasick.c
@@ -0,0 +1,479 @@
+/*
+ * ahocorasick.h
+ *
+ * Created on: 26 march 2011
+ * Author: tiayyba
+ */
+#include
+#include "hashencoder.h"
+#include "ahocorasick.h"
+
+void ahoc_graphKeywordTree (aho_state *root);
+aho_state *groot = NULL; //just for graph testing
+
+#define BASEATINDEX(w, l, i) (uint8_t)((((w)&(0x3LLU<<(((l)-(i))*2)))>>(((l)-(i))*2)) & 0x3LLU)
+
+void ahoc_addOutputElement (aho_state *node, bool_t isdirect, uint32_t idx)
+{
+ if (!node) return;
+ if (node->output.count == 0)
+ node->output.out_set = ECOMALLOC(sizeof(aho_output),
+ "Cannot allocate memory for aho-corasick state output element");
+ else
+ node->output.out_set = ECOREALLOC(node->output.out_set, (node->output.count+1)*sizeof(aho_output),
+ "Cannot allocate memory for aho-corasick state output element");
+ node->output.out_set[node->output.count].wordidx = idx;
+ node->output.out_set[node->output.count].isdirect = isdirect;
+ node->output.count++;
+}
+
+//is the passed output element in the set
+bool_t ahoc_isOutputIn (aho_state *node, aho_output ot)
+{
+ uint32_t i;
+
+ for (i=0; ioutput.count; i++)
+ if (node->output.out_set[i].isdirect == ot.isdirect && node->output.out_set[i].wordidx == ot.wordidx) return TRUE;
+ return FALSE;
+}
+
+//take union of output of the two nodes and put in node1
+void ahoc_unionOutputElements (aho_state *node1, aho_state *node2)
+{
+ uint32_t i;
+
+ for (i=0; ioutput.count; i++)
+ if (ahoc_isOutputIn (node1, node2->output.out_set[i]) == FALSE)
+ ahoc_addOutputElement (node1, node2->output.out_set[i].isdirect, node2->output.out_set[i].wordidx);
+}
+
+void ahoc_addKeyword (aho_state *root, word_t w, bool_t isdirect, uint32_t idx, poptions_t options)
+{
+ uint32_t i;
+ aho_state *nextnode = root;
+ uint8_t basecode;
+ static uint32_t state_id = 0;
+
+ //fprintf (stderr, "%s\n", ecoUnhashWord(w, options->primer_length));
+ for (i=1; i<=options->primer_length; i++)
+ {
+ basecode = BASEATINDEX (w, options->primer_length, i);
+ //fprintf (stderr, "%d", basecode);
+ if (nextnode->next[basecode] == NULL)
+ {
+ //add new state
+ nextnode->next[basecode] = ECOMALLOC(sizeof(aho_state),
+ "Cannot allocate memory for aho-corasick state");
+ nextnode = nextnode->next[basecode];
+ //initialize state
+ nextnode->id = ++state_id;
+ nextnode->next[0]=nextnode->next[1]=nextnode->next[2]=nextnode->next[3]=NULL;
+ nextnode->fail = NULL;
+ nextnode->output.count = 0;
+ }
+ else
+ nextnode = nextnode->next[basecode];
+ }
+ //fprintf (stderr, "\n", basecode);
+ //new pattern addess so add node ouptup element
+ ahoc_addOutputElement (nextnode, isdirect, idx);
+}
+
+void ahoc_buildKeywordTree (aho_state *root, pwordcount_t words, poptions_t options)
+{
+ uint32_t i;
+ if (!root) return;
+
+ //init root
+ root->id = 0;
+ root->next[0]=root->next[1]=root->next[2]=root->next[3]=NULL;
+ root->fail = NULL;
+ root->output.count = 0;
+
+ //now add each word as a pattern in the keyword tree
+ for (i=0; isize; i++)
+ {
+ //add direct word
+ word_t w=WORD(words->words[i]);
+ ahoc_addKeyword (root, w, TRUE, i, options);
+
+ //add reverse word
+ w=ecoComplementWord(w,options->primer_length);
+ ahoc_addKeyword (root, w, FALSE, i, options);
+ }
+
+ //loop on root if some base has no out going edge from roots
+ for (i=0; i<4; i++)
+ if (root->next[i] == NULL)
+ root->next[i] = root;
+}
+
+void ahoc_enqueue (aho_queue *ahoqueue, aho_state *node)
+{
+ queue_node *q;
+ if (node == NULL) return;
+
+ q = ECOMALLOC(sizeof(queue_node),
+ "Cannot allocate memory for aho-corasick queue node");
+ q->state_node = node;
+ q->next = NULL;
+
+ if (ahoqueue->first == NULL)
+ {
+ ahoqueue->first = q;
+ ahoqueue->last = q;
+ }
+ else
+ {
+ ahoqueue->last->next = q;
+ ahoqueue->last = q;
+ }
+}
+
+aho_state *ahoc_dequeue (aho_queue *ahoqueue)
+{
+ aho_state *node = NULL;
+ queue_node *q;
+
+ if (ahoqueue->first == NULL) return node;
+ q = ahoqueue->first;
+ ahoqueue->first = q->next;
+
+ node = q->state_node;
+ ECOFREE (q, "Cannot free memory for aho-corasick queue node");
+ return node;
+}
+
+//set fail links and output sets for the keyword tree
+void ahoc_updateForFailAndOutput (aho_state *root)
+{
+ int32_t i;
+ aho_queue Q;
+ aho_state *node_r;
+ aho_state *node_u;
+ aho_state *node_v;
+
+ //empty queue
+ Q.first = NULL;
+ Q.last = NULL;
+
+ //for us alphabet has 4 elements, A=0, C=1, G=2 and T=3
+ for (i=0; i<4; i++)
+ {
+ if (root->next[i] != root && root->next[i] != NULL)
+ {
+ root->next[i]->fail = root;
+ ahoc_enqueue (&Q, root->next[i]);
+ }
+ }
+
+ //while queue not empty
+ while (Q.first != NULL)
+ {
+ node_r = ahoc_dequeue (&Q);
+ for (i=0; i<4; i++)
+ {
+ if (node_r->next[i] != NULL)
+ {
+ node_u = node_r->next[i];
+ ahoc_enqueue (&Q, node_u);
+ node_v = node_r->fail;
+ while (node_v->next[i] == NULL)
+ node_v = node_v->fail;
+ node_u->fail = node_v->next[i];
+ ahoc_unionOutputElements (node_u, node_u->fail);
+ }
+ }
+ }
+}
+
+void ahoc_freeKeywordTree (aho_state *node)
+{
+ int i;
+ for (i=0; i<4; i++)
+ if (node->next[i])
+ ahoc_freeKeywordTree (node->next[i]);
+ if (node->output.count > 0)
+ ECOFREE (node->output.out_set, "Free failed for node output");
+ ECOFREE (node, "Free failed for node");
+}
+
+pprimercount_t ahoc_lookforStrictPrimers (pecodnadb_t database, uint32_t seqdbsize,uint32_t exampleCount,
+ pwordcount_t words,poptions_t options)
+{
+ aho_state automaton_root;
+ aho_state *curr_state;
+ //uint32_t inSequenceQuorum;
+ uint32_t outSequenceQuorum;
+ pprimer_t data;
+ pprimercount_t primers;
+ uint32_t i, j, k;
+ int32_t pos;
+ uint32_t lmax;
+ char *base;
+ int8_t code;
+ uint32_t goodPrimers=0;
+ static int iii=0;
+
+
+ //inSequenceQuorum = (uint32_t)floor((float)exampleCount * options->sensitivity_quorum);
+ outSequenceQuorum = (uint32_t)floor((float)(seqdbsize-exampleCount) * options->false_positive_quorum);
+
+ //fprintf(stderr," Primers should be at least present in %d/%d example sequences\n",inSequenceQuorum,exampleCount);
+ fprintf(stderr," Primers should not be present in more than %d/%d counterexample sequences\n",outSequenceQuorum,(seqdbsize-exampleCount));
+
+ data = ECOMALLOC(words->size * sizeof(primer_t),
+ "Cannot allocate memory for fuzzy matching results");
+ for (i=0; i < words->size; i++)
+ {
+ data[i].word=WORD(words->words[i]);
+ data[i].inexample = 0;
+ data[i].outexample= 0;
+
+ data[i].directCount=ECOMALLOC(seqdbsize * sizeof(uint32_t),
+ "Cannot allocate memory for primer position");
+ data[i].directPos = ECOMALLOC(seqdbsize * sizeof(poslist_t),
+ "Cannot allocate memory for primer position");
+ data[i].reverseCount=ECOMALLOC(seqdbsize * sizeof(uint32_t),
+ "Cannot allocate memory for primer position");
+ data[i].reversePos = ECOMALLOC(seqdbsize * sizeof(poslist_t),
+ "Cannot allocate memory for primer position");
+ }
+
+ //build keywords automaton
+ ahoc_buildKeywordTree (&automaton_root, words, options);
+ //set fail links and output sets
+ ahoc_updateForFailAndOutput (&automaton_root);
+
+ //debug; print keywordtree in a gv file
+ //ahoc_graphKeywordTree (&automaton_root);
+
+ //loop on each sequence for its each base and find words
+ for (i=0; i < seqdbsize; i++)
+ {
+ if(database[i]->SQ_length <= options->primer_length) continue;
+
+ lmax = database[i]->SQ_length;
+ if (!options->circular)
+ lmax += options->primer_length-1;
+ curr_state = &automaton_root;
+
+ for (j=0,base=database[i]->SQ; jSQ_length) base=database[i]->SQ;
+
+ //code = encoder[(*base) - 'A'];
+ code = *base;
+ //if (iii++ < 30)
+ // fprintf (stderr, "%d:%d,", *base, code);
+ if (code < 0 || code > 3)
+ {
+ //if error char, start from root for next character
+ //+forget any incomplete words
+ curr_state = &automaton_root;
+ continue;
+ }
+ while (curr_state->next[code] == NULL) curr_state = curr_state->fail;
+ curr_state = curr_state->next[code];
+
+ //start position of primer is options->primer_length-1 chars back
+ pos = j-options->primer_length+1;
+ if (pos < 0) pos = database[i]->SQ_length+pos;
+
+ //set output, if there is some output on this state then
+ //+all words in the output set complete here, so increment their
+ //+found properties for current sequence
+ for (k=0; koutput.count; k++)
+ {
+ if (curr_state->output.out_set[k].isdirect)
+ data[curr_state->output.out_set[k].wordidx].directCount[i]++;
+ else
+ data[curr_state->output.out_set[k].wordidx].reverseCount[i]++;
+
+ if (options->no_multi_match)
+ {
+ if ((data[curr_state->output.out_set[k].wordidx].directCount[i] +
+ data[curr_state->output.out_set[k].wordidx].reverseCount[i]) > 1)
+ //since multimach not allowd, set an indication on 1st seq position that
+ //+ a multimatch was found, so that this word will be filtered out
+ //+ and because of first postion we wont have to search the whole array
+ //+ to find if it voilated nomultimatch constraint for some seq
+ data[curr_state->output.out_set[k].wordidx].directCount[0] = 2;
+ else
+ {
+ if (curr_state->output.out_set[k].isdirect)
+ //direct word found on jth position of ith sequence
+ data[curr_state->output.out_set[k].wordidx].directPos[i].value = (uint32_t)pos;
+ else
+ //reverse word found on jth position of ith sequence
+ data[curr_state->output.out_set[k].wordidx].reversePos[i].value = (uint32_t)pos;
+ }
+ }
+ else
+ {
+ //okay multi match allowed
+ if (curr_state->output.out_set[k].isdirect)
+ {
+ if (data[curr_state->output.out_set[k].wordidx].directCount[i] == 1)
+ data[curr_state->output.out_set[k].wordidx].directPos[i].value = (uint32_t)pos;
+ else
+ {
+ //need to create or extend the positions list
+ if (data[curr_state->output.out_set[k].wordidx].directCount[i] == 2)
+ {
+ //for second element, first was put in .value, so dont forget to copy that in the array too
+ data[curr_state->output.out_set[k].wordidx].directPos[i].pointer = ECOMALLOC(2 * sizeof(uint32_t),
+ "Cannot allocate memory for primer position");
+ data[curr_state->output.out_set[k].wordidx].directPos[i].pointer[0] = data[curr_state->output.out_set[k].wordidx].directPos[i].value;
+ data[curr_state->output.out_set[k].wordidx].directPos[i].pointer[1] = (uint32_t)pos;
+ }
+ else
+ {
+ //for third or greater element
+ data[curr_state->output.out_set[k].wordidx].directPos[i].pointer = ECOREALLOC(data[curr_state->output.out_set[k].wordidx].directPos[i].pointer,
+ data[curr_state->output.out_set[k].wordidx].directCount[i] * sizeof(uint32_t),
+ "Cannot allocate memory for primer position");
+ data[curr_state->output.out_set[k].wordidx].directPos[i].pointer[data[curr_state->output.out_set[k].wordidx].directCount[i]-1] = (uint32_t)pos;
+ }
+ }
+ }
+ else
+ {
+ if (data[curr_state->output.out_set[k].wordidx].reverseCount[i] == 1)
+ data[curr_state->output.out_set[k].wordidx].reversePos[i].value = (uint32_t)pos;
+ else
+ {
+ //need to create or extend the positions list
+ if (data[curr_state->output.out_set[k].wordidx].reverseCount[i] == 2)
+ {
+ //for second element, first was put in .value, so dont forget to copy that in the array too
+ data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer = ECOMALLOC(2 * sizeof(uint32_t),
+ "Cannot allocate memory for primer position");
+ data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer[0] = data[curr_state->output.out_set[k].wordidx].reversePos[i].value;
+ data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer[1] = (uint32_t)pos;
+ }
+ else
+ {
+ //for third or greater element
+ data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer = ECOREALLOC(data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer,
+ data[curr_state->output.out_set[k].wordidx].reverseCount[i] * sizeof(uint32_t),
+ "Cannot allocate memory for primer position");
+ data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer[data[curr_state->output.out_set[k].wordidx].reverseCount[i]-1] = (uint32_t)pos;
+ }
+ }
+ }
+ }
+ //dont forget to increment inexample or outexample count, but only once for a sequence
+ if ((data[curr_state->output.out_set[k].wordidx].directCount[i] +
+ data[curr_state->output.out_set[k].wordidx].reverseCount[i]) == 1)
+ {
+ if (database[i]->isexample)
+ data[curr_state->output.out_set[k].wordidx].inexample++;
+ else
+ data[curr_state->output.out_set[k].wordidx].outexample++;
+ }
+ }
+ }
+ }
+
+ //Only thing that remains is to remove the failed words
+ for (i=0,j=0; isize; i++)
+ {
+ fprintf(stderr,"Primers %5d/%lld analyzed => sequence : %s in %d example and %d counterexample sequences \r",
+ i+1,words->size,ecoUnhashWord(data[i].word,options->primer_length),
+ data[i].inexample,data[i].outexample);
+
+ //if (data[i].inexample < inSequenceQuorum || (data[i].directCount[0] == 2 && options->no_multi_match))
+ if (data[i].directCount[0] == 2 && options->no_multi_match)
+ {
+ //bad word, delete from the array
+ for (k=0; k 1)
+ ECOFREE (data[i].directPos[k].pointer, "Cannot free position pointer.");
+ if (data[i].reverseCount[k] > 1)
+ ECOFREE (data[i].reversePos[k].pointer, "Cannot free position pointer.");
+ }
+ ECOFREE (data[i].directCount, "Cannot free position pointer.");
+ ECOFREE (data[i].directPos, "Cannot free position pointer.");
+ ECOFREE (data[i].reverseCount, "Cannot free position pointer.");
+ ECOFREE (data[i].reversePos, "Cannot free position pointer.");
+ }
+ else
+ {
+ //data[i].good = data[i].inexample >= inSequenceQuorum && data[i].outexample <= outSequenceQuorum;
+ data[i].good = data[i].outexample <= outSequenceQuorum;
+ goodPrimers+=data[i].good? 1:0;
+ if (j < i)
+ data[j] = data[i];
+ j++;
+ }
+ }
+ fprintf(stderr,"\n\nOn %lld analyzed primers %d respect quorum conditions\n",words->size,goodPrimers);
+ fprintf(stderr,"Conserved primers for further analysis : %d/%lld\n",j,words->size);
+
+ primers = ECOMALLOC(sizeof(primercount_t),"Cannot allocate memory for primer table");
+ primers->primers=ECOREALLOC(data,
+ j * sizeof(primer_t),
+ "Cannot reallocate memory for fuzzy matching results");
+ primers->size=j;
+
+ //free memory of keyword table
+ for (i=0; i<4; i++)
+ if (automaton_root.next[i] != &automaton_root)
+ ahoc_freeKeywordTree (automaton_root.next[i]);
+
+ return primers;
+}
+
+void ahoc_graphPrintNodesInfo (aho_state *node, FILE* gfile)
+{
+ uint32_t i;
+ fprintf (gfile, "\"%d\"[\n", node->id);
+ fprintf (gfile, "label=\"%d\\n", node->id);
+ for (i=0; ioutput.count; i++)
+ fprintf (gfile, "%d%c,", node->output.out_set[i].wordidx, node->output.out_set[i].isdirect?'d':'r');
+ fprintf (gfile, "\"\n];\n");
+
+ for (i=0; i<4; i++)
+ if (node->next[i] != NULL && node->next[i] != node)
+ ahoc_graphPrintNodesInfo (node->next[i], gfile);
+}
+
+void ahoc_graphPrintNodesLinks (aho_state *node, FILE* gfile)
+{
+ uint32_t i;
+ static int j=0;
+
+ for (i=0; i<4; i++)
+ if (node->next[i] != NULL && node->next[i] != node)
+ {
+ fprintf (gfile, "\"%d\" -> \"%d\" [\n", node->id, node->next[i]->id);
+ fprintf (gfile, "label=\"%c\"\n];\n", "ACGT"[i]);
+ }
+
+ if (j++ < 40)
+ if (node->fail != NULL && node->fail != groot)
+ {
+ fprintf (gfile, "\"%d\" -> \"%d\" [\n", node->id, node->fail->id);
+ fprintf (gfile, "color= \"red\"\n];\n");
+ }
+
+ for (i=0; i<4; i++)
+ if (node->next[i] != NULL && node->next[i] != node)
+ ahoc_graphPrintNodesLinks (node->next[i], gfile);
+}
+
+void ahoc_graphKeywordTree (aho_state *root)
+{
+ FILE *gfile;
+
+ groot=root;
+ gfile = fopen ("keywordtree.gv", "w");
+ fprintf (gfile, "digraph keywordtree {\n");
+ ahoc_graphPrintNodesInfo (root, gfile);
+ ahoc_graphPrintNodesLinks (root, gfile);
+ fprintf (gfile, "}\n");
+ fclose(gfile);
+}
+
diff --git a/src/libecoprimer/ahocorasick.h b/src/libecoprimer/ahocorasick.h
new file mode 100755
index 0000000..097af4f
--- /dev/null
+++ b/src/libecoprimer/ahocorasick.h
@@ -0,0 +1,43 @@
+/*
+ * ahocorasick.h
+ *
+ * Created on: 26 march 2011
+ * Author: tiayyba
+ */
+
+#ifndef H_ahocorasick
+#define H_ahocorasick
+
+#include "ecoprimer.h"
+
+typedef struct aho_output_t{
+ uint32_t wordidx; //index of strict word (dont save the word of 64B)
+ bool_t isdirect; //we need to find both direct and reverse words so we must know which one is it
+}aho_output;
+
+typedef struct aho_output_count_t{
+ uint32_t count;
+ aho_output *out_set;
+}aho_output_count;
+
+typedef struct aho_state_t{
+ int32_t id;
+ struct aho_state_t *next[4]; //for labels A=0,C=1,G=2 and T=3
+ struct aho_state_t *fail;
+ aho_output_count output;
+}aho_state;
+
+typedef struct queue_node_t {
+ aho_state *state_node;
+ struct queue_node_t *next;
+}queue_node;
+
+typedef struct{
+ queue_node *first;
+ queue_node *last;
+}aho_queue;
+
+pprimercount_t ahoc_lookforStrictPrimers (pecodnadb_t database, uint32_t seqdbsize,uint32_t exampleCount,
+ pwordcount_t words,poptions_t options);
+#endif /* H_ahocorasick */
+
diff --git a/src/libecoprimer/ecoprimer.h b/src/libecoprimer/ecoprimer.h
index b2f61fd..37fefd1 100644
--- a/src/libecoprimer/ecoprimer.h
+++ b/src/libecoprimer/ecoprimer.h
@@ -176,6 +176,7 @@ typedef struct {
int *wellIdentifiedSeqs; //< an array having elements equla to total seqs
// values are either 0 or 1, if seq is well identified
// its 1 else 0
+ int *coveredSeqs; //< an array having elements equal to total seqs, 1 if seq is covered else 0
// these statistics are relative to inexample sequences
@@ -291,6 +292,9 @@ typedef struct {
PNNParams pnparm;
bool_t print_sets_of_primers;
float specificity_threshold;
+ int links_cnt;
+ float max_links_percent;
+ bool_t filter_on_links;
} options_t, *poptions_t;
typedef ecoseq_t **pecodnadb_t;
@@ -350,7 +354,7 @@ int32_t getrankdbstats(pecodnadb_t seqdb,
uint32_t seqdbsize,
ecotaxonomy_t *taxonomy,
poptions_t options);
-float taxonomycoverage(ppair_t pair, poptions_t options);
+float taxonomycoverage(ppair_t pair, poptions_t options, pecodnadb_t seqdb,uint32_t seqdbsize);
char ecoComplementChar(char base);
void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize);
diff --git a/src/libecoprimer/filtering.c b/src/libecoprimer/filtering.c
index a585f9e..24e1387 100644
--- a/src/libecoprimer/filtering.c
+++ b/src/libecoprimer/filtering.c
@@ -114,6 +114,8 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest,
error<<= 1;
error&=ERRORMASK(FWORDSIZE);
+ //code = -1;
+ //if((*base) >= 'A' && (*base) <= 'Z')
code = encoder[(*base) - 'A'];
if (code <0)
{
@@ -154,7 +156,7 @@ int32_t *filteringSeq(pecodnadb_t database, uint32_t seqdbsize,
for (i=0;iisexample)
+ if (database[i]->isexample && database[i]->SQ_length > options->primer_length)
{
j++;
wordscount=ecoFilteringHashSequence(wordscount,
diff --git a/src/libecoprimer/pairs.c b/src/libecoprimer/pairs.c
index ecb302b..88c1c4b 100644
--- a/src/libecoprimer/pairs.c
+++ b/src/libecoprimer/pairs.c
@@ -179,7 +179,7 @@ static void buildPrimerPairsForOneSeq(uint32_t seqid,
uint32_t i,j,k;
uint32_t matchcount=0;
pprimermatch_t matches = NULL;
- primermatchcount_t seqmatchcount;
+ //primermatchcount_t seqmatchcount;
ppair_t pcurrent;
pair_t current;
pprimer_t wswp;
@@ -189,9 +189,9 @@ static void buildPrimerPairsForOneSeq(uint32_t seqid,
//char prmr[50];
//float mtemp;
word_t w1, w1a, omask = (0x1L << (options->strict_three_prime*2)) -1;
- word_t w2, w2a, wtmp;
+ word_t w2, w2a;//, wtmp;
uint32_t bp1,bp2;
-
+
//prmr[options->primer_length] = '\0';
for (i=0;i < primers->size; i++)
@@ -252,16 +252,17 @@ static void buildPrimerPairsForOneSeq(uint32_t seqid,
{
// For all primers matching the sequence
- //for(j=i+1;
- // (jprimer_length) < options->lmax);
- // j++
- // )
+ /*for(j=i+1;
+ (jprimer_length) < options->lmax);
+ j++
+ )//*/
for (j=i+1; jprimer_length) continue;
distance = matches[j].position - matches[i].position - options->primer_length;
if (distance >= options->lmax) break;
+
// For all not too far primers
@@ -269,9 +270,7 @@ static void buildPrimerPairsForOneSeq(uint32_t seqid,
&& (distance > options->lmin)
)
{
-
// If possible primer pair
-
current.p1 = matches[i].primer;
current.asdirect1=matches[i].strand;
current.p2 = matches[j].primer;
@@ -456,7 +455,6 @@ static void buildPrimerPairsForOneSeq(uint32_t seqid,
}
}
}
-
pairs->count=paircount;
}
diff --git a/src/libecoprimer/strictprimers.c b/src/libecoprimer/strictprimers.c
index e6e7f4e..d555720 100644
--- a/src/libecoprimer/strictprimers.c
+++ b/src/libecoprimer/strictprimers.c
@@ -108,10 +108,11 @@ void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circ
table->inseqcount++;
-
- table->strictcount = ECOREALLOC(table->strictcount,buffersize*sizeof(uint32_t),
+ //fprintf (stderr, "\nOldAddress: %x", table->strictcount);
+ table->strictcount = ECOREALLOC(table->strictcount,(buffersize+5000)*sizeof(uint32_t),
"Cannot allocate memory to extend example word count table");
-
+ //fprintf (stderr, " NewAddress: %x\n", table->strictcount);
+
for (i=table->size; i < buffersize; i++)
table->strictcount[i]=1;
@@ -172,7 +173,7 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
for (i=0;iisexample)
+ if (database[i]->isexample && database[i]->SQ_length > options->primer_length)
{
if (first)
diff --git a/src/libecoprimer/taxstats.c b/src/libecoprimer/taxstats.c
index cdc97a5..0898303 100644
--- a/src/libecoprimer/taxstats.c
+++ b/src/libecoprimer/taxstats.c
@@ -6,10 +6,46 @@
*/
#include
+//void tdestroy (void *root, void (*free_node)(void *nodep));
+
#include "ecoprimer.h"
static int cmptaxon(const void *t1, const void* t2);
+void **tree_root = NULL;
+int delete_passes = 0;
+
+void delete_twalkaction (const void *node, VISIT order, int level)
+{
+ switch (order)
+ {
+ case preorder:
+ delete_passes++;
+ break;
+ case postorder:
+ delete_passes++;
+ break;
+ case endorder:
+ delete_passes++;
+ break;
+ case leaf:
+ if (tree_root)
+ tdelete (node, tree_root,cmptaxon);
+ delete_passes++;
+ break;
+ }
+}
+
+void free_tree_nodes (void *tree)
+{
+ while (1)
+ {
+ delete_passes = 0;
+ twalk (tree, delete_twalkaction);
+ if (delete_passes <= 1) break;
+ }
+}
+
static int cmptaxon(const void *t1, const void* t2)
{
const size_t taxid1=(size_t)t1;
@@ -35,7 +71,12 @@ int32_t counttaxon(int32_t taxid)
if (taxid==-1)
{
if (taxontree)
+ {
+ tree_root = (void **)&taxontree;
+ //free_tree_nodes (taxontree);
ECOFREE(taxontree,"Free taxon tree");
+ tree_root = NULL;
+ }
taxontree=NULL;
taxoncount=0;
return 0;
@@ -97,22 +138,30 @@ int32_t getrankdbstats(pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *tax
}
-float taxonomycoverage(ppair_t pair, poptions_t options)
+float taxonomycoverage(ppair_t pair, poptions_t options, pecodnadb_t seqdb,uint32_t seqdbsize)
{
int32_t seqcount;
int32_t i;
int32_t incount=0;
int32_t outcount=0;
+ uint32_t j;
+ memset (pair->coveredSeqs, 0, seqdbsize*sizeof (int));
seqcount=pair->pcr.ampcount;
counttaxon(-1);
for (i=0; i < seqcount; i++)
if (pair->pcr.amplifias[i].sequence->isexample
&& pair->pcr.amplifias[i].sequence->ranktaxonid > 0 )
+ {
incount = counttaxon(pair->pcr.amplifias[i].sequence->ranktaxonid);
+ for (j=0; jpcr.amplifias[i].sequence == seqdb[j])
+ {pair->coveredSeqs[j] = 1; break;}
+ }
+
counttaxon(-1);
for (i=0; i < seqcount; i++)
if (!pair->pcr.amplifias[i].sequence->isexample
@@ -145,12 +194,14 @@ static int cmpamp(const void *ampf1, const void* ampf2)
{
incr = -1;
j = pampf1->length - 1;
+
if (pampf2->strand)
{
pampf1 = (pamptotaxon_t) ampf2;
pampf2 = (pamptotaxon_t) ampf1;
chd = 1;
}
+ //j = pampf2->length - 1; should have been here and pampf2 instead of pampf1?
}
len = (pampf1->length <= pampf2->length)? pampf1->length: pampf2->length;
@@ -173,6 +224,7 @@ static int cmpamp(const void *ampf1, const void* ampf2)
return 0;
}*/
+
static int cmpamp(const void *ampf1, const void* ampf2)
{
int i;
@@ -183,10 +235,10 @@ static int cmpamp(const void *ampf1, const void* ampf2)
char *ch2;
int incr1;
int incr2;
-
+
pamptotaxon_t pampf1 = (pamptotaxon_t) ampf1;
pamptotaxon_t pampf2 = (pamptotaxon_t) ampf2;
-
+
ch1 = pampf1->amplifia;
ch2 = pampf2->amplifia;
@@ -218,7 +270,7 @@ static int cmpamp(const void *ampf1, const void* ampf2)
if (pampf1->length > pampf2->length) return 1;
if (pampf2->length > pampf1->length) return -1;
-
+
return 0;
}
@@ -242,6 +294,8 @@ void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize)
uint32_t i, j;
uint32_t ampfindex = 0;
int32_t taxid;
+ uint32_t wellidentifiedcount;
+
void *ampftree = NULL;
pamptotaxon_t pcurrentampf;
pamptotaxon_t *ptmp;
@@ -278,11 +332,14 @@ void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize)
}
memset (pair->wellIdentifiedSeqs, 0, seqdbsize*sizeof (int));
- counttaxon(-1);
+ //counttaxon(-1);
for (i = 0; i < ampfindex; i++)
{
if (ampfwithtaxtree[i].taxoncount > 1)
- twalk(ampfwithtaxtree[i].taxontree, twalkaction);
+ {
+ //printf ("\nampfwithtaxtree[i].taxoncount: %d\n", ampfwithtaxtree[i].taxoncount);
+ //twalk(ampfwithtaxtree[i].taxontree, twalkaction);
+ }
//TR 5/9/10 - added code for well identified seqs
else if(ampfwithtaxtree[i].taxoncount == 1) /*well identified*/
{
@@ -293,6 +350,7 @@ void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize)
{
for (j = 0; j < seqdbsize; j++)
if (seqdb[j]->ranktaxonid == gtxid
+ && seqdb[j]->isexample
&&(pair->p1->directCount[j] > 0
|| pair->p1->reverseCount[j] > 0)
&& (pair->p2->directCount[j] > 0
@@ -303,10 +361,18 @@ void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize)
}
}
}
-
- pair->notwellidentifiedtaxa = counttaxon(-2);
- pair->bs = ((float)pair->intaxa - (float)pair->notwellidentifiedtaxa) / pair->intaxa;
-
+ //printf ("\n");
+ counttaxon(-1);
+ wellidentifiedcount = 0;
+ for (j = 0; j < seqdbsize; j++)
+ if (pair->wellIdentifiedSeqs[j] == 1)
+ counttaxon(seqdb[j]->ranktaxonid);
+ wellidentifiedcount = counttaxon(-2);
+ //pair->notwellidentifiedtaxa = counttaxon(-2);
+ pair->notwellidentifiedtaxa = (pair->intaxa-wellidentifiedcount); //counttaxon(-2);
+ //pair->bs = ((float)pair->intaxa - (float)pair->notwellidentifiedtaxa) / pair->intaxa;
+ pair->bs = ((float)wellidentifiedcount) / (float)pair->intaxa;
+
ECOFREE (ampfwithtaxtree, "Free amplifia table");
}