Added code for building sets of primers and -p command line option

git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@279 60f365c0-8329-0410-b2a4-ec073aeeaa1d
This commit is contained in:
2010-09-13 09:44:53 +00:00
parent e483b17e18
commit dd08d73dda
6 changed files with 495 additions and 18 deletions

View File

@ -6,6 +6,7 @@
*/
#include "libecoprimer/ecoprimer.h"
#include "libecoprimer/PrimerSets.h"
#include <stdio.h>
#include <string.h>
#include <ctype.h>
@ -74,6 +75,7 @@ static void PrintHelp()
PP "-A : Print the list of all identifier of sequences present in the database\n\n");
PP "-f : Remove data mining step during strict primer identification\n\n");
PP "-v : Store statistic file about memory usage during strict primer identification\n\n");
PP "-p : Print sets of primers\n\n");
PP "\n");
PP "------------------------------------------\n");
PP "Table result description : \n");
@ -132,9 +134,9 @@ void initoptions(poptions_t options)
options->refseq=NULL;
options->circular=0;
options->doublestrand=1;
options->strict_quorum=0.7;
options->strict_quorum=0.3;
options->strict_exclude_quorum=0.1;
options->sensitivity_quorum=0.9;
options->sensitivity_quorum=0.3;
options->false_positive_quorum=0.1;
options->strict_three_prime=0;
options->r=0;
@ -146,6 +148,7 @@ void initoptions(poptions_t options)
options->saltmethod = SALT_METHOD_SANTALUCIA;
options->salt = DEF_SALT;
options->printAC=FALSE;
options->print_sets_of_primers = FALSE;
}
void printapair(int32_t index,ppair_t pair, poptions_t options)
@ -314,7 +317,7 @@ static int cmpprintedpairs(const void* p1,const void* p2)
return 0;
}
uint32_t filterandsortpairs(ppair_t* sortedpairs,uint32_t count, poptions_t options)
uint32_t filterandsortpairs(ppair_t* sortedpairs,uint32_t count, poptions_t options, pecodnadb_t seqdb)
{
uint32_t i,j;
float q,qfp;
@ -329,6 +332,7 @@ uint32_t filterandsortpairs(ppair_t* sortedpairs,uint32_t count, poptions_t opti
qfp = (float)sortedpairs[i]->outexample/options->outsamples;
else qfp=0.0;
sortedpairs[i]->wellIdentifiedSeqs = NULL; //TR 05/09/10 - wellIdentified needed for primer sets
sortedpairs[i]->quorumin = q;
sortedpairs[i]->quorumout = qfp;
sortedpairs[i]->yule = q - qfp;
@ -337,8 +341,10 @@ uint32_t filterandsortpairs(ppair_t* sortedpairs,uint32_t count, poptions_t opti
if (q > options->sensitivity_quorum &&
qfp < options->false_positive_quorum)
{
//TR 05/09/10 - wellIdentified needed for primer sets
sortedpairs[j]->wellIdentifiedSeqs = ECOMALLOC(options->dbsize * sizeof(int),"Cannot allocate well_identified_array");
(void)taxonomycoverage(sortedpairs[j],options);
taxonomyspecificity(sortedpairs[j]);
taxonomyspecificity(sortedpairs[j], seqdb, options->dbsize);
j++;
}
@ -348,7 +354,7 @@ uint32_t filterandsortpairs(ppair_t* sortedpairs,uint32_t count, poptions_t opti
}
void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy)
void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy, pecodnadb_t seqdb)
{
ppair_t* sortedpairs;
ppair_t* index;
@ -357,7 +363,7 @@ void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy)
size_t count;
char *taxon[]={"taxon","taxa"};
ecotx_t *current_taxon;
pairset pair_sets;
//printf("Index\tPrimer1\tPrimer2\tGB\tInexampleCount\tOutexampleCount\tYule\tIntaxaCount\tOuttaxaCount\tCoverage\tSpecificity\tMinAmplifiedLength\tMaxAmplifiedLength\tAvgAmplifiedLength\n");
@ -377,7 +383,7 @@ void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy)
for (i=0;i<pl->paircount;i++,j++)
sortedpairs[j]=pl->pairs+i;
count=filterandsortpairs(sortedpairs,pairs->count,options);
count=filterandsortpairs(sortedpairs,pairs->count,options, seqdb);
getThermoProperties(sortedpairs, count, options);
fprintf(stderr,"Total good pair count : %u\n",(uint32_t)count);
@ -441,8 +447,12 @@ void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy)
for (i=0;i < count;i++)
printapair(i,sortedpairs[i],options);
if (options->print_sets_of_primers == TRUE)
{
pair_sets = build_primers_set (sortedpairs, count, seqdb, options);
some_other_set_possibilities (&pair_sets, sortedpairs, count, seqdb, options);
}
}
@ -528,7 +538,7 @@ int main(int argc, char **argv)
initoptions(&options);
while ((carg = getopt(argc, argv, "hAfvcUDSE:d:l:L:e:i:r:R:q:3:s:x:t:O:m:a:")) != -1) {
while ((carg = getopt(argc, argv, "hAfvcUDSpE:d:l:L:e:i:r:R:q:3:s:x:t:O:m:a:")) != -1) {
switch (carg) {
/* ---------------------------- */
@ -691,6 +701,12 @@ int main(int argc, char **argv)
options.circular = 1;
break;
/* -------------------- */
case 'p': /* print sets of primers */
/* --------------------------------- */
options.print_sets_of_primers = TRUE;
break;
case '?': /* bad option */
/* -------------------- */
errflag++;
@ -799,9 +815,9 @@ int main(int argc, char **argv)
fprintf(stderr,"\n");
pairs = buildPrimerPairs(seqdb, seqdbsize, primers, &options);
printpairs (pairs, &options,taxonomy);
pairs = buildPrimerPairs(seqdb, seqdbsize, primers, &options);
printpairs (pairs, &options,taxonomy, seqdb);
return 0;
}

View File

@ -14,7 +14,8 @@ SOURCES = goodtaxon.c \
pairs.c \
taxstats.c \
apat_search.c \
filtering.c
filtering.c \
PrimerSets.c
SRCS=$(SOURCES)

View File

@ -0,0 +1,388 @@
#include <stdint.h>
//#include "ecoprimer.h"
#include "PrimerSets.h"
int32_t counttaxon(int32_t taxid);
float get_set_coverage (pairset *p_set, SetParams *pparams, int32_t pidx_toexclude)
{
int32_t i, j;
float cov;
pprimer_t prmr1;
pprimer_t prmr2;
int32_t s_intaxa = 0;
counttaxon(-1);
for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
{
if (p_set->set_pairs[i] == -1 || pidx_toexclude == i) continue;
prmr1 = pparams->sortedpairs[p_set->set_pairs[i]]->p1;
prmr2 = pparams->sortedpairs[p_set->set_pairs[i]]->p2;
for (j = 0; j < pparams->options->dbsize; j++)
if ((prmr1->directCount[j] > 0 || prmr1->reverseCount[j] > 0)
&& (prmr2->directCount[j] > 0 || prmr2->reverseCount[j] > 0)
&& pparams->seqdb[j]->isexample && pparams->seqdb[j]->ranktaxonid > 0)
{
s_intaxa = counttaxon(pparams->seqdb[j]->ranktaxonid);
}
}
//fprintf(stderr, "%d/%d\n", s_intaxa, pparams->options->intaxa);
cov = s_intaxa*1.0/(pparams->options->intaxa*1.0);
return cov;
}
void set_cov_spc (pairset *p_set, SetParams *pparams)
{
int32_t i;
int32_t ssp = 0;
for (i = 0; i < pparams->options->dbsize; i++)
if (p_set->set_wellIdentifiedTaxa[i] == 1) ssp++;
//set specificity
p_set->set_specificity = ssp*1.0/(pparams->options->insamples*1.0);
//set coverage
p_set->set_coverage = get_set_coverage (p_set, pparams, -1);
}
int ok_to_add (int id, pairset *pair_set)
{
int i;
for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
if (pair_set->set_pairs[i] == id) return 0;
return 1;
}
//1. Add in set the first pair having highiest specificity
//2. For the sequences not WI by primers in set, calculate count for each pair equal to number of seqs WI by it
//3. Take the pair with highiest such count and see its links with primers already in set,
//4. If no/insufficient links, take next pair
//5. repeate 3,4 untill pair gets added in set
//6. Repeate 2 to 5 until set complete
pairset build_primers_set (ppair_t* sortedpairs, int32_t sorted_count, pecodnadb_t seqdb,
poptions_t options)
{
SetParams params;
pairset pair_set;
int32_t i;
int32_t pset_idx;
int32_t prb_idx;
int *pair_wi_count_sorted_ids;
params.sortedpairs = sortedpairs;
params.sorted_count = sorted_count;
params.seqdb = seqdb;
params.options = options;
memset (&pair_set, 0, sizeof(pairset));
pair_set.set_wellIdentifiedTaxa = (int *) ECOMALLOC(options->dbsize*sizeof (int),
"Could not allocate memory for pair set");
pair_wi_count_sorted_ids = (int *) ECOMALLOC(sorted_count*sizeof (int),
"Could not allocate memory for pair_wi_count_sorted");
for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
pair_set.set_pairs[i] = -1;
pset_idx = 0;
prb_idx = 0;
//add first pair by default, this should be the one having highiest specificty
add_pair_in_set (&pair_set, pset_idx, prb_idx, &params);
pset_idx++;
while (pset_idx < PRIMERS_IN_SET_COUNT)
{
//get a sorted list of pair ids with the pair well identifying most of the remaining seqs at top
get_next_pair_options (pair_wi_count_sorted_ids, &pair_set, &params);
if (pair_wi_count_sorted_ids[0] == 0)
{
fprintf (stderr, "No further pair found, total so far %d\n", pset_idx);
break;
}
else
fprintf (stderr, "Some counts found\n");
for (prb_idx = 0; prb_idx < sorted_count; prb_idx++)
{
if (pair_wi_count_sorted_ids[prb_idx])
if (ok_to_add (pair_wi_count_sorted_ids[prb_idx], &pair_set))
{
//fprintf (stderr, "Oktoadd\n");
float ldist = get_links_distribution (pair_wi_count_sorted_ids[prb_idx], &pair_set, &params);
fprintf (stderr, "ldist:%f\n", ldist);
if (ldist >= 0.7)
{
add_pair_in_set (&pair_set, pset_idx, pair_wi_count_sorted_ids[prb_idx], &params);
pset_idx++;
break;
}
else
fprintf (stderr, "ldist:%f\n", ldist);
}
}
if (prb_idx == sorted_count)
{
fprintf (stderr, "No further pair found, total so far %d\n", pset_idx);
break;
}
if (pair_set.set_specificity == 1.0)
{
fprintf (stderr, "Set Complete with total primers: %d\n", pset_idx);
break;
}
}
get_set_mean_cov_stats (&pair_set, &params);
return pair_set;
}
float get_links_distribution (int prb_idx, pairset *pair_set, SetParams *pparams)
{
int i, j;
int *pair_link_count;
int *pwi;
int *pswi;
float pcnt = 0.0;
float pscnt = 0.0;
pair_link_count = (int *) ECOMALLOC(PRIMERS_IN_SET_COUNT*sizeof (int),
"Could not allocate memory for pair_link_count");
pwi = pparams->sortedpairs[prb_idx]->wellIdentifiedSeqs;
//fprintf(stderr, "a,");
for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
{
pair_link_count[i] = 0;
if (pair_set->set_pairs[i] != -1)
{
//fprintf(stderr, "b,");
pswi = pparams->sortedpairs[pair_set->set_pairs[i]]->wellIdentifiedSeqs;
for (j = 0; j < pparams->options->dbsize; j++)
if (pwi[j] == 1 && pwi[j] == pswi[j])
pair_link_count[i] += 1;
}
}
//fprintf(stderr, "c,");
for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
{
if (pair_set->set_pairs[i] != -1)
pscnt++;
if (pair_link_count[i] > 0)
pcnt++;
}
ECOFREE (pair_link_count, "Could not free memory for pair_link_count");
//fprintf(stderr, "d,");
return (pcnt/pscnt);
}
void get_next_pair_options (int *pair_wi_count_sorted_ids, pairset *pair_set, SetParams *pparams)
{
int *pair_count;
int32_t i, j;
int max;
int tmp;
pair_count = (int *) ECOMALLOC(pparams->sorted_count*sizeof (int),
"Could not allocate memory for pair_count");
memset (pair_wi_count_sorted_ids, 0, pparams->sorted_count*sizeof(int));
for (i = 0; i < pparams->options->dbsize; i++)
{
if (pair_set->set_wellIdentifiedTaxa[i] == 1) continue;
for (j = 0; j < pparams->sorted_count; j++)
{
if (pparams->sortedpairs[j]->wellIdentifiedSeqs[i] == 1)
pair_count[j] += 1;
}
}
//set pair ids
for (j = 0; j < pparams->sorted_count; j++)
pair_wi_count_sorted_ids[j] = j;
//set count of primers already in set to zero (it should already be zero) just for testing
for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
if (pair_set->set_pairs[i] != -1)
pair_count[pair_set->set_pairs[i]] = 0;
//sort two arrays in descending wrt count
for (i = 0; i < pparams->sorted_count - 1; i++)
{
max = i;
for (j = i + 1; j < pparams->sorted_count; j++)
if (pair_count[max] < pair_count[j])
max = j;
if (max > i)
{
tmp = pair_count[i];
pair_count[i] = pair_count[max];
pair_count[max] = tmp;
tmp = pair_wi_count_sorted_ids[i];
pair_wi_count_sorted_ids[i] = pair_wi_count_sorted_ids[max];
pair_wi_count_sorted_ids[max] = tmp;
}
}
for (i = 0; i < pparams->sorted_count - 1; i++)
if (pair_count[i] == 0)
pair_wi_count_sorted_ids[i] = 0;
//else
// fprintf (stderr, "%d:%d, ", i, pair_count[i]);
ECOFREE (pair_count, "Could not free memory for pair_count");
}
void add_pair_in_set (pairset *pair_set, int32_t pset_idx, int32_t prb_idx, SetParams *pparams)
{
int *pwi;
int32_t i;
pair_set->set_pairs[pset_idx] = prb_idx;
pwi = pparams->sortedpairs[prb_idx]->wellIdentifiedSeqs;
for (i = 0; i < pparams->options->dbsize; i++)
if (pwi[i] == 1)
pair_set->set_wellIdentifiedTaxa[i] = 1;
set_cov_spc (pair_set, pparams);
}
void get_set_mean_cov_stats (pairset *pair_set, SetParams *pparams)
{
int32_t i, j, k;
int interseq_vals[PRIMERS_IN_SET_COUNT*PRIMERS_IN_SET_COUNT];
int interseq_cnt = 0;
double msum;
int *p1wi;
int *p2wi;
for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
{
if (pair_set->set_pairs[i] == -1) continue;
p1wi = pparams->sortedpairs[pair_set->set_pairs[i]]->wellIdentifiedSeqs;
for (j = i+1; j < PRIMERS_IN_SET_COUNT; j++)
{
if (pair_set->set_pairs[j] == -1) continue;
p2wi = pparams->sortedpairs[pair_set->set_pairs[j]]->wellIdentifiedSeqs;
interseq_vals[interseq_cnt] = 0;
for (k = 0; k < pparams->options->dbsize; k++)
if (p1wi[k] == 1 && p2wi[k] == 1)
interseq_vals[interseq_cnt]++;
interseq_cnt++;
}
}
//calculate mean
msum = 0;
pair_set->set_score = 0;
pair_set->set_lmean = 0;
pair_set->set_lcov = -1;
if (interseq_cnt == 0) return;
for (i = 0; i < interseq_cnt; i++)
msum += interseq_vals[i];
pair_set->set_lmean = msum/interseq_cnt;
msum = 0;
for (i = 0; i < interseq_cnt; i++)
msum += (interseq_vals[i] - pair_set->set_lmean)*(interseq_vals[i] - pair_set->set_lmean);
pair_set->set_lcov = msum/interseq_cnt;
if (pair_set->set_lcov != 0)
pair_set->set_score = (pair_set->set_coverage*pair_set->set_specificity*pair_set->set_lmean)/pair_set->set_lcov;
}
void reset_set_props (pairset *pair_set, SetParams *pparams)
{
int *pwi;
int i, j;
for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
{
if (pair_set->set_pairs[i] == -1) continue;
pwi = pparams->sortedpairs[pair_set->set_pairs[i]]->wellIdentifiedSeqs;
for (j = 0; j < pparams->options->dbsize; j++)
if (pwi[j] == 1)
pair_set->set_wellIdentifiedTaxa[j] = 1;
}
set_cov_spc (pair_set, pparams);
get_set_mean_cov_stats (pair_set, pparams);
}
void print_set_info (pairset *pair_set, SetParams *pparams)
{
int i;
int printed1st = 0;
printf ("%f\t%f\t%f\t%f\t%f\t", pair_set->set_specificity,
pair_set->set_coverage,pair_set->set_lmean,
pair_set->set_lcov,pair_set->set_score);
for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
{
if (pair_set->set_pairs[i] == -1) continue;
if (printed1st)
printf (":%d", pair_set->set_pairs[i]);
else
printf ("%d", pair_set->set_pairs[i]);
printed1st = 1;
}
printf ("\n");
}
void some_other_set_possibilities (pairset *pair_set,
ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options)
{
SetParams params;
pairset tmp_pair_set;
int i, j;
int *wi;
params.sortedpairs = sortedpairs;
params.sorted_count = sorted_count;
params.seqdb = seqdb;
params.options = options;
wi = (int *) ECOMALLOC(options->dbsize*sizeof (int),
"Could not allocate memory for pair set wi");
//print stats for first original set
printf ("\nspecificity\tcoverage\tmean\tcovariance\tscore\tprimers\n");
print_set_info (pair_set, &params);
for (i = 0; i < PRIMERS_IN_SET_COUNT; i++)
{
if (pair_set->set_pairs[i] == -1) continue;
for (j = 0; j < sorted_count; j++)
{
if (ok_to_add (j, pair_set))
{
tmp_pair_set = *pair_set;
tmp_pair_set.set_pairs[i] = j;
memset (wi, 0, options->dbsize*sizeof (int));
tmp_pair_set.set_wellIdentifiedTaxa = wi;
reset_set_props (&tmp_pair_set, &params);
print_set_info (&tmp_pair_set, &params);
}
}
}
ECOFREE (wi, "Could not free memory for pair set wi");
}

View File

@ -0,0 +1,41 @@
#ifndef PRIMERSETS_H_
#define PRIMERSETS_H_
#include "ecoprimer.h"
#define PRIMERS_IN_SET_COUNT 10
typedef struct {
int *set_wellIdentifiedTaxa;
int32_t set_pairs[PRIMERS_IN_SET_COUNT];
float set_specificity;
float set_coverage;
float set_lmean;
float set_lcov;
float set_score;
}pairset;
typedef struct{
ppair_t* sortedpairs;
int32_t sorted_count;
pecodnadb_t seqdb;
poptions_t options;
}SetParams;
typedef struct{
float t_spc; //specificity contribution
float t_cov; //coverage contribution
float t_lmd; //link spread difference
float len; //length
float score; //score
}primerscore;
void add_pair_in_set (pairset *pair_set, int32_t pset_idx, int32_t prb_idx, SetParams *pparams);
void get_next_pair_options (int *pair_wi_count_sorted_ids, pairset *pair_set, SetParams *pparams);
float get_links_distribution (int prb_idx, pairset *prob_set, SetParams *pparams);
pairset build_primers_set (ppair_t* sortedpairs, int32_t sorted_count, pecodnadb_t seqdb,
poptions_t options);
void get_set_mean_cov_stats (pairset *prob_set, SetParams *pparams);
void some_other_set_possibilities (pairset *pair_set,
ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options);
#endif

View File

@ -173,6 +173,9 @@ typedef struct {
uint32_t outtaxa; //< counterexample taxa count
uint32_t notwellidentifiedtaxa;
int *wellIdentifiedSeqs; //< an array having elements equla to total seqs
// values are either 0 or 1, if seq is well identified
// its 1 else 0
// these statistics are relative to inexample sequences
@ -286,6 +289,7 @@ typedef struct {
int saltmethod;
float salt;
PNNParams pnparm;
bool_t print_sets_of_primers;
} options_t, *poptions_t;
typedef ecoseq_t **pecodnadb_t;
@ -347,7 +351,7 @@ int32_t getrankdbstats(pecodnadb_t seqdb,
poptions_t options);
float taxonomycoverage(ppair_t pair, poptions_t options);
char ecoComplementChar(char base);
void taxonomyspecificity (ppair_t pair);
void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize);
int32_t *filteringSeq(pecodnadb_t database, uint32_t seqdbsize,
uint32_t exampleCount,poptions_t options,uint32_t *size,int32_t sequenceQuorum);

View File

@ -179,9 +179,16 @@ void twalkaction (const void *node, VISIT order, int level)
counttaxon(taxid);
}
void taxonomyspecificity (ppair_t pair)
int32_t gtxid;
void twalkaction2 (const void *node, VISIT order, int level)
{
uint32_t i;
int32_t *pt = (int32_t *) node;
gtxid = *pt;
}
void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize)
{
uint32_t i, j;
uint32_t ampfindex = 0;
int32_t taxid;
void *ampftree = NULL;
@ -219,11 +226,31 @@ void taxonomyspecificity (ppair_t pair)
}
}
memset (pair->wellIdentifiedSeqs, 0, seqdbsize*sizeof (int));
counttaxon(-1);
for (i = 0; i < ampfindex; i++)
{
if (ampfwithtaxtree[i].taxoncount > 1)
twalk(ampfwithtaxtree[i].taxontree, twalkaction);
//TR 5/9/10 - added code for well identified seqs
else if(ampfwithtaxtree[i].taxoncount == 1) /*well identified*/
{
gtxid = -1;
twalk(ampfwithtaxtree[i].taxontree, twalkaction2);
if (gtxid != -1)
{
for (j = 0; j < seqdbsize; j++)
if (seqdb[j]->ranktaxonid == gtxid
&&(pair->p1->directCount[j] > 0
|| pair->p1->reverseCount[j] > 0)
&& (pair->p2->directCount[j] > 0
|| pair->p2->reverseCount[j] > 0))
{
pair->wellIdentifiedSeqs[j] = 1;
}
}
}
}
pair->notwellidentifiedtaxa = counttaxon(-2);