From 313dd59f5a8cf0105a769448b32dc55a5ef62e7e Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Wed, 22 Apr 2009 08:15:18 +0000 Subject: [PATCH] First run of patch git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@203 60f365c0-8329-0410-b2a4-ec073aeeaa1d --- src/ecoprimer.c | 160 ++++++++++++----------------------- src/libecoprimer/ecoprimer.h | 6 +- src/libecoprimer/taxstats.c | 77 +++++++++-------- 3 files changed, 102 insertions(+), 141 deletions(-) diff --git a/src/ecoprimer.c b/src/ecoprimer.c index 4d5ab34..613f486 100644 --- a/src/ecoprimer.c +++ b/src/ecoprimer.c @@ -16,7 +16,10 @@ #define VERSION "0.1" /* TR: by default, statistics are made on species level*/ -#define DEFULTTAXONRANK "species" +#define DEFAULTTAXONRANK "species" + +static int cmpprintedpairs(const void* p1,const void* p2); + /* ----------------------------------------------- */ /* printout help */ @@ -40,7 +43,7 @@ static void PrintHelp() PP " .tdx : contains information concerning the taxonomy\n"); PP " .rdx : contains the taxonomy rank\n\n"); PP " ecoPrimer needs all the file type. As a result, you have to write the\n"); - PP " database radical without any extension. For example /ecoPrimerDB/fstvert\n\n"); + PP " database radical without any extension. For example /ecoPrimerDB/fstvert\n\n"); PP "-e : [E]rror : max error allowed by oligonucleotide (0 by default)\n\n"); PP "-h : [H]elp - print help\n\n"); PP "-i : [I]gnore the given taxonomy id.\n\n"); @@ -69,13 +72,14 @@ static void PrintHelp() PP "column 8 : in taxa count\n"); PP "column 9 : out taxa count\n"); PP "column 10 : coverage\n"); - PP "column 11 : specificity\n"); - PP "column 12 : minimum amplified length\n"); - PP "column 13 : maximum amplified length\n"); - PP "column 14 : average amplified length\n"); + PP "column 11 : unambiguously identified taxa\n"); + PP "column 12 : specificity\n"); + PP "column 13 : minimum amplified length\n"); + PP "column 14 : maximum amplified length\n"); + PP "column 15 : average amplified length\n"); PP "------------------------------------------\n"); PP " http://www.grenoble.prabi.fr/trac/ecoPrimer/\n"); - PP "------------------------------------------\n\n"); + PP "------------------------------------------\n\n"); PP "\n"); } @@ -110,7 +114,7 @@ void initoptions(poptions_t options) options->r=0; options->g=0; options->no_multi_match=FALSE; - strcpy(options->taxonrank, DEFULTTAXONRANK); /*taxon level for results, species by default*/ + strcpy(options->taxonrank, DEFAULTTAXONRANK); /*taxon level for results, species by default*/ } void printcurrenttime () @@ -143,8 +147,7 @@ void printcurrenttimeinmilli() void printapair(int32_t index,ppair_t pair, poptions_t options) { - uint32_t wellidentifiedtaxa; - + printf("%6d\t",index); if (pair->asdirect1) printf("%s\t",ecoUnhashWord(pair->p1->word,options->primer_length)); @@ -167,17 +170,11 @@ void printapair(int32_t index,ppair_t pair, poptions_t options) printf("\t%d", pair->intaxa); printf("\t%d", pair->outtaxa); - printf("\t%4.3f", (float)pair->intaxa/options->intaxa); - - wellidentifiedtaxa = (pair->intaxa + pair->outtaxa) - pair->notwellidentifiedtaxa; - - //printf("\t%d", pair->notwellidentifiedtaxa); - - //printf("\t%d", (pair->intaxa + pair->outtaxa)); + printf("\t%4.3f", (float)pair->bc); + printf("\t%d", pair->intaxa - pair->notwellidentifiedtaxa); - - printf("\t%4.3f", (float)wellidentifiedtaxa/(options->intaxa + options->outtaxa)); + printf("\t%4.3f", pair->bs); printf("\t%d", pair->mind); @@ -186,6 +183,25 @@ void printapair(int32_t index,ppair_t pair, poptions_t options) } +static int cmpprintedpairs(const void* p1,const void* p2) +{ + float s1,s2; + ppair_t pair1,pair2; + + pair1=*((ppair_t*)p1); + pair2=*((ppair_t*)p2); + + s1 = pair1->yule * pair1->bs; + s2 = pair2->yule * pair2->bs; + +// fprintf(stderr,"s1 : %4.3f %4.3f %4.3f\n",pair1->yule , pair1->bs,s1); +// fprintf(stderr,"s2 : %4.3f %4.3f %4.3f\n\n",pair2->yule , pair2->bs,s2); + + if (s1 > s2) return -1; + if (s1 < s2) return 1; + return 0; +} + uint32_t filterandsortpairs(ppair_t* sortedpairs,uint32_t count, poptions_t options) { uint32_t i,j; @@ -201,9 +217,10 @@ uint32_t filterandsortpairs(ppair_t* sortedpairs,uint32_t count, poptions_t opti qfp = (float)sortedpairs[i]->outexample/options->outsamples; else qfp=0.0; + sortedpairs[i]->quorumin = q; sortedpairs[i]->quorumout = qfp; - sortedpairs[i]->yule = q -qfp; + sortedpairs[i]->yule = q - qfp; sortedpairs[j]=sortedpairs[i]; @@ -215,23 +232,30 @@ uint32_t filterandsortpairs(ppair_t* sortedpairs,uint32_t count, poptions_t opti { (void)taxonomycoverage(sortedpairs[j],options); taxonomyspecificity(sortedpairs[j]); +// fprintf(stderr,"%4.3f %4.3f %4.3f\n",sortedpairs[j]->yule , sortedpairs[j]->bs,sortedpairs[j]->bc); + j++; } + } + + qsort(sortedpairs,j,sizeof(ppair_t),cmpprintedpairs); + return j; } + void printpairs (ppairtree_t pairs, poptions_t options) { ppair_t* sortedpairs; ppair_t* index; ppairlist_t pl; size_t i,j; - int32_t count; + size_t count; + + //printf("Index\tPrimer1\tPrimer2\tGB\tInexampleCount\tOutexampleCount\tYule\tIntaxaCount\tOuttaxaCount\tCoverage\tSpecificity\tMinAmplifiedLength\tMaxAmplifiedLength\tAvgAmplifiedLength\n"); - //printf("Index\tPrimer1\tPrimer2\tGB\tInexampleCount\tOutexampleCount\tYule\tIntaxaCount\tOuttaxaCount\tCoverage\tSpecificity\tMinAmplifiedLength\tMaxAmplifiedLength\tAvgAmplifiedLength\n"); - fprintf(stderr,"Total pair count : %d\n",pairs->count); sortedpairs = ECOMALLOC(pairs->count*sizeof(ppair_t),"Cannot Allocate ordered pairs"); @@ -256,40 +280,6 @@ void printpairs (ppairtree_t pairs, poptions_t options) } -#ifdef MASKEDCODE -void printpairs (pairscount_t pairs, poptions_t options, int32_t rankdbstats, uint32_t seqdbsize) - -{ - uint32_t i; - uint32_t wordsize = options->primer_length; - uint32_t quorumseqs; - double sens; - double speci; - float avg; - - quorumseqs = seqdbsize * 70 / 100; - - printf("primer_1\tseq_1\tPrimer_2\tseq_2\tamplifia_count\t%s_snes\t%s_spe\tmin_l\tmax_l\tavr_l\n", options->taxonrank, options->taxonrank); - - for (i=0; i < pairs.paircount; i++) - { - if (quorumseqs > pairs.pairs[i].inexample) continue; - sens = (pairs.pairs[i].taxsetindex*1.0)/rankdbstats*100; - speci = (pairs.pairs[i].oktaxoncount*1.0)/pairs.pairs[i].taxsetindex*100; - avg = (pairs.pairs[i].mind+pairs.pairs[i].maxd)*1.0/2; - - printf("P1\t%s", ecoUnhashWord(pairs.pairs[i].w1, wordsize)); - printf("\tP2\t%s", ecoUnhashWord(pairs.pairs[i].w2, wordsize)); - printf("\t%d", pairs.pairs[i].inexample); - printf("\t%3.2f", sens); - printf("\t%3.2f", speci); - printf("\t%d", pairs.pairs[i].mind); - printf("\t%d", pairs.pairs[i].maxd); - printf("\t%3.2f\n", avg); - } -} - -#endif /* MASKEDCODE */ /*updateseqparams: This function counts the insample and outsample sequences * and with each sequences adds a tag of the taxon to which the sequence beongs*/ @@ -340,52 +330,6 @@ void setresulttaxonrank (ecotaxonomy_t *taxonomy, poptions_t options) } /* to get db stats, totals of species, genus etc....*/ -#ifdef MASKEDCODE - -void setoktaxforspecificity (ppairtree_t pairs) -{ - uint32_t i; - uint32_t j; - uint32_t k; - uint32_t l; - int taxcount; - int32_t taxid; - - for (i = 0; i < pairs->paircount; i++) - { - for (j = 0; j < pairs->pairs[i].taxsetindex; j++) - { - for (k = 0; k < pairs->pairs[i].taxset[j].amplifiaindex; k++) - { - taxid = 0; - taxcount = 0; - for (l = 0; l < pairs->pairs[i].ampsetindex; l++) - { - /*compare only char pointers because for equal strings we have same pointer in both sets*/ - if (pairs->pairs[i].taxset[j].amplifia[k] == pairs->pairs[i].ampset[l].amplifia) - { - if (pairs->pairs[i].ampset[l].seqidindex > 1) - { - taxcount += pairs->pairs[i].ampset[l].seqidindex; - break; - } - - if (taxid != pairs->pairs[i].ampset[l].taxonids[0]) - { - if (!taxid) taxid = pairs->pairs[i].ampset[l].taxonids[0]; - taxcount++; - } - - if (taxcount > 1) break; - } - } - if (taxcount == 1) pairs->pairs[i].oktaxoncount++; - } - } - } -} - -#endif int main(int argc, char **argv) { @@ -412,7 +356,7 @@ int main(int argc, char **argv) initoptions(&options); - while ((carg = getopt(argc, argv, "hcUDSd:l:L:e:i:r:q:3:s:x:t:")) != -1) { + while ((carg = getopt(argc, argv, "hcUDSd:l:L:e:i:r:q:3:s:x:t:O:")) != -1) { switch (carg) { /* -------------------- */ @@ -516,6 +460,12 @@ int main(int argc, char **argv) options.g++; break; + /* --------------------------------- */ + case 'O': /* set primer size */ + /* --------------------------------- */ + sscanf(optarg,"%d",&(options.primer_length)); + break; + /* -------------------- */ case 'c': /* sequences are circular */ /* --------------------------------- */ @@ -601,7 +551,7 @@ int main(int argc, char **argv) /*TR: Added*/ pairs = buildPrimerPairs(seqdb, seqdbsize, primers, &options); - + // setoktaxforspecificity (&pairs); diff --git a/src/libecoprimer/ecoprimer.h b/src/libecoprimer/ecoprimer.h index c65a7fd..045fb51 100644 --- a/src/libecoprimer/ecoprimer.h +++ b/src/libecoprimer/ecoprimer.h @@ -104,7 +104,7 @@ typedef struct { * on all sequences as a list of primer_t */ typedef struct { - pprimer_t primers; + pprimer_t primers; uint32_t size; } primercount_t, *pprimercount_t; @@ -170,6 +170,8 @@ typedef struct { float yule; float quorumin; float quorumout; + float bs; + float bc; // // uint32_t taxsetcount; // uint32_t taxsetindex; @@ -224,7 +226,7 @@ typedef struct { typedef struct { int32_t taxid; - void *amptree; + void *amptree; }taxontoamp_t, *ptaxontoamp_t; typedef struct { diff --git a/src/libecoprimer/taxstats.c b/src/libecoprimer/taxstats.c index 5cf82b8..487f84e 100644 --- a/src/libecoprimer/taxstats.c +++ b/src/libecoprimer/taxstats.c @@ -40,7 +40,7 @@ int32_t counttaxon(int32_t taxid) taxoncount=0; return 0; } - + if ((taxid > 0) && ((!taxontree) || (!tfind((void*)((size_t)taxid),&taxontree,cmptaxon)))) { @@ -108,18 +108,21 @@ float taxonomycoverage(ppair_t pair, poptions_t options) counttaxon(-1); for (i=0; i < seqcount; i++) - if (pair->pcr.amplifias[i].sequence->isexample) + if (pair->pcr.amplifias[i].sequence->isexample + && pair->pcr.amplifias[i].sequence->ranktaxonid > 0 ) incount = counttaxon(pair->pcr.amplifias[i].sequence->ranktaxonid); counttaxon(-1); for (i=0; i < seqcount; i++) - if (!pair->pcr.amplifias[i].sequence->isexample) + if (!pair->pcr.amplifias[i].sequence->isexample + && pair->pcr.amplifias[i].sequence->ranktaxonid) outcount = counttaxon(pair->pcr.amplifias[i].sequence->ranktaxonid); pair->intaxa=incount; pair->outtaxa=outcount; - return (float)incount/options->intaxa; + pair->bc=(float)incount/options->intaxa; + return pair->bc; } @@ -132,11 +135,11 @@ static int cmpamp(const void *ampf1, const void* ampf2) char cd2; int chd = 0; int len = 0; - + pamptotaxon_t pampf1 = (pamptotaxon_t) ampf1; pamptotaxon_t pampf2 = (pamptotaxon_t) ampf2; - - + + if (pampf1->strand != pampf2->strand) { incr = -1; @@ -148,9 +151,9 @@ static int cmpamp(const void *ampf1, const void* ampf2) chd = 1; } } - + len = (pampf1->length <= pampf2->length)? pampf1->length: pampf2->length; - + for (i = 0; i < len; i++, j += incr) { cd1 = pampf1->amplifia[i]; @@ -158,14 +161,14 @@ static int cmpamp(const void *ampf1, const void* ampf2) cd2 = ecoComplementChar(pampf2->amplifia[j]); else cd2 = pampf2->amplifia[j]; - + if (cd1 < cd2) return chd ? 1: -1; if (cd2 < cd1) return chd ? -1: 1; } - + if (pampf1->length > pampf2->length) return chd ? -1: 1; if (pampf2->length > pampf1->length) return chd ? 1: -1; - + return 0; } @@ -183,42 +186,48 @@ void taxonomyspecificity (ppair_t pair) void *ampftree = NULL; pamptotaxon_t pcurrentampf; pamptotaxon_t *ptmp; - + pamptotaxon_t ampfwithtaxtree = ECOMALLOC(sizeof(amptotaxon_t) * pair->pcr.ampcount,"Cannot allocate amplifia tree"); - + for (i = 0; i < pair->pcr.ampcount; i++) { /*populate taxon ids tree against each unique amplifia i.e set of taxon ids for each amplifia*/ - ampfwithtaxtree[ampfindex].amplifia = pair->pcr.amplifias[i].amplifia; - ampfwithtaxtree[ampfindex].strand = pair->pcr.amplifias[i].strand; - ampfwithtaxtree[ampfindex].length = pair->pcr.amplifias[i].length; - pcurrentampf = &fwithtaxtree[ampfindex]; - taxid = pair->pcr.amplifias[i].sequence->ranktaxonid; - ptmp = tfind((const void*)pcurrentampf, &ftree, cmpamp); - if (ptmp == NULL) + if (pair->pcr.amplifias[i].sequence->isexample) { + ampfwithtaxtree[ampfindex].amplifia = pair->pcr.amplifias[i].amplifia; + ampfwithtaxtree[ampfindex].strand = pair->pcr.amplifias[i].strand; + ampfwithtaxtree[ampfindex].length = pair->pcr.amplifias[i].length; pcurrentampf = &fwithtaxtree[ampfindex]; - tsearch((void*)pcurrentampf,&ftree,cmpamp); - ampfindex++; - } - else - pcurrentampf = *ptmp; - - if (tfind((void*)((size_t)taxid), &(pcurrentampf->taxontree), cmptaxon) == NULL) - { - pcurrentampf->taxoncount++; - tsearch((void*)((size_t)taxid),&(pcurrentampf->taxontree),cmptaxon); + taxid = pair->pcr.amplifias[i].sequence->ranktaxonid; + ptmp = tfind((const void*)pcurrentampf, &ftree, cmpamp); + if (ptmp == NULL) + { + pcurrentampf = &fwithtaxtree[ampfindex]; + tsearch((void*)pcurrentampf,&ftree,cmpamp); + ampfindex++; + } + else + pcurrentampf = *ptmp; + + if (tfind((void*)((size_t)taxid), &(pcurrentampf->taxontree), cmptaxon) == NULL) + { + pcurrentampf->taxoncount++; + tsearch((void*)((size_t)taxid),&(pcurrentampf->taxontree),cmptaxon); + } } } - + counttaxon(-1); for (i = 0; i < ampfindex; i++) { if (ampfwithtaxtree[i].taxoncount > 1) twalk(ampfwithtaxtree[i].taxontree, twalkaction); } - + pair->notwellidentifiedtaxa = counttaxon(-2); + pair->bs = ((float)pair->intaxa - (float)pair->notwellidentifiedtaxa) / pair->intaxa; + ECOFREE (ampfwithtaxtree, "Free amplifia table"); -} \ No newline at end of file + +}