diff --git a/src/ecoPrimer b/src/ecoPrimer new file mode 100755 index 0000000..1ccf566 Binary files /dev/null and b/src/ecoPrimer differ diff --git a/src/ecoprimer.c b/src/ecoprimer.c index b54ef97..4d5ab34 100644 --- a/src/ecoprimer.c +++ b/src/ecoprimer.c @@ -25,9 +25,59 @@ static void PrintHelp() { - PP "------------------------------------------\n"); - PP " ecoPrimer Version %s\n", VERSION); - PP "------------------------------------------\n"); + PP "------------------------------------------\n"); + PP " ecoPrimer Version %s\n", VERSION); + PP "------------------------------------------\n"); + PP "synopsis : finding primers and measureing the quality of primers and barcode region\n"); + PP "usage: ./ecoPrimer [options] \n"); + PP "------------------------------------------\n"); + PP "options:\n"); + PP "-d : [D]atabase : to match the expected format, the database\n"); + PP " has to be formated first by the ecoPCRFormat.py program located.\n"); + PP " in the ecoPCR/tools directory.\n"); + PP " ecoPCRFormat.py creates three file types :\n"); + PP " .sdx : contains the sequences\n"); + PP " .tdx : contains information concerning the taxonomy\n"); + PP " .rdx : contains the taxonomy rank\n\n"); + PP " ecoPrimer needs all the file type. As a result, you have to write the\n"); + PP " database radical without any extension. For example /ecoPrimerDB/fstvert\n\n"); + PP "-e : [E]rror : max error allowed by oligonucleotide (0 by default)\n\n"); + PP "-h : [H]elp - print help\n\n"); + PP "-i : [I]gnore the given taxonomy id.\n\n"); + PP "-l : minimum [L]ength : define the minimum amplication length. \n\n"); + PP "-L : maximum [L]ength : define the maximum amplicationlength. \n\n"); + PP "-r : [R]estricts the search to the given taxonomic id.\n\n"); + PP "-c : Consider that the database sequences are [c]ircular\n\n"); + PP "-3 : Three prime strict match\n\n"); + PP "-q : Strict matching [q]uorum, percentage of the sequences in which strict primers are found. By default it is 70\n\n"); + PP "-s : [S]ensitivity quorum\n\n"); + PP "-t : required [t]axon level for results, by default the results are computed at species level\n\n"); + PP "-x : false positive quorum\n\n"); + PP "-D : set in [d]ouble strand mode\n\n"); + PP "-S : Set in [s]ingle strand mode\n\n"); + PP "-U : No multi match\n\n"); + PP "\n"); + PP "------------------------------------------\n"); + PP "Table result description : \n"); + PP "column 1 : serial number\n"); + PP "column 2 : primer1\n"); + PP "column 3 : primer2\n"); + PP "column 4 : good/bad\n"); + PP "column 5 : in sequence count\n"); + PP "column 6 : out sequence count\n"); + PP "column 7 : yule\n"); + PP "column 8 : in taxa count\n"); + PP "column 9 : out taxa count\n"); + PP "column 10 : coverage\n"); + PP "column 11 : specificity\n"); + PP "column 12 : minimum amplified length\n"); + PP "column 13 : maximum amplified length\n"); + PP "column 14 : average amplified length\n"); + PP "------------------------------------------\n"); + PP " http://www.grenoble.prabi.fr/trac/ecoPrimer/\n"); + PP "------------------------------------------\n\n"); + PP "\n"); + } static void ExitUsage(int stat) @@ -56,7 +106,7 @@ void initoptions(poptions_t options) options->strict_exclude_quorum=0.1; options->sensitivity_quorum=0.9; options->false_positive_quorum=0.1; - options->strict_three_prime=2; + options->strict_three_prime=0; options->r=0; options->g=0; options->no_multi_match=FALSE; @@ -75,7 +125,7 @@ void printcurrenttime () /* Format and print the time, "ddd yyyy-mm-dd hh:mm:ss zzz" */ ts = localtime(&now); strftime(buf, sizeof(buf), "%a %Y-%m-%d %H:%M:%S %Z", ts); - fprintf(stderr,"#%d#, %s\n",now, buf); + fprintf(stderr,"#%d#, %s\n",(int)now, buf); } void printcurrenttimeinmilli() @@ -90,7 +140,125 @@ void printcurrenttimeinmilli() } /*TR: Added*/ + +void printapair(int32_t index,ppair_t pair, poptions_t options) +{ + uint32_t wellidentifiedtaxa; + + printf("%6d\t",index); + if (pair->asdirect1) + printf("%s\t",ecoUnhashWord(pair->p1->word,options->primer_length)); + else + printf("%s\t",ecoUnhashWord(ecoComplementWord(pair->p1->word, + options->primer_length),options->primer_length)); + + if (pair->asdirect2) + printf("%s",ecoUnhashWord(pair->p2->word,options->primer_length)); + else + printf("%s",ecoUnhashWord(ecoComplementWord(pair->p2->word, + options->primer_length),options->primer_length)); + + printf("\t%c%c", "bG"[(int)pair->p1->good],"bG"[(int)pair->p2->good]); + + + printf("\t%d", pair->inexample); + printf("\t%d", pair->outexample); + printf("\t%4.3f", pair->yule); + + printf("\t%d", pair->intaxa); + printf("\t%d", pair->outtaxa); + printf("\t%4.3f", (float)pair->intaxa/options->intaxa); + + wellidentifiedtaxa = (pair->intaxa + pair->outtaxa) - pair->notwellidentifiedtaxa; + + //printf("\t%d", pair->notwellidentifiedtaxa); + + //printf("\t%d", (pair->intaxa + pair->outtaxa)); + + + + printf("\t%4.3f", (float)wellidentifiedtaxa/(options->intaxa + options->outtaxa)); + + + printf("\t%d", pair->mind); + printf("\t%d", pair->maxd); + printf("\t%3.2f\n", (float)pair->sumd/pair->inexample); + +} + +uint32_t filterandsortpairs(ppair_t* sortedpairs,uint32_t count, poptions_t options) +{ + uint32_t i,j; + float q,qfp; + + for (i=0,j=0;i < count;i++) + { + if (options->insamples) + q = (float)sortedpairs[i]->inexample/options->insamples; + else q=1.0; + + if (options->outsamples) + qfp = (float)sortedpairs[i]->outexample/options->outsamples; + else qfp=0.0; + + sortedpairs[i]->quorumin = q; + sortedpairs[i]->quorumout = qfp; + sortedpairs[i]->yule = q -qfp; + + + sortedpairs[j]=sortedpairs[i]; + + + + if (q > options->sensitivity_quorum && + qfp < options->false_positive_quorum) + { + (void)taxonomycoverage(sortedpairs[j],options); + taxonomyspecificity(sortedpairs[j]); + j++; + } + } + + return j; +} + +void printpairs (ppairtree_t pairs, poptions_t options) +{ + ppair_t* sortedpairs; + ppair_t* index; + ppairlist_t pl; + size_t i,j; + int32_t count; + + //printf("Index\tPrimer1\tPrimer2\tGB\tInexampleCount\tOutexampleCount\tYule\tIntaxaCount\tOuttaxaCount\tCoverage\tSpecificity\tMinAmplifiedLength\tMaxAmplifiedLength\tAvgAmplifiedLength\n"); + + fprintf(stderr,"Total pair count : %d\n",pairs->count); + + sortedpairs = ECOMALLOC(pairs->count*sizeof(ppair_t),"Cannot Allocate ordered pairs"); + index=sortedpairs; + pl=pairs->first; + j=0; + while(pl->next) + { + for (i=0;ipaircount;i++,j++) + sortedpairs[j]=pl->pairs+i; + pl=pl->next; + } + + for (i=0;ipaircount;i++,j++) + sortedpairs[j]=pl->pairs+i; + + count=filterandsortpairs(sortedpairs,pairs->count,options); + + for (i=0;i < count;i++) + printapair(i,sortedpairs[i],options); + +} + + +#ifdef MASKEDCODE void printpairs (pairscount_t pairs, poptions_t options, int32_t rankdbstats, uint32_t seqdbsize) + { uint32_t i; uint32_t wordsize = options->primer_length; @@ -98,9 +266,9 @@ void printpairs (pairscount_t pairs, poptions_t options, int32_t rankdbstats, ui double sens; double speci; float avg; - + quorumseqs = seqdbsize * 70 / 100; - + printf("primer_1\tseq_1\tPrimer_2\tseq_2\tamplifia_count\t%s_snes\t%s_spe\tmin_l\tmax_l\tavr_l\n", options->taxonrank, options->taxonrank); for (i=0; i < pairs.paircount; i++) @@ -121,9 +289,12 @@ void printpairs (pairscount_t pairs, poptions_t options, int32_t rankdbstats, ui } } +#endif /* MASKEDCODE */ + /*updateseqparams: This function counts the insample and outsample sequences * and with each sequences adds a tag of the taxon to which the sequence beongs*/ -void updateseqparams (pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxonomy, + +void updateseqparams (pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxonomy, poptions_t options, int32_t *insamples, int32_t *outsamples) { uint32_t i; @@ -131,7 +302,7 @@ void updateseqparams (pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxo ecotx_t *tmptaxon; for (i=0;iisexample=isGoodTaxon(taxonomy,seqdb[i]->taxid,options); if (seqdb[i]->isexample) (*insamples)++; @@ -139,7 +310,7 @@ void updateseqparams (pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxo (*outsamples)++; taxid = taxonomy->taxons->taxon[seqdb[i]->taxid].taxid; - tmptaxon = eco_findtaxonbytaxid(taxonomy, taxid); + tmptaxon = eco_findtaxonbytaxid(taxonomy, taxid); if (tmptaxon) tmptaxon = eco_findtaxonatrank(tmptaxon, options->taxonrankidx); if (tmptaxon) @@ -154,7 +325,7 @@ void setresulttaxonrank (ecotaxonomy_t *taxonomy, poptions_t options) /*set taxon rank for which result is to be given*/ for (i = 0; i < taxonomy->ranks->count; i++) { - if (strcmp(taxonomy->ranks->label[i], options->taxonrank) == 0) + if (strcmp(taxonomy->ranks->label[i], options->taxonrank) == 0) { options->taxonrankidx = i; break; @@ -168,47 +339,10 @@ void setresulttaxonrank (ecotaxonomy_t *taxonomy, poptions_t options) } } /* to get db stats, totals of species, genus etc....*/ -int32_t getrankdbstats(pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxonomy, - poptions_t options) -{ - uint32_t i; - uint32_t j; - uint32_t nameslots = 500; - uint32_t namesindex = 0; - int32_t *ranktaxonids = ECOMALLOC(nameslots * sizeof(int32_t), "Error in taxon rank allocation"); - int32_t taxid; - - ecotx_t *tmptaxon; - for (i=0;itaxons->taxon[seqdb[i]->taxid].taxid; - tmptaxon = eco_findtaxonbytaxid(taxonomy, taxid); - if (tmptaxon) - tmptaxon = eco_findtaxonatrank(tmptaxon, options->taxonrankidx); - if (tmptaxon) - { - for (j = 0; j < namesindex; j++) - { - if (tmptaxon->taxid == ranktaxonids[j]) break; - } - if (j < namesindex) continue; /* name is already in list, so no need to add it*/ - - if (namesindex == nameslots) - { - nameslots += 500; - ranktaxonids = ECOREALLOC(ranktaxonids, nameslots * sizeof(int32_t), "Cannot allocate pair rank taxon table"); - } - ranktaxonids[namesindex] = tmptaxon->taxid; - namesindex++; - } - } - ECOFREE(ranktaxonids, "free rank taxon table"); - - return namesindex; -} +#ifdef MASKEDCODE -void setoktaxforspecificity (ppairscount_t pairs) +void setoktaxforspecificity (ppairtree_t pairs) { uint32_t i; uint32_t j; @@ -216,7 +350,7 @@ void setoktaxforspecificity (ppairscount_t pairs) uint32_t l; int taxcount; int32_t taxid; - + for (i = 0; i < pairs->paircount; i++) { for (j = 0; j < pairs->pairs[i].taxsetindex; j++) @@ -235,13 +369,13 @@ void setoktaxforspecificity (ppairscount_t pairs) taxcount += pairs->pairs[i].ampset[l].seqidindex; break; } - + if (taxid != pairs->pairs[i].ampset[l].taxonids[0]) { if (!taxid) taxid = pairs->pairs[i].ampset[l].taxonids[0]; - taxcount++; + taxcount++; } - + if (taxcount > 1) break; } } @@ -251,6 +385,8 @@ void setoktaxforspecificity (ppairscount_t pairs) } } +#endif + int main(int argc, char **argv) { pecodnadb_t seqdb; /* of type ecoseq_t */ @@ -264,11 +400,11 @@ int main(int argc, char **argv) int32_t insamples=0; int32_t outsamples=0; uint32_t i; - - pwordcount_t words; - pprimercount_t primers; - pairscount_t pairs; - + + pwordcount_t words; + pprimercount_t primers; + ppairtree_t pairs; + int32_t rankdbstats = 0; //printcurrenttime(); @@ -290,9 +426,9 @@ int main(int argc, char **argv) /* -------------------- */ case 'h': /* help */ /* -------------------- */ - PrintHelp(); - exit(0); - break; + PrintHelp(); + exit(0); + break; /* ------------------------- */ case 'l': /* min amplification lenght */ @@ -337,7 +473,7 @@ int main(int argc, char **argv) strncpy(options.taxonrank, optarg, 19); options.taxonrank[19] = 0; break; - + /* -------------------- */ case 'x': /* strict matching quorum */ /* -------------------- */ @@ -396,22 +532,27 @@ int main(int argc, char **argv) fprintf(stderr,"Reading taxonomy database ..."); taxonomy = read_taxonomy(options.prefix,0); fprintf(stderr,"Ok\n"); - + setresulttaxonrank(taxonomy, &options); /*TR: set rank level for statistics*/ - + fprintf(stderr,"Reading sequence database ...\n"); seqdb = readdnadb(options.prefix,&seqdbsize); fprintf(stderr,"Ok\n"); fprintf(stderr,"Sequence read : %d\n",(int32_t)seqdbsize); - + updateseqparams(seqdb, seqdbsize, taxonomy, &options, &insamples , &outsamples); + options.dbsize=seqdbsize; + options.insamples=insamples; + options.outsamples=outsamples; rankdbstats = getrankdbstats(seqdb, seqdbsize, taxonomy, &options); - fprintf(stderr,"Database is constituted of %5d examples\n",insamples); - fprintf(stderr," and %5d counterexamples\n",outsamples); + fprintf(stderr,"Database is constituted of %5d examples corresponding to %5d %s\n",insamples, + options.intaxa,options.taxonrank); + fprintf(stderr," and %5d counterexamples corresponding to %5d %s\n",outsamples, + options.outtaxa,options.taxonrank); fprintf(stderr,"Total distinct %s count %d\n",options.taxonrank, rankdbstats); fprintf(stderr,"\nIndexing words in sequences\n"); @@ -419,7 +560,7 @@ int main(int argc, char **argv) printcurrenttimeinmilli(); words = lookforStrictPrimer(seqdb,seqdbsize,insamples,&options); printcurrenttimeinmilli(); - + fprintf(stderr,"\n Strict primer count : %d\n",words->size); if (options.no_multi_match) @@ -460,13 +601,15 @@ int main(int argc, char **argv) /*TR: Added*/ pairs = buildPrimerPairs(seqdb, seqdbsize, primers, &options); - setoktaxforspecificity (&pairs); - printpairs (pairs, &options, rankdbstats, seqdbsize); - - - - ECOFREE(pairs.pairs,"Free pairs table"); + + // setoktaxforspecificity (&pairs); + + printpairs (pairs, &options); + + + + //ECOFREE(pairs.pairs,"Free pairs table"); return 0; } diff --git a/src/libecoPCR/ecoError.P b/src/libecoPCR/ecoError.P new file mode 100644 index 0000000..7c7ae71 --- /dev/null +++ b/src/libecoPCR/ecoError.P @@ -0,0 +1,15 @@ +ecoError.o ecoError.P : ecoError.c ecoPCR.h /usr/include/stdio.h \ + /usr/include/_types.h /usr/include/sys/_types.h \ + /usr/include/sys/cdefs.h /usr/include/machine/_types.h \ + /usr/include/i386/_types.h /usr/include/inttypes.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \ + /usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \ + /usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \ + /usr/include/machine/signal.h /usr/include/i386/signal.h \ + /usr/include/i386/_structs.h /usr/include/sys/_structs.h \ + /usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \ + /usr/include/sys/resource.h /usr/include/machine/endian.h \ + /usr/include/i386/endian.h /usr/include/sys/_endian.h \ + /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \ + /usr/include/machine/types.h /usr/include/i386/types.h diff --git a/src/libecoPCR/ecoIOUtils.P b/src/libecoPCR/ecoIOUtils.P new file mode 100644 index 0000000..fc34a70 --- /dev/null +++ b/src/libecoPCR/ecoIOUtils.P @@ -0,0 +1,15 @@ +ecoIOUtils.o ecoIOUtils.P : ecoIOUtils.c ecoPCR.h /usr/include/stdio.h \ + /usr/include/_types.h /usr/include/sys/_types.h \ + /usr/include/sys/cdefs.h /usr/include/machine/_types.h \ + /usr/include/i386/_types.h /usr/include/inttypes.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \ + /usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \ + /usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \ + /usr/include/machine/signal.h /usr/include/i386/signal.h \ + /usr/include/i386/_structs.h /usr/include/sys/_structs.h \ + /usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \ + /usr/include/sys/resource.h /usr/include/machine/endian.h \ + /usr/include/i386/endian.h /usr/include/sys/_endian.h \ + /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \ + /usr/include/machine/types.h /usr/include/i386/types.h diff --git a/src/libecoPCR/ecoMalloc.P b/src/libecoPCR/ecoMalloc.P new file mode 100644 index 0000000..ea3767b --- /dev/null +++ b/src/libecoPCR/ecoMalloc.P @@ -0,0 +1,15 @@ +ecoMalloc.o ecoMalloc.P : ecoMalloc.c ecoPCR.h /usr/include/stdio.h \ + /usr/include/_types.h /usr/include/sys/_types.h \ + /usr/include/sys/cdefs.h /usr/include/machine/_types.h \ + /usr/include/i386/_types.h /usr/include/inttypes.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \ + /usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \ + /usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \ + /usr/include/machine/signal.h /usr/include/i386/signal.h \ + /usr/include/i386/_structs.h /usr/include/sys/_structs.h \ + /usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \ + /usr/include/sys/resource.h /usr/include/machine/endian.h \ + /usr/include/i386/endian.h /usr/include/sys/_endian.h \ + /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \ + /usr/include/machine/types.h /usr/include/i386/types.h diff --git a/src/libecoPCR/ecodna.P b/src/libecoPCR/ecodna.P new file mode 100644 index 0000000..b9a71b9 --- /dev/null +++ b/src/libecoPCR/ecodna.P @@ -0,0 +1,5 @@ +ecodna.o ecodna.P : ecodna.c /usr/include/string.h /usr/include/_types.h \ + /usr/include/sys/_types.h /usr/include/sys/cdefs.h \ + /usr/include/machine/_types.h /usr/include/i386/_types.h ecoPCR.h \ + /usr/include/stdio.h /usr/include/inttypes.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h diff --git a/src/libecoPCR/ecofilter.P b/src/libecoPCR/ecofilter.P new file mode 100644 index 0000000..d46d3e0 --- /dev/null +++ b/src/libecoPCR/ecofilter.P @@ -0,0 +1,5 @@ +ecofilter.o ecofilter.P : ecofilter.c ecoPCR.h /usr/include/stdio.h \ + /usr/include/_types.h /usr/include/sys/_types.h \ + /usr/include/sys/cdefs.h /usr/include/machine/_types.h \ + /usr/include/i386/_types.h /usr/include/inttypes.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h diff --git a/src/libecoPCR/econame.P b/src/libecoPCR/econame.P new file mode 100644 index 0000000..4c9946c --- /dev/null +++ b/src/libecoPCR/econame.P @@ -0,0 +1,15 @@ +econame.o econame.P : econame.c ecoPCR.h /usr/include/stdio.h /usr/include/_types.h \ + /usr/include/sys/_types.h /usr/include/sys/cdefs.h \ + /usr/include/machine/_types.h /usr/include/i386/_types.h \ + /usr/include/inttypes.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \ + /usr/include/string.h /usr/include/stdlib.h /usr/include/available.h \ + /usr/include/sys/wait.h /usr/include/sys/signal.h \ + /usr/include/sys/appleapiopts.h /usr/include/machine/signal.h \ + /usr/include/i386/signal.h /usr/include/i386/_structs.h \ + /usr/include/sys/_structs.h /usr/include/machine/_structs.h \ + /usr/include/mach/i386/_structs.h /usr/include/sys/resource.h \ + /usr/include/machine/endian.h /usr/include/i386/endian.h \ + /usr/include/sys/_endian.h /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \ + /usr/include/machine/types.h /usr/include/i386/types.h diff --git a/src/libecoPCR/ecorank.P b/src/libecoPCR/ecorank.P new file mode 100644 index 0000000..75e09b9 --- /dev/null +++ b/src/libecoPCR/ecorank.P @@ -0,0 +1,15 @@ +ecorank.o ecorank.P : ecorank.c ecoPCR.h /usr/include/stdio.h /usr/include/_types.h \ + /usr/include/sys/_types.h /usr/include/sys/cdefs.h \ + /usr/include/machine/_types.h /usr/include/i386/_types.h \ + /usr/include/inttypes.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \ + /usr/include/string.h /usr/include/stdlib.h /usr/include/available.h \ + /usr/include/sys/wait.h /usr/include/sys/signal.h \ + /usr/include/sys/appleapiopts.h /usr/include/machine/signal.h \ + /usr/include/i386/signal.h /usr/include/i386/_structs.h \ + /usr/include/sys/_structs.h /usr/include/machine/_structs.h \ + /usr/include/mach/i386/_structs.h /usr/include/sys/resource.h \ + /usr/include/machine/endian.h /usr/include/i386/endian.h \ + /usr/include/sys/_endian.h /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \ + /usr/include/machine/types.h /usr/include/i386/types.h diff --git a/src/libecoPCR/ecoseq.P b/src/libecoPCR/ecoseq.P new file mode 100644 index 0000000..6222690 --- /dev/null +++ b/src/libecoPCR/ecoseq.P @@ -0,0 +1,19 @@ +ecoseq.o ecoseq.P : ecoseq.c ecoPCR.h /usr/include/stdio.h /usr/include/_types.h \ + /usr/include/sys/_types.h /usr/include/sys/cdefs.h \ + /usr/include/machine/_types.h /usr/include/i386/_types.h \ + /usr/include/inttypes.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \ + /usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \ + /usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \ + /usr/include/machine/signal.h /usr/include/i386/signal.h \ + /usr/include/i386/_structs.h /usr/include/sys/_structs.h \ + /usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \ + /usr/include/sys/resource.h /usr/include/machine/endian.h \ + /usr/include/i386/endian.h /usr/include/sys/_endian.h \ + /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \ + /usr/include/machine/types.h /usr/include/i386/types.h \ + /usr/include/string.h /usr/include/zlib.h /usr/include/zconf.h \ + /usr/include/sys/types.h /usr/include/unistd.h \ + /usr/include/sys/unistd.h /usr/include/sys/select.h \ + /usr/include/sys/_select.h diff --git a/src/libecoPCR/ecotax.P b/src/libecoPCR/ecotax.P new file mode 100644 index 0000000..489fc97 --- /dev/null +++ b/src/libecoPCR/ecotax.P @@ -0,0 +1,15 @@ +ecotax.o ecotax.P : ecotax.c ecoPCR.h /usr/include/stdio.h /usr/include/_types.h \ + /usr/include/sys/_types.h /usr/include/sys/cdefs.h \ + /usr/include/machine/_types.h /usr/include/i386/_types.h \ + /usr/include/inttypes.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \ + /usr/include/string.h /usr/include/stdlib.h /usr/include/available.h \ + /usr/include/sys/wait.h /usr/include/sys/signal.h \ + /usr/include/sys/appleapiopts.h /usr/include/machine/signal.h \ + /usr/include/i386/signal.h /usr/include/i386/_structs.h \ + /usr/include/sys/_structs.h /usr/include/machine/_structs.h \ + /usr/include/mach/i386/_structs.h /usr/include/sys/resource.h \ + /usr/include/machine/endian.h /usr/include/i386/endian.h \ + /usr/include/sys/_endian.h /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \ + /usr/include/machine/types.h /usr/include/i386/types.h diff --git a/src/libecoprimer/Makefile b/src/libecoprimer/Makefile index d68202b..4fc412c 100644 --- a/src/libecoprimer/Makefile +++ b/src/libecoprimer/Makefile @@ -10,7 +10,9 @@ SOURCES = goodtaxon.c \ queue.c \ libstki.c \ sortmatch.c \ + pairtree.c \ pairs.c \ + taxstats.c \ apat_search.c SRCS=$(SOURCES) diff --git a/src/libecoprimer/aproxpattern.c b/src/libecoprimer/aproxpattern.c index d924281..2dc13bf 100644 --- a/src/libecoprimer/aproxpattern.c +++ b/src/libecoprimer/aproxpattern.c @@ -61,7 +61,7 @@ void encodeSequence(ecoseq_t *seq) for (i=0;iSQ_length;i++,data++,cseq++) { - *data = encoder[(IS_UPPER(*cseq) ? *cseq - 'A' : 'Z')]; + *data = encoder[(IS_UPPER(*cseq) ? *cseq : 'Z') - 'A']; } } @@ -82,7 +82,7 @@ pprimercount_t lookforAproxPrimer(pecodnadb_t database, uint32_t seqdbsize,uint3 uint32_t inSequenceQuorum; uint32_t outSequenceQuorum; bool_t conserved = TRUE; - + //poslist_t ttt; diff --git a/src/libecoprimer/ecoprimer.h b/src/libecoprimer/ecoprimer.h index 5d26573..c65a7fd 100644 --- a/src/libecoprimer/ecoprimer.h +++ b/src/libecoprimer/ecoprimer.h @@ -79,28 +79,39 @@ typedef union { uint32_t value; } poslist_t, *ppostlist_t; -typedef struct { - word_t word; - uint32_t *directCount; - ppostlist_t directPos; +/** + * primer_t structure store fuzzy match positions for a primer + * on all sequences + */ - uint32_t *reverseCount; - ppostlist_t reversePos; - bool_t good; - uint32_t inexample; - uint32_t outexample; +typedef struct { + word_t word; //< code for the primer + uint32_t *directCount; //< Occurrence count on direct strand + ppostlist_t directPos; //< list of position list on direct strand + + uint32_t *reverseCount; //< Occurrence count on reverse strand + ppostlist_t reversePos; //< list of position list on reverse strand + + bool_t good; //< primer match more than quorum example and no + // more counterexample quorum. + + uint32_t inexample; //< count of example sequences matching primer + uint32_t outexample; //< count of counterexample sequences matching primer } primer_t, *pprimer_t; +/** + * primercount_t structure store fuzzy match positions for all primers + * on all sequences as a list of primer_t + */ typedef struct { - pprimer_t primers; + pprimer_t primers; uint32_t size; } primercount_t, *pprimercount_t; typedef struct { - word_t word; + pprimer_t primer; uint32_t position; bool_t strand; - bool_t good; /*TR: Added*/ } primermatch_t, *pprimermatch_t; /*TR: Added*/ @@ -109,6 +120,19 @@ typedef struct { uint32_t matchcount; } primermatchcount_t, *pprimermatchcount_t; +typedef struct { + pecoseq_t sequence; + bool_t strand; + const char *amplifia; + int32_t length; +} amplifia_t, *pamplifia_t; + +typedef struct { + pamplifia_t amplifias; + uint32_t ampcount; + uint32_t ampslot; +} amplifiacount_t, *pamplifiacount_t; + typedef struct { char *amplifia; int32_t *taxonids; @@ -124,30 +148,52 @@ typedef struct { } taxampset_t, *ptaxampset_t; typedef struct { - word_t w1; - word_t w2; - uint32_t inexample; /*inexample count*/ - uint32_t outexample; /*outexample count*/ - - uint32_t mind; - uint32_t maxd; - - uint32_t ampsetcount; - uint32_t ampsetindex; - pampseqset_t ampset; - - uint32_t taxsetcount; - uint32_t taxsetindex; - ptaxampset_t taxset; - - uint32_t oktaxoncount; -} pairs_t, *ppairs_t; + pprimer_t p1; + bool_t asdirect1; + pprimer_t p2; + bool_t asdirect2; + + amplifiacount_t pcr; + + uint32_t inexample; //< example sequence count + uint32_t outexample; //< counterexample sequence count + uint32_t intaxa; //< example taxa count + uint32_t outtaxa; //< counterexample taxa count + uint32_t notwellidentifiedtaxa; + + + // these statistics are relative to inexample sequences + + uint32_t mind; //< minimum distance between primers + uint32_t maxd; //< maximum distance between primers + uint32_t sumd; //< distance sum + float yule; + float quorumin; + float quorumout; +// +// uint32_t taxsetcount; +// uint32_t taxsetindex; +// ptaxampset_t taxset; +// +// uint32_t oktaxoncount; + +} pair_t, *ppair_t; /*TR: Added*/ + typedef struct { - ppairs_t pairs; - uint32_t paircount; -}pairscount_t, *ppairscount_t; + size_t paircount; + size_t pairslots; + void* next; + pair_t pairs[1]; +} pairlist_t, *ppairlist_t; + +typedef struct { + ppairlist_t first; + ppairlist_t last; + void *tree; + int32_t count; +} pairtree_t, *ppairtree_t; typedef struct { pword_t words; @@ -168,6 +214,18 @@ typedef struct { uint32_t size; } merge_t, *pmerge_t; +typedef struct { + const char *amplifia; + bool_t strand; + int32_t length; + int32_t taxoncount; + void *taxontree; +}amptotaxon_t, *pamptotaxon_t; + +typedef struct { + int32_t taxid; + void *amptree; +}taxontoamp_t, *ptaxontoamp_t; typedef struct { uint32_t lmin; //**< Amplifia minimal length @@ -189,6 +247,14 @@ typedef struct { bool_t no_multi_match; char taxonrank[20]; //TR to count ranks against a pair int32_t taxonrankidx; //TR to count ranks against a pair + + // Some statistics useful for options filters + + int32_t dbsize; + int32_t insamples; + int32_t outsamples; + int32_t intaxa; + int32_t outtaxa; } options_t, *poptions_t; typedef ecoseq_t **pecodnadb_t; @@ -232,7 +298,21 @@ pprimercount_t lookforAproxPrimer(pecodnadb_t database, uint32_t seqdbsize,uint3 void sortmatch(pprimermatch_t table,uint32_t N); +ppairtree_t initpairtree(ppairtree_t tree); +ppair_t pairintree (pair_t key,ppairtree_t pairlist); +ppair_t insertpair(pair_t key,ppairtree_t list); + + /*TR: Added*/ -pairscount_t buildPrimerPairs(pecodnadb_t seqdb,uint32_t seqdbsize,pprimercount_t primers,poptions_t options); +ppairtree_t buildPrimerPairs(pecodnadb_t seqdb,uint32_t seqdbsize,pprimercount_t primers,poptions_t options); + +int32_t counttaxon(int32_t taxid); +int32_t getrankdbstats(pecodnadb_t seqdb, + uint32_t seqdbsize, + ecotaxonomy_t *taxonomy, + poptions_t options); +float taxonomycoverage(ppair_t pair, poptions_t options); +char ecoComplementChar(char base); +void taxonomyspecificity (ppair_t pair); #endif /* EPSORT_H_ */ diff --git a/src/libecoprimer/hashsequence.c b/src/libecoprimer/hashsequence.c index 8dfe6d4..af89025 100644 --- a/src/libecoprimer/hashsequence.c +++ b/src/libecoprimer/hashsequence.c @@ -201,3 +201,8 @@ uint32_t ecoFindWord(pwordcount_t table,word_t word) return ~0; } +char ecoComplementChar(char base) +{ + return (base < 4)? !base & 3: 4; +} + diff --git a/src/libecoprimer/pairs.c b/src/libecoprimer/pairs.c index a5308a6..88e6c21 100644 --- a/src/libecoprimer/pairs.c +++ b/src/libecoprimer/pairs.c @@ -7,34 +7,40 @@ #include "ecoprimer.h" #include +#include -primermatchcount_t buildPrimerPairsForOneSeq(uint32_t seqid,pprimercount_t primers,poptions_t options); +static void buildPrimerPairsForOneSeq(uint32_t seqid, + pecodnadb_t seqdb, + pprimercount_t primers, + ppairtree_t pairs, + poptions_t options); -int32_t pairinlist (ppairs_t pairlist, word_t w1, word_t w2, uint32_t size) -{ - uint32_t i; - - for (i = 0; i < size; i++) - { - if (w1 == pairlist[i].w1 && w2 == pairlist[i].w2) return i; - if (w1 == pairlist[i].w2 && w2 == pairlist[i].w1) return i; - } - return -1; -} -char *addamplifiasetelem (ppairs_t pair, char* amplifia, int32_t taxid) + + + + +/************************************* + * + * pair collection management + * + *************************************/ + +#ifdef MASKEDCODE + +char *addamplifiasetelem (ppair_t pair, char* amplifia, int32_t taxid) { uint32_t i; uint32_t j; char *ampused = NULL; - + if(pair->ampsetcount == 0) { pair->ampsetcount = 500; pair->ampsetindex = 0; pair->ampset = ECOMALLOC(pair->ampsetcount * sizeof(ampseqset_t),"Cannot allocate amplifia set"); } - + for (i = 0; i < pair->ampsetindex; i++) { if (strcmp (pair->ampset[i].amplifia, amplifia) == 0) @@ -43,43 +49,43 @@ char *addamplifiasetelem (ppairs_t pair, char* amplifia, int32_t taxid) break; } } - + if (i == 0) { pair->ampset[i].seqidcount = 100; pair->ampset[i].seqidindex = 0; pair->ampset[i].taxonids = ECOMALLOC(pair->ampset[i].seqidcount * sizeof(uint32_t),"Cannot allocate amplifia sequence table"); } - + if (pair->ampsetindex == pair->ampsetcount) { pair->ampsetcount += 500; pair->ampset = ECOREALLOC(pair->ampset, pair->ampsetcount * sizeof(ampseqset_t), "Cannot allocate amplifia set"); } - + if (pair->ampset[i].seqidindex == pair->ampset[i].seqidcount) { pair->ampset[i].seqidcount += 100; pair->ampset[i].taxonids = ECOREALLOC(pair->ampset[i].taxonids, pair->ampset[i].seqidcount * sizeof(int32_t), "Cannot allocate amplifia sequence table"); } - + if (pair->ampset[i].amplifia == NULL) { pair->ampset[i].amplifia = amplifia; pair->ampsetindex++; } - + for (j = 0; j < pair->ampset[i].seqidindex; j++) { if (pair->ampset[i].taxonids[j] == taxid) break; } - + if (j == pair->ampset[i].seqidindex) pair->ampset[i].taxonids[pair->ampset[i].seqidindex++] = taxid; return ampused; } -void addtaxampsetelem (ppairs_t pair, int32_t taxid, char *amplifia) +void addtaxampsetelem (ppair_t pair, int32_t taxid, char *amplifia) { uint32_t i; uint32_t j; @@ -90,42 +96,42 @@ void addtaxampsetelem (ppairs_t pair, int32_t taxid, char *amplifia) pair->taxsetindex = 0; pair->taxset = ECOMALLOC(pair->taxsetcount * sizeof(taxampset_t),"Cannot allocate taxon set"); } - + for (i = 0; i < pair->taxsetindex; i++) { if (pair->taxset[i].taxonid == taxid) break; } - + if (i == 0) { pair->taxset[i].amplifiacount = 100; pair->taxset[i].amplifiaindex = 0; pair->taxset[i].amplifia = ECOMALLOC(pair->taxset[i].amplifiacount * sizeof(char *),"Cannot allocate amplifia table"); } - + if (pair->taxsetindex == pair->taxsetcount) { pair->taxsetcount += 500; pair->taxset = ECOREALLOC(pair->taxset, pair->taxsetcount * sizeof(taxampset_t), "Cannot allocate taxon set"); } - + if (pair->taxset[i].amplifiaindex == pair->taxset[i].amplifiacount) { pair->taxset[i].amplifiacount += 100; pair->taxset[i].amplifia = ECOREALLOC(pair->taxset[i].amplifia, pair->taxset[i].amplifiacount * sizeof(char *), "Cannot allocate amplifia table"); } - + if (pair->taxset[i].taxonid == 0) { pair->taxset[i].taxonid = taxid; pair->taxsetindex++; } - + for (j = 0; j < pair->taxset[i].amplifiaindex; j++) { if (strcmp(pair->taxset[i].amplifia[j], amplifia) == 0) break; } - + if (j == pair->taxset[i].amplifiaindex) { pair->taxset[i].amplifia[j] = amplifia; @@ -135,140 +141,62 @@ void addtaxampsetelem (ppairs_t pair, int32_t taxid, char *amplifia) char *getamplifia (pecoseq_t seq, uint32_t start, uint32_t len) { + fprintf(stderr,"start : %d length : %d\n",start,len); char *amplifia = ECOMALLOC((len + 1) * sizeof(char),"Cannot allocate amplifia"); char *seqc = &seq->SQ[start]; - + strncpy(amplifia, seqc, len); return amplifia; } +#endif /*TR: Added*/ -pairscount_t buildPrimerPairs(pecodnadb_t seqdb,uint32_t seqdbsize,pprimercount_t primers,poptions_t options) +ppairtree_t buildPrimerPairs(pecodnadb_t seqdb,uint32_t seqdbsize,pprimercount_t primers,poptions_t options) { uint32_t i; - uint32_t j; - uint32_t k; - uint32_t d; - uint32_t strt; - uint32_t end; - uint32_t paircount = 0; - uint32_t pairslots = 500; - int32_t foundindex; - ppairs_t pairs; - pairscount_t primerpairs; - primermatchcount_t seqmatchcount; - word_t w1; - word_t w2; - char *amplifia; - char *oldamp; - + ppairtree_t primerpairs; - pairs = ECOMALLOC(pairslots * sizeof(pairs_t),"Cannot allocate pairs table"); + primerpairs = initpairtree(NULL); for (i=0; i < seqdbsize; i++) { - seqmatchcount = buildPrimerPairsForOneSeq(i, primers, options); - if (seqmatchcount.matchcount == 0) continue; - - for (j=0; j < seqmatchcount.matchcount; j++) - { - strt = 0; - w1 = seqmatchcount.matches[j].word; - /*first word should b on direct strand*/ - if (!seqmatchcount.matches[j].strand) - w1 = ecoComplementWord(w1, options->primer_length); - else - strt = options->primer_length; - - for (k=j+1; k < seqmatchcount.matchcount; k++) - { - end = 0; - w2 = seqmatchcount.matches[k].word; - /*second word should be on reverse strand*/ - if (seqmatchcount.matches[k].strand) - w2 = ecoComplementWord(w2, options->primer_length); - else - end = options->primer_length; - - if (!(seqmatchcount.matches[j].good || seqmatchcount.matches[k].good)) continue; - if (w1 == w2) continue; - - d = seqmatchcount.matches[k].position - seqmatchcount.matches[j].position; - if (d >= options->lmin && d <= options->lmax) - { - /*get amplified string*/ - amplifia = getamplifia (seqdb[i], seqmatchcount.matches[j].position + strt, d - strt - end); - - foundindex = pairinlist(pairs, w1, w2, paircount); - if (foundindex != -1) /*pair is found*/ - { - if (seqdb[i]->isexample) - pairs[foundindex].inexample++; - else - pairs[foundindex].outexample++; - - if (pairs[foundindex].mind > d) pairs[foundindex].mind = d; - else if (pairs[foundindex].maxd < d) pairs[foundindex].maxd = d; - - oldamp = addamplifiasetelem (&pairs[foundindex], amplifia, seqdb[i]->ranktaxonid); - /*if exact same string is already in amplifia set then use that for taxon set, it will help for - * calculating the fully identified taxons i.e specificity, we will compare pointrs instead of strings - * because same string means same pointer*/ - if (oldamp) - { - ECOFREE (amplifia, "free amplifia"); - amplifia = oldamp; - } - addtaxampsetelem (&pairs[foundindex], seqdb[i]->ranktaxonid, amplifia); - - continue; - } - - if (paircount == pairslots) - { - pairslots += 500; - pairs = ECOREALLOC(pairs, pairslots * sizeof(pairs_t), "Cannot allocate pairs table"); - } - pairs[paircount].w1 = w1; - pairs[paircount].w2 = w2; - if (seqdb[i]->isexample) pairs[paircount].inexample = 1; - else pairs[paircount].outexample = 1; - pairs[paircount].mind = d; - pairs[paircount].maxd = d; - oldamp = addamplifiasetelem (&pairs[paircount], amplifia, seqdb[i]->ranktaxonid); - addtaxampsetelem (&pairs[paircount], seqdb[i]->ranktaxonid, amplifia); - - paircount++; - } - else if (d > options->lmax) - break; /*once if the distance is greater than lmax then it will keep on increasing*/ - } - } - ECOFREE(seqmatchcount.matches, "Cannot free matches table"); + buildPrimerPairsForOneSeq(i, seqdb, primers, primerpairs, options); } - primerpairs.pairs = ECOREALLOC(pairs, paircount * sizeof(pairs_t), "Cannot allocate pairs table"); - primerpairs.paircount = paircount; + return primerpairs; + } -primermatchcount_t buildPrimerPairsForOneSeq(uint32_t seqid,pprimercount_t primers,poptions_t options) -{ - uint32_t i,j,k; - uint32_t matchcount=0; - pprimermatch_t matches = NULL; - primermatchcount_t seqmatchcount; +#define DMAX (2000000000) + +static void buildPrimerPairsForOneSeq(uint32_t seqid, + pecodnadb_t seqdb, + pprimercount_t primers, + ppairtree_t pairs, + poptions_t options) +{ + static uint32_t paircount=0; + uint32_t i,j,k; + uint32_t matchcount=0; + pprimermatch_t matches = NULL; + primermatchcount_t seqmatchcount; + ppair_t pcurrent; + pair_t current; + pprimer_t wswp; + bool_t bswp; + size_t distance; + bool_t strand; - seqmatchcount.matchcount = 0; - seqmatchcount.matches = NULL; - for (i=0;i < primers->size; i++) { matchcount+=primers->primers[i].directCount[seqid]; matchcount+=primers->primers[i].reverseCount[seqid]; } - - if (matchcount <= 0) return seqmatchcount; + + if (matchcount <= 0) + return; + matches = ECOMALLOC(matchcount * sizeof(primermatch_t),"Cannot allocate primers match table"); for (i=0,j=0;i < primers->size; i++) @@ -277,17 +205,15 @@ primermatchcount_t buildPrimerPairsForOneSeq(uint32_t seqid,pprimercount_t prime { if (primers->primers[i].directCount[seqid]==1) { - matches[j].word = primers->primers[i].word; + matches[j].primer = primers->primers+i; matches[j].strand=TRUE; - matches[j].good=primers->primers[i].good;/*TR: Added*/ matches[j].position=primers->primers[i].directPos[seqid].value; j++; } else for (k=0; k < primers->primers[i].directCount[seqid]; k++,j++) { - matches[j].word = primers->primers[i].word; + matches[j].primer = primers->primers+i; matches[j].strand=TRUE; - matches[j].good=primers->primers[i].good;/*TR: Added*/ matches[j].position=primers->primers[i].directPos[seqid].pointer[k]; } } @@ -296,26 +222,144 @@ primermatchcount_t buildPrimerPairsForOneSeq(uint32_t seqid,pprimercount_t prime { if (primers->primers[i].reverseCount[seqid]==1) { - matches[j].word = primers->primers[i].word; + matches[j].primer = primers->primers+i; matches[j].strand=FALSE; - matches[j].good=primers->primers[i].good;/*TR: Added*/ matches[j].position=primers->primers[i].reversePos[seqid].value; j++; } else for (k=0; k < primers->primers[i].reverseCount[seqid]; k++,j++) { - matches[j].word = primers->primers[i].word; + matches[j].primer = primers->primers+i; matches[j].strand=FALSE; - matches[j].good=primers->primers[i].good;/*TR: Added*/ matches[j].position=primers->primers[i].reversePos[seqid].pointer[k]; } } } - sortmatch(matches,matchcount); // sort in asscending order by position - - /*TR: Added*/ - seqmatchcount.matches = matches; - seqmatchcount.matchcount = matchcount; - return seqmatchcount; + if (matchcount>1) + { +// fprintf(stderr,"\n====================================\n"); + + sortmatch(matches,matchcount); // sort in ascending order by position + + for (i=0; i < matchcount;i++) + { + // For all primers matching the sequence + + for(j=i+1; + (jprimer_length) < options->lmax); + j++ + ) + + // For all not too far primers + + if ( (matches[i].primer->good || matches[j].primer->good) + && (distance > options->lmin) + ) + { + + // If possible primer pair + + current.p1 = matches[i].primer; + current.asdirect1=matches[i].strand; + current.p2 = matches[j].primer; + current.asdirect2= !matches[j].strand; + current.maxd=DMAX; + current.mind=DMAX; + current.sumd=0; + current.inexample=0; + current.outexample=0; + + + // Standardize the pair + + strand = current.p2->word > current.p1->word; + if (!strand) + { + wswp = current.p1; + current.p1=current.p2; + current.p2=wswp; + + bswp = current.asdirect1; + current.asdirect1=current.asdirect2; + current.asdirect2=bswp; + } + + + // Look for the new pair in already seen pairs + + pcurrent = insertpair(current,pairs); + + + if (seqdb[seqid]->isexample) + + { + pcurrent->inexample++; + pcurrent->sumd+=distance; + + if ((pcurrent->maxd==DMAX) || (distance > pcurrent->maxd)) + pcurrent->maxd = distance; + + if (distance < pcurrent->mind) + pcurrent->mind = distance; + } + else + pcurrent->outexample++; + + if ((pcurrent->outexample+pcurrent->inexample)==1) + { + paircount++; + pcurrent->pcr.ampslot=200; + pcurrent->pcr.ampcount=0; + pcurrent->pcr.amplifias = ECOMALLOC(sizeof(amplifia_t)*pcurrent->pcr.ampslot, + "Cannot allocate amplifia table"); + } + else + { + if (pcurrent->pcr.ampslot==pcurrent->pcr.ampcount) + { + pcurrent->pcr.ampslot+=200; + pcurrent->pcr.amplifias = ECOREALLOC(pcurrent->pcr.amplifias, + sizeof(amplifia_t)*pcurrent->pcr.ampslot, + "Cannot allocate amplifia table"); + } + } + + pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].length=distance; + pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].sequence=seqdb[seqid]; + pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].strand=strand; + + if (strand) + pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].amplifia= seqdb[seqid]->SQ + matches[i].position + options->primer_length; + else + pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].amplifia= seqdb[seqid]->SQ + matches[j].position - 1 ; + + pcurrent->pcr.ampcount++; +// fprintf(stderr,"%c%c W1 : %s direct : %c", +// "bG"[(int)pcurrent->p1->good], +// "bG"[(int)pcurrent->p2->good], +// ecoUnhashWord(pcurrent->p1->word, options->primer_length), +// "><"[(int)pcurrent->asdirect1] +// ); +// +// fprintf(stderr," W2 : %s direct : %c distance : %d (min/max/avg : %d/%d/%f) in/out: %d/%d %c (%d pairs)\n", +// ecoUnhashWord(pcurrent->p2->word, options->primer_length), +// "><"[(int)pcurrent->asdirect2], +// distance, +// pcurrent->mind,pcurrent->maxd, +// (pcurrent->inexample) ? (float)pcurrent->sumd/pcurrent->inexample:0.0, +// pcurrent->inexample,pcurrent->outexample, +// " N"[(pcurrent->outexample+pcurrent->inexample)==1], +// paircount +// +// ); +// + + } + } + } + + pairs->count=paircount; + } diff --git a/tools/ecoPCRFormat.py b/tools/ecoPCRFormat.py new file mode 100755 index 0000000..3884001 --- /dev/null +++ b/tools/ecoPCRFormat.py @@ -0,0 +1,651 @@ +#!/usr/bin/env python + +import re +import gzip +import struct +import sys +import time +import getopt + +try: + import psycopg2 + _dbenable=True +except ImportError: + _dbenable=False + +##### +# +# +# Generic file function +# +# +##### + +def universalOpen(file): + if isinstance(file,str): + if file[-3:] == '.gz': + rep = gzip.open(file) + else: + rep = open(file) + else: + rep = file + return rep + +def universalTell(file): + if isinstance(file, gzip.GzipFile): + file=file.myfileobj + return file.tell() + +def fileSize(file): + if isinstance(file, gzip.GzipFile): + file=file.myfileobj + pos = file.tell() + file.seek(0,2) + length = file.tell() + file.seek(pos,0) + return length + +def progressBar(pos,max,reset=False,delta=[]): + if reset: + del delta[:] + if not delta: + delta.append(time.time()) + delta.append(time.time()) + + delta[1]=time.time() + elapsed = delta[1]-delta[0] + percent = float(pos)/max * 100 + remain = time.strftime('%H:%M:%S',time.gmtime(elapsed / percent * (100-percent))) + bar = '#' * int(percent/2) + bar+= '|/-\\-'[pos % 5] + bar+= ' ' * (50 - int(percent/2)) + sys.stderr.write('\r%5.1f %% |%s] remain : %s' %(percent,bar,remain)) + +##### +# +# +# NCBI Dump Taxonomy reader +# +# +##### + +def endLessIterator(endedlist): + for x in endedlist: + yield x + while(1): + yield endedlist[-1] + +class ColumnFile(object): + + def __init__(self,stream,sep=None,strip=True,types=None): + if isinstance(stream,str): + self._stream = open(stream) + elif hasattr(stream,'next'): + self._stream = stream + else: + raise ValueError,'stream must be string or an iterator' + self._delimiter=sep + self._strip=strip + if types: + self._types=[x for x in types] + for i in xrange(len(self._types)): + if self._types[i] is bool: + self._types[i]=ColumnFile.str2bool + else: + self._types=None + + def str2bool(x): + return bool(eval(x.strip()[0].upper(),{'T':True,'V':True,'F':False})) + + str2bool = staticmethod(str2bool) + + + def __iter__(self): + return self + + def next(self): + ligne = self._stream.next() + data = ligne.split(self._delimiter) + if self._strip or self._types: + data = [x.strip() for x in data] + if self._types: + it = endLessIterator(self._types) + data = [x[1](x[0]) for x in ((y,it.next()) for y in data)] + return data + +def taxonCmp(t1,t2): + if t1[0] < t2[0]: + return -1 + elif t1[0] > t2[0]: + return +1 + return 0 + +def bsearchTaxon(taxonomy,taxid): + taxCount = len(taxonomy) + begin = 0 + end = taxCount + oldcheck=taxCount + check = begin + end / 2 + while check != oldcheck and taxonomy[check][0]!=taxid : + if taxonomy[check][0] < taxid: + begin=check + else: + end=check + oldcheck=check + check = (begin + end) / 2 + + + if taxonomy[check][0]==taxid: + return check + else: + return None + + + +def readNodeTable(file): + + file = universalOpen(file) + + nodes = ColumnFile(file, + sep='|', + types=(int,int,str, + str,str,bool, + int,bool,int, + bool,bool,bool,str)) + print >>sys.stderr,"Reading taxonomy dump file..." + taxonomy=[[n[0],n[2],n[1]] for n in nodes] + print >>sys.stderr,"List all taxonomy rank..." + ranks =list(set(x[1] for x in taxonomy)) + ranks.sort() + ranks = dict(map(None,ranks,xrange(len(ranks)))) + + print >>sys.stderr,"Sorting taxons..." + taxonomy.sort(taxonCmp) + + print >>sys.stderr,"Indexing taxonomy..." + index = {} + for t in taxonomy: + index[t[0]]=bsearchTaxon(taxonomy, t[0]) + + print >>sys.stderr,"Indexing parent and rank..." + for t in taxonomy: + t[1]=ranks[t[1]] + t[2]=index[t[2]] + + + return taxonomy,ranks,index + +def nameIterator(file): + file = universalOpen(file) + names = ColumnFile(file, + sep='|', + types=(int,str, + str,str)) + for taxid,name,unique,classname,white in names: + yield taxid,name,classname + +def mergedNodeIterator(file): + file = universalOpen(file) + merged = ColumnFile(file, + sep='|', + types=(int,int,str)) + for taxid,current,white in merged: + yield taxid,current + +def deletedNodeIterator(file): + file = universalOpen(file) + deleted = ColumnFile(file, + sep='|', + types=(int,str)) + for taxid,white in deleted: + yield taxid + +def readTaxonomyDump(taxdir): + taxonomy,ranks,index = readNodeTable('%s/nodes.dmp' % taxdir) + + print >>sys.stderr,"Adding scientific name..." + + alternativeName=[] + for taxid,name,classname in nameIterator('%s/names.dmp' % taxdir): + alternativeName.append((name,classname,index[taxid])) + if classname == 'scientific name': + taxonomy[index[taxid]].append(name) + + print >>sys.stderr,"Adding taxid alias..." + for taxid,current in mergedNodeIterator('%s/merged.dmp' % taxdir): + index[taxid]=index[current] + + print >>sys.stderr,"Adding deleted taxid..." + for taxid in deletedNodeIterator('%s/delnodes.dmp' % taxdir): + index[taxid]=None + + return taxonomy,ranks,alternativeName,index + +def readTaxonomyDB(dbname): + connection = psycopg2.connect(database=dbname) + + cursor = connection.cursor() + cursor.execute("select numid,rank,parent from ncbi_taxonomy.taxon") + taxonomy=[list(x) for x in cursor] + + cursor.execute("select rank_class from ncbi_taxonomy.taxon_rank_class order by rank_class") + ranks=cursor.fetchall() + ranks = dict(map(None,(x[0] for x in ranks),xrange(len(ranks)))) + + print >>sys.stderr,"Sorting taxons..." + taxonomy.sort(taxonCmp) + + print >>sys.stderr,"Indexing taxonomy..." + index = {} + for t in taxonomy: + index[t[0]]=bsearchTaxon(taxonomy, t[0]) + + print >>sys.stderr,"Indexing parent and rank..." + for t in taxonomy: + t[1]=ranks[t[1]] + try: + t[2]=index[t[2]] + except KeyError,e: + if t[2] is None and t[0]==1: + t[2]=index[t[0]] + else: + raise e + + cursor.execute("select taxid,name,category from ncbi_taxonomy.name") + + alternativeName=[] + for taxid,name,classname in cursor: + alternativeName.append((name,classname,index[taxid])) + if classname == 'scientific name': + taxonomy[index[taxid]].append(name) + + cursor.execute("select old_numid,current_numid from ncbi_taxonomy.taxon_id_alias") + + print >>sys.stderr,"Adding taxid alias..." + for taxid,current in cursor: + if current is not None: + index[taxid]=index[current] + else: + index[taxid]=None + + + return taxonomy,ranks,alternativeName,index + +##### +# +# +# Genbank/EMBL sequence reader +# +# +##### + +def entryIterator(file): + file = universalOpen(file) + rep =[] + for ligne in file: + rep.append(ligne) + if ligne == '//\n': + rep = ''.join(rep) + yield rep + rep = [] + +def fastaEntryIterator(file): + file = universalOpen(file) + rep =[] + for ligne in file: + if ligne[0] == '>' and rep: + rep = ''.join(rep) + yield rep + rep = [] + rep.append(ligne) + if rep: + rep = ''.join(rep) + yield rep + +_cleanSeq = re.compile('[ \n0-9]+') + +def cleanSeq(seq): + return _cleanSeq.sub('',seq) + + +_gbParseID = re.compile('(?<=^LOCUS {7})[^ ]+(?= )',re.MULTILINE) +_gbParseDE = re.compile('(?<=^DEFINITION {2}).+?\. *$(?=[^ ])',re.MULTILINE+re.DOTALL) +_gbParseSQ = re.compile('(?<=^ORIGIN).+?(?=^//$)',re.MULTILINE+re.DOTALL) +_gbParseTX = re.compile('(?<= /db_xref="taxon:)[0-9]+(?=")') + +def genbankEntryParser(entry): + Id = _gbParseID.findall(entry)[0] + De = ' '.join(_gbParseDE.findall(entry)[0].split()) + Sq = cleanSeq(_gbParseSQ.findall(entry)[0].upper()) + try: + Tx = int(_gbParseTX.findall(entry)[0]) + except IndexError: + Tx = None + return {'id':Id,'taxid':Tx,'definition':De,'sequence':Sq} + +###################### + +_cleanDef = re.compile('[\nDE]') + +def cleanDef(definition): + return _cleanDef.sub('',definition) + +_emblParseID = re.compile('(?<=^ID {3})[^ ]+(?=;)',re.MULTILINE) +_emblParseDE = re.compile('(?<=^DE {3}).+?\. *$(?=[^ ])',re.MULTILINE+re.DOTALL) +_emblParseSQ = re.compile('(?<=^ ).+?(?=^//$)',re.MULTILINE+re.DOTALL) +_emblParseTX = re.compile('(?<= /db_xref="taxon:)[0-9]+(?=")') + +def emblEntryParser(entry): + Id = _emblParseID.findall(entry)[0] + De = ' '.join(cleanDef(_emblParseDE.findall(entry)[0]).split()) + Sq = cleanSeq(_emblParseSQ.findall(entry)[0].upper()) + try: + Tx = int(_emblParseTX.findall(entry)[0]) + except IndexError: + Tx = None + return {'id':Id,'taxid':Tx,'definition':De,'sequence':Sq} + + +###################### + +_fastaSplit=re.compile(';\W*') + +def parseFasta(seq): + seq=seq.split('\n') + title = seq[0].strip()[1:].split(None,1) + id=title[0] + if len(title) == 2: + field = _fastaSplit.split(title[1]) + else: + field=[] + info = dict(x.split('=',1) for x in field if '=' in x) + definition = ' '.join([x for x in field if '=' not in x]) + seq=(''.join([x.strip() for x in seq[1:]])).upper() + return id,seq,definition,info + + +def fastaEntryParser(entry): + id,seq,definition,info = parseFasta(entry) + Tx = info.get('taxid',None) + if Tx is not None: + Tx=int(Tx) + return {'id':id,'taxid':Tx,'definition':definition,'sequence':seq} + + +def sequenceIteratorFactory(entryParser,entryIterator): + def sequenceIterator(file): + for entry in entryIterator(file): + yield entryParser(entry) + return sequenceIterator + + +def taxonomyInfo(entry,connection): + taxid = entry['taxid'] + curseur = connection.cursor() + curseur.execute(""" + select taxid,species,genus,family, + taxonomy.scientificName(taxid) as sn, + taxonomy.scientificName(species) as species_sn, + taxonomy.scientificName(genus) as genus_sn, + taxonomy.scientificName(family) as family_sn + from + ( + select alias as taxid, + taxonomy.getSpecies(alias) as species, + taxonomy.getGenus(alias) as genus, + taxonomy.getFamily(alias) as family + from taxonomy.aliases + where id=%d ) as tax + """ % taxid) + rep = curseur.fetchone() + entry['current_taxid']=rep[0] + entry['species']=rep[1] + entry['genus']=rep[2] + entry['family']=rep[3] + entry['scientific_name']=rep[4] + entry['species_sn']=rep[5] + entry['genus_sn']=rep[6] + entry['family_sn']=rep[7] + return entry + +##### +# +# +# Binary writer +# +# +##### + +def ecoSeqPacker(sq): + + compactseq = gzip.zlib.compress(sq['sequence'],9) + cptseqlength = len(compactseq) + delength = len(sq['definition']) + + totalSize = 4 + 20 + 4 + 4 + 4 + cptseqlength + delength + + packed = struct.pack('> I I 20s I I I %ds %ds' % (delength,cptseqlength), + totalSize, + sq['taxid'], + sq['id'], + delength, + len(sq['sequence']), + cptseqlength, + sq['definition'], + compactseq) + + assert len(packed) == totalSize+4, "error in sequence packing" + + return packed + +def ecoTaxPacker(tx): + + namelength = len(tx[3]) + + totalSize = 4 + 4 + 4 + 4 + namelength + + packed = struct.pack('> I I I I I %ds' % namelength, + totalSize, + tx[0], + tx[1], + tx[2], + namelength, + tx[3]) + + return packed + +def ecoRankPacker(rank): + + namelength = len(rank) + + packed = struct.pack('> I %ds' % namelength, + namelength, + rank) + + return packed + +def ecoNamePacker(name): + + namelength = len(name[0]) + classlength= len(name[1]) + totalSize = namelength + classlength + 4 + 4 + 4 + 4 + + packed = struct.pack('> I I I I I %ds %ds' % (namelength,classlength), + totalSize, + int(name[1]=='scientific name'), + namelength, + classlength, + name[2], + name[0], + name[1]) + + return packed + +def ecoSeqWriter(file,input,taxindex,parser): + output = open(file,'wb') + input = universalOpen(input) + inputsize = fileSize(input) + entries = parser(input) + seqcount=0 + skipped = [] + + output.write(struct.pack('> I',seqcount)) + + progressBar(1, inputsize,reset=True) + for entry in entries: + if entry['taxid'] is not None: + try: + entry['taxid']=taxindex[entry['taxid']] + except KeyError: + entry['taxid']=None + if entry['taxid'] is not None: + seqcount+=1 + output.write(ecoSeqPacker(entry)) + else: + skipped.append(entry['id']) + where = universalTell(input) + progressBar(where, inputsize) + print >>sys.stderr," Readed sequences : %d " % seqcount, + else: + skipped.append(entry['id']) + + print >>sys.stderr + output.seek(0,0) + output.write(struct.pack('> I',seqcount)) + + output.close() + return skipped + + +def ecoTaxWriter(file,taxonomy): + output = open(file,'wb') + output.write(struct.pack('> I',len(taxonomy))) + + for tx in taxonomy: + output.write(ecoTaxPacker(tx)) + + output.close() + +def ecoRankWriter(file,ranks): + output = open(file,'wb') + output.write(struct.pack('> I',len(ranks))) + + rankNames = ranks.keys() + rankNames.sort() + + for rank in rankNames: + output.write(ecoRankPacker(rank)) + + output.close() + +def nameCmp(n1,n2): + name1=n1[0].upper() + name2=n2[0].upper() + if name1 < name2: + return -1 + elif name1 > name2: + return 1 + return 0 + + +def ecoNameWriter(file,names): + output = open(file,'wb') + output.write(struct.pack('> I',len(names))) + + names.sort(nameCmp) + + for name in names: + output.write(ecoNamePacker(name)) + + output.close() + +def ecoDBWriter(prefix,taxonomy,seqFileNames,parser): + + ecoRankWriter('%s.rdx' % prefix, taxonomy[1]) + ecoTaxWriter('%s.tdx' % prefix, taxonomy[0]) + ecoNameWriter('%s.ndx' % prefix, taxonomy[2]) + + filecount = 0 + for filename in seqFileNames: + filecount+=1 + sk=ecoSeqWriter('%s_%03d.sdx' % (prefix,filecount), + filename, + taxonomy[3], + parser) + if sk: + print >>sys.stderr,"Skipped entry :" + print >>sys.stderr,sk + +def ecoParseOptions(arguments): + opt = { + 'prefix' : 'ecodb', + 'taxdir' : 'taxdump', + 'parser' : sequenceIteratorFactory(genbankEntryParser, + entryIterator) + } + + o,filenames = getopt.getopt(arguments, + 'ht:T:n:gfe', + ['help', + 'taxonomy=', + 'taxonomy_db=', + 'name=', + 'genbank', + 'fasta', + 'embl']) + + for name,value in o: + if name in ('-h','--help'): + printHelp() + exit() + elif name in ('-t','--taxonomy'): + opt['taxmod']='dump' + opt['taxdir']=value + elif name in ('-T','--taxonomy_db'): + opt['taxmod']='db' + opt['taxdb']=value + elif name in ('-n','--name'): + opt['prefix']=value + elif name in ('-g','--genbank'): + opt['parser']=sequenceIteratorFactory(genbankEntryParser, + entryIterator) + + elif name in ('-f','--fasta'): + opt['parser']=sequenceIteratorFactory(fastaEntryParser, + fastaEntryIterator) + + elif name in ('-e','--embl'): + opt['parser']=sequenceIteratorFactory(emblEntryParser, + entryIterator) + else: + raise ValueError,'Unknown option %s' % name + + return opt,filenames + +def printHelp(): + print "-----------------------------------" + print " ecoPCRFormat.py" + print "-----------------------------------" + print "ecoPCRFormat.py [option] " + print "-----------------------------------" + print "-e --embl :[E]mbl format" + print "-f --fasta :[F]asta format" + print "-g --genbank :[G]enbank format" + print "-h --help :[H]elp - print this help" + print "-n --name :[N]ame of the new database created" + print "-t --taxonomy :[T]axonomy - path to the taxonomy database" + print " :bcp-like dump from GenBank taxonomy database." + print "-----------------------------------" + +if __name__ == '__main__': + + opt,filenames = ecoParseOptions(sys.argv[1:]) + + if opt['taxmod']=='dump': + taxonomy = readTaxonomyDump(opt['taxdir']) + elif opt['taxmod']=='db': + taxonomy = readTaxonomyDB(opt['taxdb']) + + + ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser']) +