/*
 * ecoprimer.c
 *
 *  Created on: 7 nov. 2008
 *      Author: coissac
 *
 * Command-line driver: parses the options, loads the taxonomy and sequence
 * databases, looks for strict then approximate primers, builds primer pairs
 * and prints per-pair statistics on stdout (progress goes to stderr).
 */

#include "libecoprimer/ecoprimer.h"

/* NOTE(review): the system header names were missing from the reviewed copy
 * of this file; the list below is reconstructed from the symbols used here
 * -- confirm against the original include list. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <time.h>
#include <sys/time.h>
#include <getopt.h>

#define VERSION "0.1"

/* TR: by default, statistics are made on species level */
#define DEFULTTAXONRANK "species"

/* Number of primers echoed on stderr as a sample.
 * NOTE(review): the original loop bound was unreadable in the reviewed copy;
 * 10 is an assumption -- confirm. */
#define PRIMERSAMPLESIZE 10

/* ----------------------------------------------- */
/* printout help                                   */
/* ----------------------------------------------- */

/* Print the program banner (name and version) on stdout. */
static void PrintHelp(void)
{
    printf("------------------------------------------\n");
    printf(" ecoPrimer Version %s\n", VERSION);
    printf("------------------------------------------\n");
}

/*
 * Print the usage summary on stdout.
 *
 * stat: when non-zero the process exits with that status; when zero the
 *       function returns to the caller.
 */
static void ExitUsage(int stat)
{
    printf("usage: ecoprimer [-d database] [-l value] [-L value] [-e value] [-r taxid] [-i taxid] [-R rank] [-t taxon level]\n");
    printf("type \"ecoprimer -h\" for help\n");

    if (stat)
    {
        exit(stat);
    }
}

/*
 * Read a decimal integer from text into *value.
 * Returns 1 when a number was read, 0 otherwise (*value is then untouched).
 */
static int parseintoption(const char *text, int *value)
{
    return sscanf(text, "%d", value) == 1;
}

/*
 * Read a floating point number from text into *value.
 * Returns 1 when a number was read, 0 otherwise (*value is then untouched).
 */
static int parsefloatoption(const char *text, float *value)
{
    return sscanf(text, "%f", value) == 1;
}

/* Fill *options with the default value of every option. */
void initoptions(poptions_t options)
{
    options->lmin=0;                      /* Amplifia minimal length */
    options->lmax=0;                      /* Amplifia maximal length */
    options->error_max=3;                 /* maximum error count in fuzzy search */
    options->primer_length=18;            /* minimal length of the primers */
    options->restricted_taxid=NULL;       /* limit amplification below these taxid */
    options->ignored_taxid=NULL;          /* no amplification below these taxid */
    options->prefix=NULL;
    options->circular=0;
    options->doublestrand=1;
    options->strict_quorum=0.7;
    options->strict_exclude_quorum=0.1;
    options->sensitivity_quorum=0.9;
    options->false_positive_quorum=0.1;
    options->strict_three_prime=2;
    options->r=0;
    options->g=0;
    options->no_multi_match=FALSE;
    strcpy(options->taxonrank, DEFULTTAXONRANK); /* taxon level for results, species by default */
}

/*
 * Print the current time on stderr as "#<seconds since epoch>#, <text>",
 * the text being formatted as "ddd yyyy-mm-dd hh:mm:ss zzz".
 */
void printcurrenttime(void)
{
    time_t     now;
    struct tm *ts;
    char       buf[80];

    /* Get the current time */
    now = time(NULL);

    /* Format the time; fall back to an empty text if the conversion fails */
    ts = localtime(&now);
    if (ts == NULL || strftime(buf, sizeof(buf), "%a %Y-%m-%d %H:%M:%S %Z", ts) == 0)
    {
        buf[0] = '\0';
    }

    /* time_t is not guaranteed to be an int: print it through a long */
    fprintf(stderr,"#%ld#, %s\n",(long)now, buf);
}

/*
 * Print the current local time on stderr as hour:min:sec followed by the
 * microsecond part and that same part divided by 1000.
 */
void printcurrenttimeinmilli(void)
{
    struct timeval tv;
    struct tm     *tm;
    time_t         seconds;

    /* The timezone argument of gettimeofday is obsolete: pass NULL */
    gettimeofday(&tv, NULL);

    /* localtime wants a time_t*, which tv_sec is not guaranteed to be */
    seconds = tv.tv_sec;
    tm = localtime(&seconds);
    if (tm == NULL)
    {
        return;
    }

    /* tv_usec has a platform-dependent integer type: print it through a long */
    fprintf(stderr, " %d:%02d:%02d %ld %ld \n", tm->tm_hour, tm->tm_min,
            tm->tm_sec, (long)tv.tv_usec, (long)(tv.tv_usec/1000));
}

/*TR: Added*/
/*
 * Print one tab separated line per primer pair on stdout, after a header line.
 *
 * pairs:       the primer pairs to report
 * options:     supplies the primer length and the taxon rank label
 * rankdbstats: denominator of the sensitivity percentage (main passes the
 *              value returned by getrankdbstats)
 * seqdbsize:   number of sequences in the database
 *
 * Pairs whose inexample count is below 70% of seqdbsize are skipped.
 * NOTE(review): the 70% threshold is hard coded here although the options
 * carry a strict_quorum defaulting to 0.7 -- confirm whether they should be
 * tied together.
 */
void printpairs (pairscount_t pairs, poptions_t options, int32_t rankdbstats, uint32_t seqdbsize)
{
    uint32_t i;
    uint32_t wordsize = options->primer_length;
    uint32_t quorumseqs;
    double   sens;
    double   speci;
    float    avg;

    /* 64 bit intermediate so that seqdbsize * 70 cannot wrap around */
    quorumseqs = (uint32_t)(((uint64_t)seqdbsize * 70) / 100);

    printf("primer_1\tseq_1\tPrimer_2\tseq_2\tamplifia_count\t%s_snes\t%s_spe\tmin_l\tmax_l\tavr_l\n",
           options->taxonrank, options->taxonrank);

    for (i=0; i < pairs.paircount; i++)
    {
        if (quorumseqs > pairs.pairs[i].inexample)
        {
            continue;
        }

        sens  = (pairs.pairs[i].taxsetindex*1.0)/rankdbstats*100;
        speci = (pairs.pairs[i].oktaxoncount*1.0)/pairs.pairs[i].taxsetindex*100;
        avg   = (pairs.pairs[i].mind+pairs.pairs[i].maxd)*1.0/2;

        /* One ecoUnhashWord call per printf, as in the original code */
        printf("P1\t%s", ecoUnhashWord(pairs.pairs[i].w1, wordsize));
        printf("\tP2\t%s", ecoUnhashWord(pairs.pairs[i].w2, wordsize));
        printf("\t%d", pairs.pairs[i].inexample);
        printf("\t%3.2f", sens);
        printf("\t%3.2f", speci);
        printf("\t%d", pairs.pairs[i].mind);
        printf("\t%d", pairs.pairs[i].maxd);
        printf("\t%3.2f\n", avg);
    }
}

/*
 * updateseqparams: counts the insample and outsample sequences and tags each
 * sequence with the taxon, at the requested rank, to which it belongs.
 *
 * insamples / outsamples are incremented, not reset: the caller initialises
 * them. A sequence for which no taxon is found at the requested rank keeps
 * its previous ranktaxonid.
 */
void updateseqparams (pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxonomy,
        poptions_t options, int32_t *insamples, int32_t *outsamples)
{
    uint32_t i;
    int32_t  taxid;
    ecotx_t *tmptaxon;

    /* NOTE(review): this loop header was damaged in the reviewed copy and
     * has been reconstructed -- confirm. */
    for (i=0;i<seqdbsize;i++)
    {
        seqdb[i]->isexample=isGoodTaxon(taxonomy,seqdb[i]->taxid,options);
        if (seqdb[i]->isexample)
        {
            (*insamples)++;
        }
        else
        {
            (*outsamples)++;
        }

        taxid = taxonomy->taxons->taxon[seqdb[i]->taxid].taxid;
        tmptaxon = eco_findtaxonbytaxid(taxonomy, taxid);
        if (tmptaxon)
        {
            tmptaxon = eco_findtaxonatrank(tmptaxon, options->taxonrankidx);
        }
        if (tmptaxon)
        {
            seqdb[i]->ranktaxonid = tmptaxon->taxid;
        }
    }
}

/*
 * Look options->taxonrank up among the rank labels of the taxonomy and store
 * its index in options->taxonrankidx. Unknown labels are fatal: a message is
 * printed on stderr and the process exits with a failure status.
 */
void setresulttaxonrank (ecotaxonomy_t *taxonomy, poptions_t options)
{
    int32_t i;

    /*set taxon rank for which result is to be given*/
    for (i = 0; i < taxonomy->ranks->count; i++)
    {
        if (strcmp(taxonomy->ranks->label[i], options->taxonrank) == 0)
        {
            options->taxonrankidx = i;
            break;
        }
    }

    if (i == taxonomy->ranks->count)
    {
        fprintf(stderr,"\nUnknown taxon level: '%s'\n", options->taxonrank);
        exit(EXIT_FAILURE);  /* this is an error: do not report success */
    }
}

/*
 * To get db stats, totals of species, genus etc....
 *
 * Returns the number of distinct taxon ids, at rank options->taxonrankidx,
 * reached from the sequences of the database. Sequences for which no taxon
 * is found at that rank are not counted.
 */
int32_t getrankdbstats(pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxonomy,
        poptions_t options)
{
    uint32_t i;
    uint32_t j;
    uint32_t nameslots = 500;
    uint32_t namesindex = 0;
    int32_t *ranktaxonids = ECOMALLOC(nameslots * sizeof(int32_t), "Error in taxon rank allocation");
    int32_t  taxid;
    ecotx_t *tmptaxon;

    /* NOTE(review): this loop header was damaged in the reviewed copy and
     * has been reconstructed -- confirm. */
    for (i=0;i<seqdbsize;i++)
    {
        taxid = taxonomy->taxons->taxon[seqdb[i]->taxid].taxid;
        tmptaxon = eco_findtaxonbytaxid(taxonomy, taxid);
        if (tmptaxon)
        {
            tmptaxon = eco_findtaxonatrank(tmptaxon, options->taxonrankidx);
        }
        if (tmptaxon)
        {
            /* linear search of the ids collected so far */
            for (j = 0; j < namesindex; j++)
            {
                if (tmptaxon->taxid == ranktaxonids[j])
                {
                    break;
                }
            }
            if (j < namesindex)
            {
                continue;  /* name is already in list, so no need to add it*/
            }

            /* grow the table by 500 slots whenever it is full */
            if (namesindex == nameslots)
            {
                nameslots += 500;
                ranktaxonids = ECOREALLOC(ranktaxonids, nameslots * sizeof(int32_t), "Cannot allocate pair rank taxon table");
            }
            ranktaxonids[namesindex] = tmptaxon->taxid;
            namesindex++;
        }
    }

    /* only the count is needed by the caller */
    ECOFREE(ranktaxonids, "free rank taxon table");

    return namesindex;
}

/*
 * For every pair, increments oktaxoncount once for each amplifia of each
 * taxset entry whose tally (taxcount below) ends up being exactly 1.
 *
 * The tally is built by scanning the ampset entries holding the same
 * amplifia pointer: an entry with seqidindex > 1 adds its seqidindex and
 * stops the scan; otherwise the tally grows each time taxonids[0] differs
 * from the first taxon id remembered, and the scan stops once it exceeds 1.
 */
void setoktaxforspecificity (ppairscount_t pairs)
{
    uint32_t i;
    uint32_t j;
    uint32_t k;
    uint32_t l;
    int      taxcount;
    int32_t  taxid;

    for (i = 0; i < pairs->paircount; i++)
    {
        for (j = 0; j < pairs->pairs[i].taxsetindex; j++)
        {
            for (k = 0; k < pairs->pairs[i].taxset[j].amplifiaindex; k++)
            {
                taxid = 0;
                taxcount = 0;
                for (l = 0; l < pairs->pairs[i].ampsetindex; l++)
                {
                    /*compare only char pointers because for equal strings we have same pointer in both sets*/
                    if (pairs->pairs[i].taxset[j].amplifia[k] == pairs->pairs[i].ampset[l].amplifia)
                    {
                        if (pairs->pairs[i].ampset[l].seqidindex > 1)
                        {
                            taxcount += pairs->pairs[i].ampset[l].seqidindex;
                            break;
                        }

                        if (taxid != pairs->pairs[i].ampset[l].taxonids[0])
                        {
                            if (!taxid)
                            {
                                taxid = pairs->pairs[i].ampset[l].taxonids[0];
                            }
                            taxcount++;
                        }

                        if (taxcount > 1)
                        {
                            break;
                        }
                    }
                }

                if (taxcount == 1)
                {
                    pairs->pairs[i].oktaxoncount++;
                }
            }
        }
    }
}

/*
 * Program entry point.
 *
 * Options: -d prefix, -h, -l/-L min/max amplification length, -e max errors,
 * -3 strict three prime match, -q/-s/-x quorums, -t taxon rank, -D/-S double
 * or single strand, -U no multi match, -r/-i restricted/ignored taxid
 * (both repeatable), -c circular sequences.
 *
 * An unknown option, an unreadable numeric value or a missing -d prints the
 * usage and exits with status 1.
 */
int main(int argc, char **argv)
{
    pecodnadb_t    seqdb;  /* of type ecoseq_t */
    uint32_t       seqdbsize=0;
    ecotaxonomy_t *taxonomy;

    options_t      options;
    int            carg;
    int32_t        errflag=0;

    int32_t        insamples=0;
    int32_t        outsamples=0;
    uint32_t       i;
    uint32_t       samplesize;

    pwordcount_t   words;
    pprimercount_t primers;
    pairscount_t   pairs;

    int32_t        rankdbstats = 0;

    /* Numeric options are read into these temporaries and then assigned, so
     * that an unreadable value never leaves a field half written and the
     * scanf conversion never depends on the exact type of the field. */
    int            intvalue;
    float          floatvalue;

    initoptions(&options);

    while ((carg = getopt(argc, argv, "hcUDSd:l:L:e:i:r:q:3:s:x:t:")) != -1)
    {
        switch (carg)
        {
            /* -------------------- */
            case 'd': /* database name */
            /* -------------------- */
                options.prefix = ECOMALLOC(strlen(optarg)+1, "Error on prefix allocation");
                strcpy(options.prefix,optarg);
                break;

            /* -------------------- */
            case 'h': /* help */
            /* -------------------- */
                PrintHelp();
                exit(0);

            /* ------------------------- */
            case 'l': /* min amplification length */
            /* ------------------------- */
                if (parseintoption(optarg, &intvalue))
                {
                    options.lmin = intvalue;
                }
                else
                {
                    errflag++;
                }
                break;

            /* -------------------------- */
            case 'L': /* max amplification length */
            /* -------------------------- */
                if (parseintoption(optarg, &intvalue))
                {
                    options.lmax = intvalue;
                }
                else
                {
                    errflag++;
                }
                break;

            /* -------------------- */
            case 'e': /* error max */
            /* -------------------- */
                if (parseintoption(optarg, &intvalue))
                {
                    options.error_max = intvalue;
                }
                else
                {
                    errflag++;
                }
                break;

            /* ------------------------ */
            case '3': /* three prime strict match */
            /* ------------------------ */
                if (parseintoption(optarg, &intvalue))
                {
                    options.strict_three_prime = intvalue;
                }
                else
                {
                    errflag++;
                }
                break;

            /* -------------------- */
            case 'q': /* strict matching quorum */
            /* -------------------- */
                if (parsefloatoption(optarg, &floatvalue))
                {
                    options.strict_quorum = floatvalue;
                }
                else
                {
                    errflag++;
                }
                break;

            /* -------------------- */
            case 's': /* sensitivity quorum */
            /* -------------------- */
                if (parsefloatoption(optarg, &floatvalue))
                {
                    options.sensitivity_quorum = floatvalue;
                }
                else
                {
                    errflag++;
                }
                break;

            /* -------------------- */
            case 't': /* required taxon level for results */
            /* -------------------- */
                strncpy(options.taxonrank, optarg, 19);
                options.taxonrank[19] = 0;
                break;

            /* -------------------- */
            case 'x': /* false positive quorum */
            /* -------------------- */
                if (parsefloatoption(optarg, &floatvalue))
                {
                    options.false_positive_quorum = floatvalue;
                }
                else
                {
                    errflag++;
                }
                break;

            /* ---------------------------- */
            case 'D': /* set in double strand mode */
            /* ---------------------------- */
                options.doublestrand=1;
                break;

            /* ---------------------------- */
            case 'S': /* set in single strand mode */
            /* ---------------------------- */
                options.doublestrand=0;
                break;

            /* ---------------------------- */
            case 'U': /* set the no multi match flag */
            /* ---------------------------- */
                options.no_multi_match=TRUE;
                break;

            /* ------------------------------------------ */
            case 'r': /* stores the restricting search taxonomic id */
            /* ------------------------------------------ */
                /* read the value first so that a bad one is never stored */
                if (!parseintoption(optarg, &intvalue))
                {
                    errflag++;
                    break;
                }
                options.restricted_taxid = ECOREALLOC(options.restricted_taxid,sizeof(int32_t)*(options.r+1),
                                                      "Error on restricted_taxid reallocation");
                options.restricted_taxid[options.r] = intvalue;
                options.r++;
                break;

            /* --------------------------------- */
            case 'i': /* stores the taxonomic id to ignore */
            /* --------------------------------- */
                if (!parseintoption(optarg, &intvalue))
                {
                    errflag++;
                    break;
                }
                options.ignored_taxid = ECOREALLOC(options.ignored_taxid,sizeof(int32_t)*(options.g+1),
                                                   "Error on excluded_taxid reallocation");
                options.ignored_taxid[options.g] = intvalue;
                options.g++;
                break;

            /* -------------------- */
            case 'c': /* sequences are circular */
            /* -------------------- */
                options.circular = 1;
                break;

            /* -------------------- */
            case '?': /* bad option */
            default:
            /* -------------------- */
                errflag++;
                break;
        }
    }

    /* errflag used to be counted and then ignored */
    if (errflag)
    {
        ExitUsage(1);
    }

    /* Both databases are read from the prefix: nothing can be done without it */
    if (options.prefix == NULL)
    {
        fprintf(stderr,"No database specified (option -d)\n");
        ExitUsage(1);
    }

    fprintf(stderr,"Reading taxonomy database ...");
    taxonomy = read_taxonomy(options.prefix,0);
    fprintf(stderr,"Ok\n");

    setresulttaxonrank(taxonomy, &options); /*TR: set rank level for statistics*/

    fprintf(stderr,"Reading sequence database ...\n");
    seqdb = readdnadb(options.prefix,&seqdbsize);
    fprintf(stderr,"Ok\n");
    fprintf(stderr,"Sequence read : %d\n",(int32_t)seqdbsize);

    updateseqparams(seqdb, seqdbsize, taxonomy, &options, &insamples , &outsamples);
    rankdbstats = getrankdbstats(seqdb, seqdbsize, taxonomy, &options);

    fprintf(stderr,"Database is constituted of %5d examples\n",insamples);
    fprintf(stderr," and %5d counterexamples\n",outsamples);
    fprintf(stderr,"Total distinct %s count %d\n",options.taxonrank, rankdbstats);

    fprintf(stderr,"\nIndexing words in sequences\n");

    printcurrenttimeinmilli();
    words = lookforStrictPrimer(seqdb,seqdbsize,insamples,&options);
    printcurrenttimeinmilli();

    fprintf(stderr,"\n Strict primer count : %d\n",words->size);

    if (options.no_multi_match)
    {
        (void)filterMultiStrictPrimer(words);
        fprintf(stderr,"\n Strict primer with single match count : %d\n",words->size);
    }

    fprintf(stderr,"\n\n Primer sample : \n");
    samplesize = (words->size < PRIMERSAMPLESIZE) ? words->size : PRIMERSAMPLESIZE;
    for (i=0; i<samplesize; i++)
    {
        fprintf(stderr," + Primer : %s sequence count : %d\n",ecoUnhashWord(words->words[i],options.primer_length),words->strictcount[i]);
    }

    fprintf(stderr,"\nEncoding sequences for fuzzy pattern matching...\n");
    /* NOTE(review): the body of this loop was lost in the reviewed copy;
     * a call to encodeSequence on every sequence is assumed -- confirm. */
    for (i=0;i<seqdbsize;i++)
    {
        encodeSequence(seqdb[i]);
    }

    ECOFREE(words->strictcount,"Free strict primer count table");

    primers = lookforAproxPrimer(seqdb,seqdbsize,insamples,words,&options);

    ECOFREE(words->words,"Free strict primer table");
    ECOFREE(words,"Free strict primer structure");

    fprintf(stderr,"\n\n Approximate repeats :%d \n", primers->size);

    fprintf(stderr,"\n\n Primer sample : \n");
    samplesize = (primers->size < PRIMERSAMPLESIZE) ? primers->size : PRIMERSAMPLESIZE;
    for (i=0; i<samplesize; i++)
    {
        fprintf(stderr," + Primer : %s example sequence count : %5d counterexample sequence count : %5d status : %s\n",ecoUnhashWord(primers->primers[i].word,options.primer_length),
                primers->primers[i].inexample,
                primers->primers[i].outexample,
                primers->primers[i].good ? "good":"bad");
    }

    fprintf(stderr,"\n");

    /*TR: Added*/
    pairs = buildPrimerPairs(seqdb, seqdbsize, primers, &options);
    setoktaxforspecificity (&pairs);
    printpairs (pairs, &options, rankdbstats, seqdbsize);

    ECOFREE(pairs.pairs,"Free pairs table");

    return 0;
}