/*
 * ecoprimer.c
 *
 *  Created on: 7 nov. 2008
 *      Author: coissac
 *
 * Command-line driver: parses the options, loads the taxonomy and sequence
 * databases, searches strict then approximate primers, builds primer pairs
 * and prints the result table on stdout (progress and diagnostics go to
 * stderr).
 */

#include "libecoprimer/ecoprimer.h"
/*
 * NOTE(review): the names of the eight system headers below were lost from
 * the copy of the file this revision was made from. stdio/string/stdlib/
 * getopt/dlfcn are required by calls visible in this file; ctype, time and
 * sys/time are assumed -- confirm against the upstream source.
 */
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#include <getopt.h>
#include <time.h>
#include <sys/time.h>
#include <dlfcn.h>
#include"libthermo/nnparams.h"
#include"libthermo/thermostats.h"

#define VERSION "0.3"
/* TR: by default, statistics are made on species level*/
#define DEFAULTTAXONRANK "species"

/*
 * Number of primers echoed on stderr as a sample.
 * NOTE(review): the original loop bound was lost (only "...->size)" was
 * still readable); 10 is an assumption -- confirm against upstream.
 */
#define SAMPLE_PRIMER_COUNT 10

static int cmpprintedpairs(const void* p1,const void* p2);

/* Handle of the optional melting-temperature library (see openlibman). */
void* lib_handle = NULL;
/* Entry point resolved from that library by openlibman(). */
float (*calcMelTemp)(char*, char*);

/**
 * Loads ./libPHunterLib.dylib and resolves the melting-temperature function
 * into calcMelTemp. Terminates the process with EXIT_FAILURE when either the
 * library or the symbol cannot be found.
 */
void openlibman (void)
{
    /* The symbol is looked up by its C++ mangled name. */
    const char *lib_name    = "./libPHunterLib.dylib";
    const char *symbol_name = "_Z27calculateMeltingTemperaturePcS_";

    /* Open the library. */
    lib_handle = dlopen(lib_name, RTLD_NOW);
    if (lib_handle) {
        fprintf(stderr, "[%s] dlopen(\"%s\", RTLD_NOW): Successful\n", __FILE__, lib_name);
    }
    else {
        fprintf(stderr, "[%s] Unable to open library: %s\n",
            __FILE__, dlerror());
        exit(EXIT_FAILURE);
    }

    /* Get the symbol address. */
    calcMelTemp = dlsym(lib_handle, symbol_name);
    if (calcMelTemp) {
        /* Report the symbol that was actually resolved. */
        fprintf(stderr, "[%s] dlsym(lib_handle, \"%s\"): Successful\n", __FILE__, symbol_name);
    }
    else {
        fprintf(stderr, "[%s] Unable to get symbol: %s\n",
            __FILE__, dlerror());
        exit(EXIT_FAILURE);
    }
}

/**
 * Closes the library opened by openlibman(), if any.
 */
void closlibman (void)
{
    if (lib_handle) dlclose(lib_handle);
}

/* ----------------------------------------------- */
/* printout help                                   */
/* ----------------------------------------------- */
#define PP fprintf(stdout,

/**
 * Prints the full help text on stdout.
 */
static void PrintHelp(void)
{
    PP "------------------------------------------\n");
    PP " ecoPrimer Version %s\n", VERSION);
    PP "------------------------------------------\n");
    PP "synopsis : finding primers and measureing the quality of primers and barcode region\n");
    PP "usage: ./ecoPrimer [options] \n");
    PP "------------------------------------------\n");
    PP "options:\n");
    PP "-d : [D]atabase : to match the expected format, the database\n");
    PP " has to be formated first by the ecoPCRFormat.py program located.\n");
    PP " in the ecoPCR/tools directory.\n");
    PP " ecoPCRFormat.py creates three file types :\n");
    PP " .sdx : contains the sequences\n");
    PP " .tdx : contains information concerning the taxonomy\n");
    PP " .rdx : contains the taxonomy rank\n\n");
    PP " ecoPrimer needs all the file type. As a result, you have to write the\n");
    PP " database radical without any extension. For example /ecoPrimerDB/fstvert\n\n");
    /* initoptions() sets error_max to 3, so the help must say 3. */
    PP "-e : [E]rror : max error allowed by oligonucleotide (3 by default)\n\n");
    PP "-h : [H]elp - print help\n\n");
    PP "-i : [I]gnore the given taxonomy id.\n\n");
    PP "-l : minimum [L]ength : define the minimum amplication length. \n\n");
    PP "-L : maximum [L]ength : define the maximum amplicationlength. \n\n");
    PP "-r : [R]estricts the search to the given taxonomic id.\n\n");
    PP "-c : Consider that the database sequences are [c]ircular\n\n");
    PP "-q : Strict matching [q]uorum, percentage of the sequences in which strict primers are found. By default it is 70\n\n");
    PP "-s : [S]ensitivity quorum\n\n");
    PP "-t : required [t]axon level for results, by default the results are computed at species level\n\n");
    PP "-x : false positive quorum\n\n");
    PP "-D : set in [d]ouble strand mode\n\n");
    PP "-S : Set in [s]ingle strand mode\n\n");
    PP "-U : No multi match\n\n");
    PP "\n");
    PP "------------------------------------------\n");
    PP "Table result description : \n");
    PP "column 1 : serial number\n");
    PP "column 2 : primer1\n");
    PP "column 3 : primer2\n");
    PP "column 4 : good/bad\n");
    PP "column 5 : in sequence count\n");
    PP "column 6 : out sequence count\n");
    PP "column 7 : yule\n");
    PP "column 8 : in taxa count\n");
    PP "column 9 : out taxa count\n");
    PP "column 10 : coverage\n");
    PP "column 11 : unambiguously identified taxa\n");
    PP "column 12 : specificity\n");
    PP "column 13 : minimum amplified length\n");
    PP "column 14 : maximum amplified length\n");
    PP "column 15 : average amplified length\n");
    PP "------------------------------------------\n");
    PP " http://www.grenoble.prabi.fr/trac/ecoPrimer/\n");
    PP "------------------------------------------\n\n");
    PP "\n");
}

/**
 * Prints the short usage reminder on stdout.
 *
 * @param stat  exit status; when non-zero the process exits with it,
 *              otherwise the function returns to the caller.
 */
static void ExitUsage(int stat)
{
    PP "usage: ecoprimer [-d database] [-l value] [-L value] [-e value] [-r taxid] [-i taxid] [-R rank] [-t taxon level]\n");
    PP "type \"ecoprimer -h\" for help\n");

    if (stat)
        exit(stat);
}

#undef PP

/**
 * Fills an options structure with the program defaults.
 *
 * @param options  structure to initialise (written, never read).
 */
void initoptions(poptions_t options)
{
    options->statistics=FALSE;
    options->filtering=TRUE;
    options->lmin=0;                    /* amplifia minimal length */
    options->lmax=1000;                 /* amplifia maximal length */
    options->error_max=3;               /* maximum error count in fuzzy search */
    options->primer_length=18;          /* minimal length of the primers */
    options->restricted_taxid=NULL;     /* limit amplification below these taxid */
    options->ignored_taxid=NULL;        /* no amplification below these taxid */
    options->prefix=NULL;
    options->reference=NULL;
    options->refseq=NULL;
    options->circular=0;
    options->doublestrand=1;
    options->strict_quorum=0.7;
    options->strict_exclude_quorum=0.1;
    options->sensitivity_quorum=0.9;
    options->false_positive_quorum=0.1;
    options->strict_three_prime=0;
    options->r=0;                       /* number of restricted taxids */
    options->g=0;                       /* number of ignored taxids */
    options->no_multi_match=FALSE;
    options->pnparm = NULL;
    strcpy(options->taxonrank, DEFAULTTAXONRANK);   /* taxon level for results, species by default */
    options->saltmethod = SALT_METHOD_SANTALUCIA;
    options->salt = DEF_SALT;
}

/**
 * Prints one tab-separated result line for a primer pair on stdout.
 *
 * The two primers are complemented when they were not found in direct
 * orientation, then ordered so that the smaller word is printed first
 * (the good/bad flags follow the same swap).
 *
 * @param index    serial number printed in the first column.
 * @param pair     primer pair to print.
 * @param options  run options (primer length, reference sequence).
 */
void printapair(int32_t index,ppair_t pair, poptions_t options)
{
    word_t   w1=pair->p1->word;
    word_t   w2=pair->p2->word;
    word_t   wtmp;
    bool_t   good1=pair->p1->good;
    bool_t   good2=pair->p2->good;
    bool_t   goodtmp;
    bool_t   strand;
    uint32_t i;
    char    *c;
    int      gc1;
    int      gc2;

    if (!pair->asdirect1)
        w1=ecoComplementWord(w1,options->primer_length);

    if (!pair->asdirect2)
        w2=ecoComplementWord(w2,options->primer_length);

    if (w2 < w1)
    {
        wtmp=w1;
        w1=w2;
        w2=wtmp;

        goodtmp=good1;
        good1=good2;
        good2=goodtmp;
    }

    /* print serial number */
    printf("%6d\t",index);

    /*
     * Each primer is printed, and its GC count taken, straight from the
     * string returned by ecoUnhashWord(). The previous version copied it
     * into a 32-byte stack buffer with strcpy(), which overflows as soon
     * as the primer length set with -O reaches 32.
     */

    /* print primer1 */
    c = ecoUnhashWord(w1,options->primer_length);
    printf("%s\t", c);
    gc1 = nparam_CountGCContent(c);

    /* print primer2 */
    c = ecoUnhashWord(w2,options->primer_length);
    printf("%s", c);
    gc2 = nparam_CountGCContent(c);

    /* print primer1 melting temperature */
    printf ("\t%4.3f", pair->p1temp);

    /* print minimum melting temperature of approximate versions of primer1 */
    printf ("\t%4.3f", pair->p1mintemp);

    /* print primer2 melting temperature */
    printf ("\t%4.3f", pair->p2temp);

    /* print minimum melting temperature of approximate versions of primer2 */
    printf ("\t%4.3f", pair->p2mintemp);

    /* print gc contents of primer1 */
    printf ("\t%d",gc1);

    /* print gc contents of primer2 */
    printf ("\t%d",gc2);

    /* print good/bad pair indicator */
    printf("\t%c%c", "bG"[(int)good1],"bG"[(int)good2]);

    /* print inexample count */
    printf("\t%d", pair->inexample);

    /* print out example count */
    printf("\t%d", pair->outexample);

    /* print yule */
    printf("\t%4.3f", pair->yule);

    /* print in taxa count */
    printf("\t%d", pair->intaxa);

    /* print out taxa count */
    printf("\t%d", pair->outtaxa);

    /* print coverage */
    printf("\t%4.3f", (float)pair->bc);

    /* print well identified taxa count */
    printf("\t%d", pair->intaxa - pair->notwellidentifiedtaxa);

    /* print specificity */
    printf("\t%4.3f", pair->bs);

    /* print min amplifia length */
    printf("\t%d", pair->mind);

    /* print max amplifia length */
    printf("\t%d", pair->maxd);

    /* print average amplifia length */
    printf("\t%3.2f", (float)pair->sumd/pair->inexample);

    /* print amplifia information about reference sequence if specified */
    if (options->refseq && pair->refsequence >=0)
    {
        printf("\t%s:",options->reference);
        strand = pair->pcr.amplifias[pair->refsequence].strand;

        if (strand)
            printf("join(");
        else
            printf("complement(");

        printf("%d..%d,%d..%d",pair->pcr.amplifias[pair->refsequence].begin - options->primer_length + 1,
                               pair->pcr.amplifias[pair->refsequence].begin,
                               pair->pcr.amplifias[pair->refsequence].end + 2,
                               pair->pcr.amplifias[pair->refsequence].end + options->primer_length + 1
              );
        printf(")");
        printf("\t");

        /* Walk the amplifia forward on the direct strand, backward (and
         * complemented through ~code & 3) otherwise. */
        for (c=pair->pcr.amplifias[pair->refsequence].amplifia,
             i=pair->pcr.amplifias[pair->refsequence].begin;
             i<=pair->pcr.amplifias[pair->refsequence].end;
             i++,
             c+=(strand)? 1:-1)
            printf("%c","acgt"[(strand)? (*c):(~*c)&3]);
    }
    else
        printf("\t\t");

    printf("\n");
}

/**
 * qsort() comparator ordering primer pairs by decreasing yule * bs score.
 *
 * @param p1  address of a ppair_t.
 * @param p2  address of a ppair_t.
 * @return -1, 0 or 1 so that the higher score sorts first.
 */
static int cmpprintedpairs(const void* p1,const void* p2)
{
    float   s1,s2;
    ppair_t pair1,pair2;

    pair1=*((ppair_t*)p1);
    pair2=*((ppair_t*)p2);

    s1 = pair1->yule * pair1->bs;
    s2 = pair2->yule * pair2->bs;

    if (s1 > s2) return -1;
    if (s1 < s2) return 1;
    return 0;
}

/**
 * Computes the quorums of every pair, keeps those passing both thresholds
 * and sorts the kept pairs with cmpprintedpairs().
 *
 * For each pair: quorumin = inexample / insamples (1.0 when there is no
 * example sequence), quorumout = outexample / outsamples (0.0 when there is
 * no counterexample sequence) and yule = quorumin - quorumout. A pair is
 * kept when quorumin > sensitivity_quorum and quorumout <
 * false_positive_quorum; taxonomycoverage() and taxonomyspecificity() are
 * then called on it.
 *
 * @param sortedpairs  array of pairs, compacted and sorted in place.
 * @param count        number of entries in sortedpairs.
 * @param options      run options holding sample counts and thresholds.
 * @return number of pairs kept at the front of sortedpairs.
 */
uint32_t filterandsortpairs(ppair_t* sortedpairs,uint32_t count, poptions_t options)
{
    uint32_t i,j;
    float    q,qfp;

    for (i=0,j=0;i < count;i++)
    {
        if (options->insamples)
            q = (float)sortedpairs[i]->inexample/options->insamples;
        else
            q=1.0;

        if (options->outsamples)
            qfp = (float)sortedpairs[i]->outexample/options->outsamples;
        else
            qfp=0.0;

        sortedpairs[i]->quorumin  = q;
        sortedpairs[i]->quorumout = qfp;
        sortedpairs[i]->yule      = q - qfp;

        /* Compact in place: slot j is overwritten until a pair is kept. */
        sortedpairs[j]=sortedpairs[i];

        if (q > options->sensitivity_quorum &&
            qfp < options->false_positive_quorum)
        {
            (void)taxonomycoverage(sortedpairs[j],options);
            taxonomyspecificity(sortedpairs[j]);
            j++;
        }
    }

    qsort(sortedpairs,j,sizeof(ppair_t),cmpprintedpairs);

    return j;
}

/**
 * Prints one "# taxid : name (rank)" header line for a taxid given on the
 * command line, or a placeholder when the taxid is not in the taxonomy.
 */
static void printtaxonline(ecotaxonomy_t *taxonomy, int32_t taxid)
{
    ecotx_t *current_taxon = eco_findtaxonbytaxid(taxonomy,taxid);

    if (current_taxon)
        printf("# %d : %s (%s)\n", current_taxon->taxid,
                                   current_taxon->name,
                                   taxonomy->ranks->label[current_taxon->rank]
              );
    else
        printf("# %d : unknown taxid\n", taxid);
}

/**
 * Filters, sorts and prints every primer pair, preceded by a '#'-prefixed
 * header describing the run parameters.
 *
 * @param pairs     pair container; its chained lists are flattened here.
 * @param options   run options.
 * @param taxonomy  taxonomy used to name restricted / ignored taxa.
 */
void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy)
{
    ppair_t*    sortedpairs;
    ppairlist_t pl;
    size_t      i,j;
    size_t      count;
    char       *taxon[]={"taxon","taxa"};

    fprintf(stderr,"Total pair count : %d\n",pairs->count);

    sortedpairs = ECOMALLOC(pairs->count*sizeof(ppair_t),"Cannot Allocate ordered pairs");

    /* Flatten the chained pair lists; an empty chain is simply skipped. */
    j=0;
    for (pl=pairs->first; pl; pl=pl->next)
        for (i=0;i<pl->paircount;i++,j++)
            sortedpairs[j]=pl->pairs+i;

    count=filterandsortpairs(sortedpairs,pairs->count,options);
    getThermoProperties(sortedpairs, count, options);

    fprintf(stderr,"Total good pair count : %u\n",(uint32_t)count);

    printf("#\n");
    printf("# ecoPrimer version %s\n",VERSION);
    printf("# Rank level optimisation : %s\n", options->taxonrank);
    printf("# max error count by oligonucleotide : %d\n",options->error_max);
    printf("#\n");

    if (options->r)
    {
        printf("# Restricted to %s:\n",taxon[(options->r>1) ? 1:0]);
        for(i=0;i<(uint32_t)options->r;i++)
            printtaxonline(taxonomy,options->restricted_taxid[i]);
        printf("#\n");
    }
    if (options->g)
    {
        printf("# Ignore %s:\n",taxon[(options->g>1) ? 1:0]);
        /* ignored_taxid holds options->g entries (not options->r). */
        for(i=0;i<(uint32_t)options->g;i++)
            printtaxonline(taxonomy,options->ignored_taxid[i]);
        printf("#\n");
    }
    printf("# strict primer quorum : %3.2f\n",options->strict_quorum);
    printf("# example quorum : %3.2f\n",options->sensitivity_quorum);
    if (options->g + options->r)
        printf("# counterexample quorum : %3.2f\n",options->false_positive_quorum);

    printf("#\n");
    printf("# database : %s\n",options->prefix);
    printf("# Database is constituted of %5d examples corresponding to %5d %s\n",options->insamples,
            options->intaxa,options->taxonrank);
    printf("# and %5d counterexamples corresponding to %5d %s\n",options->outsamples,
            options->outtaxa,options->taxonrank);
    printf("#\n");

    if (options->lmin && options->lmax)
        printf("# amplifiat length between [%d,%d] bp\n",options->lmin,options->lmax);
    else if (options->lmin)
        printf("# amplifiat length larger than %d bp\n",options->lmin);
    else if (options->lmax)
        printf("# amplifiat length smaller than %d bp\n",options->lmax);
    if (options->circular)
        printf("# DB sequences are considered as circular\n");
    else
        printf("# DB sequences are considered as linear\n");
    printf("#\n");

    for (i=0;i < count;i++)
        printapair(i,sortedpairs[i],options);

    /* Only the pointer table is released; the pairs belong to `pairs`. */
    ECOFREE(sortedpairs,"Free ordered pairs table");
}

/**
 * Counts the example and counterexample sequences and tags each sequence
 * with the taxid of its ancestor at the requested result rank.
 *
 * A sequence is an example when isGoodTaxon() accepts its taxon. When no
 * taxon is found at rank options->taxonrankidx, ranktaxonid is left
 * untouched.
 *
 * @param seqdb       sequence database.
 * @param seqdbsize   number of sequences in seqdb.
 * @param taxonomy    taxonomy the sequences refer to.
 * @param options     run options (taxon filters, result rank index).
 * @param insamples   incremented once per example sequence.
 * @param outsamples  incremented once per counterexample sequence.
 */
void updateseqparams (pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxonomy,
        poptions_t options, int32_t *insamples, int32_t *outsamples)
{
    uint32_t i;
    int32_t  taxid;
    ecotx_t *tmptaxon;

    for (i=0;i<seqdbsize;i++)
    {
        seqdb[i]->isexample=isGoodTaxon(taxonomy,seqdb[i]->taxid,options);
        if (seqdb[i]->isexample)
            (*insamples)++;
        else
            (*outsamples)++;

        /* seqdb[i]->taxid indexes the taxon table; fetch the real taxid. */
        taxid = taxonomy->taxons->taxon[seqdb[i]->taxid].taxid;
        tmptaxon = eco_findtaxonbytaxid(taxonomy, taxid);
        if (tmptaxon)
            tmptaxon = eco_findtaxonatrank(tmptaxon, options->taxonrankidx);
        if (tmptaxon)
            seqdb[i]->ranktaxonid = tmptaxon->taxid;
    }
}

/**
 * Resolves options->taxonrank (a rank label) into options->taxonrankidx.
 * Terminates the process with EXIT_FAILURE when the label is not one of the
 * ranks of the taxonomy.
 *
 * @param taxonomy  taxonomy providing the rank labels.
 * @param options   run options; taxonrankidx is written.
 */
void setresulttaxonrank (ecotaxonomy_t *taxonomy, poptions_t options)
{
    int32_t i;

    /* set taxon rank for which result is to be given */
    for (i = 0; i < taxonomy->ranks->count; i++)
    {
        if (strcmp(taxonomy->ranks->label[i], options->taxonrank) == 0)
        {
            options->taxonrankidx = i;
            break;
        }
    }

    if (i == taxonomy->ranks->count)
    {
        fprintf(stderr,"\nUnknown taxon level: '%s'\n", options->taxonrank);
        /* An unknown rank is an error: do not report success. */
        exit(EXIT_FAILURE);
    }
}

/**
 * Program entry point: parses the command line, loads the databases, runs
 * the strict and approximate primer searches and prints the primer pairs.
 *
 * @return 0 on success; exits with a non-zero status on a usage error.
 */
int main(int argc, char **argv)
{
    pecodnadb_t    seqdb;          /* of type ecoseq_t */
    uint32_t       seqdbsize=0;
    ecotaxonomy_t *taxonomy;

    options_t      options;
    int            carg;
    int32_t        errflag=0;

    int32_t        insamples=0;
    int32_t        outsamples=0;
    uint32_t       i;

    pwordcount_t   words;
    pprimercount_t primers;
    ppairtree_t    pairs;

    int32_t        rankdbstats = 0;

    CNNParams      nnparams;

    initoptions(&options);

    while ((carg = getopt(argc, argv, "hfvcUDSd:l:L:e:i:r:R:q:3:s:x:t:O:m:a:")) != -1) {

        switch (carg) {
            /* ---------------------------- */
            case 'v': /* enable statistics  */
            /* ---------------------------- */
                options.statistics=TRUE;
                break;

            /* ---------------------------- */
            case 'f': /* disable filtering  */
            /* ---------------------------- */
                options.filtering=FALSE;
                break;

            /* -------------------- */
            case 'd': /* database name */
            /* -------------------- */
                options.prefix = ECOMALLOC(strlen(optarg)+1,
                        "Error on prefix allocation");
                strcpy(options.prefix,optarg);
                break;

            /* -------------------- */
            case 'h': /* help */
            /* -------------------- */
                PrintHelp();
                exit(0);
                break;

            /* ------------------------- */
            case 'l': /* min amplification length */
            /* ------------------------- */
                if (sscanf(optarg,"%d",&(options.lmin)) != 1)
                    errflag++;
                break;

            /* -------------------------- */
            case 'L': /* max amplification length */
            /* -------------------------- */
                if (sscanf(optarg,"%d",&(options.lmax)) != 1)
                    errflag++;
                break;

            /* -------------------- */
            case 'e': /* error max */
            /* -------------------- */
                if (sscanf(optarg,"%d",&(options.error_max)) != 1)
                    errflag++;
                break;

            /* -------------------- */
            case 'q': /* strict matching quorum */
            /* -------------------- */
                if (sscanf(optarg,"%f",&(options.strict_quorum)) != 1)
                    errflag++;
                break;

            /* -------------------- */
            case 's': /* sensitivity quorum */
            /* -------------------- */
                if (sscanf(optarg,"%f",&(options.sensitivity_quorum)) != 1)
                    errflag++;
                break;

            /* -------------------- */
            case 't': /* required taxon level for results */
            /* -------------------- */
                strncpy(options.taxonrank, optarg, 19);
                options.taxonrank[19] = 0;
                break;

            /* -------------------- */
            case 'x': /* false positive quorum */
            /* -------------------- */
                if (sscanf(optarg,"%f",&(options.false_positive_quorum)) != 1)
                    errflag++;
                break;

            /* ---------------------------- */
            case 'D': /* set in double strand mode */
            /* ---------------------------- */
                options.doublestrand=1;
                break;

            /* ---------------------------- */
            case 'S': /* set in single strand mode */
            /* ---------------------------- */
                options.doublestrand=0;
                break;

            /* ---------------------------- */
            case 'U': /* no multi match */
            /* ---------------------------- */
                options.no_multi_match=TRUE;
                break;

            /* ------------------------------------------ */
            case 'r': /* stores the restricting search taxonomic id */
            /* ------------------------------------------ */
                options.restricted_taxid = ECOREALLOC(options.restricted_taxid,sizeof(int32_t)*(options.r+1),
                        "Error on restricted_taxid reallocation");
                /* Only count the slot once it holds a parsed value. */
                if (sscanf(optarg,"%d",&(options.restricted_taxid[options.r])) == 1)
                    options.r++;
                else
                    errflag++;
                break;

            /* -------------------- */
            case 'R': /* reference sequence */
            /* -------------------- */
                options.reference = ECOMALLOC(strlen(optarg)+1,
                        "Error on prefix allocation");
                strcpy(options.reference,optarg);
                break;

            /* --------------------------------- */
            case 'i': /* stores the taxonomic id to ignore */
            /* --------------------------------- */
                options.ignored_taxid = ECOREALLOC(options.ignored_taxid,sizeof(int32_t)*(options.g+1),
                        "Error on excluded_taxid reallocation");
                /* Only count the slot once it holds a parsed value. */
                if (sscanf(optarg,"%d",&(options.ignored_taxid[options.g])) == 1)
                    options.g++;
                else
                    errflag++;
                break;

            /* --------------------------------- */
            case 'O': /* set primer size */
            /* --------------------------------- */
                if (sscanf(optarg,"%d",&(options.primer_length)) != 1)
                    errflag++;
                break;

            /* --------------------------------- */
            case 'm': /* set salt method */
            /* --------------------------------- */
                if (sscanf(optarg,"%d",&(options.saltmethod)) != 1)
                    errflag++;
                break;

            /* --------------------------------- */
            case 'a': /* set salt */
            /* --------------------------------- */
                if (sscanf(optarg,"%f",&(options.salt)) != 1)
                    errflag++;
                break;

            /* --------------------------------- */
            case 'c': /* sequences are circular */
            /* --------------------------------- */
                options.circular = 1;
                break;

            /* -------------------- */
            case '?': /* bad option */
            /* -------------------- */
                errflag++;
                break;

            default: /* accepted by getopt but not handled (e.g. -3) */
                break;
        }
    }

    /* The database prefix is dereferenced by the readers below. */
    if (options.prefix == NULL)
    {
        fprintf(stderr,"No database specified (option -d)\n");
        errflag++;
    }

    /* Stop on any usage error instead of running with partial options. */
    if (errflag)
        ExitUsage(errflag);

    options.pnparm = &nnparams;
    if (options.saltmethod != 2)                      /* if not SALT_METHOD_OWCZARZY */
        options.saltmethod = SALT_METHOD_SANTALUCIA;  /* then force SALT_METHOD_SANTALUCIA */

    if (options.salt < 0.01 || options.salt > 0.3)    /* if salt value out of literature values */
        options.salt = DEF_SALT;                      /* set to default */

    nparam_InitParams(&nnparams, DEF_CONC_PRIMERS,DEF_CONC_SEQUENCES,options.salt,options.saltmethod);

    fprintf(stderr,"Reading taxonomy database ...");
    taxonomy = read_taxonomy(options.prefix,0);
    fprintf(stderr,"Ok\n");

    setresulttaxonrank(taxonomy, &options); /*TR: set rank level for statistics*/

    fprintf(stderr,"Reading sequence database ...\n");

    seqdb = readdnadb(options.prefix,&seqdbsize);

    if (options.reference)
        for (i=0; i < seqdbsize;i++)
            if (strcmp(seqdb[i]->AC,options.reference)==0)
            {
                options.refseq=seqdb[i];
                options.refseqid=i;
                fprintf(stderr,"Reference sequence %s identified\n",options.reference);
            }

    fprintf(stderr,"Ok\n");
    fprintf(stderr,"Sequence read : %d\n",(int32_t)seqdbsize);

    updateseqparams(seqdb, seqdbsize, taxonomy, &options, &insamples , &outsamples);
    options.dbsize=seqdbsize;
    options.insamples=insamples;
    options.outsamples=outsamples;

    rankdbstats = getrankdbstats(seqdb, seqdbsize, taxonomy, &options);

    fprintf(stderr,"Database is constituted of %5d examples corresponding to %5d %s\n",insamples,
            options.intaxa,options.taxonrank);
    fprintf(stderr," and %5d counterexamples corresponding to %5d %s\n",outsamples,
            options.outtaxa,options.taxonrank);
    fprintf(stderr,"Total distinct %s count %d\n",options.taxonrank, rankdbstats);

    fprintf(stderr,"\nIndexing words in sequences\n");

    words = lookforStrictPrimer(seqdb,seqdbsize,insamples,&options);
    fprintf(stderr,"\n Strict primer count : %d\n",words->size);

    if (options.no_multi_match)
    {
        (void)filterMultiStrictPrimer(words);
        fprintf(stderr,"\n Strict primer with single match count : %d\n",words->size);
    }

    fprintf(stderr,"\n\n Primer sample : \n");
    for (i=0; i<SAMPLE_PRIMER_COUNT && i<words->size; i++)
        fprintf(stderr," + Primer : %s sequence count : %d\n",ecoUnhashWord(words->words[i],options.primer_length),words->strictcount[i]);

    fprintf(stderr,"\nEncoding sequences for fuzzy pattern matching...\n");
    /*
     * NOTE(review): the body of this loop was lost from the copy of the
     * file this revision was made from (everything between "for (i=0;i"
     * and "strictcount" was missing). encodeSequence() is assumed from the
     * message printed above; any progress output it had is not restored.
     * Confirm against the upstream source before merging.
     */
    for (i=0;i<seqdbsize;i++)
    {
        encodeSequence(seqdb[i]);
    }

    ECOFREE(words->strictcount,"Free strict primer count table");

    primers = lookforAproxPrimer(seqdb,seqdbsize,insamples,words,&options);

    ECOFREE(words->words,"Free strict primer table");
    ECOFREE(words,"Free strict primer structure");
    fprintf(stderr,"\n\n Approximate repeats :%d \n", primers->size);

    fprintf(stderr,"\n\n Primer sample : \n");
    for (i=0; i<SAMPLE_PRIMER_COUNT && i<primers->size; i++)
        fprintf(stderr," + Primer : %s example sequence count : %5d counterexample sequence count : %5d status : %s\n",ecoUnhashWord(primers->primers[i].word,options.primer_length),
                primers->primers[i].inexample,
                primers->primers[i].outexample,
                primers->primers[i].good ? "good":"bad");

    fprintf(stderr,"\n");

    pairs = buildPrimerPairs(seqdb, seqdbsize, primers, &options);
    printpairs (pairs, &options,taxonomy);

    return 0;
}