From b3d6acae7641f1f97ed2c9d761d8770c975ec330 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Wed, 7 Apr 2010 11:32:41 +0000 Subject: [PATCH] git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@256 60f365c0-8329-0410-b2a4-ec073aeeaa1d --- src/Makefile | 18 +- src/ecoprobe.c | 534 ++++++++++++++++++++++++++++++++++++ src/global.mk | 2 +- src/libthermo/thermostats.c | 69 ++++- src/libthermo/thermostats.h | 5 +- 5 files changed, 615 insertions(+), 13 deletions(-) create mode 100644 src/ecoprobe.c diff --git a/src/Makefile b/src/Makefile index 4f75583..b63b6d0 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,15 +1,15 @@ -EXEC=ecoPrimer +EXEC=ecoprobe -PRIMER_SRC= ecoprimer.c +PRIMER_SRC= ecoprobe.c PRIMER_OBJ= $(patsubst %.c,%.o,$(PRIMER_SRC)) SRCS= $(PRIMER_SRC) -LIB= -lecoprimer -lecoPCR -lthermo -lz -lm +LIB= -lecoprobe -lecoPCR -lthermo -lz -lm LIBFILE= libecoPCR/libecoPCR.a \ - libecoprimer/libecoprimer.a \ + libecoprobe/libecoprobe.a \ libthermo/libthermo.a \ @@ -21,13 +21,13 @@ all: $(EXEC) ######## # -# ecoPrimer compilation +# ecoprobe compilation # ######## # executable compilation and link -ecoPrimer: $(PRIMER_OBJ) $(LIBFILE) +ecoprobe: $(PRIMER_OBJ) $(LIBFILE) $(CC) $(LDFLAGS) -O5 -m64 -o $@ $< $(LIBPATH) $(LIB) @@ -40,8 +40,8 @@ ecoPrimer: $(PRIMER_OBJ) $(LIBFILE) libecoPCR/libecoPCR.a: $(MAKE) -C libecoPCR -libecoprimer/libecoprimer.a: - $(MAKE) -C libecoprimer +libecoprobe/libecoprobe.a: + $(MAKE) -C libecoprobe libthermo/libthermo.a: $(MAKE) -C libthermo @@ -56,7 +56,7 @@ clean: rm -f *.o rm -f $(EXEC) $(MAKE) -C libecoPCR clean - $(MAKE) -C libecoprimer clean + $(MAKE) -C libecoprobe clean $(MAKE) -C libthermo clean diff --git a/src/ecoprobe.c b/src/ecoprobe.c new file mode 100644 index 0000000..f3cc4d4 --- /dev/null +++ b/src/ecoprobe.c @@ -0,0 +1,534 @@ +/* + * ecoprobe.c + * + * Created on: 20 March. 2010 + * Author: + */ + +#include "libecoprobe/ecoprobe.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include"libthermo/nnparams.h" +#include"libthermo/thermostats.h" + +void printprobes (pecodnadb_t seqdb,uint32_t seqdbsize, pprobecount_t pprobes, poptions_t options); + +#define VERSION "0.1" + /* TR: by default, statistics are made on species level*/ +#define DEFAULTTAXONRANK "species" + + +void* lib_handle = NULL; +float (*calcMelTemp)(char*, char*); + +/* ----------------------------------------------- */ +/* printout help */ +/* ----------------------------------------------- */ +#define PP fprintf(stdout, + +static void PrintHelp() +{ + PP "------------------------------------------\n"); + PP " ecoprobe Version %s\n", VERSION); + PP "------------------------------------------\n"); + PP "synopsis : finding probes and measureing the quality of barcode region\n"); + PP "usage: ./ecoprobe [options] \n"); + PP "------------------------------------------\n"); + PP "options:\n"); + PP "-d : [D]atabase : to match the expected format, the database\n"); + PP " has to be formated first by the ecoPCRFormat.py program located.\n"); + PP " in the ecoPCR/tools directory.\n"); + PP " ecoPCRFormat.py creates three file types :\n"); + PP " .sdx : contains the sequences\n"); + PP " .tdx : contains information concerning the taxonomy\n"); + PP " .rdx : contains the taxonomy rank\n\n"); + PP " ecoprobe needs all the file type. As a result, you have to write the\n"); + PP " database radical without any extension. For example /ecoprobeDB/fstvert\n\n"); + PP "-e : [E]rror : max error allowed by oligonucleotide (0 by default)\n\n"); + PP "-h : [H]elp - print help\n\n"); + PP "-i : [I]gnore the given taxonomy id (define the counterexample taxon set).\n\n"); + PP "-r : [R]estricts the search to the given taxonomic id (restrict the example taxon set).\n\n"); + PP "-E : [E]xception taxid allows to indicate than some subclade of example sequences are conterexamples.\n\n"); + PP "-c : Consider that the database sequences are [c]ircular\n\n"); + PP "-3 : Three prime strict match\n\n"); + PP "-q : Strict matching [q]uorum, percentage of the sequences in which strict primers are found. By default it is 70\n\n"); + PP "-s : [S]ensitivity quorum\n\n"); + PP "-t : required [t]axon level for results, by default the results are computed at species level\n\n"); + PP "-x : false positive quorum\n\n"); + PP "-D : set in [d]ouble strand mode\n\n"); + PP "-O : set the primer length (default 18) \n\n"); + PP "-S : Set in [s]ingle strand mode\n\n"); + PP "-m : Salt correction method for Tm computation (SANTALUCIA : 1 or OWCZARZY:2, default=1)\n\n"); + PP "-a : Salt contentration in M for Tm computation (default 0.05 M)\n\n"); + PP "-U : No multi match\n\n"); + PP "-R : Define the [R]eference sequence identifier (must be part of example set)\n\n"); + PP "-A : Print the list of all identifier of sequences present in the database\n\n"); + PP "-f : Remove data mining step during strict primer identification\n\n"); + PP "-v : Store statistic file about memory usage during strict primer identification\n\n"); + PP "\n"); + PP "------------------------------------------\n"); + PP "Table result description : \n"); + PP "column 1 : serial number\n"); + PP "column 2 : probe\n"); + PP "column 3 : amplified example sequence count\n"); + PP "column 4 : amplified counterexample sequence count\n"); + PP "column 5 : probe Tm without mismatch\n"); + PP "column 6 : probe lowest Tm against exemple sequences\n"); + PP "column 7 : amplified length on left side\n"); + PP "column 8 : specificity of left amplifia\n"); + PP "column 9 : amplified length on right side\n"); + PP "column 10 : specificity of right amplifia\n"); + + PP "------------------------------------------\n"); + PP " http://www.grenoble.prabi.fr/trac/ecoprobe/\n"); + PP "------------------------------------------\n\n"); + PP "\n"); + +} + +static void ExitUsage(int stat) +{ + PP "usage: ecoprobe [-d database] [-e value] [-r taxid] [-i taxid] [-R rank] [-t taxon level]\n"); + PP "type \"ecoprobe -h\" for help\n"); + + if (stat) + exit(stat); +} + +#undef PP + +void initoptions(poptions_t options) +{ + options->statistics=FALSE; + options->filtering=TRUE; + options->error_max=3; //**< maximum error count in fuzzy search + options->primer_length=18; //**< minimal length of the primers + options->restricted_taxid=NULL; //**< limit amplification below these taxid + options->ignored_taxid=NULL; //**< no amplification below these taxid + options->exception_taxid=NULL; //**< no amplification below these taxid + options->prefix=NULL; + options->reference=NULL; + options->refseq=NULL; + options->circular=1; + options->doublestrand=1; + options->strict_quorum=0.7; + options->strict_exclude_quorum=0.1; + options->sensitivity_quorum=0.9; + options->false_positive_quorum=0.1; + options->strict_three_prime=0; + options->r=0; + options->g=0; + options->e=0; + options->no_multi_match=FALSE; + options->pnparm = NULL; + strcpy(options->taxonrank, DEFAULTTAXONRANK); /*taxon level for results, species by default*/ + options->saltmethod = SALT_METHOD_SANTALUCIA; + options->salt = DEF_SALT; + options->printAC=FALSE; + + /*probe parameters*/ + options->amp_len_guess = 100; /*Just a guess of the length for fast search*/ + options->probe_specificity = 0.6; /*minimum value of acceptable specificity percent*/ +} + + + +/*updateseqparams: This function counts the insample and outsample sequences + * and with each sequences adds a tag of the taxon to which the sequence beongs*/ + +void updateseqparams (pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxonomy, + poptions_t options, int32_t *insamples, int32_t *outsamples) +{ + uint32_t i; + int32_t taxid; + ecotx_t *tmptaxon; + + for (i=0;iisexample=isExampleTaxon(taxonomy,seqdb[i]->taxid,options); + if (seqdb[i]->isexample) + (*insamples)++; + else + (*outsamples)++; + + taxid = taxonomy->taxons->taxon[seqdb[i]->taxid].taxid; + tmptaxon = eco_findtaxonbytaxid(taxonomy, taxid); + if (tmptaxon) + tmptaxon = eco_findtaxonatrank(tmptaxon, options->taxonrankidx); + if (tmptaxon) + seqdb[i]->ranktaxonid = tmptaxon->taxid; + } +} + +void setresulttaxonrank (ecotaxonomy_t *taxonomy, poptions_t options) +{ + int32_t i; + + /*set taxon rank for which result is to be given*/ + for (i = 0; i < taxonomy->ranks->count; i++) + { + if (strcmp(taxonomy->ranks->label[i], options->taxonrank) == 0) + { + options->taxonrankidx = i; + break; + } + } + + if (i == taxonomy->ranks->count) + { + fprintf(stderr,"\nUnknown taxon level: '%s'\n", options->taxonrank); + exit(0); + } +} +/* to get db stats, totals of species, genus etc....*/ + +static void printAC(pecodnadb_t seqdb,uint32_t seqdbsize) +{ + uint32_t i; + + for (i=0; i< seqdbsize;i++) + printf("%15s : %s\n",seqdb[i]->AC,seqdb[i]->DE); +} + +int main(int argc, char **argv) +{ + pecodnadb_t seqdb; /* of type ecoseq_t */ + uint32_t seqdbsize=0; + ecotaxonomy_t *taxonomy; + + options_t options; + int carg; + int32_t errflag=0; + + int32_t insamples=0; + int32_t outsamples=0; + uint32_t i; + + pwordcount_t words; + pprimercount_t primers; + ppairtree_t pairs; + + int32_t rankdbstats = 0; + + CNNParams nnparams; + probecount_t probes; + + initoptions(&options); + + while ((carg = getopt(argc, argv, "hAfvcUDSE:d:e:i:r:R:q:3:s:x:t:O:m:a:")) != -1) { + + switch (carg) { + /* ---------------------------- */ + case 'v': /* Store statistic file about memory usage during strict primer identification */ + /* ---------------------------- */ + options.statistics=TRUE; + break; + + /* ---------------------------- */ + case 'f': /* Remove data mining step during strict primer identification */ + /* ---------------------------- */ + options.filtering=FALSE; + break; + + /* ---------------------------- */ + case 'A': /* Print the list of all identifier of sequences present in the database */ + /* ---------------------------- */ + options.printAC=TRUE; + break; + + /* -------------------- */ + case 'd': /* database name */ + /* -------------------- */ + options.prefix = ECOMALLOC(strlen(optarg)+1, + "Error on prefix allocation"); + strcpy(options.prefix,optarg); + break; + + /* -------------------- */ + case 'h': /* help */ + /* -------------------- */ + PrintHelp(); + exit(0); + break; + + /* -------------------- */ + case 'e': /* error max */ + /* -------------------- */ + sscanf(optarg,"%d",&(options.error_max)); + break; + + + /* ------------------------ */ + case '3': /* three prime strict match */ + /* ------------------------ */ + sscanf(optarg,"%d",&(options.strict_three_prime)); + break; + + /* -------------------- */ + case 'q': /* strict matching quorum */ + /* -------------------- */ + sscanf(optarg,"%f",&(options.strict_quorum)); + break; + + /* -------------------- */ + case 's': /* sensitivity quorum */ + /* -------------------- */ + sscanf(optarg,"%f",&(options.sensitivity_quorum)); + break; + + /* -------------------- */ + case 't': /* required taxon level for results */ + /* -------------------- */ + strncpy(options.taxonrank, optarg, 19); + options.taxonrank[19] = 0; + break; + + /* -------------------- */ + case 'x': /* false positive quorum */ + /* -------------------- */ + sscanf(optarg,"%f",&(options.false_positive_quorum)); + break; + + /* ---------------------------- */ + case 'D': /* set in double strand mode */ + /* ---------------------------- */ + options.doublestrand=1; + break; + + /* ---------------------------- */ + case 'S': /* set in single strand mode */ + /* ---------------------------- */ + options.doublestrand=0; + break; + + /* ---------------------------- */ + case 'U': /* set in single strand mode */ + /* ---------------------------- */ + options.no_multi_match=TRUE; + break; + + /* ------------------------------------------ */ + case 'r': /* stores the restricting search taxonomic id */ + /* ------------------------------------------ */ + options.restricted_taxid = ECOREALLOC(options.restricted_taxid,sizeof(int32_t)*(options.r+1), + "Error on restricted_taxid reallocation"); + sscanf(optarg,"%d",&(options.restricted_taxid[options.r])); + options.r++; + break; + + /* ------------------------------------------ */ + case 'E': /* stores the restricting search taxonomic id */ + /* ------------------------------------------ */ + options.exception_taxid = ECOREALLOC(options.exception_taxid,sizeof(int32_t)*(options.e+1), + "Error on exception_taxid reallocation"); + sscanf(optarg,"%d",&(options.exception_taxid[options.e])); + options.e++; + break; + + /* -------------------- */ + case 'R': /* reference sequence */ + /* -------------------- */ + options.reference = ECOMALLOC(strlen(optarg)+1, + "Error on prefix allocation"); + strcpy(options.reference,optarg); + break; + + /* --------------------------------- */ + case 'i': /* stores the taxonomic id to ignore */ + /* --------------------------------- */ + options.ignored_taxid = ECOREALLOC(options.ignored_taxid,sizeof(int32_t)*(options.g+1), + "Error on excluded_taxid reallocation"); + sscanf(optarg,"%d",&(options.ignored_taxid[options.g])); + options.g++; + break; + + /* --------------------------------- */ + case 'O': /* set primer size */ + /* --------------------------------- */ + sscanf(optarg,"%d",&(options.primer_length)); + break; + + /* --------------------------------- */ + case 'm': /* set salt method */ + /* --------------------------------- */ + sscanf(optarg,"%d",&(options.saltmethod)); + break; + + /* --------------------------------- */ + case 'a': /* set salt */ + /* --------------------------------- */ + sscanf(optarg,"%f",&(options.salt)); + break; + + /* -------------------- */ + case 'c': /* sequences are circular */ + /* --------------------------------- */ + options.circular = 1; + break; + + case '?': /* bad option */ + /* -------------------- */ + errflag++; + } + + } + options.pnparm = &nnparams; + if (options.saltmethod != 2) //if not SALT_METHOD_OWCZARZY + options.saltmethod = SALT_METHOD_SANTALUCIA; //then force SALT_METHOD_SANTALUCIA + + + if (options.salt < 0.01 || options.salt > 0.3) //if salt value out of literature values + options.salt = DEF_SALT; //set to default + + nparam_InitParams(&nnparams, DEF_CONC_PRIMERS,DEF_CONC_SEQUENCES,options.salt,options.saltmethod); + + fprintf(stderr,"Reading taxonomy database ..."); + taxonomy = read_taxonomy(options.prefix,0); + fprintf(stderr,"Ok\n"); + + setresulttaxonrank(taxonomy, &options); /*TR: set rank level for statistics*/ + + fprintf(stderr,"Reading sequence database ...\n"); + + seqdb = readdnadb(options.prefix,taxonomy,&seqdbsize, &options); + + if (options.printAC) + { + printAC(seqdb,seqdbsize); + exit(0); + } + if (options.reference) + for (i=0; i < seqdbsize;i++) + if (strcmp(seqdb[i]->AC,options.reference)==0) + { + options.refseq=seqdb[i]; + options.refseqid=i; + fprintf(stderr,"Reference sequence %s identified\n",options.reference); + } + + fprintf(stderr,"Ok\n"); + fprintf(stderr,"Sequence read : %d\n",(int32_t)seqdbsize); + + updateseqparams(seqdb, seqdbsize, taxonomy, &options, &insamples , &outsamples); + options.dbsize=seqdbsize; + options.insamples=insamples; + options.outsamples=outsamples; + + rankdbstats = getrankdbstats(seqdb, seqdbsize, taxonomy, &options); + + fprintf(stderr,"Database is constituted of %5d examples corresponding to %5d %s\n",insamples, + options.intaxa,options.taxonrank); + fprintf(stderr," and %5d counterexamples corresponding to %5d %s\n",outsamples, + options.outtaxa,options.taxonrank); + fprintf(stderr,"Total distinct %s count %d\n",options.taxonrank, rankdbstats); + + fprintf(stderr,"\nIndexing words in sequences\n"); + + words = lookforStrictPrimer(seqdb,seqdbsize,insamples,&options); + fprintf(stderr,"\n Strict primer count : %d\n",words->size); + + if (options.no_multi_match) + { + (void)filterMultiStrictPrimer(words); + fprintf(stderr,"\n Strict primer with single match count : %d\n",words->size); + } + + + fprintf(stderr,"\n\n Primer sample : \n"); + for (i=0; isize); i++) + fprintf(stderr," + Primer : %s sequence count : %d\n",ecoUnhashWord(words->words[i],options.primer_length),words->strictcount[i]); + + + fprintf(stderr,"\nEncoding sequences for fuzzy pattern matching...\n"); + for (i=0;istrictcount,"Free strict primer count table"); + + primers = lookforAproxPrimer(seqdb,seqdbsize,insamples,words,&options); + + ECOFREE(words->words,"Free strict primer table"); + ECOFREE(words,"Free strict primer structure"); + fprintf(stderr,"\n\n Approximate repeats :%d \n", primers->size); + + fprintf(stderr,"\n\n Primer sample : \n"); + for (i=0; isize); i++) + fprintf(stderr," + Primer : %s example sequence count : %5d counterexample sequence count : %5d status : %s\n",ecoUnhashWord(primers->primers[i].word,options.primer_length), + primers->primers[i].inexample, + primers->primers[i].outexample, + primers->primers[i].good ? "good":"bad"); + + fprintf(stderr,"\n"); + + probes = buildprobes (seqdb, seqdbsize, primers, &options); + printprobes (seqdb, seqdbsize, &probes, &options); + + return 0; +} + +void printprobes (pecodnadb_t seqdb,uint32_t seqdbsize, pprobecount_t pprobes, poptions_t options) +{ + uint32_t i, j; + uint32_t count; + char wrd[32]; + char *c; + word_t w; + + getProbeThermoProperties (seqdb, seqdbsize, pprobes, options); + + for (i = 0; i < pprobes->size; i++) + { + w = pprobes->probes[i].primer->word; + + count = 0; + for (j = 0; j < seqdbsize; j++) + count += pprobes->probes[i].primer->directCount[j]; + + //if occures only on reverse positions then take reverse complement + if (count == 0) + w = ecoComplementWord(w,options->primer_length); + + c = ecoUnhashWord(w,options->primer_length); + strcpy (wrd, c); + + //print serial number + printf("%6d\t",i); + + //print probe + printf("%s\t", wrd); + + //print in example count + printf("%d\t",pprobes->probes[i].primer->inexample); + + //print out example count + printf("%d\t", pprobes->probes[i].primer->outexample); + + //print primer1 melting temperature + printf ("\t%3.1f", pprobes->probes[i].ptemp); + + //print minimum melting temperature of approximate versions of probe + printf ("\t%3.1f", pprobes->probes[i].pmintemp); + + //left length + printf("\t%d",pprobes->probes[i].llength); + + //left specificity + printf("\t%0.2f",pprobes->probes[i].lspecificity); + + //right length + printf("\t%d",pprobes->probes[i].rlength); + + //right specificity + printf("\t%0.2f\n",pprobes->probes[i].rspecificity); + } +} + diff --git a/src/global.mk b/src/global.mk index a7bcfef..7c4214e 100644 --- a/src/global.mk +++ b/src/global.mk @@ -1,5 +1,5 @@ MACHINE=MAC_OS_X -LIBPATH= -Llibapat -LlibecoPCR -Llibecoprimer -Llibthermo +LIBPATH= -Llibapat -LlibecoPCR -Llibecoprobe -Llibthermo MAKEDEPEND = gcc -D$(MACHINE) -M $(CPPFLAGS) -o $*.d $< CC=gcc diff --git a/src/libthermo/thermostats.c b/src/libthermo/thermostats.c index 525cc42..f1aa2cd 100644 --- a/src/libthermo/thermostats.c +++ b/src/libthermo/thermostats.c @@ -26,7 +26,7 @@ word_t extractSite(char* sequence, size_t begin, size_t length, bool_t strand) return site; } -void getThermoProperties (ppair_t* pairs, size_t count, poptions_t options) +/*void getThermoProperties (ppair_t* pairs, size_t count, poptions_t options) { size_t i, j,k,l; uint32_t bp1,bp2; @@ -112,4 +112,71 @@ void getThermoProperties (ppair_t* pairs, size_t count, poptions_t options) } } +}*/ + +void getProbeThermoProperties (pecodnadb_t seqdb,uint32_t seqdbsize, pprobecount_t pprobs, poptions_t options) +{ + size_t i, j,k; + uint32_t bp1,bp2; + word_t w; + //word_t wtmp; + bool_t strand; + + char *sq; + char prmr[50]; + + float mtemp; + + for (i = 0; i < pprobs->size; i++) + { + w = pprobs->probes[i].primer->word; + //wtmp = ecoComplementWord(w1,options->primer_length); + //if (wtmp > w) w = wtmp; + + strncpy(prmr,ecoUnhashWord(w, options->primer_length),options->primer_length); + prmr[options->primer_length]=0; + + pprobs->probes[i].ptemp = nparam_CalcSelfTM (options->pnparm, prmr, options->primer_length) - 273.0; + + pprobs->probes[i].pmintemp = 100; + + for (j = 0; j < seqdbsize; j++) + { + if (!seqdb[j]->isexample) continue; + sq = seqdb[j]->SQ; + + + if (pprobs->probes[i].primer->directCount[j] > 0) + { + strand = TRUE; + for (k = 0; k < pprobs->probes[i].primer->directCount[j]; k++) + { + bp1 = (pprobs->probes[i].primer->directCount[j] > 1)? pprobs->probes[i].primer->directPos[j].pointer[k] : pprobs->probes[i].primer->directPos[j].value; + mtemp = nparam_CalcTwoTM(options->pnparm, + prmr, + ecoUnhashWord(extractSite(sq,bp1,options->primer_length,strand),options->primer_length), + options->primer_length) - 273.0; + + if (mtemp < pprobs->probes[i].pmintemp) + pprobs->probes[i].pmintemp = mtemp; + } + } + + if (pprobs->probes[i].primer->reverseCount[j] > 0) + { + strand = FALSE; + for (k = 0; k < pprobs->probes[i].primer->directCount[j]; k++) + { + bp1 = (pprobs->probes[i].primer->reverseCount[j] > 1)? pprobs->probes[i].primer->reversePos[j].pointer[k] : pprobs->probes[i].primer->reversePos[j].value; + mtemp = nparam_CalcTwoTM(options->pnparm, + prmr, + ecoUnhashWord(extractSite(sq,bp1,options->primer_length,strand),options->primer_length), + options->primer_length) - 273.0; + + if (mtemp < pprobs->probes[i].pmintemp) + pprobs->probes[i].pmintemp = mtemp; + } + } + } + } } diff --git a/src/libthermo/thermostats.h b/src/libthermo/thermostats.h index 40c4806..8e8bd25 100644 --- a/src/libthermo/thermostats.h +++ b/src/libthermo/thermostats.h @@ -1,9 +1,10 @@ #ifndef THERMOSTATS_H_ #define THERMOSTATS_H_ -#include "../libecoprimer/ecoprimer.h" +#include "../libecoprobe/ecoprobe.h" -void getThermoProperties (ppair_t* pairs, size_t count, poptions_t options); +//void getThermoProperties (ppair_t* pairs, size_t count, poptions_t options); word_t extractSite(char* sequence, size_t begin, size_t length, bool_t strand); +void getProbeThermoProperties (pecodnadb_t seqdb,uint32_t seqdbsize, pprobecount_t pprobs, poptions_t options); #endif \ No newline at end of file