Files
ecoprimers/src/ecoprimer.c

727 lines
24 KiB
C
Raw Normal View History

/*
* ecoprimer.c
*
* Created on: 7 nov. 2008
* Author: coissac
*/
#include "libecoprimer/ecoprimer.h"
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#include <getopt.h>
#include <time.h>
#include <sys/time.h>
#define VERSION "0.3"
/* TR: by default, statistics are made on species level*/
#define DEFAULTTAXONRANK "species"
static int cmpprintedpairs(const void* p1,const void* p2);
/* ----------------------------------------------- */
/* printout help */
/* ----------------------------------------------- */
#define PP fprintf(stdout,
static void PrintHelp()
{
PP "------------------------------------------\n");
PP " ecoPrimer Version %s\n", VERSION);
PP "------------------------------------------\n");
PP "synopsis : finding primers and measureing the quality of primers and barcode region\n");
PP "usage: ./ecoPrimer [options] \n");
PP "------------------------------------------\n");
PP "options:\n");
PP "-d : [D]atabase : to match the expected format, the database\n");
PP " has to be formated first by the ecoPCRFormat.py program located.\n");
PP " in the ecoPCR/tools directory.\n");
PP " ecoPCRFormat.py creates three file types :\n");
PP " .sdx : contains the sequences\n");
PP " .tdx : contains information concerning the taxonomy\n");
PP " .rdx : contains the taxonomy rank\n\n");
PP " ecoPrimer needs all the file type. As a result, you have to write the\n");
PP " database radical without any extension. For example /ecoPrimerDB/fstvert\n\n");
PP "-e : [E]rror : max error allowed by oligonucleotide (0 by default)\n\n");
PP "-h : [H]elp - print <this> help\n\n");
PP "-i : [I]gnore the given taxonomy id.\n\n");
PP "-l : minimum [L]ength : define the minimum amplication length. \n\n");
PP "-L : maximum [L]ength : define the maximum amplicationlength. \n\n");
PP "-r : [R]estricts the search to the given taxonomic id.\n\n");
PP "-c : Consider that the database sequences are [c]ircular\n\n");
// PP "-3 : Three prime strict match\n\n");
PP "-q : Strict matching [q]uorum, percentage of the sequences in which strict primers are found. By default it is 70\n\n");
PP "-s : [S]ensitivity quorum\n\n");
PP "-t : required [t]axon level for results, by default the results are computed at species level\n\n");
PP "-x : false positive quorum\n\n");
PP "-D : set in [d]ouble strand mode\n\n");
PP "-S : Set in [s]ingle strand mode\n\n");
PP "-U : No multi match\n\n");
PP "\n");
PP "------------------------------------------\n");
PP "Table result description : \n");
PP "column 1 : serial number\n");
PP "column 2 : primer1\n");
PP "column 3 : primer2\n");
PP "column 4 : good/bad\n");
PP "column 5 : in sequence count\n");
PP "column 6 : out sequence count\n");
PP "column 7 : yule\n");
PP "column 8 : in taxa count\n");
PP "column 9 : out taxa count\n");
PP "column 10 : coverage\n");
PP "column 11 : unambiguously identified taxa\n");
PP "column 12 : specificity\n");
PP "column 13 : minimum amplified length\n");
PP "column 14 : maximum amplified length\n");
PP "column 15 : average amplified length\n");
PP "------------------------------------------\n");
PP " http://www.grenoble.prabi.fr/trac/ecoPrimer/\n");
PP "------------------------------------------\n\n");
PP "\n");
}
static void ExitUsage(int stat)
{
PP "usage: ecoprimer [-d database] [-l value] [-L value] [-e value] [-r taxid] [-i taxid] [-R rank] [-t taxon level]\n");
PP "type \"ecoprimer -h\" for help\n");
if (stat)
exit(stat);
}
#undef PP
void initoptions(poptions_t options)
{
options->statistics=FALSE;
options->filtering=TRUE;
options->lmin=0; //< Amplifia minimal length
options->lmax=1000; //< Amplifia maximal length
options->error_max=3; //**< maximum error count in fuzzy search
options->primer_length=18; //**< minimal length of the primers
options->restricted_taxid=NULL; //**< limit amplification below these taxid
options->ignored_taxid=NULL; //**< no amplification below these taxid
options->prefix=NULL;
options->reference=NULL;
options->refseq=NULL;
options->circular=0;
options->doublestrand=1;
options->strict_quorum=0.7;
options->strict_exclude_quorum=0.1;
options->sensitivity_quorum=0.9;
options->false_positive_quorum=0.1;
options->strict_three_prime=0;
options->r=0;
options->g=0;
options->no_multi_match=FALSE;
strcpy(options->taxonrank, DEFAULTTAXONRANK); /*taxon level for results, species by default*/
}
void printcurrenttime ()
{
time_t now;
struct tm *ts;
char buf[80];
/* Get the current time */
now = time(NULL);
/* Format and print the time, "ddd yyyy-mm-dd hh:mm:ss zzz" */
ts = localtime(&now);
strftime(buf, sizeof(buf), "%a %Y-%m-%d %H:%M:%S %Z", ts);
fprintf(stderr,"#%d#, %s\n",(int)now, buf);
}
void printcurrenttimeinmilli()
{
struct timeval tv;
struct timezone tz;
struct tm *tm;
gettimeofday(&tv, &tz);
tm=localtime(&tv.tv_sec);
fprintf(stderr, " %d:%02d:%02d %d %d \n", tm->tm_hour, tm->tm_min,
tm->tm_sec, tv.tv_usec, tv.tv_usec/1000);
}
/*TR: Added*/
void printapair(int32_t index,ppair_t pair, poptions_t options)
{
bool_t asdirect1=pair->asdirect1;
bool_t asdirect2=pair->asdirect2;
bool_t asdirecttmp;
word_t w1=pair->p1->word;
word_t w2=pair->p2->word;
word_t wtmp;
bool_t good1=pair->p1->good;
bool_t good2=pair->p2->good;
bool_t goodtmp;
bool_t strand;
uint32_t i;
char *c;
if (!asdirect1)
w1=ecoComplementWord(w1,options->primer_length);
if (!asdirect2)
w2=ecoComplementWord(w2,options->primer_length);
if (w2 < w1)
{
wtmp=w1;
w1=w2;
w2=wtmp;
asdirecttmp=asdirect1;
asdirect1=asdirect2;
asdirect2=asdirecttmp;
goodtmp=good1;
good1=good2;
good2=goodtmp;
}
printf("%6d\t",index);
printf("%s\t",ecoUnhashWord(w1,options->primer_length));
printf("%s",ecoUnhashWord(w2,options->primer_length));
printf("\t%c%c", "bG"[(int)good1],"bG"[(int)good2]);
printf("\t%d", pair->inexample);
printf("\t%d", pair->outexample);
printf("\t%4.3f", pair->yule);
printf("\t%d", pair->intaxa);
printf("\t%d", pair->outtaxa);
printf("\t%4.3f", (float)pair->bc);
printf("\t%d", pair->intaxa - pair->notwellidentifiedtaxa);
printf("\t%4.3f", pair->bs);
printf("\t%d", pair->mind);
printf("\t%d", pair->maxd);
printf("\t%3.2f", (float)pair->sumd/pair->inexample);
if (options->refseq && pair->refsequence >=0)
{
printf("\t%s:",options->reference);
strand = pair->pcr.amplifias[pair->refsequence].strand;
if (strand)
printf("join(");
else
printf("complement(");
printf("%d..%d,%d..%d",pair->pcr.amplifias[pair->refsequence].begin - options->primer_length + 1,
pair->pcr.amplifias[pair->refsequence].begin,
pair->pcr.amplifias[pair->refsequence].end + 2,
pair->pcr.amplifias[pair->refsequence].end + options->primer_length + 1
);
printf(")");
printf("\t");
for (c=pair->pcr.amplifias[pair->refsequence].amplifia,
i=pair->pcr.amplifias[pair->refsequence].begin;
i<=pair->pcr.amplifias[pair->refsequence].end;
i++,
c+=(strand)? 1:-1)
printf("%c","acgt"[(strand)? (*c):(~*c)&3]);
}
printf("\n");
}
static int cmpprintedpairs(const void* p1,const void* p2)
{
float s1,s2;
ppair_t pair1,pair2;
pair1=*((ppair_t*)p1);
pair2=*((ppair_t*)p2);
s1 = pair1->yule * pair1->bs;
s2 = pair2->yule * pair2->bs;
// fprintf(stderr,"s1 : %4.3f %4.3f %4.3f\n",pair1->yule , pair1->bs,s1);
// fprintf(stderr,"s2 : %4.3f %4.3f %4.3f\n\n",pair2->yule , pair2->bs,s2);
if (s1 > s2) return -1;
if (s1 < s2) return 1;
return 0;
}
uint32_t filterandsortpairs(ppair_t* sortedpairs,uint32_t count, poptions_t options)
{
uint32_t i,j;
float q,qfp;
for (i=0,j=0;i < count;i++)
{
if (options->insamples)
q = (float)sortedpairs[i]->inexample/options->insamples;
else q=1.0;
if (options->outsamples)
qfp = (float)sortedpairs[i]->outexample/options->outsamples;
else qfp=0.0;
sortedpairs[i]->quorumin = q;
sortedpairs[i]->quorumout = qfp;
sortedpairs[i]->yule = q - qfp;
sortedpairs[j]=sortedpairs[i];
if (q > options->sensitivity_quorum &&
qfp < options->false_positive_quorum)
{
(void)taxonomycoverage(sortedpairs[j],options);
taxonomyspecificity(sortedpairs[j]);
// fprintf(stderr,"%4.3f %4.3f %4.3f\n",sortedpairs[j]->yule , sortedpairs[j]->bs,sortedpairs[j]->bc);
j++;
}
}
qsort(sortedpairs,j,sizeof(ppair_t),cmpprintedpairs);
return j;
}
void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy)
{
ppair_t* sortedpairs;
ppair_t* index;
ppairlist_t pl;
size_t i,j;
size_t count;
char *taxon[]={"taxon","taxa"};
ecotx_t *current_taxon;
//printf("Index\tPrimer1\tPrimer2\tGB\tInexampleCount\tOutexampleCount\tYule\tIntaxaCount\tOuttaxaCount\tCoverage\tSpecificity\tMinAmplifiedLength\tMaxAmplifiedLength\tAvgAmplifiedLength\n");
fprintf(stderr,"Total pair count : %d\n",pairs->count);
sortedpairs = ECOMALLOC(pairs->count*sizeof(ppair_t),"Cannot Allocate ordered pairs");
index=sortedpairs;
pl=pairs->first;
j=0;
while(pl->next)
{
for (i=0;i<pl->paircount;i++,j++)
sortedpairs[j]=pl->pairs+i;
pl=pl->next;
}
for (i=0;i<pl->paircount;i++,j++)
sortedpairs[j]=pl->pairs+i;
count=filterandsortpairs(sortedpairs,pairs->count,options);
fprintf(stderr,"Total good pair count : %u\n",(uint32_t)count);
printf("#\n");
printf("# ecoPrimer version %s\n",VERSION);
printf("# Rank level optimisation : %s\n", options->taxonrank);
printf("# max error count by oligonucleotide : %d\n",options->error_max);
printf("#\n");
if (options->r)
{
printf("# Restricted to %s:\n",taxon[(options->r>1) ? 1:0]);
for(i=0;i<(uint32_t)options->r;i++)
{
current_taxon=eco_findtaxonbytaxid(taxonomy,options->restricted_taxid[i]);
printf("# %d : %s (%s)\n", current_taxon->taxid,
current_taxon->name,
taxonomy->ranks->label[current_taxon->rank]
);
}
printf("#\n");
}
if (options->g)
{
printf("# Ignore %s:\n",taxon[(options->g>1) ? 1:0]);
for(i=0;i<(uint32_t)options->r;i++)
{
current_taxon=eco_findtaxonbytaxid(taxonomy,options->ignored_taxid[i]);
printf("# %d : %s (%s)\n", current_taxon->taxid,
current_taxon->name,
taxonomy->ranks->label[current_taxon->rank]
);
}
printf("#\n");
}
printf("# strict primer quorum : %3.2f\n",options->strict_quorum);
printf("# example quorum : %3.2f\n",options->sensitivity_quorum);
if (options->g + options->r)
printf("# counterexample quorum : %3.2f\n",options->false_positive_quorum);
printf("#\n");
printf("# database : %s\n",options->prefix);
printf("# Database is constituted of %5d examples corresponding to %5d %s\n",options->insamples,
options->intaxa,options->taxonrank);
printf("# and %5d counterexamples corresponding to %5d %s\n",options->outsamples,
options->outtaxa,options->taxonrank);
printf("#\n");
if (options->lmin && options->lmax)
printf("# amplifiat length between [%d,%d] bp\n",options->lmin,options->lmax);
else if (options->lmin)
printf("# amplifiat length larger than %d bp\n",options->lmin);
else if (options->lmax)
printf("# amplifiat length smaller than %d bp\n",options->lmax);
if (options->circular)
printf("# DB sequences are considered as circular\n");
else
printf("# DB sequences are considered as linear\n");
printf("#\n");
for (i=0;i < count;i++)
printapair(i,sortedpairs[i],options);
}
/*updateseqparams: This function counts the insample and outsample sequences
* and with each sequences adds a tag of the taxon to which the sequence beongs*/
void updateseqparams (pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxonomy,
poptions_t options, int32_t *insamples, int32_t *outsamples)
{
uint32_t i;
int32_t taxid;
ecotx_t *tmptaxon;
for (i=0;i<seqdbsize;i++)
{
seqdb[i]->isexample=isGoodTaxon(taxonomy,seqdb[i]->taxid,options);
if (seqdb[i]->isexample)
(*insamples)++;
else
(*outsamples)++;
taxid = taxonomy->taxons->taxon[seqdb[i]->taxid].taxid;
tmptaxon = eco_findtaxonbytaxid(taxonomy, taxid);
if (tmptaxon)
tmptaxon = eco_findtaxonatrank(tmptaxon, options->taxonrankidx);
if (tmptaxon)
seqdb[i]->ranktaxonid = tmptaxon->taxid;
}
}
void setresulttaxonrank (ecotaxonomy_t *taxonomy, poptions_t options)
{
int32_t i;
/*set taxon rank for which result is to be given*/
for (i = 0; i < taxonomy->ranks->count; i++)
{
if (strcmp(taxonomy->ranks->label[i], options->taxonrank) == 0)
{
options->taxonrankidx = i;
break;
}
}
if (i == taxonomy->ranks->count)
{
fprintf(stderr,"\nUnknown taxon level: '%s'\n", options->taxonrank);
exit(0);
}
}
/* to get db stats, totals of species, genus etc....*/
int main(int argc, char **argv)
{
pecodnadb_t seqdb; /* of type ecoseq_t */
uint32_t seqdbsize=0;
ecotaxonomy_t *taxonomy;
options_t options;
int carg;
int32_t errflag=0;
int32_t insamples=0;
int32_t outsamples=0;
uint32_t i;
pwordcount_t words;
// pwordcount_t words2;
pprimercount_t primers;
ppairtree_t pairs;
int32_t rankdbstats = 0;
//printcurrenttime();
//return 0;
initoptions(&options);
while ((carg = getopt(argc, argv, "hfvcUDSd:l:L:e:i:r:R:q:3:s:x:t:O:")) != -1) {
switch (carg) {
/* ---------------------------- */
case 'v': /* set in single strand mode */
/* ---------------------------- */
options.statistics=TRUE;
break;
/* ---------------------------- */
case 'f': /* set in single strand mode */
/* ---------------------------- */
options.filtering=FALSE;
break;
/* -------------------- */
case 'd': /* database name */
/* -------------------- */
options.prefix = ECOMALLOC(strlen(optarg)+1,
"Error on prefix allocation");
strcpy(options.prefix,optarg);
break;
/* -------------------- */
case 'h': /* help */
/* -------------------- */
PrintHelp();
exit(0);
break;
/* ------------------------- */
case 'l': /* min amplification lenght */
/* ------------------------- */
sscanf(optarg,"%d",&(options.lmin));
break;
/* -------------------------- */
case 'L': /* max amplification lenght */
/* -------------------------- */
sscanf(optarg,"%d",&(options.lmax));
break;
/* -------------------- */
case 'e': /* error max */
/* -------------------- */
sscanf(optarg,"%d",&(options.error_max));
break;
//
// /* ------------------------ */
// case '3': /* three prime strict match */
// /* ------------------------ */
// sscanf(optarg,"%d",&(options.strict_three_prime));
// break;
/* -------------------- */
case 'q': /* strict matching quorum */
/* -------------------- */
sscanf(optarg,"%f",&(options.strict_quorum));
break;
/* -------------------- */
case 's': /* strict matching quorum */
/* -------------------- */
sscanf(optarg,"%f",&(options.sensitivity_quorum));
break;
/* -------------------- */
case 't': /* required taxon level for results */
/* -------------------- */
strncpy(options.taxonrank, optarg, 19);
options.taxonrank[19] = 0;
break;
/* -------------------- */
case 'x': /* strict matching quorum */
/* -------------------- */
sscanf(optarg,"%f",&(options.false_positive_quorum));
break;
/* ---------------------------- */
case 'D': /* set in double strand mode */
/* ---------------------------- */
options.doublestrand=1;
break;
/* ---------------------------- */
case 'S': /* set in single strand mode */
/* ---------------------------- */
options.doublestrand=0;
break;
/* ---------------------------- */
case 'U': /* set in single strand mode */
/* ---------------------------- */
options.no_multi_match=TRUE;
break;
/* ------------------------------------------ */
case 'r': /* stores the restricting search taxonomic id */
/* ------------------------------------------ */
options.restricted_taxid = ECOREALLOC(options.restricted_taxid,sizeof(int32_t)*(options.r+1),
"Error on restricted_taxid reallocation");
sscanf(optarg,"%d",&(options.restricted_taxid[options.r]));
options.r++;
break;
/* -------------------- */
case 'R': /* reference sequence */
/* -------------------- */
options.reference = ECOMALLOC(strlen(optarg)+1,
"Error on prefix allocation");
strcpy(options.reference,optarg);
break;
/* --------------------------------- */
case 'i': /* stores the taxonomic id to ignore */
/* --------------------------------- */
options.ignored_taxid = ECOREALLOC(options.ignored_taxid,sizeof(int32_t)*(options.g+1),
"Error on excluded_taxid reallocation");
sscanf(optarg,"%d",&(options.ignored_taxid[options.g]));
options.g++;
break;
/* --------------------------------- */
case 'O': /* set primer size */
/* --------------------------------- */
sscanf(optarg,"%d",&(options.primer_length));
break;
/* -------------------- */
case 'c': /* sequences are circular */
/* --------------------------------- */
options.circular = 1;
break;
case '?': /* bad option */
/* -------------------- */
errflag++;
}
}
fprintf(stderr,"Reading taxonomy database ...");
taxonomy = read_taxonomy(options.prefix,0);
fprintf(stderr,"Ok\n");
setresulttaxonrank(taxonomy, &options); /*TR: set rank level for statistics*/
fprintf(stderr,"Reading sequence database ...\n");
seqdb = readdnadb(options.prefix,&seqdbsize);
if (options.reference)
for (i=0; i < seqdbsize;i++)
if (strcmp(seqdb[i]->AC,options.reference)==0)
{
options.refseq=seqdb[i];
options.refseqid=i;
fprintf(stderr,"Reference sequence %s identified\n",options.reference);
}
fprintf(stderr,"Ok\n");
fprintf(stderr,"Sequence read : %d\n",(int32_t)seqdbsize);
updateseqparams(seqdb, seqdbsize, taxonomy, &options, &insamples , &outsamples);
options.dbsize=seqdbsize;
options.insamples=insamples;
options.outsamples=outsamples;
rankdbstats = getrankdbstats(seqdb, seqdbsize, taxonomy, &options);
fprintf(stderr,"Database is constituted of %5d examples corresponding to %5d %s\n",insamples,
options.intaxa,options.taxonrank);
fprintf(stderr," and %5d counterexamples corresponding to %5d %s\n",outsamples,
options.outtaxa,options.taxonrank);
fprintf(stderr,"Total distinct %s count %d\n",options.taxonrank, rankdbstats);
fprintf(stderr,"\nIndexing words in sequences\n");
words = lookforStrictPrimer(seqdb,seqdbsize,insamples,&options);
fprintf(stderr,"\n Strict primer count : %d\n",words->size);
// options.filtering=FALSE;
// words2= lookforStrictPrimer(seqdb,seqdbsize,insamples,&options);
// fprintf(stderr,"\n Strict primer count : %d\n",words2->size);
//
// fprintf(stderr,"\n\n Primer sample : \n");
// for (i=0; i<words->size; i++)
// fprintf(stderr," + Primer : %s sequence count : %d\n",ecoUnhashWord(words->words[i],options.primer_length),words->strictcount[i]);
// fprintf(stderr,"\n\n Primer sample : \n");
// for (i=0; i<words2->size; i++)
// fprintf(stderr," + Primer : %s sequence count : %d\n",ecoUnhashWord(words2->words[i],options.primer_length),words2->strictcount[i]);
if (options.no_multi_match)
{
(void)filterMultiStrictPrimer(words);
fprintf(stderr,"\n Strict primer with single match count : %d\n",words->size);
}
fprintf(stderr,"\n\n Primer sample : \n");
for (i=0; i<MINI(10,words->size); i++)
fprintf(stderr," + Primer : %s sequence count : %d\n",ecoUnhashWord(words->words[i],options.primer_length),words->strictcount[i]);
fprintf(stderr,"\nEncoding sequences for fuzzy pattern matching...\n");
for (i=0;i<seqdbsize;i++)
{
encodeSequence(seqdb[i]);
fprintf(stderr," Encoded sequences %5d/%5d \r",(int32_t)i+1,(int32_t)seqdbsize);
}
ECOFREE(words->strictcount,"Free strict primer count table");
primers = lookforAproxPrimer(seqdb,seqdbsize,insamples,words,&options);
ECOFREE(words->words,"Free strict primer table");
ECOFREE(words,"Free strict primer structure");
fprintf(stderr,"\n\n Approximate repeats :%d \n", primers->size);
fprintf(stderr,"\n\n Primer sample : \n");
for (i=0; i<MINI(10,primers->size); i++)
fprintf(stderr," + Primer : %s example sequence count : %5d counterexample sequence count : %5d status : %s\n",ecoUnhashWord(primers->primers[i].word,options.primer_length),
primers->primers[i].inexample,
primers->primers[i].outexample,
primers->primers[i].good ? "good":"bad");
fprintf(stderr,"\n");
/*TR: Added*/
pairs = buildPrimerPairs(seqdb, seqdbsize, primers, &options);
// setoktaxforspecificity (&pairs);
printpairs (pairs, &options,taxonomy);
//ECOFREE(pairs.pairs,"Free pairs table");
return 0;
}