473 lines
15 KiB
C
473 lines
15 KiB
C
|
/*
|
||
|
* ecoprimer.c
|
||
|
*
|
||
|
* Created on: 7 nov. 2008
|
||
|
* Author: coissac
|
||
|
*/
|
||
|
|
||
|
#include "libecoprimer/ecoprimer.h"
|
||
|
#include <stdio.h>
|
||
|
#include <string.h>
|
||
|
#include <ctype.h>
|
||
|
#include <stdlib.h>
|
||
|
#include <getopt.h>
|
||
|
#include <time.h>
|
||
|
#include <sys/time.h>
|
||
|
|
||
|
#define VERSION "0.1"
|
||
|
/* TR: by default, statistics are made on species level*/
|
||
|
#define DEFULTTAXONRANK "species"
|
||
|
|
||
|
/* ----------------------------------------------- */
|
||
|
/* printout help */
|
||
|
/* ----------------------------------------------- */
|
||
|
#define PP fprintf(stdout,
|
||
|
|
||
|
static void PrintHelp()
|
||
|
{
|
||
|
PP "------------------------------------------\n");
|
||
|
PP " ecoPrimer Version %s\n", VERSION);
|
||
|
PP "------------------------------------------\n");
|
||
|
}
|
||
|
|
||
|
static void ExitUsage(int stat)
|
||
|
{
|
||
|
PP "usage: ecoprimer [-d database] [-l value] [-L value] [-e value] [-r taxid] [-i taxid] [-R rank] [-t taxon level]\n");
|
||
|
PP "type \"ecoprimer -h\" for help\n");
|
||
|
|
||
|
if (stat)
|
||
|
exit(stat);
|
||
|
}
|
||
|
|
||
|
#undef PP
|
||
|
|
||
|
void initoptions(poptions_t options)
|
||
|
{
|
||
|
options->lmin=0; //< Amplifia minimal length
|
||
|
options->lmax=0; //< Amplifia maximal length
|
||
|
options->error_max=3; //**< maximum error count in fuzzy search
|
||
|
options->primer_length=18; //**< minimal length of the primers
|
||
|
options->restricted_taxid=NULL; //**< limit amplification below these taxid
|
||
|
options->ignored_taxid=NULL; //**< no amplification below these taxid
|
||
|
options->prefix=NULL;
|
||
|
options->circular=0;
|
||
|
options->doublestrand=1;
|
||
|
options->strict_quorum=0.7;
|
||
|
options->strict_exclude_quorum=0.1;
|
||
|
options->sensitivity_quorum=0.9;
|
||
|
options->false_positive_quorum=0.1;
|
||
|
options->strict_three_prime=2;
|
||
|
options->r=0;
|
||
|
options->g=0;
|
||
|
options->no_multi_match=FALSE;
|
||
|
strcpy(options->taxonrank, DEFULTTAXONRANK); /*taxon level for results, species by default*/
|
||
|
}
|
||
|
|
||
|
void printcurrenttime ()
|
||
|
{
|
||
|
time_t now;
|
||
|
struct tm *ts;
|
||
|
char buf[80];
|
||
|
|
||
|
/* Get the current time */
|
||
|
now = time(NULL);
|
||
|
|
||
|
/* Format and print the time, "ddd yyyy-mm-dd hh:mm:ss zzz" */
|
||
|
ts = localtime(&now);
|
||
|
strftime(buf, sizeof(buf), "%a %Y-%m-%d %H:%M:%S %Z", ts);
|
||
|
fprintf(stderr,"#%d#, %s\n",now, buf);
|
||
|
}
|
||
|
|
||
|
void printcurrenttimeinmilli()
|
||
|
{
|
||
|
struct timeval tv;
|
||
|
struct timezone tz;
|
||
|
struct tm *tm;
|
||
|
gettimeofday(&tv, &tz);
|
||
|
tm=localtime(&tv.tv_sec);
|
||
|
fprintf(stderr, " %d:%02d:%02d %d %d \n", tm->tm_hour, tm->tm_min,
|
||
|
tm->tm_sec, tv.tv_usec, tv.tv_usec/1000);
|
||
|
}
|
||
|
|
||
|
/*TR: Added*/
|
||
|
void printpairs (pairscount_t pairs, poptions_t options, int32_t rankdbstats, uint32_t seqdbsize)
|
||
|
{
|
||
|
uint32_t i;
|
||
|
uint32_t wordsize = options->primer_length;
|
||
|
uint32_t quorumseqs;
|
||
|
double sens;
|
||
|
double speci;
|
||
|
float avg;
|
||
|
|
||
|
quorumseqs = seqdbsize * 70 / 100;
|
||
|
|
||
|
printf("primer_1\tseq_1\tPrimer_2\tseq_2\tamplifia_count\t%s_snes\t%s_spe\tmin_l\tmax_l\tavr_l\n", options->taxonrank, options->taxonrank);
|
||
|
|
||
|
for (i=0; i < pairs.paircount; i++)
|
||
|
{
|
||
|
if (quorumseqs > pairs.pairs[i].inexample) continue;
|
||
|
sens = (pairs.pairs[i].taxsetindex*1.0)/rankdbstats*100;
|
||
|
speci = (pairs.pairs[i].oktaxoncount*1.0)/pairs.pairs[i].taxsetindex*100;
|
||
|
avg = (pairs.pairs[i].mind+pairs.pairs[i].maxd)*1.0/2;
|
||
|
|
||
|
printf("P1\t%s", ecoUnhashWord(pairs.pairs[i].w1, wordsize));
|
||
|
printf("\tP2\t%s", ecoUnhashWord(pairs.pairs[i].w2, wordsize));
|
||
|
printf("\t%d", pairs.pairs[i].inexample);
|
||
|
printf("\t%3.2f", sens);
|
||
|
printf("\t%3.2f", speci);
|
||
|
printf("\t%d", pairs.pairs[i].mind);
|
||
|
printf("\t%d", pairs.pairs[i].maxd);
|
||
|
printf("\t%3.2f\n", avg);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/*updateseqparams: This function counts the insample and outsample sequences
|
||
|
* and with each sequences adds a tag of the taxon to which the sequence beongs*/
|
||
|
void updateseqparams (pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxonomy,
|
||
|
poptions_t options, int32_t *insamples, int32_t *outsamples)
|
||
|
{
|
||
|
uint32_t i;
|
||
|
int32_t taxid;
|
||
|
ecotx_t *tmptaxon;
|
||
|
|
||
|
for (i=0;i<seqdbsize;i++)
|
||
|
{
|
||
|
seqdb[i]->isexample=isGoodTaxon(taxonomy,seqdb[i]->taxid,options);
|
||
|
if (seqdb[i]->isexample)
|
||
|
(*insamples)++;
|
||
|
else
|
||
|
(*outsamples)++;
|
||
|
|
||
|
taxid = taxonomy->taxons->taxon[seqdb[i]->taxid].taxid;
|
||
|
tmptaxon = eco_findtaxonbytaxid(taxonomy, taxid);
|
||
|
if (tmptaxon)
|
||
|
tmptaxon = eco_findtaxonatrank(tmptaxon, options->taxonrankidx);
|
||
|
if (tmptaxon)
|
||
|
seqdb[i]->ranktaxonid = tmptaxon->taxid;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void setresulttaxonrank (ecotaxonomy_t *taxonomy, poptions_t options)
|
||
|
{
|
||
|
int32_t i;
|
||
|
|
||
|
/*set taxon rank for which result is to be given*/
|
||
|
for (i = 0; i < taxonomy->ranks->count; i++)
|
||
|
{
|
||
|
if (strcmp(taxonomy->ranks->label[i], options->taxonrank) == 0)
|
||
|
{
|
||
|
options->taxonrankidx = i;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (i == taxonomy->ranks->count)
|
||
|
{
|
||
|
fprintf(stderr,"\nUnknown taxon level: '%s'\n", options->taxonrank);
|
||
|
exit(0);
|
||
|
}
|
||
|
}
|
||
|
/* to get db stats, totals of species, genus etc....*/
|
||
|
int32_t getrankdbstats(pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxonomy,
|
||
|
poptions_t options)
|
||
|
{
|
||
|
uint32_t i;
|
||
|
uint32_t j;
|
||
|
uint32_t nameslots = 500;
|
||
|
uint32_t namesindex = 0;
|
||
|
int32_t *ranktaxonids = ECOMALLOC(nameslots * sizeof(int32_t), "Error in taxon rank allocation");
|
||
|
int32_t taxid;
|
||
|
|
||
|
ecotx_t *tmptaxon;
|
||
|
|
||
|
for (i=0;i<seqdbsize;i++)
|
||
|
{
|
||
|
taxid = taxonomy->taxons->taxon[seqdb[i]->taxid].taxid;
|
||
|
tmptaxon = eco_findtaxonbytaxid(taxonomy, taxid);
|
||
|
if (tmptaxon)
|
||
|
tmptaxon = eco_findtaxonatrank(tmptaxon, options->taxonrankidx);
|
||
|
if (tmptaxon)
|
||
|
{
|
||
|
for (j = 0; j < namesindex; j++)
|
||
|
{
|
||
|
if (tmptaxon->taxid == ranktaxonids[j]) break;
|
||
|
}
|
||
|
if (j < namesindex) continue; /* name is already in list, so no need to add it*/
|
||
|
|
||
|
if (namesindex == nameslots)
|
||
|
{
|
||
|
nameslots += 500;
|
||
|
ranktaxonids = ECOREALLOC(ranktaxonids, nameslots * sizeof(int32_t), "Cannot allocate pair rank taxon table");
|
||
|
}
|
||
|
ranktaxonids[namesindex] = tmptaxon->taxid;
|
||
|
namesindex++;
|
||
|
}
|
||
|
}
|
||
|
ECOFREE(ranktaxonids, "free rank taxon table");
|
||
|
|
||
|
return namesindex;
|
||
|
}
|
||
|
|
||
|
void setoktaxforspecificity (ppairscount_t pairs)
|
||
|
{
|
||
|
uint32_t i;
|
||
|
uint32_t j;
|
||
|
uint32_t k;
|
||
|
uint32_t l;
|
||
|
int taxcount;
|
||
|
int32_t taxid;
|
||
|
|
||
|
for (i = 0; i < pairs->paircount; i++)
|
||
|
{
|
||
|
for (j = 0; j < pairs->pairs[i].taxsetindex; j++)
|
||
|
{
|
||
|
for (k = 0; k < pairs->pairs[i].taxset[j].amplifiaindex; k++)
|
||
|
{
|
||
|
taxid = 0;
|
||
|
taxcount = 0;
|
||
|
for (l = 0; l < pairs->pairs[i].ampsetindex; l++)
|
||
|
{
|
||
|
/*compare only char pointers because for equal strings we have same pointer in both sets*/
|
||
|
if (pairs->pairs[i].taxset[j].amplifia[k] == pairs->pairs[i].ampset[l].amplifia)
|
||
|
{
|
||
|
if (pairs->pairs[i].ampset[l].seqidindex > 1)
|
||
|
{
|
||
|
taxcount += pairs->pairs[i].ampset[l].seqidindex;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
if (taxid != pairs->pairs[i].ampset[l].taxonids[0])
|
||
|
{
|
||
|
if (!taxid) taxid = pairs->pairs[i].ampset[l].taxonids[0];
|
||
|
taxcount++;
|
||
|
}
|
||
|
|
||
|
if (taxcount > 1) break;
|
||
|
}
|
||
|
}
|
||
|
if (taxcount == 1) pairs->pairs[i].oktaxoncount++;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
int main(int argc, char **argv)
|
||
|
{
|
||
|
pecodnadb_t seqdb; /* of type ecoseq_t */
|
||
|
uint32_t seqdbsize=0;
|
||
|
ecotaxonomy_t *taxonomy;
|
||
|
|
||
|
options_t options;
|
||
|
int carg;
|
||
|
int32_t errflag=0;
|
||
|
|
||
|
int32_t insamples=0;
|
||
|
int32_t outsamples=0;
|
||
|
uint32_t i;
|
||
|
|
||
|
pwordcount_t words;
|
||
|
pprimercount_t primers;
|
||
|
pairscount_t pairs;
|
||
|
|
||
|
int32_t rankdbstats = 0;
|
||
|
|
||
|
//printcurrenttime();
|
||
|
//return 0;
|
||
|
|
||
|
initoptions(&options);
|
||
|
|
||
|
while ((carg = getopt(argc, argv, "hcUDSd:l:L:e:i:r:q:3:s:x:t:")) != -1) {
|
||
|
|
||
|
switch (carg) {
|
||
|
/* -------------------- */
|
||
|
case 'd': /* database name */
|
||
|
/* -------------------- */
|
||
|
options.prefix = ECOMALLOC(strlen(optarg)+1,
|
||
|
"Error on prefix allocation");
|
||
|
strcpy(options.prefix,optarg);
|
||
|
break;
|
||
|
|
||
|
/* -------------------- */
|
||
|
case 'h': /* help */
|
||
|
/* -------------------- */
|
||
|
PrintHelp();
|
||
|
exit(0);
|
||
|
break;
|
||
|
|
||
|
/* ------------------------- */
|
||
|
case 'l': /* min amplification lenght */
|
||
|
/* ------------------------- */
|
||
|
sscanf(optarg,"%d",&(options.lmin));
|
||
|
break;
|
||
|
|
||
|
/* -------------------------- */
|
||
|
case 'L': /* max amplification lenght */
|
||
|
/* -------------------------- */
|
||
|
sscanf(optarg,"%d",&(options.lmax));
|
||
|
break;
|
||
|
|
||
|
/* -------------------- */
|
||
|
case 'e': /* error max */
|
||
|
/* -------------------- */
|
||
|
sscanf(optarg,"%d",&(options.error_max));
|
||
|
break;
|
||
|
|
||
|
|
||
|
/* ------------------------ */
|
||
|
case '3': /* three prime strict match */
|
||
|
/* ------------------------ */
|
||
|
sscanf(optarg,"%d",&(options.strict_three_prime));
|
||
|
break;
|
||
|
|
||
|
/* -------------------- */
|
||
|
case 'q': /* strict matching quorum */
|
||
|
/* -------------------- */
|
||
|
sscanf(optarg,"%f",&(options.strict_quorum));
|
||
|
break;
|
||
|
|
||
|
/* -------------------- */
|
||
|
case 's': /* strict matching quorum */
|
||
|
/* -------------------- */
|
||
|
sscanf(optarg,"%f",&(options.sensitivity_quorum));
|
||
|
break;
|
||
|
|
||
|
/* -------------------- */
|
||
|
case 't': /* required taxon level for results */
|
||
|
/* -------------------- */
|
||
|
strncpy(options.taxonrank, optarg, 19);
|
||
|
options.taxonrank[19] = 0;
|
||
|
break;
|
||
|
|
||
|
/* -------------------- */
|
||
|
case 'x': /* strict matching quorum */
|
||
|
/* -------------------- */
|
||
|
sscanf(optarg,"%f",&(options.false_positive_quorum));
|
||
|
break;
|
||
|
|
||
|
/* ---------------------------- */
|
||
|
case 'D': /* set in double strand mode */
|
||
|
/* ---------------------------- */
|
||
|
options.doublestrand=1;
|
||
|
break;
|
||
|
|
||
|
/* ---------------------------- */
|
||
|
case 'S': /* set in single strand mode */
|
||
|
/* ---------------------------- */
|
||
|
options.doublestrand=0;
|
||
|
break;
|
||
|
|
||
|
/* ---------------------------- */
|
||
|
case 'U': /* set in single strand mode */
|
||
|
/* ---------------------------- */
|
||
|
options.no_multi_match=TRUE;
|
||
|
break;
|
||
|
|
||
|
/* ------------------------------------------ */
|
||
|
case 'r': /* stores the restricting search taxonomic id */
|
||
|
/* ------------------------------------------ */
|
||
|
options.restricted_taxid = ECOREALLOC(options.restricted_taxid,sizeof(int32_t)*(options.r+1),
|
||
|
"Error on restricted_taxid reallocation");
|
||
|
sscanf(optarg,"%d",&(options.restricted_taxid[options.r]));
|
||
|
options.r++;
|
||
|
break;
|
||
|
|
||
|
/* --------------------------------- */
|
||
|
case 'i': /* stores the taxonomic id to ignore */
|
||
|
/* --------------------------------- */
|
||
|
options.ignored_taxid = ECOREALLOC(options.ignored_taxid,sizeof(int32_t)*(options.g+1),
|
||
|
"Error on excluded_taxid reallocation");
|
||
|
sscanf(optarg,"%d",&(options.ignored_taxid[options.g]));
|
||
|
options.g++;
|
||
|
break;
|
||
|
|
||
|
/* -------------------- */
|
||
|
case 'c': /* sequences are circular */
|
||
|
/* --------------------------------- */
|
||
|
options.circular = 1;
|
||
|
break;
|
||
|
|
||
|
case '?': /* bad option */
|
||
|
/* -------------------- */
|
||
|
errflag++;
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|
||
|
fprintf(stderr,"Reading taxonomy database ...");
|
||
|
taxonomy = read_taxonomy(options.prefix,0);
|
||
|
fprintf(stderr,"Ok\n");
|
||
|
|
||
|
setresulttaxonrank(taxonomy, &options); /*TR: set rank level for statistics*/
|
||
|
|
||
|
fprintf(stderr,"Reading sequence database ...\n");
|
||
|
|
||
|
seqdb = readdnadb(options.prefix,&seqdbsize);
|
||
|
|
||
|
fprintf(stderr,"Ok\n");
|
||
|
fprintf(stderr,"Sequence read : %d\n",(int32_t)seqdbsize);
|
||
|
|
||
|
updateseqparams(seqdb, seqdbsize, taxonomy, &options, &insamples , &outsamples);
|
||
|
|
||
|
rankdbstats = getrankdbstats(seqdb, seqdbsize, taxonomy, &options);
|
||
|
|
||
|
fprintf(stderr,"Database is constituted of %5d examples\n",insamples);
|
||
|
fprintf(stderr," and %5d counterexamples\n",outsamples);
|
||
|
fprintf(stderr,"Total distinct %s count %d\n",options.taxonrank, rankdbstats);
|
||
|
|
||
|
fprintf(stderr,"\nIndexing words in sequences\n");
|
||
|
|
||
|
printcurrenttimeinmilli();
|
||
|
words = lookforStrictPrimer(seqdb,seqdbsize,insamples,&options);
|
||
|
printcurrenttimeinmilli();
|
||
|
|
||
|
fprintf(stderr,"\n Strict primer count : %d\n",words->size);
|
||
|
|
||
|
if (options.no_multi_match)
|
||
|
{
|
||
|
(void)filterMultiStrictPrimer(words);
|
||
|
fprintf(stderr,"\n Strict primer with single match count : %d\n",words->size);
|
||
|
}
|
||
|
|
||
|
|
||
|
fprintf(stderr,"\n\n Primer sample : \n");
|
||
|
for (i=0; i<MINI(10,words->size); i++)
|
||
|
fprintf(stderr," + Primer : %s sequence count : %d\n",ecoUnhashWord(words->words[i],options.primer_length),words->strictcount[i]);
|
||
|
|
||
|
|
||
|
fprintf(stderr,"\nEncoding sequences for fuzzy pattern matching...\n");
|
||
|
for (i=0;i<seqdbsize;i++)
|
||
|
{
|
||
|
encodeSequence(seqdb[i]);
|
||
|
fprintf(stderr," Encoded sequences %5d/%5d \r",(int32_t)i+1,(int32_t)seqdbsize);
|
||
|
}
|
||
|
|
||
|
ECOFREE(words->strictcount,"Free strict primer count table");
|
||
|
|
||
|
primers = lookforAproxPrimer(seqdb,seqdbsize,insamples,words,&options);
|
||
|
|
||
|
ECOFREE(words->words,"Free strict primer table");
|
||
|
ECOFREE(words,"Free strict primer structure");
|
||
|
fprintf(stderr,"\n\n Approximate repeats :%d \n", primers->size);
|
||
|
|
||
|
fprintf(stderr,"\n\n Primer sample : \n");
|
||
|
for (i=0; i<MINI(10,primers->size); i++)
|
||
|
fprintf(stderr," + Primer : %s example sequence count : %5d counterexample sequence count : %5d status : %s\n",ecoUnhashWord(primers->primers[i].word,options.primer_length),
|
||
|
primers->primers[i].inexample,
|
||
|
primers->primers[i].outexample,
|
||
|
primers->primers[i].good ? "good":"bad");
|
||
|
|
||
|
fprintf(stderr,"\n");
|
||
|
|
||
|
/*TR: Added*/
|
||
|
pairs = buildPrimerPairs(seqdb, seqdbsize, primers, &options);
|
||
|
setoktaxforspecificity (&pairs);
|
||
|
|
||
|
printpairs (pairs, &options, rankdbstats, seqdbsize);
|
||
|
|
||
|
|
||
|
|
||
|
ECOFREE(pairs.pairs,"Free pairs table");
|
||
|
|
||
|
return 0;
|
||
|
}
|