Files
ecoprimers/src/ecoprimer.c

473 lines
15 KiB
C
Raw Normal View History

/*
* ecoprimer.c
*
* Created on: 7 nov. 2008
* Author: coissac
*/
#include "libecoprimer/ecoprimer.h"
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#include <getopt.h>
#include <time.h>
#include <sys/time.h>
#define VERSION "0.1"
/* TR: by default, statistics are made on species level*/
#define DEFULTTAXONRANK "species"
/* ----------------------------------------------- */
/* printout help */
/* ----------------------------------------------- */
#define PP fprintf(stdout,
static void PrintHelp()
{
PP "------------------------------------------\n");
PP " ecoPrimer Version %s\n", VERSION);
PP "------------------------------------------\n");
}
static void ExitUsage(int stat)
{
PP "usage: ecoprimer [-d database] [-l value] [-L value] [-e value] [-r taxid] [-i taxid] [-R rank] [-t taxon level]\n");
PP "type \"ecoprimer -h\" for help\n");
if (stat)
exit(stat);
}
#undef PP
void initoptions(poptions_t options)
{
options->lmin=0; //< Amplifia minimal length
options->lmax=0; //< Amplifia maximal length
options->error_max=3; //**< maximum error count in fuzzy search
options->primer_length=18; //**< minimal length of the primers
options->restricted_taxid=NULL; //**< limit amplification below these taxid
options->ignored_taxid=NULL; //**< no amplification below these taxid
options->prefix=NULL;
options->circular=0;
options->doublestrand=1;
options->strict_quorum=0.7;
options->strict_exclude_quorum=0.1;
options->sensitivity_quorum=0.9;
options->false_positive_quorum=0.1;
options->strict_three_prime=2;
options->r=0;
options->g=0;
options->no_multi_match=FALSE;
strcpy(options->taxonrank, DEFULTTAXONRANK); /*taxon level for results, species by default*/
}
void printcurrenttime ()
{
time_t now;
struct tm *ts;
char buf[80];
/* Get the current time */
now = time(NULL);
/* Format and print the time, "ddd yyyy-mm-dd hh:mm:ss zzz" */
ts = localtime(&now);
strftime(buf, sizeof(buf), "%a %Y-%m-%d %H:%M:%S %Z", ts);
fprintf(stderr,"#%d#, %s\n",now, buf);
}
void printcurrenttimeinmilli()
{
struct timeval tv;
struct timezone tz;
struct tm *tm;
gettimeofday(&tv, &tz);
tm=localtime(&tv.tv_sec);
fprintf(stderr, " %d:%02d:%02d %d %d \n", tm->tm_hour, tm->tm_min,
tm->tm_sec, tv.tv_usec, tv.tv_usec/1000);
}
/*TR: Added*/
void printpairs (pairscount_t pairs, poptions_t options, int32_t rankdbstats, uint32_t seqdbsize)
{
uint32_t i;
uint32_t wordsize = options->primer_length;
uint32_t quorumseqs;
double sens;
double speci;
float avg;
quorumseqs = seqdbsize * 70 / 100;
printf("primer_1\tseq_1\tPrimer_2\tseq_2\tamplifia_count\t%s_snes\t%s_spe\tmin_l\tmax_l\tavr_l\n", options->taxonrank, options->taxonrank);
for (i=0; i < pairs.paircount; i++)
{
if (quorumseqs > pairs.pairs[i].inexample) continue;
sens = (pairs.pairs[i].taxsetindex*1.0)/rankdbstats*100;
speci = (pairs.pairs[i].oktaxoncount*1.0)/pairs.pairs[i].taxsetindex*100;
avg = (pairs.pairs[i].mind+pairs.pairs[i].maxd)*1.0/2;
printf("P1\t%s", ecoUnhashWord(pairs.pairs[i].w1, wordsize));
printf("\tP2\t%s", ecoUnhashWord(pairs.pairs[i].w2, wordsize));
printf("\t%d", pairs.pairs[i].inexample);
printf("\t%3.2f", sens);
printf("\t%3.2f", speci);
printf("\t%d", pairs.pairs[i].mind);
printf("\t%d", pairs.pairs[i].maxd);
printf("\t%3.2f\n", avg);
}
}
/*updateseqparams: This function counts the insample and outsample sequences
* and with each sequences adds a tag of the taxon to which the sequence beongs*/
void updateseqparams (pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxonomy,
poptions_t options, int32_t *insamples, int32_t *outsamples)
{
uint32_t i;
int32_t taxid;
ecotx_t *tmptaxon;
for (i=0;i<seqdbsize;i++)
{
seqdb[i]->isexample=isGoodTaxon(taxonomy,seqdb[i]->taxid,options);
if (seqdb[i]->isexample)
(*insamples)++;
else
(*outsamples)++;
taxid = taxonomy->taxons->taxon[seqdb[i]->taxid].taxid;
tmptaxon = eco_findtaxonbytaxid(taxonomy, taxid);
if (tmptaxon)
tmptaxon = eco_findtaxonatrank(tmptaxon, options->taxonrankidx);
if (tmptaxon)
seqdb[i]->ranktaxonid = tmptaxon->taxid;
}
}
void setresulttaxonrank (ecotaxonomy_t *taxonomy, poptions_t options)
{
int32_t i;
/*set taxon rank for which result is to be given*/
for (i = 0; i < taxonomy->ranks->count; i++)
{
if (strcmp(taxonomy->ranks->label[i], options->taxonrank) == 0)
{
options->taxonrankidx = i;
break;
}
}
if (i == taxonomy->ranks->count)
{
fprintf(stderr,"\nUnknown taxon level: '%s'\n", options->taxonrank);
exit(0);
}
}
/* to get db stats, totals of species, genus etc....*/
int32_t getrankdbstats(pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxonomy,
poptions_t options)
{
uint32_t i;
uint32_t j;
uint32_t nameslots = 500;
uint32_t namesindex = 0;
int32_t *ranktaxonids = ECOMALLOC(nameslots * sizeof(int32_t), "Error in taxon rank allocation");
int32_t taxid;
ecotx_t *tmptaxon;
for (i=0;i<seqdbsize;i++)
{
taxid = taxonomy->taxons->taxon[seqdb[i]->taxid].taxid;
tmptaxon = eco_findtaxonbytaxid(taxonomy, taxid);
if (tmptaxon)
tmptaxon = eco_findtaxonatrank(tmptaxon, options->taxonrankidx);
if (tmptaxon)
{
for (j = 0; j < namesindex; j++)
{
if (tmptaxon->taxid == ranktaxonids[j]) break;
}
if (j < namesindex) continue; /* name is already in list, so no need to add it*/
if (namesindex == nameslots)
{
nameslots += 500;
ranktaxonids = ECOREALLOC(ranktaxonids, nameslots * sizeof(int32_t), "Cannot allocate pair rank taxon table");
}
ranktaxonids[namesindex] = tmptaxon->taxid;
namesindex++;
}
}
ECOFREE(ranktaxonids, "free rank taxon table");
return namesindex;
}
void setoktaxforspecificity (ppairscount_t pairs)
{
uint32_t i;
uint32_t j;
uint32_t k;
uint32_t l;
int taxcount;
int32_t taxid;
for (i = 0; i < pairs->paircount; i++)
{
for (j = 0; j < pairs->pairs[i].taxsetindex; j++)
{
for (k = 0; k < pairs->pairs[i].taxset[j].amplifiaindex; k++)
{
taxid = 0;
taxcount = 0;
for (l = 0; l < pairs->pairs[i].ampsetindex; l++)
{
/*compare only char pointers because for equal strings we have same pointer in both sets*/
if (pairs->pairs[i].taxset[j].amplifia[k] == pairs->pairs[i].ampset[l].amplifia)
{
if (pairs->pairs[i].ampset[l].seqidindex > 1)
{
taxcount += pairs->pairs[i].ampset[l].seqidindex;
break;
}
if (taxid != pairs->pairs[i].ampset[l].taxonids[0])
{
if (!taxid) taxid = pairs->pairs[i].ampset[l].taxonids[0];
taxcount++;
}
if (taxcount > 1) break;
}
}
if (taxcount == 1) pairs->pairs[i].oktaxoncount++;
}
}
}
}
int main(int argc, char **argv)
{
pecodnadb_t seqdb; /* of type ecoseq_t */
uint32_t seqdbsize=0;
ecotaxonomy_t *taxonomy;
options_t options;
int carg;
int32_t errflag=0;
int32_t insamples=0;
int32_t outsamples=0;
uint32_t i;
pwordcount_t words;
pprimercount_t primers;
pairscount_t pairs;
int32_t rankdbstats = 0;
//printcurrenttime();
//return 0;
initoptions(&options);
while ((carg = getopt(argc, argv, "hcUDSd:l:L:e:i:r:q:3:s:x:t:")) != -1) {
switch (carg) {
/* -------------------- */
case 'd': /* database name */
/* -------------------- */
options.prefix = ECOMALLOC(strlen(optarg)+1,
"Error on prefix allocation");
strcpy(options.prefix,optarg);
break;
/* -------------------- */
case 'h': /* help */
/* -------------------- */
PrintHelp();
exit(0);
break;
/* ------------------------- */
case 'l': /* min amplification lenght */
/* ------------------------- */
sscanf(optarg,"%d",&(options.lmin));
break;
/* -------------------------- */
case 'L': /* max amplification lenght */
/* -------------------------- */
sscanf(optarg,"%d",&(options.lmax));
break;
/* -------------------- */
case 'e': /* error max */
/* -------------------- */
sscanf(optarg,"%d",&(options.error_max));
break;
/* ------------------------ */
case '3': /* three prime strict match */
/* ------------------------ */
sscanf(optarg,"%d",&(options.strict_three_prime));
break;
/* -------------------- */
case 'q': /* strict matching quorum */
/* -------------------- */
sscanf(optarg,"%f",&(options.strict_quorum));
break;
/* -------------------- */
case 's': /* strict matching quorum */
/* -------------------- */
sscanf(optarg,"%f",&(options.sensitivity_quorum));
break;
/* -------------------- */
case 't': /* required taxon level for results */
/* -------------------- */
strncpy(options.taxonrank, optarg, 19);
options.taxonrank[19] = 0;
break;
/* -------------------- */
case 'x': /* strict matching quorum */
/* -------------------- */
sscanf(optarg,"%f",&(options.false_positive_quorum));
break;
/* ---------------------------- */
case 'D': /* set in double strand mode */
/* ---------------------------- */
options.doublestrand=1;
break;
/* ---------------------------- */
case 'S': /* set in single strand mode */
/* ---------------------------- */
options.doublestrand=0;
break;
/* ---------------------------- */
case 'U': /* set in single strand mode */
/* ---------------------------- */
options.no_multi_match=TRUE;
break;
/* ------------------------------------------ */
case 'r': /* stores the restricting search taxonomic id */
/* ------------------------------------------ */
options.restricted_taxid = ECOREALLOC(options.restricted_taxid,sizeof(int32_t)*(options.r+1),
"Error on restricted_taxid reallocation");
sscanf(optarg,"%d",&(options.restricted_taxid[options.r]));
options.r++;
break;
/* --------------------------------- */
case 'i': /* stores the taxonomic id to ignore */
/* --------------------------------- */
options.ignored_taxid = ECOREALLOC(options.ignored_taxid,sizeof(int32_t)*(options.g+1),
"Error on excluded_taxid reallocation");
sscanf(optarg,"%d",&(options.ignored_taxid[options.g]));
options.g++;
break;
/* -------------------- */
case 'c': /* sequences are circular */
/* --------------------------------- */
options.circular = 1;
break;
case '?': /* bad option */
/* -------------------- */
errflag++;
}
}
fprintf(stderr,"Reading taxonomy database ...");
taxonomy = read_taxonomy(options.prefix,0);
fprintf(stderr,"Ok\n");
setresulttaxonrank(taxonomy, &options); /*TR: set rank level for statistics*/
fprintf(stderr,"Reading sequence database ...\n");
seqdb = readdnadb(options.prefix,&seqdbsize);
fprintf(stderr,"Ok\n");
fprintf(stderr,"Sequence read : %d\n",(int32_t)seqdbsize);
updateseqparams(seqdb, seqdbsize, taxonomy, &options, &insamples , &outsamples);
rankdbstats = getrankdbstats(seqdb, seqdbsize, taxonomy, &options);
fprintf(stderr,"Database is constituted of %5d examples\n",insamples);
fprintf(stderr," and %5d counterexamples\n",outsamples);
fprintf(stderr,"Total distinct %s count %d\n",options.taxonrank, rankdbstats);
fprintf(stderr,"\nIndexing words in sequences\n");
printcurrenttimeinmilli();
words = lookforStrictPrimer(seqdb,seqdbsize,insamples,&options);
printcurrenttimeinmilli();
fprintf(stderr,"\n Strict primer count : %d\n",words->size);
if (options.no_multi_match)
{
(void)filterMultiStrictPrimer(words);
fprintf(stderr,"\n Strict primer with single match count : %d\n",words->size);
}
fprintf(stderr,"\n\n Primer sample : \n");
for (i=0; i<MINI(10,words->size); i++)
fprintf(stderr," + Primer : %s sequence count : %d\n",ecoUnhashWord(words->words[i],options.primer_length),words->strictcount[i]);
fprintf(stderr,"\nEncoding sequences for fuzzy pattern matching...\n");
for (i=0;i<seqdbsize;i++)
{
encodeSequence(seqdb[i]);
fprintf(stderr," Encoded sequences %5d/%5d \r",(int32_t)i+1,(int32_t)seqdbsize);
}
ECOFREE(words->strictcount,"Free strict primer count table");
primers = lookforAproxPrimer(seqdb,seqdbsize,insamples,words,&options);
ECOFREE(words->words,"Free strict primer table");
ECOFREE(words,"Free strict primer structure");
fprintf(stderr,"\n\n Approximate repeats :%d \n", primers->size);
fprintf(stderr,"\n\n Primer sample : \n");
for (i=0; i<MINI(10,primers->size); i++)
fprintf(stderr," + Primer : %s example sequence count : %5d counterexample sequence count : %5d status : %s\n",ecoUnhashWord(primers->primers[i].word,options.primer_length),
primers->primers[i].inexample,
primers->primers[i].outexample,
primers->primers[i].good ? "good":"bad");
fprintf(stderr,"\n");
/*TR: Added*/
pairs = buildPrimerPairs(seqdb, seqdbsize, primers, &options);
setoktaxforspecificity (&pairs);
printpairs (pairs, &options, rankdbstats, seqdbsize);
ECOFREE(pairs.pairs,"Free pairs table");
return 0;
}