2009-03-04 22:32:55 +00:00
|
|
|
/*
|
|
|
|
* ecoprimer.c
|
|
|
|
*
|
|
|
|
* Created on: 7 nov. 2008
|
|
|
|
* Author: coissac
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "libecoprimer/ecoprimer.h"
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <string.h>
|
|
|
|
#include <ctype.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <getopt.h>
|
|
|
|
#include <time.h>
|
|
|
|
#include <sys/time.h>
|
|
|
|
|
2009-05-13 06:51:25 +00:00
|
|
|
#define VERSION "0.3"
|
2009-03-04 22:32:55 +00:00
|
|
|
/* TR: by default, statistics are made on species level*/
|
2009-04-22 08:15:18 +00:00
|
|
|
#define DEFAULTTAXONRANK "species"
|
|
|
|
|
|
|
|
static int cmpprintedpairs(const void* p1,const void* p2);
|
|
|
|
|
2009-03-04 22:32:55 +00:00
|
|
|
|
|
|
|
/* ----------------------------------------------- */
|
|
|
|
/* printout help */
|
|
|
|
/* ----------------------------------------------- */
|
|
|
|
#define PP fprintf(stdout,
|
|
|
|
|
|
|
|
static void PrintHelp()
|
|
|
|
{
|
2009-04-20 08:38:41 +00:00
|
|
|
PP "------------------------------------------\n");
|
|
|
|
PP " ecoPrimer Version %s\n", VERSION);
|
|
|
|
PP "------------------------------------------\n");
|
|
|
|
PP "synopsis : finding primers and measureing the quality of primers and barcode region\n");
|
|
|
|
PP "usage: ./ecoPrimer [options] \n");
|
|
|
|
PP "------------------------------------------\n");
|
|
|
|
PP "options:\n");
|
|
|
|
PP "-d : [D]atabase : to match the expected format, the database\n");
|
|
|
|
PP " has to be formated first by the ecoPCRFormat.py program located.\n");
|
|
|
|
PP " in the ecoPCR/tools directory.\n");
|
|
|
|
PP " ecoPCRFormat.py creates three file types :\n");
|
|
|
|
PP " .sdx : contains the sequences\n");
|
|
|
|
PP " .tdx : contains information concerning the taxonomy\n");
|
|
|
|
PP " .rdx : contains the taxonomy rank\n\n");
|
|
|
|
PP " ecoPrimer needs all the file type. As a result, you have to write the\n");
|
2009-04-22 08:15:18 +00:00
|
|
|
PP " database radical without any extension. For example /ecoPrimerDB/fstvert\n\n");
|
2009-04-20 08:38:41 +00:00
|
|
|
PP "-e : [E]rror : max error allowed by oligonucleotide (0 by default)\n\n");
|
|
|
|
PP "-h : [H]elp - print <this> help\n\n");
|
|
|
|
PP "-i : [I]gnore the given taxonomy id.\n\n");
|
|
|
|
PP "-l : minimum [L]ength : define the minimum amplication length. \n\n");
|
|
|
|
PP "-L : maximum [L]ength : define the maximum amplicationlength. \n\n");
|
|
|
|
PP "-r : [R]estricts the search to the given taxonomic id.\n\n");
|
|
|
|
PP "-c : Consider that the database sequences are [c]ircular\n\n");
|
2009-04-23 11:38:52 +00:00
|
|
|
// PP "-3 : Three prime strict match\n\n");
|
2009-04-20 08:38:41 +00:00
|
|
|
PP "-q : Strict matching [q]uorum, percentage of the sequences in which strict primers are found. By default it is 70\n\n");
|
|
|
|
PP "-s : [S]ensitivity quorum\n\n");
|
|
|
|
PP "-t : required [t]axon level for results, by default the results are computed at species level\n\n");
|
|
|
|
PP "-x : false positive quorum\n\n");
|
|
|
|
PP "-D : set in [d]ouble strand mode\n\n");
|
|
|
|
PP "-S : Set in [s]ingle strand mode\n\n");
|
|
|
|
PP "-U : No multi match\n\n");
|
|
|
|
PP "\n");
|
|
|
|
PP "------------------------------------------\n");
|
|
|
|
PP "Table result description : \n");
|
|
|
|
PP "column 1 : serial number\n");
|
|
|
|
PP "column 2 : primer1\n");
|
|
|
|
PP "column 3 : primer2\n");
|
|
|
|
PP "column 4 : good/bad\n");
|
|
|
|
PP "column 5 : in sequence count\n");
|
|
|
|
PP "column 6 : out sequence count\n");
|
|
|
|
PP "column 7 : yule\n");
|
|
|
|
PP "column 8 : in taxa count\n");
|
|
|
|
PP "column 9 : out taxa count\n");
|
|
|
|
PP "column 10 : coverage\n");
|
2009-04-22 08:15:18 +00:00
|
|
|
PP "column 11 : unambiguously identified taxa\n");
|
|
|
|
PP "column 12 : specificity\n");
|
|
|
|
PP "column 13 : minimum amplified length\n");
|
|
|
|
PP "column 14 : maximum amplified length\n");
|
|
|
|
PP "column 15 : average amplified length\n");
|
2009-04-20 08:38:41 +00:00
|
|
|
PP "------------------------------------------\n");
|
|
|
|
PP " http://www.grenoble.prabi.fr/trac/ecoPrimer/\n");
|
2009-04-22 08:15:18 +00:00
|
|
|
PP "------------------------------------------\n\n");
|
2009-04-20 08:38:41 +00:00
|
|
|
PP "\n");
|
|
|
|
|
2009-03-04 22:32:55 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void ExitUsage(int stat)
|
|
|
|
{
|
|
|
|
PP "usage: ecoprimer [-d database] [-l value] [-L value] [-e value] [-r taxid] [-i taxid] [-R rank] [-t taxon level]\n");
|
|
|
|
PP "type \"ecoprimer -h\" for help\n");
|
|
|
|
|
|
|
|
if (stat)
|
|
|
|
exit(stat);
|
|
|
|
}
|
|
|
|
|
|
|
|
#undef PP
|
|
|
|
|
|
|
|
void initoptions(poptions_t options)
|
|
|
|
{
|
2009-05-12 09:06:43 +00:00
|
|
|
options->statistics=FALSE;
|
2009-05-13 06:51:25 +00:00
|
|
|
options->filtering=TRUE;
|
2009-03-04 22:32:55 +00:00
|
|
|
options->lmin=0; //< Amplifia minimal length
|
2009-05-12 09:06:43 +00:00
|
|
|
options->lmax=1000; //< Amplifia maximal length
|
2009-03-04 22:32:55 +00:00
|
|
|
options->error_max=3; //**< maximum error count in fuzzy search
|
|
|
|
options->primer_length=18; //**< minimal length of the primers
|
|
|
|
options->restricted_taxid=NULL; //**< limit amplification below these taxid
|
|
|
|
options->ignored_taxid=NULL; //**< no amplification below these taxid
|
|
|
|
options->prefix=NULL;
|
2009-06-23 14:11:39 +00:00
|
|
|
options->reference=NULL;
|
|
|
|
options->refseq=NULL;
|
2009-03-04 22:32:55 +00:00
|
|
|
options->circular=0;
|
|
|
|
options->doublestrand=1;
|
|
|
|
options->strict_quorum=0.7;
|
|
|
|
options->strict_exclude_quorum=0.1;
|
|
|
|
options->sensitivity_quorum=0.9;
|
|
|
|
options->false_positive_quorum=0.1;
|
2009-04-20 08:38:41 +00:00
|
|
|
options->strict_three_prime=0;
|
2009-03-04 22:32:55 +00:00
|
|
|
options->r=0;
|
|
|
|
options->g=0;
|
|
|
|
options->no_multi_match=FALSE;
|
2009-04-22 08:15:18 +00:00
|
|
|
strcpy(options->taxonrank, DEFAULTTAXONRANK); /*taxon level for results, species by default*/
|
2009-03-04 22:32:55 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void printcurrenttime ()
|
|
|
|
{
|
|
|
|
time_t now;
|
|
|
|
struct tm *ts;
|
|
|
|
char buf[80];
|
|
|
|
|
|
|
|
/* Get the current time */
|
|
|
|
now = time(NULL);
|
|
|
|
|
|
|
|
/* Format and print the time, "ddd yyyy-mm-dd hh:mm:ss zzz" */
|
|
|
|
ts = localtime(&now);
|
|
|
|
strftime(buf, sizeof(buf), "%a %Y-%m-%d %H:%M:%S %Z", ts);
|
2009-04-20 08:38:41 +00:00
|
|
|
fprintf(stderr,"#%d#, %s\n",(int)now, buf);
|
2009-03-04 22:32:55 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void printcurrenttimeinmilli()
|
|
|
|
{
|
|
|
|
struct timeval tv;
|
|
|
|
struct timezone tz;
|
|
|
|
struct tm *tm;
|
|
|
|
gettimeofday(&tv, &tz);
|
|
|
|
tm=localtime(&tv.tv_sec);
|
|
|
|
fprintf(stderr, " %d:%02d:%02d %d %d \n", tm->tm_hour, tm->tm_min,
|
|
|
|
tm->tm_sec, tv.tv_usec, tv.tv_usec/1000);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*TR: Added*/
|
2009-04-20 08:38:41 +00:00
|
|
|
|
|
|
|
void printapair(int32_t index,ppair_t pair, poptions_t options)
|
|
|
|
{
|
2009-04-23 11:38:52 +00:00
|
|
|
bool_t asdirect1=pair->asdirect1;
|
|
|
|
bool_t asdirect2=pair->asdirect2;
|
|
|
|
bool_t asdirecttmp;
|
|
|
|
word_t w1=pair->p1->word;
|
|
|
|
word_t w2=pair->p2->word;
|
|
|
|
word_t wtmp;
|
|
|
|
bool_t good1=pair->p1->good;
|
|
|
|
bool_t good2=pair->p2->good;
|
|
|
|
bool_t goodtmp;
|
|
|
|
|
|
|
|
if (!asdirect1)
|
|
|
|
w1=ecoComplementWord(w1,options->primer_length);
|
|
|
|
|
|
|
|
if (!asdirect2)
|
|
|
|
w2=ecoComplementWord(w2,options->primer_length);
|
|
|
|
|
|
|
|
|
|
|
|
if (w2 < w1)
|
|
|
|
{
|
|
|
|
wtmp=w1;
|
|
|
|
w1=w2;
|
|
|
|
w2=wtmp;
|
|
|
|
|
|
|
|
asdirecttmp=asdirect1;
|
|
|
|
asdirect1=asdirect2;
|
|
|
|
asdirect2=asdirecttmp;
|
|
|
|
|
|
|
|
goodtmp=good1;
|
|
|
|
good1=good2;
|
|
|
|
good2=goodtmp;
|
|
|
|
}
|
2009-04-22 08:15:18 +00:00
|
|
|
|
2009-04-20 08:38:41 +00:00
|
|
|
printf("%6d\t",index);
|
|
|
|
|
2009-04-23 11:38:52 +00:00
|
|
|
printf("%s\t",ecoUnhashWord(w1,options->primer_length));
|
|
|
|
printf("%s",ecoUnhashWord(w2,options->primer_length));
|
2009-04-20 08:38:41 +00:00
|
|
|
|
2009-04-23 11:38:52 +00:00
|
|
|
printf("\t%c%c", "bG"[(int)good1],"bG"[(int)good2]);
|
2009-04-20 08:38:41 +00:00
|
|
|
|
|
|
|
|
|
|
|
printf("\t%d", pair->inexample);
|
|
|
|
printf("\t%d", pair->outexample);
|
|
|
|
printf("\t%4.3f", pair->yule);
|
|
|
|
|
|
|
|
printf("\t%d", pair->intaxa);
|
|
|
|
printf("\t%d", pair->outtaxa);
|
2009-04-22 08:15:18 +00:00
|
|
|
printf("\t%4.3f", (float)pair->bc);
|
2009-04-20 08:38:41 +00:00
|
|
|
|
2009-04-22 08:15:18 +00:00
|
|
|
printf("\t%d", pair->intaxa - pair->notwellidentifiedtaxa);
|
2009-04-20 08:38:41 +00:00
|
|
|
|
2009-04-22 08:15:18 +00:00
|
|
|
printf("\t%4.3f", pair->bs);
|
2009-04-20 08:38:41 +00:00
|
|
|
|
|
|
|
|
|
|
|
printf("\t%d", pair->mind);
|
|
|
|
printf("\t%d", pair->maxd);
|
2009-06-23 14:11:39 +00:00
|
|
|
printf("\t%3.2f", (float)pair->sumd/pair->inexample);
|
|
|
|
|
|
|
|
if (options->refseq && pair->refsequence >=0)
|
|
|
|
{
|
|
|
|
printf("\t%s:",options->reference);
|
|
|
|
|
|
|
|
|
|
|
|
if (pair->pcr.amplifias[pair->refsequence].strand)
|
|
|
|
printf("join(");
|
|
|
|
else
|
|
|
|
printf("complement(");
|
|
|
|
|
|
|
|
printf("%d..%d,%d..%d",pair->pcr.amplifias[pair->refsequence].begin - options->primer_length + 1,
|
|
|
|
pair->pcr.amplifias[pair->refsequence].begin,
|
|
|
|
pair->pcr.amplifias[pair->refsequence].end + 2,
|
|
|
|
pair->pcr.amplifias[pair->refsequence].end + options->primer_length + 1
|
|
|
|
);
|
|
|
|
printf(")");
|
|
|
|
printf("\t");
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
printf("\n");
|
2009-04-20 08:38:41 +00:00
|
|
|
|
|
|
|
}
|
|
|
|
|
2009-04-22 08:15:18 +00:00
|
|
|
static int cmpprintedpairs(const void* p1,const void* p2)
|
|
|
|
{
|
|
|
|
float s1,s2;
|
|
|
|
ppair_t pair1,pair2;
|
|
|
|
|
|
|
|
pair1=*((ppair_t*)p1);
|
|
|
|
pair2=*((ppair_t*)p2);
|
|
|
|
|
|
|
|
s1 = pair1->yule * pair1->bs;
|
|
|
|
s2 = pair2->yule * pair2->bs;
|
|
|
|
|
|
|
|
// fprintf(stderr,"s1 : %4.3f %4.3f %4.3f\n",pair1->yule , pair1->bs,s1);
|
|
|
|
// fprintf(stderr,"s2 : %4.3f %4.3f %4.3f\n\n",pair2->yule , pair2->bs,s2);
|
|
|
|
|
|
|
|
if (s1 > s2) return -1;
|
|
|
|
if (s1 < s2) return 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-04-20 08:38:41 +00:00
|
|
|
uint32_t filterandsortpairs(ppair_t* sortedpairs,uint32_t count, poptions_t options)
|
|
|
|
{
|
|
|
|
uint32_t i,j;
|
|
|
|
float q,qfp;
|
|
|
|
|
|
|
|
for (i=0,j=0;i < count;i++)
|
|
|
|
{
|
|
|
|
if (options->insamples)
|
|
|
|
q = (float)sortedpairs[i]->inexample/options->insamples;
|
|
|
|
else q=1.0;
|
|
|
|
|
|
|
|
if (options->outsamples)
|
|
|
|
qfp = (float)sortedpairs[i]->outexample/options->outsamples;
|
|
|
|
else qfp=0.0;
|
|
|
|
|
2009-04-22 08:15:18 +00:00
|
|
|
|
2009-04-20 08:38:41 +00:00
|
|
|
sortedpairs[i]->quorumin = q;
|
|
|
|
sortedpairs[i]->quorumout = qfp;
|
2009-04-22 08:15:18 +00:00
|
|
|
sortedpairs[i]->yule = q - qfp;
|
2009-04-20 08:38:41 +00:00
|
|
|
|
|
|
|
|
|
|
|
sortedpairs[j]=sortedpairs[i];
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (q > options->sensitivity_quorum &&
|
|
|
|
qfp < options->false_positive_quorum)
|
|
|
|
{
|
|
|
|
(void)taxonomycoverage(sortedpairs[j],options);
|
|
|
|
taxonomyspecificity(sortedpairs[j]);
|
2009-04-22 08:15:18 +00:00
|
|
|
// fprintf(stderr,"%4.3f %4.3f %4.3f\n",sortedpairs[j]->yule , sortedpairs[j]->bs,sortedpairs[j]->bc);
|
|
|
|
|
2009-04-20 08:38:41 +00:00
|
|
|
j++;
|
|
|
|
}
|
2009-04-22 08:15:18 +00:00
|
|
|
|
2009-04-20 08:38:41 +00:00
|
|
|
}
|
|
|
|
|
2009-04-22 08:15:18 +00:00
|
|
|
|
|
|
|
qsort(sortedpairs,j,sizeof(ppair_t),cmpprintedpairs);
|
|
|
|
|
2009-04-20 08:38:41 +00:00
|
|
|
return j;
|
|
|
|
}
|
|
|
|
|
2009-04-22 08:15:18 +00:00
|
|
|
|
2009-04-23 11:38:52 +00:00
|
|
|
void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy)
|
2009-04-20 08:38:41 +00:00
|
|
|
{
|
|
|
|
ppair_t* sortedpairs;
|
|
|
|
ppair_t* index;
|
|
|
|
ppairlist_t pl;
|
|
|
|
size_t i,j;
|
2009-04-22 08:15:18 +00:00
|
|
|
size_t count;
|
2009-04-23 11:38:52 +00:00
|
|
|
char *taxon[]={"taxon","taxa"};
|
|
|
|
ecotx_t *current_taxon;
|
|
|
|
|
2009-04-22 08:15:18 +00:00
|
|
|
|
|
|
|
//printf("Index\tPrimer1\tPrimer2\tGB\tInexampleCount\tOutexampleCount\tYule\tIntaxaCount\tOuttaxaCount\tCoverage\tSpecificity\tMinAmplifiedLength\tMaxAmplifiedLength\tAvgAmplifiedLength\n");
|
2009-04-20 08:38:41 +00:00
|
|
|
|
|
|
|
fprintf(stderr,"Total pair count : %d\n",pairs->count);
|
|
|
|
|
|
|
|
sortedpairs = ECOMALLOC(pairs->count*sizeof(ppair_t),"Cannot Allocate ordered pairs");
|
|
|
|
index=sortedpairs;
|
|
|
|
pl=pairs->first;
|
|
|
|
j=0;
|
|
|
|
while(pl->next)
|
|
|
|
{
|
|
|
|
for (i=0;i<pl->paircount;i++,j++)
|
|
|
|
sortedpairs[j]=pl->pairs+i;
|
|
|
|
pl=pl->next;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i=0;i<pl->paircount;i++,j++)
|
|
|
|
sortedpairs[j]=pl->pairs+i;
|
|
|
|
|
|
|
|
count=filterandsortpairs(sortedpairs,pairs->count,options);
|
|
|
|
|
2009-04-23 11:38:52 +00:00
|
|
|
fprintf(stderr,"Total good pair count : %u\n",(uint32_t)count);
|
|
|
|
|
|
|
|
printf("#\n");
|
|
|
|
printf("# ecoPrimer version %s\n",VERSION);
|
|
|
|
printf("# Rank level optimisation : %s\n", options->taxonrank);
|
|
|
|
printf("# max error count by oligonucleotide : %d\n",options->error_max);
|
|
|
|
printf("#\n");
|
|
|
|
|
|
|
|
if (options->r)
|
|
|
|
{
|
|
|
|
printf("# Restricted to %s:\n",taxon[(options->r>1) ? 1:0]);
|
|
|
|
for(i=0;i<(uint32_t)options->r;i++)
|
|
|
|
{
|
|
|
|
current_taxon=eco_findtaxonbytaxid(taxonomy,options->restricted_taxid[i]);
|
|
|
|
printf("# %d : %s (%s)\n", current_taxon->taxid,
|
|
|
|
current_taxon->name,
|
|
|
|
taxonomy->ranks->label[current_taxon->rank]
|
|
|
|
);
|
|
|
|
}
|
|
|
|
printf("#\n");
|
|
|
|
}
|
|
|
|
if (options->g)
|
|
|
|
{
|
|
|
|
printf("# Ignore %s:\n",taxon[(options->g>1) ? 1:0]);
|
|
|
|
for(i=0;i<(uint32_t)options->r;i++)
|
|
|
|
{
|
|
|
|
current_taxon=eco_findtaxonbytaxid(taxonomy,options->ignored_taxid[i]);
|
|
|
|
printf("# %d : %s (%s)\n", current_taxon->taxid,
|
|
|
|
current_taxon->name,
|
|
|
|
taxonomy->ranks->label[current_taxon->rank]
|
|
|
|
);
|
|
|
|
}
|
|
|
|
printf("#\n");
|
|
|
|
}
|
|
|
|
printf("# strict primer quorum : %3.2f\n",options->strict_quorum);
|
|
|
|
printf("# example quorum : %3.2f\n",options->sensitivity_quorum);
|
|
|
|
if (options->g + options->r)
|
|
|
|
printf("# counterexample quorum : %3.2f\n",options->false_positive_quorum);
|
|
|
|
|
|
|
|
printf("#\n");
|
|
|
|
printf("# database : %s\n",options->prefix);
|
|
|
|
printf("# Database is constituted of %5d examples corresponding to %5d %s\n",options->insamples,
|
|
|
|
options->intaxa,options->taxonrank);
|
|
|
|
printf("# and %5d counterexamples corresponding to %5d %s\n",options->outsamples,
|
|
|
|
options->outtaxa,options->taxonrank);
|
|
|
|
printf("#\n");
|
|
|
|
|
|
|
|
if (options->lmin && options->lmax)
|
|
|
|
printf("# amplifiat length between [%d,%d] bp\n",options->lmin,options->lmax);
|
|
|
|
else if (options->lmin)
|
|
|
|
printf("# amplifiat length larger than %d bp\n",options->lmin);
|
|
|
|
else if (options->lmax)
|
|
|
|
printf("# amplifiat length smaller than %d bp\n",options->lmax);
|
|
|
|
if (options->circular)
|
|
|
|
printf("# DB sequences are considered as circular\n");
|
|
|
|
else
|
|
|
|
printf("# DB sequences are considered as linear\n");
|
|
|
|
printf("#\n");
|
|
|
|
|
2009-04-20 08:38:41 +00:00
|
|
|
for (i=0;i < count;i++)
|
|
|
|
printapair(i,sortedpairs[i],options);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
2009-03-04 22:32:55 +00:00
|
|
|
/*updateseqparams: This function counts the insample and outsample sequences
|
|
|
|
* and with each sequences adds a tag of the taxon to which the sequence beongs*/
|
2009-04-20 08:38:41 +00:00
|
|
|
|
|
|
|
void updateseqparams (pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxonomy,
|
2009-03-04 22:32:55 +00:00
|
|
|
poptions_t options, int32_t *insamples, int32_t *outsamples)
|
|
|
|
{
|
|
|
|
uint32_t i;
|
|
|
|
int32_t taxid;
|
|
|
|
ecotx_t *tmptaxon;
|
|
|
|
|
|
|
|
for (i=0;i<seqdbsize;i++)
|
2009-04-20 08:38:41 +00:00
|
|
|
{
|
2009-03-04 22:32:55 +00:00
|
|
|
seqdb[i]->isexample=isGoodTaxon(taxonomy,seqdb[i]->taxid,options);
|
|
|
|
if (seqdb[i]->isexample)
|
|
|
|
(*insamples)++;
|
|
|
|
else
|
|
|
|
(*outsamples)++;
|
|
|
|
|
|
|
|
taxid = taxonomy->taxons->taxon[seqdb[i]->taxid].taxid;
|
2009-04-20 08:38:41 +00:00
|
|
|
tmptaxon = eco_findtaxonbytaxid(taxonomy, taxid);
|
2009-03-04 22:32:55 +00:00
|
|
|
if (tmptaxon)
|
|
|
|
tmptaxon = eco_findtaxonatrank(tmptaxon, options->taxonrankidx);
|
|
|
|
if (tmptaxon)
|
|
|
|
seqdb[i]->ranktaxonid = tmptaxon->taxid;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void setresulttaxonrank (ecotaxonomy_t *taxonomy, poptions_t options)
|
|
|
|
{
|
|
|
|
int32_t i;
|
|
|
|
|
|
|
|
/*set taxon rank for which result is to be given*/
|
|
|
|
for (i = 0; i < taxonomy->ranks->count; i++)
|
|
|
|
{
|
2009-04-20 08:38:41 +00:00
|
|
|
if (strcmp(taxonomy->ranks->label[i], options->taxonrank) == 0)
|
2009-03-04 22:32:55 +00:00
|
|
|
{
|
|
|
|
options->taxonrankidx = i;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (i == taxonomy->ranks->count)
|
|
|
|
{
|
|
|
|
fprintf(stderr,"\nUnknown taxon level: '%s'\n", options->taxonrank);
|
|
|
|
exit(0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* to get db stats, totals of species, genus etc....*/
|
|
|
|
|
2009-04-20 08:38:41 +00:00
|
|
|
|
2009-03-04 22:32:55 +00:00
|
|
|
int main(int argc, char **argv)
|
|
|
|
{
|
|
|
|
pecodnadb_t seqdb; /* of type ecoseq_t */
|
|
|
|
uint32_t seqdbsize=0;
|
|
|
|
ecotaxonomy_t *taxonomy;
|
|
|
|
|
|
|
|
options_t options;
|
|
|
|
int carg;
|
|
|
|
int32_t errflag=0;
|
|
|
|
|
|
|
|
int32_t insamples=0;
|
|
|
|
int32_t outsamples=0;
|
|
|
|
uint32_t i;
|
2009-04-20 08:38:41 +00:00
|
|
|
|
|
|
|
pwordcount_t words;
|
2009-05-13 06:51:25 +00:00
|
|
|
// pwordcount_t words2;
|
2009-04-20 08:38:41 +00:00
|
|
|
pprimercount_t primers;
|
|
|
|
ppairtree_t pairs;
|
|
|
|
|
2009-03-04 22:32:55 +00:00
|
|
|
int32_t rankdbstats = 0;
|
|
|
|
|
|
|
|
//printcurrenttime();
|
|
|
|
//return 0;
|
|
|
|
|
|
|
|
initoptions(&options);
|
|
|
|
|
2009-06-23 14:11:39 +00:00
|
|
|
while ((carg = getopt(argc, argv, "hfvcUDSd:l:L:e:i:r:R:q:3:s:x:t:O:")) != -1) {
|
2009-03-04 22:32:55 +00:00
|
|
|
|
|
|
|
switch (carg) {
|
2009-05-12 09:06:43 +00:00
|
|
|
/* ---------------------------- */
|
|
|
|
case 'v': /* set in single strand mode */
|
|
|
|
/* ---------------------------- */
|
|
|
|
options.statistics=TRUE;
|
|
|
|
break;
|
|
|
|
|
2009-05-13 06:51:25 +00:00
|
|
|
/* ---------------------------- */
|
|
|
|
case 'f': /* set in single strand mode */
|
|
|
|
/* ---------------------------- */
|
|
|
|
options.filtering=FALSE;
|
|
|
|
break;
|
|
|
|
|
2009-03-04 22:32:55 +00:00
|
|
|
/* -------------------- */
|
|
|
|
case 'd': /* database name */
|
|
|
|
/* -------------------- */
|
|
|
|
options.prefix = ECOMALLOC(strlen(optarg)+1,
|
|
|
|
"Error on prefix allocation");
|
|
|
|
strcpy(options.prefix,optarg);
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* -------------------- */
|
|
|
|
case 'h': /* help */
|
|
|
|
/* -------------------- */
|
2009-04-20 08:38:41 +00:00
|
|
|
PrintHelp();
|
|
|
|
exit(0);
|
|
|
|
break;
|
2009-03-04 22:32:55 +00:00
|
|
|
|
|
|
|
/* ------------------------- */
|
|
|
|
case 'l': /* min amplification lenght */
|
|
|
|
/* ------------------------- */
|
|
|
|
sscanf(optarg,"%d",&(options.lmin));
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* -------------------------- */
|
|
|
|
case 'L': /* max amplification lenght */
|
|
|
|
/* -------------------------- */
|
|
|
|
sscanf(optarg,"%d",&(options.lmax));
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* -------------------- */
|
|
|
|
case 'e': /* error max */
|
|
|
|
/* -------------------- */
|
|
|
|
sscanf(optarg,"%d",&(options.error_max));
|
|
|
|
break;
|
|
|
|
|
2009-04-23 11:38:52 +00:00
|
|
|
//
|
|
|
|
// /* ------------------------ */
|
|
|
|
// case '3': /* three prime strict match */
|
|
|
|
// /* ------------------------ */
|
|
|
|
// sscanf(optarg,"%d",&(options.strict_three_prime));
|
|
|
|
// break;
|
2009-03-04 22:32:55 +00:00
|
|
|
|
|
|
|
/* -------------------- */
|
|
|
|
case 'q': /* strict matching quorum */
|
|
|
|
/* -------------------- */
|
|
|
|
sscanf(optarg,"%f",&(options.strict_quorum));
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* -------------------- */
|
|
|
|
case 's': /* strict matching quorum */
|
|
|
|
/* -------------------- */
|
|
|
|
sscanf(optarg,"%f",&(options.sensitivity_quorum));
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* -------------------- */
|
|
|
|
case 't': /* required taxon level for results */
|
|
|
|
/* -------------------- */
|
|
|
|
strncpy(options.taxonrank, optarg, 19);
|
|
|
|
options.taxonrank[19] = 0;
|
|
|
|
break;
|
2009-04-20 08:38:41 +00:00
|
|
|
|
2009-03-04 22:32:55 +00:00
|
|
|
/* -------------------- */
|
|
|
|
case 'x': /* strict matching quorum */
|
|
|
|
/* -------------------- */
|
|
|
|
sscanf(optarg,"%f",&(options.false_positive_quorum));
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* ---------------------------- */
|
|
|
|
case 'D': /* set in double strand mode */
|
|
|
|
/* ---------------------------- */
|
|
|
|
options.doublestrand=1;
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* ---------------------------- */
|
|
|
|
case 'S': /* set in single strand mode */
|
|
|
|
/* ---------------------------- */
|
|
|
|
options.doublestrand=0;
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* ---------------------------- */
|
|
|
|
case 'U': /* set in single strand mode */
|
|
|
|
/* ---------------------------- */
|
|
|
|
options.no_multi_match=TRUE;
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* ------------------------------------------ */
|
|
|
|
case 'r': /* stores the restricting search taxonomic id */
|
|
|
|
/* ------------------------------------------ */
|
|
|
|
options.restricted_taxid = ECOREALLOC(options.restricted_taxid,sizeof(int32_t)*(options.r+1),
|
|
|
|
"Error on restricted_taxid reallocation");
|
|
|
|
sscanf(optarg,"%d",&(options.restricted_taxid[options.r]));
|
|
|
|
options.r++;
|
2009-06-23 14:11:39 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
/* -------------------- */
|
|
|
|
case 'R': /* reference sequence */
|
|
|
|
/* -------------------- */
|
|
|
|
options.reference = ECOMALLOC(strlen(optarg)+1,
|
|
|
|
"Error on prefix allocation");
|
|
|
|
strcpy(options.reference,optarg);
|
2009-03-04 22:32:55 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
/* --------------------------------- */
|
|
|
|
case 'i': /* stores the taxonomic id to ignore */
|
|
|
|
/* --------------------------------- */
|
|
|
|
options.ignored_taxid = ECOREALLOC(options.ignored_taxid,sizeof(int32_t)*(options.g+1),
|
|
|
|
"Error on excluded_taxid reallocation");
|
|
|
|
sscanf(optarg,"%d",&(options.ignored_taxid[options.g]));
|
|
|
|
options.g++;
|
|
|
|
break;
|
|
|
|
|
2009-04-22 08:15:18 +00:00
|
|
|
/* --------------------------------- */
|
|
|
|
case 'O': /* set primer size */
|
|
|
|
/* --------------------------------- */
|
|
|
|
sscanf(optarg,"%d",&(options.primer_length));
|
|
|
|
break;
|
|
|
|
|
2009-03-04 22:32:55 +00:00
|
|
|
/* -------------------- */
|
|
|
|
case 'c': /* sequences are circular */
|
|
|
|
/* --------------------------------- */
|
|
|
|
options.circular = 1;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case '?': /* bad option */
|
|
|
|
/* -------------------- */
|
|
|
|
errflag++;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
fprintf(stderr,"Reading taxonomy database ...");
|
|
|
|
taxonomy = read_taxonomy(options.prefix,0);
|
|
|
|
fprintf(stderr,"Ok\n");
|
2009-04-20 08:38:41 +00:00
|
|
|
|
2009-03-04 22:32:55 +00:00
|
|
|
setresulttaxonrank(taxonomy, &options); /*TR: set rank level for statistics*/
|
2009-04-20 08:38:41 +00:00
|
|
|
|
2009-03-04 22:32:55 +00:00
|
|
|
fprintf(stderr,"Reading sequence database ...\n");
|
|
|
|
|
|
|
|
seqdb = readdnadb(options.prefix,&seqdbsize);
|
|
|
|
|
2009-06-23 14:11:39 +00:00
|
|
|
if (options.reference)
|
|
|
|
for (i=0; i < seqdbsize;i++)
|
|
|
|
if (strcmp(seqdb[i]->AC,options.reference)==0)
|
|
|
|
{
|
|
|
|
options.refseq=seqdb[i];
|
|
|
|
options.refseqid=i;
|
|
|
|
fprintf(stderr,"Reference sequence %s identified\n",options.reference);
|
|
|
|
}
|
|
|
|
|
2009-03-04 22:32:55 +00:00
|
|
|
fprintf(stderr,"Ok\n");
|
|
|
|
fprintf(stderr,"Sequence read : %d\n",(int32_t)seqdbsize);
|
2009-04-20 08:38:41 +00:00
|
|
|
|
2009-03-04 22:32:55 +00:00
|
|
|
updateseqparams(seqdb, seqdbsize, taxonomy, &options, &insamples , &outsamples);
|
2009-04-20 08:38:41 +00:00
|
|
|
options.dbsize=seqdbsize;
|
|
|
|
options.insamples=insamples;
|
|
|
|
options.outsamples=outsamples;
|
2009-03-04 22:32:55 +00:00
|
|
|
|
|
|
|
rankdbstats = getrankdbstats(seqdb, seqdbsize, taxonomy, &options);
|
|
|
|
|
2009-04-20 08:38:41 +00:00
|
|
|
fprintf(stderr,"Database is constituted of %5d examples corresponding to %5d %s\n",insamples,
|
|
|
|
options.intaxa,options.taxonrank);
|
|
|
|
fprintf(stderr," and %5d counterexamples corresponding to %5d %s\n",outsamples,
|
|
|
|
options.outtaxa,options.taxonrank);
|
2009-03-04 22:32:55 +00:00
|
|
|
fprintf(stderr,"Total distinct %s count %d\n",options.taxonrank, rankdbstats);
|
|
|
|
|
|
|
|
fprintf(stderr,"\nIndexing words in sequences\n");
|
|
|
|
|
|
|
|
words = lookforStrictPrimer(seqdb,seqdbsize,insamples,&options);
|
|
|
|
fprintf(stderr,"\n Strict primer count : %d\n",words->size);
|
|
|
|
|
2009-05-13 06:51:25 +00:00
|
|
|
// options.filtering=FALSE;
|
|
|
|
// words2= lookforStrictPrimer(seqdb,seqdbsize,insamples,&options);
|
|
|
|
// fprintf(stderr,"\n Strict primer count : %d\n",words2->size);
|
|
|
|
//
|
|
|
|
// fprintf(stderr,"\n\n Primer sample : \n");
|
|
|
|
// for (i=0; i<words->size; i++)
|
|
|
|
// fprintf(stderr," + Primer : %s sequence count : %d\n",ecoUnhashWord(words->words[i],options.primer_length),words->strictcount[i]);
|
|
|
|
// fprintf(stderr,"\n\n Primer sample : \n");
|
|
|
|
// for (i=0; i<words2->size; i++)
|
|
|
|
// fprintf(stderr," + Primer : %s sequence count : %d\n",ecoUnhashWord(words2->words[i],options.primer_length),words2->strictcount[i]);
|
|
|
|
|
2009-03-04 22:32:55 +00:00
|
|
|
if (options.no_multi_match)
|
|
|
|
{
|
|
|
|
(void)filterMultiStrictPrimer(words);
|
|
|
|
fprintf(stderr,"\n Strict primer with single match count : %d\n",words->size);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
fprintf(stderr,"\n\n Primer sample : \n");
|
|
|
|
for (i=0; i<MINI(10,words->size); i++)
|
|
|
|
fprintf(stderr," + Primer : %s sequence count : %d\n",ecoUnhashWord(words->words[i],options.primer_length),words->strictcount[i]);
|
|
|
|
|
|
|
|
|
|
|
|
fprintf(stderr,"\nEncoding sequences for fuzzy pattern matching...\n");
|
|
|
|
for (i=0;i<seqdbsize;i++)
|
|
|
|
{
|
|
|
|
encodeSequence(seqdb[i]);
|
|
|
|
fprintf(stderr," Encoded sequences %5d/%5d \r",(int32_t)i+1,(int32_t)seqdbsize);
|
|
|
|
}
|
|
|
|
|
|
|
|
ECOFREE(words->strictcount,"Free strict primer count table");
|
|
|
|
|
|
|
|
primers = lookforAproxPrimer(seqdb,seqdbsize,insamples,words,&options);
|
|
|
|
|
|
|
|
ECOFREE(words->words,"Free strict primer table");
|
|
|
|
ECOFREE(words,"Free strict primer structure");
|
|
|
|
fprintf(stderr,"\n\n Approximate repeats :%d \n", primers->size);
|
|
|
|
|
|
|
|
fprintf(stderr,"\n\n Primer sample : \n");
|
|
|
|
for (i=0; i<MINI(10,primers->size); i++)
|
|
|
|
fprintf(stderr," + Primer : %s example sequence count : %5d counterexample sequence count : %5d status : %s\n",ecoUnhashWord(primers->primers[i].word,options.primer_length),
|
|
|
|
primers->primers[i].inexample,
|
|
|
|
primers->primers[i].outexample,
|
|
|
|
primers->primers[i].good ? "good":"bad");
|
|
|
|
|
|
|
|
fprintf(stderr,"\n");
|
|
|
|
|
|
|
|
/*TR: Added*/
|
|
|
|
pairs = buildPrimerPairs(seqdb, seqdbsize, primers, &options);
|
2009-04-22 08:15:18 +00:00
|
|
|
|
2009-04-20 08:38:41 +00:00
|
|
|
|
|
|
|
// setoktaxforspecificity (&pairs);
|
|
|
|
|
2009-04-23 11:38:52 +00:00
|
|
|
printpairs (pairs, &options,taxonomy);
|
2009-04-20 08:38:41 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//ECOFREE(pairs.pairs,"Free pairs table");
|
2009-03-04 22:32:55 +00:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|