ecoprimers/src/ecoprimer.c

/*
 * ecoprimer.c
 *
 *  Created on: 7 nov. 2008
 *      Author: coissac
 */

#include "libecoprimer/ecoprimer.h"
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#include <getopt.h>
#include <time.h>
#include <sys/time.h>

#define VERSION "0.3"
  /* TR: by default, statistics are made on species level*/
#define DEFAULTTAXONRANK "species"

static int cmpprintedpairs(const void* p1,const void* p2);


/* ----------------------------------------------- */
/* printout help                                   */
/* ----------------------------------------------- */
#define PP fprintf(stdout,

static void PrintHelp()
{
			  PP      "------------------------------------------\n");
			  PP      " ecoPrimer Version %s\n", VERSION);
			  PP      "------------------------------------------\n");
              PP      "synopsis : finding primers and measureing the quality of primers and barcode region\n");
              PP      "usage: ./ecoPrimer [options] \n");
              PP      "------------------------------------------\n");
              PP      "options:\n");
              PP      "-d    : [D]atabase : to match the expected format, the database\n");
              PP      "        has to be formated first by the ecoPCRFormat.py program located.\n");
              PP      "        in the ecoPCR/tools directory.\n");
              PP      "        ecoPCRFormat.py creates three file types :\n");
              PP      "            .sdx : contains the sequences\n");
              PP      "            .tdx : contains information concerning the taxonomy\n");
              PP      "            .rdx : contains the taxonomy rank\n\n");
              PP      "        ecoPrimer needs all the file type. As a result, you have to write the\n");
              PP      "        database radical without any extension. For example /ecoPrimerDB/fstvert\n\n");
              PP      "-e    : [E]rror : max error allowed by oligonucleotide (0 by default)\n\n");
              PP      "-h    : [H]elp - print <this> help\n\n");
              PP      "-i    : [I]gnore the given taxonomy id.\n\n");
              PP      "-l    : minimum [L]ength : define the minimum amplication length. \n\n");
              PP      "-L    : maximum [L]ength : define the maximum amplicationlength. \n\n");
              PP      "-r    : [R]estricts the search to the given taxonomic id.\n\n");
              PP      "-c    : Consider that the database sequences are [c]ircular\n\n");
//              PP      "-3 	 : Three prime strict match\n\n");
              PP      "-q    : Strict matching [q]uorum, percentage of the sequences in which strict primers are found. By default it is 70\n\n");
              PP      "-s    : [S]ensitivity quorum\n\n");
              PP      "-t    : required [t]axon level for results, by default the results are computed at species level\n\n");
              PP      "-x    : false positive quorum\n\n");
              PP      "-D    : set in [d]ouble strand mode\n\n");
              PP      "-S    : Set in [s]ingle strand mode\n\n");
              PP      "-U    : No multi match\n\n");
              PP      "\n");
              PP      "------------------------------------------\n");
              PP      "Table result description : \n");
              PP      "column 1 : serial number\n");
              PP      "column 2 : primer1\n");
              PP      "column 3 : primer2\n");
              PP      "column 4 : good/bad\n");
              PP      "column 5 : in sequence count\n");
              PP      "column 6 : out sequence count\n");
              PP      "column 7 : yule\n");
              PP      "column 8 : in taxa count\n");
              PP      "column 9 : out taxa count\n");
              PP      "column 10 : coverage\n");
              PP      "column 11 : unambiguously identified taxa\n");
              PP      "column 12 : specificity\n");
              PP      "column 13 : minimum amplified length\n");
              PP      "column 14 : maximum amplified length\n");
              PP      "column 15 : average amplified length\n");
              PP      "------------------------------------------\n");
              PP		" http://www.grenoble.prabi.fr/trac/ecoPrimer/\n");
              PP      "------------------------------------------\n\n");
              PP      "\n");

}

static void ExitUsage(int stat)
{
        PP      "usage: ecoprimer [-d database] [-l value] [-L value] [-e value] [-r taxid] [-i taxid] [-R rank] [-t taxon level]\n");
        PP      "type \"ecoprimer -h\" for help\n");

        if (stat)
            exit(stat);
}

#undef  PP

void initoptions(poptions_t options)
{
	options->statistics=FALSE;
	options->filtering=TRUE;
	options->lmin=0;                   //< Amplifia minimal length
	options->lmax=1000;                   //< Amplifia maximal length
	options->error_max=3;              //**< maximum error count in fuzzy search
	options->primer_length=18;          //**< minimal length of the primers
	options->restricted_taxid=NULL;      //**< limit amplification below these taxid
	options->ignored_taxid=NULL;         //**< no amplification below these taxid
	options->prefix=NULL;
	options->reference=NULL;
	options->refseq=NULL;
	options->circular=0;
	options->doublestrand=1;
	options->strict_quorum=0.7;
	options->strict_exclude_quorum=0.1;
	options->sensitivity_quorum=0.9;
	options->false_positive_quorum=0.1;
	options->strict_three_prime=0;
	options->r=0;
	options->g=0;
	options->no_multi_match=FALSE;
	strcpy(options->taxonrank, DEFAULTTAXONRANK);			/*taxon level for results, species by default*/
}

void printcurrenttime ()
{
    time_t     now;
    struct tm  *ts;
    char       buf[80];

    /* Get the current time */
    now = time(NULL);

    /* Format and print the time, "ddd yyyy-mm-dd hh:mm:ss zzz" */
    ts = localtime(&now);
    strftime(buf, sizeof(buf), "%a %Y-%m-%d %H:%M:%S %Z", ts);
    fprintf(stderr,"#%d#, %s\n",(int)now, buf);
 }

void printcurrenttimeinmilli()
{
     struct timeval tv;
     struct timezone tz;
     struct tm *tm;
     gettimeofday(&tv, &tz);
     tm=localtime(&tv.tv_sec);
     fprintf(stderr, " %d:%02d:%02d %d %d \n", tm->tm_hour, tm->tm_min,
              tm->tm_sec, tv.tv_usec, tv.tv_usec/1000);
}

/*TR: Added*/

void printapair(int32_t index,ppair_t pair, poptions_t options)
{
	bool_t asdirect1=pair->asdirect1;
	bool_t asdirect2=pair->asdirect2;
	bool_t asdirecttmp;
	word_t w1=pair->p1->word;
	word_t w2=pair->p2->word;
	word_t wtmp;
	bool_t good1=pair->p1->good;
	bool_t good2=pair->p2->good;
	bool_t goodtmp;

	if (!asdirect1)
		w1=ecoComplementWord(w1,options->primer_length);

	if (!asdirect2)
		w2=ecoComplementWord(w2,options->primer_length);


	if (w2 < w1)
	{
		wtmp=w1;
		w1=w2;
		w2=wtmp;

		asdirecttmp=asdirect1;
		asdirect1=asdirect2;
		asdirect2=asdirecttmp;

		goodtmp=good1;
		good1=good2;
		good2=goodtmp;
	}

	printf("%6d\t",index);

	printf("%s\t",ecoUnhashWord(w1,options->primer_length));
	printf("%s",ecoUnhashWord(w2,options->primer_length));

	printf("\t%c%c", "bG"[(int)good1],"bG"[(int)good2]);


	printf("\t%d", pair->inexample);
	printf("\t%d", pair->outexample);
	printf("\t%4.3f", pair->yule);

	printf("\t%d", pair->intaxa);
	printf("\t%d", pair->outtaxa);
	printf("\t%4.3f", (float)pair->bc);

	printf("\t%d", pair->intaxa - pair->notwellidentifiedtaxa);

	printf("\t%4.3f", pair->bs);


	printf("\t%d", pair->mind);
	printf("\t%d", pair->maxd);
	printf("\t%3.2f", (float)pair->sumd/pair->inexample);

	if (options->refseq && pair->refsequence >=0)
	{
		printf("\t%s:",options->reference);


		if (pair->pcr.amplifias[pair->refsequence].strand)
			printf("join(");
		else
			printf("complement(");

		printf("%d..%d,%d..%d",pair->pcr.amplifias[pair->refsequence].begin - options->primer_length + 1,
				                 pair->pcr.amplifias[pair->refsequence].begin,
							  	 pair->pcr.amplifias[pair->refsequence].end + 2,
							  	 pair->pcr.amplifias[pair->refsequence].end + options->primer_length + 1
							  	 );
		printf(")");
		printf("\t");

	}

	printf("\n");

}

static int cmpprintedpairs(const void* p1,const void* p2)
{
	float s1,s2;
	ppair_t pair1,pair2;

	pair1=*((ppair_t*)p1);
	pair2=*((ppair_t*)p2);

	s1 = pair1->yule * pair1->bs;
	s2 = pair2->yule * pair2->bs;

//	fprintf(stderr,"s1 : %4.3f  %4.3f   %4.3f\n",pair1->yule , pair1->bs,s1);
//	fprintf(stderr,"s2 : %4.3f  %4.3f   %4.3f\n\n",pair2->yule , pair2->bs,s2);

	if (s1 > s2) return -1;
	if (s1 < s2) return 1;
	return 0;
}

uint32_t filterandsortpairs(ppair_t* sortedpairs,uint32_t count, poptions_t options)
{
	uint32_t i,j;
	float q,qfp;

	for (i=0,j=0;i < count;i++)
	{
		if (options->insamples)
			q = (float)sortedpairs[i]->inexample/options->insamples;
		else q=1.0;

		if (options->outsamples)
			qfp = (float)sortedpairs[i]->outexample/options->outsamples;
		else qfp=0.0;


		sortedpairs[i]->quorumin = q;
		sortedpairs[i]->quorumout = qfp;
		sortedpairs[i]->yule = q - qfp;


		sortedpairs[j]=sortedpairs[i];


		if (q > options->sensitivity_quorum &&
			qfp < options->false_positive_quorum)
		{
			(void)taxonomycoverage(sortedpairs[j],options);
			taxonomyspecificity(sortedpairs[j]);
//			fprintf(stderr,"%4.3f  %4.3f   %4.3f\n",sortedpairs[j]->yule , sortedpairs[j]->bs,sortedpairs[j]->bc);

			j++;
		}

	}


	qsort(sortedpairs,j,sizeof(ppair_t),cmpprintedpairs);

	return j;
}


void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy)
{
   ppair_t* sortedpairs;
   ppair_t* index;
   ppairlist_t pl;
   size_t i,j;
   size_t count;
   char *taxon[]={"taxon","taxa"};
   ecotx_t    *current_taxon;


   //printf("Index\tPrimer1\tPrimer2\tGB\tInexampleCount\tOutexampleCount\tYule\tIntaxaCount\tOuttaxaCount\tCoverage\tSpecificity\tMinAmplifiedLength\tMaxAmplifiedLength\tAvgAmplifiedLength\n");

   fprintf(stderr,"Total pair count : %d\n",pairs->count);

   sortedpairs = ECOMALLOC(pairs->count*sizeof(ppair_t),"Cannot Allocate ordered pairs");
   index=sortedpairs;
   pl=pairs->first;
   j=0;
   while(pl->next)
   {
	   for (i=0;i<pl->paircount;i++,j++)
		   sortedpairs[j]=pl->pairs+i;
	   pl=pl->next;
   }

   for (i=0;i<pl->paircount;i++,j++)
	   sortedpairs[j]=pl->pairs+i;

   count=filterandsortpairs(sortedpairs,pairs->count,options);

   fprintf(stderr,"Total good pair count : %u\n",(uint32_t)count);

	printf("#\n");
	printf("# ecoPrimer version %s\n",VERSION);
	printf("# Rank level optimisation : %s\n", options->taxonrank);
	printf("# max error count by oligonucleotide : %d\n",options->error_max);
	printf("#\n");

	if (options->r)
	{
		printf("# Restricted to %s:\n",taxon[(options->r>1) ? 1:0]);
		for(i=0;i<(uint32_t)options->r;i++)
		{
			current_taxon=eco_findtaxonbytaxid(taxonomy,options->restricted_taxid[i]);
			printf("#     %d : %s (%s)\n", current_taxon->taxid,
										   current_taxon->name,
										   taxonomy->ranks->label[current_taxon->rank]
										 );
		}
		printf("#\n");
	}
	if (options->g)
	{
		printf("# Ignore  %s:\n",taxon[(options->g>1) ? 1:0]);
		for(i=0;i<(uint32_t)options->r;i++)
		{
			current_taxon=eco_findtaxonbytaxid(taxonomy,options->ignored_taxid[i]);
			printf("#     %d : %s (%s)\n", current_taxon->taxid,
										   current_taxon->name,
										   taxonomy->ranks->label[current_taxon->rank]
										 );
		}
		printf("#\n");
	}
	printf("# strict primer quorum  : %3.2f\n",options->strict_quorum);
	printf("# example quorum        : %3.2f\n",options->sensitivity_quorum);
	if (options->g + options->r)
		printf("# counterexample quorum : %3.2f\n",options->false_positive_quorum);

	printf("#\n");
	printf("# database : %s\n",options->prefix);
    printf("# Database is constituted of %5d examples        corresponding to %5d %s\n",options->insamples,
    		options->intaxa,options->taxonrank);
    printf("#                        and %5d counterexamples corresponding to %5d %s\n",options->outsamples,
    		options->outtaxa,options->taxonrank);
	printf("#\n");

	if (options->lmin && options->lmax)
		printf("# amplifiat length between [%d,%d] bp\n",options->lmin,options->lmax);
	else if (options->lmin)
		printf("# amplifiat length larger than %d bp\n",options->lmin);
	else if (options->lmax)
		printf("# amplifiat length smaller than %d bp\n",options->lmax);
	if (options->circular)
		printf("# DB sequences are considered as circular\n");
	else
		printf("# DB sequences are considered as linear\n");
	printf("#\n");

   for (i=0;i < count;i++)
	   printapair(i,sortedpairs[i],options);

}


/*updateseqparams: This function counts the insample and outsample sequences
 *  and with each sequences adds a tag of the taxon to which the sequence beongs*/

void updateseqparams (pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxonomy,
		poptions_t options, int32_t *insamples, int32_t *outsamples)
{
	uint32_t i;
	int32_t taxid;
	ecotx_t  *tmptaxon;

    for (i=0;i<seqdbsize;i++)
    {
    	seqdb[i]->isexample=isGoodTaxon(taxonomy,seqdb[i]->taxid,options);
    	if (seqdb[i]->isexample)
    		(*insamples)++;
    	else
    		(*outsamples)++;

    	taxid = taxonomy->taxons->taxon[seqdb[i]->taxid].taxid;
		tmptaxon = eco_findtaxonbytaxid(taxonomy, taxid);
		if (tmptaxon)
			tmptaxon = eco_findtaxonatrank(tmptaxon, options->taxonrankidx);
		if (tmptaxon)
			seqdb[i]->ranktaxonid = tmptaxon->taxid;
    }
}

void setresulttaxonrank (ecotaxonomy_t *taxonomy, poptions_t options)
{
	int32_t i;

    /*set taxon rank for which result is to be given*/
    for (i = 0; i < taxonomy->ranks->count; i++)
    {
    	if (strcmp(taxonomy->ranks->label[i], options->taxonrank) == 0)
    	{
    		options->taxonrankidx = i;
    		break;
    	}
    }

    if (i == taxonomy->ranks->count)
    {
    	fprintf(stderr,"\nUnknown taxon level: '%s'\n", options->taxonrank);
    	exit(0);
    }
}
/* to get db stats, totals of species, genus etc....*/


int main(int argc, char **argv)
{
	pecodnadb_t   seqdb; /* of type ecoseq_t */
	uint32_t        seqdbsize=0;
	ecotaxonomy_t *taxonomy;

	options_t     options;
	int           carg;
	int32_t       errflag=0;

	int32_t       insamples=0;
	int32_t       outsamples=0;
	uint32_t        i;

	pwordcount_t    words;
//	pwordcount_t    words2;
	pprimercount_t  primers;
	ppairtree_t		pairs;

	int32_t		  rankdbstats = 0;

	//printcurrenttime();
	//return 0;

	initoptions(&options);

    while ((carg = getopt(argc, argv, "hfvcUDSd:l:L:e:i:r:R:q:3:s:x:t:O:")) != -1) {

     switch (carg) {
								 /* ---------------------------- */
		 case 'v':               /* set in single strand mode    */
								 /* ---------------------------- */
			options.statistics=TRUE;
			break;

			 /* ---------------------------- */
		 case 'f':               /* set in single strand mode    */
			 /* ---------------------------- */
			 options.filtering=FALSE;
			 break;

                                /* -------------------- */
        case 'd':               /* database name        */
                                /* -------------------- */
        	options.prefix = ECOMALLOC(strlen(optarg)+1,
                              "Error on prefix allocation");
           strcpy(options.prefix,optarg);
           break;

                                /* -------------------- */
        case 'h':               /* help                 */
                                /* -------------------- */
        	PrintHelp();
        	 exit(0);
        	 break;

                                /* ------------------------- */
        case 'l':               /* min amplification lenght  */
                                /* ------------------------- */
           sscanf(optarg,"%d",&(options.lmin));
           break;

                                /* -------------------------- */
        case 'L':               /* max amplification lenght   */
                                /* -------------------------- */
          sscanf(optarg,"%d",&(options.lmax));
          break;

                                /* -------------------- */
        case 'e':               /* error max            */
                                /* -------------------- */
          sscanf(optarg,"%d",&(options.error_max));
          break;

//
//								/* ------------------------ */
//		case '3':               /* three prime strict match */
//                                /* ------------------------ */
//			sscanf(optarg,"%d",&(options.strict_three_prime));
//			break;

                                /* -------------------- */
        case 'q':               /* strict matching quorum           */
                                /* -------------------- */
          sscanf(optarg,"%f",&(options.strict_quorum));
          break;

								/* -------------------- */
        case 's':               /* strict matching quorum      */
								/* -------------------- */
        	sscanf(optarg,"%f",&(options.sensitivity_quorum));
        	break;

        						/* -------------------- */
        case 't':               /* required taxon level for results           */
        						/* -------------------- */
        	strncpy(options.taxonrank, optarg, 19);
        	options.taxonrank[19] = 0;
            break;

        						/* -------------------- */
        case 'x':               /* strict matching quorum           */
								/* -------------------- */
        	sscanf(optarg,"%f",&(options.false_positive_quorum));
        	break;

							    /* ---------------------------- */
		case 'D':               /* set in double strand mode    */
							    /* ---------------------------- */
			options.doublestrand=1;
			break;

								/* ---------------------------- */
		case 'S':               /* set in single strand mode    */
								/* ---------------------------- */
			options.doublestrand=0;
			break;

			                    /* ---------------------------- */
        case 'U':               /* set in single strand mode    */
			                    /* ---------------------------- */
            options.no_multi_match=TRUE;
            break;

         						/* ------------------------------------------ */
        case 'r':               /* stores the restricting search taxonomic id */
        						/* ------------------------------------------ */
          options.restricted_taxid = ECOREALLOC(options.restricted_taxid,sizeof(int32_t)*(options.r+1),
          									"Error on restricted_taxid reallocation");
          sscanf(optarg,"%d",&(options.restricted_taxid[options.r]));
          options.r++;
          break;

								/* -------------------- */
        case 'R':               /* reference sequence   */
                                /* -------------------- */
          options.reference = ECOMALLOC(strlen(optarg)+1,
                                     "Error on prefix allocation");
          strcpy(options.reference,optarg);
          break;

         						/* --------------------------------- */
        case 'i':               /* stores the taxonomic id to ignore */
        						/* --------------------------------- */
          options.ignored_taxid = ECOREALLOC(options.ignored_taxid,sizeof(int32_t)*(options.g+1),
          									"Error on excluded_taxid reallocation");
          sscanf(optarg,"%d",&(options.ignored_taxid[options.g]));
          options.g++;
          break;

			                    /* --------------------------------- */
        case 'O':               /* set primer size                   */
        						/* --------------------------------- */
			sscanf(optarg,"%d",&(options.primer_length));
			break;

                                /* -------------------- */
        case 'c':               /* sequences are circular */
        						/* --------------------------------- */
		  options.circular = 1;
          break;

        case '?':               /* bad option           */
                                /* -------------------- */
            errflag++;
   		}

	}

    fprintf(stderr,"Reading taxonomy database ...");
    taxonomy = read_taxonomy(options.prefix,0);
    fprintf(stderr,"Ok\n");

    setresulttaxonrank(taxonomy, &options); /*TR: set rank level for statistics*/

    fprintf(stderr,"Reading sequence database ...\n");

    seqdb = readdnadb(options.prefix,&seqdbsize);

    if (options.reference)
    	for (i=0; i < seqdbsize;i++)
    		if (strcmp(seqdb[i]->AC,options.reference)==0)
    		{
    			options.refseq=seqdb[i];
    			options.refseqid=i;
    			fprintf(stderr,"Reference sequence %s identified\n",options.reference);
    		}

    fprintf(stderr,"Ok\n");
    fprintf(stderr,"Sequence read : %d\n",(int32_t)seqdbsize);

    updateseqparams(seqdb, seqdbsize, taxonomy, &options, &insamples , &outsamples);
    options.dbsize=seqdbsize;
    options.insamples=insamples;
    options.outsamples=outsamples;

    rankdbstats = getrankdbstats(seqdb, seqdbsize, taxonomy, &options);

    fprintf(stderr,"Database is constituted of %5d examples        corresponding to %5d %s\n",insamples,
    		options.intaxa,options.taxonrank);
    fprintf(stderr,"                       and %5d counterexamples corresponding to %5d %s\n",outsamples,
    		options.outtaxa,options.taxonrank);
    fprintf(stderr,"Total distinct %s count %d\n",options.taxonrank, rankdbstats);

    fprintf(stderr,"\nIndexing words in sequences\n");

    words = lookforStrictPrimer(seqdb,seqdbsize,insamples,&options);
    fprintf(stderr,"\n  Strict primer count : %d\n",words->size);

//    options.filtering=FALSE;
//    words2= lookforStrictPrimer(seqdb,seqdbsize,insamples,&options);
//    fprintf(stderr,"\n  Strict primer count : %d\n",words2->size);
//
//    fprintf(stderr,"\n\n  Primer sample : \n");
//    for (i=0; i<words->size; i++)
//    	fprintf(stderr,"  + Primer : %s   sequence count : %d\n",ecoUnhashWord(words->words[i],options.primer_length),words->strictcount[i]);
//    fprintf(stderr,"\n\n  Primer sample : \n");
//    for (i=0; i<words2->size; i++)
//    	fprintf(stderr,"  + Primer : %s   sequence count : %d\n",ecoUnhashWord(words2->words[i],options.primer_length),words2->strictcount[i]);

    if (options.no_multi_match)
    {
    	(void)filterMultiStrictPrimer(words);
    	fprintf(stderr,"\n  Strict primer with single match count : %d\n",words->size);
    }


    fprintf(stderr,"\n\n  Primer sample : \n");
    for (i=0; i<MINI(10,words->size); i++)
    	fprintf(stderr,"  + Primer : %s   sequence count : %d\n",ecoUnhashWord(words->words[i],options.primer_length),words->strictcount[i]);


    fprintf(stderr,"\nEncoding sequences for fuzzy pattern matching...\n");
    for (i=0;i<seqdbsize;i++)
    {
    	encodeSequence(seqdb[i]);
    	fprintf(stderr,"  Encoded sequences %5d/%5d          \r",(int32_t)i+1,(int32_t)seqdbsize);
    }

    ECOFREE(words->strictcount,"Free strict primer count table");

    primers = lookforAproxPrimer(seqdb,seqdbsize,insamples,words,&options);

    ECOFREE(words->words,"Free strict primer table");
    ECOFREE(words,"Free strict primer structure");
    fprintf(stderr,"\n\n  Approximate repeats :%d \n", primers->size);

    fprintf(stderr,"\n\n  Primer sample : \n");
    for (i=0; i<MINI(10,primers->size); i++)
    	fprintf(stderr,"  + Primer : %s   example sequence count : %5d counterexample sequence count : %5d  status : %s\n",ecoUnhashWord(primers->primers[i].word,options.primer_length),
                       primers->primers[i].inexample,
                       primers->primers[i].outexample,
                       primers->primers[i].good ? "good":"bad");

    fprintf(stderr,"\n");

    /*TR: Added*/
    pairs = buildPrimerPairs(seqdb, seqdbsize, primers, &options);


  //  setoktaxforspecificity (&pairs);

     printpairs (pairs, &options,taxonomy);


    //ECOFREE(pairs.pairs,"Free pairs table");

    return 0;
}