Merge of the eric-test branch into the trunk

git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@200 60f365c0-8329-0410-b2a4-ec073aeeaa1d
2009-04-20 08:38:41 +00:00
parent b8af5dd65f
commit e3d922e103
17 changed files with 1308 additions and 264 deletions
--- a/src/ecoPrimer
+++ b/src/ecoPrimer
--- a/src/ecoprimer.c
+++ b/src/ecoprimer.c
@ -28,6 +28,56 @@ static void PrintHelp()
 			  PP      "------------------------------------------\n");
 			  PP      " ecoPrimer Version %s\n", VERSION);
 			  PP      "------------------------------------------\n");
              PP      "synopsis : finding primers and measureing the quality of primers and barcode region\n");
              PP      "usage: ./ecoPrimer [options] \n");
              PP      "------------------------------------------\n");
              PP      "options:\n");
              PP      "-d    : [D]atabase : to match the expected format, the database\n");
              PP      "        has to be formated first by the ecoPCRFormat.py program located.\n");
              PP      "        in the ecoPCR/tools directory.\n");
              PP      "        ecoPCRFormat.py creates three file types :\n");
              PP      "            .sdx : contains the sequences\n");
              PP      "            .tdx : contains information concerning the taxonomy\n");
              PP      "            .rdx : contains the taxonomy rank\n\n");
              PP      "        ecoPrimer needs all the file type. As a result, you have to write the\n");
              PP      "        database radical without any extension. For example /ecoPrimerDB/fstvert\n\n");        
              PP      "-e    : [E]rror : max error allowed by oligonucleotide (0 by default)\n\n");
              PP      "-h    : [H]elp - print <this> help\n\n");
              PP      "-i    : [I]gnore the given taxonomy id.\n\n");
              PP      "-l    : minimum [L]ength : define the minimum amplication length. \n\n");
              PP      "-L    : maximum [L]ength : define the maximum amplicationlength. \n\n");
              PP      "-r    : [R]estricts the search to the given taxonomic id.\n\n");
              PP      "-c    : Consider that the database sequences are [c]ircular\n\n");
              PP      "-3 	 : Three prime strict match\n\n");
              PP      "-q    : Strict matching [q]uorum, percentage of the sequences in which strict primers are found. By default it is 70\n\n");
              PP      "-s    : [S]ensitivity quorum\n\n");
              PP      "-t    : required [t]axon level for results, by default the results are computed at species level\n\n");
              PP      "-x    : false positive quorum\n\n");
              PP      "-D    : set in [d]ouble strand mode\n\n");
              PP      "-S    : Set in [s]ingle strand mode\n\n");
              PP      "-U    : No multi match\n\n");
              PP      "\n");
              PP      "------------------------------------------\n");
              PP      "Table result description : \n");
              PP      "column 1 : serial number\n");
              PP      "column 2 : primer1\n");
              PP      "column 3 : primer2\n");
              PP      "column 4 : good/bad\n");
              PP      "column 5 : in sequence count\n");
              PP      "column 6 : out sequence count\n");
              PP      "column 7 : yule\n");
              PP      "column 8 : in taxa count\n");
              PP      "column 9 : out taxa count\n");
              PP      "column 10 : coverage\n");
              PP      "column 11 : specificity\n");
              PP      "column 12 : minimum amplified length\n");
              PP      "column 13 : maximum amplified length\n");
              PP      "column 14 : average amplified length\n");
              PP      "------------------------------------------\n");
              PP		" http://www.grenoble.prabi.fr/trac/ecoPrimer/\n");
              PP      "------------------------------------------\n\n");        
              PP      "\n");
 }
 static void ExitUsage(int stat)
@ -56,7 +106,7 @@ void initoptions(poptions_t options)
 	options->strict_exclude_quorum=0.1;
 	options->sensitivity_quorum=0.9;
 	options->false_positive_quorum=0.1;
-	options->strict_three_prime=2;
+	options->strict_three_prime=0;
 	options->r=0;
 	options->g=0;
 	options->no_multi_match=FALSE;
@ -75,7 +125,7 @@ void printcurrenttime ()
    /* Format and print the time, "ddd yyyy-mm-dd hh:mm:ss zzz" */
    ts = localtime(&now);
    strftime(buf, sizeof(buf), "%a %Y-%m-%d %H:%M:%S %Z", ts);
-    fprintf(stderr,"#%d#, %s\n",now, buf);
+    fprintf(stderr,"#%d#, %s\n",(int)now, buf);
 }
 void printcurrenttimeinmilli()
@ -90,7 +140,125 @@ void printcurrenttimeinmilli()
 }
 /*TR: Added*/
 /*
  * printapair: print one result row describing a primer pair on stdout.
  *
  * The row is tab separated and holds, in order: serial number, primer 1,
  * primer 2, good/bad flag of each primer, example and counterexample
  * sequence counts, yule, example and counterexample taxa counts, coverage,
  * specificity, minimum, maximum and average amplified length.
  *
  * index   : serial number printed in the first column
  * pair    : pair to describe; a primer whose asdirect flag is not set is
  *           printed as its complemented word
  * options : supplies primer_length and the intaxa / outtaxa totals used as
  *           denominators for coverage and specificity
  */
 void printapair(int32_t index,ppair_t pair, poptions_t options)
 {
 	static const char flags[] = "bG";
 	word_t   firstword  = pair->p1->word;
 	word_t   secondword = pair->p2->word;
 	uint32_t resolvedtaxa;
 	/* a primer that is not used in direct orientation is shown complemented */
 	if (!pair->asdirect1)
 		firstword = ecoComplementWord(firstword,options->primer_length);
 	if (!pair->asdirect2)
 		secondword = ecoComplementWord(secondword,options->primer_length);
 	/* serial number and both primers */
 	printf("%6d\t",index);
 	printf("%s\t",ecoUnhashWord(firstword,options->primer_length));
 	printf("%s",ecoUnhashWord(secondword,options->primer_length));
 	/* one flag character per primer, selected by its good field */
 	printf("\t%c%c", flags[(int)pair->p1->good],flags[(int)pair->p2->good]);
 	/* sequence level statistics */
 	printf("\t%d", pair->inexample);
 	printf("\t%d", pair->outexample);
 	printf("\t%4.3f", pair->yule);
 	/* taxon level statistics */
 	printf("\t%d", pair->intaxa);
 	printf("\t%d", pair->outtaxa);
 	/* coverage: share of the example taxa reached by this pair */
 	printf("\t%4.3f", (float)pair->intaxa/options->intaxa);
 	/* specificity: taxa of the pair that are not flagged as not well
 	 * identified, relative to all taxa counted in options */
 	resolvedtaxa = (pair->intaxa + pair->outtaxa) - pair->notwellidentifiedtaxa;
 	printf("\t%4.3f", (float)resolvedtaxa/(options->intaxa + options->outtaxa));
 	/* amplified length: minimum, maximum and mean over example sequences */
 	printf("\t%d", pair->mind);
 	printf("\t%d", pair->maxd);
 	printf("\t%3.2f\n", (float)pair->sumd/pair->inexample);
 }
 /*
  * filterandsortpairs: compute the quorum statistics of every pair and keep
  * only the pairs that satisfy both quorums.
  *
  * For each of the count entries of sortedpairs the function stores
  * quorumin (inexample / options->insamples, or 1.0 when insamples is 0),
  * quorumout (outexample / options->outsamples, or 0.0 when outsamples is 0)
  * and yule (quorumin - quorumout) in the pair.  A pair is kept when
  * quorumin > options->sensitivity_quorum and
  * quorumout < options->false_positive_quorum; kept pairs are compacted to
  * the front of sortedpairs and get their taxonomy coverage and specificity
  * computed.
  *
  * Despite its name this function does not reorder the kept pairs.
  *
  * Returns the number of kept pairs.
  */
 uint32_t filterandsortpairs(ppair_t* sortedpairs,uint32_t count, poptions_t options)
 {
 	uint32_t current;
 	uint32_t kept = 0;
 	for (current = 0; current < count; current++)
 	{
 		ppair_t candidate     = sortedpairs[current];
 		float   sensitivity   = 1.0;
 		float   falsepositive = 0.0;
 		if (options->insamples)
 			sensitivity = (float)candidate->inexample/options->insamples;
 		if (options->outsamples)
 			falsepositive = (float)candidate->outexample/options->outsamples;
 		candidate->quorumin  = sensitivity;
 		candidate->quorumout = falsepositive;
 		candidate->yule      = sensitivity - falsepositive;
 		/* the slot is written even for rejected pairs; it is simply
 		 * overwritten by the next candidate in that case */
 		sortedpairs[kept] = candidate;
 		if (!(sensitivity > options->sensitivity_quorum &&
 		      falsepositive < options->false_positive_quorum))
 			continue;
 		(void)taxonomycoverage(sortedpairs[kept],options);
 		taxonomyspecificity(sortedpairs[kept]);
 		kept++;
 	}
 	return kept;
 }
 /*
  * printpairs: filter the collected primer pairs and print the kept ones.
  *
  * The pairs stored in the chained pairlist_t blocks of the tree are
  * flattened into a temporary array of pointers, passed through
  * filterandsortpairs, and every kept pair is printed with printapair.
  * The total pair count is reported on stderr first.
  *
  * pairs   : pair collection; pairs->count entries are expected in the
  *           blocks reachable from pairs->first
  * options : filtering thresholds and database statistics, forwarded to
  *           filterandsortpairs and printapair
  */
 void printpairs (ppairtree_t pairs, poptions_t options)
 {
   ppair_t*    sortedpairs;
   ppairlist_t pl;
   size_t      total;
   size_t      i,j;
   uint32_t    count;
   fprintf(stderr,"Total pair count : %d\n",pairs->count);
   /* nothing to filter or print; also avoids a zero byte allocation */
   if (pairs->count <= 0)
 	  return;
   total = (size_t)pairs->count;
   sortedpairs = ECOMALLOC(total*sizeof(ppair_t),"Cannot Allocate ordered pairs");
   /* flatten the block list; an empty list (first == NULL) is tolerated and
    * the copy never runs past the allocated array */
   j=0;
   for (pl=pairs->first; pl && j < total; pl=pl->next)
 	  for (i=0; i < pl->paircount && j < total; i++,j++)
 		  sortedpairs[j]=pl->pairs+i;
   /* only the slots actually filled are handed to the filter */
   count=filterandsortpairs(sortedpairs,(uint32_t)j,options);
   for (i=0;i < count;i++)
 	  printapair((int32_t)i,sortedpairs[i],options);
   /* the pointer array is private to this function */
   ECOFREE(sortedpairs,"Free ordered pairs");
 }
 #ifdef MASKEDCODE
 void printpairs (pairscount_t pairs, poptions_t options, int32_t rankdbstats, uint32_t seqdbsize)
 {
 	uint32_t i;
 	uint32_t  wordsize = options->primer_length;
@ -121,8 +289,11 @@ void printpairs (pairscount_t pairs, poptions_t options, int32_t rankdbstats, ui
 	}
 }
 #endif /* MASKEDCODE */
 /*updateseqparams: This function counts the insample and outsample sequences
 *  and tags each sequence with the taxon to which the sequence belongs*/
 void updateseqparams (pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxonomy,
 		poptions_t options, int32_t *insamples, int32_t *outsamples)
 {
@ -168,47 +339,10 @@ void setresulttaxonrank (ecotaxonomy_t *taxonomy, poptions_t options)
    }
 }
 /* to get db stats, totals of species, genus etc....*/
 int32_t getrankdbstats(pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxonomy, 
 		poptions_t options)
 {
 	uint32_t i;
 	uint32_t j;
 	uint32_t nameslots = 500;
 	uint32_t namesindex = 0;
 	int32_t *ranktaxonids = ECOMALLOC(nameslots * sizeof(int32_t), "Error in taxon rank allocation");
 	int32_t taxid;
-	ecotx_t  *tmptaxon;
+#ifdef MASKEDCODE
-    for (i=0;i<seqdbsize;i++)
+void setoktaxforspecificity (ppairtree_t pairs)
 	{
    	taxid = taxonomy->taxons->taxon[seqdb[i]->taxid].taxid;
 		tmptaxon = eco_findtaxonbytaxid(taxonomy, taxid);
 		if (tmptaxon)
 			tmptaxon = eco_findtaxonatrank(tmptaxon, options->taxonrankidx);
 		if (tmptaxon)
 		{
 			for (j = 0; j < namesindex; j++)
 			{
 				if (tmptaxon->taxid == ranktaxonids[j]) break;
 			}
 			if (j < namesindex) continue; /* name is already in list, so no need to add it*/
 			if (namesindex == nameslots)
 			{
 				nameslots += 500;
 				ranktaxonids = ECOREALLOC(ranktaxonids, nameslots * sizeof(int32_t), "Cannot allocate pair rank taxon table");
 			}
 			ranktaxonids[namesindex] = tmptaxon->taxid;
 			namesindex++;
 		}
 	}
    ECOFREE(ranktaxonids, "free rank taxon table");
    return namesindex;
 }
 void setoktaxforspecificity (ppairscount_t pairs)
 {
 	uint32_t i;
 	uint32_t j;
@ -251,6 +385,8 @@ void setoktaxforspecificity (ppairscount_t pairs)
 	}
 }
 #endif
 int main(int argc, char **argv)
 {
 	pecodnadb_t   seqdb; /* of type ecoseq_t */
@ -267,7 +403,7 @@ int main(int argc, char **argv)
 	pwordcount_t    words;
 	pprimercount_t  primers;
-	pairscount_t		pairs;
+	ppairtree_t		pairs;
 	int32_t		  rankdbstats = 0;
@ -407,11 +543,16 @@ int main(int argc, char **argv)
    fprintf(stderr,"Sequence read : %d\n",(int32_t)seqdbsize);
    updateseqparams(seqdb, seqdbsize, taxonomy, &options, &insamples , &outsamples);
    options.dbsize=seqdbsize;
    options.insamples=insamples;
    options.outsamples=outsamples;
    rankdbstats = getrankdbstats(seqdb, seqdbsize, taxonomy, &options);
-    fprintf(stderr,"Database is constituted of %5d examples\n",insamples);
+    fprintf(stderr,"Database is constituted of %5d examples        corresponding to %5d %s\n",insamples,
-    fprintf(stderr,"                       and %5d counterexamples\n",outsamples);
+    		options.intaxa,options.taxonrank);
    fprintf(stderr,"                       and %5d counterexamples corresponding to %5d %s\n",outsamples,
    		options.outtaxa,options.taxonrank);
    fprintf(stderr,"Total distinct %s count %d\n",options.taxonrank, rankdbstats);
    fprintf(stderr,"\nIndexing words in sequences\n");
@ -460,13 +601,15 @@ int main(int argc, char **argv)
    /*TR: Added*/
    pairs = buildPrimerPairs(seqdb, seqdbsize, primers, &options);
    setoktaxforspecificity (&pairs);
-    printpairs (pairs, &options, rankdbstats, seqdbsize);
+
  //  setoktaxforspecificity (&pairs);
     printpairs (pairs, &options);
-    ECOFREE(pairs.pairs,"Free pairs table");
+    //ECOFREE(pairs.pairs,"Free pairs table");
    return 0;
 }
--- a/src/libecoPCR/ecoError.P
+++ b/src/libecoPCR/ecoError.P
@ -0,0 +1,15 @@
 ecoError.o ecoError.P : ecoError.c ecoPCR.h /usr/include/stdio.h \
  /usr/include/_types.h /usr/include/sys/_types.h \
  /usr/include/sys/cdefs.h /usr/include/machine/_types.h \
  /usr/include/i386/_types.h /usr/include/inttypes.h \
  /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \
  /usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \
  /usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \
  /usr/include/machine/signal.h /usr/include/i386/signal.h \
  /usr/include/i386/_structs.h /usr/include/sys/_structs.h \
  /usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \
  /usr/include/sys/resource.h /usr/include/machine/endian.h \
  /usr/include/i386/endian.h /usr/include/sys/_endian.h \
  /usr/include/libkern/_OSByteOrder.h \
  /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \
  /usr/include/machine/types.h /usr/include/i386/types.h
--- a/src/libecoPCR/ecoIOUtils.P
+++ b/src/libecoPCR/ecoIOUtils.P
@ -0,0 +1,15 @@
 ecoIOUtils.o ecoIOUtils.P : ecoIOUtils.c ecoPCR.h /usr/include/stdio.h \
  /usr/include/_types.h /usr/include/sys/_types.h \
  /usr/include/sys/cdefs.h /usr/include/machine/_types.h \
  /usr/include/i386/_types.h /usr/include/inttypes.h \
  /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \
  /usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \
  /usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \
  /usr/include/machine/signal.h /usr/include/i386/signal.h \
  /usr/include/i386/_structs.h /usr/include/sys/_structs.h \
  /usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \
  /usr/include/sys/resource.h /usr/include/machine/endian.h \
  /usr/include/i386/endian.h /usr/include/sys/_endian.h \
  /usr/include/libkern/_OSByteOrder.h \
  /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \
  /usr/include/machine/types.h /usr/include/i386/types.h
--- a/src/libecoPCR/ecoMalloc.P
+++ b/src/libecoPCR/ecoMalloc.P
@ -0,0 +1,15 @@
 ecoMalloc.o ecoMalloc.P : ecoMalloc.c ecoPCR.h /usr/include/stdio.h \
  /usr/include/_types.h /usr/include/sys/_types.h \
  /usr/include/sys/cdefs.h /usr/include/machine/_types.h \
  /usr/include/i386/_types.h /usr/include/inttypes.h \
  /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \
  /usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \
  /usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \
  /usr/include/machine/signal.h /usr/include/i386/signal.h \
  /usr/include/i386/_structs.h /usr/include/sys/_structs.h \
  /usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \
  /usr/include/sys/resource.h /usr/include/machine/endian.h \
  /usr/include/i386/endian.h /usr/include/sys/_endian.h \
  /usr/include/libkern/_OSByteOrder.h \
  /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \
  /usr/include/machine/types.h /usr/include/i386/types.h
--- a/src/libecoPCR/ecodna.P
+++ b/src/libecoPCR/ecodna.P
@ -0,0 +1,5 @@
 ecodna.o ecodna.P : ecodna.c /usr/include/string.h /usr/include/_types.h \
  /usr/include/sys/_types.h /usr/include/sys/cdefs.h \
  /usr/include/machine/_types.h /usr/include/i386/_types.h ecoPCR.h \
  /usr/include/stdio.h /usr/include/inttypes.h \
  /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h
--- a/src/libecoPCR/ecofilter.P
+++ b/src/libecoPCR/ecofilter.P
@ -0,0 +1,5 @@
 ecofilter.o ecofilter.P : ecofilter.c ecoPCR.h /usr/include/stdio.h \
  /usr/include/_types.h /usr/include/sys/_types.h \
  /usr/include/sys/cdefs.h /usr/include/machine/_types.h \
  /usr/include/i386/_types.h /usr/include/inttypes.h \
  /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h
--- a/src/libecoPCR/econame.P
+++ b/src/libecoPCR/econame.P
@ -0,0 +1,15 @@
 econame.o econame.P : econame.c ecoPCR.h /usr/include/stdio.h /usr/include/_types.h \
  /usr/include/sys/_types.h /usr/include/sys/cdefs.h \
  /usr/include/machine/_types.h /usr/include/i386/_types.h \
  /usr/include/inttypes.h \
  /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \
  /usr/include/string.h /usr/include/stdlib.h /usr/include/available.h \
  /usr/include/sys/wait.h /usr/include/sys/signal.h \
  /usr/include/sys/appleapiopts.h /usr/include/machine/signal.h \
  /usr/include/i386/signal.h /usr/include/i386/_structs.h \
  /usr/include/sys/_structs.h /usr/include/machine/_structs.h \
  /usr/include/mach/i386/_structs.h /usr/include/sys/resource.h \
  /usr/include/machine/endian.h /usr/include/i386/endian.h \
  /usr/include/sys/_endian.h /usr/include/libkern/_OSByteOrder.h \
  /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \
  /usr/include/machine/types.h /usr/include/i386/types.h
--- a/src/libecoPCR/ecorank.P
+++ b/src/libecoPCR/ecorank.P
@ -0,0 +1,15 @@
 ecorank.o ecorank.P : ecorank.c ecoPCR.h /usr/include/stdio.h /usr/include/_types.h \
  /usr/include/sys/_types.h /usr/include/sys/cdefs.h \
  /usr/include/machine/_types.h /usr/include/i386/_types.h \
  /usr/include/inttypes.h \
  /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \
  /usr/include/string.h /usr/include/stdlib.h /usr/include/available.h \
  /usr/include/sys/wait.h /usr/include/sys/signal.h \
  /usr/include/sys/appleapiopts.h /usr/include/machine/signal.h \
  /usr/include/i386/signal.h /usr/include/i386/_structs.h \
  /usr/include/sys/_structs.h /usr/include/machine/_structs.h \
  /usr/include/mach/i386/_structs.h /usr/include/sys/resource.h \
  /usr/include/machine/endian.h /usr/include/i386/endian.h \
  /usr/include/sys/_endian.h /usr/include/libkern/_OSByteOrder.h \
  /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \
  /usr/include/machine/types.h /usr/include/i386/types.h
--- a/src/libecoPCR/ecoseq.P
+++ b/src/libecoPCR/ecoseq.P
@ -0,0 +1,19 @@
 ecoseq.o ecoseq.P : ecoseq.c ecoPCR.h /usr/include/stdio.h /usr/include/_types.h \
  /usr/include/sys/_types.h /usr/include/sys/cdefs.h \
  /usr/include/machine/_types.h /usr/include/i386/_types.h \
  /usr/include/inttypes.h \
  /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \
  /usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \
  /usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \
  /usr/include/machine/signal.h /usr/include/i386/signal.h \
  /usr/include/i386/_structs.h /usr/include/sys/_structs.h \
  /usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \
  /usr/include/sys/resource.h /usr/include/machine/endian.h \
  /usr/include/i386/endian.h /usr/include/sys/_endian.h \
  /usr/include/libkern/_OSByteOrder.h \
  /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \
  /usr/include/machine/types.h /usr/include/i386/types.h \
  /usr/include/string.h /usr/include/zlib.h /usr/include/zconf.h \
  /usr/include/sys/types.h /usr/include/unistd.h \
  /usr/include/sys/unistd.h /usr/include/sys/select.h \
  /usr/include/sys/_select.h
--- a/src/libecoPCR/ecotax.P
+++ b/src/libecoPCR/ecotax.P
@ -0,0 +1,15 @@
 ecotax.o ecotax.P : ecotax.c ecoPCR.h /usr/include/stdio.h /usr/include/_types.h \
  /usr/include/sys/_types.h /usr/include/sys/cdefs.h \
  /usr/include/machine/_types.h /usr/include/i386/_types.h \
  /usr/include/inttypes.h \
  /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \
  /usr/include/string.h /usr/include/stdlib.h /usr/include/available.h \
  /usr/include/sys/wait.h /usr/include/sys/signal.h \
  /usr/include/sys/appleapiopts.h /usr/include/machine/signal.h \
  /usr/include/i386/signal.h /usr/include/i386/_structs.h \
  /usr/include/sys/_structs.h /usr/include/machine/_structs.h \
  /usr/include/mach/i386/_structs.h /usr/include/sys/resource.h \
  /usr/include/machine/endian.h /usr/include/i386/endian.h \
  /usr/include/sys/_endian.h /usr/include/libkern/_OSByteOrder.h \
  /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \
  /usr/include/machine/types.h /usr/include/i386/types.h
--- a/src/libecoprimer/Makefile
+++ b/src/libecoprimer/Makefile
@ -10,7 +10,9 @@ SOURCES = goodtaxon.c \
          queue.c \
          libstki.c \
          sortmatch.c \
          pairtree.c \
          pairs.c \
          taxstats.c \
          apat_search.c
 SRCS=$(SOURCES)
--- a/src/libecoprimer/aproxpattern.c
+++ b/src/libecoprimer/aproxpattern.c
@ -61,7 +61,7 @@ void encodeSequence(ecoseq_t *seq)
     for (i=0;i<seq->SQ_length;i++,data++,cseq++)
     {
-         *data = encoder[(IS_UPPER(*cseq) ? *cseq - 'A' : 'Z')];
+         *data = encoder[(IS_UPPER(*cseq) ? *cseq : 'Z') - 'A'];
     }
 }
--- a/src/libecoprimer/ecoprimer.h
+++ b/src/libecoprimer/ecoprimer.h
@ -79,28 +79,39 @@ typedef union {
 	uint32_t value;
 } poslist_t, *ppostlist_t;
-typedef struct {
+/**
-    word_t     word;
+ * primer_t structure store fuzzy match positions for a primer
-	uint32_t   *directCount;
+ * on all sequences
-	ppostlist_t directPos;
+ */
-	uint32_t   *reverseCount;
+typedef struct {
-	ppostlist_t reversePos;
+    word_t     word;              //< code for the primer
-	bool_t     good;
+	uint32_t   *directCount;      //< Occurrence count on direct strand
-	uint32_t   inexample;
+	ppostlist_t directPos;        //< list of position list on direct strand
-	uint32_t   outexample;
+
 	uint32_t   *reverseCount;     //< Occurrence count on reverse strand
 	ppostlist_t reversePos;       //< list of position list on reverse strand
 	bool_t     good;              //< primer match more than quorum example and no
 	                              //  more counterexample quorum.
 	uint32_t   inexample;         //< count of example sequences matching primer
 	uint32_t   outexample;        //< count of counterexample sequences matching primer
 } primer_t, *pprimer_t;
 /**
 * The primercount_t structure stores the fuzzy match positions of all
 * primers on all sequences, as an array of primer_t.
 */
 typedef struct {
 	pprimer_t   primers; //< array of primer_t, indexed from 0 to size-1
 	uint32_t    size;    //< number of entries in primers
 } primercount_t, *pprimercount_t;
 typedef struct {
-	word_t      word;
+	pprimer_t   primer;
 	uint32_t    position;
 	bool_t      strand;
 	bool_t     	good; /*TR: Added*/
 } primermatch_t, *pprimermatch_t;
 /*TR: Added*/
@ -109,6 +120,19 @@ typedef struct {
 	uint32_t	matchcount;
 } primermatchcount_t, *pprimermatchcount_t;
 typedef struct {
 	pecoseq_t  sequence;
 	bool_t     strand;
 	const char *amplifia;
 	int32_t	   length;
 } amplifia_t, *pamplifia_t;
 typedef struct {
 	pamplifia_t amplifias;
 	uint32_t    ampcount;
 	uint32_t	ampslot;
 } amplifiacount_t, *pamplifiacount_t;
 typedef struct {
 	char *amplifia;
 	int32_t *taxonids;
@ -124,30 +148,52 @@ typedef struct {
 } taxampset_t, *ptaxampset_t;
 typedef struct {
-	word_t 			w1;
+	pprimer_t 		p1;
-	word_t 			w2;
+	bool_t			asdirect1;
-	uint32_t   		inexample; /*inexample count*/
+	pprimer_t 		p2;
-	uint32_t   		outexample; /*outexample count*/
+	bool_t			asdirect2;
-	uint32_t 		mind;
+	amplifiacount_t pcr;
 	uint32_t		maxd;
-	uint32_t		ampsetcount;
+	uint32_t   		inexample;         //< example sequence count
-	uint32_t		ampsetindex;
+	uint32_t   		outexample;        //< counterexample sequence count
-	pampseqset_t	ampset;
+	uint32_t   		intaxa;            //< example taxa count
 	uint32_t   		outtaxa;            //< counterexample taxa count
 	uint32_t		notwellidentifiedtaxa;
 	uint32_t		taxsetcount;
 	uint32_t		taxsetindex;
 	ptaxampset_t	taxset;
-	uint32_t		oktaxoncount;
+					// these statistics are relative to inexample sequences
-} pairs_t, *ppairs_t;
+
 	uint32_t 		mind;				//< minimum distance between primers
 	uint32_t		maxd;				//< maximum distance between primers
 	uint32_t        sumd;				//< distance sum
 	float			yule;
 	float			quorumin;
 	float           quorumout;
 //
 //	uint32_t		taxsetcount;
 //	uint32_t		taxsetindex;
 //	ptaxampset_t	taxset;
 //
 //	uint32_t		oktaxoncount;
 } pair_t, *ppair_t;
 /*TR: Added*/
 typedef struct {
-	ppairs_t 	pairs;
+	size_t  	paircount;
-	uint32_t	paircount;
+	size_t      pairslots;
-}pairscount_t, *ppairscount_t;
+    void*       next;
 	pair_t 	    pairs[1];
 } pairlist_t, *ppairlist_t;
 /**
  * pairtree_t is the container for all primer pairs that is returned by
  * buildPrimerPairs and consumed by printpairs.
  */
 typedef struct {
 	ppairlist_t first;   //< first pairlist_t block; printpairs starts its walk here
 	ppairlist_t last;    //< NOTE(review): presumably the last block of the chain - confirm in pairtree.c
 	void       *tree;    //< opaque handle; NOTE(review): presumably the search tree root - confirm in pairtree.c
 	int32_t     count;   //< total number of pairs; printpairs sizes its pointer array with it
 } pairtree_t, *ppairtree_t;
 typedef struct {
 	pword_t     words;
@ -168,6 +214,18 @@ typedef struct {
 	uint32_t    size;
 } merge_t, *pmerge_t;
 typedef struct {
 	const char 		*amplifia;
 	bool_t     	strand;
 	int32_t	   	length;
 	int32_t		taxoncount;
 	void		*taxontree;
 }amptotaxon_t, *pamptotaxon_t;
 typedef struct {
 	int32_t 	taxid;
 	void		*amptree;	
 }taxontoamp_t, *ptaxontoamp_t;
 typedef struct {
 	uint32_t        lmin;                   //**< Amplifia minimal length
@ -189,6 +247,14 @@ typedef struct {
 	bool_t        no_multi_match;
 	char		  taxonrank[20];  //TR to count ranks against a pair
 	int32_t		  taxonrankidx;   //TR to count ranks against a pair
 	// Some statistics useful for options filters
 	int32_t       dbsize;
 	int32_t		  insamples;
 	int32_t		  outsamples;
 	int32_t       intaxa;
 	int32_t       outtaxa;
 } options_t, *poptions_t;
 typedef ecoseq_t  **pecodnadb_t;
@ -232,7 +298,21 @@ pprimercount_t lookforAproxPrimer(pecodnadb_t database, uint32_t seqdbsize,uint3
 void sortmatch(pprimermatch_t table,uint32_t N);
 ppairtree_t initpairtree(ppairtree_t tree);
 ppair_t pairintree (pair_t key,ppairtree_t pairlist);
 ppair_t insertpair(pair_t key,ppairtree_t list);
 /*TR: Added*/
-pairscount_t buildPrimerPairs(pecodnadb_t seqdb,uint32_t seqdbsize,pprimercount_t primers,poptions_t options);
+ppairtree_t buildPrimerPairs(pecodnadb_t seqdb,uint32_t seqdbsize,pprimercount_t primers,poptions_t options);
 int32_t counttaxon(int32_t taxid);
 int32_t getrankdbstats(pecodnadb_t seqdb,
 					   uint32_t seqdbsize,
 					   ecotaxonomy_t *taxonomy,
 					   poptions_t options);
 float taxonomycoverage(ppair_t pair, poptions_t options);
 char ecoComplementChar(char base);
 void taxonomyspecificity (ppair_t pair);
 #endif /* EPSORT_H_ */
--- a/src/libecoprimer/hashsequence.c
+++ b/src/libecoprimer/hashsequence.c
@ -201,3 +201,8 @@ uint32_t ecoFindWord(pwordcount_t table,word_t word)
 		return ~0;
 }
 /*
  * ecoComplementChar: complement of a single encoded nucleotide.
  *
  * base : encoded nucleotide; values below 4 are treated as the four
  *        2-bit nucleotide codes, anything else as "not a nucleotide"
  *
  * Returns the bitwise complement of the code restricted to its two low
  * bits (0<->3, 1<->2), or 4 when base is not below 4.
  *
  * The previous expression used the logical NOT (!base & 3), which only
  * yields 0 or 1 and maps codes 1, 2 and 3 to the same value, so it could
  * not be a complement.
  * NOTE(review): assumes complementary codes sum to 3 - confirm against
  * the encoder table used to build the words.
  */
 char ecoComplementChar(char base)
 {
 	return (base < 4)? (char)(~base & 3): 4;
 }
--- a/src/libecoprimer/pairs.c
+++ b/src/libecoprimer/pairs.c
@ -7,22 +7,28 @@
 #include  "ecoprimer.h"
 #include  <string.h>
 #include  <stdlib.h>
-primermatchcount_t buildPrimerPairsForOneSeq(uint32_t seqid,pprimercount_t primers,poptions_t options);
+static void buildPrimerPairsForOneSeq(uint32_t seqid,
 									  pecodnadb_t seqdb,
 		 							  pprimercount_t primers,
 								 	  ppairtree_t pairs,
 									  poptions_t options);
 int32_t pairinlist (ppairs_t pairlist, word_t w1, word_t w2, uint32_t size)
 {
 	uint32_t i;
 	for (i = 0; i < size; i++)
 	{
 		if (w1 == pairlist[i].w1 && w2 == pairlist[i].w2) return i;
 		if (w1 == pairlist[i].w2 && w2 == pairlist[i].w1) return i;
 	}
 	return -1;
 }
-char *addamplifiasetelem (ppairs_t pair, char* amplifia, int32_t taxid)
+
 /*************************************
 *
 *       pair collection management
 *
 *************************************/
 #ifdef MASKEDCODE
 char *addamplifiasetelem (ppair_t pair, char* amplifia, int32_t taxid)
 {
 	uint32_t i;
 	uint32_t j;
@ -79,7 +85,7 @@ char *addamplifiasetelem (ppairs_t pair, char* amplifia, int32_t taxid)
 	return ampused;
 }
-void addtaxampsetelem (ppairs_t pair, int32_t taxid, char *amplifia)
+void addtaxampsetelem (ppair_t pair, int32_t taxid, char *amplifia)
 {
 	uint32_t i;
 	uint32_t j;
@ -135,6 +141,7 @@ void addtaxampsetelem (ppairs_t pair, int32_t taxid, char *amplifia)
 char *getamplifia (pecoseq_t seq, uint32_t start, uint32_t len)
 {
 	fprintf(stderr,"start : %d length : %d\n",start,len);
 	char *amplifia = ECOMALLOC((len + 1) * sizeof(char),"Cannot allocate amplifia");
 	char *seqc = &seq->SQ[start];
@ -142,125 +149,44 @@ char *getamplifia (pecoseq_t seq, uint32_t start, uint32_t len)
 	return amplifia;
 }
 #endif
 /*TR: Added*/
-pairscount_t buildPrimerPairs(pecodnadb_t seqdb,uint32_t seqdbsize,pprimercount_t primers,poptions_t options)
+ppairtree_t buildPrimerPairs(pecodnadb_t seqdb,uint32_t seqdbsize,pprimercount_t primers,poptions_t options)
 {
 	uint32_t i;
-	uint32_t j;
+	ppairtree_t primerpairs;
 	uint32_t k;
 	uint32_t d;
 	uint32_t strt;
 	uint32_t end;
 	uint32_t paircount = 0;
 	uint32_t pairslots = 500;
 	int32_t foundindex;
 	ppairs_t pairs;
 	pairscount_t primerpairs;
 	primermatchcount_t seqmatchcount;
 	word_t w1;
 	word_t w2;
 	char *amplifia;
 	char *oldamp;
-
+	primerpairs = initpairtree(NULL);
 	pairs = ECOMALLOC(pairslots * sizeof(pairs_t),"Cannot allocate pairs table");
 	for (i=0; i < seqdbsize; i++)
 	{
-		seqmatchcount = buildPrimerPairsForOneSeq(i, primers, options);
+		buildPrimerPairsForOneSeq(i, seqdb, primers, primerpairs, options);
 		if (seqmatchcount.matchcount == 0) continue;
 		for (j=0; j < seqmatchcount.matchcount; j++)
 		{		
 			strt = 0;
 			w1 = seqmatchcount.matches[j].word;
 			/*first word should b on direct strand*/
 			if (!seqmatchcount.matches[j].strand)
 				w1 = ecoComplementWord(w1, options->primer_length);
 			else
 				strt = options->primer_length;
 			for (k=j+1; k < seqmatchcount.matchcount; k++)
 			{
 				end = 0;
 				w2 = seqmatchcount.matches[k].word;
 				/*second word should be on reverse strand*/
 				if (seqmatchcount.matches[k].strand)
 					w2 = ecoComplementWord(w2, options->primer_length);
 				else
 					end = options->primer_length;
 				if (!(seqmatchcount.matches[j].good || seqmatchcount.matches[k].good)) continue;
 				if (w1 == w2) continue;
 				d = seqmatchcount.matches[k].position - seqmatchcount.matches[j].position;
 				if (d >= options->lmin && d <= options->lmax)
 				{
 					/*get amplified string*/
 					amplifia = getamplifia (seqdb[i], seqmatchcount.matches[j].position + strt, d - strt - end);
 					foundindex = pairinlist(pairs, w1, w2, paircount);
 					if (foundindex != -1) /*pair is found*/
 					{
 						if (seqdb[i]->isexample)
 							pairs[foundindex].inexample++;
 						else
 							pairs[foundindex].outexample++;
 						if (pairs[foundindex].mind > d) pairs[foundindex].mind = d;
 						else if (pairs[foundindex].maxd < d) pairs[foundindex].maxd = d;
 						oldamp = addamplifiasetelem (&pairs[foundindex], amplifia, seqdb[i]->ranktaxonid);
 						/*if exact same string is already in amplifia set then use that for taxon set, it will help for
 						 * calculating the fully identified taxons i.e specificity, we will compare pointrs instead of strings
 						 * because same string means same pointer*/
 						if (oldamp)
 						{
 							ECOFREE (amplifia, "free amplifia");
 							amplifia = oldamp;
 						}
 						addtaxampsetelem (&pairs[foundindex], seqdb[i]->ranktaxonid, amplifia);
 						continue;
 	}
 					if (paircount == pairslots)
 					{
 						pairslots += 500;
 						pairs = ECOREALLOC(pairs, pairslots * sizeof(pairs_t), "Cannot allocate pairs table");
 					}
 					pairs[paircount].w1 = w1;
 					pairs[paircount].w2 = w2;
 					if (seqdb[i]->isexample) pairs[paircount].inexample = 1;
 					else pairs[paircount].outexample = 1;
 					pairs[paircount].mind = d;
 					pairs[paircount].maxd = d;
 					oldamp = addamplifiasetelem (&pairs[paircount], amplifia, seqdb[i]->ranktaxonid);
 					addtaxampsetelem (&pairs[paircount], seqdb[i]->ranktaxonid, amplifia);
 					paircount++;
 				}
 				else if (d > options->lmax)
 					break; /*once if the distance is greater than lmax then it will keep on increasing*/
 			}
 		}
 		ECOFREE(seqmatchcount.matches, "Cannot free matches table");
 	}
 	primerpairs.pairs = ECOREALLOC(pairs, paircount * sizeof(pairs_t), "Cannot allocate pairs table");
 	primerpairs.paircount = paircount;
 	return primerpairs;
 }
-primermatchcount_t buildPrimerPairsForOneSeq(uint32_t seqid,pprimercount_t primers,poptions_t options)
+#define DMAX (2000000000)
 /*
  * Registers in `pairs` the primer pairs found on sequence `seqid`.
  *
  * The primer matches recorded for the sequence are gathered in a temporary
  * table, sorted by position, and every couple of matches (i, j > i) is examined.
  * A couple is kept when the gap between the two matches (position difference
  * minus options->primer_length) is lower than options->lmax and greater than
  * options->lmin, and when at least one of the two primers is flagged `good`.
  * For each kept couple the pair is looked up / inserted with insertpair(), its
  * inexample/outexample counters are updated (mind/maxd/sumd only for example
  * sequences) and one amplifia entry is appended to the pair.
  *
  * `paircount` is a static local: it keeps its value between calls and is
  * copied into pairs->count when the function runs to its end.
  *
  * NOTE(review): the function is declared `static void` but its last statements
  * still fill and return `seqmatchcount`; they look like leftovers of the
  * previous signature - confirm and remove.
  * NOTE(review): no release of `matches` is visible in this function - confirm
  * who owns the table.
  */
 static void buildPrimerPairsForOneSeq(uint32_t seqid,
 									  pecodnadb_t seqdb,
 		 							  pprimercount_t primers,
 								 	  ppairtree_t pairs,
 									  poptions_t options)
 {
 	static uint32_t    paircount=0;
 	uint32_t           i,j,k;
 	uint32_t           matchcount=0;
 	pprimermatch_t     matches = NULL;
 	primermatchcount_t seqmatchcount;
-
+	ppair_t            pcurrent;
-	seqmatchcount.matchcount = 0;
+	pair_t			   current;
-	seqmatchcount.matches = NULL;
+	pprimer_t		   wswp;
 	bool_t			   bswp;
 	size_t			   distance;
 	bool_t             strand;
 	for (i=0;i < primers->size; i++)
 	{
@ -268,7 +194,9 @@ primermatchcount_t buildPrimerPairsForOneSeq(uint32_t seqid,pprimercount_t prime
 		matchcount+=primers->primers[i].reverseCount[seqid];
 	}
-	if (matchcount <= 0) return seqmatchcount;
+	if (matchcount <= 0)
 		return;
 	matches = ECOMALLOC(matchcount * sizeof(primermatch_t),"Cannot allocate primers match table");
 	for (i=0,j=0;i < primers->size; i++)
@ -277,17 +205,15 @@ primermatchcount_t buildPrimerPairsForOneSeq(uint32_t seqid,pprimercount_t prime
 		{
 			// A single match is read from `.value`, several matches from `.pointer[k]`
 			if (primers->primers[i].directCount[seqid]==1)
 			{
-				matches[j].word = primers->primers[i].word;
+				matches[j].primer = primers->primers+i;
 				matches[j].strand=TRUE;
 				matches[j].good=primers->primers[i].good;/*TR: Added*/
 				matches[j].position=primers->primers[i].directPos[seqid].value;
 				j++;
 			}
 			else for (k=0; k < primers->primers[i].directCount[seqid]; k++,j++)
 			{
-				matches[j].word = primers->primers[i].word;
+				matches[j].primer = primers->primers+i;
 				matches[j].strand=TRUE;
 				matches[j].good=primers->primers[i].good;/*TR: Added*/
 				matches[j].position=primers->primers[i].directPos[seqid].pointer[k];
 			}
 		}
@ -296,26 +222,144 @@ primermatchcount_t buildPrimerPairsForOneSeq(uint32_t seqid,pprimercount_t prime
 		{
 			if (primers->primers[i].reverseCount[seqid]==1)
 			{
-				matches[j].word = primers->primers[i].word;
+				matches[j].primer = primers->primers+i;
 				matches[j].strand=FALSE;
 				matches[j].good=primers->primers[i].good;/*TR: Added*/
 				matches[j].position=primers->primers[i].reversePos[seqid].value;
 				j++;
 			}
 			else for (k=0; k < primers->primers[i].reverseCount[seqid]; k++,j++)
 			{
-				matches[j].word = primers->primers[i].word;
+				matches[j].primer = primers->primers+i;
 				matches[j].strand=FALSE;
 				matches[j].good=primers->primers[i].good;/*TR: Added*/
 				matches[j].position=primers->primers[i].reversePos[seqid].pointer[k];
 			}
 		}
 	}
-	sortmatch(matches,matchcount); // sort in asscending order by position
+	if (matchcount>1)
 	{
 //		fprintf(stderr,"\n====================================\n");
 		sortmatch(matches,matchcount); // sort in ascending order by position
 		for (i=0; i < matchcount;i++)
 		{
 			// For all primers matching the sequence
 			// NOTE(review): `distance` is a size_t; when two matches are closer than
 			// options->primer_length the subtraction below presumably wraps around to a
 			// huge value, which fails the `< options->lmax` test and ends the scan for
 			// this i instead of skipping only that j - confirm the operand types.
 			for(j=i+1;
 			       (j<matchcount)
 			    && ((distance=matches[j].position - matches[i].position - options->primer_length) < options->lmax);
 			   j++
 			   )
 			   // For all not too far primers
 		       if ( (matches[i].primer->good || matches[j].primer->good)
 		    		&& (distance > options->lmin)
 		    		)
 		       {
 		    	   // If possible primer pair
 		    	   current.p1 = matches[i].primer;
 		    	   current.asdirect1=matches[i].strand;
 		    	   current.p2 = matches[j].primer;
 		    	   current.asdirect2= !matches[j].strand;
 		    	   // DMAX is used as the "no distance recorded yet" marker for mind/maxd
 		    	   current.maxd=DMAX;
 		    	   current.mind=DMAX;
 		    	   current.sumd=0;
 	    		   current.inexample=0;
 	    		   current.outexample=0;
 		    	   // Standardize the pair
 		    	   // `strand` is TRUE when p2->word is greater than p1->word; otherwise
 		    	   // the two primers (and their direction flags) are swapped
 	    		   strand = current.p2->word > current.p1->word;
 		    	   if (!strand)
 		    	   {
 		    		   wswp = current.p1;
 		    		   current.p1=current.p2;
 		    		   current.p2=wswp;
 		    		   bswp = current.asdirect1;
 		    		   current.asdirect1=current.asdirect2;
 		    		   current.asdirect2=bswp;
 		    	   }
 		    	   // Look for the new pair in already seen pairs
 		    	   pcurrent = insertpair(current,pairs);
 		    	   if (seqdb[seqid]->isexample)
 				   {
 		    		   pcurrent->inexample++;
 		    		   pcurrent->sumd+=distance;
 		    		   if ((pcurrent->maxd==DMAX) || (distance > pcurrent->maxd))
 			    		   pcurrent->maxd = distance;
 			    	   if (distance < pcurrent->mind)
 			    		   pcurrent->mind = distance;
 		    	   }
 		    	   else
 		    		   pcurrent->outexample++;
 		    	   // A counter total of 1 is taken to mean the pair was seen for the first
 		    	   // time: its amplifia table is then created, otherwise grown when full
 		    	   if ((pcurrent->outexample+pcurrent->inexample)==1)
 		    	   {
 					   paircount++;
 					   pcurrent->pcr.ampslot=200;
 					   pcurrent->pcr.ampcount=0;
 					   pcurrent->pcr.amplifias = ECOMALLOC(sizeof(amplifia_t)*pcurrent->pcr.ampslot,
 							                               "Cannot allocate amplifia table");
 		    	   }
 		    	   else
 		    	   {
 		    		   if (pcurrent->pcr.ampslot==pcurrent->pcr.ampcount)
 		    		   {
 		    			   pcurrent->pcr.ampslot+=200;
 		    			   pcurrent->pcr.amplifias = ECOREALLOC(pcurrent->pcr.amplifias,
 															    sizeof(amplifia_t)*pcurrent->pcr.ampslot,
 		    			   							            "Cannot allocate amplifia table");
 		    		   }
 		    	   }
 		    	   // Record the amplifia: its length, source sequence, strand and a pointer
 		    	   // into the sequence data (SQ) chosen according to `strand`
 		    	   pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].length=distance;
 		    	   pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].sequence=seqdb[seqid];
 		    	   pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].strand=strand;
 		    	   if (strand)
 		    	   	   pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].amplifia=  seqdb[seqid]->SQ + matches[i].position + options->primer_length;
 		    	   else
 		    	   	   pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].amplifia=  seqdb[seqid]->SQ + matches[j].position - 1 ;
 		    	   pcurrent->pcr.ampcount++;
 //		    	   fprintf(stderr,"%c%c W1 : %s   direct : %c",
 //		    			   "bG"[(int)pcurrent->p1->good],
 //		    			   "bG"[(int)pcurrent->p2->good],
 //		    			   ecoUnhashWord(pcurrent->p1->word, options->primer_length),
 //		    			   "><"[(int)pcurrent->asdirect1]
 //		    			   );
 //
 //		    	   fprintf(stderr,"   W2 : %s   direct : %c distance : %d (min/max/avg : %d/%d/%f) in/out: %d/%d %c (%d pairs)\n",
 //		    			   ecoUnhashWord(pcurrent->p2->word, options->primer_length),
 //		    			   "><"[(int)pcurrent->asdirect2],
 //		    			   distance,
 //		    			   pcurrent->mind,pcurrent->maxd,
 //		    			   (pcurrent->inexample) ? (float)pcurrent->sumd/pcurrent->inexample:0.0,
 //		    			   pcurrent->inexample,pcurrent->outexample,
 //		    			   " N"[(pcurrent->outexample+pcurrent->inexample)==1],
 //		    			   paircount
 //
 //		    			   );
 //
 		       }
 		 }
 	}
   pairs->count=paircount;
 	/*TR: Added*/
 	// NOTE(review): value returned from a function declared `static void` - confirm
 	seqmatchcount.matches = matches;
 	seqmatchcount.matchcount = matchcount;
 	return seqmatchcount;
 }
--- a/tools/ecoPCRFormat.py
+++ b/tools/ecoPCRFormat.py
@ -0,0 +1,651 @@
 #!/usr/bin/env python
 import re
 import gzip
 import struct
 import sys
 import time
 import getopt
 try:
    import psycopg2
    _dbenable=True
 except ImportError:
    _dbenable=False
 #####
 #
 #
 # Generic file function
 #
 #
 #####
 def universalOpen(file):
    """Return a readable stream for `file`.

    A string is treated as a path: names ending in '.gz' are opened with
    gzip.open, any other name with the builtin open. Anything that is not a
    string is returned unchanged (it is taken to be an already opened stream).
    """
    if not isinstance(file,str):
        return file
    if file.endswith('.gz'):
        return gzip.open(file)
    return open(file)
 def universalTell(file):
    """Return the current position of `file`.

    For a gzip.GzipFile the position is read from its `myfileobj` attribute
    rather than from the GzipFile itself.
    """
    target = file.myfileobj if isinstance(file, gzip.GzipFile) else file
    return target.tell()
 def fileSize(file):
    """Return the length of `file` and leave its read position untouched.

    For a gzip.GzipFile the measure is taken on its `myfileobj` attribute.
    """
    raw = file
    if isinstance(raw, gzip.GzipFile):
        raw = raw.myfileobj
    saved = raw.tell()
    # whence=2: seek relative to the end of the file
    raw.seek(0,2)
    size = raw.tell()
    # whence=0: come back to the saved absolute position
    raw.seek(saved,0)
    return size
 def progressBar(pos,max,reset=False,delta=[]):
    """Draw a one-line progress bar on stderr.

    pos   -- current position (an integer; also drives the spinner character)
    max   -- position corresponding to 100 %
    reset -- when true, restart the internal clock
    delta -- internal [start time, last time] state; the mutable default is
             intentional, it keeps the clock between calls

    A zero `max` is reported as 100 % and a zero percentage gives a remaining
    time of 00:00:00, instead of raising ZeroDivisionError.
    """
    if reset:
        del delta[:]
    if not delta:
        delta.append(time.time())
        delta.append(time.time())
    delta[1]=time.time()
    elapsed = delta[1]-delta[0]
    # An empty input has max == 0: nothing is left to do
    if max:
        percent = float(pos)/max * 100
    else:
        percent = 100.0
    # No estimate is possible before any progress has been made
    if percent > 0:
        remaining = elapsed / percent * (100-percent)
    else:
        remaining = 0
    if remaining < 0:
        remaining = 0
    remain = time.strftime('%H:%M:%S',time.gmtime(remaining))
    bar = '#' * int(percent/2)
    bar+= '|/-\\-'[pos % 5]
    bar+= ' ' * (50 - int(percent/2))
    sys.stderr.write('\r%5.1f %% |%s] remain : %s' %(percent,bar,remain))
 #####
 #
 #
 # NCBI Dump Taxonomy reader
 #
 #
 #####
 def endLessIterator(endedlist):
    """Yield every item of `endedlist`, then its last item forever.

    `endedlist` must support negative indexing; an empty sequence fails with
    IndexError as soon as the first value is requested.
    """
    for item in endedlist:
        yield item
    while True:
        yield endedlist[-1]
 class ColumnFile(object):
    """Iterator returning, for each line of a text stream, the list of its fields.

    stream -- a file name (opened for reading) or any iterator over lines;
              iterators exposing either `next` or `__next__` are accepted
    sep    -- field delimiter handed to str.split (None: any whitespace)
    strip  -- strip surrounding whitespace from every field
    types  -- optional sequence of converters applied field by field; when a
              line has more fields than converters, the last converter is
              reused. `bool` is replaced by ColumnFile.str2bool.
    """
    def __init__(self,stream,sep=None,strip=True,types=None):
        if isinstance(stream,str):
            self._stream = open(stream)
        elif hasattr(stream,'next') or hasattr(stream,'__next__'):
            self._stream = stream
        else:
            raise ValueError('stream must be string or an iterator')
        self._delimiter=sep
        self._strip=strip
        if types:
            self._types=[]
            for converter in types:
                if converter is bool:
                    converter = ColumnFile.str2bool
                self._types.append(converter)
        else:
            self._types=None
    def str2bool(x):
        """Convert a text flag to a boolean from its first character.

        'T'/'V' give True, 'F' gives False; digits are evaluated as numbers.
        """
        # NOTE(review): eval() is applied to text coming from the input file;
        # only the first character is evaluated, but a lookup table would be
        # safer if the accepted values can be enumerated.
        return bool(eval(x.strip()[0].upper(),{'T':True,'V':True,'F':False}))
    str2bool = staticmethod(str2bool)
    def __iter__(self):
        return self
    def next(self):
        """Return the fields of the next line; StopIteration ends the iteration."""
        # Support both iterator protocols without relying on the next() builtin
        advance = getattr(self._stream,'next',None)
        if advance is None:
            advance = self._stream.__next__
        ligne = advance()
        data = ligne.split(self._delimiter)
        if self._strip or self._types:
            data = [x.strip() for x in data]
        if self._types:
            # Fields beyond the declared converters reuse the last converter
            last = len(self._types) - 1
            data = [self._types[min(i,last)](x) for i,x in enumerate(data)]
        return data
    __next__ = next
 def taxonCmp(t1,t2):
    """Three-way comparison of two taxon records on their first item.

    Returns -1, 0 or +1, as expected by the `cmp` argument of list.sort.
    """
    left = t1[0]
    right = t2[0]
    return (left > right) - (left < right)
 def bsearchTaxon(taxonomy,taxid):
    """Binary search of `taxid` in `taxonomy`.

    taxonomy -- sequence of records sorted in ascending order on their first
                item (the taxid)
    taxid    -- identifier to look for

    Returns the position of the record whose first item equals `taxid`, or
    None when there is no such record (including when `taxonomy` is empty).
    """
    low = 0
    high = len(taxonomy)
    # Invariant: every record before `low` is smaller than taxid and every
    # record from `high` on is greater than or equal to it.
    while low < high:
        middle = (low + high) // 2
        if taxonomy[middle][0] < taxid:
            low = middle + 1
        else:
            high = middle
    if low < len(taxonomy) and taxonomy[low][0]==taxid:
        return low
    return None
 def readNodeTable(file):
    """Load a taxonomy node table (a '|' separated file).

    Returns a tuple (taxonomy, ranks, index):
      taxonomy -- list of [taxid, rank number, position of the parent],
                  sorted by taxid
      ranks    -- dict mapping each rank name to its number (alphabetical order)
      index    -- dict mapping each taxid to its position in `taxonomy`
    """
    nodes = ColumnFile(universalOpen(file), 
                       sep='|', 
                       types=(int,int,str,
                              str,str,bool,
                              int,bool,int,
                              bool,bool,bool,str))
    print >>sys.stderr,"Reading taxonomy dump file..."
    # Columns are reordered: field 0 (taxid), field 2 (rank), field 1 (parent)
    taxonomy=[[row[0],row[2],row[1]] for row in nodes]
    print >>sys.stderr,"List all taxonomy rank..."    
    rankNames = sorted(set(taxon[1] for taxon in taxonomy))
    ranks = dict((rankName,position) for position,rankName in enumerate(rankNames))
    print >>sys.stderr,"Sorting taxons..."
    taxonomy.sort(taxonCmp)
    print >>sys.stderr,"Indexing taxonomy..."
    index = {}
    for taxon in taxonomy:
        taxid = taxon[0]
        index[taxid]=bsearchTaxon(taxonomy, taxid)
    print >>sys.stderr,"Indexing parent and rank..."
    # Done in a second pass: the index must be complete before parents are resolved
    for taxon in taxonomy:
        taxon[1]=ranks[taxon[1]]
        taxon[2]=index[taxon[2]]
    return taxonomy,ranks,index
 def nameIterator(file):
    """Iterate over a '|' separated name table.

    Every line must split into exactly five fields; yields
    (taxid, name, classname) tuples built from fields 0, 1 and 3.
    """
    names = ColumnFile(universalOpen(file), 
                       sep='|', 
                       types=(int,str,
                              str,str))
    for taxid,name,_unique,classname,_trailing in names:
        yield taxid,name,classname
 def mergedNodeIterator(file):
    """Iterate over a '|' separated table of merged taxids.

    Every line must split into exactly three fields; yields
    (taxid, current) tuples built from the first two.
    """
    merged = ColumnFile(universalOpen(file), 
                       sep='|', 
                       types=(int,int,str))
    for taxid,current,_trailing in merged:
        yield taxid,current
 def deletedNodeIterator(file):
    """Iterate over a '|' separated table of deleted taxids.

    Every line must split into exactly two fields; yields the first one.
    """
    deleted = ColumnFile(universalOpen(file), 
                       sep='|', 
                       types=(int,str))
    for taxid,_trailing in deleted:
        yield taxid
 def readTaxonomyDump(taxdir):
    """Load a taxonomy from the dump files found in directory `taxdir`.

    Reads nodes.dmp, names.dmp, merged.dmp and delnodes.dmp and returns a tuple
    (taxonomy, ranks, alternativeName, index):
      taxonomy        -- taxon records; the scientific name is appended to each
      ranks           -- dict rank name -> rank number
      alternativeName -- list of (name, class name, taxon position)
      index           -- dict taxid -> taxon position; merged taxids point to
                         their current taxon, deleted taxids map to None
    """
    taxonomy,ranks,index = readNodeTable('%s/nodes.dmp' % taxdir)
    print >>sys.stderr,"Adding scientific name..."
    alternativeName=[]
    for taxid,name,classname in nameIterator('%s/names.dmp' % taxdir):
        position = index[taxid]
        alternativeName.append((name,classname,position))
        if classname == 'scientific name':
            taxonomy[position].append(name)
    print >>sys.stderr,"Adding taxid alias..."
    for oldTaxid,current in mergedNodeIterator('%s/merged.dmp' % taxdir):
        index[oldTaxid]=index[current]
    print >>sys.stderr,"Adding deleted taxid..."
    for deletedTaxid in deletedNodeIterator('%s/delnodes.dmp' % taxdir):
        index[deletedTaxid]=None
    return taxonomy,ranks,alternativeName,index
 def readTaxonomyDB(dbname):
    connection = psycopg2.connect(database=dbname)
    cursor = connection.cursor()
    cursor.execute("select numid,rank,parent from ncbi_taxonomy.taxon")
    taxonomy=[list(x) for x in cursor]
    cursor.execute("select rank_class from ncbi_taxonomy.taxon_rank_class order by rank_class")
    ranks=cursor.fetchall()
    ranks = dict(map(None,(x[0] for x in ranks),xrange(len(ranks))))
    print >>sys.stderr,"Sorting taxons..."
    taxonomy.sort(taxonCmp)
    print >>sys.stderr,"Indexing taxonomy..."
    index = {}
    for t in taxonomy:
        index[t[0]]=bsearchTaxon(taxonomy, t[0])
    print >>sys.stderr,"Indexing parent and rank..."
    for t in taxonomy:
        t[1]=ranks[t[1]]
        try:
            t[2]=index[t[2]]
        except KeyError,e:
            if t[2] is None and t[0]==1:
                t[2]=index[t[0]]
            else:
                raise e
    cursor.execute("select taxid,name,category from ncbi_taxonomy.name")
    alternativeName=[]
    for taxid,name,classname in cursor:
        alternativeName.append((name,classname,index[taxid]))
        if classname == 'scientific name':
            taxonomy[index[taxid]].append(name)
    cursor.execute("select old_numid,current_numid from ncbi_taxonomy.taxon_id_alias")
    print >>sys.stderr,"Adding taxid alias..."
    for taxid,current in cursor:
        if current is not None:
            index[taxid]=index[current]
        else:
            index[taxid]=None
    return taxonomy,ranks,alternativeName,index
 #####
 #
 #
 #  Genbank/EMBL sequence reader
 #
 #
 #####
 def entryIterator(file):
    """Yield the entries of a flat file as strings.

    Lines are accumulated until a line equal to '//\\n' is met; the accumulated
    text, terminator included, is then yielded. Lines following the last
    terminator are not yielded.
    """
    pending = []
    for line in universalOpen(file):
        pending.append(line)
        if line != '//\n':
            continue
        yield ''.join(pending)
        pending = []
 def fastaEntryIterator(file):
    """Yield the entries of a fasta file as strings.

    A new entry starts on each line whose first character is '>'; the last
    entry is yielded when the input is exhausted.
    """
    pending = []
    for line in universalOpen(file):
        startsEntry = line[0] == '>'
        if startsEntry and pending:
            yield ''.join(pending)
            pending = []
        pending.append(line)
    if pending:
        yield ''.join(pending)
 # Runs of spaces, newlines and digits found in a formatted sequence block
 _cleanSeq = re.compile('[ \n0-9]+')
 def cleanSeq(seq):
    """Return `seq` without its spaces, newlines and digits."""
    return ''.join(_cleanSeq.split(seq))
 # Word following 'LOCUS' and seven spaces at the start of a line
 _gbParseID = re.compile('(?<=^LOCUS {7})[^ ]+(?= )',re.MULTILINE)
 # Text following 'DEFINITION' and two spaces, up to a '.' ending a line
 _gbParseDE = re.compile('(?<=^DEFINITION {2}).+?\. *$(?=[^ ])',re.MULTILINE+re.DOTALL)
 # Everything between the 'ORIGIN' line and the '//' terminator line
 _gbParseSQ = re.compile('(?<=^ORIGIN).+?(?=^//$)',re.MULTILINE+re.DOTALL)
 # Digits of a /db_xref="taxon:NNN" qualifier
 _gbParseTX = re.compile('(?<= /db_xref="taxon:)[0-9]+(?=")')
 def genbankEntryParser(entry):
    """Extract id, taxid, definition and sequence from one GenBank entry.

    Returns a dict with keys 'id', 'taxid', 'definition' and 'sequence'.
    'taxid' is None when the entry has no taxon cross reference; the sequence
    is upper-cased and the definition is collapsed on a single line. A missing
    LOCUS, DEFINITION or ORIGIN section raises IndexError.
    """
    identifier = _gbParseID.findall(entry)[0]
    rawDefinition = _gbParseDE.findall(entry)[0]
    definition = ' '.join(rawDefinition.split())
    rawSequence = _gbParseSQ.findall(entry)[0]
    sequence = cleanSeq(rawSequence.upper())
    taxonRefs = _gbParseTX.findall(entry)
    try:
        taxid = int(taxonRefs[0])
    except IndexError:
        taxid = None
    return {'id':identifier,'taxid':taxid,'definition':definition,'sequence':sequence}
 ######################
 # A line break, together with the 'DE   ' prefix of the continuation line
 # that follows it when there is one
 _cleanDef = re.compile('\n(?:DE {3})?')
 def cleanDef(definition):
    """Join a multi-line EMBL definition on a single line.

    Every line break, with the 'DE   ' prefix of the following continuation
    line, is replaced by one space. The letters D and E of the definition
    text itself are left untouched.
    """
    return _cleanDef.sub(' ',definition)
 # Word following 'ID' and three spaces at the start of a line, up to a ';'
 _emblParseID = re.compile('(?<=^ID {3})[^ ]+(?=;)',re.MULTILINE)
 # Text following 'DE' and three spaces, up to a '.' ending a line
 _emblParseDE = re.compile('(?<=^DE {3}).+?\. *$(?=[^ ])',re.MULTILINE+re.DOTALL)
 # Everything from the first line starting with two spaces to the '//' line
 _emblParseSQ = re.compile('(?<=^  ).+?(?=^//$)',re.MULTILINE+re.DOTALL)
 # Digits of a /db_xref="taxon:NNN" qualifier
 _emblParseTX = re.compile('(?<= /db_xref="taxon:)[0-9]+(?=")')
 def emblEntryParser(entry):
    """Extract id, taxid, definition and sequence from one EMBL entry.

    Returns a dict with keys 'id', 'taxid', 'definition' and 'sequence'.
    'taxid' is None when the entry has no taxon cross reference; the sequence
    is upper-cased and the definition is collapsed on a single line. A missing
    ID, DE or sequence section raises IndexError.
    """
    identifier = _emblParseID.findall(entry)[0]
    rawDefinition = _emblParseDE.findall(entry)[0]
    definition = ' '.join(cleanDef(rawDefinition).split())
    rawSequence = _emblParseSQ.findall(entry)[0]
    sequence = cleanSeq(rawSequence.upper())
    taxonRefs = _emblParseTX.findall(entry)
    try:
        taxid = int(taxonRefs[0])
    except IndexError:
        taxid = None
    return {'id':identifier,'taxid':taxid,'definition':definition,'sequence':sequence}
 ######################
 # Separator of the title annotations: a ';' and the non-word characters after it
 _fastaSplit=re.compile(';\W*')
 def parseFasta(seq):
    """Split one fasta entry into (id, sequence, definition, info).

    The first word of the title line (without its leading character) is the
    id. The rest of the title is cut on ';': items holding an '=' fill the
    `info` dict (key=value), the others are joined with spaces to make the
    definition. The sequence lines are stripped, joined and upper-cased.
    """
    lines = seq.split('\n')
    title = lines[0].strip()[1:].split(None,1)
    identifier = title[0]
    annotations = []
    if len(title) == 2:
        annotations = _fastaSplit.split(title[1])
    info = {}
    words = []
    for item in annotations:
        if '=' in item:
            key,value = item.split('=',1)
            info[key] = value
        else:
            words.append(item)
    sequence = ''.join([chunk.strip() for chunk in lines[1:]]).upper()
    return identifier,sequence,' '.join(words),info
 def fastaEntryParser(entry):
    """Turn one fasta entry into a sequence record.

    Returns a dict with keys 'id', 'taxid', 'definition' and 'sequence'; the
    taxid comes from a 'taxid=...' annotation of the title line, converted to
    int, and is None when that annotation is absent.
    """
    identifier,sequence,definition,info = parseFasta(entry)
    taxid = info.get('taxid',None)
    if taxid is not None:
        taxid = int(taxid)
    return {'id':identifier,'taxid':taxid,'definition':definition,'sequence':sequence}
 def sequenceIteratorFactory(entryParser,entryIterator):
    """Combine an entry iterator and an entry parser.

    Returns a generator function taking a file: it yields
    entryParser(entry) for every entry produced by entryIterator(file).
    """
    def sequenceIterator(file):
        for rawEntry in entryIterator(file):
            parsed = entryParser(rawEntry)
            yield parsed
    return sequenceIterator
 def taxonomyInfo(entry,connection):
    """Complete `entry` with taxonomic information read from the database.

    Runs one query for entry['taxid'] on `connection` and stores the eight
    columns of the first returned row in `entry` under the keys
    current_taxid, species, genus, family, scientific_name, species_sn,
    genus_sn and family_sn. Returns the same `entry` dict.
    """
    taxid = entry['taxid']
    curseur = connection.cursor()
    curseur.execute("""
                        select taxid,species,genus,family,
                               taxonomy.scientificName(taxid) as sn,
                               taxonomy.scientificName(species) as species_sn,
                               taxonomy.scientificName(genus) as genus_sn,
                               taxonomy.scientificName(family) as family_sn
                        from
                            (   
                             select alias                      as taxid,
                               taxonomy.getSpecies(alias) as species,
                               taxonomy.getGenus(alias)   as genus,
                               taxonomy.getFamily(alias)  as family
                                from taxonomy.aliases
                               where id=%d ) as tax
                    """ % taxid)
    rep = curseur.fetchone()
    # Keys listed in the order of the selected columns
    columns = ('current_taxid','species','genus','family',
               'scientific_name','species_sn','genus_sn','family_sn')
    for position,key in enumerate(columns):
        entry[key]=rep[position]
    return entry
 #####
 #
 #
 # Binary writer
 #
 #
 #####
 def ecoSeqPacker(sq):
    """Serialize one sequence record as a big-endian binary block.

    Layout: record size (not counting this first field), taxid, id as a
    20 byte string, definition length, sequence length, compressed sequence
    length, definition, zlib-compressed sequence (level 9).
    """
    sequence = sq['sequence']
    definition = sq['definition']
    compactseq = gzip.zlib.compress(sequence,9)
    delength = len(definition)
    cptseqlength = len(compactseq)
    # taxid + id + three length fields + both variable parts
    totalSize = 4 + 20 + 4 + 4 + 4 + cptseqlength + delength
    layout = '> I I 20s I I I %ds %ds' % (delength,cptseqlength)
    packed = struct.pack(layout,
                         totalSize,
                         sq['taxid'],
                         sq['id'],
                         delength,
                         len(sequence),
                         cptseqlength,
                         definition,
                         compactseq)
    assert len(packed) == totalSize+4, "error in sequence packing"
    return packed
 def ecoTaxPacker(tx):
    """Serialize one taxon record as a big-endian binary block.

    Layout: record size (not counting this first field), tx[0], tx[1], tx[2],
    name length, name (tx[3]).
    """
    name = tx[3]
    namelength = len(name)
    totalSize = 4 + 4 + 4 + 4 + namelength
    layout = '> I I I I I %ds' % namelength
    return struct.pack(layout, totalSize, tx[0], tx[1], tx[2], namelength, name)
 def ecoRankPacker(rank):
    """Serialize a rank name: its length (big-endian unsigned int), then the name."""
    size = len(rank)
    return struct.pack('> I %ds' % size, size, rank)
 def ecoNamePacker(name):
    """Serialize one (name, class name, taxon position) record.

    Layout (big-endian): record size (not counting this first field), a flag
    set to 1 when the class name equals 'scientific name', name length, class
    name length, taxon position, name, class name.
    """
    label,classname,taxon = name[0],name[1],name[2]
    namelength = len(label)
    classlength = len(classname)
    totalSize =  namelength + classlength + 4 + 4 + 4 + 4
    isScientific = int(classname=='scientific name')
    layout = '> I I I I I %ds %ds' % (namelength,classlength)
    return struct.pack(layout,
                       totalSize,
                       isScientific,
                       namelength,
                       classlength,
                       taxon,
                       label,
                       classname)
 def ecoSeqWriter(file,input,taxindex,parser):
    """Write the sequences read from `input` to the binary file `file`.

    file     -- name of the output file
    input    -- name of the sequence file, or an open stream
    taxindex -- dict mapping a taxid to a taxon position
    parser   -- callable returning an iterator of sequence records for `input`

    Entries without taxid, with a taxid missing from `taxindex`, or whose
    taxid maps to None are not written. The sequence count stored at the
    start of the file is rewritten once all the entries have been processed.
    Returns the list of the ids of the skipped entries.
    """
    output = open(file,'wb')
    input  = universalOpen(input)
    inputsize = fileSize(input)
    entries = parser(input)
    seqcount=0
    skipped = []
    # Placeholder, overwritten with the real count at the end
    output.write(struct.pack('> I',seqcount))
    progressBar(1, inputsize,reset=True)
    for entry in entries:
        if entry['taxid'] is None:
            skipped.append(entry['id'])
            continue
        try:
            entry['taxid']=taxindex[entry['taxid']]
        except KeyError:
            entry['taxid']=None
        if entry['taxid'] is None:
            skipped.append(entry['id'])
        else:
            seqcount+=1
            output.write(ecoSeqPacker(entry))
        where = universalTell(input)
        progressBar(where, inputsize)
        print >>sys.stderr," Readed sequences : %d     " % seqcount,
    print >>sys.stderr
    output.seek(0,0)
    output.write(struct.pack('> I',seqcount))
    output.close()
    return skipped
 def ecoTaxWriter(file,taxonomy):
    """Write the taxon count followed by every packed taxon to `file`."""
    output = open(file,'wb')
    header = struct.pack('> I',len(taxonomy))
    output.write(header)
    for taxon in taxonomy:
        record = ecoTaxPacker(taxon)
        output.write(record)
    output.close()
 def ecoRankWriter(file,ranks):
    """Write the rank count followed by the packed rank names, sorted, to `file`."""
    output = open(file,'wb')
    header = struct.pack('> I',len(ranks))
    output.write(header)
    for rankName in sorted(ranks.keys()):
        output.write(ecoRankPacker(rankName))
    output.close()
 def nameCmp(n1,n2):
    """Case-insensitive three-way comparison of two name records on their first item.

    Returns -1, 0 or 1, as expected by the `cmp` argument of list.sort.
    """
    left = n1[0].upper()
    right = n2[0].upper()
    return (left > right) - (left < right)
 def ecoNameWriter(file,names):
    """Write the name count followed by every packed name to `file`.

    `names` is sorted in place (case-insensitive order on the name) before
    being written.
    """
    output = open(file,'wb')
    header = struct.pack('> I',len(names))
    output.write(header)
    names.sort(nameCmp)
    for record in names:
        output.write(ecoNamePacker(record))
    output.close()
 def ecoDBWriter(prefix,taxonomy,seqFileNames,parser):
    """Write the whole database: <prefix>.rdx, .tdx, .ndx and one .sdx per input file.

    prefix       -- common path prefix of the produced files
    taxonomy     -- (taxons, ranks, names, index) tuple
    seqFileNames -- sequence files; file number N is written to
                    <prefix>_NNN.sdx, N starting at 1
    parser       -- sequence iterator factory result used to read each file

    The ids of the entries skipped in a file are reported on stderr.
    """
    taxons = taxonomy[0]
    ranks = taxonomy[1]
    ecoRankWriter('%s.rdx' % prefix, ranks)
    ecoTaxWriter('%s.tdx' % prefix, taxons)
    ecoNameWriter('%s.ndx' % prefix, taxonomy[2])
    for position,filename in enumerate(seqFileNames):
        target = '%s_%03d.sdx' % (prefix,position+1)
        sk=ecoSeqWriter(target, filename, taxonomy[3], parser)
        if sk:
            print >>sys.stderr,"Skipped entry :"
            print >>sys.stderr,sk
 def ecoParseOptions(arguments):
    """Parse the command line arguments.

    Returns (opt, filenames) where `opt` is a dict with the keys:
      prefix -- prefix of the produced files (default 'ecodb')
      taxmod -- 'dump' (taxonomy read from dump files, the default) or 'db'
      taxdir -- directory of the taxonomy dump (default 'taxdump')
      taxdb  -- database name, present only when -T is given
      parser -- sequence iterator matching the input format (GenBank by default)
    and `filenames` is the list of remaining arguments.

    -h prints the help and exits. Invalid options raise getopt.GetoptError.
    """
    opt = {
            'prefix' : 'ecodb',
            # Default matching the default 'taxdir': without it, running
            # with neither -t nor -T ends in a KeyError on opt['taxmod']
            'taxmod' : 'dump',
            'taxdir' : 'taxdump',
            'parser' : sequenceIteratorFactory(genbankEntryParser,
                                                  entryIterator)
           }
    o,filenames = getopt.getopt(arguments,
                                'ht:T:n:gfe',
                                ['help',
                                 'taxonomy=',
                                 'taxonomy_db=',
                                 'name=',
                                 'genbank',
                                 'fasta',
                                 'embl'])
    for name,value in o:
        if name in ('-h','--help'):
            printHelp()
            # sys.exit rather than the site-provided exit() helper
            sys.exit()
        elif name in ('-t','--taxonomy'):
            opt['taxmod']='dump'
            opt['taxdir']=value
        elif name in ('-T','--taxonomy_db'):
            opt['taxmod']='db'
            opt['taxdb']=value
        elif name in ('-n','--name'):
            opt['prefix']=value
        elif name in ('-g','--genbank'):
            opt['parser']=sequenceIteratorFactory(genbankEntryParser,
                                                  entryIterator)
        elif name in ('-f','--fasta'):
            opt['parser']=sequenceIteratorFactory(fastaEntryParser,
                                                  fastaEntryIterator)
        elif name in ('-e','--embl'):
            opt['parser']=sequenceIteratorFactory(emblEntryParser,
                                                  entryIterator)
        else:
            raise ValueError('Unknown option %s' % name)
    return opt,filenames
 def printHelp():
    """Print the command line help on standard output.

    Lists every option accepted by ecoParseOptions, including
    -T/--taxonomy_db.
    """
    lines = [
        "-----------------------------------",
        " ecoPCRFormat.py",
        "-----------------------------------",
        "ecoPCRFormat.py [option] <argument>",
        "-----------------------------------",
        "-e    --embl        :[E]mbl format",
        "-f    --fasta       :[F]asta format",
        "-g    --genbank     :[G]enbank format",
        "-h    --help        :[H]elp - print this help",
        "-n    --name        :[N]ame of the new database created",
        "-t    --taxonomy    :[T]axonomy - path to the taxonomy database",
        "                    :bcp-like dump from GenBank taxonomy database.",
        "-T    --taxonomy_db :[T]axonomy - name of the PostgreSQL database",
        "                    :holding the taxonomy (requires psycopg2).",
        "-----------------------------------",
    ]
    sys.stdout.write('\n'.join(lines) + '\n')
 if __name__ == '__main__':
    # Build an ecoPCR database from the sequence files given on the command line
    opt,filenames = ecoParseOptions(sys.argv[1:])
    # The taxonomy is read from the dump directory unless a database was
    # requested with -T; a missing 'taxmod' falls back to the dump files
    # instead of failing with a KeyError
    if opt.get('taxmod','dump')=='db':
        taxonomy = readTaxonomyDB(opt['taxdb'])
    else:
        taxonomy = readTaxonomyDump(opt['taxdir'])
    ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser'])