New version 0.3 with filtering on short words

git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@213 60f365c0-8329-0410-b2a4-ec073aeeaa1d
2009-05-13 06:51:25 +00:00
parent 5dc55c7f53
commit b7c1640042
12 changed files with 330 additions and 34 deletions
--- a/2
+++ b/2
@ -1 +1 @@
-0.2
+0.3
--- a/src/ecoPrimer
+++ b/src/ecoPrimer
--- a/src/ecoprimer.c
+++ b/src/ecoprimer.c
@ -14,7 +14,7 @@
 #include <time.h>
 #include <sys/time.h>

-#define VERSION "0.2"
+#define VERSION "0.3"
  /* TR: by default, statistics are made on species level*/
 #define DEFAULTTAXONRANK "species"

@ -98,6 +98,7 @@ static void ExitUsage(int stat)
 void initoptions(poptions_t options)
 {
 	options->statistics=FALSE;
+	options->filtering=TRUE;
 	options->lmin=0;                   //< Amplifia minimal length
 	options->lmax=1000;                   //< Amplifia maximal length
 	options->error_max=3;              //**< maximum error count in fuzzy search
@ -432,6 +433,7 @@ int main(int argc, char **argv)
 	uint32_t        i;

 	pwordcount_t    words;
+//	pwordcount_t    words2;
 	pprimercount_t  primers;
 	ppairtree_t		pairs;

@ -442,7 +444,7 @@ int main(int argc, char **argv)

 	initoptions(&options);

-    while ((carg = getopt(argc, argv, "hvcUDSd:l:L:e:i:r:q:3:s:x:t:O:")) != -1) {
+    while ((carg = getopt(argc, argv, "hfvcUDSd:l:L:e:i:r:q:3:s:x:t:O:")) != -1) {

     switch (carg) {
 								 /* ---------------------------- */
@ -451,6 +453,12 @@ int main(int argc, char **argv)
 			options.statistics=TRUE;
 			break;

+			 /* ---------------------------- */
+		 case 'f':               /* disable short word filtering */
+			 /* ---------------------------- */
+			 options.filtering=FALSE;
+			 break;
+
                                /* -------------------- */
        case 'd':               /* database name        */
                                /* -------------------- */
@ -599,12 +607,20 @@ int main(int argc, char **argv)

    fprintf(stderr,"\nIndexing words in sequences\n");

-    printcurrenttimeinmilli();
    words = lookforStrictPrimer(seqdb,seqdbsize,insamples,&options);
-    printcurrenttimeinmilli();
-
    fprintf(stderr,"\n  Strict primer count : %d\n",words->size);

+//    options.filtering=FALSE;
+//    words2= lookforStrictPrimer(seqdb,seqdbsize,insamples,&options);
+//    fprintf(stderr,"\n  Strict primer count : %d\n",words2->size);
+//
+//    fprintf(stderr,"\n\n  Primer sample : \n");
+//    for (i=0; i<words->size; i++)
+//    	fprintf(stderr,"  + Primer : %s   sequence count : %d\n",ecoUnhashWord(words->words[i],options.primer_length),words->strictcount[i]);
+//    fprintf(stderr,"\n\n  Primer sample : \n");
+//    for (i=0; i<words2->size; i++)
+//    	fprintf(stderr,"  + Primer : %s   sequence count : %d\n",ecoUnhashWord(words2->words[i],options.primer_length),words2->strictcount[i]);
+
    if (options.no_multi_match)
    {
    	(void)filterMultiStrictPrimer(words);
--- a/src/libecoPCR/ecoMalloc.c
+++ b/src/libecoPCR/ecoMalloc.c
@ -17,7 +17,7 @@ void    eco_untrace_memory_allocation()

 void ecoMallocedMemory()
 {
-	return eco_amount_malloc;
+	//eco_amount_malloc;
 }

 void   *eco_malloc(int64_t chunksize,
@ -60,7 +60,7 @@ void   *eco_realloc(void *chunk,
 	if (!newchunk)
           {
 		ecoError(ECO_MEM_ERROR,error_message,filename,line);
-                fprintf('Requested memory : %d\n',newsize);
+                fprintf(stderr,"Requested memory : %d\n",newsize);
           }
 	if (!chunk)
 		eco_chunk_malloc++;
--- a/src/libecoprimer/Makefile
+++ b/src/libecoprimer/Makefile
@ -13,7 +13,8 @@ SOURCES = goodtaxon.c \
          pairtree.c \
          pairs.c \
          taxstats.c \
-          apat_search.c
+          apat_search.c \
+		  filtering.c

 SRCS=$(SOURCES)
         
--- a/src/libecoprimer/ecoprimer.h
+++ b/src/libecoprimer/ecoprimer.h
@ -47,6 +47,7 @@ typedef uint64_t word_t, *pword_t;
 #define WORDMASK(s)       ((1LLU << ((s) * 2)) -1)
 #define LSHIFTWORD(x,s)    (((x) << 2) & WORDMASK(s))
 #define RSHIFTWORD(x,s)    (((x) & WORDMASK(s))>> 2)
+#define ERRORMASK(s)       ((int32_t)((1LLU << (s)) -1))

 #define RAPPENDBASE(x,s,c) (LSHIFTWORD((x),(s)) | (word_t)(c))
 #define LAPPENDBASE(x,s,c) (RSHIFTWORD((x),(s)) | ((word_t)((~(c)) & 3) << (((s)-1) *2)))
@ -65,6 +66,13 @@ typedef uint64_t word_t, *pword_t;
 #define MINI(x,y) (((x) < (y)) ? (x):(y))
 #define MAXI(x,y) (((x) < (y)) ? (y):(x))

+#define FWORDSIZE (13)
+#define FWORDMASK WORDMASK(FWORDSIZE)
+#define FILTERWORD(x) ((uint32_t)((x) & FWORDMASK))
+#define CFILTERWORD(x,s) ((uint32_t)(((x) >> (((s)-FWORDSIZE)*2)) & FWORDMASK))
+
+
+
 typedef struct {
 	pword_t    words;
 	uint32_t    *strictcount;
@ -231,6 +239,7 @@ typedef struct {

 typedef struct {
 	bool_t         statistics;
+	bool_t         filtering;
 	uint32_t        lmin;                   //**< Amplifia minimal length
 	uint32_t        lmax;                   //**< Amplifia maximal length
 	uint32_t        error_max;              //**< maximum error count in fuzzy search
@ -270,7 +279,8 @@ pecodnadb_t readdnadb(const char *name, uint32_t *size);
 int isGoodTaxon(ecotaxonomy_t *taxonomy,int32_t taxon,poptions_t options);

 uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq);
-pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint32_t doublestrand, ecoseq_t *seq,uint32_t *size);
+pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint32_t doublestrand, ecoseq_t *seq,uint32_t *size,int32_t *neededWords,uint32_t neededWordCount,
+	    int32_t quorum);
 uint32_t ecoCompactHashSequence(pword_t dest,uint32_t size);
 const char* ecoUnhashWord(word_t word,uint32_t size);
 word_t ecoComplementWord(word_t word,uint32_t size);
@ -278,8 +288,8 @@ uint32_t ecoFindWord(pwordcount_t table,word_t word);


 void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,uint32_t seqQuorum);
-pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,ecoseq_t *seq);
-void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t exampleCount,uint32_t seqQuorum,ecoseq_t *seq);
+pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount);
+void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t exampleCount,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount);

 pqueue_t newQueue(pqueue_t queue, uint32_t size);
 pqueue_t resizeQueue(pqueue_t queue, uint32_t size);
@ -318,4 +328,8 @@ float taxonomycoverage(ppair_t pair, poptions_t options);
 char ecoComplementChar(char base);
 void taxonomyspecificity (ppair_t pair);

+int32_t *filteringSeq(pecodnadb_t database, uint32_t seqdbsize,
+					 uint32_t exampleCount,poptions_t options,uint32_t *size,int32_t  sequenceQuorum);
+
+
 #endif /* EPSORT_H_ */
--- a/src/libecoprimer/filtering.c
+++ b/src/libecoprimer/filtering.c
@ -0,0 +1,183 @@
+/*
+ * filtering.c
+ *
+ *  Created on: 12 mai 2009
+ *      Author: coissac
+ */
+
+#include "ecoprimer.h"
+#include <string.h>
+#include <math.h>
+
+#include "hashencoder.h"
+
+static int32_t *ecoFilteringHashSequence(int32_t *dest,
+										 uint32_t circular,
+										 uint32_t doublestrand,
+										 ecoseq_t *seq,
+										 uint32_t *size);
+
+
+
+
+
+/*
+ * Translate one sequence character into its 2-bit nucleotide code using the
+ * encoder table of hashencoder.h. That table only has one entry per letter
+ * 'A'..'Z', so any other character is reported as -1 (not encodable) instead
+ * of being used as an index.
+ */
+static int8_t ecoFilteringEncodeBase(char base)
+{
+	if (base < 'A' || base > 'Z')
+		return -1;
+
+	return encoder[base - 'A'];
+}
+
+/*
+ * Add the words of size FWORDSIZE found in one sequence to a count table.
+ *
+ * dest         : count table indexed by hashed word, holding 4^FWORDSIZE
+ *                int32_t cells. When NULL a zeroed table is allocated.
+ *                The special value (void*)-1 only releases the internal
+ *                scratch table; seq and size are not used in that case and
+ *                NULL is returned.
+ * circular     : when non zero, words overlapping the end of the sequence
+ *                are completed with its first bases.
+ * doublestrand : when non zero, a word and the word built with
+ *                LAPPENDBASE (antiword) share a single cell, the smaller
+ *                of the two hash values.
+ * seq          : sequence to scan (SQ and SQ_length are read).
+ * size         : receives the number of cells of the count table.
+ *
+ * Each distinct word increments its cell at most once per call, whatever
+ * its number of occurrences in the sequence. Words containing a base that
+ * cannot be encoded are ignored.
+ *
+ * Returns the count table (dest, or the newly allocated one).
+ */
+static int32_t *ecoFilteringHashSequence(int32_t *dest,
+										  uint32_t circular,
+										  uint32_t doublestrand,
+										  ecoseq_t *seq,
+										  uint32_t *size)
+{
+	static char    *in_last_seq=NULL;  /* words already seen in the current sequence */
+	uint32_t seqlength;
+	uint32_t wordcount;
+	uint32_t position=0;
+	uint64_t basecount;
+	uint64_t consumed;
+	int8_t code;
+	int32_t error=0;
+	word_t word=0;
+	word_t antiword=0;
+	uint32_t goodword;
+
+	if (dest==(void*)-1)
+	{
+		if (in_last_seq)
+		{
+			ECOFREE(in_last_seq,"Free in last seq table");
+			/* forget the released table so that a later call allocates a new one */
+			in_last_seq=NULL;
+		}
+		return NULL;
+	}
+
+	/* 4^FWORDSIZE computed with integers only */
+	*size = (uint32_t)1 << (2*FWORDSIZE);
+
+	if (!in_last_seq)
+		in_last_seq = ECOMALLOC(*size*sizeof(char),
+								"Cannot allocate filtering hash table");
+
+	memset(in_last_seq,0,*size*sizeof(char));
+
+	if (!dest)
+	{
+		dest = ECOMALLOC(*size*sizeof(int32_t),
+				               "Cannot allocate filtering hash table");
+		memset(dest,0,*size*sizeof(int32_t));
+	}
+
+	seqlength = (seq->SQ_length > 0) ? (uint32_t)seq->SQ_length : 0;
+
+	/* number of word start positions; a linear sequence shorter than
+	 * FWORDSIZE holds no word at all (no unsigned wrap-around) */
+	if (circular)
+		wordcount = seqlength;
+	else
+		wordcount = (seqlength >= FWORDSIZE) ? seqlength - (FWORDSIZE-1) : 0;
+
+	if (!wordcount)
+		return dest;
+
+	/* the first word needs FWORDSIZE bases, every following one a single new base */
+	basecount = (uint64_t)wordcount + FWORDSIZE - 1;
+
+	for (consumed=0; consumed < basecount; consumed++)
+	{
+		code = ecoFilteringEncodeBase(seq->SQ[position]);
+
+							/* roll over the sequence for circular ones */
+		position++;
+		if (position==seqlength) position=0;
+
+		/* error keeps one bit per base of the current word,
+		 * set when that base cannot be encoded */
+		error<<= 1;
+		error&=ERRORMASK(FWORDSIZE);
+
+		if (code <0)
+		{
+			code = 0;
+			error|= 1;
+		}
+
+		word=RAPPENDBASE(word,FWORDSIZE,code);
+		if (doublestrand)
+			antiword=LAPPENDBASE(antiword,FWORDSIZE,code);
+
+		/* a word is complete only once FWORDSIZE bases have been read */
+		if (consumed >= FWORDSIZE-1 && !error)
+		{
+			goodword=(uint32_t)((doublestrand) ? MINI(word,antiword):word);
+
+			if (!in_last_seq[goodword])
+			{
+				in_last_seq[goodword]=1;
+				dest[goodword]++;
+			}
+		}
+	}
+
+	return dest;
+
+}
+
+
+/*
+ * Count, for every word of size FWORDSIZE, the number of example sequences
+ * of the database containing it.
+ *
+ * database       : sequence table, seqdbsize entries.
+ * seqdbsize      : number of sequences in database.
+ * exampleCount   : number of example sequences, only used in the progress
+ *                  message.
+ * options        : circular and doublestrand are read.
+ * size           : receives the number of cells of the returned table, or 0
+ *                  when the database holds no example sequence.
+ * sequenceQuorum : minimal count for a word to be reported as kept in the
+ *                  summary message (the returned counts are not modified).
+ *
+ * Returns the count table allocated by ecoFilteringHashSequence, or NULL
+ * when no example sequence was found. Progress and summary messages are
+ * written to stderr.
+ */
+int32_t *filteringSeq(pecodnadb_t database, uint32_t seqdbsize,
+					 uint32_t exampleCount,poptions_t options,uint32_t *size,int32_t  sequenceQuorum)
+{
+	int32_t *wordscount=NULL;
+	int32_t keep=0;
+	uint32_t i,j=0;
+
+	/* stays 0 if no example sequence is hashed, so the table is never
+	 * scanned while wordscount is still NULL */
+	*size=0;
+
+	for (i=0;i<seqdbsize;i++)
+    {
+    	if (database[i]->isexample)
+    	{
+    		j++;
+    		wordscount=ecoFilteringHashSequence(wordscount,
+									 options->circular,
+                                     options->doublestrand,
+                                     database[i],
+                                     size);
+    	}
+    	fprintf(stderr,"  Filtered sequences %5u/%5u          \r",j,exampleCount);
+
+    }
+
+	fprintf(stderr,"\n");
+
+	if (wordscount)
+	{
+		for (i=0;i<*size;i++)
+			if (wordscount[i] >= sequenceQuorum)
+				keep++;
+	}
+
+	/* (int32_t*)-1 asks ecoFilteringHashSequence to release its scratch table */
+	(void)ecoFilteringHashSequence((int32_t*)-1,
+									options->circular,
+	                                options->doublestrand,
+	                                NULL,
+	                                NULL);
+
+	fprintf(stderr,"ok\n Considered word of size %d for filtering : %d\n",FWORDSIZE,keep);
+	return wordscount;
+
+}
--- a/src/libecoprimer/hashencoder.h
+++ b/src/libecoprimer/hashencoder.h
@ -0,0 +1,21 @@
+/*
+ * hashencoder.h
+ *
+ *  Created on: 12 mai 2009
+ *      Author: coissac
+ */
+
+#ifndef HASHENCODER_H_
+#define HASHENCODER_H_
+
+/*
+ * Nucleotide encoding table, indexed by (letter - 'A') for the upper case
+ * letters 'A' to 'Z': A=0, C=1, G=2, T=3 and U=3. Every other letter is
+ * set to -1. The table has exactly 26 entries; callers must not index it
+ * with a character outside 'A'..'Z'.
+ *
+ * It is read-only data, each file including this header gets its own copy.
+ */
+static const int8_t encoder[] = {0,                                      // A
+		                   -1,                                           // B
+		                   1,                                            // C
+	       	               -1,-1,-1,                                     // D, E, F
+		                   2,                                            // G
+		                   -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,          // H,I,J,K,L,M,N,O,P,Q,R,S
+		                   3,3,                                           // T,U
+		                   -1,-1,-1,-1,-1};                              // V,W,X,Y,Z
+
+
+#endif /* HASHENCODER_H_ */
--- a/src/libecoprimer/hashsequence.c
+++ b/src/libecoprimer/hashsequence.c
@ -10,15 +10,7 @@

 static int cmpword(const void *x,const void *y);

-static int8_t encoder[] = {0,                                            // A
-		                   -1,                                           // b
-		                   1,                                            // C
-	       	               -1,-1,-1,                                     // d, e, f
-		                   2,                                            // G
-		                   -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,          // h,i,j,k,l,m,n,o,p,q,r,s
-		                   3,3,                                           // T,U
-		                   -1,-1,-1,-1,-1};                              // v,w,x,y,z
-
+#include "hashencoder.h"

 uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq)
 {
@ -31,7 +23,15 @@ uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq)
 	return wordcount;
 }

-pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint32_t doublestrand, ecoseq_t *seq,uint32_t *size)
+pword_t ecoHashSequence(pword_t dest,
+					    uint32_t wordsize,
+					    uint32_t circular,
+					    uint32_t doublestrand,
+					    ecoseq_t *seq,
+					    uint32_t *size,
+					    int32_t  *neededWords,
+					    uint32_t neededWordCount,
+					    int32_t quorum)
 {
 	uint32_t i=0;
 	uint32_t j;
@ -40,6 +40,7 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
 	int32_t error=0;
 	word_t word=0;
 	word_t antiword=0;
+	word_t goodword;
 	uint32_t lmax=0;

 	(*size)=0;
@ -57,7 +58,9 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint

 	for (i=0, base = seq->SQ; i < wordsize && i < lmax; i++,base++)
 	{
+
 		error<<= 1;
+		error&=ERRORMASK(wordsize);

 		code = encoder[(*base) - 'A'];
 		if (code <0)
@ -68,10 +71,22 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint


 		word=RAPPENDBASE(word,wordsize,code);
+
 		if (doublestrand)
 			antiword=LAPPENDBASE(antiword,wordsize,code);
+
+		if (neededWordCount && i>=(FWORDSIZE-1))
+		{
+
+			goodword = (doublestrand) ? MINI(FILTERWORD(word),CFILTERWORD(antiword,wordsize)):FILTERWORD(word);
+			if (neededWords[(uint32_t)goodword]<quorum)
+				error|= (1 << (FWORDSIZE-1));
+
+		}
+
 	}

+
 	if (!error && i==wordsize)
 	{
 		dest[*size]=(doublestrand) ? MINI(word,antiword):word;
@ -85,9 +100,11 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
 //		DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j));

 							/* roll over the sequence for circular ones */
+
 		if (i==(uint32_t)seq->SQ_length) base=seq->SQ;

 		error<<= 1;
+		error&=ERRORMASK(wordsize);

 		code = encoder[(*base) - 'A'];
 		if (code <0)
@ -100,6 +117,17 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
 		if (doublestrand)
 			antiword=LAPPENDBASE(antiword,wordsize,code);

+		if (neededWordCount)
+		{
+			goodword = (doublestrand) ? MINI(FILTERWORD(word),CFILTERWORD(antiword,wordsize)):FILTERWORD(word);
+			if (neededWords[(uint32_t)goodword]<quorum)
+				error|= (1 << (FWORDSIZE-1));
+//			else
+//				DEBUG_LOG("%s goodword = %p %d/%d (pos:%d error:%d)",seq->AC,goodword,neededWords[(uint32_t)goodword],quorum,i,error);
+
+		}
+
+
 		if (!error)
 		{
 			dest[*size]=(doublestrand) ? MINI(word,antiword):word;
@ -107,7 +135,7 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
 		}

 	}
-
+	// DEBUG_LOG("%s goodword = %d",seq->AC,*size);
 	return dest;

 }
@ -116,12 +144,16 @@ uint32_t ecoCompactHashSequence(pword_t table,uint32_t size)
 {
 	uint32_t i,j;
 	word_t  current;
+//	bool_t here=FALSE;

 	sortword(table,size);

 	current = 0;
 	current=SETMULTIWORD(current);   /* build impossible word for the first loop cycle */

+//	if (strcmp(ecoUnhashWord(table[size-1],18),"GTTTGTTCAACGATTAAA")==0)
+//		here=TRUE;
+
 	for (i=0,j=0; j < size;j++)
 	{
 		if (WORD(table[j])!=current)
@ -134,6 +166,9 @@ uint32_t ecoCompactHashSequence(pword_t table,uint32_t size)
 			table[i]=SETMULTIWORD(table[i]);
 	}

+//		if (strcmp(ecoUnhashWord(WORD(table[i-1]),18),"TACGACCTCGATGTTGGA")==0)
+//			DEBUG_LOG("winner %d",i)
+
 	return i;
 }

--- a/src/libecoprimer/merge.c
+++ b/src/libecoprimer/merge.c
@ -41,7 +41,8 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui
 	(void)mergeInit(&merged,data,s1,s2);
 	(void)newQueue(&queue,MINI(s1,s2));

-	while (merged.read1 < s1 && merged.read2 < merged.size)
+
+	while (merged.read1 < s1 || merged.read2 < merged.size)
 	{
 		if (! queue.empty)
 		{
@ -56,7 +57,8 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui
 			source=S1;
 		}

-		if (WORD(currentword) > WORD(merged.words[merged.read2]))
+		if (merged.read2 < merged.size &&
+				WORD(currentword) > WORD(merged.words[merged.read2]))
 		{
 			currentword  = merged.words[merged.read2];
 			currentcount = merged.count[merged.read2];
@ -114,6 +116,8 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui


 	if (merged.read2 < merged.size)
+		{
+			//DEBUG_LOG("end1 %d %d/%d  %d/%d",merged.write,merged.read1,s1,merged.read2,merged.size);
 		for (;merged.read2 < merged.size;merged.read2++)
 		{
 			merged.words[merged.write]=merged.words[merged.read2];
@ -122,7 +126,10 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui
 	        	merged.write++;

 		}
-	else while (! queue.empty)
+		}
+	else {
+		//DEBUG_LOG("end2 %d %d/%d  %d/%d",merged.write,merged.read1,s1,merged.read2,merged.size);
+		while (! queue.empty)
 		{
 //			DEBUG_LOG("write : %s count : %d write : %d size : %d pop : %d push : %d  empty : %d",ecoUnhashWord(queue.words[queue.pop],18),queue.count[queue.pop],merged.write,queue.size,queue.pop,queue.push,queue.empty)
 			merged.words[merged.write]=queue.words[queue.pop];
@ -131,6 +138,7 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui
 	        if (remainingSeq + merged.count[merged.write] >= seqQuorum)
 	        	merged.write++;
 		}
+		}

 	data->size = merged.write;

--- a/src/libecoprimer/strictprimers.c
+++ b/src/libecoprimer/strictprimers.c
@ -48,7 +48,7 @@ double timeval_subtract (struct timeval *x, struct timeval *y)
   return (double)result.tv_sec + (double)result.tv_usec/1e6;
 }

-pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,ecoseq_t *seq)
+pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount)
 {
 	uint32_t i;
 	uint32_t buffsize;
@ -65,7 +65,7 @@ pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circ

 	if (seq)
 	{
-		table->words = ecoHashSequence(NULL,wordsize,circular,doublestrand,seq,&buffsize);
+		table->words = ecoHashSequence(NULL,wordsize,circular,doublestrand,seq,&buffsize,neededWords,neededWordCount,seqQuorum);
 		table->size  = ecoCompactHashSequence(table->words,buffsize);

 		table->inseqcount=1;
@ -79,7 +79,7 @@ pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circ
 	return table;
 }

-void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t exampleCount,uint32_t seqQuorum,ecoseq_t *seq)
+void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t exampleCount,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount)
 {
 	uint32_t buffersize;
 	pword_t newtable;
@ -96,7 +96,7 @@ void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circ

 //	DEBUG_LOG("Words = %x (%u) new = %x", table->words,table->size,newtable);

-	(void)ecoHashSequence(newtable,wordsize,circular,doublestrand,seq,&newsize);
+	(void)ecoHashSequence(newtable,wordsize,circular,doublestrand,seq,&newsize,neededWords,neededWordCount,seqQuorum);
 //	DEBUG_LOG("new seq wordCount : %d",newsize);

 	newsize = ecoCompactHashSequence(newtable,newsize);
@ -137,6 +137,18 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
 	pwordcount_t strictprimers=NULL;
 	uint64_t totallength=0;
 	uint32_t  sequenceQuorum = (uint32_t)floor((float)exampleCount * options->strict_quorum);
+	int32_t *neededWords;
+	uint32_t neededWordCount;
+
+	fprintf(stderr,"Filtering... ");
+
+	if (options->filtering)
+		neededWords = filteringSeq(database,seqdbsize,exampleCount,options,&neededWordCount,(int32_t)sequenceQuorum);
+	else
+	{
+		neededWordCount=0;
+		neededWords=NULL;
+	}

 	if (options->statistics)
 	{
@ -152,7 +164,8 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
 	strictprimers = initCountTable(NULL,options->primer_length,
                                                 options->circular,
                                                 options->doublestrand,
-                                   NULL);
+                                                 0,
+                                   NULL,NULL,0);


 	getrusage(RUSAGE_SELF,&start);
@ -167,7 +180,8 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
    			strictprimers = initCountTable(strictprimers,options->primer_length,
                                                             options->circular,
                                                             options->doublestrand,
-                                               database[i]);
+                                                             sequenceQuorum,
+                                               database[i],neededWords,neededWordCount);
    			first=FALSE;
    		}
    		else
@ -180,7 +194,7 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
    					                             options->doublestrand,
    					                             exampleCount,
    					                             sequenceQuorum,
-    					               database[i]);
+    					               database[i],neededWords,neededWordCount);
    		};
    		totallength+=database[i]->SQ_length;
    		getrusage(RUSAGE_SELF,&usage);
@ -215,6 +229,9 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
 										sizeof(word_t)*strictprimers->size,
 										"Cannot reallocate strict primer table");

+    if (neededWords)
+    	ECOFREE(neededWords,"Clean needed word table");
+
 	return strictprimers;
 }

--- a/src/libecoprimer/taxstats.c
+++ b/src/libecoprimer/taxstats.c
@ -47,7 +47,6 @@ int32_t counttaxon(int32_t taxid)
 		tsearch((void*)((size_t)taxid),&taxontree,cmptaxon);
 		taxoncount++;
 	}
-
 	return taxoncount;
 }

@ -60,6 +59,7 @@ int32_t getrankdbstats(pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *tax
 	ecotx_t  *tmptaxon;

 	counttaxon(-1);
+	options->intaxa = 0;

    for (i=0;i<seqdbsize;i++)
 	{
@ -85,6 +85,7 @@ int32_t getrankdbstats(pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *tax
 	}

 	counttaxon(-1);
+	options->outtaxa = 0;

    for (i=0;i<seqdbsize;i++)
 		{
 @ -1 +1 @@
 .2
 .3