New version 0.3 with filtering on short words

git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@213 60f365c0-8329-0410-b2a4-ec073aeeaa1d
This commit is contained in:
2009-05-13 06:51:25 +00:00
parent 5dc55c7f53
commit b7c1640042
12 changed files with 330 additions and 34 deletions

View File

@ -1 +1 @@
0.2 0.3

BIN
src/ecoPrimer Executable file

Binary file not shown.

View File

@ -14,7 +14,7 @@
#include <time.h> #include <time.h>
#include <sys/time.h> #include <sys/time.h>
#define VERSION "0.2" #define VERSION "0.3"
/* TR: by default, statistics are made on species level*/ /* TR: by default, statistics are made on species level*/
#define DEFAULTTAXONRANK "species" #define DEFAULTTAXONRANK "species"
@ -98,6 +98,7 @@ static void ExitUsage(int stat)
void initoptions(poptions_t options) void initoptions(poptions_t options)
{ {
options->statistics=FALSE; options->statistics=FALSE;
options->filtering=TRUE;
options->lmin=0; //< Amplifia minimal length options->lmin=0; //< Amplifia minimal length
options->lmax=1000; //< Amplifia maximal length options->lmax=1000; //< Amplifia maximal length
options->error_max=3; //**< maximum error count in fuzzy search options->error_max=3; //**< maximum error count in fuzzy search
@ -432,6 +433,7 @@ int main(int argc, char **argv)
uint32_t i; uint32_t i;
pwordcount_t words; pwordcount_t words;
// pwordcount_t words2;
pprimercount_t primers; pprimercount_t primers;
ppairtree_t pairs; ppairtree_t pairs;
@ -442,7 +444,7 @@ int main(int argc, char **argv)
initoptions(&options); initoptions(&options);
while ((carg = getopt(argc, argv, "hvcUDSd:l:L:e:i:r:q:3:s:x:t:O:")) != -1) { while ((carg = getopt(argc, argv, "hfvcUDSd:l:L:e:i:r:q:3:s:x:t:O:")) != -1) {
switch (carg) { switch (carg) {
/* ---------------------------- */ /* ---------------------------- */
@ -451,6 +453,12 @@ int main(int argc, char **argv)
options.statistics=TRUE; options.statistics=TRUE;
break; break;
/* ---------------------------- */
case 'f': /* set in single strand mode */
/* ---------------------------- */
options.filtering=FALSE;
break;
/* -------------------- */ /* -------------------- */
case 'd': /* database name */ case 'd': /* database name */
/* -------------------- */ /* -------------------- */
@ -599,12 +607,20 @@ int main(int argc, char **argv)
fprintf(stderr,"\nIndexing words in sequences\n"); fprintf(stderr,"\nIndexing words in sequences\n");
printcurrenttimeinmilli();
words = lookforStrictPrimer(seqdb,seqdbsize,insamples,&options); words = lookforStrictPrimer(seqdb,seqdbsize,insamples,&options);
printcurrenttimeinmilli();
fprintf(stderr,"\n Strict primer count : %d\n",words->size); fprintf(stderr,"\n Strict primer count : %d\n",words->size);
// options.filtering=FALSE;
// words2= lookforStrictPrimer(seqdb,seqdbsize,insamples,&options);
// fprintf(stderr,"\n Strict primer count : %d\n",words2->size);
//
// fprintf(stderr,"\n\n Primer sample : \n");
// for (i=0; i<words->size; i++)
// fprintf(stderr," + Primer : %s sequence count : %d\n",ecoUnhashWord(words->words[i],options.primer_length),words->strictcount[i]);
// fprintf(stderr,"\n\n Primer sample : \n");
// for (i=0; i<words2->size; i++)
// fprintf(stderr," + Primer : %s sequence count : %d\n",ecoUnhashWord(words2->words[i],options.primer_length),words2->strictcount[i]);
if (options.no_multi_match) if (options.no_multi_match)
{ {
(void)filterMultiStrictPrimer(words); (void)filterMultiStrictPrimer(words);

View File

@ -17,7 +17,7 @@ void eco_untrace_memory_allocation()
void ecoMallocedMemory() void ecoMallocedMemory()
{ {
return eco_amount_malloc; //eco_amount_malloc;
} }
void *eco_malloc(int64_t chunksize, void *eco_malloc(int64_t chunksize,
@ -60,7 +60,7 @@ void *eco_realloc(void *chunk,
if (!newchunk) if (!newchunk)
{ {
ecoError(ECO_MEM_ERROR,error_message,filename,line); ecoError(ECO_MEM_ERROR,error_message,filename,line);
fprintf('Requested memory : %d\n',newsize); fprintf(stderr,"Requested memory : %d\n",newsize);
} }
if (!chunk) if (!chunk)
eco_chunk_malloc++; eco_chunk_malloc++;

View File

@ -13,7 +13,8 @@ SOURCES = goodtaxon.c \
pairtree.c \ pairtree.c \
pairs.c \ pairs.c \
taxstats.c \ taxstats.c \
apat_search.c apat_search.c \
filtering.c
SRCS=$(SOURCES) SRCS=$(SOURCES)

View File

@ -47,6 +47,7 @@ typedef uint64_t word_t, *pword_t;
#define WORDMASK(s) ((1LLU << ((s) * 2)) -1) #define WORDMASK(s) ((1LLU << ((s) * 2)) -1)
#define LSHIFTWORD(x,s) (((x) << 2) & WORDMASK(s)) #define LSHIFTWORD(x,s) (((x) << 2) & WORDMASK(s))
#define RSHIFTWORD(x,s) (((x) & WORDMASK(s))>> 2) #define RSHIFTWORD(x,s) (((x) & WORDMASK(s))>> 2)
#define ERRORMASK(s) ((int32_t)((1LLU << (s)) -1))
#define RAPPENDBASE(x,s,c) (LSHIFTWORD((x),(s)) | (word_t)(c)) #define RAPPENDBASE(x,s,c) (LSHIFTWORD((x),(s)) | (word_t)(c))
#define LAPPENDBASE(x,s,c) (RSHIFTWORD((x),(s)) | ((word_t)((~(c)) & 3) << (((s)-1) *2))) #define LAPPENDBASE(x,s,c) (RSHIFTWORD((x),(s)) | ((word_t)((~(c)) & 3) << (((s)-1) *2)))
@ -65,6 +66,13 @@ typedef uint64_t word_t, *pword_t;
#define MINI(x,y) (((x) < (y)) ? (x):(y)) #define MINI(x,y) (((x) < (y)) ? (x):(y))
#define MAXI(x,y) (((x) < (y)) ? (y):(x)) #define MAXI(x,y) (((x) < (y)) ? (y):(x))
#define FWORDSIZE (13)
#define FWORDMASK WORDMASK(FWORDSIZE)
#define FILTERWORD(x) ((uint32_t)((x) & FWORDMASK))
#define CFILTERWORD(x,s) ((uint32_t)(((x) >> (((s)-FWORDSIZE)*2)) & FWORDMASK))
typedef struct { typedef struct {
pword_t words; pword_t words;
uint32_t *strictcount; uint32_t *strictcount;
@ -231,6 +239,7 @@ typedef struct {
typedef struct { typedef struct {
bool_t statistics; bool_t statistics;
bool_t filtering;
uint32_t lmin; //**< Amplifia minimal length uint32_t lmin; //**< Amplifia minimal length
uint32_t lmax; //**< Amplifia maximal length uint32_t lmax; //**< Amplifia maximal length
uint32_t error_max; //**< maximum error count in fuzzy search uint32_t error_max; //**< maximum error count in fuzzy search
@ -270,7 +279,8 @@ pecodnadb_t readdnadb(const char *name, uint32_t *size);
int isGoodTaxon(ecotaxonomy_t *taxonomy,int32_t taxon,poptions_t options); int isGoodTaxon(ecotaxonomy_t *taxonomy,int32_t taxon,poptions_t options);
uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq); uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq);
pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint32_t doublestrand, ecoseq_t *seq,uint32_t *size); pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint32_t doublestrand, ecoseq_t *seq,uint32_t *size,int32_t *neededWords,uint32_t neededWordCount,
int32_t quorum);
uint32_t ecoCompactHashSequence(pword_t dest,uint32_t size); uint32_t ecoCompactHashSequence(pword_t dest,uint32_t size);
const char* ecoUnhashWord(word_t word,uint32_t size); const char* ecoUnhashWord(word_t word,uint32_t size);
word_t ecoComplementWord(word_t word,uint32_t size); word_t ecoComplementWord(word_t word,uint32_t size);
@ -278,8 +288,8 @@ uint32_t ecoFindWord(pwordcount_t table,word_t word);
void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,uint32_t seqQuorum); void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,uint32_t seqQuorum);
pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,ecoseq_t *seq); pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount);
void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t exampleCount,uint32_t seqQuorum,ecoseq_t *seq); void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t exampleCount,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount);
pqueue_t newQueue(pqueue_t queue, uint32_t size); pqueue_t newQueue(pqueue_t queue, uint32_t size);
pqueue_t resizeQueue(pqueue_t queue, uint32_t size); pqueue_t resizeQueue(pqueue_t queue, uint32_t size);
@ -318,4 +328,8 @@ float taxonomycoverage(ppair_t pair, poptions_t options);
char ecoComplementChar(char base); char ecoComplementChar(char base);
void taxonomyspecificity (ppair_t pair); void taxonomyspecificity (ppair_t pair);
int32_t *filteringSeq(pecodnadb_t database, uint32_t seqdbsize,
uint32_t exampleCount,poptions_t options,uint32_t *size,int32_t sequenceQuorum);
#endif /* EPSORT_H_ */ #endif /* EPSORT_H_ */

View File

@ -0,0 +1,183 @@
/*
* filtering.c
*
* Created on: 12 mai 2009
* Author: coissac
*/
#include "ecoprimer.h"
#include <string.h>
#include <math.h>
#include "hashencoder.h"
static int32_t *ecoFilteringHashSequence(int32_t *dest,
uint32_t circular,
uint32_t doublestrand,
ecoseq_t *seq,
uint32_t *size);
static int32_t *ecoFilteringHashSequence(int32_t *dest,
uint32_t circular,
uint32_t doublestrand,
ecoseq_t *seq,
uint32_t *size)
{
static char *in_last_seq=NULL;
uint32_t i=0;
uint32_t j;
char *base;
int8_t code;
int32_t error=0;
word_t word=0;
word_t antiword=0;
uint32_t goodword;
uint32_t lmax=0;
// run on the first call;
if (dest==(void*)-1)
{
if (in_last_seq) ECOFREE(in_last_seq,"Free in last seq table");
return NULL;
}
*size = pow(4,FWORDSIZE);
if (!in_last_seq)
in_last_seq = ECOMALLOC(*size*sizeof(char),
"Cannot allocate filtering hash table");
memset(in_last_seq,0,*size*sizeof(char));
if (!dest)
{
dest = ECOMALLOC(*size*sizeof(int32_t),
"Cannot allocate filtering hash table");
memset(dest,0,*size*sizeof(int32_t));
}
lmax = seq->SQ_length;
if (!circular)
lmax-= FWORDSIZE-1;
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i));
for (i=0, base = seq->SQ; i < FWORDSIZE && i < lmax; i++,base++)
{
error<<= 1;
error&=ERRORMASK(FWORDSIZE);
code = encoder[(*base) - 'A'];
if (code <0)
{
code = 0;
error|= 1;
}
word=RAPPENDBASE(word,FWORDSIZE,code);
if (doublestrand)
antiword=LAPPENDBASE(antiword,FWORDSIZE,code);
}
if (!error && i==FWORDSIZE)
{
goodword=(uint32_t)((doublestrand) ? MINI(word,antiword):word);
if (!in_last_seq[goodword])
{
in_last_seq[goodword]=1;
dest[goodword]++;
}
}
for (j=1; j < lmax; j++,i++,base++)
{
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j));
/* roll over the sequence for circular ones */
if (i==(uint32_t)seq->SQ_length) base=seq->SQ;
error<<= 1;
error&=ERRORMASK(FWORDSIZE);
code = encoder[(*base) - 'A'];
if (code <0)
{
code = 0;
error|= 1;
}
word=RAPPENDBASE(word,FWORDSIZE,code);
if (doublestrand)
antiword=LAPPENDBASE(antiword,FWORDSIZE,code);
if (!error)
{
goodword=(uint32_t)((doublestrand) ? MINI(word,antiword):word);
if (!in_last_seq[goodword])
{
in_last_seq[goodword]=1;
dest[goodword]++;
}
}
}
return dest;
}
int32_t *filteringSeq(pecodnadb_t database, uint32_t seqdbsize,
uint32_t exampleCount,poptions_t options,uint32_t *size,int32_t sequenceQuorum)
{
int32_t *wordscount=NULL;
int32_t keep=0;
uint32_t i,j=0;
for (i=0;i<seqdbsize;i++)
{
if (database[i]->isexample)
{
j++;
wordscount=ecoFilteringHashSequence(wordscount,
options->circular,
options->doublestrand,
database[i],
size);
}
fprintf(stderr," Filtered sequences %5u/%5u \r",j,exampleCount);
}
fprintf(stderr,"\n");
for (i=0;i<*size;i++)
if (wordscount[i] >= sequenceQuorum)
keep++;
(void)ecoFilteringHashSequence((int32_t*)-1,
options->circular,
options->doublestrand,
NULL,
NULL);
fprintf(stderr,"ok\n Considered word of size %d for filtering : %d\n",FWORDSIZE,keep);
return wordscount;
}

View File

@ -0,0 +1,21 @@
/*
* hashencoder.h
*
* Created on: 12 mai 2009
* Author: coissac
*/
#ifndef HASHENCODER_H_
#define HASHENCODER_H_
static int8_t encoder[] = {0, // A
-1, // b
1, // C
-1,-1,-1, // d, e, f
2, // G
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // h,i,j,k,l,m,n,o,p,q,r,s
3,3, // T,U
-1,-1,-1,-1,-1}; // v,w,x,y,z
#endif /* HASHENCODER_H_ */

View File

@ -10,15 +10,7 @@
static int cmpword(const void *x,const void *y); static int cmpword(const void *x,const void *y);
static int8_t encoder[] = {0, // A #include "hashencoder.h"
-1, // b
1, // C
-1,-1,-1, // d, e, f
2, // G
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // h,i,j,k,l,m,n,o,p,q,r,s
3,3, // T,U
-1,-1,-1,-1,-1}; // v,w,x,y,z
uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq) uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq)
{ {
@ -31,7 +23,15 @@ uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq)
return wordcount; return wordcount;
} }
pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint32_t doublestrand, ecoseq_t *seq,uint32_t *size) pword_t ecoHashSequence(pword_t dest,
uint32_t wordsize,
uint32_t circular,
uint32_t doublestrand,
ecoseq_t *seq,
uint32_t *size,
int32_t *neededWords,
uint32_t neededWordCount,
int32_t quorum)
{ {
uint32_t i=0; uint32_t i=0;
uint32_t j; uint32_t j;
@ -40,6 +40,7 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
int32_t error=0; int32_t error=0;
word_t word=0; word_t word=0;
word_t antiword=0; word_t antiword=0;
word_t goodword;
uint32_t lmax=0; uint32_t lmax=0;
(*size)=0; (*size)=0;
@ -57,7 +58,9 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
for (i=0, base = seq->SQ; i < wordsize && i < lmax; i++,base++) for (i=0, base = seq->SQ; i < wordsize && i < lmax; i++,base++)
{ {
error<<= 1; error<<= 1;
error&=ERRORMASK(wordsize);
code = encoder[(*base) - 'A']; code = encoder[(*base) - 'A'];
if (code <0) if (code <0)
@ -68,10 +71,22 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
word=RAPPENDBASE(word,wordsize,code); word=RAPPENDBASE(word,wordsize,code);
if (doublestrand) if (doublestrand)
antiword=LAPPENDBASE(antiword,wordsize,code); antiword=LAPPENDBASE(antiword,wordsize,code);
if (neededWordCount && i>=(FWORDSIZE-1))
{
goodword = (doublestrand) ? MINI(FILTERWORD(word),CFILTERWORD(antiword,wordsize)):FILTERWORD(word);
if (neededWords[(uint32_t)goodword]<quorum)
error|= (1 << (FWORDSIZE-1));
}
} }
if (!error && i==wordsize) if (!error && i==wordsize)
{ {
dest[*size]=(doublestrand) ? MINI(word,antiword):word; dest[*size]=(doublestrand) ? MINI(word,antiword):word;
@ -85,9 +100,11 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j)); // DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j));
/* roll over the sequence for circular ones */ /* roll over the sequence for circular ones */
if (i==(uint32_t)seq->SQ_length) base=seq->SQ; if (i==(uint32_t)seq->SQ_length) base=seq->SQ;
error<<= 1; error<<= 1;
error&=ERRORMASK(wordsize);
code = encoder[(*base) - 'A']; code = encoder[(*base) - 'A'];
if (code <0) if (code <0)
@ -100,6 +117,17 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
if (doublestrand) if (doublestrand)
antiword=LAPPENDBASE(antiword,wordsize,code); antiword=LAPPENDBASE(antiword,wordsize,code);
if (neededWordCount)
{
goodword = (doublestrand) ? MINI(FILTERWORD(word),CFILTERWORD(antiword,wordsize)):FILTERWORD(word);
if (neededWords[(uint32_t)goodword]<quorum)
error|= (1 << (FWORDSIZE-1));
// else
// DEBUG_LOG("%s goodword = %p %d/%d (pos:%d error:%d)",seq->AC,goodword,neededWords[(uint32_t)goodword],quorum,i,error);
}
if (!error) if (!error)
{ {
dest[*size]=(doublestrand) ? MINI(word,antiword):word; dest[*size]=(doublestrand) ? MINI(word,antiword):word;
@ -107,7 +135,7 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
} }
} }
// DEBUG_LOG("%s goodword = %d",seq->AC,*size);
return dest; return dest;
} }
@ -116,12 +144,16 @@ uint32_t ecoCompactHashSequence(pword_t table,uint32_t size)
{ {
uint32_t i,j; uint32_t i,j;
word_t current; word_t current;
// bool_t here=FALSE;
sortword(table,size); sortword(table,size);
current = 0; current = 0;
current=SETMULTIWORD(current); /* build impossible word for the first loop cycle */ current=SETMULTIWORD(current); /* build impossible word for the first loop cycle */
// if (strcmp(ecoUnhashWord(table[size-1],18),"GTTTGTTCAACGATTAAA")==0)
// here=TRUE;
for (i=0,j=0; j < size;j++) for (i=0,j=0; j < size;j++)
{ {
if (WORD(table[j])!=current) if (WORD(table[j])!=current)
@ -134,6 +166,9 @@ uint32_t ecoCompactHashSequence(pword_t table,uint32_t size)
table[i]=SETMULTIWORD(table[i]); table[i]=SETMULTIWORD(table[i]);
} }
// if (strcmp(ecoUnhashWord(WORD(table[i-1]),18),"TACGACCTCGATGTTGGA")==0)
// DEBUG_LOG("winner %d",i)
return i; return i;
} }

View File

@ -41,7 +41,8 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui
(void)mergeInit(&merged,data,s1,s2); (void)mergeInit(&merged,data,s1,s2);
(void)newQueue(&queue,MINI(s1,s2)); (void)newQueue(&queue,MINI(s1,s2));
while (merged.read1 < s1 && merged.read2 < merged.size)
while (merged.read1 < s1 || merged.read2 < merged.size)
{ {
if (! queue.empty) if (! queue.empty)
{ {
@ -56,7 +57,8 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui
source=S1; source=S1;
} }
if (WORD(currentword) > WORD(merged.words[merged.read2])) if (merged.read2 < merged.size &&
WORD(currentword) > WORD(merged.words[merged.read2]))
{ {
currentword = merged.words[merged.read2]; currentword = merged.words[merged.read2];
currentcount = merged.count[merged.read2]; currentcount = merged.count[merged.read2];
@ -114,6 +116,8 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui
if (merged.read2 < merged.size) if (merged.read2 < merged.size)
{
//DEBUG_LOG("end1 %d %d/%d %d/%d",merged.write,merged.read1,s1,merged.read2,merged.size);
for (;merged.read2 < merged.size;merged.read2++) for (;merged.read2 < merged.size;merged.read2++)
{ {
merged.words[merged.write]=merged.words[merged.read2]; merged.words[merged.write]=merged.words[merged.read2];
@ -122,7 +126,10 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui
merged.write++; merged.write++;
} }
else while (! queue.empty) }
else {
//DEBUG_LOG("end2 %d %d/%d %d/%d",merged.write,merged.read1,s1,merged.read2,merged.size);
while (! queue.empty)
{ {
// DEBUG_LOG("write : %s count : %d write : %d size : %d pop : %d push : %d empty : %d",ecoUnhashWord(queue.words[queue.pop],18),queue.count[queue.pop],merged.write,queue.size,queue.pop,queue.push,queue.empty) // DEBUG_LOG("write : %s count : %d write : %d size : %d pop : %d push : %d empty : %d",ecoUnhashWord(queue.words[queue.pop],18),queue.count[queue.pop],merged.write,queue.size,queue.pop,queue.push,queue.empty)
merged.words[merged.write]=queue.words[queue.pop]; merged.words[merged.write]=queue.words[queue.pop];
@ -131,6 +138,7 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui
if (remainingSeq + merged.count[merged.write] >= seqQuorum) if (remainingSeq + merged.count[merged.write] >= seqQuorum)
merged.write++; merged.write++;
} }
}
data->size = merged.write; data->size = merged.write;

View File

@ -48,7 +48,7 @@ double timeval_subtract (struct timeval *x, struct timeval *y)
return (double)result.tv_sec + (double)result.tv_usec/1e6; return (double)result.tv_sec + (double)result.tv_usec/1e6;
} }
pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,ecoseq_t *seq) pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount)
{ {
uint32_t i; uint32_t i;
uint32_t buffsize; uint32_t buffsize;
@ -65,7 +65,7 @@ pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circ
if (seq) if (seq)
{ {
table->words = ecoHashSequence(NULL,wordsize,circular,doublestrand,seq,&buffsize); table->words = ecoHashSequence(NULL,wordsize,circular,doublestrand,seq,&buffsize,neededWords,neededWordCount,seqQuorum);
table->size = ecoCompactHashSequence(table->words,buffsize); table->size = ecoCompactHashSequence(table->words,buffsize);
table->inseqcount=1; table->inseqcount=1;
@ -79,7 +79,7 @@ pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circ
return table; return table;
} }
void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t exampleCount,uint32_t seqQuorum,ecoseq_t *seq) void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t exampleCount,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount)
{ {
uint32_t buffersize; uint32_t buffersize;
pword_t newtable; pword_t newtable;
@ -96,7 +96,7 @@ void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circ
// DEBUG_LOG("Words = %x (%u) new = %x", table->words,table->size,newtable); // DEBUG_LOG("Words = %x (%u) new = %x", table->words,table->size,newtable);
(void)ecoHashSequence(newtable,wordsize,circular,doublestrand,seq,&newsize); (void)ecoHashSequence(newtable,wordsize,circular,doublestrand,seq,&newsize,neededWords,neededWordCount,seqQuorum);
// DEBUG_LOG("new seq wordCount : %d",newsize); // DEBUG_LOG("new seq wordCount : %d",newsize);
newsize = ecoCompactHashSequence(newtable,newsize); newsize = ecoCompactHashSequence(newtable,newsize);
@ -137,6 +137,18 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
pwordcount_t strictprimers=NULL; pwordcount_t strictprimers=NULL;
uint64_t totallength=0; uint64_t totallength=0;
uint32_t sequenceQuorum = (uint32_t)floor((float)exampleCount * options->strict_quorum); uint32_t sequenceQuorum = (uint32_t)floor((float)exampleCount * options->strict_quorum);
int32_t *neededWords;
uint32_t neededWordCount;
fprintf(stderr,"Filtering... ");
if (options->filtering)
neededWords = filteringSeq(database,seqdbsize,exampleCount,options,&neededWordCount,(int32_t)sequenceQuorum);
else
{
neededWordCount=0;
neededWords=NULL;
}
if (options->statistics) if (options->statistics)
{ {
@ -152,7 +164,8 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
strictprimers = initCountTable(NULL,options->primer_length, strictprimers = initCountTable(NULL,options->primer_length,
options->circular, options->circular,
options->doublestrand, options->doublestrand,
NULL); 0,
NULL,NULL,0);
getrusage(RUSAGE_SELF,&start); getrusage(RUSAGE_SELF,&start);
@ -167,7 +180,8 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
strictprimers = initCountTable(strictprimers,options->primer_length, strictprimers = initCountTable(strictprimers,options->primer_length,
options->circular, options->circular,
options->doublestrand, options->doublestrand,
database[i]); sequenceQuorum,
database[i],neededWords,neededWordCount);
first=FALSE; first=FALSE;
} }
else else
@ -180,7 +194,7 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
options->doublestrand, options->doublestrand,
exampleCount, exampleCount,
sequenceQuorum, sequenceQuorum,
database[i]); database[i],neededWords,neededWordCount);
}; };
totallength+=database[i]->SQ_length; totallength+=database[i]->SQ_length;
getrusage(RUSAGE_SELF,&usage); getrusage(RUSAGE_SELF,&usage);
@ -215,6 +229,9 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
sizeof(word_t)*strictprimers->size, sizeof(word_t)*strictprimers->size,
"Cannot reallocate strict primer table"); "Cannot reallocate strict primer table");
if (neededWords)
ECOFREE(neededWords,"Clean needed word table");
return strictprimers; return strictprimers;
} }

View File

@ -47,7 +47,6 @@ int32_t counttaxon(int32_t taxid)
tsearch((void*)((size_t)taxid),&taxontree,cmptaxon); tsearch((void*)((size_t)taxid),&taxontree,cmptaxon);
taxoncount++; taxoncount++;
} }
return taxoncount; return taxoncount;
} }
@ -60,6 +59,7 @@ int32_t getrankdbstats(pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *tax
ecotx_t *tmptaxon; ecotx_t *tmptaxon;
counttaxon(-1); counttaxon(-1);
options->intaxa = 0;
for (i=0;i<seqdbsize;i++) for (i=0;i<seqdbsize;i++)
{ {
@ -85,6 +85,7 @@ int32_t getrankdbstats(pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *tax
} }
counttaxon(-1); counttaxon(-1);
options->outtaxa = 0;
for (i=0;i<seqdbsize;i++) for (i=0;i<seqdbsize;i++)
{ {