New version 0.3 with filtering on short words
git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@213 60f365c0-8329-0410-b2a4-ec073aeeaa1d
This commit is contained in:
BIN
src/ecoPrimer
Executable file
BIN
src/ecoPrimer
Executable file
Binary file not shown.
@ -14,7 +14,7 @@
|
||||
#include <time.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
#define VERSION "0.2"
|
||||
#define VERSION "0.3"
|
||||
/* TR: by default, statistics are made on species level*/
|
||||
#define DEFAULTTAXONRANK "species"
|
||||
|
||||
@ -98,6 +98,7 @@ static void ExitUsage(int stat)
|
||||
void initoptions(poptions_t options)
|
||||
{
|
||||
options->statistics=FALSE;
|
||||
options->filtering=TRUE;
|
||||
options->lmin=0; //< Amplifia minimal length
|
||||
options->lmax=1000; //< Amplifia maximal length
|
||||
options->error_max=3; //**< maximum error count in fuzzy search
|
||||
@ -432,6 +433,7 @@ int main(int argc, char **argv)
|
||||
uint32_t i;
|
||||
|
||||
pwordcount_t words;
|
||||
// pwordcount_t words2;
|
||||
pprimercount_t primers;
|
||||
ppairtree_t pairs;
|
||||
|
||||
@ -442,7 +444,7 @@ int main(int argc, char **argv)
|
||||
|
||||
initoptions(&options);
|
||||
|
||||
while ((carg = getopt(argc, argv, "hvcUDSd:l:L:e:i:r:q:3:s:x:t:O:")) != -1) {
|
||||
while ((carg = getopt(argc, argv, "hfvcUDSd:l:L:e:i:r:q:3:s:x:t:O:")) != -1) {
|
||||
|
||||
switch (carg) {
|
||||
/* ---------------------------- */
|
||||
@ -451,6 +453,12 @@ int main(int argc, char **argv)
|
||||
options.statistics=TRUE;
|
||||
break;
|
||||
|
||||
/* ---------------------------- */
|
||||
case 'f': /* disable the filtering on short words */
|
||||
/* ---------------------------- */
|
||||
options.filtering=FALSE;
|
||||
break;
|
||||
|
||||
/* -------------------- */
|
||||
case 'd': /* database name */
|
||||
/* -------------------- */
|
||||
@ -599,12 +607,20 @@ int main(int argc, char **argv)
|
||||
|
||||
fprintf(stderr,"\nIndexing words in sequences\n");
|
||||
|
||||
printcurrenttimeinmilli();
|
||||
words = lookforStrictPrimer(seqdb,seqdbsize,insamples,&options);
|
||||
printcurrenttimeinmilli();
|
||||
|
||||
fprintf(stderr,"\n Strict primer count : %d\n",words->size);
|
||||
|
||||
// options.filtering=FALSE;
|
||||
// words2= lookforStrictPrimer(seqdb,seqdbsize,insamples,&options);
|
||||
// fprintf(stderr,"\n Strict primer count : %d\n",words2->size);
|
||||
//
|
||||
// fprintf(stderr,"\n\n Primer sample : \n");
|
||||
// for (i=0; i<words->size; i++)
|
||||
// fprintf(stderr," + Primer : %s sequence count : %d\n",ecoUnhashWord(words->words[i],options.primer_length),words->strictcount[i]);
|
||||
// fprintf(stderr,"\n\n Primer sample : \n");
|
||||
// for (i=0; i<words2->size; i++)
|
||||
// fprintf(stderr," + Primer : %s sequence count : %d\n",ecoUnhashWord(words2->words[i],options.primer_length),words2->strictcount[i]);
|
||||
|
||||
if (options.no_multi_match)
|
||||
{
|
||||
(void)filterMultiStrictPrimer(words);
|
||||
|
@ -17,7 +17,7 @@ void eco_untrace_memory_allocation()
|
||||
|
||||
void ecoMallocedMemory()
|
||||
{
|
||||
return eco_amount_malloc;
|
||||
//eco_amount_malloc;
|
||||
}
|
||||
|
||||
void *eco_malloc(int64_t chunksize,
|
||||
@ -60,7 +60,7 @@ void *eco_realloc(void *chunk,
|
||||
if (!newchunk)
|
||||
{
|
||||
ecoError(ECO_MEM_ERROR,error_message,filename,line);
|
||||
fprintf('Requested memory : %d\n',newsize);
|
||||
fprintf(stderr,"Requested memory : %d\n",newsize);
|
||||
}
|
||||
if (!chunk)
|
||||
eco_chunk_malloc++;
|
||||
|
@ -13,7 +13,8 @@ SOURCES = goodtaxon.c \
|
||||
pairtree.c \
|
||||
pairs.c \
|
||||
taxstats.c \
|
||||
apat_search.c
|
||||
apat_search.c \
|
||||
filtering.c
|
||||
|
||||
SRCS=$(SOURCES)
|
||||
|
||||
|
@ -47,6 +47,7 @@ typedef uint64_t word_t, *pword_t;
|
||||
#define WORDMASK(s) ((1LLU << ((s) * 2)) -1)
|
||||
#define LSHIFTWORD(x,s) (((x) << 2) & WORDMASK(s))
|
||||
#define RSHIFTWORD(x,s) (((x) & WORDMASK(s))>> 2)
|
||||
#define ERRORMASK(s) ((int32_t)((1LLU << (s)) -1))
|
||||
|
||||
#define RAPPENDBASE(x,s,c) (LSHIFTWORD((x),(s)) | (word_t)(c))
|
||||
#define LAPPENDBASE(x,s,c) (RSHIFTWORD((x),(s)) | ((word_t)((~(c)) & 3) << (((s)-1) *2)))
|
||||
@ -65,6 +66,13 @@ typedef uint64_t word_t, *pword_t;
|
||||
#define MINI(x,y) (((x) < (y)) ? (x):(y))
|
||||
#define MAXI(x,y) (((x) < (y)) ? (y):(x))
|
||||
|
||||
#define FWORDSIZE (13)
|
||||
#define FWORDMASK WORDMASK(FWORDSIZE)
|
||||
#define FILTERWORD(x) ((uint32_t)((x) & FWORDMASK))
|
||||
#define CFILTERWORD(x,s) ((uint32_t)(((x) >> (((s)-FWORDSIZE)*2)) & FWORDMASK))
|
||||
|
||||
|
||||
|
||||
typedef struct {
|
||||
pword_t words;
|
||||
uint32_t *strictcount;
|
||||
@ -231,6 +239,7 @@ typedef struct {
|
||||
|
||||
typedef struct {
|
||||
bool_t statistics;
|
||||
bool_t filtering;
|
||||
uint32_t lmin; //**< Amplifia minimal length
|
||||
uint32_t lmax; //**< Amplifia maximal length
|
||||
uint32_t error_max; //**< maximum error count in fuzzy search
|
||||
@ -270,7 +279,8 @@ pecodnadb_t readdnadb(const char *name, uint32_t *size);
|
||||
int isGoodTaxon(ecotaxonomy_t *taxonomy,int32_t taxon,poptions_t options);
|
||||
|
||||
uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq);
|
||||
pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint32_t doublestrand, ecoseq_t *seq,uint32_t *size);
|
||||
pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint32_t doublestrand, ecoseq_t *seq,uint32_t *size,int32_t *neededWords,uint32_t neededWordCount,
|
||||
int32_t quorum);
|
||||
uint32_t ecoCompactHashSequence(pword_t dest,uint32_t size);
|
||||
const char* ecoUnhashWord(word_t word,uint32_t size);
|
||||
word_t ecoComplementWord(word_t word,uint32_t size);
|
||||
@ -278,8 +288,8 @@ uint32_t ecoFindWord(pwordcount_t table,word_t word);
|
||||
|
||||
|
||||
void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,uint32_t seqQuorum);
|
||||
pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,ecoseq_t *seq);
|
||||
void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t exampleCount,uint32_t seqQuorum,ecoseq_t *seq);
|
||||
pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount);
|
||||
void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t exampleCount,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount);
|
||||
|
||||
pqueue_t newQueue(pqueue_t queue, uint32_t size);
|
||||
pqueue_t resizeQueue(pqueue_t queue, uint32_t size);
|
||||
@ -318,4 +328,8 @@ float taxonomycoverage(ppair_t pair, poptions_t options);
|
||||
char ecoComplementChar(char base);
|
||||
void taxonomyspecificity (ppair_t pair);
|
||||
|
||||
int32_t *filteringSeq(pecodnadb_t database, uint32_t seqdbsize,
|
||||
uint32_t exampleCount,poptions_t options,uint32_t *size,int32_t sequenceQuorum);
|
||||
|
||||
|
||||
#endif /* EPSORT_H_ */
|
||||
|
183
src/libecoprimer/filtering.c
Normal file
183
src/libecoprimer/filtering.c
Normal file
@ -0,0 +1,183 @@
|
||||
/*
|
||||
* filtering.c
|
||||
*
|
||||
* Created on: 12 mai 2009
|
||||
* Author: coissac
|
||||
*/
|
||||
|
||||
#include "ecoprimer.h"
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "hashencoder.h"
|
||||
|
||||
static int32_t *ecoFilteringHashSequence(int32_t *dest,
|
||||
uint32_t circular,
|
||||
uint32_t doublestrand,
|
||||
ecoseq_t *seq,
|
||||
uint32_t *size);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
static int32_t *ecoFilteringHashSequence(int32_t *dest,
|
||||
uint32_t circular,
|
||||
uint32_t doublestrand,
|
||||
ecoseq_t *seq,
|
||||
uint32_t *size)
|
||||
{
|
||||
static char *in_last_seq=NULL;
|
||||
uint32_t i=0;
|
||||
uint32_t j;
|
||||
char *base;
|
||||
int8_t code;
|
||||
int32_t error=0;
|
||||
word_t word=0;
|
||||
word_t antiword=0;
|
||||
uint32_t goodword;
|
||||
uint32_t lmax=0;
|
||||
|
||||
// run on the first call;
|
||||
|
||||
|
||||
if (dest==(void*)-1)
|
||||
{
|
||||
if (in_last_seq) ECOFREE(in_last_seq,"Free in last seq table");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
*size = pow(4,FWORDSIZE);
|
||||
|
||||
if (!in_last_seq)
|
||||
in_last_seq = ECOMALLOC(*size*sizeof(char),
|
||||
"Cannot allocate filtering hash table");
|
||||
|
||||
memset(in_last_seq,0,*size*sizeof(char));
|
||||
|
||||
|
||||
if (!dest)
|
||||
{
|
||||
dest = ECOMALLOC(*size*sizeof(int32_t),
|
||||
"Cannot allocate filtering hash table");
|
||||
memset(dest,0,*size*sizeof(int32_t));
|
||||
}
|
||||
|
||||
lmax = seq->SQ_length;
|
||||
if (!circular)
|
||||
lmax-= FWORDSIZE-1;
|
||||
|
||||
|
||||
|
||||
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i));
|
||||
|
||||
for (i=0, base = seq->SQ; i < FWORDSIZE && i < lmax; i++,base++)
|
||||
{
|
||||
error<<= 1;
|
||||
error&=ERRORMASK(FWORDSIZE);
|
||||
|
||||
code = encoder[(*base) - 'A'];
|
||||
if (code <0)
|
||||
{
|
||||
code = 0;
|
||||
error|= 1;
|
||||
}
|
||||
|
||||
|
||||
word=RAPPENDBASE(word,FWORDSIZE,code);
|
||||
if (doublestrand)
|
||||
antiword=LAPPENDBASE(antiword,FWORDSIZE,code);
|
||||
}
|
||||
|
||||
if (!error && i==FWORDSIZE)
|
||||
{
|
||||
|
||||
goodword=(uint32_t)((doublestrand) ? MINI(word,antiword):word);
|
||||
|
||||
if (!in_last_seq[goodword])
|
||||
{
|
||||
in_last_seq[goodword]=1;
|
||||
dest[goodword]++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
for (j=1; j < lmax; j++,i++,base++)
|
||||
{
|
||||
|
||||
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j));
|
||||
|
||||
/* roll over the sequence for circular ones */
|
||||
if (i==(uint32_t)seq->SQ_length) base=seq->SQ;
|
||||
|
||||
error<<= 1;
|
||||
error&=ERRORMASK(FWORDSIZE);
|
||||
|
||||
code = encoder[(*base) - 'A'];
|
||||
if (code <0)
|
||||
{
|
||||
code = 0;
|
||||
error|= 1;
|
||||
}
|
||||
|
||||
word=RAPPENDBASE(word,FWORDSIZE,code);
|
||||
if (doublestrand)
|
||||
antiword=LAPPENDBASE(antiword,FWORDSIZE,code);
|
||||
|
||||
if (!error)
|
||||
{
|
||||
goodword=(uint32_t)((doublestrand) ? MINI(word,antiword):word);
|
||||
if (!in_last_seq[goodword])
|
||||
{
|
||||
in_last_seq[goodword]=1;
|
||||
dest[goodword]++;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return dest;
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
 * Build the table counting, for each word of FWORDSIZE bases, the number
 * of example sequences containing it.
 *
 * @param database        sequence database.
 * @param seqdbsize       number of sequences in database.
 * @param exampleCount    number of example sequences (only used by the
 *                        progress message).
 * @param options         run options; the circular and doublestrand
 *                        fields are read.
 * @param size            set to the number of entries of the returned
 *                        table, or to 0 if no table was built.
 * @param sequenceQuorum  minimal count for a word to be reported as
 *                        considered in the final message.
 *
 * @return the counter table, to be released by the caller, or NULL if
 *         the database contains no example sequence.
 */
int32_t *filteringSeq(pecodnadb_t database, uint32_t seqdbsize,
                      uint32_t exampleCount,poptions_t options,uint32_t *size,int32_t sequenceQuorum)
{
    int32_t *wordscount = NULL;
    int32_t  keep       = 0;
    uint32_t done       = 0;
    uint32_t i;

    /* Without any example sequence no table is built: report an empty  */
    /* table instead of leaving *size undefined.                        */
    *size = 0;

    for (i = 0; i < seqdbsize; i++)
    {
        if (database[i]->isexample)
        {
            done++;
            wordscount = ecoFilteringHashSequence(wordscount,
                                                  options->circular,
                                                  options->doublestrand,
                                                  database[i],
                                                  size);
        }
        fprintf(stderr," Filtered sequences %5u/%5u \r",done,exampleCount);
    }

    fprintf(stderr,"\n");

    /* *size is not null only if wordscount has been allocated */
    for (i = 0; i < *size; i++)
        if (wordscount[i] >= sequenceQuorum)
            keep++;

    /* Ask the hashing function to release its static scratch table */
    (void)ecoFilteringHashSequence((int32_t*)-1,
                                   options->circular,
                                   options->doublestrand,
                                   NULL,
                                   NULL);

    fprintf(stderr,"ok\n Considered word of size %d for filtering : %d\n",FWORDSIZE,keep);
    return wordscount;
}
|
21
src/libecoprimer/hashencoder.h
Normal file
21
src/libecoprimer/hashencoder.h
Normal file
@ -0,0 +1,21 @@
|
||||
/*
|
||||
* hashencoder.h
|
||||
*
|
||||
* Created on: 12 mai 2009
|
||||
* Author: coissac
|
||||
*/
|
||||
|
||||
#ifndef HASHENCODER_H_
#define HASHENCODER_H_

#include <stdint.h>

/*
 * Nucleotide encoding table, indexed by (letter - 'A') for the upper case
 * letters 'A' to 'Z' (26 entries).
 *
 *   A -> 0, C -> 1, G -> 2, T and U -> 3, any other letter -> -1
 *
 * The table is read-only and private to each file including this header.
 * Callers have to check that the letter is in the 'A'..'Z' range before
 * indexing the table.
 */
static const int8_t encoder[] = { 0,                                  // A
                                 -1,                                  // b
                                  1,                                  // C
                                 -1,-1,-1,                            // d, e, f
                                  2,                                  // G
                                 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // h,i,j,k,l,m,n,o,p,q,r,s
                                  3, 3,                               // T,U
                                 -1,-1,-1,-1,-1};                     // v,w,x,y,z

#endif /* HASHENCODER_H_ */
|
@ -10,15 +10,7 @@
|
||||
|
||||
static int cmpword(const void *x,const void *y);
|
||||
|
||||
static int8_t encoder[] = {0, // A
|
||||
-1, // b
|
||||
1, // C
|
||||
-1,-1,-1, // d, e, f
|
||||
2, // G
|
||||
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // h,i,j,k,l,m,n,o,p,q,r,s
|
||||
3,3, // T,U
|
||||
-1,-1,-1,-1,-1}; // v,w,x,y,z
|
||||
|
||||
#include "hashencoder.h"
|
||||
|
||||
uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq)
|
||||
{
|
||||
@ -31,7 +23,15 @@ uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq)
|
||||
return wordcount;
|
||||
}
|
||||
|
||||
pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint32_t doublestrand, ecoseq_t *seq,uint32_t *size)
|
||||
pword_t ecoHashSequence(pword_t dest,
|
||||
uint32_t wordsize,
|
||||
uint32_t circular,
|
||||
uint32_t doublestrand,
|
||||
ecoseq_t *seq,
|
||||
uint32_t *size,
|
||||
int32_t *neededWords,
|
||||
uint32_t neededWordCount,
|
||||
int32_t quorum)
|
||||
{
|
||||
uint32_t i=0;
|
||||
uint32_t j;
|
||||
@ -40,6 +40,7 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
|
||||
int32_t error=0;
|
||||
word_t word=0;
|
||||
word_t antiword=0;
|
||||
word_t goodword;
|
||||
uint32_t lmax=0;
|
||||
|
||||
(*size)=0;
|
||||
@ -57,7 +58,9 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
|
||||
|
||||
for (i=0, base = seq->SQ; i < wordsize && i < lmax; i++,base++)
|
||||
{
|
||||
|
||||
error<<= 1;
|
||||
error&=ERRORMASK(wordsize);
|
||||
|
||||
code = encoder[(*base) - 'A'];
|
||||
if (code <0)
|
||||
@ -68,10 +71,22 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
|
||||
|
||||
|
||||
word=RAPPENDBASE(word,wordsize,code);
|
||||
|
||||
if (doublestrand)
|
||||
antiword=LAPPENDBASE(antiword,wordsize,code);
|
||||
|
||||
if (neededWordCount && i>=(FWORDSIZE-1))
|
||||
{
|
||||
|
||||
goodword = (doublestrand) ? MINI(FILTERWORD(word),CFILTERWORD(antiword,wordsize)):FILTERWORD(word);
|
||||
if (neededWords[(uint32_t)goodword]<quorum)
|
||||
error|= (1 << (FWORDSIZE-1));
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
if (!error && i==wordsize)
|
||||
{
|
||||
dest[*size]=(doublestrand) ? MINI(word,antiword):word;
|
||||
@ -85,9 +100,11 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
|
||||
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j));
|
||||
|
||||
/* roll over the sequence for circular ones */
|
||||
|
||||
if (i==(uint32_t)seq->SQ_length) base=seq->SQ;
|
||||
|
||||
error<<= 1;
|
||||
error&=ERRORMASK(wordsize);
|
||||
|
||||
code = encoder[(*base) - 'A'];
|
||||
if (code <0)
|
||||
@ -100,6 +117,17 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
|
||||
if (doublestrand)
|
||||
antiword=LAPPENDBASE(antiword,wordsize,code);
|
||||
|
||||
if (neededWordCount)
|
||||
{
|
||||
goodword = (doublestrand) ? MINI(FILTERWORD(word),CFILTERWORD(antiword,wordsize)):FILTERWORD(word);
|
||||
if (neededWords[(uint32_t)goodword]<quorum)
|
||||
error|= (1 << (FWORDSIZE-1));
|
||||
// else
|
||||
// DEBUG_LOG("%s goodword = %p %d/%d (pos:%d error:%d)",seq->AC,goodword,neededWords[(uint32_t)goodword],quorum,i,error);
|
||||
|
||||
}
|
||||
|
||||
|
||||
if (!error)
|
||||
{
|
||||
dest[*size]=(doublestrand) ? MINI(word,antiword):word;
|
||||
@ -107,7 +135,7 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// DEBUG_LOG("%s goodword = %d",seq->AC,*size);
|
||||
return dest;
|
||||
|
||||
}
|
||||
@ -116,12 +144,16 @@ uint32_t ecoCompactHashSequence(pword_t table,uint32_t size)
|
||||
{
|
||||
uint32_t i,j;
|
||||
word_t current;
|
||||
// bool_t here=FALSE;
|
||||
|
||||
sortword(table,size);
|
||||
|
||||
current = 0;
|
||||
current=SETMULTIWORD(current); /* build impossible word for the first loop cycle */
|
||||
|
||||
// if (strcmp(ecoUnhashWord(table[size-1],18),"GTTTGTTCAACGATTAAA")==0)
|
||||
// here=TRUE;
|
||||
|
||||
for (i=0,j=0; j < size;j++)
|
||||
{
|
||||
if (WORD(table[j])!=current)
|
||||
@ -134,6 +166,9 @@ uint32_t ecoCompactHashSequence(pword_t table,uint32_t size)
|
||||
table[i]=SETMULTIWORD(table[i]);
|
||||
}
|
||||
|
||||
// if (strcmp(ecoUnhashWord(WORD(table[i-1]),18),"TACGACCTCGATGTTGGA")==0)
|
||||
// DEBUG_LOG("winner %d",i)
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
|
@ -41,7 +41,8 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui
|
||||
(void)mergeInit(&merged,data,s1,s2);
|
||||
(void)newQueue(&queue,MINI(s1,s2));
|
||||
|
||||
while (merged.read1 < s1 && merged.read2 < merged.size)
|
||||
|
||||
while (merged.read1 < s1 || merged.read2 < merged.size)
|
||||
{
|
||||
if (! queue.empty)
|
||||
{
|
||||
@ -56,7 +57,8 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui
|
||||
source=S1;
|
||||
}
|
||||
|
||||
if (WORD(currentword) > WORD(merged.words[merged.read2]))
|
||||
if (merged.read2 < merged.size &&
|
||||
WORD(currentword) > WORD(merged.words[merged.read2]))
|
||||
{
|
||||
currentword = merged.words[merged.read2];
|
||||
currentcount = merged.count[merged.read2];
|
||||
@ -114,6 +116,8 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui
|
||||
|
||||
|
||||
if (merged.read2 < merged.size)
|
||||
{
|
||||
//DEBUG_LOG("end1 %d %d/%d %d/%d",merged.write,merged.read1,s1,merged.read2,merged.size);
|
||||
for (;merged.read2 < merged.size;merged.read2++)
|
||||
{
|
||||
merged.words[merged.write]=merged.words[merged.read2];
|
||||
@ -122,7 +126,10 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui
|
||||
merged.write++;
|
||||
|
||||
}
|
||||
else while (! queue.empty)
|
||||
}
|
||||
else {
|
||||
//DEBUG_LOG("end2 %d %d/%d %d/%d",merged.write,merged.read1,s1,merged.read2,merged.size);
|
||||
while (! queue.empty)
|
||||
{
|
||||
// DEBUG_LOG("write : %s count : %d write : %d size : %d pop : %d push : %d empty : %d",ecoUnhashWord(queue.words[queue.pop],18),queue.count[queue.pop],merged.write,queue.size,queue.pop,queue.push,queue.empty)
|
||||
merged.words[merged.write]=queue.words[queue.pop];
|
||||
@ -131,6 +138,7 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui
|
||||
if (remainingSeq + merged.count[merged.write] >= seqQuorum)
|
||||
merged.write++;
|
||||
}
|
||||
}
|
||||
|
||||
data->size = merged.write;
|
||||
|
||||
|
@ -48,7 +48,7 @@ double timeval_subtract (struct timeval *x, struct timeval *y)
|
||||
return (double)result.tv_sec + (double)result.tv_usec/1e6;
|
||||
}
|
||||
|
||||
pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,ecoseq_t *seq)
|
||||
pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount)
|
||||
{
|
||||
uint32_t i;
|
||||
uint32_t buffsize;
|
||||
@ -65,7 +65,7 @@ pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circ
|
||||
|
||||
if (seq)
|
||||
{
|
||||
table->words = ecoHashSequence(NULL,wordsize,circular,doublestrand,seq,&buffsize);
|
||||
table->words = ecoHashSequence(NULL,wordsize,circular,doublestrand,seq,&buffsize,neededWords,neededWordCount,seqQuorum);
|
||||
table->size = ecoCompactHashSequence(table->words,buffsize);
|
||||
|
||||
table->inseqcount=1;
|
||||
@ -79,7 +79,7 @@ pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circ
|
||||
return table;
|
||||
}
|
||||
|
||||
void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t exampleCount,uint32_t seqQuorum,ecoseq_t *seq)
|
||||
void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t exampleCount,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount)
|
||||
{
|
||||
uint32_t buffersize;
|
||||
pword_t newtable;
|
||||
@ -96,7 +96,7 @@ void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circ
|
||||
|
||||
// DEBUG_LOG("Words = %x (%u) new = %x", table->words,table->size,newtable);
|
||||
|
||||
(void)ecoHashSequence(newtable,wordsize,circular,doublestrand,seq,&newsize);
|
||||
(void)ecoHashSequence(newtable,wordsize,circular,doublestrand,seq,&newsize,neededWords,neededWordCount,seqQuorum);
|
||||
// DEBUG_LOG("new seq wordCount : %d",newsize);
|
||||
|
||||
newsize = ecoCompactHashSequence(newtable,newsize);
|
||||
@ -137,6 +137,18 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
|
||||
pwordcount_t strictprimers=NULL;
|
||||
uint64_t totallength=0;
|
||||
uint32_t sequenceQuorum = (uint32_t)floor((float)exampleCount * options->strict_quorum);
|
||||
int32_t *neededWords;
|
||||
uint32_t neededWordCount;
|
||||
|
||||
fprintf(stderr,"Filtering... ");
|
||||
|
||||
if (options->filtering)
|
||||
neededWords = filteringSeq(database,seqdbsize,exampleCount,options,&neededWordCount,(int32_t)sequenceQuorum);
|
||||
else
|
||||
{
|
||||
neededWordCount=0;
|
||||
neededWords=NULL;
|
||||
}
|
||||
|
||||
if (options->statistics)
|
||||
{
|
||||
@ -152,7 +164,8 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
|
||||
strictprimers = initCountTable(NULL,options->primer_length,
|
||||
options->circular,
|
||||
options->doublestrand,
|
||||
NULL);
|
||||
0,
|
||||
NULL,NULL,0);
|
||||
|
||||
|
||||
getrusage(RUSAGE_SELF,&start);
|
||||
@ -167,7 +180,8 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
|
||||
strictprimers = initCountTable(strictprimers,options->primer_length,
|
||||
options->circular,
|
||||
options->doublestrand,
|
||||
database[i]);
|
||||
sequenceQuorum,
|
||||
database[i],neededWords,neededWordCount);
|
||||
first=FALSE;
|
||||
}
|
||||
else
|
||||
@ -180,7 +194,7 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
|
||||
options->doublestrand,
|
||||
exampleCount,
|
||||
sequenceQuorum,
|
||||
database[i]);
|
||||
database[i],neededWords,neededWordCount);
|
||||
};
|
||||
totallength+=database[i]->SQ_length;
|
||||
getrusage(RUSAGE_SELF,&usage);
|
||||
@ -215,6 +229,9 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
|
||||
sizeof(word_t)*strictprimers->size,
|
||||
"Cannot reallocate strict primer table");
|
||||
|
||||
if (neededWords)
|
||||
ECOFREE(neededWords,"Clean needed word table");
|
||||
|
||||
return strictprimers;
|
||||
}
|
||||
|
||||
|
@ -47,7 +47,6 @@ int32_t counttaxon(int32_t taxid)
|
||||
tsearch((void*)((size_t)taxid),&taxontree,cmptaxon);
|
||||
taxoncount++;
|
||||
}
|
||||
|
||||
return taxoncount;
|
||||
}
|
||||
|
||||
@ -60,6 +59,7 @@ int32_t getrankdbstats(pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *tax
|
||||
ecotx_t *tmptaxon;
|
||||
|
||||
counttaxon(-1);
|
||||
options->intaxa = 0;
|
||||
|
||||
for (i=0;i<seqdbsize;i++)
|
||||
{
|
||||
@ -85,6 +85,7 @@ int32_t getrankdbstats(pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *tax
|
||||
}
|
||||
|
||||
counttaxon(-1);
|
||||
options->outtaxa = 0;
|
||||
|
||||
for (i=0;i<seqdbsize;i++)
|
||||
{
|
||||
|
Reference in New Issue
Block a user