New version 0.3 with filtering on short words
git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@213 60f365c0-8329-0410-b2a4-ec073aeeaa1d
This commit is contained in:
BIN
src/ecoPrimer
Executable file
BIN
src/ecoPrimer
Executable file
Binary file not shown.
@ -14,7 +14,7 @@
|
|||||||
#include <time.h>
|
#include <time.h>
|
||||||
#include <sys/time.h>
|
#include <sys/time.h>
|
||||||
|
|
||||||
#define VERSION "0.2"
|
#define VERSION "0.3"
|
||||||
/* TR: by default, statistics are made on species level*/
|
/* TR: by default, statistics are made on species level*/
|
||||||
#define DEFAULTTAXONRANK "species"
|
#define DEFAULTTAXONRANK "species"
|
||||||
|
|
||||||
@ -98,6 +98,7 @@ static void ExitUsage(int stat)
|
|||||||
void initoptions(poptions_t options)
|
void initoptions(poptions_t options)
|
||||||
{
|
{
|
||||||
options->statistics=FALSE;
|
options->statistics=FALSE;
|
||||||
|
options->filtering=TRUE;
|
||||||
options->lmin=0; //< Amplifia minimal length
|
options->lmin=0; //< Amplifia minimal length
|
||||||
options->lmax=1000; //< Amplifia maximal length
|
options->lmax=1000; //< Amplifia maximal length
|
||||||
options->error_max=3; //**< maximum error count in fuzzy search
|
options->error_max=3; //**< maximum error count in fuzzy search
|
||||||
@ -432,6 +433,7 @@ int main(int argc, char **argv)
|
|||||||
uint32_t i;
|
uint32_t i;
|
||||||
|
|
||||||
pwordcount_t words;
|
pwordcount_t words;
|
||||||
|
// pwordcount_t words2;
|
||||||
pprimercount_t primers;
|
pprimercount_t primers;
|
||||||
ppairtree_t pairs;
|
ppairtree_t pairs;
|
||||||
|
|
||||||
@ -442,7 +444,7 @@ int main(int argc, char **argv)
|
|||||||
|
|
||||||
initoptions(&options);
|
initoptions(&options);
|
||||||
|
|
||||||
while ((carg = getopt(argc, argv, "hvcUDSd:l:L:e:i:r:q:3:s:x:t:O:")) != -1) {
|
while ((carg = getopt(argc, argv, "hfvcUDSd:l:L:e:i:r:q:3:s:x:t:O:")) != -1) {
|
||||||
|
|
||||||
switch (carg) {
|
switch (carg) {
|
||||||
/* ---------------------------- */
|
/* ---------------------------- */
|
||||||
@ -451,6 +453,12 @@ int main(int argc, char **argv)
|
|||||||
options.statistics=TRUE;
|
options.statistics=TRUE;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
/* ---------------------------- */
|
||||||
|
case 'f': /* set in single strand mode */
|
||||||
|
/* ---------------------------- */
|
||||||
|
options.filtering=FALSE;
|
||||||
|
break;
|
||||||
|
|
||||||
/* -------------------- */
|
/* -------------------- */
|
||||||
case 'd': /* database name */
|
case 'd': /* database name */
|
||||||
/* -------------------- */
|
/* -------------------- */
|
||||||
@ -599,12 +607,20 @@ int main(int argc, char **argv)
|
|||||||
|
|
||||||
fprintf(stderr,"\nIndexing words in sequences\n");
|
fprintf(stderr,"\nIndexing words in sequences\n");
|
||||||
|
|
||||||
printcurrenttimeinmilli();
|
|
||||||
words = lookforStrictPrimer(seqdb,seqdbsize,insamples,&options);
|
words = lookforStrictPrimer(seqdb,seqdbsize,insamples,&options);
|
||||||
printcurrenttimeinmilli();
|
|
||||||
|
|
||||||
fprintf(stderr,"\n Strict primer count : %d\n",words->size);
|
fprintf(stderr,"\n Strict primer count : %d\n",words->size);
|
||||||
|
|
||||||
|
// options.filtering=FALSE;
|
||||||
|
// words2= lookforStrictPrimer(seqdb,seqdbsize,insamples,&options);
|
||||||
|
// fprintf(stderr,"\n Strict primer count : %d\n",words2->size);
|
||||||
|
//
|
||||||
|
// fprintf(stderr,"\n\n Primer sample : \n");
|
||||||
|
// for (i=0; i<words->size; i++)
|
||||||
|
// fprintf(stderr," + Primer : %s sequence count : %d\n",ecoUnhashWord(words->words[i],options.primer_length),words->strictcount[i]);
|
||||||
|
// fprintf(stderr,"\n\n Primer sample : \n");
|
||||||
|
// for (i=0; i<words2->size; i++)
|
||||||
|
// fprintf(stderr," + Primer : %s sequence count : %d\n",ecoUnhashWord(words2->words[i],options.primer_length),words2->strictcount[i]);
|
||||||
|
|
||||||
if (options.no_multi_match)
|
if (options.no_multi_match)
|
||||||
{
|
{
|
||||||
(void)filterMultiStrictPrimer(words);
|
(void)filterMultiStrictPrimer(words);
|
||||||
|
@ -17,7 +17,7 @@ void eco_untrace_memory_allocation()
|
|||||||
|
|
||||||
void ecoMallocedMemory()
|
void ecoMallocedMemory()
|
||||||
{
|
{
|
||||||
return eco_amount_malloc;
|
//eco_amount_malloc;
|
||||||
}
|
}
|
||||||
|
|
||||||
void *eco_malloc(int64_t chunksize,
|
void *eco_malloc(int64_t chunksize,
|
||||||
@ -60,7 +60,7 @@ void *eco_realloc(void *chunk,
|
|||||||
if (!newchunk)
|
if (!newchunk)
|
||||||
{
|
{
|
||||||
ecoError(ECO_MEM_ERROR,error_message,filename,line);
|
ecoError(ECO_MEM_ERROR,error_message,filename,line);
|
||||||
fprintf('Requested memory : %d\n',newsize);
|
fprintf(stderr,"Requested memory : %d\n",newsize);
|
||||||
}
|
}
|
||||||
if (!chunk)
|
if (!chunk)
|
||||||
eco_chunk_malloc++;
|
eco_chunk_malloc++;
|
||||||
|
@ -13,7 +13,8 @@ SOURCES = goodtaxon.c \
|
|||||||
pairtree.c \
|
pairtree.c \
|
||||||
pairs.c \
|
pairs.c \
|
||||||
taxstats.c \
|
taxstats.c \
|
||||||
apat_search.c
|
apat_search.c \
|
||||||
|
filtering.c
|
||||||
|
|
||||||
SRCS=$(SOURCES)
|
SRCS=$(SOURCES)
|
||||||
|
|
||||||
|
@ -47,6 +47,7 @@ typedef uint64_t word_t, *pword_t;
|
|||||||
#define WORDMASK(s) ((1LLU << ((s) * 2)) -1)
|
#define WORDMASK(s) ((1LLU << ((s) * 2)) -1)
|
||||||
#define LSHIFTWORD(x,s) (((x) << 2) & WORDMASK(s))
|
#define LSHIFTWORD(x,s) (((x) << 2) & WORDMASK(s))
|
||||||
#define RSHIFTWORD(x,s) (((x) & WORDMASK(s))>> 2)
|
#define RSHIFTWORD(x,s) (((x) & WORDMASK(s))>> 2)
|
||||||
|
#define ERRORMASK(s) ((int32_t)((1LLU << (s)) -1))
|
||||||
|
|
||||||
#define RAPPENDBASE(x,s,c) (LSHIFTWORD((x),(s)) | (word_t)(c))
|
#define RAPPENDBASE(x,s,c) (LSHIFTWORD((x),(s)) | (word_t)(c))
|
||||||
#define LAPPENDBASE(x,s,c) (RSHIFTWORD((x),(s)) | ((word_t)((~(c)) & 3) << (((s)-1) *2)))
|
#define LAPPENDBASE(x,s,c) (RSHIFTWORD((x),(s)) | ((word_t)((~(c)) & 3) << (((s)-1) *2)))
|
||||||
@ -65,6 +66,13 @@ typedef uint64_t word_t, *pword_t;
|
|||||||
#define MINI(x,y) (((x) < (y)) ? (x):(y))
|
#define MINI(x,y) (((x) < (y)) ? (x):(y))
|
||||||
#define MAXI(x,y) (((x) < (y)) ? (y):(x))
|
#define MAXI(x,y) (((x) < (y)) ? (y):(x))
|
||||||
|
|
||||||
|
#define FWORDSIZE (13)
|
||||||
|
#define FWORDMASK WORDMASK(FWORDSIZE)
|
||||||
|
#define FILTERWORD(x) ((uint32_t)((x) & FWORDMASK))
|
||||||
|
#define CFILTERWORD(x,s) ((uint32_t)(((x) >> (((s)-FWORDSIZE)*2)) & FWORDMASK))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
pword_t words;
|
pword_t words;
|
||||||
uint32_t *strictcount;
|
uint32_t *strictcount;
|
||||||
@ -231,6 +239,7 @@ typedef struct {
|
|||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
bool_t statistics;
|
bool_t statistics;
|
||||||
|
bool_t filtering;
|
||||||
uint32_t lmin; //**< Amplifia minimal length
|
uint32_t lmin; //**< Amplifia minimal length
|
||||||
uint32_t lmax; //**< Amplifia maximal length
|
uint32_t lmax; //**< Amplifia maximal length
|
||||||
uint32_t error_max; //**< maximum error count in fuzzy search
|
uint32_t error_max; //**< maximum error count in fuzzy search
|
||||||
@ -270,7 +279,8 @@ pecodnadb_t readdnadb(const char *name, uint32_t *size);
|
|||||||
int isGoodTaxon(ecotaxonomy_t *taxonomy,int32_t taxon,poptions_t options);
|
int isGoodTaxon(ecotaxonomy_t *taxonomy,int32_t taxon,poptions_t options);
|
||||||
|
|
||||||
uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq);
|
uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq);
|
||||||
pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint32_t doublestrand, ecoseq_t *seq,uint32_t *size);
|
pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint32_t doublestrand, ecoseq_t *seq,uint32_t *size,int32_t *neededWords,uint32_t neededWordCount,
|
||||||
|
int32_t quorum);
|
||||||
uint32_t ecoCompactHashSequence(pword_t dest,uint32_t size);
|
uint32_t ecoCompactHashSequence(pword_t dest,uint32_t size);
|
||||||
const char* ecoUnhashWord(word_t word,uint32_t size);
|
const char* ecoUnhashWord(word_t word,uint32_t size);
|
||||||
word_t ecoComplementWord(word_t word,uint32_t size);
|
word_t ecoComplementWord(word_t word,uint32_t size);
|
||||||
@ -278,8 +288,8 @@ uint32_t ecoFindWord(pwordcount_t table,word_t word);
|
|||||||
|
|
||||||
|
|
||||||
void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,uint32_t seqQuorum);
|
void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,uint32_t seqQuorum);
|
||||||
pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,ecoseq_t *seq);
|
pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount);
|
||||||
void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t exampleCount,uint32_t seqQuorum,ecoseq_t *seq);
|
void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t exampleCount,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount);
|
||||||
|
|
||||||
pqueue_t newQueue(pqueue_t queue, uint32_t size);
|
pqueue_t newQueue(pqueue_t queue, uint32_t size);
|
||||||
pqueue_t resizeQueue(pqueue_t queue, uint32_t size);
|
pqueue_t resizeQueue(pqueue_t queue, uint32_t size);
|
||||||
@ -318,4 +328,8 @@ float taxonomycoverage(ppair_t pair, poptions_t options);
|
|||||||
char ecoComplementChar(char base);
|
char ecoComplementChar(char base);
|
||||||
void taxonomyspecificity (ppair_t pair);
|
void taxonomyspecificity (ppair_t pair);
|
||||||
|
|
||||||
|
int32_t *filteringSeq(pecodnadb_t database, uint32_t seqdbsize,
|
||||||
|
uint32_t exampleCount,poptions_t options,uint32_t *size,int32_t sequenceQuorum);
|
||||||
|
|
||||||
|
|
||||||
#endif /* EPSORT_H_ */
|
#endif /* EPSORT_H_ */
|
||||||
|
183
src/libecoprimer/filtering.c
Normal file
183
src/libecoprimer/filtering.c
Normal file
@ -0,0 +1,183 @@
|
|||||||
|
/*
|
||||||
|
* filtering.c
|
||||||
|
*
|
||||||
|
* Created on: 12 mai 2009
|
||||||
|
* Author: coissac
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "ecoprimer.h"
|
||||||
|
#include <string.h>
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
#include "hashencoder.h"
|
||||||
|
|
||||||
|
static int32_t *ecoFilteringHashSequence(int32_t *dest,
|
||||||
|
uint32_t circular,
|
||||||
|
uint32_t doublestrand,
|
||||||
|
ecoseq_t *seq,
|
||||||
|
uint32_t *size);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
static int32_t *ecoFilteringHashSequence(int32_t *dest,
|
||||||
|
uint32_t circular,
|
||||||
|
uint32_t doublestrand,
|
||||||
|
ecoseq_t *seq,
|
||||||
|
uint32_t *size)
|
||||||
|
{
|
||||||
|
static char *in_last_seq=NULL;
|
||||||
|
uint32_t i=0;
|
||||||
|
uint32_t j;
|
||||||
|
char *base;
|
||||||
|
int8_t code;
|
||||||
|
int32_t error=0;
|
||||||
|
word_t word=0;
|
||||||
|
word_t antiword=0;
|
||||||
|
uint32_t goodword;
|
||||||
|
uint32_t lmax=0;
|
||||||
|
|
||||||
|
// run on the first call;
|
||||||
|
|
||||||
|
|
||||||
|
if (dest==(void*)-1)
|
||||||
|
{
|
||||||
|
if (in_last_seq) ECOFREE(in_last_seq,"Free in last seq table");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
*size = pow(4,FWORDSIZE);
|
||||||
|
|
||||||
|
if (!in_last_seq)
|
||||||
|
in_last_seq = ECOMALLOC(*size*sizeof(char),
|
||||||
|
"Cannot allocate filtering hash table");
|
||||||
|
|
||||||
|
memset(in_last_seq,0,*size*sizeof(char));
|
||||||
|
|
||||||
|
|
||||||
|
if (!dest)
|
||||||
|
{
|
||||||
|
dest = ECOMALLOC(*size*sizeof(int32_t),
|
||||||
|
"Cannot allocate filtering hash table");
|
||||||
|
memset(dest,0,*size*sizeof(int32_t));
|
||||||
|
}
|
||||||
|
|
||||||
|
lmax = seq->SQ_length;
|
||||||
|
if (!circular)
|
||||||
|
lmax-= FWORDSIZE-1;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i));
|
||||||
|
|
||||||
|
for (i=0, base = seq->SQ; i < FWORDSIZE && i < lmax; i++,base++)
|
||||||
|
{
|
||||||
|
error<<= 1;
|
||||||
|
error&=ERRORMASK(FWORDSIZE);
|
||||||
|
|
||||||
|
code = encoder[(*base) - 'A'];
|
||||||
|
if (code <0)
|
||||||
|
{
|
||||||
|
code = 0;
|
||||||
|
error|= 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
word=RAPPENDBASE(word,FWORDSIZE,code);
|
||||||
|
if (doublestrand)
|
||||||
|
antiword=LAPPENDBASE(antiword,FWORDSIZE,code);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!error && i==FWORDSIZE)
|
||||||
|
{
|
||||||
|
|
||||||
|
goodword=(uint32_t)((doublestrand) ? MINI(word,antiword):word);
|
||||||
|
|
||||||
|
if (!in_last_seq[goodword])
|
||||||
|
{
|
||||||
|
in_last_seq[goodword]=1;
|
||||||
|
dest[goodword]++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
for (j=1; j < lmax; j++,i++,base++)
|
||||||
|
{
|
||||||
|
|
||||||
|
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j));
|
||||||
|
|
||||||
|
/* roll over the sequence for circular ones */
|
||||||
|
if (i==(uint32_t)seq->SQ_length) base=seq->SQ;
|
||||||
|
|
||||||
|
error<<= 1;
|
||||||
|
error&=ERRORMASK(FWORDSIZE);
|
||||||
|
|
||||||
|
code = encoder[(*base) - 'A'];
|
||||||
|
if (code <0)
|
||||||
|
{
|
||||||
|
code = 0;
|
||||||
|
error|= 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
word=RAPPENDBASE(word,FWORDSIZE,code);
|
||||||
|
if (doublestrand)
|
||||||
|
antiword=LAPPENDBASE(antiword,FWORDSIZE,code);
|
||||||
|
|
||||||
|
if (!error)
|
||||||
|
{
|
||||||
|
goodword=(uint32_t)((doublestrand) ? MINI(word,antiword):word);
|
||||||
|
if (!in_last_seq[goodword])
|
||||||
|
{
|
||||||
|
in_last_seq[goodword]=1;
|
||||||
|
dest[goodword]++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
return dest;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int32_t *filteringSeq(pecodnadb_t database, uint32_t seqdbsize,
|
||||||
|
uint32_t exampleCount,poptions_t options,uint32_t *size,int32_t sequenceQuorum)
|
||||||
|
{
|
||||||
|
int32_t *wordscount=NULL;
|
||||||
|
int32_t keep=0;
|
||||||
|
uint32_t i,j=0;
|
||||||
|
|
||||||
|
for (i=0;i<seqdbsize;i++)
|
||||||
|
{
|
||||||
|
if (database[i]->isexample)
|
||||||
|
{
|
||||||
|
j++;
|
||||||
|
wordscount=ecoFilteringHashSequence(wordscount,
|
||||||
|
options->circular,
|
||||||
|
options->doublestrand,
|
||||||
|
database[i],
|
||||||
|
size);
|
||||||
|
}
|
||||||
|
fprintf(stderr," Filtered sequences %5u/%5u \r",j,exampleCount);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(stderr,"\n");
|
||||||
|
|
||||||
|
for (i=0;i<*size;i++)
|
||||||
|
if (wordscount[i] >= sequenceQuorum)
|
||||||
|
keep++;
|
||||||
|
|
||||||
|
|
||||||
|
(void)ecoFilteringHashSequence((int32_t*)-1,
|
||||||
|
options->circular,
|
||||||
|
options->doublestrand,
|
||||||
|
NULL,
|
||||||
|
NULL);
|
||||||
|
|
||||||
|
fprintf(stderr,"ok\n Considered word of size %d for filtering : %d\n",FWORDSIZE,keep);
|
||||||
|
return wordscount;
|
||||||
|
|
||||||
|
}
|
21
src/libecoprimer/hashencoder.h
Normal file
21
src/libecoprimer/hashencoder.h
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
/*
|
||||||
|
* hashencoder.h
|
||||||
|
*
|
||||||
|
* Created on: 12 mai 2009
|
||||||
|
* Author: coissac
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef HASHENCODER_H_
|
||||||
|
#define HASHENCODER_H_
|
||||||
|
|
||||||
|
static int8_t encoder[] = {0, // A
|
||||||
|
-1, // b
|
||||||
|
1, // C
|
||||||
|
-1,-1,-1, // d, e, f
|
||||||
|
2, // G
|
||||||
|
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // h,i,j,k,l,m,n,o,p,q,r,s
|
||||||
|
3,3, // T,U
|
||||||
|
-1,-1,-1,-1,-1}; // v,w,x,y,z
|
||||||
|
|
||||||
|
|
||||||
|
#endif /* HASHENCODER_H_ */
|
@ -10,15 +10,7 @@
|
|||||||
|
|
||||||
static int cmpword(const void *x,const void *y);
|
static int cmpword(const void *x,const void *y);
|
||||||
|
|
||||||
static int8_t encoder[] = {0, // A
|
#include "hashencoder.h"
|
||||||
-1, // b
|
|
||||||
1, // C
|
|
||||||
-1,-1,-1, // d, e, f
|
|
||||||
2, // G
|
|
||||||
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // h,i,j,k,l,m,n,o,p,q,r,s
|
|
||||||
3,3, // T,U
|
|
||||||
-1,-1,-1,-1,-1}; // v,w,x,y,z
|
|
||||||
|
|
||||||
|
|
||||||
uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq)
|
uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq)
|
||||||
{
|
{
|
||||||
@ -31,7 +23,15 @@ uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq)
|
|||||||
return wordcount;
|
return wordcount;
|
||||||
}
|
}
|
||||||
|
|
||||||
pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint32_t doublestrand, ecoseq_t *seq,uint32_t *size)
|
pword_t ecoHashSequence(pword_t dest,
|
||||||
|
uint32_t wordsize,
|
||||||
|
uint32_t circular,
|
||||||
|
uint32_t doublestrand,
|
||||||
|
ecoseq_t *seq,
|
||||||
|
uint32_t *size,
|
||||||
|
int32_t *neededWords,
|
||||||
|
uint32_t neededWordCount,
|
||||||
|
int32_t quorum)
|
||||||
{
|
{
|
||||||
uint32_t i=0;
|
uint32_t i=0;
|
||||||
uint32_t j;
|
uint32_t j;
|
||||||
@ -40,6 +40,7 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
|
|||||||
int32_t error=0;
|
int32_t error=0;
|
||||||
word_t word=0;
|
word_t word=0;
|
||||||
word_t antiword=0;
|
word_t antiword=0;
|
||||||
|
word_t goodword;
|
||||||
uint32_t lmax=0;
|
uint32_t lmax=0;
|
||||||
|
|
||||||
(*size)=0;
|
(*size)=0;
|
||||||
@ -57,7 +58,9 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
|
|||||||
|
|
||||||
for (i=0, base = seq->SQ; i < wordsize && i < lmax; i++,base++)
|
for (i=0, base = seq->SQ; i < wordsize && i < lmax; i++,base++)
|
||||||
{
|
{
|
||||||
|
|
||||||
error<<= 1;
|
error<<= 1;
|
||||||
|
error&=ERRORMASK(wordsize);
|
||||||
|
|
||||||
code = encoder[(*base) - 'A'];
|
code = encoder[(*base) - 'A'];
|
||||||
if (code <0)
|
if (code <0)
|
||||||
@ -68,10 +71,22 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
|
|||||||
|
|
||||||
|
|
||||||
word=RAPPENDBASE(word,wordsize,code);
|
word=RAPPENDBASE(word,wordsize,code);
|
||||||
|
|
||||||
if (doublestrand)
|
if (doublestrand)
|
||||||
antiword=LAPPENDBASE(antiword,wordsize,code);
|
antiword=LAPPENDBASE(antiword,wordsize,code);
|
||||||
|
|
||||||
|
if (neededWordCount && i>=(FWORDSIZE-1))
|
||||||
|
{
|
||||||
|
|
||||||
|
goodword = (doublestrand) ? MINI(FILTERWORD(word),CFILTERWORD(antiword,wordsize)):FILTERWORD(word);
|
||||||
|
if (neededWords[(uint32_t)goodword]<quorum)
|
||||||
|
error|= (1 << (FWORDSIZE-1));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (!error && i==wordsize)
|
if (!error && i==wordsize)
|
||||||
{
|
{
|
||||||
dest[*size]=(doublestrand) ? MINI(word,antiword):word;
|
dest[*size]=(doublestrand) ? MINI(word,antiword):word;
|
||||||
@ -85,9 +100,11 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
|
|||||||
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j));
|
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j));
|
||||||
|
|
||||||
/* roll over the sequence for circular ones */
|
/* roll over the sequence for circular ones */
|
||||||
|
|
||||||
if (i==(uint32_t)seq->SQ_length) base=seq->SQ;
|
if (i==(uint32_t)seq->SQ_length) base=seq->SQ;
|
||||||
|
|
||||||
error<<= 1;
|
error<<= 1;
|
||||||
|
error&=ERRORMASK(wordsize);
|
||||||
|
|
||||||
code = encoder[(*base) - 'A'];
|
code = encoder[(*base) - 'A'];
|
||||||
if (code <0)
|
if (code <0)
|
||||||
@ -100,6 +117,17 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
|
|||||||
if (doublestrand)
|
if (doublestrand)
|
||||||
antiword=LAPPENDBASE(antiword,wordsize,code);
|
antiword=LAPPENDBASE(antiword,wordsize,code);
|
||||||
|
|
||||||
|
if (neededWordCount)
|
||||||
|
{
|
||||||
|
goodword = (doublestrand) ? MINI(FILTERWORD(word),CFILTERWORD(antiword,wordsize)):FILTERWORD(word);
|
||||||
|
if (neededWords[(uint32_t)goodword]<quorum)
|
||||||
|
error|= (1 << (FWORDSIZE-1));
|
||||||
|
// else
|
||||||
|
// DEBUG_LOG("%s goodword = %p %d/%d (pos:%d error:%d)",seq->AC,goodword,neededWords[(uint32_t)goodword],quorum,i,error);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
if (!error)
|
if (!error)
|
||||||
{
|
{
|
||||||
dest[*size]=(doublestrand) ? MINI(word,antiword):word;
|
dest[*size]=(doublestrand) ? MINI(word,antiword):word;
|
||||||
@ -107,7 +135,7 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
// DEBUG_LOG("%s goodword = %d",seq->AC,*size);
|
||||||
return dest;
|
return dest;
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -116,12 +144,16 @@ uint32_t ecoCompactHashSequence(pword_t table,uint32_t size)
|
|||||||
{
|
{
|
||||||
uint32_t i,j;
|
uint32_t i,j;
|
||||||
word_t current;
|
word_t current;
|
||||||
|
// bool_t here=FALSE;
|
||||||
|
|
||||||
sortword(table,size);
|
sortword(table,size);
|
||||||
|
|
||||||
current = 0;
|
current = 0;
|
||||||
current=SETMULTIWORD(current); /* build impossible word for the first loop cycle */
|
current=SETMULTIWORD(current); /* build impossible word for the first loop cycle */
|
||||||
|
|
||||||
|
// if (strcmp(ecoUnhashWord(table[size-1],18),"GTTTGTTCAACGATTAAA")==0)
|
||||||
|
// here=TRUE;
|
||||||
|
|
||||||
for (i=0,j=0; j < size;j++)
|
for (i=0,j=0; j < size;j++)
|
||||||
{
|
{
|
||||||
if (WORD(table[j])!=current)
|
if (WORD(table[j])!=current)
|
||||||
@ -134,6 +166,9 @@ uint32_t ecoCompactHashSequence(pword_t table,uint32_t size)
|
|||||||
table[i]=SETMULTIWORD(table[i]);
|
table[i]=SETMULTIWORD(table[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// if (strcmp(ecoUnhashWord(WORD(table[i-1]),18),"TACGACCTCGATGTTGGA")==0)
|
||||||
|
// DEBUG_LOG("winner %d",i)
|
||||||
|
|
||||||
return i;
|
return i;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -41,7 +41,8 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui
|
|||||||
(void)mergeInit(&merged,data,s1,s2);
|
(void)mergeInit(&merged,data,s1,s2);
|
||||||
(void)newQueue(&queue,MINI(s1,s2));
|
(void)newQueue(&queue,MINI(s1,s2));
|
||||||
|
|
||||||
while (merged.read1 < s1 && merged.read2 < merged.size)
|
|
||||||
|
while (merged.read1 < s1 || merged.read2 < merged.size)
|
||||||
{
|
{
|
||||||
if (! queue.empty)
|
if (! queue.empty)
|
||||||
{
|
{
|
||||||
@ -56,7 +57,8 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui
|
|||||||
source=S1;
|
source=S1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (WORD(currentword) > WORD(merged.words[merged.read2]))
|
if (merged.read2 < merged.size &&
|
||||||
|
WORD(currentword) > WORD(merged.words[merged.read2]))
|
||||||
{
|
{
|
||||||
currentword = merged.words[merged.read2];
|
currentword = merged.words[merged.read2];
|
||||||
currentcount = merged.count[merged.read2];
|
currentcount = merged.count[merged.read2];
|
||||||
@ -114,6 +116,8 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui
|
|||||||
|
|
||||||
|
|
||||||
if (merged.read2 < merged.size)
|
if (merged.read2 < merged.size)
|
||||||
|
{
|
||||||
|
//DEBUG_LOG("end1 %d %d/%d %d/%d",merged.write,merged.read1,s1,merged.read2,merged.size);
|
||||||
for (;merged.read2 < merged.size;merged.read2++)
|
for (;merged.read2 < merged.size;merged.read2++)
|
||||||
{
|
{
|
||||||
merged.words[merged.write]=merged.words[merged.read2];
|
merged.words[merged.write]=merged.words[merged.read2];
|
||||||
@ -122,7 +126,10 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui
|
|||||||
merged.write++;
|
merged.write++;
|
||||||
|
|
||||||
}
|
}
|
||||||
else while (! queue.empty)
|
}
|
||||||
|
else {
|
||||||
|
//DEBUG_LOG("end2 %d %d/%d %d/%d",merged.write,merged.read1,s1,merged.read2,merged.size);
|
||||||
|
while (! queue.empty)
|
||||||
{
|
{
|
||||||
// DEBUG_LOG("write : %s count : %d write : %d size : %d pop : %d push : %d empty : %d",ecoUnhashWord(queue.words[queue.pop],18),queue.count[queue.pop],merged.write,queue.size,queue.pop,queue.push,queue.empty)
|
// DEBUG_LOG("write : %s count : %d write : %d size : %d pop : %d push : %d empty : %d",ecoUnhashWord(queue.words[queue.pop],18),queue.count[queue.pop],merged.write,queue.size,queue.pop,queue.push,queue.empty)
|
||||||
merged.words[merged.write]=queue.words[queue.pop];
|
merged.words[merged.write]=queue.words[queue.pop];
|
||||||
@ -131,6 +138,7 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui
|
|||||||
if (remainingSeq + merged.count[merged.write] >= seqQuorum)
|
if (remainingSeq + merged.count[merged.write] >= seqQuorum)
|
||||||
merged.write++;
|
merged.write++;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
data->size = merged.write;
|
data->size = merged.write;
|
||||||
|
|
||||||
|
@ -48,7 +48,7 @@ double timeval_subtract (struct timeval *x, struct timeval *y)
|
|||||||
return (double)result.tv_sec + (double)result.tv_usec/1e6;
|
return (double)result.tv_sec + (double)result.tv_usec/1e6;
|
||||||
}
|
}
|
||||||
|
|
||||||
pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,ecoseq_t *seq)
|
pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount)
|
||||||
{
|
{
|
||||||
uint32_t i;
|
uint32_t i;
|
||||||
uint32_t buffsize;
|
uint32_t buffsize;
|
||||||
@ -65,7 +65,7 @@ pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circ
|
|||||||
|
|
||||||
if (seq)
|
if (seq)
|
||||||
{
|
{
|
||||||
table->words = ecoHashSequence(NULL,wordsize,circular,doublestrand,seq,&buffsize);
|
table->words = ecoHashSequence(NULL,wordsize,circular,doublestrand,seq,&buffsize,neededWords,neededWordCount,seqQuorum);
|
||||||
table->size = ecoCompactHashSequence(table->words,buffsize);
|
table->size = ecoCompactHashSequence(table->words,buffsize);
|
||||||
|
|
||||||
table->inseqcount=1;
|
table->inseqcount=1;
|
||||||
@ -79,7 +79,7 @@ pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circ
|
|||||||
return table;
|
return table;
|
||||||
}
|
}
|
||||||
|
|
||||||
void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t exampleCount,uint32_t seqQuorum,ecoseq_t *seq)
|
void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t exampleCount,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount)
|
||||||
{
|
{
|
||||||
uint32_t buffersize;
|
uint32_t buffersize;
|
||||||
pword_t newtable;
|
pword_t newtable;
|
||||||
@ -96,7 +96,7 @@ void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circ
|
|||||||
|
|
||||||
// DEBUG_LOG("Words = %x (%u) new = %x", table->words,table->size,newtable);
|
// DEBUG_LOG("Words = %x (%u) new = %x", table->words,table->size,newtable);
|
||||||
|
|
||||||
(void)ecoHashSequence(newtable,wordsize,circular,doublestrand,seq,&newsize);
|
(void)ecoHashSequence(newtable,wordsize,circular,doublestrand,seq,&newsize,neededWords,neededWordCount,seqQuorum);
|
||||||
// DEBUG_LOG("new seq wordCount : %d",newsize);
|
// DEBUG_LOG("new seq wordCount : %d",newsize);
|
||||||
|
|
||||||
newsize = ecoCompactHashSequence(newtable,newsize);
|
newsize = ecoCompactHashSequence(newtable,newsize);
|
||||||
@ -137,6 +137,18 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
|
|||||||
pwordcount_t strictprimers=NULL;
|
pwordcount_t strictprimers=NULL;
|
||||||
uint64_t totallength=0;
|
uint64_t totallength=0;
|
||||||
uint32_t sequenceQuorum = (uint32_t)floor((float)exampleCount * options->strict_quorum);
|
uint32_t sequenceQuorum = (uint32_t)floor((float)exampleCount * options->strict_quorum);
|
||||||
|
int32_t *neededWords;
|
||||||
|
uint32_t neededWordCount;
|
||||||
|
|
||||||
|
fprintf(stderr,"Filtering... ");
|
||||||
|
|
||||||
|
if (options->filtering)
|
||||||
|
neededWords = filteringSeq(database,seqdbsize,exampleCount,options,&neededWordCount,(int32_t)sequenceQuorum);
|
||||||
|
else
|
||||||
|
{
|
||||||
|
neededWordCount=0;
|
||||||
|
neededWords=NULL;
|
||||||
|
}
|
||||||
|
|
||||||
if (options->statistics)
|
if (options->statistics)
|
||||||
{
|
{
|
||||||
@ -152,7 +164,8 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
|
|||||||
strictprimers = initCountTable(NULL,options->primer_length,
|
strictprimers = initCountTable(NULL,options->primer_length,
|
||||||
options->circular,
|
options->circular,
|
||||||
options->doublestrand,
|
options->doublestrand,
|
||||||
NULL);
|
0,
|
||||||
|
NULL,NULL,0);
|
||||||
|
|
||||||
|
|
||||||
getrusage(RUSAGE_SELF,&start);
|
getrusage(RUSAGE_SELF,&start);
|
||||||
@ -167,7 +180,8 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
|
|||||||
strictprimers = initCountTable(strictprimers,options->primer_length,
|
strictprimers = initCountTable(strictprimers,options->primer_length,
|
||||||
options->circular,
|
options->circular,
|
||||||
options->doublestrand,
|
options->doublestrand,
|
||||||
database[i]);
|
sequenceQuorum,
|
||||||
|
database[i],neededWords,neededWordCount);
|
||||||
first=FALSE;
|
first=FALSE;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -180,7 +194,7 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
|
|||||||
options->doublestrand,
|
options->doublestrand,
|
||||||
exampleCount,
|
exampleCount,
|
||||||
sequenceQuorum,
|
sequenceQuorum,
|
||||||
database[i]);
|
database[i],neededWords,neededWordCount);
|
||||||
};
|
};
|
||||||
totallength+=database[i]->SQ_length;
|
totallength+=database[i]->SQ_length;
|
||||||
getrusage(RUSAGE_SELF,&usage);
|
getrusage(RUSAGE_SELF,&usage);
|
||||||
@ -215,6 +229,9 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
|
|||||||
sizeof(word_t)*strictprimers->size,
|
sizeof(word_t)*strictprimers->size,
|
||||||
"Cannot reallocate strict primer table");
|
"Cannot reallocate strict primer table");
|
||||||
|
|
||||||
|
if (neededWords)
|
||||||
|
ECOFREE(neededWords,"Clean needed word table");
|
||||||
|
|
||||||
return strictprimers;
|
return strictprimers;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -47,7 +47,6 @@ int32_t counttaxon(int32_t taxid)
|
|||||||
tsearch((void*)((size_t)taxid),&taxontree,cmptaxon);
|
tsearch((void*)((size_t)taxid),&taxontree,cmptaxon);
|
||||||
taxoncount++;
|
taxoncount++;
|
||||||
}
|
}
|
||||||
|
|
||||||
return taxoncount;
|
return taxoncount;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -60,6 +59,7 @@ int32_t getrankdbstats(pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *tax
|
|||||||
ecotx_t *tmptaxon;
|
ecotx_t *tmptaxon;
|
||||||
|
|
||||||
counttaxon(-1);
|
counttaxon(-1);
|
||||||
|
options->intaxa = 0;
|
||||||
|
|
||||||
for (i=0;i<seqdbsize;i++)
|
for (i=0;i<seqdbsize;i++)
|
||||||
{
|
{
|
||||||
@ -85,6 +85,7 @@ int32_t getrankdbstats(pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *tax
|
|||||||
}
|
}
|
||||||
|
|
||||||
counttaxon(-1);
|
counttaxon(-1);
|
||||||
|
options->outtaxa = 0;
|
||||||
|
|
||||||
for (i=0;i<seqdbsize;i++)
|
for (i=0;i<seqdbsize;i++)
|
||||||
{
|
{
|
||||||
|
Reference in New Issue
Block a user