New version 0.3 with filtering on short words
git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@213 60f365c0-8329-0410-b2a4-ec073aeeaa1d
This commit is contained in:
183
src/libecoprimer/filtering.c
Normal file
183
src/libecoprimer/filtering.c
Normal file
@ -0,0 +1,183 @@
|
||||
/*
|
||||
* filtering.c
|
||||
*
|
||||
* Created on: 12 mai 2009
|
||||
* Author: coissac
|
||||
*/
|
||||
|
||||
#include "ecoprimer.h"
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "hashencoder.h"
|
||||
|
||||
static int32_t *ecoFilteringHashSequence(int32_t *dest,
|
||||
uint32_t circular,
|
||||
uint32_t doublestrand,
|
||||
ecoseq_t *seq,
|
||||
uint32_t *size);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
static int32_t *ecoFilteringHashSequence(int32_t *dest,
|
||||
uint32_t circular,
|
||||
uint32_t doublestrand,
|
||||
ecoseq_t *seq,
|
||||
uint32_t *size)
|
||||
{
|
||||
static char *in_last_seq=NULL;
|
||||
uint32_t i=0;
|
||||
uint32_t j;
|
||||
char *base;
|
||||
int8_t code;
|
||||
int32_t error=0;
|
||||
word_t word=0;
|
||||
word_t antiword=0;
|
||||
uint32_t goodword;
|
||||
uint32_t lmax=0;
|
||||
|
||||
// run on the first call;
|
||||
|
||||
|
||||
if (dest==(void*)-1)
|
||||
{
|
||||
if (in_last_seq) ECOFREE(in_last_seq,"Free in last seq table");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
*size = pow(4,FWORDSIZE);
|
||||
|
||||
if (!in_last_seq)
|
||||
in_last_seq = ECOMALLOC(*size*sizeof(char),
|
||||
"Cannot allocate filtering hash table");
|
||||
|
||||
memset(in_last_seq,0,*size*sizeof(char));
|
||||
|
||||
|
||||
if (!dest)
|
||||
{
|
||||
dest = ECOMALLOC(*size*sizeof(int32_t),
|
||||
"Cannot allocate filtering hash table");
|
||||
memset(dest,0,*size*sizeof(int32_t));
|
||||
}
|
||||
|
||||
lmax = seq->SQ_length;
|
||||
if (!circular)
|
||||
lmax-= FWORDSIZE-1;
|
||||
|
||||
|
||||
|
||||
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i));
|
||||
|
||||
for (i=0, base = seq->SQ; i < FWORDSIZE && i < lmax; i++,base++)
|
||||
{
|
||||
error<<= 1;
|
||||
error&=ERRORMASK(FWORDSIZE);
|
||||
|
||||
code = encoder[(*base) - 'A'];
|
||||
if (code <0)
|
||||
{
|
||||
code = 0;
|
||||
error|= 1;
|
||||
}
|
||||
|
||||
|
||||
word=RAPPENDBASE(word,FWORDSIZE,code);
|
||||
if (doublestrand)
|
||||
antiword=LAPPENDBASE(antiword,FWORDSIZE,code);
|
||||
}
|
||||
|
||||
if (!error && i==FWORDSIZE)
|
||||
{
|
||||
|
||||
goodword=(uint32_t)((doublestrand) ? MINI(word,antiword):word);
|
||||
|
||||
if (!in_last_seq[goodword])
|
||||
{
|
||||
in_last_seq[goodword]=1;
|
||||
dest[goodword]++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
for (j=1; j < lmax; j++,i++,base++)
|
||||
{
|
||||
|
||||
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j));
|
||||
|
||||
/* roll over the sequence for circular ones */
|
||||
if (i==(uint32_t)seq->SQ_length) base=seq->SQ;
|
||||
|
||||
error<<= 1;
|
||||
error&=ERRORMASK(FWORDSIZE);
|
||||
|
||||
code = encoder[(*base) - 'A'];
|
||||
if (code <0)
|
||||
{
|
||||
code = 0;
|
||||
error|= 1;
|
||||
}
|
||||
|
||||
word=RAPPENDBASE(word,FWORDSIZE,code);
|
||||
if (doublestrand)
|
||||
antiword=LAPPENDBASE(antiword,FWORDSIZE,code);
|
||||
|
||||
if (!error)
|
||||
{
|
||||
goodword=(uint32_t)((doublestrand) ? MINI(word,antiword):word);
|
||||
if (!in_last_seq[goodword])
|
||||
{
|
||||
in_last_seq[goodword]=1;
|
||||
dest[goodword]++;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return dest;
|
||||
|
||||
}
|
||||
|
||||
|
||||
int32_t *filteringSeq(pecodnadb_t database, uint32_t seqdbsize,
|
||||
uint32_t exampleCount,poptions_t options,uint32_t *size,int32_t sequenceQuorum)
|
||||
{
|
||||
int32_t *wordscount=NULL;
|
||||
int32_t keep=0;
|
||||
uint32_t i,j=0;
|
||||
|
||||
for (i=0;i<seqdbsize;i++)
|
||||
{
|
||||
if (database[i]->isexample)
|
||||
{
|
||||
j++;
|
||||
wordscount=ecoFilteringHashSequence(wordscount,
|
||||
options->circular,
|
||||
options->doublestrand,
|
||||
database[i],
|
||||
size);
|
||||
}
|
||||
fprintf(stderr," Filtered sequences %5u/%5u \r",j,exampleCount);
|
||||
|
||||
}
|
||||
|
||||
fprintf(stderr,"\n");
|
||||
|
||||
for (i=0;i<*size;i++)
|
||||
if (wordscount[i] >= sequenceQuorum)
|
||||
keep++;
|
||||
|
||||
|
||||
(void)ecoFilteringHashSequence((int32_t*)-1,
|
||||
options->circular,
|
||||
options->doublestrand,
|
||||
NULL,
|
||||
NULL);
|
||||
|
||||
fprintf(stderr,"ok\n Considered word of size %d for filtering : %d\n",FWORDSIZE,keep);
|
||||
return wordscount;
|
||||
|
||||
}
|
Reference in New Issue
Block a user