Files
ecoprimers/src/libecoprimer/filtering.c

189 lines
3.6 KiB
C

/*
* filtering.c
*
* Created on: 12 mai 2009
* Author: coissac
*/
#include "ecoprimer.h"
#include <string.h>
#include <math.h>
#include "hashencoder.h"
static int32_t *ecoFilteringHashSequence(int32_t *dest,
uint32_t circular,
uint32_t doublestrand,
ecoseq_t *seq,
uint32_t *size);
static int32_t *ecoFilteringHashSequence(int32_t *dest,
uint32_t circular,
uint32_t doublestrand,
ecoseq_t *seq,
uint32_t *size)
{
static char *in_last_seq=NULL;
uint32_t i=0;
uint32_t j;
char *base;
int8_t code;
int32_t error=0;
word_t word=0;
word_t antiword=0;
uint32_t goodword;
uint32_t lmax=0;
// run on the first call;
if (dest==(void*)-1)
{
if (in_last_seq) ECOFREE(in_last_seq,"Free in last seq table");
return NULL;
}
*size = pow(4,FWORDSIZE);
if (!in_last_seq)
in_last_seq = ECOMALLOC(*size*sizeof(char),
"Cannot allocate filtering hash table");
memset(in_last_seq,0,*size*sizeof(char));
if (!dest)
{
dest = ECOMALLOC(*size*sizeof(int32_t),
"Cannot allocate filtering hash table");
memset(dest,0,*size*sizeof(int32_t));
}
lmax = seq->SQ_length;
if (!circular)
lmax-= FWORDSIZE-1;
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i));
for (i=0, base = seq->SQ; i < FWORDSIZE && i < lmax; i++,base++)
{
error<<= 1;
error&=ERRORMASK(FWORDSIZE);
code = encoder[(*base) - 'A'];
if (code <0)
{
code = 0;
error|= 1;
}
word=RAPPENDBASE(word,FWORDSIZE,code);
if (doublestrand)
antiword=LAPPENDBASE(antiword,FWORDSIZE,code);
}
if (!error && i==FWORDSIZE)
{
goodword=(uint32_t)((doublestrand) ? MINI(word,antiword):word);
if (!in_last_seq[goodword])
{
in_last_seq[goodword]=1;
dest[goodword]++;
}
}
for (j=1; j < lmax; j++,i++,base++)
{
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j));
/* roll over the sequence for circular ones */
if (i==(uint32_t)seq->SQ_length) base=seq->SQ;
error<<= 1;
error&=ERRORMASK(FWORDSIZE);
//code = -1;
//if((*base) >= 'A' && (*base) <= 'Z')
code = encoder[(*base) - 'A'];
if (code <0)
{
code = 0;
error|= 1;
}
word=RAPPENDBASE(word,FWORDSIZE,code);
if (doublestrand)
antiword=LAPPENDBASE(antiword,FWORDSIZE,code);
if (!error)
{
if (doublestrand)
goodword=(uint32_t)MINI(word,antiword);
else
goodword=word;
if (!in_last_seq[goodword])
{
in_last_seq[goodword]=1;
dest[goodword]++;
}
}
}
return dest;
}
int32_t *filteringSeq(pecodnadb_t database, uint32_t seqdbsize,
uint32_t exampleCount,poptions_t options,uint32_t *size,int32_t sequenceQuorum)
{
int32_t *wordscount=NULL;
int32_t keep=0;
uint32_t i,j=0;
for (i=0;i<seqdbsize;i++)
{
if (database[i]->isexample && database[i]->SQ_length > options->primer_length)
{
j++;
wordscount=ecoFilteringHashSequence(wordscount,
options->circular,
options->doublestrand,
database[i],
size);
}
fprintf(stderr," Filtered sequences %5u/%5u \r",j,exampleCount);
}
fprintf(stderr,"\n");
for (i=0;i<*size;i++)
if (wordscount[i] >= sequenceQuorum)
keep++;
(void)ecoFilteringHashSequence((int32_t*)-1,
options->circular,
options->doublestrand,
NULL,
NULL);
fprintf(stderr,"ok\n Considered word of size %d for filtering : %d\n",FWORDSIZE,keep);
return wordscount;
}