git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@393 60f365c0-8329-0410-b2a4-ec073aeeaa1d
189 lines
3.6 KiB
C
189 lines
3.6 KiB
C
/*
|
|
* filtering.c
|
|
*
|
|
* Created on: 12 mai 2009
|
|
* Author: coissac
|
|
*/
|
|
|
|
#include "ecoprimer.h"
|
|
#include <string.h>
|
|
#include <math.h>
|
|
|
|
#include "hashencoder.h"
|
|
|
|
static int32_t *ecoFilteringHashSequence(int32_t *dest,
|
|
uint32_t circular,
|
|
uint32_t doublestrand,
|
|
ecoseq_t *seq,
|
|
uint32_t *size);
|
|
|
|
|
|
|
|
|
|
|
|
static int32_t *ecoFilteringHashSequence(int32_t *dest,
|
|
uint32_t circular,
|
|
uint32_t doublestrand,
|
|
ecoseq_t *seq,
|
|
uint32_t *size)
|
|
{
|
|
static char *in_last_seq=NULL;
|
|
uint32_t i=0;
|
|
uint32_t j;
|
|
char *base;
|
|
int8_t code;
|
|
int32_t error=0;
|
|
word_t word=0;
|
|
word_t antiword=0;
|
|
uint32_t goodword;
|
|
uint32_t lmax=0;
|
|
|
|
// run on the first call;
|
|
|
|
|
|
if (dest==(void*)-1)
|
|
{
|
|
if (in_last_seq) ECOFREE(in_last_seq,"Free in last seq table");
|
|
return NULL;
|
|
}
|
|
|
|
|
|
*size = pow(4,FWORDSIZE);
|
|
|
|
if (!in_last_seq)
|
|
in_last_seq = ECOMALLOC(*size*sizeof(char),
|
|
"Cannot allocate filtering hash table");
|
|
|
|
memset(in_last_seq,0,*size*sizeof(char));
|
|
|
|
|
|
if (!dest)
|
|
{
|
|
dest = ECOMALLOC(*size*sizeof(int32_t),
|
|
"Cannot allocate filtering hash table");
|
|
memset(dest,0,*size*sizeof(int32_t));
|
|
}
|
|
|
|
lmax = seq->SQ_length;
|
|
if (!circular)
|
|
lmax-= FWORDSIZE-1;
|
|
|
|
|
|
|
|
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i));
|
|
|
|
for (i=0, base = seq->SQ; i < FWORDSIZE && i < lmax; i++,base++)
|
|
{
|
|
error<<= 1;
|
|
error&=ERRORMASK(FWORDSIZE);
|
|
|
|
code = encoder[(*base) - 'A'];
|
|
if (code <0)
|
|
{
|
|
code = 0;
|
|
error|= 1;
|
|
}
|
|
|
|
|
|
word=RAPPENDBASE(word,FWORDSIZE,code);
|
|
if (doublestrand)
|
|
antiword=LAPPENDBASE(antiword,FWORDSIZE,code);
|
|
}
|
|
|
|
if (!error && i==FWORDSIZE)
|
|
{
|
|
|
|
goodword=(uint32_t)((doublestrand) ? MINI(word,antiword):word);
|
|
|
|
if (!in_last_seq[goodword])
|
|
{
|
|
in_last_seq[goodword]=1;
|
|
dest[goodword]++;
|
|
}
|
|
}
|
|
|
|
|
|
for (j=1; j < lmax; j++,i++,base++)
|
|
{
|
|
|
|
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j));
|
|
|
|
/* roll over the sequence for circular ones */
|
|
if (i==(uint32_t)seq->SQ_length) base=seq->SQ;
|
|
|
|
error<<= 1;
|
|
error&=ERRORMASK(FWORDSIZE);
|
|
|
|
//code = -1;
|
|
//if((*base) >= 'A' && (*base) <= 'Z')
|
|
code = encoder[(*base) - 'A'];
|
|
if (code <0)
|
|
{
|
|
code = 0;
|
|
error|= 1;
|
|
}
|
|
|
|
word=RAPPENDBASE(word,FWORDSIZE,code);
|
|
if (doublestrand)
|
|
antiword=LAPPENDBASE(antiword,FWORDSIZE,code);
|
|
|
|
if (!error)
|
|
{
|
|
if (doublestrand)
|
|
goodword=(uint32_t)MINI(word,antiword);
|
|
else
|
|
goodword=word;
|
|
if (!in_last_seq[goodword])
|
|
{
|
|
in_last_seq[goodword]=1;
|
|
dest[goodword]++;
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
return dest;
|
|
|
|
}
|
|
|
|
|
|
int32_t *filteringSeq(pecodnadb_t database, uint32_t seqdbsize,
|
|
uint32_t exampleCount,poptions_t options,uint32_t *size,int32_t sequenceQuorum)
|
|
{
|
|
int32_t *wordscount=NULL;
|
|
int32_t keep=0;
|
|
uint32_t i,j=0;
|
|
|
|
for (i=0;i<seqdbsize;i++)
|
|
{
|
|
if (database[i]->isexample && database[i]->SQ_length > options->primer_length)
|
|
{
|
|
j++;
|
|
wordscount=ecoFilteringHashSequence(wordscount,
|
|
options->circular,
|
|
options->doublestrand,
|
|
database[i],
|
|
size);
|
|
}
|
|
fprintf(stderr," Filtered sequences %5u/%5u \r",j,exampleCount);
|
|
|
|
}
|
|
|
|
fprintf(stderr,"\n");
|
|
|
|
for (i=0;i<*size;i++)
|
|
if (wordscount[i] >= sequenceQuorum)
|
|
keep++;
|
|
|
|
|
|
(void)ecoFilteringHashSequence((int32_t*)-1,
|
|
options->circular,
|
|
options->doublestrand,
|
|
NULL,
|
|
NULL);
|
|
|
|
fprintf(stderr,"ok\n Considered word of size %d for filtering : %d\n",FWORDSIZE,keep);
|
|
return wordscount;
|
|
|
|
}
|