/* * filtering.c * * Created on: 12 mai 2009 * Author: coissac */ #include "ecoprimer.h" #include #include #include "hashencoder.h" static int32_t *ecoFilteringHashSequence(int32_t *dest, uint32_t circular, uint32_t doublestrand, ecoseq_t *seq, uint32_t *size); static int32_t *ecoFilteringHashSequence(int32_t *dest, uint32_t circular, uint32_t doublestrand, ecoseq_t *seq, uint32_t *size) { static char *in_last_seq=NULL; uint32_t i=0; uint32_t j; char *base; int8_t code; int32_t error=0; word_t word=0; word_t antiword=0; uint32_t goodword; uint32_t lmax=0; // run on the first call; if (dest==(void*)-1) { if (in_last_seq) ECOFREE(in_last_seq,"Free in last seq table"); return NULL; } *size = pow(4,FWORDSIZE); if (!in_last_seq) in_last_seq = ECOMALLOC(*size*sizeof(char), "Cannot allocate filtering hash table"); memset(in_last_seq,0,*size*sizeof(char)); if (!dest) { dest = ECOMALLOC(*size*sizeof(int32_t), "Cannot allocate filtering hash table"); memset(dest,0,*size*sizeof(int32_t)); } lmax = seq->SQ_length; if (!circular) lmax-= FWORDSIZE-1; // DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i)); for (i=0, base = seq->SQ; i < FWORDSIZE && i < lmax; i++,base++) { error<<= 1; error&=ERRORMASK(FWORDSIZE); code = encoder[(*base) - 'A']; if (code <0) { code = 0; error|= 1; } word=RAPPENDBASE(word,FWORDSIZE,code); if (doublestrand) antiword=LAPPENDBASE(antiword,FWORDSIZE,code); } if (!error && i==FWORDSIZE) { goodword=(uint32_t)((doublestrand) ? MINI(word,antiword):word); if (!in_last_seq[goodword]) { in_last_seq[goodword]=1; dest[goodword]++; } } for (j=1; j < lmax; j++,i++,base++) { // DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j)); /* roll over the sequence for circular ones */ if (i==(uint32_t)seq->SQ_length) base=seq->SQ; error<<= 1; error&=ERRORMASK(FWORDSIZE); //code = -1; //if((*base) >= 'A' && (*base) <= 'Z') code = encoder[(*base) - 'A']; if (code <0) { code = 0; error|= 1; } word=RAPPENDBASE(word,FWORDSIZE,code); if (doublestrand) antiword=LAPPENDBASE(antiword,FWORDSIZE,code); if (!error) { if (doublestrand) goodword=(uint32_t)MINI(word,antiword); else goodword=word; if (!in_last_seq[goodword]) { in_last_seq[goodword]=1; dest[goodword]++; } } } return dest; } int32_t *filteringSeq(pecodnadb_t database, uint32_t seqdbsize, uint32_t exampleCount,poptions_t options,uint32_t *size,int32_t sequenceQuorum) { int32_t *wordscount=NULL; int32_t keep=0; uint32_t i,j=0; for (i=0;iisexample && database[i]->SQ_length > options->primer_length) { j++; wordscount=ecoFilteringHashSequence(wordscount, options->circular, options->doublestrand, database[i], size); } fprintf(stderr," Filtered sequences %5u/%5u \r",j,exampleCount); } fprintf(stderr,"\n"); for (i=0;i<*size;i++) if (wordscount[i] >= sequenceQuorum) keep++; (void)ecoFilteringHashSequence((int32_t*)-1, options->circular, options->doublestrand, NULL, NULL); fprintf(stderr,"ok\n Considered word of size %d for filtering : %d\n",FWORDSIZE,keep); return wordscount; }