From aedee0c1547e2f5d42417fd611fcbd68ea2a3f38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Boyer?= Date: Wed, 29 Feb 2012 14:12:17 +0000 Subject: [PATCH] Added some comments git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@400 60f365c0-8329-0410-b2a4-ec073aeeaa1d --- src/libecoprimer/filtering.c | 36 ++++++++++++++++++++++++++++++++- src/libecoprimer/hashsequence.c | 25 ++++++++++++++++++++--- 2 files changed, 57 insertions(+), 4 deletions(-) diff --git a/src/libecoprimer/filtering.c b/src/libecoprimer/filtering.c index 24e1387..534ac7f 100644 --- a/src/libecoprimer/filtering.c +++ b/src/libecoprimer/filtering.c @@ -27,6 +27,17 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest, ecoseq_t *seq, uint32_t *size) { + + /* + * This function aims at building a vector of count for every possible hash codes + * + * The function must be applied once on each sequence + * + * The function allocates memory on the first call for the dest table + * The function also allocates memory for the static temporary table in_last_seq and + * the function must be called with *dest == -1 in order to free this temporary table + * + */ static char *in_last_seq=NULL; uint32_t i=0; uint32_t j; @@ -40,7 +51,6 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest, // run on the first call; - if (dest==(void*)-1) { if (in_last_seq) ECOFREE(in_last_seq,"Free in last seq table"); @@ -48,14 +58,27 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest, } + /* FWORDSIZE = 13 => *size = 67 108 864 + * FWORDSIZE = 14 => *size = 268 435 456 + * FWORDSIZE = 15 => *size = 1 073 741 824 + */ + *size = pow(4,FWORDSIZE); + /* + * in_last_seq is a vector of char as it is just to avoid counting twice (or more) a hash code (a DNA word) + * it is set to zero (with memset) and then filled with ones for each word belonging to the sequence + */ + if (!in_last_seq) in_last_seq = ECOMALLOC(*size*sizeof(char), "Cannot allocate filtering hash table"); memset(in_last_seq,0,*size*sizeof(char)); + /* + * Allocate (on first call) the memory for the table of counts + */ if (!dest) { @@ -72,6 +95,10 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest, // DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i)); + /* + * Compute first word of seq + */ + for (i=0, base = seq->SQ; i < FWORDSIZE && i < lmax; i++,base++) { error<<= 1; @@ -95,6 +122,9 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest, goodword=(uint32_t)((doublestrand) ? MINI(word,antiword):word); + /* + * FB: I don't think the test is necessary as the table has just been initialized + */ if (!in_last_seq[goodword]) { in_last_seq[goodword]=1; @@ -102,6 +132,9 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest, } } + /* + * compute and store counts (avoid counting twice a word) for the other words of the seq + */ for (j=1; j < lmax; j++,i++,base++) { @@ -142,6 +175,7 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest, } + return dest; } diff --git a/src/libecoprimer/hashsequence.c b/src/libecoprimer/hashsequence.c index 2caea48..e8f1a9a 100644 --- a/src/libecoprimer/hashsequence.c +++ b/src/libecoprimer/hashsequence.c @@ -33,6 +33,19 @@ pword_t ecoHashSequence(pword_t dest, uint32_t neededWordCount, int32_t quorum) { + + /* + * dest / out : words of the hashed sequence position per position + * wordsize / in : size of the word to be hashed (record error for that size) BUT not equal to FWORDSIZE ... + * ... the size of the word REALLY returned as a result + * circular / in : is the sequence circular + * doublestrand / in : if we have to hash on both strands of the sequence + * seq / in : the sequence in ecoseq format + * size / out : number of hashed words (size of the dest vector) + * neededWordCount / in : table hash codes of word counts in the full DB (used to filter words) + * quorum / in : minimum quorum used to filter words based on the neededWordCount table + */ + uint32_t i=0; uint32_t j; char *base; @@ -54,7 +67,7 @@ pword_t ecoHashSequence(pword_t dest, "I cannot allocate memory for sequence hashing" ); -// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i)); + //DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i)); for (i=0, base = seq->SQ; i < wordsize && i < lmax; i++,base++) { @@ -97,7 +110,7 @@ pword_t ecoHashSequence(pword_t dest, for (j=1; j < lmax; j++,i++,base++) { -// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j)); + //DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j)); /* roll over the sequence for circular ones */ @@ -135,13 +148,19 @@ pword_t ecoHashSequence(pword_t dest, } } - // DEBUG_LOG("%s goodword = %d",seq->AC,*size); + //DEBUG_LOG("%s goodword = %d",seq->AC,*size); return dest; } uint32_t ecoCompactHashSequence(pword_t table,uint32_t size) { + /* + * + * MULTIWORD is a word occurring more than once in a sequence + * + */ + uint32_t i,j; word_t current; // bool_t here=FALSE;