Added some comments

git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@400 60f365c0-8329-0410-b2a4-ec073aeeaa1d
This commit is contained in:
Frédéric Boyer
2012-02-29 14:12:17 +00:00
parent 9e36754fef
commit aedee0c154
2 changed files with 57 additions and 4 deletions

View File

@ -27,6 +27,17 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest,
ecoseq_t *seq, ecoseq_t *seq,
uint32_t *size) uint32_t *size)
{ {
/*
* This function aims at building a vector of count for every possible hash codes
*
* The function must be applied once on each sequence
*
* The function allocates memory on the first call for the dest table
* The function also allocates memory for the static temporary table in_last_seq and
* the function must be called with *dest == -1 in order to free this temporary table
*
*/
static char *in_last_seq=NULL; static char *in_last_seq=NULL;
uint32_t i=0; uint32_t i=0;
uint32_t j; uint32_t j;
@ -40,7 +51,6 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest,
// run on the first call; // run on the first call;
if (dest==(void*)-1) if (dest==(void*)-1)
{ {
if (in_last_seq) ECOFREE(in_last_seq,"Free in last seq table"); if (in_last_seq) ECOFREE(in_last_seq,"Free in last seq table");
@ -48,14 +58,27 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest,
} }
/* FWORDSIZE = 13 => *size = 67 108 864
* FWORDSIZE = 14 => *size = 268 435 456
* FWORDSIZE = 15 => *size = 1 073 741 824
*/
*size = pow(4,FWORDSIZE); *size = pow(4,FWORDSIZE);
/*
* in_last_seq is a vector of char as it is just to avoid counting twice (or more) a hash code (a DNA word)
* it is set to zero (with memset) and then filled with ones for each word belonging to the sequence
*/
if (!in_last_seq) if (!in_last_seq)
in_last_seq = ECOMALLOC(*size*sizeof(char), in_last_seq = ECOMALLOC(*size*sizeof(char),
"Cannot allocate filtering hash table"); "Cannot allocate filtering hash table");
memset(in_last_seq,0,*size*sizeof(char)); memset(in_last_seq,0,*size*sizeof(char));
/*
* Allocate (on first call) the memory for the table of counts
*/
if (!dest) if (!dest)
{ {
@ -72,6 +95,10 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest,
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i)); // DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i));
/*
* Compute first word of seq
*/
for (i=0, base = seq->SQ; i < FWORDSIZE && i < lmax; i++,base++) for (i=0, base = seq->SQ; i < FWORDSIZE && i < lmax; i++,base++)
{ {
error<<= 1; error<<= 1;
@ -95,6 +122,9 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest,
goodword=(uint32_t)((doublestrand) ? MINI(word,antiword):word); goodword=(uint32_t)((doublestrand) ? MINI(word,antiword):word);
/*
* FB: I don't think the test is necessary as the table has just been initialized
*/
if (!in_last_seq[goodword]) if (!in_last_seq[goodword])
{ {
in_last_seq[goodword]=1; in_last_seq[goodword]=1;
@ -102,6 +132,9 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest,
} }
} }
/*
* compute and store counts (avoid counting twice a word) for the other words of the seq
*/
for (j=1; j < lmax; j++,i++,base++) for (j=1; j < lmax; j++,i++,base++)
{ {
@ -142,6 +175,7 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest,
} }
return dest; return dest;
} }

View File

@ -33,6 +33,19 @@ pword_t ecoHashSequence(pword_t dest,
uint32_t neededWordCount, uint32_t neededWordCount,
int32_t quorum) int32_t quorum)
{ {
/*
* dest / out : words of the hashed sequence position per position
* wordsize / in : size of the word to be hashed (record error for that size) BUT not equal to FWORDSIZE ...
* ... the size of the word REALLY returned as a result
* circular / in : is the sequence circular
* doublestrand / in : if we have to hash on both strands of the sequence
* seq / in : the sequence in ecoseq format
* size / out : number of hashed words (size of the dest vector)
* neededWordCount / in : table hash codes of word counts in the full DB (used to filter words)
* quorum / in : minimum quorum used to filter words based on the neededWordCount table
*/
uint32_t i=0; uint32_t i=0;
uint32_t j; uint32_t j;
char *base; char *base;
@ -54,7 +67,7 @@ pword_t ecoHashSequence(pword_t dest,
"I cannot allocate memory for sequence hashing" "I cannot allocate memory for sequence hashing"
); );
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i)); //DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i));
for (i=0, base = seq->SQ; i < wordsize && i < lmax; i++,base++) for (i=0, base = seq->SQ; i < wordsize && i < lmax; i++,base++)
{ {
@ -97,7 +110,7 @@ pword_t ecoHashSequence(pword_t dest,
for (j=1; j < lmax; j++,i++,base++) for (j=1; j < lmax; j++,i++,base++)
{ {
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j)); //DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j));
/* roll over the sequence for circular ones */ /* roll over the sequence for circular ones */
@ -135,13 +148,19 @@ pword_t ecoHashSequence(pword_t dest,
} }
} }
// DEBUG_LOG("%s goodword = %d",seq->AC,*size); //DEBUG_LOG("%s goodword = %d",seq->AC,*size);
return dest; return dest;
} }
uint32_t ecoCompactHashSequence(pword_t table,uint32_t size) uint32_t ecoCompactHashSequence(pword_t table,uint32_t size)
{ {
/*
*
* MULTIWORD is a word occurring more than once in a sequence
*
*/
uint32_t i,j; uint32_t i,j;
word_t current; word_t current;
// bool_t here=FALSE; // bool_t here=FALSE;