Added some comments

git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@400 60f365c0-8329-0410-b2a4-ec073aeeaa1d
This commit is contained in:
Frédéric Boyer
2012-02-29 14:12:17 +00:00
parent 9e36754fef
commit aedee0c154
2 changed files with 57 additions and 4 deletions

View File

@ -27,6 +27,17 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest,
ecoseq_t *seq,
uint32_t *size)
{
/*
* This function aims at building a vector of count for every possible hash codes
*
* The function must be applied once on each sequence
*
* The function allocates memory on the first call for the dest table
* The function also allocates memory for the static temporary table in_last_seq and
* the function must be called with *dest == -1 in order to free this temporary table
*
*/
static char *in_last_seq=NULL;
uint32_t i=0;
uint32_t j;
@ -40,7 +51,6 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest,
// run on the first call;
if (dest==(void*)-1)
{
if (in_last_seq) ECOFREE(in_last_seq,"Free in last seq table");
@ -48,14 +58,27 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest,
}
/* FWORDSIZE = 13 => *size = 67 108 864
* FWORDSIZE = 14 => *size = 268 435 456
* FWORDSIZE = 15 => *size = 1 073 741 824
*/
*size = pow(4,FWORDSIZE);
/*
* in_last_seq is a vector of char as it is just to avoid counting twice (or more) a hash code (a DNA word)
* it is set to zero (with memset) and then filled with ones for each word belonging to the sequence
*/
if (!in_last_seq)
in_last_seq = ECOMALLOC(*size*sizeof(char),
"Cannot allocate filtering hash table");
memset(in_last_seq,0,*size*sizeof(char));
/*
* Allocate (on first call) the memory for the table of counts
*/
if (!dest)
{
@ -72,6 +95,10 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest,
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i));
/*
* Compute first word of seq
*/
for (i=0, base = seq->SQ; i < FWORDSIZE && i < lmax; i++,base++)
{
error<<= 1;
@ -95,6 +122,9 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest,
goodword=(uint32_t)((doublestrand) ? MINI(word,antiword):word);
/*
* FB: I don't think the test is necessary as the table has just been initialized
*/
if (!in_last_seq[goodword])
{
in_last_seq[goodword]=1;
@ -102,6 +132,9 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest,
}
}
/*
* compute and store counts (avoid counting twice a word) for the other words of the seq
*/
for (j=1; j < lmax; j++,i++,base++)
{
@ -142,6 +175,7 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest,
}
return dest;
}

View File

@ -33,6 +33,19 @@ pword_t ecoHashSequence(pword_t dest,
uint32_t neededWordCount,
int32_t quorum)
{
/*
* dest / out : words of the hashed sequence position per position
* wordsize / in : size of the word to be hashed (record error for that size) BUT not equal to FWORDSIZE ...
* ... the size of the word REALLY returned as a result
* circular / in : is the sequence circular
* doublestrand / in : if we have to hash on both strands of the sequence
* seq / in : the sequence in ecoseq format
* size / out : number of hashed words (size of the dest vector)
* neededWordCount / in : table hash codes of word counts in the full DB (used to filter words)
* quorum / in : minimum quorum used to filter words based on the neededWordCount table
*/
uint32_t i=0;
uint32_t j;
char *base;
@ -54,7 +67,7 @@ pword_t ecoHashSequence(pword_t dest,
"I cannot allocate memory for sequence hashing"
);
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i));
//DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i));
for (i=0, base = seq->SQ; i < wordsize && i < lmax; i++,base++)
{
@ -97,7 +110,7 @@ pword_t ecoHashSequence(pword_t dest,
for (j=1; j < lmax; j++,i++,base++)
{
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j));
//DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j));
/* roll over the sequence for circular ones */
@ -135,13 +148,19 @@ pword_t ecoHashSequence(pword_t dest,
}
}
// DEBUG_LOG("%s goodword = %d",seq->AC,*size);
//DEBUG_LOG("%s goodword = %d",seq->AC,*size);
return dest;
}
uint32_t ecoCompactHashSequence(pword_t table,uint32_t size)
{
/*
*
* MULTIWORD is a word occurring more than once in a sequence
*
*/
uint32_t i,j;
word_t current;
// bool_t here=FALSE;