Added some comments
git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@400 60f365c0-8329-0410-b2a4-ec073aeeaa1d
This commit is contained in:
@ -27,6 +27,17 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest,
|
||||
ecoseq_t *seq,
|
||||
uint32_t *size)
|
||||
{
|
||||
|
||||
/*
|
||||
* This function aims at building a vector of count for every possible hash codes
|
||||
*
|
||||
* The function must be applied once on each sequence
|
||||
*
|
||||
* The function allocates memory on the first call for the dest table
|
||||
* The function also allocates memory for the static temporary table in_last_seq and
|
||||
* the function must be called with *dest == -1 in order to free this temporary table
|
||||
*
|
||||
*/
|
||||
static char *in_last_seq=NULL;
|
||||
uint32_t i=0;
|
||||
uint32_t j;
|
||||
@ -40,7 +51,6 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest,
|
||||
|
||||
// run on the first call;
|
||||
|
||||
|
||||
if (dest==(void*)-1)
|
||||
{
|
||||
if (in_last_seq) ECOFREE(in_last_seq,"Free in last seq table");
|
||||
@ -48,14 +58,27 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest,
|
||||
}
|
||||
|
||||
|
||||
/* FWORDSIZE = 13 => *size = 67 108 864
|
||||
* FWORDSIZE = 14 => *size = 268 435 456
|
||||
* FWORDSIZE = 15 => *size = 1 073 741 824
|
||||
*/
|
||||
|
||||
*size = pow(4,FWORDSIZE);
|
||||
|
||||
/*
|
||||
* in_last_seq is a vector of char as it is just to avoid counting twice (or more) a hash code (a DNA word)
|
||||
* it is set to zero (with memset) and then filled with ones for each word belonging to the sequence
|
||||
*/
|
||||
|
||||
if (!in_last_seq)
|
||||
in_last_seq = ECOMALLOC(*size*sizeof(char),
|
||||
"Cannot allocate filtering hash table");
|
||||
|
||||
memset(in_last_seq,0,*size*sizeof(char));
|
||||
|
||||
/*
|
||||
* Allocate (on first call) the memory for the table of counts
|
||||
*/
|
||||
|
||||
if (!dest)
|
||||
{
|
||||
@ -72,6 +95,10 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest,
|
||||
|
||||
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i));
|
||||
|
||||
/*
|
||||
* Compute first word of seq
|
||||
*/
|
||||
|
||||
for (i=0, base = seq->SQ; i < FWORDSIZE && i < lmax; i++,base++)
|
||||
{
|
||||
error<<= 1;
|
||||
@ -95,6 +122,9 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest,
|
||||
|
||||
goodword=(uint32_t)((doublestrand) ? MINI(word,antiword):word);
|
||||
|
||||
/*
|
||||
* FB: I don't think the test is necessary as the table has just been initialized
|
||||
*/
|
||||
if (!in_last_seq[goodword])
|
||||
{
|
||||
in_last_seq[goodword]=1;
|
||||
@ -102,6 +132,9 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest,
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* compute and store counts (avoid counting twice a word) for the other words of the seq
|
||||
*/
|
||||
|
||||
for (j=1; j < lmax; j++,i++,base++)
|
||||
{
|
||||
@ -142,6 +175,7 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest,
|
||||
|
||||
}
|
||||
|
||||
|
||||
return dest;
|
||||
|
||||
}
|
||||
|
@ -33,6 +33,19 @@ pword_t ecoHashSequence(pword_t dest,
|
||||
uint32_t neededWordCount,
|
||||
int32_t quorum)
|
||||
{
|
||||
|
||||
/*
|
||||
* dest / out : words of the hashed sequence position per position
|
||||
* wordsize / in : size of the word to be hashed (record error for that size) BUT not equal to FWORDSIZE ...
|
||||
* ... the size of the word REALLY returned as a result
|
||||
* circular / in : is the sequence circular
|
||||
* doublestrand / in : if we have to hash on both strands of the sequence
|
||||
* seq / in : the sequence in ecoseq format
|
||||
* size / out : number of hashed words (size of the dest vector)
|
||||
* neededWordCount / in : table hash codes of word counts in the full DB (used to filter words)
|
||||
* quorum / in : minimum quorum used to filter words based on the neededWordCount table
|
||||
*/
|
||||
|
||||
uint32_t i=0;
|
||||
uint32_t j;
|
||||
char *base;
|
||||
@ -142,6 +155,12 @@ pword_t ecoHashSequence(pword_t dest,
|
||||
|
||||
uint32_t ecoCompactHashSequence(pword_t table,uint32_t size)
|
||||
{
|
||||
/*
|
||||
*
|
||||
* MULTIWORD is a word occurring more than once in a sequence
|
||||
*
|
||||
*/
|
||||
|
||||
uint32_t i,j;
|
||||
word_t current;
|
||||
// bool_t here=FALSE;
|
||||
|
Reference in New Issue
Block a user