Added some comments

git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@400 60f365c0-8329-0410-b2a4-ec073aeeaa1d
2012-02-29 14:12:17 +00:00
parent 9e36754fef
commit aedee0c154
2 changed files with 57 additions and 4 deletions
--- a/src/libecoprimer/filtering.c
+++ b/src/libecoprimer/filtering.c
@ -27,6 +27,17 @@ static	int32_t *ecoFilteringHashSequence(int32_t *dest,
 										  ecoseq_t *seq,
 										  uint32_t *size)
 {
+
+	/*
+	 * This function aims at building a vector of count for every possible hash codes
+	 *
+	 * The function must be applied once on each sequence
+	 *
+	 * The function allocates memory on the first call for the dest table
+	 * The function also allocates memory for the static temporary table in_last_seq and
+	 * the function must be called with *dest == -1 in order to free this temporary table
+	 *
+	 */
 	static char    *in_last_seq=NULL;
 	uint32_t i=0;
 	uint32_t j;
@ -40,7 +51,6 @@ static	int32_t *ecoFilteringHashSequence(int32_t *dest,

 	// run on the first call;

-
 	if (dest==(void*)-1)
 	{
 		if (in_last_seq) ECOFREE(in_last_seq,"Free in last seq table");
@ -48,14 +58,27 @@ static	int32_t *ecoFilteringHashSequence(int32_t *dest,
 	}


+	/* FWORDSIZE = 13 => *size =    67 108 864
+	 * FWORDSIZE = 14 => *size =   268 435 456
+	 * FWORDSIZE = 15 => *size = 1 073 741 824
+	 */
+
 	*size = pow(4,FWORDSIZE);

+	/*
+	 * in_last_seq is a vector of char as it is just to avoid counting twice (or more) a hash code (a DNA word)
+	 * it is set to zero (with memset) and then filled with ones for each word belonging to the sequence
+	 */
+
 	if (!in_last_seq)
 		in_last_seq = ECOMALLOC(*size*sizeof(char),
 								"Cannot allocate filtering hash table");

 	memset(in_last_seq,0,*size*sizeof(char));

+	/*
+	 * Allocate (on first call) the memory for the table of counts
+	 */

 	if (!dest)
 	{
@ -72,6 +95,10 @@ static	int32_t *ecoFilteringHashSequence(int32_t *dest,

 //	DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i));

+	/*
+	 * Compute first word of seq
+	 */
+
 	for (i=0, base = seq->SQ; i < FWORDSIZE && i < lmax; i++,base++)
 	{
 		error<<= 1;
@ -95,6 +122,9 @@ static	int32_t *ecoFilteringHashSequence(int32_t *dest,

 		goodword=(uint32_t)((doublestrand) ? MINI(word,antiword):word);

+		/*
+		 * FB: I don't think the test is necessary as the table has just been initialized
+		 */
 		if (!in_last_seq[goodword])
 		{
 			in_last_seq[goodword]=1;
@ -102,6 +132,9 @@ static	int32_t *ecoFilteringHashSequence(int32_t *dest,
 		}
 	}

+	/*
+	 * compute and store counts (avoid counting twice a word) for the other words of the seq
+	 */

 	for (j=1; j < lmax; j++,i++,base++)
 	{
@ -142,6 +175,7 @@ static	int32_t *ecoFilteringHashSequence(int32_t *dest,

 	}

+
 	return dest;

 }
--- a/src/libecoprimer/hashsequence.c
+++ b/src/libecoprimer/hashsequence.c
@ -33,6 +33,19 @@ pword_t ecoHashSequence(pword_t dest,
 					    uint32_t neededWordCount,
 					    int32_t quorum)
 {
+
+	/*
+	 * dest            / out    : words of the hashed sequence position per position
+	 * wordsize        / in     : size of the word to be hashed (record error for that size) BUT not equal to FWORDSIZE ...
+	 *                            ... the size of the word REALLY returned as a result
+	 * circular        / in     : is the sequence circular
+	 * doublestrand    / in     : if we have to hash on both strands of the sequence
+	 * seq             / in     : the sequence in ecoseq format
+	 * size            / out    : number of hashed words (size of the dest vector)
+	 * neededWordCount / in     : table hash codes of word counts in the full DB (used to filter words)
+	 * quorum          / in     : minimum quorum used to filter words based on the neededWordCount table
+	 */
+
 	uint32_t i=0;
 	uint32_t j;
 	char *base;
@ -54,7 +67,7 @@ pword_t ecoHashSequence(pword_t dest,
 				         "I cannot allocate memory for sequence hashing"
 				        );

-//	DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i));
+	//DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i));

 	for (i=0, base = seq->SQ; i < wordsize && i < lmax; i++,base++)
 	{
@ -97,7 +110,7 @@ pword_t ecoHashSequence(pword_t dest,
 	for (j=1; j < lmax; j++,i++,base++)
 	{

-//		DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j));
+		//DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j));

 							/* roll over the sequence for circular ones */

@ -135,13 +148,19 @@ pword_t ecoHashSequence(pword_t dest,
 		}

 	}
-	// DEBUG_LOG("%s goodword = %d",seq->AC,*size);
+	//DEBUG_LOG("%s goodword = %d",seq->AC,*size);
 	return dest;

 }

 uint32_t ecoCompactHashSequence(pword_t table,uint32_t size)
 {
+	/*
+	 *
+	 * MULTIWORD is a word occurring more than once in a sequence
+	 *
+	 */
+
 	uint32_t i,j;
 	word_t  current;
 //	bool_t here=FALSE;