diff --git a/src/libecoprimer/merge.c b/src/libecoprimer/merge.c index a638ca9..c27b6cb 100644 --- a/src/libecoprimer/merge.c +++ b/src/libecoprimer/merge.c @@ -10,7 +10,7 @@ static pmerge_t mergeInit(pmerge_t merge,pwordcount_t data,uint32_t s1,uint32_t s2); -static pmerge_t mergeInit(pmerge_t merge, pwordcount_t data,uint32_t s1,uint32_t s2) +static pmerge_t mergeInit(pmerge_t merge, pwordcount_t data, uint32_t s1, uint32_t s2) { merge->words = data->words; merge->count = data->strictcount; @@ -26,6 +26,15 @@ typedef enum {S1=1,S2=2,STACK=3} source_t; void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,uint32_t seqQuorum) { + + /* + * data / in out : the table that contains the two parts to be merged + * s1 / in : end of the first part of the table + * s2 / in : end of the second part of the table + * remainingSeq / in : the number of remaining seqs to be added to the table + * seqQuorum / in : the minimum number of sequences in which a pattern must appear + */ + merge_t merged; source_t source; word_t currentword,tmpword; @@ -38,12 +47,31 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui // DEBUG_LOG("Coucou %p s1= %d s2= %d",data,s1,s2) + /* + * init the merged structure (used only for coding convenience, never returned, allocated on the C-stack) + * note that : + * merged.words : hashcodes (initialized to data->words) + * merged.count : counts of each word (initialized to data->strictcount) + * merged.read1 : index of the first word of the first subtable (initialized to 0) + * merged.read1 : index of the first word of the first subtable (initialized to 0) + * merged.read2 : index of the first word of the second subtable (initialized to s1) + * merged.size : total size of the table (initialized to s1+s2) + * + * allocate a new stack of size min(s1, s2) + */ + (void)mergeInit(&merged,data,s1,s2); (void)newQueue(&queue,MINI(s1,s2)); + /* true until + * merged.read1 == s1 AND merged.read2 == merged.size, i.e. ALL words have been processed + */ while (merged.read1 < s1 || merged.read2 < merged.size) { + /* + * initialize current{word,count} from either STACK (if not empty) or first table (S1) + */ if (! queue.empty) { currentword = queue.words[queue.pop]; @@ -57,6 +85,12 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui source=S1; } + /* + * IF there are some words in the second subtable remaining to be processed AND + * its first word is lower than current word + * THEN initialize current{word,count} from the second table (S2) + * + */ if (merged.read2 < merged.size && WORD(currentword) > WORD(merged.words[merged.read2])) { @@ -65,11 +99,20 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui source = S2; } + /* + * record if the two words in the both subtable are the same + */ same = (source != S2) && (WORD(currentword) == WORD(merged.words[merged.read2])); nsame+=same; // DEBUG_LOG("Merging : r1 = %d s1 = %d r2 = %d size = %d word = %s source=%u same=%u",merged.read1,s1,merged.read2-s1,merged.size,ecoUnhashWord(currentword,18),source,same) + + /* + * merge step (AND apply the quorum property) + * update merged.read1 AND/OR merged.read2 + * record the word and its count in the table + */ tmpword = merged.words[merged.write]; tmpcount= merged.count[merged.write]; @@ -115,6 +158,12 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui // DEBUG_LOG("r1 : %d r2 : %d qsize : %d nsame : %d tot : %d write : %s count : %d source : %d size : %d pop : %d push : %d empty : %d",merged.read1,merged.read2-s1,qsize,nsame,qsize+nsame,ecoUnhashWord(currentword,18),currentcount,source,queue.size,queue.pop,queue.push,queue.empty) + /* + * finish the merging with words not processed (AND apply the quorum property) + * they are stored in the second subtable (IF) + * OR in the queue (ELSE) + */ + if (merged.read2 < merged.size) { //DEBUG_LOG("end1 %d %d/%d %d/%d",merged.write,merged.read1,s1,merged.read2,merged.size); diff --git a/src/libecoprimer/strictprimers.c b/src/libecoprimer/strictprimers.c index d555720..dfd6211 100644 --- a/src/libecoprimer/strictprimers.c +++ b/src/libecoprimer/strictprimers.c @@ -91,7 +91,9 @@ void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circ table->words = ECOREALLOC(table->words,buffersize*sizeof(word_t), "\n\nCannot allocate memory to extend word table" ); - + /* + * newtable is a pointer on the memory planed to be used for the new sequence (ecoWordCount new hash codes max) + */ newtable = table->words + table->size; // DEBUG_LOG("Words = %x (%u) new = %x", table->words,table->size,newtable); @@ -99,11 +101,24 @@ void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circ (void)ecoHashSequence(newtable,wordsize,circular,doublestrand,seq,&newsize,neededWords,neededWordCount,seqQuorum); // DEBUG_LOG("new seq wordCount : %d",newsize); + /* + * at this stage, new hash codes have been added in the table but the table is not sorted + */ + newsize = ecoCompactHashSequence(newtable,newsize); + /* + * new hash codes have now been sorted BUT the whole table is not. + * MULTIWORDS have been tagged (and compacted) + */ + // DEBUG_LOG("compacted wordCount : %d",newsize); buffersize = table->size + newsize; + /* + * buffersize is now set to the REAL size used by the table (but the memory chunck may be larger) + */ + // resize the count buffer table->inseqcount++; @@ -113,10 +128,13 @@ void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circ "Cannot allocate memory to extend example word count table"); //fprintf (stderr, " NewAddress: %x\n", table->strictcount); + for (i=table->size; i < buffersize; i++) table->strictcount[i]=1; - + /* + * new words in the table are set to a count of ONE + */ // Now we have to merge in situ the two tables