ecoprimers/src/libecoprimer/merge.c

/*
 * merge.c
 *
 *  Created on: 11 nov. 2008
 *      Author: coissac
 */

#include "ecoprimer.h"

/*
 * Forward declaration of the file-local helper that prepares a merge_t
 * cursor (read / write indices and table pointers) before a merge.
 */
static pmerge_t mergeInit(pmerge_t merge,pwordcount_t data,uint32_t s1,uint32_t s2);


/*
 * mergeInit : prepare a merge cursor over the word table held by data.
 *
 * merge / out : cursor to initialise; it borrows data->words and
 *               data->strictcount (nothing is copied or allocated here)
 * data  / in  : table whose words / strictcount arrays will be merged
 * s1    / in  : number of entries in the first subtable
 * s2    / in  : number of entries in the second subtable
 *
 * Returns the merge pointer it was given.
 */
static pmerge_t mergeInit(pmerge_t merge, pwordcount_t data, uint32_t s1, uint32_t s2)
{
	const uint32_t total = s1 + s2;

	/* the cursor works directly on the arrays owned by data */
	merge->words = data->words;
	merge->count = data->strictcount;

	/* the first subtable is read from 0, the second from s1; output starts at 0 */
	merge->read1 = 0;
	merge->read2 = s1;
	merge->write = 0;

	merge->size  = total;

	return merge;
}


/*
 * Origin of the entry selected at one step of the merge performed by ecomerge.
 */
typedef enum {
	S1    = 1,   /* entry read in place from the first subtable          */
	S2    = 2,   /* entry read from the second subtable                   */
	STACK = 3    /* entry read back from the queue of saved entries       */
} source_t;

/*
 * ecomerge : merge, in place, the two adjacent subtables of a word-count
 *            table and drop the entries that can no longer reach the quorum.
 *
 * data          / in out : the table that contains the two parts to be merged;
 *                          data->words and data->strictcount are rewritten and
 *                          data->size is set to the number of entries kept
 * s1            / in     : number of entries in the first part  (indices [0, s1))
 * s2            / in     : number of entries in the second part (indices [s1, s1+s2))
 * remainingSeq  / in     : the number of remaining seqs to be added to the table
 * seqQuorum     / in     : the minimum number of sequences in which a pattern must appear
 *
 * Entries are compared through WORD(). At each step the smaller of the two
 * current heads is emitted, so both parts are presumably already ordered by
 * WORD() on entry -- TODO confirm against the callers.
 *
 * When the same word is at the head of both parts, a single entry is emitted
 * whose count is the sum of the two counts; it is flagged with SETMULTIWORD
 * if either of the two entries satisfies ISMULTIWORD.
 *
 * An emitted entry is kept only if remainingSeq + count >= seqQuorum;
 * otherwise its slot is reused by the next emitted entry.
 */
void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,uint32_t seqQuorum)
{
	merge_t   merged;
	source_t  source;
	word_t    currentword,tmpword;
	uint32_t  currentcount,tmpcount;
	int       same;
	queue_t   queue;
	bool_t    writed=TRUE;     /* TRUE when the previous step kept its entry */

	/*
	 * init the merged structure (used only for coding convenience, never returned, allocated on the C-stack)
	 * note that :
	 *     merged.words : hashcodes           (initialized to data->words)
	 *     merged.count : counts of each word (initialized to data->strictcount)
	 *     merged.write : index of the next output slot                   (initialized to 0)
	 *     merged.read1 : index of the first word of the first subtable   (initialized to 0)
	 *     merged.read2 : index of the first word of the second subtable  (initialized to s1)
	 *     merged.size  : total size of the table (initialized to s1+s2)
	 *
	 * create a queue sized for min(s1, s2) entries; it receives the entries of
	 * the first subtable that are overwritten by the output before being read
	 */

	(void)mergeInit(&merged,data,s1,s2);
	(void)newQueue(&queue,MINI(s1,s2));


	/* true until
	 * merged.read1 == s1 AND merged.read2 == merged.size, i.e. ALL words have been processed
	 */
	while (merged.read1 < s1 || merged.read2 < merged.size)
	{
		/*
		 * initialize current{word,count} from the queue (if not empty), else
		 * from the first subtable (if some of its words remain), else from the
		 * second subtable.
		 *
		 * The last case matters: once the first subtable is exhausted,
		 * merged.read1 no longer designates one of its entries, so
		 * merged.words[merged.read1] must not be read as an S1 entry.
		 */
		if (! queue.empty)
		{
			currentword  = queue.words[queue.pop];
			currentcount = queue.count[queue.pop];
			source=STACK;
		}
		else if (merged.read1 < s1)
		{
			currentword  = merged.words[merged.read1];
			currentcount = merged.count[merged.read1];
			source=S1;
		}
		else
		{
			/* the loop condition guarantees merged.read2 < merged.size here */
			currentword  = merged.words[merged.read2];
			currentcount = merged.count[merged.read2];
			source=S2;
		}

		/*
		 * IF there are some words in the second subtable remaining to be processed AND
		 *    its first word is lower than current word
		 * THEN initialize current{word,count} from the second table (S2)
		 */
		if (source != S2 &&
				merged.read2 < merged.size &&
				WORD(currentword) > WORD(merged.words[merged.read2]))
		{
			currentword  = merged.words[merged.read2];
			currentcount = merged.count[merged.read2];
			source  = S2;
		}

		/*
		 * record if the current word is also at the head of the second subtable;
		 * the bound check prevents reading past the end of the second subtable
		 * once it is exhausted
		 */
		same = (source != S2) &&
			   (merged.read2 < merged.size) &&
			   (WORD(currentword) == WORD(merged.words[merged.read2]));

		/*
		 * merge step (AND apply the quorum property)
		 * update merged.read1 AND/OR merged.read2
		 * record the word and its count in the table
		 */

		/* save what is about to be overwritten: it may be an unread S1 entry */
		tmpword = merged.words[merged.write];
		tmpcount= merged.count[merged.write];

		merged.words[merged.write] = currentword;
		merged.count[merged.write] = currentcount;

		if (source != S2)
		{
			if (same)
			{
				/* same word in both subtables: emit it once with the summed count */
				merged.count[merged.write]+=merged.count[merged.read2];

				if (ISMULTIWORD(currentword) || ISMULTIWORD(merged.words[merged.read2]))
					merged.words[merged.write]=SETMULTIWORD(currentword);

				merged.read2++;
			}

			if (source==STACK)
				pop(&queue);
			merged.read1++;
		}
		else
			merged.read2++;

		/*
		 * the overwritten slot held a not yet consumed entry of the first
		 * subtable: keep it in the queue. When the previous step discarded its
		 * entry (writed == FALSE) the slot only held that discarded entry.
		 */
		if (writed && merged.read1 <= merged.write && merged.write < s1)
			push(&queue,tmpword,tmpcount);

		/* quorum: advance the output index only if the entry can still reach seqQuorum */
		writed = remainingSeq + merged.count[merged.write] >= seqQuorum;
		if (writed)
			merged.write++;

	}  /* while loop */


	/*
	 * finish the merging with words not processed (AND apply the quorum property)
	 * they are stored in the second subtable (IF)
	 *              OR in the queue           (ELSE)
	 *
	 * NOTE(review): the loop above only stops once both subtables are
	 * consumed, so normally nothing is left to do here; kept as a safety net.
	 */

	if (merged.read2 < merged.size)
	{
		for (;merged.read2 < merged.size;merged.read2++)
		{
			merged.words[merged.write]=merged.words[merged.read2];
			merged.count[merged.write]=merged.count[merged.read2];
			if (remainingSeq + merged.count[merged.write] >= seqQuorum)
				merged.write++;
		}
	}
	else
	{
		while (! queue.empty)
		{
			merged.words[merged.write]=queue.words[queue.pop];
			merged.count[merged.write]=queue.count[queue.pop];
			pop(&queue);
			if (remainingSeq + merged.count[merged.write] >= seqQuorum)
				merged.write++;
		}
	}

	/* the table now holds only the entries that were kept */
	data->size = merged.write;

	cleanQueue(&queue);
}
New version based on sort them merge algorithm git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@180 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2009-03-04 22:32:55 +00:00			`/*`
			`* merge.c`
			`*`
			`* Created on: 11 nov. 2008`
			`* Author: coissac`
			`*/`

			`#include "ecoprimer.h"`

			`static pmerge_t mergeInit(pmerge_t merge,pwordcount_t data,uint32_t s1,uint32_t s2);`


Added some comments git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@399 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2012-02-27 14:57:03 +00:00			`static pmerge_t mergeInit(pmerge_t merge, pwordcount_t data, uint32_t s1, uint32_t s2)`
New version based on sort them merge algorithm git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@180 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2009-03-04 22:32:55 +00:00			`{`
			`merge->words = data->words;`
			`merge->count = data->strictcount;`
			`merge->write = 0;`
			`merge->read1 = 0;`
			`merge->read2 = s1;`
			`merge->size = s1+s2;`
			`return merge;`
			`}`


			`typedef enum {S1=1,S2=2,STACK=3} source_t;`

			`void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,uint32_t seqQuorum)`
			`{`
Added some comments git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@399 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2012-02-27 14:57:03 +00:00
			`/*`
			`* data / in out : the table that contains the two parts to be merged`
			`* s1 / in : end of the first part of the table`
			`* s2 / in : end of the second part of the table`
			`* remainingSeq / in : the number of remaining seqs to be added to the table`
			`* seqQuorum / in : the minimum number of sequences in which a pattern must appear`
			`*/`

New version based on sort them merge algorithm git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@180 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2009-03-04 22:32:55 +00:00			`merge_t merged;`
			`source_t source;`
			`word_t currentword,tmpword;`
			`uint32_t currentcount,tmpcount;`
			`int same;`
			`queue_t queue;`
			`int nsame=0;`
			`uint32_t maxcount=0;`
			`bool_t writed=TRUE;`

			`// DEBUG_LOG("Coucou %p s1= %d s2= %d",data,s1,s2)`

Added some comments git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@399 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2012-02-27 14:57:03 +00:00			`/*`
			`* init the merged structure (used only for coding convenience, never returned, allocated on the C-stack)`
			`* note that :`
			`* merged.words : hashcodes (initialized to data->words)`
			`* merged.count : counts of each word (initialized to data->strictcount)`
			`* merged.read1 : index of the first word of the first subtable (initialized to 0)`
			`* merged.read1 : index of the first word of the first subtable (initialized to 0)`
			`* merged.read2 : index of the first word of the second subtable (initialized to s1)`
			`* merged.size : total size of the table (initialized to s1+s2)`
			`*`
			`* allocate a new stack of size min(s1, s2)`
			`*/`

New version based on sort them merge algorithm git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@180 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2009-03-04 22:32:55 +00:00			`(void)mergeInit(&merged,data,s1,s2);`
			`(void)newQueue(&queue,MINI(s1,s2));`

New version 0.3 with filtering on short words git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@213 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2009-05-13 06:51:25 +00:00
Added some comments git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@399 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2012-02-27 14:57:03 +00:00			`/* true until`
			`* merged.read1 == s1 AND merged.read2 == merged.size, i.e. ALL words have been processed`
			`*/`
New version 0.3 with filtering on short words git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@213 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2009-05-13 06:51:25 +00:00			`while (merged.read1 < s1 \|\| merged.read2 < merged.size)`
New version based on sort them merge algorithm git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@180 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2009-03-04 22:32:55 +00:00			`{`
Added some comments git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@399 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2012-02-27 14:57:03 +00:00			`/*`
			`* initialize current{word,count} from either STACK (if not empty) or first table (S1)`
			`*/`
New version based on sort them merge algorithm git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@180 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2009-03-04 22:32:55 +00:00			`if (! queue.empty)`
			`{`
			`currentword = queue.words[queue.pop];`
			`currentcount = queue.count[queue.pop];`
			`source=STACK;`
			`}`
			`else`
			`{`
			`currentword = merged.words[merged.read1];`
			`currentcount = merged.count[merged.read1];`
			`source=S1;`
			`}`

Added some comments git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@399 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2012-02-27 14:57:03 +00:00			`/*`
			`* IF there are some words in the second subtable remaining to be processed AND`
			`* its first word is lower than current word`
			`* THEN initialize current{word,count} from the second table (S2)`
			`*`
			`*/`
New version 0.3 with filtering on short words git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@213 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2009-05-13 06:51:25 +00:00			`if (merged.read2 < merged.size &&`
			`WORD(currentword) > WORD(merged.words[merged.read2]))`
New version based on sort them merge algorithm git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@180 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2009-03-04 22:32:55 +00:00			`{`
			`currentword = merged.words[merged.read2];`
			`currentcount = merged.count[merged.read2];`
			`source = S2;`
			`}`

Added some comments git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@399 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2012-02-27 14:57:03 +00:00			`/*`
			`* record if the two words in the both subtable are the same`
			`*/`
New version based on sort them merge algorithm git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@180 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2009-03-04 22:32:55 +00:00			`same = (source != S2) && (WORD(currentword) == WORD(merged.words[merged.read2]));`
			`nsame+=same;`

			`// DEBUG_LOG("Merging : r1 = %d s1 = %d r2 = %d size = %d word = %s source=%u same=%u",merged.read1,s1,merged.read2-s1,merged.size,ecoUnhashWord(currentword,18),source,same)`

Added some comments git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@399 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2012-02-27 14:57:03 +00:00
			`/*`
			`* merge step (AND apply the quorum property)`
			`* update merged.read1 AND/OR merged.read2`
			`* record the word and its count in the table`
			`*/`
New version based on sort them merge algorithm git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@180 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2009-03-04 22:32:55 +00:00			`tmpword = merged.words[merged.write];`
			`tmpcount= merged.count[merged.write];`

			`merged.words[merged.write] = currentword;`
			`merged.count[merged.write] = currentcount;`

			`if (source != S2)`
			`{`
			`if (same)`
			`{`
			`merged.count[merged.write]+=merged.count[merged.read2];`

			`if (ISMULTIWORD(currentword) \|\| ISMULTIWORD(merged.words[merged.read2]))`
			`merged.words[merged.write]=SETMULTIWORD(currentword);`

			`merged.read2++;`
			`}`

			`if (source==STACK)`
			`pop(&queue);`
			`merged.read1++;`
			`}`
			`else`
			`merged.read2++;`

			`if (writed && merged.read1 <= merged.write && merged.write < s1)`
			`push(&queue,tmpword,tmpcount);`

			`if (merged.count[merged.write] > maxcount)`
			`maxcount=merged.count[merged.write];`

			`writed = remainingSeq + merged.count[merged.write] >= seqQuorum;`
			`if (writed)`
			`merged.write++;`


			`// else`
			`// DEBUG_LOG("Remove word : %s count : %d remainingSeq : %d total : %d Quorum : %d",`
			`// ecoUnhashWord(currentword,18),merged.count[merged.write],remainingSeq,maxcount+remainingSeq,seqQuorum);`

			`} /* while loop */`

			`// DEBUG_LOG("r1 : %d r2 : %d qsize : %d nsame : %d tot : %d write : %s count : %d source : %d size : %d pop : %d push : %d empty : %d",merged.read1,merged.read2-s1,qsize,nsame,qsize+nsame,ecoUnhashWord(currentword,18),currentcount,source,queue.size,queue.pop,queue.push,queue.empty)`


Added some comments git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@399 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2012-02-27 14:57:03 +00:00			`/*`
			`* finish the merging with words not processed (AND apply the quorum property)`
			`* they are stored in the second subtable (IF)`
			`* OR in the queue (ELSE)`
			`*/`

New version based on sort them merge algorithm git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@180 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2009-03-04 22:32:55 +00:00			`if (merged.read2 < merged.size)`
New version 0.3 with filtering on short words git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@213 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2009-05-13 06:51:25 +00:00			`{`
			`//DEBUG_LOG("end1 %d %d/%d %d/%d",merged.write,merged.read1,s1,merged.read2,merged.size);`
New version based on sort them merge algorithm git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@180 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2009-03-04 22:32:55 +00:00			`for (;merged.read2 < merged.size;merged.read2++)`
			`{`
			`merged.words[merged.write]=merged.words[merged.read2];`
			`merged.count[merged.write]=merged.count[merged.read2];`
			`if (remainingSeq + merged.count[merged.write] >= seqQuorum)`
			`merged.write++;`

			`}`
New version 0.3 with filtering on short words git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@213 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2009-05-13 06:51:25 +00:00			`}`
			`else {`
			`//DEBUG_LOG("end2 %d %d/%d %d/%d",merged.write,merged.read1,s1,merged.read2,merged.size);`
			`while (! queue.empty)`
New version based on sort them merge algorithm git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@180 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2009-03-04 22:32:55 +00:00			`{`
			`// DEBUG_LOG("write : %s count : %d write : %d size : %d pop : %d push : %d empty : %d",ecoUnhashWord(queue.words[queue.pop],18),queue.count[queue.pop],merged.write,queue.size,queue.pop,queue.push,queue.empty)`
			`merged.words[merged.write]=queue.words[queue.pop];`
			`merged.count[merged.write]=queue.count[queue.pop];`
			`pop(&queue);`
			`if (remainingSeq + merged.count[merged.write] >= seqQuorum)`
			`merged.write++;`
			`}`
New version 0.3 with filtering on short words git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@213 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2009-05-13 06:51:25 +00:00			`}`
New version based on sort them merge algorithm git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@180 60f365c0-8329-0410-b2a4-ec073aeeaa1d 2009-03-04 22:32:55 +00:00
			`data->size = merged.write;`

			`cleanQueue(&queue);`

			`// DEBUG_LOG("Max count : %d remainingSeq : %d total : %d Quorum : %d",maxcount,remainingSeq,maxcount+remainingSeq,seqQuorum)`
			`// DEBUG_LOG("Second word : %s",ecoUnhashWord(data->words[1],18))`
			`// DEBUG_LOG("Last word : %s",ecoUnhashWord(data->words[data->size-1],18))`


			`}`