New version based on sort them merge algorithm
git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@180 60f365c0-8329-0410-b2a4-ec073aeeaa1d
This commit is contained in:
203
src/libecoprimer/hashsequence.c
Normal file
203
src/libecoprimer/hashsequence.c
Normal file
@ -0,0 +1,203 @@
|
||||
/*
|
||||
* hashsequence.c
|
||||
*
|
||||
* Created on: 7 nov. 2008
|
||||
* Author: coissac
|
||||
*/
|
||||
|
||||
|
||||
#include "ecoprimer.h"
|
||||
|
||||
static int cmpword(const void *x,const void *y);
|
||||
|
||||
static int8_t encoder[] = {0, // A
|
||||
-1, // b
|
||||
1, // C
|
||||
-1,-1,-1, // d, e, f
|
||||
2, // G
|
||||
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // h,i,j,k,l,m,n,o,p,q,r,s
|
||||
3,3, // T,U
|
||||
-1,-1,-1,-1,-1}; // v,w,x,y,z
|
||||
|
||||
|
||||
uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq)
|
||||
{
|
||||
uint32_t wordcount;
|
||||
|
||||
wordcount = seq->SQ_length;
|
||||
|
||||
if (!circular) wordcount-=wordsize-1;
|
||||
|
||||
return wordcount;
|
||||
}
|
||||
|
||||
pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint32_t doublestrand, ecoseq_t *seq,uint32_t *size)
|
||||
{
|
||||
uint32_t i=0;
|
||||
uint32_t j;
|
||||
char *base;
|
||||
int8_t code;
|
||||
int32_t error=0;
|
||||
word_t word=0;
|
||||
word_t antiword=0;
|
||||
uint32_t lmax=0;
|
||||
|
||||
(*size)=0;
|
||||
|
||||
lmax = seq->SQ_length;
|
||||
if (!circular)
|
||||
lmax-= wordsize-1;
|
||||
|
||||
if (!dest)
|
||||
dest = ECOMALLOC(lmax*sizeof(word_t),
|
||||
"I cannot allocate memory for sequence hashing"
|
||||
);
|
||||
|
||||
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i));
|
||||
|
||||
for (i=0, base = seq->SQ; i < wordsize && i < lmax; i++,base++)
|
||||
{
|
||||
error<<= 1;
|
||||
|
||||
code = encoder[(*base) - 'A'];
|
||||
if (code <0)
|
||||
{
|
||||
code = 0;
|
||||
error|= 1;
|
||||
}
|
||||
|
||||
|
||||
word=RAPPENDBASE(word,wordsize,code);
|
||||
if (doublestrand)
|
||||
antiword=LAPPENDBASE(antiword,wordsize,code);
|
||||
}
|
||||
|
||||
if (!error && i==wordsize)
|
||||
{
|
||||
dest[*size]=(doublestrand) ? MINI(word,antiword):word;
|
||||
(*size)++;
|
||||
}
|
||||
|
||||
|
||||
for (j=1; j < lmax; j++,i++,base++)
|
||||
{
|
||||
|
||||
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j));
|
||||
|
||||
/* roll over the sequence for circular ones */
|
||||
if (i==(uint32_t)seq->SQ_length) base=seq->SQ;
|
||||
|
||||
error<<= 1;
|
||||
|
||||
code = encoder[(*base) - 'A'];
|
||||
if (code <0)
|
||||
{
|
||||
code = 0;
|
||||
error|= 1;
|
||||
}
|
||||
|
||||
word=RAPPENDBASE(word,wordsize,code);
|
||||
if (doublestrand)
|
||||
antiword=LAPPENDBASE(antiword,wordsize,code);
|
||||
|
||||
if (!error)
|
||||
{
|
||||
dest[*size]=(doublestrand) ? MINI(word,antiword):word;
|
||||
(*size)++;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return dest;
|
||||
|
||||
}
|
||||
|
||||
uint32_t ecoCompactHashSequence(pword_t table,uint32_t size)
|
||||
{
|
||||
uint32_t i,j;
|
||||
word_t current;
|
||||
|
||||
sortword(table,size);
|
||||
|
||||
current = 0;
|
||||
current=SETMULTIWORD(current); /* build impossible word for the first loop cycle */
|
||||
|
||||
for (i=0,j=0; j < size;j++)
|
||||
{
|
||||
if (table[j]!=current)
|
||||
{
|
||||
current =table[j];
|
||||
table[i]=current;
|
||||
i++;
|
||||
}
|
||||
else
|
||||
table[i]=SETMULTIWORD(table[i]);
|
||||
}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
const char* ecoUnhashWord(word_t word,uint32_t size)
|
||||
{
|
||||
static char buffer[32];
|
||||
static char decode[]="ACGT";
|
||||
|
||||
uint32_t i;
|
||||
|
||||
for (i=0; i < size; i++)
|
||||
{
|
||||
buffer[i]=decode[(word >> (2 * (size - 1 -i))) & 3];
|
||||
}
|
||||
|
||||
buffer[size]=0;
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
word_t ecoComplementWord(word_t word,uint32_t size)
|
||||
{
|
||||
word_t rep=0;
|
||||
uint32_t i;
|
||||
|
||||
// DEBUG_LOG("%llx %llx",word,~word);
|
||||
word=(~word) & WORDMASK(size);
|
||||
for (i=0;i < size; i++)
|
||||
{
|
||||
|
||||
rep = RAPPENDBASE(rep,size,word & 3LLU);
|
||||
// DEBUG_LOG("%016llx %016llx %016llx",word,word & 3LLU,rep);
|
||||
word>>=2;
|
||||
}
|
||||
// DEBUG_LOG("Complemented = %s",ecoUnhashWord(rep,18));
|
||||
return rep;
|
||||
|
||||
}
|
||||
|
||||
static int cmpword(const void *x,const void *y)
|
||||
{
|
||||
word_t w1 = *(pword_t)x;
|
||||
word_t w2 = *(pword_t)y;
|
||||
|
||||
w1 = WORD(w1);
|
||||
w2 = WORD(w2);
|
||||
|
||||
if (w1 < w2)
|
||||
return -1;
|
||||
if (w1 > w2)
|
||||
return +1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint32_t ecoFindWord(pwordcount_t table,word_t word)
|
||||
{
|
||||
pword_t dest;
|
||||
|
||||
dest = (pword_t)bsearch((const void*)&word,(const void*)table->words,table->size,sizeof(word_t),cmpword);
|
||||
|
||||
if (dest)
|
||||
return dest - table->words;
|
||||
else
|
||||
return ~0;
|
||||
}
|
||||
|
Reference in New Issue
Block a user