2009-03-04 22:32:55 +00:00
|
|
|
/*
|
|
|
|
* hashsequence.c
|
|
|
|
*
|
|
|
|
* Created on: 7 nov. 2008
|
|
|
|
* Author: coissac
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
#include "ecoprimer.h"
|
|
|
|
|
|
|
|
static int cmpword(const void *x,const void *y);
|
|
|
|
|
|
|
|
static int8_t encoder[] = {0, // A
|
|
|
|
-1, // b
|
|
|
|
1, // C
|
|
|
|
-1,-1,-1, // d, e, f
|
|
|
|
2, // G
|
|
|
|
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // h,i,j,k,l,m,n,o,p,q,r,s
|
|
|
|
3,3, // T,U
|
|
|
|
-1,-1,-1,-1,-1}; // v,w,x,y,z
|
|
|
|
|
|
|
|
|
|
|
|
uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq)
|
|
|
|
{
|
|
|
|
uint32_t wordcount;
|
|
|
|
|
|
|
|
wordcount = seq->SQ_length;
|
|
|
|
|
|
|
|
if (!circular) wordcount-=wordsize-1;
|
|
|
|
|
|
|
|
return wordcount;
|
|
|
|
}
|
|
|
|
|
|
|
|
pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint32_t doublestrand, ecoseq_t *seq,uint32_t *size)
|
|
|
|
{
|
|
|
|
uint32_t i=0;
|
|
|
|
uint32_t j;
|
|
|
|
char *base;
|
|
|
|
int8_t code;
|
|
|
|
int32_t error=0;
|
|
|
|
word_t word=0;
|
|
|
|
word_t antiword=0;
|
|
|
|
uint32_t lmax=0;
|
|
|
|
|
|
|
|
(*size)=0;
|
|
|
|
|
|
|
|
lmax = seq->SQ_length;
|
|
|
|
if (!circular)
|
|
|
|
lmax-= wordsize-1;
|
|
|
|
|
|
|
|
if (!dest)
|
|
|
|
dest = ECOMALLOC(lmax*sizeof(word_t),
|
|
|
|
"I cannot allocate memory for sequence hashing"
|
|
|
|
);
|
|
|
|
|
|
|
|
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i));
|
|
|
|
|
|
|
|
for (i=0, base = seq->SQ; i < wordsize && i < lmax; i++,base++)
|
|
|
|
{
|
|
|
|
error<<= 1;
|
|
|
|
|
|
|
|
code = encoder[(*base) - 'A'];
|
|
|
|
if (code <0)
|
|
|
|
{
|
|
|
|
code = 0;
|
|
|
|
error|= 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
word=RAPPENDBASE(word,wordsize,code);
|
|
|
|
if (doublestrand)
|
|
|
|
antiword=LAPPENDBASE(antiword,wordsize,code);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!error && i==wordsize)
|
|
|
|
{
|
|
|
|
dest[*size]=(doublestrand) ? MINI(word,antiword):word;
|
|
|
|
(*size)++;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
for (j=1; j < lmax; j++,i++,base++)
|
|
|
|
{
|
|
|
|
|
|
|
|
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j));
|
|
|
|
|
|
|
|
/* roll over the sequence for circular ones */
|
|
|
|
if (i==(uint32_t)seq->SQ_length) base=seq->SQ;
|
|
|
|
|
|
|
|
error<<= 1;
|
|
|
|
|
|
|
|
code = encoder[(*base) - 'A'];
|
|
|
|
if (code <0)
|
|
|
|
{
|
|
|
|
code = 0;
|
|
|
|
error|= 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
word=RAPPENDBASE(word,wordsize,code);
|
|
|
|
if (doublestrand)
|
|
|
|
antiword=LAPPENDBASE(antiword,wordsize,code);
|
|
|
|
|
|
|
|
if (!error)
|
|
|
|
{
|
|
|
|
dest[*size]=(doublestrand) ? MINI(word,antiword):word;
|
|
|
|
(*size)++;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
return dest;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
uint32_t ecoCompactHashSequence(pword_t table,uint32_t size)
|
|
|
|
{
|
|
|
|
uint32_t i,j;
|
|
|
|
word_t current;
|
|
|
|
|
|
|
|
sortword(table,size);
|
|
|
|
|
|
|
|
current = 0;
|
|
|
|
current=SETMULTIWORD(current); /* build impossible word for the first loop cycle */
|
|
|
|
|
|
|
|
for (i=0,j=0; j < size;j++)
|
|
|
|
{
|
|
|
|
if (table[j]!=current)
|
|
|
|
{
|
|
|
|
current =table[j];
|
|
|
|
table[i]=current;
|
|
|
|
i++;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
table[i]=SETMULTIWORD(table[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
return i;
|
|
|
|
}
|
|
|
|
|
|
|
|
const char* ecoUnhashWord(word_t word,uint32_t size)
|
|
|
|
{
|
|
|
|
static char buffer[32];
|
|
|
|
static char decode[]="ACGT";
|
|
|
|
|
|
|
|
uint32_t i;
|
|
|
|
|
|
|
|
for (i=0; i < size; i++)
|
|
|
|
{
|
|
|
|
buffer[i]=decode[(word >> (2 * (size - 1 -i))) & 3];
|
|
|
|
}
|
|
|
|
|
|
|
|
buffer[size]=0;
|
|
|
|
|
|
|
|
return buffer;
|
|
|
|
}
|
|
|
|
|
|
|
|
word_t ecoComplementWord(word_t word,uint32_t size)
|
|
|
|
{
|
|
|
|
word_t rep=0;
|
|
|
|
uint32_t i;
|
|
|
|
|
|
|
|
// DEBUG_LOG("%llx %llx",word,~word);
|
|
|
|
word=(~word) & WORDMASK(size);
|
|
|
|
for (i=0;i < size; i++)
|
|
|
|
{
|
|
|
|
|
|
|
|
rep = RAPPENDBASE(rep,size,word & 3LLU);
|
|
|
|
// DEBUG_LOG("%016llx %016llx %016llx",word,word & 3LLU,rep);
|
|
|
|
word>>=2;
|
|
|
|
}
|
|
|
|
// DEBUG_LOG("Complemented = %s",ecoUnhashWord(rep,18));
|
|
|
|
return rep;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
static int cmpword(const void *x,const void *y)
|
|
|
|
{
|
|
|
|
word_t w1 = *(pword_t)x;
|
|
|
|
word_t w2 = *(pword_t)y;
|
|
|
|
|
|
|
|
w1 = WORD(w1);
|
|
|
|
w2 = WORD(w2);
|
|
|
|
|
|
|
|
if (w1 < w2)
|
|
|
|
return -1;
|
|
|
|
if (w1 > w2)
|
|
|
|
return +1;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint32_t ecoFindWord(pwordcount_t table,word_t word)
|
|
|
|
{
|
|
|
|
pword_t dest;
|
|
|
|
|
|
|
|
dest = (pword_t)bsearch((const void*)&word,(const void*)table->words,table->size,sizeof(word_t),cmpword);
|
|
|
|
|
|
|
|
if (dest)
|
|
|
|
return dest - table->words;
|
|
|
|
else
|
|
|
|
return ~0;
|
|
|
|
}
|
|
|
|
|
2009-03-20 11:59:55 +00:00
|
|
|
char ecoComplementChar(char base)
|
|
|
|
{
|
|
|
|
switch(base){
|
|
|
|
case 'A': return T;
|
|
|
|
case 'C': return G;
|
|
|
|
case 'G': return C;
|
|
|
|
case 'T': return A;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|