Files
ecoprimers/src/libecoprimer/hashsequence.c

214 lines
4.0 KiB
C

/*
* hashsequence.c
*
* Created on: 7 nov. 2008
* Author: coissac
*/
#include "ecoprimer.h"
static int cmpword(const void *x,const void *y);
static int8_t encoder[] = {0, // A
-1, // b
1, // C
-1,-1,-1, // d, e, f
2, // G
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // h,i,j,k,l,m,n,o,p,q,r,s
3,3, // T,U
-1,-1,-1,-1,-1}; // v,w,x,y,z
uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq)
{
uint32_t wordcount;
wordcount = seq->SQ_length;
if (!circular) wordcount-=wordsize-1;
return wordcount;
}
pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint32_t doublestrand, ecoseq_t *seq,uint32_t *size)
{
uint32_t i=0;
uint32_t j;
char *base;
int8_t code;
int32_t error=0;
word_t word=0;
word_t antiword=0;
uint32_t lmax=0;
(*size)=0;
lmax = seq->SQ_length;
if (!circular)
lmax-= wordsize-1;
if (!dest)
dest = ECOMALLOC(lmax*sizeof(word_t),
"I cannot allocate memory for sequence hashing"
);
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i));
for (i=0, base = seq->SQ; i < wordsize && i < lmax; i++,base++)
{
error<<= 1;
code = encoder[(*base) - 'A'];
if (code <0)
{
code = 0;
error|= 1;
}
word=RAPPENDBASE(word,wordsize,code);
if (doublestrand)
antiword=LAPPENDBASE(antiword,wordsize,code);
}
if (!error && i==wordsize)
{
dest[*size]=(doublestrand) ? MINI(word,antiword):word;
(*size)++;
}
for (j=1; j < lmax; j++,i++,base++)
{
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j));
/* roll over the sequence for circular ones */
if (i==(uint32_t)seq->SQ_length) base=seq->SQ;
error<<= 1;
code = encoder[(*base) - 'A'];
if (code <0)
{
code = 0;
error|= 1;
}
word=RAPPENDBASE(word,wordsize,code);
if (doublestrand)
antiword=LAPPENDBASE(antiword,wordsize,code);
if (!error)
{
dest[*size]=(doublestrand) ? MINI(word,antiword):word;
(*size)++;
}
}
return dest;
}
uint32_t ecoCompactHashSequence(pword_t table,uint32_t size)
{
uint32_t i,j;
word_t current;
sortword(table,size);
current = 0;
current=SETMULTIWORD(current); /* build impossible word for the first loop cycle */
for (i=0,j=0; j < size;j++)
{
if (table[j]!=current)
{
current =table[j];
table[i]=current;
i++;
}
else
table[i]=SETMULTIWORD(table[i]);
}
return i;
}
const char* ecoUnhashWord(word_t word,uint32_t size)
{
static char buffer[32];
static char decode[]="ACGT";
uint32_t i;
for (i=0; i < size; i++)
{
buffer[i]=decode[(word >> (2 * (size - 1 -i))) & 3];
}
buffer[size]=0;
return buffer;
}
word_t ecoComplementWord(word_t word,uint32_t size)
{
word_t rep=0;
uint32_t i;
// DEBUG_LOG("%llx %llx",word,~word);
word=(~word) & WORDMASK(size);
for (i=0;i < size; i++)
{
rep = RAPPENDBASE(rep,size,word & 3LLU);
// DEBUG_LOG("%016llx %016llx %016llx",word,word & 3LLU,rep);
word>>=2;
}
// DEBUG_LOG("Complemented = %s",ecoUnhashWord(rep,18));
return rep;
}
static int cmpword(const void *x,const void *y)
{
word_t w1 = *(pword_t)x;
word_t w2 = *(pword_t)y;
w1 = WORD(w1);
w2 = WORD(w2);
if (w1 < w2)
return -1;
if (w1 > w2)
return +1;
return 0;
}
uint32_t ecoFindWord(pwordcount_t table,word_t word)
{
pword_t dest;
dest = (pword_t)bsearch((const void*)&word,(const void*)table->words,table->size,sizeof(word_t),cmpword);
if (dest)
return dest - table->words;
else
return ~0;
}
char ecoComplementChar(char base)
{
switch(base){
case 'A': return T;
case 'C': return G;
case 'G': return C;
case 'T': return A;
}
}