/* * hashsequence.c * * Created on: 7 nov. 2008 * Author: coissac */ #include "ecoprimer.h" static int cmpword(const void *x,const void *y); static int8_t encoder[] = {0, // A -1, // b 1, // C -1,-1,-1, // d, e, f 2, // G -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // h,i,j,k,l,m,n,o,p,q,r,s 3,3, // T,U -1,-1,-1,-1,-1}; // v,w,x,y,z uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq) { uint32_t wordcount; wordcount = seq->SQ_length; if (!circular) wordcount-=wordsize-1; return wordcount; } pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint32_t doublestrand, ecoseq_t *seq,uint32_t *size) { uint32_t i=0; uint32_t j; char *base; int8_t code; int32_t error=0; word_t word=0; word_t antiword=0; uint32_t lmax=0; (*size)=0; lmax = seq->SQ_length; if (!circular) lmax-= wordsize-1; if (!dest) dest = ECOMALLOC(lmax*sizeof(word_t), "I cannot allocate memory for sequence hashing" ); // DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i)); for (i=0, base = seq->SQ; i < wordsize && i < lmax; i++,base++) { error<<= 1; code = encoder[(*base) - 'A']; if (code <0) { code = 0; error|= 1; } word=RAPPENDBASE(word,wordsize,code); if (doublestrand) antiword=LAPPENDBASE(antiword,wordsize,code); } if (!error && i==wordsize) { dest[*size]=(doublestrand) ? MINI(word,antiword):word; (*size)++; } for (j=1; j < lmax; j++,i++,base++) { // DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j)); /* roll over the sequence for circular ones */ if (i==(uint32_t)seq->SQ_length) base=seq->SQ; error<<= 1; code = encoder[(*base) - 'A']; if (code <0) { code = 0; error|= 1; } word=RAPPENDBASE(word,wordsize,code); if (doublestrand) antiword=LAPPENDBASE(antiword,wordsize,code); if (!error) { dest[*size]=(doublestrand) ? MINI(word,antiword):word; (*size)++; } } return dest; } uint32_t ecoCompactHashSequence(pword_t table,uint32_t size) { uint32_t i,j; word_t current; sortword(table,size); current = 0; current=SETMULTIWORD(current); /* build impossible word for the first loop cycle */ for (i=0,j=0; j < size;j++) { if (table[j]!=current) { current =table[j]; table[i]=current; i++; } else table[i]=SETMULTIWORD(table[i]); } return i; } const char* ecoUnhashWord(word_t word,uint32_t size) { static char buffer[32]; static char decode[]="ACGT"; uint32_t i; for (i=0; i < size; i++) { buffer[i]=decode[(word >> (2 * (size - 1 -i))) & 3]; } buffer[size]=0; return buffer; } word_t ecoComplementWord(word_t word,uint32_t size) { word_t rep=0; uint32_t i; // DEBUG_LOG("%llx %llx",word,~word); word=(~word) & WORDMASK(size); for (i=0;i < size; i++) { rep = RAPPENDBASE(rep,size,word & 3LLU); // DEBUG_LOG("%016llx %016llx %016llx",word,word & 3LLU,rep); word>>=2; } // DEBUG_LOG("Complemented = %s",ecoUnhashWord(rep,18)); return rep; } static int cmpword(const void *x,const void *y) { word_t w1 = *(pword_t)x; word_t w2 = *(pword_t)y; w1 = WORD(w1); w2 = WORD(w2); if (w1 < w2) return -1; if (w1 > w2) return +1; return 0; } uint32_t ecoFindWord(pwordcount_t table,word_t word) { pword_t dest; dest = (pword_t)bsearch((const void*)&word,(const void*)table->words,table->size,sizeof(word_t),cmpword); if (dest) return dest - table->words; else return ~0; }