Files
ecoprimers/src/libecoprimer/ahocorasick.c

479 lines
16 KiB
C
Executable File

/*
* ahocorasick.h
*
* Created on: 26 march 2011
* Author: tiayyba
*/
#include <inttypes.h>
#include "hashencoder.h"
#include "ahocorasick.h"
void ahoc_graphKeywordTree (aho_state *root);
aho_state *groot = NULL; //just for graph testing
#define BASEATINDEX(w, l, i) (uint8_t)((((w)&(0x3LLU<<(((l)-(i))*2)))>>(((l)-(i))*2)) & 0x3LLU)
void ahoc_addOutputElement (aho_state *node, bool_t isdirect, uint32_t idx)
{
if (!node) return;
if (node->output.count == 0)
node->output.out_set = ECOMALLOC(sizeof(aho_output),
"Cannot allocate memory for aho-corasick state output element");
else
node->output.out_set = ECOREALLOC(node->output.out_set, (node->output.count+1)*sizeof(aho_output),
"Cannot allocate memory for aho-corasick state output element");
node->output.out_set[node->output.count].wordidx = idx;
node->output.out_set[node->output.count].isdirect = isdirect;
node->output.count++;
}
//is the passed output element in the set
bool_t ahoc_isOutputIn (aho_state *node, aho_output ot)
{
uint32_t i;
for (i=0; i<node->output.count; i++)
if (node->output.out_set[i].isdirect == ot.isdirect && node->output.out_set[i].wordidx == ot.wordidx) return TRUE;
return FALSE;
}
//take union of output of the two nodes and put in node1
void ahoc_unionOutputElements (aho_state *node1, aho_state *node2)
{
uint32_t i;
for (i=0; i<node2->output.count; i++)
if (ahoc_isOutputIn (node1, node2->output.out_set[i]) == FALSE)
ahoc_addOutputElement (node1, node2->output.out_set[i].isdirect, node2->output.out_set[i].wordidx);
}
void ahoc_addKeyword (aho_state *root, word_t w, bool_t isdirect, uint32_t idx, poptions_t options)
{
uint32_t i;
aho_state *nextnode = root;
uint8_t basecode;
static uint32_t state_id = 0;
//fprintf (stderr, "%s\n", ecoUnhashWord(w, options->primer_length));
for (i=1; i<=options->primer_length; i++)
{
basecode = BASEATINDEX (w, options->primer_length, i);
//fprintf (stderr, "%d", basecode);
if (nextnode->next[basecode] == NULL)
{
//add new state
nextnode->next[basecode] = ECOMALLOC(sizeof(aho_state),
"Cannot allocate memory for aho-corasick state");
nextnode = nextnode->next[basecode];
//initialize state
nextnode->id = ++state_id;
nextnode->next[0]=nextnode->next[1]=nextnode->next[2]=nextnode->next[3]=NULL;
nextnode->fail = NULL;
nextnode->output.count = 0;
}
else
nextnode = nextnode->next[basecode];
}
//fprintf (stderr, "\n", basecode);
//new pattern addess so add node ouptup element
ahoc_addOutputElement (nextnode, isdirect, idx);
}
void ahoc_buildKeywordTree (aho_state *root, pwordcount_t words, poptions_t options)
{
uint32_t i;
if (!root) return;
//init root
root->id = 0;
root->next[0]=root->next[1]=root->next[2]=root->next[3]=NULL;
root->fail = NULL;
root->output.count = 0;
//now add each word as a pattern in the keyword tree
for (i=0; i<words->size; i++)
{
//add direct word
word_t w=WORD(words->words[i]);
ahoc_addKeyword (root, w, TRUE, i, options);
//add reverse word
w=ecoComplementWord(w,options->primer_length);
ahoc_addKeyword (root, w, FALSE, i, options);
}
//loop on root if some base has no out going edge from roots
for (i=0; i<4; i++)
if (root->next[i] == NULL)
root->next[i] = root;
}
void ahoc_enqueue (aho_queue *ahoqueue, aho_state *node)
{
queue_node *q;
if (node == NULL) return;
q = ECOMALLOC(sizeof(queue_node),
"Cannot allocate memory for aho-corasick queue node");
q->state_node = node;
q->next = NULL;
if (ahoqueue->first == NULL)
{
ahoqueue->first = q;
ahoqueue->last = q;
}
else
{
ahoqueue->last->next = q;
ahoqueue->last = q;
}
}
aho_state *ahoc_dequeue (aho_queue *ahoqueue)
{
aho_state *node = NULL;
queue_node *q;
if (ahoqueue->first == NULL) return node;
q = ahoqueue->first;
ahoqueue->first = q->next;
node = q->state_node;
ECOFREE (q, "Cannot free memory for aho-corasick queue node");
return node;
}
//set fail links and output sets for the keyword tree
void ahoc_updateForFailAndOutput (aho_state *root)
{
int32_t i;
aho_queue Q;
aho_state *node_r;
aho_state *node_u;
aho_state *node_v;
//empty queue
Q.first = NULL;
Q.last = NULL;
//for us alphabet has 4 elements, A=0, C=1, G=2 and T=3
for (i=0; i<4; i++)
{
if (root->next[i] != root && root->next[i] != NULL)
{
root->next[i]->fail = root;
ahoc_enqueue (&Q, root->next[i]);
}
}
//while queue not empty
while (Q.first != NULL)
{
node_r = ahoc_dequeue (&Q);
for (i=0; i<4; i++)
{
if (node_r->next[i] != NULL)
{
node_u = node_r->next[i];
ahoc_enqueue (&Q, node_u);
node_v = node_r->fail;
while (node_v->next[i] == NULL)
node_v = node_v->fail;
node_u->fail = node_v->next[i];
ahoc_unionOutputElements (node_u, node_u->fail);
}
}
}
}
void ahoc_freeKeywordTree (aho_state *node)
{
int i;
for (i=0; i<4; i++)
if (node->next[i])
ahoc_freeKeywordTree (node->next[i]);
if (node->output.count > 0)
ECOFREE (node->output.out_set, "Free failed for node output");
ECOFREE (node, "Free failed for node");
}
pprimercount_t ahoc_lookforStrictPrimers (pecodnadb_t database, uint32_t seqdbsize,uint32_t exampleCount,
pwordcount_t words,poptions_t options)
{
aho_state automaton_root;
aho_state *curr_state;
//uint32_t inSequenceQuorum;
uint32_t outSequenceQuorum;
pprimer_t data;
pprimercount_t primers;
uint32_t i, j, k;
int32_t pos;
uint32_t lmax;
char *base;
int8_t code;
uint32_t goodPrimers=0;
//inSequenceQuorum = (uint32_t)floor((float)exampleCount * options->sensitivity_quorum);
outSequenceQuorum = (uint32_t)floor((float)(seqdbsize-exampleCount) * options->false_positive_quorum);
//fprintf(stderr," Primers should be at least present in %d/%d example sequences\n",inSequenceQuorum,exampleCount);
fprintf(stderr," Primers should not be present in more than %d/%d counterexample sequences\n",outSequenceQuorum,(seqdbsize-exampleCount));
data = ECOMALLOC(words->size * sizeof(primer_t),
"Cannot allocate memory for fuzzy matching results");
for (i=0; i < words->size; i++)
{
data[i].word=WORD(words->words[i]);
data[i].inexample = 0;
data[i].outexample= 0;
data[i].directCount=ECOMALLOC(seqdbsize * sizeof(uint32_t),
"Cannot allocate memory for primer position");
data[i].directPos = ECOMALLOC(seqdbsize * sizeof(poslist_t),
"Cannot allocate memory for primer position");
data[i].reverseCount=ECOMALLOC(seqdbsize * sizeof(uint32_t),
"Cannot allocate memory for primer position");
data[i].reversePos = ECOMALLOC(seqdbsize * sizeof(poslist_t),
"Cannot allocate memory for primer position");
}
//build keywords automaton
ahoc_buildKeywordTree (&automaton_root, words, options);
//set fail links and output sets
ahoc_updateForFailAndOutput (&automaton_root);
//debug; print keywordtree in a gv file
//ahoc_graphKeywordTree (&automaton_root);
//loop on each sequence for its each base and find words
for (i=0; i < seqdbsize; i++)
{
if(database[i]->SQ_length <= options->primer_length) continue;
lmax = database[i]->SQ_length;
if (!options->circular)
lmax += options->primer_length-1;
curr_state = &automaton_root;
for (j=0,base=database[i]->SQ; j<lmax; j++,base++)
{
if (i==(uint32_t)database[i]->SQ_length) base=database[i]->SQ;
//code = encoder[(*base) - 'A'];
code = *base;
//if (iii++ < 30)
// fprintf (stderr, "%d:%d,", *base, code);
if (code < 0 || code > 3)
{
//if error char, start from root for next character
//+forget any incomplete words
curr_state = &automaton_root;
continue;
}
while (curr_state->next[code] == NULL) curr_state = curr_state->fail;
curr_state = curr_state->next[code];
//start position of primer is options->primer_length-1 chars back
pos = j-options->primer_length+1;
if (pos < 0) pos = database[i]->SQ_length+pos;
//set output, if there is some output on this state then
//+all words in the output set complete here, so increment their
//+found properties for current sequence
for (k=0; k<curr_state->output.count; k++)
{
if (curr_state->output.out_set[k].isdirect)
data[curr_state->output.out_set[k].wordidx].directCount[i]++;
else
data[curr_state->output.out_set[k].wordidx].reverseCount[i]++;
if (options->no_multi_match)
{
if ((data[curr_state->output.out_set[k].wordidx].directCount[i] +
data[curr_state->output.out_set[k].wordidx].reverseCount[i]) > 1)
//since multimach not allowd, set an indication on 1st seq position that
//+ a multimatch was found, so that this word will be filtered out
//+ and because of first postion we wont have to search the whole array
//+ to find if it voilated nomultimatch constraint for some seq
data[curr_state->output.out_set[k].wordidx].directCount[0] = 2;
else
{
if (curr_state->output.out_set[k].isdirect)
//direct word found on jth position of ith sequence
data[curr_state->output.out_set[k].wordidx].directPos[i].value = (uint32_t)pos;
else
//reverse word found on jth position of ith sequence
data[curr_state->output.out_set[k].wordidx].reversePos[i].value = (uint32_t)pos;
}
}
else
{
//okay multi match allowed
if (curr_state->output.out_set[k].isdirect)
{
if (data[curr_state->output.out_set[k].wordidx].directCount[i] == 1)
data[curr_state->output.out_set[k].wordidx].directPos[i].value = (uint32_t)pos;
else
{
//need to create or extend the positions list
if (data[curr_state->output.out_set[k].wordidx].directCount[i] == 2)
{
//for second element, first was put in .value, so dont forget to copy that in the array too
data[curr_state->output.out_set[k].wordidx].directPos[i].pointer = ECOMALLOC(2 * sizeof(uint32_t),
"Cannot allocate memory for primer position");
data[curr_state->output.out_set[k].wordidx].directPos[i].pointer[0] = data[curr_state->output.out_set[k].wordidx].directPos[i].value;
data[curr_state->output.out_set[k].wordidx].directPos[i].pointer[1] = (uint32_t)pos;
}
else
{
//for third or greater element
data[curr_state->output.out_set[k].wordidx].directPos[i].pointer = ECOREALLOC(data[curr_state->output.out_set[k].wordidx].directPos[i].pointer,
data[curr_state->output.out_set[k].wordidx].directCount[i] * sizeof(uint32_t),
"Cannot allocate memory for primer position");
data[curr_state->output.out_set[k].wordidx].directPos[i].pointer[data[curr_state->output.out_set[k].wordidx].directCount[i]-1] = (uint32_t)pos;
}
}
}
else
{
if (data[curr_state->output.out_set[k].wordidx].reverseCount[i] == 1)
data[curr_state->output.out_set[k].wordidx].reversePos[i].value = (uint32_t)pos;
else
{
//need to create or extend the positions list
if (data[curr_state->output.out_set[k].wordidx].reverseCount[i] == 2)
{
//for second element, first was put in .value, so dont forget to copy that in the array too
data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer = ECOMALLOC(2 * sizeof(uint32_t),
"Cannot allocate memory for primer position");
data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer[0] = data[curr_state->output.out_set[k].wordidx].reversePos[i].value;
data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer[1] = (uint32_t)pos;
}
else
{
//for third or greater element
data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer = ECOREALLOC(data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer,
data[curr_state->output.out_set[k].wordidx].reverseCount[i] * sizeof(uint32_t),
"Cannot allocate memory for primer position");
data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer[data[curr_state->output.out_set[k].wordidx].reverseCount[i]-1] = (uint32_t)pos;
}
}
}
}
//dont forget to increment inexample or outexample count, but only once for a sequence
if ((data[curr_state->output.out_set[k].wordidx].directCount[i] +
data[curr_state->output.out_set[k].wordidx].reverseCount[i]) == 1)
{
if (database[i]->isexample)
data[curr_state->output.out_set[k].wordidx].inexample++;
else
data[curr_state->output.out_set[k].wordidx].outexample++;
}
}
}
}
//Only thing that remains is to remove the failed words
for (i=0,j=0; i<words->size; i++)
{
fprintf(stderr,"Primers %5d/%lld analyzed => sequence : %s in %d example and %d counterexample sequences \r",
i+1,words->size,ecoUnhashWord(data[i].word,options->primer_length),
data[i].inexample,data[i].outexample);
//if (data[i].inexample < inSequenceQuorum || (data[i].directCount[0] == 2 && options->no_multi_match))
if (data[i].directCount[0] == 2 && options->no_multi_match)
{
//bad word, delete from the array
for (k=0; k<seqdbsize; k++)
{
if (data[i].directCount[k] > 1)
ECOFREE (data[i].directPos[k].pointer, "Cannot free position pointer.");
if (data[i].reverseCount[k] > 1)
ECOFREE (data[i].reversePos[k].pointer, "Cannot free position pointer.");
}
ECOFREE (data[i].directCount, "Cannot free position pointer.");
ECOFREE (data[i].directPos, "Cannot free position pointer.");
ECOFREE (data[i].reverseCount, "Cannot free position pointer.");
ECOFREE (data[i].reversePos, "Cannot free position pointer.");
}
else
{
//data[i].good = data[i].inexample >= inSequenceQuorum && data[i].outexample <= outSequenceQuorum;
data[i].good = data[i].outexample <= outSequenceQuorum;
goodPrimers+=data[i].good? 1:0;
if (j < i)
data[j] = data[i];
j++;
}
}
fprintf(stderr,"\n\nOn %lld analyzed primers %d respect quorum conditions\n",words->size,goodPrimers);
fprintf(stderr,"Conserved primers for further analysis : %d/%lld\n",j,words->size);
primers = ECOMALLOC(sizeof(primercount_t),"Cannot allocate memory for primer table");
primers->primers=ECOREALLOC(data,
j * sizeof(primer_t),
"Cannot reallocate memory for fuzzy matching results");
primers->size=j;
//free memory of keyword table
for (i=0; i<4; i++)
if (automaton_root.next[i] != &automaton_root)
ahoc_freeKeywordTree (automaton_root.next[i]);
return primers;
}
void ahoc_graphPrintNodesInfo (aho_state *node, FILE* gfile)
{
uint32_t i;
fprintf (gfile, "\"%d\"[\n", node->id);
fprintf (gfile, "label=\"%d\\n", node->id);
for (i=0; i<node->output.count; i++)
fprintf (gfile, "%d%c,", node->output.out_set[i].wordidx, node->output.out_set[i].isdirect?'d':'r');
fprintf (gfile, "\"\n];\n");
for (i=0; i<4; i++)
if (node->next[i] != NULL && node->next[i] != node)
ahoc_graphPrintNodesInfo (node->next[i], gfile);
}
void ahoc_graphPrintNodesLinks (aho_state *node, FILE* gfile)
{
uint32_t i;
static int j=0;
for (i=0; i<4; i++)
if (node->next[i] != NULL && node->next[i] != node)
{
fprintf (gfile, "\"%d\" -> \"%d\" [\n", node->id, node->next[i]->id);
fprintf (gfile, "label=\"%c\"\n];\n", "ACGT"[i]);
}
if (j++ < 40)
if (node->fail != NULL && node->fail != groot)
{
fprintf (gfile, "\"%d\" -> \"%d\" [\n", node->id, node->fail->id);
fprintf (gfile, "color= \"red\"\n];\n");
}
for (i=0; i<4; i++)
if (node->next[i] != NULL && node->next[i] != node)
ahoc_graphPrintNodesLinks (node->next[i], gfile);
}
void ahoc_graphKeywordTree (aho_state *root)
{
FILE *gfile;
groot=root;
gfile = fopen ("keywordtree.gv", "w");
fprintf (gfile, "digraph keywordtree {\n");
ahoc_graphPrintNodesInfo (root, gfile);
ahoc_graphPrintNodesLinks (root, gfile);
fprintf (gfile, "}\n");
fclose(gfile);
}