Added the kmer filter to LCS alignments, and now obiblobs containing
encoded sequences are directly put in int16_t arrays for the alignment
This commit is contained in:
@ -46,7 +46,7 @@ def addOptions(parser):
|
|||||||
metavar='<ALIGNMENT TYPE>',
|
metavar='<ALIGNMENT TYPE>',
|
||||||
default='lcs',
|
default='lcs',
|
||||||
type=str,
|
type=str,
|
||||||
help="Compute alignment using the LCS method.")
|
help="Compute alignment using the LCS method (default).")
|
||||||
|
|
||||||
group.add_argument('--threshold','-t',
|
group.add_argument('--threshold','-t',
|
||||||
action="store", dest="align:threshold",
|
action="store", dest="align:threshold",
|
||||||
@ -64,15 +64,15 @@ def addOptions(parser):
|
|||||||
|
|
||||||
group.add_argument('--longest_length','-L',
|
group.add_argument('--longest_length','-L',
|
||||||
action="store_const", dest="align:reflength",
|
action="store_const", dest="align:reflength",
|
||||||
default="ali",
|
default=0,
|
||||||
const="longest",
|
const=1,
|
||||||
help="The reference length is the length of the longest sequence."
|
help="The reference length is the length of the longest sequence."
|
||||||
" Default: the reference length is the length of the alignment.")
|
" Default: the reference length is the length of the alignment.")
|
||||||
|
|
||||||
group.add_argument('--shortest_length','-l',
|
group.add_argument('--shortest_length','-l',
|
||||||
action="store_const", dest="align:reflength",
|
action="store_const", dest="align:reflength",
|
||||||
default="ali",
|
default=0,
|
||||||
const="shortest",
|
const=2,
|
||||||
help="The reference length is the length of the shortest sequence."
|
help="The reference length is the length of the shortest sequence."
|
||||||
" Default: the reference length is the length of the alignment.")
|
" Default: the reference length is the length of the alignment.")
|
||||||
|
|
||||||
@ -89,9 +89,7 @@ def addOptions(parser):
|
|||||||
|
|
||||||
|
|
||||||
def run(config):
|
def run(config):
|
||||||
|
|
||||||
#pb = ProgressBar(1, config, seconde=5) # TODO
|
|
||||||
|
|
||||||
# Open DMS
|
# Open DMS
|
||||||
d = OBIDMS(config['obi']['defaultdms'])
|
d = OBIDMS(config['obi']['defaultdms'])
|
||||||
|
|
||||||
@ -106,9 +104,9 @@ def run(config):
|
|||||||
# TODO Take other alignment types into account when they'll be implemented
|
# TODO Take other alignment types into account when they'll be implemented
|
||||||
|
|
||||||
# Call cython alignment function
|
# Call cython alignment function
|
||||||
iview.align(oview)
|
iview.align(oview, threshold=config['align']['threshold'], normalize=config['align']['normalize'], reference=config['align']['reflength'], similarity_mode=config['align']['similarity'])
|
||||||
|
|
||||||
repr(oview)
|
print(repr(oview))
|
||||||
|
|
||||||
iview.close()
|
iview.close()
|
||||||
oview.close()
|
oview.close()
|
||||||
|
@ -59,5 +59,7 @@
|
|||||||
../../../src/sse_banded_LCS_alignment.c
|
../../../src/sse_banded_LCS_alignment.c
|
||||||
../../../src/uint8_indexer.h
|
../../../src/uint8_indexer.h
|
||||||
../../../src/uint8_indexer.c
|
../../../src/uint8_indexer.c
|
||||||
|
../../../src/upperband.h
|
||||||
|
../../../src/upperband.c
|
||||||
../../../src/utils.h
|
../../../src/utils.h
|
||||||
../../../src/utils.c
|
../../../src/utils.c
|
||||||
|
@ -59,5 +59,7 @@
|
|||||||
../../../src/sse_banded_LCS_alignment.c
|
../../../src/sse_banded_LCS_alignment.c
|
||||||
../../../src/uint8_indexer.h
|
../../../src/uint8_indexer.h
|
||||||
../../../src/uint8_indexer.c
|
../../../src/uint8_indexer.c
|
||||||
|
../../../src/upperband.h
|
||||||
|
../../../src/upperband.c
|
||||||
../../../src/utils.h
|
../../../src/utils.h
|
||||||
../../../src/utils.c
|
../../../src/utils.c
|
||||||
|
@ -59,5 +59,7 @@
|
|||||||
../../../src/sse_banded_LCS_alignment.c
|
../../../src/sse_banded_LCS_alignment.c
|
||||||
../../../src/uint8_indexer.h
|
../../../src/uint8_indexer.h
|
||||||
../../../src/uint8_indexer.c
|
../../../src/uint8_indexer.c
|
||||||
|
../../../src/upperband.h
|
||||||
|
../../../src/upperband.c
|
||||||
../../../src/utils.h
|
../../../src/utils.h
|
||||||
../../../src/utils.c
|
../../../src/utils.c
|
||||||
|
@ -59,5 +59,7 @@
|
|||||||
../../../src/sse_banded_LCS_alignment.c
|
../../../src/sse_banded_LCS_alignment.c
|
||||||
../../../src/uint8_indexer.h
|
../../../src/uint8_indexer.h
|
||||||
../../../src/uint8_indexer.c
|
../../../src/uint8_indexer.c
|
||||||
|
../../../src/upperband.h
|
||||||
|
../../../src/upperband.c
|
||||||
../../../src/utils.h
|
../../../src/utils.h
|
||||||
../../../src/utils.c
|
../../../src/utils.c
|
||||||
|
@ -59,5 +59,7 @@
|
|||||||
../../../src/sse_banded_LCS_alignment.c
|
../../../src/sse_banded_LCS_alignment.c
|
||||||
../../../src/uint8_indexer.h
|
../../../src/uint8_indexer.h
|
||||||
../../../src/uint8_indexer.c
|
../../../src/uint8_indexer.c
|
||||||
|
../../../src/upperband.h
|
||||||
|
../../../src/upperband.c
|
||||||
../../../src/utils.h
|
../../../src/utils.h
|
||||||
../../../src/utils.c
|
../../../src/utils.c
|
||||||
|
@ -59,5 +59,7 @@
|
|||||||
../../../src/sse_banded_LCS_alignment.c
|
../../../src/sse_banded_LCS_alignment.c
|
||||||
../../../src/uint8_indexer.h
|
../../../src/uint8_indexer.h
|
||||||
../../../src/uint8_indexer.c
|
../../../src/uint8_indexer.c
|
||||||
|
../../../src/upperband.h
|
||||||
|
../../../src/upperband.c
|
||||||
../../../src/utils.h
|
../../../src/utils.h
|
||||||
../../../src/utils.c
|
../../../src/utils.c
|
||||||
|
@ -59,5 +59,7 @@
|
|||||||
../../../src/sse_banded_LCS_alignment.c
|
../../../src/sse_banded_LCS_alignment.c
|
||||||
../../../src/uint8_indexer.h
|
../../../src/uint8_indexer.h
|
||||||
../../../src/uint8_indexer.c
|
../../../src/uint8_indexer.c
|
||||||
|
../../../src/upperband.h
|
||||||
|
../../../src/upperband.c
|
||||||
../../../src/utils.h
|
../../../src/utils.h
|
||||||
../../../src/utils.c
|
../../../src/utils.c
|
||||||
|
@ -59,5 +59,7 @@
|
|||||||
../../../src/sse_banded_LCS_alignment.c
|
../../../src/sse_banded_LCS_alignment.c
|
||||||
../../../src/uint8_indexer.h
|
../../../src/uint8_indexer.h
|
||||||
../../../src/uint8_indexer.c
|
../../../src/uint8_indexer.c
|
||||||
|
../../../src/upperband.h
|
||||||
|
../../../src/upperband.c
|
||||||
../../../src/utils.h
|
../../../src/utils.h
|
||||||
../../../src/utils.c
|
../../../src/utils.c
|
||||||
|
@ -59,5 +59,7 @@
|
|||||||
../../../src/sse_banded_LCS_alignment.c
|
../../../src/sse_banded_LCS_alignment.c
|
||||||
../../../src/uint8_indexer.h
|
../../../src/uint8_indexer.h
|
||||||
../../../src/uint8_indexer.c
|
../../../src/uint8_indexer.c
|
||||||
|
../../../src/upperband.h
|
||||||
|
../../../src/upperband.c
|
||||||
../../../src/utils.h
|
../../../src/utils.h
|
||||||
../../../src/utils.c
|
../../../src/utils.c
|
||||||
|
@ -59,5 +59,7 @@
|
|||||||
../../../src/sse_banded_LCS_alignment.c
|
../../../src/sse_banded_LCS_alignment.c
|
||||||
../../../src/uint8_indexer.h
|
../../../src/uint8_indexer.h
|
||||||
../../../src/uint8_indexer.c
|
../../../src/uint8_indexer.c
|
||||||
|
../../../src/upperband.h
|
||||||
|
../../../src/upperband.c
|
||||||
../../../src/utils.h
|
../../../src/utils.h
|
||||||
../../../src/utils.c
|
../../../src/utils.c
|
||||||
|
108
src/obi_align.c
108
src/obi_align.c
@ -19,6 +19,8 @@
|
|||||||
#include "obitypes.h"
|
#include "obitypes.h"
|
||||||
#include "obiview.h"
|
#include "obiview.h"
|
||||||
#include "sse_banded_LCS_alignment.h"
|
#include "sse_banded_LCS_alignment.h"
|
||||||
|
#include "upperband.h"
|
||||||
|
#include "obiblob.h"
|
||||||
|
|
||||||
|
|
||||||
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
|
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
|
||||||
@ -29,21 +31,22 @@
|
|||||||
// option pour ecrire en stdint?
|
// option pour ecrire en stdint?
|
||||||
// check NUC_SEQS view type? and score type (int or float if normalize)
|
// check NUC_SEQS view type? and score type (int or float if normalize)
|
||||||
// what's with multiple sequences/line columns?
|
// what's with multiple sequences/line columns?
|
||||||
// make function that put blobs in int16
|
|
||||||
|
|
||||||
|
|
||||||
int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column,
|
int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column,
|
||||||
Obiview_p score_view, OBIDMS_column_p id1_column, OBIDMS_column_p id2_column, OBIDMS_column_p score_column,
|
Obiview_p score_view, OBIDMS_column_p id1_column, OBIDMS_column_p id2_column, OBIDMS_column_p score_column,
|
||||||
double threshold, bool normalize, int reference, bool similarity_mode)
|
double threshold, bool normalize, int reference, bool similarity_mode)
|
||||||
{
|
{
|
||||||
index_t i, j, k;
|
index_t i, j, k;
|
||||||
index_t seq_count;
|
index_t seq_count;
|
||||||
char* seq1;
|
const char* id1;
|
||||||
char* seq2;
|
const char* id2;
|
||||||
const char* id1;
|
double score;
|
||||||
const char* id2;
|
|
||||||
double score;
|
|
||||||
OBIDMS_column_p id_column;
|
OBIDMS_column_p id_column;
|
||||||
|
Kmer_table_p ktable;
|
||||||
|
Obi_blob_p blob1;
|
||||||
|
Obi_blob_p blob2;
|
||||||
|
int lcs_min;
|
||||||
|
|
||||||
k = 0;
|
k = 0;
|
||||||
|
|
||||||
@ -62,6 +65,15 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column,
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Build kmer tables
|
||||||
|
ktable = hash_seq_column(seq_view, seq_column);
|
||||||
|
if (ktable == NULL)
|
||||||
|
{
|
||||||
|
obi_set_errno(OBI_ALIGN_ERROR);
|
||||||
|
obidebug(1, "\nError building kmer tables before aligning");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
// Get the ID column pointer
|
// Get the ID column pointer
|
||||||
id_column = obi_view_get_column(seq_view, ID_COLUMN);
|
id_column = obi_view_get_column(seq_view, ID_COLUMN);
|
||||||
|
|
||||||
@ -69,66 +81,72 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column,
|
|||||||
|
|
||||||
for (i=0; i < (seq_count - 1); i++)
|
for (i=0; i < (seq_count - 1); i++)
|
||||||
{
|
{
|
||||||
|
if (i%100 == 0)
|
||||||
|
fprintf(stderr,"\rDone : %f %% ", (i / (float) seq_count)*100);
|
||||||
|
|
||||||
for (j=i+1; j < seq_count; j++)
|
for (j=i+1; j < seq_count; j++)
|
||||||
{
|
{
|
||||||
//fprintf(stderr, "\ni=%lld, j=%lld, k=%lld", i, j, k);
|
blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, 0);
|
||||||
|
blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, 0);
|
||||||
|
|
||||||
seq1 = obi_get_seq_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, 0);
|
if ((blob1 == NULL) || (blob2 == NULL))
|
||||||
seq2 = obi_get_seq_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, 0);
|
|
||||||
|
|
||||||
if ((seq1 == NULL) || (seq2 == NULL))
|
|
||||||
{
|
{
|
||||||
obidebug(1, "\nError retrieving sequences to align");
|
obidebug(1, "\nError retrieving sequences to align");
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO kmer filter
|
// kmer filter
|
||||||
|
align_filters(ktable, blob1, blob2, i, j, threshold, normalize, reference, similarity_mode, &score, &lcs_min);
|
||||||
|
|
||||||
// Compute alignment score
|
// Compute alignment score
|
||||||
score = generic_sse_banded_lcs_align(seq1, seq2, threshold, normalize, reference, similarity_mode);
|
if ((threshold == 0) || (score == -1.0)) // no threshold or filter passed, and sequences not identical: align
|
||||||
|
score = obiblob_sse_banded_lcs_align(blob1, blob2, threshold, normalize, reference, similarity_mode);
|
||||||
|
|
||||||
// Get sequence ids
|
if ((score >= 0) && (((normalize || similarity_mode) && (score >= threshold)) || ((!similarity_mode && !normalize) && (score <= threshold))))
|
||||||
id1 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0);
|
{ // Print result
|
||||||
id2 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, j, 0);
|
|
||||||
|
|
||||||
// Write sequence ids in output view
|
// Get sequence ids
|
||||||
if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id1_column, k, 0, id1) < 0)
|
id1 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0);
|
||||||
{
|
id2 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, j, 0);
|
||||||
obidebug(1, "\nError writing id1 in a column");
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id2_column, k, 0, id2) < 0)
|
// Write sequence ids in output view
|
||||||
{
|
if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id1_column, k, 0, id1) < 0)
|
||||||
obidebug(1, "\nError writing id2 in a column");
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Write score in output view
|
|
||||||
if (normalize)
|
|
||||||
{
|
|
||||||
if (obi_set_float_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obifloat_t) score) < 0)
|
|
||||||
{
|
{
|
||||||
obidebug(1, "\nError writing alignment score in a column");
|
obidebug(1, "\nError writing id1 in a column");
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
else
|
if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id2_column, k, 0, id2) < 0)
|
||||||
{
|
|
||||||
if (obi_set_int_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obiint_t) score) < 0)
|
|
||||||
{
|
{
|
||||||
obidebug(1, "\nError writing alignment score in a column");
|
obidebug(1, "\nError writing id2 in a column");
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Write score in output view
|
||||||
|
if (normalize)
|
||||||
|
{
|
||||||
|
if (obi_set_float_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obifloat_t) score) < 0)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError writing alignment score in a column");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (obi_set_int_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obiint_t) score) < 0)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError writing alignment score in a column");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
k++;
|
||||||
}
|
}
|
||||||
|
|
||||||
free(seq1);
|
|
||||||
free(seq2);
|
|
||||||
|
|
||||||
k++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
free_kmer_tables(ktable, seq_count);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -19,6 +19,7 @@
|
|||||||
#include <stdbool.h>
|
#include <stdbool.h>
|
||||||
|
|
||||||
#include "obidms.h"
|
#include "obidms.h"
|
||||||
|
#include "obiview.h"
|
||||||
#include "obidmscolumn.h"
|
#include "obidmscolumn.h"
|
||||||
#include "obitypes.h"
|
#include "obitypes.h"
|
||||||
|
|
||||||
|
@ -17,6 +17,8 @@
|
|||||||
#include "utils.h"
|
#include "utils.h"
|
||||||
#include "_sse.h"
|
#include "_sse.h"
|
||||||
#include "sse_banded_LCS_alignment.h"
|
#include "sse_banded_LCS_alignment.h"
|
||||||
|
#include "obiblob.h"
|
||||||
|
#include "encode.h" // TODO move putBlobInSeq function to encode.c ?
|
||||||
|
|
||||||
|
|
||||||
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
|
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
|
||||||
@ -107,7 +109,7 @@ void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int
|
|||||||
max = INT16_MAX - l1;
|
max = INT16_MAX - l1;
|
||||||
|
|
||||||
numberOfRegistersPerLine = bandLengthTotal / 8;
|
numberOfRegistersPerLine = bandLengthTotal / 8;
|
||||||
numberOfRegistersFor3Lines = 3 * numberOfRegistersPerLine;
|
numberOfRegistersFor3Lines = 3 * numberOfRegistersPerLine;
|
||||||
|
|
||||||
SSEregisters = (um128*) calloc(numberOfRegistersFor3Lines * 2, sizeof(um128));
|
SSEregisters = (um128*) calloc(numberOfRegistersFor3Lines * 2, sizeof(um128));
|
||||||
l_ali_SSEregisters = SSEregisters + numberOfRegistersFor3Lines;
|
l_ali_SSEregisters = SSEregisters + numberOfRegistersFor3Lines;
|
||||||
@ -115,7 +117,7 @@ void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int
|
|||||||
// preparer registres SSE
|
// preparer registres SSE
|
||||||
|
|
||||||
for (j=0; j<numberOfRegistersFor3Lines; j++)
|
for (j=0; j<numberOfRegistersFor3Lines; j++)
|
||||||
l_ali_SSEregisters[j].i = _MM_LOAD_SI128(address+j*8);
|
l_ali_SSEregisters[j].i = _MM_LOAD_SI128((const __m128i*)(address+j*8));
|
||||||
|
|
||||||
p_diag = SSEregisters;
|
p_diag = SSEregisters;
|
||||||
p_gap1 = SSEregisters+numberOfRegistersPerLine;
|
p_gap1 = SSEregisters+numberOfRegistersPerLine;
|
||||||
@ -151,13 +153,15 @@ void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int
|
|||||||
|
|
||||||
k2 = line - k1 - 1;
|
k2 = line - k1 - 1;
|
||||||
|
|
||||||
nucs1.i = _MM_LOADU_SI128(seq1+l1-k1);
|
nucs1.i = _MM_LOADU_SI128((const __m128i*)(seq1+l1-k1));
|
||||||
nucs2.i = _MM_LOADU_SI128(seq2+k2);
|
nucs2.i = _MM_LOADU_SI128((const __m128i*)(seq2+k2));
|
||||||
|
|
||||||
/* fprintf(stderr, "\nnucs, r %d, k1 = %d, k2 = %d\n", j, k1, k2);
|
// if (print)
|
||||||
printreg(nucs1.i);
|
// {
|
||||||
printreg(nucs2.i);
|
// fprintf(stderr, "\nnucs, r %d, k1 = %d, k2 = %d\n", j, k1, k2);
|
||||||
*/
|
// printreg(nucs1.i);
|
||||||
|
// printreg(nucs2.i);
|
||||||
|
// }
|
||||||
|
|
||||||
// computing diagonal score :
|
// computing diagonal score :
|
||||||
scores.i = _MM_AND_SI128(_MM_CMPEQ_EPI16(nucs1.i, nucs2.i), _MM_SET1_EPI16(1));
|
scores.i = _MM_AND_SI128(_MM_CMPEQ_EPI16(nucs1.i, nucs2.i), _MM_SET1_EPI16(1));
|
||||||
@ -199,25 +203,26 @@ void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int
|
|||||||
_MM_AND_SI128(p_gap2_j->i, boolean_reg.i),
|
_MM_AND_SI128(p_gap2_j->i, boolean_reg.i),
|
||||||
_MM_ANDNOT_SI128(boolean_reg.i, current.i));
|
_MM_ANDNOT_SI128(boolean_reg.i, current.i));
|
||||||
|
|
||||||
/*
|
// if (print)
|
||||||
fprintf(stderr, "\nline = %d", line);
|
// {
|
||||||
fprintf(stderr, "\nDiag, r %d : ", j);
|
// fprintf(stderr, "\nline = %d", line);
|
||||||
printreg((*(p_diag_j)).i);
|
// fprintf(stderr, "\nDiag, r %d : ", j);
|
||||||
fprintf(stderr, "Gap1 : ");
|
// printreg((*(p_diag_j)).i);
|
||||||
printreg((*(p_gap1_j)).i);
|
// fprintf(stderr, "Gap1 : ");
|
||||||
fprintf(stderr, "Gap2 : ");
|
// printreg((*(p_gap1_j)).i);
|
||||||
printreg((*(p_gap2_j)).i);
|
// fprintf(stderr, "Gap2 : ");
|
||||||
fprintf(stderr, "current : ");
|
// printreg((*(p_gap2_j)).i);
|
||||||
printreg(current.i);
|
// fprintf(stderr, "current : ");
|
||||||
fprintf(stderr, "L ALI\nDiag r %d : ", j);
|
// printreg(current.i);
|
||||||
printreg((*(p_l_ali_diag_j)).i);
|
// fprintf(stderr, "L ALI\nDiag r %d : ", j);
|
||||||
fprintf(stderr, "Gap1 : ");
|
// printreg((*(p_l_ali_diag_j)).i);
|
||||||
printreg((*(p_l_ali_gap1_j)).i);
|
// fprintf(stderr, "Gap1 : ");
|
||||||
fprintf(stderr, "Gap2 : ");
|
// printreg((*(p_l_ali_gap1_j)).i);
|
||||||
printreg((*(p_l_ali_gap2_j)).i);
|
// fprintf(stderr, "Gap2 : ");
|
||||||
fprintf(stderr, "current : ");
|
// printreg((*(p_l_ali_gap2_j)).i);
|
||||||
printreg(l_ali_current.i);
|
// fprintf(stderr, "current : ");
|
||||||
*/
|
// printreg(l_ali_current.i);
|
||||||
|
// }
|
||||||
|
|
||||||
// diag = gap1 and gap1 = current
|
// diag = gap1 and gap1 = current
|
||||||
p_diag_j->i = p_gap1_j->i;
|
p_diag_j->i = p_gap1_j->i;
|
||||||
@ -234,8 +239,8 @@ void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int
|
|||||||
{
|
{
|
||||||
if ((odd_line && even_BLL) || (even_line && odd_BLL))
|
if ((odd_line && even_BLL) || (even_line && odd_BLL))
|
||||||
{
|
{
|
||||||
p_gap2[j].i = _MM_LOADU_SI128((p_gap1[j].s16)-1);
|
p_gap2[j].i = _MM_LOADU_SI128((const __m128i*)((p_gap1[j].s16)-1));
|
||||||
p_l_ali_gap2[j].i = _MM_LOADU_SI128((p_l_ali_gap1[j].s16)-1);
|
p_l_ali_gap2[j].i = _MM_LOADU_SI128((const __m128i*)((p_l_ali_gap1[j].s16)-1));
|
||||||
if (j == 0)
|
if (j == 0)
|
||||||
{
|
{
|
||||||
p_gap2[j].i = _MM_INSERT_EPI16(p_gap2[j].i, 0, 0);
|
p_gap2[j].i = _MM_INSERT_EPI16(p_gap2[j].i, 0, 0);
|
||||||
@ -244,8 +249,8 @@ void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
p_gap2[j].i = _MM_LOADU_SI128(p_gap1[j].s16+1);
|
p_gap2[j].i = _MM_LOADU_SI128((const __m128i*)(p_gap1[j].s16+1));
|
||||||
p_l_ali_gap2[j].i = _MM_LOADU_SI128(p_l_ali_gap1[j].s16+1);
|
p_l_ali_gap2[j].i = _MM_LOADU_SI128((const __m128i*)(p_l_ali_gap1[j].s16+1));
|
||||||
if (j == numberOfRegistersPerLine - 1)
|
if (j == numberOfRegistersPerLine - 1)
|
||||||
{
|
{
|
||||||
p_gap2[j].i = _MM_INSERT_EPI16(p_gap2[j].i, 0, 7);
|
p_gap2[j].i = _MM_INSERT_EPI16(p_gap2[j].i, 0, 7);
|
||||||
@ -267,7 +272,10 @@ void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int
|
|||||||
l_loc = (int) floor((double)(bandLengthLeft) / (double)2) - ceil((double)(diff) / (double)2);
|
l_loc = (int) floor((double)(bandLengthLeft) / (double)2) - ceil((double)(diff) / (double)2);
|
||||||
|
|
||||||
l_reg = (int)floor((double)l_loc/(double)8.0);
|
l_reg = (int)floor((double)l_loc/(double)8.0);
|
||||||
//fprintf(stderr, "\nl_reg = %d, l_loc = %d\n", l_reg, l_loc);
|
|
||||||
|
// if (print)
|
||||||
|
// fprintf(stderr, "\nl_reg = %d, l_loc = %d\n", l_reg, l_loc);
|
||||||
|
|
||||||
l_loc = l_loc - l_reg*8;
|
l_loc = l_loc - l_reg*8;
|
||||||
|
|
||||||
// extracting the results from the registers :
|
// extracting the results from the registers :
|
||||||
@ -357,8 +365,8 @@ double sse_banded_align_just_lcs(int16_t* seq1, int16_t* seq2, int l1, int l2, i
|
|||||||
|
|
||||||
k2 = line - k1 - 1;
|
k2 = line - k1 - 1;
|
||||||
|
|
||||||
nucs1.i = _MM_LOADU_SI128(seq1+l1-k1);
|
nucs1.i = _MM_LOADU_SI128((const __m128i*)(seq1+l1-k1));
|
||||||
nucs2.i = _MM_LOADU_SI128(seq2+k2);
|
nucs2.i = _MM_LOADU_SI128((const __m128i*)(seq2+k2));
|
||||||
|
|
||||||
// computing diagonal score :
|
// computing diagonal score :
|
||||||
scores.i = _MM_AND_SI128(_MM_CMPEQ_EPI16(nucs1.i, nucs2.i), _MM_SET1_EPI16(1));
|
scores.i = _MM_AND_SI128(_MM_CMPEQ_EPI16(nucs1.i, nucs2.i), _MM_SET1_EPI16(1));
|
||||||
@ -381,7 +389,7 @@ double sse_banded_align_just_lcs(int16_t* seq1, int16_t* seq2, int l1, int l2, i
|
|||||||
{
|
{
|
||||||
if ((odd_line && even_BLL) || (even_line && odd_BLL))
|
if ((odd_line && even_BLL) || (even_line && odd_BLL))
|
||||||
{
|
{
|
||||||
(*(p_gap2+j)).i = _MM_LOADU_SI128(((*(p_gap1+j)).s16)-1);
|
(*(p_gap2+j)).i = _MM_LOADU_SI128((const __m128i*)(((*(p_gap1+j)).s16)-1));
|
||||||
if (j == 0)
|
if (j == 0)
|
||||||
{
|
{
|
||||||
(*(p_gap2+j)).i = _MM_INSERT_EPI16((*(p_gap2+j)).i, 0, 0);
|
(*(p_gap2+j)).i = _MM_INSERT_EPI16((*(p_gap2+j)).i, 0, 0);
|
||||||
@ -389,7 +397,7 @@ double sse_banded_align_just_lcs(int16_t* seq1, int16_t* seq2, int l1, int l2, i
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
(*(p_gap2+j)).i = _MM_LOADU_SI128(((*(p_gap1+j)).s16)+1);
|
(*(p_gap2+j)).i = _MM_LOADU_SI128((const __m128i*)(((*(p_gap1+j)).s16)+1));
|
||||||
if (j == numberOfRegistersPerLine - 1)
|
if (j == numberOfRegistersPerLine - 1)
|
||||||
{
|
{
|
||||||
(*(p_gap2+j)).i = _MM_INSERT_EPI16((*(p_gap2+j)).i, 0, 7);
|
(*(p_gap2+j)).i = _MM_INSERT_EPI16((*(p_gap2+j)).i, 0, 7);
|
||||||
@ -483,6 +491,41 @@ void putSeqInSeq(int16_t* seq, char* s, int l, bool reverse)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void putBlobInSeq(int16_t* seq, Obi_blob_p b, int l, bool reverse)
|
||||||
|
{
|
||||||
|
size_t i;
|
||||||
|
uint8_t shift;
|
||||||
|
uint8_t mask;
|
||||||
|
uint8_t nuc;
|
||||||
|
|
||||||
|
int16_t* target = seq;
|
||||||
|
int16_t* end = target + (size_t) l;
|
||||||
|
|
||||||
|
if (reverse)
|
||||||
|
{
|
||||||
|
for (i = l-1; target < end; target++, i--)
|
||||||
|
{
|
||||||
|
shift = 6 - 2*(i % 4);
|
||||||
|
mask = NUC_MASK_2B << shift;
|
||||||
|
nuc = (b->value[i/4] & mask) >> shift;
|
||||||
|
|
||||||
|
*target = (int16_t)nuc+1; // +1 because nucleotide can't be == 0 (0 is a default value used to initialize some registers)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for (i=0; target < end; target++, i++)
|
||||||
|
{
|
||||||
|
shift = 6 - 2*(i % 4);
|
||||||
|
mask = NUC_MASK_2B << shift;
|
||||||
|
nuc = (b->value[i/4] & mask) >> shift;
|
||||||
|
|
||||||
|
*target = (int16_t)nuc+1; // +1 because nucleotide can't be == 0 (0 is a default value used to initialize some registers)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void initializeAddressWithGaps(int16_t* address, int bandLengthTotal, int bandLengthLeft, int l1)
|
void initializeAddressWithGaps(int16_t* address, int bandLengthTotal, int bandLengthLeft, int l1)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
@ -491,7 +534,7 @@ void initializeAddressWithGaps(int16_t* address, int bandLengthTotal, int bandLe
|
|||||||
int bm;
|
int bm;
|
||||||
int value=INT16_MAX-l1;
|
int value=INT16_MAX-l1;
|
||||||
|
|
||||||
numberOfRegistersPerLine = bandLengthTotal / 8;
|
numberOfRegistersPerLine = bandLengthTotal / 8;
|
||||||
bm = bandLengthLeft%2;
|
bm = bandLengthLeft%2;
|
||||||
|
|
||||||
for (i=0; i < (3*numberOfRegistersPerLine*8); i++)
|
for (i=0; i < (3*numberOfRegistersPerLine*8); i++)
|
||||||
@ -543,7 +586,7 @@ double sse_banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, bool n
|
|||||||
|
|
||||||
// fprintf(stderr, "\nid before normalizations = %f", id);
|
// fprintf(stderr, "\nid before normalizations = %f", id);
|
||||||
|
|
||||||
// fprintf(stderr, "\nlcs = %f, ali = %d\n", id, ali_length);
|
//fprintf(stderr, "\nlcs = %f, ali length = %d\n", id, ali_length);
|
||||||
|
|
||||||
if (!similarity_mode && !normalize)
|
if (!similarity_mode && !normalize)
|
||||||
switch(reference) {
|
switch(reference) {
|
||||||
@ -694,3 +737,98 @@ double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bo
|
|||||||
return(id);
|
return(id);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
double obiblob_sse_banded_lcs_align(Obi_blob_p seq1, Obi_blob_p seq2, double threshold, bool normalize, int reference, bool similarity_mode)
|
||||||
|
{
|
||||||
|
double id;
|
||||||
|
int l1, l2;
|
||||||
|
int lmax, lmin;
|
||||||
|
int sizeToAllocateForBand, sizeToAllocateForSeqs;
|
||||||
|
int maxBLL;
|
||||||
|
int LCSmin;
|
||||||
|
int shift;
|
||||||
|
int16_t* address;
|
||||||
|
int16_t* iseq1;
|
||||||
|
int16_t* iseq2;
|
||||||
|
|
||||||
|
address = NULL;
|
||||||
|
|
||||||
|
l1 = seq1->length_decoded_value;
|
||||||
|
l2 = seq2->length_decoded_value;
|
||||||
|
|
||||||
|
if (l1 > l2)
|
||||||
|
{
|
||||||
|
lmax = l1;
|
||||||
|
lmin = l2;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
lmax = l2;
|
||||||
|
lmin = l1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the score is expressed as a normalized distance, get the corresponding identity
|
||||||
|
if (!similarity_mode && normalize)
|
||||||
|
threshold = 1.0 - threshold;
|
||||||
|
|
||||||
|
// Calculate the minimum LCS length corresponding to the threshold
|
||||||
|
LCSmin = calculateLCSmin(lmax, lmin, threshold, normalize, reference, similarity_mode);
|
||||||
|
|
||||||
|
// Allocate space for matrix band if the alignment length must be computed
|
||||||
|
if ((reference == ALILEN) && (normalize || !similarity_mode)) // cases in which alignment length must be computed
|
||||||
|
{
|
||||||
|
sizeToAllocateForBand = calculateSizeToAllocate(lmax, lmin, LCSmin);
|
||||||
|
address = obi_get_memory_aligned_on_16(sizeToAllocateForBand, &shift);
|
||||||
|
if (address == NULL)
|
||||||
|
{
|
||||||
|
obi_set_errno(OBI_MALLOC_ERROR);
|
||||||
|
obidebug(1, "\nError getting a memory address aligned on 16 bytes boundary");
|
||||||
|
return 0; // TODO DOUBLE_MIN
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allocate space for the int16_t arrays representing the sequences
|
||||||
|
maxBLL = calculateLeftBandLength(lmax, LCSmin);
|
||||||
|
sizeToAllocateForSeqs = 2*maxBLL+lmax;
|
||||||
|
iseq1 = (int16_t*) malloc(sizeToAllocateForSeqs*sizeof(int16_t));
|
||||||
|
iseq2 = (int16_t*) malloc(sizeToAllocateForSeqs*sizeof(int16_t));
|
||||||
|
if ((iseq1 == NULL) || (iseq2 == NULL))
|
||||||
|
{
|
||||||
|
obi_set_errno(OBI_MALLOC_ERROR);
|
||||||
|
obidebug(1, "\nError allocating memory for integer arrays to use in LCS alignment");
|
||||||
|
return 0; // TODO DOUBLE_MIN
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initialize the int arrays
|
||||||
|
iniSeq(iseq1, (2*maxBLL)+lmax, 0);
|
||||||
|
iniSeq(iseq2, (2*maxBLL)+lmax, 255);
|
||||||
|
|
||||||
|
// Shift addresses to where the sequences have to be put
|
||||||
|
iseq1 = iseq1+maxBLL;
|
||||||
|
iseq2 = iseq2+maxBLL;
|
||||||
|
|
||||||
|
// Put the DNA sequences in the int arrays. Longest sequence must be first argument of sse_align function
|
||||||
|
if (l2 > l1)
|
||||||
|
{
|
||||||
|
putBlobInSeq(iseq1, seq2, l2, TRUE);
|
||||||
|
putBlobInSeq(iseq2, seq1, l1, FALSE);
|
||||||
|
// Compute alignment
|
||||||
|
id = sse_banded_lcs_align(iseq1, iseq2, l2, l1, normalize, reference, similarity_mode, address, LCSmin);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
putBlobInSeq(iseq1, seq1, l1, TRUE);
|
||||||
|
putBlobInSeq(iseq2, seq2, l2, FALSE);
|
||||||
|
// Compute alignment
|
||||||
|
id = sse_banded_lcs_align(iseq1, iseq2, l1, l2, normalize, reference, similarity_mode, address, LCSmin);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Free allocated elements
|
||||||
|
if (address != NULL)
|
||||||
|
free(address-shift);
|
||||||
|
free(iseq1-maxBLL);
|
||||||
|
free(iseq2-maxBLL);
|
||||||
|
|
||||||
|
return(id);
|
||||||
|
}
|
||||||
|
|
||||||
|
@ -12,6 +12,9 @@
|
|||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <stdbool.h>
|
#include <stdbool.h>
|
||||||
|
|
||||||
|
#include "obiblob.h"
|
||||||
|
|
||||||
|
|
||||||
#define ALILEN (0) // TODO enum
|
#define ALILEN (0) // TODO enum
|
||||||
#define MAXLEN (1)
|
#define MAXLEN (1)
|
||||||
#define MINLEN (2)
|
#define MINLEN (2)
|
||||||
@ -19,5 +22,6 @@
|
|||||||
// TODO doc
|
// TODO doc
|
||||||
int calculateLCSmin(int l1, int l2, double threshold, bool normalize, int reference, bool lcsmode);
|
int calculateLCSmin(int l1, int l2, double threshold, bool normalize, int reference, bool lcsmode);
|
||||||
double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bool normalize, int reference, bool similarity_mode);
|
double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bool normalize, int reference, bool similarity_mode);
|
||||||
|
double obiblob_sse_banded_lcs_align(Obi_blob_p seq1, Obi_blob_p seq2, double threshold, bool normalize, int reference, bool similarity_mode);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
354
src/upperband.c
Normal file
354
src/upperband.c
Normal file
@ -0,0 +1,354 @@
|
|||||||
|
#include <stdio.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include <stdbool.h>
|
||||||
|
|
||||||
|
#include "upperband.h"
|
||||||
|
#include "_sse.h"
|
||||||
|
#include "sse_banded_LCS_alignment.h"
|
||||||
|
#include "obidmscolumn.h"
|
||||||
|
#include "obiview.h"
|
||||||
|
|
||||||
|
//#include "../libutils/utilities.h"
|
||||||
|
//#include "../libfasta/sequence.h"
|
||||||
|
|
||||||
|
|
||||||
|
inline static uchar_v hash4m128(uchar_v frag)
|
||||||
|
{
|
||||||
|
uchar_v words;
|
||||||
|
|
||||||
|
vUInt8 mask_03= _MM_SET1_EPI8(0x03); // charge le registre avec 16x le meme octet
|
||||||
|
vUInt8 mask_FC= _MM_SET1_EPI8(0xFC);
|
||||||
|
|
||||||
|
frag.m = _MM_SRLI_EPI64(frag.m,1); // shift logic a droite sur 2 x 64 bits
|
||||||
|
frag.m = _MM_AND_SI128(frag.m,mask_03); // and sur les 128 bits
|
||||||
|
|
||||||
|
|
||||||
|
words.m= _MM_SLLI_EPI64(frag.m,2);
|
||||||
|
words.m= _MM_AND_SI128(words.m,mask_FC);
|
||||||
|
frag.m = _MM_SRLI_SI128(frag.m,1);
|
||||||
|
words.m= _MM_OR_SI128(words.m,frag.m);
|
||||||
|
|
||||||
|
words.m= _MM_SLLI_EPI64(words.m,2);
|
||||||
|
words.m= _MM_AND_SI128(words.m,mask_FC);
|
||||||
|
frag.m = _MM_SRLI_SI128(frag.m,1);
|
||||||
|
words.m= _MM_OR_SI128(words.m,frag.m);
|
||||||
|
|
||||||
|
words.m= _MM_SLLI_EPI64(words.m,2);
|
||||||
|
words.m= _MM_AND_SI128(words.m,mask_FC);
|
||||||
|
frag.m = _MM_SRLI_SI128(frag.m,1);
|
||||||
|
words.m= _MM_OR_SI128(words.m,frag.m);
|
||||||
|
|
||||||
|
return words;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef __SSE2__
|
||||||
|
|
||||||
|
inline static int anyzerom128(vUInt8 data)
|
||||||
|
{
|
||||||
|
vUInt8 mask_00= _MM_SETZERO_SI128();
|
||||||
|
uint64_v tmp;
|
||||||
|
tmp.m = _MM_CMPEQ_EPI8(data,mask_00);
|
||||||
|
return (int)(tmp.c[0]!=0 || tmp.c[1]!=0);
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
inline static int anyzerom128(vUInt8 data)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
um128 tmp;
|
||||||
|
tmp.i = data;
|
||||||
|
for (i=0;i<8;i++)
|
||||||
|
if (tmp.s8[i]==0)
|
||||||
|
return 1;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
inline static void dumpm128(unsigned short *table,vUInt8 data)
|
||||||
|
{
|
||||||
|
memcpy(table,&data,16);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compute 4mer occurrence table from a DNA sequence
|
||||||
|
*
|
||||||
|
* sequence : a pointer to the null terminated nuc sequence
|
||||||
|
* table : a pointer to a 256 cells unisgned char table for
|
||||||
|
* storing the occurrence table
|
||||||
|
* count : pointer to an int value used as a return value
|
||||||
|
* containing the global word counted
|
||||||
|
*
|
||||||
|
* returns the number of words observed in the sequence with a
|
||||||
|
* count greater than 255.
|
||||||
|
*/
|
||||||
|
int build_table(const char* sequence, unsigned char *table, int *count)
|
||||||
|
{
|
||||||
|
int overflow = 0;
|
||||||
|
int wc=0;
|
||||||
|
int i;
|
||||||
|
vUInt8 mask_00= _MM_SETZERO_SI128();
|
||||||
|
|
||||||
|
uchar_v frag;
|
||||||
|
uchar_v words;
|
||||||
|
uchar_v zero;
|
||||||
|
|
||||||
|
char* s;
|
||||||
|
|
||||||
|
s=(char*)sequence;
|
||||||
|
|
||||||
|
memset(table,0,256*sizeof(unsigned char));
|
||||||
|
|
||||||
|
// encode ascii sequence with A : 00 C : 01 T: 10 G : 11
|
||||||
|
|
||||||
|
for(frag.m=_MM_LOADU_SI128((vUInt8*)s);
|
||||||
|
! anyzerom128(frag.m);
|
||||||
|
s+=12,frag.m=_MM_LOADU_SI128((vUInt8*)s))
|
||||||
|
{
|
||||||
|
words= hash4m128(frag);
|
||||||
|
|
||||||
|
// printf("%d %d %d %d\n",words.c[0],words.c[1],words.c[2],words.c[3]);
|
||||||
|
|
||||||
|
if (table[words.c[0]]<255) table[words.c[0]]++; else overflow++;
|
||||||
|
if (table[words.c[1]]<255) table[words.c[1]]++; else overflow++;
|
||||||
|
if (table[words.c[2]]<255) table[words.c[2]]++; else overflow++;
|
||||||
|
if (table[words.c[3]]<255) table[words.c[3]]++; else overflow++;
|
||||||
|
if (table[words.c[4]]<255) table[words.c[4]]++; else overflow++;
|
||||||
|
if (table[words.c[5]]<255) table[words.c[5]]++; else overflow++;
|
||||||
|
if (table[words.c[6]]<255) table[words.c[6]]++; else overflow++;
|
||||||
|
if (table[words.c[7]]<255) table[words.c[7]]++; else overflow++;
|
||||||
|
if (table[words.c[8]]<255) table[words.c[8]]++; else overflow++;
|
||||||
|
if (table[words.c[9]]<255) table[words.c[9]]++; else overflow++;
|
||||||
|
if (table[words.c[10]]<255) table[words.c[10]]++; else overflow++;
|
||||||
|
if (table[words.c[11]]<255) table[words.c[11]]++; else overflow++;
|
||||||
|
|
||||||
|
wc+=12;
|
||||||
|
}
|
||||||
|
|
||||||
|
zero.m=_MM_CMPEQ_EPI8(frag.m,mask_00);
|
||||||
|
//printf("frag=%d %d %d %d\n",frag.c[0],frag.c[1],frag.c[2],frag.c[3]);
|
||||||
|
//printf("zero=%d %d %d %d\n",zero.c[0],zero.c[1],zero.c[2],zero.c[3]);
|
||||||
|
words = hash4m128(frag);
|
||||||
|
|
||||||
|
if (zero.c[0]+zero.c[1]+zero.c[2]+zero.c[3]==0)
|
||||||
|
{
|
||||||
|
for(i=0;zero.c[i+3]==0;i++,wc++)
|
||||||
|
{
|
||||||
|
if (table[words.c[i]]<255)
|
||||||
|
(table[words.c[i]])++;
|
||||||
|
else
|
||||||
|
(overflow)++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (count)
|
||||||
|
*count=wc;
|
||||||
|
return overflow;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static inline vUInt16 partial_min_sum(vUInt8 ft1,vUInt8 ft2)
|
||||||
|
{
|
||||||
|
vUInt8 mini;
|
||||||
|
vUInt16 minilo;
|
||||||
|
vUInt16 minihi;
|
||||||
|
vUInt8 mask_00= _MM_SETZERO_SI128();
|
||||||
|
|
||||||
|
mini = _MM_MIN_EPU8(ft1,ft2);
|
||||||
|
minilo = _MM_UNPACKLO_EPI8(mini,mask_00);
|
||||||
|
minihi = _MM_UNPACKHI_EPI8(mini,mask_00);
|
||||||
|
|
||||||
|
return _MM_ADDS_EPU16(minilo,minihi);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int compare_tables(unsigned char *t1, int over1, unsigned char* t2, int over2)
|
||||||
|
{
|
||||||
|
vUInt8 ft1;
|
||||||
|
vUInt8 ft2;
|
||||||
|
vUInt8 *table1=(vUInt8*)t1;
|
||||||
|
vUInt8 *table2=(vUInt8*)t2;
|
||||||
|
ushort_v summini;
|
||||||
|
int i;
|
||||||
|
int total;
|
||||||
|
|
||||||
|
ft1 = _MM_LOADU_SI128(table1);
|
||||||
|
ft2 = _MM_LOADU_SI128(table2);
|
||||||
|
summini.m = partial_min_sum(ft1,ft2);
|
||||||
|
table1++;
|
||||||
|
table2++;
|
||||||
|
|
||||||
|
|
||||||
|
for (i=1;i<16;i++,table1++,table2++)
|
||||||
|
{
|
||||||
|
ft1 = _MM_LOADU_SI128(table1);
|
||||||
|
ft2 = _MM_LOADU_SI128(table2);
|
||||||
|
summini.m = _MM_ADDS_EPU16(summini.m, partial_min_sum(ft1,ft2));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// Finishing the sum process
|
||||||
|
|
||||||
|
summini.m = _MM_ADDS_EPU16(summini.m,_MM_SRLI_SI128(summini.m,8)); // sum the 4 firsts with the 4 lasts
|
||||||
|
summini.m = _MM_ADDS_EPU16(summini.m,_MM_SRLI_SI128(summini.m,4));
|
||||||
|
|
||||||
|
total = summini.c[0]+summini.c[1];
|
||||||
|
total+= (over1 < over2) ? over1:over2;
|
||||||
|
|
||||||
|
return total;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int threshold4(int wordcount, double identity)
|
||||||
|
{
|
||||||
|
int error;
|
||||||
|
int lmax;
|
||||||
|
|
||||||
|
wordcount+=3;
|
||||||
|
error = (int)floor((double)wordcount * ((double)1.0-identity));
|
||||||
|
lmax = (wordcount - error) / (error + 1);
|
||||||
|
if (lmax < 4)
|
||||||
|
return 0;
|
||||||
|
return (lmax - 3) \
|
||||||
|
* (error + 1) \
|
||||||
|
+ ((wordcount - error) % (error + 1));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int thresholdLCS4(int32_t reflen, int32_t lcs)
|
||||||
|
{
|
||||||
|
int nbfrag;
|
||||||
|
int smin;
|
||||||
|
int R;
|
||||||
|
int common;
|
||||||
|
|
||||||
|
nbfrag = (reflen - lcs)*2 + 1;
|
||||||
|
smin = lcs/nbfrag;
|
||||||
|
R = lcs - smin * nbfrag;
|
||||||
|
common = MAX(smin - 2,0) * R + MAX(smin - 3,0) * (nbfrag - R);
|
||||||
|
return common;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Kmer_table_p hash_seq_column(Obiview_p view, OBIDMS_column_p seq_col)
|
||||||
|
{
|
||||||
|
size_t i;
|
||||||
|
size_t seq_count;
|
||||||
|
int32_t count;
|
||||||
|
Kmer_table_p ktable;
|
||||||
|
char* seq;
|
||||||
|
|
||||||
|
fprintf(stderr,"Building kmer tables...");
|
||||||
|
|
||||||
|
seq_count = (seq_col->header)->lines_used;
|
||||||
|
|
||||||
|
// Allocate memory for the table structure
|
||||||
|
ktable = (Kmer_table_p) malloc(sizeof(Kmer_table_t) * seq_count);
|
||||||
|
if (ktable == NULL)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
for (i=0; i < seq_count; i++)
|
||||||
|
{
|
||||||
|
seq = obi_get_seq_with_elt_idx_and_col_p_in_view(view, seq_col, i, 0); // TODO discuss 1 element per line mandatory
|
||||||
|
if (seq == NULL)
|
||||||
|
return NULL; // TODO or not
|
||||||
|
ktable[i].table = malloc(256 * sizeof(unsigned char));
|
||||||
|
if (ktable[i].table == NULL)
|
||||||
|
return NULL;
|
||||||
|
ktable[i].over = build_table(seq, ktable[i].table, &count);
|
||||||
|
free(seq);
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(stderr," : Done\n");
|
||||||
|
|
||||||
|
return ktable;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void free_kmer_tables(Kmer_table_p ktable, size_t count)
|
||||||
|
{
|
||||||
|
size_t i;
|
||||||
|
|
||||||
|
for (i=0; i < count; i++)
|
||||||
|
free(ktable[i].table);
|
||||||
|
|
||||||
|
free(ktable);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
bool is_possible(Kmer_table_p ktable, index_t idx1, index_t idx2, int l1, int l2, double threshold, bool normalize, int reference, bool similarity_mode)
|
||||||
|
{
|
||||||
|
int32_t reflen;
|
||||||
|
int32_t lcs;
|
||||||
|
int32_t mincount;
|
||||||
|
|
||||||
|
if (l1 < 12 || l2 < 12)
|
||||||
|
return true;
|
||||||
|
|
||||||
|
if (reference==ALILEN || reference==MAXLEN)
|
||||||
|
reflen = l1;
|
||||||
|
else
|
||||||
|
reflen = l2;
|
||||||
|
|
||||||
|
if (normalize)
|
||||||
|
lcs = (int32_t)ceil((double)reflen * threshold);
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (! similarity_mode)
|
||||||
|
threshold = reflen - threshold;
|
||||||
|
lcs = (int32_t) threshold;
|
||||||
|
}
|
||||||
|
|
||||||
|
mincount = thresholdLCS4(l1, lcs);
|
||||||
|
|
||||||
|
return compare_tables(ktable[idx1].table, ktable[idx1].over, ktable[idx2].table, ktable[idx2].over) >= mincount;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void align_filters(Kmer_table_p ktable, Obi_blob_p seq1, Obi_blob_p seq2, index_t idx1, index_t idx2, double threshold, bool normalize, int reference, bool similarity_mode, double* score, int* LCSmin)
|
||||||
|
{ // score takes value -2 if filters are not passed, -1 if filters are passed and >= 0 with max score if the 2 sequences are identical.
|
||||||
|
|
||||||
|
int l1;
|
||||||
|
int l2;
|
||||||
|
l1 = seq1->length_decoded_value;
|
||||||
|
|
||||||
|
*score = -2.0;
|
||||||
|
|
||||||
|
if (obi_blob_compare(seq1, seq2) == 0) // the 2 sequences are identical. TODO add bool arg indicating whether that's a possibility?
|
||||||
|
{
|
||||||
|
if (similarity_mode && normalize)
|
||||||
|
*score = 1.0;
|
||||||
|
else if (!similarity_mode)
|
||||||
|
*score = 0.0;
|
||||||
|
else
|
||||||
|
*score = l1;
|
||||||
|
}
|
||||||
|
|
||||||
|
else if (threshold != 0)
|
||||||
|
{
|
||||||
|
l2 = seq2->length_decoded_value;
|
||||||
|
|
||||||
|
if (l1 >= l2)
|
||||||
|
{
|
||||||
|
*LCSmin = calculateLCSmin(l1, l2, threshold, normalize, reference, similarity_mode);
|
||||||
|
if (l2 >= *LCSmin)
|
||||||
|
{
|
||||||
|
if (is_possible(ktable, idx1, idx2, l1, l2, threshold, normalize, reference, similarity_mode)) // 4-mers filter
|
||||||
|
*score = -1.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
*LCSmin = calculateLCSmin(l2, l1, threshold, normalize, reference, similarity_mode);
|
||||||
|
if (l1 >= *LCSmin)
|
||||||
|
{
|
||||||
|
if (is_possible(ktable, idx2, idx1, l2, l1, threshold, normalize, reference, similarity_mode)) // 4-mers filter
|
||||||
|
*score = -1.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
*LCSmin = 0;
|
||||||
|
}
|
26
src/upperband.h
Normal file
26
src/upperband.h
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
|
||||||
|
#ifndef UPPERBAND_H_
|
||||||
|
#define UPPERBAND_H_
|
||||||
|
|
||||||
|
// TODO doc
|
||||||
|
|
||||||
|
#include <stdbool.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#include "obiblob.h"
|
||||||
|
#include "obiview.h"
|
||||||
|
#include "obidmscolumn.h"
|
||||||
|
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
unsigned char* table; // 4mer occurrence table built using the build_table function
|
||||||
|
int32_t over; // count of 4mers with an occurrence greater than 255 (overflow)
|
||||||
|
} Kmer_table_t, *Kmer_table_p;
|
||||||
|
|
||||||
|
|
||||||
|
Kmer_table_p hash_seq_column(Obiview_p view, OBIDMS_column_p seq_col);
|
||||||
|
void align_filters(Kmer_table_p ktable, Obi_blob_p seq1, Obi_blob_p seq2, index_t idx1, index_t idx2, double threshold, bool normalize, int reference, bool similarity_mode, double* score, int* LCSmin);
|
||||||
|
void free_kmer_tables(Kmer_table_p ktable, size_t count);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
Reference in New Issue
Block a user