Sequence alignment: added the possibility to specify the index of the
sequences to align in a column containing multiple sequences per line (C level for now)
This commit is contained in:
@ -581,7 +581,7 @@ cdef class OBIView_NUC_SEQS(OBIView):
|
|||||||
id1_col_p = id1_col_pp[0]
|
id1_col_p = id1_col_pp[0]
|
||||||
id2_col_p = id2_col_pp[0]
|
id2_col_p = id2_col_pp[0]
|
||||||
|
|
||||||
if obi_align_one_column(iview1_p, icol1_p, oview_p, id1_col_p, id2_col_p, ocol_p, threshold, normalize, reference, similarity_mode) < 0 :
|
if obi_align_one_column(iview1_p, icol1_p, NULL, oview_p, id1_col_p, id2_col_p, ocol_p, threshold, normalize, reference, similarity_mode) < 0 :
|
||||||
raise Exception("Error aligning sequences")
|
raise Exception("Error aligning sequences")
|
||||||
|
|
||||||
|
|
||||||
|
@ -8,6 +8,7 @@ cdef extern from "obi_align.h" nogil:
|
|||||||
|
|
||||||
int obi_align_one_column(Obiview_p seq_view,
|
int obi_align_one_column(Obiview_p seq_view,
|
||||||
OBIDMS_column_p seq_column,
|
OBIDMS_column_p seq_column,
|
||||||
|
const char* seq_name,
|
||||||
Obiview_p score_view,
|
Obiview_p score_view,
|
||||||
OBIDMS_column_p id1_column,
|
OBIDMS_column_p id1_column,
|
||||||
OBIDMS_column_p id2_column,
|
OBIDMS_column_p id2_column,
|
||||||
|
@ -33,7 +33,7 @@
|
|||||||
// what's with multiple sequences/line columns?
|
// what's with multiple sequences/line columns?
|
||||||
|
|
||||||
|
|
||||||
int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column,
|
int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const char* seq_name,
|
||||||
Obiview_p score_view, OBIDMS_column_p id1_column, OBIDMS_column_p id2_column, OBIDMS_column_p score_column,
|
Obiview_p score_view, OBIDMS_column_p id1_column, OBIDMS_column_p id2_column, OBIDMS_column_p score_column,
|
||||||
double threshold, bool normalize, int reference, bool similarity_mode)
|
double threshold, bool normalize, int reference, bool similarity_mode)
|
||||||
{
|
{
|
||||||
@ -47,6 +47,7 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column,
|
|||||||
Obi_blob_p blob1;
|
Obi_blob_p blob1;
|
||||||
Obi_blob_p blob2;
|
Obi_blob_p blob2;
|
||||||
int lcs_min;
|
int lcs_min;
|
||||||
|
index_t seq_idx;
|
||||||
|
|
||||||
k = 0;
|
k = 0;
|
||||||
|
|
||||||
@ -65,8 +66,21 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column,
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Get element index from element name to compute it only once
|
||||||
|
if (seq_name != NULL)
|
||||||
|
{
|
||||||
|
seq_idx = obi_column_get_element_index_from_name(seq_column, seq_name);
|
||||||
|
if (seq_idx == OBIIdx_NA)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError getting the sequence index in a column line when aligning");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
seq_idx = 0;
|
||||||
|
|
||||||
// Build kmer tables
|
// Build kmer tables
|
||||||
ktable = hash_seq_column(seq_view, seq_column);
|
ktable = hash_seq_column(seq_view, seq_column, seq_idx);
|
||||||
if (ktable == NULL)
|
if (ktable == NULL)
|
||||||
{
|
{
|
||||||
obi_set_errno(OBI_ALIGN_ERROR);
|
obi_set_errno(OBI_ALIGN_ERROR);
|
||||||
@ -86,8 +100,8 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column,
|
|||||||
|
|
||||||
for (j=i+1; j < seq_count; j++)
|
for (j=i+1; j < seq_count; j++)
|
||||||
{
|
{
|
||||||
blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, 0);
|
blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, seq_idx);
|
||||||
blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, 0);
|
blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, seq_idx);
|
||||||
|
|
||||||
if ((blob1 == NULL) || (blob2 == NULL))
|
if ((blob1 == NULL) || (blob2 == NULL))
|
||||||
{
|
{
|
||||||
@ -96,7 +110,7 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Check if the sequences are identical in a quick way (same index in the same indexer)
|
// Check if the sequences are identical in a quick way (same index in the same indexer)
|
||||||
if (obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, 0) == obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, 0))
|
if (obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, seq_idx) == obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, seq_idx))
|
||||||
{
|
{
|
||||||
if (similarity_mode && normalize)
|
if (similarity_mode && normalize)
|
||||||
score = 1.0;
|
score = 1.0;
|
||||||
@ -120,7 +134,7 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column,
|
|||||||
{ // Print result
|
{ // Print result
|
||||||
|
|
||||||
// Get sequence ids
|
// Get sequence ids
|
||||||
id1 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0);
|
id1 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0); // TODO Could there be multiple IDs per line?
|
||||||
id2 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, j, 0);
|
id2 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, j, 0);
|
||||||
|
|
||||||
// Write sequence ids in output view
|
// Write sequence ids in output view
|
||||||
|
@ -49,7 +49,7 @@
|
|||||||
* @since May 2016
|
* @since May 2016
|
||||||
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||||
*/
|
*/
|
||||||
int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column,
|
int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const char* seq_name,
|
||||||
Obiview_p score_view, OBIDMS_column_p id1_column, OBIDMS_column_p id2_column, OBIDMS_column_p score_column,
|
Obiview_p score_view, OBIDMS_column_p id1_column, OBIDMS_column_p id2_column, OBIDMS_column_p score_column,
|
||||||
double threshold, bool normalize, int reference, bool similarity_mode);
|
double threshold, bool normalize, int reference, bool similarity_mode);
|
||||||
|
|
||||||
|
@ -232,7 +232,7 @@ int thresholdLCS4(int32_t reflen, int32_t lcs)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
Kmer_table_p hash_seq_column(Obiview_p view, OBIDMS_column_p seq_col)
|
Kmer_table_p hash_seq_column(Obiview_p view, OBIDMS_column_p seq_col, index_t seq_idx)
|
||||||
{
|
{
|
||||||
size_t i;
|
size_t i;
|
||||||
size_t seq_count;
|
size_t seq_count;
|
||||||
@ -251,7 +251,7 @@ Kmer_table_p hash_seq_column(Obiview_p view, OBIDMS_column_p seq_col)
|
|||||||
|
|
||||||
for (i=0; i < seq_count; i++)
|
for (i=0; i < seq_count; i++)
|
||||||
{
|
{
|
||||||
seq = obi_get_seq_with_elt_idx_and_col_p_in_view(view, seq_col, i, 0); // TODO discuss 1 element per line mandatory
|
seq = obi_get_seq_with_elt_idx_and_col_p_in_view(view, seq_col, i, seq_idx); // TODO discuss 1 element per line mandatory
|
||||||
if (seq == NULL)
|
if (seq == NULL)
|
||||||
return NULL; // TODO or not
|
return NULL; // TODO or not
|
||||||
ktable[i].table = malloc(256 * sizeof(unsigned char));
|
ktable[i].table = malloc(256 * sizeof(unsigned char));
|
||||||
|
@ -18,7 +18,7 @@ typedef struct {
|
|||||||
} Kmer_table_t, *Kmer_table_p;
|
} Kmer_table_t, *Kmer_table_p;
|
||||||
|
|
||||||
|
|
||||||
Kmer_table_p hash_seq_column(Obiview_p view, OBIDMS_column_p seq_col);
|
Kmer_table_p hash_seq_column(Obiview_p view, OBIDMS_column_p seq_col, index_t seq_idx);
|
||||||
void align_filters(Kmer_table_p ktable, Obi_blob_p seq1, Obi_blob_p seq2, index_t idx1, index_t idx2, double threshold, bool normalize, int reference, bool similarity_mode, double* score, int* LCSmin, bool can_be_identical);
|
void align_filters(Kmer_table_p ktable, Obi_blob_p seq1, Obi_blob_p seq2, index_t idx1, index_t idx2, double threshold, bool normalize, int reference, bool similarity_mode, double* score, int* LCSmin, bool can_be_identical);
|
||||||
void free_kmer_tables(Kmer_table_p ktable, size_t count);
|
void free_kmer_tables(Kmer_table_p ktable, size_t count);
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user