Sequence alignment: added the possibility to specify the index of the

sequences to align in a column containing multiple sequences per line (C
level for now)
This commit is contained in:
Celine Mercier
2016-11-29 16:15:02 +01:00
parent 5fb025f310
commit 98d0849653
6 changed files with 26 additions and 11 deletions

View File

@ -581,7 +581,7 @@ cdef class OBIView_NUC_SEQS(OBIView):
id1_col_p = id1_col_pp[0]
id2_col_p = id2_col_pp[0]
if obi_align_one_column(iview1_p, icol1_p, oview_p, id1_col_p, id2_col_p, ocol_p, threshold, normalize, reference, similarity_mode) < 0 :
if obi_align_one_column(iview1_p, icol1_p, NULL, oview_p, id1_col_p, id2_col_p, ocol_p, threshold, normalize, reference, similarity_mode) < 0 :
raise Exception("Error aligning sequences")

View File

@ -8,6 +8,7 @@ cdef extern from "obi_align.h" nogil:
int obi_align_one_column(Obiview_p seq_view,
OBIDMS_column_p seq_column,
const char* seq_name,
Obiview_p score_view,
OBIDMS_column_p id1_column,
OBIDMS_column_p id2_column,

View File

@ -33,7 +33,7 @@
// what's with multiple sequences/line columns?
int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column,
int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const char* seq_name,
Obiview_p score_view, OBIDMS_column_p id1_column, OBIDMS_column_p id2_column, OBIDMS_column_p score_column,
double threshold, bool normalize, int reference, bool similarity_mode)
{
@ -47,6 +47,7 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column,
Obi_blob_p blob1;
Obi_blob_p blob2;
int lcs_min;
index_t seq_idx;
k = 0;
@ -65,8 +66,21 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column,
return -1;
}
// Get element index from element name to compute it only once
if (seq_name != NULL)
{
seq_idx = obi_column_get_element_index_from_name(seq_column, seq_name);
if (seq_idx == OBIIdx_NA)
{
obidebug(1, "\nError getting the sequence index in a column line when aligning");
return -1;
}
}
else
seq_idx = 0;
// Build kmer tables
ktable = hash_seq_column(seq_view, seq_column);
ktable = hash_seq_column(seq_view, seq_column, seq_idx);
if (ktable == NULL)
{
obi_set_errno(OBI_ALIGN_ERROR);
@ -86,8 +100,8 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column,
for (j=i+1; j < seq_count; j++)
{
blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, 0);
blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, 0);
blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, seq_idx);
blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, seq_idx);
if ((blob1 == NULL) || (blob2 == NULL))
{
@ -96,7 +110,7 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column,
}
// Check if the sequences are identical in a quick way (same index in the same indexer)
if (obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, 0) == obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, 0))
if (obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, seq_idx) == obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, seq_idx))
{
if (similarity_mode && normalize)
score = 1.0;
@ -120,7 +134,7 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column,
{ // Print result
// Get sequence ids
id1 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0);
id1 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0); // TODO Could there be multiple IDs per line?
id2 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, j, 0);
// Write sequence ids in output view

View File

@ -49,7 +49,7 @@
* @since May 2016
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column,
int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const char* seq_name,
Obiview_p score_view, OBIDMS_column_p id1_column, OBIDMS_column_p id2_column, OBIDMS_column_p score_column,
double threshold, bool normalize, int reference, bool similarity_mode);

View File

@ -232,7 +232,7 @@ int thresholdLCS4(int32_t reflen, int32_t lcs)
}
Kmer_table_p hash_seq_column(Obiview_p view, OBIDMS_column_p seq_col)
Kmer_table_p hash_seq_column(Obiview_p view, OBIDMS_column_p seq_col, index_t seq_idx)
{
size_t i;
size_t seq_count;
@ -251,7 +251,7 @@ Kmer_table_p hash_seq_column(Obiview_p view, OBIDMS_column_p seq_col)
for (i=0; i < seq_count; i++)
{
seq = obi_get_seq_with_elt_idx_and_col_p_in_view(view, seq_col, i, 0); // TODO discuss 1 element per line mandatory
seq = obi_get_seq_with_elt_idx_and_col_p_in_view(view, seq_col, i, seq_idx); // TODO discuss 1 element per line mandatory
if (seq == NULL)
return NULL; // TODO or not
ktable[i].table = malloc(256 * sizeof(unsigned char));

View File

@ -18,7 +18,7 @@ typedef struct {
} Kmer_table_t, *Kmer_table_p;
Kmer_table_p hash_seq_column(Obiview_p view, OBIDMS_column_p seq_col);
Kmer_table_p hash_seq_column(Obiview_p view, OBIDMS_column_p seq_col, index_t seq_idx);
void align_filters(Kmer_table_p ktable, Obi_blob_p seq1, Obi_blob_p seq2, index_t idx1, index_t idx2, double threshold, bool normalize, int reference, bool similarity_mode, double* score, int* LCSmin, bool can_be_identical);
void free_kmer_tables(Kmer_table_p ktable, size_t count);