From 98d08496537d926dd2b90968f1ad1f2c5ebc6ccd Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Tue, 29 Nov 2016 16:15:02 +0100 Subject: [PATCH] Sequence alignment: added the possibility to specify the index of the sequences to align in a column containing multiple sequences per line (C level for now) --- python/obitools3/obidms/_obidms.pyx | 2 +- python/obitools3/obidms/capi/obialign.pxd | 1 + src/obi_align.c | 26 +++++++++++++++++------ src/obi_align.h | 2 +- src/upperband.c | 4 ++-- src/upperband.h | 2 +- 6 files changed, 26 insertions(+), 11 deletions(-) diff --git a/python/obitools3/obidms/_obidms.pyx b/python/obitools3/obidms/_obidms.pyx index c0786e6..98022b4 100644 --- a/python/obitools3/obidms/_obidms.pyx +++ b/python/obitools3/obidms/_obidms.pyx @@ -581,7 +581,7 @@ cdef class OBIView_NUC_SEQS(OBIView): id1_col_p = id1_col_pp[0] id2_col_p = id2_col_pp[0] - if obi_align_one_column(iview1_p, icol1_p, oview_p, id1_col_p, id2_col_p, ocol_p, threshold, normalize, reference, similarity_mode) < 0 : + if obi_align_one_column(iview1_p, icol1_p, NULL, oview_p, id1_col_p, id2_col_p, ocol_p, threshold, normalize, reference, similarity_mode) < 0 : raise Exception("Error aligning sequences") diff --git a/python/obitools3/obidms/capi/obialign.pxd b/python/obitools3/obidms/capi/obialign.pxd index 8886495..753623d 100644 --- a/python/obitools3/obidms/capi/obialign.pxd +++ b/python/obitools3/obidms/capi/obialign.pxd @@ -8,6 +8,7 @@ cdef extern from "obi_align.h" nogil: int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, + const char* seq_name, Obiview_p score_view, OBIDMS_column_p id1_column, OBIDMS_column_p id2_column, diff --git a/src/obi_align.c b/src/obi_align.c index eb89b5f..9fd747e 100644 --- a/src/obi_align.c +++ b/src/obi_align.c @@ -33,7 +33,7 @@ // what's with multiple sequences/line columns? -int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, +int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const char* seq_name, Obiview_p score_view, OBIDMS_column_p id1_column, OBIDMS_column_p id2_column, OBIDMS_column_p score_column, double threshold, bool normalize, int reference, bool similarity_mode) { @@ -47,6 +47,7 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, Obi_blob_p blob1; Obi_blob_p blob2; int lcs_min; + index_t seq_idx; k = 0; @@ -65,8 +66,21 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, return -1; } + // Get element index from element name to compute it only once + if (seq_name != NULL) + { + seq_idx = obi_column_get_element_index_from_name(seq_column, seq_name); + if (seq_idx == OBIIdx_NA) + { + obidebug(1, "\nError getting the sequence index in a column line when aligning"); + return -1; + } + } + else + seq_idx = 0; + // Build kmer tables - ktable = hash_seq_column(seq_view, seq_column); + ktable = hash_seq_column(seq_view, seq_column, seq_idx); if (ktable == NULL) { obi_set_errno(OBI_ALIGN_ERROR); @@ -86,8 +100,8 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, for (j=i+1; j < seq_count; j++) { - blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, 0); - blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, 0); + blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, seq_idx); + blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, seq_idx); if ((blob1 == NULL) || (blob2 == NULL)) { @@ -96,7 +110,7 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, } // Check if the sequences are identical in a quick way (same index in the same indexer) - if (obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, 0) == obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, 0)) + if (obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, seq_idx) == obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, seq_idx)) { if (similarity_mode && normalize) score = 1.0; @@ -120,7 +134,7 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, { // Print result // Get sequence ids - id1 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0); + id1 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0); // TODO Could there be multiple IDs per line? id2 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, j, 0); // Write sequence ids in output view diff --git a/src/obi_align.h b/src/obi_align.h index 90abc46..96ce4de 100644 --- a/src/obi_align.h +++ b/src/obi_align.h @@ -49,7 +49,7 @@ * @since May 2016 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, +int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const char* seq_name, Obiview_p score_view, OBIDMS_column_p id1_column, OBIDMS_column_p id2_column, OBIDMS_column_p score_column, double threshold, bool normalize, int reference, bool similarity_mode); diff --git a/src/upperband.c b/src/upperband.c index eeca342..9eba088 100644 --- a/src/upperband.c +++ b/src/upperband.c @@ -232,7 +232,7 @@ int thresholdLCS4(int32_t reflen, int32_t lcs) } -Kmer_table_p hash_seq_column(Obiview_p view, OBIDMS_column_p seq_col) +Kmer_table_p hash_seq_column(Obiview_p view, OBIDMS_column_p seq_col, index_t seq_idx) { size_t i; size_t seq_count; @@ -251,7 +251,7 @@ Kmer_table_p hash_seq_column(Obiview_p view, OBIDMS_column_p seq_col) for (i=0; i < seq_count; i++) { - seq = obi_get_seq_with_elt_idx_and_col_p_in_view(view, seq_col, i, 0); // TODO discuss 1 element per line mandatory + seq = obi_get_seq_with_elt_idx_and_col_p_in_view(view, seq_col, i, seq_idx); // TODO discuss 1 element per line mandatory if (seq == NULL) return NULL; // TODO or not ktable[i].table = malloc(256 * sizeof(unsigned char)); diff --git a/src/upperband.h b/src/upperband.h index 56031cf..f378287 100644 --- a/src/upperband.h +++ b/src/upperband.h @@ -18,7 +18,7 @@ typedef struct { } Kmer_table_t, *Kmer_table_p; -Kmer_table_p hash_seq_column(Obiview_p view, OBIDMS_column_p seq_col); +Kmer_table_p hash_seq_column(Obiview_p view, OBIDMS_column_p seq_col, index_t seq_idx); void align_filters(Kmer_table_p ktable, Obi_blob_p seq1, Obi_blob_p seq2, index_t idx1, index_t idx2, double threshold, bool normalize, int reference, bool similarity_mode, double* score, int* LCSmin, bool can_be_identical); void free_kmer_tables(Kmer_table_p ktable, size_t count);