From 8afb1644e9c4bc4218793a13a0377258b1fd938c Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Mon, 12 Dec 2016 11:58:59 +0100 Subject: [PATCH] Alignment: API rework. 'obi align' is now 'obi lcs', and the results are now written to columns automatically created in the output view, all optimally handled at the C level. --- python/obitools3/commands/align.pyx | 120 ------- python/obitools3/commands/lcs.pyx | 209 +++++++++++ python/obitools3/obidms/_obidms.pxd | 12 +- python/obitools3/obidms/_obidms.pyx | 45 --- python/obitools3/obidms/_obidmscolumn_seq.pyx | 1 - python/obitools3/obidms/_obiseq.pyx | 4 +- python/obitools3/obidms/capi/obialign.pxd | 28 +- src/obi_align.c | 334 +++++++++++++++--- src/obi_align.h | 59 +++- src/sse_banded_LCS_alignment.c | 35 +- src/sse_banded_LCS_alignment.h | 4 +- 11 files changed, 579 insertions(+), 272 deletions(-) delete mode 100644 python/obitools3/commands/align.pyx create mode 100644 python/obitools3/commands/lcs.pyx diff --git a/python/obitools3/commands/align.pyx b/python/obitools3/commands/align.pyx deleted file mode 100644 index 04febc4..0000000 --- a/python/obitools3/commands/align.pyx +++ /dev/null @@ -1,120 +0,0 @@ -from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport -from obitools3.obidms._obidms import OBIDMS, OBIView # TODO cimport doesn't work - - -import time - -__title__="Aligns one sequence column with itself or two sequence columns" - - -default_config = { 'inputview' : None, - } - -def addOptions(parser): - - # TODO put this common group somewhere else but I don't know where - group=parser.add_argument_group('DMS and view options') - - group.add_argument('--default-dms','-d', - action="store", dest="obi:defaultdms", - metavar='', - default=None, - type=str, - help="Name of the default DMS for reading and writing data.") - - group.add_argument('--input-view','-i', - action="store", dest="obi:inputview", - metavar='', - default=None, - type=str, - help="Name of the input view.") - - # TODO eventually 2nd view, or 2nd column? - - group.add_argument('--output-view','-o', - action="store", dest="obi:outputview", - metavar='', - default=None, - type=str, - help="Name of the output view.") - - - group=parser.add_argument_group('obi align specific options') - - group.add_argument('--lcs','-C', - action="store", dest="align:alitype", - metavar='', - default='lcs', - type=str, - help="Compute alignment using the LCS method (default).") - - group.add_argument('--threshold','-t', - action="store", dest="align:threshold", - metavar='', - default=0.0, - type=float, - help="Score threshold. If the score is normalized and expressed in similarity (default)," - " it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized" - " and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%." - " If the score is not normalized and expressed in similarity, it is the length of the" - " Longest Common Subsequence. If the score is not normalized and expressed in distance," - " it is (reference length - LCS length)." - " Only sequence pairs with a similarity above are printed. Default: 0.00" - " (no threshold).") - - group.add_argument('--longest_length','-L', - action="store_const", dest="align:reflength", - default=0, - const=1, - help="The reference length is the length of the longest sequence." - " Default: the reference length is the length of the alignment.") - - group.add_argument('--shortest_length','-l', - action="store_const", dest="align:reflength", - default=0, - const=2, - help="The reference length is the length of the shortest sequence." - " Default: the reference length is the length of the alignment.") - - group.add_argument('--raw','-r', - action="store_false", dest="align:normalize", - default=True, - help="Raw score, not normalized. Default: score is normalized with the reference sequence length.") - - group.add_argument('--distance','-D', - action="store_false", dest="align:similarity", - default=True, - help="Score is expressed in distance. Default: score is expressed in similarity.") - - - -def run(config): - - # Open DMS - d = OBIDMS(config['obi']['defaultdms']) - - # Open input view 1 - iview = d.open_view(config['obi']['inputview']) - - # TODO Open input view 2 if there is one - - # Create output view - oview = d.new_view(config['obi']['outputview']) - - # TODO Take other alignment types into account when they'll be implemented - - # Call cython alignment function - iview.align(oview, threshold=config['align']['threshold'], normalize=config['align']['normalize'], reference=config['align']['reflength'], similarity_mode=config['align']['similarity']) - - print(repr(oview)) - - iview.close() - oview.close() - d.close() - - print("Done.") - - - - - \ No newline at end of file diff --git a/python/obitools3/commands/lcs.pyx b/python/obitools3/commands/lcs.pyx new file mode 100644 index 0000000..0a8b3b0 --- /dev/null +++ b/python/obitools3/commands/lcs.pyx @@ -0,0 +1,209 @@ +#cython: language_level=3 + +from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport +from obitools3.obidms._obidms cimport OBIDMS # TODO cimport doesn't work +from obitools3.utils cimport str2bytes + +from obitools3.obidms.capi.obialign cimport obi_lcs_align_one_column + + +import time + +__title__="Aligns one sequence column with itself or two sequence columns" + + +default_config = { 'inputview' : None, + } + +def addOptions(parser): + + # TODO put this common group somewhere else but I don't know where. + # Also some options should probably be in another group + group=parser.add_argument_group('DMS and view options') + + group.add_argument('--default-dms', '-d', + action="store", dest="obi:defaultdms", + metavar='', + default=None, + type=str, + help="Name of the default DMS for reading and writing data.") + + group.add_argument('--input-view-1', '-i', + action="store", dest="obi:inputview1", + metavar='', + default=None, + type=str, + help="Name of the (first) input view.") + + group.add_argument('--input-view-2', '-I', + action="store", dest="obi:inputview2", + metavar='', + default="", + type=str, + help="Eventually, the name of the second input view.") + + group.add_argument('--input-column-1', '-c', + action="store", dest="obi:inputcolumn1", + metavar='', + default="", + type=str, + help="Name of the (first) input column. " + " Default: the default nucleotide sequence column of the view if there is one.") + + group.add_argument('--input-column-2', '-C', + action="store", dest="obi:inputcolumn2", + metavar='', + default="", + type=str, + help="Eventually, the name of the second input column.") + + group.add_argument('--input-elt-1', '-e', + action="store", dest="obi:inputelement1", + metavar='', + default="", + type=str, + help="If the first input column has multiple elements per line, name of the element referring to the sequence to align. " + " Default: the first element of the line.") + + group.add_argument('--input-elt-2', '-E', + action="store", dest="obi:inputelement2", + metavar='', + default="", + type=str, + help="If the second input column has multiple elements per line, name of the element referring to the sequence to align. " + " Default: the first element of the line.") + + group.add_argument('--id-column-1', '-f', + action="store", dest="obi:idcolumn1", + metavar='', + default="", + type=str, + help="Name of the (first) column containing the identifiers of the sequences to align. " + " Default: the default ID column of the view if there is one.") + + group.add_argument('--id-column-2', '-F', + action="store", dest="obi:idcolumn2", + metavar='', + default="", + type=str, + help="Eventually, the name of the second ID column.") + + group.add_argument('--output-view', '-o', + action="store", dest="obi:outputview", + metavar='', + default=None, + type=str, + help="Name of the output view.") + + + group=parser.add_argument_group('obi lcs specific options') + + group.add_argument('--threshold','-t', + action="store", dest="align:threshold", + metavar='', + default=0.0, + type=float, + help="Score threshold. If the score is normalized and expressed in similarity (default)," + " it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized" + " and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%." + " If the score is not normalized and expressed in similarity, it is the length of the" + " Longest Common Subsequence. If the score is not normalized and expressed in distance," + " it is (reference length - LCS length)." + " Only sequence pairs with a similarity above are printed. Default: 0.00" + " (no threshold).") + + group.add_argument('--longest-length','-L', + action="store_const", dest="align:reflength", + default=0, + const=1, + help="The reference length is the length of the longest sequence." + " Default: the reference length is the length of the alignment.") + + group.add_argument('--shortest-length','-l', + action="store_const", dest="align:reflength", + default=0, + const=2, + help="The reference length is the length of the shortest sequence." + " Default: the reference length is the length of the alignment.") + + group.add_argument('--raw','-r', + action="store_false", dest="align:normalize", + default=True, + help="Raw score, not normalized. Default: score is normalized with the reference sequence length.") + + group.add_argument('--distance','-D', + action="store_false", dest="align:similarity", + default=True, + help="Score is expressed in distance. Default: score is expressed in similarity.") + + group.add_argument('--print-seq','-s', + action="store_true", dest="align:printseq", + default=False, + help="The nucleotide sequences are written in the output view. Default: they are not written.") + + group.add_argument('--print-count','-n', + action="store_true", dest="align:printcount", + default=False, + help="Sequence counts are written in the output view. Default: they are not written.") + + +cpdef align(str dms_n, + str input_view_1_n, str output_view_n, + str input_view_2_n="", + str input_column_1_n="", str input_column_2_n="", + str input_elt_1_n="", str input_elt_2_n="", + str id_column_1_n="", str id_column_2_n="", + double threshold=0.0, bint normalize=True, + int reference=0, bint similarity_mode=True, + bint print_seq=False, bint print_count=False, + comments="") : + + cdef OBIDMS d + d = OBIDMS(dms_n) + + # Align 1 column (2 columns not implemented yet) + if obi_lcs_align_one_column(d._pointer, \ + str2bytes(input_view_1_n), \ + str2bytes(input_column_1_n), \ + str2bytes(input_elt_1_n), \ + str2bytes(id_column_1_n), \ + str2bytes(output_view_n), \ + str2bytes(comments), \ + print_seq, \ + print_count, \ + threshold, normalize, reference, similarity_mode) < 0 : + raise Exception("Error aligning sequences") + + d.close() + + +def run(config): + + # TODO: Build formatted comments with all parameters etc + comments = "Obi align" + + # Call cython alignment function + align(config['obi']['defaultdms'], \ + config['obi']['inputview1'], \ + config['obi']['outputview'], \ + input_view_2_n = config['obi']['inputview2'], \ + input_column_1_n = config['obi']['inputcolumn1'], \ + input_column_2_n = config['obi']['inputcolumn2'], \ + input_elt_1_n = config['obi']['inputelement1'], \ + input_elt_2_n = config['obi']['inputelement2'], \ + id_column_1_n = config['obi']['idcolumn1'], \ + id_column_2_n = config['obi']['idcolumn2'], \ + threshold = config['align']['threshold'], \ + normalize = config['align']['normalize'], \ + reference = config['align']['reflength'], \ + similarity_mode = config['align']['similarity'], \ + print_seq = config['align']['printseq'], \ + print_count = config['align']['printcount'], \ + comments = comments) + + print("Done.") + + + + + \ No newline at end of file diff --git a/python/obitools3/obidms/_obidms.pxd b/python/obitools3/obidms/_obidms.pxd index 64574ad..20c5375 100644 --- a/python/obitools3/obidms/_obidms.pxd +++ b/python/obitools3/obidms/_obidms.pxd @@ -67,16 +67,8 @@ cdef class OBIView: cdef object get_view_subclass(str view_type) -cdef class OBIView_NUC_SEQS(OBIView): - - cpdef align(self, - OBIView oview, - OBIView iview2=*, - double threshold=*, - bint normalize=*, - int reference=*, - bint similarity_mode=* - ) +cdef class OBIView_NUC_SEQS(OBIView) : + pass cdef class OBIView_line : diff --git a/python/obitools3/obidms/_obidms.pyx b/python/obitools3/obidms/_obidms.pyx index cb9bc6a..c35cd14 100644 --- a/python/obitools3/obidms/_obidms.pyx +++ b/python/obitools3/obidms/_obidms.pyx @@ -10,8 +10,6 @@ from .capi.obidmscolumn cimport obi_close_column, \ OBIDMS_column_header_p from .capi.obiutils cimport obi_format_date - -from .capi.obialign cimport obi_align_one_column from .capi.obitypes cimport const_char_p, \ OBIType_t, \ @@ -535,49 +533,6 @@ cdef class OBIView_NUC_SEQS(OBIView): self[line_idx][key] = sequence_obj[key] - # TODO discuss - cpdef align(self, OBIView oview, OBIView iview2=None, - double threshold=0.0, bint normalize=True, int reference=0, bint similarity_mode=True) : - pass -# -# cdef OBIView iview1 -# -# cdef Obiview_p iview1_p -# cdef Obiview_p iview2_p -# cdef Obiview_p oview_p -# -# cdef OBIDMS_column icol1 -# cdef OBIDMS_column_p icol1_p -# cdef OBIDMS_column_p* icol1_pp -# -# cdef OBIDMS_column id1_col -# cdef OBIDMS_column_p id1_col_p -# cdef OBIDMS_column_p* id1_col_pp -# -# cdef OBIDMS_column id2_col -# cdef OBIDMS_column_p id2_col_p -# cdef OBIDMS_column_p* id2_col_pp -# -# cdef OBIDMS_column ocol -# cdef OBIDMS_column_p ocol_p -# cdef OBIDMS_column_p* ocol_pp -# -# cdef str id1_col_name -# cdef str id2_col_name -# cdef str score_col_name -# -# score_col_name = "score" -# -# iview1= self -# iview1_p = iview1._pointer -# icol1 = iview1[bytes2str(NUC_SEQUENCE_COLUMN)] -# icol1_pp = icol1._pointer -# icol1_p = icol1_pp[0] -# -# if obi_align_one_column(iview1_p, icol1_p, threshold, normalize, reference, similarity_mode) < 0 : -# raise Exception("Error aligning sequences") - - ###################################################################################################### diff --git a/python/obitools3/obidms/_obidmscolumn_seq.pyx b/python/obitools3/obidms/_obidmscolumn_seq.pyx index 5ead66d..77c8335 100644 --- a/python/obitools3/obidms/_obidmscolumn_seq.pyx +++ b/python/obitools3/obidms/_obidmscolumn_seq.pyx @@ -4,7 +4,6 @@ from .capi.obiview cimport obi_get_seq_with_elt_name_and_col_p_in_view, \ obi_get_seq_with_elt_idx_and_col_p_in_view, \ obi_set_seq_with_elt_name_and_col_p_in_view, \ obi_set_seq_with_elt_idx_and_col_p_in_view -from .capi.obialign cimport obi_align_one_column from .capi.obierrno cimport obi_errno from .capi.obitypes cimport OBISeq_NA, const_char_p diff --git a/python/obitools3/obidms/_obiseq.pyx b/python/obitools3/obidms/_obiseq.pyx index 1e4709e..7bdf37b 100644 --- a/python/obitools3/obidms/_obiseq.pyx +++ b/python/obitools3/obidms/_obiseq.pyx @@ -102,12 +102,12 @@ cdef class OBI_Nuc_Seq_Stored(OBIView_line) : return self[bytes2str(QUALITY_COLUMN)] @quality.setter def quality(self, object new_qual): - if (type(new_qual) == list) or (new_qual is None) : + if (type(new_qual) == list) or (new_qual is None) : # TODO check that quality column exists self[bytes2str(QUALITY_COLUMN)] = new_qual else : # Quality is in str form (((self._view).columns)[bytes2str(QUALITY_COLUMN)]).set_str_line(self._index, new_qual) - cpdef object get_str_quality(self) : # TODO not ideal + cpdef object get_str_quality(self) : # TODO not ideal. Make quality_int and quality_str properties return ((self._view).columns)[bytes2str(QUALITY_COLUMN)].get_str_line(self._index) # cpdef str reverse_complement(self) : TODO in C ? diff --git a/python/obitools3/obidms/capi/obialign.pxd b/python/obitools3/obidms/capi/obialign.pxd index 753623d..e9c105c 100644 --- a/python/obitools3/obidms/capi/obialign.pxd +++ b/python/obitools3/obidms/capi/obialign.pxd @@ -1,20 +1,22 @@ #cython: language_level=3 -from ..capi.obiview cimport Obiview_p -from ..capi.obidmscolumn cimport OBIDMS_column_p +from obitools3.obidms.capi.obidms cimport OBIDMS_p +from obitools3.obidms.capi.obitypes cimport const_char_p cdef extern from "obi_align.h" nogil: - int obi_align_one_column(Obiview_p seq_view, - OBIDMS_column_p seq_column, - const char* seq_name, - Obiview_p score_view, - OBIDMS_column_p id1_column, - OBIDMS_column_p id2_column, - OBIDMS_column_p score_column, - double threshold, - bint normalize, - int reference, - bint similarity_mode) + int obi_lcs_align_one_column(OBIDMS_p dms, + const_char_p seq_view_name, + const_char_p seq_column_name, + const_char_p seq_elt_name, + const_char_p id_column_name, + const_char_p output_view_name, + const_char_p output_view_comments, + bint print_seq, + bint print_count, + double threshold, + bint normalize, + int reference, + bint similarity_mode) diff --git a/src/obi_align.c b/src/obi_align.c index 8453a7c..390ff60 100644 --- a/src/obi_align.c +++ b/src/obi_align.c @@ -14,6 +14,7 @@ #include #include +#include "obi_align.h" #include "obidebug.h" #include "obierrno.h" #include "obitypes.h" @@ -28,67 +29,227 @@ // TODO // use openMP pragmas -// option pour ecrire en stdint? -// check NUC_SEQS view type? and score type (int or float if normalize) -// what's with multiple sequences/line columns? -int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const char* seq_name, - Obiview_p score_view, OBIDMS_column_p id1_column, OBIDMS_column_p id2_column, OBIDMS_column_p score_column, - double threshold, bool normalize, int reference, bool similarity_mode) +int obi_lcs_align_one_column(OBIDMS_p dms, const char* seq_view_name, const char* seq_column_name, const char* seq_elt_name, + const char* id_column_name, + const char* output_view_name, const char* output_view_comments, + bool print_seq, bool print_count, + double threshold, bool normalize, int reference, bool similarity_mode) { index_t i, j, k; index_t seq_count; - const char* id1; - const char* id2; + index_t id1_idx, id2_idx; + index_t seq1_idx, seq2_idx; double score; - OBIDMS_column_p id_column; + int lcs_length; + int ali_length; Kmer_table_p ktable; Obi_blob_p blob1; Obi_blob_p blob2; int lcs_min; - index_t seq_idx; + index_t seq_elt_idx; + + Obiview_p seq_view = NULL; + Obiview_p output_view = NULL; + OBIDMS_column_p iseq_column = NULL; + OBIDMS_column_p id_column; + OBIDMS_column_p id1_column = NULL; + OBIDMS_column_p id2_column = NULL; + OBIDMS_column_p seq1_column = NULL; + OBIDMS_column_p seq2_column = NULL; + //OBIDMS_column_p count1_column = NULL; + //OBIDMS_column_p count2_column = NULL; + OBIDMS_column_p idx1_column = NULL; + OBIDMS_column_p idx2_column = NULL; + OBIDMS_column_p lcs_length_column = NULL; + OBIDMS_column_p ali_length_column = NULL; + OBIDMS_column_p score_column = NULL; k = 0; - // If no sequence column is given and the view has the type NUC_SEQS_VIEW, the default sequence column is aligned - if ((seq_column == NULL) && (strcmp((seq_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0)) + // Open input view + seq_view = obi_open_view(dms, seq_view_name); + if (seq_view == NULL) { - seq_column = obi_view_get_column(seq_view, NUC_SEQUENCE_COLUMN); - if (seq_column == NULL) + obidebug(1, "\nError opening the input view to align"); + return -1; + } + + // Open the sequence column to align + // If a column name wasn't given, open default sequence column + if (strcmp(seq_column_name, "") == 0) + { + if (strcmp((seq_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0) + iseq_column = obi_view_get_column(seq_view, NUC_SEQUENCE_COLUMN); + else + { + obi_set_errno(OBI_ALIGN_ERROR); + obidebug(1, "\nError: no column given to align"); return -1; + } } - // Check that the given sequence column contains nucleotide sequences - else if ((seq_column->header)->returned_data_type != OBI_SEQ) + else + iseq_column = obi_view_get_column(seq_view, seq_column_name); + if (iseq_column == NULL) { - obi_set_errno(OBI_ALIGN_ERROR); - obidebug(1, "\nTrying to align a column of a different type than OBI_SEQ"); + obidebug(1, "\nError getting the column to align"); return -1; } - if ((normalize && ((score_column->header)->returned_data_type != OBI_FLOAT)) || - (!normalize && ((score_column->header)->returned_data_type != OBI_INT))) + // Get element index of the sequence to align in each line to compute it only once + if ((strcmp(seq_elt_name, "") != 0) && (seq_elt_name != NULL)) { - obi_set_errno(OBI_ALIGN_ERROR); - obidebug(1, "\nTrying to store alignment scores in a column of an inappropriate type"); - return -1; - } - - // Get element index from element name to compute it only once - if (seq_name != NULL) - { - seq_idx = obi_column_get_element_index_from_name(seq_column, seq_name); - if (seq_idx == OBIIdx_NA) + seq_elt_idx = obi_column_get_element_index_from_name(iseq_column, seq_elt_name); + if (seq_elt_idx == OBIIdx_NA) { obidebug(1, "\nError getting the sequence index in a column line when aligning"); return -1; } } else - seq_idx = 0; + seq_elt_idx = 0; + + // Open the ID column, containing the identifiers of the sequences to align + // If a column name wasn't given, open default ID column + if (strcmp(id_column_name, "") == 0) + { + if (strcmp((seq_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0) + id_column = obi_view_get_column(seq_view, ID_COLUMN); + else + { + obi_set_errno(OBI_ALIGN_ERROR); + obidebug(1, "\nError: no ID column given"); + return -1; + } + } + else + id_column = obi_view_get_column(seq_view, id_column_name); + if (id_column == NULL) + { + obidebug(1, "\nError getting the ID column"); + return -1; + } + + // Create the output view + output_view = obi_new_view(dms, output_view_name, NULL, NULL, output_view_comments); + if (output_view == NULL) + { + obidebug(1, "\nError creating the output view when aligning"); + return -1; + } + + // Create the output columns + + // Create the column for the ids of the 1st sequence aligned + if (obi_view_add_column(output_view, ID1_COLUMN_NAME, -1, ID1_COLUMN_NAME, OBI_STR, 0, 1, NULL, (id_column->header)->indexer_name, NULL, -1, ID1_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the first column for the sequence ids when aligning"); + return -1; + } + id1_column = obi_view_get_column(output_view, ID1_COLUMN_NAME); + + // Create the column for the ids of the 2nd sequence aligned + if (obi_view_add_column(output_view, ID2_COLUMN_NAME, -1, ID2_COLUMN_NAME, OBI_STR, 0, 1, NULL, (id_column->header)->indexer_name, NULL, -1, ID2_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the second column for the sequence ids when aligning"); + return -1; + } + id2_column = obi_view_get_column(output_view, ID2_COLUMN_NAME); + + // Create the column for the index (in the input view) of the first sequences aligned + if (obi_view_add_column(output_view, IDX1_COLUMN_NAME, -1, IDX1_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, IDX1_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the first column for the sequence indices when aligning"); + return -1; + } + idx1_column = obi_view_get_column(output_view, IDX1_COLUMN_NAME); + + // Create the column for the index (in the input view) of the second sequences aligned + if (obi_view_add_column(output_view, IDX2_COLUMN_NAME, -1, IDX2_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, IDX2_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the second column for the sequence indices when aligning"); + return -1; + } + idx2_column = obi_view_get_column(output_view, IDX2_COLUMN_NAME); + + // Create the column for the LCS length + if (obi_view_add_column(output_view, LCS_LENGTH_COLUMN_NAME, -1, LCS_LENGTH_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, LCS_LENGTH_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the column for the LCS length when aligning"); + return -1; + } + lcs_length_column = obi_view_get_column(output_view, LCS_LENGTH_COLUMN_NAME); + + // Create the column for the alignment length if it is computed + if ((reference == ALILEN) && (normalize || !similarity_mode)) + { + if (obi_view_add_column(output_view, ALI_LENGTH_COLUMN_NAME, -1, ALI_LENGTH_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, ALI_LENGTH_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the column for the alignment length when aligning"); + return -1; + } + ali_length_column = obi_view_get_column(output_view, ALI_LENGTH_COLUMN_NAME); + } + // Create the column for the alignment score + if (normalize) + { + if (obi_view_add_column(output_view, SCORE_COLUMN_NAME, -1, SCORE_COLUMN_NAME, OBI_FLOAT, 0, 1, NULL, NULL, NULL, -1, SCORE_COLUMN_NAME, true) < 0) + { + obidebug(1, "\nError creating the column for the score when aligning"); + return -1; + } + } + else + { + if (obi_view_add_column(output_view, SCORE_COLUMN_NAME, -1, SCORE_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, SCORE_COLUMN_NAME, true) < 0) + { + obidebug(1, "\nError creating the column for the score when aligning"); + return -1; + } + } + score_column = obi_view_get_column(output_view, SCORE_COLUMN_NAME); + + if (print_seq) + { + // Create the column for the first sequences aligned + if (obi_view_add_column(output_view, SEQ1_COLUMN_NAME, -1, SEQ1_COLUMN_NAME, OBI_SEQ, 0, 1, NULL, (iseq_column->header)->indexer_name, NULL, -1, SEQ1_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the first column for the sequences when aligning"); + return -1; + } + seq1_column = obi_view_get_column(output_view, SEQ1_COLUMN_NAME); + + // Create the column for the second sequences aligned + if (obi_view_add_column(output_view, SEQ2_COLUMN_NAME, -1, SEQ2_COLUMN_NAME, OBI_SEQ, 0, 1, NULL, (iseq_column->header)->indexer_name, NULL, -1, SEQ2_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the second column for the sequences when aligning"); + return -1; + } + seq2_column = obi_view_get_column(output_view, SEQ2_COLUMN_NAME); + } +// if (print_count) // TODO count columns not implemented yet +// { +// // Create the column for the count of the first sequences aligned +// if (obi_view_add_column(output_view, COUNT1_COLUMN_NAME, -1, COUNT1_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, COUNT1_COLUMN_COMMENTS, true) < 0) +// { +// obidebug(1, "\nError creating the first column for the sequence counts when aligning"); +// return -1; +// } +// count1_column = obi_view_get_column(seq_view, COUNT1_COLUMN_NAME); +// +// // Create the column for the count of the second sequences aligned +// if (obi_view_add_column(output_view, COUNT2_COLUMN_NAME, -1, COUNT2_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, COUNT2_COLUMN_COMMENTS, true) < 0) +// { +// obidebug(1, "\nError creating the second column for the sequence counts when aligning"); +// return -1; +// } +// count2_column = obi_view_get_column(seq_view, COUNT2_COLUMN_NAME); +// } + // Build kmer tables - ktable = hash_seq_column(seq_view, seq_column, seq_idx); + ktable = hash_seq_column(seq_view, iseq_column, seq_elt_idx); if (ktable == NULL) { obi_set_errno(OBI_ALIGN_ERROR); @@ -96,10 +257,7 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const c return -1; } - // Get the ID column pointer - id_column = obi_view_get_column(seq_view, ID_COLUMN); - - seq_count = (seq_column->header)->lines_used; + seq_count = (iseq_column->header)->lines_used; for (i=0; i < (seq_count - 1); i++) { @@ -108,8 +266,10 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const c for (j=i+1; j < seq_count; j++) { - blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, seq_idx); - blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, seq_idx); + blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, i, seq_elt_idx); + blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, j, seq_elt_idx); + seq1_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, i, seq_elt_idx); + seq2_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, j, seq_elt_idx); if ((blob1 == NULL) || (blob2 == NULL)) { @@ -118,7 +278,7 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const c } // Check if the sequences are identical in a quick way (same index in the same indexer) - if (obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, seq_idx) == obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, seq_idx)) + if (obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, i, seq_elt_idx) == obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, j, seq_elt_idx)) { if (similarity_mode && normalize) score = 1.0; @@ -135,33 +295,93 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const c // Compute alignment score if ((threshold == 0) || (score == -1.0)) // no threshold, or filter passed: align - score = obiblob_sse_banded_lcs_align(blob1, blob2, threshold, normalize, reference, similarity_mode); + score = obiblob_sse_banded_lcs_align(blob1, blob2, threshold, normalize, reference, similarity_mode, &lcs_length, &ali_length); } if ((score >= 0) && (((normalize || similarity_mode) && (score >= threshold)) || ((!similarity_mode && !normalize) && (score <= threshold)))) - { // Print result + { // Print result // TODO make separate function maybe - // Get sequence ids - id1 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0); // TODO Could there be multiple IDs per line? - id2 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, j, 0); + // Write line indices of the input view in the output view (to easily refer to the input sequences from the output view) + if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, idx1_column, k, 0, i) < 0) + { + obidebug(1, "\nError writing idx1 in a column"); + return -1; + } + if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, idx2_column, k, 0, j) < 0) + { + obidebug(1, "\nError writing idx2 in a column"); + return -1; + } - // Write sequence ids in output view - if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id1_column, k, 0, id1) < 0) + // Get ids idx + id1_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0); // TODO Could there be multiple IDs per line? + id2_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, id_column, j, 0); + + // Write ids in output view + if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, id1_column, k, 0, id1_idx) < 0) { obidebug(1, "\nError writing id1 in a column"); return -1; } - - if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id2_column, k, 0, id2) < 0) + if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, id2_column, k, 0, id2_idx) < 0) { obidebug(1, "\nError writing id2 in a column"); return -1; } - // Write score in output view + // Write the sequences if needed + if (print_seq) + { + if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, seq1_column, k, 0, seq1_idx) < 0) + { + obidebug(1, "\nError writing seq1 in a column"); + return -1; + } + + if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, seq2_column, k, 0, seq2_idx) < 0) + { + obidebug(1, "\nError writing seq2 in a column"); + return -1; + } + } + +// // Write the counts if needed // TODO count columns not implemented yet +// if (print_count) +// { +// if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, count1_column, k, 0, count1) < 0) +// { +// obidebug(1, "\nError writing count1 in a column"); +// return -1; +// } +// +// if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, count2_column, k, 0, count2) < 0) +// { +// obidebug(1, "\nError writing count2 in a column"); +// return -1; +// } +// } + + // Write the alignment length if it was computed + if ((reference == ALILEN) && (normalize || !similarity_mode)) + { + if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, ali_length_column, k, 0, ali_length) < 0) + { + obidebug(1, "\nError writing alignment length in a column"); + return -1; + } + } + + // Write the LCS length + if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, lcs_length_column, k, 0, lcs_length) < 0) + { + obidebug(1, "\nError writing LCS length in a column"); + return -1; + } + + // Write score if (normalize) { - if (obi_set_float_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obifloat_t) score) < 0) + if (obi_set_float_with_elt_idx_and_col_p_in_view(output_view, score_column, k, 0, (obifloat_t) score) < 0) { obidebug(1, "\nError writing alignment score in a column"); return -1; @@ -169,7 +389,7 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const c } else { - if (obi_set_int_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obiint_t) score) < 0) + if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, score_column, k, 0, (obiint_t) score) < 0) { obidebug(1, "\nError writing alignment score in a column"); return -1; @@ -181,6 +401,18 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const c } } + // Close views + if (obi_close_view(seq_view) < 0) + { + obidebug(1, "\nError closing the input view after aligning"); + return -1; + } + if (obi_close_view(output_view) < 0) + { + obidebug(1, "\nError closing the output view after aligning"); + return -1; + } + free_kmer_tables(ktable, seq_count); return 0; diff --git a/src/obi_align.h b/src/obi_align.h index 96ce4de..c0d823c 100644 --- a/src/obi_align.h +++ b/src/obi_align.h @@ -24,15 +24,53 @@ #include "obitypes.h" +/** + * @brief Names and comments of columns automatically created in the output view when aligning. + * + * @since December 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +#define ID1_COLUMN_NAME "ID1" +#define ID1_COLUMN_COMMENTS "ID1" +#define ID2_COLUMN_NAME "ID2" +#define ID2_COLUMN_COMMENTS "ID2" +#define SEQ1_COLUMN_NAME "SEQ1" +#define SEQ1_COLUMN_COMMENTS "SEQ1" +#define SEQ2_COLUMN_NAME "SEQ2" +#define SEQ2_COLUMN_COMMENTS "SEQ2" +#define COUNT1_COLUMN_NAME "COUNT1" +#define COUNT1_COLUMN_COMMENTS "COUNT1" +#define COUNT2_COLUMN_NAME "COUNT2" +#define COUNT2_COLUMN_COMMENTS "COUNT2" +#define IDX1_COLUMN_NAME "IDX1" +#define IDX1_COLUMN_COMMENTS "IDX1" +#define IDX2_COLUMN_NAME "IDX2" +#define IDX2_COLUMN_COMMENTS "IDX2" +#define LCS_LENGTH_COLUMN_NAME "LCS_LENGTH" +#define LCS_LENGTH_COLUMN_COMMENTS "LCS_LENGTH" +#define ALI_LENGTH_COLUMN_NAME "ALI_LENGTH" +#define ALI_LENGTH_COLUMN_COMMENTS "ALI_LENGTH" +#define SCORE_COLUMN_NAME "SCORE" +#define SCORE_COLUMN_COMMENTS "SCORE" + + /** * @brief Aligns a NUC_SEQ column with itself. * - * @param seq_view A pointer on the view where the column to align is. - * @param seq_column A pointer on the OBI_SEQ column to align. - * @param score_view A pointer on the view to write the outputs to. - * @param id1_column A pointer on the OBI_STR column in score_view where the id of the first sequence should be written. - * @param id2_column A pointer on the OBI_STR column in score_view where the id of the second sequence should be written. - * @param score_column A pointer on the OBI_FLOAT column in score_view where the alignment score should be written. + * Note: The columns where the results are written are automatically named and created. + * + * @param dms A pointer on an OBIDMS. + * @param seq_view_name The name of the view where the column to align is. + * @param seq_column_name The name of the OBI_SEQ column in the input view to align. + * If "" (empty string), and the input view is of type NUC_SEQS_VIEW, the associated "NUC_SEQ" column is aligned. + * @param seq_elt_name The name of the element in the column corresponding to the sequence to align, if the column has multiple + * elements per line. + * @param id_column_name The name of the column in the input view containing the identifiers of the sequences to align. + * If "" (empty string), and the input view is of type NUC_SEQS_VIEW, the associated "ID" column is aligned. + * @param output_view_name The name of the output view where the results should be written (should not already exist). + * @param output_view_comments The comments that should be associated with the output view. + * @param print_seq A boolean indicating whether the aligned sequences should be copied in the output view. + * @param print_count A boolean indicating whether the aligned sequence counts should be copied in the output view. * @param threshold Score threshold. If the score is normalized and expressed in similarity, it is an identity, e.g. 0.95 * for an identity of 95%. If the score is normalized and expressed in distance, it is (1.0 - identity), * e.g. 0.05 for an identity of 95%. If the score is not normalized and expressed in similarity, it is @@ -49,9 +87,12 @@ * @since May 2016 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const char* seq_name, - Obiview_p score_view, OBIDMS_column_p id1_column, OBIDMS_column_p id2_column, OBIDMS_column_p score_column, - double threshold, bool normalize, int reference, bool similarity_mode); +int obi_lcs_align_one_column(OBIDMS_p dms, + const char* seq_view_name, const char* seq_column_name, const char* seq_elt_name, + const char* id_column_name, + const char* output_view_name, const char* output_view_comments, + bool print_seq, bool print_count, + double threshold, bool normalize, int reference, bool similarity_mode); /** diff --git a/src/sse_banded_LCS_alignment.c b/src/sse_banded_LCS_alignment.c index 3a483b4..07b5ffe 100644 --- a/src/sse_banded_LCS_alignment.c +++ b/src/sse_banded_LCS_alignment.c @@ -62,7 +62,7 @@ static inline int extract_reg(__m128i r, int p) // TODO warning on length order -void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal, int16_t* address, double* lcs_length, int* ali_length) +void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal, int16_t* address, int* lcs_length, int* ali_length) { register int j; int k1, k2; @@ -288,13 +288,12 @@ void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int // TODO warning on length order -double sse_banded_align_just_lcs(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal) +void sse_banded_align_just_lcs(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal, int* lcs_length) { register int j; int k1, k2; int diff; int l_reg, l_loc; - int16_t l_lcs; int line; int numberOfRegistersPerLine; int numberOfRegistersFor3Lines; @@ -330,7 +329,6 @@ double sse_banded_align_just_lcs(int16_t* seq1, int16_t* seq2, int l1, int l2, i { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError allocating memory for SSE registers for LCS alignment"); - return 0; // TODO DOUBLE_MIN? } // preparer registres SSE @@ -421,12 +419,10 @@ double sse_banded_align_just_lcs(int16_t* seq1, int16_t* seq2, int l1, int l2, i l_loc = l_loc - l_reg*8; // extracting LCS from the registers : - l_lcs = extract_reg((*(p_gap1+l_reg)).i, l_loc); + *lcs_length = extract_reg((*(p_gap1+l_reg)).i, l_loc); // freeing the registers free(SSEregisters); - - return((double) l_lcs); } @@ -561,11 +557,10 @@ void initializeAddressWithGaps(int16_t* address, int bandLengthTotal, int bandLe // TODO warning on length order -double sse_banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, bool normalize, int reference, bool similarity_mode, int16_t* address, int LCSmin) +double sse_banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, bool normalize, int reference, bool similarity_mode, int16_t* address, int LCSmin, int* lcs_length, int* ali_length) { double id; int bandLengthRight, bandLengthLeft, bandLengthTotal; - int ali_length; bandLengthLeft = calculateLeftBandLength(l1, LCSmin); bandLengthRight = calculateRightBandLength(l2, LCSmin); @@ -579,10 +574,12 @@ double sse_banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, bool n if ((reference == ALILEN) && (normalize || !similarity_mode)) { initializeAddressWithGaps(address, bandLengthTotal, bandLengthLeft, l1); - sse_banded_align_lcs_and_ali_len(seq1, seq2, l1, l2, bandLengthLeft, bandLengthTotal, address, &id, &ali_length); + sse_banded_align_lcs_and_ali_len(seq1, seq2, l1, l2, bandLengthLeft, bandLengthTotal, address, lcs_length, ali_length); } else - id = sse_banded_align_just_lcs(seq1, seq2, l1, l2, bandLengthLeft, bandLengthTotal); + sse_banded_align_just_lcs(seq1, seq2, l1, l2, bandLengthLeft, bandLengthTotal, lcs_length); + + id = (double) *lcs_length; // fprintf(stderr, "\nid before normalizations = %f", id); @@ -590,7 +587,7 @@ double sse_banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, bool n if (!similarity_mode && !normalize) switch(reference) { - case ALILEN: id = ali_length - id; + case ALILEN: id = *ali_length - id; break; case MAXLEN: id = l1 - id; break; @@ -600,7 +597,7 @@ double sse_banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, bool n // fprintf(stderr, "\n2>>> %f, %d\n", id, ali_length); if (normalize) switch(reference) { - case ALILEN: id = id / (double) ali_length; + case ALILEN: id = id / (double) *ali_length; break; case MAXLEN: id = id / (double) l1; break; @@ -643,7 +640,7 @@ int calculateLCSmin(int l1, int l2, double threshold, bool normalize, int refere } -double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bool normalize, int reference, bool similarity_mode) +double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bool normalize, int reference, bool similarity_mode, int* lcs_length, int* ali_length) { double id; int l1, l2; @@ -718,14 +715,14 @@ double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bo putSeqInSeq(iseq1, seq2, l2, TRUE); putSeqInSeq(iseq2, seq1, l1, FALSE); // Compute alignment - id = sse_banded_lcs_align(iseq1, iseq2, l2, l1, normalize, reference, similarity_mode, address, LCSmin); + id = sse_banded_lcs_align(iseq1, iseq2, l2, l1, normalize, reference, similarity_mode, address, LCSmin, lcs_length, ali_length); } else { putSeqInSeq(iseq1, seq1, l1, TRUE); putSeqInSeq(iseq2, seq2, l2, FALSE); // Compute alignment - id = sse_banded_lcs_align(iseq1, iseq2, l1, l2, normalize, reference, similarity_mode, address, LCSmin); + id = sse_banded_lcs_align(iseq1, iseq2, l1, l2, normalize, reference, similarity_mode, address, LCSmin, lcs_length, ali_length); } // Free allocated elements @@ -738,7 +735,7 @@ double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bo } -double obiblob_sse_banded_lcs_align(Obi_blob_p seq1, Obi_blob_p seq2, double threshold, bool normalize, int reference, bool similarity_mode) +double obiblob_sse_banded_lcs_align(Obi_blob_p seq1, Obi_blob_p seq2, double threshold, bool normalize, int reference, bool similarity_mode, int* lcs_length, int* ali_length) { double id; int l1, l2; @@ -813,14 +810,14 @@ double obiblob_sse_banded_lcs_align(Obi_blob_p seq1, Obi_blob_p seq2, double thr putBlobInSeq(iseq1, seq2, l2, TRUE); putBlobInSeq(iseq2, seq1, l1, FALSE); // Compute alignment - id = sse_banded_lcs_align(iseq1, iseq2, l2, l1, normalize, reference, similarity_mode, address, LCSmin); + id = sse_banded_lcs_align(iseq1, iseq2, l2, l1, normalize, reference, similarity_mode, address, LCSmin, lcs_length, ali_length); } else { putBlobInSeq(iseq1, seq1, l1, TRUE); putBlobInSeq(iseq2, seq2, l2, FALSE); // Compute alignment - id = sse_banded_lcs_align(iseq1, iseq2, l1, l2, normalize, reference, similarity_mode, address, LCSmin); + id = sse_banded_lcs_align(iseq1, iseq2, l1, l2, normalize, reference, similarity_mode, address, LCSmin, lcs_length, ali_length); } // Free allocated elements diff --git a/src/sse_banded_LCS_alignment.h b/src/sse_banded_LCS_alignment.h index 5aed869..23f3358 100644 --- a/src/sse_banded_LCS_alignment.h +++ b/src/sse_banded_LCS_alignment.h @@ -21,7 +21,7 @@ // TODO doc int calculateLCSmin(int l1, int l2, double threshold, bool normalize, int reference, bool lcsmode); -double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bool normalize, int reference, bool similarity_mode); -double obiblob_sse_banded_lcs_align(Obi_blob_p seq1, Obi_blob_p seq2, double threshold, bool normalize, int reference, bool similarity_mode); +double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bool normalize, int reference, bool similarity_mode, int* lcs_length, int* ali_length); +double obiblob_sse_banded_lcs_align(Obi_blob_p seq1, Obi_blob_p seq2, double threshold, bool normalize, int reference, bool similarity_mode, int* lcs_length, int* ali_length); #endif