diff --git a/python/obitools3/commands/align.pyx b/python/obitools3/commands/align.pyx new file mode 100644 index 0000000..9cadecd --- /dev/null +++ b/python/obitools3/commands/align.pyx @@ -0,0 +1,128 @@ +from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport +from obitools3.obidms._obidms import OBIDMS, OBIView # TODO cimport doesn't work + + +import time + +__title__="Aligns one sequence column with itself or two sequence columns" + + +default_config = { 'inputview' : None, + 'skip' : 0, + 'only' : None, + 'skiperror' : False, + 'moltype' : 'nuc', + } + +def addOptions(parser): + + # TODO put this common group somewhere else but I don't know where + group=parser.add_argument_group('DMS and view options') + + group.add_argument('--default-dms','-d', + action="store", dest="obi:defaultdms", + metavar='', + default=None, + type=str, + help="Name of the default DMS for reading and writing data.") + + group.add_argument('--input-view','-i', + action="store", dest="obi:inputview", + metavar='', + default=None, + type=str, + help="Name of the input view, either raw if the view is in the default DMS," + " or in the form 'dms:view' if it is in another DMS.") + + # TODO eventually 2nd view, or 2nd column? + + group.add_argument('--output-view','-o', + action="store", dest="obi:outputview", + metavar='', + default=None, + type=str, + help="Name of the output view, either raw if the view is in the default DMS," + " or in the form 'dms:view' if it is in another DMS.") + + + group=parser.add_argument_group('obi align specific options') + + group.add_argument('--lcs','-C', + action="store", dest="align:alitype", + metavar='', + default='lcs', + type=str, + help="Compute alignment using the LCS method.") + + group.add_argument('--threshold','-t', + action="store", dest="align:threshold", + metavar='', + default=0.0, + type=float, + help="Score threshold. If the score is normalized and expressed in similarity (default)," + " it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized" + " and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%." + " If the score is not normalized and expressed in similarity, it is the length of the" + " Longest Common Subsequence. If the score is not normalized and expressed in distance," + " it is (reference length - LCS length)." + " Only sequence pairs with a similarity above are printed. Default: 0.00" + " (no threshold).") + + group.add_argument('--longest_length','-L', + action="store_const", dest="align:reflength", + default="ali", + const="longest", + help="The reference length is the length of the longest sequence." + " Default: the reference length is the length of the alignment.") + + group.add_argument('--shortest_length','-l', + action="store_const", dest="align:reflength", + default="ali", + const="shortest", + help="The reference length is the length of the shortest sequence." + " Default: the reference length is the length of the alignment.") + + group.add_argument('--raw','-r', + action="store_false", dest="align:normalize", + default=True, + help="Raw score, not normalized. Default: score is normalized with the reference sequence length.") + + group.add_argument('--distance','-D', + action="store_false", dest="align:similarity", + default=True, + help="Score is expressed in distance. Default: score is expressed in similarity.") + + + +def run(config): + + #pb = ProgressBar(1, config, seconde=5) # TODO + + # Open DMS + d = OBIDMS(config['obi']['defaultdms']) + + # Open input view 1 + iview = d.open_view(config['obi']['inputview']) + + # TODO Open input view 2 if there is one + + # Create output view if necessary + if config['obi']['outputview'] is not None : + oview = d.new_view(config['obi']['outputview']) + else : + oview = None + + # TODO Take other alignment types into account when they'll be implemented + + # Call cython alignment function + iview.align(output_view=oview) + + print("Done.") + + + + + + + + \ No newline at end of file diff --git a/python/obitools3/obidms/_obidms.pxd b/python/obitools3/obidms/_obidms.pxd index e334c6d..b91f0ce 100644 --- a/python/obitools3/obidms/_obidms.pxd +++ b/python/obitools3/obidms/_obidms.pxd @@ -77,7 +77,15 @@ cdef class OBIView_NUC_SEQS(OBIView): cdef OBIDMS_column qualities cpdef delete_column(self, str column_name) - + cpdef align(self, + OBIView iview2=*, + object output_view=*, + double threshold=*, + bint normalize=*, + int reference=*, + bint similarity_mode=* + ) + cdef class OBIView_line : diff --git a/python/obitools3/obidms/_obidms.pyx b/python/obitools3/obidms/_obidms.pyx index c5903da..c8ec1bb 100644 --- a/python/obitools3/obidms/_obidms.pyx +++ b/python/obitools3/obidms/_obidms.pyx @@ -10,7 +10,9 @@ from .capi.obidmscolumn cimport obi_close_column, \ OBIDMS_column_header_p from .capi.obiutils cimport obi_format_date - + +from .capi.obialign cimport obi_align_one_column + from .capi.obitypes cimport const_char_p, \ OBIType_t, \ OBI_INT, \ @@ -454,6 +456,7 @@ cdef class OBIView : for line in self.__iter__() : to_print = to_print + str(line) + "\n" return to_print + ############################################# @@ -524,7 +527,76 @@ cdef class OBIView_NUC_SEQS(OBIView): def __setitem__(self, index_t line_idx, OBI_Nuc_Seq sequence_obj) : for key in sequence_obj : self[line_idx][key] = sequence_obj[key] - + + + # TODO + cpdef align(self, OBIView iview2=None, object output_view=None, + double threshold=0.0, bint normalize=True, int reference=0, bint similarity_mode=True) : + + cdef OBIView iview1 + cdef OBIView oview + + cdef Obiview_p iview1_p + cdef Obiview_p iview2_p + cdef Obiview_p oview_p + + cdef OBIDMS_column icol1 + cdef OBIDMS_column_p icol1_p + cdef OBIDMS_column_p* icol1_pp + + cdef OBIDMS_column id1_col + cdef OBIDMS_column_p id1_col_p + cdef OBIDMS_column_p* id1_col_pp + + cdef OBIDMS_column id2_col + cdef OBIDMS_column_p id2_col_p + cdef OBIDMS_column_p* id2_col_pp + + cdef OBIDMS_column ocol + cdef OBIDMS_column_p ocol_p + cdef OBIDMS_column_p* ocol_pp + + cdef str id1_col_name + cdef str id2_col_name + cdef str score_col_name + + id1_col_name = "ID1" # TODO discuss names, aliases + id2_col_name = "ID2" + score_col_name = "score" + + iview1= self + iview1_p = iview1.pointer + icol1 = iview1[bytes2str(NUC_SEQUENCE_COLUMN)] + icol1_pp = icol1.pointer + icol1_p = icol1_pp[0] + + # Create the output view if needed + if output_view is None : + oview = self.dms.new_view("alignment_score_view") # TODO discuss + elif type(output_view) == str : + oview = self.dms.new_view(output_view) + else : + oview = output_view + + oview.add_column(id1_col_name, type='OBI_STR', create=True) + oview.add_column(id2_col_name, type='OBI_STR', create=True) + oview.add_column(score_col_name, type='OBI_FLOAT', create=True) + + oview_p = oview.pointer + ocol = oview[score_col_name] + ocol_pp = ocol.pointer + ocol_p = ocol_pp[0] + + id1_col = oview[id1_col_name] + id2_col = oview[id2_col_name] + id1_col_pp = id1_col.pointer + id2_col_pp = id2_col.pointer + id1_col_p = id1_col_pp[0] + id2_col_p = id2_col_pp[0] + + if obi_align_one_column(iview1_p, icol1_p, oview_p, id1_col_p, id2_col_p, ocol_p, threshold, normalize, reference, similarity_mode) < 0 : + raise Exception("Error aligning sequences") + ############################################# diff --git a/python/obitools3/obidms/_obidmscolumn_seq.pxd b/python/obitools3/obidms/_obidmscolumn_seq.pxd index 136a9a2..5319b38 100644 --- a/python/obitools3/obidms/_obidmscolumn_seq.pxd +++ b/python/obitools3/obidms/_obidmscolumn_seq.pxd @@ -8,17 +8,6 @@ cdef class OBIDMS_column_seq(OBIDMS_column): cpdef object get_line(self, index_t line_nb) cpdef set_line(self, index_t line_nb, object value) - # TO DISCUSS : - # I'am not sure that this method has to be declared here - # Alignment must be declared outside of the sequence object - cpdef align(self, - OBIView score_view, - OBIDMS_column score_column, - double threshold = *, - bint normalize = *, - int reference = *, - bint similarity_mode = *) - cdef class OBIDMS_column_multi_elts_seq(OBIDMS_column_multi_elts): cpdef object get_item(self, index_t line_nb, str element_name) diff --git a/python/obitools3/obidms/_obidmscolumn_seq.pyx b/python/obitools3/obidms/_obidmscolumn_seq.pyx index 134289c..76881d0 100644 --- a/python/obitools3/obidms/_obidmscolumn_seq.pyx +++ b/python/obitools3/obidms/_obidmscolumn_seq.pyx @@ -37,18 +37,6 @@ cdef class OBIDMS_column_seq(OBIDMS_column): else : if obi_set_seq_with_elt_idx_and_col_p_in_view(self.view.pointer, (self.pointer)[0], line_nb, 0, str2bytes(value)) < 0: raise Exception("Problem setting a value in a column") - - # TODO choose alignment type (lcs or other) with supplementary argument - cpdef align(self, - OBIView score_view, - OBIDMS_column score_column, - double threshold = 0.0, - bint normalize = True, - int reference = 0, # TODO - bint similarity_mode = True): - if (obi_align_one_column(self.view.pointer, (self.pointer)[0], score_view.pointer, (score_column.pointer)[0], threshold, normalize, reference, similarity_mode) < 0) : - raise Exception("An error occurred while aligning sequences") - cdef class OBIDMS_column_multi_elts_seq(OBIDMS_column_multi_elts): diff --git a/python/obitools3/obidms/capi/obialign.pxd b/python/obitools3/obidms/capi/obialign.pxd index 9e9f59d..8886495 100644 --- a/python/obitools3/obidms/capi/obialign.pxd +++ b/python/obitools3/obidms/capi/obialign.pxd @@ -6,5 +6,14 @@ from ..capi.obidmscolumn cimport OBIDMS_column_p cdef extern from "obi_align.h" nogil: - int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, Obiview_p score_view, OBIDMS_column_p score_column, double threshold, bint normalize, int reference, bint similarity_mode) + int obi_align_one_column(Obiview_p seq_view, + OBIDMS_column_p seq_column, + Obiview_p score_view, + OBIDMS_column_p id1_column, + OBIDMS_column_p id2_column, + OBIDMS_column_p score_column, + double threshold, + bint normalize, + int reference, + bint similarity_mode) diff --git a/src/obi_align.c b/src/obi_align.c index 4bdc214..3cad4e4 100644 --- a/src/obi_align.c +++ b/src/obi_align.c @@ -25,20 +25,25 @@ // TODO -// use openMP pragmas : garder scores en memoire et ecrire a la fin? (normalement c bon avec index) -// tout ecrire en stdint? -// check NUC_SEQS and score type (int or float if normalize) -// what's with multiple sequence/line columns? +// use openMP pragmas +// option pour ecrire en stdint? +// check NUC_SEQS view type? and score type (int or float if normalize) +// what's with multiple sequences/line columns? // make function that put blobs in int16 -int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, Obiview_p score_view, OBIDMS_column_p score_column, double threshold, bool normalize, int reference, bool similarity_mode) +int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, + Obiview_p score_view, OBIDMS_column_p id1_column, OBIDMS_column_p id2_column, OBIDMS_column_p score_column, + double threshold, bool normalize, int reference, bool similarity_mode) { index_t i, j, k; index_t seq_count; char* seq1; char* seq2; + const char* id1; + const char* id2; double score; + OBIDMS_column_p id_column; k = 0; @@ -57,6 +62,9 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, Obiview return -1; } + // Get the ID column pointer + id_column = obi_view_get_column(seq_view, ID_COLUMN); + seq_count = (seq_column->header)->lines_used; for (i=0; i < (seq_count - 1); i++) @@ -65,8 +73,8 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, Obiview { //fprintf(stderr, "\ni=%lld, j=%lld, k=%lld", i, j, k); - seq1 = obi_column_get_obiseq_with_elt_idx_in_view(seq_view, seq_column, i, 0); - seq2 = obi_column_get_obiseq_with_elt_idx_in_view(seq_view, seq_column, j, 0); + seq1 = obi_get_seq_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, 0); + seq2 = obi_get_seq_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, 0); if ((seq1 == NULL) || (seq2 == NULL)) { @@ -74,12 +82,32 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, Obiview return -1; } - // TODO filter + // TODO kmer filter + + // Compute alignment score score = generic_sse_banded_lcs_align(seq1, seq2, threshold, normalize, reference, similarity_mode); + // Get sequence ids + id1 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0); + id2 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, j, 0); + + // Write sequence ids in output view + if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id1_column, k, 0, id1) < 0) + { + obidebug(1, "\nError writing id1 in a column"); + return -1; + } + + if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id2_column, k, 0, id2) < 0) + { + obidebug(1, "\nError writing id2 in a column"); + return -1; + } + + // Write score in output view if (normalize) { - if (obi_column_set_obifloat_with_elt_idx_in_view(score_view, score_column, k, 0, (obifloat_t) score) < 0) + if (obi_set_float_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obifloat_t) score) < 0) { obidebug(1, "\nError writing alignment score in a column"); return -1; @@ -87,13 +115,16 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, Obiview } else { - if (obi_column_set_obiint_with_elt_idx_in_view(score_view, score_column, k, 0, (obiint_t) score) < 0) + if (obi_set_int_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obiint_t) score) < 0) { obidebug(1, "\nError writing alignment score in a column"); return -1; } } + free(seq1); + free(seq2); + k++; } } @@ -102,8 +133,82 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, Obiview } -int obi_align_two_columns(OBIDMS_column_p seq_column_1, OBIDMS_column_p seq_column_2, OBIDMS_column_p score_column, double threshold, bool normalize, int reference, bool similarity_mode) -{ - // TODO - return 0; -} +// TODO discuss if 2 input views or 2 columns or both possible +//int obi_align_two_columns(Obiview_p seq_view, OBIDMS_column_p seq_column_1, OBIDMS_column_p seq_column_2, // TODO it's implied both seq columns are in the same view but maybe it shouldn't +// Obiview_p score_view, OBIDMS_column_p score_column, +// double threshold, bool normalize, int reference, bool similarity_mode) +//{ +// index_t i, j, k; +// index_t seq_count_1; +// index_t seq_count_2; +// char* seq1; +// char* seq2; +// double score; +// +// k = 0; +// +// if (((seq_column_1->header)->returned_data_type != OBI_SEQ) || ((seq_column_2->header)->returned_data_type != OBI_SEQ)) +// { +// obi_set_errno(OBI_ALIGN_ERROR); +// obidebug(1, "\nTrying to align a column of a different type than OBI_SEQ"); +// return -1; +// } +// +// if ((normalize && ((score_column->header)->returned_data_type != OBI_FLOAT)) || +// (!normalize && ((score_column->header)->returned_data_type != OBI_INT))) +// { +// obi_set_errno(OBI_ALIGN_ERROR); +// obidebug(1, "\nTrying to store alignment scores in a column of an inappropriate type"); +// return -1; +// } +// +// seq_count_1 = (seq_column_1->header)->lines_used; +// seq_count_2 = (seq_column_2->header)->lines_used; +// +// for (i=0; i < (seq_count_1 - 1); i++) +// { +// for (j=0; j < seq_count_2; j++) +// { +// //fprintf(stderr, "\ni=%lld, j=%lld, k=%lld", i, j, k); +// +// seq1 = obi_get_seq_with_elt_idx_and_col_p_in_view(seq_view, seq_column_1, i, 0); +// seq2 = obi_get_seq_with_elt_idx_and_col_p_in_view(seq_view, seq_column_2, j, 0); +// +// if ((seq1 == NULL) || (seq2 == NULL)) +// { +// obidebug(1, "\nError retrieving sequences to align"); +// return -1; +// } +// +// // TODO kmer filter +// +// score = generic_sse_banded_lcs_align(seq1, seq2, threshold, normalize, reference, similarity_mode); +// +// if (normalize) +// { +// if (obi_set_float_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obifloat_t) score) < 0) +// { +// obidebug(1, "\nError writing alignment score in a column"); +// return -1; +// } +// } +// else +// { +// if (obi_set_int_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obiint_t) score) < 0) +// { +// obidebug(1, "\nError writing alignment score in a column"); +// return -1; +// } +// } +// +// free(seq1); +// free(seq2); +// +// k++; +// } +// } +// +// return 0; +//} + + diff --git a/src/obi_align.h b/src/obi_align.h index ed8aa93..4859e05 100644 --- a/src/obi_align.h +++ b/src/obi_align.h @@ -23,6 +23,35 @@ #include "obitypes.h" +/** + * @brief Aligns a NUC_SEQ column with itself. + * + * @param seq_view A pointer on the view where the column to align is. + * @param seq_column A pointer on the OBI_SEQ column to align. + * @param score_view A pointer on the view to write the outputs to. + * @param id1_column A pointer on the OBI_STR column in score_view where the id of the first sequence should be written. + * @param id2_column A pointer on the OBI_STR column in score_view where the id of the second sequence should be written. + * @param score_column A pointer on the OBI_FLOAT column in score_view where the alignment score should be written. + * @param threshold Score threshold. If the score is normalized and expressed in similarity, it is an identity, e.g. 0.95 + * for an identity of 95%. If the score is normalized and expressed in distance, it is (1.0 - identity), + * e.g. 0.05 for an identity of 95%. If the score is not normalized and expressed in similarity, it is + * the length of the Longest Common Subsequence. If the score is not normalized and expressed in distance, + * it is (reference length - LCS length). Only sequence pairs with a similarity above the threshold are printed. + * @param normalize Whether the score should be normalized with the reference sequence length. + * @param reference The reference length. 0: The alignement length; 1: The longest sequence's length; 2: The shortest sequence's length. + * @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false). + * + * @returns A value indicating the success of the operation. + * @retval 0 if the operation was successfully completed. + * @retval -1 if an error occurred. + * + * @since May 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, + Obiview_p score_view, OBIDMS_column_p id1_column, OBIDMS_column_p id2_column, OBIDMS_column_p score_column, + double threshold, bool normalize, int reference, bool similarity_mode); + /** * @brief @@ -30,7 +59,9 @@ * TODO * */ -int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, Obiview_p score_view, OBIDMS_column_p score_column, double threshold, bool normalize, int reference, bool similarity_mode); +//int obi_align_two_columns(Obiview_p seq_view, OBIDMS_column_p seq_column_1, OBIDMS_column_p seq_column_2, +// Obiview_p score_view, OBIDMS_column_p score_column, +// double threshold, bool normalize, int reference, bool similarity_mode); #endif /* OBI_ALIGN_H_ */