Alignment: API rework. 'obi align' is now 'obi lcs', and the results are now
written to columns automatically created in the output view, all optimally
handled at the C level.
2016-12-12 11:58:59 +01:00
parent fa4e4ffaff
commit 8afb1644e9
11 changed files with 579 additions and 272 deletions
--- a/python/obitools3/commands/align.pyx
+++ b/python/obitools3/commands/align.pyx
@ -1,120 +0,0 @@
-from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
-from obitools3.obidms._obidms import OBIDMS, OBIView    # TODO cimport doesn't work
-
-
-import time
-
-__title__="Aligns one sequence column with itself or two sequence columns"
-
-
-default_config = {   'inputview'    : None,
-                 }
-
-def addOptions(parser):
-
-    # TODO put this common group somewhere else but I don't know where
-    group=parser.add_argument_group('DMS and view options')
-
-    group.add_argument('--default-dms','-d', 
-                       action="store", dest="obi:defaultdms",
-                       metavar='<DMS NAME>',
-                       default=None,
-                       type=str,
-                       help="Name of the default DMS for reading and writing data.")
-
-    group.add_argument('--input-view','-i',
-                       action="store", dest="obi:inputview",
-                       metavar='<INPUT VIEW NAME>',
-                       default=None,
-                       type=str,
-                       help="Name of the input view.")
-
-    # TODO eventually 2nd view, or 2nd column?
-
-    group.add_argument('--output-view','-o',
-                       action="store", dest="obi:outputview",
-                       metavar='<OUTPUT VIEW NAME>',
-                       default=None,
-                       type=str,
-                       help="Name of the output view.")
-
-
-    group=parser.add_argument_group('obi align specific options')
-
-    group.add_argument('--lcs','-C',
-                       action="store", dest="align:alitype",
-                       metavar='<ALIGNMENT TYPE>',
-                       default='lcs',
-                       type=str,
-                       help="Compute alignment using the LCS method (default).")
- 
-    group.add_argument('--threshold','-t',
-                       action="store", dest="align:threshold",
-                       metavar='<THRESHOLD>',
-                       default=0.0,
-                       type=float,
-                       help="Score threshold. If the score is normalized and expressed in similarity (default),"
-                            " it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized"
-                            " and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%."
-                            " If the score is not normalized and expressed in similarity, it is the length of the"
-                            " Longest Common Subsequence. If the score is not normalized and expressed in distance,"
-                            " it is (reference length - LCS length)."
-                            " Only sequence pairs with a similarity above <THRESHOLD> are printed. Default: 0.00"
-                            " (no threshold).")
-
-    group.add_argument('--longest_length','-L',
-                       action="store_const", dest="align:reflength",
-                       default=0,
-                       const=1,
-                       help="The reference length is the length of the longest sequence."
-                            " Default: the reference length is the length of the alignment.")
-
-    group.add_argument('--shortest_length','-l',
-                       action="store_const", dest="align:reflength",
-                       default=0,
-                       const=2,
-                       help="The reference length is the length of the shortest sequence."
-                            " Default: the reference length is the length of the alignment.")
-
-    group.add_argument('--raw','-r',
-                       action="store_false", dest="align:normalize",
-                       default=True,
-                       help="Raw score, not normalized. Default: score is normalized with the reference sequence length.")
-
-    group.add_argument('--distance','-D',
-                       action="store_false", dest="align:similarity",
-                       default=True,
-                       help="Score is expressed in distance. Default: score is expressed in similarity.")
-
-
-    
-def run(config):
-     
-    # Open DMS
-    d = OBIDMS(config['obi']['defaultdms'])
-    
-    # Open input view 1
-    iview = d.open_view(config['obi']['inputview'])
-
-    # TODO Open input view 2 if there is one
-
-    # Create output view
-    oview = d.new_view(config['obi']['outputview'])
-    
-    # TODO Take other alignment types into account when they'll be implemented
-    
-    # Call cython alignment function
-    iview.align(oview, threshold=config['align']['threshold'], normalize=config['align']['normalize'], reference=config['align']['reflength'], similarity_mode=config['align']['similarity'])
-
-    print(repr(oview))
-    
-    iview.close()
-    oview.close()
-    d.close()
-     
-    print("Done.")
-
-    
-    
-    
-        
--- a/python/obitools3/commands/lcs.pyx
+++ b/python/obitools3/commands/lcs.pyx
@ -0,0 +1,209 @@
+#cython: language_level=3
+
+from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
+from obitools3.obidms._obidms cimport OBIDMS    # TODO cimport doesn't work
+from obitools3.utils cimport str2bytes
+
+from obitools3.obidms.capi.obialign cimport obi_lcs_align_one_column
+
+
+import time
+
+__title__="Aligns one sequence column with itself or two sequence columns"
+
+
+default_config = {   'inputview'    : None,
+                 }
+
+def addOptions(parser):
+    """
+    Register the command line options of the 'obi lcs' command on `parser`.
+
+    Two argument groups are added: the DMS/view/column selection options
+    (stored under 'obi:*' destinations) and the alignment specific options
+    (stored under 'align:*' destinations).
+
+    @param parser: the argument parser to populate; it only needs to provide
+                   add_argument_group().
+    """
+
+    # TODO put this common group somewhere else but I don't know where.
+    # Also some options should probably be in another group
+    group=parser.add_argument_group('DMS and view options')
+
+    group.add_argument('--default-dms', '-d',
+                       action="store", dest="obi:defaultdms",
+                       metavar='<DMS NAME>',
+                       default=None,
+                       type=str,
+                       help="Name of the default DMS for reading and writing data.")
+
+    group.add_argument('--input-view-1', '-i',
+                       action="store", dest="obi:inputview1",
+                       metavar='<INPUT VIEW NAME>',
+                       default=None,
+                       type=str,
+                       help="Name of the (first) input view.")
+
+    # "Optionally" (not "Eventually"): the second view/column/ID column are optional inputs
+    group.add_argument('--input-view-2', '-I',
+                       action="store", dest="obi:inputview2",
+                       metavar='<INPUT VIEW NAME>',
+                       default="",
+                       type=str,
+                       help="Optionally, the name of the second input view.")
+
+    group.add_argument('--input-column-1', '-c',
+                       action="store", dest="obi:inputcolumn1",
+                       metavar='<INPUT COLUMN NAME>',
+                       default="",
+                       type=str,
+                       help="Name of the (first) input column. "
+                            " Default: the default nucleotide sequence column of the view if there is one.")
+
+    group.add_argument('--input-column-2', '-C',
+                       action="store", dest="obi:inputcolumn2",
+                       metavar='<INPUT COLUMN NAME>',
+                       default="",
+                       type=str,
+                       help="Optionally, the name of the second input column.")
+
+    group.add_argument('--input-elt-1', '-e',
+                       action="store", dest="obi:inputelement1",
+                       metavar='<INPUT ELEMENT NAME>',
+                       default="",
+                       type=str,
+                       help="If the first input column has multiple elements per line, name of the element referring to the sequence to align. "
+                            " Default: the first element of the line.")
+
+    group.add_argument('--input-elt-2', '-E',
+                       action="store", dest="obi:inputelement2",
+                       metavar='<INPUT ELEMENT NAME>',
+                       default="",
+                       type=str,
+                       help="If the second input column has multiple elements per line, name of the element referring to the sequence to align. "
+                            " Default: the first element of the line.")
+
+    group.add_argument('--id-column-1', '-f',
+                       action="store", dest="obi:idcolumn1",
+                       metavar='<ID COLUMN NAME>',
+                       default="",
+                       type=str,
+                       help="Name of the (first) column containing the identifiers of the sequences to align. "
+                            " Default: the default ID column of the view if there is one.")
+
+    group.add_argument('--id-column-2', '-F',
+                       action="store", dest="obi:idcolumn2",
+                       metavar='<ID COLUMN NAME>',
+                       default="",
+                       type=str,
+                       help="Optionally, the name of the second ID column.")
+
+    group.add_argument('--output-view', '-o',
+                       action="store", dest="obi:outputview",
+                       metavar='<OUTPUT VIEW NAME>',
+                       default=None,
+                       type=str,
+                       help="Name of the output view.")
+
+
+    group=parser.add_argument_group('obi lcs specific options')
+
+    group.add_argument('--threshold','-t',
+                       action="store", dest="align:threshold",
+                       metavar='<THRESHOLD>',
+                       default=0.0,
+                       type=float,
+                       help="Score threshold. If the score is normalized and expressed in similarity (default),"
+                            " it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized"
+                            " and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%."
+                            " If the score is not normalized and expressed in similarity, it is the length of the"
+                            " Longest Common Subsequence. If the score is not normalized and expressed in distance,"
+                            " it is (reference length - LCS length)."
+                            " Only sequence pairs with a similarity above <THRESHOLD> are printed. Default: 0.00"
+                            " (no threshold).")
+
+    # --longest-length and --shortest-length share one destination:
+    # 0 (default) = alignment length, 1 = longest sequence, 2 = shortest sequence
+    group.add_argument('--longest-length','-L',
+                       action="store_const", dest="align:reflength",
+                       default=0,
+                       const=1,
+                       help="The reference length is the length of the longest sequence."
+                            " Default: the reference length is the length of the alignment.")
+
+    group.add_argument('--shortest-length','-l',
+                       action="store_const", dest="align:reflength",
+                       default=0,
+                       const=2,
+                       help="The reference length is the length of the shortest sequence."
+                            " Default: the reference length is the length of the alignment.")
+
+    # store_false: giving --raw turns normalization off
+    group.add_argument('--raw','-r',
+                       action="store_false", dest="align:normalize",
+                       default=True,
+                       help="Raw score, not normalized. Default: score is normalized with the reference sequence length.")
+
+    # store_false: giving --distance turns similarity mode off
+    group.add_argument('--distance','-D',
+                       action="store_false", dest="align:similarity",
+                       default=True,
+                       help="Score is expressed in distance. Default: score is expressed in similarity.")
+
+    group.add_argument('--print-seq','-s',
+                       action="store_true", dest="align:printseq",
+                       default=False,
+                       help="The nucleotide sequences are written in the output view. Default: they are not written.")
+
+    group.add_argument('--print-count','-n',
+                       action="store_true", dest="align:printcount",
+                       default=False,
+                       help="Sequence counts are written in the output view. Default: they are not written.")
+
+
+cpdef align(str dms_n, 
+            str input_view_1_n, str output_view_n,
+            str input_view_2_n="",
+            str input_column_1_n="", str input_column_2_n="",
+            str input_elt_1_n="", str input_elt_2_n="",
+            str id_column_1_n="", str id_column_2_n="",
+            double threshold=0.0, bint normalize=True, 
+            int reference=0, bint similarity_mode=True,
+            bint print_seq=False, bint print_count=False,
+            comments="") :
+    """
+    Run the C-level LCS alignment of one sequence column and close the DMS.
+
+    The DMS named `dms_n` is opened, obi_lcs_align_one_column() is called with
+    the names of the input view, input column, input element, ID column and
+    output view, and the DMS is closed again whether or not the alignment
+    succeeded.
+
+    @param dms_n: name of the DMS to open.
+    @param input_view_1_n: name of the input view.
+    @param output_view_n: name of the output view.
+    @param input_column_1_n: name of the input column ("" is passed through as is).
+    @param input_elt_1_n: name of the element to align in the input column.
+    @param id_column_1_n: name of the column containing the sequence identifiers.
+    @param input_view_2_n, input_column_2_n, input_elt_2_n, id_column_2_n:
+           accepted for the future two-column alignment but currently NOT used
+           (only the one-column alignment is implemented).
+    @param threshold: score threshold passed to the C function.
+    @param normalize: whether the score is normalized.
+    @param reference: reference length selector passed to the C function
+           (the command line uses 0, 1 or 2).
+    @param similarity_mode: whether the score is a similarity (True) or a distance (False).
+    @param print_seq: passed to the C function as its print_seq flag.
+    @param print_count: passed to the C function as its print_count flag.
+    @param comments: comments string passed to the C function for the output view.
+
+    @raise Exception: if the C function returns a negative value.
+    """
+
+    cdef OBIDMS d
+    d = OBIDMS(dms_n)
+
+    # Align 1 column (2 columns not implemented yet)
+    try :
+        if obi_lcs_align_one_column(d._pointer,
+                                    str2bytes(input_view_1_n),
+                                    str2bytes(input_column_1_n),
+                                    str2bytes(input_elt_1_n),
+                                    str2bytes(id_column_1_n),
+                                    str2bytes(output_view_n),
+                                    str2bytes(comments),
+                                    print_seq,
+                                    print_count,
+                                    threshold, normalize, reference, similarity_mode) < 0 :
+            raise Exception("Error aligning sequences")
+    finally :
+        # Close the DMS even when the alignment fails, so that it is not leaked
+        d.close()
+
+
+def run(config):
+    """
+    Entry point of the command: read the parsed options from `config`
+    (sections 'obi' and 'align'), call align() with them and print "Done."
+    once it has returned.
+    """
+
+    obi_options = config['obi']
+    align_options = config['align']
+
+    # TODO: Build formatted comments with all parameters etc
+    view_comments = "Obi align"
+
+    # Call cython alignment function
+    align(obi_options['defaultdms'],
+          obi_options['inputview1'],
+          obi_options['outputview'],
+          input_view_2_n   = obi_options['inputview2'],
+          input_column_1_n = obi_options['inputcolumn1'],
+          input_column_2_n = obi_options['inputcolumn2'],
+          input_elt_1_n    = obi_options['inputelement1'],
+          input_elt_2_n    = obi_options['inputelement2'],
+          id_column_1_n    = obi_options['idcolumn1'],
+          id_column_2_n    = obi_options['idcolumn2'],
+          threshold        = align_options['threshold'],
+          normalize        = align_options['normalize'],
+          reference        = align_options['reflength'],
+          similarity_mode  = align_options['similarity'],
+          print_seq        = align_options['printseq'],
+          print_count      = align_options['printcount'],
+          comments         = view_comments)
+
+    print("Done.")
+
+    
+    
+    
+    
--- a/python/obitools3/obidms/_obidms.pxd
+++ b/python/obitools3/obidms/_obidms.pxd
@ -67,16 +67,8 @@ cdef class OBIView:
    cdef object get_view_subclass(str view_type)
 

-cdef class OBIView_NUC_SEQS(OBIView):
-    
-    cpdef align(self,
-                OBIView oview, 
-                OBIView iview2=*, 
-                double threshold=*, 
-                bint normalize=*, 
-                int reference=*, 
-                bint similarity_mode=*
-               )
+cdef class OBIView_NUC_SEQS(OBIView) :
+    pass


 cdef class OBIView_line :
--- a/python/obitools3/obidms/_obidms.pyx
+++ b/python/obitools3/obidms/_obidms.pyx
@ -10,8 +10,6 @@ from .capi.obidmscolumn cimport obi_close_column, \
                                OBIDMS_column_header_p

 from .capi.obiutils cimport obi_format_date
-
-from .capi.obialign cimport obi_align_one_column
                   
 from .capi.obitypes cimport const_char_p, \
                            OBIType_t, \
@ -535,49 +533,6 @@ cdef class OBIView_NUC_SEQS(OBIView):
            self[line_idx][key] = sequence_obj[key]


-    # TODO discuss
-    cpdef align(self, OBIView oview, OBIView iview2=None,
-                double threshold=0.0, bint normalize=True, int reference=0, bint similarity_mode=True) :
-        pass
-#                 
-#         cdef OBIView iview1
-# 
-#         cdef Obiview_p iview1_p
-#         cdef Obiview_p iview2_p
-#         cdef Obiview_p oview_p
-#         
-#         cdef OBIDMS_column icol1
-#         cdef OBIDMS_column_p icol1_p
-#         cdef OBIDMS_column_p* icol1_pp
-#         
-#         cdef OBIDMS_column id1_col
-#         cdef OBIDMS_column_p id1_col_p
-#         cdef OBIDMS_column_p* id1_col_pp
-# 
-#         cdef OBIDMS_column id2_col
-#         cdef OBIDMS_column_p id2_col_p
-#         cdef OBIDMS_column_p* id2_col_pp
-# 
-#         cdef OBIDMS_column ocol
-#         cdef OBIDMS_column_p ocol_p
-#         cdef OBIDMS_column_p* ocol_pp
-#         
-#         cdef str id1_col_name
-#         cdef str id2_col_name
-#         cdef str score_col_name
-# 
-#         score_col_name = "score"
-#         
-#         iview1= self
-#         iview1_p = iview1._pointer
-#         icol1 = iview1[bytes2str(NUC_SEQUENCE_COLUMN)]
-#         icol1_pp = icol1._pointer
-#         icol1_p = icol1_pp[0]
-# 
-#         if obi_align_one_column(iview1_p, icol1_p, threshold, normalize, reference, similarity_mode) < 0 :
-#             raise Exception("Error aligning sequences")
-        
-
 ######################################################################################################


--- a/python/obitools3/obidms/_obidmscolumn_seq.pyx
+++ b/python/obitools3/obidms/_obidmscolumn_seq.pyx
@ -4,7 +4,6 @@ from .capi.obiview      cimport obi_get_seq_with_elt_name_and_col_p_in_view, \
                                obi_get_seq_with_elt_idx_and_col_p_in_view, \
                                obi_set_seq_with_elt_name_and_col_p_in_view, \
                                obi_set_seq_with_elt_idx_and_col_p_in_view
-from .capi.obialign     cimport obi_align_one_column
 from .capi.obierrno     cimport obi_errno
 from .capi.obitypes     cimport OBISeq_NA, const_char_p

--- a/python/obitools3/obidms/_obiseq.pyx
+++ b/python/obitools3/obidms/_obiseq.pyx
@ -102,12 +102,12 @@ cdef class OBI_Nuc_Seq_Stored(OBIView_line) :
        return self[bytes2str(QUALITY_COLUMN)]
    @quality.setter
    def quality(self, object new_qual):
-        if (type(new_qual) == list) or (new_qual is None) :
+        if (type(new_qual) == list) or (new_qual is None) : # TODO check that quality column exists
            self[bytes2str(QUALITY_COLUMN)] = new_qual
        else :  # Quality is in str form
            (((self._view).columns)[bytes2str(QUALITY_COLUMN)]).set_str_line(self._index, new_qual)

-    cpdef object get_str_quality(self) :    # TODO not ideal
+    cpdef object get_str_quality(self) :    # TODO not ideal. Make quality_int and quality_str properties
        return ((self._view).columns)[bytes2str(QUALITY_COLUMN)].get_str_line(self._index)
    
 #    cpdef str reverse_complement(self) :    TODO in C ?
--- a/python/obitools3/obidms/capi/obialign.pxd
+++ b/python/obitools3/obidms/capi/obialign.pxd
@ -1,20 +1,22 @@
 #cython: language_level=3

-from ..capi.obiview      cimport Obiview_p
-from ..capi.obidmscolumn cimport OBIDMS_column_p
+from obitools3.obidms.capi.obidms       cimport OBIDMS_p
+from obitools3.obidms.capi.obitypes     cimport const_char_p


 cdef extern from "obi_align.h" nogil:

-    int obi_align_one_column(Obiview_p seq_view, 
-                             OBIDMS_column_p seq_column, 
-                             const char* seq_name,
-                             Obiview_p score_view, 
-                             OBIDMS_column_p id1_column, 
-                             OBIDMS_column_p id2_column, 
-                             OBIDMS_column_p score_column, 
-                             double threshold, 
-                             bint normalize, 
-                             int reference, 
-                             bint similarity_mode)
+    int obi_lcs_align_one_column(OBIDMS_p dms, 
+                                 const_char_p seq_view_name, 
+                                 const_char_p seq_column_name, 
+                                 const_char_p seq_elt_name,
+                                 const_char_p id_column_name, 
+                                 const_char_p output_view_name, 
+                                 const_char_p output_view_comments,
+                                 bint print_seq, 
+                                 bint print_count,
+                                 double threshold, 
+                                 bint normalize, 
+                                 int reference, 
+                                 bint similarity_mode)