Alignment: API rework. 'obi align' is now 'obi lcs', and the results are
now written to columns automatically created in the output view, all optimally handled at the C level.
2016-12-12 11:58:59 +01:00
parent fa4e4ffaff
commit 8afb1644e9
11 changed files with 579 additions and 272 deletions
--- a/python/obitools3/commands/align.pyx
+++ b/python/obitools3/commands/align.pyx
@ -1,120 +0,0 @@
-from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
-from obitools3.obidms._obidms import OBIDMS, OBIView    # TODO cimport doesn't work
-
-
-import time
-
-__title__="Aligns one sequence column with itself or two sequence columns"
-
-# presumably the default configuration values of this command; verify how the obi launcher consumes them
-default_config = {   'inputview'    : None,
-                 }
-
-def addOptions(parser):
-    # Register the 'obi align' options on `parser`, in two argument groups (DMS/view options, then alignment options).
-    # TODO put this common group somewhere else but I don't know where
-    group=parser.add_argument_group('DMS and view options')
-
-    group.add_argument('--default-dms','-d', 
-                       action="store", dest="obi:defaultdms",
-                       metavar='<DMS NAME>',
-                       default=None,
-                       type=str,
-                       help="Name of the default DMS for reading and writing data.")
-
-    group.add_argument('--input-view','-i',
-                       action="store", dest="obi:inputview",
-                       metavar='<INPUT VIEW NAME>',
-                       default=None,
-                       type=str,
-                       help="Name of the input view.")
-
-    # TODO eventually 2nd view, or 2nd column?
-
-    group.add_argument('--output-view','-o',
-                       action="store", dest="obi:outputview",
-                       metavar='<OUTPUT VIEW NAME>',
-                       default=None,
-                       type=str,
-                       help="Name of the output view.")
-
-
-    group=parser.add_argument_group('obi align specific options')
-
-    group.add_argument('--lcs','-C',
-                       action="store", dest="align:alitype",
-                       metavar='<ALIGNMENT TYPE>',
-                       default='lcs',
-                       type=str,
-                       help="Compute alignment using the LCS method (default).")
- 
-    group.add_argument('--threshold','-t',
-                       action="store", dest="align:threshold",
-                       metavar='<THRESHOLD>',
-                       default=0.0,
-                       type=float,
-                       help="Score threshold. If the score is normalized and expressed in similarity (default),"
-                            " it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized"
-                            " and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%."
-                            " If the score is not normalized and expressed in similarity, it is the length of the"
-                            " Longest Common Subsequence. If the score is not normalized and expressed in distance,"
-                            " it is (reference length - LCS length)."
-                            " Only sequence pairs with a similarity above <THRESHOLD> are printed. Default: 0.00"
-                            " (no threshold).")
-
-    group.add_argument('--longest_length','-L',
-                       action="store_const", dest="align:reflength",  # same dest as --shortest_length
-                       default=0,
-                       const=1,
-                       help="The reference length is the length of the longest sequence."
-                            " Default: the reference length is the length of the alignment.")
-
-    group.add_argument('--shortest_length','-l',
-                       action="store_const", dest="align:reflength",  # same dest as --longest_length
-                       default=0,
-                       const=2,
-                       help="The reference length is the length of the shortest sequence."
-                            " Default: the reference length is the length of the alignment.")
-
-    group.add_argument('--raw','-r',
-                       action="store_false", dest="align:normalize",
-                       default=True,
-                       help="Raw score, not normalized. Default: score is normalized with the reference sequence length.")
-
-    group.add_argument('--distance','-D',
-                       action="store_false", dest="align:similarity",
-                       default=True,
-                       help="Score is expressed in distance. Default: score is expressed in similarity.")
-
-
-    
-def run(config):
-    # Run the command: open the DMS and the input view, create the output view, align, then close everything.
-    # Open DMS
-    d = OBIDMS(config['obi']['defaultdms'])
-    
-    # Open input view 1
-    iview = d.open_view(config['obi']['inputview'])
-
-    # TODO Open input view 2 if there is one
-
-    # Create output view
-    oview = d.new_view(config['obi']['outputview'])
-    
-    # TODO Take other alignment types into account when they'll be implemented
-    
-    # Call cython alignment function
-    iview.align(oview, threshold=config['align']['threshold'], normalize=config['align']['normalize'], reference=config['align']['reflength'], similarity_mode=config['align']['similarity'])
-
-    print(repr(oview))
-    # NOTE(review): the closes below are not reached if iview.align() raises
-    iview.close()
-    oview.close()
-    d.close()
-     
-    print("Done.")
-
-    
-    
-    
-        
--- a/python/obitools3/commands/lcs.pyx
+++ b/python/obitools3/commands/lcs.pyx
@ -0,0 +1,209 @@
+#cython: language_level=3
+
+from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
+from obitools3.obidms._obidms cimport OBIDMS    # TODO cimport doesn't work
+from obitools3.utils cimport str2bytes
+
+from obitools3.obidms.capi.obialign cimport obi_lcs_align_one_column
+
+
+import time
+
+__title__="Aligns one sequence column with itself or two sequence columns"
+
+# NOTE(review): addOptions registers 'obi:inputview1' / 'obi:inputview2', not 'inputview' -- confirm this key is still read
+default_config = {   'inputview'    : None,
+                 }
+
+def addOptions(parser):
+    """
+    Register the command-line options of 'obi lcs' on `parser`.
+
+    Two argument groups are created:
+      - 'DMS and view options': names of the DMS, of the input view(s), column(s),
+        element(s) and ID column(s), and of the output view (dests prefixed with 'obi:');
+      - 'obi lcs specific options': score threshold, reference length, score
+        normalization and mode, and what is written to the output view
+        (dests prefixed with 'align:').
+
+    :param parser: object exposing add_argument_group(), e.g. an argparse.ArgumentParser
+    """
+
+    # TODO put this common group somewhere else but I don't know where.
+    # Also some options should probably be in another group
+    group=parser.add_argument_group('DMS and view options')
+
+    group.add_argument('--default-dms', '-d', 
+                       action="store", dest="obi:defaultdms",
+                       metavar='<DMS NAME>',
+                       default=None,
+                       type=str,
+                       help="Name of the default DMS for reading and writing data.")
+
+    group.add_argument('--input-view-1', '-i',
+                       action="store", dest="obi:inputview1",
+                       metavar='<INPUT VIEW NAME>',
+                       default=None,
+                       type=str,
+                       help="Name of the (first) input view.")
+
+    # The second view/column/ID column are optional: the help says "Optionally"
+    # ("Eventually" wrongly suggested "at some later point").
+    group.add_argument('--input-view-2', '-I',
+                       action="store", dest="obi:inputview2",
+                       metavar='<INPUT VIEW NAME>',
+                       default="",
+                       type=str,
+                       help="Optionally, the name of the second input view.")
+
+    group.add_argument('--input-column-1', '-c',
+                       action="store", dest="obi:inputcolumn1",
+                       metavar='<INPUT COLUMN NAME>',
+                       default="",
+                       type=str,
+                       help="Name of the (first) input column. "
+                            " Default: the default nucleotide sequence column of the view if there is one.")
+
+    group.add_argument('--input-column-2', '-C',
+                       action="store", dest="obi:inputcolumn2",
+                       metavar='<INPUT COLUMN NAME>',
+                       default="",
+                       type=str,
+                       help="Optionally, the name of the second input column.")
+
+    group.add_argument('--input-elt-1', '-e',
+                       action="store", dest="obi:inputelement1",
+                       metavar='<INPUT ELEMENT NAME>',
+                       default="",
+                       type=str,
+                       help="If the first input column has multiple elements per line, name of the element referring to the sequence to align. "
+                            " Default: the first element of the line.")
+
+    group.add_argument('--input-elt-2', '-E',
+                       action="store", dest="obi:inputelement2",
+                       metavar='<INPUT ELEMENT NAME>',
+                       default="",
+                       type=str,
+                       help="If the second input column has multiple elements per line, name of the element referring to the sequence to align. "
+                            " Default: the first element of the line.")
+
+    group.add_argument('--id-column-1', '-f',
+                       action="store", dest="obi:idcolumn1",
+                       metavar='<ID COLUMN NAME>',
+                       default="",
+                       type=str,
+                       help="Name of the (first) column containing the identifiers of the sequences to align. "
+                            " Default: the default ID column of the view if there is one.")
+
+    group.add_argument('--id-column-2', '-F',
+                       action="store", dest="obi:idcolumn2",
+                       metavar='<ID COLUMN NAME>',
+                       default="",
+                       type=str,
+                       help="Optionally, the name of the second ID column.")
+
+    group.add_argument('--output-view', '-o',
+                       action="store", dest="obi:outputview",
+                       metavar='<OUTPUT VIEW NAME>',
+                       default=None,
+                       type=str,
+                       help="Name of the output view.")
+
+
+    group=parser.add_argument_group('obi lcs specific options')
+ 
+    group.add_argument('--threshold','-t',
+                       action="store", dest="align:threshold",
+                       metavar='<THRESHOLD>',
+                       default=0.0,
+                       type=float,
+                       help="Score threshold. If the score is normalized and expressed in similarity (default),"
+                            " it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized"
+                            " and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%."
+                            " If the score is not normalized and expressed in similarity, it is the length of the"
+                            " Longest Common Subsequence. If the score is not normalized and expressed in distance,"
+                            " it is (reference length - LCS length)."
+                            " Only sequence pairs with a similarity above <THRESHOLD> are printed. Default: 0.00"
+                            " (no threshold).")
+
+    # --longest-length and --shortest-length deliberately share one dest:
+    # 0 = alignment length (default), 1 = longest sequence, 2 = shortest sequence.
+    group.add_argument('--longest-length','-L',
+                       action="store_const", dest="align:reflength",
+                       default=0,
+                       const=1,
+                       help="The reference length is the length of the longest sequence."
+                            " Default: the reference length is the length of the alignment.")
+
+    group.add_argument('--shortest-length','-l',
+                       action="store_const", dest="align:reflength",
+                       default=0,
+                       const=2,
+                       help="The reference length is the length of the shortest sequence."
+                            " Default: the reference length is the length of the alignment.")
+
+    group.add_argument('--raw','-r',
+                       action="store_false", dest="align:normalize",
+                       default=True,
+                       help="Raw score, not normalized. Default: score is normalized with the reference sequence length.")
+
+    group.add_argument('--distance','-D',
+                       action="store_false", dest="align:similarity",
+                       default=True,
+                       help="Score is expressed in distance. Default: score is expressed in similarity.")
+
+    group.add_argument('--print-seq','-s',
+                       action="store_true", dest="align:printseq",
+                       default=False,
+                       help="The nucleotide sequences are written in the output view. Default: they are not written.")
+
+    group.add_argument('--print-count','-n',
+                       action="store_true", dest="align:printcount",
+                       default=False,
+                       help="Sequence counts are written in the output view. Default: they are not written.")
+
+
+cpdef align(str dms_n, 
+            str input_view_1_n, str output_view_n,
+            str input_view_2_n="",
+            str input_column_1_n="", str input_column_2_n="",
+            str input_elt_1_n="", str input_elt_2_n="",
+            str id_column_1_n="", str id_column_2_n="",
+            double threshold=0.0, bint normalize=True, 
+            int reference=0, bint similarity_mode=True,
+            bint print_seq=False, bint print_count=False,
+            comments="") :
+    """
+    Open the DMS `dms_n` and run the C-level LCS alignment of one column,
+    writing the results to the view `output_view_n`.
+
+    Only the one-column alignment is implemented: `input_view_2_n`,
+    `input_column_2_n`, `input_elt_2_n` and `id_column_2_n` are accepted but
+    are not passed to the C function.
+
+    :param dms_n: name of the DMS to open
+    :param input_view_1_n: name of the input view
+    :param output_view_n: name of the output view
+    :param input_column_1_n: name of the input column ("" is passed as is to the C function)
+    :param input_elt_1_n: name of the element to align in the input column
+    :param id_column_1_n: name of the column holding the sequence identifiers
+    :param threshold: score threshold
+    :param normalize: whether the score is normalized
+    :param reference: reference length selector (addOptions uses 0, 1 and 2)
+    :param similarity_mode: whether the score is a similarity rather than a distance
+    :param print_seq: whether the sequences are written to the output view
+    :param print_count: whether the sequence counts are written to the output view
+    :param comments: comments forwarded to the C function
+    :raises Exception: if obi_lcs_align_one_column() returns a negative value
+    """
+                 
+    cdef OBIDMS d         
+    d = OBIDMS(dms_n)
+
+    # The DMS must be closed whether the alignment succeeds or fails,
+    # otherwise a failed alignment leaves it open.
+    try:
+        # Align 1 column (2 columns not implemented yet)
+        if obi_lcs_align_one_column(d._pointer, \
+                                    str2bytes(input_view_1_n), \
+                                    str2bytes(input_column_1_n), \
+                                    str2bytes(input_elt_1_n), \
+                                    str2bytes(id_column_1_n), \
+                                    str2bytes(output_view_n), \
+                                    str2bytes(comments), \
+                                    print_seq, \
+                                    print_count, \
+                                    threshold, normalize, reference, similarity_mode) < 0 :
+            raise Exception("Error aligning sequences")
+    finally:
+        d.close()
+
+
+def run(config):
+    """
+    Entry point of the command: read the options from `config` and call align().
+
+    `config` is indexed as config['obi'][...] for the DMS, view, column, element
+    and ID column names, and as config['align'][...] for the scoring and output
+    options. Prints "Done." once align() has returned.
+    """
+
+    # TODO: Build formatted comments with all parameters etc
+    comments = "Obi align"
+
+    # Names of the DMS, views, columns and elements
+    obi_opts = config['obi']
+    positional = (obi_opts['defaultdms'],
+                  obi_opts['inputview1'],
+                  obi_opts['outputview'])
+    keywords = dict(input_view_2_n   = obi_opts['inputview2'],
+                    input_column_1_n = obi_opts['inputcolumn1'],
+                    input_column_2_n = obi_opts['inputcolumn2'],
+                    input_elt_1_n    = obi_opts['inputelement1'],
+                    input_elt_2_n    = obi_opts['inputelement2'],
+                    id_column_1_n    = obi_opts['idcolumn1'],
+                    id_column_2_n    = obi_opts['idcolumn2'])
+
+    # Scoring and output options
+    align_opts = config['align']
+    keywords.update(threshold       = align_opts['threshold'],
+                    normalize       = align_opts['normalize'],
+                    reference       = align_opts['reflength'],
+                    similarity_mode = align_opts['similarity'],
+                    print_seq       = align_opts['printseq'],
+                    print_count     = align_opts['printcount'])
+
+    # Call cython alignment function
+    align(*positional, comments=comments, **keywords)
+
+    print("Done.")
+
+    
+    
+    
+