New command: obi align, except it's called obi pouic for now because of

a Cython compilation bug
2018-11-07 16:05:48 +01:00
parent d1f1fd432e
commit b0da36cb48
6 changed files with 296 additions and 242 deletions
--- a/python/obitools3/commands/lcs.pyx
+++ b/python/obitools3/commands/lcs.pyx
@ -1,236 +0,0 @@
-#cython: language_level=3
-# 
-# from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
-# from obitools3.dms.dms import OBIDMS       # TODO cimport doesn't work
-# from obitools3.utils cimport str2bytes
-# 
-# from obitools3.dms.capi.obialign cimport obi_lcs_align_one_column, \
-#                                          obi_lcs_align_two_columns
-# 
-# 
-# import time
-# 
-# __title__="Aligns one sequence column with itself or two sequence columns"
-# 
-# 
-# default_config = {   'inputview'    : None,
-#                  }
-# 
-# def addOptions(parser):
-# 
-#     # TODO put this common group somewhere else but I don't know where.
-#     # Also some options should probably be in another group
-#     group=parser.add_argument_group('DMS and view options')
-# 
-#     group.add_argument('--default-dms', '-d', 
-#                        action="store", dest="obi:defaultdms",
-#                        metavar='<DMS NAME>',
-#                        default=None,
-#                        type=str,
-#                        help="Name of the default DMS for reading and writing data.")
-# 
-#     group.add_argument('--input-view-1', '-i',
-#                        action="store", dest="obi:inputview1",
-#                        metavar='<INPUT VIEW NAME>',
-#                        default=None,
-#                        type=str,
-#                        help="Name of the (first) input view.")
-# 
-#     group.add_argument('--input-view-2', '-I',
-#                        action="store", dest="obi:inputview2",
-#                        metavar='<INPUT VIEW NAME>',
-#                        default="",
-#                        type=str,
-#                        help="Eventually, the name of the second input view.")
-# 
-#     group.add_argument('--input-column-1', '-c',
-#                        action="store", dest="obi:inputcolumn1",
-#                        metavar='<INPUT COLUMN NAME>',
-#                        default="",
-#                        type=str,
-#                        help="Name of the (first) input column. "
-#                             " Default: the default nucleotide sequence column of the view if there is one.")
-# 
-#     group.add_argument('--input-column-2', '-C',
-#                        action="store", dest="obi:inputcolumn2",
-#                        metavar='<INPUT COLUMN NAME>',
-#                        default="",
-#                        type=str,
-#                        help="Eventually, the name of the second input column.")
-# 
-#     group.add_argument('--input-elt-1', '-e',
-#                        action="store", dest="obi:inputelement1",
-#                        metavar='<INPUT ELEMENT NAME>',
-#                        default="",
-#                        type=str,
-#                        help="If the first input column has multiple elements per line, name of the element referring to the sequence to align. "
-#                             " Default: the first element of the line.")
-# 
-#     group.add_argument('--input-elt-2', '-E',
-#                        action="store", dest="obi:inputelement2",
-#                        metavar='<INPUT ELEMENT NAME>',
-#                        default="",
-#                        type=str,
-#                        help="If the second input column has multiple elements per line, name of the element referring to the sequence to align. "
-#                             " Default: the first element of the line.")
-# 
-#     group.add_argument('--id-column-1', '-f',
-#                        action="store", dest="obi:idcolumn1",
-#                        metavar='<ID COLUMN NAME>',
-#                        default="",
-#                        type=str,
-#                        help="Name of the (first) column containing the identifiers of the sequences to align. "
-#                             " Default: the default ID column of the view if there is one.")
-# 
-#     group.add_argument('--id-column-2', '-F',
-#                        action="store", dest="obi:idcolumn2",
-#                        metavar='<ID COLUMN NAME>',
-#                        default="",
-#                        type=str,
-#                        help="Eventually, the name of the second ID column.")
-# 
-#     group.add_argument('--output-view', '-o',
-#                        action="store", dest="obi:outputview",
-#                        metavar='<OUTPUT VIEW NAME>',
-#                        default=None,
-#                        type=str,
-#                        help="Name of the output view.")
-# 
-# 
-#     group=parser.add_argument_group('obi lcs specific options')
-#  
-#     group.add_argument('--threshold','-t',
-#                        action="store", dest="align:threshold",
-#                        metavar='<THRESHOLD>',
-#                        default=0.0,
-#                        type=float,
-#                        help="Score threshold. If the score is normalized and expressed in similarity (default),"
-#                             " it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized"
-#                             " and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%."
-#                             " If the score is not normalized and expressed in similarity, it is the length of the"
-#                             " Longest Common Subsequence. If the score is not normalized and expressed in distance,"
-#                             " it is (reference length - LCS length)."
-#                             " Only sequence pairs with a similarity above <THRESHOLD> are printed. Default: 0.00"
-#                             " (no threshold).")
-# 
-#     group.add_argument('--longest-length','-L',
-#                        action="store_const", dest="align:reflength",
-#                        default=0,
-#                        const=1,
-#                        help="The reference length is the length of the longest sequence."
-#                             " Default: the reference length is the length of the alignment.")
-# 
-#     group.add_argument('--shortest-length','-l',
-#                        action="store_const", dest="align:reflength",
-#                        default=0,
-#                        const=2,
-#                        help="The reference length is the length of the shortest sequence."
-#                             " Default: the reference length is the length of the alignment.")
-# 
-#     group.add_argument('--raw','-r',
-#                        action="store_false", dest="align:normalize",
-#                        default=True,
-#                        help="Raw score, not normalized. Default: score is normalized with the reference sequence length.")
-# 
-#     group.add_argument('--distance','-D',
-#                        action="store_false", dest="align:similarity",
-#                        default=True,
-#                        help="Score is expressed in distance. Default: score is expressed in similarity.")
-# 
-#     group.add_argument('--print-seq','-s',
-#                        action="store_true", dest="align:printseq",
-#                        default=False,
-#                        help="The nucleotide sequences are written in the output view. Default: they are not written.")
-# 
-#     group.add_argument('--print-count','-n',
-#                        action="store_true", dest="align:printcount",
-#                        default=False,
-#                        help="Sequence counts are written in the output view. Default: they are not written.")
-# 
-#     group.add_argument('--thread-count','-p',   # TODO should probably be in a specific option group
-#                        action="store", dest="align:threadcount",
-#                        metavar='<THREAD COUNT>',
-#                        default=1,
-#                        type=int,
-#                        help="Number of threads to use for the computation. Default: one.")
-# 
-# 
-# # cpdef align(str dms_n, 
-# #             str input_view_1_n, str output_view_n,
-# #             str input_view_2_n="",
-# #             str input_column_1_n="", str input_column_2_n="",
-# #             str input_elt_1_n="", str input_elt_2_n="",
-# #             str id_column_1_n="", str id_column_2_n="",
-# #             double threshold=0.0, bint normalize=True, 
-# #             int reference=0, bint similarity_mode=True,
-# #             bint print_seq=False, bint print_count=False,
-# #             comments="",
-# #             int thread_count=1) :
-# #                  
-# #     cdef OBIDMS d         
-# #     d = OBIDMS(dms_n)
-# # 
-# #     if input_view_2_n == "" and input_column_2_n == "" :
-# #         if obi_lcs_align_one_column(d._pointer, \
-# #                                     str2bytes(input_view_1_n), \
-# #                                     str2bytes(input_column_1_n), \
-# #                                     str2bytes(input_elt_1_n), \
-# #                                     str2bytes(id_column_1_n), \
-# #                                     str2bytes(output_view_n), \
-# #                                     str2bytes(comments), \
-# #                                     print_seq, \
-# #                                     print_count, \
-# #                                     threshold, normalize, reference, similarity_mode,
-# #                                     thread_count) < 0 :
-# #             raise Exception("Error aligning sequences")
-# #     else :
-# #         if obi_lcs_align_two_columns(d._pointer, \
-# #                                      str2bytes(input_view_1_n), \
-# #                                      str2bytes(input_view_2_n), \
-# #                                      str2bytes(input_column_1_n), \
-# #                                      str2bytes(input_column_2_n), \
-# #                                      str2bytes(input_elt_1_n), \
-# #                                      str2bytes(input_elt_2_n), \
-# #                                      str2bytes(id_column_1_n), \
-# #                                      str2bytes(id_column_2_n), \
-# #                                      str2bytes(output_view_n), \
-# #                                      str2bytes(comments), \
-# #                                      print_seq, \
-# #                                      print_count, \
-# #                                      threshold, normalize, reference, similarity_mode) < 0 :
-# #             raise Exception("Error aligning sequences")        
-# #         
-# #     d.close()
-# # 
-# # 
-def run(config):
-    pass
-    # TODO: Build formatted comments with all parameters etc
-#     comments = "Obi align"
-#      
-#     # Call cython alignment function
-#     align(config['obi']['defaultdms'],  \
-#           config['obi']['inputview1'],  \
-#           config['obi']['outputview'],  \
-#           input_view_2_n   = config['obi']['inputview2'],  \
-#           input_column_1_n = config['obi']['inputcolumn1'],  \
-#           input_column_2_n = config['obi']['inputcolumn2'], \
-#           input_elt_1_n    = config['obi']['inputelement1'],  \
-#           input_elt_2_n    = config['obi']['inputelement2'], \
-#           id_column_1_n    = config['obi']['idcolumn1'],  \
-#           id_column_2_n    = config['obi']['idcolumn2'], \
-#           threshold        = config['align']['threshold'], \
-#           normalize        = config['align']['normalize'],  \
-#           reference        = config['align']['reflength'],  \
-#           similarity_mode  = config['align']['similarity'],  \
-#           print_seq        = config['align']['printseq'],  \
-#           print_count      = config['align']['printcount'], \
-#           comments         = comments, \
-#           thread_count     = config['align']['threadcount'])
-#  
-#     print("Done.")
-# # 
-# #     
-# #     
-# #     
-# #     
--- a/python/obitools3/commands/pouic.pxd
+++ b/python/obitools3/commands/pouic.pxd
@ -0,0 +1,18 @@
+#cython: language_level=3
+
+
+cpdef align_columns(bytes dms_n, 
+                    bytes input_view_1_n, 
+                    bytes output_view_n,
+                    bytes input_view_2_n=*,
+                    bytes input_column_1_n=*, 
+                    bytes input_column_2_n=*,
+                    bytes input_elt_1_n=*, 
+                    bytes input_elt_2_n=*,
+                    bytes id_column_1_n=*, 
+                    bytes id_column_2_n=*,
+                    double threshold=*, bint normalize=*, 
+                    int reference=*, bint similarity_mode=*,
+                    bint print_seq=*, bint print_count=*,
+                    bytes comments=*,
+                    int thread_count=*)
--- a/python/obitools3/commands/pouic.pyx
+++ b/python/obitools3/commands/pouic.pyx
@ -0,0 +1,272 @@
+#cython: language_level=3
+
+from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
+from obitools3.dms import DMS
+from obitools3.dms.view.view cimport View
+from obitools3.uri.decode import open_uri
+from obitools3.apps.optiongroups import addMinimalInputOption, addMinimalOutputOption
+from obitools3.dms.view import RollbackException
+from obitools3.apps.config import logger
+from obitools3.utils cimport tobytes, str2bytes
+
+from obitools3.dms.capi.obialign cimport obi_lcs_align_one_column, \
+                                         obi_lcs_align_two_columns
+
+import time
+import sys
+
+
+__title__="Aligns one sequence column with itself or two sequence columns"
+
+
+def addOptions(parser):
+   
+   addMinimalInputOption(parser)
+   addMinimalOutputOption(parser)
+
+   group=parser.add_argument_group('obi align specific options')
+
+   group.add_argument('--input-2', '-I',
+                      action="store", dest="pouic:inputuri2",
+                      metavar='<INPUT URI>',
+                      default="",
+                      type=str,
+                      help="Eventually, the URI of the second input to align with the first one.")
+      
+   group.add_argument('--threshold','-t',
+                      action="store", dest="pouic:threshold",
+                      metavar='<THRESHOLD>',
+                      default=0.0,
+                      type=float,
+                      help="Score threshold. If the score is normalized and expressed in similarity (default),"
+                           " it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized"
+                           " and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%."
+                           " If the score is not normalized and expressed in similarity, it is the length of the"
+                           " Longest Common Subsequence. If the score is not normalized and expressed in distance,"
+                           " it is (reference length - LCS length)."
+                           " Only sequence pairs with a similarity above <THRESHOLD> are printed. Default: 0.00"
+                           " (no threshold).")
+   
+   group.add_argument('--longest-length','-L',
+                      action="store_const", dest="pouic:reflength",
+                      default=0,
+                      const=1,
+                      help="The reference length is the length of the longest sequence."
+                           " Default: the reference length is the length of the alignment.")
+   
+   group.add_argument('--shortest-length','-l',
+                      action="store_const", dest="pouic:reflength",
+                      default=0,
+                      const=2,
+                      help="The reference length is the length of the shortest sequence."
+                           " Default: the reference length is the length of the alignment.")
+   
+   group.add_argument('--raw','-r',
+                      action="store_false", dest="pouic:normalize",
+                      default=True,
+                      help="Raw score, not normalized. Default: score is normalized with the reference sequence length.")
+   
+   group.add_argument('--distance','-D',
+                      action="store_false", dest="pouic:similarity",
+                      default=True,
+                      help="Score is expressed in distance. Default: score is expressed in similarity.")
+   
+   group.add_argument('--print-seq','-s',
+                      action="store_true", dest="pouic:printseq",
+                      default=False,
+                      help="The nucleotide sequences are written in the output view. Default: they are not written.")
+   
+   group.add_argument('--print-count','-n',
+                      action="store_true", dest="pouic:printcount",
+                      default=False,
+                      help="Sequence counts are written in the output view. Default: they are not written.")
+   
+   group.add_argument('--thread-count','-p',   # TODO should probably be in a specific option group
+                      action="store", dest="pouic:threadcount",
+                      metavar='<THREAD COUNT>',
+                      default=1,
+                      type=int,
+                      help="Number of threads to use for the computation. Default: one.")
+
+
+cpdef align_columns(bytes dms_n, 
+                    bytes input_view_1_n, 
+                    bytes output_view_n,
+                    bytes input_view_2_n=b"",
+                    bytes input_column_1_n=b"", 
+                    bytes input_column_2_n=b"",
+                    bytes input_elt_1_n=b"", 
+                    bytes input_elt_2_n=b"",
+                    bytes id_column_1_n=b"", 
+                    bytes id_column_2_n=b"",
+                    double threshold=0.0, bint normalize=True, 
+                    int reference=0, bint similarity_mode=True,
+                    bint print_seq=False, bint print_count=False,
+                    bytes comments=b"{}",
+                    int thread_count=1) : 
+
+    if input_view_2_n == b"" and input_column_2_n == b"" :
+        if obi_lcs_align_one_column(dms_n, \
+                                    input_view_1_n, \
+                                    input_column_1_n, \
+                                    input_elt_1_n, \
+                                    id_column_1_n, \
+                                    output_view_n, \
+                                    comments, \
+                                    print_seq, \
+                                    print_count, \
+                                    threshold, normalize, reference, similarity_mode,
+                                    thread_count) < 0 :
+            raise Exception("Error aligning sequences")        
+            
+    else:
+        if obi_lcs_align_two_columns(dms_n, \
+                                     input_view_1_n, \
+                                     input_view_2_n, \
+                                     input_column_1_n, \
+                                     input_column_2_n, \
+                                     input_elt_1_n, \
+                                     input_elt_2_n, \
+                                     id_column_1_n, \
+                                     id_column_2_n, \
+                                     output_view_n, \
+                                     comments, \
+                                     print_seq, \
+                                     print_count, \
+                                     threshold, normalize, reference, similarity_mode) < 0 :
+            raise Exception("Error aligning sequences")        
+ 
+ 
+def run(config):
+    
+    DMS.obi_atexit()
+    
+    logger("info", "obi align")
+
+    # Open the input: only the DMS
+    input = open_uri(config['obi']['inputURI'],
+                     dms_only=True)
+    if input is None:
+        raise Exception("Could not read input")
+    i_dms = input[0]
+    i_dms_name = input[0].name
+    i_uri = input[1]
+    i_view_name = i_uri.split(b"/")[0]
+    i_column_name = b""
+    i_element_name = b""
+    if len(i_uri.split(b"/")) == 2:
+        i_column_name = i_uri.split(b"/")[1]
+    if len(i_uri.split(b"/")) == 3:
+        i_element_name = i_uri.split(b"/")[2]
+    if len(i_uri.split(b"/")) > 3:
+        raise Exception("Input URI contains too many elements:", config['obi']['inputURI'])
+
+    # Open the second input if there is one
+    i_dms_2 = None
+    i_dms_name_2 = b""
+    original_i_view_name_2 = b""
+    i_view_name_2 = b""
+    i_column_name_2 = b""
+    i_element_name_2 = b""
+    if config['pouic']['inputuri2']:
+        input_2 = open_uri(config['pouic']['inputuri2'],
+                           dms_only=True)
+        if input_2 is None:
+            raise Exception("Could not read second input")
+        i_dms_2 = input_2[0]
+        i_dms_name_2 = i_dms_2.name
+        i_uri_2 = input_2[1]
+        original_i_view_name_2 = i_uri_2.split(b"/")[0]
+        if len(i_uri_2.split(b"/")) == 2:
+            i_column_name_2 = i_uri_2.split(b"/")[1]
+        if len(i_uri_2.split(b"/")) == 3:
+            i_element_name_2 = i_uri_2.split(b"/")[2]
+        if len(i_uri_2.split(b"/")) > 3:
+            raise Exception("Input URI contains too many elements:", config['pouic']['inputuri2'])
+
+        # If the 2 input DMS are not the same, temporarily import 2nd input view in first input DMS
+        if i_dms != i_dms_2:
+            temp_i_view_name_2 = original_i_view_name_2
+            i=0
+            while temp_i_view_name_2 in i_dms:  # Making sure view name is unique in input DMS
+                temp_i_view_name_2 = original_i_view_name_2+b"_"+str2bytes(str(i))
+                i+=1
+            i_view_name_2 = temp_i_view_name_2
+            View.import_view(i_dms_2.full_path[:-7], i_dms.full_path[:-7], original_i_view_name_2, i_view_name_2)
+
+    # Open the output: only the DMS
+    output = open_uri(config['obi']['outputURI'],
+                      input=False,
+                      dms_only=True)
+    if output is None:
+        raise Exception("Could not create output")
+    o_dms = output[0]
+    o_dms_name = o_dms.name 
+    final_o_view_name = output[1]
+
+    # If the input and output DMS are not the same, align creating a temporary view in the input dms that will be exported to 
+    # the right DMS and deleted in the other afterwards.
+    if i_dms != o_dms:
+        temporary_view_name = final_o_view_name
+        i=0
+        while temporary_view_name in i_dms:  # Making sure view name is unique in input DMS
+            temporary_view_name = final_o_view_name+b"_"+str2bytes(str(i))
+            i+=1
+        o_view_name = temporary_view_name
+    else:
+        o_view_name = final_o_view_name
+        
+    # Save command config in View comments
+    command_line = " ".join(sys.argv[1:])
+    
+    i_dms_list = [i_dms_name]
+    if i_dms_name_2 and i_dms_name != i_dms_name_2:
+        i_dms_list.append(i_dms_name_2)
+    i_view_list = [i_view_name]
+    if original_i_view_name_2 and i_view_name != original_i_view_name_2:
+        i_view_list.append(original_i_view_name_2)
+    comments = View.print_config(config, "pouic", command_line, input_dms_name=i_dms_list, input_view_name=i_view_list)
+    
+    # Call cython alignment function
+      # Using default ID columns of the view. TODO discuss adding option
+    align_columns(i_dms_name,  \
+                  i_view_name,  \
+                  o_view_name,  \
+                  input_view_2_n   = i_view_name_2,  \
+                  input_column_1_n = i_column_name,  \
+                  input_column_2_n = i_column_name_2, \
+                  input_elt_1_n    = i_element_name,  \
+                  input_elt_2_n    = i_element_name_2, \
+                  id_column_1_n    = b"",  \
+                  id_column_2_n    = b"", \
+                  threshold        = config['pouic']['threshold'], \
+                  normalize        = config['pouic']['normalize'],  \
+                  reference        = config['pouic']['reflength'],  \
+                  similarity_mode  = config['pouic']['similarity'],  \
+                  print_seq        = config['pouic']['printseq'],  \
+                  print_count      = config['pouic']['printcount'], \
+                  comments         = comments, \
+                  thread_count     = config['pouic']['threadcount'])
+
+    # If the input and output DMS are not the same, export result view to output DMS
+    if i_dms != o_dms:
+        View.import_view(i_dms.full_path[:-7], o_dms.full_path[:-7], o_view_name, final_o_view_name)
+
+    # Save command config in output DMS comments
+    o_dms.record_command_line(command_line)
+
+    print("\n")
+    print(repr(o_dms[final_o_view_name]))
+
+    # If the two input DMS are different, delete the temporary input view in the first input DMS
+    if i_dms_2 and i_dms != i_dms_2:
+        View.delete_view(i_dms, i_view_name_2)
+        i_dms_2.close()
+
+    # If the input and the output DMS are different, delete the temporary result view in the input DMS
+    if i_dms != o_dms:
+        View.delete_view(i_dms, o_view_name)
+        o_dms.close()
+    
+    i_dms.close()
+  
--- a/python/obitools3/dms/capi/obialign.pxd
+++ b/python/obitools3/dms/capi/obialign.pxd
@ -4,13 +4,13 @@ from obitools3.dms.capi.obidms       cimport OBIDMS_p
 from obitools3.dms.capi.obitypes     cimport const_char_p


-cdef extern from "obi_align.h" nogil:
+cdef extern from "obi_lcs.h" nogil:

-    int obi_lcs_align_one_column(OBIDMS_p dms, 
+    int obi_lcs_align_one_column(const_char_p dms_name, 
                                 const_char_p seq_view_name, 
                                 const_char_p seq_column_name, 
                                 const_char_p seq_elt_name,
-                                 const_char_p id_column_name, 
+                                 const_char_p id_column_name,
                                 const_char_p output_view_name, 
                                 const_char_p output_view_comments,
                                 bint print_seq, 
@ -22,7 +22,7 @@ cdef extern from "obi_align.h" nogil:
                                 int thread_count)


-    int obi_lcs_align_two_columns(OBIDMS_p dms,
+    int obi_lcs_align_two_columns(const_char_p dms_name,
                                  const_char_p seq1_view_name,
                                  const_char_p seq2_view_name,
                                  const_char_p seq1_column_name,
--- a/python/obitools3/dms/dms.cfiles
+++ b/python/obitools3/dms/dms.cfiles
@ -7,7 +7,7 @@
 ../../../src/hashtable.c
 ../../../src/linked_list.c
 ../../../src/murmurhash2.c
-../../../src/obi_align.c
+../../../src/obi_lcs.c
 ../../../src/obi_clean.c
 ../../../src/obi_ecopcr.c
 ../../../src/libecoPCR/libthermo/nnparams.c
--- a/python/obitools3/utils.cfiles
+++ b/python/obitools3/utils.cfiles
@ -7,7 +7,7 @@
 ../../src/hashtable.c
 ../../src/linked_list.c
 ../../src/murmurhash2.c
-../../src/obi_align.c
+../../src/obi_lcs.c
 ../../src/obi_clean.c
 ../../src/obi_ecopcr.c
 ../../src/libecoPCR/libthermo/nnparams.c