From b0da36cb48fc912f4b0aeaa694821348c9e400a9 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Wed, 7 Nov 2018 16:05:48 +0100 Subject: [PATCH] New command: obi align, except it's called obi pouic for now because of a Cython compilation bug --- python/obitools3/commands/lcs.pyx | 236 --------------------- python/obitools3/commands/pouic.pxd | 18 ++ python/obitools3/commands/pouic.pyx | 272 +++++++++++++++++++++++++ python/obitools3/dms/capi/obialign.pxd | 8 +- python/obitools3/dms/dms.cfiles | 2 +- python/obitools3/utils.cfiles | 2 +- 6 files changed, 296 insertions(+), 242 deletions(-) delete mode 100644 python/obitools3/commands/lcs.pyx create mode 100644 python/obitools3/commands/pouic.pxd create mode 100644 python/obitools3/commands/pouic.pyx diff --git a/python/obitools3/commands/lcs.pyx b/python/obitools3/commands/lcs.pyx deleted file mode 100644 index c76375e..0000000 --- a/python/obitools3/commands/lcs.pyx +++ /dev/null @@ -1,236 +0,0 @@ -#cython: language_level=3 -# -# from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport -# from obitools3.dms.dms import OBIDMS # TODO cimport doesn't work -# from obitools3.utils cimport str2bytes -# -# from obitools3.dms.capi.obialign cimport obi_lcs_align_one_column, \ -# obi_lcs_align_two_columns -# -# -# import time -# -# __title__="Aligns one sequence column with itself or two sequence columns" -# -# -# default_config = { 'inputview' : None, -# } -# -# def addOptions(parser): -# -# # TODO put this common group somewhere else but I don't know where. -# # Also some options should probably be in another group -# group=parser.add_argument_group('DMS and view options') -# -# group.add_argument('--default-dms', '-d', -# action="store", dest="obi:defaultdms", -# metavar='', -# default=None, -# type=str, -# help="Name of the default DMS for reading and writing data.") -# -# group.add_argument('--input-view-1', '-i', -# action="store", dest="obi:inputview1", -# metavar='', -# default=None, -# type=str, -# help="Name of the (first) input view.") -# -# group.add_argument('--input-view-2', '-I', -# action="store", dest="obi:inputview2", -# metavar='', -# default="", -# type=str, -# help="Eventually, the name of the second input view.") -# -# group.add_argument('--input-column-1', '-c', -# action="store", dest="obi:inputcolumn1", -# metavar='', -# default="", -# type=str, -# help="Name of the (first) input column. " -# " Default: the default nucleotide sequence column of the view if there is one.") -# -# group.add_argument('--input-column-2', '-C', -# action="store", dest="obi:inputcolumn2", -# metavar='', -# default="", -# type=str, -# help="Eventually, the name of the second input column.") -# -# group.add_argument('--input-elt-1', '-e', -# action="store", dest="obi:inputelement1", -# metavar='', -# default="", -# type=str, -# help="If the first input column has multiple elements per line, name of the element referring to the sequence to align. " -# " Default: the first element of the line.") -# -# group.add_argument('--input-elt-2', '-E', -# action="store", dest="obi:inputelement2", -# metavar='', -# default="", -# type=str, -# help="If the second input column has multiple elements per line, name of the element referring to the sequence to align. " -# " Default: the first element of the line.") -# -# group.add_argument('--id-column-1', '-f', -# action="store", dest="obi:idcolumn1", -# metavar='', -# default="", -# type=str, -# help="Name of the (first) column containing the identifiers of the sequences to align. " -# " Default: the default ID column of the view if there is one.") -# -# group.add_argument('--id-column-2', '-F', -# action="store", dest="obi:idcolumn2", -# metavar='', -# default="", -# type=str, -# help="Eventually, the name of the second ID column.") -# -# group.add_argument('--output-view', '-o', -# action="store", dest="obi:outputview", -# metavar='', -# default=None, -# type=str, -# help="Name of the output view.") -# -# -# group=parser.add_argument_group('obi lcs specific options') -# -# group.add_argument('--threshold','-t', -# action="store", dest="align:threshold", -# metavar='', -# default=0.0, -# type=float, -# help="Score threshold. If the score is normalized and expressed in similarity (default)," -# " it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized" -# " and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%." -# " If the score is not normalized and expressed in similarity, it is the length of the" -# " Longest Common Subsequence. If the score is not normalized and expressed in distance," -# " it is (reference length - LCS length)." -# " Only sequence pairs with a similarity above are printed. Default: 0.00" -# " (no threshold).") -# -# group.add_argument('--longest-length','-L', -# action="store_const", dest="align:reflength", -# default=0, -# const=1, -# help="The reference length is the length of the longest sequence." -# " Default: the reference length is the length of the alignment.") -# -# group.add_argument('--shortest-length','-l', -# action="store_const", dest="align:reflength", -# default=0, -# const=2, -# help="The reference length is the length of the shortest sequence." -# " Default: the reference length is the length of the alignment.") -# -# group.add_argument('--raw','-r', -# action="store_false", dest="align:normalize", -# default=True, -# help="Raw score, not normalized. Default: score is normalized with the reference sequence length.") -# -# group.add_argument('--distance','-D', -# action="store_false", dest="align:similarity", -# default=True, -# help="Score is expressed in distance. Default: score is expressed in similarity.") -# -# group.add_argument('--print-seq','-s', -# action="store_true", dest="align:printseq", -# default=False, -# help="The nucleotide sequences are written in the output view. Default: they are not written.") -# -# group.add_argument('--print-count','-n', -# action="store_true", dest="align:printcount", -# default=False, -# help="Sequence counts are written in the output view. Default: they are not written.") -# -# group.add_argument('--thread-count','-p', # TODO should probably be in a specific option group -# action="store", dest="align:threadcount", -# metavar='', -# default=1, -# type=int, -# help="Number of threads to use for the computation. Default: one.") -# -# -# # cpdef align(str dms_n, -# # str input_view_1_n, str output_view_n, -# # str input_view_2_n="", -# # str input_column_1_n="", str input_column_2_n="", -# # str input_elt_1_n="", str input_elt_2_n="", -# # str id_column_1_n="", str id_column_2_n="", -# # double threshold=0.0, bint normalize=True, -# # int reference=0, bint similarity_mode=True, -# # bint print_seq=False, bint print_count=False, -# # comments="", -# # int thread_count=1) : -# # -# # cdef OBIDMS d -# # d = OBIDMS(dms_n) -# # -# # if input_view_2_n == "" and input_column_2_n == "" : -# # if obi_lcs_align_one_column(d._pointer, \ -# # str2bytes(input_view_1_n), \ -# # str2bytes(input_column_1_n), \ -# # str2bytes(input_elt_1_n), \ -# # str2bytes(id_column_1_n), \ -# # str2bytes(output_view_n), \ -# # str2bytes(comments), \ -# # print_seq, \ -# # print_count, \ -# # threshold, normalize, reference, similarity_mode, -# # thread_count) < 0 : -# # raise Exception("Error aligning sequences") -# # else : -# # if obi_lcs_align_two_columns(d._pointer, \ -# # str2bytes(input_view_1_n), \ -# # str2bytes(input_view_2_n), \ -# # str2bytes(input_column_1_n), \ -# # str2bytes(input_column_2_n), \ -# # str2bytes(input_elt_1_n), \ -# # str2bytes(input_elt_2_n), \ -# # str2bytes(id_column_1_n), \ -# # str2bytes(id_column_2_n), \ -# # str2bytes(output_view_n), \ -# # str2bytes(comments), \ -# # print_seq, \ -# # print_count, \ -# # threshold, normalize, reference, similarity_mode) < 0 : -# # raise Exception("Error aligning sequences") -# # -# # d.close() -# # -# # -def run(config): - pass - # TODO: Build formatted comments with all parameters etc -# comments = "Obi align" -# -# # Call cython alignment function -# align(config['obi']['defaultdms'], \ -# config['obi']['inputview1'], \ -# config['obi']['outputview'], \ -# input_view_2_n = config['obi']['inputview2'], \ -# input_column_1_n = config['obi']['inputcolumn1'], \ -# input_column_2_n = config['obi']['inputcolumn2'], \ -# input_elt_1_n = config['obi']['inputelement1'], \ -# input_elt_2_n = config['obi']['inputelement2'], \ -# id_column_1_n = config['obi']['idcolumn1'], \ -# id_column_2_n = config['obi']['idcolumn2'], \ -# threshold = config['align']['threshold'], \ -# normalize = config['align']['normalize'], \ -# reference = config['align']['reflength'], \ -# similarity_mode = config['align']['similarity'], \ -# print_seq = config['align']['printseq'], \ -# print_count = config['align']['printcount'], \ -# comments = comments, \ -# thread_count = config['align']['threadcount']) -# -# print("Done.") -# # -# # -# # -# # -# # \ No newline at end of file diff --git a/python/obitools3/commands/pouic.pxd b/python/obitools3/commands/pouic.pxd new file mode 100644 index 0000000..7d84de2 --- /dev/null +++ b/python/obitools3/commands/pouic.pxd @@ -0,0 +1,18 @@ +#cython: language_level=3 + + +cpdef align_columns(bytes dms_n, + bytes input_view_1_n, + bytes output_view_n, + bytes input_view_2_n=*, + bytes input_column_1_n=*, + bytes input_column_2_n=*, + bytes input_elt_1_n=*, + bytes input_elt_2_n=*, + bytes id_column_1_n=*, + bytes id_column_2_n=*, + double threshold=*, bint normalize=*, + int reference=*, bint similarity_mode=*, + bint print_seq=*, bint print_count=*, + bytes comments=*, + int thread_count=*) \ No newline at end of file diff --git a/python/obitools3/commands/pouic.pyx b/python/obitools3/commands/pouic.pyx new file mode 100644 index 0000000..2027390 --- /dev/null +++ b/python/obitools3/commands/pouic.pyx @@ -0,0 +1,272 @@ +#cython: language_level=3 + +from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport +from obitools3.dms import DMS +from obitools3.dms.view.view cimport View +from obitools3.uri.decode import open_uri +from obitools3.apps.optiongroups import addMinimalInputOption, addMinimalOutputOption +from obitools3.dms.view import RollbackException +from obitools3.apps.config import logger +from obitools3.utils cimport tobytes, str2bytes + +from obitools3.dms.capi.obialign cimport obi_lcs_align_one_column, \ + obi_lcs_align_two_columns + +import time +import sys + + +__title__="Aligns one sequence column with itself or two sequence columns" + + +def addOptions(parser): + + addMinimalInputOption(parser) + addMinimalOutputOption(parser) + + group=parser.add_argument_group('obi align specific options') + + group.add_argument('--input-2', '-I', + action="store", dest="pouic:inputuri2", + metavar='', + default="", + type=str, + help="Eventually, the URI of the second input to align with the first one.") + + group.add_argument('--threshold','-t', + action="store", dest="pouic:threshold", + metavar='', + default=0.0, + type=float, + help="Score threshold. If the score is normalized and expressed in similarity (default)," + " it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized" + " and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%." + " If the score is not normalized and expressed in similarity, it is the length of the" + " Longest Common Subsequence. If the score is not normalized and expressed in distance," + " it is (reference length - LCS length)." + " Only sequence pairs with a similarity above are printed. Default: 0.00" + " (no threshold).") + + group.add_argument('--longest-length','-L', + action="store_const", dest="pouic:reflength", + default=0, + const=1, + help="The reference length is the length of the longest sequence." + " Default: the reference length is the length of the alignment.") + + group.add_argument('--shortest-length','-l', + action="store_const", dest="pouic:reflength", + default=0, + const=2, + help="The reference length is the length of the shortest sequence." + " Default: the reference length is the length of the alignment.") + + group.add_argument('--raw','-r', + action="store_false", dest="pouic:normalize", + default=True, + help="Raw score, not normalized. Default: score is normalized with the reference sequence length.") + + group.add_argument('--distance','-D', + action="store_false", dest="pouic:similarity", + default=True, + help="Score is expressed in distance. Default: score is expressed in similarity.") + + group.add_argument('--print-seq','-s', + action="store_true", dest="pouic:printseq", + default=False, + help="The nucleotide sequences are written in the output view. Default: they are not written.") + + group.add_argument('--print-count','-n', + action="store_true", dest="pouic:printcount", + default=False, + help="Sequence counts are written in the output view. Default: they are not written.") + + group.add_argument('--thread-count','-p', # TODO should probably be in a specific option group + action="store", dest="pouic:threadcount", + metavar='', + default=1, + type=int, + help="Number of threads to use for the computation. Default: one.") + + +cpdef align_columns(bytes dms_n, + bytes input_view_1_n, + bytes output_view_n, + bytes input_view_2_n=b"", + bytes input_column_1_n=b"", + bytes input_column_2_n=b"", + bytes input_elt_1_n=b"", + bytes input_elt_2_n=b"", + bytes id_column_1_n=b"", + bytes id_column_2_n=b"", + double threshold=0.0, bint normalize=True, + int reference=0, bint similarity_mode=True, + bint print_seq=False, bint print_count=False, + bytes comments=b"{}", + int thread_count=1) : + + if input_view_2_n == b"" and input_column_2_n == b"" : + if obi_lcs_align_one_column(dms_n, \ + input_view_1_n, \ + input_column_1_n, \ + input_elt_1_n, \ + id_column_1_n, \ + output_view_n, \ + comments, \ + print_seq, \ + print_count, \ + threshold, normalize, reference, similarity_mode, + thread_count) < 0 : + raise Exception("Error aligning sequences") + + else: + if obi_lcs_align_two_columns(dms_n, \ + input_view_1_n, \ + input_view_2_n, \ + input_column_1_n, \ + input_column_2_n, \ + input_elt_1_n, \ + input_elt_2_n, \ + id_column_1_n, \ + id_column_2_n, \ + output_view_n, \ + comments, \ + print_seq, \ + print_count, \ + threshold, normalize, reference, similarity_mode) < 0 : + raise Exception("Error aligning sequences") + + +def run(config): + + DMS.obi_atexit() + + logger("info", "obi align") + + # Open the input: only the DMS + input = open_uri(config['obi']['inputURI'], + dms_only=True) + if input is None: + raise Exception("Could not read input") + i_dms = input[0] + i_dms_name = input[0].name + i_uri = input[1] + i_view_name = i_uri.split(b"/")[0] + i_column_name = b"" + i_element_name = b"" + if len(i_uri.split(b"/")) == 2: + i_column_name = i_uri.split(b"/")[1] + if len(i_uri.split(b"/")) == 3: + i_element_name = i_uri.split(b"/")[2] + if len(i_uri.split(b"/")) > 3: + raise Exception("Input URI contains too many elements:", config['obi']['inputURI']) + + # Open the second input if there is one + i_dms_2 = None + i_dms_name_2 = b"" + original_i_view_name_2 = b"" + i_view_name_2 = b"" + i_column_name_2 = b"" + i_element_name_2 = b"" + if config['pouic']['inputuri2']: + input_2 = open_uri(config['pouic']['inputuri2'], + dms_only=True) + if input_2 is None: + raise Exception("Could not read second input") + i_dms_2 = input_2[0] + i_dms_name_2 = i_dms_2.name + i_uri_2 = input_2[1] + original_i_view_name_2 = i_uri_2.split(b"/")[0] + if len(i_uri_2.split(b"/")) == 2: + i_column_name_2 = i_uri_2.split(b"/")[1] + if len(i_uri_2.split(b"/")) == 3: + i_element_name_2 = i_uri_2.split(b"/")[2] + if len(i_uri_2.split(b"/")) > 3: + raise Exception("Input URI contains too many elements:", config['pouic']['inputuri2']) + + # If the 2 input DMS are not the same, temporarily import 2nd input view in first input DMS + if i_dms != i_dms_2: + temp_i_view_name_2 = original_i_view_name_2 + i=0 + while temp_i_view_name_2 in i_dms: # Making sure view name is unique in input DMS + temp_i_view_name_2 = original_i_view_name_2+b"_"+str2bytes(str(i)) + i+=1 + i_view_name_2 = temp_i_view_name_2 + View.import_view(i_dms_2.full_path[:-7], i_dms.full_path[:-7], original_i_view_name_2, i_view_name_2) + + # Open the output: only the DMS + output = open_uri(config['obi']['outputURI'], + input=False, + dms_only=True) + if output is None: + raise Exception("Could not create output") + o_dms = output[0] + o_dms_name = o_dms.name + final_o_view_name = output[1] + + # If the input and output DMS are not the same, align creating a temporary view in the input dms that will be exported to + # the right DMS and deleted in the other afterwards. + if i_dms != o_dms: + temporary_view_name = final_o_view_name + i=0 + while temporary_view_name in i_dms: # Making sure view name is unique in input DMS + temporary_view_name = final_o_view_name+b"_"+str2bytes(str(i)) + i+=1 + o_view_name = temporary_view_name + else: + o_view_name = final_o_view_name + + # Save command config in View comments + command_line = " ".join(sys.argv[1:]) + + i_dms_list = [i_dms_name] + if i_dms_name_2 and i_dms_name != i_dms_name_2: + i_dms_list.append(i_dms_name_2) + i_view_list = [i_view_name] + if original_i_view_name_2 and i_view_name != original_i_view_name_2: + i_view_list.append(original_i_view_name_2) + comments = View.print_config(config, "pouic", command_line, input_dms_name=i_dms_list, input_view_name=i_view_list) + + # Call cython alignment function + # Using default ID columns of the view. TODO discuss adding option + align_columns(i_dms_name, \ + i_view_name, \ + o_view_name, \ + input_view_2_n = i_view_name_2, \ + input_column_1_n = i_column_name, \ + input_column_2_n = i_column_name_2, \ + input_elt_1_n = i_element_name, \ + input_elt_2_n = i_element_name_2, \ + id_column_1_n = b"", \ + id_column_2_n = b"", \ + threshold = config['pouic']['threshold'], \ + normalize = config['pouic']['normalize'], \ + reference = config['pouic']['reflength'], \ + similarity_mode = config['pouic']['similarity'], \ + print_seq = config['pouic']['printseq'], \ + print_count = config['pouic']['printcount'], \ + comments = comments, \ + thread_count = config['pouic']['threadcount']) + + # If the input and output DMS are not the same, export result view to output DMS + if i_dms != o_dms: + View.import_view(i_dms.full_path[:-7], o_dms.full_path[:-7], o_view_name, final_o_view_name) + + # Save command config in output DMS comments + o_dms.record_command_line(command_line) + + print("\n") + print(repr(o_dms[final_o_view_name])) + + # If the two input DMS are different, delete the temporary input view in the first input DMS + if i_dms_2 and i_dms != i_dms_2: + View.delete_view(i_dms, i_view_name_2) + i_dms_2.close() + + # If the input and the output DMS are different, delete the temporary result view in the input DMS + if i_dms != o_dms: + View.delete_view(i_dms, o_view_name) + o_dms.close() + + i_dms.close() + \ No newline at end of file diff --git a/python/obitools3/dms/capi/obialign.pxd b/python/obitools3/dms/capi/obialign.pxd index 15fc3eb..f248625 100755 --- a/python/obitools3/dms/capi/obialign.pxd +++ b/python/obitools3/dms/capi/obialign.pxd @@ -4,13 +4,13 @@ from obitools3.dms.capi.obidms cimport OBIDMS_p from obitools3.dms.capi.obitypes cimport const_char_p -cdef extern from "obi_align.h" nogil: +cdef extern from "obi_lcs.h" nogil: - int obi_lcs_align_one_column(OBIDMS_p dms, + int obi_lcs_align_one_column(const_char_p dms_name, const_char_p seq_view_name, const_char_p seq_column_name, const_char_p seq_elt_name, - const_char_p id_column_name, + const_char_p id_column_name, const_char_p output_view_name, const_char_p output_view_comments, bint print_seq, @@ -22,7 +22,7 @@ cdef extern from "obi_align.h" nogil: int thread_count) - int obi_lcs_align_two_columns(OBIDMS_p dms, + int obi_lcs_align_two_columns(const_char_p dms_name, const_char_p seq1_view_name, const_char_p seq2_view_name, const_char_p seq1_column_name, diff --git a/python/obitools3/dms/dms.cfiles b/python/obitools3/dms/dms.cfiles index 6fc9329..6c08369 100755 --- a/python/obitools3/dms/dms.cfiles +++ b/python/obitools3/dms/dms.cfiles @@ -7,7 +7,7 @@ ../../../src/hashtable.c ../../../src/linked_list.c ../../../src/murmurhash2.c -../../../src/obi_align.c +../../../src/obi_lcs.c ../../../src/obi_clean.c ../../../src/obi_ecopcr.c ../../../src/libecoPCR/libthermo/nnparams.c diff --git a/python/obitools3/utils.cfiles b/python/obitools3/utils.cfiles index 7e738af..d3824da 100755 --- a/python/obitools3/utils.cfiles +++ b/python/obitools3/utils.cfiles @@ -7,7 +7,7 @@ ../../src/hashtable.c ../../src/linked_list.c ../../src/murmurhash2.c -../../src/obi_align.c +../../src/obi_lcs.c ../../src/obi_clean.c ../../src/obi_ecopcr.c ../../src/libecoPCR/libthermo/nnparams.c