Alignment: API rework. 'obi align' is now 'obi lcs', and the results are
now written to columns automatically created in the output view, all optimally handled at the C level.
This commit is contained in:
@ -1,120 +0,0 @@
|
||||
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
|
||||
from obitools3.obidms._obidms import OBIDMS, OBIView # TODO cimport doesn't work
|
||||
|
||||
|
||||
import time
|
||||
|
||||
__title__="Aligns one sequence column with itself or two sequence columns"
|
||||
|
||||
|
||||
default_config = { 'inputview' : None,
|
||||
}
|
||||
|
||||
def addOptions(parser):
    """Register the command-line options of the ``obi align`` command.

    Two argument groups are added to *parser*: one for the DMS and the
    input/output views, one for the alignment parameters. Every option is
    stored under a ``section:key`` destination (e.g. ``obi:inputview``,
    ``align:threshold``); ``run`` reads them back as
    ``config[section][key]`` (the splitting is presumably done by the
    configuration layer -- not visible here).

    :param parser: the ``argparse`` parser (or sub-parser) to populate.
    """

    # TODO put this common group somewhere else but I don't know where
    group = parser.add_argument_group('DMS and view options')

    group.add_argument('--default-dms', '-d',
                       action="store", dest="obi:defaultdms",
                       metavar='<DMS NAME>',
                       default=None,
                       type=str,
                       help="Name of the default DMS for reading and writing data.")

    group.add_argument('--input-view', '-i',
                       action="store", dest="obi:inputview",
                       metavar='<INPUT VIEW NAME>',
                       default=None,
                       type=str,
                       help="Name of the input view.")

    # TODO eventually 2nd view, or 2nd column?

    group.add_argument('--output-view', '-o',
                       action="store", dest="obi:outputview",
                       metavar='<OUTPUT VIEW NAME>',
                       default=None,
                       type=str,
                       help="Name of the output view.")

    group = parser.add_argument_group('obi align specific options')

    # The help text presents --lcs as a switch, so it must be usable without
    # a value: nargs='?' + const makes a bare '--lcs' select 'lcs' while
    # still accepting the historical '--lcs <ALIGNMENT TYPE>' form.
    group.add_argument('--lcs', '-C',
                       action="store", dest="align:alitype",
                       nargs='?', const='lcs',
                       metavar='<ALIGNMENT TYPE>',
                       default='lcs',
                       type=str,
                       help="Compute alignment using the LCS method (default).")

    group.add_argument('--threshold', '-t',
                       action="store", dest="align:threshold",
                       metavar='<THRESHOLD>',
                       default=0.0,
                       type=float,
                       help="Score threshold. If the score is normalized and expressed in similarity (default),"
                            " it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized"
                            " and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%."
                            " If the score is not normalized and expressed in similarity, it is the length of the"
                            " Longest Common Subsequence. If the score is not normalized and expressed in distance,"
                            " it is (reference length - LCS length)."
                            " Only sequence pairs with a similarity above <THRESHOLD> are printed. Default: 0.00"
                            " (no threshold).")

    # -L and -l share one destination: 0 (default) = alignment length,
    # 1 = longest sequence, 2 = shortest sequence. If both are given, the
    # last one on the command line wins. The hyphenated spellings match the
    # other options; the underscore spellings are kept for compatibility.
    group.add_argument('--longest-length', '--longest_length', '-L',
                       action="store_const", dest="align:reflength",
                       default=0,
                       const=1,
                       help="The reference length is the length of the longest sequence."
                            " Default: the reference length is the length of the alignment.")

    group.add_argument('--shortest-length', '--shortest_length', '-l',
                       action="store_const", dest="align:reflength",
                       default=0,
                       const=2,
                       help="The reference length is the length of the shortest sequence."
                            " Default: the reference length is the length of the alignment.")

    group.add_argument('--raw', '-r',
                       action="store_false", dest="align:normalize",
                       default=True,
                       help="Raw score, not normalized. Default: score is normalized with the reference sequence length.")

    group.add_argument('--distance', '-D',
                       action="store_false", dest="align:similarity",
                       default=True,
                       help="Score is expressed in distance. Default: score is expressed in similarity.")
|
||||
|
||||
|
||||
|
||||
def run(config):
    """Entry point of the ``obi align`` command.

    Opens the DMS named by ``config['obi']['defaultdms']``, opens the input
    view, creates the output view, calls the input view's ``align`` method
    with the parameters found in ``config['align']``, then prints the
    output view's ``repr``.

    The views and the DMS are closed even if opening a view or aligning
    raises, so a failure does not leave them open.

    :param config: nested mapping holding the ``'obi'`` and ``'align'``
                   option sections.
    """

    # Open DMS
    d = OBIDMS(config['obi']['defaultdms'])

    iview = None
    oview = None
    try:
        # Open input view 1
        iview = d.open_view(config['obi']['inputview'])

        # TODO Open input view 2 if there is one

        # Create output view
        oview = d.new_view(config['obi']['outputview'])

        # TODO Take other alignment types into account when they'll be implemented

        # Call cython alignment function
        iview.align(oview,
                    threshold=config['align']['threshold'],
                    normalize=config['align']['normalize'],
                    reference=config['align']['reflength'],
                    similarity_mode=config['align']['similarity'])

        print(repr(oview))
    finally:
        # Same closing order as before: input view, output view, then DMS.
        if iview is not None:
            iview.close()
        if oview is not None:
            oview.close()
        d.close()

    print("Done.")
|
||||
|
||||
|
||||
|
||||
|
||||
|
209
python/obitools3/commands/lcs.pyx
Normal file
209
python/obitools3/commands/lcs.pyx
Normal file
@ -0,0 +1,209 @@
|
||||
#cython: language_level=3
|
||||
|
||||
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
|
||||
from obitools3.obidms._obidms cimport OBIDMS # TODO cimport doesn't work
|
||||
from obitools3.utils cimport str2bytes
|
||||
|
||||
from obitools3.obidms.capi.obialign cimport obi_lcs_align_one_column
|
||||
|
||||
|
||||
import time
|
||||
|
||||
__title__="Aligns one sequence column with itself or two sequence columns"
|
||||
|
||||
|
||||
default_config = { 'inputview' : None,
|
||||
}
|
||||
|
||||
def addOptions(parser):
    """Register the command-line options of the ``obi lcs`` command.

    Two argument groups are added to *parser*: one describing the DMS, the
    input view(s)/column(s)/element(s)/ID column(s) and the output view, and
    one holding the LCS alignment parameters. Every option is stored under a
    ``section:key`` destination (e.g. ``obi:inputview1``,
    ``align:threshold``); ``run`` reads them back as
    ``config[section][key]`` (the splitting is presumably done by the
    configuration layer -- not visible here).

    :param parser: the ``argparse`` parser (or sub-parser) to populate.
    """

    # TODO put this common group somewhere else but I don't know where.
    # Also some options should probably be in another group
    group = parser.add_argument_group('DMS and view options')

    group.add_argument('--default-dms', '-d',
                       action="store", dest="obi:defaultdms",
                       metavar='<DMS NAME>',
                       default=None,
                       type=str,
                       help="Name of the default DMS for reading and writing data.")

    group.add_argument('--input-view-1', '-i',
                       action="store", dest="obi:inputview1",
                       metavar='<INPUT VIEW NAME>',
                       default=None,
                       type=str,
                       help="Name of the (first) input view.")

    group.add_argument('--input-view-2', '-I',
                       action="store", dest="obi:inputview2",
                       metavar='<INPUT VIEW NAME>',
                       default="",
                       type=str,
                       help="Optionally, the name of the second input view.")

    group.add_argument('--input-column-1', '-c',
                       action="store", dest="obi:inputcolumn1",
                       metavar='<INPUT COLUMN NAME>',
                       default="",
                       type=str,
                       help="Name of the (first) input column."
                            " Default: the default nucleotide sequence column of the view if there is one.")

    group.add_argument('--input-column-2', '-C',
                       action="store", dest="obi:inputcolumn2",
                       metavar='<INPUT COLUMN NAME>',
                       default="",
                       type=str,
                       help="Optionally, the name of the second input column.")

    group.add_argument('--input-elt-1', '-e',
                       action="store", dest="obi:inputelement1",
                       metavar='<INPUT ELEMENT NAME>',
                       default="",
                       type=str,
                       help="If the first input column has multiple elements per line, name of the element referring to the sequence to align."
                            " Default: the first element of the line.")

    group.add_argument('--input-elt-2', '-E',
                       action="store", dest="obi:inputelement2",
                       metavar='<INPUT ELEMENT NAME>',
                       default="",
                       type=str,
                       help="If the second input column has multiple elements per line, name of the element referring to the sequence to align."
                            " Default: the first element of the line.")

    group.add_argument('--id-column-1', '-f',
                       action="store", dest="obi:idcolumn1",
                       metavar='<ID COLUMN NAME>',
                       default="",
                       type=str,
                       help="Name of the (first) column containing the identifiers of the sequences to align."
                            " Default: the default ID column of the view if there is one.")

    group.add_argument('--id-column-2', '-F',
                       action="store", dest="obi:idcolumn2",
                       metavar='<ID COLUMN NAME>',
                       default="",
                       type=str,
                       help="Optionally, the name of the second ID column.")

    group.add_argument('--output-view', '-o',
                       action="store", dest="obi:outputview",
                       metavar='<OUTPUT VIEW NAME>',
                       default=None,
                       type=str,
                       help="Name of the output view.")

    group = parser.add_argument_group('obi lcs specific options')

    # '%%' is a literal percent sign: argparse %-formats help strings.
    group.add_argument('--threshold', '-t',
                       action="store", dest="align:threshold",
                       metavar='<THRESHOLD>',
                       default=0.0,
                       type=float,
                       help="Score threshold. If the score is normalized and expressed in similarity (default),"
                            " it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized"
                            " and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%."
                            " If the score is not normalized and expressed in similarity, it is the length of the"
                            " Longest Common Subsequence. If the score is not normalized and expressed in distance,"
                            " it is (reference length - LCS length)."
                            " Only sequence pairs with a similarity above <THRESHOLD> are written to the output view."
                            " Default: 0.00 (no threshold).")

    # -L and -l share one destination: 0 (default) = alignment length,
    # 1 = longest sequence, 2 = shortest sequence. If both are given, the
    # last one on the command line wins.
    group.add_argument('--longest-length', '-L',
                       action="store_const", dest="align:reflength",
                       default=0,
                       const=1,
                       help="The reference length is the length of the longest sequence."
                            " Default: the reference length is the length of the alignment.")

    group.add_argument('--shortest-length', '-l',
                       action="store_const", dest="align:reflength",
                       default=0,
                       const=2,
                       help="The reference length is the length of the shortest sequence."
                            " Default: the reference length is the length of the alignment.")

    group.add_argument('--raw', '-r',
                       action="store_false", dest="align:normalize",
                       default=True,
                       help="Raw score, not normalized. Default: score is normalized with the reference sequence length.")

    group.add_argument('--distance', '-D',
                       action="store_false", dest="align:similarity",
                       default=True,
                       help="Score is expressed in distance. Default: score is expressed in similarity.")

    group.add_argument('--print-seq', '-s',
                       action="store_true", dest="align:printseq",
                       default=False,
                       help="The nucleotide sequences are written in the output view. Default: they are not written.")

    group.add_argument('--print-count', '-n',
                       action="store_true", dest="align:printcount",
                       default=False,
                       help="Sequence counts are written in the output view. Default: they are not written.")
|
||||
|
||||
|
||||
cpdef align(str dms_n,
            str input_view_1_n, str output_view_n,
            str input_view_2_n="",
            str input_column_1_n="", str input_column_2_n="",
            str input_elt_1_n="", str input_elt_2_n="",
            str id_column_1_n="", str id_column_2_n="",
            double threshold=0.0, bint normalize=True,
            int reference=0, bint similarity_mode=True,
            bint print_seq=False, bint print_count=False,
            comments="") :
    """Run the C-level LCS alignment of one sequence column against itself.

    Opens the DMS named *dms_n*, forwards the view/column/element/ID names,
    the output view name and comments, and the scoring parameters to
    ``obi_lcs_align_one_column``, then closes the DMS.

    The arguments describing a second input (``input_view_2_n``,
    ``input_column_2_n``, ``input_elt_2_n``, ``id_column_2_n``) are accepted
    but not used yet: aligning two columns is not implemented.

    :raises Exception: if the C function returns a negative value.
    """

    cdef OBIDMS d
    d = OBIDMS(dms_n)

    try :
        # Align 1 column (2 columns not implemented yet)
        if obi_lcs_align_one_column(d._pointer,
                                    str2bytes(input_view_1_n),
                                    str2bytes(input_column_1_n),
                                    str2bytes(input_elt_1_n),
                                    str2bytes(id_column_1_n),
                                    str2bytes(output_view_n),
                                    str2bytes(comments),
                                    print_seq,
                                    print_count,
                                    threshold, normalize, reference, similarity_mode) < 0 :
            raise Exception("Error aligning sequences")
    finally :
        # Close the DMS on the error path too, so a failed alignment does
        # not leave it open.
        d.close()
|
||||
|
||||
|
||||
def run(config):
    """Entry point of the ``obi lcs`` command.

    Reads the DMS, view, column, element and ID-column names from
    ``config['obi']`` and the scoring/printing parameters from
    ``config['align']``, passes them to ``align`` and prints ``Done.``
    once it returns.

    :param config: nested mapping holding the ``'obi'`` and ``'align'``
                   option sections.
    """

    obi_options = config['obi']
    lcs_options = config['align']

    # TODO: Build formatted comments with all parameters etc
    comments = "Obi align"

    # Call cython alignment function
    align(obi_options['defaultdms'],
          obi_options['inputview1'],
          obi_options['outputview'],
          input_view_2_n=obi_options['inputview2'],
          input_column_1_n=obi_options['inputcolumn1'],
          input_column_2_n=obi_options['inputcolumn2'],
          input_elt_1_n=obi_options['inputelement1'],
          input_elt_2_n=obi_options['inputelement2'],
          id_column_1_n=obi_options['idcolumn1'],
          id_column_2_n=obi_options['idcolumn2'],
          threshold=lcs_options['threshold'],
          normalize=lcs_options['normalize'],
          reference=lcs_options['reflength'],
          similarity_mode=lcs_options['similarity'],
          print_seq=lcs_options['printseq'],
          print_count=lcs_options['printcount'],
          comments=comments)

    print("Done.")
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -67,16 +67,8 @@ cdef class OBIView:
|
||||
cdef object get_view_subclass(str view_type)
|
||||
|
||||
|
||||
cdef class OBIView_NUC_SEQS(OBIView):
|
||||
|
||||
cpdef align(self,
|
||||
OBIView oview,
|
||||
OBIView iview2=*,
|
||||
double threshold=*,
|
||||
bint normalize=*,
|
||||
int reference=*,
|
||||
bint similarity_mode=*
|
||||
)
|
||||
cdef class OBIView_NUC_SEQS(OBIView) :
|
||||
pass
|
||||
|
||||
|
||||
cdef class OBIView_line :
|
||||
|
@ -10,8 +10,6 @@ from .capi.obidmscolumn cimport obi_close_column, \
|
||||
OBIDMS_column_header_p
|
||||
|
||||
from .capi.obiutils cimport obi_format_date
|
||||
|
||||
from .capi.obialign cimport obi_align_one_column
|
||||
|
||||
from .capi.obitypes cimport const_char_p, \
|
||||
OBIType_t, \
|
||||
@ -535,49 +533,6 @@ cdef class OBIView_NUC_SEQS(OBIView):
|
||||
self[line_idx][key] = sequence_obj[key]
|
||||
|
||||
|
||||
# TODO discuss
|
||||
cpdef align(self, OBIView oview, OBIView iview2=None,
|
||||
double threshold=0.0, bint normalize=True, int reference=0, bint similarity_mode=True) :
|
||||
pass
|
||||
#
|
||||
# cdef OBIView iview1
|
||||
#
|
||||
# cdef Obiview_p iview1_p
|
||||
# cdef Obiview_p iview2_p
|
||||
# cdef Obiview_p oview_p
|
||||
#
|
||||
# cdef OBIDMS_column icol1
|
||||
# cdef OBIDMS_column_p icol1_p
|
||||
# cdef OBIDMS_column_p* icol1_pp
|
||||
#
|
||||
# cdef OBIDMS_column id1_col
|
||||
# cdef OBIDMS_column_p id1_col_p
|
||||
# cdef OBIDMS_column_p* id1_col_pp
|
||||
#
|
||||
# cdef OBIDMS_column id2_col
|
||||
# cdef OBIDMS_column_p id2_col_p
|
||||
# cdef OBIDMS_column_p* id2_col_pp
|
||||
#
|
||||
# cdef OBIDMS_column ocol
|
||||
# cdef OBIDMS_column_p ocol_p
|
||||
# cdef OBIDMS_column_p* ocol_pp
|
||||
#
|
||||
# cdef str id1_col_name
|
||||
# cdef str id2_col_name
|
||||
# cdef str score_col_name
|
||||
#
|
||||
# score_col_name = "score"
|
||||
#
|
||||
# iview1= self
|
||||
# iview1_p = iview1._pointer
|
||||
# icol1 = iview1[bytes2str(NUC_SEQUENCE_COLUMN)]
|
||||
# icol1_pp = icol1._pointer
|
||||
# icol1_p = icol1_pp[0]
|
||||
#
|
||||
# if obi_align_one_column(iview1_p, icol1_p, threshold, normalize, reference, similarity_mode) < 0 :
|
||||
# raise Exception("Error aligning sequences")
|
||||
|
||||
|
||||
######################################################################################################
|
||||
|
||||
|
||||
|
@ -4,7 +4,6 @@ from .capi.obiview cimport obi_get_seq_with_elt_name_and_col_p_in_view, \
|
||||
obi_get_seq_with_elt_idx_and_col_p_in_view, \
|
||||
obi_set_seq_with_elt_name_and_col_p_in_view, \
|
||||
obi_set_seq_with_elt_idx_and_col_p_in_view
|
||||
from .capi.obialign cimport obi_align_one_column
|
||||
from .capi.obierrno cimport obi_errno
|
||||
from .capi.obitypes cimport OBISeq_NA, const_char_p
|
||||
|
||||
|
@ -102,12 +102,12 @@ cdef class OBI_Nuc_Seq_Stored(OBIView_line) :
|
||||
return self[bytes2str(QUALITY_COLUMN)]
|
||||
@quality.setter
|
||||
def quality(self, object new_qual):
|
||||
if (type(new_qual) == list) or (new_qual is None) :
|
||||
if (type(new_qual) == list) or (new_qual is None) : # TODO check that quality column exists
|
||||
self[bytes2str(QUALITY_COLUMN)] = new_qual
|
||||
else : # Quality is in str form
|
||||
(((self._view).columns)[bytes2str(QUALITY_COLUMN)]).set_str_line(self._index, new_qual)
|
||||
|
||||
cpdef object get_str_quality(self) : # TODO not ideal
|
||||
cpdef object get_str_quality(self) : # TODO not ideal. Make quality_int and quality_str properties
|
||||
return ((self._view).columns)[bytes2str(QUALITY_COLUMN)].get_str_line(self._index)
|
||||
|
||||
# cpdef str reverse_complement(self) : TODO in C ?
|
||||
|
@ -1,20 +1,22 @@
|
||||
#cython: language_level=3
|
||||
|
||||
from ..capi.obiview cimport Obiview_p
|
||||
from ..capi.obidmscolumn cimport OBIDMS_column_p
|
||||
from obitools3.obidms.capi.obidms cimport OBIDMS_p
|
||||
from obitools3.obidms.capi.obitypes cimport const_char_p
|
||||
|
||||
|
||||
cdef extern from "obi_align.h" nogil:
|
||||
|
||||
int obi_align_one_column(Obiview_p seq_view,
|
||||
OBIDMS_column_p seq_column,
|
||||
const char* seq_name,
|
||||
Obiview_p score_view,
|
||||
OBIDMS_column_p id1_column,
|
||||
OBIDMS_column_p id2_column,
|
||||
OBIDMS_column_p score_column,
|
||||
double threshold,
|
||||
bint normalize,
|
||||
int reference,
|
||||
bint similarity_mode)
|
||||
int obi_lcs_align_one_column(OBIDMS_p dms,
|
||||
const_char_p seq_view_name,
|
||||
const_char_p seq_column_name,
|
||||
const_char_p seq_elt_name,
|
||||
const_char_p id_column_name,
|
||||
const_char_p output_view_name,
|
||||
const_char_p output_view_comments,
|
||||
bint print_seq,
|
||||
bint print_count,
|
||||
double threshold,
|
||||
bint normalize,
|
||||
int reference,
|
||||
bint similarity_mode)
|
||||
|
||||
|
Reference in New Issue
Block a user