1st version of obi align command and reworked functions that handle

column alignment
This commit is contained in:
Celine Mercier
2016-08-10 14:51:02 +02:00
parent 26b8e1f215
commit 2aaa87edcc
8 changed files with 373 additions and 43 deletions

View File

@ -0,0 +1,128 @@
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.obidms._obidms import OBIDMS, OBIView # TODO cimport doesn't work
import time
__title__ = "Aligns one sequence column with itself or two sequence columns"

# Default configuration values declared by this command module.
default_config = dict(
    inputview=None,
    skip=0,
    only=None,
    skiperror=False,
    moltype='nuc',
)
def addOptions(parser):
    """Declare the command-line options of the align command on ``parser``.

    Two argument groups are added to the parser: one to select the DMS and
    the input/output views, and one for the alignment-specific settings.
    Every option stores its value under a ``section:key`` destination name
    (for example ``obi:inputview`` or ``align:threshold``).

    parser -- argparse-style parser exposing ``add_argument_group``.
    """
    # TODO put this common group somewhere else but I don't know where
    dms_options = parser.add_argument_group('DMS and view options')

    dms_options.add_argument(
        '--default-dms', '-d',
        dest="obi:defaultdms",
        action="store",
        type=str,
        default=None,
        metavar='<DMS NAME>',
        help="Name of the default DMS for reading and writing data.",
    )

    dms_options.add_argument(
        '--input-view', '-i',
        dest="obi:inputview",
        action="store",
        type=str,
        default=None,
        metavar='<INPUT VIEW NAME>',
        help="Name of the input view, either raw if the view is in the default DMS,"
             " or in the form 'dms:view' if it is in another DMS.",
    )

    # TODO eventually 2nd view, or 2nd column?

    dms_options.add_argument(
        '--output-view', '-o',
        dest="obi:outputview",
        action="store",
        type=str,
        default=None,
        metavar='<OUTPUT VIEW NAME>',
        help="Name of the output view, either raw if the view is in the default DMS,"
             " or in the form 'dms:view' if it is in another DMS.",
    )

    align_options = parser.add_argument_group('obi align specific options')

    align_options.add_argument(
        '--lcs', '-C',
        dest="align:alitype",
        action="store",
        type=str,
        default='lcs',
        metavar='<ALIGNMENT TYPE>',
        help="Compute alignment using the LCS method.",
    )

    align_options.add_argument(
        '--threshold', '-t',
        dest="align:threshold",
        action="store",
        type=float,
        default=0.0,
        metavar='<THRESHOLD>',
        help="Score threshold. If the score is normalized and expressed in similarity (default),"
             " it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized"
             " and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%."
             " If the score is not normalized and expressed in similarity, it is the length of the"
             " Longest Common Subsequence. If the score is not normalized and expressed in distance,"
             " it is (reference length - LCS length)."
             " Only sequence pairs with a similarity above <THRESHOLD> are printed. Default: 0.00"
             " (no threshold).",
    )

    # Both reference-length flags write to the same destination; when neither
    # is given the value stays "ali".
    align_options.add_argument(
        '--longest_length', '-L',
        dest="align:reflength",
        action="store_const",
        const="longest",
        default="ali",
        help="The reference length is the length of the longest sequence."
             " Default: the reference length is the length of the alignment.",
    )

    align_options.add_argument(
        '--shortest_length', '-l',
        dest="align:reflength",
        action="store_const",
        const="shortest",
        default="ali",
        help="The reference length is the length of the shortest sequence."
             " Default: the reference length is the length of the alignment.",
    )

    # The two flags below switch a default-True setting off.
    align_options.add_argument(
        '--raw', '-r',
        dest="align:normalize",
        action="store_false",
        default=True,
        help="Raw score, not normalized. Default: score is normalized with the reference sequence length.",
    )

    align_options.add_argument(
        '--distance', '-D',
        dest="align:similarity",
        action="store_false",
        default=True,
        help="Score is expressed in distance. Default: score is expressed in similarity.",
    )
def run(config):
    """Run the align command on the configured input view.

    Opens the default DMS and the input view, creates the output view when
    one is named, then calls the view's ``align`` method and prints "Done.".

    config -- nested configuration mapping. The ``'obi'`` section supplies
              ``defaultdms``, ``inputview`` and ``outputview``; the
              ``'align'`` section supplies ``threshold``, ``normalize`` and
              ``similarity`` (the destinations declared in ``addOptions``).
    """
    #pb = ProgressBar(1, config, seconde=5) # TODO

    obi_options = config['obi']
    align_options = config['align']

    # Open DMS
    d = OBIDMS(obi_options['defaultdms'])

    # Open input view 1
    iview = d.open_view(obi_options['inputview'])

    # TODO Open input view 2 if there is one

    # Create output view if necessary
    if obi_options['outputview'] is not None:
        oview = d.new_view(obi_options['outputview'])
    else:
        oview = None

    # TODO Take other alignment types into account when they'll be implemented
    # TODO forward 'align:reflength' too: align() expects an integer
    #      'reference' and the mapping from "ali"/"longest"/"shortest" to that
    #      integer is not defined here -- confirm against the C library.

    # Call cython alignment function, forwarding the scoring options given on
    # the command line so that --threshold, --raw and --distance take effect.
    iview.align(output_view=oview,
                threshold=align_options['threshold'],
                normalize=align_options['normalize'],
                similarity_mode=align_options['similarity'])

    print("Done.")

View File

@ -77,7 +77,15 @@ cdef class OBIView_NUC_SEQS(OBIView):
cdef OBIDMS_column qualities
cpdef delete_column(self, str column_name)
# Declaration of the alignment method of the view. Every argument after
# self is optional (the '=*' markers); the default values themselves are
# given with the implementation of the method.
cpdef align(self,
OBIView iview2=*,
object output_view=*,
double threshold=*,
bint normalize=*,
int reference=*,
bint similarity_mode=*
)
cdef class OBIView_line :

View File

@ -10,7 +10,9 @@ from .capi.obidmscolumn cimport obi_close_column, \
OBIDMS_column_header_p
from .capi.obiutils cimport obi_format_date
from .capi.obialign cimport obi_align_one_column
from .capi.obitypes cimport const_char_p, \
OBIType_t, \
OBI_INT, \
@ -454,6 +456,7 @@ cdef class OBIView :
for line in self.__iter__() :
to_print = to_print + str(line) + "\n"
return to_print
#############################################
@ -524,7 +527,76 @@ cdef class OBIView_NUC_SEQS(OBIView):
# Fills the line at index line_idx from a sequence object: every key obtained
# by iterating over sequence_obj is copied, with its value, into that line.
def __setitem__(self, index_t line_idx, OBI_Nuc_Seq sequence_obj) :
for key in sequence_obj :
self[line_idx][key] = sequence_obj[key]
# TODO
cpdef align(self, OBIView iview2=None, object output_view=None,
            double threshold=0.0, bint normalize=True, int reference=0, bint similarity_mode=True) :
    """
    Align the nucleotide sequence column of this view with itself and store
    the result in an output view.

    Three columns are added to the output view: "ID1" and "ID2" (OBI_STR) and
    "score" (OBI_FLOAT). They are handed, with the sequence column of this
    view, to the C function obi_align_one_column.

    iview2          -- accepted but not used by this implementation yet.
    output_view     -- None: a new view named "alignment_score_view" is
                       created in the DMS of this view; a str: a new view
                       with that name is created; anything else is used as
                       the output view itself.
    threshold       -- passed unchanged to obi_align_one_column.
    normalize       -- passed unchanged to obi_align_one_column.
    reference       -- passed unchanged to obi_align_one_column.
    similarity_mode -- passed unchanged to obi_align_one_column.

    Raises Exception if obi_align_one_column returns a negative value.
    """

    cdef OBIView oview

    cdef Obiview_p iview1_p
    cdef Obiview_p oview_p

    cdef OBIDMS_column icol1
    cdef OBIDMS_column_p icol1_p
    cdef OBIDMS_column_p* icol1_pp

    cdef OBIDMS_column id1_col
    cdef OBIDMS_column_p id1_col_p
    cdef OBIDMS_column_p* id1_col_pp

    cdef OBIDMS_column id2_col
    cdef OBIDMS_column_p id2_col_p
    cdef OBIDMS_column_p* id2_col_pp

    cdef OBIDMS_column ocol
    cdef OBIDMS_column_p ocol_p
    cdef OBIDMS_column_p* ocol_pp

    cdef str id1_col_name
    cdef str id2_col_name
    cdef str score_col_name

    id1_col_name = "ID1"  # TODO discuss names, aliases
    id2_col_name = "ID2"
    score_col_name = "score"

    # Input side: this view and its nucleotide sequence column.
    iview1_p = self.pointer
    icol1 = self[bytes2str(NUC_SEQUENCE_COLUMN)]
    icol1_pp = icol1.pointer
    icol1_p = icol1_pp[0]

    # Create the output view if needed
    if output_view is None :
        oview = self.dms.new_view("alignment_score_view")  # TODO discuss
    elif isinstance(output_view, str) :
        # isinstance also accepts str subclasses as a view name
        oview = self.dms.new_view(output_view)
    else :
        oview = output_view

    # Output side: the two identifier columns and the score column.
    oview.add_column(id1_col_name, type='OBI_STR', create=True)
    oview.add_column(id2_col_name, type='OBI_STR', create=True)
    oview.add_column(score_col_name, type='OBI_FLOAT', create=True)

    oview_p = oview.pointer

    ocol = oview[score_col_name]
    ocol_pp = ocol.pointer
    ocol_p = ocol_pp[0]

    id1_col = oview[id1_col_name]
    id2_col = oview[id2_col_name]
    id1_col_pp = id1_col.pointer
    id2_col_pp = id2_col.pointer
    id1_col_p = id1_col_pp[0]
    id2_col_p = id2_col_pp[0]

    # A negative return value from the C function is reported as an error.
    if obi_align_one_column(iview1_p, icol1_p, oview_p, id1_col_p, id2_col_p, ocol_p, threshold, normalize, reference, similarity_mode) < 0 :
        raise Exception("Error aligning sequences")
#############################################

View File

@ -8,17 +8,6 @@ cdef class OBIDMS_column_seq(OBIDMS_column):
cpdef object get_line(self, index_t line_nb)
cpdef set_line(self, index_t line_nb, object value)
# TO DISCUSS :
# I'm not sure that this method has to be declared here
# Alignment must be declared outside of the sequence object
# Declaration of a column-level alignment method taking a score view and a
# score column. The '= *' markers denote optional arguments whose default
# values are given with the implementation.
cpdef align(self,
OBIView score_view,
OBIDMS_column score_column,
double threshold = *,
bint normalize = *,
int reference = *,
bint similarity_mode = *)
cdef class OBIDMS_column_multi_elts_seq(OBIDMS_column_multi_elts):
cpdef object get_item(self, index_t line_nb, str element_name)

View File

@ -37,18 +37,6 @@ cdef class OBIDMS_column_seq(OBIDMS_column):
else :
if obi_set_seq_with_elt_idx_and_col_p_in_view(self.view.pointer, (self.pointer)[0], line_nb, 0, str2bytes(value)) < 0:
raise Exception("Problem setting a value in a column")
# TODO choose alignment type (lcs or other) with supplementary argument
# Aligns the sequences of this column by calling obi_align_one_column with
# the view and pointer of this column, the given score view and score column,
# and the scoring options. Raises an Exception when the C function returns a
# negative value.
cpdef align(self,
OBIView score_view,
OBIDMS_column score_column,
double threshold = 0.0,
bint normalize = True,
int reference = 0, # TODO
bint similarity_mode = True):
if (obi_align_one_column(self.view.pointer, (self.pointer)[0], score_view.pointer, (score_column.pointer)[0], threshold, normalize, reference, similarity_mode) < 0) :
raise Exception("An error occurred while aligning sequences")
cdef class OBIDMS_column_multi_elts_seq(OBIDMS_column_multi_elts):

View File

@ -6,5 +6,14 @@ from ..capi.obidmscolumn cimport OBIDMS_column_p
# C-level declarations taken from the header obi_align.h (declared nogil).
cdef extern from "obi_align.h" nogil:
# Single-line form: one score column, 8 arguments.
# NOTE(review): presumably the previous signature, superseded by the
# declaration below -- confirm against the diff.
int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, Obiview_p score_view, OBIDMS_column_p score_column, double threshold, bint normalize, int reference, bint similarity_mode)
# Multi-line form: separate id1, id2 and score columns, 10 arguments.
# Returns an int.
int obi_align_one_column(Obiview_p seq_view,
OBIDMS_column_p seq_column,
Obiview_p score_view,
OBIDMS_column_p id1_column,
OBIDMS_column_p id2_column,
OBIDMS_column_p score_column,
double threshold,
bint normalize,
int reference,
bint similarity_mode)