Alignment: API rework. 'obi align' is now 'obi lcs', and the results are
now written to columns automatically created in the output view, all optimally handled at the C level.
This commit is contained in:
@ -1,120 +0,0 @@
|
|||||||
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
|
|
||||||
from obitools3.obidms._obidms import OBIDMS, OBIView # TODO cimport doesn't work
|
|
||||||
|
|
||||||
|
|
||||||
import time
|
|
||||||
|
|
||||||
__title__="Aligns one sequence column with itself or two sequence columns"
|
|
||||||
|
|
||||||
|
|
||||||
default_config = { 'inputview' : None,
|
|
||||||
}
|
|
||||||
|
|
||||||
def addOptions(parser):
|
|
||||||
|
|
||||||
# TODO put this common group somewhere else but I don't know where
|
|
||||||
group=parser.add_argument_group('DMS and view options')
|
|
||||||
|
|
||||||
group.add_argument('--default-dms','-d',
|
|
||||||
action="store", dest="obi:defaultdms",
|
|
||||||
metavar='<DMS NAME>',
|
|
||||||
default=None,
|
|
||||||
type=str,
|
|
||||||
help="Name of the default DMS for reading and writing data.")
|
|
||||||
|
|
||||||
group.add_argument('--input-view','-i',
|
|
||||||
action="store", dest="obi:inputview",
|
|
||||||
metavar='<INPUT VIEW NAME>',
|
|
||||||
default=None,
|
|
||||||
type=str,
|
|
||||||
help="Name of the input view.")
|
|
||||||
|
|
||||||
# TODO eventually 2nd view, or 2nd column?
|
|
||||||
|
|
||||||
group.add_argument('--output-view','-o',
|
|
||||||
action="store", dest="obi:outputview",
|
|
||||||
metavar='<OUTPUT VIEW NAME>',
|
|
||||||
default=None,
|
|
||||||
type=str,
|
|
||||||
help="Name of the output view.")
|
|
||||||
|
|
||||||
|
|
||||||
group=parser.add_argument_group('obi align specific options')
|
|
||||||
|
|
||||||
group.add_argument('--lcs','-C',
|
|
||||||
action="store", dest="align:alitype",
|
|
||||||
metavar='<ALIGNMENT TYPE>',
|
|
||||||
default='lcs',
|
|
||||||
type=str,
|
|
||||||
help="Compute alignment using the LCS method (default).")
|
|
||||||
|
|
||||||
group.add_argument('--threshold','-t',
|
|
||||||
action="store", dest="align:threshold",
|
|
||||||
metavar='<THRESHOLD>',
|
|
||||||
default=0.0,
|
|
||||||
type=float,
|
|
||||||
help="Score threshold. If the score is normalized and expressed in similarity (default),"
|
|
||||||
" it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized"
|
|
||||||
" and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%."
|
|
||||||
" If the score is not normalized and expressed in similarity, it is the length of the"
|
|
||||||
" Longest Common Subsequence. If the score is not normalized and expressed in distance,"
|
|
||||||
" it is (reference length - LCS length)."
|
|
||||||
" Only sequence pairs with a similarity above <THRESHOLD> are printed. Default: 0.00"
|
|
||||||
" (no threshold).")
|
|
||||||
|
|
||||||
group.add_argument('--longest_length','-L',
|
|
||||||
action="store_const", dest="align:reflength",
|
|
||||||
default=0,
|
|
||||||
const=1,
|
|
||||||
help="The reference length is the length of the longest sequence."
|
|
||||||
" Default: the reference length is the length of the alignment.")
|
|
||||||
|
|
||||||
group.add_argument('--shortest_length','-l',
|
|
||||||
action="store_const", dest="align:reflength",
|
|
||||||
default=0,
|
|
||||||
const=2,
|
|
||||||
help="The reference length is the length of the shortest sequence."
|
|
||||||
" Default: the reference length is the length of the alignment.")
|
|
||||||
|
|
||||||
group.add_argument('--raw','-r',
|
|
||||||
action="store_false", dest="align:normalize",
|
|
||||||
default=True,
|
|
||||||
help="Raw score, not normalized. Default: score is normalized with the reference sequence length.")
|
|
||||||
|
|
||||||
group.add_argument('--distance','-D',
|
|
||||||
action="store_false", dest="align:similarity",
|
|
||||||
default=True,
|
|
||||||
help="Score is expressed in distance. Default: score is expressed in similarity.")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def run(config):
|
|
||||||
|
|
||||||
# Open DMS
|
|
||||||
d = OBIDMS(config['obi']['defaultdms'])
|
|
||||||
|
|
||||||
# Open input view 1
|
|
||||||
iview = d.open_view(config['obi']['inputview'])
|
|
||||||
|
|
||||||
# TODO Open input view 2 if there is one
|
|
||||||
|
|
||||||
# Create output view
|
|
||||||
oview = d.new_view(config['obi']['outputview'])
|
|
||||||
|
|
||||||
# TODO Take other alignment types into account when they'll be implemented
|
|
||||||
|
|
||||||
# Call cython alignment function
|
|
||||||
iview.align(oview, threshold=config['align']['threshold'], normalize=config['align']['normalize'], reference=config['align']['reflength'], similarity_mode=config['align']['similarity'])
|
|
||||||
|
|
||||||
print(repr(oview))
|
|
||||||
|
|
||||||
iview.close()
|
|
||||||
oview.close()
|
|
||||||
d.close()
|
|
||||||
|
|
||||||
print("Done.")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
209
python/obitools3/commands/lcs.pyx
Normal file
209
python/obitools3/commands/lcs.pyx
Normal file
@ -0,0 +1,209 @@
|
|||||||
|
#cython: language_level=3
|
||||||
|
|
||||||
|
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
|
||||||
|
from obitools3.obidms._obidms cimport OBIDMS # TODO cimport doesn't work
|
||||||
|
from obitools3.utils cimport str2bytes
|
||||||
|
|
||||||
|
from obitools3.obidms.capi.obialign cimport obi_lcs_align_one_column
|
||||||
|
|
||||||
|
|
||||||
|
import time
|
||||||
|
|
||||||
|
__title__="Aligns one sequence column with itself or two sequence columns"
|
||||||
|
|
||||||
|
|
||||||
|
default_config = { 'inputview' : None,
|
||||||
|
}
|
||||||
|
|
||||||
|
def addOptions(parser):
|
||||||
|
|
||||||
|
# TODO put this common group somewhere else but I don't know where.
|
||||||
|
# Also some options should probably be in another group
|
||||||
|
group=parser.add_argument_group('DMS and view options')
|
||||||
|
|
||||||
|
group.add_argument('--default-dms', '-d',
|
||||||
|
action="store", dest="obi:defaultdms",
|
||||||
|
metavar='<DMS NAME>',
|
||||||
|
default=None,
|
||||||
|
type=str,
|
||||||
|
help="Name of the default DMS for reading and writing data.")
|
||||||
|
|
||||||
|
group.add_argument('--input-view-1', '-i',
|
||||||
|
action="store", dest="obi:inputview1",
|
||||||
|
metavar='<INPUT VIEW NAME>',
|
||||||
|
default=None,
|
||||||
|
type=str,
|
||||||
|
help="Name of the (first) input view.")
|
||||||
|
|
||||||
|
group.add_argument('--input-view-2', '-I',
|
||||||
|
action="store", dest="obi:inputview2",
|
||||||
|
metavar='<INPUT VIEW NAME>',
|
||||||
|
default="",
|
||||||
|
type=str,
|
||||||
|
help="Eventually, the name of the second input view.")
|
||||||
|
|
||||||
|
group.add_argument('--input-column-1', '-c',
|
||||||
|
action="store", dest="obi:inputcolumn1",
|
||||||
|
metavar='<INPUT COLUMN NAME>',
|
||||||
|
default="",
|
||||||
|
type=str,
|
||||||
|
help="Name of the (first) input column. "
|
||||||
|
" Default: the default nucleotide sequence column of the view if there is one.")
|
||||||
|
|
||||||
|
group.add_argument('--input-column-2', '-C',
|
||||||
|
action="store", dest="obi:inputcolumn2",
|
||||||
|
metavar='<INPUT COLUMN NAME>',
|
||||||
|
default="",
|
||||||
|
type=str,
|
||||||
|
help="Eventually, the name of the second input column.")
|
||||||
|
|
||||||
|
group.add_argument('--input-elt-1', '-e',
|
||||||
|
action="store", dest="obi:inputelement1",
|
||||||
|
metavar='<INPUT ELEMENT NAME>',
|
||||||
|
default="",
|
||||||
|
type=str,
|
||||||
|
help="If the first input column has multiple elements per line, name of the element referring to the sequence to align. "
|
||||||
|
" Default: the first element of the line.")
|
||||||
|
|
||||||
|
group.add_argument('--input-elt-2', '-E',
|
||||||
|
action="store", dest="obi:inputelement2",
|
||||||
|
metavar='<INPUT ELEMENT NAME>',
|
||||||
|
default="",
|
||||||
|
type=str,
|
||||||
|
help="If the second input column has multiple elements per line, name of the element referring to the sequence to align. "
|
||||||
|
" Default: the first element of the line.")
|
||||||
|
|
||||||
|
group.add_argument('--id-column-1', '-f',
|
||||||
|
action="store", dest="obi:idcolumn1",
|
||||||
|
metavar='<ID COLUMN NAME>',
|
||||||
|
default="",
|
||||||
|
type=str,
|
||||||
|
help="Name of the (first) column containing the identifiers of the sequences to align. "
|
||||||
|
" Default: the default ID column of the view if there is one.")
|
||||||
|
|
||||||
|
group.add_argument('--id-column-2', '-F',
|
||||||
|
action="store", dest="obi:idcolumn2",
|
||||||
|
metavar='<ID COLUMN NAME>',
|
||||||
|
default="",
|
||||||
|
type=str,
|
||||||
|
help="Eventually, the name of the second ID column.")
|
||||||
|
|
||||||
|
group.add_argument('--output-view', '-o',
|
||||||
|
action="store", dest="obi:outputview",
|
||||||
|
metavar='<OUTPUT VIEW NAME>',
|
||||||
|
default=None,
|
||||||
|
type=str,
|
||||||
|
help="Name of the output view.")
|
||||||
|
|
||||||
|
|
||||||
|
group=parser.add_argument_group('obi lcs specific options')
|
||||||
|
|
||||||
|
group.add_argument('--threshold','-t',
|
||||||
|
action="store", dest="align:threshold",
|
||||||
|
metavar='<THRESHOLD>',
|
||||||
|
default=0.0,
|
||||||
|
type=float,
|
||||||
|
help="Score threshold. If the score is normalized and expressed in similarity (default),"
|
||||||
|
" it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized"
|
||||||
|
" and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%."
|
||||||
|
" If the score is not normalized and expressed in similarity, it is the length of the"
|
||||||
|
" Longest Common Subsequence. If the score is not normalized and expressed in distance,"
|
||||||
|
" it is (reference length - LCS length)."
|
||||||
|
" Only sequence pairs with a similarity above <THRESHOLD> are printed. Default: 0.00"
|
||||||
|
" (no threshold).")
|
||||||
|
|
||||||
|
group.add_argument('--longest-length','-L',
|
||||||
|
action="store_const", dest="align:reflength",
|
||||||
|
default=0,
|
||||||
|
const=1,
|
||||||
|
help="The reference length is the length of the longest sequence."
|
||||||
|
" Default: the reference length is the length of the alignment.")
|
||||||
|
|
||||||
|
group.add_argument('--shortest-length','-l',
|
||||||
|
action="store_const", dest="align:reflength",
|
||||||
|
default=0,
|
||||||
|
const=2,
|
||||||
|
help="The reference length is the length of the shortest sequence."
|
||||||
|
" Default: the reference length is the length of the alignment.")
|
||||||
|
|
||||||
|
group.add_argument('--raw','-r',
|
||||||
|
action="store_false", dest="align:normalize",
|
||||||
|
default=True,
|
||||||
|
help="Raw score, not normalized. Default: score is normalized with the reference sequence length.")
|
||||||
|
|
||||||
|
group.add_argument('--distance','-D',
|
||||||
|
action="store_false", dest="align:similarity",
|
||||||
|
default=True,
|
||||||
|
help="Score is expressed in distance. Default: score is expressed in similarity.")
|
||||||
|
|
||||||
|
group.add_argument('--print-seq','-s',
|
||||||
|
action="store_true", dest="align:printseq",
|
||||||
|
default=False,
|
||||||
|
help="The nucleotide sequences are written in the output view. Default: they are not written.")
|
||||||
|
|
||||||
|
group.add_argument('--print-count','-n',
|
||||||
|
action="store_true", dest="align:printcount",
|
||||||
|
default=False,
|
||||||
|
help="Sequence counts are written in the output view. Default: they are not written.")
|
||||||
|
|
||||||
|
|
||||||
|
cpdef align(str dms_n,
|
||||||
|
str input_view_1_n, str output_view_n,
|
||||||
|
str input_view_2_n="",
|
||||||
|
str input_column_1_n="", str input_column_2_n="",
|
||||||
|
str input_elt_1_n="", str input_elt_2_n="",
|
||||||
|
str id_column_1_n="", str id_column_2_n="",
|
||||||
|
double threshold=0.0, bint normalize=True,
|
||||||
|
int reference=0, bint similarity_mode=True,
|
||||||
|
bint print_seq=False, bint print_count=False,
|
||||||
|
comments="") :
|
||||||
|
|
||||||
|
cdef OBIDMS d
|
||||||
|
d = OBIDMS(dms_n)
|
||||||
|
|
||||||
|
# Align 1 column (2 columns not implemented yet)
|
||||||
|
if obi_lcs_align_one_column(d._pointer, \
|
||||||
|
str2bytes(input_view_1_n), \
|
||||||
|
str2bytes(input_column_1_n), \
|
||||||
|
str2bytes(input_elt_1_n), \
|
||||||
|
str2bytes(id_column_1_n), \
|
||||||
|
str2bytes(output_view_n), \
|
||||||
|
str2bytes(comments), \
|
||||||
|
print_seq, \
|
||||||
|
print_count, \
|
||||||
|
threshold, normalize, reference, similarity_mode) < 0 :
|
||||||
|
raise Exception("Error aligning sequences")
|
||||||
|
|
||||||
|
d.close()
|
||||||
|
|
||||||
|
|
||||||
|
def run(config):
|
||||||
|
|
||||||
|
# TODO: Build formatted comments with all parameters etc
|
||||||
|
comments = "Obi align"
|
||||||
|
|
||||||
|
# Call cython alignment function
|
||||||
|
align(config['obi']['defaultdms'], \
|
||||||
|
config['obi']['inputview1'], \
|
||||||
|
config['obi']['outputview'], \
|
||||||
|
input_view_2_n = config['obi']['inputview2'], \
|
||||||
|
input_column_1_n = config['obi']['inputcolumn1'], \
|
||||||
|
input_column_2_n = config['obi']['inputcolumn2'], \
|
||||||
|
input_elt_1_n = config['obi']['inputelement1'], \
|
||||||
|
input_elt_2_n = config['obi']['inputelement2'], \
|
||||||
|
id_column_1_n = config['obi']['idcolumn1'], \
|
||||||
|
id_column_2_n = config['obi']['idcolumn2'], \
|
||||||
|
threshold = config['align']['threshold'], \
|
||||||
|
normalize = config['align']['normalize'], \
|
||||||
|
reference = config['align']['reflength'], \
|
||||||
|
similarity_mode = config['align']['similarity'], \
|
||||||
|
print_seq = config['align']['printseq'], \
|
||||||
|
print_count = config['align']['printcount'], \
|
||||||
|
comments = comments)
|
||||||
|
|
||||||
|
print("Done.")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -67,16 +67,8 @@ cdef class OBIView:
|
|||||||
cdef object get_view_subclass(str view_type)
|
cdef object get_view_subclass(str view_type)
|
||||||
|
|
||||||
|
|
||||||
cdef class OBIView_NUC_SEQS(OBIView):
|
cdef class OBIView_NUC_SEQS(OBIView) :
|
||||||
|
pass
|
||||||
cpdef align(self,
|
|
||||||
OBIView oview,
|
|
||||||
OBIView iview2=*,
|
|
||||||
double threshold=*,
|
|
||||||
bint normalize=*,
|
|
||||||
int reference=*,
|
|
||||||
bint similarity_mode=*
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
cdef class OBIView_line :
|
cdef class OBIView_line :
|
||||||
|
@ -11,8 +11,6 @@ from .capi.obidmscolumn cimport obi_close_column, \
|
|||||||
|
|
||||||
from .capi.obiutils cimport obi_format_date
|
from .capi.obiutils cimport obi_format_date
|
||||||
|
|
||||||
from .capi.obialign cimport obi_align_one_column
|
|
||||||
|
|
||||||
from .capi.obitypes cimport const_char_p, \
|
from .capi.obitypes cimport const_char_p, \
|
||||||
OBIType_t, \
|
OBIType_t, \
|
||||||
OBI_INT, \
|
OBI_INT, \
|
||||||
@ -535,49 +533,6 @@ cdef class OBIView_NUC_SEQS(OBIView):
|
|||||||
self[line_idx][key] = sequence_obj[key]
|
self[line_idx][key] = sequence_obj[key]
|
||||||
|
|
||||||
|
|
||||||
# TODO discuss
|
|
||||||
cpdef align(self, OBIView oview, OBIView iview2=None,
|
|
||||||
double threshold=0.0, bint normalize=True, int reference=0, bint similarity_mode=True) :
|
|
||||||
pass
|
|
||||||
#
|
|
||||||
# cdef OBIView iview1
|
|
||||||
#
|
|
||||||
# cdef Obiview_p iview1_p
|
|
||||||
# cdef Obiview_p iview2_p
|
|
||||||
# cdef Obiview_p oview_p
|
|
||||||
#
|
|
||||||
# cdef OBIDMS_column icol1
|
|
||||||
# cdef OBIDMS_column_p icol1_p
|
|
||||||
# cdef OBIDMS_column_p* icol1_pp
|
|
||||||
#
|
|
||||||
# cdef OBIDMS_column id1_col
|
|
||||||
# cdef OBIDMS_column_p id1_col_p
|
|
||||||
# cdef OBIDMS_column_p* id1_col_pp
|
|
||||||
#
|
|
||||||
# cdef OBIDMS_column id2_col
|
|
||||||
# cdef OBIDMS_column_p id2_col_p
|
|
||||||
# cdef OBIDMS_column_p* id2_col_pp
|
|
||||||
#
|
|
||||||
# cdef OBIDMS_column ocol
|
|
||||||
# cdef OBIDMS_column_p ocol_p
|
|
||||||
# cdef OBIDMS_column_p* ocol_pp
|
|
||||||
#
|
|
||||||
# cdef str id1_col_name
|
|
||||||
# cdef str id2_col_name
|
|
||||||
# cdef str score_col_name
|
|
||||||
#
|
|
||||||
# score_col_name = "score"
|
|
||||||
#
|
|
||||||
# iview1= self
|
|
||||||
# iview1_p = iview1._pointer
|
|
||||||
# icol1 = iview1[bytes2str(NUC_SEQUENCE_COLUMN)]
|
|
||||||
# icol1_pp = icol1._pointer
|
|
||||||
# icol1_p = icol1_pp[0]
|
|
||||||
#
|
|
||||||
# if obi_align_one_column(iview1_p, icol1_p, threshold, normalize, reference, similarity_mode) < 0 :
|
|
||||||
# raise Exception("Error aligning sequences")
|
|
||||||
|
|
||||||
|
|
||||||
######################################################################################################
|
######################################################################################################
|
||||||
|
|
||||||
|
|
||||||
|
@ -4,7 +4,6 @@ from .capi.obiview cimport obi_get_seq_with_elt_name_and_col_p_in_view, \
|
|||||||
obi_get_seq_with_elt_idx_and_col_p_in_view, \
|
obi_get_seq_with_elt_idx_and_col_p_in_view, \
|
||||||
obi_set_seq_with_elt_name_and_col_p_in_view, \
|
obi_set_seq_with_elt_name_and_col_p_in_view, \
|
||||||
obi_set_seq_with_elt_idx_and_col_p_in_view
|
obi_set_seq_with_elt_idx_and_col_p_in_view
|
||||||
from .capi.obialign cimport obi_align_one_column
|
|
||||||
from .capi.obierrno cimport obi_errno
|
from .capi.obierrno cimport obi_errno
|
||||||
from .capi.obitypes cimport OBISeq_NA, const_char_p
|
from .capi.obitypes cimport OBISeq_NA, const_char_p
|
||||||
|
|
||||||
|
@ -102,12 +102,12 @@ cdef class OBI_Nuc_Seq_Stored(OBIView_line) :
|
|||||||
return self[bytes2str(QUALITY_COLUMN)]
|
return self[bytes2str(QUALITY_COLUMN)]
|
||||||
@quality.setter
|
@quality.setter
|
||||||
def quality(self, object new_qual):
|
def quality(self, object new_qual):
|
||||||
if (type(new_qual) == list) or (new_qual is None) :
|
if (type(new_qual) == list) or (new_qual is None) : # TODO check that quality column exists
|
||||||
self[bytes2str(QUALITY_COLUMN)] = new_qual
|
self[bytes2str(QUALITY_COLUMN)] = new_qual
|
||||||
else : # Quality is in str form
|
else : # Quality is in str form
|
||||||
(((self._view).columns)[bytes2str(QUALITY_COLUMN)]).set_str_line(self._index, new_qual)
|
(((self._view).columns)[bytes2str(QUALITY_COLUMN)]).set_str_line(self._index, new_qual)
|
||||||
|
|
||||||
cpdef object get_str_quality(self) : # TODO not ideal
|
cpdef object get_str_quality(self) : # TODO not ideal. Make quality_int and quality_str properties
|
||||||
return ((self._view).columns)[bytes2str(QUALITY_COLUMN)].get_str_line(self._index)
|
return ((self._view).columns)[bytes2str(QUALITY_COLUMN)].get_str_line(self._index)
|
||||||
|
|
||||||
# cpdef str reverse_complement(self) : TODO in C ?
|
# cpdef str reverse_complement(self) : TODO in C ?
|
||||||
|
@ -1,20 +1,22 @@
|
|||||||
#cython: language_level=3
|
#cython: language_level=3
|
||||||
|
|
||||||
from ..capi.obiview cimport Obiview_p
|
from obitools3.obidms.capi.obidms cimport OBIDMS_p
|
||||||
from ..capi.obidmscolumn cimport OBIDMS_column_p
|
from obitools3.obidms.capi.obitypes cimport const_char_p
|
||||||
|
|
||||||
|
|
||||||
cdef extern from "obi_align.h" nogil:
|
cdef extern from "obi_align.h" nogil:
|
||||||
|
|
||||||
int obi_align_one_column(Obiview_p seq_view,
|
int obi_lcs_align_one_column(OBIDMS_p dms,
|
||||||
OBIDMS_column_p seq_column,
|
const_char_p seq_view_name,
|
||||||
const char* seq_name,
|
const_char_p seq_column_name,
|
||||||
Obiview_p score_view,
|
const_char_p seq_elt_name,
|
||||||
OBIDMS_column_p id1_column,
|
const_char_p id_column_name,
|
||||||
OBIDMS_column_p id2_column,
|
const_char_p output_view_name,
|
||||||
OBIDMS_column_p score_column,
|
const_char_p output_view_comments,
|
||||||
double threshold,
|
bint print_seq,
|
||||||
bint normalize,
|
bint print_count,
|
||||||
int reference,
|
double threshold,
|
||||||
bint similarity_mode)
|
bint normalize,
|
||||||
|
int reference,
|
||||||
|
bint similarity_mode)
|
||||||
|
|
||||||
|
334
src/obi_align.c
334
src/obi_align.c
@ -14,6 +14,7 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdbool.h>
|
#include <stdbool.h>
|
||||||
|
|
||||||
|
#include "obi_align.h"
|
||||||
#include "obidebug.h"
|
#include "obidebug.h"
|
||||||
#include "obierrno.h"
|
#include "obierrno.h"
|
||||||
#include "obitypes.h"
|
#include "obitypes.h"
|
||||||
@ -28,67 +29,227 @@
|
|||||||
|
|
||||||
// TODO
|
// TODO
|
||||||
// use openMP pragmas
|
// use openMP pragmas
|
||||||
// option pour ecrire en stdint?
|
|
||||||
// check NUC_SEQS view type? and score type (int or float if normalize)
|
|
||||||
// what's with multiple sequences/line columns?
|
|
||||||
|
|
||||||
|
|
||||||
int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const char* seq_name,
|
int obi_lcs_align_one_column(OBIDMS_p dms, const char* seq_view_name, const char* seq_column_name, const char* seq_elt_name,
|
||||||
Obiview_p score_view, OBIDMS_column_p id1_column, OBIDMS_column_p id2_column, OBIDMS_column_p score_column,
|
const char* id_column_name,
|
||||||
double threshold, bool normalize, int reference, bool similarity_mode)
|
const char* output_view_name, const char* output_view_comments,
|
||||||
|
bool print_seq, bool print_count,
|
||||||
|
double threshold, bool normalize, int reference, bool similarity_mode)
|
||||||
{
|
{
|
||||||
index_t i, j, k;
|
index_t i, j, k;
|
||||||
index_t seq_count;
|
index_t seq_count;
|
||||||
const char* id1;
|
index_t id1_idx, id2_idx;
|
||||||
const char* id2;
|
index_t seq1_idx, seq2_idx;
|
||||||
double score;
|
double score;
|
||||||
OBIDMS_column_p id_column;
|
int lcs_length;
|
||||||
|
int ali_length;
|
||||||
Kmer_table_p ktable;
|
Kmer_table_p ktable;
|
||||||
Obi_blob_p blob1;
|
Obi_blob_p blob1;
|
||||||
Obi_blob_p blob2;
|
Obi_blob_p blob2;
|
||||||
int lcs_min;
|
int lcs_min;
|
||||||
index_t seq_idx;
|
index_t seq_elt_idx;
|
||||||
|
|
||||||
|
Obiview_p seq_view = NULL;
|
||||||
|
Obiview_p output_view = NULL;
|
||||||
|
OBIDMS_column_p iseq_column = NULL;
|
||||||
|
OBIDMS_column_p id_column;
|
||||||
|
OBIDMS_column_p id1_column = NULL;
|
||||||
|
OBIDMS_column_p id2_column = NULL;
|
||||||
|
OBIDMS_column_p seq1_column = NULL;
|
||||||
|
OBIDMS_column_p seq2_column = NULL;
|
||||||
|
//OBIDMS_column_p count1_column = NULL;
|
||||||
|
//OBIDMS_column_p count2_column = NULL;
|
||||||
|
OBIDMS_column_p idx1_column = NULL;
|
||||||
|
OBIDMS_column_p idx2_column = NULL;
|
||||||
|
OBIDMS_column_p lcs_length_column = NULL;
|
||||||
|
OBIDMS_column_p ali_length_column = NULL;
|
||||||
|
OBIDMS_column_p score_column = NULL;
|
||||||
|
|
||||||
k = 0;
|
k = 0;
|
||||||
|
|
||||||
// If no sequence column is given and the view has the type NUC_SEQS_VIEW, the default sequence column is aligned
|
// Open input view
|
||||||
if ((seq_column == NULL) && (strcmp((seq_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0))
|
seq_view = obi_open_view(dms, seq_view_name);
|
||||||
|
if (seq_view == NULL)
|
||||||
{
|
{
|
||||||
seq_column = obi_view_get_column(seq_view, NUC_SEQUENCE_COLUMN);
|
obidebug(1, "\nError opening the input view to align");
|
||||||
if (seq_column == NULL)
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Open the sequence column to align
|
||||||
|
// If a column name wasn't given, open default sequence column
|
||||||
|
if (strcmp(seq_column_name, "") == 0)
|
||||||
|
{
|
||||||
|
if (strcmp((seq_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0)
|
||||||
|
iseq_column = obi_view_get_column(seq_view, NUC_SEQUENCE_COLUMN);
|
||||||
|
else
|
||||||
|
{
|
||||||
|
obi_set_errno(OBI_ALIGN_ERROR);
|
||||||
|
obidebug(1, "\nError: no column given to align");
|
||||||
return -1;
|
return -1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// Check that the given sequence column contains nucleotide sequences
|
else
|
||||||
else if ((seq_column->header)->returned_data_type != OBI_SEQ)
|
iseq_column = obi_view_get_column(seq_view, seq_column_name);
|
||||||
|
if (iseq_column == NULL)
|
||||||
{
|
{
|
||||||
obi_set_errno(OBI_ALIGN_ERROR);
|
obidebug(1, "\nError getting the column to align");
|
||||||
obidebug(1, "\nTrying to align a column of a different type than OBI_SEQ");
|
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((normalize && ((score_column->header)->returned_data_type != OBI_FLOAT)) ||
|
// Get element index of the sequence to align in each line to compute it only once
|
||||||
(!normalize && ((score_column->header)->returned_data_type != OBI_INT)))
|
if ((strcmp(seq_elt_name, "") != 0) && (seq_elt_name != NULL))
|
||||||
{
|
{
|
||||||
obi_set_errno(OBI_ALIGN_ERROR);
|
seq_elt_idx = obi_column_get_element_index_from_name(iseq_column, seq_elt_name);
|
||||||
obidebug(1, "\nTrying to store alignment scores in a column of an inappropriate type");
|
if (seq_elt_idx == OBIIdx_NA)
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get element index from element name to compute it only once
|
|
||||||
if (seq_name != NULL)
|
|
||||||
{
|
|
||||||
seq_idx = obi_column_get_element_index_from_name(seq_column, seq_name);
|
|
||||||
if (seq_idx == OBIIdx_NA)
|
|
||||||
{
|
{
|
||||||
obidebug(1, "\nError getting the sequence index in a column line when aligning");
|
obidebug(1, "\nError getting the sequence index in a column line when aligning");
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
seq_idx = 0;
|
seq_elt_idx = 0;
|
||||||
|
|
||||||
|
// Open the ID column, containing the identifiers of the sequences to align
|
||||||
|
// If a column name wasn't given, open default ID column
|
||||||
|
if (strcmp(id_column_name, "") == 0)
|
||||||
|
{
|
||||||
|
if (strcmp((seq_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0)
|
||||||
|
id_column = obi_view_get_column(seq_view, ID_COLUMN);
|
||||||
|
else
|
||||||
|
{
|
||||||
|
obi_set_errno(OBI_ALIGN_ERROR);
|
||||||
|
obidebug(1, "\nError: no ID column given");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
id_column = obi_view_get_column(seq_view, id_column_name);
|
||||||
|
if (id_column == NULL)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError getting the ID column");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create the output view
|
||||||
|
output_view = obi_new_view(dms, output_view_name, NULL, NULL, output_view_comments);
|
||||||
|
if (output_view == NULL)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError creating the output view when aligning");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create the output columns
|
||||||
|
|
||||||
|
// Create the column for the ids of the 1st sequence aligned
|
||||||
|
if (obi_view_add_column(output_view, ID1_COLUMN_NAME, -1, ID1_COLUMN_NAME, OBI_STR, 0, 1, NULL, (id_column->header)->indexer_name, NULL, -1, ID1_COLUMN_COMMENTS, true) < 0)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError creating the first column for the sequence ids when aligning");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
id1_column = obi_view_get_column(output_view, ID1_COLUMN_NAME);
|
||||||
|
|
||||||
|
// Create the column for the ids of the 2nd sequence aligned
|
||||||
|
if (obi_view_add_column(output_view, ID2_COLUMN_NAME, -1, ID2_COLUMN_NAME, OBI_STR, 0, 1, NULL, (id_column->header)->indexer_name, NULL, -1, ID2_COLUMN_COMMENTS, true) < 0)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError creating the second column for the sequence ids when aligning");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
id2_column = obi_view_get_column(output_view, ID2_COLUMN_NAME);
|
||||||
|
|
||||||
|
// Create the column for the index (in the input view) of the first sequences aligned
|
||||||
|
if (obi_view_add_column(output_view, IDX1_COLUMN_NAME, -1, IDX1_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, IDX1_COLUMN_COMMENTS, true) < 0)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError creating the first column for the sequence indices when aligning");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
idx1_column = obi_view_get_column(output_view, IDX1_COLUMN_NAME);
|
||||||
|
|
||||||
|
// Create the column for the index (in the input view) of the second sequences aligned
|
||||||
|
if (obi_view_add_column(output_view, IDX2_COLUMN_NAME, -1, IDX2_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, IDX2_COLUMN_COMMENTS, true) < 0)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError creating the second column for the sequence indices when aligning");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
idx2_column = obi_view_get_column(output_view, IDX2_COLUMN_NAME);
|
||||||
|
|
||||||
|
// Create the column for the LCS length
|
||||||
|
if (obi_view_add_column(output_view, LCS_LENGTH_COLUMN_NAME, -1, LCS_LENGTH_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, LCS_LENGTH_COLUMN_COMMENTS, true) < 0)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError creating the column for the LCS length when aligning");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
lcs_length_column = obi_view_get_column(output_view, LCS_LENGTH_COLUMN_NAME);
|
||||||
|
|
||||||
|
// Create the column for the alignment length if it is computed
|
||||||
|
if ((reference == ALILEN) && (normalize || !similarity_mode))
|
||||||
|
{
|
||||||
|
if (obi_view_add_column(output_view, ALI_LENGTH_COLUMN_NAME, -1, ALI_LENGTH_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, ALI_LENGTH_COLUMN_COMMENTS, true) < 0)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError creating the column for the alignment length when aligning");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
ali_length_column = obi_view_get_column(output_view, ALI_LENGTH_COLUMN_NAME);
|
||||||
|
}
|
||||||
|
// Create the column for the alignment score
|
||||||
|
if (normalize)
|
||||||
|
{
|
||||||
|
if (obi_view_add_column(output_view, SCORE_COLUMN_NAME, -1, SCORE_COLUMN_NAME, OBI_FLOAT, 0, 1, NULL, NULL, NULL, -1, SCORE_COLUMN_NAME, true) < 0)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError creating the column for the score when aligning");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (obi_view_add_column(output_view, SCORE_COLUMN_NAME, -1, SCORE_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, SCORE_COLUMN_NAME, true) < 0)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError creating the column for the score when aligning");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
score_column = obi_view_get_column(output_view, SCORE_COLUMN_NAME);
|
||||||
|
|
||||||
|
if (print_seq)
|
||||||
|
{
|
||||||
|
// Create the column for the first sequences aligned
|
||||||
|
if (obi_view_add_column(output_view, SEQ1_COLUMN_NAME, -1, SEQ1_COLUMN_NAME, OBI_SEQ, 0, 1, NULL, (iseq_column->header)->indexer_name, NULL, -1, SEQ1_COLUMN_COMMENTS, true) < 0)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError creating the first column for the sequences when aligning");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
seq1_column = obi_view_get_column(output_view, SEQ1_COLUMN_NAME);
|
||||||
|
|
||||||
|
// Create the column for the second sequences aligned
|
||||||
|
if (obi_view_add_column(output_view, SEQ2_COLUMN_NAME, -1, SEQ2_COLUMN_NAME, OBI_SEQ, 0, 1, NULL, (iseq_column->header)->indexer_name, NULL, -1, SEQ2_COLUMN_COMMENTS, true) < 0)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError creating the second column for the sequences when aligning");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
seq2_column = obi_view_get_column(output_view, SEQ2_COLUMN_NAME);
|
||||||
|
}
|
||||||
|
// if (print_count) // TODO count columns not implemented yet
|
||||||
|
// {
|
||||||
|
// // Create the column for the count of the first sequences aligned
|
||||||
|
// if (obi_view_add_column(output_view, COUNT1_COLUMN_NAME, -1, COUNT1_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, COUNT1_COLUMN_COMMENTS, true) < 0)
|
||||||
|
// {
|
||||||
|
// obidebug(1, "\nError creating the first column for the sequence counts when aligning");
|
||||||
|
// return -1;
|
||||||
|
// }
|
||||||
|
// count1_column = obi_view_get_column(seq_view, COUNT1_COLUMN_NAME);
|
||||||
|
//
|
||||||
|
// // Create the column for the count of the second sequences aligned
|
||||||
|
// if (obi_view_add_column(output_view, COUNT2_COLUMN_NAME, -1, COUNT2_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, COUNT2_COLUMN_COMMENTS, true) < 0)
|
||||||
|
// {
|
||||||
|
// obidebug(1, "\nError creating the second column for the sequence counts when aligning");
|
||||||
|
// return -1;
|
||||||
|
// }
|
||||||
|
// count2_column = obi_view_get_column(seq_view, COUNT2_COLUMN_NAME);
|
||||||
|
// }
|
||||||
|
|
||||||
|
|
||||||
// Build kmer tables
|
// Build kmer tables
|
||||||
ktable = hash_seq_column(seq_view, seq_column, seq_idx);
|
ktable = hash_seq_column(seq_view, iseq_column, seq_elt_idx);
|
||||||
if (ktable == NULL)
|
if (ktable == NULL)
|
||||||
{
|
{
|
||||||
obi_set_errno(OBI_ALIGN_ERROR);
|
obi_set_errno(OBI_ALIGN_ERROR);
|
||||||
@ -96,10 +257,7 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const c
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get the ID column pointer
|
seq_count = (iseq_column->header)->lines_used;
|
||||||
id_column = obi_view_get_column(seq_view, ID_COLUMN);
|
|
||||||
|
|
||||||
seq_count = (seq_column->header)->lines_used;
|
|
||||||
|
|
||||||
for (i=0; i < (seq_count - 1); i++)
|
for (i=0; i < (seq_count - 1); i++)
|
||||||
{
|
{
|
||||||
@ -108,8 +266,10 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const c
|
|||||||
|
|
||||||
for (j=i+1; j < seq_count; j++)
|
for (j=i+1; j < seq_count; j++)
|
||||||
{
|
{
|
||||||
blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, seq_idx);
|
blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, i, seq_elt_idx);
|
||||||
blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, seq_idx);
|
blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, j, seq_elt_idx);
|
||||||
|
seq1_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, i, seq_elt_idx);
|
||||||
|
seq2_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, j, seq_elt_idx);
|
||||||
|
|
||||||
if ((blob1 == NULL) || (blob2 == NULL))
|
if ((blob1 == NULL) || (blob2 == NULL))
|
||||||
{
|
{
|
||||||
@ -118,7 +278,7 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const c
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Check if the sequences are identical in a quick way (same index in the same indexer)
|
// Check if the sequences are identical in a quick way (same index in the same indexer)
|
||||||
if (obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, seq_idx) == obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, seq_idx))
|
if (obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, i, seq_elt_idx) == obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, j, seq_elt_idx))
|
||||||
{
|
{
|
||||||
if (similarity_mode && normalize)
|
if (similarity_mode && normalize)
|
||||||
score = 1.0;
|
score = 1.0;
|
||||||
@ -135,33 +295,93 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const c
|
|||||||
|
|
||||||
// Compute alignment score
|
// Compute alignment score
|
||||||
if ((threshold == 0) || (score == -1.0)) // no threshold, or filter passed: align
|
if ((threshold == 0) || (score == -1.0)) // no threshold, or filter passed: align
|
||||||
score = obiblob_sse_banded_lcs_align(blob1, blob2, threshold, normalize, reference, similarity_mode);
|
score = obiblob_sse_banded_lcs_align(blob1, blob2, threshold, normalize, reference, similarity_mode, &lcs_length, &ali_length);
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((score >= 0) && (((normalize || similarity_mode) && (score >= threshold)) || ((!similarity_mode && !normalize) && (score <= threshold))))
|
if ((score >= 0) && (((normalize || similarity_mode) && (score >= threshold)) || ((!similarity_mode && !normalize) && (score <= threshold))))
|
||||||
{ // Print result
|
{ // Print result // TODO make separate function maybe
|
||||||
|
|
||||||
// Get sequence ids
|
// Write line indices of the input view in the output view (to easily refer to the input sequences from the output view)
|
||||||
id1 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0); // TODO Could there be multiple IDs per line?
|
if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, idx1_column, k, 0, i) < 0)
|
||||||
id2 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, j, 0);
|
{
|
||||||
|
obidebug(1, "\nError writing idx1 in a column");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, idx2_column, k, 0, j) < 0)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError writing idx2 in a column");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
// Write sequence ids in output view
|
// Get ids idx
|
||||||
if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id1_column, k, 0, id1) < 0)
|
id1_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0); // TODO Could there be multiple IDs per line?
|
||||||
|
id2_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, id_column, j, 0);
|
||||||
|
|
||||||
|
// Write ids in output view
|
||||||
|
if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, id1_column, k, 0, id1_idx) < 0)
|
||||||
{
|
{
|
||||||
obidebug(1, "\nError writing id1 in a column");
|
obidebug(1, "\nError writing id1 in a column");
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, id2_column, k, 0, id2_idx) < 0)
|
||||||
if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id2_column, k, 0, id2) < 0)
|
|
||||||
{
|
{
|
||||||
obidebug(1, "\nError writing id2 in a column");
|
obidebug(1, "\nError writing id2 in a column");
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Write score in output view
|
// Write the sequences if needed
|
||||||
|
if (print_seq)
|
||||||
|
{
|
||||||
|
if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, seq1_column, k, 0, seq1_idx) < 0)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError writing seq1 in a column");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, seq2_column, k, 0, seq2_idx) < 0)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError writing seq2 in a column");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// // Write the counts if needed // TODO count columns not implemented yet
|
||||||
|
// if (print_count)
|
||||||
|
// {
|
||||||
|
// if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, count1_column, k, 0, count1) < 0)
|
||||||
|
// {
|
||||||
|
// obidebug(1, "\nError writing count1 in a column");
|
||||||
|
// return -1;
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, count2_column, k, 0, count2) < 0)
|
||||||
|
// {
|
||||||
|
// obidebug(1, "\nError writing count2 in a column");
|
||||||
|
// return -1;
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
|
||||||
|
// Write the alignment length if it was computed
|
||||||
|
if ((reference == ALILEN) && (normalize || !similarity_mode))
|
||||||
|
{
|
||||||
|
if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, ali_length_column, k, 0, ali_length) < 0)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError writing alignment length in a column");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write the LCS length
|
||||||
|
if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, lcs_length_column, k, 0, lcs_length) < 0)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError writing LCS length in a column");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write score
|
||||||
if (normalize)
|
if (normalize)
|
||||||
{
|
{
|
||||||
if (obi_set_float_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obifloat_t) score) < 0)
|
if (obi_set_float_with_elt_idx_and_col_p_in_view(output_view, score_column, k, 0, (obifloat_t) score) < 0)
|
||||||
{
|
{
|
||||||
obidebug(1, "\nError writing alignment score in a column");
|
obidebug(1, "\nError writing alignment score in a column");
|
||||||
return -1;
|
return -1;
|
||||||
@ -169,7 +389,7 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const c
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (obi_set_int_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obiint_t) score) < 0)
|
if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, score_column, k, 0, (obiint_t) score) < 0)
|
||||||
{
|
{
|
||||||
obidebug(1, "\nError writing alignment score in a column");
|
obidebug(1, "\nError writing alignment score in a column");
|
||||||
return -1;
|
return -1;
|
||||||
@ -181,6 +401,18 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const c
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Close views
|
||||||
|
if (obi_close_view(seq_view) < 0)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError closing the input view after aligning");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
if (obi_close_view(output_view) < 0)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError closing the output view after aligning");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
free_kmer_tables(ktable, seq_count);
|
free_kmer_tables(ktable, seq_count);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -24,15 +24,53 @@
|
|||||||
#include "obitypes.h"
|
#include "obitypes.h"
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Names and comments of columns automatically created in the output view when aligning.
|
||||||
|
*
|
||||||
|
* @since December 2016
|
||||||
|
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||||
|
*/
|
||||||
|
#define ID1_COLUMN_NAME "ID1"
|
||||||
|
#define ID1_COLUMN_COMMENTS "ID1"
|
||||||
|
#define ID2_COLUMN_NAME "ID2"
|
||||||
|
#define ID2_COLUMN_COMMENTS "ID2"
|
||||||
|
#define SEQ1_COLUMN_NAME "SEQ1"
|
||||||
|
#define SEQ1_COLUMN_COMMENTS "SEQ1"
|
||||||
|
#define SEQ2_COLUMN_NAME "SEQ2"
|
||||||
|
#define SEQ2_COLUMN_COMMENTS "SEQ2"
|
||||||
|
#define COUNT1_COLUMN_NAME "COUNT1"
|
||||||
|
#define COUNT1_COLUMN_COMMENTS "COUNT1"
|
||||||
|
#define COUNT2_COLUMN_NAME "COUNT2"
|
||||||
|
#define COUNT2_COLUMN_COMMENTS "COUNT2"
|
||||||
|
#define IDX1_COLUMN_NAME "IDX1"
|
||||||
|
#define IDX1_COLUMN_COMMENTS "IDX1"
|
||||||
|
#define IDX2_COLUMN_NAME "IDX2"
|
||||||
|
#define IDX2_COLUMN_COMMENTS "IDX2"
|
||||||
|
#define LCS_LENGTH_COLUMN_NAME "LCS_LENGTH"
|
||||||
|
#define LCS_LENGTH_COLUMN_COMMENTS "LCS_LENGTH"
|
||||||
|
#define ALI_LENGTH_COLUMN_NAME "ALI_LENGTH"
|
||||||
|
#define ALI_LENGTH_COLUMN_COMMENTS "ALI_LENGTH"
|
||||||
|
#define SCORE_COLUMN_NAME "SCORE"
|
||||||
|
#define SCORE_COLUMN_COMMENTS "SCORE"
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Aligns a NUC_SEQ column with itself.
|
* @brief Aligns a NUC_SEQ column with itself.
|
||||||
*
|
*
|
||||||
* @param seq_view A pointer on the view where the column to align is.
|
* Note: The columns where the results are written are automatically named and created.
|
||||||
* @param seq_column A pointer on the OBI_SEQ column to align.
|
*
|
||||||
* @param score_view A pointer on the view to write the outputs to.
|
* @param dms A pointer on an OBIDMS.
|
||||||
* @param id1_column A pointer on the OBI_STR column in score_view where the id of the first sequence should be written.
|
* @param seq_view_name The name of the view where the column to align is.
|
||||||
* @param id2_column A pointer on the OBI_STR column in score_view where the id of the second sequence should be written.
|
* @param seq_column_name The name of the OBI_SEQ column in the input view to align.
|
||||||
* @param score_column A pointer on the OBI_FLOAT column in score_view where the alignment score should be written.
|
* If "" (empty string), and the input view is of type NUC_SEQS_VIEW, the associated "NUC_SEQ" column is aligned.
|
||||||
|
* @param seq_elt_name The name of the element in the column corresponding to the sequence to align, if the column has multiple
|
||||||
|
* elements per line.
|
||||||
|
* @param id_column_name The name of the column in the input view containing the identifiers of the sequences to align.
|
||||||
|
* If "" (empty string), and the input view is of type NUC_SEQS_VIEW, the associated "ID" column is aligned.
|
||||||
|
* @param output_view_name The name of the output view where the results should be written (should not already exist).
|
||||||
|
* @param output_view_comments The comments that should be associated with the output view.
|
||||||
|
* @param print_seq A boolean indicating whether the aligned sequences should be copied in the output view.
|
||||||
|
* @param print_count A boolean indicating whether the aligned sequence counts should be copied in the output view.
|
||||||
* @param threshold Score threshold. If the score is normalized and expressed in similarity, it is an identity, e.g. 0.95
|
* @param threshold Score threshold. If the score is normalized and expressed in similarity, it is an identity, e.g. 0.95
|
||||||
* for an identity of 95%. If the score is normalized and expressed in distance, it is (1.0 - identity),
|
* for an identity of 95%. If the score is normalized and expressed in distance, it is (1.0 - identity),
|
||||||
* e.g. 0.05 for an identity of 95%. If the score is not normalized and expressed in similarity, it is
|
* e.g. 0.05 for an identity of 95%. If the score is not normalized and expressed in similarity, it is
|
||||||
@ -49,9 +87,12 @@
|
|||||||
* @since May 2016
|
* @since May 2016
|
||||||
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||||
*/
|
*/
|
||||||
int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const char* seq_name,
|
int obi_lcs_align_one_column(OBIDMS_p dms,
|
||||||
Obiview_p score_view, OBIDMS_column_p id1_column, OBIDMS_column_p id2_column, OBIDMS_column_p score_column,
|
const char* seq_view_name, const char* seq_column_name, const char* seq_elt_name,
|
||||||
double threshold, bool normalize, int reference, bool similarity_mode);
|
const char* id_column_name,
|
||||||
|
const char* output_view_name, const char* output_view_comments,
|
||||||
|
bool print_seq, bool print_count,
|
||||||
|
double threshold, bool normalize, int reference, bool similarity_mode);
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -62,7 +62,7 @@ static inline int extract_reg(__m128i r, int p)
|
|||||||
|
|
||||||
|
|
||||||
// TODO warning on length order
|
// TODO warning on length order
|
||||||
void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal, int16_t* address, double* lcs_length, int* ali_length)
|
void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal, int16_t* address, int* lcs_length, int* ali_length)
|
||||||
{
|
{
|
||||||
register int j;
|
register int j;
|
||||||
int k1, k2;
|
int k1, k2;
|
||||||
@ -288,13 +288,12 @@ void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int
|
|||||||
|
|
||||||
|
|
||||||
// TODO warning on length order
|
// TODO warning on length order
|
||||||
double sse_banded_align_just_lcs(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal)
|
void sse_banded_align_just_lcs(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal, int* lcs_length)
|
||||||
{
|
{
|
||||||
register int j;
|
register int j;
|
||||||
int k1, k2;
|
int k1, k2;
|
||||||
int diff;
|
int diff;
|
||||||
int l_reg, l_loc;
|
int l_reg, l_loc;
|
||||||
int16_t l_lcs;
|
|
||||||
int line;
|
int line;
|
||||||
int numberOfRegistersPerLine;
|
int numberOfRegistersPerLine;
|
||||||
int numberOfRegistersFor3Lines;
|
int numberOfRegistersFor3Lines;
|
||||||
@ -330,7 +329,6 @@ double sse_banded_align_just_lcs(int16_t* seq1, int16_t* seq2, int l1, int l2, i
|
|||||||
{
|
{
|
||||||
obi_set_errno(OBI_MALLOC_ERROR);
|
obi_set_errno(OBI_MALLOC_ERROR);
|
||||||
obidebug(1, "\nError allocating memory for SSE registers for LCS alignment");
|
obidebug(1, "\nError allocating memory for SSE registers for LCS alignment");
|
||||||
return 0; // TODO DOUBLE_MIN?
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// preparer registres SSE
|
// preparer registres SSE
|
||||||
@ -421,12 +419,10 @@ double sse_banded_align_just_lcs(int16_t* seq1, int16_t* seq2, int l1, int l2, i
|
|||||||
l_loc = l_loc - l_reg*8;
|
l_loc = l_loc - l_reg*8;
|
||||||
|
|
||||||
// extracting LCS from the registers :
|
// extracting LCS from the registers :
|
||||||
l_lcs = extract_reg((*(p_gap1+l_reg)).i, l_loc);
|
*lcs_length = extract_reg((*(p_gap1+l_reg)).i, l_loc);
|
||||||
|
|
||||||
// freeing the registers
|
// freeing the registers
|
||||||
free(SSEregisters);
|
free(SSEregisters);
|
||||||
|
|
||||||
return((double) l_lcs);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -561,11 +557,10 @@ void initializeAddressWithGaps(int16_t* address, int bandLengthTotal, int bandLe
|
|||||||
|
|
||||||
|
|
||||||
// TODO warning on length order
|
// TODO warning on length order
|
||||||
double sse_banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, bool normalize, int reference, bool similarity_mode, int16_t* address, int LCSmin)
|
double sse_banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, bool normalize, int reference, bool similarity_mode, int16_t* address, int LCSmin, int* lcs_length, int* ali_length)
|
||||||
{
|
{
|
||||||
double id;
|
double id;
|
||||||
int bandLengthRight, bandLengthLeft, bandLengthTotal;
|
int bandLengthRight, bandLengthLeft, bandLengthTotal;
|
||||||
int ali_length;
|
|
||||||
|
|
||||||
bandLengthLeft = calculateLeftBandLength(l1, LCSmin);
|
bandLengthLeft = calculateLeftBandLength(l1, LCSmin);
|
||||||
bandLengthRight = calculateRightBandLength(l2, LCSmin);
|
bandLengthRight = calculateRightBandLength(l2, LCSmin);
|
||||||
@ -579,10 +574,12 @@ double sse_banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, bool n
|
|||||||
if ((reference == ALILEN) && (normalize || !similarity_mode))
|
if ((reference == ALILEN) && (normalize || !similarity_mode))
|
||||||
{
|
{
|
||||||
initializeAddressWithGaps(address, bandLengthTotal, bandLengthLeft, l1);
|
initializeAddressWithGaps(address, bandLengthTotal, bandLengthLeft, l1);
|
||||||
sse_banded_align_lcs_and_ali_len(seq1, seq2, l1, l2, bandLengthLeft, bandLengthTotal, address, &id, &ali_length);
|
sse_banded_align_lcs_and_ali_len(seq1, seq2, l1, l2, bandLengthLeft, bandLengthTotal, address, lcs_length, ali_length);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
id = sse_banded_align_just_lcs(seq1, seq2, l1, l2, bandLengthLeft, bandLengthTotal);
|
sse_banded_align_just_lcs(seq1, seq2, l1, l2, bandLengthLeft, bandLengthTotal, lcs_length);
|
||||||
|
|
||||||
|
id = (double) *lcs_length;
|
||||||
|
|
||||||
// fprintf(stderr, "\nid before normalizations = %f", id);
|
// fprintf(stderr, "\nid before normalizations = %f", id);
|
||||||
|
|
||||||
@ -590,7 +587,7 @@ double sse_banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, bool n
|
|||||||
|
|
||||||
if (!similarity_mode && !normalize)
|
if (!similarity_mode && !normalize)
|
||||||
switch(reference) {
|
switch(reference) {
|
||||||
case ALILEN: id = ali_length - id;
|
case ALILEN: id = *ali_length - id;
|
||||||
break;
|
break;
|
||||||
case MAXLEN: id = l1 - id;
|
case MAXLEN: id = l1 - id;
|
||||||
break;
|
break;
|
||||||
@ -600,7 +597,7 @@ double sse_banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, bool n
|
|||||||
// fprintf(stderr, "\n2>>> %f, %d\n", id, ali_length);
|
// fprintf(stderr, "\n2>>> %f, %d\n", id, ali_length);
|
||||||
if (normalize)
|
if (normalize)
|
||||||
switch(reference) {
|
switch(reference) {
|
||||||
case ALILEN: id = id / (double) ali_length;
|
case ALILEN: id = id / (double) *ali_length;
|
||||||
break;
|
break;
|
||||||
case MAXLEN: id = id / (double) l1;
|
case MAXLEN: id = id / (double) l1;
|
||||||
break;
|
break;
|
||||||
@ -643,7 +640,7 @@ int calculateLCSmin(int l1, int l2, double threshold, bool normalize, int refere
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bool normalize, int reference, bool similarity_mode)
|
double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bool normalize, int reference, bool similarity_mode, int* lcs_length, int* ali_length)
|
||||||
{
|
{
|
||||||
double id;
|
double id;
|
||||||
int l1, l2;
|
int l1, l2;
|
||||||
@ -718,14 +715,14 @@ double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bo
|
|||||||
putSeqInSeq(iseq1, seq2, l2, TRUE);
|
putSeqInSeq(iseq1, seq2, l2, TRUE);
|
||||||
putSeqInSeq(iseq2, seq1, l1, FALSE);
|
putSeqInSeq(iseq2, seq1, l1, FALSE);
|
||||||
// Compute alignment
|
// Compute alignment
|
||||||
id = sse_banded_lcs_align(iseq1, iseq2, l2, l1, normalize, reference, similarity_mode, address, LCSmin);
|
id = sse_banded_lcs_align(iseq1, iseq2, l2, l1, normalize, reference, similarity_mode, address, LCSmin, lcs_length, ali_length);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
putSeqInSeq(iseq1, seq1, l1, TRUE);
|
putSeqInSeq(iseq1, seq1, l1, TRUE);
|
||||||
putSeqInSeq(iseq2, seq2, l2, FALSE);
|
putSeqInSeq(iseq2, seq2, l2, FALSE);
|
||||||
// Compute alignment
|
// Compute alignment
|
||||||
id = sse_banded_lcs_align(iseq1, iseq2, l1, l2, normalize, reference, similarity_mode, address, LCSmin);
|
id = sse_banded_lcs_align(iseq1, iseq2, l1, l2, normalize, reference, similarity_mode, address, LCSmin, lcs_length, ali_length);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Free allocated elements
|
// Free allocated elements
|
||||||
@ -738,7 +735,7 @@ double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bo
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
double obiblob_sse_banded_lcs_align(Obi_blob_p seq1, Obi_blob_p seq2, double threshold, bool normalize, int reference, bool similarity_mode)
|
double obiblob_sse_banded_lcs_align(Obi_blob_p seq1, Obi_blob_p seq2, double threshold, bool normalize, int reference, bool similarity_mode, int* lcs_length, int* ali_length)
|
||||||
{
|
{
|
||||||
double id;
|
double id;
|
||||||
int l1, l2;
|
int l1, l2;
|
||||||
@ -813,14 +810,14 @@ double obiblob_sse_banded_lcs_align(Obi_blob_p seq1, Obi_blob_p seq2, double thr
|
|||||||
putBlobInSeq(iseq1, seq2, l2, TRUE);
|
putBlobInSeq(iseq1, seq2, l2, TRUE);
|
||||||
putBlobInSeq(iseq2, seq1, l1, FALSE);
|
putBlobInSeq(iseq2, seq1, l1, FALSE);
|
||||||
// Compute alignment
|
// Compute alignment
|
||||||
id = sse_banded_lcs_align(iseq1, iseq2, l2, l1, normalize, reference, similarity_mode, address, LCSmin);
|
id = sse_banded_lcs_align(iseq1, iseq2, l2, l1, normalize, reference, similarity_mode, address, LCSmin, lcs_length, ali_length);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
putBlobInSeq(iseq1, seq1, l1, TRUE);
|
putBlobInSeq(iseq1, seq1, l1, TRUE);
|
||||||
putBlobInSeq(iseq2, seq2, l2, FALSE);
|
putBlobInSeq(iseq2, seq2, l2, FALSE);
|
||||||
// Compute alignment
|
// Compute alignment
|
||||||
id = sse_banded_lcs_align(iseq1, iseq2, l1, l2, normalize, reference, similarity_mode, address, LCSmin);
|
id = sse_banded_lcs_align(iseq1, iseq2, l1, l2, normalize, reference, similarity_mode, address, LCSmin, lcs_length, ali_length);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Free allocated elements
|
// Free allocated elements
|
||||||
|
@ -21,7 +21,7 @@
|
|||||||
|
|
||||||
// TODO doc
|
// TODO doc
|
||||||
int calculateLCSmin(int l1, int l2, double threshold, bool normalize, int reference, bool lcsmode);
|
int calculateLCSmin(int l1, int l2, double threshold, bool normalize, int reference, bool lcsmode);
|
||||||
double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bool normalize, int reference, bool similarity_mode);
|
double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bool normalize, int reference, bool similarity_mode, int* lcs_length, int* ali_length);
|
||||||
double obiblob_sse_banded_lcs_align(Obi_blob_p seq1, Obi_blob_p seq2, double threshold, bool normalize, int reference, bool similarity_mode);
|
double obiblob_sse_banded_lcs_align(Obi_blob_p seq1, Obi_blob_p seq2, double threshold, bool normalize, int reference, bool similarity_mode, int* lcs_length, int* ali_length);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
Reference in New Issue
Block a user