1st version of obi align command and reworked functions that handle

column alignment
This commit is contained in:
Celine Mercier
2016-08-10 14:51:02 +02:00
parent 26b8e1f215
commit 2aaa87edcc
8 changed files with 373 additions and 43 deletions

View File

@ -0,0 +1,128 @@
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.obidms._obidms import OBIDMS, OBIView # TODO cimport doesn't work
import time
__title__="Aligns one sequence column with itself or two sequence columns"
default_config = { 'inputview' : None,
'skip' : 0,
'only' : None,
'skiperror' : False,
'moltype' : 'nuc',
}
def addOptions(parser):
# TODO put this common group somewhere else but I don't know where
group=parser.add_argument_group('DMS and view options')
group.add_argument('--default-dms','-d',
action="store", dest="obi:defaultdms",
metavar='<DMS NAME>',
default=None,
type=str,
help="Name of the default DMS for reading and writing data.")
group.add_argument('--input-view','-i',
action="store", dest="obi:inputview",
metavar='<INPUT VIEW NAME>',
default=None,
type=str,
help="Name of the input view, either raw if the view is in the default DMS,"
" or in the form 'dms:view' if it is in another DMS.")
# TODO eventually 2nd view, or 2nd column?
group.add_argument('--output-view','-o',
action="store", dest="obi:outputview",
metavar='<OUTPUT VIEW NAME>',
default=None,
type=str,
help="Name of the output view, either raw if the view is in the default DMS,"
" or in the form 'dms:view' if it is in another DMS.")
group=parser.add_argument_group('obi align specific options')
group.add_argument('--lcs','-C',
action="store", dest="align:alitype",
metavar='<ALIGNMENT TYPE>',
default='lcs',
type=str,
help="Compute alignment using the LCS method.")
group.add_argument('--threshold','-t',
action="store", dest="align:threshold",
metavar='<THRESHOLD>',
default=0.0,
type=float,
help="Score threshold. If the score is normalized and expressed in similarity (default),"
" it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized"
" and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%."
" If the score is not normalized and expressed in similarity, it is the length of the"
" Longest Common Subsequence. If the score is not normalized and expressed in distance,"
" it is (reference length - LCS length)."
" Only sequence pairs with a similarity above <THRESHOLD> are printed. Default: 0.00"
" (no threshold).")
group.add_argument('--longest_length','-L',
action="store_const", dest="align:reflength",
default="ali",
const="longest",
help="The reference length is the length of the longest sequence."
" Default: the reference length is the length of the alignment.")
group.add_argument('--shortest_length','-l',
action="store_const", dest="align:reflength",
default="ali",
const="shortest",
help="The reference length is the length of the shortest sequence."
" Default: the reference length is the length of the alignment.")
group.add_argument('--raw','-r',
action="store_false", dest="align:normalize",
default=True,
help="Raw score, not normalized. Default: score is normalized with the reference sequence length.")
group.add_argument('--distance','-D',
action="store_false", dest="align:similarity",
default=True,
help="Score is expressed in distance. Default: score is expressed in similarity.")
def run(config):
#pb = ProgressBar(1, config, seconde=5) # TODO
# Open DMS
d = OBIDMS(config['obi']['defaultdms'])
# Open input view 1
iview = d.open_view(config['obi']['inputview'])
# TODO Open input view 2 if there is one
# Create output view if necessary
if config['obi']['outputview'] is not None :
oview = d.new_view(config['obi']['outputview'])
else :
oview = None
# TODO Take other alignment types into account when they'll be implemented
# Call cython alignment function
iview.align(output_view=oview)
print("Done.")

View File

@ -77,7 +77,15 @@ cdef class OBIView_NUC_SEQS(OBIView):
cdef OBIDMS_column qualities
cpdef delete_column(self, str column_name)
cpdef align(self,
OBIView iview2=*,
object output_view=*,
double threshold=*,
bint normalize=*,
int reference=*,
bint similarity_mode=*
)
cdef class OBIView_line :

View File

@ -10,7 +10,9 @@ from .capi.obidmscolumn cimport obi_close_column, \
OBIDMS_column_header_p
from .capi.obiutils cimport obi_format_date
from .capi.obialign cimport obi_align_one_column
from .capi.obitypes cimport const_char_p, \
OBIType_t, \
OBI_INT, \
@ -454,6 +456,7 @@ cdef class OBIView :
for line in self.__iter__() :
to_print = to_print + str(line) + "\n"
return to_print
#############################################
@ -524,7 +527,76 @@ cdef class OBIView_NUC_SEQS(OBIView):
def __setitem__(self, index_t line_idx, OBI_Nuc_Seq sequence_obj) :
for key in sequence_obj :
self[line_idx][key] = sequence_obj[key]
# TODO
cpdef align(self, OBIView iview2=None, object output_view=None,
double threshold=0.0, bint normalize=True, int reference=0, bint similarity_mode=True) :
cdef OBIView iview1
cdef OBIView oview
cdef Obiview_p iview1_p
cdef Obiview_p iview2_p
cdef Obiview_p oview_p
cdef OBIDMS_column icol1
cdef OBIDMS_column_p icol1_p
cdef OBIDMS_column_p* icol1_pp
cdef OBIDMS_column id1_col
cdef OBIDMS_column_p id1_col_p
cdef OBIDMS_column_p* id1_col_pp
cdef OBIDMS_column id2_col
cdef OBIDMS_column_p id2_col_p
cdef OBIDMS_column_p* id2_col_pp
cdef OBIDMS_column ocol
cdef OBIDMS_column_p ocol_p
cdef OBIDMS_column_p* ocol_pp
cdef str id1_col_name
cdef str id2_col_name
cdef str score_col_name
id1_col_name = "ID1" # TODO discuss names, aliases
id2_col_name = "ID2"
score_col_name = "score"
iview1= self
iview1_p = iview1.pointer
icol1 = iview1[bytes2str(NUC_SEQUENCE_COLUMN)]
icol1_pp = icol1.pointer
icol1_p = icol1_pp[0]
# Create the output view if needed
if output_view is None :
oview = self.dms.new_view("alignment_score_view") # TODO discuss
elif type(output_view) == str :
oview = self.dms.new_view(output_view)
else :
oview = output_view
oview.add_column(id1_col_name, type='OBI_STR', create=True)
oview.add_column(id2_col_name, type='OBI_STR', create=True)
oview.add_column(score_col_name, type='OBI_FLOAT', create=True)
oview_p = oview.pointer
ocol = oview[score_col_name]
ocol_pp = ocol.pointer
ocol_p = ocol_pp[0]
id1_col = oview[id1_col_name]
id2_col = oview[id2_col_name]
id1_col_pp = id1_col.pointer
id2_col_pp = id2_col.pointer
id1_col_p = id1_col_pp[0]
id2_col_p = id2_col_pp[0]
if obi_align_one_column(iview1_p, icol1_p, oview_p, id1_col_p, id2_col_p, ocol_p, threshold, normalize, reference, similarity_mode) < 0 :
raise Exception("Error aligning sequences")
#############################################

View File

@ -8,17 +8,6 @@ cdef class OBIDMS_column_seq(OBIDMS_column):
cpdef object get_line(self, index_t line_nb)
cpdef set_line(self, index_t line_nb, object value)
# TO DISCUSS :
# I'am not sure that this method has to be declared here
# Alignment must be declared outside of the sequence object
cpdef align(self,
OBIView score_view,
OBIDMS_column score_column,
double threshold = *,
bint normalize = *,
int reference = *,
bint similarity_mode = *)
cdef class OBIDMS_column_multi_elts_seq(OBIDMS_column_multi_elts):
cpdef object get_item(self, index_t line_nb, str element_name)

View File

@ -37,18 +37,6 @@ cdef class OBIDMS_column_seq(OBIDMS_column):
else :
if obi_set_seq_with_elt_idx_and_col_p_in_view(self.view.pointer, (self.pointer)[0], line_nb, 0, str2bytes(value)) < 0:
raise Exception("Problem setting a value in a column")
# TODO choose alignment type (lcs or other) with supplementary argument
cpdef align(self,
OBIView score_view,
OBIDMS_column score_column,
double threshold = 0.0,
bint normalize = True,
int reference = 0, # TODO
bint similarity_mode = True):
if (obi_align_one_column(self.view.pointer, (self.pointer)[0], score_view.pointer, (score_column.pointer)[0], threshold, normalize, reference, similarity_mode) < 0) :
raise Exception("An error occurred while aligning sequences")
cdef class OBIDMS_column_multi_elts_seq(OBIDMS_column_multi_elts):

View File

@ -6,5 +6,14 @@ from ..capi.obidmscolumn cimport OBIDMS_column_p
cdef extern from "obi_align.h" nogil:
int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, Obiview_p score_view, OBIDMS_column_p score_column, double threshold, bint normalize, int reference, bint similarity_mode)
int obi_align_one_column(Obiview_p seq_view,
OBIDMS_column_p seq_column,
Obiview_p score_view,
OBIDMS_column_p id1_column,
OBIDMS_column_p id2_column,
OBIDMS_column_p score_column,
double threshold,
bint normalize,
int reference,
bint similarity_mode)

View File

@ -25,20 +25,25 @@
// TODO
// use openMP pragmas : garder scores en memoire et ecrire a la fin? (normalement c bon avec index)
// tout ecrire en stdint?
// check NUC_SEQS and score type (int or float if normalize)
// what's with multiple sequence/line columns?
// use openMP pragmas
// option pour ecrire en stdint?
// check NUC_SEQS view type? and score type (int or float if normalize)
// what's with multiple sequences/line columns?
// make function that put blobs in int16
int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, Obiview_p score_view, OBIDMS_column_p score_column, double threshold, bool normalize, int reference, bool similarity_mode)
int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column,
Obiview_p score_view, OBIDMS_column_p id1_column, OBIDMS_column_p id2_column, OBIDMS_column_p score_column,
double threshold, bool normalize, int reference, bool similarity_mode)
{
index_t i, j, k;
index_t seq_count;
char* seq1;
char* seq2;
const char* id1;
const char* id2;
double score;
OBIDMS_column_p id_column;
k = 0;
@ -57,6 +62,9 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, Obiview
return -1;
}
// Get the ID column pointer
id_column = obi_view_get_column(seq_view, ID_COLUMN);
seq_count = (seq_column->header)->lines_used;
for (i=0; i < (seq_count - 1); i++)
@ -65,8 +73,8 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, Obiview
{
//fprintf(stderr, "\ni=%lld, j=%lld, k=%lld", i, j, k);
seq1 = obi_column_get_obiseq_with_elt_idx_in_view(seq_view, seq_column, i, 0);
seq2 = obi_column_get_obiseq_with_elt_idx_in_view(seq_view, seq_column, j, 0);
seq1 = obi_get_seq_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, 0);
seq2 = obi_get_seq_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, 0);
if ((seq1 == NULL) || (seq2 == NULL))
{
@ -74,12 +82,32 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, Obiview
return -1;
}
// TODO filter
// TODO kmer filter
// Compute alignment score
score = generic_sse_banded_lcs_align(seq1, seq2, threshold, normalize, reference, similarity_mode);
// Get sequence ids
id1 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0);
id2 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, j, 0);
// Write sequence ids in output view
if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id1_column, k, 0, id1) < 0)
{
obidebug(1, "\nError writing id1 in a column");
return -1;
}
if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id2_column, k, 0, id2) < 0)
{
obidebug(1, "\nError writing id2 in a column");
return -1;
}
// Write score in output view
if (normalize)
{
if (obi_column_set_obifloat_with_elt_idx_in_view(score_view, score_column, k, 0, (obifloat_t) score) < 0)
if (obi_set_float_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obifloat_t) score) < 0)
{
obidebug(1, "\nError writing alignment score in a column");
return -1;
@ -87,13 +115,16 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, Obiview
}
else
{
if (obi_column_set_obiint_with_elt_idx_in_view(score_view, score_column, k, 0, (obiint_t) score) < 0)
if (obi_set_int_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obiint_t) score) < 0)
{
obidebug(1, "\nError writing alignment score in a column");
return -1;
}
}
free(seq1);
free(seq2);
k++;
}
}
@ -102,8 +133,82 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, Obiview
}
int obi_align_two_columns(OBIDMS_column_p seq_column_1, OBIDMS_column_p seq_column_2, OBIDMS_column_p score_column, double threshold, bool normalize, int reference, bool similarity_mode)
{
// TODO
return 0;
}
// TODO discuss if 2 input views or 2 columns or both possible
//int obi_align_two_columns(Obiview_p seq_view, OBIDMS_column_p seq_column_1, OBIDMS_column_p seq_column_2, // TODO it's implied both seq columns are in the same view but maybe it shouldn't
// Obiview_p score_view, OBIDMS_column_p score_column,
// double threshold, bool normalize, int reference, bool similarity_mode)
//{
// index_t i, j, k;
// index_t seq_count_1;
// index_t seq_count_2;
// char* seq1;
// char* seq2;
// double score;
//
// k = 0;
//
// if (((seq_column_1->header)->returned_data_type != OBI_SEQ) || ((seq_column_2->header)->returned_data_type != OBI_SEQ))
// {
// obi_set_errno(OBI_ALIGN_ERROR);
// obidebug(1, "\nTrying to align a column of a different type than OBI_SEQ");
// return -1;
// }
//
// if ((normalize && ((score_column->header)->returned_data_type != OBI_FLOAT)) ||
// (!normalize && ((score_column->header)->returned_data_type != OBI_INT)))
// {
// obi_set_errno(OBI_ALIGN_ERROR);
// obidebug(1, "\nTrying to store alignment scores in a column of an inappropriate type");
// return -1;
// }
//
// seq_count_1 = (seq_column_1->header)->lines_used;
// seq_count_2 = (seq_column_2->header)->lines_used;
//
// for (i=0; i < (seq_count_1 - 1); i++)
// {
// for (j=0; j < seq_count_2; j++)
// {
// //fprintf(stderr, "\ni=%lld, j=%lld, k=%lld", i, j, k);
//
// seq1 = obi_get_seq_with_elt_idx_and_col_p_in_view(seq_view, seq_column_1, i, 0);
// seq2 = obi_get_seq_with_elt_idx_and_col_p_in_view(seq_view, seq_column_2, j, 0);
//
// if ((seq1 == NULL) || (seq2 == NULL))
// {
// obidebug(1, "\nError retrieving sequences to align");
// return -1;
// }
//
// // TODO kmer filter
//
// score = generic_sse_banded_lcs_align(seq1, seq2, threshold, normalize, reference, similarity_mode);
//
// if (normalize)
// {
// if (obi_set_float_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obifloat_t) score) < 0)
// {
// obidebug(1, "\nError writing alignment score in a column");
// return -1;
// }
// }
// else
// {
// if (obi_set_int_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obiint_t) score) < 0)
// {
// obidebug(1, "\nError writing alignment score in a column");
// return -1;
// }
// }
//
// free(seq1);
// free(seq2);
//
// k++;
// }
// }
//
// return 0;
//}

View File

@ -23,6 +23,35 @@
#include "obitypes.h"
/**
* @brief Aligns a NUC_SEQ column with itself.
*
* @param seq_view A pointer on the view where the column to align is.
* @param seq_column A pointer on the OBI_SEQ column to align.
* @param score_view A pointer on the view to write the outputs to.
* @param id1_column A pointer on the OBI_STR column in score_view where the id of the first sequence should be written.
* @param id2_column A pointer on the OBI_STR column in score_view where the id of the second sequence should be written.
* @param score_column A pointer on the OBI_FLOAT column in score_view where the alignment score should be written.
* @param threshold Score threshold. If the score is normalized and expressed in similarity, it is an identity, e.g. 0.95
* for an identity of 95%. If the score is normalized and expressed in distance, it is (1.0 - identity),
* e.g. 0.05 for an identity of 95%. If the score is not normalized and expressed in similarity, it is
* the length of the Longest Common Subsequence. If the score is not normalized and expressed in distance,
* it is (reference length - LCS length). Only sequence pairs with a similarity above the threshold are printed.
* @param normalize Whether the score should be normalized with the reference sequence length.
* @param reference The reference length. 0: The alignement length; 1: The longest sequence's length; 2: The shortest sequence's length.
* @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false).
*
* @returns A value indicating the success of the operation.
* @retval 0 if the operation was successfully completed.
* @retval -1 if an error occurred.
*
* @since May 2016
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column,
Obiview_p score_view, OBIDMS_column_p id1_column, OBIDMS_column_p id2_column, OBIDMS_column_p score_column,
double threshold, bool normalize, int reference, bool similarity_mode);
/**
* @brief
@ -30,7 +59,9 @@
* TODO
*
*/
int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, Obiview_p score_view, OBIDMS_column_p score_column, double threshold, bool normalize, int reference, bool similarity_mode);
//int obi_align_two_columns(Obiview_p seq_view, OBIDMS_column_p seq_column_1, OBIDMS_column_p seq_column_2,
// Obiview_p score_view, OBIDMS_column_p score_column,
// double threshold, bool normalize, int reference, bool similarity_mode);
#endif /* OBI_ALIGN_H_ */