Alignment: API rework. 'obi align' is now 'obi lcs', and the results are
now written to columns automatically created in the output view, all optimally handled at the C level.
This commit is contained in:
334
src/obi_align.c
334
src/obi_align.c
@ -14,6 +14,7 @@
|
||||
#include <stdio.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "obi_align.h"
|
||||
#include "obidebug.h"
|
||||
#include "obierrno.h"
|
||||
#include "obitypes.h"
|
||||
@ -28,67 +29,227 @@
|
||||
|
||||
// TODO
|
||||
// use openMP pragmas
|
||||
// option to write as stdint?
|
||||
// check NUC_SEQS view type? and score type (int or float if normalize)
|
||||
// what's with multiple sequences/line columns?
|
||||
|
||||
|
||||
int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const char* seq_name,
|
||||
Obiview_p score_view, OBIDMS_column_p id1_column, OBIDMS_column_p id2_column, OBIDMS_column_p score_column,
|
||||
double threshold, bool normalize, int reference, bool similarity_mode)
|
||||
int obi_lcs_align_one_column(OBIDMS_p dms, const char* seq_view_name, const char* seq_column_name, const char* seq_elt_name,
|
||||
const char* id_column_name,
|
||||
const char* output_view_name, const char* output_view_comments,
|
||||
bool print_seq, bool print_count,
|
||||
double threshold, bool normalize, int reference, bool similarity_mode)
|
||||
{
|
||||
index_t i, j, k;
|
||||
index_t seq_count;
|
||||
const char* id1;
|
||||
const char* id2;
|
||||
index_t id1_idx, id2_idx;
|
||||
index_t seq1_idx, seq2_idx;
|
||||
double score;
|
||||
OBIDMS_column_p id_column;
|
||||
int lcs_length;
|
||||
int ali_length;
|
||||
Kmer_table_p ktable;
|
||||
Obi_blob_p blob1;
|
||||
Obi_blob_p blob2;
|
||||
int lcs_min;
|
||||
index_t seq_idx;
|
||||
index_t seq_elt_idx;
|
||||
|
||||
Obiview_p seq_view = NULL;
|
||||
Obiview_p output_view = NULL;
|
||||
OBIDMS_column_p iseq_column = NULL;
|
||||
OBIDMS_column_p id_column;
|
||||
OBIDMS_column_p id1_column = NULL;
|
||||
OBIDMS_column_p id2_column = NULL;
|
||||
OBIDMS_column_p seq1_column = NULL;
|
||||
OBIDMS_column_p seq2_column = NULL;
|
||||
//OBIDMS_column_p count1_column = NULL;
|
||||
//OBIDMS_column_p count2_column = NULL;
|
||||
OBIDMS_column_p idx1_column = NULL;
|
||||
OBIDMS_column_p idx2_column = NULL;
|
||||
OBIDMS_column_p lcs_length_column = NULL;
|
||||
OBIDMS_column_p ali_length_column = NULL;
|
||||
OBIDMS_column_p score_column = NULL;
|
||||
|
||||
k = 0;
|
||||
|
||||
// If no sequence column is given and the view has the type NUC_SEQS_VIEW, the default sequence column is aligned
|
||||
if ((seq_column == NULL) && (strcmp((seq_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0))
|
||||
// Open input view
|
||||
seq_view = obi_open_view(dms, seq_view_name);
|
||||
if (seq_view == NULL)
|
||||
{
|
||||
seq_column = obi_view_get_column(seq_view, NUC_SEQUENCE_COLUMN);
|
||||
if (seq_column == NULL)
|
||||
obidebug(1, "\nError opening the input view to align");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Open the sequence column to align
|
||||
// If a column name wasn't given, open default sequence column
|
||||
if (strcmp(seq_column_name, "") == 0)
|
||||
{
|
||||
if (strcmp((seq_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0)
|
||||
iseq_column = obi_view_get_column(seq_view, NUC_SEQUENCE_COLUMN);
|
||||
else
|
||||
{
|
||||
obi_set_errno(OBI_ALIGN_ERROR);
|
||||
obidebug(1, "\nError: no column given to align");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
// Check that the given sequence column contains nucleotide sequences
|
||||
else if ((seq_column->header)->returned_data_type != OBI_SEQ)
|
||||
else
|
||||
iseq_column = obi_view_get_column(seq_view, seq_column_name);
|
||||
if (iseq_column == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_ALIGN_ERROR);
|
||||
obidebug(1, "\nTrying to align a column of a different type than OBI_SEQ");
|
||||
obidebug(1, "\nError getting the column to align");
|
||||
return -1;
|
||||
}
|
||||
|
||||
if ((normalize && ((score_column->header)->returned_data_type != OBI_FLOAT)) ||
|
||||
(!normalize && ((score_column->header)->returned_data_type != OBI_INT)))
|
||||
// Get element index of the sequence to align in each line to compute it only once
|
||||
if ((strcmp(seq_elt_name, "") != 0) && (seq_elt_name != NULL))
|
||||
{
|
||||
obi_set_errno(OBI_ALIGN_ERROR);
|
||||
obidebug(1, "\nTrying to store alignment scores in a column of an inappropriate type");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Get element index from element name to compute it only once
|
||||
if (seq_name != NULL)
|
||||
{
|
||||
seq_idx = obi_column_get_element_index_from_name(seq_column, seq_name);
|
||||
if (seq_idx == OBIIdx_NA)
|
||||
seq_elt_idx = obi_column_get_element_index_from_name(iseq_column, seq_elt_name);
|
||||
if (seq_elt_idx == OBIIdx_NA)
|
||||
{
|
||||
obidebug(1, "\nError getting the sequence index in a column line when aligning");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
else
|
||||
seq_idx = 0;
|
||||
seq_elt_idx = 0;
|
||||
|
||||
// Open the ID column, containing the identifiers of the sequences to align
|
||||
// If a column name wasn't given, open default ID column
|
||||
if (strcmp(id_column_name, "") == 0)
|
||||
{
|
||||
if (strcmp((seq_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0)
|
||||
id_column = obi_view_get_column(seq_view, ID_COLUMN);
|
||||
else
|
||||
{
|
||||
obi_set_errno(OBI_ALIGN_ERROR);
|
||||
obidebug(1, "\nError: no ID column given");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
else
|
||||
id_column = obi_view_get_column(seq_view, id_column_name);
|
||||
if (id_column == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting the ID column");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Create the output view
|
||||
output_view = obi_new_view(dms, output_view_name, NULL, NULL, output_view_comments);
|
||||
if (output_view == NULL)
|
||||
{
|
||||
obidebug(1, "\nError creating the output view when aligning");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Create the output columns
|
||||
|
||||
// Create the column for the ids of the 1st sequence aligned
|
||||
if (obi_view_add_column(output_view, ID1_COLUMN_NAME, -1, ID1_COLUMN_NAME, OBI_STR, 0, 1, NULL, (id_column->header)->indexer_name, NULL, -1, ID1_COLUMN_COMMENTS, true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the first column for the sequence ids when aligning");
|
||||
return -1;
|
||||
}
|
||||
id1_column = obi_view_get_column(output_view, ID1_COLUMN_NAME);
|
||||
|
||||
// Create the column for the ids of the 2nd sequence aligned
|
||||
if (obi_view_add_column(output_view, ID2_COLUMN_NAME, -1, ID2_COLUMN_NAME, OBI_STR, 0, 1, NULL, (id_column->header)->indexer_name, NULL, -1, ID2_COLUMN_COMMENTS, true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the second column for the sequence ids when aligning");
|
||||
return -1;
|
||||
}
|
||||
id2_column = obi_view_get_column(output_view, ID2_COLUMN_NAME);
|
||||
|
||||
// Create the column for the index (in the input view) of the first sequences aligned
|
||||
if (obi_view_add_column(output_view, IDX1_COLUMN_NAME, -1, IDX1_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, IDX1_COLUMN_COMMENTS, true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the first column for the sequence indices when aligning");
|
||||
return -1;
|
||||
}
|
||||
idx1_column = obi_view_get_column(output_view, IDX1_COLUMN_NAME);
|
||||
|
||||
// Create the column for the index (in the input view) of the second sequences aligned
|
||||
if (obi_view_add_column(output_view, IDX2_COLUMN_NAME, -1, IDX2_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, IDX2_COLUMN_COMMENTS, true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the second column for the sequence indices when aligning");
|
||||
return -1;
|
||||
}
|
||||
idx2_column = obi_view_get_column(output_view, IDX2_COLUMN_NAME);
|
||||
|
||||
// Create the column for the LCS length
|
||||
if (obi_view_add_column(output_view, LCS_LENGTH_COLUMN_NAME, -1, LCS_LENGTH_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, LCS_LENGTH_COLUMN_COMMENTS, true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the column for the LCS length when aligning");
|
||||
return -1;
|
||||
}
|
||||
lcs_length_column = obi_view_get_column(output_view, LCS_LENGTH_COLUMN_NAME);
|
||||
|
||||
// Create the column for the alignment length if it is computed
|
||||
if ((reference == ALILEN) && (normalize || !similarity_mode))
|
||||
{
|
||||
if (obi_view_add_column(output_view, ALI_LENGTH_COLUMN_NAME, -1, ALI_LENGTH_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, ALI_LENGTH_COLUMN_COMMENTS, true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the column for the alignment length when aligning");
|
||||
return -1;
|
||||
}
|
||||
ali_length_column = obi_view_get_column(output_view, ALI_LENGTH_COLUMN_NAME);
|
||||
}
|
||||
// Create the column for the alignment score
|
||||
if (normalize)
|
||||
{
|
||||
if (obi_view_add_column(output_view, SCORE_COLUMN_NAME, -1, SCORE_COLUMN_NAME, OBI_FLOAT, 0, 1, NULL, NULL, NULL, -1, SCORE_COLUMN_NAME, true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the column for the score when aligning");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (obi_view_add_column(output_view, SCORE_COLUMN_NAME, -1, SCORE_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, SCORE_COLUMN_NAME, true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the column for the score when aligning");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
score_column = obi_view_get_column(output_view, SCORE_COLUMN_NAME);
|
||||
|
||||
if (print_seq)
|
||||
{
|
||||
// Create the column for the first sequences aligned
|
||||
if (obi_view_add_column(output_view, SEQ1_COLUMN_NAME, -1, SEQ1_COLUMN_NAME, OBI_SEQ, 0, 1, NULL, (iseq_column->header)->indexer_name, NULL, -1, SEQ1_COLUMN_COMMENTS, true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the first column for the sequences when aligning");
|
||||
return -1;
|
||||
}
|
||||
seq1_column = obi_view_get_column(output_view, SEQ1_COLUMN_NAME);
|
||||
|
||||
// Create the column for the second sequences aligned
|
||||
if (obi_view_add_column(output_view, SEQ2_COLUMN_NAME, -1, SEQ2_COLUMN_NAME, OBI_SEQ, 0, 1, NULL, (iseq_column->header)->indexer_name, NULL, -1, SEQ2_COLUMN_COMMENTS, true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the second column for the sequences when aligning");
|
||||
return -1;
|
||||
}
|
||||
seq2_column = obi_view_get_column(output_view, SEQ2_COLUMN_NAME);
|
||||
}
|
||||
// if (print_count) // TODO count columns not implemented yet
|
||||
// {
|
||||
// // Create the column for the count of the first sequences aligned
|
||||
// if (obi_view_add_column(output_view, COUNT1_COLUMN_NAME, -1, COUNT1_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, COUNT1_COLUMN_COMMENTS, true) < 0)
|
||||
// {
|
||||
// obidebug(1, "\nError creating the first column for the sequence counts when aligning");
|
||||
// return -1;
|
||||
// }
|
||||
// count1_column = obi_view_get_column(seq_view, COUNT1_COLUMN_NAME);
|
||||
//
|
||||
// // Create the column for the count of the second sequences aligned
|
||||
// if (obi_view_add_column(output_view, COUNT2_COLUMN_NAME, -1, COUNT2_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, COUNT2_COLUMN_COMMENTS, true) < 0)
|
||||
// {
|
||||
// obidebug(1, "\nError creating the second column for the sequence counts when aligning");
|
||||
// return -1;
|
||||
// }
|
||||
// count2_column = obi_view_get_column(seq_view, COUNT2_COLUMN_NAME);
|
||||
// }
|
||||
|
||||
|
||||
// Build kmer tables
|
||||
ktable = hash_seq_column(seq_view, seq_column, seq_idx);
|
||||
ktable = hash_seq_column(seq_view, iseq_column, seq_elt_idx);
|
||||
if (ktable == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_ALIGN_ERROR);
|
||||
@ -96,10 +257,7 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const c
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Get the ID column pointer
|
||||
id_column = obi_view_get_column(seq_view, ID_COLUMN);
|
||||
|
||||
seq_count = (seq_column->header)->lines_used;
|
||||
seq_count = (iseq_column->header)->lines_used;
|
||||
|
||||
for (i=0; i < (seq_count - 1); i++)
|
||||
{
|
||||
@ -108,8 +266,10 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const c
|
||||
|
||||
for (j=i+1; j < seq_count; j++)
|
||||
{
|
||||
blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, seq_idx);
|
||||
blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, seq_idx);
|
||||
blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, i, seq_elt_idx);
|
||||
blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, j, seq_elt_idx);
|
||||
seq1_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, i, seq_elt_idx);
|
||||
seq2_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, j, seq_elt_idx);
|
||||
|
||||
if ((blob1 == NULL) || (blob2 == NULL))
|
||||
{
|
||||
@ -118,7 +278,7 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const c
|
||||
}
|
||||
|
||||
// Check if the sequences are identical in a quick way (same index in the same indexer)
|
||||
if (obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, seq_idx) == obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, seq_idx))
|
||||
if (obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, i, seq_elt_idx) == obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, j, seq_elt_idx))
|
||||
{
|
||||
if (similarity_mode && normalize)
|
||||
score = 1.0;
|
||||
@ -135,33 +295,93 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const c
|
||||
|
||||
// Compute alignment score
|
||||
if ((threshold == 0) || (score == -1.0)) // no threshold, or filter passed: align
|
||||
score = obiblob_sse_banded_lcs_align(blob1, blob2, threshold, normalize, reference, similarity_mode);
|
||||
score = obiblob_sse_banded_lcs_align(blob1, blob2, threshold, normalize, reference, similarity_mode, &lcs_length, &ali_length);
|
||||
}
|
||||
|
||||
if ((score >= 0) && (((normalize || similarity_mode) && (score >= threshold)) || ((!similarity_mode && !normalize) && (score <= threshold))))
|
||||
{ // Print result
|
||||
{ // Print result // TODO make separate function maybe
|
||||
|
||||
// Get sequence ids
|
||||
id1 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0); // TODO Could there be multiple IDs per line?
|
||||
id2 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, j, 0);
|
||||
// Write line indices of the input view in the output view (to easily refer to the input sequences from the output view)
|
||||
if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, idx1_column, k, 0, i) < 0)
|
||||
{
|
||||
obidebug(1, "\nError writing idx1 in a column");
|
||||
return -1;
|
||||
}
|
||||
if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, idx2_column, k, 0, j) < 0)
|
||||
{
|
||||
obidebug(1, "\nError writing idx2 in a column");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Write sequence ids in output view
|
||||
if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id1_column, k, 0, id1) < 0)
|
||||
// Get ids idx
|
||||
id1_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0); // TODO Could there be multiple IDs per line?
|
||||
id2_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, id_column, j, 0);
|
||||
|
||||
// Write ids in output view
|
||||
if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, id1_column, k, 0, id1_idx) < 0)
|
||||
{
|
||||
obidebug(1, "\nError writing id1 in a column");
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id2_column, k, 0, id2) < 0)
|
||||
if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, id2_column, k, 0, id2_idx) < 0)
|
||||
{
|
||||
obidebug(1, "\nError writing id2 in a column");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Write score in output view
|
||||
// Write the sequences if needed
|
||||
if (print_seq)
|
||||
{
|
||||
if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, seq1_column, k, 0, seq1_idx) < 0)
|
||||
{
|
||||
obidebug(1, "\nError writing seq1 in a column");
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, seq2_column, k, 0, seq2_idx) < 0)
|
||||
{
|
||||
obidebug(1, "\nError writing seq2 in a column");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// // Write the counts if needed // TODO count columns not implemented yet
|
||||
// if (print_count)
|
||||
// {
|
||||
// if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, count1_column, k, 0, count1) < 0)
|
||||
// {
|
||||
// obidebug(1, "\nError writing count1 in a column");
|
||||
// return -1;
|
||||
// }
|
||||
//
|
||||
// if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, count2_column, k, 0, count2) < 0)
|
||||
// {
|
||||
// obidebug(1, "\nError writing count2 in a column");
|
||||
// return -1;
|
||||
// }
|
||||
// }
|
||||
|
||||
// Write the alignment length if it was computed
|
||||
if ((reference == ALILEN) && (normalize || !similarity_mode))
|
||||
{
|
||||
if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, ali_length_column, k, 0, ali_length) < 0)
|
||||
{
|
||||
obidebug(1, "\nError writing alignment length in a column");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Write the LCS length
|
||||
if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, lcs_length_column, k, 0, lcs_length) < 0)
|
||||
{
|
||||
obidebug(1, "\nError writing LCS length in a column");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Write score
|
||||
if (normalize)
|
||||
{
|
||||
if (obi_set_float_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obifloat_t) score) < 0)
|
||||
if (obi_set_float_with_elt_idx_and_col_p_in_view(output_view, score_column, k, 0, (obifloat_t) score) < 0)
|
||||
{
|
||||
obidebug(1, "\nError writing alignment score in a column");
|
||||
return -1;
|
||||
@ -169,7 +389,7 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const c
|
||||
}
|
||||
else
|
||||
{
|
||||
if (obi_set_int_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obiint_t) score) < 0)
|
||||
if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, score_column, k, 0, (obiint_t) score) < 0)
|
||||
{
|
||||
obidebug(1, "\nError writing alignment score in a column");
|
||||
return -1;
|
||||
@ -181,6 +401,18 @@ int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const c
|
||||
}
|
||||
}
|
||||
|
||||
// Close views
|
||||
if (obi_close_view(seq_view) < 0)
|
||||
{
|
||||
obidebug(1, "\nError closing the input view after aligning");
|
||||
return -1;
|
||||
}
|
||||
if (obi_close_view(output_view) < 0)
|
||||
{
|
||||
obidebug(1, "\nError closing the output view after aligning");
|
||||
return -1;
|
||||
}
|
||||
|
||||
free_kmer_tables(ktable, seq_count);
|
||||
|
||||
return 0;
|
||||
|
Reference in New Issue
Block a user