Files
obitools3/src/obi_align.c

269 lines
7.9 KiB
C
Raw Normal View History

/****************************************************************************
* Sequence alignment functions *
****************************************************************************/
/**
* @file obi_align.c
* @author Celine Mercier
* @date May 4th 2016
* @brief Functions handling sequence alignments.
*/
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include "obidebug.h"
#include "obierrno.h"
#include "obitypes.h"
#include "obiview.h"
#include "sse_banded_LCS_alignment.h"
#include "upperband.h"
#include "obiblob.h"
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
// TODO
// use openMP pragmas
// option pour ecrire en stdint?
// check NUC_SEQS view type? and score type (int or float if normalize)
// what's with multiple sequences/line columns?
int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const char* seq_name,
Obiview_p score_view, OBIDMS_column_p id1_column, OBIDMS_column_p id2_column, OBIDMS_column_p score_column,
double threshold, bool normalize, int reference, bool similarity_mode)
{
index_t i, j, k;
index_t seq_count;
const char* id1;
const char* id2;
double score;
OBIDMS_column_p id_column;
Kmer_table_p ktable;
Obi_blob_p blob1;
Obi_blob_p blob2;
int lcs_min;
index_t seq_idx;
k = 0;
// If no sequence column is given and the view has the type NUC_SEQS_VIEW, the default sequence column is aligned
if ((seq_column == NULL) && (strcmp((seq_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0))
{
seq_column = obi_view_get_column(seq_view, NUC_SEQUENCE_COLUMN);
if (seq_column == NULL)
return -1;
}
// Check that the given sequence column contains nucleotide sequences
else if ((seq_column->header)->returned_data_type != OBI_SEQ)
{
obi_set_errno(OBI_ALIGN_ERROR);
obidebug(1, "\nTrying to align a column of a different type than OBI_SEQ");
return -1;
}
if ((normalize && ((score_column->header)->returned_data_type != OBI_FLOAT)) ||
(!normalize && ((score_column->header)->returned_data_type != OBI_INT)))
{
obi_set_errno(OBI_ALIGN_ERROR);
obidebug(1, "\nTrying to store alignment scores in a column of an inappropriate type");
return -1;
}
// Get element index from element name to compute it only once
if (seq_name != NULL)
{
seq_idx = obi_column_get_element_index_from_name(seq_column, seq_name);
if (seq_idx == OBIIdx_NA)
{
obidebug(1, "\nError getting the sequence index in a column line when aligning");
return -1;
}
}
else
seq_idx = 0;
// Build kmer tables
ktable = hash_seq_column(seq_view, seq_column, seq_idx);
if (ktable == NULL)
{
obi_set_errno(OBI_ALIGN_ERROR);
obidebug(1, "\nError building kmer tables before aligning");
return -1;
}
// Get the ID column pointer
id_column = obi_view_get_column(seq_view, ID_COLUMN);
seq_count = (seq_column->header)->lines_used;
for (i=0; i < (seq_count - 1); i++)
{
if (i%100 == 0)
fprintf(stderr,"\rDone : %f %% ", (i / (float) seq_count)*100);
for (j=i+1; j < seq_count; j++)
{
blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, seq_idx);
blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, seq_idx);
if ((blob1 == NULL) || (blob2 == NULL))
{
obidebug(1, "\nError retrieving sequences to align");
return -1;
}
// Check if the sequences are identical in a quick way (same index in the same indexer)
if (obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, seq_idx) == obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, seq_idx))
{
if (similarity_mode && normalize)
score = 1.0;
else if (!similarity_mode)
score = 0.0;
else
score = blob1->length_decoded_value;
}
else // the sequences aren't identical
{
// kmer filter
align_filters(ktable, blob1, blob2, i, j, threshold, normalize, reference, similarity_mode, &score, &lcs_min, false);
// Compute alignment score
if ((threshold == 0) || (score == -1.0)) // no threshold, or filter passed: align
score = obiblob_sse_banded_lcs_align(blob1, blob2, threshold, normalize, reference, similarity_mode);
}
if ((score >= 0) && (((normalize || similarity_mode) && (score >= threshold)) || ((!similarity_mode && !normalize) && (score <= threshold))))
{ // Print result
// Get sequence ids
id1 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0); // TODO Could there be multiple IDs per line?
id2 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, j, 0);
// Write sequence ids in output view
if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id1_column, k, 0, id1) < 0)
{
obidebug(1, "\nError writing id1 in a column");
return -1;
}
if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id2_column, k, 0, id2) < 0)
{
obidebug(1, "\nError writing id2 in a column");
return -1;
}
// Write score in output view
if (normalize)
{
if (obi_set_float_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obifloat_t) score) < 0)
{
obidebug(1, "\nError writing alignment score in a column");
return -1;
}
}
else
{
if (obi_set_int_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obiint_t) score) < 0)
{
obidebug(1, "\nError writing alignment score in a column");
return -1;
}
}
k++;
}
}
}
free_kmer_tables(ktable, seq_count);
return 0;
}
// TODO discuss if 2 input views or 2 columns or both possible
//int obi_align_two_columns(Obiview_p seq_view, OBIDMS_column_p seq_column_1, OBIDMS_column_p seq_column_2, // TODO it's implied both seq columns are in the same view but maybe it shouldn't
// Obiview_p score_view, OBIDMS_column_p score_column,
// double threshold, bool normalize, int reference, bool similarity_mode)
//{
// index_t i, j, k;
// index_t seq_count_1;
// index_t seq_count_2;
// char* seq1;
// char* seq2;
// double score;
//
// k = 0;
//
// if (((seq_column_1->header)->returned_data_type != OBI_SEQ) || ((seq_column_2->header)->returned_data_type != OBI_SEQ))
// {
// obi_set_errno(OBI_ALIGN_ERROR);
// obidebug(1, "\nTrying to align a column of a different type than OBI_SEQ");
// return -1;
// }
//
// if ((normalize && ((score_column->header)->returned_data_type != OBI_FLOAT)) ||
// (!normalize && ((score_column->header)->returned_data_type != OBI_INT)))
// {
// obi_set_errno(OBI_ALIGN_ERROR);
// obidebug(1, "\nTrying to store alignment scores in a column of an inappropriate type");
// return -1;
// }
//
// seq_count_1 = (seq_column_1->header)->lines_used;
// seq_count_2 = (seq_column_2->header)->lines_used;
//
// for (i=0; i < (seq_count_1 - 1); i++)
// {
// for (j=0; j < seq_count_2; j++)
// {
// //fprintf(stderr, "\ni=%lld, j=%lld, k=%lld", i, j, k);
//
// seq1 = obi_get_seq_with_elt_idx_and_col_p_in_view(seq_view, seq_column_1, i, 0);
// seq2 = obi_get_seq_with_elt_idx_and_col_p_in_view(seq_view, seq_column_2, j, 0);
//
// if ((seq1 == NULL) || (seq2 == NULL))
// {
// obidebug(1, "\nError retrieving sequences to align");
// return -1;
// }
//
// // TODO kmer filter
//
// score = generic_sse_banded_lcs_align(seq1, seq2, threshold, normalize, reference, similarity_mode);
//
// if (normalize)
// {
// if (obi_set_float_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obifloat_t) score) < 0)
// {
// obidebug(1, "\nError writing alignment score in a column");
// return -1;
// }
// }
// else
// {
// if (obi_set_int_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obiint_t) score) < 0)
// {
// obidebug(1, "\nError writing alignment score in a column");
// return -1;
// }
// }
//
// free(seq1);
// free(seq2);
//
// k++;
// }
// }
//
// return 0;
//}