2016-05-11 16:36:23 +02:00
/****************************************************************************
 * Sequence alignment functions                                             *
 ****************************************************************************/

/**
 * @file obi_align.c
 * @author Celine Mercier
 * @date May 4th 2016
 * @brief Functions handling sequence alignments.
 */
# include <stdlib.h>
# include <stdio.h>
# include <stdbool.h>
# include "obidebug.h"
# include "obierrno.h"
# include "obitypes.h"
# include "obiview.h"
# include "sse_banded_LCS_alignment.h"
2016-11-18 16:29:28 +01:00
# include "upperband.h"
# include "obiblob.h"
2016-05-11 16:36:23 +02:00
# define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
// TODO
2016-08-10 14:51:02 +02:00
// use openMP pragmas
// option to write in stdint?
// check NUC_SEQS view type? and score type (int or float if normalize)
// what's with multiple sequences/line columns?
2016-05-11 16:36:23 +02:00
2016-11-29 16:15:02 +01:00
/**
 * @brief Aligns every pair of sequences of one column and writes the pairs that pass
 *        the threshold (both sequence ids and the score) in the output view.
 *
 * Each pair (i, j) with i < j is handled as follows:
 *  - if both lines refer to the same index in the indexer, the sequences are identical and
 *    the score is set directly, without aligning;
 *  - otherwise the kmer filter is applied, and the banded LCS alignment is computed when
 *    there is no threshold (threshold == 0) or when the filter returned -1.0.
 * A pair is written on line k of the output columns (k starting at 0 and incremented for
 * each written pair) when its score is >= 0 and either
 * (normalize or similarity_mode) and score >= threshold, or
 * (neither normalize nor similarity_mode) and score <= threshold.
 *
 * @param seq_view        View containing the sequences (and their ID_COLUMN).
 * @param seq_column      Column to align. If NULL, seq_view must be of type VIEW_TYPE_NUC_SEQS and
 *                        its NUC_SEQUENCE_COLUMN is used. A non-NULL column must return OBI_SEQ data.
 * @param seq_name        Name of the element to align within each line, or NULL to use element 0.
 * @param score_view      View in which the results are written.
 * @param id1_column      Output column receiving the id of the first sequence of each pair.
 * @param id2_column      Output column receiving the id of the second sequence of each pair.
 * @param score_column    Output column receiving the score: must return OBI_FLOAT data if
 *                        normalize is true, OBI_INT data otherwise.
 * @param threshold       Score threshold; 0 disables the use of the kmer filter result.
 * @param normalize       Whether the score is normalized (passed to the filter and the aligner).
 * @param reference       Passed unchanged to the filter and the aligner.
 * @param similarity_mode Whether the score is a similarity rather than a distance
 *                        (passed to the filter and the aligner).
 *
 * @retval 0 if the operation was successfully completed.
 * @retval -1 if an error occurred.
 */
int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const char* seq_name,
						 Obiview_p score_view, OBIDMS_column_p id1_column, OBIDMS_column_p id2_column, OBIDMS_column_p score_column,
						 double threshold, bool normalize, int reference, bool similarity_mode)
{
	index_t         i, j, k;
	index_t         seq_count;
	index_t         seq_idx;
	const char*     id1;
	const char*     id2;
	double          score;
	int             lcs_min;
	int             ret_val;
	OBIDMS_column_p id_column;
	Kmer_table_p    ktable;
	Obi_blob_p      blob1;
	Obi_blob_p      blob2;

	k = 0;
	ret_val = -1;

	if (seq_column == NULL)
	{
		// Without an explicit column, only a NUC_SEQS view provides a default sequence column.
		// Any other view type is an error (and must not reach the type check below with a NULL column).
		if (strcmp((seq_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) != 0)
		{
			obi_set_errno(OBI_ALIGN_ERROR);
			obidebug(1, "\nNo sequence column given to align and the view has no default sequence column");
			return -1;
		}

		seq_column = obi_view_get_column(seq_view, NUC_SEQUENCE_COLUMN);
		if (seq_column == NULL)
			return -1;
	}
	// Check that the given sequence column contains nucleotide sequences
	else if ((seq_column->header)->returned_data_type != OBI_SEQ)
	{
		obi_set_errno(OBI_ALIGN_ERROR);
		obidebug(1, " \n Trying to align a column of a different type than OBI_SEQ ");
		return -1;
	}

	// Normalized scores are stored as floats, raw scores as integers
	if ((normalize && ((score_column->header)->returned_data_type != OBI_FLOAT)) ||
		(!normalize && ((score_column->header)->returned_data_type != OBI_INT)))
	{
		obi_set_errno(OBI_ALIGN_ERROR);
		obidebug(1, " \n Trying to store alignment scores in a column of an inappropriate type ");
		return -1;
	}

	// Get element index from element name to compute it only once
	if (seq_name != NULL)
	{
		seq_idx = obi_column_get_element_index_from_name(seq_column, seq_name);
		if (seq_idx == OBIIdx_NA)
		{
			obidebug(1, " \n Error getting the sequence index in a column line when aligning ");
			return -1;
		}
	}
	else
		seq_idx = 0;

	// Build kmer tables
	ktable = hash_seq_column(seq_view, seq_column, seq_idx);
	if (ktable == NULL)
	{
		obi_set_errno(OBI_ALIGN_ERROR);
		obidebug(1, " \n Error building kmer tables before aligning ");
		return -1;
	}

	// From here on, every exit goes through 'cleanup' so that the kmer tables are always freed
	seq_count = (seq_column->header)->lines_used;

	// Get the ID column pointer
	id_column = obi_view_get_column(seq_view, ID_COLUMN);
	if (id_column == NULL)
	{
		obidebug(1, "\nError getting the ID column when aligning");
		goto cleanup;
	}

	for (i=0; i < (seq_count - 1); i++)
	{
		// Progress report on stderr every 100 lines
		if (i%100 == 0)
			fprintf(stderr, " \r Done : %f %% ", (i / (float) seq_count) * 100);

		for (j=i+1; j < seq_count; j++)
		{
			blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, seq_idx);
			blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, seq_idx);

			if ((blob1 == NULL) || (blob2 == NULL))
			{
				obidebug(1, " \n Error retrieving sequences to align ");
				goto cleanup;
			}

			// Check if the sequences are identical in a quick way (same index in the same indexer)
			if (obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, seq_idx) ==
				obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, seq_idx))
			{
				if (similarity_mode && normalize)
					score = 1.0;	// normalized similarity of identical sequences
				else if (!similarity_mode)
					score = 0.0;	// distance between identical sequences
				else
					score = blob1->length_decoded_value;	// raw similarity: length of the decoded value
			}
			else // the sequences aren't identical
			{
				// kmer filter
				align_filters(ktable, blob1, blob2, i, j, threshold, normalize, reference, similarity_mode, &score, &lcs_min, false);

				// Compute alignment score
				if ((threshold == 0) || (score == -1.0))	// no threshold, or filter passed: align
					score = obiblob_sse_banded_lcs_align(blob1, blob2, threshold, normalize, reference, similarity_mode);
			}

			// Keep the pair only if its score is valid and on the right side of the threshold
			if ((score >= 0) &&
				(((normalize || similarity_mode) && (score >= threshold)) ||
				 ((!similarity_mode && !normalize) && (score <= threshold))))
			{	// Print result

				// Get sequence ids
				id1 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0); // TODO Could there be multiple IDs per line?
				id2 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, j, 0);

				// Write sequence ids in output view
				if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id1_column, k, 0, id1) < 0)
				{
					obidebug(1, " \n Error writing id1 in a column ");
					goto cleanup;
				}

				if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id2_column, k, 0, id2) < 0)
				{
					obidebug(1, " \n Error writing id2 in a column ");
					goto cleanup;
				}

				// Write score in output view
				if (normalize)
				{
					if (obi_set_float_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obifloat_t) score) < 0)
					{
						obidebug(1, " \n Error writing alignment score in a column ");
						goto cleanup;
					}
				}
				else
				{
					if (obi_set_int_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obiint_t) score) < 0)
					{
						obidebug(1, " \n Error writing alignment score in a column ");
						goto cleanup;
					}
				}

				k++;
			}
		}
	}

	ret_val = 0;

cleanup:
	free_kmer_tables(ktable, seq_count);

	return ret_val;
}
2016-08-10 14:51:02 +02:00
// TODO discuss if 2 input views or 2 columns or both possible
//int obi_align_two_columns(Obiview_p seq_view, OBIDMS_column_p seq_column_1, OBIDMS_column_p seq_column_2, // TODO it's implied both seq columns are in the same view but maybe it shouldn't
// Obiview_p score_view, OBIDMS_column_p score_column,
// double threshold, bool normalize, int reference, bool similarity_mode)
//{
// index_t i, j, k;
// index_t seq_count_1;
// index_t seq_count_2;
// char* seq1;
// char* seq2;
// double score;
//
// k = 0;
//
// if (((seq_column_1->header)->returned_data_type != OBI_SEQ) || ((seq_column_2->header)->returned_data_type != OBI_SEQ))
// {
// obi_set_errno(OBI_ALIGN_ERROR);
// obidebug(1, "\nTrying to align a column of a different type than OBI_SEQ");
// return -1;
// }
//
// if ((normalize && ((score_column->header)->returned_data_type != OBI_FLOAT)) ||
// (!normalize && ((score_column->header)->returned_data_type != OBI_INT)))
// {
// obi_set_errno(OBI_ALIGN_ERROR);
// obidebug(1, "\nTrying to store alignment scores in a column of an inappropriate type");
// return -1;
// }
//
// seq_count_1 = (seq_column_1->header)->lines_used;
// seq_count_2 = (seq_column_2->header)->lines_used;
//
// for (i=0; i < (seq_count_1 - 1); i++)
// {
// for (j=0; j < seq_count_2; j++)
// {
// //fprintf(stderr, "\ni=%lld, j=%lld, k=%lld", i, j, k);
//
// seq1 = obi_get_seq_with_elt_idx_and_col_p_in_view(seq_view, seq_column_1, i, 0);
// seq2 = obi_get_seq_with_elt_idx_and_col_p_in_view(seq_view, seq_column_2, j, 0);
//
// if ((seq1 == NULL) || (seq2 == NULL))
// {
// obidebug(1, "\nError retrieving sequences to align");
// return -1;
// }
//
// // TODO kmer filter
//
// score = generic_sse_banded_lcs_align(seq1, seq2, threshold, normalize, reference, similarity_mode);
//
// if (normalize)
// {
// if (obi_set_float_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obifloat_t) score) < 0)
// {
// obidebug(1, "\nError writing alignment score in a column");
// return -1;
// }
// }
// else
// {
// if (obi_set_int_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obiint_t) score) < 0)
// {
// obidebug(1, "\nError writing alignment score in a column");
// return -1;
// }
// }
//
// free(seq1);
// free(seq2);
//
// k++;
// }
// }
//
// return 0;
//}