/**************************************************************************** * Sequence alignment functions * ****************************************************************************/ /** * @file obi_align.c * @author Celine Mercier * @date May 4th 2016 * @brief Functions handling sequence alignments. */ #include #include #include #include "obidebug.h" #include "obierrno.h" #include "obitypes.h" #include "obiview.h" #include "sse_banded_LCS_alignment.h" #include "upperband.h" #include "obiblob.h" #define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) // TODO // use openMP pragmas // option pour ecrire en stdint? // check NUC_SEQS view type? and score type (int or float if normalize) // what's with multiple sequences/line columns? int obi_align_one_column(Obiview_p seq_view, OBIDMS_column_p seq_column, const char* seq_name, Obiview_p score_view, OBIDMS_column_p id1_column, OBIDMS_column_p id2_column, OBIDMS_column_p score_column, double threshold, bool normalize, int reference, bool similarity_mode) { index_t i, j, k; index_t seq_count; const char* id1; const char* id2; double score; OBIDMS_column_p id_column; Kmer_table_p ktable; Obi_blob_p blob1; Obi_blob_p blob2; int lcs_min; index_t seq_idx; k = 0; // If no sequence column is given and the view has the type NUC_SEQS_VIEW, the default sequence column is aligned if ((seq_column == NULL) && (strcmp((seq_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0)) { seq_column = obi_view_get_column(seq_view, NUC_SEQUENCE_COLUMN); if (seq_column == NULL) return -1; } // Check that the given sequence column contains nucleotide sequences else if ((seq_column->header)->returned_data_type != OBI_SEQ) { obi_set_errno(OBI_ALIGN_ERROR); obidebug(1, "\nTrying to align a column of a different type than OBI_SEQ"); return -1; } if ((normalize && ((score_column->header)->returned_data_type != OBI_FLOAT)) || (!normalize && ((score_column->header)->returned_data_type != OBI_INT))) { obi_set_errno(OBI_ALIGN_ERROR); obidebug(1, "\nTrying to store alignment scores in a column of an inappropriate type"); return -1; } // Get element index from element name to compute it only once if (seq_name != NULL) { seq_idx = obi_column_get_element_index_from_name(seq_column, seq_name); if (seq_idx == OBIIdx_NA) { obidebug(1, "\nError getting the sequence index in a column line when aligning"); return -1; } } else seq_idx = 0; // Build kmer tables ktable = hash_seq_column(seq_view, seq_column, seq_idx); if (ktable == NULL) { obi_set_errno(OBI_ALIGN_ERROR); obidebug(1, "\nError building kmer tables before aligning"); return -1; } // Get the ID column pointer id_column = obi_view_get_column(seq_view, ID_COLUMN); seq_count = (seq_column->header)->lines_used; for (i=0; i < (seq_count - 1); i++) { if (i%100 == 0) fprintf(stderr,"\rDone : %f %% ", (i / (float) seq_count)*100); for (j=i+1; j < seq_count; j++) { blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, seq_idx); blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, seq_idx); if ((blob1 == NULL) || (blob2 == NULL)) { obidebug(1, "\nError retrieving sequences to align"); return -1; } // Check if the sequences are identical in a quick way (same index in the same indexer) if (obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, seq_column, i, seq_idx) == obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, seq_column, j, seq_idx)) { if (similarity_mode && normalize) score = 1.0; else if (!similarity_mode) score = 0.0; else score = blob1->length_decoded_value; } else // the sequences aren't identical { // kmer filter align_filters(ktable, blob1, blob2, i, j, threshold, normalize, reference, similarity_mode, &score, &lcs_min, false); // Compute alignment score if ((threshold == 0) || (score == -1.0)) // no threshold, or filter passed: align score = obiblob_sse_banded_lcs_align(blob1, blob2, threshold, normalize, reference, similarity_mode); } if ((score >= 0) && (((normalize || similarity_mode) && (score >= threshold)) || ((!similarity_mode && !normalize) && (score <= threshold)))) { // Print result // Get sequence ids id1 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0); // TODO Could there be multiple IDs per line? id2 = obi_get_str_with_elt_idx_and_col_p_in_view(seq_view, id_column, j, 0); // Write sequence ids in output view if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id1_column, k, 0, id1) < 0) { obidebug(1, "\nError writing id1 in a column"); return -1; } if (obi_set_str_with_elt_idx_and_col_p_in_view(score_view, id2_column, k, 0, id2) < 0) { obidebug(1, "\nError writing id2 in a column"); return -1; } // Write score in output view if (normalize) { if (obi_set_float_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obifloat_t) score) < 0) { obidebug(1, "\nError writing alignment score in a column"); return -1; } } else { if (obi_set_int_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obiint_t) score) < 0) { obidebug(1, "\nError writing alignment score in a column"); return -1; } } k++; } } } free_kmer_tables(ktable, seq_count); return 0; } // TODO discuss if 2 input views or 2 columns or both possible //int obi_align_two_columns(Obiview_p seq_view, OBIDMS_column_p seq_column_1, OBIDMS_column_p seq_column_2, // TODO it's implied both seq columns are in the same view but maybe it shouldn't // Obiview_p score_view, OBIDMS_column_p score_column, // double threshold, bool normalize, int reference, bool similarity_mode) //{ // index_t i, j, k; // index_t seq_count_1; // index_t seq_count_2; // char* seq1; // char* seq2; // double score; // // k = 0; // // if (((seq_column_1->header)->returned_data_type != OBI_SEQ) || ((seq_column_2->header)->returned_data_type != OBI_SEQ)) // { // obi_set_errno(OBI_ALIGN_ERROR); // obidebug(1, "\nTrying to align a column of a different type than OBI_SEQ"); // return -1; // } // // if ((normalize && ((score_column->header)->returned_data_type != OBI_FLOAT)) || // (!normalize && ((score_column->header)->returned_data_type != OBI_INT))) // { // obi_set_errno(OBI_ALIGN_ERROR); // obidebug(1, "\nTrying to store alignment scores in a column of an inappropriate type"); // return -1; // } // // seq_count_1 = (seq_column_1->header)->lines_used; // seq_count_2 = (seq_column_2->header)->lines_used; // // for (i=0; i < (seq_count_1 - 1); i++) // { // for (j=0; j < seq_count_2; j++) // { // //fprintf(stderr, "\ni=%lld, j=%lld, k=%lld", i, j, k); // // seq1 = obi_get_seq_with_elt_idx_and_col_p_in_view(seq_view, seq_column_1, i, 0); // seq2 = obi_get_seq_with_elt_idx_and_col_p_in_view(seq_view, seq_column_2, j, 0); // // if ((seq1 == NULL) || (seq2 == NULL)) // { // obidebug(1, "\nError retrieving sequences to align"); // return -1; // } // // // TODO kmer filter // // score = generic_sse_banded_lcs_align(seq1, seq2, threshold, normalize, reference, similarity_mode); // // if (normalize) // { // if (obi_set_float_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obifloat_t) score) < 0) // { // obidebug(1, "\nError writing alignment score in a column"); // return -1; // } // } // else // { // if (obi_set_int_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obiint_t) score) < 0) // { // obidebug(1, "\nError writing alignment score in a column"); // return -1; // } // } // // free(seq1); // free(seq2); // // k++; // } // } // // return 0; //}