From d99447c12b2505ce96d874cf2e65a63bb1884777 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Fri, 16 Dec 2016 19:39:02 +0100 Subject: [PATCH] C function for LCS alignment of two columns, and optimized and fixed line count bug in function to align one column --- src/obi_align.c | 466 +++++++++++++++++++++++++++++++++++++++--------- src/obi_align.h | 61 ++++++- 2 files changed, 430 insertions(+), 97 deletions(-) diff --git a/src/obi_align.c b/src/obi_align.c index a23e27f..97f6d4c 100644 --- a/src/obi_align.c +++ b/src/obi_align.c @@ -1,12 +1,12 @@ /**************************************************************************** - * Sequence alignment functions * + * LCS sequence alignment functions * ****************************************************************************/ /** * @file obi_align.c * @author Celine Mercier * @date May 4th 2016 - * @brief Functions handling sequence alignments. + * @brief Functions handling LCS sequence alignments. */ @@ -407,7 +407,7 @@ int obi_lcs_align_one_column(OBIDMS_p dms, const char* seq_view_name, const char Obiview_p seq_view = NULL; Obiview_p output_view = NULL; OBIDMS_column_p iseq_column = NULL; - OBIDMS_column_p id_column; + OBIDMS_column_p id_column = NULL; OBIDMS_column_p id1_column = NULL; OBIDMS_column_p id2_column = NULL; OBIDMS_column_p seq1_column = NULL; @@ -451,6 +451,14 @@ int obi_lcs_align_one_column(OBIDMS_p dms, const char* seq_view_name, const char return -1; } + // Check column type + if ((iseq_column->header)->returned_data_type != OBI_SEQ) + { + obi_set_errno(OBI_ALIGN_ERROR); + obidebug(1, "\nError: column given to align is not an OBI_SEQ column"); + return -1; + } + // Get element index of the sequence to align in each line to compute it only once if ((strcmp(seq_elt_name, "") != 0) && (seq_elt_name != NULL)) { @@ -527,21 +535,30 @@ int obi_lcs_align_one_column(OBIDMS_p dms, const char* seq_view_name, const char return -1; } - seq_count = (iseq_column->header)->lines_used; + seq_count = (seq_view->infos)->line_count; for (i=0; i < (seq_count - 1); i++) { if (i%100 == 0) fprintf(stderr,"\rDone : %f %% ", (i / (float) seq_count)*100); + // Get first id idx + id1_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0); // TODO Could there be multiple IDs per line? + // Get first sequence and its index + seq1_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, i, seq_elt_idx); + blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, i, seq_elt_idx); + if (blob1 == NULL) + { + obidebug(1, "\nError retrieving sequences to align"); + return -1; + } + for (j=i+1; j < seq_count; j++) { - blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, i, seq_elt_idx); - blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, j, seq_elt_idx); - seq1_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, i, seq_elt_idx); + // Get second sequence and its index seq2_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, j, seq_elt_idx); - - if ((blob1 == NULL) || (blob2 == NULL)) + blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, j, seq_elt_idx); + if (blob2 == NULL) { obidebug(1, "\nError retrieving sequences to align"); return -1; @@ -569,10 +586,9 @@ int obi_lcs_align_one_column(OBIDMS_p dms, const char* seq_view_name, const char } if ((score >= 0) && (((normalize || similarity_mode) && (score >= threshold)) || ((!similarity_mode && !normalize) && (score <= threshold)))) - { // Print result // TODO make separate function maybe + { // Print result - // Get ids idx - id1_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0); // TODO Could there be multiple IDs per line? + // Get second id idx id2_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, id_column, j, 0); if (print_alignment_result(output_view, k, @@ -609,82 +625,354 @@ int obi_lcs_align_one_column(OBIDMS_p dms, const char* seq_view_name, const char } -// TODO discuss if 2 input views or 2 columns or both possible -//int obi_align_two_columns(Obiview_p seq_view, OBIDMS_column_p seq_column_1, OBIDMS_column_p seq_column_2, // TODO it's implied both seq columns are in the same view but maybe it shouldn't -// Obiview_p score_view, OBIDMS_column_p score_column, -// double threshold, bool normalize, int reference, bool similarity_mode) -//{ -// index_t i, j, k; -// index_t seq_count_1; -// index_t seq_count_2; -// char* seq1; -// char* seq2; -// double score; -// -// k = 0; -// -// if (((seq_column_1->header)->returned_data_type != OBI_SEQ) || ((seq_column_2->header)->returned_data_type != OBI_SEQ)) -// { -// obi_set_errno(OBI_ALIGN_ERROR); -// obidebug(1, "\nTrying to align a column of a different type than OBI_SEQ"); -// return -1; -// } -// -// if ((normalize && ((score_column->header)->returned_data_type != OBI_FLOAT)) || -// (!normalize && ((score_column->header)->returned_data_type != OBI_INT))) -// { -// obi_set_errno(OBI_ALIGN_ERROR); -// obidebug(1, "\nTrying to store alignment scores in a column of an inappropriate type"); -// return -1; -// } -// -// seq_count_1 = (seq_column_1->header)->lines_used; -// seq_count_2 = (seq_column_2->header)->lines_used; -// -// for (i=0; i < (seq_count_1 - 1); i++) -// { -// for (j=0; j < seq_count_2; j++) -// { -// //fprintf(stderr, "\ni=%lld, j=%lld, k=%lld", i, j, k); -// -// seq1 = obi_get_seq_with_elt_idx_and_col_p_in_view(seq_view, seq_column_1, i, 0); -// seq2 = obi_get_seq_with_elt_idx_and_col_p_in_view(seq_view, seq_column_2, j, 0); -// -// if ((seq1 == NULL) || (seq2 == NULL)) -// { -// obidebug(1, "\nError retrieving sequences to align"); -// return -1; -// } -// -// // TODO kmer filter -// -// score = generic_sse_banded_lcs_align(seq1, seq2, threshold, normalize, reference, similarity_mode); -// -// if (normalize) -// { -// if (obi_set_float_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obifloat_t) score) < 0) -// { -// obidebug(1, "\nError writing alignment score in a column"); -// return -1; -// } -// } -// else -// { -// if (obi_set_int_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obiint_t) score) < 0) -// { -// obidebug(1, "\nError writing alignment score in a column"); -// return -1; -// } -// } -// -// free(seq1); -// free(seq2); -// -// k++; -// } -// } -// -// return 0; -//} +int obi_lcs_align_two_columns(OBIDMS_p dms, + const char* seq1_view_name, + const char* seq2_view_name, + const char* seq1_column_name, + const char* seq2_column_name, + const char* seq1_elt_name, + const char* seq2_elt_name, + const char* id1_column_name, + const char* id2_column_name, + const char* output_view_name, const char* output_view_comments, + bool print_seq, bool print_count, + double threshold, bool normalize, int reference, bool similarity_mode) +{ + index_t i, j, k; + index_t seq1_count; + index_t seq2_count; + index_t id1_idx, id2_idx; + index_t seq1_idx, seq2_idx; + double score; + int lcs_length; + int ali_length; + Kmer_table_p ktable; + Obi_blob_p blob1; + Obi_blob_p blob2; + int lcs_min; + index_t seq1_elt_idx; + index_t seq2_elt_idx; + bool same_indexer; + + Obiview_p seq1_view = NULL; + Obiview_p seq2_view = NULL; + Obiview_p output_view = NULL; + OBIDMS_column_p i_seq1_column = NULL; + OBIDMS_column_p i_seq2_column = NULL; + OBIDMS_column_p i_id1_column = NULL; + OBIDMS_column_p i_id2_column = NULL; + OBIDMS_column_p id1_column = NULL; + OBIDMS_column_p id2_column = NULL; + OBIDMS_column_p seq1_column = NULL; + OBIDMS_column_p seq2_column = NULL; + //OBIDMS_column_p count1_column = NULL; + //OBIDMS_column_p count2_column = NULL; + OBIDMS_column_p idx1_column = NULL; + OBIDMS_column_p idx2_column = NULL; + OBIDMS_column_p lcs_length_column = NULL; + OBIDMS_column_p ali_length_column = NULL; + OBIDMS_column_p score_column = NULL; + + k = 0; + + // Open the first input view + seq1_view = obi_open_view(dms, seq1_view_name); + if (seq1_view == NULL) + { + obidebug(1, "\nError opening the first input view to align"); + return -1; + } + + // Open the second input view. Same as 1st if "" + if (strcmp(seq2_view_name, "") == 0) + seq2_view = seq1_view; + else + { + seq2_view = obi_open_view(dms, seq2_view_name); + if (seq2_view == NULL) + { + obidebug(1, "\nError opening the second input view to align"); + return -1; + } + } + + // Open the first sequence column to align + // If a column name wasn't given, open default sequence column + if (strcmp(seq1_column_name, "") == 0) + { + if (strcmp((seq1_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0) + i_seq1_column = obi_view_get_column(seq1_view, NUC_SEQUENCE_COLUMN); + else + { + obi_set_errno(OBI_ALIGN_ERROR); + obidebug(1, "\nError: no first column given to align"); + return -1; + } + } + else + i_seq1_column = obi_view_get_column(seq1_view, seq1_column_name); + if (i_seq1_column == NULL) + { + obidebug(1, "\nError getting the first column to align"); + return -1; + } + + // Check column type + if ((i_seq1_column->header)->returned_data_type != OBI_SEQ) + { + obi_set_errno(OBI_ALIGN_ERROR); + obidebug(1, "\nError: first column given to align is not an OBI_SEQ column"); + return -1; + } + + // Open the second sequence column to align + // If a column name wasn't given, open default sequence column + if (strcmp(seq2_column_name, "") == 0) + { + if (strcmp((seq2_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0) + i_seq2_column = obi_view_get_column(seq2_view, NUC_SEQUENCE_COLUMN); + else + { + obi_set_errno(OBI_ALIGN_ERROR); + obidebug(1, "\nError: no second column given to align"); + return -1; + } + } + else + i_seq2_column = obi_view_get_column(seq2_view, seq2_column_name); + if (i_seq2_column == NULL) + { + obidebug(1, "\nError getting the second column to align"); + return -1; + } + // Check that the sequence columns are not both the default NUC_SEQ column of the same view + if (i_seq1_column == i_seq2_column) + { + obidebug(1, "\nError: trying to align a column with itself (default NUC_SEQ column of the same view)"); + return -1; + } + + // Check column type + if ((i_seq2_column->header)->returned_data_type != OBI_SEQ) + { + obi_set_errno(OBI_ALIGN_ERROR); + obidebug(1, "\nError: second column given to align is not an OBI_SEQ column"); + return -1; + } + + // Get element index of the sequence to align in each line of the first column to compute it only once + if ((strcmp(seq1_elt_name, "") != 0) && (seq1_elt_name != NULL)) + { + seq1_elt_idx = obi_column_get_element_index_from_name(i_seq1_column, seq1_elt_name); + if (seq1_elt_idx == OBIIdx_NA) + { + obidebug(1, "\nError getting the sequence index in a column line when aligning"); + return -1; + } + } + else + seq1_elt_idx = 0; + + // Get element index of the sequence to align in each line of the second column to compute it only once + if ((strcmp(seq2_elt_name, "") != 0) && (seq2_elt_name != NULL)) + { + seq2_elt_idx = obi_column_get_element_index_from_name(i_seq2_column, seq2_elt_name); + if (seq2_elt_idx == OBIIdx_NA) + { + obidebug(1, "\nError getting the sequence index in a column line when aligning"); + return -1; + } + } + else + seq2_elt_idx = 0; + // Open the first ID column, containing the identifiers of the first sequence to align + // If a column name wasn't given, open default ID column + if (strcmp(id1_column_name, "") == 0) + { + if (strcmp((seq1_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0) + i_id1_column = obi_view_get_column(seq1_view, ID_COLUMN); + else + { + obi_set_errno(OBI_ALIGN_ERROR); + obidebug(1, "\nError: no first ID column given"); + return -1; + } + } + else + i_id1_column = obi_view_get_column(seq1_view, id1_column_name); + if (i_id1_column == NULL) + { + obidebug(1, "\nError getting the first ID column"); + return -1; + } + + // Open the second ID column, containing the identifiers of the second sequence to align + // If a column name wasn't given, open default ID column + if (strcmp(id2_column_name, "") == 0) + { + if (strcmp((seq2_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0) + i_id2_column = obi_view_get_column(seq2_view, ID_COLUMN); + else + { + obi_set_errno(OBI_ALIGN_ERROR); + obidebug(1, "\nError: no second ID column given"); + return -1; + } + } + else + i_id2_column = obi_view_get_column(seq2_view, id2_column_name); + if (i_id2_column == NULL) + { + obidebug(1, "\nError getting the second ID column"); + return -1; + } + + // Create the output view + output_view = obi_new_view(dms, output_view_name, NULL, NULL, output_view_comments); + if (output_view == NULL) + { + obidebug(1, "\nError creating the output view when aligning"); + return -1; + } + + // Create the output columns + if (create_alignment_output_columns(output_view, + (i_id1_column->header)->indexer_name, (i_id2_column->header)->indexer_name, + (i_seq1_column->header)->indexer_name, (i_seq2_column->header)->indexer_name, + print_seq, print_count, normalize, reference, similarity_mode) < 0) + return -1; + id1_column = obi_view_get_column(output_view, ID1_COLUMN_NAME); + id2_column = obi_view_get_column(output_view, ID2_COLUMN_NAME); + idx1_column = obi_view_get_column(output_view, IDX1_COLUMN_NAME); + idx2_column = obi_view_get_column(output_view, IDX2_COLUMN_NAME); + lcs_length_column = obi_view_get_column(output_view, LCS_LENGTH_COLUMN_NAME); + if ((reference == ALILEN) && (normalize || !similarity_mode)) + ali_length_column = obi_view_get_column(output_view, ALI_LENGTH_COLUMN_NAME); + score_column = obi_view_get_column(output_view, SCORE_COLUMN_NAME); + if (print_seq) + { + seq1_column = obi_view_get_column(output_view, SEQ1_COLUMN_NAME); + seq2_column = obi_view_get_column(output_view, SEQ2_COLUMN_NAME); + } +// if (print_count) // TODO count columns not implemented yet +// { +// count1_column = obi_view_get_column(seq_view, COUNT1_COLUMN_NAME); +// count2_column = obi_view_get_column(seq_view, COUNT2_COLUMN_NAME); +// } + + // Check if the sequence columns share the same indexer (allows for quick checking of sequence equality) + if (strcmp((i_seq1_column->header)->indexer_name, (i_seq2_column->header)->indexer_name) == 0) + same_indexer = true; + else + same_indexer = false; + + // Build kmer tables + ktable = hash_two_seq_columns(seq1_view, i_seq1_column, seq1_elt_idx, seq2_view, i_seq2_column, seq2_elt_idx); + if (ktable == NULL) + { + obi_set_errno(OBI_ALIGN_ERROR); + obidebug(1, "\nError building kmer tables before aligning"); + return -1; + } + + seq1_count = (seq1_view->infos)->line_count; + seq2_count = (seq2_view->infos)->line_count; + + for (i=0; i < seq1_count; i++) + { + if (i%100 == 0) + fprintf(stderr,"\rDone : %f %% ", (i / (float) seq1_count)*100); + + // Get id index of first sequence + id1_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq1_view, i_id1_column, i, 0); // TODO Could there be multiple IDs per line? + // Get first sequence and its index + seq1_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq1_view, i_seq1_column, i, seq1_elt_idx); + blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq1_view, i_seq1_column, i, seq1_elt_idx); + if (blob1 == NULL) + { + obidebug(1, "\nError retrieving sequences to align"); + return -1; + } + + for (j=0; j < seq2_count; j++) + { + // Get second sequence and its index + seq2_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq2_view, i_seq2_column, j, seq2_elt_idx); + blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq2_view, i_seq2_column, j, seq2_elt_idx); + if (blob2 == NULL) + { + obidebug(1, "\nError retrieving sequences to align"); + return -1; + } + + // Check if the sequences are identical in a quick way (same index in the same indexer) + if (same_indexer && (seq1_idx == seq2_idx)) + { + if (similarity_mode && normalize) + score = 1.0; + else if (!similarity_mode) + score = 0.0; + else + score = blob1->length_decoded_value; + } + + else // the sequences aren't identical + { + // kmer filter (offset for the index of the kmer table of the 2nd sequence because the kmer tables of the 2 sequence columns are concatenated in one) + align_filters(ktable, blob1, blob2, i, seq1_count+j, threshold, normalize, reference, similarity_mode, &score, &lcs_min, false); + + // Compute alignment score + if ((threshold == 0) || (score == -1.0)) // no threshold, or filter passed: align + score = obiblob_sse_banded_lcs_align(blob1, blob2, threshold, normalize, reference, similarity_mode, &lcs_length, &ali_length); + } + + if ((score >= 0) && (((normalize || similarity_mode) && (score >= threshold)) || ((!similarity_mode && !normalize) && (score <= threshold)))) + { // Print result + + // Get second id idx + id2_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq2_view, i_id2_column, j, 0); + + if (print_alignment_result(output_view, k, + idx1_column, idx2_column, i, j, + id1_column, id2_column, id1_idx, id2_idx, + print_seq, seq1_column, seq2_column, seq1_idx, seq2_idx, + //print_count, count1_column, count2_column, count1, count2, + ali_length_column, ali_length, + lcs_length_column, lcs_length, + score_column, score, + reference, normalize, similarity_mode) < 0) + return -1; + + k++; + } + } + } + + // Close views + if (seq2_view != seq1_view) + { + if (obi_close_view(seq2_view) < 0) + { + obidebug(1, "\nError closing the second input view after aligning"); + return -1; + } + } + if (obi_close_view(seq1_view) < 0) + { + obidebug(1, "\nError closing the first input view after aligning"); + return -1; + } + + if (obi_close_view(output_view) < 0) + { + obidebug(1, "\nError closing the output view after aligning"); + return -1; + } + + free_kmer_tables(ktable, seq1_count + seq2_count); + + return 0; +} + diff --git a/src/obi_align.h b/src/obi_align.h index c0d823c..68048bd 100644 --- a/src/obi_align.h +++ b/src/obi_align.h @@ -1,12 +1,12 @@ /**************************************************************************** - * Sequence alignment functions header file * + * LCS sequence alignment functions header file * ****************************************************************************/ /** * @file obi_align.h * @author Celine Mercier * @date May 11th 2016 - * @brief Header file for the functions handling the alignment of DNA sequences. + * @brief Header file for the functions handling the LCS alignment of DNA sequences. */ @@ -55,7 +55,7 @@ /** - * @brief Aligns a NUC_SEQ column with itself. + * @brief Aligns an OBI_SEQ column with itself. * * Note: The columns where the results are written are automatically named and created. * @@ -96,14 +96,59 @@ int obi_lcs_align_one_column(OBIDMS_p dms, /** - * @brief + * @brief Aligns two OBI_SEQ columns. * - * TODO + * The columns must belong to the same OBIDMS, but can belong to different views. * + * Note: The columns where the results are written are automatically named and created. + * + * @param dms A pointer on an OBIDMS. + * @param seq1_view_name The name of the view where the first column to align is. + * @param seq2_view_name The name of the view where the second column to align is ("" if it is the same view as the first one). + * @param seq1_column_name The name of the first OBI_SEQ column in the input view to align. + * If "" (empty string), and the input view is of type NUC_SEQS_VIEW, the associated "NUC_SEQ" column is aligned. + * @param seq2_column_name The name of the second OBI_SEQ column in the input view to align. + * If "" (empty string), and the input view is of type NUC_SEQS_VIEW, the associated "NUC_SEQ" column is aligned. + * @param seq1_elt_name The name of the element in the first column corresponding to the sequence to align, if the column has multiple + * elements per line. + * @param seq2_elt_name The name of the element in the second column corresponding to the sequence to align, if the column has multiple + * elements per line. + * @param id1_column_name The name of the column in the first input view containing the identifiers of the first sequence to align. + * If "" (empty string), and the input view is of type NUC_SEQS_VIEW, the associated "ID" column is aligned. + * @param id2_column_name The name of the column in the second input view containing the identifiers of the second sequence to align. + * If "" (empty string), and the input view is of type NUC_SEQS_VIEW, the associated "ID" column is aligned. + * @param output_view_name The name of the output view where the results should be written (should not already exist). + * @param output_view_comments The comments that should be associated with the output view. + * @param print_seq A boolean indicating whether the aligned sequences should be copied in the output view. + * @param print_count A boolean indicating whether the aligned sequence counts should be copied in the output view. + * @param threshold Score threshold. If the score is normalized and expressed in similarity, it is an identity, e.g. 0.95 + * for an identity of 95%. If the score is normalized and expressed in distance, it is (1.0 - identity), + * e.g. 0.05 for an identity of 95%. If the score is not normalized and expressed in similarity, it is + * the length of the Longest Common Subsequence. If the score is not normalized and expressed in distance, + * it is (reference length - LCS length). Only sequence pairs with a similarity above the threshold are printed. + * @param normalize Whether the score should be normalized with the reference sequence length. + * @param reference The reference length. 0: The alignement length; 1: The longest sequence's length; 2: The shortest sequence's length. + * @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false). + * + * @returns A value indicating the success of the operation. + * @retval 0 if the operation was successfully completed. + * @retval -1 if an error occurred. + * + * @since December 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -//int obi_align_two_columns(Obiview_p seq_view, OBIDMS_column_p seq_column_1, OBIDMS_column_p seq_column_2, -// Obiview_p score_view, OBIDMS_column_p score_column, -// double threshold, bool normalize, int reference, bool similarity_mode); +int obi_lcs_align_two_columns(OBIDMS_p dms, + const char* seq1_view_name, + const char* seq2_view_name, + const char* seq1_column_name, + const char* seq2_column_name, + const char* seq1_elt_name, + const char* seq2_elt_name, + const char* id1_column_name, + const char* id2_column_name, + const char* output_view_name, const char* output_view_comments, + bool print_seq, bool print_count, + double threshold, bool normalize, int reference, bool similarity_mode); #endif /* OBI_ALIGN_H_ */