From 9b24818fe2b2381eddb39df607a4eba25377a5ad Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Tue, 13 Dec 2016 17:18:12 +0100 Subject: [PATCH 01/22] Refactored alignment code for minimum redundancy between the function that aligns 1 column and the function that aligns 2 columns --- src/obi_align.c | 546 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 368 insertions(+), 178 deletions(-) diff --git a/src/obi_align.c b/src/obi_align.c index 390ff60..a23e27f 100644 --- a/src/obi_align.c +++ b/src/obi_align.c @@ -31,6 +31,360 @@ // use openMP pragmas +/************************************************************************** + * + * D E C L A R A T I O N O F T H E P R I V A T E F U N C T I O N S + * + **************************************************************************/ + + +/** + * @brief Internal function creating the columns where the alignment results are written. + * + * @param output_view A pointer on the writable view where the columns should be created. + * @param id1_indexer_name The name of the indexer where the id of the 1st sequence aligned is indexed. + * @param id2_indexer_name The name of the indexer where the id of the 2nd sequence aligned is indexed. + * @param seq1_indexer_name The name of the indexer where the 1st sequence aligned is indexed (needed only if print_seq is True). + * @param seq2_indexer_name The name of the indexer where the 2nd sequence aligned is indexed (needed only if print_seq is True). + * @param print_seq A boolean indicating whether the aligned sequences should be copied in the output view. + * @param print_count A boolean indicating whether the aligned sequence counts should be copied in the output view. + * @param normalize Whether the score should be normalized with the reference sequence length. + * @param reference The reference length. 0: The alignment length; 1: The longest sequence's length; 2: The shortest sequence's length. 
+ * @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false). + * + * @retval 0 if the operation was successfully completed. + * @retval -1 if an error occurred. + * + * @since December 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int create_alignment_output_columns(Obiview_p output_view, + const char* id1_indexer_name, + const char* id2_indexer_name, + const char* seq1_indexer_name, + const char* seq2_indexer_name, + bool print_seq, bool print_count, + bool normalize, int reference, bool similarity_mode); + + +/** + * @brief Internal function printing the result of one alignment to the output view. + * + * @param output_view A pointer on the writable view where the alignment result should be written. + * @param line The line in the output view where the result should be written. + * @param idx1_column A pointer on the column where the index referring to the line of the first sequence aligned in the input view should be written. + * @param idx2_column A pointer on the column where the index referring to the line of the second sequence aligned in the input view should be written. + * @param idx1 The index referring to the line of the first sequence aligned in the input view. + * @param idx2 The index referring to the line of the second sequence aligned in the input view. + * @param id1_column A pointer on the column where the identifier of the first sequence aligned should be written. + * @param id2_column A pointer on the column where the identifier of the second sequence aligned should be written. + * @param id1_idx The index of the identifier of the first sequence aligned. + * @param id2_idx The index of the identifier of the second sequence aligned. + * @param print_seq A boolean indicating whether the aligned sequences should be copied in the output view. + * @param seq1_column A pointer on the column where the first sequence aligned should be written. 
+ * @param seq2_column A pointer on the column where the second sequence aligned should be written. + * @param seq1_idx The index of the sequence of the first sequence aligned. + * @param seq2_idx The index of the sequence of the second sequence aligned. + * @param print_count A boolean indicating whether the aligned sequence counts should be copied in the output view. // Count columns not implemented yet + * @param count1_column A pointer on the column where the count of the first sequence aligned should be written. + * @param count2_column A pointer on the column where the count of the second sequence aligned should be written. + * @param count1 The count of the first sequence aligned. + * @param count2 The count of the second sequence aligned. + * @param ali_length_column A pointer on the column where the alignment length should be written. + * @param ali_length The alignment length. + * @param lcs_length_column A pointer on the column where the LCS length should be written. + * @param lcs_length The LCS length. + * @param score_column A pointer on the column where the score should be written. + * @param score The alignment score. + * @param reference The reference length. 0: The alignment length; 1: The longest sequence's length; 2: The shortest sequence's length. + * @param normalize Whether the score should be normalized with the reference sequence length. + * @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false). + * + * @retval 0 if the operation was successfully completed. + * @retval -1 if an error occurred. 
+ * + * @since December 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int print_alignment_result(Obiview_p output_view, + index_t line, + OBIDMS_column_p idx1_column, + OBIDMS_column_p idx2_column, + index_t idx1, + index_t idx2, + OBIDMS_column_p id1_column, + OBIDMS_column_p id2_column, + index_t id1_idx, + index_t id2_idx, + bool print_seq, + OBIDMS_column_p seq1_column, + OBIDMS_column_p seq2_column, + index_t seq1_idx, + index_t seq2_idx, +// bool print_count, +// OBIDMS_column_p count1_column, +// OBIDMS_column_p count2_column, +// int count1, +// int count2, + OBIDMS_column_p ali_length_column, + int ali_length, + OBIDMS_column_p lcs_length_column, + int lcs_length, + OBIDMS_column_p score_column, + double score, + int reference, + bool normalize, + bool similarity_mode); + + + +/************************************************************************ + * + * D E F I N I T I O N O F T H E P R I V A T E F U N C T I O N S + * + ************************************************************************/ + + +static int create_alignment_output_columns(Obiview_p output_view, + const char* id1_indexer_name, + const char* id2_indexer_name, + const char* seq1_indexer_name, + const char* seq2_indexer_name, + bool print_seq, bool print_count, + bool normalize, int reference, bool similarity_mode) +{ + // Create the column for the ids of the 1st sequence aligned + if (obi_view_add_column(output_view, ID1_COLUMN_NAME, -1, ID1_COLUMN_NAME, OBI_STR, 0, 1, NULL, id1_indexer_name, NULL, -1, ID1_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the first column for the sequence ids when aligning"); + return -1; + } + + // Create the column for the ids of the 2nd sequence aligned + if (obi_view_add_column(output_view, ID2_COLUMN_NAME, -1, ID2_COLUMN_NAME, OBI_STR, 0, 1, NULL, id2_indexer_name, NULL, -1, ID2_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the second column for the sequence ids when aligning"); + 
return -1; + } + + // Create the column for the index (in the input view) of the first sequences aligned + if (obi_view_add_column(output_view, IDX1_COLUMN_NAME, -1, IDX1_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, IDX1_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the first column for the sequence indices when aligning"); + return -1; + } + + // Create the column for the index (in the input view) of the second sequences aligned + if (obi_view_add_column(output_view, IDX2_COLUMN_NAME, -1, IDX2_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, IDX2_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the second column for the sequence indices when aligning"); + return -1; + } + + // Create the column for the LCS length + if (obi_view_add_column(output_view, LCS_LENGTH_COLUMN_NAME, -1, LCS_LENGTH_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, LCS_LENGTH_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the column for the LCS length when aligning"); + return -1; + } + + // Create the column for the alignment length if it is computed + if ((reference == ALILEN) && (normalize || !similarity_mode)) + { + if (obi_view_add_column(output_view, ALI_LENGTH_COLUMN_NAME, -1, ALI_LENGTH_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, ALI_LENGTH_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the column for the alignment length when aligning"); + return -1; + } + } + // Create the column for the alignment score + if (normalize) + { + if (obi_view_add_column(output_view, SCORE_COLUMN_NAME, -1, SCORE_COLUMN_NAME, OBI_FLOAT, 0, 1, NULL, NULL, NULL, -1, SCORE_COLUMN_NAME, true) < 0) + { + obidebug(1, "\nError creating the column for the score when aligning"); + return -1; + } + } + else + { + if (obi_view_add_column(output_view, SCORE_COLUMN_NAME, -1, SCORE_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, SCORE_COLUMN_NAME, true) < 0) + { + obidebug(1, "\nError creating the column for the score when aligning"); + 
return -1; + } + } + + if (print_seq) + { + // Create the column for the first sequences aligned + if (obi_view_add_column(output_view, SEQ1_COLUMN_NAME, -1, SEQ1_COLUMN_NAME, OBI_SEQ, 0, 1, NULL, seq1_indexer_name, NULL, -1, SEQ1_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the first column for the sequences when aligning"); + return -1; + } + + // Create the column for the second sequences aligned + if (obi_view_add_column(output_view, SEQ2_COLUMN_NAME, -1, SEQ2_COLUMN_NAME, OBI_SEQ, 0, 1, NULL, seq2_indexer_name, NULL, -1, SEQ2_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the second column for the sequences when aligning"); + return -1; + } + } +// if (print_count) // TODO count columns not implemented yet +// { +// // Create the column for the count of the first sequences aligned +// if (obi_view_add_column(output_view, COUNT1_COLUMN_NAME, -1, COUNT1_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, COUNT1_COLUMN_COMMENTS, true) < 0) +// { +// obidebug(1, "\nError creating the first column for the sequence counts when aligning"); +// return -1; +// } +// +// // Create the column for the count of the second sequences aligned +// if (obi_view_add_column(output_view, COUNT2_COLUMN_NAME, -1, COUNT2_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, COUNT2_COLUMN_COMMENTS, true) < 0) +// { +// obidebug(1, "\nError creating the second column for the sequence counts when aligning"); +// return -1; +// } +// } + + return 0; +} + + +static int print_alignment_result(Obiview_p output_view, + index_t line, + OBIDMS_column_p idx1_column, + OBIDMS_column_p idx2_column, + index_t idx1, + index_t idx2, + OBIDMS_column_p id1_column, + OBIDMS_column_p id2_column, + index_t id1_idx, + index_t id2_idx, + bool print_seq, + OBIDMS_column_p seq1_column, + OBIDMS_column_p seq2_column, + index_t seq1_idx, + index_t seq2_idx, +// bool print_count, +// OBIDMS_column_p count1_column, +// OBIDMS_column_p count2_column, +// int count1, +// int count2, + 
OBIDMS_column_p ali_length_column, + int ali_length, + OBIDMS_column_p lcs_length_column, + int lcs_length, + OBIDMS_column_p score_column, + double score, + int reference, + bool normalize, + bool similarity_mode) +{ + // Write line indices of the input view in the output view (to easily refer to the input sequences from the output view) + if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, idx1_column, line, 0, idx1) < 0) + { + obidebug(1, "\nError writing idx1 in a column"); + return -1; + } + if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, idx2_column, line, 0, idx2) < 0) + { + obidebug(1, "\nError writing idx2 in a column"); + return -1; + } + + // Write ids in output view + if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, id1_column, line, 0, id1_idx) < 0) + { + obidebug(1, "\nError writing id1 in a column"); + return -1; + } + if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, id2_column, line, 0, id2_idx) < 0) + { + obidebug(1, "\nError writing id2 in a column"); + return -1; + } + + // Write the sequences if needed + if (print_seq) + { + if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, seq1_column, line, 0, seq1_idx) < 0) + { + obidebug(1, "\nError writing seq1 in a column"); + return -1; + } + + if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, seq2_column, line, 0, seq2_idx) < 0) + { + obidebug(1, "\nError writing seq2 in a column"); + return -1; + } + } + +// // Write the counts if needed // TODO count columns not implemented yet +// if (print_count) +// { +// if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, count1_column, line, 0, count1) < 0) +// { +// obidebug(1, "\nError writing count1 in a column"); +// return -1; +// } +// +// if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, count2_column, line, 0, count2) < 0) +// { +// obidebug(1, "\nError writing count2 in a column"); +// return -1; +// } +// } + + // Write the alignment length if it was computed + if 
((reference == ALILEN) && (normalize || !similarity_mode)) + { + if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, ali_length_column, line, 0, ali_length) < 0) + { + obidebug(1, "\nError writing alignment length in a column"); + return -1; + } + } + + // Write the LCS length + if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, lcs_length_column, line, 0, lcs_length) < 0) + { + obidebug(1, "\nError writing LCS length in a column"); + return -1; + } + + // Write score + if (normalize) + { + if (obi_set_float_with_elt_idx_and_col_p_in_view(output_view, score_column, line, 0, (obifloat_t) score) < 0) + { + obidebug(1, "\nError writing alignment score in a column"); + return -1; + } + } + else + { + if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, score_column, line, 0, (obiint_t) score) < 0) + { + obidebug(1, "\nError writing alignment score in a column"); + return -1; + } + } + + return 0; +} + + + +/********************************************************************** + * + * D E F I N I T I O N O F T H E P U B L I C F U N C T I O N S + * + **********************************************************************/ + + int obi_lcs_align_one_column(OBIDMS_p dms, const char* seq_view_name, const char* seq_column_name, const char* seq_elt_name, const char* id_column_name, const char* output_view_name, const char* output_view_comments, @@ -140,114 +494,30 @@ int obi_lcs_align_one_column(OBIDMS_p dms, const char* seq_view_name, const char } // Create the output columns - - // Create the column for the ids of the 1st sequence aligned - if (obi_view_add_column(output_view, ID1_COLUMN_NAME, -1, ID1_COLUMN_NAME, OBI_STR, 0, 1, NULL, (id_column->header)->indexer_name, NULL, -1, ID1_COLUMN_COMMENTS, true) < 0) - { - obidebug(1, "\nError creating the first column for the sequence ids when aligning"); + if (create_alignment_output_columns(output_view, + (id_column->header)->indexer_name, (id_column->header)->indexer_name, + 
(iseq_column->header)->indexer_name, (iseq_column->header)->indexer_name, + print_seq, print_count, normalize, reference, similarity_mode) < 0) return -1; - } id1_column = obi_view_get_column(output_view, ID1_COLUMN_NAME); - - // Create the column for the ids of the 2nd sequence aligned - if (obi_view_add_column(output_view, ID2_COLUMN_NAME, -1, ID2_COLUMN_NAME, OBI_STR, 0, 1, NULL, (id_column->header)->indexer_name, NULL, -1, ID2_COLUMN_COMMENTS, true) < 0) - { - obidebug(1, "\nError creating the second column for the sequence ids when aligning"); - return -1; - } id2_column = obi_view_get_column(output_view, ID2_COLUMN_NAME); - - // Create the column for the index (in the input view) of the first sequences aligned - if (obi_view_add_column(output_view, IDX1_COLUMN_NAME, -1, IDX1_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, IDX1_COLUMN_COMMENTS, true) < 0) - { - obidebug(1, "\nError creating the first column for the sequence indices when aligning"); - return -1; - } idx1_column = obi_view_get_column(output_view, IDX1_COLUMN_NAME); - - // Create the column for the index (in the input view) of the second sequences aligned - if (obi_view_add_column(output_view, IDX2_COLUMN_NAME, -1, IDX2_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, IDX2_COLUMN_COMMENTS, true) < 0) - { - obidebug(1, "\nError creating the second column for the sequence indices when aligning"); - return -1; - } idx2_column = obi_view_get_column(output_view, IDX2_COLUMN_NAME); - - // Create the column for the LCS length - if (obi_view_add_column(output_view, LCS_LENGTH_COLUMN_NAME, -1, LCS_LENGTH_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, LCS_LENGTH_COLUMN_COMMENTS, true) < 0) - { - obidebug(1, "\nError creating the column for the LCS length when aligning"); - return -1; - } - lcs_length_column = obi_view_get_column(output_view, LCS_LENGTH_COLUMN_NAME); - - // Create the column for the alignment length if it is computed + lcs_length_column = obi_view_get_column(output_view, 
LCS_LENGTH_COLUMN_NAME); if ((reference == ALILEN) && (normalize || !similarity_mode)) - { - if (obi_view_add_column(output_view, ALI_LENGTH_COLUMN_NAME, -1, ALI_LENGTH_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, ALI_LENGTH_COLUMN_COMMENTS, true) < 0) - { - obidebug(1, "\nError creating the column for the alignment length when aligning"); - return -1; - } ali_length_column = obi_view_get_column(output_view, ALI_LENGTH_COLUMN_NAME); - } - // Create the column for the alignment score - if (normalize) - { - if (obi_view_add_column(output_view, SCORE_COLUMN_NAME, -1, SCORE_COLUMN_NAME, OBI_FLOAT, 0, 1, NULL, NULL, NULL, -1, SCORE_COLUMN_NAME, true) < 0) - { - obidebug(1, "\nError creating the column for the score when aligning"); - return -1; - } - } - else - { - if (obi_view_add_column(output_view, SCORE_COLUMN_NAME, -1, SCORE_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, SCORE_COLUMN_NAME, true) < 0) - { - obidebug(1, "\nError creating the column for the score when aligning"); - return -1; - } - } score_column = obi_view_get_column(output_view, SCORE_COLUMN_NAME); - if (print_seq) { - // Create the column for the first sequences aligned - if (obi_view_add_column(output_view, SEQ1_COLUMN_NAME, -1, SEQ1_COLUMN_NAME, OBI_SEQ, 0, 1, NULL, (iseq_column->header)->indexer_name, NULL, -1, SEQ1_COLUMN_COMMENTS, true) < 0) - { - obidebug(1, "\nError creating the first column for the sequences when aligning"); - return -1; - } seq1_column = obi_view_get_column(output_view, SEQ1_COLUMN_NAME); - - // Create the column for the second sequences aligned - if (obi_view_add_column(output_view, SEQ2_COLUMN_NAME, -1, SEQ2_COLUMN_NAME, OBI_SEQ, 0, 1, NULL, (iseq_column->header)->indexer_name, NULL, -1, SEQ2_COLUMN_COMMENTS, true) < 0) - { - obidebug(1, "\nError creating the second column for the sequences when aligning"); - return -1; - } seq2_column = obi_view_get_column(output_view, SEQ2_COLUMN_NAME); } // if (print_count) // TODO count columns not implemented yet // { -// // 
Create the column for the count of the first sequences aligned -// if (obi_view_add_column(output_view, COUNT1_COLUMN_NAME, -1, COUNT1_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, COUNT1_COLUMN_COMMENTS, true) < 0) -// { -// obidebug(1, "\nError creating the first column for the sequence counts when aligning"); -// return -1; -// } // count1_column = obi_view_get_column(seq_view, COUNT1_COLUMN_NAME); -// -// // Create the column for the count of the second sequences aligned -// if (obi_view_add_column(output_view, COUNT2_COLUMN_NAME, -1, COUNT2_COLUMN_NAME, OBI_INT, 0, 1, NULL, NULL, NULL, -1, COUNT2_COLUMN_COMMENTS, true) < 0) -// { -// obidebug(1, "\nError creating the second column for the sequence counts when aligning"); -// return -1; -// } // count2_column = obi_view_get_column(seq_view, COUNT2_COLUMN_NAME); // } - // Build kmer tables ktable = hash_seq_column(seq_view, iseq_column, seq_elt_idx); if (ktable == NULL) @@ -301,100 +571,20 @@ int obi_lcs_align_one_column(OBIDMS_p dms, const char* seq_view_name, const char if ((score >= 0) && (((normalize || similarity_mode) && (score >= threshold)) || ((!similarity_mode && !normalize) && (score <= threshold)))) { // Print result // TODO make separate function maybe - // Write line indices of the input view in the output view (to easily refer to the input sequences from the output view) - if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, idx1_column, k, 0, i) < 0) - { - obidebug(1, "\nError writing idx1 in a column"); - return -1; - } - if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, idx2_column, k, 0, j) < 0) - { - obidebug(1, "\nError writing idx2 in a column"); - return -1; - } - // Get ids idx id1_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0); // TODO Could there be multiple IDs per line? 
id2_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, id_column, j, 0); - // Write ids in output view - if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, id1_column, k, 0, id1_idx) < 0) - { - obidebug(1, "\nError writing id1 in a column"); + if (print_alignment_result(output_view, k, + idx1_column, idx2_column, i, j, + id1_column, id2_column, id1_idx, id2_idx, + print_seq, seq1_column, seq2_column, seq1_idx, seq2_idx, + //print_count, count1_column, count2_column, count1, count2, + ali_length_column, ali_length, + lcs_length_column, lcs_length, + score_column, score, + reference, normalize, similarity_mode) < 0) return -1; - } - if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, id2_column, k, 0, id2_idx) < 0) - { - obidebug(1, "\nError writing id2 in a column"); - return -1; - } - - // Write the sequences if needed - if (print_seq) - { - if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, seq1_column, k, 0, seq1_idx) < 0) - { - obidebug(1, "\nError writing seq1 in a column"); - return -1; - } - - if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, seq2_column, k, 0, seq2_idx) < 0) - { - obidebug(1, "\nError writing seq2 in a column"); - return -1; - } - } - -// // Write the counts if needed // TODO count columns not implemented yet -// if (print_count) -// { -// if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, count1_column, k, 0, count1) < 0) -// { -// obidebug(1, "\nError writing count1 in a column"); -// return -1; -// } -// -// if (obi_set_index_with_elt_idx_and_col_p_in_view(output_view, count2_column, k, 0, count2) < 0) -// { -// obidebug(1, "\nError writing count2 in a column"); -// return -1; -// } -// } - - // Write the alignment length if it was computed - if ((reference == ALILEN) && (normalize || !similarity_mode)) - { - if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, ali_length_column, k, 0, ali_length) < 0) - { - obidebug(1, "\nError writing alignment length in a column"); 
- return -1; - } - } - - // Write the LCS length - if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, lcs_length_column, k, 0, lcs_length) < 0) - { - obidebug(1, "\nError writing LCS length in a column"); - return -1; - } - - // Write score - if (normalize) - { - if (obi_set_float_with_elt_idx_and_col_p_in_view(output_view, score_column, k, 0, (obifloat_t) score) < 0) - { - obidebug(1, "\nError writing alignment score in a column"); - return -1; - } - } - else - { - if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, score_column, k, 0, (obiint_t) score) < 0) - { - obidebug(1, "\nError writing alignment score in a column"); - return -1; - } - } k++; } From 191c83aafc5cbac301269323dbc41fa22f2455a7 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Thu, 15 Dec 2016 15:28:34 +0100 Subject: [PATCH 02/22] Added missing *.cfiles --- python/obitools3/commands/lcs.cfiles | 65 +++++++++++++++++++ python/obitools3/obidms/capi/obialign.cfiles | 65 +++++++++++++++++++ python/obitools3/obidms/capi/obidms.cfiles | 65 +++++++++++++++++++ .../obitools3/obidms/capi/obidmscolumn.cfiles | 65 +++++++++++++++++++ python/obitools3/obidms/capi/obierrno.cfiles | 65 +++++++++++++++++++ .../obitools3/obidms/capi/obitaxonomy.cfiles | 65 +++++++++++++++++++ python/obitools3/obidms/capi/obitypes.cfiles | 65 +++++++++++++++++++ python/obitools3/obidms/capi/obiutils.cfiles | 65 +++++++++++++++++++ python/obitools3/obidms/capi/obiview.cfiles | 65 +++++++++++++++++++ 9 files changed, 585 insertions(+) create mode 100644 python/obitools3/commands/lcs.cfiles create mode 100644 python/obitools3/obidms/capi/obialign.cfiles create mode 100644 python/obitools3/obidms/capi/obidms.cfiles create mode 100644 python/obitools3/obidms/capi/obidmscolumn.cfiles create mode 100644 python/obitools3/obidms/capi/obierrno.cfiles create mode 100644 python/obitools3/obidms/capi/obitaxonomy.cfiles create mode 100644 python/obitools3/obidms/capi/obitypes.cfiles create mode 100644 
python/obitools3/obidms/capi/obiutils.cfiles create mode 100644 python/obitools3/obidms/capi/obiview.cfiles diff --git a/python/obitools3/commands/lcs.cfiles b/python/obitools3/commands/lcs.cfiles new file mode 100644 index 0000000..84e0436 --- /dev/null +++ b/python/obitools3/commands/lcs.cfiles @@ -0,0 +1,65 @@ +../../../src/bloom.h +../../../src/bloom.c +../../../src/char_str_indexer.h +../../../src/char_str_indexer.c +../../../src/crc64.h +../../../src/crc64.c +../../../src/dna_seq_indexer.h +../../../src/dna_seq_indexer.c +../../../src/encode.h +../../../src/encode.c +../../../src/hashtable.h +../../../src/hashtable.c +../../../src/murmurhash2.h +../../../src/murmurhash2.c +../../../src/obi_align.h +../../../src/obi_align.c +../../../src/obiavl.h +../../../src/obiavl.c +../../../src/obiblob_indexer.h +../../../src/obiblob_indexer.c +../../../src/obiblob.h +../../../src/obiblob.c +../../../src/obidebug.h +../../../src/obidms_taxonomy.h +../../../src/obidms_taxonomy.c +../../../src/obidms.h +../../../src/obidms.c +../../../src/obidmscolumn_blob.c +../../../src/obidmscolumn_blob.h +../../../src/obidmscolumn_bool.c +../../../src/obidmscolumn_bool.h +../../../src/obidmscolumn_char.c +../../../src/obidmscolumn_char.h +../../../src/obidmscolumn_float.c +../../../src/obidmscolumn_float.h +../../../src/obidmscolumn_idx.h +../../../src/obidmscolumn_idx.c +../../../src/obidmscolumn_int.c +../../../src/obidmscolumn_int.h +../../../src/obidmscolumn_qual.h +../../../src/obidmscolumn_qual.c +../../../src/obidmscolumn_seq.c +../../../src/obidmscolumn_seq.h +../../../src/obidmscolumn_str.c +../../../src/obidmscolumn_str.h +../../../src/obidmscolumn.h +../../../src/obidmscolumn.c +../../../src/obidmscolumndir.h +../../../src/obidmscolumndir.c +../../../src/obierrno.h +../../../src/obierrno.c +../../../src/obilittlebigman.h +../../../src/obilittlebigman.c +../../../src/obitypes.h +../../../src/obitypes.c +../../../src/obiview.h +../../../src/obiview.c 
+../../../src/sse_banded_LCS_alignment.h +../../../src/sse_banded_LCS_alignment.c +../../../src/uint8_indexer.h +../../../src/uint8_indexer.c +../../../src/upperband.h +../../../src/upperband.c +../../../src/utils.h +../../../src/utils.c diff --git a/python/obitools3/obidms/capi/obialign.cfiles b/python/obitools3/obidms/capi/obialign.cfiles new file mode 100644 index 0000000..3bbdbcb --- /dev/null +++ b/python/obitools3/obidms/capi/obialign.cfiles @@ -0,0 +1,65 @@ +../../../../src/bloom.h +../../../../src/bloom.c +../../../../src/char_str_indexer.h +../../../../src/char_str_indexer.c +../../../../src/crc64.h +../../../../src/crc64.c +../../../../src/dna_seq_indexer.h +../../../../src/dna_seq_indexer.c +../../../../src/encode.h +../../../../src/encode.c +../../../../src/hashtable.h +../../../../src/hashtable.c +../../../../src/murmurhash2.h +../../../../src/murmurhash2.c +../../../../src/obi_align.h +../../../../src/obi_align.c +../../../../src/obiavl.h +../../../../src/obiavl.c +../../../../src/obiblob_indexer.h +../../../../src/obiblob_indexer.c +../../../../src/obiblob.h +../../../../src/obiblob.c +../../../../src/obidebug.h +../../../../src/obidms_taxonomy.h +../../../../src/obidms_taxonomy.c +../../../../src/obidms.h +../../../../src/obidms.c +../../../../src/obidmscolumn_blob.c +../../../../src/obidmscolumn_blob.h +../../../../src/obidmscolumn_bool.c +../../../../src/obidmscolumn_bool.h +../../../../src/obidmscolumn_char.c +../../../../src/obidmscolumn_char.h +../../../../src/obidmscolumn_float.c +../../../../src/obidmscolumn_float.h +../../../../src/obidmscolumn_idx.h +../../../../src/obidmscolumn_idx.c +../../../../src/obidmscolumn_int.c +../../../../src/obidmscolumn_int.h +../../../../src/obidmscolumn_qual.h +../../../../src/obidmscolumn_qual.c +../../../../src/obidmscolumn_seq.c +../../../../src/obidmscolumn_seq.h +../../../../src/obidmscolumn_str.c +../../../../src/obidmscolumn_str.h +../../../../src/obidmscolumn.h +../../../../src/obidmscolumn.c 
+../../../../src/obidmscolumndir.h +../../../../src/obidmscolumndir.c +../../../../src/obierrno.h +../../../../src/obierrno.c +../../../../src/obilittlebigman.h +../../../../src/obilittlebigman.c +../../../../src/obitypes.h +../../../../src/obitypes.c +../../../../src/obiview.h +../../../../src/obiview.c +../../../../src/sse_banded_LCS_alignment.h +../../../../src/sse_banded_LCS_alignment.c +../../../../src/uint8_indexer.h +../../../../src/uint8_indexer.c +../../../../src/upperband.h +../../../../src/upperband.c +../../../../src/utils.h +../../../../src/utils.c diff --git a/python/obitools3/obidms/capi/obidms.cfiles b/python/obitools3/obidms/capi/obidms.cfiles new file mode 100644 index 0000000..3bbdbcb --- /dev/null +++ b/python/obitools3/obidms/capi/obidms.cfiles @@ -0,0 +1,65 @@ +../../../../src/bloom.h +../../../../src/bloom.c +../../../../src/char_str_indexer.h +../../../../src/char_str_indexer.c +../../../../src/crc64.h +../../../../src/crc64.c +../../../../src/dna_seq_indexer.h +../../../../src/dna_seq_indexer.c +../../../../src/encode.h +../../../../src/encode.c +../../../../src/hashtable.h +../../../../src/hashtable.c +../../../../src/murmurhash2.h +../../../../src/murmurhash2.c +../../../../src/obi_align.h +../../../../src/obi_align.c +../../../../src/obiavl.h +../../../../src/obiavl.c +../../../../src/obiblob_indexer.h +../../../../src/obiblob_indexer.c +../../../../src/obiblob.h +../../../../src/obiblob.c +../../../../src/obidebug.h +../../../../src/obidms_taxonomy.h +../../../../src/obidms_taxonomy.c +../../../../src/obidms.h +../../../../src/obidms.c +../../../../src/obidmscolumn_blob.c +../../../../src/obidmscolumn_blob.h +../../../../src/obidmscolumn_bool.c +../../../../src/obidmscolumn_bool.h +../../../../src/obidmscolumn_char.c +../../../../src/obidmscolumn_char.h +../../../../src/obidmscolumn_float.c +../../../../src/obidmscolumn_float.h +../../../../src/obidmscolumn_idx.h +../../../../src/obidmscolumn_idx.c +../../../../src/obidmscolumn_int.c 
+../../../../src/obidmscolumn_int.h +../../../../src/obidmscolumn_qual.h +../../../../src/obidmscolumn_qual.c +../../../../src/obidmscolumn_seq.c +../../../../src/obidmscolumn_seq.h +../../../../src/obidmscolumn_str.c +../../../../src/obidmscolumn_str.h +../../../../src/obidmscolumn.h +../../../../src/obidmscolumn.c +../../../../src/obidmscolumndir.h +../../../../src/obidmscolumndir.c +../../../../src/obierrno.h +../../../../src/obierrno.c +../../../../src/obilittlebigman.h +../../../../src/obilittlebigman.c +../../../../src/obitypes.h +../../../../src/obitypes.c +../../../../src/obiview.h +../../../../src/obiview.c +../../../../src/sse_banded_LCS_alignment.h +../../../../src/sse_banded_LCS_alignment.c +../../../../src/uint8_indexer.h +../../../../src/uint8_indexer.c +../../../../src/upperband.h +../../../../src/upperband.c +../../../../src/utils.h +../../../../src/utils.c diff --git a/python/obitools3/obidms/capi/obidmscolumn.cfiles b/python/obitools3/obidms/capi/obidmscolumn.cfiles new file mode 100644 index 0000000..3bbdbcb --- /dev/null +++ b/python/obitools3/obidms/capi/obidmscolumn.cfiles @@ -0,0 +1,65 @@ +../../../../src/bloom.h +../../../../src/bloom.c +../../../../src/char_str_indexer.h +../../../../src/char_str_indexer.c +../../../../src/crc64.h +../../../../src/crc64.c +../../../../src/dna_seq_indexer.h +../../../../src/dna_seq_indexer.c +../../../../src/encode.h +../../../../src/encode.c +../../../../src/hashtable.h +../../../../src/hashtable.c +../../../../src/murmurhash2.h +../../../../src/murmurhash2.c +../../../../src/obi_align.h +../../../../src/obi_align.c +../../../../src/obiavl.h +../../../../src/obiavl.c +../../../../src/obiblob_indexer.h +../../../../src/obiblob_indexer.c +../../../../src/obiblob.h +../../../../src/obiblob.c +../../../../src/obidebug.h +../../../../src/obidms_taxonomy.h +../../../../src/obidms_taxonomy.c +../../../../src/obidms.h +../../../../src/obidms.c +../../../../src/obidmscolumn_blob.c 
+../../../../src/obidmscolumn_blob.h +../../../../src/obidmscolumn_bool.c +../../../../src/obidmscolumn_bool.h +../../../../src/obidmscolumn_char.c +../../../../src/obidmscolumn_char.h +../../../../src/obidmscolumn_float.c +../../../../src/obidmscolumn_float.h +../../../../src/obidmscolumn_idx.h +../../../../src/obidmscolumn_idx.c +../../../../src/obidmscolumn_int.c +../../../../src/obidmscolumn_int.h +../../../../src/obidmscolumn_qual.h +../../../../src/obidmscolumn_qual.c +../../../../src/obidmscolumn_seq.c +../../../../src/obidmscolumn_seq.h +../../../../src/obidmscolumn_str.c +../../../../src/obidmscolumn_str.h +../../../../src/obidmscolumn.h +../../../../src/obidmscolumn.c +../../../../src/obidmscolumndir.h +../../../../src/obidmscolumndir.c +../../../../src/obierrno.h +../../../../src/obierrno.c +../../../../src/obilittlebigman.h +../../../../src/obilittlebigman.c +../../../../src/obitypes.h +../../../../src/obitypes.c +../../../../src/obiview.h +../../../../src/obiview.c +../../../../src/sse_banded_LCS_alignment.h +../../../../src/sse_banded_LCS_alignment.c +../../../../src/uint8_indexer.h +../../../../src/uint8_indexer.c +../../../../src/upperband.h +../../../../src/upperband.c +../../../../src/utils.h +../../../../src/utils.c diff --git a/python/obitools3/obidms/capi/obierrno.cfiles b/python/obitools3/obidms/capi/obierrno.cfiles new file mode 100644 index 0000000..3bbdbcb --- /dev/null +++ b/python/obitools3/obidms/capi/obierrno.cfiles @@ -0,0 +1,65 @@ +../../../../src/bloom.h +../../../../src/bloom.c +../../../../src/char_str_indexer.h +../../../../src/char_str_indexer.c +../../../../src/crc64.h +../../../../src/crc64.c +../../../../src/dna_seq_indexer.h +../../../../src/dna_seq_indexer.c +../../../../src/encode.h +../../../../src/encode.c +../../../../src/hashtable.h +../../../../src/hashtable.c +../../../../src/murmurhash2.h +../../../../src/murmurhash2.c +../../../../src/obi_align.h +../../../../src/obi_align.c +../../../../src/obiavl.h 
+../../../../src/obiavl.c +../../../../src/obiblob_indexer.h +../../../../src/obiblob_indexer.c +../../../../src/obiblob.h +../../../../src/obiblob.c +../../../../src/obidebug.h +../../../../src/obidms_taxonomy.h +../../../../src/obidms_taxonomy.c +../../../../src/obidms.h +../../../../src/obidms.c +../../../../src/obidmscolumn_blob.c +../../../../src/obidmscolumn_blob.h +../../../../src/obidmscolumn_bool.c +../../../../src/obidmscolumn_bool.h +../../../../src/obidmscolumn_char.c +../../../../src/obidmscolumn_char.h +../../../../src/obidmscolumn_float.c +../../../../src/obidmscolumn_float.h +../../../../src/obidmscolumn_idx.h +../../../../src/obidmscolumn_idx.c +../../../../src/obidmscolumn_int.c +../../../../src/obidmscolumn_int.h +../../../../src/obidmscolumn_qual.h +../../../../src/obidmscolumn_qual.c +../../../../src/obidmscolumn_seq.c +../../../../src/obidmscolumn_seq.h +../../../../src/obidmscolumn_str.c +../../../../src/obidmscolumn_str.h +../../../../src/obidmscolumn.h +../../../../src/obidmscolumn.c +../../../../src/obidmscolumndir.h +../../../../src/obidmscolumndir.c +../../../../src/obierrno.h +../../../../src/obierrno.c +../../../../src/obilittlebigman.h +../../../../src/obilittlebigman.c +../../../../src/obitypes.h +../../../../src/obitypes.c +../../../../src/obiview.h +../../../../src/obiview.c +../../../../src/sse_banded_LCS_alignment.h +../../../../src/sse_banded_LCS_alignment.c +../../../../src/uint8_indexer.h +../../../../src/uint8_indexer.c +../../../../src/upperband.h +../../../../src/upperband.c +../../../../src/utils.h +../../../../src/utils.c diff --git a/python/obitools3/obidms/capi/obitaxonomy.cfiles b/python/obitools3/obidms/capi/obitaxonomy.cfiles new file mode 100644 index 0000000..3bbdbcb --- /dev/null +++ b/python/obitools3/obidms/capi/obitaxonomy.cfiles @@ -0,0 +1,65 @@ +../../../../src/bloom.h +../../../../src/bloom.c +../../../../src/char_str_indexer.h +../../../../src/char_str_indexer.c +../../../../src/crc64.h 
+../../../../src/crc64.c +../../../../src/dna_seq_indexer.h +../../../../src/dna_seq_indexer.c +../../../../src/encode.h +../../../../src/encode.c +../../../../src/hashtable.h +../../../../src/hashtable.c +../../../../src/murmurhash2.h +../../../../src/murmurhash2.c +../../../../src/obi_align.h +../../../../src/obi_align.c +../../../../src/obiavl.h +../../../../src/obiavl.c +../../../../src/obiblob_indexer.h +../../../../src/obiblob_indexer.c +../../../../src/obiblob.h +../../../../src/obiblob.c +../../../../src/obidebug.h +../../../../src/obidms_taxonomy.h +../../../../src/obidms_taxonomy.c +../../../../src/obidms.h +../../../../src/obidms.c +../../../../src/obidmscolumn_blob.c +../../../../src/obidmscolumn_blob.h +../../../../src/obidmscolumn_bool.c +../../../../src/obidmscolumn_bool.h +../../../../src/obidmscolumn_char.c +../../../../src/obidmscolumn_char.h +../../../../src/obidmscolumn_float.c +../../../../src/obidmscolumn_float.h +../../../../src/obidmscolumn_idx.h +../../../../src/obidmscolumn_idx.c +../../../../src/obidmscolumn_int.c +../../../../src/obidmscolumn_int.h +../../../../src/obidmscolumn_qual.h +../../../../src/obidmscolumn_qual.c +../../../../src/obidmscolumn_seq.c +../../../../src/obidmscolumn_seq.h +../../../../src/obidmscolumn_str.c +../../../../src/obidmscolumn_str.h +../../../../src/obidmscolumn.h +../../../../src/obidmscolumn.c +../../../../src/obidmscolumndir.h +../../../../src/obidmscolumndir.c +../../../../src/obierrno.h +../../../../src/obierrno.c +../../../../src/obilittlebigman.h +../../../../src/obilittlebigman.c +../../../../src/obitypes.h +../../../../src/obitypes.c +../../../../src/obiview.h +../../../../src/obiview.c +../../../../src/sse_banded_LCS_alignment.h +../../../../src/sse_banded_LCS_alignment.c +../../../../src/uint8_indexer.h +../../../../src/uint8_indexer.c +../../../../src/upperband.h +../../../../src/upperband.c +../../../../src/utils.h +../../../../src/utils.c diff --git 
a/python/obitools3/obidms/capi/obitypes.cfiles b/python/obitools3/obidms/capi/obitypes.cfiles new file mode 100644 index 0000000..3bbdbcb --- /dev/null +++ b/python/obitools3/obidms/capi/obitypes.cfiles @@ -0,0 +1,65 @@ +../../../../src/bloom.h +../../../../src/bloom.c +../../../../src/char_str_indexer.h +../../../../src/char_str_indexer.c +../../../../src/crc64.h +../../../../src/crc64.c +../../../../src/dna_seq_indexer.h +../../../../src/dna_seq_indexer.c +../../../../src/encode.h +../../../../src/encode.c +../../../../src/hashtable.h +../../../../src/hashtable.c +../../../../src/murmurhash2.h +../../../../src/murmurhash2.c +../../../../src/obi_align.h +../../../../src/obi_align.c +../../../../src/obiavl.h +../../../../src/obiavl.c +../../../../src/obiblob_indexer.h +../../../../src/obiblob_indexer.c +../../../../src/obiblob.h +../../../../src/obiblob.c +../../../../src/obidebug.h +../../../../src/obidms_taxonomy.h +../../../../src/obidms_taxonomy.c +../../../../src/obidms.h +../../../../src/obidms.c +../../../../src/obidmscolumn_blob.c +../../../../src/obidmscolumn_blob.h +../../../../src/obidmscolumn_bool.c +../../../../src/obidmscolumn_bool.h +../../../../src/obidmscolumn_char.c +../../../../src/obidmscolumn_char.h +../../../../src/obidmscolumn_float.c +../../../../src/obidmscolumn_float.h +../../../../src/obidmscolumn_idx.h +../../../../src/obidmscolumn_idx.c +../../../../src/obidmscolumn_int.c +../../../../src/obidmscolumn_int.h +../../../../src/obidmscolumn_qual.h +../../../../src/obidmscolumn_qual.c +../../../../src/obidmscolumn_seq.c +../../../../src/obidmscolumn_seq.h +../../../../src/obidmscolumn_str.c +../../../../src/obidmscolumn_str.h +../../../../src/obidmscolumn.h +../../../../src/obidmscolumn.c +../../../../src/obidmscolumndir.h +../../../../src/obidmscolumndir.c +../../../../src/obierrno.h +../../../../src/obierrno.c +../../../../src/obilittlebigman.h +../../../../src/obilittlebigman.c +../../../../src/obitypes.h +../../../../src/obitypes.c 
+../../../../src/obiview.h +../../../../src/obiview.c +../../../../src/sse_banded_LCS_alignment.h +../../../../src/sse_banded_LCS_alignment.c +../../../../src/uint8_indexer.h +../../../../src/uint8_indexer.c +../../../../src/upperband.h +../../../../src/upperband.c +../../../../src/utils.h +../../../../src/utils.c diff --git a/python/obitools3/obidms/capi/obiutils.cfiles b/python/obitools3/obidms/capi/obiutils.cfiles new file mode 100644 index 0000000..3bbdbcb --- /dev/null +++ b/python/obitools3/obidms/capi/obiutils.cfiles @@ -0,0 +1,65 @@ +../../../../src/bloom.h +../../../../src/bloom.c +../../../../src/char_str_indexer.h +../../../../src/char_str_indexer.c +../../../../src/crc64.h +../../../../src/crc64.c +../../../../src/dna_seq_indexer.h +../../../../src/dna_seq_indexer.c +../../../../src/encode.h +../../../../src/encode.c +../../../../src/hashtable.h +../../../../src/hashtable.c +../../../../src/murmurhash2.h +../../../../src/murmurhash2.c +../../../../src/obi_align.h +../../../../src/obi_align.c +../../../../src/obiavl.h +../../../../src/obiavl.c +../../../../src/obiblob_indexer.h +../../../../src/obiblob_indexer.c +../../../../src/obiblob.h +../../../../src/obiblob.c +../../../../src/obidebug.h +../../../../src/obidms_taxonomy.h +../../../../src/obidms_taxonomy.c +../../../../src/obidms.h +../../../../src/obidms.c +../../../../src/obidmscolumn_blob.c +../../../../src/obidmscolumn_blob.h +../../../../src/obidmscolumn_bool.c +../../../../src/obidmscolumn_bool.h +../../../../src/obidmscolumn_char.c +../../../../src/obidmscolumn_char.h +../../../../src/obidmscolumn_float.c +../../../../src/obidmscolumn_float.h +../../../../src/obidmscolumn_idx.h +../../../../src/obidmscolumn_idx.c +../../../../src/obidmscolumn_int.c +../../../../src/obidmscolumn_int.h +../../../../src/obidmscolumn_qual.h +../../../../src/obidmscolumn_qual.c +../../../../src/obidmscolumn_seq.c +../../../../src/obidmscolumn_seq.h +../../../../src/obidmscolumn_str.c 
+../../../../src/obidmscolumn_str.h +../../../../src/obidmscolumn.h +../../../../src/obidmscolumn.c +../../../../src/obidmscolumndir.h +../../../../src/obidmscolumndir.c +../../../../src/obierrno.h +../../../../src/obierrno.c +../../../../src/obilittlebigman.h +../../../../src/obilittlebigman.c +../../../../src/obitypes.h +../../../../src/obitypes.c +../../../../src/obiview.h +../../../../src/obiview.c +../../../../src/sse_banded_LCS_alignment.h +../../../../src/sse_banded_LCS_alignment.c +../../../../src/uint8_indexer.h +../../../../src/uint8_indexer.c +../../../../src/upperband.h +../../../../src/upperband.c +../../../../src/utils.h +../../../../src/utils.c diff --git a/python/obitools3/obidms/capi/obiview.cfiles b/python/obitools3/obidms/capi/obiview.cfiles new file mode 100644 index 0000000..3bbdbcb --- /dev/null +++ b/python/obitools3/obidms/capi/obiview.cfiles @@ -0,0 +1,65 @@ +../../../../src/bloom.h +../../../../src/bloom.c +../../../../src/char_str_indexer.h +../../../../src/char_str_indexer.c +../../../../src/crc64.h +../../../../src/crc64.c +../../../../src/dna_seq_indexer.h +../../../../src/dna_seq_indexer.c +../../../../src/encode.h +../../../../src/encode.c +../../../../src/hashtable.h +../../../../src/hashtable.c +../../../../src/murmurhash2.h +../../../../src/murmurhash2.c +../../../../src/obi_align.h +../../../../src/obi_align.c +../../../../src/obiavl.h +../../../../src/obiavl.c +../../../../src/obiblob_indexer.h +../../../../src/obiblob_indexer.c +../../../../src/obiblob.h +../../../../src/obiblob.c +../../../../src/obidebug.h +../../../../src/obidms_taxonomy.h +../../../../src/obidms_taxonomy.c +../../../../src/obidms.h +../../../../src/obidms.c +../../../../src/obidmscolumn_blob.c +../../../../src/obidmscolumn_blob.h +../../../../src/obidmscolumn_bool.c +../../../../src/obidmscolumn_bool.h +../../../../src/obidmscolumn_char.c +../../../../src/obidmscolumn_char.h +../../../../src/obidmscolumn_float.c +../../../../src/obidmscolumn_float.h 
+../../../../src/obidmscolumn_idx.h +../../../../src/obidmscolumn_idx.c +../../../../src/obidmscolumn_int.c +../../../../src/obidmscolumn_int.h +../../../../src/obidmscolumn_qual.h +../../../../src/obidmscolumn_qual.c +../../../../src/obidmscolumn_seq.c +../../../../src/obidmscolumn_seq.h +../../../../src/obidmscolumn_str.c +../../../../src/obidmscolumn_str.h +../../../../src/obidmscolumn.h +../../../../src/obidmscolumn.c +../../../../src/obidmscolumndir.h +../../../../src/obidmscolumndir.c +../../../../src/obierrno.h +../../../../src/obierrno.c +../../../../src/obilittlebigman.h +../../../../src/obilittlebigman.c +../../../../src/obitypes.h +../../../../src/obitypes.c +../../../../src/obiview.h +../../../../src/obiview.c +../../../../src/sse_banded_LCS_alignment.h +../../../../src/sse_banded_LCS_alignment.c +../../../../src/uint8_indexer.h +../../../../src/uint8_indexer.c +../../../../src/upperband.h +../../../../src/upperband.c +../../../../src/utils.h +../../../../src/utils.c From 490f5fe6b9990740e6ed2610d2a9c3c2a107345e Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Fri, 16 Dec 2016 19:04:21 +0100 Subject: [PATCH 03/22] Updated deprecated code in cython API for columns (using line count of view instead of column) --- python/obitools3/obidms/_obidms.pyx | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/python/obitools3/obidms/_obidms.pyx b/python/obitools3/obidms/_obidms.pyx index c35cd14..7f86c59 100644 --- a/python/obitools3/obidms/_obidms.pyx +++ b/python/obitools3/obidms/_obidms.pyx @@ -100,17 +100,17 @@ cdef class OBIDMS_column : def __getitem__(self, index_t line_nb): return self.get_line(line_nb) - def __len__(self): - return self.lines_used + def __len__(self): # TODO discuss + return self._view.line_count def __sizeof__(self): return ((self._pointer)[0].header.header_size + (self._pointer)[0].header.data_size) - def __iter__(self): + def __iter__(self): # TODO discuss # Declarations cdef index_t line_nb # Yield each 
line - for line_nb in range(self.lines_used): + for line_nb in range(self._view.line_count): yield self.get_line(line_nb) def __str__(self) : @@ -160,11 +160,6 @@ cdef class OBIDMS_column : def version(self): return ((self._pointer)[0].header).version - # lines_used property getter - @property - def lines_used(self): - return (self._pointer)[0].header.lines_used - # comments property getter @property def comments(self): From 303bd6f4454458670d5c1c5b34291ddc24f925c4 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Fri, 16 Dec 2016 19:10:18 +0100 Subject: [PATCH 04/22] Added function to build kmer table for 2 columns, and fixed bug (with line count) when building kmer table of one column --- src/upperband.c | 42 +++++++++++++++++++++++++++++++++++++++--- src/upperband.h | 4 ++++ 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/src/upperband.c b/src/upperband.c index 9eba088..548d09e 100644 --- a/src/upperband.c +++ b/src/upperband.c @@ -8,8 +8,6 @@ #include "obidmscolumn.h" #include "obiview.h" -//#include "../libutils/utilities.h" -//#include "../libfasta/sequence.h" inline static uchar_v hash4m128(uchar_v frag) @@ -242,7 +240,7 @@ Kmer_table_p hash_seq_column(Obiview_p view, OBIDMS_column_p seq_col, index_t se fprintf(stderr,"Building kmer tables..."); - seq_count = (seq_col->header)->lines_used; + seq_count = (view->infos)->line_count; // Allocate memory for the table structure ktable = (Kmer_table_p) malloc(sizeof(Kmer_table_t) * seq_count); @@ -267,6 +265,44 @@ Kmer_table_p hash_seq_column(Obiview_p view, OBIDMS_column_p seq_col, index_t se } +Kmer_table_p hash_two_seq_columns(Obiview_p view1, OBIDMS_column_p seq1_col, index_t seq1_idx, + Obiview_p view2, OBIDMS_column_p seq2_col, index_t seq2_idx) +{ + size_t seq1_count; + size_t seq2_count; + Kmer_table_p ktable1; + Kmer_table_p ktable2; + Kmer_table_p ktable; + + seq1_count = (view1->infos)->line_count; + seq2_count = (view2->infos)->line_count; + + // Build the two tables then concatenate 
them + ktable1 = hash_seq_column(view1, seq1_col, seq1_idx); + if (ktable1 == NULL) + return NULL; + ktable2 = hash_seq_column(view2, seq2_col, seq2_idx); + if (ktable2 == NULL) + return NULL; + + // Realloc to hold the 2 tables + ktable = realloc(ktable1, sizeof(Kmer_table_t) * (seq1_count + seq2_count)); + if (ktable == NULL) + { + free_kmer_tables(ktable2, seq2_count); + return NULL; + } + + // Concatenate + memcpy(ktable+seq1_count, ktable2, sizeof(Kmer_table_t) * seq2_count); + + // Free copied table + free(ktable2); + + return ktable; +} + + void free_kmer_tables(Kmer_table_p ktable, size_t count) { size_t i; diff --git a/src/upperband.h b/src/upperband.h index f378287..087ac8b 100644 --- a/src/upperband.h +++ b/src/upperband.h @@ -18,7 +18,11 @@ typedef struct { } Kmer_table_t, *Kmer_table_p; +// TODO doc + Kmer_table_p hash_seq_column(Obiview_p view, OBIDMS_column_p seq_col, index_t seq_idx); +Kmer_table_p hash_two_seq_columns(Obiview_p view1, OBIDMS_column_p seq1_col, index_t seq1_idx, + Obiview_p view2, OBIDMS_column_p seq2_col, index_t seq2_idx); void align_filters(Kmer_table_p ktable, Obi_blob_p seq1, Obi_blob_p seq2, index_t idx1, index_t idx2, double threshold, bool normalize, int reference, bool similarity_mode, double* score, int* LCSmin, bool can_be_identical); void free_kmer_tables(Kmer_table_p ktable, size_t count); From d99447c12b2505ce96d874cf2e65a63bb1884777 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Fri, 16 Dec 2016 19:39:02 +0100 Subject: [PATCH 05/22] C function for LCS alignment of two columns, and optimized and fixed line count bug in function to align one column --- src/obi_align.c | 466 +++++++++++++++++++++++++++++++++++++++--------- src/obi_align.h | 61 ++++++- 2 files changed, 430 insertions(+), 97 deletions(-) diff --git a/src/obi_align.c b/src/obi_align.c index a23e27f..97f6d4c 100644 --- a/src/obi_align.c +++ b/src/obi_align.c @@ -1,12 +1,12 @@ /**************************************************************************** 
- * Sequence alignment functions * + * LCS sequence alignment functions * ****************************************************************************/ /** * @file obi_align.c * @author Celine Mercier * @date May 4th 2016 - * @brief Functions handling sequence alignments. + * @brief Functions handling LCS sequence alignments. */ @@ -407,7 +407,7 @@ int obi_lcs_align_one_column(OBIDMS_p dms, const char* seq_view_name, const char Obiview_p seq_view = NULL; Obiview_p output_view = NULL; OBIDMS_column_p iseq_column = NULL; - OBIDMS_column_p id_column; + OBIDMS_column_p id_column = NULL; OBIDMS_column_p id1_column = NULL; OBIDMS_column_p id2_column = NULL; OBIDMS_column_p seq1_column = NULL; @@ -451,6 +451,14 @@ int obi_lcs_align_one_column(OBIDMS_p dms, const char* seq_view_name, const char return -1; } + // Check column type + if ((iseq_column->header)->returned_data_type != OBI_SEQ) + { + obi_set_errno(OBI_ALIGN_ERROR); + obidebug(1, "\nError: column given to align is not an OBI_SEQ column"); + return -1; + } + // Get element index of the sequence to align in each line to compute it only once if ((strcmp(seq_elt_name, "") != 0) && (seq_elt_name != NULL)) { @@ -527,21 +535,30 @@ int obi_lcs_align_one_column(OBIDMS_p dms, const char* seq_view_name, const char return -1; } - seq_count = (iseq_column->header)->lines_used; + seq_count = (seq_view->infos)->line_count; for (i=0; i < (seq_count - 1); i++) { if (i%100 == 0) fprintf(stderr,"\rDone : %f %% ", (i / (float) seq_count)*100); + // Get first id idx + id1_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0); // TODO Could there be multiple IDs per line? 
+ // Get first sequence and its index + seq1_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, i, seq_elt_idx); + blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, i, seq_elt_idx); + if (blob1 == NULL) + { + obidebug(1, "\nError retrieving sequences to align"); + return -1; + } + for (j=i+1; j < seq_count; j++) { - blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, i, seq_elt_idx); - blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, j, seq_elt_idx); - seq1_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, i, seq_elt_idx); + // Get second sequence and its index seq2_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, j, seq_elt_idx); - - if ((blob1 == NULL) || (blob2 == NULL)) + blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq_view, iseq_column, j, seq_elt_idx); + if (blob2 == NULL) { obidebug(1, "\nError retrieving sequences to align"); return -1; @@ -569,10 +586,9 @@ int obi_lcs_align_one_column(OBIDMS_p dms, const char* seq_view_name, const char } if ((score >= 0) && (((normalize || similarity_mode) && (score >= threshold)) || ((!similarity_mode && !normalize) && (score <= threshold)))) - { // Print result // TODO make separate function maybe + { // Print result - // Get ids idx - id1_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, id_column, i, 0); // TODO Could there be multiple IDs per line? 
+ // Get second id idx id2_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq_view, id_column, j, 0); if (print_alignment_result(output_view, k, @@ -609,82 +625,354 @@ int obi_lcs_align_one_column(OBIDMS_p dms, const char* seq_view_name, const char } -// TODO discuss if 2 input views or 2 columns or both possible -//int obi_align_two_columns(Obiview_p seq_view, OBIDMS_column_p seq_column_1, OBIDMS_column_p seq_column_2, // TODO it's implied both seq columns are in the same view but maybe it shouldn't -// Obiview_p score_view, OBIDMS_column_p score_column, -// double threshold, bool normalize, int reference, bool similarity_mode) -//{ -// index_t i, j, k; -// index_t seq_count_1; -// index_t seq_count_2; -// char* seq1; -// char* seq2; -// double score; -// -// k = 0; -// -// if (((seq_column_1->header)->returned_data_type != OBI_SEQ) || ((seq_column_2->header)->returned_data_type != OBI_SEQ)) -// { -// obi_set_errno(OBI_ALIGN_ERROR); -// obidebug(1, "\nTrying to align a column of a different type than OBI_SEQ"); -// return -1; -// } -// -// if ((normalize && ((score_column->header)->returned_data_type != OBI_FLOAT)) || -// (!normalize && ((score_column->header)->returned_data_type != OBI_INT))) -// { -// obi_set_errno(OBI_ALIGN_ERROR); -// obidebug(1, "\nTrying to store alignment scores in a column of an inappropriate type"); -// return -1; -// } -// -// seq_count_1 = (seq_column_1->header)->lines_used; -// seq_count_2 = (seq_column_2->header)->lines_used; -// -// for (i=0; i < (seq_count_1 - 1); i++) -// { -// for (j=0; j < seq_count_2; j++) -// { -// //fprintf(stderr, "\ni=%lld, j=%lld, k=%lld", i, j, k); -// -// seq1 = obi_get_seq_with_elt_idx_and_col_p_in_view(seq_view, seq_column_1, i, 0); -// seq2 = obi_get_seq_with_elt_idx_and_col_p_in_view(seq_view, seq_column_2, j, 0); -// -// if ((seq1 == NULL) || (seq2 == NULL)) -// { -// obidebug(1, "\nError retrieving sequences to align"); -// return -1; -// } -// -// // TODO kmer filter -// -// score = 
generic_sse_banded_lcs_align(seq1, seq2, threshold, normalize, reference, similarity_mode); -// -// if (normalize) -// { -// if (obi_set_float_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obifloat_t) score) < 0) -// { -// obidebug(1, "\nError writing alignment score in a column"); -// return -1; -// } -// } -// else -// { -// if (obi_set_int_with_elt_idx_and_col_p_in_view(score_view, score_column, k, 0, (obiint_t) score) < 0) -// { -// obidebug(1, "\nError writing alignment score in a column"); -// return -1; -// } -// } -// -// free(seq1); -// free(seq2); -// -// k++; -// } -// } -// -// return 0; -//} +int obi_lcs_align_two_columns(OBIDMS_p dms, + const char* seq1_view_name, + const char* seq2_view_name, + const char* seq1_column_name, + const char* seq2_column_name, + const char* seq1_elt_name, + const char* seq2_elt_name, + const char* id1_column_name, + const char* id2_column_name, + const char* output_view_name, const char* output_view_comments, + bool print_seq, bool print_count, + double threshold, bool normalize, int reference, bool similarity_mode) +{ + index_t i, j, k; + index_t seq1_count; + index_t seq2_count; + index_t id1_idx, id2_idx; + index_t seq1_idx, seq2_idx; + double score; + int lcs_length; + int ali_length; + Kmer_table_p ktable; + Obi_blob_p blob1; + Obi_blob_p blob2; + int lcs_min; + index_t seq1_elt_idx; + index_t seq2_elt_idx; + bool same_indexer; + + Obiview_p seq1_view = NULL; + Obiview_p seq2_view = NULL; + Obiview_p output_view = NULL; + OBIDMS_column_p i_seq1_column = NULL; + OBIDMS_column_p i_seq2_column = NULL; + OBIDMS_column_p i_id1_column = NULL; + OBIDMS_column_p i_id2_column = NULL; + OBIDMS_column_p id1_column = NULL; + OBIDMS_column_p id2_column = NULL; + OBIDMS_column_p seq1_column = NULL; + OBIDMS_column_p seq2_column = NULL; + //OBIDMS_column_p count1_column = NULL; + //OBIDMS_column_p count2_column = NULL; + OBIDMS_column_p idx1_column = NULL; + OBIDMS_column_p idx2_column = NULL; + OBIDMS_column_p 
lcs_length_column = NULL; + OBIDMS_column_p ali_length_column = NULL; + OBIDMS_column_p score_column = NULL; + + k = 0; + + // Open the first input view + seq1_view = obi_open_view(dms, seq1_view_name); + if (seq1_view == NULL) + { + obidebug(1, "\nError opening the first input view to align"); + return -1; + } + + // Open the second input view. Same as 1st if "" + if (strcmp(seq2_view_name, "") == 0) + seq2_view = seq1_view; + else + { + seq2_view = obi_open_view(dms, seq2_view_name); + if (seq2_view == NULL) + { + obidebug(1, "\nError opening the second input view to align"); + return -1; + } + } + + // Open the first sequence column to align + // If a column name wasn't given, open default sequence column + if (strcmp(seq1_column_name, "") == 0) + { + if (strcmp((seq1_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0) + i_seq1_column = obi_view_get_column(seq1_view, NUC_SEQUENCE_COLUMN); + else + { + obi_set_errno(OBI_ALIGN_ERROR); + obidebug(1, "\nError: no first column given to align"); + return -1; + } + } + else + i_seq1_column = obi_view_get_column(seq1_view, seq1_column_name); + if (i_seq1_column == NULL) + { + obidebug(1, "\nError getting the first column to align"); + return -1; + } + + // Check column type + if ((i_seq1_column->header)->returned_data_type != OBI_SEQ) + { + obi_set_errno(OBI_ALIGN_ERROR); + obidebug(1, "\nError: first column given to align is not an OBI_SEQ column"); + return -1; + } + + // Open the second sequence column to align + // If a column name wasn't given, open default sequence column + if (strcmp(seq2_column_name, "") == 0) + { + if (strcmp((seq2_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0) + i_seq2_column = obi_view_get_column(seq2_view, NUC_SEQUENCE_COLUMN); + else + { + obi_set_errno(OBI_ALIGN_ERROR); + obidebug(1, "\nError: no second column given to align"); + return -1; + } + } + else + i_seq2_column = obi_view_get_column(seq2_view, seq2_column_name); + if (i_seq2_column == NULL) + { + obidebug(1, "\nError getting 
the second column to align"); + return -1; + } + // Check that the sequence columns are not both the default NUC_SEQ column of the same view + if (i_seq1_column == i_seq2_column) + { + obidebug(1, "\nError: trying to align a column with itself (default NUC_SEQ column of the same view)"); + return -1; + } + + // Check column type + if ((i_seq2_column->header)->returned_data_type != OBI_SEQ) + { + obi_set_errno(OBI_ALIGN_ERROR); + obidebug(1, "\nError: second column given to align is not an OBI_SEQ column"); + return -1; + } + + // Get element index of the sequence to align in each line of the first column to compute it only once + if ((strcmp(seq1_elt_name, "") != 0) && (seq1_elt_name != NULL)) + { + seq1_elt_idx = obi_column_get_element_index_from_name(i_seq1_column, seq1_elt_name); + if (seq1_elt_idx == OBIIdx_NA) + { + obidebug(1, "\nError getting the sequence index in a column line when aligning"); + return -1; + } + } + else + seq1_elt_idx = 0; + + // Get element index of the sequence to align in each line of the second column to compute it only once + if ((strcmp(seq2_elt_name, "") != 0) && (seq2_elt_name != NULL)) + { + seq2_elt_idx = obi_column_get_element_index_from_name(i_seq2_column, seq2_elt_name); + if (seq2_elt_idx == OBIIdx_NA) + { + obidebug(1, "\nError getting the sequence index in a column line when aligning"); + return -1; + } + } + else + seq2_elt_idx = 0; + // Open the first ID column, containing the identifiers of the first sequence to align + // If a column name wasn't given, open default ID column + if (strcmp(id1_column_name, "") == 0) + { + if (strcmp((seq1_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0) + i_id1_column = obi_view_get_column(seq1_view, ID_COLUMN); + else + { + obi_set_errno(OBI_ALIGN_ERROR); + obidebug(1, "\nError: no first ID column given"); + return -1; + } + } + else + i_id1_column = obi_view_get_column(seq1_view, id1_column_name); + if (i_id1_column == NULL) + { + obidebug(1, "\nError getting the first ID column"); + 
return -1; + } + + // Open the second ID column, containing the identifiers of the second sequence to align + // If a column name wasn't given, open default ID column + if (strcmp(id2_column_name, "") == 0) + { + if (strcmp((seq2_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0) + i_id2_column = obi_view_get_column(seq2_view, ID_COLUMN); + else + { + obi_set_errno(OBI_ALIGN_ERROR); + obidebug(1, "\nError: no second ID column given"); + return -1; + } + } + else + i_id2_column = obi_view_get_column(seq2_view, id2_column_name); + if (i_id2_column == NULL) + { + obidebug(1, "\nError getting the second ID column"); + return -1; + } + + // Create the output view + output_view = obi_new_view(dms, output_view_name, NULL, NULL, output_view_comments); + if (output_view == NULL) + { + obidebug(1, "\nError creating the output view when aligning"); + return -1; + } + + // Create the output columns + if (create_alignment_output_columns(output_view, + (i_id1_column->header)->indexer_name, (i_id2_column->header)->indexer_name, + (i_seq1_column->header)->indexer_name, (i_seq2_column->header)->indexer_name, + print_seq, print_count, normalize, reference, similarity_mode) < 0) + return -1; + id1_column = obi_view_get_column(output_view, ID1_COLUMN_NAME); + id2_column = obi_view_get_column(output_view, ID2_COLUMN_NAME); + idx1_column = obi_view_get_column(output_view, IDX1_COLUMN_NAME); + idx2_column = obi_view_get_column(output_view, IDX2_COLUMN_NAME); + lcs_length_column = obi_view_get_column(output_view, LCS_LENGTH_COLUMN_NAME); + if ((reference == ALILEN) && (normalize || !similarity_mode)) + ali_length_column = obi_view_get_column(output_view, ALI_LENGTH_COLUMN_NAME); + score_column = obi_view_get_column(output_view, SCORE_COLUMN_NAME); + if (print_seq) + { + seq1_column = obi_view_get_column(output_view, SEQ1_COLUMN_NAME); + seq2_column = obi_view_get_column(output_view, SEQ2_COLUMN_NAME); + } +// if (print_count) // TODO count columns not implemented yet +// { +// 
count1_column = obi_view_get_column(seq_view, COUNT1_COLUMN_NAME); +// count2_column = obi_view_get_column(seq_view, COUNT2_COLUMN_NAME); +// } + + // Check if the sequence columns share the same indexer (allows for quick checking of sequence equality) + if (strcmp((i_seq1_column->header)->indexer_name, (i_seq2_column->header)->indexer_name) == 0) + same_indexer = true; + else + same_indexer = false; + + // Build kmer tables + ktable = hash_two_seq_columns(seq1_view, i_seq1_column, seq1_elt_idx, seq2_view, i_seq2_column, seq2_elt_idx); + if (ktable == NULL) + { + obi_set_errno(OBI_ALIGN_ERROR); + obidebug(1, "\nError building kmer tables before aligning"); + return -1; + } + + seq1_count = (seq1_view->infos)->line_count; + seq2_count = (seq2_view->infos)->line_count; + + for (i=0; i < seq1_count; i++) + { + if (i%100 == 0) + fprintf(stderr,"\rDone : %f %% ", (i / (float) seq1_count)*100); + + // Get id index of first sequence + id1_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq1_view, i_id1_column, i, 0); // TODO Could there be multiple IDs per line? 
+ // Get first sequence and its index + seq1_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq1_view, i_seq1_column, i, seq1_elt_idx); + blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq1_view, i_seq1_column, i, seq1_elt_idx); + if (blob1 == NULL) + { + obidebug(1, "\nError retrieving sequences to align"); + return -1; + } + + for (j=0; j < seq2_count; j++) + { + // Get second sequence and its index + seq2_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq2_view, i_seq2_column, j, seq2_elt_idx); + blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(seq2_view, i_seq2_column, j, seq2_elt_idx); + if (blob2 == NULL) + { + obidebug(1, "\nError retrieving sequences to align"); + return -1; + } + + // Check if the sequences are identical in a quick way (same index in the same indexer) + if (same_indexer && (seq1_idx == seq2_idx)) + { + if (similarity_mode && normalize) + score = 1.0; + else if (!similarity_mode) + score = 0.0; + else + score = blob1->length_decoded_value; + } + + else // the sequences aren't identical + { + // kmer filter (offset for the index of the kmer table of the 2nd sequence because the kmer tables of the 2 sequence columns are concatenated in one) + align_filters(ktable, blob1, blob2, i, seq1_count+j, threshold, normalize, reference, similarity_mode, &score, &lcs_min, false); + + // Compute alignment score + if ((threshold == 0) || (score == -1.0)) // no threshold, or filter passed: align + score = obiblob_sse_banded_lcs_align(blob1, blob2, threshold, normalize, reference, similarity_mode, &lcs_length, &ali_length); + } + + if ((score >= 0) && (((normalize || similarity_mode) && (score >= threshold)) || ((!similarity_mode && !normalize) && (score <= threshold)))) + { // Print result + + // Get second id idx + id2_idx = obi_get_index_with_elt_idx_and_col_p_in_view(seq2_view, i_id2_column, j, 0); + + if (print_alignment_result(output_view, k, + idx1_column, idx2_column, i, j, + id1_column, id2_column, id1_idx, id2_idx, + print_seq, 
seq1_column, seq2_column, seq1_idx, seq2_idx, + //print_count, count1_column, count2_column, count1, count2, + ali_length_column, ali_length, + lcs_length_column, lcs_length, + score_column, score, + reference, normalize, similarity_mode) < 0) + return -1; + + k++; + } + } + } + + // Close views + if (seq2_view != seq1_view) + { + if (obi_close_view(seq2_view) < 0) + { + obidebug(1, "\nError closing the second input view after aligning"); + return -1; + } + } + if (obi_close_view(seq1_view) < 0) + { + obidebug(1, "\nError closing the first input view after aligning"); + return -1; + } + + if (obi_close_view(output_view) < 0) + { + obidebug(1, "\nError closing the output view after aligning"); + return -1; + } + + free_kmer_tables(ktable, seq1_count + seq2_count); + + return 0; +} + diff --git a/src/obi_align.h b/src/obi_align.h index c0d823c..68048bd 100644 --- a/src/obi_align.h +++ b/src/obi_align.h @@ -1,12 +1,12 @@ /**************************************************************************** - * Sequence alignment functions header file * + * LCS sequence alignment functions header file * ****************************************************************************/ /** * @file obi_align.h * @author Celine Mercier * @date May 11th 2016 - * @brief Header file for the functions handling the alignment of DNA sequences. + * @brief Header file for the functions handling the LCS alignment of DNA sequences. */ @@ -55,7 +55,7 @@ /** - * @brief Aligns a NUC_SEQ column with itself. + * @brief Aligns an OBI_SEQ column with itself. * * Note: The columns where the results are written are automatically named and created. * @@ -96,14 +96,59 @@ int obi_lcs_align_one_column(OBIDMS_p dms, /** - * @brief + * @brief Aligns two OBI_SEQ columns. * - * TODO + * The columns must belong to the same OBIDMS, but can belong to different views. * + * Note: The columns where the results are written are automatically named and created. + * + * @param dms A pointer on an OBIDMS. 
+ * @param seq1_view_name The name of the view where the first column to align is. + * @param seq2_view_name The name of the view where the second column to align is ("" if it is the same view as the first one). + * @param seq1_column_name The name of the first OBI_SEQ column in the input view to align. + * If "" (empty string), and the input view is of type NUC_SEQS_VIEW, the associated "NUC_SEQ" column is aligned. + * @param seq2_column_name The name of the second OBI_SEQ column in the input view to align. + * If "" (empty string), and the input view is of type NUC_SEQS_VIEW, the associated "NUC_SEQ" column is aligned. + * @param seq1_elt_name The name of the element in the first column corresponding to the sequence to align, if the column has multiple + * elements per line. + * @param seq2_elt_name The name of the element in the second column corresponding to the sequence to align, if the column has multiple + * elements per line. + * @param id1_column_name The name of the column in the first input view containing the identifiers of the first sequence to align. + * If "" (empty string), and the input view is of type NUC_SEQS_VIEW, the associated "ID" column is aligned. + * @param id2_column_name The name of the column in the second input view containing the identifiers of the second sequence to align. + * If "" (empty string), and the input view is of type NUC_SEQS_VIEW, the associated "ID" column is aligned. + * @param output_view_name The name of the output view where the results should be written (should not already exist). + * @param output_view_comments The comments that should be associated with the output view. + * @param print_seq A boolean indicating whether the aligned sequences should be copied in the output view. + * @param print_count A boolean indicating whether the aligned sequence counts should be copied in the output view. + * @param threshold Score threshold. If the score is normalized and expressed in similarity, it is an identity, e.g. 
0.95 + * for an identity of 95%. If the score is normalized and expressed in distance, it is (1.0 - identity), + * e.g. 0.05 for an identity of 95%. If the score is not normalized and expressed in similarity, it is + * the length of the Longest Common Subsequence. If the score is not normalized and expressed in distance, + * it is (reference length - LCS length). Only sequence pairs with a similarity above the threshold are printed. + * @param normalize Whether the score should be normalized with the reference sequence length. + * @param reference The reference length. 0: The alignment length; 1: The longest sequence's length; 2: The shortest sequence's length. + * @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false). + * + * @returns A value indicating the success of the operation. + * @retval 0 if the operation was successfully completed. + * @retval -1 if an error occurred. + * + * @since December 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -//int obi_align_two_columns(Obiview_p seq_view, OBIDMS_column_p seq_column_1, OBIDMS_column_p seq_column_2, -// Obiview_p score_view, OBIDMS_column_p score_column, -// double threshold, bool normalize, int reference, bool similarity_mode); +int obi_lcs_align_two_columns(OBIDMS_p dms, + const char* seq1_view_name, + const char* seq2_view_name, + const char* seq1_column_name, + const char* seq2_column_name, + const char* seq1_elt_name, + const char* seq2_elt_name, + const char* id1_column_name, + const char* id2_column_name, + const char* output_view_name, const char* output_view_comments, + bool print_seq, bool print_count, + double threshold, bool normalize, int reference, bool similarity_mode); #endif /* OBI_ALIGN_H_ */ From 857a5198e4dc33c46c1abf33b819cc10dd1c01a2 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Fri, 16 Dec 2016 19:40:36 +0100 Subject: [PATCH 06/22] Updated ``obi lcs`` for the LCS alignment of two columns --- 
python/obitools3/commands/lcs.pyx | 45 ++++++++++++++++------- python/obitools3/obidms/capi/obialign.pxd | 18 +++++++++ 2 files changed, 49 insertions(+), 14 deletions(-) diff --git a/python/obitools3/commands/lcs.pyx b/python/obitools3/commands/lcs.pyx index 0a8b3b0..8a8bea9 100644 --- a/python/obitools3/commands/lcs.pyx +++ b/python/obitools3/commands/lcs.pyx @@ -4,7 +4,8 @@ from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport from obitools3.obidms._obidms cimport OBIDMS # TODO cimport doesn't work from obitools3.utils cimport str2bytes -from obitools3.obidms.capi.obialign cimport obi_lcs_align_one_column +from obitools3.obidms.capi.obialign cimport obi_lcs_align_one_column, \ + obi_lcs_align_two_columns import time @@ -161,19 +162,35 @@ cpdef align(str dms_n, cdef OBIDMS d d = OBIDMS(dms_n) - # Align 1 column (2 columns not implemented yet) - if obi_lcs_align_one_column(d._pointer, \ - str2bytes(input_view_1_n), \ - str2bytes(input_column_1_n), \ - str2bytes(input_elt_1_n), \ - str2bytes(id_column_1_n), \ - str2bytes(output_view_n), \ - str2bytes(comments), \ - print_seq, \ - print_count, \ - threshold, normalize, reference, similarity_mode) < 0 : - raise Exception("Error aligning sequences") - + if input_view_2_n == "" and input_column_2_n == "" : + if obi_lcs_align_one_column(d._pointer, \ + str2bytes(input_view_1_n), \ + str2bytes(input_column_1_n), \ + str2bytes(input_elt_1_n), \ + str2bytes(id_column_1_n), \ + str2bytes(output_view_n), \ + str2bytes(comments), \ + print_seq, \ + print_count, \ + threshold, normalize, reference, similarity_mode) < 0 : + raise Exception("Error aligning sequences") + else : + if obi_lcs_align_two_columns(d._pointer, \ + str2bytes(input_view_1_n), \ + str2bytes(input_view_2_n), \ + str2bytes(input_column_1_n), \ + str2bytes(input_column_2_n), \ + str2bytes(input_elt_1_n), \ + str2bytes(input_elt_2_n), \ + str2bytes(id_column_1_n), \ + str2bytes(id_column_2_n), \ + str2bytes(output_view_n), \ + 
str2bytes(comments), \ + print_seq, \ + print_count, \ + threshold, normalize, reference, similarity_mode) < 0 : + raise Exception("Error aligning sequences") + d.close() diff --git a/python/obitools3/obidms/capi/obialign.pxd b/python/obitools3/obidms/capi/obialign.pxd index e9c105c..e76cabe 100644 --- a/python/obitools3/obidms/capi/obialign.pxd +++ b/python/obitools3/obidms/capi/obialign.pxd @@ -20,3 +20,21 @@ cdef extern from "obi_align.h" nogil: int reference, bint similarity_mode) + + int obi_lcs_align_two_columns(OBIDMS_p dms, + const_char_p seq1_view_name, + const_char_p seq2_view_name, + const_char_p seq1_column_name, + const_char_p seq2_column_name, + const_char_p seq1_elt_name, + const_char_p seq2_elt_name, + const_char_p id1_column_name, + const_char_p id2_column_name, + const_char_p output_view_name, + const_char_p output_view_comments, + bint print_seq, + bint print_count, + double threshold, + bint normalize, + int reference, + bint similarity_mode); From 9c71b06117784f4d7a2b2f61e1045d40e72ff3d8 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Mon, 19 Dec 2016 14:36:40 +0100 Subject: [PATCH 07/22] Removed deprecated TODOs --- python/obitools3/obidms/_obidmscolumn_str.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/obitools3/obidms/_obidmscolumn_str.pyx b/python/obitools3/obidms/_obidmscolumn_str.pyx index 1401141..87a7936 100644 --- a/python/obitools3/obidms/_obidmscolumn_str.pyx +++ b/python/obitools3/obidms/_obidmscolumn_str.pyx @@ -22,7 +22,7 @@ cdef class OBIDMS_column_str(OBIDMS_column): result = None else : result = bytes2str(value) - # NOTE: value is not freed because the pointer points to a mmapped region in an AVL data file. (TODO discuss) + # NOTE: value is not freed because the pointer points to a mmapped region in an AVL data file. 
return result cpdef set_line(self, index_t line_nb, object value): @@ -46,7 +46,7 @@ cdef class OBIDMS_column_multi_elts_str(OBIDMS_column_multi_elts): result = None else : result = bytes2str(value) - # NOTE: value is not freed because the pointer points to a mmapped region in an AVL data file. (TODO discuss) + # NOTE: value is not freed because the pointer points to a mmapped region in an AVL data file. return result cpdef object get_line(self, index_t line_nb) : @@ -65,7 +65,7 @@ cdef class OBIDMS_column_multi_elts_str(OBIDMS_column_multi_elts): value_in_result = None else : value_in_result = bytes2str(value) - # NOTE: value is not freed because the pointer points to a mmapped region in an AVL data file. (TODO discuss) + # NOTE: value is not freed because the pointer points to a mmapped region in an AVL data file. result[self.elements_names[i]] = value_in_result if all_NA and (value_in_result is not None) : all_NA = False From 5c50e5b378a3240ce8ea07ef1fa2d4fce80eea79 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Tue, 20 Dec 2016 11:46:58 +0100 Subject: [PATCH 08/22] Embryo of code for openMP parallelization of LCS alignment but deactivated for now because can't make it compile with cython/clang --- python/obitools3/commands/lcs.pyx | 18 ++++++++++++++---- src/obi_align.c | 12 +++++++++++- src/obi_align.h | 3 ++- 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/python/obitools3/commands/lcs.pyx b/python/obitools3/commands/lcs.pyx index 8a8bea9..1591fd8 100644 --- a/python/obitools3/commands/lcs.pyx +++ b/python/obitools3/commands/lcs.pyx @@ -147,6 +147,13 @@ def addOptions(parser): default=False, help="Sequence counts are written in the output view. Default: they are not written.") + group.add_argument('--thread-count','-p', # TODO should probably be in a specific option group + action="store", dest="align:threadcount", + metavar='', + default=1, + type=int, + help="Number of threads to use for the computation. 
Default: one.") + cpdef align(str dms_n, str input_view_1_n, str output_view_n, @@ -157,7 +164,8 @@ cpdef align(str dms_n, double threshold=0.0, bint normalize=True, int reference=0, bint similarity_mode=True, bint print_seq=False, bint print_count=False, - comments="") : + comments="", + int thread_count=1) : cdef OBIDMS d d = OBIDMS(dms_n) @@ -172,7 +180,8 @@ cpdef align(str dms_n, str2bytes(comments), \ print_seq, \ print_count, \ - threshold, normalize, reference, similarity_mode) < 0 : + threshold, normalize, reference, similarity_mode, + thread_count) < 0 : raise Exception("Error aligning sequences") else : if obi_lcs_align_two_columns(d._pointer, \ @@ -216,8 +225,9 @@ def run(config): similarity_mode = config['align']['similarity'], \ print_seq = config['align']['printseq'], \ print_count = config['align']['printcount'], \ - comments = comments) - + comments = comments, \ + thread_count = config['align']['threadcount']) + print("Done.") diff --git a/src/obi_align.c b/src/obi_align.c index 97f6d4c..3fa3678 100644 --- a/src/obi_align.c +++ b/src/obi_align.c @@ -9,6 +9,10 @@ * @brief Functions handling LCS sequence alignments. 
*/ +//#define OMP_SUPPORT // TODO +#ifdef OMP_SUPPORT +#include +#endif #include #include @@ -389,7 +393,8 @@ int obi_lcs_align_one_column(OBIDMS_p dms, const char* seq_view_name, const char const char* id_column_name, const char* output_view_name, const char* output_view_comments, bool print_seq, bool print_count, - double threshold, bool normalize, int reference, bool similarity_mode) + double threshold, bool normalize, int reference, bool similarity_mode, + int thread_count) { index_t i, j, k; index_t seq_count; @@ -537,6 +542,11 @@ int obi_lcs_align_one_column(OBIDMS_p dms, const char* seq_view_name, const char seq_count = (seq_view->infos)->line_count; + #ifdef OMP_SUPPORT + omp_set_num_threads(thread_count); + #pragma omp parallel for + #endif + for (i=0; i < (seq_count - 1); i++) { if (i%100 == 0) diff --git a/src/obi_align.h b/src/obi_align.h index 68048bd..98da4da 100644 --- a/src/obi_align.h +++ b/src/obi_align.h @@ -92,7 +92,8 @@ int obi_lcs_align_one_column(OBIDMS_p dms, const char* id_column_name, const char* output_view_name, const char* output_view_comments, bool print_seq, bool print_count, - double threshold, bool normalize, int reference, bool similarity_mode); + double threshold, bool normalize, int reference, bool similarity_mode, + int thread_count); /** From 30e4359c8556296331e7f5ee8d70e3ab94b56f2d Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Thu, 22 Dec 2016 17:03:51 +0100 Subject: [PATCH 09/22] LCS alignment: documentation for all the lowest level functions --- src/sse_banded_LCS_alignment.c | 279 ++++++++++++++++++++++++++++++--- src/sse_banded_LCS_alignment.h | 105 ++++++++++++- 2 files changed, 352 insertions(+), 32 deletions(-) diff --git a/src/sse_banded_LCS_alignment.c b/src/sse_banded_LCS_alignment.c index 07b5ffe..8790c6b 100644 --- a/src/sse_banded_LCS_alignment.c +++ b/src/sse_banded_LCS_alignment.c @@ -1,16 +1,22 @@ -/* - * sse_banded_LCS_alignment.c - * - * Created on: 7 nov. 
2012 - * Author: celine mercier +/**************************************************************************** + * LCS alignment of two sequences * + ****************************************************************************/ + +/** + * @file sse_banded_LCS_alignment.c + * @author Celine Mercier (celine.mercier@metabarcoding.org) + * @date November 7th 2012 + * @brief Functions handling the alignment of two sequences to compute their Longest Common Subsequence. */ + #include #include #include #include #include +#include #include "obierrno.h" #include "obidebug.h" @@ -24,6 +30,231 @@ #define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) + +/************************************************************************** + * + * D E C L A R A T I O N O F T H E P R I V A T E F U N C T I O N S + * + **************************************************************************/ + + +/** + * @brief Internal function printing a 128 bits register as 8 16-bits integers. + * + * @param r The register to print. + * + * @author Eric Coissac (eric.coissac@metabarcoding.org) + */ +static void printreg(__m128i r); + + +/** + * @brief Internal function extracting a 16-bits integer from a 128 bits register. + * + * @param r The register to read. + * @param p The position at which the integer should be read (between 0 and 7). + * + * @returns The extracted integer. + * + * @author Eric Coissac (eric.coissac@metabarcoding.org) + */ +static inline int extract_reg(__m128i r, int p); + + +/** + * @brief Internal function aligning two sequences, computing the lengths of their Longest Common Subsequence and of their alignment. + * + * @warning The first argument (seq1) must correspond to the longest sequence. + * + * @param seq1 The first sequence, the longest of the two, as prepared by putSeqInSeq() or putBlobInSeq(). + * @param seq2 The second sequence, the shortest of the two, as prepared by putSeqInSeq() or putBlobInSeq(). 
+ * @param l1 The length of the first sequence. + * @param l2 The length of the second sequence. + * @param bandLengthLeft The length of the left band for the banded alignment, as computed by calculateLeftBandLength(). + * @param bandLengthTotal The length of the complete band for the banded alignment, as computed by calculateSSEBandLength(). + * @param address A pointer, aligned on a 16 bits boundary, on the int array where the initial values for the alignment length are stored, + * as prepared for the alignment by initializeAddressWithGaps(). + * @param lcs_length A pointer on the int where the LCS length will be stored. + * @param ali_length A pointer on the int where the alignment length will be stored. + * + * @since 2012 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal, int16_t* address, int* lcs_length, int* ali_length); + + +/** + * @brief Internal function aligning two sequences, computing the length of their Longest Common Subsequence (and not the alignment length). + * + * @warning The first argument (seq1) must correspond to the longest sequence. + * + * @param seq1 The first sequence, the longest of the two, as prepared by putSeqInSeq() or putBlobInSeq(). + * @param seq2 The second sequence, the shortest of the two, as prepared by putSeqInSeq() or putBlobInSeq(). + * @param l1 The length of the first sequence. + * @param l2 The length of the second sequence. + * @param bandLengthLeft The length of the left band for the banded alignment, as computed by calculateLeftBandLength(). + * @param bandLengthTotal The length of the complete band for the banded alignment, as computed by calculateSSEBandLength(). + * @param lcs_length A pointer on the int where the LCS length will be stored. 
+ * + * @since 2012 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +void sse_banded_align_just_lcs(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal, int* lcs_length); + + +/** + * @brief Internal function calculating the length of the left band for the banded alignment. + * + * @param lmax The length of the longest sequence to align. + * @param LCSmin The minimum length of the LCS to be above the chosen threshold, as computed by calculateLCSmin(). + * + * @returns The length of the left band. + * + * @since 2012 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +int calculateLeftBandLength(int lmax, int LCSmin); + + +/** + * @brief Internal function calculating the length of the right band for the banded alignment. + * + * @param lmin The length of the shortest sequence to align. + * @param LCSmin The minimum length of the LCS to be above the chosen threshold, as computed by calculateLCSmin(). + * + * @returns The length of the right band. + * + * @since 2012 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +int calculateRightBandLength(int lmin, int LCSmin); + + +/** + * @brief Internal function calculating the length of the complete band for the banded alignment. + * + * @param bandLengthRight The length of the right band for the banded alignment, as computed by calculateRightBandLength(). + * @param bandLengthLeft The length of the left band for the banded alignment, as computed by calculateLeftBandLength(). + * + * @returns The length of the complete band. + * + * @since 2012 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +int calculateSSEBandLength(int bandLengthRight, int bandLengthLeft); + + +/** + * @brief Internal function calculating the size to allocate for the int array where the alignment length will be stored in the matrix. + * + * @param maxLen The length of the longest sequence to align. 
+ * @param LCSmin The minimum length of the LCS to be above the chosen threshold, as computed by calculateLCSmin(). + * + * @returns The size to allocate in bytes. + * + * @since 2012 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +int calculateSizeToAllocate(int maxLen, int LCSmin); + + +/** + * @brief Internal function initializing the int array corresponding to a sequence to align with default values. + * + * @param seq The int array corresponding to the sequence to align, as prepared by putSeqInSeq() or putBlobInSeq(). + * @param size The number of positions to initialize. + * @param iniValue The value that the positions should be initialized to. + * + * @since 2012 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +void iniSeq(int16_t* seq, int size, int16_t iniValue); + + +/** + * @brief Internal function building the int array corresponding to a sequence to align. + * + * Each nucleotide is stored as a short int (int16_t). + * + * @param seq A pointer on the allocated int array. + * @param s A pointer on the character string corresponding to the sequence. + * @param l The length of the sequence. + * @param reverse A boolean indicating whether the sequence should be written reversed + * (for the second sequence to align). + * + * @since 2012 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +void putSeqInSeq(int16_t* seq, char* s, int l, bool reverse); + + +/** + * @brief Internal function building the int array corresponding to an obiblob containing a sequence. + * + * Each nucleotide is stored as a short int (int16_t). + * + * @param seq A pointer on the allocated int array. + * @param b A pointer on the obiblob containing the sequence. + * @param l The length of the (decoded) sequence. + * @param reverse A boolean indicating whether the sequence should be written reversed + * (for the second sequence to align). 
+ * + * @since 2012 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +void putBlobInSeq(int16_t* seq, Obi_blob_p b, int l, bool reverse); + + +/** + * @brief Internal function preparing an int array with the initial values for the alignment lengths before the alignment. + * + * The int array containing the initial alignment lengths (corresponding to the first line of the diagonalized band of the alignment matrix) + * needs to be initialized with external gap lengths before the alignment. + * + * @param address A pointer, aligned on a 16 bits boundary, on the int array where the initial values for the alignment length are to be stored. + * @param bandLengthTotal The length of the complete band for the banded alignment, as computed by calculateSSEBandLength(). + * @param bandLengthLeft The length of the left band for the banded alignment, as computed by calculateLeftBandLength(). + * @param lmax The length of the longest sequence to align. + * + * @since 2012 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +void initializeAddressWithGaps(int16_t* address, int bandLengthTotal, int bandLengthLeft, int lmax); + + +/** + * @brief Internal function aligning two sequences, computing the lengths of their Longest Common Subsequence and of their alignment. + * + * @warning The first argument (seq1) must correspond to the longest sequence. + * + * @param seq1 The first sequence, the longest of the two, as prepared by putSeqInSeq() or putBlobInSeq(). + * @param seq2 The second sequence, the shortest of the two, as prepared by putSeqInSeq() or putBlobInSeq(). + * @param l1 The length of the first sequence. + * @param l2 The length of the second sequence. + * @param normalize Whether the score should be normalized with the reference sequence length. + * @param reference The reference length. 0: The alignment length; 1: The longest sequence's length; 2: The shortest sequence's length. 
+ * @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false). + * @param address A pointer, aligned on a 16 bits boundary, on an allocated int array where the initial values for the alignment length will be stored. + * @param LCSmin The minimum length of the LCS to be above the chosen threshold, as computed by calculateLCSmin(). + * @param lcs_length A pointer on the int where the LCS length will be stored. + * @param ali_length A pointer on the int where the alignment length will be stored. + * + * @returns The alignment score (normalized according to the parameters). + * + * @since 2012 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +double sse_banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, bool normalize, int reference, bool similarity_mode, int16_t* address, int LCSmin, int* lcs_length, int* ali_length); + + + +/************************************************************************ + * + * D E F I N I T I O N O F T H E P R I V A T E F U N C T I O N S + * + ************************************************************************/ + + static void printreg(__m128i r) { int16_t a0,a1,a2,a3,a4,a5,a6,a7; @@ -61,7 +292,6 @@ static inline int extract_reg(__m128i r, int p) } -// TODO warning on length order void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal, int16_t* address, int* lcs_length, int* ali_length) { register int j; @@ -287,7 +517,6 @@ void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int } -// TODO warning on length order void sse_banded_align_just_lcs(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal, int* lcs_length) { register int j; @@ -319,7 +548,7 @@ void sse_banded_align_just_lcs(int16_t* seq1, int16_t* seq2, int l1, int l2, int // Initialisations odd_BLL = bandLengthLeft & 1; - even_BLL = !odd_BLL; + even_BLL = !odd_BLL; 
numberOfRegistersPerLine = bandLengthTotal / 8; numberOfRegistersFor3Lines = 3 * numberOfRegistersPerLine; @@ -446,15 +675,14 @@ int calculateSSEBandLength(int bandLengthRight, int bandLengthLeft) } -// TODO that's gonna be fun to doc -int calculateSizeToAllocate(int maxLen, int minLen, int LCSmin) +int calculateSizeToAllocate(int maxLen, int LCSmin) { int size; size = calculateLeftBandLength(maxLen, LCSmin); size *= 2; - size = (size & (~ (int)7)) + (( size & (int)7) ? 8:0); // Closest greater 8 multiple + size = (size & (~ (int)7)) + ((size & (int)7) ? 8:0); // Closest greater 8 multiple size *= 3; size += 16; @@ -522,13 +750,13 @@ void putBlobInSeq(int16_t* seq, Obi_blob_p b, int l, bool reverse) } -void initializeAddressWithGaps(int16_t* address, int bandLengthTotal, int bandLengthLeft, int l1) +void initializeAddressWithGaps(int16_t* address, int bandLengthTotal, int bandLengthLeft, int lmax) { int i; int address_00, x_address_10, address_01, address_01_shifted; int numberOfRegistersPerLine; int bm; - int value=INT16_MAX-l1; + int value=INT16_MAX-lmax; numberOfRegistersPerLine = bandLengthTotal / 8; bm = bandLengthLeft%2; @@ -556,7 +784,6 @@ void initializeAddressWithGaps(int16_t* address, int bandLengthTotal, int bandLe } -// TODO warning on length order double sse_banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, bool normalize, int reference, bool similarity_mode, int16_t* address, int LCSmin, int* lcs_length, int* ali_length) { double id; @@ -610,10 +837,14 @@ double sse_banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, bool n -// PUBLIC FUNCTIONS +/********************************************************************** + * + * D E F I N I T I O N O F T H E P U B L I C F U N C T I O N S + * + **********************************************************************/ -int calculateLCSmin(int l1, int l2, double threshold, bool normalize, int reference, bool similarity_mode) +int calculateLCSmin(int lmax, int lmin, double threshold, bool 
normalize, int reference, bool similarity_mode) { int LCSmin; @@ -622,16 +853,16 @@ int calculateLCSmin(int l1, int l2, double threshold, bool normalize, int refere if (normalize) { if (reference == MINLEN) - LCSmin = threshold*l2; + LCSmin = threshold*lmin; else // ref = maxlen or alilen - LCSmin = threshold*l1; + LCSmin = threshold*lmax; } else if (similarity_mode) LCSmin = threshold; else if (reference == MINLEN) // not similarity_mode - LCSmin = l2 - threshold; + LCSmin = lmin - threshold; else // not similarity_mode and ref = maxlen or alilen - LCSmin = l1 - threshold; + LCSmin = lmax - threshold; } else LCSmin = 0; @@ -679,7 +910,7 @@ double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bo // Allocate space for matrix band if the alignment length must be computed if ((reference == ALILEN) && (normalize || !similarity_mode)) // cases in which alignment length must be computed { - sizeToAllocateForBand = calculateSizeToAllocate(lmax, lmin, LCSmin); + sizeToAllocateForBand = calculateSizeToAllocate(lmax, LCSmin); address = obi_get_memory_aligned_on_16(sizeToAllocateForBand, &shift); if (address == NULL) { @@ -774,13 +1005,13 @@ double obiblob_sse_banded_lcs_align(Obi_blob_p seq1, Obi_blob_p seq2, double thr // Allocate space for matrix band if the alignment length must be computed if ((reference == ALILEN) && (normalize || !similarity_mode)) // cases in which alignment length must be computed { - sizeToAllocateForBand = calculateSizeToAllocate(lmax, lmin, LCSmin); + sizeToAllocateForBand = calculateSizeToAllocate(lmax, LCSmin); address = obi_get_memory_aligned_on_16(sizeToAllocateForBand, &shift); if (address == NULL) { obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError getting a memory address aligned on 16 bytes boundary"); - return 0; // TODO DOUBLE_MIN + obidebug(1, "\nError getting a memory address aligned on a 16 bits boundary"); + return 0; // TODO DOUBLE_MIN to flag error } } diff --git a/src/sse_banded_LCS_alignment.h 
b/src/sse_banded_LCS_alignment.h index 23f3358..f46bf94 100644 --- a/src/sse_banded_LCS_alignment.h +++ b/src/sse_banded_LCS_alignment.h @@ -1,10 +1,15 @@ -/* - * sse_banded_LCS_alignment.h - * - * Created on: november 29, 2012 - * Author: mercier +/**************************************************************************** + * LCS alignment of two sequences header file * + ****************************************************************************/ + +/** + * @file sse_banded_LCS_alignment.h + * @author Celine Mercier (celine.mercier@metabarcoding.org) + * @date November 7th 2012 + * @brief header file for the functions handling the alignment of two sequences to compute their Longest Common Sequence. */ + #ifndef SSE_BANDED_LCS_ALIGNMENT_H_ #define SSE_BANDED_LCS_ALIGNMENT_H_ @@ -15,13 +20,97 @@ #include "obiblob.h" -#define ALILEN (0) // TODO enum +/** + * @brief Macros for reference lengths to use when aligning. + * + * @since 2012 + * @author Eric Coissac (eric.coissac@metabarcoding.org) + */ +#define ALILEN (0) #define MAXLEN (1) #define MINLEN (2) -// TODO doc -int calculateLCSmin(int l1, int l2, double threshold, bool normalize, int reference, bool lcsmode); + +/** + * @brief Function calculating the minimum length of the Longest Common Subsequence between two sequences to be above a chosen score threshold. + * + * @warning The first argument (lmax) must correspond to length of the longest sequence. + * + * @param lmax The length of the longest sequence to align. + * @param lmin The length of the shortest sequence to align. + * @param threshold Score threshold. If the score is normalized and expressed in similarity, it is an identity, e.g. 0.95 + * for an identity of 95%. If the score is normalized and expressed in distance, it is (1.0 - identity), + * e.g. 0.05 for an identity of 95%. If the score is not normalized and expressed in similarity, it is + * the length of the Longest Common Subsequence. 
If the score is not normalized and expressed in distance, + * it is (reference length - LCS length). Only sequence pairs with a similarity above the threshold are printed. + * @param normalize Whether the score should be normalized with the reference sequence length. + * @param reference The reference length. 0: The alignment length; 1: The longest sequence's length; 2: The shortest sequence's length. // TODO + * @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false). + * + * @returns The minimum length of the Longest Common Subsequence between two sequences to be above the chosen score threshold. + * + * @since 2012 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +int calculateLCSmin(int lmax, int lmin, double threshold, bool normalize, int reference, bool similarity_mode); + + +/** + * @brief Function aligning two sequences. + * + * The alignment algorithm is a banded global alignment algorithm, a modified version of the classical Needleman and Wunsch algorithm, + * and uses indices based on the length of the Longest Common Subsequence between the two sequences. + * + * Note: the sequences do not need to be ordered (e.g. with the longest sequence as first argument). + * + * @param seq1 A pointer on the character string corresponding to the first sequence. + * @param seq2 A pointer on the character string corresponding to the second sequence. + * @param threshold Score threshold. If the score is normalized and expressed in similarity, it is an identity, e.g. 0.95 + * for an identity of 95%. If the score is normalized and expressed in distance, it is (1.0 - identity), + * e.g. 0.05 for an identity of 95%. If the score is not normalized and expressed in similarity, it is + * the length of the Longest Common Subsequence. If the score is not normalized and expressed in distance, + * it is (reference length - LCS length). Only sequence pairs with a similarity above the threshold are printed. 
+ * @param normalize Whether the score should be normalized with the reference sequence length. + * @param reference The reference length. 0: The alignment length; 1: The longest sequence's length; 2: The shortest sequence's length. // TODO + * @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false). + * @param lcs_length A pointer on the int where the LCS length will be stored. + * @param ali_length A pointer on the int where the alignment length will be stored. + * + * @returns The alignment score (normalized according to the parameters). + * + * @since 2012 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bool normalize, int reference, bool similarity_mode, int* lcs_length, int* ali_length); + + +/** + * @brief Function aligning two sequences encoded in obiblobs. + * + * The alignment algorithm is a banded global alignment algorithm, a modified version of the classical Needleman and Wunsch algorithm, + * and uses indices based on the length of the Longest Common Subsequence between the two sequences. + * + * Note: the obiblobs do not need to be ordered (e.g. with the obiblob containing the longest sequence as first argument). + * + * @param seq1 A pointer on the blob containing the first sequence. + * @param seq2 A pointer on the blob containing the second sequence. + * @param threshold Score threshold. If the score is normalized and expressed in similarity, it is an identity, e.g. 0.95 + * for an identity of 95%. If the score is normalized and expressed in distance, it is (1.0 - identity), + * e.g. 0.05 for an identity of 95%. If the score is not normalized and expressed in similarity, it is + * the length of the Longest Common Subsequence. If the score is not normalized and expressed in distance, + * it is (reference length - LCS length). Only sequence pairs with a similarity above the threshold are printed. 
+ * @param normalize Whether the score should be normalized with the reference sequence length. + * @param reference The reference length. 0: The alignment length; 1: The longest sequence's length; 2: The shortest sequence's length. // TODO + * @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false). + * @param lcs_length A pointer on the int where the LCS length will be stored. + * @param ali_length A pointer on the int where the alignment length will be stored. + * + * @returns The alignment score (normalized according to the parameters). + * + * @since December 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ double obiblob_sse_banded_lcs_align(Obi_blob_p seq1, Obi_blob_p seq2, double threshold, bool normalize, int reference, bool similarity_mode, int* lcs_length, int* ali_length); + #endif From 8e92bf6dacd5543d8120eac970cee2eeb4c0bff8 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Thu, 22 Dec 2016 17:06:23 +0100 Subject: [PATCH 10/22] LCS alignment: it is now checked that sequences are not longer than what a 16 bits integer can code for (as the LCS and alignment lengths are kept in 16 bits registers) --- src/sse_banded_LCS_alignment.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/sse_banded_LCS_alignment.c b/src/sse_banded_LCS_alignment.c index 8790c6b..eab489b 100644 --- a/src/sse_banded_LCS_alignment.c +++ b/src/sse_banded_LCS_alignment.c @@ -900,6 +900,14 @@ double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bo lmin = l1; } + // Check that the sequences are not greater than what can be aligned using the 16 bits registers (as the LCS and alignment lengths are kept on 16 bits) + if (lmax > SHRT_MAX) + { + obi_set_errno(OBI_ALIGN_ERROR); + obidebug(1, "\nError: can not align sequences longer than %d (as the LCS and alignment lengths are kept on 16 bits)", SHRT_MAX); + return 0; // TODO DOUBLE_MIN to flag error + } + // If the score is 
expressed as a normalized distance, get the corresponding identity if (!similarity_mode && normalize) threshold = 1.0 - threshold; @@ -995,6 +1003,14 @@ double obiblob_sse_banded_lcs_align(Obi_blob_p seq1, Obi_blob_p seq2, double thr lmin = l1; } + // Check that the sequences are not greater than what can be aligned using the 16 bits registers (as the LCS and alignment lengths are kept on 16 bits) + if (lmax > SHRT_MAX) + { + obi_set_errno(OBI_ALIGN_ERROR); + obidebug(1, "\nError: can not align sequences longer than %d (as the LCS and alignment lengths are kept on 16 bits)", SHRT_MAX); + return 0; // TODO DOUBLE_MIN to flag error + } + // If the score is expressed as a normalized distance, get the corresponding identity if (!similarity_mode && normalize) threshold = 1.0 - threshold; From 897032387f5159b9d8e6d19cc94a4ac8be77887c Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Thu, 5 Jan 2017 14:28:36 +0100 Subject: [PATCH 11/22] Taxonomy: reading merged.dmp file in taxdump --- src/obidms_taxonomy.c | 226 ++++++++++++++++++++++++++++++++++++++++-- src/obidms_taxonomy.h | 35 +++++-- 2 files changed, 240 insertions(+), 21 deletions(-) diff --git a/src/obidms_taxonomy.c b/src/obidms_taxonomy.c index 3eafd85..e096a8c 100644 --- a/src/obidms_taxonomy.c +++ b/src/obidms_taxonomy.c @@ -1041,6 +1041,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) FILE* file; bool nodes_found=false; bool names_found=false; + bool merged_found=false; char line[2048]; // TODO large enough? 
char* elt; char* file_name; @@ -1049,9 +1050,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) int n; char** rank_names; int* parent_taxids; - int taxid; + int taxid, old_taxid; bool already_in; - ecotx_t* t; + ecotx_t* t; // Initialize taxonomy structure tax = (OBIDMS_taxonomy_p) malloc(sizeof(OBIDMS_taxonomy_t)); @@ -1061,9 +1062,10 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) obidebug(1, "\nError allocating the memory for a taxonomy structure"); return NULL; } - tax->ranks = NULL; - tax->taxa = NULL; - tax->names = NULL; + tax->ranks = NULL; + tax->taxa = NULL; + tax->names = NULL; + tax->merged_idx = NULL; tax->dms = NULL; (tax->tax_name)[0] = '\0'; @@ -1312,6 +1314,204 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) closedir(tax_dir); + // Go through directory again for next file // TODO make separate functions? + tax_dir = opendir(taxdump); + if (tax_dir == NULL) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nProblem opening a taxdump directory"); + obi_close_taxonomy(tax); + free(parent_taxids); + free(rank_names); + return NULL; + } + + // Go through taxonomy files + while ((dp = readdir(tax_dir)) != NULL) + { + if (strcmp(dp->d_name, "merged.dmp") == 0) + { + merged_found = true; // TODO + buffer_size = 10000; + + // Initializing the merged structure + tax->merged_idx = (ecomergedidx_t*) malloc(sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size); + if (tax->merged_idx == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating the memory for a taxonomy structure"); + obi_close_taxonomy(tax); + free(parent_taxids); + free(rank_names); + closedir(tax_dir); + return NULL; + } + + // Allocating the memory for the file name + file_name = (char*) malloc((strlen(taxdump) + 12)*sizeof(char)); + if (file_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating the memory for a file name"); + obi_close_taxonomy(tax); + free(parent_taxids); + free(rank_names); + 
closedir(tax_dir); + return NULL; + } + + // Build the file path + if (sprintf(file_name, "%s/merged.dmp", taxdump) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError building a taxonomy file name"); + obi_close_taxonomy(tax); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + free(file_name); + return NULL; + } + + file = fopen(file_name, "r"); + if (file == NULL) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nProblem opening a taxonomy file"); + obi_close_taxonomy(tax); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + free(file_name); + return NULL; + } + + free(file_name); + + n = 0; + i = 0; + while (fgets(line, sizeof(line), file)) + { + // Check for terminal '\n' character (line complete) + if (line[strlen(line) - 1] != '\n') + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError: line buffer size not large enough for line in taxonomy file"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + return NULL; + } + + // Parse the 2 elements separated by '|' + + // Get first element + elt = strtok(line, "|"); + // Remove the last character (tab character) + elt[strlen(elt)-1] = '\0'; + // First element: old deprecated taxid + old_taxid = atoi(elt); + + // Get 2nd element: new taxid + elt = strtok(NULL, "|"); + // Remove the first and the last characters (tab characters) + elt = elt+1; + elt[strlen(elt)-1] = '\0'; + taxid = atoi(elt); + + // Store the old taxid in the merged_idx ordered taxid list + // First, store the taxids from the current taxonomy that come before + while ((i < (tax->taxa)->count) && ((tax->taxa)->taxon[i].taxid < old_taxid)) + { + // Enlarge structures if needed + if (n == buffer_size) + { + buffer_size = buffer_size * 2; + tax->merged_idx = (ecomergedidx_t*) realloc(tax->merged_idx, sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size); + if (tax->merged_idx == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + 
obidebug(1, "\nError reallocating memory for a taxonomy structure"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + return NULL; + } + } + + (tax->merged_idx)->merged[n].taxid = (tax->taxa)->taxon[i].taxid; + (tax->merged_idx)->merged[n].idx = i; + i++; + n++; + } + + // Enlarge structures if needed + if (n == buffer_size) + { + buffer_size = buffer_size * 2; + tax->merged_idx = (ecomergedidx_t*) realloc(tax->merged_idx, sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size); + if (tax->merged_idx == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating memory for a taxonomy structure"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + return NULL; + } + } + + // Store the deprecated taxid with the index that refers to the new taxid + // Find the index of the new taxid + t = obi_taxo_get_taxon_with_taxid(tax, taxid); + // Store the old taxid with the index + (tax->merged_idx)->merged[n].taxid = old_taxid; + (tax->merged_idx)->merged[n].idx = t->idx; + n++; + } + + // Check that fgets stopped because it reached EOF + if (!feof(file)) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError: file reading was stopped before end of file"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + return NULL; + } + + // Store count + (tax->merged_idx)->count = n; + + // Truncate the structure memory to the right size + tax->merged_idx = (ecomergedidx_t*) realloc(tax->merged_idx, sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * (tax->merged_idx)->count); + if (tax->merged_idx == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating memory for a a taxonomy structure"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + return NULL; + } + + fclose(file); + } + } + closedir(tax_dir); 
+ + // Go through directory again for next file tax_dir = opendir(taxdump); if (tax_dir == NULL) @@ -1346,7 +1546,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) } // Allocating the memory for the file name - file_name = (char*) malloc((strlen(taxdump) + 10)*sizeof(char)); + file_name = (char*) malloc((strlen(taxdump) + 11)*sizeof(char)); if (file_name == NULL) { obi_set_errno(OBI_MALLOC_ERROR); @@ -1684,7 +1884,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) (tax->taxa)->buffer_size = (tax->taxa)->count; - // Compute longest branches TODO what is this for??? + // Compute longest branches for (i=0; i < (tax->taxa)->count; i++) { t = (((tax->taxa))->taxon)+i; @@ -1844,9 +2044,10 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo return NULL; } - tax->ranks = NULL; - tax->taxa = NULL; - tax->names = NULL; + tax->ranks = NULL; + tax->taxa = NULL; + tax->names = NULL; + tax->merged_idx = NULL; tax->dms = dms; @@ -2028,6 +2229,11 @@ int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy) free(taxonomy->taxa); } + if (taxonomy->merged_idx) + { + free(taxonomy->merged_idx); + } + free(taxonomy); } diff --git a/src/obidms_taxonomy.h b/src/obidms_taxonomy.h index e70e892..543a257 100644 --- a/src/obidms_taxonomy.h +++ b/src/obidms_taxonomy.h @@ -26,12 +26,12 @@ typedef struct { int32_t rank; int32_t parent; int32_t name_length; - char name[1]; + char name[]; } ecotxformat_t; typedef struct ecotxnode { - int32_t taxid; + int32_t taxid; // TODO discuss that this is will be the current taxid even if the struct was accessed through a deprecated one int32_t rank; int32_t farest; int32_t idx; @@ -47,13 +47,13 @@ typedef struct { int32_t local_count; int32_t max_taxid; int32_t buffer_size; - ecotx_t taxon[1]; + ecotx_t taxon[]; } ecotxidx_t; typedef struct { int32_t count; - char* label[1]; + char* label[]; } ecorankidx_t; @@ -62,7 +62,7 @@ typedef struct { int32_t name_length; int32_t class_length; int32_t taxid; // taxid idx 
- char names[1]; + char names[]; } econameformat_t; @@ -76,16 +76,29 @@ typedef struct { typedef struct { int32_t count; - econame_t names[1]; + econame_t names[]; } econameidx_t; +typedef struct { + int32_t taxid; + int32_t idx; +} ecomerged_t; + + +typedef struct { + int32_t count; + ecomerged_t merged[]; +} ecomergedidx_t; + + typedef struct OBIDMS_taxonomy_t { - char tax_name[TAX_NAME_LEN]; - OBIDMS_p dms; - ecorankidx_t* ranks; - econameidx_t* names; - ecotxidx_t* taxa; + char tax_name[TAX_NAME_LEN]; + OBIDMS_p dms; + ecomergedidx_t* merged_idx; + ecorankidx_t* ranks; + econameidx_t* names; + ecotxidx_t* taxa; } OBIDMS_taxonomy_t, *OBIDMS_taxonomy_p; From f396625f981dfe9c38e4b97dd90384dc80fd1f1d Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Thu, 5 Jan 2017 15:37:13 +0100 Subject: [PATCH 12/22] Taxonomy: function to write *.adx files --- src/obidms_taxonomy.c | 99 ++++++++++++++++++++++++++++++++++++++++++- src/obidms_taxonomy.h | 5 --- 2 files changed, 98 insertions(+), 6 deletions(-) diff --git a/src/obidms_taxonomy.c b/src/obidms_taxonomy.c index e096a8c..bf9d773 100644 --- a/src/obidms_taxonomy.c +++ b/src/obidms_taxonomy.c @@ -996,6 +996,101 @@ int write_nameidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name } +int write_mergedidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? 
+{ + int i; + char* file_name; + int file_descriptor; + off_t file_size; + char* taxonomy_path; + + // Compute file size + file_size = sizeof(int32_t) + (sizeof(int32_t) * 3 * (tax->merged_idx)->count); + + // Build the taxonomy directory path + taxonomy_path = get_taxonomy_path(dms, taxonomy_name); + + file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 6)*sizeof(char)); + if (file_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating the memory for a binary taxonomy file name"); + return -1; + } + + // Build the file path + if (sprintf(file_name, "%s/%s.adx", taxonomy_path, taxonomy_name) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError building a binary taxonomy file name"); + return -1; + } + + free(taxonomy_path); + + // Create file + file_descriptor = open(file_name, O_RDWR | O_CREAT | O_EXCL, 0777); + if (file_descriptor < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError creating a binary taxonomy file %s", file_name); + free(file_name); + return -1; + } + + free(file_name); + + // Truncate the file to the right size + if (ftruncate(file_descriptor, file_size) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError truncating a binary taxonomy file"); + close(file_descriptor); + return -1; + } + + // Write merged indices count + if (write(file_descriptor, &((tax->merged_idx)->count), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t))) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError writing in a binary taxonomy file"); + close(file_descriptor); + return -1; + } + + // Write merged indices + for (i=0; i < (tax->merged_idx)->count; i++) + { + // Write taxid + if (write(file_descriptor, &(((tax->merged_idx)->merged)[i].taxid), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t))) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError writing in a binary taxonomy file"); + close(file_descriptor); + return -1; + } + + // Write index 
corresponding to the taxid in the ecotxidx_t structure + if (write(file_descriptor, &(((tax->merged_idx)->merged)[i].idx), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t))) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError writing in a binary taxonomy file"); + close(file_descriptor); + return -1; + } + } + + // Close file + if (close(file_descriptor) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError closing a taxonomy file file"); + return -1; + } + + return 0; +} + int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name) { @@ -1024,6 +1119,8 @@ int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name return -1; if (write_nameidx(dms, tax, tax_name) < 0) return -1; + if (write_mergedidx(dms, tax, tax_name) < 0) + return -1; // Check if there are local taxa (if so last taxon is local) if ((tax->taxa)->local_count > 0) if (write_local_taxonomy_idx(dms, tax, tax_name) < 0) @@ -1331,7 +1428,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { if (strcmp(dp->d_name, "merged.dmp") == 0) { - merged_found = true; // TODO + merged_found = true; buffer_size = 10000; // Initializing the merged structure diff --git a/src/obidms_taxonomy.h b/src/obidms_taxonomy.h index 543a257..48d0293 100644 --- a/src/obidms_taxonomy.h +++ b/src/obidms_taxonomy.h @@ -122,11 +122,6 @@ ecotx_t* obi_taxo_get_kingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy); ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy); - -int write_rankidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name); -int write_taxonomyidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name); -int write_nameidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name); - int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name); OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump); From d68374018bf43eedee8cb3e1500a350b6f308694 Mon Sep 17 00:00:00 2001 From: 
Celine Mercier Date: Fri, 6 Jan 2017 15:52:21 +0100 Subject: [PATCH 13/22] Taxonomy: functions to read the *.adx file (containing the deprecated and current taxids and their corresponding indices in the taxa structure) and to find the taxa using the merged index. --- src/obidms_taxonomy.c | 140 +++++++++++++++++++++++++++++++++++++++--- src/obidms_taxonomy.h | 1 + 2 files changed, 132 insertions(+), 9 deletions(-) diff --git a/src/obidms_taxonomy.c b/src/obidms_taxonomy.c index bf9d773..2779699 100644 --- a/src/obidms_taxonomy.c +++ b/src/obidms_taxonomy.c @@ -35,7 +35,7 @@ int cmp_rank_labels(const void* label1, const void* label2) } -static int cmp_taxids(const void* ptaxid, const void* ptaxon) +static int cmp_taxids_in_ecotx_t(const void* ptaxid, const void* ptaxon) { ecotx_t* current_taxon = (ecotx_t*) ptaxon; int32_t taxid = (int32_t) ((size_t) ptaxid); @@ -43,6 +43,14 @@ static int cmp_taxids(const void* ptaxid, const void* ptaxon) } +static int cmp_taxids_in_ecomerged_t(const void* ptaxid, const void* ptaxon) +{ + ecomerged_t* current_taxon = (ecomerged_t*) ptaxon; + int32_t taxid = (int32_t) ((size_t) ptaxid); + return taxid - current_taxon->taxid; +} + + static int cmp_str(const void* s1, const void* s2) { return strcmp(*((char**)s1), *((char**)s2)); @@ -467,6 +475,55 @@ econameidx_t* read_nameidx(const char *file_name, OBIDMS_taxonomy_p taxonomy) +ecomergedidx_t* read_mergedidx(const char *file_name, OBIDMS_taxonomy_p taxonomy) +{ + int32_t count; + FILE* f; + ecomergedidx_t* index_merged_idx; + ecomerged_t* merged_idx; + int32_t i; + int32_t record_length; + + f = open_ecorecorddb(file_name, &count, 0); + if (f == NULL) + { + obidebug(1, "\nError reading taxonomy name file"); + return NULL; + } + + index_merged_idx = (ecomergedidx_t*) malloc(sizeof(ecomergedidx_t) + (sizeof(ecomerged_t) * count)); + if (index_merged_idx == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reading taxonomy name file"); + return NULL; + } + + 
index_merged_idx->count = count; + + for (i=0; i < count; i++) + { + merged_idx = read_ecorecord(f, &record_length); + memcpy((index_merged_idx->merged)+i, merged_idx, record_length); + if ((index_merged_idx->merged)+i == NULL) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError reading taxonomy name file"); + free(index_merged_idx); + return NULL; + } + } + + fclose(f); + + return index_merged_idx; +} + + + + + + // Functions to write taxonomy structure to binary files @@ -1003,6 +1060,7 @@ int write_mergedidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_na int file_descriptor; off_t file_size; char* taxonomy_path; + int32_t record_size; // Compute file size file_size = sizeof(int32_t) + (sizeof(int32_t) * 3 * (tax->merged_idx)->count); @@ -1058,9 +1116,20 @@ int write_mergedidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_na return -1; } + record_size = 2 * sizeof(int32_t); + // Write merged indices for (i=0; i < (tax->merged_idx)->count; i++) { + // Write record size + if (write(file_descriptor, &(record_size), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t))) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError writing in a binary taxonomy file"); + close(file_descriptor); + return -1; + } + // Write taxid if (write(file_descriptor, &(((tax->merged_idx)->merged)[i].taxid), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t))) { @@ -1566,7 +1635,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) // Store the deprecated taxid with the index that refers to the new taxid // Find the index of the new taxid - t = obi_taxo_get_taxon_with_taxid(tax, taxid); + t = obi_taxo_get_taxon_with_current_taxid(tax, taxid); // Store the old taxid with the index (tax->merged_idx)->merged[n].taxid = old_taxid; (tax->merged_idx)->merged[n].idx = t->idx; @@ -1966,7 +2035,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) // Associate the taxa with their parent for (i=0; i < (tax->taxa)->count; i++) { - 
((tax->taxa)->taxon)[i].parent = obi_taxo_get_taxon_with_taxid(tax, parent_taxids[i]); + ((tax->taxa)->taxon)[i].parent = obi_taxo_get_taxon_with_current_taxid(tax, parent_taxids[i]); if (((tax->taxa)->taxon)[i].parent == NULL) { obi_set_errno(OBI_TAXONOMY_ERROR); @@ -2129,6 +2198,7 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo char* taxonomy_path; char* ranks_file_name; char* taxa_file_name; + char* merged_idx_file_name; char* local_taxa_file_name; char* alter_names_file_name; int buffer_size; @@ -2238,6 +2308,35 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo free(taxa_file_name); free(local_taxa_file_name); + // Read merged index (old and current taxids referring to indices in the taxa structure) + merged_idx_file_name = (char*) malloc(buffer_size*sizeof(char)); + if (merged_idx_file_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for merged index file name"); + free(taxonomy_path); + obi_close_taxonomy(tax); + return NULL; + } + if (snprintf(merged_idx_file_name, buffer_size, "%s/%s.adx", taxonomy_path, taxonomy_name) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError building merged index file name"); + free(taxonomy_path); + free(merged_idx_file_name); + obi_close_taxonomy(tax); + return NULL; + } + tax->merged_idx = read_mergedidx(merged_idx_file_name, tax); + if (tax->merged_idx == NULL) + { + free(taxonomy_path); + free(merged_idx_file_name); + obi_close_taxonomy(tax); + return NULL; + } + free(merged_idx_file_name); + // Read alternative names if (read_alternative_names) { @@ -2363,10 +2462,10 @@ ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx) } -ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid) +ecotx_t* obi_taxo_get_taxon_with_current_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid) // TODO discuss keeping private? 
{ - ecotx_t *current_taxon; - int32_t count; + ecotx_t *current_taxon; + int32_t count; count = (taxonomy->taxa)->count; @@ -2374,12 +2473,35 @@ ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid (const void *) taxonomy->taxa->taxon, count, sizeof(ecotx_t), - cmp_taxids); + cmp_taxids_in_ecotx_t); return current_taxon; } -bool obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid) +ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid) +{ + ecotx_t *current_taxon; + ecomerged_t *indexed_taxon; + int32_t count; + + count = (taxonomy->merged_idx)->count; + + indexed_taxon = (ecomerged_t*) bsearch((const void *) ((size_t) taxid), + (const void *) taxonomy->merged_idx->merged, + count, + sizeof(ecomerged_t), + cmp_taxids_in_ecomerged_t); + + if (indexed_taxon == NULL) + current_taxon = NULL; + else + current_taxon = (taxonomy->taxa->taxon)+(indexed_taxon->idx); + + return current_taxon; +} + + +bool obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid) // TODO discuss that this doesn't work with deprecated taxids { ecotx_t* next_parent; @@ -2486,7 +2608,7 @@ ecotx_t* obi_taxo_get_kingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy) ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy) { static OBIDMS_taxonomy_p tax = NULL; - static int32_t rankindex = -1; + static int32_t rankindex = -1; if (taxonomy && (tax != taxonomy)) { diff --git a/src/obidms_taxonomy.h b/src/obidms_taxonomy.h index 48d0293..33d8aba 100644 --- a/src/obidms_taxonomy.h +++ b/src/obidms_taxonomy.h @@ -108,6 +108,7 @@ int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy); ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx); +ecotx_t* obi_taxo_get_taxon_with_current_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid); ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid); bool obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid); From 
41ad3deec039f899feff8b785f6212999cbf2597 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Mon, 9 Jan 2017 17:28:49 +0100 Subject: [PATCH 14/22] Taxonomy: informations about deleted taxids is now read from delnodes.dmp file and added to *.adx file --- src/obidms_taxonomy.c | 258 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 228 insertions(+), 30 deletions(-) diff --git a/src/obidms_taxonomy.c b/src/obidms_taxonomy.c index 2779699..c70f788 100644 --- a/src/obidms_taxonomy.c +++ b/src/obidms_taxonomy.c @@ -1208,12 +1208,15 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) bool nodes_found=false; bool names_found=false; bool merged_found=false; + bool delnodes_found=false; + int32_t* delnodes=NULL; + int32_t delnodes_count; char line[2048]; // TODO large enough? char* elt; char* file_name; int buffer_size; int i, j; - int n; + int n, nD, nT; char** rank_names; int* parent_taxids; int taxid, old_taxid; @@ -1495,14 +1498,14 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) // Go through taxonomy files while ((dp = readdir(tax_dir)) != NULL) { - if (strcmp(dp->d_name, "merged.dmp") == 0) + if (strcmp(dp->d_name, "delnodes.dmp") == 0) { - merged_found = true; + delnodes_found = true; buffer_size = 10000; - // Initializing the merged structure - tax->merged_idx = (ecomergedidx_t*) malloc(sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size); - if (tax->merged_idx == NULL) + // Initializing the list of deleted nodes + delnodes = (int32_t*) malloc(sizeof(int32_t) * buffer_size); + if (delnodes == NULL) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError allocating the memory for a taxonomy structure"); @@ -1522,6 +1525,156 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) obi_close_taxonomy(tax); free(parent_taxids); free(rank_names); + free(delnodes); + closedir(tax_dir); + return NULL; + } + + // Build the file path + if (sprintf(file_name, "%s/delnodes.dmp", taxdump) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + 
obidebug(1, "\nError building a taxonomy file name"); + obi_close_taxonomy(tax); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + free(file_name); + free(delnodes); + return NULL; + } + + file = fopen(file_name, "r"); + if (file == NULL) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nProblem opening a taxonomy file"); + obi_close_taxonomy(tax); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + free(file_name); + free(delnodes); + return NULL; + } + + free(file_name); + + n = 0; + while (fgets(line, sizeof(line), file)) + { + // Check for terminal '\n' character (line complete) + if (line[strlen(line) - 1] != '\n') + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError: line buffer size not large enough for line in taxonomy file"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + free(delnodes); + return NULL; + } + + // Get first and only element of the line (the deprecated taxid) + elt = strtok(line, "|"); + // Remove the last character (tab character) + elt[strlen(elt)-1] = '\0'; + // First element: old deprecated taxid + old_taxid = atoi(elt); + + // Store the old taxid in the list of deleted taxids + // Enlarge array if needed + if (n == buffer_size) + { + buffer_size = buffer_size * 2; + delnodes = (int32_t*) realloc(tax->merged_idx, sizeof(int32_t) * buffer_size); + if (delnodes == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating memory for a taxonomy structure"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + return NULL; + } + } + + delnodes[n] = old_taxid; + n++; + } + + // Check that fgets stopped because it reached EOF + if (!feof(file)) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError: file reading was stopped before end of file"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + 
free(rank_names); + free(delnodes); + return NULL; + } + + // Store count + delnodes_count = n; + + fclose(file); + } + } + closedir(tax_dir); + + + // Go through directory again for next file // TODO make separate functions? + tax_dir = opendir(taxdump); + if (tax_dir == NULL) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nProblem opening a taxdump directory"); + obi_close_taxonomy(tax); + free(parent_taxids); + free(rank_names); + free(delnodes); + return NULL; + } + + // Go through taxonomy files + while ((dp = readdir(tax_dir)) != NULL) + { + if (strcmp(dp->d_name, "merged.dmp") == 0) + { + merged_found = true; + buffer_size = 10000; + + // Initializing the merged structure + tax->merged_idx = (ecomergedidx_t*) malloc(sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size); + if (tax->merged_idx == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating the memory for a taxonomy structure"); + obi_close_taxonomy(tax); + free(parent_taxids); + free(rank_names); + free(delnodes); + closedir(tax_dir); + return NULL; + } + + // Allocating the memory for the file name + file_name = (char*) malloc((strlen(taxdump) + 12)*sizeof(char)); + if (file_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating the memory for a file name"); + obi_close_taxonomy(tax); + free(parent_taxids); + free(rank_names); + free(delnodes); closedir(tax_dir); return NULL; } @@ -1536,6 +1689,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) free(parent_taxids); free(rank_names); free(file_name); + free(delnodes); return NULL; } @@ -1549,13 +1703,15 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) free(parent_taxids); free(rank_names); free(file_name); + free(delnodes); return NULL; } free(file_name); - n = 0; - i = 0; + nT = 0; // to point in current taxa list while merging + nD = delnodes_count-1; // to point in deleted taxids list while merging (going from count-1 to 0 because taxids are sorted in 
descending order) + n = 0; // to point in final merged list while merging while (fgets(line, sizeof(line), file)) { // Check for terminal '\n' character (line complete) @@ -1568,6 +1724,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) closedir(tax_dir); free(parent_taxids); free(rank_names); + free(delnodes); return NULL; } @@ -1588,34 +1745,68 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) taxid = atoi(elt); // Store the old taxid in the merged_idx ordered taxid list - // First, store the taxids from the current taxonomy that come before - while ((i < (tax->taxa)->count) && ((tax->taxa)->taxon[i].taxid < old_taxid)) + // The merged list is an ordered list of the current taxids, the deprecated taxids that have current references, + // and the deleted taxids with no current reference. An element of the list is composed of the taxid, and the index + // of the taxon in the taxa structure, or -1 for deleted taxids. + // Creating the merged list requires to merge the 3 ordered lists into one. 
+ while (((nT < (tax->taxa)->count) && ((tax->taxa)->taxon[nT].taxid < old_taxid)) && ((nD >= 0) && (delnodes[nD] < old_taxid))) { - // Enlarge structures if needed - if (n == buffer_size) - { - buffer_size = buffer_size * 2; - tax->merged_idx = (ecomergedidx_t*) realloc(tax->merged_idx, sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size); - if (tax->merged_idx == NULL) + if ((tax->taxa)->taxon[nT].taxid < delnodes[nD]) + { // Add element from taxa list + // Enlarge structure if needed + if (n == buffer_size) { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError reallocating memory for a taxonomy structure"); - obi_close_taxonomy(tax); - fclose(file); - closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + buffer_size = buffer_size * 2; + tax->merged_idx = (ecomergedidx_t*) realloc(tax->merged_idx, sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size); + if (tax->merged_idx == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating memory for a taxonomy structure"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + free(delnodes); + return NULL; + } } - } - (tax->merged_idx)->merged[n].taxid = (tax->taxa)->taxon[i].taxid; - (tax->merged_idx)->merged[n].idx = i; - i++; - n++; + (tax->merged_idx)->merged[n].taxid = (tax->taxa)->taxon[nT].taxid; + (tax->merged_idx)->merged[n].idx = nT; + nT++; + n++; + } + else if (delnodes[nD] < (tax->taxa)->taxon[nT].taxid) + { // Add element from deleted taxids list + // Enlarge structure if needed + if (n == buffer_size) + { + buffer_size = buffer_size * 2; + tax->merged_idx = (ecomergedidx_t*) realloc(tax->merged_idx, sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size); + if (tax->merged_idx == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating memory for a taxonomy structure"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + 
free(rank_names); + free(delnodes); + return NULL; + } + } + + (tax->merged_idx)->merged[n].taxid = delnodes[nD]; + (tax->merged_idx)->merged[n].idx = -1; // The index to tag deleted taxids is -1 + nD--; + n++; + } } - // Enlarge structures if needed + // Add the deprecated taxid + // Enlarge structure if needed if (n == buffer_size) { buffer_size = buffer_size * 2; @@ -1629,6 +1820,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) closedir(tax_dir); free(parent_taxids); free(rank_names); + free(delnodes); return NULL; } } @@ -1675,6 +1867,10 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) fclose(file); } } + + // Free delnodes array, not needed anymore + free(delnodes); + closedir(tax_dir); @@ -2494,6 +2690,8 @@ ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid if (indexed_taxon == NULL) current_taxon = NULL; + else if (indexed_taxon->idx == -1) + current_taxon = NULL; // TODO discuss what to do when old deleted taxon else current_taxon = (taxonomy->taxa->taxon)+(indexed_taxon->idx); From 0385a92e02c8c66d2d45fd880ac2c7a8f25071f2 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Wed, 11 Jan 2017 16:36:08 +0100 Subject: [PATCH 15/22] Taxonomy: Refactored the taxdump reading, and little fixes --- src/obidms_taxonomy.c | 769 ++++++++++++++++++++++-------------------- 1 file changed, 410 insertions(+), 359 deletions(-) diff --git a/src/obidms_taxonomy.c b/src/obidms_taxonomy.c index c70f788..6295be8 100644 --- a/src/obidms_taxonomy.c +++ b/src/obidms_taxonomy.c @@ -259,7 +259,7 @@ ecorankidx_t* read_rankidx(const char* ranks_file_name) if (ranks_file==NULL) return NULL; - ranks_index = (ecorankidx_t*) malloc(sizeof(ecorankidx_t) + sizeof(char*) * (count-1)); + ranks_index = (ecorankidx_t*) malloc(sizeof(ecorankidx_t) + sizeof(char*) * count); if (ranks_index == NULL) { obi_set_errno(OBI_MALLOC_ERROR); @@ -321,7 +321,7 @@ ecotxidx_t* read_taxonomyidx(const char* taxa_file_name, const char* local_taxa_ f_local_taxa = 
open_ecorecorddb(local_taxa_file_name, &count_local_taxa, 0); - taxa_index = (ecotxidx_t*) malloc(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count_taxa + count_local_taxa - 1)); + taxa_index = (ecotxidx_t*) malloc(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count_taxa + count_local_taxa)); if (taxa_index == NULL) { obi_set_errno(OBI_MALLOC_ERROR); @@ -445,7 +445,7 @@ econameidx_t* read_nameidx(const char *file_name, OBIDMS_taxonomy_p taxonomy) return NULL; } - index_names = (econameidx_t*) malloc(sizeof(econameidx_t) + sizeof(econame_t) * (count-1)); + index_names = (econameidx_t*) malloc(sizeof(econameidx_t) + sizeof(econame_t) * count); if (index_names == NULL) { obi_set_errno(OBI_MALLOC_ERROR); @@ -548,7 +548,7 @@ int write_rankidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name // Build the taxonomy directory path taxonomy_path = get_taxonomy_path(dms, taxonomy_name); - file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 5)*sizeof(char)); + file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 6)*sizeof(char)); if (file_name == NULL) { obi_set_errno(OBI_MALLOC_ERROR); @@ -622,8 +622,8 @@ int write_rankidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name // Close file if (close(file_descriptor) < 0) { - obi_set_errno(OBIDMS_UNKNOWN_ERROR); - obidebug(1, "\nError closing a DMS information file"); + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError closing an rdx taxonomy file"); return -1; } @@ -652,7 +652,7 @@ int write_taxonomyidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_ // Build the taxonomy directory path taxonomy_path = get_taxonomy_path(dms, taxonomy_name); - file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 5)*sizeof(char)); + file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 6)*sizeof(char)); if (file_name == NULL) { obi_set_errno(OBI_MALLOC_ERROR); @@ -759,8 +759,8 @@ int write_taxonomyidx(OBIDMS_p dms, 
OBIDMS_taxonomy_p tax, const char* taxonomy_ // Close file if (close(file_descriptor) < 0) { - obi_set_errno(OBIDMS_UNKNOWN_ERROR); - obidebug(1, "\nError closing a DMS information file"); + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError closing a tdx taxonomy file"); return -1; } @@ -789,7 +789,7 @@ int write_local_taxonomy_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* ta // Build the taxonomy directory path taxonomy_path = get_taxonomy_path(dms, taxonomy_name); - file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 5)*sizeof(char)); + file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 6)*sizeof(char)); if (file_name == NULL) { obi_set_errno(OBI_MALLOC_ERROR); @@ -896,8 +896,8 @@ int write_local_taxonomy_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* ta // Close file if (close(file_descriptor) < 0) { - obi_set_errno(OBIDMS_UNKNOWN_ERROR); - obidebug(1, "\nError closing a DMS information file"); + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError closing a ldx taxonomy file"); return -1; } @@ -928,7 +928,7 @@ int write_nameidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name // Build the taxonomy directory path taxonomy_path = get_taxonomy_path(dms, taxonomy_name); - file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 5)*sizeof(char)); + file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 6)*sizeof(char)); if (file_name == NULL) { obi_set_errno(OBI_MALLOC_ERROR); @@ -1045,7 +1045,7 @@ int write_nameidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name if (close(file_descriptor) < 0) { obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nError closing a DMS information file"); + obidebug(1, "\nError closing a ndx taxonomy file"); return -1; } @@ -1153,7 +1153,7 @@ int write_mergedidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_na if (close(file_descriptor) < 0) { obi_set_errno(OBI_TAXONOMY_ERROR); - 
obidebug(1, "\nError closing a taxonomy file file"); + obidebug(1, "\nError closing an adx taxonomy file"); return -1; } @@ -1199,64 +1199,56 @@ int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name } -OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) -{ - OBIDMS_taxonomy_p tax; - struct dirent* dp; - DIR* tax_dir; - FILE* file; - bool nodes_found=false; - bool names_found=false; - bool merged_found=false; - bool delnodes_found=false; - int32_t* delnodes=NULL; - int32_t delnodes_count; - char line[2048]; // TODO large enough? - char* elt; - char* file_name; - int buffer_size; - int i, j; - int n, nD, nT; - char** rank_names; - int* parent_taxids; - int taxid, old_taxid; - bool already_in; - ecotx_t* t; - // Initialize taxonomy structure - tax = (OBIDMS_taxonomy_p) malloc(sizeof(OBIDMS_taxonomy_t)); - if (tax == NULL) +int read_nodes_dmp(const char* taxdump, OBIDMS_taxonomy_p tax, char*** rank_names_p, int** parent_taxids_p) +{ + struct dirent* dp; + DIR* tax_dir; + FILE* file; + char* file_name; + bool file_found=false; + char line[2048]; // TODO large enough? 
+ char* elt; + int buffer_size; + int i, n; + + buffer_size = 10000; + + // Initialize rank names and parent taxids arrays + *parent_taxids_p = malloc(buffer_size * sizeof(int)); + if (*parent_taxids_p == NULL) { obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating the memory for a taxonomy structure"); - return NULL; + obidebug(1, "\nError allocating the memory for the parent taxids array"); + return -1; } - tax->ranks = NULL; - tax->taxa = NULL; - tax->names = NULL; - tax->merged_idx = NULL; - tax->dms = NULL; - (tax->tax_name)[0] = '\0'; - - // TODO check if taxdump path is for a gz file to unzip or a directory + *rank_names_p = malloc(buffer_size * sizeof(char*)); + if (*rank_names_p == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating the memory for the rank names array"); + free(*parent_taxids_p); + return -1; + } + // Open the taxdum directory tax_dir = opendir(taxdump); if (tax_dir == NULL) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nProblem opening a taxdump directory"); - free(tax); - return NULL; + free(*parent_taxids_p); + free(*rank_names_p); + return -1; } - // Go through taxonomy files + // Look for the 'nodes.dmp' file while ((dp = readdir(tax_dir)) != NULL) { if (strcmp(dp->d_name, "nodes.dmp") == 0) { - nodes_found = true; - buffer_size = 10000; + file_found = true; // Initializing the taxa structure tax->taxa = (ecotxidx_t*) malloc(sizeof(ecotxidx_t) + sizeof(ecotx_t) * buffer_size); @@ -1264,57 +1256,34 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError allocating the memory for a taxonomy structure"); - free(tax); + free(*parent_taxids_p); + free(*rank_names_p); closedir(tax_dir); - return NULL; - } - - // Initialize rank names and parent taxids arrays - parent_taxids = malloc(buffer_size * sizeof(int)); - if (file_name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating the memory for a file name"); - 
obi_close_taxonomy(tax); - closedir(tax_dir); - return NULL; - } - - rank_names = malloc(buffer_size * sizeof(char*)); - if (file_name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating the memory for a file name"); - obi_close_taxonomy(tax); - free(parent_taxids); - closedir(tax_dir); - return NULL; + return -1; } // Allocating the memory for the file name - file_name = (char*) malloc((strlen(taxdump) + 10)*sizeof(char)); + file_name = (char*) malloc((strlen(taxdump) + 11)*sizeof(char)); if (file_name == NULL) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError allocating the memory for a file name"); - obi_close_taxonomy(tax); - free(parent_taxids); - free(rank_names); + free(*parent_taxids_p); + free(*rank_names_p); closedir(tax_dir); - return NULL; + return -1; } // Build the file path if (sprintf(file_name, "%s/nodes.dmp", taxdump) < 0) { obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nError building a taxonomy file name"); - obi_close_taxonomy(tax); + obidebug(1, "\nError building a taxonomy file name for 'nodes.dmp'"); + free(*parent_taxids_p); + free(*rank_names_p); closedir(tax_dir); - free(parent_taxids); - free(rank_names); free(file_name); - return NULL; + return -1; } file = fopen(file_name, "r"); @@ -1322,12 +1291,11 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nProblem opening a taxonomy file"); - obi_close_taxonomy(tax); + free(*parent_taxids_p); + free(*rank_names_p); closedir(tax_dir); - free(parent_taxids); - free(rank_names); free(file_name); - return NULL; + return -1; } free(file_name); @@ -1346,38 +1314,35 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError reallocating memory for a taxonomy structure"); - obi_close_taxonomy(tax); + free(*parent_taxids_p); + free(*rank_names_p); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return 
-1; } - parent_taxids = (int*) realloc(parent_taxids, sizeof(int) * buffer_size); - if (parent_taxids == NULL) + *parent_taxids_p = (int*) realloc(*parent_taxids_p, sizeof(int) * buffer_size); + if (*parent_taxids_p == NULL) { obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError reallocating memory for a taxonomy structure"); - obi_close_taxonomy(tax); + obidebug(1, "\nError reallocating memory for the parent taxids array"); + free(*parent_taxids_p); + free(*rank_names_p); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } - rank_names = (char**) realloc(rank_names, sizeof(char*) * buffer_size); - if (rank_names == NULL) + *rank_names_p = (char**) realloc(*rank_names_p, sizeof(char*) * buffer_size); + if (*rank_names_p == NULL) { obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError reallocating memory for a taxonomy structure"); - obi_close_taxonomy(tax); + obidebug(1, "\nError reallocating memory for the rank names array"); + free(*parent_taxids_p); + free(*rank_names_p); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } } @@ -1386,12 +1351,11 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError: line buffer size not large enough for line in taxonomy file"); - obi_close_taxonomy(tax); + free(*parent_taxids_p); + free(*rank_names_p); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } (tax->taxa)->taxon[n].idx = n; @@ -1423,22 +1387,21 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) elt[strlen(elt)-1] = '\0'; if (i == 1) - parent_taxids[n] = atoi(elt); + (*parent_taxids_p)[n] = atoi(elt); else if (i == 2) { - rank_names[n] = (char*) malloc((strlen(elt)+1) * sizeof(char)); - if (rank_names[n] == NULL) + (*rank_names_p)[n] = (char*) malloc((strlen(elt)+1) * sizeof(char)); + if ((*rank_names_p)[n] == NULL) { 
obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError allocating memory for taxon rank name"); - obi_close_taxonomy(tax); + free(*parent_taxids_p); + free(*rank_names_p); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } - strcpy(rank_names[n], elt); + strcpy((*rank_names_p)[n], elt); } i++; } @@ -1450,12 +1413,11 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError: file reading was stopped before end of file"); - obi_close_taxonomy(tax); + free(*parent_taxids_p); + free(*rank_names_p); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } // Store count @@ -1469,30 +1431,79 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError reallocating memory for taxonomy structure"); - obi_close_taxonomy(tax); + free(*parent_taxids_p); + free(*rank_names_p); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } - fclose(file); + if (fclose(file) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError closing a taxdump file"); + free(*parent_taxids_p); + free(*rank_names_p); + closedir(tax_dir); + return -1; + } } } - closedir(tax_dir); + if (closedir(tax_dir) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError closing a taxdump directory"); + free(*parent_taxids_p); + free(*rank_names_p); + closedir(tax_dir); + return -1; + } + + if ( ! file_found) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError: could not find 'nodes.dmp' file in taxdump directory"); + free(*parent_taxids_p); + free(*rank_names_p); + return -1; + } + + return 0; +} - // Go through directory again for next file // TODO make separate functions? 
+int read_delnodes_dmp(const char* taxdump, OBIDMS_taxonomy_p tax, int32_t** delnodes_p, int32_t* delnodes_count) +{ + struct dirent* dp; + DIR* tax_dir; + FILE* file; + char* file_name; + bool file_found=false; + char line[2048]; // TODO large enough? + char* elt; + int buffer_size; + int n; + int old_taxid; + + buffer_size = 10000; + + // Initializing the list of deleted nodes + *delnodes_p = (int32_t*) malloc(sizeof(int32_t) * buffer_size); + if (*delnodes_p == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating the memory for the deleted taxids array"); + return -1; + } + tax_dir = opendir(taxdump); if (tax_dir == NULL) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nProblem opening a taxdump directory"); - obi_close_taxonomy(tax); - free(parent_taxids); - free(rank_names); - return NULL; + closedir(tax_dir); + free(*delnodes_p); + return -1; } // Go through taxonomy files @@ -1500,34 +1511,17 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { if (strcmp(dp->d_name, "delnodes.dmp") == 0) { - delnodes_found = true; - buffer_size = 10000; - - // Initializing the list of deleted nodes - delnodes = (int32_t*) malloc(sizeof(int32_t) * buffer_size); - if (delnodes == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating the memory for a taxonomy structure"); - obi_close_taxonomy(tax); - free(parent_taxids); - free(rank_names); - closedir(tax_dir); - return NULL; - } + file_found = true; // Allocating the memory for the file name - file_name = (char*) malloc((strlen(taxdump) + 12)*sizeof(char)); + file_name = (char*) malloc((strlen(taxdump) + 14)*sizeof(char)); if (file_name == NULL) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError allocating the memory for a file name"); - obi_close_taxonomy(tax); - free(parent_taxids); - free(rank_names); - free(delnodes); closedir(tax_dir); - return NULL; + free(*delnodes_p); + return -1; } // Build the file path @@ -1535,13 +1529,10 @@ OBIDMS_taxonomy_p 
obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError building a taxonomy file name"); - obi_close_taxonomy(tax); closedir(tax_dir); - free(parent_taxids); - free(rank_names); + free(*delnodes_p); free(file_name); - free(delnodes); - return NULL; + return -1; } file = fopen(file_name, "r"); @@ -1549,13 +1540,10 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nProblem opening a taxonomy file"); - obi_close_taxonomy(tax); closedir(tax_dir); - free(parent_taxids); - free(rank_names); free(file_name); - free(delnodes); - return NULL; + free(*delnodes_p); + return -1; } free(file_name); @@ -1568,13 +1556,10 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError: line buffer size not large enough for line in taxonomy file"); - obi_close_taxonomy(tax); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - free(delnodes); - return NULL; + free(*delnodes_p); + return -1; } // Get first and only element of the line (the deprecated taxid) @@ -1589,21 +1574,18 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) if (n == buffer_size) { buffer_size = buffer_size * 2; - delnodes = (int32_t*) realloc(tax->merged_idx, sizeof(int32_t) * buffer_size); - if (delnodes == NULL) + (*delnodes_p) = (int32_t*) realloc(tax->merged_idx, sizeof(int32_t) * buffer_size); + if ((*delnodes_p) == NULL) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError reallocating memory for a taxonomy structure"); - obi_close_taxonomy(tax); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } } - delnodes[n] = old_taxid; + (*delnodes_p)[n] = old_taxid; n++; } @@ -1612,35 +1594,67 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError: file reading was stopped before end of file"); - 
obi_close_taxonomy(tax); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - free(delnodes); - return NULL; + free(*delnodes_p); + return -1; } // Store count - delnodes_count = n; + *delnodes_count = n; - fclose(file); + if (fclose(file) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError closing a taxdump file"); + free(*delnodes_p); + closedir(tax_dir); + return -1; + } } } - closedir(tax_dir); + if (closedir(tax_dir) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError closing a taxdump directory"); + free(*delnodes_p); + return -1; + } + + if ( ! file_found) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError: could not find 'delnodes.dmp' file in taxdump directory"); + free(*delnodes_p); + return -1; + } + + return 0; +} - // Go through directory again for next file // TODO make separate functions? +int read_merged_dmp(const char* taxdump, OBIDMS_taxonomy_p tax, int32_t* delnodes, int32_t delnodes_count) +{ + int n, nD, nT; + int taxid, old_taxid; + ecotx_t* t; + struct dirent* dp; + DIR* tax_dir; + FILE* file; + char* file_name; + bool file_found=false; + char line[2048]; // TODO large enough? 
+ char* elt; + int buffer_size; + + buffer_size = 10000; + tax_dir = opendir(taxdump); if (tax_dir == NULL) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nProblem opening a taxdump directory"); - obi_close_taxonomy(tax); - free(parent_taxids); - free(rank_names); - free(delnodes); - return NULL; + return -1; } // Go through taxonomy files @@ -1648,8 +1662,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { if (strcmp(dp->d_name, "merged.dmp") == 0) { - merged_found = true; - buffer_size = 10000; + file_found = true; // Initializing the merged structure tax->merged_idx = (ecomergedidx_t*) malloc(sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size); @@ -1657,12 +1670,8 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError allocating the memory for a taxonomy structure"); - obi_close_taxonomy(tax); - free(parent_taxids); - free(rank_names); - free(delnodes); closedir(tax_dir); - return NULL; + return -1; } // Allocating the memory for the file name @@ -1671,12 +1680,8 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError allocating the memory for a file name"); - obi_close_taxonomy(tax); - free(parent_taxids); - free(rank_names); - free(delnodes); closedir(tax_dir); - return NULL; + return -1; } // Build the file path @@ -1684,13 +1689,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError building a taxonomy file name"); - obi_close_taxonomy(tax); closedir(tax_dir); - free(parent_taxids); - free(rank_names); free(file_name); - free(delnodes); - return NULL; + return -1; } file = fopen(file_name, "r"); @@ -1698,13 +1699,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nProblem opening a taxonomy file"); - obi_close_taxonomy(tax); closedir(tax_dir); - free(parent_taxids); - free(rank_names); 
free(file_name); - free(delnodes); - return NULL; + return -1; } free(file_name); @@ -1719,13 +1716,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError: line buffer size not large enough for line in taxonomy file"); - obi_close_taxonomy(tax); - fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - free(delnodes); - return NULL; + fclose(file); + return -1; } // Parse the 2 elements separated by '|' @@ -1762,13 +1755,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError reallocating memory for a taxonomy structure"); - obi_close_taxonomy(tax); - fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - free(delnodes); - return NULL; + fclose(file); + return -1; } } @@ -1788,13 +1777,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError reallocating memory for a taxonomy structure"); - obi_close_taxonomy(tax); - fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - free(delnodes); - return NULL; + fclose(file); + return -1; } } @@ -1815,13 +1800,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError reallocating memory for a taxonomy structure"); - obi_close_taxonomy(tax); - fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - free(delnodes); - return NULL; + fclose(file); + return -1; } } @@ -1839,12 +1820,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError: file reading was stopped before end of file"); - obi_close_taxonomy(tax); - fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + fclose(file); + return -1; } // Store count @@ -1856,34 +1834,63 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { 
obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError reallocating memory for a a taxonomy structure"); - obi_close_taxonomy(tax); - fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + fclose(file); + return -1; } - fclose(file); + if (fclose(file) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError closing a taxdump file"); + closedir(tax_dir); + return -1; + } } } + if (closedir(tax_dir) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError closing a taxdump directory"); + closedir(tax_dir); + return -1; + } + + if ( ! file_found) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError: could not find 'merged.dmp' file in taxdump directory"); + return -1; + } // Free delnodes array, not needed anymore free(delnodes); - closedir(tax_dir); + return 0; +} - // Go through directory again for next file +int read_names_dmp(const char* taxdump, OBIDMS_taxonomy_p tax) +{ + int i, j, n; + int taxid; + struct dirent* dp; + DIR* tax_dir; + FILE* file; + char* file_name; + bool file_found=false; + char line[2048]; // TODO large enough? 
+ char* elt; + int buffer_size; + + buffer_size = 10000; + tax_dir = opendir(taxdump); if (tax_dir == NULL) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nProblem opening a taxdump directory"); - obi_close_taxonomy(tax); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } // Go through taxonomy files @@ -1891,8 +1898,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { if (strcmp(dp->d_name, "names.dmp") == 0) { - names_found = true; - buffer_size = 10000; + file_found = true; // Initializing the names structure tax->names = (econameidx_t*) malloc(sizeof(econameidx_t) + sizeof(econame_t) * buffer_size); @@ -1900,11 +1906,8 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError allocating the memory for a taxonomy structure"); - obi_close_taxonomy(tax); - free(parent_taxids); - free(rank_names); closedir(tax_dir); - return NULL; + return -1; } // Allocating the memory for the file name @@ -1913,11 +1916,8 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError allocating the memory for a file name"); - obi_close_taxonomy(tax); - free(parent_taxids); - free(rank_names); closedir(tax_dir); - return NULL; + return -1; } // Build the file path @@ -1925,12 +1925,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError building a taxonomy file name"); - obi_close_taxonomy(tax); - closedir(tax_dir); - free(parent_taxids); - free(rank_names); free(file_name); - return NULL; + closedir(tax_dir); + return -1; } file = fopen(file_name, "r"); @@ -1938,12 +1935,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nProblem opening a taxonomy file"); - obi_close_taxonomy(tax); - closedir(tax_dir); - free(parent_taxids); - free(rank_names); free(file_name); - return NULL; + closedir(tax_dir); + return -1; } 
free(file_name); @@ -1961,12 +1955,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError reallocating memory for a taxonomy structure"); - obi_close_taxonomy(tax); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } } @@ -1975,12 +1966,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError: line buffer size not large enough for line in taxonomy file"); - obi_close_taxonomy(tax); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } // Parse 4 first elements separated by '|' @@ -2000,12 +1988,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError: could not find taxon associated to name when reading taxdump"); - obi_close_taxonomy(tax); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } j = i; // Because there are several names by taxon but they are in the same order (tax->names)->names[n].taxon = ((tax->taxa)->taxon)+i; @@ -2029,9 +2014,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) obi_close_taxonomy(tax); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } strcpy((tax->names)->names[n].name, elt); } @@ -2042,12 +2025,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError allocating memory for a taxon class name"); - obi_close_taxonomy(tax); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } strcpy((tax->names)->names[n].class_name, elt); if (strcmp(elt, "scientific name") == 0) @@ -2067,12 +2047,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError: file reading was 
stopped before end of file"); - obi_close_taxonomy(tax); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } // Store count @@ -2084,35 +2061,109 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError reallocating memory for a a taxonomy structure"); - obi_close_taxonomy(tax); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } - fclose(file); + if (fclose(file) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError closing a taxdump file"); + closedir(tax_dir); + return -1; + } } } - closedir(tax_dir); - - if (!nodes_found) + if (closedir(tax_dir) < 0) { obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nProblem reading taxdump: nodes.dmp file not found"); - obi_close_taxonomy(tax); - free(parent_taxids); - free(rank_names); + obidebug(1, "\nError closing a taxdump directory"); + closedir(tax_dir); + return -1; + } + + if ( ! 
file_found) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError: could not find 'merged.dmp' file in taxdump directory"); + return -1; + } + + return 0; +} + + +OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) +{ + OBIDMS_taxonomy_p tax; + char** rank_names=NULL; + int* parent_taxids=NULL; + int32_t* delnodes=NULL; + int32_t delnodes_count; + bool already_in; + ecotx_t* t; + int buffer_size; + int i, j; + + // Initialize taxonomy structure + tax = (OBIDMS_taxonomy_p) malloc(sizeof(OBIDMS_taxonomy_t)); + if (tax == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating the memory for a taxonomy structure"); return NULL; } - if (!names_found) + tax->ranks = NULL; + tax->taxa = NULL; + tax->names = NULL; + tax->merged_idx = NULL; + + tax->dms = NULL; + (tax->tax_name)[0] = '\0'; + + // TODO check if taxdump path is for a gz file to unzip or a directory + + // READ NODES.DMP + if (read_nodes_dmp(taxdump, tax, &rank_names, &parent_taxids) < 0) { obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nProblem reading taxdump: names.dmp file not found"); + obidebug(1, "\nProblem reading 'nodes.dmp'"); + obi_close_taxonomy(tax); + return NULL; + } + + // READ DELNODES.DMP + if (read_delnodes_dmp(taxdump, tax, &delnodes, &delnodes_count) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nProblem reading 'delnodes.dmp'"); obi_close_taxonomy(tax); - free(parent_taxids); free(rank_names); + free(parent_taxids); + return NULL; + } + + // READ MERGED.DMP + if (read_merged_dmp(taxdump, tax, delnodes, delnodes_count) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nProblem reading 'merged.dmp'"); + obi_close_taxonomy(tax); + free(delnodes); + free(rank_names); + free(parent_taxids); + return NULL; + } + + // READ NAMES.DMP + if (read_names_dmp(taxdump, tax) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nProblem reading 'names.dmp'"); + obi_close_taxonomy(tax); + free(rank_names); + 
free(parent_taxids); return NULL; } @@ -2246,7 +2297,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) (tax->taxa)->buffer_size = (tax->taxa)->count; - // Compute longest branches + // Compute longest branches (used to compute distances between taxa faster) for (i=0; i < (tax->taxa)->count; i++) { t = (((tax->taxa))->taxon)+i; @@ -2281,8 +2332,8 @@ int obi_taxonomy_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const { int32_t taxid; ecotx_t* taxon; - econame_t* name_struct; - int i; + int i; +// econame_t* name_struct; // Enlarge the structure memory for a new taxon tax->taxa = (ecotxidx_t*) realloc(tax->taxa, sizeof(ecotxidx_t) + sizeof(ecotx_t) * (((tax->taxa)->count) + 1)); @@ -2323,7 +2374,7 @@ int obi_taxonomy_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const break; } } - if (taxon->rank == -1) // TODO Discuss possibility of creating rank if doesn't exist + if (taxon->rank == -1) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError: taxon rank not found when adding a new taxon"); @@ -2336,7 +2387,7 @@ int obi_taxonomy_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const obidebug(1, "\nError: taxon parent not found when adding a new taxon"); return -1; } - taxon->farest = 0; // TODO not sure + taxon->farest = 0; // Update taxonomy counts etc (tax->taxa)->max_taxid = taxid; @@ -2344,42 +2395,42 @@ int obi_taxonomy_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const ((tax->taxa)->local_count)++; (tax->taxa)->buffer_size = (tax->taxa)->count; - // Add new name in names structure // TODO discuss because in OBITools1 the new names were not written in .ndx - // Allocate memory for new name - tax->names = (econameidx_t*) realloc(tax->names, sizeof(econameidx_t) + sizeof(econame_t) * ((tax->names)->count + 1)); - if (tax->names == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError reallocating memory for a taxonomy structure to add a new taxon"); - return -1; - } - - // Add new name - 
name_struct = (tax->names)->names + ((tax->names)->count); - name_struct->name = (char*) malloc((strlen(name) + 1) * sizeof(char)); - if (name_struct->name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for a taxon name to add a new taxon"); - return -1; - } - strcpy(name_struct->name, name); - name_struct->class_name = (char*) malloc((strlen("scientific name") + 1) * sizeof(char)); - if (name_struct->class_name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for a taxon class name to add a new taxon"); - return -1; - } - strcpy(name_struct->class_name, "scientific name"); - name_struct->is_scientific_name = true; - name_struct->taxon = ((tax->taxa)->taxon) + ((tax->taxa)->count) - 1; - - // Sort names in alphabetical order - qsort((tax->names)->names, (tax->names)->count, sizeof(econame_t), cmp_names); - - // Update name count - ((tax->names)->count)++; +// // Add new name in names structure // Commented because the new name was not added in the .ndx file in the OBITools1 +// // Allocate memory for new name +// tax->names = (econameidx_t*) realloc(tax->names, sizeof(econameidx_t) + sizeof(econame_t) * ((tax->names)->count + 1)); +// if (tax->names == NULL) +// { +// obi_set_errno(OBI_MALLOC_ERROR); +// obidebug(1, "\nError reallocating memory for a taxonomy structure to add a new taxon"); +// return -1; +// } +// +// // Add new name +// name_struct = (tax->names)->names + ((tax->names)->count); +// name_struct->name = (char*) malloc((strlen(name) + 1) * sizeof(char)); +// if (name_struct->name == NULL) +// { +// obi_set_errno(OBI_MALLOC_ERROR); +// obidebug(1, "\nError allocating memory for a taxon name to add a new taxon"); +// return -1; +// } +// strcpy(name_struct->name, name); +// name_struct->class_name = (char*) malloc((strlen("scientific name") + 1) * sizeof(char)); +// if (name_struct->class_name == NULL) +// { +// obi_set_errno(OBI_MALLOC_ERROR); +// obidebug(1, "\nError 
allocating memory for a taxon class name to add a new taxon"); +// return -1; +// } +// strcpy(name_struct->class_name, "scientific name"); +// name_struct->is_scientific_name = true; +// name_struct->taxon = ((tax->taxa)->taxon) + ((tax->taxa)->count) - 1; +// +// // Sort names in alphabetical order +// qsort((tax->names)->names, (tax->names)->count, sizeof(econame_t), cmp_names); +// +// // Update name count +// ((tax->names)->count)++; return taxid; } @@ -2445,7 +2496,7 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo if (tax->ranks == NULL) { obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nError building ranks file name"); + obidebug(1, "\nError reading taxonomy ranks file (check taxonomy name spelling)"); free(taxonomy_path); free(ranks_file_name); free(tax); From c065c1914ac6cf3d740e8f2d285a019bab37b102 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Mon, 16 Jan 2017 17:28:20 +0100 Subject: [PATCH 16/22] Taxonomy: adding, writing and reading preferred names, changed some function names, and fixed a bug with taxa indices not being properly initialized --- python/obitools3/obidms/_obitaxo.pxd | 3 +- python/obitools3/obidms/_obitaxo.pyx | 40 +- python/obitools3/obidms/capi/obitaxonomy.pxd | 10 +- src/obidms_taxonomy.c | 435 +++++++++++++++++-- src/obidms_taxonomy.h | 13 +- 5 files changed, 444 insertions(+), 57 deletions(-) diff --git a/python/obitools3/obidms/_obitaxo.pxd b/python/obitools3/obidms/_obitaxo.pxd index 51c6c0f..671d5e1 100644 --- a/python/obitools3/obidms/_obitaxo.pxd +++ b/python/obitools3/obidms/_obitaxo.pxd @@ -17,4 +17,5 @@ cdef class OBI_Taxonomy : cdef class OBI_Taxon : - cdef ecotx_t* _pointer + cdef ecotx_t* _pointer + cdef OBI_Taxonomy _tax diff --git a/python/obitools3/obidms/_obitaxo.pyx b/python/obitools3/obidms/_obitaxo.pyx index b9d7e98..3aae515 100644 --- a/python/obitools3/obidms/_obitaxo.pyx +++ b/python/obitools3/obidms/_obitaxo.pyx @@ -7,10 +7,10 @@ from .capi.obitaxonomy cimport 
obi_read_taxonomy, \ obi_write_taxonomy, \ obi_close_taxonomy, \ obi_taxo_get_taxon_with_taxid, \ - obi_taxonomy_add_local_taxon, \ + obi_taxo_add_local_taxon, \ + obi_taxo_add_preferred_name_with_taxon, \ ecotx_t - from ._obidms cimport OBIDMS from cpython.pycapsule cimport PyCapsule_New, PyCapsule_GetPointer @@ -42,11 +42,11 @@ cdef class OBI_Taxonomy : if taxon_p == NULL : raise Exception("Taxon not found") taxon_capsule = PyCapsule_New(taxon_p, NULL, NULL) - return OBI_Taxon(taxon_capsule) + return OBI_Taxon(taxon_capsule, self) else : raise Exception("Not implemented") - - + + def __iter__(self): cdef ecotx_t* taxa @@ -60,7 +60,7 @@ cdef class OBI_Taxonomy : for t in range(self._pointer.taxa.count): taxon_p = (taxa+t) taxon_capsule = PyCapsule_New(taxon_p, NULL, NULL) - yield OBI_Taxon(taxon_capsule) + yield OBI_Taxon(taxon_capsule, self) cpdef write(self, str prefix) : @@ -70,7 +70,7 @@ cdef class OBI_Taxonomy : cpdef int add_taxon(self, str name, str rank_name, int parent_taxid, int min_taxid=10000000) : cdef int taxid - taxid = obi_taxonomy_add_local_taxon(self._pointer, str2bytes(name), str2bytes(rank_name), parent_taxid, min_taxid) + taxid = obi_taxo_add_local_taxon(self._pointer, str2bytes(name), str2bytes(rank_name), parent_taxid, min_taxid) if taxid < 0 : raise Exception("Error adding a new taxon to the taxonomy") else : @@ -85,10 +85,11 @@ cdef class OBI_Taxonomy : cdef class OBI_Taxon : # TODO dict subclass? - def __init__(self, object taxon_capsule) : + def __init__(self, object taxon_capsule, OBI_Taxonomy tax) : self._pointer = PyCapsule_GetPointer(taxon_capsule, NULL) if self._pointer == NULL : - raise Exception("Error reading the taxonomy") + raise Exception("Error reading a taxon (NULL pointer)") + self._tax = tax # name property getter @property @@ -115,14 +116,25 @@ cdef class OBI_Taxon : # TODO dict subclass? 
def parent(self): cdef object parent_capsule parent_capsule = PyCapsule_New(self._pointer.parent, NULL, NULL) - return OBI_Taxon(parent_capsule) + return OBI_Taxon(parent_capsule, self._tax) + + # preferred name property getter and setter + @property + def preferred_name(self): + if self._pointer.preferred_name != NULL : + return bytes2str(self._pointer.preferred_name) + @preferred_name.setter + def preferred_name(self, str new_preferred_name) : # @DuplicatedSignature + if (obi_taxo_add_preferred_name_with_taxon(self._tax._pointer, self._pointer, str2bytes(new_preferred_name)) < 0) : + raise Exception("Error adding a new preferred name to a taxon") def __repr__(self): d = {} - d['taxid'] = self.taxid - d['name'] = self.name - d['parent'] = self.parent.taxid - d['farest'] = self.farest + d['taxid'] = self.taxid + d['name'] = self.name + d['preferred name'] = self.preferred_name + d['parent'] = self.parent.taxid + d['farest'] = self.farest return str(d) diff --git a/python/obitools3/obidms/capi/obitaxonomy.pxd b/python/obitools3/obidms/capi/obitaxonomy.pxd index 99cd7e4..d90693c 100644 --- a/python/obitools3/obidms/capi/obitaxonomy.pxd +++ b/python/obitools3/obidms/capi/obitaxonomy.pxd @@ -13,7 +13,8 @@ cdef extern from "obidms_taxonomy.h" nogil: int32_t farest ecotxnode* parent char* name - + char* preferred_name + ctypedef ecotxnode ecotx_t @@ -56,4 +57,9 @@ cdef extern from "obidms_taxonomy.h" nogil: ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy) - int obi_taxonomy_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const char* rank_name, int32_t parent_taxid, int32_t min_taxid) + int obi_taxo_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const char* rank_name, int32_t parent_taxid, int32_t min_taxid) + + int obi_taxo_add_preferred_name_with_taxid(OBIDMS_taxonomy_p tax, int32_t taxid, const char* preferred_name) + + int obi_taxo_add_preferred_name_with_taxon(OBIDMS_taxonomy_p tax, ecotx_t* taxon, const char* 
preferred_name) + diff --git a/src/obidms_taxonomy.c b/src/obidms_taxonomy.c index 6295be8..9e081f0 100644 --- a/src/obidms_taxonomy.c +++ b/src/obidms_taxonomy.c @@ -246,7 +246,7 @@ FILE* open_ecorecorddb(const char* file_name, } -ecorankidx_t* read_rankidx(const char* ranks_file_name) +ecorankidx_t* read_ranks_idx(const char* ranks_file_name) { int32_t count; FILE* ranks_file; @@ -301,7 +301,7 @@ ecorankidx_t* read_rankidx(const char* ranks_file_name) } -ecotxidx_t* read_taxonomyidx(const char* taxa_file_name, const char* local_taxa_file_name) +ecotxidx_t* read_taxonomy_idx(const char* taxa_file_name, const char* local_taxa_file_name) { int32_t count_taxa; int32_t count_local_taxa; @@ -341,10 +341,12 @@ ecotxidx_t* read_taxonomyidx(const char* taxa_file_name, const char* local_taxa_ for (i=0; itaxon[i])); + taxa_index->taxon[i].idx = i; taxa_index->taxon[i].parent = taxa_index->taxon + (size_t) taxa_index->taxon[i].parent; taxa_index->taxon[i].parent->farest = 0; if (taxa_index->taxon[i].taxid > taxa_index->max_taxid) taxa_index->max_taxid = taxa_index->taxon[i].taxid; + taxa_index->taxon[i].preferred_name = NULL; } if (count_local_taxa > 0) @@ -361,6 +363,7 @@ ecotxidx_t* read_taxonomyidx(const char* taxa_file_name, const char* local_taxa_ taxa_index->taxon[i].parent->farest=0; if (taxa_index->taxon[i].taxid > taxa_index->max_taxid) taxa_index->max_taxid = taxa_index->taxon[i].taxid; + taxa_index->taxon[i].preferred_name = NULL; } for (i=0; i < count_taxa; i++) @@ -431,7 +434,60 @@ econame_t* readnext_econame(FILE* f, econame_t* name, OBIDMS_taxonomy_p taxonomy } -econameidx_t* read_nameidx(const char *file_name, OBIDMS_taxonomy_p taxonomy) +econame_t* readnext_ecopreferredname(FILE* f, econame_t* name, OBIDMS_taxonomy_p taxonomy) +{ + econameformat_t* raw; + int32_t record_length; + + raw = read_ecorecord(f, &record_length); + if (raw == NULL) + return NULL; + + name->is_scientific_name = raw->is_scientific_name; + + name->name = malloc((raw->name_length + 1) * 
sizeof(char)); + if (name->name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a taxon preferred name"); + free(raw); + return NULL; + } + strncpy(name->name, raw->names, raw->name_length); + name->name[raw->name_length] = 0; + + name->class_name = malloc((raw->class_length+1) * sizeof(char)); + if (name->class_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a taxon class name"); + free(name->name); + free(raw); + return NULL; + } + strncpy(name->class_name,(raw->names + raw->name_length), raw->class_length); + name->class_name[raw->class_length] = 0; + + name->taxon = taxonomy->taxa->taxon + raw->taxid; + + // Add the preferred name in the taxon structure // TODO discuss: couldn't they all use the same pointer? + (taxonomy->taxa->taxon + raw->taxid)->preferred_name = malloc((raw->name_length + 1) * sizeof(char)); + if ((taxonomy->taxa->taxon + raw->taxid)->preferred_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a taxon preferred name"); + free(name->name); + free(name->class_name); + free(raw); + return NULL; + } + strcpy((taxonomy->taxa->taxon + raw->taxid)->preferred_name, name->name); + + return name; +} + + +econameidx_t* read_names_idx(const char *file_name, OBIDMS_taxonomy_p taxonomy) { int32_t count; FILE* f; @@ -440,10 +496,7 @@ econameidx_t* read_nameidx(const char *file_name, OBIDMS_taxonomy_p taxonomy) f = open_ecorecorddb(file_name, &count, 0); if (f == NULL) - { - obidebug(1, "\nError reading taxonomy name file"); - return NULL; - } + return NULL; index_names = (econameidx_t*) malloc(sizeof(econameidx_t) + sizeof(econame_t) * count); if (index_names == NULL) @@ -473,9 +526,46 @@ econameidx_t* read_nameidx(const char *file_name, OBIDMS_taxonomy_p taxonomy) } +econameidx_t* read_preferred_names_idx(const char *file_name, OBIDMS_taxonomy_p taxonomy) +{ + int32_t count; + FILE* f; + econameidx_t* index_names; 
+ int32_t i; + + f = open_ecorecorddb(file_name, &count, 0); + if (f == NULL) + return NULL; + + index_names = (econameidx_t*) malloc(sizeof(econameidx_t) + sizeof(econame_t) * count); + if (index_names == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reading taxonomy name file"); + return NULL; + } + + index_names->count = count; + + for (i=0; i < count; i++) + { + readnext_ecopreferredname(f, (index_names->names)+i, taxonomy); + if ((index_names->names)+i == NULL) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError reading taxonomy name file"); + free(index_names); + return NULL; + } + } + + fclose(f); + + return index_names; +} -ecomergedidx_t* read_mergedidx(const char *file_name, OBIDMS_taxonomy_p taxonomy) +ecomergedidx_t* read_merged_idx(const char *file_name, OBIDMS_taxonomy_p taxonomy) { int32_t count; FILE* f; @@ -528,7 +618,7 @@ ecomergedidx_t* read_mergedidx(const char *file_name, OBIDMS_taxonomy_p taxonomy // Functions to write taxonomy structure to binary files -int write_rankidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? +int write_ranks_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? { int i; char* file_name; @@ -631,7 +721,7 @@ int write_rankidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name } -int write_taxonomyidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? keep argument but if NULL, use the one in struct? +int write_taxonomy_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? keep argument but if NULL, use the one in struct? { int i; char* file_name; @@ -905,7 +995,7 @@ int write_local_taxonomy_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* ta } -int write_nameidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? 
keep argument but if NULL, use the one in struct? +int write_names_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? keep argument but if NULL, use the one in struct? { int i; char* file_name; @@ -1053,7 +1143,155 @@ int write_nameidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name } -int write_mergedidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? +int write_preferred_names_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? keep argument but if NULL, use the one in struct? +{ + int i; + char* file_name; + int file_descriptor; + off_t file_size; + char* taxonomy_path; + int32_t name_length; + int32_t class_length; + int32_t record_size; + + // Compute file size + file_size = sizeof(int32_t); // To store record count + for (i=0; i < (tax->preferred_names)->count; i++) + { + file_size = file_size + sizeof(int32_t) * 5; // To store record size, taxid, rank index, parent index, and name length + file_size = file_size + strlen(tax->preferred_names->names[i].name); // To store name + file_size = file_size + strlen(tax->preferred_names->names[i].class_name); // To store name + } + + // Build the taxonomy directory path + taxonomy_path = get_taxonomy_path(dms, taxonomy_name); + + file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 6)*sizeof(char)); + if (file_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating the memory for a binary taxonomy file name"); + return -1; + } + + // Build the file path + if (sprintf(file_name, "%s/%s.pdx", taxonomy_path, taxonomy_name) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError building a binary taxonomy file name"); + return -1; + } + + free(taxonomy_path); + + // Create file + file_descriptor = open(file_name, O_RDWR | O_CREAT | O_EXCL, 0777); + if (file_descriptor < 0) + { + 
obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError creating a binary taxonomy file"); + free(file_name); + return -1; + } + + free(file_name); + + // Truncate the file to the right size + if (ftruncate(file_descriptor, file_size) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError truncating a binary taxonomy file"); + close(file_descriptor); + return -1; + } + + // Write record count + if (write(file_descriptor, &(tax->preferred_names->count), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t))) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError writing in a binary taxonomy file"); + close(file_descriptor); + return -1; + } + + // Write records + for (i=0; i < tax->preferred_names->count; i++) + { + name_length = strlen(tax->preferred_names->names[i].name); + class_length = strlen(tax->preferred_names->names[i].class_name); + record_size = 4*sizeof(int32_t) + name_length + class_length; + + // Write record size + if (write(file_descriptor, &record_size, sizeof(int32_t)) < ((ssize_t) sizeof(int32_t))) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError writing in a binary taxonomy file"); + close(file_descriptor); + return -1; + } + // Write if the name is a scientific name + if (write(file_descriptor, &(tax->preferred_names->names[i].is_scientific_name), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t))) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError writing in a binary taxonomy file"); + close(file_descriptor); + return -1; + } + // Write name length + if (write(file_descriptor, &name_length, sizeof(int32_t)) < ((ssize_t) sizeof(int32_t))) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError writing in a binary taxonomy file"); + close(file_descriptor); + return -1; + } + // Write class length + if (write(file_descriptor, &class_length, sizeof(int32_t)) < ((ssize_t) sizeof(int32_t))) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError writing in a binary taxonomy file"); + 
close(file_descriptor); + return -1; + } + // Write taxid index + if (write(file_descriptor, &(tax->preferred_names->names[i].taxon->idx), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t))) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError writing in a binary taxonomy file"); + close(file_descriptor); + return -1; + } + // Write name + if (write(file_descriptor, tax->preferred_names->names[i].name, name_length) < ((ssize_t) name_length)) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError writing in a binary taxonomy file"); + close(file_descriptor); + return -1; + } + // Write class + if (write(file_descriptor, tax->preferred_names->names[i].class_name, class_length) < ((ssize_t) class_length)) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError writing in a binary taxonomy file"); + close(file_descriptor); + return -1; + } + } + + // Close file + if (close(file_descriptor) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError closing a pdx taxonomy file"); + return -1; + } + + return 0; +} + + +int write_merged_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? 
{ int i; char* file_name; @@ -1182,19 +1420,22 @@ int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name free(taxonomy_path); - if (write_rankidx(dms, tax, tax_name) < 0) - return -1; - if (write_taxonomyidx(dms, tax, tax_name) < 0) - return -1; - if (write_nameidx(dms, tax, tax_name) < 0) - return -1; - if (write_mergedidx(dms, tax, tax_name) < 0) - return -1; - // Check if there are local taxa (if so last taxon is local) - if ((tax->taxa)->local_count > 0) - if (write_local_taxonomy_idx(dms, tax, tax_name) < 0) - return -1; - + if (write_ranks_idx(dms, tax, tax_name) < 0) + return -1; + if (write_taxonomy_idx(dms, tax, tax_name) < 0) + return -1; + if (write_names_idx(dms, tax, tax_name) < 0) + return -1; + if (write_merged_idx(dms, tax, tax_name) < 0) + return -1; + // Check if there are local taxa (if so last taxon is local) + if ((tax->taxa)->local_count > 0) + if (write_local_taxonomy_idx(dms, tax, tax_name) < 0) + return -1; + // Write preferred names if there are some + if (tax->preferred_names != NULL) + if (write_preferred_names_idx(dms, tax, tax_name) < 0) + return -1; return 0; } @@ -2114,10 +2355,11 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) obidebug(1, "\nError allocating the memory for a taxonomy structure"); return NULL; } - tax->ranks = NULL; - tax->taxa = NULL; - tax->names = NULL; - tax->merged_idx = NULL; + tax->ranks = NULL; + tax->taxa = NULL; + tax->names = NULL; + tax->preferred_names = NULL; + tax->merged_idx = NULL; tax->dms = NULL; (tax->tax_name)[0] = '\0'; @@ -2295,6 +2537,10 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) (((tax->taxa)->taxon)[i].parent)->farest = 0; } + // Initialize preferred names to NULL + for (i=0; i < (tax->taxa)->count; i++) + ((tax->taxa)->taxon)[i].preferred_name = NULL; + (tax->taxa)->buffer_size = (tax->taxa)->count; // Compute longest branches (used to compute distances between taxa faster) @@ -2328,7 +2574,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const 
char* taxdump) } -int obi_taxonomy_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const char* rank_name, int32_t parent_taxid, int32_t min_taxid) +int obi_taxo_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const char* rank_name, int32_t parent_taxid, int32_t min_taxid) { int32_t taxid; ecotx_t* taxon; @@ -2436,6 +2682,81 @@ int obi_taxonomy_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const } +int obi_taxo_add_preferred_name_with_taxid(OBIDMS_taxonomy_p tax, int32_t taxid, const char* preferred_name) +{ + ecotx_t* taxon; + + taxon = obi_taxo_get_taxon_with_taxid(tax, taxid); + + return obi_taxo_add_preferred_name_with_taxon(tax, taxon, preferred_name); +} + + +int obi_taxo_add_preferred_name_with_taxon(OBIDMS_taxonomy_p tax, ecotx_t* taxon, const char* preferred_name) +{ + econame_t* name_struct; + + // Free previous preferred name if there is one + if (taxon->preferred_name != NULL) + free(taxon->preferred_name); + + taxon->preferred_name = (char*) malloc((strlen(preferred_name) + 1) * sizeof(char)); + if (taxon->preferred_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a new preferred name for a taxon"); + return -1; + } + strcpy(taxon->preferred_name, preferred_name); + + // Add new name in preferred names structure + // Allocate or reallocate memory for new name + if (tax->preferred_names == NULL) + { + tax->preferred_names = (econameidx_t*) malloc(sizeof(econameidx_t) + sizeof(econame_t)); + (tax->preferred_names)->count = 0; + } + else + tax->preferred_names = (econameidx_t*) realloc(tax->preferred_names, sizeof(econameidx_t) + sizeof(econame_t) * ((tax->preferred_names)->count + 1)); + if (tax->preferred_names == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating memory for a taxonomy structure to add a new preferred name"); + return -1; + } + + // Add new preferred name + name_struct = (tax->preferred_names)->names + 
((tax->preferred_names)->count); + name_struct->name = (char*) malloc((strlen(preferred_name) + 1) * sizeof(char)); + if (name_struct->name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a new taxon preferred name"); + return -1; + } + strcpy(name_struct->name, preferred_name); + + name_struct->class_name = (char*) malloc((strlen("preferred name") + 1) * sizeof(char)); + if (name_struct->class_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a taxon class name to add a new preferred name"); + return -1; + } + strcpy(name_struct->class_name, "preferred name"); + name_struct->is_scientific_name = false; + name_struct->taxon = taxon; + + // Sort preferred names in alphabetical order + qsort((tax->preferred_names)->names, (tax->preferred_names)->count, sizeof(econame_t), cmp_names); + + // Update preferred name count + ((tax->preferred_names)->count)++; + + return 0; +} + + /////// PUBLIC ///////// @@ -2448,6 +2769,7 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo char* merged_idx_file_name; char* local_taxa_file_name; char* alter_names_file_name; + char* pref_names_file_name; int buffer_size; tax = (OBIDMS_taxonomy_p) malloc(sizeof(OBIDMS_taxonomy_t)); @@ -2458,10 +2780,11 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo return NULL; } - tax->ranks = NULL; - tax->taxa = NULL; - tax->names = NULL; - tax->merged_idx = NULL; + tax->ranks = NULL; + tax->taxa = NULL; + tax->names = NULL; + tax->preferred_names = NULL; + tax->merged_idx = NULL; tax->dms = dms; @@ -2492,7 +2815,7 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo free(tax); return NULL; } - tax->ranks = read_rankidx(ranks_file_name); + tax->ranks = read_ranks_idx(ranks_file_name); if (tax->ranks == NULL) { obi_set_errno(OBI_TAXONOMY_ERROR); @@ -2543,7 +2866,7 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, 
const char* taxonomy_name, boo obi_close_taxonomy(tax); return NULL; } - tax->taxa = read_taxonomyidx(taxa_file_name, local_taxa_file_name); + tax->taxa = read_taxonomy_idx(taxa_file_name, local_taxa_file_name); if (tax->taxa == NULL) { free(taxonomy_path); @@ -2574,7 +2897,7 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo obi_close_taxonomy(tax); return NULL; } - tax->merged_idx = read_mergedidx(merged_idx_file_name, tax); + tax->merged_idx = read_merged_idx(merged_idx_file_name, tax); if (tax->merged_idx == NULL) { free(taxonomy_path); @@ -2584,6 +2907,38 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo } free(merged_idx_file_name); + // Read preferred names + pref_names_file_name = (char*) malloc(buffer_size*sizeof(char)); + if (pref_names_file_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for alternative names file name"); + free(taxonomy_path); + obi_close_taxonomy(tax); + return NULL; + } + if (snprintf(pref_names_file_name, buffer_size, "%s/%s.pdx", taxonomy_path, taxonomy_name) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError building alternative names file name"); + free(taxonomy_path); + free(pref_names_file_name); + obi_close_taxonomy(tax); + return NULL; + } + tax->preferred_names = read_preferred_names_idx(pref_names_file_name, tax); + if (obi_errno) + { + free(taxonomy_path); + free(pref_names_file_name); + obi_close_taxonomy(tax); + return NULL; + } + free(pref_names_file_name); + + if (tax->preferred_names != NULL) + fprintf(stderr, "\nPreferred names read"); + // Read alternative names if (read_alternative_names) { @@ -2605,7 +2960,7 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo obi_close_taxonomy(tax); return NULL; } - tax->names = read_nameidx(alter_names_file_name, tax); + tax->names = read_names_idx(alter_names_file_name, tax); if (tax->names == NULL) { 
free(taxonomy_path); @@ -2637,6 +2992,10 @@ int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy) if (write_local_taxonomy_idx(taxonomy->dms, taxonomy, taxonomy->tax_name) < 0) return -1; } + // Write preferred names if there are some + if (taxonomy->preferred_names != NULL) + if (write_preferred_names_idx(taxonomy->dms, taxonomy, taxonomy->tax_name) < 0) + return -1; if (taxonomy) { diff --git a/src/obidms_taxonomy.h b/src/obidms_taxonomy.h index 33d8aba..fa2f511 100644 --- a/src/obidms_taxonomy.h +++ b/src/obidms_taxonomy.h @@ -36,7 +36,8 @@ typedef struct ecotxnode { int32_t farest; int32_t idx; struct ecotxnode* parent; - char* name; + char* name; // scientific name + char* preferred_name; // preferred name bool local; } ecotx_t; @@ -98,6 +99,7 @@ typedef struct OBIDMS_taxonomy_t { ecomergedidx_t* merged_idx; ecorankidx_t* ranks; econameidx_t* names; + econameidx_t* preferred_names; ecotxidx_t* taxa; } OBIDMS_taxonomy_t, *OBIDMS_taxonomy_p; @@ -127,4 +129,11 @@ int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump); -int obi_taxonomy_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const char* rank_name, int32_t parent_taxid, int32_t min_taxid); +int obi_taxo_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const char* rank_name, int32_t parent_taxid, int32_t min_taxid); + +int obi_taxo_add_preferred_name_with_taxid(OBIDMS_taxonomy_p tax, int32_t taxid, const char* preferred_name); + +int obi_taxo_add_preferred_name_with_taxon(OBIDMS_taxonomy_p tax, ecotx_t* taxon, const char* preferred_name); + + + From c0bcdce72450a58195b0feddb5da4a44f510cfc7 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Wed, 18 Jan 2017 18:22:49 +0100 Subject: [PATCH 17/22] Taxonomy: documentation for all the functions, and fixed bugs when closing the taxonomy (overwriting of .pdx files, missing freeing, and re-placed a misplaced condition) --- src/obidms_taxonomy.c | 1266 
+++++++++++++++++++++++++++-------------- src/obidms_taxonomy.h | 399 ++++++++++--- 2 files changed, 1179 insertions(+), 486 deletions(-) diff --git a/src/obidms_taxonomy.c b/src/obidms_taxonomy.c index 9e081f0..aba8641 100644 --- a/src/obidms_taxonomy.c +++ b/src/obidms_taxonomy.c @@ -6,7 +6,7 @@ * @file obidms_taxonomy.c * @author Celine Mercier (celine.mercier@metabarcoding.org) * @date March 2nd 2016 - * @brief Functions for reading binary taxonomy files. + * @brief Functions for handling the reading and writing of taxonomy files. */ @@ -29,7 +29,436 @@ #define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) -int cmp_rank_labels(const void* label1, const void* label2) +/************************************************************************** + * + * D E C L A R A T I O N O F T H E P R I V A T E F U N C T I O N S + * + **************************************************************************/ + + +/** + * @brief Internal function comparing two rank names. + * + * @param label1 A char* pointer on the first rank name. + * @param label2 A char** pointer on a second pointer, that second char* pointer being on the second rank name. + * (making the function usable with an ecorankidx_t structure and functions like bsearch) + * + * @returns A value < 0 if label1 < label2, + * a value > 0 if label1 > label2, + * and 0 if label1 == label2. + */ +static int cmp_rank_labels(const void* label1, const void* label2); + + +/** + * @brief Internal function comparing two taxids, one of them stored in an ecotx_t structure. + * + * @param ptaxid The first taxid. + * @param ptaxon A pointer on an ecotx_t structure where the second taxid is stored. + * + * @returns A value < 0 if taxid1 < taxid2, + * a value > 0 if taxid1 > taxid2, + * and 0 if taxid1 == taxid2. + */ +static int cmp_taxids_in_ecotx_t(const void* ptaxid, const void* ptaxon); + + +/** + * @brief Internal function comparing two taxids, one of them stored in an ecomerged_t structure. 
+ * + * @param ptaxid The first taxid. + * @param ptaxon A pointer on an ecomerged_t structure where the second taxid is stored. + * + * @returns A value < 0 if taxid1 < taxid2, + * a value > 0 if taxid1 > taxid2, + * and 0 if taxid1 == taxid2. + * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int cmp_taxids_in_ecomerged_t(const void* ptaxid, const void* ptaxon); + + +/** + * @brief Internal function comparing two character strings pointed to by char** pointers. + * + * @param s1 A char** pointer on a second pointer, that second char* pointer being on the first character string. + * @param s2 A char** pointer on a second pointer, that second char* pointer being on the second character string. + * + * @returns A value < 0 if s1 < s2, + * a value > 0 if s1 > s2, + * and 0 if s1 == s2. + */ +static int cmp_str(const void* s1, const void* s2); + + +/** + * @brief Internal function comparing two taxon names stored in econame_t structures. + * + * @param n1 A pointer on the first econame_t structure. + * @param n2 A pointer on the second econame_t structure. + * + * @returns A value < 0 if n1 < n2, + * a value > 0 if n1 > n2, + * and 0 if n1 == n2. + * + * @since 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int cmp_names(const void* n1, const void* n2); + + +/** + * @brief Internal function returning the ecotx_t structure associated with a taxid. + * + * This function only looks for the taxid in the modern taxonomy, it does not consider deprecated + * and old taxids, unlike obi_taxo_get_taxon_with_taxid(). + * + * @param taxonomy A pointer on the taxonomy structure. + * @param taxid The taxid of the taxon wanted. + * + * @returns A pointer on the ecotx_t structure associated with a taxid. 
+ * + * @see obi_taxo_get_taxon_with_taxid() + */ +static ecotx_t* get_taxon_with_current_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid); + + +/** + * @brief Internal function returning the complete path to a taxonomy directory in a DMS. + * + * @param dms A pointer on the DMS. + * @param tax_name The name of the taxonomy. + * + * @returns The complete path to the taxonomy directory. + * @retval NULL if an error occurred. + * + * @since 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static char* get_taxonomy_path(OBIDMS_p dms, const char* tax_name); + + +/** + * @brief Internal function returning the index of a rank in an ecorankidx_t structure. + * + * @param label The name of the rank. + * @param ranks A pointer on an ecorankidx_t structure. + * + * @returns The index of a rank in the ecorankidx_t structure. + * @retval -1 if the rank was not found. + */ +static int32_t rank_index(const char* label, ecorankidx_t* ranks); + + +/** + * @brief Internal function opening a binary taxonomy file (.tdx, .rdx, .ndx, .adx, .pdx, .ldx). + * + * @param file_name The file path. + * @param count A pointer on an integer that the function will set to the number of records in the file. + * @param abort_on_open_error A boolean indicating whether the function should trigger an error if the file can't be opened. + * + * @returns The FILE object. + * @retval NULL if an error occurred or if the file was not found. + */ +static FILE* open_ecorecorddb(const char* file_name, int32_t* count, int32_t abort_on_open_error); + + +/** + * @brief Internal function returning the next record in a binary taxonomy file (.tdx, .rdx, .ndx, .adx, .pdx, .ldx). + * + * @param f The file object with the offset at the start of a record. + * @param record_size A pointer on an integer that the function will set to the size of the record. + * + * @returns A pointer on the read record. + * @retval NULL if an error occurred. 
+ */ +static void* read_ecorecord(FILE* f, int32_t* record_size); + + +/** + * @brief Internal function reading the next taxon record in a .tdx binary taxonomy file. + * + * @param f The file object with the offset at the start of a record. + * @param taxon A pointer on an empty, allocated ecotx_t structure that the function will fill. + * + * @returns A pointer on the read record. + * @retval NULL if an error occurred. + */ +static ecotx_t* readnext_ecotaxon(FILE* f, ecotx_t* taxon); + + +/** + * @brief Internal function reading the next taxon name record in a .ndx binary taxonomy file. + * + * @param f The file object with the offset at the start of a record. + * @param name A pointer on an empty, allocated econame_t structure that the function will fill. + * @param taxonomy A pointer on the taxonomy structure. + * + * @returns A pointer on the read record. + * @retval NULL if an error occurred. + */ +static econame_t* readnext_econame(FILE* f, econame_t* name, OBIDMS_taxonomy_p taxonomy); + + +/** + * @brief Internal function reading the next taxon preferred name record in a .pdx binary taxonomy file. + * + * @param f The file object with the offset at the start of a record. + * @param name A pointer on an empty, allocated econame_t structure that the function will fill. + * @param taxonomy A pointer on the taxonomy structure. + * + * @returns A pointer on the read record. + * @retval NULL if an error occurred. + * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static econame_t* readnext_ecopreferredname(FILE* f, econame_t* name, OBIDMS_taxonomy_p taxonomy); + + +/** + * @brief Internal function reading a taxonomic ranks (.rdx) binary taxonomy file. + * + * @param ranks_file_name The name of the .rdx file to read. + * + * @returns A pointer on an ecorankidx_t structure. + * @retval NULL if an error occurred. 
+ */ +static ecorankidx_t* read_ranks_idx(const char* ranks_file_name); + + +/** + * @brief Internal function reading the taxa (.tdx, .ldx) binary taxonomy file. + * + * @param taxa_file_name The name of the .tdx file to read. + * @param local_taxa_file_name The name of the .ldx file containing the local taxa to read if there is one. + * + * @returns A pointer on an ecotxidx_t structure. + * @retval NULL if an error occurred. + */ +static ecotxidx_t* read_taxonomy_idx(const char* taxa_file_name, const char* local_taxa_file_name); + + +/** + * @brief Internal function reading a names (.ndx) binary taxonomy file. + * + * @param file_name The name of the .ndx file to read. + * @param taxonomy A pointer on the taxonomy structure. + * + * @returns A pointer on an econameidx_t structure. + * @retval NULL if an error occurred. + */ +static econameidx_t* read_names_idx(const char* file_name, OBIDMS_taxonomy_p taxonomy); + + +/** + * @brief Internal function reading a preferred names (.pdx) binary taxonomy file. + * + * @param file_name The name of the .pdx file to read. + * @param taxonomy A pointer on the taxonomy structure. + * + * @returns A pointer on an econameidx_t structure. + * @retval NULL if an error occurred. + * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static econameidx_t* read_preferred_names_idx(const char* file_name, OBIDMS_taxonomy_p taxonomy); + + +/** + * @brief Internal function reading a merged index (.adx) binary taxonomy file. + * + * @param file_name The name of the .adx file to read. + * @param taxonomy A pointer on the taxonomy structure. + * + * @returns A pointer on an ecomergedidx_t structure. + * @retval NULL if an error occurred. 
+ * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static ecomergedidx_t* read_merged_idx(const char* file_name, OBIDMS_taxonomy_p taxonomy); + + +/** + * @brief Internal function writing a rank index (.rdx) binary taxonomy file. + * + * @param dms A pointer on the DMS. + * @param tax A pointer on the taxonomy structure. + * @param taxonomy_name The name of the taxonomy. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred. + */ +static int write_ranks_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name); + + +/** + * @brief Internal function writing a taxonomy index (.tdx) binary taxonomy file. + * + * @param dms A pointer on the DMS. + * @param tax A pointer on the taxonomy structure. + * @param taxonomy_name The name of the taxonomy. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred. + */ +static int write_taxonomy_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name); + + +/** + * @brief Internal function writing a local taxonomy index (.ldx) binary taxonomy file. + * + * @param dms A pointer on the DMS. + * @param tax A pointer on the taxonomy structure. + * @param taxonomy_name The name of the taxonomy. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred. + * + * @since 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int write_local_taxonomy_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name); + + +/** + * @brief Internal function writing a names index (.ndx) binary taxonomy file. + * + * @param dms A pointer on the DMS. + * @param tax A pointer on the taxonomy structure. + * @param taxonomy_name The name of the taxonomy. 
+ * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred. + */ +static int write_names_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name); + + +/** + * @brief Internal function writing a preferred names index (.pdx) binary taxonomy file. + * + * @param dms A pointer on the DMS. + * @param tax A pointer on the taxonomy structure. + * @param taxonomy_name The name of the taxonomy. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred. + * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int write_preferred_names_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name); + + +/** + * @brief Internal function writing a merged index (.adx) binary taxonomy file. + * + * @param dms A pointer on the DMS. + * @param tax A pointer on the taxonomy structure. + * @param taxonomy_name The name of the taxonomy. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred. + * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int write_merged_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name); + + +/** + * @brief Internal function reading the 'nodes.dmp' file from an NCBI taxdump. + * + * @param taxdump The path to the taxdump. + * @param tax A pointer on the taxonomy structure. + * @param rank_names_p A char*** pointer on a non allocated char* array where the function will store rank names. + * @param parent_taxids_p An int** pointer on a non allocated int array where the function will store parent taxids. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred. 
+ * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int read_nodes_dmp(const char* taxdump, OBIDMS_taxonomy_p tax, char*** rank_names_p, int** parent_taxids_p); + + +/** + * @brief Internal function reading the 'delnodes.dmp' file from an NCBI taxdump. + * + * @param taxdump The path to the taxdump. + * @param tax A pointer on the taxonomy structure. + * @param delnodes_p An int** pointer on a non allocated int array where the function will store deleted taxids. + * @param delnodes_count An int* pointer where the function will store the number of deleted taxids. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred. + * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int read_delnodes_dmp(const char* taxdump, OBIDMS_taxonomy_p tax, int32_t** delnodes_p, int32_t* delnodes_count); + + +/** + * @brief Internal function reading the 'merged.dmp' file from an NCBI taxdump. + * + * @warning Should be used AFTER read_nodes_dmp() and read_delnodes_dmp(). + * + * The function merges the information about current nodes previously read in read_nodes_dmp(), + * the information about deleted nodes previously read in read_delnodes_dmp(), and the information read + * in the 'merged.dmp' file, to build the final merged taxon index in the taxonomy structure. + * + * @param taxdump The path to the taxdump. + * @param tax A pointer on the taxonomy structure. + * @param delnodes An int* pointer containing the deleted taxids. + * @param delnodes_count The number of deleted taxids. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred. 
+ * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int read_merged_dmp(const char* taxdump, OBIDMS_taxonomy_p tax, int32_t* delnodes, int32_t delnodes_count); + + +/** + * @brief Internal function reading the 'names.dmp' file from an NCBI taxdump. + * + * @param taxdump The path to the taxdump. + * @param tax A pointer on the taxonomy structure. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred. + * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int read_names_dmp(const char* taxdump, OBIDMS_taxonomy_p tax); + + +/************************************************************************ + * + * D E F I N I T I O N O F T H E P R I V A T E F U N C T I O N S + * + ************************************************************************/ + + +static int cmp_rank_labels(const void* label1, const void* label2) { return strcmp((const char*)label1,*(const char**)label2); } @@ -66,7 +495,23 @@ static int cmp_names(const void* n1, const void* n2) } -char* get_taxonomy_path(OBIDMS_p dms, const char* tax_name) +static ecotx_t* get_taxon_with_current_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid) +{ + ecotx_t *current_taxon; + int32_t count; + + count = (taxonomy->taxa)->count; + + current_taxon = (ecotx_t*) bsearch((const void *) ((size_t) taxid), + (const void *) taxonomy->taxa->taxon, + count, + sizeof(ecotx_t), + cmp_taxids_in_ecotx_t); + return current_taxon; +} + + +static char* get_taxonomy_path(OBIDMS_p dms, const char* tax_name) { char* all_tax_dir_path; char* tax_path; @@ -98,7 +543,7 @@ char* get_taxonomy_path(OBIDMS_p dms, const char* tax_name) } -int32_t rank_index(const char* label, ecorankidx_t* ranks) +static int32_t rank_index(const char* label, ecorankidx_t* ranks) { char **rep; @@ -111,7 +556,50 @@ int32_t rank_index(const char* label, ecorankidx_t* ranks) } -void* 
read_ecorecord(FILE* f, int32_t* record_size) +static FILE* open_ecorecorddb(const char* file_name, + int32_t* count, + int32_t abort_on_open_error) +{ + FILE* f; + int32_t read; + + f = fopen(file_name, "rb"); + + if (!f) + { + if (abort_on_open_error) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nCouldn't open a taxonomy file"); + fclose(f); + return NULL; + } + else + { + *count = 0; + fclose(f); + return NULL; + } + } + + read = fread(count, + sizeof(int32_t), + 1, + f); + + if (read != 1) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError reading taxonomy record size"); + fclose(f); + return NULL; + } + + return f; +} + + +static void* read_ecorecord(FILE* f, int32_t* record_size) { static void* buffer = NULL; int32_t buffer_size = 0; @@ -174,7 +662,7 @@ void* read_ecorecord(FILE* f, int32_t* record_size) }; -ecotx_t* readnext_ecotaxon(FILE* f, ecotx_t* taxon) +static ecotx_t* readnext_ecotaxon(FILE* f, ecotx_t* taxon) { ecotxformat_t* raw; int32_t record_length; @@ -203,50 +691,100 @@ ecotx_t* readnext_ecotaxon(FILE* f, ecotx_t* taxon) } -FILE* open_ecorecorddb(const char* file_name, - int32_t* count, - int32_t abort_on_open_error) +static econame_t* readnext_econame(FILE* f, econame_t* name, OBIDMS_taxonomy_p taxonomy) { - FILE* f; - int32_t read; + econameformat_t* raw; + int32_t record_length; - f = fopen(file_name, "rb"); + raw = read_ecorecord(f, &record_length); + if (raw == NULL) + return NULL; - if (!f) + name->is_scientific_name = raw->is_scientific_name; + + name->name = malloc((raw->name_length + 1) * sizeof(char)); + if (name->name == NULL) { - if (abort_on_open_error) - { - obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nCouldn't open a taxonomy file"); - fclose(f); - return NULL; - } - else - { - *count = 0; - fclose(f); - return NULL; - } - } - - read = fread(count, - sizeof(int32_t), - 1, - f); - - if (read != 1) - { - obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nError reading taxonomy record size"); - 
fclose(f); + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a taxon name"); + free(raw); return NULL; } + strncpy(name->name, raw->names, raw->name_length); + name->name[raw->name_length] = 0; - return f; + name->class_name = malloc((raw->class_length+1) * sizeof(char)); + if (name->class_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a taxon class name"); + free(name->name); + free(raw); + return NULL; + } + strncpy(name->class_name,(raw->names + raw->name_length), raw->class_length); + name->class_name[raw->class_length] = 0; + + name->taxon = taxonomy->taxa->taxon + raw->taxid; + + return name; } -ecorankidx_t* read_ranks_idx(const char* ranks_file_name) +static econame_t* readnext_ecopreferredname(FILE* f, econame_t* name, OBIDMS_taxonomy_p taxonomy) +{ + econameformat_t* raw; + int32_t record_length; + + raw = read_ecorecord(f, &record_length); + if (raw == NULL) + return NULL; + + name->is_scientific_name = raw->is_scientific_name; + + name->name = malloc((raw->name_length + 1) * sizeof(char)); + if (name->name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a taxon preferred name"); + free(raw); + return NULL; + } + strncpy(name->name, raw->names, raw->name_length); + name->name[raw->name_length] = 0; + + name->class_name = malloc((raw->class_length+1) * sizeof(char)); + if (name->class_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a taxon class name"); + free(name->name); + free(raw); + return NULL; + } + strncpy(name->class_name,(raw->names + raw->name_length), raw->class_length); + name->class_name[raw->class_length] = 0; + + name->taxon = taxonomy->taxa->taxon + raw->taxid; + + // Add the preferred name in the taxon structure // TODO discuss: couldn't they all use the same pointer? 
+ (taxonomy->taxa->taxon + raw->taxid)->preferred_name = malloc((raw->name_length + 1) * sizeof(char)); + if ((taxonomy->taxa->taxon + raw->taxid)->preferred_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a taxon preferred name"); + free(name->name); + free(name->class_name); + free(raw); + return NULL; + } + strcpy((taxonomy->taxa->taxon + raw->taxid)->preferred_name, name->name); + + return name; +} + + +static ecorankidx_t* read_ranks_idx(const char* ranks_file_name) { int32_t count; FILE* ranks_file; @@ -301,7 +839,7 @@ ecorankidx_t* read_ranks_idx(const char* ranks_file_name) } -ecotxidx_t* read_taxonomy_idx(const char* taxa_file_name, const char* local_taxa_file_name) +static ecotxidx_t* read_taxonomy_idx(const char* taxa_file_name, const char* local_taxa_file_name) { int32_t count_taxa; int32_t count_local_taxa; @@ -394,100 +932,7 @@ ecotxidx_t* read_taxonomy_idx(const char* taxa_file_name, const char* local_taxa } -econame_t* readnext_econame(FILE* f, econame_t* name, OBIDMS_taxonomy_p taxonomy) -{ - econameformat_t* raw; - int32_t record_length; - - raw = read_ecorecord(f, &record_length); - if (raw == NULL) - return NULL; - - name->is_scientific_name = raw->is_scientific_name; - - name->name = malloc((raw->name_length + 1) * sizeof(char)); - if (name->name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for a taxon name"); - free(raw); - return NULL; - } - strncpy(name->name, raw->names, raw->name_length); - name->name[raw->name_length] = 0; - - name->class_name = malloc((raw->class_length+1) * sizeof(char)); - if (name->class_name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for a taxon class name"); - free(name->name); - free(raw); - return NULL; - } - strncpy(name->class_name,(raw->names + raw->name_length), raw->class_length); - name->class_name[raw->class_length] = 0; - - name->taxon = taxonomy->taxa->taxon + 
raw->taxid; - - return name; -} - - -econame_t* readnext_ecopreferredname(FILE* f, econame_t* name, OBIDMS_taxonomy_p taxonomy) -{ - econameformat_t* raw; - int32_t record_length; - - raw = read_ecorecord(f, &record_length); - if (raw == NULL) - return NULL; - - name->is_scientific_name = raw->is_scientific_name; - - name->name = malloc((raw->name_length + 1) * sizeof(char)); - if (name->name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for a taxon preferred name"); - free(raw); - return NULL; - } - strncpy(name->name, raw->names, raw->name_length); - name->name[raw->name_length] = 0; - - name->class_name = malloc((raw->class_length+1) * sizeof(char)); - if (name->class_name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for a taxon class name"); - free(name->name); - free(raw); - return NULL; - } - strncpy(name->class_name,(raw->names + raw->name_length), raw->class_length); - name->class_name[raw->class_length] = 0; - - name->taxon = taxonomy->taxa->taxon + raw->taxid; - - // Add the preferred name in the taxon structure // TODO discuss: couldn't they all use the same pointer? 
- (taxonomy->taxa->taxon + raw->taxid)->preferred_name = malloc((raw->name_length + 1) * sizeof(char)); - if ((taxonomy->taxa->taxon + raw->taxid)->preferred_name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for a taxon preferred name"); - free(name->name); - free(name->class_name); - free(raw); - return NULL; - } - strcpy((taxonomy->taxa->taxon + raw->taxid)->preferred_name, name->name); - - return name; -} - - -econameidx_t* read_names_idx(const char *file_name, OBIDMS_taxonomy_p taxonomy) +static econameidx_t* read_names_idx(const char *file_name, OBIDMS_taxonomy_p taxonomy) { int32_t count; FILE* f; @@ -526,7 +971,7 @@ econameidx_t* read_names_idx(const char *file_name, OBIDMS_taxonomy_p taxonomy) } -econameidx_t* read_preferred_names_idx(const char *file_name, OBIDMS_taxonomy_p taxonomy) +static econameidx_t* read_preferred_names_idx(const char *file_name, OBIDMS_taxonomy_p taxonomy) { int32_t count; FILE* f; @@ -565,7 +1010,7 @@ econameidx_t* read_preferred_names_idx(const char *file_name, OBIDMS_taxonomy_p } -ecomergedidx_t* read_merged_idx(const char *file_name, OBIDMS_taxonomy_p taxonomy) +static ecomergedidx_t* read_merged_idx(const char *file_name, OBIDMS_taxonomy_p taxonomy) { int32_t count; FILE* f; @@ -610,15 +1055,7 @@ ecomergedidx_t* read_merged_idx(const char *file_name, OBIDMS_taxonomy_p taxonom } - - - - - - -// Functions to write taxonomy structure to binary files - -int write_ranks_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? +static int write_ranks_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? { int i; char* file_name; @@ -721,7 +1158,7 @@ int write_ranks_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_na } -int write_taxonomy_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? 
keep argument but if NULL, use the one in struct? +static int write_taxonomy_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? keep argument but if NULL, use the one in struct? { int i; char* file_name; @@ -858,7 +1295,7 @@ int write_taxonomy_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy } -int write_local_taxonomy_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? keep argument but if NULL, use the one in struct? +static int write_local_taxonomy_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? keep argument but if NULL, use the one in struct? { int i; char* file_name; @@ -995,7 +1432,7 @@ int write_local_taxonomy_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* ta } -int write_names_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? keep argument but if NULL, use the one in struct? +static int write_names_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? keep argument but if NULL, use the one in struct? { int i; char* file_name; @@ -1143,7 +1580,7 @@ int write_names_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_na } -int write_preferred_names_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? keep argument but if NULL, use the one in struct? +static int write_preferred_names_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? keep argument but if NULL, use the one in struct? 
{ int i; char* file_name; @@ -1185,7 +1622,7 @@ int write_preferred_names_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* t free(taxonomy_path); // Create file - file_descriptor = open(file_name, O_RDWR | O_CREAT | O_EXCL, 0777); + file_descriptor = open(file_name, O_RDWR | O_CREAT, 0777); if (file_descriptor < 0) { obi_set_errno(OBI_TAXONOMY_ERROR); @@ -1291,7 +1728,7 @@ int write_preferred_names_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* t } -int write_merged_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? +static int write_merged_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? { int i; char* file_name; @@ -1399,48 +1836,6 @@ int write_merged_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_n } -int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name) -{ - char* taxonomy_path; - - // Build the taxonomy directory path - taxonomy_path = get_taxonomy_path(dms, tax_name); - if (taxonomy_path == NULL) - return -1; - - // Try to create the directory - if (mkdir(taxonomy_path, 00777) < 0) - { - if (errno == EEXIST) - obidebug(1, "\nA taxonomy already exists with this name."); - obidebug(1, "\nProblem creating a new taxonomy directory"); - free(taxonomy_path); - return -1; - } - - free(taxonomy_path); - - if (write_ranks_idx(dms, tax, tax_name) < 0) - return -1; - if (write_taxonomy_idx(dms, tax, tax_name) < 0) - return -1; - if (write_names_idx(dms, tax, tax_name) < 0) - return -1; - if (write_merged_idx(dms, tax, tax_name) < 0) - return -1; - // Check if there are local taxa (if so last taxon is local) - if ((tax->taxa)->local_count > 0) - if (write_local_taxonomy_idx(dms, tax, tax_name) < 0) - return -1; - // Write preferred names if there are some - if (tax->preferred_names != NULL) - if (write_preferred_names_idx(dms, tax, tax_name) < 0) - return -1; - return 0; -} - - - int read_nodes_dmp(const char* 
taxdump, OBIDMS_taxonomy_p tax, char*** rank_names_p, int** parent_taxids_p) { struct dirent* dp; @@ -2049,7 +2444,7 @@ int read_merged_dmp(const char* taxdump, OBIDMS_taxonomy_p tax, int32_t* delnode // Store the deprecated taxid with the index that refers to the new taxid // Find the index of the new taxid - t = obi_taxo_get_taxon_with_current_taxid(tax, taxid); + t = get_taxon_with_current_taxid(tax, taxid); // Store the old taxid with the index (tax->merged_idx)->merged[n].taxid = old_taxid; (tax->merged_idx)->merged[n].idx = t->idx; @@ -2335,6 +2730,13 @@ int read_names_dmp(const char* taxdump, OBIDMS_taxonomy_p tax) } +/********************************************************************** + * + * D E F I N I T I O N O F T H E P U B L I C F U N C T I O N S + * + **********************************************************************/ + + OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { OBIDMS_taxonomy_p tax; @@ -2524,7 +2926,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) // Associate the taxa with their parent for (i=0; i < (tax->taxa)->count; i++) { - ((tax->taxa)->taxon)[i].parent = obi_taxo_get_taxon_with_current_taxid(tax, parent_taxids[i]); + ((tax->taxa)->taxon)[i].parent = get_taxon_with_current_taxid(tax, parent_taxids[i]); if (((tax->taxa)->taxon)[i].parent == NULL) { obi_set_errno(OBI_TAXONOMY_ERROR); @@ -2574,192 +2976,6 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) } -int obi_taxo_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const char* rank_name, int32_t parent_taxid, int32_t min_taxid) -{ - int32_t taxid; - ecotx_t* taxon; - int i; -// econame_t* name_struct; - - // Enlarge the structure memory for a new taxon - tax->taxa = (ecotxidx_t*) realloc(tax->taxa, sizeof(ecotxidx_t) + sizeof(ecotx_t) * (((tax->taxa)->count) + 1)); - if (tax->taxa == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError reallocating memory for a taxonomy structure to add a new taxon"); - return -1; - } - - // 
Compute new taxid that must be equal or greater than 1E7 and greater than the maximum taxid existing in the taxonomy - if (min_taxid < MIN_LOCAL_TAXID) - min_taxid = MIN_LOCAL_TAXID; - if (min_taxid > (tax->taxa)->max_taxid) - taxid = min_taxid; - else - taxid = ((tax->taxa)->max_taxid) + 1; - - // Fill the ecotx_t node structure - taxon = ((tax->taxa)->taxon)+((tax->taxa)->count); - taxon->taxid = taxid; - taxon->idx = (tax->taxa)->count; - taxon->local = true; - taxon->name = (char*) malloc((strlen(name) + 1) * sizeof(char)); - if (taxon->name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for a taxon name to add a new taxon"); - return -1; - } - strcpy(taxon->name, name); - taxon->rank = -1; - for (i=0; i < (tax->ranks)->count; i++) - { - if (strcmp(rank_name, ((tax->ranks)->label)[i]) == 0) - { - taxon->rank = i; - break; - } - } - if (taxon->rank == -1) - { - obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nError: taxon rank not found when adding a new taxon"); - return -1; - } - taxon->parent = obi_taxo_get_taxon_with_taxid(tax, parent_taxid); - if (taxon->parent == NULL) - { - obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nError: taxon parent not found when adding a new taxon"); - return -1; - } - taxon->farest = 0; - - // Update taxonomy counts etc - (tax->taxa)->max_taxid = taxid; - ((tax->taxa)->count)++; - ((tax->taxa)->local_count)++; - (tax->taxa)->buffer_size = (tax->taxa)->count; - -// // Add new name in names structure // Commented because the new name was not added in the .ndx file in the OBITools1 -// // Allocate memory for new name -// tax->names = (econameidx_t*) realloc(tax->names, sizeof(econameidx_t) + sizeof(econame_t) * ((tax->names)->count + 1)); -// if (tax->names == NULL) -// { -// obi_set_errno(OBI_MALLOC_ERROR); -// obidebug(1, "\nError reallocating memory for a taxonomy structure to add a new taxon"); -// return -1; -// } -// -// // Add new name -// name_struct = (tax->names)->names + 
((tax->names)->count); -// name_struct->name = (char*) malloc((strlen(name) + 1) * sizeof(char)); -// if (name_struct->name == NULL) -// { -// obi_set_errno(OBI_MALLOC_ERROR); -// obidebug(1, "\nError allocating memory for a taxon name to add a new taxon"); -// return -1; -// } -// strcpy(name_struct->name, name); -// name_struct->class_name = (char*) malloc((strlen("scientific name") + 1) * sizeof(char)); -// if (name_struct->class_name == NULL) -// { -// obi_set_errno(OBI_MALLOC_ERROR); -// obidebug(1, "\nError allocating memory for a taxon class name to add a new taxon"); -// return -1; -// } -// strcpy(name_struct->class_name, "scientific name"); -// name_struct->is_scientific_name = true; -// name_struct->taxon = ((tax->taxa)->taxon) + ((tax->taxa)->count) - 1; -// -// // Sort names in alphabetical order -// qsort((tax->names)->names, (tax->names)->count, sizeof(econame_t), cmp_names); -// -// // Update name count -// ((tax->names)->count)++; - - return taxid; -} - - -int obi_taxo_add_preferred_name_with_taxid(OBIDMS_taxonomy_p tax, int32_t taxid, const char* preferred_name) -{ - ecotx_t* taxon; - - taxon = obi_taxo_get_taxon_with_taxid(tax, taxid); - - return obi_taxo_add_preferred_name_with_taxon(tax, taxon, preferred_name); -} - - -int obi_taxo_add_preferred_name_with_taxon(OBIDMS_taxonomy_p tax, ecotx_t* taxon, const char* preferred_name) -{ - econame_t* name_struct; - - // Free previous preferred name if there is one - if (taxon->preferred_name != NULL) - free(taxon->preferred_name); - - taxon->preferred_name = (char*) malloc((strlen(preferred_name) + 1) * sizeof(char)); - if (taxon->preferred_name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for a new preferred name for a taxon"); - return -1; - } - strcpy(taxon->preferred_name, preferred_name); - - // Add new name in preferred names structure - // Allocate or reallocate memory for new name - if (tax->preferred_names == NULL) - { - tax->preferred_names = 
(econameidx_t*) malloc(sizeof(econameidx_t) + sizeof(econame_t)); - (tax->preferred_names)->count = 0; - } - else - tax->preferred_names = (econameidx_t*) realloc(tax->preferred_names, sizeof(econameidx_t) + sizeof(econame_t) * ((tax->preferred_names)->count + 1)); - if (tax->preferred_names == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError reallocating memory for a taxonomy structure to add a new preferred name"); - return -1; - } - - // Add new preferred name - name_struct = (tax->preferred_names)->names + ((tax->preferred_names)->count); - name_struct->name = (char*) malloc((strlen(preferred_name) + 1) * sizeof(char)); - if (name_struct->name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for a new taxon preferred name"); - return -1; - } - strcpy(name_struct->name, preferred_name); - - name_struct->class_name = (char*) malloc((strlen("preferred name") + 1) * sizeof(char)); - if (name_struct->class_name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for a taxon class name to add a new preferred name"); - return -1; - } - strcpy(name_struct->class_name, "preferred name"); - name_struct->is_scientific_name = false; - name_struct->taxon = taxon; - - // Sort preferred names in alphabetical order - qsort((tax->preferred_names)->names, (tax->preferred_names)->count, sizeof(econame_t), cmp_names); - - // Update preferred name count - ((tax->preferred_names)->count)++; - - return 0; -} - - -/////// PUBLIC ///////// - - OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, bool read_alternative_names) { OBIDMS_taxonomy_p tax; @@ -2977,28 +3193,82 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo } +int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name) +{ + char* taxonomy_path; + + // Build the taxonomy directory path + taxonomy_path = get_taxonomy_path(dms, tax_name); + if 
(taxonomy_path == NULL) + return -1; + + // Try to create the directory + if (mkdir(taxonomy_path, 00777) < 0) + { + if (errno == EEXIST) + obidebug(1, "\nA taxonomy already exists with this name."); + obidebug(1, "\nProblem creating a new taxonomy directory"); + free(taxonomy_path); + return -1; + } + + free(taxonomy_path); + + if (write_ranks_idx(dms, tax, tax_name) < 0) + return -1; + if (write_taxonomy_idx(dms, tax, tax_name) < 0) + return -1; + if (write_names_idx(dms, tax, tax_name) < 0) + return -1; + if (write_merged_idx(dms, tax, tax_name) < 0) + return -1; + // Check if there are local taxa (if so last taxon is local) + if ((tax->taxa)->local_count > 0) + if (write_local_taxonomy_idx(dms, tax, tax_name) < 0) + return -1; + // Write preferred names if there are some + if (tax->preferred_names != NULL) + if (write_preferred_names_idx(dms, tax, tax_name) < 0) + return -1; + return 0; +} + + int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy) { int i; - // Update local informations (local taxa and preferred names) if there are any - if ((taxonomy->taxa)->local_count > 0) - { - if (taxonomy->dms == NULL) - { - obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nError closing a taxonomy with local files but no DMS associated (probably read directly from taxdump)"); // TODO discuss - } - if (write_local_taxonomy_idx(taxonomy->dms, taxonomy, taxonomy->tax_name) < 0) - return -1; - } - // Write preferred names if there are some - if (taxonomy->preferred_names != NULL) - if (write_preferred_names_idx(taxonomy->dms, taxonomy, taxonomy->tax_name) < 0) - return -1; - if (taxonomy) { + // Update local informations (local taxa and preferred names) if there are any + if ((taxonomy->taxa)->local_count > 0) + { + if (taxonomy->dms == NULL) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError closing a taxonomy with local files but no DMS associated (probably read directly from taxdump)"); // TODO discuss + } + if (write_local_taxonomy_idx(taxonomy->dms, taxonomy, 
taxonomy->tax_name) < 0) + return -1; + } + + // Write preferred names if there are some + if (taxonomy->preferred_names) + { + if (write_preferred_names_idx(taxonomy->dms, taxonomy, taxonomy->tax_name) < 0) + return -1; + + // Free preferred names + for (i=0; i < (taxonomy->preferred_names)->count; i++) + { + if (((taxonomy->preferred_names)->names[i]).name) + free(((taxonomy->preferred_names)->names[i]).name); + if (((taxonomy->preferred_names)->names[i]).class_name) + free(((taxonomy->preferred_names)->names[i]).class_name); + } + free(taxonomy->preferred_names); + } + if (taxonomy->ranks) { for (i=0; i < (taxonomy->ranks)->count; i++) @@ -3043,7 +3313,187 @@ int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy) } -////////////////////////////////////////////////////////////////////////// +int obi_taxo_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const char* rank_name, int32_t parent_taxid, int32_t min_taxid) +{ + int32_t taxid; + ecotx_t* taxon; + int i; +// econame_t* name_struct; + + // Enlarge the structure memory for a new taxon + tax->taxa = (ecotxidx_t*) realloc(tax->taxa, sizeof(ecotxidx_t) + sizeof(ecotx_t) * (((tax->taxa)->count) + 1)); + if (tax->taxa == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating memory for a taxonomy structure to add a new taxon"); + return -1; + } + + // Compute new taxid that must be equal or greater than 1E7 and greater than the maximum taxid existing in the taxonomy + if (min_taxid < MIN_LOCAL_TAXID) + min_taxid = MIN_LOCAL_TAXID; + if (min_taxid > (tax->taxa)->max_taxid) + taxid = min_taxid; + else + taxid = ((tax->taxa)->max_taxid) + 1; + + // Fill the ecotx_t node structure + taxon = ((tax->taxa)->taxon)+((tax->taxa)->count); + taxon->taxid = taxid; + taxon->idx = (tax->taxa)->count; + taxon->local = true; + taxon->name = (char*) malloc((strlen(name) + 1) * sizeof(char)); + if (taxon->name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for 
a taxon name to add a new taxon"); + return -1; + } + strcpy(taxon->name, name); + taxon->rank = -1; + for (i=0; i < (tax->ranks)->count; i++) + { + if (strcmp(rank_name, ((tax->ranks)->label)[i]) == 0) + { + taxon->rank = i; + break; + } + } + if (taxon->rank == -1) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError: taxon rank not found when adding a new taxon"); + return -1; + } + taxon->parent = obi_taxo_get_taxon_with_taxid(tax, parent_taxid); + if (taxon->parent == NULL) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError: taxon parent not found when adding a new taxon"); + return -1; + } + taxon->farest = 0; + + // Update taxonomy counts etc + (tax->taxa)->max_taxid = taxid; + ((tax->taxa)->count)++; + ((tax->taxa)->local_count)++; + (tax->taxa)->buffer_size = (tax->taxa)->count; + +// // Add new name in names structure // Commented because the new name was not added in the .ndx file in the OBITools1 +// // Allocate memory for new name +// tax->names = (econameidx_t*) realloc(tax->names, sizeof(econameidx_t) + sizeof(econame_t) * ((tax->names)->count + 1)); +// if (tax->names == NULL) +// { +// obi_set_errno(OBI_MALLOC_ERROR); +// obidebug(1, "\nError reallocating memory for a taxonomy structure to add a new taxon"); +// return -1; +// } +// +// // Add new name +// name_struct = (tax->names)->names + ((tax->names)->count); +// name_struct->name = (char*) malloc((strlen(name) + 1) * sizeof(char)); +// if (name_struct->name == NULL) +// { +// obi_set_errno(OBI_MALLOC_ERROR); +// obidebug(1, "\nError allocating memory for a taxon name to add a new taxon"); +// return -1; +// } +// strcpy(name_struct->name, name); +// name_struct->class_name = (char*) malloc((strlen("scientific name") + 1) * sizeof(char)); +// if (name_struct->class_name == NULL) +// { +// obi_set_errno(OBI_MALLOC_ERROR); +// obidebug(1, "\nError allocating memory for a taxon class name to add a new taxon"); +// return -1; +// } +// strcpy(name_struct->class_name, 
"scientific name"); +// name_struct->is_scientific_name = true; +// name_struct->taxon = ((tax->taxa)->taxon) + ((tax->taxa)->count) - 1; +// +// // Sort names in alphabetical order +// qsort((tax->names)->names, (tax->names)->count, sizeof(econame_t), cmp_names); +// +// // Update name count +// ((tax->names)->count)++; + + return taxid; +} + + +int obi_taxo_add_preferred_name_with_taxid(OBIDMS_taxonomy_p tax, int32_t taxid, const char* preferred_name) +{ + ecotx_t* taxon; + + taxon = obi_taxo_get_taxon_with_taxid(tax, taxid); + + return obi_taxo_add_preferred_name_with_taxon(tax, taxon, preferred_name); +} + + +int obi_taxo_add_preferred_name_with_taxon(OBIDMS_taxonomy_p tax, ecotx_t* taxon, const char* preferred_name) +{ + econame_t* name_struct; + + // Free previous preferred name if there is one + if (taxon->preferred_name != NULL) + free(taxon->preferred_name); + + taxon->preferred_name = (char*) malloc((strlen(preferred_name) + 1) * sizeof(char)); + if (taxon->preferred_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a new preferred name for a taxon"); + return -1; + } + strcpy(taxon->preferred_name, preferred_name); + + // Add new name in preferred names structure + // Allocate or reallocate memory for new name + if (tax->preferred_names == NULL) + { + tax->preferred_names = (econameidx_t*) malloc(sizeof(econameidx_t) + sizeof(econame_t)); + (tax->preferred_names)->count = 0; + } + else + tax->preferred_names = (econameidx_t*) realloc(tax->preferred_names, sizeof(econameidx_t) + sizeof(econame_t) * ((tax->preferred_names)->count + 1)); + if (tax->preferred_names == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating memory for a taxonomy structure to add a new preferred name"); + return -1; + } + + // Add new preferred name + name_struct = (tax->preferred_names)->names + ((tax->preferred_names)->count); + name_struct->name = (char*) malloc((strlen(preferred_name) + 1) * 
sizeof(char)); + if (name_struct->name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a new taxon preferred name"); + return -1; + } + strcpy(name_struct->name, preferred_name); + + name_struct->class_name = (char*) malloc((strlen("preferred name") + 1) * sizeof(char)); + if (name_struct->class_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a taxon class name to add a new preferred name"); + return -1; + } + strcpy(name_struct->class_name, "preferred name"); + name_struct->is_scientific_name = false; + name_struct->taxon = taxon; + + // Sort preferred names in alphabetical order + qsort((tax->preferred_names)->names, (tax->preferred_names)->count, sizeof(econame_t), cmp_names); + + // Update preferred name count + ((tax->preferred_names)->count)++; + + return 0; +} ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx) @@ -3068,22 +3518,6 @@ ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx) } -ecotx_t* obi_taxo_get_taxon_with_current_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid) // TODO discuss keeping private? 
-{ - ecotx_t *current_taxon; - int32_t count; - - count = (taxonomy->taxa)->count; - - current_taxon = (ecotx_t*) bsearch((const void *) ((size_t) taxid), - (const void *) taxonomy->taxa->taxon, - count, - sizeof(ecotx_t), - cmp_taxids_in_ecotx_t); - return current_taxon; -} - - ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid) { ecotx_t *current_taxon; @@ -3234,5 +3668,3 @@ ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy) return obi_taxo_get_parent_at_rank(taxon, rankindex); } - - diff --git a/src/obidms_taxonomy.h b/src/obidms_taxonomy.h index fa2f511..dcce499 100644 --- a/src/obidms_taxonomy.h +++ b/src/obidms_taxonomy.h @@ -6,7 +6,7 @@ * @file obidms_taxonomy.h * @author Celine Mercier (celine.mercier@metabarcoding.org) * @date March 2nd 2016 - * @brief Header file for the functions handling the reading of binary taxonomy files. + * @brief Header file for the functions handling the reading and writing of taxonomy files. */ @@ -17,123 +17,384 @@ #include "obidms.h" -#define MIN_LOCAL_TAXID (10000000) -#define TAX_NAME_LEN (1024) +#define MIN_LOCAL_TAXID (10000000) /**< The minimum taxid for a taxon added locally (i.e. not an NCBI taxon). + */ +#define TAX_NAME_LEN (1024) /**< The maximum length for the taxonomy name. + */ +/** + * @brief Structure for a taxon as stored in a .tdx file. + */ typedef struct { - int32_t taxid; - int32_t rank; - int32_t parent; - int32_t name_length; - char name[]; + int32_t taxid; /**< Taxid. + */ + int32_t rank; /**< Rank index. + */ + int32_t parent; /**< Index, in the taxid index, of the parent node in the taxonomic tree. + */ + int32_t name_length; /**< Length of the taxon scientific name. + */ + char name[]; /**< Scientific name of the taxon. + */ } ecotxformat_t; +/** + * @brief Structure for a taxon as stored in a taxonomy structure. 
+ */ typedef struct ecotxnode { - int32_t taxid; // TODO discuss that this is will be the current taxid even if the struct was accessed through a deprecated one - int32_t rank; - int32_t farest; - int32_t idx; - struct ecotxnode* parent; - char* name; // scientific name - char* preferred_name; // preferred name - bool local; + int32_t taxid; /**< Taxid. // TODO discuss that this is will be the current taxid even if the struct was accessed through a deprecated one + */ + int32_t rank; /**< Rank index in ecorankidx_t structure. + */ + int32_t farest; /**< Longest branch length, used to compute distances between taxa faster. + */ + int32_t idx; /**< Index in the ecotxidx_t structure. + */ + struct ecotxnode* parent; /**< Pointer on the parent node in the taxonomic tree. + */ + char* name; /**< Scientific name of the taxon. + */ + char* preferred_name; /**< Preferred name of the taxon if there is one, otherwise NULL. + */ + bool local; /**< A boolean indicating whether the taxon is local or not. + */ } ecotx_t; +/** + * @brief Structure for the taxon index in a taxonomy structure. + */ typedef struct { - int32_t count; - int32_t ncbi_count; - int32_t local_count; - int32_t max_taxid; - int32_t buffer_size; - ecotx_t taxon[]; + int32_t count; /**< Number of taxa. + */ + int32_t ncbi_count; /**< Number of NCBI taxa. + */ + int32_t local_count; /**< Number of taxa added locally. + */ + int32_t max_taxid; /**< Maximum taxid existing in the taxon index. + */ + int32_t buffer_size; /**< Number of taxa. // TODO kept this but not sure of its use + */ + ecotx_t taxon[]; /**< Taxon array. + */ } ecotxidx_t; +/** + * @brief Structure for the rank index in a taxonomy structure. + */ typedef struct { - int32_t count; - char* label[]; + int32_t count; /**< Number of ranks. + */ + char* label[]; /**< Array of rank names. + */ } ecorankidx_t; +/** + * @brief Structure for a taxon name as stored in a .ndx file. 
+ */ typedef struct { - int32_t is_scientific_name; - int32_t name_length; - int32_t class_length; - int32_t taxid; // taxid idx - char names[]; + int32_t is_scientific_name; /**< A boolean indicating whether the name is a scientific name or not. + */ + int32_t name_length; /**< The name length. + */ + int32_t class_length; /**< The name class length. + */ + int32_t taxid; /**< Index of the taxon in the taxid index. + */ + char names[]; /**< Taxon name and name class concatenated. + */ } econameformat_t; +/** + * @brief Structure for a taxon name as stored in a taxonomy structure. + */ typedef struct { - char* name; - char* class_name; - int32_t is_scientific_name; - struct ecotxnode* taxon; + char* name; /**< Taxon name. + */ + char* class_name; /**< Name class. + */ + int32_t is_scientific_name; /**< A boolean indicating whether the name is a scientific name or not. + */ + struct ecotxnode* taxon; /**< Pointer on the taxon in the taxon index. + */ } econame_t; +/** + * @brief Structure for the name index in a taxonomy structure. + */ typedef struct { - int32_t count; - econame_t names[]; + int32_t count; /**< Number of names. + */ + econame_t names[]; /**< Array of names. + */ } econameidx_t; +/** + * @brief Structure for a taxid/index pair as stored in a taxonomy structure. + */ typedef struct { - int32_t taxid; - int32_t idx; + int32_t taxid; /**< Taxid. + */ + int32_t idx; /**< Index of the taxid in the taxon index, -1 if the taxid is deprecated. + */ } ecomerged_t; +/** + * @brief Structure for a merged taxid index in a taxonomy structure. + * + * This index includes all deprecated taxids that now refer to different taxids, and + * the deprecated taxids that are deleted. + * + */ typedef struct { - int32_t count; - ecomerged_t merged[]; + int32_t count; /**< Number of taxid/index pairs. + */ + ecomerged_t merged[]; /**< Array of taxid/index pairs. + */ } ecomergedidx_t; +/** + * @brief Structure for a taxonomy. 
+ */ typedef struct OBIDMS_taxonomy_t { - char tax_name[TAX_NAME_LEN]; - OBIDMS_p dms; - ecomergedidx_t* merged_idx; - ecorankidx_t* ranks; - econameidx_t* names; - econameidx_t* preferred_names; - ecotxidx_t* taxa; + char tax_name[TAX_NAME_LEN]; /**< Taxonomy name. + */ + OBIDMS_p dms; /**< A pointer on the DMS to which the taxonomy belongs. + */ + ecomergedidx_t* merged_idx; /**< Merged taxid index. + */ + ecorankidx_t* ranks; /**< Taxonomic ranks. + */ + econameidx_t* names; /**< Taxon names. + */ + econameidx_t* preferred_names; /**< Taxon preferred names (i.e. added locally). + */ + ecotxidx_t* taxa; /**< Taxa. + */ } OBIDMS_taxonomy_t, *OBIDMS_taxonomy_p; -OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, bool read_alternative_names); - -int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy); - -ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx); - -ecotx_t* obi_taxo_get_taxon_with_current_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid); -ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid); - -bool obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid); - -ecotx_t* obi_taxo_get_species(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy); - -ecotx_t* obi_taxo_get_genus(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy); - -ecotx_t* obi_taxo_get_family(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy); - -ecotx_t* obi_taxo_get_kingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy); - -ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy); - -int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name); - +/** + * @brief Function reading an NCBI taxdump and loading its information into a taxonomy structure. + * + * @param taxdump The path to the taxdump directory. + * + * @returns A pointer on the read taxonomy structure. + * @retval NULL if an error occurred. 
+ * + * @since 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump); + +/** + * @brief Function reading a binary taxonomy database (i.e. a set of .tdx, .ndx, .rdx, .adx, .ldx, .pdx files) + * and loading its information into a taxonomy structure. + * + * @param dms A pointer on the DMS to which the taxonomy belongs. + * @param taxonomy_name The name (prefix) of the taxonomy. + * @param read_alternative_names A boolean indicating whether names other than scientific and preferred names should be read. + * + * @returns A pointer on the read taxonomy structure. + * @retval NULL if an error occurred. + * + * @since 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, bool read_alternative_names); + + +/** + * @brief Function writing a binary taxonomy database (i.e. a set of .tdx, .ndx, .rdx, .adx, .ldx, .pdx files). + * + * @param dms A pointer on the DMS to which the taxonomy belongs. + * @param tax A pointer on the taxonomy structure. + * @param tax_name The name (prefix) of the taxonomy. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred. + * + * @since 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name); + + +/** + * @brief Function closing a taxonomy structure. + * + * This function writes all changes to the binary files (local taxa and preferred names) and free all allocated memory for the structure. + * + * @param taxonomy A pointer on the taxonomy structure. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred. 
+ * + * @since 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy); + + +/** + * @brief Function adding a local taxon to a taxonomy. + * + * @param tax A pointer on the taxonomy structure. + * @param name The taxon scientific name. + * @param rank_name The taxon rank name. + * @param parent_taxid The taxid of the parent node in the taxonomic tree. + * @param min_taxid The minimum taxid to give to the new taxon (the function will choose a new taxid >= min_taxid, >= MIN_LOCAL_TAXID and greater than the maximum taxid existing in the taxonomy). + * + * @returns The taxid given to the new taxon. + * @retval -1 if an error occurred. + * + * @since 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ int obi_taxo_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const char* rank_name, int32_t parent_taxid, int32_t min_taxid); + +/** + * @brief Function adding a preferred name to a taxon in a taxonomy, referred to by its taxid. + * + * @param tax A pointer on the taxonomy structure. + * @param taxid The taxid of the taxon that should have a new preferred name. + * @param preferred_name The new preferred name. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred. + * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ int obi_taxo_add_preferred_name_with_taxid(OBIDMS_taxonomy_p tax, int32_t taxid, const char* preferred_name); + +/** + * @brief Function adding a preferred name to a taxon in a taxonomy, referred to by the taxon pointer. + * + * @param tax A pointer on the taxonomy structure. + * @param taxon A pointer on the taxon that should have a new preferred name. + * @param preferred_name The new preferred name. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred.
+ * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ int obi_taxo_add_preferred_name_with_taxon(OBIDMS_taxonomy_p tax, ecotx_t* taxon, const char* preferred_name); +/** + * @brief Function returning the parent of a taxon at a given rank. + * + * @param taxon A pointer on the taxon. + * @param rankidx The index of the rank wanted. + * + * @returns A pointer on the parent taxon at the wanted rank. + * @retval NULL if no parent taxon was found at the wanted rank. + */ +ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx); + + +/** + * @brief Function returning a taxon given its taxid. + * + * @param taxonomy A pointer on the taxonomy. + * @param taxid The taxid of the taxon. + * + * @returns A pointer on the wanted taxon. + * @retval NULL if no taxon was found with the given taxid. + * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid); + + +/** + * @brief Function checking whether a taxon is under another in the taxonomy tree. + * + * @param taxon A pointer on the first taxon. + * @param other_taxid The taxid of the second taxon. + * + * @returns A boolean indicating whether the first taxon is under the second taxon in the taxonomy tree. + */ +bool obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid); + + +/** + * @brief Function returning the parent of a taxon at the species level. + * + * @param taxon A pointer on the taxon. + * @param taxonomy A pointer on the taxonomy structure. + * + * @returns A pointer on the parent taxon at the species level. + * @retval NULL if no parent taxon was found at the wanted rank. + */ +ecotx_t* obi_taxo_get_species(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy); + + +/** + * @brief Function returning the parent of a taxon at the genus level. + * + * @param taxon A pointer on the taxon. + * @param taxonomy A pointer on the taxonomy structure. 
+ * + * @returns A pointer on the parent taxon at the genus level. + * @retval NULL if no parent taxon was found at the wanted rank. + */ +ecotx_t* obi_taxo_get_genus(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy); + + +/** + * @brief Function returning the parent of a taxon at the family level. + * + * @param taxon A pointer on the taxon. + * @param taxonomy A pointer on the taxonomy structure. + * + * @returns A pointer on the parent taxon at the family level. + * @retval NULL if no parent taxon was found at the wanted rank. + */ +ecotx_t* obi_taxo_get_family(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy); + + +/** + * @brief Function returning the parent of a taxon at the kingdom level. + * + * @param taxon A pointer on the taxon. + * @param taxonomy A pointer on the taxonomy structure. + * + * @returns A pointer on the parent taxon at the kingdom level. + * @retval NULL if no parent taxon was found at the wanted rank. + */ +ecotx_t* obi_taxo_get_kingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy); + + +/** + * @brief Function returning the parent of a taxon at the superkingdom level. + * + * @param taxon A pointer on the taxon. + * @param taxonomy A pointer on the taxonomy structure. + * + * @returns A pointer on the parent taxon at the superkingdom level. + * @retval NULL if no parent taxon was found at the wanted rank. 
+ */ +ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy); From 651c1d7845075673d77bba5142fec964d41dcf38 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Tue, 31 Jan 2017 16:45:47 +0100 Subject: [PATCH 18/22] utilities: bsearch and qsort with additional user_data pointer argument --- src/utils.c | 190 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/utils.h | 37 ++++++++++ 2 files changed, 227 insertions(+) diff --git a/src/utils.c b/src/utils.c index f7a0ff5..37e5f0c 100644 --- a/src/utils.c +++ b/src/utils.c @@ -116,3 +116,193 @@ void* obi_get_memory_aligned_on_16(int size, int* shift) return (memory); } + +/* + * A generic implementation of binary search for the Linux kernel + * + * Copyright (C) 2008-2009 Ksplice, Inc. + * Author: Tim Abbott + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; version 2. + */ +/* + * Binary search of key in the num elements of size bytes starting at base, which must be sorted in the order defined by cmp. + * cmp receives the key, the element being examined and user_data, and returns a negative, zero or positive value. + * Returns a pointer on an element for which cmp returns 0, or NULL if there is none (always NULL when num is 0). + */ +void* bsearch_user_data(const void* key, const void* base, size_t num, size_t size, const void* user_data, + int (*cmp)(const void *key, const void *elt, const void* user_data)) +{ + size_t start = 0; + size_t end = num; + size_t mid; + int result; + const char* elt; + + while (start < end) + { + mid = start + (end - start) / 2; + /* Address the element through a char pointer: standard C does not define arithmetic on void pointers */ + elt = (const char*) base + mid * size; + result = cmp(key, elt, user_data); + if (result < 0) + end = mid; + else if (result > 0) + start = mid + 1; + else + return (void*) elt; + } + + return NULL; +} + + +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function". + */ + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) /* smaller of a and b; each argument may be evaluated twice */ + +#define swapcode(TYPE, parmi, parmj, n) { \ + long i = (n) / sizeof (TYPE); \ + register TYPE *pi = (TYPE *) (parmi); \ + register TYPE *pj = (TYPE *) (parmj); \ + do { \ + register TYPE t = *pi; \ + *pi++ = *pj; \ + *pj++ = t; \ + } while (--i > 0); \ +} + +#define SWAPINIT(a, es) swaptype = ((char *)a - (char *)0) % sizeof(long) || \ + es % sizeof(long) ? 2 : es == sizeof(long)? 
0 : 1; + +static __inline void +swapfunc(char *a, char *b, int n, int swaptype) +{ + if (swaptype <= 1) + swapcode(long, a, b, n) + else + swapcode(char, a, b, n) +} + +#define swap(a, b) \ + if (swaptype == 0) { \ + long t = *(long *)(a); \ + *(long *)(a) = *(long *)(b); \ + *(long *)(b) = t; \ + } else \ + swapfunc(a, b, es, swaptype) + +#define vecswap(a, b, n) if ((n) > 0) swapfunc(a, b, n, swaptype) + +static __inline char * +med3(char *a, char *b, char *c, const void *user_data, int (*cmp)(const void *, const void *, const void *)) +{ + return cmp(a, b, user_data) < 0 ? + (cmp(b, c, user_data) < 0 ? b : (cmp(a, c, user_data) < 0 ? c : a )) + :(cmp(b, c, user_data) > 0 ? b : (cmp(a, c, user_data) < 0 ? a : c )); +} + +void +qsort_user_data(void *aa, size_t n, size_t es, const void *user_data, int (*cmp)(const void *, const void *, const void *)) +{ + char *pa, *pb, *pc, *pd, *pl, *pm, *pn; + int d, r, swaptype, swap_cnt; + register char *a = aa; + +loop: SWAPINIT(a, es); + swap_cnt = 0; + if (n < 7) { + for (pm = (char *)a + es; pm < (char *) a + n * es; pm += es) + for (pl = pm; pl > (char *) a && cmp(pl - es, pl, user_data) > 0; + pl -= es) + swap(pl, pl - es); + return; + } + pm = (char *)a + (n / 2) * es; + if (n > 7) { + pl = (char *)a; + pn = (char *)a + (n - 1) * es; + if (n > 40) { + d = (n / 8) * es; + pl = med3(pl, pl + d, pl + 2 * d, user_data, cmp); + pm = med3(pm - d, pm, pm + d, user_data, cmp); + pn = med3(pn - 2 * d, pn - d, pn, user_data, cmp); + } + pm = med3(pl, pm, pn, user_data, cmp); + } + swap(a, pm); + pa = pb = (char *)a + es; + + pc = pd = (char *)a + (n - 1) * es; + for (;;) { + while (pb <= pc && (r = cmp(pb, a, user_data)) <= 0) { + if (r == 0) { + swap_cnt = 1; + swap(pa, pb); + pa += es; + } + pb += es; + } + while (pb <= pc && (r = cmp(pc, a, user_data)) >= 0) { + if (r == 0) { + swap_cnt = 1; + swap(pc, pd); + pd -= es; + } + pc -= es; + } + if (pb > pc) + break; + swap(pb, pc); + swap_cnt = 1; + pb += es; + pc -= es; + } + if 
(swap_cnt == 0) { /* Switch to insertion sort */ + for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es) + for (pl = pm; pl > (char *) a && cmp(pl - es, pl, user_data) > 0; + pl -= es) + swap(pl, pl - es); + return; + } + + pn = (char *)a + n * es; + r = MIN(pa - (char *)a, pb - pa); + vecswap(a, pb - r, r); + r = MIN((long)(pd - pc), (long)(pn - pd - es)); + vecswap(pb, pn - r, r); + if ((r = pb - pa) > (int)es) + qsort_user_data(a, r / es, es, user_data, cmp); + if ((r = pd - pc) > (int)es) { + /* Iterate rather than recurse to save stack space */ + a = pn - r; + n = r / es; + goto loop; + } +/* qsort(pn - r, r / es, es, cmp);*/ +} + diff --git a/src/utils.h b/src/utils.h index 8ac9a6c..ecab62e 100644 --- a/src/utils.h +++ b/src/utils.h @@ -74,4 +74,41 @@ char* obi_format_date(time_t date); void* obi_get_memory_aligned_on_16(int size, int* shift); +/** + * @brief Version of binary search modified to allow the user to provide an + * additional pointer sent to the comparison function. + * + * @param key This is the pointer to the object that serves as key for the search, type-casted as a void*. + * @param base This is the pointer to the first object of the array where the search is performed, type-casted as a void* (the array must be sorted in the order defined by cmp). + * @param num This is the number of elements in the array pointed by base. + * @param size This is the size in bytes of each element in the array. + * @param user_data This is an additional pointer passed to the comparison function. + * @param cmp This is the function that compares two elements, eventually with an additional pointer. + * + * @returns A pointer to an entry in the array that matches the search key. + * @retval NULL if key is not found. 
+ * + * @since January 2017 + * @author original code modified by Celine Mercier (celine.mercier@metabarcoding.org) + */ +void* bsearch_user_data(const void* key, const void* base, size_t num, size_t size, const void* user_data, + int (*cmp)(const void *key, const void *elt, const void* user_data)); + + +/** + * @brief Version of quick sort modified to allow the user to provide an + * additional pointer sent to the comparison function. + * + * @param aa This is the pointer to the first element of the array to be sorted. + * @param n This is the number of elements in the array pointed by base. + * @param es This is the size in bytes of each element in the array. + * @param user_data This is an additional pointer passed to the comparison function. + * @param cmp This is the function that compares two elements, eventually with an additional pointer. + * + * @since January 2017 + * @author original code modified by Celine Mercier (celine.mercier@metabarcoding.org) + */ +void qsort_user_data(void *aa, size_t n, size_t es, const void *user_data, int (*cmp)(const void *, const void *, const void *)); + + #endif /* UTILS_H_ */ From e50da64ea19629ea7cf150f051305805ec0b2da9 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Tue, 31 Jan 2017 16:48:06 +0100 Subject: [PATCH 19/22] The elements names when a column contains several elements per line are now formatted with '\0' as separator and handled in a more optimized way --- python/obitools3/commands/test.pyx | 8 +- python/obitools3/obidms/_obidms.pyx | 14 +- python/obitools3/obidms/capi/obidmscolumn.pxd | 29 +- src/obidmscolumn.c | 376 +++++++++++++++--- src/obidmscolumn.h | 128 +++--- src/obiview.c | 8 +- 6 files changed, 411 insertions(+), 152 deletions(-) diff --git a/python/obitools3/commands/test.pyx b/python/obitools3/commands/test.pyx index 774494a..6f001ce 100644 --- a/python/obitools3/commands/test.pyx +++ b/python/obitools3/commands/test.pyx @@ -97,8 +97,7 @@ def test_set_and_get(config, infos): return idx = 
random_int(config) value = infos['random_generator'][data_type](config) - - if len(element_names) > 1 : + if col.nb_elements_per_line > 1 : elt = random.choice(element_names) col[idx][elt] = value assert col[idx][elt] == value, "Set value != gotten value "+str(col[idx][elt])+" != "+str(value) @@ -187,6 +186,7 @@ def create_random_column(config, infos) : elements_names = [] for i in range(nb_elements_per_line) : elements_names.append(random_unique_element_name(config, infos)) + elements_names = random.choice([None, elements_names]) name = random_unique_name(infos) infos['view'].add_column(name, alias=alias, @@ -358,7 +358,9 @@ def run(config): config['test']['elt_name_max_len'] = int((COL_COMMENTS_MAX_LEN - config['test']['maxelts']) / config['test']['maxelts']) print("Initializing the DMS and the first view...") - + + shutil.rmtree(config['obi']['defaultdms']+'.obidms', ignore_errors=True) + ini_dms_and_first_view(config, infos) print_test(config, repr(infos['view'])) diff --git a/python/obitools3/obidms/_obidms.pyx b/python/obitools3/obidms/_obidms.pyx index 7f86c59..943b3e2 100644 --- a/python/obitools3/obidms/_obidms.pyx +++ b/python/obitools3/obidms/_obidms.pyx @@ -7,7 +7,8 @@ from .capi.obidms cimport obi_dms, \ from .capi.obidmscolumn cimport obi_close_column, \ OBIDMS_column_p, \ - OBIDMS_column_header_p + OBIDMS_column_header_p, \ + obi_get_elements_names from .capi.obiutils cimport obi_format_date @@ -75,7 +76,7 @@ from .capi.obiview cimport Obiview_p, \ DEFINITION_COLUMN, \ QUALITY_COLUMN -from libc.stdlib cimport malloc +from libc.stdlib cimport malloc, free cdef class OBIDMS_column : @@ -138,7 +139,12 @@ cdef class OBIDMS_column : # elements_names property getter @property def elements_names(self): - return (bytes2str(((self._pointer)[0].header).elements_names)).split(';') + cdef char* elts_names_b + cdef str elts_names + elts_names_b = obi_get_elements_names((self._pointer)[0]) + elts_names = bytes2str(elts_names_b) + free(elts_names_b) + return 
elts_names.split(';') # nb_elements_per_line property getter @property @@ -376,7 +382,7 @@ cdef class OBIView : elements_names_b = str2bytes("") else : elements_names_b = str2bytes(';'.join(elements_names)) - + if type : # TODO make C function that does that if type == 'OBI_INT' : data_type = OBI_INT diff --git a/python/obitools3/obidms/capi/obidmscolumn.pxd b/python/obitools3/obidms/capi/obidmscolumn.pxd index b23b85f..9589421 100644 --- a/python/obitools3/obidms/capi/obidmscolumn.pxd +++ b/python/obitools3/obidms/capi/obidmscolumn.pxd @@ -47,31 +47,8 @@ cdef extern from "obidmscolumn.h" nogil: bint writable ctypedef OBIDMS_column_t* OBIDMS_column_p - - OBIDMS_column_p obi_create_column(OBIDMS_p dms, - const_char_p column_name, - OBIType_t type, - index_t nb_lines, - index_t nb_elements_per_line, - const_char_p elements_names, - const_char_p indexer_name, - const_char_p associated_colum_name, - obiversion_t associated_colum_version, - const_char_p comments) - - OBIDMS_column_p obi_open_column(OBIDMS_p dms, - const_char_p column_name, - obiversion_t version_number) int obi_close_column(OBIDMS_column_p column) - - OBIDMS_column_p obi_clone_column(OBIDMS_p dms, - OBIDMS_column_p line_selection, - const_char_p column_name, - obiversion_t version_number, - bint clone_data) - - int obi_close_column(OBIDMS_column_p column) obiversion_t obi_column_get_latest_version_from_name(OBIDMS_p dms, const_char_p column_name) @@ -81,9 +58,9 @@ cdef extern from "obidmscolumn.h" nogil: obiversion_t version_number) int obi_close_header(OBIDMS_column_header_p header) - - int obi_select(OBIDMS_column_p line_selection_column, index_t line_to_grep) - + + char* obi_get_elements_names(OBIDMS_column_p column) + cdef extern from "obidmscolumn_int.h" nogil: diff --git a/src/obidmscolumn.c b/src/obidmscolumn.c index a704ab1..00fbae1 100644 --- a/src/obidmscolumn.c +++ b/src/obidmscolumn.c @@ -119,7 +119,7 @@ static obiversion_t create_version_file(OBIDMS_column_directory_p column_directo /** * 
@brief Internal function building the default elements names of the lines of a - * column (i.e. "0;1;2;...;n"). + * column, with ';' as separator (i.e. "0;1;2;...;n\0"). * * @warning The returned pointer has to be freed by the caller. * @@ -134,12 +134,61 @@ static obiversion_t create_version_file(OBIDMS_column_directory_p column_directo static char* build_default_elements_names(index_t nb_elements_per_line); +/** + * @brief Internal function formatting the elements names of the lines of a + * column with '\0' as separator (e.g. "0\01\02\0...\0n\0"). + * + * @param elements_names The character string formatted with ';' as separator (e.g. "0;1;2;...;n\0"). + * @param elts_names_length A pointer on an integer where the function will store the length of the character string. + * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static void format_elements_names(char* elements_names, int* elts_names_length); + + +/** + * @brief Internal function comparing two element names using their sorted index, using data stored in the column header. + * + * @param n1_sort_idx A pointer on the sorted index of the first name. + * @param n2_sort_idx A pointer on the sorted index of the second name. + * @param h A pointer on the column header. + * + * @returns A value < 0 if name1 < name2, + * a value > 0 if name1 > name2, + * and 0 if name1 == name2. + * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int cmp_elements_names_with_idx(const void* n1_sort_idx, const void* n2_sort_idx, const void* h); + + +/** + * @brief Internal function comparing two element names using a pointer on the first name and the sorted index of the second name, + * using data stored in the column header. + * + * @param name1 A pointer on the first name. + * @param n2_sort_idx A pointer on the sorted index of the second name. + * @param h A pointer on the column header. 
+ * + * @returns A value < 0 if name1 < name2, + * a value > 0 if name1 > name2, + * and 0 if name1 == name2. + * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int cmp_elements_names_with_name_and_idx(const void* name1, const void* n2_sort_idx, const void* h); + + /** * @brief Internal function setting the elements names of the lines of a * column in the header of the OBIDMS column structure. * * @param column A pointer as returned by obi_create_column(). - * @param elements_names The names of the elements with ';' as separator. + * @param elements_names The names of the elements as formatted by format_elements_names(). + * @param elts_names_length The length of elements_names. * * @retval 0 if the operation was successfully completed. * @retval -1 if an error occurred. @@ -147,7 +196,35 @@ static char* build_default_elements_names(index_t nb_elements_per_line); * @since July 2015 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -static int obi_column_set_elements_names(OBIDMS_column_p column, char* elements_names); +static int set_elements_names(OBIDMS_column_p column, char* elements_names, int elts_names_length); + + +/** + * @brief Internal function counting the number of elements names in a character array. + * + * @param elements_names A pointer on the character string corresponding to the elements names, + * formatted with ';' or with '\0' as separator. + * @param elt_names_formatted Whether the separator is ';' (false), or '\0' (true, as formatted by format_elements_names()). + * + * @returns The number of elements names in the character array. + * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static index_t check_elt_names_count(const char* elements_names, bool elt_names_formatted); + + +/** + * @brief Internal function computing the length of a character array containing elements names as formatted by format_elements_names(). 
+ * + * @param elements_names A pointer on the character string corresponding to the elements names as formatted by format_elements_names(). + * + * @returns The length of a character array. + * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int get_formatted_elt_names_length(const char* elements_names); /** @@ -198,6 +275,7 @@ static char* build_column_file_name(const char* column_name, obiversion_t versio } + static char* build_version_file_name(const char* column_name) { char* file_name; @@ -222,6 +300,7 @@ static char* build_version_file_name(const char* column_name) } + static obiversion_t obi_get_new_version_number(OBIDMS_column_directory_p column_directory, bool block) { off_t loc_size; @@ -346,6 +425,7 @@ static obiversion_t obi_get_new_version_number(OBIDMS_column_directory_p column_ } + static obiversion_t create_version_file(OBIDMS_column_directory_p column_directory) { off_t loc_size; @@ -437,10 +517,12 @@ static obiversion_t create_version_file(OBIDMS_column_directory_p column_directo } + static char* build_default_elements_names(index_t nb_elements_per_line) { char* elements_names; int i; + int len; elements_names = (char*) malloc(ELEMENTS_NAMES_MAX * sizeof(char)); if (elements_names == NULL) @@ -457,31 +539,169 @@ static char* build_default_elements_names(index_t nb_elements_per_line) return NULL; } - for (i= 0; i < nb_elements_per_line; i++) - sprintf(elements_names, "%d", i); + len = 0; + for (i = 0; i < nb_elements_per_line; i++) + len += sprintf(elements_names+len, "%d;", i); // Terminal character - elements_names[strlen(elements_names)] = '\0'; + elements_names[len-1] = '\0'; // -1 to delete last ';' + len--; return elements_names; } -int obi_column_set_elements_names(OBIDMS_column_p column, char* elements_names) + +static void format_elements_names(char* elements_names, int* elts_names_length) { - if (strlen(elements_names) > ELEMENTS_NAMES_MAX) + int i; + + *elts_names_length = 
strlen(elements_names); + + // Replace the ';' with '\0' + for (i=0; i < *elts_names_length; i++) + { + if (elements_names[i] == ';') + elements_names[i] = '\0'; + } +} + + + +static int cmp_elements_names_with_idx(const void* n1_sort_idx, const void* n2_sort_idx, const void* h) +{ + char* name1=NULL; + char* name2=NULL; + + int name1_idx; + int name2_idx; + + int name1_sort_idx = *((int*)n1_sort_idx); + int name2_sort_idx = *((int*)n2_sort_idx); + OBIDMS_column_header_p header = (OBIDMS_column_header_p) h; + + name1_idx = (header->elements_names_idx)[name1_sort_idx]; + name1 = (header->elements_names)+name1_idx; + + name2_idx = (header->elements_names_idx)[name2_sort_idx]; + name2 = (header->elements_names)+name2_idx; + + return strcmp(name1, name2); +} + + + +static int cmp_elements_names_with_name_and_idx(const void* name1, const void* n2_sort_idx, const void* h) +{ + char* name2=NULL; + int name2_idx; + + int name2_sort_idx = *((int*)n2_sort_idx); + OBIDMS_column_header_p header = (OBIDMS_column_header_p) h; + + name2_idx = (header->elements_names_idx)[name2_sort_idx]; + name2 = (header->elements_names)+name2_idx; + + return strcmp(name1, name2); +} + + + +static int set_elements_names(OBIDMS_column_p column, char* elements_names, int elts_names_length) +{ + int i, j; + + // Check that the elements names are not too long + if (elts_names_length+2 > ELEMENTS_NAMES_MAX) { obi_set_errno(OBICOL_UNKNOWN_ERROR); obidebug(1, "\nError: element names too long (max: %d)", ELEMENTS_NAMES_MAX); return -1; } - strcpy((column->header)->elements_names, elements_names); + // Copy the elements names in the header + memcpy((column->header)->elements_names, elements_names, elts_names_length*sizeof(char)); + + // Terminal characters + (column->header)->elements_names[elts_names_length] = '\0'; + (column->header)->elements_names[elts_names_length + 1] = '\0'; + + // Store the length of the character array containing the elements names + (column->header)->elements_names_length = 
elts_names_length; + + // Build the elements names index + i = 0; + j = 0; + // Index the first element name + ((column->header)->elements_names_idx)[j] = i; + ((column->header)->sorted_elements_idx)[j] = j; + i++; + j++; + while (i < elts_names_length) + { + if (elements_names[i] == '\0') + { // Index new element name + ((column->header)->elements_names_idx)[j] = i+1; + ((column->header)->sorted_elements_idx)[j] = j; + j++; + } + i++; + } + + // Build the sorted index + qsort_user_data((column->header)->sorted_elements_idx, j, sizeof(int), column->header, cmp_elements_names_with_idx); + return 0; } -index_t get_line_count_per_page(OBIType_t data_type, index_t nb_elements_per_line) + +static index_t check_elt_names_count(const char* elements_names, bool elt_names_formatted) +{ + char sep; + int i = 0; + bool stop = false; + index_t count = 0; + + if (elt_names_formatted) + sep = FORMATTED_ELT_NAMES_SEPARATOR; + else + sep = NOT_FORMATTED_ELT_NAMES_SEPARATOR; + + while (! stop) + { + if ((elt_names_formatted && (elements_names[i] == '\0') && (elements_names[i+1] == '\0')) || + ((! elt_names_formatted) && (elements_names[i] == '\0'))) + stop = true; + if ((elements_names[i] == sep) || (elements_names[i] == '\0')) + count++; + i++; + } + + return count; +} + + + +static int get_formatted_elt_names_length(const char* elements_names) +{ + int i = 0; + bool stop = false; + + while (! 
stop) + { + if ((elements_names[i] == '\0') && (elements_names[i+1] == '\0')) + stop = true; + else + i++; + } + + return i; +} + + + +static index_t get_line_count_per_page(OBIType_t data_type, index_t nb_elements_per_line) { return getpagesize() / (obi_sizeof(data_type) * nb_elements_per_line); } @@ -493,6 +713,7 @@ index_t get_line_count_per_page(OBIType_t data_type, index_t nb_elements_per_lin * **********************************************************************/ + obiversion_t obi_get_latest_version_number(OBIDMS_column_directory_p column_directory) { off_t loc_size; @@ -557,6 +778,7 @@ obiversion_t obi_get_latest_version_number(OBIDMS_column_directory_p column_dire } + obiversion_t obi_column_get_latest_version_from_name(OBIDMS_p dms, const char* column_name) { OBIDMS_column_directory_p column_directory; @@ -582,6 +804,7 @@ obiversion_t obi_column_get_latest_version_from_name(OBIDMS_p dms, const char* c } + size_t obi_get_platform_header_size() { size_t header_size; @@ -607,7 +830,8 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms, const char* indexer_name, const char* associated_column_name, obiversion_t associated_column_version, - const char* comments + const char* comments, + bool elt_names_formatted ) { OBIDMS_column_p new_column; @@ -623,6 +847,8 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms, OBIType_t returned_data_type; OBIType_t stored_data_type; char* final_indexer_name; + char* built_elements_names = NULL; + int elts_names_length; new_column = NULL; @@ -695,31 +921,29 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms, else if (nb_lines < minimum_line_count) nb_lines = minimum_line_count; - // Check and build if needed the element names - if ((elements_names == NULL) || (strcmp(elements_names, "") == 0)) // Build the default element names: str of the element index + // Check, format, and build if needed the element names + if ((elements_names == NULL) || (*elements_names == '\0')) // Build the default element names: str of the element index 
{ - elements_names = build_default_elements_names(nb_elements_per_line); - if (elements_names == NULL) + built_elements_names = build_default_elements_names(nb_elements_per_line); + if (built_elements_names == NULL) return NULL; + elements_names = built_elements_names; } - else if (((elements_names == NULL) || (strcmp(elements_names, "") != 0)) && (nb_elements_per_line > 1)) + else { // The number of elements names should be equal to the number of elements per line - char* token; - index_t n = 0; - token = strdup(elements_names); - token = strtok(token, ";"); - while (token != NULL) + if (check_elt_names_count(elements_names, elt_names_formatted) != nb_elements_per_line) { - token = strtok(NULL, ";"); - n++; - } - if (n != nb_elements_per_line) - { - obidebug(1, "\nCan't create column because the number of elements names given is not equal to the number of elements per line"); + obidebug(1, "\nCan't create column because the number of elements names given is not equal to the number of elements per line:" + "\n%lld elements per line\nelements names:%s\n", nb_elements_per_line, elements_names); return NULL; } } - // TODO what if 1 element and name specified? doc + + // Format the elements names string + if (! 
elt_names_formatted) + format_elements_names(elements_names, &elts_names_length); + else + elts_names_length = get_formatted_elt_names_length(elements_names); // Calculate the size needed header_size = obi_get_platform_header_size(); @@ -816,11 +1040,11 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms, header->version = version_number; header->cloned_from = -1; - obi_column_set_elements_names(new_column, elements_names); + set_elements_names(new_column, elements_names, elts_names_length); // Free the element names if they were built - if ((elements_names == NULL) || (strcmp(elements_names, "") == 0)) - free(elements_names); + if (built_elements_names != NULL) + free(built_elements_names); strncpy(header->name, column_name, OBIDMS_COLUMN_MAX_NAME); @@ -886,6 +1110,7 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms, } + OBIDMS_column_p obi_open_column(OBIDMS_p dms, const char* column_name, obiversion_t version_number) @@ -1043,6 +1268,7 @@ OBIDMS_column_p obi_open_column(OBIDMS_p dms, } + OBIDMS_column_p obi_clone_column(OBIDMS_p dms, OBIDMS_column_p line_selection, const char* column_name, @@ -1083,7 +1309,8 @@ OBIDMS_column_p obi_clone_column(OBIDMS_p dms, (column_to_clone->header)->indexer_name, ((column_to_clone->header)->associated_column).column_name, ((column_to_clone->header)->associated_column).version, - (column_to_clone->header)->comments + (column_to_clone->header)->comments, + true ); if (new_column == NULL) @@ -1097,6 +1324,8 @@ OBIDMS_column_p obi_clone_column(OBIDMS_p dms, return NULL; } + + (new_column->header)->cloned_from = (column_to_clone->header)->version; if (clone_data && (line_selection == NULL)) @@ -1137,6 +1366,7 @@ OBIDMS_column_p obi_clone_column(OBIDMS_p dms, } + int obi_close_column(OBIDMS_column_p column) { int ret_val = 0; @@ -1185,6 +1415,7 @@ int obi_close_column(OBIDMS_column_p column) } + int obi_clone_column_indexer(OBIDMS_column_p column) { char* new_indexer_name; @@ -1208,6 +1439,7 @@ int 
obi_clone_column_indexer(OBIDMS_column_p column) } + int obi_truncate_column(OBIDMS_column_p column) // TODO is it necessary to unmap/remap? { size_t file_size; @@ -1309,6 +1541,7 @@ int obi_truncate_column(OBIDMS_column_p column) // TODO is it necessary to unmap } + int obi_enlarge_column(OBIDMS_column_p column) { size_t file_size; @@ -1363,7 +1596,7 @@ int obi_enlarge_column(OBIDMS_column_p column) header_size = (column->header)->header_size; file_size = header_size + new_data_size; - // Enlarge the file // TODO isn't it possible that this makes the file "move"? + // Enlarge the file if (ftruncate(column_file_descriptor, file_size) < 0) { obi_set_errno(OBICOL_UNKNOWN_ERROR); @@ -1414,6 +1647,7 @@ int obi_enlarge_column(OBIDMS_column_p column) } + void obi_ini_to_NA_values(OBIDMS_column_p column, index_t first_line_nb, index_t nb_lines) @@ -1479,6 +1713,7 @@ void obi_ini_to_NA_values(OBIDMS_column_p column, } + OBIDMS_column_header_p obi_column_get_header_from_name(OBIDMS_p dms, const char* column_name, obiversion_t version_number) { OBIDMS_column_header_p header; @@ -1562,6 +1797,7 @@ OBIDMS_column_header_p obi_column_get_header_from_name(OBIDMS_p dms, const char* } + int obi_close_header(OBIDMS_column_header_p header) { if (munmap(header, header->header_size) < 0) @@ -1574,47 +1810,56 @@ int obi_close_header(OBIDMS_column_header_p header) } -// TODO to be rewritten in an optimized and safe way if possible + index_t obi_column_get_element_index_from_name(OBIDMS_column_p column, const char* element_name) { - char* elements_names; - char* name; - index_t element_index; + int* elt_names_idx; - elements_names = strdup((column->header)->elements_names); - if (elements_names == NULL) - { - obidebug(1, "\nError strdup-ing the elements names"); - return OBIIdx_NA; - } + elt_names_idx = bsearch_user_data(element_name, (column->header)->sorted_elements_idx, (column->header)->nb_elements_per_line, sizeof(int), column->header, cmp_elements_names_with_name_and_idx); - 
element_index = 0; + if (elt_names_idx != NULL) + return (index_t)(*elt_names_idx); - name = strtok(elements_names, ";"); // TODO not thread safe, see strtok_r maybe - if (strcmp(element_name, name) == 0) - { - free(elements_names); - return element_index; - } - element_index++; - - while (name != NULL) - { - name = strtok(NULL, ";"); // TODO not thread safe, see strtok_r maybe - if (strcmp(element_name, name) == 0) - { - free(elements_names); - return element_index; - } - element_index++; - } - - obidebug(1, "\nCan't find an element name"); - free(elements_names); + obi_set_errno(OBICOL_UNKNOWN_ERROR); + obidebug(1, "\nError: could not find element name %s", element_name); return OBIIdx_NA; } +// TODO doc, returns elements names with ; as separator (discuss maybe char**) +char* obi_get_elements_names(OBIDMS_column_p column) +{ + char* elements_names; + int i, j; + int elt_idx; + int len; + + elements_names = (char*) malloc(ELEMENTS_NAMES_MAX * sizeof(char)); + if (elements_names == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for elements names"); + return NULL; + } + + j = 0; + for (i=0; i < (column->header)->nb_elements_per_line; i++) + { + elt_idx = ((column->header)->elements_names_idx)[i]; + len = strlen(((column->header)->elements_names)+elt_idx); + memcpy(elements_names+j, ((column->header)->elements_names)+elt_idx, len*sizeof(char)); + j = j + len; + elements_names[j] = ';'; + j++; + } + + elements_names[j - 1] = '\0'; + + return elements_names; +} + + + int obi_column_prepare_to_set_value(OBIDMS_column_p column, index_t line_nb) { // Check if the column is read-only @@ -1649,6 +1894,7 @@ int obi_column_prepare_to_set_value(OBIDMS_column_p column, index_t line_nb) } + int obi_column_prepare_to_get_value(OBIDMS_column_p column, index_t line_nb) { if ((line_nb+1) > ((column->header)->line_count)) diff --git a/src/obidmscolumn.h b/src/obidmscolumn.h index 9e5e348..cef1f59 100644 --- a/src/obidmscolumn.h +++ 
b/src/obidmscolumn.h @@ -28,17 +28,21 @@ #include "obiblob_indexer.h" -#define ELEMENTS_NAMES_MAX (2048) /**< The maximum length of the list of elements names. // TODO Discuss - */ -#define NB_ELTS_MAX_IF_DEFAULT_NAME (539) /**< The maximum number of elements per line if the default element names - * are used ("0;1;2;...;n"), considering ELEMENTS_NAMES_MAX. - */ -#define COLUMN_GROWTH_FACTOR (2) /**< The growth factor when a column is enlarged. - */ -#define MAXIMUM_LINE_COUNT (1000000000) /**< The maximum line count for the data of a column. //TODO - */ -#define COMMENTS_MAX_LENGTH (2048) /**< The maximum length for comments. - */ +#define ELEMENTS_NAMES_MAX (2048) /**< The maximum length of the list of elements names. // TODO Discuss + */ +#define NB_ELTS_MAX_IF_DEFAULT_NAME (539) /**< The maximum number of elements per line if the default element names + * are used ("0\01\02\0...\0n"), considering ELEMENTS_NAMES_MAX. + */ +#define COLUMN_GROWTH_FACTOR (2) /**< The growth factor when a column is enlarged. + */ +#define MAXIMUM_LINE_COUNT (1000000000) /**< The maximum line count for the data of a column. //TODO + */ +#define COMMENTS_MAX_LENGTH (2048) /**< The maximum length for comments. + */ +#define FORMATTED_ELT_NAMES_SEPARATOR '\0' /**< The separator between elements names once formatted by format_elements_names(). + */ +#define NOT_FORMATTED_ELT_NAMES_SEPARATOR ';' /**< The separator between elements names before formatting (as passed by the caller). + */ /** @@ -56,42 +60,48 @@ typedef struct Column_reference { * @brief OBIDMS column header structure. */ typedef struct OBIDMS_column_header { - size_t header_size; /**< Size of the header in bytes. - */ - size_t data_size; /**< Size of the data in bytes. - */ - index_t line_count; /**< Number of lines of data allocated. - */ - index_t lines_used; /**< Number of lines of data used. - */ - index_t nb_elements_per_line; /**< Number of elements per line. 
- */ - char elements_names[ELEMENTS_NAMES_MAX+1]; /**< Names of the line elements with ';' as separator - * (no terminal ';'). 
- * (default are the indices: "0;1;2;...;n"). - */ - OBIType_t returned_data_type; /**< Type of the data that is returned when getting an - * element from the column. - */ - OBIType_t stored_data_type; /**< Type of the data that is actually stored in the data - * part of the column. - */ - time_t creation_date; /**< Date of creation of the file. - */ - obiversion_t version; /**< Version of the column. - */ - obiversion_t cloned_from; /**< Version of the column from which this column - * was cloned from (-1 if it was not created by cloning - * another column). - */ - char name[OBIDMS_COLUMN_MAX_NAME+1]; /**< The column name as a NULL terminated string. - */ - char indexer_name[INDEXER_MAX_NAME+1]; /**< If there is one, the indexer name as a NULL terminated string. - */ - Column_reference_t associated_column; /**< If there is one, the reference to the associated column. - */ - char comments[COMMENTS_MAX_LENGTH+1]; /**< Comments stored as a classical zero end C string. - */ + size_t header_size; /**< Size of the header in bytes. + */ + size_t data_size; /**< Size of the data in bytes. + */ + index_t line_count; /**< Number of lines of data allocated. + */ + index_t lines_used; /**< Number of lines of data used. + */ + index_t nb_elements_per_line; /**< Number of elements per line. + */ + char elements_names[ELEMENTS_NAMES_MAX+1]; /**< Names of the line elements with '\0' as separator + * and '\0\0' as terminal flag. + * (default are the indices: "0\01\02\0...\0n\0\0"). + */ + int elements_names_length; /**< Length of the character array where the elements names are stored. + */ + int elements_names_idx[NB_ELTS_MAX_IF_DEFAULT_NAME]; /**< Index for the start of each element name in elements_names. + */ + int sorted_elements_idx[NB_ELTS_MAX_IF_DEFAULT_NAME]; /**< Index for the sorted element names in elements_names_idx. + */ + OBIType_t returned_data_type; /**< Type of the data that is returned when getting an + * element from the column. 
+ */ + OBIType_t stored_data_type; /**< Type of the data that is actually stored in the data + * part of the column. + */ + time_t creation_date; /**< Date of creation of the file. + */ + obiversion_t version; /**< Version of the column. + */ + obiversion_t cloned_from; /**< Version of the column from which this column + * was cloned from (-1 if it was not created by cloning + * another column). + */ + char name[OBIDMS_COLUMN_MAX_NAME+1]; /**< The column name as a NULL terminated string. + */ + char indexer_name[INDEXER_MAX_NAME+1]; /**< If there is one, the indexer name as a NULL terminated string. + */ + Column_reference_t associated_column; /**< If there is one, the reference to the associated column. + */ + char comments[COMMENTS_MAX_LENGTH+1]; /**< Comments stored as a classical zero end C string. + */ } OBIDMS_column_header_t, *OBIDMS_column_header_p; @@ -184,12 +194,13 @@ size_t obi_get_platform_header_size(); * @param nb_lines The number of lines to be stored. * @param nb_elements_per_line The number of elements per line. // TODO talk about default values * @param elements_names The names of the elements with ';' as separator (no terminal ';'), - * NULL or "" if the default names are to be used ("0;1;2;...;n"). + * NULL or "" if the default names are to be used ("0\01\02\0...\0n"). * @param indexer_name The name of the indexer if there is one associated with the column. * If NULL or "", the indexer name is set as the column name. * @param associated_column_name The name of the associated column if there is one. * @param associated_column_version The version of the associated column if there is one. * @param comments Optional comments associated with the column. + * @param elt_names_formatted Whether the separator for the elements names is ';' (false), or '\0' (true, as formatted by format_elements_names()). * * @returns A pointer on the newly created column structure. * @retval NULL if an error occurred. 
@@ -206,7 +217,8 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms, const char* indexer_name, const char* associated_column_name, obiversion_t associated_column_version, - const char* comments + const char* comments, + bool elt_names_formatted ); @@ -353,7 +365,7 @@ int obi_close_header(OBIDMS_column_header_p header); * @param element_name The name of the element. * * @returns The index of the element in a line of the column. - * @retval OBIIdx_NA if an error occurred. // TODO not sure if this is "clean". + * @retval OBIIdx_NA if an error occurred. * * @since July 2015 * @author Celine Mercier (celine.mercier@metabarcoding.org) @@ -361,6 +373,22 @@ int obi_close_header(OBIDMS_column_header_p header); index_t obi_column_get_element_index_from_name(OBIDMS_column_p column, const char* element_name); +/** + * @brief Recovers the elements names of the lines of a column, with ';' as separator (i.e. "0;1;2;...;n\0"). + * + * @warning The returned pointer has to be freed by the caller. + * + * @param column A pointer on an OBIDMS column. + * + * @returns A pointer on a character array where the elements names are stored. + * @retval NULL if an error occurred. + * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +char* obi_get_elements_names(OBIDMS_column_p column); + + /** * @brief Prepares a column to set a value. 
* diff --git a/src/obiview.c b/src/obiview.c index e371ba8..a9b0a86 100644 --- a/src/obiview.c +++ b/src/obiview.c @@ -445,8 +445,8 @@ static char* build_obiview_file_name(const char* view_name) bool view_exists(OBIDMS_p dms, const char* view_name) { - struct dirent* dp; - char* file_name; + struct dirent* dp; + char* file_name; // Create file name file_name = build_obiview_file_name(view_name); @@ -1236,7 +1236,7 @@ Obiview_p obi_new_view(OBIDMS_p dms, const char* view_name, Obiview_p view_to_cl // If there is a new line selection, build it by combining it with the one from the view to clone if there is one else if (line_selection != NULL) { - view->line_selection = obi_create_column(view->dms, LINES_COLUMN_NAME, OBI_IDX, 0, 1, NULL, NULL, NULL, -1, NULL); + view->line_selection = obi_create_column(view->dms, LINES_COLUMN_NAME, OBI_IDX, 0, 1, NULL, NULL, NULL, -1, NULL, false); if ((view->line_selection) == NULL) { obidebug(1, "\nError creating a column corresponding to a line selection"); @@ -1792,7 +1792,7 @@ int obi_view_add_column(Obiview_p view, // Open or create the column if (create) { // Create column - column = obi_create_column(view->dms, column_name, data_type, nb_lines, nb_elements_per_line, elements_names, indexer_name, associated_column_name, associated_column_version, comments); + column = obi_create_column(view->dms, column_name, data_type, nb_lines, nb_elements_per_line, elements_names, indexer_name, associated_column_name, associated_column_version, comments, false); if (column == NULL) { obidebug(1, "\nError creating a column to add to a view"); From 7e9932f488d2faddc8d3e81d99d58f743fef9cfc Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Tue, 7 Feb 2017 17:12:56 +0100 Subject: [PATCH 20/22] Fixed a C function declaration --- python/obitools3/obidms/capi/obialign.pxd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/obitools3/obidms/capi/obialign.pxd b/python/obitools3/obidms/capi/obialign.pxd index e76cabe..c2280b9 
100644 --- a/python/obitools3/obidms/capi/obialign.pxd +++ b/python/obitools3/obidms/capi/obialign.pxd @@ -18,7 +18,8 @@ cdef extern from "obi_align.h" nogil: double threshold, bint normalize, int reference, - bint similarity_mode) + bint similarity_mode, + int thread_count) int obi_lcs_align_two_columns(OBIDMS_p dms, From a9102620f5dc219b8363c684ffa5db22787b215a Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Tue, 7 Feb 2017 17:14:10 +0100 Subject: [PATCH 21/22] Fixed missing email address --- src/obi_align.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/obi_align.h b/src/obi_align.h index 98da4da..059c528 100644 --- a/src/obi_align.h +++ b/src/obi_align.h @@ -4,7 +4,7 @@ /** * @file obi_align.h - * @author Celine Mercier + * @author Celine Mercier (celine.mercier@metabarcoding.org) * @date May 11th 2016 * @brief Header file for the functions handling the LCS alignment of DNA sequences. */ @@ -77,7 +77,7 @@ * the length of the Longest Common Subsequence. If the score is not normalized and expressed in distance, * it is (reference length - LCS length). Only sequence pairs with a similarity above the threshold are printed. * @param normalize Whether the score should be normalized with the reference sequence length. - * @param reference The reference length. 0: The alignement length; 1: The longest sequence's length; 2: The shortest sequence's length. + * @param reference The reference length. 0: The alignment length; 1: The longest sequence's length; 2: The shortest sequence's length. * @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false). * * @returns A value indicating the success of the operation. From e524041013b4c887f8db642d698a7fcbae9ec356 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Tue, 7 Feb 2017 17:16:09 +0100 Subject: [PATCH 22/22] Views: Files for unfinished views now have the extension '.obiview_unfinished', renamed to '.obiview' when the view is finished. 
--- python/obitools3/obidms/_obidms.pyx | 2 +- python/obitools3/obidms/capi/obiview.pxd | 8 +- src/obi_align.c | 16 +- src/obiview.c | 533 ++++++++++++++++------- src/obiview.h | 39 +- 5 files changed, 384 insertions(+), 214 deletions(-) diff --git a/python/obitools3/obidms/_obidms.pyx b/python/obitools3/obidms/_obidms.pyx index 943b3e2..51a0d5a 100644 --- a/python/obitools3/obidms/_obidms.pyx +++ b/python/obitools3/obidms/_obidms.pyx @@ -690,7 +690,7 @@ cdef class OBIDMS : cdef int i, j cdef str column_name - view_infos_p = obi_view_map_file(self._pointer, str2bytes(view_name)) + view_infos_p = obi_view_map_file(self._pointer, str2bytes(view_name), True) view_infos_d = {} view_infos_d["name"] = bytes2str(view_infos_p.name) view_infos_d["comments"] = bytes2str(view_infos_p.comments) diff --git a/python/obitools3/obidms/capi/obiview.pxd b/python/obitools3/obidms/capi/obiview.pxd index 34521ca..792441b 100644 --- a/python/obitools3/obidms/capi/obiview.pxd +++ b/python/obitools3/obidms/capi/obiview.pxd @@ -68,7 +68,7 @@ cdef extern from "obiview.h" nogil: Obiview_p obi_new_view_nuc_seqs_cloned_from_name(OBIDMS_p dms, const_char_p view_name, const_char_p view_to_clone_name, index_t* line_selection, const_char_p comments, bint quality_column) - Obiview_infos_p obi_view_map_file(OBIDMS_p dms, const char* view_name) + Obiview_infos_p obi_view_map_file(OBIDMS_p dms, const char* view_name, bint finished) int obi_view_unmap_file(OBIDMS_p dms, Obiview_infos_p view_infos) @@ -94,11 +94,7 @@ cdef extern from "obiview.h" nogil: OBIDMS_column_p* obi_view_get_pointer_on_column_in_view(Obiview_p view, const_char_p column_name) - int obi_view_create_column_alias(Obiview_p view, const_char_p current_name, const_char_p alias) - - int obi_save_view(Obiview_p view) - - int obi_close_view(Obiview_p view) + int obi_view_create_column_alias(Obiview_p view, const_char_p current_name, const_char_p alias) int obi_save_and_close_view(Obiview_p view) diff --git a/src/obi_align.c 
b/src/obi_align.c index 3fa3678..07a389b 100644 --- a/src/obi_align.c +++ b/src/obi_align.c @@ -4,7 +4,7 @@ /** * @file obi_align.c - * @author Celine Mercier + * @author Celine Mercier (celine.mercier@metabarcoding.org) * @date May 4th 2016 * @brief Functions handling LCS sequence alignments. */ @@ -31,10 +31,6 @@ #define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) -// TODO -// use openMP pragmas - - /************************************************************************** * * D E C L A R A T I O N O F T H E P R I V A T E F U N C T I O N S @@ -618,12 +614,12 @@ int obi_lcs_align_one_column(OBIDMS_p dms, const char* seq_view_name, const char } // Close views - if (obi_close_view(seq_view) < 0) + if (obi_save_and_close_view(seq_view) < 0) { obidebug(1, "\nError closing the input view after aligning"); return -1; } - if (obi_close_view(output_view) < 0) + if (obi_save_and_close_view(output_view) < 0) { obidebug(1, "\nError closing the output view after aligning"); return -1; @@ -963,19 +959,19 @@ int obi_lcs_align_two_columns(OBIDMS_p dms, // Close views if (seq2_view != seq1_view) { - if (obi_close_view(seq2_view) < 0) + if (obi_save_and_close_view(seq2_view) < 0) { obidebug(1, "\nError closing the second input view after aligning"); return -1; } } - if (obi_close_view(seq1_view) < 0) + if (obi_save_and_close_view(seq1_view) < 0) { obidebug(1, "\nError closing the first input view after aligning"); return -1; } - if (obi_close_view(output_view) < 0) + if (obi_save_and_close_view(output_view) < 0) { obidebug(1, "\nError closing the output view after aligning"); return -1; diff --git a/src/obiview.c b/src/obiview.c index a9b0a86..a8f67a2 100644 --- a/src/obiview.c +++ b/src/obiview.c @@ -47,7 +47,7 @@ /** - * Internal function building the file name where the informations about an obiview are stored. + * Internal function building the file name where the informations about a finished, read-only obiview are stored. 
* * @warning The returned pointer has to be freed by the caller. * @@ -63,7 +63,23 @@ static char* build_obiview_file_name(const char* view_name); /** - * Internal function checking if a view with a given name already exists in a DMS. + * Internal function building the file name where the informations about an unfinished, writable obiview are stored. + * + * @warning The returned pointer has to be freed by the caller. + * + * @param view_name The name of the view. + * + * @returns A pointer to the file name. + * @retval NULL if an error occurred. + * + * @since February 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static char* build_unfinished_obiview_file_name(const char* view_name); + + +/** + * Internal function checking if a view (either finished or unfinished) with a given name already exists in a DMS. * * @param dms The DMS. * @param view_name The name of the view. @@ -73,7 +89,7 @@ static char* build_obiview_file_name(const char* view_name); * @since September 2016 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -bool view_exists(OBIDMS_p dms, const char* view_name); +static bool view_exists(OBIDMS_p dms, const char* view_name); /** @@ -84,7 +100,7 @@ bool view_exists(OBIDMS_p dms, const char* view_name); * @since June 2016 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -size_t get_platform_view_file_size(); +static size_t get_platform_view_file_size(); /** @@ -99,7 +115,7 @@ size_t get_platform_view_file_size(); * @since August 2016 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -int enlarge_view_file(Obiview_p view, size_t new_size); +static int enlarge_view_file(Obiview_p view, size_t new_size); /** @@ -117,7 +133,7 @@ int enlarge_view_file(Obiview_p view, size_t new_size); * @since August 2016 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -int write_comments_to_view_file(Obiview_p view, const char* comments); +static int write_comments_to_view_file(Obiview_p view, 
const char* comments); /** @@ -134,7 +150,7 @@ int write_comments_to_view_file(Obiview_p view, const char* comments); * @since June 2016 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -int create_obiview_file(OBIDMS_p dms, const char* view_name); +static int create_obiview_file(OBIDMS_p dms, const char* view_name); /** @@ -156,7 +172,7 @@ int create_obiview_file(OBIDMS_p dms, const char* view_name); * @since June 2016 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -void update_column_refs(Obiview_p view); +static void update_column_refs(Obiview_p view); /** @@ -175,7 +191,7 @@ void update_column_refs(Obiview_p view); * @since July 2016 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -int create_column_dict(Obiview_p view); +static int create_column_dict(Obiview_p view); /** @@ -194,7 +210,7 @@ int create_column_dict(Obiview_p view); * @since July 2016 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -int update_column_dict(Obiview_p view); +static int update_column_dict(Obiview_p view); /** @@ -219,7 +235,7 @@ int update_column_dict(Obiview_p view); * @since July 2016 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -int update_column_refs_and_dict(Obiview_p view); +static int update_column_refs_and_dict(Obiview_p view); /** @@ -239,7 +255,7 @@ int update_column_refs_and_dict(Obiview_p view); * @since February 2016 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -int update_lines(Obiview_p view, index_t line_count); +static int update_lines(Obiview_p view, index_t line_count); /** @@ -257,7 +273,71 @@ int update_lines(Obiview_p view, index_t line_count); * @since February 2016 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -OBIDMS_column_p clone_column_in_view(Obiview_p view, const char* column_name); +static OBIDMS_column_p clone_column_in_view(Obiview_p view, const char* column_name); + + +/** + * @brief Saves a view, updating its informations in the view 
file. + * + * @warning The view must be writable. + * + * @param view A pointer on the view. + * + * @returns A value indicating the success of the operation. + * @retval 0 if the operation was successfully completed. + * @retval -1 if an error occurred. + * + * @since February 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int save_view(Obiview_p view); + + +/** + * @brief Rename a view file once the view is finished, replacing the '*.obiview_unfinished' extension with '*.obiview'. + * + * @param view A pointer on the view. + * + * @returns A value indicating the success of the operation. + * @retval 0 if the operation was successfully completed. + * @retval -1 if an error occurred. + * + * @since February 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int rename_finished_view(Obiview_p view); + + +/** + * @brief Finishes a view: check the predicates, save all the informations, rename the view file. + * + * @param view A pointer on the view. + * + * @returns A value indicating the success of the operation. + * @retval 0 if the operation was successfully completed. + * @retval -1 if an error occurred. + * + * @since February 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int finish_view(Obiview_p view); + +/** + * @brief Closes an opened view. + * + * @warning Doesn't save the view. + * + * @param view A pointer on the view. + * + * @returns A value indicating the success of the operation. + * @retval 0 if the operation was successfully completed. + * @retval -1 if an error occurred. 
+ * + * @see obi_save_and_close_view() + * @since February 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int close_view(Obiview_p view); /** @@ -276,7 +356,7 @@ OBIDMS_column_p clone_column_in_view(Obiview_p view, const char* column_name); * @since April 2016 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -int prepare_to_set_value_in_column(Obiview_p view, OBIDMS_column_p* column_pp, index_t* line_nb_p); +static int prepare_to_set_value_in_column(Obiview_p view, OBIDMS_column_p* column_pp, index_t* line_nb_p); /** @@ -294,7 +374,7 @@ int prepare_to_set_value_in_column(Obiview_p view, OBIDMS_column_p* column_pp, i * @since April 2016 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -int prepare_to_get_value_from_column(Obiview_p view, index_t* line_nb_p); +static int prepare_to_get_value_from_column(Obiview_p view, index_t* line_nb_p); /****** PREDICATE FUNCTIONS *******/ @@ -313,7 +393,7 @@ int prepare_to_get_value_from_column(Obiview_p view, index_t* line_nb_p); * @since July 2016 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -char* view_has_nuc_sequence_column(Obiview_p view); +static char* view_has_nuc_sequence_column(Obiview_p view); /** @@ -330,7 +410,7 @@ char* view_has_nuc_sequence_column(Obiview_p view); * @since July 2016 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -char* view_has_quality_column(Obiview_p view); +static char* view_has_quality_column(Obiview_p view); /** @@ -347,7 +427,7 @@ char* view_has_quality_column(Obiview_p view); * @since July 2016 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -char* view_has_id_column(Obiview_p view); +static char* view_has_id_column(Obiview_p view); /** @@ -364,7 +444,7 @@ char* view_has_id_column(Obiview_p view); * @since July 2016 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -char* view_has_definition_column(Obiview_p view); +static char* view_has_definition_column(Obiview_p 
view); /** @@ -381,7 +461,7 @@ char* view_has_definition_column(Obiview_p view); * @since July 2016 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -char* view_check_qual_match_seqs(Obiview_p view); +static char* view_check_qual_match_seqs(Obiview_p view); /** @@ -396,7 +476,7 @@ char* view_check_qual_match_seqs(Obiview_p view); * @since July 2016 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -char* view_check_one_predicate(Obiview_p view, char* (*predicate_function)(Obiview_p view)); +static char* view_check_one_predicate(Obiview_p view, char* (*predicate_function)(Obiview_p view)); /** @@ -410,7 +490,7 @@ char* view_check_one_predicate(Obiview_p view, char* (*predicate_function)(Obivi * @since July 2016 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -char* view_check_all_predicates(Obiview_p view); +static char* view_check_all_predicates(Obiview_p view); /************************************************************************ @@ -443,11 +523,35 @@ static char* build_obiview_file_name(const char* view_name) } -bool view_exists(OBIDMS_p dms, const char* view_name) +static char* build_unfinished_obiview_file_name(const char* view_name) +{ + char* file_name; + + // Build file name + file_name = (char*) malloc((strlen(view_name) + 19 + 1)*sizeof(char)); + if (file_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a view file name"); + return NULL; + } + if (sprintf(file_name, "%s.obiview_unfinished", view_name) < 0) + { + obi_set_errno(OBIVIEW_ERROR); + obidebug(1, "\nProblem building an unfinished obiview file name"); + return NULL; + } + + return file_name; +} + + +static bool view_exists(OBIDMS_p dms, const char* view_name) { struct dirent* dp; char* file_name; + // Check finished views // Create file name file_name = build_obiview_file_name(view_name); if (file_name == NULL) @@ -458,13 +562,38 @@ bool view_exists(OBIDMS_p dms, const char* view_name) if ((dp->d_name)[0] 
== '.') continue; if (strcmp(dp->d_name, file_name) == 0) + { + free(file_name); return true; + } } + + free(file_name); + + // Check unfinished views + // Create file name + file_name = build_unfinished_obiview_file_name(view_name); + if (file_name == NULL) + return -1; + + while ((dp = readdir(dms->view_directory)) != NULL) + { + if ((dp->d_name)[0] == '.') + continue; + if (strcmp(dp->d_name, file_name) == 0) + { + free(file_name); + return true; + } + } + + free(file_name); + return false; } -size_t get_platform_view_file_size() +static size_t get_platform_view_file_size() { size_t obiview_size; size_t rounded_obiview_size; @@ -480,7 +609,7 @@ size_t get_platform_view_file_size() } -int enlarge_view_file(Obiview_p view, size_t new_size) +static int enlarge_view_file(Obiview_p view, size_t new_size) { int obiview_file_descriptor; double multiple; @@ -488,7 +617,7 @@ int enlarge_view_file(Obiview_p view, size_t new_size) char* file_name; // Create file name - file_name = build_obiview_file_name((view->infos)->name); + file_name = build_unfinished_obiview_file_name((view->infos)->name); if (file_name == NULL) return -1; @@ -556,7 +685,7 @@ int enlarge_view_file(Obiview_p view, size_t new_size) } -int write_comments_to_view_file(Obiview_p view, const char* comments) +static int write_comments_to_view_file(Obiview_p view, const char* comments) { size_t new_size; @@ -580,14 +709,14 @@ int write_comments_to_view_file(Obiview_p view, const char* comments) } -int create_obiview_file(OBIDMS_p dms, const char* view_name) +static int create_obiview_file(OBIDMS_p dms, const char* view_name) { char* file_name; int obiview_file_descriptor; size_t file_size; // Create file name - file_name = build_obiview_file_name(view_name); + file_name = build_unfinished_obiview_file_name(view_name); if (file_name == NULL) return -1; @@ -634,7 +763,7 @@ int create_obiview_file(OBIDMS_p dms, const char* view_name) } -void update_column_refs(Obiview_p view) +static void 
update_column_refs(Obiview_p view) { int i; @@ -646,7 +775,7 @@ void update_column_refs(Obiview_p view) } -int create_column_dict(Obiview_p view) +static int create_column_dict(Obiview_p view) { int i; @@ -681,7 +810,7 @@ int create_column_dict(Obiview_p view) } -int update_column_dict(Obiview_p view) +static int update_column_dict(Obiview_p view) { // Re-initialize the dictionary to rebuild it from scratch ht_free(view->column_dict); @@ -693,14 +822,14 @@ int update_column_dict(Obiview_p view) } -int update_column_refs_and_dict(Obiview_p view) +static int update_column_refs_and_dict(Obiview_p view) { update_column_refs(view); return update_column_dict(view); } -int update_lines(Obiview_p view, index_t line_count) +static int update_lines(Obiview_p view, index_t line_count) { int i; @@ -739,7 +868,7 @@ int update_lines(Obiview_p view, index_t line_count) } -OBIDMS_column_p clone_column_in_view(Obiview_p view, const char* column_name) +static OBIDMS_column_p clone_column_in_view(Obiview_p view, const char* column_name) { int i; OBIDMS_column_p column = NULL; @@ -799,7 +928,161 @@ OBIDMS_column_p clone_column_in_view(Obiview_p view, const char* column_name) } -int prepare_to_set_value_in_column(Obiview_p view, OBIDMS_column_p* column_pp, index_t* line_nb_p) +static int save_view(Obiview_p view) +{ + // Check that the view is not read-only + if (view->read_only) + { + obi_set_errno(OBIVIEW_ERROR); + obidebug(1, "\nError trying to save a read-only view"); + return -1; + } + + // Store reference for the line selection associated with that view if there is one + if (view->line_selection != NULL) // Unnecessary in theory, the line selection references are already saved + { + strcpy(((view->infos)->line_selection).column_name, ((view->line_selection)->header)->name); + ((view->infos)->line_selection).version = ((view->line_selection)->header)->version; + (view->infos)->all_lines = false; + } + else // Necessary because line selection could have been deleted if a column was 
cloned + { + (((view->infos)->line_selection).column_name)[0] = '\0'; + ((view->infos)->line_selection).version = -1; + (view->infos)->all_lines = true; + } + + update_column_refs(view); + + return 0; +} + + +static int rename_finished_view(Obiview_p view) +{ + char* old_name; + char* new_name; + char* path_old_name; + char* path_new_name; + char* full_path_old_name; + char* full_path_new_name; + + old_name = build_unfinished_obiview_file_name((view->infos)->name); + new_name = build_obiview_file_name((view->infos)->name); + + path_old_name = malloc(MAX_PATH_LEN); + path_new_name = malloc(MAX_PATH_LEN); + + strcpy(path_old_name, "VIEWS/"); + strcat(path_old_name, old_name); + + strcpy(path_new_name, "VIEWS/"); + strcat(path_new_name, new_name); + + full_path_old_name = obi_dms_get_full_path(view->dms, path_old_name); + full_path_new_name = obi_dms_get_full_path(view->dms, path_new_name); + + if (rename(full_path_old_name, full_path_new_name) < 0) + { + obi_set_errno(OBIVIEW_ERROR); + obidebug(1, "\nError renaming the file of a finished view: %s", full_path_new_name); + free(old_name); + free(new_name); + return -1; + } + + free(old_name); + free(new_name); + free(path_new_name); + free(path_old_name); + free(full_path_old_name); + free(full_path_new_name); + + return 0; +} + + +static int finish_view(Obiview_p view) +{ + char* predicates; + + // Check that the view is not read-only + if (view->read_only) + { + obi_set_errno(OBIVIEW_ERROR); + obidebug(1, "\nError trying to save a read-only view"); + return -1; + } + + // Check predicates + predicates = view_check_all_predicates(view); + if (predicates == NULL) + { + obidebug(1, "\nView predicates not respected"); + return -1; // TODO reverse view (delete files) + } + else + { + write_comments_to_view_file(view, predicates); + free(predicates); + } + + if (save_view(view) < 0) + return -1; + + if (rename_finished_view(view) < 0) + return -1; + + // Flag the view as finished + (view->infos)->finished = true; + + 
return 0; +} + + +static int close_view(Obiview_p view) +{ + int i; + int ret_value; + + ret_value = 0; + + for (i=0; i < ((view->infos)->column_count); i++) + { + if (obi_close_column((view->columns)[i]) < 0) + { + obidebug(1, "\nError closing a column while closing a view"); + ret_value = -1; + } + } + + // Close line selection if there is one + if (view->line_selection != NULL) + { + if (obi_close_column(view->line_selection) < 0) + { + obidebug(1, "\nError closing a line selection while closing a view"); + ret_value = -1; + } + } + + // Free the column dictionary + ht_free(view->column_dict); + + // Unmap view file + if (obi_view_unmap_file(view->dms, view->infos) < 0) + { + obidebug(1, "\nError unmaping a view file while closing a view"); + ret_value = -1; + } + + free(view); + + return ret_value; +} + + +static int prepare_to_set_value_in_column(Obiview_p view, OBIDMS_column_p* column_pp, index_t* line_nb_p) { int i; char* column_name = NULL; @@ -846,7 +1129,7 @@ int prepare_to_set_value_in_column(Obiview_p view, OBIDMS_column_p* column_pp, i } -int prepare_to_get_value_from_column(Obiview_p view, index_t* line_nb_p) +static int prepare_to_get_value_from_column(Obiview_p view, index_t* line_nb_p) { if (((*line_nb_p)+1) > ((view->infos)->line_count)) { @@ -865,7 +1148,7 @@ int prepare_to_get_value_from_column(Obiview_p view, index_t* line_nb_p) /****** PREDICATE FUNCTIONS *******/ -char* view_has_nuc_sequence_column(Obiview_p view) +static char* view_has_nuc_sequence_column(Obiview_p view) { char* predicate; @@ -889,7 +1172,7 @@ char* view_has_nuc_sequence_column(Obiview_p view) } -char* view_has_quality_column(Obiview_p view) +static char* view_has_quality_column(Obiview_p view) { char* predicate; @@ -913,7 +1196,7 @@ char* view_has_quality_column(Obiview_p view) } -char* view_has_id_column(Obiview_p view) +static char* view_has_id_column(Obiview_p view) { char* predicate; @@ -936,7 +1219,8 @@ char* view_has_id_column(Obiview_p view) } } -char* 
view_has_definition_column(Obiview_p view) + +static char* view_has_definition_column(Obiview_p view) { char* predicate; @@ -960,7 +1244,7 @@ char* view_has_definition_column(Obiview_p view) } -char* view_check_qual_match_seqs(Obiview_p view) +static char* view_check_qual_match_seqs(Obiview_p view) { index_t i, j, k; index_t nb_elements_per_line; @@ -1053,13 +1337,13 @@ char* view_check_qual_match_seqs(Obiview_p view) } -char* view_check_one_predicate(Obiview_p view, char* (*predicate_function)(Obiview_p view)) +static char* view_check_one_predicate(Obiview_p view, char* (*predicate_function)(Obiview_p view)) { return predicate_function(view); } -char* view_check_all_predicates(Obiview_p view) +static char* view_check_all_predicates(Obiview_p view) { int i, j; size_t size_to_allocate; @@ -1195,7 +1479,7 @@ Obiview_p obi_new_view(OBIDMS_p dms, const char* view_name, Obiview_p view_to_cl } // Map view file - view->infos = obi_view_map_file(dms, view_name); + view->infos = obi_view_map_file(dms, view_name, false); if (view->infos == NULL) { obidebug(1, "\nError mapping the informations of a new view"); @@ -1305,7 +1589,7 @@ Obiview_p obi_new_view(OBIDMS_p dms, const char* view_name, Obiview_p view_to_cl if (write_comments_to_view_file(view, clone_comment) < 0) { obidebug(1, "\nError writing comments when creating a view"); - obi_close_view(view); + close_view(view); return NULL; } } @@ -1341,7 +1625,7 @@ Obiview_p obi_new_view(OBIDMS_p dms, const char* view_name, Obiview_p view_to_cl if (write_comments_to_view_file(view, comments) < 0) { obidebug(1, "\nError writing comments when creating a view"); - obi_close_view(view); + close_view(view); return NULL; } @@ -1360,7 +1644,7 @@ Obiview_p obi_new_view(OBIDMS_p dms, const char* view_name, Obiview_p view_to_cl // Create the column dictionary (hash table) associating column names (or aliases) to column pointers if (create_column_dict(view) < 0) { - obi_close_view(view); + close_view(view); return NULL; } @@ -1409,7 
+1693,7 @@ Obiview_p obi_new_view_cloned_from_name(OBIDMS_p dms, const char* view_name, con return NULL; view = obi_new_view(dms, view_name, view_to_clone, line_selection, comments); - obi_close_view(view_to_clone); + close_view(view_to_clone); return view; } @@ -1511,26 +1795,43 @@ Obiview_p obi_new_view_nuc_seqs_cloned_from_name(OBIDMS_p dms, const char* view_ return NULL; view = obi_new_view_nuc_seqs(dms, view_name, view_to_clone, line_selection, comments, quality_column); - obi_close_view(view_to_clone); + close_view(view_to_clone); return view; } -Obiview_infos_p obi_view_map_file(OBIDMS_p dms, const char* view_name) +Obiview_infos_p obi_view_map_file(OBIDMS_p dms, const char* view_name, bool finished) { char* file_name; Obiview_infos_p view_infos; int obiview_file_descriptor; size_t file_size; + int open_flag; + int mmap_flag; // Create file name - file_name = build_obiview_file_name(view_name); + if (finished) + file_name = build_obiview_file_name(view_name); + else + file_name = build_unfinished_obiview_file_name(view_name); if (file_name == NULL) return NULL; + // Set flags (read-only or not) + if (finished) + { + open_flag = O_RDONLY; + mmap_flag = PROT_READ; + } + else + { + open_flag = O_RDWR; + mmap_flag = PROT_READ | PROT_WRITE; + } + // Open view file - obiview_file_descriptor = openat(dms->view_dir_fd, file_name, O_RDWR, 0777); + obiview_file_descriptor = openat(dms->view_dir_fd, file_name, open_flag, 0777); if (obiview_file_descriptor < 0) { if (errno == ENOENT) @@ -1560,7 +1861,7 @@ Obiview_infos_p obi_view_map_file(OBIDMS_p dms, const char* view_name) // Map the view infos structure view_infos = mmap(NULL, file_size, - PROT_READ | PROT_WRITE, + mmap_flag, MAP_SHARED, obiview_file_descriptor, 0 @@ -1585,17 +1886,20 @@ Obiview_infos_p obi_view_map_file(OBIDMS_p dms, const char* view_name) int obi_view_unmap_file(OBIDMS_p dms, Obiview_infos_p view_infos) { - char* file_name; - int obiview_file_descriptor; - size_t file_size; + char* file_name; + int 
obiview_file_descriptor; + size_t file_size; // Get file name - file_name = build_obiview_file_name(view_infos->name); + if (view_infos->finished) + file_name = build_obiview_file_name(view_infos->name); + else + file_name = build_unfinished_obiview_file_name(view_infos->name); if (file_name == NULL) return -1; // Open view file - obiview_file_descriptor = openat(dms->view_dir_fd, file_name, O_RDWR, 0777); + obiview_file_descriptor = openat(dms->view_dir_fd, file_name, O_RDONLY, 0777); if (obiview_file_descriptor < 0) { obi_set_errno(OBIVIEW_ERROR); @@ -1661,13 +1965,9 @@ Obiview_p obi_open_view(OBIDMS_p dms, const char* view_name) } // Map view file - view->infos = obi_view_map_file(dms, view_name); - - // Check that the view is finished and can be opened - if ((view->infos)->finished == false) + view->infos = obi_view_map_file(dms, view_name, true); + if ((view->infos) == NULL) { - obidebug(1, "\nError opening a view: the view is not finished"); - obi_view_unmap_file(view->dms, view->infos); free(view); return NULL; } @@ -1697,7 +1997,7 @@ Obiview_p obi_open_view(OBIDMS_p dms, const char* view_name) if (column_pointer == NULL) { obidebug(1, "\nError opening a column for a view: column %d: %s, version %d", i, column_name, column_version); - obi_close_view(view); + close_view(view); return NULL; } (view->columns)[i] = column_pointer; @@ -1713,7 +2013,7 @@ Obiview_p obi_open_view(OBIDMS_p dms, const char* view_name) if (create_column_dict(view) < 0) { obidebug(1, "\nError creating the column dictionary when opening a view"); - obi_close_view(view); + close_view(view); return NULL; } @@ -1963,103 +2263,16 @@ int obi_view_create_column_alias(Obiview_p view, const char* current_name, const } -int obi_save_view(Obiview_p view) -{ - // Check that the view is not read-only - if (view->read_only) - { - obi_set_errno(OBIVIEW_ERROR); - obidebug(1, "\nError trying to save a read-only view"); - return -1; - } - - // Store reference for the line selection associated with that 
view if there is one - if (view->line_selection != NULL) // Unnecessary in theory, the line selection references are already saved - { - strcpy(((view->infos)->line_selection).column_name, ((view->line_selection)->header)->name); - ((view->infos)->line_selection).version = ((view->line_selection)->header)->version; - (view->infos)->all_lines = false; - } - else // Necessary because line selection could have been deleted if a column was cloned - { - (((view->infos)->line_selection).column_name)[0] = '\0'; - ((view->infos)->line_selection).version = -1; - (view->infos)->all_lines = true; - } - - update_column_refs(view); - - return 0; -} - - -int obi_close_view(Obiview_p view) -{ - int i; - int ret_value; - - ret_value = 0; - - for (i=0; i < ((view->infos)->column_count); i++) - { - if (obi_close_column((view->columns)[i]) < 0) - { - obidebug(1, "\nError closing a column while closing a view"); - ret_value = -1; - } - } - - // Close line selection if there is one - if (view->line_selection != NULL) - { - if (obi_close_column(view->line_selection) < 0) - { - obidebug(1, "\nError closing a line selection while closing a view"); - ret_value = -1; - } - } - - // Flag the view as finished - (view->infos)->finished = true; - - // Free the column dictionary - ht_free(view->column_dict); - - // Unmap view file - if (obi_view_unmap_file(view->dms, view->infos) < 0) - { - obidebug(1, "\nError unmaping a view file while closing a view"); - ret_value = -1; - } - - free(view); - - return ret_value; -} - - int obi_save_and_close_view(Obiview_p view) { - char* predicates; - - if (!(view->read_only)) - { - predicates = view_check_all_predicates(view); - if (predicates == NULL) - { - obidebug(1, "\nView predicates not respected"); - return -1; // TODO reverse view (delete files) - } - else - { - write_comments_to_view_file(view, predicates); - free(predicates); - } - if (obi_save_view(view) < 0) + // Finish and save the view if it is not read-only + if ( ! 
(view->read_only)) + if (finish_view(view) < 0) return -1; - } - if (obi_close_view(view) < 0) + + if (close_view(view) < 0) return -1; + return 0; } diff --git a/src/obiview.h b/src/obiview.h index 641b856..f2750e9 100644 --- a/src/obiview.h +++ b/src/obiview.h @@ -242,6 +242,7 @@ Obiview_p obi_new_view_nuc_seqs_cloned_from_name(OBIDMS_p dms, const char* view_ * * @param dms A pointer on the OBIDMS. * @param view_name The unique name identifying the view. + * @param finished Whether the view is finished or not. * * @returns A pointer on the mapped view infos structure. * @retval NULL if an error occurred. @@ -249,7 +250,7 @@ Obiview_p obi_new_view_nuc_seqs_cloned_from_name(OBIDMS_p dms, const char* view_ * @since June 2016 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ -Obiview_infos_p obi_view_map_file(OBIDMS_p dms, const char* view_name); +Obiview_infos_p obi_view_map_file(OBIDMS_p dms, const char* view_name, bool finished); /** @@ -444,42 +445,6 @@ int obi_select_line(Obiview_p view, index_t line_nb); int obi_select_lines(Obiview_p view, index_t* line_nbs); -/** - * @brief Saves a view, writing it in the view file. - * - * The view is written at the end of the view file, following the latest written view. - * - * @warning The view must be writable. - * - * @param view A pointer on the view. - * - * @returns A value indicating the success of the operation. - * @retval 0 if the operation was successfully completed. - * @retval -1 if an error occurred. - * - * @since February 2016 - * @author Celine Mercier (celine.mercier@metabarcoding.org) - */ -int obi_save_view(Obiview_p view); - - -/** - * @brief Closes an opened view. - * - * @warning Uses obi_save_and_close_view() to automatically save the view if it's not already saved in the view file. - * - * @param view A pointer on the view. - * - * @returns A value indicating the success of the operation. - * @retval 0 if the operation was successfully completed. - * @retval -1 if an error occurred. 
- * - * @since February 2016 - * @author Celine Mercier (celine.mercier@metabarcoding.org) - */ -int obi_close_view(Obiview_p view); - - /** * @brief Closes an opened view, and saves it if it is not read-only (meaning it is not already saved in the view file). *