From ae5f42c2606c416682abb9b0020d2000678cd164 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Mon, 19 Aug 2019 12:30:56 +0200 Subject: [PATCH] fixes #61 : now reading merged taxids information when building a reference database --- src/build_reference_db.c | 231 +++++++++++++++++++++++++++++++-------- 1 file changed, 184 insertions(+), 47 deletions(-) diff --git a/src/build_reference_db.c b/src/build_reference_db.c index d65df7d..260474c 100755 --- a/src/build_reference_db.c +++ b/src/build_reference_db.c @@ -40,6 +40,27 @@ * **************************************************************************/ +/** + * Internal function computing the last common ancestor (LCA) of a list of merged taxids (generated by obi uniq). + * Also works on just simple taxid (returns the associated taxon). + * + * @param view A pointer on the view containing the taxid information (merged or not). + * @param taxid_column A pointer on the column where taxids are kept (merged or not, aka int array or int columns). + * @param idx The index of the sequence to compute the LCA of in the view. + * @param merged Whether the taxid information is made of arrays of taxids or a single taxid. + * @param tax A pointer on a taxonomy structure. + * @param taxid_count The maximum number of taxids associated with one sequence (aka the number of elements in taxid_column). + * @param taxid_str_indices A pointer on the list of indices in taxid_strings referring to the taxids stored as strings (see next param). + * @param taxid_strings A pointer on the list of taxids stored as strings in the taxid column header (they correspond to the element names). + * + * @returns A pointer on the LCA taxon. + * @retval NULL if an error occurred. + * + * @since August 2019 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static inline ecotx_t* get_lca_from_merged_taxids(Obiview_p view, OBIDMS_column_p taxid_column, index_t idx, bool merged, + OBIDMS_taxonomy_p tax, index_t taxid_count, int64_t* taxid_str_indices, char* taxid_strings); /************************************************************************ @@ -49,6 +70,54 @@ ************************************************************************/ +static inline ecotx_t* get_lca_from_merged_taxids(Obiview_p view, OBIDMS_column_p taxid_column, index_t idx, bool merged, + OBIDMS_taxonomy_p tax, index_t taxid_count, int64_t* taxid_str_indices, char* taxid_strings) +{ + ecotx_t* taxon = NULL; + ecotx_t* lca = NULL; + int32_t taxid; + index_t taxid_idx; + int64_t taxid_str_idx; + char* taxid_str; + obiint_t n; + + for (taxid_idx=0; taxid_idxheader)->to_eval) + { + obidebug(1, "\nError opening the column containing the taxids of the reference sequences when building a reference database: " + "run previous obi uniq with a higher threshold for the --max-elts option while waiting for the implementation of this feature"); + return -1; + } + + // Get the (maximum) number of taxids associated with a sequence + taxid_count = (refs_taxid_column->header)->nb_elements_per_line; + // Get pointers on element names (aka taxids) and their start indices for faster access + taxid_strings = (refs_taxid_column->header)->elements_names; + taxid_str_indices = (refs_taxid_column->header)->elements_names_idx; + matrix_lca_taxid_column = obi_view_get_column(matrix_with_lca_view, LCA_TAXID_COLUMN_NAME); if (matrix_lca_taxid_column == NULL) { @@ -220,39 +318,30 @@ int build_reference_db(const char* dms_name, // For each pair for (i=0; i<(matrix_with_lca_view->infos)->line_count; i++) { - // Read taxid1 and get taxon1 + // Read all taxids associated with the first sequence and compute their LCA // Read line index idx1 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx1_column, i, 0); - taxid1 = obi_get_int_with_elt_idx_and_col_p_in_view(refs_view, refs_taxid_column, idx1, 0); - if (taxid1 == OBIInt_NA) + lca_1 = get_lca_from_merged_taxids(refs_view, refs_taxid_column, idx1, merged, + tax, taxid_count, taxid_str_indices, taxid_strings); + if (lca_1 == NULL) { - obidebug(1, "\nError getting a taxid when building a reference database, seq %lld in refs view has no taxid associated", idx1); - return -1; - } - taxon1 = obi_taxo_get_taxon_with_taxid(tax, taxid1); - if (taxon1 == NULL) - { - obidebug(1, "\nError getting a taxon with taxid %d when building a reference database, seq %lld in refs view", taxid1, idx1); + obidebug(1, "\nError getting the last common ancestor of merged taxids when building a reference database"); return -1; } - // Read taxid2 and get taxon2 + // Read all taxids associated with the second sequence and compute their LCA + // Read line index idx2 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx2_column, i, 0); - taxid2 = obi_get_int_with_elt_idx_and_col_p_in_view(refs_view, refs_taxid_column, idx2, 0); - if (taxid2 == OBIInt_NA) + lca_2 = get_lca_from_merged_taxids(refs_view, refs_taxid_column, idx2, merged, + tax, taxid_count, taxid_str_indices, taxid_strings); + if (lca_2 == NULL) { - obidebug(1, "\nError getting a taxid when building a reference database, seq %lld in refs view has no taxid associated", idx1); - return -1; - } - taxon2 = obi_taxo_get_taxon_with_taxid(tax, taxid2); - if (taxon2 == NULL) - { - obidebug(1, "\nError getting a taxon with taxid %d when building a reference database, seq %lld in refs view", taxid2, idx2); + obidebug(1, "\nError getting the last common ancestor of merged taxids when building a reference database"); return -1; } // Compute and write their LCA - lca = obi_taxo_get_lca(taxon1, taxon2); + lca = obi_taxo_get_lca(lca_1, lca_2); if (lca == NULL) { obidebug(1, "\nError getting the last common ancestor of two taxa when building a reference database"); @@ -373,11 +462,11 @@ int build_reference_db(const char* dms_name, score_array_writable_length = score_array_length; // Check that lengths are equal (TODO eventually remove?) - if (taxid_array_length != score_array_length) - { - obidebug(1, "\nError building a reference database: LCA taxid and score arrays' lengths are not equal (%d and %d)", taxid_array_length, score_array_length); - return -1; - } +// if (taxid_array_length != score_array_length) +// { +// obidebug(1, "\nError building a reference database: LCA taxid and score arrays' lengths are not equal (%d and %d)", taxid_array_length, score_array_length); +// return -1; +// } // If empty, add values if (taxid_array_length == 0) @@ -420,8 +509,11 @@ int build_reference_db(const char* dms_name, lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1]; score_array_writable[k] = score_array_writable[k+1]; } - taxid_array_writable_length--; - score_array_writable_length--; + if (k>(j-1)) + { + taxid_array_writable_length--; + score_array_writable_length--; + } j--; } } @@ -456,8 +548,11 @@ int build_reference_db(const char* dms_name, lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1]; score_array_writable[k] = score_array_writable[k+1]; } - taxid_array_writable_length--; - score_array_writable_length--; + if (k>(j-1)) + { + taxid_array_writable_length--; + score_array_writable_length--; + } j--; } } @@ -484,8 +579,11 @@ int build_reference_db(const char* dms_name, lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1]; score_array_writable[k] = score_array_writable[k+1]; } - taxid_array_writable_length--; - score_array_writable_length--; + if (k>(j-1)) + { + taxid_array_writable_length--; + score_array_writable_length--; + } j--; } } @@ -516,11 +614,11 @@ int build_reference_db(const char* dms_name, score_array_writable_length = score_array_length; // Check that lengths are equal (TODO eventually remove?) - if (taxid_array_length != score_array_length) - { - obidebug(1, "\nError building a reference database: LCA taxid and score arrays' lengths are not equal (%d and %d)", taxid_array_length, score_array_length); - return -1; - } +// if (taxid_array_length != score_array_length) +// { +// obidebug(1, "\nError building a reference database: LCA taxid and score arrays' lengths are not equal (%d and %d)", taxid_array_length, score_array_length); +// return -1; +// } // If empty, add values if (taxid_array_length == 0) @@ -563,8 +661,11 @@ int build_reference_db(const char* dms_name, lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1]; score_array_writable[k] = score_array_writable[k+1]; } - taxid_array_writable_length--; - score_array_writable_length--; + if (k>(j-1)) + { + taxid_array_writable_length--; + score_array_writable_length--; + } j--; } } @@ -599,8 +700,11 @@ int build_reference_db(const char* dms_name, lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1]; score_array_writable[k] = score_array_writable[k+1]; } - taxid_array_writable_length--; - score_array_writable_length--; + if (k>(j-1)) + { + taxid_array_writable_length--; + score_array_writable_length--; + } j--; } } @@ -627,8 +731,11 @@ int build_reference_db(const char* dms_name, lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1]; score_array_writable[k] = score_array_writable[k+1]; } - taxid_array_writable_length--; - score_array_writable_length--; + if (k>(j-1)) + { + taxid_array_writable_length--; + score_array_writable_length--; + } j--; } } @@ -651,6 +758,36 @@ int build_reference_db(const char* dms_name, } } + // Fill empty LCA informations (because filling from potentially sparse alignment matrix) with the sequence taxid + score=1.0; // technically getting LCA of identical sequences + for (i=0; i<(o_view->infos)->line_count; i++) + { + obi_get_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, i, &taxid_array_length); + if (taxid_array_length == 0) // no LCA set + { + lca = get_lca_from_merged_taxids(refs_view, refs_taxid_column, i, merged, + tax, taxid_count, taxid_str_indices, taxid_strings); + if (lca == NULL) + { + obidebug(1, "\nError getting the last common ancestor of merged taxids when building a reference database"); + return -1; + } + + taxid_lca = lca->taxid; + // Set them in output view + if (obi_set_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, i, &taxid_lca, (uint8_t) (obi_sizeof(OBI_INT) * 8), 1) < 0) + { + obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database"); + return -1; + } + if (obi_set_array_with_col_p_in_view(o_view, final_score_a_column, i, &score, (uint8_t) (obi_sizeof(OBI_FLOAT) * 8), 1) < 0) + { + obidebug(1, "\nError setting a score array in a column when building a reference database"); + return -1; + } + } + } + // Close views and DMS if (obi_save_and_close_view(refs_view) < 0) {