From f942dd856f79b53d52e67d662ed6e7bba039993f Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Tue, 27 Nov 2018 15:56:50 +0100 Subject: [PATCH] C: new function to build a reference database with LCA and score metadata for the taxonomic assignment of sequences --- src/build_reference_db.c | 672 +++++++++++++++++++++++++++++++++++++++ src/build_reference_db.h | 56 ++++ 2 files changed, 728 insertions(+) create mode 100755 src/build_reference_db.c create mode 100755 src/build_reference_db.h diff --git a/src/build_reference_db.c b/src/build_reference_db.c new file mode 100755 index 0000000..f1bb222 --- /dev/null +++ b/src/build_reference_db.c @@ -0,0 +1,672 @@ +/*********************************************************************************** + * Functions to build referece databases for the taxonomic assignment of sequences * + ***********************************************************************************/ + +/** + * @file build_reference_db.c + * @author Celine Mercier (celine.mercier@metabarcoding.org) + * @date November 15th 2018 + * @brief Functions to build referece databases for the taxonomic assignment of sequences. + */ + +//#define OMP_SUPPORT // TODO +#ifdef OMP_SUPPORT +#include +#endif + +#include +#include +#include +#include +#include + +#include "build_reference_db.h" +#include "obidms.h" +#include "obidebug.h" +#include "obierrno.h" +#include "obitypes.h" +#include "obiview.h" +#include "obi_lcs.h" +#include "obidms_taxonomy.h" +#include "obidmscolumn_array.h" + + +#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) + + +/************************************************************************** + * + * D E C L A R A T I O N O F T H E P R I V A T E F U N C T I O N S + * + **************************************************************************/ + + + +/************************************************************************ + * + * D E F I N I T I O N O F T H E P R I V A T E F U N C T I O N S + * + ************************************************************************/ + + + +/********************************************************************** + * + * D E F I N I T I O N O F T H E P U B L I C F U N C T I O N S + * + **********************************************************************/ + + +int build_reference_db(const char* dms_name, + const char* refs_view_name, + const char* taxonomy_name, + const char* o_view_name, + const char* o_view_comments, + double threshold) +{ + OBIDMS_p dms = NULL; + OBIDMS_taxonomy_p tax = NULL; + char* matrix_view_name = NULL; // TODO discuss that it must be unique + char* matrix_with_lca_view_name = NULL; // TODO discuss that it must be unique + Obiview_p matrix_with_lca_view = NULL; + Obiview_p refs_view = NULL; + Obiview_p o_view = NULL; + OBIDMS_column_p matrix_idx1_column = NULL; + OBIDMS_column_p matrix_idx2_column = NULL; + OBIDMS_column_p refs_taxid_column = NULL; + OBIDMS_column_p matrix_lca_taxid_column = NULL; + OBIDMS_column_p matrix_score_column = NULL; + OBIDMS_column_p final_lca_taxid_a_column = NULL; + OBIDMS_column_p final_score_a_column = NULL; + obiint_t taxid1, taxid2, taxid_lca; + ecotx_t* taxon1 = NULL; + ecotx_t* taxon2 = NULL; + ecotx_t* lca = NULL; + index_t idx1, idx2; + index_t i, j, k; + int32_t taxid_array_length; + int32_t score_array_length; + int32_t taxid_array_writable_length; + int32_t score_array_writable_length; + obifloat_t score; + obiint_t* lca_taxid_array; + obifloat_t* score_array; + obiint_t lca_taxid_array_writable[1000]; + obifloat_t score_array_writable[1000]; + bool modified; + + // Discuss keeping the matrix view or not + matrix_view_name = calloc((strlen(o_view_name)+strlen("_matrix")+1), sizeof(char)); + if (matrix_view_name == NULL) + { + obidebug(1, "\nError allocating memory for the name of the matrix view when building a reference database"); + return -1; + } + matrix_view_name = strcpy(matrix_view_name, o_view_name); + strcat(matrix_view_name, "_matrix"); + + if (obi_lcs_align_one_column(dms_name, + refs_view_name, + "", + "", + "", + matrix_view_name, + "{}", // TODO proper comments? + false, false, + threshold, true, 0, true, + 1) < 0) + { + obidebug(1, "\nError aligning the sequences when building a reference database"); + } + + // Add a column to the matrix view for LCAs + + // Clone the view with a new name + + // Build the view name + matrix_with_lca_view_name = calloc((strlen(o_view_name)+strlen("_matrix_with_lca")+1), sizeof(char)); + if (matrix_with_lca_view_name == NULL) + { + obidebug(1, "\nError allocating memory for the name of the matrix view with LCA when building a reference database"); + return -1; + } + matrix_with_lca_view_name = strcpy(matrix_with_lca_view_name, o_view_name); + strcat(matrix_with_lca_view_name, "_matrix_with_lca"); + + // Clone the matrix view + // Open the DMS + dms = obi_open_dms(dms_name); + if (dms == NULL) + { + obidebug(1, "\nError opening the DMS when building a reference database"); + return -1; + } + + // Clone the view + matrix_with_lca_view = obi_clone_view_from_name(dms, matrix_view_name, matrix_with_lca_view_name, NULL, "{}"); + if (matrix_with_lca_view == NULL) + { + obidebug(1, "\nError creating the matrix with LCA view when building a reference database"); + return -1; + } + + // Add the LCA taxid column + if (obi_view_add_column(matrix_with_lca_view, + LCA_TAXID_COLUMN_NAME, + -1, + LCA_TAXID_COLUMN_NAME, + OBI_INT, + -1, + 1, + "", + false, + false, + false, + "", + "", + -1, + "{}", + true) + < 0) + { + obidebug(1, "\nError adding the LCA column to the matrix with LCA view when building a reference database"); + return -1; + } + + // Open the taxonomy + tax = obi_read_taxonomy(dms, taxonomy_name, true); + if (tax == NULL) + { + obidebug(1, "\nError reading the taxonomy when building a reference database"); + return -1; + } + + // Open the reference sequences view + refs_view = obi_open_view(dms, refs_view_name); + if (refs_view == NULL) + { + obidebug(1, "\nError opening the reference sequences view when building a reference database"); + return -1; + } + + // Save column pointers + matrix_idx1_column = obi_view_get_column(matrix_with_lca_view, IDX1_COLUMN_NAME); + if (matrix_idx1_column == NULL) + { + obidebug(1, "\nError opening the first index column when building a reference database"); + return -1; + } + matrix_idx2_column = obi_view_get_column(matrix_with_lca_view, IDX2_COLUMN_NAME); + if (matrix_idx2_column == NULL) + { + obidebug(1, "\nError opening the second index column when building a reference database"); + return -1; + } + refs_taxid_column = obi_view_get_column(refs_view, TAXID_COLUMN); + if (refs_taxid_column == NULL) + { + obidebug(1, "\nError opening the taxid column when building a reference database"); + return -1; + } + matrix_lca_taxid_column = obi_view_get_column(matrix_with_lca_view, LCA_TAXID_COLUMN_NAME); + if (matrix_lca_taxid_column == NULL) + { + obidebug(1, "\nError opening the LCA column when building a reference database"); + return -1; + } + + // Compute all the LCAs + // For each pair + for (i=0; i<(matrix_with_lca_view->infos)->line_count; i++) + { + // Read taxid1 and get taxon1 + // Read line index + idx1 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx1_column, i, 0); + taxid1 = obi_get_int_with_elt_idx_and_col_p_in_view(refs_view, refs_taxid_column, idx1, 0); + taxon1 = obi_taxo_get_taxon_with_taxid(tax, taxid1); + if (taxon1 == NULL) + { + obidebug(1, "\nError getting a taxon when building a reference database"); + return -1; + } + + // Read taxid2 and get taxon2 + idx2 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx2_column, i, 0); + taxid2 = obi_get_int_with_elt_idx_and_col_p_in_view(refs_view, refs_taxid_column, idx2, 0); + taxon2 = obi_taxo_get_taxon_with_taxid(tax, taxid2); + if (taxon2 == NULL) + { + obidebug(1, "\nError getting a taxon when building a reference database"); + return -1; + } + + // Compute and write their LCA + lca = obi_taxo_get_lca(taxon1, taxon2); + if (lca == NULL) + { + obidebug(1, "\nError getting the last common ancestor of two taxa when building a reference database"); + return -1; + } + taxid_lca = lca->taxid; + if (obi_set_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_lca_taxid_column, i, 0, taxid_lca) < 0) + { + obidebug(1, "\nError writing the last common ancestor of two taxa when building a reference database"); + return -1; + } + } + + // Clone refs view, add 2 arrays columns for lca and score, compute and write them + + // Clone refs view + o_view = obi_clone_view(dms, refs_view, o_view_name, NULL, o_view_comments); + if (o_view == NULL) + { + obidebug(1, "\nError cloning the view of references when building a reference database"); + return -1; + } + + // Add the LCA taxid array column + if (obi_view_add_column(o_view, + LCA_TAXID_ARRAY_COLUMN_NAME, + -1, + LCA_TAXID_ARRAY_COLUMN_NAME, + OBI_INT, + -1, + 1, + "", + false, + true, + false, + "", + "", + -1, + "{}", + true) + < 0) + { + obidebug(1, "\nError adding the LCA taxid column to the final view when building a reference database"); + return -1; + } + + // Add the score array column + if (obi_view_add_column(o_view, + LCA_SCORE_ARRAY_COLUMN_NAME, + -1, + LCA_SCORE_ARRAY_COLUMN_NAME, + OBI_FLOAT, + -1, + 1, + "", + false, + true, + false, + "", + "", + -1, + "{}", + true) + < 0) + { + obidebug(1, "\nError adding the score column to the final view when building a reference database"); + return -1; + } + + // Open the newly added columns + final_lca_taxid_a_column = obi_view_get_column(o_view, LCA_TAXID_ARRAY_COLUMN_NAME); + if (final_lca_taxid_a_column == NULL) + { + obidebug(1, "\nError opening the LCA taxid array column when building a reference database"); + return -1; + } + final_score_a_column = obi_view_get_column(o_view, LCA_SCORE_ARRAY_COLUMN_NAME); + if (final_score_a_column == NULL) + { + obidebug(1, "\nError opening the score array column when building a reference database"); + return -1; + } + + // Open alignment score column + matrix_score_column = obi_view_get_column(matrix_with_lca_view, SCORE_COLUMN_NAME); + if (matrix_score_column == NULL) + { + obidebug(1, "\nError opening the alignment score column when building a reference database"); + return -1; + } + + // For each sequence, look for all its alignments in the matrix, and for each different LCA taxid/score, order them and write them + // Going through matrix once, filling refs arrays on the go for efficiency + for (i=0; i<(matrix_with_lca_view->infos)->line_count; i++) + { + // Read ref line indexes + idx1 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx1_column, i, 0); + idx2 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx2_column, i, 0); + // Read LCA taxid + taxid_lca = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_lca_taxid_column, i, 0); + // Get LCA taxon + lca = obi_taxo_get_taxon_with_taxid(tax, taxid_lca); + if (lca == NULL) + { + obidebug(1, "\nError getting a LCA from taxid when building a reference database"); + return -1; + } + // Read alignment score + score = obi_get_float_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_score_column, i, 0); + + ///////////////// Compute for first sequence \\\\\\\\\\\\\\\\\\\\\\\ + + // Read arrays + lca_taxid_array = (obiint_t*) obi_column_get_array(final_lca_taxid_a_column, idx1, &taxid_array_length); + score_array = (obifloat_t*) obi_column_get_array(final_score_a_column, idx1, &score_array_length); + + taxid_array_writable_length = taxid_array_length; + score_array_writable_length = score_array_length; + + // Check that lengths are equal (TODO eventually remove?) + if (taxid_array_length != score_array_length) + { + obidebug(1, "\nError building a reference database: LCA taxid and score arrays' lengths are not equal (%d and %d)", taxid_array_length, score_array_length); + return -1; + } + + // If empty, add values + if (taxid_array_length == 0) + { + if (obi_column_set_array(final_lca_taxid_a_column, idx1, &taxid_lca, (uint8_t) obi_sizeof(OBI_INT), 1) < 0) + { + obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database"); + return -1; + } + if (obi_column_set_array(final_score_a_column, idx1, &score, (uint8_t) obi_sizeof(OBI_FLOAT), 1) < 0) + { + obidebug(1, "\nError setting a score array in a column when building a reference database"); + return -1; + } + } + else + { + j = 0; + modified = false; + while (j < taxid_array_length) + { + if (taxid_lca == lca_taxid_array[j]) // Same LCA taxid: replace if the similarity score is greater + { + if (score > score_array[j]) + { + // Copy in array to make writable + memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t)); + memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t)); + modified = true; + + // Better score for the same LCA, replace this LCA/score pair + lca_taxid_array_writable[j] = taxid_lca; + score_array_writable[j] = score; + + // Remove the previous (children) LCAs from the array if their score is equal or lower + while ((j>0) && (score_array_writable[j-1] <= score)) + { + for (k=j-1; k score_array[j]) + { + memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t)); + memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t)); + modified = true; + + // Insert new LCA/score pair + for (k=taxid_array_writable_length; k>=j+1; k--) + { + lca_taxid_array_writable[k] = lca_taxid_array_writable[k-1]; + score_array_writable[k] = score_array_writable[k-1]; + } + + taxid_array_writable_length++; + score_array_writable_length++; + + lca_taxid_array_writable[j] = taxid_lca; + score_array_writable[j] = score; + + // Remove the previous (children) LCAs from the array if their score is equal or lower + while ((j>0) && (score_array_writable[j-1] <= score)) + { + for (k=j-1; k0) && (score_array_writable[j-1] <= score)) + { + for (k=j-1; k score_array[j]) + { + // Copy in array to make writable + memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t)); + memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t)); + modified = true; + + // Better score for the same LCA, replace this LCA/score pair + lca_taxid_array_writable[j] = taxid_lca; + score_array_writable[j] = score; + + // Remove the previous (children) LCAs from the array if their score is equal or lower + while ((j>0) && (score_array_writable[j-1] <= score)) + { + for (k=j-1; k score_array[j]) + { + memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t)); + memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t)); + modified = true; + + // Insert new LCA/score pair + for (k=taxid_array_writable_length; k>=j+1; k--) + { + lca_taxid_array_writable[k] = lca_taxid_array_writable[k-1]; + score_array_writable[k] = score_array_writable[k-1]; + } + + taxid_array_writable_length++; + score_array_writable_length++; + + lca_taxid_array_writable[j] = taxid_lca; + score_array_writable[j] = score; + + // Remove the previous (children) LCAs from the array if their score is equal or lower + while ((j>0) && (score_array_writable[j-1] <= score)) + { + for (k=j-1; k0) && (score_array_writable[j-1] <= score)) + { + for (k=j-1; k +#include + + +#define LCA_TAXID_COLUMN_NAME "LCA_TAXID" +#define LCA_TAXID_ARRAY_COLUMN_NAME "LCA_TAXID" +#define LCA_SCORE_ARRAY_COLUMN_NAME "LCA_SCORE" + + +/** + * @brief Building of reference databases for the taxonomic assignment of sequences. + * + * Note: The columns where the results are written are automatically named and created. + * + * @param dms_name The name of the DMS. + * @param refs_view_name The name of the view containing the reference sequences annotated with their taxids. + * @param taxonomy_name The name of the taxonomy stored in the DMS. + * @param o_view_name The name of the final reference database to create. + * @param o_view_comments The comments to associate with the final reference database to create. + * @param threshold The threshold (similarity score in identity percentage) at which the database should be created. + * The output database contains the Lowest Common Ancestor and associated highest similarity score for each + * sequence with the sequences with a similarity equal or greater than this threshold. + * + * @returns A value indicating the success of the operation. + * @retval 0 if the operation was successfully completed. + * @retval -1 if an error occurred. + * + * @since November 2018 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +int build_reference_db(const char* dms_name, + const char* refs_view_name, + const char* taxonomy_name, + const char* o_view_name, + const char* o_view_comments, + double threshold); + + +#endif /* BUILD_REFERENCE_DB_H_ */ +