/*********************************************************************************** * Functions to build reference databases for the taxonomic assignment of sequences * ***********************************************************************************/ /** * @file build_reference_db.c * @author Celine Mercier (celine.mercier@metabarcoding.org) * @date November 15th 2018 * @brief Functions to build referece databases for the taxonomic assignment of sequences. */ //#define OMP_SUPPORT // TODO #ifdef OMP_SUPPORT #include #endif #include #include #include #include #include #include "build_reference_db.h" #include "obidms.h" #include "obidebug.h" #include "obierrno.h" #include "obitypes.h" #include "obiview.h" #include "obi_lcs.h" #include "obidms_taxonomy.h" #include "obidmscolumn_array.h" #define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) /************************************************************************** * * D E C L A R A T I O N O F T H E P R I V A T E F U N C T I O N S * **************************************************************************/ /************************************************************************ * * D E F I N I T I O N O F T H E P R I V A T E F U N C T I O N S * ************************************************************************/ /********************************************************************** * * D E F I N I T I O N O F T H E P U B L I C F U N C T I O N S * **********************************************************************/ int build_reference_db(const char* dms_name, const char* refs_view_name, const char* taxonomy_name, const char* o_view_name, const char* o_view_comments, double threshold) { OBIDMS_p dms = NULL; OBIDMS_taxonomy_p tax = NULL; char* matrix_view_name = NULL; // TODO discuss that it must be unique char* matrix_with_lca_view_name = NULL; // TODO discuss that it must be unique Obiview_p matrix_with_lca_view = NULL; Obiview_p refs_view = NULL; Obiview_p o_view = NULL; OBIDMS_column_p matrix_idx1_column = NULL; OBIDMS_column_p matrix_idx2_column = NULL; OBIDMS_column_p refs_taxid_column = NULL; OBIDMS_column_p matrix_lca_taxid_column = NULL; OBIDMS_column_p matrix_score_column = NULL; OBIDMS_column_p final_lca_taxid_a_column = NULL; OBIDMS_column_p final_score_a_column = NULL; obiint_t taxid1, taxid2, taxid_lca; ecotx_t* taxon1 = NULL; ecotx_t* taxon2 = NULL; ecotx_t* lca = NULL; index_t idx1, idx2; index_t i, j, k; int32_t taxid_array_length; int32_t score_array_length; int32_t taxid_array_writable_length; int32_t score_array_writable_length; obifloat_t score; obiint_t* lca_taxid_array; obifloat_t* score_array; obiint_t lca_taxid_array_writable[1000]; obifloat_t score_array_writable[1000]; bool modified; // Discuss keeping the matrix view or not matrix_view_name = calloc((strlen(o_view_name)+strlen("_matrix")+1), sizeof(char)); if (matrix_view_name == NULL) { obidebug(1, "\nError allocating memory for the name of the matrix view when building a reference database"); return -1; } // TODO check and create view name that doesn't already exist matrix_view_name = strcpy(matrix_view_name, o_view_name); strcat(matrix_view_name, "_matrix"); if (obi_lcs_align_one_column(dms_name, refs_view_name, "", "", "", matrix_view_name, "{}", // TODO proper comments? false, false, threshold, true, 0, true, 1) < 0) { obidebug(1, "\nError aligning the sequences when building a reference database"); } // Add a column to the matrix view for LCAs // Clone the view with a new name // Build the view name matrix_with_lca_view_name = calloc((strlen(o_view_name)+strlen("_matrix_with_lca")+1), sizeof(char)); if (matrix_with_lca_view_name == NULL) { obidebug(1, "\nError allocating memory for the name of the matrix view with LCA when building a reference database"); return -1; } matrix_with_lca_view_name = strcpy(matrix_with_lca_view_name, o_view_name); strcat(matrix_with_lca_view_name, "_matrix_with_lca"); // Clone the matrix view // Open the DMS dms = obi_open_dms(dms_name); if (dms == NULL) { obidebug(1, "\nError opening the DMS when building a reference database"); return -1; } // Clone the view matrix_with_lca_view = obi_clone_view_from_name(dms, matrix_view_name, matrix_with_lca_view_name, NULL, "{}"); if (matrix_with_lca_view == NULL) { obidebug(1, "\nError creating the matrix with LCA view when building a reference database"); return -1; } // Add the LCA taxid column if (obi_view_add_column(matrix_with_lca_view, LCA_TAXID_COLUMN_NAME, -1, LCA_TAXID_COLUMN_NAME, OBI_INT, -1, 1, "", false, false, false, "", "", -1, "{}", true) < 0) { obidebug(1, "\nError adding the LCA column to the matrix with LCA view when building a reference database"); return -1; } // Open the taxonomy tax = obi_read_taxonomy(dms, taxonomy_name, true); if (tax == NULL) { obidebug(1, "\nError reading the taxonomy when building a reference database"); return -1; } // Open the reference sequences view refs_view = obi_open_view(dms, refs_view_name); if (refs_view == NULL) { obidebug(1, "\nError opening the reference sequences view when building a reference database"); return -1; } // Save column pointers matrix_idx1_column = obi_view_get_column(matrix_with_lca_view, IDX1_COLUMN_NAME); if (matrix_idx1_column == NULL) { obidebug(1, "\nError opening the first index column when building a reference database"); return -1; } matrix_idx2_column = obi_view_get_column(matrix_with_lca_view, IDX2_COLUMN_NAME); if (matrix_idx2_column == NULL) { obidebug(1, "\nError opening the second index column when building a reference database"); return -1; } refs_taxid_column = obi_view_get_column(refs_view, TAXID_COLUMN); if (refs_taxid_column == NULL) { obidebug(1, "\nError opening the taxid column when building a reference database"); return -1; } matrix_lca_taxid_column = obi_view_get_column(matrix_with_lca_view, LCA_TAXID_COLUMN_NAME); if (matrix_lca_taxid_column == NULL) { obidebug(1, "\nError opening the LCA column when building a reference database"); return -1; } // Compute all the LCAs // For each pair for (i=0; i<(matrix_with_lca_view->infos)->line_count; i++) { // Read taxid1 and get taxon1 // Read line index idx1 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx1_column, i, 0); taxid1 = obi_get_int_with_elt_idx_and_col_p_in_view(refs_view, refs_taxid_column, idx1, 0); if (taxid1 == OBIInt_NA) { obidebug(1, "\nError getting a taxid when building a reference database, seq %lld in refs view has no taxid associated", idx1); return -1; } taxon1 = obi_taxo_get_taxon_with_taxid(tax, taxid1); if (taxon1 == NULL) { obidebug(1, "\nError getting a taxon with taxid %d when building a reference database, seq %lld in refs view", taxid1, idx1); return -1; } // Read taxid2 and get taxon2 idx2 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx2_column, i, 0); taxid2 = obi_get_int_with_elt_idx_and_col_p_in_view(refs_view, refs_taxid_column, idx2, 0); if (taxid2 == OBIInt_NA) { obidebug(1, "\nError getting a taxid when building a reference database, seq %lld in refs view has no taxid associated", idx1); return -1; } taxon2 = obi_taxo_get_taxon_with_taxid(tax, taxid2); if (taxon2 == NULL) { obidebug(1, "\nError getting a taxon with taxid %d when building a reference database, seq %lld in refs view", taxid2, idx2); return -1; } // Compute and write their LCA lca = obi_taxo_get_lca(taxon1, taxon2); if (lca == NULL) { obidebug(1, "\nError getting the last common ancestor of two taxa when building a reference database"); return -1; } taxid_lca = lca->taxid; if (obi_set_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_lca_taxid_column, i, 0, taxid_lca) < 0) { obidebug(1, "\nError writing the last common ancestor of two taxa when building a reference database"); return -1; } } // Clone refs view, add 2 arrays columns for lca and score, compute and write them // Clone refs view o_view = obi_clone_view(dms, refs_view, o_view_name, NULL, o_view_comments); if (o_view == NULL) { obidebug(1, "\nError cloning the view of references when building a reference database"); return -1; } // Add the LCA taxid array column if (obi_view_add_column(o_view, LCA_TAXID_ARRAY_COLUMN_NAME, -1, LCA_TAXID_ARRAY_COLUMN_NAME, OBI_INT, -1, 1, "", false, true, false, "", "", -1, "{}", true) < 0) { obidebug(1, "\nError adding the LCA taxid column to the final view when building a reference database"); return -1; } // Add the score array column if (obi_view_add_column(o_view, LCA_SCORE_ARRAY_COLUMN_NAME, -1, LCA_SCORE_ARRAY_COLUMN_NAME, OBI_FLOAT, -1, 1, "", false, true, false, "", "", -1, "{}", true) < 0) { obidebug(1, "\nError adding the score column to the final view when building a reference database"); return -1; } // Open the newly added columns final_lca_taxid_a_column = obi_view_get_column(o_view, LCA_TAXID_ARRAY_COLUMN_NAME); if (final_lca_taxid_a_column == NULL) { obidebug(1, "\nError opening the LCA taxid array column when building a reference database"); return -1; } final_score_a_column = obi_view_get_column(o_view, LCA_SCORE_ARRAY_COLUMN_NAME); if (final_score_a_column == NULL) { obidebug(1, "\nError opening the score array column when building a reference database"); return -1; } // Open alignment score column matrix_score_column = obi_view_get_column(matrix_with_lca_view, SCORE_COLUMN_NAME); if (matrix_score_column == NULL) { obidebug(1, "\nError opening the alignment score column when building a reference database"); return -1; } // For each sequence, look for all its alignments in the matrix, and for each different LCA taxid/score, order them and write them // Going through matrix once, filling refs arrays on the go for efficiency for (i=0; i<(matrix_with_lca_view->infos)->line_count; i++) { // Read ref line indexes idx1 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx1_column, i, 0); idx2 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx2_column, i, 0); // Read LCA taxid taxid_lca = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_lca_taxid_column, i, 0); // Get LCA taxon lca = obi_taxo_get_taxon_with_taxid(tax, taxid_lca); if (lca == NULL) { obidebug(1, "\nError getting a LCA from taxid when building a reference database, taxid %d", taxid_lca); return -1; } // Read alignment score score = obi_get_float_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_score_column, i, 0); ///////////////// Compute for first sequence \\\\\\\\\\\\\\\\\\\\\\\ (TODO function) // Read arrays lca_taxid_array = (obiint_t*) obi_get_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, idx1, &taxid_array_length); score_array = (obifloat_t*) obi_get_array_with_col_p_in_view(o_view, final_score_a_column, idx1, &score_array_length); taxid_array_writable_length = taxid_array_length; score_array_writable_length = score_array_length; // Check that lengths are equal (TODO eventually remove?) if (taxid_array_length != score_array_length) { obidebug(1, "\nError building a reference database: LCA taxid and score arrays' lengths are not equal (%d and %d)", taxid_array_length, score_array_length); return -1; } // If empty, add values if (taxid_array_length == 0) { if (obi_set_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, idx1, &taxid_lca, (uint8_t) (obi_sizeof(OBI_INT) * 8), 1) < 0) { obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database"); return -1; } if (obi_set_array_with_col_p_in_view(o_view, final_score_a_column, idx1, &score, (uint8_t) (obi_sizeof(OBI_FLOAT) * 8), 1) < 0) { obidebug(1, "\nError setting a score array in a column when building a reference database"); return -1; } } else { j = 0; modified = false; while (j < taxid_array_length) { if (taxid_lca == lca_taxid_array[j]) // Same LCA taxid: replace if the similarity score is greater { if (score > score_array[j]) { // Copy in array to make writable memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t)); memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t)); modified = true; // Better score for the same LCA, replace this LCA/score pair lca_taxid_array_writable[j] = taxid_lca; score_array_writable[j] = score; // Remove the previous (children) LCAs from the array if their score is equal or lower while ((j>0) && (score_array_writable[j-1] <= score)) { for (k=j-1; k score_array[j]) { memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t)); memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t)); modified = true; // Insert new LCA/score pair for (k=taxid_array_writable_length; k>=j+1; k--) { lca_taxid_array_writable[k] = lca_taxid_array_writable[k-1]; score_array_writable[k] = score_array_writable[k-1]; } taxid_array_writable_length++; score_array_writable_length++; lca_taxid_array_writable[j] = taxid_lca; score_array_writable[j] = score; // Remove the previous (children) LCAs from the array if their score is equal or lower while ((j>0) && (score_array_writable[j-1] <= score)) { for (k=j-1; k0) && (score_array_writable[j-1] <= score)) { for (k=j-1; k score_array[j]) { // Copy in array to make writable memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t)); memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t)); modified = true; // Better score for the same LCA, replace this LCA/score pair lca_taxid_array_writable[j] = taxid_lca; score_array_writable[j] = score; // Remove the previous (children) LCAs from the array if their score is equal or lower while ((j>0) && (score_array_writable[j-1] <= score)) { for (k=j-1; k score_array[j]) { memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t)); memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t)); modified = true; // Insert new LCA/score pair for (k=taxid_array_writable_length; k>=j+1; k--) { lca_taxid_array_writable[k] = lca_taxid_array_writable[k-1]; score_array_writable[k] = score_array_writable[k-1]; } taxid_array_writable_length++; score_array_writable_length++; lca_taxid_array_writable[j] = taxid_lca; score_array_writable[j] = score; // Remove the previous (children) LCAs from the array if their score is equal or lower while ((j>0) && (score_array_writable[j-1] <= score)) { for (k=j-1; k0) && (score_array_writable[j-1] <= score)) { for (k=j-1; k