/*********************************************************************************** * Functions to build reference databases for the taxonomic assignment of sequences * ***********************************************************************************/ /** * @file build_reference_db.c * @author Celine Mercier (celine.mercier@metabarcoding.org) * @date November 15th 2018 * @brief Functions to build referece databases for the taxonomic assignment of sequences. */ //#define OMP_SUPPORT // TODO #ifdef OMP_SUPPORT #include #endif #include #include #include #include #include #include "build_reference_db.h" #include "obidms.h" #include "obidebug.h" #include "obierrno.h" #include "obisig.h" #include "obitypes.h" #include "obiview.h" #include "obi_lcs.h" #include "obidms_taxonomy.h" #include "obidmscolumn_array.h" #include "libjson/json_utils.h" #define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) /************************************************************************** * * D E C L A R A T I O N O F T H E P R I V A T E F U N C T I O N S * **************************************************************************/ /** * Internal function computing the last common ancestor (LCA) of a list of merged taxids (generated by obi uniq). * Also works on just simple taxid (returns the associated taxon). * * @param view A pointer on the view containing the taxid information (merged or not). * @param taxid_column A pointer on the column where taxids are kept (merged or not, aka int array or int columns). * @param idx The index of the sequence to compute the LCA of in the view. * @param merged Whether the taxid information is made of arrays of taxids or a single taxid. * @param tax A pointer on a taxonomy structure. * @param taxid_count The maximum number of taxids associated with one sequence (aka the number of elements in taxid_column). * @param taxid_str_indices A pointer on the list of indices in taxid_strings referring to the taxids stored as strings (see next param). * @param taxid_strings A pointer on the list of taxids stored as strings in the taxid column header (they correspond to the element names). * * @returns A pointer on the LCA taxon. * @retval NULL if an error occurred. * * @since August 2019 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ static inline ecotx_t* get_lca_from_merged_taxids(Obiview_p view, OBIDMS_column_p taxid_column, index_t idx, bool merged, OBIDMS_taxonomy_p tax, index_t taxid_count, int64_t* taxid_str_indices, char* taxid_strings); /************************************************************************ * * D E F I N I T I O N O F T H E P R I V A T E F U N C T I O N S * ************************************************************************/ static inline ecotx_t* get_lca_from_merged_taxids(Obiview_p view, OBIDMS_column_p taxid_column, index_t idx, bool merged, OBIDMS_taxonomy_p tax, index_t taxid_count, int64_t* taxid_str_indices, char* taxid_strings) { ecotx_t* taxon = NULL; ecotx_t* lca = NULL; int32_t taxid; index_t taxid_idx; int64_t taxid_str_idx; char* taxid_str; obiint_t n; for (taxid_idx=0; taxid_idxheader)->to_eval) { obidebug(1, "\nError opening the column containing the taxids of the reference sequences when building a reference database: " "run previous obi uniq with a higher threshold for the --max-elts option while waiting for the implementation of this feature"); return -1; } // Get the (maximum) number of taxids associated with a sequence taxid_count = (refs_taxid_column->header)->nb_elements_per_line; // Get pointers on element names (aka taxids) and their start indices for faster access taxid_strings = (refs_taxid_column->header)->elements_names; taxid_str_indices = (refs_taxid_column->header)->elements_names_idx; matrix_lca_taxid_column = obi_view_get_column(matrix_with_lca_view, LCA_TAXID_COLUMN_NAME); if (matrix_lca_taxid_column == NULL) { obidebug(1, "\nError opening the LCA column when building a reference database"); return -1; } count = (matrix_with_lca_view->infos)->line_count; fprintf(stderr, "Computing LCAs...\n"); // Compute all the LCAs // For each pair for (i=0; itaxid; if (obi_set_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_lca_taxid_column, i, 0, taxid_lca) < 0) { obidebug(1, "\nError writing the last common ancestor of two taxa when building a reference database"); return -1; } } fprintf(stderr,"\rDone : 100 %% \n"); // Clone refs view, add 2 arrays columns for lca and score, compute and write them // Clone refs view o_view = obi_clone_view(dms, refs_view, o_view_name, NULL, o_view_comments); if (o_view == NULL) { obidebug(1, "\nError cloning the view of references when building a reference database"); return -1; } // Add the LCA taxid array column if (obi_view_add_column(o_view, LCA_TAXID_ARRAY_COLUMN_NAME, -1, LCA_TAXID_ARRAY_COLUMN_NAME, OBI_INT, -1, 1, "", false, true, false, "", "", -1, "{}", true) < 0) { obidebug(1, "\nError adding the LCA taxid column to the final view when building a reference database"); return -1; } // Add the score array column if (obi_view_add_column(o_view, LCA_SCORE_ARRAY_COLUMN_NAME, -1, LCA_SCORE_ARRAY_COLUMN_NAME, OBI_FLOAT, -1, 1, "", false, true, false, "", "", -1, "{}", true) < 0) { obidebug(1, "\nError adding the score column to the final view when building a reference database"); return -1; } // Open the newly added columns final_lca_taxid_a_column = obi_view_get_column(o_view, LCA_TAXID_ARRAY_COLUMN_NAME); if (final_lca_taxid_a_column == NULL) { obidebug(1, "\nError opening the LCA taxid array column when building a reference database"); return -1; } final_score_a_column = obi_view_get_column(o_view, LCA_SCORE_ARRAY_COLUMN_NAME); if (final_score_a_column == NULL) { obidebug(1, "\nError opening the score array column when building a reference database"); return -1; } // Open alignment score column matrix_score_column = obi_view_get_column(matrix_with_lca_view, SCORE_COLUMN_NAME); if (matrix_score_column == NULL) { obidebug(1, "\nError opening the alignment score column when building a reference database"); return -1; } fprintf(stderr, "Building LCA arrays...\n"); // For each sequence, look for all its alignments in the matrix, and for each different LCA taxid/score, order them and write them // Going through matrix once, filling refs arrays on the go for efficiency for (i=0; i score_array[j]) { // Copy in array to make writable memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t)); memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t)); modified = true; //fprintf(stderr, "\nSame LCA, replace %d and %f with %d and %f", lca_taxid_array_writable[j], // score_array_writable[j], taxid_lca, score); // Better score for the same LCA, replace this LCA/score pair lca_taxid_array_writable[j] = taxid_lca; score_array_writable[j] = score; // Remove the previous (children) LCAs from the array if their score is equal or lower while ((j>0) && (score_array_writable[j-1] <= score)) { for (k=j-1; k(j-1)) { taxid_array_writable_length--; score_array_writable_length--; } j--; } } break; } else if (obi_taxo_is_taxon_under_taxid(lca, lca_taxid_array[j])) // Array LCA is a parent LCA { if (score > score_array[j]) { //fprintf(stderr, "\nInsert new"); memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t)); memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t)); modified = true; // Insert new LCA/score pair for (k=taxid_array_writable_length; k>=j+1; k--) { lca_taxid_array_writable[k] = lca_taxid_array_writable[k-1]; score_array_writable[k] = score_array_writable[k-1]; } taxid_array_writable_length++; score_array_writable_length++; lca_taxid_array_writable[j] = taxid_lca; score_array_writable[j] = score; // Remove the previous (children) LCAs from the array if their score is equal or lower while ((j>0) && (score_array_writable[j-1] <= score)) { for (k=j-1; k(j-1)) { taxid_array_writable_length--; score_array_writable_length--; } j--; } } break; } j++; } if (j == taxid_array_length) // same or parent LCA not found, need to be appended at the end { memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t)); memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t)); modified = true; //fprintf(stderr, "\nAppend at the end"); // Append LCA lca_taxid_array_writable[taxid_array_writable_length] = taxid_lca; score_array_writable[score_array_writable_length] = score; taxid_array_writable_length++; score_array_writable_length++; // Remove the previous (children) LCAs from the array if their score is equal or lower while ((j>0) && (score_array_writable[j-1] <= score)) { for (k=j-1; k(j-1)) { taxid_array_writable_length--; score_array_writable_length--; } j--; } } // Write new arrays if (modified) { // fprintf(stderr, "\n\nnew array:"); // for (k=0;kname, score_array_writable[k]); // } if (obi_set_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, idx1, lca_taxid_array_writable, (uint8_t) (obi_sizeof(OBI_INT) * 8), taxid_array_writable_length) < 0) { obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database"); return -1; } if (obi_set_array_with_col_p_in_view(o_view, final_score_a_column, idx1, score_array_writable, (uint8_t) (obi_sizeof(OBI_FLOAT) * 8), score_array_writable_length) < 0) { obidebug(1, "\nError setting a score array in a column when building a reference database"); return -1; } } } ///////////////// Compute for second sequence \\\\\\\\\\\\\\\\\\\\\\\ (TODO function) // Read arrays lca_taxid_array = (obiint_t*) obi_get_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, idx2, &taxid_array_length); score_array = (obifloat_t*) obi_get_array_with_col_p_in_view(o_view, final_score_a_column, idx2, &score_array_length); taxid_array_writable_length = taxid_array_length; score_array_writable_length = score_array_length; // Check that lengths are equal (TODO eventually remove?) // if (taxid_array_length != score_array_length) // { // obidebug(1, "\nError building a reference database: LCA taxid and score arrays' lengths are not equal (%d and %d)", taxid_array_length, score_array_length); // return -1; // } //fprintf(stderr, "\n2nd sequence"); // If empty, add values if (taxid_array_length == 0) { //fprintf(stderr, "\nEmpty, add value"); if (obi_set_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, idx2, &taxid_lca, (uint8_t) (obi_sizeof(OBI_INT) * 8), 1) < 0) { obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database"); return -1; } if (obi_set_array_with_col_p_in_view(o_view, final_score_a_column, idx2, &score, (uint8_t) (obi_sizeof(OBI_FLOAT) * 8), 1) < 0) { obidebug(1, "\nError setting a score array in a column when building a reference database"); return -1; } } else { //fprintf(stderr, "\nNot empty"); j = 0; modified = false; while (j < taxid_array_length) { if (taxid_lca == lca_taxid_array[j]) // Same LCA taxid: replace if the similarity score is greater { if (score > score_array[j]) { // Copy in array to make writable memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t)); memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t)); modified = true; //fprintf(stderr, "\nSame LCA, replace %d and %f with %d and %f", lca_taxid_array_writable[j], // score_array_writable[j], taxid_lca, score); // Better score for the same LCA, replace this LCA/score pair lca_taxid_array_writable[j] = taxid_lca; score_array_writable[j] = score; // Remove the previous (children) LCAs from the array if their score is equal or lower while ((j>0) && (score_array_writable[j-1] <= score)) { for (k=j-1; k(j-1)) { taxid_array_writable_length--; score_array_writable_length--; } j--; } } break; } else if (obi_taxo_is_taxon_under_taxid(lca, lca_taxid_array[j])) // Array LCA is a parent LCA { if (score > score_array[j]) { //fprintf(stderr, "\nInsert new"); memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t)); memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t)); modified = true; // Insert new LCA/score pair for (k=taxid_array_writable_length; k>=j+1; k--) { lca_taxid_array_writable[k] = lca_taxid_array_writable[k-1]; score_array_writable[k] = score_array_writable[k-1]; } taxid_array_writable_length++; score_array_writable_length++; lca_taxid_array_writable[j] = taxid_lca; score_array_writable[j] = score; // Remove the previous (children) LCAs from the array if their score is equal or lower while ((j>0) && (score_array_writable[j-1] <= score)) { for (k=j-1; k(j-1)) { taxid_array_writable_length--; score_array_writable_length--; } j--; } } break; } j++; } if (j == taxid_array_length) // same or parent LCA not found, need to be appended at the end { //fprintf(stderr, "\nAppend at the end"); memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t)); memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t)); modified = true; // Append LCA lca_taxid_array_writable[taxid_array_writable_length] = taxid_lca; score_array_writable[score_array_writable_length] = score; taxid_array_writable_length++; score_array_writable_length++; // Remove the previous (children) LCAs from the array if their score is equal or lower while ((j>0) && (score_array_writable[j-1] <= score)) { for (k=j-1; k(j-1)) { taxid_array_writable_length--; score_array_writable_length--; } j--; } } // Write new arrays // Copy arrays for modification (can't edit in place, as it is stored in indexer data file) if (modified) { if (obi_set_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, idx2, lca_taxid_array_writable, (uint8_t) (obi_sizeof(OBI_INT) * 8), taxid_array_writable_length) < 0) { obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database"); return -1; } if (obi_set_array_with_col_p_in_view(o_view, final_score_a_column, idx2, score_array_writable, (uint8_t) (obi_sizeof(OBI_FLOAT) * 8), score_array_writable_length) < 0) { obidebug(1, "\nError setting a score array in a column when building a reference database"); return -1; } } } } fprintf(stderr,"\rDone : 100 %% \n"); fprintf(stderr, "Writing results...\n"); count = (o_view->infos)->line_count; // Fill empty LCA informations (because filling from potentially sparse alignment matrix) with the sequence taxid score=1.0; // technically getting LCA of identical sequences for (i=0; itaxid; // Set them in output view if (obi_set_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, i, &taxid_lca, (uint8_t) (obi_sizeof(OBI_INT) * 8), 1) < 0) { obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database"); return -1; } if (obi_set_array_with_col_p_in_view(o_view, final_score_a_column, i, &score, (uint8_t) (obi_sizeof(OBI_FLOAT) * 8), 1) < 0) { obidebug(1, "\nError setting a score array in a column when building a reference database"); return -1; } } } fprintf(stderr,"\rDone : 100 %% \n"); // Add information about the threshold used to build the DB snprintf(threshold_str, 5, "%f", threshold); new_comments = obi_add_comment((o_view->infos)->comments, DB_THRESHOLD_KEY_IN_COMMENTS, threshold_str); if (new_comments == NULL) { obidebug(1, "\nError adding a comment (db threshold) to a view, key: %s, value: %s", DB_THRESHOLD_KEY_IN_COMMENTS, threshold_str); return -1; } if (obi_view_write_comments(o_view, new_comments) < 0) { obidebug(1, "\nError adding a comment (db threshold) to a view, key: %s, value: %s", DB_THRESHOLD_KEY_IN_COMMENTS, threshold_str); return -1; } free(new_comments); // Close views and DMS if (obi_save_and_close_view(refs_view) < 0) { obidebug(1, "\nError closing the reference view after building a reference database"); return -1; } if (obi_save_and_close_view(matrix_with_lca_view) < 0) { obidebug(1, "\nError closing the matrix with LCA view after building a reference database"); return -1; } if (obi_save_and_close_view(o_view) < 0) { obidebug(1, "\nError closing the final view after building a reference database"); return -1; } // Delete temporary views if (obi_delete_view(dms, matrix_view_name) < 0) { obidebug(1, "\nError deleting temporary view %s after building a reference database", matrix_view_name); return -1; } if (obi_delete_view(dms, matrix_with_lca_view_name) < 0) { obidebug(1, "\nError deleting temporary view %s after building a reference database", matrix_view_name); return -1; } // Close DMS if (obi_close_dms(dms, false) < 0) { obidebug(1, "\nError closing the DMS after building a reference database"); return -1; } // Free everything free(matrix_view_name); free(matrix_with_lca_view_name); return 0; }