/**************************************************************************** * Taxonomic assignment of sequences * ****************************************************************************/ /** * @file obi_ecotag.c * @author Celine Mercier (celine.mercier@metabarcoding.org) * @date November 15th 2018 * @brief Functions for the taxonomic assignment of sequences. */ //#define OMP_SUPPORT // TODO #ifdef OMP_SUPPORT #include #endif #include #include #include #include #include #include "obi_ecotag.h" #include "obidms_taxonomy.h" #include "obidms.h" #include "obidebug.h" #include "obierrno.h" #include "obisig.h" #include "obitypes.h" #include "obiview.h" #include "obidmscolumn.h" #include "sse_banded_LCS_alignment.h" #include "upperband.h" #include "obiblob.h" #include "build_reference_db.h" #include "libjson/json_utils.h" #define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) /************************************************************************** * * D E C L A R A T I O N O F T H E P R I V A T E F U N C T I O N S * **************************************************************************/ /** * Internal function creating the output columns for the ecotag algorithm. * * @param o_view A pointer on the output view. * * @retval 0 if the operation was successfully completed. * @retval -1 if an error occurred. * * @since December 2018 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ static int create_output_columns(Obiview_p o_view); /** * @brief Internal function printing the result of one assignment to the output view. * * @param output_view A pointer on the writable view where the result should be written. * @param line The line in the output view where the result should be written. * @param assigned_taxid_column A pointer on the column where the assigned taxid should be written. * @param taxid The assigned taxid. * @param assigned_name_column A pointer on the column where the assigned scientific name should be written. * @param name The assigned scientific name. * @param assigned_status_column A pointer on the column where the assigned status should be written. * @param assigned The assigned status (whether the sequence was assigned to a taxon or not). * @param best_match_ids_column A pointer on the column where the list of ids of the best matches should be written. * @param best_match_ids The list of ids of the best matches as an array of the concatenated ids separated by '\0'. * @param best_match_ids_length The total length of the array of ids of best matches. * @param best_match_taxids_column A pointer on the column where the list of taxids of the best matches should be written. * @param best_match_taxids The list of taxids of the best matches as an array of the taxids. * @param best_match_taxids_length The length of the array of taxids of best matches. * @param score_column A pointer on the column where the score should be written. * @param score The similarity score of the sequence with its best match(es). * * @retval 0 if the operation was successfully completed. * @retval -1 if an error occurred. * * @since December 2016 * @author Celine Mercier (celine.mercier@metabarcoding.org) */ int print_assignment_result(Obiview_p output_view, index_t line, OBIDMS_column_p assigned_taxid_column, int32_t taxid, OBIDMS_column_p assigned_name_column, const char* name, OBIDMS_column_p assigned_status_column, bool assigned, OBIDMS_column_p best_match_ids_column, const char* best_match_ids, int best_match_ids_length, OBIDMS_column_p best_match_taxids_column, const int32_t* best_match_taxids, int best_match_taxids_length, OBIDMS_column_p score_column, double score); /************************************************************************ * * D E F I N I T I O N O F T H E P R I V A T E F U N C T I O N S * ************************************************************************/ static int create_output_columns(Obiview_p o_view) { // Score column if (obi_view_add_column(o_view, ECOTAG_SCORE_COLUMN_NAME, -1, NULL, OBI_FLOAT, 0, 1, NULL, false, false, false, false, NULL, NULL, -1, "{}", true) < 0) { obidebug(1, "\nError creating the column for the score in ecotag"); return -1; } // Assigned taxid column if (obi_view_add_column(o_view, ECOTAG_TAXID_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, false, NULL, NULL, -1, "{}", true) < 0) { obidebug(1, "\nError creating the column for the assigned taxid in ecotag"); return -1; } // Assigned scientific name column if (obi_view_add_column(o_view, ECOTAG_NAME_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, false, false, false, NULL, NULL, -1, "{}", true) < 0) { obidebug(1, "\nError creating the column for the assigned scientific name in ecotag"); return -1; } // Assignement status column if (obi_view_add_column(o_view, ECOTAG_STATUS_COLUMN_NAME, -1, NULL, OBI_BOOL, 0, 1, NULL, false, false, false, false, NULL, NULL, -1, "{}", true) < 0) { obidebug(1, "\nError creating the column for the assignment status in ecotag"); return -1; } // Column for array of best match ids if (obi_view_add_column(o_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, false, true, false, NULL, NULL, -1, "{}", true) < 0) { obidebug(1, "\nError creating the column for the array of ids of best matches in ecotag"); return -1; } // Column for array of best match taxids if (obi_view_add_column(o_view, ECOTAG_BEST_MATCH_TAXIDS_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, true, false, NULL, NULL, -1, "{}", true) < 0) { obidebug(1, "\nError creating the column for the array of taxids of best matches in ecotag"); return -1; } return 0; } int print_assignment_result(Obiview_p output_view, index_t line, OBIDMS_column_p assigned_taxid_column, int32_t taxid, OBIDMS_column_p assigned_name_column, const char* name, OBIDMS_column_p assigned_status_column, bool assigned, OBIDMS_column_p best_match_ids_column, const char* best_match_ids, int best_match_ids_length, OBIDMS_column_p best_match_taxids_column, const int32_t* best_match_taxids, int best_match_taxids_length, OBIDMS_column_p score_column, double score) { // Write the assigned taxid if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, assigned_taxid_column, line, 0, taxid) < 0) { obidebug(1, "\nError writing a taxid in a column when writing ecotag results"); return -1; } // Write the assigned scientific name if (obi_set_str_with_elt_idx_and_col_p_in_view(output_view, assigned_name_column, line, 0, name) < 0) { obidebug(1, "\nError writing a scientific name in a column when writing ecotag results"); return -1; } // Write the assigned status if (obi_set_bool_with_elt_idx_and_col_p_in_view(output_view, assigned_status_column, line, 0, assigned) < 0) { obidebug(1, "\nError writing a assignment status in a column when writing ecotag results"); return -1; } // Write the best match ids if (obi_set_array_with_col_p_in_view(output_view, best_match_ids_column, line, best_match_ids, (uint8_t)(sizeof(char)*8), best_match_ids_length) < 0) { obidebug(1, "\nError writing the array of best match ids in a column when writing ecotag results"); return -1; } // Write the best match taxids if (obi_set_array_with_col_p_in_view(output_view, best_match_taxids_column, line, best_match_taxids, (uint8_t)(sizeof(OBI_INT)*8), best_match_taxids_length) < 0) { obidebug(1, "\nError writing the array of best match taxids in a column when writing ecotag results"); return -1; } // Write the similarity score with the best match if (obi_set_float_with_elt_idx_and_col_p_in_view(output_view, score_column, line, 0, score) < 0) { obidebug(1, "\nError writing a score in a column when writing ecotag results"); return -1; } return 0; } /********************************************************************** * * D E F I N I T I O N O F T H E P U B L I C F U N C T I O N S * **********************************************************************/ int obi_ecotag(const char* dms_name, const char* query_view_name, const char* ref_dms_name, const char* ref_view_name, const char* taxo_dms_name, const char* taxonomy_name, const char* output_view_name, const char* output_view_comments, double ecotag_threshold, double bubble_threshold) { // For each sequence // Align with DB // Keep the indices of all max scores // For each kept index, get the LCA at threshold, then the LCA of those LCAs // Write result (max score, threshold, LCA assigned, list of the ids of the best matches) index_t i, j, k; ecotx_t* lca; ecotx_t* lca_in_array; ecotx_t* best_match; index_t query_seq_idx, ref_seq_idx; double score, best_score; double threshold; double lca_threshold; int lcs_length; int ali_length; Kmer_table_p ktable; Obi_blob_p blob1; Obi_blob_p blob2; int lcs_min; index_t query_count; index_t ref_count; bool same_indexer; const double* score_array; const int* lca_array; index_t* best_match_array; char* best_match_ids; char* best_match_ids_to_store; int32_t best_match_ids_length; int32_t* best_match_taxids; int32_t* best_match_taxids_to_store; int best_match_count; int buffer_size; int best_match_ids_buffer_size; index_t best_match_idx; int32_t lca_array_length; int32_t lca_taxid; int32_t taxid_best_match; bool assigned; const char* lca_name; const char* id; int id_len; OBIDMS_p dms = NULL; OBIDMS_p ref_dms = NULL; OBIDMS_p taxo_dms = NULL; OBIDMS_taxonomy_p taxonomy = NULL; Obiview_p query_view = NULL; Obiview_p ref_view = NULL; Obiview_p output_view = NULL; OBIDMS_column_p query_seq_column = NULL; OBIDMS_column_p ref_seq_column = NULL; OBIDMS_column_p ref_id_column = NULL; OBIDMS_column_p score_column = NULL; OBIDMS_column_p assigned_taxid_column = NULL; OBIDMS_column_p assigned_name_column = NULL; OBIDMS_column_p assigned_status_column = NULL; OBIDMS_column_p best_match_ids_column = NULL; OBIDMS_column_p best_match_taxids_column = NULL; OBIDMS_column_p lca_taxid_a_column = NULL; OBIDMS_column_p score_a_column = NULL; OBIDMS_column_p ref_taxid_column = NULL; char* db_threshold_str = NULL; double db_threshold; buffer_size = 1024; best_match_ids_buffer_size = 1024; signal(SIGINT, sig_handler); // Open main DMS containing the query sequences and where the output will be written dms = obi_open_dms(dms_name, false); if (dms == NULL) { obidebug(1, "\nError opening the DMS containing the query sequences for ecotag"); return -1; } // Open the DMS containing the reference database (can be the same or not) ref_dms = obi_open_dms(ref_dms_name, false); if (ref_dms == NULL) { obidebug(1, "\nError opening the DMS containing the reference database for ecotag"); return -1; } // Open the DMS containing the taxonomy (can be the same or not) taxo_dms = obi_open_dms(taxo_dms_name, false); if (taxo_dms == NULL) { obidebug(1, "\nError opening the DMS containing the taxonomy for ecotag"); return -1; } // Open the taxonomy taxonomy = obi_read_taxonomy(taxo_dms, taxonomy_name, 0); if (taxonomy == NULL) { obidebug(1, "\nError opening the taxonomy for the assignment of sequences"); return -1; } // Open the query view query_view = obi_open_view(dms, query_view_name); if (query_view == NULL) { obidebug(1, "\nError opening the view of query sequences to assign"); return -1; } // Open the reference view. ref_view = obi_open_view(ref_dms, ref_view_name); if (ref_view == NULL) { obidebug(1, "\nError opening the view of reference sequences to assign"); return -1; } // Open the column of query sequences to assign if (strcmp((query_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0) query_seq_column = obi_view_get_column(query_view, NUC_SEQUENCE_COLUMN); else { obi_set_errno(OBI_ECOTAG_ERROR); obidebug(1, "\nError: query view must have the type NUC_SEQS view"); return -1; } if (query_seq_column == NULL) { obidebug(1, "\nError getting the query column to assign"); return -1; } // Open the column of reference sequences to compare the query sequences to if (strcmp((ref_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0) ref_seq_column = obi_view_get_column(ref_view, NUC_SEQUENCE_COLUMN); else { obi_set_errno(OBI_ECOTAG_ERROR); obidebug(1, "\nError: reference view must have the type NUC_SEQS view"); return -1; } if (ref_seq_column == NULL) { obidebug(1, "\nError getting the reference column to assign"); return -1; } // Check if the demanded threshold is lower than the threshold used to build the reference database db_threshold_str = obi_read_comment((ref_view->infos)->comments, DB_THRESHOLD_KEY_IN_COMMENTS); if (db_threshold_str == NULL) { obidebug(1, "\nError reading the threshold used to build the reference database."); return -1; } if (sscanf(db_threshold_str+1, "%lf", &db_threshold) <= 0) { obidebug(1, "\nError reading the threshold used to build the reference database."); return -1; } free(db_threshold_str); if (bubble_threshold < db_threshold) { fprintf(stderr, "\nError: The threshold demanded (%f) is lower than the threshold used to build the reference database (%f).\n\n", bubble_threshold, db_threshold); return -1; } // Open the ID column of reference sequences ref_id_column = obi_view_get_column(ref_view, ID_COLUMN); if (ref_id_column == NULL) { obidebug(1, "\nError getting the ID column of the reference view"); return -1; } // Create the output view output_view = obi_clone_view(dms, query_view, output_view_name, NULL, output_view_comments); if (output_view == NULL) { obidebug(1, "\nError creating the output view when assigning"); return -1; } // Create the output columns if (create_output_columns(output_view) < 0) return -1; assigned_taxid_column = obi_view_get_column(output_view, ECOTAG_TAXID_COLUMN_NAME); assigned_name_column = obi_view_get_column(output_view, ECOTAG_NAME_COLUMN_NAME); assigned_status_column = obi_view_get_column(output_view, ECOTAG_STATUS_COLUMN_NAME); best_match_ids_column = obi_view_get_column(output_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME); best_match_taxids_column = obi_view_get_column(output_view, ECOTAG_BEST_MATCH_TAXIDS_COLUMN_NAME); score_column = obi_view_get_column(output_view, ECOTAG_SCORE_COLUMN_NAME); // Open the used reference columns lca_taxid_a_column = obi_view_get_column(ref_view, LCA_TAXID_ARRAY_COLUMN_NAME); if (lca_taxid_a_column == NULL) { obidebug(1, "\nError opening the reference LCA taxid array column when doing the taxonomic assignment of sequences"); return -1; } score_a_column = obi_view_get_column(ref_view, LCA_SCORE_ARRAY_COLUMN_NAME); if (score_a_column == NULL) { obidebug(1, "\nError opening the score array column when doing the taxonomic assignment of sequences"); return -1; } ref_taxid_column = obi_view_get_column(ref_view, TAXID_COLUMN); if (ref_taxid_column == NULL) { obidebug(1, "\nError opening the taxid column when doing the taxonomic assignment of sequences"); return -1; } // Check if the sequence columns share the same indexer (allows for quick checking of sequence equality) if ((strcmp((query_seq_column->header)->indexer_name, (ref_seq_column->header)->indexer_name) == 0) && (dms == ref_dms)) same_indexer = true; else same_indexer = false; // Build kmer tables ktable = hash_two_seq_columns(query_view, query_seq_column, 0, ref_view, ref_seq_column, 0); if (ktable == NULL) { obi_set_errno(OBI_ECOTAG_ERROR); obidebug(1, "\nError building kmer tables before assigning"); return -1; } query_count = (query_view->infos)->line_count; ref_count = (ref_view->infos)->line_count; best_match_array = (index_t*) malloc(buffer_size * sizeof(index_t)); if (best_match_array == NULL) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError allocating memory for the best match index array in ecotag"); return -1; } best_match_ids = (char*) malloc(best_match_ids_buffer_size* sizeof(char)); if (best_match_ids == NULL) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError allocating memory for the best match id array in ecotag"); return -1; } best_match_taxids = (int32_t*) malloc(buffer_size* sizeof(int32_t)); if (best_match_taxids == NULL) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError allocating memory for the best match taxid array in ecotag"); return -1; } for (i=0; i < query_count; i++) { if (i%1000 == 0) fprintf(stderr,"\rDone : %f %% ", (i / (float) query_count)*100); best_match_count = 0; best_match_ids_length = 0; threshold = ecotag_threshold; best_score = 0.0; // Get first sequence and its index query_seq_idx = obi_get_index_with_elt_idx_and_col_p_in_view(query_view, query_seq_column, i, 0); blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(query_view, query_seq_column, i, 0); if (blob1 == NULL) { obidebug(1, "\nError retrieving sequences to align when assigning"); return -1; } for (j=0; j < ref_count; j++) { if (! keep_running) return -1; // Get reference sequence and its index ref_seq_idx = obi_get_index_with_elt_idx_and_col_p_in_view(ref_view, ref_seq_column, j, 0); blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(ref_view, ref_seq_column, j, 0); if (blob2 == NULL) { obidebug(1, "\nError retrieving sequences to align when assigning"); return -1; } // Check if the sequences are identical in a quick way (same index in the same indexer) if (same_indexer && (query_seq_idx == ref_seq_idx)) score = 1.0; else // the sequences aren't identical or we don't know { // kmer filter (offset for the index of the kmer table of the 2nd sequence because the kmer tables of both the query and ref columns are concatenated in one) align_filters(ktable, blob1, blob2, i, query_count+j, threshold, true, ALILEN, true, &score, &lcs_min, !same_indexer); // Compute alignment score if ((score < 0) && ((threshold == 0) || (score == -1.0))) // (sequences are not identical), and (no threshold, or filter passed): align score = obiblob_sse_banded_lcs_align(blob1, blob2, threshold, true, ALILEN, true, &lcs_length, &ali_length); } if ((score >= threshold) && (score >= best_score)) { // Replace everything if score is better if (score > best_score) { // Up the alignment threshold because we only want scores equal or greater than the best found threshold = score; best_score = score; // Reset the array with that match best_match_ids_length = 0; best_match_count = 0; } // Store in best match array // Grow match and taxid array if needed if (best_match_count == buffer_size) { buffer_size = buffer_size*2; best_match_array = (index_t*) realloc(best_match_array, buffer_size*sizeof(index_t)); if (best_match_array == NULL) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError reallocating match array when assigning"); return -1; } best_match_taxids = (int32_t*) realloc(best_match_taxids, buffer_size*sizeof(int32_t)); if (best_match_taxids == NULL) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError reallocating match taxids array when assigning"); return -1; } } id = obi_get_str_with_elt_idx_and_col_p_in_view(ref_view, ref_id_column, j, 0); id_len = strlen(id); // Grow ids array if needed while ((best_match_ids_length+id_len+1) >= best_match_ids_buffer_size) { best_match_ids_buffer_size = best_match_ids_buffer_size*2; best_match_ids = (char*) realloc(best_match_ids, best_match_ids_buffer_size*sizeof(char)); if (best_match_ids == NULL) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError reallocating match ids array when assigning"); return -1; } } // Save match best_match_array[best_match_count] = j; best_match_taxids[best_match_count] = obi_get_int_with_elt_idx_and_col_p_in_view(ref_view, ref_taxid_column, j, 0); best_match_count++; strcpy(best_match_ids+best_match_ids_length, id); best_match_ids_length = best_match_ids_length + id_len + 1; } } // Get LCA of the LCAs of the best matches lca = NULL; lca_taxid = -1; for (j=0; j= lca_threshold)) k++; if (k>0) { lca_array = obi_get_array_with_col_p_in_view(ref_view, lca_taxid_a_column, best_match_idx, &lca_array_length); if (j>0) { // lca = obi_taxo_get_taxon_with_taxid(taxonomy, lca_taxid); // if (lca == NULL) // { // obidebug(1, "\nError getting a taxon from a taxid when doing taxonomic assignment"); // return -1; // } lca_in_array = obi_taxo_get_taxon_with_taxid(taxonomy, lca_array[k-1]); if (lca_in_array == NULL) { obidebug(1, "\nError getting a taxon from a taxid when doing taxonomic assignment"); return -1; } lca = obi_taxo_get_lca(lca, lca_in_array); if (lca == NULL) { obidebug(1, "\nError getting the LCA of two taxa when doing taxonomic assignment"); return -1; } lca_taxid = lca->taxid; } else { lca_taxid = lca_array[k-1]; lca = obi_taxo_get_taxon_with_taxid(taxonomy, lca_taxid); } } else { taxid_best_match = obi_get_int_with_elt_idx_and_col_p_in_view(ref_view, ref_taxid_column, best_match_idx, 0); best_match = obi_taxo_get_taxon_with_taxid(taxonomy, taxid_best_match); if (best_match == NULL) { obidebug(1, "\nError getting a taxon from a taxid (%d) when doing taxonomic assignment", taxid_best_match); return -1; } if (j>0) { lca = obi_taxo_get_lca(lca, best_match); lca_taxid = lca->taxid; } else { lca_taxid = taxid_best_match; lca = obi_taxo_get_taxon_with_taxid(taxonomy, lca_taxid); } } } lca_name = NULL; if (best_match_count > 0) { assigned = true; // Get LCA name if (lca->preferred_name != NULL) lca_name = lca->preferred_name; else lca_name = lca->name; best_match_ids_to_store = best_match_ids; best_match_taxids_to_store = best_match_taxids; } else { assigned = false; lca_name = OBIStr_NA; lca_taxid = OBIInt_NA; best_match_ids_to_store = OBITuple_NA; best_match_taxids_to_store = OBITuple_NA; score = OBIFloat_NA; } // Print result in output view if (print_assignment_result(output_view, i, assigned_taxid_column, lca_taxid, assigned_name_column, lca_name, assigned_status_column, assigned, best_match_ids_column, best_match_ids_to_store, best_match_ids_length, best_match_taxids_column, best_match_taxids_to_store, best_match_count, score_column, best_score ) < 0) return -1; } free(best_match_array); free(best_match_ids); free(best_match_taxids); obi_close_taxonomy(taxonomy); obi_save_and_close_view(query_view); obi_save_and_close_view(ref_view); obi_save_and_close_view(output_view); obi_close_dms(dms, false); obi_close_dms(ref_dms, false); obi_close_dms(taxo_dms, false); fprintf(stderr,"\rDone : 100 %% \n"); return 0; }