From 4aa772c405882e355eabda3d45f3c581141d57e4 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Tue, 9 Jun 2020 14:33:14 +0200 Subject: [PATCH] ecotag: Added list of taxids for all best matches (closes #80) --- src/obi_ecotag.c | 63 ++++++++++++++++++++++++++++++++++++++++-------- src/obi_ecotag.h | 3 ++- 2 files changed, 55 insertions(+), 11 deletions(-) diff --git a/src/obi_ecotag.c b/src/obi_ecotag.c index dcd6dac..df69aee 100755 --- a/src/obi_ecotag.c +++ b/src/obi_ecotag.c @@ -71,9 +71,12 @@ static int create_output_columns(Obiview_p o_view); * @param name The assigned scientific name. * @param assigned_status_column A pointer on the column where the assigned status should be written. * @param assigned The assigned status (whether the sequence was assigned to a taxon or not). - * @param best_match_column A pointer on the column where the list of ids of the best matches should be written. + * @param best_match_ids_column A pointer on the column where the list of ids of the best matches should be written. * @param best_match_ids The list of ids of the best matches as an array of the concatenated ids separated by '\0'. * @param best_match_ids_length The total length of the array of ids of best matches. + * @param best_match_taxids_column A pointer on the column where the list of taxids of the best matches should be written. + * @param best_match_taxids The list of taxids of the best matches as an array of the taxids. + * @param best_match_taxids_length The length of the array of taxids of best matches. * @param score_column A pointer on the column where the score should be written. * @param score The similarity score of the sequence with its best match(es). * @@ -87,7 +90,8 @@ int print_assignment_result(Obiview_p output_view, index_t line, OBIDMS_column_p assigned_taxid_column, int32_t taxid, OBIDMS_column_p assigned_name_column, const char* name, OBIDMS_column_p assigned_status_column, bool assigned, - OBIDMS_column_p best_match_column, const char* best_match_ids, int best_match_ids_length, + OBIDMS_column_p best_match_ids_column, const char* best_match_ids, int best_match_ids_length, + OBIDMS_column_p best_match_taxids_column, const int32_t* best_match_taxids, int best_match_taxids_length, OBIDMS_column_p score_column, double score); @@ -130,7 +134,14 @@ static int create_output_columns(Obiview_p o_view) // Column for array of best match ids if (obi_view_add_column(o_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, true, false, NULL, NULL, -1, "{}", true) < 0) { - obidebug(1, "\nError creating the column for the array of ids of the best match in ecotag"); + obidebug(1, "\nError creating the column for the array of ids of best matches in ecotag"); + return -1; + } + + // Column for array of best match taxids + if (obi_view_add_column(o_view, ECOTAG_BEST_MATCH_TAXIDS_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, true, false, NULL, NULL, -1, "{}", true) < 0) + { + obidebug(1, "\nError creating the column for the array of taxids of best matches in ecotag"); return -1; } @@ -142,7 +153,8 @@ int print_assignment_result(Obiview_p output_view, index_t line, OBIDMS_column_p assigned_taxid_column, int32_t taxid, OBIDMS_column_p assigned_name_column, const char* name, OBIDMS_column_p assigned_status_column, bool assigned, - OBIDMS_column_p best_match_column, const char* best_match_ids, int best_match_ids_length, + OBIDMS_column_p best_match_ids_column, const char* best_match_ids, int best_match_ids_length, + OBIDMS_column_p best_match_taxids_column, const int32_t* best_match_taxids, int best_match_taxids_length, OBIDMS_column_p score_column, double score) { // Write the assigned taxid @@ -167,9 +179,16 @@ int print_assignment_result(Obiview_p output_view, index_t line, } // Write the best match ids - if (obi_set_array_with_col_p_in_view(output_view, best_match_column, line, best_match_ids, (uint8_t)(sizeof(char)*8), best_match_ids_length) < 0) + if (obi_set_array_with_col_p_in_view(output_view, best_match_ids_column, line, best_match_ids, (uint8_t)(sizeof(char)*8), best_match_ids_length) < 0) { - obidebug(1, "\nError writing a assignment status in a column when writing ecotag results"); + obidebug(1, "\nError writing the array of best match ids in a column when writing ecotag results"); + return -1; + } + + // Write the best match taxids + if (obi_set_array_with_col_p_in_view(output_view, best_match_taxids_column, line, best_match_taxids, (uint8_t)(sizeof(OBI_INT)*8), best_match_taxids_length) < 0) + { + obidebug(1, "\nError writing the array of best match taxids in a column when writing ecotag results"); return -1; } @@ -235,6 +254,8 @@ int obi_ecotag(const char* dms_name, char* best_match_ids; char* best_match_ids_to_store; int32_t best_match_ids_length; + int32_t* best_match_taxids; + int32_t* best_match_taxids_to_store; int best_match_count; int buffer_size; int best_match_ids_buffer_size; @@ -263,7 +284,8 @@ int obi_ecotag(const char* dms_name, OBIDMS_column_p assigned_taxid_column = NULL; OBIDMS_column_p assigned_name_column = NULL; OBIDMS_column_p assigned_status_column = NULL; - OBIDMS_column_p best_match_column = NULL; + OBIDMS_column_p best_match_ids_column = NULL; + OBIDMS_column_p best_match_taxids_column = NULL; OBIDMS_column_p lca_taxid_a_column = NULL; OBIDMS_column_p score_a_column = NULL; OBIDMS_column_p ref_taxid_column = NULL; @@ -396,7 +418,8 @@ int obi_ecotag(const char* dms_name, assigned_taxid_column = obi_view_get_column(output_view, ECOTAG_TAXID_COLUMN_NAME); assigned_name_column = obi_view_get_column(output_view, ECOTAG_NAME_COLUMN_NAME); assigned_status_column = obi_view_get_column(output_view, ECOTAG_STATUS_COLUMN_NAME); - best_match_column = obi_view_get_column(output_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME); + best_match_ids_column = obi_view_get_column(output_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME); + best_match_taxids_column = obi_view_get_column(output_view, ECOTAG_BEST_MATCH_TAXIDS_COLUMN_NAME); score_column = obi_view_get_column(output_view, ECOTAG_SCORE_COLUMN_NAME); // Open the used reference columns @@ -453,6 +476,14 @@ int obi_ecotag(const char* dms_name, return -1; } + best_match_taxids = (int32_t*) malloc(buffer_size* sizeof(int32_t)); + if (best_match_taxids == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for the best match taxid array in ecotag"); + return -1; + } + for (i=0; i < query_count; i++) { if (i%1000 == 0) @@ -514,7 +545,7 @@ int obi_ecotag(const char* dms_name, // Store in best match array - // Grow match array if needed + // Grow match and taxid array if needed if (best_match_count == buffer_size) { buffer_size = buffer_size*2; @@ -525,6 +556,13 @@ int obi_ecotag(const char* dms_name, obidebug(1, "\nError reallocating match array when assigning"); return -1; } + best_match_taxids = (int32_t*) realloc(best_match_taxids, buffer_size*sizeof(int32_t)); + if (best_match_taxids == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating match taxids array when assigning"); + return -1; + } } id = obi_get_str_with_elt_idx_and_col_p_in_view(ref_view, ref_id_column, j, 0); @@ -545,6 +583,7 @@ int obi_ecotag(const char* dms_name, // Save match best_match_array[best_match_count] = j; + best_match_taxids[best_match_count] = obi_get_int_with_elt_idx_and_col_p_in_view(ref_view, ref_taxid_column, j, 0); best_match_count++; strcpy(best_match_ids+best_match_ids_length, id); best_match_ids_length = best_match_ids_length + id_len + 1; @@ -629,6 +668,7 @@ int obi_ecotag(const char* dms_name, else lca_name = lca->name; best_match_ids_to_store = best_match_ids; + best_match_taxids_to_store = best_match_taxids; } else { @@ -636,6 +676,7 @@ int obi_ecotag(const char* dms_name, lca_name = OBIStr_NA; lca_taxid = OBIInt_NA; best_match_ids_to_store = OBITuple_NA; + best_match_taxids_to_store = OBITuple_NA; score = OBIFloat_NA; } @@ -644,7 +685,8 @@ int obi_ecotag(const char* dms_name, assigned_taxid_column, lca_taxid, assigned_name_column, lca_name, assigned_status_column, assigned, - best_match_column, best_match_ids_to_store, best_match_ids_length, + best_match_ids_column, best_match_ids_to_store, best_match_ids_length, + best_match_taxids_column, best_match_taxids_to_store, best_match_count, score_column, best_score ) < 0) return -1; @@ -652,6 +694,7 @@ int obi_ecotag(const char* dms_name, free(best_match_array); free(best_match_ids); + free(best_match_taxids); obi_close_taxonomy(taxonomy); obi_save_and_close_view(query_view); diff --git a/src/obi_ecotag.h b/src/obi_ecotag.h index 436f1c8..ed88194 100755 --- a/src/obi_ecotag.h +++ b/src/obi_ecotag.h @@ -23,7 +23,8 @@ #define ECOTAG_TAXID_COLUMN_NAME "TAXID" #define ECOTAG_NAME_COLUMN_NAME "SCIENTIFIC_NAME" #define ECOTAG_STATUS_COLUMN_NAME "ID_STATUS" -#define ECOTAG_BEST_MATCH_IDS_COLUMN_NAME "BEST_MATCH" +#define ECOTAG_BEST_MATCH_IDS_COLUMN_NAME "BEST_MATCH_IDS" +#define ECOTAG_BEST_MATCH_TAXIDS_COLUMN_NAME "BEST_MATCH_TAXIDS" #define ECOTAG_SCORE_COLUMN_NAME "BEST_IDENTITY"