ecotag: Added list of taxids for all best matches (closes #80)

This commit is contained in:
Celine Mercier
2020-06-09 14:33:14 +02:00
parent b0b96ac37a
commit 4aa772c405
2 changed files with 55 additions and 11 deletions

View File

@ -71,9 +71,12 @@ static int create_output_columns(Obiview_p o_view);
* @param name The assigned scientific name. * @param name The assigned scientific name.
* @param assigned_status_column A pointer on the column where the assigned status should be written. * @param assigned_status_column A pointer on the column where the assigned status should be written.
* @param assigned The assigned status (whether the sequence was assigned to a taxon or not). * @param assigned The assigned status (whether the sequence was assigned to a taxon or not).
* @param best_match_column A pointer on the column where the list of ids of the best matches should be written. * @param best_match_ids_column A pointer on the column where the list of ids of the best matches should be written.
* @param best_match_ids The list of ids of the best matches as an array of the concatenated ids separated by '\0'. * @param best_match_ids The list of ids of the best matches as an array of the concatenated ids separated by '\0'.
* @param best_match_ids_length The total length of the array of ids of best matches. * @param best_match_ids_length The total length of the array of ids of best matches.
* @param best_match_taxids_column A pointer on the column where the list of taxids of the best matches should be written.
* @param best_match_taxids The list of taxids of the best matches as an array of the taxids.
* @param best_match_taxids_length The length of the array of taxids of best matches.
* @param score_column A pointer on the column where the score should be written. * @param score_column A pointer on the column where the score should be written.
* @param score The similarity score of the sequence with its best match(es). * @param score The similarity score of the sequence with its best match(es).
* *
@ -87,7 +90,8 @@ int print_assignment_result(Obiview_p output_view, index_t line,
OBIDMS_column_p assigned_taxid_column, int32_t taxid, OBIDMS_column_p assigned_taxid_column, int32_t taxid,
OBIDMS_column_p assigned_name_column, const char* name, OBIDMS_column_p assigned_name_column, const char* name,
OBIDMS_column_p assigned_status_column, bool assigned, OBIDMS_column_p assigned_status_column, bool assigned,
OBIDMS_column_p best_match_column, const char* best_match_ids, int best_match_ids_length, OBIDMS_column_p best_match_ids_column, const char* best_match_ids, int best_match_ids_length,
OBIDMS_column_p best_match_taxids_column, const int32_t* best_match_taxids, int best_match_taxids_length,
OBIDMS_column_p score_column, double score); OBIDMS_column_p score_column, double score);
@ -130,7 +134,14 @@ static int create_output_columns(Obiview_p o_view)
// Column for array of best match ids // Column for array of best match ids
if (obi_view_add_column(o_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, true, false, NULL, NULL, -1, "{}", true) < 0) if (obi_view_add_column(o_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, true, false, NULL, NULL, -1, "{}", true) < 0)
{ {
obidebug(1, "\nError creating the column for the array of ids of the best match in ecotag"); obidebug(1, "\nError creating the column for the array of ids of best matches in ecotag");
return -1;
}
// Column for array of best match taxids
if (obi_view_add_column(o_view, ECOTAG_BEST_MATCH_TAXIDS_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, true, false, NULL, NULL, -1, "{}", true) < 0)
{
obidebug(1, "\nError creating the column for the array of taxids of best matches in ecotag");
return -1; return -1;
} }
@ -142,7 +153,8 @@ int print_assignment_result(Obiview_p output_view, index_t line,
OBIDMS_column_p assigned_taxid_column, int32_t taxid, OBIDMS_column_p assigned_taxid_column, int32_t taxid,
OBIDMS_column_p assigned_name_column, const char* name, OBIDMS_column_p assigned_name_column, const char* name,
OBIDMS_column_p assigned_status_column, bool assigned, OBIDMS_column_p assigned_status_column, bool assigned,
OBIDMS_column_p best_match_column, const char* best_match_ids, int best_match_ids_length, OBIDMS_column_p best_match_ids_column, const char* best_match_ids, int best_match_ids_length,
OBIDMS_column_p best_match_taxids_column, const int32_t* best_match_taxids, int best_match_taxids_length,
OBIDMS_column_p score_column, double score) OBIDMS_column_p score_column, double score)
{ {
// Write the assigned taxid // Write the assigned taxid
@ -167,9 +179,16 @@ int print_assignment_result(Obiview_p output_view, index_t line,
} }
// Write the best match ids // Write the best match ids
if (obi_set_array_with_col_p_in_view(output_view, best_match_column, line, best_match_ids, (uint8_t)(sizeof(char)*8), best_match_ids_length) < 0) if (obi_set_array_with_col_p_in_view(output_view, best_match_ids_column, line, best_match_ids, (uint8_t)(sizeof(char)*8), best_match_ids_length) < 0)
{ {
obidebug(1, "\nError writing a assignment status in a column when writing ecotag results"); obidebug(1, "\nError writing the array of best match ids in a column when writing ecotag results");
return -1;
}
// Write the best match taxids
if (obi_set_array_with_col_p_in_view(output_view, best_match_taxids_column, line, best_match_taxids, (uint8_t)(sizeof(OBI_INT)*8), best_match_taxids_length) < 0)
{
obidebug(1, "\nError writing the array of best match taxids in a column when writing ecotag results");
return -1; return -1;
} }
@ -235,6 +254,8 @@ int obi_ecotag(const char* dms_name,
char* best_match_ids; char* best_match_ids;
char* best_match_ids_to_store; char* best_match_ids_to_store;
int32_t best_match_ids_length; int32_t best_match_ids_length;
int32_t* best_match_taxids;
int32_t* best_match_taxids_to_store;
int best_match_count; int best_match_count;
int buffer_size; int buffer_size;
int best_match_ids_buffer_size; int best_match_ids_buffer_size;
@ -263,7 +284,8 @@ int obi_ecotag(const char* dms_name,
OBIDMS_column_p assigned_taxid_column = NULL; OBIDMS_column_p assigned_taxid_column = NULL;
OBIDMS_column_p assigned_name_column = NULL; OBIDMS_column_p assigned_name_column = NULL;
OBIDMS_column_p assigned_status_column = NULL; OBIDMS_column_p assigned_status_column = NULL;
OBIDMS_column_p best_match_column = NULL; OBIDMS_column_p best_match_ids_column = NULL;
OBIDMS_column_p best_match_taxids_column = NULL;
OBIDMS_column_p lca_taxid_a_column = NULL; OBIDMS_column_p lca_taxid_a_column = NULL;
OBIDMS_column_p score_a_column = NULL; OBIDMS_column_p score_a_column = NULL;
OBIDMS_column_p ref_taxid_column = NULL; OBIDMS_column_p ref_taxid_column = NULL;
@ -396,7 +418,8 @@ int obi_ecotag(const char* dms_name,
assigned_taxid_column = obi_view_get_column(output_view, ECOTAG_TAXID_COLUMN_NAME); assigned_taxid_column = obi_view_get_column(output_view, ECOTAG_TAXID_COLUMN_NAME);
assigned_name_column = obi_view_get_column(output_view, ECOTAG_NAME_COLUMN_NAME); assigned_name_column = obi_view_get_column(output_view, ECOTAG_NAME_COLUMN_NAME);
assigned_status_column = obi_view_get_column(output_view, ECOTAG_STATUS_COLUMN_NAME); assigned_status_column = obi_view_get_column(output_view, ECOTAG_STATUS_COLUMN_NAME);
best_match_column = obi_view_get_column(output_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME); best_match_ids_column = obi_view_get_column(output_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME);
best_match_taxids_column = obi_view_get_column(output_view, ECOTAG_BEST_MATCH_TAXIDS_COLUMN_NAME);
score_column = obi_view_get_column(output_view, ECOTAG_SCORE_COLUMN_NAME); score_column = obi_view_get_column(output_view, ECOTAG_SCORE_COLUMN_NAME);
// Open the used reference columns // Open the used reference columns
@ -453,6 +476,14 @@ int obi_ecotag(const char* dms_name,
return -1; return -1;
} }
best_match_taxids = (int32_t*) malloc(buffer_size* sizeof(int32_t));
if (best_match_taxids == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError allocating memory for the best match taxid array in ecotag");
return -1;
}
for (i=0; i < query_count; i++) for (i=0; i < query_count; i++)
{ {
if (i%1000 == 0) if (i%1000 == 0)
@ -514,7 +545,7 @@ int obi_ecotag(const char* dms_name,
// Store in best match array // Store in best match array
// Grow match array if needed // Grow match and taxid array if needed
if (best_match_count == buffer_size) if (best_match_count == buffer_size)
{ {
buffer_size = buffer_size*2; buffer_size = buffer_size*2;
@ -525,6 +556,13 @@ int obi_ecotag(const char* dms_name,
obidebug(1, "\nError reallocating match array when assigning"); obidebug(1, "\nError reallocating match array when assigning");
return -1; return -1;
} }
best_match_taxids = (int32_t*) realloc(best_match_taxids, buffer_size*sizeof(int32_t));
if (best_match_taxids == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError reallocating match taxids array when assigning");
return -1;
}
} }
id = obi_get_str_with_elt_idx_and_col_p_in_view(ref_view, ref_id_column, j, 0); id = obi_get_str_with_elt_idx_and_col_p_in_view(ref_view, ref_id_column, j, 0);
@ -545,6 +583,7 @@ int obi_ecotag(const char* dms_name,
// Save match // Save match
best_match_array[best_match_count] = j; best_match_array[best_match_count] = j;
best_match_taxids[best_match_count] = obi_get_int_with_elt_idx_and_col_p_in_view(ref_view, ref_taxid_column, j, 0);
best_match_count++; best_match_count++;
strcpy(best_match_ids+best_match_ids_length, id); strcpy(best_match_ids+best_match_ids_length, id);
best_match_ids_length = best_match_ids_length + id_len + 1; best_match_ids_length = best_match_ids_length + id_len + 1;
@ -629,6 +668,7 @@ int obi_ecotag(const char* dms_name,
else else
lca_name = lca->name; lca_name = lca->name;
best_match_ids_to_store = best_match_ids; best_match_ids_to_store = best_match_ids;
best_match_taxids_to_store = best_match_taxids;
} }
else else
{ {
@ -636,6 +676,7 @@ int obi_ecotag(const char* dms_name,
lca_name = OBIStr_NA; lca_name = OBIStr_NA;
lca_taxid = OBIInt_NA; lca_taxid = OBIInt_NA;
best_match_ids_to_store = OBITuple_NA; best_match_ids_to_store = OBITuple_NA;
best_match_taxids_to_store = OBITuple_NA;
score = OBIFloat_NA; score = OBIFloat_NA;
} }
@ -644,7 +685,8 @@ int obi_ecotag(const char* dms_name,
assigned_taxid_column, lca_taxid, assigned_taxid_column, lca_taxid,
assigned_name_column, lca_name, assigned_name_column, lca_name,
assigned_status_column, assigned, assigned_status_column, assigned,
best_match_column, best_match_ids_to_store, best_match_ids_length, best_match_ids_column, best_match_ids_to_store, best_match_ids_length,
best_match_taxids_column, best_match_taxids_to_store, best_match_count,
score_column, best_score score_column, best_score
) < 0) ) < 0)
return -1; return -1;
@ -652,6 +694,7 @@ int obi_ecotag(const char* dms_name,
free(best_match_array); free(best_match_array);
free(best_match_ids); free(best_match_ids);
free(best_match_taxids);
obi_close_taxonomy(taxonomy); obi_close_taxonomy(taxonomy);
obi_save_and_close_view(query_view); obi_save_and_close_view(query_view);

View File

@ -23,7 +23,8 @@
#define ECOTAG_TAXID_COLUMN_NAME "TAXID" #define ECOTAG_TAXID_COLUMN_NAME "TAXID"
#define ECOTAG_NAME_COLUMN_NAME "SCIENTIFIC_NAME" #define ECOTAG_NAME_COLUMN_NAME "SCIENTIFIC_NAME"
#define ECOTAG_STATUS_COLUMN_NAME "ID_STATUS" #define ECOTAG_STATUS_COLUMN_NAME "ID_STATUS"
#define ECOTAG_BEST_MATCH_IDS_COLUMN_NAME "BEST_MATCH" #define ECOTAG_BEST_MATCH_IDS_COLUMN_NAME "BEST_MATCH_IDS"
#define ECOTAG_BEST_MATCH_TAXIDS_COLUMN_NAME "BEST_MATCH_TAXIDS"
#define ECOTAG_SCORE_COLUMN_NAME "BEST_IDENTITY" #define ECOTAG_SCORE_COLUMN_NAME "BEST_IDENTITY"