ecotag: Added list of taxids for all best matches (closes #80)

This commit is contained in:
Celine Mercier
2020-06-09 14:33:14 +02:00
parent b0b96ac37a
commit 4aa772c405
2 changed files with 55 additions and 11 deletions

View File

@ -71,9 +71,12 @@ static int create_output_columns(Obiview_p o_view);
* @param name The assigned scientific name.
* @param assigned_status_column A pointer on the column where the assigned status should be written.
* @param assigned The assigned status (whether the sequence was assigned to a taxon or not).
* @param best_match_column A pointer on the column where the list of ids of the best matches should be written.
* @param best_match_ids_column A pointer on the column where the list of ids of the best matches should be written.
* @param best_match_ids The list of ids of the best matches as an array of the concatenated ids separated by '\0'.
* @param best_match_ids_length The total length of the array of ids of best matches.
* @param best_match_taxids_column A pointer on the column where the list of taxids of the best matches should be written.
* @param best_match_taxids The list of taxids of the best matches as an array of the taxids.
* @param best_match_taxids_length The length of the array of taxids of best matches.
* @param score_column A pointer on the column where the score should be written.
* @param score The similarity score of the sequence with its best match(es).
*
@ -87,7 +90,8 @@ int print_assignment_result(Obiview_p output_view, index_t line,
OBIDMS_column_p assigned_taxid_column, int32_t taxid,
OBIDMS_column_p assigned_name_column, const char* name,
OBIDMS_column_p assigned_status_column, bool assigned,
OBIDMS_column_p best_match_column, const char* best_match_ids, int best_match_ids_length,
OBIDMS_column_p best_match_ids_column, const char* best_match_ids, int best_match_ids_length,
OBIDMS_column_p best_match_taxids_column, const int32_t* best_match_taxids, int best_match_taxids_length,
OBIDMS_column_p score_column, double score);
@ -130,7 +134,14 @@ static int create_output_columns(Obiview_p o_view)
// Column for array of best match ids
if (obi_view_add_column(o_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, true, false, NULL, NULL, -1, "{}", true) < 0)
{
obidebug(1, "\nError creating the column for the array of ids of the best match in ecotag");
obidebug(1, "\nError creating the column for the array of ids of best matches in ecotag");
return -1;
}
// Column for array of best match taxids
if (obi_view_add_column(o_view, ECOTAG_BEST_MATCH_TAXIDS_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, true, false, NULL, NULL, -1, "{}", true) < 0)
{
obidebug(1, "\nError creating the column for the array of taxids of best matches in ecotag");
return -1;
}
@ -142,7 +153,8 @@ int print_assignment_result(Obiview_p output_view, index_t line,
OBIDMS_column_p assigned_taxid_column, int32_t taxid,
OBIDMS_column_p assigned_name_column, const char* name,
OBIDMS_column_p assigned_status_column, bool assigned,
OBIDMS_column_p best_match_column, const char* best_match_ids, int best_match_ids_length,
OBIDMS_column_p best_match_ids_column, const char* best_match_ids, int best_match_ids_length,
OBIDMS_column_p best_match_taxids_column, const int32_t* best_match_taxids, int best_match_taxids_length,
OBIDMS_column_p score_column, double score)
{
// Write the assigned taxid
@ -167,9 +179,16 @@ int print_assignment_result(Obiview_p output_view, index_t line,
}
// Write the best match ids
if (obi_set_array_with_col_p_in_view(output_view, best_match_column, line, best_match_ids, (uint8_t)(sizeof(char)*8), best_match_ids_length) < 0)
if (obi_set_array_with_col_p_in_view(output_view, best_match_ids_column, line, best_match_ids, (uint8_t)(sizeof(char)*8), best_match_ids_length) < 0)
{
obidebug(1, "\nError writing a assignment status in a column when writing ecotag results");
obidebug(1, "\nError writing the array of best match ids in a column when writing ecotag results");
return -1;
}
// Write the best match taxids
if (obi_set_array_with_col_p_in_view(output_view, best_match_taxids_column, line, best_match_taxids, (uint8_t)(sizeof(OBI_INT)*8), best_match_taxids_length) < 0)
{
obidebug(1, "\nError writing the array of best match taxids in a column when writing ecotag results");
return -1;
}
@ -235,6 +254,8 @@ int obi_ecotag(const char* dms_name,
char* best_match_ids;
char* best_match_ids_to_store;
int32_t best_match_ids_length;
int32_t* best_match_taxids;
int32_t* best_match_taxids_to_store;
int best_match_count;
int buffer_size;
int best_match_ids_buffer_size;
@ -263,7 +284,8 @@ int obi_ecotag(const char* dms_name,
OBIDMS_column_p assigned_taxid_column = NULL;
OBIDMS_column_p assigned_name_column = NULL;
OBIDMS_column_p assigned_status_column = NULL;
OBIDMS_column_p best_match_column = NULL;
OBIDMS_column_p best_match_ids_column = NULL;
OBIDMS_column_p best_match_taxids_column = NULL;
OBIDMS_column_p lca_taxid_a_column = NULL;
OBIDMS_column_p score_a_column = NULL;
OBIDMS_column_p ref_taxid_column = NULL;
@ -396,7 +418,8 @@ int obi_ecotag(const char* dms_name,
assigned_taxid_column = obi_view_get_column(output_view, ECOTAG_TAXID_COLUMN_NAME);
assigned_name_column = obi_view_get_column(output_view, ECOTAG_NAME_COLUMN_NAME);
assigned_status_column = obi_view_get_column(output_view, ECOTAG_STATUS_COLUMN_NAME);
best_match_column = obi_view_get_column(output_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME);
best_match_ids_column = obi_view_get_column(output_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME);
best_match_taxids_column = obi_view_get_column(output_view, ECOTAG_BEST_MATCH_TAXIDS_COLUMN_NAME);
score_column = obi_view_get_column(output_view, ECOTAG_SCORE_COLUMN_NAME);
// Open the used reference columns
@ -453,6 +476,14 @@ int obi_ecotag(const char* dms_name,
return -1;
}
best_match_taxids = (int32_t*) malloc(buffer_size* sizeof(int32_t));
if (best_match_taxids == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError allocating memory for the best match taxid array in ecotag");
return -1;
}
for (i=0; i < query_count; i++)
{
if (i%1000 == 0)
@ -514,7 +545,7 @@ int obi_ecotag(const char* dms_name,
// Store in best match array
// Grow match array if needed
// Grow match and taxid array if needed
if (best_match_count == buffer_size)
{
buffer_size = buffer_size*2;
@ -525,6 +556,13 @@ int obi_ecotag(const char* dms_name,
obidebug(1, "\nError reallocating match array when assigning");
return -1;
}
best_match_taxids = (int32_t*) realloc(best_match_taxids, buffer_size*sizeof(int32_t));
if (best_match_taxids == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError reallocating match taxids array when assigning");
return -1;
}
}
id = obi_get_str_with_elt_idx_and_col_p_in_view(ref_view, ref_id_column, j, 0);
@ -545,6 +583,7 @@ int obi_ecotag(const char* dms_name,
// Save match
best_match_array[best_match_count] = j;
best_match_taxids[best_match_count] = obi_get_int_with_elt_idx_and_col_p_in_view(ref_view, ref_taxid_column, j, 0);
best_match_count++;
strcpy(best_match_ids+best_match_ids_length, id);
best_match_ids_length = best_match_ids_length + id_len + 1;
@ -629,6 +668,7 @@ int obi_ecotag(const char* dms_name,
else
lca_name = lca->name;
best_match_ids_to_store = best_match_ids;
best_match_taxids_to_store = best_match_taxids;
}
else
{
@ -636,6 +676,7 @@ int obi_ecotag(const char* dms_name,
lca_name = OBIStr_NA;
lca_taxid = OBIInt_NA;
best_match_ids_to_store = OBITuple_NA;
best_match_taxids_to_store = OBITuple_NA;
score = OBIFloat_NA;
}
@ -644,7 +685,8 @@ int obi_ecotag(const char* dms_name,
assigned_taxid_column, lca_taxid,
assigned_name_column, lca_name,
assigned_status_column, assigned,
best_match_column, best_match_ids_to_store, best_match_ids_length,
best_match_ids_column, best_match_ids_to_store, best_match_ids_length,
best_match_taxids_column, best_match_taxids_to_store, best_match_count,
score_column, best_score
) < 0)
return -1;
@ -652,6 +694,7 @@ int obi_ecotag(const char* dms_name,
free(best_match_array);
free(best_match_ids);
free(best_match_taxids);
obi_close_taxonomy(taxonomy);
obi_save_and_close_view(query_view);