diff --git a/python/obitools3/commands/ecotag.pyx b/python/obitools3/commands/ecotag.pyx new file mode 100755 index 0000000..c8fb742 --- /dev/null +++ b/python/obitools3/commands/ecotag.pyx @@ -0,0 +1,120 @@ +#cython: language_level=3 + +from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport +from obitools3.dms.dms cimport DMS +from obitools3.dms.view import RollbackException +from obitools3.dms.capi.obiecotag cimport obi_ecotag +from obitools3.apps.optiongroups import addMinimalInputOption, addTaxonomyOption, addMinimalOutputOption +from obitools3.uri.decode import open_uri +from obitools3.apps.config import logger +from obitools3.utils cimport tobytes, str2bytes +from obitools3.dms.view.view cimport View +from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS + +import sys + + +__title__="Taxonomic assignment of sequences" + + +def addOptions(parser): + + addMinimalInputOption(parser) + addTaxonomyOption(parser) + addMinimalOutputOption(parser) + + group = parser.add_argument_group('obi ecotag specific options') + + group.add_argument('--ref-database','-R', + action="store", dest="ecotag:ref_view", + metavar='', + type=str, + help="URI of the view containing the reference database as built by the build_ref_db command.") + + group.add_argument('--minimum-identity','-m', + action="store", dest="ecotag:threshold", + metavar='', + default=0.0, + type=float, + help="Minimum identity to consider for assignment, as a normalized identity, e.g. 0.95 for an identity of 95%%. " + "Default: 0.00 (no threshold).") + +def run(config): + + DMS.obi_atexit() + + logger("info", "obi ecotag") + + # Open the query view: only the DMS + input = open_uri(config['obi']['inputURI'], + dms_only=True) + if input is None: + raise Exception("Could not read input") + i_dms = input[0] + i_dms_name = input[0].name + i_view_name = input[1] + + # Open the reference view: only the DMS + ref = open_uri(config['ecotag']['ref_view'], + dms_only=True) + if ref is None: + raise Exception("Could not read reference view URI") + ref_dms = ref[0] + ref_dms_name = ref[0].name + ref_view_name = ref[1] + + # Open the output: only the DMS + output = open_uri(config['obi']['outputURI'], + input=False, + dms_only=True) + if output is None: + raise Exception("Could not create output") + o_dms = output[0] + final_o_view_name = output[1] + + # If the input and output DMS are not the same, run ecotag creating a temporary view that will be exported to + # the right DMS and deleted in the other afterwards. + if i_dms != o_dms: + temporary_view_name = final_o_view_name + i=0 + while temporary_view_name in i_dms: # Making sure view name is unique in input DMS + temporary_view_name = final_o_view_name+b"_"+str2bytes(str(i)) + i+=1 + o_view_name = temporary_view_name + else: + o_view_name = final_o_view_name + + # Read taxonomy DMS and name + taxo = open_uri(config['obi']['taxoURI'], + dms_only=True) + taxo_dms_name = taxo[0].name + taxo_dms = taxo[0] + taxonomy_name = config['obi']['taxoURI'].split("/")[-1] # Robust in theory + + # Save command config in View comments + command_line = " ".join(sys.argv[1:]) + comments = View.print_config(config, "ecotag", command_line, input_dms_name=[i_dms_name], input_view_name=[i_view_name]) # TODO no. fix + + if obi_ecotag(tobytes(i_dms_name), tobytes(i_view_name), \ + tobytes(ref_dms_name), tobytes(ref_view_name), \ + tobytes(taxo_dms_name), tobytes(taxonomy_name), \ + tobytes(o_view_name), comments, + config['ecotag']['threshold']) < 0: + raise Exception("Error running ecotag") + + # If the input and output DMS are not the same, export result view to output DMS + if i_dms != o_dms: + View.import_view(i_dms.full_path[:-7], o_dms.full_path[:-7], o_view_name, final_o_view_name) + + # Save command config in DMS comments + o_dms.record_command_line(command_line) + + print("\n") + print(repr(o_dms[final_o_view_name])) + + # If the input and the output DMS are different, delete the temporary result view in the input DMS + if i_dms != o_dms: + View.delete_view(i_dms, o_view_name) + o_dms.close() + + i_dms.close() diff --git a/python/obitools3/dms/capi/obiecotag.pxd b/python/obitools3/dms/capi/obiecotag.pxd new file mode 100755 index 0000000..c3d112c --- /dev/null +++ b/python/obitools3/dms/capi/obiecotag.pxd @@ -0,0 +1,14 @@ +#cython: language_level=3 + + +cdef extern from "obi_ecotag.h" nogil: + + int obi_ecotag(const char* dms_name, + const char* query_view_name, + const char* ref_dms_name, + const char* ref_view_name, + const char* taxo_dms_name, + const char* taxonomy_name, + const char* output_view_name, + const char* output_view_comments, + double ecotag_threshold) diff --git a/python/obitools3/dms/dms.cfiles b/python/obitools3/dms/dms.cfiles index bcff959..b407a36 100755 --- a/python/obitools3/dms/dms.cfiles +++ b/python/obitools3/dms/dms.cfiles @@ -38,6 +38,7 @@ ../../../src/obidmscolumn_str.c ../../../src/obidmscolumn.c ../../../src/obidmscolumndir.c +../../../src/obi_ecotag.c ../../../src/obierrno.c ../../../src/obilittlebigman.c ../../../src/obitypes.c diff --git a/python/obitools3/utils.cfiles b/python/obitools3/utils.cfiles index c2043a4..e3c390f 100755 --- a/python/obitools3/utils.cfiles +++ b/python/obitools3/utils.cfiles @@ -38,6 +38,7 @@ ../../src/obidmscolumn_array.c ../../src/obidmscolumn.c ../../src/obidmscolumndir.c +../../src/obi_ecotag.c ../../src/obierrno.c ../../src/obilittlebigman.c ../../src/obitypes.c diff --git a/src/obi_ecotag.c b/src/obi_ecotag.c new file mode 100755 index 0000000..33503b5 --- /dev/null +++ b/src/obi_ecotag.c @@ -0,0 +1,640 @@ +/**************************************************************************** + * Taxonomic assignment of sequences * + ****************************************************************************/ + +/** + * @file obi_ecotag.c + * @author Celine Mercier (celine.mercier@metabarcoding.org) + * @date November 15th 2018 + * @brief Functions for the taxonomic assignment of sequences. + */ + +//#define OMP_SUPPORT // TODO +#ifdef OMP_SUPPORT +#include +#endif + +#include +#include +#include +#include +#include + +#include "obi_ecotag.h" +#include "obidms_taxonomy.h" +#include "obidms.h" +#include "obidebug.h" +#include "obierrno.h" +#include "obitypes.h" +#include "obiview.h" +#include "obidmscolumn.h" +#include "sse_banded_LCS_alignment.h" +#include "upperband.h" +#include "obiblob.h" +#include "build_reference_db.h" + + + +#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) + + +/************************************************************************** + * + * D E C L A R A T I O N O F T H E P R I V A T E F U N C T I O N S + * + **************************************************************************/ + + +/** + * Internal function creating the output columns for the ecotag algorithm. + * + * @param o_view A pointer on the output view. + * + * @retval 0 if the operation was successfully completed. + * @retval -1 if an error occurred. + * + * @since December 2018 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int create_output_columns(Obiview_p o_view); + + +/** + * @brief Internal function printing the result of one assignment to the output view. + * + * @param output_view A pointer on the writable view where the result should be written. + * @param line The line in the output view where the result should be written. + * @param assigned_taxid_column A pointer on the column where the assigned taxid should be written. + * @param taxid The assigned taxid. + * @param assigned_name_column A pointer on the column where the assigned scientific name should be written. + * @param name The assigned scientific name. + * @param assigned_status_column A pointer on the column where the assigned status should be written. + * @param assigned The assigned status (whether the sequence was assigned to a taxon or not). + * @param best_match_column A pointer on the column where the list of ids of the best matches should be written. + * @param best_match_ids The list of ids of the best matches as an array of the concatenated ids separated by '\0'. + * @param best_match_ids_length The total length of the array of ids of best matches. + * @param score_column A pointer on the column where the score should be written. + * @param score The similarity score of the sequence with its best match(es). + * + * @retval 0 if the operation was successfully completed. + * @retval -1 if an error occurred. + * + * @since December 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +int print_assignment_result(Obiview_p output_view, index_t line, + OBIDMS_column_p assigned_taxid_column, int32_t taxid, + OBIDMS_column_p assigned_name_column, const char* name, + OBIDMS_column_p assigned_status_column, bool assigned, + OBIDMS_column_p best_match_column, const char* best_match_ids, int best_match_ids_length, + OBIDMS_column_p score_column, double score); + + +/************************************************************************ + * + * D E F I N I T I O N O F T H E P R I V A T E F U N C T I O N S + * + ************************************************************************/ + +static int create_output_columns(Obiview_p o_view) +{ + // Score column + if (obi_view_add_column(o_view, ECOTAG_SCORE_COLUMN_NAME, -1, NULL, OBI_FLOAT, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_SCORE_COLUMN_NAME, true) < 0) + { + obidebug(1, "\nError creating the column for the score in ecotag"); + return -1; + } + + // Assigned taxid column + if (obi_view_add_column(o_view, ECOTAG_TAXID_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_TAXID_COLUMN_NAME, true) < 0) + { + obidebug(1, "\nError creating the column for the assigned taxid in ecotag"); + return -1; + } + + // Assigned scientific name column + if (obi_view_add_column(o_view, ECOTAG_NAME_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_NAME_COLUMN_NAME, true) < 0) + { + obidebug(1, "\nError creating the column for the assigned scientific name in ecotag"); + return -1; + } + + // Assignement status column + if (obi_view_add_column(o_view, ECOTAG_STATUS_COLUMN_NAME, -1, NULL, OBI_BOOL, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_STATUS_COLUMN_NAME, true) < 0) + { + obidebug(1, "\nError creating the column for the assignment status in ecotag"); + return -1; + } + + // Column for array of best match ids + if (obi_view_add_column(o_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, true, false, NULL, NULL, -1, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME, true) < 0) + { + obidebug(1, "\nError creating the column for the array of ids of the best match in ecotag"); + return -1; + } + + return 0; +} + + +int print_assignment_result(Obiview_p output_view, index_t line, + OBIDMS_column_p assigned_taxid_column, int32_t taxid, + OBIDMS_column_p assigned_name_column, const char* name, + OBIDMS_column_p assigned_status_column, bool assigned, + OBIDMS_column_p best_match_column, const char* best_match_ids, int best_match_ids_length, + OBIDMS_column_p score_column, double score) +{ + // Write the assigned taxid + if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, assigned_taxid_column, line, 0, taxid) < 0) + { + obidebug(1, "\nError writing a taxid in a column when writing ecotag results"); + return -1; + } + + // Write the assigned scientific name + if (obi_set_str_with_elt_idx_and_col_p_in_view(output_view, assigned_name_column, line, 0, name) < 0) + { + obidebug(1, "\nError writing a scientific name in a column when writing ecotag results"); + return -1; + } + + // Write the assigned status + if (obi_set_bool_with_elt_idx_and_col_p_in_view(output_view, assigned_status_column, line, 0, assigned) < 0) + { + obidebug(1, "\nError writing a assignment status in a column when writing ecotag results"); + return -1; + } + + // Write the best match ids + if (obi_set_array_with_col_p_in_view(output_view, best_match_column, line, best_match_ids, (uint8_t)(sizeof(char)*8), best_match_ids_length) < 0) + { + obidebug(1, "\nError writing a assignment status in a column when writing ecotag results"); + return -1; + } + + // Write the similarity score with the best match + if (obi_set_float_with_elt_idx_and_col_p_in_view(output_view, score_column, line, 0, score) < 0) + { + obidebug(1, "\nError writing a score in a column when writing ecotag results"); + return -1; + } + + return 0; +} + + +/********************************************************************** + * + * D E F I N I T I O N O F T H E P U B L I C F U N C T I O N S + * + **********************************************************************/ + + +int obi_ecotag(const char* dms_name, + const char* query_view_name, + const char* ref_dms_name, + const char* ref_view_name, + const char* taxo_dms_name, + const char* taxonomy_name, + const char* output_view_name, + const char* output_view_comments, + double ecotag_threshold) +{ + + // For each sequence + + // Align with DB + + // Keep the indices of all max scores + + // For each kept index, get the LCA at threshold, then the LCA of those LCAs + + // Write result (max score, threshold, LCA assigned, list of the ids of the best matches) + + + index_t i, j, k; + ecotx_t* lca; + ecotx_t* lca_in_array; + ecotx_t* best_match; + index_t query_seq_idx, ref_seq_idx; + double score, best_score; + double threshold; + int lcs_length; + int ali_length; + Kmer_table_p ktable; + Obi_blob_p blob1; + Obi_blob_p blob2; + int lcs_min; + index_t query_count; + index_t ref_count; + bool same_indexer; + const double* score_array; + const int* lca_array; + index_t* best_match_array; + char* best_match_ids; + int32_t best_match_ids_length; + int best_match_count; + int buffer_size; + int best_match_ids_buffer_size; + index_t best_match_idx; + int32_t lca_array_length; + int32_t lca_taxid; + int32_t taxid_best_match; + bool assigned; + const char* lca_name; + const char* id; + int id_len; + + OBIDMS_p dms = NULL; + OBIDMS_p ref_dms = NULL; + OBIDMS_p taxo_dms = NULL; + + OBIDMS_taxonomy_p taxonomy = NULL; + + Obiview_p query_view = NULL; + Obiview_p ref_view = NULL; + Obiview_p output_view = NULL; + OBIDMS_column_p query_seq_column = NULL; + OBIDMS_column_p ref_seq_column = NULL; + OBIDMS_column_p ref_id_column = NULL; + OBIDMS_column_p score_column = NULL; + OBIDMS_column_p assigned_taxid_column = NULL; + OBIDMS_column_p assigned_name_column = NULL; + OBIDMS_column_p assigned_status_column = NULL; + OBIDMS_column_p best_match_column = NULL; + OBIDMS_column_p lca_taxid_a_column = NULL; + OBIDMS_column_p score_a_column = NULL; + OBIDMS_column_p ref_taxid_column = NULL; + + buffer_size = 1024; + best_match_ids_buffer_size = 1024; + + // Open main DMS containing the query sequences and where the output will be written + dms = obi_open_dms(dms_name); + if (dms == NULL) + { + obidebug(1, "\nError opening the DMS containing the query sequences for ecotag"); + return -1; + } + + // Open the DMS containing the reference database (can be the same or not) + ref_dms = obi_open_dms(ref_dms_name); + if (ref_dms == NULL) + { + obidebug(1, "\nError opening the DMS containing the reference database for ecotag"); + return -1; + } + + // Open the DMS containing the taxonomy (can be the same or not) + taxo_dms = obi_open_dms(taxo_dms_name); + if (taxo_dms == NULL) + { + obidebug(1, "\nError opening the DMS containing the taxonomy for ecotag"); + return -1; + } + + // Open the taxonomy + taxonomy = obi_read_taxonomy(taxo_dms, taxonomy_name, 0); + if (taxonomy == NULL) + { + obidebug(1, "\nError opening the taxonomy for the assignment of sequences"); + return -1; + } + + // Open the query view + query_view = obi_open_view(dms, query_view_name); + if (query_view == NULL) + { + obidebug(1, "\nError opening the view of query sequences to assign"); + return -1; + } + + // Open the reference view. + ref_view = obi_open_view(ref_dms, ref_view_name); + if (ref_view == NULL) + { + obidebug(1, "\nError opening the view of reference sequences to assign"); + return -1; + } + + // Open the column of query sequences to assign + if (strcmp((query_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0) + query_seq_column = obi_view_get_column(query_view, NUC_SEQUENCE_COLUMN); + else + { + obi_set_errno(OBI_ECOTAG_ERROR); + obidebug(1, "\nError: query view must have the type NUC_SEQS view"); + return -1; + } + if (query_seq_column == NULL) + { + obidebug(1, "\nError getting the query column to assign"); + return -1; + } + + // Open the column of reference sequences to assign + if (strcmp((ref_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0) + ref_seq_column = obi_view_get_column(ref_view, NUC_SEQUENCE_COLUMN); + else + { + obi_set_errno(OBI_ECOTAG_ERROR); + obidebug(1, "\nError: reference view must have the type NUC_SEQS view"); + return -1; + } + if (ref_seq_column == NULL) + { + obidebug(1, "\nError getting the reference column to assign"); + return -1; + } + + // Open the ID column of reference sequences + ref_id_column = obi_view_get_column(ref_view, ID_COLUMN); + if (ref_id_column == NULL) + { + obidebug(1, "\nError getting the ID column of the reference view"); + return -1; + } + + // Create the output view + output_view = obi_clone_view(dms, query_view, output_view_name, NULL, output_view_comments); + if (output_view == NULL) + { + obidebug(1, "\nError creating the output view when assigning"); + return -1; + } + + // Create the output columns + if (create_output_columns(output_view) < 0) + return -1; + assigned_taxid_column = obi_view_get_column(output_view, ECOTAG_TAXID_COLUMN_NAME); + assigned_name_column = obi_view_get_column(output_view, ECOTAG_NAME_COLUMN_NAME); + assigned_status_column = obi_view_get_column(output_view, ECOTAG_STATUS_COLUMN_NAME); + best_match_column = obi_view_get_column(output_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME); + score_column = obi_view_get_column(output_view, ECOTAG_SCORE_COLUMN_NAME); + + // Open the used reference columns + lca_taxid_a_column = obi_view_get_column(ref_view, LCA_TAXID_ARRAY_COLUMN_NAME); + if (lca_taxid_a_column == NULL) + { + obidebug(1, "\nError opening the reference LCA taxid array column when doing the taxonomic assignment of sequences"); + return -1; + } + score_a_column = obi_view_get_column(ref_view, LCA_SCORE_ARRAY_COLUMN_NAME); + if (score_a_column == NULL) + { + obidebug(1, "\nError opening the score array column when doing the taxonomic assignment of sequences"); + return -1; + } + ref_taxid_column = obi_view_get_column(ref_view, TAXID_COLUMN); + if (ref_taxid_column == NULL) + { + obidebug(1, "\nError opening the taxid column when doing the taxonomic assignment of sequences"); + return -1; + } + + // Check if the sequence columns share the same indexer (allows for quick checking of sequence equality) + if (strcmp((query_seq_column->header)->indexer_name, (ref_seq_column->header)->indexer_name) == 0) + same_indexer = true; + else + same_indexer = false; + + // Build kmer tables + ktable = hash_two_seq_columns(query_view, query_seq_column, 0, ref_view, ref_seq_column, 0); + if (ktable == NULL) + { + obi_set_errno(OBI_ECOTAG_ERROR); + obidebug(1, "\nError building kmer tables before assigning"); + return -1; + } + + query_count = (query_view->infos)->line_count; + ref_count = (ref_view->infos)->line_count; + + best_match_array = (index_t*) malloc(buffer_size * sizeof(index_t)); + if (best_match_array == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for the best match index array in ecotag"); + return -1; + } + + best_match_ids = (char*) malloc(best_match_ids_buffer_size* sizeof(char)); + if (best_match_ids == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for the best match id array in ecotag"); + return -1; + } + + for (i=0; i < query_count; i++) + { + if (i%100 == 0) + fprintf(stderr,"\rDone : %f %% ", (i / (float) query_count)*100); + + best_match_count = 0; + best_match_ids_length = 0; + threshold = ecotag_threshold; + best_score = 0.0; + + // Get first sequence and its index + query_seq_idx = obi_get_index_with_elt_idx_and_col_p_in_view(query_view, query_seq_column, i, 0); + blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(query_view, query_seq_column, i, 0); + if (blob1 == NULL) + { + obidebug(1, "\nError retrieving sequences to align when assigning"); + return -1; + } + + for (j=0; j < ref_count; j++) + { + // Get reference sequence and its index + ref_seq_idx = obi_get_index_with_elt_idx_and_col_p_in_view(ref_view, ref_seq_column, j, 0); + blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(ref_view, ref_seq_column, j, 0); + if (blob2 == NULL) + { + obidebug(1, "\nError retrieving sequences to align when assigning"); + return -1; + } + + // Check if the sequences are identical in a quick way (same index in the same indexer) + if (same_indexer && (query_seq_idx == ref_seq_idx)) + score = 1.0; + + else // the sequences aren't identical or we don't know + { + // kmer filter (offset for the index of the kmer table of the 2nd sequence because the kmer tables of both the query and ref columns are concatenated in one) + align_filters(ktable, blob1, blob2, i, query_count+j, threshold, true, ALILEN, true, &score, &lcs_min, !same_indexer); + + // Compute alignment score + if ((score < 0) && ((threshold == 0) || (score == -1.0))) // (sequences are not identical), and (no threshold, or filter passed): align + { + score = obiblob_sse_banded_lcs_align(blob1, blob2, threshold, true, ALILEN, true, &lcs_length, &ali_length); + } + } + + if ((score >= threshold) && (score >= best_score)) + { + // Replace everything if score is better + if (score > best_score) + { + // Up the alignment threshold because we only want scores equal or greater than the best found + threshold = score; + best_score = score; + // Reset the array with that match + best_match_ids_length = 0; + best_match_count = 0; + } + + // Store in best match array + + // Grow match array if needed + if (best_match_count == buffer_size) + { + buffer_size = buffer_size*2; + best_match_array = (index_t*) realloc(best_match_array, buffer_size*sizeof(index_t)); + if (best_match_array == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating match array when assigning"); + return -1; + } + } + + id = obi_get_str_with_elt_idx_and_col_p_in_view(ref_view, ref_id_column, j, 0); + id_len = strlen(id); + + // Grow ids array if needed + if ((best_match_ids_length+id_len+1) >= best_match_ids_buffer_size) + { + best_match_ids_buffer_size = best_match_ids_buffer_size*2; + best_match_ids = (char*) realloc(best_match_ids, best_match_ids_buffer_size*sizeof(char)); + if (best_match_ids == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating match ids array when assigning"); + return -1; + } + } + + // Save match + best_match_array[best_match_count] = j; + best_match_count++; + strcpy(best_match_ids+best_match_ids_length, id); + best_match_ids_length = best_match_ids_length + id_len + 1; + } + } + + // Get LCA of the LCAs of the best matches + for (j=0; j= ecotag_threshold)) + k++; + + if (k>0) + { + lca_array = obi_get_array_with_col_p_in_view(ref_view, lca_taxid_a_column, best_match_idx, &lca_array_length); + if (j>0) + { + lca = obi_taxo_get_taxon_with_taxid(taxonomy, lca_taxid); + if (lca == NULL) + { + obidebug(1, "\nError getting a taxon from a taxid when doing taxonomic assignment"); + return -1; + } + lca_in_array = obi_taxo_get_taxon_with_taxid(taxonomy, lca_array[k-1]); + if (lca_in_array == NULL) + { + obidebug(1, "\nError getting a taxon from a taxid when doing taxonomic assignment"); + return -1; + } + lca = obi_taxo_get_lca(lca, lca_in_array); + if (lca == NULL) + { + obidebug(1, "\nError getting the LCA of two taxa when doing taxonomic assignment"); + return -1; + } + lca_taxid = lca->taxid; + } + else + { + lca_taxid = lca_array[k-1]; + lca = obi_taxo_get_taxon_with_taxid(taxonomy, lca_taxid); + } + } + else + { + taxid_best_match = obi_get_int_with_elt_idx_and_col_p_in_view(ref_view, ref_taxid_column, j, 0); + best_match = obi_taxo_get_taxon_with_taxid(taxonomy, taxid_best_match); + if (best_match == NULL) + { + obidebug(1, "\nError getting a taxon from a taxid when doing taxonomic assignment"); + return -1; + } + if (j>0) + { + lca = obi_taxo_get_lca(lca, best_match); + lca_taxid = lca->taxid; + } + else + { + lca_taxid = taxid_best_match; + lca = obi_taxo_get_taxon_with_taxid(taxonomy, lca_taxid); + } + } + } + + if (best_match_count > 0) + { + assigned = true; + + // Get LCA name + if (lca->preferred_name != NULL) + lca_name = lca->preferred_name; + else + lca_name = lca->name; + } + else + { + assigned = false; + lca_name = OBIStr_NA; + lca_taxid = OBIInt_NA; + best_match_ids = OBITuple_NA; + score = OBIFloat_NA; + } + + // Print result in output view + if (print_assignment_result(output_view, i, + assigned_taxid_column, lca_taxid, + assigned_name_column, lca_name, + assigned_status_column, assigned, + best_match_column, best_match_ids, best_match_ids_length, + score_column, best_score + ) < 0) + return -1; + } + + free(best_match_array); + free(best_match_ids); + + obi_close_taxonomy(taxonomy); + obi_save_and_close_view(query_view); + obi_save_and_close_view(ref_view); + obi_save_and_close_view(output_view); + + obi_close_dms(dms, false); + obi_close_dms(ref_dms, false); + obi_close_dms(taxo_dms, false); + + fprintf(stderr,"\rDone : 100 %% "); + return 0; +} + + + diff --git a/src/obi_ecotag.h b/src/obi_ecotag.h new file mode 100755 index 0000000..436f1c8 --- /dev/null +++ b/src/obi_ecotag.h @@ -0,0 +1,71 @@ +/************************************************************************************************* + * Header file for functions for the taxonomic assignment of sequences * + *************************************************************************************************/ + +/** + * @file obi_ecotag.h + * @author Celine Mercier (celine.mercier@metabarcoding.org) + * @date November 15th 2018 + * @brief Header file for the functions for the taxonomic assignment of sequences. + */ + + +#ifndef OBI_ECOTAG_H_ +#define OBI_ECOTAG_H_ + + +#include +#include +#include + + + +#define ECOTAG_TAXID_COLUMN_NAME "TAXID" +#define ECOTAG_NAME_COLUMN_NAME "SCIENTIFIC_NAME" +#define ECOTAG_STATUS_COLUMN_NAME "ID_STATUS" +#define ECOTAG_BEST_MATCH_IDS_COLUMN_NAME "BEST_MATCH" +#define ECOTAG_SCORE_COLUMN_NAME "BEST_IDENTITY" + + +/** + * @brief Taxonomic assignment of sequences. + * + * Note: The columns where the results are written are automatically named and created. + * + * @param dms_name The path to the DMS where the views are. + * @param query_view_name The name of the view containing the query sequences. + * @param ref_dms_name The name of the DMS containing the reference database. + * @param ref_view_name The name of the view corresponding to the reference database as built by build_reference_db(). + * @param taxo_dms_name The name of the DMS containing the taxonomy associated with the reference database. + * @param taxonomy_name The name of the taxonomy associated with the reference database. + * @param output_view_name The name to give to the output view. + * @param output_view_comments The comments to associate to the output view. + * @param ecotag_threshold The threshold at which to assign. + * + * The algorithm works like this: + * For each query sequence: + * Align with reference database + * Keep the indices of all the best matches + * For each kept index, get the LCA at that threshold as stored in the reference database, then the LCA of those LCAs + * Write result (max score, threshold, taxid and scientific name of the LCA assigned, list of the ids of the best matches) + * + * @returns A value indicating the success of the operation. + * @retval 0 if the operation was successfully completed. + * @retval -1 if an error occurred. + * + * @since November 2018 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +int obi_ecotag(const char* dms_name, + const char* query_view_name, + const char* ref_dms_name, + const char* ref_view_name, + const char* taxo_dms_name, + const char* taxonomy_name, + const char* output_view_name, + const char* output_view_comments, + double ecotag_threshold); + + +#endif /* OBI_ECOTAG_H_ */ + diff --git a/src/obierrno.h b/src/obierrno.h index 49c9b63..46a86e3 100755 --- a/src/obierrno.h +++ b/src/obierrno.h @@ -130,6 +130,8 @@ extern int obi_errno; */ #define OBIVIEW_ALREADY_EXISTS_ERROR (35) /** Tried to create a new view with a name already existing in the DMS. */ +#define OBI_ECOTAG_ERROR (36) /** Tried to create a new view with a name already existing in the DMS. + */ /**@}*/ #endif /* OBIERRNO_H_ */