New command and C functions: obi ecotag
This commit is contained in:
120
python/obitools3/commands/ecotag.pyx
Executable file
120
python/obitools3/commands/ecotag.pyx
Executable file
@ -0,0 +1,120 @@
|
||||
#cython: language_level=3
|
||||
|
||||
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
|
||||
from obitools3.dms.dms cimport DMS
|
||||
from obitools3.dms.view import RollbackException
|
||||
from obitools3.dms.capi.obiecotag cimport obi_ecotag
|
||||
from obitools3.apps.optiongroups import addMinimalInputOption, addTaxonomyOption, addMinimalOutputOption
|
||||
from obitools3.uri.decode import open_uri
|
||||
from obitools3.apps.config import logger
|
||||
from obitools3.utils cimport tobytes, str2bytes
|
||||
from obitools3.dms.view.view cimport View
|
||||
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
|
||||
|
||||
import sys
|
||||
|
||||
|
||||
__title__="Taxonomic assignment of sequences"
|
||||
|
||||
|
||||
def addOptions(parser):
|
||||
|
||||
addMinimalInputOption(parser)
|
||||
addTaxonomyOption(parser)
|
||||
addMinimalOutputOption(parser)
|
||||
|
||||
group = parser.add_argument_group('obi ecotag specific options')
|
||||
|
||||
group.add_argument('--ref-database','-R',
|
||||
action="store", dest="ecotag:ref_view",
|
||||
metavar='<REF_VIEW>',
|
||||
type=str,
|
||||
help="URI of the view containing the reference database as built by the build_ref_db command.")
|
||||
|
||||
group.add_argument('--minimum-identity','-m',
|
||||
action="store", dest="ecotag:threshold",
|
||||
metavar='<THRESHOLD>',
|
||||
default=0.0,
|
||||
type=float,
|
||||
help="Minimum identity to consider for assignment, as a normalized identity, e.g. 0.95 for an identity of 95%%. "
|
||||
"Default: 0.00 (no threshold).")
|
||||
|
||||
def run(config):
|
||||
|
||||
DMS.obi_atexit()
|
||||
|
||||
logger("info", "obi ecotag")
|
||||
|
||||
# Open the query view: only the DMS
|
||||
input = open_uri(config['obi']['inputURI'],
|
||||
dms_only=True)
|
||||
if input is None:
|
||||
raise Exception("Could not read input")
|
||||
i_dms = input[0]
|
||||
i_dms_name = input[0].name
|
||||
i_view_name = input[1]
|
||||
|
||||
# Open the reference view: only the DMS
|
||||
ref = open_uri(config['ecotag']['ref_view'],
|
||||
dms_only=True)
|
||||
if ref is None:
|
||||
raise Exception("Could not read reference view URI")
|
||||
ref_dms = ref[0]
|
||||
ref_dms_name = ref[0].name
|
||||
ref_view_name = ref[1]
|
||||
|
||||
# Open the output: only the DMS
|
||||
output = open_uri(config['obi']['outputURI'],
|
||||
input=False,
|
||||
dms_only=True)
|
||||
if output is None:
|
||||
raise Exception("Could not create output")
|
||||
o_dms = output[0]
|
||||
final_o_view_name = output[1]
|
||||
|
||||
# If the input and output DMS are not the same, run ecotag creating a temporary view that will be exported to
|
||||
# the right DMS and deleted in the other afterwards.
|
||||
if i_dms != o_dms:
|
||||
temporary_view_name = final_o_view_name
|
||||
i=0
|
||||
while temporary_view_name in i_dms: # Making sure view name is unique in input DMS
|
||||
temporary_view_name = final_o_view_name+b"_"+str2bytes(str(i))
|
||||
i+=1
|
||||
o_view_name = temporary_view_name
|
||||
else:
|
||||
o_view_name = final_o_view_name
|
||||
|
||||
# Read taxonomy DMS and name
|
||||
taxo = open_uri(config['obi']['taxoURI'],
|
||||
dms_only=True)
|
||||
taxo_dms_name = taxo[0].name
|
||||
taxo_dms = taxo[0]
|
||||
taxonomy_name = config['obi']['taxoURI'].split("/")[-1] # Robust in theory
|
||||
|
||||
# Save command config in View comments
|
||||
command_line = " ".join(sys.argv[1:])
|
||||
comments = View.print_config(config, "ecotag", command_line, input_dms_name=[i_dms_name], input_view_name=[i_view_name]) # TODO no. fix
|
||||
|
||||
if obi_ecotag(tobytes(i_dms_name), tobytes(i_view_name), \
|
||||
tobytes(ref_dms_name), tobytes(ref_view_name), \
|
||||
tobytes(taxo_dms_name), tobytes(taxonomy_name), \
|
||||
tobytes(o_view_name), comments,
|
||||
config['ecotag']['threshold']) < 0:
|
||||
raise Exception("Error running ecotag")
|
||||
|
||||
# If the input and output DMS are not the same, export result view to output DMS
|
||||
if i_dms != o_dms:
|
||||
View.import_view(i_dms.full_path[:-7], o_dms.full_path[:-7], o_view_name, final_o_view_name)
|
||||
|
||||
# Save command config in DMS comments
|
||||
o_dms.record_command_line(command_line)
|
||||
|
||||
print("\n")
|
||||
print(repr(o_dms[final_o_view_name]))
|
||||
|
||||
# If the input and the output DMS are different, delete the temporary result view in the input DMS
|
||||
if i_dms != o_dms:
|
||||
View.delete_view(i_dms, o_view_name)
|
||||
o_dms.close()
|
||||
|
||||
i_dms.close()
|
14
python/obitools3/dms/capi/obiecotag.pxd
Executable file
14
python/obitools3/dms/capi/obiecotag.pxd
Executable file
@ -0,0 +1,14 @@
|
||||
#cython: language_level=3
|
||||
|
||||
|
||||
cdef extern from "obi_ecotag.h" nogil:
|
||||
|
||||
int obi_ecotag(const char* dms_name,
|
||||
const char* query_view_name,
|
||||
const char* ref_dms_name,
|
||||
const char* ref_view_name,
|
||||
const char* taxo_dms_name,
|
||||
const char* taxonomy_name,
|
||||
const char* output_view_name,
|
||||
const char* output_view_comments,
|
||||
double ecotag_threshold)
|
@ -38,6 +38,7 @@
|
||||
../../../src/obidmscolumn_str.c
|
||||
../../../src/obidmscolumn.c
|
||||
../../../src/obidmscolumndir.c
|
||||
../../../src/obi_ecotag.c
|
||||
../../../src/obierrno.c
|
||||
../../../src/obilittlebigman.c
|
||||
../../../src/obitypes.c
|
||||
|
@ -38,6 +38,7 @@
|
||||
../../src/obidmscolumn_array.c
|
||||
../../src/obidmscolumn.c
|
||||
../../src/obidmscolumndir.c
|
||||
../../src/obi_ecotag.c
|
||||
../../src/obierrno.c
|
||||
../../src/obilittlebigman.c
|
||||
../../src/obitypes.c
|
||||
|
640
src/obi_ecotag.c
Executable file
640
src/obi_ecotag.c
Executable file
@ -0,0 +1,640 @@
|
||||
/****************************************************************************
|
||||
* Taxonomic assignment of sequences *
|
||||
****************************************************************************/
|
||||
|
||||
/**
|
||||
* @file obi_ecotag.c
|
||||
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||
* @date November 15th 2018
|
||||
* @brief Functions for the taxonomic assignment of sequences.
|
||||
*/
|
||||
|
||||
//#define OMP_SUPPORT // TODO
|
||||
#ifdef OMP_SUPPORT
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <stdbool.h>
|
||||
#include <search.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
#include "obi_ecotag.h"
|
||||
#include "obidms_taxonomy.h"
|
||||
#include "obidms.h"
|
||||
#include "obidebug.h"
|
||||
#include "obierrno.h"
|
||||
#include "obitypes.h"
|
||||
#include "obiview.h"
|
||||
#include "obidmscolumn.h"
|
||||
#include "sse_banded_LCS_alignment.h"
|
||||
#include "upperband.h"
|
||||
#include "obiblob.h"
|
||||
#include "build_reference_db.h"
|
||||
|
||||
|
||||
|
||||
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
|
||||
|
||||
|
||||
/**************************************************************************
|
||||
*
|
||||
* D E C L A R A T I O N O F T H E P R I V A T E F U N C T I O N S
|
||||
*
|
||||
**************************************************************************/
|
||||
|
||||
|
||||
/**
|
||||
* Internal function creating the output columns for the ecotag algorithm.
|
||||
*
|
||||
* @param o_view A pointer on the output view.
|
||||
*
|
||||
* @retval 0 if the operation was successfully completed.
|
||||
* @retval -1 if an error occurred.
|
||||
*
|
||||
* @since December 2018
|
||||
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||
*/
|
||||
static int create_output_columns(Obiview_p o_view);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Internal function printing the result of one assignment to the output view.
|
||||
*
|
||||
* @param output_view A pointer on the writable view where the result should be written.
|
||||
* @param line The line in the output view where the result should be written.
|
||||
* @param assigned_taxid_column A pointer on the column where the assigned taxid should be written.
|
||||
* @param taxid The assigned taxid.
|
||||
* @param assigned_name_column A pointer on the column where the assigned scientific name should be written.
|
||||
* @param name The assigned scientific name.
|
||||
* @param assigned_status_column A pointer on the column where the assigned status should be written.
|
||||
* @param assigned The assigned status (whether the sequence was assigned to a taxon or not).
|
||||
* @param best_match_column A pointer on the column where the list of ids of the best matches should be written.
|
||||
* @param best_match_ids The list of ids of the best matches as an array of the concatenated ids separated by '\0'.
|
||||
* @param best_match_ids_length The total length of the array of ids of best matches.
|
||||
* @param score_column A pointer on the column where the score should be written.
|
||||
* @param score The similarity score of the sequence with its best match(es).
|
||||
*
|
||||
* @retval 0 if the operation was successfully completed.
|
||||
* @retval -1 if an error occurred.
|
||||
*
|
||||
* @since December 2016
|
||||
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||
*/
|
||||
int print_assignment_result(Obiview_p output_view, index_t line,
|
||||
OBIDMS_column_p assigned_taxid_column, int32_t taxid,
|
||||
OBIDMS_column_p assigned_name_column, const char* name,
|
||||
OBIDMS_column_p assigned_status_column, bool assigned,
|
||||
OBIDMS_column_p best_match_column, const char* best_match_ids, int best_match_ids_length,
|
||||
OBIDMS_column_p score_column, double score);
|
||||
|
||||
|
||||
/************************************************************************
|
||||
*
|
||||
* D E F I N I T I O N O F T H E P R I V A T E F U N C T I O N S
|
||||
*
|
||||
************************************************************************/
|
||||
|
||||
static int create_output_columns(Obiview_p o_view)
|
||||
{
|
||||
// Score column
|
||||
if (obi_view_add_column(o_view, ECOTAG_SCORE_COLUMN_NAME, -1, NULL, OBI_FLOAT, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_SCORE_COLUMN_NAME, true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the column for the score in ecotag");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Assigned taxid column
|
||||
if (obi_view_add_column(o_view, ECOTAG_TAXID_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_TAXID_COLUMN_NAME, true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the column for the assigned taxid in ecotag");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Assigned scientific name column
|
||||
if (obi_view_add_column(o_view, ECOTAG_NAME_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_NAME_COLUMN_NAME, true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the column for the assigned scientific name in ecotag");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Assignement status column
|
||||
if (obi_view_add_column(o_view, ECOTAG_STATUS_COLUMN_NAME, -1, NULL, OBI_BOOL, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_STATUS_COLUMN_NAME, true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the column for the assignment status in ecotag");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Column for array of best match ids
|
||||
if (obi_view_add_column(o_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, true, false, NULL, NULL, -1, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME, true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the column for the array of ids of the best match in ecotag");
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int print_assignment_result(Obiview_p output_view, index_t line,
|
||||
OBIDMS_column_p assigned_taxid_column, int32_t taxid,
|
||||
OBIDMS_column_p assigned_name_column, const char* name,
|
||||
OBIDMS_column_p assigned_status_column, bool assigned,
|
||||
OBIDMS_column_p best_match_column, const char* best_match_ids, int best_match_ids_length,
|
||||
OBIDMS_column_p score_column, double score)
|
||||
{
|
||||
// Write the assigned taxid
|
||||
if (obi_set_int_with_elt_idx_and_col_p_in_view(output_view, assigned_taxid_column, line, 0, taxid) < 0)
|
||||
{
|
||||
obidebug(1, "\nError writing a taxid in a column when writing ecotag results");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Write the assigned scientific name
|
||||
if (obi_set_str_with_elt_idx_and_col_p_in_view(output_view, assigned_name_column, line, 0, name) < 0)
|
||||
{
|
||||
obidebug(1, "\nError writing a scientific name in a column when writing ecotag results");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Write the assigned status
|
||||
if (obi_set_bool_with_elt_idx_and_col_p_in_view(output_view, assigned_status_column, line, 0, assigned) < 0)
|
||||
{
|
||||
obidebug(1, "\nError writing a assignment status in a column when writing ecotag results");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Write the best match ids
|
||||
if (obi_set_array_with_col_p_in_view(output_view, best_match_column, line, best_match_ids, (uint8_t)(sizeof(char)*8), best_match_ids_length) < 0)
|
||||
{
|
||||
obidebug(1, "\nError writing a assignment status in a column when writing ecotag results");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Write the similarity score with the best match
|
||||
if (obi_set_float_with_elt_idx_and_col_p_in_view(output_view, score_column, line, 0, score) < 0)
|
||||
{
|
||||
obidebug(1, "\nError writing a score in a column when writing ecotag results");
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/**********************************************************************
|
||||
*
|
||||
* D E F I N I T I O N O F T H E P U B L I C F U N C T I O N S
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
|
||||
int obi_ecotag(const char* dms_name,
|
||||
const char* query_view_name,
|
||||
const char* ref_dms_name,
|
||||
const char* ref_view_name,
|
||||
const char* taxo_dms_name,
|
||||
const char* taxonomy_name,
|
||||
const char* output_view_name,
|
||||
const char* output_view_comments,
|
||||
double ecotag_threshold)
|
||||
{
|
||||
|
||||
// For each sequence
|
||||
|
||||
// Align with DB
|
||||
|
||||
// Keep the indices of all max scores
|
||||
|
||||
// For each kept index, get the LCA at threshold, then the LCA of those LCAs
|
||||
|
||||
// Write result (max score, threshold, LCA assigned, list of the ids of the best matches)
|
||||
|
||||
|
||||
index_t i, j, k;
|
||||
ecotx_t* lca;
|
||||
ecotx_t* lca_in_array;
|
||||
ecotx_t* best_match;
|
||||
index_t query_seq_idx, ref_seq_idx;
|
||||
double score, best_score;
|
||||
double threshold;
|
||||
int lcs_length;
|
||||
int ali_length;
|
||||
Kmer_table_p ktable;
|
||||
Obi_blob_p blob1;
|
||||
Obi_blob_p blob2;
|
||||
int lcs_min;
|
||||
index_t query_count;
|
||||
index_t ref_count;
|
||||
bool same_indexer;
|
||||
const double* score_array;
|
||||
const int* lca_array;
|
||||
index_t* best_match_array;
|
||||
char* best_match_ids;
|
||||
int32_t best_match_ids_length;
|
||||
int best_match_count;
|
||||
int buffer_size;
|
||||
int best_match_ids_buffer_size;
|
||||
index_t best_match_idx;
|
||||
int32_t lca_array_length;
|
||||
int32_t lca_taxid;
|
||||
int32_t taxid_best_match;
|
||||
bool assigned;
|
||||
const char* lca_name;
|
||||
const char* id;
|
||||
int id_len;
|
||||
|
||||
OBIDMS_p dms = NULL;
|
||||
OBIDMS_p ref_dms = NULL;
|
||||
OBIDMS_p taxo_dms = NULL;
|
||||
|
||||
OBIDMS_taxonomy_p taxonomy = NULL;
|
||||
|
||||
Obiview_p query_view = NULL;
|
||||
Obiview_p ref_view = NULL;
|
||||
Obiview_p output_view = NULL;
|
||||
OBIDMS_column_p query_seq_column = NULL;
|
||||
OBIDMS_column_p ref_seq_column = NULL;
|
||||
OBIDMS_column_p ref_id_column = NULL;
|
||||
OBIDMS_column_p score_column = NULL;
|
||||
OBIDMS_column_p assigned_taxid_column = NULL;
|
||||
OBIDMS_column_p assigned_name_column = NULL;
|
||||
OBIDMS_column_p assigned_status_column = NULL;
|
||||
OBIDMS_column_p best_match_column = NULL;
|
||||
OBIDMS_column_p lca_taxid_a_column = NULL;
|
||||
OBIDMS_column_p score_a_column = NULL;
|
||||
OBIDMS_column_p ref_taxid_column = NULL;
|
||||
|
||||
buffer_size = 1024;
|
||||
best_match_ids_buffer_size = 1024;
|
||||
|
||||
// Open main DMS containing the query sequences and where the output will be written
|
||||
dms = obi_open_dms(dms_name);
|
||||
if (dms == NULL)
|
||||
{
|
||||
obidebug(1, "\nError opening the DMS containing the query sequences for ecotag");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Open the DMS containing the reference database (can be the same or not)
|
||||
ref_dms = obi_open_dms(ref_dms_name);
|
||||
if (ref_dms == NULL)
|
||||
{
|
||||
obidebug(1, "\nError opening the DMS containing the reference database for ecotag");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Open the DMS containing the taxonomy (can be the same or not)
|
||||
taxo_dms = obi_open_dms(taxo_dms_name);
|
||||
if (taxo_dms == NULL)
|
||||
{
|
||||
obidebug(1, "\nError opening the DMS containing the taxonomy for ecotag");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Open the taxonomy
|
||||
taxonomy = obi_read_taxonomy(taxo_dms, taxonomy_name, 0);
|
||||
if (taxonomy == NULL)
|
||||
{
|
||||
obidebug(1, "\nError opening the taxonomy for the assignment of sequences");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Open the query view
|
||||
query_view = obi_open_view(dms, query_view_name);
|
||||
if (query_view == NULL)
|
||||
{
|
||||
obidebug(1, "\nError opening the view of query sequences to assign");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Open the reference view.
|
||||
ref_view = obi_open_view(ref_dms, ref_view_name);
|
||||
if (ref_view == NULL)
|
||||
{
|
||||
obidebug(1, "\nError opening the view of reference sequences to assign");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Open the column of query sequences to assign
|
||||
if (strcmp((query_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0)
|
||||
query_seq_column = obi_view_get_column(query_view, NUC_SEQUENCE_COLUMN);
|
||||
else
|
||||
{
|
||||
obi_set_errno(OBI_ECOTAG_ERROR);
|
||||
obidebug(1, "\nError: query view must have the type NUC_SEQS view");
|
||||
return -1;
|
||||
}
|
||||
if (query_seq_column == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting the query column to assign");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Open the column of reference sequences to assign
|
||||
if (strcmp((ref_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0)
|
||||
ref_seq_column = obi_view_get_column(ref_view, NUC_SEQUENCE_COLUMN);
|
||||
else
|
||||
{
|
||||
obi_set_errno(OBI_ECOTAG_ERROR);
|
||||
obidebug(1, "\nError: reference view must have the type NUC_SEQS view");
|
||||
return -1;
|
||||
}
|
||||
if (ref_seq_column == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting the reference column to assign");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Open the ID column of reference sequences
|
||||
ref_id_column = obi_view_get_column(ref_view, ID_COLUMN);
|
||||
if (ref_id_column == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting the ID column of the reference view");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Create the output view
|
||||
output_view = obi_clone_view(dms, query_view, output_view_name, NULL, output_view_comments);
|
||||
if (output_view == NULL)
|
||||
{
|
||||
obidebug(1, "\nError creating the output view when assigning");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Create the output columns
|
||||
if (create_output_columns(output_view) < 0)
|
||||
return -1;
|
||||
assigned_taxid_column = obi_view_get_column(output_view, ECOTAG_TAXID_COLUMN_NAME);
|
||||
assigned_name_column = obi_view_get_column(output_view, ECOTAG_NAME_COLUMN_NAME);
|
||||
assigned_status_column = obi_view_get_column(output_view, ECOTAG_STATUS_COLUMN_NAME);
|
||||
best_match_column = obi_view_get_column(output_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME);
|
||||
score_column = obi_view_get_column(output_view, ECOTAG_SCORE_COLUMN_NAME);
|
||||
|
||||
// Open the used reference columns
|
||||
lca_taxid_a_column = obi_view_get_column(ref_view, LCA_TAXID_ARRAY_COLUMN_NAME);
|
||||
if (lca_taxid_a_column == NULL)
|
||||
{
|
||||
obidebug(1, "\nError opening the reference LCA taxid array column when doing the taxonomic assignment of sequences");
|
||||
return -1;
|
||||
}
|
||||
score_a_column = obi_view_get_column(ref_view, LCA_SCORE_ARRAY_COLUMN_NAME);
|
||||
if (score_a_column == NULL)
|
||||
{
|
||||
obidebug(1, "\nError opening the score array column when doing the taxonomic assignment of sequences");
|
||||
return -1;
|
||||
}
|
||||
ref_taxid_column = obi_view_get_column(ref_view, TAXID_COLUMN);
|
||||
if (ref_taxid_column == NULL)
|
||||
{
|
||||
obidebug(1, "\nError opening the taxid column when doing the taxonomic assignment of sequences");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Check if the sequence columns share the same indexer (allows for quick checking of sequence equality)
|
||||
if (strcmp((query_seq_column->header)->indexer_name, (ref_seq_column->header)->indexer_name) == 0)
|
||||
same_indexer = true;
|
||||
else
|
||||
same_indexer = false;
|
||||
|
||||
// Build kmer tables
|
||||
ktable = hash_two_seq_columns(query_view, query_seq_column, 0, ref_view, ref_seq_column, 0);
|
||||
if (ktable == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_ECOTAG_ERROR);
|
||||
obidebug(1, "\nError building kmer tables before assigning");
|
||||
return -1;
|
||||
}
|
||||
|
||||
query_count = (query_view->infos)->line_count;
|
||||
ref_count = (ref_view->infos)->line_count;
|
||||
|
||||
best_match_array = (index_t*) malloc(buffer_size * sizeof(index_t));
|
||||
if (best_match_array == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_MALLOC_ERROR);
|
||||
obidebug(1, "\nError allocating memory for the best match index array in ecotag");
|
||||
return -1;
|
||||
}
|
||||
|
||||
best_match_ids = (char*) malloc(best_match_ids_buffer_size* sizeof(char));
|
||||
if (best_match_ids == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_MALLOC_ERROR);
|
||||
obidebug(1, "\nError allocating memory for the best match id array in ecotag");
|
||||
return -1;
|
||||
}
|
||||
|
||||
for (i=0; i < query_count; i++)
|
||||
{
|
||||
if (i%100 == 0)
|
||||
fprintf(stderr,"\rDone : %f %% ", (i / (float) query_count)*100);
|
||||
|
||||
best_match_count = 0;
|
||||
best_match_ids_length = 0;
|
||||
threshold = ecotag_threshold;
|
||||
best_score = 0.0;
|
||||
|
||||
// Get first sequence and its index
|
||||
query_seq_idx = obi_get_index_with_elt_idx_and_col_p_in_view(query_view, query_seq_column, i, 0);
|
||||
blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(query_view, query_seq_column, i, 0);
|
||||
if (blob1 == NULL)
|
||||
{
|
||||
obidebug(1, "\nError retrieving sequences to align when assigning");
|
||||
return -1;
|
||||
}
|
||||
|
||||
for (j=0; j < ref_count; j++)
|
||||
{
|
||||
// Get reference sequence and its index
|
||||
ref_seq_idx = obi_get_index_with_elt_idx_and_col_p_in_view(ref_view, ref_seq_column, j, 0);
|
||||
blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(ref_view, ref_seq_column, j, 0);
|
||||
if (blob2 == NULL)
|
||||
{
|
||||
obidebug(1, "\nError retrieving sequences to align when assigning");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Check if the sequences are identical in a quick way (same index in the same indexer)
|
||||
if (same_indexer && (query_seq_idx == ref_seq_idx))
|
||||
score = 1.0;
|
||||
|
||||
else // the sequences aren't identical or we don't know
|
||||
{
|
||||
// kmer filter (offset for the index of the kmer table of the 2nd sequence because the kmer tables of both the query and ref columns are concatenated in one)
|
||||
align_filters(ktable, blob1, blob2, i, query_count+j, threshold, true, ALILEN, true, &score, &lcs_min, !same_indexer);
|
||||
|
||||
// Compute alignment score
|
||||
if ((score < 0) && ((threshold == 0) || (score == -1.0))) // (sequences are not identical), and (no threshold, or filter passed): align
|
||||
{
|
||||
score = obiblob_sse_banded_lcs_align(blob1, blob2, threshold, true, ALILEN, true, &lcs_length, &ali_length);
|
||||
}
|
||||
}
|
||||
|
||||
if ((score >= threshold) && (score >= best_score))
|
||||
{
|
||||
// Replace everything if score is better
|
||||
if (score > best_score)
|
||||
{
|
||||
// Up the alignment threshold because we only want scores equal or greater than the best found
|
||||
threshold = score;
|
||||
best_score = score;
|
||||
// Reset the array with that match
|
||||
best_match_ids_length = 0;
|
||||
best_match_count = 0;
|
||||
}
|
||||
|
||||
// Store in best match array
|
||||
|
||||
// Grow match array if needed
|
||||
if (best_match_count == buffer_size)
|
||||
{
|
||||
buffer_size = buffer_size*2;
|
||||
best_match_array = (index_t*) realloc(best_match_array, buffer_size*sizeof(index_t));
|
||||
if (best_match_array == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_MALLOC_ERROR);
|
||||
obidebug(1, "\nError reallocating match array when assigning");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
id = obi_get_str_with_elt_idx_and_col_p_in_view(ref_view, ref_id_column, j, 0);
|
||||
id_len = strlen(id);
|
||||
|
||||
// Grow ids array if needed
|
||||
if ((best_match_ids_length+id_len+1) >= best_match_ids_buffer_size)
|
||||
{
|
||||
best_match_ids_buffer_size = best_match_ids_buffer_size*2;
|
||||
best_match_ids = (char*) realloc(best_match_ids, best_match_ids_buffer_size*sizeof(char));
|
||||
if (best_match_ids == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_MALLOC_ERROR);
|
||||
obidebug(1, "\nError reallocating match ids array when assigning");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Save match
|
||||
best_match_array[best_match_count] = j;
|
||||
best_match_count++;
|
||||
strcpy(best_match_ids+best_match_ids_length, id);
|
||||
best_match_ids_length = best_match_ids_length + id_len + 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Get LCA of the LCAs of the best matches
|
||||
for (j=0; j<best_match_count; j++)
|
||||
{
|
||||
best_match_idx = best_match_array[j];
|
||||
|
||||
// Find the LCA for the chosen threshold
|
||||
score_array = obi_get_array_with_col_p_in_view(ref_view, score_a_column, best_match_idx, &lca_array_length);
|
||||
|
||||
k = 0;
|
||||
|
||||
while ((k < lca_array_length) && (score_array[k] >= ecotag_threshold))
|
||||
k++;
|
||||
|
||||
if (k>0)
|
||||
{
|
||||
lca_array = obi_get_array_with_col_p_in_view(ref_view, lca_taxid_a_column, best_match_idx, &lca_array_length);
|
||||
if (j>0)
|
||||
{
|
||||
lca = obi_taxo_get_taxon_with_taxid(taxonomy, lca_taxid);
|
||||
if (lca == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting a taxon from a taxid when doing taxonomic assignment");
|
||||
return -1;
|
||||
}
|
||||
lca_in_array = obi_taxo_get_taxon_with_taxid(taxonomy, lca_array[k-1]);
|
||||
if (lca_in_array == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting a taxon from a taxid when doing taxonomic assignment");
|
||||
return -1;
|
||||
}
|
||||
lca = obi_taxo_get_lca(lca, lca_in_array);
|
||||
if (lca == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting the LCA of two taxa when doing taxonomic assignment");
|
||||
return -1;
|
||||
}
|
||||
lca_taxid = lca->taxid;
|
||||
}
|
||||
else
|
||||
{
|
||||
lca_taxid = lca_array[k-1];
|
||||
lca = obi_taxo_get_taxon_with_taxid(taxonomy, lca_taxid);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
taxid_best_match = obi_get_int_with_elt_idx_and_col_p_in_view(ref_view, ref_taxid_column, j, 0);
|
||||
best_match = obi_taxo_get_taxon_with_taxid(taxonomy, taxid_best_match);
|
||||
if (best_match == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting a taxon from a taxid when doing taxonomic assignment");
|
||||
return -1;
|
||||
}
|
||||
if (j>0)
|
||||
{
|
||||
lca = obi_taxo_get_lca(lca, best_match);
|
||||
lca_taxid = lca->taxid;
|
||||
}
|
||||
else
|
||||
{
|
||||
lca_taxid = taxid_best_match;
|
||||
lca = obi_taxo_get_taxon_with_taxid(taxonomy, lca_taxid);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (best_match_count > 0)
|
||||
{
|
||||
assigned = true;
|
||||
|
||||
// Get LCA name
|
||||
if (lca->preferred_name != NULL)
|
||||
lca_name = lca->preferred_name;
|
||||
else
|
||||
lca_name = lca->name;
|
||||
}
|
||||
else
|
||||
{
|
||||
assigned = false;
|
||||
lca_name = OBIStr_NA;
|
||||
lca_taxid = OBIInt_NA;
|
||||
best_match_ids = OBITuple_NA;
|
||||
score = OBIFloat_NA;
|
||||
}
|
||||
|
||||
// Print result in output view
|
||||
if (print_assignment_result(output_view, i,
|
||||
assigned_taxid_column, lca_taxid,
|
||||
assigned_name_column, lca_name,
|
||||
assigned_status_column, assigned,
|
||||
best_match_column, best_match_ids, best_match_ids_length,
|
||||
score_column, best_score
|
||||
) < 0)
|
||||
return -1;
|
||||
}
|
||||
|
||||
free(best_match_array);
|
||||
free(best_match_ids);
|
||||
|
||||
obi_close_taxonomy(taxonomy);
|
||||
obi_save_and_close_view(query_view);
|
||||
obi_save_and_close_view(ref_view);
|
||||
obi_save_and_close_view(output_view);
|
||||
|
||||
obi_close_dms(dms, false);
|
||||
obi_close_dms(ref_dms, false);
|
||||
obi_close_dms(taxo_dms, false);
|
||||
|
||||
fprintf(stderr,"\rDone : 100 %% ");
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
71
src/obi_ecotag.h
Executable file
71
src/obi_ecotag.h
Executable file
@ -0,0 +1,71 @@
|
||||
/*************************************************************************************************
|
||||
* Header file for functions for the taxonomic assignment of sequences *
|
||||
*************************************************************************************************/
|
||||
|
||||
/**
|
||||
* @file obi_ecotag.h
|
||||
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||
* @date November 15th 2018
|
||||
* @brief Header file for the functions for the taxonomic assignment of sequences.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef OBI_ECOTAG_H_
|
||||
#define OBI_ECOTAG_H_
|
||||
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
|
||||
|
||||
#define ECOTAG_TAXID_COLUMN_NAME "TAXID"
|
||||
#define ECOTAG_NAME_COLUMN_NAME "SCIENTIFIC_NAME"
|
||||
#define ECOTAG_STATUS_COLUMN_NAME "ID_STATUS"
|
||||
#define ECOTAG_BEST_MATCH_IDS_COLUMN_NAME "BEST_MATCH"
|
||||
#define ECOTAG_SCORE_COLUMN_NAME "BEST_IDENTITY"
|
||||
|
||||
|
||||
/**
|
||||
* @brief Taxonomic assignment of sequences.
|
||||
*
|
||||
* Note: The columns where the results are written are automatically named and created.
|
||||
*
|
||||
* @param dms_name The path to the DMS where the views are.
|
||||
* @param query_view_name The name of the view containing the query sequences.
|
||||
* @param ref_dms_name The name of the DMS containing the reference database.
|
||||
* @param ref_view_name The name of the view corresponding to the reference database as built by build_reference_db().
|
||||
* @param taxo_dms_name The name of the DMS containing the taxonomy associated with the reference database.
|
||||
* @param taxonomy_name The name of the taxonomy associated with the reference database.
|
||||
* @param output_view_name The name to give to the output view.
|
||||
* @param output_view_comments The comments to associate to the output view.
|
||||
* @param ecotag_threshold The threshold at which to assign.
|
||||
*
|
||||
* The algorithm works like this:
|
||||
* For each query sequence:
|
||||
* Align with reference database
|
||||
* Keep the indices of all the best matches
|
||||
* For each kept index, get the LCA at that threshold as stored in the reference database, then the LCA of those LCAs
|
||||
* Write result (max score, threshold, taxid and scientific name of the LCA assigned, list of the ids of the best matches)
|
||||
*
|
||||
* @returns A value indicating the success of the operation.
|
||||
* @retval 0 if the operation was successfully completed.
|
||||
* @retval -1 if an error occurred.
|
||||
*
|
||||
* @since November 2018
|
||||
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||
*/
|
||||
int obi_ecotag(const char* dms_name,
|
||||
const char* query_view_name,
|
||||
const char* ref_dms_name,
|
||||
const char* ref_view_name,
|
||||
const char* taxo_dms_name,
|
||||
const char* taxonomy_name,
|
||||
const char* output_view_name,
|
||||
const char* output_view_comments,
|
||||
double ecotag_threshold);
|
||||
|
||||
|
||||
#endif /* OBI_ECOTAG_H_ */
|
||||
|
@ -130,6 +130,8 @@ extern int obi_errno;
|
||||
*/
|
||||
#define OBIVIEW_ALREADY_EXISTS_ERROR (35) /** Tried to create a new view with a name already existing in the DMS.
|
||||
*/
|
||||
#define OBI_ECOTAG_ERROR (36) /** Tried to create a new view with a name already existing in the DMS.
|
||||
*/
|
||||
/**@}*/
|
||||
|
||||
#endif /* OBIERRNO_H_ */
|
||||
|
Reference in New Issue
Block a user