ecotag: added separate threshold for minimum circle identity (and switch

to version 3.0.0b35
This commit is contained in:
Celine Mercier
2020-09-25 16:22:09 +02:00
parent 11a0945a9b
commit c4696ac865
5 changed files with 36 additions and 14 deletions

View File

@ -41,6 +41,17 @@ def addOptions(parser):
help="Minimum identity to consider for assignment, as a normalized identity, e.g. 0.95 for an identity of 95%%. "
"Default: 0.00 (no threshold).")
group.add_argument('--minimum-circle','-c',
action="store", dest="ecotag:bubble_threshold",
metavar='<CIRCLE_THRESHOLD>',
default=0.99,
type=float,
help="Minimum identity considered for the assignment circle "
"(sequence is assigned to the LCA of all sequences within a similarity circle of the best matches; "
"the threshold for this circle is the highest value between <CIRCLE_THRESHOLD> and the best assignment score found). "
"Give value as a normalized identity, e.g. 0.95 for an identity of 95%%. "
"Default: 0.99.")
def run(config):
DMS.obi_atexit()
@ -66,9 +77,8 @@ def run(config):
ref_view_name = ref[1]
# Check that the threshold demanded is greater than or equal to the threshold used to build the reference database
if config['ecotag']['threshold'] < eval(ref_dms[ref_view_name].comments["ref_db_threshold"]) :
print("Error: The threshold demanded (%f) is lower than the threshold used to build the reference database (%f).",
config['ecotag']['threshold'], ref_dms[ref_view_name].comments["ref_db_threshold"])
if config['ecotag']['bubble_threshold'] < eval(ref_dms[ref_view_name].comments["ref_db_threshold"]) :
raise Exception(f"Error: The threshold demanded ({config['ecotag']['bubble_threshold']}) is lower than the threshold used to build the reference database ({float(ref_dms[ref_view_name].comments['ref_db_threshold'])}).")
# Open the output: only the DMS
output = open_uri(config['obi']['outputURI'],
@ -113,8 +123,9 @@ def run(config):
if obi_ecotag(i_dms.name_with_full_path, tobytes(i_view_name), \
ref_dms.name_with_full_path, tobytes(ref_view_name), \
taxo_dms.name_with_full_path, tobytes(taxonomy_name), \
tobytes(o_view_name), comments,
config['ecotag']['threshold']) < 0:
tobytes(o_view_name), comments, \
config['ecotag']['threshold'], \
config['ecotag']['bubble_threshold']) < 0:
raise Exception("Error running ecotag")
# If the input and output DMS are not the same, export result view to output DMS

View File

@ -11,4 +11,5 @@ cdef extern from "obi_ecotag.h" nogil:
const char* taxonomy_name,
const char* output_view_name,
const char* output_view_comments,
double ecotag_threshold)
double ecotag_threshold,
double bubble_threshold)

View File

@ -1,5 +1,5 @@
major = 3
minor = 0
serial= '0b34'
serial= '0b35'
version ="%d.%d.%s" % (major,minor,serial)

View File

@ -218,7 +218,8 @@ int obi_ecotag(const char* dms_name,
const char* taxonomy_name,
const char* output_view_name,
const char* output_view_comments,
double ecotag_threshold) // TODO different threshold for the similarity sphere around ref seqs
double ecotag_threshold,
double bubble_threshold)
{
// For each sequence
@ -239,6 +240,7 @@ int obi_ecotag(const char* dms_name,
index_t query_seq_idx, ref_seq_idx;
double score, best_score;
double threshold;
double lca_threshold;
int lcs_length;
int ali_length;
Kmer_table_p ktable;
@ -389,10 +391,10 @@ int obi_ecotag(const char* dms_name,
return -1;
}
free(db_threshold_str);
if (ecotag_threshold < db_threshold)
if (bubble_threshold < db_threshold)
{
fprintf(stderr, "\nError: The threshold demanded (%f) is lower than the threshold used to build the reference database (%f).\n\n",
ecotag_threshold, db_threshold);
bubble_threshold, db_threshold);
return -1;
}
@ -597,11 +599,16 @@ int obi_ecotag(const char* dms_name,
{
best_match_idx = best_match_array[j];
// Find the LCA for the chosen threshold
// Find the LCA for the highest threshold between best_score and the chosen bubble threshold
score_array = obi_get_array_with_col_p_in_view(ref_view, score_a_column, best_match_idx, &lca_array_length);
if (bubble_threshold < best_score)
lca_threshold = best_score;
else
lca_threshold = bubble_threshold;
k = 0;
while ((k < lca_array_length) && (score_array[k] >= best_score))
while ((k < lca_array_length) && (score_array[k] >= lca_threshold))
k++;
if (k>0)

View File

@ -42,12 +42,14 @@
* @param output_view_name The name to give to the output view.
* @param output_view_comments The comments to associate to the output view.
* @param ecotag_threshold The threshold at which to assign.
* @param bubble_threshold The threshold at which to look for an LCA (i.e. minimum identity considered for the assignment circle);
* the threshold actually used will be the highest between this value and the best assignment score found.
*
* The algorithm works like this:
* For each query sequence:
* Align with reference database
* Keep the indices of all the best matches
* For each kept index, get the LCA at that threshold as stored in the reference database, then the LCA of those LCAs
* For each kept index, get the LCA at the highest threshold between bubble_threshold and the best assignment score found (as stored in the reference database), then the LCA of those LCAs
* Write result (max score, threshold, taxid and scientific name of the LCA assigned, list of the ids of the best matches)
*
* @returns A value indicating the success of the operation.
@ -65,7 +67,8 @@ int obi_ecotag(const char* dms_name,
const char* taxonomy_name,
const char* output_view_name,
const char* output_view_comments,
double ecotag_threshold);
double ecotag_threshold,
double bubble_threshold);
#endif /* OBI_ECOTAG_H_ */