ecotag: added separate threshold for minimum circle identity (and switch
to version 3.0.0b35
This commit is contained in:
@ -41,6 +41,17 @@ def addOptions(parser):
|
|||||||
help="Minimum identity to consider for assignment, as a normalized identity, e.g. 0.95 for an identity of 95%%. "
|
help="Minimum identity to consider for assignment, as a normalized identity, e.g. 0.95 for an identity of 95%%. "
|
||||||
"Default: 0.00 (no threshold).")
|
"Default: 0.00 (no threshold).")
|
||||||
|
|
||||||
|
group.add_argument('--minimum-circle','-c',
|
||||||
|
action="store", dest="ecotag:bubble_threshold",
|
||||||
|
metavar='<CIRCLE_THRESHOLD>',
|
||||||
|
default=0.99,
|
||||||
|
type=float,
|
||||||
|
help="Minimum identity considered for the assignment circle "
|
||||||
|
"(sequence is assigned to the LCA of all sequences within a similarity circle of the best matches; "
|
||||||
|
"the threshold for this circle is the highest value between <CIRCLE_THRESHOLD> and the best assignment score found). "
|
||||||
|
"Give value as a normalized identity, e.g. 0.95 for an identity of 95%%. "
|
||||||
|
"Default: 0.99.")
|
||||||
|
|
||||||
def run(config):
|
def run(config):
|
||||||
|
|
||||||
DMS.obi_atexit()
|
DMS.obi_atexit()
|
||||||
@ -66,9 +77,8 @@ def run(config):
|
|||||||
ref_view_name = ref[1]
|
ref_view_name = ref[1]
|
||||||
|
|
||||||
# Check that the threshold demanded is greater than or equal to the threshold used to build the reference database
|
# Check that the threshold demanded is greater than or equal to the threshold used to build the reference database
|
||||||
if config['ecotag']['threshold'] < eval(ref_dms[ref_view_name].comments["ref_db_threshold"]) :
|
if config['ecotag']['bubble_threshold'] < eval(ref_dms[ref_view_name].comments["ref_db_threshold"]) :
|
||||||
print("Error: The threshold demanded (%f) is lower than the threshold used to build the reference database (%f).",
|
raise Exception(f"Error: The threshold demanded ({config['ecotag']['bubble_threshold']}) is lower than the threshold used to build the reference database ({float(ref_dms[ref_view_name].comments['ref_db_threshold'])}).")
|
||||||
config['ecotag']['threshold'], ref_dms[ref_view_name].comments["ref_db_threshold"])
|
|
||||||
|
|
||||||
# Open the output: only the DMS
|
# Open the output: only the DMS
|
||||||
output = open_uri(config['obi']['outputURI'],
|
output = open_uri(config['obi']['outputURI'],
|
||||||
@ -113,8 +123,9 @@ def run(config):
|
|||||||
if obi_ecotag(i_dms.name_with_full_path, tobytes(i_view_name), \
|
if obi_ecotag(i_dms.name_with_full_path, tobytes(i_view_name), \
|
||||||
ref_dms.name_with_full_path, tobytes(ref_view_name), \
|
ref_dms.name_with_full_path, tobytes(ref_view_name), \
|
||||||
taxo_dms.name_with_full_path, tobytes(taxonomy_name), \
|
taxo_dms.name_with_full_path, tobytes(taxonomy_name), \
|
||||||
tobytes(o_view_name), comments,
|
tobytes(o_view_name), comments, \
|
||||||
config['ecotag']['threshold']) < 0:
|
config['ecotag']['threshold'], \
|
||||||
|
config['ecotag']['bubble_threshold']) < 0:
|
||||||
raise Exception("Error running ecotag")
|
raise Exception("Error running ecotag")
|
||||||
|
|
||||||
# If the input and output DMS are not the same, export result view to output DMS
|
# If the input and output DMS are not the same, export result view to output DMS
|
||||||
|
@ -11,4 +11,5 @@ cdef extern from "obi_ecotag.h" nogil:
|
|||||||
const char* taxonomy_name,
|
const char* taxonomy_name,
|
||||||
const char* output_view_name,
|
const char* output_view_name,
|
||||||
const char* output_view_comments,
|
const char* output_view_comments,
|
||||||
double ecotag_threshold)
|
double ecotag_threshold,
|
||||||
|
double bubble_threshold)
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
major = 3
|
major = 3
|
||||||
minor = 0
|
minor = 0
|
||||||
serial= '0b34'
|
serial= '0b35'
|
||||||
|
|
||||||
version ="%d.%d.%s" % (major,minor,serial)
|
version ="%d.%d.%s" % (major,minor,serial)
|
||||||
|
@ -218,7 +218,8 @@ int obi_ecotag(const char* dms_name,
|
|||||||
const char* taxonomy_name,
|
const char* taxonomy_name,
|
||||||
const char* output_view_name,
|
const char* output_view_name,
|
||||||
const char* output_view_comments,
|
const char* output_view_comments,
|
||||||
double ecotag_threshold) // TODO different threshold for the similarity sphere around ref seqs
|
double ecotag_threshold,
|
||||||
|
double bubble_threshold)
|
||||||
{
|
{
|
||||||
|
|
||||||
// For each sequence
|
// For each sequence
|
||||||
@ -239,6 +240,7 @@ int obi_ecotag(const char* dms_name,
|
|||||||
index_t query_seq_idx, ref_seq_idx;
|
index_t query_seq_idx, ref_seq_idx;
|
||||||
double score, best_score;
|
double score, best_score;
|
||||||
double threshold;
|
double threshold;
|
||||||
|
double lca_threshold;
|
||||||
int lcs_length;
|
int lcs_length;
|
||||||
int ali_length;
|
int ali_length;
|
||||||
Kmer_table_p ktable;
|
Kmer_table_p ktable;
|
||||||
@ -389,10 +391,10 @@ int obi_ecotag(const char* dms_name,
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
free(db_threshold_str);
|
free(db_threshold_str);
|
||||||
if (ecotag_threshold < db_threshold)
|
if (bubble_threshold < db_threshold)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "\nError: The threshold demanded (%f) is lower than the threshold used to build the reference database (%f).\n\n",
|
fprintf(stderr, "\nError: The threshold demanded (%f) is lower than the threshold used to build the reference database (%f).\n\n",
|
||||||
ecotag_threshold, db_threshold);
|
bubble_threshold, db_threshold);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -597,11 +599,16 @@ int obi_ecotag(const char* dms_name,
|
|||||||
{
|
{
|
||||||
best_match_idx = best_match_array[j];
|
best_match_idx = best_match_array[j];
|
||||||
|
|
||||||
// Find the LCA for the chosen threshold
|
// Find the LCA for the highest threshold between best_score and the chosen bubble threshold
|
||||||
score_array = obi_get_array_with_col_p_in_view(ref_view, score_a_column, best_match_idx, &lca_array_length);
|
score_array = obi_get_array_with_col_p_in_view(ref_view, score_a_column, best_match_idx, &lca_array_length);
|
||||||
|
|
||||||
|
if (bubble_threshold < best_score)
|
||||||
|
lca_threshold = best_score;
|
||||||
|
else
|
||||||
|
lca_threshold = bubble_threshold;
|
||||||
|
|
||||||
k = 0;
|
k = 0;
|
||||||
while ((k < lca_array_length) && (score_array[k] >= best_score))
|
while ((k < lca_array_length) && (score_array[k] >= lca_threshold))
|
||||||
k++;
|
k++;
|
||||||
|
|
||||||
if (k>0)
|
if (k>0)
|
||||||
|
@ -42,12 +42,14 @@
|
|||||||
* @param output_view_name The name to give to the output view.
|
* @param output_view_name The name to give to the output view.
|
||||||
* @param output_view_comments The comments to associate to the output view.
|
* @param output_view_comments The comments to associate to the output view.
|
||||||
* @param ecotag_threshold The threshold at which to assign.
|
* @param ecotag_threshold The threshold at which to assign.
|
||||||
|
* @param bubble_threshold The threshold at which to look for an LCA (i.e. minimum identity considered for the assignment circle);
|
||||||
|
* the threshold actually used will be the highest between this value and the best assignment score found.
|
||||||
*
|
*
|
||||||
* The algorithm works like this:
|
* The algorithm works like this:
|
||||||
* For each query sequence:
|
* For each query sequence:
|
||||||
* Align with reference database
|
* Align with reference database
|
||||||
* Keep the indices of all the best matches
|
* Keep the indices of all the best matches
|
||||||
* For each kept index, get the LCA at that threshold as stored in the reference database, then the LCA of those LCAs
|
* For each kept index, get the LCA at the highest threshold between bubble_threshold and the best assignment score found (as stored in the reference database), then the LCA of those LCAs
|
||||||
* Write result (max score, threshold, taxid and scientific name of the LCA assigned, list of the ids of the best matches)
|
* Write result (max score, threshold, taxid and scientific name of the LCA assigned, list of the ids of the best matches)
|
||||||
*
|
*
|
||||||
* @returns A value indicating the success of the operation.
|
* @returns A value indicating the success of the operation.
|
||||||
@ -65,7 +67,8 @@ int obi_ecotag(const char* dms_name,
|
|||||||
const char* taxonomy_name,
|
const char* taxonomy_name,
|
||||||
const char* output_view_name,
|
const char* output_view_name,
|
||||||
const char* output_view_comments,
|
const char* output_view_comments,
|
||||||
double ecotag_threshold);
|
double ecotag_threshold,
|
||||||
|
double bubble_threshold);
|
||||||
|
|
||||||
|
|
||||||
#endif /* OBI_ECOTAG_H_ */
|
#endif /* OBI_ECOTAG_H_ */
|
||||||
|
Reference in New Issue
Block a user