Compare commits

...

3 Commits

6 changed files with 50 additions and 21 deletions

View File

@ -4,7 +4,7 @@ from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.dms import DMS from obitools3.dms import DMS
from obitools3.dms.view.view cimport View from obitools3.dms.view.view cimport View
from obitools3.uri.decode import open_uri from obitools3.uri.decode import open_uri
from obitools3.apps.optiongroups import addMinimalOutputOption from obitools3.apps.optiongroups import addMinimalOutputOption, addNoProgressBarOption
from obitools3.dms.view import RollbackException from obitools3.dms.view import RollbackException
from obitools3.apps.config import logger from obitools3.apps.config import logger
from obitools3.utils cimport str2bytes from obitools3.utils cimport str2bytes
@ -28,6 +28,7 @@ __title__="Concatenate views."
def addOptions(parser): def addOptions(parser):
addMinimalOutputOption(parser) addMinimalOutputOption(parser)
addNoProgressBarOption(parser)
group=parser.add_argument_group('obi cat specific options') group=parser.add_argument_group('obi cat specific options')
@ -47,9 +48,9 @@ def run(config):
logger("info", "obi cat") logger("info", "obi cat")
# Open the views to concatenate # Check the views to concatenate
iview_list = []
idms_list = [] idms_list = []
iview_list = []
total_len = 0 total_len = 0
remove_qual = False remove_qual = False
remove_rev_qual = False remove_rev_qual = False
@ -67,8 +68,9 @@ def run(config):
if REVERSE_QUALITY_COLUMN not in i_view: # same as above for reverse quality if REVERSE_QUALITY_COLUMN not in i_view: # same as above for reverse quality
remove_rev_qual = True remove_rev_qual = True
total_len += len(i_view) total_len += len(i_view)
iview_list.append(i_view)
idms_list.append(i_dms) idms_list.append(i_dms)
iview_list.append(i_view.name)
i_view.close()
# Open the output: only the DMS # Open the output: only the DMS
output = open_uri(config['obi']['outputURI'], output = open_uri(config['obi']['outputURI'],
@ -97,8 +99,10 @@ def run(config):
# Initialize multiple elements columns # Initialize multiple elements columns
if type(output_0)==BufferedWriter: if type(output_0)==BufferedWriter:
dict_cols = {} dict_cols = {}
for v in iview_list: for v_uri in config["cat"]["views_to_cat"]:
v = open_uri(v_uri)[1]
for coln in v.keys(): for coln in v.keys():
col = v[coln]
if v[coln].nb_elements_per_line > 1: if v[coln].nb_elements_per_line > 1:
if coln not in dict_cols: if coln not in dict_cols:
dict_cols[coln] = {} dict_cols[coln] = {}
@ -108,6 +112,7 @@ def run(config):
else: else:
dict_cols[coln]['eltnames'] = set(v[coln].elements_names + list(dict_cols[coln]['eltnames'])) dict_cols[coln]['eltnames'] = set(v[coln].elements_names + list(dict_cols[coln]['eltnames']))
dict_cols[coln]['nbelts'] = len(dict_cols[coln]['eltnames']) dict_cols[coln]['nbelts'] = len(dict_cols[coln]['eltnames'])
v.close()
for coln in dict_cols: for coln in dict_cols:
Column.new_column(o_view, coln, dict_cols[coln]['obitype'], Column.new_column(o_view, coln, dict_cols[coln]['obitype'],
nb_elements_per_line=dict_cols[coln]['nbelts'], elements_names=list(dict_cols[coln]['eltnames'])) nb_elements_per_line=dict_cols[coln]['nbelts'], elements_names=list(dict_cols[coln]['eltnames']))
@ -119,7 +124,8 @@ def run(config):
pb = None pb = None
i = 0 i = 0
for v in iview_list: for v_uri in config["cat"]["views_to_cat"]:
v = open_uri(v_uri)[1]
for entry in v: for entry in v:
PyErr_CheckSignals() PyErr_CheckSignals()
if pb is not None: if pb is not None:
@ -130,6 +136,7 @@ def run(config):
else: else:
o_view[i] = entry o_view[i] = entry
i+=1 i+=1
v.close()
# Deletes quality columns if needed # Deletes quality columns if needed
if type(output_0)!=BufferedWriter: if type(output_0)!=BufferedWriter:
@ -144,7 +151,7 @@ def run(config):
# Save command config in DMS comments # Save command config in DMS comments
command_line = " ".join(sys.argv[1:]) command_line = " ".join(sys.argv[1:])
o_view.write_config(config, "cat", command_line, input_dms_name=[d.name for d in idms_list], input_view_name=[v.name for v in iview_list]) o_view.write_config(config, "cat", command_line, input_dms_name=[d.name for d in idms_list], input_view_name=[vname for vname in iview_list])
o_dms.record_command_line(command_line) o_dms.record_command_line(command_line)
#print("\n\nOutput view:\n````````````", file=sys.stderr) #print("\n\nOutput view:\n````````````", file=sys.stderr)

View File

@ -41,6 +41,17 @@ def addOptions(parser):
help="Minimum identity to consider for assignment, as a normalized identity, e.g. 0.95 for an identity of 95%%. " help="Minimum identity to consider for assignment, as a normalized identity, e.g. 0.95 for an identity of 95%%. "
"Default: 0.00 (no threshold).") "Default: 0.00 (no threshold).")
group.add_argument('--minimum-circle','-c',
action="store", dest="ecotag:bubble_threshold",
metavar='<CIRCLE_THRESHOLD>',
default=0.99,
type=float,
help="Minimum identity considered for the assignment circle "
"(sequence is assigned to the LCA of all sequences within a similarity circle of the best matches; "
"the threshold for this circle is the highest value between <CIRCLE_THRESHOLD> and the best assignment score found). "
"Give value as a normalized identity, e.g. 0.95 for an identity of 95%%. "
"Default: 0.99.")
def run(config): def run(config):
DMS.obi_atexit() DMS.obi_atexit()
@ -66,9 +77,8 @@ def run(config):
ref_view_name = ref[1] ref_view_name = ref[1]
# Check that the threshold demanded is greater than or equal to the threshold used to build the reference database # Check that the threshold demanded is greater than or equal to the threshold used to build the reference database
if config['ecotag']['threshold'] < eval(ref_dms[ref_view_name].comments["ref_db_threshold"]) : if config['ecotag']['bubble_threshold'] < eval(ref_dms[ref_view_name].comments["ref_db_threshold"]) :
print("Error: The threshold demanded (%f) is lower than the threshold used to build the reference database (%f).", raise Exception(f"Error: The threshold demanded ({config['ecotag']['bubble_threshold']}) is lower than the threshold used to build the reference database ({float(ref_dms[ref_view_name].comments['ref_db_threshold'])}).")
config['ecotag']['threshold'], ref_dms[ref_view_name].comments["ref_db_threshold"])
# Open the output: only the DMS # Open the output: only the DMS
output = open_uri(config['obi']['outputURI'], output = open_uri(config['obi']['outputURI'],
@ -113,8 +123,9 @@ def run(config):
if obi_ecotag(i_dms.name_with_full_path, tobytes(i_view_name), \ if obi_ecotag(i_dms.name_with_full_path, tobytes(i_view_name), \
ref_dms.name_with_full_path, tobytes(ref_view_name), \ ref_dms.name_with_full_path, tobytes(ref_view_name), \
taxo_dms.name_with_full_path, tobytes(taxonomy_name), \ taxo_dms.name_with_full_path, tobytes(taxonomy_name), \
tobytes(o_view_name), comments, tobytes(o_view_name), comments, \
config['ecotag']['threshold']) < 0: config['ecotag']['threshold'], \
config['ecotag']['bubble_threshold']) < 0:
raise Exception("Error running ecotag") raise Exception("Error running ecotag")
# If the input and output DMS are not the same, export result view to output DMS # If the input and output DMS are not the same, export result view to output DMS

View File

@ -11,4 +11,5 @@ cdef extern from "obi_ecotag.h" nogil:
const char* taxonomy_name, const char* taxonomy_name,
const char* output_view_name, const char* output_view_name,
const char* output_view_comments, const char* output_view_comments,
double ecotag_threshold) double ecotag_threshold,
double bubble_threshold)

View File

@ -1,5 +1,5 @@
major = 3 major = 3
minor = 0 minor = 0
serial= '0b32' serial= '0b35'
version ="%d.%d.%s" % (major,minor,serial) version ="%d.%d.%s" % (major,minor,serial)

View File

@ -218,7 +218,8 @@ int obi_ecotag(const char* dms_name,
const char* taxonomy_name, const char* taxonomy_name,
const char* output_view_name, const char* output_view_name,
const char* output_view_comments, const char* output_view_comments,
double ecotag_threshold) // TODO different threshold for the similarity sphere around ref seqs double ecotag_threshold,
double bubble_threshold)
{ {
// For each sequence // For each sequence
@ -239,6 +240,7 @@ int obi_ecotag(const char* dms_name,
index_t query_seq_idx, ref_seq_idx; index_t query_seq_idx, ref_seq_idx;
double score, best_score; double score, best_score;
double threshold; double threshold;
double lca_threshold;
int lcs_length; int lcs_length;
int ali_length; int ali_length;
Kmer_table_p ktable; Kmer_table_p ktable;
@ -389,10 +391,10 @@ int obi_ecotag(const char* dms_name,
return -1; return -1;
} }
free(db_threshold_str); free(db_threshold_str);
if (ecotag_threshold < db_threshold) if (bubble_threshold < db_threshold)
{ {
fprintf(stderr, "\nError: The threshold demanded (%f) is lower than the threshold used to build the reference database (%f).\n\n", fprintf(stderr, "\nError: The threshold demanded (%f) is lower than the threshold used to build the reference database (%f).\n\n",
ecotag_threshold, db_threshold); bubble_threshold, db_threshold);
return -1; return -1;
} }
@ -597,11 +599,16 @@ int obi_ecotag(const char* dms_name,
{ {
best_match_idx = best_match_array[j]; best_match_idx = best_match_array[j];
// Find the LCA for the chosen threshold // Find the LCA for the highest threshold between best_score and the chosen bubble threshold
score_array = obi_get_array_with_col_p_in_view(ref_view, score_a_column, best_match_idx, &lca_array_length); score_array = obi_get_array_with_col_p_in_view(ref_view, score_a_column, best_match_idx, &lca_array_length);
if (bubble_threshold < best_score)
lca_threshold = best_score;
else
lca_threshold = bubble_threshold;
k = 0; k = 0;
while ((k < lca_array_length) && (score_array[k] >= best_score)) while ((k < lca_array_length) && (score_array[k] >= lca_threshold))
k++; k++;
if (k>0) if (k>0)

View File

@ -42,12 +42,14 @@
* @param output_view_name The name to give to the output view. * @param output_view_name The name to give to the output view.
* @param output_view_comments The comments to associate to the output view. * @param output_view_comments The comments to associate to the output view.
* @param ecotag_threshold The threshold at which to assign. * @param ecotag_threshold The threshold at which to assign.
* @param bubble_threshold The threshold at which to look for an LCA (i.e. minimum identity considered for the assignment circle);
* the threshold actually used will be the highest between this value and the best assignment score found.
* *
* The algorithm works like this: * The algorithm works like this:
* For each query sequence: * For each query sequence:
* Align with reference database * Align with reference database
* Keep the indices of all the best matches * Keep the indices of all the best matches
* For each kept index, get the LCA at that threshold as stored in the reference database, then the LCA of those LCAs * For each kept index, get the LCA at the highest threshold between bubble_threshold and the best assignment score found (as stored in the reference database), then the LCA of those LCAs
* Write result (max score, threshold, taxid and scientific name of the LCA assigned, list of the ids of the best matches) * Write result (max score, threshold, taxid and scientific name of the LCA assigned, list of the ids of the best matches)
* *
* @returns A value indicating the success of the operation. * @returns A value indicating the success of the operation.
@ -65,7 +67,8 @@ int obi_ecotag(const char* dms_name,
const char* taxonomy_name, const char* taxonomy_name,
const char* output_view_name, const char* output_view_name,
const char* output_view_comments, const char* output_view_comments,
double ecotag_threshold); double ecotag_threshold,
double bubble_threshold);
#endif /* OBI_ECOTAG_H_ */ #endif /* OBI_ECOTAG_H_ */