Compare commits
3 Commits
Author | SHA1 | Date | |
---|---|---|---|
c4696ac865 | |||
11a0945a9b | |||
f23c40c905 |
@ -4,7 +4,7 @@ from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
|
||||
from obitools3.dms import DMS
|
||||
from obitools3.dms.view.view cimport View
|
||||
from obitools3.uri.decode import open_uri
|
||||
from obitools3.apps.optiongroups import addMinimalOutputOption
|
||||
from obitools3.apps.optiongroups import addMinimalOutputOption, addNoProgressBarOption
|
||||
from obitools3.dms.view import RollbackException
|
||||
from obitools3.apps.config import logger
|
||||
from obitools3.utils cimport str2bytes
|
||||
@ -28,6 +28,7 @@ __title__="Concatenate views."
|
||||
def addOptions(parser):
|
||||
|
||||
addMinimalOutputOption(parser)
|
||||
addNoProgressBarOption(parser)
|
||||
|
||||
group=parser.add_argument_group('obi cat specific options')
|
||||
|
||||
@ -47,9 +48,9 @@ def run(config):
|
||||
|
||||
logger("info", "obi cat")
|
||||
|
||||
# Open the views to concatenate
|
||||
iview_list = []
|
||||
# Check the views to concatenate
|
||||
idms_list = []
|
||||
iview_list = []
|
||||
total_len = 0
|
||||
remove_qual = False
|
||||
remove_rev_qual = False
|
||||
@ -67,8 +68,9 @@ def run(config):
|
||||
if REVERSE_QUALITY_COLUMN not in i_view: # same as above for reverse quality
|
||||
remove_rev_qual = True
|
||||
total_len += len(i_view)
|
||||
iview_list.append(i_view)
|
||||
idms_list.append(i_dms)
|
||||
iview_list.append(i_view.name)
|
||||
i_view.close()
|
||||
|
||||
# Open the output: only the DMS
|
||||
output = open_uri(config['obi']['outputURI'],
|
||||
@ -97,8 +99,10 @@ def run(config):
|
||||
# Initialize multiple elements columns
|
||||
if type(output_0)==BufferedWriter:
|
||||
dict_cols = {}
|
||||
for v in iview_list:
|
||||
for v_uri in config["cat"]["views_to_cat"]:
|
||||
v = open_uri(v_uri)[1]
|
||||
for coln in v.keys():
|
||||
col = v[coln]
|
||||
if v[coln].nb_elements_per_line > 1:
|
||||
if coln not in dict_cols:
|
||||
dict_cols[coln] = {}
|
||||
@ -108,6 +112,7 @@ def run(config):
|
||||
else:
|
||||
dict_cols[coln]['eltnames'] = set(v[coln].elements_names + list(dict_cols[coln]['eltnames']))
|
||||
dict_cols[coln]['nbelts'] = len(dict_cols[coln]['eltnames'])
|
||||
v.close()
|
||||
for coln in dict_cols:
|
||||
Column.new_column(o_view, coln, dict_cols[coln]['obitype'],
|
||||
nb_elements_per_line=dict_cols[coln]['nbelts'], elements_names=list(dict_cols[coln]['eltnames']))
|
||||
@ -119,7 +124,8 @@ def run(config):
|
||||
pb = None
|
||||
|
||||
i = 0
|
||||
for v in iview_list:
|
||||
for v_uri in config["cat"]["views_to_cat"]:
|
||||
v = open_uri(v_uri)[1]
|
||||
for entry in v:
|
||||
PyErr_CheckSignals()
|
||||
if pb is not None:
|
||||
@ -130,6 +136,7 @@ def run(config):
|
||||
else:
|
||||
o_view[i] = entry
|
||||
i+=1
|
||||
v.close()
|
||||
|
||||
# Deletes quality columns if needed
|
||||
if type(output_0)!=BufferedWriter:
|
||||
@ -144,7 +151,7 @@ def run(config):
|
||||
|
||||
# Save command config in DMS comments
|
||||
command_line = " ".join(sys.argv[1:])
|
||||
o_view.write_config(config, "cat", command_line, input_dms_name=[d.name for d in idms_list], input_view_name=[v.name for v in iview_list])
|
||||
o_view.write_config(config, "cat", command_line, input_dms_name=[d.name for d in idms_list], input_view_name=[vname for vname in iview_list])
|
||||
o_dms.record_command_line(command_line)
|
||||
|
||||
#print("\n\nOutput view:\n````````````", file=sys.stderr)
|
||||
|
@ -41,6 +41,17 @@ def addOptions(parser):
|
||||
help="Minimum identity to consider for assignment, as a normalized identity, e.g. 0.95 for an identity of 95%%. "
|
||||
"Default: 0.00 (no threshold).")
|
||||
|
||||
group.add_argument('--minimum-circle','-c',
|
||||
action="store", dest="ecotag:bubble_threshold",
|
||||
metavar='<CIRCLE_THRESHOLD>',
|
||||
default=0.99,
|
||||
type=float,
|
||||
help="Minimum identity considered for the assignment circle "
|
||||
"(sequence is assigned to the LCA of all sequences within a similarity circle of the best matches; "
|
||||
"the threshold for this circle is the highest value between <CIRCLE_THRESHOLD> and the best assignment score found). "
|
||||
"Give value as a normalized identity, e.g. 0.95 for an identity of 95%%. "
|
||||
"Default: 0.99.")
|
||||
|
||||
def run(config):
|
||||
|
||||
DMS.obi_atexit()
|
||||
@ -66,9 +77,8 @@ def run(config):
|
||||
ref_view_name = ref[1]
|
||||
|
||||
# Check that the threshold demanded is greater than or equal to the threshold used to build the reference database
|
||||
if config['ecotag']['threshold'] < eval(ref_dms[ref_view_name].comments["ref_db_threshold"]) :
|
||||
print("Error: The threshold demanded (%f) is lower than the threshold used to build the reference database (%f).",
|
||||
config['ecotag']['threshold'], ref_dms[ref_view_name].comments["ref_db_threshold"])
|
||||
if config['ecotag']['bubble_threshold'] < eval(ref_dms[ref_view_name].comments["ref_db_threshold"]) :
|
||||
raise Exception(f"Error: The threshold demanded ({config['ecotag']['bubble_threshold']}) is lower than the threshold used to build the reference database ({float(ref_dms[ref_view_name].comments['ref_db_threshold'])}).")
|
||||
|
||||
# Open the output: only the DMS
|
||||
output = open_uri(config['obi']['outputURI'],
|
||||
@ -113,8 +123,9 @@ def run(config):
|
||||
if obi_ecotag(i_dms.name_with_full_path, tobytes(i_view_name), \
|
||||
ref_dms.name_with_full_path, tobytes(ref_view_name), \
|
||||
taxo_dms.name_with_full_path, tobytes(taxonomy_name), \
|
||||
tobytes(o_view_name), comments,
|
||||
config['ecotag']['threshold']) < 0:
|
||||
tobytes(o_view_name), comments, \
|
||||
config['ecotag']['threshold'], \
|
||||
config['ecotag']['bubble_threshold']) < 0:
|
||||
raise Exception("Error running ecotag")
|
||||
|
||||
# If the input and output DMS are not the same, export result view to output DMS
|
||||
|
@ -11,4 +11,5 @@ cdef extern from "obi_ecotag.h" nogil:
|
||||
const char* taxonomy_name,
|
||||
const char* output_view_name,
|
||||
const char* output_view_comments,
|
||||
double ecotag_threshold)
|
||||
double ecotag_threshold,
|
||||
double bubble_threshold)
|
||||
|
@ -1,5 +1,5 @@
|
||||
major = 3
|
||||
minor = 0
|
||||
serial= '0b32'
|
||||
serial= '0b35'
|
||||
|
||||
version ="%d.%d.%s" % (major,minor,serial)
|
||||
|
@ -218,7 +218,8 @@ int obi_ecotag(const char* dms_name,
|
||||
const char* taxonomy_name,
|
||||
const char* output_view_name,
|
||||
const char* output_view_comments,
|
||||
double ecotag_threshold) // TODO different threshold for the similarity sphere around ref seqs
|
||||
double ecotag_threshold,
|
||||
double bubble_threshold)
|
||||
{
|
||||
|
||||
// For each sequence
|
||||
@ -239,6 +240,7 @@ int obi_ecotag(const char* dms_name,
|
||||
index_t query_seq_idx, ref_seq_idx;
|
||||
double score, best_score;
|
||||
double threshold;
|
||||
double lca_threshold;
|
||||
int lcs_length;
|
||||
int ali_length;
|
||||
Kmer_table_p ktable;
|
||||
@ -389,10 +391,10 @@ int obi_ecotag(const char* dms_name,
|
||||
return -1;
|
||||
}
|
||||
free(db_threshold_str);
|
||||
if (ecotag_threshold < db_threshold)
|
||||
if (bubble_threshold < db_threshold)
|
||||
{
|
||||
fprintf(stderr, "\nError: The threshold demanded (%f) is lower than the threshold used to build the reference database (%f).\n\n",
|
||||
ecotag_threshold, db_threshold);
|
||||
bubble_threshold, db_threshold);
|
||||
return -1;
|
||||
}
|
||||
|
||||
@ -597,11 +599,16 @@ int obi_ecotag(const char* dms_name,
|
||||
{
|
||||
best_match_idx = best_match_array[j];
|
||||
|
||||
// Find the LCA for the chosen threshold
|
||||
// Find the LCA for the highest threshold between best_score and the chosen bubble threshold
|
||||
score_array = obi_get_array_with_col_p_in_view(ref_view, score_a_column, best_match_idx, &lca_array_length);
|
||||
|
||||
if (bubble_threshold < best_score)
|
||||
lca_threshold = best_score;
|
||||
else
|
||||
lca_threshold = bubble_threshold;
|
||||
|
||||
k = 0;
|
||||
while ((k < lca_array_length) && (score_array[k] >= best_score))
|
||||
while ((k < lca_array_length) && (score_array[k] >= lca_threshold))
|
||||
k++;
|
||||
|
||||
if (k>0)
|
||||
|
@ -42,12 +42,14 @@
|
||||
* @param output_view_name The name to give to the output view.
|
||||
* @param output_view_comments The comments to associate to the output view.
|
||||
* @param ecotag_threshold The threshold at which to assign.
|
||||
* @param bubble_threshold The threshold at which to look for an LCA (i.e. minimum identity considered for the assignment circle);
|
||||
* the threshold actually used will be the highest between this value and the best assignment score found.
|
||||
*
|
||||
* The algorithm works like this:
|
||||
* For each query sequence:
|
||||
* Align with reference database
|
||||
* Keep the indices of all the best matches
|
||||
* For each kept index, get the LCA at that threshold as stored in the reference database, then the LCA of those LCAs
|
||||
* For each kept index, get the LCA at the highest threshold between bubble_threshold and the best assignment score found (as stored in the reference database), then the LCA of those LCAs
|
||||
* Write result (max score, threshold, taxid and scientific name of the LCA assigned, list of the ids of the best matches)
|
||||
*
|
||||
* @returns A value indicating the success of the operation.
|
||||
@ -65,7 +67,8 @@ int obi_ecotag(const char* dms_name,
|
||||
const char* taxonomy_name,
|
||||
const char* output_view_name,
|
||||
const char* output_view_comments,
|
||||
double ecotag_threshold);
|
||||
double ecotag_threshold,
|
||||
double bubble_threshold);
|
||||
|
||||
|
||||
#endif /* OBI_ECOTAG_H_ */
|
||||
|
Reference in New Issue
Block a user