Compare commits

...

12 Commits

15 changed files with 118 additions and 51 deletions

View File

@ -4,7 +4,7 @@ from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.dms import DMS from obitools3.dms import DMS
from obitools3.dms.view.view cimport View from obitools3.dms.view.view cimport View
from obitools3.uri.decode import open_uri from obitools3.uri.decode import open_uri
from obitools3.apps.optiongroups import addMinimalOutputOption from obitools3.apps.optiongroups import addMinimalOutputOption, addNoProgressBarOption
from obitools3.dms.view import RollbackException from obitools3.dms.view import RollbackException
from obitools3.apps.config import logger from obitools3.apps.config import logger
from obitools3.utils cimport str2bytes from obitools3.utils cimport str2bytes
@ -28,6 +28,7 @@ __title__="Concatenate views."
def addOptions(parser): def addOptions(parser):
addMinimalOutputOption(parser) addMinimalOutputOption(parser)
addNoProgressBarOption(parser)
group=parser.add_argument_group('obi cat specific options') group=parser.add_argument_group('obi cat specific options')
@ -47,9 +48,9 @@ def run(config):
logger("info", "obi cat") logger("info", "obi cat")
# Open the views to concatenate # Check the views to concatenate
iview_list = []
idms_list = [] idms_list = []
iview_list = []
total_len = 0 total_len = 0
remove_qual = False remove_qual = False
remove_rev_qual = False remove_rev_qual = False
@ -67,8 +68,9 @@ def run(config):
if REVERSE_QUALITY_COLUMN not in i_view: # same as above for reverse quality if REVERSE_QUALITY_COLUMN not in i_view: # same as above for reverse quality
remove_rev_qual = True remove_rev_qual = True
total_len += len(i_view) total_len += len(i_view)
iview_list.append(i_view)
idms_list.append(i_dms) idms_list.append(i_dms)
iview_list.append(i_view.name)
i_view.close()
# Open the output: only the DMS # Open the output: only the DMS
output = open_uri(config['obi']['outputURI'], output = open_uri(config['obi']['outputURI'],
@ -97,8 +99,10 @@ def run(config):
# Initialize multiple elements columns # Initialize multiple elements columns
if type(output_0)==BufferedWriter: if type(output_0)==BufferedWriter:
dict_cols = {} dict_cols = {}
for v in iview_list: for v_uri in config["cat"]["views_to_cat"]:
v = open_uri(v_uri)[1]
for coln in v.keys(): for coln in v.keys():
col = v[coln]
if v[coln].nb_elements_per_line > 1: if v[coln].nb_elements_per_line > 1:
if coln not in dict_cols: if coln not in dict_cols:
dict_cols[coln] = {} dict_cols[coln] = {}
@ -108,6 +112,7 @@ def run(config):
else: else:
dict_cols[coln]['eltnames'] = set(v[coln].elements_names + list(dict_cols[coln]['eltnames'])) dict_cols[coln]['eltnames'] = set(v[coln].elements_names + list(dict_cols[coln]['eltnames']))
dict_cols[coln]['nbelts'] = len(dict_cols[coln]['eltnames']) dict_cols[coln]['nbelts'] = len(dict_cols[coln]['eltnames'])
v.close()
for coln in dict_cols: for coln in dict_cols:
Column.new_column(o_view, coln, dict_cols[coln]['obitype'], Column.new_column(o_view, coln, dict_cols[coln]['obitype'],
nb_elements_per_line=dict_cols[coln]['nbelts'], elements_names=list(dict_cols[coln]['eltnames'])) nb_elements_per_line=dict_cols[coln]['nbelts'], elements_names=list(dict_cols[coln]['eltnames']))
@ -119,7 +124,8 @@ def run(config):
pb = None pb = None
i = 0 i = 0
for v in iview_list: for v_uri in config["cat"]["views_to_cat"]:
v = open_uri(v_uri)[1]
for entry in v: for entry in v:
PyErr_CheckSignals() PyErr_CheckSignals()
if pb is not None: if pb is not None:
@ -130,6 +136,7 @@ def run(config):
else: else:
o_view[i] = entry o_view[i] = entry
i+=1 i+=1
v.close()
# Deletes quality columns if needed # Deletes quality columns if needed
if type(output_0)!=BufferedWriter: if type(output_0)!=BufferedWriter:
@ -144,7 +151,7 @@ def run(config):
# Save command config in DMS comments # Save command config in DMS comments
command_line = " ".join(sys.argv[1:]) command_line = " ".join(sys.argv[1:])
o_view.write_config(config, "cat", command_line, input_dms_name=[d.name for d in idms_list], input_view_name=[v.name for v in iview_list]) o_view.write_config(config, "cat", command_line, input_dms_name=[d.name for d in idms_list], input_view_name=[vname for vname in iview_list])
o_dms.record_command_line(command_line) o_dms.record_command_line(command_line)
#print("\n\nOutput view:\n````````````", file=sys.stderr) #print("\n\nOutput view:\n````````````", file=sys.stderr)

View File

@ -41,6 +41,17 @@ def addOptions(parser):
help="Minimum identity to consider for assignment, as a normalized identity, e.g. 0.95 for an identity of 95%%. " help="Minimum identity to consider for assignment, as a normalized identity, e.g. 0.95 for an identity of 95%%. "
"Default: 0.00 (no threshold).") "Default: 0.00 (no threshold).")
group.add_argument('--minimum-circle','-c',
action="store", dest="ecotag:bubble_threshold",
metavar='<CIRCLE_THRESHOLD>',
default=0.99,
type=float,
help="Minimum identity considered for the assignment circle "
"(sequence is assigned to the LCA of all sequences within a similarity circle of the best matches; "
"the threshold for this circle is the highest value between <CIRCLE_THRESHOLD> and the best assignment score found). "
"Give value as a normalized identity, e.g. 0.95 for an identity of 95%%. "
"Default: 0.99.")
def run(config): def run(config):
DMS.obi_atexit() DMS.obi_atexit()
@ -66,9 +77,8 @@ def run(config):
ref_view_name = ref[1] ref_view_name = ref[1]
# Check that the threshold demanded is greater than or equal to the threshold used to build the reference database # Check that the threshold demanded is greater than or equal to the threshold used to build the reference database
if config['ecotag']['threshold'] < eval(ref_dms[ref_view_name].comments["ref_db_threshold"]) : if config['ecotag']['bubble_threshold'] < eval(ref_dms[ref_view_name].comments["ref_db_threshold"]) :
print("Error: The threshold demanded (%f) is lower than the threshold used to build the reference database (%f).", raise Exception(f"Error: The threshold demanded ({config['ecotag']['bubble_threshold']}) is lower than the threshold used to build the reference database ({float(ref_dms[ref_view_name].comments['ref_db_threshold'])}).")
config['ecotag']['threshold'], ref_dms[ref_view_name].comments["ref_db_threshold"])
# Open the output: only the DMS # Open the output: only the DMS
output = open_uri(config['obi']['outputURI'], output = open_uri(config['obi']['outputURI'],
@ -113,8 +123,9 @@ def run(config):
if obi_ecotag(i_dms.name_with_full_path, tobytes(i_view_name), \ if obi_ecotag(i_dms.name_with_full_path, tobytes(i_view_name), \
ref_dms.name_with_full_path, tobytes(ref_view_name), \ ref_dms.name_with_full_path, tobytes(ref_view_name), \
taxo_dms.name_with_full_path, tobytes(taxonomy_name), \ taxo_dms.name_with_full_path, tobytes(taxonomy_name), \
tobytes(o_view_name), comments, tobytes(o_view_name), comments, \
config['ecotag']['threshold']) < 0: config['ecotag']['threshold'], \
config['ecotag']['bubble_threshold']) < 0:
raise Exception("Error running ecotag") raise Exception("Error running ecotag")
# If the input and output DMS are not the same, export result view to output DMS # If the input and output DMS are not the same, export result view to output DMS

View File

@ -89,7 +89,7 @@ def run(config):
if pb is not None: if pb is not None:
pb(i, force=True) pb(i, force=True)
print("", file=sys.stderr) print("", file=sys.stderr)
# TODO save command in input dms? # TODO save command in input dms?

View File

@ -23,6 +23,7 @@ from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN, \
import shutil import shutil
import string import string
import random import random
import sys
from cpython.exc cimport PyErr_CheckSignals from cpython.exc cimport PyErr_CheckSignals
@ -366,7 +367,7 @@ def random_new_view(config, infos, first=False):
infos['view'] = View_NUC_SEQS.new(infos['dms'], random_unique_name(infos), comments=random_comments(config)) # TODO quality column infos['view'] = View_NUC_SEQS.new(infos['dms'], random_unique_name(infos), comments=random_comments(config)) # TODO quality column
else : else :
infos['view'] = View.new(infos['dms'], random_unique_name(infos), comments=random_comments(config)) # TODO quality column infos['view'] = View.new(infos['dms'], random_unique_name(infos), comments=random_comments(config)) # TODO quality column
infos['view'].write_config(config, "test", infos["command_line"], input_dms_name=[infos['dms'].name], input_view_name=["random"])
print_test(config, repr(infos['view'])) print_test(config, repr(infos['view']))
if v_to_clone is not None : if v_to_clone is not None :
if line_selection is None: if line_selection is None:
@ -497,7 +498,8 @@ def run(config):
(b"OBI_SEQ", False): random_seq, (b"OBI_SEQ", True): random_seq_tuples, (b"OBI_SEQ", False): random_seq, (b"OBI_SEQ", True): random_seq_tuples,
(b"OBI_STR", False): random_bytes, (b"OBI_STR", True): random_bytes_tuples (b"OBI_STR", False): random_bytes, (b"OBI_STR", True): random_bytes_tuples
}, },
'tests': [test_set_and_get, test_add_col, test_delete_col, test_col_alias, test_new_view] 'tests': [test_set_and_get, test_add_col, test_delete_col, test_col_alias, test_new_view],
'command_line': " ".join(sys.argv[1:])
} }
# TODO ??? # TODO ???

View File

@ -354,6 +354,9 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, di
key = mergedKeys[k] key = mergedKeys[k]
merged_col_name = mergedKeys_m[k] merged_col_name = mergedKeys_m[k]
if merged_infos[merged_col_name]['nb_elts'] == 1:
raise Exception("Can't merge information from a tag with only one element (e.g. one sample ; don't use -m option)")
if merged_col_name in view: if merged_col_name in view:
i_col = view[merged_col_name] i_col = view[merged_col_name]
else: else:

View File

@ -11,4 +11,5 @@ cdef extern from "obi_ecotag.h" nogil:
const char* taxonomy_name, const char* taxonomy_name,
const char* output_view_name, const char* output_view_name,
const char* output_view_comments, const char* output_view_comments,
double ecotag_threshold) double ecotag_threshold,
double bubble_threshold)

View File

@ -7,6 +7,7 @@ cdef dict __VIEW_CLASS__= {}
from libc.stdlib cimport malloc from libc.stdlib cimport malloc
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.version import version
from ..capi.obiview cimport Alias_column_pair_p, \ from ..capi.obiview cimport Alias_column_pair_p, \
obi_new_view, \ obi_new_view, \
@ -184,8 +185,14 @@ cdef class View(OBIWrapper) :
@OBIWrapper.checkIsActive @OBIWrapper.checkIsActive
def __repr__(self) : def __repr__(self) :
cdef str s = "#View name:\n{name:s}\n#Line count:\n{line_count:d}\n#Columns:\n".format(name = bytes2str(self.name), cdef str s
line_count = self.line_count) if self.read_only: # can read date
s = "#View name:\n{name:s}\n#Date created:\n{date:s}\n#Line count:\n{line_count:d}\n#Columns:\n".format(name = bytes2str(self.name),
line_count = self.line_count,
date = str(bytes2str_object(self.comments["Date created"])))
else:
s = "#View name:\n{name:s}\n#Line count:\n{line_count:d}\n#Columns:\n".format(name = bytes2str(self.name),
line_count = self.line_count)
for column_name in self.keys() : for column_name in self.keys() :
s = s + repr(self[column_name]) + '\n' s = s + repr(self[column_name]) + '\n'
return s return s
@ -434,6 +441,7 @@ cdef class View(OBIWrapper) :
for i in range(len(input_view_name)): for i in range(len(input_view_name)):
input_str.append(tostr(input_dms_name[i])+"/"+tostr(input_view_name[i])) input_str.append(tostr(input_dms_name[i])+"/"+tostr(input_view_name[i]))
comments["input_str"] = input_str comments["input_str"] = input_str
comments["version"] = version
return bytes2str_object(comments) return bytes2str_object(comments)

View File

@ -5,6 +5,7 @@ from obitools3.dms.view.view cimport Line
from obitools3.utils cimport bytes2str_object, str2bytes, tobytes from obitools3.utils cimport bytes2str_object, str2bytes, tobytes
from obitools3.dms.column.column cimport Column_line, Column_multi_elts from obitools3.dms.column.column cimport Column_line, Column_multi_elts
import sys
cdef class TabFormat: cdef class TabFormat:
@ -26,18 +27,22 @@ cdef class TabFormat:
if self.header and self.first_line: if self.header and self.first_line:
if isinstance(data.view[k], Column_multi_elts): if isinstance(data.view[k], Column_multi_elts):
for k2 in data.view[k].keys(): keys = data.view[k].keys()
keys.sort()
for k2 in keys:
line.append(tobytes(k)+b':'+tobytes(k2)) line.append(tobytes(k)+b':'+tobytes(k2))
else: else:
line.append(tobytes(k)) line.append(tobytes(k))
else: else:
value = data[k] value = data[k]
if isinstance(data.view[k], Column_multi_elts): if isinstance(data.view[k], Column_multi_elts):
keys = data.view[k].keys()
keys.sort()
if value is None: # all keys at None if value is None: # all keys at None
for k2 in data.view[k].keys(): # TODO could be much more efficient for k2 in keys: # TODO could be much more efficient
line.append(self.NAString) line.append(self.NAString)
else: else:
for k2 in data.view[k].keys(): # TODO could be much more efficient for k2 in keys: # TODO could be much more efficient
if value[k2] is not None: if value[k2] is not None:
line.append(str2bytes(str(bytes2str_object(value[k2])))) # genius programming line.append(str2bytes(str(bytes2str_object(value[k2])))) # genius programming
else: else:

View File

@ -276,9 +276,9 @@ def open_uri(uri,
iseq = urib iseq = urib
objclass = bytes objclass = bytes
else: # TODO update uopen to be able to write? else: # TODO update uopen to be able to write?
if urip.path == b'-': if not urip.path or urip.path == b'-':
file = sys.stdout.buffer file = sys.stdout.buffer
elif urip.path : else:
file = open(urip.path, 'wb') file = open(urip.path, 'wb')
if file is not None: if file is not None:

View File

@ -2,7 +2,7 @@
from obitools3.dms.capi.obitypes cimport obitype_t, index_t from obitools3.dms.capi.obitypes cimport obitype_t, index_t
cpdef bytes format_separator(bytes format) cpdef bytes format_uniq_pattern(bytes format)
cpdef int count_entries(file, bytes format) cpdef int count_entries(file, bytes format)
cdef obi_errno_to_exception(index_t line_nb=*, object elt_id=*, str error_message=*) cdef obi_errno_to_exception(index_t line_nb=*, object elt_id=*, str error_message=*)

View File

@ -24,11 +24,11 @@ import glob
import gzip import gzip
cpdef bytes format_separator(bytes format): cpdef bytes format_uniq_pattern(bytes format):
if format == b"fasta": if format == b"fasta":
return b"\n>" return b"\n>"
elif format == b"fastq": elif format == b"fastq":
return b"\n@" return b"\n\+\n"
elif format == b"ngsfilter" or format == b"tabular": elif format == b"ngsfilter" or format == b"tabular":
return b"\n" return b"\n"
elif format == b"genbank" or format == b"embl": elif format == b"genbank" or format == b"embl":
@ -42,7 +42,7 @@ cpdef bytes format_separator(bytes format):
cpdef int count_entries(file, bytes format): cpdef int count_entries(file, bytes format):
try: try:
sep = format_separator(format) sep = format_uniq_pattern(format)
if sep is None: if sep is None:
return -1 return -1
sep = re.compile(sep) sep = re.compile(sep)
@ -72,7 +72,7 @@ cpdef int count_entries(file, bytes format):
return -1 return -1
mmapped_file = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) mmapped_file = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
total_count += len(re.findall(sep, mmapped_file)) total_count += len(re.findall(sep, mmapped_file))
if format != b"ngsfilter" and format != b"tabular" and format != b"embl" and format != b"genbank": if format != b"ngsfilter" and format != b"tabular" and format != b"embl" and format != b"genbank" and format != b"fastq":
total_count += 1 # adding +1 for 1st entry because separators include \n (ngsfilter and tabular already count one more because of last \n) total_count += 1 # adding +1 for 1st entry because separators include \n (ngsfilter and tabular already count one more because of last \n)
except: except:

View File

@ -1,5 +1,5 @@
major = 3 major = 3
minor = 0 minor = 0
serial= '0b30' serial= '0b35'
version ="%d.%d.%s" % (major,minor,serial) version ="%d.%d.%s" % (major,minor,serial)

View File

@ -218,7 +218,8 @@ int obi_ecotag(const char* dms_name,
const char* taxonomy_name, const char* taxonomy_name,
const char* output_view_name, const char* output_view_name,
const char* output_view_comments, const char* output_view_comments,
double ecotag_threshold) // TODO different threshold for the similarity sphere around ref seqs double ecotag_threshold,
double bubble_threshold)
{ {
// For each sequence // For each sequence
@ -239,6 +240,7 @@ int obi_ecotag(const char* dms_name,
index_t query_seq_idx, ref_seq_idx; index_t query_seq_idx, ref_seq_idx;
double score, best_score; double score, best_score;
double threshold; double threshold;
double lca_threshold;
int lcs_length; int lcs_length;
int ali_length; int ali_length;
Kmer_table_p ktable; Kmer_table_p ktable;
@ -389,10 +391,10 @@ int obi_ecotag(const char* dms_name,
return -1; return -1;
} }
free(db_threshold_str); free(db_threshold_str);
if (ecotag_threshold < db_threshold) if (bubble_threshold < db_threshold)
{ {
fprintf(stderr, "\nError: The threshold demanded (%f) is lower than the threshold used to build the reference database (%f).\n\n", fprintf(stderr, "\nError: The threshold demanded (%f) is lower than the threshold used to build the reference database (%f).\n\n",
ecotag_threshold, db_threshold); bubble_threshold, db_threshold);
return -1; return -1;
} }
@ -597,11 +599,16 @@ int obi_ecotag(const char* dms_name,
{ {
best_match_idx = best_match_array[j]; best_match_idx = best_match_array[j];
// Find the LCA for the chosen threshold // Find the LCA for the highest threshold between best_score and the chosen bubble threshold
score_array = obi_get_array_with_col_p_in_view(ref_view, score_a_column, best_match_idx, &lca_array_length); score_array = obi_get_array_with_col_p_in_view(ref_view, score_a_column, best_match_idx, &lca_array_length);
if (bubble_threshold < best_score)
lca_threshold = best_score;
else
lca_threshold = bubble_threshold;
k = 0; k = 0;
while ((k < lca_array_length) && (score_array[k] >= best_score)) while ((k < lca_array_length) && (score_array[k] >= lca_threshold))
k++; k++;
if (k>0) if (k>0)

View File

@ -42,12 +42,14 @@
* @param output_view_name The name to give to the output view. * @param output_view_name The name to give to the output view.
* @param output_view_comments The comments to associate to the output view. * @param output_view_comments The comments to associate to the output view.
* @param ecotag_threshold The threshold at which to assign. * @param ecotag_threshold The threshold at which to assign.
* @param bubble_threshold The threshold at which to look for an LCA (i.e. minimum identity considered for the assignment circle);
* the threshold actually used will be the highest between this value and the best assignment score found.
* *
* The algorithm works like this: * The algorithm works like this:
* For each query sequence: * For each query sequence:
* Align with reference database * Align with reference database
* Keep the indices of all the best matches * Keep the indices of all the best matches
* For each kept index, get the LCA at that threshold as stored in the reference database, then the LCA of those LCAs * For each kept index, get the LCA at the highest threshold between bubble_threshold and the best assignment score found (as stored in the reference database), then the LCA of those LCAs
* Write result (max score, threshold, taxid and scientific name of the LCA assigned, list of the ids of the best matches) * Write result (max score, threshold, taxid and scientific name of the LCA assigned, list of the ids of the best matches)
* *
* @returns A value indicating the success of the operation. * @returns A value indicating the success of the operation.
@ -65,7 +67,8 @@ int obi_ecotag(const char* dms_name,
const char* taxonomy_name, const char* taxonomy_name,
const char* output_view_name, const char* output_view_name,
const char* output_view_comments, const char* output_view_comments,
double ecotag_threshold); double ecotag_threshold,
double bubble_threshold);
#endif /* OBI_ECOTAG_H_ */ #endif /* OBI_ECOTAG_H_ */

View File

@ -1725,16 +1725,32 @@ int obi_close_column(OBIDMS_column_p column)
int obi_clone_column_indexer(OBIDMS_column_p column) int obi_clone_column_indexer(OBIDMS_column_p column)
{ {
char* new_indexer_name; char* new_indexer_name;
int i;
new_indexer_name = obi_build_indexer_name((column->header)->name, (column->header)->version); i=0;
if (new_indexer_name == NULL) while (true) // find avl name not already used
return -1;
column->indexer = obi_clone_indexer(column->indexer, new_indexer_name); // TODO Need to lock this somehow?
if (column->indexer == NULL)
{ {
obidebug(1, "\nError cloning a column's indexer to make it writable"); new_indexer_name = obi_build_indexer_name((column->header)->name, ((column->header)->version)+i);
return -1; if (new_indexer_name == NULL)
return -1;
column->indexer = obi_clone_indexer(column->indexer, new_indexer_name); // TODO Need to lock this somehow?
if (column->indexer == NULL)
{
if (errno == EEXIST)
{
free(new_indexer_name);
i++;
}
else
{
free(new_indexer_name);
obidebug(1, "\nError cloning a column's indexer to make it writable");
return -1;
}
}
else
break;
} }
strcpy((column->header)->indexer_name, new_indexer_name); strcpy((column->header)->indexer_name, new_indexer_name);
@ -2415,16 +2431,20 @@ char* obi_get_formatted_elements_names(OBIDMS_column_p column)
} }
char* obi_column_formatted_infos(OBIDMS_column_p column) char* obi_column_formatted_infos(OBIDMS_column_p column, bool detailed)
{ {
char* column_infos; char* column_infos = NULL;
char* elt_names; char* elt_names = NULL;
char* column_name = NULL;
column_infos = malloc(1024 * sizeof(char)); // should be in view.c because alias exists in the context of view
column_infos = malloc(2048 * sizeof(char)); // TODO
elt_names = obi_get_formatted_elements_names(column); elt_names = obi_get_formatted_elements_names(column);
// "column_name, data type: OBI_TYPE, element names: [formatted element names](, all comments)"
free(elt_names); free(elt_names);
return column_infos; return column_infos;
} }