Compare commits

..

17 Commits

Author SHA1 Message Date
9ace9989c4 Switch to version 3.0.0b27 2020-07-07 16:47:21 +02:00
a3ebe5f118 C: AVL trees: fixed a bug where storing the difference between 2 crc64
values in an int64 would mess trees up resulting in failed data
dereplication
2020-07-07 16:47:00 +02:00
9100e14899 obi uniq: quick fix for bug where some sequences are not correctly
dereplicated
2020-07-03 17:36:57 +02:00
ccda0661ce small help documentation improvement 2020-07-01 18:20:38 +02:00
aab59f2214 obi clean: fixed a memory bug, fixed the behaviour when no sample info,
and added checks warnings and error handling when sample info not
dereplicated
2020-07-01 18:17:47 +02:00
ade1107b42 switch to version 3.0.0b26 2020-06-17 18:56:07 +02:00
9c7d24406f export: dictionaries are now formatted like in the original OBITools
when exporting in tabular format and tuple formatting is cleaner
2020-06-17 18:55:46 +02:00
03bc9915f2 Cython: utils: added handling of tuples to bytes2str_object function 2020-06-17 18:54:14 +02:00
24b1dab573 Cython: Columns: added a keys() method that returns all element names 2020-06-17 18:53:41 +02:00
7593673f3f ngsfilter: now setting 'reversed' tag to False instead of None when
false
2020-06-17 18:52:35 +02:00
aa01236cae switch to version 3.0.0b25 2020-06-13 21:48:49 +02:00
49b8810a76 C: made indexer opening/closing cleaner 2020-06-13 21:47:03 +02:00
7a39df54c0 ls: fixed an issue where big DMS couldn't be read by ls 2020-06-13 21:45:22 +02:00
09e483b0d6 switch to temporary version 3.0.0b24a 2020-06-10 17:47:56 +02:00
14a2579173 uniq: now outputs an empty view if input view is empty instead of
displaying an error
2020-06-10 17:47:26 +02:00
36a8aaa92e grep: now creating empty views instead of displaying an error when
selecting on an unexisting column/tag
2020-06-10 16:57:42 +02:00
a17eb445c2 ngsfilter: made one of the tag error messages more accurate 2020-06-10 16:27:36 +02:00
16 changed files with 120 additions and 51 deletions

View File

@ -36,8 +36,7 @@ def addOptions(parser):
dest="clean:sample-tag-name",
metavar="<SAMPLE TAG NAME>",
type=str,
default="merged_sample",
help="Name of the tag where sample counts are kept.")
help="Name of the tag where merged sample count informations are kept (typically generated by obi uniq, usually MERGED_sample, default: None).")
group.add_argument('--ratio', '-r',
action="store", dest="clean:ratio",
@ -107,6 +106,9 @@ def run(config):
command_line = " ".join(sys.argv[1:])
comments = View.print_config(config, "clean", command_line, input_dms_name=[i_dms_name], input_view_name=[i_view_name])
if 'sample-tag-name' not in config['clean']:
config['clean']['sample-tag-name'] = ""
if obi_clean(i_dms.name_with_full_path, tobytes(i_view_name), tobytes(config['clean']['sample-tag-name']), tobytes(o_view_name), comments, \
config['clean']['distance'], config['clean']['ratio'], config['clean']['heads-only'], config['clean']['thread-count']) < 0:
raise Exception("Error running obiclean")

View File

@ -22,7 +22,7 @@ def addOptions(parser):
group.add_argument('-s','--sequence',
action="store_true", dest="count:sequence",
default=False,
help="Prints only the number of sequence records.")
help="Prints only the number of sequence records (much faster, default: False).")
group.add_argument('-a','--all',
action="store_true", dest="count:all",

View File

@ -161,8 +161,7 @@ def obi_eval(compiled_expr, loc_env, line):
return obi_eval_result
def Filter_generator(options, tax_filter):
#taxfilter = taxonomyFilterGenerator(options)
def Filter_generator(options, tax_filter, i_view):
# Initialize conditions
predicates = None
@ -171,6 +170,9 @@ def Filter_generator(options, tax_filter):
attributes = None
if "attributes" in options and len(options["attributes"]) > 0:
attributes = options["attributes"]
for attribute in attributes:
if attribute not in i_view:
return None
lmax = None
if "lmax" in options:
lmax = options["lmax"]
@ -196,6 +198,8 @@ def Filter_generator(options, tax_filter):
if "attribute_patterns" in options and len(options["attribute_patterns"]) > 0:
for p in options["attribute_patterns"]:
attribute, pattern = p.split(":", 1)
if attribute not in i_view:
return None
attribute_patterns[tobytes(attribute)] = re.compile(tobytes(pattern))
def filter(line, loc_env):
@ -324,21 +328,29 @@ def run(config):
# Apply filter
tax_filter = Taxonomy_filter_generator(taxo, config["grep"])
filter = Filter_generator(config["grep"], tax_filter)
filter = Filter_generator(config["grep"], tax_filter, i_view)
selection = Line_selection(i_view)
for i in range(len(i_view)):
PyErr_CheckSignals()
pb(i)
line = i_view[i]
loc_env = {"sequence": line, "line": line, "taxonomy": taxo, "obi_eval_result": False}
good = filter(line, loc_env)
if good :
selection.append(i)
pb(i, force=True)
if filter is None and config["grep"]["invert_selection"]: # all sequences are selected: filter is None if no line will be selected because some columns don't exist
for i in range(len(i_view)):
PyErr_CheckSignals()
pb(i)
selection.append(i)
elif filter is not None : # filter is None if no line will be selected because some columns don't exist
for i in range(len(i_view)):
PyErr_CheckSignals()
pb(i)
line = i_view[i]
loc_env = {"sequence": line, "line": line, "taxonomy": taxo, "obi_eval_result": False}
good = filter(line, loc_env)
if good :
selection.append(i)
pb(len(i_view), force=True)
print("", file=sys.stderr)
# Create output view with the line selection

View File

@ -34,9 +34,10 @@ def run(config):
if input[2] == DMS and not config['ls']['longformat']:
dms = input[0]
l = []
for view in input[0]:
l.append(tostr(view) + "\t(Date created: " + str(bytes2str_object(dms[view].comments["Date created"]))+")")
dms[view].close()
for viewname in input[0]:
view = dms[viewname]
l.append(tostr(viewname) + "\t(Date created: " + str(bytes2str_object(view.comments["Date created"]))+")")
view.close()
l.sort()
for v in l:
print(v)

View File

@ -479,6 +479,8 @@ cdef tuple annotate(sequences, infos, no_tags, verbose=False):
if not directmatch[0].forward:
sequences[0] = sequences[0].reverse_complement
sequences[0][b'reversed'] = True # used by the alignpairedend tool (in kmer_similarity.c)
else:
sequences[0][b'reversed'] = False # used by the alignpairedend tool (in kmer_similarity.c)
sample=None
if not no_tags:
@ -506,7 +508,7 @@ cdef tuple annotate(sequences, infos, no_tags, verbose=False):
sample=None
if sample is None:
sequences[0][b'error']=b"No tags found"
sequences[0][b'error']=b"No sample with that tag combination"
return False, sequences[0]
sequences[0].update(sample)

View File

@ -307,6 +307,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, di
for x in categories :
catl.append(i_seq[x])
#unique_id = tuple(catl) + (i_seq_col[i],)
unique_id = tuple(catl) + (i_seq_col.get_line_idx(i),)
#unique_id = tuple(i_seq[x] for x in categories) + (seq_col.get_line_idx(i),) # The line that cython can't read properly
@ -453,6 +454,7 @@ cdef uniq_sequences(View_NUC_SEQS view, View_NUC_SEQS o_view, ProgressBar pb, di
merged_dict[mkey] = {}
for i_idx in merged_sequences:
PyErr_CheckSignals()
pb(total_treated)
i_id = i_id_col[i_idx]
@ -591,10 +593,11 @@ def run(config):
# Initialize the progress bar
pb = ProgressBar(len(entries), config, seconde=5)
try:
uniq_sequences(entries, o_view, pb, config, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'], max_elts=config['obi']['maxelts'])
except Exception, e:
raise RollbackException("obi uniq error, rollbacking view: "+str(e), o_view)
if len(entries) > 0:
try:
uniq_sequences(entries, o_view, pb, config, mergedKeys_list=config['uniq']['merge'], taxonomy=taxo, mergeIds=config['uniq']['mergeids'], categories=config['uniq']['categories'], max_elts=config['obi']['maxelts'])
except Exception, e:
raise RollbackException("obi uniq error, rollbacking view: "+str(e), o_view)
print("", file=sys.stderr)

View File

@ -22,6 +22,7 @@ cdef class Column(OBIWrapper) :
cdef inline OBIDMS_column_p pointer(self)
cdef read_elements_names(self)
cpdef list keys(self)
@staticmethod
cdef type get_column_class(obitype_t obitype, bint multi_elts, bint tuples)

View File

@ -323,7 +323,10 @@ cdef class Column(OBIWrapper) :
free(elts_names_b)
return elts_names_list
cpdef list keys(self):
return self._elements_names
# Column alias property getter and setter
@property
def name(self):
@ -340,7 +343,7 @@ cdef class Column(OBIWrapper) :
@property
def elements_names(self):
return self._elements_names
# nb_elements_per_line property getter
@property
def nb_elements_per_line(self):

View File

@ -227,7 +227,9 @@ cdef class DMS(OBIWrapper):
cdef str s
s=""
for view_name in self.keys():
s = s + repr(self.get_view(view_name)) + "\n"
view = self.get_view(view_name)
s = s + repr(view) + "\n"
view.close()
return s

View File

@ -3,7 +3,7 @@
cimport cython
from obitools3.dms.view.view cimport Line
from obitools3.utils cimport bytes2str_object, str2bytes, tobytes
from obitools3.dms.column.column cimport Column_line
from obitools3.dms.column.column cimport Column_line, Column_multi_elts
cdef class TabFormat:
@ -25,19 +25,29 @@ cdef class TabFormat:
for k in self.tags:
if self.header and self.first_line:
value = tobytes(k)
if isinstance(data.view[k], Column_multi_elts):
for k2 in data.view[k].keys():
line.append(tobytes(k)+b':'+tobytes(k2))
else:
line.append(tobytes(k))
else:
value = data[k]
if value is not None:
if type(value) == Column_line:
value = value.bytes()
if isinstance(data.view[k], Column_multi_elts):
if value is None: # all keys at None
for k2 in data.view[k].keys(): # TODO could be much more efficient
line.append(self.NAString)
else:
value = str2bytes(str(bytes2str_object(value))) # genius programming
if value is None:
value = self.NAString
line.append(value)
for k2 in data.view[k].keys(): # TODO could be much more efficient
if value[k2] is not None:
line.append(str2bytes(str(bytes2str_object(value[k2])))) # genius programming
else:
line.append(self.NAString)
else:
if value is not None:
line.append(str2bytes(str(bytes2str_object(value))))
else:
line.append(self.NAString)
if self.first_line:
self.first_line = False

View File

@ -166,7 +166,9 @@ cdef object bytes2str_object(object value): # Only works if complex types are d
value[k] = bytes2str(v)
if type(k) == bytes:
value[bytes2str(k)] = value.pop(k)
elif isinstance(value, list):
elif isinstance(value, list) or isinstance(value, tuple):
if isinstance(value, tuple):
value = list(value)
for i in range(len(value)):
if isinstance(value[i], list) or isinstance(value[i], dict):
value[i] = bytes2str_object(value[i])

View File

@ -1,5 +1,5 @@
major = 3
minor = 0
serial= '0b24'
serial= '0b27'
version ="%d.%d.%s" % (major,minor,serial)

View File

@ -246,7 +246,16 @@ int obi_clean(const char* dms_name,
// Open the sample column if there is one
if ((strcmp(sample_column_name, "") == 0) || (sample_column_name == NULL))
sample_column = NULL;
{
fprintf(stderr, "Info: No sample information provided, assuming one sample.\n");
sample_column = obi_view_get_column(i_view, COUNT_COLUMN);
if (sample_column == NULL)
{
obidebug(1, "\nError getting the COUNT column");
return -1;
}
sample_count = 1;
}
else
{
sample_column = obi_view_get_column(i_view, sample_column_name);
@ -255,6 +264,13 @@ int obi_clean(const char* dms_name,
obidebug(1, "\nError getting the sample column");
return -1;
}
sample_count = (sample_column->header)->nb_elements_per_line;
// Check that the sample column is a merged column with all sample informations
if (sample_count == 1)
{
obidebug(1, "\n\nError: If a sample column is provided, it must contain 'merged' sample counts as built by obi uniq with the -m option\n");
return -1;
}
}
// Create the output view, or a temporary one if heads only
@ -279,8 +295,6 @@ int obi_clean(const char* dms_name,
return -1;
}
sample_count = (sample_column->header)->nb_elements_per_line;
// Create the output columns
if (create_output_columns(o_view, sample_column, sample_count) < 0)
{
@ -549,7 +563,7 @@ int obi_clean(const char* dms_name,
if (heads_only)
{
line_selection = malloc((o_view->infos)->line_count * sizeof(index_t));
line_selection = malloc((((o_view->infos)->line_count) + 1) * sizeof(index_t));
if (line_selection == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);

View File

@ -52,7 +52,8 @@
*
* @param dms A pointer on an OBIDMS.
* @param i_view_name The name of the input view.
* @param sample_column_name The name of the OBI_STR column in the input view where the sample information is kept.
* @param sample_column_name The name of the column in the input view where the sample information is kept.
* Must be merged informations as built by the obi uniq tool (checked by the function).
* NULL or "" (empty string) if there is no sample information.
* @param o_view_name The name of the output view where the results should be written (should not already exist).
* @param o_view_comments The comments that should be associated with the output view.

View File

@ -2259,7 +2259,13 @@ index_t obi_avl_add(OBIDMS_avl_p avl, Obi_blob_p value)
parent = next;
// Compare the crc of the value with the crc of the current node
comp = (current_node->crc64) - crc;
//comp = (current_node->crc64) - crc;
if ((current_node->crc64) == crc)
comp = 0;
else if ((current_node->crc64) > crc)
comp = 1;
else
comp = -1;
if (comp == 0)
{ // check if really same value
@ -2354,7 +2360,13 @@ index_t obi_avl_find(OBIDMS_avl_p avl, Obi_blob_p value)
current_node = (avl->tree)+next;
// Compare the crc of the value with the crc of the current node
comp = (current_node->crc64) - crc;
//comp = (current_node->crc64) - crc;
if ((current_node->crc64) == crc)
comp = 0;
else if ((current_node->crc64) > crc)
comp = 1;
else
comp = -1;
if (comp == 0)
{ // Check if really same value

View File

@ -1350,6 +1350,8 @@ OBIDMS_column_p obi_create_column(OBIDMS_p dms,
}
strncpy(header->indexer_name, final_indexer_name, INDEXER_MAX_NAME);
}
else
new_column->indexer = NULL;
// Fill the data with NA values
obi_ini_to_NA_values(new_column, 0, nb_lines);
@ -1558,6 +1560,8 @@ OBIDMS_column_p obi_open_column(OBIDMS_p dms,
return NULL;
}
}
else
column->indexer = NULL;
if (close(column_file_descriptor) < 0)
{
@ -1694,7 +1698,7 @@ int obi_close_column(OBIDMS_column_p column)
ret_val = -1;
// If it's a tuple column or the data type is OBI_STR, OBI_SEQ or OBI_QUAL, the associated indexer is closed
if (((column->header)->tuples) || (((column->header)->returned_data_type == OBI_STR) || ((column->header)->returned_data_type == OBI_SEQ) || ((column->header)->returned_data_type == OBI_QUAL)))
if ((column->indexer) != NULL)
if (obi_close_indexer(column->indexer) < 0)
ret_val = -1;