Compare commits
9 Commits
v3.0.0-bet
...
v3.0.0-bet
Author | SHA1 | Date | |
---|---|---|---|
0b4a234671 | |||
d32cfdcce5 | |||
219c0d6fdc | |||
dc9f897917 | |||
bb72682f7d | |||
52920c3c71 | |||
18c22cecf9 | |||
1bfb96023c | |||
c67d668989 |
@ -86,7 +86,24 @@ def run(config):
|
||||
if not remove_rev_qual:
|
||||
Column.new_column(o_view, REVERSE_SEQUENCE_COLUMN, OBI_SEQ)
|
||||
Column.new_column(o_view, REVERSE_QUALITY_COLUMN, OBI_QUAL, associated_column_name=REVERSE_SEQUENCE_COLUMN, associated_column_version=o_view[REVERSE_SEQUENCE_COLUMN].version)
|
||||
|
||||
|
||||
# Initialize multiple elements columns
|
||||
dict_cols = {}
|
||||
for v in iview_list:
|
||||
for coln in v.keys():
|
||||
if v[coln].nb_elements_per_line > 1:
|
||||
if coln not in dict_cols:
|
||||
dict_cols[coln] = {}
|
||||
dict_cols[coln]['eltnames'] = set(v[coln].elements_names)
|
||||
dict_cols[coln]['nbelts'] = v[coln].nb_elements_per_line
|
||||
dict_cols[coln]['obitype'] = v[coln].data_type_int
|
||||
else:
|
||||
dict_cols[coln]['eltnames'] = set(v[coln].elements_names + list(dict_cols[coln]['eltnames']))
|
||||
dict_cols[coln]['nbelts'] = len(dict_cols[coln]['eltnames'])
|
||||
for coln in dict_cols:
|
||||
Column.new_column(o_view, coln, dict_cols[coln]['obitype'],
|
||||
nb_elements_per_line=dict_cols[coln]['nbelts'], elements_names=list(dict_cols[coln]['eltnames']))
|
||||
|
||||
# Initialize the progress bar
|
||||
pb = ProgressBar(total_len, config, seconde=5)
|
||||
|
||||
|
@ -11,6 +11,7 @@ from obitools3.dms.column.column cimport Column
|
||||
from obitools3.dms.obiseq cimport Nuc_Seq
|
||||
from obitools3.dms import DMS
|
||||
from obitools3.dms.taxo.taxo cimport Taxonomy
|
||||
from obitools3.files.uncompress cimport CompressedFile
|
||||
|
||||
|
||||
from obitools3.utils cimport tobytes, \
|
||||
@ -65,6 +66,14 @@ def addOptions(parser):
|
||||
addTaxdumpInputOption(parser)
|
||||
addMinimalOutputOption(parser)
|
||||
|
||||
group = parser.add_argument_group('obi import specific options')
|
||||
|
||||
group.add_argument('--preread',
|
||||
action="store_true", dest="import:preread",
|
||||
default=False,
|
||||
help="Do a first readthrough of the dataset if it contains huge dictionaries (more than 100 keys) for "
|
||||
"a much faster import.")
|
||||
|
||||
|
||||
def run(config):
|
||||
|
||||
@ -169,8 +178,6 @@ def run(config):
|
||||
|
||||
if entry_count >= 0:
|
||||
pb = ProgressBar(entry_count, config, seconde=5)
|
||||
|
||||
entries = input[1]
|
||||
|
||||
NUC_SEQS_view = False
|
||||
if isinstance(output[1], View) :
|
||||
@ -188,6 +195,60 @@ def run(config):
|
||||
|
||||
dcols = {}
|
||||
|
||||
# First read through the entries to prepare columns with dictionaries as they are very time-expensive to rewrite
|
||||
if config['import']['preread']:
|
||||
logger("info", "First readthrough...")
|
||||
entries = input[1]
|
||||
i = 0
|
||||
dict_dict = {}
|
||||
for entry in entries:
|
||||
PyErr_CheckSignals()
|
||||
|
||||
if entry is None: # error or exception handled at lower level, not raised because Python generators can't resume after any exception is raised
|
||||
if config['obi']['skiperror']:
|
||||
i-=1
|
||||
continue
|
||||
else:
|
||||
raise Exception("obi import error in first readthrough")
|
||||
|
||||
if pb is not None:
|
||||
pb(i)
|
||||
elif not i%50000:
|
||||
logger("info", "Read %d entries", i)
|
||||
|
||||
for tag in entry :
|
||||
if type(entry[tag]) == dict :
|
||||
if tag in dict_dict:
|
||||
dict_dict[tag][0].update(entry[tag].keys())
|
||||
else:
|
||||
dict_dict[tag] = [set(entry[tag].keys()), get_obitype(entry[tag])]
|
||||
i+=1
|
||||
|
||||
if pb is not None:
|
||||
pb(i, force=True)
|
||||
print("", file=sys.stderr)
|
||||
|
||||
for tag in dict_dict:
|
||||
dcols[tag] = (Column.new_column(view, tag, dict_dict[tag][1], \
|
||||
nb_elements_per_line=len(dict_dict[tag][0]), \
|
||||
elements_names=list(dict_dict[tag][0])), \
|
||||
value_obitype)
|
||||
|
||||
|
||||
# Reinitialize the input
|
||||
if isinstance(input[0], CompressedFile):
|
||||
input_is_file = True
|
||||
if entry_count >= 0:
|
||||
pb = ProgressBar(entry_count, config, seconde=5)
|
||||
try:
|
||||
input[0].close()
|
||||
except AttributeError:
|
||||
pass
|
||||
input = open_uri(config['obi']['inputURI'], force_file=input_is_file)
|
||||
if input is None:
|
||||
raise Exception("Could not open input URI")
|
||||
|
||||
entries = input[1]
|
||||
i = 0
|
||||
for entry in entries :
|
||||
|
||||
@ -247,6 +308,8 @@ def run(config):
|
||||
dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
|
||||
|
||||
# Fill value
|
||||
if value_type == dict and nb_elts == 1: # special case that makes the OBI3 create a 1 elt/line column which won't read a dict value
|
||||
value = value[list(value.keys())[0]] # The solution is to transform the value in a simple atomic one acceptable by the column
|
||||
dcols[tag][0][i] = value
|
||||
|
||||
# TODO else log error?
|
||||
@ -263,6 +326,12 @@ def run(config):
|
||||
rewrite = True
|
||||
|
||||
try:
|
||||
# Check that it's not the case where the first entry contained a dict of length 1 and now there is a new key
|
||||
if type(value) == dict and \
|
||||
dcols[tag][0].nb_elements_per_line == 1 and len(value.keys()) == 1 \
|
||||
and dcols[tag][0].elements_names[0] != list(value.keys())[0] :
|
||||
raise IndexError # trigger column rewrite
|
||||
|
||||
# Fill value
|
||||
dcols[tag][0][i] = value
|
||||
|
||||
|
@ -102,7 +102,7 @@ cdef extern from "obiview.h" nogil:
|
||||
const_char_p comments,
|
||||
bint create)
|
||||
|
||||
int obi_view_delete_column(Obiview_p view, const_char_p column_name)
|
||||
int obi_view_delete_column(Obiview_p view, const_char_p column_name, bint delete_file)
|
||||
|
||||
OBIDMS_column_p obi_view_get_column(Obiview_p view, const_char_p column_name)
|
||||
|
||||
|
@ -22,7 +22,8 @@ cdef class View(OBIWrapper):
|
||||
cdef inline Obiview_p pointer(self)
|
||||
|
||||
cpdef delete_column(self,
|
||||
object column_name)
|
||||
object column_name,
|
||||
bint delete_file=*)
|
||||
|
||||
cpdef rename_column(self,
|
||||
object current_name,
|
||||
|
@ -227,7 +227,8 @@ cdef class View(OBIWrapper) :
|
||||
|
||||
|
||||
cpdef delete_column(self,
|
||||
object column_name) :
|
||||
object column_name,
|
||||
bint delete_file=False) :
|
||||
|
||||
cdef bytes column_name_b = tobytes(column_name)
|
||||
|
||||
@ -239,7 +240,7 @@ cdef class View(OBIWrapper) :
|
||||
col.close()
|
||||
|
||||
# Remove the column from the view which closes the C structure
|
||||
if obi_view_delete_column(self.pointer(), column_name_b) < 0 :
|
||||
if obi_view_delete_column(self.pointer(), column_name_b, delete_file) < 0 :
|
||||
raise RollbackException("Problem deleting column %s from a view",
|
||||
bytes2str(column_name_b), self)
|
||||
|
||||
@ -297,11 +298,17 @@ cdef class View(OBIWrapper) :
|
||||
nb_elements_per_line=new_nb_elements_per_line, elements_names=new_elements_names,
|
||||
comments=old_column.comments, alias=column_name_b+tobytes('___new___'))
|
||||
|
||||
switch_to_dict = old_column.nb_elements_per_line == 1 and new_nb_elements_per_line > 1
|
||||
ori_key = old_column._elements_names[0]
|
||||
|
||||
for i in range(length) :
|
||||
new_column[i] = old_column[i]
|
||||
if switch_to_dict :
|
||||
new_column[i] = {ori_key: old_column[i]}
|
||||
else:
|
||||
new_column[i] = old_column[i]
|
||||
|
||||
# Remove old column from view
|
||||
self.delete_column(column_name_b)
|
||||
self.delete_column(column_name_b, delete_file=True)
|
||||
|
||||
# Rename new
|
||||
new_column.name = column_name_b
|
||||
|
7
python/obitools3/uri/decode.pyx
Executable file → Normal file
7
python/obitools3/uri/decode.pyx
Executable file → Normal file
@ -171,7 +171,8 @@ Reads an URI and returns a tuple containing:
|
||||
def open_uri(uri,
|
||||
bint input=True,
|
||||
type newviewtype=View,
|
||||
dms_only=False):
|
||||
dms_only=False,
|
||||
force_file=False):
|
||||
|
||||
cdef bytes urib = tobytes(uri)
|
||||
cdef bytes scheme
|
||||
@ -195,9 +196,9 @@ def open_uri(uri,
|
||||
if 'obi' not in config:
|
||||
config['obi']={}
|
||||
|
||||
try:
|
||||
if not force_file and "defaultdms" in config["obi"]:
|
||||
default_dms=config["obi"]["defaultdms"]
|
||||
except KeyError:
|
||||
else:
|
||||
default_dms=None
|
||||
|
||||
try:
|
||||
|
@ -1,5 +1,5 @@
|
||||
major = 3
|
||||
minor = 0
|
||||
serial= '0-beta7'
|
||||
serial= '0-beta11'
|
||||
|
||||
version ="%d.%02d.%s" % (major,minor,serial)
|
||||
|
@ -100,35 +100,35 @@ int print_assignment_result(Obiview_p output_view, index_t line,
|
||||
static int create_output_columns(Obiview_p o_view)
|
||||
{
|
||||
// Score column
|
||||
if (obi_view_add_column(o_view, ECOTAG_SCORE_COLUMN_NAME, -1, NULL, OBI_FLOAT, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_SCORE_COLUMN_NAME, true) < 0)
|
||||
if (obi_view_add_column(o_view, ECOTAG_SCORE_COLUMN_NAME, -1, NULL, OBI_FLOAT, 0, 1, NULL, false, false, false, NULL, NULL, -1, "{}", true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the column for the score in ecotag");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Assigned taxid column
|
||||
if (obi_view_add_column(o_view, ECOTAG_TAXID_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_TAXID_COLUMN_NAME, true) < 0)
|
||||
if (obi_view_add_column(o_view, ECOTAG_TAXID_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, "{}", true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the column for the assigned taxid in ecotag");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Assigned scientific name column
|
||||
if (obi_view_add_column(o_view, ECOTAG_NAME_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_NAME_COLUMN_NAME, true) < 0)
|
||||
if (obi_view_add_column(o_view, ECOTAG_NAME_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, false, false, NULL, NULL, -1, "{}", true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the column for the assigned scientific name in ecotag");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Assignement status column
|
||||
if (obi_view_add_column(o_view, ECOTAG_STATUS_COLUMN_NAME, -1, NULL, OBI_BOOL, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_STATUS_COLUMN_NAME, true) < 0)
|
||||
if (obi_view_add_column(o_view, ECOTAG_STATUS_COLUMN_NAME, -1, NULL, OBI_BOOL, 0, 1, NULL, false, false, false, NULL, NULL, -1, "{}", true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the column for the assignment status in ecotag");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Column for array of best match ids
|
||||
if (obi_view_add_column(o_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, true, false, NULL, NULL, -1, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME, true) < 0)
|
||||
if (obi_view_add_column(o_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, true, false, NULL, NULL, -1, "{}", true) < 0)
|
||||
{
|
||||
obidebug(1, "\nError creating the column for the array of ids of the best match in ecotag");
|
||||
return -1;
|
||||
|
@ -2380,11 +2380,12 @@ int obi_view_add_column(Obiview_p view,
|
||||
}
|
||||
|
||||
|
||||
int obi_view_delete_column(Obiview_p view, const char* column_name)
|
||||
int obi_view_delete_column(Obiview_p view, const char* column_name, bool delete_file)
|
||||
{
|
||||
int i;
|
||||
bool found;
|
||||
OBIDMS_column_p column;
|
||||
char* col_to_delete_path;
|
||||
|
||||
// Check that the view is not read-only
|
||||
if (view->read_only)
|
||||
@ -2406,8 +2407,31 @@ int obi_view_delete_column(Obiview_p view, const char* column_name)
|
||||
obidebug(1, "\nError getting a column from the linked list of column pointers of a view when deleting a column from a view");
|
||||
return -1;
|
||||
}
|
||||
// Keep column path if need to delete the file
|
||||
if (delete_file)
|
||||
{
|
||||
col_to_delete_path = obi_column_full_path(view->dms, column->header->name, column->header->version);
|
||||
if (col_to_delete_path == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting a column file path when deleting a column");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
obi_close_column(column);
|
||||
|
||||
// Delete file if needed
|
||||
if (delete_file)
|
||||
{
|
||||
if (remove(col_to_delete_path) < 0)
|
||||
{
|
||||
obi_set_errno(OBICOL_UNKNOWN_ERROR);
|
||||
obidebug(1, "\nError deleting a column file when deleting unfinished columns: file %s", col_to_delete_path);
|
||||
return -1;
|
||||
}
|
||||
free(col_to_delete_path);
|
||||
}
|
||||
|
||||
view->columns = ll_delete(view->columns, i);
|
||||
// TODO how do we check for error? NULL can be empty list
|
||||
found = true;
|
||||
@ -3047,7 +3071,7 @@ int obi_create_auto_id_column(Obiview_p view, const char* prefix)
|
||||
// Delete old ID column if it exists
|
||||
if (obi_view_get_column(view, ID_COLUMN) != NULL)
|
||||
{
|
||||
if (obi_view_delete_column(view, ID_COLUMN) < 0)
|
||||
if (obi_view_delete_column(view, ID_COLUMN, false) < 0)
|
||||
{
|
||||
obidebug(1, "Error deleting an ID column to replace it in a view");
|
||||
return -1;
|
||||
|
@ -440,6 +440,7 @@ int obi_view_add_column(Obiview_p view,
|
||||
*
|
||||
* @param view A pointer on the view.
|
||||
* @param column_name The name of the column that should be deleted from the view.
|
||||
* @param delete_file Whether the column file should be deleted. Use carefully re: dependencies.
|
||||
*
|
||||
* @returns A value indicating the success of the operation.
|
||||
* @retval 0 if the operation was successfully completed.
|
||||
@ -448,7 +449,7 @@ int obi_view_add_column(Obiview_p view,
|
||||
* @since February 2016
|
||||
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||
*/
|
||||
int obi_view_delete_column(Obiview_p view, const char* column_name);
|
||||
int obi_view_delete_column(Obiview_p view, const char* column_name, bool delete_file);
|
||||
|
||||
|
||||
/**
|
||||
|
Reference in New Issue
Block a user