Switch to version 3.0.0-beta11

ecotag: fixed the generated column comments formatting that would
generate errors
2020-02-12 14:23:42 +01:00 · 2020-02-12 14:23:17 +01:00 · 2020-02-12 14:21:39 +01:00 · 2020-02-02 21:15:27 +01:00 · 2020-02-02 21:12:34 +01:00 · 2020-02-02 21:11:05 +01:00
10 changed files with 141 additions and 21 deletions
--- a/python/obitools3/commands/cat.pyx
+++ b/python/obitools3/commands/cat.pyx
@ -86,7 +86,24 @@ def run(config):
    if not remove_rev_qual:
        Column.new_column(o_view, REVERSE_SEQUENCE_COLUMN, OBI_SEQ)
        Column.new_column(o_view, REVERSE_QUALITY_COLUMN, OBI_QUAL, associated_column_name=REVERSE_SEQUENCE_COLUMN, associated_column_version=o_view[REVERSE_SEQUENCE_COLUMN].version)
-
+        
+    # Initialize multiple elements columns
+    dict_cols = {}
+    for v in iview_list:
+        for coln in v.keys():
+            if v[coln].nb_elements_per_line > 1:
+                if coln not in dict_cols:
+                    dict_cols[coln] = {}
+                    dict_cols[coln]['eltnames'] = set(v[coln].elements_names)
+                    dict_cols[coln]['nbelts'] = v[coln].nb_elements_per_line
+                    dict_cols[coln]['obitype'] = v[coln].data_type_int
+                else:
+                    dict_cols[coln]['eltnames'] = set(v[coln].elements_names + list(dict_cols[coln]['eltnames']))
+                    dict_cols[coln]['nbelts'] = len(dict_cols[coln]['eltnames'])
+    for coln in dict_cols:
+        Column.new_column(o_view, coln, dict_cols[coln]['obitype'], 
+                          nb_elements_per_line=dict_cols[coln]['nbelts'], elements_names=list(dict_cols[coln]['eltnames']))
+    
    # Initialize the progress bar
    pb = ProgressBar(total_len, config, seconde=5)
    
--- a/python/obitools3/commands/import.pyx
+++ b/python/obitools3/commands/import.pyx
@ -11,6 +11,7 @@ from obitools3.dms.column.column cimport Column
 from obitools3.dms.obiseq cimport Nuc_Seq
 from obitools3.dms import DMS
 from obitools3.dms.taxo.taxo cimport Taxonomy
+from obitools3.files.uncompress cimport CompressedFile


 from obitools3.utils cimport tobytes, \
@ -65,6 +66,14 @@ def addOptions(parser):
    addTaxdumpInputOption(parser)
    addMinimalOutputOption(parser)

+    group = parser.add_argument_group('obi import specific options')
+
+    group.add_argument('--preread',
+                     action="store_true", dest="import:preread",
+                     default=False,
+                     help="Do a first readthrough of the dataset if it contains huge dictionaries (more than 100 keys) for "
+                          "a much faster import.")
+

 def run(config):
    
@ -169,8 +178,6 @@ def run(config):

    if entry_count >= 0:
        pb = ProgressBar(entry_count, config, seconde=5)
-    
-    entries = input[1]
        
    NUC_SEQS_view = False
    if isinstance(output[1], View) :
@ -188,6 +195,60 @@ def run(config):
        
    dcols = {}
        
+    # First read through the entries to prepare columns with dictionaries as they are very time-expensive to rewrite
+    if config['import']['preread']:
+        logger("info", "First readthrough...")
+        entries = input[1]
+        i = 0
+        dict_dict = {}
+        for entry in entries:
+            PyErr_CheckSignals()
+        
+            if entry is None:  # error or exception handled at lower level, not raised because Python generators can't resume after any exception is raised
+                if config['obi']['skiperror']:
+                    i-=1
+                    continue
+                else:
+                    raise Exception("obi import error in first readthrough")
+            
+            if pb is not None:
+                pb(i)
+            elif not i%50000:
+                logger("info", "Read %d entries", i)
+    
+            for tag in entry :
+                if type(entry[tag]) == dict :
+                    if tag in dict_dict:
+                        dict_dict[tag][0].update(entry[tag].keys())
+                    else:
+                        dict_dict[tag] = [set(entry[tag].keys()), get_obitype(entry[tag])]
+            i+=1
+        
+        if pb is not None:
+            pb(i, force=True)
+            print("", file=sys.stderr)
+       
+        for tag in dict_dict:
+            dcols[tag] = (Column.new_column(view, tag, dict_dict[tag][1], \
+                              nb_elements_per_line=len(dict_dict[tag][0]), \
+                              elements_names=list(dict_dict[tag][0])), \
+                          value_obitype)
+    
+        
+        # Reinitialize the input
+        if isinstance(input[0], CompressedFile):
+            input_is_file = True
+        if entry_count >= 0:
+            pb = ProgressBar(entry_count, config, seconde=5)
+        try:
+            input[0].close()
+        except AttributeError:
+            pass
+        input = open_uri(config['obi']['inputURI'], force_file=input_is_file)
+        if input is None:
+            raise Exception("Could not open input URI")
+    
+    entries = input[1]
    i = 0
    for entry in entries :
        
@ -247,6 +308,8 @@ def run(config):
                        dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
                                                 
                        # Fill value
+                        if value_type == dict and nb_elts == 1:  # special case that makes the OBI3 create a 1 elt/line column which won't read a dict value
+                            value = value[list(value.keys())[0]]       # The solution is to transform the value in a simple atomic one acceptable by the column
                        dcols[tag][0][i] = value
                     
                    # TODO else log error?
@ -263,6 +326,12 @@ def run(config):
                        rewrite = True
 
                    try:
+                        # Check that it's not the case where the first entry contained a dict of length 1 and now there is a new key                        
+                        if type(value) == dict and \
+                            dcols[tag][0].nb_elements_per_line == 1 and len(value.keys()) == 1 \
+                            and dcols[tag][0].elements_names[0] != list(value.keys())[0] :
+                            raise IndexError  # trigger column rewrite
+                        
                        # Fill value
                        dcols[tag][0][i] = value
                     
--- a/python/obitools3/dms/capi/obiview.pxd
+++ b/python/obitools3/dms/capi/obiview.pxd
@ -102,7 +102,7 @@ cdef extern from "obiview.h" nogil:
                            const_char_p comments,
                            bint create)

-    int obi_view_delete_column(Obiview_p view, const_char_p column_name)
+    int obi_view_delete_column(Obiview_p view, const_char_p column_name, bint delete_file)
            
    OBIDMS_column_p obi_view_get_column(Obiview_p view, const_char_p column_name)

--- a/python/obitools3/dms/view/view.pxd
+++ b/python/obitools3/dms/view/view.pxd
@ -22,7 +22,8 @@ cdef class View(OBIWrapper):
    cdef inline Obiview_p pointer(self)   
        
    cpdef delete_column(self, 
-                        object column_name)
+                        object column_name,
+                        bint delete_file=*)
    
    cpdef rename_column(self, 
                        object current_name, 
--- a/python/obitools3/dms/view/view.pyx
+++ b/python/obitools3/dms/view/view.pyx
@ -227,7 +227,8 @@ cdef class View(OBIWrapper) :

    
    cpdef delete_column(self, 
-                        object column_name) :
+                        object column_name,
+                        bint delete_file=False) :

        cdef bytes column_name_b = tobytes(column_name)

@ -239,7 +240,7 @@ cdef class View(OBIWrapper) :
        col.close()
        
        # Remove the column from the view which closes the C structure
-        if obi_view_delete_column(self.pointer(), column_name_b) < 0 :
+        if obi_view_delete_column(self.pointer(), column_name_b, delete_file) < 0 :
            raise RollbackException("Problem deleting column %s from a view",
                            bytes2str(column_name_b), self)

@ -297,11 +298,17 @@ cdef class View(OBIWrapper) :
                                       nb_elements_per_line=new_nb_elements_per_line, elements_names=new_elements_names, 
                                       comments=old_column.comments, alias=column_name_b+tobytes('___new___'))
        
+        switch_to_dict = old_column.nb_elements_per_line == 1 and new_nb_elements_per_line > 1
+        ori_key = old_column._elements_names[0]
+        
        for i in range(length) :
-            new_column[i] = old_column[i]
+            if switch_to_dict :
+                new_column[i] = {ori_key: old_column[i]}
+            else:
+                new_column[i] = old_column[i]

        # Remove old column from view
-        self.delete_column(column_name_b)
+        self.delete_column(column_name_b, delete_file=True)

        # Rename new
        new_column.name = column_name_b
--- a/python/obitools3/uri/decode.pyx
+++ b/python/obitools3/uri/decode.pyx
@ -171,7 +171,8 @@ Reads an URI and returns a tuple containing:
 def open_uri(uri,
             bint input=True,
             type newviewtype=View,
-             dms_only=False):
+             dms_only=False,
+             force_file=False):
    
    cdef bytes urib = tobytes(uri)
    cdef bytes scheme
@ -195,9 +196,9 @@ def open_uri(uri,
    if 'obi' not in config:
        config['obi']={}
    
-    try:
+    if not force_file and "defaultdms" in config["obi"]:
        default_dms=config["obi"]["defaultdms"]
-    except KeyError:
+    else:
        default_dms=None
        
    try:
--- a/python/obitools3/version.py
+++ b/python/obitools3/version.py
@ -1,5 +1,5 @@
 major = 3
 minor = 0
-serial= '0-beta7'
+serial= '0-beta11'

 version ="%d.%02d.%s" % (major,minor,serial)
--- a/src/obi_ecotag.c
+++ b/src/obi_ecotag.c
@ -100,35 +100,35 @@ int print_assignment_result(Obiview_p output_view, index_t line,
 static int create_output_columns(Obiview_p o_view)
 {
 	// Score column
-	if (obi_view_add_column(o_view, ECOTAG_SCORE_COLUMN_NAME, -1, NULL, OBI_FLOAT, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_SCORE_COLUMN_NAME, true) < 0)
+	if (obi_view_add_column(o_view, ECOTAG_SCORE_COLUMN_NAME, -1, NULL, OBI_FLOAT, 0, 1, NULL, false, false, false, NULL, NULL, -1, "{}", true) < 0)
 	{
 		obidebug(1, "\nError creating the column for the score in ecotag");
 		return -1;
 	}

 	// Assigned taxid column
-	if (obi_view_add_column(o_view, ECOTAG_TAXID_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_TAXID_COLUMN_NAME, true) < 0)
+	if (obi_view_add_column(o_view, ECOTAG_TAXID_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, "{}", true) < 0)
 	{
 		obidebug(1, "\nError creating the column for the assigned taxid in ecotag");
 		return -1;
 	}

 	// Assigned scientific name column
-	if (obi_view_add_column(o_view, ECOTAG_NAME_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_NAME_COLUMN_NAME, true) < 0)
+	if (obi_view_add_column(o_view, ECOTAG_NAME_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, false, false, NULL, NULL, -1, "{}", true) < 0)
 	{
 		obidebug(1, "\nError creating the column for the assigned scientific name in ecotag");
 		return -1;
 	}

 	// Assignement status column
-	if (obi_view_add_column(o_view, ECOTAG_STATUS_COLUMN_NAME, -1, NULL, OBI_BOOL, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_STATUS_COLUMN_NAME, true) < 0)
+	if (obi_view_add_column(o_view, ECOTAG_STATUS_COLUMN_NAME, -1, NULL, OBI_BOOL, 0, 1, NULL, false, false, false, NULL, NULL, -1, "{}", true) < 0)
 	{
 		obidebug(1, "\nError creating the column for the assignment status in ecotag");
 		return -1;
 	}

 	// Column for array of best match ids
-	if (obi_view_add_column(o_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, true, false, NULL, NULL, -1, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME, true) < 0)
+	if (obi_view_add_column(o_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, true, false, NULL, NULL, -1, "{}", true) < 0)
 	{
 		obidebug(1, "\nError creating the column for the array of ids of the best match in ecotag");
 		return -1;
--- a/src/obiview.c
+++ b/src/obiview.c
@ -2380,11 +2380,12 @@ int obi_view_add_column(Obiview_p    view,
 }


-int obi_view_delete_column(Obiview_p view, const char* column_name)
+int obi_view_delete_column(Obiview_p view, const char* column_name, bool delete_file)
 {
 	int  i;
 	bool found;
 	OBIDMS_column_p column;
+	char* col_to_delete_path;

 	// Check that the view is not read-only
 	if (view->read_only)
@ -2406,8 +2407,31 @@ int obi_view_delete_column(Obiview_p view, const char* column_name)
 				obidebug(1, "\nError getting a column from the linked list of column pointers of a view when deleting a column from a view");
 				return -1;
 			}
+			// Keep column path if need to delete the file
+			if (delete_file)
+			{
+				col_to_delete_path = obi_column_full_path(view->dms, column->header->name, column->header->version);
+				if (col_to_delete_path == NULL)
+				{
+					obidebug(1, "\nError getting a column file path when deleting a column");
+					return -1;
+				}
+			}

 			obi_close_column(column);
+
+			// Delete file if needed
+			if (delete_file)
+			{
+				if (remove(col_to_delete_path) < 0)
+				{
+					obi_set_errno(OBICOL_UNKNOWN_ERROR);
+					obidebug(1, "\nError deleting a column file when deleting unfinished columns: file %s", col_to_delete_path);
+					return -1;
+				}
+				free(col_to_delete_path);
+			}
+
 			view->columns = ll_delete(view->columns, i);
 			// TODO how do we check for error? NULL can be empty list
 			found = true;
@ -3047,7 +3071,7 @@ int obi_create_auto_id_column(Obiview_p view, const char* prefix)
 	// Delete old ID column if it exists
 	if (obi_view_get_column(view, ID_COLUMN) != NULL)
 	{
-		if (obi_view_delete_column(view, ID_COLUMN) < 0)
+		if (obi_view_delete_column(view, ID_COLUMN, false) < 0)
 		{
 			obidebug(1, "Error deleting an ID column to replace it in a view");
 			return -1;
--- a/src/obiview.h
+++ b/src/obiview.h
@ -440,6 +440,7 @@ int obi_view_add_column(Obiview_p    view,
 *
 * @param view A pointer on the view.
 * @param column_name The name of the column that should be deleted from the view.
+ * @param delete_file Whether the column file should be deleted. Use carefully re: dependencies.
 *
 * @returns A value indicating the success of the operation.
 * @retval 0 if the operation was successfully completed.
@ -448,7 +449,7 @@ int obi_view_add_column(Obiview_p    view,
 * @since February 2016
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
-int obi_view_delete_column(Obiview_p view, const char* column_name);
+int obi_view_delete_column(Obiview_p view, const char* column_name, bool delete_file);


 /**
Author	SHA1	Message	Date
Celine Mercier	0b4a234671	Switch to version 3.0.0-beta11	2020-02-12 14:23:42 +01:00
Celine Mercier	d32cfdcce5	ecotag: fixed the generated column comments formatting that would generate errors	2020-02-12 14:23:17 +01:00
Celine Mercier	219c0d6fdc	obi cat: Fixed the handling when concatenating views with dictionaries having different key sets	2020-02-12 14:21:39 +01:00
Celine Mercier	dc9f897917	switch to version 3.0.0-beta10	2020-02-02 21:15:27 +01:00
Celine Mercier	bb72682f7d	obi import: new option --preread to do a first readthrough of the dataset if it contains huge dictionaries for a much faster import.	2020-02-02 21:12:34 +01:00
Celine Mercier	52920c3c71	URI decoding: dirty temp fix for bug where default dms makes a mess when should guess file	2020-02-02 21:11:05 +01:00
Celine Mercier	18c22cecf9	switch to version 3.0.0-beta9	2020-02-01 15:48:55 +01:00
Celine Mercier	1bfb96023c	obi import: rewriting a column now deletes the old one to save disk space	2020-02-01 15:31:14 +01:00
Celine Mercier	c67d668989	obi import: fixed a bug when the first entry would contain a dictionary with one key. Switch to beta8	2020-01-29 20:23:39 +01:00