Switch to version 3.0.0-beta11

ecotag: fixed the generated column comments formatting that would
generate errors
2020-02-12 14:23:42 +01:00 · 2020-02-12 14:23:17 +01:00 · 2020-02-12 14:21:39 +01:00 · 2020-02-02 21:15:27 +01:00 · 2020-02-02 21:12:34 +01:00 · 2020-02-02 21:11:05 +01:00
5 changed files with 91 additions and 12 deletions
--- a/python/obitools3/commands/cat.pyx
+++ b/python/obitools3/commands/cat.pyx
@ -86,7 +86,24 @@ def run(config):
    if not remove_rev_qual:
        Column.new_column(o_view, REVERSE_SEQUENCE_COLUMN, OBI_SEQ)
        Column.new_column(o_view, REVERSE_QUALITY_COLUMN, OBI_QUAL, associated_column_name=REVERSE_SEQUENCE_COLUMN, associated_column_version=o_view[REVERSE_SEQUENCE_COLUMN].version)
-
+        
+    # Initialize multiple elements columns
+    dict_cols = {}
+    for v in iview_list:
+        for coln in v.keys():
+            if v[coln].nb_elements_per_line > 1:
+                if coln not in dict_cols:
+                    dict_cols[coln] = {}
+                    dict_cols[coln]['eltnames'] = set(v[coln].elements_names)
+                    dict_cols[coln]['nbelts'] = v[coln].nb_elements_per_line
+                    dict_cols[coln]['obitype'] = v[coln].data_type_int
+                else:
+                    dict_cols[coln]['eltnames'] = set(v[coln].elements_names + list(dict_cols[coln]['eltnames']))
+                    dict_cols[coln]['nbelts'] = len(dict_cols[coln]['eltnames'])
+    for coln in dict_cols:
+        Column.new_column(o_view, coln, dict_cols[coln]['obitype'], 
+                          nb_elements_per_line=dict_cols[coln]['nbelts'], elements_names=list(dict_cols[coln]['eltnames']))
+    
    # Initialize the progress bar
    pb = ProgressBar(total_len, config, seconde=5)
    
--- a/python/obitools3/commands/import.pyx
+++ b/python/obitools3/commands/import.pyx
@ -11,6 +11,7 @@ from obitools3.dms.column.column cimport Column
 from obitools3.dms.obiseq cimport Nuc_Seq
 from obitools3.dms import DMS
 from obitools3.dms.taxo.taxo cimport Taxonomy
+from obitools3.files.uncompress cimport CompressedFile


 from obitools3.utils cimport tobytes, \
@ -65,6 +66,14 @@ def addOptions(parser):
    addTaxdumpInputOption(parser)
    addMinimalOutputOption(parser)

+    group = parser.add_argument_group('obi import specific options')
+
+    group.add_argument('--preread',
+                     action="store_true", dest="import:preread",
+                     default=False,
+                     help="Do a first readthrough of the dataset if it contains huge dictionaries (more than 100 keys) for "
+                          "a much faster import.")
+

 def run(config):
    
@ -169,8 +178,6 @@ def run(config):

    if entry_count >= 0:
        pb = ProgressBar(entry_count, config, seconde=5)
-    
-    entries = input[1]
        
    NUC_SEQS_view = False
    if isinstance(output[1], View) :
@ -188,6 +195,60 @@ def run(config):
        
    dcols = {}
        
+    # First read through the entries to prepare columns with dictionaries as they are very time-expensive to rewrite
+    if config['import']['preread']:
+        logger("info", "First readthrough...")
+        entries = input[1]
+        i = 0
+        dict_dict = {}
+        for entry in entries:
+            PyErr_CheckSignals()
+        
+            if entry is None:  # error or exception handled at lower level, not raised because Python generators can't resume after any exception is raised
+                if config['obi']['skiperror']:
+                    i-=1
+                    continue
+                else:
+                    raise Exception("obi import error in first readthrough")
+            
+            if pb is not None:
+                pb(i)
+            elif not i%50000:
+                logger("info", "Read %d entries", i)
+    
+            for tag in entry :
+                if type(entry[tag]) == dict :
+                    if tag in dict_dict:
+                        dict_dict[tag][0].update(entry[tag].keys())
+                    else:
+                        dict_dict[tag] = [set(entry[tag].keys()), get_obitype(entry[tag])]
+            i+=1
+        
+        if pb is not None:
+            pb(i, force=True)
+            print("", file=sys.stderr)
+       
+        for tag in dict_dict:
+            dcols[tag] = (Column.new_column(view, tag, dict_dict[tag][1], \
+                              nb_elements_per_line=len(dict_dict[tag][0]), \
+                              elements_names=list(dict_dict[tag][0])), \
+                          value_obitype)
+    
+        
+        # Reinitialize the input
+        if isinstance(input[0], CompressedFile):
+            input_is_file = True
+        if entry_count >= 0:
+            pb = ProgressBar(entry_count, config, seconde=5)
+        try:
+            input[0].close()
+        except AttributeError:
+            pass
+        input = open_uri(config['obi']['inputURI'], force_file=input_is_file)
+        if input is None:
+            raise Exception("Could not open input URI")
+    
+    entries = input[1]
    i = 0
    for entry in entries :
        
--- a/python/obitools3/uri/decode.pyx
+++ b/python/obitools3/uri/decode.pyx
@ -171,7 +171,8 @@ Reads an URI and returns a tuple containing:
 def open_uri(uri,
             bint input=True,
             type newviewtype=View,
-             dms_only=False):
+             dms_only=False,
+             force_file=False):
    
    cdef bytes urib = tobytes(uri)
    cdef bytes scheme
@ -195,9 +196,9 @@ def open_uri(uri,
    if 'obi' not in config:
        config['obi']={}
    
-    try:
+    if not force_file and "defaultdms" in config["obi"]:
        default_dms=config["obi"]["defaultdms"]
-    except KeyError:
+    else:
        default_dms=None
        
    try:
--- a/python/obitools3/version.py
+++ b/python/obitools3/version.py
@ -1,5 +1,5 @@
 major = 3
 minor = 0
-serial= '0-beta9'
+serial= '0-beta11'

 version ="%d.%02d.%s" % (major,minor,serial)
--- a/src/obi_ecotag.c
+++ b/src/obi_ecotag.c
@ -100,35 +100,35 @@ int print_assignment_result(Obiview_p output_view, index_t line,
 static int create_output_columns(Obiview_p o_view)
 {
 	// Score column
-	if (obi_view_add_column(o_view, ECOTAG_SCORE_COLUMN_NAME, -1, NULL, OBI_FLOAT, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_SCORE_COLUMN_NAME, true) < 0)
+	if (obi_view_add_column(o_view, ECOTAG_SCORE_COLUMN_NAME, -1, NULL, OBI_FLOAT, 0, 1, NULL, false, false, false, NULL, NULL, -1, "{}", true) < 0)
 	{
 		obidebug(1, "\nError creating the column for the score in ecotag");
 		return -1;
 	}

 	// Assigned taxid column
-	if (obi_view_add_column(o_view, ECOTAG_TAXID_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_TAXID_COLUMN_NAME, true) < 0)
+	if (obi_view_add_column(o_view, ECOTAG_TAXID_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, "{}", true) < 0)
 	{
 		obidebug(1, "\nError creating the column for the assigned taxid in ecotag");
 		return -1;
 	}

 	// Assigned scientific name column
-	if (obi_view_add_column(o_view, ECOTAG_NAME_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_NAME_COLUMN_NAME, true) < 0)
+	if (obi_view_add_column(o_view, ECOTAG_NAME_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, false, false, NULL, NULL, -1, "{}", true) < 0)
 	{
 		obidebug(1, "\nError creating the column for the assigned scientific name in ecotag");
 		return -1;
 	}

 	// Assignement status column
-	if (obi_view_add_column(o_view, ECOTAG_STATUS_COLUMN_NAME, -1, NULL, OBI_BOOL, 0, 1, NULL, false, false, false, NULL, NULL, -1, ECOTAG_STATUS_COLUMN_NAME, true) < 0)
+	if (obi_view_add_column(o_view, ECOTAG_STATUS_COLUMN_NAME, -1, NULL, OBI_BOOL, 0, 1, NULL, false, false, false, NULL, NULL, -1, "{}", true) < 0)
 	{
 		obidebug(1, "\nError creating the column for the assignment status in ecotag");
 		return -1;
 	}

 	// Column for array of best match ids
-	if (obi_view_add_column(o_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, true, false, NULL, NULL, -1, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME, true) < 0)
+	if (obi_view_add_column(o_view, ECOTAG_BEST_MATCH_IDS_COLUMN_NAME, -1, NULL, OBI_STR, 0, 1, NULL, false, true, false, NULL, NULL, -1, "{}", true) < 0)
 	{
 		obidebug(1, "\nError creating the column for the array of ids of the best match in ecotag");
 		return -1;
Author	SHA1	Message	Date
Celine Mercier	0b4a234671	Switch to version 3.0.0-beta11	2020-02-12 14:23:42 +01:00
Celine Mercier	d32cfdcce5	ecotag: fixed the generated column comments formatting that would generate errors	2020-02-12 14:23:17 +01:00
Celine Mercier	219c0d6fdc	obi cat: Fixed the handling when concatenating views with dictionaries having different key sets	2020-02-12 14:21:39 +01:00
Celine Mercier	dc9f897917	switch to version 3.0.0-beta10	2020-02-02 21:15:27 +01:00
Celine Mercier	bb72682f7d	obi import: new option --preread to do a first readthrough of the dataset if it contains huge dictionaries for a much faster import.	2020-02-02 21:12:34 +01:00
Celine Mercier	52920c3c71	URI decoding: dirty temp fix for bug where default dms makes a mess when should guess file	2020-02-02 21:11:05 +01:00