From bb72682f7dc160222f6929a14c55115e8f159caf Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Sun, 2 Feb 2020 21:12:34 +0100 Subject: [PATCH] obi import: new option --preread to do a first readthrough of the dataset if it contains huge dictionaries for a much faster import. --- python/obitools3/commands/import.pyx | 65 +++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 2 deletions(-) diff --git a/python/obitools3/commands/import.pyx b/python/obitools3/commands/import.pyx index 22d59f8..377ad0a 100755 --- a/python/obitools3/commands/import.pyx +++ b/python/obitools3/commands/import.pyx @@ -11,6 +11,7 @@ from obitools3.dms.column.column cimport Column from obitools3.dms.obiseq cimport Nuc_Seq from obitools3.dms import DMS from obitools3.dms.taxo.taxo cimport Taxonomy +from obitools3.files.uncompress cimport CompressedFile from obitools3.utils cimport tobytes, \ @@ -65,6 +66,14 @@ def addOptions(parser): addTaxdumpInputOption(parser) addMinimalOutputOption(parser) + group = parser.add_argument_group('obi import specific options') + + group.add_argument('--preread', + action="store_true", dest="import:preread", + default=False, + help="Do a first readthrough of the dataset if it contains huge dictionaries (more than 100 keys) for " + "a much faster import.") + def run(config): @@ -169,8 +178,6 @@ def run(config): if entry_count >= 0: pb = ProgressBar(entry_count, config, seconde=5) - - entries = input[1] NUC_SEQS_view = False if isinstance(output[1], View) : @@ -188,6 +195,60 @@ def run(config): dcols = {} + # First read through the entries to prepare columns with dictionaries as they are very time-expensive to rewrite + if config['import']['preread']: + logger("info", "First readthrough...") + entries = input[1] + i = 0 + dict_dict = {} + for entry in entries: + PyErr_CheckSignals() + + if entry is None: # error or exception handled at lower level, not raised because Python generators can't resume after any exception is raised + if 
config['obi']['skiperror']: + i-=1 + continue + else: + raise Exception("obi import error in first readthrough") + + if pb is not None: + pb(i) + elif not i%50000: + logger("info", "Read %d entries", i) + + for tag in entry : + if type(entry[tag]) == dict : + if tag in dict_dict: + dict_dict[tag][0].update(entry[tag].keys()) + else: + dict_dict[tag] = [set(entry[tag].keys()), get_obitype(entry[tag])] + i+=1 + + if pb is not None: + pb(i, force=True) + print("", file=sys.stderr) + + for tag in dict_dict: + dcols[tag] = (Column.new_column(view, tag, dict_dict[tag][1], \ + nb_elements_per_line=len(dict_dict[tag][0]), \ + elements_names=list(dict_dict[tag][0])), \ + dict_dict[tag][1]) # obitype recorded for this tag during the preread; value_obitype is never assigned in this pass + + + # Reinitialize the input + if isinstance(input[0], CompressedFile): + input_is_file = True + if entry_count >= 0: + pb = ProgressBar(entry_count, config, seconde=5) + try: + input[0].close() + except AttributeError: + pass + input = open_uri(config['obi']['inputURI'], force_file=input_is_file) + if input is None: + raise Exception("Could not open input URI") + + entries = input[1] i = 0 for entry in entries :