From 802a3f5933374c5fd033f4afdb2505952ab0526d Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Mon, 18 Mar 2019 18:16:39 +0100 Subject: [PATCH] data import: entries now counted if there are multiple files --- python/obitools3/commands/import.pyx | 17 +++++++---- python/obitools3/utils.pyx | 42 ++++++++++++++++++++++++++-- 2 files changed, 50 insertions(+), 9 deletions(-) diff --git a/python/obitools3/commands/import.pyx b/python/obitools3/commands/import.pyx index 53a63b1..52eb371 100755 --- a/python/obitools3/commands/import.pyx +++ b/python/obitools3/commands/import.pyx @@ -130,8 +130,11 @@ def run(config): output[0].record_command_line(" ".join(sys.argv[1:])) output[0].close() return - - pb = ProgressBar(entry_count, config, seconde=5) + + if entry_count >= 0: + pb = ProgressBar(entry_count, config, seconde=5) + else: + pb = None entries = input[1] @@ -161,7 +164,8 @@ def run(config): else: raise RollbackException("obi import error, rollbacking view", view) - pb(i) + if pb is not None: + pb(i) if NUC_SEQS_view: id_col[i] = entry.id @@ -271,10 +275,11 @@ def run(config): # Fill value dcols[tag][0][i] = value - i+=1 # TODO Not if None sequence + i+=1 - pb(i, force=True) - print("", file=sys.stderr) + if pb is not None: + pb(i, force=True) + print("", file=sys.stderr) # Save command config in View and DMS comments command_line = " ".join(sys.argv[1:]) diff --git a/python/obitools3/utils.pyx b/python/obitools3/utils.pyx index a139e5a..1699419 100755 --- a/python/obitools3/utils.pyx +++ b/python/obitools3/utils.pyx @@ -13,10 +13,13 @@ from obitools3.dms.capi.obitypes cimport is_a_DNA_seq, \ from obitools3.dms.capi.obierrno cimport OBI_LINE_IDX_ERROR, \ OBI_ELT_IDX_ERROR - #obi_errno + #obi_errno # TODO import re import mmap +import os +import glob +import gzip cpdef bytes format_separator(bytes format): @@ -35,16 +38,49 @@ cpdef bytes format_separator(bytes format): cpdef int count_entries(file, bytes format): + try: sep = format_separator(format) if sep is None: return -1 sep = re.compile(sep) - mmapped_file = mmap.mmap(file.fileno(), 0, access=mmap.ACCESS_READ) - return len(re.findall(sep, mmapped_file)) + + if type(file) and (format == b'genbank' or format == b'embl'): # file is actually a directory with multiple files + files = [] + if format == b'embl': + extensions = [b"*.dat"] + elif format == b"genbank": + extensions = [b"*.gbff"] + + for ext in extensions: + for filename in glob.glob(os.path.join(file, ext)): + #if filename[:-3] == ".gz": + # files.append(gzip.open(filename, "rb")) + #else: + files.append(open(filename, "rb")) + else: + files = [file] + + if len(files)==0: + return -1 + + total_count = 0 + for f in files: + mmapped_file = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) + total_count += len(re.findall(sep, mmapped_file)) + except: + if len(files) > 1: + for file in files: + file.close() return -1 + if len(files) > 1: + for f in files: + f.close() + + return total_count + # TODO RollbackException? cdef obi_errno_to_exception(int obi_errno, index_t line_nb=-1, object elt_id=None, str error_message=None) :