Improved progress display when importing files in a DMS

This commit is contained in:
Celine Mercier
2019-08-29 10:12:06 +02:00
parent 728af51cb2
commit 3cfe3a9b00
5 changed files with 35 additions and 10 deletions

View File

@ -99,6 +99,7 @@ def run(config):
logger("info", "obi import: imports an object (file(s), obiview, taxonomy...) into a DMS") logger("info", "obi import: imports an object (file(s), obiview, taxonomy...) into a DMS")
entry_count = -1 entry_count = -1
pb = None
if not config['obi']['taxdump']: if not config['obi']['taxdump']:
input = open_uri(config['obi']['inputURI']) input = open_uri(config['obi']['inputURI'])
@ -110,7 +111,10 @@ def run(config):
else: else:
entry_count = input[4] entry_count = input[4]
logger("info", "Importing %d entries", entry_count) if entry_count > 0:
logger("info", "Importing %d entries", entry_count)
else:
logger("info", "Importing an unknow number of entries")
# TODO a bit dirty? # TODO a bit dirty?
if input[2]==Nuc_Seq or input[2]==View_NUC_SEQS: if input[2]==Nuc_Seq or input[2]==View_NUC_SEQS:
@ -137,8 +141,6 @@ def run(config):
if entry_count >= 0: if entry_count >= 0:
pb = ProgressBar(entry_count, config, seconde=5) pb = ProgressBar(entry_count, config, seconde=5)
else:
pb = None
entries = input[1] entries = input[1]
@ -170,6 +172,9 @@ def run(config):
if pb is not None: if pb is not None:
pb(i) pb(i)
elif not i%50000:
logger("info", "Imported %d entries", i)
if NUC_SEQS_view: if NUC_SEQS_view:
id_col[i] = entry.id id_col[i] = entry.id

View File

@ -15,4 +15,5 @@ cdef class MagicKeyFile:
cdef class CompressedFile: cdef class CompressedFile:
cdef object accessor cdef object accessor
cdef bint compressed

View File

@ -74,8 +74,7 @@ cdef class MagicKeyFile:
cdef class CompressedFile: cdef class CompressedFile:
def __init__(self,stream): def __init__(self,stream):
cdef int keylength cdef int keylength
cdef MagicKeyFile magic cdef MagicKeyFile magic
@ -92,11 +91,13 @@ cdef class CompressedFile:
magic=MagicKeyFile(stream,keylength) magic=MagicKeyFile(stream,keylength)
self.accessor = None self.accessor = None
self.compressed = False
for compressor in compress: for compressor in compress:
k,c = compress[compressor] k,c = compress[compressor]
if magic.key.startswith(k): if magic.key.startswith(k):
self.accessor = c(magic) self.accessor = c(magic)
self.compressed = True
if self.accessor is None: if self.accessor is None:
if 'b' in magic.stream_mode: if 'b' in magic.stream_mode:
@ -110,7 +111,17 @@ cdef class CompressedFile:
'b' not in magic.stream_mode): 'b' not in magic.stream_mode):
self.accessor = io.TextIOWrapper(self.accessor) self.accessor = io.TextIOWrapper(self.accessor)
# compressed property getter
@property
def compressed(self) :
'''
Returns a boolean indicating whether the file is compressed
@rtype: bint
'''
return self.compressed
def __getattr__(self,name): def __getattr__(self,name):
return getattr(self.accessor, name) return getattr(self.accessor, name)

View File

@ -166,9 +166,12 @@ def genbankIterator_dir(dir_path,
): ):
path = dir_path path = dir_path
read = 0 read = 0
for filename in glob.glob(os.path.join(path, b'*.gbff*')): read_files = 0
files = [filename for filename in glob.glob(os.path.join(path, b'*.gbff*'))]
for filename in files:
if read==only: if read==only:
return return
print("Parsing file %s (%d/%d)" % (tostr(filename), read_files, len(files)))
f = uopen(filename) f = uopen(filename)
if only is not None: if only is not None:
only_f = only-read only_f = only-read
@ -177,7 +180,8 @@ def genbankIterator_dir(dir_path,
for seq in genbankIterator_file(f, skip=skip, only=only_f, buffersize=buffersize): for seq in genbankIterator_file(f, skip=skip, only=only_f, buffersize=buffersize):
yield seq yield seq
read+=1 read+=1
read_files+=1
def genbankIterator(obj, def genbankIterator(obj,
int skip=0, int skip=0,

View File

@ -15,6 +15,8 @@ from obitools3.dms.capi.obierrno cimport OBI_LINE_IDX_ERROR, \
OBI_ELT_IDX_ERROR, \ OBI_ELT_IDX_ERROR, \
obi_errno obi_errno
from obitools3.files.uncompress cimport CompressedFile
import re import re
import mmap import mmap
import os import os
@ -45,7 +47,7 @@ cpdef int count_entries(file, bytes format):
return -1 return -1
sep = re.compile(sep) sep = re.compile(sep)
if type(file) and (format == b'genbank' or format == b'embl'): # file is actually a directory with multiple files if type(file) == bytes and (format == b'genbank' or format == b'embl'): # file is actually a directory with multiple files
files = [] files = []
if format == b'embl': if format == b'embl':
extensions = [b"*.dat"] extensions = [b"*.dat"]
@ -66,6 +68,8 @@ cpdef int count_entries(file, bytes format):
total_count = 0 total_count = 0
for f in files: for f in files:
if type(f) == CompressedFile and f.compressed:
return -1
mmapped_file = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) mmapped_file = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
total_count += len(re.findall(sep, mmapped_file)) total_count += len(re.findall(sep, mmapped_file))