Improved progress display when importing files in a DMS

This commit is contained in:
Celine Mercier
2019-08-29 10:12:06 +02:00
parent 728af51cb2
commit 3cfe3a9b00
5 changed files with 35 additions and 10 deletions

View File

@ -99,6 +99,7 @@ def run(config):
logger("info", "obi import: imports an object (file(s), obiview, taxonomy...) into a DMS")
entry_count = -1
pb = None
if not config['obi']['taxdump']:
input = open_uri(config['obi']['inputURI'])
@ -110,7 +111,10 @@ def run(config):
else:
entry_count = input[4]
logger("info", "Importing %d entries", entry_count)
if entry_count > 0:
logger("info", "Importing %d entries", entry_count)
else:
logger("info", "Importing an unknow number of entries")
# TODO a bit dirty?
if input[2]==Nuc_Seq or input[2]==View_NUC_SEQS:
@ -137,8 +141,6 @@ def run(config):
if entry_count >= 0:
pb = ProgressBar(entry_count, config, seconde=5)
else:
pb = None
entries = input[1]
@ -170,6 +172,9 @@ def run(config):
if pb is not None:
pb(i)
elif not i%50000:
logger("info", "Imported %d entries", i)
if NUC_SEQS_view:
id_col[i] = entry.id

View File

@ -15,4 +15,5 @@ cdef class MagicKeyFile:
cdef class CompressedFile:
cdef object accessor
cdef bint compressed

View File

@ -74,8 +74,7 @@ cdef class MagicKeyFile:
cdef class CompressedFile:
def __init__(self,stream):
cdef int keylength
cdef MagicKeyFile magic
@ -92,11 +91,13 @@ cdef class CompressedFile:
magic=MagicKeyFile(stream,keylength)
self.accessor = None
self.compressed = False
for compressor in compress:
k,c = compress[compressor]
if magic.key.startswith(k):
self.accessor = c(magic)
self.compressed = True
if self.accessor is None:
if 'b' in magic.stream_mode:
@ -110,7 +111,17 @@ cdef class CompressedFile:
'b' not in magic.stream_mode):
self.accessor = io.TextIOWrapper(self.accessor)
# compressed property getter
@property
def compressed(self) :
'''
Tells whether the stream given to the constructor was recognised as compressed.

The value read here is the C-level ``compressed`` flag of the instance:
``__init__`` first sets it to False, then switches it to True when the
magic key of the stream starts with the key of one of the entries of
``compress`` (in which case the matching accessor wraps the stream).

@return: True if a compressor was matched in ``__init__``, False otherwise
@rtype: bint
'''
return self.compressed
def __getattr__(self,name):
'''
Delegates the lookup of attribute `name` to the wrapped accessor object.

@param name: name of the attribute to fetch on ``self.accessor``
@return: the result of ``getattr(self.accessor, name)``; no default is
         supplied, so an AttributeError raised by that call is not caught here
'''
return getattr(self.accessor, name)

View File

@ -166,9 +166,12 @@ def genbankIterator_dir(dir_path,
):
path = dir_path
read = 0
for filename in glob.glob(os.path.join(path, b'*.gbff*')):
read_files = 0
files = [filename for filename in glob.glob(os.path.join(path, b'*.gbff*'))]
for filename in files:
if read==only:
return
print("Parsing file %s (%d/%d)" % (tostr(filename), read_files, len(files)))
f = uopen(filename)
if only is not None:
only_f = only-read
@ -177,7 +180,8 @@ def genbankIterator_dir(dir_path,
for seq in genbankIterator_file(f, skip=skip, only=only_f, buffersize=buffersize):
yield seq
read+=1
read_files+=1
def genbankIterator(obj,
int skip=0,

View File

@ -15,6 +15,8 @@ from obitools3.dms.capi.obierrno cimport OBI_LINE_IDX_ERROR, \
OBI_ELT_IDX_ERROR, \
obi_errno
from obitools3.files.uncompress cimport CompressedFile
import re
import mmap
import os
@ -45,7 +47,7 @@ cpdef int count_entries(file, bytes format):
return -1
sep = re.compile(sep)
if type(file) and (format == b'genbank' or format == b'embl'): # file is actually a directory with multiple files
if type(file) == bytes and (format == b'genbank' or format == b'embl'): # file is actually a directory with multiple files
files = []
if format == b'embl':
extensions = [b"*.dat"]
@ -66,6 +68,8 @@ cpdef int count_entries(file, bytes format):
total_count = 0
for f in files:
if type(f) == CompressedFile and f.compressed:
return -1
mmapped_file = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
total_count += len(re.findall(sep, mmapped_file))