data import: entries now counted if there are multiple files

This commit is contained in:
Celine Mercier
2019-03-18 18:16:39 +01:00
parent 7e20870719
commit 802a3f5933
2 changed files with 50 additions and 9 deletions

View File

@ -131,7 +131,10 @@ def run(config):
output[0].close()
return
pb = ProgressBar(entry_count, config, seconde=5)
if entry_count >= 0:
pb = ProgressBar(entry_count, config, seconde=5)
else:
pb = None
entries = input[1]
@ -161,7 +164,8 @@ def run(config):
else:
raise RollbackException("obi import error, rollbacking view", view)
pb(i)
if pb is not None:
pb(i)
if NUC_SEQS_view:
id_col[i] = entry.id
@ -271,10 +275,11 @@ def run(config):
# Fill value
dcols[tag][0][i] = value
i+=1 # TODO Not if None sequence
i+=1
pb(i, force=True)
print("", file=sys.stderr)
if pb is not None:
pb(i, force=True)
print("", file=sys.stderr)
# Save command config in View and DMS comments
command_line = " ".join(sys.argv[1:])

View File

@ -13,10 +13,13 @@ from obitools3.dms.capi.obitypes cimport is_a_DNA_seq, \
from obitools3.dms.capi.obierrno cimport OBI_LINE_IDX_ERROR, \
OBI_ELT_IDX_ERROR
#obi_errno
#obi_errno # TODO
import re
import mmap
import os
import glob
import gzip
cpdef bytes format_separator(bytes format):
@ -35,16 +38,49 @@ cpdef bytes format_separator(bytes format):
cpdef int count_entries(file, bytes format):
    """
    Count the entries of an input by counting the matches of the entry
    separator associated with its format.

    :param file: an open binary file object exposing fileno(), or, for the
        b'genbank' and b'embl' formats only, the path (str or bytes) of a
        directory whose *.gbff (genbank) or *.dat (embl) files are all counted
    :param format: the name of the input format, passed to format_separator()
    :returns: the total number of separator matches over all the files, or -1
        when the count can not be computed (format_separator() returns None,
        no matching file in the directory, or any error while opening or
        memory-mapping a file)
    """

    # Files opened by this function (as opposed to the file object handed in by
    # the caller). Defined before the try block so that the cleanup below can
    # always rely on it, and so that only these files get closed.
    opened = []

    try:
        sep = format_separator(format)
        if sep is None:
            return -1
        sep = re.compile(sep)

        if isinstance(file, (str, bytes)) and (format == b'genbank' or format == b'embl'):
            # file is actually a directory with multiple files
            if format == b'embl':
                extensions = [b"*.dat"]
            else:
                extensions = [b"*.gbff"]
            # The glob patterns are bytes, so the directory path must be too
            directory = os.fsencode(file)
            for ext in extensions:
                for filename in glob.glob(os.path.join(directory, ext)):
                    # TODO gzipped files (*.gz) are not handled
                    opened.append(open(filename, "rb"))
            files = opened
        else:
            files = [file]

        if len(files) == 0:
            return -1

        total_count = 0
        for f in files:
            mmapped_file = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
            try:
                total_count += len(sep.findall(mmapped_file))
            finally:
                # Release the mapping as soon as the file has been counted
                mmapped_file.close()

        return total_count

    except Exception:
        # Counting is best effort: callers treat a negative count as "unknown"
        return -1

    finally:
        for f in opened:
            f.close()
# TODO RollbackException?
cdef obi_errno_to_exception(int obi_errno, index_t line_nb=-1, object elt_id=None, str error_message=None) :