data import: entries now counted if there are multiple files

This commit is contained in:
Celine Mercier
2019-03-18 18:16:39 +01:00
parent 7e20870719
commit 802a3f5933
2 changed files with 50 additions and 9 deletions

View File

@ -131,7 +131,10 @@ def run(config):
output[0].close() output[0].close()
return return
pb = ProgressBar(entry_count, config, seconde=5) if entry_count >= 0:
pb = ProgressBar(entry_count, config, seconde=5)
else:
pb = None
entries = input[1] entries = input[1]
@ -161,7 +164,8 @@ def run(config):
else: else:
raise RollbackException("obi import error, rollbacking view", view) raise RollbackException("obi import error, rollbacking view", view)
pb(i) if pb is not None:
pb(i)
if NUC_SEQS_view: if NUC_SEQS_view:
id_col[i] = entry.id id_col[i] = entry.id
@ -271,10 +275,11 @@ def run(config):
# Fill value # Fill value
dcols[tag][0][i] = value dcols[tag][0][i] = value
i+=1 # TODO Not if None sequence i+=1
pb(i, force=True) if pb is not None:
print("", file=sys.stderr) pb(i, force=True)
print("", file=sys.stderr)
# Save command config in View and DMS comments # Save command config in View and DMS comments
command_line = " ".join(sys.argv[1:]) command_line = " ".join(sys.argv[1:])

View File

@ -13,10 +13,13 @@ from obitools3.dms.capi.obitypes cimport is_a_DNA_seq, \
from obitools3.dms.capi.obierrno cimport OBI_LINE_IDX_ERROR, \ from obitools3.dms.capi.obierrno cimport OBI_LINE_IDX_ERROR, \
OBI_ELT_IDX_ERROR OBI_ELT_IDX_ERROR
#obi_errno #obi_errno # TODO
import re import re
import mmap import mmap
import os
import glob
import gzip
cpdef bytes format_separator(bytes format): cpdef bytes format_separator(bytes format):
@ -35,16 +38,49 @@ cpdef bytes format_separator(bytes format):
cpdef int count_entries(file, bytes format):
    """
    Count the entries of an input by counting the matches of its format's
    entry separator.

    :param file: either an open binary file object exposing ``fileno()``, or,
        for the ``b'genbank'`` and ``b'embl'`` formats, the path (``bytes`` or
        ``str``) of a directory whose ``*.gbff`` / ``*.dat`` files are all counted.
    :param format: the format name, passed to ``format_separator()`` to get
        the entry separator pattern.
    :returns: the total number of separator matches over all the files, or -1
        if the count can not be computed (no separator known for the format,
        no matching file in the directory, or any error while reading).
    """

    # Glob patterns of the files to scan when `file` is a directory path.
    # Compressed (.gz) files are not matched by these patterns: they can not
    # be scanned through mmap.
    multi_file_patterns = {b'embl': (b"*.dat",),
                           b'genbank': (b"*.gbff",)}

    # Only the files opened by this function are closed by it; a file object
    # handed in by the caller is left open.
    opened_here = []

    try:
        sep = format_separator(format)
        if sep is None:
            return -1
        sep = re.compile(sep)

        if format in multi_file_patterns and isinstance(file, (str, bytes)):
            # file is actually a directory with multiple files.
            # The patterns are bytes, so the directory path is normalized to bytes too.
            directory = os.fsencode(file)
            for pattern in multi_file_patterns[format]:
                for filename in glob.glob(os.path.join(directory, pattern)):
                    opened_here.append(open(filename, "rb"))
            files = opened_here
        else:
            files = [file]

        if not files:
            return -1

        total_count = 0
        for f in files:
            # The mapping is released as soon as the file has been scanned
            with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mmapped_file:
                total_count += len(sep.findall(mmapped_file))

        return total_count

    except Exception:
        # Counting is best-effort: on any failure the count is reported as unknown
        return -1

    finally:
        for f in opened_here:
            f.close()
# TODO RollbackException? # TODO RollbackException?
cdef obi_errno_to_exception(int obi_errno, index_t line_nb=-1, object elt_id=None, str error_message=None) : cdef obi_errno_to_exception(int obi_errno, index_t line_nb=-1, object elt_id=None, str error_message=None) :