From d88390c6d8542a06b6b8a23a02ddeb9dc812ee3b Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Wed, 13 Mar 2019 18:35:32 +0100 Subject: [PATCH] Cython API: when importing a file in a DMS, its length is computed beforehand for the progress bar --- python/obitools3/commands/import.pyx | 23 +++++++++---- python/obitools3/parsers/universal.pyx | 47 ++++++++++++++------------ python/obitools3/uri/decode.pyx | 45 +++++++++++++----------- python/obitools3/utils.pxd | 2 ++ python/obitools3/utils.pyx | 28 +++++++++++++++ 5 files changed, 97 insertions(+), 48 deletions(-) diff --git a/python/obitools3/commands/import.pyx b/python/obitools3/commands/import.pyx index 418a9a9..bb4b994 100755 --- a/python/obitools3/commands/import.pyx +++ b/python/obitools3/commands/import.pyx @@ -87,14 +87,19 @@ def run(config): DMS.obi_atexit() - logger("info", "obi import : imports file into a DMS") + logger("info", "obi import: imports an object (file(s), obiview, taxonomy...) into a DMS") + + entry_count = -1 if not config['obi']['taxdump']: input = open_uri(config['obi']['inputURI']) if input is None: # TODO check for bytes instead now? raise Exception("Could not open input URI") - - # TODO uuuuh + + entry_count = input[4] + logger("info", "Importing %d entries", entry_count) + + # TODO a bit dirty if input[2]==Nuc_Seq: v = View_NUC_SEQS else: @@ -117,7 +122,7 @@ def run(config): output[0].close() return - pb = ProgressBar(10000000, config, seconde=5) # TODO should be number of records in file + pb = ProgressBar(entry_count, config, seconde=5) entries = input[1] @@ -250,14 +255,17 @@ def run(config): i+=1 + pb(i, force=True) + print("", file=sys.stderr) + # Save command config in View and DMS comments command_line = " ".join(sys.argv[1:]) view.write_config(config, "import", command_line, input_str=[os.path.abspath(config['obi']['inputURI'])]) output[0].record_command_line(command_line) - print("\n") - print(view.__repr__()) - + #print("\n\nOutput view:\n````````````", file=sys.stderr) + #print(repr(view), file=sys.stderr) + try: input[0].close() except AttributeError: @@ -267,3 +275,4 @@ def run(config): except AttributeError: pass + logger("info", "Done.") diff --git a/python/obitools3/parsers/universal.pyx b/python/obitools3/parsers/universal.pyx index 942e661..beb309c 100755 --- a/python/obitools3/parsers/universal.pyx +++ b/python/obitools3/parsers/universal.pyx @@ -23,20 +23,20 @@ def is_ngsfilter_line(line): # TODO doesn't work? return False def entryIteratorFactory(lineiterator, - int skip=0, - only=None, - bytes seqtype=b'nuc', - int offset=-1, - bint noquality=False, - bint skiperror=True, - bint header=False, - bytes sep=None, - bytes dec=b'.', - bytes nastring=b"NA", - bint stripwhite=True, - bint blanklineskip=True, - bytes commentchar=b"#", - int buffersize=100000000): + int skip=0, + only=None, + bytes seqtype=b'nuc', + int offset=-1, + bint noquality=False, + bint skiperror=True, + bint header=False, + bytes sep=None, + bytes dec=b'.', + bytes nastring=b"NA", + bint stripwhite=True, + bint blanklineskip=True, + bytes commentchar=b"#", + int buffersize=100000000): if isinstance(lineiterator, (str, bytes)): lineiterator=uopen(lineiterator) @@ -65,7 +65,7 @@ def entryIteratorFactory(lineiterator, format=b"embl" elif first[0:6]==b'LOCUS ': format=b"genbank" - elif first[0:11]==b'#@ecopcr-v2': # TODO v2???? + elif first[0:8]==b'#@ecopcr': format=b"ecopcrfile" elif is_ngsfilter_line(first): format=b"ngsfilter" @@ -83,7 +83,8 @@ def entryIteratorFactory(lineiterator, firstline=first, buffersize=buffersize, nastring=nastring), - Nuc_Seq) + Nuc_Seq, + format) else: raise NotImplementedError() elif format==b'fastq': @@ -94,7 +95,8 @@ def entryIteratorFactory(lineiterator, firstline=first, buffersize=buffersize, nastring=nastring), - Nuc_Seq) + Nuc_Seq, + format) elif format==b'tabular': return (tabIterator(lineiterator, header = header, @@ -108,7 +110,8 @@ def entryIteratorFactory(lineiterator, only = only, firstline=first, buffersize=buffersize), - dict) + dict, + format) elif format==b'ngsfilter': return (ngsfilterIterator(lineiterator, sep = sep, @@ -121,7 +124,8 @@ def entryIteratorFactory(lineiterator, only = only, firstline=first, buffersize=buffersize), - dict) + dict, + format) elif format==b'embl': return (emblIterator(lineiterator, @@ -129,7 +133,8 @@ def entryIteratorFactory(lineiterator, only=only, firstline=first, buffersize=buffersize), - dict) + dict, + format) - raise NotImplementedError('File format not yet implemented') + raise NotImplementedError('File format iterator not implemented yet') diff --git a/python/obitools3/uri/decode.pyx b/python/obitools3/uri/decode.pyx index 089cecd..1d55e06 100755 --- a/python/obitools3/uri/decode.pyx +++ b/python/obitools3/uri/decode.pyx @@ -20,7 +20,7 @@ from obitools3.format.fastq import FastqFormat from obitools3.dms.obiseq import Nuc_Seq from obitools3.apps.config import getConfiguration,logger from obitools3.apps.temp import get_temp_dms -from obitools3.utils cimport tobytes # TODO because can't read options as bytes +from obitools3.utils cimport tobytes, count_entries # TODO tobytes because can't read options as bytes from obitools3.dms.capi.obierrno cimport obi_errno, \ OBIVIEW_ALREADY_EXISTS_ERROR @@ -159,6 +159,7 @@ Reads an URI and returns a tuple containing: (2) The opened view or iterator on the opened file or writer (3) The class of object returned or handled by (2) (4) The original URI in bytes + (5) The number of entries (if input URI) or -1 if unavailable ''' def open_uri(uri, bint input=True, @@ -209,7 +210,8 @@ def open_uri(uri, return (dms[0], dms[1], type(dms[1]), - urlunparse(urip)) + urlunparse(urip), + len(dms[0])) try: resource=open_dms_element(dms[0], dms[1], @@ -230,7 +232,8 @@ def open_uri(uri, return (resource[0], resource[1], type(resource[1]), - urlunparse(urip)) + urlunparse(urip), + len(resource[1])) except Exception as e: global obi_errno if obi_errno == OBIVIEW_ALREADY_EXISTS_ERROR: @@ -503,19 +506,19 @@ def open_uri(uri, raise NotImplementedError('Output sequence file format not implemented') else: if input: - iseq, objclass = entryIteratorFactory(file, - skip, only, - seqtype, - offset, - noquality, - skiperror, - header, - sep, - dec, - nastring, - stripwhite, - blanklineskip, - commentchar) + iseq, objclass, format = entryIteratorFactory(file, + skip, only, + seqtype, + offset, + noquality, + skiperror, + header, + sep, + dec, + nastring, + stripwhite, + blanklineskip, + commentchar) else: # default export is in fasta? or tab? TODO objclass = Nuc_Seq # Nuc_Seq_Stored? TODO iseq = FastaNucWriter(FastaFormat(printNAKeys=printna, NAString=nastring), @@ -524,7 +527,9 @@ def open_uri(uri, only=only) #tmpdms = get_temp_dms() - - return (file, iseq, objclass, urib) - - + + entry_count = -1 + if input: + entry_count = count_entries(file, format) + + return (file, iseq, objclass, urib, entry_count) diff --git a/python/obitools3/utils.pxd b/python/obitools3/utils.pxd index feb27a4..cc4e38b 100755 --- a/python/obitools3/utils.pxd +++ b/python/obitools3/utils.pxd @@ -2,6 +2,8 @@ from obitools3.dms.capi.obitypes cimport obitype_t, index_t +cpdef bytes format_separator(bytes format) +cpdef int count_entries(file, bytes format) cdef obi_errno_to_exception(int obi_errno, index_t line_nb=*, object elt_id=*, str error_message=*) diff --git a/python/obitools3/utils.pyx b/python/obitools3/utils.pyx index 1b147c1..a139e5a 100755 --- a/python/obitools3/utils.pyx +++ b/python/obitools3/utils.pyx @@ -16,6 +16,34 @@ from obitools3.dms.capi.obierrno cimport OBI_LINE_IDX_ERROR, \ #obi_errno import re +import mmap + + +cpdef bytes format_separator(bytes format): + if format == b"fasta": + return b"\n>" + elif format == b"fastq": + return b"\n@" + elif format == b"ngsfilter" or format == b"tabular": + return b"\n" + elif format == b"genbank" or format == b"embl": + return b"\n//" + elif format == b"ecopcr": + return b"\n[^#]" + else: + return None + + +cpdef int count_entries(file, bytes format): + try: + sep = format_separator(format) + if sep is None: + return -1 + sep = re.compile(sep) + mmapped_file = mmap.mmap(file.fileno(), 0, access=mmap.ACCESS_READ) + return len(re.findall(sep, mmapped_file)) + except: + return -1 # TODO RollbackException?