import: fixed entry count estimation for tabular files with a header line

2021-03-30 09:07:14 +13:00
parent 6026129ca8
commit 847c9c816d
3 changed files with 6 additions and 7 deletions
--- a/python/obitools3/uri/decode.pyx
+++ b/python/obitools3/uri/decode.pyx
@@ -389,10 +389,7 @@ def open_uri(uri,
            sep = tobytes(qualifiers[b"sep"][0][0])
        else:
            try:
-                sep = config["obi"]["sep"]
+                sep = tobytes(config["obi"]["sep"])
                if sep == '\\t':   # dirty workaround for flake8(?) issue that reads '\t' as '\'+'t' when parsing the option value
                    sep = '\t'
                sep = tobytes(sep)
            except KeyError:
                sep=None
@@ -568,6 +565,6 @@ def open_uri(uri,
        entry_count = -1
        if input:
-            entry_count = count_entries(file, format)
+            entry_count = count_entries(file, format, header)
        return (file, iseq, objclass, urib, entry_count)
--- a/python/obitools3/utils.pxd
+++ b/python/obitools3/utils.pxd
@@ -3,7 +3,7 @@
 from obitools3.dms.capi.obitypes cimport obitype_t, index_t
 cpdef bytes format_uniq_pattern(bytes format)
-cpdef int count_entries(file, bytes format)
+cpdef int count_entries(file, bytes format, bint header)
 cdef obi_errno_to_exception(index_t line_nb=*, object elt_id=*, str error_message=*)
--- a/python/obitools3/utils.pyx
+++ b/python/obitools3/utils.pyx
@@ -40,7 +40,7 @@ cpdef bytes format_uniq_pattern(bytes format):
        return None
-cpdef int count_entries(file, bytes format):
+cpdef int count_entries(file, bytes format, bint header):
    try:
        sep = format_uniq_pattern(format)
@@ -75,6 +75,8 @@ cpdef int count_entries(file, bytes format):
            total_count += len(re.findall(sep, mmapped_file))
            if format != b"ngsfilter" and format != b"tabular" and format != b"embl" and format != b"genbank" and format != b"fastq":
                total_count += 1 # adding +1 for 1st entry because separators include \n (ngsfilter and tabular already count one more because of last \n)
            if format == b"tabular" and header: # not counting header as an entry
                total_count -= 1
    except:
        if len(files) > 1: