From 7dcbc3401713d741c5ce5529e7f2749dbb40d922 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Thu, 30 Jul 2020 16:56:36 +0200 Subject: [PATCH] import: fixed entry count estimation when importing fastq files --- python/obitools3/utils.pxd | 2 +- python/obitools3/utils.pyx | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/obitools3/utils.pxd b/python/obitools3/utils.pxd index 53d3765..eeb1266 100755 --- a/python/obitools3/utils.pxd +++ b/python/obitools3/utils.pxd @@ -2,7 +2,7 @@ from obitools3.dms.capi.obitypes cimport obitype_t, index_t -cpdef bytes format_separator(bytes format) +cpdef bytes format_uniq_pattern(bytes format) cpdef int count_entries(file, bytes format) cdef obi_errno_to_exception(index_t line_nb=*, object elt_id=*, str error_message=*) diff --git a/python/obitools3/utils.pyx b/python/obitools3/utils.pyx index bde2c80..3959aa0 100755 --- a/python/obitools3/utils.pyx +++ b/python/obitools3/utils.pyx @@ -24,11 +24,11 @@ import glob import gzip -cpdef bytes format_separator(bytes format): +cpdef bytes format_uniq_pattern(bytes format): if format == b"fasta": return b"\n>" elif format == b"fastq": - return b"\n@" + return b"\n\+\n" elif format == b"ngsfilter" or format == b"tabular": return b"\n" elif format == b"genbank" or format == b"embl": @@ -42,7 +42,7 @@ cpdef bytes format_separator(bytes format): cpdef int count_entries(file, bytes format): try: - sep = format_separator(format) + sep = format_uniq_pattern(format) if sep is None: return -1 sep = re.compile(sep) @@ -72,7 +72,7 @@ cpdef int count_entries(file, bytes format): return -1 mmapped_file = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) total_count += len(re.findall(sep, mmapped_file)) - if format != b"ngsfilter" and format != b"tabular" and format != b"embl" and format != b"genbank": + if format != b"ngsfilter" and format != b"tabular" and format != b"embl" and format != b"genbank" and format != b"fastq": total_count += 1 # adding +1 for 1st entry because separators include \n (ngsfilter and tabular already count one more because of last \n) except: