import: fixed entry count estimation when importing fastq files

This commit is contained in:
Celine Mercier
2020-07-30 16:56:36 +02:00
parent db2202c8b4
commit 7dcbc34017
2 changed files with 5 additions and 5 deletions

View File

@ -2,7 +2,7 @@
from obitools3.dms.capi.obitypes cimport obitype_t, index_t from obitools3.dms.capi.obitypes cimport obitype_t, index_t
cpdef bytes format_separator(bytes format) cpdef bytes format_uniq_pattern(bytes format)
cpdef int count_entries(file, bytes format) cpdef int count_entries(file, bytes format)
cdef obi_errno_to_exception(index_t line_nb=*, object elt_id=*, str error_message=*) cdef obi_errno_to_exception(index_t line_nb=*, object elt_id=*, str error_message=*)

View File

@ -24,11 +24,11 @@ import glob
import gzip import gzip
cpdef bytes format_separator(bytes format): cpdef bytes format_uniq_pattern(bytes format):
if format == b"fasta": if format == b"fasta":
return b"\n>" return b"\n>"
elif format == b"fastq": elif format == b"fastq":
return b"\n@" return b"\n\+\n"
elif format == b"ngsfilter" or format == b"tabular": elif format == b"ngsfilter" or format == b"tabular":
return b"\n" return b"\n"
elif format == b"genbank" or format == b"embl": elif format == b"genbank" or format == b"embl":
@ -42,7 +42,7 @@ cpdef bytes format_separator(bytes format):
cpdef int count_entries(file, bytes format): cpdef int count_entries(file, bytes format):
try: try:
sep = format_separator(format) sep = format_uniq_pattern(format)
if sep is None: if sep is None:
return -1 return -1
sep = re.compile(sep) sep = re.compile(sep)
@ -72,7 +72,7 @@ cpdef int count_entries(file, bytes format):
return -1 return -1
mmapped_file = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) mmapped_file = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
total_count += len(re.findall(sep, mmapped_file)) total_count += len(re.findall(sep, mmapped_file))
if format != b"ngsfilter" and format != b"tabular" and format != b"embl" and format != b"genbank": if format != b"ngsfilter" and format != b"tabular" and format != b"embl" and format != b"genbank" and format != b"fastq":
total_count += 1 # adding +1 for 1st entry because separators include \n (ngsfilter and tabular already count one more because of last \n) total_count += 1 # adding +1 for 1st entry because separators include \n (ngsfilter and tabular already count one more because of last \n)
except: except: