import: fixed entry count estimation when importing fastq files
This commit is contained in:
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
from obitools3.dms.capi.obitypes cimport obitype_t, index_t
|
from obitools3.dms.capi.obitypes cimport obitype_t, index_t
|
||||||
|
|
||||||
cpdef bytes format_separator(bytes format)
|
cpdef bytes format_uniq_pattern(bytes format)
|
||||||
cpdef int count_entries(file, bytes format)
|
cpdef int count_entries(file, bytes format)
|
||||||
|
|
||||||
cdef obi_errno_to_exception(index_t line_nb=*, object elt_id=*, str error_message=*)
|
cdef obi_errno_to_exception(index_t line_nb=*, object elt_id=*, str error_message=*)
|
||||||
|
@ -24,11 +24,11 @@ import glob
|
|||||||
import gzip
|
import gzip
|
||||||
|
|
||||||
|
|
||||||
cpdef bytes format_separator(bytes format):
|
cpdef bytes format_uniq_pattern(bytes format):
|
||||||
if format == b"fasta":
|
if format == b"fasta":
|
||||||
return b"\n>"
|
return b"\n>"
|
||||||
elif format == b"fastq":
|
elif format == b"fastq":
|
||||||
return b"\n@"
|
return b"\n\+\n"
|
||||||
elif format == b"ngsfilter" or format == b"tabular":
|
elif format == b"ngsfilter" or format == b"tabular":
|
||||||
return b"\n"
|
return b"\n"
|
||||||
elif format == b"genbank" or format == b"embl":
|
elif format == b"genbank" or format == b"embl":
|
||||||
@ -42,7 +42,7 @@ cpdef bytes format_separator(bytes format):
|
|||||||
cpdef int count_entries(file, bytes format):
|
cpdef int count_entries(file, bytes format):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
sep = format_separator(format)
|
sep = format_uniq_pattern(format)
|
||||||
if sep is None:
|
if sep is None:
|
||||||
return -1
|
return -1
|
||||||
sep = re.compile(sep)
|
sep = re.compile(sep)
|
||||||
@ -72,7 +72,7 @@ cpdef int count_entries(file, bytes format):
|
|||||||
return -1
|
return -1
|
||||||
mmapped_file = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
|
mmapped_file = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
|
||||||
total_count += len(re.findall(sep, mmapped_file))
|
total_count += len(re.findall(sep, mmapped_file))
|
||||||
if format != b"ngsfilter" and format != b"tabular" and format != b"embl" and format != b"genbank":
|
if format != b"ngsfilter" and format != b"tabular" and format != b"embl" and format != b"genbank" and format != b"fastq":
|
||||||
total_count += 1 # adding +1 for 1st entry because separators include \n (ngsfilter and tabular already count one more because of last \n)
|
total_count += 1 # adding +1 for 1st entry because separators include \n (ngsfilter and tabular already count one more because of last \n)
|
||||||
|
|
||||||
except:
|
except:
|
||||||
|
Reference in New Issue
Block a user