From da0e3d4043673af1cd232c939fdc86670bbe93a2 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Wed, 17 Oct 2018 16:41:15 +0200 Subject: [PATCH] Cython: added full handling of NA strings when importing files --- python/obitools3/commands/import.pyx | 11 ++--------- python/obitools3/parsers/fasta.pyx | 5 +++-- python/obitools3/parsers/fastq.pyx | 19 ++++++++++++------- python/obitools3/parsers/header.pxd | 2 +- python/obitools3/parsers/header.pyx | 4 ++-- python/obitools3/parsers/ngsfilter.pyx | 2 ++ python/obitools3/parsers/tab.pyx | 8 ++++---- python/obitools3/parsers/universal.pyx | 8 ++++++-- python/obitools3/uri/decode.pyx | 8 ++++++-- python/obitools3/utils.pxd | 2 +- python/obitools3/utils.pyx | 6 ++++-- 11 files changed, 43 insertions(+), 32 deletions(-) diff --git a/python/obitools3/commands/import.pyx b/python/obitools3/commands/import.pyx index 4c7d8b5..04dd8df 100644 --- a/python/obitools3/commands/import.pyx +++ b/python/obitools3/commands/import.pyx @@ -83,7 +83,6 @@ def run(config): cdef list old_elements_names cdef list new_elements_names cdef ProgressBar pb - cdef bytes NA_value global obi_errno DMS.obi_atexit() @@ -122,9 +121,7 @@ def run(config): pb = ProgressBar(10000000, config, seconde=5) # TODO should be number of records in file entries = input[1] - - NA_value = tobytes(config['obi']['inputnastring']) - + NUC_SEQS_view = False if isinstance(output[1], View) : view = output[1] @@ -169,11 +166,7 @@ def run(config): value = entry[tag] if tag == b"taxid": tag = b"TAXID" - - # Check NA value - if value == NA_value : - value = None - + if tag not in dcols : value_type = type(value) diff --git a/python/obitools3/parsers/fasta.pyx b/python/obitools3/parsers/fasta.pyx index e052851..b01dc4b 100644 --- a/python/obitools3/parsers/fasta.pyx +++ b/python/obitools3/parsers/fasta.pyx @@ -93,7 +93,8 @@ def fastaNucIterator(lineiterator, int skip=0, only=None, firstline=None, - int buffersize=100000000 + int buffersize=100000000, + bytes nastring=b"NA" ): cdef bytes ident @@ -143,7 +144,7 @@ def fastaNucIterator(lineiterator, pass skipped += 1 - ident,tags,definition = parseHeader(line) + ident,tags,definition = parseHeader(line, nastring=nastring) s = [] line = next(iterator) diff --git a/python/obitools3/parsers/fastq.pyx b/python/obitools3/parsers/fastq.pyx index c92f231..df0a278 100644 --- a/python/obitools3/parsers/fastq.pyx +++ b/python/obitools3/parsers/fastq.pyx @@ -15,19 +15,22 @@ def fastqIterator(lineiterator, int offset=-1, bint noquality=False, firstline=None, - int buffersize=100000000 + int buffersize=100000000, + bytes nastring=b"NA" ): if noquality: return fastqWithoutQualityIterator(lineiterator, skip,only, firstline, - buffersize) + buffersize, + nastring) else: return fastqWithQualityIterator(lineiterator, skip,only, offset, firstline, - buffersize) + buffersize, + nastring) def fastqWithQualityIterator(lineiterator, @@ -35,7 +38,8 @@ def fastqWithQualityIterator(lineiterator, only=None, int offset=-1, firstline=None, - int buffersize=100000000 + int buffersize=100000000, + bytes nastring=b"NA" ): cdef LineBuffer lb @@ -84,7 +88,7 @@ def fastqWithQualityIterator(lineiterator, if ionly >= 0 and read >= ionly: break - ident,tags,definition = parseHeader(hline) + ident,tags,definition = parseHeader(hline, nastring=nastring) sequence = line[0:-1] next(i) quality = next(i)[0:-1] @@ -106,7 +110,8 @@ def fastqWithoutQualityIterator(lineiterator, int skip=0, only=None, firstline=None, - int buffersize=100000000 + int buffersize=100000000, + bytes nastring=b"NA" ): cdef bytes ident cdef bytes definition @@ -154,7 +159,7 @@ def fastqWithoutQualityIterator(lineiterator, if ionly >= 0 and read >= ionly: break - ident,tags,definition = parseHeader(hline) + ident,tags,definition = parseHeader(hline, nastring=nastring) sequence = line[0:-1] next(i) next(i) diff --git a/python/obitools3/parsers/header.pxd b/python/obitools3/parsers/header.pxd index 423c4d9..9810f6a 100644 --- a/python/obitools3/parsers/header.pxd +++ b/python/obitools3/parsers/header.pxd @@ -1,4 +1,4 @@ #cython: language_level=3 -cpdef tuple parseHeader(bytes header) +cpdef tuple parseHeader(bytes header, bytes nastring=*) diff --git a/python/obitools3/parsers/header.pyx b/python/obitools3/parsers/header.pyx index cb0261a..ea0fa17 100644 --- a/python/obitools3/parsers/header.pyx +++ b/python/obitools3/parsers/header.pyx @@ -13,7 +13,7 @@ import re __ret__ = re.compile(b'''(([^ ]+)=('[^']*'|"[^"]*"|[^;]+); *)+?''') -cpdef tuple parseHeader(bytes header): +cpdef tuple parseHeader(bytes header, bytes nastring=b"NA"): cdef list m cdef dict tags cdef bytes definition @@ -34,7 +34,7 @@ cpdef tuple parseHeader(bytes header): m = __ret__.findall(second) if m: - tags = dict([(a[1],__etag__(a[2])) for a in m]) + tags = dict([(a[1],__etag__(a[2], nastring=nastring)) for a in m]) definition = second.split(m[-1][0],1)[1].strip() else: tags = {} diff --git a/python/obitools3/parsers/ngsfilter.pyx b/python/obitools3/parsers/ngsfilter.pyx index ce841b1..1212d36 100644 --- a/python/obitools3/parsers/ngsfilter.pyx +++ b/python/obitools3/parsers/ngsfilter.pyx @@ -16,6 +16,7 @@ def ngsfilterIterator(lineiterator, bint stripwhite=True, bint blanklineskip=True, bytes commentchar=b"#", + bytes nastring=b"NA", int skip=0, only=None, firstline=None, @@ -69,6 +70,7 @@ def ngsfilterIterator(lineiterator, stripwhite = stripwhite, blanklineskip = blanklineskip, commentchar = commentchar, + nastring = nastring, skip = skip, only = only, firstline = None) diff --git a/python/obitools3/parsers/tab.pyx b/python/obitools3/parsers/tab.pyx index a2fcb7d..8d731f9 100644 --- a/python/obitools3/parsers/tab.pyx +++ b/python/obitools3/parsers/tab.pyx @@ -17,6 +17,7 @@ def tabIterator(lineiterator, bint stripwhite=True, bint blanklineskip=True, bytes commentchar=b"#", + bytes nastring=b"NA", int skip=0, only=None, firstline=None, @@ -89,11 +90,10 @@ def tabIterator(lineiterator, data = [x.strip() for x in data] for i in range(len(data)): - if key_types: - type_func = key_types[i] + if key_types: # TODO handle None when key types are actually read + view_line[keys[i]] = key_types[i](data[i]) else: - type_func = __etag__ - view_line[keys[i]] = type_func(data[i]) + view_line[keys[i]] = __etag__(data[i], nastring=nastring) yield view_line diff --git a/python/obitools3/parsers/universal.pyx b/python/obitools3/parsers/universal.pyx index 8e90dd7..942e661 100644 --- a/python/obitools3/parsers/universal.pyx +++ b/python/obitools3/parsers/universal.pyx @@ -81,7 +81,8 @@ def entryIteratorFactory(lineiterator, return (fastaNucIterator(lineiterator, skip=skip,only=only, firstline=first, - buffersize=buffersize), + buffersize=buffersize, + nastring=nastring), Nuc_Seq) else: raise NotImplementedError() @@ -91,7 +92,8 @@ def entryIteratorFactory(lineiterator, offset=offset, noquality=noquality, firstline=first, - buffersize=buffersize), + buffersize=buffersize, + nastring=nastring), Nuc_Seq) elif format==b'tabular': return (tabIterator(lineiterator, @@ -101,6 +103,7 @@ def entryIteratorFactory(lineiterator, stripwhite = stripwhite, blanklineskip = blanklineskip, commentchar = commentchar, + nastring=nastring, skip = skip, only = only, firstline=first, @@ -113,6 +116,7 @@ def entryIteratorFactory(lineiterator, stripwhite = stripwhite, blanklineskip = blanklineskip, commentchar = commentchar, + nastring=nastring, skip = skip, only = only, firstline=first, diff --git a/python/obitools3/uri/decode.pyx b/python/obitools3/uri/decode.pyx index c23adf7..863f040 100644 --- a/python/obitools3/uri/decode.pyx +++ b/python/obitools3/uri/decode.pyx @@ -425,7 +425,8 @@ def open_uri(uri, if input: iseq = fastaNucIterator(file, skip=skip, - only=only) + only=only, + nastring=nastring) else: iseq = FastaNucWriter(FastaFormat(printNAKeys=printna, NAString=nastring), file, @@ -437,7 +438,8 @@ def open_uri(uri, skip=skip, only=only, offset=offset, - noquality=noquality) + noquality=noquality, + nastring=nastring) else: iseq = FastqWriter(FastqFormat(printNAKeys=printna, NAString=nastring), file, @@ -464,6 +466,7 @@ def open_uri(uri, stripwhite = stripwhite, blanklineskip = blanklineskip, commentchar = commentchar, + nastring=nastring, skip = skip, only = only) else: @@ -477,6 +480,7 @@ def open_uri(uri, stripwhite = stripwhite, blanklineskip = blanklineskip, commentchar = commentchar, + nastring=nastring, skip = skip, only = only) else: diff --git a/python/obitools3/utils.pxd b/python/obitools3/utils.pxd index 5360f95..cec7894 100644 --- a/python/obitools3/utils.pxd +++ b/python/obitools3/utils.pxd @@ -18,4 +18,4 @@ cdef obitype_t update_obitype(obitype_t obitype, object new_value) cdef obitype_t get_obitype_iterable_value(object value) cdef obitype_t get_obitype(object value) -cdef object __etag__(bytes x) +cdef object __etag__(bytes x, bytes nastring=*) diff --git a/python/obitools3/utils.pyx b/python/obitools3/utils.pyx index 319501b..baefcaf 100644 --- a/python/obitools3/utils.pyx +++ b/python/obitools3/utils.pyx @@ -247,11 +247,13 @@ __re_dict__ = re.compile(b"""^\{\ * __re_val__ = re.compile(b"""(("[^"]*"|'[^']*') *: *([^,}]+|"[^"]*"|'[^']*') *[,}] *)""") -cdef object __etag__(bytes x): +cdef object __etag__(bytes x, bytes nastring=b"NA"): cdef list elements cdef tuple i - if __re_int__.match(x): + if x == nastring: + v = None + elif __re_int__.match(x): v=int(x) elif __re_float__.match(x): v=float(x)