Cython: added full handling of NA strings when importing files

2018-10-17 16:41:15 +02:00
parent da76f911db
commit da0e3d4043
11 changed files with 43 additions and 32 deletions
--- a/python/obitools3/commands/import.pyx
+++ b/python/obitools3/commands/import.pyx
@ -83,7 +83,6 @@ def run(config):
    cdef   list        old_elements_names
    cdef   list        new_elements_names
    cdef   ProgressBar pb
-    cdef   bytes       NA_value
    global             obi_errno
        
    DMS.obi_atexit()
@ -122,9 +121,7 @@ def run(config):
    pb = ProgressBar(10000000, config, seconde=5)   # TODO should be number of records in file
    
    entries = input[1]
-    
-    NA_value = tobytes(config['obi']['inputnastring'])
-    
+        
    NUC_SEQS_view = False
    if isinstance(output[1], View) :
        view = output[1]
@ -169,11 +166,7 @@ def run(config):
                value = entry[tag]
                if tag == b"taxid":
                    tag = b"TAXID"
-                
-                # Check NA value
-                if value == NA_value :
-                    value = None
-                 
+                                 
                if tag not in dcols :
                     
                    value_type = type(value)
--- a/python/obitools3/parsers/fasta.pyx
+++ b/python/obitools3/parsers/fasta.pyx
@ -93,7 +93,8 @@ def fastaNucIterator(lineiterator,
                     int skip=0,
                     only=None,
                     firstline=None,
-                     int buffersize=100000000
+                     int buffersize=100000000,
+                     bytes nastring=b"NA"
                    ):
    
    cdef bytes      ident
@ -143,7 +144,7 @@ def fastaNucIterator(lineiterator,
                pass
            skipped += 1

-        ident,tags,definition = parseHeader(line)
+        ident,tags,definition = parseHeader(line, nastring=nastring)
        s = []
        line = next(iterator)
    
--- a/python/obitools3/parsers/fastq.pyx
+++ b/python/obitools3/parsers/fastq.pyx
@ -15,19 +15,22 @@ def fastqIterator(lineiterator,
                  int offset=-1,
                  bint noquality=False,
                  firstline=None,
-                  int buffersize=100000000
+                  int buffersize=100000000,
+                  bytes nastring=b"NA"
                 ):
    if noquality:
        return fastqWithoutQualityIterator(lineiterator,
                                           skip,only,
                                           firstline,
-                                           buffersize)
+                                           buffersize,
+                                           nastring)
    else:
        return fastqWithQualityIterator(lineiterator,
                                        skip,only,
                                        offset,
                                        firstline,
-                                        buffersize)
+                                        buffersize,
+                                        nastring)


 def fastqWithQualityIterator(lineiterator, 
@ -35,7 +38,8 @@ def fastqWithQualityIterator(lineiterator,
                  only=None,
                  int offset=-1,
                  firstline=None,
-                  int buffersize=100000000
+                  int buffersize=100000000,
+                  bytes nastring=b"NA"
                 ):
    
    cdef LineBuffer lb
@ -84,7 +88,7 @@ def fastqWithQualityIterator(lineiterator,
        if ionly >= 0 and read >= ionly:
            break
        
-        ident,tags,definition = parseHeader(hline)
+        ident,tags,definition = parseHeader(hline, nastring=nastring)
        sequence  = line[0:-1]
        next(i)
        quality   = next(i)[0:-1]
@ -106,7 +110,8 @@ def fastqWithoutQualityIterator(lineiterator,
                  int skip=0,
                  only=None,
                  firstline=None,
-                  int buffersize=100000000
+                  int buffersize=100000000,
+                  bytes nastring=b"NA"
                 ):
    cdef bytes      ident
    cdef bytes      definition
@ -154,7 +159,7 @@ def fastqWithoutQualityIterator(lineiterator,
        if ionly >= 0 and read >= ionly:
            break
       
-        ident,tags,definition = parseHeader(hline)
+        ident,tags,definition = parseHeader(hline, nastring=nastring)
        sequence  = line[0:-1]
        next(i)
        next(i)
--- a/python/obitools3/parsers/header.pxd
+++ b/python/obitools3/parsers/header.pxd
@ -1,4 +1,4 @@
 #cython: language_level=3


-cpdef tuple parseHeader(bytes header)
+cpdef tuple parseHeader(bytes header, bytes nastring=*)
--- a/python/obitools3/parsers/header.pyx
+++ b/python/obitools3/parsers/header.pyx
@ -13,7 +13,7 @@ import re
 __ret__  = re.compile(b'''(([^ ]+)=('[^']*'|"[^"]*"|[^;]+); *)+?''')


-cpdef tuple parseHeader(bytes header):
+cpdef tuple parseHeader(bytes header, bytes nastring=b"NA"):
    cdef list  m 
    cdef dict  tags 
    cdef bytes definition
@ -34,7 +34,7 @@ cpdef tuple parseHeader(bytes header):
        m = __ret__.findall(second)
        
        if m:
-            tags = dict([(a[1],__etag__(a[2])) for a in m])
+            tags = dict([(a[1],__etag__(a[2], nastring=nastring)) for a in m])
            definition = second.split(m[-1][0],1)[1].strip()
        else:
            tags = {}
--- a/python/obitools3/parsers/ngsfilter.pyx
+++ b/python/obitools3/parsers/ngsfilter.pyx
@ -16,6 +16,7 @@ def ngsfilterIterator(lineiterator,
                      bint stripwhite=True,
                      bint blanklineskip=True,
                      bytes commentchar=b"#",
+                      bytes nastring=b"NA",
                      int skip=0,
                      only=None,
                      firstline=None,
@ -69,6 +70,7 @@ def ngsfilterIterator(lineiterator,
                       stripwhite = stripwhite,
                       blanklineskip = blanklineskip,
                       commentchar = commentchar,
+                       nastring = nastring,
                       skip = skip,
                       only = only,
                       firstline = None)
--- a/python/obitools3/parsers/tab.pyx
+++ b/python/obitools3/parsers/tab.pyx
@ -17,6 +17,7 @@ def tabIterator(lineiterator,
                bint stripwhite=True,
                bint blanklineskip=True,
                bytes commentchar=b"#",
+                bytes nastring=b"NA",
                int skip=0,
                only=None,
                firstline=None,
@ -89,11 +90,10 @@ def tabIterator(lineiterator,
            data = [x.strip() for x in data]
        
        for i in range(len(data)):
-            if key_types:
-                type_func = key_types[i]
+            if key_types:  # TODO handle None when key types are actually read
+                view_line[keys[i]] = key_types[i](data[i])
            else:
-                type_func = __etag__
-            view_line[keys[i]] = type_func(data[i])
+                view_line[keys[i]] = __etag__(data[i], nastring=nastring)
        
        yield view_line
        
--- a/python/obitools3/parsers/universal.pyx
+++ b/python/obitools3/parsers/universal.pyx
@ -81,7 +81,8 @@ def entryIteratorFactory(lineiterator,
            return (fastaNucIterator(lineiterator,
                                    skip=skip,only=only,
                                    firstline=first,
-                                    buffersize=buffersize),
+                                    buffersize=buffersize,
+                                    nastring=nastring),
                    Nuc_Seq)
        else:
            raise NotImplementedError()
@ -91,7 +92,8 @@ def entryIteratorFactory(lineiterator,
                                 offset=offset,
                                 noquality=noquality,
                                 firstline=first,
-                                 buffersize=buffersize),
+                                 buffersize=buffersize,
+                                 nastring=nastring),
                    Nuc_Seq)
    elif format==b'tabular':
            return (tabIterator(lineiterator,
@ -101,6 +103,7 @@ def entryIteratorFactory(lineiterator,
                                stripwhite = stripwhite,
                                blanklineskip = blanklineskip,
                                commentchar = commentchar,
+                                nastring=nastring,
                                skip = skip,
                                only = only,
                                firstline=first,
@ -113,6 +116,7 @@ def entryIteratorFactory(lineiterator,
                                      stripwhite = stripwhite,
                                      blanklineskip = blanklineskip,
                                      commentchar = commentchar,
+                                      nastring=nastring,
                                      skip = skip,
                                      only = only,
                                      firstline=first,
--- a/python/obitools3/uri/decode.pyx
+++ b/python/obitools3/uri/decode.pyx
@ -425,7 +425,8 @@ def open_uri(uri,
                    if input:
                        iseq = fastaNucIterator(file, 
                                                skip=skip, 
-                                                only=only)
+                                                only=only,
+                                                nastring=nastring)
                    else:
                        iseq = FastaNucWriter(FastaFormat(printNAKeys=printna, NAString=nastring), 
                                              file,
@ -437,7 +438,8 @@ def open_uri(uri,
                                             skip=skip, 
                                             only=only,
                                             offset=offset,
-                                             noquality=noquality)
+                                             noquality=noquality,
+                                             nastring=nastring)
                    else:
                        iseq = FastqWriter(FastqFormat(printNAKeys=printna, NAString=nastring), 
                                           file,
@ -464,6 +466,7 @@ def open_uri(uri,
                                       stripwhite = stripwhite,
                                       blanklineskip = blanklineskip,
                                       commentchar = commentchar,
+                                       nastring=nastring,
                                       skip = skip,
                                       only = only)
                else:
@ -477,6 +480,7 @@ def open_uri(uri,
                                             stripwhite = stripwhite,
                                             blanklineskip = blanklineskip,
                                             commentchar = commentchar,
+                                             nastring=nastring,
                                             skip = skip,
                                             only = only)
                else:
--- a/python/obitools3/utils.pxd
+++ b/python/obitools3/utils.pxd
@ -18,4 +18,4 @@ cdef obitype_t update_obitype(obitype_t obitype, object new_value)
 cdef obitype_t get_obitype_iterable_value(object value)
 cdef obitype_t get_obitype(object value)

-cdef object __etag__(bytes x)
+cdef object __etag__(bytes x, bytes nastring=*)
--- a/python/obitools3/utils.pyx
+++ b/python/obitools3/utils.pyx
@ -247,11 +247,13 @@ __re_dict__     = re.compile(b"""^\{\ *

 __re_val__ = re.compile(b"""(("[^"]*"|'[^']*') *: *([^,}]+|"[^"]*"|'[^']*') *[,}] *)""")

-cdef object __etag__(bytes x):
+cdef object __etag__(bytes x, bytes nastring=b"NA"):
    cdef list elements
    cdef tuple i
    
-    if __re_int__.match(x):
+    if x == nastring:
+        v = None
+    elif __re_int__.match(x):
        v=int(x)
    elif __re_float__.match(x):
        v=float(x)