Cython: updated the URI decoding to handle outputs other than DMS

2018-10-17 11:21:29 +02:00
parent 58589e04be
commit 135d3b6e67
1 changed files with 143 additions and 72 deletions
--- a/python/obitools3/uri/decode.pyx
+++ b/python/obitools3/uri/decode.pyx
@ -12,6 +12,11 @@ from obitools3.parsers.ngsfilter import ngsfilterIterator
 from obitools3.parsers.embl import emblIterator
 from obitools3.parsers.universal import entryIteratorFactory

+from obitools3.writers.fasta import FastaNucWriter
+from obitools3.writers.fastq import FastqWriter
+from obitools3.format.fasta import FastaFormat
+from obitools3.format.fastq import FastqFormat
+
 from obitools3.dms.obiseq import Nuc_Seq
 from obitools3.apps.config import getConfiguration,logger
 from obitools3.apps.temp import get_temp_dms
@ -56,7 +61,9 @@ cdef open_dms(bytes path, bint create=False):
        pos=pos+1
    return None

-def open_dms_element(DMS dms, bytes path, 
+
+def open_dms_element(DMS dms, 
+                     bytes path, 
                     bint create=False,
                     type newviewtype=View):
    """
@ -139,12 +146,21 @@ def open_dms_element(DMS dms, bytes path,
    if len(path_parts) > 4:
            raise MalformedURIException('Malformed View URI')
    
-    return (dms,subsubpart)
+    return (dms, subsubpart)
                         

+'''  
+#TODO discuss returned object. Return a dict? or some class instance?
+Reads a URI and returns a tuple containing:
+    (1) The opened file or DMS, or the URI itself if nothing could be opened by the function
+    (2) The opened view, or an iterator on the opened file, or a writer to the opened file
+    (3) The class of object returned or handled by (2)
+    (4) The original URI in bytes
+'''
 def open_uri(uri,
             bint input=True,
             type newviewtype=View):
+    
    cdef bytes urib = tobytes(uri)
    cdef bytes scheme
    cdef tuple dms
@ -153,7 +169,7 @@ def open_uri(uri,
    
    config = getConfiguration()
    urip = urlparse(urib)
-        
+            
    if 'obi' not in config:
        config['obi']={}
    
@ -166,22 +182,26 @@ def open_uri(uri,
        create=(not input) and (not config["obi"]["nocreatedms"])
    except KeyError:
        create=not input
-        
+    
    scheme = urip.scheme
    
    error = None
    
-    if scheme==b"" or scheme==b"dms" :
-        dms = open_dms(urip.path,create)
+    if scheme==b"dms" or \
+        (scheme==b"" and \
+         (((not input) and "outputformat" not in config["obi"]) or \
+         (input and "inputformat" not in config["obi"]))):   # TODO maybe not best way
+        
+        dms = open_dms(urip.path, create)
        if dms is None and default_dms is not None:
            dms=(default_dms, urip.path)

        if dms is not None:
            try:
-                resource=open_dms_element(dms[0],dms[1],
+                resource=open_dms_element(dms[0],
+                                          dms[1],
                                          create,
-                                          newviewtype
-                                          )
+                                          newviewtype)
                                
                scheme=b"dms"
                urip = ParseResultBytes(scheme=b"dms", 
@ -200,32 +220,42 @@ def open_uri(uri,
                        urlunparse(urip))
            except Exception as e:
                error=e
-                
+                   
    if scheme==b"dms" :
        logger('Error','cannot open DMS: %s', uri)
        raise FileNotFoundError('uri')
-           
-    #if not urip.scheme:      # TODO not sure what it was supposed to do but not working as intended
-    #    urib=b"file:"+urib
+
+    if not urip.scheme:
+        urib=b"file:"+urib
    
-    try:
-        file = uopen(urib)
-        logger('info','Opened file: %s', tostr(urib))
-    except Exception as e:  # TODO discuss: if can't open file, return the character string itself
-        file = urib
-        iseq = urib
-        objclass = bytes
+    if input:
+        try:
+            file = uopen(urip.path, mode='rb')
+            logger('info','Opened file: %s', urip.path)
+        except Exception as e:  # TODO discuss: if can't open file, return the character string itself
+            file = tobytes(uri)
+            iseq = urib
+            objclass = bytes
+    else:  # TODO update uopen to be able to write? 
+        file = open(urip.path, 'wb')
        
    if file is not None:
        qualifiers=parse_qs(urip.query)
        
-        if b'format' in qualifiers:
+        if input and b'format' in qualifiers:
            format = qualifiers[b'format'][0]
-        else:
-            try:
-                format=config["obi"]["format"]
-            except KeyError:
-                format=None
+        else:   # TODO discuss priorities
+            if urip.scheme:
+                format = urip.scheme
+            else:
+                try:
+                    if input:
+                        formatkey = "inputformat"
+                    else:
+                        formatkey = "outputformat"
+                    format=config["obi"][formatkey]
+                except KeyError:
+                    format=None
        
        if b'seqtype' in qualifiers:
            seqtype=qualifiers[b'seqtype'][0]
@ -248,7 +278,6 @@ def open_uri(uri,
        if skip < 0:    
            raise MalformedURIException('Malformed skip argument in URI')
        
-        
        if b'only' in qualifiers:
            only=int(qualifiers[b"only"][0])
        else:
@ -259,7 +288,6 @@ def open_uri(uri,
        if only is not None and only <= 0:    
            raise MalformedURIException('Malformed only argument in URI')
        
-            
        if b"skiperror" in qualifiers:
            try:
                skiperror=eval(qualifiers[b"skiperror"][0])
@ -332,12 +360,27 @@ def open_uri(uri,
                dec=tobytes(config["obi"]["dec"])
            except KeyError:
                dec=b"."
+
+        if b"printna" in qualifiers:
+            try:
+                printna=eval(qualifiers[b"printna"][0])
+            except Exception as e:
+                raise MalformedURIException("Malformed 'print NA' argument in URI")
+        else:
+            try:
+                printna=config["obi"]["printna"]
+            except KeyError:
+                printna=False
        
        if b"nastring" in qualifiers:
            nastring=tobytes(qualifiers[b"nastring"][0])
        else:
            try:
-                nastring=tobytes(config["obi"]["nastring"])
+                if input:
+                    nakey = "inputnastring"
+                else:
+                    nakey = "outputnastring"
+                nastring=tobytes(config["obi"][nakey])
            except KeyError:
                nastring=b'NA'
                
@ -377,63 +420,91 @@ def open_uri(uri,

        if format is not None:
            if seqtype==b"nuc":
-                objclass = Nuc_Seq
+                objclass = Nuc_Seq    # Nuc_Seq_Stored? TODO
                if format==b"fasta":
-                    iseq = fastaNucIterator(file, 
+                    if input:
+                        iseq = fastaNucIterator(file, 
+                                                skip=skip, 
+                                                only=only)
+                    else:
+                        iseq = FastaNucWriter(FastaFormat(printNAKeys=printna, NAString=nastring), 
+                                              file,
+                                              skip=skip,
+                                              only=only)
+                elif format==b"fastq":
+                    if input:
+                        iseq = fastqIterator(file,
+                                             skip=skip, 
+                                             only=only,
+                                             offset=offset,
+                                             noquality=noquality)
+                    else:
+                        iseq = FastqWriter(FastqFormat(printNAKeys=printna, NAString=nastring), 
+                                           file,
+                                           skip=skip,
+                                           only=only)
+                elif format==b"embl":
+                    if input:
+                        iseq = emblIterator(file, 
                                            skip=skip, 
                                            only=only)
-                elif format==b"fastq":
-                    iseq = fastqIterator(file,
-                                         skip=skip, 
-                                         only=only,
-                                         offset=offset,
-                                         noquality=noquality)
-                elif format==b"embl":
-                    iseq = emblIterator(file, 
-                                        skip=skip, 
-                                        only=only)
+                    else:
+                        raise NotImplementedError('Output sequence file format not implemented')
                else:
                    raise NotImplementedError('Sequence file format not implemented')
            elif seqtype==b"prot":
                raise NotImplementedError()
            elif format==b"tabular":
                objclass = dict
-                iseq = tabIterator(file,
-                                   header = header,
-                                   sep = sep,
-                                   dec = dec,
-                                   stripwhite = stripwhite,
-                                   blanklineskip = blanklineskip,
-                                   commentchar = commentchar,
-                                   skip = skip,
-                                   only = only)
+                if input:
+                    iseq = tabIterator(file,
+                                       header = header,
+                                       sep = sep,
+                                       dec = dec,
+                                       stripwhite = stripwhite,
+                                       blanklineskip = blanklineskip,
+                                       commentchar = commentchar,
+                                       skip = skip,
+                                       only = only)
+                else:
+                    raise NotImplementedError('Output sequence file format not implemented')
            elif format==b"ngsfilter":
                objclass = dict
-                iseq = ngsfilterIterator(file,
-                                         sep = sep,
-                                         dec = dec,
-                                         stripwhite = stripwhite,
-                                         blanklineskip = blanklineskip,
-                                         commentchar = commentchar,
-                                         skip = skip,
-                                         only = only)
+                if input:
+                    iseq = ngsfilterIterator(file,
+                                             sep = sep,
+                                             dec = dec,
+                                             stripwhite = stripwhite,
+                                             blanklineskip = blanklineskip,
+                                             commentchar = commentchar,
+                                             skip = skip,
+                                             only = only)
+                else:
+                    raise NotImplementedError('Output sequence file format not implemented')
        else:
-            iseq, objclass = entryIteratorFactory(file,
-                                                  skip, only,
-                                                  seqtype,
-                                                  offset,
-                                                  noquality,
-                                                  skiperror,
-                                                  header,
-                                                  sep,
-                                                  dec,
-                                                  nastring,
-                                                  stripwhite,
-                                                  blanklineskip,
-                                                  commentchar)
+            if input:
+                iseq, objclass = entryIteratorFactory(file,
+                                                      skip, only,
+                                                      seqtype,
+                                                      offset,
+                                                      noquality,
+                                                      skiperror,
+                                                      header,
+                                                      sep,
+                                                      dec,
+                                                      nastring,
+                                                      stripwhite,
+                                                      blanklineskip,
+                                                      commentchar)
+            else:    # default export is in fasta? or tab? TODO
+                objclass = Nuc_Seq   # Nuc_Seq_Stored? TODO
+                iseq = FastaNucWriter(FastaFormat(printNAKeys=printna, NAString=nastring), 
+                                      file,
+                                      skip=skip,
+                                      only=only)
        
        #tmpdms = get_temp_dms()
-        
+                
        return (file, iseq, objclass, urib)