Patch decoding of URL

2017-07-28 12:41:28 +02:00
parent 84bb93096f
commit b9c65a871f
9 changed files with 276 additions and 260 deletions
--- a/python/obi.py
+++ b/python/obi.py
@ -22,6 +22,7 @@ default_config = { 'software'       : "The OBITools",
                   'loglevel'       : 'INFO',
                   'progress'       : True,
                   'inputURI'       : None,
+                   'outputURI'      : None,
                   'defaultdms'     : None,
                   'inputview'      : None,
                   'outputview'     : None,
--- a/python/obitools3/apps/config.pyx
+++ b/python/obitools3/apps/config.pyx
@ -101,3 +101,14 @@ cpdef dict getConfiguration(str root_config_name="__default__",
    config['__done__']=True
            
    return config
+
+def logger(level, *messages):
+    """Emit *messages* through the configured logger, falling back to stderr.
+
+    The root configuration name is read from ``config["__root_config__"]`` in
+    the dict returned by ``getConfiguration()``.  The method named *level* of
+    ``config[root]['logger']`` is then called with *messages*, but only when
+    ``config[root]['verbose']`` is true.
+
+    If any step fails with an ordinary exception, the messages are printed
+    unformatted on ``sys.stderr`` instead, so a log call never raises.
+
+    :param level: name of the logger method to call (e.g. ``'info'``)
+    :param messages: positional arguments forwarded unchanged to that method
+    """
+    # Local import so the stderr fallback works even if the module does not
+    # import sys at top level.
+    import sys
+
+    try:
+        config=getConfiguration()
+        root = config["__root_config__"]
+        log = config[root]['logger']
+        if config[root]['verbose']:
+            getattr(log, level)(*messages)
+    except Exception:
+        # Best-effort fallback.  ``Exception`` (not a bare ``except``) lets
+        # KeyboardInterrupt and SystemExit propagate instead of being
+        # swallowed by a logging call.
+        print(*messages,file=sys.stderr)
+
--- a/python/obitools3/apps/logging.pyx
+++ b/python/obitools3/apps/logging.pyx
@ -42,5 +42,7 @@ cpdef getLogger(dict config):
    rootlogger.setLevel(loglevel)
    
    config[root]['logger']=rootlogger
+    config[root]['verbose']=True
        
    return rootlogger
+
--- a/python/obitools3/apps/optiongroups/init.py
+++ b/python/obitools3/apps/optiongroups/init.py
@ -2,8 +2,8 @@ def __addInputOption(optionManager):
    
    optionManager.add_argument(
                    dest='obi:inputURI',  
-                    metavar='index', 
-                    help='index root filename (produced by the oa index command)')
+                    metavar='INPUT', 
+                    help='Data source URI')
    

    group = optionManager.add_argument_group("Restriction to a sub-part options",
@ -23,6 +23,11 @@ def __addInputOption(optionManager):
                     type=int,
                     help="treat only N sequences")

+    group.add_argument('--na-string',
+                     action="store", dest="obi:nastring",
+                     default=b"NA",
+                     type=bytes,
+                     help="String associated to Non Available (NA) values")
    

 def __addSequenceInputOption(optionManager):
@ -124,12 +129,6 @@ def __addTabularInputOption(optionManager):
                     type=bytes,
                     help="Decimal separator")
    
-    group.add_argument('--na-string',
-                     action="store", dest="obi:nastring",
-                     default=b"NA",
-                     type=bytes,
-                     help="String associated to Non Available (NA) values")
-    
    group.add_argument('--strip-white',
                     action="store_false", dest="obi:stripwhite",
                     default=True,
@ -161,3 +160,14 @@ def addAllInputOption(optionManager):
    __addInputOption(optionManager)
    __addSequenceInputOption(optionManager)
    __addTabularInputOption(optionManager)
+    
+    
+def __addOutputOption(optionManager):
+    """Register the output URI argument on *optionManager*.
+
+    Adds one argument with no option strings, stored under the destination
+    ``'obi:outputURI'`` and shown as ``OUTPUT`` in the usage text.
+
+    NOTE(review): *optionManager* is presumably an ``argparse`` parser or
+    argument group, in which case this declares a positional argument --
+    confirm against the caller.
+    """
+    
+    optionManager.add_argument(
+                    dest='obi:outputURI',  
+                    metavar='OUTPUT', 
+                    help='Data destination URI')
+
+def addMinimalOutputOption(optionManager):
+    """Add the minimal set of output options to *optionManager*.
+
+    Currently this is only the output URI argument registered by
+    ``__addOutputOption``.
+    """
+    __addOutputOption(optionManager)
--- a/python/obitools3/commands/import.pyx
+++ b/python/obitools3/commands/import.pyx
@ -2,6 +2,8 @@

 # TODO cimport generate errors with argument numbers, but without them some variables can't be declared

+import sys
+
 from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
 from obitools3.files.universalopener cimport uopen
 from obitools3.parsers.fasta import fastaIterator
@ -20,6 +22,8 @@ from obitools3.dms.capi.obitypes cimport obitype_t, \

 from obitools3.dms.capi.obierrno cimport obi_errno

+from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
+from obitools3.uri.decode import open_uri

 __title__="Imports sequences from different formats into a DMS"
 
@ -30,83 +34,14 @@ default_config = {   'destview'     : None,
                     'skiperror'    : False,
                     'seqinformat'  : None,
                     'moltype'      : 'nuc',
-                     'filename'     : None
+                     'source'     : None
                 }

 def addOptions(parser):
-    parser.add_argument(dest='import:filename',     
-                        metavar='<FILENAME>', 
-                        nargs='?', 
-                        default=None,
-                        help='Name of the sequence file to import' )
    
-    group=parser.add_argument_group('obi import specific options')
+    addSequenceInputOption(parser)
+    addMinimalOutputOption(parser)

-    group.add_argument('--default-dms','-d',
-                     action="store", dest="obi:defaultdms",
-                     metavar='<DMS NAME>',
-                     default=None,
-                     type=str,
-                     help="Name of the default DMS for reading and writing data")
-
-    group.add_argument('--destination-view','-v',
-                     action="store", dest="import:destview",
-                     metavar='<VIEW NAME>',
-                     default=None,
-                     type=str,
-                     required=True,
-                     help="Name of the default DMS for reading and writing data")
-
-    group.add_argument('--skip',
-                     action="store", dest="import:skip",
-                     metavar='<N>',
-                     default=0,
-                     type=int,
-                     help="Skip the N first sequences")
-
-    group.add_argument('--only',
-                     action="store", dest="import:only",
-                     metavar='<N>',
-                     default=None,
-                     type=int,
-                     help="Treat only N sequences")
-
-    group.add_argument('--skip-on-error',
-                     action="store_true", dest="import:skiperror",
-                     default=None,
-                     help="Skip sequence entries with parse error")
-
-    group.add_argument('--fasta',
-                     action="store_const", dest="import:seqinformat",
-                     default=None,
-                     const='fasta',
-                     help="Input file is in fasta nucleic format (including obitools fasta extentions)")
-
-    group.add_argument('--fastq',
-                     action="store_const", dest="import:seqinformat",
-                     default=None,
-                     const='fastq',
-                     help="Input file is in sanger fastq nucleic format (standard fastq)")
-
-    group.add_argument('--nuc',
-                     action="store_const", dest="import:moltype",
-                     default=None,
-                     const='nuc',
-                     help="Input file contains nucleic sequences")
-
-    group.add_argument('--prot',
-                     action="store_const", dest="import:moltype",
-                     default=None,
-                     const='pep',
-                     help="Input file contains protein sequences")
-
-    group.add_argument('--NA',
-                     action="store", dest="import:NA",
-                     metavar='<NA_value>',
-                     default='NA',
-                     type=str,
-                     help="Character string for Not Available values in the input file "
-                          "(default: 'NA'")


 def run(config):
@ -142,147 +77,159 @@ def run(config):
    cdef   ProgressBar pb
    global             obi_errno
    
-    pb = ProgressBar(1000000, config, seconde=5)   # TODO should be number of records in file
    
-    inputs = uopen(config['import']['filename'])
+    logger=config['obi']['logger']
    
-    # Create or open DMS
-    d = DMS.open_or_new(config['obi']['defaultdms'])
    
-    get_quality = False
-    NUC_SEQS_view = False
-    if config['import']['seqinformat']=='fasta':
-        get_quality = False
-        NUC_SEQS_view = True
-        iseq = fastaIterator(inputs, skip=config['import']['skip'])
-        view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
-    elif config['import']['seqinformat']=='fastq':
-        get_quality = True
-        NUC_SEQS_view = True
-        iseq = fastqIterator(inputs, skip=config['import']['skip'])
-        view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
-    else:
-        raise RuntimeError('File format not handled')
+    logger.info("obi import : imports file into an DMS")
    
-    # Save basic columns in variables for optimization
-    if NUC_SEQS_view :
-        id_col = view["ID"]
-        def_col = view["DEFINITION"]
-        seq_col = view["NUC_SEQ"]
-        if get_quality :
-            qual_col = view["QUALITY"]
+    inputs = open_uri(config['obi']['inputURI'])
    
-    dcols = {}
+    print(inputs)
    
-    i = 0
-    for seq in iseq :
-        if i == config['import']['only'] :
-            break
-        else :
-            pb(i)
-            if NUC_SEQS_view :
-                id_col[i] = seq['id']
-                def_col[i] = seq['definition']
-                seq_col[i] = seq['sequence']
-                if get_quality :
-                    qual_col[i] = seq['quality']
+    sys.exit()
    
-            for tag in seq['tags'] :
-                            
-                value = seq['tags'][tag]
-                
-                # Check NA value
-                if value == config['import']['NA'] :
-                    value = None
-                
-                if tag not in dcols :
-                    
-                    value_type = type(value)
-                    nb_elts = 1
-                    value_obitype = OBI_VOID
-                    
-                    if value_type == dict or value_type == list :
-                        nb_elts = len(value)
-                        elt_names = list(value)
-                    else :
-                        nb_elts = 1
-                        elt_names = None
-                    
-                    value_obitype = get_obitype(value)
-                    
-                    if value_obitype != OBI_VOID :
-                        dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
-                                                
-                        # Fill value
-                        dcols[tag][0][i] = value
-                    
-                    # TODO else log error?
-
-                else :
-        
-                    rewrite = False
-
-                    # Check type adequation
-                    old_type = dcols[tag][1]
-                    new_type = OBI_VOID
-                    new_type = update_obitype(old_type, value)
-                    if old_type != new_type :
-                        rewrite = True
-
-                    try:
-                        # Fill value
-                        dcols[tag][0][i] = value
-                    
-                    except IndexError :
-                                                
-                        value_type = type(value)
-                        old_column = dcols[tag][0]
-                        old_nb_elements_per_line = old_column.nb_elements_per_line
-                        new_nb_elements_per_line = 0
-                        old_elements_names = old_column.elements_names
-                        new_elements_names = None
-    
-                        #####################################################################
-                        
-                        # Check the length and keys of column lines if needed
-                        if value_type == dict :    # Check dictionary keys
-                            for k in value :
-                                if k not in old_elements_names :
-                                    new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
-                                    rewrite = True
-                                    break
-                        
-                        elif value_type == list or value_type == tuple :  # Check vector length
-                            if old_nb_elements_per_line < len(value) :
-                                new_nb_elements_per_line = len(value)
-                                rewrite = True
-                        
-                        #####################################################################
-                        
-                        if rewrite :
-                            if new_nb_elements_per_line == 0 and new_elements_names is not None :
-                                new_nb_elements_per_line = len(new_elements_names)
-                            
-                            # Reset obierrno 
-                            obi_errno = 0
-
-                            dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name, 
-                                                                                   new_data_type=new_type, 
-                                                                                   new_nb_elements_per_line=new_nb_elements_per_line,
-                                                                                   new_elements_names=new_elements_names), 
-                                          value_obitype)
-                            
-                            # Update the dictionary:
-                            for t in dcols :
-                                dcols[t] = (view[t], dcols[t][1])
-                            
-                            # Fill value
-                            dcols[tag][0][i] = value
-                                    
-            i+=1
-
-    print("\n")
-    print(view.__repr__())
- 
-    d.close()
+#     pb = ProgressBar(1000000, config, seconde=5)   # TODO should be number of records in file
+#        
+#     inputs = uopen(config['import']['filename'])
+# 
+#     # Create or open DMS
+#     d = DMS.open_or_new(config['obi']['defaultdms'])
+#     
+#     get_quality = False
+#     NUC_SEQS_view = False
+#     if config['import']['seqinformat']=='fasta':
+#         get_quality = False
+#         NUC_SEQS_view = True
+#         iseq = fastaIterator(inputs, skip=config['import']['skip'])
+#         view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
+#     elif config['import']['seqinformat']=='fastq':
+#         get_quality = True
+#         NUC_SEQS_view = True
+#         iseq = fastqIterator(inputs, skip=config['import']['skip'])
+#         view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
+#     else:
+#         raise RuntimeError('File format not handled')
+#         
+#     # Save basic columns in variables for optimization
+#     if NUC_SEQS_view :
+#         id_col = view["ID"]
+#         def_col = view["DEFINITION"]
+#         seq_col = view["NUC_SEQ"]
+#         if get_quality :
+#             qual_col = view["QUALITY"]
+#     
+#     dcols = {}
+#     
+#     i = 0
+#     for seq in iseq :
+#         if i == config['import']['only'] :
+#             break
+#         else :
+#             pb(i)
+#             if NUC_SEQS_view :
+#                 id_col[i] = seq['id']
+#                 def_col[i] = seq['definition']
+#                 seq_col[i] = seq['sequence']
+#                 if get_quality :
+#                     qual_col[i] = seq['quality']
+#             
+#             for tag in seq['tags'] :
+#                             
+#                 value = seq['tags'][tag]
+#                 
+#                 # Check NA value
+#                 if value == config['import']['NA'] :
+#                     value = None
+#                 
+#                 if tag not in dcols :
+#                     
+#                     value_type = type(value)
+#                     nb_elts = 1
+#                     value_obitype = OBI_VOID
+#                     
+#                     if value_type == dict or value_type == list :
+#                         nb_elts = len(value)
+#                         elt_names = list(value)
+#                     else :
+#                         nb_elts = 1
+#                         elt_names = None
+#                     
+#                     value_obitype = get_obitype(value)
+#                     
+#                     if value_obitype != OBI_VOID :
+#                         dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
+#                                                 
+#                         # Fill value
+#                         dcols[tag][0][i] = value
+#                     
+#                     # TODO else log error?
+# 
+#                 else :
+#         
+#                     rewrite = False
+# 
+#                     # Check type adequation
+#                     old_type = dcols[tag][1]
+#                     new_type = OBI_VOID
+#                     new_type = update_obitype(old_type, value)
+#                     if old_type != new_type :
+#                         rewrite = True
+# 
+#                     try:
+#                         # Fill value
+#                         dcols[tag][0][i] = value
+#                     
+#                     except IndexError :
+#                                                 
+#                         value_type = type(value)
+#                         old_column = dcols[tag][0]
+#                         old_nb_elements_per_line = old_column.nb_elements_per_line
+#                         new_nb_elements_per_line = 0
+#                         old_elements_names = old_column.elements_names
+#                         new_elements_names = None
+#     
+#                         #####################################################################
+#                         
+#                         # Check the length and keys of column lines if needed
+#                         if value_type == dict :    # Check dictionary keys
+#                             for k in value :
+#                                 if k not in old_elements_names :
+#                                     new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
+#                                     rewrite = True
+#                                     break
+#                         
+#                         elif value_type == list or value_type == tuple :  # Check vector length
+#                             if old_nb_elements_per_line < len(value) :
+#                                 new_nb_elements_per_line = len(value)
+#                                 rewrite = True
+#                         
+#                         #####################################################################
+#                         
+#                         if rewrite :
+#                             if new_nb_elements_per_line == 0 and new_elements_names is not None :
+#                                 new_nb_elements_per_line = len(new_elements_names)
+#                             
+#                             # Reset obierrno 
+#                             obi_errno = 0
+# 
+#                             dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name, 
+#                                                                                    new_data_type=new_type, 
+#                                                                                    new_nb_elements_per_line=new_nb_elements_per_line,
+#                                                                                    new_elements_names=new_elements_names), 
+#                                           value_obitype)
+#                             
+#                             # Update the dictionary:
+#                             for t in dcols :
+#                                 dcols[t] = (view[t], dcols[t][1])
+#                             
+#                             # Fill value
+#                             dcols[tag][0][i] = value
+#                                     
+#             i+=1
+# 
+#     print("\n")
+#     print(view.__repr__())
+#  
+#     d.close()

--- a/python/obitools3/parsers/universal.pyx
+++ b/python/obitools3/parsers/universal.pyx
@ -47,7 +47,7 @@ def entryIteratorFactory(lineiterator,
        
    first=next(i)

-    format="tab"
+    format=b"tabular"
    
    if first[0]==">":
        format=b"fasta"
@ -61,9 +61,6 @@ def entryIteratorFactory(lineiterator,
        format=b"ecopcrfile"
    elif is_ngsfilter_line(first):
        format=b"ngsfilter"
-    else:
-        format=b"tabular"
-        
        
    if format==b'fasta':
        if seqtype == b'nuc':
--- a/python/obitools3/uri/decode.pxd
+++ b/python/obitools3/uri/decode.pxd
@ -4,3 +4,7 @@ from obitools3.dms.dms cimport DMS
 from obitools3.dms.view.view cimport View
 from obitools3.dms.column.column cimport Column
 from obitools3.dms.taxo.taxo cimport Taxonomy
+
+from obitools3.utils cimport tobytes, tostr
+from obitools3.files.universalopener cimport uopen
+
--- a/python/obitools3/uri/decode.pyx
+++ b/python/obitools3/uri/decode.pyx
@ -3,16 +3,14 @@
 from urllib.parse import urlparse, urlunparse, parse_qs, ParseResultBytes
 from os.path import isdir, isfile, basename, join

-from obitools3.utils import tobytes
-
 from obitools3.dms.dms import DMS

-from obitools3.files.universalopener import uopen
 from obitools3.parsers.fasta import fastaNucIterator
 from obitools3.parsers.fastq import fastqIterator
 from obitools3.parsers.universal import entryIteratorFactory

 from obitools3.dms.obiseq import Nuc_Seq
+from obitools3.apps.config import getConfiguration,logger

 class MalformedURIException(RuntimeError):
    pass
@ -130,21 +128,29 @@ def open_dms_element(DMS dms, bytes path):
    return (dms,subsubpart)
                         

-def open_uri(uri,input=True,config={}):
+def open_uri(uri,bint input=True):
    cdef bytes urib = tobytes(uri)
    cdef bytes scheme
    cdef tuple dms
+    cdef dict qualifiers
+    cdef DMS default_dms
    
+    config = getConfiguration()
    urip = urlparse(urib)
        
-    default_dms=config["obi"]["defaultdms"]
+    if 'obi' not in config:
+        config['obi']={}
+    
+    try:
+        default_dms=config["obi"]["defaultdms"]
+    except KeyError:
+        default_dms=None
        
    scheme = urip.scheme

    error = None
    
    if scheme==b"" :
-        scheme=b'file'
        dms = open_dms(urip.path)
        if dms is None and default_dms is not None:
            dms=(default_dms,urip.path)
@ -168,16 +174,12 @@ def open_uri(uri,input=True,config={}):
            except Exception as e:
                error=e
           
-    urip = ParseResultBytes(scheme=scheme, 
-                            netloc=urip.netloc, 
-                            path=urip.path,
-                            params=urip.params, 
-                            query=urip.query, 
-                            fragment=urip.fragment)
-    uri=urlunparse(urip)
+    if not urip.scheme:
+        urib=b"file:"+urib
    
    try:
-        file = uopen(uri)
+        logger('info','Trying to open file : %s', tostr(urib))
+        file = uopen(tostr(urib))
    except Exception as e:
        file = None
        error=e 
@ -189,17 +191,26 @@ def open_uri(uri,input=True,config={}):
        if b'format' in qualifiers:
            format = qualifiers[b'format'][0]
        else:
-            format=config["obi"]["fileformat"]
+            try:
+                format=config["obi"]["fileformat"]
+            except KeyError:
+                format=None
        
        if b'seqtype' in qualifiers:
            seqtype=qualifiers[b'seqtype'][0]
        else:
-            seqtype=config["obi"]["seqtype"]
+            try:
+                seqtype=config["obi"]["seqtype"]
+            except KeyError:
+                seqtype=b'nuc'
        
        if b'skip' in qualifiers:
            skip=int(qualifiers[b"skip"][0])
        else:
-            skip=config["obi"]["skeep"]
+            try:
+                skip=config["obi"]["skip"]
+            except KeyError:
+                skip=0
        if skip < 0:    
            raise MalformedURIException('Malformed skip argument in URI')
        
@ -207,8 +218,11 @@ def open_uri(uri,input=True,config={}):
        if b'only' in qualifiers:
            only=int(qualifiers[b"only"][0])
        else:
-            only=config["obi"]["only"]
-        if only <= 0:    
+            try:
+                only=config["obi"]["only"]
+            except KeyError:
+                only=None
+        if only is not None and only <= 0:    
            raise MalformedURIException('Malformed only argument in URI')
        
            
@ -218,7 +232,10 @@ def open_uri(uri,input=True,config={}):
            except Exception as e:
                raise MalformedURIException('Malformed skiperror argument in URI')
        else:
-            skiperror=config["obi"]["skiperror"]
+            try:
+                skiperror=config["obi"]["skiperror"]
+            except KeyError:
+                skiperror=True
        if not isinstance(skiperror, bool):    
            raise MalformedURIException('Malformed skiperror argument in URI')
      
@ -228,7 +245,10 @@ def open_uri(uri,input=True,config={}):
            except Exception as e:
                raise MalformedURIException('Malformed noquality argument in URI')
        else:
-            noquality=config["obi"]["noquality"]
+            try:
+                noquality=config["obi"]["noquality"]
+            except KeyError:
+                noquality=False
        if not isinstance(noquality, bool):    
            raise MalformedURIException('Malformed noquality argument in URI')
      
@ -238,7 +258,10 @@ def open_uri(uri,input=True,config={}):
            elif qualifiers[b"qualityformat"][0]=="solexa":
                offset=64
        else:
-            offset=config["obi"]["qualityoffset"]
+            try:
+                offset=config["obi"]["qualityoffset"]
+            except KeyError:
+                offset=33
            
        if b"header" in qualifiers:
            try:
@ -246,14 +269,20 @@ def open_uri(uri,input=True,config={}):
            except Exception as e:
                raise MalformedURIException('Malformed header argument in URI')
        else:
-            header=config["obi"]["header"]
+            try:
+                header=config["obi"]["header"]
+            except KeyError:
+                header=False
        if not isinstance(header, bool):    
            raise MalformedURIException('Malformed header argument in URI')
       
        if b"sep" in qualifiers:
            sep=qualifiers[b"sep"][0][0]
        else:
-            seq=config["obi"]["sep"]
+            try:
+                sep=config["obi"]["sep"]
+            except KeyError:
+                sep=None
        
 #        if b"quote" in qualifiers:
 #            pass
@ -261,12 +290,18 @@ def open_uri(uri,input=True,config={}):
        if b"dec" in qualifiers:
            dec=qualifiers[b"dec"][0][0]
        else:
-            dec=config["obi"]["dec"]
+            try:
+                dec=config["obi"]["dec"]
+            except KeyError:
+                dec=b"."
        
        if b"nastring" in qualifiers:
            nastring=qualifiers[b"nastring"][0]
        else:
-            nastring=config["obi"]["nastring"]
+            try:
+                nastring=config["obi"]["nastring"]
+            except KeyError:
+                nastring=b'NA'
                
        if b"stripwhite" in qualifiers:
            try:
@ -274,7 +309,10 @@ def open_uri(uri,input=True,config={}):
            except Exception as e:
                raise MalformedURIException('Malformed stripwhite argument in URI')
        else:
-            stripwhite=config["obi"]["stripwhite"]
+            try:
+                stripwhite=config["obi"]["stripwhite"]
+            except KeyError:
+                stripwhite=True
        if not isinstance(stripwhite, bool):    
            raise MalformedURIException('Malformed stripwhite argument in URI')
        
@ -284,14 +322,20 @@ def open_uri(uri,input=True,config={}):
            except Exception as e:
                raise MalformedURIException('Malformed blanklineskip argument in URI')
        else:
-            blanklineskip=config["obi"]["blanklineskip"]
+            try:
+                blanklineskip=config["obi"]["blanklineskip"]
+            except KeyError:
+                blanklineskip=True
        if not isinstance(blanklineskip, bool):    
            raise MalformedURIException('Malformed blanklineskip argument in URI')
        
        if b"commentchar" in qualifiers:
            commentchar=qualifiers[b"commentchar"][0][0]
        else:
-            commentchar=config["obi"]["commentchar"]
+            try:
+                commentchar=config["obi"]["commentchar"]
+            except KeyError:
+                commentchar=b'#'

        if format is not None:
            if qualifiers[b"seqtype"]==b"nuc":