Patch decoding of URL

2017-07-28 12:41:28 +02:00
parent 84bb93096f
commit b9c65a871f
9 changed files with 276 additions and 260 deletions
--- a/python/obi.py
+++ b/python/obi.py
@ -22,6 +22,7 @@ default_config = { 'software'       : "The OBITools",
                   'loglevel'       : 'INFO',
                   'progress'       : True,
                   'inputURI'       : None,
                   'outputURI'      : None,
                   'defaultdms'     : None,
                   'inputview'      : None,
                   'outputview'     : None,
--- a/python/obitools3/apps/config.pyx
+++ b/python/obitools3/apps/config.pyx
@ -101,3 +101,14 @@ cpdef dict getConfiguration(str root_config_name="__default__",
    config['__done__']=True
    return config
 def logger(level, *messages):
    """Log *messages* at *level* through the configured logger.

    The logger object is read from the configuration returned by
    ``getConfiguration()`` at ``config[root]['logger']``, where ``root`` is
    ``config["__root_config__"]``.  It is only called when
    ``config[root]['verbose']`` is true.  If the lookup or the logging call
    fails, the messages are printed to standard error instead.

    :param level: name of the logger method to call (e.g. ``'info'``)
    :param messages: positional arguments forwarded to that method
    """
    try:
        config=getConfiguration()
        root = config["__root_config__"]
        log = config[root]['logger']
        if config[root]['verbose']:
            getattr(log, level)(*messages)
    except Exception:
        # Best-effort fallback.  Catching ``Exception`` rather than using a
        # bare ``except`` lets KeyboardInterrupt and SystemExit propagate.
        print(*messages,file=sys.stderr)
--- a/python/obitools3/apps/logging.pyx
+++ b/python/obitools3/apps/logging.pyx
@ -42,5 +42,7 @@ cpdef getLogger(dict config):
    rootlogger.setLevel(loglevel)
    config[root]['logger']=rootlogger
    config[root]['verbose']=True
    return rootlogger
--- a/python/obitools3/apps/optiongroups/init.py
+++ b/python/obitools3/apps/optiongroups/init.py
@ -2,8 +2,8 @@ def __addInputOption(optionManager):
    optionManager.add_argument(
                    dest='obi:inputURI',  
-                    metavar='index', 
+                    metavar='INPUT', 
-                    help='index root filename (produced by the oa index command)')
+                    help='Data source URI')
    group = optionManager.add_argument_group("Restriction to a sub-part options",
@ -23,6 +23,11 @@ def __addInputOption(optionManager):
                     type=int,
                     help="treat only N sequences")
    group.add_argument('--na-string',
                     action="store", dest="obi:nastring",
                     default=b"NA",
                     type=bytes,
                     help="String associated to Non Available (NA) values")
 def __addSequenceInputOption(optionManager):
@ -124,12 +129,6 @@ def __addTabularInputOption(optionManager):
                     type=bytes,
                     help="Decimal separator")
    group.add_argument('--na-string',
                     action="store", dest="obi:nastring",
                     default=b"NA",
                     type=bytes,
                     help="String associated to Non Available (NA) values")
    group.add_argument('--strip-white',
                     action="store_false", dest="obi:stripwhite",
                     default=True,
@ -161,3 +160,14 @@ def addAllInputOption(optionManager):
    __addInputOption(optionManager)
    __addSequenceInputOption(optionManager)
    __addTabularInputOption(optionManager)
 def __addOutputOption(optionManager):
    """Register the output URI argument on *optionManager*.

    The parsed value is stored under the ``obi:outputURI`` destination.
    """
    output_argument = {
        'dest': 'obi:outputURI',
        'metavar': 'OUTPUT',
        'help': 'Data destination URI',
    }
    optionManager.add_argument(**output_argument)
 def addMinimalOutputOption(optionManager):
    """Add the minimal output options: only the output URI argument."""
    __addOutputOption(optionManager)
--- a/python/obitools3/commands/import.pyx
+++ b/python/obitools3/commands/import.pyx
@ -2,6 +2,8 @@
 # TODO cimport generates errors with argument numbers, but without them some variables can't be declared
 import sys
 from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
 from obitools3.files.universalopener cimport uopen
 from obitools3.parsers.fasta import fastaIterator
@ -20,6 +22,8 @@ from obitools3.dms.capi.obitypes cimport obitype_t, \
 from obitools3.dms.capi.obierrno cimport obi_errno
 from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
 from obitools3.uri.decode import open_uri
 __title__="Imports sequences from different formats into a DMS"
@ -30,83 +34,14 @@ default_config = {   'destview'     : None,
                     'skiperror'    : False,
                     'seqinformat'  : None,
                     'moltype'      : 'nuc',
-                     'filename'     : None
+                     'source'     : None
                 }
 def addOptions(parser):
    parser.add_argument(dest='import:filename',     
                        metavar='<FILENAME>', 
                        nargs='?', 
                        default=None,
                        help='Name of the sequence file to import' )
-    group=parser.add_argument_group('obi import specific options')
+    addSequenceInputOption(parser)
    addMinimalOutputOption(parser)
    group.add_argument('--default-dms','-d',
                     action="store", dest="obi:defaultdms",
                     metavar='<DMS NAME>',
                     default=None,
                     type=str,
                     help="Name of the default DMS for reading and writing data")
    group.add_argument('--destination-view','-v',
                     action="store", dest="import:destview",
                     metavar='<VIEW NAME>',
                     default=None,
                     type=str,
                     required=True,
                     help="Name of the default DMS for reading and writing data")
    group.add_argument('--skip',
                     action="store", dest="import:skip",
                     metavar='<N>',
                     default=0,
                     type=int,
                     help="Skip the N first sequences")
    group.add_argument('--only',
                     action="store", dest="import:only",
                     metavar='<N>',
                     default=None,
                     type=int,
                     help="Treat only N sequences")
    group.add_argument('--skip-on-error',
                     action="store_true", dest="import:skiperror",
                     default=None,
                     help="Skip sequence entries with parse error")
    group.add_argument('--fasta',
                     action="store_const", dest="import:seqinformat",
                     default=None,
                     const='fasta',
                     help="Input file is in fasta nucleic format (including obitools fasta extentions)")
    group.add_argument('--fastq',
                     action="store_const", dest="import:seqinformat",
                     default=None,
                     const='fastq',
                     help="Input file is in sanger fastq nucleic format (standard fastq)")
    group.add_argument('--nuc',
                     action="store_const", dest="import:moltype",
                     default=None,
                     const='nuc',
                     help="Input file contains nucleic sequences")
    group.add_argument('--prot',
                     action="store_const", dest="import:moltype",
                     default=None,
                     const='pep',
                     help="Input file contains protein sequences")
    group.add_argument('--NA',
                     action="store", dest="import:NA",
                     metavar='<NA_value>',
                     default='NA',
                     type=str,
                     help="Character string for Not Available values in the input file "
                          "(default: 'NA'")
 def run(config):
@ -142,147 +77,159 @@ def run(config):
    cdef   ProgressBar pb
    global             obi_errno
    pb = ProgressBar(1000000, config, seconde=5)   # TODO should be number of records in file
-    inputs = uopen(config['import']['filename'])
+    logger=config['obi']['logger']
    # Create or open DMS
    d = DMS.open_or_new(config['obi']['defaultdms'])
-    get_quality = False
+    logger.info("obi import : imports file into an DMS")
    NUC_SEQS_view = False
    if config['import']['seqinformat']=='fasta':
        get_quality = False
        NUC_SEQS_view = True
        iseq = fastaIterator(inputs, skip=config['import']['skip'])
        view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
    elif config['import']['seqinformat']=='fastq':
        get_quality = True
        NUC_SEQS_view = True
        iseq = fastqIterator(inputs, skip=config['import']['skip'])
        view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
    else:
        raise RuntimeError('File format not handled')
-    # Save basic columns in variables for optimization
+    inputs = open_uri(config['obi']['inputURI'])
    if NUC_SEQS_view :
        id_col = view["ID"]
        def_col = view["DEFINITION"]
        seq_col = view["NUC_SEQ"]
        if get_quality :
            qual_col = view["QUALITY"]
-    dcols = {}
+    print(inputs)
-    i = 0
+    sys.exit()
    for seq in iseq :
        if i == config['import']['only'] :
            break
        else :
            pb(i)
            if NUC_SEQS_view :
                id_col[i] = seq['id']
                def_col[i] = seq['definition']
                seq_col[i] = seq['sequence']
                if get_quality :
                    qual_col[i] = seq['quality']
-            for tag in seq['tags'] :
+#     pb = ProgressBar(1000000, config, seconde=5)   # TODO should be number of records in file
-                            
+#        
-                value = seq['tags'][tag]
+#     inputs = uopen(config['import']['filename'])
-                
+# 
-                # Check NA value
+#     # Create or open DMS
-                if value == config['import']['NA'] :
+#     d = DMS.open_or_new(config['obi']['defaultdms'])
-                    value = None
+#     
-                
+#     get_quality = False
-                if tag not in dcols :
+#     NUC_SEQS_view = False
-                    
+#     if config['import']['seqinformat']=='fasta':
-                    value_type = type(value)
+#         get_quality = False
-                    nb_elts = 1
+#         NUC_SEQS_view = True
-                    value_obitype = OBI_VOID
+#         iseq = fastaIterator(inputs, skip=config['import']['skip'])
-                    
+#         view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
-                    if value_type == dict or value_type == list :
+#     elif config['import']['seqinformat']=='fastq':
-                        nb_elts = len(value)
+#         get_quality = True
-                        elt_names = list(value)
+#         NUC_SEQS_view = True
-                    else :
+#         iseq = fastqIterator(inputs, skip=config['import']['skip'])
-                        nb_elts = 1
+#         view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
-                        elt_names = None
+#     else:
-                    
+#         raise RuntimeError('File format not handled')
-                    value_obitype = get_obitype(value)
+#         
-                    
+#     # Save basic columns in variables for optimization
-                    if value_obitype != OBI_VOID :
+#     if NUC_SEQS_view :
-                        dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
+#         id_col = view["ID"]
-                                                
+#         def_col = view["DEFINITION"]
-                        # Fill value
+#         seq_col = view["NUC_SEQ"]
-                        dcols[tag][0][i] = value
+#         if get_quality :
-                    
+#             qual_col = view["QUALITY"]
-                    # TODO else log error?
+#     
-
+#     dcols = {}
-                else :
+#     
-        
+#     i = 0
-                    rewrite = False
+#     for seq in iseq :
-
+#         if i == config['import']['only'] :
-                    # Check type adequation
+#             break
-                    old_type = dcols[tag][1]
+#         else :
-                    new_type = OBI_VOID
+#             pb(i)
-                    new_type = update_obitype(old_type, value)
+#             if NUC_SEQS_view :
-                    if old_type != new_type :
+#                 id_col[i] = seq['id']
-                        rewrite = True
+#                 def_col[i] = seq['definition']
-
+#                 seq_col[i] = seq['sequence']
-                    try:
+#                 if get_quality :
-                        # Fill value
+#                     qual_col[i] = seq['quality']
-                        dcols[tag][0][i] = value
+#             
-                    
+#             for tag in seq['tags'] :
-                    except IndexError :
+#                             
-                                                
+#                 value = seq['tags'][tag]
-                        value_type = type(value)
+#                 
-                        old_column = dcols[tag][0]
+#                 # Check NA value
-                        old_nb_elements_per_line = old_column.nb_elements_per_line
+#                 if value == config['import']['NA'] :
-                        new_nb_elements_per_line = 0
+#                     value = None
-                        old_elements_names = old_column.elements_names
+#                 
-                        new_elements_names = None
+#                 if tag not in dcols :
-    
+#                     
-                        #####################################################################
+#                     value_type = type(value)
-                        
+#                     nb_elts = 1
-                        # Check the length and keys of column lines if needed
+#                     value_obitype = OBI_VOID
-                        if value_type == dict :    # Check dictionary keys
+#                     
-                            for k in value :
+#                     if value_type == dict or value_type == list :
-                                if k not in old_elements_names :
+#                         nb_elts = len(value)
-                                    new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
+#                         elt_names = list(value)
-                                    rewrite = True
+#                     else :
-                                    break
+#                         nb_elts = 1
-                        
+#                         elt_names = None
-                        elif value_type == list or value_type == tuple :  # Check vector length
+#                     
-                            if old_nb_elements_per_line < len(value) :
+#                     value_obitype = get_obitype(value)
-                                new_nb_elements_per_line = len(value)
+#                     
-                                rewrite = True
+#                     if value_obitype != OBI_VOID :
-                        
+#                         dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
-                        #####################################################################
+#                                                 
-                        
+#                         # Fill value
-                        if rewrite :
+#                         dcols[tag][0][i] = value
-                            if new_nb_elements_per_line == 0 and new_elements_names is not None :
+#                     
-                                new_nb_elements_per_line = len(new_elements_names)
+#                     # TODO else log error?
-                            
+# 
-                            # Reset obierrno 
+#                 else :
-                            obi_errno = 0
+#         
-
+#                     rewrite = False
-                            dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name, 
+# 
-                                                                                   new_data_type=new_type, 
+#                     # Check type adequation
-                                                                                   new_nb_elements_per_line=new_nb_elements_per_line,
+#                     old_type = dcols[tag][1]
-                                                                                   new_elements_names=new_elements_names), 
+#                     new_type = OBI_VOID
-                                          value_obitype)
+#                     new_type = update_obitype(old_type, value)
-                            
+#                     if old_type != new_type :
-                            # Update the dictionary:
+#                         rewrite = True
-                            for t in dcols :
+# 
-                                dcols[t] = (view[t], dcols[t][1])
+#                     try:
-                            
+#                         # Fill value
-                            # Fill value
+#                         dcols[tag][0][i] = value
-                            dcols[tag][0][i] = value
+#                     
-                                    
+#                     except IndexError :
-            i+=1
+#                                                 
-
+#                         value_type = type(value)
-    print("\n")
+#                         old_column = dcols[tag][0]
-    print(view.__repr__())
+#                         old_nb_elements_per_line = old_column.nb_elements_per_line
- 
+#                         new_nb_elements_per_line = 0
-    d.close()
+#                         old_elements_names = old_column.elements_names
 #                         new_elements_names = None
 #     
 #                         #####################################################################
 #                         
 #                         # Check the length and keys of column lines if needed
 #                         if value_type == dict :    # Check dictionary keys
 #                             for k in value :
 #                                 if k not in old_elements_names :
 #                                     new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
 #                                     rewrite = True
 #                                     break
 #                         
 #                         elif value_type == list or value_type == tuple :  # Check vector length
 #                             if old_nb_elements_per_line < len(value) :
 #                                 new_nb_elements_per_line = len(value)
 #                                 rewrite = True
 #                         
 #                         #####################################################################
 #                         
 #                         if rewrite :
 #                             if new_nb_elements_per_line == 0 and new_elements_names is not None :
 #                                 new_nb_elements_per_line = len(new_elements_names)
 #                             
 #                             # Reset obierrno 
 #                             obi_errno = 0
 # 
 #                             dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name, 
 #                                                                                    new_data_type=new_type, 
 #                                                                                    new_nb_elements_per_line=new_nb_elements_per_line,
 #                                                                                    new_elements_names=new_elements_names), 
 #                                           value_obitype)
 #                             
 #                             # Update the dictionary:
 #                             for t in dcols :
 #                                 dcols[t] = (view[t], dcols[t][1])
 #                             
 #                             # Fill value
 #                             dcols[tag][0][i] = value
 #                                     
 #             i+=1
 # 
 #     print("\n")
 #     print(view.__repr__())
 #  
 #     d.close()
--- a/python/obitools3/parsers/universal.pyx
+++ b/python/obitools3/parsers/universal.pyx
@ -47,7 +47,7 @@ def entryIteratorFactory(lineiterator,
    first=next(i)
-    format="tab"
+    format=b"tabular"
    if first[0]==">":
        format=b"fasta"
@ -61,9 +61,6 @@ def entryIteratorFactory(lineiterator,
        format=b"ecopcrfile"
    elif is_ngsfilter_line(first):
        format=b"ngsfilter"
    else:
        format=b"tabular"
    if format==b'fasta':
        if seqtype == b'nuc':
--- a/python/obitools3/uri/decode.pxd
+++ b/python/obitools3/uri/decode.pxd
@ -4,3 +4,7 @@ from obitools3.dms.dms cimport DMS
 from obitools3.dms.view.view cimport View
 from obitools3.dms.column.column cimport Column
 from obitools3.dms.taxo.taxo cimport Taxonomy
 from obitools3.utils cimport tobytes, tostr
 from obitools3.files.universalopener cimport uopen
--- a/python/obitools3/uri/decode.pyx
+++ b/python/obitools3/uri/decode.pyx
@ -3,16 +3,14 @@
 from urllib.parse import urlparse, urlunparse, parse_qs, ParseResultBytes
 from os.path import isdir, isfile, basename, join
 from obitools3.utils import tobytes
 from obitools3.dms.dms import DMS
 from obitools3.files.universalopener import uopen
 from obitools3.parsers.fasta import fastaNucIterator
 from obitools3.parsers.fastq import fastqIterator
 from obitools3.parsers.universal import entryIteratorFactory
 from obitools3.dms.obiseq import Nuc_Seq
 from obitools3.apps.config import getConfiguration,logger
 class MalformedURIException(RuntimeError):
    """Raised when a URI, or one of its qualifiers, cannot be interpreted."""
@ -130,21 +128,29 @@ def open_dms_element(DMS dms, bytes path):
    return (dms,subsubpart)
-def open_uri(uri,input=True,config={}):
+def open_uri(uri,bint input=True):
    cdef bytes urib = tobytes(uri)
    cdef bytes scheme
    cdef tuple dms
    cdef dict qualifiers
    cdef DMS default_dms
    config = getConfiguration()
    urip = urlparse(urib)
    if 'obi' not in config:
        config['obi']={}
    try:
        default_dms=config["obi"]["defaultdms"]
    except KeyError:
        default_dms=None
    scheme = urip.scheme
    error = None
    if scheme==b"" :
        scheme=b'file'
        dms = open_dms(urip.path)
        if dms is None and default_dms is not None:
            dms=(default_dms,urip.path)
@ -168,16 +174,12 @@ def open_uri(uri,input=True,config={}):
            except Exception as e:
                error=e
-    urip = ParseResultBytes(scheme=scheme, 
+    if not urip.scheme:
-                            netloc=urip.netloc, 
+        urib=b"file:"+urib
                            path=urip.path,
                            params=urip.params, 
                            query=urip.query, 
                            fragment=urip.fragment)
    uri=urlunparse(urip)
    try:
-        file = uopen(uri)
+        logger('info','Trying to open file : %s', tostr(urib))
        file = uopen(tostr(urib))
    except Exception as e:
        file = None
        error=e 
@ -189,17 +191,26 @@ def open_uri(uri,input=True,config={}):
        if b'format' in qualifiers:
            format = qualifiers[b'format'][0]
        else:
            try:
                format=config["obi"]["fileformat"]
            except KeyError:
                format=None
        if b'seqtype' in qualifiers:
            seqtype=qualifiers[b'seqtype'][0]
        else:
            try:
                seqtype=config["obi"]["seqtype"]
            except KeyError:
                seqtype=b'nuc'
        if b'skip' in qualifiers:
            skip=int(qualifiers[b"skip"][0])
        else:
-            skip=config["obi"]["skeep"]
+            try:
                skip=config["obi"]["skip"]
            except KeyError:
                skip=0
        if skip < 0:    
            raise MalformedURIException('Malformed skip argument in URI')
@ -207,8 +218,11 @@ def open_uri(uri,input=True,config={}):
        if b'only' in qualifiers:
            only=int(qualifiers[b"only"][0])
        else:
            try:
                only=config["obi"]["only"]
-        if only <= 0:    
+            except KeyError:
                only=None
        if only is not None and only <= 0:    
            raise MalformedURIException('Malformed only argument in URI')
@ -218,7 +232,10 @@ def open_uri(uri,input=True,config={}):
            except Exception as e:
                raise MalformedURIException('Malformed skiperror argument in URI')
        else:
            try:
                skiperror=config["obi"]["skiperror"]
            except KeyError:
                skiperror=True
        if not isinstance(skiperror, bool):    
            raise MalformedURIException('Malformed skiperror argument in URI')
@ -228,7 +245,10 @@ def open_uri(uri,input=True,config={}):
            except Exception as e:
                raise MalformedURIException('Malformed noquality argument in URI')
        else:
            try:
                noquality=config["obi"]["noquality"]
            except KeyError:
                noquality=False
        if not isinstance(noquality, bool):    
            raise MalformedURIException('Malformed noquality argument in URI')
@ -238,7 +258,10 @@ def open_uri(uri,input=True,config={}):
            elif qualifiers[b"qualityformat"][0]=="solexa":
                offset=64
        else:
            try:
                offset=config["obi"]["qualityoffset"]
            except KeyError:
                offset=33
        if b"header" in qualifiers:
            try:
@ -246,14 +269,20 @@ def open_uri(uri,input=True,config={}):
            except Exception as e:
                raise MalformedURIException('Malformed header argument in URI')
        else:
            try:
                header=config["obi"]["header"]
            except KeyError:
                header=False
        if not isinstance(header, bool):    
            raise MalformedURIException('Malformed header argument in URI')
        if b"sep" in qualifiers:
            sep=qualifiers[b"sep"][0][0]
        else:
-            seq=config["obi"]["sep"]
+            try:
                sep=config["obi"]["sep"]
            except KeyError:
                sep=None
 #        if b"quote" in qualifiers:
 #            pass
@ -261,12 +290,18 @@ def open_uri(uri,input=True,config={}):
        if b"dec" in qualifiers:
            dec=qualifiers[b"dec"][0][0]
        else:
            try:
                dec=config["obi"]["dec"]
            except KeyError:
                dec=b"."
        if b"nastring" in qualifiers:
            nastring=qualifiers[b"nastring"][0]
        else:
            try:
                nastring=config["obi"]["nastring"]
            except KeyError:
                nastring=b'NA'
        if b"stripwhite" in qualifiers:
            try:
@ -274,7 +309,10 @@ def open_uri(uri,input=True,config={}):
            except Exception as e:
                raise MalformedURIException('Malformed stripwhite argument in URI')
        else:
            try:
                stripwhite=config["obi"]["stripwhite"]
            except KeyError:
                stripwhite=True
        if not isinstance(stripwhite, bool):    
            raise MalformedURIException('Malformed stripwhite argument in URI')
@ -284,14 +322,20 @@ def open_uri(uri,input=True,config={}):
            except Exception as e:
                raise MalformedURIException('Malformed blanklineskip argument in URI')
        else:
            try:
                blanklineskip=config["obi"]["blanklineskip"]
            except KeyError:
                blanklineskip=True
        if not isinstance(blanklineskip, bool):    
            raise MalformedURIException('Malformed blanklineskip argument in URI')
        if b"commentchar" in qualifiers:
            commentchar=qualifiers[b"commentchar"][0][0]
        else:
            try:
                commentchar=config["obi"]["commentchar"]
            except KeyError:
                commentchar=b'#'
        if format is not None:
            if qualifiers[b"seqtype"]==b"nuc":