Patch decoding of URL

This commit is contained in:
2017-07-28 12:41:28 +02:00
parent 84bb93096f
commit b9c65a871f
9 changed files with 276 additions and 260 deletions

View File

@ -22,6 +22,7 @@ default_config = { 'software' : "The OBITools",
'loglevel' : 'INFO', 'loglevel' : 'INFO',
'progress' : True, 'progress' : True,
'inputURI' : None, 'inputURI' : None,
'outputURI' : None,
'defaultdms' : None, 'defaultdms' : None,
'inputview' : None, 'inputview' : None,
'outputview' : None, 'outputview' : None,

View File

@ -101,3 +101,14 @@ cpdef dict getConfiguration(str root_config_name="__default__",
config['__done__']=True config['__done__']=True
return config return config
def logger(level, *messages):
    """Send *messages* to the configured root logger, or to stderr on failure.

    The current configuration is fetched with ``getConfiguration()``; the
    logger object stored under the root configuration section is looked up
    and its method named by *level* (e.g. ``'info'``) is called with
    *messages* as positional arguments. Nothing is emitted through the
    logger unless the ``'verbose'`` entry of that section is true.

    Parameters:
        level: name of the logger method to call (``'info'``, ``'debug'``...).
        *messages: arguments forwarded unchanged to that method.

    If any step fails (configuration unavailable, missing key, unknown
    level name, error raised by the logger itself), the messages are printed
    as-is on standard error instead.
    """
    try:
        config = getConfiguration()
        root = config["__root_config__"]
        log = config[root]['logger']
        if config[root]['verbose']:
            getattr(log, level)(*messages)
    except Exception:
        # Best-effort fallback. Catch ``Exception`` rather than using a bare
        # ``except`` so that KeyboardInterrupt and SystemExit still propagate.
        print(*messages, file=sys.stderr)

View File

@ -42,5 +42,7 @@ cpdef getLogger(dict config):
rootlogger.setLevel(loglevel) rootlogger.setLevel(loglevel)
config[root]['logger']=rootlogger config[root]['logger']=rootlogger
config[root]['verbose']=True
return rootlogger return rootlogger

View File

@ -2,8 +2,8 @@ def __addInputOption(optionManager):
optionManager.add_argument( optionManager.add_argument(
dest='obi:inputURI', dest='obi:inputURI',
metavar='index', metavar='INPUT',
help='index root filename (produced by the oa index command)') help='Data source URI')
group = optionManager.add_argument_group("Restriction to a sub-part options", group = optionManager.add_argument_group("Restriction to a sub-part options",
@ -23,6 +23,11 @@ def __addInputOption(optionManager):
type=int, type=int,
help="treat only N sequences") help="treat only N sequences")
group.add_argument('--na-string',
action="store", dest="obi:nastring",
default=b"NA",
type=bytes,
help="String associated to Non Available (NA) values")
def __addSequenceInputOption(optionManager): def __addSequenceInputOption(optionManager):
@ -124,12 +129,6 @@ def __addTabularInputOption(optionManager):
type=bytes, type=bytes,
help="Decimal separator") help="Decimal separator")
group.add_argument('--na-string',
action="store", dest="obi:nastring",
default=b"NA",
type=bytes,
help="String associated to Non Available (NA) values")
group.add_argument('--strip-white', group.add_argument('--strip-white',
action="store_false", dest="obi:stripwhite", action="store_false", dest="obi:stripwhite",
default=True, default=True,
@ -161,3 +160,14 @@ def addAllInputOption(optionManager):
__addInputOption(optionManager) __addInputOption(optionManager)
__addSequenceInputOption(optionManager) __addSequenceInputOption(optionManager)
__addTabularInputOption(optionManager) __addTabularInputOption(optionManager)
def __addOutputOption(optionManager):
    """Declare the positional OUTPUT argument on *optionManager*.

    The value is stored under the ``obi:outputURI`` destination.
    """
    output_argument = {
        'dest': 'obi:outputURI',
        'metavar': 'OUTPUT',
        'help': 'Data destination URI',
    }
    optionManager.add_argument(**output_argument)


def addMinimalOutputOption(optionManager):
    """Add the minimal set of output options: the OUTPUT argument only."""
    __addOutputOption(optionManager)

View File

@ -2,6 +2,8 @@
# TODO cimport generate errors with argument numbers, but without them some variables can't be declared # TODO cimport generate errors with argument numbers, but without them some variables can't be declared
import sys
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.files.universalopener cimport uopen from obitools3.files.universalopener cimport uopen
from obitools3.parsers.fasta import fastaIterator from obitools3.parsers.fasta import fastaIterator
@ -20,6 +22,8 @@ from obitools3.dms.capi.obitypes cimport obitype_t, \
from obitools3.dms.capi.obierrno cimport obi_errno from obitools3.dms.capi.obierrno cimport obi_errno
from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
from obitools3.uri.decode import open_uri
__title__="Imports sequences from different formats into a DMS" __title__="Imports sequences from different formats into a DMS"
@ -30,83 +34,14 @@ default_config = { 'destview' : None,
'skiperror' : False, 'skiperror' : False,
'seqinformat' : None, 'seqinformat' : None,
'moltype' : 'nuc', 'moltype' : 'nuc',
'filename' : None 'source' : None
} }
def addOptions(parser): def addOptions(parser):
parser.add_argument(dest='import:filename',
metavar='<FILENAME>',
nargs='?',
default=None,
help='Name of the sequence file to import' )
group=parser.add_argument_group('obi import specific options') addSequenceInputOption(parser)
addMinimalOutputOption(parser)
group.add_argument('--default-dms','-d',
action="store", dest="obi:defaultdms",
metavar='<DMS NAME>',
default=None,
type=str,
help="Name of the default DMS for reading and writing data")
group.add_argument('--destination-view','-v',
action="store", dest="import:destview",
metavar='<VIEW NAME>',
default=None,
type=str,
required=True,
help="Name of the default DMS for reading and writing data")
group.add_argument('--skip',
action="store", dest="import:skip",
metavar='<N>',
default=0,
type=int,
help="Skip the N first sequences")
group.add_argument('--only',
action="store", dest="import:only",
metavar='<N>',
default=None,
type=int,
help="Treat only N sequences")
group.add_argument('--skip-on-error',
action="store_true", dest="import:skiperror",
default=None,
help="Skip sequence entries with parse error")
group.add_argument('--fasta',
action="store_const", dest="import:seqinformat",
default=None,
const='fasta',
help="Input file is in fasta nucleic format (including obitools fasta extentions)")
group.add_argument('--fastq',
action="store_const", dest="import:seqinformat",
default=None,
const='fastq',
help="Input file is in sanger fastq nucleic format (standard fastq)")
group.add_argument('--nuc',
action="store_const", dest="import:moltype",
default=None,
const='nuc',
help="Input file contains nucleic sequences")
group.add_argument('--prot',
action="store_const", dest="import:moltype",
default=None,
const='pep',
help="Input file contains protein sequences")
group.add_argument('--NA',
action="store", dest="import:NA",
metavar='<NA_value>',
default='NA',
type=str,
help="Character string for Not Available values in the input file "
"(default: 'NA'")
def run(config): def run(config):
@ -142,147 +77,159 @@ def run(config):
cdef ProgressBar pb cdef ProgressBar pb
global obi_errno global obi_errno
pb = ProgressBar(1000000, config, seconde=5) # TODO should be number of records in file
inputs = uopen(config['import']['filename']) logger=config['obi']['logger']
# Create or open DMS
d = DMS.open_or_new(config['obi']['defaultdms'])
get_quality = False logger.info("obi import : imports file into an DMS")
NUC_SEQS_view = False
if config['import']['seqinformat']=='fasta':
get_quality = False
NUC_SEQS_view = True
iseq = fastaIterator(inputs, skip=config['import']['skip'])
view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
elif config['import']['seqinformat']=='fastq':
get_quality = True
NUC_SEQS_view = True
iseq = fastqIterator(inputs, skip=config['import']['skip'])
view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
else:
raise RuntimeError('File format not handled')
# Save basic columns in variables for optimization inputs = open_uri(config['obi']['inputURI'])
if NUC_SEQS_view :
id_col = view["ID"]
def_col = view["DEFINITION"]
seq_col = view["NUC_SEQ"]
if get_quality :
qual_col = view["QUALITY"]
dcols = {} print(inputs)
i = 0 sys.exit()
for seq in iseq :
if i == config['import']['only'] :
break
else :
pb(i)
if NUC_SEQS_view :
id_col[i] = seq['id']
def_col[i] = seq['definition']
seq_col[i] = seq['sequence']
if get_quality :
qual_col[i] = seq['quality']
for tag in seq['tags'] : # pb = ProgressBar(1000000, config, seconde=5) # TODO should be number of records in file
#
value = seq['tags'][tag] # inputs = uopen(config['import']['filename'])
#
# Check NA value # # Create or open DMS
if value == config['import']['NA'] : # d = DMS.open_or_new(config['obi']['defaultdms'])
value = None #
# get_quality = False
if tag not in dcols : # NUC_SEQS_view = False
# if config['import']['seqinformat']=='fasta':
value_type = type(value) # get_quality = False
nb_elts = 1 # NUC_SEQS_view = True
value_obitype = OBI_VOID # iseq = fastaIterator(inputs, skip=config['import']['skip'])
# view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
if value_type == dict or value_type == list : # elif config['import']['seqinformat']=='fastq':
nb_elts = len(value) # get_quality = True
elt_names = list(value) # NUC_SEQS_view = True
else : # iseq = fastqIterator(inputs, skip=config['import']['skip'])
nb_elts = 1 # view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
elt_names = None # else:
# raise RuntimeError('File format not handled')
value_obitype = get_obitype(value) #
# # Save basic columns in variables for optimization
if value_obitype != OBI_VOID : # if NUC_SEQS_view :
dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype) # id_col = view["ID"]
# def_col = view["DEFINITION"]
# Fill value # seq_col = view["NUC_SEQ"]
dcols[tag][0][i] = value # if get_quality :
# qual_col = view["QUALITY"]
# TODO else log error? #
# dcols = {}
else : #
# i = 0
rewrite = False # for seq in iseq :
# if i == config['import']['only'] :
# Check type adequation # break
old_type = dcols[tag][1] # else :
new_type = OBI_VOID # pb(i)
new_type = update_obitype(old_type, value) # if NUC_SEQS_view :
if old_type != new_type : # id_col[i] = seq['id']
rewrite = True # def_col[i] = seq['definition']
# seq_col[i] = seq['sequence']
try: # if get_quality :
# Fill value # qual_col[i] = seq['quality']
dcols[tag][0][i] = value #
# for tag in seq['tags'] :
except IndexError : #
# value = seq['tags'][tag]
value_type = type(value) #
old_column = dcols[tag][0] # # Check NA value
old_nb_elements_per_line = old_column.nb_elements_per_line # if value == config['import']['NA'] :
new_nb_elements_per_line = 0 # value = None
old_elements_names = old_column.elements_names #
new_elements_names = None # if tag not in dcols :
#
##################################################################### # value_type = type(value)
# nb_elts = 1
# Check the length and keys of column lines if needed # value_obitype = OBI_VOID
if value_type == dict : # Check dictionary keys #
for k in value : # if value_type == dict or value_type == list :
if k not in old_elements_names : # nb_elts = len(value)
new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value])) # elt_names = list(value)
rewrite = True # else :
break # nb_elts = 1
# elt_names = None
elif value_type == list or value_type == tuple : # Check vector length #
if old_nb_elements_per_line < len(value) : # value_obitype = get_obitype(value)
new_nb_elements_per_line = len(value) #
rewrite = True # if value_obitype != OBI_VOID :
# dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
##################################################################### #
# # Fill value
if rewrite : # dcols[tag][0][i] = value
if new_nb_elements_per_line == 0 and new_elements_names is not None : #
new_nb_elements_per_line = len(new_elements_names) # # TODO else log error?
#
# Reset obierrno # else :
obi_errno = 0 #
# rewrite = False
dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name, #
new_data_type=new_type, # # Check type adequation
new_nb_elements_per_line=new_nb_elements_per_line, # old_type = dcols[tag][1]
new_elements_names=new_elements_names), # new_type = OBI_VOID
value_obitype) # new_type = update_obitype(old_type, value)
# if old_type != new_type :
# Update the dictionary: # rewrite = True
for t in dcols : #
dcols[t] = (view[t], dcols[t][1]) # try:
# # Fill value
# Fill value # dcols[tag][0][i] = value
dcols[tag][0][i] = value #
# except IndexError :
i+=1 #
# value_type = type(value)
print("\n") # old_column = dcols[tag][0]
print(view.__repr__()) # old_nb_elements_per_line = old_column.nb_elements_per_line
# new_nb_elements_per_line = 0
d.close() # old_elements_names = old_column.elements_names
# new_elements_names = None
#
# #####################################################################
#
# # Check the length and keys of column lines if needed
# if value_type == dict : # Check dictionary keys
# for k in value :
# if k not in old_elements_names :
# new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
# rewrite = True
# break
#
# elif value_type == list or value_type == tuple : # Check vector length
# if old_nb_elements_per_line < len(value) :
# new_nb_elements_per_line = len(value)
# rewrite = True
#
# #####################################################################
#
# if rewrite :
# if new_nb_elements_per_line == 0 and new_elements_names is not None :
# new_nb_elements_per_line = len(new_elements_names)
#
# # Reset obierrno
# obi_errno = 0
#
# dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
# new_data_type=new_type,
# new_nb_elements_per_line=new_nb_elements_per_line,
# new_elements_names=new_elements_names),
# value_obitype)
#
# # Update the dictionary:
# for t in dcols :
# dcols[t] = (view[t], dcols[t][1])
#
# # Fill value
# dcols[tag][0][i] = value
#
# i+=1
#
# print("\n")
# print(view.__repr__())
#
# d.close()

View File

@ -47,7 +47,7 @@ def entryIteratorFactory(lineiterator,
first=next(i) first=next(i)
format="tab" format=b"tabular"
if first[0]==">": if first[0]==">":
format=b"fasta" format=b"fasta"
@ -61,9 +61,6 @@ def entryIteratorFactory(lineiterator,
format=b"ecopcrfile" format=b"ecopcrfile"
elif is_ngsfilter_line(first): elif is_ngsfilter_line(first):
format=b"ngsfilter" format=b"ngsfilter"
else:
format=b"tabular"
if format==b'fasta': if format==b'fasta':
if seqtype == b'nuc': if seqtype == b'nuc':

View File

@ -4,3 +4,7 @@ from obitools3.dms.dms cimport DMS
from obitools3.dms.view.view cimport View from obitools3.dms.view.view cimport View
from obitools3.dms.column.column cimport Column from obitools3.dms.column.column cimport Column
from obitools3.dms.taxo.taxo cimport Taxonomy from obitools3.dms.taxo.taxo cimport Taxonomy
from obitools3.utils cimport tobytes, tostr
from obitools3.files.universalopener cimport uopen

View File

@ -3,16 +3,14 @@
from urllib.parse import urlparse, urlunparse, parse_qs, ParseResultBytes from urllib.parse import urlparse, urlunparse, parse_qs, ParseResultBytes
from os.path import isdir, isfile, basename, join from os.path import isdir, isfile, basename, join
from obitools3.utils import tobytes
from obitools3.dms.dms import DMS from obitools3.dms.dms import DMS
from obitools3.files.universalopener import uopen
from obitools3.parsers.fasta import fastaNucIterator from obitools3.parsers.fasta import fastaNucIterator
from obitools3.parsers.fastq import fastqIterator from obitools3.parsers.fastq import fastqIterator
from obitools3.parsers.universal import entryIteratorFactory from obitools3.parsers.universal import entryIteratorFactory
from obitools3.dms.obiseq import Nuc_Seq from obitools3.dms.obiseq import Nuc_Seq
from obitools3.apps.config import getConfiguration,logger
class MalformedURIException(RuntimeError): class MalformedURIException(RuntimeError):
pass pass
@ -130,21 +128,29 @@ def open_dms_element(DMS dms, bytes path):
return (dms,subsubpart) return (dms,subsubpart)
def open_uri(uri,input=True,config={}): def open_uri(uri,bint input=True):
cdef bytes urib = tobytes(uri) cdef bytes urib = tobytes(uri)
cdef bytes scheme cdef bytes scheme
cdef tuple dms cdef tuple dms
cdef dict qualifiers
cdef DMS default_dms
config = getConfiguration()
urip = urlparse(urib) urip = urlparse(urib)
if 'obi' not in config:
config['obi']={}
try:
default_dms=config["obi"]["defaultdms"] default_dms=config["obi"]["defaultdms"]
except KeyError:
default_dms=None
scheme = urip.scheme scheme = urip.scheme
error = None error = None
if scheme==b"" : if scheme==b"" :
scheme=b'file'
dms = open_dms(urip.path) dms = open_dms(urip.path)
if dms is None and default_dms is not None: if dms is None and default_dms is not None:
dms=(default_dms,urip.path) dms=(default_dms,urip.path)
@ -168,16 +174,12 @@ def open_uri(uri,input=True,config={}):
except Exception as e: except Exception as e:
error=e error=e
urip = ParseResultBytes(scheme=scheme, if not urip.scheme:
netloc=urip.netloc, urib=b"file:"+urib
path=urip.path,
params=urip.params,
query=urip.query,
fragment=urip.fragment)
uri=urlunparse(urip)
try: try:
file = uopen(uri) logger('info','Trying to open file : %s', tostr(urib))
file = uopen(tostr(urib))
except Exception as e: except Exception as e:
file = None file = None
error=e error=e
@ -189,17 +191,26 @@ def open_uri(uri,input=True,config={}):
if b'format' in qualifiers: if b'format' in qualifiers:
format = qualifiers[b'format'][0] format = qualifiers[b'format'][0]
else: else:
try:
format=config["obi"]["fileformat"] format=config["obi"]["fileformat"]
except KeyError:
format=None
if b'seqtype' in qualifiers: if b'seqtype' in qualifiers:
seqtype=qualifiers[b'seqtype'][0] seqtype=qualifiers[b'seqtype'][0]
else: else:
try:
seqtype=config["obi"]["seqtype"] seqtype=config["obi"]["seqtype"]
except KeyError:
seqtype=b'nuc'
if b'skip' in qualifiers: if b'skip' in qualifiers:
skip=int(qualifiers[b"skip"][0]) skip=int(qualifiers[b"skip"][0])
else: else:
skip=config["obi"]["skeep"] try:
skip=config["obi"]["skip"]
except KeyError:
skip=0
if skip < 0: if skip < 0:
raise MalformedURIException('Malformed skip argument in URI') raise MalformedURIException('Malformed skip argument in URI')
@ -207,8 +218,11 @@ def open_uri(uri,input=True,config={}):
if b'only' in qualifiers: if b'only' in qualifiers:
only=int(qualifiers[b"only"][0]) only=int(qualifiers[b"only"][0])
else: else:
try:
only=config["obi"]["only"] only=config["obi"]["only"]
if only <= 0: except KeyError:
only=None
if only is not None and only <= 0:
raise MalformedURIException('Malformed only argument in URI') raise MalformedURIException('Malformed only argument in URI')
@ -218,7 +232,10 @@ def open_uri(uri,input=True,config={}):
except Exception as e: except Exception as e:
raise MalformedURIException('Malformed skiperror argument in URI') raise MalformedURIException('Malformed skiperror argument in URI')
else: else:
try:
skiperror=config["obi"]["skiperror"] skiperror=config["obi"]["skiperror"]
except KeyError:
skiperror=True
if not isinstance(skiperror, bool): if not isinstance(skiperror, bool):
raise MalformedURIException('Malformed skiperror argument in URI') raise MalformedURIException('Malformed skiperror argument in URI')
@ -228,7 +245,10 @@ def open_uri(uri,input=True,config={}):
except Exception as e: except Exception as e:
raise MalformedURIException('Malformed noquality argument in URI') raise MalformedURIException('Malformed noquality argument in URI')
else: else:
try:
noquality=config["obi"]["noquality"] noquality=config["obi"]["noquality"]
except KeyError:
noquality=False
if not isinstance(noquality, bool): if not isinstance(noquality, bool):
raise MalformedURIException('Malformed noquality argument in URI') raise MalformedURIException('Malformed noquality argument in URI')
@ -238,7 +258,10 @@ def open_uri(uri,input=True,config={}):
elif qualifiers[b"qualityformat"][0]=="solexa": elif qualifiers[b"qualityformat"][0]=="solexa":
offset=64 offset=64
else: else:
try:
offset=config["obi"]["qualityoffset"] offset=config["obi"]["qualityoffset"]
except KeyError:
offset=33
if b"header" in qualifiers: if b"header" in qualifiers:
try: try:
@ -246,14 +269,20 @@ def open_uri(uri,input=True,config={}):
except Exception as e: except Exception as e:
raise MalformedURIException('Malformed header argument in URI') raise MalformedURIException('Malformed header argument in URI')
else: else:
try:
header=config["obi"]["header"] header=config["obi"]["header"]
except KeyError:
header=False
if not isinstance(header, bool): if not isinstance(header, bool):
raise MalformedURIException('Malformed header argument in URI') raise MalformedURIException('Malformed header argument in URI')
if b"sep" in qualifiers: if b"sep" in qualifiers:
sep=qualifiers[b"sep"][0][0] sep=qualifiers[b"sep"][0][0]
else: else:
seq=config["obi"]["sep"] try:
sep=config["obi"]["sep"]
except KeyError:
sep=None
# if b"quote" in qualifiers: # if b"quote" in qualifiers:
# pass # pass
@ -261,12 +290,18 @@ def open_uri(uri,input=True,config={}):
if b"dec" in qualifiers: if b"dec" in qualifiers:
dec=qualifiers[b"dec"][0][0] dec=qualifiers[b"dec"][0][0]
else: else:
try:
dec=config["obi"]["dec"] dec=config["obi"]["dec"]
except KeyError:
dec=b"."
if b"nastring" in qualifiers: if b"nastring" in qualifiers:
nastring=qualifiers[b"nastring"][0] nastring=qualifiers[b"nastring"][0]
else: else:
try:
nastring=config["obi"]["nastring"] nastring=config["obi"]["nastring"]
except KeyError:
nastring=b'NA'
if b"stripwhite" in qualifiers: if b"stripwhite" in qualifiers:
try: try:
@ -274,7 +309,10 @@ def open_uri(uri,input=True,config={}):
except Exception as e: except Exception as e:
raise MalformedURIException('Malformed stripwhite argument in URI') raise MalformedURIException('Malformed stripwhite argument in URI')
else: else:
try:
stripwhite=config["obi"]["stripwhite"] stripwhite=config["obi"]["stripwhite"]
except KeyError:
stripwhite=True
if not isinstance(stripwhite, bool): if not isinstance(stripwhite, bool):
raise MalformedURIException('Malformed stripwhite argument in URI') raise MalformedURIException('Malformed stripwhite argument in URI')
@ -284,14 +322,20 @@ def open_uri(uri,input=True,config={}):
except Exception as e: except Exception as e:
raise MalformedURIException('Malformed blanklineskip argument in URI') raise MalformedURIException('Malformed blanklineskip argument in URI')
else: else:
try:
blanklineskip=config["obi"]["blanklineskip"] blanklineskip=config["obi"]["blanklineskip"]
except KeyError:
blanklineskip=True
if not isinstance(blanklineskip, bool): if not isinstance(blanklineskip, bool):
raise MalformedURIException('Malformed blanklineskip argument in URI') raise MalformedURIException('Malformed blanklineskip argument in URI')
if b"commentchar" in qualifiers: if b"commentchar" in qualifiers:
commentchar=qualifiers[b"commentchar"][0][0] commentchar=qualifiers[b"commentchar"][0][0]
else: else:
try:
commentchar=config["obi"]["commentchar"] commentchar=config["obi"]["commentchar"]
except KeyError:
commentchar=b'#'
if format is not None: if format is not None:
if qualifiers[b"seqtype"]==b"nuc": if qualifiers[b"seqtype"]==b"nuc":