Patch decoding of URL

This commit is contained in:
2017-07-28 12:41:28 +02:00
parent 84bb93096f
commit b9c65a871f
9 changed files with 276 additions and 260 deletions

View File

@ -22,6 +22,7 @@ default_config = { 'software' : "The OBITools",
'loglevel' : 'INFO',
'progress' : True,
'inputURI' : None,
'outputURI' : None,
'defaultdms' : None,
'inputview' : None,
'outputview' : None,

View File

@ -101,3 +101,14 @@ cpdef dict getConfiguration(str root_config_name="__default__",
config['__done__']=True
return config
def logger(level, *messages):
    """Log *messages* at *level* through the logger stored in the configuration.

    The configuration returned by ``getConfiguration()`` is read to find the
    root section (``config["__root_config__"]``); that section's ``'logger'``
    object is used only when its ``'verbose'`` entry is true, otherwise the
    messages are dropped.

    Parameters:
        level: name of the logger method to call (e.g. ``'info'``).
        *messages: positional arguments forwarded unchanged to that method.

    If anything goes wrong while reaching or calling the logger (missing
    configuration key, unknown level, ...), the messages are printed to
    standard error instead, so that logging never breaks the caller.
    """
    try:
        config = getConfiguration()
        root = config["__root_config__"]
        root_config = config[root]
        log = root_config['logger']
        if root_config['verbose']:
            getattr(log, level)(*messages)
    except Exception:
        # Best-effort fallback.  Only ``Exception`` is caught (not a bare
        # ``except:``) so that KeyboardInterrupt and SystemExit still
        # propagate instead of being turned into a stderr message.
        print(*messages, file=sys.stderr)

View File

@ -42,5 +42,7 @@ cpdef getLogger(dict config):
rootlogger.setLevel(loglevel)
config[root]['logger']=rootlogger
config[root]['verbose']=True
return rootlogger

View File

@ -2,8 +2,8 @@ def __addInputOption(optionManager):
optionManager.add_argument(
dest='obi:inputURI',
metavar='index',
help='index root filename (produced by the oa index command)')
metavar='INPUT',
help='Data source URI')
group = optionManager.add_argument_group("Restriction to a sub-part options",
@ -23,6 +23,11 @@ def __addInputOption(optionManager):
type=int,
help="treat only N sequences")
group.add_argument('--na-string',
action="store", dest="obi:nastring",
default=b"NA",
type=bytes,
help="String associated to Non Available (NA) values")
def __addSequenceInputOption(optionManager):
@ -124,12 +129,6 @@ def __addTabularInputOption(optionManager):
type=bytes,
help="Decimal separator")
group.add_argument('--na-string',
action="store", dest="obi:nastring",
default=b"NA",
type=bytes,
help="String associated to Non Available (NA) values")
group.add_argument('--strip-white',
action="store_false", dest="obi:stripwhite",
default=True,
@ -161,3 +160,14 @@ def addAllInputOption(optionManager):
__addInputOption(optionManager)
__addSequenceInputOption(optionManager)
__addTabularInputOption(optionManager)
def __addOutputOption(optionManager):
    """Register the output URI argument on *optionManager*.

    The argument is declared without any option flag and its value is
    stored under the ``obi:outputURI`` destination.
    """
    output_spec = dict(dest='obi:outputURI',
                       metavar='OUTPUT',
                       help='Data destination URI')
    optionManager.add_argument(**output_spec)


def addMinimalOutputOption(optionManager):
    """Add the minimal output options (only the output URI) to *optionManager*."""
    __addOutputOption(optionManager)

View File

@ -2,6 +2,8 @@
# TODO cimport generate errors with argument numbers, but without them some variables can't be declared
import sys
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.files.universalopener cimport uopen
from obitools3.parsers.fasta import fastaIterator
@ -20,6 +22,8 @@ from obitools3.dms.capi.obitypes cimport obitype_t, \
from obitools3.dms.capi.obierrno cimport obi_errno
from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
from obitools3.uri.decode import open_uri
__title__="Imports sequences from different formats into a DMS"
@ -30,83 +34,14 @@ default_config = { 'destview' : None,
'skiperror' : False,
'seqinformat' : None,
'moltype' : 'nuc',
'filename' : None
'source' : None
}
def addOptions(parser):
parser.add_argument(dest='import:filename',
metavar='<FILENAME>',
nargs='?',
default=None,
help='Name of the sequence file to import' )
group=parser.add_argument_group('obi import specific options')
addSequenceInputOption(parser)
addMinimalOutputOption(parser)
group.add_argument('--default-dms','-d',
action="store", dest="obi:defaultdms",
metavar='<DMS NAME>',
default=None,
type=str,
help="Name of the default DMS for reading and writing data")
group.add_argument('--destination-view','-v',
action="store", dest="import:destview",
metavar='<VIEW NAME>',
default=None,
type=str,
required=True,
help="Name of the default DMS for reading and writing data")
group.add_argument('--skip',
action="store", dest="import:skip",
metavar='<N>',
default=0,
type=int,
help="Skip the N first sequences")
group.add_argument('--only',
action="store", dest="import:only",
metavar='<N>',
default=None,
type=int,
help="Treat only N sequences")
group.add_argument('--skip-on-error',
action="store_true", dest="import:skiperror",
default=None,
help="Skip sequence entries with parse error")
group.add_argument('--fasta',
action="store_const", dest="import:seqinformat",
default=None,
const='fasta',
help="Input file is in fasta nucleic format (including obitools fasta extentions)")
group.add_argument('--fastq',
action="store_const", dest="import:seqinformat",
default=None,
const='fastq',
help="Input file is in sanger fastq nucleic format (standard fastq)")
group.add_argument('--nuc',
action="store_const", dest="import:moltype",
default=None,
const='nuc',
help="Input file contains nucleic sequences")
group.add_argument('--prot',
action="store_const", dest="import:moltype",
default=None,
const='pep',
help="Input file contains protein sequences")
group.add_argument('--NA',
action="store", dest="import:NA",
metavar='<NA_value>',
default='NA',
type=str,
help="Character string for Not Available values in the input file "
"(default: 'NA'")
def run(config):
@ -142,147 +77,159 @@ def run(config):
cdef ProgressBar pb
global obi_errno
pb = ProgressBar(1000000, config, seconde=5) # TODO should be number of records in file
inputs = uopen(config['import']['filename'])
logger=config['obi']['logger']
# Create or open DMS
d = DMS.open_or_new(config['obi']['defaultdms'])
get_quality = False
NUC_SEQS_view = False
if config['import']['seqinformat']=='fasta':
get_quality = False
NUC_SEQS_view = True
iseq = fastaIterator(inputs, skip=config['import']['skip'])
view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
elif config['import']['seqinformat']=='fastq':
get_quality = True
NUC_SEQS_view = True
iseq = fastqIterator(inputs, skip=config['import']['skip'])
view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
else:
raise RuntimeError('File format not handled')
logger.info("obi import : imports file into an DMS")
# Save basic columns in variables for optimization
if NUC_SEQS_view :
id_col = view["ID"]
def_col = view["DEFINITION"]
seq_col = view["NUC_SEQ"]
if get_quality :
qual_col = view["QUALITY"]
inputs = open_uri(config['obi']['inputURI'])
dcols = {}
print(inputs)
i = 0
for seq in iseq :
if i == config['import']['only'] :
break
else :
pb(i)
if NUC_SEQS_view :
id_col[i] = seq['id']
def_col[i] = seq['definition']
seq_col[i] = seq['sequence']
if get_quality :
qual_col[i] = seq['quality']
sys.exit()
for tag in seq['tags'] :
value = seq['tags'][tag]
# Check NA value
if value == config['import']['NA'] :
value = None
if tag not in dcols :
value_type = type(value)
nb_elts = 1
value_obitype = OBI_VOID
if value_type == dict or value_type == list :
nb_elts = len(value)
elt_names = list(value)
else :
nb_elts = 1
elt_names = None
value_obitype = get_obitype(value)
if value_obitype != OBI_VOID :
dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
# Fill value
dcols[tag][0][i] = value
# TODO else log error?
else :
rewrite = False
# Check type adequation
old_type = dcols[tag][1]
new_type = OBI_VOID
new_type = update_obitype(old_type, value)
if old_type != new_type :
rewrite = True
try:
# Fill value
dcols[tag][0][i] = value
except IndexError :
value_type = type(value)
old_column = dcols[tag][0]
old_nb_elements_per_line = old_column.nb_elements_per_line
new_nb_elements_per_line = 0
old_elements_names = old_column.elements_names
new_elements_names = None
#####################################################################
# Check the length and keys of column lines if needed
if value_type == dict : # Check dictionary keys
for k in value :
if k not in old_elements_names :
new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
rewrite = True
break
elif value_type == list or value_type == tuple : # Check vector length
if old_nb_elements_per_line < len(value) :
new_nb_elements_per_line = len(value)
rewrite = True
#####################################################################
if rewrite :
if new_nb_elements_per_line == 0 and new_elements_names is not None :
new_nb_elements_per_line = len(new_elements_names)
# Reset obierrno
obi_errno = 0
dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
new_data_type=new_type,
new_nb_elements_per_line=new_nb_elements_per_line,
new_elements_names=new_elements_names),
value_obitype)
# Update the dictionary:
for t in dcols :
dcols[t] = (view[t], dcols[t][1])
# Fill value
dcols[tag][0][i] = value
i+=1
print("\n")
print(view.__repr__())
d.close()
# pb = ProgressBar(1000000, config, seconde=5) # TODO should be number of records in file
#
# inputs = uopen(config['import']['filename'])
#
# # Create or open DMS
# d = DMS.open_or_new(config['obi']['defaultdms'])
#
# get_quality = False
# NUC_SEQS_view = False
# if config['import']['seqinformat']=='fasta':
# get_quality = False
# NUC_SEQS_view = True
# iseq = fastaIterator(inputs, skip=config['import']['skip'])
# view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
# elif config['import']['seqinformat']=='fastq':
# get_quality = True
# NUC_SEQS_view = True
# iseq = fastqIterator(inputs, skip=config['import']['skip'])
# view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality)
# else:
# raise RuntimeError('File format not handled')
#
# # Save basic columns in variables for optimization
# if NUC_SEQS_view :
# id_col = view["ID"]
# def_col = view["DEFINITION"]
# seq_col = view["NUC_SEQ"]
# if get_quality :
# qual_col = view["QUALITY"]
#
# dcols = {}
#
# i = 0
# for seq in iseq :
# if i == config['import']['only'] :
# break
# else :
# pb(i)
# if NUC_SEQS_view :
# id_col[i] = seq['id']
# def_col[i] = seq['definition']
# seq_col[i] = seq['sequence']
# if get_quality :
# qual_col[i] = seq['quality']
#
# for tag in seq['tags'] :
#
# value = seq['tags'][tag]
#
# # Check NA value
# if value == config['import']['NA'] :
# value = None
#
# if tag not in dcols :
#
# value_type = type(value)
# nb_elts = 1
# value_obitype = OBI_VOID
#
# if value_type == dict or value_type == list :
# nb_elts = len(value)
# elt_names = list(value)
# else :
# nb_elts = 1
# elt_names = None
#
# value_obitype = get_obitype(value)
#
# if value_obitype != OBI_VOID :
# dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype)
#
# # Fill value
# dcols[tag][0][i] = value
#
# # TODO else log error?
#
# else :
#
# rewrite = False
#
# # Check type adequation
# old_type = dcols[tag][1]
# new_type = OBI_VOID
# new_type = update_obitype(old_type, value)
# if old_type != new_type :
# rewrite = True
#
# try:
# # Fill value
# dcols[tag][0][i] = value
#
# except IndexError :
#
# value_type = type(value)
# old_column = dcols[tag][0]
# old_nb_elements_per_line = old_column.nb_elements_per_line
# new_nb_elements_per_line = 0
# old_elements_names = old_column.elements_names
# new_elements_names = None
#
# #####################################################################
#
# # Check the length and keys of column lines if needed
# if value_type == dict : # Check dictionary keys
# for k in value :
# if k not in old_elements_names :
# new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value]))
# rewrite = True
# break
#
# elif value_type == list or value_type == tuple : # Check vector length
# if old_nb_elements_per_line < len(value) :
# new_nb_elements_per_line = len(value)
# rewrite = True
#
# #####################################################################
#
# if rewrite :
# if new_nb_elements_per_line == 0 and new_elements_names is not None :
# new_nb_elements_per_line = len(new_elements_names)
#
# # Reset obierrno
# obi_errno = 0
#
# dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name,
# new_data_type=new_type,
# new_nb_elements_per_line=new_nb_elements_per_line,
# new_elements_names=new_elements_names),
# value_obitype)
#
# # Update the dictionary:
# for t in dcols :
# dcols[t] = (view[t], dcols[t][1])
#
# # Fill value
# dcols[tag][0][i] = value
#
# i+=1
#
# print("\n")
# print(view.__repr__())
#
# d.close()

View File

@ -47,7 +47,7 @@ def entryIteratorFactory(lineiterator,
first=next(i)
format="tab"
format=b"tabular"
if first[0]==">":
format=b"fasta"
@ -61,9 +61,6 @@ def entryIteratorFactory(lineiterator,
format=b"ecopcrfile"
elif is_ngsfilter_line(first):
format=b"ngsfilter"
else:
format=b"tabular"
if format==b'fasta':
if seqtype == b'nuc':

View File

@ -4,3 +4,7 @@ from obitools3.dms.dms cimport DMS
from obitools3.dms.view.view cimport View
from obitools3.dms.column.column cimport Column
from obitools3.dms.taxo.taxo cimport Taxonomy
from obitools3.utils cimport tobytes, tostr
from obitools3.files.universalopener cimport uopen

View File

@ -3,16 +3,14 @@
from urllib.parse import urlparse, urlunparse, parse_qs, ParseResultBytes
from os.path import isdir, isfile, basename, join
from obitools3.utils import tobytes
from obitools3.dms.dms import DMS
from obitools3.files.universalopener import uopen
from obitools3.parsers.fasta import fastaNucIterator
from obitools3.parsers.fastq import fastqIterator
from obitools3.parsers.universal import entryIteratorFactory
from obitools3.dms.obiseq import Nuc_Seq
from obitools3.apps.config import getConfiguration,logger
class MalformedURIException(RuntimeError):
    """Raised when a URI, or one of its qualifiers, cannot be interpreted."""
@ -130,21 +128,29 @@ def open_dms_element(DMS dms, bytes path):
return (dms,subsubpart)
def open_uri(uri,input=True,config={}):
def open_uri(uri,bint input=True):
cdef bytes urib = tobytes(uri)
cdef bytes scheme
cdef tuple dms
cdef dict qualifiers
cdef DMS default_dms
config = getConfiguration()
urip = urlparse(urib)
default_dms=config["obi"]["defaultdms"]
if 'obi' not in config:
config['obi']={}
try:
default_dms=config["obi"]["defaultdms"]
except KeyError:
default_dms=None
scheme = urip.scheme
error = None
if scheme==b"" :
scheme=b'file'
dms = open_dms(urip.path)
if dms is None and default_dms is not None:
dms=(default_dms,urip.path)
@ -168,16 +174,12 @@ def open_uri(uri,input=True,config={}):
except Exception as e:
error=e
urip = ParseResultBytes(scheme=scheme,
netloc=urip.netloc,
path=urip.path,
params=urip.params,
query=urip.query,
fragment=urip.fragment)
uri=urlunparse(urip)
if not urip.scheme:
urib=b"file:"+urib
try:
file = uopen(uri)
logger('info','Trying to open file : %s', tostr(urib))
file = uopen(tostr(urib))
except Exception as e:
file = None
error=e
@ -189,17 +191,26 @@ def open_uri(uri,input=True,config={}):
if b'format' in qualifiers:
format = qualifiers[b'format'][0]
else:
format=config["obi"]["fileformat"]
try:
format=config["obi"]["fileformat"]
except KeyError:
format=None
if b'seqtype' in qualifiers:
seqtype=qualifiers[b'seqtype'][0]
else:
seqtype=config["obi"]["seqtype"]
try:
seqtype=config["obi"]["seqtype"]
except KeyError:
seqtype=b'nuc'
if b'skip' in qualifiers:
skip=int(qualifiers[b"skip"][0])
else:
skip=config["obi"]["skeep"]
try:
skip=config["obi"]["skip"]
except KeyError:
skip=0
if skip < 0:
raise MalformedURIException('Malformed skip argument in URI')
@ -207,8 +218,11 @@ def open_uri(uri,input=True,config={}):
if b'only' in qualifiers:
only=int(qualifiers[b"only"][0])
else:
only=config["obi"]["only"]
if only <= 0:
try:
only=config["obi"]["only"]
except KeyError:
only=None
if only is not None and only <= 0:
raise MalformedURIException('Malformed only argument in URI')
@ -218,7 +232,10 @@ def open_uri(uri,input=True,config={}):
except Exception as e:
raise MalformedURIException('Malformed skiperror argument in URI')
else:
skiperror=config["obi"]["skiperror"]
try:
skiperror=config["obi"]["skiperror"]
except KeyError:
skiperror=True
if not isinstance(skiperror, bool):
raise MalformedURIException('Malformed skiperror argument in URI')
@ -228,7 +245,10 @@ def open_uri(uri,input=True,config={}):
except Exception as e:
raise MalformedURIException('Malformed noquality argument in URI')
else:
noquality=config["obi"]["noquality"]
try:
noquality=config["obi"]["noquality"]
except KeyError:
noquality=False
if not isinstance(noquality, bool):
raise MalformedURIException('Malformed noquality argument in URI')
@ -238,7 +258,10 @@ def open_uri(uri,input=True,config={}):
elif qualifiers[b"qualityformat"][0]=="solexa":
offset=64
else:
offset=config["obi"]["qualityoffset"]
try:
offset=config["obi"]["qualityoffset"]
except KeyError:
offset=33
if b"header" in qualifiers:
try:
@ -246,14 +269,20 @@ def open_uri(uri,input=True,config={}):
except Exception as e:
raise MalformedURIException('Malformed header argument in URI')
else:
header=config["obi"]["header"]
try:
header=config["obi"]["header"]
except KeyError:
header=False
if not isinstance(header, bool):
raise MalformedURIException('Malformed header argument in URI')
if b"sep" in qualifiers:
sep=qualifiers[b"sep"][0][0]
else:
seq=config["obi"]["sep"]
try:
sep=config["obi"]["sep"]
except KeyError:
sep=None
# if b"quote" in qualifiers:
# pass
@ -261,12 +290,18 @@ def open_uri(uri,input=True,config={}):
if b"dec" in qualifiers:
dec=qualifiers[b"dec"][0][0]
else:
dec=config["obi"]["dec"]
try:
dec=config["obi"]["dec"]
except KeyError:
dec=b"."
if b"nastring" in qualifiers:
nastring=qualifiers[b"nastring"][0]
else:
nastring=config["obi"]["nastring"]
try:
nastring=config["obi"]["nastring"]
except KeyError:
nastring=b'NA'
if b"stripwhite" in qualifiers:
try:
@ -274,7 +309,10 @@ def open_uri(uri,input=True,config={}):
except Exception as e:
raise MalformedURIException('Malformed stripwhite argument in URI')
else:
stripwhite=config["obi"]["stripwhite"]
try:
stripwhite=config["obi"]["stripwhite"]
except KeyError:
stripwhite=True
if not isinstance(stripwhite, bool):
raise MalformedURIException('Malformed stripwhite argument in URI')
@ -284,14 +322,20 @@ def open_uri(uri,input=True,config={}):
except Exception as e:
raise MalformedURIException('Malformed blanklineskip argument in URI')
else:
blanklineskip=config["obi"]["blanklineskip"]
try:
blanklineskip=config["obi"]["blanklineskip"]
except KeyError:
blanklineskip=True
if not isinstance(blanklineskip, bool):
raise MalformedURIException('Malformed blanklineskip argument in URI')
if b"commentchar" in qualifiers:
commentchar=qualifiers[b"commentchar"][0][0]
else:
commentchar=config["obi"]["commentchar"]
try:
commentchar=config["obi"]["commentchar"]
except KeyError:
commentchar=b'#'
if format is not None:
if qualifiers[b"seqtype"]==b"nuc":