Cython: updated the URI decoding to handle outputs other than DMS

This commit is contained in:
Celine Mercier
2018-10-17 11:21:29 +02:00
parent 58589e04be
commit 135d3b6e67

View File

@ -12,6 +12,11 @@ from obitools3.parsers.ngsfilter import ngsfilterIterator
from obitools3.parsers.embl import emblIterator
from obitools3.parsers.universal import entryIteratorFactory
from obitools3.writers.fasta import FastaNucWriter
from obitools3.writers.fastq import FastqWriter
from obitools3.format.fasta import FastaFormat
from obitools3.format.fastq import FastqFormat
from obitools3.dms.obiseq import Nuc_Seq
from obitools3.apps.config import getConfiguration,logger
from obitools3.apps.temp import get_temp_dms
@ -56,7 +61,9 @@ cdef open_dms(bytes path, bint create=False):
pos=pos+1
return None
def open_dms_element(DMS dms, bytes path,
def open_dms_element(DMS dms,
bytes path,
bint create=False,
type newviewtype=View):
"""
@ -139,12 +146,21 @@ def open_dms_element(DMS dms, bytes path,
if len(path_parts) > 4:
raise MalformedURIException('Malformed View URI')
return (dms,subsubpart)
return (dms, subsubpart)
'''
#TODO discuss returned object. Return a dict? or some class instance?
Reads a URI and returns a tuple containing:
(1) The opened file or DMS, or the URI itself if nothing could be opened by the function
(2) The opened view, or an iterator over the opened file (input), or a writer on it (output)
(3) The class of the objects returned or handled by (2)
(4) The original URI in bytes
'''
def open_uri(uri,
bint input=True,
type newviewtype=View):
cdef bytes urib = tobytes(uri)
cdef bytes scheme
cdef tuple dms
@ -153,7 +169,7 @@ def open_uri(uri,
config = getConfiguration()
urip = urlparse(urib)
if 'obi' not in config:
config['obi']={}
@ -166,22 +182,26 @@ def open_uri(uri,
create=(not input) and (not config["obi"]["nocreatedms"])
except KeyError:
create=not input
scheme = urip.scheme
error = None
if scheme==b"" or scheme==b"dms" :
dms = open_dms(urip.path,create)
if scheme==b"dms" or \
(scheme==b"" and \
(((not input) and "outputformat" not in config["obi"]) or \
(input and "inputformat" not in config["obi"]))): # TODO maybe not best way
dms = open_dms(urip.path, create)
if dms is None and default_dms is not None:
dms=(default_dms, urip.path)
if dms is not None:
try:
resource=open_dms_element(dms[0],dms[1],
resource=open_dms_element(dms[0],
dms[1],
create,
newviewtype
)
newviewtype)
scheme=b"dms"
urip = ParseResultBytes(scheme=b"dms",
@ -200,32 +220,42 @@ def open_uri(uri,
urlunparse(urip))
except Exception as e:
error=e
if scheme==b"dms" :
logger('Error','cannot open DMS: %s', uri)
raise FileNotFoundError('uri')
#if not urip.scheme: # TODO not sure what it was supposed to do but not working as intended
# urib=b"file:"+urib
if not urip.scheme:
urib=b"file:"+urib
try:
file = uopen(urib)
logger('info','Opened file: %s', tostr(urib))
except Exception as e: # TODO discuss: if can't open file, return the character string itself
file = urib
iseq = urib
objclass = bytes
if input:
try:
file = uopen(urip.path, mode='rb')
logger('info','Opened file: %s', urip.path)
except Exception as e: # TODO discuss: if can't open file, return the character string itself
file = tobytes(uri)
iseq = urib
objclass = bytes
else: # TODO update uopen to be able to write?
file = open(urip.path, 'wb')
if file is not None:
qualifiers=parse_qs(urip.query)
if b'format' in qualifiers:
if input and b'format' in qualifiers:
format = qualifiers[b'format'][0]
else:
try:
format=config["obi"]["format"]
except KeyError:
format=None
else: # TODO discuss priorities
if urip.scheme:
format = urip.scheme
else:
try:
if input:
formatkey = "inputformat"
else:
formatkey = "outputformat"
format=config["obi"][formatkey]
except KeyError:
format=None
if b'seqtype' in qualifiers:
seqtype=qualifiers[b'seqtype'][0]
@ -248,7 +278,6 @@ def open_uri(uri,
if skip < 0:
raise MalformedURIException('Malformed skip argument in URI')
if b'only' in qualifiers:
only=int(qualifiers[b"only"][0])
else:
@ -259,7 +288,6 @@ def open_uri(uri,
if only is not None and only <= 0:
raise MalformedURIException('Malformed only argument in URI')
if b"skiperror" in qualifiers:
try:
skiperror=eval(qualifiers[b"skiperror"][0])
@ -332,12 +360,27 @@ def open_uri(uri,
dec=tobytes(config["obi"]["dec"])
except KeyError:
dec=b"."
if b"printna" in qualifiers:
try:
printna=eval(qualifiers[b"printna"][0])
except Exception as e:
raise MalformedURIException("Malformed 'print NA' argument in URI")
else:
try:
printna=config["obi"]["printna"]
except KeyError:
printna=False
if b"nastring" in qualifiers:
nastring=tobytes(qualifiers[b"nastring"][0])
else:
try:
nastring=tobytes(config["obi"]["nastring"])
if input:
nakey = "inputnastring"
else:
nakey = "outputnastring"
nastring=tobytes(config["obi"][nakey])
except KeyError:
nastring=b'NA'
@ -377,63 +420,91 @@ def open_uri(uri,
if format is not None:
if seqtype==b"nuc":
objclass = Nuc_Seq
objclass = Nuc_Seq # Nuc_Seq_Stored? TODO
if format==b"fasta":
iseq = fastaNucIterator(file,
if input:
iseq = fastaNucIterator(file,
skip=skip,
only=only)
else:
iseq = FastaNucWriter(FastaFormat(printNAKeys=printna, NAString=nastring),
file,
skip=skip,
only=only)
elif format==b"fastq":
if input:
iseq = fastqIterator(file,
skip=skip,
only=only,
offset=offset,
noquality=noquality)
else:
iseq = FastqWriter(FastqFormat(printNAKeys=printna, NAString=nastring),
file,
skip=skip,
only=only)
elif format==b"embl":
if input:
iseq = emblIterator(file,
skip=skip,
only=only)
elif format==b"fastq":
iseq = fastqIterator(file,
skip=skip,
only=only,
offset=offset,
noquality=noquality)
elif format==b"embl":
iseq = emblIterator(file,
skip=skip,
only=only)
else:
raise NotImplementedError('Output sequence file format not implemented')
else:
raise NotImplementedError('Sequence file format not implemented')
elif seqtype==b"prot":
raise NotImplementedError()
elif format==b"tabular":
objclass = dict
iseq = tabIterator(file,
header = header,
sep = sep,
dec = dec,
stripwhite = stripwhite,
blanklineskip = blanklineskip,
commentchar = commentchar,
skip = skip,
only = only)
if input:
iseq = tabIterator(file,
header = header,
sep = sep,
dec = dec,
stripwhite = stripwhite,
blanklineskip = blanklineskip,
commentchar = commentchar,
skip = skip,
only = only)
else:
raise NotImplementedError('Output sequence file format not implemented')
elif format==b"ngsfilter":
objclass = dict
iseq = ngsfilterIterator(file,
sep = sep,
dec = dec,
stripwhite = stripwhite,
blanklineskip = blanklineskip,
commentchar = commentchar,
skip = skip,
only = only)
if input:
iseq = ngsfilterIterator(file,
sep = sep,
dec = dec,
stripwhite = stripwhite,
blanklineskip = blanklineskip,
commentchar = commentchar,
skip = skip,
only = only)
else:
raise NotImplementedError('Output sequence file format not implemented')
else:
iseq, objclass = entryIteratorFactory(file,
skip, only,
seqtype,
offset,
noquality,
skiperror,
header,
sep,
dec,
nastring,
stripwhite,
blanklineskip,
commentchar)
if input:
iseq, objclass = entryIteratorFactory(file,
skip, only,
seqtype,
offset,
noquality,
skiperror,
header,
sep,
dec,
nastring,
stripwhite,
blanklineskip,
commentchar)
else: # default export is in fasta? or tab? TODO
objclass = Nuc_Seq # Nuc_Seq_Stored? TODO
iseq = FastaNucWriter(FastaFormat(printNAKeys=printna, NAString=nastring),
file,
skip=skip,
only=only)
#tmpdms = get_temp_dms()
return (file, iseq, objclass, urib)