From 135d3b6e6763a288de58d30f37a5df7e75b9839b Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Wed, 17 Oct 2018 11:21:29 +0200 Subject: [PATCH] Cython: updated the URI decoding to handle outputs other than DMS --- python/obitools3/uri/decode.pyx | 215 +++++++++++++++++++++----------- 1 file changed, 143 insertions(+), 72 deletions(-) diff --git a/python/obitools3/uri/decode.pyx b/python/obitools3/uri/decode.pyx index bc20c39..c23adf7 100644 --- a/python/obitools3/uri/decode.pyx +++ b/python/obitools3/uri/decode.pyx @@ -12,6 +12,11 @@ from obitools3.parsers.ngsfilter import ngsfilterIterator from obitools3.parsers.embl import emblIterator from obitools3.parsers.universal import entryIteratorFactory +from obitools3.writers.fasta import FastaNucWriter +from obitools3.writers.fastq import FastqWriter +from obitools3.format.fasta import FastaFormat +from obitools3.format.fastq import FastqFormat + from obitools3.dms.obiseq import Nuc_Seq from obitools3.apps.config import getConfiguration,logger from obitools3.apps.temp import get_temp_dms @@ -56,7 +61,9 @@ cdef open_dms(bytes path, bint create=False): pos=pos+1 return None -def open_dms_element(DMS dms, bytes path, + +def open_dms_element(DMS dms, + bytes path, bint create=False, type newviewtype=View): """ @@ -139,12 +146,21 @@ def open_dms_element(DMS dms, bytes path, if len(path_parts) > 4: raise MalformedURIException('Malformed View URI') - return (dms,subsubpart) + return (dms, subsubpart) +''' +#TODO discuss returned object. Return a dict? or some class instance? +Reads an URI and returns a tuple containing: + (1) The opened file or DMS, or the URI itself if nothing could be opened by the function + (2) The opened view or iterator on the opened file or writer + (3) The class of object returned or handled by (2) + (4) The original URI in bytes +''' def open_uri(uri, bint input=True, type newviewtype=View): + cdef bytes urib = tobytes(uri) cdef bytes scheme cdef tuple dms @@ -153,7 +169,7 @@ def open_uri(uri, config = getConfiguration() urip = urlparse(urib) - + if 'obi' not in config: config['obi']={} @@ -166,22 +182,26 @@ def open_uri(uri, create=(not input) and (not config["obi"]["nocreatedms"]) except KeyError: create=not input - + scheme = urip.scheme error = None - if scheme==b"" or scheme==b"dms" : - dms = open_dms(urip.path,create) + if scheme==b"dms" or \ + (scheme==b"" and \ + (((not input) and "outputformat" not in config["obi"]) or \ + (input and "inputformat" not in config["obi"]))): # TODO maybe not best way + + dms = open_dms(urip.path, create) if dms is None and default_dms is not None: dms=(default_dms, urip.path) if dms is not None: try: - resource=open_dms_element(dms[0],dms[1], + resource=open_dms_element(dms[0], + dms[1], create, - newviewtype - ) + newviewtype) scheme=b"dms" urip = ParseResultBytes(scheme=b"dms", @@ -200,32 +220,42 @@ def open_uri(uri, urlunparse(urip)) except Exception as e: error=e - + if scheme==b"dms" : logger('Error','cannot open DMS: %s', uri) raise FileNotFoundError('uri') - - #if not urip.scheme: # TODO not sure what it was supposed to do but not working as intended - # urib=b"file:"+urib + + if not urip.scheme: + urib=b"file:"+urib - try: - file = uopen(urib) - logger('info','Opened file: %s', tostr(urib)) - except Exception as e: # TODO discuss: if can't open file, return the character string itself - file = urib - iseq = urib - objclass = bytes + if input: + try: + file = uopen(urip.path, mode='rb') + logger('info','Opened file: %s', urip.path) + except Exception as e: # TODO discuss: if can't open file, return the character string itself + file = tobytes(uri) + iseq = urib + objclass = bytes + else: # TODO update uopen to be able to write? + file = open(urip.path, 'wb') if file is not None: qualifiers=parse_qs(urip.query) - if b'format' in qualifiers: + if input and b'format' in qualifiers: format = qualifiers[b'format'][0] - else: - try: - format=config["obi"]["format"] - except KeyError: - format=None + else: # TODO discuss priorities + if urip.scheme: + format = urip.scheme + else: + try: + if input: + formatkey = "inputformat" + else: + formatkey = "outputformat" + format=config["obi"][formatkey] + except KeyError: + format=None if b'seqtype' in qualifiers: seqtype=qualifiers[b'seqtype'][0] @@ -248,7 +278,6 @@ def open_uri(uri, if skip < 0: raise MalformedURIException('Malformed skip argument in URI') - if b'only' in qualifiers: only=int(qualifiers[b"only"][0]) else: @@ -259,7 +288,6 @@ def open_uri(uri, if only is not None and only <= 0: raise MalformedURIException('Malformed only argument in URI') - if b"skiperror" in qualifiers: try: skiperror=eval(qualifiers[b"skiperror"][0]) @@ -332,12 +360,27 @@ def open_uri(uri, dec=tobytes(config["obi"]["dec"]) except KeyError: dec=b"." + + if b"printna" in qualifiers: + try: + printna=eval(qualifiers[b"printna"][0]) + except Exception as e: + raise MalformedURIException("Malformed 'print NA' argument in URI") + else: + try: + printna=config["obi"]["printna"] + except KeyError: + printna=False if b"nastring" in qualifiers: nastring=tobytes(qualifiers[b"nastring"][0]) else: try: - nastring=tobytes(config["obi"]["nastring"]) + if input: + nakey = "inputnastring" + else: + nakey = "outputnastring" + nastring=tobytes(config["obi"][nakey]) except KeyError: nastring=b'NA' @@ -377,63 +420,91 @@ def open_uri(uri, if format is not None: if seqtype==b"nuc": - objclass = Nuc_Seq + objclass = Nuc_Seq # Nuc_Seq_Stored? TODO if format==b"fasta": - iseq = fastaNucIterator(file, + if input: + iseq = fastaNucIterator(file, + skip=skip, + only=only) + else: + iseq = FastaNucWriter(FastaFormat(printNAKeys=printna, NAString=nastring), + file, + skip=skip, + only=only) + elif format==b"fastq": + if input: + iseq = fastqIterator(file, + skip=skip, + only=only, + offset=offset, + noquality=noquality) + else: + iseq = FastqWriter(FastqFormat(printNAKeys=printna, NAString=nastring), + file, + skip=skip, + only=only) + elif format==b"embl": + if input: + iseq = emblIterator(file, skip=skip, only=only) - elif format==b"fastq": - iseq = fastqIterator(file, - skip=skip, - only=only, - offset=offset, - noquality=noquality) - elif format==b"embl": - iseq = emblIterator(file, - skip=skip, - only=only) + else: + raise NotImplementedError('Output sequence file format not implemented') else: raise NotImplementedError('Sequence file format not implemented') elif seqtype==b"prot": raise NotImplementedError() elif format==b"tabular": objclass = dict - iseq = tabIterator(file, - header = header, - sep = sep, - dec = dec, - stripwhite = stripwhite, - blanklineskip = blanklineskip, - commentchar = commentchar, - skip = skip, - only = only) + if input: + iseq = tabIterator(file, + header = header, + sep = sep, + dec = dec, + stripwhite = stripwhite, + blanklineskip = blanklineskip, + commentchar = commentchar, + skip = skip, + only = only) + else: + raise NotImplementedError('Output sequence file format not implemented') elif format==b"ngsfilter": objclass = dict - iseq = ngsfilterIterator(file, - sep = sep, - dec = dec, - stripwhite = stripwhite, - blanklineskip = blanklineskip, - commentchar = commentchar, - skip = skip, - only = only) + if input: + iseq = ngsfilterIterator(file, + sep = sep, + dec = dec, + stripwhite = stripwhite, + blanklineskip = blanklineskip, + commentchar = commentchar, + skip = skip, + only = only) + else: + raise NotImplementedError('Output sequence file format not implemented') else: - iseq, objclass = entryIteratorFactory(file, - skip, only, - seqtype, - offset, - noquality, - skiperror, - header, - sep, - dec, - nastring, - stripwhite, - blanklineskip, - commentchar) + if input: + iseq, objclass = entryIteratorFactory(file, + skip, only, + seqtype, + offset, + noquality, + skiperror, + header, + sep, + dec, + nastring, + stripwhite, + blanklineskip, + commentchar) + else: # default export is in fasta? or tab? TODO + objclass = Nuc_Seq # Nuc_Seq_Stored? TODO + iseq = FastaNucWriter(FastaFormat(printNAKeys=printna, NAString=nastring), + file, + skip=skip, + only=only) #tmpdms = get_temp_dms() - + return (file, iseq, objclass, urib)