From 15e43bb9a1bdfa0dc07053e5febd36a4661b085d Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Mon, 12 Mar 2018 18:10:43 +0100 Subject: [PATCH] Cython API: obi import can now import ngsfilter files and tabular files --- python/obi.py | 2 +- .../obitools3/apps/optiongroups/__init__.py | 24 +++++--- python/obitools3/commands/import.pyx | 38 ++++++------ python/obitools3/parsers/fasta.pyx | 33 +++++----- python/obitools3/parsers/fastq.pyx | 49 ++++++++------- python/obitools3/parsers/header.pxd | 1 - python/obitools3/parsers/header.pyx | 46 +------------- python/obitools3/parsers/universal.pyx | 56 +++++++++++++---- python/obitools3/uri/decode.pyx | 61 +++++++++++++------ 9 files changed, 168 insertions(+), 142 deletions(-) diff --git a/python/obi.py b/python/obi.py index de84b7d..361641e 100644 --- a/python/obi.py +++ b/python/obi.py @@ -31,7 +31,7 @@ default_config = { 'software' : "The OBITools", 'fileformat' : None, 'skiperror' : True, 'qualityformat' : b'sanger', - 'qualityoffset' : -1, + 'offset' : -1, 'noquality' : False, 'seqtype' : b'nuc', "header" : False, diff --git a/python/obitools3/apps/optiongroups/__init__.py b/python/obitools3/apps/optiongroups/__init__.py index 27f7b1b..d2263a6 100644 --- a/python/obitools3/apps/optiongroups/__init__.py +++ b/python/obitools3/apps/optiongroups/__init__.py @@ -25,8 +25,8 @@ def __addInputOption(optionManager): group.add_argument('--na-string', action="store", dest="obi:nastring", - default=b"NA", - type=bytes, + default="NA", + type=str, help="String associated to Non Available (NA) values") @@ -61,7 +61,7 @@ def __addSequenceInputOption(optionManager): action="store_const", dest="obi:format", default=None, const=b'ngsfilter', - help="Input file is a ngsfilter file") + help="Input file is an ngsfilter file") group.add_argument('--ecopcr-result', action="store_const", dest="obi:format", @@ -75,6 +75,12 @@ def __addSequenceInputOption(optionManager): const=b'ecoprimers', help="Input file is the result of an ecoprimers") + group.add_argument('--tabular', + action="store_const", dest="obi:format", + default=None, + const=b'tabular', + help="Input file is a tabular file") + group.add_argument('--skip-on-error', action="store_true", dest="obi:skiperror", default=False, @@ -120,13 +126,13 @@ def __addTabularInputOption(optionManager): group.add_argument('--sep', action="store", dest="obi:sep", default=None, - type=bytes, + type=str, help="Column separator") group.add_argument('--dec', action="store", dest="obi:dec", - default=b".", - type=bytes, + default=".", + type=str, help="Decimal separator") group.add_argument('--strip-white', @@ -141,8 +147,8 @@ def __addTabularInputOption(optionManager): group.add_argument('--comment-char', action="store", dest="obi:commentchar", - default=b"#", - type=bytes, + default="#", + type=str, help="Lines starting by this char are considered as comment") def __addTaxonomyInputOption(optionManager): @@ -171,7 +177,7 @@ def addSequenceInputOption(optionManager): __addSequenceInputOption(optionManager) def addTabularInputOption(optionManager): - __addInputOption(optionManager) + #__addInputOption(optionManager) # TODO discuss conflict __addTabularInputOption(optionManager) def addTaxonomyInputOption(optionManager): diff --git a/python/obitools3/commands/import.pyx b/python/obitools3/commands/import.pyx index 5f936ed..707fb31 100644 --- a/python/obitools3/commands/import.pyx +++ b/python/obitools3/commands/import.pyx @@ -1,13 +1,8 @@ #cython: language_level=3 -# TODO cimport generate errors with argument numbers, but without them some variables can't be declared - import sys from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport -from obitools3.files.universalopener cimport uopen -from obitools3.parsers.fasta import fastaIterator -from obitools3.parsers.fastq import fastqIterator from obitools3.dms.view.view cimport View from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS from obitools3.dms.column.column cimport Column @@ -24,7 +19,7 @@ from obitools3.dms.capi.obitypes cimport obitype_t, \ from obitools3.dms.capi.obierrno cimport obi_errno -from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption +from obitools3.apps.optiongroups import addSequenceInputOption, addTabularInputOption, addMinimalOutputOption from obitools3.uri.decode import open_uri @@ -45,6 +40,7 @@ default_config = { 'destview' : None, def addOptions(parser): addSequenceInputOption(parser) + addTabularInputOption(parser) addMinimalOutputOption(parser) # addTaxdumpInputOption(parser) @@ -63,8 +59,8 @@ def run(config): cdef int nb_elts cdef object d cdef View view - cdef object iseq - cdef object seq + cdef object entries + cdef object entry cdef Column id_col cdef Column def_col cdef Column seq_col @@ -108,9 +104,9 @@ def run(config): pb = ProgressBar(1000000, config, seconde=5) # TODO should be number of records in file - iseq = input[1] + entries = input[1] - NA_value = config['obi']['nastring'] + NA_value = tobytes(config['obi']['nastring']) # TODO NUC_SEQS_view = False if isinstance(output[1], View) : @@ -121,39 +117,39 @@ def run(config): raise NotImplementedError() # Save basic columns in variables for optimization - if NUC_SEQS_view : - id_col = view[b"ID"] + if NUC_SEQS_view : + id_col = view[b"ID"] # TODO use macros or globals for column names def_col = view[b"DEFINITION"] seq_col = view[b"NUC_SEQ"] dcols = {} i = 0 - for seq in iseq : - + for entry in entries : + pb(i) if NUC_SEQS_view : # Check if there is a sequencing quality associated # TODO if i == 0: - get_quality = b"QUALITY" in seq + get_quality = b"QUALITY" in entry if get_quality: Column.new_column(view, b"QUALITY", OBI_QUAL) qual_col = view[b"QUALITY"] - id_col[i] = seq.id - def_col[i] = seq.definition - seq_col[i] = seq.seq + id_col[i] = entry.id + def_col[i] = entry.definition + seq_col[i] = entry.seq if get_quality : - qual_col[i] = seq.quality + qual_col[i] = entry.quality - for tag in seq : + for tag in entry : if tag != b"ID" and tag != b"DEFINITION" and tag != b"NUC_SEQ" and tag != b"QUALITY" : # TODO hmmm... - value = seq[tag] + value = entry[tag] # Check NA value if value == NA_value : diff --git a/python/obitools3/parsers/fasta.pyx b/python/obitools3/parsers/fasta.pyx index c619d03..0a7b6d1 100644 --- a/python/obitools3/parsers/fasta.pyx +++ b/python/obitools3/parsers/fasta.pyx @@ -17,7 +17,6 @@ def fastaIterator(lineiterator, firstline=None, int buffersize=100000000 ): - cdef LineBuffer lb cdef str ident cdef str definition cdef dict tags @@ -31,23 +30,26 @@ def fastaIterator(lineiterator, else: ionly=int(only) - if isinstance(lineiterator,(str,bytes)): - lineiterator=uopen(lineiterator) - + if isinstance(lineiterator, (str, bytes)): + lineiterator=uopen(lineiterator) if isinstance(lineiterator, LineBuffer): - lb=lineiterator + iterator = iter(lineiterator) else: - lb=LineBuffer(lineiterator,buffersize) + if hasattr(lineiterator, "readlines"): + iterator = iter(LineBuffer(lineiterator, buffersize)) + elif hasattr(lineiterator, '__next__'): + iterator = lineiterator + else: + raise Exception("Invalid line iterator") skipped = 0 - i = iter(lb) + i = iterator if firstline is None: line = next(i) else: line = firstline - while True: if ionly >= 0 and read >= ionly: @@ -81,7 +83,7 @@ def fastaIterator(lineiterator, # definition, # tags=tags, # ) - # TODO + # TODO Seq object yield { "id" : ident, "definition" : definition, "sequence" : sequence, @@ -100,7 +102,6 @@ def fastaNucIterator(lineiterator, firstline=None, int buffersize=100000000 ): - cdef LineBuffer lb cdef str ident cdef str definition cdef dict tags @@ -115,14 +116,16 @@ def fastaNucIterator(lineiterator, ionly = int(only) if isinstance(lineiterator, (str, bytes)): - lineiterator=uopen(lineiterator) - - if isinstance(lineiterator, types.GeneratorType): - iterator = lineiterator + lineiterator=uopen(lineiterator) if isinstance(lineiterator, LineBuffer): iterator = iter(lineiterator) else: - iterator = iter(LineBuffer(lineiterator, buffersize)) + if hasattr(lineiterator, "readlines"): + iterator = iter(LineBuffer(lineiterator, buffersize)) + elif hasattr(lineiterator, '__next__'): + iterator = lineiterator + else: + raise Exception("Invalid line iterator") skipped = 0 read = 0 diff --git a/python/obitools3/parsers/fastq.pyx b/python/obitools3/parsers/fastq.pyx index 83ad7c4..af88b9b 100644 --- a/python/obitools3/parsers/fastq.pyx +++ b/python/obitools3/parsers/fastq.pyx @@ -12,7 +12,7 @@ from obitools3.dms.obiseq cimport Nuc_Seq def fastqIterator(lineiterator, int skip=0, only=None, - int qualityoffset=-1, + int offset=-1, bint noquality=False, firstline=None, int buffersize=100000000 @@ -25,14 +25,14 @@ def fastqIterator(lineiterator, else: return fastqWithQualityIterator(lineiterator, skip,only, - qualityoffset, + offset, firstline, buffersize) def fastqWithQualityIterator(lineiterator, int skip=0, only=None, - int qualityoffset=-1, + int offset=-1, firstline=None, int buffersize=100000000 ): @@ -49,21 +49,25 @@ def fastqWithQualityIterator(lineiterator, ionly=-1 else: ionly=int(only) - - if isinstance(lineiterator,(str,bytes)): - lineiterator=uopen(lineiterator) - + + if isinstance(lineiterator, (str, bytes)): + lineiterator=uopen(lineiterator) if isinstance(lineiterator, LineBuffer): - lb=lineiterator + iterator = iter(lineiterator) else: - lb=LineBuffer(lineiterator,buffersize) - - i = iter(lb) + if hasattr(lineiterator, "readlines"): + iterator = iter(LineBuffer(lineiterator, buffersize)) + elif hasattr(lineiterator, '__next__'): + iterator = lineiterator + else: + raise Exception("Invalid line iterator") + + i = iterator lines_to_skip = skip*4 - (firstline is not None) for skipped in range(lines_to_skip): next(i) - + if skip > 0: firstline=None @@ -88,7 +92,7 @@ def fastqWithQualityIterator(lineiterator, sequence, definition=definition, quality=quality, - offset=qualityoffset, + offset=offset, tags=tags) yield seq @@ -97,7 +101,7 @@ def fastqWithQualityIterator(lineiterator, # "definition" : definition, # "sequence" : sequence, # "quality" : quality, -# "offset" : qualityoffset, +# "offset" : offset, # "tags" : tags, # "annotation" : {} # } @@ -112,7 +116,6 @@ def fastqWithoutQualityIterator(lineiterator, firstline=None, int buffersize=100000000 ): - cdef LineBuffer lb cdef str ident cdef str definition cdef dict tags @@ -126,15 +129,19 @@ def fastqWithoutQualityIterator(lineiterator, else: ionly=int(only) - if isinstance(lineiterator,(str,bytes)): - lineiterator=uopen(lineiterator) - + if isinstance(lineiterator, (str, bytes)): + lineiterator=uopen(lineiterator) if isinstance(lineiterator, LineBuffer): - lb=lineiterator + iterator = iter(lineiterator) else: - lb=LineBuffer(lineiterator,buffersize) + if hasattr(lineiterator, "readlines"): + iterator = iter(LineBuffer(lineiterator, buffersize)) + elif hasattr(lineiterator, '__next__'): + iterator = lineiterator + else: + raise Exception("Invalid line iterator") - i = iter(lb) + i = iterator lines_to_skip = skip*4 - (firstline is not None) for skipped in range(lines_to_skip): diff --git a/python/obitools3/parsers/header.pxd b/python/obitools3/parsers/header.pxd index b09a418..ffcd3cf 100644 --- a/python/obitools3/parsers/header.pxd +++ b/python/obitools3/parsers/header.pxd @@ -1,5 +1,4 @@ #cython: language_level=3 -cdef object __etag__(str x) cpdef tuple parseHeader(str header) diff --git a/python/obitools3/parsers/header.pyx b/python/obitools3/parsers/header.pyx index e54963d..67fd0a6 100644 --- a/python/obitools3/parsers/header.pyx +++ b/python/obitools3/parsers/header.pyx @@ -6,54 +6,12 @@ Created on 25 mars 2016 @author: coissac ''' +from obitools3.utils cimport __etag__ import re -__ret__ = re.compile('''(([^ ]+)=('[^']*'|"[^"]*"|[^;]+); *)+?''') -__re_int__ = re.compile("^[+-]?[0-9]+$") -__re_float__ = re.compile("^[+-]?[0-9]+(\.[0-9]*)?([eE][+-]?[0-9]+)?$") -__re_str__ = re.compile("""^"[^"]*"|'[^']*'$""") -__re_dict__ = re.compile("""^\{\ * - ( - ("[^"]*"|'[^']*') - \ *:\ * - ([^,}]+| - "[^"]*"| - '[^']*' - ) - )? - (\ *,\ * - ("[^"]*"|'[^']*') - \ *:\ * - ([^,}]+| - "[^"]*"| - '[^']*' - ) - )*\ *\}$""", re.VERBOSE) -__re_val__ = re.compile("""(("[^"]*"|'[^']*') *: *([^,}]+|"[^"]*"|'[^']*') *[,}] *)""") +__ret__ = re.compile('''(([^ ]+)=('[^']*'|"[^"]*"|[^;]+); *)+?''') -cdef object __etag__(str x): - cdef list elements - cdef tuple i - - if __re_int__.match(x): - v=int(x) - elif __re_float__.match(x): - v=float(x) - elif __re_str__.match(x): - v=x[1:-1] - elif x=='None': - v=None - elif x=='False': - v=False - elif x=='True': - v=True - elif __re_dict__.match(x): - elements=__re_val__.findall(x) - v=dict([(i[1][1:-1],__etag__(i[2])) for i in elements]) - else: - v=x - return v cpdef tuple parseHeader(str header): cdef list m diff --git a/python/obitools3/parsers/universal.pyx b/python/obitools3/parsers/universal.pyx index feb024f..10f7c37 100644 --- a/python/obitools3/parsers/universal.pyx +++ b/python/obitools3/parsers/universal.pyx @@ -3,12 +3,14 @@ import re from obitools3.parsers.fasta import fastaNucIterator from obitools3.parsers.fastq import fastqIterator +from obitools3.parsers.tab import tabIterator +from obitools3.parsers.ngsfilter import ngsfilterIterator oligore = re.compile("^[ACGTRYSWKMBDHVN]+$",re.I) tagre = re.compile("^([ACGTRYSWKMBDHVN]+|-)(:([ACGTRYSWKMBDHVN]+)|-)?$",re.I) -def is_ngsfilter_line(line): +def is_ngsfilter_line(line): # TODO doesn't work? try: parts = line.split() ok = tagre.match(parts[2]) @@ -23,7 +25,7 @@ def entryIteratorFactory(lineiterator, int skip=0, only=None, bytes seqtype=b'nuc', - int qualityoffset=-1, + int offset=-1, bint noquality=False, bint skiperror=True, bint header=False, @@ -35,15 +37,19 @@ def entryIteratorFactory(lineiterator, bytes commentchar=b"#", int buffersize=100000000): - if isinstance(lineiterator,(str,bytes)): - lineiterator=uopen(lineiterator) - + if isinstance(lineiterator, (str, bytes)): + lineiterator=uopen(lineiterator) if isinstance(lineiterator, LineBuffer): - lb=lineiterator + iterator = iter(lineiterator) else: - lb=LineBuffer(lineiterator, buffersize) + if hasattr(lineiterator, "readlines"): + iterator = iter(LineBuffer(lineiterator, buffersize)) + elif hasattr(lineiterator, '__next__'): + iterator = lineiterator + else: + raise Exception("Invalid line iterator") - i = iter(lb) + i = iterator first=next(i) @@ -57,11 +63,11 @@ def entryIteratorFactory(lineiterator, format=b"embl" elif first[0:6]=='LOCUS ': format=b"genbank" - elif first[0:11]=='#@ecopcr-v2': + elif first[0:11]=='#@ecopcr-v2': # TODO v2???? format=b"ecopcrfile" elif is_ngsfilter_line(first): format=b"ngsfilter" - + # TODO Temporary fix first=None lineiterator.seek(0) @@ -78,12 +84,36 @@ def entryIteratorFactory(lineiterator, elif format==b'fastq': return (fastqIterator(lineiterator, skip=skip,only=only, - qualityoffset=qualityoffset, + offset=offset, noquality=noquality, firstline=first, buffersize=buffersize), Nuc_Seq) - - + elif format==b'tabular': + return (tabIterator(lineiterator, + header = header, + sep = sep, + dec = dec, + stripwhite = stripwhite, + blanklineskip = blanklineskip, + commentchar = commentchar, + skip = skip, + only = only, + firstline=first, + buffersize=buffersize), + dict) + elif format==b'ngsfilter': + return (ngsfilterIterator(lineiterator, + sep = sep, + dec = dec, + stripwhite = stripwhite, + blanklineskip = blanklineskip, + commentchar = commentchar, + skip = skip, + only = only, + firstline=first, + buffersize=buffersize), + dict) + raise NotImplementedError('File format not yet implemented') diff --git a/python/obitools3/uri/decode.pyx b/python/obitools3/uri/decode.pyx index 11b1237..ad65318 100644 --- a/python/obitools3/uri/decode.pyx +++ b/python/obitools3/uri/decode.pyx @@ -7,11 +7,15 @@ from obitools3.dms.dms import DMS from obitools3.parsers.fasta import fastaNucIterator from obitools3.parsers.fastq import fastqIterator +from obitools3.parsers.tab import tabIterator +from obitools3.parsers.ngsfilter import ngsfilterIterator from obitools3.parsers.universal import entryIteratorFactory from obitools3.dms.obiseq import Nuc_Seq from obitools3.apps.config import getConfiguration,logger from obitools3.apps.temp import get_temp_dms +from obitools3.utils cimport tobytes # TODO because can't read options as bytes + class MalformedURIException(RuntimeError): pass @@ -210,22 +214,24 @@ def open_uri(uri, if file is not None: qualifiers=parse_qs(urip.query) - if b'format' in qualifiers: format = qualifiers[b'format'][0] else: try: - format=config["obi"]["fileformat"] + format=config["obi"]["format"] except KeyError: format=None if b'seqtype' in qualifiers: seqtype=qualifiers[b'seqtype'][0] else: - try: - seqtype=config["obi"]["seqtype"] - except KeyError: - seqtype=b'nuc' + if format == b"ngsfilter": # TODO discuss + seqtype=None + else: + try: + seqtype=config["obi"]["seqtype"] + except KeyError: + seqtype=b"nuc" if b'skip' in qualifiers: skip=int(qualifiers[b"skip"][0]) @@ -286,7 +292,7 @@ def open_uri(uri, offset=33 elif config["obi"]["qualityformat"][0]=="solexa": offset=64 - #offset=config["obi"]["qualityoffset"] # TODO discuss + #offset=config["obi"]["offset"] # TODO discuss except KeyError: offset=33 @@ -304,10 +310,10 @@ def open_uri(uri, raise MalformedURIException('Malformed header argument in URI') if b"sep" in qualifiers: - sep=qualifiers[b"sep"][0][0] + sep=tobytes(qualifiers[b"sep"][0][0]) else: try: - sep=config["obi"]["sep"] + sep=tobytes(config["obi"]["sep"]) except KeyError: sep=None @@ -315,18 +321,18 @@ def open_uri(uri, # pass if b"dec" in qualifiers: - dec=qualifiers[b"dec"][0][0] + dec=tobytes(qualifiers[b"dec"][0][0]) else: try: - dec=config["obi"]["dec"] + dec=tobytes(config["obi"]["dec"]) except KeyError: dec=b"." if b"nastring" in qualifiers: - nastring=qualifiers[b"nastring"][0] + nastring=tobytes(qualifiers[b"nastring"][0]) else: try: - nastring=config["obi"]["nastring"] + nastring=tobytes(config["obi"]["nastring"]) except KeyError: nastring=b'NA' @@ -357,15 +363,15 @@ def open_uri(uri, raise MalformedURIException('Malformed blanklineskip argument in URI') if b"commentchar" in qualifiers: - commentchar=qualifiers[b"commentchar"][0][0] + commentchar=tobytes(qualifiers[b"commentchar"][0][0]) else: try: - commentchar=config["obi"]["commentchar"] + commentchar=tobytes(config["obi"]["commentchar"]) except KeyError: commentchar=b'#' if format is not None: - if qualifiers[b"seqtype"]==b"nuc": + if seqtype==b"nuc": objclass = Nuc_Seq if format==b"fasta": iseq = fastaNucIterator(file, @@ -379,8 +385,29 @@ def open_uri(uri, noquality=noquality) else: raise NotImplementedError('Sequence file format not implemented') - elif qualifiers[b"seqtype"]==b"prot": + elif seqtype==b"prot": raise NotImplementedError() + elif format==b"tabular": + objclass = dict + iseq = tabIterator(file, + header = header, + sep = sep, + dec = dec, + stripwhite = stripwhite, + blanklineskip = blanklineskip, + commentchar = commentchar, + skip = skip, + only = only) + elif format==b"ngsfilter": + objclass = dict + iseq = ngsfilterIterator(file, + sep = sep, + dec = dec, + stripwhite = stripwhite, + blanklineskip = blanklineskip, + commentchar = commentchar, + skip = skip, + only = only) else: iseq,objclass = entryIteratorFactory(file, skip, only,