diff --git a/python/obi.py b/python/obi.py index 425cea5..8c7cd75 100644 --- a/python/obi.py +++ b/python/obi.py @@ -22,6 +22,7 @@ default_config = { 'software' : "The OBITools", 'loglevel' : 'INFO', 'progress' : True, 'inputURI' : None, + 'outputURI' : None, 'defaultdms' : None, 'inputview' : None, 'outputview' : None, diff --git a/python/obitools3/apps/config.pxd b/python/obitools3/apps/config.pxd index 74c595f..ae74ef9 100644 --- a/python/obitools3/apps/config.pxd +++ b/python/obitools3/apps/config.pxd @@ -7,4 +7,4 @@ cdef dict buildDefaultConfiguration(str root_config_name, dict config) cpdef dict getConfiguration(str root_config_name=?, - dict config=?) \ No newline at end of file + dict config=?) diff --git a/python/obitools3/apps/config.pyx b/python/obitools3/apps/config.pyx index c8e3baa..e5119bc 100644 --- a/python/obitools3/apps/config.pyx +++ b/python/obitools3/apps/config.pyx @@ -101,3 +101,14 @@ cpdef dict getConfiguration(str root_config_name="__default__", config['__done__']=True return config + +def logger(level, *messages): + try: + config=getConfiguration() + root = config["__root_config__"] + l = config[root]['logger'] + if config[root]['verbose']: + getattr(l, level)(*messages) + except: + print(*messages,file=sys.stderr) + diff --git a/python/obitools3/apps/logging.pyx b/python/obitools3/apps/logging.pyx index 559977d..406dbef 100644 --- a/python/obitools3/apps/logging.pyx +++ b/python/obitools3/apps/logging.pyx @@ -42,5 +42,7 @@ cpdef getLogger(dict config): rootlogger.setLevel(loglevel) config[root]['logger']=rootlogger - + config[root]['verbose']=True + return rootlogger + diff --git a/python/obitools3/apps/optiongroups/__init__.py b/python/obitools3/apps/optiongroups/__init__.py index 38e68b5..4428c1b 100644 --- a/python/obitools3/apps/optiongroups/__init__.py +++ b/python/obitools3/apps/optiongroups/__init__.py @@ -2,8 +2,8 @@ def __addInputOption(optionManager): optionManager.add_argument( dest='obi:inputURI', - metavar='index', - help='index root filename (produced by the oa index command)') + metavar='INPUT', + help='Data source URI') group = optionManager.add_argument_group("Restriction to a sub-part options", @@ -23,7 +23,12 @@ def __addInputOption(optionManager): type=int, help="treat only N sequences") - + group.add_argument('--na-string', + action="store", dest="obi:nastring", + default=b"NA", + type=bytes, + help="String associated to Non Available (NA) values") + def __addSequenceInputOption(optionManager): group = optionManager.add_argument_group("Input format options for sequence files") @@ -124,12 +129,6 @@ def __addTabularInputOption(optionManager): type=bytes, help="Decimal separator") - group.add_argument('--na-string', - action="store", dest="obi:nastring", - default=b"NA", - type=bytes, - help="String associated to Non Available (NA) values") - group.add_argument('--strip-white', action="store_false", dest="obi:stripwhite", default=True, @@ -161,3 +160,14 @@ def addAllInputOption(optionManager): __addInputOption(optionManager) __addSequenceInputOption(optionManager) __addTabularInputOption(optionManager) + + +def __addOutputOption(optionManager): + + optionManager.add_argument( + dest='obi:outputURI', + metavar='OUTPUT', + help='Data destination URI') + +def addMinimalOutputOption(optionManager): + __addOutputOption(optionManager) diff --git a/python/obitools3/commands/import.pyx b/python/obitools3/commands/import.pyx index de684eb..bc7c790 100644 --- a/python/obitools3/commands/import.pyx +++ b/python/obitools3/commands/import.pyx @@ -2,6 +2,8 @@ # TODO cimport generate errors with argument numbers, but without them some variables can't be declared +import sys + from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport from obitools3.files.universalopener cimport uopen from obitools3.parsers.fasta import fastaIterator @@ -20,6 +22,8 @@ from obitools3.dms.capi.obitypes cimport obitype_t, \ from obitools3.dms.capi.obierrno cimport obi_errno +from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption +from obitools3.uri.decode import open_uri __title__="Imports sequences from different formats into a DMS" @@ -30,83 +34,14 @@ default_config = { 'destview' : None, 'skiperror' : False, 'seqinformat' : None, 'moltype' : 'nuc', - 'filename' : None + 'source' : None } def addOptions(parser): - parser.add_argument(dest='import:filename', - metavar='', - nargs='?', - default=None, - help='Name of the sequence file to import' ) + + addSequenceInputOption(parser) + addMinimalOutputOption(parser) - group=parser.add_argument_group('obi import specific options') - - group.add_argument('--default-dms','-d', - action="store", dest="obi:defaultdms", - metavar='', - default=None, - type=str, - help="Name of the default DMS for reading and writing data") - - group.add_argument('--destination-view','-v', - action="store", dest="import:destview", - metavar='', - default=None, - type=str, - required=True, - help="Name of the default DMS for reading and writing data") - - group.add_argument('--skip', - action="store", dest="import:skip", - metavar='', - default=0, - type=int, - help="Skip the N first sequences") - - group.add_argument('--only', - action="store", dest="import:only", - metavar='', - default=None, - type=int, - help="Treat only N sequences") - - group.add_argument('--skip-on-error', - action="store_true", dest="import:skiperror", - default=None, - help="Skip sequence entries with parse error") - - group.add_argument('--fasta', - action="store_const", dest="import:seqinformat", - default=None, - const='fasta', - help="Input file is in fasta nucleic format (including obitools fasta extentions)") - - group.add_argument('--fastq', - action="store_const", dest="import:seqinformat", - default=None, - const='fastq', - help="Input file is in sanger fastq nucleic format (standard fastq)") - - group.add_argument('--nuc', - action="store_const", dest="import:moltype", - default=None, - const='nuc', - help="Input file contains nucleic sequences") - - group.add_argument('--prot', - action="store_const", dest="import:moltype", - default=None, - const='pep', - help="Input file contains protein sequences") - - group.add_argument('--NA', - action="store", dest="import:NA", - metavar='', - default='NA', - type=str, - help="Character string for Not Available values in the input file " - "(default: 'NA'") def run(config): @@ -142,147 +77,159 @@ def run(config): cdef ProgressBar pb global obi_errno - pb = ProgressBar(1000000, config, seconde=5) # TODO should be number of records in file - - inputs = uopen(config['import']['filename']) - - # Create or open DMS - d = DMS.open_or_new(config['obi']['defaultdms']) - get_quality = False - NUC_SEQS_view = False - if config['import']['seqinformat']=='fasta': - get_quality = False - NUC_SEQS_view = True - iseq = fastaIterator(inputs, skip=config['import']['skip']) - view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality) - elif config['import']['seqinformat']=='fastq': - get_quality = True - NUC_SEQS_view = True - iseq = fastqIterator(inputs, skip=config['import']['skip']) - view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality) - else: - raise RuntimeError('File format not handled') - - # Save basic columns in variables for optimization - if NUC_SEQS_view : - id_col = view["ID"] - def_col = view["DEFINITION"] - seq_col = view["NUC_SEQ"] - if get_quality : - qual_col = view["QUALITY"] + logger=config['obi']['logger'] - dcols = {} - i = 0 - for seq in iseq : - if i == config['import']['only'] : - break - else : - pb(i) - if NUC_SEQS_view : - id_col[i] = seq['id'] - def_col[i] = seq['definition'] - seq_col[i] = seq['sequence'] - if get_quality : - qual_col[i] = seq['quality'] - - for tag in seq['tags'] : - - value = seq['tags'][tag] - - # Check NA value - if value == config['import']['NA'] : - value = None - - if tag not in dcols : - - value_type = type(value) - nb_elts = 1 - value_obitype = OBI_VOID - - if value_type == dict or value_type == list : - nb_elts = len(value) - elt_names = list(value) - else : - nb_elts = 1 - elt_names = None - - value_obitype = get_obitype(value) - - if value_obitype != OBI_VOID : - dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype) - - # Fill value - dcols[tag][0][i] = value - - # TODO else log error? - - else : - - rewrite = False - - # Check type adequation - old_type = dcols[tag][1] - new_type = OBI_VOID - new_type = update_obitype(old_type, value) - if old_type != new_type : - rewrite = True - - try: - # Fill value - dcols[tag][0][i] = value - - except IndexError : - - value_type = type(value) - old_column = dcols[tag][0] - old_nb_elements_per_line = old_column.nb_elements_per_line - new_nb_elements_per_line = 0 - old_elements_names = old_column.elements_names - new_elements_names = None + logger.info("obi import : imports file into an DMS") - ##################################################################### - - # Check the length and keys of column lines if needed - if value_type == dict : # Check dictionary keys - for k in value : - if k not in old_elements_names : - new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value])) - rewrite = True - break - - elif value_type == list or value_type == tuple : # Check vector length - if old_nb_elements_per_line < len(value) : - new_nb_elements_per_line = len(value) - rewrite = True - - ##################################################################### - - if rewrite : - if new_nb_elements_per_line == 0 and new_elements_names is not None : - new_nb_elements_per_line = len(new_elements_names) - - # Reset obierrno - obi_errno = 0 - - dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name, - new_data_type=new_type, - new_nb_elements_per_line=new_nb_elements_per_line, - new_elements_names=new_elements_names), - value_obitype) - - # Update the dictionary: - for t in dcols : - dcols[t] = (view[t], dcols[t][1]) - - # Fill value - dcols[tag][0][i] = value - - i+=1 - - print("\n") - print(view.__repr__()) - - d.close() + inputs = open_uri(config['obi']['inputURI']) + + print(inputs) + + sys.exit() + +# pb = ProgressBar(1000000, config, seconde=5) # TODO should be number of records in file +# +# inputs = uopen(config['import']['filename']) +# +# # Create or open DMS +# d = DMS.open_or_new(config['obi']['defaultdms']) +# +# get_quality = False +# NUC_SEQS_view = False +# if config['import']['seqinformat']=='fasta': +# get_quality = False +# NUC_SEQS_view = True +# iseq = fastaIterator(inputs, skip=config['import']['skip']) +# view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality) +# elif config['import']['seqinformat']=='fastq': +# get_quality = True +# NUC_SEQS_view = True +# iseq = fastqIterator(inputs, skip=config['import']['skip']) +# view = View_NUC_SEQS.new(d, config['import']['destview'], quality=get_quality) +# else: +# raise RuntimeError('File format not handled') +# +# # Save basic columns in variables for optimization +# if NUC_SEQS_view : +# id_col = view["ID"] +# def_col = view["DEFINITION"] +# seq_col = view["NUC_SEQ"] +# if get_quality : +# qual_col = view["QUALITY"] +# +# dcols = {} +# +# i = 0 +# for seq in iseq : +# if i == config['import']['only'] : +# break +# else : +# pb(i) +# if NUC_SEQS_view : +# id_col[i] = seq['id'] +# def_col[i] = seq['definition'] +# seq_col[i] = seq['sequence'] +# if get_quality : +# qual_col[i] = seq['quality'] +# +# for tag in seq['tags'] : +# +# value = seq['tags'][tag] +# +# # Check NA value +# if value == config['import']['NA'] : +# value = None +# +# if tag not in dcols : +# +# value_type = type(value) +# nb_elts = 1 +# value_obitype = OBI_VOID +# +# if value_type == dict or value_type == list : +# nb_elts = len(value) +# elt_names = list(value) +# else : +# nb_elts = 1 +# elt_names = None +# +# value_obitype = get_obitype(value) +# +# if value_obitype != OBI_VOID : +# dcols[tag] = (Column.new_column(view, tag, value_obitype, nb_elements_per_line=nb_elts, elements_names=elt_names), value_obitype) +# +# # Fill value +# dcols[tag][0][i] = value +# +# # TODO else log error? +# +# else : +# +# rewrite = False +# +# # Check type adequation +# old_type = dcols[tag][1] +# new_type = OBI_VOID +# new_type = update_obitype(old_type, value) +# if old_type != new_type : +# rewrite = True +# +# try: +# # Fill value +# dcols[tag][0][i] = value +# +# except IndexError : +# +# value_type = type(value) +# old_column = dcols[tag][0] +# old_nb_elements_per_line = old_column.nb_elements_per_line +# new_nb_elements_per_line = 0 +# old_elements_names = old_column.elements_names +# new_elements_names = None +# +# ##################################################################### +# +# # Check the length and keys of column lines if needed +# if value_type == dict : # Check dictionary keys +# for k in value : +# if k not in old_elements_names : +# new_elements_names = list(set(old_elements_names+[tobytes(k) for k in value])) +# rewrite = True +# break +# +# elif value_type == list or value_type == tuple : # Check vector length +# if old_nb_elements_per_line < len(value) : +# new_nb_elements_per_line = len(value) +# rewrite = True +# +# ##################################################################### +# +# if rewrite : +# if new_nb_elements_per_line == 0 and new_elements_names is not None : +# new_nb_elements_per_line = len(new_elements_names) +# +# # Reset obierrno +# obi_errno = 0 +# +# dcols[tag] = (view.rewrite_column_with_diff_attributes(old_column.name, +# new_data_type=new_type, +# new_nb_elements_per_line=new_nb_elements_per_line, +# new_elements_names=new_elements_names), +# value_obitype) +# +# # Update the dictionary: +# for t in dcols : +# dcols[t] = (view[t], dcols[t][1]) +# +# # Fill value +# dcols[tag][0][i] = value +# +# i+=1 +# +# print("\n") +# print(view.__repr__()) +# +# d.close() diff --git a/python/obitools3/parsers/universal.pyx b/python/obitools3/parsers/universal.pyx index b92a4cf..8d3d0ab 100644 --- a/python/obitools3/parsers/universal.pyx +++ b/python/obitools3/parsers/universal.pyx @@ -46,8 +46,8 @@ def entryIteratorFactory(lineiterator, i = iter(lb) first=next(i) - - format="tab" + + format=b"tabular" if first[0]==">": format=b"fasta" @@ -61,9 +61,6 @@ def entryIteratorFactory(lineiterator, format=b"ecopcrfile" elif is_ngsfilter_line(first): format=b"ngsfilter" - else: - format=b"tabular" - if format==b'fasta': if seqtype == b'nuc': diff --git a/python/obitools3/uri/decode.pxd b/python/obitools3/uri/decode.pxd index 2d3447b..573415c 100644 --- a/python/obitools3/uri/decode.pxd +++ b/python/obitools3/uri/decode.pxd @@ -4,3 +4,7 @@ from obitools3.dms.dms cimport DMS from obitools3.dms.view.view cimport View from obitools3.dms.column.column cimport Column from obitools3.dms.taxo.taxo cimport Taxonomy + +from obitools3.utils cimport tobytes, tostr +from obitools3.files.universalopener cimport uopen + diff --git a/python/obitools3/uri/decode.pyx b/python/obitools3/uri/decode.pyx index 964869c..093bfd7 100644 --- a/python/obitools3/uri/decode.pyx +++ b/python/obitools3/uri/decode.pyx @@ -3,16 +3,14 @@ from urllib.parse import urlparse, urlunparse, parse_qs, ParseResultBytes from os.path import isdir, isfile, basename, join -from obitools3.utils import tobytes - from obitools3.dms.dms import DMS -from obitools3.files.universalopener import uopen from obitools3.parsers.fasta import fastaNucIterator from obitools3.parsers.fastq import fastqIterator from obitools3.parsers.universal import entryIteratorFactory from obitools3.dms.obiseq import Nuc_Seq +from obitools3.apps.config import getConfiguration,logger class MalformedURIException(RuntimeError): pass @@ -130,21 +128,29 @@ def open_dms_element(DMS dms, bytes path): return (dms,subsubpart) -def open_uri(uri,input=True,config={}): +def open_uri(uri,bint input=True): cdef bytes urib = tobytes(uri) cdef bytes scheme cdef tuple dms + cdef dict qualifiers + cdef DMS default_dms + config = getConfiguration() urip = urlparse(urib) + + if 'obi' not in config: + config['obi']={} - default_dms=config["obi"]["defaultdms"] - + try: + default_dms=config["obi"]["defaultdms"] + except KeyError: + default_dms=None + scheme = urip.scheme error = None if scheme==b"" : - scheme=b'file' dms = open_dms(urip.path) if dms is None and default_dms is not None: dms=(default_dms,urip.path) @@ -167,17 +173,13 @@ def open_uri(uri,input=True,config={}): return (resource[0],resource[1],urlunparse(urip)) except Exception as e: error=e - - urip = ParseResultBytes(scheme=scheme, - netloc=urip.netloc, - path=urip.path, - params=urip.params, - query=urip.query, - fragment=urip.fragment) - uri=urlunparse(urip) + + if not urip.scheme: + urib=b"file:"+urib try: - file = uopen(uri) + logger('info','Trying to open file : %s', tostr(urib)) + file = uopen(tostr(urib)) except Exception as e: file = None error=e @@ -189,17 +191,26 @@ def open_uri(uri,input=True,config={}): if b'format' in qualifiers: format = qualifiers[b'format'][0] else: - format=config["obi"]["fileformat"] + try: + format=config["obi"]["fileformat"] + except KeyError: + format=None if b'seqtype' in qualifiers: seqtype=qualifiers[b'seqtype'][0] else: - seqtype=config["obi"]["seqtype"] + try: + seqtype=config["obi"]["seqtype"] + except KeyError: + seqtype=b'nuc' if b'skip' in qualifiers: skip=int(qualifiers[b"skip"][0]) else: - skip=config["obi"]["skeep"] + try: + skip=config["obi"]["skip"] + except KeyError: + skip=0 if skip < 0: raise MalformedURIException('Malformed skip argument in URI') @@ -207,8 +218,11 @@ def open_uri(uri,input=True,config={}): if b'only' in qualifiers: only=int(qualifiers[b"only"][0]) else: - only=config["obi"]["only"] - if only <= 0: + try: + only=config["obi"]["only"] + except KeyError: + only=None + if only is not None and only <= 0: raise MalformedURIException('Malformed only argument in URI') @@ -218,7 +232,10 @@ def open_uri(uri,input=True,config={}): except Exception as e: raise MalformedURIException('Malformed skiperror argument in URI') else: - skiperror=config["obi"]["skiperror"] + try: + skiperror=config["obi"]["skiperror"] + except KeyError: + skiperror=True if not isinstance(skiperror, bool): raise MalformedURIException('Malformed skiperror argument in URI') @@ -228,7 +245,10 @@ def open_uri(uri,input=True,config={}): except Exception as e: raise MalformedURIException('Malformed noquality argument in URI') else: - noquality=config["obi"]["noquality"] + try: + noquality=config["obi"]["noquality"] + except KeyError: + noquality=False if not isinstance(noquality, bool): raise MalformedURIException('Malformed noquality argument in URI') @@ -238,7 +258,10 @@ def open_uri(uri,input=True,config={}): elif qualifiers[b"qualityformat"][0]=="solexa": offset=64 else: - offset=config["obi"]["qualityoffset"] + try: + offset=config["obi"]["qualityoffset"] + except KeyError: + offset=33 if b"header" in qualifiers: try: @@ -246,14 +269,20 @@ def open_uri(uri,input=True,config={}): except Exception as e: raise MalformedURIException('Malformed header argument in URI') else: - header=config["obi"]["header"] + try: + header=config["obi"]["header"] + except KeyError: + header=False if not isinstance(header, bool): raise MalformedURIException('Malformed header argument in URI') if b"sep" in qualifiers: sep=qualifiers[b"sep"][0][0] else: - seq=config["obi"]["sep"] + try: + sep=config["obi"]["sep"] + except KeyError: + sep=None # if b"quote" in qualifiers: # pass @@ -261,20 +290,29 @@ def open_uri(uri,input=True,config={}): if b"dec" in qualifiers: dec=qualifiers[b"dec"][0][0] else: - dec=config["obi"]["dec"] + try: + dec=config["obi"]["dec"] + except KeyError: + dec=b"." if b"nastring" in qualifiers: nastring=qualifiers[b"nastring"][0] else: - nastring=config["obi"]["nastring"] - + try: + nastring=config["obi"]["nastring"] + except KeyError: + nastring=b'NA' + if b"stripwhite" in qualifiers: try: stripwhite=eval(qualifiers[b"stripwhite"][0]) except Exception as e: raise MalformedURIException('Malformed stripwhite argument in URI') else: - stripwhite=config["obi"]["stripwhite"] + try: + stripwhite=config["obi"]["stripwhite"] + except KeyError: + stripwhite=True if not isinstance(stripwhite, bool): raise MalformedURIException('Malformed stripwhite argument in URI') @@ -284,14 +322,20 @@ def open_uri(uri,input=True,config={}): except Exception as e: raise MalformedURIException('Malformed blanklineskip argument in URI') else: - blanklineskip=config["obi"]["blanklineskip"] + try: + blanklineskip=config["obi"]["blanklineskip"] + except KeyError: + blanklineskip=True if not isinstance(blanklineskip, bool): raise MalformedURIException('Malformed blanklineskip argument in URI') if b"commentchar" in qualifiers: commentchar=qualifiers[b"commentchar"][0][0] else: - commentchar=config["obi"]["commentchar"] + try: + commentchar=config["obi"]["commentchar"] + except KeyError: + commentchar=b'#' if format is not None: if qualifiers[b"seqtype"]==b"nuc":