From 2684535e261e298a6dbfd833124712d9fdee2897 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Wed, 21 Mar 2018 16:39:31 +0100 Subject: [PATCH] New command: obi annotate --- python/obitools3/commands/annotate.pyx | 340 +++++++++++++++++++++++++ 1 file changed, 340 insertions(+) create mode 100644 python/obitools3/commands/annotate.pyx diff --git a/python/obitools3/commands/annotate.pyx b/python/obitools3/commands/annotate.pyx new file mode 100644 index 0000000..302829c --- /dev/null +++ b/python/obitools3/commands/annotate.pyx @@ -0,0 +1,340 @@ +#cython: language_level=3 + +from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport +from obitools3.dms import DMS +from obitools3.dms.view.view cimport View, Line_selection +from obitools3.uri.decode import open_uri +from obitools3.apps.optiongroups import addMinimalInputOption, addTaxonomyInputOption, addMinimalOutputOption +from obitools3.dms.view import RollbackException +from functools import reduce +from obitools3.apps.config import logger +from obitools3.utils cimport tobytes +from obitools3.dms.capi.obiview cimport QUALITY_COLUMN +import time +import math + + +__title__="Annotate views with new tags and edit existing annotations" + + +def addOptions(parser): + + addMinimalInputOption(parser) + addTaxonomyInputOption(parser) + addMinimalOutputOption(parser) + + group=parser.add_argument_group('obi annotate specific options') + + group.add_argument('--seq-rank', # TODO seq/elt/line??? + action="store_true", + dest="annotate:add_rank", + default=False, + help="Add a rank attribute to the sequence " + "indicating the sequence position in the data.") + + group.add_argument('-R', '--rename-tag', + action="append", + dest="annotate:rename_tags", + metavar="", + type=str, + default=[], + help="Change tag name from OLD_NAME to NEW_NAME.") + + group.add_argument('-D', '--delete-tag', + action="append", + dest="annotate:delete_tags", + metavar="", + type=str, + default=[], + help="Delete tag TAG_NAME.") + + group.add_argument('-S', '--set-tag', + action="append", + dest="annotate:set_tags", + metavar="", + type=str, + default=[], + help="Add a new tag named TAG_NAME with " + "a value computed from PYTHON_EXPRESSION.") + + group.add_argument('--set-identifier', + action="store", + dest="annotate:set_identifier", + metavar="", + type=str, + default=None, + help="Set sequence identifier with " + "a value computed from PYTHON_EXPRESSION.") + + group.add_argument('--set-sequence', + action="store", + dest="annotate:set_sequence", + metavar="", + type=str, + default=None, + help="Change the sequence itself with " + "a value computed from PYTHON_EXPRESSION.") + + group.add_argument('--set-definition', + action="store", + dest="annotate:set_definition", + metavar="", + type=str, + default=None, + help="Set sequence definition with " + "a value computed from PYTHON_EXPRESSION.") + + group.add_argument('--run', + action="store", + dest="annotate:run", + metavar="", + type=str, + default=None, + help="Run a python expression on each element.") + + group.add_argument('-C', '--clear', + action="store_true", + dest="annotate:clear", + default=False, + help="Clear all tags except the obligatory ones.") + + group.add_argument('-k','--keep', + action='append', + dest="annotate:keep", + metavar="", + default=[], + type=str, + help="Only keep this tag. (Can be specified several times.)") + + group.add_argument('--length', + action="store_true", + dest="annotate:length", + default=False, + help="Add 'seq_length' tag with sequence length.") + + group.add_argument('--with-taxon-at-rank', + action='append', + dest="annotate:taxon_at_rank", + metavar="", + default=[], + type=str, + help="Add taxonomy annotation at the specified rank level RANK_NAME.") + + +def sequenceTaggerGenerator(config, taxo=None): + + toSet=None + newId=None + newDef=None + newSeq=None + length=None + add_rank=None + run=None + + if 'set_tags' in config['annotate']: # TODO default option problem, to fix + toSet = [x.split(':',1) for x in config['annotate']['set_tags'] if len(x.split(':',1))==2] + if 'set_identifier' in config['annotate']: + newId = config['annotate']['set_identifier'] + if 'set_definition' in config['annotate']: + newDef = config['annotate']['set_definition'] + if 'set_sequence' in config['annotate']: + newSeq = config['annotate']['set_sequence'] + if 'length' in config['annotate']: + length = config['annotate']['length'] + if 'add_rank' in config["annotate"]: + add_rank = config["annotate"]["add_rank"] + if 'run' in config['annotate']: + run = config['annotate']['run'] + counter = [0] + + for i in range(len(toSet)): + for j in range(len(toSet[i])): + toSet[i][j] = tobytes(toSet[i][j]) + + annoteRank=[] + if 'taxon_at_rank' in config['annotate']: + if taxo is not None: + annoteRank = config['annotate']['taxon_at_rank'] + else: + raise Exception("A taxonomy must be provided to annotate taxon ranks") + + def sequenceTagger(seq): + + if counter[0]>=0: + counter[0]+=1 + + for rank in annoteRank: + if 'taxid' in seq: + taxid = seq['taxid'] + if taxid is not None: + rtaxid = taxo.get_taxon_at_rank(taxid, rank) + if rtaxid is not None: + scn = taxo.get_scientific_name(rtaxid) + else: + scn=None + seq[rank]=rtaxid + seq["%s_name"%rank]=scn + + if add_rank: + seq['seq_rank']=counter[0] + + for i,v in toSet: + #try: + if taxo is not None: + environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math} + else: + environ = {'sequence':seq, 'counter':counter[0], 'math':math} + val = eval(v, environ, seq) + #except Exception,e: # TODO discuss usefulness of this + # if options.onlyValid: + # raise e + # val = v + seq[i]=val + + if length: + seq['seq_length']=len(seq) + + if newId is not None: +# try: + if taxo is not None: + environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math} + else: + environ = {'sequence':seq, 'counter':counter[0], 'math':math} + val = eval(newId, environ, seq) +# except Exception,e: +# if options.onlyValid: +# raise e +# val = newId + seq.id=val + + if newDef is not None: +# try: + if taxo is not None: + environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math} + else: + environ = {'sequence':seq, 'counter':counter[0], 'math':math} + val = eval(newDef, environ, seq) +# except Exception,e: +# if options.onlyValid: +# raise e +# val = newDef + seq.definition=val +# + if newSeq is not None: +# try: + if taxo is not None: + environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math} + else: + environ = {'sequence':seq, 'counter':counter[0], 'math':math} + val = eval(newSeq, environ, seq) +# except Exception,e: +# if options.onlyValid: +# raise e +# val = newSeq + seq.seq=val + if 'seq_length' in seq: + seq['seq_length']=len(seq) + # Delete quality since it must match the sequence. + # TODO discuss deleting for each sequence separately + if QUALITY_COLUMN in seq: + seq.view.delete_column(QUALITY_COLUMN) + + if run is not None: +# try: + if taxo is not None: + environ = {'taxonomy' : taxo, 'sequence':seq, 'counter':counter[0], 'math':math} + else: + environ = {'sequence':seq, 'counter':counter[0], 'math':math} + eval(run, environ, seq) +# except Exception,e: +# if options.onlyValid: +# raise e + + return sequenceTagger + + +def run(config): + + DMS.obi_atexit() + + logger("info", "obi annotate") + + # Open the input + input = open_uri(config['obi']['inputURI']) + if input is None: + raise Exception("Could not read input view") + i_view = input[1] + + # Read the name of the output view + uri = config['obi']['outputURI'].split('/') + if len(uri)==2: + # Check that input and output DMS are the same (predicate, to discuss) + if config['obi']['inputURI'].split('/')[0] != uri[0]: + raise Exception("Input and output DMS must be the same") + output_view_name = uri[1] + else: + output_view_name = uri[0] + + # Clone output view from input view + o_view = i_view.clone(output_view_name, comments=i_view.comments+b"\nobi annotate") # TODO comments + if o_view is None: + raise Exception("Couldn't create output view") + + if 'taxoURI' in config['obi'] : # TODO default None problem + taxo_uri = open_uri(config['obi']['taxoURI']) + if taxo_uri is None: + raise Exception("Couldn't open taxonomy") + taxo = taxo_uri[1] + else : + taxo = None + + # Initialize the progress bar + pb = ProgressBar(len(o_view), config, seconde=5) + + try: + + # Apply editions + # Editions at view level + if 'delete_tags' in config['annotate']: + toDelete = config['annotate']['delete_tags'][:] + if 'rename_tags' in config['annotate']: + toRename = [x.split(':',1) for x in config['annotate']['rename_tags'] if len(x.split(':',1))==2] + if 'clear' in config['annotate']: + clear = config['annotate']['clear'] + if 'keep' in config['annotate']: + keep = config['annotate']['keep'] + for i in range(len(toDelete)): + toDelete[i] = tobytes(toDelete[i]) + for i in range(len(toRename)): + for j in range(len(toRename[i])): + toRename[i][j] = tobytes(toRename[i][j]) + for i in range(len(keep)): + keep[i] = tobytes(keep[i]) + keep = set(keep) + + if clear or keep: + for k in o_view.keys(): + if k not in keep: + o_view.delete_column(k) + else: + for k in toDelete: + o_view.delete_column(k) + for old_name, new_name in toRename: + if old_name in o_view: + o_view.rename_column(old_name, new_name) + + # Editions at line level + sequenceTagger = sequenceTaggerGenerator(config, taxo=taxo) + for i in range(len(o_view)): + pb(i) + sequenceTagger(o_view[i]) + + except Exception, e: + raise RollbackException("obi annotate error, rollbacking view: "+str(e), o_view) + + print("\n") + print(repr(o_view)) + + input[0].close() + # output[0].close() +