New command: obi annotate
This commit is contained in:
340
python/obitools3/commands/annotate.pyx
Normal file
340
python/obitools3/commands/annotate.pyx
Normal file
@ -0,0 +1,340 @@
|
|||||||
|
#cython: language_level=3
|
||||||
|
|
||||||
|
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
|
||||||
|
from obitools3.dms import DMS
|
||||||
|
from obitools3.dms.view.view cimport View, Line_selection
|
||||||
|
from obitools3.uri.decode import open_uri
|
||||||
|
from obitools3.apps.optiongroups import addMinimalInputOption, addTaxonomyInputOption, addMinimalOutputOption
|
||||||
|
from obitools3.dms.view import RollbackException
|
||||||
|
from functools import reduce
|
||||||
|
from obitools3.apps.config import logger
|
||||||
|
from obitools3.utils cimport tobytes
|
||||||
|
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN
|
||||||
|
import time
|
||||||
|
import math
|
||||||
|
|
||||||
|
|
||||||
|
# One-line description of this command.
# NOTE(review): presumably displayed by the `obi` command-line help; confirm against the caller.
__title__="Annotate views with new tags and edit existing annotations"
|
||||||
|
|
||||||
|
|
||||||
|
def addOptions(parser):
    """Register the command-line options of ``obi annotate`` on ``parser``.

    The shared input, taxonomy and output option groups are added first,
    followed by a dedicated argument group holding the options specific to
    this command. Every specific option is stored under an
    ``annotate:<name>`` destination.

    Args:
        parser: argparse-like parser exposing ``add_argument_group``.
    """
    # Option groups shared with the other obi commands.
    addMinimalInputOption(parser)
    addTaxonomyInputOption(parser)
    addMinimalOutputOption(parser)

    annotate_group = parser.add_argument_group('obi annotate specific options')

    # TODO: settle on the option name (seq / elt / line rank?).
    annotate_group.add_argument(
        '--seq-rank',
        dest="annotate:add_rank",
        action="store_true",
        default=False,
        help="Add a rank attribute to the sequence "
             "indicating the sequence position in the data.",
    )

    annotate_group.add_argument(
        '-R', '--rename-tag',
        dest="annotate:rename_tags",
        action="append",
        metavar="<OLD_NAME:NEW_NAME>",
        type=str,
        default=[],
        help="Change tag name from OLD_NAME to NEW_NAME.",
    )

    annotate_group.add_argument(
        '-D', '--delete-tag',
        dest="annotate:delete_tags",
        action="append",
        metavar="<TAG_NAME>",
        type=str,
        default=[],
        help="Delete tag TAG_NAME.",
    )

    annotate_group.add_argument(
        '-S', '--set-tag',
        dest="annotate:set_tags",
        action="append",
        metavar="<TAG_NAME:PYTHON_EXPRESSION>",
        type=str,
        default=[],
        help="Add a new tag named TAG_NAME with "
             "a value computed from PYTHON_EXPRESSION.",
    )

    annotate_group.add_argument(
        '--set-identifier',
        dest="annotate:set_identifier",
        action="store",
        metavar="<PYTHON_EXPRESSION>",
        type=str,
        default=None,
        help="Set sequence identifier with "
             "a value computed from PYTHON_EXPRESSION.",
    )

    annotate_group.add_argument(
        '--set-sequence',
        dest="annotate:set_sequence",
        action="store",
        metavar="<PYTHON_EXPRESSION>",
        type=str,
        default=None,
        help="Change the sequence itself with "
             "a value computed from PYTHON_EXPRESSION.",
    )

    annotate_group.add_argument(
        '--set-definition',
        dest="annotate:set_definition",
        action="store",
        metavar="<PYTHON_EXPRESSION>",
        type=str,
        default=None,
        help="Set sequence definition with "
             "a value computed from PYTHON_EXPRESSION.",
    )

    annotate_group.add_argument(
        '--run',
        dest="annotate:run",
        action="store",
        metavar="<PYTHON_EXPRESSION>",
        type=str,
        default=None,
        help="Run a python expression on each element.",
    )

    annotate_group.add_argument(
        '-C', '--clear',
        dest="annotate:clear",
        action="store_true",
        default=False,
        help="Clear all tags except the obligatory ones.",
    )

    annotate_group.add_argument(
        '-k', '--keep',
        dest="annotate:keep",
        action='append',
        metavar="<TAG>",
        type=str,
        default=[],
        help="Only keep this tag. (Can be specified several times.)",
    )

    annotate_group.add_argument(
        '--length',
        dest="annotate:length",
        action="store_true",
        default=False,
        help="Add 'seq_length' tag with sequence length.",
    )

    annotate_group.add_argument(
        '--with-taxon-at-rank',
        dest="annotate:taxon_at_rank",
        action='append',
        metavar="<RANK_NAME>",
        type=str,
        default=[],
        help="Add taxonomy annotation at the specified rank level RANK_NAME.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
def sequenceTaggerGenerator(config, taxo=None):
    """Build the per-sequence annotation function used by ``obi annotate``.

    The options are read once from ``config['annotate']``; the returned
    closure then applies them to one sequence (view line) per call.

    User expressions (``set_tags``, ``set_identifier``, ``set_definition``,
    ``set_sequence``, ``run``) are evaluated with ``eval``. Their global
    namespace exposes ``sequence`` (the current line), ``counter`` (1-based
    number of calls made to the tagger so far), ``math`` and, when a taxonomy
    is given, ``taxonomy``. The sequence itself is used as the local
    namespace, so its tags can be referenced by name.

    SECURITY: the expressions are executed with the full privileges of the
    process. They must only ever come from the trusted command line, never
    from untrusted data.

    Args:
        config: configuration mapping; only ``config['annotate']`` is read.
            Missing options and options set to ``None`` are treated as unset.
        taxo: optional taxonomy object. It is required when at least one
            rank is requested through ``taxon_at_rank``.

    Returns:
        A function ``sequenceTagger(seq)`` that edits ``seq`` in place and
        returns ``None``.

    Raises:
        Exception: if taxonomic ranks are requested but ``taxo`` is ``None``.
    """
    options = config['annotate']

    def option(name, default=None):
        # An option may be absent from the configuration or present with a
        # None value depending on how defaults were filled in; both cases
        # fall back to the given default.
        if name in options and options[name] is not None:
            return options[name]
        return default

    # TAG_NAME:PYTHON_EXPRESSION pairs. Entries without a ':' separator are
    # ignored. Both parts are converted with tobytes(); eval() accepts bytes
    # sources.
    toSet = []
    for spec in option('set_tags', []):
        parts = spec.split(':', 1)
        if len(parts) == 2:
            toSet.append([tobytes(part) for part in parts])

    newId = option('set_identifier')
    newDef = option('set_definition')
    newSeq = option('set_sequence')
    length = option('length', False)
    add_rank = option('add_rank', False)
    run_expression = option('run')

    # A taxonomy is only mandatory when a rank annotation is actually asked.
    annoteRank = option('taxon_at_rank', [])
    if annoteRank and taxo is None:
        raise Exception("A taxonomy must be provided to annotate taxon ranks")

    # Single-element list so that the closure below can update the count.
    counter = [0]

    def evaluate(expression, seq):
        # A fresh global namespace is built for every evaluation so that an
        # expression cannot leak names into the following ones.
        environ = {'sequence': seq, 'counter': counter[0], 'math': math}
        if taxo is not None:
            environ['taxonomy'] = taxo
        return eval(expression, environ, seq)

    def sequenceTagger(seq):
        counter[0] += 1

        # Taxonomic annotation: one taxid tag and one name tag per rank.
        for rank in annoteRank:
            if 'taxid' not in seq:
                continue
            taxid = seq['taxid']
            if taxid is None:
                continue
            rtaxid = taxo.get_taxon_at_rank(taxid, rank)
            if rtaxid is not None:
                scn = taxo.get_scientific_name(rtaxid)
            else:
                scn = None
            seq[rank] = rtaxid
            seq["%s_name" % rank] = scn

        if add_rank:
            seq['seq_rank'] = counter[0]

        for tag, expression in toSet:
            seq[tag] = evaluate(expression, seq)

        if length:
            seq['seq_length'] = len(seq)

        if newId is not None:
            seq.id = evaluate(newId, seq)

        if newDef is not None:
            seq.definition = evaluate(newDef, seq)

        if newSeq is not None:
            seq.seq = evaluate(newSeq, seq)
            # Keep an existing length tag in sync with the new sequence.
            if 'seq_length' in seq:
                seq['seq_length'] = len(seq)
            # Delete quality since it must match the sequence.
            # TODO discuss deleting for each sequence separately
            if QUALITY_COLUMN in seq:
                seq.view.delete_column(QUALITY_COLUMN)

        if run_expression is not None:
            # Evaluated for its side effects only; the result is discarded.
            evaluate(run_expression, seq)

    return sequenceTagger
|
||||||
|
|
||||||
|
|
||||||
|
def run(config):
    """Execute the ``obi annotate`` command.

    The input view is cloned into the output view, then edited in two
    passes: view-level editions (column deletion and renaming) followed by
    line-level editions applied by the function built with
    ``sequenceTaggerGenerator``.

    Args:
        config: configuration mapping. ``config['obi']`` provides
            ``inputURI``, ``outputURI`` and optionally ``taxoURI``;
            ``config['annotate']`` provides the command specific options.
            Missing options and options set to ``None`` are treated as unset.

    Raises:
        Exception: if the input view or the taxonomy cannot be opened, if
            the output view cannot be created, or if the output URI names a
            DMS different from the input one.
        RollbackException: if any edition fails; it is given the output view.
    """
    DMS.obi_atexit()

    logger("info", "obi annotate")

    obi_options = config['obi']
    annotate_options = config['annotate']

    def option(name, default):
        # An option may be absent from the configuration or present with a
        # None value depending on how defaults were filled in; both cases
        # fall back to the given default.
        if name in annotate_options and annotate_options[name] is not None:
            return annotate_options[name]
        return default

    # Open the input
    input_uri = open_uri(obi_options['inputURI'])
    if input_uri is None:
        raise Exception("Could not read input view")
    i_view = input_uri[1]

    # Read the name of the output view
    uri = obi_options['outputURI'].split('/')
    if len(uri) == 2:
        # Check that input and output DMS are the same (predicate, to discuss)
        if obi_options['inputURI'].split('/')[0] != uri[0]:
            raise Exception("Input and output DMS must be the same")
        output_view_name = uri[1]
    else:
        output_view_name = uri[0]

    # Clone output view from input view
    o_view = i_view.clone(output_view_name, comments=i_view.comments+b"\nobi annotate")  # TODO comments
    if o_view is None:
        raise Exception("Couldn't create output view")

    # The taxonomy is optional: a missing or None URI means "no taxonomy".
    if 'taxoURI' in obi_options and obi_options['taxoURI'] is not None:
        taxo_uri = open_uri(obi_options['taxoURI'])
        if taxo_uri is None:
            raise Exception("Couldn't open taxonomy")
        taxo = taxo_uri[1]
    else:
        taxo = None

    # Initialize the progress bar
    pb = ProgressBar(len(o_view), config, seconde=5)

    try:

        # Editions at view level. New containers are built so that the lists
        # stored in the configuration are left untouched.
        toDelete = [tobytes(name) for name in option('delete_tags', [])]
        toRename = []
        for spec in option('rename_tags', []):
            parts = spec.split(':', 1)
            # Entries without a ':' separator are ignored.
            if len(parts) == 2:
                toRename.append([tobytes(part) for part in parts])
        clear = option('clear', False)
        keep = {tobytes(name) for name in option('keep', [])}

        if clear or keep:
            # Iterate over a snapshot since columns are deleted on the fly.
            # NOTE(review): every column not listed in keep is deleted here;
            # confirm that the obligatory columns are protected elsewhere.
            for k in list(o_view.keys()):
                if k not in keep:
                    o_view.delete_column(k)
        else:
            for k in toDelete:
                o_view.delete_column(k)
            # NOTE(review): renaming is only applied when neither clear nor
            # keep is set; this nesting was reconstructed, confirm it.
            for old_name, new_name in toRename:
                if old_name in o_view:
                    o_view.rename_column(old_name, new_name)

        # Editions at line level
        sequenceTagger = sequenceTaggerGenerator(config, taxo=taxo)
        for i in range(len(o_view)):
            pb(i)
            sequenceTagger(o_view[i])

    except Exception as e:
        raise RollbackException("obi annotate error, rollbacking view: "+str(e), o_view)

    print("\n")
    print(repr(o_view))

    input_uri[0].close()
|
||||||
|
# output[0].close()
|
||||||
|
|
Reference in New Issue
Block a user