diff --git a/python/obitools3/commands/grep.pyx b/python/obitools3/commands/grep.pyx index 63ac52b..338ff0d 100644 --- a/python/obitools3/commands/grep.pyx +++ b/python/obitools3/commands/grep.pyx @@ -6,13 +6,19 @@ from obitools3.dms.view.view cimport View, Line_selection from obitools3.uri.decode import open_uri from obitools3.apps.optiongroups import addMinimalInputOption, addTaxonomyInputOption, addMinimalOutputOption from obitools3.dms.view import RollbackException -from functools import reduce from obitools3.apps.config import logger +from obitools3.utils cimport tobytes + +from functools import reduce import time +import re __title__="Grep view lines that match the given predicates" + +# TODO should sequences that have a grepped attribute at None be grepped or not? (in obi1 they are but....) + def addOptions(parser): @@ -20,16 +26,236 @@ def addOptions(parser): addTaxonomyInputOption(parser) addMinimalOutputOption(parser) - group=parser.add_argument_group('obi grep specific options') + group=parser.add_argument_group("obi grep specific options") - group.add_argument('--predicate','-p', + group.add_argument("--predicate", "-p", action="append", dest="grep:predicates", - metavar='', + metavar="", default=None, type=str, - help="Grep lines that match the given python expression on or .") + help="Python boolean expression to be evaluated in the " + "sequence/line context. The attribute name can be " + "used in the expression as a variable name." + "An extra variable named 'sequence' or 'line' refers" + "to the sequence or line object itself. " + "Several -p options can be used on the same " + "commande line.") + group.add_argument("-S", "--sequence", + action="store", dest="grep:seq_pattern", + metavar="", + type=str, + help="Regular expression pattern used to select " + "the sequence. The pattern is case insensitive.") + + group.add_argument("-D", "--definition", + action="store", dest="grep:def_pattern", + metavar="", + type=str, + help="Regular expression pattern used to select " + "the definition of the sequence. The pattern is case insensitive.") + + group.add_argument("-I", "--identifier", + action="store", dest="grep:id_pattern", + metavar="", + type=str, + help="Regular expression pattern used to select " + "the identifier of the sequence. The pattern is case insensitive.") + + group.add_argument("--id-list", + action="store", dest="grep:id_list", + metavar="", + type=str, + help="File containing the identifiers of the sequences to select.") + + group.add_argument("-a", "--attribute", + action="append", dest="grep:attribute_patterns", + type=str, + default=[], + metavar=":", + help="Regular expression pattern matched against " + "the attributes of the sequence. " + "The pattern is case sensitive. " + "Several -a options can be used on the same " + "command line.") + + group.add_argument("-A", "--has-attribute", + action="append", dest="grep:attributes", + type=str, + default=[], + metavar="", + help="Select records with the attribute " + "defined (not set to NA value). " + "Several -a options can be used on the same " + "command line.") + + group.add_argument("-L", "--lmax", + action="store", dest="grep:lmax", + metavar="", + type=int, + help="Keep sequences shorter than MAX_LENGTH.") + + group.add_argument("-l", "--lmin", + action="store", dest="grep:lmin", + metavar="", + type=int, + help="Keep sequences longer than MIN_LENGTH.") + + group.add_argument("-v", "--invert-selection", + action="store_true", dest="grep:invert_selection", + default=False, + help="Invert the selection.") + + + group=parser.add_argument_group("Taxonomy filtering specific options") #TODO put somewhere else? not in grep + + group.add_argument('--require-rank', + action="append", dest="grep:required_ranks", + metavar="", + type=str, + default=[], + help="Select sequences with a taxid that is or has " + "a parent of rank .") + group.add_argument('-r', '--required', + action="append", dest="grep:required_taxids", + metavar="", + type=int, + default=[], + help="Select the sequences having the ancestor of taxid . " + "If several ancestors are specified (with \n'-r taxid1 -r taxid2'), " + "the sequences having at least one of them are selected.") + + # TODO useless option equivalent to -r -v? + group.add_argument('-i','--ignore', + action="append", dest="grep:ignored_taxids", + metavar="", + type=int, + default=[], + help="Ignore the sequences having the ancestor of taxid . " + "If several ancestors are specified (with \n'-r taxid1 -r taxid2'), " + "the sequences having at least one of them are ignored.") + + +def Filter_generator(options, tax_filter): + #taxfilter = taxonomyFilterGenerator(options) + + # Initialize conditions + predicates = None + if "predicates" in options: + predicates = options["predicates"] + attributes = None + if "attributes" in options: + attributes = options["attributes"] + lmax = None + if "lmax" in options: + lmax = options["lmax"] + lmin = None + if "lmin" in options: + lmin = options["lmin"] + invert_selection = options["invert_selection"] + id_set = None + if "id_list" in options: + id_set = set(x.strip() for x in open(options["id_list"])) + + # Initialize the regular expression patterns + seq_pattern = None + if "seq_pattern" in options: + seq_pattern = re.compile(tobytes(options["seq_pattern"]), re.I) + id_pattern = None + if "id_pattern" in options: + id_pattern = re.compile(tobytes(options["id_pattern"])) + def_pattern = None + if "def_pattern" in options: + def_pattern = re.compile(tobytes(options["def_pattern"])) + attribute_patterns={} + if "attribute_patterns" in options: + for p in options["attribute_patterns"]: + attribute, pattern = p.split(":", 1) + attribute_patterns[tobytes(attribute)] = re.compile(tobytes(pattern)) + + def filter(line, loc_env): + cdef bint good = True + + if seq_pattern and hasattr(line, "seq"): + good = (seq_pattern.search(line.seq)) + + if good and id_pattern and hasattr(line, "id"): + good = (id_pattern.search(line.id)) + + if good and id_set is not None and hasattr(line, "id"): + good = line.id in id_set + + if good and def_pattern and hasattr(line, "definition"): + good = (def_pattern.search(line.definition)) + + if good and attributes: # TODO discuss that we test not None + good = reduce(lambda bint x, bint y: x and y, + (line[attribute] is not None for attribute in attributes), + True) + + if good and attribute_patterns: + good = (reduce(lambda bint x, bint y : x and y, + (line[attribute] is not None for attribute in attributes), + True) + and + reduce(lambda bint x, bint y: x and y, + ((attribute_patterns[attribute].search(tobytes(str(line[attribute])))) + for attribute in attribute_patterns), + True) + ) + + if good and predicates: + good = (reduce(lambda bint x, bint y: x and y, + (bool(eval(p, loc_env, line)) + for p in predicates), True)) + + if good and lmin: + good = len(line) >= lmin + + if good and lmax: + good = len(line) <= lmax + + if good: + good = tax_filter(line) + + if invert_selection : + good = not good + + return good + + return filter + + +def Taxonomy_filter_generator(taxo, options): + if taxo is not None: + def tax_filter(seq): + good = True + if b'TAXID' in seq and seq[b'TAXID'] is not None: # TODO use macro + taxid = seq[b'TAXID'] + if "required_ranks" in options and options["required_ranks"]: + taxon_at_rank = reduce(lambda x,y: x and y, + (taxo.get_taxon_at_rank(seq[b'TAXID'], rank) is not None + for rank in options["required_ranks"]), + True) + good = good and taxon_at_rank + if "required_taxids" in options and options["required_taxids"]: + good = good and reduce(lambda x,y: x or y, + (taxo.is_ancestor(r, taxid) + for r in options["required_taxids"]), + False) + if "ignored_taxids" in options and options["ignored_taxids"]: + good = good and not reduce(lambda x,y: x or y, + (taxo.is_ancestor(r,taxid) + for r in options["ignored_taxids"]), + False) + return good + else: + def tax_filter(seq): + return True + return tax_filter + + def run(config): DMS.obi_atexit() @@ -37,23 +263,23 @@ def run(config): logger("info", "obi grep") # Open the input - input = open_uri(config['obi']['inputURI']) + input = open_uri(config["obi"]["inputURI"]) if input is None: raise Exception("Could not read input view") i_view = input[1] # Read the name of the output view - uri = config['obi']['outputURI'].split('/') + uri = config["obi"]["outputURI"].split("/") if len(uri)==2: # Check that input and output DMS are the same (predicate, to discuss) - if config['obi']['inputURI'].split('/')[0] != uri[0]: + if config["obi"]["inputURI"].split("/")[0] != uri[0]: raise Exception("Input and output DMS must be the same") output_view_name = uri[1] else: output_view_name = uri[0] - if 'taxoURI' in config['obi'] : # TODO default None problem - taxo_uri = open_uri(config['obi']['taxoURI']) + if "taxoURI" in config["obi"] : # TODO default None problem + taxo_uri = open_uri(config["obi"]["taxoURI"]) if taxo_uri is None: raise Exception("Couldn't open taxonomy") taxo = taxo_uri[1] @@ -64,23 +290,23 @@ def run(config): pb = ProgressBar(len(i_view), config, seconde=5) # Apply filter + tax_filter = Taxonomy_filter_generator(taxo, config["grep"]) + filter = Filter_generator(config["grep"], tax_filter) selection = Line_selection(i_view) for i in range(len(i_view)): pb(i) line = i_view[i] - loc_env = {'sequence': line, 'line': line, 'taxonomy': taxo} - - good = (reduce(lambda bint x, bint y: x and y, - (bool(eval(p, loc_env, line)) - for p in config['grep']['predicates']), True)) + loc_env = {"sequence": line, "line": line, "taxonomy": taxo} + good = filter(line, loc_env) + if good : selection.append(i) # Create output view with the line selection try: - o_view = selection.materialize(output_view_name, comments="obi grep: "+str(config['grep']['predicates'])+"\n") + o_view = selection.materialize(output_view_name, comments="obi grep: "+str(config["grep"])+"\n") except Exception, e: raise RollbackException("obi grep error, rollbacking view: "+str(e), o_view)