obi grep: added all the missing filtering options

2018-08-14 17:11:41 +02:00
parent 3847850a9d
commit 1b6b6d825a
1 changed files with 242 additions and 16 deletions
--- a/python/obitools3/commands/grep.pyx
+++ b/python/obitools3/commands/grep.pyx
@ -6,13 +6,19 @@ from obitools3.dms.view.view cimport View, Line_selection
 from obitools3.uri.decode import open_uri
 from obitools3.apps.optiongroups import addMinimalInputOption, addTaxonomyInputOption, addMinimalOutputOption
 from obitools3.dms.view import RollbackException
-from functools import reduce
 from obitools3.apps.config import logger
+from obitools3.utils cimport tobytes
+
+from functools import reduce
 import time
+import re
 
 
 __title__="Grep view lines that match the given predicates"

+
+# TODO should sequences that have a grepped attribute at None be grepped or not? (in obi1 they are but....)
+
 
 def addOptions(parser):
    
@ -20,16 +26,236 @@ def addOptions(parser):
    addTaxonomyInputOption(parser)
    addMinimalOutputOption(parser)

-    group=parser.add_argument_group('obi grep specific options')
+    group=parser.add_argument_group("obi grep specific options")
 
-    group.add_argument('--predicate','-p',
+    group.add_argument("--predicate", "-p",
                       action="append", dest="grep:predicates",
-                       metavar='<PREDICATE>',
+                       metavar="<PREDICATE>",
                       default=None,
                       type=str,
-                       help="Grep lines that match the given python expression on <line> or <sequence>.")
+                       help="Python boolean expression to be evaluated in the "
+                            "sequence/line context. The attribute name can be "
+                            "used in the expression as a variable name."
+                            "An extra variable named 'sequence' or 'line' refers"
+                            "to the sequence or line object itself. "
+                            "Several -p options can be used on the same "
+                            "commande line.")
 
+    group.add_argument("-S", "--sequence",
+                       action="store", dest="grep:seq_pattern",
+                       metavar="<REGULAR_PATTERN>",
+                       type=str,
+                       help="Regular expression pattern used to select "
+                            "the sequence. The pattern is case insensitive.")
+    
+    group.add_argument("-D", "--definition",
+                       action="store", dest="grep:def_pattern",
+                       metavar="<REGULAR_PATTERN>",
+                       type=str,
+                       help="Regular expression pattern used to select "
+                            "the definition of the sequence. The pattern is case insensitive.")
+    
+    group.add_argument("-I", "--identifier",
+                       action="store", dest="grep:id_pattern",
+                       metavar="<REGULAR_PATTERN>",
+                       type=str,
+                       help="Regular expression pattern used to select "
+                            "the identifier of the sequence. The pattern is case insensitive.")
+    
+    group.add_argument("--id-list",
+                       action="store", dest="grep:id_list",
+                       metavar="<FILE_NAME>",
+                       type=str,
+                       help="File containing the identifiers of the sequences to select.")
+
+    group.add_argument("-a", "--attribute",
+                       action="append", dest="grep:attribute_patterns",
+                       type=str,
+                       default=[],
+                       metavar="<ATTRIBUTE_NAME>:<REGULAR_PATTERN>",
+                       help="Regular expression pattern matched against "
+                            "the attributes of the sequence. "
+                            "The pattern is case sensitive. "
+                            "Several -a options can be used on the same "
+                            "command line.")
+    
+    group.add_argument("-A", "--has-attribute",
+                       action="append", dest="grep:attributes",
+                       type=str,
+                       default=[],
+                       metavar="<ATTRIBUTE_NAME>",
+                       help="Select records with the attribute <ATTRIBUTE_NAME> "
+                            "defined (not set to NA value). "
+                            "Several -a options can be used on the same "
+                            "command line.")
+    
+    group.add_argument("-L", "--lmax",
+                       action="store", dest="grep:lmax",
+                       metavar="<MAX_LENGTH>",
+                       type=int,
+                       help="Keep sequences shorter than MAX_LENGTH.")
+                             
+    group.add_argument("-l", "--lmin",
+                       action="store", dest="grep:lmin",
+                       metavar="<MIN_LENGTH>",
+                       type=int,
+                       help="Keep sequences longer than MIN_LENGTH.")
+    
+    group.add_argument("-v", "--invert-selection",
+                       action="store_true", dest="grep:invert_selection",
+                       default=False,
+                       help="Invert the selection.")
+
+
+    group=parser.add_argument_group("Taxonomy filtering specific options")  #TODO put somewhere else? not in grep
+ 
+    group.add_argument('--require-rank',
+                       action="append", dest="grep:required_ranks",
+                       metavar="<RANK_NAME>",
+                       type=str,
+                       default=[],
+                       help="Select sequences with a taxid that is or has "
+                            "a parent of rank <RANK_NAME>.")
     
+    group.add_argument('-r', '--required',
+                       action="append", dest="grep:required_taxids",
+                       metavar="<TAXID>",
+                       type=int,
+                       default=[],
+                       help="Select the sequences having the ancestor of taxid <TAXID>. "
+                            "If several ancestors are specified (with \n'-r taxid1 -r taxid2'), "
+                            "the sequences having at least one of them are selected.")
+     
+    # TODO useless option equivalent to -r -v?
+    group.add_argument('-i','--ignore',
+                     action="append", dest="grep:ignored_taxids",
+                     metavar="<TAXID>",
+                     type=int,
+                     default=[],
+                     help="Ignore the sequences having the ancestor of taxid <TAXID>. "
+                          "If several ancestors are specified (with \n'-r taxid1 -r taxid2'), "
+                          "the sequences having at least one of them are ignored.")
+
+
+def Filter_generator(options, tax_filter):
+    #taxfilter = taxonomyFilterGenerator(options)
+    
+    # Initialize conditions
+    predicates = None
+    if "predicates" in options:
+        predicates = options["predicates"]
+    attributes = None
+    if "attributes" in options:
+        attributes = options["attributes"]
+    lmax = None
+    if "lmax" in options:
+        lmax = options["lmax"]
+    lmin = None
+    if "lmin" in options:
+        lmin = options["lmin"]
+    invert_selection = options["invert_selection"]
+    id_set = None
+    if "id_list" in options:
+        id_set = set(x.strip() for x in open(options["id_list"]))
+
+    # Initialize the regular expression patterns
+    seq_pattern = None
+    if "seq_pattern" in options:
+        seq_pattern = re.compile(tobytes(options["seq_pattern"]), re.I)
+    id_pattern = None
+    if "id_pattern" in options:
+        id_pattern = re.compile(tobytes(options["id_pattern"]))
+    def_pattern = None
+    if "def_pattern" in options:
+        def_pattern = re.compile(tobytes(options["def_pattern"]))
+    attribute_patterns={}
+    if "attribute_patterns" in options:
+        for p in options["attribute_patterns"]:
+            attribute, pattern = p.split(":", 1)
+            attribute_patterns[tobytes(attribute)] = re.compile(tobytes(pattern))
+    
+    def filter(line, loc_env):
+        cdef bint good = True
+        
+        if seq_pattern and hasattr(line, "seq"):
+            good = <bint>(seq_pattern.search(line.seq))
+        
+        if good and id_pattern and hasattr(line, "id"):
+            good = <bint>(id_pattern.search(line.id))
+            
+        if good and id_set is not None and hasattr(line, "id"):
+            good = line.id in id_set
+            
+        if good and def_pattern and hasattr(line, "definition"):
+            good = <bint>(def_pattern.search(line.definition))
+             
+        if good and attributes:  # TODO discuss that we test not None
+            good = reduce(lambda bint x, bint y: x and y,
+                           (line[attribute] is not None for attribute in attributes),
+                           True)
+             
+        if good and attribute_patterns:
+            good = (reduce(lambda bint x, bint y : x and y, 
+                        (line[attribute] is not None for attribute in attributes),
+                        True)
+                    and
+                    reduce(lambda bint x, bint y: x and y,
+                        (<bint>(attribute_patterns[attribute].search(tobytes(str(line[attribute]))))
+                        for attribute in attribute_patterns), 
+                        True)
+                   )
+             
+        if good and predicates:
+            good = (reduce(lambda bint x, bint y: x and y,
+                    (bool(eval(p, loc_env, line))
+                    for p in predicates), True))
+
+        if good and lmin:
+            good = len(line) >= lmin
+             
+        if good and lmax:
+            good = len(line) <= lmax
+         
+        if good:
+            good = tax_filter(line)
+       
+        if invert_selection :
+            good = not good
+
+        return good
+    
+    return filter
+
+
+def Taxonomy_filter_generator(taxo, options):
+    if taxo is not None:
+        def tax_filter(seq):
+            good = True
+            if b'TAXID' in seq and seq[b'TAXID'] is not None:   # TODO use macro
+                taxid = seq[b'TAXID']
+                if "required_ranks" in options and options["required_ranks"]:
+                    taxon_at_rank = reduce(lambda x,y: x and y,
+                                           (taxo.get_taxon_at_rank(seq[b'TAXID'], rank) is not None
+                                            for rank in options["required_ranks"]),
+                                           True)
+                    good = good and taxon_at_rank 
+                if "required_taxids" in options and options["required_taxids"]:
+                    good = good and reduce(lambda x,y: x or y,
+                                           (taxo.is_ancestor(r, taxid) 
+                                            for r in options["required_taxids"]),
+                                           False)
+                if "ignored_taxids" in options and options["ignored_taxids"]:
+                    good = good and not reduce(lambda x,y: x or y,
+                                               (taxo.is_ancestor(r,taxid) 
+                                                for r in options["ignored_taxids"]),
+                                               False)
+            return good
+    else:
+        def tax_filter(seq):
+            return True
+    return tax_filter
+
+
 def run(config):
     
    DMS.obi_atexit()
@ -37,23 +263,23 @@ def run(config):
    logger("info", "obi grep")

    # Open the input
-    input = open_uri(config['obi']['inputURI'])
+    input = open_uri(config["obi"]["inputURI"])
    if input is None:
        raise Exception("Could not read input view")
    i_view = input[1]

    # Read the name of the output view
-    uri = config['obi']['outputURI'].split('/')
+    uri = config["obi"]["outputURI"].split("/")
    if len(uri)==2:
        # Check that input and output DMS are the same (predicate, to discuss)
-        if config['obi']['inputURI'].split('/')[0] != uri[0]:
+        if config["obi"]["inputURI"].split("/")[0] != uri[0]:
            raise Exception("Input and output DMS must be the same")
        output_view_name = uri[1]
    else:
        output_view_name = uri[0]

-    if 'taxoURI' in config['obi'] :   # TODO default None problem
-        taxo_uri = open_uri(config['obi']['taxoURI'])
+    if "taxoURI" in config["obi"] :   # TODO default None problem
+        taxo_uri = open_uri(config["obi"]["taxoURI"])
        if taxo_uri is None:
            raise Exception("Couldn't open taxonomy")
        taxo = taxo_uri[1]
@ -64,23 +290,23 @@ def run(config):
    pb = ProgressBar(len(i_view), config, seconde=5)
 
    # Apply filter
+    tax_filter = Taxonomy_filter_generator(taxo, config["grep"])
+    filter = Filter_generator(config["grep"], tax_filter)
    selection = Line_selection(i_view)
    for i in range(len(i_view)):
        pb(i)
        line = i_view[i]
                 
-        loc_env = {'sequence': line, 'line': line, 'taxonomy': taxo}
-        
-        good = (reduce(lambda bint x, bint y: x and y,
-                (bool(eval(p, loc_env, line))
-                for p in config['grep']['predicates']), True))
+        loc_env = {"sequence": line, "line": line, "taxonomy": taxo}
        
+        good = filter(line, loc_env)
+ 
        if good :
            selection.append(i)

    # Create output view with the line selection
    try:
-        o_view = selection.materialize(output_view_name, comments="obi grep: "+str(config['grep']['predicates'])+"\n")
+        o_view = selection.materialize(output_view_name, comments="obi grep: "+str(config["grep"])+"\n")
    except Exception, e:
        raise RollbackException("obi grep error, rollbacking view: "+str(e), o_view)