obitools3/python/obitools3/commands/grep.pyx

#cython: language_level=3

from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
from obitools3.dms import DMS
from obitools3.dms.view.view cimport View, Line_selection
from obitools3.uri.decode import open_uri
from obitools3.apps.optiongroups import addMinimalInputOption, addTaxonomyInputOption, addMinimalOutputOption
from obitools3.dms.view import RollbackException
from obitools3.apps.config import logger
from obitools3.utils cimport tobytes

from functools import reduce
import time
import re
import sys


__title__="Grep view lines that match the given predicates"


# TODO should sequences that have a grepped attribute at None be grepped or not? (in obi1 they are but....)


def addOptions(parser):

    addMinimalInputOption(parser)
    addTaxonomyInputOption(parser)
    addMinimalOutputOption(parser)

    group=parser.add_argument_group("obi grep specific options")

    group.add_argument("--predicate", "-p",
                       action="append", dest="grep:grep_predicates",
                       metavar="<PREDICATE>",
                       default=None,
                       type=str,
                       help="Python boolean expression to be evaluated in the "
                            "sequence/line context. The attribute name can be "
                            "used in the expression as a variable name."
                            "An extra variable named 'sequence' or 'line' refers"
                            "to the sequence or line object itself. "
                            "Several -p options can be used on the same "
                            "commande line.")

    group.add_argument("-S", "--sequence",
                       action="store", dest="grep:seq_pattern",
                       metavar="<REGULAR_PATTERN>",
                       type=str,
                       help="Regular expression pattern used to select "
                            "the sequence. The pattern is case insensitive.")

    group.add_argument("-D", "--definition",
                       action="store", dest="grep:def_pattern",
                       metavar="<REGULAR_PATTERN>",
                       type=str,
                       help="Regular expression pattern used to select "
                            "the definition of the sequence. The pattern is case insensitive.")

    group.add_argument("-I", "--identifier",
                       action="store", dest="grep:id_pattern",
                       metavar="<REGULAR_PATTERN>",
                       type=str,
                       help="Regular expression pattern used to select "
                            "the identifier of the sequence. The pattern is case insensitive.")

    group.add_argument("--id-list",
                       action="store", dest="grep:id_list",
                       metavar="<FILE_NAME>",
                       type=str,
                       help="File containing the identifiers of the sequences to select.")

    group.add_argument("-a", "--attribute",
                       action="append", dest="grep:attribute_patterns",
                       type=str,
                       default=[],
                       metavar="<ATTRIBUTE_NAME>:<REGULAR_PATTERN>",
                       help="Regular expression pattern matched against "
                            "the attributes of the sequence. "
                            "The pattern is case sensitive. "
                            "Several -a options can be used on the same "
                            "command line.")

    group.add_argument("-A", "--has-attribute",
                       action="append", dest="grep:attributes",
                       type=str,
                       default=[],
                       metavar="<ATTRIBUTE_NAME>",
                       help="Select records with the attribute <ATTRIBUTE_NAME> "
                            "defined (not set to NA value). "
                            "Several -a options can be used on the same "
                            "command line.")

    group.add_argument("-L", "--lmax",
                       action="store", dest="grep:lmax",
                       metavar="<MAX_LENGTH>",
                       type=int,
                       help="Keep sequences shorter than MAX_LENGTH.")

    group.add_argument("-l", "--lmin",
                       action="store", dest="grep:lmin",
                       metavar="<MIN_LENGTH>",
                       type=int,
                       help="Keep sequences longer than MIN_LENGTH.")

    group.add_argument("-v", "--invert-selection",
                       action="store_true", dest="grep:invert_selection",
                       default=False,
                       help="Invert the selection.")


    group=parser.add_argument_group("Taxonomy filtering specific options")  #TODO put somewhere else? not in grep

    group.add_argument('--require-rank',
                       action="append", dest="grep:required_ranks",
                       metavar="<RANK_NAME>",
                       type=str,
                       default=[],
                       help="Select sequences with a taxid that is or has "
                            "a parent of rank <RANK_NAME>.")

    group.add_argument('-r', '--required',
                       action="append", dest="grep:required_taxids",
                       metavar="<TAXID>",
                       type=int,
                       default=[],
                       help="Select the sequences having the ancestor of taxid <TAXID>. "
                            "If several ancestors are specified (with \n'-r taxid1 -r taxid2'), "
                            "the sequences having at least one of them are selected.")

    # TODO useless option equivalent to -r -v?
    group.add_argument('-i','--ignore',
                     action="append", dest="grep:ignored_taxids",
                     metavar="<TAXID>",
                     type=int,
                     default=[],
                     help="Ignore the sequences having the ancestor of taxid <TAXID>. "
                          "If several ancestors are specified (with \n'-r taxid1 -r taxid2'), "
                          "the sequences having at least one of them are ignored.")


def Filter_generator(options, tax_filter):
    #taxfilter = taxonomyFilterGenerator(options)

    # Initialize conditions
    predicates = None
    if "predicates" in options:
        predicates = options["predicates"]
    attributes = None
    if "attributes" in options:
        attributes = options["attributes"]
    lmax = None
    if "lmax" in options:
        lmax = options["lmax"]
    lmin = None
    if "lmin" in options:
        lmin = options["lmin"]
    invert_selection = options["invert_selection"]
    id_set = None
    if "id_list" in options:
        id_set = set(x.strip() for x in open(options["id_list"]))

    # Initialize the regular expression patterns
    seq_pattern = None
    if "seq_pattern" in options:
        seq_pattern = re.compile(tobytes(options["seq_pattern"]), re.I)
    id_pattern = None
    if "id_pattern" in options:
        id_pattern = re.compile(tobytes(options["id_pattern"]))
    def_pattern = None
    if "def_pattern" in options:
        def_pattern = re.compile(tobytes(options["def_pattern"]))
    attribute_patterns={}
    if "attribute_patterns" in options:
        for p in options["attribute_patterns"]:
            attribute, pattern = p.split(":", 1)
            attribute_patterns[tobytes(attribute)] = re.compile(tobytes(pattern))

    def filter(line, loc_env):
        cdef bint good = True

        if seq_pattern and hasattr(line, "seq"):
            good = <bint>(seq_pattern.search(line.seq))

        if good and id_pattern and hasattr(line, "id"):
            good = <bint>(id_pattern.search(line.id))

        if good and id_set is not None and hasattr(line, "id"):
            good = line.id in id_set

        if good and def_pattern and hasattr(line, "definition"):
            good = <bint>(def_pattern.search(line.definition))

        if good and attributes:  # TODO discuss that we test not None
            good = reduce(lambda bint x, bint y: x and y,
                           (line[attribute] is not None for attribute in attributes),
                           True)

        if good and attribute_patterns:
            good = (reduce(lambda bint x, bint y : x and y,
                        (line[attribute] is not None for attribute in attributes),
                        True)
                    and
                    reduce(lambda bint x, bint y: x and y,
                        (<bint>(attribute_patterns[attribute].search(tobytes(str(line[attribute]))))
                        for attribute in attribute_patterns),
                        True)
                   )

        if good and predicates:
            good = (reduce(lambda bint x, bint y: x and y,
                    (bool(eval(p, loc_env, line))
                    for p in predicates), True))

        if good and lmin:
            good = len(line) >= lmin

        if good and lmax:
            good = len(line) <= lmax

        if good:
            good = tax_filter(line)

        if invert_selection :
            good = not good

        return good

    return filter


def Taxonomy_filter_generator(taxo, options):
    if taxo is not None:
        def tax_filter(seq):
            good = True
            if b'TAXID' in seq and seq[b'TAXID'] is not None:   # TODO use macro
                taxid = seq[b'TAXID']
                if "required_ranks" in options and options["required_ranks"]:
                    taxon_at_rank = reduce(lambda x,y: x and y,
                                           (taxo.get_taxon_at_rank(seq[b'TAXID'], rank) is not None
                                            for rank in options["required_ranks"]),
                                           True)
                    good = good and taxon_at_rank
                if "required_taxids" in options and options["required_taxids"]:
                    good = good and reduce(lambda x,y: x or y,
                                           (taxo.is_ancestor(r, taxid)
                                            for r in options["required_taxids"]),
                                           False)
                if "ignored_taxids" in options and options["ignored_taxids"]:
                    good = good and not reduce(lambda x,y: x or y,
                                               (taxo.is_ancestor(r,taxid)
                                                for r in options["ignored_taxids"]),
                                               False)
            return good
    else:
        def tax_filter(seq):
            return True
    return tax_filter


def run(config):

    DMS.obi_atexit()

    logger("info", "obi grep")

    # Open the input
    input = open_uri(config["obi"]["inputURI"])
    if input is None:
        raise Exception("Could not read input view")
    i_view = input[1]

    # Read the name of the output view
    uri = config["obi"]["outputURI"].split("/")
    if len(uri)==2:
        # Check that input and output DMS are the same (predicate, to discuss)
        if config["obi"]["inputURI"].split("/")[0] != uri[0]:
            raise Exception("Input and output DMS must be the same")
        output_view_name = uri[1]
    else:
        output_view_name = uri[0]

    if "taxoURI" in config["obi"] :   # TODO default None problem
        taxo_uri = open_uri(config["obi"]["taxoURI"])
        if taxo_uri is None:
            raise Exception("Couldn't open taxonomy")
        taxo = taxo_uri[1]
    else :
        taxo = None

    # Initialize the progress bar
    pb = ProgressBar(len(i_view), config, seconde=5)

    # Apply filter
    tax_filter = Taxonomy_filter_generator(taxo, config["grep"])
    filter = Filter_generator(config["grep"], tax_filter)
    selection = Line_selection(i_view)
    for i in range(len(i_view)):
        pb(i)
        line = i_view[i]

        loc_env = {"sequence": line, "line": line, "taxonomy": taxo}

        good = filter(line, loc_env)

        if good :
            selection.append(i)

    # Create output view with the line selection
    try:
        o_view = selection.materialize(output_view_name)
    except Exception, e:
        raise RollbackException("obi grep error, rollbacking view: "+str(e), o_view)

    # TODO DISCUSS if output URI to different DMS, copy view?

    # Save command config in View and DMS comments
    command_line = " ".join(sys.argv[1:])
    input_dms_name=[input[0].name]
    input_view_name=[input[1].name]
    if 'taxoURI' in config['obi'] and config['obi']['taxoURI'] is not None:
        input_dms_name.append(config['obi']['taxoURI'].split("/", 1)[0])
        input_view_name.append(config['obi']['taxoURI'].split("/", 1)[1])
    o_view.write_config(config, "grep", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
    input[0].record_command_line(command_line)  # TODO assuming input and output dms are the same

    print("\n")
    print(repr(o_view))

    input[0].close()
    #output[0].close()