From 7f6d1597fce55f01d3f0e35db79e1996b5f75a40 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Sat, 28 Jul 2018 16:48:11 +0200 Subject: [PATCH] Taxonomy: added functions to check if a taxonomy already exists in a DMS, and added taxdump import from a compressed file --- .../obitools3/apps/optiongroups/__init__.py | 6 +-- python/obitools3/dms/capi/obitaxonomy.pxd | 4 +- python/obitools3/dms/taxo/taxo.pyx | 45 +++++++++++++--- src/obidms_taxonomy.c | 51 ++++++++++++++++++- src/obidms_taxonomy.h | 26 ++++++++++ 5 files changed, 120 insertions(+), 12 deletions(-) diff --git a/python/obitools3/apps/optiongroups/__init__.py b/python/obitools3/apps/optiongroups/__init__.py index d2263a6..6bd6d50 100644 --- a/python/obitools3/apps/optiongroups/__init__.py +++ b/python/obitools3/apps/optiongroups/__init__.py @@ -165,9 +165,9 @@ def __addTaxdumpInputOption(optionManager): group = optionManager.add_argument_group("Input format options for taxdump") group.add_argument('--taxdump', - action="store", dest="obi:taxdump", - default=None, - help="Taxdump path") + action="store_true", dest="obi:taxdump", + default=False, + help="Whether the input is a taxdump") def addMinimalInputOption(optionManager): __addInputOption(optionManager) diff --git a/python/obitools3/dms/capi/obitaxonomy.pxd b/python/obitools3/dms/capi/obitaxonomy.pxd index 80b4913..5871279 100644 --- a/python/obitools3/dms/capi/obitaxonomy.pxd +++ b/python/obitools3/dms/capi/obitaxonomy.pxd @@ -37,7 +37,9 @@ cdef extern from "obidms_taxonomy.h" nogil: ctypedef OBIDMS_taxonomy_t* OBIDMS_taxonomy_p - + + int obi_taxonomy_exists(OBIDMS_p dms, const char* taxonomy_name) + OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const_char_p taxonomy_name, bint read_alternative_names) OBIDMS_taxonomy_p obi_read_taxdump(const_char_p taxdump) diff --git a/python/obitools3/dms/taxo/taxo.pyx b/python/obitools3/dms/taxo/taxo.pyx index 86ebcf4..3cdd500 100644 --- a/python/obitools3/dms/taxo/taxo.pyx +++ 
b/python/obitools3/dms/taxo/taxo.pyx @@ -2,7 +2,8 @@ from obitools3.utils cimport str2bytes, bytes2str, tobytes, tostr -from ..capi.obitaxonomy cimport obi_read_taxonomy, \ +from ..capi.obitaxonomy cimport obi_taxonomy_exists, \ + obi_read_taxonomy, \ obi_read_taxdump, \ obi_write_taxonomy, \ obi_close_taxonomy, \ @@ -17,6 +18,8 @@ from ..capi.obitaxonomy cimport obi_read_taxonomy, \ from cpython.pycapsule cimport PyCapsule_New, PyCapsule_GetPointer +import tarfile + cdef class Taxonomy(OBIWrapper) : # TODO function to import taxonomy? @@ -25,6 +28,16 @@ cdef class Taxonomy(OBIWrapper) : return (self._pointer) + @staticmethod + def exists(DMS dms, object name) : + e = obi_taxonomy_exists(dms.pointer(), tobytes(name)) + if e < 0: + raise RuntimeError("Error : Cannot check if taxonomy %s exists" + % tostr(name)) + else: + return e + + @staticmethod def open(DMS dms, object name) : @@ -46,27 +59,45 @@ cdef class Taxonomy(OBIWrapper) : taxo._ranks = [] for r in range((pointer).ranks.count) : taxo._ranks.append(obi_taxo_rank_index_to_label(r, (pointer).ranks)) - + return taxo @staticmethod - def open_taxdump(DMS dms, object name) : + def open_taxdump(DMS dms, object path) : cdef void* pointer cdef Taxonomy taxo - - pointer = obi_read_taxdump(tobytes(name)) + cdef bytes path_b + cdef int idx + + path_b = tobytes(path) + folder_path = path_b + + if path_b.endswith(b"tar.gz") or path_b.endswith(b"tar"): + idx = path_b.index(b".tar") + folder_path = path_b[:idx] + + if path_b.endswith(b"tar.gz"): + tar = tarfile.open(path_b, "r:gz") + tar.extractall(path=tostr(folder_path)) + tar.close() + elif path_b.endswith(b"tar"): + tar = tarfile.open(path_b, "r:") + tar.extractall(path=tostr(folder_path)) + tar.close() + + pointer = obi_read_taxdump(folder_path) if pointer == NULL : raise RuntimeError("Error : Cannot read taxonomy %s" - % tostr(name)) + % tostr(folder_path)) taxo = OBIWrapper.new_wrapper(Taxonomy, pointer) dms.register(taxo) taxo._dms = dms - taxo._name = 
tobytes(name) + taxo._name = folder_path taxo._ranks = [] for r in range((pointer).ranks.count) : diff --git a/src/obidms_taxonomy.c b/src/obidms_taxonomy.c index 060c06d..1e113b5 100644 --- a/src/obidms_taxonomy.c +++ b/src/obidms_taxonomy.c @@ -2740,6 +2740,35 @@ int read_names_dmp(const char* taxdump, OBIDMS_taxonomy_p tax) **********************************************************************/ +int obi_taxonomy_exists(OBIDMS_p dms, const char* taxonomy_name) +{ + char* taxonomy_path; + DIR* dir; + + taxonomy_path = get_taxonomy_path(dms, taxonomy_name); + if (taxonomy_path == NULL) + return -1; + + dir = opendir(taxonomy_path); + if (dir) + { + /* Directory exists. */ + closedir(dir); + return 1; + } + else if (ENOENT == errno) + { + /* Directory does not exist. */ + return 0; + } + else + { + /* opendir() failed for some other reason. */ + return -1; + } +} + + OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { OBIDMS_taxonomy_p tax; @@ -3705,7 +3734,7 @@ ecotx_t* obi_taxo_get_kingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy) ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy) { - static int32_t rankindex = -1; + static int32_t rankindex = -1; if (taxonomy == NULL) { @@ -3738,3 +3767,23 @@ const char* obi_taxo_rank_index_to_label(int32_t rank_idx, ecorankidx_t* ranks) return (ranks->label)[rank_idx]; } + +int obi_taxo_is_taxid_included(OBIDMS_taxonomy_p taxonomy, + int32_t* restrict_to_taxids, + int32_t count, + int32_t taxid) +{ + int i; + ecotx_t* taxon; + + taxon = obi_taxo_get_taxon_with_taxid(taxonomy, taxid); + + if (taxon) + for (i=0; i < count; i++) + if ((taxon->taxid == restrict_to_taxids[i]) || + (obi_taxo_is_taxon_under_taxid(taxon, restrict_to_taxids[i]))) + return 1; + + return 0; +} + diff --git a/src/obidms_taxonomy.h b/src/obidms_taxonomy.h index 6d50ba0..8b5c130 100644 --- a/src/obidms_taxonomy.h +++ b/src/obidms_taxonomy.h @@ -183,6 +183,25 @@ typedef struct OBIDMS_taxonomy_t { } OBIDMS_taxonomy_t, 
*OBIDMS_taxonomy_p; + +/** + * @brief Function checking whether a taxonomy is already registered in a DMS using its name. + * + * The check tests whether the directory of the taxonomy can be opened; the taxonomy itself is not read. + * + * @param dms A pointer to the DMS. + * @param taxonomy_name The name (prefix) of the taxonomy. + * + * @retval 1 if the taxonomy exists. + * @retval 0 if the taxonomy does not exist. + * @retval -1 if an error occurred. + * + * @since June 2018 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +int obi_taxonomy_exists(OBIDMS_p dms, const char* taxonomy_name); + + /** * @brief Function reading an NCBI taxdump and loading its information into a taxonomy structure. * @@ -414,3 +433,10 @@ ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy); * @see rank_label_to_index() */ const char* obi_taxo_rank_index_to_label(int32_t rank_idx, ecorankidx_t* ranks); + + +// TODO +int obi_taxo_is_taxid_included(OBIDMS_taxonomy_p taxonomy, + int32_t* restrict_to_taxids, + int32_t count, + int32_t taxid);