From 112e12cab09bbda2393f4d74202ff288187be159 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Fri, 30 Oct 2020 10:45:20 +0100 Subject: [PATCH] Taxonomy: new functions to find taxa by name --- python/obitools3/dms/capi/obitaxonomy.pxd | 22 +++++- python/obitools3/dms/capi/obiview.pxd | 1 + python/obitools3/dms/taxo/taxo.pxd | 3 + python/obitools3/dms/taxo/taxo.pyx | 53 +++++++++++-- src/obidms_taxonomy.c | 12 +++ src/obidms_taxonomy.h | 45 ++++++++++- src/obiview.h | 96 ++++++++++++----------- 7 files changed, 174 insertions(+), 58 deletions(-) diff --git a/python/obitools3/dms/capi/obitaxonomy.pxd b/python/obitools3/dms/capi/obitaxonomy.pxd index 5871279..3f00a36 100755 --- a/python/obitools3/dms/capi/obitaxonomy.pxd +++ b/python/obitools3/dms/capi/obitaxonomy.pxd @@ -7,6 +7,8 @@ from libc.stdint cimport int32_t cdef extern from "obidms_taxonomy.h" nogil: + extern int MIN_LOCAL_TAXID + struct ecotxnode : int32_t taxid int32_t rank @@ -18,6 +20,13 @@ cdef extern from "obidms_taxonomy.h" nogil: ctypedef ecotxnode ecotx_t + struct econame_t : # can't get this struct to be accepted by Cython ('unknown size') + char* name + char* class_name + int32_t is_scientific_name + ecotxnode* taxon + + struct ecotxidx_t : int32_t count int32_t max_taxid @@ -30,9 +39,14 @@ cdef extern from "obidms_taxonomy.h" nogil: char** label + struct econameidx_t : + int32_t count + econame_t* names + + struct OBIDMS_taxonomy_t : ecorankidx_t* ranks -# econameidx_t* names + econameidx_t* names ecotxidx_t* taxa ctypedef OBIDMS_taxonomy_t* OBIDMS_taxonomy_p @@ -51,7 +65,11 @@ cdef extern from "obidms_taxonomy.h" nogil: ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx) ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid) + + char* obi_taxo_get_name_from_name_idx(OBIDMS_taxonomy_p taxonomy, int32_t idx) + ecotx_t* obi_taxo_get_taxon_from_name_idx(OBIDMS_taxonomy_p taxonomy, int32_t idx) + bint obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t 
other_taxid) ecotx_t* obi_taxo_get_species(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy) @@ -71,4 +89,4 @@ cdef extern from "obidms_taxonomy.h" nogil: int obi_taxo_add_preferred_name_with_taxon(OBIDMS_taxonomy_p tax, ecotx_t* taxon, const char* preferred_name) const char* obi_taxo_rank_index_to_label(int32_t rank_idx, ecorankidx_t* ranks) - \ No newline at end of file + diff --git a/python/obitools3/dms/capi/obiview.pxd b/python/obitools3/dms/capi/obiview.pxd index 8688bf6..db0f241 100755 --- a/python/obitools3/dms/capi/obiview.pxd +++ b/python/obitools3/dms/capi/obiview.pxd @@ -27,6 +27,7 @@ cdef extern from "obiview.h" nogil: extern const_char_p REVERSE_QUALITY_COLUMN extern const_char_p REVERSE_SEQUENCE_COLUMN extern const_char_p COUNT_COLUMN + extern const_char_p SCIENTIFIC_NAME_COLUMN extern const_char_p TAXID_COLUMN extern const_char_p MERGED_TAXID_COLUMN extern const_char_p MERGED_PREFIX diff --git a/python/obitools3/dms/taxo/taxo.pxd b/python/obitools3/dms/taxo/taxo.pxd index 0e4cbd0..3312624 100755 --- a/python/obitools3/dms/taxo/taxo.pxd +++ b/python/obitools3/dms/taxo/taxo.pxd @@ -11,11 +11,14 @@ cdef class Taxonomy(OBIWrapper) : cdef bytes _name cdef DMS _dms cdef list _ranks + cdef dict _name_dict cdef inline OBIDMS_taxonomy_p pointer(self) + cdef fill_name_dict(self) cpdef Taxon get_taxon_by_idx(self, int idx) cpdef Taxon get_taxon_by_taxid(self, int taxid) + cpdef Taxon get_taxon_by_name(self, object taxon_name, object restricting_taxid=*) cpdef write(self, object prefix) cpdef int add_taxon(self, str name, str rank_name, int parent_taxid, int min_taxid=*) cpdef object get_species(self, int taxid) diff --git a/python/obitools3/dms/taxo/taxo.pyx b/python/obitools3/dms/taxo/taxo.pyx index d6c0b4e..c0f1781 100755 --- a/python/obitools3/dms/taxo/taxo.pyx +++ b/python/obitools3/dms/taxo/taxo.pyx @@ -15,7 +15,11 @@ from ..capi.obitaxonomy cimport obi_taxonomy_exists, \ obi_taxo_get_species, \ obi_taxo_get_genus, \ obi_taxo_get_family, \ - ecotx_t + ecotx_t, 
\ + econame_t, \ + obi_taxo_get_name_from_name_idx, \ + obi_taxo_get_taxon_from_name_idx + from cpython.pycapsule cimport PyCapsule_New, PyCapsule_GetPointer import tarfile @@ -24,11 +28,29 @@ from libc.stdlib cimport free cdef class Taxonomy(OBIWrapper) : - # TODO function to import taxonomy? - + # TODO function to import taxonomy? + cdef inline OBIDMS_taxonomy_p pointer(self) : return (self._pointer) + cdef fill_name_dict(self): + print("Indexing taxon names...") + + cdef OBIDMS_taxonomy_p pointer = self.pointer() + cdef ecotx_t* taxon_p + cdef object taxon_capsule + cdef bytes name + cdef int count + cdef int n + + count = (pointer).names.count + + for n in range(count) : + name = obi_taxo_get_name_from_name_idx(pointer, n) + taxon_p = obi_taxo_get_taxon_from_name_idx(pointer, n) + taxon_capsule = PyCapsule_New(taxon_p, NULL, NULL) + self._name_dict[name] = Taxon(taxon_capsule, self) + @staticmethod def exists(DMS dms, object name) : @@ -75,7 +97,8 @@ cdef class Taxonomy(OBIWrapper) : taxo._dms = dms taxo._name = tobytes(name) - + taxo._name_dict = {} + taxo.fill_name_dict() taxo._ranks = [] for r in range((pointer).ranks.count) : taxo._ranks.append(obi_taxo_rank_index_to_label(r, (pointer).ranks)) @@ -118,7 +141,8 @@ cdef class Taxonomy(OBIWrapper) : taxo._dms = dms taxo._name = folder_path - + taxo._name_dict = {} + taxo.fill_name_dict() taxo._ranks = [] for r in range((pointer).ranks.count) : taxo._ranks.append(obi_taxo_rank_index_to_label(r, (pointer).ranks)) @@ -129,8 +153,8 @@ cdef class Taxonomy(OBIWrapper) : def __getitem__(self, object ref): if type(ref) == int : return self.get_taxon_by_taxid(ref) - else : - raise NotImplementedError() + elif type(ref) == str or type(ref) == bytes : + return self.get_taxon_by_name(ref) cpdef Taxon get_taxon_by_taxid(self, int taxid): @@ -143,6 +167,19 @@ cdef class Taxonomy(OBIWrapper) : return Taxon(taxon_capsule, self) + cpdef Taxon get_taxon_by_name(self, object taxon_name, object restricting_taxid=None): + taxon = 
self._name_dict.get(tobytes(taxon_name), None) + if not taxon: + return None + elif restricting_taxid: + if self.is_ancestor(restricting_taxid, taxon.taxid): + return taxon + else: + return None + else: + return taxon + + cpdef Taxon get_taxon_by_idx(self, int idx): cdef ecotx_t* taxa cdef ecotx_t* taxon_p @@ -232,7 +269,7 @@ cdef class Taxonomy(OBIWrapper) : taxa = self.pointer().taxa.taxon - # Yield each taxid + # Yield each taxon for t in range(self.pointer().taxa.count): taxon_p = (taxa+t) taxon_capsule = PyCapsule_New(taxon_p, NULL, NULL) diff --git a/src/obidms_taxonomy.c b/src/obidms_taxonomy.c index 0815aae..1b5b700 100755 --- a/src/obidms_taxonomy.c +++ b/src/obidms_taxonomy.c @@ -3649,6 +3649,18 @@ ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid } +char* obi_taxo_get_name_from_name_idx(OBIDMS_taxonomy_p taxonomy, int32_t idx) +{ + return (((taxonomy->names)->names)[idx]).name; +} + + +ecotx_t* obi_taxo_get_taxon_from_name_idx(OBIDMS_taxonomy_p taxonomy, int32_t idx) +{ + return (((taxonomy->names)->names)[idx]).taxon; +} + + int obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid) // TODO discuss that this doesn't work with deprecated taxids { ecotx_t* next_parent; diff --git a/src/obidms_taxonomy.h b/src/obidms_taxonomy.h index c5ba099..ffdb18d 100755 --- a/src/obidms_taxonomy.h +++ b/src/obidms_taxonomy.h @@ -447,8 +447,51 @@ ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy); const char* obi_taxo_rank_index_to_label(int32_t rank_idx, ecorankidx_t* ranks); -// TODO +/** + * @brief Function checking whether a taxid is included in a subset of the taxonomy. + * + * @param taxonomy A pointer on the taxonomy structure. + * @param restrict_to_taxids An array of taxids. The researched taxid must be under at least one of those array taxids. + * @param count Number of taxids in restrict_to_taxids. + * @param taxid The taxid to check. 
+ * + * @returns A value indicating whether the taxid is included in the chosen subset of the taxonomy. + * @retval 0 if the taxid is not included in the subset of the taxonomy. + * @retval 1 if the taxid is included in the subset of the taxonomy. + * + * @since October 2020 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ int obi_taxo_is_taxid_included(OBIDMS_taxonomy_p taxonomy, int32_t* restrict_to_taxids, int32_t count, int32_t taxid); + + +/** + * @brief Function returning the name of a taxon from its index in the taxonomy name index (econameidx_t). + * + * @param taxonomy A pointer on the taxonomy structure. + * @param idx The index at which the name is in the taxonomy name index (econameidx_t). + * + * @returns The taxon name. + * + * @since October 2020 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +char* obi_taxo_get_name_from_name_idx(OBIDMS_taxonomy_p taxonomy, int32_t idx); + + +/** + * @brief Function returning a taxon structure from its index in the taxonomy name index (econameidx_t). + * + * @param taxonomy A pointer on the taxonomy structure. + * @param idx The index at which the taxon is in the taxonomy name index (econameidx_t). + * + * @returns The taxon structure. + * + * @since October 2020 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +ecotx_t* obi_taxo_get_taxon_from_name_idx(OBIDMS_taxonomy_p taxonomy, int32_t idx); + diff --git a/src/obiview.h b/src/obiview.h index 0805cca..e928741 100755 --- a/src/obiview.h +++ b/src/obiview.h @@ -30,54 +30,56 @@ #include "obiblob.h" -#define OBIVIEW_NAME_MAX_LENGTH (249) /**< The maximum length of an OBIDMS view name, without the extension. - */ -#define VIEW_TYPE_MAX_LENGTH (1024) /**< The maximum length of the type name of a view. - */ -#define LINES_COLUMN_NAME "LINES" /**< The name of the column containing the line selections - * in all views. 
- */ -#define VIEW_TYPE_NUC_SEQS "NUC_SEQS_VIEW" /**< The type name of views based on nucleotide sequences - * and their metadata. - */ -#define NUC_SEQUENCE_COLUMN "NUC_SEQ" /**< The name of the column containing the nucleotide sequences - * in NUC_SEQS_VIEW views. - */ -#define ID_COLUMN "ID" /**< The name of the column containing the sequence identifiers - * in NUC_SEQS_VIEW views. - */ -#define DEFINITION_COLUMN "DEFINITION" /**< The name of the column containing the sequence definitions - * in NUC_SEQS_VIEW views. - */ -#define QUALITY_COLUMN "QUALITY" /**< The name of the column containing the sequence qualities - * in NUC_SEQS_VIEW views. - */ -#define REVERSE_QUALITY_COLUMN "REVERSE_QUALITY" /**< The name of the column containing the sequence qualities - * of the reverse read (generated by ngsfilter, used by alignpairedend). - */ +#define OBIVIEW_NAME_MAX_LENGTH (249) /**< The maximum length of an OBIDMS view name, without the extension. + */ +#define VIEW_TYPE_MAX_LENGTH (1024) /**< The maximum length of the type name of a view. + */ +#define LINES_COLUMN_NAME "LINES" /**< The name of the column containing the line selections + * in all views. + */ +#define VIEW_TYPE_NUC_SEQS "NUC_SEQS_VIEW" /**< The type name of views based on nucleotide sequences + * and their metadata. + */ +#define NUC_SEQUENCE_COLUMN "NUC_SEQ" /**< The name of the column containing the nucleotide sequences + * in NUC_SEQS_VIEW views. + */ +#define ID_COLUMN "ID" /**< The name of the column containing the sequence identifiers + * in NUC_SEQS_VIEW views. + */ +#define DEFINITION_COLUMN "DEFINITION" /**< The name of the column containing the sequence definitions + * in NUC_SEQS_VIEW views. + */ +#define QUALITY_COLUMN "QUALITY" /**< The name of the column containing the sequence qualities + * in NUC_SEQS_VIEW views. 
+ */ +#define REVERSE_QUALITY_COLUMN "REVERSE_QUALITY" /**< The name of the column containing the sequence qualities + * of the reverse read (generated by ngsfilter, used by alignpairedend). + */ #define REVERSE_SEQUENCE_COLUMN "REVERSE_SEQUENCE" /**< The name of the column containing the sequence - * of the reverse read (generated by ngsfilter, used by alignpairedend). - */ -#define QUALITY_COLUMN "QUALITY" /**< The name of the column containing the sequence qualities - * in NUC_SEQS_VIEW views. - */ -#define COUNT_COLUMN "COUNT" /**< The name of the column containing the sequence counts - * in NUC_SEQS_VIEW views. - */ -#define TAXID_COLUMN "TAXID" /**< The name of the column containing the taxids. TODO subtype of INT column? - */ -#define MERGED_TAXID_COLUMN "MERGED_TAXID" /**< The name of the column containing the merged taxids information. - */ -#define MERGED_PREFIX "MERGED_" /**< The prefix to prepend to column names when merging informations during obi uniq. - */ -#define TAXID_DIST_COLUMN "TAXID_DIST" /**< The name of the column containing a dictionary of taxid:[list of ids] when merging informations during obi uniq. - */ -#define MERGED_COLUMN "MERGED" /**< The name of the column containing a list of ids when merging informations during obi uniq. - */ -#define ID_PREFIX "seq" /**< The default prefix of sequence identifiers in automatic ID columns. - */ -#define PREDICATE_KEY "predicates" /**< The key used in the json-formatted view comments to store predicates. - */ + * of the reverse read (generated by ngsfilter, used by alignpairedend). + */ +#define QUALITY_COLUMN "QUALITY" /**< The name of the column containing the sequence qualities + * in NUC_SEQS_VIEW views. + */ +#define COUNT_COLUMN "COUNT" /**< The name of the column containing the sequence counts + * in NUC_SEQS_VIEW views. + */ +#define SCIENTIFIC_NAME_COLUMN "SCIENTIFIC_NAME" /**< The name of the column containing the taxon scientific name. 
+ */ +#define TAXID_COLUMN "TAXID" /**< The name of the column containing the taxids. TODO subtype of INT column? + */ +#define MERGED_TAXID_COLUMN "MERGED_TAXID" /**< The name of the column containing the merged taxids information. + */ +#define MERGED_PREFIX "MERGED_" /**< The prefix to prepend to column names when merging information during obi uniq. + */ +#define TAXID_DIST_COLUMN "TAXID_DIST" /**< The name of the column containing a dictionary of taxid:[list of ids] when merging information during obi uniq. + */ +#define MERGED_COLUMN "MERGED" /**< The name of the column containing a list of ids when merging information during obi uniq. + */ +#define ID_PREFIX "seq" /**< The default prefix of sequence identifiers in automatic ID columns. + */ +#define PREDICATE_KEY "predicates" /**< The key used in the json-formatted view comments to store predicates. + */ /**