Taxonomy: new functions to find taxa by name
This commit is contained in:
@ -7,6 +7,8 @@ from libc.stdint cimport int32_t
|
|||||||
|
|
||||||
cdef extern from "obidms_taxonomy.h" nogil:
|
cdef extern from "obidms_taxonomy.h" nogil:
|
||||||
|
|
||||||
|
extern int MIN_LOCAL_TAXID
|
||||||
|
|
||||||
struct ecotxnode :
|
struct ecotxnode :
|
||||||
int32_t taxid
|
int32_t taxid
|
||||||
int32_t rank
|
int32_t rank
|
||||||
@ -18,6 +20,13 @@ cdef extern from "obidms_taxonomy.h" nogil:
|
|||||||
ctypedef ecotxnode ecotx_t
|
ctypedef ecotxnode ecotx_t
|
||||||
|
|
||||||
|
|
||||||
|
struct econame_t : # can't get this struct to be accepted by Cython ('unknown size')
|
||||||
|
char* name
|
||||||
|
char* class_name
|
||||||
|
int32_t is_scientific_name
|
||||||
|
ecotxnode* taxon
|
||||||
|
|
||||||
|
|
||||||
struct ecotxidx_t :
|
struct ecotxidx_t :
|
||||||
int32_t count
|
int32_t count
|
||||||
int32_t max_taxid
|
int32_t max_taxid
|
||||||
@ -30,9 +39,14 @@ cdef extern from "obidms_taxonomy.h" nogil:
|
|||||||
char** label
|
char** label
|
||||||
|
|
||||||
|
|
||||||
|
struct econameidx_t :
|
||||||
|
int32_t count
|
||||||
|
econame_t* names
|
||||||
|
|
||||||
|
|
||||||
struct OBIDMS_taxonomy_t :
|
struct OBIDMS_taxonomy_t :
|
||||||
ecorankidx_t* ranks
|
ecorankidx_t* ranks
|
||||||
# econameidx_t* names
|
econameidx_t* names
|
||||||
ecotxidx_t* taxa
|
ecotxidx_t* taxa
|
||||||
|
|
||||||
ctypedef OBIDMS_taxonomy_t* OBIDMS_taxonomy_p
|
ctypedef OBIDMS_taxonomy_t* OBIDMS_taxonomy_p
|
||||||
@ -52,6 +66,10 @@ cdef extern from "obidms_taxonomy.h" nogil:
|
|||||||
|
|
||||||
ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid)
|
ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid)
|
||||||
|
|
||||||
|
char* obi_taxo_get_name_from_name_idx(OBIDMS_taxonomy_p taxonomy, int32_t idx)
|
||||||
|
|
||||||
|
ecotx_t* obi_taxo_get_taxon_from_name_idx(OBIDMS_taxonomy_p taxonomy, int32_t idx)
|
||||||
|
|
||||||
bint obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid)
|
bint obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid)
|
||||||
|
|
||||||
ecotx_t* obi_taxo_get_species(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
|
ecotx_t* obi_taxo_get_species(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
|
||||||
|
@ -27,6 +27,7 @@ cdef extern from "obiview.h" nogil:
|
|||||||
extern const_char_p REVERSE_QUALITY_COLUMN
|
extern const_char_p REVERSE_QUALITY_COLUMN
|
||||||
extern const_char_p REVERSE_SEQUENCE_COLUMN
|
extern const_char_p REVERSE_SEQUENCE_COLUMN
|
||||||
extern const_char_p COUNT_COLUMN
|
extern const_char_p COUNT_COLUMN
|
||||||
|
extern const_char_p SCIENTIFIC_NAME_COLUMN
|
||||||
extern const_char_p TAXID_COLUMN
|
extern const_char_p TAXID_COLUMN
|
||||||
extern const_char_p MERGED_TAXID_COLUMN
|
extern const_char_p MERGED_TAXID_COLUMN
|
||||||
extern const_char_p MERGED_PREFIX
|
extern const_char_p MERGED_PREFIX
|
||||||
|
@ -11,11 +11,14 @@ cdef class Taxonomy(OBIWrapper) :
|
|||||||
cdef bytes _name
|
cdef bytes _name
|
||||||
cdef DMS _dms
|
cdef DMS _dms
|
||||||
cdef list _ranks
|
cdef list _ranks
|
||||||
|
cdef dict _name_dict
|
||||||
|
|
||||||
cdef inline OBIDMS_taxonomy_p pointer(self)
|
cdef inline OBIDMS_taxonomy_p pointer(self)
|
||||||
|
cdef fill_name_dict(self)
|
||||||
|
|
||||||
cpdef Taxon get_taxon_by_idx(self, int idx)
|
cpdef Taxon get_taxon_by_idx(self, int idx)
|
||||||
cpdef Taxon get_taxon_by_taxid(self, int taxid)
|
cpdef Taxon get_taxon_by_taxid(self, int taxid)
|
||||||
|
cpdef Taxon get_taxon_by_name(self, object taxon_name, object restricting_taxid=*)
|
||||||
cpdef write(self, object prefix)
|
cpdef write(self, object prefix)
|
||||||
cpdef int add_taxon(self, str name, str rank_name, int parent_taxid, int min_taxid=*)
|
cpdef int add_taxon(self, str name, str rank_name, int parent_taxid, int min_taxid=*)
|
||||||
cpdef object get_species(self, int taxid)
|
cpdef object get_species(self, int taxid)
|
||||||
|
@ -15,7 +15,11 @@ from ..capi.obitaxonomy cimport obi_taxonomy_exists, \
|
|||||||
obi_taxo_get_species, \
|
obi_taxo_get_species, \
|
||||||
obi_taxo_get_genus, \
|
obi_taxo_get_genus, \
|
||||||
obi_taxo_get_family, \
|
obi_taxo_get_family, \
|
||||||
ecotx_t
|
ecotx_t, \
|
||||||
|
econame_t, \
|
||||||
|
obi_taxo_get_name_from_name_idx, \
|
||||||
|
obi_taxo_get_taxon_from_name_idx
|
||||||
|
|
||||||
|
|
||||||
from cpython.pycapsule cimport PyCapsule_New, PyCapsule_GetPointer
|
from cpython.pycapsule cimport PyCapsule_New, PyCapsule_GetPointer
|
||||||
import tarfile
|
import tarfile
|
||||||
@ -29,6 +33,24 @@ cdef class Taxonomy(OBIWrapper) :
|
|||||||
cdef inline OBIDMS_taxonomy_p pointer(self) :
|
cdef inline OBIDMS_taxonomy_p pointer(self) :
|
||||||
return <OBIDMS_taxonomy_p>(self._pointer)
|
return <OBIDMS_taxonomy_p>(self._pointer)
|
||||||
|
|
||||||
|
cdef fill_name_dict(self):
    """
    Fill self._name_dict from the name index of the underlying C taxonomy.

    For every position of the name index, the name found at that position
    (as bytes) becomes a key, and the value is a Taxon built from a capsule
    holding the taxon pointer found at the same position. If the same name
    appears at several positions, the last one seen replaces the earlier ones.
    """
    print("Indexing taxon names...")

    cdef OBIDMS_taxonomy_p taxo_p = self.pointer()
    cdef int name_count = taxo_p.names.count
    cdef int idx
    cdef bytes taxon_name
    cdef ecotx_t* node_p
    cdef object capsule

    for idx in range(name_count) :
        taxon_name = obi_taxo_get_name_from_name_idx(taxo_p, idx)
        node_p = obi_taxo_get_taxon_from_name_idx(taxo_p, idx)
        # Capsule created without a name and without a destructor.
        capsule = PyCapsule_New(node_p, NULL, NULL)
        self._name_dict[taxon_name] = Taxon(capsule, self)
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def exists(DMS dms, object name) :
|
def exists(DMS dms, object name) :
|
||||||
@ -75,7 +97,8 @@ cdef class Taxonomy(OBIWrapper) :
|
|||||||
|
|
||||||
taxo._dms = dms
|
taxo._dms = dms
|
||||||
taxo._name = tobytes(name)
|
taxo._name = tobytes(name)
|
||||||
|
taxo._name_dict = {}
|
||||||
|
taxo.fill_name_dict()
|
||||||
taxo._ranks = []
|
taxo._ranks = []
|
||||||
for r in range((<OBIDMS_taxonomy_p>pointer).ranks.count) :
|
for r in range((<OBIDMS_taxonomy_p>pointer).ranks.count) :
|
||||||
taxo._ranks.append(obi_taxo_rank_index_to_label(r, (<OBIDMS_taxonomy_p>pointer).ranks))
|
taxo._ranks.append(obi_taxo_rank_index_to_label(r, (<OBIDMS_taxonomy_p>pointer).ranks))
|
||||||
@ -118,7 +141,8 @@ cdef class Taxonomy(OBIWrapper) :
|
|||||||
|
|
||||||
taxo._dms = dms
|
taxo._dms = dms
|
||||||
taxo._name = folder_path
|
taxo._name = folder_path
|
||||||
|
taxo._name_dict = {}
|
||||||
|
taxo.fill_name_dict()
|
||||||
taxo._ranks = []
|
taxo._ranks = []
|
||||||
for r in range((<OBIDMS_taxonomy_p>pointer).ranks.count) :
|
for r in range((<OBIDMS_taxonomy_p>pointer).ranks.count) :
|
||||||
taxo._ranks.append(obi_taxo_rank_index_to_label(r, (<OBIDMS_taxonomy_p>pointer).ranks))
|
taxo._ranks.append(obi_taxo_rank_index_to_label(r, (<OBIDMS_taxonomy_p>pointer).ranks))
|
||||||
@ -129,8 +153,8 @@ cdef class Taxonomy(OBIWrapper) :
|
|||||||
def __getitem__(self, object ref):
    """
    Look up a taxon by taxid or by name.

    @param ref: an int (dispatched to get_taxon_by_taxid) or a str/bytes
                (dispatched to get_taxon_by_name).
    @return: whatever the selected lookup method returns.
    @raise TypeError: if ref is neither an int, a str nor a bytes object.
    """
    if type(ref) == int :
        return self.get_taxon_by_taxid(ref)
    elif type(ref) == str or type(ref) == bytes :
        return self.get_taxon_by_name(ref)
    else :
        # Without this branch an unsupported key type would silently yield None.
        raise TypeError("Taxonomy keys must be int (taxid), str or bytes (name), not " + type(ref).__name__)
|
||||||
|
|
||||||
|
|
||||||
cpdef Taxon get_taxon_by_taxid(self, int taxid):
|
cpdef Taxon get_taxon_by_taxid(self, int taxid):
|
||||||
@ -143,6 +167,19 @@ cdef class Taxonomy(OBIWrapper) :
|
|||||||
return Taxon(taxon_capsule, self)
|
return Taxon(taxon_capsule, self)
|
||||||
|
|
||||||
|
|
||||||
|
cpdef Taxon get_taxon_by_name(self, object taxon_name, object restricting_taxid=None):
    """
    Return the taxon registered under a name in self._name_dict.

    @param taxon_name: the name to look up; it is converted with tobytes()
                       before being used as a dictionary key.
    @param restricting_taxid: optional taxid. When given (and truthy), the
                              taxon is only returned if
                              self.is_ancestor(restricting_taxid, taxon.taxid)
                              is true.
    @return: the matching Taxon, or None if the name is not in the dictionary
             or if the restriction is not satisfied.
    """
    taxon = self._name_dict.get(tobytes(taxon_name))

    # Explicit identity test: "name not found" must not depend on the
    # truth value of a Taxon instance.
    if taxon is None:
        return None

    if restricting_taxid and not self.is_ancestor(restricting_taxid, taxon.taxid):
        return None

    return taxon
|
||||||
|
|
||||||
|
|
||||||
cpdef Taxon get_taxon_by_idx(self, int idx):
|
cpdef Taxon get_taxon_by_idx(self, int idx):
|
||||||
cdef ecotx_t* taxa
|
cdef ecotx_t* taxa
|
||||||
cdef ecotx_t* taxon_p
|
cdef ecotx_t* taxon_p
|
||||||
@ -232,7 +269,7 @@ cdef class Taxonomy(OBIWrapper) :
|
|||||||
|
|
||||||
taxa = self.pointer().taxa.taxon
|
taxa = self.pointer().taxa.taxon
|
||||||
|
|
||||||
# Yield each taxid
|
# Yield each taxon
|
||||||
for t in range(self.pointer().taxa.count):
|
for t in range(self.pointer().taxa.count):
|
||||||
taxon_p = <ecotx_t*> (taxa+t)
|
taxon_p = <ecotx_t*> (taxa+t)
|
||||||
taxon_capsule = PyCapsule_New(taxon_p, NULL, NULL)
|
taxon_capsule = PyCapsule_New(taxon_p, NULL, NULL)
|
||||||
|
@ -3649,6 +3649,18 @@ ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Returns the name stored at position idx of the taxonomy name index.
 *
 * @param taxonomy A pointer to the taxonomy structure.
 * @param idx The position of the name in the taxonomy name index (econameidx_t).
 *
 * @returns The name stored at that position of the name index.
 * @retval NULL if taxonomy or its name index is NULL, or if idx is not in [0, count).
 */
char* obi_taxo_get_name_from_name_idx(OBIDMS_taxonomy_p taxonomy, int32_t idx)
{
	if ((taxonomy == NULL) || (taxonomy->names == NULL))
	{
		return NULL;
	}

	// Reject indices that would read outside of the name array
	if ((idx < 0) || (idx >= (taxonomy->names)->count))
	{
		return NULL;
	}

	return (((taxonomy->names)->names)[idx]).name;
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Returns the taxon stored at position idx of the taxonomy name index.
 *
 * @param taxonomy A pointer to the taxonomy structure.
 * @param idx The position of the entry in the taxonomy name index (econameidx_t).
 *
 * @returns The taxon pointer stored at that position of the name index.
 * @retval NULL if taxonomy or its name index is NULL, or if idx is not in [0, count).
 */
ecotx_t* obi_taxo_get_taxon_from_name_idx(OBIDMS_taxonomy_p taxonomy, int32_t idx)
{
	if ((taxonomy == NULL) || (taxonomy->names == NULL))
	{
		return NULL;
	}

	// Reject indices that would read outside of the name array
	if ((idx < 0) || (idx >= (taxonomy->names)->count))
	{
		return NULL;
	}

	return (((taxonomy->names)->names)[idx]).taxon;
}
|
||||||
|
|
||||||
|
|
||||||
int obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid) // TODO discuss that this doesn't work with deprecated taxids
|
int obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid) // TODO discuss that this doesn't work with deprecated taxids
|
||||||
{
|
{
|
||||||
ecotx_t* next_parent;
|
ecotx_t* next_parent;
|
||||||
|
@ -447,8 +447,51 @@ ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy);
|
|||||||
const char* obi_taxo_rank_index_to_label(int32_t rank_idx, ecorankidx_t* ranks);
|
const char* obi_taxo_rank_index_to_label(int32_t rank_idx, ecorankidx_t* ranks);
|
||||||
|
|
||||||
|
|
||||||
// TODO
|
/**
|
||||||
|
* @brief Function checking whether a taxid is included in a subset of the taxonomy.
|
||||||
|
*
|
||||||
|
* @param taxonomy A pointer to the taxonomy structure.
|
||||||
|
* @param restrict_to_taxids An array of taxids. The taxid being checked must be under at least one of the taxids in this array.
|
||||||
|
* @param count Number of taxids in restrict_to_taxids.
|
||||||
|
* @param taxid The taxid to check.
|
||||||
|
*
|
||||||
|
* @returns A value indicating whether the taxid is included in the chosen subset of the taxonomy.
|
||||||
|
* @retval 0 if the taxid is not included in the subset of the taxonomy.
|
||||||
|
* @retval 1 if the taxid is included in the subset of the taxonomy.
|
||||||
|
*
|
||||||
|
* @since October 2020
|
||||||
|
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||||
|
*/
|
||||||
int obi_taxo_is_taxid_included(OBIDMS_taxonomy_p taxonomy,
|
int obi_taxo_is_taxid_included(OBIDMS_taxonomy_p taxonomy,
|
||||||
int32_t* restrict_to_taxids,
|
int32_t* restrict_to_taxids,
|
||||||
int32_t count,
|
int32_t count,
|
||||||
int32_t taxid);
|
int32_t taxid);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Function returning the name of a taxon from its index in the taxonomy name index (econameidx_t).
|
||||||
|
*
|
||||||
|
* @param taxonomy A pointer to the taxonomy structure.
|
||||||
|
* @param idx The index of the name in the taxonomy name index (econameidx_t).
|
||||||
|
*
|
||||||
|
* @returns The taxon name.
|
||||||
|
*
|
||||||
|
* @since October 2020
|
||||||
|
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||||
|
*/
|
||||||
|
char* obi_taxo_get_name_from_name_idx(OBIDMS_taxonomy_p taxonomy, int32_t idx);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Function returning a taxon structure from its index in the taxonomy name index (econameidx_t).
|
||||||
|
*
|
||||||
|
* @param taxonomy A pointer to the taxonomy structure.
|
||||||
|
* @param idx The index of the taxon's entry in the taxonomy name index (econameidx_t).
|
||||||
|
*
|
||||||
|
* @returns The taxon structure.
|
||||||
|
*
|
||||||
|
* @since October 2020
|
||||||
|
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||||
|
*/
|
||||||
|
ecotx_t* obi_taxo_get_taxon_from_name_idx(OBIDMS_taxonomy_p taxonomy, int32_t idx);
|
||||||
|
|
||||||
|
@ -64,6 +64,8 @@
|
|||||||
#define COUNT_COLUMN "COUNT" /**< The name of the column containing the sequence counts
|
#define COUNT_COLUMN "COUNT" /**< The name of the column containing the sequence counts
|
||||||
* in NUC_SEQS_VIEW views.
|
* in NUC_SEQS_VIEW views.
|
||||||
*/
|
*/
|
||||||
|
#define SCIENTIFIC_NAME_COLUMN "SCIENTIFIC_NAME" /**< The name of the column containing the taxon scientific name.
|
||||||
|
*/
|
||||||
#define TAXID_COLUMN "TAXID" /**< The name of the column containing the taxids. TODO subtype of INT column?
|
#define TAXID_COLUMN "TAXID" /**< The name of the column containing the taxids. TODO subtype of INT column?
|
||||||
*/
|
*/
|
||||||
#define MERGED_TAXID_COLUMN "MERGED_TAXID" /**< The name of the column containing the merged taxids information.
|
#define MERGED_TAXID_COLUMN "MERGED_TAXID" /**< The name of the column containing the merged taxids information.
|
||||||
|
Reference in New Issue
Block a user