Taxonomy: new functions to find taxa by name

This commit is contained in:
Celine Mercier
2020-10-30 10:45:20 +01:00
parent b9b4cec5b5
commit 112e12cab0
7 changed files with 174 additions and 58 deletions

View File

@ -7,6 +7,8 @@ from libc.stdint cimport int32_t
cdef extern from "obidms_taxonomy.h" nogil:
extern int MIN_LOCAL_TAXID
struct ecotxnode :
int32_t taxid
int32_t rank
@ -18,6 +20,13 @@ cdef extern from "obidms_taxonomy.h" nogil:
ctypedef ecotxnode ecotx_t
struct econame_t : # can't get this struct to be accepted by Cython ('unknown size')
char* name
char* class_name
int32_t is_scientific_name
ecotxnode* taxon
struct ecotxidx_t :
int32_t count
int32_t max_taxid
@ -30,9 +39,14 @@ cdef extern from "obidms_taxonomy.h" nogil:
char** label
struct econameidx_t :
int32_t count
econame_t* names
struct OBIDMS_taxonomy_t :
ecorankidx_t* ranks
# econameidx_t* names
econameidx_t* names
ecotxidx_t* taxa
ctypedef OBIDMS_taxonomy_t* OBIDMS_taxonomy_p
@ -51,7 +65,11 @@ cdef extern from "obidms_taxonomy.h" nogil:
ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx)
ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid)
char* obi_taxo_get_name_from_name_idx(OBIDMS_taxonomy_p taxonomy, int32_t idx)
ecotx_t* obi_taxo_get_taxon_from_name_idx(OBIDMS_taxonomy_p taxonomy, int32_t idx)
bint obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid)
ecotx_t* obi_taxo_get_species(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
@ -71,4 +89,4 @@ cdef extern from "obidms_taxonomy.h" nogil:
int obi_taxo_add_preferred_name_with_taxon(OBIDMS_taxonomy_p tax, ecotx_t* taxon, const char* preferred_name)
const char* obi_taxo_rank_index_to_label(int32_t rank_idx, ecorankidx_t* ranks)

View File

@ -27,6 +27,7 @@ cdef extern from "obiview.h" nogil:
extern const_char_p REVERSE_QUALITY_COLUMN
extern const_char_p REVERSE_SEQUENCE_COLUMN
extern const_char_p COUNT_COLUMN
extern const_char_p SCIENTIFIC_NAME_COLUMN
extern const_char_p TAXID_COLUMN
extern const_char_p MERGED_TAXID_COLUMN
extern const_char_p MERGED_PREFIX

View File

@ -11,11 +11,14 @@ cdef class Taxonomy(OBIWrapper) :
cdef bytes _name
cdef DMS _dms
cdef list _ranks
cdef dict _name_dict
cdef inline OBIDMS_taxonomy_p pointer(self)
cdef fill_name_dict(self)
cpdef Taxon get_taxon_by_idx(self, int idx)
cpdef Taxon get_taxon_by_taxid(self, int taxid)
cpdef Taxon get_taxon_by_name(self, object taxon_name, object restricting_taxid=*)
cpdef write(self, object prefix)
cpdef int add_taxon(self, str name, str rank_name, int parent_taxid, int min_taxid=*)
cpdef object get_species(self, int taxid)

View File

@ -15,7 +15,11 @@ from ..capi.obitaxonomy cimport obi_taxonomy_exists, \
obi_taxo_get_species, \
obi_taxo_get_genus, \
obi_taxo_get_family, \
ecotx_t
ecotx_t, \
econame_t, \
obi_taxo_get_name_from_name_idx, \
obi_taxo_get_taxon_from_name_idx
from cpython.pycapsule cimport PyCapsule_New, PyCapsule_GetPointer
import tarfile
@ -24,11 +28,29 @@ from libc.stdlib cimport free
cdef class Taxonomy(OBIWrapper) :
# TODO function to import taxonomy?
# TODO function to import taxonomy?
cdef inline OBIDMS_taxonomy_p pointer(self) :
return <OBIDMS_taxonomy_p>(self._pointer)
cdef fill_name_dict(self):
print("Indexing taxon names...")
cdef OBIDMS_taxonomy_p pointer = self.pointer()
cdef ecotx_t* taxon_p
cdef object taxon_capsule
cdef bytes name
cdef int count
cdef int n
count = (<OBIDMS_taxonomy_p>pointer).names.count
for n in range(count) :
name = obi_taxo_get_name_from_name_idx(pointer, n)
taxon_p = obi_taxo_get_taxon_from_name_idx(pointer, n)
taxon_capsule = PyCapsule_New(taxon_p, NULL, NULL)
self._name_dict[name] = Taxon(taxon_capsule, self)
@staticmethod
def exists(DMS dms, object name) :
@ -75,7 +97,8 @@ cdef class Taxonomy(OBIWrapper) :
taxo._dms = dms
taxo._name = tobytes(name)
taxo._name_dict = {}
taxo.fill_name_dict()
taxo._ranks = []
for r in range((<OBIDMS_taxonomy_p>pointer).ranks.count) :
taxo._ranks.append(obi_taxo_rank_index_to_label(r, (<OBIDMS_taxonomy_p>pointer).ranks))
@ -118,7 +141,8 @@ cdef class Taxonomy(OBIWrapper) :
taxo._dms = dms
taxo._name = folder_path
taxo._name_dict = {}
taxo.fill_name_dict()
taxo._ranks = []
for r in range((<OBIDMS_taxonomy_p>pointer).ranks.count) :
taxo._ranks.append(obi_taxo_rank_index_to_label(r, (<OBIDMS_taxonomy_p>pointer).ranks))
@ -129,8 +153,8 @@ cdef class Taxonomy(OBIWrapper) :
def __getitem__(self, object ref):
if type(ref) == int :
return self.get_taxon_by_taxid(ref)
else :
raise NotImplementedError()
elif type(ref) == str or type(ref) == bytes :
return self.get_taxon_by_name(ref)
cpdef Taxon get_taxon_by_taxid(self, int taxid):
@ -143,6 +167,19 @@ cdef class Taxonomy(OBIWrapper) :
return Taxon(taxon_capsule, self)
cpdef Taxon get_taxon_by_name(self, object taxon_name, object restricting_taxid=None):
taxon = self._name_dict.get(tobytes(taxon_name), None)
if not taxon:
return None
elif restricting_taxid:
if self.is_ancestor(restricting_taxid, taxon.taxid):
return taxon
else:
return None
else:
return taxon
cpdef Taxon get_taxon_by_idx(self, int idx):
cdef ecotx_t* taxa
cdef ecotx_t* taxon_p
@ -232,7 +269,7 @@ cdef class Taxonomy(OBIWrapper) :
taxa = self.pointer().taxa.taxon
# Yield each taxid
# Yield each taxon
for t in range(self.pointer().taxa.count):
taxon_p = <ecotx_t*> (taxa+t)
taxon_capsule = PyCapsule_New(taxon_p, NULL, NULL)