Taxonomy: new functions and improvements

This commit is contained in:
Celine Mercier
2017-10-04 15:55:13 +02:00
parent 0ab081f79e
commit 535692b020
6 changed files with 329 additions and 96 deletions

View File

@ -25,8 +25,13 @@ cdef extern from "obidms_taxonomy.h" nogil:
ecotx_t* taxon
struct ecorankidx_t :
int32_t count
char** label
struct OBIDMS_taxonomy_t :
# ecorankidx_t* ranks
ecorankidx_t* ranks
# econameidx_t* names
ecotxidx_t* taxa
@ -63,3 +68,5 @@ cdef extern from "obidms_taxonomy.h" nogil:
int obi_taxo_add_preferred_name_with_taxon(OBIDMS_taxonomy_p tax, ecotx_t* taxon, const char* preferred_name)
const char* obi_taxo_rank_index_to_label(int32_t rank_idx, ecorankidx_t* ranks)

View File

@ -0,0 +1,2 @@
from .taxo import Taxonomy # @UnresolvedImport
from .taxo import Taxon # @UnresolvedImport

View File

@ -8,15 +8,23 @@ from ..object cimport OBIWrapper
cdef class Taxonomy(OBIWrapper) :
cdef str _name # TODO keep as bytes?
cdef DMS _dms
cdef bytes _name
cdef DMS _dms
cdef list _ranks
cdef inline OBIDMS_taxonomy_p pointer(self)
cpdef get_taxon_by_idx(self, int idx)
cpdef write(self, str prefix)
cpdef Taxon get_taxon_by_idx(self, int idx)
cpdef Taxon get_taxon_by_taxid(self, int taxid)
cpdef write(self, object prefix)
cpdef int add_taxon(self, str name, str rank_name, int parent_taxid, int min_taxid=*)
cpdef object get_species(self, int taxid)
cpdef object get_genus(self, int taxid)
cpdef object get_family(self, int taxid)
cpdef bytes get_scientific_name(self, int taxid)
cpdef bytes get_rank(self, int taxid)
cdef class Taxon :
cdef ecotx_t* _pointer
cdef ecotx_t* _pointer
cdef Taxonomy _tax

View File

@ -1,6 +1,6 @@
#cython: language_level=3
from obitools3.utils cimport str2bytes, bytes2str, tobytes
from obitools3.utils cimport str2bytes, bytes2str, tobytes, tostr
from ..capi.obitaxonomy cimport obi_read_taxonomy, \
obi_read_taxdump, \
@ -9,6 +9,10 @@ from ..capi.obitaxonomy cimport obi_read_taxonomy, \
obi_taxo_get_taxon_with_taxid, \
obi_taxo_add_local_taxon, \
obi_taxo_add_preferred_name_with_taxon, \
obi_taxo_rank_index_to_label, \
obi_taxo_get_species, \
obi_taxo_get_genus, \
obi_taxo_get_family, \
ecotx_t
from cpython.pycapsule cimport PyCapsule_New, PyCapsule_GetPointer
@ -22,57 +26,138 @@ cdef class Taxonomy(OBIWrapper) :
@staticmethod
def open(DMS dms, str name, bint taxdump=False) :
def open(DMS dms, object name) :
cdef void* pointer
cdef Taxonomy taxo
if taxdump :
pointer = <void*>obi_read_taxdump(tobytes(name))
else :
pointer = <void*>obi_read_taxonomy(dms.pointer(), tobytes(name), True) # TODO discuss
# TODO if not found in DMS, try to import?
pointer = <void*>obi_read_taxonomy(dms.pointer(), tobytes(name), True)
if pointer == NULL :
raise RuntimeError("Error : Cannot read taxonomy %s"
% name)
% tostr(name))
taxo = OBIWrapper.new_wrapper(Taxonomy, pointer)
dms.register(taxo)
taxo._dms = dms
taxo._name = tobytes(name)
taxo._ranks = []
for r in range((<OBIDMS_taxonomy_p>pointer).ranks.count) :
taxo._ranks.append(obi_taxo_rank_index_to_label(r, (<OBIDMS_taxonomy_p>pointer).ranks))
return taxo
@staticmethod
def open_taxdump(DMS dms, object name) :
    """
    Read a taxonomy with obi_read_taxdump and wrap it in a Taxonomy
    registered with the given DMS.

    dms  : the DMS the new Taxonomy is registered with.
    name : passed to obi_read_taxdump after conversion with tobytes()
           (presumably the taxdump location -- confirm against the C API).

    Returns the new Taxonomy. Raises RuntimeError if obi_read_taxdump
    returns NULL.
    """
    cdef void* pointer
    cdef Taxonomy taxo
    cdef OBIDMS_taxonomy_p tax_p

    pointer = <void*>obi_read_taxdump(tobytes(name))
    if pointer == NULL :
        raise RuntimeError("Error : Cannot read taxonomy %s"
                           % tostr(name))

    taxo = OBIWrapper.new_wrapper(Taxonomy, pointer)
    dms.register(taxo)

    taxo._dms = dms
    # _name is declared as bytes: store only the converted value. Assigning
    # the raw argument first would fail for a str and was overwritten anyway.
    taxo._name = tobytes(name)

    # Cache the rank labels so that rank indices can be mapped to labels later
    tax_p = <OBIDMS_taxonomy_p>pointer
    taxo._ranks = []
    for r in range(tax_p.ranks.count) :
        taxo._ranks.append(obi_taxo_rank_index_to_label(r, tax_p.ranks))

    return taxo
def __getitem__(self, object ref):
cdef ecotx_t* taxon_p
cdef object taxon_capsule
def __getitem__(self, object ref):
if type(ref) == int :
taxon_p = obi_taxo_get_taxon_with_taxid(self.pointer(), ref)
if taxon_p == NULL :
raise Exception("Taxon not found")
taxon_capsule = PyCapsule_New(taxon_p, NULL, NULL)
return Taxon(taxon_capsule, self)
return self.get_taxon_by_taxid(ref)
else :
raise Exception("Not implemented")
cpdef get_taxon_by_idx(self, int idx):
cpdef Taxon get_taxon_by_taxid(self, int taxid):
    """
    Return the Taxon found by obi_taxo_get_taxon_with_taxid for `taxid`.

    Raises an Exception if the lookup returns NULL.
    """
    cdef ecotx_t* found_p = obi_taxo_get_taxon_with_taxid(self.pointer(), taxid)

    if taxon_is_missing(found_p) if False else found_p == NULL:
        raise Exception("Error getting a taxon with given taxid", taxid)

    # The raw C pointer is handed to the Taxon constructor through a capsule
    return Taxon(PyCapsule_New(found_p, NULL, NULL), self)
cpdef Taxon get_taxon_by_idx(self, int idx):
    """
    Return the Taxon stored at position `idx` of the taxa array.

    idx : position in the taxa array (not a taxid).

    Raises an Exception if `idx` is negative or not smaller than the
    number of taxa.
    """
    cdef ecotx_t* taxa
    cdef ecotx_t* taxon_p
    cdef object taxon_capsule

    # Both bounds are checked: the lookup below is raw pointer arithmetic,
    # so a negative index would read memory before the taxa array.
    if idx < 0 or idx >= self.pointer().taxa.count :
        raise Exception("Error getting a taxon with given index: no taxid with this index", idx)

    taxa = self.pointer().taxa.taxon
    taxon_p = <ecotx_t*> (taxa+idx)
    taxon_capsule = PyCapsule_New(taxon_p, NULL, NULL)
    return Taxon(taxon_capsule, self)
cpdef object get_species(self, int taxid):
    """
    Return the taxid of the taxon that obi_taxo_get_species finds for
    `taxid` (presumably its species-level ancestor -- confirm against the
    C API), or None if that call returns NULL.

    Raises an Exception if no taxon is found for `taxid`.
    """
    cdef ecotx_t* query_p
    cdef ecotx_t* found_p

    query_p = obi_taxo_get_taxon_with_taxid(self.pointer(), taxid)
    if query_p == NULL:
        raise Exception("Error getting a taxon with given taxid", taxid)

    found_p = obi_taxo_get_species(query_p, self.pointer())
    if found_p != NULL :
        return <int>(found_p.taxid)
    return None
cpdef object get_genus(self, int taxid):
    """
    Return the taxid of the taxon that obi_taxo_get_genus finds for
    `taxid` (presumably its genus-level ancestor -- confirm against the
    C API), or None if that call returns NULL.

    Raises an Exception if no taxon is found for `taxid`.
    """
    cdef ecotx_t* query_p
    cdef ecotx_t* found_p

    query_p = obi_taxo_get_taxon_with_taxid(self.pointer(), taxid)
    if query_p == NULL:
        raise Exception("Error getting a taxon with given taxid", taxid)

    found_p = obi_taxo_get_genus(query_p, self.pointer())
    if found_p != NULL :
        return <int>(found_p.taxid)
    return None
cpdef object get_family(self, int taxid):
    """
    Return the taxid of the taxon that obi_taxo_get_family finds for
    `taxid` (presumably its family-level ancestor -- confirm against the
    C API), or None if that call returns NULL.

    Raises an Exception if no taxon is found for `taxid`.
    """
    cdef ecotx_t* query_p
    cdef ecotx_t* found_p

    query_p = obi_taxo_get_taxon_with_taxid(self.pointer(), taxid)
    if query_p == NULL:
        raise Exception("Error getting a taxon with given taxid", taxid)

    found_p = obi_taxo_get_family(query_p, self.pointer())
    if found_p != NULL :
        return <int>(found_p.taxid)
    return None
cpdef bytes get_scientific_name(self, int taxid):
    """
    Return, as bytes, the `name` field of the taxon found for `taxid`.

    Raises an Exception if no taxon is found for `taxid`.
    """
    cdef ecotx_t* found_p = obi_taxo_get_taxon_with_taxid(self.pointer(), taxid)

    if found_p == NULL:
        raise Exception("Error getting a taxon with given taxid", taxid)

    return found_p.name
cpdef bytes get_rank(self, int taxid):
    """
    Return the rank label of the taxon found for `taxid`.

    The taxon's `rank` field is used as an index into the cached list of
    rank labels (self._ranks).

    Raises an Exception if no taxon is found for `taxid`.
    """
    cdef ecotx_t* found_p = obi_taxo_get_taxon_with_taxid(self.pointer(), taxid)

    if found_p == NULL:
        raise Exception("Error getting a taxon with given taxid", taxid)

    return self._ranks[found_p.rank]
def __len__(self):
    # The length of a Taxonomy is the `count` field of its taxa index.
    return self.pointer().taxa.count
@ -94,7 +179,7 @@ cdef class Taxonomy(OBIWrapper) :
yield Taxon(taxon_capsule, self)
cpdef write(self, str prefix) :
cpdef write(self, object prefix) :
if obi_write_taxonomy(self._dms.pointer(), self.pointer(), tobytes(prefix)) < 0 :
raise Exception("Error writing the taxonomy to binary files")
@ -108,10 +193,7 @@ cdef class Taxonomy(OBIWrapper) :
return taxid
def close(self) :
cdef OBIDMS_taxonomy_p pointer = self.pointer()
def close(self) :
if self.active() :
self._dms.unregister(self)
OBIWrapper.close(self)
@ -124,6 +206,57 @@ cdef class Taxonomy(OBIWrapper) :
@property
def name(self):
    # Returns the stored _name attribute unchanged (no str conversion is done here).
    return self._name
def parental_tree_iterator(self, int taxid):
    """
    Iterate over the lineage of the taxon with the given taxid.

    Yields the taxon itself first, then each successive parent, and
    finally the root taxon (taxid 1).

    Raises an Exception (from get_taxon_by_taxid) when iteration starts
    if no taxon is found for `taxid`.
    """
    cdef Taxon taxon

    # The argument is a taxid, so the lookup must go through the taxid
    # index and not through the position in the taxa array.
    taxon = self.get_taxon_by_taxid(taxid)

    # Test the current taxon rather than its parent: testing the parent
    # skipped the direct child of the root.
    # NOTE(review): assumes following `parent` always reaches taxid 1 -- confirm.
    while taxon.taxid != 1:
        yield taxon
        taxon = taxon.parent

    # taxon is now the root; falling off the end finishes the generator
    # (an explicit `raise StopIteration` is an error inside a generator).
    yield taxon
def last_common_taxon(self, *taxids):
    """
    Return the taxid of the last taxon shared by the lineages of all the
    given taxids.

    With no argument, returns None; with a single taxid, returns it
    unchanged. With more than two taxids, the result is folded pairwise
    from left to right.
    """
    cdef list lineage1
    cdef list lineage2
    cdef Taxon x
    cdef int shared
    cdef int ancestor

    if not taxids:
        return None

    if len(taxids)==1:
        return taxids[0]

    if len(taxids)==2:
        # Lineages are produced from the taxon up to the root; flip them so
        # that both start at the root and can be compared position by position.
        lineage1 = [x.taxid for x in self.parental_tree_iterator(taxids[0])][::-1]
        lineage2 = [x.taxid for x in self.parental_tree_iterator(taxids[1])][::-1]

        # Length of the common prefix of the two root-first lineages
        shared = 0
        for id1, id2 in zip(lineage1, lineage2):
            if id1 != id2:
                break
            shared += 1

        return lineage1[shared - 1]

    ancestor = taxids[0]
    for other in taxids[1:]:
        ancestor = self.last_common_taxon(ancestor, other)

    return ancestor
cdef class Taxon : # TODO dict subclass?
@ -143,12 +276,12 @@ cdef class Taxon : # TODO dict subclass?
(self.farest == taxon2.farest) and \
(self.parent.taxid == taxon2.parent.taxid) and \
(self.preferred_name == taxon2.preferred_name)
# name property getter
@property
def name(self):
return bytes2str(self._pointer.name)
return self._pointer.name
# taxid property getter
@property
@ -158,7 +291,7 @@ cdef class Taxon : # TODO dict subclass?
# rank property getter
@property
def rank(self):
return self._pointer.rank
return ((self._tax)._ranks)[(self._pointer).rank]
# farest property getter
@property