Taxonomy: new functions and improvements

This commit is contained in:
Celine Mercier
2017-10-04 15:55:13 +02:00
parent 0ab081f79e
commit 535692b020
6 changed files with 329 additions and 96 deletions

View File

@ -25,8 +25,13 @@ cdef extern from "obidms_taxonomy.h" nogil:
ecotx_t* taxon
struct ecorankidx_t :
int32_t count
char** label
struct OBIDMS_taxonomy_t :
# ecorankidx_t* ranks
ecorankidx_t* ranks
# econameidx_t* names
ecotxidx_t* taxa
@ -63,3 +68,5 @@ cdef extern from "obidms_taxonomy.h" nogil:
int obi_taxo_add_preferred_name_with_taxon(OBIDMS_taxonomy_p tax, ecotx_t* taxon, const char* preferred_name)
const char* obi_taxo_rank_index_to_label(int32_t rank_idx, ecorankidx_t* ranks)

View File

@ -0,0 +1,2 @@
from .taxo import Taxonomy # @UnresolvedImport
from .taxo import Taxon # @UnresolvedImport

View File

@ -8,14 +8,22 @@ from ..object cimport OBIWrapper
cdef class Taxonomy(OBIWrapper) :
cdef str _name # TODO keep as bytes?
cdef bytes _name
cdef DMS _dms
cdef list _ranks
cdef inline OBIDMS_taxonomy_p pointer(self)
cpdef get_taxon_by_idx(self, int idx)
cpdef write(self, str prefix)
cpdef Taxon get_taxon_by_idx(self, int idx)
cpdef Taxon get_taxon_by_taxid(self, int taxid)
cpdef write(self, object prefix)
cpdef int add_taxon(self, str name, str rank_name, int parent_taxid, int min_taxid=*)
cpdef object get_species(self, int taxid)
cpdef object get_genus(self, int taxid)
cpdef object get_family(self, int taxid)
cpdef bytes get_scientific_name(self, int taxid)
cpdef bytes get_rank(self, int taxid)
cdef class Taxon :
cdef ecotx_t* _pointer

View File

@ -1,6 +1,6 @@
#cython: language_level=3
from obitools3.utils cimport str2bytes, bytes2str, tobytes
from obitools3.utils cimport str2bytes, bytes2str, tobytes, tostr
from ..capi.obitaxonomy cimport obi_read_taxonomy, \
obi_read_taxdump, \
@ -9,6 +9,10 @@ from ..capi.obitaxonomy cimport obi_read_taxonomy, \
obi_taxo_get_taxon_with_taxid, \
obi_taxo_add_local_taxon, \
obi_taxo_add_preferred_name_with_taxon, \
obi_taxo_rank_index_to_label, \
obi_taxo_get_species, \
obi_taxo_get_genus, \
obi_taxo_get_family, \
ecotx_t
from cpython.pycapsule cimport PyCapsule_New, PyCapsule_GetPointer
@ -22,58 +26,139 @@ cdef class Taxonomy(OBIWrapper) :
@staticmethod
def open(DMS dms, str name, bint taxdump=False) :
def open(DMS dms, object name) :
cdef void* pointer
cdef Taxonomy taxo
if taxdump :
pointer = <void*>obi_read_taxdump(tobytes(name))
else :
pointer = <void*>obi_read_taxonomy(dms.pointer(), tobytes(name), True) # TODO discuss
# TODO if not found in DMS, try to import?
pointer = <void*>obi_read_taxonomy(dms.pointer(), tobytes(name), True)
if pointer == NULL :
raise RuntimeError("Error : Cannot read taxonomy %s"
% name)
% tostr(name))
taxo = OBIWrapper.new_wrapper(Taxonomy, pointer)
dms.register(taxo)
taxo._dms = dms
taxo._name = name
taxo._name = tobytes(name)
taxo._ranks = []
for r in range((<OBIDMS_taxonomy_p>pointer).ranks.count) :
taxo._ranks.append(obi_taxo_rank_index_to_label(r, (<OBIDMS_taxonomy_p>pointer).ranks))
return taxo
@staticmethod
def open_taxdump(DMS dms, object name) :
    """
    Build a Taxonomy from the result of obi_read_taxdump(name).

    The new wrapper is registered with `dms`, keeps `name` as bytes, and
    caches one label per rank (as returned by obi_taxo_rank_index_to_label())
    in its `_ranks` list.

    Raises RuntimeError if obi_read_taxdump() returns NULL.
    """
    cdef void* pointer
    cdef Taxonomy taxo
    cdef OBIDMS_taxonomy_p tax_p

    pointer = <void*>obi_read_taxdump(tobytes(name))
    if pointer == NULL :
        raise RuntimeError("Error : Cannot read taxonomy %s" % tostr(name))

    taxo = OBIWrapper.new_wrapper(Taxonomy, pointer)
    dms.register(taxo)
    taxo._dms = dms
    taxo._name = tobytes(name)

    # Cache the rank labels once so that rank indices can be translated later
    tax_p = <OBIDMS_taxonomy_p>pointer
    taxo._ranks = [obi_taxo_rank_index_to_label(rank_idx, tax_p.ranks)
                   for rank_idx in range(tax_p.ranks.count)]

    return taxo
def __getitem__(self, object ref):
cdef ecotx_t* taxon_p
cdef object taxon_capsule
if type(ref) == int :
taxon_p = obi_taxo_get_taxon_with_taxid(self.pointer(), ref)
if taxon_p == NULL :
raise Exception("Taxon not found")
taxon_capsule = PyCapsule_New(taxon_p, NULL, NULL)
return Taxon(taxon_capsule, self)
return self.get_taxon_by_taxid(ref)
else :
raise Exception("Not implemented")
cpdef get_taxon_by_idx(self, int idx):
cpdef Taxon get_taxon_by_taxid(self, int taxid):
    """
    Return the Taxon that obi_taxo_get_taxon_with_taxid() finds for `taxid`.

    Raises Exception if that lookup returns NULL.
    """
    cdef ecotx_t* found_p = obi_taxo_get_taxon_with_taxid(self.pointer(), taxid)

    if found_p == NULL:
        raise Exception("Error getting a taxon with given taxid", taxid)

    # The Taxon constructor receives the C pointer wrapped in a capsule
    return Taxon(PyCapsule_New(found_p, NULL, NULL), self)
cpdef Taxon get_taxon_by_idx(self, int idx):
    """
    Return the Taxon stored at position `idx` of the taxonomy's taxa array.

    Raises Exception if `idx` is negative or not smaller than the number
    of taxa.
    """
    cdef ecotx_t* taxa
    cdef ecotx_t* taxon_p
    cdef object taxon_capsule

    # idx is a C int used for raw pointer arithmetic below: a negative value
    # would address memory before the start of the array, so reject it too.
    if idx < 0 or idx >= self.pointer().taxa.count :
        raise Exception("Error getting a taxon with given index: no taxid with this index", idx)

    taxa = self.pointer().taxa.taxon
    taxon_p = <ecotx_t*> (taxa+idx)
    taxon_capsule = PyCapsule_New(taxon_p, NULL, NULL)
    return Taxon(taxon_capsule, self)
cpdef object get_species(self, int taxid):
    """
    Return the taxid of the taxon that obi_taxo_get_species() gives for
    the taxon with the given taxid, or None if it gives NULL.

    Raises Exception if no taxon is found for `taxid`.
    """
    cdef ecotx_t* query_p = obi_taxo_get_taxon_with_taxid(self.pointer(), taxid)
    cdef ecotx_t* ancestor_p

    if query_p == NULL:
        raise Exception("Error getting a taxon with given taxid", taxid)

    ancestor_p = obi_taxo_get_species(query_p, self.pointer())
    if ancestor_p != NULL :
        return <int>(ancestor_p.taxid)
    return None
cpdef object get_genus(self, int taxid):
    """
    Return the taxid of the taxon that obi_taxo_get_genus() gives for
    the taxon with the given taxid, or None if it gives NULL.

    Raises Exception if no taxon is found for `taxid`.
    """
    cdef ecotx_t* query_p = obi_taxo_get_taxon_with_taxid(self.pointer(), taxid)
    cdef ecotx_t* ancestor_p

    if query_p == NULL:
        raise Exception("Error getting a taxon with given taxid", taxid)

    ancestor_p = obi_taxo_get_genus(query_p, self.pointer())
    if ancestor_p != NULL :
        return <int>(ancestor_p.taxid)
    return None
cpdef object get_family(self, int taxid):
    """
    Return the taxid of the taxon that obi_taxo_get_family() gives for
    the taxon with the given taxid, or None if it gives NULL.

    Raises Exception if no taxon is found for `taxid`.
    """
    cdef ecotx_t* query_p = obi_taxo_get_taxon_with_taxid(self.pointer(), taxid)
    cdef ecotx_t* ancestor_p

    if query_p == NULL:
        raise Exception("Error getting a taxon with given taxid", taxid)

    ancestor_p = obi_taxo_get_family(query_p, self.pointer())
    if ancestor_p != NULL :
        return <int>(ancestor_p.taxid)
    return None
cpdef bytes get_scientific_name(self, int taxid):
    """
    Return, as bytes, the `name` field of the taxon with the given taxid.

    Raises Exception if no taxon is found for `taxid`.
    """
    cdef ecotx_t* found_p = obi_taxo_get_taxon_with_taxid(self.pointer(), taxid)

    if found_p == NULL:
        raise Exception("Error getting a taxon with given taxid", taxid)

    return found_p.name
cpdef bytes get_rank(self, int taxid):
    """
    Return the rank label of the taxon with the given taxid.

    The taxon's `rank` field is used as an index into the `_ranks` list
    cached on this Taxonomy.

    Raises Exception if no taxon is found for `taxid`.
    """
    cdef ecotx_t* found_p = obi_taxo_get_taxon_with_taxid(self.pointer(), taxid)

    if found_p == NULL:
        raise Exception("Error getting a taxon with given taxid", taxid)

    return self._ranks[found_p.rank]
def __len__(self):
    # The length of a Taxonomy is the `count` field of its taxa structure.
    return self.pointer().taxa.count
@ -94,7 +179,7 @@ cdef class Taxonomy(OBIWrapper) :
yield Taxon(taxon_capsule, self)
cpdef write(self, str prefix) :
cpdef write(self, object prefix) :
if obi_write_taxonomy(self._dms.pointer(), self.pointer(), tobytes(prefix)) < 0 :
raise Exception("Error writing the taxonomy to binary files")
@ -109,9 +194,6 @@ cdef class Taxonomy(OBIWrapper) :
def close(self) :
cdef OBIDMS_taxonomy_p pointer = self.pointer()
if self.active() :
self._dms.unregister(self)
OBIWrapper.close(self)
@ -126,6 +208,57 @@ cdef class Taxonomy(OBIWrapper) :
return self._name
def parental_tree_iterator(self, int taxid):
    """
    Iterate over the lineage of a taxon, from the taxon itself up to the root.

    Yields the Taxon with the given taxid, then each successive parent,
    and finally the root taxon (taxid 1). If `taxid` is 1, only the root
    is yielded.

    Raises the Exception raised by get_taxon_by_taxid() if no taxon is
    found for `taxid` (raised when iteration starts).
    """
    cdef Taxon taxon

    # The argument is a taxid, not a position in the taxa array, so it must
    # be resolved with the taxid lookup rather than get_taxon_by_idx().
    taxon = self.get_taxon_by_taxid(taxid)

    # Testing the current taxon (not its parent) against the root taxid
    # ensures that the direct child of the root is yielded as well.
    while taxon.taxid != 1:
        yield taxon
        taxon = taxon.parent

    # taxon is now the root
    yield taxon
def last_common_taxon(self, *taxids):
    """
    Return the taxid of the last taxon shared by the lineages of the given taxids.

    - No taxid: returns None.
    - One taxid: returns it unchanged.
    - Two taxids: both lineages are read from parental_tree_iterator(),
      reversed, and compared position by position; the taxid at the last
      matching position is returned.
    - More than two: the two-taxid computation is folded over the arguments.
    """
    cdef list lineage_a
    cdef list lineage_b
    cdef Taxon node
    cdef int shared
    cdef int ancestor

    if not taxids:
        return None

    if len(taxids)==1:
        return taxids[0]

    if len(taxids)!=2:
        # Reduce the problem to successive pairwise computations
        ancestor = taxids[0]
        for other in taxids[1:]:
            ancestor = self.last_common_taxon(ancestor, other)
        return ancestor

    lineage_a = [node.taxid for node in self.parental_tree_iterator(taxids[0])]
    lineage_b = [node.taxid for node in self.parental_tree_iterator(taxids[1])]
    lineage_a.reverse()
    lineage_b.reverse()

    # Count how many leading positions hold the same taxid in both lineages
    shared = 0
    for taxid_a, taxid_b in zip(lineage_a, lineage_b):
        if taxid_a != taxid_b:
            break
        shared += 1

    return lineage_a[shared - 1]
cdef class Taxon : # TODO dict subclass?
def __init__(self, object taxon_capsule, Taxonomy tax) :
@ -148,7 +281,7 @@ cdef class Taxon : # TODO dict subclass?
# name property getter
@property
def name(self):
return bytes2str(self._pointer.name)
return self._pointer.name
# taxid property getter
@property
@ -158,7 +291,7 @@ cdef class Taxon : # TODO dict subclass?
# rank property getter
@property
def rank(self):
return self._pointer.rank
return ((self._tax)._ranks)[(self._pointer).rank]
# farest property getter
@property

View File

@ -148,7 +148,7 @@ static char* get_taxonomy_path(OBIDMS_p dms, const char* tax_name);
* @returns The index of a rank in the ecorankidx_t structure.
* @retval -1 if the rank was not found.
*/
static int32_t rank_index(const char* label, ecorankidx_t* ranks);
static int32_t rank_label_to_index(const char* label, ecorankidx_t* ranks);
/**
@ -543,7 +543,7 @@ static char* get_taxonomy_path(OBIDMS_p dms, const char* tax_name)
}
static int32_t rank_index(const char* label, ecorankidx_t* ranks)
static int32_t rank_label_to_index(const char* label, ecorankidx_t* ranks)
{
char **rep;
@ -3501,6 +3501,13 @@ ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx)
ecotx_t* current_taxon;
ecotx_t* next_taxon;
if (taxon == NULL)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError getting the parent of a taxon at a given rank: taxon pointer is NULL");
return NULL;
}
current_taxon = taxon;
next_taxon = current_taxon->parent;
@ -3524,6 +3531,13 @@ ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid
ecomerged_t *indexed_taxon;
int32_t count;
if (taxonomy == NULL)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError trying to get a taxon with its taxid: taxonomy pointer is NULL");
return NULL;
}
count = (taxonomy->merged_idx)->count;
indexed_taxon = (ecomerged_t*) bsearch((const void *) ((size_t) taxid),
@ -3543,12 +3557,19 @@ ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid
}
bool obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid) // TODO discuss that this doesn't work with deprecated taxids
int obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid) // TODO discuss that this doesn't work with deprecated taxids
{
ecotx_t* next_parent;
next_parent = taxon->parent;
if (taxon == NULL)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError checking if a taxon is under another: taxon pointer is NULL");
return -1;
}
while ((other_taxid != next_parent->taxid) && (strcmp(next_parent->name, "root")))
next_parent = next_parent->parent;
@ -3561,19 +3582,27 @@ bool obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid) // TODO
ecotx_t* obi_taxo_get_species(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
{
static OBIDMS_taxonomy_p tax = NULL;
static int32_t rankindex = -1;
if (taxonomy && (tax != taxonomy))
{
rankindex = rank_index("species", taxonomy->ranks);
tax = taxonomy;
}
if (!tax || (rankindex < 0))
if (taxonomy == NULL)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError trying to get the species associated with a taxon: No taxonomy defined");
obidebug(1, "\nError trying to get the species associated with a taxon: taxonomy pointer is NULL");
return NULL;
}
if (taxon == NULL)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError trying to get the species associated with a taxon: taxon pointer is NULL");
return NULL;
}
rankindex = rank_label_to_index("species", taxonomy->ranks);
if (rankindex < 0)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError trying to get the species associated with a taxon: error getting rank index");
return NULL;
}
@ -3583,19 +3612,27 @@ ecotx_t* obi_taxo_get_species(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
ecotx_t* obi_taxo_get_genus(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
{
static OBIDMS_taxonomy_p tax = NULL;
static int32_t rankindex = -1;
if (taxonomy && (tax != taxonomy))
{
rankindex = rank_index("genus", taxonomy->ranks);
tax = taxonomy;
}
if (!tax || (rankindex < 0))
if (taxonomy == NULL)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError trying to get the genus associated with a taxon: No taxonomy defined");
obidebug(1, "\nError trying to get the genus associated with a taxon: taxonomy pointer is NULL");
return NULL;
}
if (taxon == NULL)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError trying to get the genus associated with a taxon: taxon pointer is NULL");
return NULL;
}
rankindex = rank_label_to_index("genus", taxonomy->ranks);
if (rankindex < 0)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError trying to get the genus associated with a taxon: error getting rank index");
return NULL;
}
@ -3605,19 +3642,27 @@ ecotx_t* obi_taxo_get_genus(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
ecotx_t* obi_taxo_get_family(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
{
static OBIDMS_taxonomy_p tax = NULL;
static int32_t rankindex = -1;
if (taxonomy && (tax != taxonomy))
{
rankindex = rank_index("family", taxonomy->ranks);
tax = taxonomy;
}
if (!tax || (rankindex < 0))
if (taxonomy == NULL)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError trying to get the family associated with a taxon: No taxonomy defined");
obidebug(1, "\nError trying to get the family associated with a taxon: taxonomy pointer is NULL");
return NULL;
}
if (taxon == NULL)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError trying to get the family associated with a taxon: taxon pointer is NULL");
return NULL;
}
rankindex = rank_label_to_index("family", taxonomy->ranks);
if (rankindex < 0)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError trying to get the family associated with a taxon: error getting rank index");
return NULL;
}
@ -3627,19 +3672,27 @@ ecotx_t* obi_taxo_get_family(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
ecotx_t* obi_taxo_get_kingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
{
static OBIDMS_taxonomy_p tax = NULL;
static int32_t rankindex = -1;
if (taxonomy && (tax != taxonomy))
{
rankindex = rank_index("kingdom", taxonomy->ranks);
tax = taxonomy;
}
if (!tax || (rankindex < 0))
if (taxonomy == NULL)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError trying to get the kingdom associated with a taxon: No taxonomy defined");
obidebug(1, "\nError trying to get the kingdom associated with a taxon: taxonomy pointer is NULL");
return NULL;
}
if (taxon == NULL)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError trying to get the kingdom associated with a taxon: taxon pointer is NULL");
return NULL;
}
rankindex = rank_label_to_index("kingdom", taxonomy->ranks);
if (rankindex < 0)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError trying to get the kingdom associated with a taxon: error getting rank index");
return NULL;
}
@ -3649,22 +3702,36 @@ ecotx_t* obi_taxo_get_kingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
{
static OBIDMS_taxonomy_p tax = NULL;
static int32_t rankindex = -1;
if (taxonomy && (tax != taxonomy))
{
rankindex = rank_index("superkingdom", taxonomy->ranks);
tax = taxonomy;
}
if (!tax || (rankindex < 0))
if (taxonomy == NULL)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError trying to get the superkingdom associated with a taxon: No taxonomy defined");
obidebug(1, "\nError trying to get the superkingdom associated with a taxon: taxonomy pointer is NULL");
return NULL;
}
if (taxon == NULL)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError trying to get the superkingdom associated with a taxon: taxon pointer is NULL");
return NULL;
}
rankindex = rank_label_to_index("superkingdom", taxonomy->ranks);
if (rankindex < 0)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError trying to get the superkingdom associated with a taxon: error getting rank index");
return NULL;
}
return obi_taxo_get_parent_at_rank(taxon, rankindex);
}
/*
 * Returns the label stored at position rank_idx of the ranks structure,
 * or NULL if ranks is NULL or rank_idx is outside [0, ranks->count).
 */
const char* obi_taxo_rank_index_to_label(int32_t rank_idx, ecorankidx_t* ranks)
{
	// The function is documented as returning NULL when there is no rank at
	// the given index, so the index must be validated before it is used.
	if ((ranks == NULL) || (rank_idx < 0) || (rank_idx >= ranks->count))
		return NULL;

	return (ranks->label)[rank_idx];
}

View File

@ -308,7 +308,7 @@ int obi_taxo_add_preferred_name_with_taxon(OBIDMS_taxonomy_p tax, ecotx_t* taxon
* @param rankidx The index of the rank wanted.
*
* @returns A pointer on the parent taxon at the wanted rank.
* @retval NULL if no parent taxon was found at the wanted rank.
* @retval NULL if no parent taxon was found at the wanted rank or if an error occurred.
*/
ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx);
@ -320,7 +320,7 @@ ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx);
* @param taxid The taxid of the taxon.
*
* @returns A pointer on the wanted taxon.
* @retval NULL if no taxon was found with the given taxid.
* @retval NULL if no taxon was found with the given taxid or if an error occurred.
*
* @since January 2017
* @author Celine Mercier (celine.mercier@metabarcoding.org)
@ -334,9 +334,12 @@ ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid
* @param taxon A pointer on the first taxon.
* @param other_taxid The taxid of the second taxon.
*
* @returns A boolean indicating whether the first taxon is under the second taxon in the taxonomy tree.
* @returns A value indicating whether the first taxon is under the second taxon in the taxonomy tree.
* @retval 0 if the first taxon is not under the second taxon in the taxonomy tree.
* @retval 1 if the first taxon is under the second taxon in the taxonomy tree.
* @retval -1 if an error occurred.
*/
bool obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid);
int obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid);
/**
@ -398,3 +401,16 @@ ecotx_t* obi_taxo_get_kingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy);
*/
ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy);
/**
* @brief Function returning the label of a rank in an ecorankidx_t structure.
*
* @param rank_idx The index of the rank.
* @param ranks A pointer on an ecorankidx_t structure.
*
* @returns The label of a rank in the ecorankidx_t structure.
* @retval NULL if there is no rank at that index.
*
* @see rank_label_to_index()
*/
const char* obi_taxo_rank_index_to_label(int32_t rank_idx, ecorankidx_t* ranks);