diff --git a/python/obitools3/obidms/_obitaxo.pxd b/python/obitools3/obidms/_obitaxo.pxd index fba81e9..51c6c0f 100644 --- a/python/obitools3/obidms/_obitaxo.pxd +++ b/python/obitools3/obidms/_obitaxo.pxd @@ -10,8 +10,9 @@ cdef class OBI_Taxonomy : cdef OBIDMS_taxonomy_p _pointer cdef OBIDMS _dms - cpdef close(self) cpdef write(self, str prefix) + cpdef int add_taxon(self, str name, str rank_name, int parent_taxid, int min_taxid=*) + cpdef close(self) cdef class OBI_Taxon : diff --git a/python/obitools3/obidms/_obitaxo.pyx b/python/obitools3/obidms/_obitaxo.pyx index 1e97123..b9d7e98 100644 --- a/python/obitools3/obidms/_obitaxo.pyx +++ b/python/obitools3/obidms/_obitaxo.pyx @@ -7,6 +7,7 @@ from .capi.obitaxonomy cimport obi_read_taxonomy, \ obi_write_taxonomy, \ obi_close_taxonomy, \ obi_taxo_get_taxon_with_taxid, \ + obi_taxonomy_add_local_taxon, \ ecotx_t @@ -38,6 +39,8 @@ cdef class OBI_Taxonomy : if type(ref) == int : taxon_p = obi_taxo_get_taxon_with_taxid(self._pointer, ref) + if taxon_p == NULL : + raise Exception("Taxon not found") taxon_capsule = PyCapsule_New(taxon_p, NULL, NULL) return OBI_Taxon(taxon_capsule) else : @@ -55,7 +58,7 @@ cdef class OBI_Taxonomy : # Yield each taxid for t in range(self._pointer.taxa.count): - taxon_p = (taxa+t) # TODO not compiling for mysterious cython reasons + taxon_p = (taxa+t) taxon_capsule = PyCapsule_New(taxon_p, NULL, NULL) yield OBI_Taxon(taxon_capsule) @@ -64,6 +67,15 @@ cdef class OBI_Taxonomy : if obi_write_taxonomy(self._dms._pointer, self._pointer, str2bytes(prefix)) < 0 : raise Exception("Error writing the taxonomy to binary files") + + cpdef int add_taxon(self, str name, str rank_name, int parent_taxid, int min_taxid=10000000) : + cdef int taxid + taxid = obi_taxonomy_add_local_taxon(self._pointer, str2bytes(name), str2bytes(rank_name), parent_taxid, min_taxid) + if taxid < 0 : + raise Exception("Error adding a new taxon to the taxonomy") + else : + return taxid + cpdef close(self) : if (obi_close_taxonomy(self._pointer) < 0) : diff --git a/python/obitools3/obidms/capi/obitaxonomy.pxd b/python/obitools3/obidms/capi/obitaxonomy.pxd index 56bffaf..99cd7e4 100644 --- a/python/obitools3/obidms/capi/obitaxonomy.pxd +++ b/python/obitools3/obidms/capi/obitaxonomy.pxd @@ -56,3 +56,4 @@ cdef extern from "obidms_taxonomy.h" nogil: ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy) + int obi_taxonomy_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const char* rank_name, int32_t parent_taxid, int32_t min_taxid) diff --git a/src/obidms_taxonomy.c b/src/obidms_taxonomy.c index 1a026c4..dc71557 100644 --- a/src/obidms_taxonomy.c +++ b/src/obidms_taxonomy.c @@ -135,10 +135,6 @@ void* read_ecorecord(FILE* f, int32_t* record_size) return NULL; } -// if (!(obi_is_little_endian())) // TODO note: keeping for now for testing purposes -// if (is_big_endian()) -// *record_size=swap_int32_t(*record_size); - if (buffer_size < *record_size) { if (buffer) @@ -179,15 +175,6 @@ ecotx_t* readnext_ecotaxon(FILE* f, ecotx_t* taxon) if (!raw) return NULL; -// if (!(obi_is_little_endian())) // TODO -// if (is_big_endian()) -// { -// raw->name_length = swap_int32_t(raw->name_length); -// raw->parent = swap_int32_t(raw->parent); -// raw->rank = swap_int32_t(raw->rank); -// raw->taxid = swap_int32_t(raw->taxid); -// } - taxon->parent = (ecotx_t*) ((size_t) raw->parent); taxon->taxid = raw->taxid; taxon->rank = raw->rank; @@ -247,10 +234,6 @@ FILE* open_ecorecorddb(const char* file_name, return NULL; } -// if (!(obi_is_little_endian())) // TODO -// if (is_big_endian()) -// *count = swap_int32_t(*count); - return f; } @@ -341,6 +324,8 @@ ecotxidx_t* read_taxonomyidx(const char* taxa_file_name, const char* local_taxa_ } taxa_index->count = count_taxa + count_local_taxa; + taxa_index->ncbi_count = count_taxa; + taxa_index->local_count = count_local_taxa; taxa_index->buffer_size = taxa_index->count; taxa_index->max_taxid = 0; @@ -370,8 +355,6 @@ ecotxidx_t* read_taxonomyidx(const char* taxa_file_name, const char* local_taxa_ taxa_index->max_taxid = taxa_index->taxon[i].taxid; } - printf("Computing longest branches...\n"); - for (i=0; i < count_taxa; i++) { t = taxa_index->taxon+i; @@ -409,15 +392,6 @@ econame_t* readnext_econame(FILE* f, econame_t* name, OBIDMS_taxonomy_p taxonomy if (raw == NULL) return NULL; -// if (!(obi_is_little_endian())) // TODO -// if (is_big_endian()) -// { -// raw->is_scientific_name = swap_int32_t(raw->is_scientific_name); -// raw->name_length = swap_int32_t(raw->name_length); -// raw->class_length = swap_int32_t(raw->class_length); -// raw->taxid = swap_int32_t(raw->taxid); -// } - name->is_scientific_name = raw->is_scientific_name; name->name = malloc((raw->name_length + 1) * sizeof(char)); @@ -491,367 +465,6 @@ econameidx_t* read_nameidx(const char *file_name, OBIDMS_taxonomy_p taxonomy) } -/////// PUBLIC ///////// - - -OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, bool read_alternative_names) -{ - OBIDMS_taxonomy_p tax; - char* taxonomy_path; - char* ranks_file_name; - char* taxa_file_name; - char* local_taxa_file_name; - char* alter_names_file_name; - int buffer_size; - - tax = (OBIDMS_taxonomy_p) malloc(sizeof(OBIDMS_taxonomy_t)); - if (tax == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for a taxonomy structure"); - return NULL; - } - - tax->ranks = NULL; - tax->taxa = NULL; - tax->names = NULL; - - buffer_size = 2048; - - taxonomy_path = get_taxonomy_path(dms, taxonomy_name); - if (taxonomy_path == NULL) - return NULL; - - // Read ranks - ranks_file_name = (char*) malloc(buffer_size*sizeof(char)); - if (ranks_file_name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for ranks file name"); - free(taxonomy_path); - free(tax); - return NULL; - } - if (snprintf(ranks_file_name, buffer_size, "%s/%s.rdx", taxonomy_path, taxonomy_name) < 0) - { - obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nError building ranks file name"); - free(taxonomy_path); - free(ranks_file_name); - free(tax); - return NULL; - } - tax->ranks = read_rankidx(ranks_file_name); - if (tax->ranks == NULL) - { - obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nError building ranks file name"); - free(taxonomy_path); - free(ranks_file_name); - free(tax); - return NULL; - } - free(ranks_file_name); - - // Read taxa - taxa_file_name = (char*) malloc(buffer_size*sizeof(char)); - if (taxa_file_name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for taxa file name"); - free(taxonomy_path); - obi_close_taxonomy(tax); - return NULL; - } - if (snprintf(taxa_file_name, buffer_size, "%s/%s.tdx", taxonomy_path, taxonomy_name) < 0) - { - obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nError building taxa file name"); - free(taxonomy_path); - free(taxa_file_name); - obi_close_taxonomy(tax); - return NULL; - } - local_taxa_file_name = (char*) malloc(buffer_size*sizeof(char)); - if (local_taxa_file_name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for taxa file name"); - free(taxonomy_path); - free(taxa_file_name); - obi_close_taxonomy(tax); - return NULL; - } - if (snprintf(local_taxa_file_name, buffer_size, "%s/%s.ldx", taxonomy_path, taxonomy_name) < 0) - { - obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nError building local taxa file name"); - free(taxonomy_path); - free(taxa_file_name); - free(local_taxa_file_name); - obi_close_taxonomy(tax); - return NULL; - } - tax->taxa = read_taxonomyidx(taxa_file_name, local_taxa_file_name); - if (tax->taxa == NULL) - { - free(taxonomy_path); - free(taxa_file_name); - free(local_taxa_file_name); - obi_close_taxonomy(tax); - return NULL; - } - free(taxa_file_name); - free(local_taxa_file_name); - - // Read alternative names - if (read_alternative_names) - { - alter_names_file_name = (char*) malloc(buffer_size*sizeof(char)); - if (alter_names_file_name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for alternative names file name"); - free(taxonomy_path); - obi_close_taxonomy(tax); - return NULL; - } - if (snprintf(alter_names_file_name, buffer_size, "%s/%s.ndx", taxonomy_path, taxonomy_name) < 0) - { - obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nError building alternative names file name"); - free(taxonomy_path); - free(alter_names_file_name); - obi_close_taxonomy(tax); - return NULL; - } - tax->names = read_nameidx(alter_names_file_name, tax); - if (tax->names == NULL) - { - free(taxonomy_path); - free(alter_names_file_name); - obi_close_taxonomy(tax); - return NULL; - } - free(alter_names_file_name); - } - - free(taxonomy_path); - - return tax; -} - - -int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy) -{ - int i; - - if (taxonomy) - { - if (taxonomy->ranks) - { - for (i=0; i < (taxonomy->ranks)->count; i++) - { - if ((taxonomy->ranks)->label[i]) - free((taxonomy->ranks)->label[i]); - } - free(taxonomy->ranks); - } - - if (taxonomy->names) - { - for (i=0; i < (taxonomy->names)->count; i++) - { - if (((taxonomy->names)->names[i]).name) - free(((taxonomy->names)->names[i]).name); - if (((taxonomy->names)->names[i]).class_name) - free(((taxonomy->names)->names[i]).class_name); - } - free(taxonomy->names); - } - - if (taxonomy->taxa) - { - for (i=0; i < (taxonomy->taxa)->count; i++) - { - if (((taxonomy->taxa)->taxon[i]).name) - free(((taxonomy->taxa)->taxon[i]).name); - } - free(taxonomy->taxa); - } - - free(taxonomy); - } - - return 0; -} - - -////////////////////////////////////////////////////////////////////////// - - -ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx) -{ - ecotx_t* current_taxon; - ecotx_t* next_taxon; - - current_taxon = taxon; - next_taxon = current_taxon->parent; - - while ((current_taxon != next_taxon) && // root node - (current_taxon->rank != rankidx)) - { - current_taxon = next_taxon; - next_taxon = current_taxon->parent; - } - - if (current_taxon->rank == rankidx) - return current_taxon; - else - return NULL; -} - - -ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid) -{ - ecotx_t *current_taxon; - int32_t count; - - count = taxonomy->taxa->count; - - current_taxon = (ecotx_t*) bsearch((const void *) ((size_t) taxid), - (const void *) taxonomy->taxa->taxon, - count, - sizeof(ecotx_t), - cmp_taxids); - return current_taxon; -} - - -bool obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid) -{ - ecotx_t* next_parent; - - next_parent = taxon->parent; - - while ((other_taxid != next_parent->taxid) && (strcmp(next_parent->name, "root"))) - next_parent = next_parent->parent; - - if (other_taxid == next_parent->taxid) - return 1; - else - return 0; -} - - -ecotx_t* obi_taxo_get_species(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy) -{ - static OBIDMS_taxonomy_p tax = NULL; - static int32_t rankindex = -1; - - if (taxonomy && (tax != taxonomy)) - { - rankindex = rank_index("species", taxonomy->ranks); - tax = taxonomy; - } - - if (!tax || (rankindex < 0)) - { - obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nError trying to get the species associated with a taxon: No taxonomy defined"); - return NULL; - } - - return obi_taxo_get_parent_at_rank(taxon, rankindex); -} - - -ecotx_t* obi_taxo_get_genus(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy) -{ - static OBIDMS_taxonomy_p tax = NULL; - static int32_t rankindex = -1; - - if (taxonomy && (tax != taxonomy)) - { - rankindex = rank_index("genus", taxonomy->ranks); - tax = taxonomy; - } - - if (!tax || (rankindex < 0)) - { - obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nError trying to get the genus associated with a taxon: No taxonomy defined"); - return NULL; - } - - return obi_taxo_get_parent_at_rank(taxon, rankindex); -} - - -ecotx_t* obi_taxo_get_family(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy) -{ - static OBIDMS_taxonomy_p tax = NULL; - static int32_t rankindex = -1; - - if (taxonomy && (tax != taxonomy)) - { - rankindex = rank_index("family", taxonomy->ranks); - tax = taxonomy; - } - - if (!tax || (rankindex < 0)) - { - obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nError trying to get the family associated with a taxon: No taxonomy defined"); - return NULL; - } - - return obi_taxo_get_parent_at_rank(taxon, rankindex); -} - - -ecotx_t* obi_taxo_get_kingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy) -{ - static OBIDMS_taxonomy_p tax = NULL; - static int32_t rankindex = -1; - - if (taxonomy && (tax != taxonomy)) - { - rankindex = rank_index("kingdom", taxonomy->ranks); - tax = taxonomy; - } - - if (!tax || (rankindex < 0)) - { - obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nError trying to get the kingdom associated with a taxon: No taxonomy defined"); - return NULL; - } - - return obi_taxo_get_parent_at_rank(taxon, rankindex); -} - - -ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy) -{ - static OBIDMS_taxonomy_p tax = NULL; - static int32_t rankindex = -1; - - if (taxonomy && (tax != taxonomy)) - { - rankindex = rank_index("superkingdom", taxonomy->ranks); - tax = taxonomy; - } - - if (!tax || (rankindex < 0)) - { - obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nError trying to get the superkingdom associated with a taxon: No taxonomy defined"); - return NULL; - } - - return obi_taxo_get_parent_at_rank(taxon, rankindex); -} @@ -973,7 +586,7 @@ int write_taxonomyidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_ // Compute file size file_size = sizeof(int32_t); // To store record count - for (i=0; i < (tax->taxa)->count; i++) + for (i=0; i < (tax->taxa)->ncbi_count; i++) { file_size = file_size + sizeof(int32_t) * 5; // To store record size, taxid, rank index, parent index, and name length file_size = file_size + strlen(tax->taxa->taxon[i].name); // To store name @@ -1022,7 +635,7 @@ int write_taxonomyidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_ } // Write record count - if (write(file_descriptor, &(tax->taxa->count), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t))) + if (write(file_descriptor, &(tax->taxa->ncbi_count), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t))) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError writing in a binary taxonomy file"); @@ -1031,7 +644,144 @@ int write_taxonomyidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_ } // Write records - for (i=0; i < tax->taxa->count; i++) + for (i=0; i < (tax->taxa)->ncbi_count; i++) + { + name_length = strlen(tax->taxa->taxon[i].name); + record_size = 4*sizeof(int32_t) + name_length; + + // Write record size + if (write(file_descriptor, &record_size, sizeof(int32_t)) < ((ssize_t) sizeof(int32_t))) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError writing in a binary taxonomy file"); + close(file_descriptor); + return -1; + } + // Write taxid + if (write(file_descriptor, &(tax->taxa->taxon[i].taxid), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t))) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError writing in a binary taxonomy file"); + close(file_descriptor); + return -1; + } + // Write rank index + if (write(file_descriptor, &(tax->taxa->taxon[i].rank), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t))) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError writing in a binary taxonomy file"); + close(file_descriptor); + return -1; + } + // Write parent index + if (write(file_descriptor, &((tax->taxa->taxon[i].parent)->idx), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t))) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError writing in a binary taxonomy file"); + close(file_descriptor); + return -1; + } + // Write name length + if (write(file_descriptor, &name_length, sizeof(int32_t)) < ((ssize_t) sizeof(int32_t))) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError writing in a binary taxonomy file"); + close(file_descriptor); + return -1; + } + // Write name + if (write(file_descriptor, tax->taxa->taxon[i].name, name_length) < ((ssize_t) name_length)) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError writing in a binary taxonomy file"); + close(file_descriptor); + return -1; + } + } + + // Close file + if (close(file_descriptor) < 0) + { + obi_set_errno(OBIDMS_UNKNOWN_ERROR); + obidebug(1, "\nError closing a DMS information file"); + return -1; + } + + return 0; +} + + +int write_local_taxonomy_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? keep argument but if NULL, use the one in struct? +{ + int i; + char* file_name; + int file_descriptor; + off_t file_size; + char* taxonomy_path; + int32_t name_length; + int32_t record_size; + + // Compute file size + file_size = sizeof(int32_t); // To store record count + for (i=(tax->taxa)->ncbi_count; i < (tax->taxa)->count; i++) + { + file_size = file_size + sizeof(int32_t) * 5; // To store record size, taxid, rank index, parent index, and name length + file_size = file_size + strlen(tax->taxa->taxon[i].name); // To store name + } + + // Build the taxonomy directory path + taxonomy_path = get_taxonomy_path(dms, taxonomy_name); + + file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 5)*sizeof(char)); + if (file_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating the memory for a binary taxonomy file name"); + return -1; + } + + // Build the file path + if (sprintf(file_name, "%s/%s.ldx", taxonomy_path, taxonomy_name) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError building a binary taxonomy file name"); + return -1; + } + + free(taxonomy_path); + + // Create file + file_descriptor = open(file_name, O_RDWR | O_CREAT, 0777); + if (file_descriptor < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError creating a binary taxonomy file"); + free(file_name); + return -1; + } + + free(file_name); + + // Truncate the file to the right size + if (ftruncate(file_descriptor, file_size) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError truncating a binary taxonomy file"); + close(file_descriptor); + return -1; + } + + // Write record count + if (write(file_descriptor, &((tax->taxa)->local_count), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t))) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError writing in a binary taxonomy file"); + close(file_descriptor); + return -1; + } + + // Write records + for (i=(tax->taxa)->ncbi_count; i < (tax->taxa)->count; i++) { name_length = strlen(tax->taxa->taxon[i].name); record_size = 4*sizeof(int32_t) + name_length; @@ -1274,12 +1024,15 @@ int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name return -1; if (write_nameidx(dms, tax, tax_name) < 0) return -1; + // Check if there are local taxa (if so last taxon is local) + if ((tax->taxa)->local_count > 0) + if (write_local_taxonomy_idx(dms, tax, tax_name) < 0) + return -1; return 0; } - OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { OBIDMS_taxonomy_p tax; @@ -1312,6 +1065,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) tax->taxa = NULL; tax->names = NULL; + tax->dms = NULL; + (tax->tax_name)[0] = '\0'; + // TODO check if taxdump path is for a gz file to unzip or a directory tax_dir = opendir(taxdump); @@ -1533,6 +1289,8 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) // Store count (tax->taxa)->count = n; + (tax->taxa)->ncbi_count = n; + (tax->taxa)->local_count = 0; // Truncate the structure memory to the right size tax->taxa = (ecotxidx_t*) realloc(tax->taxa, sizeof(ecotxidx_t) + sizeof(ecotx_t) * (tax->taxa)->count); @@ -1914,7 +1672,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) ((tax->taxa)->taxon)[i].parent = obi_taxo_get_taxon_with_taxid(tax, parent_taxids[i]); if (((tax->taxa)->taxon)[i].parent == NULL) { - obi_set_errno(OBI_MALLOC_ERROR); + obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError: taxon parent not found"); obi_close_taxonomy(tax); free(parent_taxids); @@ -1924,7 +1682,6 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) (((tax->taxa)->taxon)[i].parent)->farest = 0; } - // TODO what is this for??? (tax->taxa)->buffer_size = (tax->taxa)->count; // Compute longest branches TODO what is this for??? @@ -1958,5 +1715,491 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) } +int obi_taxonomy_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const char* rank_name, int32_t parent_taxid, int32_t min_taxid) +{ + int32_t taxid; + ecotx_t* taxon; + econame_t* name_struct; + int i; + + // Enlarge the structure memory for a new taxon + tax->taxa = (ecotxidx_t*) realloc(tax->taxa, sizeof(ecotxidx_t) + sizeof(ecotx_t) * (((tax->taxa)->count) + 1)); + if (tax->taxa == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating memory for a taxonomy structure to add a new taxon"); + return -1; + } + + // Compute new taxid that must be equal or greater than 1E7 and greater than the maximum taxid existing in the taxonomy + if (min_taxid < MIN_LOCAL_TAXID) + min_taxid = MIN_LOCAL_TAXID; + if (min_taxid > (tax->taxa)->max_taxid) + taxid = min_taxid; + else + taxid = ((tax->taxa)->max_taxid) + 1; + + // Fill the ecotx_t node structure + taxon = ((tax->taxa)->taxon)+((tax->taxa)->count); + taxon->taxid = taxid; + taxon->idx = (tax->taxa)->count; + taxon->local = true; + taxon->name = (char*) malloc((strlen(name) + 1) * sizeof(char)); + if (taxon->name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a taxon name to add a new taxon"); + return -1; + } + strcpy(taxon->name, name); + taxon->rank = -1; + for (i=0; i < (tax->ranks)->count; i++) + { + if (strcmp(rank_name, ((tax->ranks)->label)[i]) == 0) + { + taxon->rank = i; + break; + } + } + if (taxon->rank == -1) // TODO Discuss possibility of creating rank if doesn't exist + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError: taxon rank not found when adding a new taxon"); + return -1; + } + taxon->parent = obi_taxo_get_taxon_with_taxid(tax, parent_taxid); + if (taxon->parent == NULL) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError: taxon parent not found when adding a new taxon"); + return -1; + } + taxon->farest = 0; // TODO not sure + + // Update taxonomy counts etc + (tax->taxa)->max_taxid = taxid; + ((tax->taxa)->count)++; + ((tax->taxa)->local_count)++; + (tax->taxa)->buffer_size = (tax->taxa)->count; + + // Add new name in names structure + // Allocate memory for new name + tax->names = (econameidx_t*) realloc(tax->names, sizeof(econameidx_t) + sizeof(econame_t) * ((tax->names)->count + 1)); + if (tax->names == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating memory for a taxonomy structure to add a new taxon"); + return -1; + } + + // Add new name + name_struct = (tax->names)->names + ((tax->names)->count); + name_struct->name = (char*) malloc((strlen(name) + 1) * sizeof(char)); + if (name_struct->name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a taxon name to add a new taxon"); + return -1; + } + strcpy(name_struct->name, name); + name_struct->class_name = (char*) malloc((strlen("scientific name") + 1) * sizeof(char)); + if (name_struct->class_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a taxon class name to add a new taxon"); + return -1; + } + strcpy(name_struct->class_name, "scientific name"); + name_struct->is_scientific_name = true; + name_struct->taxon = ((tax->taxa)->taxon) + ((tax->taxa)->count) - 1; + + // Sort names in alphabetical order + qsort((tax->names)->names, (tax->names)->count, sizeof(econame_t), cmp_names); + + // Update name count + ((tax->names)->count)++; + + return taxid; +} + + +/////// PUBLIC ///////// + + +OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, bool read_alternative_names) +{ + OBIDMS_taxonomy_p tax; + char* taxonomy_path; + char* ranks_file_name; + char* taxa_file_name; + char* local_taxa_file_name; + char* alter_names_file_name; + int buffer_size; + + tax = (OBIDMS_taxonomy_p) malloc(sizeof(OBIDMS_taxonomy_t)); + if (tax == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a taxonomy structure"); + return NULL; + } + + tax->ranks = NULL; + tax->taxa = NULL; + tax->names = NULL; + + tax->dms = dms; + + strcpy(tax->tax_name, taxonomy_name); + + buffer_size = 2048; + + taxonomy_path = get_taxonomy_path(dms, taxonomy_name); + if (taxonomy_path == NULL) + return NULL; + + // Read ranks + ranks_file_name = (char*) malloc(buffer_size*sizeof(char)); + if (ranks_file_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for ranks file name"); + free(taxonomy_path); + free(tax); + return NULL; + } + if (snprintf(ranks_file_name, buffer_size, "%s/%s.rdx", taxonomy_path, taxonomy_name) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError building ranks file name"); + free(taxonomy_path); + free(ranks_file_name); + free(tax); + return NULL; + } + tax->ranks = read_rankidx(ranks_file_name); + if (tax->ranks == NULL) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError building ranks file name"); + free(taxonomy_path); + free(ranks_file_name); + free(tax); + return NULL; + } + free(ranks_file_name); + + // Read taxa + taxa_file_name = (char*) malloc(buffer_size*sizeof(char)); + if (taxa_file_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for taxa file name"); + free(taxonomy_path); + obi_close_taxonomy(tax); + return NULL; + } + if (snprintf(taxa_file_name, buffer_size, "%s/%s.tdx", taxonomy_path, taxonomy_name) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError building taxa file name"); + free(taxonomy_path); + free(taxa_file_name); + obi_close_taxonomy(tax); + return NULL; + } + local_taxa_file_name = (char*) malloc(buffer_size*sizeof(char)); + if (local_taxa_file_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for taxa file name"); + free(taxonomy_path); + free(taxa_file_name); + obi_close_taxonomy(tax); + return NULL; + } + if (snprintf(local_taxa_file_name, buffer_size, "%s/%s.ldx", taxonomy_path, taxonomy_name) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError building local taxa file name"); + free(taxonomy_path); + free(taxa_file_name); + free(local_taxa_file_name); + obi_close_taxonomy(tax); + return NULL; + } + tax->taxa = read_taxonomyidx(taxa_file_name, local_taxa_file_name); + if (tax->taxa == NULL) + { + free(taxonomy_path); + free(taxa_file_name); + free(local_taxa_file_name); + obi_close_taxonomy(tax); + return NULL; + } + free(taxa_file_name); + free(local_taxa_file_name); + + // Read alternative names + if (read_alternative_names) + { + alter_names_file_name = (char*) malloc(buffer_size*sizeof(char)); + if (alter_names_file_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for alternative names file name"); + free(taxonomy_path); + obi_close_taxonomy(tax); + return NULL; + } + if (snprintf(alter_names_file_name, buffer_size, "%s/%s.ndx", taxonomy_path, taxonomy_name) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError building alternative names file name"); + free(taxonomy_path); + free(alter_names_file_name); + obi_close_taxonomy(tax); + return NULL; + } + tax->names = read_nameidx(alter_names_file_name, tax); + if (tax->names == NULL) + { + free(taxonomy_path); + free(alter_names_file_name); + obi_close_taxonomy(tax); + return NULL; + } + free(alter_names_file_name); + } + + free(taxonomy_path); + + return tax; +} + + +int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy) +{ + int i; + + // Update local informations (local taxa and preferred names) if there are any + if ((taxonomy->taxa)->local_count > 0) + { + if (taxonomy->dms == NULL) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError closing a taxonomy with local files but no DMS associated (probably read directly from taxdump)"); // TODO discuss + } + if (write_local_taxonomy_idx(taxonomy->dms, taxonomy, taxonomy->tax_name) < 0) + return -1; + } + + if (taxonomy) + { + if (taxonomy->ranks) + { + for (i=0; i < (taxonomy->ranks)->count; i++) + { + if ((taxonomy->ranks)->label[i]) + free((taxonomy->ranks)->label[i]); + } + free(taxonomy->ranks); + } + + if (taxonomy->names) + { + for (i=0; i < (taxonomy->names)->count; i++) + { + if (((taxonomy->names)->names[i]).name) + free(((taxonomy->names)->names[i]).name); + if (((taxonomy->names)->names[i]).class_name) + free(((taxonomy->names)->names[i]).class_name); + } + free(taxonomy->names); + } + + if (taxonomy->taxa) + { + for (i=0; i < (taxonomy->taxa)->count; i++) + { + if (((taxonomy->taxa)->taxon[i]).name) + free(((taxonomy->taxa)->taxon[i]).name); + } + free(taxonomy->taxa); + } + + free(taxonomy); + } + + return 0; +} + + +////////////////////////////////////////////////////////////////////////// + + +ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx) +{ + ecotx_t* current_taxon; + ecotx_t* next_taxon; + + current_taxon = taxon; + next_taxon = current_taxon->parent; + + while ((current_taxon != next_taxon) && // root node + (current_taxon->rank != rankidx)) + { + current_taxon = next_taxon; + next_taxon = current_taxon->parent; + } + + if (current_taxon->rank == rankidx) + return current_taxon; + else + return NULL; +} + + +ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid) +{ + ecotx_t *current_taxon; + int32_t count; + + count = (taxonomy->taxa)->count; + + current_taxon = (ecotx_t*) bsearch((const void *) ((size_t) taxid), + (const void *) taxonomy->taxa->taxon, + count, + sizeof(ecotx_t), + cmp_taxids); + return current_taxon; +} + + +bool obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid) +{ + ecotx_t* next_parent; + + next_parent = taxon->parent; + + while ((other_taxid != next_parent->taxid) && (strcmp(next_parent->name, "root"))) + next_parent = next_parent->parent; + + if (other_taxid == next_parent->taxid) + return 1; + else + return 0; +} + + +ecotx_t* obi_taxo_get_species(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy) +{ + static OBIDMS_taxonomy_p tax = NULL; + static int32_t rankindex = -1; + + if (taxonomy && (tax != taxonomy)) + { + rankindex = rank_index("species", taxonomy->ranks); + tax = taxonomy; + } + + if (!tax || (rankindex < 0)) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError trying to get the species associated with a taxon: No taxonomy defined"); + return NULL; + } + + return obi_taxo_get_parent_at_rank(taxon, rankindex); +} + + +ecotx_t* obi_taxo_get_genus(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy) +{ + static OBIDMS_taxonomy_p tax = NULL; + static int32_t rankindex = -1; + + if (taxonomy && (tax != taxonomy)) + { + rankindex = rank_index("genus", taxonomy->ranks); + tax = taxonomy; + } + + if (!tax || (rankindex < 0)) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError trying to get the genus associated with a taxon: No taxonomy defined"); + return NULL; + } + + return obi_taxo_get_parent_at_rank(taxon, rankindex); +} + + +ecotx_t* obi_taxo_get_family(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy) +{ + static OBIDMS_taxonomy_p tax = NULL; + static int32_t rankindex = -1; + + if (taxonomy && (tax != taxonomy)) + { + rankindex = rank_index("family", taxonomy->ranks); + tax = taxonomy; + } + + if (!tax || (rankindex < 0)) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError trying to get the family associated with a taxon: No taxonomy defined"); + return NULL; + } + + return obi_taxo_get_parent_at_rank(taxon, rankindex); +} + + +ecotx_t* obi_taxo_get_kingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy) +{ + static OBIDMS_taxonomy_p tax = NULL; + static int32_t rankindex = -1; + + if (taxonomy && (tax != taxonomy)) + { + rankindex = rank_index("kingdom", taxonomy->ranks); + tax = taxonomy; + } + + if (!tax || (rankindex < 0)) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError trying to get the kingdom associated with a taxon: No taxonomy defined"); + return NULL; + } + + return obi_taxo_get_parent_at_rank(taxon, rankindex); +} + + +ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy) +{ + static OBIDMS_taxonomy_p tax = NULL; + static int32_t rankindex = -1; + + if (taxonomy && (tax != taxonomy)) + { + rankindex = rank_index("superkingdom", taxonomy->ranks); + tax = taxonomy; + } + + if (!tax || (rankindex < 0)) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError trying to get the superkingdom associated with a taxon: No taxonomy defined"); + return NULL; + } + + return obi_taxo_get_parent_at_rank(taxon, rankindex); +} + diff --git a/src/obidms_taxonomy.h b/src/obidms_taxonomy.h index 9331a17..e70e892 100644 --- a/src/obidms_taxonomy.h +++ b/src/obidms_taxonomy.h @@ -17,6 +17,10 @@ #include "obidms.h" +#define MIN_LOCAL_TAXID (10000000) +#define TAX_NAME_LEN (1024) + + typedef struct { int32_t taxid; int32_t rank; @@ -33,11 +37,14 @@ typedef struct ecotxnode { int32_t idx; struct ecotxnode* parent; char* name; + bool local; } ecotx_t; typedef struct { int32_t count; + int32_t ncbi_count; + int32_t local_count; int32_t max_taxid; int32_t buffer_size; ecotx_t taxon[1]; @@ -74,6 +81,8 @@ typedef struct { typedef struct OBIDMS_taxonomy_t { + char tax_name[TAX_NAME_LEN]; + OBIDMS_p dms; ecorankidx_t* ranks; econameidx_t* names; ecotxidx_t* taxa; @@ -109,3 +118,4 @@ int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump); +int obi_taxonomy_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const char* rank_name, int32_t parent_taxid, int32_t min_taxid);