diff --git a/python/obitools3/obidms/_obitaxo.pxd b/python/obitools3/obidms/_obitaxo.pxd index 91c3a5b..fba81e9 100644 --- a/python/obitools3/obidms/_obitaxo.pxd +++ b/python/obitools3/obidms/_obitaxo.pxd @@ -11,7 +11,7 @@ cdef class OBI_Taxonomy : cdef OBIDMS _dms cpdef close(self) - cpdef _write(self, str prefix) + cpdef write(self, str prefix) cdef class OBI_Taxon : diff --git a/python/obitools3/obidms/_obitaxo.pyx b/python/obitools3/obidms/_obitaxo.pyx index 0466892..dd82ecd 100644 --- a/python/obitools3/obidms/_obitaxo.pyx +++ b/python/obitools3/obidms/_obitaxo.pyx @@ -3,11 +3,11 @@ from obitools3.utils cimport bytes2str, str2bytes from .capi.obitaxonomy cimport obi_read_taxonomy, \ + obi_read_taxdump, \ + obi_write_taxonomy, \ obi_close_taxonomy, \ - obi_taxo_get_taxon_with_taxid, \ - write_rankidx, \ - write_taxonomyidx, \ - write_nameidx + obi_taxo_get_taxon_with_taxid + from ._obidms cimport OBIDMS @@ -19,11 +19,14 @@ cdef class OBI_Taxonomy : # TODO function to import taxonomy? - def __init__(self, OBIDMS dms, str name) : + def __init__(self, OBIDMS dms, str name, bint taxdump=False) : self._dms = dms self._name = name - self._pointer = obi_read_taxonomy(dms._pointer, str2bytes(name), True) # TODO discuss + if taxdump : + self._pointer = obi_read_taxdump(str2bytes(name)) + else : + self._pointer = obi_read_taxonomy(dms._pointer, str2bytes(name), True) # TODO discuss # TODO if not found in DMS, try to import? @@ -40,19 +43,31 @@ cdef class OBI_Taxonomy : raise Exception("Not implemented") +# def __iter__(self): +# +# cdef ecotx_t* taxa +# cdef ecotx_t* taxon_p +# cdef object taxon_capsule +# +# taxa = self._pointer.taxa.taxon +# +# # Yield each taxid +# for t in range(self._pointer.taxa.count): +# taxon_p = taxa+t # TODO not compiling for mysterious cython reasons +# taxon_capsule = PyCapsule_New(taxon_p, NULL, NULL) +# yield OBI_Taxon(taxon_capsule) + + + cpdef write(self, str prefix) : + if obi_write_taxonomy(self._dms._pointer, self._pointer, str2bytes(prefix)) < 0 : + raise Exception("Error writing the taxonomy to binary files") + + cpdef close(self) : if (obi_close_taxonomy(self._pointer) < 0) : raise Exception("Error closing the taxonomy") - cpdef _write(self, str prefix) : - if (write_rankidx(self._dms._pointer, self._pointer, str2bytes(prefix)) < 0) : - raise Exception("Error writing the taxonomy rank file") - if (write_taxonomyidx(self._dms._pointer, self._pointer, str2bytes(prefix)) < 0) : - raise Exception("Error writing the taxonomy taxa file") - if (write_nameidx(self._dms._pointer, self._pointer, str2bytes(prefix)) < 0) : - raise Exception("Error writing the taxonomy taxa file") - cdef class OBI_Taxon : # TODO dict subclass? diff --git a/python/obitools3/obidms/capi/obitaxonomy.pxd b/python/obitools3/obidms/capi/obitaxonomy.pxd index aa61332..56bffaf 100644 --- a/python/obitools3/obidms/capi/obitaxonomy.pxd +++ b/python/obitools3/obidms/capi/obitaxonomy.pxd @@ -7,9 +7,6 @@ from libc.stdint cimport int32_t cdef extern from "obidms_taxonomy.h" nogil: - struct OBIDMS_taxonomy_t - ctypedef OBIDMS_taxonomy_t* OBIDMS_taxonomy_p - struct ecotxnode : int32_t taxid int32_t rank @@ -20,8 +17,27 @@ cdef extern from "obidms_taxonomy.h" nogil: ctypedef ecotxnode ecotx_t - OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const_char_p taxonomy_name, bint read_alternative_names) + struct ecotxidx_t : + int32_t count + int32_t max_taxid + int32_t buffer_size + ecotx_t* taxon + + + struct OBIDMS_taxonomy_t : +# ecorankidx_t* ranks +# econameidx_t* names + ecotxidx_t* taxa + + ctypedef OBIDMS_taxonomy_t* OBIDMS_taxonomy_p + + OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const_char_p taxonomy_name, bint read_alternative_names) + + OBIDMS_taxonomy_p obi_read_taxdump(const_char_p taxdump) + + int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const_char_p tax_name) + int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy) ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx) @@ -40,7 +56,3 @@ cdef extern from "obidms_taxonomy.h" nogil: ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy) - - int write_rankidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const_char_p taxonomy_name) - int write_taxonomyidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const_char_p taxonomy_name) - int write_nameidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const_char_p taxonomy_name) diff --git a/src/obidms_taxonomy.c b/src/obidms_taxonomy.c index a0cb518..1a026c4 100644 --- a/src/obidms_taxonomy.c +++ b/src/obidms_taxonomy.c @@ -15,10 +15,12 @@ #include #include #include +#include +#include +#include #include "obidms_taxonomy.h" #include "obidms.h" -#include "obilittlebigman.h" // TODO the function from this checking the endianness does not seem to work properly #include "obidebug.h" #include "obierrno.h" #include "utils.h" @@ -26,38 +28,54 @@ #define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) -// TODO : the malloc aren't checked but shouldn't exist for long because mapping instead -// error checking and file closing in general aren't done properly yet -// The endianness eventually shouldn't need checking too, as the machine will write the taxonomy with its endianness. - -int32_t is_big_endian() -{ - int32_t i=1; - - return (int32_t)((char*)&i)[0]; -} - - -int32_t swap_int32_t(int32_t i) -{ - return SWAPINT32(i); -} - - -int compareRankLabel(const void *label1, const void *label2) +int cmp_rank_labels(const void* label1, const void* label2) { return strcmp((const char*)label1,*(const char**)label2); } +static int cmp_taxids(const void* ptaxid, const void* ptaxon) +{ + ecotx_t* current_taxon = (ecotx_t*) ptaxon; + int32_t taxid = (int32_t) ((size_t) ptaxid); + return taxid - current_taxon->taxid; +} + + +static int cmp_str(const void* s1, const void* s2) +{ + return strcmp(*((char**)s1), *((char**)s2)); +} + + +static int cmp_names(const void* n1, const void* n2) +{ + econame_t name1 = *((econame_t*)n1); + econame_t name2 = *((econame_t*)n2); + + return strcmp(name1.name, name2.name); +} + + char* get_taxonomy_path(OBIDMS_p dms, const char* tax_name) { char* all_tax_dir_path; char* tax_path; all_tax_dir_path = obi_dms_get_full_path(dms, TAXONOMY_DIR_NAME); + if (all_tax_dir_path == NULL) + return NULL; + tax_path = (char*) malloc((strlen(all_tax_dir_path) + strlen(tax_name) + 2)*sizeof(char)); + if (tax_path == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for taxonomy path"); + free(all_tax_dir_path); + return NULL; + } + if (sprintf(tax_path, "%s/%s", all_tax_dir_path, tax_name) < 0) { obi_set_errno(OBI_TAXONOMY_ERROR); @@ -76,7 +94,7 @@ int32_t rank_index(const char* label, ecorankidx_t* ranks) { char **rep; - rep = bsearch(label, ranks->label, ranks->count, sizeof(char*), compareRankLabel); + rep = bsearch(label, ranks->label, ranks->count, sizeof(char*), cmp_rank_labels); if (rep) return rep-ranks->label; @@ -104,7 +122,11 @@ void* read_ecorecord(FILE* f, int32_t* record_size) f); if (feof(f)) - return NULL; + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError reading a taxonomy file: reached end of file"); + return NULL; + } if (read != 1) { @@ -113,7 +135,7 @@ void* read_ecorecord(FILE* f, int32_t* record_size) return NULL; } -// if (!(obi_is_little_endian())) // TODO +// if (!(obi_is_little_endian())) // TODO note: keeping for now for testing purposes // if (is_big_endian()) // *record_size=swap_int32_t(*record_size); @@ -125,7 +147,7 @@ void* read_ecorecord(FILE* f, int32_t* record_size) buffer = malloc(*record_size); if (buffer == NULL) { - obi_set_errno(OBI_TAXONOMY_ERROR); + obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError reading a taxonomy file: error allocating memory"); return NULL; } @@ -140,6 +162,7 @@ void* read_ecorecord(FILE* f, int32_t* record_size) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError reading a taxonomy file: error reading a record %d, %d", read, *record_size); + free(buffer); return NULL; } @@ -153,7 +176,6 @@ ecotx_t* readnext_ecotaxon(FILE* f, ecotx_t* taxon) int32_t record_length; raw = read_ecorecord(f, &record_length); - if (!raw) return NULL; @@ -172,8 +194,15 @@ ecotx_t* readnext_ecotaxon(FILE* f, ecotx_t* taxon) taxon->farest = -1; taxon->name = malloc((raw->name_length+1) * sizeof(char)); + if (taxon->name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reading a taxonomy file: error allocating memory"); + return NULL; + } strncpy(taxon->name, raw->name, raw->name_length); + taxon->name[raw->name_length] = 0; // TODO note: this line is probably missing in ROBITaxonomy and source of a bug return taxon; } @@ -194,11 +223,13 @@ FILE* open_ecorecorddb(const char* file_name, { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nCouldn't open a taxonomy file"); + fclose(f); return NULL; } else { *count = 0; + fclose(f); return NULL; } } @@ -212,6 +243,7 @@ FILE* open_ecorecorddb(const char* file_name, { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError reading taxonomy record size"); + fclose(f); return NULL; } @@ -233,22 +265,47 @@ ecorankidx_t* read_rankidx(const char* ranks_file_name) char* buffer; ranks_file = open_ecorecorddb(ranks_file_name, &count, 0); - if (ranks_file==NULL) return NULL; ranks_index = (ecorankidx_t*) malloc(sizeof(ecorankidx_t) + sizeof(char*) * (count-1)); + if (ranks_index == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for taxonomy rank structure"); + fclose(ranks_file); + return NULL; + } ranks_index->count = count; for (i=0; i < count; i++) { buffer = read_ecorecord(ranks_file, &rank_length); + if (buffer == NULL) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError reading a value in a taxonomy file"); + fclose(ranks_file); + free(ranks_index); + return NULL; + } ranks_index->label[i] = (char*) malloc(rank_length+1); + if (ranks_index->label[i] == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for taxonomy rank label"); + fclose(ranks_file); + free(ranks_index); + free(buffer); + return NULL; + } strncpy(ranks_index->label[i], buffer, rank_length); (ranks_index->label[i])[rank_length] = 0; } + fclose(ranks_file); + return ranks_index; } @@ -264,8 +321,7 @@ ecotxidx_t* read_taxonomyidx(const char* taxa_file_name, const char* local_taxa_ int32_t i; int32_t j; - f_taxa = open_ecorecorddb(taxa_file_name, &count_taxa,0); - + f_taxa = open_ecorecorddb(taxa_file_name, &count_taxa, 1); if (f_taxa == NULL) { obidebug(1, "\nError reading taxonomy taxa file"); @@ -275,6 +331,14 @@ ecotxidx_t* read_taxonomyidx(const char* taxa_file_name, const char* local_taxa_ f_local_taxa = open_ecorecorddb(local_taxa_file_name, &count_local_taxa, 0); taxa_index = (ecotxidx_t*) malloc(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count_taxa + count_local_taxa - 1)); + if (taxa_index == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for taxonomy structure"); + fclose(f_taxa); + fclose(f_local_taxa); + return NULL; + } taxa_index->count = count_taxa + count_local_taxa; taxa_index->buffer_size = taxa_index->count; @@ -328,20 +392,22 @@ ecotxidx_t* read_taxonomyidx(const char* taxa_file_name, const char* local_taxa_ } } + fclose(f_taxa); + if (f_local_taxa != NULL) + fclose(f_local_taxa); + return taxa_index; } econame_t* readnext_econame(FILE* f, econame_t* name, OBIDMS_taxonomy_p taxonomy) { - econameformat_t* raw; - int32_t record_length; + int32_t record_length; raw = read_ecorecord(f, &record_length); - - if (!raw) - return NULL; + if (raw == NULL) + return NULL; // if (!(obi_is_little_endian())) // TODO // if (is_big_endian()) @@ -355,10 +421,25 @@ econame_t* readnext_econame(FILE* f, econame_t* name, OBIDMS_taxonomy_p taxonomy name->is_scientific_name = raw->is_scientific_name; name->name = malloc((raw->name_length + 1) * sizeof(char)); + if (name->name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a taxon name"); + free(raw); + return NULL; + } strncpy(name->name, raw->names, raw->name_length); name->name[raw->name_length] = 0; name->class_name = malloc((raw->class_length+1) * sizeof(char)); + if (name->class_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a taxon class name"); + free(name->name); + free(raw); + return NULL; + } strncpy(name->class_name,(raw->names + raw->name_length), raw->class_length); name->class_name[raw->class_length] = 0; @@ -376,29 +457,40 @@ econameidx_t* read_nameidx(const char *file_name, OBIDMS_taxonomy_p taxonomy) int32_t i; f = open_ecorecorddb(file_name, &count, 0); - if (f == NULL) - return NULL; + { + obidebug(1, "\nError reading taxonomy name file"); + return NULL; + } index_names = (econameidx_t*) malloc(sizeof(econameidx_t) + sizeof(econame_t) * (count-1)); + if (index_names == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reading taxonomy name file"); + return NULL; + } index_names->count = count; for (i=0; i < count; i++) + { readnext_econame(f, (index_names->names)+i, taxonomy); + if ((index_names->names)+i == NULL) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError reading taxonomy name file"); + free(index_names); + return NULL; + } + } + + fclose(f); return index_names; } -static int bcomptaxon (const void* ptaxid, const void* ptaxon) -{ - ecotx_t* current_taxon = (ecotx_t*) ptaxon; - int32_t taxid = (int32_t) ((size_t) ptaxid); - return taxid - current_taxon->taxid; -} - - /////// PUBLIC ///////// @@ -413,35 +505,50 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo int buffer_size; tax = (OBIDMS_taxonomy_p) malloc(sizeof(OBIDMS_taxonomy_t)); + if (tax == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a taxonomy structure"); + return NULL; + } tax->ranks = NULL; tax->taxa = NULL; tax->names = NULL; - buffer_size = 2048; // TODO + buffer_size = 2048; taxonomy_path = get_taxonomy_path(dms, taxonomy_name); + if (taxonomy_path == NULL) + return NULL; // Read ranks ranks_file_name = (char*) malloc(buffer_size*sizeof(char)); if (ranks_file_name == NULL) { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for ranks file name"); free(taxonomy_path); - obi_close_taxonomy(tax); + free(tax); return NULL; } if (snprintf(ranks_file_name, buffer_size, "%s/%s.rdx", taxonomy_path, taxonomy_name) < 0) { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError building ranks file name"); free(taxonomy_path); free(ranks_file_name); - obi_close_taxonomy(tax); + free(tax); return NULL; } tax->ranks = read_rankidx(ranks_file_name); if (tax->ranks == NULL) { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError building ranks file name"); + free(taxonomy_path); free(ranks_file_name); - obi_close_taxonomy(tax); + free(tax); return NULL; } free(ranks_file_name); @@ -450,12 +557,16 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo taxa_file_name = (char*) malloc(buffer_size*sizeof(char)); if (taxa_file_name == NULL) { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for taxa file name"); free(taxonomy_path); obi_close_taxonomy(tax); return NULL; } if (snprintf(taxa_file_name, buffer_size, "%s/%s.tdx", taxonomy_path, taxonomy_name) < 0) { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError building taxa file name"); free(taxonomy_path); free(taxa_file_name); obi_close_taxonomy(tax); @@ -464,6 +575,8 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo local_taxa_file_name = (char*) malloc(buffer_size*sizeof(char)); if (local_taxa_file_name == NULL) { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for taxa file name"); free(taxonomy_path); free(taxa_file_name); obi_close_taxonomy(tax); @@ -471,6 +584,8 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo } if (snprintf(local_taxa_file_name, buffer_size, "%s/%s.ldx", taxonomy_path, taxonomy_name) < 0) { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError building local taxa file name"); free(taxonomy_path); free(taxa_file_name); free(local_taxa_file_name); @@ -495,12 +610,16 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo alter_names_file_name = (char*) malloc(buffer_size*sizeof(char)); if (alter_names_file_name == NULL) { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for alternative names file name"); free(taxonomy_path); obi_close_taxonomy(tax); return NULL; } if (snprintf(alter_names_file_name, buffer_size, "%s/%s.ndx", taxonomy_path, taxonomy_name) < 0) { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError building alternative names file name"); free(taxonomy_path); free(alter_names_file_name); obi_close_taxonomy(tax); @@ -509,9 +628,10 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo tax->names = read_nameidx(alter_names_file_name, tax); if (tax->names == NULL) { - free(alter_names_file_name); - obi_close_taxonomy(tax); - return NULL; + free(taxonomy_path); + free(alter_names_file_name); + obi_close_taxonomy(tax); + return NULL; } free(alter_names_file_name); } @@ -524,25 +644,46 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy) { + int i; + if (taxonomy) { if (taxonomy->ranks) - free(taxonomy->ranks); // TODO those don't free everything + { + for (i=0; i < (taxonomy->ranks)->count; i++) + { + if ((taxonomy->ranks)->label[i]) + free((taxonomy->ranks)->label[i]); + } + free(taxonomy->ranks); + } if (taxonomy->names) + { + for (i=0; i < (taxonomy->names)->count; i++) + { + if (((taxonomy->names)->names[i]).name) + free(((taxonomy->names)->names[i]).name); + if (((taxonomy->names)->names[i]).class_name) + free(((taxonomy->names)->names[i]).class_name); + } free(taxonomy->names); + } if (taxonomy->taxa) + { + for (i=0; i < (taxonomy->taxa)->count; i++) + { + if (((taxonomy->taxa)->taxon[i]).name) + free(((taxonomy->taxa)->taxon[i]).name); + } free(taxonomy->taxa); + } free(taxonomy); - - return 0; } - // close files - - return 1; + return 0; } @@ -582,7 +723,7 @@ ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid (const void *) taxonomy->taxa->taxon, count, sizeof(ecotx_t), - bcomptaxon); + cmp_taxids); return current_taxon; } @@ -760,7 +901,7 @@ int write_rankidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name if (file_descriptor < 0) { obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nError creating a binary taxonomy file"); + obidebug(1, "\nError creating a binary taxonomy file %s", file_name); free(file_name); return -1; } @@ -1096,7 +1237,7 @@ int write_nameidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name // Close file if (close(file_descriptor) < 0) { - obi_set_errno(OBIDMS_UNKNOWN_ERROR); + obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError closing a DMS information file"); return -1; } @@ -1104,3 +1245,718 @@ int write_nameidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name return 0; } + + +int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name) +{ + char* taxonomy_path; + + // Build the taxonomy directory path + taxonomy_path = get_taxonomy_path(dms, tax_name); + if (taxonomy_path == NULL) + return -1; + + // Try to create the directory + if (mkdir(taxonomy_path, 00777) < 0) + { + if (errno == EEXIST) + obidebug(1, "\nA taxonomy already exists with this name."); + obidebug(1, "\nProblem creating a new taxonomy directory"); + free(taxonomy_path); + return -1; + } + + free(taxonomy_path); + + if (write_rankidx(dms, tax, tax_name) < 0) + return -1; + if (write_taxonomyidx(dms, tax, tax_name) < 0) + return -1; + if (write_nameidx(dms, tax, tax_name) < 0) + return -1; + + return 0; +} + + + +OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) +{ + OBIDMS_taxonomy_p tax; + struct dirent* dp; + DIR* tax_dir; + FILE* file; + bool nodes_found=false; + bool names_found=false; + char line[2048]; // TODO large enough? + char* elt; + char* file_name; + int buffer_size; + int i, j; + int n; + char** rank_names; + int* parent_taxids; + int taxid; + bool already_in; + ecotx_t* t; + + // Initialize taxonomy structure + tax = (OBIDMS_taxonomy_p) malloc(sizeof(OBIDMS_taxonomy_t)); + if (tax == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating the memory for a taxonomy structure"); + return NULL; + } + tax->ranks = NULL; + tax->taxa = NULL; + tax->names = NULL; + + // TODO check if taxdump path is for a gz file to unzip or a directory + + tax_dir = opendir(taxdump); + if (tax_dir == NULL) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nProblem opening a taxdump directory"); + free(tax); + return NULL; + } + + // Go through taxonomy files + while ((dp = readdir(tax_dir)) != NULL) + { + if (strcmp(dp->d_name, "nodes.dmp") == 0) + { + nodes_found = true; + buffer_size = 10000; + + // Initializing the taxa structure + tax->taxa = (ecotxidx_t*) malloc(sizeof(ecotxidx_t) + sizeof(ecotx_t) * buffer_size); + if (tax->taxa == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating the memory for a taxonomy structure"); + free(tax); + closedir(tax_dir); + return NULL; + } + + // Initialize rank names and parent taxids arrays + parent_taxids = malloc(buffer_size * sizeof(int)); + if (file_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating the memory for a file name"); + obi_close_taxonomy(tax); + closedir(tax_dir); + return NULL; + } + + rank_names = malloc(buffer_size * sizeof(char*)); + if (file_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating the memory for a file name"); + obi_close_taxonomy(tax); + free(parent_taxids); + closedir(tax_dir); + return NULL; + } + + // Allocating the memory for the file name + file_name = (char*) malloc((strlen(taxdump) + 10)*sizeof(char)); + if (file_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating the memory for a file name"); + obi_close_taxonomy(tax); + free(parent_taxids); + free(rank_names); + closedir(tax_dir); + return NULL; + } + + // Build the file path + if (sprintf(file_name, "%s/nodes.dmp", taxdump) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError building a taxonomy file name"); + obi_close_taxonomy(tax); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + free(file_name); + return NULL; + } + + file = fopen(file_name, "r"); + if (file == NULL) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nProblem opening a taxonomy file"); + obi_close_taxonomy(tax); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + free(file_name); + return NULL; + } + + free(file_name); + + (tax->taxa)->max_taxid = 0; + n = 0; + while (fgets(line, sizeof(line), file)) + { + // Enlarge structures if needed + if (n == buffer_size) + { + buffer_size = buffer_size * 2; + + tax->taxa = (ecotxidx_t*) realloc(tax->taxa, sizeof(ecotxidx_t) + sizeof(ecotx_t) * buffer_size); + if (tax->taxa == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating memory for a taxonomy structure"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + return NULL; + } + + parent_taxids = (int*) realloc(parent_taxids, sizeof(int) * buffer_size); + if (parent_taxids == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating memory for a taxonomy structure"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + return NULL; + } + + rank_names = (char**) realloc(rank_names, sizeof(char*) * buffer_size); + if (rank_names == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating memory for a taxonomy structure"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + return NULL; + } + } + + // Check for terminal '\n' character (line complete) + if (line[strlen(line) - 1] != '\n') + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError: line buffer size not large enough for line in taxonomy file"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + return NULL; + } + + (tax->taxa)->taxon[n].idx = n; + + // Parse 3 first elements separated by '|' + + elt = strtok(line, "|"); + + // Remove the last character (tab character) + elt[strlen(elt)-1] = '\0'; + + // First element: taxid + (tax->taxa)->taxon[n].taxid = atoi(elt); + + // Update max taxid + if ((tax->taxa)->taxon[n].taxid > (tax->taxa)->max_taxid) + (tax->taxa)->max_taxid = (tax->taxa)->taxon[n].taxid; + + // Initialize farest taxid value + (tax->taxa)->taxon[n].farest = -1; + + i = 1; + while (i < 3) + { + elt = strtok(NULL, "|"); + + // Remove the first and the last characters (tab characters) + elt = elt+1; + elt[strlen(elt)-1] = '\0'; + + if (i == 1) + parent_taxids[n] = atoi(elt); + else if (i == 2) + { + rank_names[n] = (char*) malloc((strlen(elt)+1) * sizeof(char)); + if (rank_names[n] == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for taxon rank name"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + return NULL; + } + strcpy(rank_names[n], elt); + } + i++; + } + n++; + } + + // Check that fgets stopped because it reached EOF + if (!feof(file)) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError: file reading was stopped before end of file"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + return NULL; + } + + // Store count + (tax->taxa)->count = n; + + // Truncate the structure memory to the right size + tax->taxa = (ecotxidx_t*) realloc(tax->taxa, sizeof(ecotxidx_t) + sizeof(ecotx_t) * (tax->taxa)->count); + if (tax->taxa == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating memory for taxonomy structure"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + return NULL; + } + + fclose(file); + } + } + closedir(tax_dir); + + + // Go through directory again for next file + tax_dir = opendir(taxdump); + if (tax_dir == NULL) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nProblem opening a taxdump directory"); + obi_close_taxonomy(tax); + free(parent_taxids); + free(rank_names); + return NULL; + } + + // Go through taxonomy files + while ((dp = readdir(tax_dir)) != NULL) + { + if (strcmp(dp->d_name, "names.dmp") == 0) + { + names_found = true; + buffer_size = 10000; + + // Initializing the names structure + tax->names = (econameidx_t*) malloc(sizeof(econameidx_t) + sizeof(econame_t) * buffer_size); + if (tax->names == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating the memory for a taxonomy structure"); + obi_close_taxonomy(tax); + free(parent_taxids); + free(rank_names); + closedir(tax_dir); + return NULL; + } + + // Allocating the memory for the file name + file_name = (char*) malloc((strlen(taxdump) + 10)*sizeof(char)); + if (file_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating the memory for a file name"); + obi_close_taxonomy(tax); + free(parent_taxids); + free(rank_names); + closedir(tax_dir); + return NULL; + } + + // Build the file path + if (sprintf(file_name, "%s/names.dmp", taxdump) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError building a taxonomy file name"); + obi_close_taxonomy(tax); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + free(file_name); + return NULL; + } + + file = fopen(file_name, "r"); + if (file == NULL) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nProblem opening a taxonomy file"); + obi_close_taxonomy(tax); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + free(file_name); + return NULL; + } + + free(file_name); + + n = 0; + j = 0; + while (fgets(line, sizeof(line), file)) + { + // Enlarge structures if needed + if (n == buffer_size) + { + buffer_size = buffer_size * 2; + tax->names = (econameidx_t*) realloc(tax->names, sizeof(econameidx_t) + sizeof(econame_t) * buffer_size); + if (tax->names == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating memory for a taxonomy structure"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + return NULL; + } + } + + // Check for terminal '\n' character (line complete) + if (line[strlen(line) - 1] != '\n') + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError: line buffer size not large enough for line in taxonomy file"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + return NULL; + } + + // Parse 4 first elements separated by '|' + + elt = strtok(line, "|"); + + // Remove the last character (tab character) + elt[strlen(elt)-1] = '\0'; + + // First element: taxid + taxid = atoi(elt); + // Find taxid in taxa structure and store pointer in names structure + i = j; + while ((i < (tax->taxa)->count) && ((tax->taxa)->taxon[i].taxid != taxid)) + i++; + if (i == (tax->taxa)->count) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError: could not find taxon associated to name when reading taxdump"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + return NULL; + } + j = i; // Because there are several names by taxon but they are in the same order + (tax->names)->names[n].taxon = ((tax->taxa)->taxon)+i; + + i = 1; + while (i < 4) + { + elt = strtok(NULL, "|"); + + // Remove the first and the last characters (tab characters) + elt = elt+1; + elt[strlen(elt)-1] = '\0'; + + if (i == 1) // Name + { + (tax->names)->names[n].name = (char*) malloc((strlen(elt) + 1) * sizeof(char)); + if ((tax->names)->names[n].name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a taxon name"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + return NULL; + } + strcpy((tax->names)->names[n].name, elt); + } + else if (i == 3) // Class name + { + (tax->names)->names[n].class_name = (char*) malloc((strlen(elt) + 1) * sizeof(char)); + if ((tax->names)->names[n].class_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a taxon class name"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + return NULL; + } + strcpy((tax->names)->names[n].class_name, elt); + if (strcmp(elt, "scientific name") == 0) + { + (tax->names)->names[n].is_scientific_name = 1; + } + else + (tax->names)->names[n].is_scientific_name = 0; + } + i++; + } + n++; + } + + // Check that fgets stopped because it reached EOF + if (!feof(file)) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError: file reading was stopped before end of file"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + return NULL; + } + + // Store count + (tax->names)->count = n; + + // Truncate the structure memory to the right size + tax->names = (econameidx_t*) realloc(tax->names, sizeof(econameidx_t) + sizeof(econame_t) * (tax->names)->count); + if (tax->names == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating memory for a a taxonomy structure"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + return NULL; + } + + fclose(file); + } + } + closedir(tax_dir); + + if (!nodes_found) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nProblem reading taxdump: nodes.dmp file not found"); + obi_close_taxonomy(tax); + free(parent_taxids); + free(rank_names); + return NULL; + } + if (!names_found) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nProblem reading taxdump: names.dmp file not found"); + obi_close_taxonomy(tax); + free(parent_taxids); + free(rank_names); + return NULL; + } + + // Go through data to fill the taxonomy structure + + // Build rank list + + // Initialize rank structure + buffer_size = 10; + tax->ranks = (ecorankidx_t*) malloc(sizeof(ecorankidx_t) + sizeof(char*) * buffer_size); + if (tax->ranks == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for taxon rank array"); + obi_close_taxonomy(tax); + free(parent_taxids); + free(rank_names); + return NULL; + } + (tax->ranks)->count = 0; + for (i=0; i < (tax->taxa)->count; i++) + { + already_in = false; + for (j=0; j < (tax->ranks)->count; j++) + { + if (strcmp(rank_names[i], ((tax->ranks)->label)[j]) == 0) + { + already_in = true; + break; + } + } + if (!already_in) + { + // Realloc rank structure if needed + if ((tax->ranks)->count == buffer_size) + { + buffer_size = buffer_size + 10; + tax->ranks = (ecorankidx_t*) realloc(tax->ranks, sizeof(ecorankidx_t) + sizeof(char*) * buffer_size); + if (tax->ranks == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating memory for taxon ranks"); + obi_close_taxonomy(tax); + free(parent_taxids); + free(rank_names); + return NULL; + } + } + + // Store new rank + ((tax->ranks)->label)[(tax->ranks)->count] = (char*) malloc((strlen(rank_names[i]) + 1) * sizeof(char)); + if (((tax->ranks)->label)[(tax->ranks)->count] == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for taxon rank names"); + obi_close_taxonomy(tax); + free(parent_taxids); + free(rank_names); + return NULL; + } + strcpy(((tax->ranks)->label)[(tax->ranks)->count], rank_names[i]); + ((tax->ranks)->count)++; + } + } + + // Truncate to the number of ranks recorded + tax->ranks = (ecorankidx_t*) realloc(tax->ranks, sizeof(ecorankidx_t) + sizeof(char*) * (tax->ranks)->count); + if (tax->ranks == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating memory for taxon ranks"); + obi_close_taxonomy(tax); + free(parent_taxids); + free(rank_names); + return NULL; + } + + // Sort in alphabetical order + qsort((tax->ranks)->label, (tax->ranks)->count, sizeof(char*), cmp_str); + + // Associate the taxa with their rank indices + for (i=0; i < (tax->taxa)->count; i++) + { + for (j=0; j < (tax->ranks)->count; j++) + { + if (strcmp(rank_names[i], ((tax->ranks)->label)[j]) == 0) + { + ((tax->taxa)->taxon)[i].rank = j; + break; + } + } + } + + // Associate the taxa with their scientific name + for (i=0; i < (tax->names)->count; i++) + { + if ((tax->names)->names[i].is_scientific_name) + { + ((tax->names)->names[i].taxon)->name = (char*) malloc((strlen((((tax->names)->names)[i]).name) + 1) * sizeof(char)); + if (((tax->names)->names[i].taxon)->name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating memory for taxon ranks"); + obi_close_taxonomy(tax); + free(parent_taxids); + free(rank_names); + return NULL; + } + strcpy(((tax->names)->names[i].taxon)->name, (((tax->names)->names)[i]).name); + } + } + + // Sort names in alphabetical order + qsort((tax->names)->names, (tax->names)->count, sizeof(econame_t), cmp_names); + + // Associate the taxa with their parent + for (i=0; i < (tax->taxa)->count; i++) + { + ((tax->taxa)->taxon)[i].parent = obi_taxo_get_taxon_with_taxid(tax, parent_taxids[i]); + if (((tax->taxa)->taxon)[i].parent == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError: taxon parent not found"); + obi_close_taxonomy(tax); + free(parent_taxids); + free(rank_names); + return NULL; + } + (((tax->taxa)->taxon)[i].parent)->farest = 0; + } + + // TODO what is this for??? + (tax->taxa)->buffer_size = (tax->taxa)->count; + + // Compute longest branches TODO what is this for??? + for (i=0; i < (tax->taxa)->count; i++) + { + t = (((tax->taxa))->taxon)+i; + if (t->farest == -1) + { + t->farest=0; + while (t->parent != t) + { + j = t->farest + 1; + if (j > t->parent->farest) + { + t->parent->farest = j; + t=t->parent; + } + else + t = (tax->taxa)->taxon; + } + } + } + + // Freeing + free(parent_taxids); + for (i=0; i < (tax->taxa)->count; i++) + free(rank_names[i]); + free(rank_names); + + return tax; +} + + + + diff --git a/src/obidms_taxonomy.h b/src/obidms_taxonomy.h index d0902f8..9331a17 100644 --- a/src/obidms_taxonomy.h +++ b/src/obidms_taxonomy.h @@ -17,10 +17,6 @@ #include "obidms.h" -#define SWAPINT32(x) ((((x) << 24) & 0xFF000000) | (((x) << 8) & 0xFF0000) | \ - (((x) >> 8) & 0xFF00) | (((x) >> 24) & 0xFF)) - - typedef struct { int32_t taxid; int32_t rank; @@ -72,8 +68,8 @@ typedef struct { typedef struct { - int32_t count; - econame_t names[1]; + int32_t count; + econame_t names[1]; } econameidx_t; @@ -109,4 +105,7 @@ int write_rankidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name int write_taxonomyidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name); int write_nameidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name); +int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name); + +OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump);