From d68374018bf43eedee8cb3e1500a350b6f308694 Mon Sep 17 00:00:00 2001
From: Celine Mercier
Date: Fri, 6 Jan 2017 15:52:21 +0100
Subject: [PATCH] Taxonomy: functions to read the *.adx file (containing the
 deprecated and current taxids and their corresponding indices in the taxa
 structure) and to find the taxa using the merged index.

---
Review notes: read_mergedidx() now checks the pointer returned by
read_ecorecord() and the record length before calling memcpy(), closes the
file on every error path, and reports errors about the merged index file
instead of the name file. The blob ids of the index lines were not
regenerated.

 src/obidms_taxonomy.c | 156 +++++++++++++++++++++++++++++++++++++++---
 src/obidms_taxonomy.h |   1 +
 2 files changed, 148 insertions(+), 9 deletions(-)

diff --git a/src/obidms_taxonomy.c b/src/obidms_taxonomy.c
index bf9d773..2779699 100644
--- a/src/obidms_taxonomy.c
+++ b/src/obidms_taxonomy.c
@@ -35,7 +35,7 @@ int cmp_rank_labels(const void* label1, const void* label2)
 }
 
 
-static int cmp_taxids(const void* ptaxid, const void* ptaxon)
+static int cmp_taxids_in_ecotx_t(const void* ptaxid, const void* ptaxon)
 {
 	ecotx_t* current_taxon = (ecotx_t*) ptaxon;
 	int32_t taxid = (int32_t) ((size_t) ptaxid);
@@ -43,6 +43,14 @@ static int cmp_taxids(const void* ptaxid, const void* ptaxon)
 }
 
 
+static int cmp_taxids_in_ecomerged_t(const void* ptaxid, const void* ptaxon)
+{
+	ecomerged_t* current_taxon = (ecomerged_t*) ptaxon;
+	int32_t taxid = (int32_t) ((size_t) ptaxid);
+	return taxid - current_taxon->taxid;
+}
+
+
 static int cmp_str(const void* s1, const void* s2)
 {
 	return strcmp(*((char**)s1), *((char**)s2));
@@ -467,6 +475,71 @@ econameidx_t* read_nameidx(const char *file_name, OBIDMS_taxonomy_p taxonomy)
 
 
 
+/**
+ * Reads a merged index file (*.adx) and loads it in an ecomergedidx_t structure.
+ *
+ * The records are copied one by one in the 'merged' array of the structure,
+ * and 'count' is set to the number of records reported by open_ecorecorddb().
+ *
+ * @param file_name The path of the binary merged index file to read.
+ * @param taxonomy Not used by this function.
+ *
+ * @returns The newly allocated merged index (to release with free()),
+ *          or NULL if an error occurred. The file is closed in both cases.
+ */
+ecomergedidx_t* read_mergedidx(const char *file_name, OBIDMS_taxonomy_p taxonomy)
+{
+	int32_t count;
+	FILE* f;
+	ecomergedidx_t* index_merged_idx;
+	ecomerged_t* merged_idx;
+	int32_t i;
+	int32_t record_length;
+
+	f = open_ecorecorddb(file_name, &count, 0);
+	if (f == NULL)
+	{
+		obidebug(1, "\nError reading taxonomy merged index file");
+		return NULL;
+	}
+
+	index_merged_idx = (ecomergedidx_t*) malloc(sizeof(ecomergedidx_t) + (sizeof(ecomerged_t) * count));
+	if (index_merged_idx == NULL)
+	{
+		obi_set_errno(OBI_MALLOC_ERROR);
+		obidebug(1, "\nError reading taxonomy merged index file");
+		fclose(f);
+		return NULL;
+	}
+
+	index_merged_idx->count = count;
+
+	for (i=0; i < count; i++)
+	{
+		merged_idx = read_ecorecord(f, &record_length);
+		// Check the record before copying it: the source pointer must be valid and
+		// the length read from the file must fit in one element of the array
+		if ((merged_idx == NULL) || (record_length < 0) || (((size_t) record_length) > sizeof(ecomerged_t)))
+		{
+			obi_set_errno(OBI_TAXONOMY_ERROR);
+			obidebug(1, "\nError reading taxonomy merged index file");
+			free(index_merged_idx);
+			fclose(f);
+			return NULL;
+		}
+		memcpy((index_merged_idx->merged)+i, merged_idx, record_length);
+	}
+
+	fclose(f);
+
+	return index_merged_idx;
+}
+
+
+
+
+
+
 // Functions to write taxonomy structure to binary files
 
 
@@ -1003,6 +1076,7 @@ int write_mergedidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_na
 	int file_descriptor;
 	off_t file_size;
 	char* taxonomy_path;
+	int32_t record_size;
 
 	// Compute file size
 	file_size = sizeof(int32_t) + (sizeof(int32_t) * 3 * (tax->merged_idx)->count);
@@ -1058,9 +1132,20 @@ int write_mergedidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_na
 		return -1;
 	}
 
+	record_size = 2 * sizeof(int32_t);
+
 	// Write merged indices
 	for (i=0; i < (tax->merged_idx)->count; i++)
 	{
+		// Write record size
+		if (write(file_descriptor, &(record_size), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
+		{
+			obi_set_errno(OBI_TAXONOMY_ERROR);
+			obidebug(1, "\nError writing in a binary taxonomy file");
+			close(file_descriptor);
+			return -1;
+		}
+
 		// Write taxid
 		if (write(file_descriptor, &(((tax->merged_idx)->merged)[i].taxid), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
 		{
@@ -1566,7 +1651,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
 		// Store the deprecated taxid with the index that refers to the new taxid
 
 		// Find the index of the new taxid
-		t = obi_taxo_get_taxon_with_taxid(tax, taxid);
+		t = obi_taxo_get_taxon_with_current_taxid(tax, taxid);
 		// Store the old taxid with the index
 		(tax->merged_idx)->merged[n].taxid = old_taxid;
 		(tax->merged_idx)->merged[n].idx = t->idx;
@@ -1966,7 +2051,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
 	// Associate the taxa with their parent
 	for (i=0; i < (tax->taxa)->count; i++)
 	{
-		((tax->taxa)->taxon)[i].parent = obi_taxo_get_taxon_with_taxid(tax, parent_taxids[i]);
+		((tax->taxa)->taxon)[i].parent = obi_taxo_get_taxon_with_current_taxid(tax, parent_taxids[i]);
 		if (((tax->taxa)->taxon)[i].parent == NULL)
 		{
 			obi_set_errno(OBI_TAXONOMY_ERROR);
@@ -2129,6 +2214,7 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo
 	char* taxonomy_path;
 	char* ranks_file_name;
 	char* taxa_file_name;
+	char* merged_idx_file_name;
 	char* local_taxa_file_name;
 	char* alter_names_file_name;
 	int buffer_size;
@@ -2238,6 +2324,35 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo
 	free(taxa_file_name);
 	free(local_taxa_file_name);
 
+	// Read merged index (old and current taxids referring to indices in the taxa structure)
+	merged_idx_file_name = (char*) malloc(buffer_size*sizeof(char));
+	if (merged_idx_file_name == NULL)
+	{
+		obi_set_errno(OBI_MALLOC_ERROR);
+		obidebug(1, "\nError allocating memory for merged index file name");
+		free(taxonomy_path);
+		obi_close_taxonomy(tax);
+		return NULL;
+	}
+	if (snprintf(merged_idx_file_name, buffer_size, "%s/%s.adx", taxonomy_path, taxonomy_name) < 0)
+	{
+		obi_set_errno(OBI_TAXONOMY_ERROR);
+		obidebug(1, "\nError building merged index file name");
+		free(taxonomy_path);
+		free(merged_idx_file_name);
+		obi_close_taxonomy(tax);
+		return NULL;
+	}
+	tax->merged_idx = read_mergedidx(merged_idx_file_name, tax);
+	if (tax->merged_idx == NULL)
+	{
+		free(taxonomy_path);
+		free(merged_idx_file_name);
+		obi_close_taxonomy(tax);
+		return NULL;
+	}
+	free(merged_idx_file_name);
+
 	// Read alternative names
 	if (read_alternative_names)
 	{
@@ -2363,10 +2478,10 @@ ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx)
 }
 
 
-ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid)
+ecotx_t* obi_taxo_get_taxon_with_current_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid) // TODO discuss keeping private?
 {
-	ecotx_t *current_taxon;
-	int32_t count;
+	ecotx_t *current_taxon;
+	int32_t count;
 
 	count = (taxonomy->taxa)->count;
 
@@ -2374,12 +2489,35 @@ ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid
 			(const void *) taxonomy->taxa->taxon,
 			count,
 			sizeof(ecotx_t),
-			cmp_taxids);
+			cmp_taxids_in_ecotx_t);
 	return current_taxon;
 }
 
 
-bool obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid)
+ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid)
+{
+	ecotx_t *current_taxon;
+	ecomerged_t *indexed_taxon;
+	int32_t count;
+
+	count = (taxonomy->merged_idx)->count;
+
+	indexed_taxon = (ecomerged_t*) bsearch((const void *) ((size_t) taxid),
+			(const void *) taxonomy->merged_idx->merged,
+			count,
+			sizeof(ecomerged_t),
+			cmp_taxids_in_ecomerged_t);
+
+	if (indexed_taxon == NULL)
+		current_taxon = NULL;
+	else
+		current_taxon = (taxonomy->taxa->taxon)+(indexed_taxon->idx);
+
+	return current_taxon;
+}
+
+
+bool obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid) // TODO discuss that this doesn't work with deprecated taxids
 {
 	ecotx_t* next_parent;
 
@@ -2486,7 +2624,7 @@ ecotx_t* obi_taxo_get_kingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
 ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
 {
 	static OBIDMS_taxonomy_p tax = NULL;
-	static int32_t rankindex = -1;
+	static int32_t rankindex = -1;
 
 	if (taxonomy && (tax != taxonomy))
 	{
diff --git a/src/obidms_taxonomy.h b/src/obidms_taxonomy.h
index 48d0293..33d8aba 100644
--- a/src/obidms_taxonomy.h
+++ b/src/obidms_taxonomy.h
@@ -108,6 +108,7 @@ int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy);
 
 ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx);
 
+ecotx_t* obi_taxo_get_taxon_with_current_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid);
 ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid);
 
 bool obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid);