From 897032387f5159b9d8e6d19cc94a4ac8be77887c Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Thu, 5 Jan 2017 14:28:36 +0100 Subject: [PATCH] Taxonomy: reading merged.dmp file in taxdump --- src/obidms_taxonomy.c | 226 ++++++++++++++++++++++++++++++++++++++++-- src/obidms_taxonomy.h | 35 +++++-- 2 files changed, 240 insertions(+), 21 deletions(-) diff --git a/src/obidms_taxonomy.c b/src/obidms_taxonomy.c index 3eafd85..e096a8c 100644 --- a/src/obidms_taxonomy.c +++ b/src/obidms_taxonomy.c @@ -1041,6 +1041,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) FILE* file; bool nodes_found=false; bool names_found=false; + bool merged_found=false; char line[2048]; // TODO large enough? char* elt; char* file_name; @@ -1049,9 +1050,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) int n; char** rank_names; int* parent_taxids; - int taxid; + int taxid, old_taxid; bool already_in; - ecotx_t* t; + ecotx_t* t; // Initialize taxonomy structure tax = (OBIDMS_taxonomy_p) malloc(sizeof(OBIDMS_taxonomy_t)); @@ -1061,9 +1062,10 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) obidebug(1, "\nError allocating the memory for a taxonomy structure"); return NULL; } - tax->ranks = NULL; - tax->taxa = NULL; - tax->names = NULL; + tax->ranks = NULL; + tax->taxa = NULL; + tax->names = NULL; + tax->merged_idx = NULL; tax->dms = NULL; (tax->tax_name)[0] = '\0'; @@ -1312,6 +1314,204 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) closedir(tax_dir); + // Go through directory again for next file // TODO make separate functions? 
+ tax_dir = opendir(taxdump); + if (tax_dir == NULL) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nProblem opening a taxdump directory"); + obi_close_taxonomy(tax); + free(parent_taxids); + free(rank_names); + return NULL; + } + + // Go through taxonomy files + while ((dp = readdir(tax_dir)) != NULL) + { + if (strcmp(dp->d_name, "merged.dmp") == 0) + { + merged_found = true; // TODO + buffer_size = 10000; + + // Initializing the merged structure + tax->merged_idx = (ecomergedidx_t*) malloc(sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size); + if (tax->merged_idx == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating the memory for a taxonomy structure"); + obi_close_taxonomy(tax); + free(parent_taxids); + free(rank_names); + closedir(tax_dir); + return NULL; + } + + // Allocating the memory for the file name + file_name = (char*) malloc((strlen(taxdump) + 12)*sizeof(char)); + if (file_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating the memory for a file name"); + obi_close_taxonomy(tax); + free(parent_taxids); + free(rank_names); + closedir(tax_dir); + return NULL; + } + + // Build the file path + if (sprintf(file_name, "%s/merged.dmp", taxdump) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError building a taxonomy file name"); + obi_close_taxonomy(tax); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + free(file_name); + return NULL; + } + + file = fopen(file_name, "r"); + if (file == NULL) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nProblem opening a taxonomy file"); + obi_close_taxonomy(tax); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + free(file_name); + return NULL; + } + + free(file_name); + + n = 0; + i = 0; + while (fgets(line, sizeof(line), file)) + { + // Check for terminal '\n' character (line complete) + if (line[strlen(line) - 1] != '\n') + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, 
"\nError: line buffer size not large enough for line in taxonomy file"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + return NULL; + } + + // Parse the 2 elements separated by '|' + + // Get first element + elt = strtok(line, "|"); + // Remove the last character (tab character) + elt[strlen(elt)-1] = '\0'; + // First element: old deprecated taxid + old_taxid = atoi(elt); + + // Get 2nd element: new taxid + elt = strtok(NULL, "|"); + // Remove the first and the last characters (tab characters) + elt = elt+1; + elt[strlen(elt)-1] = '\0'; + taxid = atoi(elt); + + // Store the old taxid in the merged_idx ordered taxid list + // First, store the taxids from the current taxonomy that come before + while ((i < (tax->taxa)->count) && ((tax->taxa)->taxon[i].taxid < old_taxid)) + { + // Enlarge structures if needed + if (n == buffer_size) + { + buffer_size = buffer_size * 2; + tax->merged_idx = (ecomergedidx_t*) realloc(tax->merged_idx, sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size); + if (tax->merged_idx == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating memory for a taxonomy structure"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + return NULL; + } + } + + (tax->merged_idx)->merged[n].taxid = (tax->taxa)->taxon[i].taxid; + (tax->merged_idx)->merged[n].idx = i; + i++; + n++; + } + + // Enlarge structures if needed + if (n == buffer_size) + { + buffer_size = buffer_size * 2; + tax->merged_idx = (ecomergedidx_t*) realloc(tax->merged_idx, sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size); + if (tax->merged_idx == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating memory for a taxonomy structure"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + return NULL; + } + } + + // Store the deprecated taxid with the 
index that refers to the new taxid + // Find the index of the new taxid + t = obi_taxo_get_taxon_with_taxid(tax, taxid); + // Store the old taxid with the index + (tax->merged_idx)->merged[n].taxid = old_taxid; + (tax->merged_idx)->merged[n].idx = t->idx; + n++; + } + + // Check that fgets stopped because it reached EOF + if (!feof(file)) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError: file reading was stopped before end of file"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + return NULL; + } + + // Store count + (tax->merged_idx)->count = n; + + // Truncate the structure memory to the right size + tax->merged_idx = (ecomergedidx_t*) realloc(tax->merged_idx, sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * (tax->merged_idx)->count); + if (tax->merged_idx == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating memory for a a taxonomy structure"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + return NULL; + } + + fclose(file); + } + } + closedir(tax_dir); + + // Go through directory again for next file tax_dir = opendir(taxdump); if (tax_dir == NULL) @@ -1346,7 +1546,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) } // Allocating the memory for the file name - file_name = (char*) malloc((strlen(taxdump) + 10)*sizeof(char)); + file_name = (char*) malloc((strlen(taxdump) + 11)*sizeof(char)); if (file_name == NULL) { obi_set_errno(OBI_MALLOC_ERROR); @@ -1684,7 +1884,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) (tax->taxa)->buffer_size = (tax->taxa)->count; - // Compute longest branches TODO what is this for??? 
+ // Compute longest branches for (i=0; i < (tax->taxa)->count; i++) { t = (((tax->taxa))->taxon)+i; @@ -1844,9 +2044,10 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo return NULL; } - tax->ranks = NULL; - tax->taxa = NULL; - tax->names = NULL; + tax->ranks = NULL; + tax->taxa = NULL; + tax->names = NULL; + tax->merged_idx = NULL; tax->dms = dms; @@ -2028,6 +2229,11 @@ int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy) free(taxonomy->taxa); } + if (taxonomy->merged_idx) + { + free(taxonomy->merged_idx); + } + free(taxonomy); } diff --git a/src/obidms_taxonomy.h b/src/obidms_taxonomy.h index e70e892..543a257 100644 --- a/src/obidms_taxonomy.h +++ b/src/obidms_taxonomy.h @@ -26,12 +26,12 @@ typedef struct { int32_t rank; int32_t parent; int32_t name_length; - char name[1]; + char name[]; } ecotxformat_t; typedef struct ecotxnode { - int32_t taxid; + int32_t taxid; // TODO discuss: this will be the current taxid even if the struct was accessed through a deprecated one int32_t rank; int32_t farest; int32_t idx; @@ -47,13 +47,13 @@ typedef struct { int32_t local_count; int32_t max_taxid; int32_t buffer_size; - ecotx_t taxon[1]; + ecotx_t taxon[]; } ecotxidx_t; typedef struct { int32_t count; - char* label[1]; + char* label[]; } ecorankidx_t; @@ -62,7 +62,7 @@ typedef struct { int32_t name_length; int32_t class_length; int32_t taxid; // taxid idx - char names[1]; + char names[]; } econameformat_t; @@ -76,16 +76,29 @@ typedef struct { typedef struct { int32_t count; - econame_t names[1]; + econame_t names[]; } econameidx_t; +typedef struct { + int32_t taxid; + int32_t idx; +} ecomerged_t; + + +typedef struct { + int32_t count; + ecomerged_t merged[]; +} ecomergedidx_t; + + typedef struct OBIDMS_taxonomy_t { - char tax_name[TAX_NAME_LEN]; - OBIDMS_p dms; - ecorankidx_t* ranks; - econameidx_t* names; - ecotxidx_t* taxa; + char tax_name[TAX_NAME_LEN]; + OBIDMS_p dms; + ecomergedidx_t* merged_idx; + ecorankidx_t* ranks; + 
econameidx_t* names; + ecotxidx_t* taxa; } OBIDMS_taxonomy_t, *OBIDMS_taxonomy_p;