diff --git a/src/obidms_taxonomy.c b/src/obidms_taxonomy.c index 2779699..c70f788 100644 --- a/src/obidms_taxonomy.c +++ b/src/obidms_taxonomy.c @@ -1208,12 +1208,15 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) bool nodes_found=false; bool names_found=false; bool merged_found=false; + bool delnodes_found=false; + int32_t* delnodes=NULL; + int32_t delnodes_count = 0; /* stays 0 when delnodes.dmp is absent, so the merge index starts at -1 */ char line[2048]; // TODO large enough? char* elt; char* file_name; int buffer_size; int i, j; - int n; + int n, nD, nT; char** rank_names; int* parent_taxids; int taxid, old_taxid; @@ -1495,14 +1498,14 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) // Go through taxonomy files while ((dp = readdir(tax_dir)) != NULL) { - if (strcmp(dp->d_name, "merged.dmp") == 0) + if (strcmp(dp->d_name, "delnodes.dmp") == 0) { - merged_found = true; + delnodes_found = true; buffer_size = 10000; - // Initializing the merged structure - tax->merged_idx = (ecomergedidx_t*) malloc(sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size); - if (tax->merged_idx == NULL) + // Initializing the list of deleted nodes + delnodes = (int32_t*) malloc(sizeof(int32_t) * buffer_size); + if (delnodes == NULL) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError allocating the memory for a taxonomy structure"); @@ -1522,6 +1525,156 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) obi_close_taxonomy(tax); free(parent_taxids); free(rank_names); + free(delnodes); + closedir(tax_dir); + return NULL; + } + + // Build the file path + if (sprintf(file_name, "%s/delnodes.dmp", taxdump) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError building a taxonomy file name"); + obi_close_taxonomy(tax); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + free(file_name); + free(delnodes); + return NULL; + } + + file = fopen(file_name, "r"); + if (file == NULL) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nProblem opening a taxonomy file"); + obi_close_taxonomy(tax); +
closedir(tax_dir); + free(parent_taxids); + free(rank_names); + free(file_name); + free(delnodes); + return NULL; + } + + free(file_name); + + n = 0; + while (fgets(line, sizeof(line), file)) + { + // Check for terminal '\n' character (line complete) + if (line[strlen(line) - 1] != '\n') + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError: line buffer size not large enough for line in taxonomy file"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + free(delnodes); + return NULL; + } + + // Get first and only element of the line (the deprecated taxid) + elt = strtok(line, "|"); + // Remove the last character (tab character) + elt[strlen(elt)-1] = '\0'; + // First element: old deprecated taxid + old_taxid = atoi(elt); + + // Store the old taxid in the list of deleted taxids + // Enlarge the delnodes array itself if needed (doubling its capacity) + if (n == buffer_size) + { + buffer_size = buffer_size * 2; + delnodes = (int32_t*) realloc(delnodes, sizeof(int32_t) * buffer_size); + if (delnodes == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating memory for a taxonomy structure"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + return NULL; + } + } + + delnodes[n] = old_taxid; + n++; + } + + // Check that fgets stopped because it reached EOF + if (!feof(file)) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError: file reading was stopped before end of file"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + free(delnodes); + return NULL; + } + + // Store count + delnodes_count = n; + + fclose(file); + } + } + closedir(tax_dir); + + + // Go through directory again for next file // TODO make separate functions?
+ tax_dir = opendir(taxdump); + if (tax_dir == NULL) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nProblem opening a taxdump directory"); + obi_close_taxonomy(tax); + free(parent_taxids); + free(rank_names); + free(delnodes); + return NULL; + } + + // Go through taxonomy files + while ((dp = readdir(tax_dir)) != NULL) + { + if (strcmp(dp->d_name, "merged.dmp") == 0) + { + merged_found = true; + buffer_size = 10000; + + // Initializing the merged structure + tax->merged_idx = (ecomergedidx_t*) malloc(sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size); + if (tax->merged_idx == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating the memory for a taxonomy structure"); + obi_close_taxonomy(tax); + free(parent_taxids); + free(rank_names); + free(delnodes); + closedir(tax_dir); + return NULL; + } + + // Allocating the memory for the file name + file_name = (char*) malloc((strlen(taxdump) + 12)*sizeof(char)); + if (file_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating the memory for a file name"); + obi_close_taxonomy(tax); + free(parent_taxids); + free(rank_names); + free(delnodes); closedir(tax_dir); return NULL; } @@ -1536,6 +1689,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) free(parent_taxids); free(rank_names); free(file_name); + free(delnodes); return NULL; } @@ -1549,13 +1703,15 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) free(parent_taxids); free(rank_names); free(file_name); + free(delnodes); return NULL; } free(file_name); - n = 0; - i = 0; + nT = 0; // to point in current taxa list while merging + nD = delnodes_count-1; // to point in deleted taxids list while merging (going from count-1 to 0 because taxids are sorted in descending order) + n = 0; // to point in final merged list while merging while (fgets(line, sizeof(line), file)) { // Check for terminal '\n' character (line complete) @@ -1568,6 +1724,7 @@ OBIDMS_taxonomy_p 
obi_read_taxdump(const char* taxdump) closedir(tax_dir); free(parent_taxids); free(rank_names); + free(delnodes); return NULL; } @@ -1588,34 +1745,68 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) taxid = atoi(elt); // Store the old taxid in the merged_idx ordered taxid list - // First, store the taxids from the current taxonomy that come before - while ((i < (tax->taxa)->count) && ((tax->taxa)->taxon[i].taxid < old_taxid)) + // The merged list is an ordered list of the current taxids, the deprecated taxids that have current references, + // and the deleted taxids with no current reference. An element of the list is composed of the taxid, and the index + // of the taxon in the taxa structure, or -1 for deleted taxids. + // Creating the merged list requires merging the 3 ordered lists into one: loop while EITHER the taxa list or the deleted list still has an entry below old_taxid, emitting the smaller head each time. + while (((nT < (tax->taxa)->count) && ((tax->taxa)->taxon[nT].taxid < old_taxid)) || ((nD >= 0) && (delnodes[nD] < old_taxid))) { - // Enlarge structures if needed - if (n == buffer_size) - { - buffer_size = buffer_size * 2; - tax->merged_idx = (ecomergedidx_t*) realloc(tax->merged_idx, sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size); - if (tax->merged_idx == NULL) + if ((nD < 0) || ((nT < (tax->taxa)->count) && ((tax->taxa)->taxon[nT].taxid < delnodes[nD]))) + { // Add element from taxa list (deleted list exhausted, or taxa head is the smaller one) + // Enlarge structure if needed + if (n == buffer_size) { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError reallocating memory for a taxonomy structure"); - obi_close_taxonomy(tax); - fclose(file); - closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + buffer_size = buffer_size * 2; + tax->merged_idx = (ecomergedidx_t*) realloc(tax->merged_idx, sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size); + if (tax->merged_idx == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating memory for a taxonomy structure"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + free(delnodes); +
return NULL; + } } - } - (tax->merged_idx)->merged[n].taxid = (tax->taxa)->taxon[i].taxid; - (tax->merged_idx)->merged[n].idx = i; - i++; - n++; + (tax->merged_idx)->merged[n].taxid = (tax->taxa)->taxon[nT].taxid; + (tax->merged_idx)->merged[n].idx = nT; + nT++; + n++; + } + else + { // Add element from deleted taxids list (also taken on a tie, so every iteration advances one list) + // Enlarge structure if needed + if (n == buffer_size) + { + buffer_size = buffer_size * 2; + tax->merged_idx = (ecomergedidx_t*) realloc(tax->merged_idx, sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size); + if (tax->merged_idx == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating memory for a taxonomy structure"); + obi_close_taxonomy(tax); + fclose(file); + closedir(tax_dir); + free(parent_taxids); + free(rank_names); + free(delnodes); + return NULL; + } + } + + (tax->merged_idx)->merged[n].taxid = delnodes[nD]; + (tax->merged_idx)->merged[n].idx = -1; // The index to tag deleted taxids is -1 + nD--; + n++; + } } - // Enlarge structures if needed + // Add the deprecated taxid + // Enlarge structure if needed if (n == buffer_size) { buffer_size = buffer_size * 2; @@ -1629,6 +1820,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) closedir(tax_dir); free(parent_taxids); free(rank_names); + free(delnodes); return NULL; } } @@ -1675,6 +1867,10 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) fclose(file); } } + + // Free delnodes array, not needed anymore + free(delnodes); + closedir(tax_dir); @@ -2494,6 +2690,8 @@ ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid if (indexed_taxon == NULL) current_taxon = NULL; + else if (indexed_taxon->idx == -1) + current_taxon = NULL; // TODO discuss what to do when old deleted taxon else current_taxon = (taxonomy->taxa->taxon)+(indexed_taxon->idx);