From 0385a92e02c8c66d2d45fd880ac2c7a8f25071f2 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Wed, 11 Jan 2017 16:36:08 +0100 Subject: [PATCH] Taxonomy: Refactored the taxdump reading, and little fixes --- src/obidms_taxonomy.c | 769 ++++++++++++++++++++++-------------------- 1 file changed, 410 insertions(+), 359 deletions(-) diff --git a/src/obidms_taxonomy.c b/src/obidms_taxonomy.c index c70f788..6295be8 100644 --- a/src/obidms_taxonomy.c +++ b/src/obidms_taxonomy.c @@ -259,7 +259,7 @@ ecorankidx_t* read_rankidx(const char* ranks_file_name) if (ranks_file==NULL) return NULL; - ranks_index = (ecorankidx_t*) malloc(sizeof(ecorankidx_t) + sizeof(char*) * (count-1)); + ranks_index = (ecorankidx_t*) malloc(sizeof(ecorankidx_t) + sizeof(char*) * count); if (ranks_index == NULL) { obi_set_errno(OBI_MALLOC_ERROR); @@ -321,7 +321,7 @@ ecotxidx_t* read_taxonomyidx(const char* taxa_file_name, const char* local_taxa_ f_local_taxa = open_ecorecorddb(local_taxa_file_name, &count_local_taxa, 0); - taxa_index = (ecotxidx_t*) malloc(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count_taxa + count_local_taxa - 1)); + taxa_index = (ecotxidx_t*) malloc(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count_taxa + count_local_taxa)); if (taxa_index == NULL) { obi_set_errno(OBI_MALLOC_ERROR); @@ -445,7 +445,7 @@ econameidx_t* read_nameidx(const char *file_name, OBIDMS_taxonomy_p taxonomy) return NULL; } - index_names = (econameidx_t*) malloc(sizeof(econameidx_t) + sizeof(econame_t) * (count-1)); + index_names = (econameidx_t*) malloc(sizeof(econameidx_t) + sizeof(econame_t) * count); if (index_names == NULL) { obi_set_errno(OBI_MALLOC_ERROR); @@ -548,7 +548,7 @@ int write_rankidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name // Build the taxonomy directory path taxonomy_path = get_taxonomy_path(dms, taxonomy_name); - file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 5)*sizeof(char)); + file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 6)*sizeof(char)); if (file_name == NULL) { obi_set_errno(OBI_MALLOC_ERROR); @@ -622,8 +622,8 @@ int write_rankidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name // Close file if (close(file_descriptor) < 0) { - obi_set_errno(OBIDMS_UNKNOWN_ERROR); - obidebug(1, "\nError closing a DMS information file"); + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError closing an rdx taxonomy file"); return -1; } @@ -652,7 +652,7 @@ int write_taxonomyidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_ // Build the taxonomy directory path taxonomy_path = get_taxonomy_path(dms, taxonomy_name); - file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 5)*sizeof(char)); + file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 6)*sizeof(char)); if (file_name == NULL) { obi_set_errno(OBI_MALLOC_ERROR); @@ -759,8 +759,8 @@ int write_taxonomyidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_ // Close file if (close(file_descriptor) < 0) { - obi_set_errno(OBIDMS_UNKNOWN_ERROR); - obidebug(1, "\nError closing a DMS information file"); + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError closing a tdx taxonomy file"); return -1; } @@ -789,7 +789,7 @@ int write_local_taxonomy_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* ta // Build the taxonomy directory path taxonomy_path = get_taxonomy_path(dms, taxonomy_name); - file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 5)*sizeof(char)); + file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 6)*sizeof(char)); if (file_name == NULL) { obi_set_errno(OBI_MALLOC_ERROR); @@ -896,8 +896,8 @@ int write_local_taxonomy_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* ta // Close file if (close(file_descriptor) < 0) { - obi_set_errno(OBIDMS_UNKNOWN_ERROR); - obidebug(1, "\nError closing a DMS information file"); + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError closing a ldx taxonomy file"); return -1; } @@ -928,7 +928,7 @@ int write_nameidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name // Build the taxonomy directory path taxonomy_path = get_taxonomy_path(dms, taxonomy_name); - file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 5)*sizeof(char)); + file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 6)*sizeof(char)); if (file_name == NULL) { obi_set_errno(OBI_MALLOC_ERROR); @@ -1045,7 +1045,7 @@ int write_nameidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name if (close(file_descriptor) < 0) { obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nError closing a DMS information file"); + obidebug(1, "\nError closing a ndx taxonomy file"); return -1; } @@ -1153,7 +1153,7 @@ int write_mergedidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_na if (close(file_descriptor) < 0) { obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nError closing a taxonomy file file"); + obidebug(1, "\nError closing an adx taxonomy file"); return -1; } @@ -1199,64 +1199,56 @@ int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name } -OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) -{ - OBIDMS_taxonomy_p tax; - struct dirent* dp; - DIR* tax_dir; - FILE* file; - bool nodes_found=false; - bool names_found=false; - bool merged_found=false; - bool delnodes_found=false; - int32_t* delnodes=NULL; - int32_t delnodes_count; - char line[2048]; // TODO large enough? - char* elt; - char* file_name; - int buffer_size; - int i, j; - int n, nD, nT; - char** rank_names; - int* parent_taxids; - int taxid, old_taxid; - bool already_in; - ecotx_t* t; - // Initialize taxonomy structure - tax = (OBIDMS_taxonomy_p) malloc(sizeof(OBIDMS_taxonomy_t)); - if (tax == NULL) +int read_nodes_dmp(const char* taxdump, OBIDMS_taxonomy_p tax, char*** rank_names_p, int** parent_taxids_p) +{ + struct dirent* dp; + DIR* tax_dir; + FILE* file; + char* file_name; + bool file_found=false; + char line[2048]; // TODO large enough? + char* elt; + int buffer_size; + int i, n; + + buffer_size = 10000; + + // Initialize rank names and parent taxids arrays + *parent_taxids_p = malloc(buffer_size * sizeof(int)); + if (*parent_taxids_p == NULL) { obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating the memory for a taxonomy structure"); - return NULL; + obidebug(1, "\nError allocating the memory for the parent taxids array"); + return -1; } - tax->ranks = NULL; - tax->taxa = NULL; - tax->names = NULL; - tax->merged_idx = NULL; - tax->dms = NULL; - (tax->tax_name)[0] = '\0'; - - // TODO check if taxdump path is for a gz file to unzip or a directory + *rank_names_p = malloc(buffer_size * sizeof(char*)); + if (*rank_names_p == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating the memory for the rank names array"); + free(*parent_taxids_p); + return -1; + } + // Open the taxdum directory tax_dir = opendir(taxdump); if (tax_dir == NULL) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nProblem opening a taxdump directory"); - free(tax); - return NULL; + free(*parent_taxids_p); + free(*rank_names_p); + return -1; } - // Go through taxonomy files + // Look for the 'nodes.dmp' file while ((dp = readdir(tax_dir)) != NULL) { if (strcmp(dp->d_name, "nodes.dmp") == 0) { - nodes_found = true; - buffer_size = 10000; + file_found = true; // Initializing the taxa structure tax->taxa = (ecotxidx_t*) malloc(sizeof(ecotxidx_t) + sizeof(ecotx_t) * buffer_size); @@ -1264,57 +1256,34 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError allocating the memory for a taxonomy structure"); - free(tax); + free(*parent_taxids_p); + free(*rank_names_p); closedir(tax_dir); - return NULL; - } - - // Initialize rank names and parent taxids arrays - parent_taxids = malloc(buffer_size * sizeof(int)); - if (file_name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating the memory for a file name"); - obi_close_taxonomy(tax); - closedir(tax_dir); - return NULL; - } - - rank_names = malloc(buffer_size * sizeof(char*)); - if (file_name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating the memory for a file name"); - obi_close_taxonomy(tax); - free(parent_taxids); - closedir(tax_dir); - return NULL; + return -1; } // Allocating the memory for the file name - file_name = (char*) malloc((strlen(taxdump) + 10)*sizeof(char)); + file_name = (char*) malloc((strlen(taxdump) + 11)*sizeof(char)); if (file_name == NULL) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError allocating the memory for a file name"); - obi_close_taxonomy(tax); - free(parent_taxids); - free(rank_names); + free(*parent_taxids_p); + free(*rank_names_p); closedir(tax_dir); - return NULL; + return -1; } // Build the file path if (sprintf(file_name, "%s/nodes.dmp", taxdump) < 0) { obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nError building a taxonomy file name"); - obi_close_taxonomy(tax); + obidebug(1, "\nError building a taxonomy file name for 'nodes.dmp'"); + free(*parent_taxids_p); + free(*rank_names_p); closedir(tax_dir); - free(parent_taxids); - free(rank_names); free(file_name); - return NULL; + return -1; } file = fopen(file_name, "r"); @@ -1322,12 +1291,11 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nProblem opening a taxonomy file"); - obi_close_taxonomy(tax); + free(*parent_taxids_p); + free(*rank_names_p); closedir(tax_dir); - free(parent_taxids); - free(rank_names); free(file_name); - return NULL; + return -1; } free(file_name); @@ -1346,38 +1314,35 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError reallocating memory for a taxonomy structure"); - obi_close_taxonomy(tax); + free(*parent_taxids_p); + free(*rank_names_p); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } - parent_taxids = (int*) realloc(parent_taxids, sizeof(int) * buffer_size); - if (parent_taxids == NULL) + *parent_taxids_p = (int*) realloc(*parent_taxids_p, sizeof(int) * buffer_size); + if (*parent_taxids_p == NULL) { obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError reallocating memory for a taxonomy structure"); - obi_close_taxonomy(tax); + obidebug(1, "\nError reallocating memory for the parent taxids array"); + free(*parent_taxids_p); + free(*rank_names_p); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } - rank_names = (char**) realloc(rank_names, sizeof(char*) * buffer_size); - if (rank_names == NULL) + *rank_names_p = (char**) realloc(*rank_names_p, sizeof(char*) * buffer_size); + if (*rank_names_p == NULL) { obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError reallocating memory for a taxonomy structure"); - obi_close_taxonomy(tax); + obidebug(1, "\nError reallocating memory for the rank names array"); + free(*parent_taxids_p); + free(*rank_names_p); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } } @@ -1386,12 +1351,11 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError: line buffer size not large enough for line in taxonomy file"); - obi_close_taxonomy(tax); + free(*parent_taxids_p); + free(*rank_names_p); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } (tax->taxa)->taxon[n].idx = n; @@ -1423,22 +1387,21 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) elt[strlen(elt)-1] = '\0'; if (i == 1) - parent_taxids[n] = atoi(elt); + (*parent_taxids_p)[n] = atoi(elt); else if (i == 2) { - rank_names[n] = (char*) malloc((strlen(elt)+1) * sizeof(char)); - if (rank_names[n] == NULL) + (*rank_names_p)[n] = (char*) malloc((strlen(elt)+1) * sizeof(char)); + if ((*rank_names_p)[n] == NULL) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError allocating memory for taxon rank name"); - obi_close_taxonomy(tax); + free(*parent_taxids_p); + free(*rank_names_p); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } - strcpy(rank_names[n], elt); + strcpy((*rank_names_p)[n], elt); } i++; } @@ -1450,12 +1413,11 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError: file reading was stopped before end of file"); - obi_close_taxonomy(tax); + free(*parent_taxids_p); + free(*rank_names_p); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } // Store count @@ -1469,30 +1431,79 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError reallocating memory for taxonomy structure"); - obi_close_taxonomy(tax); + free(*parent_taxids_p); + free(*rank_names_p); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } - fclose(file); + if (fclose(file) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError closing a taxdump file"); + free(*parent_taxids_p); + free(*rank_names_p); + closedir(tax_dir); + return -1; + } } } - closedir(tax_dir); + if (closedir(tax_dir) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError closing a taxdump directory"); + free(*parent_taxids_p); + free(*rank_names_p); + closedir(tax_dir); + return -1; + } + + if ( ! file_found) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError: could not find 'nodes.dmp' file in taxdump directory"); + free(*parent_taxids_p); + free(*rank_names_p); + return -1; + } + + return 0; +} - // Go through directory again for next file // TODO make separate functions? +int read_delnodes_dmp(const char* taxdump, OBIDMS_taxonomy_p tax, int32_t** delnodes_p, int32_t* delnodes_count) +{ + struct dirent* dp; + DIR* tax_dir; + FILE* file; + char* file_name; + bool file_found=false; + char line[2048]; // TODO large enough? + char* elt; + int buffer_size; + int n; + int old_taxid; + + buffer_size = 10000; + + // Initializing the list of deleted nodes + *delnodes_p = (int32_t*) malloc(sizeof(int32_t) * buffer_size); + if (*delnodes_p == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating the memory for the deleted taxids array"); + return -1; + } + tax_dir = opendir(taxdump); if (tax_dir == NULL) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nProblem opening a taxdump directory"); - obi_close_taxonomy(tax); - free(parent_taxids); - free(rank_names); - return NULL; + closedir(tax_dir); + free(*delnodes_p); + return -1; } // Go through taxonomy files @@ -1500,34 +1511,17 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { if (strcmp(dp->d_name, "delnodes.dmp") == 0) { - delnodes_found = true; - buffer_size = 10000; - - // Initializing the list of deleted nodes - delnodes = (int32_t*) malloc(sizeof(int32_t) * buffer_size); - if (delnodes == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating the memory for a taxonomy structure"); - obi_close_taxonomy(tax); - free(parent_taxids); - free(rank_names); - closedir(tax_dir); - return NULL; - } + file_found = true; // Allocating the memory for the file name - file_name = (char*) malloc((strlen(taxdump) + 12)*sizeof(char)); + file_name = (char*) malloc((strlen(taxdump) + 14)*sizeof(char)); if (file_name == NULL) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError allocating the memory for a file name"); - obi_close_taxonomy(tax); - free(parent_taxids); - free(rank_names); - free(delnodes); closedir(tax_dir); - return NULL; + free(*delnodes_p); + return -1; } // Build the file path @@ -1535,13 +1529,10 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError building a taxonomy file name"); - obi_close_taxonomy(tax); closedir(tax_dir); - free(parent_taxids); - free(rank_names); + free(*delnodes_p); free(file_name); - free(delnodes); - return NULL; + return -1; } file = fopen(file_name, "r"); @@ -1549,13 +1540,10 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nProblem opening a taxonomy file"); - obi_close_taxonomy(tax); closedir(tax_dir); - free(parent_taxids); - free(rank_names); free(file_name); - free(delnodes); - return NULL; + free(*delnodes_p); + return -1; } free(file_name); @@ -1568,13 +1556,10 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError: line buffer size not large enough for line in taxonomy file"); - obi_close_taxonomy(tax); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - free(delnodes); - return NULL; + free(*delnodes_p); + return -1; } // Get first and only element of the line (the deprecated taxid) @@ -1589,21 +1574,18 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) if (n == buffer_size) { buffer_size = buffer_size * 2; - delnodes = (int32_t*) realloc(tax->merged_idx, sizeof(int32_t) * buffer_size); - if (delnodes == NULL) + (*delnodes_p) = (int32_t*) realloc(tax->merged_idx, sizeof(int32_t) * buffer_size); + if ((*delnodes_p) == NULL) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError reallocating memory for a taxonomy structure"); - obi_close_taxonomy(tax); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } } - delnodes[n] = old_taxid; + (*delnodes_p)[n] = old_taxid; n++; } @@ -1612,35 +1594,67 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError: file reading was stopped before end of file"); - obi_close_taxonomy(tax); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - free(delnodes); - return NULL; + free(*delnodes_p); + return -1; } // Store count - delnodes_count = n; + *delnodes_count = n; - fclose(file); + if (fclose(file) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError closing a taxdump file"); + free(*delnodes_p); + closedir(tax_dir); + return -1; + } } } - closedir(tax_dir); + if (closedir(tax_dir) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError closing a taxdump directory"); + free(*delnodes_p); + return -1; + } + + if ( ! file_found) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError: could not find 'delnodes.dmp' file in taxdump directory"); + free(*delnodes_p); + return -1; + } + + return 0; +} - // Go through directory again for next file // TODO make separate functions? +int read_merged_dmp(const char* taxdump, OBIDMS_taxonomy_p tax, int32_t* delnodes, int32_t delnodes_count) +{ + int n, nD, nT; + int taxid, old_taxid; + ecotx_t* t; + struct dirent* dp; + DIR* tax_dir; + FILE* file; + char* file_name; + bool file_found=false; + char line[2048]; // TODO large enough? + char* elt; + int buffer_size; + + buffer_size = 10000; + tax_dir = opendir(taxdump); if (tax_dir == NULL) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nProblem opening a taxdump directory"); - obi_close_taxonomy(tax); - free(parent_taxids); - free(rank_names); - free(delnodes); - return NULL; + return -1; } // Go through taxonomy files @@ -1648,8 +1662,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { if (strcmp(dp->d_name, "merged.dmp") == 0) { - merged_found = true; - buffer_size = 10000; + file_found = true; // Initializing the merged structure tax->merged_idx = (ecomergedidx_t*) malloc(sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size); @@ -1657,12 +1670,8 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError allocating the memory for a taxonomy structure"); - obi_close_taxonomy(tax); - free(parent_taxids); - free(rank_names); - free(delnodes); closedir(tax_dir); - return NULL; + return -1; } // Allocating the memory for the file name @@ -1671,12 +1680,8 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError allocating the memory for a file name"); - obi_close_taxonomy(tax); - free(parent_taxids); - free(rank_names); - free(delnodes); closedir(tax_dir); - return NULL; + return -1; } // Build the file path @@ -1684,13 +1689,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError building a taxonomy file name"); - obi_close_taxonomy(tax); closedir(tax_dir); - free(parent_taxids); - free(rank_names); free(file_name); - free(delnodes); - return NULL; + return -1; } file = fopen(file_name, "r"); @@ -1698,13 +1699,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nProblem opening a taxonomy file"); - obi_close_taxonomy(tax); closedir(tax_dir); - free(parent_taxids); - free(rank_names); free(file_name); - free(delnodes); - return NULL; + return -1; } free(file_name); @@ -1719,13 +1716,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError: line buffer size not large enough for line in taxonomy file"); - obi_close_taxonomy(tax); - fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - free(delnodes); - return NULL; + fclose(file); + return -1; } // Parse the 2 elements separated by '|' @@ -1762,13 +1755,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError reallocating memory for a taxonomy structure"); - obi_close_taxonomy(tax); - fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - free(delnodes); - return NULL; + fclose(file); + return -1; } } @@ -1788,13 +1777,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError reallocating memory for a taxonomy structure"); - obi_close_taxonomy(tax); - fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - free(delnodes); - return NULL; + fclose(file); + return -1; } } @@ -1815,13 +1800,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError reallocating memory for a taxonomy structure"); - obi_close_taxonomy(tax); - fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - free(delnodes); - return NULL; + fclose(file); + return -1; } } @@ -1839,12 +1820,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError: file reading was stopped before end of file"); - obi_close_taxonomy(tax); - fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + fclose(file); + return -1; } // Store count @@ -1856,34 +1834,63 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError reallocating memory for a a taxonomy structure"); - obi_close_taxonomy(tax); - fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + fclose(file); + return -1; } - fclose(file); + if (fclose(file) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError closing a taxdump file"); + closedir(tax_dir); + return -1; + } } } + if (closedir(tax_dir) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError closing a taxdump directory"); + closedir(tax_dir); + return -1; + } + + if ( ! file_found) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError: could not find 'merged.dmp' file in taxdump directory"); + return -1; + } // Free delnodes array, not needed anymore free(delnodes); - closedir(tax_dir); + return 0; +} - // Go through directory again for next file +int read_names_dmp(const char* taxdump, OBIDMS_taxonomy_p tax) +{ + int i, j, n; + int taxid; + struct dirent* dp; + DIR* tax_dir; + FILE* file; + char* file_name; + bool file_found=false; + char line[2048]; // TODO large enough? + char* elt; + int buffer_size; + + buffer_size = 10000; + tax_dir = opendir(taxdump); if (tax_dir == NULL) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nProblem opening a taxdump directory"); - obi_close_taxonomy(tax); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } // Go through taxonomy files @@ -1891,8 +1898,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { if (strcmp(dp->d_name, "names.dmp") == 0) { - names_found = true; - buffer_size = 10000; + file_found = true; // Initializing the names structure tax->names = (econameidx_t*) malloc(sizeof(econameidx_t) + sizeof(econame_t) * buffer_size); @@ -1900,11 +1906,8 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError allocating the memory for a taxonomy structure"); - obi_close_taxonomy(tax); - free(parent_taxids); - free(rank_names); closedir(tax_dir); - return NULL; + return -1; } // Allocating the memory for the file name @@ -1913,11 +1916,8 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError allocating the memory for a file name"); - obi_close_taxonomy(tax); - free(parent_taxids); - free(rank_names); closedir(tax_dir); - return NULL; + return -1; } // Build the file path @@ -1925,12 +1925,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError building a taxonomy file name"); - obi_close_taxonomy(tax); - closedir(tax_dir); - free(parent_taxids); - free(rank_names); free(file_name); - return NULL; + closedir(tax_dir); + return -1; } file = fopen(file_name, "r"); @@ -1938,12 +1935,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nProblem opening a taxonomy file"); - obi_close_taxonomy(tax); - closedir(tax_dir); - free(parent_taxids); - free(rank_names); free(file_name); - return NULL; + closedir(tax_dir); + return -1; } free(file_name); @@ -1961,12 +1955,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError reallocating memory for a taxonomy structure"); - obi_close_taxonomy(tax); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } } @@ -1975,12 +1966,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError: line buffer size not large enough for line in taxonomy file"); - obi_close_taxonomy(tax); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } // Parse 4 first elements separated by '|' @@ -2000,12 +1988,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError: could not find taxon associated to name when reading taxdump"); - obi_close_taxonomy(tax); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } j = i; // Because there are several names by taxon but they are in the same order (tax->names)->names[n].taxon = ((tax->taxa)->taxon)+i; @@ -2029,9 +2014,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) obi_close_taxonomy(tax); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } strcpy((tax->names)->names[n].name, elt); } @@ -2042,12 +2025,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError allocating memory for a taxon class name"); - obi_close_taxonomy(tax); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } strcpy((tax->names)->names[n].class_name, elt); if (strcmp(elt, "scientific name") == 0) @@ -2067,12 +2047,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError: file reading was stopped before end of file"); - obi_close_taxonomy(tax); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } // Store count @@ -2084,35 +2061,109 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { obi_set_errno(OBI_MALLOC_ERROR); obidebug(1, "\nError reallocating memory for a a taxonomy structure"); - obi_close_taxonomy(tax); fclose(file); closedir(tax_dir); - free(parent_taxids); - free(rank_names); - return NULL; + return -1; } - fclose(file); + if (fclose(file) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError closing a taxdump file"); + closedir(tax_dir); + return -1; + } } } - closedir(tax_dir); - - if (!nodes_found) + if (closedir(tax_dir) < 0) { obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nProblem reading taxdump: nodes.dmp file not found"); - obi_close_taxonomy(tax); - free(parent_taxids); - free(rank_names); + obidebug(1, "\nError closing a taxdump directory"); + closedir(tax_dir); + return -1; + } + + if ( ! file_found) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError: could not find 'merged.dmp' file in taxdump directory"); + return -1; + } + + return 0; +} + + +OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) +{ + OBIDMS_taxonomy_p tax; + char** rank_names=NULL; + int* parent_taxids=NULL; + int32_t* delnodes=NULL; + int32_t delnodes_count; + bool already_in; + ecotx_t* t; + int buffer_size; + int i, j; + + // Initialize taxonomy structure + tax = (OBIDMS_taxonomy_p) malloc(sizeof(OBIDMS_taxonomy_t)); + if (tax == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating the memory for a taxonomy structure"); return NULL; } - if (!names_found) + tax->ranks = NULL; + tax->taxa = NULL; + tax->names = NULL; + tax->merged_idx = NULL; + + tax->dms = NULL; + (tax->tax_name)[0] = '\0'; + + // TODO check if taxdump path is for a gz file to unzip or a directory + + // READ NODES.DMP + if (read_nodes_dmp(taxdump, tax, &rank_names, &parent_taxids) < 0) { obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nProblem reading taxdump: names.dmp file not found"); + obidebug(1, "\nProblem reading 'nodes.dmp'"); + obi_close_taxonomy(tax); + return NULL; + } + + // READ DELNODES.DMP + if (read_delnodes_dmp(taxdump, tax, &delnodes, &delnodes_count) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nProblem reading 'delnodes.dmp'"); obi_close_taxonomy(tax); - free(parent_taxids); free(rank_names); + free(parent_taxids); + return NULL; + } + + // READ MERGED.DMP + if (read_merged_dmp(taxdump, tax, delnodes, delnodes_count) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nProblem reading 'merged.dmp'"); + obi_close_taxonomy(tax); + free(delnodes); + free(rank_names); + free(parent_taxids); + return NULL; + } + + // READ NAMES.DMP + if (read_names_dmp(taxdump, tax) < 0) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nProblem reading 'names.dmp'"); + obi_close_taxonomy(tax); + free(rank_names); + free(parent_taxids); return NULL; } @@ -2246,7 +2297,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) (tax->taxa)->buffer_size = (tax->taxa)->count; - // Compute longest branches + // Compute longest branches (used to compute distances between taxa faster) for (i=0; i < (tax->taxa)->count; i++) { t = (((tax->taxa))->taxon)+i; @@ -2281,8 +2332,8 @@ int obi_taxonomy_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const { int32_t taxid; ecotx_t* taxon; - econame_t* name_struct; - int i; + int i; +// econame_t* name_struct; // Enlarge the structure memory for a new taxon tax->taxa = (ecotxidx_t*) realloc(tax->taxa, sizeof(ecotxidx_t) + sizeof(ecotx_t) * (((tax->taxa)->count) + 1)); @@ -2323,7 +2374,7 @@ int obi_taxonomy_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const break; } } - if (taxon->rank == -1) // TODO Discuss possibility of creating rank if doesn't exist + if (taxon->rank == -1) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError: taxon rank not found when adding a new taxon"); @@ -2336,7 +2387,7 @@ int obi_taxonomy_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const obidebug(1, "\nError: taxon parent not found when adding a new taxon"); return -1; } - taxon->farest = 0; // TODO not sure + taxon->farest = 0; // Update taxonomy counts etc (tax->taxa)->max_taxid = taxid; @@ -2344,42 +2395,42 @@ int obi_taxonomy_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const ((tax->taxa)->local_count)++; (tax->taxa)->buffer_size = (tax->taxa)->count; - // Add new name in names structure // TODO discuss because in OBITools1 the new names were not written in .ndx - // Allocate memory for new name - tax->names = (econameidx_t*) realloc(tax->names, sizeof(econameidx_t) + sizeof(econame_t) * ((tax->names)->count + 1)); - if (tax->names == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError reallocating memory for a taxonomy structure to add a new taxon"); - return -1; - } - - // Add new name - name_struct = (tax->names)->names + ((tax->names)->count); - name_struct->name = (char*) malloc((strlen(name) + 1) * sizeof(char)); - if (name_struct->name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for a taxon name to add a new taxon"); - return -1; - } - strcpy(name_struct->name, name); - name_struct->class_name = (char*) malloc((strlen("scientific name") + 1) * sizeof(char)); - if (name_struct->class_name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for a taxon class name to add a new taxon"); - return -1; - } - strcpy(name_struct->class_name, "scientific name"); - name_struct->is_scientific_name = true; - name_struct->taxon = ((tax->taxa)->taxon) + ((tax->taxa)->count) - 1; - - // Sort names in alphabetical order - qsort((tax->names)->names, (tax->names)->count, sizeof(econame_t), cmp_names); - - // Update name count - ((tax->names)->count)++; +// // Add new name in names structure // Commented because the new name was not added in the .ndx file in the OBITools1 +// // Allocate memory for new name +// tax->names = (econameidx_t*) realloc(tax->names, sizeof(econameidx_t) + sizeof(econame_t) * ((tax->names)->count + 1)); +// if (tax->names == NULL) +// { +// obi_set_errno(OBI_MALLOC_ERROR); +// obidebug(1, "\nError reallocating memory for a taxonomy structure to add a new taxon"); +// return -1; +// } +// +// // Add new name +// name_struct = (tax->names)->names + ((tax->names)->count); +// name_struct->name = (char*) malloc((strlen(name) + 1) * sizeof(char)); +// if (name_struct->name == NULL) +// { +// obi_set_errno(OBI_MALLOC_ERROR); +// obidebug(1, "\nError allocating memory for a taxon name to add a new taxon"); +// return -1; +// } +// strcpy(name_struct->name, name); +// name_struct->class_name = (char*) malloc((strlen("scientific name") + 1) * sizeof(char)); +// if (name_struct->class_name == NULL) +// { +// obi_set_errno(OBI_MALLOC_ERROR); +// obidebug(1, "\nError allocating memory for a taxon class name to add a new taxon"); +// return -1; +// } +// strcpy(name_struct->class_name, "scientific name"); +// name_struct->is_scientific_name = true; +// name_struct->taxon = ((tax->taxa)->taxon) + ((tax->taxa)->count) - 1; +// +// // Sort names in alphabetical order +// qsort((tax->names)->names, (tax->names)->count, sizeof(econame_t), cmp_names); +// +// // Update name count +// ((tax->names)->count)++; return taxid; } @@ -2445,7 +2496,7 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo if (tax->ranks == NULL) { obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nError building ranks file name"); + obidebug(1, "\nError reading taxonomy ranks file (check taxonomy name spelling)"); free(taxonomy_path); free(ranks_file_name); free(tax);