Taxonomy: information about deleted taxids is now read from the
delnodes.dmp file and added to the *.adx file
This commit is contained in:
@ -1208,12 +1208,15 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
|
||||
bool nodes_found=false;
|
||||
bool names_found=false;
|
||||
bool merged_found=false;
|
||||
bool delnodes_found=false;
|
||||
int32_t* delnodes=NULL;
|
||||
int32_t delnodes_count;
|
||||
char line[2048]; // TODO large enough?
|
||||
char* elt;
|
||||
char* file_name;
|
||||
int buffer_size;
|
||||
int i, j;
|
||||
int n;
|
||||
int n, nD, nT;
|
||||
char** rank_names;
|
||||
int* parent_taxids;
|
||||
int taxid, old_taxid;
|
||||
@ -1495,14 +1498,14 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
|
||||
// Go through taxonomy files
|
||||
while ((dp = readdir(tax_dir)) != NULL)
|
||||
{
|
||||
if (strcmp(dp->d_name, "merged.dmp") == 0)
|
||||
if (strcmp(dp->d_name, "delnodes.dmp") == 0)
|
||||
{
|
||||
merged_found = true;
|
||||
delnodes_found = true;
|
||||
buffer_size = 10000;
|
||||
|
||||
// Initializing the merged structure
|
||||
tax->merged_idx = (ecomergedidx_t*) malloc(sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size);
|
||||
if (tax->merged_idx == NULL)
|
||||
// Initializing the list of deleted nodes
|
||||
delnodes = (int32_t*) malloc(sizeof(int32_t) * buffer_size);
|
||||
if (delnodes == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_MALLOC_ERROR);
|
||||
obidebug(1, "\nError allocating the memory for a taxonomy structure");
|
||||
@ -1522,6 +1525,156 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
|
||||
obi_close_taxonomy(tax);
|
||||
free(parent_taxids);
|
||||
free(rank_names);
|
||||
free(delnodes);
|
||||
closedir(tax_dir);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Build the file path
|
||||
if (sprintf(file_name, "%s/delnodes.dmp", taxdump) < 0)
|
||||
{
|
||||
obi_set_errno(OBI_TAXONOMY_ERROR);
|
||||
obidebug(1, "\nError building a taxonomy file name");
|
||||
obi_close_taxonomy(tax);
|
||||
closedir(tax_dir);
|
||||
free(parent_taxids);
|
||||
free(rank_names);
|
||||
free(file_name);
|
||||
free(delnodes);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
file = fopen(file_name, "r");
|
||||
if (file == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_TAXONOMY_ERROR);
|
||||
obidebug(1, "\nProblem opening a taxonomy file");
|
||||
obi_close_taxonomy(tax);
|
||||
closedir(tax_dir);
|
||||
free(parent_taxids);
|
||||
free(rank_names);
|
||||
free(file_name);
|
||||
free(delnodes);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
free(file_name);
|
||||
|
||||
n = 0;
|
||||
while (fgets(line, sizeof(line), file))
|
||||
{
|
||||
// Check for terminal '\n' character (line complete)
|
||||
if (line[strlen(line) - 1] != '\n')
|
||||
{
|
||||
obi_set_errno(OBI_TAXONOMY_ERROR);
|
||||
obidebug(1, "\nError: line buffer size not large enough for line in taxonomy file");
|
||||
obi_close_taxonomy(tax);
|
||||
fclose(file);
|
||||
closedir(tax_dir);
|
||||
free(parent_taxids);
|
||||
free(rank_names);
|
||||
free(delnodes);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Get first and only element of the line (the deprecated taxid)
|
||||
elt = strtok(line, "|");
|
||||
// Remove the last character (tab character)
|
||||
elt[strlen(elt)-1] = '\0';
|
||||
// First element: old deprecated taxid
|
||||
old_taxid = atoi(elt);
|
||||
|
||||
// Store the old taxid in the list of deleted taxids
|
||||
// Enlarge array if needed
|
||||
if (n == buffer_size)
|
||||
{
|
||||
buffer_size = buffer_size * 2;
|
||||
delnodes = (int32_t*) realloc(tax->merged_idx, sizeof(int32_t) * buffer_size);
|
||||
if (delnodes == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_MALLOC_ERROR);
|
||||
obidebug(1, "\nError reallocating memory for a taxonomy structure");
|
||||
obi_close_taxonomy(tax);
|
||||
fclose(file);
|
||||
closedir(tax_dir);
|
||||
free(parent_taxids);
|
||||
free(rank_names);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
delnodes[n] = old_taxid;
|
||||
n++;
|
||||
}
|
||||
|
||||
// Check that fgets stopped because it reached EOF
|
||||
if (!feof(file))
|
||||
{
|
||||
obi_set_errno(OBI_TAXONOMY_ERROR);
|
||||
obidebug(1, "\nError: file reading was stopped before end of file");
|
||||
obi_close_taxonomy(tax);
|
||||
fclose(file);
|
||||
closedir(tax_dir);
|
||||
free(parent_taxids);
|
||||
free(rank_names);
|
||||
free(delnodes);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Store count
|
||||
delnodes_count = n;
|
||||
|
||||
fclose(file);
|
||||
}
|
||||
}
|
||||
closedir(tax_dir);
|
||||
|
||||
|
||||
// Go through directory again for next file // TODO make separate functions?
|
||||
tax_dir = opendir(taxdump);
|
||||
if (tax_dir == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_TAXONOMY_ERROR);
|
||||
obidebug(1, "\nProblem opening a taxdump directory");
|
||||
obi_close_taxonomy(tax);
|
||||
free(parent_taxids);
|
||||
free(rank_names);
|
||||
free(delnodes);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Go through taxonomy files
|
||||
while ((dp = readdir(tax_dir)) != NULL)
|
||||
{
|
||||
if (strcmp(dp->d_name, "merged.dmp") == 0)
|
||||
{
|
||||
merged_found = true;
|
||||
buffer_size = 10000;
|
||||
|
||||
// Initializing the merged structure
|
||||
tax->merged_idx = (ecomergedidx_t*) malloc(sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size);
|
||||
if (tax->merged_idx == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_MALLOC_ERROR);
|
||||
obidebug(1, "\nError allocating the memory for a taxonomy structure");
|
||||
obi_close_taxonomy(tax);
|
||||
free(parent_taxids);
|
||||
free(rank_names);
|
||||
free(delnodes);
|
||||
closedir(tax_dir);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Allocating the memory for the file name
|
||||
file_name = (char*) malloc((strlen(taxdump) + 12)*sizeof(char));
|
||||
if (file_name == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_MALLOC_ERROR);
|
||||
obidebug(1, "\nError allocating the memory for a file name");
|
||||
obi_close_taxonomy(tax);
|
||||
free(parent_taxids);
|
||||
free(rank_names);
|
||||
free(delnodes);
|
||||
closedir(tax_dir);
|
||||
return NULL;
|
||||
}
|
||||
@ -1536,6 +1689,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
|
||||
free(parent_taxids);
|
||||
free(rank_names);
|
||||
free(file_name);
|
||||
free(delnodes);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -1549,13 +1703,15 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
|
||||
free(parent_taxids);
|
||||
free(rank_names);
|
||||
free(file_name);
|
||||
free(delnodes);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
free(file_name);
|
||||
|
||||
n = 0;
|
||||
i = 0;
|
||||
nT = 0; // to point in current taxa list while merging
|
||||
nD = delnodes_count-1; // to point in deleted taxids list while merging (going from count-1 to 0 because taxids are sorted in descending order)
|
||||
n = 0; // to point in final merged list while merging
|
||||
while (fgets(line, sizeof(line), file))
|
||||
{
|
||||
// Check for terminal '\n' character (line complete)
|
||||
@ -1568,6 +1724,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
|
||||
closedir(tax_dir);
|
||||
free(parent_taxids);
|
||||
free(rank_names);
|
||||
free(delnodes);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -1588,34 +1745,68 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
|
||||
taxid = atoi(elt);
|
||||
|
||||
// Store the old taxid in the merged_idx ordered taxid list
|
||||
// First, store the taxids from the current taxonomy that come before
|
||||
while ((i < (tax->taxa)->count) && ((tax->taxa)->taxon[i].taxid < old_taxid))
|
||||
// The merged list is an ordered list of the current taxids, the deprecated taxids that have current references,
|
||||
// and the deleted taxids with no current reference. An element of the list is composed of the taxid, and the index
|
||||
// of the taxon in the taxa structure, or -1 for deleted taxids.
|
||||
// Creating the merged list requires merging the 3 ordered lists into one.
|
||||
while (((nT < (tax->taxa)->count) && ((tax->taxa)->taxon[nT].taxid < old_taxid)) && ((nD >= 0) && (delnodes[nD] < old_taxid)))
|
||||
{
|
||||
// Enlarge structures if needed
|
||||
if (n == buffer_size)
|
||||
{
|
||||
buffer_size = buffer_size * 2;
|
||||
tax->merged_idx = (ecomergedidx_t*) realloc(tax->merged_idx, sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size);
|
||||
if (tax->merged_idx == NULL)
|
||||
if ((tax->taxa)->taxon[nT].taxid < delnodes[nD])
|
||||
{ // Add element from taxa list
|
||||
// Enlarge structure if needed
|
||||
if (n == buffer_size)
|
||||
{
|
||||
obi_set_errno(OBI_MALLOC_ERROR);
|
||||
obidebug(1, "\nError reallocating memory for a taxonomy structure");
|
||||
obi_close_taxonomy(tax);
|
||||
fclose(file);
|
||||
closedir(tax_dir);
|
||||
free(parent_taxids);
|
||||
free(rank_names);
|
||||
return NULL;
|
||||
buffer_size = buffer_size * 2;
|
||||
tax->merged_idx = (ecomergedidx_t*) realloc(tax->merged_idx, sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size);
|
||||
if (tax->merged_idx == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_MALLOC_ERROR);
|
||||
obidebug(1, "\nError reallocating memory for a taxonomy structure");
|
||||
obi_close_taxonomy(tax);
|
||||
fclose(file);
|
||||
closedir(tax_dir);
|
||||
free(parent_taxids);
|
||||
free(rank_names);
|
||||
free(delnodes);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
(tax->merged_idx)->merged[n].taxid = (tax->taxa)->taxon[i].taxid;
|
||||
(tax->merged_idx)->merged[n].idx = i;
|
||||
i++;
|
||||
n++;
|
||||
(tax->merged_idx)->merged[n].taxid = (tax->taxa)->taxon[nT].taxid;
|
||||
(tax->merged_idx)->merged[n].idx = nT;
|
||||
nT++;
|
||||
n++;
|
||||
}
|
||||
else if (delnodes[nD] < (tax->taxa)->taxon[nT].taxid)
|
||||
{ // Add element from deleted taxids list
|
||||
// Enlarge structure if needed
|
||||
if (n == buffer_size)
|
||||
{
|
||||
buffer_size = buffer_size * 2;
|
||||
tax->merged_idx = (ecomergedidx_t*) realloc(tax->merged_idx, sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size);
|
||||
if (tax->merged_idx == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_MALLOC_ERROR);
|
||||
obidebug(1, "\nError reallocating memory for a taxonomy structure");
|
||||
obi_close_taxonomy(tax);
|
||||
fclose(file);
|
||||
closedir(tax_dir);
|
||||
free(parent_taxids);
|
||||
free(rank_names);
|
||||
free(delnodes);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
(tax->merged_idx)->merged[n].taxid = delnodes[nD];
|
||||
(tax->merged_idx)->merged[n].idx = -1; // The index to tag deleted taxids is -1
|
||||
nD--;
|
||||
n++;
|
||||
}
|
||||
}
|
||||
|
||||
// Enlarge structures if needed
|
||||
// Add the deprecated taxid
|
||||
// Enlarge structure if needed
|
||||
if (n == buffer_size)
|
||||
{
|
||||
buffer_size = buffer_size * 2;
|
||||
@ -1629,6 +1820,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
|
||||
closedir(tax_dir);
|
||||
free(parent_taxids);
|
||||
free(rank_names);
|
||||
free(delnodes);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
@ -1675,6 +1867,10 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
|
||||
fclose(file);
|
||||
}
|
||||
}
|
||||
|
||||
// Free delnodes array, not needed anymore
|
||||
free(delnodes);
|
||||
|
||||
closedir(tax_dir);
|
||||
|
||||
|
||||
@ -2494,6 +2690,8 @@ ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid
|
||||
|
||||
if (indexed_taxon == NULL)
|
||||
current_taxon = NULL;
|
||||
else if (indexed_taxon->idx == -1)
|
||||
current_taxon = NULL; // TODO discuss what to do when old deleted taxon
|
||||
else
|
||||
current_taxon = (taxonomy->taxa->taxon)+(indexed_taxon->idx);
|
||||
|
||||
|
Reference in New Issue
Block a user