Taxonomy: information about deleted taxids is now read from

delnodes.dmp file and added to *.adx file
This commit is contained in:
Celine Mercier
2017-01-09 17:28:49 +01:00
parent d68374018b
commit 41ad3deec0

View File

@ -1208,12 +1208,15 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
bool nodes_found=false;
bool names_found=false;
bool merged_found=false;
bool delnodes_found=false;
int32_t* delnodes=NULL;
int32_t delnodes_count;
char line[2048]; // TODO large enough?
char* elt;
char* file_name;
int buffer_size;
int i, j;
int n;
int n, nD, nT;
char** rank_names;
int* parent_taxids;
int taxid, old_taxid;
@ -1495,14 +1498,14 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
// Go through taxonomy files
while ((dp = readdir(tax_dir)) != NULL)
{
if (strcmp(dp->d_name, "merged.dmp") == 0)
if (strcmp(dp->d_name, "delnodes.dmp") == 0)
{
merged_found = true;
delnodes_found = true;
buffer_size = 10000;
// Initializing the merged structure
tax->merged_idx = (ecomergedidx_t*) malloc(sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size);
if (tax->merged_idx == NULL)
// Initializing the list of deleted nodes
delnodes = (int32_t*) malloc(sizeof(int32_t) * buffer_size);
if (delnodes == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError allocating the memory for a taxonomy structure");
@ -1522,6 +1525,156 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
obi_close_taxonomy(tax);
free(parent_taxids);
free(rank_names);
free(delnodes);
closedir(tax_dir);
return NULL;
}
// Build the file path
if (sprintf(file_name, "%s/delnodes.dmp", taxdump) < 0)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError building a taxonomy file name");
obi_close_taxonomy(tax);
closedir(tax_dir);
free(parent_taxids);
free(rank_names);
free(file_name);
free(delnodes);
return NULL;
}
file = fopen(file_name, "r");
if (file == NULL)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nProblem opening a taxonomy file");
obi_close_taxonomy(tax);
closedir(tax_dir);
free(parent_taxids);
free(rank_names);
free(file_name);
free(delnodes);
return NULL;
}
free(file_name);
n = 0;
// Read delnodes.dmp line by line: each line holds one deleted taxid, which is
// appended to the delnodes array (n counts the taxids stored so far).
while (fgets(line, sizeof(line), file))
{
    // Check for terminal '\n' character (line complete)
    if (line[strlen(line) - 1] != '\n')
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError: line buffer size not large enough for line in taxonomy file");
        obi_close_taxonomy(tax);
        fclose(file);
        closedir(tax_dir);
        free(parent_taxids);
        free(rank_names);
        free(delnodes);
        return NULL;
    }
    // Get first and only element of the line (the deprecated taxid)
    elt = strtok(line, "|");
    // Remove the last character (tab character)
    elt[strlen(elt)-1] = '\0';
    // First element: old deprecated taxid
    old_taxid = atoi(elt);
    // Store the old taxid in the list of deleted taxids
    // Enlarge array if needed
    if (n == buffer_size)
    {
        int32_t* enlarged_delnodes;

        buffer_size = buffer_size * 2;
        // Grow the delnodes array itself, going through a temporary pointer so that
        // the original array is still reachable (and can be freed) if realloc fails
        enlarged_delnodes = (int32_t*) realloc(delnodes, sizeof(int32_t) * buffer_size);
        if (enlarged_delnodes == NULL)
        {
            obi_set_errno(OBI_MALLOC_ERROR);
            obidebug(1, "\nError reallocating memory for a taxonomy structure");
            obi_close_taxonomy(tax);
            fclose(file);
            closedir(tax_dir);
            free(parent_taxids);
            free(rank_names);
            free(delnodes);
            return NULL;
        }
        delnodes = enlarged_delnodes;
    }
    delnodes[n] = old_taxid;
    n++;
}
// Check that fgets stopped because it reached EOF
if (!feof(file))
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError: file reading was stopped before end of file");
obi_close_taxonomy(tax);
fclose(file);
closedir(tax_dir);
free(parent_taxids);
free(rank_names);
free(delnodes);
return NULL;
}
// Store count
delnodes_count = n;
fclose(file);
}
}
closedir(tax_dir);
// Go through directory again for next file // TODO make separate functions?
tax_dir = opendir(taxdump);
if (tax_dir == NULL)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nProblem opening a taxdump directory");
obi_close_taxonomy(tax);
free(parent_taxids);
free(rank_names);
free(delnodes);
return NULL;
}
// Go through taxonomy files
while ((dp = readdir(tax_dir)) != NULL)
{
if (strcmp(dp->d_name, "merged.dmp") == 0)
{
merged_found = true;
buffer_size = 10000;
// Initializing the merged structure
tax->merged_idx = (ecomergedidx_t*) malloc(sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size);
if (tax->merged_idx == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError allocating the memory for a taxonomy structure");
obi_close_taxonomy(tax);
free(parent_taxids);
free(rank_names);
free(delnodes);
closedir(tax_dir);
return NULL;
}
// Allocating the memory for the file name
file_name = (char*) malloc((strlen(taxdump) + 12)*sizeof(char));
if (file_name == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError allocating the memory for a file name");
obi_close_taxonomy(tax);
free(parent_taxids);
free(rank_names);
free(delnodes);
closedir(tax_dir);
return NULL;
}
@ -1536,6 +1689,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
free(parent_taxids);
free(rank_names);
free(file_name);
free(delnodes);
return NULL;
}
@ -1549,13 +1703,15 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
free(parent_taxids);
free(rank_names);
free(file_name);
free(delnodes);
return NULL;
}
free(file_name);
n = 0;
i = 0;
nT = 0; // to point in current taxa list while merging
nD = delnodes_count-1; // to point in deleted taxids list while merging (going from count-1 to 0 because taxids are sorted in descending order)
n = 0; // to point in final merged list while merging
while (fgets(line, sizeof(line), file))
{
// Check for terminal '\n' character (line complete)
@ -1568,6 +1724,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
closedir(tax_dir);
free(parent_taxids);
free(rank_names);
free(delnodes);
return NULL;
}
@ -1588,10 +1745,15 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
taxid = atoi(elt);
// Store the old taxid in the merged_idx ordered taxid list
// First, store the taxids from the current taxonomy that come before
while ((i < (tax->taxa)->count) && ((tax->taxa)->taxon[i].taxid < old_taxid))
// The merged list is an ordered list of the current taxids, the deprecated taxids that have current references,
// and the deleted taxids with no current reference. An element of the list is composed of the taxid, and the index
// of the taxon in the taxa structure, or -1 for deleted taxids.
// Creating the merged list requires to merge the 3 ordered lists into one.
while (((nT < (tax->taxa)->count) && ((tax->taxa)->taxon[nT].taxid < old_taxid)) && ((nD >= 0) && (delnodes[nD] < old_taxid)))
{
// Enlarge structures if needed
if ((tax->taxa)->taxon[nT].taxid < delnodes[nD])
{ // Add element from taxa list
// Enlarge structure if needed
if (n == buffer_size)
{
buffer_size = buffer_size * 2;
@ -1605,17 +1767,19 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
closedir(tax_dir);
free(parent_taxids);
free(rank_names);
free(delnodes);
return NULL;
}
}
(tax->merged_idx)->merged[n].taxid = (tax->taxa)->taxon[i].taxid;
(tax->merged_idx)->merged[n].idx = i;
i++;
(tax->merged_idx)->merged[n].taxid = (tax->taxa)->taxon[nT].taxid;
(tax->merged_idx)->merged[n].idx = nT;
nT++;
n++;
}
// Enlarge structures if needed
else if (delnodes[nD] < (tax->taxa)->taxon[nT].taxid)
{ // Add element from deleted taxids list
// Enlarge structure if needed
if (n == buffer_size)
{
buffer_size = buffer_size * 2;
@ -1629,6 +1793,34 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
closedir(tax_dir);
free(parent_taxids);
free(rank_names);
free(delnodes);
return NULL;
}
}
(tax->merged_idx)->merged[n].taxid = delnodes[nD];
(tax->merged_idx)->merged[n].idx = -1; // The index to tag deleted taxids is -1
nD--;
n++;
}
}
// Add the deprecated taxid
// Enlarge structure if needed
if (n == buffer_size)
{
buffer_size = buffer_size * 2;
tax->merged_idx = (ecomergedidx_t*) realloc(tax->merged_idx, sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size);
if (tax->merged_idx == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError reallocating memory for a taxonomy structure");
obi_close_taxonomy(tax);
fclose(file);
closedir(tax_dir);
free(parent_taxids);
free(rank_names);
free(delnodes);
return NULL;
}
}
@ -1675,6 +1867,10 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
fclose(file);
}
}
// Free delnodes array, not needed anymore
free(delnodes);
closedir(tax_dir);
@ -2494,6 +2690,8 @@ ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid
if (indexed_taxon == NULL)
current_taxon = NULL;
else if (indexed_taxon->idx == -1)
current_taxon = NULL; // TODO discuss what to do when old deleted taxon
else
current_taxon = (taxonomy->taxa->taxon)+(indexed_taxon->idx);