Taxonomy: reading merged.dmp file in taxdump

This commit is contained in:
Celine Mercier
2017-01-05 14:28:36 +01:00
parent 8e92bf6dac
commit 897032387f
2 changed files with 240 additions and 21 deletions

View File

@ -1041,6 +1041,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
FILE* file;
bool nodes_found=false;
bool names_found=false;
bool merged_found=false;
char line[2048]; // TODO large enough?
char* elt;
char* file_name;
@ -1049,9 +1050,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
int n;
char** rank_names;
int* parent_taxids;
int taxid;
int taxid, old_taxid;
bool already_in;
ecotx_t* t;
ecotx_t* t;
// Initialize taxonomy structure
tax = (OBIDMS_taxonomy_p) malloc(sizeof(OBIDMS_taxonomy_t));
@ -1061,9 +1062,10 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
obidebug(1, "\nError allocating the memory for a taxonomy structure");
return NULL;
}
tax->ranks = NULL;
tax->taxa = NULL;
tax->names = NULL;
tax->ranks = NULL;
tax->taxa = NULL;
tax->names = NULL;
tax->merged_idx = NULL;
tax->dms = NULL;
(tax->tax_name)[0] = '\0';
@ -1312,6 +1314,204 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
closedir(tax_dir);
// Go through directory again for next file // TODO make separate functions?
tax_dir = opendir(taxdump);
if (tax_dir == NULL)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nProblem opening a taxdump directory");
obi_close_taxonomy(tax);
free(parent_taxids);
free(rank_names);
return NULL;
}
// Go through taxonomy files
while ((dp = readdir(tax_dir)) != NULL)
{
if (strcmp(dp->d_name, "merged.dmp") == 0)
{
merged_found = true; // TODO
buffer_size = 10000;
// Initializing the merged structure
tax->merged_idx = (ecomergedidx_t*) malloc(sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size);
if (tax->merged_idx == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError allocating the memory for a taxonomy structure");
obi_close_taxonomy(tax);
free(parent_taxids);
free(rank_names);
closedir(tax_dir);
return NULL;
}
// Allocating the memory for the file name
file_name = (char*) malloc((strlen(taxdump) + 12)*sizeof(char));
if (file_name == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError allocating the memory for a file name");
obi_close_taxonomy(tax);
free(parent_taxids);
free(rank_names);
closedir(tax_dir);
return NULL;
}
// Build the file path
if (sprintf(file_name, "%s/merged.dmp", taxdump) < 0)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError building a taxonomy file name");
obi_close_taxonomy(tax);
closedir(tax_dir);
free(parent_taxids);
free(rank_names);
free(file_name);
return NULL;
}
file = fopen(file_name, "r");
if (file == NULL)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nProblem opening a taxonomy file");
obi_close_taxonomy(tax);
closedir(tax_dir);
free(parent_taxids);
free(rank_names);
free(file_name);
return NULL;
}
free(file_name);
n = 0;
i = 0;
while (fgets(line, sizeof(line), file))
{
// Check for terminal '\n' character (line complete)
if (line[strlen(line) - 1] != '\n')
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError: line buffer size not large enough for line in taxonomy file");
obi_close_taxonomy(tax);
fclose(file);
closedir(tax_dir);
free(parent_taxids);
free(rank_names);
return NULL;
}
// Parse the 2 elements separated by '|'
// Get first element
elt = strtok(line, "|");
// Remove the last character (tab character)
elt[strlen(elt)-1] = '\0';
// First element: old deprecated taxid
old_taxid = atoi(elt);
// Get 2nd element: new taxid
elt = strtok(NULL, "|");
// Remove the first and the last characters (tab characters)
elt = elt+1;
elt[strlen(elt)-1] = '\0';
taxid = atoi(elt);
// Store the old taxid in the merged_idx ordered taxid list
// First, store the taxids from the current taxonomy that come before
while ((i < (tax->taxa)->count) && ((tax->taxa)->taxon[i].taxid < old_taxid))
{
// Enlarge structures if needed
if (n == buffer_size)
{
buffer_size = buffer_size * 2;
tax->merged_idx = (ecomergedidx_t*) realloc(tax->merged_idx, sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size);
if (tax->merged_idx == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError reallocating memory for a taxonomy structure");
obi_close_taxonomy(tax);
fclose(file);
closedir(tax_dir);
free(parent_taxids);
free(rank_names);
return NULL;
}
}
(tax->merged_idx)->merged[n].taxid = (tax->taxa)->taxon[i].taxid;
(tax->merged_idx)->merged[n].idx = i;
i++;
n++;
}
// Enlarge structures if needed
if (n == buffer_size)
{
buffer_size = buffer_size * 2;
tax->merged_idx = (ecomergedidx_t*) realloc(tax->merged_idx, sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size);
if (tax->merged_idx == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError reallocating memory for a taxonomy structure");
obi_close_taxonomy(tax);
fclose(file);
closedir(tax_dir);
free(parent_taxids);
free(rank_names);
return NULL;
}
}
// Store the deprecated taxid with the index that refers to the new taxid
// Find the index of the new taxid
t = obi_taxo_get_taxon_with_taxid(tax, taxid);
// Store the old taxid with the index
(tax->merged_idx)->merged[n].taxid = old_taxid;
(tax->merged_idx)->merged[n].idx = t->idx;
n++;
}
// Check that fgets stopped because it reached EOF
if (!feof(file))
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError: file reading was stopped before end of file");
obi_close_taxonomy(tax);
fclose(file);
closedir(tax_dir);
free(parent_taxids);
free(rank_names);
return NULL;
}
// Store count
(tax->merged_idx)->count = n;
// Truncate the structure memory to the right size
tax->merged_idx = (ecomergedidx_t*) realloc(tax->merged_idx, sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * (tax->merged_idx)->count);
if (tax->merged_idx == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError reallocating memory for a a taxonomy structure");
obi_close_taxonomy(tax);
fclose(file);
closedir(tax_dir);
free(parent_taxids);
free(rank_names);
return NULL;
}
fclose(file);
}
}
closedir(tax_dir);
// Go through directory again for next file
tax_dir = opendir(taxdump);
if (tax_dir == NULL)
@ -1346,7 +1546,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
}
// Allocating the memory for the file name
file_name = (char*) malloc((strlen(taxdump) + 10)*sizeof(char));
file_name = (char*) malloc((strlen(taxdump) + 11)*sizeof(char));
if (file_name == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
@ -1684,7 +1884,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
(tax->taxa)->buffer_size = (tax->taxa)->count;
// Compute longest branches TODO what is this for???
// Compute longest branches
for (i=0; i < (tax->taxa)->count; i++)
{
t = (((tax->taxa))->taxon)+i;
@ -1844,9 +2044,10 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo
return NULL;
}
tax->ranks = NULL;
tax->taxa = NULL;
tax->names = NULL;
tax->ranks = NULL;
tax->taxa = NULL;
tax->names = NULL;
tax->merged_idx = NULL;
tax->dms = dms;
@ -2028,6 +2229,11 @@ int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy)
free(taxonomy->taxa);
}
if (taxonomy->merged_idx)
{
free(taxonomy->merged_idx);
}
free(taxonomy);
}

View File

@ -26,12 +26,12 @@ typedef struct {
int32_t rank;
int32_t parent;
int32_t name_length;
char name[1];
char name[];
} ecotxformat_t;
typedef struct ecotxnode {
int32_t taxid;
int32_t taxid; // TODO discuss: this will be the current taxid even if the struct was accessed through a deprecated one
int32_t rank;
int32_t farest;
int32_t idx;
@ -47,13 +47,13 @@ typedef struct {
int32_t local_count;
int32_t max_taxid;
int32_t buffer_size;
ecotx_t taxon[1];
ecotx_t taxon[];
} ecotxidx_t;
typedef struct {
int32_t count;
char* label[1];
char* label[];
} ecorankidx_t;
@ -62,7 +62,7 @@ typedef struct {
int32_t name_length;
int32_t class_length;
int32_t taxid; // taxid idx
char names[1];
char names[];
} econameformat_t;
@ -76,16 +76,29 @@ typedef struct {
typedef struct {
int32_t count;
econame_t names[1];
econame_t names[];
} econameidx_t;
// One entry of the merged-taxid index: associates a taxid with the index of a taxon.
// obi_read_taxdump() stores two kinds of entries:
//   - a taxid of the current taxonomy, with the index of its own taxon in the taxa array;
//   - an old (deprecated) taxid read from merged.dmp, with the idx field of the taxon
//     found for the new taxid (presumably that taxon's index in the taxa array -- TODO confirm).
typedef struct {
// Taxid, either current or deprecated
int32_t taxid;
// Index referring to the taxon that this taxid resolves to
int32_t idx;
} ecomerged_t;
// Index of all the taxids (current and deprecated) with the taxon index each one refers to.
// Allocated as a single memory block: the header followed by the entries, i.e.
// sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * number_of_entries.
// obi_read_taxdump() fills it by interleaving the taxids of the current taxonomy with the
// old taxids read from merged.dmp; it is meant to be ordered by taxid (this relies on both
// inputs being sorted by taxid -- TODO confirm).
typedef struct {
// Number of entries stored in merged
int32_t count;
// Entries (flexible array member)
ecomerged_t merged[];
} ecomergedidx_t;
typedef struct OBIDMS_taxonomy_t {
char tax_name[TAX_NAME_LEN];
OBIDMS_p dms;
ecorankidx_t* ranks;
econameidx_t* names;
ecotxidx_t* taxa;
char tax_name[TAX_NAME_LEN];
OBIDMS_p dms;
ecomergedidx_t* merged_idx;
ecorankidx_t* ranks;
econameidx_t* names;
ecotxidx_t* taxa;
} OBIDMS_taxonomy_t, *OBIDMS_taxonomy_p;