Taxonomy: reading merged.dmp file in taxdump
This commit is contained in:
@ -1041,6 +1041,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
|
||||
FILE* file;
|
||||
bool nodes_found=false;
|
||||
bool names_found=false;
|
||||
bool merged_found=false;
|
||||
char line[2048]; // TODO large enough?
|
||||
char* elt;
|
||||
char* file_name;
|
||||
@ -1049,9 +1050,9 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
|
||||
int n;
|
||||
char** rank_names;
|
||||
int* parent_taxids;
|
||||
int taxid;
|
||||
int taxid, old_taxid;
|
||||
bool already_in;
|
||||
ecotx_t* t;
|
||||
ecotx_t* t;
|
||||
|
||||
// Initialize taxonomy structure
|
||||
tax = (OBIDMS_taxonomy_p) malloc(sizeof(OBIDMS_taxonomy_t));
|
||||
@ -1061,9 +1062,10 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
|
||||
obidebug(1, "\nError allocating the memory for a taxonomy structure");
|
||||
return NULL;
|
||||
}
|
||||
tax->ranks = NULL;
|
||||
tax->taxa = NULL;
|
||||
tax->names = NULL;
|
||||
tax->ranks = NULL;
|
||||
tax->taxa = NULL;
|
||||
tax->names = NULL;
|
||||
tax->merged_idx = NULL;
|
||||
|
||||
tax->dms = NULL;
|
||||
(tax->tax_name)[0] = '\0';
|
||||
@ -1312,6 +1314,204 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
|
||||
closedir(tax_dir);
|
||||
|
||||
|
||||
// Go through directory again for next file // TODO make separate functions?
|
||||
tax_dir = opendir(taxdump);
|
||||
if (tax_dir == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_TAXONOMY_ERROR);
|
||||
obidebug(1, "\nProblem opening a taxdump directory");
|
||||
obi_close_taxonomy(tax);
|
||||
free(parent_taxids);
|
||||
free(rank_names);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Go through taxonomy files
|
||||
while ((dp = readdir(tax_dir)) != NULL)
|
||||
{
|
||||
if (strcmp(dp->d_name, "merged.dmp") == 0)
|
||||
{
|
||||
merged_found = true; // TODO
|
||||
buffer_size = 10000;
|
||||
|
||||
// Initializing the merged structure
|
||||
tax->merged_idx = (ecomergedidx_t*) malloc(sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size);
|
||||
if (tax->merged_idx == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_MALLOC_ERROR);
|
||||
obidebug(1, "\nError allocating the memory for a taxonomy structure");
|
||||
obi_close_taxonomy(tax);
|
||||
free(parent_taxids);
|
||||
free(rank_names);
|
||||
closedir(tax_dir);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Allocating the memory for the file name
|
||||
file_name = (char*) malloc((strlen(taxdump) + 12)*sizeof(char));
|
||||
if (file_name == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_MALLOC_ERROR);
|
||||
obidebug(1, "\nError allocating the memory for a file name");
|
||||
obi_close_taxonomy(tax);
|
||||
free(parent_taxids);
|
||||
free(rank_names);
|
||||
closedir(tax_dir);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Build the file path
|
||||
if (sprintf(file_name, "%s/merged.dmp", taxdump) < 0)
|
||||
{
|
||||
obi_set_errno(OBI_TAXONOMY_ERROR);
|
||||
obidebug(1, "\nError building a taxonomy file name");
|
||||
obi_close_taxonomy(tax);
|
||||
closedir(tax_dir);
|
||||
free(parent_taxids);
|
||||
free(rank_names);
|
||||
free(file_name);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
file = fopen(file_name, "r");
|
||||
if (file == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_TAXONOMY_ERROR);
|
||||
obidebug(1, "\nProblem opening a taxonomy file");
|
||||
obi_close_taxonomy(tax);
|
||||
closedir(tax_dir);
|
||||
free(parent_taxids);
|
||||
free(rank_names);
|
||||
free(file_name);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
free(file_name);
|
||||
|
||||
n = 0;
|
||||
i = 0;
|
||||
while (fgets(line, sizeof(line), file))
|
||||
{
|
||||
// Check for terminal '\n' character (line complete)
|
||||
if (line[strlen(line) - 1] != '\n')
|
||||
{
|
||||
obi_set_errno(OBI_TAXONOMY_ERROR);
|
||||
obidebug(1, "\nError: line buffer size not large enough for line in taxonomy file");
|
||||
obi_close_taxonomy(tax);
|
||||
fclose(file);
|
||||
closedir(tax_dir);
|
||||
free(parent_taxids);
|
||||
free(rank_names);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Parse the 2 elements separated by '|'
|
||||
|
||||
// Get first element
|
||||
elt = strtok(line, "|");
|
||||
// Remove the last character (tab character)
|
||||
elt[strlen(elt)-1] = '\0';
|
||||
// First element: old deprecated taxid
|
||||
old_taxid = atoi(elt);
|
||||
|
||||
// Get 2nd element: new taxid
|
||||
elt = strtok(NULL, "|");
|
||||
// Remove the first and the last characters (tab characters)
|
||||
elt = elt+1;
|
||||
elt[strlen(elt)-1] = '\0';
|
||||
taxid = atoi(elt);
|
||||
|
||||
// Store the old taxid in the merged_idx ordered taxid list
|
||||
// First, store the taxids from the current taxonomy that come before
|
||||
while ((i < (tax->taxa)->count) && ((tax->taxa)->taxon[i].taxid < old_taxid))
|
||||
{
|
||||
// Enlarge structures if needed
|
||||
if (n == buffer_size)
|
||||
{
|
||||
buffer_size = buffer_size * 2;
|
||||
tax->merged_idx = (ecomergedidx_t*) realloc(tax->merged_idx, sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size);
|
||||
if (tax->merged_idx == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_MALLOC_ERROR);
|
||||
obidebug(1, "\nError reallocating memory for a taxonomy structure");
|
||||
obi_close_taxonomy(tax);
|
||||
fclose(file);
|
||||
closedir(tax_dir);
|
||||
free(parent_taxids);
|
||||
free(rank_names);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
(tax->merged_idx)->merged[n].taxid = (tax->taxa)->taxon[i].taxid;
|
||||
(tax->merged_idx)->merged[n].idx = i;
|
||||
i++;
|
||||
n++;
|
||||
}
|
||||
|
||||
// Enlarge structures if needed
|
||||
if (n == buffer_size)
|
||||
{
|
||||
buffer_size = buffer_size * 2;
|
||||
tax->merged_idx = (ecomergedidx_t*) realloc(tax->merged_idx, sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size);
|
||||
if (tax->merged_idx == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_MALLOC_ERROR);
|
||||
obidebug(1, "\nError reallocating memory for a taxonomy structure");
|
||||
obi_close_taxonomy(tax);
|
||||
fclose(file);
|
||||
closedir(tax_dir);
|
||||
free(parent_taxids);
|
||||
free(rank_names);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
// Store the deprecated taxid with the index that refers to the new taxid
|
||||
// Find the index of the new taxid
|
||||
t = obi_taxo_get_taxon_with_taxid(tax, taxid);
|
||||
// Store the old taxid with the index
|
||||
(tax->merged_idx)->merged[n].taxid = old_taxid;
|
||||
(tax->merged_idx)->merged[n].idx = t->idx;
|
||||
n++;
|
||||
}
|
||||
|
||||
// Check that fgets stopped because it reached EOF
|
||||
if (!feof(file))
|
||||
{
|
||||
obi_set_errno(OBI_TAXONOMY_ERROR);
|
||||
obidebug(1, "\nError: file reading was stopped before end of file");
|
||||
obi_close_taxonomy(tax);
|
||||
fclose(file);
|
||||
closedir(tax_dir);
|
||||
free(parent_taxids);
|
||||
free(rank_names);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Store count
|
||||
(tax->merged_idx)->count = n;
|
||||
|
||||
// Truncate the structure memory to the right size
|
||||
tax->merged_idx = (ecomergedidx_t*) realloc(tax->merged_idx, sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * (tax->merged_idx)->count);
|
||||
if (tax->merged_idx == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_MALLOC_ERROR);
|
||||
obidebug(1, "\nError reallocating memory for a a taxonomy structure");
|
||||
obi_close_taxonomy(tax);
|
||||
fclose(file);
|
||||
closedir(tax_dir);
|
||||
free(parent_taxids);
|
||||
free(rank_names);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
fclose(file);
|
||||
}
|
||||
}
|
||||
closedir(tax_dir);
|
||||
|
||||
|
||||
// Go through directory again for next file
|
||||
tax_dir = opendir(taxdump);
|
||||
if (tax_dir == NULL)
|
||||
@ -1346,7 +1546,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
|
||||
}
|
||||
|
||||
// Allocating the memory for the file name
|
||||
file_name = (char*) malloc((strlen(taxdump) + 10)*sizeof(char));
|
||||
file_name = (char*) malloc((strlen(taxdump) + 11)*sizeof(char));
|
||||
if (file_name == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_MALLOC_ERROR);
|
||||
@ -1684,7 +1884,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
|
||||
|
||||
(tax->taxa)->buffer_size = (tax->taxa)->count;
|
||||
|
||||
// Compute longest branches TODO what is this for???
|
||||
// Compute longest branches
|
||||
for (i=0; i < (tax->taxa)->count; i++)
|
||||
{
|
||||
t = (((tax->taxa))->taxon)+i;
|
||||
@ -1844,9 +2044,10 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo
|
||||
return NULL;
|
||||
}
|
||||
|
||||
tax->ranks = NULL;
|
||||
tax->taxa = NULL;
|
||||
tax->names = NULL;
|
||||
tax->ranks = NULL;
|
||||
tax->taxa = NULL;
|
||||
tax->names = NULL;
|
||||
tax->merged_idx = NULL;
|
||||
|
||||
tax->dms = dms;
|
||||
|
||||
@ -2028,6 +2229,11 @@ int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy)
|
||||
free(taxonomy->taxa);
|
||||
}
|
||||
|
||||
if (taxonomy->merged_idx)
|
||||
{
|
||||
free(taxonomy->merged_idx);
|
||||
}
|
||||
|
||||
free(taxonomy);
|
||||
}
|
||||
|
||||
|
@ -26,12 +26,12 @@ typedef struct {
|
||||
int32_t rank;
|
||||
int32_t parent;
|
||||
int32_t name_length;
|
||||
char name[1];
|
||||
char name[];
|
||||
} ecotxformat_t;
|
||||
|
||||
|
||||
typedef struct ecotxnode {
|
||||
int32_t taxid;
|
||||
int32_t taxid; // TODO discuss: this will be the current taxid even if the struct was accessed through a deprecated one
|
||||
int32_t rank;
|
||||
int32_t farest;
|
||||
int32_t idx;
|
||||
@ -47,13 +47,13 @@ typedef struct {
|
||||
int32_t local_count;
|
||||
int32_t max_taxid;
|
||||
int32_t buffer_size;
|
||||
ecotx_t taxon[1];
|
||||
ecotx_t taxon[];
|
||||
} ecotxidx_t;
|
||||
|
||||
|
||||
typedef struct {
|
||||
int32_t count;
|
||||
char* label[1];
|
||||
char* label[];
|
||||
} ecorankidx_t;
|
||||
|
||||
|
||||
@ -62,7 +62,7 @@ typedef struct {
|
||||
int32_t name_length;
|
||||
int32_t class_length;
|
||||
int32_t taxid; // taxid idx
|
||||
char names[1];
|
||||
char names[];
|
||||
} econameformat_t;
|
||||
|
||||
|
||||
@ -76,16 +76,29 @@ typedef struct {
|
||||
|
||||
typedef struct {
|
||||
int32_t count;
|
||||
econame_t names[1];
|
||||
econame_t names[];
|
||||
} econameidx_t;
|
||||
|
||||
|
||||
// One entry of the merged taxid index: associates a taxid with the idx of a taxon.
// obi_read_taxdump fills entries both for taxids of the current taxonomy
// (idx = position of the taxon in taxa->taxon) and for deprecated taxids read
// from merged.dmp (idx = idx field of the taxon found for the new taxid).
typedef struct {
|
||||
// Taxid of this entry: either a current taxid or a deprecated (merged) one.
int32_t taxid;
|
||||
// idx of the taxon that this taxid refers to.
// NOTE(review): presumably an index into ecotxidx_t.taxon — confirm against ecotx_t.idx usage.
int32_t idx;
|
||||
} ecomerged_t;
|
||||
|
||||
|
||||
// Index of taxids (current and deprecated) pointing to taxa.
// Allocated as a single block: sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * <number of entries>.
typedef struct {
|
||||
// Number of entries stored in merged[].
int32_t count;
|
||||
// Entries, stored as a flexible array member.
// NOTE(review): the reader in obi_read_taxdump describes this as an ordered taxid list;
// the ordering relies on taxa and merged.dmp both being sorted by taxid — confirm.
ecomerged_t merged[];
|
||||
} ecomergedidx_t;
|
||||
|
||||
|
||||
typedef struct OBIDMS_taxonomy_t {
|
||||
char tax_name[TAX_NAME_LEN];
|
||||
OBIDMS_p dms;
|
||||
ecorankidx_t* ranks;
|
||||
econameidx_t* names;
|
||||
ecotxidx_t* taxa;
|
||||
char tax_name[TAX_NAME_LEN];
|
||||
OBIDMS_p dms;
|
||||
ecomergedidx_t* merged_idx;
|
||||
ecorankidx_t* ranks;
|
||||
econameidx_t* names;
|
||||
ecotxidx_t* taxa;
|
||||
} OBIDMS_taxonomy_t, *OBIDMS_taxonomy_p;
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user