Taxonomy: functions to read the *.adx file (containing the deprecated
and current taxids and their corresponding indices in the taxa
structure) and to find the taxa using the merged index.
This commit is contained in:
Celine Mercier
2017-01-06 15:52:21 +01:00
parent f396625f98
commit d68374018b
2 changed files with 132 additions and 9 deletions

View File

@ -35,7 +35,7 @@ int cmp_rank_labels(const void* label1, const void* label2)
}
static int cmp_taxids(const void* ptaxid, const void* ptaxon)
static int cmp_taxids_in_ecotx_t(const void* ptaxid, const void* ptaxon)
{
ecotx_t* current_taxon = (ecotx_t*) ptaxon;
int32_t taxid = (int32_t) ((size_t) ptaxid);
@ -43,6 +43,14 @@ static int cmp_taxids(const void* ptaxid, const void* ptaxon)
}
/*
 * bsearch() comparison callback for an array of ecomerged_t records.
 *
 * ptaxid : the searched taxid, smuggled through the key pointer itself
 *          (callers pass `(const void*) ((size_t) taxid)`), NOT a pointer
 *          to a taxid.
 * ptaxon : pointer to the ecomerged_t element being compared against.
 *
 * Returns a negative, zero or positive value when the searched taxid is
 * respectively lower than, equal to or greater than the element's taxid.
 */
static int cmp_taxids_in_ecomerged_t(const void* ptaxid, const void* ptaxon)
{
	const ecomerged_t* current_taxon = (const ecomerged_t*) ptaxon;
	int32_t            taxid         = (int32_t) ((size_t) ptaxid);

	// Three-way comparison instead of a subtraction: the difference of two
	// int32_t values can overflow a signed int, which is undefined behavior.
	return (taxid > current_taxon->taxid) - (taxid < current_taxon->taxid);
}
static int cmp_str(const void* s1, const void* s2)
{
return strcmp(*((char**)s1), *((char**)s2));
@ -467,6 +475,55 @@ econameidx_t* read_nameidx(const char *file_name, OBIDMS_taxonomy_p taxonomy)
/*
 * Reads a merged index file (*.adx) into a newly allocated ecomergedidx_t.
 *
 * file_name : path of the merged index file to read.
 * taxonomy  : not used by this function.
 *
 * Returns the merged index structure, whose `count` field is the number of
 * records announced by the file and whose `merged` array holds one
 * ecomerged_t per record, or NULL if an error occurred. The returned
 * structure is allocated with malloc(); the caller is responsible for
 * releasing it.
 */
ecomergedidx_t* read_mergedidx(const char *file_name, OBIDMS_taxonomy_p taxonomy)
{
	int32_t         count;
	FILE*           f;
	ecomergedidx_t* index_merged_idx;
	ecomerged_t*    merged_idx;
	int32_t         i;
	int32_t         record_length;

	f = open_ecorecorddb(file_name, &count, 0);
	if (f == NULL)
	{
		obidebug(1, "\nError reading taxonomy merged index file");
		return NULL;
	}

	// The record count comes from the file: refuse a negative value before
	// it is turned into an allocation size
	if (count < 0)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError reading taxonomy merged index file: invalid record count");
		fclose(f);
		return NULL;
	}

	index_merged_idx = (ecomergedidx_t*) malloc(sizeof(ecomergedidx_t) + (sizeof(ecomerged_t) * (size_t) count));
	if (index_merged_idx == NULL)
	{
		obi_set_errno(OBI_MALLOC_ERROR);
		obidebug(1, "\nError reading taxonomy merged index file");
		fclose(f);
		return NULL;
	}

	index_merged_idx->count = count;

	for (i=0; i < count; i++)
	{
		merged_idx = read_ecorecord(f, &record_length);

		// Check the record BEFORE copying it: a failed read must not be
		// dereferenced, and a record larger than one ecomerged_t would
		// overflow the destination slot.
		// NOTE(review): the buffer returned by read_ecorecord() is not freed
		// here, as in the original code -- presumably it is owned by
		// read_ecorecord(); confirm against its definition.
		if ((merged_idx == NULL) || (record_length < 0) || (((size_t) record_length) > sizeof(ecomerged_t)))
		{
			obi_set_errno(OBI_TAXONOMY_ERROR);
			obidebug(1, "\nError reading taxonomy merged index file");
			free(index_merged_idx);
			fclose(f);
			return NULL;
		}

		memcpy((index_merged_idx->merged)+i, merged_idx, (size_t) record_length);
	}

	fclose(f);

	return index_merged_idx;
}
// Functions to write taxonomy structure to binary files
@ -1003,6 +1060,7 @@ int write_mergedidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_na
int file_descriptor;
off_t file_size;
char* taxonomy_path;
int32_t record_size;
// Compute file size
file_size = sizeof(int32_t) + (sizeof(int32_t) * 3 * (tax->merged_idx)->count);
@ -1058,9 +1116,20 @@ int write_mergedidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_na
return -1;
}
record_size = 2 * sizeof(int32_t);
// Write merged indices
for (i=0; i < (tax->merged_idx)->count; i++)
{
// Write record size
if (write(file_descriptor, &(record_size), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError writing in a binary taxonomy file");
close(file_descriptor);
return -1;
}
// Write taxid
if (write(file_descriptor, &(((tax->merged_idx)->merged)[i].taxid), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
{
@ -1566,7 +1635,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
// Store the deprecated taxid with the index that refers to the new taxid
// Find the index of the new taxid
t = obi_taxo_get_taxon_with_taxid(tax, taxid);
t = obi_taxo_get_taxon_with_current_taxid(tax, taxid);
// Store the old taxid with the index
(tax->merged_idx)->merged[n].taxid = old_taxid;
(tax->merged_idx)->merged[n].idx = t->idx;
@ -1966,7 +2035,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
// Associate the taxa with their parent
for (i=0; i < (tax->taxa)->count; i++)
{
((tax->taxa)->taxon)[i].parent = obi_taxo_get_taxon_with_taxid(tax, parent_taxids[i]);
((tax->taxa)->taxon)[i].parent = obi_taxo_get_taxon_with_current_taxid(tax, parent_taxids[i]);
if (((tax->taxa)->taxon)[i].parent == NULL)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
@ -2129,6 +2198,7 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo
char* taxonomy_path;
char* ranks_file_name;
char* taxa_file_name;
char* merged_idx_file_name;
char* local_taxa_file_name;
char* alter_names_file_name;
int buffer_size;
@ -2238,6 +2308,35 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo
free(taxa_file_name);
free(local_taxa_file_name);
// Read merged index (old and current taxids referring to indices in the taxa structure)
merged_idx_file_name = (char*) malloc(buffer_size*sizeof(char));
if (merged_idx_file_name == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError allocating memory for merged index file name");
free(taxonomy_path);
obi_close_taxonomy(tax);
return NULL;
}
if (snprintf(merged_idx_file_name, buffer_size, "%s/%s.adx", taxonomy_path, taxonomy_name) < 0)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError building merged index file name");
free(taxonomy_path);
free(merged_idx_file_name);
obi_close_taxonomy(tax);
return NULL;
}
tax->merged_idx = read_mergedidx(merged_idx_file_name, tax);
if (tax->merged_idx == NULL)
{
free(taxonomy_path);
free(merged_idx_file_name);
obi_close_taxonomy(tax);
return NULL;
}
free(merged_idx_file_name);
// Read alternative names
if (read_alternative_names)
{
@ -2363,10 +2462,10 @@ ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx)
}
ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid)
ecotx_t* obi_taxo_get_taxon_with_current_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid) // TODO discuss keeping private?
{
ecotx_t *current_taxon;
int32_t count;
ecotx_t *current_taxon;
int32_t count;
count = (taxonomy->taxa)->count;
@ -2374,12 +2473,35 @@ ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid
(const void *) taxonomy->taxa->taxon,
count,
sizeof(ecotx_t),
cmp_taxids);
cmp_taxids_in_ecotx_t);
return current_taxon;
}
bool obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid)
/*
 * Finds a taxon from a taxid by looking the taxid up in the merged index
 * of the taxonomy, then following the index stored there into the taxa
 * array.
 *
 * taxonomy : the taxonomy structure to search.
 * taxid    : the taxid to look for.
 *
 * Returns a pointer to the taxon inside taxonomy->taxa->taxon, or NULL if
 * the taxonomy (or its merged index / taxa) is missing, if the taxid is not
 * in the merged index, or if the index stored for that taxid does not
 * designate an element of the taxa array.
 *
 * The lookup uses bsearch(), so the merged index must be sorted by taxid.
 */
ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid)
{
	ecomerged_t *indexed_taxon;
	int32_t      count;

	if ((taxonomy == NULL) || (taxonomy->merged_idx == NULL) || (taxonomy->taxa == NULL))
		return NULL;

	count = (taxonomy->merged_idx)->count;
	if (count <= 0)
		return NULL;

	// The taxid is passed as the key pointer value itself, which is what
	// cmp_taxids_in_ecomerged_t expects
	indexed_taxon = (ecomerged_t*) bsearch((const void *) ((size_t) taxid),
	                                       (const void *) taxonomy->merged_idx->merged,
	                                       (size_t) count,
	                                       sizeof(ecomerged_t),
	                                       cmp_taxids_in_ecomerged_t);

	if (indexed_taxon == NULL)
		return NULL;

	// Never turn an index that falls outside the taxa array into a pointer
	if ((indexed_taxon->idx < 0) || (indexed_taxon->idx >= (taxonomy->taxa)->count))
		return NULL;

	return (taxonomy->taxa->taxon) + (indexed_taxon->idx);
}
bool obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid) // TODO discuss that this doesn't work with deprecated taxids
{
ecotx_t* next_parent;
@ -2486,7 +2608,7 @@ ecotx_t* obi_taxo_get_kingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
{
static OBIDMS_taxonomy_p tax = NULL;
static int32_t rankindex = -1;
static int32_t rankindex = -1;
if (taxonomy && (tax != taxonomy))
{

View File

@ -108,6 +108,7 @@ int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy);
ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx);
ecotx_t* obi_taxo_get_taxon_with_current_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid);
ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid);
bool obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid);