Files
obitools3/src/obidms_taxonomy.c

2880 lines
73 KiB
C
Raw Normal View History

/********************************************************************
* OBIDMS taxonomy functions *
********************************************************************/
/**
* @file obidms_taxonomy.c
* @author Celine Mercier (celine.mercier@metabarcoding.org)
* @date March 2nd 2016
* @brief Functions for reading binary taxonomy files.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <unistd.h>
#include "obidms_taxonomy.h"
#include "obidms.h"
#include "obidebug.h"
#include "obierrno.h"
#include "utils.h"
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
/**
 * Comparison function used to look for a rank label in an array of rank labels.
 *
 * The two arguments do not have the same level of indirection:
 *
 * @param label1 The searched label, a C string (char*).
 * @param label2 A pointer on an element of the array of labels, i.e. a pointer on a C string (char**).
 *
 * @returns The result of strcmp() between the searched label and the label of the array element.
 */
int cmp_rank_labels(const void* label1, const void* label2)
{
	const char*        searched_label = (const char*) label1;
	const char* const* array_element  = (const char* const*) label2;

	return strcmp(searched_label, *array_element);
}
/**
 * Comparison function used to look for a taxid in an array of ecotx_t structures.
 *
 * @param ptaxid The searched taxid. It is not a real pointer: the taxid value is stored
 *               directly in the pointer value.
 * @param ptaxon A pointer on the ecotx_t structure to compare the taxid with.
 *
 * @returns A negative value, 0, or a positive value if the searched taxid is respectively
 *          smaller than, equal to, or greater than the taxid of the structure.
 */
static int cmp_taxids_in_ecotx_t(const void* ptaxid, const void* ptaxon)
{
	const ecotx_t* current_taxon = (const ecotx_t*) ptaxon;
	int32_t        taxid         = (int32_t) ((size_t) ptaxid);

	// Compare instead of subtracting, as a subtraction could overflow
	return (taxid > current_taxon->taxid) - (taxid < current_taxon->taxid);
}
/**
 * Comparison function used to look for a taxid in an array of ecomerged_t structures.
 *
 * @param ptaxid The searched taxid. It is not a real pointer: the taxid value is stored
 *               directly in the pointer value.
 * @param ptaxon A pointer on the ecomerged_t structure to compare the taxid with.
 *
 * @returns A negative value, 0, or a positive value if the searched taxid is respectively
 *          smaller than, equal to, or greater than the taxid of the structure.
 */
static int cmp_taxids_in_ecomerged_t(const void* ptaxid, const void* ptaxon)
{
	const ecomerged_t* current_taxon = (const ecomerged_t*) ptaxon;
	int32_t            taxid         = (int32_t) ((size_t) ptaxid);

	// Compare instead of subtracting, as a subtraction could overflow
	return (taxid > current_taxon->taxid) - (taxid < current_taxon->taxid);
}
/**
 * Comparison function for arrays of C strings.
 *
 * @param s1 A pointer on the first string (char**).
 * @param s2 A pointer on the second string (char**).
 *
 * @returns The result of strcmp() between the two pointed strings.
 */
static int cmp_str(const void* s1, const void* s2)
{
	char* first_string  = *((char**) s1);
	char* second_string = *((char**) s2);

	return strcmp(first_string, second_string);
}
/**
 * Comparison function for arrays of econame_t structures, comparing their 'name' fields.
 *
 * @param n1 A pointer on the first econame_t structure.
 * @param n2 A pointer on the second econame_t structure.
 *
 * @returns The result of strcmp() between the names of the two structures.
 */
static int cmp_names(const void* n1, const void* n2)
{
	const econame_t* first  = (const econame_t*) n1;
	const econame_t* second = (const econame_t*) n2;

	return strcmp(first->name, second->name);
}
/**
 * Builds the path of the directory of a taxonomy: "<taxonomy directory of the DMS>/<tax_name>".
 *
 * @param dms The DMS, passed to obi_dms_get_full_path() to get the path of TAXONOMY_DIR_NAME.
 * @param tax_name The name of the taxonomy.
 *
 * @returns A newly allocated string that has to be freed by the caller.
 * @retval NULL if an error occurred.
 */
char* get_taxonomy_path(OBIDMS_p dms, const char* tax_name)
{
	char* all_tax_dir_path;
	char* tax_path;

	all_tax_dir_path = obi_dms_get_full_path(dms, TAXONOMY_DIR_NAME);
	if (all_tax_dir_path == NULL)
		return NULL;

	// +2: one character for the '/' separator and one for the terminal null character
	tax_path = (char*) malloc((strlen(all_tax_dir_path) + strlen(tax_name) + 2)*sizeof(char));
	if (tax_path == NULL)
	{
		obi_set_errno(OBI_MALLOC_ERROR);
		obidebug(1, "\nError allocating memory for taxonomy path");
		free(all_tax_dir_path);
		return NULL;
	}

	if (sprintf(tax_path, "%s/%s", all_tax_dir_path, tax_name) < 0)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError building taxonomy path");
		free(all_tax_dir_path);
		free(tax_path);		// Was leaked on this error path
		return NULL;
	}

	free(all_tax_dir_path);

	return tax_path;
}
/**
 * Returns the index of a rank label in a rank index structure.
 *
 * The label is looked for with bsearch(), which requires the labels to be sorted
 * in the order defined by cmp_rank_labels().
 *
 * @param label The rank label to look for.
 * @param ranks The rank index structure holding the array of labels and their count.
 *
 * @returns The index of the label in the array of labels.
 * @retval -1 if the label was not found.
 */
int32_t rank_index(const char* label, ecorankidx_t* ranks)
{
	char** found;

	found = bsearch(label, ranks->label, ranks->count, sizeof(char*), cmp_rank_labels);

	if (found == NULL)
		return -1;

	// The index is the distance between the found element and the beginning of the array
	return found - ranks->label;
}
/**
 * Reads the next record of a binary taxonomy file.
 *
 * A record is made of its size (int32_t) followed by that number of bytes.
 *
 * The record is read in a static buffer owned by this function and reused by each call:
 * the returned pointer must not be freed by the caller and its content is only valid
 * until the next call. Because of the static buffer, the function is not reentrant.
 *
 * @param f The file to read from, positioned at the beginning of a record.
 * @param record_size A pointer on the variable where the size of the record (in bytes) is stored.
 *
 * @returns A pointer on the buffer containing the record.
 * @retval NULL if an error occurred (including reaching the end of the file).
 */
void* read_ecorecord(FILE* f, int32_t* record_size)
{
	static void*   buffer      = NULL;
	static int32_t buffer_size = 0;		// Static too, so that the capacity of the buffer is remembered between calls
	void*          new_buffer;
	int32_t        read;

	if (!record_size)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError reading a taxonomy file: record_size can not be NULL");
		return NULL;
	}

	read = fread(record_size,
			     sizeof(int32_t),
			     1,
			     f);

	if (feof(f))
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError reading a taxonomy file: reached end of file");
		return NULL;
	}

	if (read != 1)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError reading a taxonomy file: error reading record size");
		return NULL;
	}

	// A corrupted size must not be used as an allocation or read size
	if (*record_size <= 0)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError reading a taxonomy file: invalid record size %d", *record_size);
		return NULL;
	}

	// Enlarge the buffer if needed
	if (buffer_size < *record_size)
	{
		// realloc() behaves like malloc() when the buffer is NULL.
		// A temporary pointer is used so that the current buffer is not lost if realloc() fails.
		new_buffer = realloc(buffer, *record_size);
		if (new_buffer == NULL)
		{
			obi_set_errno(OBI_MALLOC_ERROR);
			obidebug(1, "\nError reading a taxonomy file: error allocating memory");
			return NULL;
		}
		buffer      = new_buffer;
		buffer_size = *record_size;
	}

	read = fread(buffer,
			     *record_size,
			     1,
			     f);

	if (read != 1)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError reading a taxonomy file: error reading a record %d, %d", read, *record_size);
		// Reset the static state so that the next call does not use a freed buffer
		free(buffer);
		buffer      = NULL;
		buffer_size = 0;
		return NULL;
	}

	return buffer;
}
/**
 * Reads the next taxon record of a binary taxonomy file and stores it in a taxon structure.
 *
 * The 'parent' field of the taxon is not a valid pointer when this function returns:
 * it holds the value of the 'parent' field of the record, cast to a pointer.
 * The 'idx' field of the taxon is not set by this function.
 *
 * @param f The file to read from, positioned at the beginning of a taxon record.
 * @param taxon A pointer on the structure to fill. Its 'name' field is allocated by this function.
 *
 * @returns The pointer on the filled taxon structure.
 * @retval NULL if an error occurred.
 */
ecotx_t* readnext_ecotaxon(FILE* f, ecotx_t* taxon)
{
	ecotxformat_t* raw;
	int32_t        record_length;

	raw = read_ecorecord(f, &record_length);
	if (!raw)
		return NULL;

	// The name is stored in the record, so its length can neither be negative nor greater than the record length
	if ((raw->name_length < 0) || (raw->name_length > record_length))
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError reading a taxonomy file: invalid taxon name length");
		return NULL;
	}

	taxon->parent = (ecotx_t*) ((size_t) raw->parent);
	taxon->taxid  = raw->taxid;
	taxon->rank   = raw->rank;
	taxon->farest = -1;

	taxon->name = malloc((raw->name_length+1) * sizeof(char));
	if (taxon->name == NULL)
	{
		obi_set_errno(OBI_MALLOC_ERROR);
		obidebug(1, "\nError reading a taxonomy file: error allocating memory");
		return NULL;
	}

	// The name is stored in the record without terminal null character
	strncpy(taxon->name, raw->name, raw->name_length);
	taxon->name[raw->name_length] = 0;  // TODO note: this line is probably missing in ROBITaxonomy and source of a bug

	return taxon;
}
/**
 * Opens a binary taxonomy file and reads the record count stored at its beginning.
 *
 * @param file_name The path of the file to open.
 * @param count A pointer on the variable where the record count is stored.
 *              It is set to 0 if the file can not be opened and abort_on_open_error is 0.
 *              It is not modified if the file can not be opened and abort_on_open_error is not 0.
 * @param abort_on_open_error Whether not being able to open the file is reported as an error
 *                            (OBI errno set and message printed) or not.
 *
 * @returns The opened file, positioned after the record count.
 * @retval NULL if the file could not be opened or if the record count could not be read.
 */
FILE* open_ecorecorddb(const char* file_name,
		               int32_t*    count,
		               int32_t     abort_on_open_error)
{
	FILE*   f;
	int32_t read;

	f = fopen(file_name, "rb");

	if (!f)
	{
		// The file was not opened: there is nothing to close (calling fclose() on NULL is undefined behavior)
		if (abort_on_open_error)
		{
			obi_set_errno(OBI_TAXONOMY_ERROR);
			obidebug(1, "\nCouldn't open a taxonomy file");
		}
		else
			*count = 0;

		return NULL;
	}

	read = fread(count,
			     sizeof(int32_t),
			     1,
			     f);

	if (read != 1)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError reading taxonomy record size");
		fclose(f);
		return NULL;
	}

	return f;
}
/**
 * Reads a binary rank file and builds the corresponding rank index structure.
 *
 * Each record of the file is a rank label stored without terminal null character.
 *
 * @param ranks_file_name The path of the rank file.
 *
 * @returns A newly allocated rank index structure, with each label allocated separately.
 * @retval NULL if the file could not be opened or if an error occurred.
 */
ecorankidx_t* read_rankidx(const char* ranks_file_name)
{
	int32_t       count;
	FILE*         ranks_file;
	ecorankidx_t* ranks_index;
	int32_t       i;
	int32_t       rank_length;
	char*         buffer;

	ranks_file = open_ecorecorddb(ranks_file_name, &count, 0);
	if (ranks_file==NULL)
		return NULL;

	// A negative count would corrupt the allocation size computed below
	if (count < 0)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError reading a taxonomy rank file: invalid rank count");
		fclose(ranks_file);
		return NULL;
	}

	ranks_index = (ecorankidx_t*) malloc(sizeof(ecorankidx_t) + sizeof(char*) * count);
	if (ranks_index == NULL)
	{
		obi_set_errno(OBI_MALLOC_ERROR);
		obidebug(1, "\nError allocating memory for taxonomy rank structure");
		fclose(ranks_file);
		return NULL;
	}

	ranks_index->count = count;

	for (i=0; i < count; i++)
	{
		// The returned buffer is a static buffer owned by read_ecorecord(): it must not be freed here
		buffer = read_ecorecord(ranks_file, &rank_length);
		if (buffer == NULL)
		{
			obi_set_errno(OBI_TAXONOMY_ERROR);
			obidebug(1, "\nError reading a value in a taxonomy file");
			goto error;
		}

		ranks_index->label[i] = (char*) malloc(rank_length+1);
		if (ranks_index->label[i] == NULL)
		{
			obi_set_errno(OBI_MALLOC_ERROR);
			obidebug(1, "\nError allocating memory for taxonomy rank label");
			goto error;
		}

		strncpy(ranks_index->label[i], buffer, rank_length);
		(ranks_index->label[i])[rank_length] = 0;
	}

	fclose(ranks_file);

	return ranks_index;

error:
	// Free the labels that were already allocated (the i first ones)
	while (i > 0)
	{
		i--;
		free(ranks_index->label[i]);
	}
	free(ranks_index);
	fclose(ranks_file);
	return NULL;
}
ecotxidx_t* read_taxonomyidx(const char* taxa_file_name, const char* local_taxa_file_name)
{
int32_t count_taxa;
int32_t count_local_taxa;
FILE* f_taxa;
FILE* f_local_taxa;
ecotxidx_t* taxa_index;
struct ecotxnode* t;
int32_t i;
int32_t j;
f_taxa = open_ecorecorddb(taxa_file_name, &count_taxa, 1);
if (f_taxa == NULL)
{
obidebug(1, "\nError reading taxonomy taxa file");
return NULL;
}
f_local_taxa = open_ecorecorddb(local_taxa_file_name, &count_local_taxa, 0);
taxa_index = (ecotxidx_t*) malloc(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count_taxa + count_local_taxa));
if (taxa_index == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError allocating memory for taxonomy structure");
fclose(f_taxa);
fclose(f_local_taxa);
return NULL;
}
taxa_index->count = count_taxa + count_local_taxa;
taxa_index->ncbi_count = count_taxa;
taxa_index->local_count = count_local_taxa;
taxa_index->buffer_size = taxa_index->count;
taxa_index->max_taxid = 0;
printf("Reading %d taxa...\n", count_taxa);
for (i=0; i<count_taxa; i++)
{
readnext_ecotaxon(f_taxa, &(taxa_index->taxon[i]));
taxa_index->taxon[i].parent = taxa_index->taxon + (size_t) taxa_index->taxon[i].parent;
taxa_index->taxon[i].parent->farest = 0;
if (taxa_index->taxon[i].taxid > taxa_index->max_taxid)
taxa_index->max_taxid = taxa_index->taxon[i].taxid;
}
if (count_local_taxa > 0)
printf("Reading %d local taxa...\n", count_local_taxa);
else
printf("No local taxa\n");
count_taxa = taxa_index->count;
for (; i < count_taxa; i++){
readnext_ecotaxon(f_local_taxa, &(taxa_index->taxon[i]));
taxa_index->taxon[i].idx = i;
taxa_index->taxon[i].parent = taxa_index->taxon + (size_t) taxa_index->taxon[i].parent;
taxa_index->taxon[i].parent->farest=0;
if (taxa_index->taxon[i].taxid > taxa_index->max_taxid)
taxa_index->max_taxid = taxa_index->taxon[i].taxid;
}
for (i=0; i < count_taxa; i++)
{
t = taxa_index->taxon+i;
if (t->farest == -1)
{
t->farest=0;
while (t->parent != t)
{
j = t->farest + 1;
if (j > t->parent->farest)
{
t->parent->farest = j;
t=t->parent;
}
else
t = taxa_index->taxon;
}
}
}
fclose(f_taxa);
if (f_local_taxa != NULL)
fclose(f_local_taxa);
return taxa_index;
}
/**
 * Reads the next name record of a binary taxonomy name file and stores it in a name structure.
 *
 * In a record, the name and the class name are stored one after the other,
 * without terminal null characters.
 *
 * @param f The file to read from, positioned at the beginning of a name record.
 * @param name A pointer on the structure to fill. Its 'name' and 'class_name' fields are allocated by this function.
 * @param taxonomy The taxonomy structure, whose taxa are used to find the taxon associated with the name.
 *
 * @returns The pointer on the filled name structure.
 * @retval NULL if an error occurred.
 */
econame_t* readnext_econame(FILE* f, econame_t* name, OBIDMS_taxonomy_p taxonomy)
{
	econameformat_t* raw;
	int32_t          record_length;

	// The returned buffer is a static buffer owned by read_ecorecord(): it must not be freed here
	raw = read_ecorecord(f, &record_length);
	if (raw == NULL)
		return NULL;

	// The names are stored in the record, so their lengths can neither be negative nor greater than the record length
	if ((raw->name_length < 0) || (raw->name_length > record_length) ||
		(raw->class_length < 0) || (raw->class_length > record_length - raw->name_length))
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError reading a taxonomy name file: invalid name length");
		return NULL;
	}

	name->is_scientific_name = raw->is_scientific_name;

	name->name = malloc((raw->name_length + 1) * sizeof(char));
	if (name->name == NULL)
	{
		obi_set_errno(OBI_MALLOC_ERROR);
		obidebug(1, "\nError allocating memory for a taxon name");
		return NULL;
	}
	strncpy(name->name, raw->names, raw->name_length);
	name->name[raw->name_length] = 0;

	name->class_name = malloc((raw->class_length+1) * sizeof(char));
	if (name->class_name == NULL)
	{
		obi_set_errno(OBI_MALLOC_ERROR);
		obidebug(1, "\nError allocating memory for a taxon class name");
		free(name->name);
		return NULL;
	}
	// The class name is stored right after the name
	strncpy(name->class_name,(raw->names + raw->name_length), raw->class_length);
	name->class_name[raw->class_length] = 0;

	// The 'taxid' field of the record is used as an index in the array of taxa
	name->taxon = taxonomy->taxa->taxon + raw->taxid;

	return name;
}
/**
 * Reads a binary taxonomy name file and builds the corresponding name index structure.
 *
 * @param file_name The path of the name file.
 * @param taxonomy The taxonomy structure, whose taxa are used to associate each name with its taxon.
 *
 * @returns A newly allocated name index structure, with the names and class names allocated separately.
 * @retval NULL if the file could not be opened or if an error occurred.
 */
econameidx_t* read_nameidx(const char *file_name, OBIDMS_taxonomy_p taxonomy)
{
	int32_t       count;
	FILE*         f;
	econameidx_t* index_names;
	int32_t       i;

	f = open_ecorecorddb(file_name, &count, 0);
	if (f == NULL)
	{
		obidebug(1, "\nError reading taxonomy name file");
		return NULL;
	}

	// A negative count would corrupt the allocation size computed below
	if (count < 0)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError reading taxonomy name file");
		fclose(f);
		return NULL;
	}

	index_names = (econameidx_t*) malloc(sizeof(econameidx_t) + sizeof(econame_t) * count);
	if (index_names == NULL)
	{
		obi_set_errno(OBI_MALLOC_ERROR);
		obidebug(1, "\nError reading taxonomy name file");
		fclose(f);
		return NULL;
	}

	index_names->count = count;

	for (i=0; i < count; i++)
	{
		// The return value has to be checked: the address of the array element itself is never NULL
		if (readnext_econame(f, (index_names->names)+i, taxonomy) == NULL)
		{
			obi_set_errno(OBI_TAXONOMY_ERROR);
			obidebug(1, "\nError reading taxonomy name file");
			// Free the names that were already read (the i first ones)
			while (i > 0)
			{
				i--;
				free(index_names->names[i].name);
				free(index_names->names[i].class_name);
			}
			free(index_names);
			fclose(f);
			return NULL;
		}
	}

	fclose(f);

	return index_names;
}
/**
 * Reads a binary file of merged taxids and builds the corresponding merged index structure.
 *
 * Each record of the file is copied as is in an ecomerged_t structure.
 *
 * @param file_name The path of the file of merged taxids.
 * @param taxonomy The taxonomy structure (not used by this function).
 *
 * @returns A newly allocated merged index structure.
 * @retval NULL if the file could not be opened or if an error occurred.
 */
ecomergedidx_t* read_mergedidx(const char *file_name, OBIDMS_taxonomy_p taxonomy)
{
	int32_t         count;
	FILE*           f;
	ecomergedidx_t* index_merged_idx;
	ecomerged_t*    merged_idx;
	int32_t         i;
	int32_t         record_length;

	(void) taxonomy;

	f = open_ecorecorddb(file_name, &count, 0);
	if (f == NULL)
	{
		obidebug(1, "\nError reading taxonomy name file");
		return NULL;
	}

	// A negative count would corrupt the allocation size computed below
	if (count < 0)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError reading taxonomy name file");
		fclose(f);
		return NULL;
	}

	index_merged_idx = (ecomergedidx_t*) malloc(sizeof(ecomergedidx_t) + (sizeof(ecomerged_t) * count));
	if (index_merged_idx == NULL)
	{
		obi_set_errno(OBI_MALLOC_ERROR);
		obidebug(1, "\nError reading taxonomy name file");
		fclose(f);
		return NULL;
	}

	index_merged_idx->count = count;

	for (i=0; i < count; i++)
	{
		merged_idx = read_ecorecord(f, &record_length);
		// The record has to be checked before the copy: it must exist and fit in one element of the array
		if ((merged_idx == NULL) || (record_length < 0) || ((size_t) record_length > sizeof(ecomerged_t)))
		{
			obi_set_errno(OBI_TAXONOMY_ERROR);
			obidebug(1, "\nError reading taxonomy name file");
			free(index_merged_idx);
			fclose(f);
			return NULL;
		}
		memcpy((index_merged_idx->merged)+i, merged_idx, record_length);
	}

	fclose(f);

	return index_merged_idx;
}
// Functions to write taxonomy structure to binary files
/**
 * Writes the rank labels of a taxonomy in the binary file "<taxonomy directory>/<taxonomy_name>.rdx".
 *
 * File layout (integers are written as they are stored in memory):
 *   - the rank count (int32_t),
 *   - for each rank: the label length (int32_t) followed by the label without terminal null character.
 *
 * The file must not already exist (it is created with O_EXCL).
 *
 * @param dms The DMS, used to build the path of the taxonomy directory.
 * @param tax The taxonomy structure whose ranks are written.
 * @param taxonomy_name The name of the taxonomy, used for the directory name and as file name prefix.
 *
 * @retval 0 if the file was successfully written.
 * @retval -1 if an error occurred.
 */
int write_rankidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct?
{
	int     i;
	char*   file_name;
	int     file_descriptor;
	off_t   file_size;
	char*   taxonomy_path;
	int32_t length;

	// Compute file size
	file_size = sizeof(int32_t);	// To store rank count
	for (i=0; i < (tax->ranks)->count; i++)
	{
		file_size = file_size + sizeof(int32_t);					// To store label size
		file_size = file_size + strlen(((tax->ranks)->label)[i]);	// To store label
	}

	// Build the taxonomy directory path
	taxonomy_path = get_taxonomy_path(dms, taxonomy_name);
	if (taxonomy_path == NULL)
		return -1;

	// +6: "/" + ".rdx" + terminal null character
	file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 6)*sizeof(char));
	if (file_name == NULL)
	{
		obi_set_errno(OBI_MALLOC_ERROR);
		obidebug(1, "\nError allocating the memory for a binary taxonomy file name");
		free(taxonomy_path);
		return -1;
	}

	// Build the file path
	if (sprintf(file_name, "%s/%s.rdx", taxonomy_path, taxonomy_name) < 0)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError building a binary taxonomy file name");
		free(taxonomy_path);
		free(file_name);
		return -1;
	}

	free(taxonomy_path);

	// Create file
	file_descriptor = open(file_name, O_RDWR | O_CREAT | O_EXCL, 0777);
	if (file_descriptor < 0)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError creating a binary taxonomy file %s", file_name);
		free(file_name);
		return -1;
	}

	free(file_name);

	// Truncate the file to the right size
	if (ftruncate(file_descriptor, file_size) < 0)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError truncating a binary taxonomy file");
		close(file_descriptor);
		return -1;
	}

	// Write rank count
	if (write(file_descriptor, &((tax->ranks)->count), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
		goto write_error;

	// Write ranks
	for (i=0; i < (tax->ranks)->count; i++)
	{
		length = strlen(((tax->ranks)->label)[i]);

		// Write rank size
		if (write(file_descriptor, &length, sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
			goto write_error;

		// Write rank label
		if (write(file_descriptor, ((tax->ranks)->label)[i], length) < ((ssize_t) length))
			goto write_error;
	}

	// Close file
	if (close(file_descriptor) < 0)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError closing an rdx taxonomy file");
		return -1;
	}

	return 0;

write_error:
	obi_set_errno(OBI_TAXONOMY_ERROR);
	obidebug(1, "\nError writing in a binary taxonomy file");
	close(file_descriptor);
	return -1;
}
/**
 * Writes the NCBI taxa of a taxonomy (the 'ncbi_count' first taxa) in the binary file
 * "<taxonomy directory>/<taxonomy_name>.tdx".
 *
 * File layout (integers are written as they are stored in memory):
 *   - the record count (int32_t),
 *   - for each taxon: the record size (int32_t), the taxid, the rank index, the index of the parent taxon,
 *     the name length (int32_t each), and the name without terminal null character.
 *
 * The file must not already exist (it is created with O_EXCL).
 *
 * @param dms The DMS, used to build the path of the taxonomy directory.
 * @param tax The taxonomy structure whose taxa are written.
 * @param taxonomy_name The name of the taxonomy, used for the directory name and as file name prefix.
 *
 * @retval 0 if the file was successfully written.
 * @retval -1 if an error occurred.
 */
int write_taxonomyidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name)  // TODO prefix in taxonomy struct? keep argument but if NULL, use the one in struct?
{
	int     i;
	char*   file_name;
	int     file_descriptor;
	off_t   file_size;
	char*   taxonomy_path;
	int32_t name_length;
	int32_t record_size;

	// Compute file size
	file_size = sizeof(int32_t);	// To store record count
	for (i=0; i < (tax->taxa)->ncbi_count; i++)
	{
		file_size = file_size + sizeof(int32_t) * 5;				// To store record size, taxid, rank index, parent index, and name length
		file_size = file_size + strlen(tax->taxa->taxon[i].name);	// To store name
	}

	// Build the taxonomy directory path
	taxonomy_path = get_taxonomy_path(dms, taxonomy_name);
	if (taxonomy_path == NULL)
		return -1;

	// +6: "/" + ".tdx" + terminal null character
	file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 6)*sizeof(char));
	if (file_name == NULL)
	{
		obi_set_errno(OBI_MALLOC_ERROR);
		obidebug(1, "\nError allocating the memory for a binary taxonomy file name");
		free(taxonomy_path);
		return -1;
	}

	// Build the file path
	if (sprintf(file_name, "%s/%s.tdx", taxonomy_path, taxonomy_name) < 0)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError building a binary taxonomy file name");
		free(taxonomy_path);
		free(file_name);
		return -1;
	}

	free(taxonomy_path);

	// Create file
	file_descriptor = open(file_name, O_RDWR | O_CREAT | O_EXCL, 0777);
	if (file_descriptor < 0)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError creating a binary taxonomy file");
		free(file_name);
		return -1;
	}

	free(file_name);

	// Truncate the file to the right size
	if (ftruncate(file_descriptor, file_size) < 0)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError truncating a binary taxonomy file");
		close(file_descriptor);
		return -1;
	}

	// Write record count
	if (write(file_descriptor, &(tax->taxa->ncbi_count), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
		goto write_error;

	// Write records
	for (i=0; i < (tax->taxa)->ncbi_count; i++)
	{
		name_length = strlen(tax->taxa->taxon[i].name);
		// The record size does not include the record size field itself
		record_size = 4*sizeof(int32_t) + name_length;

		// Write record size
		if (write(file_descriptor, &record_size, sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
			goto write_error;

		// Write taxid
		if (write(file_descriptor, &(tax->taxa->taxon[i].taxid), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
			goto write_error;

		// Write rank index
		if (write(file_descriptor, &(tax->taxa->taxon[i].rank), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
			goto write_error;

		// Write parent index
		if (write(file_descriptor, &((tax->taxa->taxon[i].parent)->idx), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
			goto write_error;

		// Write name length
		if (write(file_descriptor, &name_length, sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
			goto write_error;

		// Write name
		if (write(file_descriptor, tax->taxa->taxon[i].name, name_length) < ((ssize_t) name_length))
			goto write_error;
	}

	// Close file
	if (close(file_descriptor) < 0)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError closing a tdx taxonomy file");
		return -1;
	}

	return 0;

write_error:
	obi_set_errno(OBI_TAXONOMY_ERROR);
	obidebug(1, "\nError writing in a binary taxonomy file");
	close(file_descriptor);
	return -1;
}
/**
 * Writes the local taxa of a taxonomy (the taxa stored after the 'ncbi_count' first ones) in the binary file
 * "<taxonomy directory>/<taxonomy_name>.ldx".
 *
 * File layout (integers are written as they are stored in memory):
 *   - the record count (int32_t),
 *   - for each taxon: the record size (int32_t), the taxid, the rank index, the index of the parent taxon,
 *     the name length (int32_t each), and the name without terminal null character.
 *
 * Unlike the other binary taxonomy files, the file is opened without O_EXCL: an existing file is
 * resized and overwritten.
 *
 * @param dms The DMS, used to build the path of the taxonomy directory.
 * @param tax The taxonomy structure whose local taxa are written.
 * @param taxonomy_name The name of the taxonomy, used for the directory name and as file name prefix.
 *
 * @retval 0 if the file was successfully written.
 * @retval -1 if an error occurred.
 */
int write_local_taxonomy_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name)  // TODO prefix in taxonomy struct? keep argument but if NULL, use the one in struct?
{
	int     i;
	char*   file_name;
	int     file_descriptor;
	off_t   file_size;
	char*   taxonomy_path;
	int32_t name_length;
	int32_t record_size;

	// Compute file size
	file_size = sizeof(int32_t);	// To store record count
	for (i=(tax->taxa)->ncbi_count; i < (tax->taxa)->count; i++)
	{
		file_size = file_size + sizeof(int32_t) * 5;				// To store record size, taxid, rank index, parent index, and name length
		file_size = file_size + strlen(tax->taxa->taxon[i].name);	// To store name
	}

	// Build the taxonomy directory path
	taxonomy_path = get_taxonomy_path(dms, taxonomy_name);
	if (taxonomy_path == NULL)
		return -1;

	// +6: "/" + ".ldx" + terminal null character
	file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 6)*sizeof(char));
	if (file_name == NULL)
	{
		obi_set_errno(OBI_MALLOC_ERROR);
		obidebug(1, "\nError allocating the memory for a binary taxonomy file name");
		free(taxonomy_path);
		return -1;
	}

	// Build the file path
	if (sprintf(file_name, "%s/%s.ldx", taxonomy_path, taxonomy_name) < 0)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError building a binary taxonomy file name");
		free(taxonomy_path);
		free(file_name);
		return -1;
	}

	free(taxonomy_path);

	// Create file
	file_descriptor = open(file_name, O_RDWR | O_CREAT, 0777);
	if (file_descriptor < 0)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError creating a binary taxonomy file");
		free(file_name);
		return -1;
	}

	free(file_name);

	// Truncate the file to the right size
	if (ftruncate(file_descriptor, file_size) < 0)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError truncating a binary taxonomy file");
		close(file_descriptor);
		return -1;
	}

	// Write record count
	if (write(file_descriptor, &((tax->taxa)->local_count), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
		goto write_error;

	// Write records
	for (i=(tax->taxa)->ncbi_count; i < (tax->taxa)->count; i++)
	{
		name_length = strlen(tax->taxa->taxon[i].name);
		// The record size does not include the record size field itself
		record_size = 4*sizeof(int32_t) + name_length;

		// Write record size
		if (write(file_descriptor, &record_size, sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
			goto write_error;

		// Write taxid
		if (write(file_descriptor, &(tax->taxa->taxon[i].taxid), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
			goto write_error;

		// Write rank index
		if (write(file_descriptor, &(tax->taxa->taxon[i].rank), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
			goto write_error;

		// Write parent index
		if (write(file_descriptor, &((tax->taxa->taxon[i].parent)->idx), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
			goto write_error;

		// Write name length
		if (write(file_descriptor, &name_length, sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
			goto write_error;

		// Write name
		if (write(file_descriptor, tax->taxa->taxon[i].name, name_length) < ((ssize_t) name_length))
			goto write_error;
	}

	// Close file
	if (close(file_descriptor) < 0)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError closing a ldx taxonomy file");
		return -1;
	}

	return 0;

write_error:
	obi_set_errno(OBI_TAXONOMY_ERROR);
	obidebug(1, "\nError writing in a binary taxonomy file");
	close(file_descriptor);
	return -1;
}
/**
 * Writes the names of a taxonomy in the binary file "<taxonomy directory>/<taxonomy_name>.ndx".
 *
 * File layout (integers are written as they are stored in memory):
 *   - the record count (int32_t),
 *   - for each name: the record size (int32_t), whether the name is a scientific name, the name length,
 *     the class name length, the index of the associated taxon (int32_t each), the name and the class name,
 *     both without terminal null character.
 *
 * The file must not already exist (it is created with O_EXCL).
 *
 * @param dms The DMS, used to build the path of the taxonomy directory.
 * @param tax The taxonomy structure whose names are written.
 * @param taxonomy_name The name of the taxonomy, used for the directory name and as file name prefix.
 *
 * @retval 0 if the file was successfully written.
 * @retval -1 if an error occurred.
 */
int write_nameidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name)  // TODO prefix in taxonomy struct? keep argument but if NULL, use the one in struct?
{
	int     i;
	char*   file_name;
	int     file_descriptor;
	off_t   file_size;
	char*   taxonomy_path;
	int32_t name_length;
	int32_t class_length;
	int32_t record_size;

	// Compute file size
	file_size = sizeof(int32_t);	// To store record count
	for (i=0; i < (tax->names)->count; i++)
	{
		file_size = file_size + sizeof(int32_t) * 5;						// To store record size, scientific name flag, name length, class length, and taxon index
		file_size = file_size + strlen(tax->names->names[i].name);			// To store name
		file_size = file_size + strlen(tax->names->names[i].class_name);	// To store class name
	}

	// Build the taxonomy directory path
	taxonomy_path = get_taxonomy_path(dms, taxonomy_name);
	if (taxonomy_path == NULL)
		return -1;

	// +6: "/" + ".ndx" + terminal null character
	file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 6)*sizeof(char));
	if (file_name == NULL)
	{
		obi_set_errno(OBI_MALLOC_ERROR);
		obidebug(1, "\nError allocating the memory for a binary taxonomy file name");
		free(taxonomy_path);
		return -1;
	}

	// Build the file path
	if (sprintf(file_name, "%s/%s.ndx", taxonomy_path, taxonomy_name) < 0)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError building a binary taxonomy file name");
		free(taxonomy_path);
		free(file_name);
		return -1;
	}

	free(taxonomy_path);

	// Create file
	file_descriptor = open(file_name, O_RDWR | O_CREAT | O_EXCL, 0777);
	if (file_descriptor < 0)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError creating a binary taxonomy file");
		free(file_name);
		return -1;
	}

	free(file_name);

	// Truncate the file to the right size
	if (ftruncate(file_descriptor, file_size) < 0)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError truncating a binary taxonomy file");
		close(file_descriptor);
		return -1;
	}

	// Write record count
	if (write(file_descriptor, &(tax->names->count), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
		goto write_error;

	// Write records
	for (i=0; i < tax->names->count; i++)
	{
		name_length  = strlen(tax->names->names[i].name);
		class_length = strlen(tax->names->names[i].class_name);
		// The record size does not include the record size field itself
		record_size  = 4*sizeof(int32_t) + name_length + class_length;

		// Write record size
		if (write(file_descriptor, &record_size, sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
			goto write_error;

		// Write if the name is a scientific name
		if (write(file_descriptor, &(tax->names->names[i].is_scientific_name), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
			goto write_error;

		// Write name length
		if (write(file_descriptor, &name_length, sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
			goto write_error;

		// Write class length
		if (write(file_descriptor, &class_length, sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
			goto write_error;

		// Write taxid index
		if (write(file_descriptor, &(tax->names->names[i].taxon->idx), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
			goto write_error;

		// Write name
		if (write(file_descriptor, tax->names->names[i].name, name_length) < ((ssize_t) name_length))
			goto write_error;

		// Write class
		if (write(file_descriptor, tax->names->names[i].class_name, class_length) < ((ssize_t) class_length))
			goto write_error;
	}

	// Close file
	if (close(file_descriptor) < 0)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError closing a ndx taxonomy file");
		return -1;
	}

	return 0;

write_error:
	obi_set_errno(OBI_TAXONOMY_ERROR);
	obidebug(1, "\nError writing in a binary taxonomy file");
	close(file_descriptor);
	return -1;
}
/**
 * Writes the merged taxids of a taxonomy in the binary file "<taxonomy directory>/<taxonomy_name>.adx".
 *
 * File layout (integers are written as they are stored in memory):
 *   - the count of merged taxids (int32_t),
 *   - for each merged taxid: the record size (int32_t), the taxid and the associated index (int32_t each).
 *
 * The file must not already exist (it is created with O_EXCL).
 *
 * @param dms The DMS, used to build the path of the taxonomy directory.
 * @param tax The taxonomy structure whose merged taxids are written.
 * @param taxonomy_name The name of the taxonomy, used for the directory name and as file name prefix.
 *
 * @retval 0 if the file was successfully written.
 * @retval -1 if an error occurred.
 */
int write_mergedidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct?
{
	int     i;
	char*   file_name;
	int     file_descriptor;
	off_t   file_size;
	char*   taxonomy_path;
	int32_t record_size;

	// Compute file size: the count, then 3 integers (record size, taxid, index) for each merged taxid
	file_size = sizeof(int32_t) + (sizeof(int32_t) * 3 * (tax->merged_idx)->count);

	// Build the taxonomy directory path
	taxonomy_path = get_taxonomy_path(dms, taxonomy_name);
	if (taxonomy_path == NULL)
		return -1;

	// +6: "/" + ".adx" + terminal null character
	file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 6)*sizeof(char));
	if (file_name == NULL)
	{
		obi_set_errno(OBI_MALLOC_ERROR);
		obidebug(1, "\nError allocating the memory for a binary taxonomy file name");
		free(taxonomy_path);
		return -1;
	}

	// Build the file path
	if (sprintf(file_name, "%s/%s.adx", taxonomy_path, taxonomy_name) < 0)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError building a binary taxonomy file name");
		free(taxonomy_path);
		free(file_name);
		return -1;
	}

	free(taxonomy_path);

	// Create file
	file_descriptor = open(file_name, O_RDWR | O_CREAT | O_EXCL, 0777);
	if (file_descriptor < 0)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError creating a binary taxonomy file %s", file_name);
		free(file_name);
		return -1;
	}

	free(file_name);

	// Truncate the file to the right size
	if (ftruncate(file_descriptor, file_size) < 0)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError truncating a binary taxonomy file");
		close(file_descriptor);
		return -1;
	}

	// Write merged indices count
	if (write(file_descriptor, &((tax->merged_idx)->count), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
		goto write_error;

	// The record size does not include the record size field itself
	record_size = 2 * sizeof(int32_t);

	// Write merged indices
	for (i=0; i < (tax->merged_idx)->count; i++)
	{
		// Write record size
		if (write(file_descriptor, &(record_size), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
			goto write_error;

		// Write taxid
		if (write(file_descriptor, &(((tax->merged_idx)->merged)[i].taxid), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
			goto write_error;

		// Write index corresponding to the taxid in the ecotxidx_t structure
		if (write(file_descriptor, &(((tax->merged_idx)->merged)[i].idx), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
			goto write_error;
	}

	// Close file
	if (close(file_descriptor) < 0)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError closing an adx taxonomy file");
		return -1;
	}

	return 0;

write_error:
	obi_set_errno(OBI_TAXONOMY_ERROR);
	obidebug(1, "\nError writing in a binary taxonomy file");
	close(file_descriptor);
	return -1;
}
/**
 * Writes a taxonomy structure to binary files in a new taxonomy directory of a DMS.
 *
 * The directory of the taxonomy is created first (it is an error if it can not be created,
 * for example because it already exists), then the rank, taxa, name and merged taxids files
 * are written. The local taxa file is only written if the taxonomy contains local taxa.
 *
 * @param dms The DMS in which the taxonomy is written.
 * @param tax The taxonomy structure to write.
 * @param tax_name The name of the taxonomy.
 *
 * @retval 0 if the taxonomy was successfully written.
 * @retval -1 if an error occurred.
 */
int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name)
{
	char* taxonomy_path;
	int   mkdir_errno;

	// Build the taxonomy directory path
	taxonomy_path = get_taxonomy_path(dms, tax_name);
	if (taxonomy_path == NULL)
		return -1;

	// Try to create the directory
	if (mkdir(taxonomy_path, 00777) < 0)
	{
		// Save errno before calling anything that could modify it
		mkdir_errno = errno;
		obi_set_errno(OBI_TAXONOMY_ERROR);
		if (mkdir_errno == EEXIST)
		{
			obidebug(1, "\nA taxonomy already exists with this name.");
		}
		obidebug(1, "\nProblem creating a new taxonomy directory");
		free(taxonomy_path);
		return -1;
	}

	free(taxonomy_path);

	if (write_rankidx(dms, tax, tax_name) < 0)
		return -1;
	if (write_taxonomyidx(dms, tax, tax_name) < 0)
		return -1;
	if (write_nameidx(dms, tax, tax_name) < 0)
		return -1;
	if (write_mergedidx(dms, tax, tax_name) < 0)
		return -1;

	// Check if there are local taxa (if so last taxon is local)
	if ((tax->taxa)->local_count > 0)
	{
		if (write_local_taxonomy_idx(dms, tax, tax_name) < 0)
			return -1;
	}

	return 0;
}
int read_nodes_dmp(const char* taxdump, OBIDMS_taxonomy_p tax, char*** rank_names_p, int** parent_taxids_p)
{
struct dirent* dp;
DIR* tax_dir;
FILE* file;
char* file_name;
bool file_found=false;
char line[2048]; // TODO large enough?
char* elt;
int buffer_size;
int i, n;
buffer_size = 10000;
// Initialize rank names and parent taxids arrays
*parent_taxids_p = malloc(buffer_size * sizeof(int));
if (*parent_taxids_p == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError allocating the memory for the parent taxids array");
return -1;
}
*rank_names_p = malloc(buffer_size * sizeof(char*));
if (*rank_names_p == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError allocating the memory for the rank names array");
free(*parent_taxids_p);
return -1;
}
// Open the taxdum directory
tax_dir = opendir(taxdump);
if (tax_dir == NULL)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nProblem opening a taxdump directory");
free(*parent_taxids_p);
free(*rank_names_p);
return -1;
}
// Look for the 'nodes.dmp' file
while ((dp = readdir(tax_dir)) != NULL)
{
if (strcmp(dp->d_name, "nodes.dmp") == 0)
{
file_found = true;
// Initializing the taxa structure
tax->taxa = (ecotxidx_t*) malloc(sizeof(ecotxidx_t) + sizeof(ecotx_t) * buffer_size);
if (tax->taxa == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError allocating the memory for a taxonomy structure");
free(*parent_taxids_p);
free(*rank_names_p);
closedir(tax_dir);
return -1;
}
// Allocating the memory for the file name
file_name = (char*) malloc((strlen(taxdump) + 11)*sizeof(char));
if (file_name == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError allocating the memory for a file name");
free(*parent_taxids_p);
free(*rank_names_p);
closedir(tax_dir);
return -1;
}
// Build the file path
if (sprintf(file_name, "%s/nodes.dmp", taxdump) < 0)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError building a taxonomy file name for 'nodes.dmp'");
free(*parent_taxids_p);
free(*rank_names_p);
closedir(tax_dir);
free(file_name);
return -1;
}
file = fopen(file_name, "r");
if (file == NULL)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nProblem opening a taxonomy file");
free(*parent_taxids_p);
free(*rank_names_p);
closedir(tax_dir);
free(file_name);
return -1;
}
free(file_name);
(tax->taxa)->max_taxid = 0;
n = 0;
while (fgets(line, sizeof(line), file))
{
// Enlarge structures if needed
if (n == buffer_size)
{
buffer_size = buffer_size * 2;
tax->taxa = (ecotxidx_t*) realloc(tax->taxa, sizeof(ecotxidx_t) + sizeof(ecotx_t) * buffer_size);
if (tax->taxa == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError reallocating memory for a taxonomy structure");
free(*parent_taxids_p);
free(*rank_names_p);
fclose(file);
closedir(tax_dir);
return -1;
}
*parent_taxids_p = (int*) realloc(*parent_taxids_p, sizeof(int) * buffer_size);
if (*parent_taxids_p == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError reallocating memory for the parent taxids array");
free(*parent_taxids_p);
free(*rank_names_p);
fclose(file);
closedir(tax_dir);
return -1;
}
*rank_names_p = (char**) realloc(*rank_names_p, sizeof(char*) * buffer_size);
if (*rank_names_p == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError reallocating memory for the rank names array");
free(*parent_taxids_p);
free(*rank_names_p);
fclose(file);
closedir(tax_dir);
return -1;
}
}
// Check for terminal '\n' character (line complete)
if (line[strlen(line) - 1] != '\n')
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError: line buffer size not large enough for line in taxonomy file");
free(*parent_taxids_p);
free(*rank_names_p);
fclose(file);
closedir(tax_dir);
return -1;
}
(tax->taxa)->taxon[n].idx = n;
// Parse 3 first elements separated by '|'
elt = strtok(line, "|");
// Remove the last character (tab character)
elt[strlen(elt)-1] = '\0';
// First element: taxid
(tax->taxa)->taxon[n].taxid = atoi(elt);
// Update max taxid
if ((tax->taxa)->taxon[n].taxid > (tax->taxa)->max_taxid)
(tax->taxa)->max_taxid = (tax->taxa)->taxon[n].taxid;
// Initialize farest taxid value
(tax->taxa)->taxon[n].farest = -1;
i = 1;
while (i < 3)
{
elt = strtok(NULL, "|");
// Remove the first and the last characters (tab characters)
elt = elt+1;
elt[strlen(elt)-1] = '\0';
if (i == 1)
(*parent_taxids_p)[n] = atoi(elt);
else if (i == 2)
{
(*rank_names_p)[n] = (char*) malloc((strlen(elt)+1) * sizeof(char));
if ((*rank_names_p)[n] == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError allocating memory for taxon rank name");
free(*parent_taxids_p);
free(*rank_names_p);
fclose(file);
closedir(tax_dir);
return -1;
}
strcpy((*rank_names_p)[n], elt);
}
i++;
}
n++;
}
// Check that fgets stopped because it reached EOF
if (!feof(file))
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError: file reading was stopped before end of file");
free(*parent_taxids_p);
free(*rank_names_p);
fclose(file);
closedir(tax_dir);
return -1;
}
// Store count
(tax->taxa)->count = n;
(tax->taxa)->ncbi_count = n;
(tax->taxa)->local_count = 0;
// Truncate the structure memory to the right size
tax->taxa = (ecotxidx_t*) realloc(tax->taxa, sizeof(ecotxidx_t) + sizeof(ecotx_t) * (tax->taxa)->count);
if (tax->taxa == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError reallocating memory for taxonomy structure");
free(*parent_taxids_p);
free(*rank_names_p);
fclose(file);
closedir(tax_dir);
return -1;
}
if (fclose(file) < 0)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError closing a taxdump file");
free(*parent_taxids_p);
free(*rank_names_p);
closedir(tax_dir);
return -1;
}
}
}
if (closedir(tax_dir) < 0)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError closing a taxdump directory");
free(*parent_taxids_p);
free(*rank_names_p);
closedir(tax_dir);
return -1;
}
if ( ! file_found)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError: could not find 'nodes.dmp' file in taxdump directory");
free(*parent_taxids_p);
free(*rank_names_p);
return -1;
}
return 0;
}
/**
 * Reads the 'delnodes.dmp' file of a taxdump directory and stores the taxids
 * it lists (first '|'-separated field of each line) in a newly allocated array.
 *
 * @param taxdump        Path of the taxdump directory.
 * @param tax            Not used by this function; kept so that the signature is unchanged.
 * @param delnodes_p     Where the address of the allocated taxid array is stored. On success
 *                       the caller owns the array; on failure the array is freed and the
 *                       pointer is set to NULL.
 * @param delnodes_count Where the number of taxids read is stored.
 *
 * @retval 0 on success.
 * @retval -1 if an error occurred (obi_errno is set).
 */
int read_delnodes_dmp(const char* taxdump, OBIDMS_taxonomy_p tax, int32_t** delnodes_p, int32_t* delnodes_count)
{
    struct dirent* dp;
    DIR*           tax_dir = NULL;
    FILE*          file = NULL;
    char*          file_name;
    size_t         file_name_size;
    size_t         line_length;
    bool           file_found = false;
    char           line[2048];    // TODO large enough?
    char*          elt;
    int32_t*       enlarged;
    int            buffer_size = 10000;
    int            printed;
    int            n;

    (void) tax;

    *delnodes_count = 0;

    // Initializing the list of deleted nodes
    *delnodes_p = (int32_t*) malloc(sizeof(int32_t) * buffer_size);
    if (*delnodes_p == NULL)
    {
        obi_set_errno(OBI_MALLOC_ERROR);
        obidebug(1, "\nError allocating the memory for the deleted taxids array");
        return -1;
    }

    tax_dir = opendir(taxdump);
    if (tax_dir == NULL)
    {
        // Nothing to close here: calling closedir() on a NULL stream is not allowed
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nProblem opening a taxdump directory");
        goto error;
    }

    // Go through taxonomy files
    while ((dp = readdir(tax_dir)) != NULL)
    {
        if (strcmp(dp->d_name, "delnodes.dmp") != 0)
            continue;

        file_found = true;

        // Build the file path
        file_name_size = strlen(taxdump) + strlen("/delnodes.dmp") + 1;
        file_name = (char*) malloc(file_name_size * sizeof(char));
        if (file_name == NULL)
        {
            obi_set_errno(OBI_MALLOC_ERROR);
            obidebug(1, "\nError allocating the memory for a file name");
            goto error;
        }
        printed = snprintf(file_name, file_name_size, "%s/delnodes.dmp", taxdump);
        if ((printed < 0) || ((size_t) printed >= file_name_size))
        {
            obi_set_errno(OBI_TAXONOMY_ERROR);
            obidebug(1, "\nError building a taxonomy file name");
            free(file_name);
            goto error;
        }

        file = fopen(file_name, "r");
        free(file_name);
        if (file == NULL)
        {
            obi_set_errno(OBI_TAXONOMY_ERROR);
            obidebug(1, "\nProblem opening a taxonomy file");
            goto error;
        }

        n = 0;
        while (fgets(line, sizeof(line), file))
        {
            // Check for terminal '\n' character (line complete)
            line_length = strlen(line);
            if ((line_length == 0) || (line[line_length - 1] != '\n'))
            {
                obi_set_errno(OBI_TAXONOMY_ERROR);
                obidebug(1, "\nError: line buffer size not large enough for line in taxonomy file");
                goto error;
            }

            // Get first and only element of the line (the deprecated taxid)
            elt = strtok(line, "|");
            if (elt == NULL)
            {
                obi_set_errno(OBI_TAXONOMY_ERROR);
                obidebug(1, "\nError: malformed line in taxonomy file");
                goto error;
            }
            // Remove the last character (tab character)
            elt[strlen(elt)-1] = '\0';

            // Enlarge the array if needed. The result goes through a temporary pointer
            // so that the taxids already read are neither lost nor leaked on failure.
            if (n == buffer_size)
            {
                enlarged = (int32_t*) realloc(*delnodes_p, sizeof(int32_t) * buffer_size * 2);
                if (enlarged == NULL)
                {
                    obi_set_errno(OBI_MALLOC_ERROR);
                    obidebug(1, "\nError reallocating memory for a taxonomy structure");
                    goto error;
                }
                *delnodes_p = enlarged;
                buffer_size = buffer_size * 2;
            }

            // Store the old taxid in the list of deleted taxids
            (*delnodes_p)[n] = atoi(elt);
            n++;
        }

        // Check that fgets stopped because it reached EOF
        if (!feof(file))
        {
            obi_set_errno(OBI_TAXONOMY_ERROR);
            obidebug(1, "\nError: file reading was stopped before end of file");
            goto error;
        }

        // Store count
        *delnodes_count = n;

        if (fclose(file) < 0)
        {
            file = NULL;    // the stream must not be closed a second time
            obi_set_errno(OBI_TAXONOMY_ERROR);
            obidebug(1, "\nError closing a taxdump file");
            goto error;
        }
        file = NULL;
    }

    if (closedir(tax_dir) < 0)
    {
        tax_dir = NULL;    // the directory stream must not be closed a second time
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError closing a taxdump directory");
        goto error;
    }
    tax_dir = NULL;

    if ( ! file_found)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError: could not find 'delnodes.dmp' file in taxdump directory");
        goto error;
    }

    return 0;

error:
    if (file != NULL)
        fclose(file);
    if (tax_dir != NULL)
        closedir(tax_dir);
    free(*delnodes_p);
    *delnodes_p = NULL;
    return -1;
}
// Appends one (taxid, index) entry at position *n of the merged index being built,
// doubling the allocated buffer first when it is full.
// Returns 0 on success, -1 on allocation failure (tax->merged_idx is left valid).
static int append_merged_entry(OBIDMS_taxonomy_p tax, int* n, int* buffer_size, int32_t taxid, int32_t idx)
{
    ecomergedidx_t* enlarged;

    if (*n == *buffer_size)
    {
        enlarged = (ecomergedidx_t*) realloc(tax->merged_idx, sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * (*buffer_size) * 2);
        if (enlarged == NULL)
        {
            obi_set_errno(OBI_MALLOC_ERROR);
            obidebug(1, "\nError reallocating memory for a taxonomy structure");
            return -1;
        }
        tax->merged_idx = enlarged;
        *buffer_size = (*buffer_size) * 2;
    }

    (tax->merged_idx)->merged[*n].taxid = taxid;
    (tax->merged_idx)->merged[*n].idx = idx;
    (*n)++;

    return 0;
}


// Moves to the merged index, in ascending taxid order, the pending current taxa (read upwards
// from *nT) and the pending deleted taxids (read downwards from *nD) that are strictly smaller
// than *limit, or all of them when limit is NULL.
// Returns 0 on success, -1 on allocation failure.
static int merge_pending_taxids(OBIDMS_taxonomy_p tax, const int32_t* delnodes, int* nT, int* nD, int* n, int* buffer_size, const int32_t* limit)
{
    bool taxon_pending;
    bool deleted_pending;

    while (true)
    {
        taxon_pending = (*nT < (tax->taxa)->count) &&
                        ((limit == NULL) || ((tax->taxa)->taxon[*nT].taxid < *limit));
        deleted_pending = (*nD >= 0) &&
                          ((limit == NULL) || (delnodes[*nD] < *limit));

        if (!taxon_pending && !deleted_pending)
            return 0;

        if (taxon_pending && (!deleted_pending || ((tax->taxa)->taxon[*nT].taxid <= delnodes[*nD])))
        {   // Add element from taxa list
            if (append_merged_entry(tax, n, buffer_size, (tax->taxa)->taxon[*nT].taxid, *nT) < 0)
                return -1;
            (*nT)++;
        }
        else
        {   // Add element from deleted taxids list (the index to tag deleted taxids is -1)
            if (append_merged_entry(tax, n, buffer_size, delnodes[*nD], -1) < 0)
                return -1;
            (*nD)--;
        }
    }
}


/**
 * Reads the 'merged.dmp' file of a taxdump directory and builds tax->merged_idx.
 *
 * The merged index is an ordered list of the current taxids, the deprecated taxids that have
 * a current reference, and the deleted taxids with no current reference. Each element holds
 * a taxid and the index of the corresponding taxon in the taxa structure, or -1 for deleted
 * taxids. It is built by merging 3 ordered lists: tax->taxa, the lines of 'merged.dmp' and
 * the deleted taxids array (read from its last element down to its first one).
 *
 * @param taxdump        Path of the taxdump directory.
 * @param tax            Taxonomy whose taxa are already loaded; its merged_idx is filled here.
 * @param delnodes       Array of deleted taxids. It is freed by this function on success only.
 * @param delnodes_count Number of deleted taxids.
 *
 * @retval 0 on success.
 * @retval -1 if an error occurred (obi_errno is set); tax->merged_idx, when allocated,
 *         is left for the caller to release.
 */
int read_merged_dmp(const char* taxdump, OBIDMS_taxonomy_p tax, int32_t* delnodes, int32_t delnodes_count)
{
    int             n, nD, nT;
    int32_t         taxid, old_taxid;
    ecotx_t*        t;
    ecomergedidx_t* truncated;
    struct dirent*  dp;
    DIR*            tax_dir = NULL;
    FILE*           file = NULL;
    char*           file_name;
    size_t          file_name_size;
    size_t          line_length;
    bool            file_found = false;
    char            line[2048];    // TODO large enough?
    char*           elt;
    int             buffer_size = 10000;
    int             printed;

    tax_dir = opendir(taxdump);
    if (tax_dir == NULL)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nProblem opening a taxdump directory");
        return -1;
    }

    // Go through taxonomy files
    while ((dp = readdir(tax_dir)) != NULL)
    {
        if (strcmp(dp->d_name, "merged.dmp") != 0)
            continue;

        file_found = true;

        // Initializing the merged structure
        tax->merged_idx = (ecomergedidx_t*) malloc(sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * buffer_size);
        if (tax->merged_idx == NULL)
        {
            obi_set_errno(OBI_MALLOC_ERROR);
            obidebug(1, "\nError allocating the memory for a taxonomy structure");
            goto error;
        }

        // Build the file path
        file_name_size = strlen(taxdump) + strlen("/merged.dmp") + 1;
        file_name = (char*) malloc(file_name_size * sizeof(char));
        if (file_name == NULL)
        {
            obi_set_errno(OBI_MALLOC_ERROR);
            obidebug(1, "\nError allocating the memory for a file name");
            goto error;
        }
        printed = snprintf(file_name, file_name_size, "%s/merged.dmp", taxdump);
        if ((printed < 0) || ((size_t) printed >= file_name_size))
        {
            obi_set_errno(OBI_TAXONOMY_ERROR);
            obidebug(1, "\nError building a taxonomy file name");
            free(file_name);
            goto error;
        }

        file = fopen(file_name, "r");
        free(file_name);
        if (file == NULL)
        {
            obi_set_errno(OBI_TAXONOMY_ERROR);
            obidebug(1, "\nProblem opening a taxonomy file");
            goto error;
        }

        nT = 0;                   // to point in current taxa list while merging
        nD = delnodes_count-1;    // to point in deleted taxids list while merging (going from count-1 to 0 because taxids are sorted in descending order)
        n = 0;                    // to point in final merged list while merging

        while (fgets(line, sizeof(line), file))
        {
            // Check for terminal '\n' character (line complete)
            line_length = strlen(line);
            if ((line_length == 0) || (line[line_length - 1] != '\n'))
            {
                obi_set_errno(OBI_TAXONOMY_ERROR);
                obidebug(1, "\nError: line buffer size not large enough for line in taxonomy file");
                goto error;
            }

            // Parse the 2 elements separated by '|'
            // First element: old deprecated taxid
            elt = strtok(line, "|");
            if (elt == NULL)
            {
                obi_set_errno(OBI_TAXONOMY_ERROR);
                obidebug(1, "\nError: malformed line in taxonomy file");
                goto error;
            }
            // Remove the last character (tab character)
            elt[strlen(elt)-1] = '\0';
            old_taxid = atoi(elt);

            // 2nd element: new taxid
            elt = strtok(NULL, "|");
            if ((elt == NULL) || (strlen(elt) < 2))
            {
                obi_set_errno(OBI_TAXONOMY_ERROR);
                obidebug(1, "\nError: malformed line in taxonomy file");
                goto error;
            }
            // Remove the first and the last characters (tab characters)
            elt = elt+1;
            elt[strlen(elt)-1] = '\0';
            taxid = atoi(elt);

            // First add every current or deleted taxid smaller than the deprecated one,
            // whichever of the two lists they come from
            if (merge_pending_taxids(tax, delnodes, &nT, &nD, &n, &buffer_size, &old_taxid) < 0)
                goto error;

            // Find the taxon that the deprecated taxid now refers to
            t = obi_taxo_get_taxon_with_current_taxid(tax, taxid);
            if (t == NULL)
            {
                obi_set_errno(OBI_TAXONOMY_ERROR);
                obidebug(1, "\nError: could not find the current taxon referred to by a deprecated taxid");
                goto error;
            }

            // Store the deprecated taxid with the index that refers to the new taxid
            if (append_merged_entry(tax, &n, &buffer_size, old_taxid, t->idx) < 0)
                goto error;
        }

        // Check that fgets stopped because it reached EOF
        if (!feof(file))
        {
            obi_set_errno(OBI_TAXONOMY_ERROR);
            obidebug(1, "\nError: file reading was stopped before end of file");
            goto error;
        }

        // Add the current and deleted taxids greater than the last deprecated taxid
        if (merge_pending_taxids(tax, delnodes, &nT, &nD, &n, &buffer_size, NULL) < 0)
            goto error;

        // Store count
        (tax->merged_idx)->count = n;

        // Truncate the structure memory to the right size
        truncated = (ecomergedidx_t*) realloc(tax->merged_idx, sizeof(ecomergedidx_t) + sizeof(ecomerged_t) * n);
        if (truncated == NULL)
        {
            obi_set_errno(OBI_MALLOC_ERROR);
            obidebug(1, "\nError reallocating memory for a a taxonomy structure");
            goto error;
        }
        tax->merged_idx = truncated;

        if (fclose(file) < 0)
        {
            file = NULL;    // the stream must not be closed a second time
            obi_set_errno(OBI_TAXONOMY_ERROR);
            obidebug(1, "\nError closing a taxdump file");
            goto error;
        }
        file = NULL;
    }

    if (closedir(tax_dir) < 0)
    {
        tax_dir = NULL;    // the directory stream must not be closed a second time
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError closing a taxdump directory");
        goto error;
    }
    tax_dir = NULL;

    if ( ! file_found)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError: could not find 'merged.dmp' file in taxdump directory");
        return -1;
    }

    // Free delnodes array, not needed anymore
    free(delnodes);

    return 0;

error:
    if (file != NULL)
        fclose(file);
    if (tax_dir != NULL)
        closedir(tax_dir);
    return -1;
}
// Frees the names (and class names) of the first `filled` entries of tax->names, then the
// structure itself, and resets tax->names to NULL so that nobody frees it a second time.
static void discard_names_read_so_far(OBIDMS_taxonomy_p tax, int filled)
{
    int k;

    if (tax->names == NULL)
        return;

    for (k = 0; k < filled; k++)
    {
        free((tax->names)->names[k].name);
        free((tax->names)->names[k].class_name);
    }
    free(tax->names);
    tax->names = NULL;
}


/**
 * Reads the 'names.dmp' file of a taxdump directory and builds tax->names.
 *
 * For each line, the taxid (1st field) is looked up in tax->taxa, and the name (2nd field)
 * and class name (4th field) are copied. A name is flagged as scientific when its class
 * name is "scientific name".
 *
 * @param taxdump Path of the taxdump directory.
 * @param tax     Taxonomy whose taxa are already loaded; its names structure is filled here.
 *
 * @retval 0 on success.
 * @retval -1 if an error occurred (obi_errno is set). Everything allocated here is then
 *         released and tax->names is NULL; the taxonomy itself is left for the caller to close.
 */
int read_names_dmp(const char* taxdump, OBIDMS_taxonomy_p tax)
{
    int            i, j, n;
    int            filled = 0;    // number of entries whose name pointers are initialized
    int            taxid;
    struct dirent* dp;
    DIR*           tax_dir = NULL;
    FILE*          file = NULL;
    char*          file_name;
    size_t         file_name_size;
    size_t         line_length;
    bool           file_found = false;
    char           line[2048];    // TODO large enough?
    char*          elt;
    econameidx_t*  resized;
    int            buffer_size = 10000;
    int            printed;

    tax_dir = opendir(taxdump);
    if (tax_dir == NULL)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nProblem opening a taxdump directory");
        return -1;
    }

    // Go through taxonomy files
    while ((dp = readdir(tax_dir)) != NULL)
    {
        if (strcmp(dp->d_name, "names.dmp") != 0)
            continue;

        file_found = true;

        // Initializing the names structure
        tax->names = (econameidx_t*) malloc(sizeof(econameidx_t) + sizeof(econame_t) * buffer_size);
        if (tax->names == NULL)
        {
            obi_set_errno(OBI_MALLOC_ERROR);
            obidebug(1, "\nError allocating the memory for a taxonomy structure");
            goto error;
        }

        // Build the file path
        file_name_size = strlen(taxdump) + strlen("/names.dmp") + 1;
        file_name = (char*) malloc(file_name_size * sizeof(char));
        if (file_name == NULL)
        {
            obi_set_errno(OBI_MALLOC_ERROR);
            obidebug(1, "\nError allocating the memory for a file name");
            goto error;
        }
        printed = snprintf(file_name, file_name_size, "%s/names.dmp", taxdump);
        if ((printed < 0) || ((size_t) printed >= file_name_size))
        {
            obi_set_errno(OBI_TAXONOMY_ERROR);
            obidebug(1, "\nError building a taxonomy file name");
            free(file_name);
            goto error;
        }

        file = fopen(file_name, "r");
        free(file_name);
        if (file == NULL)
        {
            obi_set_errno(OBI_TAXONOMY_ERROR);
            obidebug(1, "\nProblem opening a taxonomy file");
            goto error;
        }

        n = 0;
        j = 0;
        while (fgets(line, sizeof(line), file))
        {
            // Enlarge structures if needed (through a temporary pointer, so that the
            // names already read can still be released on failure)
            if (n == buffer_size)
            {
                resized = (econameidx_t*) realloc(tax->names, sizeof(econameidx_t) + sizeof(econame_t) * buffer_size * 2);
                if (resized == NULL)
                {
                    obi_set_errno(OBI_MALLOC_ERROR);
                    obidebug(1, "\nError reallocating memory for a taxonomy structure");
                    goto error;
                }
                tax->names = resized;
                buffer_size = buffer_size * 2;
            }

            // Make the new entry safe to free whatever happens next
            (tax->names)->names[n].name = NULL;
            (tax->names)->names[n].class_name = NULL;
            filled = n + 1;

            // Check for terminal '\n' character (line complete)
            line_length = strlen(line);
            if ((line_length == 0) || (line[line_length - 1] != '\n'))
            {
                obi_set_errno(OBI_TAXONOMY_ERROR);
                obidebug(1, "\nError: line buffer size not large enough for line in taxonomy file");
                goto error;
            }

            // Parse 4 first elements separated by '|'
            // First element: taxid
            elt = strtok(line, "|");
            if (elt == NULL)
            {
                obi_set_errno(OBI_TAXONOMY_ERROR);
                obidebug(1, "\nError: malformed line in taxonomy file");
                goto error;
            }
            // Remove the last character (tab character)
            elt[strlen(elt)-1] = '\0';
            taxid = atoi(elt);

            // Find taxid in taxa structure and store pointer in names structure
            i = j;
            while ((i < (tax->taxa)->count) && ((tax->taxa)->taxon[i].taxid != taxid))
                i++;
            if (i == (tax->taxa)->count)
            {
                obi_set_errno(OBI_TAXONOMY_ERROR);
                obidebug(1, "\nError: could not find taxon associated to name when reading taxdump");
                goto error;
            }
            j = i;    // Because there are several names by taxon but they are in the same order
            (tax->names)->names[n].taxon = ((tax->taxa)->taxon)+i;

            for (i = 1; i < 4; i++)
            {
                elt = strtok(NULL, "|");
                if ((elt == NULL) || (strlen(elt) < 2))
                {
                    obi_set_errno(OBI_TAXONOMY_ERROR);
                    obidebug(1, "\nError: malformed line in taxonomy file");
                    goto error;
                }
                // Remove the first and the last characters (tab characters)
                elt = elt+1;
                elt[strlen(elt)-1] = '\0';

                if (i == 1)    // Name
                {
                    (tax->names)->names[n].name = (char*) malloc((strlen(elt) + 1) * sizeof(char));
                    if ((tax->names)->names[n].name == NULL)
                    {
                        // The taxonomy is NOT closed here: the caller does it on failure
                        obi_set_errno(OBI_MALLOC_ERROR);
                        obidebug(1, "\nError allocating memory for a taxon name");
                        goto error;
                    }
                    strcpy((tax->names)->names[n].name, elt);
                }
                else if (i == 3)    // Class name
                {
                    (tax->names)->names[n].class_name = (char*) malloc((strlen(elt) + 1) * sizeof(char));
                    if ((tax->names)->names[n].class_name == NULL)
                    {
                        obi_set_errno(OBI_MALLOC_ERROR);
                        obidebug(1, "\nError allocating memory for a taxon class name");
                        goto error;
                    }
                    strcpy((tax->names)->names[n].class_name, elt);
                    if (strcmp(elt, "scientific name") == 0)
                        (tax->names)->names[n].is_scientific_name = 1;
                    else
                        (tax->names)->names[n].is_scientific_name = 0;
                }
            }
            n++;
        }

        // Check that fgets stopped because it reached EOF
        if (!feof(file))
        {
            obi_set_errno(OBI_TAXONOMY_ERROR);
            obidebug(1, "\nError: file reading was stopped before end of file");
            goto error;
        }

        // Store count
        (tax->names)->count = n;
        filled = n;

        // Truncate the structure memory to the right size
        resized = (econameidx_t*) realloc(tax->names, sizeof(econameidx_t) + sizeof(econame_t) * n);
        if (resized == NULL)
        {
            obi_set_errno(OBI_MALLOC_ERROR);
            obidebug(1, "\nError reallocating memory for a a taxonomy structure");
            goto error;
        }
        tax->names = resized;

        if (fclose(file) < 0)
        {
            file = NULL;    // the stream must not be closed a second time
            obi_set_errno(OBI_TAXONOMY_ERROR);
            obidebug(1, "\nError closing a taxdump file");
            goto error;
        }
        file = NULL;
    }

    if (closedir(tax_dir) < 0)
    {
        tax_dir = NULL;    // the directory stream must not be closed a second time
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError closing a taxdump directory");
        goto error;
    }
    tax_dir = NULL;

    if ( ! file_found)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError: could not find 'names.dmp' file in taxdump directory");
        return -1;
    }

    return 0;

error:
    if (file != NULL)
        fclose(file);
    if (tax_dir != NULL)
        closedir(tax_dir);
    discard_names_read_so_far(tax, filled);
    return -1;
}
OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
{
OBIDMS_taxonomy_p tax;
char** rank_names=NULL;
int* parent_taxids=NULL;
int32_t* delnodes=NULL;
int32_t delnodes_count;
bool already_in;
ecotx_t* t;
int buffer_size;
int i, j;
// Initialize taxonomy structure
tax = (OBIDMS_taxonomy_p) malloc(sizeof(OBIDMS_taxonomy_t));
if (tax == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError allocating the memory for a taxonomy structure");
return NULL;
}
tax->ranks = NULL;
tax->taxa = NULL;
tax->names = NULL;
tax->merged_idx = NULL;
tax->dms = NULL;
(tax->tax_name)[0] = '\0';
// TODO check if taxdump path is for a gz file to unzip or a directory
// READ NODES.DMP
if (read_nodes_dmp(taxdump, tax, &rank_names, &parent_taxids) < 0)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nProblem reading 'nodes.dmp'");
obi_close_taxonomy(tax);
return NULL;
}
// READ DELNODES.DMP
if (read_delnodes_dmp(taxdump, tax, &delnodes, &delnodes_count) < 0)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nProblem reading 'delnodes.dmp'");
obi_close_taxonomy(tax);
free(rank_names);
free(parent_taxids);
return NULL;
}
// READ MERGED.DMP
if (read_merged_dmp(taxdump, tax, delnodes, delnodes_count) < 0)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nProblem reading 'merged.dmp'");
obi_close_taxonomy(tax);
free(delnodes);
free(rank_names);
free(parent_taxids);
return NULL;
}
// READ NAMES.DMP
if (read_names_dmp(taxdump, tax) < 0)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nProblem reading 'names.dmp'");
obi_close_taxonomy(tax);
free(rank_names);
free(parent_taxids);
return NULL;
}
// Go through data to fill the taxonomy structure
// Build rank list
// Initialize rank structure
buffer_size = 10;
tax->ranks = (ecorankidx_t*) malloc(sizeof(ecorankidx_t) + sizeof(char*) * buffer_size);
if (tax->ranks == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError allocating memory for taxon rank array");
obi_close_taxonomy(tax);
free(parent_taxids);
free(rank_names);
return NULL;
}
(tax->ranks)->count = 0;
for (i=0; i < (tax->taxa)->count; i++)
{
already_in = false;
for (j=0; j < (tax->ranks)->count; j++)
{
if (strcmp(rank_names[i], ((tax->ranks)->label)[j]) == 0)
{
already_in = true;
break;
}
}
if (!already_in)
{
// Realloc rank structure if needed
if ((tax->ranks)->count == buffer_size)
{
buffer_size = buffer_size + 10;
tax->ranks = (ecorankidx_t*) realloc(tax->ranks, sizeof(ecorankidx_t) + sizeof(char*) * buffer_size);
if (tax->ranks == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError reallocating memory for taxon ranks");
obi_close_taxonomy(tax);
free(parent_taxids);
free(rank_names);
return NULL;
}
}
// Store new rank
((tax->ranks)->label)[(tax->ranks)->count] = (char*) malloc((strlen(rank_names[i]) + 1) * sizeof(char));
if (((tax->ranks)->label)[(tax->ranks)->count] == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError allocating memory for taxon rank names");
obi_close_taxonomy(tax);
free(parent_taxids);
free(rank_names);
return NULL;
}
strcpy(((tax->ranks)->label)[(tax->ranks)->count], rank_names[i]);
((tax->ranks)->count)++;
}
}
// Truncate to the number of ranks recorded
tax->ranks = (ecorankidx_t*) realloc(tax->ranks, sizeof(ecorankidx_t) + sizeof(char*) * (tax->ranks)->count);
if (tax->ranks == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError reallocating memory for taxon ranks");
obi_close_taxonomy(tax);
free(parent_taxids);
free(rank_names);
return NULL;
}
// Sort in alphabetical order
qsort((tax->ranks)->label, (tax->ranks)->count, sizeof(char*), cmp_str);
// Associate the taxa with their rank indices
for (i=0; i < (tax->taxa)->count; i++)
{
for (j=0; j < (tax->ranks)->count; j++)
{
if (strcmp(rank_names[i], ((tax->ranks)->label)[j]) == 0)
{
((tax->taxa)->taxon)[i].rank = j;
break;
}
}
}
// Associate the taxa with their scientific name
for (i=0; i < (tax->names)->count; i++)
{
if ((tax->names)->names[i].is_scientific_name)
{
((tax->names)->names[i].taxon)->name = (char*) malloc((strlen((((tax->names)->names)[i]).name) + 1) * sizeof(char));
if (((tax->names)->names[i].taxon)->name == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError reallocating memory for taxon ranks");
obi_close_taxonomy(tax);
free(parent_taxids);
free(rank_names);
return NULL;
}
strcpy(((tax->names)->names[i].taxon)->name, (((tax->names)->names)[i]).name);
}
}
// Sort names in alphabetical order
qsort((tax->names)->names, (tax->names)->count, sizeof(econame_t), cmp_names);
// Associate the taxa with their parent
for (i=0; i < (tax->taxa)->count; i++)
{
((tax->taxa)->taxon)[i].parent = obi_taxo_get_taxon_with_current_taxid(tax, parent_taxids[i]);
if (((tax->taxa)->taxon)[i].parent == NULL)
{
obi_set_errno(OBI_TAXONOMY_ERROR);
obidebug(1, "\nError: taxon parent not found");
obi_close_taxonomy(tax);
free(parent_taxids);
free(rank_names);
return NULL;
}
(((tax->taxa)->taxon)[i].parent)->farest = 0;
}
(tax->taxa)->buffer_size = (tax->taxa)->count;
// Compute longest branches (used to compute distances between taxa faster)
for (i=0; i < (tax->taxa)->count; i++)
{
t = (((tax->taxa))->taxon)+i;
if (t->farest == -1)
{
t->farest=0;
while (t->parent != t)
{
j = t->farest + 1;
if (j > t->parent->farest)
{
t->parent->farest = j;
t=t->parent;
}
else
t = (tax->taxa)->taxon;
}
}
}
// Freeing
free(parent_taxids);
for (i=0; i < (tax->taxa)->count; i++)
free(rank_names[i]);
free(rank_names);
return tax;
}
/**
 * Adds a local taxon at the end of the taxa of a taxonomy.
 *
 * The new taxid is the greatest of min_taxid, MIN_LOCAL_TAXID and (current maximum taxid + 1).
 * The rank and the parent are checked before anything is modified, so the taxonomy is left
 * untouched when the function fails.
 *
 * The new name is deliberately not added to the names structure, because the new name was
 * not added in the .ndx file in the OBITools1.
 * NOTE(review): the new taxid is not inserted in tax->merged_idx by this function.
 *
 * @param tax          Taxonomy to which the taxon is added.
 * @param name         Name of the new taxon (copied).
 * @param rank_name    Label of the rank of the new taxon; it must exist in tax->ranks.
 * @param parent_taxid Taxid of the parent, looked up with obi_taxo_get_taxon_with_taxid().
 * @param min_taxid    Minimum value wanted for the new taxid.
 *
 * @returns The taxid given to the new taxon.
 * @retval -1 if an error occurred (obi_errno is set).
 */
int obi_taxonomy_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const char* rank_name, int32_t parent_taxid, int32_t min_taxid)
{
    ecotxidx_t* old_taxa;
    ecotxidx_t* new_taxa;
    ecotx_t*    taxon;
    ecotx_t*    parent;
    char*       name_copy;
    size_t      old_size;
    int32_t     taxid;
    int32_t     rank_idx;
    int32_t     old_count;
    int32_t     parent_idx;
    int         i;

    // Find the rank
    rank_idx = -1;
    for (i=0; i < (tax->ranks)->count; i++)
    {
        if (strcmp(rank_name, ((tax->ranks)->label)[i]) == 0)
        {
            rank_idx = i;
            break;
        }
    }
    if (rank_idx == -1)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError: taxon rank not found when adding a new taxon");
        return -1;
    }

    // Find the parent, and remember it as an index because the taxa array is about to move
    parent = obi_taxo_get_taxon_with_taxid(tax, parent_taxid);
    if (parent == NULL)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError: taxon parent not found when adding a new taxon");
        return -1;
    }
    parent_idx = (int32_t) (parent - (tax->taxa)->taxon);

    name_copy = (char*) malloc((strlen(name) + 1) * sizeof(char));
    if (name_copy == NULL)
    {
        obi_set_errno(OBI_MALLOC_ERROR);
        obidebug(1, "\nError allocating memory for a taxon name to add a new taxon");
        return -1;
    }
    strcpy(name_copy, name);

    // Enlarge the structure memory for a new taxon.
    // The taxa hold pointers to each other (parent) and the names hold pointers to taxa:
    // they would all dangle if realloc() moved the array. The array is therefore copied
    // into a new block and the pointers are rebased while the old block is still valid.
    old_taxa = tax->taxa;
    old_count = old_taxa->count;
    old_size = sizeof(ecotxidx_t) + sizeof(ecotx_t) * (size_t) old_count;
    new_taxa = (ecotxidx_t*) malloc(old_size + sizeof(ecotx_t));
    if (new_taxa == NULL)
    {
        obi_set_errno(OBI_MALLOC_ERROR);
        obidebug(1, "\nError reallocating memory for a taxonomy structure to add a new taxon");
        free(name_copy);
        return -1;
    }
    memcpy(new_taxa, old_taxa, old_size);

    for (i=0; i < old_count; i++)
    {
        if (old_taxa->taxon[i].parent != NULL)
            new_taxa->taxon[i].parent = new_taxa->taxon + (old_taxa->taxon[i].parent - old_taxa->taxon);
    }
    if (tax->names != NULL)
    {
        for (i=0; i < (tax->names)->count; i++)
        {
            if ((tax->names)->names[i].taxon != NULL)
                (tax->names)->names[i].taxon = new_taxa->taxon + ((tax->names)->names[i].taxon - old_taxa->taxon);
        }
    }

    free(old_taxa);
    tax->taxa = new_taxa;

    // Compute new taxid that must be equal or greater than 1E7 and greater than the maximum taxid existing in the taxonomy
    if (min_taxid < MIN_LOCAL_TAXID)
        min_taxid = MIN_LOCAL_TAXID;
    if (min_taxid > (tax->taxa)->max_taxid)
        taxid = min_taxid;
    else
        taxid = ((tax->taxa)->max_taxid) + 1;

    // Fill the ecotx_t node structure
    taxon = ((tax->taxa)->taxon) + old_count;
    taxon->taxid = taxid;
    taxon->idx = old_count;
    taxon->local = true;
    taxon->name = name_copy;
    taxon->rank = rank_idx;
    taxon->parent = ((tax->taxa)->taxon) + parent_idx;
    taxon->farest = 0;

    // Update taxonomy counts etc
    (tax->taxa)->max_taxid = taxid;
    ((tax->taxa)->count)++;
    ((tax->taxa)->local_count)++;
    (tax->taxa)->buffer_size = (tax->taxa)->count;

    return taxid;
}
/////// PUBLIC /////////
// Tells whether the value returned by snprintf() reports an error or an output that
// did not fit in a buffer of buffer_size characters.
static bool file_name_not_built(int printed, int buffer_size)
{
    return (printed < 0) || (printed >= buffer_size);
}


/**
 * Reads a taxonomy stored in a DMS.
 *
 * The ranks (.rdx), the taxa (.tdx) with the local taxa (.ldx), the merged index (.adx) and,
 * when asked, the alternative names (.ndx) are read from the taxonomy directory given by
 * get_taxonomy_path().
 *
 * @param dms                    The DMS containing the taxonomy.
 * @param taxonomy_name          Name of the taxonomy.
 * @param read_alternative_names Whether the alternative names file is read too.
 *
 * @returns The taxonomy structure.
 * @retval NULL if an error occurred (obi_errno is set).
 */
OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, bool read_alternative_names)
{
    OBIDMS_taxonomy_p tax;
    char*             taxonomy_path;
    char*             ranks_file_name;
    char*             taxa_file_name;
    char*             merged_idx_file_name;
    char*             local_taxa_file_name;
    char*             alter_names_file_name;
    int               buffer_size;

    tax = (OBIDMS_taxonomy_p) malloc(sizeof(OBIDMS_taxonomy_t));
    if (tax == NULL)
    {
        obi_set_errno(OBI_MALLOC_ERROR);
        obidebug(1, "\nError allocating memory for a taxonomy structure");
        return NULL;
    }

    tax->ranks = NULL;
    tax->taxa = NULL;
    tax->names = NULL;
    tax->merged_idx = NULL;
    tax->dms = dms;

    strcpy(tax->tax_name, taxonomy_name);

    buffer_size = 2048;

    taxonomy_path = get_taxonomy_path(dms, taxonomy_name);
    if (taxonomy_path == NULL)
    {
        // Nothing but the bare structure has been allocated so far
        free(tax);
        return NULL;
    }

    // Read ranks
    ranks_file_name = (char*) malloc(buffer_size*sizeof(char));
    if (ranks_file_name == NULL)
    {
        obi_set_errno(OBI_MALLOC_ERROR);
        obidebug(1, "\nError allocating memory for ranks file name");
        free(taxonomy_path);
        free(tax);
        return NULL;
    }
    if (file_name_not_built(snprintf(ranks_file_name, buffer_size, "%s/%s.rdx", taxonomy_path, taxonomy_name), buffer_size))
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError building ranks file name");
        free(taxonomy_path);
        free(ranks_file_name);
        free(tax);
        return NULL;
    }
    tax->ranks = read_rankidx(ranks_file_name);
    if (tax->ranks == NULL)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError reading taxonomy ranks file (check taxonomy name spelling)");
        free(taxonomy_path);
        free(ranks_file_name);
        free(tax);
        return NULL;
    }
    free(ranks_file_name);

    // Read taxa
    taxa_file_name = (char*) malloc(buffer_size*sizeof(char));
    if (taxa_file_name == NULL)
    {
        obi_set_errno(OBI_MALLOC_ERROR);
        obidebug(1, "\nError allocating memory for taxa file name");
        free(taxonomy_path);
        obi_close_taxonomy(tax);
        return NULL;
    }
    if (file_name_not_built(snprintf(taxa_file_name, buffer_size, "%s/%s.tdx", taxonomy_path, taxonomy_name), buffer_size))
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError building taxa file name");
        free(taxonomy_path);
        free(taxa_file_name);
        obi_close_taxonomy(tax);
        return NULL;
    }
    local_taxa_file_name = (char*) malloc(buffer_size*sizeof(char));
    if (local_taxa_file_name == NULL)
    {
        obi_set_errno(OBI_MALLOC_ERROR);
        obidebug(1, "\nError allocating memory for taxa file name");
        free(taxonomy_path);
        free(taxa_file_name);
        obi_close_taxonomy(tax);
        return NULL;
    }
    if (file_name_not_built(snprintf(local_taxa_file_name, buffer_size, "%s/%s.ldx", taxonomy_path, taxonomy_name), buffer_size))
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError building local taxa file name");
        free(taxonomy_path);
        free(taxa_file_name);
        free(local_taxa_file_name);
        obi_close_taxonomy(tax);
        return NULL;
    }
    tax->taxa = read_taxonomyidx(taxa_file_name, local_taxa_file_name);
    if (tax->taxa == NULL)
    {
        free(taxonomy_path);
        free(taxa_file_name);
        free(local_taxa_file_name);
        obi_close_taxonomy(tax);
        return NULL;
    }
    free(taxa_file_name);
    free(local_taxa_file_name);

    // Read merged index (old and current taxids referring to indices in the taxa structure)
    merged_idx_file_name = (char*) malloc(buffer_size*sizeof(char));
    if (merged_idx_file_name == NULL)
    {
        obi_set_errno(OBI_MALLOC_ERROR);
        obidebug(1, "\nError allocating memory for merged index file name");
        free(taxonomy_path);
        obi_close_taxonomy(tax);
        return NULL;
    }
    if (file_name_not_built(snprintf(merged_idx_file_name, buffer_size, "%s/%s.adx", taxonomy_path, taxonomy_name), buffer_size))
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError building merged index file name");
        free(taxonomy_path);
        free(merged_idx_file_name);
        obi_close_taxonomy(tax);
        return NULL;
    }
    tax->merged_idx = read_mergedidx(merged_idx_file_name, tax);
    if (tax->merged_idx == NULL)
    {
        free(taxonomy_path);
        free(merged_idx_file_name);
        obi_close_taxonomy(tax);
        return NULL;
    }
    free(merged_idx_file_name);

    // Read alternative names
    if (read_alternative_names)
    {
        alter_names_file_name = (char*) malloc(buffer_size*sizeof(char));
        if (alter_names_file_name == NULL)
        {
            obi_set_errno(OBI_MALLOC_ERROR);
            obidebug(1, "\nError allocating memory for alternative names file name");
            free(taxonomy_path);
            obi_close_taxonomy(tax);
            return NULL;
        }
        if (file_name_not_built(snprintf(alter_names_file_name, buffer_size, "%s/%s.ndx", taxonomy_path, taxonomy_name), buffer_size))
        {
            obi_set_errno(OBI_TAXONOMY_ERROR);
            obidebug(1, "\nError building alternative names file name");
            free(taxonomy_path);
            free(alter_names_file_name);
            obi_close_taxonomy(tax);
            return NULL;
        }
        tax->names = read_nameidx(alter_names_file_name, tax);
        if (tax->names == NULL)
        {
            free(taxonomy_path);
            free(alter_names_file_name);
            obi_close_taxonomy(tax);
            return NULL;
        }
        free(alter_names_file_name);
    }

    free(taxonomy_path);

    return tax;
}
/**
 * Closes a taxonomy: writes its local taxa if there are any, then frees the ranks, names,
 * taxa and merged index structures and the taxonomy structure itself.
 *
 * A NULL taxonomy, or a partially built one (some of its structures still NULL), is accepted.
 *
 * @param taxonomy The taxonomy to close.
 *
 * @retval 0 on success.
 * @retval -1 if the local taxa could not be written; nothing is freed in that case.
 */
int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy)
{
    int i;

    if (taxonomy == NULL)
        return 0;

    // Update local informations (local taxa and preferred names) if there are any.
    // The taxa structure is checked first because this function is also called on
    // taxonomies whose reading failed before the taxa were loaded.
    if ((taxonomy->taxa != NULL) && ((taxonomy->taxa)->local_count > 0))
    {
        if (taxonomy->dms == NULL)
        {
            obi_set_errno(OBI_TAXONOMY_ERROR);
            obidebug(1, "\nError closing a taxonomy with local files but no DMS associated (probably read directly from taxdump)");   // TODO discuss
        }
        // NOTE(review): the write is still attempted when there is no DMS, as before
        if (write_local_taxonomy_idx(taxonomy->dms, taxonomy, taxonomy->tax_name) < 0)
            return -1;
    }

    if (taxonomy->ranks != NULL)
    {
        for (i=0; i < (taxonomy->ranks)->count; i++)
            free((taxonomy->ranks)->label[i]);
        free(taxonomy->ranks);
    }

    if (taxonomy->names != NULL)
    {
        for (i=0; i < (taxonomy->names)->count; i++)
        {
            free(((taxonomy->names)->names[i]).name);
            free(((taxonomy->names)->names[i]).class_name);
        }
        free(taxonomy->names);
    }

    if (taxonomy->taxa != NULL)
    {
        for (i=0; i < (taxonomy->taxa)->count; i++)
            free(((taxonomy->taxa)->taxon[i]).name);
        free(taxonomy->taxa);
    }

    free(taxonomy->merged_idx);

    free(taxonomy);

    return 0;
}
//////////////////////////////////////////////////////////////////////////
/**
 * Walks up the parent links starting from a taxon (the taxon itself included)
 * until a node with the given rank index is found.
 *
 * @param taxon   The taxon where the search starts.
 * @param rankidx The rank index to look for.
 *
 * @returns The first node met with the requested rank, or NULL if the walk
 *          reaches a node that is its own parent (the root node) without
 *          finding that rank.
 */
ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx)
{
    ecotx_t* node = taxon;

    // Climb as long as the rank does not match and the node is not its own parent (root node)
    while ((node->rank != rankidx) && (node->parent != node))
        node = node->parent;

    // The loop also stops on the root, so the rank has to be checked again
    return (node->rank == rankidx) ? node : NULL;
}
/**
 * Looks up a taxon by taxid with a binary search over the taxa array of the
 * taxonomy. The merged index is not consulted.
 *
 * @param taxonomy The taxonomy to search in.
 * @param taxid    The taxid to look for.
 *
 * @returns A pointer on the matching taxon, or NULL if bsearch finds none.
 */
ecotx_t* obi_taxo_get_taxon_with_current_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid)	// TODO discuss keeping private?
{
    int32_t     n_taxa;
    const void* key;

    n_taxa = (taxonomy->taxa)->count;

    // The taxid travels as the key pointer value itself; the comparison function casts it back
    key = (const void*) ((size_t) taxid);

    return (ecotx_t*) bsearch(key,
                              (const void*) taxonomy->taxa->taxon,
                              n_taxa,
                              sizeof(ecotx_t),
                              cmp_taxids_in_ecotx_t);
}
/**
 * Looks up a taxon by taxid through the merged index of the taxonomy: the
 * taxid is searched in the merged index with a binary search, and the entry
 * found gives the position of the taxon in the taxa array.
 *
 * @param taxonomy The taxonomy to search in.
 * @param taxid    The taxid to look for.
 *
 * @returns A pointer on the taxon, or NULL if the taxid is not in the merged
 *          index or if its entry has an index of -1.
 */
ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid)
{
    ecomerged_t* entry;
    int32_t      n_entries;

    n_entries = (taxonomy->merged_idx)->count;

    // The taxid travels as the key pointer value itself; the comparison function casts it back
    entry = (ecomerged_t*) bsearch((const void*) ((size_t) taxid),
                                   (const void*) taxonomy->merged_idx->merged,
                                   n_entries,
                                   sizeof(ecomerged_t),
                                   cmp_taxids_in_ecomerged_t);

    // Taxid not in the merged index
    if (entry == NULL)
        return NULL;

    // Entry without a taxon behind it
    if (entry->idx == -1)
        return NULL;	// TODO discuss what to do when old deleted taxon

    return (taxonomy->taxa->taxon) + (entry->idx);
}
/**
 * Tells whether one of the ancestors of a taxon has the given taxid.
 *
 * The walk starts at the parent of the taxon, so the taxid of the taxon itself
 * is not tested. It stops on the first ancestor with the requested taxid, or
 * on the ancestor named "root".
 *
 * @param taxon       The taxon whose ancestors are checked.
 * @param other_taxid The taxid to look for among the ancestors.
 *
 * @returns true if an ancestor with that taxid was met, false otherwise.
 */
bool obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid)	// TODO discuss that this doesn't work with deprecated taxids
{
    ecotx_t* ancestor;

    // Empty loop body: all the work is done by the loop header
    for (ancestor = taxon->parent;
         (ancestor->taxid != other_taxid) && (strcmp(ancestor->name, "root") != 0);
         ancestor = ancestor->parent)
        ;

    // The loop also stops on "root", so the taxid has to be checked again
    return (ancestor->taxid == other_taxid);
}
/**
 * Returns the ancestor of a taxon (or the taxon itself) that has the
 * "species" rank.
 *
 * The taxonomy and the index of its "species" rank are remembered in static
 * variables between calls. They are refreshed whenever a non-NULL taxonomy
 * different from the remembered one is given. Passing NULL reuses the
 * remembered taxonomy.
 *
 * @param taxon    The taxon where the search starts.
 * @param taxonomy The taxonomy the taxon belongs to, or NULL.
 *
 * @returns The taxon found by obi_taxo_get_parent_at_rank(), or NULL (with
 *          OBI_TAXONOMY_ERROR set) if no taxonomy is remembered or if the
 *          remembered rank index is negative.
 */
ecotx_t* obi_taxo_get_species(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
{
    static OBIDMS_taxonomy_p cached_taxonomy = NULL;
    static int32_t           species_rank    = -1;

    bool refresh = (taxonomy != NULL) && (taxonomy != cached_taxonomy);

    if (refresh)
    {
        species_rank    = rank_index("species", taxonomy->ranks);
        cached_taxonomy = taxonomy;
    }

    if ((cached_taxonomy != NULL) && (species_rank >= 0))
        return obi_taxo_get_parent_at_rank(taxon, species_rank);

    obi_set_errno(OBI_TAXONOMY_ERROR);
    obidebug(1, "\nError trying to get the species associated with a taxon: No taxonomy defined");
    return NULL;
}
/**
 * Returns the ancestor of a taxon (or the taxon itself) that has the
 * "genus" rank.
 *
 * The taxonomy and the index of its "genus" rank are remembered in static
 * variables between calls. They are refreshed whenever a non-NULL taxonomy
 * different from the remembered one is given. Passing NULL reuses the
 * remembered taxonomy.
 *
 * @param taxon    The taxon where the search starts.
 * @param taxonomy The taxonomy the taxon belongs to, or NULL.
 *
 * @returns The taxon found by obi_taxo_get_parent_at_rank(), or NULL (with
 *          OBI_TAXONOMY_ERROR set) if no taxonomy is remembered or if the
 *          remembered rank index is negative.
 */
ecotx_t* obi_taxo_get_genus(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
{
    static OBIDMS_taxonomy_p cached_taxonomy = NULL;
    static int32_t           genus_rank      = -1;

    bool refresh = (taxonomy != NULL) && (taxonomy != cached_taxonomy);

    if (refresh)
    {
        genus_rank      = rank_index("genus", taxonomy->ranks);
        cached_taxonomy = taxonomy;
    }

    if ((cached_taxonomy != NULL) && (genus_rank >= 0))
        return obi_taxo_get_parent_at_rank(taxon, genus_rank);

    obi_set_errno(OBI_TAXONOMY_ERROR);
    obidebug(1, "\nError trying to get the genus associated with a taxon: No taxonomy defined");
    return NULL;
}
/**
 * Returns the ancestor of a taxon (or the taxon itself) that has the
 * "family" rank.
 *
 * The taxonomy and the index of its "family" rank are remembered in static
 * variables between calls. They are refreshed whenever a non-NULL taxonomy
 * different from the remembered one is given. Passing NULL reuses the
 * remembered taxonomy.
 *
 * @param taxon    The taxon where the search starts.
 * @param taxonomy The taxonomy the taxon belongs to, or NULL.
 *
 * @returns The taxon found by obi_taxo_get_parent_at_rank(), or NULL (with
 *          OBI_TAXONOMY_ERROR set) if no taxonomy is remembered or if the
 *          remembered rank index is negative.
 */
ecotx_t* obi_taxo_get_family(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
{
    static OBIDMS_taxonomy_p cached_taxonomy = NULL;
    static int32_t           family_rank     = -1;

    bool refresh = (taxonomy != NULL) && (taxonomy != cached_taxonomy);

    if (refresh)
    {
        family_rank     = rank_index("family", taxonomy->ranks);
        cached_taxonomy = taxonomy;
    }

    if ((cached_taxonomy != NULL) && (family_rank >= 0))
        return obi_taxo_get_parent_at_rank(taxon, family_rank);

    obi_set_errno(OBI_TAXONOMY_ERROR);
    obidebug(1, "\nError trying to get the family associated with a taxon: No taxonomy defined");
    return NULL;
}
/**
 * Returns the ancestor of a taxon (or the taxon itself) that has the
 * "kingdom" rank.
 *
 * The taxonomy and the index of its "kingdom" rank are remembered in static
 * variables between calls. They are refreshed whenever a non-NULL taxonomy
 * different from the remembered one is given. Passing NULL reuses the
 * remembered taxonomy.
 *
 * @param taxon    The taxon where the search starts.
 * @param taxonomy The taxonomy the taxon belongs to, or NULL.
 *
 * @returns The taxon found by obi_taxo_get_parent_at_rank(), or NULL (with
 *          OBI_TAXONOMY_ERROR set) if no taxonomy is remembered or if the
 *          remembered rank index is negative.
 */
ecotx_t* obi_taxo_get_kingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
{
    static OBIDMS_taxonomy_p cached_taxonomy = NULL;
    static int32_t           kingdom_rank    = -1;

    bool refresh = (taxonomy != NULL) && (taxonomy != cached_taxonomy);

    if (refresh)
    {
        kingdom_rank    = rank_index("kingdom", taxonomy->ranks);
        cached_taxonomy = taxonomy;
    }

    if ((cached_taxonomy != NULL) && (kingdom_rank >= 0))
        return obi_taxo_get_parent_at_rank(taxon, kingdom_rank);

    obi_set_errno(OBI_TAXONOMY_ERROR);
    obidebug(1, "\nError trying to get the kingdom associated with a taxon: No taxonomy defined");
    return NULL;
}
/**
 * Returns the ancestor of a taxon (or the taxon itself) that has the
 * "superkingdom" rank.
 *
 * The taxonomy and the index of its "superkingdom" rank are remembered in
 * static variables between calls. They are refreshed whenever a non-NULL
 * taxonomy different from the remembered one is given. Passing NULL reuses
 * the remembered taxonomy.
 *
 * @param taxon    The taxon where the search starts.
 * @param taxonomy The taxonomy the taxon belongs to, or NULL.
 *
 * @returns The taxon found by obi_taxo_get_parent_at_rank(), or NULL (with
 *          OBI_TAXONOMY_ERROR set) if no taxonomy is remembered or if the
 *          remembered rank index is negative.
 */
ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
{
    static OBIDMS_taxonomy_p cached_taxonomy   = NULL;
    static int32_t           superkingdom_rank = -1;

    bool refresh = (taxonomy != NULL) && (taxonomy != cached_taxonomy);

    if (refresh)
    {
        superkingdom_rank = rank_index("superkingdom", taxonomy->ranks);
        cached_taxonomy   = taxonomy;
    }

    if ((cached_taxonomy != NULL) && (superkingdom_rank >= 0))
        return obi_taxo_get_parent_at_rank(taxon, superkingdom_rank);

    obi_set_errno(OBI_TAXONOMY_ERROR);
    obidebug(1, "\nError trying to get the superkingdom associated with a taxon: No taxonomy defined");
    return NULL;
}