2206 lines
56 KiB
C
2206 lines
56 KiB
C
/********************************************************************
|
|
* OBIDMS taxonomy functions *
|
|
********************************************************************/
|
|
|
|
/**
|
|
* @file obidms_taxonomy.c
|
|
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
|
* @date March 2nd 2016
|
|
* @brief Functions for reading binary taxonomy files.
|
|
*/
|
|
|
|
|
|
#include <stdlib.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <stdbool.h>
|
|
#include <fcntl.h>
|
|
#include <sys/stat.h>
|
|
#include <sys/mman.h>
|
|
#include <unistd.h>
|
|
|
|
#include "obidms_taxonomy.h"
|
|
#include "obidms.h"
|
|
#include "obidebug.h"
|
|
#include "obierrno.h"
|
|
#include "utils.h"
|
|
|
|
|
|
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
|
|
|
|
|
|
/**
 * @brief bsearch() comparator matching a rank label against one slot of a label array.
 *
 * @param label1 The searched key: a NUL-terminated string.
 * @param label2 The address of an array element, i.e. a pointer to a `char*`.
 *
 * @returns The strcmp() result of the key against the string stored in the slot.
 */
int cmp_rank_labels(const void* label1, const void* label2)
{
    const char*        key  = label1;
    const char* const* slot = label2;   // The array element is itself a string pointer

    return strcmp(key, *slot);
}
|
|
|
|
|
|
static int cmp_taxids(const void* ptaxid, const void* ptaxon)
|
|
{
|
|
ecotx_t* current_taxon = (ecotx_t*) ptaxon;
|
|
int32_t taxid = (int32_t) ((size_t) ptaxid);
|
|
return taxid - current_taxon->taxid;
|
|
}
|
|
|
|
|
|
/**
 * @brief Comparator for arrays of strings (each element being a `char*`).
 *
 * @param s1 The address of the first string pointer.
 * @param s2 The address of the second string pointer.
 *
 * @returns The strcmp() result of the two pointed strings.
 */
static int cmp_str(const void* s1, const void* s2)
{
    char* const* first  = s1;
    char* const* second = s2;

    return strcmp(*first, *second);
}
|
|
|
|
|
|
static int cmp_names(const void* n1, const void* n2)
|
|
{
|
|
econame_t name1 = *((econame_t*)n1);
|
|
econame_t name2 = *((econame_t*)n2);
|
|
|
|
return strcmp(name1.name, name2.name);
|
|
}
|
|
|
|
|
|
/**
 * @brief Builds the path of the directory of a taxonomy stored in a DMS.
 *
 * The path is "<taxonomy directory of the DMS>/<tax_name>".
 *
 * @param dms      The DMS the taxonomy belongs to.
 * @param tax_name The name of the taxonomy.
 *
 * @returns A newly allocated string that the caller has to free.
 * @retval NULL if an error occurred.
 */
char* get_taxonomy_path(OBIDMS_p dms, const char* tax_name)
{
    char* all_tax_dir_path;
    char* tax_path;

    all_tax_dir_path = obi_dms_get_full_path(dms, TAXONOMY_DIR_NAME);
    if (all_tax_dir_path == NULL)
        return NULL;

    // +2: one byte for the '/' separator and one for the terminating NUL
    tax_path = (char*) malloc((strlen(all_tax_dir_path) + strlen(tax_name) + 2)*sizeof(char));
    if (tax_path == NULL)
    {
        obi_set_errno(OBI_MALLOC_ERROR);
        obidebug(1, "\nError allocating memory for taxonomy path");
        free(all_tax_dir_path);
        return NULL;
    }

    if (sprintf(tax_path, "%s/%s", all_tax_dir_path, tax_name) < 0)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError building taxonomy path");
        free(all_tax_dir_path);
        free(tax_path);     // The half-built path is not returned, so it must be released here
        return NULL;
    }

    free(all_tax_dir_path);

    return tax_path;
}
|
|
|
|
|
|
/**
 * @brief Looks for a rank label in a rank index using a binary search.
 *
 * The binary search is only meaningful if the labels are sorted in strcmp() order.
 *
 * @param label The rank label to look for.
 * @param ranks The rank index to search.
 *
 * @returns The position of the label in `ranks->label`.
 * @retval -1 if the label is not found.
 */
int32_t rank_index(const char* label, ecorankidx_t* ranks)
{
    char** found = bsearch(label, ranks->label, ranks->count, sizeof(char*), cmp_rank_labels);

    if (found == NULL)
        return -1;

    // The distance from the start of the array is the index of the label
    return found - ranks->label;
}
|
|
|
|
|
|
void* read_ecorecord(FILE* f, int32_t* record_size)
|
|
{
|
|
static void* buffer = NULL;
|
|
int32_t buffer_size = 0;
|
|
int32_t read;
|
|
|
|
if (!record_size)
|
|
{
|
|
obi_set_errno(OBI_TAXONOMY_ERROR);
|
|
obidebug(1, "\nError reading a taxonomy file: record_size can not be NULL");
|
|
return NULL;
|
|
}
|
|
|
|
read = fread(record_size,
|
|
sizeof(int32_t),
|
|
1,
|
|
f);
|
|
|
|
if (feof(f))
|
|
{
|
|
obi_set_errno(OBI_TAXONOMY_ERROR);
|
|
obidebug(1, "\nError reading a taxonomy file: reached end of file");
|
|
return NULL;
|
|
}
|
|
|
|
if (read != 1)
|
|
{
|
|
obi_set_errno(OBI_TAXONOMY_ERROR);
|
|
obidebug(1, "\nError reading a taxonomy file: error reading record size");
|
|
return NULL;
|
|
}
|
|
|
|
if (buffer_size < *record_size)
|
|
{
|
|
if (buffer)
|
|
buffer = realloc(buffer, *record_size);
|
|
else
|
|
buffer = malloc(*record_size);
|
|
if (buffer == NULL)
|
|
{
|
|
obi_set_errno(OBI_MALLOC_ERROR);
|
|
obidebug(1, "\nError reading a taxonomy file: error allocating memory");
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
read = fread(buffer,
|
|
*record_size,
|
|
1,
|
|
f);
|
|
|
|
if (read != 1)
|
|
{
|
|
obi_set_errno(OBI_TAXONOMY_ERROR);
|
|
obidebug(1, "\nError reading a taxonomy file: error reading a record %d, %d", read, *record_size);
|
|
free(buffer);
|
|
return NULL;
|
|
}
|
|
|
|
return buffer;
|
|
};
|
|
|
|
|
|
/**
 * @brief Reads the next taxon record of a binary taxonomy file and fills a taxon structure with it.
 *
 * The `parent` field is filled with the parent index read in the file, cast to a pointer:
 * it has to be converted to a real pointer by the caller.
 * The `farest` field is set to -1 and the `name` field is a newly allocated copy of the name.
 *
 * @param f     The file to read from.
 * @param taxon The taxon structure to fill.
 *
 * @returns The `taxon` pointer.
 * @retval NULL if an error occurred.
 */
ecotx_t* readnext_ecotaxon(FILE* f, ecotx_t* taxon)
{
    ecotxformat_t* raw;
    int32_t        record_length;

    raw = read_ecorecord(f, &record_length);
    if (!raw)
        return NULL;

    // The name is stored inside the record, so it can never be longer than the record itself
    if ((raw->name_length < 0) || (raw->name_length > record_length))
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError reading a taxonomy file: invalid taxon name length");
        return NULL;
    }

    taxon->parent = (ecotx_t*) ((size_t) raw->parent);
    taxon->taxid  = raw->taxid;
    taxon->rank   = raw->rank;
    taxon->farest = -1;

    taxon->name = malloc((raw->name_length+1) * sizeof(char));
    if (taxon->name == NULL)
    {
        obi_set_errno(OBI_MALLOC_ERROR);
        obidebug(1, "\nError reading a taxonomy file: error allocating memory");
        return NULL;
    }

    strncpy(taxon->name, raw->name, raw->name_length);
    taxon->name[raw->name_length] = 0;  // TODO note: this line is probably missing in ROBITaxonomy and source of a bug

    return taxon;
}
|
|
|
|
|
|
FILE* open_ecorecorddb(const char* file_name,
|
|
int32_t* count,
|
|
int32_t abort_on_open_error)
|
|
{
|
|
FILE* f;
|
|
int32_t read;
|
|
|
|
f = fopen(file_name, "rb");
|
|
|
|
if (!f)
|
|
{
|
|
if (abort_on_open_error)
|
|
{
|
|
obi_set_errno(OBI_TAXONOMY_ERROR);
|
|
obidebug(1, "\nCouldn't open a taxonomy file");
|
|
fclose(f);
|
|
return NULL;
|
|
}
|
|
else
|
|
{
|
|
*count = 0;
|
|
fclose(f);
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
read = fread(count,
|
|
sizeof(int32_t),
|
|
1,
|
|
f);
|
|
|
|
if (read != 1)
|
|
{
|
|
obi_set_errno(OBI_TAXONOMY_ERROR);
|
|
obidebug(1, "\nError reading taxonomy record size");
|
|
fclose(f);
|
|
return NULL;
|
|
}
|
|
|
|
return f;
|
|
}
|
|
|
|
|
|
ecorankidx_t* read_rankidx(const char* ranks_file_name)
|
|
{
|
|
int32_t count;
|
|
FILE* ranks_file;
|
|
ecorankidx_t* ranks_index;
|
|
int32_t i;
|
|
int32_t rank_length;
|
|
char* buffer;
|
|
|
|
ranks_file = open_ecorecorddb(ranks_file_name, &count, 0);
|
|
if (ranks_file==NULL)
|
|
return NULL;
|
|
|
|
ranks_index = (ecorankidx_t*) malloc(sizeof(ecorankidx_t) + sizeof(char*) * (count-1));
|
|
if (ranks_index == NULL)
|
|
{
|
|
obi_set_errno(OBI_MALLOC_ERROR);
|
|
obidebug(1, "\nError allocating memory for taxonomy rank structure");
|
|
fclose(ranks_file);
|
|
return NULL;
|
|
}
|
|
|
|
ranks_index->count = count;
|
|
|
|
for (i=0; i < count; i++)
|
|
{
|
|
buffer = read_ecorecord(ranks_file, &rank_length);
|
|
if (buffer == NULL)
|
|
{
|
|
obi_set_errno(OBI_TAXONOMY_ERROR);
|
|
obidebug(1, "\nError reading a value in a taxonomy file");
|
|
fclose(ranks_file);
|
|
free(ranks_index);
|
|
return NULL;
|
|
}
|
|
ranks_index->label[i] = (char*) malloc(rank_length+1);
|
|
if (ranks_index->label[i] == NULL)
|
|
{
|
|
obi_set_errno(OBI_MALLOC_ERROR);
|
|
obidebug(1, "\nError allocating memory for taxonomy rank label");
|
|
fclose(ranks_file);
|
|
free(ranks_index);
|
|
free(buffer);
|
|
return NULL;
|
|
}
|
|
strncpy(ranks_index->label[i], buffer, rank_length);
|
|
(ranks_index->label[i])[rank_length] = 0;
|
|
}
|
|
|
|
fclose(ranks_file);
|
|
|
|
return ranks_index;
|
|
}
|
|
|
|
|
|
ecotxidx_t* read_taxonomyidx(const char* taxa_file_name, const char* local_taxa_file_name)
|
|
{
|
|
int32_t count_taxa;
|
|
int32_t count_local_taxa;
|
|
FILE* f_taxa;
|
|
FILE* f_local_taxa;
|
|
ecotxidx_t* taxa_index;
|
|
struct ecotxnode* t;
|
|
int32_t i;
|
|
int32_t j;
|
|
|
|
f_taxa = open_ecorecorddb(taxa_file_name, &count_taxa, 1);
|
|
if (f_taxa == NULL)
|
|
{
|
|
obidebug(1, "\nError reading taxonomy taxa file");
|
|
return NULL;
|
|
}
|
|
|
|
f_local_taxa = open_ecorecorddb(local_taxa_file_name, &count_local_taxa, 0);
|
|
|
|
taxa_index = (ecotxidx_t*) malloc(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count_taxa + count_local_taxa - 1));
|
|
if (taxa_index == NULL)
|
|
{
|
|
obi_set_errno(OBI_MALLOC_ERROR);
|
|
obidebug(1, "\nError allocating memory for taxonomy structure");
|
|
fclose(f_taxa);
|
|
fclose(f_local_taxa);
|
|
return NULL;
|
|
}
|
|
|
|
taxa_index->count = count_taxa + count_local_taxa;
|
|
taxa_index->ncbi_count = count_taxa;
|
|
taxa_index->local_count = count_local_taxa;
|
|
taxa_index->buffer_size = taxa_index->count;
|
|
|
|
taxa_index->max_taxid = 0;
|
|
printf("Reading %d taxa...\n", count_taxa);
|
|
for (i=0; i<count_taxa; i++)
|
|
{
|
|
readnext_ecotaxon(f_taxa, &(taxa_index->taxon[i]));
|
|
taxa_index->taxon[i].parent = taxa_index->taxon + (size_t) taxa_index->taxon[i].parent;
|
|
taxa_index->taxon[i].parent->farest = 0;
|
|
if (taxa_index->taxon[i].taxid > taxa_index->max_taxid)
|
|
taxa_index->max_taxid = taxa_index->taxon[i].taxid;
|
|
}
|
|
|
|
if (count_local_taxa > 0)
|
|
printf("Reading %d local taxa...\n", count_local_taxa);
|
|
else
|
|
printf("No local taxa\n");
|
|
|
|
count_taxa = taxa_index->count;
|
|
|
|
for (; i < count_taxa; i++){
|
|
readnext_ecotaxon(f_local_taxa, &(taxa_index->taxon[i]));
|
|
taxa_index->taxon[i].idx = i;
|
|
taxa_index->taxon[i].parent = taxa_index->taxon + (size_t) taxa_index->taxon[i].parent;
|
|
taxa_index->taxon[i].parent->farest=0;
|
|
if (taxa_index->taxon[i].taxid > taxa_index->max_taxid)
|
|
taxa_index->max_taxid = taxa_index->taxon[i].taxid;
|
|
}
|
|
|
|
for (i=0; i < count_taxa; i++)
|
|
{
|
|
t = taxa_index->taxon+i;
|
|
if (t->farest == -1)
|
|
{
|
|
t->farest=0;
|
|
while (t->parent != t)
|
|
{
|
|
j = t->farest + 1;
|
|
if (j > t->parent->farest)
|
|
{
|
|
t->parent->farest = j;
|
|
t=t->parent;
|
|
}
|
|
else
|
|
t = taxa_index->taxon;
|
|
}
|
|
}
|
|
}
|
|
|
|
fclose(f_taxa);
|
|
if (f_local_taxa != NULL)
|
|
fclose(f_local_taxa);
|
|
|
|
return taxa_index;
|
|
}
|
|
|
|
|
|
/**
 * @brief Reads the next name record of a binary name file and fills a name structure with it.
 *
 * In a record, the name is directly followed by the class name in the `names` field.
 * Both are copied in newly allocated NUL-terminated strings.
 * The `taxon` field is set to point on the taxon whose index is stored in the record.
 *
 * @param f        The file to read from.
 * @param name     The name structure to fill.
 * @param taxonomy The taxonomy, with its taxa already read, that the name refers to.
 *
 * @returns The `name` pointer.
 * @retval NULL if an error occurred.
 */
econame_t* readnext_econame(FILE* f, econame_t* name, OBIDMS_taxonomy_p taxonomy)
{
    econameformat_t* raw;
    int32_t          record_length;

    // The returned buffer belongs to read_ecorecord() and must not be freed here
    raw = read_ecorecord(f, &record_length);
    if (raw == NULL)
        return NULL;

    // The name and the class name are both stored inside the record,
    // and the taxon index has to refer to an existing taxon
    if ((raw->name_length < 0) || (raw->name_length > record_length) ||
        (raw->class_length < 0) || (raw->class_length > (record_length - raw->name_length)) ||
        (raw->taxid < 0) || (raw->taxid >= (taxonomy->taxa)->count))
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError reading a taxon name: invalid record");
        return NULL;
    }

    name->is_scientific_name = raw->is_scientific_name;

    name->name = malloc((raw->name_length + 1) * sizeof(char));
    if (name->name == NULL)
    {
        obi_set_errno(OBI_MALLOC_ERROR);
        obidebug(1, "\nError allocating memory for a taxon name");
        return NULL;
    }
    strncpy(name->name, raw->names, raw->name_length);
    name->name[raw->name_length] = 0;

    name->class_name = malloc((raw->class_length+1) * sizeof(char));
    if (name->class_name == NULL)
    {
        obi_set_errno(OBI_MALLOC_ERROR);
        obidebug(1, "\nError allocating memory for a taxon class name");
        free(name->name);
        return NULL;
    }
    strncpy(name->class_name,(raw->names + raw->name_length), raw->class_length);
    name->class_name[raw->class_length] = 0;

    name->taxon = taxonomy->taxa->taxon + raw->taxid;

    return name;
}
|
|
|
|
|
|
/**
 * @brief Reads a binary name file and builds the corresponding name index.
 *
 * @param file_name The path of the binary name file.
 * @param taxonomy  The taxonomy, with its taxa already read, that the names refer to.
 *
 * @returns A newly allocated name index.
 * @retval NULL if the file could not be opened or if an error occurred.
 */
econameidx_t* read_nameidx(const char *file_name, OBIDMS_taxonomy_p taxonomy)
{
    int32_t       count;
    FILE*         f;
    econameidx_t* index_names;
    int32_t       i;
    int32_t       j;

    f = open_ecorecorddb(file_name, &count, 0);
    if (f == NULL)
    {
        obidebug(1, "\nError reading taxonomy name file");
        return NULL;
    }

    // The structure already contains room for one name, hence count-1
    index_names = (econameidx_t*) malloc(sizeof(econameidx_t) + sizeof(econame_t) * (count-1));
    if (index_names == NULL)
    {
        obi_set_errno(OBI_MALLOC_ERROR);
        obidebug(1, "\nError reading taxonomy name file");
        fclose(f);
        return NULL;
    }

    index_names->count = count;

    for (i=0; i < count; i++)
    {
        // The failure is reported by the return value (the address of the slot itself is never NULL)
        if (readnext_econame(f, (index_names->names)+i, taxonomy) == NULL)
        {
            obi_set_errno(OBI_TAXONOMY_ERROR);
            obidebug(1, "\nError reading taxonomy name file");
            // Only the names before index i have been completely read
            for (j=0; j < i; j++)
            {
                free(index_names->names[j].name);
                free(index_names->names[j].class_name);
            }
            free(index_names);
            fclose(f);
            return NULL;
        }
    }

    fclose(f);

    return index_names;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Functions to write taxonomy structure to binary files
|
|
|
|
/**
 * @brief Writes the rank index of a taxonomy to the binary file "<taxonomy_name>.rdx"
 *        in the directory of the taxonomy.
 *
 * The file contains the rank count (int32_t) followed, for each rank, by the length
 * of its label (int32_t) and the label itself without terminating NUL.
 * The file must not already exist.
 *
 * @param dms           The DMS the taxonomy is written in.
 * @param tax           The taxonomy whose ranks are written.
 * @param taxonomy_name The name of the taxonomy.
 *
 * @retval 0 if the operation was successfully completed.
 * @retval -1 if an error occurred.
 */
int write_rankidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name)   // TODO prefix in taxonomy struct?
{
    int     i;
    char*   file_name;
    int     file_descriptor;
    off_t   file_size;
    char*   taxonomy_path;
    int32_t length;

    // Compute file size
    file_size = sizeof(int32_t);    // To store rank count
    for (i=0; i < (tax->ranks)->count; i++)
    {
        file_size = file_size + sizeof(int32_t);                   // To store label size
        file_size = file_size + strlen(((tax->ranks)->label)[i]);  // To store label
    }

    // Build the taxonomy directory path
    taxonomy_path = get_taxonomy_path(dms, taxonomy_name);
    if (taxonomy_path == NULL)
        return -1;

    // +6: '/' separator, 4 characters for the ".rdx" extension and the terminating NUL
    file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 6)*sizeof(char));
    if (file_name == NULL)
    {
        obi_set_errno(OBI_MALLOC_ERROR);
        obidebug(1, "\nError allocating the memory for a binary taxonomy file name");
        free(taxonomy_path);
        return -1;
    }

    // Build the file path
    if (sprintf(file_name, "%s/%s.rdx", taxonomy_path, taxonomy_name) < 0)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError building a binary taxonomy file name");
        free(taxonomy_path);
        free(file_name);
        return -1;
    }

    free(taxonomy_path);

    // Create file
    file_descriptor = open(file_name, O_RDWR | O_CREAT | O_EXCL, 0777);
    if (file_descriptor < 0)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError creating a binary taxonomy file %s", file_name);
        free(file_name);
        return -1;
    }

    free(file_name);

    // Truncate the file to the right size
    if (ftruncate(file_descriptor, file_size) < 0)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError truncating a binary taxonomy file");
        close(file_descriptor);
        return -1;
    }

    // Write rank count
    if (write(file_descriptor, &((tax->ranks)->count), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
        goto write_error;

    // Write ranks
    for (i=0; i < (tax->ranks)->count; i++)
    {
        length = strlen(((tax->ranks)->label)[i]);

        // Write rank size
        if (write(file_descriptor, &length, sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
            goto write_error;

        // Write rank label
        if (write(file_descriptor, ((tax->ranks)->label)[i], length) < ((ssize_t) length))
            goto write_error;
    }

    // Close file
    if (close(file_descriptor) < 0)
    {
        obi_set_errno(OBIDMS_UNKNOWN_ERROR);
        obidebug(1, "\nError closing a DMS information file");
        return -1;
    }

    return 0;

write_error:
    obi_set_errno(OBI_TAXONOMY_ERROR);
    obidebug(1, "\nError writing in a binary taxonomy file");
    close(file_descriptor);
    return -1;
}
|
|
|
|
|
|
/**
 * @brief Writes the first `ncbi_count` taxa of a taxonomy to the binary file "<taxonomy_name>.tdx"
 *        in the directory of the taxonomy.
 *
 * The file contains the record count (int32_t) followed by one record per taxon.
 * A record is made of: the record size (int32_t, not counting itself), the taxid, the rank index,
 * the index of the parent taxon, the length of the name (all int32_t), and the name itself
 * without terminating NUL.
 * The file must not already exist.
 *
 * @param dms           The DMS the taxonomy is written in.
 * @param tax           The taxonomy whose taxa are written.
 * @param taxonomy_name The name of the taxonomy.
 *
 * @retval 0 if the operation was successfully completed.
 * @retval -1 if an error occurred.
 */
int write_taxonomyidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name)   // TODO prefix in taxonomy struct? keep argument but if NULL, use the one in struct?
{
    int     i;
    char*   file_name;
    int     file_descriptor;
    off_t   file_size;
    char*   taxonomy_path;
    int32_t name_length;
    int32_t record_size;

    // Compute file size
    file_size = sizeof(int32_t);    // To store record count
    for (i=0; i < (tax->taxa)->ncbi_count; i++)
    {
        file_size = file_size + sizeof(int32_t) * 5;                // To store record size, taxid, rank index, parent index, and name length
        file_size = file_size + strlen(tax->taxa->taxon[i].name);   // To store name
    }

    // Build the taxonomy directory path
    taxonomy_path = get_taxonomy_path(dms, taxonomy_name);
    if (taxonomy_path == NULL)
        return -1;

    // +6: '/' separator, 4 characters for the ".tdx" extension and the terminating NUL
    file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 6)*sizeof(char));
    if (file_name == NULL)
    {
        obi_set_errno(OBI_MALLOC_ERROR);
        obidebug(1, "\nError allocating the memory for a binary taxonomy file name");
        free(taxonomy_path);
        return -1;
    }

    // Build the file path
    if (sprintf(file_name, "%s/%s.tdx", taxonomy_path, taxonomy_name) < 0)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError building a binary taxonomy file name");
        free(taxonomy_path);
        free(file_name);
        return -1;
    }

    free(taxonomy_path);

    // Create file
    file_descriptor = open(file_name, O_RDWR | O_CREAT | O_EXCL, 0777);
    if (file_descriptor < 0)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError creating a binary taxonomy file");
        free(file_name);
        return -1;
    }

    free(file_name);

    // Truncate the file to the right size
    if (ftruncate(file_descriptor, file_size) < 0)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError truncating a binary taxonomy file");
        close(file_descriptor);
        return -1;
    }

    // Write record count
    if (write(file_descriptor, &(tax->taxa->ncbi_count), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
        goto write_error;

    // Write records
    for (i=0; i < (tax->taxa)->ncbi_count; i++)
    {
        name_length = strlen(tax->taxa->taxon[i].name);
        // The record size does not include the integer storing the record size itself
        record_size = 4*sizeof(int32_t) + name_length;

        // Write record size
        if (write(file_descriptor, &record_size, sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
            goto write_error;

        // Write taxid
        if (write(file_descriptor, &(tax->taxa->taxon[i].taxid), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
            goto write_error;

        // Write rank index
        if (write(file_descriptor, &(tax->taxa->taxon[i].rank), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
            goto write_error;

        // Write parent index
        if (write(file_descriptor, &((tax->taxa->taxon[i].parent)->idx), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
            goto write_error;

        // Write name length
        if (write(file_descriptor, &name_length, sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
            goto write_error;

        // Write name
        if (write(file_descriptor, tax->taxa->taxon[i].name, name_length) < ((ssize_t) name_length))
            goto write_error;
    }

    // Close file
    if (close(file_descriptor) < 0)
    {
        obi_set_errno(OBIDMS_UNKNOWN_ERROR);
        obidebug(1, "\nError closing a DMS information file");
        return -1;
    }

    return 0;

write_error:
    obi_set_errno(OBI_TAXONOMY_ERROR);
    obidebug(1, "\nError writing in a binary taxonomy file");
    close(file_descriptor);
    return -1;
}
|
|
|
|
|
|
/**
 * @brief Writes the local taxa of a taxonomy (the taxa stored after the first `ncbi_count` ones)
 *        to the binary file "<taxonomy_name>.ldx" in the directory of the taxonomy.
 *
 * The file contains the record count (int32_t) followed by one record per local taxon.
 * A record is made of: the record size (int32_t, not counting itself), the taxid, the rank index,
 * the index of the parent taxon, the length of the name (all int32_t), and the name itself
 * without terminating NUL.
 * Unlike the other binary taxonomy files, the file is opened without O_EXCL, so an already
 * existing file is reused and resized.
 *
 * @param dms           The DMS the taxonomy is written in.
 * @param tax           The taxonomy whose local taxa are written.
 * @param taxonomy_name The name of the taxonomy.
 *
 * @retval 0 if the operation was successfully completed.
 * @retval -1 if an error occurred.
 */
int write_local_taxonomy_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name)    // TODO prefix in taxonomy struct? keep argument but if NULL, use the one in struct?
{
    int     i;
    char*   file_name;
    int     file_descriptor;
    off_t   file_size;
    char*   taxonomy_path;
    int32_t name_length;
    int32_t record_size;

    // Compute file size
    file_size = sizeof(int32_t);    // To store record count
    for (i=(tax->taxa)->ncbi_count; i < (tax->taxa)->count; i++)
    {
        file_size = file_size + sizeof(int32_t) * 5;                // To store record size, taxid, rank index, parent index, and name length
        file_size = file_size + strlen(tax->taxa->taxon[i].name);   // To store name
    }

    // Build the taxonomy directory path
    taxonomy_path = get_taxonomy_path(dms, taxonomy_name);
    if (taxonomy_path == NULL)
        return -1;

    // +6: '/' separator, 4 characters for the ".ldx" extension and the terminating NUL
    file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 6)*sizeof(char));
    if (file_name == NULL)
    {
        obi_set_errno(OBI_MALLOC_ERROR);
        obidebug(1, "\nError allocating the memory for a binary taxonomy file name");
        free(taxonomy_path);
        return -1;
    }

    // Build the file path
    if (sprintf(file_name, "%s/%s.ldx", taxonomy_path, taxonomy_name) < 0)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError building a binary taxonomy file name");
        free(taxonomy_path);
        free(file_name);
        return -1;
    }

    free(taxonomy_path);

    // Create file
    file_descriptor = open(file_name, O_RDWR | O_CREAT, 0777);
    if (file_descriptor < 0)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError creating a binary taxonomy file");
        free(file_name);
        return -1;
    }

    free(file_name);

    // Truncate the file to the right size
    if (ftruncate(file_descriptor, file_size) < 0)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError truncating a binary taxonomy file");
        close(file_descriptor);
        return -1;
    }

    // Write record count
    if (write(file_descriptor, &((tax->taxa)->local_count), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
        goto write_error;

    // Write records
    for (i=(tax->taxa)->ncbi_count; i < (tax->taxa)->count; i++)
    {
        name_length = strlen(tax->taxa->taxon[i].name);
        // The record size does not include the integer storing the record size itself
        record_size = 4*sizeof(int32_t) + name_length;

        // Write record size
        if (write(file_descriptor, &record_size, sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
            goto write_error;

        // Write taxid
        if (write(file_descriptor, &(tax->taxa->taxon[i].taxid), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
            goto write_error;

        // Write rank index
        if (write(file_descriptor, &(tax->taxa->taxon[i].rank), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
            goto write_error;

        // Write parent index
        if (write(file_descriptor, &((tax->taxa->taxon[i].parent)->idx), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
            goto write_error;

        // Write name length
        if (write(file_descriptor, &name_length, sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
            goto write_error;

        // Write name
        if (write(file_descriptor, tax->taxa->taxon[i].name, name_length) < ((ssize_t) name_length))
            goto write_error;
    }

    // Close file
    if (close(file_descriptor) < 0)
    {
        obi_set_errno(OBIDMS_UNKNOWN_ERROR);
        obidebug(1, "\nError closing a DMS information file");
        return -1;
    }

    return 0;

write_error:
    obi_set_errno(OBI_TAXONOMY_ERROR);
    obidebug(1, "\nError writing in a binary taxonomy file");
    close(file_descriptor);
    return -1;
}
|
|
|
|
|
|
/**
 * @brief Writes the name index of a taxonomy to the binary file "<taxonomy_name>.ndx"
 *        in the directory of the taxonomy.
 *
 * The file contains the record count (int32_t) followed by one record per name.
 * A record is made of: the record size (int32_t, not counting itself), the scientific name flag,
 * the length of the name, the length of the class name, the index of the taxon (each written
 * as sizeof(int32_t) bytes), the name and the class name, both without terminating NUL.
 * The file must not already exist.
 *
 * @param dms           The DMS the taxonomy is written in.
 * @param tax           The taxonomy whose names are written.
 * @param taxonomy_name The name of the taxonomy.
 *
 * @retval 0 if the operation was successfully completed.
 * @retval -1 if an error occurred.
 */
int write_nameidx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name)   // TODO prefix in taxonomy struct? keep argument but if NULL, use the one in struct?
{
    int     i;
    char*   file_name;
    int     file_descriptor;
    off_t   file_size;
    char*   taxonomy_path;
    int32_t name_length;
    int32_t class_length;
    int32_t record_size;

    // Compute file size
    file_size = sizeof(int32_t);    // To store record count
    for (i=0; i < (tax->names)->count; i++)
    {
        file_size = file_size + sizeof(int32_t) * 5;                        // To store record size, scientific name flag, name length, class length, and taxon index
        file_size = file_size + strlen(tax->names->names[i].name);          // To store name
        file_size = file_size + strlen(tax->names->names[i].class_name);    // To store class name
    }

    // Build the taxonomy directory path
    taxonomy_path = get_taxonomy_path(dms, taxonomy_name);
    if (taxonomy_path == NULL)
        return -1;

    // +6: '/' separator, 4 characters for the ".ndx" extension and the terminating NUL
    file_name = (char*) malloc((strlen(taxonomy_path) + strlen(taxonomy_name) + 6)*sizeof(char));
    if (file_name == NULL)
    {
        obi_set_errno(OBI_MALLOC_ERROR);
        obidebug(1, "\nError allocating the memory for a binary taxonomy file name");
        free(taxonomy_path);
        return -1;
    }

    // Build the file path
    if (sprintf(file_name, "%s/%s.ndx", taxonomy_path, taxonomy_name) < 0)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError building a binary taxonomy file name");
        free(taxonomy_path);
        free(file_name);
        return -1;
    }

    free(taxonomy_path);

    // Create file
    file_descriptor = open(file_name, O_RDWR | O_CREAT | O_EXCL, 0777);
    if (file_descriptor < 0)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError creating a binary taxonomy file");
        free(file_name);
        return -1;
    }

    free(file_name);

    // Truncate the file to the right size
    if (ftruncate(file_descriptor, file_size) < 0)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError truncating a binary taxonomy file");
        close(file_descriptor);
        return -1;
    }

    // Write record count
    if (write(file_descriptor, &(tax->names->count), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
        goto write_error;

    // Write records
    for (i=0; i < tax->names->count; i++)
    {
        name_length = strlen(tax->names->names[i].name);
        class_length = strlen(tax->names->names[i].class_name);
        // The record size does not include the integer storing the record size itself
        record_size = 4*sizeof(int32_t) + name_length + class_length;

        // Write record size
        if (write(file_descriptor, &record_size, sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
            goto write_error;

        // Write if the name is a scientific name
        if (write(file_descriptor, &(tax->names->names[i].is_scientific_name), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
            goto write_error;

        // Write name length
        if (write(file_descriptor, &name_length, sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
            goto write_error;

        // Write class length
        if (write(file_descriptor, &class_length, sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
            goto write_error;

        // Write taxid index
        if (write(file_descriptor, &(tax->names->names[i].taxon->idx), sizeof(int32_t)) < ((ssize_t) sizeof(int32_t)))
            goto write_error;

        // Write name
        if (write(file_descriptor, tax->names->names[i].name, name_length) < ((ssize_t) name_length))
            goto write_error;

        // Write class
        if (write(file_descriptor, tax->names->names[i].class_name, class_length) < ((ssize_t) class_length))
            goto write_error;
    }

    // Close file
    if (close(file_descriptor) < 0)
    {
        obi_set_errno(OBI_TAXONOMY_ERROR);
        obidebug(1, "\nError closing a DMS information file");
        return -1;
    }

    return 0;

write_error:
    obi_set_errno(OBI_TAXONOMY_ERROR);
    obidebug(1, "\nError writing in a binary taxonomy file");
    close(file_descriptor);
    return -1;
}
|
|
|
|
|
|
|
|
/**
 * @brief Writes a taxonomy to binary files in a new directory of a DMS.
 *
 * The directory of the taxonomy is created first: the function fails if it already exists.
 * The rank, taxa and name files are then written, as well as the local taxa file
 * if the taxonomy contains local taxa.
 * The files written before a failure are left in place.
 *
 * @param dms      The DMS the taxonomy is written in.
 * @param tax      The taxonomy to write.
 * @param tax_name The name of the taxonomy.
 *
 * @retval 0 if the operation was successfully completed.
 * @retval -1 if an error occurred.
 */
int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name)
{
    char* taxonomy_path;

    // Build the taxonomy directory path
    taxonomy_path = get_taxonomy_path(dms, tax_name);
    if (taxonomy_path == NULL)
        return -1;

    // Try to create the directory
    if (mkdir(taxonomy_path, 00777) < 0)
    {
        // Report the failure through the OBI error code like the other failure paths do
        obi_set_errno(OBI_TAXONOMY_ERROR);
        if (errno == EEXIST)
            obidebug(1, "\nA taxonomy already exists with this name.");
        obidebug(1, "\nProblem creating a new taxonomy directory");
        free(taxonomy_path);
        return -1;
    }

    free(taxonomy_path);

    if (write_rankidx(dms, tax, tax_name) < 0)
        return -1;
    if (write_taxonomyidx(dms, tax, tax_name) < 0)
        return -1;
    if (write_nameidx(dms, tax, tax_name) < 0)
        return -1;
    // Check if there are local taxa (if so last taxon is local)
    if ((tax->taxa)->local_count > 0)
        if (write_local_taxonomy_idx(dms, tax, tax_name) < 0)
            return -1;

    return 0;
}
|
|
|
|
|
|
OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump)
|
|
{
|
|
OBIDMS_taxonomy_p tax;
|
|
struct dirent* dp;
|
|
DIR* tax_dir;
|
|
FILE* file;
|
|
bool nodes_found=false;
|
|
bool names_found=false;
|
|
char line[2048]; // TODO large enough?
|
|
char* elt;
|
|
char* file_name;
|
|
int buffer_size;
|
|
int i, j;
|
|
int n;
|
|
char** rank_names;
|
|
int* parent_taxids;
|
|
int taxid;
|
|
bool already_in;
|
|
ecotx_t* t;
|
|
|
|
// Initialize taxonomy structure
|
|
tax = (OBIDMS_taxonomy_p) malloc(sizeof(OBIDMS_taxonomy_t));
|
|
if (tax == NULL)
|
|
{
|
|
obi_set_errno(OBI_MALLOC_ERROR);
|
|
obidebug(1, "\nError allocating the memory for a taxonomy structure");
|
|
return NULL;
|
|
}
|
|
tax->ranks = NULL;
|
|
tax->taxa = NULL;
|
|
tax->names = NULL;
|
|
|
|
tax->dms = NULL;
|
|
(tax->tax_name)[0] = '\0';
|
|
|
|
// TODO check if taxdump path is for a gz file to unzip or a directory
|
|
|
|
tax_dir = opendir(taxdump);
|
|
if (tax_dir == NULL)
|
|
{
|
|
obi_set_errno(OBI_TAXONOMY_ERROR);
|
|
obidebug(1, "\nProblem opening a taxdump directory");
|
|
free(tax);
|
|
return NULL;
|
|
}
|
|
|
|
// Go through taxonomy files
|
|
while ((dp = readdir(tax_dir)) != NULL)
|
|
{
|
|
if (strcmp(dp->d_name, "nodes.dmp") == 0)
|
|
{
|
|
nodes_found = true;
|
|
buffer_size = 10000;
|
|
|
|
// Initializing the taxa structure
|
|
tax->taxa = (ecotxidx_t*) malloc(sizeof(ecotxidx_t) + sizeof(ecotx_t) * buffer_size);
|
|
if (tax->taxa == NULL)
|
|
{
|
|
obi_set_errno(OBI_MALLOC_ERROR);
|
|
obidebug(1, "\nError allocating the memory for a taxonomy structure");
|
|
free(tax);
|
|
closedir(tax_dir);
|
|
return NULL;
|
|
}
|
|
|
|
// Initialize rank names and parent taxids arrays
|
|
parent_taxids = malloc(buffer_size * sizeof(int));
|
|
if (file_name == NULL)
|
|
{
|
|
obi_set_errno(OBI_MALLOC_ERROR);
|
|
obidebug(1, "\nError allocating the memory for a file name");
|
|
obi_close_taxonomy(tax);
|
|
closedir(tax_dir);
|
|
return NULL;
|
|
}
|
|
|
|
rank_names = malloc(buffer_size * sizeof(char*));
|
|
if (file_name == NULL)
|
|
{
|
|
obi_set_errno(OBI_MALLOC_ERROR);
|
|
obidebug(1, "\nError allocating the memory for a file name");
|
|
obi_close_taxonomy(tax);
|
|
free(parent_taxids);
|
|
closedir(tax_dir);
|
|
return NULL;
|
|
}
|
|
|
|
// Allocating the memory for the file name
|
|
file_name = (char*) malloc((strlen(taxdump) + 10)*sizeof(char));
|
|
if (file_name == NULL)
|
|
{
|
|
obi_set_errno(OBI_MALLOC_ERROR);
|
|
obidebug(1, "\nError allocating the memory for a file name");
|
|
obi_close_taxonomy(tax);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
closedir(tax_dir);
|
|
return NULL;
|
|
}
|
|
|
|
// Build the file path
|
|
if (sprintf(file_name, "%s/nodes.dmp", taxdump) < 0)
|
|
{
|
|
obi_set_errno(OBI_TAXONOMY_ERROR);
|
|
obidebug(1, "\nError building a taxonomy file name");
|
|
obi_close_taxonomy(tax);
|
|
closedir(tax_dir);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
free(file_name);
|
|
return NULL;
|
|
}
|
|
|
|
file = fopen(file_name, "r");
|
|
if (file == NULL)
|
|
{
|
|
obi_set_errno(OBI_TAXONOMY_ERROR);
|
|
obidebug(1, "\nProblem opening a taxonomy file");
|
|
obi_close_taxonomy(tax);
|
|
closedir(tax_dir);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
free(file_name);
|
|
return NULL;
|
|
}
|
|
|
|
free(file_name);
|
|
|
|
(tax->taxa)->max_taxid = 0;
|
|
n = 0;
|
|
while (fgets(line, sizeof(line), file))
|
|
{
|
|
// Enlarge structures if needed
|
|
if (n == buffer_size)
|
|
{
|
|
buffer_size = buffer_size * 2;
|
|
|
|
tax->taxa = (ecotxidx_t*) realloc(tax->taxa, sizeof(ecotxidx_t) + sizeof(ecotx_t) * buffer_size);
|
|
if (tax->taxa == NULL)
|
|
{
|
|
obi_set_errno(OBI_MALLOC_ERROR);
|
|
obidebug(1, "\nError reallocating memory for a taxonomy structure");
|
|
obi_close_taxonomy(tax);
|
|
fclose(file);
|
|
closedir(tax_dir);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
return NULL;
|
|
}
|
|
|
|
parent_taxids = (int*) realloc(parent_taxids, sizeof(int) * buffer_size);
|
|
if (parent_taxids == NULL)
|
|
{
|
|
obi_set_errno(OBI_MALLOC_ERROR);
|
|
obidebug(1, "\nError reallocating memory for a taxonomy structure");
|
|
obi_close_taxonomy(tax);
|
|
fclose(file);
|
|
closedir(tax_dir);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
return NULL;
|
|
}
|
|
|
|
rank_names = (char**) realloc(rank_names, sizeof(char*) * buffer_size);
|
|
if (rank_names == NULL)
|
|
{
|
|
obi_set_errno(OBI_MALLOC_ERROR);
|
|
obidebug(1, "\nError reallocating memory for a taxonomy structure");
|
|
obi_close_taxonomy(tax);
|
|
fclose(file);
|
|
closedir(tax_dir);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
// Check for terminal '\n' character (line complete)
|
|
if (line[strlen(line) - 1] != '\n')
|
|
{
|
|
obi_set_errno(OBI_TAXONOMY_ERROR);
|
|
obidebug(1, "\nError: line buffer size not large enough for line in taxonomy file");
|
|
obi_close_taxonomy(tax);
|
|
fclose(file);
|
|
closedir(tax_dir);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
return NULL;
|
|
}
|
|
|
|
(tax->taxa)->taxon[n].idx = n;
|
|
|
|
// Parse 3 first elements separated by '|'
|
|
|
|
elt = strtok(line, "|");
|
|
|
|
// Remove the last character (tab character)
|
|
elt[strlen(elt)-1] = '\0';
|
|
|
|
// First element: taxid
|
|
(tax->taxa)->taxon[n].taxid = atoi(elt);
|
|
|
|
// Update max taxid
|
|
if ((tax->taxa)->taxon[n].taxid > (tax->taxa)->max_taxid)
|
|
(tax->taxa)->max_taxid = (tax->taxa)->taxon[n].taxid;
|
|
|
|
// Initialize farest taxid value
|
|
(tax->taxa)->taxon[n].farest = -1;
|
|
|
|
i = 1;
|
|
while (i < 3)
|
|
{
|
|
elt = strtok(NULL, "|");
|
|
|
|
// Remove the first and the last characters (tab characters)
|
|
elt = elt+1;
|
|
elt[strlen(elt)-1] = '\0';
|
|
|
|
if (i == 1)
|
|
parent_taxids[n] = atoi(elt);
|
|
else if (i == 2)
|
|
{
|
|
rank_names[n] = (char*) malloc((strlen(elt)+1) * sizeof(char));
|
|
if (rank_names[n] == NULL)
|
|
{
|
|
obi_set_errno(OBI_MALLOC_ERROR);
|
|
obidebug(1, "\nError allocating memory for taxon rank name");
|
|
obi_close_taxonomy(tax);
|
|
fclose(file);
|
|
closedir(tax_dir);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
return NULL;
|
|
}
|
|
strcpy(rank_names[n], elt);
|
|
}
|
|
i++;
|
|
}
|
|
n++;
|
|
}
|
|
|
|
// Check that fgets stopped because it reached EOF
|
|
if (!feof(file))
|
|
{
|
|
obi_set_errno(OBI_TAXONOMY_ERROR);
|
|
obidebug(1, "\nError: file reading was stopped before end of file");
|
|
obi_close_taxonomy(tax);
|
|
fclose(file);
|
|
closedir(tax_dir);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
return NULL;
|
|
}
|
|
|
|
// Store count
|
|
(tax->taxa)->count = n;
|
|
(tax->taxa)->ncbi_count = n;
|
|
(tax->taxa)->local_count = 0;
|
|
|
|
// Truncate the structure memory to the right size
|
|
tax->taxa = (ecotxidx_t*) realloc(tax->taxa, sizeof(ecotxidx_t) + sizeof(ecotx_t) * (tax->taxa)->count);
|
|
if (tax->taxa == NULL)
|
|
{
|
|
obi_set_errno(OBI_MALLOC_ERROR);
|
|
obidebug(1, "\nError reallocating memory for taxonomy structure");
|
|
obi_close_taxonomy(tax);
|
|
fclose(file);
|
|
closedir(tax_dir);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
return NULL;
|
|
}
|
|
|
|
fclose(file);
|
|
}
|
|
}
|
|
closedir(tax_dir);
|
|
|
|
|
|
// Go through directory again for next file
|
|
tax_dir = opendir(taxdump);
|
|
if (tax_dir == NULL)
|
|
{
|
|
obi_set_errno(OBI_TAXONOMY_ERROR);
|
|
obidebug(1, "\nProblem opening a taxdump directory");
|
|
obi_close_taxonomy(tax);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
return NULL;
|
|
}
|
|
|
|
// Go through taxonomy files
|
|
while ((dp = readdir(tax_dir)) != NULL)
|
|
{
|
|
if (strcmp(dp->d_name, "names.dmp") == 0)
|
|
{
|
|
names_found = true;
|
|
buffer_size = 10000;
|
|
|
|
// Initializing the names structure
|
|
tax->names = (econameidx_t*) malloc(sizeof(econameidx_t) + sizeof(econame_t) * buffer_size);
|
|
if (tax->names == NULL)
|
|
{
|
|
obi_set_errno(OBI_MALLOC_ERROR);
|
|
obidebug(1, "\nError allocating the memory for a taxonomy structure");
|
|
obi_close_taxonomy(tax);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
closedir(tax_dir);
|
|
return NULL;
|
|
}
|
|
|
|
// Allocating the memory for the file name
|
|
file_name = (char*) malloc((strlen(taxdump) + 10)*sizeof(char));
|
|
if (file_name == NULL)
|
|
{
|
|
obi_set_errno(OBI_MALLOC_ERROR);
|
|
obidebug(1, "\nError allocating the memory for a file name");
|
|
obi_close_taxonomy(tax);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
closedir(tax_dir);
|
|
return NULL;
|
|
}
|
|
|
|
// Build the file path
|
|
if (sprintf(file_name, "%s/names.dmp", taxdump) < 0)
|
|
{
|
|
obi_set_errno(OBI_TAXONOMY_ERROR);
|
|
obidebug(1, "\nError building a taxonomy file name");
|
|
obi_close_taxonomy(tax);
|
|
closedir(tax_dir);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
free(file_name);
|
|
return NULL;
|
|
}
|
|
|
|
file = fopen(file_name, "r");
|
|
if (file == NULL)
|
|
{
|
|
obi_set_errno(OBI_TAXONOMY_ERROR);
|
|
obidebug(1, "\nProblem opening a taxonomy file");
|
|
obi_close_taxonomy(tax);
|
|
closedir(tax_dir);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
free(file_name);
|
|
return NULL;
|
|
}
|
|
|
|
free(file_name);
|
|
|
|
n = 0;
|
|
j = 0;
|
|
while (fgets(line, sizeof(line), file))
|
|
{
|
|
// Enlarge structures if needed
|
|
if (n == buffer_size)
|
|
{
|
|
buffer_size = buffer_size * 2;
|
|
tax->names = (econameidx_t*) realloc(tax->names, sizeof(econameidx_t) + sizeof(econame_t) * buffer_size);
|
|
if (tax->names == NULL)
|
|
{
|
|
obi_set_errno(OBI_MALLOC_ERROR);
|
|
obidebug(1, "\nError reallocating memory for a taxonomy structure");
|
|
obi_close_taxonomy(tax);
|
|
fclose(file);
|
|
closedir(tax_dir);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
// Check for terminal '\n' character (line complete)
|
|
if (line[strlen(line) - 1] != '\n')
|
|
{
|
|
obi_set_errno(OBI_TAXONOMY_ERROR);
|
|
obidebug(1, "\nError: line buffer size not large enough for line in taxonomy file");
|
|
obi_close_taxonomy(tax);
|
|
fclose(file);
|
|
closedir(tax_dir);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
return NULL;
|
|
}
|
|
|
|
// Parse 4 first elements separated by '|'
|
|
|
|
elt = strtok(line, "|");
|
|
|
|
// Remove the last character (tab character)
|
|
elt[strlen(elt)-1] = '\0';
|
|
|
|
// First element: taxid
|
|
taxid = atoi(elt);
|
|
// Find taxid in taxa structure and store pointer in names structure
|
|
i = j;
|
|
while ((i < (tax->taxa)->count) && ((tax->taxa)->taxon[i].taxid != taxid))
|
|
i++;
|
|
if (i == (tax->taxa)->count)
|
|
{
|
|
obi_set_errno(OBI_TAXONOMY_ERROR);
|
|
obidebug(1, "\nError: could not find taxon associated to name when reading taxdump");
|
|
obi_close_taxonomy(tax);
|
|
fclose(file);
|
|
closedir(tax_dir);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
return NULL;
|
|
}
|
|
j = i; // Because there are several names by taxon but they are in the same order
|
|
(tax->names)->names[n].taxon = ((tax->taxa)->taxon)+i;
|
|
|
|
i = 1;
|
|
while (i < 4)
|
|
{
|
|
elt = strtok(NULL, "|");
|
|
|
|
// Remove the first and the last characters (tab characters)
|
|
elt = elt+1;
|
|
elt[strlen(elt)-1] = '\0';
|
|
|
|
if (i == 1) // Name
|
|
{
|
|
(tax->names)->names[n].name = (char*) malloc((strlen(elt) + 1) * sizeof(char));
|
|
if ((tax->names)->names[n].name == NULL)
|
|
{
|
|
obi_set_errno(OBI_MALLOC_ERROR);
|
|
obidebug(1, "\nError allocating memory for a taxon name");
|
|
obi_close_taxonomy(tax);
|
|
fclose(file);
|
|
closedir(tax_dir);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
return NULL;
|
|
}
|
|
strcpy((tax->names)->names[n].name, elt);
|
|
}
|
|
else if (i == 3) // Class name
|
|
{
|
|
(tax->names)->names[n].class_name = (char*) malloc((strlen(elt) + 1) * sizeof(char));
|
|
if ((tax->names)->names[n].class_name == NULL)
|
|
{
|
|
obi_set_errno(OBI_MALLOC_ERROR);
|
|
obidebug(1, "\nError allocating memory for a taxon class name");
|
|
obi_close_taxonomy(tax);
|
|
fclose(file);
|
|
closedir(tax_dir);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
return NULL;
|
|
}
|
|
strcpy((tax->names)->names[n].class_name, elt);
|
|
if (strcmp(elt, "scientific name") == 0)
|
|
{
|
|
(tax->names)->names[n].is_scientific_name = 1;
|
|
}
|
|
else
|
|
(tax->names)->names[n].is_scientific_name = 0;
|
|
}
|
|
i++;
|
|
}
|
|
n++;
|
|
}
|
|
|
|
// Check that fgets stopped because it reached EOF
|
|
if (!feof(file))
|
|
{
|
|
obi_set_errno(OBI_TAXONOMY_ERROR);
|
|
obidebug(1, "\nError: file reading was stopped before end of file");
|
|
obi_close_taxonomy(tax);
|
|
fclose(file);
|
|
closedir(tax_dir);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
return NULL;
|
|
}
|
|
|
|
// Store count
|
|
(tax->names)->count = n;
|
|
|
|
// Truncate the structure memory to the right size
|
|
tax->names = (econameidx_t*) realloc(tax->names, sizeof(econameidx_t) + sizeof(econame_t) * (tax->names)->count);
|
|
if (tax->names == NULL)
|
|
{
|
|
obi_set_errno(OBI_MALLOC_ERROR);
|
|
obidebug(1, "\nError reallocating memory for a a taxonomy structure");
|
|
obi_close_taxonomy(tax);
|
|
fclose(file);
|
|
closedir(tax_dir);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
return NULL;
|
|
}
|
|
|
|
fclose(file);
|
|
}
|
|
}
|
|
closedir(tax_dir);
|
|
|
|
if (!nodes_found)
|
|
{
|
|
obi_set_errno(OBI_TAXONOMY_ERROR);
|
|
obidebug(1, "\nProblem reading taxdump: nodes.dmp file not found");
|
|
obi_close_taxonomy(tax);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
return NULL;
|
|
}
|
|
if (!names_found)
|
|
{
|
|
obi_set_errno(OBI_TAXONOMY_ERROR);
|
|
obidebug(1, "\nProblem reading taxdump: names.dmp file not found");
|
|
obi_close_taxonomy(tax);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
return NULL;
|
|
}
|
|
|
|
// Go through data to fill the taxonomy structure
|
|
|
|
// Build rank list
|
|
|
|
// Initialize rank structure
|
|
buffer_size = 10;
|
|
tax->ranks = (ecorankidx_t*) malloc(sizeof(ecorankidx_t) + sizeof(char*) * buffer_size);
|
|
if (tax->ranks == NULL)
|
|
{
|
|
obi_set_errno(OBI_MALLOC_ERROR);
|
|
obidebug(1, "\nError allocating memory for taxon rank array");
|
|
obi_close_taxonomy(tax);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
return NULL;
|
|
}
|
|
(tax->ranks)->count = 0;
|
|
for (i=0; i < (tax->taxa)->count; i++)
|
|
{
|
|
already_in = false;
|
|
for (j=0; j < (tax->ranks)->count; j++)
|
|
{
|
|
if (strcmp(rank_names[i], ((tax->ranks)->label)[j]) == 0)
|
|
{
|
|
already_in = true;
|
|
break;
|
|
}
|
|
}
|
|
if (!already_in)
|
|
{
|
|
// Realloc rank structure if needed
|
|
if ((tax->ranks)->count == buffer_size)
|
|
{
|
|
buffer_size = buffer_size + 10;
|
|
tax->ranks = (ecorankidx_t*) realloc(tax->ranks, sizeof(ecorankidx_t) + sizeof(char*) * buffer_size);
|
|
if (tax->ranks == NULL)
|
|
{
|
|
obi_set_errno(OBI_MALLOC_ERROR);
|
|
obidebug(1, "\nError reallocating memory for taxon ranks");
|
|
obi_close_taxonomy(tax);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
// Store new rank
|
|
((tax->ranks)->label)[(tax->ranks)->count] = (char*) malloc((strlen(rank_names[i]) + 1) * sizeof(char));
|
|
if (((tax->ranks)->label)[(tax->ranks)->count] == NULL)
|
|
{
|
|
obi_set_errno(OBI_MALLOC_ERROR);
|
|
obidebug(1, "\nError allocating memory for taxon rank names");
|
|
obi_close_taxonomy(tax);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
return NULL;
|
|
}
|
|
strcpy(((tax->ranks)->label)[(tax->ranks)->count], rank_names[i]);
|
|
((tax->ranks)->count)++;
|
|
}
|
|
}
|
|
|
|
// Truncate to the number of ranks recorded
|
|
tax->ranks = (ecorankidx_t*) realloc(tax->ranks, sizeof(ecorankidx_t) + sizeof(char*) * (tax->ranks)->count);
|
|
if (tax->ranks == NULL)
|
|
{
|
|
obi_set_errno(OBI_MALLOC_ERROR);
|
|
obidebug(1, "\nError reallocating memory for taxon ranks");
|
|
obi_close_taxonomy(tax);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
return NULL;
|
|
}
|
|
|
|
// Sort in alphabetical order
|
|
qsort((tax->ranks)->label, (tax->ranks)->count, sizeof(char*), cmp_str);
|
|
|
|
// Associate the taxa with their rank indices
|
|
for (i=0; i < (tax->taxa)->count; i++)
|
|
{
|
|
for (j=0; j < (tax->ranks)->count; j++)
|
|
{
|
|
if (strcmp(rank_names[i], ((tax->ranks)->label)[j]) == 0)
|
|
{
|
|
((tax->taxa)->taxon)[i].rank = j;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Associate the taxa with their scientific name
|
|
for (i=0; i < (tax->names)->count; i++)
|
|
{
|
|
if ((tax->names)->names[i].is_scientific_name)
|
|
{
|
|
((tax->names)->names[i].taxon)->name = (char*) malloc((strlen((((tax->names)->names)[i]).name) + 1) * sizeof(char));
|
|
if (((tax->names)->names[i].taxon)->name == NULL)
|
|
{
|
|
obi_set_errno(OBI_MALLOC_ERROR);
|
|
obidebug(1, "\nError reallocating memory for taxon ranks");
|
|
obi_close_taxonomy(tax);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
return NULL;
|
|
}
|
|
strcpy(((tax->names)->names[i].taxon)->name, (((tax->names)->names)[i]).name);
|
|
}
|
|
}
|
|
|
|
// Sort names in alphabetical order
|
|
qsort((tax->names)->names, (tax->names)->count, sizeof(econame_t), cmp_names);
|
|
|
|
// Associate the taxa with their parent
|
|
for (i=0; i < (tax->taxa)->count; i++)
|
|
{
|
|
((tax->taxa)->taxon)[i].parent = obi_taxo_get_taxon_with_taxid(tax, parent_taxids[i]);
|
|
if (((tax->taxa)->taxon)[i].parent == NULL)
|
|
{
|
|
obi_set_errno(OBI_TAXONOMY_ERROR);
|
|
obidebug(1, "\nError: taxon parent not found");
|
|
obi_close_taxonomy(tax);
|
|
free(parent_taxids);
|
|
free(rank_names);
|
|
return NULL;
|
|
}
|
|
(((tax->taxa)->taxon)[i].parent)->farest = 0;
|
|
}
|
|
|
|
(tax->taxa)->buffer_size = (tax->taxa)->count;
|
|
|
|
// Compute longest branches TODO what is this for???
|
|
for (i=0; i < (tax->taxa)->count; i++)
|
|
{
|
|
t = (((tax->taxa))->taxon)+i;
|
|
if (t->farest == -1)
|
|
{
|
|
t->farest=0;
|
|
while (t->parent != t)
|
|
{
|
|
j = t->farest + 1;
|
|
if (j > t->parent->farest)
|
|
{
|
|
t->parent->farest = j;
|
|
t=t->parent;
|
|
}
|
|
else
|
|
t = (tax->taxa)->taxon;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Freeing
|
|
free(parent_taxids);
|
|
for (i=0; i < (tax->taxa)->count; i++)
|
|
free(rank_names[i]);
|
|
free(rank_names);
|
|
|
|
return tax;
|
|
}
|
|
|
|
|
|
/**
 * Adds a new local taxon to a taxonomy held in memory.
 *
 * The new taxon receives a taxid that is at least MIN_LOCAL_TAXID, at least
 * min_taxid, and strictly greater than the current maximum taxid, so the taxon
 * array stays ordered by taxid. Its name is also registered in the names index
 * as a "scientific name" when that index is loaded (tax->names != NULL).
 *
 * Everything that can fail (rank lookup, parent lookup, memory allocations) is
 * done before the taxonomy is modified, so that a failure leaves the taxonomy
 * as it was (the names block may have been grown by one unused slot).
 *
 * @param tax          The taxonomy to which the taxon is added.
 * @param name         The (scientific) name of the new taxon.
 * @param rank_name    The label of the rank of the new taxon; it must already exist in tax->ranks.
 * @param parent_taxid The taxid of the parent; it must already exist in the taxonomy.
 * @param min_taxid    The minimum taxid wanted for the new taxon.
 *
 * @returns The taxid given to the new taxon, or -1 if an error occurred
 *          (the error code is set with obi_set_errno()).
 */
int obi_taxonomy_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const char* rank_name, int32_t parent_taxid, int32_t min_taxid)
{
	ecotxidx_t*   old_taxa;
	ecotxidx_t*   new_taxa;
	econameidx_t* new_names;
	ecotx_t*      parent;
	ecotx_t*      taxon;
	econame_t*    name_struct;
	char*         taxon_name = NULL;
	char*         name_copy = NULL;
	char*         class_name_copy = NULL;
	size_t        name_size;
	size_t        parent_idx;
	int32_t       taxid;
	int32_t       rank;
	int           old_count;
	int           i;

	if ((tax == NULL) || (tax->taxa == NULL) || (tax->ranks == NULL) || (name == NULL) || (rank_name == NULL))
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError: invalid arguments when adding a new taxon");
		return -1;
	}

	// Find the rank index		// TODO Discuss possibility of creating rank if doesn't exist
	rank = -1;
	for (i=0; i < (tax->ranks)->count; i++)
	{
		if (strcmp(rank_name, ((tax->ranks)->label)[i]) == 0)
		{
			rank = i;
			break;
		}
	}
	if (rank == -1)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError: taxon rank not found when adding a new taxon");
		return -1;
	}

	// Find the parent and remember it by index, because the taxon array is moved below
	parent = obi_taxo_get_taxon_with_taxid(tax, parent_taxid);
	if (parent == NULL)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError: taxon parent not found when adding a new taxon");
		return -1;
	}
	parent_idx = (size_t) (parent - (tax->taxa)->taxon);

	// Compute new taxid that must be equal or greater than MIN_LOCAL_TAXID and greater than the maximum taxid existing in the taxonomy
	if (min_taxid < MIN_LOCAL_TAXID)
		min_taxid = MIN_LOCAL_TAXID;
	if (min_taxid > (tax->taxa)->max_taxid)
		taxid = min_taxid;
	else
		taxid = ((tax->taxa)->max_taxid) + 1;

	// Allocate all the string copies up front
	name_size = strlen(name) + 1;
	taxon_name = (char*) malloc(name_size * sizeof(char));
	if (taxon_name == NULL)
	{
		obi_set_errno(OBI_MALLOC_ERROR);
		obidebug(1, "\nError allocating memory for a taxon name to add a new taxon");
		return -1;
	}
	if (tax->names != NULL)
	{
		name_copy = (char*) malloc(name_size * sizeof(char));
		if (name_copy == NULL)
		{
			obi_set_errno(OBI_MALLOC_ERROR);
			obidebug(1, "\nError allocating memory for a taxon name to add a new taxon");
			free(taxon_name);
			return -1;
		}
		class_name_copy = (char*) malloc((strlen("scientific name") + 1) * sizeof(char));
		if (class_name_copy == NULL)
		{
			obi_set_errno(OBI_MALLOC_ERROR);
			obidebug(1, "\nError allocating memory for a taxon class name to add a new taxon");
			free(taxon_name);
			free(name_copy);
			return -1;
		}

		// Enlarge the names structure; on failure the original block is kept untouched
		new_names = (econameidx_t*) realloc(tax->names, sizeof(econameidx_t) + sizeof(econame_t) * ((size_t) (tax->names)->count + 1));
		if (new_names == NULL)
		{
			obi_set_errno(OBI_MALLOC_ERROR);
			obidebug(1, "\nError reallocating memory for a taxonomy structure to add a new taxon");
			free(taxon_name);
			free(name_copy);
			free(class_name_copy);
			return -1;
		}
		tax->names = new_names;
	}

	// Enlarge the structure memory for a new taxon.
	// The taxa point to their parent and the names point to their taxon *inside* the taxon array,
	// so the array is copied to a new block while the old one is still valid, the pointers are
	// rebased, and only then the old block is released (realloc() could move the block and leave
	// all those pointers dangling).
	old_taxa = tax->taxa;
	old_count = old_taxa->count;
	new_taxa = (ecotxidx_t*) malloc(sizeof(ecotxidx_t) + sizeof(ecotx_t) * ((size_t) old_count + 1));
	if (new_taxa == NULL)
	{
		obi_set_errno(OBI_MALLOC_ERROR);
		obidebug(1, "\nError reallocating memory for a taxonomy structure to add a new taxon");
		free(taxon_name);
		free(name_copy);
		free(class_name_copy);
		return -1;
	}
	memcpy(new_taxa, old_taxa, sizeof(ecotxidx_t) + sizeof(ecotx_t) * (size_t) old_count);
	for (i=0; i < old_count; i++)
	{
		if (new_taxa->taxon[i].parent != NULL)
			new_taxa->taxon[i].parent = new_taxa->taxon + (old_taxa->taxon[i].parent - old_taxa->taxon);
	}
	if (tax->names != NULL)
	{
		for (i=0; i < (tax->names)->count; i++)
		{
			if ((tax->names)->names[i].taxon != NULL)
				(tax->names)->names[i].taxon = new_taxa->taxon + ((tax->names)->names[i].taxon - old_taxa->taxon);
		}
	}
	tax->taxa = new_taxa;
	free(old_taxa);

	// Nothing can fail from here on: fill the ecotx_t node structure
	strcpy(taxon_name, name);
	taxon = ((tax->taxa)->taxon) + old_count;
	taxon->taxid = taxid;
	taxon->idx = old_count;
	taxon->local = true;
	taxon->name = taxon_name;
	taxon->rank = rank;
	taxon->parent = ((tax->taxa)->taxon) + parent_idx;
	taxon->farest = 0;		// TODO not sure

	// Update taxonomy counts etc
	(tax->taxa)->max_taxid = taxid;
	((tax->taxa)->count)++;
	((tax->taxa)->local_count)++;
	(tax->taxa)->buffer_size = (tax->taxa)->count;

	// Add new name in names structure	// TODO discuss because in OBITools1 the new names were not written in .ndx
	if (tax->names != NULL)
	{
		strcpy(name_copy, name);
		strcpy(class_name_copy, "scientific name");

		name_struct = (tax->names)->names + ((tax->names)->count);
		name_struct->name = name_copy;
		name_struct->class_name = class_name_copy;
		name_struct->is_scientific_name = true;
		name_struct->taxon = taxon;

		// Update name count first so that the new name is included in the sort
		((tax->names)->count)++;

		// Sort names in alphabetical order
		qsort((tax->names)->names, (tax->names)->count, sizeof(econame_t), cmp_names);
	}

	return taxid;
}
|
|
|
|
|
|
/////// PUBLIC /////////
|
|
|
|
|
|
/**
 * Reads a taxonomy stored in a DMS from its binary files.
 *
 * The files read in the taxonomy directory are <taxonomy_name>.rdx (ranks),
 * <taxonomy_name>.tdx and <taxonomy_name>.ldx (taxa and local taxa) and,
 * if asked for, <taxonomy_name>.ndx (alternative names).
 *
 * @param dms                    The DMS containing the taxonomy.
 * @param taxonomy_name          The name of the taxonomy.
 * @param read_alternative_names Whether the names index should be read
 *                               (tax->names is left NULL otherwise).
 *
 * @returns The taxonomy structure, to be closed with obi_close_taxonomy(),
 *          or NULL if an error occurred.
 */
OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, bool read_alternative_names)
{
	OBIDMS_taxonomy_p tax                   = NULL;
	char*             taxonomy_path         = NULL;
	char*             ranks_file_name       = NULL;
	char*             taxa_file_name        = NULL;
	char*             local_taxa_file_name  = NULL;
	char*             alter_names_file_name = NULL;
	int               buffer_size           = 2048;
	int               written;
	int               i;

	if (taxonomy_name == NULL)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError reading a taxonomy: no taxonomy name given");
		return NULL;
	}

	tax = (OBIDMS_taxonomy_p) malloc(sizeof(OBIDMS_taxonomy_t));
	if (tax == NULL)
	{
		obi_set_errno(OBI_MALLOC_ERROR);
		obidebug(1, "\nError allocating memory for a taxonomy structure");
		return NULL;
	}

	tax->ranks = NULL;
	tax->taxa  = NULL;
	tax->names = NULL;

	tax->dms = dms;

	// tax_name is copied in place right after the allocation, so it is a fixed-size
	// array of the structure: refuse names that would overflow it
	if (strlen(taxonomy_name) >= sizeof(tax->tax_name))
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError reading a taxonomy: taxonomy name too long");
		goto error;
	}
	strcpy(tax->tax_name, taxonomy_name);

	taxonomy_path = get_taxonomy_path(dms, taxonomy_name);
	if (taxonomy_path == NULL)
		goto error;

	// Read ranks
	ranks_file_name = (char*) malloc(buffer_size*sizeof(char));
	if (ranks_file_name == NULL)
	{
		obi_set_errno(OBI_MALLOC_ERROR);
		obidebug(1, "\nError allocating memory for ranks file name");
		goto error;
	}
	written = snprintf(ranks_file_name, buffer_size, "%s/%s.rdx", taxonomy_path, taxonomy_name);
	if ((written < 0) || (written >= buffer_size))	// output error or truncated path
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError building ranks file name");
		goto error;
	}
	tax->ranks = read_rankidx(ranks_file_name);
	if (tax->ranks == NULL)
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError reading ranks file");
		goto error;
	}

	// Read taxa
	taxa_file_name = (char*) malloc(buffer_size*sizeof(char));
	if (taxa_file_name == NULL)
	{
		obi_set_errno(OBI_MALLOC_ERROR);
		obidebug(1, "\nError allocating memory for taxa file name");
		goto error;
	}
	written = snprintf(taxa_file_name, buffer_size, "%s/%s.tdx", taxonomy_path, taxonomy_name);
	if ((written < 0) || (written >= buffer_size))
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError building taxa file name");
		goto error;
	}
	local_taxa_file_name = (char*) malloc(buffer_size*sizeof(char));
	if (local_taxa_file_name == NULL)
	{
		obi_set_errno(OBI_MALLOC_ERROR);
		obidebug(1, "\nError allocating memory for taxa file name");
		goto error;
	}
	written = snprintf(local_taxa_file_name, buffer_size, "%s/%s.ldx", taxonomy_path, taxonomy_name);
	if ((written < 0) || (written >= buffer_size))
	{
		obi_set_errno(OBI_TAXONOMY_ERROR);
		obidebug(1, "\nError building local taxa file name");
		goto error;
	}
	tax->taxa = read_taxonomyidx(taxa_file_name, local_taxa_file_name);
	if (tax->taxa == NULL)
		goto error;

	// Read alternative names
	if (read_alternative_names)
	{
		alter_names_file_name = (char*) malloc(buffer_size*sizeof(char));
		if (alter_names_file_name == NULL)
		{
			obi_set_errno(OBI_MALLOC_ERROR);
			obidebug(1, "\nError allocating memory for alternative names file name");
			goto error;
		}
		written = snprintf(alter_names_file_name, buffer_size, "%s/%s.ndx", taxonomy_path, taxonomy_name);
		if ((written < 0) || (written >= buffer_size))
		{
			obi_set_errno(OBI_TAXONOMY_ERROR);
			obidebug(1, "\nError building alternative names file name");
			goto error;
		}
		tax->names = read_nameidx(alter_names_file_name, tax);
		if (tax->names == NULL)
			goto error;
	}

	// Success: only the temporary path buffers are released
	free(taxonomy_path);
	free(ranks_file_name);
	free(taxa_file_name);
	free(local_taxa_file_name);
	free(alter_names_file_name);

	return tax;

error:
	free(taxonomy_path);
	free(ranks_file_name);
	free(taxa_file_name);
	free(local_taxa_file_name);
	free(alter_names_file_name);

	if (tax->taxa != NULL)
		obi_close_taxonomy(tax);
	else
	{
		// obi_close_taxonomy() reads tax->taxa, so a taxonomy whose taxa
		// were not read yet is released here (only the ranks can be set)
		if (tax->ranks != NULL)
		{
			for (i=0; i < (tax->ranks)->count; i++)
				free((tax->ranks)->label[i]);
			free(tax->ranks);
		}
		free(tax);
	}

	return NULL;
}
|
|
|
|
|
|
/**
 * Closes a taxonomy: saves its local taxa if there are any, then frees
 * the ranks, the names, the taxa and the taxonomy structure itself.
 *
 * @param taxonomy The taxonomy to close. NULL is accepted and does nothing.
 *
 * @returns 0 if the operation was successful.
 *          -1 if the local taxa could not be written: when the writing itself
 *          failed, the taxonomy is left untouched (not freed); when no DMS is
 *          associated with the taxonomy, the local taxa can not be saved at all
 *          and the taxonomy is freed anyway.
 */
int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy)
{
	int i;
	int ret_value = 0;

	if (taxonomy == NULL)
		return 0;

	// Update local informations (local taxa and preferred names) if there are any
	if ((taxonomy->taxa != NULL) && ((taxonomy->taxa)->local_count > 0))
	{
		if (taxonomy->dms == NULL)
		{
			// Nowhere to write the local taxa: report it, but still release the memory
			obi_set_errno(OBI_TAXONOMY_ERROR);
			obidebug(1, "\nError closing a taxonomy with local files but no DMS associated (probably read directly from taxdump)");	// TODO discuss
			ret_value = -1;
		}
		else if (write_local_taxonomy_idx(taxonomy->dms, taxonomy, taxonomy->tax_name) < 0)
			return -1;
	}

	if (taxonomy->ranks != NULL)
	{
		for (i=0; i < (taxonomy->ranks)->count; i++)
			free((taxonomy->ranks)->label[i]);
		free(taxonomy->ranks);
	}

	if (taxonomy->names != NULL)
	{
		for (i=0; i < (taxonomy->names)->count; i++)
		{
			free(((taxonomy->names)->names[i]).name);
			free(((taxonomy->names)->names[i]).class_name);
		}
		free(taxonomy->names);
	}

	if (taxonomy->taxa != NULL)
	{
		for (i=0; i < (taxonomy->taxa)->count; i++)
			free(((taxonomy->taxa)->taxon[i]).name);
		free(taxonomy->taxa);
	}

	free(taxonomy);

	return ret_value;
}
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
/**
 * Climbs the taxonomy from a taxon (the taxon itself included) up to the
 * first taxon whose rank index is the one asked for.
 *
 * The root is recognized as the taxon that is its own parent.
 *
 * @param taxon   The taxon from which the search starts.
 * @param rankidx The index of the wanted rank.
 *
 * @returns The first taxon at the wanted rank found on the way to the root,
 *          or NULL if the root was reached without finding one.
 */
ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx)
{
	ecotx_t* node;

	for (node = taxon; node->rank != rankidx; node = node->parent)
	{
		// Root reached, and the root is not at the wanted rank either
		if (node->parent == node)
			return NULL;
	}

	return node;
}
|
|
|
|
|
|
/**
 * Looks for the taxon with a given taxid by binary search in the taxon
 * array of a taxonomy (the array has to be ordered by taxid for this to work).
 *
 * @param taxonomy The taxonomy in which the taxon is searched.
 * @param taxid    The taxid of the wanted taxon.
 *
 * @returns A pointer on the taxon inside the taxon array, or NULL if no
 *          taxon has this taxid.
 */
ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid)
{
	// The taxid itself is carried in the key pointer value; cmp_taxids() converts it back
	const void* key      = (const void *) ((size_t) taxid);
	int32_t     n_taxa   = (taxonomy->taxa)->count;

	return (ecotx_t*) bsearch(key,
							  (const void *) taxonomy->taxa->taxon,
							  n_taxa,
							  sizeof(ecotx_t),
							  cmp_taxids);
}
|
|
|
|
|
|
/**
 * Tells whether a taxon is below the taxon with a given taxid, i.e. whether
 * one of its ancestors has this taxid. The search starts at the parent of the
 * taxon, so the taxon itself is not tested against other_taxid.
 *
 * The climb stops at the root, recognized either as the taxon that is its own
 * parent (as in obi_taxo_get_parent_at_rank()) or as a taxon named "root".
 *
 * @param taxon       The taxon whose ancestors are checked.
 * @param other_taxid The taxid looked for among the ancestors.
 *
 * @returns true if an ancestor of the taxon has the taxid other_taxid,
 *          false otherwise (or if taxon is NULL or has no parent).
 */
bool obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid)
{
	ecotx_t* ancestor;

	if ((taxon == NULL) || (taxon->parent == NULL))
		return false;

	ancestor = taxon->parent;

	while ((ancestor->taxid != other_taxid) &&
		   (ancestor->parent != ancestor) &&	// self-parented root: avoids looping forever if it is not named "root"
		   (ancestor->parent != NULL) &&
		   ((ancestor->name == NULL) || (strcmp(ancestor->name, "root") != 0)))
		ancestor = ancestor->parent;

	return (ancestor->taxid == other_taxid);
}
|
|
|
|
|
|
/**
 * Returns the taxon at the "species" rank found when climbing from a taxon.
 *
 * The index of the "species" rank is kept in static variables together with
 * the taxonomy it was computed for, and is only recomputed when a different
 * non-NULL taxonomy is given. When taxonomy is NULL, the last taxonomy given
 * to this function is used.
 *
 * @param taxon    The taxon from which the search starts.
 * @param taxonomy The taxonomy to use, or NULL to reuse the previous one.
 *
 * @returns The species taxon, or NULL if there is none above the taxon, if no
 *          taxonomy is known yet or if the rank does not exist in the taxonomy.
 */
ecotx_t* obi_taxo_get_species(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
{
	static OBIDMS_taxonomy_p cached_taxonomy = NULL;
	static int32_t           cached_rank_idx = -1;

	// Refresh the cached rank index when a new taxonomy is given
	if ((taxonomy != NULL) && (taxonomy != cached_taxonomy))
	{
		cached_rank_idx = rank_index("species", taxonomy->ranks);
		cached_taxonomy = taxonomy;
	}

	if ((cached_taxonomy != NULL) && (cached_rank_idx >= 0))
		return obi_taxo_get_parent_at_rank(taxon, cached_rank_idx);

	obi_set_errno(OBI_TAXONOMY_ERROR);
	obidebug(1, "\nError trying to get the species associated with a taxon: No taxonomy defined");
	return NULL;
}
|
|
|
|
|
|
/**
 * Returns the taxon at the "genus" rank found when climbing from a taxon.
 *
 * The index of the "genus" rank is kept in static variables together with
 * the taxonomy it was computed for, and is only recomputed when a different
 * non-NULL taxonomy is given. When taxonomy is NULL, the last taxonomy given
 * to this function is used.
 *
 * @param taxon    The taxon from which the search starts.
 * @param taxonomy The taxonomy to use, or NULL to reuse the previous one.
 *
 * @returns The genus taxon, or NULL if there is none above the taxon, if no
 *          taxonomy is known yet or if the rank does not exist in the taxonomy.
 */
ecotx_t* obi_taxo_get_genus(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
{
	static OBIDMS_taxonomy_p cached_taxonomy = NULL;
	static int32_t           cached_rank_idx = -1;

	// Refresh the cached rank index when a new taxonomy is given
	if ((taxonomy != NULL) && (taxonomy != cached_taxonomy))
	{
		cached_rank_idx = rank_index("genus", taxonomy->ranks);
		cached_taxonomy = taxonomy;
	}

	if ((cached_taxonomy != NULL) && (cached_rank_idx >= 0))
		return obi_taxo_get_parent_at_rank(taxon, cached_rank_idx);

	obi_set_errno(OBI_TAXONOMY_ERROR);
	obidebug(1, "\nError trying to get the genus associated with a taxon: No taxonomy defined");
	return NULL;
}
|
|
|
|
|
|
/**
 * Returns the taxon at the "family" rank found when climbing from a taxon.
 *
 * The index of the "family" rank is kept in static variables together with
 * the taxonomy it was computed for, and is only recomputed when a different
 * non-NULL taxonomy is given. When taxonomy is NULL, the last taxonomy given
 * to this function is used.
 *
 * @param taxon    The taxon from which the search starts.
 * @param taxonomy The taxonomy to use, or NULL to reuse the previous one.
 *
 * @returns The family taxon, or NULL if there is none above the taxon, if no
 *          taxonomy is known yet or if the rank does not exist in the taxonomy.
 */
ecotx_t* obi_taxo_get_family(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
{
	static OBIDMS_taxonomy_p cached_taxonomy = NULL;
	static int32_t           cached_rank_idx = -1;

	// Refresh the cached rank index when a new taxonomy is given
	if ((taxonomy != NULL) && (taxonomy != cached_taxonomy))
	{
		cached_rank_idx = rank_index("family", taxonomy->ranks);
		cached_taxonomy = taxonomy;
	}

	if ((cached_taxonomy != NULL) && (cached_rank_idx >= 0))
		return obi_taxo_get_parent_at_rank(taxon, cached_rank_idx);

	obi_set_errno(OBI_TAXONOMY_ERROR);
	obidebug(1, "\nError trying to get the family associated with a taxon: No taxonomy defined");
	return NULL;
}
|
|
|
|
|
|
/**
 * Returns the taxon at the "kingdom" rank found when climbing from a taxon.
 *
 * The index of the "kingdom" rank is kept in static variables together with
 * the taxonomy it was computed for, and is only recomputed when a different
 * non-NULL taxonomy is given. When taxonomy is NULL, the last taxonomy given
 * to this function is used.
 *
 * @param taxon    The taxon from which the search starts.
 * @param taxonomy The taxonomy to use, or NULL to reuse the previous one.
 *
 * @returns The kingdom taxon, or NULL if there is none above the taxon, if no
 *          taxonomy is known yet or if the rank does not exist in the taxonomy.
 */
ecotx_t* obi_taxo_get_kingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
{
	static OBIDMS_taxonomy_p cached_taxonomy = NULL;
	static int32_t           cached_rank_idx = -1;

	// Refresh the cached rank index when a new taxonomy is given
	if ((taxonomy != NULL) && (taxonomy != cached_taxonomy))
	{
		cached_rank_idx = rank_index("kingdom", taxonomy->ranks);
		cached_taxonomy = taxonomy;
	}

	if ((cached_taxonomy != NULL) && (cached_rank_idx >= 0))
		return obi_taxo_get_parent_at_rank(taxon, cached_rank_idx);

	obi_set_errno(OBI_TAXONOMY_ERROR);
	obidebug(1, "\nError trying to get the kingdom associated with a taxon: No taxonomy defined");
	return NULL;
}
|
|
|
|
|
|
/**
 * Returns the taxon at the "superkingdom" rank found when climbing from a taxon.
 *
 * The index of the "superkingdom" rank is kept in static variables together
 * with the taxonomy it was computed for, and is only recomputed when a
 * different non-NULL taxonomy is given. When taxonomy is NULL, the last
 * taxonomy given to this function is used.
 *
 * @param taxon    The taxon from which the search starts.
 * @param taxonomy The taxonomy to use, or NULL to reuse the previous one.
 *
 * @returns The superkingdom taxon, or NULL if there is none above the taxon, if no
 *          taxonomy is known yet or if the rank does not exist in the taxonomy.
 */
ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy)
{
	static OBIDMS_taxonomy_p cached_taxonomy = NULL;
	static int32_t           cached_rank_idx = -1;

	// Refresh the cached rank index when a new taxonomy is given
	if ((taxonomy != NULL) && (taxonomy != cached_taxonomy))
	{
		cached_rank_idx = rank_index("superkingdom", taxonomy->ranks);
		cached_taxonomy = taxonomy;
	}

	if ((cached_taxonomy != NULL) && (cached_rank_idx >= 0))
		return obi_taxo_get_parent_at_rank(taxon, cached_rank_idx);

	obi_set_errno(OBI_TAXONOMY_ERROR);
	obidebug(1, "\nError trying to get the superkingdom associated with a taxon: No taxonomy defined");
	return NULL;
}
|
|
|
|
|
|
|