/******************************************************************** * OBIDMS taxonomy functions * ********************************************************************/ /** * @file obidms_taxonomy.c * @author Celine Mercier (celine.mercier@metabarcoding.org) * @date March 2nd 2016 * @brief Functions for reading binary taxonomy files. */ #include #include #include #include #include #include "obidms_taxonomy.h" #include "obidms.h" #include "obidebug.h" #include "obierrno.h" #include "utils.h" #define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) // TODO : the malloc aren't checked but won't exist for long because mapping instead int compareRankLabel(const void *label1, const void *label2) { return strcmp((const char*)label1,*(const char**)label2); } int32_t rank_index(const char* label, ecorankidx_t* ranks) { char **rep; rep = bsearch(label, ranks->label, ranks->count, sizeof(char*), compareRankLabel); if (rep) return rep-ranks->label; // TODO what??? return -1; } void* read_ecorecord(FILE* f, int32_t* record_size) { static void* buffer = NULL; int32_t buffer_size = 0; int32_t read; if (!record_size) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError reading a taxonomy file: record_size can not be NULL"); return NULL; } read = fread(record_size, 1, sizeof(int32_t), f); if (feof(f)) return NULL; if (read != sizeof(int32_t)) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError reading a taxonomy file: error reading record size"); return NULL; } // if (is_big_endian()) // TODO // *recordSize=swap_int32_t(*recordSize); if (buffer_size < *record_size) { if (buffer) buffer = realloc(buffer, *record_size); else buffer = malloc(*record_size); if (buffer == NULL) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError reading a taxonomy file: error allocating memory"); return NULL; } } read = fread(buffer, 1, *record_size, f); if (read != *record_size) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError reading a taxonomy file: error reading a record %d, %d", read, *record_size); return NULL; } return buffer; }; ecotx_t* readnext_ecotaxon(FILE* f, ecotx_t* taxon) { ecotxformat_t* raw; int32_t record_length; raw = read_ecorecord(f, &record_length); if (!raw) return NULL; // if (is_big_endian()) // TODO // { // raw->namelength = swap_int32_t(raw->namelength); // raw->parent = swap_int32_t(raw->parent); // raw->rank = swap_int32_t(raw->rank); // raw->taxid = swap_int32_t(raw->taxid); // } taxon->parent = (ecotx_t*) ((size_t) raw->parent); taxon->taxid = raw->taxid; taxon->rank = raw->rank; taxon->farest = -1; taxon->name = malloc((raw->name_length+1) * sizeof(char)); strncpy(taxon->name, raw->name, raw->name_length); return taxon; } FILE* open_ecorecorddb(const char* file_name, int32_t* count, int32_t abort_on_open_error) { FILE* f; int32_t read; fprintf(stderr, "\n%s\n", file_name); f = fopen(file_name, "rb"); if (!f) { if (abort_on_open_error) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nCouldn't open a taxonomy file"); return NULL; } else { *count = 0; return NULL; } } read = fread(count, 1, sizeof(int32_t), f); if (read != sizeof(int32_t)) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError reading taxonomy record size"); return NULL; } // if (!obi_is_little_endian()) // TODO // *count = swap_int32_t(*count); return f; } ecorankidx_t* read_rankidx(const char* ranks_file_name) { int32_t count; FILE* ranks_file; ecorankidx_t* ranks_index; int32_t i; int32_t rank_length; char* buffer; ranks_file = open_ecorecorddb(ranks_file_name, &count, 0); if (ranks_file==NULL) return NULL; ranks_index = (ecorankidx_t*) malloc(sizeof(ecorankidx_t) + sizeof(char*) * (count-1)); ranks_index->count = count; for (i=0; i < count; i++) { buffer = read_ecorecord(ranks_file, &rank_length); ranks_index->label[i] = (char*) malloc(rank_length+1); strncpy(ranks_index->label[i], buffer, rank_length); } return ranks_index; } ecotxidx_t* read_taxonomyidx(const char* taxa_file_name, const char* local_taxa_file_name) { int32_t count_taxa; int32_t count_local_taxa; FILE* f_taxa; FILE* f_local_taxa; ecotxidx_t* taxa_index; struct ecotxnode* t; int32_t i; int32_t j; f_taxa = open_ecorecorddb(taxa_file_name, &count_taxa,0); if (f_taxa == NULL) { obidebug(1, "\nError reading taxonomy taxa file"); return NULL; } f_local_taxa = open_ecorecorddb(local_taxa_file_name, &count_local_taxa, 0); taxa_index = (ecotxidx_t*) malloc(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count_taxa + count_local_taxa - 1)); taxa_index->count = count_taxa + count_local_taxa; taxa_index->buffer_size = taxa_index->count; taxa_index->max_taxid = 0; printf("Reading %d taxa...\n", count_taxa); for (i=0; itaxon[i])); taxa_index->taxon[i].parent = taxa_index->taxon + (size_t) taxa_index->taxon[i].parent; taxa_index->taxon[i].parent->farest = 0; if (taxa_index->taxon[i].taxid > taxa_index->max_taxid) taxa_index->max_taxid = taxa_index->taxon[i].taxid; } if (count_local_taxa > 0) printf("Reading %d local taxa...\n", count_local_taxa); else printf("No local taxa\n"); count_taxa = taxa_index->count; for (; i < count_taxa; i++){ readnext_ecotaxon(f_local_taxa, &(taxa_index->taxon[i])); taxa_index->taxon[i].parent = taxa_index->taxon + (size_t) taxa_index->taxon[i].parent; taxa_index->taxon[i].parent->farest=0; if (taxa_index->taxon[i].taxid > taxa_index->max_taxid) taxa_index->max_taxid = taxa_index->taxon[i].taxid; } printf("Computing longest branches...\n"); for (i=0; i < count_taxa; i++) { t = taxa_index->taxon+i; if (t->farest == -1) { t->farest=0; while (t->parent != t) { j = t->farest + 1; if (j > t->parent->farest) { t->parent->farest = j; t=t->parent; } else t = taxa_index->taxon; } } } return taxa_index; } econame_t* readnext_econame(FILE* f, econame_t* name, OBIDMS_taxonomy_p taxonomy) { econameformat_t* raw; int32_t record_length; raw = read_ecorecord(f, &record_length); if (!raw) return NULL; // if (is_big_endian()) // TODO // { // raw->is_scientificname = swap_int32_t(raw->is_scientificname); // raw->namelength = swap_int32_t(raw->namelength); // raw->classlength = swap_int32_t(raw->classlength); // raw->taxid = swap_int32_t(raw->taxid); // } name->is_scientific_name = raw->is_scientific_name; name->name = malloc((raw->name_length + 1) * sizeof(char)); strncpy(name->name, raw->names, raw->name_length); name->name[raw->name_length] = 0; name->class_name = malloc((raw->class_length+1) * sizeof(char)); strncpy(name->class_name,(raw->names + raw->name_length), raw->class_length); name->class_name[raw->class_length] = 0; name->taxon = taxonomy->taxa->taxon + raw->taxid; return name; } econameidx_t* read_nameidx(const char *file_name, OBIDMS_taxonomy_p taxonomy) { int32_t count; FILE* f; econameidx_t* index_names; int32_t i; f = open_ecorecorddb(file_name, &count, 0); if (f == NULL) return NULL; index_names = (econameidx_t*) malloc(sizeof(econameidx_t) + sizeof(econame_t) * (count-1)); index_names->count = count; for (i=0; i < count; i++) readnext_econame(f, (index_names->names)+i, taxonomy); return index_names; } static int bcomptaxon (const void* ptaxid, const void* ptaxon) { ecotx_t* current_taxon = (ecotx_t*) ptaxon; int32_t taxid = (int32_t) ((size_t) ptaxid); return taxid - current_taxon->taxid; } /////// PUBLIC ///////// OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, bool read_alternative_names) { OBIDMS_taxonomy_p tax; char* main_taxonomy_dir_path; char* taxonomy_path; char* ranks_file_name; char* taxa_file_name; char* local_taxa_file_name; char* alter_names_file_name; int buffer_size; tax = (OBIDMS_taxonomy_p) malloc(sizeof(OBIDMS_taxonomy_t)); tax->ranks = NULL; tax->taxa = NULL; tax->names = NULL; buffer_size = 2048; // TODO main_taxonomy_dir_path = obi_dms_get_full_path(dms, TAXONOMY_DIR_NAME); taxonomy_path = (char*) malloc((strlen(main_taxonomy_dir_path) + strlen(taxonomy_name) + strlen(taxonomy_name) + 3)*sizeof(char)); if (sprintf(taxonomy_path, "%s/%s/%s", main_taxonomy_dir_path, taxonomy_name, taxonomy_name) < 0) { free(main_taxonomy_dir_path); obi_close_taxonomy(tax); return NULL; } free(main_taxonomy_dir_path); // Read ranks ranks_file_name = (char*) malloc(buffer_size*sizeof(char)); if (ranks_file_name == NULL) { free(taxonomy_path); obi_close_taxonomy(tax); return NULL; } if (snprintf(ranks_file_name, buffer_size, "%s.rdx", taxonomy_path) < 0) { free(taxonomy_path); free(ranks_file_name); obi_close_taxonomy(tax); return NULL; } tax->ranks = read_rankidx(ranks_file_name); if (tax->ranks == NULL) { free(ranks_file_name); obi_close_taxonomy(tax); return NULL; } free(ranks_file_name); // Read taxa taxa_file_name = (char*) malloc(buffer_size*sizeof(char)); if (taxa_file_name == NULL) { free(taxonomy_path); obi_close_taxonomy(tax); return NULL; } if (snprintf(taxa_file_name, buffer_size,"%s.tdx", taxonomy_path) < 0) { free(taxonomy_path); free(taxa_file_name); obi_close_taxonomy(tax); return NULL; } local_taxa_file_name = (char*) malloc(buffer_size*sizeof(char)); if (local_taxa_file_name == NULL) { free(taxonomy_path); free(taxa_file_name); obi_close_taxonomy(tax); return NULL; } if (snprintf(local_taxa_file_name, buffer_size,"%s.ldx", taxonomy_path) < 0) { free(taxonomy_path); free(taxa_file_name); free(local_taxa_file_name); obi_close_taxonomy(tax); return NULL; } tax->taxa = read_taxonomyidx(taxa_file_name, local_taxa_file_name); if (tax->taxa == NULL) { free(taxonomy_path); free(taxa_file_name); free(local_taxa_file_name); obi_close_taxonomy(tax); return NULL; } free(taxa_file_name); free(local_taxa_file_name); // Read alternative names if (read_alternative_names) { alter_names_file_name = (char*) malloc(buffer_size*sizeof(char)); if (alter_names_file_name == NULL) { free(taxonomy_path); obi_close_taxonomy(tax); return NULL; } if (snprintf(alter_names_file_name, buffer_size,"%s.ndx", taxonomy_path) < 0) { free(taxonomy_path); free(alter_names_file_name); obi_close_taxonomy(tax); return NULL; } tax->names = read_nameidx(alter_names_file_name, tax); if (tax->names == NULL) { free(alter_names_file_name); obi_close_taxonomy(tax); return NULL; } free(alter_names_file_name); } free(taxonomy_path); return tax; } int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy) { if (taxonomy) { if (taxonomy->ranks) free(taxonomy->ranks); // TODO those don't free everything but mapping will replace anyway if (taxonomy->names) free(taxonomy->names); if (taxonomy->taxa) free(taxonomy->taxa); free(taxonomy); return 0; } // TODO no closing files? return 1; } ////////////////////////////////////////////////////////////////////////// ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx) { ecotx_t* current_taxon; ecotx_t* next_taxon; current_taxon = taxon; next_taxon = current_taxon->parent; while ((current_taxon != next_taxon) && // root node (current_taxon->rank != rankidx)) { current_taxon = next_taxon; next_taxon = current_taxon->parent; } if (current_taxon->rank == rankidx) return current_taxon; else return NULL; } ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid) { ecotx_t *current_taxon; int32_t count; count = taxonomy->taxa->count; current_taxon = (ecotx_t*) bsearch((const void *) ((size_t) taxid), (const void *) taxonomy->taxa->taxon, count, sizeof(ecotx_t), bcomptaxon); return current_taxon; } bool obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid) { ecotx_t* next_parent; next_parent = taxon->parent; while ((other_taxid != next_parent->taxid) && (strcmp(next_parent->name, "root"))) next_parent = next_parent->parent; if (other_taxid == next_parent->taxid) return 1; else return 0; } ecotx_t* obi_taxo_get_species(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy) { static OBIDMS_taxonomy_p tax = NULL; static int32_t rankindex = -1; if (taxonomy && (tax != taxonomy)) { rankindex = rank_index("species", taxonomy->ranks); tax = taxonomy; } if (!tax || (rankindex < 0)) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError trying to get the species associated with a taxon: No taxonomy defined"); return NULL; } return obi_taxo_get_parent_at_rank(taxon, rankindex); } ecotx_t* obi_taxo_get_genus(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy) { static OBIDMS_taxonomy_p tax = NULL; static int32_t rankindex = -1; if (taxonomy && (tax != taxonomy)) { rankindex = rank_index("genus", taxonomy->ranks); tax = taxonomy; } if (!tax || (rankindex < 0)) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError trying to get the genus associated with a taxon: No taxonomy defined"); return NULL; } return obi_taxo_get_parent_at_rank(taxon, rankindex); } ecotx_t* obi_taxo_get_family(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy) { static OBIDMS_taxonomy_p tax = NULL; static int32_t rankindex = -1; if (taxonomy && (tax != taxonomy)) { rankindex = rank_index("family", taxonomy->ranks); tax = taxonomy; } if (!tax || (rankindex < 0)) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError trying to get the family associated with a taxon: No taxonomy defined"); return NULL; } return obi_taxo_get_parent_at_rank(taxon, rankindex); } ecotx_t* obi_taxo_get_kingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy) { static OBIDMS_taxonomy_p tax = NULL; static int32_t rankindex = -1; if (taxonomy && (tax != taxonomy)) { rankindex = rank_index("kingdom", taxonomy->ranks); tax = taxonomy; } if (!tax || (rankindex < 0)) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError trying to get the kingdom associated with a taxon: No taxonomy defined"); return NULL; } return obi_taxo_get_parent_at_rank(taxon, rankindex); } ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy) { static OBIDMS_taxonomy_p tax = NULL; static int32_t rankindex = -1; if (taxonomy && (tax != taxonomy)) { rankindex = rank_index("superkingdom", taxonomy->ranks); tax = taxonomy; } if (!tax || (rankindex < 0)) { obi_set_errno(OBI_TAXONOMY_ERROR); obidebug(1, "\nError trying to get the superkingdom associated with a taxon: No taxonomy defined"); return NULL; } return obi_taxo_get_parent_at_rank(taxon, rankindex); }