From c0bcdce72450a58195b0feddb5da4a44f510cfc7 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Wed, 18 Jan 2017 18:22:49 +0100 Subject: [PATCH] Taxonomy: documentation for all the functions, and fixed bugs when closing the taxonomy (overwriting of .pdx files, missing freeing, and re-placed a misplaced condition) --- src/obidms_taxonomy.c | 1266 +++++++++++++++++++++++++++-------------- src/obidms_taxonomy.h | 399 ++++++++++--- 2 files changed, 1179 insertions(+), 486 deletions(-) diff --git a/src/obidms_taxonomy.c b/src/obidms_taxonomy.c index 9e081f0..aba8641 100644 --- a/src/obidms_taxonomy.c +++ b/src/obidms_taxonomy.c @@ -6,7 +6,7 @@ * @file obidms_taxonomy.c * @author Celine Mercier (celine.mercier@metabarcoding.org) * @date March 2nd 2016 - * @brief Functions for reading binary taxonomy files. + * @brief Functions for handling the reading and writing of taxonomy files. */ @@ -29,7 +29,436 @@ #define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) -int cmp_rank_labels(const void* label1, const void* label2) +/************************************************************************** + * + * D E C L A R A T I O N O F T H E P R I V A T E F U N C T I O N S + * + **************************************************************************/ + + +/** + * @brief Internal function comparing two rank names. + * + * @param label1 A char* pointer on the first rank name. + * @param label2 A char** pointer on a second pointer, that second char* pointer being on the second rank name. + * (making the function usable with an ecorankidx_t structure and functions like bsearch) + * + * @returns A value < 0 if label1 < label2, + * a value > 0 if label1 > label2, + * and 0 if label1 == label2. + */ +static int cmp_rank_labels(const void* label1, const void* label2); + + +/** + * @brief Internal function comparing two taxids, one of them stored in an ecotx_t structure. + * + * @param ptaxid The first taxid. 
+ * @param ptaxon A pointer on an ecotx_t structure where the second taxid is stored. + * + * @returns A value < 0 if taxid1 < taxid2, + * a value > 0 if taxid1 > taxid2, + * and 0 if taxid1 == taxid2. + */ +static int cmp_taxids_in_ecotx_t(const void* ptaxid, const void* ptaxon); + + +/** + * @brief Internal function comparing two taxids, one of them stored in an ecomerged_t structure. + * + * @param ptaxid The first taxid. + * @param ptaxon A pointer on an ecomerged_t structure where the second taxid is stored. + * + * @returns A value < 0 if taxid1 < taxid2, + * a value > 0 if taxid1 > taxid2, + * and 0 if taxid1 == taxid2. + * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int cmp_taxids_in_ecomerged_t(const void* ptaxid, const void* ptaxon); + + +/** + * @brief Internal function comparing two character strings pointed to by char** pointers. + * + * @param s1 A char** pointer on a second pointer, that second char* pointer being on the first character string. + * @param s2 A char** pointer on a second pointer, that second char* pointer being on the second character string. + * + * @returns A value < 0 if s1 < s2, + * a value > 0 if s1 > s2, + * and 0 if s1 == s2. + */ +static int cmp_str(const void* s1, const void* s2); + + +/** + * @brief Internal function comparing two taxon names stored in econame_t structures. + * + * @param n1 A pointer on the first econame_t structure. + * @param n2 A pointer on the second econame_t structure. + * + * @returns A value < 0 if n1 < n2, + * a value > 0 if n1 > n2, + * and 0 if n1 == n2. + * + * @since 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int cmp_names(const void* n1, const void* n2); + + +/** + * @brief Internal function returning the ecotx_t structure associated with a taxid. 
+ * + * This function only looks for the taxid in the modern taxonomy; it does not consider deprecated + * and old taxids, unlike obi_taxo_get_taxon_with_taxid(). + * + * @param taxonomy A pointer on the taxonomy structure. + * @param taxid The taxid of the taxon wanted. + * + * @returns A pointer on the ecotx_t structure associated with a taxid. + * + * @see obi_taxo_get_taxon_with_taxid() + */ +static ecotx_t* get_taxon_with_current_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid); + + +/** + * @brief Internal function returning the complete path to a taxonomy directory in a DMS. + * + * @param dms A pointer on the DMS. + * @param tax_name The name of the taxonomy. + * + * @returns The complete path to the taxonomy directory. + * @retval NULL if an error occurred. + * + * @since 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static char* get_taxonomy_path(OBIDMS_p dms, const char* tax_name); + + +/** + * @brief Internal function returning the index of a rank in an ecorankidx_t structure. + * + * @param label The name of the rank. + * @param ranks A pointer on an ecorankidx_t structure. + * + * @returns The index of a rank in the ecorankidx_t structure. + * @retval -1 if the rank was not found. + */ +static int32_t rank_index(const char* label, ecorankidx_t* ranks); + + +/** + * @brief Internal function opening a binary taxonomy file (.tdx, .rdx, .ndx, .adx, .pdx, .ldx). + * + * @param file_name The file path. + * @param count A pointer on an integer that the function will set to the number of records in the file. + * @param abort_on_open_error A boolean indicating whether the function should trigger an error if the file can't be opened. + * + * @returns The FILE object. + * @retval NULL if an error occurred or if the file was not found. 
+ */ +static FILE* open_ecorecorddb(const char* file_name, int32_t* count, int32_t abort_on_open_error); + + +/** + * @brief Internal function returning the next record in a binary taxonomy file (.tdx, .rdx, .ndx, .adx, .pdx, .ldx). + * + * @param f The file object with the offset at the start of a record. + * @param record_size A pointer on an integer that the function will set to the size of the record. + * + * @returns A pointer on the read record. + * @retval NULL if an error occurred. + */ +static void* read_ecorecord(FILE* f, int32_t* record_size); + + +/** + * @brief Internal function reading the next taxon record in a .tdx binary taxonomy file. + * + * @param f The file object with the offset at the start of a record. + * @param taxon A pointer on an empty, allocated ecotx_t structure that the function will fill. + * + * @returns A pointer on the read record. + * @retval NULL if an error occurred. + */ +static ecotx_t* readnext_ecotaxon(FILE* f, ecotx_t* taxon); + + +/** + * @brief Internal function reading the next taxon name record in a .ndx binary taxonomy file. + * + * @param f The file object with the offset at the start of a record. + * @param name A pointer on an empty, allocated econame_t structure that the function will fill. + * @param taxonomy A pointer on the taxonomy structure. + * + * @returns A pointer on the read record. + * @retval NULL if an error occurred. + */ +static econame_t* readnext_econame(FILE* f, econame_t* name, OBIDMS_taxonomy_p taxonomy); + + +/** + * @brief Internal function reading the next taxon preferred name record in a .pdx binary taxonomy file. + * + * @param f The file object with the offset at the start of a record. + * @param name A pointer on an empty, allocated econame_t structure that the function will fill. + * @param taxonomy A pointer on the taxonomy structure. + * + * @returns A pointer on the read record. + * @retval NULL if an error occurred. 
+ * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static econame_t* readnext_ecopreferredname(FILE* f, econame_t* name, OBIDMS_taxonomy_p taxonomy); + + +/** + * @brief Internal function reading a taxonomic ranks (.rdx) binary taxonomy file. + * + * @param ranks_file_name The name of the .rdx file to read. + * + * @returns A pointer on an ecorankidx_t structure. + * @retval NULL if an error occurred. + */ +static ecorankidx_t* read_ranks_idx(const char* ranks_file_name); + + +/** + * @brief Internal function reading the taxa (.tdx, .ldx) binary taxonomy file. + * + * @param taxa_file_name The name of the .tdx file to read. + * @param local_taxa_file_name The name of the .ldx file containing the local taxa to read if there is one. + * + * @returns A pointer on an ecotxidx_t structure. + * @retval NULL if an error occurred. + */ +static ecotxidx_t* read_taxonomy_idx(const char* taxa_file_name, const char* local_taxa_file_name); + + +/** + * @brief Internal function reading a names (.ndx) binary taxonomy file. + * + * @param file_name The name of the .ndx file to read. + * @param taxonomy A pointer on the taxonomy structure. + * + * @returns A pointer on an econameidx_t structure. + * @retval NULL if an error occurred. + */ +static econameidx_t* read_names_idx(const char* file_name, OBIDMS_taxonomy_p taxonomy); + + +/** + * @brief Internal function reading a preferred names (.pdx) binary taxonomy file. + * + * @param file_name The name of the .pdx file to read. + * @param taxonomy A pointer on the taxonomy structure. + * + * @returns A pointer on an econameidx_t structure. + * @retval NULL if an error occurred. + * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static econameidx_t* read_preferred_names_idx(const char* file_name, OBIDMS_taxonomy_p taxonomy); + + +/** + * @brief Internal function reading a merged index (.adx) binary taxonomy file. 
+ * + * @param file_name The name of the .adx file to read. + * @param taxonomy A pointer on the taxonomy structure. + * + * @returns A pointer on an ecomergedidx_t structure. + * @retval NULL if an error occurred. + * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static ecomergedidx_t* read_merged_idx(const char* file_name, OBIDMS_taxonomy_p taxonomy); + + +/** + * @brief Internal function writing a rank index (.rdx) binary taxonomy file. + * + * @param dms A pointer on the DMS. + * @param tax A pointer on the taxonomy structure. + * @param taxonomy_name The name of the taxonomy. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred. + */ +static int write_ranks_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name); + + +/** + * @brief Internal function writing a taxonomy index (.tdx) binary taxonomy file. + * + * @param dms A pointer on the DMS. + * @param tax A pointer on the taxonomy structure. + * @param taxonomy_name The name of the taxonomy. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred. + */ +static int write_taxonomy_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name); + + +/** + * @brief Internal function writing a local taxonomy index (.ldx) binary taxonomy file. + * + * @param dms A pointer on the DMS. + * @param tax A pointer on the taxonomy structure. + * @param taxonomy_name The name of the taxonomy. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred. 
+ * + * @since 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int write_local_taxonomy_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name); + + +/** + * @brief Internal function writing a names index (.ndx) binary taxonomy file. + * + * @param dms A pointer on the DMS. + * @param tax A pointer on the taxonomy structure. + * @param taxonomy_name The name of the taxonomy. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred. + */ +static int write_names_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name); + + +/** + * @brief Internal function writing a preferred names index (.pdx) binary taxonomy file. + * + * @param dms A pointer on the DMS. + * @param tax A pointer on the taxonomy structure. + * @param taxonomy_name The name of the taxonomy. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred. + * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int write_preferred_names_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name); + + +/** + * @brief Internal function writing a merged index (.adx) binary taxonomy file. + * + * @param dms A pointer on the DMS. + * @param tax A pointer on the taxonomy structure. + * @param taxonomy_name The name of the taxonomy. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred. + * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int write_merged_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name); + + +/** + * @brief Internal function reading the 'nodes.dmp' file from an NCBI taxdump. + * + * @param taxdump The path to the taxdump. + * @param tax A pointer on the taxonomy structure. 
+ * @param rank_names_p A char*** pointer on a non allocated char* array where the function will store rank names. + * @param parent_taxids_p An int** pointer on a non allocated int array where the function will store parent taxids. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred. + * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int read_nodes_dmp(const char* taxdump, OBIDMS_taxonomy_p tax, char*** rank_names_p, int** parent_taxids_p); + + +/** + * @brief Internal function reading the 'delnodes.dmp' file from an NCBI taxdump. + * + * @param taxdump The path to the taxdump. + * @param tax A pointer on the taxonomy structure. + * @param delnodes_p An int** pointer on a non allocated int array where the function will store deleted taxids. + * @param delnodes_count An int* pointer where the function will store the number of deleted taxids. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred. + * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int read_delnodes_dmp(const char* taxdump, OBIDMS_taxonomy_p tax, int32_t** delnodes_p, int32_t* delnodes_count); + + +/** + * @brief Internal function reading the 'merged.dmp' file from an NCBI taxdump. + * + * @warning Should be used AFTER read_nodes_dmp() and read_delnodes_dmp(). + * + * The function merges the information about current nodes previously read in read_nodes_dmp(), + * the information about deleted nodes previously read in read_delnodes_dmp(), and the information read + * in the 'merged.dmp' file, to build the final merged taxon index in the taxonomy structure. + * + * @param taxdump The path to the taxdump. + * @param tax A pointer on the taxonomy structure. + * @param delnodes An int* pointer containing the deleted taxids. 
+ * @param delnodes_count The number of deleted taxids. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred. + * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int read_merged_dmp(const char* taxdump, OBIDMS_taxonomy_p tax, int32_t* delnodes, int32_t delnodes_count); + + +/** + * @brief Internal function reading the 'names.dmp' file from an NCBI taxdump. + * + * @param taxdump The path to the taxdump. + * @param tax A pointer on the taxonomy structure. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred. + * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +static int read_names_dmp(const char* taxdump, OBIDMS_taxonomy_p tax); + + +/************************************************************************ + * + * D E F I N I T I O N O F T H E P R I V A T E F U N C T I O N S + * + ************************************************************************/ + + +static int cmp_rank_labels(const void* label1, const void* label2) { return strcmp((const char*)label1,*(const char**)label2); } @@ -66,7 +495,23 @@ static int cmp_names(const void* n1, const void* n2) } -char* get_taxonomy_path(OBIDMS_p dms, const char* tax_name) +static ecotx_t* get_taxon_with_current_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid) +{ + ecotx_t *current_taxon; + int32_t count; + + count = (taxonomy->taxa)->count; + + current_taxon = (ecotx_t*) bsearch((const void *) ((size_t) taxid), + (const void *) taxonomy->taxa->taxon, + count, + sizeof(ecotx_t), + cmp_taxids_in_ecotx_t); + return current_taxon; +} + + +static char* get_taxonomy_path(OBIDMS_p dms, const char* tax_name) { char* all_tax_dir_path; char* tax_path; @@ -98,7 +543,7 @@ char* get_taxonomy_path(OBIDMS_p dms, const char* tax_name) } -int32_t rank_index(const char* label, 
ecorankidx_t* ranks) +static int32_t rank_index(const char* label, ecorankidx_t* ranks) { char **rep; @@ -111,7 +556,50 @@ int32_t rank_index(const char* label, ecorankidx_t* ranks) } -void* read_ecorecord(FILE* f, int32_t* record_size) +static FILE* open_ecorecorddb(const char* file_name, + int32_t* count, + int32_t abort_on_open_error) +{ + FILE* f; + int32_t read; + + f = fopen(file_name, "rb"); + + if (!f) + { + if (abort_on_open_error) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nCouldn't open a taxonomy file"); + fclose(f); + return NULL; + } + else + { + *count = 0; + fclose(f); + return NULL; + } + } + + read = fread(count, + sizeof(int32_t), + 1, + f); + + if (read != 1) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError reading taxonomy record size"); + fclose(f); + return NULL; + } + + return f; +} + + +static void* read_ecorecord(FILE* f, int32_t* record_size) { static void* buffer = NULL; int32_t buffer_size = 0; @@ -174,7 +662,7 @@ void* read_ecorecord(FILE* f, int32_t* record_size) }; -ecotx_t* readnext_ecotaxon(FILE* f, ecotx_t* taxon) +static ecotx_t* readnext_ecotaxon(FILE* f, ecotx_t* taxon) { ecotxformat_t* raw; int32_t record_length; @@ -203,50 +691,100 @@ ecotx_t* readnext_ecotaxon(FILE* f, ecotx_t* taxon) } -FILE* open_ecorecorddb(const char* file_name, - int32_t* count, - int32_t abort_on_open_error) +static econame_t* readnext_econame(FILE* f, econame_t* name, OBIDMS_taxonomy_p taxonomy) { - FILE* f; - int32_t read; + econameformat_t* raw; + int32_t record_length; - f = fopen(file_name, "rb"); + raw = read_ecorecord(f, &record_length); + if (raw == NULL) + return NULL; - if (!f) + name->is_scientific_name = raw->is_scientific_name; + + name->name = malloc((raw->name_length + 1) * sizeof(char)); + if (name->name == NULL) { - if (abort_on_open_error) - { - obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nCouldn't open a taxonomy file"); - fclose(f); - return NULL; - } - else - { - *count = 0; - fclose(f); - return 
NULL; - } - } - - read = fread(count, - sizeof(int32_t), - 1, - f); - - if (read != 1) - { - obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nError reading taxonomy record size"); - fclose(f); + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a taxon name"); + free(raw); return NULL; } + strncpy(name->name, raw->names, raw->name_length); + name->name[raw->name_length] = 0; - return f; + name->class_name = malloc((raw->class_length+1) * sizeof(char)); + if (name->class_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a taxon class name"); + free(name->name); + free(raw); + return NULL; + } + strncpy(name->class_name,(raw->names + raw->name_length), raw->class_length); + name->class_name[raw->class_length] = 0; + + name->taxon = taxonomy->taxa->taxon + raw->taxid; + + return name; } -ecorankidx_t* read_ranks_idx(const char* ranks_file_name) +static econame_t* readnext_ecopreferredname(FILE* f, econame_t* name, OBIDMS_taxonomy_p taxonomy) +{ + econameformat_t* raw; + int32_t record_length; + + raw = read_ecorecord(f, &record_length); + if (raw == NULL) + return NULL; + + name->is_scientific_name = raw->is_scientific_name; + + name->name = malloc((raw->name_length + 1) * sizeof(char)); + if (name->name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a taxon preferred name"); + free(raw); + return NULL; + } + strncpy(name->name, raw->names, raw->name_length); + name->name[raw->name_length] = 0; + + name->class_name = malloc((raw->class_length+1) * sizeof(char)); + if (name->class_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a taxon class name"); + free(name->name); + free(raw); + return NULL; + } + strncpy(name->class_name,(raw->names + raw->name_length), raw->class_length); + name->class_name[raw->class_length] = 0; + + name->taxon = taxonomy->taxa->taxon + raw->taxid; + + // Add the 
preferred name in the taxon structure // TODO discuss: couldn't they all use the same pointer? + (taxonomy->taxa->taxon + raw->taxid)->preferred_name = malloc((raw->name_length + 1) * sizeof(char)); + if ((taxonomy->taxa->taxon + raw->taxid)->preferred_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a taxon preferred name"); + free(name->name); + free(name->class_name); + free(raw); + return NULL; + } + strcpy((taxonomy->taxa->taxon + raw->taxid)->preferred_name, name->name); + + return name; +} + + +static ecorankidx_t* read_ranks_idx(const char* ranks_file_name) { int32_t count; FILE* ranks_file; @@ -301,7 +839,7 @@ ecorankidx_t* read_ranks_idx(const char* ranks_file_name) } -ecotxidx_t* read_taxonomy_idx(const char* taxa_file_name, const char* local_taxa_file_name) +static ecotxidx_t* read_taxonomy_idx(const char* taxa_file_name, const char* local_taxa_file_name) { int32_t count_taxa; int32_t count_local_taxa; @@ -394,100 +932,7 @@ ecotxidx_t* read_taxonomy_idx(const char* taxa_file_name, const char* local_taxa } -econame_t* readnext_econame(FILE* f, econame_t* name, OBIDMS_taxonomy_p taxonomy) -{ - econameformat_t* raw; - int32_t record_length; - - raw = read_ecorecord(f, &record_length); - if (raw == NULL) - return NULL; - - name->is_scientific_name = raw->is_scientific_name; - - name->name = malloc((raw->name_length + 1) * sizeof(char)); - if (name->name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for a taxon name"); - free(raw); - return NULL; - } - strncpy(name->name, raw->names, raw->name_length); - name->name[raw->name_length] = 0; - - name->class_name = malloc((raw->class_length+1) * sizeof(char)); - if (name->class_name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for a taxon class name"); - free(name->name); - free(raw); - return NULL; - } - strncpy(name->class_name,(raw->names + raw->name_length), 
raw->class_length); - name->class_name[raw->class_length] = 0; - - name->taxon = taxonomy->taxa->taxon + raw->taxid; - - return name; -} - - -econame_t* readnext_ecopreferredname(FILE* f, econame_t* name, OBIDMS_taxonomy_p taxonomy) -{ - econameformat_t* raw; - int32_t record_length; - - raw = read_ecorecord(f, &record_length); - if (raw == NULL) - return NULL; - - name->is_scientific_name = raw->is_scientific_name; - - name->name = malloc((raw->name_length + 1) * sizeof(char)); - if (name->name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for a taxon preferred name"); - free(raw); - return NULL; - } - strncpy(name->name, raw->names, raw->name_length); - name->name[raw->name_length] = 0; - - name->class_name = malloc((raw->class_length+1) * sizeof(char)); - if (name->class_name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for a taxon class name"); - free(name->name); - free(raw); - return NULL; - } - strncpy(name->class_name,(raw->names + raw->name_length), raw->class_length); - name->class_name[raw->class_length] = 0; - - name->taxon = taxonomy->taxa->taxon + raw->taxid; - - // Add the preferred name in the taxon structure // TODO discuss: couldn't they all use the same pointer? 
- (taxonomy->taxa->taxon + raw->taxid)->preferred_name = malloc((raw->name_length + 1) * sizeof(char)); - if ((taxonomy->taxa->taxon + raw->taxid)->preferred_name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for a taxon preferred name"); - free(name->name); - free(name->class_name); - free(raw); - return NULL; - } - strcpy((taxonomy->taxa->taxon + raw->taxid)->preferred_name, name->name); - - return name; -} - - -econameidx_t* read_names_idx(const char *file_name, OBIDMS_taxonomy_p taxonomy) +static econameidx_t* read_names_idx(const char *file_name, OBIDMS_taxonomy_p taxonomy) { int32_t count; FILE* f; @@ -526,7 +971,7 @@ econameidx_t* read_names_idx(const char *file_name, OBIDMS_taxonomy_p taxonomy) } -econameidx_t* read_preferred_names_idx(const char *file_name, OBIDMS_taxonomy_p taxonomy) +static econameidx_t* read_preferred_names_idx(const char *file_name, OBIDMS_taxonomy_p taxonomy) { int32_t count; FILE* f; @@ -565,7 +1010,7 @@ econameidx_t* read_preferred_names_idx(const char *file_name, OBIDMS_taxonomy_p } -ecomergedidx_t* read_merged_idx(const char *file_name, OBIDMS_taxonomy_p taxonomy) +static ecomergedidx_t* read_merged_idx(const char *file_name, OBIDMS_taxonomy_p taxonomy) { int32_t count; FILE* f; @@ -610,15 +1055,7 @@ ecomergedidx_t* read_merged_idx(const char *file_name, OBIDMS_taxonomy_p taxonom } - - - - - - -// Functions to write taxonomy structure to binary files - -int write_ranks_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? +static int write_ranks_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? { int i; char* file_name; @@ -721,7 +1158,7 @@ int write_ranks_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_na } -int write_taxonomy_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? 
keep argument but if NULL, use the one in struct? +static int write_taxonomy_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? keep argument but if NULL, use the one in struct? { int i; char* file_name; @@ -858,7 +1295,7 @@ int write_taxonomy_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy } -int write_local_taxonomy_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? keep argument but if NULL, use the one in struct? +static int write_local_taxonomy_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? keep argument but if NULL, use the one in struct? { int i; char* file_name; @@ -995,7 +1432,7 @@ int write_local_taxonomy_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* ta } -int write_names_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? keep argument but if NULL, use the one in struct? +static int write_names_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? keep argument but if NULL, use the one in struct? { int i; char* file_name; @@ -1143,7 +1580,7 @@ int write_names_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_na } -int write_preferred_names_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? keep argument but if NULL, use the one in struct? +static int write_preferred_names_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? keep argument but if NULL, use the one in struct? 
{ int i; char* file_name; @@ -1185,7 +1622,7 @@ int write_preferred_names_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* t free(taxonomy_path); // Create file - file_descriptor = open(file_name, O_RDWR | O_CREAT | O_EXCL, 0777); + file_descriptor = open(file_name, O_RDWR | O_CREAT, 0777); if (file_descriptor < 0) { obi_set_errno(OBI_TAXONOMY_ERROR); @@ -1291,7 +1728,7 @@ int write_preferred_names_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* t } -int write_merged_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? +static int write_merged_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_name) // TODO prefix in taxonomy struct? { int i; char* file_name; @@ -1399,48 +1836,6 @@ int write_merged_idx(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* taxonomy_n } -int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name) -{ - char* taxonomy_path; - - // Build the taxonomy directory path - taxonomy_path = get_taxonomy_path(dms, tax_name); - if (taxonomy_path == NULL) - return -1; - - // Try to create the directory - if (mkdir(taxonomy_path, 00777) < 0) - { - if (errno == EEXIST) - obidebug(1, "\nA taxonomy already exists with this name."); - obidebug(1, "\nProblem creating a new taxonomy directory"); - free(taxonomy_path); - return -1; - } - - free(taxonomy_path); - - if (write_ranks_idx(dms, tax, tax_name) < 0) - return -1; - if (write_taxonomy_idx(dms, tax, tax_name) < 0) - return -1; - if (write_names_idx(dms, tax, tax_name) < 0) - return -1; - if (write_merged_idx(dms, tax, tax_name) < 0) - return -1; - // Check if there are local taxa (if so last taxon is local) - if ((tax->taxa)->local_count > 0) - if (write_local_taxonomy_idx(dms, tax, tax_name) < 0) - return -1; - // Write preferred names if there are some - if (tax->preferred_names != NULL) - if (write_preferred_names_idx(dms, tax, tax_name) < 0) - return -1; - return 0; -} - - - int read_nodes_dmp(const char* 
taxdump, OBIDMS_taxonomy_p tax, char*** rank_names_p, int** parent_taxids_p) { struct dirent* dp; @@ -2049,7 +2444,7 @@ int read_merged_dmp(const char* taxdump, OBIDMS_taxonomy_p tax, int32_t* delnode // Store the deprecated taxid with the index that refers to the new taxid // Find the index of the new taxid - t = obi_taxo_get_taxon_with_current_taxid(tax, taxid); + t = get_taxon_with_current_taxid(tax, taxid); // Store the old taxid with the index (tax->merged_idx)->merged[n].taxid = old_taxid; (tax->merged_idx)->merged[n].idx = t->idx; @@ -2335,6 +2730,13 @@ int read_names_dmp(const char* taxdump, OBIDMS_taxonomy_p tax) } +/********************************************************************** + * + * D E F I N I T I O N O F T H E P U B L I C F U N C T I O N S + * + **********************************************************************/ + + OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) { OBIDMS_taxonomy_p tax; @@ -2524,7 +2926,7 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) // Associate the taxa with their parent for (i=0; i < (tax->taxa)->count; i++) { - ((tax->taxa)->taxon)[i].parent = obi_taxo_get_taxon_with_current_taxid(tax, parent_taxids[i]); + ((tax->taxa)->taxon)[i].parent = get_taxon_with_current_taxid(tax, parent_taxids[i]); if (((tax->taxa)->taxon)[i].parent == NULL) { obi_set_errno(OBI_TAXONOMY_ERROR); @@ -2574,192 +2976,6 @@ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump) } -int obi_taxo_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const char* rank_name, int32_t parent_taxid, int32_t min_taxid) -{ - int32_t taxid; - ecotx_t* taxon; - int i; -// econame_t* name_struct; - - // Enlarge the structure memory for a new taxon - tax->taxa = (ecotxidx_t*) realloc(tax->taxa, sizeof(ecotxidx_t) + sizeof(ecotx_t) * (((tax->taxa)->count) + 1)); - if (tax->taxa == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError reallocating memory for a taxonomy structure to add a new taxon"); - return -1; - } - - // 
Compute new taxid that must be equal or greater than 1E7 and greater than the maximum taxid existing in the taxonomy - if (min_taxid < MIN_LOCAL_TAXID) - min_taxid = MIN_LOCAL_TAXID; - if (min_taxid > (tax->taxa)->max_taxid) - taxid = min_taxid; - else - taxid = ((tax->taxa)->max_taxid) + 1; - - // Fill the ecotx_t node structure - taxon = ((tax->taxa)->taxon)+((tax->taxa)->count); - taxon->taxid = taxid; - taxon->idx = (tax->taxa)->count; - taxon->local = true; - taxon->name = (char*) malloc((strlen(name) + 1) * sizeof(char)); - if (taxon->name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for a taxon name to add a new taxon"); - return -1; - } - strcpy(taxon->name, name); - taxon->rank = -1; - for (i=0; i < (tax->ranks)->count; i++) - { - if (strcmp(rank_name, ((tax->ranks)->label)[i]) == 0) - { - taxon->rank = i; - break; - } - } - if (taxon->rank == -1) - { - obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nError: taxon rank not found when adding a new taxon"); - return -1; - } - taxon->parent = obi_taxo_get_taxon_with_taxid(tax, parent_taxid); - if (taxon->parent == NULL) - { - obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nError: taxon parent not found when adding a new taxon"); - return -1; - } - taxon->farest = 0; - - // Update taxonomy counts etc - (tax->taxa)->max_taxid = taxid; - ((tax->taxa)->count)++; - ((tax->taxa)->local_count)++; - (tax->taxa)->buffer_size = (tax->taxa)->count; - -// // Add new name in names structure // Commented because the new name was not added in the .ndx file in the OBITools1 -// // Allocate memory for new name -// tax->names = (econameidx_t*) realloc(tax->names, sizeof(econameidx_t) + sizeof(econame_t) * ((tax->names)->count + 1)); -// if (tax->names == NULL) -// { -// obi_set_errno(OBI_MALLOC_ERROR); -// obidebug(1, "\nError reallocating memory for a taxonomy structure to add a new taxon"); -// return -1; -// } -// -// // Add new name -// name_struct = (tax->names)->names + 
((tax->names)->count); -// name_struct->name = (char*) malloc((strlen(name) + 1) * sizeof(char)); -// if (name_struct->name == NULL) -// { -// obi_set_errno(OBI_MALLOC_ERROR); -// obidebug(1, "\nError allocating memory for a taxon name to add a new taxon"); -// return -1; -// } -// strcpy(name_struct->name, name); -// name_struct->class_name = (char*) malloc((strlen("scientific name") + 1) * sizeof(char)); -// if (name_struct->class_name == NULL) -// { -// obi_set_errno(OBI_MALLOC_ERROR); -// obidebug(1, "\nError allocating memory for a taxon class name to add a new taxon"); -// return -1; -// } -// strcpy(name_struct->class_name, "scientific name"); -// name_struct->is_scientific_name = true; -// name_struct->taxon = ((tax->taxa)->taxon) + ((tax->taxa)->count) - 1; -// -// // Sort names in alphabetical order -// qsort((tax->names)->names, (tax->names)->count, sizeof(econame_t), cmp_names); -// -// // Update name count -// ((tax->names)->count)++; - - return taxid; -} - - -int obi_taxo_add_preferred_name_with_taxid(OBIDMS_taxonomy_p tax, int32_t taxid, const char* preferred_name) -{ - ecotx_t* taxon; - - taxon = obi_taxo_get_taxon_with_taxid(tax, taxid); - - return obi_taxo_add_preferred_name_with_taxon(tax, taxon, preferred_name); -} - - -int obi_taxo_add_preferred_name_with_taxon(OBIDMS_taxonomy_p tax, ecotx_t* taxon, const char* preferred_name) -{ - econame_t* name_struct; - - // Free previous preferred name if there is one - if (taxon->preferred_name != NULL) - free(taxon->preferred_name); - - taxon->preferred_name = (char*) malloc((strlen(preferred_name) + 1) * sizeof(char)); - if (taxon->preferred_name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for a new preferred name for a taxon"); - return -1; - } - strcpy(taxon->preferred_name, preferred_name); - - // Add new name in preferred names structure - // Allocate or reallocate memory for new name - if (tax->preferred_names == NULL) - { - tax->preferred_names = 
(econameidx_t*) malloc(sizeof(econameidx_t) + sizeof(econame_t)); - (tax->preferred_names)->count = 0; - } - else - tax->preferred_names = (econameidx_t*) realloc(tax->preferred_names, sizeof(econameidx_t) + sizeof(econame_t) * ((tax->preferred_names)->count + 1)); - if (tax->preferred_names == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError reallocating memory for a taxonomy structure to add a new preferred name"); - return -1; - } - - // Add new preferred name - name_struct = (tax->preferred_names)->names + ((tax->preferred_names)->count); - name_struct->name = (char*) malloc((strlen(preferred_name) + 1) * sizeof(char)); - if (name_struct->name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for a new taxon preferred name"); - return -1; - } - strcpy(name_struct->name, preferred_name); - - name_struct->class_name = (char*) malloc((strlen("preferred name") + 1) * sizeof(char)); - if (name_struct->class_name == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for a taxon class name to add a new preferred name"); - return -1; - } - strcpy(name_struct->class_name, "preferred name"); - name_struct->is_scientific_name = false; - name_struct->taxon = taxon; - - // Sort preferred names in alphabetical order - qsort((tax->preferred_names)->names, (tax->preferred_names)->count, sizeof(econame_t), cmp_names); - - // Update preferred name count - ((tax->preferred_names)->count)++; - - return 0; -} - - -/////// PUBLIC ///////// - - OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, bool read_alternative_names) { OBIDMS_taxonomy_p tax; @@ -2977,28 +3193,82 @@ OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, boo } +int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name) +{ + char* taxonomy_path; + + // Build the taxonomy directory path + taxonomy_path = get_taxonomy_path(dms, tax_name); + if 
(taxonomy_path == NULL) + return -1; + + // Try to create the directory + if (mkdir(taxonomy_path, 00777) < 0) + { + if (errno == EEXIST) + obidebug(1, "\nA taxonomy already exists with this name."); + obidebug(1, "\nProblem creating a new taxonomy directory"); + free(taxonomy_path); + return -1; + } + + free(taxonomy_path); + + if (write_ranks_idx(dms, tax, tax_name) < 0) + return -1; + if (write_taxonomy_idx(dms, tax, tax_name) < 0) + return -1; + if (write_names_idx(dms, tax, tax_name) < 0) + return -1; + if (write_merged_idx(dms, tax, tax_name) < 0) + return -1; + // Check if there are local taxa (if so last taxon is local) + if ((tax->taxa)->local_count > 0) + if (write_local_taxonomy_idx(dms, tax, tax_name) < 0) + return -1; + // Write preferred names if there are some + if (tax->preferred_names != NULL) + if (write_preferred_names_idx(dms, tax, tax_name) < 0) + return -1; + return 0; +} + + int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy) { int i; - // Update local informations (local taxa and preferred names) if there are any - if ((taxonomy->taxa)->local_count > 0) - { - if (taxonomy->dms == NULL) - { - obi_set_errno(OBI_TAXONOMY_ERROR); - obidebug(1, "\nError closing a taxonomy with local files but no DMS associated (probably read directly from taxdump)"); // TODO discuss - } - if (write_local_taxonomy_idx(taxonomy->dms, taxonomy, taxonomy->tax_name) < 0) - return -1; - } - // Write preferred names if there are some - if (taxonomy->preferred_names != NULL) - if (write_preferred_names_idx(taxonomy->dms, taxonomy, taxonomy->tax_name) < 0) - return -1; - if (taxonomy) { + // Update local informations (local taxa and preferred names) if there are any + if ((taxonomy->taxa)->local_count > 0) + { + if (taxonomy->dms == NULL) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError closing a taxonomy with local files but no DMS associated (probably read directly from taxdump)"); // TODO discuss + } + if (write_local_taxonomy_idx(taxonomy->dms, taxonomy, 
taxonomy->tax_name) < 0) + return -1; + } + + // Write preferred names if there are some + if (taxonomy->preferred_names) + { + if (write_preferred_names_idx(taxonomy->dms, taxonomy, taxonomy->tax_name) < 0) + return -1; + + // Free preferred names + for (i=0; i < (taxonomy->preferred_names)->count; i++) + { + if (((taxonomy->preferred_names)->names[i]).name) + free(((taxonomy->preferred_names)->names[i]).name); + if (((taxonomy->preferred_names)->names[i]).class_name) + free(((taxonomy->preferred_names)->names[i]).class_name); + } + free(taxonomy->preferred_names); + } + if (taxonomy->ranks) { for (i=0; i < (taxonomy->ranks)->count; i++) @@ -3043,7 +3313,187 @@ int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy) } -////////////////////////////////////////////////////////////////////////// +int obi_taxo_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const char* rank_name, int32_t parent_taxid, int32_t min_taxid) +{ + int32_t taxid; + ecotx_t* taxon; + int i; +// econame_t* name_struct; + + // Enlarge the structure memory for a new taxon + tax->taxa = (ecotxidx_t*) realloc(tax->taxa, sizeof(ecotxidx_t) + sizeof(ecotx_t) * (((tax->taxa)->count) + 1)); + if (tax->taxa == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating memory for a taxonomy structure to add a new taxon"); + return -1; + } + + // Compute new taxid that must be equal or greater than 1E7 and greater than the maximum taxid existing in the taxonomy + if (min_taxid < MIN_LOCAL_TAXID) + min_taxid = MIN_LOCAL_TAXID; + if (min_taxid > (tax->taxa)->max_taxid) + taxid = min_taxid; + else + taxid = ((tax->taxa)->max_taxid) + 1; + + // Fill the ecotx_t node structure + taxon = ((tax->taxa)->taxon)+((tax->taxa)->count); + taxon->taxid = taxid; + taxon->idx = (tax->taxa)->count; + taxon->local = true; + taxon->name = (char*) malloc((strlen(name) + 1) * sizeof(char)); + if (taxon->name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for 
a taxon name to add a new taxon"); + return -1; + } + strcpy(taxon->name, name); + taxon->rank = -1; + for (i=0; i < (tax->ranks)->count; i++) + { + if (strcmp(rank_name, ((tax->ranks)->label)[i]) == 0) + { + taxon->rank = i; + break; + } + } + if (taxon->rank == -1) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError: taxon rank not found when adding a new taxon"); + return -1; + } + taxon->parent = obi_taxo_get_taxon_with_taxid(tax, parent_taxid); + if (taxon->parent == NULL) + { + obi_set_errno(OBI_TAXONOMY_ERROR); + obidebug(1, "\nError: taxon parent not found when adding a new taxon"); + return -1; + } + taxon->farest = 0; + + // Update taxonomy counts etc + (tax->taxa)->max_taxid = taxid; + ((tax->taxa)->count)++; + ((tax->taxa)->local_count)++; + (tax->taxa)->buffer_size = (tax->taxa)->count; + +// // Add new name in names structure // Commented because the new name was not added in the .ndx file in the OBITools1 +// // Allocate memory for new name +// tax->names = (econameidx_t*) realloc(tax->names, sizeof(econameidx_t) + sizeof(econame_t) * ((tax->names)->count + 1)); +// if (tax->names == NULL) +// { +// obi_set_errno(OBI_MALLOC_ERROR); +// obidebug(1, "\nError reallocating memory for a taxonomy structure to add a new taxon"); +// return -1; +// } +// +// // Add new name +// name_struct = (tax->names)->names + ((tax->names)->count); +// name_struct->name = (char*) malloc((strlen(name) + 1) * sizeof(char)); +// if (name_struct->name == NULL) +// { +// obi_set_errno(OBI_MALLOC_ERROR); +// obidebug(1, "\nError allocating memory for a taxon name to add a new taxon"); +// return -1; +// } +// strcpy(name_struct->name, name); +// name_struct->class_name = (char*) malloc((strlen("scientific name") + 1) * sizeof(char)); +// if (name_struct->class_name == NULL) +// { +// obi_set_errno(OBI_MALLOC_ERROR); +// obidebug(1, "\nError allocating memory for a taxon class name to add a new taxon"); +// return -1; +// } +// strcpy(name_struct->class_name, 
"scientific name"); +// name_struct->is_scientific_name = true; +// name_struct->taxon = ((tax->taxa)->taxon) + ((tax->taxa)->count) - 1; +// +// // Sort names in alphabetical order +// qsort((tax->names)->names, (tax->names)->count, sizeof(econame_t), cmp_names); +// +// // Update name count +// ((tax->names)->count)++; + + return taxid; +} + + +int obi_taxo_add_preferred_name_with_taxid(OBIDMS_taxonomy_p tax, int32_t taxid, const char* preferred_name) +{ + ecotx_t* taxon; + + taxon = obi_taxo_get_taxon_with_taxid(tax, taxid); + + return obi_taxo_add_preferred_name_with_taxon(tax, taxon, preferred_name); +} + + +int obi_taxo_add_preferred_name_with_taxon(OBIDMS_taxonomy_p tax, ecotx_t* taxon, const char* preferred_name) +{ + econame_t* name_struct; + + // Free previous preferred name if there is one + if (taxon->preferred_name != NULL) + free(taxon->preferred_name); + + taxon->preferred_name = (char*) malloc((strlen(preferred_name) + 1) * sizeof(char)); + if (taxon->preferred_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a new preferred name for a taxon"); + return -1; + } + strcpy(taxon->preferred_name, preferred_name); + + // Add new name in preferred names structure + // Allocate or reallocate memory for new name + if (tax->preferred_names == NULL) + { + tax->preferred_names = (econameidx_t*) malloc(sizeof(econameidx_t) + sizeof(econame_t)); + (tax->preferred_names)->count = 0; + } + else + tax->preferred_names = (econameidx_t*) realloc(tax->preferred_names, sizeof(econameidx_t) + sizeof(econame_t) * ((tax->preferred_names)->count + 1)); + if (tax->preferred_names == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError reallocating memory for a taxonomy structure to add a new preferred name"); + return -1; + } + + // Add new preferred name + name_struct = (tax->preferred_names)->names + ((tax->preferred_names)->count); + name_struct->name = (char*) malloc((strlen(preferred_name) + 1) * 
sizeof(char)); + if (name_struct->name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a new taxon preferred name"); + return -1; + } + strcpy(name_struct->name, preferred_name); + + name_struct->class_name = (char*) malloc((strlen("preferred name") + 1) * sizeof(char)); + if (name_struct->class_name == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for a taxon class name to add a new preferred name"); + return -1; + } + strcpy(name_struct->class_name, "preferred name"); + name_struct->is_scientific_name = false; + name_struct->taxon = taxon; + + // Sort preferred names in alphabetical order + qsort((tax->preferred_names)->names, (tax->preferred_names)->count, sizeof(econame_t), cmp_names); + + // Update preferred name count + ((tax->preferred_names)->count)++; + + return 0; +} ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx) @@ -3068,22 +3518,6 @@ ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx) } -ecotx_t* obi_taxo_get_taxon_with_current_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid) // TODO discuss keeping private? 
-{ - ecotx_t *current_taxon; - int32_t count; - - count = (taxonomy->taxa)->count; - - current_taxon = (ecotx_t*) bsearch((const void *) ((size_t) taxid), - (const void *) taxonomy->taxa->taxon, - count, - sizeof(ecotx_t), - cmp_taxids_in_ecotx_t); - return current_taxon; -} - - ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid) { ecotx_t *current_taxon; @@ -3234,5 +3668,3 @@ ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy) return obi_taxo_get_parent_at_rank(taxon, rankindex); } - - diff --git a/src/obidms_taxonomy.h b/src/obidms_taxonomy.h index fa2f511..dcce499 100644 --- a/src/obidms_taxonomy.h +++ b/src/obidms_taxonomy.h @@ -6,7 +6,7 @@ * @file obidms_taxonomy.h * @author Celine Mercier (celine.mercier@metabarcoding.org) * @date March 2nd 2016 - * @brief Header file for the functions handling the reading of binary taxonomy files. + * @brief Header file for the functions handling the reading and writing of taxonomy files. */ @@ -17,123 +17,384 @@ #include "obidms.h" -#define MIN_LOCAL_TAXID (10000000) -#define TAX_NAME_LEN (1024) +#define MIN_LOCAL_TAXID (10000000) /**< The minimum taxid for a taxon added locally (i.e. not an NCBI taxon). + */ +#define TAX_NAME_LEN (1024) /**< The maximum length for the taxonomy name. + */ +/** + * @brief Structure for a taxon as stored in a .tdx file. + */ typedef struct { - int32_t taxid; - int32_t rank; - int32_t parent; - int32_t name_length; - char name[]; + int32_t taxid; /**< Taxid. + */ + int32_t rank; /**< Rank index. + */ + int32_t parent; /**< Index, in the taxid index, of the parent node in the taxonomic tree. + */ + int32_t name_length; /**< Length of the taxon scientific name. + */ + char name[]; /**< Scientific name of the taxon. + */ } ecotxformat_t; +/** + * @brief Structure for a taxon as stored in a taxonomy structure. 
+ */ typedef struct ecotxnode { - int32_t taxid; // TODO discuss that this is will be the current taxid even if the struct was accessed through a deprecated one - int32_t rank; - int32_t farest; - int32_t idx; - struct ecotxnode* parent; - char* name; // scientific name - char* preferred_name; // preferred name - bool local; + int32_t taxid; /**< Taxid. // TODO discuss that this is will be the current taxid even if the struct was accessed through a deprecated one + */ + int32_t rank; /**< Rank index in ecorankidx_t structure. + */ + int32_t farest; /**< Longest branch length, used to compute distances between taxa faster. + */ + int32_t idx; /**< Index in the ecotxidx_t structure. + */ + struct ecotxnode* parent; /**< Pointer on the parent node in the taxonomic tree. + */ + char* name; /**< Scientific name of the taxon. + */ + char* preferred_name; /**< Preferred name of the taxon if there is one, otherwise NULL. + */ + bool local; /**< A boolean indicating whether the taxon is local or not. + */ } ecotx_t; +/** + * @brief Structure for the taxon index in a taxonomy structure. + */ typedef struct { - int32_t count; - int32_t ncbi_count; - int32_t local_count; - int32_t max_taxid; - int32_t buffer_size; - ecotx_t taxon[]; + int32_t count; /**< Number of taxa. + */ + int32_t ncbi_count; /**< Number of NCBI taxa. + */ + int32_t local_count; /**< Number of taxa added locally. + */ + int32_t max_taxid; /**< Maximum taxid existing in the taxon index. + */ + int32_t buffer_size; /**< Number of taxa. // TODO kept this but not sure of its use + */ + ecotx_t taxon[]; /**< Taxon array. + */ } ecotxidx_t; +/** + * @brief Structure for the rank index in a taxonomy structure. + */ typedef struct { - int32_t count; - char* label[]; + int32_t count; /**< Number of ranks. + */ + char* label[]; /**< Array of rank names. + */ } ecorankidx_t; +/** + * @brief Structure for a taxon name as stored in a .ndx file. 
+ */ typedef struct { - int32_t is_scientific_name; - int32_t name_length; - int32_t class_length; - int32_t taxid; // taxid idx - char names[]; + int32_t is_scientific_name; /**< A boolean indicating whether the name is a scientific name or not. + */ + int32_t name_length; /**< The name length. + */ + int32_t class_length; /**< The name class length. + */ + int32_t taxid; /**< Index of the taxon in the taxid index. + */ + char names[]; /**< Taxon name and name class concatenated. + */ } econameformat_t; +/** + * @brief Structure for a taxon name as stored in a taxonomy structure. + */ typedef struct { - char* name; - char* class_name; - int32_t is_scientific_name; - struct ecotxnode* taxon; + char* name; /**< Taxon name. + */ + char* class_name; /**< Name class. + */ + int32_t is_scientific_name; /**< A boolean indicating whether the name is a scientific name or not. + */ + struct ecotxnode* taxon; /**< Pointer on the taxon in the taxon index. + */ } econame_t; +/** + * @brief Structure for the name index in a taxonomy structure. + */ typedef struct { - int32_t count; - econame_t names[]; + int32_t count; /**< Number of names. + */ + econame_t names[]; /**< Array of names. + */ } econameidx_t; +/** + * @brief Structure for a taxid/index pair as stored in a taxonomy structure. + */ typedef struct { - int32_t taxid; - int32_t idx; + int32_t taxid; /**< Taxid. + */ + int32_t idx; /**< Index of the taxid in the taxon index, -1 if the taxid is deprecated. + */ } ecomerged_t; +/** + * @brief Structure for a merged taxid index in a taxonomy structure. + * + * This index includes all deprecated taxids that now refer to different taxids, and + * the deprecated taxids that are deleted. + * + */ typedef struct { - int32_t count; - ecomerged_t merged[]; + int32_t count; /**< Number of taxid/index pairs. + */ + ecomerged_t merged[]; /**< Array of taxid/index pairs. + */ } ecomergedidx_t; +/** + * @brief Structure for a taxonomy. 
+ */ typedef struct OBIDMS_taxonomy_t { - char tax_name[TAX_NAME_LEN]; - OBIDMS_p dms; - ecomergedidx_t* merged_idx; - ecorankidx_t* ranks; - econameidx_t* names; - econameidx_t* preferred_names; - ecotxidx_t* taxa; + char tax_name[TAX_NAME_LEN]; /**< Taxonomy name. + */ + OBIDMS_p dms; /**< A pointer on the DMS to which the taxonomy belongs. + */ + ecomergedidx_t* merged_idx; /**< Merged taxid index. + */ + ecorankidx_t* ranks; /**< Taxonomic ranks. + */ + econameidx_t* names; /**< Taxon names. + */ + econameidx_t* preferred_names; /**< Taxon preferred names (i.e. added locally). + */ + ecotxidx_t* taxa; /**< Taxa. + */ } OBIDMS_taxonomy_t, *OBIDMS_taxonomy_p; -OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, bool read_alternative_names); - -int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy); - -ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx); - -ecotx_t* obi_taxo_get_taxon_with_current_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid); -ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid); - -bool obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid); - -ecotx_t* obi_taxo_get_species(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy); - -ecotx_t* obi_taxo_get_genus(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy); - -ecotx_t* obi_taxo_get_family(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy); - -ecotx_t* obi_taxo_get_kingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy); - -ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy); - -int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name); - +/** + * @brief Function reading an NCBI taxdump and loading its information into a taxonomy structure. + * + * @param taxdump The path to the taxdump directory. + * + * @returns A pointer on the read taxonomy structure. + * @retval NULL if an error occurred. 
+ * + * @since 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump); + +/** + * @brief Function reading a binary taxonomy database (i.e. a set of .tdx, .ndx, .rdx, .adx, .ldx, .pdx files) + * and loading its information into a taxonomy structure. + * + * @param dms A pointer on the DMS to which the taxonomy belongs. + * @param taxonomy_name The name (prefix) of the taxonomy. + * @param read_alternative_names A boolean indicating whether names other than scientific and preferred names should be read. + * + * @returns A pointer on the read taxonomy structure. + * @retval NULL if an error occurred. + * + * @since 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, bool read_alternative_names); + + +/** + * @brief Function writing a binary taxonomy database (i.e. a set of .tdx, .ndx, .rdx, .adx, .ldx, .pdx files). + * + * @param dms A pointer on the DMS to which the taxonomy belongs. + * @param tax A pointer on the taxonomy structure. + * @param tax_name The name (prefix) of the taxonomy. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred. + * + * @since 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name); + + +/** + * @brief Function closing a taxonomy structure. + * + * This function writes all changes to the binary files (local taxa and preferred names) and frees all allocated memory for the structure. + * + * @param taxonomy A pointer on the taxonomy structure. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred.
+ * + * @since 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy); + + +/** + * @brief Function adding a local taxon to a taxonomy. + * + * @param tax A pointer on the taxonomy structure. + * @param name The taxon scientific name. + * @param rank_name The taxon rank name. + * @param parent_taxid The taxid of the parent node in the taxonomic tree. + * @param min_taxid The minimum taxid to give to the new taxon (the function will choose a new taxid >= min_taxid and >= MIN_LOCAL_TAXID). + * + * @returns The taxid attributed to the new taxon (an integer >= MIN_LOCAL_TAXID). + * @retval taxid The taxid of the new taxon on success. + * @retval -1 if an error occurred. + * + * @since 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ int obi_taxo_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const char* rank_name, int32_t parent_taxid, int32_t min_taxid); + +/** + * @brief Function adding a preferred name to a taxon in a taxonomy, referred to by its taxid. + * + * @param tax A pointer on the taxonomy structure. + * @param taxid The taxid of the taxon that should have a new preferred name. + * @param preferred_name The new preferred name. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred. + * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ int obi_taxo_add_preferred_name_with_taxid(OBIDMS_taxonomy_p tax, int32_t taxid, const char* preferred_name); + +/** + * @brief Function adding a preferred name to a taxon in a taxonomy, referred to by the taxon pointer. + * + * @param tax A pointer on the taxonomy structure. + * @param taxon A pointer on the taxon that should have a new preferred name. + * @param preferred_name The new preferred name. + * + * @returns An integer value indicating the success of the operation. + * @retval 0 on success. + * @retval -1 if an error occurred.
+ * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ int obi_taxo_add_preferred_name_with_taxon(OBIDMS_taxonomy_p tax, ecotx_t* taxon, const char* preferred_name); +/** + * @brief Function returning the parent of a taxon at a given rank. + * + * @param taxon A pointer on the taxon. + * @param rankidx The index of the rank wanted. + * + * @returns A pointer on the parent taxon at the wanted rank. + * @retval NULL if no parent taxon was found at the wanted rank. + */ +ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx); + + +/** + * @brief Function returning a taxon given its taxid. + * + * @param taxonomy A pointer on the taxonomy. + * @param taxid The taxid of the taxon. + * + * @returns A pointer on the wanted taxon. + * @retval NULL if no taxon was found with the given taxid. + * + * @since January 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid); + + +/** + * @brief Function checking whether a taxon is under another in the taxonomy tree. + * + * @param taxon A pointer on the first taxon. + * @param other_taxid The taxid of the second taxon. + * + * @returns A boolean indicating whether the first taxon is under the second taxon in the taxonomy tree. + */ +bool obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid); + + +/** + * @brief Function returning the parent of a taxon at the species level. + * + * @param taxon A pointer on the taxon. + * @param taxonomy A pointer on the taxonomy structure. + * + * @returns A pointer on the parent taxon at the species level. + * @retval NULL if no parent taxon was found at the wanted rank. + */ +ecotx_t* obi_taxo_get_species(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy); + + +/** + * @brief Function returning the parent of a taxon at the genus level. + * + * @param taxon A pointer on the taxon. + * @param taxonomy A pointer on the taxonomy structure. 
+ * + * @returns A pointer on the parent taxon at the genus level. + * @retval NULL if no parent taxon was found at the wanted rank. + */ +ecotx_t* obi_taxo_get_genus(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy); + + +/** + * @brief Function returning the parent of a taxon at the family level. + * + * @param taxon A pointer on the taxon. + * @param taxonomy A pointer on the taxonomy structure. + * + * @returns A pointer on the parent taxon at the family level. + * @retval NULL if no parent taxon was found at the wanted rank. + */ +ecotx_t* obi_taxo_get_family(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy); + + +/** + * @brief Function returning the parent of a taxon at the kingdom level. + * + * @param taxon A pointer on the taxon. + * @param taxonomy A pointer on the taxonomy structure. + * + * @returns A pointer on the parent taxon at the kingdom level. + * @retval NULL if no parent taxon was found at the wanted rank. + */ +ecotx_t* obi_taxo_get_kingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy); + + +/** + * @brief Function returning the parent of a taxon at the superkingdom level. + * + * @param taxon A pointer on the taxon. + * @param taxonomy A pointer on the taxonomy structure. + * + * @returns A pointer on the parent taxon at the superkingdom level. + * @retval NULL if no parent taxon was found at the wanted rank. + */ +ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy);