Taxonomy: documentation for all the functions, and fixed bugs when

closing the taxonomy (overwriting of .pdx files, missing freeing, and
re-placed a misplaced condition)
This commit is contained in:
Celine Mercier
2017-01-18 18:22:49 +01:00
parent c065c1914a
commit c0bcdce724
2 changed files with 1179 additions and 486 deletions

File diff suppressed because it is too large Load Diff

View File

@ -6,7 +6,7 @@
* @file obidms_taxonomy.h * @file obidms_taxonomy.h
* @author Celine Mercier (celine.mercier@metabarcoding.org) * @author Celine Mercier (celine.mercier@metabarcoding.org)
* @date March 2nd 2016 * @date March 2nd 2016
* @brief Header file for the functions handling the reading of binary taxonomy files. * @brief Header file for the functions handling the reading and writing of taxonomy files.
*/ */
@ -17,123 +17,384 @@
#include "obidms.h" #include "obidms.h"
#define MIN_LOCAL_TAXID (10000000) #define MIN_LOCAL_TAXID (10000000) /**< The minimum taxid for a taxon added locally (i.e. not an NCBI taxon).
#define TAX_NAME_LEN (1024) */
#define TAX_NAME_LEN (1024) /**< The maximum length for the taxonomy name.
*/
/**
* @brief Structure for a taxon as stored in a .tdx file.
*/
typedef struct { typedef struct {
int32_t taxid; int32_t taxid; /**< Taxid.
int32_t rank; */
int32_t parent; int32_t rank; /**< Rank index.
int32_t name_length; */
char name[]; int32_t parent; /**< Index, in the taxid index, of the parent node in the taxonomic tree.
*/
int32_t name_length; /**< Length of the taxon scientific name.
*/
char name[]; /**< Scientific name of the taxon.
*/
} ecotxformat_t; } ecotxformat_t;
/**
* @brief Structure for a taxon as stored in a taxonomy structure.
*/
typedef struct ecotxnode { typedef struct ecotxnode {
int32_t taxid; // TODO discuss that this will be the current taxid even if the struct was accessed through a deprecated one int32_t taxid; /**< Taxid. // TODO discuss that this will be the current taxid even if the struct was accessed through a deprecated one
int32_t rank; */
int32_t farest; int32_t rank; /**< Rank index in ecorankidx_t structure.
int32_t idx; */
struct ecotxnode* parent; int32_t farest; /**< Longest branch length, used to compute distances between taxa faster.
char* name; // scientific name */
char* preferred_name; // preferred name int32_t idx; /**< Index in the ecotxidx_t structure.
bool local; */
struct ecotxnode* parent; /**< Pointer to the parent node in the taxonomic tree.
*/
char* name; /**< Scientific name of the taxon.
*/
char* preferred_name; /**< Preferred name of the taxon if there is one, otherwise NULL.
*/
bool local; /**< A boolean indicating whether the taxon is local or not.
*/
} ecotx_t; } ecotx_t;
/**
* @brief Structure for the taxon index in a taxonomy structure.
*/
typedef struct { typedef struct {
int32_t count; int32_t count; /**< Number of taxa.
int32_t ncbi_count; */
int32_t local_count; int32_t ncbi_count; /**< Number of NCBI taxa.
int32_t max_taxid; */
int32_t buffer_size; int32_t local_count; /**< Number of taxa added locally.
ecotx_t taxon[]; */
int32_t max_taxid; /**< Maximum taxid existing in the taxon index.
*/
int32_t buffer_size; /**< Number of taxa. // TODO kept this but not sure of its use
*/
ecotx_t taxon[]; /**< Taxon array.
*/
} ecotxidx_t; } ecotxidx_t;
/**
* @brief Structure for the rank index in a taxonomy structure.
*/
typedef struct { typedef struct {
int32_t count; int32_t count; /**< Number of ranks.
char* label[]; */
char* label[]; /**< Array of rank names.
*/
} ecorankidx_t; } ecorankidx_t;
/**
* @brief Structure for a taxon name as stored in a .ndx file.
*/
typedef struct { typedef struct {
int32_t is_scientific_name; int32_t is_scientific_name; /**< A boolean indicating whether the name is a scientific name or not.
int32_t name_length; */
int32_t class_length; int32_t name_length; /**< The name length.
int32_t taxid; // taxid idx */
char names[]; int32_t class_length; /**< The name class length.
*/
int32_t taxid; /**< Index of the taxon in the taxid index.
*/
char names[]; /**< Taxon name and name class concatenated.
*/
} econameformat_t; } econameformat_t;
/**
* @brief Structure for a taxon name as stored in a taxonomy structure.
*/
typedef struct { typedef struct {
char* name; char* name; /**< Taxon name.
char* class_name; */
int32_t is_scientific_name; char* class_name; /**< Name class.
struct ecotxnode* taxon; */
int32_t is_scientific_name; /**< A boolean indicating whether the name is a scientific name or not.
*/
struct ecotxnode* taxon; /**< Pointer to the taxon in the taxon index.
*/
} econame_t; } econame_t;
/**
* @brief Structure for the name index in a taxonomy structure.
*/
typedef struct { typedef struct {
int32_t count; int32_t count; /**< Number of names.
econame_t names[]; */
econame_t names[]; /**< Array of names.
*/
} econameidx_t; } econameidx_t;
/**
* @brief Structure for a taxid/index pair as stored in a taxonomy structure.
*/
typedef struct { typedef struct {
int32_t taxid; int32_t taxid; /**< Taxid.
int32_t idx; */
int32_t idx; /**< Index of the taxid in the taxon index, -1 if the taxid is deprecated.
*/
} ecomerged_t; } ecomerged_t;
/**
* @brief Structure for a merged taxid index in a taxonomy structure.
*
* This index includes all deprecated taxids that now refer to different taxids, and
* the deprecated taxids that are deleted.
*
*/
typedef struct { typedef struct {
int32_t count; int32_t count; /**< Number of taxid/index pairs.
ecomerged_t merged[]; */
ecomerged_t merged[]; /**< Array of taxid/index pairs.
*/
} ecomergedidx_t; } ecomergedidx_t;
/**
* @brief Structure for a taxonomy.
*/
typedef struct OBIDMS_taxonomy_t { typedef struct OBIDMS_taxonomy_t {
char tax_name[TAX_NAME_LEN]; char tax_name[TAX_NAME_LEN]; /**< Taxonomy name.
OBIDMS_p dms; */
ecomergedidx_t* merged_idx; OBIDMS_p dms; /**< A pointer to the DMS to which the taxonomy belongs.
ecorankidx_t* ranks; */
econameidx_t* names; ecomergedidx_t* merged_idx; /**< Merged taxid index.
econameidx_t* preferred_names; */
ecotxidx_t* taxa; ecorankidx_t* ranks; /**< Taxonomic ranks.
*/
econameidx_t* names; /**< Taxon names.
*/
econameidx_t* preferred_names; /**< Taxon preferred names (i.e. added locally).
*/
ecotxidx_t* taxa; /**< Taxa.
*/
} OBIDMS_taxonomy_t, *OBIDMS_taxonomy_p; } OBIDMS_taxonomy_t, *OBIDMS_taxonomy_p;
OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, bool read_alternative_names); /**
* @brief Function reading an NCBI taxdump and loading its information into a taxonomy structure.
int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy); *
* @param taxdump The path to the taxdump directory.
ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx); *
* @returns A pointer to the taxonomy structure that was read.
ecotx_t* obi_taxo_get_taxon_with_current_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid); * @retval NULL if an error occurred.
ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid); *
* @since 2016
bool obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid); * @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
ecotx_t* obi_taxo_get_species(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy);
ecotx_t* obi_taxo_get_genus(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy);
ecotx_t* obi_taxo_get_family(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy);
ecotx_t* obi_taxo_get_kingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy);
ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy);
int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name);
OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump); OBIDMS_taxonomy_p obi_read_taxdump(const char* taxdump);
/**
* @brief Function reading a binary taxonomy database (i.e. a set of .tdx, .ndx, .rdx, .adx, .ldx, .pdx files)
* and loading its information into a taxonomy structure.
*
* @param dms A pointer to the DMS to which the taxonomy belongs.
* @param taxonomy_name The name (prefix) of the taxonomy.
* @param read_alternative_names A boolean indicating whether names other than scientific and preferred names should be read.
*
* @returns A pointer to the taxonomy structure that was read.
* @retval NULL if an error occurred.
*
* @since 2016
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
OBIDMS_taxonomy_p obi_read_taxonomy(OBIDMS_p dms, const char* taxonomy_name, bool read_alternative_names);
/**
* @brief Function writing a binary taxonomy database (i.e. a set of .tdx, .ndx, .rdx, .adx, .ldx, .pdx files).
*
* @param dms A pointer to the DMS to which the taxonomy belongs.
* @param tax A pointer to the taxonomy structure.
* @param tax_name The name (prefix) of the taxonomy.
*
* @returns An integer value indicating the success of the operation.
* @retval 0 on success.
* @retval -1 if an error occurred.
*
* @since 2016
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
int obi_write_taxonomy(OBIDMS_p dms, OBIDMS_taxonomy_p tax, const char* tax_name);
/**
* @brief Function closing a taxonomy structure.
*
* This function writes all changes to the binary files (local taxa and preferred names) and frees all memory allocated for the structure.
*
* @param taxonomy A pointer to the taxonomy structure.
*
* @returns An integer value indicating the success of the operation.
* @retval 0 on success.
* @retval -1 if an error occurred.
*
* @since 2016
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
int obi_close_taxonomy(OBIDMS_taxonomy_p taxonomy);
/**
* @brief Function adding a local taxon to a taxonomy.
*
* @param tax A pointer to the taxonomy structure.
* @param name The taxon scientific name.
* @param rank_name The taxon rank name.
* @param parent_taxid The taxid of the parent node in the taxonomic tree.
* @param min_taxid The minimum taxid to give to the new taxon (the function will choose a new taxid >= min_taxid and >= MIN_LOCAL_TAXID).
*
* @returns An integer value indicating the success of the operation.
* @retval 0 on success.
* @retval -1 if an error occurred.
*
* @since 2016
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
int obi_taxo_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const char* rank_name, int32_t parent_taxid, int32_t min_taxid); int obi_taxo_add_local_taxon(OBIDMS_taxonomy_p tax, const char* name, const char* rank_name, int32_t parent_taxid, int32_t min_taxid);
/**
* @brief Function adding a preferred name to a taxon in a taxonomy, referred to by its taxid.
*
* @param tax A pointer to the taxonomy structure.
* @param taxid The taxid of the taxon that should have a new preferred name.
* @param preferred_name The new preferred name.
*
* @returns An integer value indicating the success of the operation.
* @retval 0 on success.
* @retval -1 if an error occurred.
*
* @since January 2017
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
int obi_taxo_add_preferred_name_with_taxid(OBIDMS_taxonomy_p tax, int32_t taxid, const char* preferred_name); int obi_taxo_add_preferred_name_with_taxid(OBIDMS_taxonomy_p tax, int32_t taxid, const char* preferred_name);
/**
* @brief Function adding a preferred name to a taxon in a taxonomy, referred to by the taxon pointer.
*
* @param tax A pointer to the taxonomy structure.
* @param taxon A pointer to the taxon that should have a new preferred name.
* @param preferred_name The new preferred name.
*
* @returns An integer value indicating the success of the operation.
* @retval 0 on success.
* @retval -1 if an error occurred.
*
* @since January 2017
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
int obi_taxo_add_preferred_name_with_taxon(OBIDMS_taxonomy_p tax, ecotx_t* taxon, const char* preferred_name); int obi_taxo_add_preferred_name_with_taxon(OBIDMS_taxonomy_p tax, ecotx_t* taxon, const char* preferred_name);
/**
* @brief Function returning the parent of a taxon at a given rank.
*
* @param taxon A pointer to the taxon.
* @param rankidx The index of the wanted rank.
*
* @returns A pointer to the parent taxon at the wanted rank.
* @retval NULL if no parent taxon was found at the wanted rank.
*/
ecotx_t* obi_taxo_get_parent_at_rank(ecotx_t* taxon, int32_t rankidx);
/**
* @brief Function returning a taxon given its taxid.
*
* @param taxonomy A pointer to the taxonomy.
* @param taxid The taxid of the taxon.
*
* @returns A pointer to the wanted taxon.
* @retval NULL if no taxon was found with the given taxid.
*
* @since January 2017
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
ecotx_t* obi_taxo_get_taxon_with_taxid(OBIDMS_taxonomy_p taxonomy, int32_t taxid);
/**
* @brief Function checking whether a taxon is under another in the taxonomy tree.
*
* @param taxon A pointer to the first taxon.
* @param other_taxid The taxid of the second taxon.
*
* @returns A boolean indicating whether the first taxon is under the second taxon in the taxonomy tree.
*/
bool obi_taxo_is_taxon_under_taxid(ecotx_t* taxon, int32_t other_taxid);
/**
* @brief Function returning the parent of a taxon at the species level.
*
* @param taxon A pointer to the taxon.
* @param taxonomy A pointer to the taxonomy structure.
*
* @returns A pointer to the parent taxon at the species level.
* @retval NULL if no parent taxon was found at the wanted rank.
*/
ecotx_t* obi_taxo_get_species(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy);
/**
* @brief Function returning the parent of a taxon at the genus level.
*
* @param taxon A pointer to the taxon.
* @param taxonomy A pointer to the taxonomy structure.
*
* @returns A pointer to the parent taxon at the genus level.
* @retval NULL if no parent taxon was found at the wanted rank.
*/
ecotx_t* obi_taxo_get_genus(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy);
/**
* @brief Function returning the parent of a taxon at the family level.
*
* @param taxon A pointer to the taxon.
* @param taxonomy A pointer to the taxonomy structure.
*
* @returns A pointer to the parent taxon at the family level.
* @retval NULL if no parent taxon was found at the wanted rank.
*/
ecotx_t* obi_taxo_get_family(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy);
/**
* @brief Function returning the parent of a taxon at the kingdom level.
*
* @param taxon A pointer to the taxon.
* @param taxonomy A pointer to the taxonomy structure.
*
* @returns A pointer to the parent taxon at the kingdom level.
* @retval NULL if no parent taxon was found at the wanted rank.
*/
ecotx_t* obi_taxo_get_kingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy);
/**
* @brief Function returning the parent of a taxon at the superkingdom level.
*
* @param taxon A pointer to the taxon.
* @param taxonomy A pointer to the taxonomy structure.
*
* @returns A pointer to the parent taxon at the superkingdom level.
* @retval NULL if no parent taxon was found at the wanted rank.
*/
ecotx_t* obi_taxo_get_superkingdom(ecotx_t* taxon, OBIDMS_taxonomy_p taxonomy);