fixes #61 : now reading merged taxids information when building a
reference database
This commit is contained in:
@ -40,6 +40,27 @@
|
||||
*
|
||||
**************************************************************************/
|
||||
|
||||
/**
|
||||
* Internal function computing the last common ancestor (LCA) of a list of merged taxids (generated by obi uniq).
|
||||
* Also works on just simple taxid (returns the associated taxon).
|
||||
*
|
||||
* @param view A pointer on the view containing the taxid information (merged or not).
|
||||
* @param taxid_column A pointer on the column where taxids are kept (merged or not, aka int array or int columns).
|
||||
* @param idx The index of the sequence to compute the LCA of in the view.
|
||||
* @param merged Whether the taxid information is made of arrays of taxids or a single taxid.
|
||||
* @param tax A pointer on a taxonomy structure.
|
||||
* @param taxid_count The maximum number of taxids associated with one sequence (aka the number of elements in taxid_column).
|
||||
* @param taxid_str_indices A pointer on the list of indices in taxid_strings referring to the taxids stored as strings (see next param).
|
||||
* @param taxid_strings A pointer on the list of taxids stored as strings in the taxid column header (they correspond to the element names).
|
||||
*
|
||||
* @returns A pointer on the LCA taxon.
|
||||
* @retval NULL if an error occurred.
|
||||
*
|
||||
* @since August 2019
|
||||
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||
*/
|
||||
static inline ecotx_t* get_lca_from_merged_taxids(Obiview_p view, OBIDMS_column_p taxid_column, index_t idx, bool merged,
|
||||
OBIDMS_taxonomy_p tax, index_t taxid_count, int64_t* taxid_str_indices, char* taxid_strings);
|
||||
|
||||
|
||||
/************************************************************************
|
||||
@ -49,6 +70,54 @@
|
||||
************************************************************************/
|
||||
|
||||
|
||||
static inline ecotx_t* get_lca_from_merged_taxids(Obiview_p view, OBIDMS_column_p taxid_column, index_t idx, bool merged,
|
||||
OBIDMS_taxonomy_p tax, index_t taxid_count, int64_t* taxid_str_indices, char* taxid_strings)
|
||||
{
|
||||
ecotx_t* taxon = NULL;
|
||||
ecotx_t* lca = NULL;
|
||||
int32_t taxid;
|
||||
index_t taxid_idx;
|
||||
int64_t taxid_str_idx;
|
||||
char* taxid_str;
|
||||
obiint_t n;
|
||||
|
||||
for (taxid_idx=0; taxid_idx<taxid_count; taxid_idx++) // get lca of all taxids associated with the sequence
|
||||
{
|
||||
n = obi_get_int_with_elt_idx_and_col_p_in_view(view, taxid_column, idx, taxid_idx);
|
||||
if (n != OBIInt_NA) // The taxid of this column is associated with this sequence
|
||||
{
|
||||
if (merged)
|
||||
{
|
||||
taxid_str_idx = taxid_str_indices[taxid_idx];
|
||||
taxid_str = taxid_strings+taxid_str_idx;
|
||||
taxid = atoi(taxid_str);
|
||||
}
|
||||
else
|
||||
taxid = n;
|
||||
|
||||
taxon = obi_taxo_get_taxon_with_taxid(tax, taxid);
|
||||
if (taxon == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting a taxon with taxid %d when building a reference database, seq %lld in refs view", taxid, idx);
|
||||
return NULL;
|
||||
}
|
||||
if (lca == NULL)
|
||||
lca = taxon;
|
||||
else
|
||||
{
|
||||
// Compute LCA
|
||||
lca = obi_taxo_get_lca(taxon, lca);
|
||||
if (lca == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting the last common ancestor of two taxa when building a reference database");
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return lca;
|
||||
}
|
||||
|
||||
|
||||
/**********************************************************************
|
||||
*
|
||||
@ -78,9 +147,12 @@ int build_reference_db(const char* dms_name,
|
||||
OBIDMS_column_p matrix_score_column = NULL;
|
||||
OBIDMS_column_p final_lca_taxid_a_column = NULL;
|
||||
OBIDMS_column_p final_score_a_column = NULL;
|
||||
obiint_t taxid1, taxid2, taxid_lca;
|
||||
ecotx_t* taxon1 = NULL;
|
||||
ecotx_t* taxon2 = NULL;
|
||||
char* taxid_strings;
|
||||
int64_t* taxid_str_indices;
|
||||
index_t taxid_count;
|
||||
obiint_t taxid_lca;
|
||||
ecotx_t* lca_1 = NULL;
|
||||
ecotx_t* lca_2 = NULL;
|
||||
ecotx_t* lca = NULL;
|
||||
index_t idx1, idx2;
|
||||
index_t i, j, k;
|
||||
@ -94,6 +166,7 @@ int build_reference_db(const char* dms_name,
|
||||
obiint_t lca_taxid_array_writable[1000];
|
||||
obifloat_t score_array_writable[1000];
|
||||
bool modified;
|
||||
bool merged;
|
||||
|
||||
// Discuss keeping the matrix view or not
|
||||
matrix_view_name = calloc((strlen(o_view_name)+strlen("_matrix")+1), sizeof(char));
|
||||
@ -203,12 +276,37 @@ int build_reference_db(const char* dms_name,
|
||||
obidebug(1, "\nError opening the second index column when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
refs_taxid_column = obi_view_get_column(refs_view, TAXID_COLUMN);
|
||||
if (obi_view_column_exists(refs_view, MERGED_TAXID_COLUMN))
|
||||
{
|
||||
refs_taxid_column = obi_view_get_column(refs_view, MERGED_TAXID_COLUMN);
|
||||
merged = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
refs_taxid_column = obi_view_get_column(refs_view, TAXID_COLUMN);
|
||||
merged = false;
|
||||
}
|
||||
if (refs_taxid_column == NULL)
|
||||
{
|
||||
obidebug(1, "\nError opening the taxid column when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
// Check that the refs taxid column doesn't contain character strings to parse
|
||||
// (happens with obi uniq when there is a lot of elements per line, see options to set the limit,
|
||||
// parsing of those strings in C not implemented yet)
|
||||
if ((refs_taxid_column->header)->to_eval)
|
||||
{
|
||||
obidebug(1, "\nError opening the column containing the taxids of the reference sequences when building a reference database: "
|
||||
"run previous obi uniq with a higher threshold for the --max-elts option while waiting for the implementation of this feature");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Get the (maximum) number of taxids associated with a sequence
|
||||
taxid_count = (refs_taxid_column->header)->nb_elements_per_line;
|
||||
// Get pointers on element names (aka taxids) and their start indices for faster access
|
||||
taxid_strings = (refs_taxid_column->header)->elements_names;
|
||||
taxid_str_indices = (refs_taxid_column->header)->elements_names_idx;
|
||||
|
||||
matrix_lca_taxid_column = obi_view_get_column(matrix_with_lca_view, LCA_TAXID_COLUMN_NAME);
|
||||
if (matrix_lca_taxid_column == NULL)
|
||||
{
|
||||
@ -220,39 +318,30 @@ int build_reference_db(const char* dms_name,
|
||||
// For each pair
|
||||
for (i=0; i<(matrix_with_lca_view->infos)->line_count; i++)
|
||||
{
|
||||
// Read taxid1 and get taxon1
|
||||
// Read all taxids associated with the first sequence and compute their LCA
|
||||
// Read line index
|
||||
idx1 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx1_column, i, 0);
|
||||
taxid1 = obi_get_int_with_elt_idx_and_col_p_in_view(refs_view, refs_taxid_column, idx1, 0);
|
||||
if (taxid1 == OBIInt_NA)
|
||||
lca_1 = get_lca_from_merged_taxids(refs_view, refs_taxid_column, idx1, merged,
|
||||
tax, taxid_count, taxid_str_indices, taxid_strings);
|
||||
if (lca_1 == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting a taxid when building a reference database, seq %lld in refs view has no taxid associated", idx1);
|
||||
return -1;
|
||||
}
|
||||
taxon1 = obi_taxo_get_taxon_with_taxid(tax, taxid1);
|
||||
if (taxon1 == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting a taxon with taxid %d when building a reference database, seq %lld in refs view", taxid1, idx1);
|
||||
obidebug(1, "\nError getting the last common ancestor of merged taxids when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Read taxid2 and get taxon2
|
||||
// Read all taxids associated with the second sequence and compute their LCA
|
||||
// Read line index
|
||||
idx2 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx2_column, i, 0);
|
||||
taxid2 = obi_get_int_with_elt_idx_and_col_p_in_view(refs_view, refs_taxid_column, idx2, 0);
|
||||
if (taxid2 == OBIInt_NA)
|
||||
lca_2 = get_lca_from_merged_taxids(refs_view, refs_taxid_column, idx2, merged,
|
||||
tax, taxid_count, taxid_str_indices, taxid_strings);
|
||||
if (lca_2 == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting a taxid when building a reference database, seq %lld in refs view has no taxid associated", idx1);
|
||||
return -1;
|
||||
}
|
||||
taxon2 = obi_taxo_get_taxon_with_taxid(tax, taxid2);
|
||||
if (taxon2 == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting a taxon with taxid %d when building a reference database, seq %lld in refs view", taxid2, idx2);
|
||||
obidebug(1, "\nError getting the last common ancestor of merged taxids when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Compute and write their LCA
|
||||
lca = obi_taxo_get_lca(taxon1, taxon2);
|
||||
lca = obi_taxo_get_lca(lca_1, lca_2);
|
||||
if (lca == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting the last common ancestor of two taxa when building a reference database");
|
||||
@ -373,11 +462,11 @@ int build_reference_db(const char* dms_name,
|
||||
score_array_writable_length = score_array_length;
|
||||
|
||||
// Check that lengths are equal (TODO eventually remove?)
|
||||
if (taxid_array_length != score_array_length)
|
||||
{
|
||||
obidebug(1, "\nError building a reference database: LCA taxid and score arrays' lengths are not equal (%d and %d)", taxid_array_length, score_array_length);
|
||||
return -1;
|
||||
}
|
||||
// if (taxid_array_length != score_array_length)
|
||||
// {
|
||||
// obidebug(1, "\nError building a reference database: LCA taxid and score arrays' lengths are not equal (%d and %d)", taxid_array_length, score_array_length);
|
||||
// return -1;
|
||||
// }
|
||||
|
||||
// If empty, add values
|
||||
if (taxid_array_length == 0)
|
||||
@ -420,8 +509,11 @@ int build_reference_db(const char* dms_name,
|
||||
lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1];
|
||||
score_array_writable[k] = score_array_writable[k+1];
|
||||
}
|
||||
taxid_array_writable_length--;
|
||||
score_array_writable_length--;
|
||||
if (k>(j-1))
|
||||
{
|
||||
taxid_array_writable_length--;
|
||||
score_array_writable_length--;
|
||||
}
|
||||
j--;
|
||||
}
|
||||
}
|
||||
@ -456,8 +548,11 @@ int build_reference_db(const char* dms_name,
|
||||
lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1];
|
||||
score_array_writable[k] = score_array_writable[k+1];
|
||||
}
|
||||
taxid_array_writable_length--;
|
||||
score_array_writable_length--;
|
||||
if (k>(j-1))
|
||||
{
|
||||
taxid_array_writable_length--;
|
||||
score_array_writable_length--;
|
||||
}
|
||||
j--;
|
||||
}
|
||||
}
|
||||
@ -484,8 +579,11 @@ int build_reference_db(const char* dms_name,
|
||||
lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1];
|
||||
score_array_writable[k] = score_array_writable[k+1];
|
||||
}
|
||||
taxid_array_writable_length--;
|
||||
score_array_writable_length--;
|
||||
if (k>(j-1))
|
||||
{
|
||||
taxid_array_writable_length--;
|
||||
score_array_writable_length--;
|
||||
}
|
||||
j--;
|
||||
}
|
||||
}
|
||||
@ -516,11 +614,11 @@ int build_reference_db(const char* dms_name,
|
||||
score_array_writable_length = score_array_length;
|
||||
|
||||
// Check that lengths are equal (TODO eventually remove?)
|
||||
if (taxid_array_length != score_array_length)
|
||||
{
|
||||
obidebug(1, "\nError building a reference database: LCA taxid and score arrays' lengths are not equal (%d and %d)", taxid_array_length, score_array_length);
|
||||
return -1;
|
||||
}
|
||||
// if (taxid_array_length != score_array_length)
|
||||
// {
|
||||
// obidebug(1, "\nError building a reference database: LCA taxid and score arrays' lengths are not equal (%d and %d)", taxid_array_length, score_array_length);
|
||||
// return -1;
|
||||
// }
|
||||
|
||||
// If empty, add values
|
||||
if (taxid_array_length == 0)
|
||||
@ -563,8 +661,11 @@ int build_reference_db(const char* dms_name,
|
||||
lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1];
|
||||
score_array_writable[k] = score_array_writable[k+1];
|
||||
}
|
||||
taxid_array_writable_length--;
|
||||
score_array_writable_length--;
|
||||
if (k>(j-1))
|
||||
{
|
||||
taxid_array_writable_length--;
|
||||
score_array_writable_length--;
|
||||
}
|
||||
j--;
|
||||
}
|
||||
}
|
||||
@ -599,8 +700,11 @@ int build_reference_db(const char* dms_name,
|
||||
lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1];
|
||||
score_array_writable[k] = score_array_writable[k+1];
|
||||
}
|
||||
taxid_array_writable_length--;
|
||||
score_array_writable_length--;
|
||||
if (k>(j-1))
|
||||
{
|
||||
taxid_array_writable_length--;
|
||||
score_array_writable_length--;
|
||||
}
|
||||
j--;
|
||||
}
|
||||
}
|
||||
@ -627,8 +731,11 @@ int build_reference_db(const char* dms_name,
|
||||
lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1];
|
||||
score_array_writable[k] = score_array_writable[k+1];
|
||||
}
|
||||
taxid_array_writable_length--;
|
||||
score_array_writable_length--;
|
||||
if (k>(j-1))
|
||||
{
|
||||
taxid_array_writable_length--;
|
||||
score_array_writable_length--;
|
||||
}
|
||||
j--;
|
||||
}
|
||||
}
|
||||
@ -651,6 +758,36 @@ int build_reference_db(const char* dms_name,
|
||||
}
|
||||
}
|
||||
|
||||
// Fill empty LCA informations (because filling from potentially sparse alignment matrix) with the sequence taxid
|
||||
score=1.0; // technically getting LCA of identical sequences
|
||||
for (i=0; i<(o_view->infos)->line_count; i++)
|
||||
{
|
||||
obi_get_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, i, &taxid_array_length);
|
||||
if (taxid_array_length == 0) // no LCA set
|
||||
{
|
||||
lca = get_lca_from_merged_taxids(refs_view, refs_taxid_column, i, merged,
|
||||
tax, taxid_count, taxid_str_indices, taxid_strings);
|
||||
if (lca == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting the last common ancestor of merged taxids when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
|
||||
taxid_lca = lca->taxid;
|
||||
// Set them in output view
|
||||
if (obi_set_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, i, &taxid_lca, (uint8_t) (obi_sizeof(OBI_INT) * 8), 1) < 0)
|
||||
{
|
||||
obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
if (obi_set_array_with_col_p_in_view(o_view, final_score_a_column, i, &score, (uint8_t) (obi_sizeof(OBI_FLOAT) * 8), 1) < 0)
|
||||
{
|
||||
obidebug(1, "\nError setting a score array in a column when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Close views and DMS
|
||||
if (obi_save_and_close_view(refs_view) < 0)
|
||||
{
|
||||
|
Reference in New Issue
Block a user