fixes #61 : now reading merged taxids information when building a

reference database
This commit is contained in:
Celine Mercier
2019-08-19 12:30:56 +02:00
parent af7cecf59f
commit ae5f42c260

View File

@ -40,6 +40,27 @@
* *
**************************************************************************/ **************************************************************************/
/**
* Internal function computing the last common ancestor (LCA) of a list of merged taxids (generated by obi uniq).
* Also works on just simple taxid (returns the associated taxon).
*
* @param view A pointer on the view containing the taxid information (merged or not).
* @param taxid_column A pointer on the column where taxids are kept (merged or not, aka int array or int columns).
* @param idx The index of the sequence to compute the LCA of in the view.
* @param merged Whether the taxid information is made of arrays of taxids or a single taxid.
* @param tax A pointer on a taxonomy structure.
* @param taxid_count The maximum number of taxids associated with one sequence (aka the number of elements in taxid_column).
* @param taxid_str_indices A pointer on the list of indices in taxid_strings referring to the taxids stored as strings (see next param).
* @param taxid_strings A pointer on the list of taxids stored as strings in the taxid column header (they correspond to the element names).
*
* @returns A pointer on the LCA taxon.
* @retval NULL if an error occurred.
*
* @since August 2019
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
static inline ecotx_t* get_lca_from_merged_taxids(Obiview_p view, OBIDMS_column_p taxid_column, index_t idx, bool merged,
OBIDMS_taxonomy_p tax, index_t taxid_count, int64_t* taxid_str_indices, char* taxid_strings);
/************************************************************************ /************************************************************************
@ -49,6 +70,54 @@
************************************************************************/ ************************************************************************/
static inline ecotx_t* get_lca_from_merged_taxids(Obiview_p view, OBIDMS_column_p taxid_column, index_t idx, bool merged,
OBIDMS_taxonomy_p tax, index_t taxid_count, int64_t* taxid_str_indices, char* taxid_strings)
{
ecotx_t* taxon = NULL;
ecotx_t* lca = NULL;
int32_t taxid;
index_t taxid_idx;
int64_t taxid_str_idx;
char* taxid_str;
obiint_t n;
for (taxid_idx=0; taxid_idx<taxid_count; taxid_idx++) // get lca of all taxids associated with the sequence
{
n = obi_get_int_with_elt_idx_and_col_p_in_view(view, taxid_column, idx, taxid_idx);
if (n != OBIInt_NA) // The taxid of this column is associated with this sequence
{
if (merged)
{
taxid_str_idx = taxid_str_indices[taxid_idx];
taxid_str = taxid_strings+taxid_str_idx;
taxid = atoi(taxid_str);
}
else
taxid = n;
taxon = obi_taxo_get_taxon_with_taxid(tax, taxid);
if (taxon == NULL)
{
obidebug(1, "\nError getting a taxon with taxid %d when building a reference database, seq %lld in refs view", taxid, idx);
return NULL;
}
if (lca == NULL)
lca = taxon;
else
{
// Compute LCA
lca = obi_taxo_get_lca(taxon, lca);
if (lca == NULL)
{
obidebug(1, "\nError getting the last common ancestor of two taxa when building a reference database");
return NULL;
}
}
}
}
return lca;
}
/********************************************************************** /**********************************************************************
* *
@ -78,9 +147,12 @@ int build_reference_db(const char* dms_name,
OBIDMS_column_p matrix_score_column = NULL; OBIDMS_column_p matrix_score_column = NULL;
OBIDMS_column_p final_lca_taxid_a_column = NULL; OBIDMS_column_p final_lca_taxid_a_column = NULL;
OBIDMS_column_p final_score_a_column = NULL; OBIDMS_column_p final_score_a_column = NULL;
obiint_t taxid1, taxid2, taxid_lca; char* taxid_strings;
ecotx_t* taxon1 = NULL; int64_t* taxid_str_indices;
ecotx_t* taxon2 = NULL; index_t taxid_count;
obiint_t taxid_lca;
ecotx_t* lca_1 = NULL;
ecotx_t* lca_2 = NULL;
ecotx_t* lca = NULL; ecotx_t* lca = NULL;
index_t idx1, idx2; index_t idx1, idx2;
index_t i, j, k; index_t i, j, k;
@ -94,6 +166,7 @@ int build_reference_db(const char* dms_name,
obiint_t lca_taxid_array_writable[1000]; obiint_t lca_taxid_array_writable[1000];
obifloat_t score_array_writable[1000]; obifloat_t score_array_writable[1000];
bool modified; bool modified;
bool merged;
// Discuss keeping the matrix view or not // Discuss keeping the matrix view or not
matrix_view_name = calloc((strlen(o_view_name)+strlen("_matrix")+1), sizeof(char)); matrix_view_name = calloc((strlen(o_view_name)+strlen("_matrix")+1), sizeof(char));
@ -203,12 +276,37 @@ int build_reference_db(const char* dms_name,
obidebug(1, "\nError opening the second index column when building a reference database"); obidebug(1, "\nError opening the second index column when building a reference database");
return -1; return -1;
} }
refs_taxid_column = obi_view_get_column(refs_view, TAXID_COLUMN); if (obi_view_column_exists(refs_view, MERGED_TAXID_COLUMN))
{
refs_taxid_column = obi_view_get_column(refs_view, MERGED_TAXID_COLUMN);
merged = true;
}
else
{
refs_taxid_column = obi_view_get_column(refs_view, TAXID_COLUMN);
merged = false;
}
if (refs_taxid_column == NULL) if (refs_taxid_column == NULL)
{ {
obidebug(1, "\nError opening the taxid column when building a reference database"); obidebug(1, "\nError opening the taxid column when building a reference database");
return -1; return -1;
} }
// Check that the refs taxid column doesn't contain character strings to parse
// (happens with obi uniq when there is a lot of elements per line, see options to set the limit,
// parsing of those strings in C not implemented yet)
if ((refs_taxid_column->header)->to_eval)
{
obidebug(1, "\nError opening the column containing the taxids of the reference sequences when building a reference database: "
"run previous obi uniq with a higher threshold for the --max-elts option while waiting for the implementation of this feature");
return -1;
}
// Get the (maximum) number of taxids associated with a sequence
taxid_count = (refs_taxid_column->header)->nb_elements_per_line;
// Get pointers on element names (aka taxids) and their start indices for faster access
taxid_strings = (refs_taxid_column->header)->elements_names;
taxid_str_indices = (refs_taxid_column->header)->elements_names_idx;
matrix_lca_taxid_column = obi_view_get_column(matrix_with_lca_view, LCA_TAXID_COLUMN_NAME); matrix_lca_taxid_column = obi_view_get_column(matrix_with_lca_view, LCA_TAXID_COLUMN_NAME);
if (matrix_lca_taxid_column == NULL) if (matrix_lca_taxid_column == NULL)
{ {
@ -220,39 +318,30 @@ int build_reference_db(const char* dms_name,
// For each pair // For each pair
for (i=0; i<(matrix_with_lca_view->infos)->line_count; i++) for (i=0; i<(matrix_with_lca_view->infos)->line_count; i++)
{ {
// Read taxid1 and get taxon1 // Read all taxids associated with the first sequence and compute their LCA
// Read line index // Read line index
idx1 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx1_column, i, 0); idx1 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx1_column, i, 0);
taxid1 = obi_get_int_with_elt_idx_and_col_p_in_view(refs_view, refs_taxid_column, idx1, 0); lca_1 = get_lca_from_merged_taxids(refs_view, refs_taxid_column, idx1, merged,
if (taxid1 == OBIInt_NA) tax, taxid_count, taxid_str_indices, taxid_strings);
if (lca_1 == NULL)
{ {
obidebug(1, "\nError getting a taxid when building a reference database, seq %lld in refs view has no taxid associated", idx1); obidebug(1, "\nError getting the last common ancestor of merged taxids when building a reference database");
return -1;
}
taxon1 = obi_taxo_get_taxon_with_taxid(tax, taxid1);
if (taxon1 == NULL)
{
obidebug(1, "\nError getting a taxon with taxid %d when building a reference database, seq %lld in refs view", taxid1, idx1);
return -1; return -1;
} }
// Read taxid2 and get taxon2 // Read all taxids associated with the second sequence and compute their LCA
// Read line index
idx2 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx2_column, i, 0); idx2 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx2_column, i, 0);
taxid2 = obi_get_int_with_elt_idx_and_col_p_in_view(refs_view, refs_taxid_column, idx2, 0); lca_2 = get_lca_from_merged_taxids(refs_view, refs_taxid_column, idx2, merged,
if (taxid2 == OBIInt_NA) tax, taxid_count, taxid_str_indices, taxid_strings);
if (lca_2 == NULL)
{ {
obidebug(1, "\nError getting a taxid when building a reference database, seq %lld in refs view has no taxid associated", idx1); obidebug(1, "\nError getting the last common ancestor of merged taxids when building a reference database");
return -1;
}
taxon2 = obi_taxo_get_taxon_with_taxid(tax, taxid2);
if (taxon2 == NULL)
{
obidebug(1, "\nError getting a taxon with taxid %d when building a reference database, seq %lld in refs view", taxid2, idx2);
return -1; return -1;
} }
// Compute and write their LCA // Compute and write their LCA
lca = obi_taxo_get_lca(taxon1, taxon2); lca = obi_taxo_get_lca(lca_1, lca_2);
if (lca == NULL) if (lca == NULL)
{ {
obidebug(1, "\nError getting the last common ancestor of two taxa when building a reference database"); obidebug(1, "\nError getting the last common ancestor of two taxa when building a reference database");
@ -373,11 +462,11 @@ int build_reference_db(const char* dms_name,
score_array_writable_length = score_array_length; score_array_writable_length = score_array_length;
// Check that lengths are equal (TODO eventually remove?) // Check that lengths are equal (TODO eventually remove?)
if (taxid_array_length != score_array_length) // if (taxid_array_length != score_array_length)
{ // {
obidebug(1, "\nError building a reference database: LCA taxid and score arrays' lengths are not equal (%d and %d)", taxid_array_length, score_array_length); // obidebug(1, "\nError building a reference database: LCA taxid and score arrays' lengths are not equal (%d and %d)", taxid_array_length, score_array_length);
return -1; // return -1;
} // }
// If empty, add values // If empty, add values
if (taxid_array_length == 0) if (taxid_array_length == 0)
@ -420,8 +509,11 @@ int build_reference_db(const char* dms_name,
lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1]; lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1];
score_array_writable[k] = score_array_writable[k+1]; score_array_writable[k] = score_array_writable[k+1];
} }
taxid_array_writable_length--; if (k>(j-1))
score_array_writable_length--; {
taxid_array_writable_length--;
score_array_writable_length--;
}
j--; j--;
} }
} }
@ -456,8 +548,11 @@ int build_reference_db(const char* dms_name,
lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1]; lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1];
score_array_writable[k] = score_array_writable[k+1]; score_array_writable[k] = score_array_writable[k+1];
} }
taxid_array_writable_length--; if (k>(j-1))
score_array_writable_length--; {
taxid_array_writable_length--;
score_array_writable_length--;
}
j--; j--;
} }
} }
@ -484,8 +579,11 @@ int build_reference_db(const char* dms_name,
lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1]; lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1];
score_array_writable[k] = score_array_writable[k+1]; score_array_writable[k] = score_array_writable[k+1];
} }
taxid_array_writable_length--; if (k>(j-1))
score_array_writable_length--; {
taxid_array_writable_length--;
score_array_writable_length--;
}
j--; j--;
} }
} }
@ -516,11 +614,11 @@ int build_reference_db(const char* dms_name,
score_array_writable_length = score_array_length; score_array_writable_length = score_array_length;
// Check that lengths are equal (TODO eventually remove?) // Check that lengths are equal (TODO eventually remove?)
if (taxid_array_length != score_array_length) // if (taxid_array_length != score_array_length)
{ // {
obidebug(1, "\nError building a reference database: LCA taxid and score arrays' lengths are not equal (%d and %d)", taxid_array_length, score_array_length); // obidebug(1, "\nError building a reference database: LCA taxid and score arrays' lengths are not equal (%d and %d)", taxid_array_length, score_array_length);
return -1; // return -1;
} // }
// If empty, add values // If empty, add values
if (taxid_array_length == 0) if (taxid_array_length == 0)
@ -563,8 +661,11 @@ int build_reference_db(const char* dms_name,
lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1]; lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1];
score_array_writable[k] = score_array_writable[k+1]; score_array_writable[k] = score_array_writable[k+1];
} }
taxid_array_writable_length--; if (k>(j-1))
score_array_writable_length--; {
taxid_array_writable_length--;
score_array_writable_length--;
}
j--; j--;
} }
} }
@ -599,8 +700,11 @@ int build_reference_db(const char* dms_name,
lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1]; lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1];
score_array_writable[k] = score_array_writable[k+1]; score_array_writable[k] = score_array_writable[k+1];
} }
taxid_array_writable_length--; if (k>(j-1))
score_array_writable_length--; {
taxid_array_writable_length--;
score_array_writable_length--;
}
j--; j--;
} }
} }
@ -627,8 +731,11 @@ int build_reference_db(const char* dms_name,
lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1]; lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1];
score_array_writable[k] = score_array_writable[k+1]; score_array_writable[k] = score_array_writable[k+1];
} }
taxid_array_writable_length--; if (k>(j-1))
score_array_writable_length--; {
taxid_array_writable_length--;
score_array_writable_length--;
}
j--; j--;
} }
} }
@ -651,6 +758,36 @@ int build_reference_db(const char* dms_name,
} }
} }
// Fill empty LCA informations (because filling from potentially sparse alignment matrix) with the sequence taxid
score=1.0; // technically getting LCA of identical sequences
for (i=0; i<(o_view->infos)->line_count; i++)
{
obi_get_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, i, &taxid_array_length);
if (taxid_array_length == 0) // no LCA set
{
lca = get_lca_from_merged_taxids(refs_view, refs_taxid_column, i, merged,
tax, taxid_count, taxid_str_indices, taxid_strings);
if (lca == NULL)
{
obidebug(1, "\nError getting the last common ancestor of merged taxids when building a reference database");
return -1;
}
taxid_lca = lca->taxid;
// Set them in output view
if (obi_set_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, i, &taxid_lca, (uint8_t) (obi_sizeof(OBI_INT) * 8), 1) < 0)
{
obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database");
return -1;
}
if (obi_set_array_with_col_p_in_view(o_view, final_score_a_column, i, &score, (uint8_t) (obi_sizeof(OBI_FLOAT) * 8), 1) < 0)
{
obidebug(1, "\nError setting a score array in a column when building a reference database");
return -1;
}
}
}
// Close views and DMS // Close views and DMS
if (obi_save_and_close_view(refs_view) < 0) if (obi_save_and_close_view(refs_view) < 0)
{ {