C: new function to build a reference database with LCA and score

metadata for the taxonomic assignment of sequences
This commit is contained in:
Celine Mercier
2018-11-27 15:56:50 +01:00
parent 730ea99f85
commit f942dd856f
2 changed files with 728 additions and 0 deletions

672
src/build_reference_db.c Executable file
View File

@ -0,0 +1,672 @@
/***********************************************************************************
* Functions to build referece databases for the taxonomic assignment of sequences *
***********************************************************************************/
/**
* @file build_reference_db.c
* @author Celine Mercier (celine.mercier@metabarcoding.org)
* @date November 15th 2018
* @brief Functions to build referece databases for the taxonomic assignment of sequences.
*/
//#define OMP_SUPPORT // TODO
#ifdef OMP_SUPPORT
#include <omp.h>
#endif
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <search.h>
#include <sys/time.h>
#include "build_reference_db.h"
#include "obidms.h"
#include "obidebug.h"
#include "obierrno.h"
#include "obitypes.h"
#include "obiview.h"
#include "obi_lcs.h"
#include "obidms_taxonomy.h"
#include "obidmscolumn_array.h"
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
/**************************************************************************
*
* D E C L A R A T I O N O F T H E P R I V A T E F U N C T I O N S
*
**************************************************************************/
/************************************************************************
*
* D E F I N I T I O N O F T H E P R I V A T E F U N C T I O N S
*
************************************************************************/
/**********************************************************************
*
* D E F I N I T I O N O F T H E P U B L I C F U N C T I O N S
*
**********************************************************************/
int build_reference_db(const char* dms_name,
const char* refs_view_name,
const char* taxonomy_name,
const char* o_view_name,
const char* o_view_comments,
double threshold)
{
OBIDMS_p dms = NULL;
OBIDMS_taxonomy_p tax = NULL;
char* matrix_view_name = NULL; // TODO discuss that it must be unique
char* matrix_with_lca_view_name = NULL; // TODO discuss that it must be unique
Obiview_p matrix_with_lca_view = NULL;
Obiview_p refs_view = NULL;
Obiview_p o_view = NULL;
OBIDMS_column_p matrix_idx1_column = NULL;
OBIDMS_column_p matrix_idx2_column = NULL;
OBIDMS_column_p refs_taxid_column = NULL;
OBIDMS_column_p matrix_lca_taxid_column = NULL;
OBIDMS_column_p matrix_score_column = NULL;
OBIDMS_column_p final_lca_taxid_a_column = NULL;
OBIDMS_column_p final_score_a_column = NULL;
obiint_t taxid1, taxid2, taxid_lca;
ecotx_t* taxon1 = NULL;
ecotx_t* taxon2 = NULL;
ecotx_t* lca = NULL;
index_t idx1, idx2;
index_t i, j, k;
int32_t taxid_array_length;
int32_t score_array_length;
int32_t taxid_array_writable_length;
int32_t score_array_writable_length;
obifloat_t score;
obiint_t* lca_taxid_array;
obifloat_t* score_array;
obiint_t lca_taxid_array_writable[1000];
obifloat_t score_array_writable[1000];
bool modified;
// Discuss keeping the matrix view or not
matrix_view_name = calloc((strlen(o_view_name)+strlen("_matrix")+1), sizeof(char));
if (matrix_view_name == NULL)
{
obidebug(1, "\nError allocating memory for the name of the matrix view when building a reference database");
return -1;
}
matrix_view_name = strcpy(matrix_view_name, o_view_name);
strcat(matrix_view_name, "_matrix");
if (obi_lcs_align_one_column(dms_name,
refs_view_name,
"",
"",
"",
matrix_view_name,
"{}", // TODO proper comments?
false, false,
threshold, true, 0, true,
1) < 0)
{
obidebug(1, "\nError aligning the sequences when building a reference database");
}
// Add a column to the matrix view for LCAs
// Clone the view with a new name
// Build the view name
matrix_with_lca_view_name = calloc((strlen(o_view_name)+strlen("_matrix_with_lca")+1), sizeof(char));
if (matrix_with_lca_view_name == NULL)
{
obidebug(1, "\nError allocating memory for the name of the matrix view with LCA when building a reference database");
return -1;
}
matrix_with_lca_view_name = strcpy(matrix_with_lca_view_name, o_view_name);
strcat(matrix_with_lca_view_name, "_matrix_with_lca");
// Clone the matrix view
// Open the DMS
dms = obi_open_dms(dms_name);
if (dms == NULL)
{
obidebug(1, "\nError opening the DMS when building a reference database");
return -1;
}
// Clone the view
matrix_with_lca_view = obi_clone_view_from_name(dms, matrix_view_name, matrix_with_lca_view_name, NULL, "{}");
if (matrix_with_lca_view == NULL)
{
obidebug(1, "\nError creating the matrix with LCA view when building a reference database");
return -1;
}
// Add the LCA taxid column
if (obi_view_add_column(matrix_with_lca_view,
LCA_TAXID_COLUMN_NAME,
-1,
LCA_TAXID_COLUMN_NAME,
OBI_INT,
-1,
1,
"",
false,
false,
false,
"",
"",
-1,
"{}",
true)
< 0)
{
obidebug(1, "\nError adding the LCA column to the matrix with LCA view when building a reference database");
return -1;
}
// Open the taxonomy
tax = obi_read_taxonomy(dms, taxonomy_name, true);
if (tax == NULL)
{
obidebug(1, "\nError reading the taxonomy when building a reference database");
return -1;
}
// Open the reference sequences view
refs_view = obi_open_view(dms, refs_view_name);
if (refs_view == NULL)
{
obidebug(1, "\nError opening the reference sequences view when building a reference database");
return -1;
}
// Save column pointers
matrix_idx1_column = obi_view_get_column(matrix_with_lca_view, IDX1_COLUMN_NAME);
if (matrix_idx1_column == NULL)
{
obidebug(1, "\nError opening the first index column when building a reference database");
return -1;
}
matrix_idx2_column = obi_view_get_column(matrix_with_lca_view, IDX2_COLUMN_NAME);
if (matrix_idx2_column == NULL)
{
obidebug(1, "\nError opening the second index column when building a reference database");
return -1;
}
refs_taxid_column = obi_view_get_column(refs_view, TAXID_COLUMN);
if (refs_taxid_column == NULL)
{
obidebug(1, "\nError opening the taxid column when building a reference database");
return -1;
}
matrix_lca_taxid_column = obi_view_get_column(matrix_with_lca_view, LCA_TAXID_COLUMN_NAME);
if (matrix_lca_taxid_column == NULL)
{
obidebug(1, "\nError opening the LCA column when building a reference database");
return -1;
}
// Compute all the LCAs
// For each pair
for (i=0; i<(matrix_with_lca_view->infos)->line_count; i++)
{
// Read taxid1 and get taxon1
// Read line index
idx1 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx1_column, i, 0);
taxid1 = obi_get_int_with_elt_idx_and_col_p_in_view(refs_view, refs_taxid_column, idx1, 0);
taxon1 = obi_taxo_get_taxon_with_taxid(tax, taxid1);
if (taxon1 == NULL)
{
obidebug(1, "\nError getting a taxon when building a reference database");
return -1;
}
// Read taxid2 and get taxon2
idx2 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx2_column, i, 0);
taxid2 = obi_get_int_with_elt_idx_and_col_p_in_view(refs_view, refs_taxid_column, idx2, 0);
taxon2 = obi_taxo_get_taxon_with_taxid(tax, taxid2);
if (taxon2 == NULL)
{
obidebug(1, "\nError getting a taxon when building a reference database");
return -1;
}
// Compute and write their LCA
lca = obi_taxo_get_lca(taxon1, taxon2);
if (lca == NULL)
{
obidebug(1, "\nError getting the last common ancestor of two taxa when building a reference database");
return -1;
}
taxid_lca = lca->taxid;
if (obi_set_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_lca_taxid_column, i, 0, taxid_lca) < 0)
{
obidebug(1, "\nError writing the last common ancestor of two taxa when building a reference database");
return -1;
}
}
// Clone refs view, add 2 arrays columns for lca and score, compute and write them
// Clone refs view
o_view = obi_clone_view(dms, refs_view, o_view_name, NULL, o_view_comments);
if (o_view == NULL)
{
obidebug(1, "\nError cloning the view of references when building a reference database");
return -1;
}
// Add the LCA taxid array column
if (obi_view_add_column(o_view,
LCA_TAXID_ARRAY_COLUMN_NAME,
-1,
LCA_TAXID_ARRAY_COLUMN_NAME,
OBI_INT,
-1,
1,
"",
false,
true,
false,
"",
"",
-1,
"{}",
true)
< 0)
{
obidebug(1, "\nError adding the LCA taxid column to the final view when building a reference database");
return -1;
}
// Add the score array column
if (obi_view_add_column(o_view,
LCA_SCORE_ARRAY_COLUMN_NAME,
-1,
LCA_SCORE_ARRAY_COLUMN_NAME,
OBI_FLOAT,
-1,
1,
"",
false,
true,
false,
"",
"",
-1,
"{}",
true)
< 0)
{
obidebug(1, "\nError adding the score column to the final view when building a reference database");
return -1;
}
// Open the newly added columns
final_lca_taxid_a_column = obi_view_get_column(o_view, LCA_TAXID_ARRAY_COLUMN_NAME);
if (final_lca_taxid_a_column == NULL)
{
obidebug(1, "\nError opening the LCA taxid array column when building a reference database");
return -1;
}
final_score_a_column = obi_view_get_column(o_view, LCA_SCORE_ARRAY_COLUMN_NAME);
if (final_score_a_column == NULL)
{
obidebug(1, "\nError opening the score array column when building a reference database");
return -1;
}
// Open alignment score column
matrix_score_column = obi_view_get_column(matrix_with_lca_view, SCORE_COLUMN_NAME);
if (matrix_score_column == NULL)
{
obidebug(1, "\nError opening the alignment score column when building a reference database");
return -1;
}
// For each sequence, look for all its alignments in the matrix, and for each different LCA taxid/score, order them and write them
// Going through matrix once, filling refs arrays on the go for efficiency
for (i=0; i<(matrix_with_lca_view->infos)->line_count; i++)
{
// Read ref line indexes
idx1 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx1_column, i, 0);
idx2 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx2_column, i, 0);
// Read LCA taxid
taxid_lca = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_lca_taxid_column, i, 0);
// Get LCA taxon
lca = obi_taxo_get_taxon_with_taxid(tax, taxid_lca);
if (lca == NULL)
{
obidebug(1, "\nError getting a LCA from taxid when building a reference database");
return -1;
}
// Read alignment score
score = obi_get_float_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_score_column, i, 0);
///////////////// Compute for first sequence \\\\\\\\\\\\\\\\\\\\\\\
// Read arrays
lca_taxid_array = (obiint_t*) obi_column_get_array(final_lca_taxid_a_column, idx1, &taxid_array_length);
score_array = (obifloat_t*) obi_column_get_array(final_score_a_column, idx1, &score_array_length);
taxid_array_writable_length = taxid_array_length;
score_array_writable_length = score_array_length;
// Check that lengths are equal (TODO eventually remove?)
if (taxid_array_length != score_array_length)
{
obidebug(1, "\nError building a reference database: LCA taxid and score arrays' lengths are not equal (%d and %d)", taxid_array_length, score_array_length);
return -1;
}
// If empty, add values
if (taxid_array_length == 0)
{
if (obi_column_set_array(final_lca_taxid_a_column, idx1, &taxid_lca, (uint8_t) obi_sizeof(OBI_INT), 1) < 0)
{
obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database");
return -1;
}
if (obi_column_set_array(final_score_a_column, idx1, &score, (uint8_t) obi_sizeof(OBI_FLOAT), 1) < 0)
{
obidebug(1, "\nError setting a score array in a column when building a reference database");
return -1;
}
}
else
{
j = 0;
modified = false;
while (j < taxid_array_length)
{
if (taxid_lca == lca_taxid_array[j]) // Same LCA taxid: replace if the similarity score is greater
{
if (score > score_array[j])
{
// Copy in array to make writable
memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t));
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
modified = true;
// Better score for the same LCA, replace this LCA/score pair
lca_taxid_array_writable[j] = taxid_lca;
score_array_writable[j] = score;
// Remove the previous (children) LCAs from the array if their score is equal or lower
while ((j>0) && (score_array_writable[j-1] <= score))
{
for (k=j-1; k<taxid_array_writable_length-1; k++)
{
lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1];
score_array_writable[k] = score_array_writable[k+1];
}
taxid_array_writable_length--;
score_array_writable_length--;
j--;
}
}
break;
}
else if (obi_taxo_is_taxon_under_taxid(lca, lca_taxid_array[j])) // Array LCA is a parent LCA
{
if (score > score_array[j])
{
memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t));
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
modified = true;
// Insert new LCA/score pair
for (k=taxid_array_writable_length; k>=j+1; k--)
{
lca_taxid_array_writable[k] = lca_taxid_array_writable[k-1];
score_array_writable[k] = score_array_writable[k-1];
}
taxid_array_writable_length++;
score_array_writable_length++;
lca_taxid_array_writable[j] = taxid_lca;
score_array_writable[j] = score;
// Remove the previous (children) LCAs from the array if their score is equal or lower
while ((j>0) && (score_array_writable[j-1] <= score))
{
for (k=j-1; k<taxid_array_writable_length-1; k++)
{
lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1];
score_array_writable[k] = score_array_writable[k+1];
}
taxid_array_writable_length--;
score_array_writable_length--;
j--;
}
}
break;
}
j++;
}
if (j == taxid_array_length) // same or parent LCA not found, need to be appended at the end
{
memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t));
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
modified = true;
// Append LCA
lca_taxid_array_writable[taxid_array_writable_length] = taxid_lca;
score_array_writable[score_array_writable_length] = score;
// Remove the previous (children) LCAs from the array if their score is equal or lower
while ((j>0) && (score_array_writable[j-1] <= score))
{
for (k=j-1; k<taxid_array_writable_length-1; k++)
{
lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1];
score_array_writable[k] = score_array_writable[k+1];
}
taxid_array_writable_length--;
score_array_writable_length--;
j--;
}
}
// Write new arrays
if (modified)
{
if (obi_column_set_array(final_lca_taxid_a_column, idx1, lca_taxid_array_writable, (uint8_t) obi_sizeof(OBI_INT), taxid_array_writable_length) < 0)
{
obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database");
return -1;
}
if (obi_column_set_array(final_score_a_column, idx1, score_array_writable, (uint8_t) obi_sizeof(OBI_FLOAT), score_array_writable_length) < 0)
{
obidebug(1, "\nError setting a score array in a column when building a reference database");
return -1;
}
}
}
///////////////// Compute for second sequence \\\\\\\\\\\\\\\\\\\\\\\
// Read arrays
lca_taxid_array = (obiint_t*) obi_column_get_array(final_lca_taxid_a_column, idx2, &taxid_array_length);
score_array = (obifloat_t*) obi_column_get_array(final_score_a_column, idx2, &score_array_length);
taxid_array_writable_length = taxid_array_length;
score_array_writable_length = score_array_length;
// Check that lengths are equal (TODO eventually remove?)
if (taxid_array_length != score_array_length)
{
obidebug(1, "\nError building a reference database: LCA taxid and score arrays' lengths are not equal (%d and %d)", taxid_array_length, score_array_length);
return -1;
}
// If empty, add values
if (taxid_array_length == 0)
{
if (obi_column_set_array(final_lca_taxid_a_column, idx2, &taxid_lca, (uint8_t) obi_sizeof(OBI_INT), 1) < 0)
{
obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database");
return -1;
}
if (obi_column_set_array(final_score_a_column, idx2, &score, (uint8_t) obi_sizeof(OBI_FLOAT), 1) < 0)
{
obidebug(1, "\nError setting a score array in a column when building a reference database");
return -1;
}
}
else
{
j = 0;
modified = false;
while (j < taxid_array_length)
{
if (taxid_lca == lca_taxid_array[j]) // Same LCA taxid: replace if the similarity score is greater
{
if (score > score_array[j])
{
// Copy in array to make writable
memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t));
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
modified = true;
// Better score for the same LCA, replace this LCA/score pair
lca_taxid_array_writable[j] = taxid_lca;
score_array_writable[j] = score;
// Remove the previous (children) LCAs from the array if their score is equal or lower
while ((j>0) && (score_array_writable[j-1] <= score))
{
for (k=j-1; k<taxid_array_writable_length-1; k++)
{
lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1];
score_array_writable[k] = score_array_writable[k+1];
}
taxid_array_writable_length--;
score_array_writable_length--;
j--;
}
}
break;
}
else if (obi_taxo_is_taxon_under_taxid(lca, lca_taxid_array[j])) // Array LCA is a parent LCA
{
if (score > score_array[j])
{
memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t));
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
modified = true;
// Insert new LCA/score pair
for (k=taxid_array_writable_length; k>=j+1; k--)
{
lca_taxid_array_writable[k] = lca_taxid_array_writable[k-1];
score_array_writable[k] = score_array_writable[k-1];
}
taxid_array_writable_length++;
score_array_writable_length++;
lca_taxid_array_writable[j] = taxid_lca;
score_array_writable[j] = score;
// Remove the previous (children) LCAs from the array if their score is equal or lower
while ((j>0) && (score_array_writable[j-1] <= score))
{
for (k=j-1; k<taxid_array_writable_length-1; k++)
{
lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1];
score_array_writable[k] = score_array_writable[k+1];
}
taxid_array_writable_length--;
score_array_writable_length--;
j--;
}
}
break;
}
j++;
}
if (j == taxid_array_length) // same or parent LCA not found, need to be appended at the end
{
memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t));
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
modified = true;
// Append LCA
lca_taxid_array_writable[taxid_array_writable_length] = taxid_lca;
score_array_writable[score_array_writable_length] = score;
// Remove the previous (children) LCAs from the array if their score is equal or lower
while ((j>0) && (score_array_writable[j-1] <= score))
{
for (k=j-1; k<taxid_array_writable_length-1; k++)
{
lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1];
score_array_writable[k] = score_array_writable[k+1];
}
taxid_array_writable_length--;
score_array_writable_length--;
j--;
}
}
// Write new arrays
// Copy arrays for modification (can't edit in place, as it is stored in indexer data file)
if (modified)
{
if (obi_column_set_array(final_lca_taxid_a_column, idx2, lca_taxid_array_writable, (uint8_t) obi_sizeof(OBI_INT), taxid_array_writable_length) < 0)
{
obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database");
return -1;
}
if (obi_column_set_array(final_score_a_column, idx2, score_array_writable, (uint8_t) obi_sizeof(OBI_FLOAT), score_array_writable_length) < 0)
{
obidebug(1, "\nError setting a score array in a column when building a reference database");
return -1;
}
}
}
}
// Close views and DMS
if (obi_save_and_close_view(refs_view) < 0)
{
obidebug(1, "\nError closing the reference view after building a reference database");
return -1;
}
if (obi_save_and_close_view(matrix_with_lca_view) < 0)
{
obidebug(1, "\nError closing the matrix with LCA view after building a reference database");
return -1;
}
if (obi_save_and_close_view(o_view) < 0)
{
obidebug(1, "\nError closing the final view after building a reference database");
return -1;
}
if (obi_close_dms(dms, false) < 0)
{
obidebug(1, "\nError closing the DMS after building a reference database");
return -1;
}
// Free everything
free(matrix_view_name);
free(matrix_with_lca_view_name);
fprintf(stderr,"\rDone : 100 %% ");
return 0;
}

56
src/build_reference_db.h Executable file
View File

@ -0,0 +1,56 @@
/****************************************************************************************************
* Header file for functions to build reference databases for the taxonomic assignment of sequences *
****************************************************************************************************/
/**
* @file build_reference_db.h
* @author Celine Mercier (celine.mercier@metabarcoding.org)
* @date November 15th 2018
* @brief Header file for the functions for the taxonomic assignment of sequences.
*/
#ifndef BUILD_REFERENCE_DB_H_
#define BUILD_REFERENCE_DB_H_
#include <stdlib.h>
#include <stdio.h>
#define LCA_TAXID_COLUMN_NAME "LCA_TAXID"
#define LCA_TAXID_ARRAY_COLUMN_NAME "LCA_TAXID"
#define LCA_SCORE_ARRAY_COLUMN_NAME "LCA_SCORE"
/**
* @brief Building of reference databases for the taxonomic assignment of sequences.
*
* Note: The columns where the results are written are automatically named and created.
*
* @param dms_name The name of the DMS.
* @param refs_view_name The name of the view containing the reference sequences annotated with their taxids.
* @param taxonomy_name The name of the taxonomy stored in the DMS.
* @param o_view_name The name of the final reference database to create.
* @param o_view_comments The comments to associate with the final reference database to create.
* @param threshold The threshold (similarity score in identity percentage) at which the database should be created.
* The output database contains the Lowest Common Ancestor and associated highest similarity score for each
* sequence with the sequences with a similarity equal or greater than this threshold.
*
* @returns A value indicating the success of the operation.
* @retval 0 if the operation was successfully completed.
* @retval -1 if an error occurred.
*
* @since November 2018
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
int build_reference_db(const char* dms_name,
const char* refs_view_name,
const char* taxonomy_name,
const char* o_view_name,
const char* o_view_comments,
double threshold);
#endif /* BUILD_REFERENCE_DB_H_ */