C: new function to build a reference database with LCA and score
metadata for the taxonomic assignment of sequences
This commit is contained in:
672
src/build_reference_db.c
Executable file
672
src/build_reference_db.c
Executable file
@ -0,0 +1,672 @@
|
||||
/***********************************************************************************
|
||||
* Functions to build referece databases for the taxonomic assignment of sequences *
|
||||
***********************************************************************************/
|
||||
|
||||
/**
|
||||
* @file build_reference_db.c
|
||||
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||
* @date November 15th 2018
|
||||
* @brief Functions to build referece databases for the taxonomic assignment of sequences.
|
||||
*/
|
||||
|
||||
//#define OMP_SUPPORT // TODO
|
||||
#ifdef OMP_SUPPORT
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <stdbool.h>
|
||||
#include <search.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
#include "build_reference_db.h"
|
||||
#include "obidms.h"
|
||||
#include "obidebug.h"
|
||||
#include "obierrno.h"
|
||||
#include "obitypes.h"
|
||||
#include "obiview.h"
|
||||
#include "obi_lcs.h"
|
||||
#include "obidms_taxonomy.h"
|
||||
#include "obidmscolumn_array.h"
|
||||
|
||||
|
||||
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
|
||||
|
||||
|
||||
/**************************************************************************
|
||||
*
|
||||
* D E C L A R A T I O N O F T H E P R I V A T E F U N C T I O N S
|
||||
*
|
||||
**************************************************************************/
|
||||
|
||||
|
||||
|
||||
/************************************************************************
|
||||
*
|
||||
* D E F I N I T I O N O F T H E P R I V A T E F U N C T I O N S
|
||||
*
|
||||
************************************************************************/
|
||||
|
||||
|
||||
|
||||
/**********************************************************************
|
||||
*
|
||||
* D E F I N I T I O N O F T H E P U B L I C F U N C T I O N S
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
|
||||
int build_reference_db(const char* dms_name,
|
||||
const char* refs_view_name,
|
||||
const char* taxonomy_name,
|
||||
const char* o_view_name,
|
||||
const char* o_view_comments,
|
||||
double threshold)
|
||||
{
|
||||
OBIDMS_p dms = NULL;
|
||||
OBIDMS_taxonomy_p tax = NULL;
|
||||
char* matrix_view_name = NULL; // TODO discuss that it must be unique
|
||||
char* matrix_with_lca_view_name = NULL; // TODO discuss that it must be unique
|
||||
Obiview_p matrix_with_lca_view = NULL;
|
||||
Obiview_p refs_view = NULL;
|
||||
Obiview_p o_view = NULL;
|
||||
OBIDMS_column_p matrix_idx1_column = NULL;
|
||||
OBIDMS_column_p matrix_idx2_column = NULL;
|
||||
OBIDMS_column_p refs_taxid_column = NULL;
|
||||
OBIDMS_column_p matrix_lca_taxid_column = NULL;
|
||||
OBIDMS_column_p matrix_score_column = NULL;
|
||||
OBIDMS_column_p final_lca_taxid_a_column = NULL;
|
||||
OBIDMS_column_p final_score_a_column = NULL;
|
||||
obiint_t taxid1, taxid2, taxid_lca;
|
||||
ecotx_t* taxon1 = NULL;
|
||||
ecotx_t* taxon2 = NULL;
|
||||
ecotx_t* lca = NULL;
|
||||
index_t idx1, idx2;
|
||||
index_t i, j, k;
|
||||
int32_t taxid_array_length;
|
||||
int32_t score_array_length;
|
||||
int32_t taxid_array_writable_length;
|
||||
int32_t score_array_writable_length;
|
||||
obifloat_t score;
|
||||
obiint_t* lca_taxid_array;
|
||||
obifloat_t* score_array;
|
||||
obiint_t lca_taxid_array_writable[1000];
|
||||
obifloat_t score_array_writable[1000];
|
||||
bool modified;
|
||||
|
||||
// Discuss keeping the matrix view or not
|
||||
matrix_view_name = calloc((strlen(o_view_name)+strlen("_matrix")+1), sizeof(char));
|
||||
if (matrix_view_name == NULL)
|
||||
{
|
||||
obidebug(1, "\nError allocating memory for the name of the matrix view when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
matrix_view_name = strcpy(matrix_view_name, o_view_name);
|
||||
strcat(matrix_view_name, "_matrix");
|
||||
|
||||
if (obi_lcs_align_one_column(dms_name,
|
||||
refs_view_name,
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
matrix_view_name,
|
||||
"{}", // TODO proper comments?
|
||||
false, false,
|
||||
threshold, true, 0, true,
|
||||
1) < 0)
|
||||
{
|
||||
obidebug(1, "\nError aligning the sequences when building a reference database");
|
||||
}
|
||||
|
||||
// Add a column to the matrix view for LCAs
|
||||
|
||||
// Clone the view with a new name
|
||||
|
||||
// Build the view name
|
||||
matrix_with_lca_view_name = calloc((strlen(o_view_name)+strlen("_matrix_with_lca")+1), sizeof(char));
|
||||
if (matrix_with_lca_view_name == NULL)
|
||||
{
|
||||
obidebug(1, "\nError allocating memory for the name of the matrix view with LCA when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
matrix_with_lca_view_name = strcpy(matrix_with_lca_view_name, o_view_name);
|
||||
strcat(matrix_with_lca_view_name, "_matrix_with_lca");
|
||||
|
||||
// Clone the matrix view
|
||||
// Open the DMS
|
||||
dms = obi_open_dms(dms_name);
|
||||
if (dms == NULL)
|
||||
{
|
||||
obidebug(1, "\nError opening the DMS when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Clone the view
|
||||
matrix_with_lca_view = obi_clone_view_from_name(dms, matrix_view_name, matrix_with_lca_view_name, NULL, "{}");
|
||||
if (matrix_with_lca_view == NULL)
|
||||
{
|
||||
obidebug(1, "\nError creating the matrix with LCA view when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Add the LCA taxid column
|
||||
if (obi_view_add_column(matrix_with_lca_view,
|
||||
LCA_TAXID_COLUMN_NAME,
|
||||
-1,
|
||||
LCA_TAXID_COLUMN_NAME,
|
||||
OBI_INT,
|
||||
-1,
|
||||
1,
|
||||
"",
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
"",
|
||||
"",
|
||||
-1,
|
||||
"{}",
|
||||
true)
|
||||
< 0)
|
||||
{
|
||||
obidebug(1, "\nError adding the LCA column to the matrix with LCA view when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Open the taxonomy
|
||||
tax = obi_read_taxonomy(dms, taxonomy_name, true);
|
||||
if (tax == NULL)
|
||||
{
|
||||
obidebug(1, "\nError reading the taxonomy when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Open the reference sequences view
|
||||
refs_view = obi_open_view(dms, refs_view_name);
|
||||
if (refs_view == NULL)
|
||||
{
|
||||
obidebug(1, "\nError opening the reference sequences view when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Save column pointers
|
||||
matrix_idx1_column = obi_view_get_column(matrix_with_lca_view, IDX1_COLUMN_NAME);
|
||||
if (matrix_idx1_column == NULL)
|
||||
{
|
||||
obidebug(1, "\nError opening the first index column when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
matrix_idx2_column = obi_view_get_column(matrix_with_lca_view, IDX2_COLUMN_NAME);
|
||||
if (matrix_idx2_column == NULL)
|
||||
{
|
||||
obidebug(1, "\nError opening the second index column when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
refs_taxid_column = obi_view_get_column(refs_view, TAXID_COLUMN);
|
||||
if (refs_taxid_column == NULL)
|
||||
{
|
||||
obidebug(1, "\nError opening the taxid column when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
matrix_lca_taxid_column = obi_view_get_column(matrix_with_lca_view, LCA_TAXID_COLUMN_NAME);
|
||||
if (matrix_lca_taxid_column == NULL)
|
||||
{
|
||||
obidebug(1, "\nError opening the LCA column when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Compute all the LCAs
|
||||
// For each pair
|
||||
for (i=0; i<(matrix_with_lca_view->infos)->line_count; i++)
|
||||
{
|
||||
// Read taxid1 and get taxon1
|
||||
// Read line index
|
||||
idx1 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx1_column, i, 0);
|
||||
taxid1 = obi_get_int_with_elt_idx_and_col_p_in_view(refs_view, refs_taxid_column, idx1, 0);
|
||||
taxon1 = obi_taxo_get_taxon_with_taxid(tax, taxid1);
|
||||
if (taxon1 == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting a taxon when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Read taxid2 and get taxon2
|
||||
idx2 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx2_column, i, 0);
|
||||
taxid2 = obi_get_int_with_elt_idx_and_col_p_in_view(refs_view, refs_taxid_column, idx2, 0);
|
||||
taxon2 = obi_taxo_get_taxon_with_taxid(tax, taxid2);
|
||||
if (taxon2 == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting a taxon when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Compute and write their LCA
|
||||
lca = obi_taxo_get_lca(taxon1, taxon2);
|
||||
if (lca == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting the last common ancestor of two taxa when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
taxid_lca = lca->taxid;
|
||||
if (obi_set_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_lca_taxid_column, i, 0, taxid_lca) < 0)
|
||||
{
|
||||
obidebug(1, "\nError writing the last common ancestor of two taxa when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Clone refs view, add 2 arrays columns for lca and score, compute and write them
|
||||
|
||||
// Clone refs view
|
||||
o_view = obi_clone_view(dms, refs_view, o_view_name, NULL, o_view_comments);
|
||||
if (o_view == NULL)
|
||||
{
|
||||
obidebug(1, "\nError cloning the view of references when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Add the LCA taxid array column
|
||||
if (obi_view_add_column(o_view,
|
||||
LCA_TAXID_ARRAY_COLUMN_NAME,
|
||||
-1,
|
||||
LCA_TAXID_ARRAY_COLUMN_NAME,
|
||||
OBI_INT,
|
||||
-1,
|
||||
1,
|
||||
"",
|
||||
false,
|
||||
true,
|
||||
false,
|
||||
"",
|
||||
"",
|
||||
-1,
|
||||
"{}",
|
||||
true)
|
||||
< 0)
|
||||
{
|
||||
obidebug(1, "\nError adding the LCA taxid column to the final view when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Add the score array column
|
||||
if (obi_view_add_column(o_view,
|
||||
LCA_SCORE_ARRAY_COLUMN_NAME,
|
||||
-1,
|
||||
LCA_SCORE_ARRAY_COLUMN_NAME,
|
||||
OBI_FLOAT,
|
||||
-1,
|
||||
1,
|
||||
"",
|
||||
false,
|
||||
true,
|
||||
false,
|
||||
"",
|
||||
"",
|
||||
-1,
|
||||
"{}",
|
||||
true)
|
||||
< 0)
|
||||
{
|
||||
obidebug(1, "\nError adding the score column to the final view when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Open the newly added columns
|
||||
final_lca_taxid_a_column = obi_view_get_column(o_view, LCA_TAXID_ARRAY_COLUMN_NAME);
|
||||
if (final_lca_taxid_a_column == NULL)
|
||||
{
|
||||
obidebug(1, "\nError opening the LCA taxid array column when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
final_score_a_column = obi_view_get_column(o_view, LCA_SCORE_ARRAY_COLUMN_NAME);
|
||||
if (final_score_a_column == NULL)
|
||||
{
|
||||
obidebug(1, "\nError opening the score array column when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Open alignment score column
|
||||
matrix_score_column = obi_view_get_column(matrix_with_lca_view, SCORE_COLUMN_NAME);
|
||||
if (matrix_score_column == NULL)
|
||||
{
|
||||
obidebug(1, "\nError opening the alignment score column when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// For each sequence, look for all its alignments in the matrix, and for each different LCA taxid/score, order them and write them
|
||||
// Going through matrix once, filling refs arrays on the go for efficiency
|
||||
for (i=0; i<(matrix_with_lca_view->infos)->line_count; i++)
|
||||
{
|
||||
// Read ref line indexes
|
||||
idx1 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx1_column, i, 0);
|
||||
idx2 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx2_column, i, 0);
|
||||
// Read LCA taxid
|
||||
taxid_lca = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_lca_taxid_column, i, 0);
|
||||
// Get LCA taxon
|
||||
lca = obi_taxo_get_taxon_with_taxid(tax, taxid_lca);
|
||||
if (lca == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting a LCA from taxid when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
// Read alignment score
|
||||
score = obi_get_float_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_score_column, i, 0);
|
||||
|
||||
///////////////// Compute for first sequence \\\\\\\\\\\\\\\\\\\\\\\
|
||||
|
||||
// Read arrays
|
||||
lca_taxid_array = (obiint_t*) obi_column_get_array(final_lca_taxid_a_column, idx1, &taxid_array_length);
|
||||
score_array = (obifloat_t*) obi_column_get_array(final_score_a_column, idx1, &score_array_length);
|
||||
|
||||
taxid_array_writable_length = taxid_array_length;
|
||||
score_array_writable_length = score_array_length;
|
||||
|
||||
// Check that lengths are equal (TODO eventually remove?)
|
||||
if (taxid_array_length != score_array_length)
|
||||
{
|
||||
obidebug(1, "\nError building a reference database: LCA taxid and score arrays' lengths are not equal (%d and %d)", taxid_array_length, score_array_length);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// If empty, add values
|
||||
if (taxid_array_length == 0)
|
||||
{
|
||||
if (obi_column_set_array(final_lca_taxid_a_column, idx1, &taxid_lca, (uint8_t) obi_sizeof(OBI_INT), 1) < 0)
|
||||
{
|
||||
obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
if (obi_column_set_array(final_score_a_column, idx1, &score, (uint8_t) obi_sizeof(OBI_FLOAT), 1) < 0)
|
||||
{
|
||||
obidebug(1, "\nError setting a score array in a column when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
j = 0;
|
||||
modified = false;
|
||||
while (j < taxid_array_length)
|
||||
{
|
||||
if (taxid_lca == lca_taxid_array[j]) // Same LCA taxid: replace if the similarity score is greater
|
||||
{
|
||||
if (score > score_array[j])
|
||||
{
|
||||
// Copy in array to make writable
|
||||
memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t));
|
||||
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
|
||||
modified = true;
|
||||
|
||||
// Better score for the same LCA, replace this LCA/score pair
|
||||
lca_taxid_array_writable[j] = taxid_lca;
|
||||
score_array_writable[j] = score;
|
||||
|
||||
// Remove the previous (children) LCAs from the array if their score is equal or lower
|
||||
while ((j>0) && (score_array_writable[j-1] <= score))
|
||||
{
|
||||
for (k=j-1; k<taxid_array_writable_length-1; k++)
|
||||
{
|
||||
lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1];
|
||||
score_array_writable[k] = score_array_writable[k+1];
|
||||
}
|
||||
taxid_array_writable_length--;
|
||||
score_array_writable_length--;
|
||||
j--;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
else if (obi_taxo_is_taxon_under_taxid(lca, lca_taxid_array[j])) // Array LCA is a parent LCA
|
||||
{
|
||||
if (score > score_array[j])
|
||||
{
|
||||
memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t));
|
||||
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
|
||||
modified = true;
|
||||
|
||||
// Insert new LCA/score pair
|
||||
for (k=taxid_array_writable_length; k>=j+1; k--)
|
||||
{
|
||||
lca_taxid_array_writable[k] = lca_taxid_array_writable[k-1];
|
||||
score_array_writable[k] = score_array_writable[k-1];
|
||||
}
|
||||
|
||||
taxid_array_writable_length++;
|
||||
score_array_writable_length++;
|
||||
|
||||
lca_taxid_array_writable[j] = taxid_lca;
|
||||
score_array_writable[j] = score;
|
||||
|
||||
// Remove the previous (children) LCAs from the array if their score is equal or lower
|
||||
while ((j>0) && (score_array_writable[j-1] <= score))
|
||||
{
|
||||
for (k=j-1; k<taxid_array_writable_length-1; k++)
|
||||
{
|
||||
lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1];
|
||||
score_array_writable[k] = score_array_writable[k+1];
|
||||
}
|
||||
taxid_array_writable_length--;
|
||||
score_array_writable_length--;
|
||||
j--;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
j++;
|
||||
}
|
||||
|
||||
if (j == taxid_array_length) // same or parent LCA not found, need to be appended at the end
|
||||
{
|
||||
memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t));
|
||||
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
|
||||
modified = true;
|
||||
|
||||
// Append LCA
|
||||
lca_taxid_array_writable[taxid_array_writable_length] = taxid_lca;
|
||||
score_array_writable[score_array_writable_length] = score;
|
||||
|
||||
// Remove the previous (children) LCAs from the array if their score is equal or lower
|
||||
while ((j>0) && (score_array_writable[j-1] <= score))
|
||||
{
|
||||
for (k=j-1; k<taxid_array_writable_length-1; k++)
|
||||
{
|
||||
lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1];
|
||||
score_array_writable[k] = score_array_writable[k+1];
|
||||
}
|
||||
taxid_array_writable_length--;
|
||||
score_array_writable_length--;
|
||||
j--;
|
||||
}
|
||||
}
|
||||
|
||||
// Write new arrays
|
||||
if (modified)
|
||||
{
|
||||
if (obi_column_set_array(final_lca_taxid_a_column, idx1, lca_taxid_array_writable, (uint8_t) obi_sizeof(OBI_INT), taxid_array_writable_length) < 0)
|
||||
{
|
||||
obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
if (obi_column_set_array(final_score_a_column, idx1, score_array_writable, (uint8_t) obi_sizeof(OBI_FLOAT), score_array_writable_length) < 0)
|
||||
{
|
||||
obidebug(1, "\nError setting a score array in a column when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///////////////// Compute for second sequence \\\\\\\\\\\\\\\\\\\\\\\
|
||||
|
||||
// Read arrays
|
||||
lca_taxid_array = (obiint_t*) obi_column_get_array(final_lca_taxid_a_column, idx2, &taxid_array_length);
|
||||
score_array = (obifloat_t*) obi_column_get_array(final_score_a_column, idx2, &score_array_length);
|
||||
|
||||
taxid_array_writable_length = taxid_array_length;
|
||||
score_array_writable_length = score_array_length;
|
||||
|
||||
// Check that lengths are equal (TODO eventually remove?)
|
||||
if (taxid_array_length != score_array_length)
|
||||
{
|
||||
obidebug(1, "\nError building a reference database: LCA taxid and score arrays' lengths are not equal (%d and %d)", taxid_array_length, score_array_length);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// If empty, add values
|
||||
if (taxid_array_length == 0)
|
||||
{
|
||||
if (obi_column_set_array(final_lca_taxid_a_column, idx2, &taxid_lca, (uint8_t) obi_sizeof(OBI_INT), 1) < 0)
|
||||
{
|
||||
obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
if (obi_column_set_array(final_score_a_column, idx2, &score, (uint8_t) obi_sizeof(OBI_FLOAT), 1) < 0)
|
||||
{
|
||||
obidebug(1, "\nError setting a score array in a column when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
j = 0;
|
||||
modified = false;
|
||||
while (j < taxid_array_length)
|
||||
{
|
||||
if (taxid_lca == lca_taxid_array[j]) // Same LCA taxid: replace if the similarity score is greater
|
||||
{
|
||||
if (score > score_array[j])
|
||||
{
|
||||
// Copy in array to make writable
|
||||
memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t));
|
||||
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
|
||||
modified = true;
|
||||
|
||||
// Better score for the same LCA, replace this LCA/score pair
|
||||
lca_taxid_array_writable[j] = taxid_lca;
|
||||
score_array_writable[j] = score;
|
||||
|
||||
// Remove the previous (children) LCAs from the array if their score is equal or lower
|
||||
while ((j>0) && (score_array_writable[j-1] <= score))
|
||||
{
|
||||
for (k=j-1; k<taxid_array_writable_length-1; k++)
|
||||
{
|
||||
lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1];
|
||||
score_array_writable[k] = score_array_writable[k+1];
|
||||
}
|
||||
taxid_array_writable_length--;
|
||||
score_array_writable_length--;
|
||||
j--;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
else if (obi_taxo_is_taxon_under_taxid(lca, lca_taxid_array[j])) // Array LCA is a parent LCA
|
||||
{
|
||||
if (score > score_array[j])
|
||||
{
|
||||
memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t));
|
||||
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
|
||||
modified = true;
|
||||
|
||||
// Insert new LCA/score pair
|
||||
for (k=taxid_array_writable_length; k>=j+1; k--)
|
||||
{
|
||||
lca_taxid_array_writable[k] = lca_taxid_array_writable[k-1];
|
||||
score_array_writable[k] = score_array_writable[k-1];
|
||||
}
|
||||
|
||||
taxid_array_writable_length++;
|
||||
score_array_writable_length++;
|
||||
|
||||
lca_taxid_array_writable[j] = taxid_lca;
|
||||
score_array_writable[j] = score;
|
||||
|
||||
// Remove the previous (children) LCAs from the array if their score is equal or lower
|
||||
while ((j>0) && (score_array_writable[j-1] <= score))
|
||||
{
|
||||
for (k=j-1; k<taxid_array_writable_length-1; k++)
|
||||
{
|
||||
lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1];
|
||||
score_array_writable[k] = score_array_writable[k+1];
|
||||
}
|
||||
taxid_array_writable_length--;
|
||||
score_array_writable_length--;
|
||||
j--;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
j++;
|
||||
}
|
||||
|
||||
if (j == taxid_array_length) // same or parent LCA not found, need to be appended at the end
|
||||
{
|
||||
memcpy(lca_taxid_array_writable, lca_taxid_array, taxid_array_length*sizeof(obiint_t));
|
||||
memcpy(score_array_writable, score_array, score_array_length*sizeof(obifloat_t));
|
||||
modified = true;
|
||||
|
||||
// Append LCA
|
||||
lca_taxid_array_writable[taxid_array_writable_length] = taxid_lca;
|
||||
score_array_writable[score_array_writable_length] = score;
|
||||
|
||||
// Remove the previous (children) LCAs from the array if their score is equal or lower
|
||||
while ((j>0) && (score_array_writable[j-1] <= score))
|
||||
{
|
||||
for (k=j-1; k<taxid_array_writable_length-1; k++)
|
||||
{
|
||||
lca_taxid_array_writable[k] = lca_taxid_array_writable[k+1];
|
||||
score_array_writable[k] = score_array_writable[k+1];
|
||||
}
|
||||
taxid_array_writable_length--;
|
||||
score_array_writable_length--;
|
||||
j--;
|
||||
}
|
||||
}
|
||||
|
||||
// Write new arrays
|
||||
// Copy arrays for modification (can't edit in place, as it is stored in indexer data file)
|
||||
if (modified)
|
||||
{
|
||||
if (obi_column_set_array(final_lca_taxid_a_column, idx2, lca_taxid_array_writable, (uint8_t) obi_sizeof(OBI_INT), taxid_array_writable_length) < 0)
|
||||
{
|
||||
obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
if (obi_column_set_array(final_score_a_column, idx2, score_array_writable, (uint8_t) obi_sizeof(OBI_FLOAT), score_array_writable_length) < 0)
|
||||
{
|
||||
obidebug(1, "\nError setting a score array in a column when building a reference database");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Close views and DMS
|
||||
if (obi_save_and_close_view(refs_view) < 0)
|
||||
{
|
||||
obidebug(1, "\nError closing the reference view after building a reference database");
|
||||
return -1;
|
||||
}
|
||||
if (obi_save_and_close_view(matrix_with_lca_view) < 0)
|
||||
{
|
||||
obidebug(1, "\nError closing the matrix with LCA view after building a reference database");
|
||||
return -1;
|
||||
}
|
||||
if (obi_save_and_close_view(o_view) < 0)
|
||||
{
|
||||
obidebug(1, "\nError closing the final view after building a reference database");
|
||||
return -1;
|
||||
}
|
||||
if (obi_close_dms(dms, false) < 0)
|
||||
{
|
||||
obidebug(1, "\nError closing the DMS after building a reference database");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Free everything
|
||||
free(matrix_view_name);
|
||||
free(matrix_with_lca_view_name);
|
||||
|
||||
fprintf(stderr,"\rDone : 100 %% ");
|
||||
return 0;
|
||||
}
|
||||
|
56
src/build_reference_db.h
Executable file
56
src/build_reference_db.h
Executable file
@ -0,0 +1,56 @@
|
||||
/****************************************************************************************************
|
||||
* Header file for functions to build reference databases for the taxonomic assignment of sequences *
|
||||
****************************************************************************************************/
|
||||
|
||||
/**
|
||||
* @file build_reference_db.h
|
||||
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||
* @date November 15th 2018
|
||||
* @brief Header file for the functions for the taxonomic assignment of sequences.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef BUILD_REFERENCE_DB_H_
|
||||
#define BUILD_REFERENCE_DB_H_
|
||||
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
|
||||
#define LCA_TAXID_COLUMN_NAME "LCA_TAXID"
|
||||
#define LCA_TAXID_ARRAY_COLUMN_NAME "LCA_TAXID"
|
||||
#define LCA_SCORE_ARRAY_COLUMN_NAME "LCA_SCORE"
|
||||
|
||||
|
||||
/**
|
||||
* @brief Building of reference databases for the taxonomic assignment of sequences.
|
||||
*
|
||||
* Note: The columns where the results are written are automatically named and created.
|
||||
*
|
||||
* @param dms_name The name of the DMS.
|
||||
* @param refs_view_name The name of the view containing the reference sequences annotated with their taxids.
|
||||
* @param taxonomy_name The name of the taxonomy stored in the DMS.
|
||||
* @param o_view_name The name of the final reference database to create.
|
||||
* @param o_view_comments The comments to associate with the final reference database to create.
|
||||
* @param threshold The threshold (similarity score in identity percentage) at which the database should be created.
|
||||
* The output database contains the Lowest Common Ancestor and associated highest similarity score for each
|
||||
* sequence with the sequences with a similarity equal or greater than this threshold.
|
||||
*
|
||||
* @returns A value indicating the success of the operation.
|
||||
* @retval 0 if the operation was successfully completed.
|
||||
* @retval -1 if an error occurred.
|
||||
*
|
||||
* @since November 2018
|
||||
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||
*/
|
||||
int build_reference_db(const char* dms_name,
|
||||
const char* refs_view_name,
|
||||
const char* taxonomy_name,
|
||||
const char* o_view_name,
|
||||
const char* o_view_comments,
|
||||
double threshold);
|
||||
|
||||
|
||||
#endif /* BUILD_REFERENCE_DB_H_ */
|
||||
|
Reference in New Issue
Block a user