Files
obitools3/src/build_reference_db.c
2021-11-11 15:26:09 +13:00

929 lines
30 KiB
C
Executable File

/***********************************************************************************
* Functions to build reference databases for the taxonomic assignment of sequences *
***********************************************************************************/
/**
* @file build_reference_db.c
* @author Celine Mercier (celine.mercier@metabarcoding.org)
* @date November 15th 2018
* @brief Functions to build reference databases for the taxonomic assignment of sequences.
*/
//#define OMP_SUPPORT // TODO
#ifdef OMP_SUPPORT
#include <omp.h>
#endif
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <search.h>
#include <sys/time.h>
#include "build_reference_db.h"
#include "obidms.h"
#include "obidebug.h"
#include "obierrno.h"
#include "obisig.h"
#include "obitypes.h"
#include "obiview.h"
#include "obi_lcs.h"
#include "obidms_taxonomy.h"
#include "obidmscolumn_array.h"
#include "libjson/json_utils.h"
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
/**************************************************************************
*
* D E C L A R A T I O N O F T H E P R I V A T E F U N C T I O N S
*
**************************************************************************/
/**
* Internal function computing the last common ancestor (LCA) of a list of merged taxids (generated by obi uniq).
* Also works on a single, non-merged taxid (returns the associated taxon).
*
* Every element of the taxid column is read for the given line; elements whose value is NA are skipped.
* In merged mode the taxid of an element is parsed from its element name (stored as a string in the column header),
* otherwise the value read from the column is itself the taxid.
*
* @param view A pointer on the view containing the taxid information (merged or not).
* @param taxid_column A pointer on the column where taxids are kept (merged or not, aka int array or int columns).
* @param idx The index of the sequence to compute the LCA of in the view.
* @param merged Whether the taxid information is made of arrays of taxids or a single taxid.
* @param tax A pointer on a taxonomy structure.
* @param taxid_count The maximum number of taxids associated with one sequence (aka the number of elements in taxid_column).
* @param taxid_str_indices A pointer on the list of offsets, one per element of taxid_column, giving where the
*                          corresponding taxid string starts in taxid_strings (see next param). Only read if merged is true.
* @param taxid_strings A pointer on the list of taxids stored as strings in the taxid column header (they correspond to the element names).
*                      Only read if merged is true.
*
* @returns A pointer on the LCA taxon.
* @retval NULL if an error occurred (unknown taxid, or LCA computation failure),
*              or if no taxid is associated with the sequence (all the elements read are NA).
*
* @since August 2019
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
static inline ecotx_t* get_lca_from_merged_taxids(Obiview_p view, OBIDMS_column_p taxid_column, index_t idx, bool merged,
OBIDMS_taxonomy_p tax, index_t taxid_count, int64_t* taxid_str_indices, char* taxid_strings);
/************************************************************************
*
* D E F I N I T I O N O F T H E P R I V A T E F U N C T I O N S
*
************************************************************************/
/*
 * Computes the last common ancestor (LCA) of all the taxids associated with line `idx` of `view`.
 *
 * Each of the `taxid_count` elements of `taxid_column` is read for that line, and elements equal to
 * OBIInt_NA are skipped. When `merged` is true, the taxid is parsed from the element name found at
 * taxid_strings + taxid_str_indices[element]; otherwise the value read is the taxid itself.
 *
 * Returns the LCA taxon, or NULL if a taxid is unknown to the taxonomy, if an LCA can not be computed,
 * or if no element is set for that line (in which case no message is logged here).
 */
static inline ecotx_t* get_lca_from_merged_taxids(Obiview_p view, OBIDMS_column_p taxid_column, index_t idx, bool merged,
		OBIDMS_taxonomy_p tax, index_t taxid_count, int64_t* taxid_str_indices, char* taxid_strings)
{
	ecotx_t*  taxon = NULL;
	ecotx_t*  lca = NULL;
	ecotx_t*  previous_lca = NULL;
	int32_t   taxid;
	index_t   taxid_idx;
	obiint_t  n;

	for (taxid_idx=0; taxid_idx<taxid_count; taxid_idx++)	// get lca of all taxids associated with the sequence
	{
		n = obi_get_int_with_elt_idx_and_col_p_in_view(view, taxid_column, idx, taxid_idx);
		if (n == OBIInt_NA)		// The taxid of this element is not associated with this sequence
			continue;

		if (merged)
			// The taxid is the element name, stored as a string in the column header
			taxid = atoi(taxid_strings + taxid_str_indices[taxid_idx]);
		else
			taxid = n;

		taxon = obi_taxo_get_taxon_with_taxid(tax, taxid);
		if (taxon == NULL)
		{
			// idx is cast because %lld requires a long long argument, whatever index_t is defined as
			obidebug(1, "\nError getting a taxon with taxid %d when building a reference database, seq %lld in refs view", taxid, (long long) idx);
			return NULL;
		}

		if (lca == NULL)
		{
			// First taxid found: it is its own LCA so far
			lca = taxon;
			continue;
		}

		// Compute LCA (the previous one is kept for the error message only)
		previous_lca = lca;
		lca = obi_taxo_get_lca(taxon, lca);
		if (lca == NULL)
		{
			obidebug(1, "\nError getting the last common ancestor of two taxa when building a reference database, %d %d", taxid, previous_lca->taxid);
			return NULL;
		}
	}

	return lca;
}
/**********************************************************************
*
* D E F I N I T I O N O F T H E P U B L I C F U N C T I O N S
*
**********************************************************************/
/* Longest LCA taxid/score array (per reference sequence) that can be edited by refdb_update_lca_arrays(). */
#define REFDB_MAX_LCA_ARRAY_LENGTH (1000)


/*
 * Returns a newly allocated string made of `base` followed by `suffix`, or NULL if the allocation fails.
 * The caller owns the returned string and must free() it.
 */
static char* refdb_suffixed_name(const char* base, const char* suffix)
{
	size_t base_length   = strlen(base);
	size_t suffix_length = strlen(suffix);
	char*  name;

	name = calloc(base_length + suffix_length + 1, sizeof(char));
	if (name == NULL)
		return NULL;

	// calloc zero-fills the buffer, so the string is already terminated
	memcpy(name, base, base_length);
	memcpy(name + base_length, suffix, suffix_length);

	return name;
}


/*
 * Removes, from the parallel arrays `taxids` and `scores`, the pairs located right before position `pos`
 * as long as their score is lower than or equal to `score` (the original code refers to them as the
 * previous (children) LCAs).
 *
 * `pos` must be a valid position (pos < length): it is the position of the pair that was just written.
 * Returns the new length of the arrays.
 */
static int32_t refdb_drop_lower_scored_children(obiint_t* taxids, obifloat_t* scores, int32_t length, index_t pos, obifloat_t score)
{
	index_t k;

	while ((pos > 0) && (scores[pos-1] <= score))
	{
		// Shift everything from pos one step to the left, overwriting the pair at pos-1
		for (k=pos-1; k<length-1; k++)
		{
			taxids[k] = taxids[k+1];
			scores[k] = scores[k+1];
		}
		length--;
		pos--;		// the pair that was just written moved one step to the left too
	}

	return length;
}


/*
 * Adds the pair (taxid_lca, score) to the LCA taxid and score arrays stored at line `idx` of the output view.
 *
 * - If the arrays are empty, they are both set to that single pair.
 * - Otherwise the taxid array is scanned for the first entry that is either the same taxid, or a taxid
 *   that `lca` is under according to obi_taxo_is_taxon_under_taxid() (a parent LCA):
 *     - same taxid with a strictly lower stored score: the pair is replaced;
 *     - parent with a strictly lower stored score: the new pair is inserted right before it;
 *     - in both cases, if the stored score is not lower, nothing is changed;
 *     - no such entry: the new pair is appended at the end.
 *   Whenever a pair is written, the pairs right before it with a lower or equal score are removed.
 *
 * The stored arrays are copied before being edited (they can't be edited in place).
 *
 * Returns 0 on success, -1 if the stored arrays are inconsistent or too long, or if writing fails.
 */
static int refdb_update_lca_arrays(Obiview_p o_view, OBIDMS_column_p lca_taxid_a_column, OBIDMS_column_p score_a_column,
		index_t idx, ecotx_t* lca, obiint_t taxid_lca, obifloat_t score)
{
	obiint_t*   stored_taxids;
	obifloat_t* stored_scores;
	int32_t     stored_length;
	int32_t     stored_score_length;
	int32_t     length;
	obiint_t    taxids[REFDB_MAX_LCA_ARRAY_LENGTH+1];	// +1: room for one inserted or appended pair
	obifloat_t  scores[REFDB_MAX_LCA_ARRAY_LENGTH+1];
	index_t     j, k;
	bool        same_taxid = false;

	// Read arrays
	stored_taxids = (obiint_t*) obi_get_array_with_col_p_in_view(o_view, lca_taxid_a_column, idx, &stored_length);
	stored_scores = (obifloat_t*) obi_get_array_with_col_p_in_view(o_view, score_a_column, idx, &stored_score_length);

	// If empty, add values
	if (stored_length == 0)
	{
		if (obi_set_array_with_col_p_in_view(o_view, lca_taxid_a_column, idx, &taxid_lca, (uint8_t) (obi_sizeof(OBI_INT) * 8), 1) < 0)
		{
			obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database");
			return -1;
		}
		if (obi_set_array_with_col_p_in_view(o_view, score_a_column, idx, &score, (uint8_t) (obi_sizeof(OBI_FLOAT) * 8), 1) < 0)
		{
			obidebug(1, "\nError setting a score array in a column when building a reference database");
			return -1;
		}
		return 0;
	}

	// NOTE(review): a negative length leaves the line untouched, as it always did;
	// it presumably signals a read error - confirm whether it should be reported instead.
	if (stored_length < 0)
		return 0;

	// Both arrays are indexed with the same positions below, so they must have the same length
	if (stored_length != stored_score_length)
	{
		obidebug(1, "\nError building a reference database: LCA taxid and score arrays' lengths are not equal (%d and %d)", stored_length, stored_score_length);
		return -1;
	}

	// The working copies live in fixed size buffers
	if (stored_length > REFDB_MAX_LCA_ARRAY_LENGTH)
	{
		obidebug(1, "\nError building a reference database: too many LCA taxids associated with a sequence (%d, maximum is %d)", stored_length, REFDB_MAX_LCA_ARRAY_LENGTH);
		return -1;
	}

	// Copy in arrays to make writable
	memcpy(taxids, stored_taxids, stored_length*sizeof(obiint_t));
	memcpy(scores, stored_scores, stored_length*sizeof(obifloat_t));
	length = stored_length;

	// Look for the first entry with the same LCA taxid, or with a parent LCA
	for (j=0; j<stored_length; j++)
	{
		same_taxid = (taxid_lca == taxids[j]);
		if (same_taxid || obi_taxo_is_taxon_under_taxid(lca, taxids[j]))
			break;
	}

	if (j < stored_length)	// same or parent LCA found at position j
	{
		// Only a strictly greater similarity score changes anything
		if (!(score > scores[j]))
			return 0;

		if (!same_taxid)
		{
			// Parent LCA: make room at position j to insert the new LCA/score pair before it
			for (k=length; k>j; k--)
			{
				taxids[k] = taxids[k-1];
				scores[k] = scores[k-1];
			}
			length++;
		}
		// else same LCA taxid: the pair at position j is simply replaced
	}
	else	// same or parent LCA not found, need to be appended at the end (j == stored_length)
		length++;

	taxids[j] = taxid_lca;
	scores[j] = score;

	// Remove the previous (children) LCAs from the array if their score is equal or lower
	length = refdb_drop_lower_scored_children(taxids, scores, length, j, score);

	// Write new arrays
	if (obi_set_array_with_col_p_in_view(o_view, lca_taxid_a_column, idx, taxids, (uint8_t) (obi_sizeof(OBI_INT) * 8), length) < 0)
	{
		obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database");
		return -1;
	}
	if (obi_set_array_with_col_p_in_view(o_view, score_a_column, idx, scores, (uint8_t) (obi_sizeof(OBI_FLOAT) * 8), length) < 0)
	{
		obidebug(1, "\nError setting a score array in a column when building a reference database");
		return -1;
	}

	return 0;
}


/*
 * Builds a reference database view from a view of reference sequences.
 *
 * Steps:
 *   1. obi_lcs_align_one_column() is run on the reference view, writing a temporary "<o_view_name>_matrix" view.
 *   2. That view is cloned to "<o_view_name>_matrix_with_lca" and, for each of its lines (a pair of
 *      reference line indexes), the LCA of the taxids of both sequences is computed and stored.
 *   3. The reference view is cloned to `o_view_name` with two array columns (LCA taxids and scores), filled
 *      for both sequences of each pair by refdb_update_lca_arrays().
 *   4. Lines left without any LCA get the LCA of their own taxid(s) with a score of 1.0.
 *   5. The threshold is stored in the comments of the output view, the views and DMS are closed and the
 *      two temporary views are deleted.
 *
 * @param dms_name Name of the DMS, given to obi_open_dms() and obi_lcs_align_one_column().
 * @param refs_view_name Name of the view containing the reference sequences and their taxids.
 * @param taxonomy_name Name of the taxonomy to read in the DMS.
 * @param o_view_name Name of the output view (also used as prefix of the temporary view names).
 * @param o_view_comments Comments given to the output view when it is cloned from the reference view.
 * @param threshold Threshold given to the alignment and recorded in the comments of the output view.
 *
 * @retval 0 if the operation was successfully completed.
 * @retval -1 if an error occurred, or if the process was interrupted (keep_running unset).
 *
 * NOTE(review): on error, the views, the taxonomy and the DMS that are already opened are left as they are,
 * as before; only the memory allocated by this function is released.
 */
int build_reference_db(const char* dms_name,
					   const char* refs_view_name,
					   const char* taxonomy_name,
					   const char* o_view_name,
					   const char* o_view_comments,
					   double threshold)
{
	OBIDMS_p          dms = NULL;
	OBIDMS_taxonomy_p tax = NULL;
	char*             matrix_view_name = NULL;				// TODO discuss that it must be unique
	char*             matrix_with_lca_view_name = NULL;		// TODO discuss that it must be unique
	Obiview_p         matrix_with_lca_view = NULL;
	Obiview_p         refs_view = NULL;
	Obiview_p         o_view = NULL;
	OBIDMS_column_p   matrix_idx1_column = NULL;
	OBIDMS_column_p   matrix_idx2_column = NULL;
	OBIDMS_column_p   refs_taxid_column = NULL;
	OBIDMS_column_p   matrix_lca_taxid_column = NULL;
	OBIDMS_column_p   matrix_score_column = NULL;
	OBIDMS_column_p   final_lca_taxid_a_column = NULL;
	OBIDMS_column_p   final_score_a_column = NULL;
	char*             taxid_strings;
	int64_t*          taxid_str_indices;
	index_t           taxid_count;
	obiint_t          taxid_lca;
	ecotx_t*          lca_1 = NULL;
	ecotx_t*          lca_2 = NULL;
	ecotx_t*          lca = NULL;
	index_t           idx1, idx2;
	index_t           i, count;
	int32_t           taxid_array_length;
	obifloat_t        score;
	bool              merged;
	char              threshold_str[5];
	char*             new_comments = NULL;
	int               rc = -1;		// returned value, only set to 0 once everything succeeded

	signal(SIGINT, sig_handler);

	// Discuss keeping the matrix view or not
	// TODO check and create view name that doesn't already exist
	matrix_view_name = refdb_suffixed_name(o_view_name, "_matrix");
	if (matrix_view_name == NULL)
	{
		obidebug(1, "\nError allocating memory for the name of the matrix view when building a reference database");
		goto cleanup;
	}

	fprintf(stderr, "Aligning sequences...\n");
	if (obi_lcs_align_one_column(dms_name,
								 refs_view_name,
								 "",
								 "",
								 "",
								 matrix_view_name,
								 "{}",
								 false, false,
								 threshold, true, 0, true,
								 1) < 0)
	{
		obidebug(1, "\nError aligning the sequences when building a reference database");
		goto cleanup;	// nothing can be built without the alignment matrix
	}

	// Add a column to the matrix view for LCAs

	// Clone the view with a new name

	// Build the view name
	matrix_with_lca_view_name = refdb_suffixed_name(o_view_name, "_matrix_with_lca");
	if (matrix_with_lca_view_name == NULL)
	{
		obidebug(1, "\nError allocating memory for the name of the matrix view with LCA when building a reference database");
		goto cleanup;
	}

	// Clone the matrix view

	// Open the DMS
	dms = obi_open_dms(dms_name, false);
	if (dms == NULL)
	{
		obidebug(1, "\nError opening the DMS when building a reference database");
		goto cleanup;
	}

	// Clone the view
	matrix_with_lca_view = obi_clone_view_from_name(dms, matrix_view_name, matrix_with_lca_view_name, NULL, "{}");
	if (matrix_with_lca_view == NULL)
	{
		obidebug(1, "\nError creating the matrix with LCA view when building a reference database");
		goto cleanup;
	}

	// Add the LCA taxid column
	if (obi_view_add_column(matrix_with_lca_view,
							LCA_TAXID_COLUMN_NAME,
							-1,
							LCA_TAXID_COLUMN_NAME,
							OBI_INT,
							-1,
							1,
							"",
							false,
							false,
							false,
							false,
							"",
							"",
							-1,
							"{}",
							true)
			< 0)
	{
		obidebug(1, "\nError adding the LCA column to the matrix with LCA view when building a reference database");
		goto cleanup;
	}

	// Open the taxonomy
	tax = obi_read_taxonomy(dms, taxonomy_name, true);
	if (tax == NULL)
	{
		obidebug(1, "\nError reading the taxonomy when building a reference database");
		goto cleanup;
	}

	// Open the reference sequences view
	refs_view = obi_open_view(dms, refs_view_name);
	if (refs_view == NULL)
	{
		obidebug(1, "\nError opening the reference sequences view when building a reference database");
		goto cleanup;
	}

	// Save column pointers
	matrix_idx1_column = obi_view_get_column(matrix_with_lca_view, IDX1_COLUMN_NAME);
	if (matrix_idx1_column == NULL)
	{
		obidebug(1, "\nError opening the first index column when building a reference database");
		goto cleanup;
	}
	matrix_idx2_column = obi_view_get_column(matrix_with_lca_view, IDX2_COLUMN_NAME);
	if (matrix_idx2_column == NULL)
	{
		obidebug(1, "\nError opening the second index column when building a reference database");
		goto cleanup;
	}

	// The merged taxid column is used when it exists, the simple taxid column otherwise
	merged = obi_view_column_exists(refs_view, MERGED_TAXID_COLUMN);
	if (merged)
		refs_taxid_column = obi_view_get_column(refs_view, MERGED_TAXID_COLUMN);
	else
		refs_taxid_column = obi_view_get_column(refs_view, TAXID_COLUMN);
	if (refs_taxid_column == NULL)
	{
		obidebug(1, "\nError opening the taxid column when building a reference database");
		goto cleanup;
	}

	// Check that the refs taxid column doesn't contain character strings to parse
	// (happens with obi uniq when there is a lot of elements per line, see options to set the limit,
	// parsing of those strings in C not implemented yet)
	if ((refs_taxid_column->header)->to_eval)
	{
		obidebug(1, "\nError opening the column containing the taxids of the reference sequences when building a reference database: "
				    "run previous obi uniq with a higher threshold for the --max-elts option while waiting for the implementation of this feature");
		goto cleanup;
	}

	// Get the (maximum) number of taxids associated with a sequence
	taxid_count = (refs_taxid_column->header)->nb_elements_per_line;

	// Get pointers on element names (aka taxids) and their start indices for faster access
	taxid_strings = (refs_taxid_column->header)->elements_names;
	taxid_str_indices = (refs_taxid_column->header)->elements_names_idx;

	matrix_lca_taxid_column = obi_view_get_column(matrix_with_lca_view, LCA_TAXID_COLUMN_NAME);
	if (matrix_lca_taxid_column == NULL)
	{
		obidebug(1, "\nError opening the LCA column when building a reference database");
		goto cleanup;
	}

	count = (matrix_with_lca_view->infos)->line_count;

	fprintf(stderr, "Computing LCAs...\n");

	// Compute all the LCAs
	// For each pair
	for (i=0; i<count; i++)
	{
		if (! keep_running)
			goto cleanup;

		if (i%1000 == 0)
			fprintf(stderr,"\rDone : %f %% ", (i / (float) count)*100);

		// Read all taxids associated with the first sequence and compute their LCA
		// Read line index
		idx1 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx1_column, i, 0);
		lca_1 = get_lca_from_merged_taxids(refs_view, refs_taxid_column, idx1, merged,
										   tax, taxid_count, taxid_str_indices, taxid_strings);
		if (lca_1 == NULL)
		{
			obidebug(1, "\nError getting the last common ancestor of merged taxids when building a reference database");
			goto cleanup;
		}

		// Read all taxids associated with the second sequence and compute their LCA
		// Read line index
		idx2 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx2_column, i, 0);
		lca_2 = get_lca_from_merged_taxids(refs_view, refs_taxid_column, idx2, merged,
										   tax, taxid_count, taxid_str_indices, taxid_strings);
		if (lca_2 == NULL)
		{
			obidebug(1, "\nError getting the last common ancestor of merged taxids when building a reference database");
			goto cleanup;
		}

		// Compute and write their LCA
		lca = obi_taxo_get_lca(lca_1, lca_2);
		if (lca == NULL)
		{
			obidebug(1, "\nError getting the last common ancestor of two taxa when building a reference database");
			goto cleanup;
		}

		taxid_lca = lca->taxid;
		if (obi_set_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_lca_taxid_column, i, 0, taxid_lca) < 0)
		{
			obidebug(1, "\nError writing the last common ancestor of two taxa when building a reference database");
			goto cleanup;
		}
	}
	fprintf(stderr,"\rDone : 100 %% \n");

	// Clone refs view, add 2 arrays columns for lca and score, compute and write them

	// Clone refs view
	o_view = obi_clone_view(dms, refs_view, o_view_name, NULL, o_view_comments);
	if (o_view == NULL)
	{
		obidebug(1, "\nError cloning the view of references when building a reference database");
		goto cleanup;
	}

	// Add the LCA taxid array column
	if (obi_view_add_column(o_view,
							LCA_TAXID_ARRAY_COLUMN_NAME,
							-1,
							LCA_TAXID_ARRAY_COLUMN_NAME,
							OBI_INT,
							-1,
							1,
							"",
							false,
							false,
							true,
							false,
							"",
							"",
							-1,
							"{}",
							true)
			< 0)
	{
		obidebug(1, "\nError adding the LCA taxid column to the final view when building a reference database");
		goto cleanup;
	}

	// Add the score array column
	if (obi_view_add_column(o_view,
							LCA_SCORE_ARRAY_COLUMN_NAME,
							-1,
							LCA_SCORE_ARRAY_COLUMN_NAME,
							OBI_FLOAT,
							-1,
							1,
							"",
							false,
							false,
							true,
							false,
							"",
							"",
							-1,
							"{}",
							true)
			< 0)
	{
		obidebug(1, "\nError adding the score column to the final view when building a reference database");
		goto cleanup;
	}

	// Open the newly added columns
	final_lca_taxid_a_column = obi_view_get_column(o_view, LCA_TAXID_ARRAY_COLUMN_NAME);
	if (final_lca_taxid_a_column == NULL)
	{
		obidebug(1, "\nError opening the LCA taxid array column when building a reference database");
		goto cleanup;
	}
	final_score_a_column = obi_view_get_column(o_view, LCA_SCORE_ARRAY_COLUMN_NAME);
	if (final_score_a_column == NULL)
	{
		obidebug(1, "\nError opening the score array column when building a reference database");
		goto cleanup;
	}

	// Open alignment score column
	matrix_score_column = obi_view_get_column(matrix_with_lca_view, SCORE_COLUMN_NAME);
	if (matrix_score_column == NULL)
	{
		obidebug(1, "\nError opening the alignment score column when building a reference database");
		goto cleanup;
	}

	fprintf(stderr, "Building LCA arrays...\n");

	// For each sequence, look for all its alignments in the matrix, and for each different LCA taxid/score, order them and write them
	// Going through matrix once, filling refs arrays on the go for efficiency
	for (i=0; i<count; i++)
	{
		if (! keep_running)
			goto cleanup;

		if (i%1000 == 0)
			fprintf(stderr,"\rDone : %f %% ", (i / (float) count)*100);

		// Read ref line indexes
		idx1 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx1_column, i, 0);
		idx2 = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_idx2_column, i, 0);

		// Read LCA taxid
		taxid_lca = obi_get_int_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_lca_taxid_column, i, 0);

		// Get LCA taxon
		lca = obi_taxo_get_taxon_with_taxid(tax, taxid_lca);
		if (lca == NULL)
		{
			obidebug(1, "\nError getting a LCA from taxid when building a reference database, taxid %d", taxid_lca);
			goto cleanup;
		}

		// Read alignment score
		score = obi_get_float_with_elt_idx_and_col_p_in_view(matrix_with_lca_view, matrix_score_column, i, 0);

		// The LCA/score pair is recorded for both sequences of the pair, first sequence first
		// (the second update reads the arrays after the first one was written)
		if (refdb_update_lca_arrays(o_view, final_lca_taxid_a_column, final_score_a_column, idx1, lca, taxid_lca, score) < 0)
			goto cleanup;
		if (refdb_update_lca_arrays(o_view, final_lca_taxid_a_column, final_score_a_column, idx2, lca, taxid_lca, score) < 0)
			goto cleanup;
	}
	fprintf(stderr,"\rDone : 100 %% \n");

	fprintf(stderr, "Writing results...\n");
	count = (o_view->infos)->line_count;

	// Fill empty LCA informations (because filling from potentially sparse alignment matrix) with the sequence taxid
	score=1.0;	// technically getting LCA of identical sequences
	for (i=0; i<count; i++)
	{
		if (i%1000 == 0)
			fprintf(stderr,"\rDone : %f %% ", (i / (float) count)*100);

		// Only the length is needed here
		obi_get_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, i, &taxid_array_length);
		if (taxid_array_length != 0)	// LCA already set
			continue;

		lca = get_lca_from_merged_taxids(refs_view, refs_taxid_column, i, merged,
										 tax, taxid_count, taxid_str_indices, taxid_strings);
		if (lca == NULL)
		{
			obidebug(1, "\nError getting the last common ancestor of merged taxids when building a reference database");
			goto cleanup;
		}
		taxid_lca = lca->taxid;

		// Set them in output view
		if (obi_set_array_with_col_p_in_view(o_view, final_lca_taxid_a_column, i, &taxid_lca, (uint8_t) (obi_sizeof(OBI_INT) * 8), 1) < 0)
		{
			obidebug(1, "\nError setting a LCA taxid array in a column when building a reference database");
			goto cleanup;
		}
		if (obi_set_array_with_col_p_in_view(o_view, final_score_a_column, i, &score, (uint8_t) (obi_sizeof(OBI_FLOAT) * 8), 1) < 0)
		{
			obidebug(1, "\nError setting a score array in a column when building a reference database");
			goto cleanup;
		}
	}
	fprintf(stderr,"\rDone : 100 %% \n");

	// Add information about the threshold used to build the DB
	// The threshold is written with "%f" in a 5 bytes buffer, so at most its 4 first characters are kept.
	// NOTE(review): the macro presumably exists to consume the result of snprintf so that this
	// truncation is not reported by the compiler - confirm that the truncation itself is wanted.
	#define snprintf_nowarn(...) (snprintf(__VA_ARGS__) < 0 ? abort() : (void)0)
	snprintf_nowarn(threshold_str, sizeof(threshold_str), "%f", threshold);

	new_comments = obi_add_comment((o_view->infos)->comments, DB_THRESHOLD_KEY_IN_COMMENTS, threshold_str);
	if (new_comments == NULL)
	{
		obidebug(1, "\nError adding a comment (db threshold) to a view, key: %s, value: %s", DB_THRESHOLD_KEY_IN_COMMENTS, threshold_str);
		goto cleanup;
	}

	if (obi_view_write_comments(o_view, new_comments) < 0)
	{
		obidebug(1, "\nError adding a comment (db threshold) to a view, key: %s, value: %s", DB_THRESHOLD_KEY_IN_COMMENTS, threshold_str);
		goto cleanup;
	}

	// Close views and DMS
	if (obi_save_and_close_view(refs_view) < 0)
	{
		obidebug(1, "\nError closing the reference view after building a reference database");
		goto cleanup;
	}
	if (obi_save_and_close_view(matrix_with_lca_view) < 0)
	{
		obidebug(1, "\nError closing the matrix with LCA view after building a reference database");
		goto cleanup;
	}
	if (obi_save_and_close_view(o_view) < 0)
	{
		obidebug(1, "\nError closing the final view after building a reference database");
		goto cleanup;
	}

	// Delete temporary views
	if (obi_delete_view(dms, matrix_view_name) < 0)
	{
		obidebug(1, "\nError deleting temporary view %s after building a reference database", matrix_view_name);
		goto cleanup;
	}
	if (obi_delete_view(dms, matrix_with_lca_view_name) < 0)
	{
		obidebug(1, "\nError deleting temporary view %s after building a reference database", matrix_with_lca_view_name);
		goto cleanup;
	}

	// Close DMS
	if (obi_close_dms(dms, false) < 0)
	{
		obidebug(1, "\nError closing the DMS after building a reference database");
		goto cleanup;
	}

	rc = 0;

cleanup:
	// Free everything allocated here, whether the build succeeded or not (free(NULL) does nothing)
	free(new_comments);
	free(matrix_view_name);
	free(matrix_with_lca_view_name);

	return rc;
}