obi clean: option to only keep heads now works, fixed a bug where last
sequence was not properly labelled, and code is cleaned, fixed and error checked
This commit is contained in:
248
src/obi_clean.c
248
src/obi_clean.c
@ -41,13 +41,40 @@
|
||||
*
|
||||
**************************************************************************/
|
||||
|
||||
// TODO
|
||||
|
||||
/**
|
||||
* Internal function creating the output columns for the obiclean algorithm.
|
||||
*
|
||||
* @param o_view A pointer on the output view.
|
||||
* @param i_view A pointer on the input view.
|
||||
* @param sample_column A pointer on the column where sample counts are kept.
|
||||
* @param sample_count The number of different samples.
|
||||
*
|
||||
* @retval 0 if the operation was successfully completed.
|
||||
* @retval -1 if an error occurred.
|
||||
*
|
||||
* @since April 2018
|
||||
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||
*/
|
||||
static int create_output_columns(Obiview_p o_view,
|
||||
Obiview_p i_view,
|
||||
OBIDMS_column_p sample_column,
|
||||
int sample_count);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Internal function comparing two indexes (int64_t).
|
||||
*
|
||||
* @param idx1 A pointer on the first index.
|
||||
* @param idx2 A pointer on the second index.
|
||||
*
|
||||
* @returns -1 if idx1 < idx2,
|
||||
* 1 if idx1 > idx2,
|
||||
* and 0 if idx1 == idx2.
|
||||
*
|
||||
* @since April 2018
|
||||
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||
*/
|
||||
static inline int idxcmp(const void* idx1, const void* idx2);
|
||||
|
||||
|
||||
@ -125,7 +152,6 @@ static inline int idxcmp(const void* idx1, const void* idx2)
|
||||
**********************************************************************/
|
||||
|
||||
|
||||
// TODO comment
|
||||
int obi_clean(const char* dms_name,
|
||||
const char* i_view_name,
|
||||
const char* sample_column_name,
|
||||
@ -136,10 +162,12 @@ int obi_clean(const char* dms_name,
|
||||
bool heads_only,
|
||||
int thread_count)
|
||||
{
|
||||
char* o_view_name_temp;
|
||||
float p;
|
||||
index_t i, j;
|
||||
index_t i, j, l;
|
||||
index_t seq_count;
|
||||
index_t* index_array;
|
||||
index_t* line_selection;
|
||||
double score;
|
||||
bool above_threshold;
|
||||
int lcs_length;
|
||||
@ -174,8 +202,6 @@ int obi_clean(const char* dms_name,
|
||||
OBIDMS_column_p singletoncount_column = NULL;
|
||||
OBIDMS_column_p samplecount_column = NULL;
|
||||
|
||||
// TODO other columns
|
||||
|
||||
void* no;
|
||||
void* yes;
|
||||
void* key_p;
|
||||
@ -184,11 +210,13 @@ int obi_clean(const char* dms_name,
|
||||
int reference = 0;
|
||||
bool similarity_mode = false;
|
||||
|
||||
char* seq1;
|
||||
char* seq2;
|
||||
|
||||
// Open DMS
|
||||
dms = obi_dms(dms_name);
|
||||
if (dms == NULL)
|
||||
{
|
||||
obidebug(1, "\nError opening the DMS");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Open input view
|
||||
i_view = obi_open_view(dms, i_view_name);
|
||||
@ -226,11 +254,25 @@ int obi_clean(const char* dms_name,
|
||||
}
|
||||
}
|
||||
|
||||
// Create the output view
|
||||
o_view = obi_clone_view(dms, i_view, o_view_name, NULL, o_view_comments);
|
||||
// Create the output view, or a temporary one if heads only
|
||||
if (heads_only)
|
||||
{
|
||||
o_view_name_temp = calloc((strlen(o_view_name)+strlen("_temp")+1), sizeof(char));
|
||||
if (o_view_name_temp == NULL)
|
||||
{
|
||||
obidebug(1, "\nError allocating memory for the name of a temporary output view in obiclean");
|
||||
return -1;
|
||||
}
|
||||
o_view_name_temp = strcpy(o_view_name_temp, o_view_name);
|
||||
strcat(o_view_name_temp, "_temp");
|
||||
o_view = obi_clone_view(dms, i_view, o_view_name_temp, NULL, o_view_comments);
|
||||
}
|
||||
else
|
||||
o_view = obi_clone_view(dms, i_view, o_view_name, NULL, o_view_comments);
|
||||
|
||||
if (o_view == NULL)
|
||||
{
|
||||
obidebug(1, "\nError creating the output view");
|
||||
obidebug(1, "\nError creating the output view in obiclean");
|
||||
return -1;
|
||||
}
|
||||
|
||||
@ -238,16 +280,48 @@ int obi_clean(const char* dms_name,
|
||||
|
||||
// Create the output columns
|
||||
if (create_output_columns(o_view, i_view, sample_column, sample_count) < 0)
|
||||
// TODO
|
||||
{
|
||||
obidebug(1, "\nError creating the output columns");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Get the created output columns
|
||||
status_column = obi_view_get_column(o_view, CLEAN_STATUS_COLUMN_NAME);
|
||||
if (status_column == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting the obiclean status column");
|
||||
return -1;
|
||||
}
|
||||
head_column = obi_view_get_column(o_view, CLEAN_HEAD_COLUMN_NAME);
|
||||
if (status_column == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting the obiclean head column");
|
||||
return -1;
|
||||
}
|
||||
internalcount_column = obi_view_get_column(o_view, CLEAN_INTERNALCOUNT_COLUMN_NAME);
|
||||
if (status_column == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting the obiclean internal count column");
|
||||
return -1;
|
||||
}
|
||||
headcount_column = obi_view_get_column(o_view, CLEAN_HEADCOUNT_COLUMN_NAME);
|
||||
if (status_column == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting the obiclean head count column");
|
||||
return -1;
|
||||
}
|
||||
singletoncount_column = obi_view_get_column(o_view, CLEAN_SINGLETONCOUNT_COLUMN_NAME);
|
||||
if (status_column == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting the obiclean singleton count column");
|
||||
return -1;
|
||||
}
|
||||
samplecount_column = obi_view_get_column(o_view, CLEAN_SAMPLECOUNT_COLUMN_NAME);
|
||||
if (status_column == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting the obiclean sample count column");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Build kmer tables
|
||||
ktable = hash_seq_column(i_view, iseq_column, 0);
|
||||
@ -262,30 +336,50 @@ int obi_clean(const char* dms_name,
|
||||
|
||||
// Allocate arrays of pointers to binary trees
|
||||
yes_trees = (void**) calloc(seq_count, sizeof(void*));
|
||||
if (yes_trees == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_MALLOC_ERROR);
|
||||
obidebug(1, "\nError allocating memory for 'yes' binary trees");
|
||||
return -1;
|
||||
}
|
||||
no_trees = (void**) calloc(seq_count, sizeof(void*));
|
||||
if (no_trees == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_MALLOC_ERROR);
|
||||
obidebug(1, "\nError allocating memory for 'no' binary trees");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Allocate and fill index array for the binary trees to reference (they don't copy the data)
|
||||
index_array = (index_t*) malloc(seq_count * sizeof(index_t));
|
||||
if (index_array == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_MALLOC_ERROR);
|
||||
obidebug(1, "\nError allocating memory for the index array");
|
||||
return -1;
|
||||
}
|
||||
for (i=0; i < seq_count; i++)
|
||||
index_array[i]=i;
|
||||
|
||||
// Initialize all sequences to singletons or NA if no sequences in that sample
|
||||
for (i=0; i < (seq_count - 1); i++)
|
||||
for (i=0; i<seq_count; i++)
|
||||
{
|
||||
for (sample=0; sample < sample_count; sample++)
|
||||
{
|
||||
if (obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, i, sample) != OBIInt_NA)
|
||||
if (obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, i, sample) != OBIInt_NA) // Only initialize samples where there are some sequences
|
||||
{
|
||||
if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample, 's') < 0)
|
||||
// TODO
|
||||
{
|
||||
obidebug(1, "\nError initializing all sequences to singletons");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (sample=0; sample < sample_count; sample++)
|
||||
{
|
||||
for (i=0; i < (seq_count - 1); i++)
|
||||
for (i=0; i< (seq_count-1); i++)
|
||||
{
|
||||
if (i%1000 == 0)
|
||||
{
|
||||
@ -301,8 +395,6 @@ int obi_clean(const char* dms_name,
|
||||
return -1;
|
||||
}
|
||||
|
||||
seq1 = obi_get_seq_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0);
|
||||
|
||||
// Get count for this sample
|
||||
s1_count = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, i, sample);
|
||||
|
||||
@ -316,8 +408,6 @@ int obi_clean(const char* dms_name,
|
||||
return -1;
|
||||
}
|
||||
|
||||
seq2 = obi_get_seq_with_elt_idx_and_col_p_in_view(i_view, iseq_column, j, 0);
|
||||
|
||||
// Get count for this sample
|
||||
s2_count = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, j, sample);
|
||||
|
||||
@ -326,6 +416,7 @@ int obi_clean(const char* dms_name,
|
||||
((((s1_count >= s2_count) && (((double) s2_count / (double) s1_count) <= max_ratio))) ||
|
||||
(((s2_count >= s1_count) && (((double) s1_count / (double) s2_count) <= max_ratio)))))
|
||||
{
|
||||
|
||||
yes=NULL;
|
||||
no=NULL;
|
||||
no = tfind(index_array+j, &(no_trees[i]), idxcmp);
|
||||
@ -333,7 +424,7 @@ int obi_clean(const char* dms_name,
|
||||
yes = tfind(index_array+j, &(yes_trees[i]), idxcmp);
|
||||
|
||||
above_threshold = false;
|
||||
if ((no == NULL) && (yes == NULL)) //never compared
|
||||
if ((no == NULL) && (yes == NULL)) // never compared before
|
||||
{
|
||||
// Check if the sequences are identical in a quick way (same index in the same indexer)
|
||||
if (obi_get_index_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0) == obi_get_index_with_elt_idx_and_col_p_in_view(i_view, iseq_column, j, 0))
|
||||
@ -354,38 +445,74 @@ int obi_clean(const char* dms_name,
|
||||
if (yes || above_threshold)
|
||||
{
|
||||
if (yes == NULL)
|
||||
// put seqs in their 'yes' trees
|
||||
key_p = tsearch(index_array+j, &(yes_trees[i]), idxcmp); // TODO error checking
|
||||
// Put in 'yes' tree of 1st sequence
|
||||
{
|
||||
key_p = tsearch(index_array+j, &(yes_trees[i]), idxcmp);
|
||||
if (key_p == NULL)
|
||||
{
|
||||
obidebug(1, "\nError adding an index in a binary tree");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// label as head or internal
|
||||
if (s1_count >= s2_count)
|
||||
{
|
||||
if (obi_get_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample) == 's') // seq can become head ONLY if it's a singleton
|
||||
obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample, 'h');
|
||||
{
|
||||
if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample, 'h') < 0)
|
||||
return -1;
|
||||
}
|
||||
// Otherwise it's an internal
|
||||
obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample, 'i');
|
||||
if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample, 'i') < 0)
|
||||
return -1;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (obi_get_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample) == 's') // seq can become head ONLY if it's a singleton
|
||||
obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample, 'h');
|
||||
obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample, 'i');
|
||||
{
|
||||
if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample, 'h') < 0)
|
||||
return -1;
|
||||
}
|
||||
if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample, 'i') < 0)
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
else if (no == NULL)
|
||||
// put in 'no' tree of both sequences // TODO why just one then???
|
||||
key_p = tsearch(index_array+j, &(no_trees[i]), idxcmp); // TODO error checking
|
||||
// Put in 'no' tree of 1st sequence
|
||||
{
|
||||
key_p = tsearch(index_array+j, &(no_trees[i]), idxcmp);
|
||||
if (key_p == NULL)
|
||||
{
|
||||
obidebug(1, "\nError adding an index in a binary tree");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
free(seq2);
|
||||
}
|
||||
free(seq1);
|
||||
}
|
||||
}
|
||||
|
||||
free_kmer_tables(ktable, seq_count);
|
||||
free(index_array);
|
||||
free(yes_trees);
|
||||
free(no_trees);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
for (i=0; i < (seq_count - 1); i++)
|
||||
if (heads_only)
|
||||
{
|
||||
line_selection = malloc((o_view->infos)->line_count * sizeof(index_t));
|
||||
if (line_selection == NULL)
|
||||
{
|
||||
obi_set_errno(OBI_MALLOC_ERROR);
|
||||
obidebug(1, "\nError allocating memory for line selection");
|
||||
return -1;
|
||||
}
|
||||
l=0;
|
||||
}
|
||||
|
||||
for (i=0; i<seq_count; i++)
|
||||
{
|
||||
if (i%1000 == 0)
|
||||
{
|
||||
@ -404,7 +531,14 @@ int obi_clean(const char* dms_name,
|
||||
// Check if head or singleton in at least one sample
|
||||
status = obi_get_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample);
|
||||
if ((!head) && ((status == 'h') || (status == 's')))
|
||||
{
|
||||
head = true;
|
||||
if (heads_only)
|
||||
{
|
||||
line_selection[l] = i;
|
||||
l++;
|
||||
}
|
||||
}
|
||||
|
||||
if (status == 's')
|
||||
{
|
||||
@ -424,23 +558,42 @@ int obi_clean(const char* dms_name,
|
||||
|
||||
}
|
||||
|
||||
if (obi_set_bool_with_elt_idx_and_col_p_in_view(o_view, head_column, i, 0, head) < 0)
|
||||
// TODO
|
||||
return -1;
|
||||
if (!heads_only || (heads_only && head)) // Label only if sequence is going to be kept in final view
|
||||
{
|
||||
if (obi_set_bool_with_elt_idx_and_col_p_in_view(o_view, head_column, i, 0, head) < 0)
|
||||
return -1;
|
||||
if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, singletoncount_column, i, 0, singleton_count) < 0)
|
||||
return -1;
|
||||
if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, internalcount_column, i, 0, internal_count) < 0)
|
||||
return -1;
|
||||
if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, headcount_column, i, 0, head_count) < 0)
|
||||
return -1;
|
||||
if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, samplecount_column, i, 0, ind_sample_count) < 0)
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, singletoncount_column, i, 0, singleton_count) < 0)
|
||||
// TODO
|
||||
return -1;
|
||||
if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, internalcount_column, i, 0, internal_count) < 0)
|
||||
// TODO
|
||||
return -1;
|
||||
if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, headcount_column, i, 0, head_count) < 0)
|
||||
// TODO
|
||||
return -1;
|
||||
if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, samplecount_column, i, 0, ind_sample_count) < 0)
|
||||
// TODO
|
||||
return -1;
|
||||
// Flag the end of the line selection
|
||||
if (heads_only)
|
||||
line_selection[l] = -1;
|
||||
|
||||
// Create new view with line selection if heads only
|
||||
if (heads_only)
|
||||
{
|
||||
if (obi_save_and_close_view(o_view) < 0)
|
||||
{
|
||||
obidebug(1, "\nError closing the temporary output view");
|
||||
return -1;
|
||||
}
|
||||
o_view = obi_clone_view_from_name(dms, o_view_name_temp, o_view_name, line_selection, o_view_comments);
|
||||
if (o_view == NULL)
|
||||
{
|
||||
obidebug(1, "\nError cloning the temporary output view to make the final output view");
|
||||
return -1;
|
||||
}
|
||||
|
||||
free(o_view_name_temp);
|
||||
free(line_selection);
|
||||
}
|
||||
|
||||
// Close views
|
||||
@ -449,6 +602,7 @@ int obi_clean(const char* dms_name,
|
||||
obidebug(1, "\nError closing the input view after aligning");
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (obi_save_and_close_view(o_view) < 0)
|
||||
{
|
||||
obidebug(1, "\nError closing the output view after aligning");
|
||||
|
Reference in New Issue
Block a user