From 1de308a856d72e89242f115fafe613bc46acde22 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Thu, 31 May 2018 15:11:41 +0200 Subject: [PATCH] obi clean: option to only keep heads now works, fixed a bug where last sequence was not properly labelled, and code is cleaned, fixed and error checked --- src/obi_clean.c | 248 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 201 insertions(+), 47 deletions(-) diff --git a/src/obi_clean.c b/src/obi_clean.c index c9ac057..c814ac0 100644 --- a/src/obi_clean.c +++ b/src/obi_clean.c @@ -41,13 +41,40 @@ * **************************************************************************/ -// TODO + +/** + * Internal function creating the output columns for the obiclean algorithm. + * + * @param o_view A pointer on the output view. + * @param i_view A pointer on the input view. + * @param sample_column A pointer on the column where sample counts are kept. + * @param sample_count The number of different samples. + * + * @retval 0 if the operation was successfully completed. + * @retval -1 if an error occurred. + * + * @since April 2018 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ static int create_output_columns(Obiview_p o_view, Obiview_p i_view, OBIDMS_column_p sample_column, int sample_count); +/** + * @brief Internal function comparing two indexes (int64_t). + * + * @param idx1 A pointer on the first index. + * @param idx2 A pointer on the second index. + * + * @returns -1 if idx1 < idx2, + * 1 if idx1 > idx2, + * and 0 if idx1 == idx2. + * + * @since April 2018 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ static inline int idxcmp(const void* idx1, const void* idx2); @@ -125,7 +152,6 @@ static inline int idxcmp(const void* idx1, const void* idx2) **********************************************************************/ -// TODO comment int obi_clean(const char* dms_name, const char* i_view_name, const char* sample_column_name, @@ -136,10 +162,12 @@ int obi_clean(const char* dms_name, bool heads_only, int thread_count) { + char* o_view_name_temp; float p; - index_t i, j; + index_t i, j, l; index_t seq_count; index_t* index_array; + index_t* line_selection; double score; bool above_threshold; int lcs_length; @@ -174,8 +202,6 @@ int obi_clean(const char* dms_name, OBIDMS_column_p singletoncount_column = NULL; OBIDMS_column_p samplecount_column = NULL; - // TODO other columns - void* no; void* yes; void* key_p; @@ -184,11 +210,13 @@ int obi_clean(const char* dms_name, int reference = 0; bool similarity_mode = false; - char* seq1; - char* seq2; - // Open DMS dms = obi_dms(dms_name); + if (dms == NULL) + { + obidebug(1, "\nError opening the DMS"); + return -1; + } // Open input view i_view = obi_open_view(dms, i_view_name); @@ -226,11 +254,25 @@ int obi_clean(const char* dms_name, } } - // Create the output view - o_view = obi_clone_view(dms, i_view, o_view_name, NULL, o_view_comments); + // Create the output view, or a temporary one if heads only + if (heads_only) + { + o_view_name_temp = calloc((strlen(o_view_name)+strlen("_temp")+1), sizeof(char)); + if (o_view_name_temp == NULL) + { + obidebug(1, "\nError allocating memory for the name of a temporary output view in obiclean"); + return -1; + } + o_view_name_temp = strcpy(o_view_name_temp, o_view_name); + strcat(o_view_name_temp, "_temp"); + o_view = obi_clone_view(dms, i_view, o_view_name_temp, NULL, o_view_comments); + } + else + o_view = obi_clone_view(dms, i_view, o_view_name, NULL, o_view_comments); + if (o_view == NULL) { - obidebug(1, "\nError creating the output view"); + obidebug(1, "\nError creating the output view in obiclean"); return -1; } @@ -238,16 +280,48 @@ int obi_clean(const char* dms_name, // Create the output columns if (create_output_columns(o_view, i_view, sample_column, sample_count) < 0) - // TODO + { + obidebug(1, "\nError creating the output columns"); return -1; + } // Get the created output columns status_column = obi_view_get_column(o_view, CLEAN_STATUS_COLUMN_NAME); + if (status_column == NULL) + { + obidebug(1, "\nError getting the obiclean status column"); + return -1; + } head_column = obi_view_get_column(o_view, CLEAN_HEAD_COLUMN_NAME); + if (status_column == NULL) + { + obidebug(1, "\nError getting the obiclean head column"); + return -1; + } internalcount_column = obi_view_get_column(o_view, CLEAN_INTERNALCOUNT_COLUMN_NAME); + if (status_column == NULL) + { + obidebug(1, "\nError getting the obiclean internal count column"); + return -1; + } headcount_column = obi_view_get_column(o_view, CLEAN_HEADCOUNT_COLUMN_NAME); + if (status_column == NULL) + { + obidebug(1, "\nError getting the obiclean head count column"); + return -1; + } singletoncount_column = obi_view_get_column(o_view, CLEAN_SINGLETONCOUNT_COLUMN_NAME); + if (status_column == NULL) + { + obidebug(1, "\nError getting the obiclean singleton count column"); + return -1; + } samplecount_column = obi_view_get_column(o_view, CLEAN_SAMPLECOUNT_COLUMN_NAME); + if (status_column == NULL) + { + obidebug(1, "\nError getting the obiclean sample count column"); + return -1; + } // Build kmer tables ktable = hash_seq_column(i_view, iseq_column, 0); @@ -262,30 +336,50 @@ int obi_clean(const char* dms_name, // Allocate arrays of pointers to binary trees yes_trees = (void**) calloc(seq_count, sizeof(void*)); + if (yes_trees == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for 'yes' binary trees"); + return -1; + } no_trees = (void**) calloc(seq_count, sizeof(void*)); + if (no_trees == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for 'no' binary trees"); + return -1; + } // Allocate and fill index array for the binary trees to reference (they don't copy the data) index_array = (index_t*) malloc(seq_count * sizeof(index_t)); + if (index_array == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for the index array"); + return -1; + } for (i=0; i < seq_count; i++) index_array[i]=i; // Initialize all sequences to singletons or NA if no sequences in that sample - for (i=0; i < (seq_count - 1); i++) + for (i=0; i= s2_count) && (((double) s2_count / (double) s1_count) <= max_ratio))) || (((s2_count >= s1_count) && (((double) s1_count / (double) s2_count) <= max_ratio))))) { + yes=NULL; no=NULL; no = tfind(index_array+j, &(no_trees[i]), idxcmp); @@ -333,7 +424,7 @@ int obi_clean(const char* dms_name, yes = tfind(index_array+j, &(yes_trees[i]), idxcmp); above_threshold = false; - if ((no == NULL) && (yes == NULL)) //never compared + if ((no == NULL) && (yes == NULL)) // never compared before { // Check if the sequences are identical in a quick way (same index in the same indexer) if (obi_get_index_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0) == obi_get_index_with_elt_idx_and_col_p_in_view(i_view, iseq_column, j, 0)) @@ -354,38 +445,74 @@ int obi_clean(const char* dms_name, if (yes || above_threshold) { if (yes == NULL) - // put seqs in their 'yes' trees - key_p = tsearch(index_array+j, &(yes_trees[i]), idxcmp); // TODO error checking + // Put in 'yes' tree of 1st sequence + { + key_p = tsearch(index_array+j, &(yes_trees[i]), idxcmp); + if (key_p == NULL) + { + obidebug(1, "\nError adding an index in a binary tree"); + return -1; + } + } + // label as head or internal if (s1_count >= s2_count) { if (obi_get_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample) == 's') // seq can become head ONLY if it's a singleton - obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample, 'h'); + { + if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample, 'h') < 0) + return -1; + } // Otherwise it's an internal - obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample, 'i'); + if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample, 'i') < 0) + return -1; } else { if (obi_get_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample) == 's') // seq can become head ONLY if it's a singleton - obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample, 'h'); - obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample, 'i'); + { + if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample, 'h') < 0) + return -1; + } + if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample, 'i') < 0) + return -1; } } else if (no == NULL) - // put in 'no' tree of both sequences // TODO why just one then??? - key_p = tsearch(index_array+j, &(no_trees[i]), idxcmp); // TODO error checking + // Put in 'no' tree of 1st sequence + { + key_p = tsearch(index_array+j, &(no_trees[i]), idxcmp); + if (key_p == NULL) + { + obidebug(1, "\nError adding an index in a binary tree"); + return -1; + } + } } - free(seq2); } - free(seq1); } } free_kmer_tables(ktable, seq_count); + free(index_array); + free(yes_trees); + free(no_trees); fprintf(stderr, "\n"); - for (i=0; i < (seq_count - 1); i++) + if (heads_only) + { + line_selection = malloc((o_view->infos)->line_count * sizeof(index_t)); + if (line_selection == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError allocating memory for line selection"); + return -1; + } + l=0; + } + + for (i=0; i