diff --git a/src/obi_clean.c b/src/obi_clean.c index 65163c8..53d9a56 100755 --- a/src/obi_clean.c +++ b/src/obi_clean.c @@ -159,12 +159,11 @@ int obi_clean(const char* dms_name, bool heads_only, int thread_count) { - char* o_view_name_temp; + char* o_view_name_temp = NULL; float p; index_t i, j, l; index_t seq_count; - index_t* index_array; - index_t* line_selection; + index_t* line_selection = NULL; double score; bool above_threshold; int lcs_length; @@ -184,12 +183,12 @@ int obi_clean(const char* dms_name, int ind_sample_count; char status; - void** yes_trees = NULL; - void** no_trees = NULL; + byte_t* alignment_result_array = NULL; + byte_t ali_result; - int* complete_sample_count_array = NULL; - int* sample_count_array = NULL; - Obi_blob_p* blob_array = NULL; + int* complete_sample_count_array = NULL; + int* sample_count_array = NULL; + Obi_blob_p* blob_array = NULL; OBIDMS_p dms = NULL; Obiview_p i_view = NULL; @@ -203,9 +202,8 @@ int obi_clean(const char* dms_name, OBIDMS_column_p singletoncount_column = NULL; OBIDMS_column_p samplecount_column = NULL; - void* no; - void* yes; - void* key_p; + byte_t no; + byte_t yes; bool normalize = false; int reference = 0; @@ -363,32 +361,16 @@ int obi_clean(const char* dms_name, blob_array[i] = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0); } - // Allocate arrays of pointers to binary trees - yes_trees = (void**) calloc(seq_count, sizeof(void*)); - if (yes_trees == NULL) + // Allocate alignment result array (byte at 0 if not aligned yet, + // 1 if sequence at index has a similarity above the threshold with the current sequence, + // 2 if sequence at index has a similarity below the threshold with the current sequence) + alignment_result_array = (byte_t*) calloc(seq_count, sizeof(byte_t)); + if (alignment_result_array == NULL) { obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for 'yes' binary trees"); + obidebug(1, "\nError allocating memory for alignment result array"); 
return -1; } - no_trees = (void**) calloc(seq_count, sizeof(void*)); - if (no_trees == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for 'no' binary trees"); - return -1; - } - - // Allocate and fill index array for the binary trees to reference (they don't copy the data) - index_array = (index_t*) malloc(seq_count * sizeof(index_t)); - if (index_array == NULL) - { - obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError allocating memory for the index array"); - return -1; - } - for (i=0; i < seq_count; i++) - index_array[i]=i; // Initialize all sequences to singletons or NA if no sequences in that sample for (i=0; i0 && s2_count>0)) && ((((s1_count >= s2_count) && (((double) s2_count / (double) s1_count) <= max_ratio))) || (((s2_count >= s1_count) && (((double) s1_count / (double) s2_count) <= max_ratio))))) { - yes=NULL; - no=NULL; - no = tfind(index_array+j, &(no_trees[i]), idxcmp); - if (no == NULL) - yes = tfind(index_array+j, &(yes_trees[i]), idxcmp); - + yes = 0; + no = 0; above_threshold = false; - if ((no == NULL) && (yes == NULL)) // never compared before + ali_result = alignment_result_array[j]; + if (ali_result > 0) // already aligned + { + if (ali_result == 2) + no = 1; + else if (ali_result == 1) + yes = 1; + } + else // never compared before { // Check if the sequences are identical in a quick way (same index in the same indexer) if (obi_get_index_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0) == obi_get_index_with_elt_idx_and_col_p_in_view(i_view, iseq_column, j, 0)) @@ -478,16 +466,9 @@ int obi_clean(const char* dms_name, if (yes || above_threshold) { - if (yes == NULL) - // Put in 'yes' tree of 1st sequence - { - key_p = tsearch(index_array+j, &(yes_trees[i]), idxcmp); - if (key_p == NULL) - { - obidebug(1, "\nError adding an index in a binary tree"); - return -1; - } - } + if (yes == 0) + // Set ali result as above the threshold (value 1) + alignment_result_array[j] = 1; // Might be worth 
having arrays to read values too for some datasets but unlikely // label as head or internal @@ -498,11 +479,12 @@ int obi_clean(const char* dms_name, if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample, 'h') < 0) return -1; } - // Otherwise it's an internal + // Otherwise it's an internal (do nothing) + // Label other sequence as internal no matter what if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample, 'i') < 0) return -1; } - else + else // Same thing but with sequences switched { if (obi_get_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample) == 's') // seq can become head ONLY if it's a singleton { @@ -513,27 +495,20 @@ int obi_clean(const char* dms_name, return -1; } } - else if (no == NULL) - // Put in 'no' tree of 1st sequence - { - key_p = tsearch(index_array+j, &(no_trees[i]), idxcmp); - if (key_p == NULL) - { - obidebug(1, "\nError adding an index in a binary tree"); - return -1; - } - } + else if (no == 0) + // Set ali result as below the threshold (value 2) + alignment_result_array[j] = 2; } } } + // Reset ali result array to 0 + memset(alignment_result_array, 0, seq_count); } free_kmer_tables(ktable, seq_count); - free(index_array); free(complete_sample_count_array); free(blob_array); - free(yes_trees); - free(no_trees); + free(alignment_result_array); fprintf(stderr, "\n"); @@ -653,7 +628,7 @@ int obi_clean(const char* dms_name, return -1; } - fprintf(stderr,"\rDone : 100 %% "); + fprintf(stderr, "\rDone : 100 %% \n"); return 0; }