From debf59b266b8f55643b5b7e9667e899a333b9b68 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Sat, 25 May 2019 18:37:56 +0200 Subject: [PATCH] i loop parallelized: bad --- src/obi_clean.c | 202 +++++++++++++++++++++++++----------------------- 1 file changed, 107 insertions(+), 95 deletions(-) diff --git a/src/obi_clean.c b/src/obi_clean.c index 9cda879..648118b 100755 --- a/src/obi_clean.c +++ b/src/obi_clean.c @@ -183,6 +183,7 @@ int obi_clean(const char* dms_name, char status; byte_t* alignment_result_array = NULL; + byte_t* ali_result_array = NULL; byte_t ali_result; int* complete_sample_count_array = NULL; @@ -211,6 +212,7 @@ int obi_clean(const char* dms_name, bool stop = false; int max_threads = 1; + int thread_id = 0; #ifdef _OPENMP max_threads = omp_get_max_threads(); @@ -375,7 +377,7 @@ int obi_clean(const char* dms_name, // Allocate alignment result array (byte at 0 if not aligned yet, // 1 if sequence at index has a similarity above the threshold with the current sequence, // 2 if sequence at index has a similarity below the threshold with the current sequence) - alignment_result_array = (byte_t*) calloc(seq_count, sizeof(byte_t)); + alignment_result_array = (byte_t*) calloc(thread_count*seq_count, sizeof(byte_t)); if (alignment_result_array == NULL) { obi_set_errno(OBI_MALLOC_ERROR); @@ -399,126 +401,133 @@ int obi_clean(const char* dms_name, } } + #pragma omp parallel shared(seq_count, blob_array, complete_sample_count_array, alignment_result_array, stop) \ + private(ali_result_array, thread_id, i, j, blob1, blob2, s1_count, s2_count, \ + sample_count_array, sample, yes, no, above_threshold, ali_result, score) - for (i=0; i< (seq_count-1); i++) { - if (i%1000 == 0) + + #ifdef _OPENMP + thread_id = omp_get_thread_num(); + ali_result_array = alignment_result_array+thread_id; + #endif + + #pragma omp for schedule(dynamic, 100) + for (i=0; i< (seq_count-1); i++) { - p = (i/(float)seq_count)*100; - fprintf(stderr,"\rDone : %f %% ",p); - } - - // Get first sequence - blob1 = blob_array[i]; -// blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0); // slower - if (blob1 == NULL) - { - obidebug(1, "\nError retrieving sequences to align"); - return -1; - } - - for (sample=0; sample < sample_count; sample++) - { - sample_count_array = complete_sample_count_array+(sample*seq_count); - - // Get count for this sample - s1_count = sample_count_array[i]; - //s1_count = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, i, sample); // slower - - #pragma omp parallel for shared(i, seq_count, s1_count, sample, blob_array, sample_count_array, alignment_result_array, stop) \ - private(j, blob2, s2_count, yes, no, above_threshold, ali_result, score) - - for (j=i+1; j < seq_count; j++) + if (i%1000 == 0) { - // Get second sequence - blob2 = blob_array[j]; -// blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, j, 0); // slower - if (blob2 == NULL) - { - obidebug(1, "\nError retrieving sequences to align"); - stop = true; - } + p = (i/(float)(seq_count/(float)thread_count))*100; + fprintf(stderr,"\rDone : %f %% ",p); + } + + // Get first sequence + blob1 = blob_array[i]; + // blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0); // slower + if (blob1 == NULL) + { + obidebug(1, "\nError retrieving sequences to align"); + stop = true; + } + + for (sample=0; sample < sample_count; sample++) + { + sample_count_array = complete_sample_count_array+(sample*seq_count); // Get count for this sample - s2_count = sample_count_array[j]; - //s2_count = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, j, sample); // slower + s1_count = sample_count_array[i]; + //s1_count = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, i, sample); // slower - // Check all ratios - if (((s1_count!=OBIInt_NA && s2_count!=OBIInt_NA) && (s1_count>0 && s2_count>0)) && - ((((s1_count >= s2_count) && (((double) s2_count / (double) s1_count) <= max_ratio))) || - (((s2_count >= s1_count) && (((double) s1_count / (double) s2_count) <= max_ratio))))) + for (j=i+1; j < seq_count; j++) { - - yes = 0; - no = 0; - above_threshold = false; - ali_result = alignment_result_array[j]; - if (ali_result > 0) // already aligned + // Get second sequence + blob2 = blob_array[j]; + // blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, j, 0); // slower + if (blob2 == NULL) { - if (ali_result == 2) - no = 1; - else if (ali_result == 1) - yes = 1; + obidebug(1, "\nError retrieving sequences to align"); + stop = true; } - else // never compared before + + // Get count for this sample + s2_count = sample_count_array[j]; + //s2_count = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, j, sample); // slower + + // Check all ratios + if (((s1_count!=OBIInt_NA && s2_count!=OBIInt_NA) && (s1_count>0 && s2_count>0)) && + ((((s1_count >= s2_count) && (((double) s2_count / (double) s1_count) <= max_ratio))) || + (((s2_count >= s1_count) && (((double) s1_count / (double) s2_count) <= max_ratio))))) { - // Check if the sequences are identical in a quick way (same index in the same indexer) - if (obi_get_index_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0) == obi_get_index_with_elt_idx_and_col_p_in_view(i_view, iseq_column, j, 0)) - above_threshold = true; - else // the sequences aren't identical + + yes = 0; + no = 0; + above_threshold = false; + ali_result = ali_result_array[j]; + if (ali_result > 0) // already aligned { - // kmer filter - align_filters(ktable, blob1, blob2, i, j, threshold, normalize, reference, similarity_mode, &score, &lcs_min, false); - - // Compute alignment score if filter passed - if (score == -1.0) - score = obiblob_sse_banded_lcs_align(blob1, blob2, threshold, normalize, reference, similarity_mode, &lcs_length, &ali_length); - - above_threshold = ((score >= 0) && (score <= threshold)); + if (ali_result == 2) + no = 1; + else if (ali_result == 1) + yes = 1; } - } - - if (yes || above_threshold) - { - if (yes == 0) - // Set ali result as above the threshold (value 1) - alignment_result_array[j] = 1; - - // Might be worth having arrays to read values too for some datasets but unlikely - // label as head or internal - if (s1_count >= s2_count) + else // never compared before { - if (obi_get_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample) == 's') // seq can become head ONLY if it's a singleton + // Check if the sequences are identical in a quick way (same index in the same indexer) + if (obi_get_index_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0) == obi_get_index_with_elt_idx_and_col_p_in_view(i_view, iseq_column, j, 0)) + above_threshold = true; + else // the sequences aren't identical { - if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample, 'h') < 0) - stop = true; + // kmer filter + align_filters(ktable, blob1, blob2, i, j, threshold, normalize, reference, similarity_mode, &score, &lcs_min, false); + + // Compute alignment score if filter passed + if (score == -1.0) + score = obiblob_sse_banded_lcs_align(blob1, blob2, threshold, normalize, reference, similarity_mode, &lcs_length, &ali_length); + + above_threshold = ((score >= 0) && (score <= threshold)); } - // Otherwise it's an internal (do nothing) - // Label other sequence as internal no matter what - if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample, 'i') < 0) - stop = true; } - else // Same thing but with sequences switched + + if (yes || above_threshold) { - if (obi_get_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample) == 's') // seq can become head ONLY if it's a singleton + if (yes == 0) + // Set ali result as above the threshold (value 1) + ali_result_array[j] = 1; + + // Might be worth having arrays to read values too for some datasets but unlikely + // label as head or internal + if (s1_count >= s2_count) + { + if (obi_get_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample) == 's') // seq can become head ONLY if it's a singleton { - if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample, 'h') < 0) + if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample, 'h') < 0) stop = true; } - if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample, 'i') < 0) - stop = true; + // Otherwise it's an internal (do nothing) + // Label other sequence as internal no matter what + if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample, 'i') < 0) + stop = true; + } + else // Same thing but with sequences switched + { + if (obi_get_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample) == 's') // seq can become head ONLY if it's a singleton + { + if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample, 'h') < 0) + stop = true; + } + if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample, 'i') < 0) + stop = true; + } } + else if (no == 0) + // Set ali result as above the threshold (value 2) + ali_result_array[j] = 2; } - else if (no == 0) - // Set ali result as above the threshold (value 2) - alignment_result_array[j] = 2; } } - if (stop) - return -1; + // Reset ali result array to 0 + memset(ali_result_array, 0, seq_count); } - // Reset ali result array to 0 - memset(alignment_result_array, 0, seq_count); } free_kmer_tables(ktable, seq_count); @@ -528,6 +537,9 @@ int obi_clean(const char* dms_name, fprintf(stderr, "\n"); + if (stop) + return -1; + if (heads_only) { line_selection = malloc((o_view->infos)->line_count * sizeof(index_t));