Multithreaded obiclean: working, but the code is not cleaned up yet
This commit is contained in:
131
src/obi_clean.c
131
src/obi_clean.c
@ -160,33 +160,23 @@ int obi_clean(const char* dms_name,
|
|||||||
{
|
{
|
||||||
char* o_view_name_temp = NULL;
|
char* o_view_name_temp = NULL;
|
||||||
float p;
|
float p;
|
||||||
index_t i, j, l;
|
index_t l;
|
||||||
|
index_t k;
|
||||||
index_t seq_count;
|
index_t seq_count;
|
||||||
index_t* line_selection = NULL;
|
index_t* line_selection = NULL;
|
||||||
double score;
|
|
||||||
bool above_threshold;
|
|
||||||
int lcs_length;
|
|
||||||
int ali_length;
|
|
||||||
Kmer_table_p ktable;
|
Kmer_table_p ktable;
|
||||||
Obi_blob_p blob1;
|
|
||||||
Obi_blob_p blob2;
|
|
||||||
int lcs_min;
|
|
||||||
int sample_count;
|
int sample_count;
|
||||||
int sample;
|
|
||||||
int s1_count;
|
|
||||||
int s2_count;
|
|
||||||
bool head;
|
bool head;
|
||||||
int head_count;
|
int head_count;
|
||||||
int internal_count;
|
int internal_count;
|
||||||
int singleton_count;
|
int singleton_count;
|
||||||
int ind_sample_count;
|
int ind_sample_count;
|
||||||
char status;
|
char status;
|
||||||
|
int samp;
|
||||||
|
|
||||||
byte_t* alignment_result_array = NULL;
|
byte_t* alignment_result_array = NULL;
|
||||||
byte_t ali_result;
|
|
||||||
|
|
||||||
int* complete_sample_count_array = NULL;
|
int* complete_sample_count_array = NULL;
|
||||||
int* sample_count_array = NULL;
|
|
||||||
Obi_blob_p* blob_array = NULL;
|
Obi_blob_p* blob_array = NULL;
|
||||||
|
|
||||||
OBIDMS_p dms = NULL;
|
OBIDMS_p dms = NULL;
|
||||||
@ -201,9 +191,6 @@ int obi_clean(const char* dms_name,
|
|||||||
OBIDMS_column_p singletoncount_column = NULL;
|
OBIDMS_column_p singletoncount_column = NULL;
|
||||||
OBIDMS_column_p samplecount_column = NULL;
|
OBIDMS_column_p samplecount_column = NULL;
|
||||||
|
|
||||||
byte_t no;
|
|
||||||
byte_t yes;
|
|
||||||
|
|
||||||
bool normalize = false;
|
bool normalize = false;
|
||||||
int reference = 0;
|
int reference = 0;
|
||||||
bool similarity_mode = false;
|
bool similarity_mode = false;
|
||||||
@ -352,11 +339,10 @@ int obi_clean(const char* dms_name,
|
|||||||
obidebug(1, "\nError allocating memory for the array of sample counts, size: %lld", seq_count * sample_count * sizeof(int));
|
obidebug(1, "\nError allocating memory for the array of sample counts, size: %lld", seq_count * sample_count * sizeof(int));
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
for (sample=0; sample < sample_count; sample++)
|
for (samp=0; samp < sample_count; samp++)
|
||||||
{
|
{
|
||||||
sample_count_array = complete_sample_count_array+(sample*seq_count);
|
for (k=0; k<seq_count; k++)
|
||||||
for (i=0; i<seq_count; i++)
|
complete_sample_count_array[k+(samp*seq_count)] = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, k, samp);
|
||||||
sample_count_array[i] = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, i, sample);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Allocate arrays for blobs otherwise reading in mapped files takes longer
|
// Allocate arrays for blobs otherwise reading in mapped files takes longer
|
||||||
@ -367,14 +353,15 @@ int obi_clean(const char* dms_name,
|
|||||||
obidebug(1, "\nError allocating memory for the array of blobs");
|
obidebug(1, "\nError allocating memory for the array of blobs");
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
for (i=0; i<seq_count; i++)
|
for (k=0; k<seq_count; k++)
|
||||||
{
|
{
|
||||||
blob_array[i] = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0);
|
blob_array[k] = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, k, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Allocate alignment result array (byte at 0 if not aligned yet,
|
// Allocate alignment result array (byte at 0 if not aligned yet,
|
||||||
// 1 if sequence at index has a similarity above the threshold with the current sequence,
|
// 1 if sequence at index has a similarity above the threshold with the current sequence,
|
||||||
// 2 if sequence at index has a similarity below the threshold with the current sequence)
|
// 2 if sequence at index has a similarity below the threshold with the current sequence)
|
||||||
|
//alignment_result_array = (byte_t*) calloc(thread_count*seq_count, sizeof(byte_t));
|
||||||
alignment_result_array = (byte_t*) calloc(seq_count, sizeof(byte_t));
|
alignment_result_array = (byte_t*) calloc(seq_count, sizeof(byte_t));
|
||||||
if (alignment_result_array == NULL)
|
if (alignment_result_array == NULL)
|
||||||
{
|
{
|
||||||
@ -384,13 +371,13 @@ int obi_clean(const char* dms_name,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Initialize all sequences to singletons or NA if no sequences in that sample
|
// Initialize all sequences to singletons or NA if no sequences in that sample
|
||||||
for (i=0; i<seq_count; i++)
|
for (k=0; k<seq_count; k++)
|
||||||
{
|
{
|
||||||
for (sample=0; sample < sample_count; sample++)
|
for (samp=0; samp < sample_count; samp++)
|
||||||
{
|
{
|
||||||
if (obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, i, sample) != OBIInt_NA) // Only initialize samples where there are some sequences
|
if (obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, k, samp) != OBIInt_NA) // Only initialize samples where there are some sequences
|
||||||
{
|
{
|
||||||
if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample, 's') < 0)
|
if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, k, samp, 's') < 0)
|
||||||
{
|
{
|
||||||
obidebug(1, "\nError initializing all sequences to singletons");
|
obidebug(1, "\nError initializing all sequences to singletons");
|
||||||
return -1;
|
return -1;
|
||||||
@ -399,12 +386,17 @@ int obi_clean(const char* dms_name,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Obi_blob_p blob1;
|
||||||
|
index_t i;
|
||||||
|
byte_t* ali_result_array = alignment_result_array;
|
||||||
|
|
||||||
|
|
||||||
for (i=0; i< (seq_count-1); i++)
|
for (i=0; i< (seq_count-1); i++)
|
||||||
{
|
{
|
||||||
|
|
||||||
if (i%1000 == 0)
|
if (i%1000 == 0)
|
||||||
{
|
{
|
||||||
p = (i/(float)seq_count)*100;
|
p = (i/(float)(seq_count/(float)thread_count))*100;
|
||||||
fprintf(stderr,"\rDone : %f %% ",p);
|
fprintf(stderr,"\rDone : %f %% ",p);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -414,23 +406,57 @@ int obi_clean(const char* dms_name,
|
|||||||
if (blob1 == NULL)
|
if (blob1 == NULL)
|
||||||
{
|
{
|
||||||
obidebug(1, "\nError retrieving sequences to align");
|
obidebug(1, "\nError retrieving sequences to align");
|
||||||
return -1;
|
stop = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#pragma omp parallel default(none) \
|
||||||
|
shared(ali_result_array, thread_count, seq_count, blob_array, complete_sample_count_array, alignment_result_array, stop, blob1, i, \
|
||||||
|
obi_errno, stderr, max_ratio, iseq_column, i_view, similarity_mode, reference, normalize, threshold, ktable, status_column, o_view, sample_count)
|
||||||
|
//private(ali_result_array, thread_id, i, j, p, blob1, blob2, s1_count, s2_count, \
|
||||||
|
sample_count_array, sample, yes, no, above_threshold, ali_length, lcs_length, ali_result, score)
|
||||||
|
|
||||||
|
{
|
||||||
|
|
||||||
|
//byte_t* ali_result_array = NULL;
|
||||||
|
int thread_id = 0;
|
||||||
|
index_t j;
|
||||||
|
float p;
|
||||||
|
Obi_blob_p blob2;
|
||||||
|
int s1_count;
|
||||||
|
int s2_count;
|
||||||
|
int* sample_count_array;
|
||||||
|
int sample;
|
||||||
|
byte_t no;
|
||||||
|
byte_t yes;
|
||||||
|
double score;
|
||||||
|
int lcs_min;
|
||||||
|
bool above_threshold;
|
||||||
|
int lcs_length;
|
||||||
|
int ali_length;
|
||||||
|
byte_t ali_result;
|
||||||
|
|
||||||
|
// #ifdef _OPENMP
|
||||||
|
// thread_id = omp_get_thread_num();
|
||||||
|
// ali_result_array = alignment_result_array+thread_id;
|
||||||
|
// #else
|
||||||
|
// ali_result_array = alignment_result_array;
|
||||||
|
// #endif
|
||||||
|
|
||||||
|
#pragma omp for schedule(dynamic, sample_count/thread_count)
|
||||||
for (sample=0; sample < sample_count; sample++)
|
for (sample=0; sample < sample_count; sample++)
|
||||||
{
|
{
|
||||||
|
|
||||||
sample_count_array = complete_sample_count_array+(sample*seq_count);
|
sample_count_array = complete_sample_count_array+(sample*seq_count);
|
||||||
|
|
||||||
// Get count for this sample
|
// Get count for this sample
|
||||||
s1_count = sample_count_array[i];
|
s1_count = sample_count_array[i];
|
||||||
//s1_count = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, i, sample); // slower
|
//s1_count = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, i, sample); // slower
|
||||||
|
|
||||||
#pragma omp parallel shared(i, seq_count, s1_count, sample, blob_array, sample_count_array, alignment_result_array, stop) \
|
|
||||||
private(j, blob2, s2_count, yes, no, above_threshold, ali_result, score)
|
|
||||||
{
|
|
||||||
#pragma omp for schedule(dynamic, 100)
|
|
||||||
for (j=i+1; j < seq_count; j++)
|
for (j=i+1; j < seq_count; j++)
|
||||||
{
|
{
|
||||||
|
|
||||||
|
//fprintf(stderr, "\nthread=%d, i=%d, sample=%d, j=%d", omp_get_thread_num(),i,sample,j);
|
||||||
// Get second sequence
|
// Get second sequence
|
||||||
blob2 = blob_array[j];
|
blob2 = blob_array[j];
|
||||||
// blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, j, 0); // slower
|
// blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, j, 0); // slower
|
||||||
@ -453,7 +479,7 @@ int obi_clean(const char* dms_name,
|
|||||||
yes = 0;
|
yes = 0;
|
||||||
no = 0;
|
no = 0;
|
||||||
above_threshold = false;
|
above_threshold = false;
|
||||||
ali_result = alignment_result_array[j];
|
ali_result = ali_result_array[j];
|
||||||
if (ali_result > 0) // already aligned
|
if (ali_result > 0) // already aligned
|
||||||
{
|
{
|
||||||
if (ali_result == 2)
|
if (ali_result == 2)
|
||||||
@ -483,7 +509,7 @@ int obi_clean(const char* dms_name,
|
|||||||
{
|
{
|
||||||
if (yes == 0)
|
if (yes == 0)
|
||||||
// Set ali result as above the threshold (value 1)
|
// Set ali result as above the threshold (value 1)
|
||||||
alignment_result_array[j] = 1;
|
ali_result_array[j] = 1;
|
||||||
|
|
||||||
// Might be worth having arrays to read values too for some datasets but unlikely
|
// Might be worth having arrays to read values too for some datasets but unlikely
|
||||||
// label as head or internal
|
// label as head or internal
|
||||||
@ -513,17 +539,19 @@ int obi_clean(const char* dms_name,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
else if (no == 0)
|
else if (no == 0)
|
||||||
// Set ali result as below the threshold (value 2)
|
// Set ali result as below the threshold (value 2)
|
||||||
alignment_result_array[j] = 2;
|
ali_result_array[j] = 2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
if (stop)
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
// Reset ali result array to 0
|
// Reset ali result array to 0
|
||||||
memset(alignment_result_array, 0, seq_count);
|
memset(ali_result_array, 0, seq_count);
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
free_kmer_tables(ktable, seq_count);
|
free_kmer_tables(ktable, seq_count);
|
||||||
@ -533,6 +561,9 @@ int obi_clean(const char* dms_name,
|
|||||||
|
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
|
|
||||||
|
if (stop)
|
||||||
|
return -1;
|
||||||
|
|
||||||
if (heads_only)
|
if (heads_only)
|
||||||
{
|
{
|
||||||
line_selection = malloc((o_view->infos)->line_count * sizeof(index_t));
|
line_selection = malloc((o_view->infos)->line_count * sizeof(index_t));
|
||||||
@ -545,11 +576,11 @@ int obi_clean(const char* dms_name,
|
|||||||
l=0;
|
l=0;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i=0; i<seq_count; i++)
|
for (k=0; k<seq_count; k++)
|
||||||
{
|
{
|
||||||
if (i%1000 == 0)
|
if (k%1000 == 0)
|
||||||
{
|
{
|
||||||
p = (i/(float)(seq_count))*100;
|
p = (k/(float)(seq_count))*100;
|
||||||
fprintf(stderr, "\rAnnotating : %f %% ",p);
|
fprintf(stderr, "\rAnnotating : %f %% ",p);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -559,16 +590,16 @@ int obi_clean(const char* dms_name,
|
|||||||
singleton_count = 0;
|
singleton_count = 0;
|
||||||
ind_sample_count = 0;
|
ind_sample_count = 0;
|
||||||
|
|
||||||
for (sample=0; sample < sample_count; sample++)
|
for (samp=0; samp < sample_count; samp++)
|
||||||
{
|
{
|
||||||
// Check if head or singleton in at least one sample
|
// Check if head or singleton in at least one sample
|
||||||
status = obi_get_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample);
|
status = obi_get_char_with_elt_idx_and_col_p_in_view(o_view, status_column, k, samp);
|
||||||
if ((!head) && ((status == 'h') || (status == 's')))
|
if ((!head) && ((status == 'h') || (status == 's')))
|
||||||
{
|
{
|
||||||
head = true;
|
head = true;
|
||||||
if (heads_only)
|
if (heads_only)
|
||||||
{
|
{
|
||||||
line_selection[l] = i;
|
line_selection[l] = k;
|
||||||
l++;
|
l++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -593,15 +624,15 @@ int obi_clean(const char* dms_name,
|
|||||||
|
|
||||||
if (!heads_only || (heads_only && head)) // Label only if sequence is going to be kept in final view
|
if (!heads_only || (heads_only && head)) // Label only if sequence is going to be kept in final view
|
||||||
{
|
{
|
||||||
if (obi_set_bool_with_elt_idx_and_col_p_in_view(o_view, head_column, i, 0, head) < 0)
|
if (obi_set_bool_with_elt_idx_and_col_p_in_view(o_view, head_column, k, 0, head) < 0)
|
||||||
return -1;
|
return -1;
|
||||||
if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, singletoncount_column, i, 0, singleton_count) < 0)
|
if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, singletoncount_column, k, 0, singleton_count) < 0)
|
||||||
return -1;
|
return -1;
|
||||||
if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, internalcount_column, i, 0, internal_count) < 0)
|
if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, internalcount_column, k, 0, internal_count) < 0)
|
||||||
return -1;
|
return -1;
|
||||||
if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, headcount_column, i, 0, head_count) < 0)
|
if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, headcount_column, k, 0, head_count) < 0)
|
||||||
return -1;
|
return -1;
|
||||||
if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, samplecount_column, i, 0, ind_sample_count) < 0)
|
if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, samplecount_column, k, 0, ind_sample_count) < 0)
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user