obi clean: made more efficient with arrays (about 15x faster compared with
OBI1)
This commit is contained in:
52
src/obi_clean.c
Normal file → Executable file
52
src/obi_clean.c
Normal file → Executable file
@ -185,8 +185,12 @@ int obi_clean(const char* dms_name,
|
|||||||
int ind_sample_count;
|
int ind_sample_count;
|
||||||
char status;
|
char status;
|
||||||
|
|
||||||
void** yes_trees;
|
void** yes_trees = NULL;
|
||||||
void** no_trees;
|
void** no_trees = NULL;
|
||||||
|
|
||||||
|
int* complete_sample_count_array = NULL;
|
||||||
|
int* sample_count_array = NULL;
|
||||||
|
Obi_blob_p* blob_array = NULL;
|
||||||
|
|
||||||
OBIDMS_p dms = NULL;
|
OBIDMS_p dms = NULL;
|
||||||
Obiview_p i_view = NULL;
|
Obiview_p i_view = NULL;
|
||||||
@ -332,6 +336,34 @@ int obi_clean(const char* dms_name,
|
|||||||
|
|
||||||
seq_count = (i_view->infos)->line_count;
|
seq_count = (i_view->infos)->line_count;
|
||||||
|
|
||||||
|
// Allocate arrays for sample counts; otherwise, reading from mapped files takes longer
|
||||||
|
complete_sample_count_array = (int*) malloc(seq_count * sample_count * sizeof(int));
|
||||||
|
if (complete_sample_count_array == NULL)
|
||||||
|
{
|
||||||
|
obi_set_errno(OBI_MALLOC_ERROR);
|
||||||
|
obidebug(1, "\nError allocating memory for the array of sample counts, size: %lld", seq_count * sample_count * sizeof(int));
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
for (sample=0; sample < sample_count; sample++)
|
||||||
|
{
|
||||||
|
sample_count_array = complete_sample_count_array+(sample*seq_count);
|
||||||
|
for (i=0; i<seq_count; i++)
|
||||||
|
sample_count_array[i] = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, i, sample);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allocate arrays for blobs; otherwise, reading from mapped files takes longer
|
||||||
|
blob_array = (Obi_blob_p*) malloc(seq_count * sizeof(Obi_blob_p));
|
||||||
|
if (blob_array == NULL)
|
||||||
|
{
|
||||||
|
obi_set_errno(OBI_MALLOC_ERROR);
|
||||||
|
obidebug(1, "\nError allocating memory for the array of blobs");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
for (i=0; i<seq_count; i++)
|
||||||
|
{
|
||||||
|
blob_array[i] = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0);
|
||||||
|
}
|
||||||
|
|
||||||
// Allocate arrays of pointers to binary trees
|
// Allocate arrays of pointers to binary trees
|
||||||
yes_trees = (void**) calloc(seq_count, sizeof(void*));
|
yes_trees = (void**) calloc(seq_count, sizeof(void*));
|
||||||
if (yes_trees == NULL)
|
if (yes_trees == NULL)
|
||||||
@ -377,6 +409,7 @@ int obi_clean(const char* dms_name,
|
|||||||
|
|
||||||
for (sample=0; sample < sample_count; sample++)
|
for (sample=0; sample < sample_count; sample++)
|
||||||
{
|
{
|
||||||
|
sample_count_array = complete_sample_count_array+(sample*seq_count);
|
||||||
for (i=0; i< (seq_count-1); i++)
|
for (i=0; i< (seq_count-1); i++)
|
||||||
{
|
{
|
||||||
if (i%1000 == 0)
|
if (i%1000 == 0)
|
||||||
@ -386,7 +419,8 @@ int obi_clean(const char* dms_name,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Get first sequence
|
// Get first sequence
|
||||||
blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0);
|
blob1 = blob_array[i];
|
||||||
|
// blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0); // slower
|
||||||
if (blob1 == NULL)
|
if (blob1 == NULL)
|
||||||
{
|
{
|
||||||
obidebug(1, "\nError retrieving sequences to align");
|
obidebug(1, "\nError retrieving sequences to align");
|
||||||
@ -394,12 +428,14 @@ int obi_clean(const char* dms_name,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Get count for this sample
|
// Get count for this sample
|
||||||
s1_count = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, i, sample);
|
s1_count = sample_count_array[i];
|
||||||
|
//s1_count = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, i, sample); // slower
|
||||||
|
|
||||||
for (j=i+1; j < seq_count; j++) // TODO parallelize this loop?
|
for (j=i+1; j < seq_count; j++) // TODO parallelize this loop?
|
||||||
{
|
{
|
||||||
// Get second sequence
|
// Get second sequence
|
||||||
blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, j, 0);
|
blob2 = blob_array[j];
|
||||||
|
// blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, j, 0); // slower
|
||||||
if (blob2 == NULL)
|
if (blob2 == NULL)
|
||||||
{
|
{
|
||||||
obidebug(1, "\nError retrieving sequences to align");
|
obidebug(1, "\nError retrieving sequences to align");
|
||||||
@ -407,7 +443,8 @@ int obi_clean(const char* dms_name,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Get count for this sample
|
// Get count for this sample
|
||||||
s2_count = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, j, sample);
|
s2_count = sample_count_array[j];
|
||||||
|
//s2_count = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, j, sample); // slower
|
||||||
|
|
||||||
// Checking ratio
|
// Checking ratio
|
||||||
if (((s1_count!=OBIInt_NA && s2_count!=OBIInt_NA) && (s1_count>0 && s2_count>0)) &&
|
if (((s1_count!=OBIInt_NA && s2_count!=OBIInt_NA) && (s1_count>0 && s2_count>0)) &&
|
||||||
@ -453,6 +490,7 @@ int obi_clean(const char* dms_name,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Having arrays to read these values too might be worthwhile for some datasets, but that is unlikely
|
||||||
// label as head or internal
|
// label as head or internal
|
||||||
if (s1_count >= s2_count)
|
if (s1_count >= s2_count)
|
||||||
{
|
{
|
||||||
@ -493,6 +531,8 @@ int obi_clean(const char* dms_name,
|
|||||||
|
|
||||||
free_kmer_tables(ktable, seq_count);
|
free_kmer_tables(ktable, seq_count);
|
||||||
free(index_array);
|
free(index_array);
|
||||||
|
free(complete_sample_count_array);
|
||||||
|
free(blob_array);
|
||||||
free(yes_trees);
|
free(yes_trees);
|
||||||
free(no_trees);
|
free(no_trees);
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user