obi clean: not using tsearch library anymore, a simple byte array
instead. A lot more time and memory efficient. Closes #67
This commit is contained in:
143
src/obi_clean.c
143
src/obi_clean.c
@ -159,12 +159,11 @@ int obi_clean(const char* dms_name,
|
|||||||
bool heads_only,
|
bool heads_only,
|
||||||
int thread_count)
|
int thread_count)
|
||||||
{
|
{
|
||||||
char* o_view_name_temp;
|
char* o_view_name_temp = NULL;
|
||||||
float p;
|
float p;
|
||||||
index_t i, j, l;
|
index_t i, j, l;
|
||||||
index_t seq_count;
|
index_t seq_count;
|
||||||
index_t* index_array;
|
index_t* line_selection = NULL;
|
||||||
index_t* line_selection;
|
|
||||||
double score;
|
double score;
|
||||||
bool above_threshold;
|
bool above_threshold;
|
||||||
int lcs_length;
|
int lcs_length;
|
||||||
@ -184,12 +183,12 @@ int obi_clean(const char* dms_name,
|
|||||||
int ind_sample_count;
|
int ind_sample_count;
|
||||||
char status;
|
char status;
|
||||||
|
|
||||||
void** yes_trees = NULL;
|
byte_t* alignment_result_array = NULL;
|
||||||
void** no_trees = NULL;
|
byte_t ali_result;
|
||||||
|
|
||||||
int* complete_sample_count_array = NULL;
|
int* complete_sample_count_array = NULL;
|
||||||
int* sample_count_array = NULL;
|
int* sample_count_array = NULL;
|
||||||
Obi_blob_p* blob_array = NULL;
|
Obi_blob_p* blob_array = NULL;
|
||||||
|
|
||||||
OBIDMS_p dms = NULL;
|
OBIDMS_p dms = NULL;
|
||||||
Obiview_p i_view = NULL;
|
Obiview_p i_view = NULL;
|
||||||
@ -203,9 +202,8 @@ int obi_clean(const char* dms_name,
|
|||||||
OBIDMS_column_p singletoncount_column = NULL;
|
OBIDMS_column_p singletoncount_column = NULL;
|
||||||
OBIDMS_column_p samplecount_column = NULL;
|
OBIDMS_column_p samplecount_column = NULL;
|
||||||
|
|
||||||
void* no;
|
byte_t no;
|
||||||
void* yes;
|
byte_t yes;
|
||||||
void* key_p;
|
|
||||||
|
|
||||||
bool normalize = false;
|
bool normalize = false;
|
||||||
int reference = 0;
|
int reference = 0;
|
||||||
@ -363,32 +361,16 @@ int obi_clean(const char* dms_name,
|
|||||||
blob_array[i] = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0);
|
blob_array[i] = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Allocate arrays of pointers to binary trees
|
// Allocate alignment result array (byte at 0 if not aligned yet,
|
||||||
yes_trees = (void**) calloc(seq_count, sizeof(void*));
|
// 1 if sequence at index has a similarity above the threshold with the current sequence,
|
||||||
if (yes_trees == NULL)
|
// 2 if sequence at index has a similarity below the threshold with the current sequence)
|
||||||
|
alignment_result_array = (byte_t*) calloc(seq_count, sizeof(byte_t));
|
||||||
|
if (alignment_result_array == NULL)
|
||||||
{
|
{
|
||||||
obi_set_errno(OBI_MALLOC_ERROR);
|
obi_set_errno(OBI_MALLOC_ERROR);
|
||||||
obidebug(1, "\nError allocating memory for 'yes' binary trees");
|
obidebug(1, "\nError allocating memory for alignment result array");
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
no_trees = (void**) calloc(seq_count, sizeof(void*));
|
|
||||||
if (no_trees == NULL)
|
|
||||||
{
|
|
||||||
obi_set_errno(OBI_MALLOC_ERROR);
|
|
||||||
obidebug(1, "\nError allocating memory for 'no' binary trees");
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Allocate and fill index array for the binary trees to reference (they don't copy the data)
|
|
||||||
index_array = (index_t*) malloc(seq_count * sizeof(index_t));
|
|
||||||
if (index_array == NULL)
|
|
||||||
{
|
|
||||||
obi_set_errno(OBI_MALLOC_ERROR);
|
|
||||||
obidebug(1, "\nError allocating memory for the index array");
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
for (i=0; i < seq_count; i++)
|
|
||||||
index_array[i]=i;
|
|
||||||
|
|
||||||
// Initialize all sequences to singletons or NA if no sequences in that sample
|
// Initialize all sequences to singletons or NA if no sequences in that sample
|
||||||
for (i=0; i<seq_count; i++)
|
for (i=0; i<seq_count; i++)
|
||||||
@ -406,25 +388,27 @@ int obi_clean(const char* dms_name,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (sample=0; sample < sample_count; sample++)
|
|
||||||
{
|
|
||||||
sample_count_array = complete_sample_count_array+(sample*seq_count);
|
|
||||||
for (i=0; i< (seq_count-1); i++)
|
|
||||||
{
|
|
||||||
if (i%1000 == 0)
|
|
||||||
{
|
|
||||||
p = ((i+(sample*(seq_count)))/(float)(seq_count*sample_count))*100;
|
|
||||||
fprintf(stderr,"\rDone : %f %% ",p);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get first sequence
|
for (i=0; i< (seq_count-1); i++)
|
||||||
blob1 = blob_array[i];
|
{
|
||||||
|
if (i%1000 == 0)
|
||||||
|
{
|
||||||
|
p = (i/(float)seq_count)*100;
|
||||||
|
fprintf(stderr,"\rDone : %f %% ",p);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get first sequence
|
||||||
|
blob1 = blob_array[i];
|
||||||
// blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0); // slower
|
// blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0); // slower
|
||||||
if (blob1 == NULL)
|
if (blob1 == NULL)
|
||||||
{
|
{
|
||||||
obidebug(1, "\nError retrieving sequences to align");
|
obidebug(1, "\nError retrieving sequences to align");
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (sample=0; sample < sample_count; sample++)
|
||||||
|
{
|
||||||
|
sample_count_array = complete_sample_count_array+(sample*seq_count);
|
||||||
|
|
||||||
// Get count for this sample
|
// Get count for this sample
|
||||||
s1_count = sample_count_array[i];
|
s1_count = sample_count_array[i];
|
||||||
@ -445,20 +429,24 @@ int obi_clean(const char* dms_name,
|
|||||||
s2_count = sample_count_array[j];
|
s2_count = sample_count_array[j];
|
||||||
//s2_count = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, j, sample); // slower
|
//s2_count = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, j, sample); // slower
|
||||||
|
|
||||||
// Checking ratio
|
// Check all ratios
|
||||||
if (((s1_count!=OBIInt_NA && s2_count!=OBIInt_NA) && (s1_count>0 && s2_count>0)) &&
|
if (((s1_count!=OBIInt_NA && s2_count!=OBIInt_NA) && (s1_count>0 && s2_count>0)) &&
|
||||||
((((s1_count >= s2_count) && (((double) s2_count / (double) s1_count) <= max_ratio))) ||
|
((((s1_count >= s2_count) && (((double) s2_count / (double) s1_count) <= max_ratio))) ||
|
||||||
(((s2_count >= s1_count) && (((double) s1_count / (double) s2_count) <= max_ratio)))))
|
(((s2_count >= s1_count) && (((double) s1_count / (double) s2_count) <= max_ratio)))))
|
||||||
{
|
{
|
||||||
|
|
||||||
yes=NULL;
|
yes = 0;
|
||||||
no=NULL;
|
no = 0;
|
||||||
no = tfind(index_array+j, &(no_trees[i]), idxcmp);
|
|
||||||
if (no == NULL)
|
|
||||||
yes = tfind(index_array+j, &(yes_trees[i]), idxcmp);
|
|
||||||
|
|
||||||
above_threshold = false;
|
above_threshold = false;
|
||||||
if ((no == NULL) && (yes == NULL)) // never compared before
|
ali_result = alignment_result_array[j];
|
||||||
|
if (ali_result > 0) // already aligned
|
||||||
|
{
|
||||||
|
if (ali_result == 2)
|
||||||
|
no = 1;
|
||||||
|
else if (ali_result == 1)
|
||||||
|
yes = 1;
|
||||||
|
}
|
||||||
|
else // never compared before
|
||||||
{
|
{
|
||||||
// Check if the sequences are identical in a quick way (same index in the same indexer)
|
// Check if the sequences are identical in a quick way (same index in the same indexer)
|
||||||
if (obi_get_index_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0) == obi_get_index_with_elt_idx_and_col_p_in_view(i_view, iseq_column, j, 0))
|
if (obi_get_index_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0) == obi_get_index_with_elt_idx_and_col_p_in_view(i_view, iseq_column, j, 0))
|
||||||
@ -478,16 +466,9 @@ int obi_clean(const char* dms_name,
|
|||||||
|
|
||||||
if (yes || above_threshold)
|
if (yes || above_threshold)
|
||||||
{
|
{
|
||||||
if (yes == NULL)
|
if (yes == 0)
|
||||||
// Put in 'yes' tree of 1st sequence
|
// Set ali result as above the threshold (value 1)
|
||||||
{
|
alignment_result_array[j] = 1;
|
||||||
key_p = tsearch(index_array+j, &(yes_trees[i]), idxcmp);
|
|
||||||
if (key_p == NULL)
|
|
||||||
{
|
|
||||||
obidebug(1, "\nError adding an index in a binary tree");
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Might be worth having arrays to read values too for some datasets but unlikely
|
// Might be worth having arrays to read values too for some datasets but unlikely
|
||||||
// label as head or internal
|
// label as head or internal
|
||||||
@ -498,11 +479,12 @@ int obi_clean(const char* dms_name,
|
|||||||
if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample, 'h') < 0)
|
if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample, 'h') < 0)
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
// Otherwise it's an internal
|
// Otherwise it's an internal (do nothing)
|
||||||
|
// Label other sequence as internal no matter what
|
||||||
if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample, 'i') < 0)
|
if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample, 'i') < 0)
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
else
|
else // Same thing but with sequences switched
|
||||||
{
|
{
|
||||||
if (obi_get_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample) == 's') // seq can become head ONLY if it's a singleton
|
if (obi_get_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample) == 's') // seq can become head ONLY if it's a singleton
|
||||||
{
|
{
|
||||||
@ -513,27 +495,20 @@ int obi_clean(const char* dms_name,
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (no == NULL)
|
else if (no == 0)
|
||||||
// Put in 'no' tree of 1st sequence
|
// Set ali result as above the threshold (value 2)
|
||||||
{
|
alignment_result_array[j] = 2;
|
||||||
key_p = tsearch(index_array+j, &(no_trees[i]), idxcmp);
|
|
||||||
if (key_p == NULL)
|
|
||||||
{
|
|
||||||
obidebug(1, "\nError adding an index in a binary tree");
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Reset ali result array to 0
|
||||||
|
memset(alignment_result_array, 0, seq_count);
|
||||||
}
|
}
|
||||||
|
|
||||||
free_kmer_tables(ktable, seq_count);
|
free_kmer_tables(ktable, seq_count);
|
||||||
free(index_array);
|
|
||||||
free(complete_sample_count_array);
|
free(complete_sample_count_array);
|
||||||
free(blob_array);
|
free(blob_array);
|
||||||
free(yes_trees);
|
free(alignment_result_array);
|
||||||
free(no_trees);
|
|
||||||
|
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
|
|
||||||
@ -653,7 +628,7 @@ int obi_clean(const char* dms_name,
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr,"\rDone : 100 %% ");
|
fprintf(stderr, "\rDone : 100 %% \n");
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user