obi clean: made more efficient with arrays (speed ~x15 compared with OBI1)
2018-10-21 17:59:02 +02:00
parent d53323e7f4
commit 6305282305
1 changed files with 46 additions and 6 deletions
--- a/src/obi_clean.c
+++ b/src/obi_clean.c
@ -185,8 +185,12 @@ int obi_clean(const char* dms_name,
 	int 			ind_sample_count;
 	char  			status;

-	void**          yes_trees;
-	void**          no_trees;
+	void**          yes_trees = NULL;
+	void**          no_trees = NULL;
+
+	int* complete_sample_count_array = NULL;
+	int* sample_count_array = NULL;
+	Obi_blob_p* blob_array = NULL;

 	OBIDMS_p        dms = NULL;
 	Obiview_p       i_view = NULL;
@ -332,6 +336,34 @@ int obi_clean(const char* dms_name,

 	seq_count = (i_view->infos)->line_count;

+	// Allocate arrays for sample counts otherwise reading in mapped files takes longer
+	complete_sample_count_array = (int*) malloc(seq_count * sample_count * sizeof(int));
+	if (complete_sample_count_array == NULL)
+	{
+		obi_set_errno(OBI_MALLOC_ERROR);
+		obidebug(1, "\nError allocating memory for the array of sample counts, size: %lld", seq_count * sample_count * sizeof(int));
+		return -1;
+	}
+	for (sample=0; sample < sample_count; sample++)
+	{
+		sample_count_array = complete_sample_count_array+(sample*seq_count);
+		for (i=0; i<seq_count; i++)
+			sample_count_array[i] = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, i, sample);
+	}
+
+	// Allocate arrays for blobs otherwise reading in mapped files takes longer
+	blob_array = (Obi_blob_p*) malloc(seq_count * sizeof(Obi_blob_p));
+	if (blob_array == NULL)
+	{
+		obi_set_errno(OBI_MALLOC_ERROR);
+		obidebug(1, "\nError allocating memory for the array of blobs");
+		return -1;
+	}
+	for (i=0; i<seq_count; i++)
+	{
+		blob_array[i] = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0);
+	}
+
 	// Allocate arrays of pointers to binary trees
 	yes_trees = (void**) calloc(seq_count, sizeof(void*));
 	if (yes_trees == NULL)
@ -377,6 +409,7 @@ int obi_clean(const char* dms_name,

 	for (sample=0; sample < sample_count; sample++)
 	{
+		sample_count_array = complete_sample_count_array+(sample*seq_count);
 		for (i=0; i< (seq_count-1); i++)
 		{
 			if (i%1000 == 0)
@ -386,7 +419,8 @@ int obi_clean(const char* dms_name,
 			}

 			// Get first sequence
-			blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0);
+			blob1 = blob_array[i];
+//			blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0);  // slower
 			if (blob1 == NULL)
 			{
 				obidebug(1, "\nError retrieving sequences to align");
@ -394,12 +428,14 @@ int obi_clean(const char* dms_name,
 			}

 			// Get count for this sample
-			s1_count = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, i, sample);
+			s1_count = sample_count_array[i];
+			//s1_count = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, i, sample);   // slower

 			for (j=i+1; j < seq_count; j++)  // TODO parallelize this loop?
 			{
 				// Get second sequence
-				blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, j, 0);
+				blob2 = blob_array[j];
+//				blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, j, 0);   // slower
 				if (blob2 == NULL)
 				{
 					obidebug(1, "\nError retrieving sequences to align");
@ -407,7 +443,8 @@ int obi_clean(const char* dms_name,
 				}

 				// Get count for this sample
-				s2_count = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, j, sample);
+				s2_count = sample_count_array[j];
+				//s2_count = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, j, sample);    // slower

 				// Checking ratio
 				if (((s1_count!=OBIInt_NA && s2_count!=OBIInt_NA) && (s1_count>0 && s2_count>0)) &&
@ -453,6 +490,7 @@ int obi_clean(const char* dms_name,
 							}
 						}

+						// Might be worth having arrays to read values too for some datasets but unlikely
 						// label as head or internal
 						if (s1_count >= s2_count)
 						{
@ -493,6 +531,8 @@ int obi_clean(const char* dms_name,

 	free_kmer_tables(ktable, seq_count);
 	free(index_array);
+	free(complete_sample_count_array);
+	free(blob_array);
 	free(yes_trees);
 	free(no_trees);