Fixed the new alignpaired end to work after ngsfilter with the 9879847 possible cases
2019-02-17 18:32:35 +01:00
parent 4ddd1a1c37
commit e026e9ec83
9 changed files with 151 additions and 83 deletions
--- a/src/kmer_similarity.c
+++ b/src/kmer_similarity.c
@ -64,19 +64,18 @@ void obi_free_shifted_ali(Obi_ali_p ali)


 Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1, index_t elt_idx1,
-							Obiview_p view2, OBIDMS_column_p column2, index_t idx2, index_t elt_idx2,
-							uint8_t kmer_size,
-							int32_t* kmer_pos_array, int32_t* kmer_pos_array_height_p,
-							int32_t* shift_array, int32_t* shift_array_height_p,
-							int32_t* shift_count_array, int32_t* shift_count_array_length_p,
-							bool build_consensus)
+						  Obiview_p view2, OBIDMS_column_p column2, index_t idx2, index_t elt_idx2,
+						  OBIDMS_column_p qual_col1, OBIDMS_column_p qual_col2,
+						  uint8_t kmer_size,
+						  int32_t* kmer_pos_array, int32_t* kmer_pos_array_height_p,
+						  int32_t* shift_array, int32_t* shift_array_height_p,
+						  int32_t* shift_count_array, int32_t* shift_count_array_length_p,
+						  bool build_consensus)
 {
 	Obi_ali_p       ali = NULL;
 	int 			i, j;
 	bool 			switched_seqs;
 	bool			left_ali;
-	OBIDMS_column_p qual_col1;
-	OBIDMS_column_p qual_col2;
 	double 			score = 0.0;
 	Obi_blob_p      blob1 = NULL;
 	Obi_blob_p   	blob2 = NULL;
@ -106,6 +105,8 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
 	int32_t         overlap_len;
 	int32_t         consensus_len;
 	int32_t  		cons_shift;
+	int32_t  		copy_start;
+	int32_t  		copy_len;
 	uint8_t   		kmer;
 	byte_t          nuc;
 	uint8_t         encoding;
@ -116,6 +117,10 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
 	int32_t         shift_array_height       = *shift_array_height_p;
 	int32_t         shift_count_array_length = *shift_count_array_length_p;
 	bool			free_blob1 = false;
+	bool			keep_seq1_start;
+	bool			keep_seq2_start;
+	bool			keep_seq1_end;
+	bool			keep_seq2_end;

 	// Check kmer size
 	if ((kmer_size < 1) || (kmer_size > 4))
@ -143,7 +148,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
 	switched_seqs = false;
 	len1 = blob1->length_decoded_value;
 	len2 = blob2->length_decoded_value;
-	if (len2 > len1)
+	if (len2 < len1)
 	{
 		switched_seqs = true;
 		temp_len = len1;
@ -334,7 +339,8 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
 	// Find the most represented shift
 	best_shift_idx = 0;
 	max_common_kmers = 0;
-	empty_part = (shift_count_array_length-1)/2 - len1;
+	//empty_part = (shift_count_array_length-1)/2 - len1; //TODO wrong in some cases (len1 shorter than overlap or something like that)
+	empty_part = 0;
 	for (i=empty_part; i < (shift_count_array_length - empty_part); i++)  // skipping empty parts of the array
 	{
 		if (shift_count_array[i] > max_common_kmers)
@ -345,10 +351,44 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
 	}
 	best_shift = best_shift_idx - len1;

+	keep_seq1_start = false;
+	keep_seq1_end = false;
+	keep_seq2_start = false;
+	keep_seq2_end = false;
+
+	// The 873863 cases of hell
 	if (best_shift > 0)
-	    overlap_len = len2 - best_shift;
-	else
+	{
+		overlap_len = len2 - best_shift;
+		if (len1 <= overlap_len)
+		{
+			overlap_len = len1;
+			if (! switched_seqs)
+				keep_seq2_end = true;
+			else
+				keep_seq2_start = true;
+		}
+		else if (switched_seqs)
+		{
+			keep_seq2_start = true;
+			keep_seq1_end = true;
+		}
+	}
+	else if (best_shift < 0)
+	{
 	    overlap_len = len1 + best_shift;
+	    if (!switched_seqs)
+	    {
+	    	keep_seq1_start = true;
+	    	keep_seq2_end = true;
+	    }
+	}
+	else
+	{
+		overlap_len = len1;
+		if ((!switched_seqs) && (len2 > len1))
+			keep_seq2_end = true;
+	}

 	score = max_common_kmers + kmer_size - 1;

@ -369,7 +409,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
 	ali->shift = abs_shift;
 	ali->consensus_seq = NULL;
 	ali->consensus_qual = NULL;
-	if (((best_shift < 0) && (!switched_seqs)) || ((best_shift > 0) && switched_seqs))
+	if (((best_shift <= 0) && (!switched_seqs)) || ((best_shift > 0) && switched_seqs))
 	{
 		left_ali = true;
 		strcpy(ali->direction, "left");
@ -383,33 +423,17 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
 	// Build the consensus sequence if asked
 	if (build_consensus)
 	{
-		// Get the quality columns
-		qual_col1 = obi_view_get_column(view1, QUALITY_COLUMN);
-		if (qual_col1 == NULL)
-		{
-			obi_set_errno(OBI_ALIGN_ERROR);
-			obidebug(1, "\nError when computing the kmer similarity between two sequences: can't open quality column to build the consensus sequence");
-			return NULL;
-		}
-		qual_col2 = obi_view_get_column(view2, QUALITY_COLUMN);
-		if (qual_col2 == NULL)
-		{
-			obi_set_errno(OBI_ALIGN_ERROR);
-			obidebug(1, "\nError when computing the kmer similarity between two sequences: can't open quality column to build the consensus sequence");
-			return NULL;
-		}
-
 		// Get the quality arrays
 		qual1 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view1, qual_col1, idx1, 0, &qual1_len);
 		if (qual1 == NULL)
 		{
-			obidebug(1, "\nError getting the blob of the 1st sequence when computing the kmer similarity between two sequences");
+			obidebug(1, "\nError getting the quality of the 1st sequence when computing the kmer similarity between two sequences");
 			return NULL;
 		}
 		qual2 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view2, qual_col2, idx2, 0, &qual2_len);
 		if (qual2 == NULL)
 		{
-			obidebug(1, "\nError getting the blob of the 1st sequence when computing the kmer similarity between two sequences");
+			obidebug(1, "\nError getting the quality of the 2nd sequence when computing the kmer similarity between two sequences");
 			return NULL;
 		}

@ -417,10 +441,10 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
 		if (seq1 == NULL)
 			seq1 = obi_blob_to_seq(blob1);

-		if (left_ali)
-		    consensus_len = len1 + len2 - overlap_len;
+		if (! switched_seqs)
+		    consensus_len = len2 - best_shift;
 		else
-		    consensus_len = overlap_len;
+		    consensus_len = len1 + best_shift;

 		// Allocate memory for consensus sequence
 		consensus_seq = (char*) malloc(consensus_len + 1 * sizeof(char)); // TODO keep malloced too maybe
@ -447,30 +471,27 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
 		// Compute consensus-relative shift for each sequence
 		if (best_shift > 0)
 		{
-			shift1 = best_shift;
-			shift2 = 0;
+			shift1 = 0;
+			shift2 = best_shift;
 		}
 		else
 		{
-			shift1 = 0;
-			shift2 = -(best_shift);
+			shift1 = -(best_shift);
+			shift2 = 0;
 		}

-		// If left alignment, copy first part of first sequence
-		if (left_ali)
+		// Copy first part of first or second sequence depending on cases
+		if (keep_seq1_start)
 		{
-			if (switched_seqs)
-			{
-				strncpy(consensus_seq, seq2, abs_shift);
-				memcpy(consensus_qual, qual2, abs_shift*sizeof(uint8_t));
-				cons_shift = abs_shift;
-			}
-			else
-			{
-				strncpy(consensus_seq, seq1, abs_shift);
-				memcpy(consensus_qual, qual1, abs_shift*sizeof(uint8_t));
-				cons_shift = abs_shift;
-			}
+			strncpy(consensus_seq, seq1, abs_shift);
+			memcpy(consensus_qual, qual1, abs_shift*sizeof(uint8_t));
+			cons_shift = abs_shift;
+		}
+		else if (keep_seq2_start)
+		{
+			strncpy(consensus_seq, seq2, abs_shift);
+			memcpy(consensus_qual, qual2, abs_shift*sizeof(uint8_t));
+			cons_shift = abs_shift;
 		}
 		else
 			cons_shift = 0;
@ -485,19 +506,26 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
 			consensus_qual[pos+cons_shift] = round((qual1[pos+shift1] + qual2[pos+shift2])/2);    // TODO maybe use the (p1*(1-p2/3)) formula (but parenthesis bug???)
 		}

-		// If left alignment, copy last part of first sequence
-		if (left_ali)
+		// Copy last part of first or second sequence depending on cases
+		if (keep_seq1_end)
 		{
-			if (switched_seqs)
+			strncpy(consensus_seq+cons_shift+overlap_len, seq1+overlap_len, len1 - overlap_len);
+			memcpy(consensus_qual+cons_shift+overlap_len, qual1+overlap_len, (len1 - overlap_len)*sizeof(uint8_t));
+		}
+		if (keep_seq2_end)
+		{
+			if (best_shift <= 0)
 			{
-				strncpy(consensus_seq+cons_shift+overlap_len, seq1+overlap_len, len1-overlap_len);
-				memcpy(consensus_qual+cons_shift+overlap_len, qual1+overlap_len, (len1-overlap_len)*sizeof(uint8_t));
+				copy_start = overlap_len;
+				copy_len = len2 - overlap_len;
 			}
-			else
+			if (best_shift > 0)
 			{
-				strncpy(consensus_seq+cons_shift+overlap_len, seq2+overlap_len, len2-overlap_len);
-				memcpy(consensus_qual+cons_shift+overlap_len, qual2+overlap_len, (len2-overlap_len)*sizeof(uint8_t));
+				copy_start = overlap_len + best_shift;
+				copy_len = len2 - overlap_len - best_shift;
 			}
+			strncpy(consensus_seq+cons_shift+overlap_len, seq2+copy_start, copy_len);
+			memcpy(consensus_qual+cons_shift+overlap_len, qual2+copy_start, copy_len*sizeof(uint8_t));
 		}

 		consensus_seq[consensus_len] = '\0';