From 3015310535d2f489ff42d417ecde4a6035310a25 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Sun, 10 Feb 2019 21:02:24 +0100 Subject: [PATCH] Fixed a bug in kmer similarity computation where the fact that sequences could be switched was not accounted for --- src/kmer_similarity.c | 72 +++++++++++++++++++++++++++++++------------ 1 file changed, 52 insertions(+), 20 deletions(-) diff --git a/src/kmer_similarity.c b/src/kmer_similarity.c index c2941bf..7024da9 100755 --- a/src/kmer_similarity.c +++ b/src/kmer_similarity.c @@ -72,7 +72,9 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 bool build_consensus) { Obi_ali_p ali = NULL; - int i, j; + int i, j; + bool switched_seqs; + bool left_ali; OBIDMS_column_p qual_col1; OBIDMS_column_p qual_col2; double score = 0.0; @@ -99,6 +101,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 int32_t kmer_count; int32_t best_shift_idx; int32_t best_shift; + int32_t abs_shift; int32_t max_common_kmers; int32_t overlap_len; int32_t consensus_len; @@ -137,10 +140,12 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 } // Choose the shortest sequence to save kmer positions in array + switched_seqs = false; len1 = blob1->length_decoded_value; len2 = blob2->length_decoded_value; if (len2 > len1) { + switched_seqs = true; temp_len = len1; len1 = len2; len2 = temp_len; @@ -355,13 +360,25 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 return NULL; } + abs_shift = abs(best_shift); + // Save result in Obi_ali structure ali->score = score; ali->consensus_length = 0; ali->overlap_length = overlap_len; - ali->shift = best_shift; + ali->shift = abs_shift; ali->consensus_seq = NULL; ali->consensus_qual = NULL; + if (((best_shift < 0) && (!switched_seqs)) || ((best_shift > 0) && switched_seqs)) + { + left_ali = true; + strcpy(ali->direction, "left"); + } + else + { + left_ali = false; + strcpy(ali->direction, "right"); + } // Build the consensus sequence if asked if (build_consensus) @@ -400,7 +417,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 if (seq1 == NULL) seq1 = obi_blob_to_seq(blob1); - if (best_shift > 0) + if (left_ali) consensus_len = len1 + len2 - overlap_len; else consensus_len = overlap_len; @@ -427,26 +444,33 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 ali->consensus_seq = consensus_seq; ali->consensus_qual = consensus_qual; - // Compute consensus-relative shift for each sequence and store direction - if (best_shift >= 0) + // Compute consensus-relative shift for each sequence + if (best_shift > 0) { - shift1 = 0; - shift2 = best_shift; - strcpy(ali->direction, "left"); + shift1 = best_shift; + shift2 = 0; } else { - shift1 = -(best_shift); - shift2 = 0; - strcpy(ali->direction, "right"); + shift1 = 0; + shift2 = -(best_shift); } - // If positive shift, copy first part of second sequence - if (best_shift > 0) + // If left alignment, copy first part of first sequence + if (left_ali) { - strncpy(consensus_seq, seq2, best_shift); - memcpy(consensus_qual, qual2, best_shift*sizeof(uint8_t)); - cons_shift = best_shift; + if (switched_seqs) + { + strncpy(consensus_seq, seq2, abs_shift); + memcpy(consensus_qual, qual2, abs_shift*sizeof(uint8_t)); + cons_shift = abs_shift; + } + else + { + strncpy(consensus_seq, seq1, abs_shift); + memcpy(consensus_qual, qual1, abs_shift*sizeof(uint8_t)); + cons_shift = abs_shift; + } } else cons_shift = 0; @@ -461,11 +485,19 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 consensus_qual[pos+cons_shift] = round((qual1[pos+shift1] + qual2[pos+shift2])/2); // TODO maybe use the (p1*(1-p2/3)) formula (but parenthesis bug???) } - // If positive shift, copy last part of first sequence - if (best_shift > 0) + // If left alignment, copy last part of first sequence + if (left_ali) { - strncpy(consensus_seq+cons_shift+overlap_len, seq1+overlap_len, len1-overlap_len); - memcpy(consensus_qual+cons_shift+overlap_len, qual1+overlap_len, (len1-overlap_len)*sizeof(uint8_t)); + if (switched_seqs) + { + strncpy(consensus_seq+cons_shift+overlap_len, seq1+overlap_len, len1-overlap_len); + memcpy(consensus_qual+cons_shift+overlap_len, qual1+overlap_len, (len1-overlap_len)*sizeof(uint8_t)); + } + else + { + strncpy(consensus_seq+cons_shift+overlap_len, seq2+overlap_len, len2-overlap_len); + memcpy(consensus_qual+cons_shift+overlap_len, qual2+overlap_len, (len2-overlap_len)*sizeof(uint8_t)); + } } consensus_seq[consensus_len] = '\0';