Fixed a bug in the kmer similarity computation: the fact that the two sequences could be switched was not accounted for.
This commit is contained in:
Celine Mercier
2019-02-10 21:02:24 +01:00
parent 08bcbcd357
commit 3015310535

View File

@ -72,7 +72,9 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
bool build_consensus) bool build_consensus)
{ {
Obi_ali_p ali = NULL; Obi_ali_p ali = NULL;
int i, j; int i, j;
bool switched_seqs;
bool left_ali;
OBIDMS_column_p qual_col1; OBIDMS_column_p qual_col1;
OBIDMS_column_p qual_col2; OBIDMS_column_p qual_col2;
double score = 0.0; double score = 0.0;
@ -99,6 +101,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
int32_t kmer_count; int32_t kmer_count;
int32_t best_shift_idx; int32_t best_shift_idx;
int32_t best_shift; int32_t best_shift;
int32_t abs_shift;
int32_t max_common_kmers; int32_t max_common_kmers;
int32_t overlap_len; int32_t overlap_len;
int32_t consensus_len; int32_t consensus_len;
@ -137,10 +140,12 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
} }
// Choose the shortest sequence to save kmer positions in array // Choose the shortest sequence to save kmer positions in array
switched_seqs = false;
len1 = blob1->length_decoded_value; len1 = blob1->length_decoded_value;
len2 = blob2->length_decoded_value; len2 = blob2->length_decoded_value;
if (len2 > len1) if (len2 > len1)
{ {
switched_seqs = true;
temp_len = len1; temp_len = len1;
len1 = len2; len1 = len2;
len2 = temp_len; len2 = temp_len;
@ -355,13 +360,25 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
return NULL; return NULL;
} }
abs_shift = abs(best_shift);
// Save result in Obi_ali structure // Save result in Obi_ali structure
ali->score = score; ali->score = score;
ali->consensus_length = 0; ali->consensus_length = 0;
ali->overlap_length = overlap_len; ali->overlap_length = overlap_len;
ali->shift = best_shift; ali->shift = abs_shift;
ali->consensus_seq = NULL; ali->consensus_seq = NULL;
ali->consensus_qual = NULL; ali->consensus_qual = NULL;
if (((best_shift < 0) && (!switched_seqs)) || ((best_shift > 0) && switched_seqs))
{
left_ali = true;
strcpy(ali->direction, "left");
}
else
{
left_ali = false;
strcpy(ali->direction, "right");
}
// Build the consensus sequence if asked // Build the consensus sequence if asked
if (build_consensus) if (build_consensus)
@ -400,7 +417,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
if (seq1 == NULL) if (seq1 == NULL)
seq1 = obi_blob_to_seq(blob1); seq1 = obi_blob_to_seq(blob1);
if (best_shift > 0) if (left_ali)
consensus_len = len1 + len2 - overlap_len; consensus_len = len1 + len2 - overlap_len;
else else
consensus_len = overlap_len; consensus_len = overlap_len;
@ -427,26 +444,33 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
ali->consensus_seq = consensus_seq; ali->consensus_seq = consensus_seq;
ali->consensus_qual = consensus_qual; ali->consensus_qual = consensus_qual;
// Compute consensus-relative shift for each sequence and store direction // Compute consensus-relative shift for each sequence
if (best_shift >= 0) if (best_shift > 0)
{ {
shift1 = 0; shift1 = best_shift;
shift2 = best_shift; shift2 = 0;
strcpy(ali->direction, "left");
} }
else else
{ {
shift1 = -(best_shift); shift1 = 0;
shift2 = 0; shift2 = -(best_shift);
strcpy(ali->direction, "right");
} }
// If positive shift, copy first part of second sequence // If left alignment, copy first part of first sequence
if (best_shift > 0) if (left_ali)
{ {
strncpy(consensus_seq, seq2, best_shift); if (switched_seqs)
memcpy(consensus_qual, qual2, best_shift*sizeof(uint8_t)); {
cons_shift = best_shift; strncpy(consensus_seq, seq2, abs_shift);
memcpy(consensus_qual, qual2, abs_shift*sizeof(uint8_t));
cons_shift = abs_shift;
}
else
{
strncpy(consensus_seq, seq1, abs_shift);
memcpy(consensus_qual, qual1, abs_shift*sizeof(uint8_t));
cons_shift = abs_shift;
}
} }
else else
cons_shift = 0; cons_shift = 0;
@ -461,11 +485,19 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
consensus_qual[pos+cons_shift] = round((qual1[pos+shift1] + qual2[pos+shift2])/2); // TODO maybe use the (p1*(1-p2/3)) formula (but parenthesis bug???) consensus_qual[pos+cons_shift] = round((qual1[pos+shift1] + qual2[pos+shift2])/2); // TODO maybe use the (p1*(1-p2/3)) formula (but parenthesis bug???)
} }
// If positive shift, copy last part of first sequence // If left alignment, copy last part of first sequence
if (best_shift > 0) if (left_ali)
{ {
strncpy(consensus_seq+cons_shift+overlap_len, seq1+overlap_len, len1-overlap_len); if (switched_seqs)
memcpy(consensus_qual+cons_shift+overlap_len, qual1+overlap_len, (len1-overlap_len)*sizeof(uint8_t)); {
strncpy(consensus_seq+cons_shift+overlap_len, seq1+overlap_len, len1-overlap_len);
memcpy(consensus_qual+cons_shift+overlap_len, qual1+overlap_len, (len1-overlap_len)*sizeof(uint8_t));
}
else
{
strncpy(consensus_seq+cons_shift+overlap_len, seq2+overlap_len, len2-overlap_len);
memcpy(consensus_qual+cons_shift+overlap_len, qual2+overlap_len, (len2-overlap_len)*sizeof(uint8_t));
}
} }
consensus_seq[consensus_len] = '\0'; consensus_seq[consensus_len] = '\0';