Fixed a bug in kmer similarity computation where the fact that sequences

could be switched was not accounted for
This commit is contained in:
Celine Mercier
2019-02-10 21:02:24 +01:00
parent 08bcbcd357
commit 3015310535

View File

@ -73,6 +73,8 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
{
Obi_ali_p ali = NULL;
int i, j;
bool switched_seqs;
bool left_ali;
OBIDMS_column_p qual_col1;
OBIDMS_column_p qual_col2;
double score = 0.0;
@ -99,6 +101,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
int32_t kmer_count;
int32_t best_shift_idx;
int32_t best_shift;
int32_t abs_shift;
int32_t max_common_kmers;
int32_t overlap_len;
int32_t consensus_len;
@ -137,10 +140,12 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
}
// Choose the shortest sequence to save kmer positions in array
switched_seqs = false;
len1 = blob1->length_decoded_value;
len2 = blob2->length_decoded_value;
if (len2 > len1)
{
switched_seqs = true;
temp_len = len1;
len1 = len2;
len2 = temp_len;
@ -355,13 +360,25 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
return NULL;
}
abs_shift = abs(best_shift);
// Save result in Obi_ali structure
ali->score = score;
ali->consensus_length = 0;
ali->overlap_length = overlap_len;
ali->shift = best_shift;
ali->shift = abs_shift;
ali->consensus_seq = NULL;
ali->consensus_qual = NULL;
if (((best_shift < 0) && (!switched_seqs)) || ((best_shift > 0) && switched_seqs))
{
left_ali = true;
strcpy(ali->direction, "left");
}
else
{
left_ali = false;
strcpy(ali->direction, "right");
}
// Build the consensus sequence if asked
if (build_consensus)
@ -400,7 +417,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
if (seq1 == NULL)
seq1 = obi_blob_to_seq(blob1);
if (best_shift > 0)
if (left_ali)
consensus_len = len1 + len2 - overlap_len;
else
consensus_len = overlap_len;
@ -427,26 +444,33 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
ali->consensus_seq = consensus_seq;
ali->consensus_qual = consensus_qual;
// Compute consensus-relative shift for each sequence and store direction
if (best_shift >= 0)
// Compute consensus-relative shift for each sequence
if (best_shift > 0)
{
shift1 = 0;
shift2 = best_shift;
strcpy(ali->direction, "left");
shift1 = best_shift;
shift2 = 0;
}
else
{
shift1 = -(best_shift);
shift2 = 0;
strcpy(ali->direction, "right");
shift1 = 0;
shift2 = -(best_shift);
}
// If positive shift, copy first part of second sequence
if (best_shift > 0)
// If left alignment, copy first part of first sequence
if (left_ali)
{
strncpy(consensus_seq, seq2, best_shift);
memcpy(consensus_qual, qual2, best_shift*sizeof(uint8_t));
cons_shift = best_shift;
if (switched_seqs)
{
strncpy(consensus_seq, seq2, abs_shift);
memcpy(consensus_qual, qual2, abs_shift*sizeof(uint8_t));
cons_shift = abs_shift;
}
else
{
strncpy(consensus_seq, seq1, abs_shift);
memcpy(consensus_qual, qual1, abs_shift*sizeof(uint8_t));
cons_shift = abs_shift;
}
}
else
cons_shift = 0;
@ -461,12 +485,20 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
consensus_qual[pos+cons_shift] = round((qual1[pos+shift1] + qual2[pos+shift2])/2); // TODO maybe use the (p1*(1-p2/3)) formula (but parenthesis bug???)
}
// If positive shift, copy last part of first sequence
if (best_shift > 0)
// If left alignment, copy last part of first sequence
if (left_ali)
{
if (switched_seqs)
{
strncpy(consensus_seq+cons_shift+overlap_len, seq1+overlap_len, len1-overlap_len);
memcpy(consensus_qual+cons_shift+overlap_len, qual1+overlap_len, (len1-overlap_len)*sizeof(uint8_t));
}
else
{
strncpy(consensus_seq+cons_shift+overlap_len, seq2+overlap_len, len2-overlap_len);
memcpy(consensus_qual+cons_shift+overlap_len, qual2+overlap_len, (len2-overlap_len)*sizeof(uint8_t));
}
}
consensus_seq[consensus_len] = '\0';
}