Fixed a bug in kmer similarity computation where the fact that sequences
could be switched was not accounted for
This commit is contained in:
@ -72,7 +72,9 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
||||
bool build_consensus)
|
||||
{
|
||||
Obi_ali_p ali = NULL;
|
||||
int i, j;
|
||||
int i, j;
|
||||
bool switched_seqs;
|
||||
bool left_ali;
|
||||
OBIDMS_column_p qual_col1;
|
||||
OBIDMS_column_p qual_col2;
|
||||
double score = 0.0;
|
||||
@ -99,6 +101,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
||||
int32_t kmer_count;
|
||||
int32_t best_shift_idx;
|
||||
int32_t best_shift;
|
||||
int32_t abs_shift;
|
||||
int32_t max_common_kmers;
|
||||
int32_t overlap_len;
|
||||
int32_t consensus_len;
|
||||
@ -137,10 +140,12 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
||||
}
|
||||
|
||||
// Choose the shortest sequence to save kmer positions in array
|
||||
switched_seqs = false;
|
||||
len1 = blob1->length_decoded_value;
|
||||
len2 = blob2->length_decoded_value;
|
||||
if (len2 > len1)
|
||||
{
|
||||
switched_seqs = true;
|
||||
temp_len = len1;
|
||||
len1 = len2;
|
||||
len2 = temp_len;
|
||||
@ -355,13 +360,25 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
||||
return NULL;
|
||||
}
|
||||
|
||||
abs_shift = abs(best_shift);
|
||||
|
||||
// Save result in Obi_ali structure
|
||||
ali->score = score;
|
||||
ali->consensus_length = 0;
|
||||
ali->overlap_length = overlap_len;
|
||||
ali->shift = best_shift;
|
||||
ali->shift = abs_shift;
|
||||
ali->consensus_seq = NULL;
|
||||
ali->consensus_qual = NULL;
|
||||
if (((best_shift < 0) && (!switched_seqs)) || ((best_shift > 0) && switched_seqs))
|
||||
{
|
||||
left_ali = true;
|
||||
strcpy(ali->direction, "left");
|
||||
}
|
||||
else
|
||||
{
|
||||
left_ali = false;
|
||||
strcpy(ali->direction, "right");
|
||||
}
|
||||
|
||||
// Build the consensus sequence if asked
|
||||
if (build_consensus)
|
||||
@ -400,7 +417,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
||||
if (seq1 == NULL)
|
||||
seq1 = obi_blob_to_seq(blob1);
|
||||
|
||||
if (best_shift > 0)
|
||||
if (left_ali)
|
||||
consensus_len = len1 + len2 - overlap_len;
|
||||
else
|
||||
consensus_len = overlap_len;
|
||||
@ -427,26 +444,33 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
||||
ali->consensus_seq = consensus_seq;
|
||||
ali->consensus_qual = consensus_qual;
|
||||
|
||||
// Compute consensus-relative shift for each sequence and store direction
|
||||
if (best_shift >= 0)
|
||||
// Compute consensus-relative shift for each sequence
|
||||
if (best_shift > 0)
|
||||
{
|
||||
shift1 = 0;
|
||||
shift2 = best_shift;
|
||||
strcpy(ali->direction, "left");
|
||||
shift1 = best_shift;
|
||||
shift2 = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
shift1 = -(best_shift);
|
||||
shift2 = 0;
|
||||
strcpy(ali->direction, "right");
|
||||
shift1 = 0;
|
||||
shift2 = -(best_shift);
|
||||
}
|
||||
|
||||
// If positive shift, copy first part of second sequence
|
||||
if (best_shift > 0)
|
||||
// If left alignment, copy first part of first sequence
|
||||
if (left_ali)
|
||||
{
|
||||
strncpy(consensus_seq, seq2, best_shift);
|
||||
memcpy(consensus_qual, qual2, best_shift*sizeof(uint8_t));
|
||||
cons_shift = best_shift;
|
||||
if (switched_seqs)
|
||||
{
|
||||
strncpy(consensus_seq, seq2, abs_shift);
|
||||
memcpy(consensus_qual, qual2, abs_shift*sizeof(uint8_t));
|
||||
cons_shift = abs_shift;
|
||||
}
|
||||
else
|
||||
{
|
||||
strncpy(consensus_seq, seq1, abs_shift);
|
||||
memcpy(consensus_qual, qual1, abs_shift*sizeof(uint8_t));
|
||||
cons_shift = abs_shift;
|
||||
}
|
||||
}
|
||||
else
|
||||
cons_shift = 0;
|
||||
@ -461,11 +485,19 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
||||
consensus_qual[pos+cons_shift] = round((qual1[pos+shift1] + qual2[pos+shift2])/2); // TODO maybe use the (p1*(1-p2/3)) formula (but parenthesis bug???)
|
||||
}
|
||||
|
||||
// If positive shift, copy last part of first sequence
|
||||
if (best_shift > 0)
|
||||
// If left alignment, copy last part of first sequence
|
||||
if (left_ali)
|
||||
{
|
||||
strncpy(consensus_seq+cons_shift+overlap_len, seq1+overlap_len, len1-overlap_len);
|
||||
memcpy(consensus_qual+cons_shift+overlap_len, qual1+overlap_len, (len1-overlap_len)*sizeof(uint8_t));
|
||||
if (switched_seqs)
|
||||
{
|
||||
strncpy(consensus_seq+cons_shift+overlap_len, seq1+overlap_len, len1-overlap_len);
|
||||
memcpy(consensus_qual+cons_shift+overlap_len, qual1+overlap_len, (len1-overlap_len)*sizeof(uint8_t));
|
||||
}
|
||||
else
|
||||
{
|
||||
strncpy(consensus_seq+cons_shift+overlap_len, seq2+overlap_len, len2-overlap_len);
|
||||
memcpy(consensus_qual+cons_shift+overlap_len, qual2+overlap_len, (len2-overlap_len)*sizeof(uint8_t));
|
||||
}
|
||||
}
|
||||
|
||||
consensus_seq[consensus_len] = '\0';
|
||||
|
Reference in New Issue
Block a user