alignpairedend: fixed bug that would cut out sequence ends when it
should not have
This commit is contained in:
@ -77,7 +77,6 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
||||
int32_t* shift_count_array;
|
||||
Obi_ali_p ali = NULL;
|
||||
int i, j;
|
||||
bool switched_seqs;
|
||||
bool reversed;
|
||||
int score = 0;
|
||||
Obi_blob_p blob1 = NULL;
|
||||
@ -124,6 +123,8 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
||||
bool keep_seq2_start;
|
||||
bool keep_seq1_end;
|
||||
bool keep_seq2_end;
|
||||
bool left_ali;
|
||||
bool rev_quals = false;
|
||||
|
||||
// Check kmer size
|
||||
if ((kmer_size < 1) || (kmer_size > 4))
|
||||
@ -148,19 +149,8 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
||||
}
|
||||
|
||||
// Choose the shortest sequence to save kmer positions in array
|
||||
switched_seqs = false;
|
||||
len1 = blob1->length_decoded_value;
|
||||
len2 = blob2->length_decoded_value;
|
||||
if (len2 < len1)
|
||||
{
|
||||
switched_seqs = true;
|
||||
temp_len = len1;
|
||||
len1 = len2;
|
||||
len2 = temp_len;
|
||||
temp_blob = blob1;
|
||||
blob1 = blob2;
|
||||
blob2 = temp_blob;
|
||||
}
|
||||
|
||||
// Force encoding on 2 bits by replacing ambiguous nucleotides by 'a's
|
||||
if (blob1->element_size == 4)
|
||||
@ -196,7 +186,47 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
||||
else
|
||||
reversed = false;
|
||||
if (reversed)
|
||||
switched_seqs = !switched_seqs;
|
||||
// unreverse to make cases simpler. Costly but rare (direct match is reverse primer match)
|
||||
{
|
||||
if (seq2 == NULL)
|
||||
seq2 = obi_blob_to_seq(blob2);
|
||||
seq2 = reverse_complement_sequence(seq2);
|
||||
blob2 = obi_seq_to_blob(seq2);
|
||||
|
||||
if (seq1 == NULL)
|
||||
seq1 = obi_blob_to_seq(blob1);
|
||||
seq1 = reverse_complement_sequence(seq1);
|
||||
blob1 = obi_seq_to_blob(seq1);
|
||||
free_blob1 = true;
|
||||
|
||||
// Get the quality arrays
|
||||
qual1 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view1, qual_col1, idx1, 0, &qual1_len);
|
||||
if (qual1 == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting the quality of the 1st sequence when computing the kmer similarity between two sequences");
|
||||
return NULL;
|
||||
}
|
||||
qual2 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view2, qual_col2, idx2, 0, &qual2_len);
|
||||
if (qual2 == NULL)
|
||||
{
|
||||
obidebug(1, "\nError getting the quality of the 2nd sequence when computing the kmer similarity between two sequences");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
uint8_t* newqual1 = malloc(qual1_len*sizeof(uint8_t));
|
||||
uint8_t* newqual2 = malloc(qual2_len*sizeof(uint8_t));
|
||||
|
||||
for (i=0;i<qual1_len;i++)
|
||||
newqual1[i] = qual1[qual1_len-1-i];
|
||||
|
||||
for (i=0;i<qual2_len;i++)
|
||||
newqual2[i] = qual2[qual2_len-1-i];
|
||||
|
||||
qual1 = newqual1;
|
||||
qual2 = newqual2;
|
||||
|
||||
rev_quals = true;
|
||||
}
|
||||
|
||||
// Save total length for the shift counts array
|
||||
total_len = len1 + len2 + 1; // +1 for shift 0
|
||||
@ -237,7 +267,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
else if (len1 >= shift_array_height)
|
||||
else if (total_len >= shift_array_height)
|
||||
{
|
||||
shift_array_height = total_len;
|
||||
*shift_array_p = (int32_t*) realloc(*shift_array_p, ARRAY_LENGTH * shift_array_height * sizeof(int32_t));
|
||||
@ -291,7 +321,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
||||
*shift_array_height_p = shift_array_height;
|
||||
*shift_count_array_length_p = shift_count_array_length;
|
||||
|
||||
// Fill array with positions of kmers in the shortest sequence
|
||||
// Fill array with positions of kmers in the first sequence
|
||||
encoding = blob1->element_size;
|
||||
kmer_count = len1 - kmer_size + 1;
|
||||
for (kmer_idx=0; kmer_idx < kmer_count; kmer_idx++)
|
||||
@ -310,7 +340,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
||||
kmer_pos_array[(kmer*kmer_pos_array_height)+i] = kmer_idx;
|
||||
}
|
||||
|
||||
// Compare positions of kmers between both sequences and store shifts
|
||||
// Compare positions of kmers between both sequences and store shifts (a shift corresponds to pos2 - pos1)
|
||||
kmer_count = blob2->length_decoded_value - kmer_size + 1;
|
||||
for (kmer_idx=0; kmer_idx < kmer_count; kmer_idx++)
|
||||
{
|
||||
@ -374,35 +404,42 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
||||
// The 873863 cases of hell
|
||||
if (best_shift > 0)
|
||||
{
|
||||
left_ali = false;
|
||||
overlap_len = len2 - best_shift;
|
||||
if (len1 <= overlap_len)
|
||||
{
|
||||
overlap_len = len1;
|
||||
if (! switched_seqs)
|
||||
keep_seq2_end = true;
|
||||
else
|
||||
keep_seq2_start = true;
|
||||
}
|
||||
else if (switched_seqs)
|
||||
{
|
||||
keep_seq2_start = true;
|
||||
keep_seq1_end = true;
|
||||
}
|
||||
}
|
||||
else if (best_shift < 0)
|
||||
{
|
||||
left_ali = true;
|
||||
overlap_len = len1 + best_shift;
|
||||
if (!switched_seqs)
|
||||
if (len2 <= overlap_len)
|
||||
{
|
||||
overlap_len = len2;
|
||||
keep_seq1_start = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
keep_seq1_start = true;
|
||||
keep_seq2_end = true;
|
||||
}
|
||||
}
|
||||
else
|
||||
else // if (best_shift == 0)
|
||||
{
|
||||
if (len2 >= len1)
|
||||
{
|
||||
overlap_len = len1;
|
||||
if ((!switched_seqs) && (len2 > len1))
|
||||
keep_seq2_end = true;
|
||||
left_ali = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
overlap_len = len2;
|
||||
left_ali = false; // discussable
|
||||
}
|
||||
}
|
||||
|
||||
ali = (Obi_ali_p) malloc(sizeof(Obi_ali_t));
|
||||
@ -433,7 +470,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
||||
ali->direction[0] = '\0';
|
||||
else
|
||||
{
|
||||
if (((best_shift <= 0) && (!switched_seqs)) || ((best_shift > 0) && switched_seqs))
|
||||
if (left_ali)
|
||||
strcpy(ali->direction, "left");
|
||||
else
|
||||
strcpy(ali->direction, "right");
|
||||
@ -441,6 +478,8 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
||||
|
||||
// Build the consensus sequence if asked
|
||||
if (build_consensus)
|
||||
{
|
||||
if (! rev_quals)
|
||||
{
|
||||
// Get the quality arrays
|
||||
qual1 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view1, qual_col1, idx1, 0, &qual1_len);
|
||||
@ -455,15 +494,13 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
||||
obidebug(1, "\nError getting the quality of the 2nd sequence when computing the kmer similarity between two sequences");
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
// Decode the first sequence if not already done
|
||||
if (seq1 == NULL)
|
||||
seq1 = obi_blob_to_seq(blob1);
|
||||
|
||||
if (! switched_seqs)
|
||||
consensus_len = len2 - best_shift;
|
||||
else
|
||||
consensus_len = len1 + best_shift;
|
||||
|
||||
// Allocate memory for consensus sequence
|
||||
consensus_seq = (char*) malloc(consensus_len + 1 * sizeof(char)); // TODO keep malloced too maybe
|
||||
@ -557,6 +594,12 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
||||
free(seq2);
|
||||
free(blob2);
|
||||
|
||||
if (rev_quals)
|
||||
{
|
||||
free(qual1);
|
||||
free(qual2);
|
||||
}
|
||||
|
||||
return ali;
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user