alignpairedend: fixed bug that would cut out sequence ends when it
should not have
This commit is contained in:
@ -77,7 +77,6 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
|||||||
int32_t* shift_count_array;
|
int32_t* shift_count_array;
|
||||||
Obi_ali_p ali = NULL;
|
Obi_ali_p ali = NULL;
|
||||||
int i, j;
|
int i, j;
|
||||||
bool switched_seqs;
|
|
||||||
bool reversed;
|
bool reversed;
|
||||||
int score = 0;
|
int score = 0;
|
||||||
Obi_blob_p blob1 = NULL;
|
Obi_blob_p blob1 = NULL;
|
||||||
@ -124,6 +123,8 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
|||||||
bool keep_seq2_start;
|
bool keep_seq2_start;
|
||||||
bool keep_seq1_end;
|
bool keep_seq1_end;
|
||||||
bool keep_seq2_end;
|
bool keep_seq2_end;
|
||||||
|
bool left_ali;
|
||||||
|
bool rev_quals = false;
|
||||||
|
|
||||||
// Check kmer size
|
// Check kmer size
|
||||||
if ((kmer_size < 1) || (kmer_size > 4))
|
if ((kmer_size < 1) || (kmer_size > 4))
|
||||||
@ -148,19 +149,8 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Choose the shortest sequence to save kmer positions in array
|
// Choose the shortest sequence to save kmer positions in array
|
||||||
switched_seqs = false;
|
|
||||||
len1 = blob1->length_decoded_value;
|
len1 = blob1->length_decoded_value;
|
||||||
len2 = blob2->length_decoded_value;
|
len2 = blob2->length_decoded_value;
|
||||||
if (len2 < len1)
|
|
||||||
{
|
|
||||||
switched_seqs = true;
|
|
||||||
temp_len = len1;
|
|
||||||
len1 = len2;
|
|
||||||
len2 = temp_len;
|
|
||||||
temp_blob = blob1;
|
|
||||||
blob1 = blob2;
|
|
||||||
blob2 = temp_blob;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Force encoding on 2 bits by replacing ambiguous nucleotides by 'a's
|
// Force encoding on 2 bits by replacing ambiguous nucleotides by 'a's
|
||||||
if (blob1->element_size == 4)
|
if (blob1->element_size == 4)
|
||||||
@ -196,7 +186,47 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
|||||||
else
|
else
|
||||||
reversed = false;
|
reversed = false;
|
||||||
if (reversed)
|
if (reversed)
|
||||||
switched_seqs = !switched_seqs;
|
// unreverse to make cases simpler. Costly but rare (direct match is reverse primer match)
|
||||||
|
{
|
||||||
|
if (seq2 == NULL)
|
||||||
|
seq2 = obi_blob_to_seq(blob2);
|
||||||
|
seq2 = reverse_complement_sequence(seq2);
|
||||||
|
blob2 = obi_seq_to_blob(seq2);
|
||||||
|
|
||||||
|
if (seq1 == NULL)
|
||||||
|
seq1 = obi_blob_to_seq(blob1);
|
||||||
|
seq1 = reverse_complement_sequence(seq1);
|
||||||
|
blob1 = obi_seq_to_blob(seq1);
|
||||||
|
free_blob1 = true;
|
||||||
|
|
||||||
|
// Get the quality arrays
|
||||||
|
qual1 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view1, qual_col1, idx1, 0, &qual1_len);
|
||||||
|
if (qual1 == NULL)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError getting the quality of the 1st sequence when computing the kmer similarity between two sequences");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
qual2 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view2, qual_col2, idx2, 0, &qual2_len);
|
||||||
|
if (qual2 == NULL)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError getting the quality of the 2nd sequence when computing the kmer similarity between two sequences");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint8_t* newqual1 = malloc(qual1_len*sizeof(uint8_t));
|
||||||
|
uint8_t* newqual2 = malloc(qual2_len*sizeof(uint8_t));
|
||||||
|
|
||||||
|
for (i=0;i<qual1_len;i++)
|
||||||
|
newqual1[i] = qual1[qual1_len-1-i];
|
||||||
|
|
||||||
|
for (i=0;i<qual2_len;i++)
|
||||||
|
newqual2[i] = qual2[qual2_len-1-i];
|
||||||
|
|
||||||
|
qual1 = newqual1;
|
||||||
|
qual2 = newqual2;
|
||||||
|
|
||||||
|
rev_quals = true;
|
||||||
|
}
|
||||||
|
|
||||||
// Save total length for the shift counts array
|
// Save total length for the shift counts array
|
||||||
total_len = len1 + len2 + 1; // +1 for shift 0
|
total_len = len1 + len2 + 1; // +1 for shift 0
|
||||||
@ -237,7 +267,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (len1 >= shift_array_height)
|
else if (total_len >= shift_array_height)
|
||||||
{
|
{
|
||||||
shift_array_height = total_len;
|
shift_array_height = total_len;
|
||||||
*shift_array_p = (int32_t*) realloc(*shift_array_p, ARRAY_LENGTH * shift_array_height * sizeof(int32_t));
|
*shift_array_p = (int32_t*) realloc(*shift_array_p, ARRAY_LENGTH * shift_array_height * sizeof(int32_t));
|
||||||
@ -291,7 +321,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
|||||||
*shift_array_height_p = shift_array_height;
|
*shift_array_height_p = shift_array_height;
|
||||||
*shift_count_array_length_p = shift_count_array_length;
|
*shift_count_array_length_p = shift_count_array_length;
|
||||||
|
|
||||||
// Fill array with positions of kmers in the shortest sequence
|
// Fill array with positions of kmers in the first sequence
|
||||||
encoding = blob1->element_size;
|
encoding = blob1->element_size;
|
||||||
kmer_count = len1 - kmer_size + 1;
|
kmer_count = len1 - kmer_size + 1;
|
||||||
for (kmer_idx=0; kmer_idx < kmer_count; kmer_idx++)
|
for (kmer_idx=0; kmer_idx < kmer_count; kmer_idx++)
|
||||||
@ -310,7 +340,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
|||||||
kmer_pos_array[(kmer*kmer_pos_array_height)+i] = kmer_idx;
|
kmer_pos_array[(kmer*kmer_pos_array_height)+i] = kmer_idx;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compare positions of kmers between both sequences and store shifts
|
// Compare positions of kmers between both sequences and store shifts (a shift corresponds to pos2 - pos1)
|
||||||
kmer_count = blob2->length_decoded_value - kmer_size + 1;
|
kmer_count = blob2->length_decoded_value - kmer_size + 1;
|
||||||
for (kmer_idx=0; kmer_idx < kmer_count; kmer_idx++)
|
for (kmer_idx=0; kmer_idx < kmer_count; kmer_idx++)
|
||||||
{
|
{
|
||||||
@ -374,35 +404,42 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
|||||||
// The 873863 cases of hell
|
// The 873863 cases of hell
|
||||||
if (best_shift > 0)
|
if (best_shift > 0)
|
||||||
{
|
{
|
||||||
|
left_ali = false;
|
||||||
overlap_len = len2 - best_shift;
|
overlap_len = len2 - best_shift;
|
||||||
if (len1 <= overlap_len)
|
if (len1 <= overlap_len)
|
||||||
{
|
{
|
||||||
overlap_len = len1;
|
overlap_len = len1;
|
||||||
if (! switched_seqs)
|
|
||||||
keep_seq2_end = true;
|
keep_seq2_end = true;
|
||||||
else
|
|
||||||
keep_seq2_start = true;
|
|
||||||
}
|
|
||||||
else if (switched_seqs)
|
|
||||||
{
|
|
||||||
keep_seq2_start = true;
|
|
||||||
keep_seq1_end = true;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (best_shift < 0)
|
else if (best_shift < 0)
|
||||||
{
|
{
|
||||||
|
left_ali = true;
|
||||||
overlap_len = len1 + best_shift;
|
overlap_len = len1 + best_shift;
|
||||||
if (!switched_seqs)
|
if (len2 <= overlap_len)
|
||||||
|
{
|
||||||
|
overlap_len = len2;
|
||||||
|
keep_seq1_start = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
{
|
{
|
||||||
keep_seq1_start = true;
|
keep_seq1_start = true;
|
||||||
keep_seq2_end = true;
|
keep_seq2_end = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else // if (best_shift == 0)
|
||||||
|
{
|
||||||
|
if (len2 >= len1)
|
||||||
{
|
{
|
||||||
overlap_len = len1;
|
overlap_len = len1;
|
||||||
if ((!switched_seqs) && (len2 > len1))
|
|
||||||
keep_seq2_end = true;
|
keep_seq2_end = true;
|
||||||
|
left_ali = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
overlap_len = len2;
|
||||||
|
left_ali = false; // discussable
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ali = (Obi_ali_p) malloc(sizeof(Obi_ali_t));
|
ali = (Obi_ali_p) malloc(sizeof(Obi_ali_t));
|
||||||
@ -433,7 +470,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
|||||||
ali->direction[0] = '\0';
|
ali->direction[0] = '\0';
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (((best_shift <= 0) && (!switched_seqs)) || ((best_shift > 0) && switched_seqs))
|
if (left_ali)
|
||||||
strcpy(ali->direction, "left");
|
strcpy(ali->direction, "left");
|
||||||
else
|
else
|
||||||
strcpy(ali->direction, "right");
|
strcpy(ali->direction, "right");
|
||||||
@ -441,6 +478,8 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
|||||||
|
|
||||||
// Build the consensus sequence if asked
|
// Build the consensus sequence if asked
|
||||||
if (build_consensus)
|
if (build_consensus)
|
||||||
|
{
|
||||||
|
if (! rev_quals)
|
||||||
{
|
{
|
||||||
// Get the quality arrays
|
// Get the quality arrays
|
||||||
qual1 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view1, qual_col1, idx1, 0, &qual1_len);
|
qual1 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view1, qual_col1, idx1, 0, &qual1_len);
|
||||||
@ -455,15 +494,13 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
|||||||
obidebug(1, "\nError getting the quality of the 2nd sequence when computing the kmer similarity between two sequences");
|
obidebug(1, "\nError getting the quality of the 2nd sequence when computing the kmer similarity between two sequences");
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Decode the first sequence if not already done
|
// Decode the first sequence if not already done
|
||||||
if (seq1 == NULL)
|
if (seq1 == NULL)
|
||||||
seq1 = obi_blob_to_seq(blob1);
|
seq1 = obi_blob_to_seq(blob1);
|
||||||
|
|
||||||
if (! switched_seqs)
|
|
||||||
consensus_len = len2 - best_shift;
|
consensus_len = len2 - best_shift;
|
||||||
else
|
|
||||||
consensus_len = len1 + best_shift;
|
|
||||||
|
|
||||||
// Allocate memory for consensus sequence
|
// Allocate memory for consensus sequence
|
||||||
consensus_seq = (char*) malloc(consensus_len + 1 * sizeof(char)); // TODO keep malloced too maybe
|
consensus_seq = (char*) malloc(consensus_len + 1 * sizeof(char)); // TODO keep malloced too maybe
|
||||||
@ -557,6 +594,12 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
|||||||
free(seq2);
|
free(seq2);
|
||||||
free(blob2);
|
free(blob2);
|
||||||
|
|
||||||
|
if (rev_quals)
|
||||||
|
{
|
||||||
|
free(qual1);
|
||||||
|
free(qual2);
|
||||||
|
}
|
||||||
|
|
||||||
return ali;
|
return ali;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user