alignpairedend: fixed bug that would cut out sequence ends when it

should not have
This commit is contained in:
MercierC
2021-04-07 10:31:12 +12:00
parent e099a16624
commit 238e9f70f3

View File

@ -77,7 +77,6 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
int32_t* shift_count_array; int32_t* shift_count_array;
Obi_ali_p ali = NULL; Obi_ali_p ali = NULL;
int i, j; int i, j;
bool switched_seqs;
bool reversed; bool reversed;
int score = 0; int score = 0;
Obi_blob_p blob1 = NULL; Obi_blob_p blob1 = NULL;
@ -124,6 +123,8 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
bool keep_seq2_start; bool keep_seq2_start;
bool keep_seq1_end; bool keep_seq1_end;
bool keep_seq2_end; bool keep_seq2_end;
bool left_ali;
bool rev_quals = false;
// Check kmer size // Check kmer size
if ((kmer_size < 1) || (kmer_size > 4)) if ((kmer_size < 1) || (kmer_size > 4))
@ -148,19 +149,8 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
} }
// Choose the shortest sequence to save kmer positions in array // Choose the shortest sequence to save kmer positions in array
switched_seqs = false;
len1 = blob1->length_decoded_value; len1 = blob1->length_decoded_value;
len2 = blob2->length_decoded_value; len2 = blob2->length_decoded_value;
if (len2 < len1)
{
switched_seqs = true;
temp_len = len1;
len1 = len2;
len2 = temp_len;
temp_blob = blob1;
blob1 = blob2;
blob2 = temp_blob;
}
// Force encoding on 2 bits by replacing ambiguous nucleotides by 'a's // Force encoding on 2 bits by replacing ambiguous nucleotides by 'a's
if (blob1->element_size == 4) if (blob1->element_size == 4)
@ -196,7 +186,47 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
else else
reversed = false; reversed = false;
if (reversed) if (reversed)
switched_seqs = !switched_seqs; // unreverse to make cases simpler. Costly but rare (direct match is reverse primer match)
{
if (seq2 == NULL)
seq2 = obi_blob_to_seq(blob2);
seq2 = reverse_complement_sequence(seq2);
blob2 = obi_seq_to_blob(seq2);
if (seq1 == NULL)
seq1 = obi_blob_to_seq(blob1);
seq1 = reverse_complement_sequence(seq1);
blob1 = obi_seq_to_blob(seq1);
free_blob1 = true;
// Get the quality arrays
qual1 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view1, qual_col1, idx1, 0, &qual1_len);
if (qual1 == NULL)
{
obidebug(1, "\nError getting the quality of the 1st sequence when computing the kmer similarity between two sequences");
return NULL;
}
qual2 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view2, qual_col2, idx2, 0, &qual2_len);
if (qual2 == NULL)
{
obidebug(1, "\nError getting the quality of the 2nd sequence when computing the kmer similarity between two sequences");
return NULL;
}
uint8_t* newqual1 = malloc(qual1_len*sizeof(uint8_t));
uint8_t* newqual2 = malloc(qual2_len*sizeof(uint8_t));
for (i=0;i<qual1_len;i++)
newqual1[i] = qual1[qual1_len-1-i];
for (i=0;i<qual2_len;i++)
newqual2[i] = qual2[qual2_len-1-i];
qual1 = newqual1;
qual2 = newqual2;
rev_quals = true;
}
// Save total length for the shift counts array // Save total length for the shift counts array
total_len = len1 + len2 + 1; // +1 for shift 0 total_len = len1 + len2 + 1; // +1 for shift 0
@ -237,7 +267,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
return NULL; return NULL;
} }
} }
else if (len1 >= shift_array_height) else if (total_len >= shift_array_height)
{ {
shift_array_height = total_len; shift_array_height = total_len;
*shift_array_p = (int32_t*) realloc(*shift_array_p, ARRAY_LENGTH * shift_array_height * sizeof(int32_t)); *shift_array_p = (int32_t*) realloc(*shift_array_p, ARRAY_LENGTH * shift_array_height * sizeof(int32_t));
@ -291,7 +321,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
*shift_array_height_p = shift_array_height; *shift_array_height_p = shift_array_height;
*shift_count_array_length_p = shift_count_array_length; *shift_count_array_length_p = shift_count_array_length;
// Fill array with positions of kmers in the shortest sequence // Fill array with positions of kmers in the first sequence
encoding = blob1->element_size; encoding = blob1->element_size;
kmer_count = len1 - kmer_size + 1; kmer_count = len1 - kmer_size + 1;
for (kmer_idx=0; kmer_idx < kmer_count; kmer_idx++) for (kmer_idx=0; kmer_idx < kmer_count; kmer_idx++)
@ -310,7 +340,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
kmer_pos_array[(kmer*kmer_pos_array_height)+i] = kmer_idx; kmer_pos_array[(kmer*kmer_pos_array_height)+i] = kmer_idx;
} }
// Compare positions of kmers between both sequences and store shifts // Compare positions of kmers between both sequences and store shifts (a shift corresponds to pos2 - pos1)
kmer_count = blob2->length_decoded_value - kmer_size + 1; kmer_count = blob2->length_decoded_value - kmer_size + 1;
for (kmer_idx=0; kmer_idx < kmer_count; kmer_idx++) for (kmer_idx=0; kmer_idx < kmer_count; kmer_idx++)
{ {
@ -374,35 +404,42 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
// The 873863 cases of hell // The 873863 cases of hell
if (best_shift > 0) if (best_shift > 0)
{ {
left_ali = false;
overlap_len = len2 - best_shift; overlap_len = len2 - best_shift;
if (len1 <= overlap_len) if (len1 <= overlap_len)
{ {
overlap_len = len1; overlap_len = len1;
if (! switched_seqs)
keep_seq2_end = true; keep_seq2_end = true;
else
keep_seq2_start = true;
}
else if (switched_seqs)
{
keep_seq2_start = true;
keep_seq1_end = true;
} }
} }
else if (best_shift < 0) else if (best_shift < 0)
{ {
left_ali = true;
overlap_len = len1 + best_shift; overlap_len = len1 + best_shift;
if (!switched_seqs) if (len2 <= overlap_len)
{
overlap_len = len2;
keep_seq1_start = true;
}
else
{ {
keep_seq1_start = true; keep_seq1_start = true;
keep_seq2_end = true; keep_seq2_end = true;
} }
} }
else else // if (best_shift == 0)
{
if (len2 >= len1)
{ {
overlap_len = len1; overlap_len = len1;
if ((!switched_seqs) && (len2 > len1))
keep_seq2_end = true; keep_seq2_end = true;
left_ali = true;
}
else
{
overlap_len = len2;
left_ali = false; // discussable
}
} }
ali = (Obi_ali_p) malloc(sizeof(Obi_ali_t)); ali = (Obi_ali_p) malloc(sizeof(Obi_ali_t));
@ -433,7 +470,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
ali->direction[0] = '\0'; ali->direction[0] = '\0';
else else
{ {
if (((best_shift <= 0) && (!switched_seqs)) || ((best_shift > 0) && switched_seqs)) if (left_ali)
strcpy(ali->direction, "left"); strcpy(ali->direction, "left");
else else
strcpy(ali->direction, "right"); strcpy(ali->direction, "right");
@ -441,6 +478,8 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
// Build the consensus sequence if asked // Build the consensus sequence if asked
if (build_consensus) if (build_consensus)
{
if (! rev_quals)
{ {
// Get the quality arrays // Get the quality arrays
qual1 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view1, qual_col1, idx1, 0, &qual1_len); qual1 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view1, qual_col1, idx1, 0, &qual1_len);
@ -455,15 +494,13 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
obidebug(1, "\nError getting the quality of the 2nd sequence when computing the kmer similarity between two sequences"); obidebug(1, "\nError getting the quality of the 2nd sequence when computing the kmer similarity between two sequences");
return NULL; return NULL;
} }
}
// Decode the first sequence if not already done // Decode the first sequence if not already done
if (seq1 == NULL) if (seq1 == NULL)
seq1 = obi_blob_to_seq(blob1); seq1 = obi_blob_to_seq(blob1);
if (! switched_seqs)
consensus_len = len2 - best_shift; consensus_len = len2 - best_shift;
else
consensus_len = len1 + best_shift;
// Allocate memory for consensus sequence // Allocate memory for consensus sequence
consensus_seq = (char*) malloc(consensus_len + 1 * sizeof(char)); // TODO keep malloced too maybe consensus_seq = (char*) malloc(consensus_len + 1 * sizeof(char)); // TODO keep malloced too maybe
@ -557,6 +594,12 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
free(seq2); free(seq2);
free(blob2); free(blob2);
if (rev_quals)
{
free(qual1);
free(qual2);
}
return ali; return ali;
} }