From 238e9f70f3c6c1dca936129550bc95bfc2a954fa Mon Sep 17 00:00:00 2001 From: MercierC Date: Wed, 7 Apr 2021 10:31:12 +1200 Subject: [PATCH] alignpairedend: fixed bug that would cut out sequence ends when it should not have --- src/kmer_similarity.c | 145 +++++++++++++++++++++++++++--------------- 1 file changed, 94 insertions(+), 51 deletions(-) diff --git a/src/kmer_similarity.c b/src/kmer_similarity.c index b9a9950..407eefa 100755 --- a/src/kmer_similarity.c +++ b/src/kmer_similarity.c @@ -77,7 +77,6 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 int32_t* shift_count_array; Obi_ali_p ali = NULL; int i, j; - bool switched_seqs; bool reversed; int score = 0; Obi_blob_p blob1 = NULL; @@ -124,6 +123,8 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 bool keep_seq2_start; bool keep_seq1_end; bool keep_seq2_end; + bool left_ali; + bool rev_quals = false; // Check kmer size if ((kmer_size < 1) || (kmer_size > 4)) @@ -148,19 +149,8 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 } // Choose the shortest sequence to save kmer positions in array - switched_seqs = false; len1 = blob1->length_decoded_value; len2 = blob2->length_decoded_value; - if (len2 < len1) - { - switched_seqs = true; - temp_len = len1; - len1 = len2; - len2 = temp_len; - temp_blob = blob1; - blob1 = blob2; - blob2 = temp_blob; - } // Force encoding on 2 bits by replacing ambiguous nucleotides by 'a's if (blob1->element_size == 4) @@ -196,7 +186,47 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 else reversed = false; if (reversed) - switched_seqs = !switched_seqs; + // unreverse to make cases simpler. Costly but rare (direct match is reverse primer match) + { + if (seq2 == NULL) + seq2 = obi_blob_to_seq(blob2); + seq2 = reverse_complement_sequence(seq2); + blob2 = obi_seq_to_blob(seq2); + + if (seq1 == NULL) + seq1 = obi_blob_to_seq(blob1); + seq1 = reverse_complement_sequence(seq1); + blob1 = obi_seq_to_blob(seq1); + free_blob1 = true; + + // Get the quality arrays + qual1 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view1, qual_col1, idx1, 0, &qual1_len); + if (qual1 == NULL) + { + obidebug(1, "\nError getting the quality of the 1st sequence when computing the kmer similarity between two sequences"); + return NULL; + } + qual2 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view2, qual_col2, idx2, 0, &qual2_len); + if (qual2 == NULL) + { + obidebug(1, "\nError getting the quality of the 2nd sequence when computing the kmer similarity between two sequences"); + return NULL; + } + + uint8_t* newqual1 = malloc(qual1_len*sizeof(uint8_t)); + uint8_t* newqual2 = malloc(qual2_len*sizeof(uint8_t)); + + for (i=0;i= shift_array_height) + else if (total_len >= shift_array_height) { shift_array_height = total_len; *shift_array_p = (int32_t*) realloc(*shift_array_p, ARRAY_LENGTH * shift_array_height * sizeof(int32_t)); @@ -291,7 +321,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 *shift_array_height_p = shift_array_height; *shift_count_array_length_p = shift_count_array_length; - // Fill array with positions of kmers in the shortest sequence + // Fill array with positions of kmers in the first sequence encoding = blob1->element_size; kmer_count = len1 - kmer_size + 1; for (kmer_idx=0; kmer_idx < kmer_count; kmer_idx++) @@ -310,7 +340,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 kmer_pos_array[(kmer*kmer_pos_array_height)+i] = kmer_idx; } - // Compare positions of kmers between both sequences and store shifts + // Compare positions of kmers between both sequences and store shifts (a shift corresponds to pos2 - pos1) kmer_count = blob2->length_decoded_value - kmer_size + 1; for (kmer_idx=0; kmer_idx < kmer_count; kmer_idx++) { @@ -374,35 +404,42 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 // The 873863 cases of hell if (best_shift > 0) { + left_ali = false; overlap_len = len2 - best_shift; if (len1 <= overlap_len) { overlap_len = len1; - if (! switched_seqs) - keep_seq2_end = true; - else - keep_seq2_start = true; - } - else if (switched_seqs) - { - keep_seq2_start = true; - keep_seq1_end = true; + keep_seq2_end = true; } } else if (best_shift < 0) { + left_ali = true; overlap_len = len1 + best_shift; - if (!switched_seqs) - { - keep_seq1_start = true; - keep_seq2_end = true; - } - } - else - { - overlap_len = len1; - if ((!switched_seqs) && (len2 > len1)) + if (len2 <= overlap_len) + { + overlap_len = len2; + keep_seq1_start = true; + } + else + { + keep_seq1_start = true; keep_seq2_end = true; + } + } + else // if (best_shift == 0) + { + if (len2 >= len1) + { + overlap_len = len1; + keep_seq2_end = true; + left_ali = true; + } + else + { + overlap_len = len2; + left_ali = false; // discussable + } } ali = (Obi_ali_p) malloc(sizeof(Obi_ali_t)); @@ -433,7 +470,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 ali->direction[0] = '\0'; else { - if (((best_shift <= 0) && (!switched_seqs)) || ((best_shift > 0) && switched_seqs)) + if (left_ali) strcpy(ali->direction, "left"); else strcpy(ali->direction, "right"); @@ -442,28 +479,28 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 // Build the consensus sequence if asked if (build_consensus) { - // Get the quality arrays - qual1 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view1, qual_col1, idx1, 0, &qual1_len); - if (qual1 == NULL) + if (! rev_quals) { - obidebug(1, "\nError getting the quality of the 1st sequence when computing the kmer similarity between two sequences"); - return NULL; - } - qual2 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view2, qual_col2, idx2, 0, &qual2_len); - if (qual2 == NULL) - { - obidebug(1, "\nError getting the quality of the 2nd sequence when computing the kmer similarity between two sequences"); - return NULL; + // Get the quality arrays + qual1 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view1, qual_col1, idx1, 0, &qual1_len); + if (qual1 == NULL) + { + obidebug(1, "\nError getting the quality of the 1st sequence when computing the kmer similarity between two sequences"); + return NULL; + } + qual2 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view2, qual_col2, idx2, 0, &qual2_len); + if (qual2 == NULL) + { + obidebug(1, "\nError getting the quality of the 2nd sequence when computing the kmer similarity between two sequences"); + return NULL; + } } // Decode the first sequence if not already done if (seq1 == NULL) seq1 = obi_blob_to_seq(blob1); - if (! switched_seqs) - consensus_len = len2 - best_shift; - else - consensus_len = len1 + best_shift; + consensus_len = len2 - best_shift; // Allocate memory for consensus sequence consensus_seq = (char*) malloc(consensus_len + 1 * sizeof(char)); // TODO keep malloced too maybe @@ -557,6 +594,12 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 free(seq2); free(blob2); + if (rev_quals) + { + free(qual1); + free(qual2); + } + return ali; }