Alignpairedend: added alignment using shifting with best kmer similarity

(low level layer in C and Cython API)
2019-01-21 17:30:46 +01:00
parent 59017c0d6b
commit c7f5b8d980
10 changed files with 934 additions and 89 deletions
--- a/src/kmer_similarity.c
+++ b/src/kmer_similarity.c
@ -0,0 +1,483 @@
+/****************************************************************************
+ * Kmer similarity computation functions                                      *
+ ****************************************************************************/
+
+/**
+ * @file kmer_similarity.c
+ * @author Celine Mercier (celine.mercier@metabarcoding.org)
+ * @date January 7th 2019
+ * @brief Kmer similarity computation functions.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+#include "utils.h"
+#include "obidebug.h"
+#include "obierrno.h"
+#include "obitypes.h"
+
+#include "kmer_similarity.h"
+#include "obidmscolumn.h"
+#include "obiview.h"
+#include "encode.h"
+#include "dna_seq_indexer.h"
+
+
+#define DEBUG_LEVEL 0	// TODO has to be defined somewhere else (cython compil flag?)
+
+
+
+/**************************************************************************
+ *
+ * D E C L A R A T I O N   O F   T H E   P R I V A T E   F U N C T I O N S
+ *
+ **************************************************************************/
+
+
+
+
+/************************************************************************
+ *
+ * D E F I N I T I O N   O F   T H E   P R I V A T E   F U N C T I O N S
+ *
+ ************************************************************************/
+
+
+
+
+/**********************************************************************
+ *
+ * D E F I N I T I O N   O F   T H E   P U B L I C   F U N C T I O N S
+ *
+ **********************************************************************/
+
+
+void obi_free_shifted_ali(Obi_ali_p ali)
+{
+	free(ali->consensus_seq);
+	free(ali->consensus_qual);
+	free(ali);
+}
+
+
+Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1, index_t elt_idx1,
+							Obiview_p view2, OBIDMS_column_p column2, index_t idx2, index_t elt_idx2,
+							uint8_t kmer_size,
+							int32_t* kmer_pos_array, int32_t* kmer_pos_array_height_p,
+							int32_t* shift_array, int32_t* shift_array_height_p,
+							int32_t* shift_count_array, int32_t* shift_count_array_length_p,
+							bool build_consensus)
+{
+	Obi_ali_p       ali = NULL;
+	int i, j;
+	OBIDMS_column_p qual_col1;
+	OBIDMS_column_p qual_col2;
+	double 			score = 0.0;
+	Obi_blob_p      blob1 = NULL;
+	Obi_blob_p   	blob2 = NULL;
+	Obi_blob_p   	temp_blob = NULL;
+	char*			seq1 = NULL;
+	char*			seq2 = NULL;
+	int32_t         len1;
+	int32_t         len2;
+	int32_t			temp_len;
+	int32_t         pos;
+	int32_t         shift1;
+	int32_t         shift2;
+	const uint8_t*  qual1;
+	const uint8_t*  qual2;
+	int             qual1_len;
+	int				qual2_len;
+	char*           consensus_seq;
+	uint8_t*        consensus_qual;
+	int32_t  		empty_part;
+	int32_t	        nuc_idx;
+	int32_t    		kmer_idx;
+	int32_t	        kmer_count;
+	int32_t         best_shift_idx;
+	int32_t         best_shift;
+	int32_t         max_common_kmers;
+	int32_t         overlap_len;
+	int32_t         consensus_len;
+	int32_t  		cons_shift;
+	uint8_t   		kmer;
+	byte_t          nuc;
+	uint8_t         encoding;
+	int             total_len;
+	int             shift;
+	int32_t  		shift_idx;
+	int32_t         kmer_pos_array_height    = *kmer_pos_array_height_p;
+	int32_t         shift_array_height       = *shift_array_height_p;
+	int32_t         shift_count_array_length = *shift_count_array_length_p;
+	bool			free_blob1 = false;
+
+	// Check kmer size
+	if ((kmer_size < 1) || (kmer_size > 4))
+	{
+		obi_set_errno(OBI_ALIGN_ERROR);
+		obidebug(1, "\nError when computing the kmer similarity between two sequences: the kmer size must be >= 1 or <= 4");
+		return NULL;
+	}
+
+	// Get sequence blobs
+	blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(view1, column1, idx1, elt_idx1);
+	if (blob1 == NULL)
+	{
+	 	obidebug(1, "\nError getting the blob of the 1st sequence when computing the kmer similarity between two sequences");
+	 	return NULL;
+	}
+	blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(view2, column2, idx2, elt_idx2);
+	if (blob2 == NULL)
+	{
+	 	obidebug(1, "\nError getting the blob of the 2nd sequence when computing the kmer similarity between two sequences");
+	 	return NULL;
+	}
+
+	// Choose the shortest sequence to save kmer positions in array
+	len1 = blob1->length_decoded_value;
+	len2 = blob2->length_decoded_value;
+	if (len2 > len1)
+	{
+		temp_len = len1;
+		len1 = len2;
+		len2 = temp_len;
+		temp_blob = blob1;
+		blob1 = blob2;
+		blob2 = temp_blob;
+	}
+
+	// Force encoding on 2 bits by replacing ambiguous nucleotides by 'a's
+	if (blob1->element_size == 4)
+	{
+		seq1 = obi_blob_to_seq(blob1);
+		for (i=0; i<len1; i++)
+		{
+			if (seq1[i] != 'a' && seq1[i] != 't' && seq1[i] != 'g' && seq1[i] != 'c')
+				seq1[i] = 'a';
+		}
+		blob1 = obi_seq_to_blob(seq1);
+		free_blob1 = true;
+	}
+	if (blob2->element_size == 4)
+	{
+		seq2 = obi_blob_to_seq(blob2);
+		for (i=0; i<len2; i++)
+		{
+			if (seq2[i] != 'a' && seq2[i] != 't' && seq2[i] != 'g' && seq2[i] != 'c')
+				seq2[i] = 'a';
+		}
+	}
+
+	// Reverse complement the other sequence    // TODO blob to seq to blob for now, could be more efficient
+	if (seq2 == NULL)
+		seq2 = obi_blob_to_seq(blob2);
+	seq2 = reverse_complement_sequence(seq2);
+	blob2 = obi_seq_to_blob(seq2);
+
+	// Save total length for the shift counts array
+	total_len = len1 + len2 + 1;   // +1 for shift 0
+
+	// Allocate or reallocate memory for the array of shift counts if necessary
+	if (shift_count_array == NULL)
+	{
+		shift_count_array_length = total_len;
+		shift_count_array = (int32_t*) malloc(shift_count_array_length * sizeof(int32_t));
+		if (shift_count_array == NULL)
+		{
+		 	obi_set_errno(OBI_MALLOC_ERROR);
+		 	obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory");
+		 	return NULL;
+		}
+	}
+	else if (total_len >= shift_count_array_length)
+	{
+		shift_count_array_length = total_len;
+		shift_count_array = (int32_t*) realloc(shift_count_array, shift_count_array_length * sizeof(int32_t));
+		if (shift_count_array == NULL)
+		{
+		 	obi_set_errno(OBI_MALLOC_ERROR);
+		 	obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory");
+		 	return NULL;
+		}
+	}
+
+	// Allocate or reallocate memory for the array of shifts if necessary
+	if (shift_array == NULL)
+	{
+		shift_array_height = total_len;
+		shift_array = (int32_t*) malloc(ARRAY_LENGTH * shift_array_height * sizeof(int32_t));
+		if (shift_array == NULL)
+		{
+		 	obi_set_errno(OBI_MALLOC_ERROR);
+		 	obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory");
+		 	return NULL;
+		}
+	}
+	else if (len1 >= shift_array_height)
+	{
+		shift_array_height = total_len;
+		shift_array = (int32_t*) realloc(shift_array, ARRAY_LENGTH * shift_array_height * sizeof(int32_t));
+		if (shift_array == NULL)
+		{
+		 	obi_set_errno(OBI_MALLOC_ERROR);
+		 	obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory");
+		 	return NULL;
+		}
+	}
+
+	// Allocate or reallocate memory for the array of positions if necessary
+	if (kmer_pos_array == NULL)
+	{
+		kmer_pos_array_height = len1;
+		kmer_pos_array = (int32_t*) malloc(ARRAY_LENGTH * kmer_pos_array_height * sizeof(int32_t));
+		if (kmer_pos_array == NULL)
+		{
+		 	obi_set_errno(OBI_MALLOC_ERROR);
+		 	obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory");
+		 	return NULL;
+		}
+	}
+	else if (len1 >= kmer_pos_array_height)
+	{
+		kmer_pos_array_height = len1;
+		kmer_pos_array = (int32_t*) realloc(kmer_pos_array, ARRAY_LENGTH * kmer_pos_array_height * sizeof(int32_t));
+		if (kmer_pos_array == NULL)
+		{
+		 	obi_set_errno(OBI_MALLOC_ERROR);
+		 	obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory");
+		 	return NULL;
+		}
+	}
+
+	// Initialize all positions to -1
+	for (i=0; i<(ARRAY_LENGTH * kmer_pos_array_height); i++)
+		kmer_pos_array[i] = -1;
+	// Initialize all shifts to the maximum value of an int32_t
+	for (i=0; i<(ARRAY_LENGTH * shift_array_height); i++)
+		shift_array[i] = INT32_MAX;
+	//memset(shift_array, 1, ARRAY_LENGTH * shift_array_height * sizeof(int32_t)); // why doesn't this work?
+	// Initialize all shift counts to 0
+	memset(shift_count_array, 0, shift_count_array_length * sizeof(int32_t));
+
+	*kmer_pos_array_height_p = kmer_pos_array_height;
+	*shift_array_height_p = shift_array_height;
+	*shift_count_array_length_p = shift_count_array_length;
+
+	// Fill array with positions of kmers in the shortest sequence
+	encoding = blob1->element_size;
+	kmer_count = len1 - kmer_size + 1;
+	for (kmer_idx=0; kmer_idx < kmer_count; kmer_idx++)
+	{
+		kmer = 0;
+		for (nuc_idx=kmer_idx; nuc_idx<(kmer_idx+kmer_size); nuc_idx++)
+		{
+			nuc = get_nucleotide_from_encoded_seq(blob1->value, nuc_idx, encoding);
+			kmer <<= encoding;
+			kmer |= nuc;
+		}
+
+		i = 0;
+		while (kmer_pos_array[(kmer*kmer_pos_array_height)+i] != -1)
+			i++;
+		kmer_pos_array[(kmer*kmer_pos_array_height)+i] = kmer_idx;
+	}
+
+	// Compare positions of kmers between both sequences and store shifts
+	kmer_count = blob2->length_decoded_value - kmer_size + 1;
+	for (kmer_idx=0; kmer_idx < kmer_count; kmer_idx++)
+	{
+		kmer = 0;
+		for (nuc_idx=kmer_idx; nuc_idx<(kmer_idx+kmer_size); nuc_idx++)
+		{
+			nuc = get_nucleotide_from_encoded_seq(blob2->value, nuc_idx, encoding);
+			kmer <<= encoding;
+			kmer |= nuc;
+		}
+
+		// Get the index at which the new shifts should be stored for that kmer
+		j = 0;
+		while ((shift_array[(kmer*shift_array_height)+j] != INT32_MAX) && (j<shift_array_height))
+			j++;
+		// Store the shift between the kmer in the 1st and the 2nd sequence
+		i = 0;
+		while ((kmer_pos_array[(kmer*kmer_pos_array_height)+i] != -1) && (i<kmer_pos_array_height))
+		{
+			shift_array[(kmer*shift_array_height)+j] = kmer_idx - kmer_pos_array[(kmer*kmer_pos_array_height)+i];
+			i++;
+			j++;
+		}
+	}
+
+	// Count how many times each shift is represented
+	for (kmer=0;;kmer++)
+	{
+		for (i=0;;i++)
+		{
+			shift = shift_array[(kmer*shift_array_height)+i];
+			if ((shift == INT32_MAX) || (i==shift_array_height))
+				break;
+			shift_idx = shift + len1;
+			shift_count_array[shift_idx]++;
+		}
+		if (kmer == (ARRAY_LENGTH-1))  // stop condition can't be in for line because kmer can't be >= 255
+			break;
+	}
+
+	// Find the most represented shift
+	best_shift_idx = 0;
+	max_common_kmers = 0;
+	empty_part = (shift_count_array_length-1)/2 - len1;
+	for (i=empty_part; i < (shift_count_array_length - empty_part); i++)  // skipping empty parts of the array
+	{
+		if (shift_count_array[i] > max_common_kmers)
+		{
+			best_shift_idx = i;
+			max_common_kmers = shift_count_array[i];
+		}
+	}
+	best_shift = best_shift_idx - len1;
+
+	if (best_shift > 0)
+	    overlap_len = len2 - best_shift;
+	else
+	    overlap_len = len1 + best_shift;
+
+	score = max_common_kmers + kmer_size - 1;
+
+	ali = (Obi_ali_p) malloc(sizeof(Obi_ali_t));
+	if (ali == NULL)
+	{
+	 	obi_set_errno(OBI_MALLOC_ERROR);
+	 	obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory for the result structure");
+	 	return NULL;
+	}
+
+	// Save result in Obi_ali structure
+	ali->score = score;
+	ali->consensus_length = 0;
+	ali->overlap_length = overlap_len;
+	ali->shift = best_shift;
+	ali->consensus_seq = NULL;
+	ali->consensus_qual = NULL;
+
+	// Build the consensus sequence if asked
+	if (build_consensus)
+	{
+		// Get the quality columns
+		qual_col1 = obi_view_get_column(view1, QUALITY_COLUMN);
+		if (qual_col1 == NULL)
+		{
+			obi_set_errno(OBI_ALIGN_ERROR);
+			obidebug(1, "\nError when computing the kmer similarity between two sequences: can't open quality column to build the consensus sequence");
+			return NULL;
+		}
+		qual_col2 = obi_view_get_column(view2, QUALITY_COLUMN);
+		if (qual_col2 == NULL)
+		{
+			obi_set_errno(OBI_ALIGN_ERROR);
+			obidebug(1, "\nError when computing the kmer similarity between two sequences: can't open quality column to build the consensus sequence");
+			return NULL;
+		}
+
+		// Get the quality arrays
+		qual1 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view1, qual_col1, idx1, 0, &qual1_len);
+		if (qual1 == NULL)
+		{
+			obidebug(1, "\nError getting the blob of the 1st sequence when computing the kmer similarity between two sequences");
+			return NULL;
+		}
+		qual2 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view2, qual_col2, idx2, 0, &qual2_len);
+		if (qual2 == NULL)
+		{
+			obidebug(1, "\nError getting the blob of the 1st sequence when computing the kmer similarity between two sequences");
+			return NULL;
+		}
+
+		// Decode the first sequence if not already done
+		if (seq1 == NULL)
+			seq1 = obi_blob_to_seq(blob1);
+
+		if (best_shift > 0)
+		    consensus_len = len1 + len2 - overlap_len;
+		else
+		    consensus_len = overlap_len;
+
+		// Allocate memory for consensus sequence
+		consensus_seq = (char*) malloc(consensus_len + 1 * sizeof(char)); // TODO keep malloced too maybe
+		if (consensus_seq == NULL)
+		{
+		 	obi_set_errno(OBI_MALLOC_ERROR);
+		 	obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory for the consensus sequence");
+		 	return NULL;
+		}
+
+		// Allocate memory for consensus quality
+		consensus_qual = (uint8_t*) malloc(consensus_len * sizeof(uint8_t));
+		if (consensus_qual == NULL)
+		{
+		 	obi_set_errno(OBI_MALLOC_ERROR);
+		 	obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory for the consensus quality");
+		 	return NULL;
+		}
+
+		ali->consensus_length = consensus_len;
+		ali->consensus_seq = consensus_seq;
+		ali->consensus_qual = consensus_qual;
+
+		// Compute consensus-relative shift for each sequence and store direction
+		if (best_shift >= 0)
+		{
+			shift1 = 0;
+			shift2 = best_shift;
+			strcpy(ali->direction, "left");
+		}
+		else
+		{
+			shift1 = -(best_shift);
+			shift2 = 0;
+			strcpy(ali->direction, "right");
+		}
+
+		// If positive shift, copy first part of second sequence
+		if (best_shift > 0)
+		{
+			strncpy(consensus_seq, seq2, best_shift);
+			memcpy(consensus_qual, qual2, best_shift*sizeof(uint8_t));
+			cons_shift = best_shift;
+		}
+		else
+			cons_shift = 0;
+
+		// Build consensus part
+		for (pos=0; pos<overlap_len; pos++)
+		{
+			if (qual1[pos+shift1] >= qual2[pos+shift2])
+				consensus_seq[pos+cons_shift] = seq1[pos+shift1];
+			else
+				consensus_seq[pos+cons_shift] = seq2[pos+shift2];
+ 			consensus_qual[pos+cons_shift] = round((qual1[pos+shift1] + qual2[pos+shift2])/2);    // TODO maybe use the (p1*(1-p2/3)) formula (but parenthesis bug???)
+		}
+
+		// If positive shift, copy last part of first sequence
+		if (best_shift > 0)
+		{
+			strncpy(consensus_seq+cons_shift+overlap_len, seq1+overlap_len, len1-overlap_len);
+			memcpy(consensus_qual+cons_shift+overlap_len, qual1+overlap_len, (len1-overlap_len)*sizeof(uint8_t));
+		}
+
+		consensus_seq[consensus_len] = '\0';
+	}
+
+	if (seq1 != NULL)
+		free(seq1);
+	if (free_blob1)
+		free(blob1);
+	free(seq2);
+	free(blob2);
+
+	return ali;
+}
+