Files
obitools3/src/kmer_similarity.h

133 lines
5.8 KiB
C
Raw Permalink Normal View History

/****************************************************************************
* Header file for Kmer similarity computation functions *
****************************************************************************/
/**
* @file kmer_similarity.h
* @author Celine Mercier (celine.mercier@metabarcoding.org)
* @date January 7th 2019
* @brief Header file for Kmer similarity computation functions.
*/
#ifndef KMER_SIMILARITY_H_
#define KMER_SIMILARITY_H_
#include <stdio.h>
#include "obitypes.h"
#include "obidmscolumn.h"
#include "obiview.h"
/* Array length constant exported by this header; its users are not visible here.
 * NOTE(review): presumably 4^4 = 256, i.e. the number of distinct kmers for the
 * maximum kmer_size of 4 accepted by kmer_similarity() - confirm in the implementation. */
#define ARRAY_LENGTH (256)
/**
 * @brief Alignment structure, holding information about the similarity and
 *        the data needed to rebuild the alignment.
 */
typedef struct Obi_ali {
	int      score;             /**< Alignment score, corresponding to an approximation of the number of
	                             *   nucleotides matching in the overlap of the alignment.
	                             *   It is an approximation because one mismatch produces kmer_size kmer
	                             *   mismatches if located in the middle of the overlap, and fewer if located
	                             *   towards the ends of the overlap. The case with the most mismatches is
	                             *   assumed, meaning that the score is often underestimated and never
	                             *   overestimated.
	                             */
	int      consensus_length;  /**< Length of the final consensus sequence.
	                             */
	int      overlap_length;    /**< Length of the overlap between the aligned sequences.
	                             */
	char*    consensus_seq;     /**< Consensus sequence built so as to reconstruct a paired-end read.
	                             */
	uint8_t* consensus_qual;    /**< Consensus quality built so as to reconstruct a paired-end read.
	                             */
	int      shift;             /**< Shift chosen to align the sequences.
	                             */
	char     direction[6];      /**< Alignment direction (positive/right or negative/left shift).
	                             */
} Obi_ali_t, *Obi_ali_p;
/**
 * @brief Frees an Obi_ali structure and all its elements.
 *
 * @warning The pointer passed becomes unusable after the call.
 *
 * @param ali A pointer on the Obi_ali structure to free.
 *            NOTE(review): "all its elements" presumably covers the consensus_seq and
 *            consensus_qual buffers - confirm in the implementation.
 *
 * @since January 2019
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
void obi_free_shifted_ali(Obi_ali_p ali);
/**
 * @brief Computes the kmer similarity of two sequences stored in views.
 *
 * The similarity is computed this way: the positions of identical kmers in both sequences are
 * compared, and the most represented shift is chosen. The similarity is then calculated as:
 * 		kmer_similarity = number_of_common_kmers_with_the_chosen_shift + kmer_size - 1
 *
 * @warning Several arrays and structures passed to or returned by the function have to be freed by the caller:
 * 				- the array pointed to by kmer_pos_array_p
 * 				- the array pointed to by shift_array_p
 * 				- the array pointed to by shift_count_array_p
 * 				- the Obi_ali_p structure returned (presumably with obi_free_shifted_ali() - confirm)
 *
 * @param view1 A pointer on the view containing the first sequence.
 * @param column1 A pointer on the column containing the first sequence.
 * @param idx1 The index of the first sequence in view1.
 * @param elt_idx1 The element index of the first sequence in column1.
 * @param view2 A pointer on the view containing the second sequence.
 * @param column2 A pointer on the column containing the second sequence.
 * @param idx2 The index of the second sequence in view2.
 * @param elt_idx2 The element index of the second sequence in column2.
 * @param qual_col1 A pointer on the column containing the quality associated with the first sequence (in view1).
 * @param qual_col2 A pointer on the column containing the quality associated with the second sequence (in view2).
 * @param reversed_column A pointer on the column containing a boolean indicating whether the sequences correspond
 *                        to paired-end reads that might have been reversed before aligning them
 *                        (e.g. when the ngsfilter tool is used before alignpairedend).
 * @param kmer_size The kmer length to use. Must be between 1 and 4 (inclusive).
 * @param kmer_pos_array_p A pointer on the array used to store kmer positions. If the array is NULL,
 *                         it is allocated by the function. If needed, it is reallocated to a bigger size.
 *                         NOTE(review): "NULL" is assumed to refer to *kmer_pos_array_p (same for the
 *                         two other array parameters) - confirm in the implementation.
 * @param kmer_pos_array_height_p A pointer on an integer corresponding to the size (number of elements)
 *                                allocated for the kmer position array. Updated by the function as needed.
 * @param shift_array_p A pointer on the array used to store kmer shifts. If the array is NULL,
 *                      it is allocated by the function. If needed, it is reallocated to a bigger size.
 * @param shift_array_height_p A pointer on an integer corresponding to the size (number of elements)
 *                             allocated for the shift array. Updated by the function as needed.
 * @param shift_count_array_p A pointer on the array used to store shift counts. If the array is NULL,
 *                            it is allocated by the function. If needed, it is reallocated to a bigger size.
 * @param shift_count_array_height_p A pointer on an integer corresponding to the size (number of elements)
 *                                   allocated for the shift count array. Updated by the function as needed.
 * @param build_consensus A boolean indicating whether the function should build the consensus sequence
 *                        and quality so as to reconstruct a paired-end read.
 *                        TODO: option to build the consensus without quality?
 *
 * @returns A pointer on an Obi_ali_p structure containing the results.
 * @retval NULL if an error occurred.
 *
 * @since January 2019
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
Obi_ali_p kmer_similarity(Obiview_p view1,
OBIDMS_column_p column1,
index_t idx1,
index_t elt_idx1,
Obiview_p view2,
OBIDMS_column_p column2,
index_t idx2,
index_t elt_idx2,
OBIDMS_column_p qual_col1,
OBIDMS_column_p qual_col2,
OBIDMS_column_p reversed_column,
uint8_t kmer_size,
int32_t** kmer_pos_array_p,
int32_t* kmer_pos_array_height_p,
int32_t** shift_array_p,
int32_t* shift_array_height_p,
int32_t** shift_count_array_p,
int32_t* shift_count_array_height_p,
bool build_consensus);
#endif /* KMER_SIMILARITY_H_ */