diff --git a/src/sse_banded_LCS_alignment.c b/src/sse_banded_LCS_alignment.c index 07b5ffe..8790c6b 100644 --- a/src/sse_banded_LCS_alignment.c +++ b/src/sse_banded_LCS_alignment.c @@ -1,16 +1,22 @@ -/* - * sse_banded_LCS_alignment.c - * - * Created on: 7 nov. 2012 - * Author: celine mercier +/**************************************************************************** + * LCS alignment of two sequences * + ****************************************************************************/ + +/** + * @file sse_banded_LCS_alignment.c + * @author Celine Mercier (celine.mercier@metabarcoding.org) + * @date November 7th 2012 + * @brief Functions handling the alignment of two sequences to compute their Longest Common Sequence. */ + #include #include #include #include #include +#include #include "obierrno.h" #include "obidebug.h" @@ -24,6 +30,231 @@ #define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) + +/************************************************************************** + * + * D E C L A R A T I O N O F T H E P R I V A T E F U N C T I O N S + * + **************************************************************************/ + + +/** + * @brief Internal function printing a 128 bits register as 8 16-bits integers. + * + * @param r The register to print. + * + * @author Eric Coissac (eric.coissac@metabarcoding.org) + */ +static void printreg(__m128i r); + + +/** + * @brief Internal function extracting a 16-bits integer from a 128 bits register. + * + * @param r The register to read. + * @param p The position at which the integer should be read (between 0 and 7). + * + * @returns The extracted integer. + * + * @author Eric Coissac (eric.coissac@metabarcoding.org) + */ +static inline int extract_reg(__m128i r, int p); + + +/** + * @brief Internal function aligning two sequences, computing the lengths of their Longest Common Subsequence and of their alignment. + * + * @warning The first argument (seq1) must correspond to the longest sequence. + * + * @param seq1 The first sequence, the longest of the two, as prepared by putSeqInSeq() or putBlobInSeq(). + * @param seq2 The second sequence, the shortest of the two, as prepared by putSeqInSeq() or putBlobInSeq(). + * @param l1 The length of the first sequence. + * @param l2 The length of the second sequence. + * @param bandLengthLeft The length of the left band for the banded alignment, as computed by calculateLeftBandLength(). + * @param bandLengthTotal The length of the complete band for the banded alignment, as computed by calculateSSEBandLength(). + * @param address A pointer, aligned on a 16 bits boundary, on the int array where the initial values for the alignment length are stored, + * as prepared for the alignment by initializeAddressWithGaps(). + * @param lcs_length A pointer on the int where the LCS length will be stored. + * @param ali_length A pointer on the int where the alignment length will be stored. + * + * @since 2012 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal, int16_t* address, int* lcs_length, int* ali_length); + + +/** + * @brief Internal function aligning two sequences, computing the length of their Longest Common Subsequence (and not the alignment length). + * + * @warning The first argument (seq1) must correspond to the longest sequence. + * + * @param seq1 The first sequence, the longest of the two, as prepared by putSeqInSeq() or putBlobInSeq(). + * @param seq2 The second sequence, the shortest of the two, as prepared by putSeqInSeq() or putBlobInSeq(). + * @param l1 The length of the first sequence. + * @param l2 The length of the second sequence. + * @param bandLengthLeft The length of the left band for the banded alignment, as computed by calculateLeftBandLength(). + * @param bandLengthTotal The length of the complete band for the banded alignment, as computed by calculateSSEBandLength(). + * @param lcs_length A pointer on the int where the LCS length will be stored. + * + * @since 2012 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +void sse_banded_align_just_lcs(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal, int* lcs_length); + + +/** + * @brief Internal function calculating the length of the left band for the banded alignment. + * + * @param lmax The length of the longest sequence to align. + * @param LCSmin The minimum length of the LCS to be above the chosen threshold, as computed by calculateLCSmin(). + * + * @returns The length of the left band. + * + * @since 2012 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +int calculateLeftBandLength(int lmax, int LCSmin); + + +/** + * @brief Internal function calculating the length of the right band for the banded alignment. + * + * @param lmin The length of the shortest sequence to align. + * @param LCSmin The minimum length of the LCS to be above the chosen threshold, as computed by calculateLCSmin(). + * + * @returns The length of the right band. + * + * @since 2012 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +int calculateRightBandLength(int lmin, int LCSmin); + + +/** + * @brief Internal function calculating the length of the complete band for the banded alignment. + * + * @param bandLengthRight The length of the right band for the banded alignment, as computed by calculateRightBandLength(). + * @param bandLengthLeft The length of the left band for the banded alignment, as computed by calculateLeftBandLength(). + * + * @returns The length of the complete band. + * + * @since 2012 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +int calculateSSEBandLength(int bandLengthRight, int bandLengthLeft); + + +/** + * @brief Internal function calculating the size to allocate for the int array where the alignment length will be stored in the matrix. + * + * @param maxLen The length of the longest sequence to align. + * @param LCSmin The minimum length of the LCS to be above the chosen threshold, as computed by calculateLCSmin(). + * + * @returns The size to allocate in bytes. + * + * @since 2012 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +int calculateSizeToAllocate(int maxLen, int LCSmin); + + +/** + * @brief Internal function initializing the int array corresponding to a sequence to align with default values. + * + * @param seq The int array corresponding to the sequence to align, as prepared by putSeqInSeq() or putBlobInSeq(). + * @param size The number of positions to initialize. + * @param iniValue The value that the positions should be initialized to. + * + * @since 2012 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +void iniSeq(int16_t* seq, int size, int16_t iniValue); + + +/** + * @brief Internal function building the int array corresponding to a sequence to align. + * + * Each nucleotide is stored as a short int (int16_t). + * + * @param seq A pointer on the allocated int array. + * @param s A pointer on the character string corresponding to the sequence. + * @param l The length of the sequence. + * @param reverse A boolean indicating whether the sequence should be written reversed + * (for the second sequence to align). + * + * @since 2012 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +void putSeqInSeq(int16_t* seq, char* s, int l, bool reverse); + + +/** + * @brief Internal function building the int array corresponding to an obiblob containing a sequence. + * + * Each nucleotide is stored as a short int (int16_t). + * + * @param seq A pointer on the allocated int array. + * @param b A pointer on the obiblob containing the sequence. + * @param l The length of the (decoded) sequence. + * @param reverse A boolean indicating whether the sequence should be written reversed + * (for the second sequence to align). + * + * @since 2012 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +void putBlobInSeq(int16_t* seq, Obi_blob_p b, int l, bool reverse); + + +/** + * @brief Internal function preparing an int array with the initial values for the alignment lengths before the alignment. + * + * The int array containing the initial alignment lengths (corresponding to the first line of the diagonalized band of the alignment matrix) + * needs to be initialized with external gap lengths before the alignment. + * + * @param address A pointer, aligned on a 16 bits boundary, on the int array where the initial values for the alignment length are to be stored. + * @param bandLengthTotal The length of the complete band for the banded alignment, as computed by calculateSSEBandLength(). + * @param bandLengthLeft The length of the left band for the banded alignment, as computed by calculateLeftBandLength(). + * @param lmax The length of the longest sequence to align. + * + * @since 2012 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +void initializeAddressWithGaps(int16_t* address, int bandLengthTotal, int bandLengthLeft, int lmax); + + +/** + * @brief Internal function aligning two sequences, computing the lengths of their Longest Common Subsequence and of their alignment. + * + * @warning The first argument (seq1) must correspond to the longest sequence. + * + * @param seq1 The first sequence, the longest of the two, as prepared by putSeqInSeq() or putBlobInSeq(). + * @param seq2 The second sequence, the shortest of the two, as prepared by putSeqInSeq() or putBlobInSeq(). + * @param l1 The length of the first sequence. + * @param l2 The length of the second sequence. + * @param normalize Whether the score should be normalized with the reference sequence length. + * @param reference The reference length. 0: The alignment length; 1: The longest sequence's length; 2: The shortest sequence's length. + * @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false). + * @param address A pointer, aligned on a 16 bits boundary, on an allocated int array where the initial values for the alignment length will be stored. + * @param LCSmin The minimum length of the LCS to be above the chosen threshold, as computed by calculateLCSmin(). + * @param lcs_length A pointer on the int where the LCS length will be stored. + * @param ali_length A pointer on the int where the alignment length will be stored. + * + * @returns The alignment score (normalized according to the parameters). + * + * @since 2012 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +double sse_banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, bool normalize, int reference, bool similarity_mode, int16_t* address, int LCSmin, int* lcs_length, int* ali_length); + + + +/************************************************************************ + * + * D E F I N I T I O N O F T H E P R I V A T E F U N C T I O N S + * + ************************************************************************/ + + static void printreg(__m128i r) { int16_t a0,a1,a2,a3,a4,a5,a6,a7; @@ -61,7 +292,6 @@ static inline int extract_reg(__m128i r, int p) } -// TODO warning on length order void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal, int16_t* address, int* lcs_length, int* ali_length) { register int j; @@ -287,7 +517,6 @@ void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int } -// TODO warning on length order void sse_banded_align_just_lcs(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal, int* lcs_length) { register int j; @@ -319,7 +548,7 @@ void sse_banded_align_just_lcs(int16_t* seq1, int16_t* seq2, int l1, int l2, int // Initialisations odd_BLL = bandLengthLeft & 1; - even_BLL = !odd_BLL; + even_BLL = !odd_BLL; numberOfRegistersPerLine = bandLengthTotal / 8; numberOfRegistersFor3Lines = 3 * numberOfRegistersPerLine; @@ -446,15 +675,14 @@ int calculateSSEBandLength(int bandLengthRight, int bandLengthLeft) } -// TODO that's gonna be fun to doc -int calculateSizeToAllocate(int maxLen, int minLen, int LCSmin) +int calculateSizeToAllocate(int maxLen, int LCSmin) { int size; size = calculateLeftBandLength(maxLen, LCSmin); size *= 2; - size = (size & (~ (int)7)) + (( size & (int)7) ? 8:0); // Closest greater 8 multiple + size = (size & (~ (int)7)) + ((size & (int)7) ? 8:0); // Closest greater 8 multiple size *= 3; size += 16; @@ -522,13 +750,13 @@ void putBlobInSeq(int16_t* seq, Obi_blob_p b, int l, bool reverse) } -void initializeAddressWithGaps(int16_t* address, int bandLengthTotal, int bandLengthLeft, int l1) +void initializeAddressWithGaps(int16_t* address, int bandLengthTotal, int bandLengthLeft, int lmax) { int i; int address_00, x_address_10, address_01, address_01_shifted; int numberOfRegistersPerLine; int bm; - int value=INT16_MAX-l1; + int value=INT16_MAX-lmax; numberOfRegistersPerLine = bandLengthTotal / 8; bm = bandLengthLeft%2; @@ -556,7 +784,6 @@ void initializeAddressWithGaps(int16_t* address, int bandLengthTotal, int bandLe } -// TODO warning on length order double sse_banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, bool normalize, int reference, bool similarity_mode, int16_t* address, int LCSmin, int* lcs_length, int* ali_length) { double id; @@ -610,10 +837,14 @@ double sse_banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, bool n -// PUBLIC FUNCTIONS +/********************************************************************** + * + * D E F I N I T I O N O F T H E P U B L I C F U N C T I O N S + * + **********************************************************************/ -int calculateLCSmin(int l1, int l2, double threshold, bool normalize, int reference, bool similarity_mode) +int calculateLCSmin(int lmax, int lmin, double threshold, bool normalize, int reference, bool similarity_mode) { int LCSmin; @@ -622,16 +853,16 @@ int calculateLCSmin(int l1, int l2, double threshold, bool normalize, int refere if (normalize) { if (reference == MINLEN) - LCSmin = threshold*l2; + LCSmin = threshold*lmin; else // ref = maxlen or alilen - LCSmin = threshold*l1; + LCSmin = threshold*lmax; } else if (similarity_mode) LCSmin = threshold; else if (reference == MINLEN) // not similarity_mode - LCSmin = l2 - threshold; + LCSmin = lmin - threshold; else // not similarity_mode and ref = maxlen or alilen - LCSmin = l1 - threshold; + LCSmin = lmax - threshold; } else LCSmin = 0; @@ -679,7 +910,7 @@ double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bo // Allocate space for matrix band if the alignment length must be computed if ((reference == ALILEN) && (normalize || !similarity_mode)) // cases in which alignment length must be computed { - sizeToAllocateForBand = calculateSizeToAllocate(lmax, lmin, LCSmin); + sizeToAllocateForBand = calculateSizeToAllocate(lmax, LCSmin); address = obi_get_memory_aligned_on_16(sizeToAllocateForBand, &shift); if (address == NULL) { @@ -774,13 +1005,13 @@ double obiblob_sse_banded_lcs_align(Obi_blob_p seq1, Obi_blob_p seq2, double thr // Allocate space for matrix band if the alignment length must be computed if ((reference == ALILEN) && (normalize || !similarity_mode)) // cases in which alignment length must be computed { - sizeToAllocateForBand = calculateSizeToAllocate(lmax, lmin, LCSmin); + sizeToAllocateForBand = calculateSizeToAllocate(lmax, LCSmin); address = obi_get_memory_aligned_on_16(sizeToAllocateForBand, &shift); if (address == NULL) { obi_set_errno(OBI_MALLOC_ERROR); - obidebug(1, "\nError getting a memory address aligned on 16 bytes boundary"); - return 0; // TODO DOUBLE_MIN + obidebug(1, "\nError getting a memory address aligned on a 16 bits boundary"); + return 0; // TODO DOUBLE_MIN to flag error } } diff --git a/src/sse_banded_LCS_alignment.h b/src/sse_banded_LCS_alignment.h index 23f3358..f46bf94 100644 --- a/src/sse_banded_LCS_alignment.h +++ b/src/sse_banded_LCS_alignment.h @@ -1,10 +1,15 @@ -/* - * sse_banded_LCS_alignment.h - * - * Created on: november 29, 2012 - * Author: mercier +/**************************************************************************** + * LCS alignment of two sequences header file * + ****************************************************************************/ + +/** + * @file sse_banded_LCS_alignment.h + * @author Celine Mercier (celine.mercier@metabarcoding.org) + * @date November 7th 2012 + * @brief header file for the functions handling the alignment of two sequences to compute their Longest Common Sequence. */ + #ifndef SSE_BANDED_LCS_ALIGNMENT_H_ #define SSE_BANDED_LCS_ALIGNMENT_H_ @@ -15,13 +20,97 @@ #include "obiblob.h" -#define ALILEN (0) // TODO enum +/** + * @brief Macros for reference lengths to use when aligning. + * + * @since 2012 + * @author Eric Coissac (eric.coissac@metabarcoding.org) + */ +#define ALILEN (0) #define MAXLEN (1) #define MINLEN (2) -// TODO doc -int calculateLCSmin(int l1, int l2, double threshold, bool normalize, int reference, bool lcsmode); + +/** + * @brief Function calculating the minimum length of the Longest Common Subsequence between two sequences to be above a chosen score threshold. + * + * @warning The first argument (lmax) must correspond to length of the longest sequence. + * + * @param lmax The length of the longest sequence to align. + * @param lmin The length of the shortest sequence to align. + * @param threshold Score threshold. If the score is normalized and expressed in similarity, it is an identity, e.g. 0.95 + * for an identity of 95%. If the score is normalized and expressed in distance, it is (1.0 - identity), + * e.g. 0.05 for an identity of 95%. If the score is not normalized and expressed in similarity, it is + * the length of the Longest Common Subsequence. If the score is not normalized and expressed in distance, + * it is (reference length - LCS length). Only sequence pairs with a similarity above the threshold are printed. + * @param normalize Whether the score should be normalized with the reference sequence length. + * @param reference The reference length. 0: The alignment length; 1: The longest sequence's length; 2: The shortest sequence's length. // TODO + * @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false). + * + * @returns The minimum length of the Longest Common Subsequence between two sequences to be above the chosen score threshold. + * + * @since 2012 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +int calculateLCSmin(int lmax, int lmin, double threshold, bool normalize, int reference, bool similarity_mode); + + +/** + * @brief Function aligning two sequences. + * + * The alignment algorithm is a banded global alignment algorithm, a modified version of the classical Needleman and Wunsch algorithm, + * and uses indices based on the length of the Longest Common Subsequence between the two sequences. + * + * Note: the sequences do not need to be ordered (e.g. with the longest sequence as first argument). + * + * @param seq1 A pointer on the character string corresponding to the first sequence. + * @param seq2 A pointer on the character string corresponding to the second sequence. + * @param threshold Score threshold. If the score is normalized and expressed in similarity, it is an identity, e.g. 0.95 + * for an identity of 95%. If the score is normalized and expressed in distance, it is (1.0 - identity), + * e.g. 0.05 for an identity of 95%. If the score is not normalized and expressed in similarity, it is + * the length of the Longest Common Subsequence. If the score is not normalized and expressed in distance, + * it is (reference length - LCS length). Only sequence pairs with a similarity above the threshold are printed. + * @param normalize Whether the score should be normalized with the reference sequence length. + * @param reference The reference length. 0: The alignment length; 1: The longest sequence's length; 2: The shortest sequence's length. // TODO + * @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false). + * @param lcs_length A pointer on the int where the LCS length will be stored. + * @param ali_length A pointer on the int where the alignment length will be stored. + * + * @returns The alignment score (normalized according to the parameters). + * + * @since 2012 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bool normalize, int reference, bool similarity_mode, int* lcs_length, int* ali_length); + + +/** + * @brief Function aligning two sequences encoded in obiblobs. + * + * The alignment algorithm is a banded global alignment algorithm, a modified version of the classical Needleman and Wunsch algorithm, + * and uses indices based on the length of the Longest Common Subsequence between the two sequences. + * + * Note: the obiblobs do not need to be ordered (e.g. with the obiblob containing the longest sequence as first argument). + * + * @param seq1 A pointer on the blob containing the first sequence. + * @param seq2 A pointer on the blob containing the second sequence. + * @param threshold Score threshold. If the score is normalized and expressed in similarity, it is an identity, e.g. 0.95 + * for an identity of 95%. If the score is normalized and expressed in distance, it is (1.0 - identity), + * e.g. 0.05 for an identity of 95%. If the score is not normalized and expressed in similarity, it is + * the length of the Longest Common Subsequence. If the score is not normalized and expressed in distance, + * it is (reference length - LCS length). Only sequence pairs with a similarity above the threshold are printed. + * @param normalize Whether the score should be normalized with the reference sequence length. + * @param reference The reference length. 0: The alignment length; 1: The longest sequence's length; 2: The shortest sequence's length. // TODO + * @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false). + * @param lcs_length A pointer on the int where the LCS length will be stored. + * @param ali_length A pointer on the int where the alignment length will be stored. + * + * @returns The alignment score (normalized according to the parameters). + * + * @since December 2016 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ double obiblob_sse_banded_lcs_align(Obi_blob_p seq1, Obi_blob_p seq2, double threshold, bool normalize, int reference, bool similarity_mode, int* lcs_length, int* ali_length); + #endif