LCS alignment: documentation for all the lowest level functions
This commit is contained in:
@ -1,16 +1,22 @@
|
|||||||
/*
|
/****************************************************************************
|
||||||
* sse_banded_LCS_alignment.c
|
* LCS alignment of two sequences *
|
||||||
*
|
****************************************************************************/
|
||||||
* Created on: 7 nov. 2012
|
|
||||||
* Author: celine mercier
|
/**
|
||||||
|
* @file sse_banded_LCS_alignment.c
|
||||||
|
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||||
|
* @date November 7th 2012
|
||||||
|
* @brief Functions handling the alignment of two sequences to compute their Longest Common Sequence.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <stdbool.h>
|
#include <stdbool.h>
|
||||||
|
#include <limits.h>
|
||||||
|
|
||||||
#include "obierrno.h"
|
#include "obierrno.h"
|
||||||
#include "obidebug.h"
|
#include "obidebug.h"
|
||||||
@ -24,6 +30,231 @@
|
|||||||
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
|
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**************************************************************************
|
||||||
|
*
|
||||||
|
* D E C L A R A T I O N O F T H E P R I V A T E F U N C T I O N S
|
||||||
|
*
|
||||||
|
**************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Internal function printing a 128 bits register as 8 16-bits integers.
|
||||||
|
*
|
||||||
|
* @param r The register to print.
|
||||||
|
*
|
||||||
|
* @author Eric Coissac (eric.coissac@metabarcoding.org)
|
||||||
|
*/
|
||||||
|
static void printreg(__m128i r);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Internal function extracting a 16-bits integer from a 128 bits register.
|
||||||
|
*
|
||||||
|
* @param r The register to read.
|
||||||
|
* @param p The position at which the integer should be read (between 0 and 7).
|
||||||
|
*
|
||||||
|
* @returns The extracted integer.
|
||||||
|
*
|
||||||
|
* @author Eric Coissac (eric.coissac@metabarcoding.org)
|
||||||
|
*/
|
||||||
|
static inline int extract_reg(__m128i r, int p);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Internal function aligning two sequences, computing the lengths of their Longest Common Subsequence and of their alignment.
|
||||||
|
*
|
||||||
|
* @warning The first argument (seq1) must correspond to the longest sequence.
|
||||||
|
*
|
||||||
|
* @param seq1 The first sequence, the longest of the two, as prepared by putSeqInSeq() or putBlobInSeq().
|
||||||
|
* @param seq2 The second sequence, the shortest of the two, as prepared by putSeqInSeq() or putBlobInSeq().
|
||||||
|
* @param l1 The length of the first sequence.
|
||||||
|
* @param l2 The length of the second sequence.
|
||||||
|
* @param bandLengthLeft The length of the left band for the banded alignment, as computed by calculateLeftBandLength().
|
||||||
|
* @param bandLengthTotal The length of the complete band for the banded alignment, as computed by calculateSSEBandLength().
|
||||||
|
* @param address A pointer, aligned on a 16 bits boundary, on the int array where the initial values for the alignment length are stored,
|
||||||
|
* as prepared for the alignment by initializeAddressWithGaps().
|
||||||
|
* @param lcs_length A pointer on the int where the LCS length will be stored.
|
||||||
|
* @param ali_length A pointer on the int where the alignment length will be stored.
|
||||||
|
*
|
||||||
|
* @since 2012
|
||||||
|
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||||
|
*/
|
||||||
|
void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal, int16_t* address, int* lcs_length, int* ali_length);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Internal function aligning two sequences, computing the length of their Longest Common Subsequence (and not the alignment length).
|
||||||
|
*
|
||||||
|
* @warning The first argument (seq1) must correspond to the longest sequence.
|
||||||
|
*
|
||||||
|
* @param seq1 The first sequence, the longest of the two, as prepared by putSeqInSeq() or putBlobInSeq().
|
||||||
|
* @param seq2 The second sequence, the shortest of the two, as prepared by putSeqInSeq() or putBlobInSeq().
|
||||||
|
* @param l1 The length of the first sequence.
|
||||||
|
* @param l2 The length of the second sequence.
|
||||||
|
* @param bandLengthLeft The length of the left band for the banded alignment, as computed by calculateLeftBandLength().
|
||||||
|
* @param bandLengthTotal The length of the complete band for the banded alignment, as computed by calculateSSEBandLength().
|
||||||
|
* @param lcs_length A pointer on the int where the LCS length will be stored.
|
||||||
|
*
|
||||||
|
* @since 2012
|
||||||
|
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||||
|
*/
|
||||||
|
void sse_banded_align_just_lcs(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal, int* lcs_length);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Internal function calculating the length of the left band for the banded alignment.
|
||||||
|
*
|
||||||
|
* @param lmax The length of the longest sequence to align.
|
||||||
|
* @param LCSmin The minimum length of the LCS to be above the chosen threshold, as computed by calculateLCSmin().
|
||||||
|
*
|
||||||
|
* @returns The length of the left band.
|
||||||
|
*
|
||||||
|
* @since 2012
|
||||||
|
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||||
|
*/
|
||||||
|
int calculateLeftBandLength(int lmax, int LCSmin);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Internal function calculating the length of the right band for the banded alignment.
|
||||||
|
*
|
||||||
|
* @param lmin The length of the shortest sequence to align.
|
||||||
|
* @param LCSmin The minimum length of the LCS to be above the chosen threshold, as computed by calculateLCSmin().
|
||||||
|
*
|
||||||
|
* @returns The length of the right band.
|
||||||
|
*
|
||||||
|
* @since 2012
|
||||||
|
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||||
|
*/
|
||||||
|
int calculateRightBandLength(int lmin, int LCSmin);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Internal function calculating the length of the complete band for the banded alignment.
|
||||||
|
*
|
||||||
|
* @param bandLengthRight The length of the right band for the banded alignment, as computed by calculateRightBandLength().
|
||||||
|
* @param bandLengthLeft The length of the left band for the banded alignment, as computed by calculateLeftBandLength().
|
||||||
|
*
|
||||||
|
* @returns The length of the complete band.
|
||||||
|
*
|
||||||
|
* @since 2012
|
||||||
|
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||||
|
*/
|
||||||
|
int calculateSSEBandLength(int bandLengthRight, int bandLengthLeft);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Internal function calculating the size to allocate for the int array where the alignment length will be stored in the matrix.
|
||||||
|
*
|
||||||
|
* @param maxLen The length of the longest sequence to align.
|
||||||
|
* @param LCSmin The minimum length of the LCS to be above the chosen threshold, as computed by calculateLCSmin().
|
||||||
|
*
|
||||||
|
* @returns The size to allocate in bytes.
|
||||||
|
*
|
||||||
|
* @since 2012
|
||||||
|
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||||
|
*/
|
||||||
|
int calculateSizeToAllocate(int maxLen, int LCSmin);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Internal function initializing the int array corresponding to a sequence to align with default values.
|
||||||
|
*
|
||||||
|
* @param seq The int array corresponding to the sequence to align, as prepared by putSeqInSeq() or putBlobInSeq().
|
||||||
|
* @param size The number of positions to initialize.
|
||||||
|
* @param iniValue The value that the positions should be initialized to.
|
||||||
|
*
|
||||||
|
* @since 2012
|
||||||
|
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||||
|
*/
|
||||||
|
void iniSeq(int16_t* seq, int size, int16_t iniValue);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Internal function building the int array corresponding to a sequence to align.
|
||||||
|
*
|
||||||
|
* Each nucleotide is stored as a short int (int16_t).
|
||||||
|
*
|
||||||
|
* @param seq A pointer on the allocated int array.
|
||||||
|
* @param s A pointer on the character string corresponding to the sequence.
|
||||||
|
* @param l The length of the sequence.
|
||||||
|
* @param reverse A boolean indicating whether the sequence should be written reversed
|
||||||
|
* (for the second sequence to align).
|
||||||
|
*
|
||||||
|
* @since 2012
|
||||||
|
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||||
|
*/
|
||||||
|
void putSeqInSeq(int16_t* seq, char* s, int l, bool reverse);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Internal function building the int array corresponding to an obiblob containing a sequence.
|
||||||
|
*
|
||||||
|
* Each nucleotide is stored as a short int (int16_t).
|
||||||
|
*
|
||||||
|
* @param seq A pointer on the allocated int array.
|
||||||
|
* @param b A pointer on the obiblob containing the sequence.
|
||||||
|
* @param l The length of the (decoded) sequence.
|
||||||
|
* @param reverse A boolean indicating whether the sequence should be written reversed
|
||||||
|
* (for the second sequence to align).
|
||||||
|
*
|
||||||
|
* @since 2012
|
||||||
|
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||||
|
*/
|
||||||
|
void putBlobInSeq(int16_t* seq, Obi_blob_p b, int l, bool reverse);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Internal function preparing an int array with the initial values for the alignment lengths before the alignment.
|
||||||
|
*
|
||||||
|
* The int array containing the initial alignment lengths (corresponding to the first line of the diagonalized band of the alignment matrix)
|
||||||
|
* needs to be initialized with external gap lengths before the alignment.
|
||||||
|
*
|
||||||
|
* @param address A pointer, aligned on a 16 bits boundary, on the int array where the initial values for the alignment length are to be stored.
|
||||||
|
* @param bandLengthTotal The length of the complete band for the banded alignment, as computed by calculateSSEBandLength().
|
||||||
|
* @param bandLengthLeft The length of the left band for the banded alignment, as computed by calculateLeftBandLength().
|
||||||
|
* @param lmax The length of the longest sequence to align.
|
||||||
|
*
|
||||||
|
* @since 2012
|
||||||
|
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||||
|
*/
|
||||||
|
void initializeAddressWithGaps(int16_t* address, int bandLengthTotal, int bandLengthLeft, int lmax);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Internal function aligning two sequences, computing the lengths of their Longest Common Subsequence and of their alignment.
|
||||||
|
*
|
||||||
|
* @warning The first argument (seq1) must correspond to the longest sequence.
|
||||||
|
*
|
||||||
|
* @param seq1 The first sequence, the longest of the two, as prepared by putSeqInSeq() or putBlobInSeq().
|
||||||
|
* @param seq2 The second sequence, the shortest of the two, as prepared by putSeqInSeq() or putBlobInSeq().
|
||||||
|
* @param l1 The length of the first sequence.
|
||||||
|
* @param l2 The length of the second sequence.
|
||||||
|
* @param normalize Whether the score should be normalized with the reference sequence length.
|
||||||
|
* @param reference The reference length. 0: The alignment length; 1: The longest sequence's length; 2: The shortest sequence's length.
|
||||||
|
* @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false).
|
||||||
|
* @param address A pointer, aligned on a 16 bits boundary, on an allocated int array where the initial values for the alignment length will be stored.
|
||||||
|
* @param LCSmin The minimum length of the LCS to be above the chosen threshold, as computed by calculateLCSmin().
|
||||||
|
* @param lcs_length A pointer on the int where the LCS length will be stored.
|
||||||
|
* @param ali_length A pointer on the int where the alignment length will be stored.
|
||||||
|
*
|
||||||
|
* @returns The alignment score (normalized according to the parameters).
|
||||||
|
*
|
||||||
|
* @since 2012
|
||||||
|
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||||
|
*/
|
||||||
|
double sse_banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, bool normalize, int reference, bool similarity_mode, int16_t* address, int LCSmin, int* lcs_length, int* ali_length);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/************************************************************************
|
||||||
|
*
|
||||||
|
* D E F I N I T I O N O F T H E P R I V A T E F U N C T I O N S
|
||||||
|
*
|
||||||
|
************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
static void printreg(__m128i r)
|
static void printreg(__m128i r)
|
||||||
{
|
{
|
||||||
int16_t a0,a1,a2,a3,a4,a5,a6,a7;
|
int16_t a0,a1,a2,a3,a4,a5,a6,a7;
|
||||||
@ -61,7 +292,6 @@ static inline int extract_reg(__m128i r, int p)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// TODO warning on length order
|
|
||||||
void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal, int16_t* address, int* lcs_length, int* ali_length)
|
void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal, int16_t* address, int* lcs_length, int* ali_length)
|
||||||
{
|
{
|
||||||
register int j;
|
register int j;
|
||||||
@ -287,7 +517,6 @@ void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// TODO warning on length order
|
|
||||||
void sse_banded_align_just_lcs(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal, int* lcs_length)
|
void sse_banded_align_just_lcs(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal, int* lcs_length)
|
||||||
{
|
{
|
||||||
register int j;
|
register int j;
|
||||||
@ -319,7 +548,7 @@ void sse_banded_align_just_lcs(int16_t* seq1, int16_t* seq2, int l1, int l2, int
|
|||||||
// Initialisations
|
// Initialisations
|
||||||
|
|
||||||
odd_BLL = bandLengthLeft & 1;
|
odd_BLL = bandLengthLeft & 1;
|
||||||
even_BLL = !odd_BLL;
|
even_BLL = !odd_BLL;
|
||||||
|
|
||||||
numberOfRegistersPerLine = bandLengthTotal / 8;
|
numberOfRegistersPerLine = bandLengthTotal / 8;
|
||||||
numberOfRegistersFor3Lines = 3 * numberOfRegistersPerLine;
|
numberOfRegistersFor3Lines = 3 * numberOfRegistersPerLine;
|
||||||
@ -446,15 +675,14 @@ int calculateSSEBandLength(int bandLengthRight, int bandLengthLeft)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// TODO that's gonna be fun to doc
|
int calculateSizeToAllocate(int maxLen, int LCSmin)
|
||||||
int calculateSizeToAllocate(int maxLen, int minLen, int LCSmin)
|
|
||||||
{
|
{
|
||||||
int size;
|
int size;
|
||||||
|
|
||||||
size = calculateLeftBandLength(maxLen, LCSmin);
|
size = calculateLeftBandLength(maxLen, LCSmin);
|
||||||
|
|
||||||
size *= 2;
|
size *= 2;
|
||||||
size = (size & (~ (int)7)) + (( size & (int)7) ? 8:0); // Closest greater 8 multiple
|
size = (size & (~ (int)7)) + ((size & (int)7) ? 8:0); // Closest greater 8 multiple
|
||||||
size *= 3;
|
size *= 3;
|
||||||
size += 16;
|
size += 16;
|
||||||
|
|
||||||
@ -522,13 +750,13 @@ void putBlobInSeq(int16_t* seq, Obi_blob_p b, int l, bool reverse)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void initializeAddressWithGaps(int16_t* address, int bandLengthTotal, int bandLengthLeft, int l1)
|
void initializeAddressWithGaps(int16_t* address, int bandLengthTotal, int bandLengthLeft, int lmax)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
int address_00, x_address_10, address_01, address_01_shifted;
|
int address_00, x_address_10, address_01, address_01_shifted;
|
||||||
int numberOfRegistersPerLine;
|
int numberOfRegistersPerLine;
|
||||||
int bm;
|
int bm;
|
||||||
int value=INT16_MAX-l1;
|
int value=INT16_MAX-lmax;
|
||||||
|
|
||||||
numberOfRegistersPerLine = bandLengthTotal / 8;
|
numberOfRegistersPerLine = bandLengthTotal / 8;
|
||||||
bm = bandLengthLeft%2;
|
bm = bandLengthLeft%2;
|
||||||
@ -556,7 +784,6 @@ void initializeAddressWithGaps(int16_t* address, int bandLengthTotal, int bandLe
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// TODO warning on length order
|
|
||||||
double sse_banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, bool normalize, int reference, bool similarity_mode, int16_t* address, int LCSmin, int* lcs_length, int* ali_length)
|
double sse_banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, bool normalize, int reference, bool similarity_mode, int16_t* address, int LCSmin, int* lcs_length, int* ali_length)
|
||||||
{
|
{
|
||||||
double id;
|
double id;
|
||||||
@ -610,10 +837,14 @@ double sse_banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, bool n
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
// PUBLIC FUNCTIONS
|
/**********************************************************************
|
||||||
|
*
|
||||||
|
* D E F I N I T I O N O F T H E P U B L I C F U N C T I O N S
|
||||||
|
*
|
||||||
|
**********************************************************************/
|
||||||
|
|
||||||
|
|
||||||
int calculateLCSmin(int l1, int l2, double threshold, bool normalize, int reference, bool similarity_mode)
|
int calculateLCSmin(int lmax, int lmin, double threshold, bool normalize, int reference, bool similarity_mode)
|
||||||
{
|
{
|
||||||
int LCSmin;
|
int LCSmin;
|
||||||
|
|
||||||
@ -622,16 +853,16 @@ int calculateLCSmin(int l1, int l2, double threshold, bool normalize, int refere
|
|||||||
if (normalize)
|
if (normalize)
|
||||||
{
|
{
|
||||||
if (reference == MINLEN)
|
if (reference == MINLEN)
|
||||||
LCSmin = threshold*l2;
|
LCSmin = threshold*lmin;
|
||||||
else // ref = maxlen or alilen
|
else // ref = maxlen or alilen
|
||||||
LCSmin = threshold*l1;
|
LCSmin = threshold*lmax;
|
||||||
}
|
}
|
||||||
else if (similarity_mode)
|
else if (similarity_mode)
|
||||||
LCSmin = threshold;
|
LCSmin = threshold;
|
||||||
else if (reference == MINLEN) // not similarity_mode
|
else if (reference == MINLEN) // not similarity_mode
|
||||||
LCSmin = l2 - threshold;
|
LCSmin = lmin - threshold;
|
||||||
else // not similarity_mode and ref = maxlen or alilen
|
else // not similarity_mode and ref = maxlen or alilen
|
||||||
LCSmin = l1 - threshold;
|
LCSmin = lmax - threshold;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
LCSmin = 0;
|
LCSmin = 0;
|
||||||
@ -679,7 +910,7 @@ double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bo
|
|||||||
// Allocate space for matrix band if the alignment length must be computed
|
// Allocate space for matrix band if the alignment length must be computed
|
||||||
if ((reference == ALILEN) && (normalize || !similarity_mode)) // cases in which alignment length must be computed
|
if ((reference == ALILEN) && (normalize || !similarity_mode)) // cases in which alignment length must be computed
|
||||||
{
|
{
|
||||||
sizeToAllocateForBand = calculateSizeToAllocate(lmax, lmin, LCSmin);
|
sizeToAllocateForBand = calculateSizeToAllocate(lmax, LCSmin);
|
||||||
address = obi_get_memory_aligned_on_16(sizeToAllocateForBand, &shift);
|
address = obi_get_memory_aligned_on_16(sizeToAllocateForBand, &shift);
|
||||||
if (address == NULL)
|
if (address == NULL)
|
||||||
{
|
{
|
||||||
@ -774,13 +1005,13 @@ double obiblob_sse_banded_lcs_align(Obi_blob_p seq1, Obi_blob_p seq2, double thr
|
|||||||
// Allocate space for matrix band if the alignment length must be computed
|
// Allocate space for matrix band if the alignment length must be computed
|
||||||
if ((reference == ALILEN) && (normalize || !similarity_mode)) // cases in which alignment length must be computed
|
if ((reference == ALILEN) && (normalize || !similarity_mode)) // cases in which alignment length must be computed
|
||||||
{
|
{
|
||||||
sizeToAllocateForBand = calculateSizeToAllocate(lmax, lmin, LCSmin);
|
sizeToAllocateForBand = calculateSizeToAllocate(lmax, LCSmin);
|
||||||
address = obi_get_memory_aligned_on_16(sizeToAllocateForBand, &shift);
|
address = obi_get_memory_aligned_on_16(sizeToAllocateForBand, &shift);
|
||||||
if (address == NULL)
|
if (address == NULL)
|
||||||
{
|
{
|
||||||
obi_set_errno(OBI_MALLOC_ERROR);
|
obi_set_errno(OBI_MALLOC_ERROR);
|
||||||
obidebug(1, "\nError getting a memory address aligned on 16 bytes boundary");
|
obidebug(1, "\nError getting a memory address aligned on a 16 bits boundary");
|
||||||
return 0; // TODO DOUBLE_MIN
|
return 0; // TODO DOUBLE_MIN to flag error
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,10 +1,15 @@
|
|||||||
/*
|
/****************************************************************************
|
||||||
* sse_banded_LCS_alignment.h
|
* LCS alignment of two sequences header file *
|
||||||
*
|
****************************************************************************/
|
||||||
* Created on: november 29, 2012
|
|
||||||
* Author: mercier
|
/**
|
||||||
|
* @file sse_banded_LCS_alignment.h
|
||||||
|
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||||
|
* @date November 7th 2012
|
||||||
|
* @brief header file for the functions handling the alignment of two sequences to compute their Longest Common Sequence.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
#ifndef SSE_BANDED_LCS_ALIGNMENT_H_
|
#ifndef SSE_BANDED_LCS_ALIGNMENT_H_
|
||||||
#define SSE_BANDED_LCS_ALIGNMENT_H_
|
#define SSE_BANDED_LCS_ALIGNMENT_H_
|
||||||
|
|
||||||
@ -15,13 +20,97 @@
|
|||||||
#include "obiblob.h"
|
#include "obiblob.h"
|
||||||
|
|
||||||
|
|
||||||
#define ALILEN (0) // TODO enum
|
/**
|
||||||
|
* @brief Macros for reference lengths to use when aligning.
|
||||||
|
*
|
||||||
|
* @since 2012
|
||||||
|
* @author Eric Coissac (eric.coissac@metabarcoding.org)
|
||||||
|
*/
|
||||||
|
#define ALILEN (0)
|
||||||
#define MAXLEN (1)
|
#define MAXLEN (1)
|
||||||
#define MINLEN (2)
|
#define MINLEN (2)
|
||||||
|
|
||||||
// TODO doc
|
|
||||||
int calculateLCSmin(int l1, int l2, double threshold, bool normalize, int reference, bool lcsmode);
|
/**
|
||||||
|
* @brief Function calculating the minimum length of the Longest Common Subsequence between two sequences to be above a chosen score threshold.
|
||||||
|
*
|
||||||
|
* @warning The first argument (lmax) must correspond to length of the longest sequence.
|
||||||
|
*
|
||||||
|
* @param lmax The length of the longest sequence to align.
|
||||||
|
* @param lmin The length of the shortest sequence to align.
|
||||||
|
* @param threshold Score threshold. If the score is normalized and expressed in similarity, it is an identity, e.g. 0.95
|
||||||
|
* for an identity of 95%. If the score is normalized and expressed in distance, it is (1.0 - identity),
|
||||||
|
* e.g. 0.05 for an identity of 95%. If the score is not normalized and expressed in similarity, it is
|
||||||
|
* the length of the Longest Common Subsequence. If the score is not normalized and expressed in distance,
|
||||||
|
* it is (reference length - LCS length). Only sequence pairs with a similarity above the threshold are printed.
|
||||||
|
* @param normalize Whether the score should be normalized with the reference sequence length.
|
||||||
|
* @param reference The reference length. 0: The alignment length; 1: The longest sequence's length; 2: The shortest sequence's length. // TODO
|
||||||
|
* @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false).
|
||||||
|
*
|
||||||
|
* @returns The minimum length of the Longest Common Subsequence between two sequences to be above the chosen score threshold.
|
||||||
|
*
|
||||||
|
* @since 2012
|
||||||
|
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||||
|
*/
|
||||||
|
int calculateLCSmin(int lmax, int lmin, double threshold, bool normalize, int reference, bool similarity_mode);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Function aligning two sequences.
|
||||||
|
*
|
||||||
|
* The alignment algorithm is a banded global alignment algorithm, a modified version of the classical Needleman and Wunsch algorithm,
|
||||||
|
* and uses indices based on the length of the Longest Common Subsequence between the two sequences.
|
||||||
|
*
|
||||||
|
* Note: the sequences do not need to be ordered (e.g. with the longest sequence as first argument).
|
||||||
|
*
|
||||||
|
* @param seq1 A pointer on the character string corresponding to the first sequence.
|
||||||
|
* @param seq2 A pointer on the character string corresponding to the second sequence.
|
||||||
|
* @param threshold Score threshold. If the score is normalized and expressed in similarity, it is an identity, e.g. 0.95
|
||||||
|
* for an identity of 95%. If the score is normalized and expressed in distance, it is (1.0 - identity),
|
||||||
|
* e.g. 0.05 for an identity of 95%. If the score is not normalized and expressed in similarity, it is
|
||||||
|
* the length of the Longest Common Subsequence. If the score is not normalized and expressed in distance,
|
||||||
|
* it is (reference length - LCS length). Only sequence pairs with a similarity above the threshold are printed.
|
||||||
|
* @param normalize Whether the score should be normalized with the reference sequence length.
|
||||||
|
* @param reference The reference length. 0: The alignment length; 1: The longest sequence's length; 2: The shortest sequence's length. // TODO
|
||||||
|
* @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false).
|
||||||
|
* @param lcs_length A pointer on the int where the LCS length will be stored.
|
||||||
|
* @param ali_length A pointer on the int where the alignment length will be stored.
|
||||||
|
*
|
||||||
|
* @returns The alignment score (normalized according to the parameters).
|
||||||
|
*
|
||||||
|
* @since 2012
|
||||||
|
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||||
|
*/
|
||||||
double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bool normalize, int reference, bool similarity_mode, int* lcs_length, int* ali_length);
|
double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bool normalize, int reference, bool similarity_mode, int* lcs_length, int* ali_length);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Function aligning two sequences encoded in obiblobs.
|
||||||
|
*
|
||||||
|
* The alignment algorithm is a banded global alignment algorithm, a modified version of the classical Needleman and Wunsch algorithm,
|
||||||
|
* and uses indices based on the length of the Longest Common Subsequence between the two sequences.
|
||||||
|
*
|
||||||
|
* Note: the obiblobs do not need to be ordered (e.g. with the obiblob containing the longest sequence as first argument).
|
||||||
|
*
|
||||||
|
* @param seq1 A pointer on the blob containing the first sequence.
|
||||||
|
* @param seq2 A pointer on the blob containing the second sequence.
|
||||||
|
* @param threshold Score threshold. If the score is normalized and expressed in similarity, it is an identity, e.g. 0.95
|
||||||
|
* for an identity of 95%. If the score is normalized and expressed in distance, it is (1.0 - identity),
|
||||||
|
* e.g. 0.05 for an identity of 95%. If the score is not normalized and expressed in similarity, it is
|
||||||
|
* the length of the Longest Common Subsequence. If the score is not normalized and expressed in distance,
|
||||||
|
* it is (reference length - LCS length). Only sequence pairs with a similarity above the threshold are printed.
|
||||||
|
* @param normalize Whether the score should be normalized with the reference sequence length.
|
||||||
|
* @param reference The reference length. 0: The alignment length; 1: The longest sequence's length; 2: The shortest sequence's length. // TODO
|
||||||
|
* @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false).
|
||||||
|
* @param lcs_length A pointer on the int where the LCS length will be stored.
|
||||||
|
* @param ali_length A pointer on the int where the alignment length will be stored.
|
||||||
|
*
|
||||||
|
* @returns The alignment score (normalized according to the parameters).
|
||||||
|
*
|
||||||
|
* @since December 2016
|
||||||
|
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||||
|
*/
|
||||||
double obiblob_sse_banded_lcs_align(Obi_blob_p seq1, Obi_blob_p seq2, double threshold, bool normalize, int reference, bool similarity_mode, int* lcs_length, int* ali_length);
|
double obiblob_sse_banded_lcs_align(Obi_blob_p seq1, Obi_blob_p seq2, double threshold, bool normalize, int reference, bool similarity_mode, int* lcs_length, int* ali_length);
|
||||||
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
Reference in New Issue
Block a user