LCS alignment: documentation for all the lowest level functions

This commit is contained in:
Celine Mercier
2016-12-22 17:03:51 +01:00
parent 5c50e5b378
commit 30e4359c85
2 changed files with 352 additions and 32 deletions

View File

@ -1,16 +1,22 @@
/*
* sse_banded_LCS_alignment.c
*
* Created on: 7 nov. 2012
* Author: celine mercier
/****************************************************************************
* LCS alignment of two sequences *
****************************************************************************/
/**
* @file sse_banded_LCS_alignment.c
* @author Celine Mercier (celine.mercier@metabarcoding.org)
* @date November 7th 2012
* @brief Functions handling the alignment of two sequences to compute their Longest Common Sequence.
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <stdint.h>
#include <stdbool.h>
#include <limits.h>
#include "obierrno.h"
#include "obidebug.h"
@ -24,6 +30,231 @@
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
/**************************************************************************
*
* D E C L A R A T I O N O F T H E P R I V A T E F U N C T I O N S
*
**************************************************************************/
/**
* @brief Internal function printing a 128 bits register as 8 16-bits integers.
*
* @param r The register to print.
*
* @author Eric Coissac (eric.coissac@metabarcoding.org)
*/
static void printreg(__m128i r);
/**
* @brief Internal function extracting a 16-bits integer from a 128 bits register.
*
* @param r The register to read.
* @param p The position at which the integer should be read (between 0 and 7).
*
* @returns The extracted integer.
*
* @author Eric Coissac (eric.coissac@metabarcoding.org)
*/
static inline int extract_reg(__m128i r, int p);
/**
* @brief Internal function aligning two sequences, computing the lengths of their Longest Common Subsequence and of their alignment.
*
* @warning The first argument (seq1) must correspond to the longest sequence.
*
* @param seq1 The first sequence, the longest of the two, as prepared by putSeqInSeq() or putBlobInSeq().
* @param seq2 The second sequence, the shortest of the two, as prepared by putSeqInSeq() or putBlobInSeq().
* @param l1 The length of the first sequence.
* @param l2 The length of the second sequence.
* @param bandLengthLeft The length of the left band for the banded alignment, as computed by calculateLeftBandLength().
* @param bandLengthTotal The length of the complete band for the banded alignment, as computed by calculateSSEBandLength().
* @param address A pointer, aligned on a 16 bits boundary, on the int array where the initial values for the alignment length are stored,
* as prepared for the alignment by initializeAddressWithGaps().
* @param lcs_length A pointer on the int where the LCS length will be stored.
* @param ali_length A pointer on the int where the alignment length will be stored.
*
* @since 2012
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal, int16_t* address, int* lcs_length, int* ali_length);
/**
* @brief Internal function aligning two sequences, computing the length of their Longest Common Subsequence (and not the alignment length).
*
* @warning The first argument (seq1) must correspond to the longest sequence.
*
* @param seq1 The first sequence, the longest of the two, as prepared by putSeqInSeq() or putBlobInSeq().
* @param seq2 The second sequence, the shortest of the two, as prepared by putSeqInSeq() or putBlobInSeq().
* @param l1 The length of the first sequence.
* @param l2 The length of the second sequence.
* @param bandLengthLeft The length of the left band for the banded alignment, as computed by calculateLeftBandLength().
* @param bandLengthTotal The length of the complete band for the banded alignment, as computed by calculateSSEBandLength().
* @param lcs_length A pointer on the int where the LCS length will be stored.
*
* @since 2012
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
void sse_banded_align_just_lcs(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal, int* lcs_length);
/**
* @brief Internal function calculating the length of the left band for the banded alignment.
*
* @param lmax The length of the longest sequence to align.
* @param LCSmin The minimum length of the LCS to be above the chosen threshold, as computed by calculateLCSmin().
*
* @returns The length of the left band.
*
* @since 2012
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
int calculateLeftBandLength(int lmax, int LCSmin);
/**
* @brief Internal function calculating the length of the right band for the banded alignment.
*
* @param lmin The length of the shortest sequence to align.
* @param LCSmin The minimum length of the LCS to be above the chosen threshold, as computed by calculateLCSmin().
*
* @returns The length of the right band.
*
* @since 2012
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
int calculateRightBandLength(int lmin, int LCSmin);
/**
* @brief Internal function calculating the length of the complete band for the banded alignment.
*
* @param bandLengthRight The length of the right band for the banded alignment, as computed by calculateRightBandLength().
* @param bandLengthLeft The length of the left band for the banded alignment, as computed by calculateLeftBandLength().
*
* @returns The length of the complete band.
*
* @since 2012
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
int calculateSSEBandLength(int bandLengthRight, int bandLengthLeft);
/**
* @brief Internal function calculating the size to allocate for the int array where the alignment length will be stored in the matrix.
*
* @param maxLen The length of the longest sequence to align.
* @param LCSmin The minimum length of the LCS to be above the chosen threshold, as computed by calculateLCSmin().
*
* @returns The size to allocate in bytes.
*
* @since 2012
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
int calculateSizeToAllocate(int maxLen, int LCSmin);
/**
* @brief Internal function initializing the int array corresponding to a sequence to align with default values.
*
* @param seq The int array corresponding to the sequence to align, as prepared by putSeqInSeq() or putBlobInSeq().
* @param size The number of positions to initialize.
* @param iniValue The value that the positions should be initialized to.
*
* @since 2012
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
void iniSeq(int16_t* seq, int size, int16_t iniValue);
/**
* @brief Internal function building the int array corresponding to a sequence to align.
*
* Each nucleotide is stored as a short int (int16_t).
*
* @param seq A pointer on the allocated int array.
* @param s A pointer on the character string corresponding to the sequence.
* @param l The length of the sequence.
* @param reverse A boolean indicating whether the sequence should be written reversed
* (for the second sequence to align).
*
* @since 2012
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
void putSeqInSeq(int16_t* seq, char* s, int l, bool reverse);
/**
* @brief Internal function building the int array corresponding to an obiblob containing a sequence.
*
* Each nucleotide is stored as a short int (int16_t).
*
* @param seq A pointer on the allocated int array.
* @param b A pointer on the obiblob containing the sequence.
* @param l The length of the (decoded) sequence.
* @param reverse A boolean indicating whether the sequence should be written reversed
* (for the second sequence to align).
*
* @since 2012
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
void putBlobInSeq(int16_t* seq, Obi_blob_p b, int l, bool reverse);
/**
* @brief Internal function preparing an int array with the initial values for the alignment lengths before the alignment.
*
* The int array containing the initial alignment lengths (corresponding to the first line of the diagonalized band of the alignment matrix)
* needs to be initialized with external gap lengths before the alignment.
*
* @param address A pointer, aligned on a 16 bits boundary, on the int array where the initial values for the alignment length are to be stored.
* @param bandLengthTotal The length of the complete band for the banded alignment, as computed by calculateSSEBandLength().
* @param bandLengthLeft The length of the left band for the banded alignment, as computed by calculateLeftBandLength().
* @param lmax The length of the longest sequence to align.
*
* @since 2012
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
void initializeAddressWithGaps(int16_t* address, int bandLengthTotal, int bandLengthLeft, int lmax);
/**
* @brief Internal function aligning two sequences, computing the lengths of their Longest Common Subsequence and of their alignment.
*
* @warning The first argument (seq1) must correspond to the longest sequence.
*
* @param seq1 The first sequence, the longest of the two, as prepared by putSeqInSeq() or putBlobInSeq().
* @param seq2 The second sequence, the shortest of the two, as prepared by putSeqInSeq() or putBlobInSeq().
* @param l1 The length of the first sequence.
* @param l2 The length of the second sequence.
* @param normalize Whether the score should be normalized with the reference sequence length.
* @param reference The reference length. 0: The alignment length; 1: The longest sequence's length; 2: The shortest sequence's length.
* @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false).
* @param address A pointer, aligned on a 16 bits boundary, on an allocated int array where the initial values for the alignment length will be stored.
* @param LCSmin The minimum length of the LCS to be above the chosen threshold, as computed by calculateLCSmin().
* @param lcs_length A pointer on the int where the LCS length will be stored.
* @param ali_length A pointer on the int where the alignment length will be stored.
*
* @returns The alignment score (normalized according to the parameters).
*
* @since 2012
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
double sse_banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, bool normalize, int reference, bool similarity_mode, int16_t* address, int LCSmin, int* lcs_length, int* ali_length);
/************************************************************************
*
* D E F I N I T I O N O F T H E P R I V A T E F U N C T I O N S
*
************************************************************************/
static void printreg(__m128i r)
{
int16_t a0,a1,a2,a3,a4,a5,a6,a7;
@ -61,7 +292,6 @@ static inline int extract_reg(__m128i r, int p)
}
// TODO warning on length order
void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal, int16_t* address, int* lcs_length, int* ali_length)
{
register int j;
@ -287,7 +517,6 @@ void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int
}
// TODO warning on length order
void sse_banded_align_just_lcs(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal, int* lcs_length)
{
register int j;
@ -446,15 +675,14 @@ int calculateSSEBandLength(int bandLengthRight, int bandLengthLeft)
}
// TODO that's gonna be fun to doc
int calculateSizeToAllocate(int maxLen, int minLen, int LCSmin)
int calculateSizeToAllocate(int maxLen, int LCSmin)
{
int size;
size = calculateLeftBandLength(maxLen, LCSmin);
size *= 2;
size = (size & (~ (int)7)) + (( size & (int)7) ? 8:0); // Closest greater 8 multiple
size = (size & (~ (int)7)) + ((size & (int)7) ? 8:0); // Closest greater 8 multiple
size *= 3;
size += 16;
@ -522,13 +750,13 @@ void putBlobInSeq(int16_t* seq, Obi_blob_p b, int l, bool reverse)
}
void initializeAddressWithGaps(int16_t* address, int bandLengthTotal, int bandLengthLeft, int l1)
void initializeAddressWithGaps(int16_t* address, int bandLengthTotal, int bandLengthLeft, int lmax)
{
int i;
int address_00, x_address_10, address_01, address_01_shifted;
int numberOfRegistersPerLine;
int bm;
int value=INT16_MAX-l1;
int value=INT16_MAX-lmax;
numberOfRegistersPerLine = bandLengthTotal / 8;
bm = bandLengthLeft%2;
@ -556,7 +784,6 @@ void initializeAddressWithGaps(int16_t* address, int bandLengthTotal, int bandLe
}
// TODO warning on length order
double sse_banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, bool normalize, int reference, bool similarity_mode, int16_t* address, int LCSmin, int* lcs_length, int* ali_length)
{
double id;
@ -610,10 +837,14 @@ double sse_banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, bool n
// PUBLIC FUNCTIONS
/**********************************************************************
*
* D E F I N I T I O N O F T H E P U B L I C F U N C T I O N S
*
**********************************************************************/
int calculateLCSmin(int l1, int l2, double threshold, bool normalize, int reference, bool similarity_mode)
int calculateLCSmin(int lmax, int lmin, double threshold, bool normalize, int reference, bool similarity_mode)
{
int LCSmin;
@ -622,16 +853,16 @@ int calculateLCSmin(int l1, int l2, double threshold, bool normalize, int refere
if (normalize)
{
if (reference == MINLEN)
LCSmin = threshold*l2;
LCSmin = threshold*lmin;
else // ref = maxlen or alilen
LCSmin = threshold*l1;
LCSmin = threshold*lmax;
}
else if (similarity_mode)
LCSmin = threshold;
else if (reference == MINLEN) // not similarity_mode
LCSmin = l2 - threshold;
LCSmin = lmin - threshold;
else // not similarity_mode and ref = maxlen or alilen
LCSmin = l1 - threshold;
LCSmin = lmax - threshold;
}
else
LCSmin = 0;
@ -679,7 +910,7 @@ double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bo
// Allocate space for matrix band if the alignment length must be computed
if ((reference == ALILEN) && (normalize || !similarity_mode)) // cases in which alignment length must be computed
{
sizeToAllocateForBand = calculateSizeToAllocate(lmax, lmin, LCSmin);
sizeToAllocateForBand = calculateSizeToAllocate(lmax, LCSmin);
address = obi_get_memory_aligned_on_16(sizeToAllocateForBand, &shift);
if (address == NULL)
{
@ -774,13 +1005,13 @@ double obiblob_sse_banded_lcs_align(Obi_blob_p seq1, Obi_blob_p seq2, double thr
// Allocate space for matrix band if the alignment length must be computed
if ((reference == ALILEN) && (normalize || !similarity_mode)) // cases in which alignment length must be computed
{
sizeToAllocateForBand = calculateSizeToAllocate(lmax, lmin, LCSmin);
sizeToAllocateForBand = calculateSizeToAllocate(lmax, LCSmin);
address = obi_get_memory_aligned_on_16(sizeToAllocateForBand, &shift);
if (address == NULL)
{
obi_set_errno(OBI_MALLOC_ERROR);
obidebug(1, "\nError getting a memory address aligned on 16 bytes boundary");
return 0; // TODO DOUBLE_MIN
obidebug(1, "\nError getting a memory address aligned on a 16 bits boundary");
return 0; // TODO DOUBLE_MIN to flag error
}
}

View File

@ -1,10 +1,15 @@
/*
* sse_banded_LCS_alignment.h
*
* Created on: november 29, 2012
* Author: mercier
/****************************************************************************
* LCS alignment of two sequences header file *
****************************************************************************/
/**
* @file sse_banded_LCS_alignment.h
* @author Celine Mercier (celine.mercier@metabarcoding.org)
* @date November 7th 2012
* @brief header file for the functions handling the alignment of two sequences to compute their Longest Common Sequence.
*/
#ifndef SSE_BANDED_LCS_ALIGNMENT_H_
#define SSE_BANDED_LCS_ALIGNMENT_H_
@ -15,13 +20,97 @@
#include "obiblob.h"
#define ALILEN (0) // TODO enum
/**
* @brief Macros for reference lengths to use when aligning.
*
* @since 2012
* @author Eric Coissac (eric.coissac@metabarcoding.org)
*/
#define ALILEN (0)
#define MAXLEN (1)
#define MINLEN (2)
// TODO doc
int calculateLCSmin(int l1, int l2, double threshold, bool normalize, int reference, bool lcsmode);
/**
* @brief Function calculating the minimum length of the Longest Common Subsequence between two sequences to be above a chosen score threshold.
*
* @warning The first argument (lmax) must correspond to length of the longest sequence.
*
* @param lmax The length of the longest sequence to align.
* @param lmin The length of the shortest sequence to align.
* @param threshold Score threshold. If the score is normalized and expressed in similarity, it is an identity, e.g. 0.95
* for an identity of 95%. If the score is normalized and expressed in distance, it is (1.0 - identity),
* e.g. 0.05 for an identity of 95%. If the score is not normalized and expressed in similarity, it is
* the length of the Longest Common Subsequence. If the score is not normalized and expressed in distance,
* it is (reference length - LCS length). Only sequence pairs with a similarity above the threshold are printed.
* @param normalize Whether the score should be normalized with the reference sequence length.
* @param reference The reference length. 0: The alignment length; 1: The longest sequence's length; 2: The shortest sequence's length. // TODO
* @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false).
*
* @returns The minimum length of the Longest Common Subsequence between two sequences to be above the chosen score threshold.
*
* @since 2012
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
int calculateLCSmin(int lmax, int lmin, double threshold, bool normalize, int reference, bool similarity_mode);
/**
* @brief Function aligning two sequences.
*
* The alignment algorithm is a banded global alignment algorithm, a modified version of the classical Needleman and Wunsch algorithm,
* and uses indices based on the length of the Longest Common Subsequence between the two sequences.
*
* Note: the sequences do not need to be ordered (e.g. with the longest sequence as first argument).
*
* @param seq1 A pointer on the character string corresponding to the first sequence.
* @param seq2 A pointer on the character string corresponding to the second sequence.
* @param threshold Score threshold. If the score is normalized and expressed in similarity, it is an identity, e.g. 0.95
* for an identity of 95%. If the score is normalized and expressed in distance, it is (1.0 - identity),
* e.g. 0.05 for an identity of 95%. If the score is not normalized and expressed in similarity, it is
* the length of the Longest Common Subsequence. If the score is not normalized and expressed in distance,
* it is (reference length - LCS length). Only sequence pairs with a similarity above the threshold are printed.
* @param normalize Whether the score should be normalized with the reference sequence length.
* @param reference The reference length. 0: The alignment length; 1: The longest sequence's length; 2: The shortest sequence's length. // TODO
* @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false).
* @param lcs_length A pointer on the int where the LCS length will be stored.
* @param ali_length A pointer on the int where the alignment length will be stored.
*
* @returns The alignment score (normalized according to the parameters).
*
* @since 2012
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, bool normalize, int reference, bool similarity_mode, int* lcs_length, int* ali_length);
/**
* @brief Function aligning two sequences encoded in obiblobs.
*
* The alignment algorithm is a banded global alignment algorithm, a modified version of the classical Needleman and Wunsch algorithm,
* and uses indices based on the length of the Longest Common Subsequence between the two sequences.
*
* Note: the obiblobs do not need to be ordered (e.g. with the obiblob containing the longest sequence as first argument).
*
* @param seq1 A pointer on the blob containing the first sequence.
* @param seq2 A pointer on the blob containing the second sequence.
* @param threshold Score threshold. If the score is normalized and expressed in similarity, it is an identity, e.g. 0.95
* for an identity of 95%. If the score is normalized and expressed in distance, it is (1.0 - identity),
* e.g. 0.05 for an identity of 95%. If the score is not normalized and expressed in similarity, it is
* the length of the Longest Common Subsequence. If the score is not normalized and expressed in distance,
* it is (reference length - LCS length). Only sequence pairs with a similarity above the threshold are printed.
* @param normalize Whether the score should be normalized with the reference sequence length.
* @param reference The reference length. 0: The alignment length; 1: The longest sequence's length; 2: The shortest sequence's length. // TODO
* @param similarity_mode Whether the score should be expressed in similarity (true) or distance (false).
* @param lcs_length A pointer on the int where the LCS length will be stored.
* @param ali_length A pointer on the int where the alignment length will be stored.
*
* @returns The alignment score (normalized according to the parameters).
*
* @since December 2016
* @author Celine Mercier (celine.mercier@metabarcoding.org)
*/
double obiblob_sse_banded_lcs_align(Obi_blob_p seq1, Obi_blob_p seq2, double threshold, bool normalize, int reference, bool similarity_mode, int* lcs_length, int* ali_length);
#endif