Files
ecoprimers/src/libthermo/thermalign.c
2009-07-07 12:35:17 +00:00

717 lines
21 KiB
C

/*
* thermalign.cpp
* PHunterLib
*
* Created by Tiayyba Riaz on 7/2/09.
* Copyright 2009 __MyCompanyName__. All rights reserved.
*
*/
//=============================================================================
// Module: thermalign.cpp
// Project: Diploma Thesis - Probe Selection for DNA Microarrays
// Type: implementation - Thermodynamic Alignment.
// Language: c++
// Compiler: microsoft visual c++ 6.0, unix/linux gcc
// System/OS: Windows 32, Sun solaris, Linux, other unix systems (untested)
// Database: none
// Description: class CThermAlign - Thermodynamic Alignment Algorithm
// Author: kaderali
// Date: 9/2000 - 01/2001
// Copyright: (c) L. Kaderali, 9/2000 - 01/2001
//
// Revision History
// $ 00sep04 LK : created
// $ 00dec30 LK : changed to do local alignment of probe against
// one entire sequence
// $ 01feb07 LK : optimized!
// $ 01aug06 LK : corrected, included salt and concentration input;
// $ made true local alignment (problem with initial mismatches!)
// #$
//=============================================================================
#include <math.h>
#include <assert.h>
//#include <iomanip>
#include <stdio.h>
#include <stdlib.h>
#include "thermalign.h"
#define dH(a,b,c) (*(thrma->dH+((b)*(thrma->maxseq1len+1)*3+(a)*3+(c))))
#define dS(a,b,c) (*(thrma->dS+((b)*(thrma->maxseq1len+1)*3+(a)*3+(c))))
///////////////////////////////////////////////////////////////////////////////
// Construction/Destruction
///////////////////////////////////////////////////////////////////////////////
//------------------------ Constructor ----------------------------------------
// Initialize Thermalign Object. This will allocate memory for the dynamic
// programming matrices for both entropy and enthalpy of the present
// alignment. The max length of the two sequences to align has to be passed
// to the routine. Also, an initialized CNNParams Object must be passed to
// CThermAlign. Object will be initialized only once, sequences can be
// passed to it later.
//int s1lmax,s2lmax; // global requiered for DEBUGGING
void therm_initCThermAlign(PThermAlign thrma, int ALength, int BLength, PNNParams myNNParams) {
// Create dynamic programming matrices for entropy and enthalpy.
// The first two dimensions are for the two strings respectively, the
// third dimensions differentiates the alignment according to its end:
// type 0 Alignments will be those that align x(i+1) with y(j+1),
// type 1 Alignments align x(i+1) with a gap
// type 2 Alignments align y(i+1) with a gap
// All three of these types must separately be considered to select the
// optimum alignment in the next iteration.
// The alignment algorithm proceeds analogous to the well known
// Smith-Waterman algorithm for global alignments.
thrma->dH = (float*)calloc ((ALength+1)*(BLength+1)*3, sizeof (float));
thrma->dS = (float*)calloc ((ALength+1)*(BLength+1)*3, sizeof (float));
// s1lmax=ALength;
// s2lmax=BLength;
if ((thrma->dH==NULL)||(thrma->dS==NULL))
{
printf("Fatal Error: Out of memory!\n");
exit(-1);
}
thrma->NNParams = myNNParams;
thrma->seq1len = ALength;
thrma->maxseq1len = thrma->seq1len;
thrma->seq2len = BLength;
// set dH / dS entries all to zero!
#ifdef _pack
// this is FAST, but works only if #pragma pack(1) has been set...
memset(dH,0,(ALength+1)*(BLength+1)*3*sizeof(float));
memset(dS,0,(ALength+1)*(BLength+1)*3*sizeof(float));
#else
// the following is slower, but works also when padding bytes have
// been inserted in the array for alignment by the compiler...
// time is not really critical here, as this initialization is done
// only once...
int a, b;
for (a=0;a<=ALength;a++)
for (b=0;b<=BLength;b++)
{
dH(a,b,0)=0.0f;
dH(a,b,1)=0.0f;
dH(a,b,2)=0.0f;
dS(a,b,0)=0.0f;
dS(a,b,1)=0.0f;
dS(a,b,2)=0.0f;
}
#endif
therm_InitBorder(thrma); // also need to do only once...
// forbidden_entropy = (-(NNParams->R)*(float)log((NNParams->Ct)/4.0f));
return;
}
//------------------------ Destructor -----------------------------------------
// Free memory
void therm_finiCThermAlign(PThermAlign thrma)
{
free (thrma->dH);
free (thrma->dS);
return;
}
///////////////////////////////////////////////////////////////////////////////
// FUNCTION: void CThermAlign::InitStrings(char* s1,
// char* s2,
// int s1len,
// int s2len);
//
// PURPOSE: Initialize strings for alignment table!
//
// PARAMETERS:
// s1: Target sequence
// s2: Probe sequence
// s1len: Target sequence length
// s2len: Probe sequence length
//
// RETURN VALUE:
// void
//
// REVISION HISTORY
// $ 01jan03 LK : created
// $ 01feb07 LK : removed maximum local stuff
// #$
void therm_InitStrings(PThermAlign thrma, char* s1,char* s2,int s1len ,int s2len,int minxpos)
{
thrma->seq1=s1; thrma->seq1len=s1len;
thrma->seq2=s2; thrma->seq2len=s2len;
nparam_UpdateParams(thrma->NNParams, s1,s2);
/* lk01feb07: removed maxloc-stuff
// if maximum local is not in reused subtable, need to reset!
if ((maxlocj>=minxpos)||(maxloci!=seq1len))
{
maxloci=0; maxlocj=0; maxloctm=0;
}
*/
// InitBorder(); //lk01feb07: removed, as it suffices if this is done once in constructor
return;
}
///////////////////////////////////////////////////////////////////////////////
// FUNCTION: void CThermAlign::InitBorder();
//
// PURPOSE: Initialize the lower and left border of the dynamic
// programming table.
//
// PARAMETERS:
// none
//
// RETURN VALUE:
// void
//
// REVISION HISTORY
// $ 00sep06 LK : created
// $ 00dec30 LK : complete revision - simply init table now, no
// more saving etc...
// $ 01feb07 LK : removed local alignment stuff and optimized
// #$
void therm_InitBorder(PThermAlign thrma)
{
// Initialization for new alignment!
thrma->maxloctm = 0; // maximum melting temperature found (kelvin)
thrma->maxloci = 0;
thrma->maxlocj = 0;
thrma->maxloct = 0;
int i, j;
for ( i=0; i<=thrma->seq1len; i++)
{
dH(i,0,0) = 0.0f; dH(i,0,1) = 0.0f; dH(i,0,2) = 0.0f;
dS(i,0,0) = nparam_GetInitialEntropy(thrma->NNParams); dS(i,0,1) = nparam_GetInitialEntropy(thrma->NNParams);
dS(i,0,2) = nparam_GetInitialEntropy(thrma->NNParams);
}
for ( j=1; j<=thrma->seq2len; j++)
{
dH(0,j,0) = 0.0f; dH(0,j,1) = 0.0f; dH(0,j,2) = 0.0f;
dS(0,j,0) = nparam_GetInitialEntropy(thrma->NNParams); dS(0,j,1) = nparam_GetInitialEntropy(thrma->NNParams);
dS(0,j,2) = nparam_GetInitialEntropy(thrma->NNParams);
}
return;
}
///////////////////////////////////////////////////////////////////////////////
// FUNCTION: void CThermAlign::CalculateCell(int i, int j);
//
// PURPOSE: Calculate entropy and enthalpy for a given cell using a variant
// of the dynamic programming recursion, choosing greedy for
// tm->max
// x(i,j,0) = min/max {x(i-1,j-1,t) | for t in {0,1,2}} + newnn
// x(i,j,1) = min/max {x(i-1,j-1,0) + newnn, x(i-1,j,1)}
// x(i,j,2) = min/max {x(i-1,j-1,0) + newnn, x(i, j-1,2)}
// where newnn is the contribution of the newly added nearest
// neighbor term. The underlying assumption is that two
// consecutive gaps do not contribute to x. (Can however be
// easily changed, as long as an affine relationship exists
// between the length of the gap and the associated "cost").
// To estimate the melting temperature TM from Enthalpy and
// Entropy, the following formula is being used:
// TM = dH / (dS + R ln CT)
// where
// TM is the temperature at which 50% of the duplex
// are dissociated
// dH is the Enthalpy
// dS is the Entropy
// R is the gas constant
// CT is the concentration (assumed constant)
// and ln means the natural logarithm.
// One drawback is that the two separate computations of
// Entropy and Enthalpy may result in different alignments.
// We do not want that, and thus pick the one with the higher
// melting temperature TM. However, it must be warned that
// depending on the underlying nearest neighbor parameters this
// may only approximate the "true" optimum result!
//
// PARAMETERS:
// int i,
// int j - coordinates of the dp cell to calculate (=position in
// string). Requieres that (i-1,j), (i,j-1), (i-1,j-1)
// are known.
//
// RETURN VALUE:
// - will be written directly to dH/dS array in this object
//
// REVISION HISTORY
// $ 00sep06 LK : created
// $ 00dec30 LK : modified to align probe with full target (local
// alignment)
// $ 01feb07 LK : optimized and removed local alignment stuff
// $ 01feb08 LK : optimized further and merged into CalculateTable
// #$
///////////////////////////////////////////////////////////////////////////////
// FUNCTION: CThermAlign::CalculateTable();
//
// PURPOSE: Compute the entire dynamic programming table cell by cell.
// Requieres that InitBorder has been called.
// Computations will be done in the usual rowwise manner.
//
// PARAMETERS:
// int startRow: Row that computation shall commence at!
//
// RETURN VALUE:
// void - Data will be written directly to dH and dS in this object
//
// REVISION HISTORY
// $ 00sep06 : created LK
// 00dec31 : modified to begin at a specified row!
// #$
void therm_CalculateTable(PThermAlign thrma, int startRow)
{
// #define forbidden_enthalpy 1000000000000000000.0f
register int i;
register int iminusone;
register int j;
register int jminusone;
// Determine previous character!
register char prevchari;
register char prevcharj;
register char seq1char;
register char seq2char;
register float entropy1;
register float entropy2;
register float entropy3;
register float enthalpy1;
register float enthalpy2;
register float enthalpy3;
register float tm1;
register float tm2;
register float tm3;
float maxtm;
// ofstream dboutstream("debug.out");
// assert(startRow>0);
for (j=startRow; j<=thrma->seq2len; j++)
{
if (j>1)
prevcharj = thrma->seq2[j-2];
else
prevcharj = 0;
jminusone = j - 1;
seq2char = thrma->seq2[jminusone];
for (i=1; i<=thrma->seq1len; i++)
{
// local variables
if (i>1)
prevchari = thrma->seq1[i-2];
else
prevchari = 0;
iminusone = i - 1;
seq1char=thrma->seq1[iminusone];
// --------------- Upper cell: Alignment of seq1[i] with seg2[j]
entropy1 = dS(iminusone,jminusone,0);
enthalpy1 = dH(iminusone,jminusone,0) ;
entropy2 = dS(iminusone,jminusone,1);
enthalpy2 = dH(iminusone,jminusone,1);
entropy3 = dS(iminusone,jminusone,2);
enthalpy3 = dH(iminusone,jminusone,2);
if (enthalpy1<forbidden_enthalpy)
{
entropy1 += nparam_GetEntropy(thrma->NNParams, prevchari, seq1char, prevcharj, seq2char);
enthalpy1 += nparam_GetEnthalpy(thrma->NNParams, prevchari, seq1char, prevcharj, seq2char);
}
if (enthalpy2<forbidden_enthalpy)
{
enthalpy2 += nparam_GetEnthalpy(thrma->NNParams, prevchari, seq1char, 0, seq2char); // 0 is gap!
entropy2 += nparam_GetEntropy(thrma->NNParams, prevchari, seq1char, 0, seq2char);
}
if (enthalpy3<forbidden_enthalpy)
{
enthalpy3 += nparam_GetEnthalpy(thrma->NNParams, 0, seq1char, prevcharj, seq2char);
entropy3 += nparam_GetEntropy(thrma->NNParams, 0, seq1char, prevcharj, seq2char);
}
// choose optimum combination
tm1 = nparam_CalcTM(entropy1, enthalpy1);
tm2 = nparam_CalcTM(entropy2, enthalpy2);
tm3 = nparam_CalcTM(entropy3, enthalpy3);
if ((tm1>=tm2)&&(tm1>=tm3))
{
dS(i,j,0) = entropy1;
dH(i,j,0) = enthalpy1;
maxtm=tm1;
}
else if (tm2>=tm3)
{
dS(i,j,0) = entropy2;
dH(i,j,0) = enthalpy2;
maxtm=tm2;
}
else
{
dS(i,j,0) = entropy3;
dH(i,j,0) = enthalpy3;
maxtm=tm3;
}
// set to zero if alignment so far is mismatches only (tm will be zero -> no need to reset maxtm)!
if (dH(i,j,0)==0)
dS(i,j,0)=nparam_GetInitialEntropy(thrma->NNParams);
// check if local maximum found!
if (((i==thrma->seq1len)||(j==thrma->seq2len))&&(maxtm>=thrma->maxloctm))
{
thrma->maxloctm=maxtm;
thrma->maxloci=i;
thrma->maxlocj=j;
thrma->maxloct=0;
}
// set to zero if alignment so far is mismatches only!
if (dH(i,j,0)==0)
dS(i,j,0)=nparam_GetInitialEntropy(thrma->NNParams);
// --------------- Middle cell: Alignment of seq1[i] with '-'
// following changes lk 01jan08
// we do not allow $- in the beginning, therefore set to forbidden if
// j=1
if (j==1)
{
enthalpy1=forbidden_enthalpy; enthalpy2=forbidden_enthalpy;
entropy1=forbidden_entropy; entropy2=forbidden_entropy;
}
// also, disallow -$ in the end, therefore forbid if j=seq2len-1
else if (j==thrma->seq2len-1)
{
enthalpy1=forbidden_enthalpy; enthalpy2=forbidden_enthalpy;
entropy1=forbidden_entropy; entropy2=forbidden_entropy;
}
else
{
// end lk01jan08
// -- entropy
entropy1 = dS(iminusone,j,0);
entropy2 = dS(iminusone,j,1);
enthalpy1 = dH(iminusone,j,0);
enthalpy2 = dH(iminusone,j,1);
if (enthalpy1<forbidden_enthalpy)
{
entropy1 += nparam_GetEntropy(thrma->NNParams, prevchari, seq1char, seq2char, 0);
enthalpy1 += nparam_GetEnthalpy(thrma->NNParams, prevchari, seq1char, seq2char, 0);
}
if (enthalpy2<forbidden_enthalpy)
{
entropy2 += nparam_GetEntropy(thrma->NNParams, prevchari, seq1char, 0, 0);
enthalpy2 += nparam_GetEnthalpy(thrma->NNParams, prevchari, seq1char, 0, 0);
}
}
// -- choose optimum combination
tm1 = nparam_CalcTM(entropy1, enthalpy1);
tm2 = nparam_CalcTM(entropy2, enthalpy2);
if (tm1>=tm2)
{
dS(i,j,1)=entropy1;
dH(i,j,1)=enthalpy1;
maxtm=tm1;
}
else
{
dS(i,j,1)=entropy2;
dH(i,j,1)=enthalpy2;
maxtm=tm2;
}
// check if local maximum found!
if (((i==thrma->seq1len)||(j==thrma->seq2len))&&(maxtm>=thrma->maxloctm))
{
thrma->maxloctm=maxtm;
thrma->maxloci=i;
thrma->maxlocj=j;
thrma->maxloct=1;
}
// set to zero if alignment so far is mismatches only!
if (dH(i,j,1)==0)
dS(i,j,1)=nparam_GetInitialEntropy(thrma->NNParams);
// --------------- Lower cell: Alignment of '-' with seq2[j]
// following changes lk 01jan08
// we do not allow $- in the beginning, therefore set to forbidden if
// i=1
if (i==1)
{
enthalpy1=forbidden_enthalpy; enthalpy2=forbidden_enthalpy;
entropy1=forbidden_entropy; entropy2=forbidden_entropy;
}
// also, disallow -$ in the end, therefore forbid if i=seq1len-1
else if (i==thrma->seq1len-1)
{
enthalpy1=forbidden_enthalpy; enthalpy2=forbidden_enthalpy;
entropy1=forbidden_entropy; entropy2=forbidden_entropy;
}
else
{
// end lk01jan08
entropy1 = dS(i,jminusone,0);
entropy2 = dS(i,jminusone,2);
enthalpy1 = dH(i,jminusone,0);
enthalpy2 = dH(i,jminusone,2);
if (enthalpy1<forbidden_enthalpy)
{
entropy1 += nparam_GetEntropy(thrma->NNParams, seq1char, 0, prevcharj, seq2char);
enthalpy1 += nparam_GetEnthalpy(thrma->NNParams, seq1char, 0, prevcharj, seq2char);
}
if (enthalpy2<forbidden_enthalpy)
{
entropy2 += nparam_GetEntropy(thrma->NNParams, 0, 0, prevcharj, seq2char);
enthalpy2 += nparam_GetEnthalpy(thrma->NNParams, 0, 0, prevcharj, seq2char);
}
}
// -- choose optimum combination
tm1 = nparam_CalcTM(entropy1, enthalpy1);
tm2 = nparam_CalcTM(entropy2, enthalpy2);
if (tm1>=tm2)
{
dS(i,j,2)=entropy1;
dH(i,j,2)=enthalpy1;
maxtm=tm1;
}
else
{
dS(i,j,2)=entropy2;
dH(i,j,2)=enthalpy2;
maxtm=tm2;
}
// check if local maximum found!
if (((i==thrma->seq1len)||(j==thrma->seq2len))&&(maxtm>=thrma->maxloctm))
{
thrma->maxloctm=maxtm;
thrma->maxloci=i;
thrma->maxlocj=j;
thrma->maxloct=2;
}
// set to zero if alignment so far is mismatches only!
if (dH(i,j,2)==0)
dS(i,j,2)=nparam_GetInitialEntropy(thrma->NNParams);
/*
dboutstream<<"After cell ("<<i<<","<<j<<"):"<<endl;
PrintDPTable(dboutstream);
dboutstream<<endl<<flush;
*/
}
}
return;
}
///////////////////////////////////////////////////////////////////////////////
// FUNCTION: float GetEntropy(int i, int j);
//
// PURPOSE: Return Entropy for given cell
//
// PARAMETERS:
// i, j - Cell coordinates
//
// RETURN VALUE:
// float - Entropy (dS)
//
// REVISION HISTORY
// $ 00sep23 : created LK
// #$
float therm_GetEntropy(PThermAlign thrma, int i, int j)
{
float entropy;
// first, need to determine optimum values for dG and dH
float tm0, tm1, tm2;
tm0 = nparam_CalcTM(dS(i,j,0),dH(i,j,0));
tm1 = nparam_CalcTM(dS(i,j,1),dH(i,j,1));
tm2 = nparam_CalcTM(dS(i,j,2),dH(i,j,2));
if ((tm0>=tm1)&&(tm0>=tm2))
entropy = dS(i,j,0);
else if ((tm1>=tm0)&&(tm1>=tm2))
entropy = dS(i,j,1);
else
entropy = dS(i,j,2);
return entropy;
}
///////////////////////////////////////////////////////////////////////////////
// FUNCTION: float GetEnthalpy(int i, int j);
//
// PURPOSE: Return Enthalpy for given cell
//
// PARAMETERS:
// i, j - Cell coordinates
//
// RETURN VALUE:
// float - Enthalpy (dS)
//
// REVISION HISTORY
// $ 00sep23 : created LK
// #$
float therm_GetEnthalpy(PThermAlign thrma, int i, int j)
{
float enthalpy;
// first, need to determine optimum values for dG and dH
float tm0, tm1, tm2;
tm0 = nparam_CalcTM(dS(i,j,0),dH(i,j,0));
tm1 = nparam_CalcTM(dS(i,j,1),dH(i,j,1));
tm2 = nparam_CalcTM(dS(i,j,2),dH(i,j,2));
if ((tm0>=tm1)&&(tm0>=tm2))
enthalpy = dH(i,j,0);
else if ((tm1>=tm0)&&(tm1>=tm2))
enthalpy = dH(i,j,1);
else
enthalpy = dH(i,j,2);
return enthalpy;
}
///////////////////////////////////////////////////////////////////////////////
// FUNCTION: float GetFreeEnergy(int i, int j);
//
// PURPOSE: Return free Energy for given cell at melting temp
//
// PARAMETERS:
// i, j - Cell coordinates
//
// RETURN VALUE:
// float - free Energy value (dG) at melting temperature
//
// REVISION HISTORY
// $ 00sep06 : created LK
// #$
float therm_GetFreeEnergy(PThermAlign thrma, int i, int j)
{
return therm_GetFreeEnergyK(thrma, i,j,therm_GetMeltingTempK(thrma, i,j));
}
///////////////////////////////////////////////////////////////////////////////
// FUNCTION: float GetFreeEnergyK(int i, int j, float t);
//
// PURPOSE: Return free Energy for given cell at given temperature
//
// PARAMETERS:
// i, j - Cell coordinates
// t - Temperature in Kelvin
//
// RETURN VALUE:
// float - free Energy value (dG) at given temperature (Kelvin)
//
// REVISION HISTORY
// $ 00sep06 : created LK
// #$
float therm_GetFreeEnergyK(PThermAlign thrma, int i, int j, float t)
{
// dG = dH - T * dS
float entropy;
float enthalpy;
// first, need to determine optimum values for dG and dH
entropy = therm_GetEntropy(thrma, i,j);
enthalpy = therm_GetEnthalpy(thrma, i,j);
return enthalpy - t * entropy;
}
///////////////////////////////////////////////////////////////////////////////
// FUNCTION: float GetFreeEnergyC(int i, int j, int t);
//
// PURPOSE: Return free Energy for given cell at given temperature
//
// PARAMETERS:
// i, j - Cell coordinates
// t - Temperature in Celsius
//
// RETURN VALUE:
// float - free Energy value (dG) at melting temperature
//
// REVISION HISTORY
// $ 00sep06 : created LK
// #$
float therm_GetFreeEnergyC(PThermAlign thrma, int i, int j, float t)
{
// dG = dH - T * dS
float entropy;
float enthalpy;
// first, need to determine optimum values for dG and dH
entropy = therm_GetEntropy(thrma, i,j);
enthalpy = therm_GetEnthalpy(thrma, i,j);
return enthalpy - (t + 273.15f) * entropy;
}
///////////////////////////////////////////////////////////////////////////////
// FUNCTION: float CThermAlign::GetMeltingTempC(int i, int j)
//
// PURPOSE: Calculate TM for given cell. Return in Celsius.
//
// PARAMETERS:
// i, j - Cell coordinates
//
// RETURN VALUE:
// float - Melting Temperature TM
//
// REVISION HISTORY
// $ 00sep06 : created LK
// #$
float therm_GetMeltingTempC(PThermAlign thrma, int i, int j)
{
return therm_GetMeltingTempK(thrma, i,j) - 273.15f;
}
///////////////////////////////////////////////////////////////////////////////
// FUNCTION: float CThermAlign::GetMeltingTempK(int i, int j)
//
// PURPOSE: Calculate TM for given cell. Return in Kelvin.
//
// PARAMETERS:
// i, j - Cell coordinates
//
// RETURN VALUE:
// float - Melting Temperature TM
//
// REVISION HISTORY
// $ 00sep06 : created LK
// #$
float therm_GetMeltingTempK(PThermAlign thrma, int i, int j)
{
float tm0, tm1, tm2;
tm0 = nparam_CalcTM(dS(i,j,0), dH(i,j,0));
tm1 = nparam_CalcTM(dS(i,j,1), dH(i,j,1));
tm2 = nparam_CalcTM(dS(i,j,2), dH(i,j,2));
float maxtm;
if ((tm0>=tm1)&&(tm0>=tm2))
maxtm=tm0;
else if ((tm1>=tm0)&&(tm1>=tm2))
maxtm=tm1;
else
maxtm=tm2;
if (maxtm<0)
maxtm=0;
return maxtm;
}