Files
annotate/src/repseek/repseek-2014.09/align_di.c
Eric Coissac d298385685 First commit - second part
Former-commit-id: 202296404e6a70f8ae96db99faffb456104c57e9
Former-commit-id: 118417735d2055683607df9809c9b721cc1b1bab
2015-10-02 21:13:16 +02:00

436 lines
14 KiB
C

/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/******
file : align_di.c
function : take a seed, align it forward and backward, and return the aligned repeat
created : Apr 11 2001
modif : Jul 2003,Feb 2004, April 2004, July 2004: turn scores into doubles
author : amikezor
*****/
#include "repseek_types.h"
#include "align.h"
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
/*
 Score a stretch of sequence against itself.

 Every character of sequence[pos .. pos+len-2] is turned into a symbol
 index with CHAR2SYMB and the diagonal entry pScoring->matrix[s][s] is
 accumulated. The character at pos+len-1 is NOT counted. Callers that
 also want the first base left out pass pos+1 and len-1, because the two
 end bases of a seed are scored by the flanking alignments instead.

 pos      : offset of the first character to score
 len      : length of the range; only len-1 characters are scored
 sequence : the sequence the offsets refer to
 pScoring : scoring scheme providing the reward matrix

 Returns the accumulated score (0 when len is 1).
*/
static double score_seed(int32_t pos, int32_t len, char *sequence, SCORING *pScoring){

    char *cursor = sequence + pos;              /* current character */
    char *stop   = sequence + pos + len - 1;    /* first character that is not scored */
    double total = 0;                           /* running score */

    for (; cursor < stop; cursor++) {
        int32_t symbol = CHAR2SYMB(*cursor);    /* row == column: identity match */
        total += pScoring->matrix[symbol][symbol];
    }

    return total;
}
/*
 Build the repeat describing a palindrome (used by aligni() when both
 seed positions are the same).

 The seed fields are stored as given. The seed is then cut in two halves
 of len/2 (integer division): copy 1 starts at pos1, copy 2 starts
 immediately after it. Match count and alignment length are both len/2,
 the type is "Palindrome" and the score is score_seed(pos1, len, ...).
 No alignment is performed.
*/
static Rep Palindome(int32_t pos1, int32_t pos2, int32_t len, char *sequence, SCORING *pScoring){

    Rep pal;
    int32_t half = len/2;       /* length of each arm */

    /* the seed, untouched */
    pal.seed_pos1 = pos1;
    pal.seed_pos2 = pos2;
    pal.seed_len  = len;

    /* the two arms, back to back */
    pal.pos1 = pos1;
    pal.len1 = half;
    pal.pos2 = pos1 + pal.len1;
    pal.len2 = half;

    /* alignment statistics */
    pal.match     = half;
    pal.align_len = half;

    pal.score = score_seed(pos1, len, sequence, pScoring);
    strcpy(pal.type, "Palindrome");

    return pal;
}
/*
 Initialise a repeat from its seed.

 The seed position and length are recorded unchanged in the seed_*
 fields. The copies themselves start one base after the seed start and
 are two bases shorter than the seed: the first and last seed bases are
 left out here because the alignments on each side account for them.
 Match count and alignment length start at that same len-2 value and
 the score starts at ScoreSeed. The type field is not touched.
*/
static void InitRep(int32_t pos1, int32_t pos2, int32_t len, double ScoreSeed, Rep *aligned_seed ){

    int32_t inner = len - 2;        /* seed without its two end bases */

    /* the raw seed */
    aligned_seed->seed_pos1 = pos1;
    aligned_seed->seed_pos2 = pos2;
    aligned_seed->seed_len  = len;

    /* the copies, before any extension */
    aligned_seed->pos1 = pos1 + 1;
    aligned_seed->pos2 = pos2 + 1;
    aligned_seed->len1 = inner;
    aligned_seed->len2 = inner;

    /* alignment statistics */
    aligned_seed->match     = inner;
    aligned_seed->align_len = inner;
    aligned_seed->score     = ScoreSeed;
}
/*
 Add the result of the right-hand (forward) alignment to a direct repeat.

 The start positions stay where they are; copy 1 is lengthened by
 BestScore_row+1 and copy 2 by BestScore_col+1. The matches, alignment
 length and best score found in pResults are added to the running totals.
*/
static void UpdateRepRight(Rep *aligned_seed , RESULTS *pResults){

    /* alignment statistics */
    aligned_seed->score     = aligned_seed->score + *pResults->pBestScore;
    aligned_seed->align_len = aligned_seed->align_len + pResults->alignment_len;
    aligned_seed->match     = aligned_seed->match + pResults->matches;

    /* both copies get longer */
    aligned_seed->len1 = aligned_seed->len1 + (pResults->BestScore_row + 1);
    aligned_seed->len2 = aligned_seed->len2 + (pResults->BestScore_col + 1);
}
/*
 Add the result of the left-hand (backward) alignment to a direct repeat.

 Copy 1 is moved back and lengthened by BestScore_row+1, copy 2 by
 BestScore_col+1, so the right ends of both copies stay in place.
 The matches, alignment length and best score found in pResults are
 added to the running totals.
*/
static void UpdateRepLeft(Rep *aligned_seed , RESULTS *pResults){

    /* first copy: start moves left, length grows by the same amount */
    aligned_seed->pos1 = aligned_seed->pos1 - (pResults->BestScore_row + 1);
    aligned_seed->len1 = aligned_seed->len1 + (pResults->BestScore_row + 1);

    /* second copy: same thing, driven by the column */
    aligned_seed->pos2 = aligned_seed->pos2 - (pResults->BestScore_col + 1);
    aligned_seed->len2 = aligned_seed->len2 + (pResults->BestScore_col + 1);

    /* alignment statistics */
    aligned_seed->match     = aligned_seed->match + pResults->matches;
    aligned_seed->align_len = aligned_seed->align_len + pResults->alignment_len;
    aligned_seed->score     = aligned_seed->score + *pResults->pBestScore;
}
/*
 Inverted-repeat counterpart of UpdateRepRight.

 Copy 1 is lengthened by BestScore_row+1 with its start unchanged.
 Copy 2 is lengthened by BestScore_col+1 AND its start is moved back by
 the same amount, since for an inverted repeat the second copy runs in
 the opposite direction. Matches, alignment length and best score from
 pResults are added to the running totals.
*/
static void UpdateRepRightInv(Rep *aligned_seed , RESULTS *pResults){

    /* first copy: grows, start unchanged */
    aligned_seed->len1 = aligned_seed->len1 + (pResults->BestScore_row + 1);

    /* second copy: grows towards lower positions */
    aligned_seed->pos2 = aligned_seed->pos2 - (pResults->BestScore_col + 1);
    aligned_seed->len2 = aligned_seed->len2 + (pResults->BestScore_col + 1);

    /* alignment statistics */
    aligned_seed->score     = aligned_seed->score + *pResults->pBestScore;
    aligned_seed->align_len = aligned_seed->align_len + pResults->alignment_len;
    aligned_seed->match     = aligned_seed->match + pResults->matches;
}
/*
 Inverted-repeat counterpart of UpdateRepLeft.

 Copy 1 is moved back and lengthened by BestScore_row+1. Copy 2 is only
 lengthened (by BestScore_col+1); its start is left unchanged. Matches,
 alignment length and best score from pResults are added to the running
 totals.
*/
static void UpdateRepLeftInv(Rep *aligned_seed , RESULTS *pResults){

    /* first copy: start moves left, length grows by the same amount */
    aligned_seed->pos1 = aligned_seed->pos1 - (pResults->BestScore_row + 1);
    aligned_seed->len1 = aligned_seed->len1 + (pResults->BestScore_row + 1);

    /* second copy: grows, start unchanged */
    aligned_seed->len2 = aligned_seed->len2 + (pResults->BestScore_col + 1);

    /* alignment statistics */
    aligned_seed->score     = aligned_seed->score + *pResults->pBestScore;
    aligned_seed->align_len = aligned_seed->align_len + pResults->alignment_len;
    aligned_seed->match     = aligned_seed->match + pResults->matches;
}
/***
 Turn the seed of a direct repeat into an aligned repeat.

 pos1, pos2  : start of the seed in the first and in the second copy
 len         : length of the seed
 sequence    : the sequence holding both copies
 sizeseq     : used as the length of sequence when bounding the extension
 Xg          : handed unchanged to align_blast2like()
 pScoring    : scoring scheme
 pResults    : work structure filled by align_blast2like() and read back
               after each call; it has to be initialised by the caller
 opt_overlap : when 0, the two copies are not allowed to overlap

 The seed is extended to the right, then to the left, and the repeat is
 labelled "Overlap", "Tandem", "Close" (less than 1000 between the end
 of copy 1 and the start of copy 2) or "Distant".

 Special case: when opt_overlap is 0 and the seed copies already touch
 or overlap (pos1+len >= pos2), no alignment is done. The repeat is
 reported as two adjacent copies of length pos2-pos1 starting at pos1
 and pos2, labelled "Tandem". Because InitRep() is reused for this, the
 seed_* fields then hold pos1-1, pos2-1 and (pos2-pos1)+2.
***/
Rep alignd(int32_t pos1, int32_t pos2, int32_t len, char *sequence, int32_t sizeseq ,
           float Xg, SCORING *pScoring, RESULTS *pResults , int8_t opt_overlap){

    Rep rep;                        /* the repeat being built */
    const char *label;              /* type chosen at the very end */
    char *copy1, *copy2;            /* where each extension starts */
    double seed_score;              /* score of the seed alone */
    int32_t row_limit, col_limit;   /* how far copy 1 (row) and copy 2 (col) may extend */
    int32_t diag_limit;             /* max row/col difference, so the sequence is not aligned with itself */

    /*
     Start from the seed; pos1+1 and len-1 because the end bases of the
     seed belong to the alignments
    */
    seed_score = score_seed(pos1+1, len-1, sequence, pScoring);
    InitRep(pos1, pos2, len, seed_score, &rep);

    diag_limit = pos2 - pos1;

    /*
     Seed copies already overlapping although overlap is not wanted
    */
    if (opt_overlap == 0 && rep.seed_pos1 + rep.seed_len >= rep.seed_pos2) {

        len = pos2 - pos1;
        seed_score = score_seed(pos1, len, sequence, pScoring);
        InitRep(pos1-1, pos2-1, len+2, seed_score, &rep);
        strcpy(rep.type, "Tandem");

        return rep;
    }

    /*
     Right side -- forward --, starting on the last base of the seed
    */
    copy1 = sequence + pos1 + len - 1;
    copy2 = sequence + pos2 + len - 1;

    row_limit = sizeseq - (pos1+len-1) - 1;     /* stop at the end of the sequence */
    col_limit = sizeseq - (pos2+len-1) - 1;

    if (opt_overlap == 0)
        row_limit = MIN2(pos2-(pos1+len-1) -1 , row_limit);    /* copy 1 must not enter copy 2 */

    align_blast2like(copy1, copy2, Xg, pScoring, pResults, 'f', row_limit, col_limit, diag_limit);
    UpdateRepRight(&rep, pResults);

    /*
     Left side -- backward --, starting on the first base of the seed
    */
    copy1 = sequence + pos1;
    copy2 = sequence + pos2;

    row_limit = pos1;                           /* stop at the beginning of the sequence */
    col_limit = pos2;

    if (opt_overlap == 0)
        col_limit = MIN2( rep.pos2 - (rep.pos1 + rep.len1 - 1) -2 , col_limit );   /* copy 2 must not enter the (extended) copy 1 */

    align_blast2like(copy1, copy2, Xg, pScoring, pResults, 'b', row_limit, col_limit, diag_limit);
    UpdateRepLeft(&rep, pResults);

    /*
     Label the repeat from the space left between the two copies
    */
    label = "Distant";

    if (rep.pos2 < rep.pos1 + rep.len1)
        label = "Overlap";
    else if (rep.pos2 == rep.pos1 + rep.len1)
        label = "Tandem";
    else if (rep.pos2 - (rep.pos1 + rep.len1) < 1000)
        label = "Close";

    strcpy(rep.type, label);

    return rep;
}
/*
 Turn the seed of an inverted repeat into an aligned repeat.

 pos1, pos2  : start of the seed in the first and in the second copy
 len         : length of the seed
 sequence    : the sequence, read for the first copy
 invsequence : read for the second copy, at offsets counted from sizeseq
               (presumably the reverse complement of sequence -- to be
               confirmed against the caller)
 sizeseq     : used as the length of both sequences
 Xg          : handed unchanged to align_blast2like()
 pScoring    : scoring scheme
 pResults    : work structure filled by align_blast2like() and read back
               after each call; it has to be initialised by the caller

 When pos1 == pos2 the seed is a palindrome and the result of Palindome()
 is returned without any alignment. Otherwise the seed is extended on
 both sides and the repeat is labelled "Palindrome" (copy 2 starts right
 where copy 1 ends), "Close" (less than 1000 in between) or "Distant".
*/
Rep aligni(int32_t pos1, int32_t pos2, int32_t len, char *sequence, char *invsequence,
           int32_t sizeseq ,float Xg, SCORING *pScoring, RESULTS *pResults ){

    Rep rep;                        /* the repeat being built */
    const char *label;              /* type chosen at the very end */
    char *copy1, *copy2;            /* where each extension starts */
    double seed_score;              /* score of the seed alone */
    int32_t row_limit, col_limit;   /* how far copy 1 (row) and copy 2 (col) may extend */
    int32_t diag_limit;             /* max row/col difference; a NEGATIVE value is the
                                       code for 'inverted' on the forward extension */

    /*
     Start from the seed; pos1+1 and len-1 because the end bases of the
     seed belong to the alignments
    */
    seed_score = score_seed(pos1+1, len-1, sequence, pScoring);
    InitRep(pos1, pos2, len, seed_score, &rep);

    if (pos1 == pos2)
        return Palindome(pos1, pos2, len, sequence, pScoring);

    /*
     Right side -- forward for the first copy --
    */
    copy1 = sequence + pos1 + len - 1;
    copy2 = invsequence + sizeseq - (pos2+1);

    row_limit  = sizeseq - (pos1+len-1) - 1;
    col_limit  = pos2;
    diag_limit = -(pos2-pos1-len);      /* negative: do not run into a palindrome */

    align_blast2like(copy1, copy2, Xg, pScoring, pResults, 'f', row_limit, col_limit, diag_limit);
    UpdateRepRightInv(&rep, pResults);

    /*
     Left side -- backward for the first copy --
    */
    copy1 = sequence + pos1;
    copy2 = invsequence + sizeseq - (pos2+len-1) - 1;

    row_limit  = pos1;
    col_limit  = sizeseq - (pos2+len-1) - 1;
    diag_limit = sizeseq;               /* on the left it does not really matter */

    align_blast2like(copy1, copy2, Xg, pScoring, pResults, 'b', row_limit, col_limit, diag_limit);
    UpdateRepLeftInv(&rep, pResults);

    /*
     Label the repeat from the space left between the two copies
    */
    if (rep.pos2 == (rep.pos1 + rep.len1))
        label = "Palindrome";
    else if (rep.pos2 - (rep.pos1 + rep.len1) < 1000)
        label = "Close";
    else
        label = "Distant";

    strcpy(rep.type, label);

    return rep;
}
/*
 Turn the seed of a direct repeat shared by two sequences into an
 aligned repeat.

 pos1, pos2   : start of the seed in seq1 and in seq2
 len          : length of the seed
 seq1, seq2   : the two sequences
 size1, size2 : used as the lengths of seq1 and seq2
 Xg           : handed unchanged to align_blast2like()
 pScoring     : scoring scheme
 pResults     : work structure filled by align_blast2like() and read back
                after each call; it has to be initialised by the caller

 The seed is extended to the right and then to the left; each extension
 is bounded only by the ends of the two sequences. The type of the
 returned repeat is always "Inter".
*/
Rep alignd_2seq(int32_t pos1, int32_t pos2, int32_t len, char *seq1, char *seq2,
                int32_t size1, int32_t size2, float Xg, SCORING *pScoring, RESULTS *pResults ){

    Rep rep;                        /* the repeat being built */
    char *copy1, *copy2;            /* where each extension starts */
    double seed_score;              /* score of the seed alone */
    int32_t row_limit, col_limit;   /* how far copy 1 (row) and copy 2 (col) may extend */
    int32_t diag_limit;             /* max row/col difference */

    /*
     Start from the seed; pos1+1 and len-1 because the end bases of the
     seed belong to the alignments
    */
    seed_score = score_seed(pos1+1, len-1, seq1, pScoring);
    InitRep(pos1, pos2, len, seed_score, &rep);

    diag_limit = size1 + size2;     /* large enough to be no limit at all */

    /*
     Right side -- forward --: cannot go past the end of either sequence
    */
    copy1 = seq1 + pos1 + len - 1;
    copy2 = seq2 + pos2 + len - 1;

    row_limit = size1 - (pos1+len-1) - 1;
    col_limit = size2 - (pos2+len-1) - 1;

    align_blast2like(copy1, copy2, Xg, pScoring, pResults, 'f', row_limit, col_limit, diag_limit);
    UpdateRepRight(&rep, pResults);

    /*
     Left side -- backward --: cannot go before the beginning of either sequence
    */
    copy1 = seq1 + pos1;
    copy2 = seq2 + pos2;

    row_limit = pos1;
    col_limit = pos2;

    align_blast2like(copy1, copy2, Xg, pScoring, pResults, 'b', row_limit, col_limit, diag_limit);
    UpdateRepLeft(&rep, pResults);

    strcpy(rep.type, "Inter");

    return rep;
}
/*
 Turn the seed of an inverted repeat shared by two sequences into an
 aligned repeat.

 pos1, pos2   : start of the seed in seq1 and in the second sequence
 len          : length of the seed
 seq1         : the first sequence
 invseq2      : read for the second copy, at offsets counted from size2
                (presumably the reverse complement of the second
                sequence -- to be confirmed against the caller)
 size1, size2 : used as the lengths of seq1 and invseq2
 Xg           : handed unchanged to align_blast2like()
 pScoring     : scoring scheme
 pResults     : work structure filled by align_blast2like() and read back
                after each call; it has to be initialised by the caller

 The seed is extended on both sides and the type of the returned repeat
 is always "Inter", as in alignd_2seq(). The type used to be written only
 when pos2 happened to equal pos1+len1, which left it uninitialised for
 every other repeat.
*/
Rep aligni_2seq(int32_t pos1, int32_t pos2, int32_t len, char *seq1, char *invseq2,
                int32_t size1, int32_t size2, float Xg, SCORING *pScoring, RESULTS *pResults ){

    Rep aligned_seed;               /* the repeat */
    char *pseq1, *pseq2;            /* pointer for the first and the second copy */
    double ScoreSeed=0;             /* score of the seed */
    int32_t max_row=0, max_col=0;   /* the max distance from the edge of the sequence, for row and col */
    int32_t diff_max=0;             /* the maximum difference between row and col */

    /*
     Init by the seed; pos1+1 and len-1 because the end bases of the
     seed are used in the alignments
    */
    ScoreSeed = score_seed(pos1+1, len-1, seq1, pScoring);
    InitRep(pos1, pos2, len, ScoreSeed, &aligned_seed );

    diff_max = size1+size2;         /* NO diff max for inter-repeats, on either side */

    /*
     Align the right side -- forward for the first copy --
    */
    pseq1 = seq1+pos1+len-1;
    pseq2 = invseq2 + size2 - (pos2+1);

    max_row = size1-(pos1+len-1)-1;
    max_col = pos2;

    align_blast2like(pseq1, pseq2, Xg, pScoring, pResults, 'f', max_row, max_col, diff_max);
    UpdateRepRightInv(&aligned_seed, pResults);

    /*
     Align the left side -- backward for the first copy --
    */
    pseq1 = seq1+pos1;
    pseq2 = invseq2+size2-(pos2+len-1) -1;

    max_row = pos1;
    max_col = size2-(pos2+len-1)-1;

    align_blast2like(pseq1, pseq2, Xg, pScoring, pResults, 'b', max_row, max_col, diff_max);
    UpdateRepLeftInv(&aligned_seed, pResults);

    /* always label the repeat, whatever the distance between the copies */
    strcpy(aligned_seed.type, "Inter");

    return aligned_seed;            /* return the repeat */
}