First commit - second part

Former-commit-id: 202296404e6a70f8ae96db99faffb456104c57e9
Former-commit-id: 118417735d2055683607df9809c9b721cc1b1bab
This commit is contained in:
2015-10-02 21:12:35 +02:00
parent f44f0d8179
commit d298385685
316 changed files with 122579 additions and 0 deletions

BIN
src/repseek/.DS_Store vendored Normal file

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,319 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
#ifndef KMRK_h
#define KMRK_h
/********************************************
********************************************
**
** Declaration of struct
**
********************************************
********************************************/
#include "repseek_types.h"
#include "KMRK_mask.h"
/**
* Structure used to manipulate simultanously
* the v and n vector
*
*/
typedef struct {
int32_t size; /**< size of the current sequence to search for */
int32_t vectorsize; /**< size of vectors */
int32_t seqCount; /**< count of concatenated sequences */
int32_t complement; /**< if seqCount > 1 and complement !=0
* then second sequence is the inverted complement strand of first one */
int32_t symbmax;
int32_t* v; /**< sequence vector */
int32_t* n; /**< linked list vector the special vector of KRM-K */
int32_t limits[1]; /**< array of end limits of concatenated sequences in v (array size is seqCount) */
} vn_type;
/********************************************
********************************************
**
** Declaration of public macro
**
********************************************
********************************************/
#define GETVECTOR(x,v) (((x)->v) - 1)
#define IS_MARKED(x,i) ((x)[i] < 0)
#define MARK(x,i) ((x)[i]) = -ABS((x)[i])
#define UNMARK(x,i) ((x)[i]) = ABS((x)[i])
#define SET(x,i,v) ((x)[i]) = (v)
#define SETMARKED(x,i,v) ((x)[i]) = -(v)
#define GET(x,i) ABS((x)[i])
#define SYMBOLE(x,i) ((IS_MARKED((x),(i))) ? (i): (GET(x,i)))
/**
* Macro used to declare a pointer to a quorum function.
*
* @param name name of the pointer
*
*/
#define KMRK_QUORUM_FUNC_PTR(name) int32_t (*name)( vn_type* x, int32_t pos, int32_t count, int32_t countSeq )
/**
* Macro used to declare a pointer to an initialisation function.
*
* @param name name of the pointer
* @param quorum name used for the quorum assiciated function
*
* @see KMRK_QUORUM_FUNC_PTR
*
*/
#define KMRK_INIT_FUNC_PTR(name,quorum) vn_type* (*name)(char *sequence, \
int32_t complement, \
int32_t count, \
int32_t countSeq, \
int32_t *k, \
KMRK_QUORUM_FUNC_PTR(quorum),\
masked_area_table_t *mask)
/********************************************
********************************************
**
** Declaration of public functions
**
********************************************
********************************************/
/**
* Initialise a vn_type record from one sequence to run KMRK algorithm
*
* @param sequence pointer to a C string containing the sequence
* @param complement != 0 means that seq one and two are the two strands of
* the same sequence.
* @param count parameter count passed to the quorun function
* @param countSeq parametter countSeq passed to the quorun function
* @param k length of the word represented by each symbole of v.
* k is an output parametter
* @param quorum pointer to a quorum function
*
* @return a pointer to vn_type structure correctly initialized
* to be used by KMRK_RunKMRK
*
* @see KMRK_HashOneSequence
*/
vn_type* KMRK_InitOneSequence(char *sequence,
int32_t complement,
int32_t count,
int32_t countSeq,
int32_t *k,
KMRK_QUORUM_FUNC_PTR(quorum),
masked_area_table_t *mask);
/**
* Initialise a vn_type record from one sequence to run KMRK algorithme.
* In more than KMRK_InitOneSequence, KMRK_HashOneSequence construct
* word of len k with an hash algorithm. k used is a function of
* sequence size and alphabet size. If calculed k is superior to lmax
* then k = lmax.
*
* @param sequence pointer to a C string containing the sequence
* @param complement != 0 means that seq one and two are the two strands of
* the same sequence.
* @param count parametter count passed to the quorun function
* @param countSeq parametter countSeq passed to the quorun function
* @param k maximum length of the created word (input)
* length of the word represented by each symbole
* of v (output).
* @param quorum pointer to a quorum function
*
* @return a pointer to vn_type structure correctly initialized
* to be used by KMRK_RunKMRK
*
* @see KMRK_InitOneSequence
*/
vn_type* KMRK_HashOneSequence(char *sequence,
int32_t complement,
int32_t count,
int32_t countSeq,
int32_t *k,
KMRK_QUORUM_FUNC_PTR(quorum),
masked_area_table_t *mask);
/**
* An example of quorum function testing than a factor is
* present a least two times. Because of definition of this
* quorum function count and countSeq parametter have no meanning
* in this instance of quorum function
*
* @param x a pointer to vn_type structure to check
* @param pos position in n of the beginning of the linked list to test
* @param count minimal number of occurence of factor
* @param countSeq minimal number of sequences concerned
*
* @return 1 if quorum is ok 0 otherwise.
*/
int32_t KMRK_CoupleQuorum(vn_type* x,
int32_t pos,
int32_t count,
int32_t countSeq);
/**
* An example of quorum function testing than a factor is
* present a least two times in the direct strand of a sequence or
* at least one time in the direct strand and one time in the reverse
* strand. Because of definition of this
* quorum function count and countSeq parametter have no meanning
* in this instance of quorum function
*
* @param x a pointer to vn_type structure to check
* @param pos position in n of the beginning of the linked list to test
* @param count minimal number of occurence of factor
* @param countSeq minimal number of sequences concerned
*
* @return 1 if quorum is ok 0 otherwise.
*/
int32_t KMRK_DirInvQuorum(vn_type* x,
int32_t pos,
int32_t count,
int32_t countSeq);
/**
* An example of quorum function testing than a factor is
* present a least one time in the direct strand and one time in the reverse
* strand. Because of definition of this
* quorum function count and countSeq parametter have no meanning
* in this instance of quorum function
*
* @param x a pointer to vn_type structure to check
* @param pos position in n of the beginning of the linked list to test
* @param count minimal number of occurence of factor
* @param countSeq minimal number of sequences concerned
*
* @return 1 if quorum is ok 0 otherwise.
*/
int32_t KMRK_InvQuorum(vn_type* x,
int32_t pos,
int32_t count,
int32_t countSeq);
int32_t KMRK_Int12Quorum(vn_type* x,
int32_t pos,
int32_t count,
int32_t countSeq);
int32_t KMRK_IntInv12Quorum(vn_type* x,
int32_t pos,
int32_t count,
int32_t countSeq);
int32_t KMRK_IntDirInv12Quorum(vn_type* x,
int32_t pos,
int32_t count,
int32_t countSeq);
/**
* realize one cycle of KMR.
*
* @param x a pointer to vn_type created by an intialisation
* function or returned by this function.
* @param k step used to join two words
* @param count parametter count passed to the quorun function
* @param countSeq parametter countSeq passed to the quorun function
* @param KMRK_QUORUM_FUNC_PTR quorum pointer to a quorum function
*/
void KMRK_RunKMRK(vn_type *x,
int32_t k,
int32_t count,
int32_t countSeq,
KMRK_QUORUM_FUNC_PTR(quorum));
/**
* realises serveral run of KMR cycle to make from a sequence
* a vn_type structure describing sequences of factors of a precise size.
*
* @param sequence pointer to a C string containing the sequence
* @param size word size to construct
* @param count parametter count passed to the quorun function
* @param countSeq parametter countSeq passed to the quorun function
* @param quorum pointer to a quorum function
* @param init pointer to a initialisation function
*
* @return a vn_type pointer to a structure containing sequences of factors
*/
vn_type *KMRK_RunTo(char *sequence,
int32_t size,
int32_t complement,
int32_t count,
int32_t countSeq,
KMRK_QUORUM_FUNC_PTR(quorum),
KMRK_INIT_FUNC_PTR(init,quorum),
masked_area_table_t *mask);
/**
* free memory associated to a vn_type pointer
*
* @param x a pointer to vn_type structure
*/
void KMRK_FreeVN(vn_type *x);
int32_t KMRK_upperCoupleCount(vn_type *x);
int32_t KMRK_upperInvertedCount(vn_type* x,int32_t wordsize);
int32_t KMRK_upperInterCount(vn_type* x,int32_t seq1,int32_t seq2,int32_t wordsize);
void KMRK_markStart(vn_type* x);
#endif /* KMRK_h */

View File

@ -0,0 +1,947 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
#include "KMRK_Seeds.h"
#include "memory.h"
#include <stdlib.h>
#include <string.h>
#include "sequence.h"
static void SetMultipleLenInvSeeds(
SmallSeed_type* seeds, /* an array of a structure with pos1 and pos2 */
int32_t nseeds, /* number of small seeds */
int32_t wordSize, /* lmin */
int8_t same, /* 1 if one sequence, 0 otherwise */
AllSeeds_type *PtrAllSeeds); /* the final seeds with length */
/*
Concatenate
DirSeq\@InvSeq\0
*/
static char* makeDirInvSeq(char* seq, int32_t size)
{
char *SeqInv;
seq = (char *)MyRealloc( (void *)seq, (size*2+2)*sizeof(char),
(size+1)* sizeof(char), "makeDirInvSeq: Cannot allocate space for reverse sequence");
SeqInv= seq + size + 1;
seq[size]=0;
invseq(seq, SeqInv);
seq[size]='@';
SeqInv[size]=0;
return seq;
}
/*
Merge the seq1 with seq2
*/
static char *merge2seq(char* seq1, char* seq2,
int32_t size1, int32_t size2)
{
char * dest;
seq1 = (char *)MyRealloc((void *)seq1, (size1+size2+2) *sizeof(char),
(size1+1)*sizeof(char), "merg2seq: Cannot allocate space for reverse sequence");
dest = seq1 + size1 + 1;
seq1[size1]='@';
memcpy(dest,seq2,size2);
dest[size2]=0;
return seq1;
}
static int32_t dirDelta(SmallSeed_type* seed)
{
return seed->pos2 - seed->pos1;
}
void KMRK_SetMultipleLenDirSeeds(SmallSeed_type* seeds,
int32_t nseeds,
int32_t wordSize,
AllSeeds_type *PtrAllSeeds)
{
int32_t i,j; /* dummy counters j=kept seeds ; i = current seed */
int32_t curLen=wordSize; /* Length of the current seed */
int32_t delta;
SmallSeed_type *mainSeed;
SmallSeed_type *curSeed;
int32_t add;
/* fprintf(stderr,"New Version\n");*/
KMRK_sortSeeds(seeds,nseeds,KMRK_cmpDeltaSeedsPos);
for(j=0,mainSeed=seeds ;
j < nseeds;
j=i, mainSeed=curSeed)
{
/* foreach seed */
delta = dirDelta(mainSeed);
curLen=wordSize;
for (i=j+1,curSeed=mainSeed+1;
i < nseeds &&
dirDelta(curSeed)==delta &&
(curSeed->pos1 - mainSeed->pos1) <= curLen;
i++,curSeed++)
{
add=wordSize - mainSeed->pos1 - curLen + curSeed->pos1;
if (add < 0) add = 0;
curLen+=add;
}
KMRK_pushSeed(PtrAllSeeds,
seeds[j].pos1,seeds[j].pos2,
curLen,
1);
/*
if(seeds[j].pos1 == seeds[j].pos2)
fprintf(stderr, "KMRK_SetMultipleLenDirSeeds: %d %d %d, bye\n", seeds[j].pos1, seeds[j].pos2, curLen), exit(0);
*/
}
}
static int32_t invDelta(SmallSeed_type* seed)
{
return seed->pos2 + seed->pos1;
}
static void SetMultipleLenInvSeeds(
SmallSeed_type* seeds, /* an array of a structure with pos1 and pos2 */
int32_t nseeds, /* number of small seeds */
int32_t wordSize, /* lmin */
int8_t same, /* 1 if one sequence, 0 otherwise */
AllSeeds_type *PtrAllSeeds) /* the final seeds with length */
{
int32_t i,j; /* dummy counters j=kept seeds ; i = current seed */
int32_t curLen=wordSize; /* Length of the current seed */
int32_t delta; /* the pos1+pos2 - constant if seeds should be merged */
int32_t pos2; /* after merge, new pos2 */
SmallSeed_type *mainSeed; /* the first one of a series */
SmallSeed_type *curSeed; /* the current one */
/*int32_t add; what is added to the Lenght for the merged form */
KMRK_sortSeeds(seeds,nseeds,KMRK_cmpDeltaInvSeedsPos); /* qsort() seeds by delta, then pos1 */
for(j=0,mainSeed=seeds ; j < nseeds; j=i, mainSeed=curSeed)
{
delta = invDelta(mainSeed); /* pos1+pos2 */
curLen=wordSize; /* at start curLen is equal to lmin */
for (i=j+1, curSeed=mainSeed+1;
i < nseeds &&
invDelta(curSeed)==delta && /* Constant if they should merge */
(curSeed->pos1 - (curSeed-1)->pos1) <= wordSize; /* if they overlap */
i++, curSeed++)
{;}
curLen += ( (curSeed-1)->pos1 - mainSeed->pos1);
if ( same && ( seeds[j].pos1+curLen-1 > seeds[i-1].pos2) ){
pos2 = seeds[j].pos1;
curLen += seeds[i-1].pos2 - seeds[j].pos1;
}
else
pos2 = seeds[i-1].pos2;
/*
Old Code - there was a bug for palindrome
for (i=j+1, curSeed=mainSeed+1;
i < nseeds && invDelta(curSeed)==delta && (curSeed->pos1 - mainSeed->pos1) <= curLen;
i++, curSeed++)
{
{
add = wordSize-curLen - (mainSeed->pos1 - curSeed->pos1);
if (add < 0) add = 0;
curLen+=add;
}
if ( same && (seeds[j].pos1+curLen>= seeds[i-1].pos2) )
{
curLen = (seeds[i-1].pos2 - seeds[j].pos1 + 1) * 2;
pos2 = seeds[j].pos1;
}
else
pos2 = seeds[i-1].pos2;
*/
KMRK_pushSeed(PtrAllSeeds,
seeds[j].pos1,pos2,
curLen,
0);
}
}
AllSeeds_type *KMRK_allocSeeds(AllSeeds_type *AllSeeds,
int32_t size,
int8_t opt_dir,
int8_t opt_inv)
{
AllSeeds_type *reponse;
if(opt_inv != 1 && opt_dir != 1)
{
fprintf(stderr,"AllocSeeds: requiere at least "
"one of opt_dir and opt_inv to be 1\n");
exit(4);
}
reponse = AllSeeds;
if (!reponse)
reponse = MyCalloc( 1, sizeof(AllSeeds_type),"KMRK_allocSeeds: cannot allocate new data structure");
if(opt_dir)
{
if (reponse->dirSeeds==NULL)
reponse->dirSeeds = (Seed_type *)MyCalloc( size,sizeof(Seed_type),"KMRK_allocSeeds: cannot allocate Direct Seeds");
else
{
if(size)
reponse->dirSeeds = (Seed_type *)MyRealloc( (void *)reponse->dirSeeds,
size*sizeof(Seed_type), reponse->cDirSeeds*sizeof(Seed_type),
"allocSeeds: cannot reallocate Direct Seeds" );
else
{
MyFree( reponse->dirSeeds, reponse->cDirSeeds*sizeof(Seed_type) );
reponse->dirSeeds=NULL;
reponse->cDirSeeds=0;
}
}
reponse->cDirSeeds=size;
}
if(opt_inv)
{
if (reponse->invSeeds==NULL)
reponse->invSeeds = (Seed_type *)MyCalloc( size, sizeof(Seed_type),"KMRK_allocSeeds: cannot allocate Inverted Seeds");
else
{
if(size)
reponse->invSeeds = (Seed_type *)MyRealloc( (void *)reponse->invSeeds,
size*sizeof(Seed_type), reponse->cInvSeeds*sizeof(Seed_type),
"allocSeeds: cannot reallocate Inverted Seeds" );
else
{
MyFree( reponse->invSeeds, reponse->cInvSeeds*sizeof(Seed_type) );
reponse->invSeeds=NULL;
reponse->cInvSeeds=0;
}
}
reponse->cInvSeeds=size;
}
return reponse;
}
void KMRK_freeSeeds(AllSeeds_type *AllSeeds)
{
if (!AllSeeds)
return;
if (AllSeeds->dirSeeds)
MyFree(AllSeeds->dirSeeds, AllSeeds->cDirSeeds*sizeof(Seed_type) );
AllSeeds->dirSeeds=NULL;
if (AllSeeds->invSeeds)
MyFree(AllSeeds->invSeeds, AllSeeds->cInvSeeds*sizeof(Seed_type) );
AllSeeds->dirSeeds=NULL;
MyFree(AllSeeds, 1*sizeof( AllSeeds_type ) );
}
void KMRK_compactSeeds(AllSeeds_type *AllSeeds)
{
if (AllSeeds)
{
if (AllSeeds->dirSeeds)
KMRK_allocSeeds(AllSeeds, AllSeeds->nDirSeeds, 1, 0);
if (AllSeeds->invSeeds)
KMRK_allocSeeds(AllSeeds, AllSeeds->nInvSeeds, 0, 1);
}
}
void KMRK_pushSeed(AllSeeds_type *AllSeeds,
int32_t pos1,
int32_t pos2,
int32_t length,
int8_t dir)
{
Seed_type* stack;
int32_t maxcount;
int32_t index;
if (dir)
{
dir = 1;
stack = AllSeeds->dirSeeds;
maxcount = AllSeeds->cDirSeeds;
index = AllSeeds->nDirSeeds;
}
else
{
dir = 0;
stack = AllSeeds->invSeeds;
maxcount = AllSeeds->cInvSeeds;
index = AllSeeds->nInvSeeds;
}
if (index == maxcount)
{
(void) KMRK_allocSeeds(AllSeeds,
maxcount * 2,
dir,
!dir);
if (dir)
stack = AllSeeds->dirSeeds;
else
stack = AllSeeds->invSeeds;
}
stack+=index;
stack->pos1 = pos1;
stack->pos2 = pos2;
stack->length = length;
if (dir)
AllSeeds->nDirSeeds++;
else
AllSeeds->nInvSeeds++;
}
AllSeeds_type* KMRK_enumerateDirectCouple(AllSeeds_type* Seeds,
int32_t expected,
int32_t wordSize,
vn_type* stack,
int32_t seq)
{
int32_t xmin;
int32_t xmax;
int32_t i;
int32_t j;
int32_t k;
int32_t next;
int32_t* n;
int32_t nseeds;
SmallSeed_type *list;
list = (SmallSeed_type *)MyCalloc( expected, sizeof(SmallSeed_type) ,
"KMRK_enumerateDirectCouple: cannot allocate DirectCouple memory");
nseeds = 0;
n = GETVECTOR(stack,n);
if (seq)
xmin = stack->limits[seq-1];
else
xmin = 0;
xmax = stack->limits[seq];
for (i=1; i <= xmax; i++)
if (IS_MARKED(n,i)) /* Check begining of chained list */
{
/* Look for begining of sequence of interest */
for( ;(i <= xmin) && (i != GET(n,i));
i=GET(n,i));
/* for each factor in sequence of interest */
for (j=i;
(j != GET(n,j)) && (j <= xmax);
j = GET(n,j))
{
next = GET(n,j);
if (next <= xmax)
do
{
k = next;
next = GET(n,k);
list[nseeds].pos1 = j-1;
list[nseeds].pos2 = k-1;
nseeds++;
} while ((k!=next) && (next <= xmax));
}
} ;
Seeds = KMRK_allocSeeds(Seeds,
expected/20+1,
1,0);
/* fprintf(stderr,"Expected direct couple : %d\n",expected);*/
KMRK_SetMultipleLenDirSeeds(list,nseeds,wordSize,Seeds);
MyFree(list, expected*sizeof(SmallSeed_type) );
KMRK_compactSeeds(Seeds);
return Seeds;
}
/*
From KMR-K Stacks to SmallSeeds
*/
AllSeeds_type* KMRK_enumerateInvertedCouple(
AllSeeds_type* Seeds,
int32_t expected, /* the expected number of couple */
int32_t wordSize, /* the Lmin */
vn_type* stack)
{
int32_t xmax;
int32_t invmax;
int32_t posinv;
int32_t i;
int32_t j;
int32_t k;
int32_t memk;
int32_t* n;
int32_t next;
int32_t nseeds; /* number of seeds */
SmallSeed_type *list; /* seed list with only pos1 and pos2 --simple output from kmrk */
list = (SmallSeed_type *)MyCalloc( expected, sizeof(SmallSeed_type) ,
"KMRK_enumerateInvertedCouple: cannot allocate InvertedCouple memory");
nseeds = 0;
n = GETVECTOR(stack,n);
xmax = stack->limits[0];
invmax = stack->limits[1];
for (i=1; i <= xmax; i++)
if (IS_MARKED(n,i))
{
for(memk=i ;
(memk < xmax) &&
memk != GET(n,memk) &&
(memk <= invmax);
memk=GET(n,memk));
if ((memk > xmax) && (memk <= invmax))
for (j=i;
(j <= xmax) && (j != GET(n,j));
j=GET(n,j))
{
next = memk;
do
{
k = next;
next = GET(n,k);
posinv = 2 * xmax - k -wordSize + 3;
if (j <= posinv)
{
list[nseeds].pos1=j-1;
list[nseeds].pos2=posinv-1;
nseeds++;
}
} while ((next <= invmax) && (k != next));
}
}
Seeds = KMRK_allocSeeds(Seeds, expected/20+1, 0,1);
/* fprintf(stderr,"Expected inverted couple : %d\n",expected);*/
SetMultipleLenInvSeeds(list,nseeds,wordSize,1,Seeds); /* turn the Small seeds into merged seeds */
MyFree(list, expected*sizeof(SmallSeed_type) );
KMRK_compactSeeds(Seeds);
return Seeds;
}
AllSeeds_type* KMRK_enumerateInterCouple(AllSeeds_type* Seeds,
int32_t seq1,
int32_t seq2,
int32_t expected,
int32_t wordSize,
vn_type* stack)
{
int32_t xmin;
int32_t xmax;
int32_t ymax;
int32_t ymin;
int32_t pos1;
int32_t pos2;
int32_t i;
int32_t j;
int32_t k;
int32_t memj;
int32_t memk;
int32_t* n;
int32_t next;
int32_t nseeds;
SmallSeed_type *list;
nseeds=0;
list = (SmallSeed_type *)MyCalloc( expected, sizeof(SmallSeed_type) ,
"KMRK_enumerateInterCouple: cannot allocate InterCouple memory");
n = GETVECTOR(stack,n);
if (seq1==0)
xmin=0;
else
xmin = stack->limits[seq1-1];
xmax = stack->limits[seq1];
ymin = stack->limits[seq2-1];
ymax = stack->limits[seq2];
for (i=1; i <= xmax; i++)
if (IS_MARKED(n,i))
{
for(memj=i ;
(memj < xmin) &&
memj != GET(n,memj);
memj=GET(n,memj));
if ((memj > xmin) && (memj <= xmax))
{
for(memk=memj ;
(memk < ymin) &&
memk != GET(n,memk);
memk=GET(n,memk));
if ((memk > ymin) && (memk <= ymax))
for (j=memj;
(j <= xmax) && (j != GET(n,j));
j=GET(n,j))
{
next = memk;
do
{
k = next;
next = GET(n,k);
if (seq1 > 0)
pos1 = j - xmin - 1;
else
pos1 = j;
pos2 = k - ymin - 1;
list[nseeds].pos1 = pos1 - 1;
list[nseeds].pos2 = pos2 - 1;
nseeds++;
} while ((next <= ymax) && (k != next));
}
}
}
Seeds = KMRK_allocSeeds(Seeds,
expected/20+1,
1,0);
/* fprintf(stderr,"Expected inter-direct couple : %d\n",expected);*/
KMRK_SetMultipleLenDirSeeds(list,nseeds,wordSize,Seeds);
MyFree(list, expected*sizeof(SmallSeed_type) );
KMRK_compactSeeds(Seeds);
return Seeds;
}
AllSeeds_type* KMRK_enumerateInterInvertedCouple(AllSeeds_type* Seeds,
int32_t seq2,
int32_t expected,
int32_t wordSize,
vn_type* stack)
{
int32_t xmin;
int32_t xmax;
int32_t ymax;
int32_t ymin;
int32_t posinv;
int32_t pos2;
int32_t i;
int32_t j;
int32_t k;
int32_t memj;
int32_t memk;
int32_t* n;
int32_t next;
int32_t nseeds;
SmallSeed_type *list;
list = (SmallSeed_type *)MyCalloc( expected, sizeof(SmallSeed_type) ,
"KMRK_enumerateInterCouple: cannot allocate InterCouple memory");
nseeds = 0;
n = GETVECTOR(stack,n);
if (seq2 < 2)
{
fprintf(stderr,"enumerateInterInvertedCouple: seq2 must be differente to 0");
exit(4);
}
xmin = stack->limits[0];
xmax = stack->limits[1];
ymin = stack->limits[seq2-1];
ymax = stack->limits[seq2];
Seeds = KMRK_allocSeeds(Seeds,
expected,
0,1);
for (i=1; i <= xmax; i++)
if (IS_MARKED(n,i))
{
for(memj=i ;
(memj < xmin) &&
memj != GET(n,memj);
memj=GET(n,memj));
if ((memj > xmin) && (memj <= xmax))
{
for(memk=memj ;
(memk < ymin) &&
memk != GET(n,memk);
memk=GET(n,memk));
if ((memk > ymin) && (memk <= ymax))
for (j=memj;
(j <= xmax) && (j != GET(n,j));
j=GET(n,j))
{
next = memk;
do
{
k = next;
next = GET(n,k);
posinv = 2 * xmin - j -wordSize + 3;
pos2 = k - ymin - 1;
list[nseeds].pos1=posinv-1;
list[nseeds].pos2=pos2-1;
nseeds++;
} while ((next <= ymax) && (k != next));
}
}
}
Seeds = KMRK_allocSeeds(Seeds,
expected/20+1,
0,1);
/* fprintf(stderr,"Expected inter-inverted couple : %d\n",expected);*/
SetMultipleLenInvSeeds(list,nseeds,wordSize,0,Seeds);
MyFree(list, expected* sizeof(SmallSeed_type) );
KMRK_compactSeeds(Seeds);
return Seeds;
}
int32_t KMRK_cmpSeedsPos(SmallSeed_type *s1, SmallSeed_type *s2)
{
if (s1->pos1==s2->pos1)
return s1->pos2 - s2->pos2;
else
return s1->pos1 - s2->pos1;
}
int32_t KMRK_cmpDeltaSeedsPos(SmallSeed_type *s1, SmallSeed_type *s2)
{
int32_t delta1 = s1->pos2-s1->pos1;
int32_t delta2 = s2->pos2-s2->pos1;
if (delta1==delta2)
return s1->pos1 - s2->pos1;
else
return delta1 - delta2;
}
/*
Sort by deltas, then pos1
*/
int32_t KMRK_cmpDeltaInvSeedsPos(SmallSeed_type *s1, SmallSeed_type *s2)
{
int32_t delta1 = s1->pos2+s1->pos1;
int32_t delta2 = s2->pos2+s2->pos1;
if (delta1==delta2)
return s1->pos1 - s2->pos1; /* then sort by pos1 */
else
return delta1 - delta2; /* sort by deltas */
}
void KMRK_sortSeeds(SmallSeed_type* seeds,
int32_t nseeds,
KMRK_SORT_SEEDS_FUNC_PTR(compare))
{
qsort(seeds,
nseeds,
sizeof(SmallSeed_type),
(int (*)(const void *, const void *))compare);
}
AllSeeds_type* KMRK_get_seeds(char **seq,
int32_t SimpleSeqLen,
int16_t Lmin,
int8_t opt_dir,
int8_t opt_inv,
int8_t opt_verbose,
masked_area_table_t *mask)
{
AllSeeds_type* AllSeeds;
char *ConcatSeq = *seq;
vn_type * stacks;
int32_t dirExpect=0;
int32_t invExpect=0;
KMRK_QUORUM_FUNC_PTR(quorum);
if(opt_inv != 1 && opt_dir != 1)
fprintf(stderr, "get_seeds: requiered at least opt_dir or opt_inv to be 1\n"), exit(4);
if(opt_inv)
ConcatSeq = makeDirInvSeq(ConcatSeq,SimpleSeqLen); /* create a sequence with "DirSeq\@InvSeq\0" */
if (opt_inv){ /* Are we interested in dir, inv or both ? */
if (opt_dir)
quorum = KMRK_DirInvQuorum;
else
quorum = KMRK_InvQuorum;
}
else
quorum = KMRK_CoupleQuorum;
stacks = KMRK_RunTo(ConcatSeq, /* concatenation of both sequence */
Lmin, /* minimum length of the repeated elements = word size to construct */
opt_inv, /* shall we search for inverted repeats as well ?*/
2, /* the count -- a mysterious value passed to the quorum function (unused ??) */
1, /* how many functions */
quorum, /* a pointer to the selected quorum function */
KMRK_HashOneSequence, /* a pointer to the init function */
mask);
invExpect =0;
KMRK_markStart(stacks);
if (opt_inv)
{
ConcatSeq = (char *)MyRealloc( (void *)ConcatSeq, (SimpleSeqLen+1)*sizeof(char),
(2*SimpleSeqLen+2)*sizeof(char) , "KRMK_get_seeds: Cannot shrink memory"); /* reset mem to a sigle sequence */
ConcatSeq[SimpleSeqLen]=0;
}
if(opt_inv)
invExpect = KMRK_upperInvertedCount(stacks,Lmin);
if(opt_dir)
dirExpect = KMRK_upperCoupleCount(stacks);
AllSeeds = NULL;
MyFree(stacks->v, (stacks->vectorsize) * sizeof(int32_t));
stacks->v=NULL;
if (opt_dir)
AllSeeds = KMRK_enumerateDirectCouple(AllSeeds,dirExpect,Lmin ,stacks,0);
if (opt_inv)
AllSeeds = KMRK_enumerateInvertedCouple(AllSeeds,invExpect,Lmin,stacks);
KMRK_FreeVN(stacks);
*seq = ConcatSeq;
return AllSeeds;
}
AllSeeds_type* KMRK_get_seeds_2seqs(char **seq1,
char **seq2,
int32_t size1,
int32_t size2,
int16_t Lmin,
int8_t opt_dir,
int8_t opt_inv,
int8_t opt_verbose,
masked_area_table_t *mask)
{
AllSeeds_type* AllSeeds;
char *sequence1 = *seq1;
char *sequence2 = *seq2;
vn_type * stacks;
int32_t dirExpect=0;
int32_t invExpect=0;
KMRK_QUORUM_FUNC_PTR(quorum);
int32_t sizef; /* =size1 if only dir OR =2*size1+size2 if inv */
if(opt_inv != 1 &&
opt_dir != 1)
{
fprintf(stderr,
"get_seeds_2seqs: requiered at least "
"opt_dir or opt_inv to be 1\n");
exit(4);
}
sizef = size1;
if(opt_inv)
{
sequence1 = makeDirInvSeq(sequence1,size1);
sizef+=(1+size1);
}
sequence1 = merge2seq(sequence1,sequence2,sizef,size2);
if (opt_inv)
if (opt_dir)
quorum = KMRK_IntDirInv12Quorum;
else
quorum = KMRK_IntInv12Quorum;
else
quorum = KMRK_Int12Quorum;
stacks = KMRK_RunTo(sequence1,
Lmin,
opt_inv,
2,
2,
quorum,
KMRK_HashOneSequence,
mask);
KMRK_markStart(stacks);
sequence1= (char *)MyRealloc( (void *)sequence1, (size1+1)*sizeof(char),
(sizef+size2+2)*sizeof(char), "KMRK_get_seeds_2seqs: shrink memory back to size1");
sequence1[size1]=0;
if (opt_dir){
if (opt_inv)
dirExpect = KMRK_upperInterCount(stacks,0,2,Lmin);
else
dirExpect = KMRK_upperInterCount(stacks,0,1,Lmin);
}
if (opt_inv)
invExpect = KMRK_upperInterCount(stacks,1,2,Lmin);
AllSeeds = NULL;
MyFree(stacks->v, stacks->vectorsize*sizeof(int32_t) ); /* free vector v from the stacks */
stacks->v=NULL;
if (opt_dir){
if (opt_inv)
AllSeeds = KMRK_enumerateInterCouple(AllSeeds,
0,2,
dirExpect,
Lmin ,
stacks);
else
AllSeeds = KMRK_enumerateInterCouple(AllSeeds,
0,1,
dirExpect,
Lmin ,
stacks);
}
if (opt_inv)
AllSeeds = KMRK_enumerateInterInvertedCouple(AllSeeds,
2,
invExpect,
Lmin ,
stacks);
KMRK_FreeVN(stacks);
*seq1 = sequence1;
return AllSeeds;
}
#define SIGN(x) (((x)<0) ? -1:(((x)>0) ? 1:0))
static int32_t compareSeedsByPos(Seed_type* s1,Seed_type* s2)
{
if (s1->pos1 == s2->pos1)
return SIGN(s1->pos2 - s2->pos2);
else
return SIGN(s1->pos1 - s2->pos1);
}
void KMRK_sortSeedsByPos(Seed_type* seeds, int32_t count)
{
qsort(seeds,
count,
sizeof(Seed_type),
(int (*)(const void *, const void *))compareSeedsByPos);
};
#undef SIGN

View File

@ -0,0 +1,145 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
#ifndef KMRK_Seeds_h
#define KMRK_Seeds_h
/********************************************
********************************************
**
** Declaration of struct
**
********************************************
********************************************/
#include <stdio.h>
#include "KMRK.h"
#include "repseek_types.h"
#define KMRK_SORT_SEEDS_FUNC_PTR(name) int32_t (*name)(SmallSeed_type*, \
SmallSeed_type*)
#define KMRK_DELTA_SEEDS_FUNC_PTR(name) int32_t (*name)(SmallSeed_type*)
/********************************************
********************************************
**
** Declaration of public functions
**
********************************************
********************************************/
AllSeeds_type *KMRK_allocSeeds(AllSeeds_type *AllSeeds,
int32_t size,
int8_t opt_dir,
int8_t opt_inv);
void KMRK_SetMultipleLenDirSeeds(SmallSeed_type* seeds,
int32_t nseeds,
int32_t wordSize,
AllSeeds_type *PtrAllSeeds);
void KMRK_freeSeeds(AllSeeds_type *AllSeeds);
void KMRK_compactSeeds(AllSeeds_type *AllSeeds);
void KMRK_pushSeed(AllSeeds_type *AllSeeds,
int32_t pos1,
int32_t pos2,
int32_t length,
int8_t dir);
AllSeeds_type* KMRK_enumerateDirectCouple(AllSeeds_type* Seeds,
int32_t expected,
int32_t wordSize,
vn_type* stack,
int32_t seq);
AllSeeds_type* KMRK_enumerateInvertedCouple(AllSeeds_type* Seeds,
int32_t expected,
int32_t wordSize,
vn_type* stack);
AllSeeds_type* KMRK_enumerateInterCouple(AllSeeds_type* Seeds,
int32_t seq1,
int32_t seq2,
int32_t expected,
int32_t wordSize,
vn_type* stack);
AllSeeds_type* KMRK_enumerateInterInvertedCouple(AllSeeds_type* Seeds,
int32_t seq2,
int32_t expected,
int32_t wordSize,
vn_type* stack);
/**
* Compare two seeds and return an integer less than, equal to or greater
* than zero considering the relative order of the two seeds. This
* version take into account only pos1 and pos2 of seeds without taking
* account of the sequences or of the relative direction
*
* @param s1 pointer to seed one
* @param s2 pointer to seed two
*
* @return a integer less than, equal to or greater than zero
*/
int32_t KMRK_cmpSeedsPos(SmallSeed_type *s1, SmallSeed_type *s2);
int32_t KMRK_cmpDeltaSeedsPos(SmallSeed_type *s1, SmallSeed_type *s2);
int32_t KMRK_cmpDeltaInvSeedsPos(SmallSeed_type *s1, SmallSeed_type *s2);
void KMRK_sortSeeds(SmallSeed_type* seeds,
int32_t nseeds,
KMRK_SORT_SEEDS_FUNC_PTR(compare));
AllSeeds_type* KMRK_get_seeds(char **seq,
int32_t SimpleSeqLen,
int16_t Lmin,
int8_t opt_dir,
int8_t opt_inv,
int8_t opt_verbose,
masked_area_table_t *mask);
AllSeeds_type* KMRK_get_seeds_2seqs(char **seq1,
char **seq2,
int32_t size1,
int32_t size2,
int16_t Lmin,
int8_t opt_dir,
int8_t opt_inv,
int8_t opt_verbose,
masked_area_table_t *mask);
/**
* Order an array of seeds by pos1,pos2
*
* @param seeds pointer to an array of Seed_type object to sort
* @param count count of element in the array
*/
void KMRK_sortSeedsByPos(Seed_type* seeds, int32_t count);
#endif /* KMRK_Seeds_h */

View File

@ -0,0 +1,281 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/*
* KMRK_mask.c
* repseek
*
* Created by Eric Coissac on 04/12/04.
* Copyright 2004 __MyCompanyName__. All rights reserved.
*
*/
#include "KMRK_mask.h"
#include <stdio.h>
#include <stdlib.h>
#include "memory.h"
#define MASKED_AREA_TABLE_SIZE(seqcount) (sizeof(masked_area_table_t) + (sizeof(masked_area_list_t*) * ((seqcount)-1)))
#define MASKED_AREA_LIST_SIZE(areacount) (sizeof(masked_area_list_t) + (sizeof(masked_area_t) * ((areacount)-1)))
#define AREA_COUNT_INIT (1000)
static masked_area_table_t *new_masked_area_table(int32_t seqcount, int32_t areacount);
static masked_area_list_t *new_masked_area_list(int32_t areacount);
static masked_area_list_t *realloc_masked_area_list(masked_area_list_t *list,int32_t areacount);
static int32_t push_area(masked_area_table_t* table,int32_t sequence,int32_t begin,int32_t end);
static void sort_area_table(masked_area_table_t* table);
static int32_t compare_area(const masked_area_t* v1,const masked_area_t* v2);
static int32_t search_area(const masked_area_t* v1,const masked_area_t* v2);
static masked_area_list_t *strip_area_list(masked_area_list_t* list);
static void strip_area_table(masked_area_table_t* table);
static masked_area_list_t *new_masked_area_list(int32_t areacount)
{
masked_area_list_t *list;
list = MyCalloc(1, MASKED_AREA_LIST_SIZE(areacount), "Not enougth memory for mask table");
list->reserved=areacount;
return list;
}
static masked_area_list_t *realloc_masked_area_list(masked_area_list_t *list,int32_t areacount)
{
list = MyRealloc(list,
MASKED_AREA_LIST_SIZE(areacount),
MASKED_AREA_LIST_SIZE(list->reserved),
"Not enougth memory for mask table");
list->reserved=areacount;
return list;
}
static masked_area_table_t *new_masked_area_table(int32_t seqcount, int32_t areacount)
{
masked_area_table_t *table;
int32_t i;
table = MyCalloc(1, MASKED_AREA_TABLE_SIZE(seqcount),"Not enougth memory for mask table");
table->seqcount=seqcount;
for (i=0;i<seqcount;i++)
table->sequence[i]=new_masked_area_list(areacount);
return table;
}
static int32_t push_area(masked_area_table_t* table,int32_t sequence,int32_t begin,int32_t end)
{
masked_area_list_t * list;
if (sequence >= table->seqcount)
return -1;
list = table->sequence[sequence];
if (list->reserved == list->count)
{
list = realloc_masked_area_list(list,list->reserved*2);
table->sequence[sequence]=list;
}
list->area[list->count].begin=begin;
list->area[list->count].end=end;
list->count++;
table->total++;
return table->total;
}
static int32_t compare_area(const masked_area_t* v1,const masked_area_t* v2)
{
return v1->begin - v2->begin;
}
static void sort_area_table(masked_area_table_t* table)
{
int32_t i;
for (i=0; i<table->seqcount;i++)
{
qsort(table->sequence[i]->area,
table->sequence[i]->count,
sizeof(masked_area_t),
(int (*)(const void *, const void *))compare_area);
}
}
static masked_area_list_t *strip_area_list(masked_area_list_t* list)
{
int32_t i;
int32_t j;
int32_t count;
int32_t newcount;
count = list->count;
newcount=count;
for (i=1;i<count;i++)
{
/* fprintf(stderr,"\n%d->%d %d->%d ==>",list->area[i-1].begin,list->area[i-1].end,list->area[i].begin,list->area[i].end); */
if ((list->area[i].begin-1) <= list->area[i-1].end)
{
/* fprintf(stderr," joined"); */
list->area[i].begin=list->area[i-1].begin;
list->area[i-1].begin=-1;
newcount--;
}
}
list->count=newcount;
for (i=0,j=0;i<count;i++)
{
if (list->area[i].begin>=0)
{
if (i!=j)
list->area[j]=list->area[i];
j++;
}
}
return realloc_masked_area_list(list,newcount);
}
static void strip_area_table(masked_area_table_t* table)
{
int32_t seq;
int32_t oldcount;
masked_area_list_t* list;
sort_area_table(table);
for (seq=0; seq < table->seqcount;seq++)
{
list = table->sequence[seq];
oldcount = list->count;
table->sequence[seq]=strip_area_list(list);
table->total-=oldcount - table->sequence[seq]->count;
}
}
static int32_t search_area(const masked_area_t* v1,const masked_area_t* v2)
{
int32_t pos;
pos = v1->begin;
if (pos < v2->begin)
return -1;
if (pos > v2->end)
return 1;
return 0;
}
masked_area_table_t *KMRK_ReadMaskArea(char *areafile, int32_t seqcount, int32_t complement)
{
FILE* area;
char buffer[1000];
char *ok;
int32_t begin;
int32_t end;
int32_t sequence;
int32_t column;
int32_t linecount;
masked_area_table_t *table;
if (complement > 0)
seqcount++;
else
complement=0;
area = fopen(areafile,"r");
linecount=0;
table = new_masked_area_table(seqcount,AREA_COUNT_INIT);
do {
linecount++;
ok = fgets(buffer,999,area);
if (ok)
{
column = sscanf(buffer,"%d %d %d",&begin,&end,&sequence);
if (column > 1 && begin <= end)
{
begin--;
end--;
if (column==3)
sequence--;
else
sequence=0;
if (sequence>1 || sequence<0)
fprintf(stderr,"ERROR while reading mask file, line %d (see help)\n", linecount), exit(0);
if (sequence && complement)
sequence++;
push_area(table,sequence,begin,end);
if (!sequence && complement)
push_area(table,1,complement -1 - end,complement -1 -begin);
}
if (column==1)
fprintf(stderr,"EROOR in mask file reading line %d\n",linecount), exit(0);
}
} while (ok);
fprintf(stderr,"\nread %d masked areas from file\n",table->total);
strip_area_table(table);
fprintf(stderr,"strip to %d non overlaping areas\n",table->total);
return table;
}
char KMRK_isMasked(masked_area_table_t *mask,int32_t seq, int32_t position)
{
masked_area_t input;
int32_t result;
masked_area_list_t *list;
if (! mask || (seq >= mask->seqcount))
return 0;
list = mask->sequence[seq];
input.begin = position;
result = bsearch(&input,
list->area,
list->count,
sizeof(masked_area_t),
(int (*)(const void *, const void *))search_area) != NULL;
return result;
}

View File

@ -0,0 +1,57 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/*
* KMRK_mask.h
* repseek
*
* Created by Eric Coissac on 04/12/04.
* Copyright 2004 __MyCompanyName__. All rights reserved.
*
*/
#ifndef KMRK_MASK_H
#define KMRK_MASK_H
#include "repseek_types.h"
typedef struct {
int32_t begin;
int32_t end;
} masked_area_t;
typedef struct {
int32_t reserved;
int32_t count;
masked_area_t area[1];
} masked_area_list_t;
typedef struct {
int32_t seqcount;
int32_t total;
masked_area_list_t *sequence[1];
} masked_area_table_t;
masked_area_table_t *KMRK_ReadMaskArea(char *areafile,int32_t seqcount,int32_t complement);
char KMRK_isMasked(masked_area_table_t *mask,int32_t seq, int32_t position);
#endif

View File

@ -0,0 +1,142 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/**
* @file KMRK_merge_seeds.c
* @author Eric Coissac <coissac@inrialpes.fr>
* @date Wed Mar 3 11:15:57 2004
*
* @brief Merge function of overlapping seeds
*
*
*/
#include "KMRK_merge_seeds.h"
void KMRK_MergeSeeds(AllSeeds_type *AllSeeds,
int8_t opt_dir,
int8_t opt_inv)
{
int32_t i; /* the current seed */
int32_t j; /* the checked seed */
int32_t N; /* the kept ones */
Seed_type* seeds;
if(opt_dir){
seeds = AllSeeds->dirSeeds;
for(i=0, N=0 ;i<AllSeeds->nDirSeeds; i++){
if(seeds[i].pos1==-1) /* any seed at -1 is removed */
continue;
j=i+1;
while( (seeds[j].pos1!=-1) &&
(seeds[i].pos1!=-1) &&
(j < AllSeeds->nDirSeeds) &&
(seeds[j].pos1 < seeds[i].pos1+ seeds[i].length))
{
if(
((seeds[j].pos2 >= seeds[i].pos2) &&
(seeds[j].pos2 < seeds[i].pos2 + seeds[i].length)) || /* if the seeds are overlapping */
((seeds[j].pos2 + seeds[j].length >= seeds[i].pos2) &&
(seeds[j].pos2 + seeds[j].length < seeds[i].pos2 + seeds[i].length)))
{
if(seeds[j].length <= seeds[i].length)
seeds[j].pos1=seeds[j].pos2=seeds[j].length=-1; /* removed the smallest */
else
seeds[i].pos1=seeds[i].pos2=seeds[i].length=-1;
}
j++;
}
if(seeds[i].pos1 !=-1)
{ /* if this seed is not out, store it */
seeds[N].pos1 = seeds[i].pos1;
seeds[N].pos2 = seeds[i].pos2;
seeds[N].length = seeds[i].length;
N++;
}
}
AllSeeds->nFilteredDirSeeds += AllSeeds->nDirSeeds-N;
AllSeeds->nDirSeeds=N;
}
if(opt_inv){
seeds = AllSeeds->invSeeds;
for(i=0, N=0 ;i<AllSeeds->nInvSeeds; i++){
if(seeds[i].pos1==-1)
continue;
j=i+1;
while( (seeds[j].pos1!=-1 ) &&
(seeds[i].pos1!=-1 ) &&
(j < AllSeeds->nInvSeeds) &&
(seeds[j].pos1 < seeds[i].pos1+seeds[i].length))
{
if(
((seeds[j].pos2 >= seeds[i].pos2) && /* if the seeds are overlapping */
(seeds[j].pos2 < seeds[i].pos2+seeds[i].length)) ||
((seeds[j].pos2 + seeds[j].length >= seeds[i].pos2) &&
(seeds[j].pos2 + seeds[j].length < seeds[i].pos2+seeds[i].length)))
{
if(seeds[j].length <= seeds[i].length)
seeds[j].pos1=seeds[j].pos2=seeds[j].length=-1; /* removed the smallest */
else
seeds[i].pos1=seeds[i].pos2=seeds[i].length=-1;
}
j++;
}
if(seeds[i].pos1!=-1)
{ /* if this seed is not out, store it */
seeds[N].pos1 = seeds[i].pos1;
seeds[N].pos2 = seeds[i].pos2;
seeds[N].length = seeds[i].length;
N++;
}
}
AllSeeds->nFilteredInvSeeds += AllSeeds->nInvSeeds-N;
AllSeeds->nInvSeeds=N;
}
KMRK_compactSeeds(AllSeeds);
}

View File

@ -0,0 +1,30 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
#ifndef KMRK_merge_seeds_h
#define KMRK_merge_seeds_h
#include "KMRK_Seeds.h"
void KMRK_MergeSeeds(AllSeeds_type *AllSeeds,
int8_t opt_dir,
int8_t opt_inv);
#endif /* KMRK_MergeSeeds */

View File

@ -0,0 +1,504 @@
GNU LESSER GENERAL PUBLIC LICENSE
Version 2.1, February 1999
Copyright (C) 1991, 1999 Free Software Foundation, Inc.
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
[This is the first released version of the Lesser GPL. It also counts
as the successor of the GNU Library Public License, version 2, hence
the version number 2.1.]
Preamble
The licenses for most software are designed to take away your
freedom to share and change it. By contrast, the GNU General Public
Licenses are intended to guarantee your freedom to share and change
free software--to make sure the software is free for all its users.
This license, the Lesser General Public License, applies to some
specially designated software packages--typically libraries--of the
Free Software Foundation and other authors who decide to use it. You
can use it too, but we suggest you first think carefully about whether
this license or the ordinary General Public License is the better
strategy to use in any particular case, based on the explanations below.
When we speak of free software, we are referring to freedom of use,
not price. Our General Public Licenses are designed to make sure that
you have the freedom to distribute copies of free software (and charge
for this service if you wish); that you receive source code or can get
it if you want it; that you can change the software and use pieces of
it in new free programs; and that you are informed that you can do
these things.
To protect your rights, we need to make restrictions that forbid
distributors to deny you these rights or to ask you to surrender these
rights. These restrictions translate to certain responsibilities for
you if you distribute copies of the library or if you modify it.
For example, if you distribute copies of the library, whether gratis
or for a fee, you must give the recipients all the rights that we gave
you. You must make sure that they, too, receive or can get the source
code. If you link other code with the library, you must provide
complete object files to the recipients, so that they can relink them
with the library after making changes to the library and recompiling
it. And you must show them these terms so they know their rights.
We protect your rights with a two-step method: (1) we copyright the
library, and (2) we offer you this license, which gives you legal
permission to copy, distribute and/or modify the library.
To protect each distributor, we want to make it very clear that
there is no warranty for the free library. Also, if the library is
modified by someone else and passed on, the recipients should know
that what they have is not the original version, so that the original
author's reputation will not be affected by problems that might be
introduced by others.
Finally, software patents pose a constant threat to the existence of
any free program. We wish to make sure that a company cannot
effectively restrict the users of a free program by obtaining a
restrictive license from a patent holder. Therefore, we insist that
any patent license obtained for a version of the library must be
consistent with the full freedom of use specified in this license.
Most GNU software, including some libraries, is covered by the
ordinary GNU General Public License. This license, the GNU Lesser
General Public License, applies to certain designated libraries, and
is quite different from the ordinary General Public License. We use
this license for certain libraries in order to permit linking those
libraries into non-free programs.
When a program is linked with a library, whether statically or using
a shared library, the combination of the two is legally speaking a
combined work, a derivative of the original library. The ordinary
General Public License therefore permits such linking only if the
entire combination fits its criteria of freedom. The Lesser General
Public License permits more lax criteria for linking other code with
the library.
We call this license the "Lesser" General Public License because it
does Less to protect the user's freedom than the ordinary General
Public License. It also provides other free software developers Less
of an advantage over competing non-free programs. These disadvantages
are the reason we use the ordinary General Public License for many
libraries. However, the Lesser license provides advantages in certain
special circumstances.
For example, on rare occasions, there may be a special need to
encourage the widest possible use of a certain library, so that it becomes
a de-facto standard. To achieve this, non-free programs must be
allowed to use the library. A more frequent case is that a free
library does the same job as widely used non-free libraries. In this
case, there is little to gain by limiting the free library to free
software only, so we use the Lesser General Public License.
In other cases, permission to use a particular library in non-free
programs enables a greater number of people to use a large body of
free software. For example, permission to use the GNU C Library in
non-free programs enables many more people to use the whole GNU
operating system, as well as its variant, the GNU/Linux operating
system.
Although the Lesser General Public License is Less protective of the
users' freedom, it does ensure that the user of a program that is
linked with the Library has the freedom and the wherewithal to run
that program using a modified version of the Library.
The precise terms and conditions for copying, distribution and
modification follow. Pay close attention to the difference between a
"work based on the library" and a "work that uses the library". The
former contains code derived from the library, whereas the latter must
be combined with the library in order to run.
GNU LESSER GENERAL PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. This License Agreement applies to any software library or other
program which contains a notice placed by the copyright holder or
other authorized party saying it may be distributed under the terms of
this Lesser General Public License (also called "this License").
Each licensee is addressed as "you".
A "library" means a collection of software functions and/or data
prepared so as to be conveniently linked with application programs
(which use some of those functions and data) to form executables.
The "Library", below, refers to any such software library or work
which has been distributed under these terms. A "work based on the
Library" means either the Library or any derivative work under
copyright law: that is to say, a work containing the Library or a
portion of it, either verbatim or with modifications and/or translated
straightforwardly into another language. (Hereinafter, translation is
included without limitation in the term "modification".)
"Source code" for a work means the preferred form of the work for
making modifications to it. For a library, complete source code means
all the source code for all modules it contains, plus any associated
interface definition files, plus the scripts used to control compilation
and installation of the library.
Activities other than copying, distribution and modification are not
covered by this License; they are outside its scope. The act of
running a program using the Library is not restricted, and output from
such a program is covered only if its contents constitute a work based
on the Library (independent of the use of the Library in a tool for
writing it). Whether that is true depends on what the Library does
and what the program that uses the Library does.
1. You may copy and distribute verbatim copies of the Library's
complete source code as you receive it, in any medium, provided that
you conspicuously and appropriately publish on each copy an
appropriate copyright notice and disclaimer of warranty; keep intact
all the notices that refer to this License and to the absence of any
warranty; and distribute a copy of this License along with the
Library.
You may charge a fee for the physical act of transferring a copy,
and you may at your option offer warranty protection in exchange for a
fee.
2. You may modify your copy or copies of the Library or any portion
of it, thus forming a work based on the Library, and copy and
distribute such modifications or work under the terms of Section 1
above, provided that you also meet all of these conditions:
a) The modified work must itself be a software library.
b) You must cause the files modified to carry prominent notices
stating that you changed the files and the date of any change.
c) You must cause the whole of the work to be licensed at no
charge to all third parties under the terms of this License.
d) If a facility in the modified Library refers to a function or a
table of data to be supplied by an application program that uses
the facility, other than as an argument passed when the facility
is invoked, then you must make a good faith effort to ensure that,
in the event an application does not supply such function or
table, the facility still operates, and performs whatever part of
its purpose remains meaningful.
(For example, a function in a library to compute square roots has
a purpose that is entirely well-defined independent of the
application. Therefore, Subsection 2d requires that any
application-supplied function or table used by this function must
be optional: if the application does not supply it, the square
root function must still compute square roots.)
These requirements apply to the modified work as a whole. If
identifiable sections of that work are not derived from the Library,
and can be reasonably considered independent and separate works in
themselves, then this License, and its terms, do not apply to those
sections when you distribute them as separate works. But when you
distribute the same sections as part of a whole which is a work based
on the Library, the distribution of the whole must be on the terms of
this License, whose permissions for other licensees extend to the
entire whole, and thus to each and every part regardless of who wrote
it.
Thus, it is not the intent of this section to claim rights or contest
your rights to work written entirely by you; rather, the intent is to
exercise the right to control the distribution of derivative or
collective works based on the Library.
In addition, mere aggregation of another work not based on the Library
with the Library (or with a work based on the Library) on a volume of
a storage or distribution medium does not bring the other work under
the scope of this License.
3. You may opt to apply the terms of the ordinary GNU General Public
License instead of this License to a given copy of the Library. To do
this, you must alter all the notices that refer to this License, so
that they refer to the ordinary GNU General Public License, version 2,
instead of to this License. (If a newer version than version 2 of the
ordinary GNU General Public License has appeared, then you can specify
that version instead if you wish.) Do not make any other change in
these notices.
Once this change is made in a given copy, it is irreversible for
that copy, so the ordinary GNU General Public License applies to all
subsequent copies and derivative works made from that copy.
This option is useful when you wish to copy part of the code of
the Library into a program that is not a library.
4. You may copy and distribute the Library (or a portion or
derivative of it, under Section 2) in object code or executable form
under the terms of Sections 1 and 2 above provided that you accompany
it with the complete corresponding machine-readable source code, which
must be distributed under the terms of Sections 1 and 2 above on a
medium customarily used for software interchange.
If distribution of object code is made by offering access to copy
from a designated place, then offering equivalent access to copy the
source code from the same place satisfies the requirement to
distribute the source code, even though third parties are not
compelled to copy the source along with the object code.
5. A program that contains no derivative of any portion of the
Library, but is designed to work with the Library by being compiled or
linked with it, is called a "work that uses the Library". Such a
work, in isolation, is not a derivative work of the Library, and
therefore falls outside the scope of this License.
However, linking a "work that uses the Library" with the Library
creates an executable that is a derivative of the Library (because it
contains portions of the Library), rather than a "work that uses the
library". The executable is therefore covered by this License.
Section 6 states terms for distribution of such executables.
When a "work that uses the Library" uses material from a header file
that is part of the Library, the object code for the work may be a
derivative work of the Library even though the source code is not.
Whether this is true is especially significant if the work can be
linked without the Library, or if the work is itself a library. The
threshold for this to be true is not precisely defined by law.
If such an object file uses only numerical parameters, data
structure layouts and accessors, and small macros and small inline
functions (ten lines or less in length), then the use of the object
file is unrestricted, regardless of whether it is legally a derivative
work. (Executables containing this object code plus portions of the
Library will still fall under Section 6.)
Otherwise, if the work is a derivative of the Library, you may
distribute the object code for the work under the terms of Section 6.
Any executables containing that work also fall under Section 6,
whether or not they are linked directly with the Library itself.
6. As an exception to the Sections above, you may also combine or
link a "work that uses the Library" with the Library to produce a
work containing portions of the Library, and distribute that work
under terms of your choice, provided that the terms permit
modification of the work for the customer's own use and reverse
engineering for debugging such modifications.
You must give prominent notice with each copy of the work that the
Library is used in it and that the Library and its use are covered by
this License. You must supply a copy of this License. If the work
during execution displays copyright notices, you must include the
copyright notice for the Library among them, as well as a reference
directing the user to the copy of this License. Also, you must do one
of these things:
a) Accompany the work with the complete corresponding
machine-readable source code for the Library including whatever
changes were used in the work (which must be distributed under
Sections 1 and 2 above); and, if the work is an executable linked
with the Library, with the complete machine-readable "work that
uses the Library", as object code and/or source code, so that the
user can modify the Library and then relink to produce a modified
executable containing the modified Library. (It is understood
that the user who changes the contents of definitions files in the
Library will not necessarily be able to recompile the application
to use the modified definitions.)
b) Use a suitable shared library mechanism for linking with the
Library. A suitable mechanism is one that (1) uses at run time a
copy of the library already present on the user's computer system,
rather than copying library functions into the executable, and (2)
will operate properly with a modified version of the library, if
the user installs one, as long as the modified version is
interface-compatible with the version that the work was made with.
c) Accompany the work with a written offer, valid for at
least three years, to give the same user the materials
specified in Subsection 6a, above, for a charge no more
than the cost of performing this distribution.
d) If distribution of the work is made by offering access to copy
from a designated place, offer equivalent access to copy the above
specified materials from the same place.
e) Verify that the user has already received a copy of these
materials or that you have already sent this user a copy.
For an executable, the required form of the "work that uses the
Library" must include any data and utility programs needed for
reproducing the executable from it. However, as a special exception,
the materials to be distributed need not include anything that is
normally distributed (in either source or binary form) with the major
components (compiler, kernel, and so on) of the operating system on
which the executable runs, unless that component itself accompanies
the executable.
It may happen that this requirement contradicts the license
restrictions of other proprietary libraries that do not normally
accompany the operating system. Such a contradiction means you cannot
use both them and the Library together in an executable that you
distribute.
7. You may place library facilities that are a work based on the
Library side-by-side in a single library together with other library
facilities not covered by this License, and distribute such a combined
library, provided that the separate distribution of the work based on
the Library and of the other library facilities is otherwise
permitted, and provided that you do these two things:
a) Accompany the combined library with a copy of the same work
based on the Library, uncombined with any other library
facilities. This must be distributed under the terms of the
Sections above.
b) Give prominent notice with the combined library of the fact
that part of it is a work based on the Library, and explaining
where to find the accompanying uncombined form of the same work.
8. You may not copy, modify, sublicense, link with, or distribute
the Library except as expressly provided under this License. Any
attempt otherwise to copy, modify, sublicense, link with, or
distribute the Library is void, and will automatically terminate your
rights under this License. However, parties who have received copies,
or rights, from you under this License will not have their licenses
terminated so long as such parties remain in full compliance.
9. You are not required to accept this License, since you have not
signed it. However, nothing else grants you permission to modify or
distribute the Library or its derivative works. These actions are
prohibited by law if you do not accept this License. Therefore, by
modifying or distributing the Library (or any work based on the
Library), you indicate your acceptance of this License to do so, and
all its terms and conditions for copying, distributing or modifying
the Library or works based on it.
10. Each time you redistribute the Library (or any work based on the
Library), the recipient automatically receives a license from the
original licensor to copy, distribute, link with or modify the Library
subject to these terms and conditions. You may not impose any further
restrictions on the recipients' exercise of the rights granted herein.
You are not responsible for enforcing compliance by third parties with
this License.
11. If, as a consequence of a court judgment or allegation of patent
infringement or for any other reason (not limited to patent issues),
conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot
distribute so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you
may not distribute the Library at all. For example, if a patent
license would not permit royalty-free redistribution of the Library by
all those who receive copies directly or indirectly through you, then
the only way you could satisfy both it and this License would be to
refrain entirely from distribution of the Library.
If any portion of this section is held invalid or unenforceable under any
particular circumstance, the balance of the section is intended to apply,
and the section as a whole is intended to apply in other circumstances.
It is not the purpose of this section to induce you to infringe any
patents or other property right claims or to contest validity of any
such claims; this section has the sole purpose of protecting the
integrity of the free software distribution system which is
implemented by public license practices. Many people have made
generous contributions to the wide range of software distributed
through that system in reliance on consistent application of that
system; it is up to the author/donor to decide if he or she is willing
to distribute software through any other system and a licensee cannot
impose that choice.
This section is intended to make thoroughly clear what is believed to
be a consequence of the rest of this License.
12. If the distribution and/or use of the Library is restricted in
certain countries either by patents or by copyrighted interfaces, the
original copyright holder who places the Library under this License may add
an explicit geographical distribution limitation excluding those countries,
so that distribution is permitted only in or among countries not thus
excluded. In such case, this License incorporates the limitation as if
written in the body of this License.
13. The Free Software Foundation may publish revised and/or new
versions of the Lesser General Public License from time to time.
Such new versions will be similar in spirit to the present version,
but may differ in detail to address new problems or concerns.
Each version is given a distinguishing version number. If the Library
specifies a version number of this License which applies to it and
"any later version", you have the option of following the terms and
conditions either of that version or of any later version published by
the Free Software Foundation. If the Library does not specify a
license version number, you may choose any version ever published by
the Free Software Foundation.
14. If you wish to incorporate parts of the Library into other free
programs whose distribution conditions are incompatible with these,
write to the author to ask for permission. For software which is
copyrighted by the Free Software Foundation, write to the Free
Software Foundation; we sometimes make exceptions for this. Our
decision will be guided by the two goals of preserving the free status
of all derivatives of our free software and of promoting the sharing
and reuse of software generally.
NO WARRANTY
15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
DAMAGES.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Libraries
If you develop a new library, and you want it to be of the greatest
possible use to the public, we recommend making it free software that
everyone can redistribute and change. You can do so by permitting
redistribution under these terms (or, alternatively, under the terms of the
ordinary General Public License).
To apply these terms, attach the following notices to the library. It is
safest to attach them to the start of each source file to most effectively
convey the exclusion of warranty; and each file should have at least the
"copyright" line and a pointer to where the full notice is found.
<one line to give the library's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Also add information on how to contact you by electronic and paper mail.
You should also get your employer (if you work as a programmer) or your
school, if any, to sign a "copyright disclaimer" for the library, if
necessary. Here is a sample; alter the names:
Yoyodyne, Inc., hereby disclaims all copyright interest in the
library `Frob' (a library for tweaking knobs) written by James Random Hacker.
<signature of Ty Coon>, 1 April 1990
Ty Coon, President of Vice
That's all there is to it!

View File

@ -0,0 +1,108 @@
#
# Design for Repseek
# compile all src and output one binary
# <amikezor>
# April 2004
#
# MACHINE = SUN
# MACHINE = SGI
# MACHINE = LINUX
# MACHINE = OSF
MACHINE = MACOSX
# ???: MALLOC=-lmalloc
# others: MALLOC =
# macosx debug: MALLOC = -lMallocDebug
MALLOC=
# SGI: PROTO= PROTO=1
# others: PROTO= PROTO=0
PROTO= PROTO=0
# SGI: RANLIB= touch
# others: RANLIB= ranlib
RANLIB= ranlib
# Users can choose:
#CC= cc
CC= gcc
CFLAGS= -O4 -Wall
#CFLAGS= -O2
#CFLAGS= -pg
#CFLAGS= -g
INSTALLDIR = $$HOME/bin
##### defined
SHELL = bash
LDFLAGS = -lm $(MALLOC)
#LDFLAGS = -g -lm $(MALLOC)
SRC = sort.c\
help.c\
output.c\
filter2seq.c \
filter.c \
families.c \
families_2seqs.c\
memory.c\
memory_align.c\
sequence.c\
readfst.c\
lmin.c \
smin.c\
KMRK.c \
KMRK_Seeds.c \
KMRK_merge_seeds.c\
KMRK_mask.c\
align_matrix.c\
align_seeds.c\
align_di.c\
align_blast2like.c\
read_seeds.c
OBJ = $(SRC:.c=.o)
## Rules
default:
@echo "++ Repeats Search Engines ++"
@echo "edit Makefile and set MACHINE, RANLIB, PROTO, CC, CFLAGS and eventually MALLOC"
@echo " "
@echo "To compile: make repseek"
@echo "To clean: make clean"
%.o: %.c
$(CC) $(CFLAGS) -D$(MACHINE) -c -o $@ $<;
repseek: $(OBJ) main_repseek.c
$(CC) $(CFLAGS) -D$(MACHINE) -o $@ $(OBJ) main_repseek.c $(LDFLAGS);
install: repseek
cp repseek $(INSTALLDIR)
clean:
+rm -f *.o repseek
archive:
\rm -rf RepSeek; mkdir RepSeek
cp -r $(SRC) *.h Makefile main_repseek.c *.txt RepSeek;
tmp=`date | awk '{print $$3$$2$$NF}'`;tar czvf RepSeek.$$tmp.tgz RepSeek;
\rm -rf RepSeek

View File

@ -0,0 +1,32 @@
/*
<-- RepSeek, a repeat search engine -->
Some help to compile the repseek program
*/
1. Untar, and go the newly created directory repseek
2. Edit the Makefile, and fill the fields MACHINE, MALLOC, PROTO, INSTALLDIR
and change any other field you want to.
3. Type 'make repseek' to compile the repseek binary, 'make install' to install the program
in your ~/bin and 'make clean' to clean all object and the binary.
4. You can create an archive of RepSeek by typing 'make archive'
5. General usage is 'repseek [-v|-h|-H] [ opts ] { core_value } file.fst [file2.fst]'
Core values (select either -l, -p or -P option)
-l lmin : set a minimum length for the seeds detection (no pvalue). Also usuable along with -P.
-p prob : set a p-value that gives a minimum seed length (Karlin & Ost equation)
-P prob : set a p-value that gives a minimum repeat score (Waterman & Vingron regression)
Information
-v : Print 'v'ersion and exit
-h : Print a short 'h'elp and exit
-H : Print a larger 'H'elp on a specific mode and exit
6. For more details on repseek insights, please refer to the repseek documentation
7. For License, Repseek is under Lesser GPL. For more details, see the License.txt file

View File

@ -0,0 +1,67 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
#ifndef _ALIGN_H_
#define _ALIGN_H_
/*
Macros to turn any char from A-Z to a symb 0-26 (for S&W scoring matrix)
*/
#define CHAR2SYMB( A ) ( (A) - 65 )
#define SYMB2CHAR( i ) ( (i) + 65 )
#define Match(A,B) pScoring->matrix[ CHAR2SYMB(seq1[ (A)-1 ]) ][ CHAR2SYMB(seq2[ (B)-1 ]) ]
/*
The Seed2Repeat part
*/
Repeats align_seeds(AllSeeds_type *AllSeeds, char *sequence, float Xg, float gap_open, float gap_ext,\
char matrix_type, int16_t Lmin, int8_t opt_dir, int8_t opt_inv, int8_t opt_overlap, float merge_repeats);
Repeats align_seeds_2seq(AllSeeds_type *AllSeeds, char *seq1, char *seq2, float Xg, float gap_open, float gap_ext, char matrix_type,\
int16_t Lmin, int8_t opt_dir, int8_t opt_inv, float merge_repeats);
Rep alignd(int32_t pos1, int32_t pos2, int32_t len, char *sequence, int32_t sizeseq,
float Xg, SCORING *pScoring, RESULTS *pResults , int8_t opt_overlap);
Rep aligni(int32_t pos1, int32_t pos2, int32_t len, char *sequence, char *invsequence, int32_t sizeseq,\
float Xg, SCORING *pScoring, RESULTS *pResults );
Rep alignd_2seq(int32_t pos1, int32_t pos2, int32_t len, char *seq1, char *seq2, \
int32_t size1, int32_t size2, float Xg, SCORING *pScoring, RESULTS *pResults );
Rep aligni_2seq(int32_t pos1, int32_t pos2, int32_t len, char *seq1, char *invseq2, \
int32_t size1, int32_t size2, float Xg, SCORING *pScoring, RESULTS *pResults );
/*
The alignemnt itself
*/
void align_blast2like(char *seq1, char *seq2, float Xg, SCORING *pScoring,\
RESULTS *pResults, char direction, int32_t max_row, int32_t max_col, int32_t diff_max);
void log_matrix(char *sequence, float gap_open, float gap_ext, SCORING *pScoring );
void identity_matrix(char *sequence, float gap_open, float gap_ext, SCORING *pScoring);
#endif /*_ALIGN_H_*/

View File

@ -0,0 +1,445 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/******
file : align_blast2like.c
function : align 2 sequence by blast2 like algo
first 0,0 is a MATCH
created : 07 jul 2003
modified : July 2003, July 2004: turn int_32 scores into doubles;
Aug 2004: Update the BestScore when it is ">=" (instead of ">")
Sept 2005: store the number of steps
Jan 2006: correct a bug in memory reading
Mar 2006: correct a bug whenever only 1 cell was allowed + change align_blast2like (use max_row and maw_col instead of dist_max)
author : amikezor
*****/
#include "repseek_types.h"
#include "memory_align.h"
#include "align.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
Determine three scores (if any available)
*/
static int8_t IsScoreLeft(int32_t row, int32_t col, RESULTS pResults)
{
if( col>pResults.Segment_begin[row] )
return 1;
else
return 0;
}
static int8_t IsScoreAbove(int32_t row, int32_t col, RESULTS pResults)
{
if( row>0 && col >= pResults.Segment_begin[row-1] && col <= pResults.Segment_end[row-1] )
return 1;
else
return 0;
}
static int8_t IsScoreDiag(int32_t row, int32_t col, RESULTS pResults)
{
if( row>0 && col > pResults.Segment_begin[row-1] && col <= pResults.Segment_end[row-1]+1 )
return 1;
else
return 0;
}
/*
In case of update bestscore infos
*/
static void update_Bestscore(int32_t row, int32_t col, RESULTS *pResults){
pResults->pBestScore = pResults->pscore;
pResults->BestScore_row = row;
pResults->BestScore_col = col;
}
/*
The fill 'matrix' function - even if there is no more matrix here
*/
#define BEGIN pResults->Segment_begin
#define END pResults->Segment_end
#define CURRENT pResults->pscore
#define nABOVE pResults->Top
#define nLEFT pResults->Left
#define SCORE_FROM_LEFT( X ) (*(CURRENT - X))
#define SCORE_FROM_ABOVE( row, col ) (*(CURRENT - (col-BEGIN[row]) - (END[row-1]-col+1) ) )
#define SCORE_FROM_DIAG( row, col ) (*(CURRENT - (col-BEGIN[row]) - (END[row-1]-col+1) -1 ) )
#define MATCH_F(A,B) ( pScoring->matrix[ CHAR2SYMB(seq1[ (A) ]) ][ CHAR2SYMB(seq2[ (B) ]) ])
#define MATCH_B(A,B) ( pScoring->matrix[ CHAR2SYMB( *(seq1-(A)) ) ][ CHAR2SYMB( *(seq2-(B)) ) ] )
#define MATCH(A,B) ( (direction=='f')? MATCH_F(A,B):MATCH_B(A,B) )
/*
In this version,
Sequence 2 is on TOP and Sequence 1 on the LEFT
In the trace matrix, a 0 means diagonal,
#>0 means # steps from above and
#<0 translates into # steps from left
*/
#define SCORE_TOP pResults->F
static void fill_matrices(char *seq1, char *seq2, SCORING *pScoring, RESULTS *pResults,
float Xg, int32_t max_col, int32_t max_row, int32_t diff_max, char direction)
{
int32_t col, row; /* current row and col */
double Score_left=0.0; /* score coming from left */
double Score_diag=0.0; /* score coming from diagonal */
int8_t IsDiag, /* markers to check if the cell in Diag, Above or Left does exist ? */
IsAbove,
IsLeft;
CURRENT=pResults->scores; /* set the current score at the begining of the scores */
row=col=0; /* set the col and row */
BEGIN[0]=END[0]=0; /* initialize the arrays */
/*
Set the first Score - necessarily a match - the last/first bp of the seed
*/
*(CURRENT)=MATCH(0,0);
pResults->traces[ 0 ] = 0;
update_Bestscore(row, col, pResults);
nABOVE[0]=0;
SCORE_TOP[0]=*(CURRENT);
/* printf("%c --> [%d,%d]:%f ",direction, row,col,*CURRENT); */
/*
First line
*/
col=1;
CURRENT=pResults->scores+1;
while( *(CURRENT-1) > *pResults->pBestScore-Xg && col<max_col ){
if( CURRENT - pResults->scores >= pResults->max_scores) /* Check Mem and if needed realloc */
ReallocScores(pResults, 0);
if( col >= pResults->max_col )
ReallocAbove(pResults, 0);
if(col==1) *CURRENT = *(CURRENT-1) + pScoring->gap_open;
else *CURRENT = *(CURRENT-1) + pScoring->gap_ext;
pResults->traces[ (CURRENT-pResults->scores) ] = -1;
nABOVE[col]=0; /* set the nABOVE[col] to 0 */
SCORE_TOP[col]=*(CURRENT);
/*fprintf(stderr,"[%d,%d]:%f vs. %f (col-1: %f) / Trace=%d\n", row,col,*CURRENT, *pResults->pBestScore-Xg, *(CURRENT-1),pResults->traces[ (CURRENT-pResults->scores) ]); */
col++; /* increase col and current score */
CURRENT++;
if(direction == 'b' && col > diff_max )break;
}
END[row]=col-1; /* ignore the last score (inferior to the limit) */
/* the last CURRENT value will be erased on next value */
/* printf("Row %d was from %d to %d\n",row, BEGIN[row], END[row]); */
/*
Then fill the whole 'matrix' - the matrix being an array of stripes
*/
while( BEGIN[row] < END[row] && row < max_row ){
row++;
if(row >= pResults->max_row)
ReallocBegEnd(pResults, 0);
END[row]=BEGIN[row]=BEGIN[row-1];
col=BEGIN[row];
/*
lets go for the row/segment
*/
while( col <= max_col ){
if( diff_max>0 && ((direction=='f' && row-col >= diff_max) || /* do not want chromosome aligned with itself */
(direction=='b' && col-row >= diff_max)) )
break;
if(diff_max<0 && col+row > -(diff_max) ) /* do not want plaindromes aligned with itself */
break;
if( CURRENT - pResults->scores >= pResults->max_scores) /* Check Mem and if needed realloc */
ReallocScores(pResults, 0);
if( col >= pResults->max_col)
ReallocAbove(pResults, 0);
IsDiag=IsScoreDiag(row, col, *pResults); /* Look for each potential score */
IsAbove=IsScoreAbove(row, col, *pResults);
IsLeft=IsScoreLeft(row, col, *pResults);
/*
Get the three scores (if they exist)
as well as the three
*/
if( IsLeft ){
if( nLEFT>0 && Score_left+pScoring->gap_ext > SCORE_FROM_LEFT(1)+pScoring->gap_open ){
Score_left += pScoring->gap_ext;
nLEFT++;
}
else{
Score_left = SCORE_FROM_LEFT(1) + pScoring->gap_open;
nLEFT=1;
}
}
else
nLEFT = 0;
if( IsAbove ){
if( nABOVE[col]>0 && SCORE_TOP[col]+pScoring->gap_ext > SCORE_FROM_ABOVE(row,col)+ pScoring->gap_open){
SCORE_TOP[col] += pScoring->gap_ext;
nABOVE[col]++;
}
else{
SCORE_TOP[col] = SCORE_FROM_ABOVE(row,col) + pScoring->gap_open;
nABOVE[col]=1;
}
}
else
nABOVE[col] = 0;
if( IsDiag )
Score_diag=SCORE_FROM_DIAG(row,col)+MATCH(row,col);
/*
Compare the existing Scores
*/
if( IsDiag && /* The SCORE_DIAG is defined ..., then */
(
(!IsLeft && !IsAbove ) || /* no other scores are defined */
(IsLeft && !IsAbove && Score_diag>=Score_left ) || /* bigger than Score_left */
(!IsLeft && IsAbove && Score_diag>=SCORE_TOP[col] ) || /* bigger than SCORE_TOP */
(IsLeft && IsAbove && Score_diag>=Score_left && Score_diag>=SCORE_TOP[col] ) /* bigger than both */
)
){
*CURRENT = Score_diag; /* diag is eq or sup to others */
pResults->traces[ CURRENT-pResults->scores ] = 0;
}
else if( IsAbove && /* The SCORE_ABOVE is defined ..., then */
(
(!IsLeft) || /* no other value */
(IsLeft && SCORE_TOP[col]>Score_left ) /* bigger than Score_left */
)
){
*CURRENT = SCORE_TOP[col]; /* Score_top is better than Score_left */
pResults->traces[ CURRENT-pResults->scores ] = nABOVE[col];
}
else{
*CURRENT = Score_left; /* unknown : we don't know: both way are equal */
pResults->traces[ CURRENT-pResults->scores ] = -nLEFT; /* arbitrarily, choose the left one */
}
/* printf("[%d:%d]: Curr=%f vs. Best=%f ; Trace= %d\n\n",row,col,*CURRENT, *pResults->pBestScore-Xg, pResults->traces[ CURRENT-pResults->scores ]); */
/*
Compare the current score to the limit
*/
if( *CURRENT > *pResults->pBestScore - (double)Xg ){ /* If it is bigger than the limit */
/* fprintf(stderr,"[%d,%d]:%f;%c ",row,col,*CURRENT, pResults->traces[ CURRENT-pResults->scores ]); */
if( *CURRENT >= *pResults->pBestScore ) /* keep it, and eventually set it as the Best_Score */
update_Bestscore(row, col, pResults);
END[row]=col; /* if match it ends at least here */
CURRENT++; /* get the score */
}
else{ /* see where we are */
/* printf("%ld:%ld -> last_row= %ld\n", row, col, END[row-1]); */
if( col == BEGIN[row] )
BEGIN[row]=col+1; /* set begin at next and ... */
if( col > BEGIN[row] && col <= END[row-1]){
CURRENT++; /* then look at the next score */
}
if(col > END[row-1])
break; /* then just stop that loop */
}
col++;
}
/* printf("Row %d was from %d to %d\n\n",row, BEGIN[row], END[row]);*/
CURRENT-=(col-END[row]-1);
/* fprintf(stderr,"/// BestScore= %f\n",*pResults->pBestScore); */
}
CURRENT--;
pResults->nSegment=row+1; /* Just remember how many segments (rows) */
}
/*
The traceback function
it always read the trace matrix in this new form
*/
static void traceback( char *seq1, char *seq2, SCORING * pScoring, RESULTS *pResults , char direction){
int32_t trace=0; /* the current trace */
int32_t row, /* current row and cols */
col;
int32_t count=0; /* the total number of step of traceback */
int32_t *ptraces= pResults->traces + (pResults->pBestScore - pResults->scores); /* where we start, a pointer to the trace arrays */
int n;
if(direction == 'f')
pResults->traceback=pResults->traceback_f;
else if(direction == 'b')
pResults->traceback=pResults->traceback_b;
else
fprintf(stderr,"traceback: direction is either 'f' or 'b', bye\n"),exit(4);
row = pResults->BestScore_row; /* set cursor at the extension end -- ! 0,0 if no similarity -- */
col = pResults->BestScore_col;
pResults->matches=0;
pResults->alignment_len=0;
while( row>0 || col>0 ){
/*
Set trace
*/
trace = *ptraces; /* read the current trace */
/*
fprintf(stderr,"[%d,%d] = %ld\n", row, col, trace);
if(row<-2 || col<-2)
exit(1);
*/
if(trace == 0){ /* if it is diagonal */
if( ( direction=='f' && *(seq1+row)==*(seq2+col)) || \
( direction=='b' && *(seq1-row)==*(seq2-col)) )
(pResults->matches)++;
ptraces -= (col-BEGIN[row]) + (END[row-1]-col+1) + 1;
row--;
col--;
pResults->traceback[count++]= 'd';
}
else if(trace < 0 ){ /* we are going to left */
ptraces += trace;
col += trace; /* think that trace is negative */
for(n=0;n<ABS(trace);n++)
pResults->traceback[count++]= 'l';
}
else { /* choose above by default */
for(n=0 ; n<trace ; n++){
ptraces -= (col-BEGIN[row]) + (END[row-1]-col+1);
row--;
pResults->traceback[count++]= 'a';
}
}
if(count >= pResults->max_alignlen-1)
ReallocTraceback(pResults, 0, direction);
}
pResults->traceback[count++] = 'd'; /* add the last bit from the first cell to the previous */
pResults->matches++; /* it has to be a match --it is the seed end-- */
pResults->alignment_len = count;
pResults->traceback[count] = 0;
}
#undef CURRENT
#undef nABOVE
#undef nLEFT
#undef BEGIN
#undef END
#undef SCORE_FROM_LEFT
#undef SCORE_FROM_ABOVE
#undef SCORE_DIAG
#undef MATCH_F
#undef MATCH_B
#undef MATCH
/*
If diff_max is negatif, then we are in inverted and do not want palindromes
otherwise diff_max is positive and is defined to avoid aligning the chromosome with itself
row is seq1 and col is seq2
*/
void align_blast2like(char *seq1, char *seq2, float Xg, SCORING *pScoring, RESULTS *pResults, char direction, int32_t max_row, int32_t max_col, int32_t diff_max){
if(direction != 'f' && direction != 'b')
fprintf(stderr,"align_blast2like: align 'f'orward or 'b'ackward, bye\n"),exit(4);
fill_matrices(seq1, seq2, pScoring, pResults, Xg, max_col, max_row, diff_max, direction);
traceback( seq1, seq2, pScoring, pResults, direction);
}

View File

@ -0,0 +1,435 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/******
file : align_di.c
function : get a seeds, align it forward and backward and return the aligned repeat
created : Apr 11 2001
modif : Jul 2003,Feb 2004, April 2004, July 2004: turn scores into doubles
author : amikezor
*****/
#include "repseek_types.h"
#include "align.h"
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
/*
Get the score of a seed
first and ast position are not taken into account since they are used in the alignments
*/
static double score_seed(int32_t pos, int32_t len, char *sequence, SCORING *pScoring){
char *pseq; /* pointer to the seqeunce */
double score=0; /* the score */
int32_t index=0; /* the symbol of the character in the reward matrix */
pseq = sequence+pos;
while(pseq < sequence+pos+len-1){
index = CHAR2SYMB(*pseq);
score += pScoring->matrix[index][index];
pseq++;
}
return score;
}
/*
The repeat is a palindrome
*/
static Rep Palindome(int32_t pos1, int32_t pos2, int32_t len, char *sequence, SCORING *pScoring){
Rep aligned_seed;
aligned_seed.seed_pos1 = pos1;
aligned_seed.seed_pos2 = pos2;
aligned_seed.seed_len = len;
aligned_seed.pos1 = pos1;
aligned_seed.len1 = len/2;
aligned_seed.pos2 = pos1+aligned_seed.len1;
aligned_seed.len2 = len/2;
aligned_seed.match = len/2;
aligned_seed.align_len = len/2;
strcpy(aligned_seed.type, "Palindrome");
aligned_seed.score = score_seed(pos1, len, sequence, pScoring);
return aligned_seed;
}
/*
Init the repeats value with a seed
first and last position are removed since they are used in the alignements
*/
static void InitRep(int32_t pos1, int32_t pos2, int32_t len, double ScoreSeed, Rep *aligned_seed ){
aligned_seed->pos1=pos1+1;
aligned_seed->pos2=pos2+1;
aligned_seed->len1= len-2; /* len of seed + aligned_genomik */
aligned_seed->len2= len-2;
aligned_seed->seed_pos1 = pos1;
aligned_seed->seed_pos2 = pos2;
aligned_seed->seed_len = len;
aligned_seed->match = len-2;
aligned_seed->align_len = len-2;
aligned_seed->score = ScoreSeed;
}
/*
Update the repeats from we have on the right - forward-
Only for direct repeats
*/
static void UpdateRepRight(Rep *aligned_seed , RESULTS *pResults){
aligned_seed->len1 += pResults->BestScore_row+1; /* len of seed + aligned_genomik */
aligned_seed->len2 += pResults->BestScore_col+1;
aligned_seed->match += pResults->matches;
aligned_seed->align_len += pResults->alignment_len;
aligned_seed->score += *pResults->pBestScore;
}
/*
Update the repeats from we have on the left - backward -
Only for direct repeats
*/
static void UpdateRepLeft(Rep *aligned_seed , RESULTS *pResults){
aligned_seed->pos1 -= pResults->BestScore_row+1; /* len of seed + aligned_genomik */
aligned_seed->pos2 -= pResults->BestScore_col+1;
aligned_seed->len1 += pResults->BestScore_row+1; /* len of seed + aligned_genomik */
aligned_seed->len2 += pResults->BestScore_col+1;
aligned_seed->match += pResults->matches;
aligned_seed->align_len += pResults->alignment_len;
aligned_seed->score += *pResults->pBestScore;
}
/*
Same functions but for inverted repeats
*/
static void UpdateRepRightInv(Rep *aligned_seed , RESULTS *pResults){
aligned_seed->len1 += pResults->BestScore_row+1; /* len of seed + aligned_genomik */
aligned_seed->pos2 -= pResults->BestScore_col+1;
aligned_seed->len2 += pResults->BestScore_col+1;
aligned_seed->match += pResults->matches;
aligned_seed->align_len += pResults->alignment_len;
aligned_seed->score += *pResults->pBestScore;
}
static void UpdateRepLeftInv(Rep *aligned_seed , RESULTS *pResults){
aligned_seed->pos1 -= pResults->BestScore_row+1; /* len of seed + aligned_genomik */
aligned_seed->len1 += pResults->BestScore_row+1; /* len of seed + aligned_genomik */
aligned_seed->len2 += pResults->BestScore_col+1;
aligned_seed->match += pResults->matches;
aligned_seed->align_len += pResults->alignment_len;
aligned_seed->score += *pResults->pBestScore;
}
/***
The function that return an aligned Rep from a cupple of seed information
Moreover all Struct have to be init before
***/
Rep alignd(int32_t pos1, int32_t pos2, int32_t len, char *sequence, int32_t sizeseq ,
float Xg, SCORING *pScoring, RESULTS *pResults , int8_t opt_overlap){
Rep aligned_seed; /* the repeat */
char *pseq1, *pseq2; /* pointer for the first and the second copy */
double ScoreSeed=0; /* score of the seed */
int32_t max_col=0, max_row=0; /* the dist max from the edge of the sequence max for rwo and col */
int32_t diff_max =0; /* the maximum difference between row and col to avoid aligning the chr with itself */
/*
Init by the seed
*/
ScoreSeed=score_seed(pos1+1, len-1, sequence, pScoring); /* pos1+1 and len1-1, since first bases are used into the alignement */
InitRep(pos1, pos2, len, ScoreSeed, &aligned_seed );
diff_max = pos2-pos1; /* if pos1 is bigger than pos2, then you align the chromosome with itself */
/*
If the seed is already overlapping and we do not want it
*/
if(opt_overlap == 0 && aligned_seed.seed_pos1+aligned_seed.seed_len >= aligned_seed.seed_pos2){
len=(pos2-pos1);
ScoreSeed=score_seed(pos1, len, sequence, pScoring);
InitRep(pos1-1, pos2-1, len+2, ScoreSeed, &aligned_seed );
strcpy(aligned_seed.type, "Tandem");
return aligned_seed;
}
/*
Align the right side -- forward --
*/
pseq1 = sequence+pos1+len-1; /* set the sequences */
pseq2 = sequence+pos2+len-1;
max_col = sizeseq-(pos2+len-1)-1; /* you cannot align after the chromosome end */
max_row = sizeseq-(pos1+len-1)-1;
if(opt_overlap == 0)
max_row = MIN2(pos2-(pos1+len-1) -1 , max_row); /* here, you cannot go inside the next copy */
align_blast2like(pseq1, pseq2, Xg, pScoring, pResults, 'f', max_row, max_col, diff_max);
UpdateRepRight(&aligned_seed, pResults);
/*
Align the left side -- backward --
*/
pseq1 = sequence+pos1; /* set the sequences */
pseq2 = sequence+pos2;
max_row = pos1;
max_col = pos2;
if(opt_overlap == 0)
max_col = MIN2( aligned_seed.pos2 - (aligned_seed.pos1 + aligned_seed.len1 - 1) -2 , max_col );
align_blast2like(pseq1, pseq2, Xg, pScoring, pResults, 'b', max_row, max_col, diff_max);
UpdateRepLeft(&aligned_seed, pResults);
if( aligned_seed.pos2 < aligned_seed.pos1+aligned_seed.len1 )
strcpy(aligned_seed.type, "Overlap");
else if( aligned_seed.pos2 == aligned_seed.pos1+aligned_seed.len1 )
strcpy(aligned_seed.type, "Tandem");
else if( aligned_seed.pos2 - (aligned_seed.pos1+aligned_seed.len1) < 1000 )
strcpy(aligned_seed.type, "Close" );
else
strcpy(aligned_seed.type, "Distant" );
return aligned_seed; /* return the repeat */
}
/*
Align some Inverted repeat
*/
Rep aligni(int32_t pos1, int32_t pos2, int32_t len, char *sequence, char *invsequence,\
int32_t sizeseq ,float Xg, SCORING *pScoring, RESULTS *pResults ){
Rep aligned_seed; /* the repeat */
char *pseq1, *pseq2; /* pointer for the first and the second copy */
double ScoreSeed=0; /* score of the seed */
int32_t max_row=0, max_col=0; /* the dist max from the edge of the sequence max for rwo and col */
int32_t diff_max =0; /* the maximum difference between row and col to
avoid aligning a palindrome with itself. !! For Inverted it takes NEGATIVE Values !! */
/*
Init by th seed
*/
ScoreSeed=score_seed(pos1+1, len-1, sequence, pScoring); /* pos1+1 and len1-1, since first bases are used into the alignement */
InitRep(pos1, pos2, len, ScoreSeed, &aligned_seed );
if(pos1 == pos2){
aligned_seed = Palindome(pos1, pos2, len, sequence, pScoring);
return aligned_seed;
}
/*
Align the right side -- forward for the first copy --
*/
pseq1 = sequence+pos1+len-1; /* set the sequences */
pseq2 = invsequence + sizeseq - (pos2+1);
diff_max = -(pos2-pos1-len); /* for inverted repeats, we do not want palindromes
so put it in negative is a code to say 'inverted' */
max_row = sizeseq-(pos1+len-1)-1;
max_col = pos2;
align_blast2like(pseq1, pseq2, Xg, pScoring, pResults, 'f', max_row, max_col, diff_max);
UpdateRepRightInv(&aligned_seed, pResults);
/*
Align the left side -- backward for the first copy --
*/
pseq1 = sequence+pos1; /* set the sequences */
pseq2 = invsequence+sizeseq-(pos2+len-1) -1;
max_row = pos1;
max_col = sizeseq-(pos2+len-1)-1;
diff_max = sizeseq; /* for inverted repeats, on left, it does not really matter */
align_blast2like(pseq1, pseq2, Xg, pScoring, pResults, 'b', max_row, max_col, diff_max);
UpdateRepLeftInv(&aligned_seed, pResults);
if(aligned_seed.pos2 == (aligned_seed.pos1+aligned_seed.len1))
strcpy(aligned_seed.type, "Palindrome");
else if(aligned_seed.pos2 - (aligned_seed.pos1+aligned_seed.len1) < 1000 )
strcpy(aligned_seed.type, "Close");
else
strcpy(aligned_seed.type, "Distant" );
return aligned_seed; /* return the repeat */
}
/*
Align repeats shared by 2 sequences
*/
Rep alignd_2seq(int32_t pos1, int32_t pos2, int32_t len, char *seq1, char *seq2, \
int32_t size1, int32_t size2, float Xg, SCORING *pScoring, RESULTS *pResults ){
Rep aligned_seed; /* the repeat */
char *pseq1, *pseq2; /* pointer for the first and the second copy */
double ScoreSeed=0; /* score of the seed */
int32_t max_row=0, max_col=0; /* the dist max from the edge of the sequence max for rwo and col */
int32_t diff_max =0; /* the maximum difference between row and col to avoid aligning the chr with itself */
/*
Init by the seed
*/
ScoreSeed=score_seed(pos1+1, len-1, seq1, pScoring); /* pos1+1 and len1-1, since first bases are used into the alignement */
InitRep(pos1, pos2, len, ScoreSeed, &aligned_seed );
diff_max = size1+size2; /* there is not limit here */
/*
Align the right side -- forward --
*/
pseq1 = seq1+pos1+len-1; /* set the sequences */
pseq2 = seq2+pos2+len-1;
max_row = size1-(pos1+len-1)-1;
max_col = size2-(pos2+len-1)-1; /* you cannot further than a chronmosome end */
align_blast2like(pseq1, pseq2, Xg, pScoring, pResults, 'f', max_row, max_col, diff_max);
UpdateRepRight(&aligned_seed, pResults);
/*
Align the left side -- backward --
*/
pseq1 = seq1+pos1; /* set the sequences */
pseq2 = seq2+pos2;
max_row = pos1;
max_col = pos2; /* you cannot further than a chronmosome begining */
align_blast2like(pseq1, pseq2, Xg, pScoring, pResults, 'b', max_row, max_col, diff_max);
UpdateRepLeft(&aligned_seed, pResults);
strcpy(aligned_seed.type, "Inter");
return aligned_seed; /* return the repeat */
}
/*
Align repeats shared by 2 sequences
*/
Rep aligni_2seq(int32_t pos1, int32_t pos2, int32_t len, char *seq1, char *invseq2, \
int32_t size1, int32_t size2, float Xg, SCORING *pScoring, RESULTS *pResults ){
Rep aligned_seed; /* the repeat */
char *pseq1, *pseq2; /* pointer for the first and the second copy */
double ScoreSeed=0; /* score of the seed */
int32_t max_row=0, max_col=0; /* the dist max from the edge of the sequence max for rwo and col */
int32_t diff_max =0; /* the maximum difference between row and col to
avoid aligning a palindrome with itself. !! For Inverted it takes NEGATIVE Values !! */
/*
Init by th seed
*/
ScoreSeed=score_seed(pos1+1, len-1, seq1, pScoring); /* pos1+1 and len1-1, since first bases are used into the alignement */
InitRep(pos1, pos2, len, ScoreSeed, &aligned_seed );
/*
Align the right side -- forward for the first copy --
*/
pseq1 = seq1+pos1+len-1; /* set the sequences */
pseq2 = invseq2 + size2 - (pos2+1);
diff_max = size1+size2; /* NO diff max for inter-repeats */
max_row = size1-(pos1+len-1)-1;
max_col = pos2;
align_blast2like(pseq1, pseq2, Xg, pScoring, pResults, 'f', max_row, max_col, diff_max);
UpdateRepRightInv(&aligned_seed, pResults);
/*
Align the left side -- backward for the first copy --
*/
pseq1 = seq1+pos1; /* set the sequences */
pseq2 = invseq2+size2-(pos2+len-1) -1;
max_row = pos1;
max_col = size2-(pos2+len-1)-1;
diff_max = size1+size2; /* it does not really matter */
align_blast2like(pseq1, pseq2, Xg, pScoring, pResults, 'b', max_row, max_col, diff_max);
UpdateRepLeftInv(&aligned_seed, pResults);
if(aligned_seed.pos2 == (aligned_seed.pos1+aligned_seed.len1))
strcpy(aligned_seed.type, "Inter");
return aligned_seed; /* return the repeat */
} /* return the repeat */

View File

@ -0,0 +1,276 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/******
file : matrix.c
function : create a log(freq) / identity matrix for DNA
created : 15 Oct 2002
modif : nov 12 2002: Set all score of X against anything to -1000; (usefull for mask and twoseq)
modif : nov 26 2002: get my new matrix format PtrConst->pScoring (for removing seqaln and my S&W)
modif : Oct 2003
modif : July 2004 : change the formula of the log matrix, mainly Match(N,i) is changed
modif : July 2004 : turn scores into doubles.
author : amikezor
*****/
#include "repseek_types.h"
#include "align.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
/***
calculate the frequency of all symbol
in DNA sequence
***/
static int32_t get_letter(int32_t letters[5], char *seq)
{
int32_t lseq=0;
int8_t letter;
letters[0] = letters[1] = letters[2] = letters[3] = letters[4] = 0;
while( (letter = *seq) != 0)
{
lseq++;
if (letter == 'A')
letters[0]++;
if (letter == 'C')
letters[1]++;
if (letter == 'G')
letters[2]++;
if (letter == 'T')
letters[3]++;
seq++;
}
letters[4] = lseq-letters[0]-letters[1]-letters[2]-letters[3]-letters[4];
return lseq;
}
/***
From a float return the closest int
***/
static int float2int(float a){
if(a<0){
if( ( a-(float)(int)a ) > -0.5 )
return (int)a;
else
return ((int)a-1);
}
else{
if( ( a-(float)(int)a ) < 0.5 )
return (int)a;
else
return ((int)a+1);
}
}
/*
Print the matrix if you want
*/
void printout_matrix2( SCORING *pScoring ){
int i,j;
fprintf(stderr," A B C D E F G H I J K L M N O P"
" Q R S T U V W X Y Z\n");
for (i=0; i<26; i++) {
fprintf(stderr,"%c ", SYMB2CHAR(i) );
for (j=0; j<26; j++)
if(pScoring->matrix[i][j]<100 && pScoring->matrix[i][j]>-100)
fprintf(stderr,"%5.2f",pScoring->matrix[i][j]);
else
fprintf(stderr,"%5.1g",pScoring->matrix[i][j]);
fprintf(stderr,"\n");
}
}
/*
Print the matrix if you want
*/
void printout_matrix( SCORING *pScoring ){
int i,j;
char carac[6]={'A', 'C', 'G', 'T', 'N', 'X'};
fprintf(stderr," ");
for (i=0; i<6; i++)
fprintf(stderr,"%6c", carac[i]);
fprintf(stderr,"\n");
for (i=0; i<6; i++) {
fprintf(stderr,"%-6c ", carac[i] );
for (j=0; j<6; j++)
if(pScoring->matrix[CHAR2SYMB(carac[i])][CHAR2SYMB(carac[j])]<100 &&
pScoring->matrix[CHAR2SYMB(carac[i])][CHAR2SYMB(carac[j])]>-100)
fprintf(stderr,"%6.2f",pScoring->matrix[CHAR2SYMB(carac[i])][CHAR2SYMB(carac[j])]);
else
fprintf(stderr,"%6.1g",pScoring->matrix[CHAR2SYMB(carac[i])][CHAR2SYMB(carac[j])]);
fprintf(stderr,"\n");
}
}
/***
Set up the matrix MATCH(i,j) = 1/2 * log4(pi*pj) - thought to be the more corrective
get all symbol frequency and estimate match(N,i) = pi * log(pi);
***/
void log_matrix(char *sequence, float gap_open, float gap_ext, SCORING *pScoring){
int32_t letters[5]={0,}; /* count of every letter */
double freq[4]= {0.0,}; /* frequency of nucleotides ACGT */
char carac[4]={'A','C','G','T'};
int32_t size; /* size of the seq including Ns ans Xs */
double temp; /* make the read easier */
int32_t size_eff; /* size_eff is the real number of non N nucleotide */
int16_t i=0, j=0, u; /* dummy counters */
for(i=0;i<26;i++) /* set the matrix to 0 */
for(j=0;j<26;j++)
pScoring->matrix[i][j]=0;
size = get_letter(letters, sequence); /* count all letters */
size_eff = size-letters[4]; /* count the size of ACGT symbols */
for(i=0;i<4;i++) /* get the frequencies */
freq[i]=(double)letters[i]/(double)size_eff;
/*
Set the matrix score
*/
for(i=0;i<4;i++){
for(j=0;j<4;j++){
temp = 0.5 * log10( freq[i]*freq[j] )/log10( 4.0 );
pScoring->matrix[ CHAR2SYMB(carac[i]) ][ CHAR2SYMB(carac[j]) ] = (i==j)? -temp: temp;
}
temp = - freq[i] * log10( freq[i] )/log10( 4.00 );
pScoring->matrix[CHAR2SYMB('N')][CHAR2SYMB(carac[i])] = temp;
pScoring->matrix[CHAR2SYMB(carac[i])][CHAR2SYMB('N')] = temp;
}
pScoring->matrix[CHAR2SYMB('N')][CHAR2SYMB('N')] = 0.25;
/*
set all X against anything to -1e6
*/
for(i=0;i<26;i++)
pScoring->matrix[i][CHAR2SYMB('X')] = pScoring->matrix[CHAR2SYMB('X')][i] = -1e6;
/*
Estimate the mean score of the matrix
*/
pScoring->expect=0.0;
for(i=0;i<4;i++)
for(u=0;u<4;u++) /* print out log(freq*freq) into the pScoring->matrix */
pScoring->expect += freq[i]*freq[u]*pScoring->matrix[CHAR2SYMB(carac[i])][CHAR2SYMB(carac[u])];
pScoring->gap_open = (gap_open<0)? gap_open : -gap_open;
pScoring->gap_ext = (gap_ext<0)? gap_ext : -gap_ext;
/*
printout_matrix( pScoring );
*/
}
/*
This matrix is
anything is +/- 1
N is 1
X is -1e6
*/
void identity_matrix(char *sequence, float gap_open, float gap_ext,SCORING *pScoring){
int16_t i,j,u;
int32_t letters[5]={0,};
double freq[4]= {0.0,}; /* frequency of nucleotides ACGT */
char carac[4] = {'A','C','G','T'};
int32_t size;
int32_t size_eff; /* size_eff is the real number of non N nucleotide */
for(i=0;i<26;i++) /* set the matrix to 0 */
for(j=0;j<26;j++)
pScoring->matrix[i][j]=0;
size = get_letter(letters, sequence); /* count all letters */
size_eff = size-letters[4]; /* count the size of ACGT symbols */
for(i=0;i<4;i++) /* get the frequencies */
freq[i]=(double)letters[i]/(double)size_eff;
for(i=0;i<4;i++)
for(u=0;u<4;u++)
pScoring->matrix[CHAR2SYMB(carac[i])][CHAR2SYMB(carac[u])]=(u==i)? 1 : -1;
for(i=0;i<26;i++) /* set all N against anything to +1 */
pScoring->matrix[i][CHAR2SYMB('N')] = pScoring->matrix[CHAR2SYMB('N')][i] = 0.25;
for(i=0;i<26;i++) /* set all X against anything to -1e6 */
pScoring->matrix[i][CHAR2SYMB('X')] = pScoring->matrix[CHAR2SYMB('X')][i] = -1e6;
pScoring->expect=0.0;
for(i=0;i<4;i++)
for(u=0;u<4;u++){ /* print out log(freq*freq) into the pScoring->matrix */
pScoring->expect += freq[i]*freq[u]*pScoring->matrix[CHAR2SYMB(carac[i])][CHAR2SYMB(carac[u])];
}
pScoring->gap_open = (gap_open>0)? -gap_open : gap_open;
pScoring->gap_ext = (gap_ext>0 )? -gap_ext : gap_ext;
/*
printout_matrix( pScoring );
*/
}

View File

@ -0,0 +1,562 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/******
file : align_seeds.c
function : from seeds to repeats
created : jul 7 2003
modif : July 2003, Oct 2003, Nov 2003 --change the reset_Rep(),Feb 2004
July 2004 : turn scores into doubles
Nov 2004 : correct a bug with opt_overlap --now removed too small and overlapping seeds--
Sep 05
March 06: debug and make optimization with reset_seeds()
March 06: change reset_repet() to let the user choose how stringent should be the overlap
May 07: update reset_repet() to to remove a small bug
Dec 09: correct a bug -- thanks to M Krawczykc
author : amikezor
*****/
#include "KMRK_Seeds.h"
#include "repseek_types.h"
#include "align.h"
#include "sequence.h"
#include "memory.h"
#include "memory_align.h"
#include "sort.h"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
/*
A function to remove all seeds that fall in the 'f'orward alignemnt of the current seed
*/
static void reset_seeds(AllSeeds_type *PtrAllSeeds, char orientation, int32_t current_seed, Rep aligned_seed, RESULTS * pResults)
{
int32_t max_seed=0, /* max of inv or dir */
s=current_seed+1; /* the seed counter */
char *ptraceback=NULL;
int32_t Current_pos1, /* the pos in the current forward alignemnt */
Current_pos2;
int32_t mem_pos1=0, mem_pos2=0; /* values used for optimization (avoid always going back to start) */
char *mem_trace=NULL;
int32_t len_traceback = strlen(pResults->traceback_f); /* the end of the traces are the begining of the alignment */
/*
Treat separatly direct and inverted
*/
if(orientation == 'd'){
max_seed = PtrAllSeeds->nDirSeeds;
mem_pos1 = 0; /* before the first seed checked */
while( s<max_seed && PtrAllSeeds->dirSeeds[s].pos1 < aligned_seed.pos1+aligned_seed.len1 ){
if( PtrAllSeeds->dirSeeds[s].pos1 != -1 &&
PtrAllSeeds->dirSeeds[s].pos2 > aligned_seed.seed_pos2 &&
PtrAllSeeds->dirSeeds[s].pos2 <= aligned_seed.pos2+aligned_seed.len2 ){ /* then it is a potential candidate */
if( mem_pos1 == 0 ){ /* first round */
Current_pos1 = aligned_seed.seed_pos1 + aligned_seed.seed_len ; /* the first bit after the seed */
Current_pos2 = aligned_seed.seed_pos2 + aligned_seed.seed_len ;
ptraceback = pResults->traceback_f + len_traceback - 2; /* at the end --last cell is the seed end-- */
}else{
Current_pos1=mem_pos1; /* subsequent rounds */
Current_pos2=mem_pos2;
ptraceback = mem_trace;
mem_pos1 = 0;
}
while( ptraceback >= pResults->traceback_f &&
Current_pos1 <= PtrAllSeeds->dirSeeds[s].pos1 &&
Current_pos2 <= PtrAllSeeds->dirSeeds[s].pos2 ){
switch( *ptraceback ){
case 'd':
Current_pos1++;
Current_pos2++;
break;
case 'a':
Current_pos1++;
break;
case 'l':
Current_pos2++;
break;
default:
fprintf(stderr,"reset_seeds: expect traceback to be 'd','a' or 'l', NOT %c !! bye\n", *ptraceback),exit(6);
}
if( Current_pos1 == PtrAllSeeds->dirSeeds[s].pos1 && mem_pos1 == 0 ){
mem_pos1 = Current_pos1;
mem_pos2 = Current_pos2;
mem_trace = ptraceback-1; /* ptraceback-1, is the next one */
}
if(Current_pos1 == PtrAllSeeds->dirSeeds[s].pos1 && Current_pos2 == PtrAllSeeds->dirSeeds[s].pos2 ){
/*
fprintf(stderr, "Remove %d %d-%d-%d USING %d %d-%d-%d \n",
s+1, PtrAllSeeds->dirSeeds[s].pos1+1, PtrAllSeeds->dirSeeds[s].pos2+1, PtrAllSeeds->dirSeeds[s].length,
current_seed+1, PtrAllSeeds->dirSeeds[current_seed].pos1+1, PtrAllSeeds->dirSeeds[current_seed].pos2+1, PtrAllSeeds->dirSeeds[current_seed].length);
*/
PtrAllSeeds->dirSeeds[s].pos1 = -1;
}
ptraceback--;
}
}
s++;
}
}
else if (orientation == 'i'){
max_seed = PtrAllSeeds->nInvSeeds;
mem_pos1 = 0;
while( s<max_seed && PtrAllSeeds->invSeeds[s].pos1 < aligned_seed.pos1+aligned_seed.len1 ){
if(PtrAllSeeds->invSeeds[s].pos1 != -1 &&
PtrAllSeeds->invSeeds[s].pos2 >= aligned_seed.pos2 &&
PtrAllSeeds->invSeeds[s].pos2 <= aligned_seed.seed_pos2 ){
if( ! mem_pos1 ){ /* first round */
Current_pos1 = aligned_seed.seed_pos1+aligned_seed.seed_len; /* the first bit after the seed */
Current_pos2 = aligned_seed.seed_pos2-1; /* the first bit before the seed 2nd copy */
ptraceback=pResults->traceback_f+len_traceback-2; /* the last cell is the seed end */
}else{
Current_pos1=mem_pos1; /* subsequent rounds */
Current_pos2=mem_pos2;
ptraceback = mem_trace;
mem_pos1 = 0;
}
while( ptraceback >= pResults->traceback_f &&
Current_pos1 <= PtrAllSeeds->invSeeds[s].pos1 &&
Current_pos2 >= PtrAllSeeds->invSeeds[s].pos2 ){
switch( *ptraceback ){
case 'd':
Current_pos1++;
Current_pos2--;
break;
case 'a':
Current_pos1++;
break;
case 'l':
Current_pos2--;
break;
default:
fprintf(stderr,"reset_seeds: expect traceback to be 'd','a' or 'l', NOT %c !! bye\n",*ptraceback),exit(6);
}
if( Current_pos1 == PtrAllSeeds->invSeeds[s].pos1 && mem_pos1 == 0 ){
mem_pos1 = Current_pos1;
mem_pos2 = Current_pos2;
mem_trace = ptraceback-1;
}
if( Current_pos1 == PtrAllSeeds->invSeeds[s].pos1 && Current_pos2 == PtrAllSeeds->invSeeds[s].pos2+PtrAllSeeds->invSeeds[s].length-1 )
PtrAllSeeds->invSeeds[s].pos1 = -1;
ptraceback--;
} /* endof while( ptraceback >= pResults-> ... */
}
s++;
}
}
else
fprintf(stderr,"reset_seeds: orientation has to be 'd' or 'i', bye\n");
}
/*
Remove Repeats which are have the same pos1, pos2, len1 and len2
*/
static int32_t reset_Rep(Rep *RepTable, int32_t nRep, float merge_repeats ){
int32_t u, i,
removed=0;
int32_t overlap_copy1,
overlap_copy2;
if(merge_repeats>1 || merge_repeats<0)
fprintf(stderr, "reset_Rep: 0.0 <= fraction_overlap <= 1.0\n"),exit(7);
for(u=0; u<nRep-1; u++){
i=u+1;
if(RepTable[u].pos1 == -1)
continue;
while( i<nRep &&
RepTable[u].pos1 != -1 &&
RepTable[i].pos1 <= RepTable[u].pos1+RepTable[u].len1
){
if(RepTable[i].pos1 == -1){
i++;
continue;
}
overlap_copy1 = MIN2( (RepTable[i].pos1+RepTable[i].len1), (RepTable[u].pos1+RepTable[u].len1) ) - MAX2(RepTable[i].pos1, RepTable[u].pos1 );
overlap_copy2 = MIN2( (RepTable[i].pos2+RepTable[i].len2), (RepTable[u].pos2+RepTable[u].len2) ) - MAX2(RepTable[i].pos2, RepTable[u].pos2 );
if( overlap_copy1 >= (int32_t) merge_repeats*RepTable[i].len1 && overlap_copy1 >= (int32_t) merge_repeats*RepTable[u].len1 &&
overlap_copy2 >= (int32_t) merge_repeats*RepTable[i].len2 && overlap_copy2 >= (int32_t) merge_repeats*RepTable[u].len2
){
if(RepTable[i].score > RepTable[u].score){ /* keep the best score */
/*
fprintf(stderr,
"REMOVE_REP: %d %d %d %d %d-%d-%d by %d %d %d %d %d-%d-%d\n",
RepTable[u].pos1+1, RepTable[u].pos2+1, RepTable[u].len1, RepTable[u].len2, RepTable[u].seed_pos1+1, RepTable[u].seed_pos2+1, RepTable[u].seed_len,
RepTable[i].pos1+1, RepTable[i].pos2+1, RepTable[i].len1, RepTable[i].len2, RepTable[i].seed_pos1+1, RepTable[i].seed_pos2+1, RepTable[i].seed_len
);
*/
RepTable[u].pos1=RepTable[u].pos2=RepTable[u].len1=RepTable[u].len2=-1;
}
else{
/*
fprintf(stderr,
"REMOVE_REP: %d %d %d %d %d-%d-%d by %d %d %d %d %d-%d-%d\n",
RepTable[i].pos1+1, RepTable[i].pos2+1, RepTable[i].len1, RepTable[i].len2, RepTable[i].seed_pos1+1, RepTable[i].seed_pos2+1, RepTable[i].seed_len,
RepTable[u].pos1+1, RepTable[u].pos2+1, RepTable[u].len1, RepTable[u].len2, RepTable[u].seed_pos1+1, RepTable[u].seed_pos2+1, RepTable[u].seed_len
);
*/
RepTable[i].pos1=RepTable[i].pos2=RepTable[i].len1=RepTable[i].len2=-1;
}
removed++;
}
i++;
}
}
return removed;
}
/***
get Repeats from seeds.
This function is the only one exported and is called by main()
***/
Repeats align_seeds(AllSeeds_type *AllSeeds, char *sequence, float Xg, float gap_open, float gap_ext, char matrix_type,\
int16_t Lmin, int8_t opt_dir, int8_t opt_inv, int8_t opt_overlap, float merge_repeats ){
int32_t sequence_size; /* guess... */
Repeats AllRepeats; /* A structure that contain all Repeats */
int32_t s, r=0; /* to count the 's'eeds and 'r'epeats */
SCORING Scoring; /* Scoring - get scoring matrix and gap-open, gap-ext */
RESULTS Results; /* Results all the results of the alignemnt */
char *invsequence=NULL;
if(!opt_dir && !opt_inv)
fprintf(stderr, "align_seeds: At least one of the direct and inverted is needed, bye\n"),exit(1);
AllRepeats = mem_Repeats(AllSeeds->nDirSeeds, AllSeeds->nInvSeeds); /* Init and Mem */
mem_Blast2Align(&Results); /* get mem for alignment */
if( matrix_type == 'i')
identity_matrix(sequence,gap_open, gap_ext, &Scoring); /* Load identity matrix into memory */
else if(matrix_type == 'l' )
log_matrix(sequence, gap_open, gap_ext, &Scoring);
sequence_size = (int32_t)strlen(sequence);
if(opt_dir){
for(s=0 , r=0; s < AllSeeds->nDirSeeds; s++){
if(s%10 == 0)
fprintf(stderr,"direct: %d seeds -> %d repeats\r",s,r);
if(!opt_overlap &&
AllSeeds->dirSeeds[s].pos1 + AllSeeds->dirSeeds[s].length > AllSeeds->dirSeeds[s].pos2 &&
AllSeeds->dirSeeds[s].pos2- AllSeeds->dirSeeds[s].pos1 < Lmin
)
AllSeeds->dirSeeds[s].pos1 = -1; /* overlapping and too small, remove it */
if(AllSeeds->dirSeeds[s].pos1 == -1 )continue; /* thus it is a removed seed */
AllRepeats.DirRep[r] = alignd(AllSeeds->dirSeeds[s].pos1,
AllSeeds->dirSeeds[s].pos2,
AllSeeds->dirSeeds[s].length,
sequence, sequence_size ,Xg,
&Scoring, &Results, opt_overlap);
AllRepeats.DirRep[r].seed_meanR = AllSeeds->dirSeeds[s].rmean;
reset_seeds(AllSeeds, 'd', s, AllRepeats.DirRep[r], &Results);
r++;
}
AllRepeats.nDirRep = r;
if(AllRepeats.nDirRep != 0){
AllRepeats.DirRep = (Rep *)MyRealloc(AllRepeats.DirRep, AllRepeats.nDirRep*sizeof(Rep),
AllSeeds->nDirSeeds*sizeof(Rep) ,"align_seeds : realloc DirRep error");
qsort_1then2_repeats(AllRepeats.DirRep, AllRepeats.nDirRep);
AllRepeats.nDirBadRep = reset_Rep(AllRepeats.DirRep, AllRepeats.nDirRep, merge_repeats);
}
else{
MyFree(AllRepeats.DirRep, AllSeeds->nDirSeeds*sizeof(Rep) );
AllRepeats.DirRep=NULL;
}
}
if(opt_inv){
/*
Get Inv sequence
*/
invsequence = (char *)MyMalloc((sequence_size+1)*sizeof(char),"align_seeds: invsequence malloc error, bye");
invseq(sequence,invsequence);
for(s=0, r=0; s < AllSeeds->nInvSeeds; s++){
if( s%10 == 0)
fprintf(stderr,"inverted: %d seeds -> %d repeats\r",s,r);
if(AllSeeds->invSeeds[s].pos1 == -1)continue;
AllRepeats.InvRep[r] = aligni(AllSeeds->invSeeds[s].pos1,
AllSeeds->invSeeds[s].pos2,
AllSeeds->invSeeds[s].length,
sequence, invsequence,sequence_size,
Xg, &Scoring, &Results);
AllRepeats.InvRep[r].seed_meanR = AllSeeds->invSeeds[s].rmean;
if(AllRepeats.InvRep[r].len1 >= Lmin){ /* if not do not keep small palindromes */
reset_seeds(AllSeeds, 'i', s, AllRepeats.InvRep[r], &Results);
r++;
}
}
AllRepeats.nInvRep = r;
if(AllRepeats.nInvRep != 0){
AllRepeats.InvRep = (Rep *)MyRealloc(AllRepeats.InvRep, AllRepeats.nInvRep*sizeof(Rep),
AllSeeds->nInvSeeds*sizeof(Rep),"align_seeds : realloc InvRep error");
qsort_1then2_repeats(AllRepeats.InvRep, AllRepeats.nInvRep);
AllRepeats.nInvBadRep = reset_Rep(AllRepeats.InvRep, AllRepeats.nInvRep, merge_repeats);
}
else{
MyFree( AllRepeats.InvRep, AllSeeds->nInvSeeds*sizeof(Rep) );
AllRepeats.InvRep=NULL;
}
MyFree(invsequence, sequence_size*sizeof(char) );
}
free_Blast2Align(&Results); /* free the RESULTS struct */
return AllRepeats;
}
/***
get Repeats from seeds.
This function is the only one exported and is called by main()
***/
Repeats align_seeds_2seq(AllSeeds_type *AllSeeds, char *seq1, char *seq2, float Xg, float gap_open, float gap_ext, char matrix_type,\
int16_t Lmin, int8_t opt_dir, int8_t opt_inv , float merge_repeats){
int32_t size1, size2; /* sequences size */
Repeats AllRepeats; /* A structure that contain all Repeats */
int32_t s, r=0; /* to count the 's'eeds and 'r'epeats */
SCORING Scoring; /* Scoring - get scoring matrix and gap-open, gap-ext */
RESULTS Results; /* Results all the results of the alignemnt */
char *invseq2=NULL; /* a pointer to the inevrted sequence 2 */
if(!opt_dir && !opt_inv)
fprintf(stderr, "align_seeds: At least one of the direct and inverted, bye\n"),exit(1);
size1=(int32_t)strlen(seq1);
size2=(int32_t)strlen(seq2);
AllRepeats = mem_Repeats(AllSeeds->nDirSeeds, AllSeeds->nInvSeeds); /* Init and Mem */
mem_Blast2Align(&Results); /* get mem for alignment */
if( matrix_type == 'i')
identity_matrix(seq1,gap_open, gap_ext, &Scoring); /* Load identity matrix into memory */
else if(matrix_type == 'l' )
log_matrix(seq1, gap_open, gap_ext, &Scoring);
if(opt_dir){
for(s=0 , r=0; s < AllSeeds->nDirSeeds; s++){
if(s%10 == 0)
fprintf(stderr,"direct: %d seeds -> %d repeats\r",s,r);
if(AllSeeds->dirSeeds[s].pos1 == -1 )continue; /* thus it is a removed seed */
AllRepeats.DirRep[r] = alignd_2seq(AllSeeds->dirSeeds[s].pos1,
AllSeeds->dirSeeds[s].pos2,
AllSeeds->dirSeeds[s].length,
seq1, seq2,
size1, size2,
Xg, &Scoring, &Results);
AllRepeats.DirRep[r].seed_meanR = AllSeeds->dirSeeds[s].rmean;
reset_seeds(AllSeeds, 'd', s, AllRepeats.DirRep[r], &Results);
r++;
}
AllRepeats.nDirRep = r;
if(AllRepeats.nDirRep != 0){
AllRepeats.DirRep = (Rep *)MyRealloc(AllRepeats.DirRep, AllRepeats.nDirRep*sizeof(Rep),
AllSeeds->nDirSeeds*sizeof(Rep) ,"align_seeds : realloc DirRep error");
qsort_1then2_repeats(AllRepeats.DirRep, AllRepeats.nDirRep);
AllRepeats.nDirBadRep = reset_Rep(AllRepeats.DirRep, AllRepeats.nDirRep, merge_repeats);
}
else{
MyFree(AllRepeats.DirRep, AllSeeds->nDirSeeds*sizeof(Rep) );
AllRepeats.DirRep=NULL;
}
}
if(opt_inv){
/*
Get Inv sequence
*/
invseq2 = (char *)MyMalloc(size2*sizeof(char),"align_seeds: malloc error, bye");
invseq(seq2,invseq2);
for(s=0, r=0; s < AllSeeds->nInvSeeds; s++){
if( s%10 == 0)
fprintf(stderr,"inverted: %d seeds -> %d repeats\r",s,r);
if(AllSeeds->invSeeds[s].pos1 == -1)continue;
AllRepeats.InvRep[r] = aligni_2seq(AllSeeds->invSeeds[s].pos1,
AllSeeds->invSeeds[s].pos2,
AllSeeds->invSeeds[s].length,
seq1, invseq2,
size1, size2,
Xg, &Scoring, &Results);
AllRepeats.InvRep[r].seed_meanR = AllSeeds->invSeeds[s].rmean;
reset_seeds(AllSeeds, 'i', s, AllRepeats.InvRep[r], &Results);
r++;
}
AllRepeats.nInvRep = r;
if(AllRepeats.nInvRep != 0){
AllRepeats.InvRep = (Rep *)MyRealloc(AllRepeats.InvRep, AllRepeats.nInvRep*sizeof(Rep),
AllSeeds->nInvSeeds*sizeof(Rep),"align_seeds : realloc InvRep error");
qsort_1then2_repeats(AllRepeats.InvRep, AllRepeats.nInvRep);
AllRepeats.nInvBadRep = reset_Rep(AllRepeats.InvRep, AllRepeats.nInvRep, merge_repeats);
}
else{
MyFree( AllRepeats.InvRep, AllSeeds->nInvSeeds*sizeof(Rep) );
AllRepeats.InvRep=NULL;
}
MyFree(invseq2, size2*sizeof(char) );
}
free_Blast2Align(&Results); /* free the RESULTS struct */
return AllRepeats;
}

View File

@ -0,0 +1,169 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/******
file : families.c
function : set the R-level of the repeats (Mean, Main, and fraction of Main)
created : 15 Oct 2002
modif : July 2003, Oct 2003, Spe 05
note : from Air.c - Mar 28 2001 / Jul 10 2001 / Jun 2 2000
author : amikezor
*****/
#include "repseek_types.h"
#include "memory.h"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
/***
construction of the table "air" (of repeats)
***/
static void table_R(int32_t *table_r, Rep *AllReps, int32_t nRep, int32_t size_chr)
{
int32_t u; /* a counter */
for(u=0; u<nRep; u++){
if(AllReps[u].pos1 == -1){continue;} /* skip "bad repeat" */
table_r[ AllReps[u].pos1 ]+=1;
if(AllReps[u].pos1 + AllReps[u].len1 < size_chr)
table_r[ AllReps[u].pos1 + AllReps[u].len1 ]-=1;
table_r[ AllReps[u].pos2 ]+=1;
if(AllReps[u].pos2 + AllReps[u].len2 < size_chr)
table_r[ AllReps[u].pos2 + AllReps[u].len2 ]-=1;
}
}
/***
Built up and returned a memory adress where is table R
***/
static int32_t *mem_RepTableR(Repeats AllRepeats, int8_t opt_dir, int8_t opt_inv, int32_t size_chr)
{
int32_t *table_r=NULL; /* the table R */
int32_t count=0; /* allow to count increase and decrease */
int32_t i=0; /* just a counter */
table_r = MyCalloc(size_chr, sizeof(int32_t), "SetRepFamily: calloc failed, bye");
if(opt_dir)
table_R(table_r, AllRepeats.DirRep, AllRepeats.nDirRep, size_chr);
if(opt_inv)
table_R(table_r, AllRepeats.InvRep, AllRepeats.nInvRep, size_chr);
for(count=1, i=0; i<size_chr ; i++)
table_r[i] = (count+=table_r[i]);
return table_r;
}
/***
Check all repeat and set their family size by
looking at the table R of each copy
***/
static void Tag_Rep(Rep *AllReps, int32_t nReps, int32_t *table_r, int32_t *HistoR, int32_t Max_table_r)
{
Rep *pRep;
int32_t i;
int32_t mainR=0;
int32_t max_mainR=0;
float meanR=0;
float fraction_mainR=0.0;
for(pRep = AllReps; pRep < AllReps+nReps; pRep++){ /* determine the mean repeat level of each repeat */
memset(HistoR, 0, (Max_table_r+1)*sizeof(int32_t) );
for(i=0; i < pRep->len1 ; i++)
HistoR[ table_r[pRep->pos1+i] ]++;
for(i=0; i < pRep->len2 ;i++ )
HistoR[ table_r[pRep->pos2+i] ]++;
for(i=2, meanR=0, max_mainR=0; i <= Max_table_r ; i++){
if( HistoR[i] > max_mainR ){
max_mainR=HistoR[i];
mainR=i;
}
meanR += (float) i * HistoR[i];
}
fraction_mainR = (float)max_mainR / (float)(pRep->len1+pRep->len2);
pRep->meanR= meanR / (float)(pRep->len1+pRep->len2);
pRep->mainR=mainR;
pRep->fraction_mainR=fraction_mainR;
}
}
/***
Set the size of each family (duplicate, triplicate, etc...)
Moreover return the table_r of the chromosome.
In that table, each chr pos correspond to the level of repetition
(1(unique), 2(duplicated), 3(triplicated), ...)
***/
int32_t *SetRepFamily(Repeats AllRepeats, int8_t opt_dir, int8_t opt_inv, int32_t size_chr)
{
int32_t *table_r; /* the tableR itself */
int32_t max_TableR=0; /* max value in TableR */
int32_t *HistoTableR; /* histo of R-level for a repeat */
int32_t i; /* a dummy token */
if( !(AllRepeats.nDirRep-AllRepeats.nDirBadRep) &&
!(AllRepeats.nInvRep-AllRepeats.nInvBadRep) )
return NULL;
table_r = mem_RepTableR( AllRepeats, opt_dir,opt_inv,size_chr);
for(i=0; i<size_chr; i++)
max_TableR=(table_r[i]>max_TableR)?table_r[i]:max_TableR;
HistoTableR = (int32_t *)MyMalloc( (max_TableR+1) * sizeof(int32_t), "SetRepFamily: malloc error" );
if(opt_dir)
Tag_Rep(AllRepeats.DirRep, AllRepeats.nDirRep, table_r, HistoTableR, max_TableR);
if(opt_inv)
Tag_Rep(AllRepeats.InvRep, AllRepeats.nInvRep, table_r, HistoTableR, max_TableR);
MyFree(HistoTableR, (max_TableR+1)*sizeof(int32_t) );
return table_r;
}

View File

@ -0,0 +1,37 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
#ifndef _families_h_
#define _families_h_
void SetNumberRep_2seqs(Repeats AllRepeats,
int8_t opt_dir,
int8_t opt_inv,
int32_t size1,
int32_t size2,
int32_t **TableR1,
int32_t **TableR2);
int32_t *SetRepFamily(Repeats AllRepeats,
int8_t opt_dir,
int8_t opt_inv,
int32_t size_chr);
#endif /* _families_h_*/

View File

@ -0,0 +1,196 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/**
* @file families_2seq.c
* @author amikezor
* @date 14 Nov 2002
* @modif Jul 2003 - change to take into account Mean, Main and fraction of R-level
* @modif April 2004 - Recode totally with 2 TableR and only 1 meanR
* @modif Sept 2005
*
* @brief set the number of copy in the 2 input seq
*
*
*/
#include "repseek_types.h"
#include "memory.h"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
/***
construction of the table "air" (of repeats)
***/
static void Fill_Tables_2seq(int32_t size1, int32_t size2, int32_t *TableR1, int32_t * TableR2, Rep *AllReps, int32_t nRep)
{
int32_t u; /* a counter */
for(u=0; u<nRep; u++){
if(AllReps[u].pos1 == -1){continue;} /* skip "bad repeat" */
TableR1[ AllReps[u].pos1 ]+=1;
if(AllReps[u].pos1 + AllReps[u].len1 < size1)
TableR1[ AllReps[u].pos1 + AllReps[u].len1 ]-=1;
TableR2[ AllReps[u].pos2 ]+=1;
if(AllReps[u].pos2 + AllReps[u].len2 < size2)
TableR2[ AllReps[u].pos2 + AllReps[u].len2 ]-=1;
}
}
/***
Check all repeat and set their copy number by
looking at the table R of each copy
***/
static void Tag_Rep_2seqs(Rep *AllReps, int32_t nReps, int32_t *TableR1, int32_t *TableR2, int32_t *HistoR, int32_t Max_table_r)
{
Rep *pRep;
int32_t i;
int32_t mainR=0;
int32_t max_mainR=0;
float meanR=0, meanR2=0;
float fraction_mainR=0.0;
for(pRep = AllReps; pRep < AllReps+nReps; pRep++){ /* determine the mean repeat level of each repeat */
memset(HistoR, 0, (Max_table_r+1)*sizeof(int32_t) );
for(i=0; i < pRep->len1 ; i++){
HistoR[ TableR1[pRep->pos1+i] ]++;
meanR += TableR1[pRep->pos1+i];
}
for(i=0; i < pRep->len2 ;i++ ){
HistoR[ TableR2[pRep->pos2+i] ]++;
meanR2+=TableR2[pRep->pos2+i];
}
meanR /= (float)(pRep->len1);
meanR2 /= (float)(pRep->len2);
for(i=1, max_mainR = 0; i <= Max_table_r ; i++){
if( HistoR[i] > max_mainR ){
max_mainR=HistoR[i];
mainR=i;
}
}
fraction_mainR = (float)max_mainR / ( (float)(pRep->len1) + (pRep->len2) );
pRep->meanR= meanR+meanR2 -2;
pRep->mainR=0;
pRep->fraction_mainR=0;
}
}
/***
Set the number of copies of each sequence in the second seq (2 means that this rep has 2 copies in seq2)
Moreover return the table_r of the chromosome.
In that table, each chr pos correspond to the level of repetition
(1(unique), 2(duplicated), 3(triplicated), ...)
***/
void SetNumberRep_2seqs(Repeats AllRepeats, int8_t opt_dir, int8_t opt_inv,
int32_t size1, int32_t size2,
int32_t **pT1, int32_t **pT2)
{
int32_t max_TableR=0; /* the maximum redundancy in the TableR1 or TableR2 */
int32_t *HistoTableR; /* Histogram of the R-level of a repeat --> here R-level of copy 1 into sequence 2*/
int32_t i; /* dummy counter */
int32_t count; /* dummy counter 2 */
int32_t *TableR1;
int32_t *TableR2;
*pT1 = NULL;
*pT2 = NULL;
if( !(AllRepeats.nDirRep-AllRepeats.nDirBadRep) &&
!(AllRepeats.nInvRep-AllRepeats.nInvBadRep) )
return;
/*
Alloc Tables
*/
TableR1 = (int32_t *)MyCalloc(size1, sizeof(int32_t), "SetRepFamily: calloc failed, bye");
TableR2 = (int32_t *)MyCalloc(size2, sizeof(int32_t), "SetRepFamily: calloc failed, bye");
/*
Fill those Tables
*/
if(opt_dir) Fill_Tables_2seq(size1, size2, TableR1, TableR2, AllRepeats.DirRep, AllRepeats.nDirRep);
if(opt_inv) Fill_Tables_2seq(size1, size2, TableR1, TableR2, AllRepeats.InvRep, AllRepeats.nInvRep);
for(count=1, i=0; i<size1 ; i++) TableR1[i] = (count += TableR1[i]);
for(count=1, i=0; i<size2 ; i++) TableR2[i] = (count += TableR2[i]);
/*
Get Their max values
*/
max_TableR=0;
for(i=0; i<size1; i++) max_TableR=( TableR1[i]>max_TableR)? TableR1[i]:max_TableR; /* set the max value */
for(i=0; i<size2; i++) max_TableR=( TableR2[i]>max_TableR)? TableR2[i]:max_TableR; /* set the max value */
/*
Alloc only once the histogram
*/
HistoTableR = (int32_t *)MyMalloc( (max_TableR+1) * sizeof(int32_t), "SetRepFamily: malloc error" ); /* get some memory */
/*
Tag the repeats
*/
if(opt_dir)
Tag_Rep_2seqs(AllRepeats.DirRep, AllRepeats.nDirRep, TableR1, TableR2, HistoTableR , max_TableR);
if(opt_inv)
Tag_Rep_2seqs(AllRepeats.InvRep, AllRepeats.nInvRep, TableR1, TableR2, HistoTableR , max_TableR);
/*
Free the histogram
*/
MyFree(HistoTableR, (max_TableR+1) * sizeof(int32_t));
*pT1 = TableR1;
*pT2 = TableR2;
}

View File

@ -0,0 +1,215 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/**
* @file KMRK.filter.c
* @author Guillaume Achaz <gachaz@oeb.harvard.edu>, Eric Coissac <coissac@inrialpes.fr>
* @date Tue Mar 2 18:33:41 2004
* @modif Sept 2005
*
* @brief seed filtrage function
*
*
*/
#include <stdlib.h>
#include <stdio.h>
#include "filter.h"
#include "memory.h"
#include "KMRK.h"
static void table_R(int32_t *table_r,
Seed_type *seeds,
int32_t nSeed,
int32_t size_chr);
static void table_R( int32_t *table_r, Seed_type *seeds, int32_t nSeed, int32_t size_chr)
{
int32_t u; /* a dummy counter */
Seed_type* curSeed; /* the current seed */
for(u=0, curSeed=seeds; u<nSeed; u++, curSeed++){
if(curSeed->pos1 == -1)
continue;
table_r[ curSeed->pos1 ]+=1;
if( (curSeed->pos1 + curSeed->length) < size_chr)
table_r[ curSeed->pos1 + curSeed->length ]-=1;
table_r[ curSeed->pos2 ]+=1;
if( (curSeed->pos2 + curSeed->length) < size_chr)
table_r[ curSeed->pos2 + curSeed->length ]-=1;
}
}
int32_t *KMRK_SeedTableR(AllSeeds_type *AllSeeds,
int8_t opt_dir,
int8_t opt_inv,
int32_t size_chr)
{
int32_t *table_r=NULL; /* init the so called R-table to NULL */
int32_t count=0; /* a token to mark each seed */
int32_t i=0; /* a dummy counter */
table_r= MyCalloc( size_chr, sizeof(int32_t), "mem_tableR: calloc failed, bye" );
if(opt_dir && AllSeeds->nDirSeeds)
table_R(table_r,
AllSeeds->dirSeeds,
AllSeeds->nDirSeeds,
size_chr);
if(opt_inv && AllSeeds->nInvSeeds)
table_R(table_r,
AllSeeds->invSeeds,
AllSeeds->nInvSeeds,
size_chr);
for(count=1, i=0; i<size_chr ; i++)
table_r[i] = (count+=table_r[i]);
return table_r;
}
void BuiltFilterFamily_seeds(AllSeeds_type *AllSeeds,
int32_t *table_r,
double min,
double Max,
int8_t opt_dir,
int8_t opt_inv)
{
double MeanR; /* the R-mean of the seed - average redundancy */
double TotalR; /* the sum of all r of each site of the repeat */
Seed_type *curSeed; /* the current seed */
int32_t u, i;
int32_t N=0;
int8_t opt_filter=(!min && !Max)?0:1; /* filter is on if one of the two is set by the user */
if(opt_dir){
for(u=0, N=0, curSeed=AllSeeds->dirSeeds; u<AllSeeds->nDirSeeds; u++,curSeed++)
{
TotalR = 0.0;
for(i=0; i < curSeed->length ; i++)
TotalR += (double)table_r[ curSeed->pos1+i] + table_r[curSeed->pos2+i]; /* add both repeats together */
MeanR = TotalR / (2.0 * (double) curSeed->length);
curSeed->rmean = MeanR;
if( opt_filter &&
( (Max && MeanR<=Max) || !Max) &&
( (min && MeanR>=min) || !min) )
{
AllSeeds->dirSeeds[N].pos1 = curSeed->pos1; /* thus keep it */
AllSeeds->dirSeeds[N].pos2 = curSeed->pos2;
AllSeeds->dirSeeds[N].length = curSeed->length;
AllSeeds->dirSeeds[N].rmean = curSeed->rmean;
N++;
}
}
if(opt_filter)
{
AllSeeds->nFilteredDirSeeds = AllSeeds->nDirSeeds - N;
AllSeeds->nDirSeeds=N;
}
}
if(opt_inv){
for(u=0, N=0, curSeed=AllSeeds->invSeeds; u < AllSeeds->nInvSeeds; u++,curSeed++)
{
TotalR = 0.0;
for(i=0; i < curSeed->length ; i++)
TotalR += (double)table_r[ curSeed->pos1+i] + table_r[curSeed->pos2+i]; /* add both repeats together */
MeanR = TotalR / (2.0 * (double) curSeed->length);
curSeed->rmean = MeanR;
if( opt_filter &&
( (Max && MeanR<=Max) || !Max) &&
( (min && MeanR>=min) || !min) )
{
AllSeeds->invSeeds[N].pos1 = curSeed->pos1; /* thus keep it */
AllSeeds->invSeeds[N].pos2 = curSeed->pos2;
AllSeeds->invSeeds[N].length = curSeed->length;
AllSeeds->invSeeds[N].rmean = curSeed->rmean;
N++;
}
}
if(opt_filter)
{
AllSeeds->nFilteredInvSeeds = AllSeeds->nInvSeeds-N; /* reset the memory */
AllSeeds->nInvSeeds=N;
}
}
if(opt_filter)
KMRK_compactSeeds(AllSeeds);
return;
}
void KMRK_FamilySeeds(AllSeeds_type *AllSeeds,
double min,
double Max,
int8_t opt_dir,
int8_t opt_inv,
int32_t size_chr)
{
int32_t *table_r; /* the seed table R */
if( !AllSeeds->nDirSeeds && !AllSeeds->nInvSeeds )return;
if(!opt_dir && !opt_inv)
fprintf(stderr, "FilterSeeds: opt_inv or opt_dir must be set to 1, bye\n"),exit(1);
table_r = KMRK_SeedTableR( AllSeeds, opt_dir, opt_inv, size_chr); /* calculate the seed table R */
BuiltFilterFamily_seeds(AllSeeds, table_r, min, Max, opt_dir, opt_inv); /* filter seeds if they does not fit between min and Max */
MyFree(table_r, size_chr*sizeof(int32_t));
return ;
}

View File

@ -0,0 +1,74 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
#ifndef KMRK_filter_h
#define KMRK_filter_h
#include "KMRK.h"
#include "KMRK_Seeds.h"
#include <stdio.h>
void KMRK_FamilySeeds(AllSeeds_type *AllSeeds,
double min,
double Max,
int8_t opt_dir,
int8_t opt_inv,
int32_t size_chr);
int32_t *KMRK_SeedTableR(AllSeeds_type *AllSeeds,
int8_t opt_dir,
int8_t opt_inv,
int32_t size_chr);
void KMRK_SeedTableR2seq(AllSeeds_type *AllSeeds,
int8_t opt_dir,
int8_t opt_inv,
int32_t size1,
int32_t size2,
int32_t **r1,
int32_t **r2);
void KMRK_FamilySeeds2seq(AllSeeds_type *AllSeeds,
double min,
double Max,
int8_t opt_dir,
int8_t opt_inv,
int32_t size1,
int32_t size2);
void BuiltFilterFamily_seeds(AllSeeds_type *AllSeeds,
int32_t *table_r,
double min,
double Max,
int8_t opt_dir,
int8_t opt_inv);
void BuiltFilterFamily_seeds2seq(AllSeeds_type *AllSeeds,
int32_t *table_r1,
int32_t *table_r2,
double min,
double Max,
int8_t opt_dir,
int8_t opt_inv);
#endif /* KMRK_filter_h */

View File

@ -0,0 +1,252 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/**
* @file KMRK.filter.c
* @author Eric Coissac <coissac@inrialpes.fr>
* @date Tue Mar 2 18:33:41 2004
* @modif Sept 2005
*
* @brief seed filtrage function
*
*
*/
#include <stdlib.h>
#include <stdio.h>
#include "filter.h"
#include "memory.h"
#include "KMRK.h"
static void table_R2(int32_t *table_r1,
int32_t *table_r2,
Seed_type *seeds,
int32_t nSeed,
int32_t size1,
int32_t size2);
static void table_R2(int32_t *table_r1,
int32_t *table_r2,
Seed_type *seeds,
int32_t nSeed,
int32_t size1,
int32_t size2)
{
int32_t u;
Seed_type* curSeed;
for(u=0, curSeed=seeds; u<nSeed; u++, curSeed++)
{
if(curSeed->pos1 == -1)
continue;
table_r1[ curSeed->pos1 ]+=1;
if( (curSeed->pos1 + curSeed->length) < size1)
table_r1[ curSeed->pos1 + curSeed->length ]-=1;
table_r2[ curSeed->pos2 ]+=1;
if( (curSeed->pos2 + curSeed->length) < size2)
table_r2[ curSeed->pos2 + curSeed->length ]-=1;
}
}
void KMRK_SeedTableR2seq(AllSeeds_type *AllSeeds,
int8_t opt_dir,
int8_t opt_inv,
int32_t size1,
int32_t size2,
int32_t **r1,
int32_t **r2)
{
int32_t *table_r1=NULL;
int32_t *table_r2=NULL;
int32_t count=0;
int32_t i=0;
table_r1= MyCalloc( size1,sizeof(int32_t), "mem_tableR: calloc failed, bye" );
table_r2= MyCalloc( size2,sizeof(int32_t), "mem_tableR: calloc failed, bye" );
if(opt_dir && AllSeeds->nDirSeeds)
table_R2(table_r1, table_r2, AllSeeds->dirSeeds, AllSeeds->nDirSeeds, size1, size2);
if(opt_inv && AllSeeds->nInvSeeds)
table_R2(table_r1,table_r2, AllSeeds->invSeeds, AllSeeds->nInvSeeds, size1, size2);
for(count=1, i=0; i<size1 ; i++)
table_r1[i] = (count+=table_r1[i]);
for(count=1, i=0; i<size2 ; i++)
table_r2[i] = (count+=table_r2[i]);
*r1 = table_r1;
*r2 = table_r2;
}
void BuiltFilterFamily_seeds2seq(AllSeeds_type *AllSeeds,
int32_t *table_r1,
int32_t *table_r2,
double min,
double Max,
int8_t opt_dir,
int8_t opt_inv)
{
float MeanR1, MeanR2;
float TotalR1, TotalR2;
Seed_type *curSeed;
int32_t u, i;
int32_t N=0;
int8_t opt_filter=(!min && !Max)?0:1;
if(opt_dir){
for(u=0, N=0, curSeed=AllSeeds->dirSeeds; u<AllSeeds->nDirSeeds; u++,curSeed++)
{
TotalR1 = 0.0;
TotalR2 = 0.0;
for(i=0; i < curSeed->length ; i++)
{
TotalR1 += table_r1[curSeed->pos1+i];
TotalR2 += table_r2[curSeed->pos2+i]; /* add both repeats together */
}
MeanR1 = (float)TotalR1 / ( (float) curSeed->length);
MeanR2 = (float)TotalR2 / ( (float) curSeed->length);
curSeed->rmean = MeanR1 + MeanR2 - 2;
if( opt_filter &&
( ( (Max && MeanR1<=Max) || !Max) &&
( (min && MeanR1>=min) || !min) ) &&
( ( (Max && MeanR2<=Max) || !Max) &&
( (min && MeanR2>=min) || !min) ) )
{ /* && has a higher priority than || */
AllSeeds->dirSeeds[N].pos1 = curSeed->pos1; /* thus keep it */
AllSeeds->dirSeeds[N].pos2 = curSeed->pos2;
AllSeeds->dirSeeds[N].length = curSeed->length;
AllSeeds->dirSeeds[N].rmean = curSeed->rmean;
N++;
}
}
if(opt_filter)
{
AllSeeds->nFilteredDirSeeds = AllSeeds->nDirSeeds - N;
AllSeeds->nDirSeeds=N;
}
}
if(opt_inv){
for(u=0, N=0, curSeed=AllSeeds->invSeeds; u < AllSeeds->nInvSeeds; u++,curSeed++)
{
TotalR1 = 0.0;
TotalR2 = 0.0;
for(i=0; i < curSeed->length ; i++)
{
TotalR1 += table_r1[curSeed->pos1+i];
TotalR2 += table_r2[curSeed->pos2+i]; /* add both repeats together */
}
MeanR1 = (float)TotalR1 / ( (float) curSeed->length);
MeanR2 = (float)TotalR2 / ( (float) curSeed->length);
curSeed->rmean = MeanR1 + MeanR2 -2;
if( opt_filter &&
( ( (Max && MeanR1<=Max) || !Max) &&
( (min && MeanR1>=min) || !min) ) &&
( ( (Max && MeanR2<=Max) || !Max) &&
( (min && MeanR2>=min) || !min) ) )
{
AllSeeds->invSeeds[N].pos1 = curSeed->pos1; /* thus keep it */
AllSeeds->invSeeds[N].pos2 = curSeed->pos2;
AllSeeds->invSeeds[N].length = curSeed->length;
AllSeeds->invSeeds[N].rmean = curSeed->rmean;
N++;
}
}
if(opt_filter)
{
AllSeeds->nFilteredInvSeeds = AllSeeds->nInvSeeds-N; /* reset the memory */
AllSeeds->nInvSeeds=N;
}
}
if(opt_filter)
KMRK_compactSeeds(AllSeeds);
return;
}
void KMRK_FamilySeeds2seq(AllSeeds_type *AllSeeds,
double min,
double Max,
int8_t opt_dir,
int8_t opt_inv,
int32_t size1,
int32_t size2)
{
int32_t *table_r1; /* the seed table R */
int32_t *table_r2; /* the seed table R */
if( !AllSeeds->nDirSeeds && !AllSeeds->nInvSeeds )
return;
if(!opt_dir && !opt_inv)
fprintf(stderr, "FilterSeeds: opt_inv or opt_dir must be set to 1, bye\n"),exit(1);
KMRK_SeedTableR2seq( AllSeeds,
opt_dir, opt_inv,
size1,size2,
&table_r1,&table_r2); /* calculate the seed table R */
BuiltFilterFamily_seeds2seq(AllSeeds,
table_r1, table_r2,
min, Max,
opt_dir, opt_inv); /* filter seeds if they does not fit between min and Max */
MyFree(table_r1, size1*sizeof(int32_t));
MyFree(table_r2, size2*sizeof(int32_t));
return;
}

View File

@ -0,0 +1,195 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/******
file : help.c
function : Help for all repseek series
created : 02 Oct 2002
modif : Oct 2003
Dec 2004 Adds for option -D : managing of mask for seed detection <EC>
Dec 2005 switch to a one mode version
author : amikezor
*****/
#include "repseek_types.h"
#include <stdio.h>
#include <stdlib.h>
static void help_contact(){
fprintf(stderr," Contact\n"
" if you need help, want to report a bug or suggest an option,\n"
" please contact G. Achaz by email: achaz(at)abi.snv.jussieu.fr\n"
"\n"
);
}
static void help_information(){
fprintf(stderr," Information\n"
"\t-v : Print 'v'ersion and exit\n"
"\t-h : Print a short 'h'elp and exit\n"
"\t-H : Print a larger 'H'elp (with input and output formats) and exit\n"
"\n");
}
static void help_corevalues(){
fprintf(stderr, " Core values (select either at least one of the four -l/-p -L/-P option)\n"
" seed minimum length: lmin\n"
"\t-l lmin : set a minimum length for the seeds detection (no pvalue).\n"
"\t-p prob : set a p-value that gives a minimum seed length (Karlin & Ost equation)\n"
" repeat minimum score: smin\n"
"\t-L smin : set a minimum score for extended repeat (def is 0: no filter).\n"
"\t-P prob : set a p-value that gives a minimum repeat score (Waterman & Vingron regression)\n"
"\n"
);
}
static void help_options( ){
fprintf(stderr," Optionnal values\n"
"\t-r file : 'r'edirect output into file (otherwise stdout is used).\n"
"\t-T : print out R-'T'able\n"
"\t the R-Table (or R-Table2) shows all non-unique position and its degree of redundancy\n"
"\t-S : output 'S'eeds and exit (do not perform extention).\n"
"\t-s seed_file : use seeds given in seed_file instead of detecting them\n"
"\t File format is 'd|i begin end len [optionnal other fields]'.\n"
"\t-D mask_file : mask sequence regions during seed detection only (cannot be used with -s seed_file).\n"
"\t File format is 'begin end [seq#]' (seq# is 1 or 2; nothing is treated as 1).\n"
"\t-R 0.## : merge 'R'epeats when they share 0.## of their both copies (default = 0.90).\n"
"\t When set to 1.0, merge repeats having exactly the same positions\n"
"\t-d : detect only 'd'irect repeats.\n"
"\t-i : detect only 'i'nverted repeats.\n"
"\t (-d and -i are mutually exclusive).\n"
"\t-B : if some seeds overlaps (i.e. in low-complexity sequence), just keep the 'B'iggest one.\n"
"\t-m #.# : keep only seeds occurring at least the specified minimum number of times (min is a float).\n"
"\t-M #.# : keep only seeds occurring less than the specified maximum number of times (Max is a float).\n"
"\t-O 0|1 : for 1 sequence, direct repeat can have their 2 copies 'O'verlapping (0: no, 1: yes -default-)\n"
"\t-c : set the chromosome as 'c'ircular -default is linear- (unused for 2 sequences)\n"
"\t-o #.# : set gap_open penalty -default 4.0- (cannot be change when -P is used)\n"
"\t-e #.# : set gap_ext penalty -default 1.0- (cannot be change when -P is used)\n"
"\t-X #.# : set 'X'g, the 'exploration value' -default 30.0-\n"
"\n"
);
}
static void help_input( void ){
fprintf(stderr," Input\n"
" Input file(s) should be in a fasta format.\n"
" All non-ACGT characters are considered as N, except for X in which no repeats are detected (mask)\n"
"\n"
);
}
static void help_output( void ){
fprintf(stderr," Output\n"
"\tRepeats are displayed in 12 COLUMNS\n"
"\t 01 - type of the repeat (Tandem|Close|Distant|Overlap|Palindrome|Interseq).(dir|inv)\n"
"\t 'Tandem' : repeat is a perfect tandem (spacer=0)\n"
"\t 'Close' : spacer is smaller than 1 kb\n"
"\t 'Distant' : spacer is larger than 1 kb\n"
"\t 'Overlap' : repeat is composed of two overlapping copies (spacer<0)\n"
"\t 'Palindrome' : repeat is a perfect palindrome (spacer=0)\n"
"\t 'Interseq' : repeat is shared by two sequences\n"
"\t 02 - position of the first copy\n"
"\t 03 - position of the second copy\n"
"\t 04 - length of the first copy\n"
"\t 05 - length of the second copy\n"
"\t 06 - spacer of the repeats (corrected when the sequence is circular)\n"
"\t 07 - characteristics of the seed that gave that repeat (pos1-pos2-len-meanR)\n"
"\t 08 - %%identity between the two copies (matches / alignment_length)\n"
"\t 09 - score of the alignement\n"
"\t 10 - mean-R of the repeat\n"
"\t 11 - mode-R of the repeat\n"
"\t 12 - fraction of its mode-R\n"
"\n"
);
fprintf(stderr,"\tSeeds are displayed in 5 COLUMNS\n"
"\t 01 - Type of the repeat (Seed).(dir|inv)\n"
"\t 02 - position of the first copie\n"
"\t 03 - position of the second copie\n"
"\t 04 - length of the seed\n"
"\t 05 - mean-R\n\n"
);
}
void printversion_repseek(){
fprintf(stderr,"repseek version %s (%s)\n", REPSEEK_VERSION, REPSEEK_DATE);
}
void printusage_repseek(char **argv){
fprintf(stderr,"General usage is '%s [-v|-h|-H] [ opts ] { core_value } file.fst [file2.fst]'\n\n",argv[0]);
help_corevalues();
help_information();
}
void printhelp_repseek(char **argv){
fprintf(stderr,"General usage is '%s [-v|-h|-H] [ opts ] { core_value } file.fst [file2.fst]'\n\n",argv[0]);
help_corevalues();
help_options( );
help_information();
help_input();
help_contact();
}
void printmorehelp_repseek(char **argv){
fprintf(stderr,"/*\n"
"\tRepseek looks for all repeats couples within (one fasta file) or between sequences (two files)\n"
"\tThe method is based on a two steps method (seed detection followed by their extension into repeats)\n"
"\tThe statistical significance of repeats can be (i) not evaluated (-l opt),\n"
"\t(ii) evaluated for seeds (-p opt) or (iii) evaluated for repeats (-P opt). For further details, \n"
"\tplease refer to the manuscript (Achaz et al., in prep)\n"
"*/\n"
);
fprintf(stderr,"General usage is '%s [-v|-h|-H] [ opts ] { core_value } file.fst [file2.fst]'\n\n",argv[0]);
help_corevalues();
help_options( );
help_information();
help_input();
help_output();
help_contact();
}

View File

@ -0,0 +1,51 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
#ifndef _help_h
#define _help_h
/******
file : help.c
function : Help for all repseek series
created : 02 Oct 2002
modif : Oct 2003
author : amikezor
*****/
#include <stdio.h>
#include <stdlib.h>
/********* *********
All you need for RepSeek
********* *********/
void printversion_repseek();
void printusage_repseek(char **argv);
void printhelp_repseek(char **argv);
void printmorehelp_repseek(char **argv);
#endif

View File

@ -0,0 +1,639 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/******
file : lmin.c
function : get lmin from a pvalue
created : Oct 11 2002
note : almost all functions comes from lari_stat.c
Apr. 97 <AV> first draft
April 98 <ed> K formula added
Oct 2002 <amikezor> - subset of lari_stat - add LminRiskSeq()
*****/
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "repseek_types.h"
#include "memory.h"
#define NALPHA 4
typedef struct {
int32_t nalpha;
int32_t total_symb, val_symb, bad_symb;
int32_t count[NALPHA+1];
float freq[NALPHA+1];
char alpha[NALPHA+2];
} FreqTable;
#undef NALPHA
/***
Few function coming from numrec as it
***/
static void nrerror(char *error_text)
{
fprintf(stderr,"> Numerical Recipes run-time error...\n");
fprintf(stderr,"> %s\n", error_text);
exit(1);
}
static float gammln(float xx)
{
double x,y,tmp,ser;
static double cof[6]={76.18009172947146,-86.50532032941677,
24.01409824083091,-1.231739572450155,
0.1208650973866179e-2,-0.5395239384953e-5};
int j;
y=x=xx;
tmp=x+5.5;
tmp -= (x+0.5)*log(tmp);
ser=1.000000000190015;
for (j=0;j<=5;j++) ser += cof[j]/++y;
return -tmp+log(2.5066282746310005*ser/x);
}
#define ITMAX 100
#define EPS 3.0e-7
static void gser(float *gamser, float a, float x, float *gln)
{
int n;
float sum,del,ap;
*gln=gammln(a);
if (x <= 0.0) {
if (x < 0.0) nrerror("x less than 0 in routine gser");
*gamser=0.0;
return;
} else {
ap=a;
del=sum=1.0/a;
for (n=1;n<=ITMAX;n++) {
++ap;
del *= x/ap;
sum += del;
if (fabs(del) < fabs(sum)*EPS) {
*gamser=sum*exp(-x+a*log(x)-(*gln));
return;
}
}
nrerror("a too large, ITMAX too small in routine gser");
return;
}
}
#define FPMIN 1.0e-30
static void gcf(float *gammcf, float a, float x, float *gln)
{
int i;
float an,b,c,d,del,h;
*gln=gammln(a);
b=x+1.0-a;
c=1.0/FPMIN;
d=1.0/b;
h=d;
for (i=1;i<=ITMAX;i++) {
an = -i*(i-a);
b += 2.0;
d=an*d+b;
if (fabs(d) < FPMIN) d=FPMIN;
c=b+an/c;
if (fabs(c) < FPMIN) c=FPMIN;
d=1.0/d;
del=d*c;
h *= del;
if (fabs(del-1.0) < EPS) break;
}
if (i > ITMAX) nrerror("a too large, ITMAX too small in gcf");
*gammcf=exp(-x+a*log(x)-(*gln))*h;
}
#undef ITMAX
#undef EPS
#undef FPMIN
static float gammq(float a, float x)
{
float gamser=0.0,
gammcf=0.0,
gln=0.0;
if (x < 0.0 || a <= 0.0) nrerror("Invalid arguments in routine gammq");
if (x < (a+1.0)) {
gser(&gamser,a,x,&gln);
return 1.0-gamser;
} else {
gcf(&gammcf,a,x,&gln);
return gammcf;
}
}
/* ----------------------------------------------- */
/* compute symbol frequencies */
/* from seq and alphabet */
/* ----------------------------------------------- */
int InitFreqTable(char *seq, char *alpha, FreqTable *table)
{
int i;
float norm;
char *s, *found;
strcpy(table->alpha, alpha);
table->nalpha = strlen(alpha);
for (i = 0 ; i <= table->nalpha ; i++)
table->count[i] = 0;
table->total_symb = 0;
for (s = seq; *s; s++) {
table->total_symb++;
if ( (found=strchr(alpha, *s) ))
table->count[(int)(found - alpha)]++;
else
table->count[table->nalpha]++;
}
table->bad_symb = table->count[table->nalpha];
table->val_symb = table->total_symb - table->bad_symb;
norm = MAX(1., table->val_symb);
for (i = 0 ; i < table->nalpha ; i++)
table->freq[i] = (float) table->count[i] / norm;
return table->val_symb;
}
/* ----------------------------------------------- */
/* compute x^n */
/* ----------------------------------------------- */
static double sPower(double x, int n)
{
double pow = 1.;
while (n-- > 0)
pow *= x;
return pow;
}
/* ----------------------------------------------- */
/* compute lambdaR */
/* ----------------------------------------------- */
static double LambdaR(FreqTable *table, int r)
{
int i;
double lamb = 0.;
for (i = 0 ; i < table->nalpha ; i++)
lamb += sPower((double)table->freq[i], r);
return lamb;
}
/* ----------------------------------------------- */
/* log of binomial coeff */
/* ----------------------------------------------- */
static double sLogBinoCof(int n, int p)
{
double lb = 0.;
while (p >= 1)
lb += log((double)n-- / (double)p--);
return lb;
}
/* ----------------------------------------------- */
/* compute a(x,y) */
/* ----------------------------------------------- */
static double sAlpha(double x, double y)
{
double x1, x2;
x1 = (log(1. - x) + log(y)) / log(x);
x2 = 0.57772 / log(x);
return -(x1 + x2);
}
/* ----------------------------------------------- */
/* compute E(L(N,r)) */
/* ----------------------------------------------- */
static double ExpectL(FreqTable *table, int N, int r)
{
double x1, x2, lambr;
lambr = LambdaR(table, r);
x1 = sLogBinoCof(N, r) / -log(lambr);
x2 = sAlpha(lambr, lambr);
return x1 + x2 + 0.5;
}
/* ----------------------------------------------- */
/* compute Var(N,r) */
/* ----------------------------------------------- */
double VarianceL(FreqTable *table, int r)
{
double x;
x = 1. / log(LambdaR(table, r));
return 1.645 * x * x;
}
/* ----------------------------------------------- */
/* compute recursive rlogN */
/* ----------------------------------------------- */
static void finN(indx, r, data, res)
int *indx, r, *data;
double *res;
{
int i;
double aux;
for (i = 0, aux=1. ; i < r ; i++)
aux *= data[indx[i]];
*res += aux;
}
static void recursN(indx, s, r, rcurr, ipos, data, res)
int *indx, *data;
double *res;
int s, r, rcurr, ipos;
{
int i;
if (rcurr == r) /* end of recursion */
finN(indx, r, data, res);
else { /* body of recursion */
for (i = ipos ; i < s ; i++) {
indx[rcurr] = i;
recursN(indx, s, r, rcurr+1, i+1, data, res);
}
}
}
/* ----------------------------------------------- */
/* compute recursive Rlambda */
/* ----------------------------------------------- */
static void finR(indx, r, data, res)
int *indx, r;
double *res;
FreqTable *data;
{
int i, j;
double aux, aux2;
for (j=0, aux2=0; j<4; j++){
for (i = 0, aux=1. ; i < r ; i++)
aux *= (double) data[indx[i]].freq[j];
aux2 += aux;
}
if (aux2>*res)
*res = aux2;
}
static void recursR(indx, s, r, rcurr, ipos, data, res)
int *indx, s, r, rcurr, ipos;
double *res;
FreqTable *data;
{
int i;
if (rcurr == r) /* end of recursion */
finR(indx, r, data, res);
else { /* body of recursion */
for (i = ipos ; i < s ; i++) {
indx[rcurr] = i;
recursR(indx, s, r, rcurr+1, i+1, data, res);
}
}
}
/* ----------------------------------------------- */
/* binomial coeff */
/* ----------------------------------------------- */
static double sBinoCof(int n, int p)
{
double b = 1.;
while (p >= 1)
b *= ((double)n-- / (double)p--);
return b;
}
/* ----------------------------------------------- */
/* compute E(K(N,r)) */
/* ----------------------------------------------- */
static double ExpectK(FreqTable *table, int r, int s)
{
double x2, x3, lambr;
int *indx, *data, i, nelt;
double x1;
if (s!=r)
nelt = ceil(sBinoCof(s, r));
else
nelt = 1;
indx = (int32_t *)MyCalloc( nelt, sizeof(int32_t), "ExpectK out of memory" ) ;
data = (int32_t *)MyCalloc( s, sizeof(int32_t), "ExpectK out of memory" ) ;
x1=0.;
for (i=0; i<s; i++)
data[i] = table[i].total_symb;
recursN(indx, s, r, 0, 0, data, &x1);
lambr=0;
recursR(indx, s, r, 0, 0, table, &lambr);
x2 = log(x1) / -log(lambr);
x3 = sAlpha(lambr, lambr);
MyFree(indx, nelt* sizeof(int32_t));
MyFree(data, s* sizeof(int32_t));
return x2 + x3 + 0.5;
}
/* ----------------------------------------------- */
/* compute VarK (N,r) */
/* ----------------------------------------------- */
static double VarianceK(FreqTable *table, int r)
{
double x;
x = 1. / log(LambdaR(table, r));
return 1.645 * x * x;
}
/* ----------------------------------------------- */
/* compute Erf(x) */
/* ----------------------------------------------- */
static double sErf(double x)
{
double gamp;
gamp = 1.0 - gammq(0.5, x * x);
return ((x < 0.) ? -gamp : gamp);
}
/* ----------------------------------------------- */
/* compute cumulative of Normal */
/* ----------------------------------------------- */
#define SQRT2 1.4142135624
static double sCumNormal(double x, double mu, double sigma)
{
double xx;
xx = (x - mu) / sigma / SQRT2;
return (1.0 + sErf(xx)) / 2.;
}
#undef SQRT2
/* ----------------------------------------------- */
/* compute Lmin(N,r) at error level p */
/* ----------------------------------------------- */
#define TOLERANCE 1e-6
static double LminRisk(FreqTable *table, int N, int r, float p)
{
double pp, mu, sigma, xmin, xmax, xcurr, pcurr;
if (p <= 0.)
return 1.;
if (p >= 1.)
return (double) N;
pp = 1.0 - (double) p;
mu = ExpectL(table, N, r);
sigma = sqrt(VarianceL(table, r));
xmin = 1;
xmax = N;
xcurr = (xmin + xmax) / 2. ;
pcurr = sCumNormal(xcurr, mu, sigma);
/* dichotomic search */
while (fabs(pp - pcurr) > TOLERANCE) {
if (pp > pcurr)
xmin = xcurr;
else
xmax = xcurr;
xcurr = (xmin + xmax) / 2. ;
pcurr = sCumNormal(xcurr, mu, sigma);
}
return xcurr;
}
/* ----------------------------------------------- */
/* compute K Lmin(N,r) at error level p */
/* ----------------------------------------------- */
static double KLminRisk(FreqTable *table, double rlogn, int r, int s,
float p, int len)
{
double pp, mu, sigma, xmin, xmax, xcurr, pcurr;
if (p <= 0.)
return 1.;
if (p >= 1.)
return rlogn;
pp = 1.0 - (double) p;
mu = ExpectK(table, r, s);
sigma = sqrt(VarianceK(table, r));
xmin = 1;
xmax = len;
xcurr = (xmin + xmax) / 2. ;
pcurr = sCumNormal(xcurr, mu, sigma);
/* dichotomic search */
while (fabs(pp - pcurr) > TOLERANCE) {
if (pp > pcurr)
xmin = xcurr;
else
xmax = xcurr;
xcurr = (xmin + xmax) / 2. ;
pcurr = sCumNormal(xcurr, mu, sigma);
}
return xcurr;
}
#undef TOLERANCE
/***
Take a sequencem the number of occurence and the prob and
return an associated lmin
***/
#define DNA_ALPHABET "ACGT"
static double LminRiskSeq(char *sequence, int r, float p)
{
int nSymbols=0;
char alphabet[32]={0,};
FreqTable ftable;
double Lmin=0.0;
(void) strcpy(alphabet, DNA_ALPHABET);
(void) InitFreqTable(sequence, alphabet, &ftable);
nSymbols = ftable.val_symb;
Lmin = LminRisk(& ftable, nSymbols, r, p);
return Lmin;
}
int16_t set_lmin(float pval, char *sequence){
float flmin;
int16_t lmin;
flmin = LminRiskSeq(sequence, 2, pval);
if(flmin - (float)(int16_t)flmin < 0.5)
lmin = (int16_t)flmin;
else
lmin = (int16_t)flmin + 1;
return lmin;
}
/***
Take a sequence the number of occurence and the prob and
return an associated lmin
***/
#define DNA_ALPHABET "ACGT"
static double LminRisk2Seq(char *Sequence1,
char *Sequence2,
int32_t SeqSize1,
int32_t SeqSize2,
int r, float p)
{
char alphabet[32]={0,}; /* an theoretical bigger alphabet than ACGT */
FreqTable ftable[2]; /* a freq for each sequence */
int nseq=2; /* number of sequences */
double Lmin=0.0; /* the value of interest */
(void) strcpy(alphabet, DNA_ALPHABET);
(void) InitFreqTable(Sequence1, alphabet, &ftable[0]); /* init freq of the first seq */
(void) InitFreqTable(Sequence2, alphabet, &ftable[1]); /* init freq of the second seq */
Lmin = KLminRisk(ftable, (double) r*log(SeqSize1+SeqSize2/nseq), r, nseq, p, SeqSize1+SeqSize2);
return Lmin;
}
#undef DNA_ALPHABET
int16_t set_lmin_2seqs(float pval,
char *Sequence1,
char *Sequence2,
int32_t SeqSize1,
int32_t SeqSize2)
{
double flmin;
int16_t lmin;
int8_t r=2;
flmin = LminRisk2Seq(Sequence1, Sequence2, SeqSize1, SeqSize2, r, pval);
if(flmin - (double)(int16_t)flmin < 0.5)
lmin = (int16_t)flmin;
else
lmin = (int16_t)flmin + 1;
return lmin;
}
#undef DNA_ALPHABET

View File

@ -0,0 +1,33 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
#ifndef lmin_h
#define lmin_h
int16_t set_lmin(float pval, char *sequence);
int16_t set_lmin_2seqs(float pval,
char *Sequence1,
char *Sequence2,
int32_t SeqSize1,
int32_t SeqSize2);
#endif /* lmin_h */

View File

@ -0,0 +1,671 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/**
* @file main_repseek.c
* @author Guillaume Achaz <achaz@abi.snv.jussieu.fr>, Eric Coissac <coissac@inrialpes.fr>
* @date Feb 26 2004
* @modif April 2004
* @modif Dec 2004 Adds for option -D : managing of mask for seed detection <EC>
* @modif sept 2005 re-write all align_blast2like. Correct Memory bugs <GA>
* @modif 17 oct 2005 - mode '4' <GA>
* @modif dec 2005 - remove mode choice <GA>
* @modif jan-fev-march 2006 - stable version (-l, -p or -P options) <GA>
* @modif may 2008 - count Xs in the sequence and remove them to compute smin <GA>
*
* @brief main function of repseek program
*
*
*/
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#if defined(MACOSX) || defined(LINUX) || defined(OSF) /* for options */
#include <unistd.h>
#endif
#include "repseek_types.h"
#include "sequence.h"
#include "readfst.h"
#include "output.h"
#include "help.h"
#include "families.h"
#include "KMRK.h"
#include "KMRK_Seeds.h"
#include "KMRK_merge_seeds.h"
#include "KMRK_mask.h"
#include "read_seeds.h"
#include "filter.h"
#include "align.h"
#include "memory.h"
#include "smin.h"
#include "lmin.h"
int main(int argc, char **argv){
/*
Repseek Core Values
*/
int32_t selected_l=0; /* the lmin chosen by the user --if any-- */
int32_t lmin=0; /* the statistical lmin for seeds detection */
double selected_s=0; /* estimated score minimum if repseek is running in mode '3' */
double smin=0; /* estimated score minimum if repseek is running in mode '3' */
float pval_lmin=0.0; /* a pval that gives an Lmin (Karlin & Ost equation) */
float pval_smin=0.0; /* a pval that gives an Smin (Watermann & Vingron regression) */
/*
Sequence(s)
*/
int8_t repseek_nseq; /* either 1 or 2, 1 being for 1 sequence and 2 for 2 sequences */
char *sequence; /* the first sequence */
char *sequence2=NULL; /* the second sequence --only use when repseek_nseq is 2 */
int32_t size; /* the size of the sequence */
int32_t size2=0; /* the size of the second sequence --only use when repseek_nseq is 2*/
int32_t totalSize; /* size of (both) sequence(s) */
int32_t nX=0, /* number of X in the first and the second sequence */
nX2=0;
masked_area_table_t *mask=NULL; /* masking table used during seed detection */
char *mask_file=NULL;
char opt_shape = 'l'; /* chromosome has to be set at 'l'inear or 'c'ircular */
/*
Output/Input File(s)
*/
char *output_file = NULL; /* the output file name base */
char *seed_file = NULL; /* the seed file that can be used instead of the seed detection */
FILE *fout=stdout; /* the output flux */
/*
Repseek Options
*/
int optc; /* what U need for options parsing */
extern int optind;
extern char *optarg;
int8_t opt_dir = 1; /* if 0 user do not want direct repeats */
int8_t opt_inv = 1; /* if 0 user do not want inverted repeats */
int8_t opt_MergeSeeds=0; /* if 1, thus user ask for seed merging - remove overlapping seeds */
int8_t opt_filter=0; /* if 1, thus user ask for filtering */
float min=0.0, /* if they are non-0, they are used to filter the seeds */
Max=0.0; /* min is the minimum of n-plication, Max is the maximum */
float merge_repeats=0.90; /* merge repeats when x% of both copies is located at the same spot */
int8_t opt_TableR=0; /* do we output TableR - 0 = no, default */
int8_t opt_PrintSeeds=0; /* do we output Seeds - 0 = no, default */
int8_t opt_overlap=1; /* if 0, repeats cannot have their two copies overlapping */
/*
repseek storage
*/
AllSeeds_type* AllSeeds; /* A structure that contains all seeds. Defined in KMRK_Seeds.h */
Repeats AllRepeats; /* A structure that contains all repeats. */
int32_t *TableR=NULL; /* A pointer to the tableR */
int32_t *TableR2=NULL; /* A pointer to a second tableR -- If there are 2 sequences */
/*
Repseek alignment variables
*/
float Xg=30.0; /* The exploratory value. for BLAST2. same notation than in BLAST2 paper */
float gap_open=4.0, /* gap open and gap_ext penality - expressed as a coefficient of the mean score. */
gap_ext=1.0;
char matrix_type='l'; /* matrix is by default 'l'og, can also be set to 'i'dentity */
/*
Parse options
*/
while( (optc=getopt(argc, argv, "Hhvl:p:L:P:TSdis:m:M:co:e:X:O:ID:r:BR:")) != -1)
switch(optc){
/*
Information
*/
case 'h':
printhelp_repseek(argv);
exit(1);
case 'H':
printmorehelp_repseek(argv);
exit(1);
case 'v':
printversion_repseek();
exit(1);
/*
Main Inputs
*/
case 'l':
selected_l = (int16_t)atoi(optarg);
if(selected_l < 1)
fprintf(stderr,"selected lmin has to be >= 1, bye\n"),exit(1);
break;
case 'p':
pval_lmin = atof(optarg);
if(pval_lmin > 1.0 || pval_lmin < 0.0)fprintf(stderr,"set 0.0>pval_lmin>=1.0, bye\n"),exit(1);
break;
case 'L':
selected_s = atof(optarg);
if(selected_s < 0.0 )fprintf(stderr,"selected smin has to be >=0.0, bye\n"),exit(1);
break;
case 'P':
pval_smin = atof(optarg);
if(pval_smin > 1.0 || pval_smin < 0.0)fprintf(stderr,"set 0.0>pval_smin>=1.0, bye\n"),exit(1);
break;
/*
Output/Input
*/
case 'r':
output_file = optarg;
break;
case 'T':
opt_TableR = 1;
break;
case 'S':
opt_PrintSeeds = 1;
break;
case 's':
seed_file = optarg;
break;
/*
Options for detection
*/
case 'd':
if(!opt_dir)
fprintf(stderr,"options '-d' and '-i' are mutually exclusive, bye\n"),exit(1);
opt_inv=0;
break;
case 'i':
if(!opt_inv)
fprintf(stderr,"options '-d' and '-i' are mutually exclusive, bye\n"),exit(1);
opt_dir=0;
break;
/*
Filtering
*/
case 'm':
min = atof(optarg);
opt_filter = 1;
break;
case 'M':
Max = atof(optarg);
opt_filter = 1;
break;
case 'B':
opt_MergeSeeds=1;
break;
case 'R':
merge_repeats= atof(optarg);
if(merge_repeats<0.0 || merge_repeats>1.0)
fprintf(stderr,"for merging repeats: 0.0 <= R <= 1.0 \n"),exit(1);
break;
/*
Alignment options
*/
case 'o':
gap_open = atof(optarg);
break;
case 'e':
gap_ext = atof(optarg);
break;
case 'X':
Xg = ABS(atof(optarg));
break;
case 'O':
opt_overlap = (int8_t )atoi(optarg);
if(opt_overlap != 0 && opt_overlap != 1)
fprintf(stderr,"opt_overlap can only be 1-can overlapp or 0- no overlap\n, bye"),exit(1);
break;
/*
Misc options
*/
case 'c':
opt_shape = 'c';
break;
case 'I':
matrix_type='i';
break;
case 'D':
mask_file = optarg;
break;
}
/*
Do we have the minimum to run ? (mode minimum input and an output)
*/
repseek_nseq = argc-optind;
if((repseek_nseq < 1) || (repseek_nseq > 2) )
printusage_repseek(argv),exit(1);
if( !selected_l && !pval_lmin && !pval_smin && !selected_s)
fprintf(stderr,"repseek minium core value is either -l/-p (i.e. set seed size lmin) or -L/-P (i.e. set repeat score smin)\n"),exit(1);
if( selected_l && pval_lmin )
fprintf(stderr,"Only one between -l, -p should be used (i.e. set seed lmin)\n"),exit(1);
if( selected_s && pval_smin )
fprintf(stderr,"Only one between -L, -P should be used (i.e. set repeat smin)\n"),exit(1);
if( pval_smin && (gap_open != 4.0 || gap_ext != 1.0 ) )
fprintf(stderr,"Using the Pval on repeats, gaps penalties should be default ones\n"),exit(1);
if(output_file){
fout = fopen(output_file,"w");
if(!fout) fprintf(stderr, "Cannot write into %s file, bye\n", output_file),exit(1);
}
fprintf(stderr,"Repseek mode... '%s' + '%s'\n",
pval_lmin?"seed stat":"seed no stat",
pval_smin?"repeat stat":"repeat no stat"
);
/*
Get sequence(s) in memory
*/
fprintf(stderr,"get sequence(s)... ");
sequence = readFastaSeq(argv[optind], &size, &nX);
totalSize = size;
if(repseek_nseq==2){
sequence2 = readFastaSeq(argv[optind+1], &size2, &nX2);
totalSize+=size2;
}
/*
Output Sequence Length and Xs
*/
if(totalSize < 1e3)
fprintf(stderr,"total size %d b ", totalSize );
else if(totalSize < 1e6)
fprintf(stderr,"total size %.2f kb ", ((float)totalSize)/1e3 );
else if(totalSize < 1e9)
fprintf(stderr,"total size %.2f Mb ", ((float)totalSize)/1e6 );
else
fprintf(stderr,"total size %.2f Gb ", ((float)totalSize)/1e9 );
switch( (char)floor( log(nX+nX2)/log(10) ) ){
case 0: case 1: case 2:
fprintf(stderr,"(%d Xs)\n", (nX+nX2) );
break;
case 3: case 4: case 5:
fprintf(stderr,"(%.2f K Xs)\n", (float)(nX+nX2)/1e3 );
break;
case 6: case 7: case 8:
fprintf(stderr,"(%.2f M Xs)\n", (float)(nX+nX2)/1e6 );
break;
default:
fprintf(stderr,"(%.2f G Xs)\n", (float)(nX+nX2)/1e9 );
break;
}
/*
handle the seed minimum size
*/
fprintf(stderr,"set lmin... ");
if(selected_l)
lmin=selected_l;
else{
if(pval_lmin){
if (repseek_nseq==2)
lmin=set_lmin_2seqs(pval_lmin, sequence,sequence2,size,size2); /* Lmin is computed ignoring X and N */
else
lmin=set_lmin(pval_lmin, sequence);
}
else{
if (repseek_nseq==2)
lmin=(int32_t)set_lmin_2seqs(0.001, sequence,sequence2,size,size2)*2.0/3.0;
else
lmin=(int32_t)set_lmin(0.001, sequence)*2.0/3.0;
}
}
fprintf(stderr, "seed >= %d bp ( L5%%: %d ; L1%%: %d ; L0.1%%: %d )\n",
lmin,
(repseek_nseq==2)?set_lmin_2seqs(0.05, sequence,sequence2,size,size2):set_lmin(0.05, sequence),
(repseek_nseq==2)?set_lmin_2seqs(0.01, sequence,sequence2,size,size2):set_lmin(0.01, sequence),
(repseek_nseq==2)?set_lmin_2seqs(0.001, sequence,sequence2,size,size2):set_lmin(0.001, sequence)
);
/*
Select the repeat minimum score
*/
fprintf(stderr,"set smin... ");
if(selected_s)
smin=selected_s; /* set the repeat minimum score as given in the input */
else{
if(pval_smin){
if (repseek_nseq==2)
smin = compute_smin_2seq(sequence, sequence2, size, size2, pval_smin, nX, nX2);
else
smin = compute_smin(sequence, pval_smin, size, nX);
}
else
smin = 0;
}
fprintf(stderr, "repeat >= %.2f score ( S5%%: %.2f ; S1%%: %.2f ; S0.1%%: %.2f )\n",
smin,
(repseek_nseq==2)?compute_smin_2seq(sequence, sequence2, size, size2, 0.05, nX, nX2):compute_smin(sequence, 0.05, size, nX),
(repseek_nseq==2)?compute_smin_2seq(sequence, sequence2, size, size2, 0.01, nX, nX2):compute_smin(sequence, 0.01, size, nX),
(repseek_nseq==2)?compute_smin_2seq(sequence, sequence2, size, size2, 0.001, nX, nX2):compute_smin(sequence, 0.001, size, nX)
);
/*
Begin the seed detection by KMR-K or read file
*/
if(seed_file) fprintf(stderr,"read seeds... ");
else fprintf(stderr,"detect seeds... ");
if (repseek_nseq==1)
{
if (mask_file)
{
if (opt_inv)
mask = KMRK_ReadMaskArea(mask_file,1,size); /* 2nd arg: #seq and the 3rd arg: complement */
else
mask = KMRK_ReadMaskArea(mask_file,1,0);
}
if(seed_file)
AllSeeds = read_seeds(seed_file, lmin, opt_dir, opt_inv, size, 0);
else
AllSeeds = KMRK_get_seeds(&sequence, size, lmin, opt_dir, opt_inv, 0, mask);
}else{
if (mask_file)
{
if (opt_inv)
mask = KMRK_ReadMaskArea(mask_file,2,size);
else
mask = KMRK_ReadMaskArea(mask_file,2,0);
}
if(seed_file)
AllSeeds = read_seeds(seed_file, lmin, opt_dir, opt_inv, size, size2);
else
AllSeeds = KMRK_get_seeds_2seqs(&sequence, &sequence2, size, size2, lmin, opt_dir, opt_inv, 0, mask);
}
fprintf(stderr, "%d direct & %d inverted seeds\n", AllSeeds->nDirSeeds, AllSeeds->nInvSeeds);
/*
Optionnal filters
*/
if (opt_filter)
{
fprintf(stderr,"filter, keep... ");
if (repseek_nseq==1)
KMRK_FamilySeeds(AllSeeds, min, Max, opt_dir, opt_inv, size);
else
KMRK_FamilySeeds2seq(AllSeeds, min, Max, opt_dir, opt_inv, size, size2);
fprintf(stderr, "%d direct and %d inverted seeds\n", AllSeeds->nDirSeeds, AllSeeds->nInvSeeds);
}
if(opt_MergeSeeds){
fprintf(stderr,"merge, keep... ");
KMRK_MergeSeeds(AllSeeds, opt_dir, opt_inv);
fprintf(stderr,"%d direct and %d inverted seeds\n", AllSeeds->nDirSeeds, AllSeeds->nInvSeeds);
}
/*
Final table(s) R of seeds
*/
/* filter seeds if they does not fit between min and Max */
if (repseek_nseq==1){
TableR = KMRK_SeedTableR(AllSeeds, opt_dir, opt_inv, size);
BuiltFilterFamily_seeds(AllSeeds, TableR, 0, 0, opt_dir, opt_inv);
}
else{
KMRK_SeedTableR2seq(AllSeeds, opt_dir, opt_inv, size,size2, &TableR,&TableR2);
BuiltFilterFamily_seeds2seq(AllSeeds, TableR, TableR2, 0, 0, opt_dir, opt_inv);
}
/*
reorder AllSeeds by position
*/
if (opt_dir) KMRK_sortSeedsByPos(AllSeeds->dirSeeds,AllSeeds->nDirSeeds);
if (opt_inv) KMRK_sortSeedsByPos(AllSeeds->invSeeds,AllSeeds->nInvSeeds);
if( opt_PrintSeeds ){
fprintf(stderr,"ouput seeds... ");
if(repseek_nseq == 1)
WriteSeeds(fout, AllSeeds, opt_dir, opt_inv);
else
WriteSeeds(fout, AllSeeds, opt_dir, opt_inv);
if(opt_TableR){
fprintf(fout,"# Seed TableR-1\n");
WriteTableR(fout, TableR, size);
if (repseek_nseq == 2){
fprintf(fout,"# Seed TableR-2\n");
WriteTableR(fout, TableR2, size2);
}
}
fprintf(stderr," done\n");
/*
Free stuff
*/
/* seeds */
KMRK_freeSeeds(AllSeeds);
/* sequence */
MyFree(sequence, (size+1)*sizeof(char));
if(repseek_nseq == 2)
MyFree(sequence2, (size2+1)*sizeof(char));
/* TableR */
MyFree(TableR, size*sizeof(int32_t));
if(repseek_nseq==2)MyFree(TableR2, size2*sizeof(int32_t));
PrintMaxMem();
exit(0);
}
/*
Free TableR of seeds
*/
MyFree(TableR, size*sizeof(int32_t));
if(repseek_nseq==2)MyFree(TableR2, size2*sizeof(int32_t));
/*
Extension : BLAST2-like Alignments
*/
if(repseek_nseq == 1)
AllRepeats = align_seeds(AllSeeds, sequence, Xg, gap_open, gap_ext, matrix_type,
lmin, opt_dir, opt_inv, opt_overlap, merge_repeats);
else
AllRepeats = align_seeds_2seq(AllSeeds, sequence, sequence2, Xg, gap_open, gap_ext, matrix_type,
lmin, opt_dir, opt_inv, merge_repeats);
fprintf(stderr,"align and merge... %d direct & %d inverted repeats (merge with R=%.2f)\n",
AllRepeats.nDirRep-AllRepeats.nDirBadRep, AllRepeats.nInvRep-AllRepeats.nInvBadRep, merge_repeats);
/*
If mode is stat on repeats, remove repeats with a score smaller than smin
*/
if( smin ){
UpdateRepeats(&AllRepeats, smin);
fprintf(stderr,"After score_min... %d direct & %d inverted repeats\n",
AllRepeats.nDirRep-AllRepeats.nDirBadRep, AllRepeats.nInvRep-AllRepeats.nInvBadRep);
}
/*
Built the repeats table(s) R.
*/
if (repseek_nseq==1)
TableR = SetRepFamily(AllRepeats, opt_dir, opt_inv, size);
else
SetNumberRep_2seqs(AllRepeats, opt_dir, opt_inv, size, size2, &TableR, &TableR2);
/*
Misc Output
*/
fprintf(stderr,"output repeats... ");
if(repseek_nseq == 1)
WriteRep(fout, AllRepeats, opt_dir, opt_inv, opt_shape, size);
else
WriteRep_2seqs(fout, AllRepeats, opt_dir, opt_inv, size, size2);
if(opt_TableR){
fprintf(fout,"# TableR-1\n");
WriteTableR(fout, TableR, size);
if (repseek_nseq == 2){
fprintf(fout,"# TableR-2\n");
WriteTableR(fout, TableR2, size2);
}
}
fprintf(stderr,"done\n");
fclose(fout);
/*
All free functions
*/
/* table(s) R */
MyFree(TableR, size*sizeof(int32_t));
if(repseek_nseq == 2)
MyFree(TableR2, size2*sizeof(int32_t));
/* seeds and repeats */
KMRK_freeSeeds(AllSeeds);
free_Repeats(AllRepeats);
/* sequence */
MyFree(sequence, (size+1)*sizeof(char));
if(repseek_nseq == 2)
MyFree(sequence2, (size2+1)*sizeof(char));
/* PrintMem("At the end"); */
PrintMaxMem();
return 0;
}

View File

@ -0,0 +1,258 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/******
file : memory.c
function : All about memory of the KMR, Seeds and Repeats
All MyMalloc() series is about follwoing how mauch memory has been used
created : 19 Sep 2003
modif : Oct 2003, Feb 2004
modif : Dec 2004 <EC> ; Corrected Memory declaration
author : amikezor
*****/
#include <stdio.h>
#include <stdlib.h>
#include "repseek_types.h"
#include "memory.h"
MemUsage Memory;
/*
Functions to count the memory usage all along
dybamic allocation and free
*/
void PrintMem(char *Comment){
extern MemUsage Memory;
fprintf(stderr,"\n%s\nMemory Usage\n\t* Max is: %d bytes, %.2f Kb, %.2f Mb\n\t* Cur is: %d bytes, %.2f Kb, %.2f Mb\n",
Comment,
Memory.Max, (float)Memory.Max/1024, (float)Memory.Max/(1024*1024),
Memory.Current, (float)Memory.Current/1024, (float)Memory.Current/(1024*1024));
}
void PrintMaxMem( void ){
extern MemUsage Memory;
if(Memory.Max < 1024)
fprintf(stderr,"Max Memory Usage.. %d bytes\n", Memory.Max);
else if(Memory.Max < 1024*1024)
fprintf(stderr,"Max Memory Usage.. %.2f Kilobytes\n", (float)Memory.Max/1024);
else if(Memory.Max < 1024*1024*1024)
fprintf(stderr,"Max Memory Usage.. %.2f Megabytes\n", (float)Memory.Max/(1024*1024));
else
fprintf(stderr,"Max Memory Usage.. %.2f Gigabytes\n", (float)Memory.Max/(1024*1024*1024));
}
void Update_Mem(int32_t Value){
extern MemUsage Memory;
Memory.Current += Value;
Memory.Max = (Memory.Current>Memory.Max)?Memory.Current:Memory.Max;
}
void Init_Mem(int32_t Value){
extern MemUsage Memory;
Memory.Current = Value;
Memory.Max = Value;
}
/*
Replace functions of dynamic allocation
to allow the tracking of memory usage
*/
void *MyMalloc( int32_t size , char *Error ){
void *pointer;
/* fprintf(stderr,"malloc: %s %d\n",Error, size); */
pointer = malloc(size);
if(!pointer)fprintf(stderr,"%s\n",Error),exit(3);
Update_Mem(size); /* fprintf(stderr,"Mem += %ld - malloc %s\n", size, Error); */
return pointer;
}
void *MyCalloc( int32_t number, int32_t TypeSize , char *Error ){
void *pointer;
/* fprintf(stderr,"calloc: %s %d of size %d\n", Error, number, TypeSize); */
pointer = calloc(number, TypeSize);
if(!pointer)fprintf(stderr,"%s\n",Error),exit(3);
Update_Mem(number*TypeSize ); /* fprintf(stderr,"Mem += %ld - calloc %s\n", number*TypeSize, Error); */
return pointer;
}
void MyFree( void *Pointer, int32_t size){
if(!Pointer)return;
/* fprintf(stderr,"free: %d\n", size); */
free(Pointer);
Pointer=NULL;
Update_Mem(-size); /* fprintf(stderr,"Mem -= %ld - free\n", size); */
}
void *MyRealloc( void *Pointer, int32_t newsize, int32_t oldsize, char *Error){
/* fprintf(stderr,"realloc: %s %d (from %d)\n", Error, newsize, oldsize); */
Pointer = realloc(Pointer,newsize);
if(!Pointer)fprintf(stderr,"%s\n",Error),exit(3);
Update_Mem( newsize-oldsize ); /* fprintf(stderr,"Mem += %ld - realloc %s\n", (newsize-oldsize), Error); */
return Pointer;
}
/*
Deal with Stacks structure for KMR
void MallocStack(Stacks *s, int32_t number, int32_t *histo, int32_t AllValues){
int32_t i;
s->nStacks = number;
s->nValues = AllValues;
s->ppStacks = (int32_t **)MyMalloc( number * sizeof(int32_t *),
"MallocStack: ppStacks malloc error, bye\n");
s->lenStacks = (int32_t *)MyMalloc( number * sizeof(int32_t),
"MallocStack: lenStacks malloc error, bye\n");
s->ppStacks[0] = (int32_t *)MyMalloc( AllValues * sizeof(int32_t),
"MallocStack: ppStacks[0] malloc error, bye\n");
s->lenStacks[0]=0;
for(i=1;i < number; i++){
s->lenStacks[i]=0;
s->ppStacks[i] = s->ppStacks[i-1] + histo[i] ;
}
}
void FreeStack( Stacks *p){
MyFree(p->ppStacks[0] , p->nValues*sizeof(int32_t) );
MyFree(p->ppStacks , p->nStacks*sizeof(int32_t *) );
MyFree(p->lenStacks , p->nStacks*sizeof(int32_t));
}
*/
/*
Deal with the Seeds part
void free_Seeds(Seeds AllSeeds)
{
if( AllSeeds.nDirSeeds ){
MyFree(AllSeeds.DirPos1, AllSeeds.nDirSeeds*sizeof(int32_t));
MyFree(AllSeeds.DirPos2, AllSeeds.nDirSeeds*sizeof(int32_t));
MyFree(AllSeeds.DirLen, AllSeeds.nDirSeeds*sizeof(int32_t));
MyFree(AllSeeds.DirMeanR, AllSeeds.nDirSeeds*sizeof(float));
}
if(AllSeeds.nInvSeeds ){
MyFree(AllSeeds.InvPos1, AllSeeds.nInvSeeds*sizeof(int32_t));
MyFree(AllSeeds.InvPos2, AllSeeds.nInvSeeds*sizeof(int32_t));
MyFree(AllSeeds.InvLen, AllSeeds.nInvSeeds*sizeof(int32_t));
MyFree(AllSeeds.InvMeanR, AllSeeds.nInvSeeds*sizeof(float));
}
}
*/
/*
Malloc if it is the first time otherwise readjust
void AllocSeeds(Seeds *AllSeeds, int32_t size, int32_t old_size, int8_t opt_dir, int8_t opt_inv){
if(opt_inv != 1 && opt_dir != 1)
fprintf(stderr,"AllocSeeds: requiere at least one of opt_dir and opt_inv to be 1\n"),exit(4);
if(opt_dir == 1){
if( AllSeeds->DirPos1 == NULL){
AllSeeds->DirPos1 = (int32_t *)MyCalloc(size , sizeof(int32_t),"AllocSeeds: Alloc for DirPos1 failed, bye");
AllSeeds->DirPos2 = (int32_t *)MyCalloc(size , sizeof(int32_t),"AllocSeeds: Alloc for DirPos2 failed, bye");
}
else{
AllSeeds->DirPos1 = (int32_t *)MyRealloc(AllSeeds->DirPos1, size * sizeof(int32_t), old_size* sizeof(int32_t),
"AllocSeeds: realloc for DirPos1 failed, bye");
AllSeeds->DirPos2 = (int32_t *)MyRealloc(AllSeeds->DirPos2, size * sizeof(int32_t), old_size* sizeof(int32_t),
"AllocSeeds: realloc for DirPos2 failed, bye");
}
}
if(opt_inv == 1){
if( AllSeeds->InvPos1 == NULL){
AllSeeds->InvPos1 = (int32_t *)MyCalloc(size , sizeof(int32_t), "AllocSeeds: Alloc for InvPos1 failed, bye");
AllSeeds->InvPos2 = (int32_t *)MyCalloc(size , sizeof(int32_t), "AllocSeeds: Alloc for InvPos2 failed, bye");
}
else{
AllSeeds->InvPos1 = (int32_t *)MyRealloc(AllSeeds->InvPos1, size * sizeof(int32_t), old_size* sizeof(int32_t),
"AllocSeeds: realloc for InvPos1 failed, bye");
AllSeeds->InvPos2 = (int32_t *)MyRealloc(AllSeeds->InvPos2, size * sizeof(int32_t), old_size* sizeof(int32_t),
"AllocSeeds: realloc for InvPos2 failed, bye");
}
}
}
*/
/*
Deal with the Repeats structure
*/
Repeats mem_Repeats(int32_t Ndir, int32_t Ninv){
Repeats AllRepeats; /* All Repeats structure */
AllRepeats.nDirRep = Ndir; /* set the number of repeats to the number of seeds */
AllRepeats.nInvRep = Ninv;
AllRepeats.nDirBadRep = 0; /* set the "bad" repet (included into another rep) as 0 */
AllRepeats.nInvBadRep = 0;
if(AllRepeats.nDirRep)
AllRepeats.DirRep = (Rep *)MyMalloc( (AllRepeats.nDirRep)*sizeof(Rep), "init_Repeats: repdir malloc error" );
else
AllRepeats.DirRep = NULL;
if(AllRepeats.nInvRep)
AllRepeats.InvRep = (Rep *)MyMalloc( (AllRepeats.nInvRep)*sizeof(Rep), "init_Repeats: repinv malloc error" );
else
AllRepeats.InvRep = NULL;
return AllRepeats;
}
void free_Repeats(Repeats AllRep)
{
if(AllRep.nDirRep)
MyFree(AllRep.DirRep, AllRep.nDirRep*sizeof(Rep));
if(AllRep.nInvRep)
MyFree(AllRep.InvRep, AllRep.nInvRep*sizeof(Rep));
}

View File

@ -0,0 +1,130 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/**
* @file memory.h
* @author Achaz G
* @date April 2004
*
* @brief header for memory alloc/dealloc
* modif : Dec 2004 <EC> ; Corrected Memory declaration
*
*
*/
#ifndef _MEMORY_H_
#define _MEMORY_H_
#include "repseek_types.h"
typedef struct { /********** Memory Usage structure **************/
int32_t Max;
int32_t Current;
} MemUsage;
#include <stdio.h>
#include <stdlib.h>
#if defined(DMALLOC)
#include <dmalloc.h>
#endif
/********** **********
Global Variable(s)
********** **********/
extern MemUsage Memory; /* Instance of the global variable for memory tracking */
/*
Follow memory usage
*/
void PrintMem(char *Comment);
void PrintMaxMem( void );
void Update_Mem(int32_t Value);
void Init_Mem(int32_t Value);
/*
All Alloc/Free to follow of memory usage
*/
void *MyMalloc( int32_t size , char *Error );
void *MyCalloc( int32_t number, int32_t TypeSize , char *Error );
void MyFree( void *Pointer, int32_t size);
void *MyRealloc( void *Pointer, int32_t newsize, int32_t oldsize, char *Error);
/*
For Stacks
void MallocStack(Stacks *s, int32_t number, int32_t *histo, int32_t AllValues);
void FreeStack( Stacks *p);
For Seeds
void free_Seeds(Seeds AllSeeds);
void AllocSeeds(Seeds *AllSeeds, int32_t size, int32_t old_size, int8_t opt_dir, int8_t opt_inv);
*/
/*
For Repeats
*/
Repeats mem_Repeats(int32_t Ndir, int32_t Ninv);
void free_Repeats(Repeats AllRep);
/*
Not used anymore, but just in case
*/
#define KMRK_MALLOC(var,type,size,message) { \
var = (type*) malloc(size); \
if (!var) \
{ \
fprintf(stderr,"%s\n",message); \
exit(4); \
} \
}
#define KMRK_CALLOC(var,type,length,message) { \
var = (type*) calloc(length,sizeof(type)); \
if (!var) \
{ \
fprintf(stderr,"%s\n",message); \
exit(4); \
} \
}
#define KMRK_REALLOC(var,type,size,message) { \
var = (type*) realloc(var,size); \
if (!var) \
{ \
fprintf(stderr,"%s\n",message); \
exit(4); \
} \
}
#endif /* _MEMORY_H_*/

View File

@ -0,0 +1,186 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/**
* @file align_memory.c
* @author amikezor <gachaz@oeb.harvard.edu>
* @date Jul 07 2003
* @modif July 2003, Oct 2003, Nov 2003 -small bug in realloc traceback, Sep 05
*
* @brief all malloc/realloc mem of the results structure
*
*
*/
#include "repseek_types.h"
#include "memory.h"
#include <stdio.h>
#include <stdlib.h>
#define MIN_COLS 1000
#define MIN_ROWS 1000
#define MIN_ALIGN 10000
#define MIN_SCORES 10000
/*
All the memory you need for the alignment itself
*/
void mem_Blast2Align(RESULTS *pResults){
pResults->scores = (double *) MyCalloc( MIN_SCORES, sizeof(double) , "mem_Blast2Align: scores calloc error");
pResults->traces = (int32_t *) MyCalloc( MIN_SCORES, sizeof(int32_t) , "mem_Blast2Align: traces calloc error" );
pResults->traceback_f = (char *) MyCalloc( MIN_ALIGN, sizeof(char) , "mem_Blast2Align: traceback_f calloc error");
pResults->traceback_b = (char *) MyCalloc( MIN_ALIGN, sizeof(char) , "mem_Blast2Align: traceback_b calloc error");
pResults->Segment_begin = (int32_t *) MyCalloc( MIN_ROWS, sizeof(int32_t) , "mem_Blast2Align: Segment_begin calloc error");
pResults->Segment_end = (int32_t *) MyCalloc( MIN_ROWS, sizeof(int32_t) , "mem_Blast2Align: Segment_end calloc error");
pResults->Top = (int32_t *) MyCalloc( MIN_COLS, sizeof(int32_t) , "mem_Blast2Align: Top calloc error");
pResults->F = (double *) MyCalloc( MIN_COLS, sizeof(double) , "mem_Blast2Align: F calloc error");
pResults->BestScore_row=0;
pResults->BestScore_col=0;
pResults->max_scores=MIN_SCORES;
pResults->max_col= MIN_COLS;
pResults->max_row= MIN_ROWS;
pResults->max_alignlen=MIN_ALIGN;
}
/*
The alignemnt results structure
*/
void free_Blast2Align(RESULTS *pResults){
MyFree(pResults->traceback_f, pResults->max_alignlen*sizeof(char));
MyFree(pResults->traceback_b, pResults->max_alignlen*sizeof(char));
MyFree(pResults->scores, pResults->max_scores*sizeof(double));
MyFree(pResults->traces, pResults->max_scores*sizeof(int32_t));
MyFree(pResults->Top, pResults->max_col*sizeof(int32_t));
MyFree(pResults->F, pResults->max_col*sizeof(double));
MyFree(pResults->Segment_begin, pResults->max_row*sizeof(int32_t));
MyFree(pResults->Segment_end, pResults->max_row*sizeof(int32_t));
}
/*
if sig == 0, then double mem,
otherwise readjust to sig
*/
void ReallocScores(RESULTS *pResults, int32_t sig){
int32_t old=pResults->max_scores;
int32_t nscores = pResults->pscore - pResults->scores;
int32_t Bscores = pResults->pBestScore - pResults->scores;
if( ! sig )
(pResults->max_scores)*=2;
else
pResults->max_scores=sig;
pResults->traces = (int32_t *)MyRealloc( pResults->traces , sizeof(int32_t) * pResults->max_scores, old*sizeof(int32_t),
"ReallocScores: realloc traces error, bye" );
pResults->scores = (double *)MyRealloc( pResults->scores , sizeof(double) * pResults->max_scores , old*sizeof(double),
"ReallocScores: realloc scores error, bye" );
pResults->pscore=(pResults->scores + nscores); /* set correctly the current score */
pResults->pBestScore=(pResults->scores + Bscores); /* set correctly the current best score */
}
/*
if sig == 0, then double mem,
otherwise readjust to sig
*/
void ReallocTraceback(RESULTS *pResults, int32_t sig, char direction){
int32_t old=pResults->max_alignlen;
if( ! sig )
(pResults->max_alignlen)*=2;
else
pResults->max_alignlen=sig;
pResults->traceback_f = (char *)MyRealloc( pResults->traceback_f ,sizeof(char)*pResults->max_alignlen, old*sizeof(char),
"ReallocTraceback: traceback_f realloc error, bye" );
pResults->traceback_b = (char *)MyRealloc( pResults->traceback_b ,sizeof(char)*pResults->max_alignlen, old*sizeof(char),
"ReallocTraceback: traceback_b realloc error, bye" );
if(direction == 'f')
pResults->traceback=pResults->traceback_f;
else if(direction == 'b')
pResults->traceback=pResults->traceback_b;
else
fprintf(stderr,"traceback: direction is either 'f' or 'b', bye\n"),exit(4);
}
/*
if sig == 0, then double mem,
otherwise readjust to sig
*/
void ReallocAbove(RESULTS *pResults, int32_t sig){
int32_t old=pResults->max_col;
if( ! sig )
(pResults->max_col)*=2;
else
pResults->max_col=sig;
pResults->Top = (int32_t *)MyRealloc( (void *)pResults->Top ,sizeof(int32_t)*pResults->max_col, sizeof(int32_t)*old, "ReallocAbove: realloc error, bye" );
pResults->F = (double *)MyRealloc( (void *)pResults->F ,sizeof(double)*pResults->max_col, sizeof(double)*old, "ReallocAbove: realloc error, bye" );
}
/*
if sig == 0, then double mem,
otherwise readjust to sig
*/
void ReallocBegEnd(RESULTS *pResults, int32_t sig){
int32_t old=pResults->max_row;
if( ! sig )
(pResults->max_row)*=2;
else
pResults->max_row=sig;
pResults->Segment_begin = (int32_t *)MyRealloc( (void *)pResults->Segment_begin , sizeof(int32_t)*pResults->max_row ,
sizeof(int32_t)*old, "ReallocBegEnd: Segment_begin realloc error, bye");
pResults->Segment_end = (int32_t *)MyRealloc( (void *)pResults->Segment_end ,sizeof(int32_t)*pResults->max_row ,
sizeof(int32_t)*old, "ReallocBegEnd: Segment_end realloc error, bye");
}
#undef MIN_SCORES
#undef MIN_COLS
#undef MIN_ALIGN

View File

@ -0,0 +1,47 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/**
* @file align_memory.h
* @author amikezor <gachaz@oeb.harvard.edu>
* @date April 2004
*
* @brief header for all memory alloc/dealloc for the alignemnt part.
*
*
*/
#ifndef _align_memory_h_
#define _align_memory_h_
void mem_Blast2Align(RESULTS *pResults);
void free_Blast2Align(RESULTS *pResults);
void ReallocScores(RESULTS *pResults, int32_t sig);
void ReallocTraceback(RESULTS *pResults, int32_t sig, char direction);
void ReallocAbove(RESULTS *pResults, int32_t sig);
void ReallocBegEnd(RESULTS *pResults, int32_t sig);
#endif /* _align_memory_h_ */

View File

@ -0,0 +1,215 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/***
file : output.c
function : write out all information
created : 02 Oct 2002
modif : July 2003 , Oct 2003, July04 (score is now a double), dec 2005
author : amikezor
***/
#include "repseek_types.h"
#include "KMRK_Seeds.h"
#include "align.h"
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
/***
Write out all repeats
***/
#define FPF fprintf(fout,
void WriteRep(FILE *fout, Repeats AllRepeats, int8_t opt_dir, int8_t opt_inv, char opt_shape, int32_t sizeofchr)
{
int32_t i;
int32_t spacer;
if(!opt_dir && !opt_inv)
fprintf(stderr, "WriteRep: need opt_dir or opt_inv\n"),exit(4);
if(opt_shape!='l' && opt_shape!='c')
fprintf(stderr, "WriteRep: need opt_shape set to 'l' or 'c'\n"),exit(4);
if(opt_dir){
#define REP(i) AllRepeats.DirRep[i]
for(i=0; i<AllRepeats.nDirRep; i++){
if(REP(i).pos1 == -1)continue;
if(opt_shape == 'l')
spacer = REP(i).pos2 -1 - (REP(i).pos1+REP(i).len1-1);
else
spacer=MIN( REP(i).pos2-1 - (REP(i).pos1+REP(i).len1-1) , sizeofchr-1 - (REP(i).pos2+REP(i).len2-1) + REP(i).pos1 );
FPF "%s.dir\t%ld\t%ld\t%ld\t%ld\t%ld\t",REP(i).type, (long) REP(i).pos1+1, (long) REP(i).pos2+1, (long) REP(i).len1, (long) REP(i).len2, (long) spacer);
FPF "%ld-%ld-%ld-%.2f\t", (long) REP(i).seed_pos1+1, (long) REP(i).seed_pos2+1, (long) REP(i).seed_len, REP(i).seed_meanR);
FPF "%.3f\t%.2f\t", 100*(float)REP(i).match/(float)REP(i).align_len, REP(i).score );
FPF "%.2f\t%ld\t%.2f\n", REP(i).meanR, (long) REP(i).mainR, REP(i).fraction_mainR );
}
#undef REP
}
if(opt_inv){
#define REP(i) AllRepeats.InvRep[i]
for(i=0; i<AllRepeats.nInvRep; i++){
if(REP(i).pos1 == -1)continue;
if(opt_shape == 'l')
spacer = REP(i).pos2 -1 - (REP(i).pos1+REP(i).len1-1);
else
spacer=MIN( REP(i).pos2-1 - (REP(i).pos1+REP(i).len1-1) , sizeofchr-1 - (REP(i).pos2+REP(i).len2-1) + REP(i).pos1 );
FPF "%s.inv\t%ld\t%ld\t%ld\t%ld\t%ld\t",REP(i).type,(long) REP(i).pos1+1, (long) REP(i).pos2+1, (long) REP(i).len1, (long) REP(i).len2,(long) spacer);
FPF "%ld-%ld-%ld-%.2f\t", (long)REP(i).seed_pos1+1, (long)REP(i).seed_pos2+1, (long)REP(i).seed_len, REP(i).seed_meanR);
FPF "%.3f\t%.2f\t", 100*(float)REP(i).match/(float)REP(i).align_len, REP(i).score );
FPF "%.2f\t%ld\t%.2f\n", REP(i).meanR, (long) REP(i).mainR, REP(i).fraction_mainR );
}
#undef REP
}
}
/***
Write out all repeats
**/
void WriteRep_2seqs(FILE *fout, Repeats AllRepeats, int8_t opt_dir, int8_t opt_inv, int32_t size1, int32_t size2)
{
int32_t i;
int32_t diff2edge, diff1, diff2;
if(!opt_dir && !opt_inv)
fprintf(stderr, "WriteRep2: need opt_dir or opt_inv\n"),exit(4);
if(opt_dir){
#define REP(i) AllRepeats.DirRep[i]
for(i=0; i<AllRepeats.nDirRep; i++){
if(REP(i).pos1 == -1)continue;
diff1 = ( REP(i).pos1+1>size1/2 )? REP(i).pos1+1-size1/2 : REP(i).pos1+1;
diff2 = ( REP(i).pos2> size2/2 )? REP(i).pos2 -size2/2 : REP(i).pos2;
diff2edge = ABS(diff1 - diff2);
FPF "%s.dir\t%ld\t%ld\t%ld\t%ld\t%ld\t","Interseq",(long) REP(i).pos1+1, (long) REP(i).pos2+1, (long) REP(i).len1, (long) REP(i).len2, (long)diff2edge);
FPF "%ld-%ld-%ld\t%.3f\t%.2f\t",(long)REP(i).seed_pos1+1, (long) REP(i).seed_pos2+1, (long) REP(i).seed_len,
100*(float)REP(i).match / (float)REP(i).align_len, REP(i).score );
FPF "%.2f\t%ld\t%.2f\n", REP(i).meanR, (long) REP(i).mainR, REP(i).fraction_mainR );
}
#undef REP
}
if(opt_inv){
#define REP(i) AllRepeats.InvRep[i]
for(i=0; i<AllRepeats.nInvRep; i++){
if(REP(i).pos1 == -1)continue;
diff1 = (REP(i).pos1+1>size1/2)?REP(i).pos1+1-size1/2:REP(i).pos1+1;
diff2 = (REP(i).pos2> size2/2)?REP(i).pos2-size2/2:REP(i).pos2;
diff2edge = ABS(diff1 - diff2);
FPF "%s.inv\t%ld\t%ld\t%ld\t%ld\t%ld\t","Interseq", (long)REP(i).pos1+1, (long) REP(i).pos2+1 , (long) REP(i).len1, (long) REP(i).len2, (long) diff2edge);
FPF "%ld-%ld-%ld\t%.3f\t%.2f\t", (long) REP(i).seed_pos1+1, (long) REP(i).seed_pos2+1, (long) REP(i).seed_len,
100*(float)REP(i).match / (float)REP(i).align_len, REP(i).score);
FPF "%.2f\t%ld\t%.2f\n", REP(i).meanR, (long) REP(i).mainR, REP(i).fraction_mainR );
}
#undef REP
}
}
/***
Write out the table_r (degree of repetition of each seq position)
***/
void WriteTableR(FILE *fout, int32_t *table_r, int32_t max)
{
int32_t i;
if( table_r == NULL )
return;
for(i=0;i<max; i++)
if(table_r[i]>=2)
fprintf(fout,
"TableR %ld %ld\n",
(long) i+1,
(long) table_r[i]);
}
/***
Write out the seeds that will be used for extension phase
***/
void WriteSeeds(FILE *fout,
AllSeeds_type *AllSeeds,
int8_t opt_dir,
int8_t opt_inv)
{
Seed_type *curSeed;
int32_t i; /* just a dummy counter */
if(!opt_dir && !opt_inv)
fprintf(stderr, "WriteSeeds: need opt_dir or opt_inv\n"),exit(4);
if(opt_dir){
for(i=0,curSeed = AllSeeds->dirSeeds; i<AllSeeds->nDirSeeds; i++,curSeed++)
FPF "d\t%ld\t%ld\t%ld\t%.2f\n", (long) curSeed->pos1 + 1, (long) curSeed->pos2 + 1, (long) curSeed->length, curSeed->rmean);
}
if(opt_inv){
for(i=0,curSeed = AllSeeds->invSeeds;i<AllSeeds->nInvSeeds;i++,curSeed++)
FPF "i\t%ld\t%ld\t%ld\t%.2f\n", (long) curSeed->pos1 + 1, (long) curSeed->pos2 + 1, (long) curSeed->length, curSeed->rmean);
}
}
#undef FPF

View File

@ -0,0 +1,47 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/**
* @file output.h
* @author Guillaume Achaz <gachaz@oeb.harvard.edu>
* @date April 2004
* @modif Dec 2005
* @brief header for output functions
*
*
*/
#ifndef output_h
#define output_h
#include "KMRK_Seeds.h"
void WriteSeeds(FILE *fout,
AllSeeds_type *AllSeeds,
int8_t opt_dir,
int8_t opt_inv);
void WriteRep_2seqs(FILE *fout, Repeats AllRepeats, int8_t opt_dir, int8_t opt_inv, int32_t size1, int32_t size2);
void WriteRep(FILE *fout, Repeats AllRepeats, int8_t opt_dir, int8_t opt_inv, char opt_shape, int32_t sizeofchr);
void WriteTableR(FILE *fout, int32_t *table_r, int32_t max);
#endif /* output_h */

View File

@ -0,0 +1,108 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/******
file : read_seeds.c
function : read seeds from a file (instead of using KMRK)
created : Dec 2005
modif :
author : amikezor
*****/
#include "repseek_types.h"
#include "KMRK_mask.h"
#include "KMRK_Seeds.h"
#include <stdio.h>
#include <stdlib.h>
/*
This function takes a file name
the minimum length, if dir and/or inverted should be read
the size of the firt sequence and the size of the second sequence (if any))
*/
AllSeeds_type *read_seeds(char *seed_file, int32_t lmin, int8_t opt_dir, int8_t opt_inv, int32_t size1, int32_t size2){
AllSeeds_type *all_seeds=NULL; /* a seed pointer */
FILE *fcin=NULL; /* file descriptor */
char direction; /* either 'd' or 'i' */
int8_t dir; /* 1 if direct, 0 otherwise */
int32_t begin, /* temporary begin, end and len of a seed - tmp is needed when swap is requiered */
end,
len,
tmp;
int32_t nseed=0; /* the seed_number */
all_seeds = KMRK_allocSeeds(all_seeds, 1, opt_dir, opt_inv); /* get seeds */
/*all_seeds = KMRK_allocSeeds(all_seeds, 1, 1, 1);*/ /* get seeds */
/*all_seeds = KMRK_allocSeeds(all_seeds, 1, 0, 1);*/ /* get seeds */
fcin = fopen(seed_file, "r");
if(!fcin)fprintf(stderr, "error whilst trying to read seed_file %s, please check it; bye\n", seed_file),exit(2);
while( fscanf(fcin, "%c%ld%ld%ld", &direction, (long *) &begin, (long *) &end, (long *) &len) == 4){
dir=1; /* by default it is direct -- for the switch -- */
switch(direction){
case 'i':
if(begin>end){ tmp=begin; begin=end; end=tmp; }
dir=0;
case 'd':
if( (dir && !opt_dir) || (!dir && !opt_inv) )break;
if(len>=lmin)
{
if( begin<1 ||
( !size2 && end+len-1>size2 ) ||
( size2 && (end+len-1>size2 || begin+len-1>size1) )
)
KMRK_pushSeed(all_seeds, begin-1, end-1, len, dir);
}
break;
default:
fprintf(stderr, "direction of seed at line %d is not valid, bye",nseed+1) , exit(4);
}
while( fgetc(fcin) != '\n'); /* remove other infos at the end of line */
nseed++;
}
fclose(fcin);
KMRK_compactSeeds(all_seeds);
return all_seeds;
}

View File

@ -0,0 +1,32 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/******
file : read_seeds.h
function : header for read seeds
created : Dec 2005
modif :
author : amikezor
*****/
#include "repseek_types.h"
AllSeeds_type *read_seeds(char *seed_file, int32_t lmin, int8_t opt_dir, int8_t opt_inv, int32_t size1, int32_t size2);

View File

@ -0,0 +1,222 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/******
file : readfst.c
function : what you need to read the fst files
created : Oct 03 2003
modif : may 2008 - count Xs in the sequence
: sept 2014 - add an extra-check when reading fasta file
note : derived from the libiofst src, changing the malloc() to MyMalloc()
author : amikezor
*****/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "repseek_types.h"
#include "sequence.h"
#include "memory.h"
static int32_t size_oneseq(FILE *fseq)
{
int32_t size, count=0, bonux=0, /* size of the seq, count number of letter/line, and final carac(\n or nothing) */
begin_oct, end_oct; /* begin and and end in octet */
int temp; /* used for temp memory */
while( fgetc(fseq) != '\n'); /* get out the first line [comments] */
begin_oct = ftell(fseq); /* where begin the second line */
while( fgetc(fseq) != '\n')
count++; /* get the size of the second line */
fseek(fseq, -1, SEEK_END); /* Do we have ACGT\n[EOF] or ACGT[EOF] */
bonux = (fgetc(fseq) == '\n')?1:0;
fseek(fseq,-count-bonux,SEEK_END);
while( fgetc(fseq) != '\n');
end_oct = ftell(fseq); /* where finish the ante-last line */
size = (end_oct-begin_oct)/(count+1);
size*=count;
count = 0;
while((temp=fgetc(fseq)) != EOF && temp != '\n' )
count++; /* get the last line */
size+=count;
fseek(fseq,0,SEEK_SET); /* go back to the begin of the file */
return size;
}
static int32_t get_curfst(char *sequence, FILE *fseq, long size_seq)
{
int32_t lseq;
int letter;
lseq = 0;
while(fgetc(fseq) != '\n'); /* lecture de la premiere ligne */
while ((letter=fgetc(fseq)) != EOF || letter== '>')
if (letter !='\n')
sequence[lseq++]=letter;
if(letter == '>')
fseek(fseq, -1, SEEK_CUR);
if(lseq != size_seq){
fprintf(stderr, "\n!! Your input fasta file seems not correct !!\n");
fprintf(stderr, "\t * the first line must start with '>'.\n");
fprintf(stderr, "\t * all sequence lines (except the last one) must have the same size.\n");
exit(1);
}
sequence[lseq]=0;
return lseq;
}
static int32_t numseq(FILE *ffile)
{
int32_t number=0;
int tmp;
if(fgetc(ffile) != '>')
fprintf(stderr,"Check your fasta sequences\n"), exit(4);
rewind(ffile); /* go back at the begining */
while( ( tmp=fgetc(ffile) ) != EOF )
if(tmp == '>'){
number++;
while(fgetc(ffile) != '\n');
}
rewind(ffile); /*return at the file's begining*/
return number;
}
char *file2mem(char *file, int32_t *size) /*le pointeur size permet de recup la size */
{
FILE *fseq;
char *seq;
fseq = fopen(file,"r");
if(fseq == NULL)
fprintf(stderr,"Flux error, check the fasta seq file\n"),exit(3);
if( numseq(fseq) == 1)
*size = size_oneseq(fseq); /* return length of seq */
else
fprintf(stderr,"there is not exactly ONE sequence in that file\n"),exit(2);
seq = (char *)MyMalloc((*size+1)*sizeof(char),"get_chr: Malloc error");
get_curfst(seq, fseq, *size);
fclose(fseq);
return seq;
}
char *fuse2files_mem(char *file1, char *file2, int32_t *size1, int32_t *size2)
{
FILE *fseq1; /* pointer to the first file */
FILE *fseq2; /* pointer to the second one */
char *seq; /* the sequence seq1_X_seq2 */
int32_t size; /* the total size = size1+size2+1 */
fseq1 = fopen(file1,"r");
fseq2 = fopen(file2,"r");
if(!fseq1 || !file2)fprintf(stderr,"twofiles2mem: Flux error, check the fasta seq files\n"),exit(3);
if( numseq(fseq1) == 1)
*size1 = size_oneseq(fseq1); /* return length of seq1 */
else
fprintf(stderr,"twofiles2mem: there is not only 1 sequence in file %s\n", file1),exit(2);
if( numseq(fseq2) == 1)
*size2 = size_oneseq(fseq2); /* return length of seq2 */
else
fprintf(stderr,"twofiles2mem: there is not only 1 sequence in file %s\n", file2),exit(2);
size = (*size1) + 1 + (*size2);
seq = (char *)MyMalloc((size+1)*sizeof(char),"twofiles2mem: malloc error");
if (get_curfst(seq, fseq1, *size1) != *size1)
fprintf(stderr,"twofiles2mem: sequence of file %s is not clean\n",file1),exit(4);
seq[ *size1 ]='X';
if (get_curfst(seq + (*size1)+1, fseq2, *size2) != *size2)
fprintf(stderr,"twofiles2mem: sequence of file %s is not clean\n",file2),exit(4);
fclose(fseq1);
fclose(fseq2);
return seq;
}
char *readFastaSeq(char *file, int32_t *size, int32_t *nX)
{
char *sequence;
/*fprintf(stderr,"get sequence... %s ",file);*/
sequence = file2mem(file, size);
(void) UpperSequence(sequence); /* turn into UPPER sequence */
nonACGTXtoN(sequence); /* set all non ACGTX into N */
(void )CheckSeq(sequence, "ACGTX"); /* check the number of bad symbols */
*nX=countX(sequence);
return sequence;
}

View File

@ -0,0 +1,40 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/**
* @file KMRK_readfst.h
* @author Eric Coissac <coissac@inrialpes.fr>
* @date Tue Feb 24 22:11:07 2004
*
* @brief Header file for fasta file reading
*
*
*/
#ifndef KMRK_readfst_h
#define KMRK_readfst_h
char *file2mem(char *file, int32_t *size);
char *readFastaSeq(char *file, int32_t *size, int32_t *nX);
char *fuse2files_mem(char *file1, char *file2, int32_t *size1, int32_t *size2);
#endif /* KMRK_readfst_h */

View File

@ -0,0 +1,226 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/**
* @file repseek_types.h
* @author Guillaume Achaz <gachaz@oeb.harvard.edu>
* @date April 2004
* @modif July 2004 turn scores into doubles
* @brief definition of general types and macros for repseek
*
*
*/
#ifndef _REPSEEK_TYPES_
#define _REPSEEK_TYPES_
/*
Version of the program
*/
#define REPSEEK_VERSION "6.6"
#define REPSEEK_DATE "Sept, 09, 2014"
/********** **********
General Macros
********** **********/
/*
Macros to compare 2 or three values
*/
#define MAX2( A, B ) ( ((A)>(B))?(A):(B) )
#define MAX3( A, B, C ) ( ((MAX2(A,B))>(C))?(MAX2(A,B)):(C) )
#define MIN2( A, B ) ( ((A)<(B))?(A):(B) )
#define MAX( A, B ) MAX2( A, B )
#define MIN( A, B ) MIN2( A, B )
/*
Absolute values
*/
#define ABS(x) (( (x)>=0 )? (x) : -(x))
/********** **********
All types used in repseek
********** **********/
#include <stdio.h> /* The type FILE * is defined here */
#ifdef OSF
typedef signed char int8_t;
typedef short int16_t;
typedef long int32_t;
#else
#include <stdint.h> /* all, the int??_t are defined in there for typical unix */
#endif
/**
* Store informations about one STRICT repeat (seeds)
*
*/
typedef struct { /* the complete seed structure */
int32_t pos1; /**< position of the first copy */
int32_t pos2; /**< position of the second copy */
int32_t length; /**< length of the strict repeats */
float rmean; /**< mean repeat leavel */
} Seed_type;
typedef struct { /* Just after a KMRK length X, only the 2 pos matter */
int32_t pos1; /**< postion of the first copy */
int32_t pos2; /**< postion of the second copy */
} SmallSeed_type;
/**
* Store informations about all strict repeat (seeds)
*
*/
typedef struct {
int32_t cDirSeeds; /**< currently allocated space in dirSeeds array */
int32_t nDirSeeds; /**< count of direct strict repeats */
int32_t nFilteredDirSeeds; /**< ??? */
Seed_type* dirSeeds; /**< array of direct repeats */
int32_t cInvSeeds; /**< currently allocated space in invSeeds array */
int32_t nInvSeeds; /**< count of inverted strict repeats */
int32_t nFilteredInvSeeds; /**< ??? */
Seed_type* invSeeds; /**< array of inverted repeats */
} AllSeeds_type;
/**
* Store informations about one GENERIC repeat
*
*/
typedef struct{
char type[20]; /* its name; i.e. Tandem, Palindrome, etc... */
int32_t pos1, pos2, /* both copies postions */
len1, len2, /* both copies length */
seed_pos1,seed_pos2, /* pos1 and pos2 of the originate seed */
seed_len, /* len of the seed */
match, align_len; /* number of match and length of alignment */
double score; /* the alignment score */
float seed_meanR; /* the seed meanR */
float meanR; /* The mean R-level of the repeat */
int32_t mainR; /* its Mode R */
float fraction_mainR; /* the fraction of length containing the Mode R */
} Rep;
/**
* Store informations about All GENERIC repeats
*
*/
typedef struct {
int32_t nDirRep; /* Total Number of Direct Repats in Mem */
int32_t nDirBadRep; /* Direct repeats set to -1 -- filtered out and co. */
Rep *DirRep; /* The array of Dir Rep */
int32_t nInvRep; /* Total Number of Inverted Repats in Mem */
int32_t nInvBadRep; /* Inverted Repeats set to -1 -- filtered out and co. */
Rep *InvRep; /* The array of Inverted Rep */
} Repeats;
#define MATRIX_SIZE 26
typedef struct { /******* The scoring Matrix ************/
double matrix[MATRIX_SIZE][MATRIX_SIZE]; /* the matrix of match/missmatch */
double gap_open; /* value of gap-open */
double gap_ext; /* value of gap_ext */
double expect;
} SCORING;
typedef struct { /******* The Results of Alignement by Dynamik programming ************/
double *scores; /* the score strings (+/- 'matrix') */
double *pscore; /* pointer to the current score */
double *F; /* *F is a standard to describe the best score from top */
int32_t *traces; /* the path matrix - could take values 0 for diagonal, >0 for top and <0 for left */
int32_t *Top; /* *Top is the number of steps from TOP -- needed for memorizing deletion in seq2 */
int32_t Left;
double *pBestScore; /* pointer to it */
int32_t BestScore_row; /* its row and col */
int32_t BestScore_col;
char *traceback; /* all you need for bactracking */
char *traceback_f; /* memory for forward traceback and then check other seeds */
char *traceback_b; /* memory needed for backward traceback - to avoid erasing the forward one */
int32_t alignment_len; /* guess ?? */
int32_t matches; /* number of match (score>0 in scoring matrix) */
int32_t nSegment; /* number of segment */
int32_t *Segment_begin; /* begin and end of each segment */
int32_t *Segment_end;
int32_t max_scores; /* size of the matrices ; only for memory purposes */
int32_t max_col;
int32_t max_row;
int32_t max_alignlen;
} RESULTS;
#endif /* _REPSEEK_TYPES_ */

View File

@ -0,0 +1,141 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/**
* @file sequence.c
* @author amikezor
* @date 20 Dec 2002
* @modif Oct 2003, April 2004, may 2008 - count Xs in the sequence
*
* @brief sequence manipulation
*
*
*/
#include <stdio.h>
#include <string.h>
#include "repseek_types.h"
/* -------------------------------------------- */
/* check sequence symbols */
/* -------------------------------------------- */
int8_t CheckSeq(char *seq, char *alpha)
{
int32_t ntot, nbad;
for ( ntot = nbad = 0; *seq ; seq++, ntot++)
if (! strchr(alpha, *seq))
nbad++;
if (nbad){
fprintf(stderr, "*WARNING* %d non-%s symbols on a total of %d ... ", nbad, alpha, ntot);
return 0;
}
return 1;
}
/* -------------------------------------------- */
/* change all non A,C,G,T,X char into N */
/* -------------------------------------------- */
void nonACGTXtoN(char *seq)
{
char *pseq;
for( pseq=seq; *pseq; pseq++ )
if (! strchr("ACGTX", *pseq) )
*pseq='N';
return;
}
/* -------------------------------------------- */
/* count all X in a sequence */
/* -------------------------------------------- */
int32_t countX(char *seq)
{
char *pseq;
int32_t nX=0;
for( pseq=seq; *pseq; pseq++ )
if ( *pseq == 'X' )
nX++;
return nX;
}
/* -------------------------------------------- */
/* uppercase sequence */
/* -------------------------------------------- */
#define IS_LOWER(c) (((c) >= 'a') && ((c) <= 'z'))
#define TO_UPPER(c) ((c) - 'a' + 'A')
void UpperSequence(char *seq){
char *cseq;
for (cseq = seq ; *cseq ; cseq++)
if (IS_LOWER(*cseq))
*cseq = TO_UPPER(*cseq);
}
#undef IS_LOWER
#undef TO_UPPER
void invseq(char *seqsrc, char *seqdest)
{
char *pseqsrc,
*pseqinv;
pseqsrc=seqsrc;
pseqinv=seqdest+strlen(seqsrc)-1;
while(*pseqsrc){
if(*pseqsrc == 'A' || *pseqsrc == 'a'){*pseqinv = 'T';}
else if(*pseqsrc == 'C' || *pseqsrc == 'c'){*pseqinv = 'G';}
else if(*pseqsrc == 'G' || *pseqsrc == 'g'){*pseqinv = 'C';}
else if(*pseqsrc == 'T' || *pseqsrc == 't'){*pseqinv = 'A';}
else if(*pseqsrc == 'X' || *pseqsrc == 'x'){*pseqinv = 'X';}
else{*pseqinv = 'N';}
pseqinv--;
pseqsrc++;
}
seqdest[strlen(seqsrc)]=0;
}

View File

@ -0,0 +1,45 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/**
* @file KMRK_sequence.h
* @author Eric Coissac <coissac@inrialpes.fr>
* @date Tue Feb 24 22:22:57 2004
*
* @brief Header file for sequence utilities
*
*
*/
#ifndef KMRK_sequence_h
#define KMRK_sequence_h
#include "repseek_types.h"
int8_t CheckSeq(char *seq, char *alpha);
void nonACGTXtoN(char *seq);
void UpperSequence(char *seq);
void invseq(char *seqsrc, char *seqdest);
int32_t countX(char *seq);
#endif /* KMRK_sequence_h */

View File

@ -0,0 +1,223 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/**
* @file smin.c
* @author Guillaume Achaz <gachaz@gmail.com>
* @date Jan 2006
* @modif Mar 06 - a bug while plugin the regressions numbers-
* @modif May 08 - do not count Xs from the sequence
*
* @brief from a sequence and a pvalue, get an Smin
*
*
*/
#include "repseek_types.h"
#include <string.h>
#include <math.h>
static float get_dGC(char * sequence, int32_t len )
{
int32_t letters[5] = {0,};
int32_t i=0;
for(i=0;i<len; i++){
if (sequence[i] == 'A')
letters[0]++;
else if (sequence[i] == 'C')
letters[1]++;
else if (sequence[i] == 'G')
letters[2]++;
else if (sequence[i] == 'T')
letters[3]++;
else
letters[4]++;
}
return ABS( 50.0 - 100.0*(float)( letters[1] + letters[2] ) / (float)( len - letters[4] ) );
}
static float get_dGC_2seq(char * sequence1, int32_t len1, char *sequence2, int32_t len2 )
{
int32_t letters[5] = {0,};
int32_t i=0;
for(i=0;i<len1; i++){
if (sequence1[i] == 'A')
letters[0]++;
else if (sequence1[i] == 'C')
letters[1]++;
else if (sequence1[i] == 'G')
letters[2]++;
else if (sequence1[i] == 'T')
letters[3]++;
else
letters[4]++;
}
for(i=0;i<len2; i++){
if (sequence2[i] == 'A')
letters[0]++;
else if (sequence2[i] == 'C')
letters[1]++;
else if (sequence2[i] == 'G')
letters[2]++;
else if (sequence2[i] == 'T')
letters[3]++;
else
letters[4]++;
}
return ABS( 50.0 - 100.0*(float)( letters[1] + letters[2] ) / (float)( len1+len2 - letters[4] ) );
}
static double get_log_gamma(float dGC, int32_t len){
double log_gamma=0.0;
log_gamma = - 0.0029688*dGC*dGC + 0.0727745*dGC - 0.0309583*log(len)*log(len) ;
return log_gamma;
}
static double get_log_p(float dGC, int32_t len){
double log_p=0.0;
log_p = 0.0001789 *dGC*dGC - 0.002409*dGC + 0.006838*log(len) - 1.060;
return log_p;
}
/*
Using the formula of Watrerman & Vingron
log(-log(F)) = log(gamma*m*n) + S*log(p)
for unique sequence it becomes
log(-log(F)) = log(gamma*n*(n-1)/2) + S*log(p)
*/
static double get_Smin(float dGC, int32_t len, double pvalue){
double Smin=0.0;
double log_log_minusF = 0.0,
log_gamma = 0.0,
log_p = 0.0;
log_log_minusF = log( -log( 1.00 - pvalue ) );
log_gamma = get_log_gamma( dGC, len );
log_p = get_log_p( dGC, len );
Smin = log_log_minusF - log_gamma - log( (double)len * (len-1.0) / 2.0 );
Smin /= log_p;
return Smin;
}
static double get_Smin_2seq(float dGC, int32_t len1, int32_t len2, double pvalue){
double Smin=0.0;
double log_log_minusF = 0.0,
log_gamma = 0.0,
log_p = 0.0;
log_log_minusF = log( -log( 1.00 - pvalue ) );
log_gamma = get_log_gamma( dGC, (len1+len2)/2 ); /* average dGC and average len */
log_p = get_log_p( dGC, (len1+len2)/2 );
Smin = log_log_minusF - log_gamma - log( (double)len1 * len2 ); /* when two sequences are considered,
the whole matrix [l1 l2] can potentially have repeats
*/
Smin /= log_p;
return Smin;
}
double compute_smin(char *sequence, double pvalue, int32_t seq_len, int32_t nX){
double Smin=0.0;
float dGC=0.0;
dGC = get_dGC(sequence, seq_len);
Smin = get_Smin( dGC, seq_len-nX, pvalue);
return Smin;
}
double compute_smin_2seq(char *sequence1, char *sequence2, int32_t size1, int32_t size2, double pvalue, int32_t nX, int32_t nX2){
double Smin=0.0;
float dGC=0.0;
dGC = get_dGC_2seq(sequence1, size1, sequence2, size2 );
Smin = get_Smin_2seq( dGC, size1-nX, size2-nX2, pvalue);
return Smin;
}
void UpdateRepeats(Repeats *AllRepeats, double Score_min){
int32_t i;
if( AllRepeats->nDirRep )
for(i=0;i<AllRepeats->nDirRep;i++)
if(AllRepeats->DirRep[i].pos1 != -1 && AllRepeats->DirRep[i].score<Score_min){
AllRepeats->DirRep[i].pos1 = -1;
AllRepeats->DirRep[i].pos2 = -1;
AllRepeats->DirRep[i].len1 = -1;
AllRepeats->DirRep[i].len2 = -1;
AllRepeats->DirRep[i].score = -1;
AllRepeats->nDirBadRep++;
}
if( AllRepeats->nInvRep )
for(i=0;i<AllRepeats->nInvRep;i++)
if(AllRepeats->InvRep[i].pos1 != -1 && AllRepeats->InvRep[i].score<Score_min){
AllRepeats->InvRep[i].pos1 = -1;
AllRepeats->InvRep[i].pos2 = -1;
AllRepeats->InvRep[i].len1 = -1;
AllRepeats->InvRep[i].len2 = -1;
AllRepeats->InvRep[i].score = -1;
AllRepeats->nInvBadRep++;
}
}

View File

@ -0,0 +1,57 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/**
* @file smin.h
* @author Guillaume Achaz <gachaz@oeb.harvard.edu>
* @date April 2004
*
* @brief header for all shuffling/smin process
*
*
*/
#ifndef _smin_h_
#define _smin_h_
/*
From a sequence, get length and dGC
then compute the Smin using Waterman & Vingron
*/
double compute_smin(char *sequence, double pvalue, int32_t size, int32_t nX);
/*
From two sequences, get average length and average dGC
then compute the Smin using Waterman & Vingron
*/
double compute_smin_2seq(char *sequence1, char *sequence2, int32_t size1, int32_t size2, double pvalue, int32_t nX, int32_t nX2);
/*
tag all repeats with a score < smin
set their positions and length at -1
*/
void UpdateRepeats(Repeats *AllRepeats, double Score_min);
#endif /* _smin_h_*/

View File

@ -0,0 +1,491 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/**
* @file sort.c
* @author Guillaume Achaz <gachaz@oeb.harvard.edu>
* @date oct 03 2002
* @modif dec 13 2002 - add the sort_repeats() and qsort_1then2_repeats()
Oct 2003
April 2003 - Int32->int32_t
July 2004, change Qsort into double_Qsort
*
* @brief sort two arrays by values of the first one (int or Rep) - Implemented from Numrec 92 pp.333/336
*
*
*/
#include <stdio.h>
#include <stdlib.h>
#include "repseek_types.h"
#include "memory.h"
/***
sort two array by the order of the first one
Qsort is implemented following the algorithm explained in Numrec C, pp 332-335
Under a certain size, a part of the array is sorted by the naive method (straight insertion)
***/
#define NSTACK 1000 /* the number of subarray that could be memorized */
#define SWITCH_METHOD 7 /* under that size switch to a Insertion method */
#define SWAP(a,b) {tmp=(a); (a)=(b); (b)=tmp;}
static void qsort2(int32_t *array1, int32_t *array2, int32_t n){
int32_t *Stack; /* where remaining values of subarrays are temporarily stocked */
int32_t nStack=0; /* How many of these values are in the Stack. Is never odd */
int32_t end=n-1, /* the last value of the array to sort */
beg=0, /* the first value --- */
postbeg; /* the second value --- */
int32_t val_postbeg; /* the value of the postbeg position - the one which is used for partion-exchange */
int32_t demi_len; /* half of the distance between beg and end */
int32_t i, /* counter from postbeg to right */
j; /* counter from end to left */
int32_t val1_i, /* for insertion stock temporarily a value */
val2_i;
int32_t tmp; /* used for the SWAP macro */
Stack = (int32_t *)MyMalloc(NSTACK*sizeof(int32_t) , "qsort2: not enough memory for Stack") ;
while(1){
if( end-beg+1 > SWITCH_METHOD){
demi_len = (end-beg) >> 1 ;
postbeg = beg+1;
SWAP( array1[beg+demi_len], array1[postbeg] );
SWAP( array2[beg+demi_len], array2[postbeg] );
if(array1[beg] > array1[postbeg]){ /* rearrange to have beg <= postbeg <= end */
SWAP( array1[beg], array1[postbeg] );
SWAP( array2[beg], array2[postbeg] );
}
if(array1[beg] > array1[end]){
SWAP( array1[beg], array1[end] );
SWAP( array2[beg], array2[end] );
}
if(array1[postbeg] > array1[end]){
SWAP( array1[postbeg], array1[end] );
SWAP( array2[postbeg], array2[end] );
}
i = postbeg;
j = end;
val_postbeg = array1[postbeg];
while(1) /* enter the partition exchange process */
{
do i++; while( array1[i] < val_postbeg );
do j--; while( array1[j] > val_postbeg );
if(j<i) break;
SWAP( array1[i], array1[j] );
SWAP( array2[i], array2[j] );
}
SWAP( array1[postbeg] , array1[j] ); /* place the postbeg value into j */
SWAP( array2[postbeg] , array2[j] );
if(nStack+2 > NSTACK)
fprintf(stderr, "qsort2: not enough Stack... sorry bye\n"),exit(1);
if(end-i+1 >= j-beg){
Stack[nStack++] = i; /* stock in Stack the largest and go with the smallest */
Stack[nStack++] = end;
end = j-1;
}
else{
Stack[nStack++] = beg;
Stack[nStack++] = j-1;
beg = i;
}
}
else{ /* Under a certain size, switch to the straight insertion method */
for(i=beg+1; i <= end ; i++){
val1_i = array1[i];
val2_i = array2[i];
for(j=i-1;j>=beg;j--){
if(array1[j] < val1_i)break;
array1[j+1] = array1[j];
array2[j+1] = array2[j];
}
array1[j+1]=val1_i;
array2[j+1]=val2_i;
}
if(nStack==0)break; /* this si the end - exit the process */
end = Stack[--nStack]; /* go for the next round with the stacked parameters */
beg = Stack[--nStack];
}
}
MyFree(Stack, NSTACK*sizeof(int32_t));
}
/*
A simple qsort as described in numrec
*/
void double_Qsort(double *array1, int32_t n){
double *Stack; /* where remaining values of subarrays are temporarily stocked */
int32_t nStack=0; /* How many of these values are in the Stack. Is never odd */
int32_t end=n-1, /* the last value of the array to sort */
beg=0, /* the first value --- */
postbeg; /* the second value --- */
double val_postbeg; /* the value of the postbeg position - the one which is used for partion-exchange */
int32_t demi_len; /* half of the distance between beg and end */
int32_t i, /* counter from postbeg to right */
j; /* counter from end to left */
double val1_i; /* for insertion stock temporarily a value */
double tmp; /* used for the SWAP macro */
Stack= (double *)MyMalloc(NSTACK*sizeof(double) , "Qsort: not enough memory for Stack" );
while(1){
if( end-beg+1 > SWITCH_METHOD){
demi_len = (end-beg) >> 1 ;
postbeg = beg+1;
SWAP( array1[beg+demi_len], array1[postbeg] );
if(array1[beg] > array1[postbeg]){ /* rearrange to have beg <= postbeg <= end */
SWAP( array1[beg], array1[postbeg] );
}
if(array1[beg] > array1[end]){
SWAP( array1[beg], array1[end] );
}
if(array1[postbeg] > array1[end]){
SWAP( array1[postbeg], array1[end] );
}
i = postbeg;
j = end;
val_postbeg = array1[postbeg];
while(1) /* enter the partition exchange process */
{
do i++; while( array1[i] < val_postbeg );
do j--; while( array1[j] > val_postbeg );
if(j<i) break;
SWAP( array1[i], array1[j] );
}
SWAP( array1[postbeg] , array1[j] ); /* place the postbeg value into j */
if(nStack+2 > NSTACK)
fprintf(stderr, "Qsort: not enough Stack... sorry bye\n"),exit(1);
if(end-i+1 >= j-beg){
Stack[nStack++] = i; /* stock in Stack the largest and go with the smallest */
Stack[nStack++] = end;
end = j-1;
}
else{
Stack[nStack++] = beg;
Stack[nStack++] = j-1;
beg = i;
}
}
else{ /* Under a certain size, switch to the straight insertion method */
for(i=beg+1; i <= end ; i++){
val1_i = array1[i];
for(j=i-1;j>=beg;j--){
if(array1[j] < val1_i)break;
array1[j+1] = array1[j];
}
array1[j+1]=val1_i;
}
if(nStack==0)break; /* this si the end - exit the process */
end = Stack[--nStack]; /* go for the next round with the stacked parameters */
beg = Stack[--nStack];
}
}
MyFree(Stack, NSTACK*sizeof(double));
}
#undef NSTACK
#undef SWITCH_METHOD
#undef SWAP
/*
Sort all repeat by an array
Just modify from qsort2
*/
#define SWAP(a,b) { tmprep=(a); (a)=(b); (b)=tmprep; }
#define NSTACK 1000 /* the number of subarray that could be memorized */
#define SWITCH_METHOD 7 /* under that size switch to a Insertion method */
static int32_t GetPos1( Rep repet ){
return repet.pos1;
}
static int32_t GetPos2( Rep repet ){
return repet.pos2;
}
static void qsort_repeats(Rep *AllReps, int32_t n, int32_t pos ){
int32_t *Stack; /* where remaining values of subarrays are temporarily stocked */
int32_t nStack=0; /* How many of these values are in the Stack. Is never odd */
int32_t end=n-1, /* the last value of the array to sort */
beg=0, /* the first value --- */
postbeg; /* the second value --- */
int32_t val_postbeg; /* the value of the postbeg position - the one which is used for partion-exchange */
int32_t demi_len; /* half of the distance between beg and end */
int32_t i, /* counter from postbeg to right */
j; /* counter from end to left */
Rep val_i; /* for insertion stock temporarily a value */
Rep tmprep; /* used for the SWAP macro */
int32_t (* PtrFuncPos)( Rep repet );
if( pos == 1)
PtrFuncPos = GetPos1;
else if( pos == 2)
PtrFuncPos = GetPos2;
else
fprintf(stderr,"qsort_repet: pos should be 1 or 2\n"),exit(4);
Stack=(int32_t *)MyMalloc(NSTACK*sizeof(int32_t) , "qsort_repet: not enough memory for Stack" );
while(1){
if( end-beg+1 > SWITCH_METHOD){
demi_len = (end-beg) >> 1 ;
postbeg = beg+1;
SWAP( AllReps[beg+demi_len], AllReps[postbeg] );
if( PtrFuncPos(AllReps[beg]) > PtrFuncPos(AllReps[postbeg]) ){ /* rearrange to have beg <= postbeg <= end */
SWAP( AllReps[beg], AllReps[postbeg] );
}
if( PtrFuncPos(AllReps[beg]) > PtrFuncPos(AllReps[end]) ){
SWAP( AllReps[beg], AllReps[end] );
}
if( PtrFuncPos(AllReps[postbeg]) > PtrFuncPos(AllReps[end]) ){
SWAP( AllReps[postbeg], AllReps[end] );
}
i = postbeg;
j = end;
val_postbeg = PtrFuncPos(AllReps[postbeg]);
while(1) /* enter the partition exchange process */
{
do i++; while( PtrFuncPos(AllReps[i]) < val_postbeg );
do j--; while( PtrFuncPos(AllReps[j]) > val_postbeg );
if(j<i) break;
SWAP( AllReps[i], AllReps[j] );
}
SWAP( AllReps[postbeg] , AllReps[j] ); /* place the postbeg value into j */
if(nStack+2 > NSTACK)
fprintf(stderr, "qsort_repet: not enough Stack... sorry bye\n"),exit(1);
if(end-i+1 >= j-beg){
Stack[nStack++] = i; /* stock in Stack the largest and go with the smallest */
Stack[nStack++] = end;
end = j-1;
}
else{
Stack[nStack++] = beg;
Stack[nStack++] = j-1;
beg = i;
}
}
else{ /* Under a certain size, switch to the straight insertion method */
for(i=beg+1; i <= end ; i++){
val_i = AllReps[i];
for(j=i-1;j>=beg;j--){
if( PtrFuncPos(AllReps[j]) <= PtrFuncPos(val_i) )break;
AllReps[j+1] = AllReps[j];
}
AllReps[j+1]=val_i;
}
if(nStack==0)break; /* this si the end - exit the process */
end = Stack[--nStack]; /* go for the next round with the stacked parameters */
beg = Stack[--nStack];
}
}
MyFree(Stack, NSTACK*sizeof(int32_t));
}
#undef NSTACK
#undef SWITCH_METHOD
#undef SWAP
/***
This function sort the first array, and for equal value, use the second one
calling the qsort2() function
***/
void qsort2_1then2(int32_t *array1, int32_t *array2, int32_t n){
int32_t i;
int32_t beg=0, end=n;
char token=0;
qsort2(array1, array2, n);
for(i=1, token=0; i < n-1 ; i++){
if(array1[i] == array1[i-1]){
if(!token) beg = i-1;
token = 1;
}
if(array1[i] != array1[i-1])
if(token){
token=0;
end=i;
qsort2(array2+beg, array1+beg, end-beg);
}
}
if(token){
end=i;
qsort2(array2+beg, array1+beg, end-beg);
}
}
/***
This function sort by pos1 and for equal value, use pos2
calling the qsort2_repeats() function
***/
void qsort_1then2_repeats(Rep *AllReps, int32_t n){
int32_t i;
int32_t beg=0, end=n;
char token=0;
qsort_repeats(AllReps, n, 1 );
for(i=1, token=0; i < n-1 ; i++){
if( AllReps[i].pos1 == AllReps[i-1].pos1 ){
if(!token) beg = i-1;
token = 1;
}
if(AllReps[i].pos1 != AllReps[i-1].pos1)
if(token){
token=0;
end=i;
qsort_repeats(AllReps+beg, end-beg, 2 );
}
}
if(token){
end=i;
qsort_repeats(AllReps+beg, end-beg, 2 );
}
}

View File

@ -0,0 +1,61 @@
/*
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2.1
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
*/
/**
* @file sort.h
* @author amikezor <gachaz@oeb.harvard.edu>
* @date april 2004
*
* @brief header fr sort.c
*
*
*/
#ifndef _SORT_H_
#define _SORT_H_
#include <stdio.h>
#include <stdlib.h>
#include "repseek_types.h"
#include "memory.h"
/*
A simple qsort as described in numrec
*/
void double_Qsort(double *array1, int32_t n);
/***
This function sort the first array, and for equal value, use the second one
calling the qsort2() function
***/
void qsort2_1then2(int32_t *array1, int32_t *array2, int32_t n);
/***
This function sort by pos1 and for equal value, use pos2
calling the qsort2_repeats() function
***/
void qsort_1then2_repeats(Rep *AllReps, int32_t n);
#endif /* _SORT_H_*/