First commit - second part
Former-commit-id: 202296404e6a70f8ae96db99faffb456104c57e9 Former-commit-id: 118417735d2055683607df9809c9b721cc1b1bab
This commit is contained in:
BIN
src/repseek/.DS_Store
vendored
Normal file
BIN
src/repseek/.DS_Store
vendored
Normal file
Binary file not shown.
1096
src/repseek/repseek-2014.09/KMRK.c
Normal file
1096
src/repseek/repseek-2014.09/KMRK.c
Normal file
File diff suppressed because it is too large
Load Diff
319
src/repseek/repseek-2014.09/KMRK.h
Normal file
319
src/repseek/repseek-2014.09/KMRK.h
Normal file
@ -0,0 +1,319 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
#ifndef KMRK_h
|
||||
#define KMRK_h
|
||||
|
||||
|
||||
/********************************************
|
||||
********************************************
|
||||
**
|
||||
** Declaration of struct
|
||||
**
|
||||
********************************************
|
||||
********************************************/
|
||||
|
||||
#include "repseek_types.h"
|
||||
#include "KMRK_mask.h"
|
||||
|
||||
/**
|
||||
* Structure used to manipulate simultanously
|
||||
* the v and n vector
|
||||
*
|
||||
*/
|
||||
|
||||
typedef struct {
|
||||
|
||||
int32_t size; /**< size of the current sequence to search for */
|
||||
int32_t vectorsize; /**< size of vectors */
|
||||
|
||||
int32_t seqCount; /**< count of concatenated sequences */
|
||||
int32_t complement; /**< if seqCount > 1 and complement !=0
|
||||
* then second sequence is the inverted complement strand of first one */
|
||||
int32_t symbmax;
|
||||
int32_t* v; /**< sequence vector */
|
||||
int32_t* n; /**< linked list vector the special vector of KRM-K */
|
||||
int32_t limits[1]; /**< array of end limits of concatenated sequences in v (array size is seqCount) */
|
||||
|
||||
} vn_type;
|
||||
|
||||
|
||||
|
||||
/********************************************
|
||||
********************************************
|
||||
**
|
||||
** Declaration of public macro
|
||||
**
|
||||
********************************************
|
||||
********************************************/
|
||||
|
||||
#define GETVECTOR(x,v) (((x)->v) - 1)
|
||||
|
||||
#define IS_MARKED(x,i) ((x)[i] < 0)
|
||||
#define MARK(x,i) ((x)[i]) = -ABS((x)[i])
|
||||
#define UNMARK(x,i) ((x)[i]) = ABS((x)[i])
|
||||
|
||||
#define SET(x,i,v) ((x)[i]) = (v)
|
||||
#define SETMARKED(x,i,v) ((x)[i]) = -(v)
|
||||
#define GET(x,i) ABS((x)[i])
|
||||
#define SYMBOLE(x,i) ((IS_MARKED((x),(i))) ? (i): (GET(x,i)))
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Macro used to declare a pointer to a quorum function.
|
||||
*
|
||||
* @param name name of the pointer
|
||||
*
|
||||
*/
|
||||
|
||||
#define KMRK_QUORUM_FUNC_PTR(name) int32_t (*name)( vn_type* x, int32_t pos, int32_t count, int32_t countSeq )
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Macro used to declare a pointer to an initialisation function.
|
||||
*
|
||||
* @param name name of the pointer
|
||||
* @param quorum name used for the quorum assiciated function
|
||||
*
|
||||
* @see KMRK_QUORUM_FUNC_PTR
|
||||
*
|
||||
*/
|
||||
|
||||
#define KMRK_INIT_FUNC_PTR(name,quorum) vn_type* (*name)(char *sequence, \
|
||||
int32_t complement, \
|
||||
int32_t count, \
|
||||
int32_t countSeq, \
|
||||
int32_t *k, \
|
||||
KMRK_QUORUM_FUNC_PTR(quorum),\
|
||||
masked_area_table_t *mask)
|
||||
|
||||
|
||||
/********************************************
|
||||
********************************************
|
||||
**
|
||||
** Declaration of public functions
|
||||
**
|
||||
********************************************
|
||||
********************************************/
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Initialise a vn_type record from one sequence to run KMRK algorithm
|
||||
*
|
||||
* @param sequence pointer to a C string containing the sequence
|
||||
* @param complement != 0 means that seq one and two are the two strands of
|
||||
* the same sequence.
|
||||
* @param count parameter count passed to the quorun function
|
||||
* @param countSeq parametter countSeq passed to the quorun function
|
||||
* @param k length of the word represented by each symbole of v.
|
||||
* k is an output parametter
|
||||
* @param quorum pointer to a quorum function
|
||||
*
|
||||
* @return a pointer to vn_type structure correctly initialized
|
||||
* to be used by KMRK_RunKMRK
|
||||
*
|
||||
* @see KMRK_HashOneSequence
|
||||
*/
|
||||
|
||||
vn_type* KMRK_InitOneSequence(char *sequence,
|
||||
int32_t complement,
|
||||
int32_t count,
|
||||
int32_t countSeq,
|
||||
int32_t *k,
|
||||
KMRK_QUORUM_FUNC_PTR(quorum),
|
||||
masked_area_table_t *mask);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Initialise a vn_type record from one sequence to run KMRK algorithme.
|
||||
* In more than KMRK_InitOneSequence, KMRK_HashOneSequence construct
|
||||
* word of len k with an hash algorithm. k used is a function of
|
||||
* sequence size and alphabet size. If calculed k is superior to lmax
|
||||
* then k = lmax.
|
||||
*
|
||||
* @param sequence pointer to a C string containing the sequence
|
||||
* @param complement != 0 means that seq one and two are the two strands of
|
||||
* the same sequence.
|
||||
* @param count parametter count passed to the quorun function
|
||||
* @param countSeq parametter countSeq passed to the quorun function
|
||||
* @param k maximum length of the created word (input)
|
||||
* length of the word represented by each symbole
|
||||
* of v (output).
|
||||
* @param quorum pointer to a quorum function
|
||||
*
|
||||
* @return a pointer to vn_type structure correctly initialized
|
||||
* to be used by KMRK_RunKMRK
|
||||
*
|
||||
* @see KMRK_InitOneSequence
|
||||
*/
|
||||
|
||||
vn_type* KMRK_HashOneSequence(char *sequence,
|
||||
int32_t complement,
|
||||
int32_t count,
|
||||
int32_t countSeq,
|
||||
int32_t *k,
|
||||
KMRK_QUORUM_FUNC_PTR(quorum),
|
||||
masked_area_table_t *mask);
|
||||
|
||||
|
||||
/**
|
||||
* An example of quorum function testing than a factor is
|
||||
* present a least two times. Because of definition of this
|
||||
* quorum function count and countSeq parametter have no meanning
|
||||
* in this instance of quorum function
|
||||
*
|
||||
* @param x a pointer to vn_type structure to check
|
||||
* @param pos position in n of the beginning of the linked list to test
|
||||
* @param count minimal number of occurence of factor
|
||||
* @param countSeq minimal number of sequences concerned
|
||||
*
|
||||
* @return 1 if quorum is ok 0 otherwise.
|
||||
*/
|
||||
|
||||
int32_t KMRK_CoupleQuorum(vn_type* x,
|
||||
int32_t pos,
|
||||
int32_t count,
|
||||
int32_t countSeq);
|
||||
|
||||
|
||||
/**
|
||||
* An example of quorum function testing than a factor is
|
||||
* present a least two times in the direct strand of a sequence or
|
||||
* at least one time in the direct strand and one time in the reverse
|
||||
* strand. Because of definition of this
|
||||
* quorum function count and countSeq parametter have no meanning
|
||||
* in this instance of quorum function
|
||||
*
|
||||
* @param x a pointer to vn_type structure to check
|
||||
* @param pos position in n of the beginning of the linked list to test
|
||||
* @param count minimal number of occurence of factor
|
||||
* @param countSeq minimal number of sequences concerned
|
||||
*
|
||||
* @return 1 if quorum is ok 0 otherwise.
|
||||
*/
|
||||
|
||||
int32_t KMRK_DirInvQuorum(vn_type* x,
|
||||
int32_t pos,
|
||||
int32_t count,
|
||||
int32_t countSeq);
|
||||
|
||||
/**
|
||||
* An example of quorum function testing than a factor is
|
||||
* present a least one time in the direct strand and one time in the reverse
|
||||
* strand. Because of definition of this
|
||||
* quorum function count and countSeq parametter have no meanning
|
||||
* in this instance of quorum function
|
||||
*
|
||||
* @param x a pointer to vn_type structure to check
|
||||
* @param pos position in n of the beginning of the linked list to test
|
||||
* @param count minimal number of occurence of factor
|
||||
* @param countSeq minimal number of sequences concerned
|
||||
*
|
||||
* @return 1 if quorum is ok 0 otherwise.
|
||||
*/
|
||||
|
||||
int32_t KMRK_InvQuorum(vn_type* x,
|
||||
int32_t pos,
|
||||
int32_t count,
|
||||
int32_t countSeq);
|
||||
|
||||
int32_t KMRK_Int12Quorum(vn_type* x,
|
||||
int32_t pos,
|
||||
int32_t count,
|
||||
int32_t countSeq);
|
||||
|
||||
int32_t KMRK_IntInv12Quorum(vn_type* x,
|
||||
int32_t pos,
|
||||
int32_t count,
|
||||
int32_t countSeq);
|
||||
|
||||
int32_t KMRK_IntDirInv12Quorum(vn_type* x,
|
||||
int32_t pos,
|
||||
int32_t count,
|
||||
int32_t countSeq);
|
||||
/**
|
||||
* realize one cycle of KMR.
|
||||
*
|
||||
* @param x a pointer to vn_type created by an intialisation
|
||||
* function or returned by this function.
|
||||
* @param k step used to join two words
|
||||
* @param count parametter count passed to the quorun function
|
||||
* @param countSeq parametter countSeq passed to the quorun function
|
||||
* @param KMRK_QUORUM_FUNC_PTR quorum pointer to a quorum function
|
||||
*/
|
||||
|
||||
void KMRK_RunKMRK(vn_type *x,
|
||||
int32_t k,
|
||||
int32_t count,
|
||||
int32_t countSeq,
|
||||
KMRK_QUORUM_FUNC_PTR(quorum));
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* realises serveral run of KMR cycle to make from a sequence
|
||||
* a vn_type structure describing sequences of factors of a precise size.
|
||||
*
|
||||
* @param sequence pointer to a C string containing the sequence
|
||||
* @param size word size to construct
|
||||
* @param count parametter count passed to the quorun function
|
||||
* @param countSeq parametter countSeq passed to the quorun function
|
||||
* @param quorum pointer to a quorum function
|
||||
* @param init pointer to a initialisation function
|
||||
*
|
||||
* @return a vn_type pointer to a structure containing sequences of factors
|
||||
*/
|
||||
|
||||
vn_type *KMRK_RunTo(char *sequence,
|
||||
int32_t size,
|
||||
int32_t complement,
|
||||
int32_t count,
|
||||
int32_t countSeq,
|
||||
KMRK_QUORUM_FUNC_PTR(quorum),
|
||||
KMRK_INIT_FUNC_PTR(init,quorum),
|
||||
masked_area_table_t *mask);
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* free memory associated to a vn_type pointer
|
||||
*
|
||||
* @param x a pointer to vn_type structure
|
||||
*/
|
||||
|
||||
void KMRK_FreeVN(vn_type *x);
|
||||
|
||||
|
||||
int32_t KMRK_upperCoupleCount(vn_type *x);
|
||||
int32_t KMRK_upperInvertedCount(vn_type* x,int32_t wordsize);
|
||||
int32_t KMRK_upperInterCount(vn_type* x,int32_t seq1,int32_t seq2,int32_t wordsize);
|
||||
|
||||
void KMRK_markStart(vn_type* x);
|
||||
|
||||
#endif /* KMRK_h */
|
947
src/repseek/repseek-2014.09/KMRK_Seeds.c
Normal file
947
src/repseek/repseek-2014.09/KMRK_Seeds.c
Normal file
@ -0,0 +1,947 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
#include "KMRK_Seeds.h"
|
||||
#include "memory.h"
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "sequence.h"
|
||||
|
||||
|
||||
static void SetMultipleLenInvSeeds(
|
||||
SmallSeed_type* seeds, /* an array of a structure with pos1 and pos2 */
|
||||
int32_t nseeds, /* number of small seeds */
|
||||
int32_t wordSize, /* lmin */
|
||||
int8_t same, /* 1 if one sequence, 0 otherwise */
|
||||
AllSeeds_type *PtrAllSeeds); /* the final seeds with length */
|
||||
|
||||
|
||||
/*
|
||||
Concatenate
|
||||
DirSeq\@InvSeq\0
|
||||
*/
|
||||
static char* makeDirInvSeq(char* seq, int32_t size)
|
||||
{
|
||||
char *SeqInv;
|
||||
|
||||
seq = (char *)MyRealloc( (void *)seq, (size*2+2)*sizeof(char),
|
||||
(size+1)* sizeof(char), "makeDirInvSeq: Cannot allocate space for reverse sequence");
|
||||
|
||||
SeqInv= seq + size + 1;
|
||||
seq[size]=0;
|
||||
invseq(seq, SeqInv);
|
||||
seq[size]='@';
|
||||
SeqInv[size]=0;
|
||||
|
||||
return seq;
|
||||
}
|
||||
|
||||
/*
|
||||
Merge the seq1 with seq2
|
||||
*/
|
||||
static char *merge2seq(char* seq1, char* seq2,
|
||||
int32_t size1, int32_t size2)
|
||||
{
|
||||
char * dest;
|
||||
|
||||
seq1 = (char *)MyRealloc((void *)seq1, (size1+size2+2) *sizeof(char),
|
||||
(size1+1)*sizeof(char), "merg2seq: Cannot allocate space for reverse sequence");
|
||||
|
||||
dest = seq1 + size1 + 1;
|
||||
seq1[size1]='@';
|
||||
memcpy(dest,seq2,size2);
|
||||
dest[size2]=0;
|
||||
|
||||
return seq1;
|
||||
}
|
||||
|
||||
static int32_t dirDelta(SmallSeed_type* seed)
|
||||
{
|
||||
return seed->pos2 - seed->pos1;
|
||||
}
|
||||
|
||||
void KMRK_SetMultipleLenDirSeeds(SmallSeed_type* seeds,
|
||||
int32_t nseeds,
|
||||
int32_t wordSize,
|
||||
AllSeeds_type *PtrAllSeeds)
|
||||
{
|
||||
|
||||
int32_t i,j; /* dummy counters j=kept seeds ; i = current seed */
|
||||
int32_t curLen=wordSize; /* Length of the current seed */
|
||||
int32_t delta;
|
||||
SmallSeed_type *mainSeed;
|
||||
SmallSeed_type *curSeed;
|
||||
int32_t add;
|
||||
|
||||
/* fprintf(stderr,"New Version\n");*/
|
||||
|
||||
KMRK_sortSeeds(seeds,nseeds,KMRK_cmpDeltaSeedsPos);
|
||||
|
||||
for(j=0,mainSeed=seeds ;
|
||||
j < nseeds;
|
||||
j=i, mainSeed=curSeed)
|
||||
{
|
||||
/* foreach seed */
|
||||
delta = dirDelta(mainSeed);
|
||||
curLen=wordSize;
|
||||
|
||||
for (i=j+1,curSeed=mainSeed+1;
|
||||
i < nseeds &&
|
||||
dirDelta(curSeed)==delta &&
|
||||
(curSeed->pos1 - mainSeed->pos1) <= curLen;
|
||||
i++,curSeed++)
|
||||
{
|
||||
add=wordSize - mainSeed->pos1 - curLen + curSeed->pos1;
|
||||
if (add < 0) add = 0;
|
||||
curLen+=add;
|
||||
}
|
||||
|
||||
KMRK_pushSeed(PtrAllSeeds,
|
||||
seeds[j].pos1,seeds[j].pos2,
|
||||
curLen,
|
||||
1);
|
||||
/*
|
||||
if(seeds[j].pos1 == seeds[j].pos2)
|
||||
fprintf(stderr, "KMRK_SetMultipleLenDirSeeds: %d %d %d, bye\n", seeds[j].pos1, seeds[j].pos2, curLen), exit(0);
|
||||
*/
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
static int32_t invDelta(SmallSeed_type* seed)
|
||||
{
|
||||
return seed->pos2 + seed->pos1;
|
||||
}
|
||||
|
||||
static void SetMultipleLenInvSeeds(
|
||||
SmallSeed_type* seeds, /* an array of a structure with pos1 and pos2 */
|
||||
int32_t nseeds, /* number of small seeds */
|
||||
int32_t wordSize, /* lmin */
|
||||
int8_t same, /* 1 if one sequence, 0 otherwise */
|
||||
AllSeeds_type *PtrAllSeeds) /* the final seeds with length */
|
||||
{
|
||||
|
||||
int32_t i,j; /* dummy counters j=kept seeds ; i = current seed */
|
||||
int32_t curLen=wordSize; /* Length of the current seed */
|
||||
int32_t delta; /* the pos1+pos2 - constant if seeds should be merged */
|
||||
int32_t pos2; /* after merge, new pos2 */
|
||||
SmallSeed_type *mainSeed; /* the first one of a series */
|
||||
SmallSeed_type *curSeed; /* the current one */
|
||||
/*int32_t add; what is added to the Lenght for the merged form */
|
||||
|
||||
|
||||
|
||||
KMRK_sortSeeds(seeds,nseeds,KMRK_cmpDeltaInvSeedsPos); /* qsort() seeds by delta, then pos1 */
|
||||
|
||||
|
||||
for(j=0,mainSeed=seeds ; j < nseeds; j=i, mainSeed=curSeed)
|
||||
{
|
||||
|
||||
delta = invDelta(mainSeed); /* pos1+pos2 */
|
||||
curLen=wordSize; /* at start curLen is equal to lmin */
|
||||
|
||||
for (i=j+1, curSeed=mainSeed+1;
|
||||
i < nseeds &&
|
||||
invDelta(curSeed)==delta && /* Constant if they should merge */
|
||||
(curSeed->pos1 - (curSeed-1)->pos1) <= wordSize; /* if they overlap */
|
||||
i++, curSeed++)
|
||||
{;}
|
||||
|
||||
|
||||
curLen += ( (curSeed-1)->pos1 - mainSeed->pos1);
|
||||
|
||||
if ( same && ( seeds[j].pos1+curLen-1 > seeds[i-1].pos2) ){
|
||||
pos2 = seeds[j].pos1;
|
||||
curLen += seeds[i-1].pos2 - seeds[j].pos1;
|
||||
}
|
||||
else
|
||||
pos2 = seeds[i-1].pos2;
|
||||
|
||||
|
||||
/*
|
||||
Old Code - there was a bug for palindrome
|
||||
|
||||
for (i=j+1, curSeed=mainSeed+1;
|
||||
i < nseeds && invDelta(curSeed)==delta && (curSeed->pos1 - mainSeed->pos1) <= curLen;
|
||||
i++, curSeed++)
|
||||
{
|
||||
{
|
||||
add = wordSize-curLen - (mainSeed->pos1 - curSeed->pos1);
|
||||
if (add < 0) add = 0;
|
||||
curLen+=add;
|
||||
}
|
||||
|
||||
if ( same && (seeds[j].pos1+curLen>= seeds[i-1].pos2) )
|
||||
{
|
||||
curLen = (seeds[i-1].pos2 - seeds[j].pos1 + 1) * 2;
|
||||
pos2 = seeds[j].pos1;
|
||||
}
|
||||
else
|
||||
pos2 = seeds[i-1].pos2;
|
||||
*/
|
||||
|
||||
|
||||
KMRK_pushSeed(PtrAllSeeds,
|
||||
seeds[j].pos1,pos2,
|
||||
curLen,
|
||||
0);
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
AllSeeds_type *KMRK_allocSeeds(AllSeeds_type *AllSeeds,
|
||||
int32_t size,
|
||||
int8_t opt_dir,
|
||||
int8_t opt_inv)
|
||||
{
|
||||
|
||||
AllSeeds_type *reponse;
|
||||
|
||||
if(opt_inv != 1 && opt_dir != 1)
|
||||
{
|
||||
fprintf(stderr,"AllocSeeds: requiere at least "
|
||||
"one of opt_dir and opt_inv to be 1\n");
|
||||
exit(4);
|
||||
}
|
||||
|
||||
reponse = AllSeeds;
|
||||
|
||||
if (!reponse)
|
||||
reponse = MyCalloc( 1, sizeof(AllSeeds_type),"KMRK_allocSeeds: cannot allocate new data structure");
|
||||
|
||||
if(opt_dir)
|
||||
{
|
||||
if (reponse->dirSeeds==NULL)
|
||||
reponse->dirSeeds = (Seed_type *)MyCalloc( size,sizeof(Seed_type),"KMRK_allocSeeds: cannot allocate Direct Seeds");
|
||||
else
|
||||
{
|
||||
if(size)
|
||||
reponse->dirSeeds = (Seed_type *)MyRealloc( (void *)reponse->dirSeeds,
|
||||
size*sizeof(Seed_type), reponse->cDirSeeds*sizeof(Seed_type),
|
||||
"allocSeeds: cannot reallocate Direct Seeds" );
|
||||
else
|
||||
{
|
||||
MyFree( reponse->dirSeeds, reponse->cDirSeeds*sizeof(Seed_type) );
|
||||
reponse->dirSeeds=NULL;
|
||||
reponse->cDirSeeds=0;
|
||||
}
|
||||
}
|
||||
|
||||
reponse->cDirSeeds=size;
|
||||
}
|
||||
|
||||
|
||||
if(opt_inv)
|
||||
{
|
||||
if (reponse->invSeeds==NULL)
|
||||
reponse->invSeeds = (Seed_type *)MyCalloc( size, sizeof(Seed_type),"KMRK_allocSeeds: cannot allocate Inverted Seeds");
|
||||
else
|
||||
{
|
||||
if(size)
|
||||
reponse->invSeeds = (Seed_type *)MyRealloc( (void *)reponse->invSeeds,
|
||||
size*sizeof(Seed_type), reponse->cInvSeeds*sizeof(Seed_type),
|
||||
"allocSeeds: cannot reallocate Inverted Seeds" );
|
||||
else
|
||||
{
|
||||
MyFree( reponse->invSeeds, reponse->cInvSeeds*sizeof(Seed_type) );
|
||||
reponse->invSeeds=NULL;
|
||||
reponse->cInvSeeds=0;
|
||||
}
|
||||
}
|
||||
|
||||
reponse->cInvSeeds=size;
|
||||
}
|
||||
|
||||
|
||||
return reponse;
|
||||
}
|
||||
|
||||
void KMRK_freeSeeds(AllSeeds_type *AllSeeds)
|
||||
{
|
||||
if (!AllSeeds)
|
||||
return;
|
||||
|
||||
if (AllSeeds->dirSeeds)
|
||||
MyFree(AllSeeds->dirSeeds, AllSeeds->cDirSeeds*sizeof(Seed_type) );
|
||||
AllSeeds->dirSeeds=NULL;
|
||||
|
||||
if (AllSeeds->invSeeds)
|
||||
MyFree(AllSeeds->invSeeds, AllSeeds->cInvSeeds*sizeof(Seed_type) );
|
||||
AllSeeds->dirSeeds=NULL;
|
||||
|
||||
MyFree(AllSeeds, 1*sizeof( AllSeeds_type ) );
|
||||
}
|
||||
|
||||
void KMRK_compactSeeds(AllSeeds_type *AllSeeds)
|
||||
{
|
||||
if (AllSeeds)
|
||||
{
|
||||
if (AllSeeds->dirSeeds)
|
||||
KMRK_allocSeeds(AllSeeds, AllSeeds->nDirSeeds, 1, 0);
|
||||
|
||||
if (AllSeeds->invSeeds)
|
||||
KMRK_allocSeeds(AllSeeds, AllSeeds->nInvSeeds, 0, 1);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void KMRK_pushSeed(AllSeeds_type *AllSeeds,
|
||||
int32_t pos1,
|
||||
int32_t pos2,
|
||||
int32_t length,
|
||||
int8_t dir)
|
||||
{
|
||||
Seed_type* stack;
|
||||
int32_t maxcount;
|
||||
int32_t index;
|
||||
|
||||
if (dir)
|
||||
{
|
||||
dir = 1;
|
||||
stack = AllSeeds->dirSeeds;
|
||||
maxcount = AllSeeds->cDirSeeds;
|
||||
index = AllSeeds->nDirSeeds;
|
||||
}
|
||||
else
|
||||
{
|
||||
dir = 0;
|
||||
stack = AllSeeds->invSeeds;
|
||||
maxcount = AllSeeds->cInvSeeds;
|
||||
index = AllSeeds->nInvSeeds;
|
||||
}
|
||||
|
||||
if (index == maxcount)
|
||||
{
|
||||
(void) KMRK_allocSeeds(AllSeeds,
|
||||
maxcount * 2,
|
||||
dir,
|
||||
!dir);
|
||||
|
||||
if (dir)
|
||||
stack = AllSeeds->dirSeeds;
|
||||
else
|
||||
stack = AllSeeds->invSeeds;
|
||||
}
|
||||
|
||||
stack+=index;
|
||||
|
||||
stack->pos1 = pos1;
|
||||
stack->pos2 = pos2;
|
||||
stack->length = length;
|
||||
|
||||
if (dir)
|
||||
AllSeeds->nDirSeeds++;
|
||||
else
|
||||
AllSeeds->nInvSeeds++;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
AllSeeds_type* KMRK_enumerateDirectCouple(AllSeeds_type* Seeds,
|
||||
int32_t expected,
|
||||
int32_t wordSize,
|
||||
vn_type* stack,
|
||||
int32_t seq)
|
||||
{
|
||||
|
||||
int32_t xmin;
|
||||
int32_t xmax;
|
||||
int32_t i;
|
||||
int32_t j;
|
||||
int32_t k;
|
||||
int32_t next;
|
||||
int32_t* n;
|
||||
int32_t nseeds;
|
||||
|
||||
SmallSeed_type *list;
|
||||
|
||||
list = (SmallSeed_type *)MyCalloc( expected, sizeof(SmallSeed_type) ,
|
||||
"KMRK_enumerateDirectCouple: cannot allocate DirectCouple memory");
|
||||
|
||||
nseeds = 0;
|
||||
|
||||
n = GETVECTOR(stack,n);
|
||||
|
||||
if (seq)
|
||||
xmin = stack->limits[seq-1];
|
||||
else
|
||||
xmin = 0;
|
||||
|
||||
xmax = stack->limits[seq];
|
||||
|
||||
for (i=1; i <= xmax; i++)
|
||||
if (IS_MARKED(n,i)) /* Check begining of chained list */
|
||||
{
|
||||
/* Look for begining of sequence of interest */
|
||||
for( ;(i <= xmin) && (i != GET(n,i));
|
||||
i=GET(n,i));
|
||||
|
||||
/* for each factor in sequence of interest */
|
||||
for (j=i;
|
||||
(j != GET(n,j)) && (j <= xmax);
|
||||
j = GET(n,j))
|
||||
{
|
||||
next = GET(n,j);
|
||||
if (next <= xmax)
|
||||
do
|
||||
{
|
||||
k = next;
|
||||
next = GET(n,k);
|
||||
list[nseeds].pos1 = j-1;
|
||||
list[nseeds].pos2 = k-1;
|
||||
nseeds++;
|
||||
} while ((k!=next) && (next <= xmax));
|
||||
}
|
||||
} ;
|
||||
|
||||
Seeds = KMRK_allocSeeds(Seeds,
|
||||
expected/20+1,
|
||||
1,0);
|
||||
|
||||
/* fprintf(stderr,"Expected direct couple : %d\n",expected);*/
|
||||
|
||||
KMRK_SetMultipleLenDirSeeds(list,nseeds,wordSize,Seeds);
|
||||
MyFree(list, expected*sizeof(SmallSeed_type) );
|
||||
KMRK_compactSeeds(Seeds);
|
||||
|
||||
return Seeds;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
From KMR-K Stacks to SmallSeeds
|
||||
*/
|
||||
AllSeeds_type* KMRK_enumerateInvertedCouple(
|
||||
AllSeeds_type* Seeds,
|
||||
int32_t expected, /* the expected number of couple */
|
||||
int32_t wordSize, /* the Lmin */
|
||||
vn_type* stack)
|
||||
{
|
||||
int32_t xmax;
|
||||
int32_t invmax;
|
||||
int32_t posinv;
|
||||
int32_t i;
|
||||
int32_t j;
|
||||
int32_t k;
|
||||
int32_t memk;
|
||||
int32_t* n;
|
||||
int32_t next;
|
||||
|
||||
int32_t nseeds; /* number of seeds */
|
||||
|
||||
SmallSeed_type *list; /* seed list with only pos1 and pos2 --simple output from kmrk */
|
||||
|
||||
|
||||
list = (SmallSeed_type *)MyCalloc( expected, sizeof(SmallSeed_type) ,
|
||||
"KMRK_enumerateInvertedCouple: cannot allocate InvertedCouple memory");
|
||||
|
||||
nseeds = 0;
|
||||
|
||||
n = GETVECTOR(stack,n);
|
||||
|
||||
xmax = stack->limits[0];
|
||||
invmax = stack->limits[1];
|
||||
|
||||
for (i=1; i <= xmax; i++)
|
||||
if (IS_MARKED(n,i))
|
||||
{
|
||||
for(memk=i ;
|
||||
(memk < xmax) &&
|
||||
memk != GET(n,memk) &&
|
||||
(memk <= invmax);
|
||||
memk=GET(n,memk));
|
||||
|
||||
if ((memk > xmax) && (memk <= invmax))
|
||||
for (j=i;
|
||||
(j <= xmax) && (j != GET(n,j));
|
||||
j=GET(n,j))
|
||||
{
|
||||
next = memk;
|
||||
do
|
||||
{
|
||||
k = next;
|
||||
next = GET(n,k);
|
||||
posinv = 2 * xmax - k -wordSize + 3;
|
||||
if (j <= posinv)
|
||||
{
|
||||
list[nseeds].pos1=j-1;
|
||||
list[nseeds].pos2=posinv-1;
|
||||
nseeds++;
|
||||
}
|
||||
} while ((next <= invmax) && (k != next));
|
||||
}
|
||||
}
|
||||
|
||||
Seeds = KMRK_allocSeeds(Seeds, expected/20+1, 0,1);
|
||||
|
||||
/* fprintf(stderr,"Expected inverted couple : %d\n",expected);*/
|
||||
|
||||
SetMultipleLenInvSeeds(list,nseeds,wordSize,1,Seeds); /* turn the Small seeds into merged seeds */
|
||||
MyFree(list, expected*sizeof(SmallSeed_type) );
|
||||
KMRK_compactSeeds(Seeds);
|
||||
|
||||
|
||||
return Seeds;
|
||||
}
|
||||
|
||||
AllSeeds_type* KMRK_enumerateInterCouple(AllSeeds_type* Seeds,
|
||||
int32_t seq1,
|
||||
int32_t seq2,
|
||||
int32_t expected,
|
||||
int32_t wordSize,
|
||||
vn_type* stack)
|
||||
{
|
||||
int32_t xmin;
|
||||
int32_t xmax;
|
||||
int32_t ymax;
|
||||
int32_t ymin;
|
||||
int32_t pos1;
|
||||
int32_t pos2;
|
||||
int32_t i;
|
||||
int32_t j;
|
||||
int32_t k;
|
||||
int32_t memj;
|
||||
int32_t memk;
|
||||
int32_t* n;
|
||||
int32_t next;
|
||||
|
||||
int32_t nseeds;
|
||||
|
||||
|
||||
SmallSeed_type *list;
|
||||
nseeds=0;
|
||||
|
||||
list = (SmallSeed_type *)MyCalloc( expected, sizeof(SmallSeed_type) ,
|
||||
"KMRK_enumerateInterCouple: cannot allocate InterCouple memory");
|
||||
|
||||
n = GETVECTOR(stack,n);
|
||||
|
||||
if (seq1==0)
|
||||
xmin=0;
|
||||
else
|
||||
xmin = stack->limits[seq1-1];
|
||||
|
||||
xmax = stack->limits[seq1];
|
||||
ymin = stack->limits[seq2-1];
|
||||
ymax = stack->limits[seq2];
|
||||
|
||||
for (i=1; i <= xmax; i++)
|
||||
if (IS_MARKED(n,i))
|
||||
{
|
||||
for(memj=i ;
|
||||
(memj < xmin) &&
|
||||
memj != GET(n,memj);
|
||||
memj=GET(n,memj));
|
||||
|
||||
if ((memj > xmin) && (memj <= xmax))
|
||||
{
|
||||
for(memk=memj ;
|
||||
(memk < ymin) &&
|
||||
memk != GET(n,memk);
|
||||
memk=GET(n,memk));
|
||||
|
||||
if ((memk > ymin) && (memk <= ymax))
|
||||
for (j=memj;
|
||||
(j <= xmax) && (j != GET(n,j));
|
||||
j=GET(n,j))
|
||||
{
|
||||
next = memk;
|
||||
do
|
||||
{
|
||||
k = next;
|
||||
next = GET(n,k);
|
||||
if (seq1 > 0)
|
||||
pos1 = j - xmin - 1;
|
||||
else
|
||||
pos1 = j;
|
||||
pos2 = k - ymin - 1;
|
||||
list[nseeds].pos1 = pos1 - 1;
|
||||
list[nseeds].pos2 = pos2 - 1;
|
||||
nseeds++;
|
||||
} while ((next <= ymax) && (k != next));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Seeds = KMRK_allocSeeds(Seeds,
|
||||
expected/20+1,
|
||||
1,0);
|
||||
|
||||
/* fprintf(stderr,"Expected inter-direct couple : %d\n",expected);*/
|
||||
KMRK_SetMultipleLenDirSeeds(list,nseeds,wordSize,Seeds);
|
||||
|
||||
MyFree(list, expected*sizeof(SmallSeed_type) );
|
||||
|
||||
KMRK_compactSeeds(Seeds);
|
||||
|
||||
return Seeds;
|
||||
}
|
||||
|
||||
AllSeeds_type* KMRK_enumerateInterInvertedCouple(AllSeeds_type* Seeds,
|
||||
int32_t seq2,
|
||||
int32_t expected,
|
||||
int32_t wordSize,
|
||||
vn_type* stack)
|
||||
|
||||
{
|
||||
int32_t xmin;
|
||||
int32_t xmax;
|
||||
int32_t ymax;
|
||||
int32_t ymin;
|
||||
int32_t posinv;
|
||||
int32_t pos2;
|
||||
int32_t i;
|
||||
int32_t j;
|
||||
int32_t k;
|
||||
int32_t memj;
|
||||
int32_t memk;
|
||||
int32_t* n;
|
||||
int32_t next;
|
||||
|
||||
int32_t nseeds;
|
||||
|
||||
SmallSeed_type *list;
|
||||
|
||||
list = (SmallSeed_type *)MyCalloc( expected, sizeof(SmallSeed_type) ,
|
||||
"KMRK_enumerateInterCouple: cannot allocate InterCouple memory");
|
||||
|
||||
|
||||
nseeds = 0;
|
||||
n = GETVECTOR(stack,n);
|
||||
|
||||
if (seq2 < 2)
|
||||
{
|
||||
fprintf(stderr,"enumerateInterInvertedCouple: seq2 must be differente to 0");
|
||||
exit(4);
|
||||
}
|
||||
|
||||
|
||||
xmin = stack->limits[0];
|
||||
xmax = stack->limits[1];
|
||||
|
||||
ymin = stack->limits[seq2-1];
|
||||
ymax = stack->limits[seq2];
|
||||
|
||||
Seeds = KMRK_allocSeeds(Seeds,
|
||||
expected,
|
||||
0,1);
|
||||
|
||||
for (i=1; i <= xmax; i++)
|
||||
if (IS_MARKED(n,i))
|
||||
{
|
||||
for(memj=i ;
|
||||
(memj < xmin) &&
|
||||
memj != GET(n,memj);
|
||||
memj=GET(n,memj));
|
||||
|
||||
if ((memj > xmin) && (memj <= xmax))
|
||||
{
|
||||
for(memk=memj ;
|
||||
(memk < ymin) &&
|
||||
memk != GET(n,memk);
|
||||
memk=GET(n,memk));
|
||||
|
||||
if ((memk > ymin) && (memk <= ymax))
|
||||
for (j=memj;
|
||||
(j <= xmax) && (j != GET(n,j));
|
||||
j=GET(n,j))
|
||||
{
|
||||
next = memk;
|
||||
do
|
||||
{
|
||||
k = next;
|
||||
next = GET(n,k);
|
||||
posinv = 2 * xmin - j -wordSize + 3;
|
||||
pos2 = k - ymin - 1;
|
||||
list[nseeds].pos1=posinv-1;
|
||||
list[nseeds].pos2=pos2-1;
|
||||
nseeds++;
|
||||
} while ((next <= ymax) && (k != next));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Seeds = KMRK_allocSeeds(Seeds,
|
||||
expected/20+1,
|
||||
0,1);
|
||||
|
||||
/* fprintf(stderr,"Expected inter-inverted couple : %d\n",expected);*/
|
||||
SetMultipleLenInvSeeds(list,nseeds,wordSize,0,Seeds);
|
||||
MyFree(list, expected* sizeof(SmallSeed_type) );
|
||||
KMRK_compactSeeds(Seeds);
|
||||
|
||||
|
||||
return Seeds;
|
||||
}
|
||||
|
||||
|
||||
int32_t KMRK_cmpSeedsPos(SmallSeed_type *s1, SmallSeed_type *s2)
|
||||
{
|
||||
if (s1->pos1==s2->pos1)
|
||||
return s1->pos2 - s2->pos2;
|
||||
else
|
||||
return s1->pos1 - s2->pos1;
|
||||
}
|
||||
|
||||
int32_t KMRK_cmpDeltaSeedsPos(SmallSeed_type *s1, SmallSeed_type *s2)
|
||||
{
|
||||
int32_t delta1 = s1->pos2-s1->pos1;
|
||||
int32_t delta2 = s2->pos2-s2->pos1;
|
||||
|
||||
if (delta1==delta2)
|
||||
return s1->pos1 - s2->pos1;
|
||||
else
|
||||
return delta1 - delta2;
|
||||
}
|
||||
|
||||
/*
|
||||
Sort by deltas, then pos1
|
||||
*/
|
||||
int32_t KMRK_cmpDeltaInvSeedsPos(SmallSeed_type *s1, SmallSeed_type *s2)
|
||||
{
|
||||
int32_t delta1 = s1->pos2+s1->pos1;
|
||||
int32_t delta2 = s2->pos2+s2->pos1;
|
||||
|
||||
if (delta1==delta2)
|
||||
return s1->pos1 - s2->pos1; /* then sort by pos1 */
|
||||
else
|
||||
return delta1 - delta2; /* sort by deltas */
|
||||
}
|
||||
|
||||
|
||||
void KMRK_sortSeeds(SmallSeed_type* seeds,
|
||||
int32_t nseeds,
|
||||
KMRK_SORT_SEEDS_FUNC_PTR(compare))
|
||||
{
|
||||
|
||||
qsort(seeds,
|
||||
nseeds,
|
||||
sizeof(SmallSeed_type),
|
||||
(int (*)(const void *, const void *))compare);
|
||||
}
|
||||
|
||||
AllSeeds_type* KMRK_get_seeds(char **seq,
|
||||
int32_t SimpleSeqLen,
|
||||
int16_t Lmin,
|
||||
int8_t opt_dir,
|
||||
int8_t opt_inv,
|
||||
int8_t opt_verbose,
|
||||
masked_area_table_t *mask)
|
||||
{
|
||||
AllSeeds_type* AllSeeds;
|
||||
char *ConcatSeq = *seq;
|
||||
vn_type * stacks;
|
||||
int32_t dirExpect=0;
|
||||
int32_t invExpect=0;
|
||||
KMRK_QUORUM_FUNC_PTR(quorum);
|
||||
|
||||
if(opt_inv != 1 && opt_dir != 1)
|
||||
fprintf(stderr, "get_seeds: requiered at least opt_dir or opt_inv to be 1\n"), exit(4);
|
||||
|
||||
|
||||
if(opt_inv)
|
||||
ConcatSeq = makeDirInvSeq(ConcatSeq,SimpleSeqLen); /* create a sequence with "DirSeq\@InvSeq\0" */
|
||||
|
||||
if (opt_inv){ /* Are we interested in dir, inv or both ? */
|
||||
if (opt_dir)
|
||||
quorum = KMRK_DirInvQuorum;
|
||||
else
|
||||
quorum = KMRK_InvQuorum;
|
||||
}
|
||||
else
|
||||
quorum = KMRK_CoupleQuorum;
|
||||
|
||||
stacks = KMRK_RunTo(ConcatSeq, /* concatenation of both sequence */
|
||||
Lmin, /* minimum length of the repeated elements = word size to construct */
|
||||
opt_inv, /* shall we search for inverted repeats as well ?*/
|
||||
2, /* the count -- a mysterious value passed to the quorum function (unused ??) */
|
||||
1, /* how many functions */
|
||||
quorum, /* a pointer to the selected quorum function */
|
||||
KMRK_HashOneSequence, /* a pointer to the init function */
|
||||
mask);
|
||||
|
||||
invExpect =0;
|
||||
|
||||
KMRK_markStart(stacks);
|
||||
|
||||
if (opt_inv)
|
||||
{
|
||||
ConcatSeq = (char *)MyRealloc( (void *)ConcatSeq, (SimpleSeqLen+1)*sizeof(char),
|
||||
(2*SimpleSeqLen+2)*sizeof(char) , "KRMK_get_seeds: Cannot shrink memory"); /* reset mem to a sigle sequence */
|
||||
ConcatSeq[SimpleSeqLen]=0;
|
||||
}
|
||||
|
||||
if(opt_inv)
|
||||
invExpect = KMRK_upperInvertedCount(stacks,Lmin);
|
||||
|
||||
if(opt_dir)
|
||||
dirExpect = KMRK_upperCoupleCount(stacks);
|
||||
|
||||
|
||||
AllSeeds = NULL;
|
||||
|
||||
|
||||
MyFree(stacks->v, (stacks->vectorsize) * sizeof(int32_t));
|
||||
stacks->v=NULL;
|
||||
|
||||
|
||||
if (opt_dir)
|
||||
AllSeeds = KMRK_enumerateDirectCouple(AllSeeds,dirExpect,Lmin ,stacks,0);
|
||||
|
||||
if (opt_inv)
|
||||
AllSeeds = KMRK_enumerateInvertedCouple(AllSeeds,invExpect,Lmin,stacks);
|
||||
|
||||
KMRK_FreeVN(stacks);
|
||||
|
||||
*seq = ConcatSeq;
|
||||
|
||||
return AllSeeds;
|
||||
}
|
||||
|
||||
AllSeeds_type* KMRK_get_seeds_2seqs(char **seq1,
|
||||
char **seq2,
|
||||
int32_t size1,
|
||||
int32_t size2,
|
||||
int16_t Lmin,
|
||||
int8_t opt_dir,
|
||||
int8_t opt_inv,
|
||||
int8_t opt_verbose,
|
||||
masked_area_table_t *mask)
|
||||
{
|
||||
|
||||
AllSeeds_type* AllSeeds;
|
||||
char *sequence1 = *seq1;
|
||||
char *sequence2 = *seq2;
|
||||
vn_type * stacks;
|
||||
int32_t dirExpect=0;
|
||||
int32_t invExpect=0;
|
||||
KMRK_QUORUM_FUNC_PTR(quorum);
|
||||
int32_t sizef; /* =size1 if only dir OR =2*size1+size2 if inv */
|
||||
|
||||
if(opt_inv != 1 &&
|
||||
opt_dir != 1)
|
||||
{
|
||||
fprintf(stderr,
|
||||
"get_seeds_2seqs: requiered at least "
|
||||
"opt_dir or opt_inv to be 1\n");
|
||||
exit(4);
|
||||
}
|
||||
|
||||
sizef = size1;
|
||||
|
||||
if(opt_inv)
|
||||
{
|
||||
sequence1 = makeDirInvSeq(sequence1,size1);
|
||||
sizef+=(1+size1);
|
||||
}
|
||||
|
||||
sequence1 = merge2seq(sequence1,sequence2,sizef,size2);
|
||||
|
||||
if (opt_inv)
|
||||
if (opt_dir)
|
||||
quorum = KMRK_IntDirInv12Quorum;
|
||||
else
|
||||
quorum = KMRK_IntInv12Quorum;
|
||||
else
|
||||
quorum = KMRK_Int12Quorum;
|
||||
|
||||
stacks = KMRK_RunTo(sequence1,
|
||||
Lmin,
|
||||
opt_inv,
|
||||
2,
|
||||
2,
|
||||
quorum,
|
||||
KMRK_HashOneSequence,
|
||||
mask);
|
||||
|
||||
KMRK_markStart(stacks);
|
||||
|
||||
sequence1= (char *)MyRealloc( (void *)sequence1, (size1+1)*sizeof(char),
|
||||
(sizef+size2+2)*sizeof(char), "KMRK_get_seeds_2seqs: shrink memory back to size1");
|
||||
|
||||
sequence1[size1]=0;
|
||||
|
||||
if (opt_dir){
|
||||
if (opt_inv)
|
||||
dirExpect = KMRK_upperInterCount(stacks,0,2,Lmin);
|
||||
else
|
||||
dirExpect = KMRK_upperInterCount(stacks,0,1,Lmin);
|
||||
}
|
||||
if (opt_inv)
|
||||
invExpect = KMRK_upperInterCount(stacks,1,2,Lmin);
|
||||
|
||||
AllSeeds = NULL;
|
||||
MyFree(stacks->v, stacks->vectorsize*sizeof(int32_t) ); /* free vector v from the stacks */
|
||||
stacks->v=NULL;
|
||||
|
||||
if (opt_dir){
|
||||
if (opt_inv)
|
||||
AllSeeds = KMRK_enumerateInterCouple(AllSeeds,
|
||||
0,2,
|
||||
dirExpect,
|
||||
Lmin ,
|
||||
stacks);
|
||||
else
|
||||
AllSeeds = KMRK_enumerateInterCouple(AllSeeds,
|
||||
0,1,
|
||||
dirExpect,
|
||||
Lmin ,
|
||||
stacks);
|
||||
|
||||
}
|
||||
if (opt_inv)
|
||||
AllSeeds = KMRK_enumerateInterInvertedCouple(AllSeeds,
|
||||
2,
|
||||
invExpect,
|
||||
Lmin ,
|
||||
stacks);
|
||||
|
||||
|
||||
KMRK_FreeVN(stacks);
|
||||
|
||||
*seq1 = sequence1;
|
||||
|
||||
return AllSeeds;
|
||||
}
|
||||
|
||||
#define SIGN(x) (((x)<0) ? -1:(((x)>0) ? 1:0))
|
||||
|
||||
static int32_t compareSeedsByPos(Seed_type* s1,Seed_type* s2)
|
||||
{
|
||||
if (s1->pos1 == s2->pos1)
|
||||
return SIGN(s1->pos2 - s2->pos2);
|
||||
else
|
||||
return SIGN(s1->pos1 - s2->pos1);
|
||||
}
|
||||
|
||||
void KMRK_sortSeedsByPos(Seed_type* seeds, int32_t count)
|
||||
{
|
||||
qsort(seeds,
|
||||
count,
|
||||
sizeof(Seed_type),
|
||||
(int (*)(const void *, const void *))compareSeedsByPos);
|
||||
};
|
||||
|
||||
#undef SIGN
|
||||
|
145
src/repseek/repseek-2014.09/KMRK_Seeds.h
Normal file
145
src/repseek/repseek-2014.09/KMRK_Seeds.h
Normal file
@ -0,0 +1,145 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
#ifndef KMRK_Seeds_h
|
||||
#define KMRK_Seeds_h
|
||||
|
||||
/********************************************
|
||||
********************************************
|
||||
**
|
||||
** Declaration of struct
|
||||
**
|
||||
********************************************
|
||||
********************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "KMRK.h"
|
||||
#include "repseek_types.h"
|
||||
|
||||
|
||||
|
||||
|
||||
#define KMRK_SORT_SEEDS_FUNC_PTR(name) int32_t (*name)(SmallSeed_type*, \
|
||||
SmallSeed_type*)
|
||||
|
||||
#define KMRK_DELTA_SEEDS_FUNC_PTR(name) int32_t (*name)(SmallSeed_type*)
|
||||
|
||||
/********************************************
|
||||
********************************************
|
||||
**
|
||||
** Declaration of public functions
|
||||
**
|
||||
********************************************
|
||||
********************************************/
|
||||
|
||||
|
||||
AllSeeds_type *KMRK_allocSeeds(AllSeeds_type *AllSeeds,
|
||||
int32_t size,
|
||||
int8_t opt_dir,
|
||||
int8_t opt_inv);
|
||||
|
||||
void KMRK_SetMultipleLenDirSeeds(SmallSeed_type* seeds,
|
||||
int32_t nseeds,
|
||||
int32_t wordSize,
|
||||
AllSeeds_type *PtrAllSeeds);
|
||||
|
||||
|
||||
void KMRK_freeSeeds(AllSeeds_type *AllSeeds);
|
||||
|
||||
void KMRK_compactSeeds(AllSeeds_type *AllSeeds);
|
||||
|
||||
void KMRK_pushSeed(AllSeeds_type *AllSeeds,
|
||||
int32_t pos1,
|
||||
int32_t pos2,
|
||||
int32_t length,
|
||||
int8_t dir);
|
||||
|
||||
AllSeeds_type* KMRK_enumerateDirectCouple(AllSeeds_type* Seeds,
|
||||
int32_t expected,
|
||||
int32_t wordSize,
|
||||
vn_type* stack,
|
||||
int32_t seq);
|
||||
|
||||
AllSeeds_type* KMRK_enumerateInvertedCouple(AllSeeds_type* Seeds,
|
||||
int32_t expected,
|
||||
int32_t wordSize,
|
||||
vn_type* stack);
|
||||
|
||||
AllSeeds_type* KMRK_enumerateInterCouple(AllSeeds_type* Seeds,
|
||||
int32_t seq1,
|
||||
int32_t seq2,
|
||||
int32_t expected,
|
||||
int32_t wordSize,
|
||||
vn_type* stack);
|
||||
|
||||
AllSeeds_type* KMRK_enumerateInterInvertedCouple(AllSeeds_type* Seeds,
|
||||
int32_t seq2,
|
||||
int32_t expected,
|
||||
int32_t wordSize,
|
||||
vn_type* stack);
|
||||
|
||||
|
||||
/**
|
||||
* Compare two seeds and return an integer less than, equal to or greater
|
||||
* than zero considering the relative order of the two seeds. This
|
||||
* version take into account only pos1 and pos2 of seeds without taking
|
||||
* account of the sequences or of the relative direction
|
||||
*
|
||||
* @param s1 pointer to seed one
|
||||
* @param s2 pointer to seed two
|
||||
*
|
||||
* @return a integer less than, equal to or greater than zero
|
||||
*/
|
||||
|
||||
int32_t KMRK_cmpSeedsPos(SmallSeed_type *s1, SmallSeed_type *s2);
|
||||
int32_t KMRK_cmpDeltaSeedsPos(SmallSeed_type *s1, SmallSeed_type *s2);
|
||||
int32_t KMRK_cmpDeltaInvSeedsPos(SmallSeed_type *s1, SmallSeed_type *s2);
|
||||
|
||||
void KMRK_sortSeeds(SmallSeed_type* seeds,
|
||||
int32_t nseeds,
|
||||
KMRK_SORT_SEEDS_FUNC_PTR(compare));
|
||||
|
||||
|
||||
AllSeeds_type* KMRK_get_seeds(char **seq,
|
||||
int32_t SimpleSeqLen,
|
||||
int16_t Lmin,
|
||||
int8_t opt_dir,
|
||||
int8_t opt_inv,
|
||||
int8_t opt_verbose,
|
||||
masked_area_table_t *mask);
|
||||
|
||||
AllSeeds_type* KMRK_get_seeds_2seqs(char **seq1,
|
||||
char **seq2,
|
||||
int32_t size1,
|
||||
int32_t size2,
|
||||
int16_t Lmin,
|
||||
int8_t opt_dir,
|
||||
int8_t opt_inv,
|
||||
int8_t opt_verbose,
|
||||
masked_area_table_t *mask);
|
||||
|
||||
/**
|
||||
* Order an array of seeds by pos1,pos2
|
||||
*
|
||||
* @param seeds pointer to an array of Seed_type object to sort
|
||||
* @param count count of element in the array
|
||||
*/
|
||||
void KMRK_sortSeedsByPos(Seed_type* seeds, int32_t count);
|
||||
|
||||
#endif /* KMRK_Seeds_h */
|
281
src/repseek/repseek-2014.09/KMRK_mask.c
Normal file
281
src/repseek/repseek-2014.09/KMRK_mask.c
Normal file
@ -0,0 +1,281 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/*
|
||||
* KMRK_mask.c
|
||||
* repseek
|
||||
*
|
||||
* Created by Eric Coissac on 04/12/04.
|
||||
* Copyright 2004 __MyCompanyName__. All rights reserved.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "KMRK_mask.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "memory.h"
|
||||
|
||||
#define MASKED_AREA_TABLE_SIZE(seqcount) (sizeof(masked_area_table_t) + (sizeof(masked_area_list_t*) * ((seqcount)-1)))
|
||||
#define MASKED_AREA_LIST_SIZE(areacount) (sizeof(masked_area_list_t) + (sizeof(masked_area_t) * ((areacount)-1)))
|
||||
|
||||
#define AREA_COUNT_INIT (1000)
|
||||
|
||||
static masked_area_table_t *new_masked_area_table(int32_t seqcount, int32_t areacount);
|
||||
static masked_area_list_t *new_masked_area_list(int32_t areacount);
|
||||
static masked_area_list_t *realloc_masked_area_list(masked_area_list_t *list,int32_t areacount);
|
||||
static int32_t push_area(masked_area_table_t* table,int32_t sequence,int32_t begin,int32_t end);
|
||||
static void sort_area_table(masked_area_table_t* table);
|
||||
static int32_t compare_area(const masked_area_t* v1,const masked_area_t* v2);
|
||||
static int32_t search_area(const masked_area_t* v1,const masked_area_t* v2);
|
||||
|
||||
static masked_area_list_t *strip_area_list(masked_area_list_t* list);
|
||||
static void strip_area_table(masked_area_table_t* table);
|
||||
|
||||
|
||||
static masked_area_list_t *new_masked_area_list(int32_t areacount)
|
||||
{
|
||||
masked_area_list_t *list;
|
||||
|
||||
list = MyCalloc(1, MASKED_AREA_LIST_SIZE(areacount), "Not enougth memory for mask table");
|
||||
list->reserved=areacount;
|
||||
|
||||
return list;
|
||||
}
|
||||
|
||||
static masked_area_list_t *realloc_masked_area_list(masked_area_list_t *list,int32_t areacount)
|
||||
{
|
||||
list = MyRealloc(list,
|
||||
MASKED_AREA_LIST_SIZE(areacount),
|
||||
MASKED_AREA_LIST_SIZE(list->reserved),
|
||||
"Not enougth memory for mask table");
|
||||
|
||||
list->reserved=areacount;
|
||||
|
||||
return list;
|
||||
}
|
||||
|
||||
static masked_area_table_t *new_masked_area_table(int32_t seqcount, int32_t areacount)
|
||||
{
|
||||
masked_area_table_t *table;
|
||||
int32_t i;
|
||||
|
||||
table = MyCalloc(1, MASKED_AREA_TABLE_SIZE(seqcount),"Not enougth memory for mask table");
|
||||
table->seqcount=seqcount;
|
||||
|
||||
for (i=0;i<seqcount;i++)
|
||||
table->sequence[i]=new_masked_area_list(areacount);
|
||||
|
||||
return table;
|
||||
}
|
||||
|
||||
static int32_t push_area(masked_area_table_t* table,int32_t sequence,int32_t begin,int32_t end)
|
||||
{
|
||||
masked_area_list_t * list;
|
||||
|
||||
if (sequence >= table->seqcount)
|
||||
return -1;
|
||||
|
||||
list = table->sequence[sequence];
|
||||
|
||||
if (list->reserved == list->count)
|
||||
{
|
||||
list = realloc_masked_area_list(list,list->reserved*2);
|
||||
table->sequence[sequence]=list;
|
||||
}
|
||||
|
||||
list->area[list->count].begin=begin;
|
||||
list->area[list->count].end=end;
|
||||
|
||||
list->count++;
|
||||
table->total++;
|
||||
|
||||
return table->total;
|
||||
}
|
||||
|
||||
static int32_t compare_area(const masked_area_t* v1,const masked_area_t* v2)
|
||||
{
|
||||
return v1->begin - v2->begin;
|
||||
}
|
||||
|
||||
static void sort_area_table(masked_area_table_t* table)
|
||||
{
|
||||
int32_t i;
|
||||
|
||||
for (i=0; i<table->seqcount;i++)
|
||||
{
|
||||
qsort(table->sequence[i]->area,
|
||||
table->sequence[i]->count,
|
||||
sizeof(masked_area_t),
|
||||
(int (*)(const void *, const void *))compare_area);
|
||||
}
|
||||
}
|
||||
|
||||
static masked_area_list_t *strip_area_list(masked_area_list_t* list)
|
||||
{
|
||||
int32_t i;
|
||||
int32_t j;
|
||||
int32_t count;
|
||||
int32_t newcount;
|
||||
|
||||
count = list->count;
|
||||
newcount=count;
|
||||
|
||||
for (i=1;i<count;i++)
|
||||
{
|
||||
/* fprintf(stderr,"\n%d->%d %d->%d ==>",list->area[i-1].begin,list->area[i-1].end,list->area[i].begin,list->area[i].end); */
|
||||
if ((list->area[i].begin-1) <= list->area[i-1].end)
|
||||
{
|
||||
/* fprintf(stderr," joined"); */
|
||||
list->area[i].begin=list->area[i-1].begin;
|
||||
list->area[i-1].begin=-1;
|
||||
newcount--;
|
||||
}
|
||||
}
|
||||
list->count=newcount;
|
||||
|
||||
for (i=0,j=0;i<count;i++)
|
||||
{
|
||||
if (list->area[i].begin>=0)
|
||||
{
|
||||
if (i!=j)
|
||||
list->area[j]=list->area[i];
|
||||
j++;
|
||||
}
|
||||
}
|
||||
|
||||
return realloc_masked_area_list(list,newcount);
|
||||
|
||||
}
|
||||
|
||||
static void strip_area_table(masked_area_table_t* table)
|
||||
{
|
||||
int32_t seq;
|
||||
int32_t oldcount;
|
||||
masked_area_list_t* list;
|
||||
|
||||
sort_area_table(table);
|
||||
|
||||
for (seq=0; seq < table->seqcount;seq++)
|
||||
{
|
||||
list = table->sequence[seq];
|
||||
oldcount = list->count;
|
||||
table->sequence[seq]=strip_area_list(list);
|
||||
table->total-=oldcount - table->sequence[seq]->count;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static int32_t search_area(const masked_area_t* v1,const masked_area_t* v2)
|
||||
{
|
||||
int32_t pos;
|
||||
|
||||
pos = v1->begin;
|
||||
|
||||
if (pos < v2->begin)
|
||||
return -1;
|
||||
|
||||
if (pos > v2->end)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
masked_area_table_t *KMRK_ReadMaskArea(char *areafile, int32_t seqcount, int32_t complement)
|
||||
{
|
||||
FILE* area;
|
||||
char buffer[1000];
|
||||
char *ok;
|
||||
int32_t begin;
|
||||
int32_t end;
|
||||
int32_t sequence;
|
||||
int32_t column;
|
||||
int32_t linecount;
|
||||
masked_area_table_t *table;
|
||||
|
||||
if (complement > 0)
|
||||
seqcount++;
|
||||
else
|
||||
complement=0;
|
||||
|
||||
area = fopen(areafile,"r");
|
||||
linecount=0;
|
||||
table = new_masked_area_table(seqcount,AREA_COUNT_INIT);
|
||||
|
||||
do {
|
||||
linecount++;
|
||||
ok = fgets(buffer,999,area);
|
||||
if (ok)
|
||||
{
|
||||
column = sscanf(buffer,"%d %d %d",&begin,&end,&sequence);
|
||||
if (column > 1 && begin <= end)
|
||||
{
|
||||
begin--;
|
||||
end--;
|
||||
if (column==3)
|
||||
sequence--;
|
||||
else
|
||||
sequence=0;
|
||||
|
||||
if (sequence>1 || sequence<0)
|
||||
fprintf(stderr,"ERROR while reading mask file, line %d (see help)\n", linecount), exit(0);
|
||||
|
||||
if (sequence && complement)
|
||||
sequence++;
|
||||
|
||||
push_area(table,sequence,begin,end);
|
||||
|
||||
if (!sequence && complement)
|
||||
push_area(table,1,complement -1 - end,complement -1 -begin);
|
||||
|
||||
}
|
||||
|
||||
if (column==1)
|
||||
fprintf(stderr,"EROOR in mask file reading line %d\n",linecount), exit(0);
|
||||
|
||||
}
|
||||
} while (ok);
|
||||
|
||||
fprintf(stderr,"\nread %d masked areas from file\n",table->total);
|
||||
strip_area_table(table);
|
||||
fprintf(stderr,"strip to %d non overlaping areas\n",table->total);
|
||||
|
||||
return table;
|
||||
}
|
||||
|
||||
char KMRK_isMasked(masked_area_table_t *mask,int32_t seq, int32_t position)
|
||||
{
|
||||
masked_area_t input;
|
||||
int32_t result;
|
||||
masked_area_list_t *list;
|
||||
|
||||
if (! mask || (seq >= mask->seqcount))
|
||||
return 0;
|
||||
|
||||
list = mask->sequence[seq];
|
||||
input.begin = position;
|
||||
|
||||
result = bsearch(&input,
|
||||
list->area,
|
||||
list->count,
|
||||
sizeof(masked_area_t),
|
||||
(int (*)(const void *, const void *))search_area) != NULL;
|
||||
|
||||
return result;
|
||||
}
|
57
src/repseek/repseek-2014.09/KMRK_mask.h
Normal file
57
src/repseek/repseek-2014.09/KMRK_mask.h
Normal file
@ -0,0 +1,57 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/*
|
||||
* KMRK_mask.h
|
||||
* repseek
|
||||
*
|
||||
* Created by Eric Coissac on 04/12/04.
|
||||
* Copyright 2004 __MyCompanyName__. All rights reserved.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#ifndef KMRK_MASK_H
|
||||
#define KMRK_MASK_H
|
||||
|
||||
#include "repseek_types.h"
|
||||
|
||||
typedef struct {
|
||||
int32_t begin;
|
||||
int32_t end;
|
||||
} masked_area_t;
|
||||
|
||||
typedef struct {
|
||||
int32_t reserved;
|
||||
int32_t count;
|
||||
|
||||
masked_area_t area[1];
|
||||
} masked_area_list_t;
|
||||
|
||||
typedef struct {
|
||||
int32_t seqcount;
|
||||
int32_t total;
|
||||
|
||||
masked_area_list_t *sequence[1];
|
||||
} masked_area_table_t;
|
||||
|
||||
masked_area_table_t *KMRK_ReadMaskArea(char *areafile,int32_t seqcount,int32_t complement);
|
||||
char KMRK_isMasked(masked_area_table_t *mask,int32_t seq, int32_t position);
|
||||
|
||||
#endif
|
142
src/repseek/repseek-2014.09/KMRK_merge_seeds.c
Normal file
142
src/repseek/repseek-2014.09/KMRK_merge_seeds.c
Normal file
@ -0,0 +1,142 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/**
|
||||
* @file KMRK_merge_seeds.c
|
||||
* @author Eric Coissac <coissac@inrialpes.fr>
|
||||
* @date Wed Mar 3 11:15:57 2004
|
||||
*
|
||||
* @brief Merge function of overlapping seeds
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
#include "KMRK_merge_seeds.h"
|
||||
|
||||
void KMRK_MergeSeeds(AllSeeds_type *AllSeeds,
|
||||
int8_t opt_dir,
|
||||
int8_t opt_inv)
|
||||
{
|
||||
|
||||
int32_t i; /* the current seed */
|
||||
int32_t j; /* the checked seed */
|
||||
|
||||
int32_t N; /* the kept ones */
|
||||
Seed_type* seeds;
|
||||
|
||||
|
||||
if(opt_dir){
|
||||
seeds = AllSeeds->dirSeeds;
|
||||
for(i=0, N=0 ;i<AllSeeds->nDirSeeds; i++){
|
||||
|
||||
if(seeds[i].pos1==-1) /* any seed at -1 is removed */
|
||||
continue;
|
||||
|
||||
|
||||
j=i+1;
|
||||
|
||||
while( (seeds[j].pos1!=-1) &&
|
||||
(seeds[i].pos1!=-1) &&
|
||||
(j < AllSeeds->nDirSeeds) &&
|
||||
(seeds[j].pos1 < seeds[i].pos1+ seeds[i].length))
|
||||
{
|
||||
|
||||
if(
|
||||
((seeds[j].pos2 >= seeds[i].pos2) &&
|
||||
(seeds[j].pos2 < seeds[i].pos2 + seeds[i].length)) || /* if the seeds are overlapping */
|
||||
((seeds[j].pos2 + seeds[j].length >= seeds[i].pos2) &&
|
||||
(seeds[j].pos2 + seeds[j].length < seeds[i].pos2 + seeds[i].length)))
|
||||
{
|
||||
if(seeds[j].length <= seeds[i].length)
|
||||
seeds[j].pos1=seeds[j].pos2=seeds[j].length=-1; /* removed the smallest */
|
||||
else
|
||||
seeds[i].pos1=seeds[i].pos2=seeds[i].length=-1;
|
||||
}
|
||||
|
||||
j++;
|
||||
}
|
||||
|
||||
if(seeds[i].pos1 !=-1)
|
||||
{ /* if this seed is not out, store it */
|
||||
|
||||
seeds[N].pos1 = seeds[i].pos1;
|
||||
seeds[N].pos2 = seeds[i].pos2;
|
||||
seeds[N].length = seeds[i].length;
|
||||
N++;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
AllSeeds->nFilteredDirSeeds += AllSeeds->nDirSeeds-N;
|
||||
AllSeeds->nDirSeeds=N;
|
||||
|
||||
|
||||
}
|
||||
|
||||
if(opt_inv){
|
||||
seeds = AllSeeds->invSeeds;
|
||||
|
||||
for(i=0, N=0 ;i<AllSeeds->nInvSeeds; i++){
|
||||
|
||||
if(seeds[i].pos1==-1)
|
||||
continue;
|
||||
|
||||
|
||||
j=i+1;
|
||||
|
||||
while( (seeds[j].pos1!=-1 ) &&
|
||||
(seeds[i].pos1!=-1 ) &&
|
||||
(j < AllSeeds->nInvSeeds) &&
|
||||
(seeds[j].pos1 < seeds[i].pos1+seeds[i].length))
|
||||
{
|
||||
|
||||
if(
|
||||
((seeds[j].pos2 >= seeds[i].pos2) && /* if the seeds are overlapping */
|
||||
(seeds[j].pos2 < seeds[i].pos2+seeds[i].length)) ||
|
||||
((seeds[j].pos2 + seeds[j].length >= seeds[i].pos2) &&
|
||||
(seeds[j].pos2 + seeds[j].length < seeds[i].pos2+seeds[i].length)))
|
||||
{
|
||||
|
||||
if(seeds[j].length <= seeds[i].length)
|
||||
seeds[j].pos1=seeds[j].pos2=seeds[j].length=-1; /* removed the smallest */
|
||||
else
|
||||
seeds[i].pos1=seeds[i].pos2=seeds[i].length=-1;
|
||||
}
|
||||
|
||||
j++;
|
||||
}
|
||||
|
||||
if(seeds[i].pos1!=-1)
|
||||
{ /* if this seed is not out, store it */
|
||||
|
||||
seeds[N].pos1 = seeds[i].pos1;
|
||||
seeds[N].pos2 = seeds[i].pos2;
|
||||
seeds[N].length = seeds[i].length;
|
||||
N++;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
AllSeeds->nFilteredInvSeeds += AllSeeds->nInvSeeds-N;
|
||||
AllSeeds->nInvSeeds=N;
|
||||
}
|
||||
|
||||
KMRK_compactSeeds(AllSeeds);
|
||||
|
||||
}
|
30
src/repseek/repseek-2014.09/KMRK_merge_seeds.h
Normal file
30
src/repseek/repseek-2014.09/KMRK_merge_seeds.h
Normal file
@ -0,0 +1,30 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
#ifndef KMRK_merge_seeds_h
|
||||
#define KMRK_merge_seeds_h
|
||||
|
||||
#include "KMRK_Seeds.h"
|
||||
|
||||
void KMRK_MergeSeeds(AllSeeds_type *AllSeeds,
|
||||
int8_t opt_dir,
|
||||
int8_t opt_inv);
|
||||
|
||||
|
||||
#endif /* KMRK_MergeSeeds */
|
504
src/repseek/repseek-2014.09/LICENSE.txt
Normal file
504
src/repseek/repseek-2014.09/LICENSE.txt
Normal file
@ -0,0 +1,504 @@
|
||||
GNU LESSER GENERAL PUBLIC LICENSE
|
||||
Version 2.1, February 1999
|
||||
|
||||
Copyright (C) 1991, 1999 Free Software Foundation, Inc.
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
Everyone is permitted to copy and distribute verbatim copies
|
||||
of this license document, but changing it is not allowed.
|
||||
|
||||
[This is the first released version of the Lesser GPL. It also counts
|
||||
as the successor of the GNU Library Public License, version 2, hence
|
||||
the version number 2.1.]
|
||||
|
||||
Preamble
|
||||
|
||||
The licenses for most software are designed to take away your
|
||||
freedom to share and change it. By contrast, the GNU General Public
|
||||
Licenses are intended to guarantee your freedom to share and change
|
||||
free software--to make sure the software is free for all its users.
|
||||
|
||||
This license, the Lesser General Public License, applies to some
|
||||
specially designated software packages--typically libraries--of the
|
||||
Free Software Foundation and other authors who decide to use it. You
|
||||
can use it too, but we suggest you first think carefully about whether
|
||||
this license or the ordinary General Public License is the better
|
||||
strategy to use in any particular case, based on the explanations below.
|
||||
|
||||
When we speak of free software, we are referring to freedom of use,
|
||||
not price. Our General Public Licenses are designed to make sure that
|
||||
you have the freedom to distribute copies of free software (and charge
|
||||
for this service if you wish); that you receive source code or can get
|
||||
it if you want it; that you can change the software and use pieces of
|
||||
it in new free programs; and that you are informed that you can do
|
||||
these things.
|
||||
|
||||
To protect your rights, we need to make restrictions that forbid
|
||||
distributors to deny you these rights or to ask you to surrender these
|
||||
rights. These restrictions translate to certain responsibilities for
|
||||
you if you distribute copies of the library or if you modify it.
|
||||
|
||||
For example, if you distribute copies of the library, whether gratis
|
||||
or for a fee, you must give the recipients all the rights that we gave
|
||||
you. You must make sure that they, too, receive or can get the source
|
||||
code. If you link other code with the library, you must provide
|
||||
complete object files to the recipients, so that they can relink them
|
||||
with the library after making changes to the library and recompiling
|
||||
it. And you must show them these terms so they know their rights.
|
||||
|
||||
We protect your rights with a two-step method: (1) we copyright the
|
||||
library, and (2) we offer you this license, which gives you legal
|
||||
permission to copy, distribute and/or modify the library.
|
||||
|
||||
To protect each distributor, we want to make it very clear that
|
||||
there is no warranty for the free library. Also, if the library is
|
||||
modified by someone else and passed on, the recipients should know
|
||||
that what they have is not the original version, so that the original
|
||||
author's reputation will not be affected by problems that might be
|
||||
introduced by others.
|
||||
|
||||
Finally, software patents pose a constant threat to the existence of
|
||||
any free program. We wish to make sure that a company cannot
|
||||
effectively restrict the users of a free program by obtaining a
|
||||
restrictive license from a patent holder. Therefore, we insist that
|
||||
any patent license obtained for a version of the library must be
|
||||
consistent with the full freedom of use specified in this license.
|
||||
|
||||
Most GNU software, including some libraries, is covered by the
|
||||
ordinary GNU General Public License. This license, the GNU Lesser
|
||||
General Public License, applies to certain designated libraries, and
|
||||
is quite different from the ordinary General Public License. We use
|
||||
this license for certain libraries in order to permit linking those
|
||||
libraries into non-free programs.
|
||||
|
||||
When a program is linked with a library, whether statically or using
|
||||
a shared library, the combination of the two is legally speaking a
|
||||
combined work, a derivative of the original library. The ordinary
|
||||
General Public License therefore permits such linking only if the
|
||||
entire combination fits its criteria of freedom. The Lesser General
|
||||
Public License permits more lax criteria for linking other code with
|
||||
the library.
|
||||
|
||||
We call this license the "Lesser" General Public License because it
|
||||
does Less to protect the user's freedom than the ordinary General
|
||||
Public License. It also provides other free software developers Less
|
||||
of an advantage over competing non-free programs. These disadvantages
|
||||
are the reason we use the ordinary General Public License for many
|
||||
libraries. However, the Lesser license provides advantages in certain
|
||||
special circumstances.
|
||||
|
||||
For example, on rare occasions, there may be a special need to
|
||||
encourage the widest possible use of a certain library, so that it becomes
|
||||
a de-facto standard. To achieve this, non-free programs must be
|
||||
allowed to use the library. A more frequent case is that a free
|
||||
library does the same job as widely used non-free libraries. In this
|
||||
case, there is little to gain by limiting the free library to free
|
||||
software only, so we use the Lesser General Public License.
|
||||
|
||||
In other cases, permission to use a particular library in non-free
|
||||
programs enables a greater number of people to use a large body of
|
||||
free software. For example, permission to use the GNU C Library in
|
||||
non-free programs enables many more people to use the whole GNU
|
||||
operating system, as well as its variant, the GNU/Linux operating
|
||||
system.
|
||||
|
||||
Although the Lesser General Public License is Less protective of the
|
||||
users' freedom, it does ensure that the user of a program that is
|
||||
linked with the Library has the freedom and the wherewithal to run
|
||||
that program using a modified version of the Library.
|
||||
|
||||
The precise terms and conditions for copying, distribution and
|
||||
modification follow. Pay close attention to the difference between a
|
||||
"work based on the library" and a "work that uses the library". The
|
||||
former contains code derived from the library, whereas the latter must
|
||||
be combined with the library in order to run.
|
||||
|
||||
GNU LESSER GENERAL PUBLIC LICENSE
|
||||
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
||||
|
||||
0. This License Agreement applies to any software library or other
|
||||
program which contains a notice placed by the copyright holder or
|
||||
other authorized party saying it may be distributed under the terms of
|
||||
this Lesser General Public License (also called "this License").
|
||||
Each licensee is addressed as "you".
|
||||
|
||||
A "library" means a collection of software functions and/or data
|
||||
prepared so as to be conveniently linked with application programs
|
||||
(which use some of those functions and data) to form executables.
|
||||
|
||||
The "Library", below, refers to any such software library or work
|
||||
which has been distributed under these terms. A "work based on the
|
||||
Library" means either the Library or any derivative work under
|
||||
copyright law: that is to say, a work containing the Library or a
|
||||
portion of it, either verbatim or with modifications and/or translated
|
||||
straightforwardly into another language. (Hereinafter, translation is
|
||||
included without limitation in the term "modification".)
|
||||
|
||||
"Source code" for a work means the preferred form of the work for
|
||||
making modifications to it. For a library, complete source code means
|
||||
all the source code for all modules it contains, plus any associated
|
||||
interface definition files, plus the scripts used to control compilation
|
||||
and installation of the library.
|
||||
|
||||
Activities other than copying, distribution and modification are not
|
||||
covered by this License; they are outside its scope. The act of
|
||||
running a program using the Library is not restricted, and output from
|
||||
such a program is covered only if its contents constitute a work based
|
||||
on the Library (independent of the use of the Library in a tool for
|
||||
writing it). Whether that is true depends on what the Library does
|
||||
and what the program that uses the Library does.
|
||||
|
||||
1. You may copy and distribute verbatim copies of the Library's
|
||||
complete source code as you receive it, in any medium, provided that
|
||||
you conspicuously and appropriately publish on each copy an
|
||||
appropriate copyright notice and disclaimer of warranty; keep intact
|
||||
all the notices that refer to this License and to the absence of any
|
||||
warranty; and distribute a copy of this License along with the
|
||||
Library.
|
||||
|
||||
You may charge a fee for the physical act of transferring a copy,
|
||||
and you may at your option offer warranty protection in exchange for a
|
||||
fee.
|
||||
|
||||
2. You may modify your copy or copies of the Library or any portion
|
||||
of it, thus forming a work based on the Library, and copy and
|
||||
distribute such modifications or work under the terms of Section 1
|
||||
above, provided that you also meet all of these conditions:
|
||||
|
||||
a) The modified work must itself be a software library.
|
||||
|
||||
b) You must cause the files modified to carry prominent notices
|
||||
stating that you changed the files and the date of any change.
|
||||
|
||||
c) You must cause the whole of the work to be licensed at no
|
||||
charge to all third parties under the terms of this License.
|
||||
|
||||
d) If a facility in the modified Library refers to a function or a
|
||||
table of data to be supplied by an application program that uses
|
||||
the facility, other than as an argument passed when the facility
|
||||
is invoked, then you must make a good faith effort to ensure that,
|
||||
in the event an application does not supply such function or
|
||||
table, the facility still operates, and performs whatever part of
|
||||
its purpose remains meaningful.
|
||||
|
||||
(For example, a function in a library to compute square roots has
|
||||
a purpose that is entirely well-defined independent of the
|
||||
application. Therefore, Subsection 2d requires that any
|
||||
application-supplied function or table used by this function must
|
||||
be optional: if the application does not supply it, the square
|
||||
root function must still compute square roots.)
|
||||
|
||||
These requirements apply to the modified work as a whole. If
|
||||
identifiable sections of that work are not derived from the Library,
|
||||
and can be reasonably considered independent and separate works in
|
||||
themselves, then this License, and its terms, do not apply to those
|
||||
sections when you distribute them as separate works. But when you
|
||||
distribute the same sections as part of a whole which is a work based
|
||||
on the Library, the distribution of the whole must be on the terms of
|
||||
this License, whose permissions for other licensees extend to the
|
||||
entire whole, and thus to each and every part regardless of who wrote
|
||||
it.
|
||||
|
||||
Thus, it is not the intent of this section to claim rights or contest
|
||||
your rights to work written entirely by you; rather, the intent is to
|
||||
exercise the right to control the distribution of derivative or
|
||||
collective works based on the Library.
|
||||
|
||||
In addition, mere aggregation of another work not based on the Library
|
||||
with the Library (or with a work based on the Library) on a volume of
|
||||
a storage or distribution medium does not bring the other work under
|
||||
the scope of this License.
|
||||
|
||||
3. You may opt to apply the terms of the ordinary GNU General Public
|
||||
License instead of this License to a given copy of the Library. To do
|
||||
this, you must alter all the notices that refer to this License, so
|
||||
that they refer to the ordinary GNU General Public License, version 2,
|
||||
instead of to this License. (If a newer version than version 2 of the
|
||||
ordinary GNU General Public License has appeared, then you can specify
|
||||
that version instead if you wish.) Do not make any other change in
|
||||
these notices.
|
||||
|
||||
Once this change is made in a given copy, it is irreversible for
|
||||
that copy, so the ordinary GNU General Public License applies to all
|
||||
subsequent copies and derivative works made from that copy.
|
||||
|
||||
This option is useful when you wish to copy part of the code of
|
||||
the Library into a program that is not a library.
|
||||
|
||||
4. You may copy and distribute the Library (or a portion or
|
||||
derivative of it, under Section 2) in object code or executable form
|
||||
under the terms of Sections 1 and 2 above provided that you accompany
|
||||
it with the complete corresponding machine-readable source code, which
|
||||
must be distributed under the terms of Sections 1 and 2 above on a
|
||||
medium customarily used for software interchange.
|
||||
|
||||
If distribution of object code is made by offering access to copy
|
||||
from a designated place, then offering equivalent access to copy the
|
||||
source code from the same place satisfies the requirement to
|
||||
distribute the source code, even though third parties are not
|
||||
compelled to copy the source along with the object code.
|
||||
|
||||
5. A program that contains no derivative of any portion of the
|
||||
Library, but is designed to work with the Library by being compiled or
|
||||
linked with it, is called a "work that uses the Library". Such a
|
||||
work, in isolation, is not a derivative work of the Library, and
|
||||
therefore falls outside the scope of this License.
|
||||
|
||||
However, linking a "work that uses the Library" with the Library
|
||||
creates an executable that is a derivative of the Library (because it
|
||||
contains portions of the Library), rather than a "work that uses the
|
||||
library". The executable is therefore covered by this License.
|
||||
Section 6 states terms for distribution of such executables.
|
||||
|
||||
When a "work that uses the Library" uses material from a header file
|
||||
that is part of the Library, the object code for the work may be a
|
||||
derivative work of the Library even though the source code is not.
|
||||
Whether this is true is especially significant if the work can be
|
||||
linked without the Library, or if the work is itself a library. The
|
||||
threshold for this to be true is not precisely defined by law.
|
||||
|
||||
If such an object file uses only numerical parameters, data
|
||||
structure layouts and accessors, and small macros and small inline
|
||||
functions (ten lines or less in length), then the use of the object
|
||||
file is unrestricted, regardless of whether it is legally a derivative
|
||||
work. (Executables containing this object code plus portions of the
|
||||
Library will still fall under Section 6.)
|
||||
|
||||
Otherwise, if the work is a derivative of the Library, you may
|
||||
distribute the object code for the work under the terms of Section 6.
|
||||
Any executables containing that work also fall under Section 6,
|
||||
whether or not they are linked directly with the Library itself.
|
||||
|
||||
6. As an exception to the Sections above, you may also combine or
|
||||
link a "work that uses the Library" with the Library to produce a
|
||||
work containing portions of the Library, and distribute that work
|
||||
under terms of your choice, provided that the terms permit
|
||||
modification of the work for the customer's own use and reverse
|
||||
engineering for debugging such modifications.
|
||||
|
||||
You must give prominent notice with each copy of the work that the
|
||||
Library is used in it and that the Library and its use are covered by
|
||||
this License. You must supply a copy of this License. If the work
|
||||
during execution displays copyright notices, you must include the
|
||||
copyright notice for the Library among them, as well as a reference
|
||||
directing the user to the copy of this License. Also, you must do one
|
||||
of these things:
|
||||
|
||||
a) Accompany the work with the complete corresponding
|
||||
machine-readable source code for the Library including whatever
|
||||
changes were used in the work (which must be distributed under
|
||||
Sections 1 and 2 above); and, if the work is an executable linked
|
||||
with the Library, with the complete machine-readable "work that
|
||||
uses the Library", as object code and/or source code, so that the
|
||||
user can modify the Library and then relink to produce a modified
|
||||
executable containing the modified Library. (It is understood
|
||||
that the user who changes the contents of definitions files in the
|
||||
Library will not necessarily be able to recompile the application
|
||||
to use the modified definitions.)
|
||||
|
||||
b) Use a suitable shared library mechanism for linking with the
|
||||
Library. A suitable mechanism is one that (1) uses at run time a
|
||||
copy of the library already present on the user's computer system,
|
||||
rather than copying library functions into the executable, and (2)
|
||||
will operate properly with a modified version of the library, if
|
||||
the user installs one, as long as the modified version is
|
||||
interface-compatible with the version that the work was made with.
|
||||
|
||||
c) Accompany the work with a written offer, valid for at
|
||||
least three years, to give the same user the materials
|
||||
specified in Subsection 6a, above, for a charge no more
|
||||
than the cost of performing this distribution.
|
||||
|
||||
d) If distribution of the work is made by offering access to copy
|
||||
from a designated place, offer equivalent access to copy the above
|
||||
specified materials from the same place.
|
||||
|
||||
e) Verify that the user has already received a copy of these
|
||||
materials or that you have already sent this user a copy.
|
||||
|
||||
For an executable, the required form of the "work that uses the
|
||||
Library" must include any data and utility programs needed for
|
||||
reproducing the executable from it. However, as a special exception,
|
||||
the materials to be distributed need not include anything that is
|
||||
normally distributed (in either source or binary form) with the major
|
||||
components (compiler, kernel, and so on) of the operating system on
|
||||
which the executable runs, unless that component itself accompanies
|
||||
the executable.
|
||||
|
||||
It may happen that this requirement contradicts the license
|
||||
restrictions of other proprietary libraries that do not normally
|
||||
accompany the operating system. Such a contradiction means you cannot
|
||||
use both them and the Library together in an executable that you
|
||||
distribute.
|
||||
|
||||
7. You may place library facilities that are a work based on the
|
||||
Library side-by-side in a single library together with other library
|
||||
facilities not covered by this License, and distribute such a combined
|
||||
library, provided that the separate distribution of the work based on
|
||||
the Library and of the other library facilities is otherwise
|
||||
permitted, and provided that you do these two things:
|
||||
|
||||
a) Accompany the combined library with a copy of the same work
|
||||
based on the Library, uncombined with any other library
|
||||
facilities. This must be distributed under the terms of the
|
||||
Sections above.
|
||||
|
||||
b) Give prominent notice with the combined library of the fact
|
||||
that part of it is a work based on the Library, and explaining
|
||||
where to find the accompanying uncombined form of the same work.
|
||||
|
||||
8. You may not copy, modify, sublicense, link with, or distribute
|
||||
the Library except as expressly provided under this License. Any
|
||||
attempt otherwise to copy, modify, sublicense, link with, or
|
||||
distribute the Library is void, and will automatically terminate your
|
||||
rights under this License. However, parties who have received copies,
|
||||
or rights, from you under this License will not have their licenses
|
||||
terminated so long as such parties remain in full compliance.
|
||||
|
||||
9. You are not required to accept this License, since you have not
|
||||
signed it. However, nothing else grants you permission to modify or
|
||||
distribute the Library or its derivative works. These actions are
|
||||
prohibited by law if you do not accept this License. Therefore, by
|
||||
modifying or distributing the Library (or any work based on the
|
||||
Library), you indicate your acceptance of this License to do so, and
|
||||
all its terms and conditions for copying, distributing or modifying
|
||||
the Library or works based on it.
|
||||
|
||||
10. Each time you redistribute the Library (or any work based on the
|
||||
Library), the recipient automatically receives a license from the
|
||||
original licensor to copy, distribute, link with or modify the Library
|
||||
subject to these terms and conditions. You may not impose any further
|
||||
restrictions on the recipients' exercise of the rights granted herein.
|
||||
You are not responsible for enforcing compliance by third parties with
|
||||
this License.
|
||||
|
||||
11. If, as a consequence of a court judgment or allegation of patent
|
||||
infringement or for any other reason (not limited to patent issues),
|
||||
conditions are imposed on you (whether by court order, agreement or
|
||||
otherwise) that contradict the conditions of this License, they do not
|
||||
excuse you from the conditions of this License. If you cannot
|
||||
distribute so as to satisfy simultaneously your obligations under this
|
||||
License and any other pertinent obligations, then as a consequence you
|
||||
may not distribute the Library at all. For example, if a patent
|
||||
license would not permit royalty-free redistribution of the Library by
|
||||
all those who receive copies directly or indirectly through you, then
|
||||
the only way you could satisfy both it and this License would be to
|
||||
refrain entirely from distribution of the Library.
|
||||
|
||||
If any portion of this section is held invalid or unenforceable under any
|
||||
particular circumstance, the balance of the section is intended to apply,
|
||||
and the section as a whole is intended to apply in other circumstances.
|
||||
|
||||
It is not the purpose of this section to induce you to infringe any
|
||||
patents or other property right claims or to contest validity of any
|
||||
such claims; this section has the sole purpose of protecting the
|
||||
integrity of the free software distribution system which is
|
||||
implemented by public license practices. Many people have made
|
||||
generous contributions to the wide range of software distributed
|
||||
through that system in reliance on consistent application of that
|
||||
system; it is up to the author/donor to decide if he or she is willing
|
||||
to distribute software through any other system and a licensee cannot
|
||||
impose that choice.
|
||||
|
||||
This section is intended to make thoroughly clear what is believed to
|
||||
be a consequence of the rest of this License.
|
||||
|
||||
12. If the distribution and/or use of the Library is restricted in
|
||||
certain countries either by patents or by copyrighted interfaces, the
|
||||
original copyright holder who places the Library under this License may add
|
||||
an explicit geographical distribution limitation excluding those countries,
|
||||
so that distribution is permitted only in or among countries not thus
|
||||
excluded. In such case, this License incorporates the limitation as if
|
||||
written in the body of this License.
|
||||
|
||||
13. The Free Software Foundation may publish revised and/or new
|
||||
versions of the Lesser General Public License from time to time.
|
||||
Such new versions will be similar in spirit to the present version,
|
||||
but may differ in detail to address new problems or concerns.
|
||||
|
||||
Each version is given a distinguishing version number. If the Library
|
||||
specifies a version number of this License which applies to it and
|
||||
"any later version", you have the option of following the terms and
|
||||
conditions either of that version or of any later version published by
|
||||
the Free Software Foundation. If the Library does not specify a
|
||||
license version number, you may choose any version ever published by
|
||||
the Free Software Foundation.
|
||||
|
||||
14. If you wish to incorporate parts of the Library into other free
|
||||
programs whose distribution conditions are incompatible with these,
|
||||
write to the author to ask for permission. For software which is
|
||||
copyrighted by the Free Software Foundation, write to the Free
|
||||
Software Foundation; we sometimes make exceptions for this. Our
|
||||
decision will be guided by the two goals of preserving the free status
|
||||
of all derivatives of our free software and of promoting the sharing
|
||||
and reuse of software generally.
|
||||
|
||||
NO WARRANTY
|
||||
|
||||
15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
|
||||
WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
|
||||
EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
|
||||
OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
|
||||
KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
|
||||
LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
|
||||
THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
||||
|
||||
16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
|
||||
WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
|
||||
AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
|
||||
FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
|
||||
CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
|
||||
LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
|
||||
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
|
||||
FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
|
||||
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
|
||||
DAMAGES.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
How to Apply These Terms to Your New Libraries
|
||||
|
||||
If you develop a new library, and you want it to be of the greatest
|
||||
possible use to the public, we recommend making it free software that
|
||||
everyone can redistribute and change. You can do so by permitting
|
||||
redistribution under these terms (or, alternatively, under the terms of the
|
||||
ordinary General Public License).
|
||||
|
||||
To apply these terms, attach the following notices to the library. It is
|
||||
safest to attach them to the start of each source file to most effectively
|
||||
convey the exclusion of warranty; and each file should have at least the
|
||||
"copyright" line and a pointer to where the full notice is found.
|
||||
|
||||
<one line to give the library's name and a brief idea of what it does.>
|
||||
Copyright (C) <year> <name of author>
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
|
||||
Also add information on how to contact you by electronic and paper mail.
|
||||
|
||||
You should also get your employer (if you work as a programmer) or your
|
||||
school, if any, to sign a "copyright disclaimer" for the library, if
|
||||
necessary. Here is a sample; alter the names:
|
||||
|
||||
Yoyodyne, Inc., hereby disclaims all copyright interest in the
|
||||
library `Frob' (a library for tweaking knobs) written by James Random Hacker.
|
||||
|
||||
<signature of Ty Coon>, 1 April 1990
|
||||
Ty Coon, President of Vice
|
||||
|
||||
That's all there is to it!
|
||||
|
||||
|
108
src/repseek/repseek-2014.09/Makefile
Normal file
108
src/repseek/repseek-2014.09/Makefile
Normal file
@ -0,0 +1,108 @@
|
||||
#
|
||||
# Design for Repseek
|
||||
# compile all src and output one binary
|
||||
# <amikezor>
|
||||
# April 2004
|
||||
#
|
||||
|
||||
# MACHINE = SUN
|
||||
# MACHINE = SGI
|
||||
# MACHINE = LINUX
|
||||
# MACHINE = OSF
|
||||
|
||||
MACHINE = MACOSX
|
||||
|
||||
|
||||
# ???: MALLOC=-lmalloc
|
||||
# others: MALLOC =
|
||||
# macosx debug: MALLOC = -lMallocDebug
|
||||
MALLOC=
|
||||
|
||||
|
||||
# SGI: PROTO= PROTO=1
|
||||
# others: PROTO= PROTO=0
|
||||
PROTO= PROTO=0
|
||||
|
||||
# SGI: RANLIB= touch
|
||||
# others: RANLIB= ranlib
|
||||
|
||||
RANLIB= ranlib
|
||||
|
||||
# Users can choose:
|
||||
|
||||
#CC= cc
|
||||
CC= gcc
|
||||
|
||||
|
||||
CFLAGS= -O4 -Wall
|
||||
#CFLAGS= -O2
|
||||
#CFLAGS= -pg
|
||||
#CFLAGS= -g
|
||||
|
||||
INSTALLDIR = $$HOME/bin
|
||||
|
||||
##### defined
|
||||
|
||||
SHELL = bash
|
||||
|
||||
LDFLAGS = -lm $(MALLOC)
|
||||
#LDFLAGS = -g -lm $(MALLOC)
|
||||
|
||||
SRC = sort.c\
|
||||
help.c\
|
||||
output.c\
|
||||
filter2seq.c \
|
||||
filter.c \
|
||||
families.c \
|
||||
families_2seqs.c\
|
||||
memory.c\
|
||||
memory_align.c\
|
||||
sequence.c\
|
||||
readfst.c\
|
||||
lmin.c \
|
||||
smin.c\
|
||||
KMRK.c \
|
||||
KMRK_Seeds.c \
|
||||
KMRK_merge_seeds.c\
|
||||
KMRK_mask.c\
|
||||
align_matrix.c\
|
||||
align_seeds.c\
|
||||
align_di.c\
|
||||
align_blast2like.c\
|
||||
read_seeds.c
|
||||
|
||||
|
||||
OBJ = $(SRC:.c=.o)
|
||||
|
||||
|
||||
## Rules
|
||||
|
||||
default:
|
||||
@echo "++ Repeats Search Engines ++"
|
||||
@echo "edit Makefile and set MACHINE, RANLIB, PROTO, CC, CFLAGS and eventually MALLOC"
|
||||
@echo " "
|
||||
@echo "To compile: make repseek"
|
||||
@echo "To clean: make clean"
|
||||
|
||||
|
||||
%.o: %.c
|
||||
$(CC) $(CFLAGS) -D$(MACHINE) -c -o $@ $<;
|
||||
|
||||
|
||||
repseek: $(OBJ) main_repseek.c
|
||||
$(CC) $(CFLAGS) -D$(MACHINE) -o $@ $(OBJ) main_repseek.c $(LDFLAGS);
|
||||
|
||||
|
||||
install: repseek
|
||||
cp repseek $(INSTALLDIR)
|
||||
|
||||
|
||||
clean:
|
||||
+rm -f *.o repseek
|
||||
|
||||
archive:
|
||||
\rm -rf RepSeek; mkdir RepSeek
|
||||
cp -r $(SRC) *.h Makefile main_repseek.c *.txt RepSeek;
|
||||
tmp=`date | awk '{print $$3$$2$$NF}'`;tar czvf RepSeek.$$tmp.tgz RepSeek;
|
||||
\rm -rf RepSeek
|
||||
|
32
src/repseek/repseek-2014.09/README.txt
Normal file
32
src/repseek/repseek-2014.09/README.txt
Normal file
@ -0,0 +1,32 @@
|
||||
/*
|
||||
<-- RepSeek, a repeat search engine -->
|
||||
Some help to compile the repseek program
|
||||
*/
|
||||
|
||||
1. Untar, and go the newly created directory repseek
|
||||
|
||||
2. Edit the Makefile, and fill the fields MACHINE, MALLOC, PROTO, INSTALLDIR
|
||||
and change any other field you want to.
|
||||
|
||||
3. Type 'make repseek' to compile the repseek binary, 'make install' to install the program
|
||||
in your ~/bin and 'make clean' to clean all object and the binary.
|
||||
|
||||
4. You can create an archive of RepSeek by typing 'make archive'
|
||||
|
||||
5. General usage is 'repseek [-v|-h|-H] [ opts ] { core_value } file.fst [file2.fst]'
|
||||
|
||||
Core values (select either -l, -p or -P option)
|
||||
-l lmin : set a minimum length for the seeds detection (no pvalue). Also usuable along with -P.
|
||||
-p prob : set a p-value that gives a minimum seed length (Karlin & Ost equation)
|
||||
-P prob : set a p-value that gives a minimum repeat score (Waterman & Vingron regression)
|
||||
|
||||
Information
|
||||
-v : Print 'v'ersion and exit
|
||||
-h : Print a short 'h'elp and exit
|
||||
-H : Print a larger 'H'elp on a specific mode and exit
|
||||
|
||||
|
||||
6. For more details on repseek insights, please refer to the repseek documentation
|
||||
|
||||
7. For License, Repseek is under Lesser GPL. For more details, see the License.txt file
|
||||
|
67
src/repseek/repseek-2014.09/align.h
Normal file
67
src/repseek/repseek-2014.09/align.h
Normal file
@ -0,0 +1,67 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
#ifndef _ALIGN_H_
|
||||
#define _ALIGN_H_
|
||||
|
||||
/*
|
||||
Macros to turn any char from A-Z to a symb 0-26 (for S&W scoring matrix)
|
||||
*/
|
||||
#define CHAR2SYMB( A ) ( (A) - 65 )
|
||||
#define SYMB2CHAR( i ) ( (i) + 65 )
|
||||
|
||||
#define Match(A,B) pScoring->matrix[ CHAR2SYMB(seq1[ (A)-1 ]) ][ CHAR2SYMB(seq2[ (B)-1 ]) ]
|
||||
|
||||
|
||||
/*
|
||||
The Seed2Repeat part
|
||||
*/
|
||||
Repeats align_seeds(AllSeeds_type *AllSeeds, char *sequence, float Xg, float gap_open, float gap_ext,\
|
||||
char matrix_type, int16_t Lmin, int8_t opt_dir, int8_t opt_inv, int8_t opt_overlap, float merge_repeats);
|
||||
|
||||
Repeats align_seeds_2seq(AllSeeds_type *AllSeeds, char *seq1, char *seq2, float Xg, float gap_open, float gap_ext, char matrix_type,\
|
||||
int16_t Lmin, int8_t opt_dir, int8_t opt_inv, float merge_repeats);
|
||||
|
||||
|
||||
Rep alignd(int32_t pos1, int32_t pos2, int32_t len, char *sequence, int32_t sizeseq,
|
||||
float Xg, SCORING *pScoring, RESULTS *pResults , int8_t opt_overlap);
|
||||
|
||||
Rep aligni(int32_t pos1, int32_t pos2, int32_t len, char *sequence, char *invsequence, int32_t sizeseq,\
|
||||
float Xg, SCORING *pScoring, RESULTS *pResults );
|
||||
|
||||
Rep alignd_2seq(int32_t pos1, int32_t pos2, int32_t len, char *seq1, char *seq2, \
|
||||
int32_t size1, int32_t size2, float Xg, SCORING *pScoring, RESULTS *pResults );
|
||||
|
||||
Rep aligni_2seq(int32_t pos1, int32_t pos2, int32_t len, char *seq1, char *invseq2, \
|
||||
int32_t size1, int32_t size2, float Xg, SCORING *pScoring, RESULTS *pResults );
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
The alignemnt itself
|
||||
*/
|
||||
|
||||
void align_blast2like(char *seq1, char *seq2, float Xg, SCORING *pScoring,\
|
||||
RESULTS *pResults, char direction, int32_t max_row, int32_t max_col, int32_t diff_max);
|
||||
void log_matrix(char *sequence, float gap_open, float gap_ext, SCORING *pScoring );
|
||||
void identity_matrix(char *sequence, float gap_open, float gap_ext, SCORING *pScoring);
|
||||
|
||||
|
||||
#endif /*_ALIGN_H_*/
|
445
src/repseek/repseek-2014.09/align_blast2like.c
Normal file
445
src/repseek/repseek-2014.09/align_blast2like.c
Normal file
@ -0,0 +1,445 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/******
|
||||
file : align_blast2like.c
|
||||
function : align 2 sequence by blast2 like algo
|
||||
first 0,0 is a MATCH
|
||||
|
||||
created : 07 jul 2003
|
||||
modified : July 2003, July 2004: turn int_32 scores into doubles;
|
||||
Aug 2004: Update the BestScore when it is ">=" (instead of ">")
|
||||
Sept 2005: store the number of steps
|
||||
Jan 2006: correct a bug in memory reading
|
||||
Mar 2006: correct a bug whenever only 1 cell was allowed + change align_blast2like (use max_row and maw_col instead of dist_max)
|
||||
|
||||
author : amikezor
|
||||
*****/
|
||||
|
||||
#include "repseek_types.h"
|
||||
#include "memory_align.h"
|
||||
#include "align.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
|
||||
/*
|
||||
Determine three scores (if any available)
|
||||
*/
|
||||
static int8_t IsScoreLeft(int32_t row, int32_t col, RESULTS pResults)
|
||||
{
|
||||
if( col>pResults.Segment_begin[row] )
|
||||
return 1;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
static int8_t IsScoreAbove(int32_t row, int32_t col, RESULTS pResults)
|
||||
{
|
||||
if( row>0 && col >= pResults.Segment_begin[row-1] && col <= pResults.Segment_end[row-1] )
|
||||
return 1;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
static int8_t IsScoreDiag(int32_t row, int32_t col, RESULTS pResults)
|
||||
{
|
||||
if( row>0 && col > pResults.Segment_begin[row-1] && col <= pResults.Segment_end[row-1]+1 )
|
||||
return 1;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
In case of update bestscore infos
|
||||
*/
|
||||
static void update_Bestscore(int32_t row, int32_t col, RESULTS *pResults){
|
||||
pResults->pBestScore = pResults->pscore;
|
||||
pResults->BestScore_row = row;
|
||||
pResults->BestScore_col = col;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
The fill 'matrix' function - even if there is no more matrix here
|
||||
*/
|
||||
#define BEGIN pResults->Segment_begin
|
||||
#define END pResults->Segment_end
|
||||
#define CURRENT pResults->pscore
|
||||
|
||||
#define nABOVE pResults->Top
|
||||
#define nLEFT pResults->Left
|
||||
|
||||
#define SCORE_FROM_LEFT( X ) (*(CURRENT - X))
|
||||
#define SCORE_FROM_ABOVE( row, col ) (*(CURRENT - (col-BEGIN[row]) - (END[row-1]-col+1) ) )
|
||||
#define SCORE_FROM_DIAG( row, col ) (*(CURRENT - (col-BEGIN[row]) - (END[row-1]-col+1) -1 ) )
|
||||
|
||||
#define MATCH_F(A,B) ( pScoring->matrix[ CHAR2SYMB(seq1[ (A) ]) ][ CHAR2SYMB(seq2[ (B) ]) ])
|
||||
#define MATCH_B(A,B) ( pScoring->matrix[ CHAR2SYMB( *(seq1-(A)) ) ][ CHAR2SYMB( *(seq2-(B)) ) ] )
|
||||
#define MATCH(A,B) ( (direction=='f')? MATCH_F(A,B):MATCH_B(A,B) )
|
||||
|
||||
|
||||
/*
|
||||
In this version,
|
||||
Sequence 2 is on TOP and Sequence 1 on the LEFT
|
||||
In the trace matrix, a 0 means diagonal,
|
||||
#>0 means # steps from above and
|
||||
#<0 translates into # steps from left
|
||||
*/
|
||||
|
||||
#define SCORE_TOP pResults->F
|
||||
|
||||
static void fill_matrices(char *seq1, char *seq2, SCORING *pScoring, RESULTS *pResults,
|
||||
float Xg, int32_t max_col, int32_t max_row, int32_t diff_max, char direction)
|
||||
{
|
||||
|
||||
int32_t col, row; /* current row and col */
|
||||
|
||||
double Score_left=0.0; /* score coming from left */
|
||||
double Score_diag=0.0; /* score coming from diagonal */
|
||||
|
||||
|
||||
int8_t IsDiag, /* markers to check if the cell in Diag, Above or Left does exist ? */
|
||||
IsAbove,
|
||||
IsLeft;
|
||||
|
||||
|
||||
CURRENT=pResults->scores; /* set the current score at the begining of the scores */
|
||||
row=col=0; /* set the col and row */
|
||||
BEGIN[0]=END[0]=0; /* initialize the arrays */
|
||||
|
||||
|
||||
/*
|
||||
Set the first Score - necessarily a match - the last/first bp of the seed
|
||||
*/
|
||||
*(CURRENT)=MATCH(0,0);
|
||||
pResults->traces[ 0 ] = 0;
|
||||
update_Bestscore(row, col, pResults);
|
||||
nABOVE[0]=0;
|
||||
SCORE_TOP[0]=*(CURRENT);
|
||||
/* printf("%c --> [%d,%d]:%f ",direction, row,col,*CURRENT); */
|
||||
/*
|
||||
First line
|
||||
*/
|
||||
col=1;
|
||||
CURRENT=pResults->scores+1;
|
||||
|
||||
while( *(CURRENT-1) > *pResults->pBestScore-Xg && col<max_col ){
|
||||
|
||||
if( CURRENT - pResults->scores >= pResults->max_scores) /* Check Mem and if needed realloc */
|
||||
ReallocScores(pResults, 0);
|
||||
if( col >= pResults->max_col )
|
||||
ReallocAbove(pResults, 0);
|
||||
|
||||
|
||||
if(col==1) *CURRENT = *(CURRENT-1) + pScoring->gap_open;
|
||||
else *CURRENT = *(CURRENT-1) + pScoring->gap_ext;
|
||||
|
||||
pResults->traces[ (CURRENT-pResults->scores) ] = -1;
|
||||
|
||||
nABOVE[col]=0; /* set the nABOVE[col] to 0 */
|
||||
SCORE_TOP[col]=*(CURRENT);
|
||||
/*fprintf(stderr,"[%d,%d]:%f vs. %f (col-1: %f) / Trace=%d\n", row,col,*CURRENT, *pResults->pBestScore-Xg, *(CURRENT-1),pResults->traces[ (CURRENT-pResults->scores) ]); */
|
||||
col++; /* increase col and current score */
|
||||
CURRENT++;
|
||||
|
||||
if(direction == 'b' && col > diff_max )break;
|
||||
|
||||
}
|
||||
|
||||
|
||||
END[row]=col-1; /* ignore the last score (inferior to the limit) */
|
||||
/* the last CURRENT value will be erased on next value */
|
||||
/* printf("Row %d was from %d to %d\n",row, BEGIN[row], END[row]); */
|
||||
|
||||
/*
|
||||
Then fill the whole 'matrix' - the matrix being an array of stripes
|
||||
*/
|
||||
while( BEGIN[row] < END[row] && row < max_row ){
|
||||
|
||||
row++;
|
||||
|
||||
if(row >= pResults->max_row)
|
||||
ReallocBegEnd(pResults, 0);
|
||||
|
||||
END[row]=BEGIN[row]=BEGIN[row-1];
|
||||
col=BEGIN[row];
|
||||
|
||||
|
||||
/*
|
||||
lets go for the row/segment
|
||||
*/
|
||||
while( col <= max_col ){
|
||||
|
||||
if( diff_max>0 && ((direction=='f' && row-col >= diff_max) || /* do not want chromosome aligned with itself */
|
||||
(direction=='b' && col-row >= diff_max)) )
|
||||
break;
|
||||
|
||||
if(diff_max<0 && col+row > -(diff_max) ) /* do not want plaindromes aligned with itself */
|
||||
break;
|
||||
|
||||
|
||||
if( CURRENT - pResults->scores >= pResults->max_scores) /* Check Mem and if needed realloc */
|
||||
ReallocScores(pResults, 0);
|
||||
|
||||
if( col >= pResults->max_col)
|
||||
ReallocAbove(pResults, 0);
|
||||
|
||||
IsDiag=IsScoreDiag(row, col, *pResults); /* Look for each potential score */
|
||||
IsAbove=IsScoreAbove(row, col, *pResults);
|
||||
IsLeft=IsScoreLeft(row, col, *pResults);
|
||||
|
||||
|
||||
/*
|
||||
Get the three scores (if they exist)
|
||||
as well as the three
|
||||
*/
|
||||
if( IsLeft ){
|
||||
|
||||
if( nLEFT>0 && Score_left+pScoring->gap_ext > SCORE_FROM_LEFT(1)+pScoring->gap_open ){
|
||||
Score_left += pScoring->gap_ext;
|
||||
nLEFT++;
|
||||
}
|
||||
else{
|
||||
Score_left = SCORE_FROM_LEFT(1) + pScoring->gap_open;
|
||||
nLEFT=1;
|
||||
}
|
||||
}
|
||||
else
|
||||
nLEFT = 0;
|
||||
|
||||
|
||||
if( IsAbove ){
|
||||
|
||||
if( nABOVE[col]>0 && SCORE_TOP[col]+pScoring->gap_ext > SCORE_FROM_ABOVE(row,col)+ pScoring->gap_open){
|
||||
SCORE_TOP[col] += pScoring->gap_ext;
|
||||
nABOVE[col]++;
|
||||
}
|
||||
else{
|
||||
SCORE_TOP[col] = SCORE_FROM_ABOVE(row,col) + pScoring->gap_open;
|
||||
nABOVE[col]=1;
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
nABOVE[col] = 0;
|
||||
|
||||
if( IsDiag )
|
||||
Score_diag=SCORE_FROM_DIAG(row,col)+MATCH(row,col);
|
||||
|
||||
/*
|
||||
Compare the existing Scores
|
||||
*/
|
||||
if( IsDiag && /* The SCORE_DIAG is defined ..., then */
|
||||
(
|
||||
(!IsLeft && !IsAbove ) || /* no other scores are defined */
|
||||
(IsLeft && !IsAbove && Score_diag>=Score_left ) || /* bigger than Score_left */
|
||||
(!IsLeft && IsAbove && Score_diag>=SCORE_TOP[col] ) || /* bigger than SCORE_TOP */
|
||||
(IsLeft && IsAbove && Score_diag>=Score_left && Score_diag>=SCORE_TOP[col] ) /* bigger than both */
|
||||
)
|
||||
){
|
||||
*CURRENT = Score_diag; /* diag is eq or sup to others */
|
||||
pResults->traces[ CURRENT-pResults->scores ] = 0;
|
||||
}
|
||||
else if( IsAbove && /* The SCORE_ABOVE is defined ..., then */
|
||||
(
|
||||
(!IsLeft) || /* no other value */
|
||||
(IsLeft && SCORE_TOP[col]>Score_left ) /* bigger than Score_left */
|
||||
)
|
||||
|
||||
){
|
||||
*CURRENT = SCORE_TOP[col]; /* Score_top is better than Score_left */
|
||||
pResults->traces[ CURRENT-pResults->scores ] = nABOVE[col];
|
||||
|
||||
}
|
||||
else{
|
||||
*CURRENT = Score_left; /* unknown : we don't know: both way are equal */
|
||||
pResults->traces[ CURRENT-pResults->scores ] = -nLEFT; /* arbitrarily, choose the left one */
|
||||
}
|
||||
|
||||
/* printf("[%d:%d]: Curr=%f vs. Best=%f ; Trace= %d\n\n",row,col,*CURRENT, *pResults->pBestScore-Xg, pResults->traces[ CURRENT-pResults->scores ]); */
|
||||
|
||||
/*
|
||||
Compare the current score to the limit
|
||||
*/
|
||||
if( *CURRENT > *pResults->pBestScore - (double)Xg ){ /* If it is bigger than the limit */
|
||||
/* fprintf(stderr,"[%d,%d]:%f;%c ",row,col,*CURRENT, pResults->traces[ CURRENT-pResults->scores ]); */
|
||||
if( *CURRENT >= *pResults->pBestScore ) /* keep it, and eventually set it as the Best_Score */
|
||||
update_Bestscore(row, col, pResults);
|
||||
|
||||
END[row]=col; /* if match it ends at least here */
|
||||
CURRENT++; /* get the score */
|
||||
}
|
||||
else{ /* see where we are */
|
||||
/* printf("%ld:%ld -> last_row= %ld\n", row, col, END[row-1]); */
|
||||
if( col == BEGIN[row] )
|
||||
BEGIN[row]=col+1; /* set begin at next and ... */
|
||||
|
||||
if( col > BEGIN[row] && col <= END[row-1]){
|
||||
CURRENT++; /* then look at the next score */
|
||||
}
|
||||
|
||||
if(col > END[row-1])
|
||||
break; /* then just stop that loop */
|
||||
}
|
||||
col++;
|
||||
|
||||
}
|
||||
/* printf("Row %d was from %d to %d\n\n",row, BEGIN[row], END[row]);*/
|
||||
CURRENT-=(col-END[row]-1);
|
||||
/* fprintf(stderr,"/// BestScore= %f\n",*pResults->pBestScore); */
|
||||
}
|
||||
|
||||
CURRENT--;
|
||||
pResults->nSegment=row+1; /* Just remember how many segments (rows) */
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
The traceback function
|
||||
it always read the trace matrix in this new form
|
||||
*/
|
||||
static void traceback( char *seq1, char *seq2, SCORING * pScoring, RESULTS *pResults , char direction){
|
||||
|
||||
int32_t trace=0; /* the current trace */
|
||||
|
||||
int32_t row, /* current row and cols */
|
||||
col;
|
||||
|
||||
int32_t count=0; /* the total number of step of traceback */
|
||||
|
||||
int32_t *ptraces= pResults->traces + (pResults->pBestScore - pResults->scores); /* where we start, a pointer to the trace arrays */
|
||||
|
||||
int n;
|
||||
|
||||
|
||||
if(direction == 'f')
|
||||
pResults->traceback=pResults->traceback_f;
|
||||
else if(direction == 'b')
|
||||
pResults->traceback=pResults->traceback_b;
|
||||
else
|
||||
fprintf(stderr,"traceback: direction is either 'f' or 'b', bye\n"),exit(4);
|
||||
|
||||
|
||||
row = pResults->BestScore_row; /* set cursor at the extension end -- ! 0,0 if no similarity -- */
|
||||
col = pResults->BestScore_col;
|
||||
|
||||
pResults->matches=0;
|
||||
pResults->alignment_len=0;
|
||||
|
||||
while( row>0 || col>0 ){
|
||||
|
||||
|
||||
/*
|
||||
Set trace
|
||||
*/
|
||||
trace = *ptraces; /* read the current trace */
|
||||
|
||||
|
||||
/*
|
||||
fprintf(stderr,"[%d,%d] = %ld\n", row, col, trace);
|
||||
if(row<-2 || col<-2)
|
||||
exit(1);
|
||||
*/
|
||||
|
||||
|
||||
if(trace == 0){ /* if it is diagonal */
|
||||
|
||||
if( ( direction=='f' && *(seq1+row)==*(seq2+col)) || \
|
||||
( direction=='b' && *(seq1-row)==*(seq2-col)) )
|
||||
(pResults->matches)++;
|
||||
|
||||
ptraces -= (col-BEGIN[row]) + (END[row-1]-col+1) + 1;
|
||||
|
||||
row--;
|
||||
col--;
|
||||
|
||||
pResults->traceback[count++]= 'd';
|
||||
|
||||
}
|
||||
else if(trace < 0 ){ /* we are going to left */
|
||||
|
||||
|
||||
ptraces += trace;
|
||||
col += trace; /* think that trace is negative */
|
||||
|
||||
for(n=0;n<ABS(trace);n++)
|
||||
pResults->traceback[count++]= 'l';
|
||||
|
||||
}
|
||||
else { /* choose above by default */
|
||||
|
||||
for(n=0 ; n<trace ; n++){
|
||||
ptraces -= (col-BEGIN[row]) + (END[row-1]-col+1);
|
||||
row--;
|
||||
pResults->traceback[count++]= 'a';
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
if(count >= pResults->max_alignlen-1)
|
||||
ReallocTraceback(pResults, 0, direction);
|
||||
|
||||
|
||||
}
|
||||
|
||||
pResults->traceback[count++] = 'd'; /* add the last bit from the first cell to the previous */
|
||||
pResults->matches++; /* it has to be a match --it is the seed end-- */
|
||||
|
||||
pResults->alignment_len = count;
|
||||
pResults->traceback[count] = 0;
|
||||
|
||||
}
|
||||
|
||||
#undef CURRENT
|
||||
|
||||
#undef nABOVE
|
||||
#undef nLEFT
|
||||
|
||||
#undef BEGIN
|
||||
#undef END
|
||||
#undef SCORE_FROM_LEFT
|
||||
#undef SCORE_FROM_ABOVE
|
||||
#undef SCORE_DIAG
|
||||
|
||||
#undef MATCH_F
|
||||
#undef MATCH_B
|
||||
#undef MATCH
|
||||
|
||||
/*
|
||||
If diff_max is negatif, then we are in inverted and do not want palindromes
|
||||
otherwise diff_max is positive and is defined to avoid aligning the chromosome with itself
|
||||
row is seq1 and col is seq2
|
||||
*/
|
||||
void align_blast2like(char *seq1, char *seq2, float Xg, SCORING *pScoring, RESULTS *pResults, char direction, int32_t max_row, int32_t max_col, int32_t diff_max){
|
||||
|
||||
|
||||
if(direction != 'f' && direction != 'b')
|
||||
fprintf(stderr,"align_blast2like: align 'f'orward or 'b'ackward, bye\n"),exit(4);
|
||||
|
||||
fill_matrices(seq1, seq2, pScoring, pResults, Xg, max_col, max_row, diff_max, direction);
|
||||
traceback( seq1, seq2, pScoring, pResults, direction);
|
||||
|
||||
}
|
||||
|
||||
|
435
src/repseek/repseek-2014.09/align_di.c
Normal file
435
src/repseek/repseek-2014.09/align_di.c
Normal file
@ -0,0 +1,435 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/******
|
||||
file : align_di.c
|
||||
function : get a seeds, align it forward and backward and return the aligned repeat
|
||||
|
||||
created : Apr 11 2001
|
||||
modif : Jul 2003,Feb 2004, April 2004, July 2004: turn scores into doubles
|
||||
|
||||
author : amikezor
|
||||
*****/
|
||||
|
||||
|
||||
#include "repseek_types.h"
|
||||
#include "align.h"
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
|
||||
/*
|
||||
Get the score of a seed
|
||||
first and ast position are not taken into account since they are used in the alignments
|
||||
*/
|
||||
static double score_seed(int32_t pos, int32_t len, char *sequence, SCORING *pScoring){
|
||||
|
||||
char *pseq; /* pointer to the seqeunce */
|
||||
double score=0; /* the score */
|
||||
int32_t index=0; /* the symbol of the character in the reward matrix */
|
||||
|
||||
pseq = sequence+pos;
|
||||
|
||||
while(pseq < sequence+pos+len-1){
|
||||
|
||||
index = CHAR2SYMB(*pseq);
|
||||
score += pScoring->matrix[index][index];
|
||||
pseq++;
|
||||
}
|
||||
|
||||
return score;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
The repeat is a palindrome
|
||||
*/
|
||||
static Rep Palindome(int32_t pos1, int32_t pos2, int32_t len, char *sequence, SCORING *pScoring){
|
||||
|
||||
Rep aligned_seed;
|
||||
|
||||
aligned_seed.seed_pos1 = pos1;
|
||||
aligned_seed.seed_pos2 = pos2;
|
||||
aligned_seed.seed_len = len;
|
||||
aligned_seed.pos1 = pos1;
|
||||
aligned_seed.len1 = len/2;
|
||||
aligned_seed.pos2 = pos1+aligned_seed.len1;
|
||||
aligned_seed.len2 = len/2;
|
||||
aligned_seed.match = len/2;
|
||||
aligned_seed.align_len = len/2;
|
||||
strcpy(aligned_seed.type, "Palindrome");
|
||||
aligned_seed.score = score_seed(pos1, len, sequence, pScoring);
|
||||
|
||||
return aligned_seed;
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
Init the repeats value with a seed
|
||||
first and last position are removed since they are used in the alignements
|
||||
*/
|
||||
static void InitRep(int32_t pos1, int32_t pos2, int32_t len, double ScoreSeed, Rep *aligned_seed ){
|
||||
|
||||
aligned_seed->pos1=pos1+1;
|
||||
aligned_seed->pos2=pos2+1;
|
||||
|
||||
aligned_seed->len1= len-2; /* len of seed + aligned_genomik */
|
||||
aligned_seed->len2= len-2;
|
||||
|
||||
aligned_seed->seed_pos1 = pos1;
|
||||
aligned_seed->seed_pos2 = pos2;
|
||||
aligned_seed->seed_len = len;
|
||||
|
||||
aligned_seed->match = len-2;
|
||||
aligned_seed->align_len = len-2;
|
||||
aligned_seed->score = ScoreSeed;
|
||||
}
|
||||
|
||||
/*
|
||||
Update the repeats from we have on the right - forward-
|
||||
Only for direct repeats
|
||||
*/
|
||||
static void UpdateRepRight(Rep *aligned_seed , RESULTS *pResults){
|
||||
|
||||
aligned_seed->len1 += pResults->BestScore_row+1; /* len of seed + aligned_genomik */
|
||||
aligned_seed->len2 += pResults->BestScore_col+1;
|
||||
|
||||
aligned_seed->match += pResults->matches;
|
||||
aligned_seed->align_len += pResults->alignment_len;
|
||||
aligned_seed->score += *pResults->pBestScore;
|
||||
}
|
||||
|
||||
/*
|
||||
Update the repeats from we have on the left - backward -
|
||||
Only for direct repeats
|
||||
*/
|
||||
static void UpdateRepLeft(Rep *aligned_seed , RESULTS *pResults){
|
||||
|
||||
aligned_seed->pos1 -= pResults->BestScore_row+1; /* len of seed + aligned_genomik */
|
||||
aligned_seed->pos2 -= pResults->BestScore_col+1;
|
||||
|
||||
aligned_seed->len1 += pResults->BestScore_row+1; /* len of seed + aligned_genomik */
|
||||
aligned_seed->len2 += pResults->BestScore_col+1;
|
||||
|
||||
aligned_seed->match += pResults->matches;
|
||||
aligned_seed->align_len += pResults->alignment_len;
|
||||
aligned_seed->score += *pResults->pBestScore;
|
||||
}
|
||||
|
||||
/*
|
||||
Same functions but for inverted repeats
|
||||
*/
|
||||
static void UpdateRepRightInv(Rep *aligned_seed , RESULTS *pResults){
|
||||
|
||||
aligned_seed->len1 += pResults->BestScore_row+1; /* len of seed + aligned_genomik */
|
||||
|
||||
aligned_seed->pos2 -= pResults->BestScore_col+1;
|
||||
aligned_seed->len2 += pResults->BestScore_col+1;
|
||||
|
||||
aligned_seed->match += pResults->matches;
|
||||
aligned_seed->align_len += pResults->alignment_len;
|
||||
aligned_seed->score += *pResults->pBestScore;
|
||||
}
|
||||
static void UpdateRepLeftInv(Rep *aligned_seed , RESULTS *pResults){
|
||||
|
||||
aligned_seed->pos1 -= pResults->BestScore_row+1; /* len of seed + aligned_genomik */
|
||||
|
||||
aligned_seed->len1 += pResults->BestScore_row+1; /* len of seed + aligned_genomik */
|
||||
aligned_seed->len2 += pResults->BestScore_col+1;
|
||||
|
||||
aligned_seed->match += pResults->matches;
|
||||
aligned_seed->align_len += pResults->alignment_len;
|
||||
aligned_seed->score += *pResults->pBestScore;
|
||||
}
|
||||
|
||||
|
||||
/***
|
||||
The function that return an aligned Rep from a cupple of seed information
|
||||
Moreover all Struct have to be init before
|
||||
***/
|
||||
Rep alignd(int32_t pos1, int32_t pos2, int32_t len, char *sequence, int32_t sizeseq ,
|
||||
float Xg, SCORING *pScoring, RESULTS *pResults , int8_t opt_overlap){
|
||||
|
||||
|
||||
Rep aligned_seed; /* the repeat */
|
||||
char *pseq1, *pseq2; /* pointer for the first and the second copy */
|
||||
double ScoreSeed=0; /* score of the seed */
|
||||
int32_t max_col=0, max_row=0; /* the dist max from the edge of the sequence max for rwo and col */
|
||||
int32_t diff_max =0; /* the maximum difference between row and col to avoid aligning the chr with itself */
|
||||
|
||||
|
||||
|
||||
/*
|
||||
Init by the seed
|
||||
*/
|
||||
|
||||
ScoreSeed=score_seed(pos1+1, len-1, sequence, pScoring); /* pos1+1 and len1-1, since first bases are used into the alignement */
|
||||
InitRep(pos1, pos2, len, ScoreSeed, &aligned_seed );
|
||||
diff_max = pos2-pos1; /* if pos1 is bigger than pos2, then you align the chromosome with itself */
|
||||
|
||||
|
||||
|
||||
/*
|
||||
If the seed is already overlapping and we do not want it
|
||||
*/
|
||||
|
||||
if(opt_overlap == 0 && aligned_seed.seed_pos1+aligned_seed.seed_len >= aligned_seed.seed_pos2){
|
||||
|
||||
len=(pos2-pos1);
|
||||
ScoreSeed=score_seed(pos1, len, sequence, pScoring);
|
||||
InitRep(pos1-1, pos2-1, len+2, ScoreSeed, &aligned_seed );
|
||||
strcpy(aligned_seed.type, "Tandem");
|
||||
return aligned_seed;
|
||||
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Align the right side -- forward --
|
||||
*/
|
||||
pseq1 = sequence+pos1+len-1; /* set the sequences */
|
||||
pseq2 = sequence+pos2+len-1;
|
||||
|
||||
|
||||
max_col = sizeseq-(pos2+len-1)-1; /* you cannot align after the chromosome end */
|
||||
max_row = sizeseq-(pos1+len-1)-1;
|
||||
|
||||
if(opt_overlap == 0)
|
||||
max_row = MIN2(pos2-(pos1+len-1) -1 , max_row); /* here, you cannot go inside the next copy */
|
||||
|
||||
|
||||
|
||||
align_blast2like(pseq1, pseq2, Xg, pScoring, pResults, 'f', max_row, max_col, diff_max);
|
||||
UpdateRepRight(&aligned_seed, pResults);
|
||||
|
||||
|
||||
/*
|
||||
Align the left side -- backward --
|
||||
*/
|
||||
pseq1 = sequence+pos1; /* set the sequences */
|
||||
pseq2 = sequence+pos2;
|
||||
|
||||
max_row = pos1;
|
||||
max_col = pos2;
|
||||
|
||||
if(opt_overlap == 0)
|
||||
max_col = MIN2( aligned_seed.pos2 - (aligned_seed.pos1 + aligned_seed.len1 - 1) -2 , max_col );
|
||||
|
||||
align_blast2like(pseq1, pseq2, Xg, pScoring, pResults, 'b', max_row, max_col, diff_max);
|
||||
|
||||
UpdateRepLeft(&aligned_seed, pResults);
|
||||
|
||||
if( aligned_seed.pos2 < aligned_seed.pos1+aligned_seed.len1 )
|
||||
strcpy(aligned_seed.type, "Overlap");
|
||||
else if( aligned_seed.pos2 == aligned_seed.pos1+aligned_seed.len1 )
|
||||
strcpy(aligned_seed.type, "Tandem");
|
||||
else if( aligned_seed.pos2 - (aligned_seed.pos1+aligned_seed.len1) < 1000 )
|
||||
strcpy(aligned_seed.type, "Close" );
|
||||
else
|
||||
strcpy(aligned_seed.type, "Distant" );
|
||||
|
||||
return aligned_seed; /* return the repeat */
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
Align some Inverted repeat
|
||||
*/
|
||||
Rep aligni(int32_t pos1, int32_t pos2, int32_t len, char *sequence, char *invsequence,\
|
||||
int32_t sizeseq ,float Xg, SCORING *pScoring, RESULTS *pResults ){
|
||||
|
||||
|
||||
Rep aligned_seed; /* the repeat */
|
||||
char *pseq1, *pseq2; /* pointer for the first and the second copy */
|
||||
double ScoreSeed=0; /* score of the seed */
|
||||
int32_t max_row=0, max_col=0; /* the dist max from the edge of the sequence max for rwo and col */
|
||||
int32_t diff_max =0; /* the maximum difference between row and col to
|
||||
avoid aligning a palindrome with itself. !! For Inverted it takes NEGATIVE Values !! */
|
||||
|
||||
/*
|
||||
Init by th seed
|
||||
*/
|
||||
ScoreSeed=score_seed(pos1+1, len-1, sequence, pScoring); /* pos1+1 and len1-1, since first bases are used into the alignement */
|
||||
InitRep(pos1, pos2, len, ScoreSeed, &aligned_seed );
|
||||
|
||||
if(pos1 == pos2){
|
||||
aligned_seed = Palindome(pos1, pos2, len, sequence, pScoring);
|
||||
return aligned_seed;
|
||||
}
|
||||
/*
|
||||
Align the right side -- forward for the first copy --
|
||||
*/
|
||||
pseq1 = sequence+pos1+len-1; /* set the sequences */
|
||||
pseq2 = invsequence + sizeseq - (pos2+1);
|
||||
diff_max = -(pos2-pos1-len); /* for inverted repeats, we do not want palindromes
|
||||
so put it in negative is a code to say 'inverted' */
|
||||
|
||||
max_row = sizeseq-(pos1+len-1)-1;
|
||||
max_col = pos2;
|
||||
|
||||
align_blast2like(pseq1, pseq2, Xg, pScoring, pResults, 'f', max_row, max_col, diff_max);
|
||||
UpdateRepRightInv(&aligned_seed, pResults);
|
||||
|
||||
/*
|
||||
Align the left side -- backward for the first copy --
|
||||
*/
|
||||
pseq1 = sequence+pos1; /* set the sequences */
|
||||
pseq2 = invsequence+sizeseq-(pos2+len-1) -1;
|
||||
|
||||
max_row = pos1;
|
||||
max_col = sizeseq-(pos2+len-1)-1;
|
||||
|
||||
diff_max = sizeseq; /* for inverted repeats, on left, it does not really matter */
|
||||
|
||||
|
||||
align_blast2like(pseq1, pseq2, Xg, pScoring, pResults, 'b', max_row, max_col, diff_max);
|
||||
UpdateRepLeftInv(&aligned_seed, pResults);
|
||||
|
||||
if(aligned_seed.pos2 == (aligned_seed.pos1+aligned_seed.len1))
|
||||
strcpy(aligned_seed.type, "Palindrome");
|
||||
else if(aligned_seed.pos2 - (aligned_seed.pos1+aligned_seed.len1) < 1000 )
|
||||
strcpy(aligned_seed.type, "Close");
|
||||
else
|
||||
strcpy(aligned_seed.type, "Distant" );
|
||||
|
||||
|
||||
return aligned_seed; /* return the repeat */
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
Align repeats shared by 2 sequences
|
||||
*/
|
||||
Rep alignd_2seq(int32_t pos1, int32_t pos2, int32_t len, char *seq1, char *seq2, \
|
||||
int32_t size1, int32_t size2, float Xg, SCORING *pScoring, RESULTS *pResults ){
|
||||
|
||||
|
||||
Rep aligned_seed; /* the repeat */
|
||||
char *pseq1, *pseq2; /* pointer for the first and the second copy */
|
||||
double ScoreSeed=0; /* score of the seed */
|
||||
int32_t max_row=0, max_col=0; /* the dist max from the edge of the sequence max for rwo and col */
|
||||
int32_t diff_max =0; /* the maximum difference between row and col to avoid aligning the chr with itself */
|
||||
|
||||
|
||||
/*
|
||||
Init by the seed
|
||||
*/
|
||||
|
||||
ScoreSeed=score_seed(pos1+1, len-1, seq1, pScoring); /* pos1+1 and len1-1, since first bases are used into the alignement */
|
||||
InitRep(pos1, pos2, len, ScoreSeed, &aligned_seed );
|
||||
diff_max = size1+size2; /* there is not limit here */
|
||||
|
||||
|
||||
/*
|
||||
Align the right side -- forward --
|
||||
*/
|
||||
pseq1 = seq1+pos1+len-1; /* set the sequences */
|
||||
pseq2 = seq2+pos2+len-1;
|
||||
|
||||
|
||||
max_row = size1-(pos1+len-1)-1;
|
||||
max_col = size2-(pos2+len-1)-1; /* you cannot further than a chronmosome end */
|
||||
|
||||
align_blast2like(pseq1, pseq2, Xg, pScoring, pResults, 'f', max_row, max_col, diff_max);
|
||||
|
||||
UpdateRepRight(&aligned_seed, pResults);
|
||||
|
||||
|
||||
|
||||
/*
|
||||
Align the left side -- backward --
|
||||
*/
|
||||
pseq1 = seq1+pos1; /* set the sequences */
|
||||
pseq2 = seq2+pos2;
|
||||
|
||||
max_row = pos1;
|
||||
max_col = pos2; /* you cannot further than a chronmosome begining */
|
||||
|
||||
align_blast2like(pseq1, pseq2, Xg, pScoring, pResults, 'b', max_row, max_col, diff_max);
|
||||
UpdateRepLeft(&aligned_seed, pResults);
|
||||
|
||||
strcpy(aligned_seed.type, "Inter");
|
||||
|
||||
return aligned_seed; /* return the repeat */
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
Align repeats shared by 2 sequences
|
||||
*/
|
||||
Rep aligni_2seq(int32_t pos1, int32_t pos2, int32_t len, char *seq1, char *invseq2, \
|
||||
int32_t size1, int32_t size2, float Xg, SCORING *pScoring, RESULTS *pResults ){
|
||||
|
||||
|
||||
Rep aligned_seed; /* the repeat */
|
||||
char *pseq1, *pseq2; /* pointer for the first and the second copy */
|
||||
double ScoreSeed=0; /* score of the seed */
|
||||
int32_t max_row=0, max_col=0; /* the dist max from the edge of the sequence max for rwo and col */
|
||||
int32_t diff_max =0; /* the maximum difference between row and col to
|
||||
avoid aligning a palindrome with itself. !! For Inverted it takes NEGATIVE Values !! */
|
||||
|
||||
/*
|
||||
Init by th seed
|
||||
*/
|
||||
ScoreSeed=score_seed(pos1+1, len-1, seq1, pScoring); /* pos1+1 and len1-1, since first bases are used into the alignement */
|
||||
InitRep(pos1, pos2, len, ScoreSeed, &aligned_seed );
|
||||
|
||||
|
||||
|
||||
/*
|
||||
Align the right side -- forward for the first copy --
|
||||
*/
|
||||
pseq1 = seq1+pos1+len-1; /* set the sequences */
|
||||
pseq2 = invseq2 + size2 - (pos2+1);
|
||||
diff_max = size1+size2; /* NO diff max for inter-repeats */
|
||||
|
||||
max_row = size1-(pos1+len-1)-1;
|
||||
max_col = pos2;
|
||||
|
||||
align_blast2like(pseq1, pseq2, Xg, pScoring, pResults, 'f', max_row, max_col, diff_max);
|
||||
UpdateRepRightInv(&aligned_seed, pResults);
|
||||
|
||||
/*
|
||||
Align the left side -- backward for the first copy --
|
||||
*/
|
||||
pseq1 = seq1+pos1; /* set the sequences */
|
||||
pseq2 = invseq2+size2-(pos2+len-1) -1;
|
||||
|
||||
max_row = pos1;
|
||||
max_col = size2-(pos2+len-1)-1;
|
||||
|
||||
diff_max = size1+size2; /* it does not really matter */
|
||||
|
||||
|
||||
align_blast2like(pseq1, pseq2, Xg, pScoring, pResults, 'b', max_row, max_col, diff_max);
|
||||
UpdateRepLeftInv(&aligned_seed, pResults);
|
||||
|
||||
if(aligned_seed.pos2 == (aligned_seed.pos1+aligned_seed.len1))
|
||||
strcpy(aligned_seed.type, "Inter");
|
||||
|
||||
return aligned_seed; /* return the repeat */
|
||||
} /* return the repeat */
|
276
src/repseek/repseek-2014.09/align_matrix.c
Normal file
276
src/repseek/repseek-2014.09/align_matrix.c
Normal file
@ -0,0 +1,276 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/******
|
||||
file : matrix.c
|
||||
function : create a log(freq) / identity matrix for DNA
|
||||
|
||||
created : 15 Oct 2002
|
||||
modif : nov 12 2002: Set all score of X against anything to -1000; (usefull for mask and twoseq)
|
||||
modif : nov 26 2002: get my new matrix format PtrConst->pScoring (for removing seqaln and my S&W)
|
||||
modif : Oct 2003
|
||||
modif : July 2004 : change the formula of the log matrix, mainly Match(N,i) is changed
|
||||
modif : July 2004 : turn scores into doubles.
|
||||
|
||||
author : amikezor
|
||||
*****/
|
||||
|
||||
#include "repseek_types.h"
|
||||
#include "align.h"
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
/***
|
||||
calculate the frequency of all symbol
|
||||
in DNA sequence
|
||||
***/
|
||||
static int32_t get_letter(int32_t letters[5], char *seq)
|
||||
{
|
||||
|
||||
int32_t lseq=0;
|
||||
int8_t letter;
|
||||
|
||||
letters[0] = letters[1] = letters[2] = letters[3] = letters[4] = 0;
|
||||
|
||||
while( (letter = *seq) != 0)
|
||||
{
|
||||
|
||||
lseq++;
|
||||
|
||||
if (letter == 'A')
|
||||
letters[0]++;
|
||||
if (letter == 'C')
|
||||
letters[1]++;
|
||||
if (letter == 'G')
|
||||
letters[2]++;
|
||||
if (letter == 'T')
|
||||
letters[3]++;
|
||||
|
||||
seq++;
|
||||
}
|
||||
|
||||
letters[4] = lseq-letters[0]-letters[1]-letters[2]-letters[3]-letters[4];
|
||||
|
||||
return lseq;
|
||||
}
|
||||
|
||||
/***
|
||||
From a float return the closest int
|
||||
***/
|
||||
static int float2int(float a){
|
||||
|
||||
if(a<0){
|
||||
if( ( a-(float)(int)a ) > -0.5 )
|
||||
return (int)a;
|
||||
else
|
||||
return ((int)a-1);
|
||||
}
|
||||
else{
|
||||
if( ( a-(float)(int)a ) < 0.5 )
|
||||
return (int)a;
|
||||
else
|
||||
return ((int)a+1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Print the matrix if you want
|
||||
*/
|
||||
void printout_matrix2( SCORING *pScoring ){
|
||||
|
||||
int i,j;
|
||||
|
||||
fprintf(stderr," A B C D E F G H I J K L M N O P"
|
||||
" Q R S T U V W X Y Z\n");
|
||||
for (i=0; i<26; i++) {
|
||||
fprintf(stderr,"%c ", SYMB2CHAR(i) );
|
||||
for (j=0; j<26; j++)
|
||||
if(pScoring->matrix[i][j]<100 && pScoring->matrix[i][j]>-100)
|
||||
fprintf(stderr,"%5.2f",pScoring->matrix[i][j]);
|
||||
else
|
||||
fprintf(stderr,"%5.1g",pScoring->matrix[i][j]);
|
||||
|
||||
fprintf(stderr,"\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Print the matrix if you want
|
||||
*/
|
||||
void printout_matrix( SCORING *pScoring ){
|
||||
|
||||
int i,j;
|
||||
char carac[6]={'A', 'C', 'G', 'T', 'N', 'X'};
|
||||
|
||||
fprintf(stderr," ");
|
||||
for (i=0; i<6; i++)
|
||||
fprintf(stderr,"%6c", carac[i]);
|
||||
fprintf(stderr,"\n");
|
||||
|
||||
for (i=0; i<6; i++) {
|
||||
|
||||
fprintf(stderr,"%-6c ", carac[i] );
|
||||
|
||||
for (j=0; j<6; j++)
|
||||
if(pScoring->matrix[CHAR2SYMB(carac[i])][CHAR2SYMB(carac[j])]<100 &&
|
||||
pScoring->matrix[CHAR2SYMB(carac[i])][CHAR2SYMB(carac[j])]>-100)
|
||||
fprintf(stderr,"%6.2f",pScoring->matrix[CHAR2SYMB(carac[i])][CHAR2SYMB(carac[j])]);
|
||||
else
|
||||
fprintf(stderr,"%6.1g",pScoring->matrix[CHAR2SYMB(carac[i])][CHAR2SYMB(carac[j])]);
|
||||
|
||||
fprintf(stderr,"\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/***
|
||||
Set up the matrix MATCH(i,j) = 1/2 * log4(pi*pj) - thought to be the more corrective
|
||||
get all symbol frequency and estimate match(N,i) = pi * log(pi);
|
||||
***/
|
||||
void log_matrix(char *sequence, float gap_open, float gap_ext, SCORING *pScoring){
|
||||
|
||||
int32_t letters[5]={0,}; /* count of every letter */
|
||||
double freq[4]= {0.0,}; /* frequency of nucleotides ACGT */
|
||||
char carac[4]={'A','C','G','T'};
|
||||
|
||||
int32_t size; /* size of the seq including Ns ans Xs */
|
||||
double temp; /* make the read easier */
|
||||
int32_t size_eff; /* size_eff is the real number of non N nucleotide */
|
||||
int16_t i=0, j=0, u; /* dummy counters */
|
||||
|
||||
|
||||
|
||||
for(i=0;i<26;i++) /* set the matrix to 0 */
|
||||
for(j=0;j<26;j++)
|
||||
pScoring->matrix[i][j]=0;
|
||||
|
||||
|
||||
size = get_letter(letters, sequence); /* count all letters */
|
||||
size_eff = size-letters[4]; /* count the size of ACGT symbols */
|
||||
|
||||
for(i=0;i<4;i++) /* get the frequencies */
|
||||
freq[i]=(double)letters[i]/(double)size_eff;
|
||||
|
||||
|
||||
/*
|
||||
Set the matrix score
|
||||
*/
|
||||
for(i=0;i<4;i++){
|
||||
|
||||
for(j=0;j<4;j++){
|
||||
temp = 0.5 * log10( freq[i]*freq[j] )/log10( 4.0 );
|
||||
pScoring->matrix[ CHAR2SYMB(carac[i]) ][ CHAR2SYMB(carac[j]) ] = (i==j)? -temp: temp;
|
||||
}
|
||||
|
||||
temp = - freq[i] * log10( freq[i] )/log10( 4.00 );
|
||||
pScoring->matrix[CHAR2SYMB('N')][CHAR2SYMB(carac[i])] = temp;
|
||||
pScoring->matrix[CHAR2SYMB(carac[i])][CHAR2SYMB('N')] = temp;
|
||||
|
||||
}
|
||||
|
||||
pScoring->matrix[CHAR2SYMB('N')][CHAR2SYMB('N')] = 0.25;
|
||||
|
||||
/*
|
||||
set all X against anything to -1e6
|
||||
*/
|
||||
for(i=0;i<26;i++)
|
||||
pScoring->matrix[i][CHAR2SYMB('X')] = pScoring->matrix[CHAR2SYMB('X')][i] = -1e6;
|
||||
|
||||
|
||||
/*
|
||||
Estimate the mean score of the matrix
|
||||
*/
|
||||
pScoring->expect=0.0;
|
||||
for(i=0;i<4;i++)
|
||||
for(u=0;u<4;u++) /* print out log(freq*freq) into the pScoring->matrix */
|
||||
pScoring->expect += freq[i]*freq[u]*pScoring->matrix[CHAR2SYMB(carac[i])][CHAR2SYMB(carac[u])];
|
||||
|
||||
|
||||
pScoring->gap_open = (gap_open<0)? gap_open : -gap_open;
|
||||
pScoring->gap_ext = (gap_ext<0)? gap_ext : -gap_ext;
|
||||
|
||||
|
||||
/*
|
||||
printout_matrix( pScoring );
|
||||
|
||||
*/
|
||||
}
|
||||
|
||||
/*
|
||||
This matrix is
|
||||
anything is +/- 1
|
||||
N is 1
|
||||
X is -1e6
|
||||
*/
|
||||
void identity_matrix(char *sequence, float gap_open, float gap_ext,SCORING *pScoring){
|
||||
|
||||
int16_t i,j,u;
|
||||
int32_t letters[5]={0,};
|
||||
double freq[4]= {0.0,}; /* frequency of nucleotides ACGT */
|
||||
|
||||
char carac[4] = {'A','C','G','T'};
|
||||
|
||||
int32_t size;
|
||||
|
||||
int32_t size_eff; /* size_eff is the real number of non N nucleotide */
|
||||
|
||||
|
||||
for(i=0;i<26;i++) /* set the matrix to 0 */
|
||||
for(j=0;j<26;j++)
|
||||
pScoring->matrix[i][j]=0;
|
||||
|
||||
|
||||
size = get_letter(letters, sequence); /* count all letters */
|
||||
size_eff = size-letters[4]; /* count the size of ACGT symbols */
|
||||
|
||||
for(i=0;i<4;i++) /* get the frequencies */
|
||||
freq[i]=(double)letters[i]/(double)size_eff;
|
||||
|
||||
|
||||
for(i=0;i<4;i++)
|
||||
for(u=0;u<4;u++)
|
||||
pScoring->matrix[CHAR2SYMB(carac[i])][CHAR2SYMB(carac[u])]=(u==i)? 1 : -1;
|
||||
|
||||
for(i=0;i<26;i++) /* set all N against anything to +1 */
|
||||
pScoring->matrix[i][CHAR2SYMB('N')] = pScoring->matrix[CHAR2SYMB('N')][i] = 0.25;
|
||||
|
||||
for(i=0;i<26;i++) /* set all X against anything to -1e6 */
|
||||
pScoring->matrix[i][CHAR2SYMB('X')] = pScoring->matrix[CHAR2SYMB('X')][i] = -1e6;
|
||||
|
||||
|
||||
pScoring->expect=0.0;
|
||||
for(i=0;i<4;i++)
|
||||
for(u=0;u<4;u++){ /* print out log(freq*freq) into the pScoring->matrix */
|
||||
pScoring->expect += freq[i]*freq[u]*pScoring->matrix[CHAR2SYMB(carac[i])][CHAR2SYMB(carac[u])];
|
||||
}
|
||||
|
||||
|
||||
pScoring->gap_open = (gap_open>0)? -gap_open : gap_open;
|
||||
pScoring->gap_ext = (gap_ext>0 )? -gap_ext : gap_ext;
|
||||
|
||||
/*
|
||||
printout_matrix( pScoring );
|
||||
*/
|
||||
|
||||
}
|
562
src/repseek/repseek-2014.09/align_seeds.c
Normal file
562
src/repseek/repseek-2014.09/align_seeds.c
Normal file
@ -0,0 +1,562 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/******
|
||||
file : align_seeds.c
|
||||
function : from seeds to repeats
|
||||
created : jul 7 2003
|
||||
modif : July 2003, Oct 2003, Nov 2003 --change the reset_Rep(),Feb 2004
|
||||
July 2004 : turn scores into doubles
|
||||
Nov 2004 : correct a bug with opt_overlap --now removed too small and overlapping seeds--
|
||||
Sep 05
|
||||
March 06: debug and make optimization with reset_seeds()
|
||||
March 06: change reset_repet() to let the user choose how stringent should be the overlap
|
||||
May 07: update reset_repet() to to remove a small bug
|
||||
Dec 09: correct a bug -- thanks to M Krawczykc
|
||||
|
||||
author : amikezor
|
||||
*****/
|
||||
|
||||
#include "KMRK_Seeds.h"
|
||||
|
||||
#include "repseek_types.h"
|
||||
#include "align.h"
|
||||
#include "sequence.h"
|
||||
#include "memory.h"
|
||||
#include "memory_align.h"
|
||||
#include "sort.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
A function to remove all seeds that fall in the 'f'orward alignemnt of the current seed
|
||||
*/
|
||||
static void reset_seeds(AllSeeds_type *PtrAllSeeds, char orientation, int32_t current_seed, Rep aligned_seed, RESULTS * pResults)
|
||||
{
|
||||
int32_t max_seed=0, /* max of inv or dir */
|
||||
s=current_seed+1; /* the seed counter */
|
||||
|
||||
|
||||
char *ptraceback=NULL;
|
||||
|
||||
|
||||
int32_t Current_pos1, /* the pos in the current forward alignemnt */
|
||||
Current_pos2;
|
||||
|
||||
|
||||
int32_t mem_pos1=0, mem_pos2=0; /* values used for optimization (avoid always going back to start) */
|
||||
char *mem_trace=NULL;
|
||||
|
||||
int32_t len_traceback = strlen(pResults->traceback_f); /* the end of the traces are the begining of the alignment */
|
||||
|
||||
|
||||
/*
|
||||
Treat separatly direct and inverted
|
||||
*/
|
||||
if(orientation == 'd'){
|
||||
|
||||
|
||||
max_seed = PtrAllSeeds->nDirSeeds;
|
||||
mem_pos1 = 0; /* before the first seed checked */
|
||||
|
||||
while( s<max_seed && PtrAllSeeds->dirSeeds[s].pos1 < aligned_seed.pos1+aligned_seed.len1 ){
|
||||
|
||||
if( PtrAllSeeds->dirSeeds[s].pos1 != -1 &&
|
||||
PtrAllSeeds->dirSeeds[s].pos2 > aligned_seed.seed_pos2 &&
|
||||
PtrAllSeeds->dirSeeds[s].pos2 <= aligned_seed.pos2+aligned_seed.len2 ){ /* then it is a potential candidate */
|
||||
|
||||
if( mem_pos1 == 0 ){ /* first round */
|
||||
|
||||
Current_pos1 = aligned_seed.seed_pos1 + aligned_seed.seed_len ; /* the first bit after the seed */
|
||||
Current_pos2 = aligned_seed.seed_pos2 + aligned_seed.seed_len ;
|
||||
ptraceback = pResults->traceback_f + len_traceback - 2; /* at the end --last cell is the seed end-- */
|
||||
|
||||
}else{
|
||||
Current_pos1=mem_pos1; /* subsequent rounds */
|
||||
Current_pos2=mem_pos2;
|
||||
ptraceback = mem_trace;
|
||||
|
||||
mem_pos1 = 0;
|
||||
}
|
||||
|
||||
|
||||
while( ptraceback >= pResults->traceback_f &&
|
||||
Current_pos1 <= PtrAllSeeds->dirSeeds[s].pos1 &&
|
||||
Current_pos2 <= PtrAllSeeds->dirSeeds[s].pos2 ){
|
||||
|
||||
switch( *ptraceback ){
|
||||
case 'd':
|
||||
Current_pos1++;
|
||||
Current_pos2++;
|
||||
break;
|
||||
case 'a':
|
||||
Current_pos1++;
|
||||
break;
|
||||
case 'l':
|
||||
Current_pos2++;
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr,"reset_seeds: expect traceback to be 'd','a' or 'l', NOT %c !! bye\n", *ptraceback),exit(6);
|
||||
}
|
||||
|
||||
if( Current_pos1 == PtrAllSeeds->dirSeeds[s].pos1 && mem_pos1 == 0 ){
|
||||
mem_pos1 = Current_pos1;
|
||||
mem_pos2 = Current_pos2;
|
||||
mem_trace = ptraceback-1; /* ptraceback-1, is the next one */
|
||||
}
|
||||
|
||||
if(Current_pos1 == PtrAllSeeds->dirSeeds[s].pos1 && Current_pos2 == PtrAllSeeds->dirSeeds[s].pos2 ){
|
||||
|
||||
/*
|
||||
fprintf(stderr, "Remove %d %d-%d-%d USING %d %d-%d-%d \n",
|
||||
s+1, PtrAllSeeds->dirSeeds[s].pos1+1, PtrAllSeeds->dirSeeds[s].pos2+1, PtrAllSeeds->dirSeeds[s].length,
|
||||
current_seed+1, PtrAllSeeds->dirSeeds[current_seed].pos1+1, PtrAllSeeds->dirSeeds[current_seed].pos2+1, PtrAllSeeds->dirSeeds[current_seed].length);
|
||||
|
||||
*/
|
||||
PtrAllSeeds->dirSeeds[s].pos1 = -1;
|
||||
}
|
||||
ptraceback--;
|
||||
}
|
||||
}
|
||||
|
||||
s++;
|
||||
}
|
||||
|
||||
}
|
||||
else if (orientation == 'i'){
|
||||
|
||||
max_seed = PtrAllSeeds->nInvSeeds;
|
||||
mem_pos1 = 0;
|
||||
|
||||
while( s<max_seed && PtrAllSeeds->invSeeds[s].pos1 < aligned_seed.pos1+aligned_seed.len1 ){
|
||||
|
||||
if(PtrAllSeeds->invSeeds[s].pos1 != -1 &&
|
||||
PtrAllSeeds->invSeeds[s].pos2 >= aligned_seed.pos2 &&
|
||||
PtrAllSeeds->invSeeds[s].pos2 <= aligned_seed.seed_pos2 ){
|
||||
|
||||
if( ! mem_pos1 ){ /* first round */
|
||||
|
||||
Current_pos1 = aligned_seed.seed_pos1+aligned_seed.seed_len; /* the first bit after the seed */
|
||||
Current_pos2 = aligned_seed.seed_pos2-1; /* the first bit before the seed 2nd copy */
|
||||
ptraceback=pResults->traceback_f+len_traceback-2; /* the last cell is the seed end */
|
||||
|
||||
}else{
|
||||
|
||||
Current_pos1=mem_pos1; /* subsequent rounds */
|
||||
Current_pos2=mem_pos2;
|
||||
ptraceback = mem_trace;
|
||||
|
||||
mem_pos1 = 0;
|
||||
}
|
||||
|
||||
while( ptraceback >= pResults->traceback_f &&
|
||||
Current_pos1 <= PtrAllSeeds->invSeeds[s].pos1 &&
|
||||
Current_pos2 >= PtrAllSeeds->invSeeds[s].pos2 ){
|
||||
|
||||
|
||||
switch( *ptraceback ){
|
||||
case 'd':
|
||||
Current_pos1++;
|
||||
Current_pos2--;
|
||||
break;
|
||||
case 'a':
|
||||
Current_pos1++;
|
||||
break;
|
||||
case 'l':
|
||||
Current_pos2--;
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr,"reset_seeds: expect traceback to be 'd','a' or 'l', NOT %c !! bye\n",*ptraceback),exit(6);
|
||||
}
|
||||
|
||||
|
||||
if( Current_pos1 == PtrAllSeeds->invSeeds[s].pos1 && mem_pos1 == 0 ){
|
||||
mem_pos1 = Current_pos1;
|
||||
mem_pos2 = Current_pos2;
|
||||
mem_trace = ptraceback-1;
|
||||
}
|
||||
|
||||
if( Current_pos1 == PtrAllSeeds->invSeeds[s].pos1 && Current_pos2 == PtrAllSeeds->invSeeds[s].pos2+PtrAllSeeds->invSeeds[s].length-1 )
|
||||
PtrAllSeeds->invSeeds[s].pos1 = -1;
|
||||
|
||||
ptraceback--;
|
||||
|
||||
} /* endof while( ptraceback >= pResults-> ... */
|
||||
}
|
||||
|
||||
s++;
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
fprintf(stderr,"reset_seeds: orientation has to be 'd' or 'i', bye\n");
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
Remove Repeats which are have the same pos1, pos2, len1 and len2
|
||||
*/
|
||||
static int32_t reset_Rep(Rep *RepTable, int32_t nRep, float merge_repeats ){
|
||||
|
||||
int32_t u, i,
|
||||
removed=0;
|
||||
|
||||
int32_t overlap_copy1,
|
||||
overlap_copy2;
|
||||
|
||||
if(merge_repeats>1 || merge_repeats<0)
|
||||
fprintf(stderr, "reset_Rep: 0.0 <= fraction_overlap <= 1.0\n"),exit(7);
|
||||
|
||||
for(u=0; u<nRep-1; u++){
|
||||
|
||||
i=u+1;
|
||||
|
||||
if(RepTable[u].pos1 == -1)
|
||||
continue;
|
||||
|
||||
while( i<nRep &&
|
||||
RepTable[u].pos1 != -1 &&
|
||||
RepTable[i].pos1 <= RepTable[u].pos1+RepTable[u].len1
|
||||
){
|
||||
|
||||
if(RepTable[i].pos1 == -1){
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
overlap_copy1 = MIN2( (RepTable[i].pos1+RepTable[i].len1), (RepTable[u].pos1+RepTable[u].len1) ) - MAX2(RepTable[i].pos1, RepTable[u].pos1 );
|
||||
overlap_copy2 = MIN2( (RepTable[i].pos2+RepTable[i].len2), (RepTable[u].pos2+RepTable[u].len2) ) - MAX2(RepTable[i].pos2, RepTable[u].pos2 );
|
||||
|
||||
if( overlap_copy1 >= (int32_t) merge_repeats*RepTable[i].len1 && overlap_copy1 >= (int32_t) merge_repeats*RepTable[u].len1 &&
|
||||
overlap_copy2 >= (int32_t) merge_repeats*RepTable[i].len2 && overlap_copy2 >= (int32_t) merge_repeats*RepTable[u].len2
|
||||
|
||||
){
|
||||
|
||||
if(RepTable[i].score > RepTable[u].score){ /* keep the best score */
|
||||
|
||||
/*
|
||||
fprintf(stderr,
|
||||
"REMOVE_REP: %d %d %d %d %d-%d-%d by %d %d %d %d %d-%d-%d\n",
|
||||
RepTable[u].pos1+1, RepTable[u].pos2+1, RepTable[u].len1, RepTable[u].len2, RepTable[u].seed_pos1+1, RepTable[u].seed_pos2+1, RepTable[u].seed_len,
|
||||
RepTable[i].pos1+1, RepTable[i].pos2+1, RepTable[i].len1, RepTable[i].len2, RepTable[i].seed_pos1+1, RepTable[i].seed_pos2+1, RepTable[i].seed_len
|
||||
);
|
||||
*/
|
||||
RepTable[u].pos1=RepTable[u].pos2=RepTable[u].len1=RepTable[u].len2=-1;
|
||||
}
|
||||
else{
|
||||
/*
|
||||
fprintf(stderr,
|
||||
"REMOVE_REP: %d %d %d %d %d-%d-%d by %d %d %d %d %d-%d-%d\n",
|
||||
RepTable[i].pos1+1, RepTable[i].pos2+1, RepTable[i].len1, RepTable[i].len2, RepTable[i].seed_pos1+1, RepTable[i].seed_pos2+1, RepTable[i].seed_len,
|
||||
RepTable[u].pos1+1, RepTable[u].pos2+1, RepTable[u].len1, RepTable[u].len2, RepTable[u].seed_pos1+1, RepTable[u].seed_pos2+1, RepTable[u].seed_len
|
||||
);
|
||||
*/
|
||||
RepTable[i].pos1=RepTable[i].pos2=RepTable[i].len1=RepTable[i].len2=-1;
|
||||
}
|
||||
removed++;
|
||||
|
||||
}
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
return removed;
|
||||
}
|
||||
|
||||
|
||||
/***
|
||||
get Repeats from seeds.
|
||||
This function is the only one exported and is called by main()
|
||||
***/
|
||||
Repeats align_seeds(AllSeeds_type *AllSeeds, char *sequence, float Xg, float gap_open, float gap_ext, char matrix_type,\
|
||||
int16_t Lmin, int8_t opt_dir, int8_t opt_inv, int8_t opt_overlap, float merge_repeats ){
|
||||
|
||||
|
||||
int32_t sequence_size; /* guess... */
|
||||
|
||||
Repeats AllRepeats; /* A structure that contain all Repeats */
|
||||
|
||||
int32_t s, r=0; /* to count the 's'eeds and 'r'epeats */
|
||||
|
||||
SCORING Scoring; /* Scoring - get scoring matrix and gap-open, gap-ext */
|
||||
RESULTS Results; /* Results all the results of the alignemnt */
|
||||
|
||||
char *invsequence=NULL;
|
||||
|
||||
if(!opt_dir && !opt_inv)
|
||||
fprintf(stderr, "align_seeds: At least one of the direct and inverted is needed, bye\n"),exit(1);
|
||||
|
||||
AllRepeats = mem_Repeats(AllSeeds->nDirSeeds, AllSeeds->nInvSeeds); /* Init and Mem */
|
||||
mem_Blast2Align(&Results); /* get mem for alignment */
|
||||
|
||||
|
||||
if( matrix_type == 'i')
|
||||
identity_matrix(sequence,gap_open, gap_ext, &Scoring); /* Load identity matrix into memory */
|
||||
else if(matrix_type == 'l' )
|
||||
log_matrix(sequence, gap_open, gap_ext, &Scoring);
|
||||
|
||||
sequence_size = (int32_t)strlen(sequence);
|
||||
|
||||
|
||||
if(opt_dir){
|
||||
|
||||
for(s=0 , r=0; s < AllSeeds->nDirSeeds; s++){
|
||||
|
||||
if(s%10 == 0)
|
||||
fprintf(stderr,"direct: %d seeds -> %d repeats\r",s,r);
|
||||
|
||||
if(!opt_overlap &&
|
||||
AllSeeds->dirSeeds[s].pos1 + AllSeeds->dirSeeds[s].length > AllSeeds->dirSeeds[s].pos2 &&
|
||||
AllSeeds->dirSeeds[s].pos2- AllSeeds->dirSeeds[s].pos1 < Lmin
|
||||
)
|
||||
AllSeeds->dirSeeds[s].pos1 = -1; /* overlapping and too small, remove it */
|
||||
|
||||
if(AllSeeds->dirSeeds[s].pos1 == -1 )continue; /* thus it is a removed seed */
|
||||
|
||||
AllRepeats.DirRep[r] = alignd(AllSeeds->dirSeeds[s].pos1,
|
||||
AllSeeds->dirSeeds[s].pos2,
|
||||
AllSeeds->dirSeeds[s].length,
|
||||
sequence, sequence_size ,Xg,
|
||||
&Scoring, &Results, opt_overlap);
|
||||
|
||||
|
||||
AllRepeats.DirRep[r].seed_meanR = AllSeeds->dirSeeds[s].rmean;
|
||||
|
||||
reset_seeds(AllSeeds, 'd', s, AllRepeats.DirRep[r], &Results);
|
||||
|
||||
r++;
|
||||
}
|
||||
|
||||
AllRepeats.nDirRep = r;
|
||||
|
||||
if(AllRepeats.nDirRep != 0){
|
||||
|
||||
AllRepeats.DirRep = (Rep *)MyRealloc(AllRepeats.DirRep, AllRepeats.nDirRep*sizeof(Rep),
|
||||
AllSeeds->nDirSeeds*sizeof(Rep) ,"align_seeds : realloc DirRep error");
|
||||
|
||||
qsort_1then2_repeats(AllRepeats.DirRep, AllRepeats.nDirRep);
|
||||
|
||||
AllRepeats.nDirBadRep = reset_Rep(AllRepeats.DirRep, AllRepeats.nDirRep, merge_repeats);
|
||||
}
|
||||
else{
|
||||
MyFree(AllRepeats.DirRep, AllSeeds->nDirSeeds*sizeof(Rep) );
|
||||
AllRepeats.DirRep=NULL;
|
||||
}
|
||||
}
|
||||
|
||||
if(opt_inv){
|
||||
|
||||
/*
|
||||
Get Inv sequence
|
||||
*/
|
||||
invsequence = (char *)MyMalloc((sequence_size+1)*sizeof(char),"align_seeds: invsequence malloc error, bye");
|
||||
invseq(sequence,invsequence);
|
||||
|
||||
for(s=0, r=0; s < AllSeeds->nInvSeeds; s++){
|
||||
|
||||
if( s%10 == 0)
|
||||
fprintf(stderr,"inverted: %d seeds -> %d repeats\r",s,r);
|
||||
|
||||
if(AllSeeds->invSeeds[s].pos1 == -1)continue;
|
||||
|
||||
AllRepeats.InvRep[r] = aligni(AllSeeds->invSeeds[s].pos1,
|
||||
AllSeeds->invSeeds[s].pos2,
|
||||
AllSeeds->invSeeds[s].length,
|
||||
sequence, invsequence,sequence_size,
|
||||
Xg, &Scoring, &Results);
|
||||
|
||||
AllRepeats.InvRep[r].seed_meanR = AllSeeds->invSeeds[s].rmean;
|
||||
|
||||
if(AllRepeats.InvRep[r].len1 >= Lmin){ /* if not do not keep small palindromes */
|
||||
reset_seeds(AllSeeds, 'i', s, AllRepeats.InvRep[r], &Results);
|
||||
r++;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
AllRepeats.nInvRep = r;
|
||||
|
||||
if(AllRepeats.nInvRep != 0){
|
||||
AllRepeats.InvRep = (Rep *)MyRealloc(AllRepeats.InvRep, AllRepeats.nInvRep*sizeof(Rep),
|
||||
AllSeeds->nInvSeeds*sizeof(Rep),"align_seeds : realloc InvRep error");
|
||||
|
||||
qsort_1then2_repeats(AllRepeats.InvRep, AllRepeats.nInvRep);
|
||||
|
||||
AllRepeats.nInvBadRep = reset_Rep(AllRepeats.InvRep, AllRepeats.nInvRep, merge_repeats);
|
||||
}
|
||||
else{
|
||||
MyFree( AllRepeats.InvRep, AllSeeds->nInvSeeds*sizeof(Rep) );
|
||||
AllRepeats.InvRep=NULL;
|
||||
}
|
||||
|
||||
MyFree(invsequence, sequence_size*sizeof(char) );
|
||||
|
||||
}
|
||||
|
||||
free_Blast2Align(&Results); /* free the RESULTS struct */
|
||||
|
||||
return AllRepeats;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/***
|
||||
get Repeats from seeds.
|
||||
This function is the only one exported and is called by main()
|
||||
***/
|
||||
Repeats align_seeds_2seq(AllSeeds_type *AllSeeds, char *seq1, char *seq2, float Xg, float gap_open, float gap_ext, char matrix_type,\
|
||||
int16_t Lmin, int8_t opt_dir, int8_t opt_inv , float merge_repeats){
|
||||
|
||||
|
||||
int32_t size1, size2; /* sequences size */
|
||||
|
||||
Repeats AllRepeats; /* A structure that contain all Repeats */
|
||||
|
||||
int32_t s, r=0; /* to count the 's'eeds and 'r'epeats */
|
||||
|
||||
SCORING Scoring; /* Scoring - get scoring matrix and gap-open, gap-ext */
|
||||
RESULTS Results; /* Results all the results of the alignemnt */
|
||||
|
||||
char *invseq2=NULL; /* a pointer to the inevrted sequence 2 */
|
||||
|
||||
|
||||
if(!opt_dir && !opt_inv)
|
||||
fprintf(stderr, "align_seeds: At least one of the direct and inverted, bye\n"),exit(1);
|
||||
|
||||
|
||||
size1=(int32_t)strlen(seq1);
|
||||
size2=(int32_t)strlen(seq2);
|
||||
|
||||
|
||||
AllRepeats = mem_Repeats(AllSeeds->nDirSeeds, AllSeeds->nInvSeeds); /* Init and Mem */
|
||||
mem_Blast2Align(&Results); /* get mem for alignment */
|
||||
|
||||
|
||||
if( matrix_type == 'i')
|
||||
identity_matrix(seq1,gap_open, gap_ext, &Scoring); /* Load identity matrix into memory */
|
||||
else if(matrix_type == 'l' )
|
||||
log_matrix(seq1, gap_open, gap_ext, &Scoring);
|
||||
|
||||
|
||||
if(opt_dir){
|
||||
|
||||
for(s=0 , r=0; s < AllSeeds->nDirSeeds; s++){
|
||||
|
||||
if(s%10 == 0)
|
||||
fprintf(stderr,"direct: %d seeds -> %d repeats\r",s,r);
|
||||
|
||||
if(AllSeeds->dirSeeds[s].pos1 == -1 )continue; /* thus it is a removed seed */
|
||||
|
||||
AllRepeats.DirRep[r] = alignd_2seq(AllSeeds->dirSeeds[s].pos1,
|
||||
AllSeeds->dirSeeds[s].pos2,
|
||||
AllSeeds->dirSeeds[s].length,
|
||||
seq1, seq2,
|
||||
size1, size2,
|
||||
Xg, &Scoring, &Results);
|
||||
|
||||
|
||||
AllRepeats.DirRep[r].seed_meanR = AllSeeds->dirSeeds[s].rmean;
|
||||
|
||||
reset_seeds(AllSeeds, 'd', s, AllRepeats.DirRep[r], &Results);
|
||||
|
||||
r++;
|
||||
}
|
||||
|
||||
AllRepeats.nDirRep = r;
|
||||
|
||||
if(AllRepeats.nDirRep != 0){
|
||||
|
||||
AllRepeats.DirRep = (Rep *)MyRealloc(AllRepeats.DirRep, AllRepeats.nDirRep*sizeof(Rep),
|
||||
AllSeeds->nDirSeeds*sizeof(Rep) ,"align_seeds : realloc DirRep error");
|
||||
|
||||
qsort_1then2_repeats(AllRepeats.DirRep, AllRepeats.nDirRep);
|
||||
|
||||
AllRepeats.nDirBadRep = reset_Rep(AllRepeats.DirRep, AllRepeats.nDirRep, merge_repeats);
|
||||
}
|
||||
else{
|
||||
MyFree(AllRepeats.DirRep, AllSeeds->nDirSeeds*sizeof(Rep) );
|
||||
AllRepeats.DirRep=NULL;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if(opt_inv){
|
||||
|
||||
/*
|
||||
Get Inv sequence
|
||||
*/
|
||||
invseq2 = (char *)MyMalloc(size2*sizeof(char),"align_seeds: malloc error, bye");
|
||||
invseq(seq2,invseq2);
|
||||
|
||||
for(s=0, r=0; s < AllSeeds->nInvSeeds; s++){
|
||||
|
||||
if( s%10 == 0)
|
||||
fprintf(stderr,"inverted: %d seeds -> %d repeats\r",s,r);
|
||||
|
||||
if(AllSeeds->invSeeds[s].pos1 == -1)continue;
|
||||
|
||||
AllRepeats.InvRep[r] = aligni_2seq(AllSeeds->invSeeds[s].pos1,
|
||||
AllSeeds->invSeeds[s].pos2,
|
||||
AllSeeds->invSeeds[s].length,
|
||||
seq1, invseq2,
|
||||
size1, size2,
|
||||
Xg, &Scoring, &Results);
|
||||
|
||||
AllRepeats.InvRep[r].seed_meanR = AllSeeds->invSeeds[s].rmean;
|
||||
|
||||
reset_seeds(AllSeeds, 'i', s, AllRepeats.InvRep[r], &Results);
|
||||
r++;
|
||||
|
||||
|
||||
}
|
||||
|
||||
AllRepeats.nInvRep = r;
|
||||
|
||||
if(AllRepeats.nInvRep != 0){
|
||||
AllRepeats.InvRep = (Rep *)MyRealloc(AllRepeats.InvRep, AllRepeats.nInvRep*sizeof(Rep),
|
||||
AllSeeds->nInvSeeds*sizeof(Rep),"align_seeds : realloc InvRep error");
|
||||
|
||||
qsort_1then2_repeats(AllRepeats.InvRep, AllRepeats.nInvRep);
|
||||
|
||||
AllRepeats.nInvBadRep = reset_Rep(AllRepeats.InvRep, AllRepeats.nInvRep, merge_repeats);
|
||||
}
|
||||
else{
|
||||
MyFree( AllRepeats.InvRep, AllSeeds->nInvSeeds*sizeof(Rep) );
|
||||
AllRepeats.InvRep=NULL;
|
||||
}
|
||||
|
||||
MyFree(invseq2, size2*sizeof(char) );
|
||||
|
||||
}
|
||||
|
||||
free_Blast2Align(&Results); /* free the RESULTS struct */
|
||||
|
||||
return AllRepeats;
|
||||
}
|
||||
|
169
src/repseek/repseek-2014.09/families.c
Normal file
169
src/repseek/repseek-2014.09/families.c
Normal file
@ -0,0 +1,169 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/******
|
||||
file : families.c
|
||||
function : set the R-level of the repeats (Mean, Main, and fraction of Main)
|
||||
|
||||
created : 15 Oct 2002
|
||||
modif : July 2003, Oct 2003, Spe 05
|
||||
|
||||
note : from Air.c - Mar 28 2001 / Jul 10 2001 / Jun 2 2000
|
||||
|
||||
author : amikezor
|
||||
*****/
|
||||
|
||||
#include "repseek_types.h"
|
||||
#include "memory.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
/***
|
||||
construction of the table "air" (of repeats)
|
||||
***/
|
||||
|
||||
static void table_R(int32_t *table_r, Rep *AllReps, int32_t nRep, int32_t size_chr)
|
||||
{
|
||||
|
||||
int32_t u; /* a counter */
|
||||
|
||||
for(u=0; u<nRep; u++){
|
||||
|
||||
if(AllReps[u].pos1 == -1){continue;} /* skip "bad repeat" */
|
||||
|
||||
table_r[ AllReps[u].pos1 ]+=1;
|
||||
if(AllReps[u].pos1 + AllReps[u].len1 < size_chr)
|
||||
table_r[ AllReps[u].pos1 + AllReps[u].len1 ]-=1;
|
||||
|
||||
table_r[ AllReps[u].pos2 ]+=1;
|
||||
if(AllReps[u].pos2 + AllReps[u].len2 < size_chr)
|
||||
table_r[ AllReps[u].pos2 + AllReps[u].len2 ]-=1;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/***
|
||||
Built up and returned a memory adress where is table R
|
||||
***/
|
||||
static int32_t *mem_RepTableR(Repeats AllRepeats, int8_t opt_dir, int8_t opt_inv, int32_t size_chr)
|
||||
{
|
||||
|
||||
int32_t *table_r=NULL; /* the table R */
|
||||
int32_t count=0; /* allow to count increase and decrease */
|
||||
int32_t i=0; /* just a counter */
|
||||
|
||||
table_r = MyCalloc(size_chr, sizeof(int32_t), "SetRepFamily: calloc failed, bye");
|
||||
|
||||
if(opt_dir)
|
||||
table_R(table_r, AllRepeats.DirRep, AllRepeats.nDirRep, size_chr);
|
||||
|
||||
if(opt_inv)
|
||||
table_R(table_r, AllRepeats.InvRep, AllRepeats.nInvRep, size_chr);
|
||||
|
||||
for(count=1, i=0; i<size_chr ; i++)
|
||||
table_r[i] = (count+=table_r[i]);
|
||||
|
||||
return table_r;
|
||||
|
||||
}
|
||||
|
||||
/***
|
||||
Check all repeat and set their family size by
|
||||
looking at the table R of each copy
|
||||
***/
|
||||
static void Tag_Rep(Rep *AllReps, int32_t nReps, int32_t *table_r, int32_t *HistoR, int32_t Max_table_r)
|
||||
{
|
||||
|
||||
Rep *pRep;
|
||||
|
||||
int32_t i;
|
||||
|
||||
int32_t mainR=0;
|
||||
int32_t max_mainR=0;
|
||||
float meanR=0;
|
||||
float fraction_mainR=0.0;
|
||||
|
||||
for(pRep = AllReps; pRep < AllReps+nReps; pRep++){ /* determine the mean repeat level of each repeat */
|
||||
|
||||
memset(HistoR, 0, (Max_table_r+1)*sizeof(int32_t) );
|
||||
|
||||
for(i=0; i < pRep->len1 ; i++)
|
||||
HistoR[ table_r[pRep->pos1+i] ]++;
|
||||
|
||||
for(i=0; i < pRep->len2 ;i++ )
|
||||
HistoR[ table_r[pRep->pos2+i] ]++;
|
||||
|
||||
|
||||
for(i=2, meanR=0, max_mainR=0; i <= Max_table_r ; i++){
|
||||
|
||||
if( HistoR[i] > max_mainR ){
|
||||
max_mainR=HistoR[i];
|
||||
mainR=i;
|
||||
}
|
||||
|
||||
meanR += (float) i * HistoR[i];
|
||||
}
|
||||
|
||||
|
||||
fraction_mainR = (float)max_mainR / (float)(pRep->len1+pRep->len2);
|
||||
|
||||
pRep->meanR= meanR / (float)(pRep->len1+pRep->len2);
|
||||
pRep->mainR=mainR;
|
||||
pRep->fraction_mainR=fraction_mainR;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
/***
|
||||
Set the size of each family (duplicate, triplicate, etc...)
|
||||
Moreover return the table_r of the chromosome.
|
||||
In that table, each chr pos correspond to the level of repetition
|
||||
(1(unique), 2(duplicated), 3(triplicated), ...)
|
||||
***/
|
||||
int32_t *SetRepFamily(Repeats AllRepeats, int8_t opt_dir, int8_t opt_inv, int32_t size_chr)
|
||||
{
|
||||
|
||||
int32_t *table_r; /* the tableR itself */
|
||||
int32_t max_TableR=0; /* max value in TableR */
|
||||
int32_t *HistoTableR; /* histo of R-level for a repeat */
|
||||
int32_t i; /* a dummy token */
|
||||
|
||||
if( !(AllRepeats.nDirRep-AllRepeats.nDirBadRep) &&
|
||||
!(AllRepeats.nInvRep-AllRepeats.nInvBadRep) )
|
||||
return NULL;
|
||||
|
||||
table_r = mem_RepTableR( AllRepeats, opt_dir,opt_inv,size_chr);
|
||||
|
||||
for(i=0; i<size_chr; i++)
|
||||
max_TableR=(table_r[i]>max_TableR)?table_r[i]:max_TableR;
|
||||
|
||||
HistoTableR = (int32_t *)MyMalloc( (max_TableR+1) * sizeof(int32_t), "SetRepFamily: malloc error" );
|
||||
|
||||
if(opt_dir)
|
||||
Tag_Rep(AllRepeats.DirRep, AllRepeats.nDirRep, table_r, HistoTableR, max_TableR);
|
||||
|
||||
if(opt_inv)
|
||||
Tag_Rep(AllRepeats.InvRep, AllRepeats.nInvRep, table_r, HistoTableR, max_TableR);
|
||||
|
||||
MyFree(HistoTableR, (max_TableR+1)*sizeof(int32_t) );
|
||||
|
||||
return table_r;
|
||||
}
|
37
src/repseek/repseek-2014.09/families.h
Normal file
37
src/repseek/repseek-2014.09/families.h
Normal file
@ -0,0 +1,37 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
#ifndef _families_h_
|
||||
#define _families_h_
|
||||
|
||||
void SetNumberRep_2seqs(Repeats AllRepeats,
|
||||
int8_t opt_dir,
|
||||
int8_t opt_inv,
|
||||
int32_t size1,
|
||||
int32_t size2,
|
||||
int32_t **TableR1,
|
||||
int32_t **TableR2);
|
||||
|
||||
|
||||
int32_t *SetRepFamily(Repeats AllRepeats,
|
||||
int8_t opt_dir,
|
||||
int8_t opt_inv,
|
||||
int32_t size_chr);
|
||||
|
||||
#endif /* _families_h_*/
|
196
src/repseek/repseek-2014.09/families_2seqs.c
Normal file
196
src/repseek/repseek-2014.09/families_2seqs.c
Normal file
@ -0,0 +1,196 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/**
|
||||
* @file families_2seq.c
|
||||
* @author amikezor
|
||||
* @date 14 Nov 2002
|
||||
* @modif Jul 2003 - change to take into account Mean, Main and fraction of R-level
|
||||
* @modif April 2004 - Recode totally with 2 TableR and only 1 meanR
|
||||
* @modif Sept 2005
|
||||
*
|
||||
* @brief set the number of copy in the 2 input seq
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
#include "repseek_types.h"
|
||||
#include "memory.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
/***
|
||||
construction of the table "air" (of repeats)
|
||||
***/
|
||||
|
||||
static void Fill_Tables_2seq(int32_t size1, int32_t size2, int32_t *TableR1, int32_t * TableR2, Rep *AllReps, int32_t nRep)
|
||||
{
|
||||
|
||||
int32_t u; /* a counter */
|
||||
|
||||
for(u=0; u<nRep; u++){
|
||||
|
||||
if(AllReps[u].pos1 == -1){continue;} /* skip "bad repeat" */
|
||||
|
||||
TableR1[ AllReps[u].pos1 ]+=1;
|
||||
|
||||
if(AllReps[u].pos1 + AllReps[u].len1 < size1)
|
||||
TableR1[ AllReps[u].pos1 + AllReps[u].len1 ]-=1;
|
||||
|
||||
TableR2[ AllReps[u].pos2 ]+=1;
|
||||
|
||||
if(AllReps[u].pos2 + AllReps[u].len2 < size2)
|
||||
TableR2[ AllReps[u].pos2 + AllReps[u].len2 ]-=1;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
/***
|
||||
Check all repeat and set their copy number by
|
||||
looking at the table R of each copy
|
||||
***/
|
||||
static void Tag_Rep_2seqs(Rep *AllReps, int32_t nReps, int32_t *TableR1, int32_t *TableR2, int32_t *HistoR, int32_t Max_table_r)
|
||||
{
|
||||
|
||||
Rep *pRep;
|
||||
|
||||
int32_t i;
|
||||
|
||||
int32_t mainR=0;
|
||||
int32_t max_mainR=0;
|
||||
float meanR=0, meanR2=0;
|
||||
float fraction_mainR=0.0;
|
||||
|
||||
|
||||
for(pRep = AllReps; pRep < AllReps+nReps; pRep++){ /* determine the mean repeat level of each repeat */
|
||||
|
||||
memset(HistoR, 0, (Max_table_r+1)*sizeof(int32_t) );
|
||||
|
||||
for(i=0; i < pRep->len1 ; i++){
|
||||
HistoR[ TableR1[pRep->pos1+i] ]++;
|
||||
meanR += TableR1[pRep->pos1+i];
|
||||
}
|
||||
|
||||
for(i=0; i < pRep->len2 ;i++ ){
|
||||
HistoR[ TableR2[pRep->pos2+i] ]++;
|
||||
meanR2+=TableR2[pRep->pos2+i];
|
||||
}
|
||||
|
||||
meanR /= (float)(pRep->len1);
|
||||
meanR2 /= (float)(pRep->len2);
|
||||
|
||||
for(i=1, max_mainR = 0; i <= Max_table_r ; i++){
|
||||
|
||||
if( HistoR[i] > max_mainR ){
|
||||
max_mainR=HistoR[i];
|
||||
mainR=i;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
fraction_mainR = (float)max_mainR / ( (float)(pRep->len1) + (pRep->len2) );
|
||||
|
||||
|
||||
pRep->meanR= meanR+meanR2 -2;
|
||||
pRep->mainR=0;
|
||||
pRep->fraction_mainR=0;
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
/***
|
||||
Set the number of copies of each sequence in the second seq (2 means that this rep has 2 copies in seq2)
|
||||
Moreover return the table_r of the chromosome.
|
||||
In that table, each chr pos correspond to the level of repetition
|
||||
(1(unique), 2(duplicated), 3(triplicated), ...)
|
||||
***/
|
||||
void SetNumberRep_2seqs(Repeats AllRepeats, int8_t opt_dir, int8_t opt_inv,
|
||||
int32_t size1, int32_t size2,
|
||||
int32_t **pT1, int32_t **pT2)
|
||||
{
|
||||
|
||||
int32_t max_TableR=0; /* the maximum redundancy in the TableR1 or TableR2 */
|
||||
int32_t *HistoTableR; /* Histogram of the R-level of a repeat --> here R-level of copy 1 into sequence 2*/
|
||||
int32_t i; /* dummy counter */
|
||||
int32_t count; /* dummy counter 2 */
|
||||
|
||||
int32_t *TableR1;
|
||||
int32_t *TableR2;
|
||||
|
||||
|
||||
*pT1 = NULL;
|
||||
*pT2 = NULL;
|
||||
|
||||
if( !(AllRepeats.nDirRep-AllRepeats.nDirBadRep) &&
|
||||
!(AllRepeats.nInvRep-AllRepeats.nInvBadRep) )
|
||||
return;
|
||||
|
||||
/*
|
||||
Alloc Tables
|
||||
*/
|
||||
TableR1 = (int32_t *)MyCalloc(size1, sizeof(int32_t), "SetRepFamily: calloc failed, bye");
|
||||
TableR2 = (int32_t *)MyCalloc(size2, sizeof(int32_t), "SetRepFamily: calloc failed, bye");
|
||||
|
||||
|
||||
/*
|
||||
Fill those Tables
|
||||
*/
|
||||
if(opt_dir) Fill_Tables_2seq(size1, size2, TableR1, TableR2, AllRepeats.DirRep, AllRepeats.nDirRep);
|
||||
if(opt_inv) Fill_Tables_2seq(size1, size2, TableR1, TableR2, AllRepeats.InvRep, AllRepeats.nInvRep);
|
||||
|
||||
|
||||
for(count=1, i=0; i<size1 ; i++) TableR1[i] = (count += TableR1[i]);
|
||||
for(count=1, i=0; i<size2 ; i++) TableR2[i] = (count += TableR2[i]);
|
||||
|
||||
/*
|
||||
Get Their max values
|
||||
*/
|
||||
max_TableR=0;
|
||||
for(i=0; i<size1; i++) max_TableR=( TableR1[i]>max_TableR)? TableR1[i]:max_TableR; /* set the max value */
|
||||
for(i=0; i<size2; i++) max_TableR=( TableR2[i]>max_TableR)? TableR2[i]:max_TableR; /* set the max value */
|
||||
|
||||
/*
|
||||
Alloc only once the histogram
|
||||
*/
|
||||
HistoTableR = (int32_t *)MyMalloc( (max_TableR+1) * sizeof(int32_t), "SetRepFamily: malloc error" ); /* get some memory */
|
||||
|
||||
/*
|
||||
Tag the repeats
|
||||
*/
|
||||
if(opt_dir)
|
||||
Tag_Rep_2seqs(AllRepeats.DirRep, AllRepeats.nDirRep, TableR1, TableR2, HistoTableR , max_TableR);
|
||||
|
||||
if(opt_inv)
|
||||
Tag_Rep_2seqs(AllRepeats.InvRep, AllRepeats.nInvRep, TableR1, TableR2, HistoTableR , max_TableR);
|
||||
|
||||
|
||||
/*
|
||||
Free the histogram
|
||||
*/
|
||||
MyFree(HistoTableR, (max_TableR+1) * sizeof(int32_t));
|
||||
|
||||
*pT1 = TableR1;
|
||||
*pT2 = TableR2;
|
||||
|
||||
}
|
215
src/repseek/repseek-2014.09/filter.c
Normal file
215
src/repseek/repseek-2014.09/filter.c
Normal file
@ -0,0 +1,215 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/**
|
||||
* @file KMRK.filter.c
|
||||
* @author Guillaume Achaz <gachaz@oeb.harvard.edu>, Eric Coissac <coissac@inrialpes.fr>
|
||||
* @date Tue Mar 2 18:33:41 2004
|
||||
* @modif Sept 2005
|
||||
*
|
||||
* @brief seed filtrage function
|
||||
*
|
||||
*
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include "filter.h"
|
||||
#include "memory.h"
|
||||
#include "KMRK.h"
|
||||
|
||||
|
||||
static void table_R(int32_t *table_r,
|
||||
Seed_type *seeds,
|
||||
int32_t nSeed,
|
||||
int32_t size_chr);
|
||||
|
||||
static void table_R( int32_t *table_r, Seed_type *seeds, int32_t nSeed, int32_t size_chr)
|
||||
{
|
||||
|
||||
int32_t u; /* a dummy counter */
|
||||
Seed_type* curSeed; /* the current seed */
|
||||
|
||||
for(u=0, curSeed=seeds; u<nSeed; u++, curSeed++){
|
||||
|
||||
if(curSeed->pos1 == -1)
|
||||
continue;
|
||||
|
||||
table_r[ curSeed->pos1 ]+=1;
|
||||
|
||||
if( (curSeed->pos1 + curSeed->length) < size_chr)
|
||||
table_r[ curSeed->pos1 + curSeed->length ]-=1;
|
||||
|
||||
table_r[ curSeed->pos2 ]+=1;
|
||||
|
||||
if( (curSeed->pos2 + curSeed->length) < size_chr)
|
||||
table_r[ curSeed->pos2 + curSeed->length ]-=1;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int32_t *KMRK_SeedTableR(AllSeeds_type *AllSeeds,
|
||||
int8_t opt_dir,
|
||||
int8_t opt_inv,
|
||||
int32_t size_chr)
|
||||
{
|
||||
|
||||
int32_t *table_r=NULL; /* init the so called R-table to NULL */
|
||||
int32_t count=0; /* a token to mark each seed */
|
||||
int32_t i=0; /* a dummy counter */
|
||||
|
||||
table_r= MyCalloc( size_chr, sizeof(int32_t), "mem_tableR: calloc failed, bye" );
|
||||
|
||||
if(opt_dir && AllSeeds->nDirSeeds)
|
||||
table_R(table_r,
|
||||
AllSeeds->dirSeeds,
|
||||
AllSeeds->nDirSeeds,
|
||||
size_chr);
|
||||
|
||||
if(opt_inv && AllSeeds->nInvSeeds)
|
||||
table_R(table_r,
|
||||
AllSeeds->invSeeds,
|
||||
AllSeeds->nInvSeeds,
|
||||
size_chr);
|
||||
|
||||
for(count=1, i=0; i<size_chr ; i++)
|
||||
table_r[i] = (count+=table_r[i]);
|
||||
|
||||
return table_r;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
void BuiltFilterFamily_seeds(AllSeeds_type *AllSeeds,
|
||||
int32_t *table_r,
|
||||
double min,
|
||||
double Max,
|
||||
int8_t opt_dir,
|
||||
int8_t opt_inv)
|
||||
{
|
||||
|
||||
double MeanR; /* the R-mean of the seed - average redundancy */
|
||||
double TotalR; /* the sum of all r of each site of the repeat */
|
||||
Seed_type *curSeed; /* the current seed */
|
||||
|
||||
int32_t u, i;
|
||||
|
||||
int32_t N=0;
|
||||
int8_t opt_filter=(!min && !Max)?0:1; /* filter is on if one of the two is set by the user */
|
||||
|
||||
if(opt_dir){
|
||||
|
||||
for(u=0, N=0, curSeed=AllSeeds->dirSeeds; u<AllSeeds->nDirSeeds; u++,curSeed++)
|
||||
{
|
||||
|
||||
TotalR = 0.0;
|
||||
|
||||
for(i=0; i < curSeed->length ; i++)
|
||||
TotalR += (double)table_r[ curSeed->pos1+i] + table_r[curSeed->pos2+i]; /* add both repeats together */
|
||||
|
||||
MeanR = TotalR / (2.0 * (double) curSeed->length);
|
||||
|
||||
curSeed->rmean = MeanR;
|
||||
|
||||
if( opt_filter &&
|
||||
( (Max && MeanR<=Max) || !Max) &&
|
||||
( (min && MeanR>=min) || !min) )
|
||||
{
|
||||
|
||||
AllSeeds->dirSeeds[N].pos1 = curSeed->pos1; /* thus keep it */
|
||||
AllSeeds->dirSeeds[N].pos2 = curSeed->pos2;
|
||||
AllSeeds->dirSeeds[N].length = curSeed->length;
|
||||
AllSeeds->dirSeeds[N].rmean = curSeed->rmean;
|
||||
|
||||
N++;
|
||||
}
|
||||
}
|
||||
|
||||
if(opt_filter)
|
||||
{
|
||||
AllSeeds->nFilteredDirSeeds = AllSeeds->nDirSeeds - N;
|
||||
AllSeeds->nDirSeeds=N;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
if(opt_inv){
|
||||
|
||||
for(u=0, N=0, curSeed=AllSeeds->invSeeds; u < AllSeeds->nInvSeeds; u++,curSeed++)
|
||||
{
|
||||
TotalR = 0.0;
|
||||
|
||||
for(i=0; i < curSeed->length ; i++)
|
||||
TotalR += (double)table_r[ curSeed->pos1+i] + table_r[curSeed->pos2+i]; /* add both repeats together */
|
||||
|
||||
MeanR = TotalR / (2.0 * (double) curSeed->length);
|
||||
curSeed->rmean = MeanR;
|
||||
|
||||
if( opt_filter &&
|
||||
( (Max && MeanR<=Max) || !Max) &&
|
||||
( (min && MeanR>=min) || !min) )
|
||||
{
|
||||
AllSeeds->invSeeds[N].pos1 = curSeed->pos1; /* thus keep it */
|
||||
AllSeeds->invSeeds[N].pos2 = curSeed->pos2;
|
||||
AllSeeds->invSeeds[N].length = curSeed->length;
|
||||
AllSeeds->invSeeds[N].rmean = curSeed->rmean;
|
||||
N++;
|
||||
}
|
||||
}
|
||||
|
||||
if(opt_filter)
|
||||
{
|
||||
AllSeeds->nFilteredInvSeeds = AllSeeds->nInvSeeds-N; /* reset the memory */
|
||||
AllSeeds->nInvSeeds=N;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if(opt_filter)
|
||||
KMRK_compactSeeds(AllSeeds);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void KMRK_FamilySeeds(AllSeeds_type *AllSeeds,
|
||||
double min,
|
||||
double Max,
|
||||
int8_t opt_dir,
|
||||
int8_t opt_inv,
|
||||
int32_t size_chr)
|
||||
{
|
||||
|
||||
int32_t *table_r; /* the seed table R */
|
||||
|
||||
if( !AllSeeds->nDirSeeds && !AllSeeds->nInvSeeds )return;
|
||||
|
||||
if(!opt_dir && !opt_inv)
|
||||
fprintf(stderr, "FilterSeeds: opt_inv or opt_dir must be set to 1, bye\n"),exit(1);
|
||||
|
||||
table_r = KMRK_SeedTableR( AllSeeds, opt_dir, opt_inv, size_chr); /* calculate the seed table R */
|
||||
|
||||
BuiltFilterFamily_seeds(AllSeeds, table_r, min, Max, opt_dir, opt_inv); /* filter seeds if they does not fit between min and Max */
|
||||
|
||||
MyFree(table_r, size_chr*sizeof(int32_t));
|
||||
|
||||
return ;
|
||||
}
|
||||
|
74
src/repseek/repseek-2014.09/filter.h
Normal file
74
src/repseek/repseek-2014.09/filter.h
Normal file
@ -0,0 +1,74 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
#ifndef KMRK_filter_h
|
||||
#define KMRK_filter_h
|
||||
|
||||
|
||||
#include "KMRK.h"
|
||||
#include "KMRK_Seeds.h"
|
||||
#include <stdio.h>
|
||||
|
||||
void KMRK_FamilySeeds(AllSeeds_type *AllSeeds,
|
||||
double min,
|
||||
double Max,
|
||||
int8_t opt_dir,
|
||||
int8_t opt_inv,
|
||||
int32_t size_chr);
|
||||
|
||||
int32_t *KMRK_SeedTableR(AllSeeds_type *AllSeeds,
|
||||
int8_t opt_dir,
|
||||
int8_t opt_inv,
|
||||
int32_t size_chr);
|
||||
|
||||
void KMRK_SeedTableR2seq(AllSeeds_type *AllSeeds,
|
||||
int8_t opt_dir,
|
||||
int8_t opt_inv,
|
||||
int32_t size1,
|
||||
int32_t size2,
|
||||
int32_t **r1,
|
||||
int32_t **r2);
|
||||
|
||||
void KMRK_FamilySeeds2seq(AllSeeds_type *AllSeeds,
|
||||
double min,
|
||||
double Max,
|
||||
int8_t opt_dir,
|
||||
int8_t opt_inv,
|
||||
int32_t size1,
|
||||
int32_t size2);
|
||||
|
||||
|
||||
void BuiltFilterFamily_seeds(AllSeeds_type *AllSeeds,
|
||||
int32_t *table_r,
|
||||
double min,
|
||||
double Max,
|
||||
int8_t opt_dir,
|
||||
int8_t opt_inv);
|
||||
|
||||
void BuiltFilterFamily_seeds2seq(AllSeeds_type *AllSeeds,
|
||||
int32_t *table_r1,
|
||||
int32_t *table_r2,
|
||||
double min,
|
||||
double Max,
|
||||
int8_t opt_dir,
|
||||
int8_t opt_inv);
|
||||
|
||||
|
||||
|
||||
#endif /* KMRK_filter_h */
|
252
src/repseek/repseek-2014.09/filter2seq.c
Normal file
252
src/repseek/repseek-2014.09/filter2seq.c
Normal file
@ -0,0 +1,252 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/**
|
||||
* @file KMRK.filter.c
|
||||
* @author Eric Coissac <coissac@inrialpes.fr>
|
||||
* @date Tue Mar 2 18:33:41 2004
|
||||
* @modif Sept 2005
|
||||
*
|
||||
* @brief seed filtrage function
|
||||
*
|
||||
*
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include "filter.h"
|
||||
#include "memory.h"
|
||||
#include "KMRK.h"
|
||||
|
||||
|
||||
static void table_R2(int32_t *table_r1,
|
||||
int32_t *table_r2,
|
||||
Seed_type *seeds,
|
||||
int32_t nSeed,
|
||||
int32_t size1,
|
||||
int32_t size2);
|
||||
|
||||
|
||||
|
||||
static void table_R2(int32_t *table_r1,
|
||||
int32_t *table_r2,
|
||||
Seed_type *seeds,
|
||||
int32_t nSeed,
|
||||
int32_t size1,
|
||||
int32_t size2)
|
||||
{
|
||||
|
||||
int32_t u;
|
||||
Seed_type* curSeed;
|
||||
|
||||
for(u=0, curSeed=seeds; u<nSeed; u++, curSeed++)
|
||||
{
|
||||
if(curSeed->pos1 == -1)
|
||||
continue;
|
||||
|
||||
table_r1[ curSeed->pos1 ]+=1;
|
||||
|
||||
if( (curSeed->pos1 + curSeed->length) < size1)
|
||||
table_r1[ curSeed->pos1 + curSeed->length ]-=1;
|
||||
|
||||
table_r2[ curSeed->pos2 ]+=1;
|
||||
|
||||
if( (curSeed->pos2 + curSeed->length) < size2)
|
||||
table_r2[ curSeed->pos2 + curSeed->length ]-=1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void KMRK_SeedTableR2seq(AllSeeds_type *AllSeeds,
|
||||
int8_t opt_dir,
|
||||
int8_t opt_inv,
|
||||
int32_t size1,
|
||||
int32_t size2,
|
||||
int32_t **r1,
|
||||
int32_t **r2)
|
||||
{
|
||||
|
||||
int32_t *table_r1=NULL;
|
||||
int32_t *table_r2=NULL;
|
||||
int32_t count=0;
|
||||
int32_t i=0;
|
||||
|
||||
table_r1= MyCalloc( size1,sizeof(int32_t), "mem_tableR: calloc failed, bye" );
|
||||
table_r2= MyCalloc( size2,sizeof(int32_t), "mem_tableR: calloc failed, bye" );
|
||||
|
||||
if(opt_dir && AllSeeds->nDirSeeds)
|
||||
table_R2(table_r1, table_r2, AllSeeds->dirSeeds, AllSeeds->nDirSeeds, size1, size2);
|
||||
|
||||
if(opt_inv && AllSeeds->nInvSeeds)
|
||||
table_R2(table_r1,table_r2, AllSeeds->invSeeds, AllSeeds->nInvSeeds, size1, size2);
|
||||
|
||||
for(count=1, i=0; i<size1 ; i++)
|
||||
table_r1[i] = (count+=table_r1[i]);
|
||||
|
||||
for(count=1, i=0; i<size2 ; i++)
|
||||
table_r2[i] = (count+=table_r2[i]);
|
||||
|
||||
*r1 = table_r1;
|
||||
*r2 = table_r2;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
void BuiltFilterFamily_seeds2seq(AllSeeds_type *AllSeeds,
|
||||
int32_t *table_r1,
|
||||
int32_t *table_r2,
|
||||
double min,
|
||||
double Max,
|
||||
int8_t opt_dir,
|
||||
int8_t opt_inv)
|
||||
{
|
||||
|
||||
float MeanR1, MeanR2;
|
||||
float TotalR1, TotalR2;
|
||||
|
||||
Seed_type *curSeed;
|
||||
|
||||
int32_t u, i;
|
||||
|
||||
int32_t N=0;
|
||||
int8_t opt_filter=(!min && !Max)?0:1;
|
||||
|
||||
if(opt_dir){
|
||||
|
||||
for(u=0, N=0, curSeed=AllSeeds->dirSeeds; u<AllSeeds->nDirSeeds; u++,curSeed++)
|
||||
{
|
||||
|
||||
TotalR1 = 0.0;
|
||||
TotalR2 = 0.0;
|
||||
for(i=0; i < curSeed->length ; i++)
|
||||
{
|
||||
TotalR1 += table_r1[curSeed->pos1+i];
|
||||
TotalR2 += table_r2[curSeed->pos2+i]; /* add both repeats together */
|
||||
}
|
||||
|
||||
MeanR1 = (float)TotalR1 / ( (float) curSeed->length);
|
||||
MeanR2 = (float)TotalR2 / ( (float) curSeed->length);
|
||||
curSeed->rmean = MeanR1 + MeanR2 - 2;
|
||||
|
||||
if( opt_filter &&
|
||||
( ( (Max && MeanR1<=Max) || !Max) &&
|
||||
( (min && MeanR1>=min) || !min) ) &&
|
||||
( ( (Max && MeanR2<=Max) || !Max) &&
|
||||
( (min && MeanR2>=min) || !min) ) )
|
||||
|
||||
{ /* && has a higher priority than || */
|
||||
|
||||
AllSeeds->dirSeeds[N].pos1 = curSeed->pos1; /* thus keep it */
|
||||
AllSeeds->dirSeeds[N].pos2 = curSeed->pos2;
|
||||
AllSeeds->dirSeeds[N].length = curSeed->length;
|
||||
AllSeeds->dirSeeds[N].rmean = curSeed->rmean;
|
||||
|
||||
N++;
|
||||
}
|
||||
}
|
||||
|
||||
if(opt_filter)
|
||||
{
|
||||
AllSeeds->nFilteredDirSeeds = AllSeeds->nDirSeeds - N;
|
||||
AllSeeds->nDirSeeds=N;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
if(opt_inv){
|
||||
|
||||
for(u=0, N=0, curSeed=AllSeeds->invSeeds; u < AllSeeds->nInvSeeds; u++,curSeed++)
|
||||
{
|
||||
TotalR1 = 0.0;
|
||||
TotalR2 = 0.0;
|
||||
|
||||
for(i=0; i < curSeed->length ; i++)
|
||||
{
|
||||
TotalR1 += table_r1[curSeed->pos1+i];
|
||||
TotalR2 += table_r2[curSeed->pos2+i]; /* add both repeats together */
|
||||
}
|
||||
|
||||
MeanR1 = (float)TotalR1 / ( (float) curSeed->length);
|
||||
MeanR2 = (float)TotalR2 / ( (float) curSeed->length);
|
||||
curSeed->rmean = MeanR1 + MeanR2 -2;
|
||||
|
||||
if( opt_filter &&
|
||||
( ( (Max && MeanR1<=Max) || !Max) &&
|
||||
( (min && MeanR1>=min) || !min) ) &&
|
||||
( ( (Max && MeanR2<=Max) || !Max) &&
|
||||
( (min && MeanR2>=min) || !min) ) )
|
||||
{
|
||||
|
||||
AllSeeds->invSeeds[N].pos1 = curSeed->pos1; /* thus keep it */
|
||||
AllSeeds->invSeeds[N].pos2 = curSeed->pos2;
|
||||
AllSeeds->invSeeds[N].length = curSeed->length;
|
||||
AllSeeds->invSeeds[N].rmean = curSeed->rmean;
|
||||
N++;
|
||||
}
|
||||
}
|
||||
|
||||
if(opt_filter)
|
||||
{
|
||||
AllSeeds->nFilteredInvSeeds = AllSeeds->nInvSeeds-N; /* reset the memory */
|
||||
AllSeeds->nInvSeeds=N;
|
||||
}
|
||||
}
|
||||
|
||||
if(opt_filter)
|
||||
KMRK_compactSeeds(AllSeeds);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void KMRK_FamilySeeds2seq(AllSeeds_type *AllSeeds,
|
||||
double min,
|
||||
double Max,
|
||||
int8_t opt_dir,
|
||||
int8_t opt_inv,
|
||||
int32_t size1,
|
||||
int32_t size2)
|
||||
{
|
||||
|
||||
int32_t *table_r1; /* the seed table R */
|
||||
int32_t *table_r2; /* the seed table R */
|
||||
|
||||
if( !AllSeeds->nDirSeeds && !AllSeeds->nInvSeeds )
|
||||
return;
|
||||
|
||||
if(!opt_dir && !opt_inv)
|
||||
fprintf(stderr, "FilterSeeds: opt_inv or opt_dir must be set to 1, bye\n"),exit(1);
|
||||
|
||||
KMRK_SeedTableR2seq( AllSeeds,
|
||||
opt_dir, opt_inv,
|
||||
size1,size2,
|
||||
&table_r1,&table_r2); /* calculate the seed table R */
|
||||
|
||||
BuiltFilterFamily_seeds2seq(AllSeeds,
|
||||
table_r1, table_r2,
|
||||
min, Max,
|
||||
opt_dir, opt_inv); /* filter seeds if they does not fit between min and Max */
|
||||
|
||||
MyFree(table_r1, size1*sizeof(int32_t));
|
||||
MyFree(table_r2, size2*sizeof(int32_t));
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
195
src/repseek/repseek-2014.09/help.c
Normal file
195
src/repseek/repseek-2014.09/help.c
Normal file
@ -0,0 +1,195 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/******
|
||||
file : help.c
|
||||
function : Help for all repseek series
|
||||
|
||||
created : 02 Oct 2002
|
||||
modif : Oct 2003
|
||||
Dec 2004 Adds for option -D : managing of mask for seed detection <EC>
|
||||
Dec 2005 switch to a one mode version
|
||||
|
||||
author : amikezor
|
||||
*****/
|
||||
|
||||
#include "repseek_types.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
static void help_contact(){
|
||||
fprintf(stderr," Contact\n"
|
||||
" if you need help, want to report a bug or suggest an option,\n"
|
||||
" please contact G. Achaz by email: achaz(at)abi.snv.jussieu.fr\n"
|
||||
"\n"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void help_information(){
|
||||
fprintf(stderr," Information\n"
|
||||
"\t-v : Print 'v'ersion and exit\n"
|
||||
"\t-h : Print a short 'h'elp and exit\n"
|
||||
"\t-H : Print a larger 'H'elp (with input and output formats) and exit\n"
|
||||
"\n");
|
||||
}
|
||||
|
||||
static void help_corevalues(){
|
||||
fprintf(stderr, " Core values (select either at least one of the four -l/-p -L/-P option)\n"
|
||||
" seed minimum length: lmin\n"
|
||||
"\t-l lmin : set a minimum length for the seeds detection (no pvalue).\n"
|
||||
"\t-p prob : set a p-value that gives a minimum seed length (Karlin & Ost equation)\n"
|
||||
" repeat minimum score: smin\n"
|
||||
"\t-L smin : set a minimum score for extended repeat (def is 0: no filter).\n"
|
||||
"\t-P prob : set a p-value that gives a minimum repeat score (Waterman & Vingron regression)\n"
|
||||
"\n"
|
||||
);
|
||||
}
|
||||
|
||||
static void help_options( ){
|
||||
|
||||
fprintf(stderr," Optionnal values\n"
|
||||
|
||||
"\t-r file : 'r'edirect output into file (otherwise stdout is used).\n"
|
||||
"\t-T : print out R-'T'able\n"
|
||||
"\t the R-Table (or R-Table2) shows all non-unique position and its degree of redundancy\n"
|
||||
|
||||
"\t-S : output 'S'eeds and exit (do not perform extention).\n"
|
||||
"\t-s seed_file : use seeds given in seed_file instead of detecting them\n"
|
||||
"\t File format is 'd|i begin end len [optionnal other fields]'.\n"
|
||||
"\t-D mask_file : mask sequence regions during seed detection only (cannot be used with -s seed_file).\n"
|
||||
"\t File format is 'begin end [seq#]' (seq# is 1 or 2; nothing is treated as 1).\n"
|
||||
"\t-R 0.## : merge 'R'epeats when they share 0.## of their both copies (default = 0.90).\n"
|
||||
"\t When set to 1.0, merge repeats having exactly the same positions\n"
|
||||
|
||||
"\t-d : detect only 'd'irect repeats.\n"
|
||||
"\t-i : detect only 'i'nverted repeats.\n"
|
||||
"\t (-d and -i are mutually exclusive).\n"
|
||||
"\t-B : if some seeds overlaps (i.e. in low-complexity sequence), just keep the 'B'iggest one.\n"
|
||||
"\t-m #.# : keep only seeds occurring at least the specified minimum number of times (min is a float).\n"
|
||||
"\t-M #.# : keep only seeds occurring less than the specified maximum number of times (Max is a float).\n"
|
||||
"\t-O 0|1 : for 1 sequence, direct repeat can have their 2 copies 'O'verlapping (0: no, 1: yes -default-)\n"
|
||||
"\t-c : set the chromosome as 'c'ircular -default is linear- (unused for 2 sequences)\n"
|
||||
"\t-o #.# : set gap_open penalty -default 4.0- (cannot be change when -P is used)\n"
|
||||
"\t-e #.# : set gap_ext penalty -default 1.0- (cannot be change when -P is used)\n"
|
||||
"\t-X #.# : set 'X'g, the 'exploration value' -default 30.0-\n"
|
||||
"\n"
|
||||
);
|
||||
|
||||
|
||||
}
|
||||
|
||||
static void help_input( void ){
|
||||
|
||||
fprintf(stderr," Input\n"
|
||||
" Input file(s) should be in a fasta format.\n"
|
||||
" All non-ACGT characters are considered as N, except for X in which no repeats are detected (mask)\n"
|
||||
"\n"
|
||||
);
|
||||
}
|
||||
|
||||
static void help_output( void ){
|
||||
|
||||
fprintf(stderr," Output\n"
|
||||
"\tRepeats are displayed in 12 COLUMNS\n"
|
||||
"\t 01 - type of the repeat (Tandem|Close|Distant|Overlap|Palindrome|Interseq).(dir|inv)\n"
|
||||
"\t 'Tandem' : repeat is a perfect tandem (spacer=0)\n"
|
||||
"\t 'Close' : spacer is smaller than 1 kb\n"
|
||||
"\t 'Distant' : spacer is larger than 1 kb\n"
|
||||
"\t 'Overlap' : repeat is composed of two overlapping copies (spacer<0)\n"
|
||||
"\t 'Palindrome' : repeat is a perfect palindrome (spacer=0)\n"
|
||||
"\t 'Interseq' : repeat is shared by two sequences\n"
|
||||
"\t 02 - position of the first copy\n"
|
||||
"\t 03 - position of the second copy\n"
|
||||
"\t 04 - length of the first copy\n"
|
||||
"\t 05 - length of the second copy\n"
|
||||
"\t 06 - spacer of the repeats (corrected when the sequence is circular)\n"
|
||||
"\t 07 - characteristics of the seed that gave that repeat (pos1-pos2-len-meanR)\n"
|
||||
"\t 08 - %%identity between the two copies (matches / alignment_length)\n"
|
||||
"\t 09 - score of the alignement\n"
|
||||
"\t 10 - mean-R of the repeat\n"
|
||||
"\t 11 - mode-R of the repeat\n"
|
||||
"\t 12 - fraction of its mode-R\n"
|
||||
"\n"
|
||||
);
|
||||
|
||||
fprintf(stderr,"\tSeeds are displayed in 5 COLUMNS\n"
|
||||
"\t 01 - Type of the repeat (Seed).(dir|inv)\n"
|
||||
"\t 02 - position of the first copie\n"
|
||||
"\t 03 - position of the second copie\n"
|
||||
"\t 04 - length of the seed\n"
|
||||
"\t 05 - mean-R\n\n"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
void printversion_repseek(){
|
||||
fprintf(stderr,"repseek version %s (%s)\n", REPSEEK_VERSION, REPSEEK_DATE);
|
||||
}
|
||||
|
||||
|
||||
void printusage_repseek(char **argv){
|
||||
|
||||
fprintf(stderr,"General usage is '%s [-v|-h|-H] [ opts ] { core_value } file.fst [file2.fst]'\n\n",argv[0]);
|
||||
help_corevalues();
|
||||
help_information();
|
||||
}
|
||||
|
||||
void printhelp_repseek(char **argv){
|
||||
|
||||
fprintf(stderr,"General usage is '%s [-v|-h|-H] [ opts ] { core_value } file.fst [file2.fst]'\n\n",argv[0]);
|
||||
|
||||
help_corevalues();
|
||||
help_options( );
|
||||
help_information();
|
||||
|
||||
help_input();
|
||||
help_contact();
|
||||
}
|
||||
|
||||
void printmorehelp_repseek(char **argv){
|
||||
|
||||
fprintf(stderr,"/*\n"
|
||||
"\tRepseek looks for all repeats couples within (one fasta file) or between sequences (two files)\n"
|
||||
"\tThe method is based on a two steps method (seed detection followed by their extension into repeats)\n"
|
||||
"\tThe statistical significance of repeats can be (i) not evaluated (-l opt),\n"
|
||||
"\t(ii) evaluated for seeds (-p opt) or (iii) evaluated for repeats (-P opt). For further details, \n"
|
||||
"\tplease refer to the manuscript (Achaz et al., in prep)\n"
|
||||
"*/\n"
|
||||
);
|
||||
|
||||
fprintf(stderr,"General usage is '%s [-v|-h|-H] [ opts ] { core_value } file.fst [file2.fst]'\n\n",argv[0]);
|
||||
|
||||
help_corevalues();
|
||||
help_options( );
|
||||
help_information();
|
||||
|
||||
help_input();
|
||||
help_output();
|
||||
|
||||
help_contact();
|
||||
}
|
||||
|
51
src/repseek/repseek-2014.09/help.h
Normal file
51
src/repseek/repseek-2014.09/help.h
Normal file
@ -0,0 +1,51 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
|
||||
#ifndef _help_h
|
||||
#define _help_h
|
||||
|
||||
|
||||
/******
|
||||
file : help.c
|
||||
function : Help for all repseek series
|
||||
|
||||
created : 02 Oct 2002
|
||||
modif : Oct 2003
|
||||
|
||||
author : amikezor
|
||||
*****/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
/********* *********
|
||||
|
||||
All you need for RepSeek
|
||||
|
||||
********* *********/
|
||||
|
||||
void printversion_repseek();
|
||||
void printusage_repseek(char **argv);
|
||||
void printhelp_repseek(char **argv);
|
||||
void printmorehelp_repseek(char **argv);
|
||||
|
||||
#endif
|
||||
|
639
src/repseek/repseek-2014.09/lmin.c
Normal file
639
src/repseek/repseek-2014.09/lmin.c
Normal file
@ -0,0 +1,639 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/******
|
||||
file : lmin.c
|
||||
function : get lmin from a pvalue
|
||||
created : Oct 11 2002
|
||||
|
||||
note : almost all functions comes from lari_stat.c
|
||||
Apr. 97 <AV> first draft
|
||||
April 98 <ed> K formula added
|
||||
Oct 2002 <amikezor> - subset of lari_stat - add LminRiskSeq()
|
||||
*****/
|
||||
|
||||
|
||||
#include <ctype.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "repseek_types.h"
|
||||
#include "memory.h"
|
||||
|
||||
#define NALPHA 4
|
||||
|
||||
typedef struct {
|
||||
int32_t nalpha;
|
||||
int32_t total_symb, val_symb, bad_symb;
|
||||
int32_t count[NALPHA+1];
|
||||
float freq[NALPHA+1];
|
||||
char alpha[NALPHA+2];
|
||||
} FreqTable;
|
||||
|
||||
#undef NALPHA
|
||||
|
||||
|
||||
|
||||
/***
|
||||
Few function coming from numrec as it
|
||||
***/
|
||||
|
||||
static void nrerror(char *error_text)
|
||||
|
||||
{
|
||||
fprintf(stderr,"> Numerical Recipes run-time error...\n");
|
||||
fprintf(stderr,"> %s\n", error_text);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
static float gammln(float xx)
|
||||
{
|
||||
double x,y,tmp,ser;
|
||||
static double cof[6]={76.18009172947146,-86.50532032941677,
|
||||
24.01409824083091,-1.231739572450155,
|
||||
0.1208650973866179e-2,-0.5395239384953e-5};
|
||||
int j;
|
||||
|
||||
y=x=xx;
|
||||
tmp=x+5.5;
|
||||
tmp -= (x+0.5)*log(tmp);
|
||||
ser=1.000000000190015;
|
||||
for (j=0;j<=5;j++) ser += cof[j]/++y;
|
||||
return -tmp+log(2.5066282746310005*ser/x);
|
||||
}
|
||||
|
||||
#define ITMAX 100
|
||||
#define EPS 3.0e-7
|
||||
|
||||
static void gser(float *gamser, float a, float x, float *gln)
|
||||
{
|
||||
int n;
|
||||
float sum,del,ap;
|
||||
|
||||
*gln=gammln(a);
|
||||
if (x <= 0.0) {
|
||||
if (x < 0.0) nrerror("x less than 0 in routine gser");
|
||||
*gamser=0.0;
|
||||
return;
|
||||
} else {
|
||||
ap=a;
|
||||
del=sum=1.0/a;
|
||||
for (n=1;n<=ITMAX;n++) {
|
||||
++ap;
|
||||
del *= x/ap;
|
||||
sum += del;
|
||||
if (fabs(del) < fabs(sum)*EPS) {
|
||||
*gamser=sum*exp(-x+a*log(x)-(*gln));
|
||||
return;
|
||||
}
|
||||
}
|
||||
nrerror("a too large, ITMAX too small in routine gser");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
#define FPMIN 1.0e-30
|
||||
|
||||
static void gcf(float *gammcf, float a, float x, float *gln)
|
||||
{
|
||||
int i;
|
||||
float an,b,c,d,del,h;
|
||||
|
||||
*gln=gammln(a);
|
||||
b=x+1.0-a;
|
||||
c=1.0/FPMIN;
|
||||
d=1.0/b;
|
||||
h=d;
|
||||
for (i=1;i<=ITMAX;i++) {
|
||||
an = -i*(i-a);
|
||||
b += 2.0;
|
||||
d=an*d+b;
|
||||
if (fabs(d) < FPMIN) d=FPMIN;
|
||||
c=b+an/c;
|
||||
if (fabs(c) < FPMIN) c=FPMIN;
|
||||
d=1.0/d;
|
||||
del=d*c;
|
||||
h *= del;
|
||||
if (fabs(del-1.0) < EPS) break;
|
||||
}
|
||||
if (i > ITMAX) nrerror("a too large, ITMAX too small in gcf");
|
||||
*gammcf=exp(-x+a*log(x)-(*gln))*h;
|
||||
}
|
||||
|
||||
#undef ITMAX
|
||||
#undef EPS
|
||||
#undef FPMIN
|
||||
|
||||
static float gammq(float a, float x)
|
||||
{
|
||||
float gamser=0.0,
|
||||
gammcf=0.0,
|
||||
gln=0.0;
|
||||
|
||||
if (x < 0.0 || a <= 0.0) nrerror("Invalid arguments in routine gammq");
|
||||
if (x < (a+1.0)) {
|
||||
gser(&gamser,a,x,&gln);
|
||||
return 1.0-gamser;
|
||||
} else {
|
||||
gcf(&gammcf,a,x,&gln);
|
||||
return gammcf;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
/* compute symbol frequencies */
|
||||
/* from seq and alphabet */
|
||||
/* ----------------------------------------------- */
|
||||
|
||||
int InitFreqTable(char *seq, char *alpha, FreqTable *table)
|
||||
{
|
||||
int i;
|
||||
float norm;
|
||||
char *s, *found;
|
||||
|
||||
strcpy(table->alpha, alpha);
|
||||
|
||||
table->nalpha = strlen(alpha);
|
||||
|
||||
for (i = 0 ; i <= table->nalpha ; i++)
|
||||
table->count[i] = 0;
|
||||
|
||||
table->total_symb = 0;
|
||||
for (s = seq; *s; s++) {
|
||||
table->total_symb++;
|
||||
if ( (found=strchr(alpha, *s) ))
|
||||
table->count[(int)(found - alpha)]++;
|
||||
else
|
||||
table->count[table->nalpha]++;
|
||||
}
|
||||
|
||||
table->bad_symb = table->count[table->nalpha];
|
||||
table->val_symb = table->total_symb - table->bad_symb;
|
||||
|
||||
norm = MAX(1., table->val_symb);
|
||||
|
||||
for (i = 0 ; i < table->nalpha ; i++)
|
||||
table->freq[i] = (float) table->count[i] / norm;
|
||||
|
||||
return table->val_symb;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
/* compute x^n */
|
||||
/* ----------------------------------------------- */
|
||||
|
||||
static double sPower(double x, int n)
|
||||
{
|
||||
double pow = 1.;
|
||||
|
||||
while (n-- > 0)
|
||||
pow *= x;
|
||||
|
||||
return pow;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
/* compute lambdaR */
|
||||
/* ----------------------------------------------- */
|
||||
|
||||
static double LambdaR(FreqTable *table, int r)
|
||||
{
|
||||
int i;
|
||||
double lamb = 0.;
|
||||
|
||||
for (i = 0 ; i < table->nalpha ; i++)
|
||||
lamb += sPower((double)table->freq[i], r);
|
||||
|
||||
return lamb;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
/* log of binomial coeff */
|
||||
/* ----------------------------------------------- */
|
||||
|
||||
static double sLogBinoCof(int n, int p)
|
||||
{
|
||||
double lb = 0.;
|
||||
|
||||
while (p >= 1)
|
||||
lb += log((double)n-- / (double)p--);
|
||||
|
||||
return lb;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
/* compute a(x,y) */
|
||||
/* ----------------------------------------------- */
|
||||
|
||||
static double sAlpha(double x, double y)
|
||||
{
|
||||
double x1, x2;
|
||||
|
||||
x1 = (log(1. - x) + log(y)) / log(x);
|
||||
x2 = 0.57772 / log(x);
|
||||
|
||||
return -(x1 + x2);
|
||||
}
|
||||
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
/* compute E(L(N,r)) */
|
||||
/* ----------------------------------------------- */
|
||||
|
||||
static double ExpectL(FreqTable *table, int N, int r)
|
||||
{
|
||||
double x1, x2, lambr;
|
||||
|
||||
lambr = LambdaR(table, r);
|
||||
x1 = sLogBinoCof(N, r) / -log(lambr);
|
||||
x2 = sAlpha(lambr, lambr);
|
||||
|
||||
return x1 + x2 + 0.5;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
/* compute Var(N,r) */
|
||||
/* ----------------------------------------------- */
|
||||
|
||||
double VarianceL(FreqTable *table, int r)
|
||||
{
|
||||
double x;
|
||||
|
||||
x = 1. / log(LambdaR(table, r));
|
||||
|
||||
return 1.645 * x * x;
|
||||
}
|
||||
/* ----------------------------------------------- */
|
||||
/* compute recursive rlogN */
|
||||
/* ----------------------------------------------- */
|
||||
|
||||
static void finN(indx, r, data, res)
|
||||
int *indx, r, *data;
|
||||
double *res;
|
||||
{
|
||||
int i;
|
||||
double aux;
|
||||
|
||||
for (i = 0, aux=1. ; i < r ; i++)
|
||||
aux *= data[indx[i]];
|
||||
|
||||
*res += aux;
|
||||
|
||||
|
||||
}
|
||||
|
||||
static void recursN(indx, s, r, rcurr, ipos, data, res)
|
||||
int *indx, *data;
|
||||
double *res;
|
||||
int s, r, rcurr, ipos;
|
||||
{
|
||||
int i;
|
||||
|
||||
if (rcurr == r) /* end of recursion */
|
||||
finN(indx, r, data, res);
|
||||
else { /* body of recursion */
|
||||
|
||||
for (i = ipos ; i < s ; i++) {
|
||||
indx[rcurr] = i;
|
||||
recursN(indx, s, r, rcurr+1, i+1, data, res);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
/* compute recursive Rlambda */
|
||||
/* ----------------------------------------------- */
|
||||
|
||||
static void finR(indx, r, data, res)
|
||||
int *indx, r;
|
||||
double *res;
|
||||
FreqTable *data;
|
||||
{
|
||||
int i, j;
|
||||
double aux, aux2;
|
||||
|
||||
for (j=0, aux2=0; j<4; j++){
|
||||
for (i = 0, aux=1. ; i < r ; i++)
|
||||
aux *= (double) data[indx[i]].freq[j];
|
||||
aux2 += aux;
|
||||
}
|
||||
|
||||
if (aux2>*res)
|
||||
*res = aux2;
|
||||
|
||||
}
|
||||
|
||||
static void recursR(indx, s, r, rcurr, ipos, data, res)
|
||||
int *indx, s, r, rcurr, ipos;
|
||||
double *res;
|
||||
FreqTable *data;
|
||||
{
|
||||
int i;
|
||||
|
||||
if (rcurr == r) /* end of recursion */
|
||||
finR(indx, r, data, res);
|
||||
else { /* body of recursion */
|
||||
|
||||
for (i = ipos ; i < s ; i++) {
|
||||
indx[rcurr] = i;
|
||||
recursR(indx, s, r, rcurr+1, i+1, data, res);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
/* binomial coeff */
|
||||
/* ----------------------------------------------- */
|
||||
|
||||
static double sBinoCof(int n, int p)
|
||||
{
|
||||
double b = 1.;
|
||||
|
||||
while (p >= 1)
|
||||
b *= ((double)n-- / (double)p--);
|
||||
|
||||
return b;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
/* compute E(K(N,r)) */
|
||||
/* ----------------------------------------------- */
|
||||
|
||||
|
||||
|
||||
static double ExpectK(FreqTable *table, int r, int s)
|
||||
{
|
||||
double x2, x3, lambr;
|
||||
int *indx, *data, i, nelt;
|
||||
double x1;
|
||||
|
||||
if (s!=r)
|
||||
nelt = ceil(sBinoCof(s, r));
|
||||
else
|
||||
nelt = 1;
|
||||
|
||||
indx = (int32_t *)MyCalloc( nelt, sizeof(int32_t), "ExpectK out of memory" ) ;
|
||||
data = (int32_t *)MyCalloc( s, sizeof(int32_t), "ExpectK out of memory" ) ;
|
||||
|
||||
x1=0.;
|
||||
|
||||
for (i=0; i<s; i++)
|
||||
data[i] = table[i].total_symb;
|
||||
|
||||
recursN(indx, s, r, 0, 0, data, &x1);
|
||||
|
||||
lambr=0;
|
||||
|
||||
recursR(indx, s, r, 0, 0, table, &lambr);
|
||||
|
||||
x2 = log(x1) / -log(lambr);
|
||||
x3 = sAlpha(lambr, lambr);
|
||||
|
||||
MyFree(indx, nelt* sizeof(int32_t));
|
||||
MyFree(data, s* sizeof(int32_t));
|
||||
|
||||
return x2 + x3 + 0.5;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
/* compute VarK (N,r) */
|
||||
/* ----------------------------------------------- */
|
||||
|
||||
static double VarianceK(FreqTable *table, int r)
|
||||
{
|
||||
double x;
|
||||
|
||||
x = 1. / log(LambdaR(table, r));
|
||||
|
||||
return 1.645 * x * x;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
/* compute Erf(x) */
|
||||
/* ----------------------------------------------- */
|
||||
static double sErf(double x)
|
||||
{
|
||||
double gamp;
|
||||
|
||||
gamp = 1.0 - gammq(0.5, x * x);
|
||||
return ((x < 0.) ? -gamp : gamp);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
/* compute cumulative of Normal */
|
||||
/* ----------------------------------------------- */
|
||||
#define SQRT2 1.4142135624
|
||||
static double sCumNormal(double x, double mu, double sigma)
|
||||
{
|
||||
double xx;
|
||||
|
||||
xx = (x - mu) / sigma / SQRT2;
|
||||
|
||||
return (1.0 + sErf(xx)) / 2.;
|
||||
}
|
||||
|
||||
#undef SQRT2
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
/* compute Lmin(N,r) at error level p */
|
||||
/* ----------------------------------------------- */
|
||||
|
||||
#define TOLERANCE 1e-6
|
||||
|
||||
static double LminRisk(FreqTable *table, int N, int r, float p)
|
||||
{
|
||||
double pp, mu, sigma, xmin, xmax, xcurr, pcurr;
|
||||
|
||||
if (p <= 0.)
|
||||
return 1.;
|
||||
if (p >= 1.)
|
||||
return (double) N;
|
||||
|
||||
pp = 1.0 - (double) p;
|
||||
mu = ExpectL(table, N, r);
|
||||
sigma = sqrt(VarianceL(table, r));
|
||||
|
||||
xmin = 1;
|
||||
xmax = N;
|
||||
xcurr = (xmin + xmax) / 2. ;
|
||||
pcurr = sCumNormal(xcurr, mu, sigma);
|
||||
|
||||
/* dichotomic search */
|
||||
|
||||
while (fabs(pp - pcurr) > TOLERANCE) {
|
||||
if (pp > pcurr)
|
||||
xmin = xcurr;
|
||||
else
|
||||
xmax = xcurr;
|
||||
xcurr = (xmin + xmax) / 2. ;
|
||||
pcurr = sCumNormal(xcurr, mu, sigma);
|
||||
}
|
||||
|
||||
return xcurr;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
/* compute K Lmin(N,r) at error level p */
|
||||
/* ----------------------------------------------- */
|
||||
|
||||
static double KLminRisk(FreqTable *table, double rlogn, int r, int s,
|
||||
float p, int len)
|
||||
{
|
||||
double pp, mu, sigma, xmin, xmax, xcurr, pcurr;
|
||||
|
||||
if (p <= 0.)
|
||||
return 1.;
|
||||
if (p >= 1.)
|
||||
return rlogn;
|
||||
|
||||
pp = 1.0 - (double) p;
|
||||
mu = ExpectK(table, r, s);
|
||||
sigma = sqrt(VarianceK(table, r));
|
||||
|
||||
xmin = 1;
|
||||
xmax = len;
|
||||
xcurr = (xmin + xmax) / 2. ;
|
||||
pcurr = sCumNormal(xcurr, mu, sigma);
|
||||
|
||||
/* dichotomic search */
|
||||
|
||||
while (fabs(pp - pcurr) > TOLERANCE) {
|
||||
if (pp > pcurr)
|
||||
xmin = xcurr;
|
||||
else
|
||||
xmax = xcurr;
|
||||
xcurr = (xmin + xmax) / 2. ;
|
||||
pcurr = sCumNormal(xcurr, mu, sigma);
|
||||
}
|
||||
|
||||
return xcurr;
|
||||
}
|
||||
|
||||
#undef TOLERANCE
|
||||
|
||||
/***
|
||||
Take a sequencem the number of occurence and the prob and
|
||||
return an associated lmin
|
||||
***/
|
||||
|
||||
#define DNA_ALPHABET "ACGT"
|
||||
|
||||
static double LminRiskSeq(char *sequence, int r, float p)
|
||||
{
|
||||
|
||||
int nSymbols=0;
|
||||
char alphabet[32]={0,};
|
||||
FreqTable ftable;
|
||||
|
||||
double Lmin=0.0;
|
||||
|
||||
(void) strcpy(alphabet, DNA_ALPHABET);
|
||||
|
||||
(void) InitFreqTable(sequence, alphabet, &ftable);
|
||||
|
||||
nSymbols = ftable.val_symb;
|
||||
|
||||
Lmin = LminRisk(& ftable, nSymbols, r, p);
|
||||
|
||||
return Lmin;
|
||||
}
|
||||
|
||||
|
||||
|
||||
int16_t set_lmin(float pval, char *sequence){
|
||||
|
||||
float flmin;
|
||||
int16_t lmin;
|
||||
|
||||
flmin = LminRiskSeq(sequence, 2, pval);
|
||||
|
||||
if(flmin - (float)(int16_t)flmin < 0.5)
|
||||
lmin = (int16_t)flmin;
|
||||
else
|
||||
lmin = (int16_t)flmin + 1;
|
||||
|
||||
return lmin;
|
||||
}
|
||||
|
||||
/***
|
||||
Take a sequence the number of occurence and the prob and
|
||||
return an associated lmin
|
||||
***/
|
||||
|
||||
#define DNA_ALPHABET "ACGT"
|
||||
|
||||
static double LminRisk2Seq(char *Sequence1,
|
||||
char *Sequence2,
|
||||
int32_t SeqSize1,
|
||||
int32_t SeqSize2,
|
||||
int r, float p)
|
||||
{
|
||||
|
||||
char alphabet[32]={0,}; /* an theoretical bigger alphabet than ACGT */
|
||||
FreqTable ftable[2]; /* a freq for each sequence */
|
||||
|
||||
int nseq=2; /* number of sequences */
|
||||
|
||||
double Lmin=0.0; /* the value of interest */
|
||||
|
||||
(void) strcpy(alphabet, DNA_ALPHABET);
|
||||
|
||||
(void) InitFreqTable(Sequence1, alphabet, &ftable[0]); /* init freq of the first seq */
|
||||
(void) InitFreqTable(Sequence2, alphabet, &ftable[1]); /* init freq of the second seq */
|
||||
|
||||
Lmin = KLminRisk(ftable, (double) r*log(SeqSize1+SeqSize2/nseq), r, nseq, p, SeqSize1+SeqSize2);
|
||||
|
||||
return Lmin;
|
||||
}
|
||||
|
||||
#undef DNA_ALPHABET
|
||||
|
||||
|
||||
int16_t set_lmin_2seqs(float pval,
|
||||
char *Sequence1,
|
||||
char *Sequence2,
|
||||
int32_t SeqSize1,
|
||||
int32_t SeqSize2)
|
||||
{
|
||||
|
||||
double flmin;
|
||||
int16_t lmin;
|
||||
int8_t r=2;
|
||||
|
||||
flmin = LminRisk2Seq(Sequence1, Sequence2, SeqSize1, SeqSize2, r, pval);
|
||||
|
||||
if(flmin - (double)(int16_t)flmin < 0.5)
|
||||
lmin = (int16_t)flmin;
|
||||
else
|
||||
lmin = (int16_t)flmin + 1;
|
||||
|
||||
return lmin;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#undef DNA_ALPHABET
|
||||
|
33
src/repseek/repseek-2014.09/lmin.h
Normal file
33
src/repseek/repseek-2014.09/lmin.h
Normal file
@ -0,0 +1,33 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
|
||||
#ifndef lmin_h
|
||||
#define lmin_h
|
||||
|
||||
|
||||
int16_t set_lmin(float pval, char *sequence);
|
||||
|
||||
int16_t set_lmin_2seqs(float pval,
|
||||
char *Sequence1,
|
||||
char *Sequence2,
|
||||
int32_t SeqSize1,
|
||||
int32_t SeqSize2);
|
||||
|
||||
#endif /* lmin_h */
|
671
src/repseek/repseek-2014.09/main_repseek.c
Normal file
671
src/repseek/repseek-2014.09/main_repseek.c
Normal file
@ -0,0 +1,671 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/**
|
||||
* @file main_repseek.c
|
||||
* @author Guillaume Achaz <achaz@abi.snv.jussieu.fr>, Eric Coissac <coissac@inrialpes.fr>
|
||||
* @date Feb 26 2004
|
||||
* @modif April 2004
|
||||
* @modif Dec 2004 Adds for option -D : managing of mask for seed detection <EC>
|
||||
* @modif sept 2005 re-write all align_blast2like. Correct Memory bugs <GA>
|
||||
* @modif 17 oct 2005 - mode '4' <GA>
|
||||
* @modif dec 2005 - remove mode choice <GA>
|
||||
* @modif jan-fev-march 2006 - stable version (-l, -p or -P options) <GA>
|
||||
* @modif may 2008 - count Xs in the sequence and remove them to compute smin <GA>
|
||||
*
|
||||
* @brief main function of repseek program
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
|
||||
#if defined(MACOSX) || defined(LINUX) || defined(OSF) /* for options */
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#include "repseek_types.h"
|
||||
|
||||
#include "sequence.h"
|
||||
#include "readfst.h"
|
||||
#include "output.h"
|
||||
#include "help.h"
|
||||
#include "families.h"
|
||||
|
||||
#include "KMRK.h"
|
||||
#include "KMRK_Seeds.h"
|
||||
#include "KMRK_merge_seeds.h"
|
||||
#include "KMRK_mask.h"
|
||||
|
||||
#include "read_seeds.h"
|
||||
|
||||
#include "filter.h"
|
||||
#include "align.h"
|
||||
|
||||
#include "memory.h"
|
||||
|
||||
#include "smin.h"
|
||||
#include "lmin.h"
|
||||
|
||||
int main(int argc, char **argv){
|
||||
|
||||
|
||||
/*
|
||||
Repseek Core Values
|
||||
*/
|
||||
|
||||
int32_t selected_l=0; /* the lmin chosen by the user --if any-- */
|
||||
|
||||
int32_t lmin=0; /* the statistical lmin for seeds detection */
|
||||
|
||||
double selected_s=0; /* estimated score minimum if repseek is running in mode '3' */
|
||||
double smin=0; /* estimated score minimum if repseek is running in mode '3' */
|
||||
|
||||
float pval_lmin=0.0; /* a pval that gives an Lmin (Karlin & Ost equation) */
|
||||
float pval_smin=0.0; /* a pval that gives an Smin (Watermann & Vingron regression) */
|
||||
|
||||
|
||||
/*
|
||||
Sequence(s)
|
||||
*/
|
||||
|
||||
int8_t repseek_nseq; /* either 1 or 2, 1 being for 1 sequence and 2 for 2 sequences */
|
||||
|
||||
char *sequence; /* the first sequence */
|
||||
char *sequence2=NULL; /* the second sequence --only use when repseek_nseq is 2 */
|
||||
int32_t size; /* the size of the sequence */
|
||||
int32_t size2=0; /* the size of the second sequence --only use when repseek_nseq is 2*/
|
||||
int32_t totalSize; /* size of (both) sequence(s) */
|
||||
|
||||
int32_t nX=0, /* number of X in the first and the second sequence */
|
||||
nX2=0;
|
||||
|
||||
|
||||
masked_area_table_t *mask=NULL; /* masking table used during seed detection */
|
||||
char *mask_file=NULL;
|
||||
|
||||
char opt_shape = 'l'; /* chromosome has to be set at 'l'inear or 'c'ircular */
|
||||
|
||||
|
||||
|
||||
/*
|
||||
Output/Input File(s)
|
||||
*/
|
||||
char *output_file = NULL; /* the output file name base */
|
||||
char *seed_file = NULL; /* the seed file that can be used instead of the seed detection */
|
||||
|
||||
FILE *fout=stdout; /* the output flux */
|
||||
|
||||
|
||||
/*
|
||||
Repseek Options
|
||||
*/
|
||||
|
||||
int optc; /* what U need for options parsing */
|
||||
extern int optind;
|
||||
extern char *optarg;
|
||||
|
||||
int8_t opt_dir = 1; /* if 0 user do not want direct repeats */
|
||||
int8_t opt_inv = 1; /* if 0 user do not want inverted repeats */
|
||||
|
||||
int8_t opt_MergeSeeds=0; /* if 1, thus user ask for seed merging - remove overlapping seeds */
|
||||
int8_t opt_filter=0; /* if 1, thus user ask for filtering */
|
||||
float min=0.0, /* if they are non-0, they are used to filter the seeds */
|
||||
Max=0.0; /* min is the minimum of n-plication, Max is the maximum */
|
||||
|
||||
float merge_repeats=0.90; /* merge repeats when x% of both copies is located at the same spot */
|
||||
|
||||
int8_t opt_TableR=0; /* do we output TableR - 0 = no, default */
|
||||
int8_t opt_PrintSeeds=0; /* do we output Seeds - 0 = no, default */
|
||||
|
||||
int8_t opt_overlap=1; /* if 0, repeats cannot have their two copies overlapping */
|
||||
|
||||
|
||||
|
||||
/*
|
||||
repseek storage
|
||||
*/
|
||||
AllSeeds_type* AllSeeds; /* A structure that contains all seeds. Defined in KMRK_Seeds.h */
|
||||
|
||||
Repeats AllRepeats; /* A structure that contains all repeats. */
|
||||
|
||||
int32_t *TableR=NULL; /* A pointer to the tableR */
|
||||
int32_t *TableR2=NULL; /* A pointer to a second tableR -- If there are 2 sequences */
|
||||
|
||||
|
||||
/*
|
||||
Repseek alignment variables
|
||||
*/
|
||||
float Xg=30.0; /* The exploratory value. for BLAST2. same notation than in BLAST2 paper */
|
||||
float gap_open=4.0, /* gap open and gap_ext penality - expressed as a coefficient of the mean score. */
|
||||
gap_ext=1.0;
|
||||
char matrix_type='l'; /* matrix is by default 'l'og, can also be set to 'i'dentity */
|
||||
|
||||
|
||||
|
||||
/*
|
||||
Parse options
|
||||
*/
|
||||
|
||||
while( (optc=getopt(argc, argv, "Hhvl:p:L:P:TSdis:m:M:co:e:X:O:ID:r:BR:")) != -1)
|
||||
|
||||
switch(optc){
|
||||
|
||||
/*
|
||||
Information
|
||||
*/
|
||||
case 'h':
|
||||
printhelp_repseek(argv);
|
||||
exit(1);
|
||||
|
||||
case 'H':
|
||||
printmorehelp_repseek(argv);
|
||||
exit(1);
|
||||
|
||||
case 'v':
|
||||
printversion_repseek();
|
||||
exit(1);
|
||||
|
||||
/*
|
||||
Main Inputs
|
||||
*/
|
||||
case 'l':
|
||||
selected_l = (int16_t)atoi(optarg);
|
||||
if(selected_l < 1)
|
||||
fprintf(stderr,"selected lmin has to be >= 1, bye\n"),exit(1);
|
||||
break;
|
||||
|
||||
case 'p':
|
||||
pval_lmin = atof(optarg);
|
||||
if(pval_lmin > 1.0 || pval_lmin < 0.0)fprintf(stderr,"set 0.0>pval_lmin>=1.0, bye\n"),exit(1);
|
||||
break;
|
||||
|
||||
case 'L':
|
||||
selected_s = atof(optarg);
|
||||
if(selected_s < 0.0 )fprintf(stderr,"selected smin has to be >=0.0, bye\n"),exit(1);
|
||||
break;
|
||||
|
||||
case 'P':
|
||||
pval_smin = atof(optarg);
|
||||
if(pval_smin > 1.0 || pval_smin < 0.0)fprintf(stderr,"set 0.0>pval_smin>=1.0, bye\n"),exit(1);
|
||||
break;
|
||||
|
||||
/*
|
||||
Output/Input
|
||||
*/
|
||||
case 'r':
|
||||
output_file = optarg;
|
||||
break;
|
||||
|
||||
case 'T':
|
||||
opt_TableR = 1;
|
||||
break;
|
||||
|
||||
case 'S':
|
||||
opt_PrintSeeds = 1;
|
||||
break;
|
||||
|
||||
case 's':
|
||||
seed_file = optarg;
|
||||
break;
|
||||
|
||||
/*
|
||||
Options for detection
|
||||
*/
|
||||
case 'd':
|
||||
if(!opt_dir)
|
||||
fprintf(stderr,"options '-d' and '-i' are mutually exclusive, bye\n"),exit(1);
|
||||
opt_inv=0;
|
||||
break;
|
||||
|
||||
case 'i':
|
||||
if(!opt_inv)
|
||||
fprintf(stderr,"options '-d' and '-i' are mutually exclusive, bye\n"),exit(1);
|
||||
opt_dir=0;
|
||||
break;
|
||||
|
||||
/*
|
||||
Filtering
|
||||
*/
|
||||
case 'm':
|
||||
min = atof(optarg);
|
||||
opt_filter = 1;
|
||||
break;
|
||||
|
||||
case 'M':
|
||||
Max = atof(optarg);
|
||||
opt_filter = 1;
|
||||
break;
|
||||
|
||||
case 'B':
|
||||
opt_MergeSeeds=1;
|
||||
break;
|
||||
|
||||
case 'R':
|
||||
merge_repeats= atof(optarg);
|
||||
if(merge_repeats<0.0 || merge_repeats>1.0)
|
||||
fprintf(stderr,"for merging repeats: 0.0 <= R <= 1.0 \n"),exit(1);
|
||||
break;
|
||||
|
||||
|
||||
|
||||
/*
|
||||
Alignment options
|
||||
*/
|
||||
case 'o':
|
||||
gap_open = atof(optarg);
|
||||
break;
|
||||
|
||||
case 'e':
|
||||
gap_ext = atof(optarg);
|
||||
break;
|
||||
|
||||
case 'X':
|
||||
Xg = ABS(atof(optarg));
|
||||
break;
|
||||
|
||||
case 'O':
|
||||
opt_overlap = (int8_t )atoi(optarg);
|
||||
if(opt_overlap != 0 && opt_overlap != 1)
|
||||
fprintf(stderr,"opt_overlap can only be 1-can overlapp or 0- no overlap\n, bye"),exit(1);
|
||||
break;
|
||||
|
||||
/*
|
||||
Misc options
|
||||
*/
|
||||
case 'c':
|
||||
opt_shape = 'c';
|
||||
break;
|
||||
|
||||
case 'I':
|
||||
matrix_type='i';
|
||||
break;
|
||||
|
||||
case 'D':
|
||||
mask_file = optarg;
|
||||
break;
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
Do we have the minimum to run ? (mode minimum input and an output)
|
||||
*/
|
||||
repseek_nseq = argc-optind;
|
||||
|
||||
if((repseek_nseq < 1) || (repseek_nseq > 2) )
|
||||
printusage_repseek(argv),exit(1);
|
||||
|
||||
if( !selected_l && !pval_lmin && !pval_smin && !selected_s)
|
||||
fprintf(stderr,"repseek minium core value is either -l/-p (i.e. set seed size lmin) or -L/-P (i.e. set repeat score smin)\n"),exit(1);
|
||||
|
||||
if( selected_l && pval_lmin )
|
||||
fprintf(stderr,"Only one between -l, -p should be used (i.e. set seed lmin)\n"),exit(1);
|
||||
|
||||
if( selected_s && pval_smin )
|
||||
fprintf(stderr,"Only one between -L, -P should be used (i.e. set repeat smin)\n"),exit(1);
|
||||
|
||||
|
||||
if( pval_smin && (gap_open != 4.0 || gap_ext != 1.0 ) )
|
||||
fprintf(stderr,"Using the Pval on repeats, gaps penalties should be default ones\n"),exit(1);
|
||||
|
||||
|
||||
if(output_file){
|
||||
fout = fopen(output_file,"w");
|
||||
if(!fout) fprintf(stderr, "Cannot write into %s file, bye\n", output_file),exit(1);
|
||||
}
|
||||
|
||||
fprintf(stderr,"Repseek mode... '%s' + '%s'\n",
|
||||
pval_lmin?"seed stat":"seed no stat",
|
||||
pval_smin?"repeat stat":"repeat no stat"
|
||||
);
|
||||
|
||||
|
||||
/*
|
||||
Get sequence(s) in memory
|
||||
*/
|
||||
|
||||
fprintf(stderr,"get sequence(s)... ");
|
||||
|
||||
sequence = readFastaSeq(argv[optind], &size, &nX);
|
||||
totalSize = size;
|
||||
|
||||
if(repseek_nseq==2){
|
||||
sequence2 = readFastaSeq(argv[optind+1], &size2, &nX2);
|
||||
totalSize+=size2;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Output Sequence Length and Xs
|
||||
*/
|
||||
if(totalSize < 1e3)
|
||||
fprintf(stderr,"total size %d b ", totalSize );
|
||||
else if(totalSize < 1e6)
|
||||
fprintf(stderr,"total size %.2f kb ", ((float)totalSize)/1e3 );
|
||||
else if(totalSize < 1e9)
|
||||
fprintf(stderr,"total size %.2f Mb ", ((float)totalSize)/1e6 );
|
||||
else
|
||||
fprintf(stderr,"total size %.2f Gb ", ((float)totalSize)/1e9 );
|
||||
|
||||
switch( (char)floor( log(nX+nX2)/log(10) ) ){
|
||||
case 0: case 1: case 2:
|
||||
fprintf(stderr,"(%d Xs)\n", (nX+nX2) );
|
||||
break;
|
||||
case 3: case 4: case 5:
|
||||
fprintf(stderr,"(%.2f K Xs)\n", (float)(nX+nX2)/1e3 );
|
||||
break;
|
||||
case 6: case 7: case 8:
|
||||
fprintf(stderr,"(%.2f M Xs)\n", (float)(nX+nX2)/1e6 );
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr,"(%.2f G Xs)\n", (float)(nX+nX2)/1e9 );
|
||||
break;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
handle the seed minimum size
|
||||
*/
|
||||
fprintf(stderr,"set lmin... ");
|
||||
|
||||
if(selected_l)
|
||||
lmin=selected_l;
|
||||
else{
|
||||
|
||||
if(pval_lmin){
|
||||
if (repseek_nseq==2)
|
||||
lmin=set_lmin_2seqs(pval_lmin, sequence,sequence2,size,size2); /* Lmin is computed ignoring X and N */
|
||||
else
|
||||
lmin=set_lmin(pval_lmin, sequence);
|
||||
}
|
||||
else{
|
||||
if (repseek_nseq==2)
|
||||
lmin=(int32_t)set_lmin_2seqs(0.001, sequence,sequence2,size,size2)*2.0/3.0;
|
||||
else
|
||||
lmin=(int32_t)set_lmin(0.001, sequence)*2.0/3.0;
|
||||
}
|
||||
|
||||
}
|
||||
fprintf(stderr, "seed >= %d bp ( L5%%: %d ; L1%%: %d ; L0.1%%: %d )\n",
|
||||
lmin,
|
||||
(repseek_nseq==2)?set_lmin_2seqs(0.05, sequence,sequence2,size,size2):set_lmin(0.05, sequence),
|
||||
(repseek_nseq==2)?set_lmin_2seqs(0.01, sequence,sequence2,size,size2):set_lmin(0.01, sequence),
|
||||
(repseek_nseq==2)?set_lmin_2seqs(0.001, sequence,sequence2,size,size2):set_lmin(0.001, sequence)
|
||||
);
|
||||
|
||||
|
||||
/*
|
||||
Select the repeat minimum score
|
||||
*/
|
||||
fprintf(stderr,"set smin... ");
|
||||
if(selected_s)
|
||||
smin=selected_s; /* set the repeat minimum score as given in the input */
|
||||
else{
|
||||
|
||||
if(pval_smin){
|
||||
if (repseek_nseq==2)
|
||||
smin = compute_smin_2seq(sequence, sequence2, size, size2, pval_smin, nX, nX2);
|
||||
else
|
||||
smin = compute_smin(sequence, pval_smin, size, nX);
|
||||
}
|
||||
else
|
||||
smin = 0;
|
||||
|
||||
}
|
||||
|
||||
|
||||
fprintf(stderr, "repeat >= %.2f score ( S5%%: %.2f ; S1%%: %.2f ; S0.1%%: %.2f )\n",
|
||||
smin,
|
||||
(repseek_nseq==2)?compute_smin_2seq(sequence, sequence2, size, size2, 0.05, nX, nX2):compute_smin(sequence, 0.05, size, nX),
|
||||
(repseek_nseq==2)?compute_smin_2seq(sequence, sequence2, size, size2, 0.01, nX, nX2):compute_smin(sequence, 0.01, size, nX),
|
||||
(repseek_nseq==2)?compute_smin_2seq(sequence, sequence2, size, size2, 0.001, nX, nX2):compute_smin(sequence, 0.001, size, nX)
|
||||
);
|
||||
|
||||
|
||||
|
||||
/*
|
||||
Begin the seed detection by KMR-K or read file
|
||||
*/
|
||||
|
||||
if(seed_file) fprintf(stderr,"read seeds... ");
|
||||
else fprintf(stderr,"detect seeds... ");
|
||||
|
||||
if (repseek_nseq==1)
|
||||
{
|
||||
if (mask_file)
|
||||
{
|
||||
if (opt_inv)
|
||||
mask = KMRK_ReadMaskArea(mask_file,1,size); /* 2nd arg: #seq and the 3rd arg: complement */
|
||||
else
|
||||
mask = KMRK_ReadMaskArea(mask_file,1,0);
|
||||
}
|
||||
|
||||
if(seed_file)
|
||||
AllSeeds = read_seeds(seed_file, lmin, opt_dir, opt_inv, size, 0);
|
||||
else
|
||||
AllSeeds = KMRK_get_seeds(&sequence, size, lmin, opt_dir, opt_inv, 0, mask);
|
||||
|
||||
}else{
|
||||
|
||||
if (mask_file)
|
||||
{
|
||||
if (opt_inv)
|
||||
mask = KMRK_ReadMaskArea(mask_file,2,size);
|
||||
else
|
||||
mask = KMRK_ReadMaskArea(mask_file,2,0);
|
||||
}
|
||||
|
||||
if(seed_file)
|
||||
AllSeeds = read_seeds(seed_file, lmin, opt_dir, opt_inv, size, size2);
|
||||
else
|
||||
AllSeeds = KMRK_get_seeds_2seqs(&sequence, &sequence2, size, size2, lmin, opt_dir, opt_inv, 0, mask);
|
||||
|
||||
}
|
||||
|
||||
|
||||
fprintf(stderr, "%d direct & %d inverted seeds\n", AllSeeds->nDirSeeds, AllSeeds->nInvSeeds);
|
||||
|
||||
|
||||
/*
|
||||
Optionnal filters
|
||||
*/
|
||||
if (opt_filter)
|
||||
{
|
||||
fprintf(stderr,"filter, keep... ");
|
||||
if (repseek_nseq==1)
|
||||
KMRK_FamilySeeds(AllSeeds, min, Max, opt_dir, opt_inv, size);
|
||||
else
|
||||
KMRK_FamilySeeds2seq(AllSeeds, min, Max, opt_dir, opt_inv, size, size2);
|
||||
|
||||
fprintf(stderr, "%d direct and %d inverted seeds\n", AllSeeds->nDirSeeds, AllSeeds->nInvSeeds);
|
||||
}
|
||||
|
||||
if(opt_MergeSeeds){
|
||||
|
||||
fprintf(stderr,"merge, keep... ");
|
||||
KMRK_MergeSeeds(AllSeeds, opt_dir, opt_inv);
|
||||
|
||||
fprintf(stderr,"%d direct and %d inverted seeds\n", AllSeeds->nDirSeeds, AllSeeds->nInvSeeds);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Final table(s) R of seeds
|
||||
*/
|
||||
/* filter seeds if they does not fit between min and Max */
|
||||
|
||||
if (repseek_nseq==1){
|
||||
TableR = KMRK_SeedTableR(AllSeeds, opt_dir, opt_inv, size);
|
||||
BuiltFilterFamily_seeds(AllSeeds, TableR, 0, 0, opt_dir, opt_inv);
|
||||
}
|
||||
else{
|
||||
KMRK_SeedTableR2seq(AllSeeds, opt_dir, opt_inv, size,size2, &TableR,&TableR2);
|
||||
BuiltFilterFamily_seeds2seq(AllSeeds, TableR, TableR2, 0, 0, opt_dir, opt_inv);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
reorder AllSeeds by position
|
||||
*/
|
||||
if (opt_dir) KMRK_sortSeedsByPos(AllSeeds->dirSeeds,AllSeeds->nDirSeeds);
|
||||
if (opt_inv) KMRK_sortSeedsByPos(AllSeeds->invSeeds,AllSeeds->nInvSeeds);
|
||||
|
||||
|
||||
|
||||
|
||||
if( opt_PrintSeeds ){
|
||||
|
||||
fprintf(stderr,"ouput seeds... ");
|
||||
|
||||
if(repseek_nseq == 1)
|
||||
WriteSeeds(fout, AllSeeds, opt_dir, opt_inv);
|
||||
else
|
||||
WriteSeeds(fout, AllSeeds, opt_dir, opt_inv);
|
||||
|
||||
if(opt_TableR){
|
||||
|
||||
fprintf(fout,"# Seed TableR-1\n");
|
||||
WriteTableR(fout, TableR, size);
|
||||
if (repseek_nseq == 2){
|
||||
fprintf(fout,"# Seed TableR-2\n");
|
||||
WriteTableR(fout, TableR2, size2);
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr," done\n");
|
||||
|
||||
/*
|
||||
Free stuff
|
||||
*/
|
||||
|
||||
/* seeds */
|
||||
KMRK_freeSeeds(AllSeeds);
|
||||
|
||||
/* sequence */
|
||||
MyFree(sequence, (size+1)*sizeof(char));
|
||||
if(repseek_nseq == 2)
|
||||
MyFree(sequence2, (size2+1)*sizeof(char));
|
||||
|
||||
/* TableR */
|
||||
MyFree(TableR, size*sizeof(int32_t));
|
||||
if(repseek_nseq==2)MyFree(TableR2, size2*sizeof(int32_t));
|
||||
|
||||
|
||||
PrintMaxMem();
|
||||
exit(0);
|
||||
}
|
||||
|
||||
/*
|
||||
Free TableR of seeds
|
||||
*/
|
||||
MyFree(TableR, size*sizeof(int32_t));
|
||||
if(repseek_nseq==2)MyFree(TableR2, size2*sizeof(int32_t));
|
||||
|
||||
|
||||
/*
|
||||
Extension : BLAST2-like Alignments
|
||||
*/
|
||||
if(repseek_nseq == 1)
|
||||
AllRepeats = align_seeds(AllSeeds, sequence, Xg, gap_open, gap_ext, matrix_type,
|
||||
lmin, opt_dir, opt_inv, opt_overlap, merge_repeats);
|
||||
else
|
||||
AllRepeats = align_seeds_2seq(AllSeeds, sequence, sequence2, Xg, gap_open, gap_ext, matrix_type,
|
||||
lmin, opt_dir, opt_inv, merge_repeats);
|
||||
|
||||
fprintf(stderr,"align and merge... %d direct & %d inverted repeats (merge with R=%.2f)\n",
|
||||
AllRepeats.nDirRep-AllRepeats.nDirBadRep, AllRepeats.nInvRep-AllRepeats.nInvBadRep, merge_repeats);
|
||||
|
||||
|
||||
/*
|
||||
If mode is stat on repeats, remove repeats with a score smaller than smin
|
||||
*/
|
||||
if( smin ){
|
||||
UpdateRepeats(&AllRepeats, smin);
|
||||
fprintf(stderr,"After score_min... %d direct & %d inverted repeats\n",
|
||||
AllRepeats.nDirRep-AllRepeats.nDirBadRep, AllRepeats.nInvRep-AllRepeats.nInvBadRep);
|
||||
}
|
||||
|
||||
/*
|
||||
Built the repeats table(s) R.
|
||||
*/
|
||||
if (repseek_nseq==1)
|
||||
TableR = SetRepFamily(AllRepeats, opt_dir, opt_inv, size);
|
||||
else
|
||||
SetNumberRep_2seqs(AllRepeats, opt_dir, opt_inv, size, size2, &TableR, &TableR2);
|
||||
|
||||
|
||||
/*
|
||||
Misc Output
|
||||
*/
|
||||
|
||||
fprintf(stderr,"output repeats... ");
|
||||
|
||||
|
||||
if(repseek_nseq == 1)
|
||||
WriteRep(fout, AllRepeats, opt_dir, opt_inv, opt_shape, size);
|
||||
else
|
||||
WriteRep_2seqs(fout, AllRepeats, opt_dir, opt_inv, size, size2);
|
||||
|
||||
|
||||
if(opt_TableR){
|
||||
fprintf(fout,"# TableR-1\n");
|
||||
WriteTableR(fout, TableR, size);
|
||||
if (repseek_nseq == 2){
|
||||
fprintf(fout,"# TableR-2\n");
|
||||
WriteTableR(fout, TableR2, size2);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fprintf(stderr,"done\n");
|
||||
|
||||
fclose(fout);
|
||||
|
||||
|
||||
/*
|
||||
All free functions
|
||||
*/
|
||||
|
||||
/* table(s) R */
|
||||
MyFree(TableR, size*sizeof(int32_t));
|
||||
if(repseek_nseq == 2)
|
||||
MyFree(TableR2, size2*sizeof(int32_t));
|
||||
|
||||
/* seeds and repeats */
|
||||
KMRK_freeSeeds(AllSeeds);
|
||||
free_Repeats(AllRepeats);
|
||||
|
||||
/* sequence */
|
||||
MyFree(sequence, (size+1)*sizeof(char));
|
||||
if(repseek_nseq == 2)
|
||||
MyFree(sequence2, (size2+1)*sizeof(char));
|
||||
|
||||
/* PrintMem("At the end"); */
|
||||
|
||||
PrintMaxMem();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
258
src/repseek/repseek-2014.09/memory.c
Normal file
258
src/repseek/repseek-2014.09/memory.c
Normal file
@ -0,0 +1,258 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/******
|
||||
file : memory.c
|
||||
function : All about memory of the KMR, Seeds and Repeats
|
||||
All MyMalloc() series is about follwoing how mauch memory has been used
|
||||
|
||||
created : 19 Sep 2003
|
||||
modif : Oct 2003, Feb 2004
|
||||
modif : Dec 2004 <EC> ; Corrected Memory declaration
|
||||
author : amikezor
|
||||
*****/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "repseek_types.h"
|
||||
#include "memory.h"
|
||||
|
||||
|
||||
MemUsage Memory;
|
||||
|
||||
|
||||
/*
|
||||
Functions to count the memory usage all along
|
||||
dybamic allocation and free
|
||||
*/
|
||||
void PrintMem(char *Comment){
|
||||
|
||||
extern MemUsage Memory;
|
||||
|
||||
fprintf(stderr,"\n%s\nMemory Usage\n\t* Max is: %d bytes, %.2f Kb, %.2f Mb\n\t* Cur is: %d bytes, %.2f Kb, %.2f Mb\n",
|
||||
Comment,
|
||||
Memory.Max, (float)Memory.Max/1024, (float)Memory.Max/(1024*1024),
|
||||
Memory.Current, (float)Memory.Current/1024, (float)Memory.Current/(1024*1024));
|
||||
}
|
||||
void PrintMaxMem( void ){
|
||||
|
||||
extern MemUsage Memory;
|
||||
|
||||
if(Memory.Max < 1024)
|
||||
fprintf(stderr,"Max Memory Usage.. %d bytes\n", Memory.Max);
|
||||
else if(Memory.Max < 1024*1024)
|
||||
fprintf(stderr,"Max Memory Usage.. %.2f Kilobytes\n", (float)Memory.Max/1024);
|
||||
else if(Memory.Max < 1024*1024*1024)
|
||||
fprintf(stderr,"Max Memory Usage.. %.2f Megabytes\n", (float)Memory.Max/(1024*1024));
|
||||
else
|
||||
fprintf(stderr,"Max Memory Usage.. %.2f Gigabytes\n", (float)Memory.Max/(1024*1024*1024));
|
||||
}
|
||||
void Update_Mem(int32_t Value){
|
||||
|
||||
extern MemUsage Memory;
|
||||
|
||||
Memory.Current += Value;
|
||||
Memory.Max = (Memory.Current>Memory.Max)?Memory.Current:Memory.Max;
|
||||
|
||||
}
|
||||
void Init_Mem(int32_t Value){
|
||||
|
||||
extern MemUsage Memory;
|
||||
|
||||
Memory.Current = Value;
|
||||
Memory.Max = Value;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Replace functions of dynamic allocation
|
||||
to allow the tracking of memory usage
|
||||
*/
|
||||
void *MyMalloc( int32_t size , char *Error ){
|
||||
|
||||
void *pointer;
|
||||
|
||||
/* fprintf(stderr,"malloc: %s %d\n",Error, size); */
|
||||
|
||||
pointer = malloc(size);
|
||||
if(!pointer)fprintf(stderr,"%s\n",Error),exit(3);
|
||||
|
||||
Update_Mem(size); /* fprintf(stderr,"Mem += %ld - malloc %s\n", size, Error); */
|
||||
|
||||
return pointer;
|
||||
}
|
||||
void *MyCalloc( int32_t number, int32_t TypeSize , char *Error ){
|
||||
|
||||
void *pointer;
|
||||
|
||||
/* fprintf(stderr,"calloc: %s %d of size %d\n", Error, number, TypeSize); */
|
||||
|
||||
pointer = calloc(number, TypeSize);
|
||||
if(!pointer)fprintf(stderr,"%s\n",Error),exit(3);
|
||||
|
||||
Update_Mem(number*TypeSize ); /* fprintf(stderr,"Mem += %ld - calloc %s\n", number*TypeSize, Error); */
|
||||
|
||||
return pointer;
|
||||
}
|
||||
void MyFree( void *Pointer, int32_t size){
|
||||
|
||||
if(!Pointer)return;
|
||||
|
||||
/* fprintf(stderr,"free: %d\n", size); */
|
||||
|
||||
free(Pointer);
|
||||
Pointer=NULL;
|
||||
|
||||
Update_Mem(-size); /* fprintf(stderr,"Mem -= %ld - free\n", size); */
|
||||
}
|
||||
void *MyRealloc( void *Pointer, int32_t newsize, int32_t oldsize, char *Error){
|
||||
|
||||
|
||||
/* fprintf(stderr,"realloc: %s %d (from %d)\n", Error, newsize, oldsize); */
|
||||
|
||||
Pointer = realloc(Pointer,newsize);
|
||||
if(!Pointer)fprintf(stderr,"%s\n",Error),exit(3);
|
||||
|
||||
Update_Mem( newsize-oldsize ); /* fprintf(stderr,"Mem += %ld - realloc %s\n", (newsize-oldsize), Error); */
|
||||
return Pointer;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Deal with Stacks structure for KMR
|
||||
|
||||
void MallocStack(Stacks *s, int32_t number, int32_t *histo, int32_t AllValues){
|
||||
|
||||
int32_t i;
|
||||
|
||||
s->nStacks = number;
|
||||
s->nValues = AllValues;
|
||||
|
||||
s->ppStacks = (int32_t **)MyMalloc( number * sizeof(int32_t *),
|
||||
"MallocStack: ppStacks malloc error, bye\n");
|
||||
s->lenStacks = (int32_t *)MyMalloc( number * sizeof(int32_t),
|
||||
"MallocStack: lenStacks malloc error, bye\n");
|
||||
s->ppStacks[0] = (int32_t *)MyMalloc( AllValues * sizeof(int32_t),
|
||||
"MallocStack: ppStacks[0] malloc error, bye\n");
|
||||
s->lenStacks[0]=0;
|
||||
|
||||
for(i=1;i < number; i++){
|
||||
s->lenStacks[i]=0;
|
||||
s->ppStacks[i] = s->ppStacks[i-1] + histo[i] ;
|
||||
}
|
||||
}
|
||||
|
||||
void FreeStack( Stacks *p){
|
||||
MyFree(p->ppStacks[0] , p->nValues*sizeof(int32_t) );
|
||||
MyFree(p->ppStacks , p->nStacks*sizeof(int32_t *) );
|
||||
MyFree(p->lenStacks , p->nStacks*sizeof(int32_t));
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
Deal with the Seeds part
|
||||
|
||||
|
||||
void free_Seeds(Seeds AllSeeds)
|
||||
{
|
||||
|
||||
if( AllSeeds.nDirSeeds ){
|
||||
MyFree(AllSeeds.DirPos1, AllSeeds.nDirSeeds*sizeof(int32_t));
|
||||
MyFree(AllSeeds.DirPos2, AllSeeds.nDirSeeds*sizeof(int32_t));
|
||||
MyFree(AllSeeds.DirLen, AllSeeds.nDirSeeds*sizeof(int32_t));
|
||||
MyFree(AllSeeds.DirMeanR, AllSeeds.nDirSeeds*sizeof(float));
|
||||
}
|
||||
|
||||
if(AllSeeds.nInvSeeds ){
|
||||
MyFree(AllSeeds.InvPos1, AllSeeds.nInvSeeds*sizeof(int32_t));
|
||||
MyFree(AllSeeds.InvPos2, AllSeeds.nInvSeeds*sizeof(int32_t));
|
||||
MyFree(AllSeeds.InvLen, AllSeeds.nInvSeeds*sizeof(int32_t));
|
||||
MyFree(AllSeeds.InvMeanR, AllSeeds.nInvSeeds*sizeof(float));
|
||||
}
|
||||
|
||||
}
|
||||
*/
|
||||
/*
|
||||
Malloc if it is the first time otherwise readjust
|
||||
|
||||
void AllocSeeds(Seeds *AllSeeds, int32_t size, int32_t old_size, int8_t opt_dir, int8_t opt_inv){
|
||||
|
||||
if(opt_inv != 1 && opt_dir != 1)
|
||||
fprintf(stderr,"AllocSeeds: requiere at least one of opt_dir and opt_inv to be 1\n"),exit(4);
|
||||
|
||||
if(opt_dir == 1){
|
||||
if( AllSeeds->DirPos1 == NULL){
|
||||
AllSeeds->DirPos1 = (int32_t *)MyCalloc(size , sizeof(int32_t),"AllocSeeds: Alloc for DirPos1 failed, bye");
|
||||
AllSeeds->DirPos2 = (int32_t *)MyCalloc(size , sizeof(int32_t),"AllocSeeds: Alloc for DirPos2 failed, bye");
|
||||
}
|
||||
else{
|
||||
AllSeeds->DirPos1 = (int32_t *)MyRealloc(AllSeeds->DirPos1, size * sizeof(int32_t), old_size* sizeof(int32_t),
|
||||
"AllocSeeds: realloc for DirPos1 failed, bye");
|
||||
AllSeeds->DirPos2 = (int32_t *)MyRealloc(AllSeeds->DirPos2, size * sizeof(int32_t), old_size* sizeof(int32_t),
|
||||
"AllocSeeds: realloc for DirPos2 failed, bye");
|
||||
}
|
||||
}
|
||||
|
||||
if(opt_inv == 1){
|
||||
if( AllSeeds->InvPos1 == NULL){
|
||||
AllSeeds->InvPos1 = (int32_t *)MyCalloc(size , sizeof(int32_t), "AllocSeeds: Alloc for InvPos1 failed, bye");
|
||||
AllSeeds->InvPos2 = (int32_t *)MyCalloc(size , sizeof(int32_t), "AllocSeeds: Alloc for InvPos2 failed, bye");
|
||||
}
|
||||
else{
|
||||
AllSeeds->InvPos1 = (int32_t *)MyRealloc(AllSeeds->InvPos1, size * sizeof(int32_t), old_size* sizeof(int32_t),
|
||||
"AllocSeeds: realloc for InvPos1 failed, bye");
|
||||
AllSeeds->InvPos2 = (int32_t *)MyRealloc(AllSeeds->InvPos2, size * sizeof(int32_t), old_size* sizeof(int32_t),
|
||||
"AllocSeeds: realloc for InvPos2 failed, bye");
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
Deal with the Repeats structure
|
||||
*/
|
||||
Repeats mem_Repeats(int32_t Ndir, int32_t Ninv){
|
||||
|
||||
Repeats AllRepeats; /* All Repeats structure */
|
||||
|
||||
AllRepeats.nDirRep = Ndir; /* set the number of repeats to the number of seeds */
|
||||
AllRepeats.nInvRep = Ninv;
|
||||
AllRepeats.nDirBadRep = 0; /* set the "bad" repet (included into another rep) as 0 */
|
||||
AllRepeats.nInvBadRep = 0;
|
||||
|
||||
if(AllRepeats.nDirRep)
|
||||
AllRepeats.DirRep = (Rep *)MyMalloc( (AllRepeats.nDirRep)*sizeof(Rep), "init_Repeats: repdir malloc error" );
|
||||
else
|
||||
AllRepeats.DirRep = NULL;
|
||||
|
||||
if(AllRepeats.nInvRep)
|
||||
AllRepeats.InvRep = (Rep *)MyMalloc( (AllRepeats.nInvRep)*sizeof(Rep), "init_Repeats: repinv malloc error" );
|
||||
else
|
||||
AllRepeats.InvRep = NULL;
|
||||
|
||||
return AllRepeats;
|
||||
}
|
||||
void free_Repeats(Repeats AllRep)
|
||||
{
|
||||
if(AllRep.nDirRep)
|
||||
MyFree(AllRep.DirRep, AllRep.nDirRep*sizeof(Rep));
|
||||
if(AllRep.nInvRep)
|
||||
MyFree(AllRep.InvRep, AllRep.nInvRep*sizeof(Rep));
|
||||
}
|
130
src/repseek/repseek-2014.09/memory.h
Normal file
130
src/repseek/repseek-2014.09/memory.h
Normal file
@ -0,0 +1,130 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/**
|
||||
* @file memory.h
|
||||
* @author Achaz G
|
||||
* @date April 2004
|
||||
*
|
||||
* @brief header for memory alloc/dealloc
|
||||
* modif : Dec 2004 <EC> ; Corrected Memory declaration
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#ifndef _MEMORY_H_
|
||||
#define _MEMORY_H_
|
||||
|
||||
#include "repseek_types.h"
|
||||
|
||||
typedef struct { /********** Memory Usage structure **************/
|
||||
|
||||
int32_t Max;
|
||||
int32_t Current;
|
||||
|
||||
} MemUsage;
|
||||
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#if defined(DMALLOC)
|
||||
#include <dmalloc.h>
|
||||
#endif
|
||||
|
||||
/********** **********
|
||||
|
||||
Global Variable(s)
|
||||
|
||||
********** **********/
|
||||
|
||||
extern MemUsage Memory; /* Instance of the global variable for memory tracking */
|
||||
|
||||
/*
|
||||
Follow memory usage
|
||||
*/
|
||||
void PrintMem(char *Comment);
|
||||
void PrintMaxMem( void );
|
||||
void Update_Mem(int32_t Value);
|
||||
void Init_Mem(int32_t Value);
|
||||
/*
|
||||
All Alloc/Free to follow of memory usage
|
||||
*/
|
||||
void *MyMalloc( int32_t size , char *Error );
|
||||
void *MyCalloc( int32_t number, int32_t TypeSize , char *Error );
|
||||
void MyFree( void *Pointer, int32_t size);
|
||||
void *MyRealloc( void *Pointer, int32_t newsize, int32_t oldsize, char *Error);
|
||||
/*
|
||||
For Stacks
|
||||
|
||||
void MallocStack(Stacks *s, int32_t number, int32_t *histo, int32_t AllValues);
|
||||
void FreeStack( Stacks *p);
|
||||
|
||||
For Seeds
|
||||
|
||||
void free_Seeds(Seeds AllSeeds);
|
||||
void AllocSeeds(Seeds *AllSeeds, int32_t size, int32_t old_size, int8_t opt_dir, int8_t opt_inv);
|
||||
*/
|
||||
|
||||
/*
|
||||
For Repeats
|
||||
*/
|
||||
Repeats mem_Repeats(int32_t Ndir, int32_t Ninv);
|
||||
void free_Repeats(Repeats AllRep);
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
Not used anymore, but just in case
|
||||
*/
|
||||
|
||||
|
||||
#define KMRK_MALLOC(var,type,size,message) { \
|
||||
var = (type*) malloc(size); \
|
||||
if (!var) \
|
||||
{ \
|
||||
fprintf(stderr,"%s\n",message); \
|
||||
exit(4); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define KMRK_CALLOC(var,type,length,message) { \
|
||||
var = (type*) calloc(length,sizeof(type)); \
|
||||
if (!var) \
|
||||
{ \
|
||||
fprintf(stderr,"%s\n",message); \
|
||||
exit(4); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define KMRK_REALLOC(var,type,size,message) { \
|
||||
var = (type*) realloc(var,size); \
|
||||
if (!var) \
|
||||
{ \
|
||||
fprintf(stderr,"%s\n",message); \
|
||||
exit(4); \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
#endif /* _MEMORY_H_*/
|
186
src/repseek/repseek-2014.09/memory_align.c
Normal file
186
src/repseek/repseek-2014.09/memory_align.c
Normal file
@ -0,0 +1,186 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/**
|
||||
* @file align_memory.c
|
||||
* @author amikezor <gachaz@oeb.harvard.edu>
|
||||
* @date Jul 07 2003
|
||||
* @modif July 2003, Oct 2003, Nov 2003 -small bug in realloc traceback, Sep 05
|
||||
*
|
||||
* @brief all malloc/realloc mem of the results structure
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include "repseek_types.h"
|
||||
#include "memory.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#define MIN_COLS 1000
|
||||
#define MIN_ROWS 1000
|
||||
#define MIN_ALIGN 10000
|
||||
#define MIN_SCORES 10000
|
||||
|
||||
/*
|
||||
All the memory you need for the alignment itself
|
||||
*/
|
||||
void mem_Blast2Align(RESULTS *pResults){
|
||||
|
||||
pResults->scores = (double *) MyCalloc( MIN_SCORES, sizeof(double) , "mem_Blast2Align: scores calloc error");
|
||||
pResults->traces = (int32_t *) MyCalloc( MIN_SCORES, sizeof(int32_t) , "mem_Blast2Align: traces calloc error" );
|
||||
|
||||
pResults->traceback_f = (char *) MyCalloc( MIN_ALIGN, sizeof(char) , "mem_Blast2Align: traceback_f calloc error");
|
||||
pResults->traceback_b = (char *) MyCalloc( MIN_ALIGN, sizeof(char) , "mem_Blast2Align: traceback_b calloc error");
|
||||
|
||||
pResults->Segment_begin = (int32_t *) MyCalloc( MIN_ROWS, sizeof(int32_t) , "mem_Blast2Align: Segment_begin calloc error");
|
||||
pResults->Segment_end = (int32_t *) MyCalloc( MIN_ROWS, sizeof(int32_t) , "mem_Blast2Align: Segment_end calloc error");
|
||||
|
||||
pResults->Top = (int32_t *) MyCalloc( MIN_COLS, sizeof(int32_t) , "mem_Blast2Align: Top calloc error");
|
||||
pResults->F = (double *) MyCalloc( MIN_COLS, sizeof(double) , "mem_Blast2Align: F calloc error");
|
||||
|
||||
|
||||
pResults->BestScore_row=0;
|
||||
pResults->BestScore_col=0;
|
||||
|
||||
pResults->max_scores=MIN_SCORES;
|
||||
pResults->max_col= MIN_COLS;
|
||||
pResults->max_row= MIN_ROWS;
|
||||
pResults->max_alignlen=MIN_ALIGN;
|
||||
|
||||
}
|
||||
/*
|
||||
The alignemnt results structure
|
||||
*/
|
||||
void free_Blast2Align(RESULTS *pResults){
|
||||
|
||||
MyFree(pResults->traceback_f, pResults->max_alignlen*sizeof(char));
|
||||
MyFree(pResults->traceback_b, pResults->max_alignlen*sizeof(char));
|
||||
MyFree(pResults->scores, pResults->max_scores*sizeof(double));
|
||||
MyFree(pResults->traces, pResults->max_scores*sizeof(int32_t));
|
||||
|
||||
MyFree(pResults->Top, pResults->max_col*sizeof(int32_t));
|
||||
MyFree(pResults->F, pResults->max_col*sizeof(double));
|
||||
|
||||
MyFree(pResults->Segment_begin, pResults->max_row*sizeof(int32_t));
|
||||
MyFree(pResults->Segment_end, pResults->max_row*sizeof(int32_t));
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
if sig == 0, then double mem,
|
||||
otherwise readjust to sig
|
||||
*/
|
||||
void ReallocScores(RESULTS *pResults, int32_t sig){
|
||||
|
||||
int32_t old=pResults->max_scores;
|
||||
|
||||
int32_t nscores = pResults->pscore - pResults->scores;
|
||||
int32_t Bscores = pResults->pBestScore - pResults->scores;
|
||||
|
||||
|
||||
if( ! sig )
|
||||
(pResults->max_scores)*=2;
|
||||
else
|
||||
pResults->max_scores=sig;
|
||||
|
||||
|
||||
pResults->traces = (int32_t *)MyRealloc( pResults->traces , sizeof(int32_t) * pResults->max_scores, old*sizeof(int32_t),
|
||||
"ReallocScores: realloc traces error, bye" );
|
||||
|
||||
pResults->scores = (double *)MyRealloc( pResults->scores , sizeof(double) * pResults->max_scores , old*sizeof(double),
|
||||
"ReallocScores: realloc scores error, bye" );
|
||||
|
||||
pResults->pscore=(pResults->scores + nscores); /* set correctly the current score */
|
||||
pResults->pBestScore=(pResults->scores + Bscores); /* set correctly the current best score */
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
if sig == 0, then double mem,
|
||||
otherwise readjust to sig
|
||||
*/
|
||||
void ReallocTraceback(RESULTS *pResults, int32_t sig, char direction){
|
||||
|
||||
int32_t old=pResults->max_alignlen;
|
||||
|
||||
if( ! sig )
|
||||
(pResults->max_alignlen)*=2;
|
||||
else
|
||||
pResults->max_alignlen=sig;
|
||||
|
||||
pResults->traceback_f = (char *)MyRealloc( pResults->traceback_f ,sizeof(char)*pResults->max_alignlen, old*sizeof(char),
|
||||
"ReallocTraceback: traceback_f realloc error, bye" );
|
||||
|
||||
pResults->traceback_b = (char *)MyRealloc( pResults->traceback_b ,sizeof(char)*pResults->max_alignlen, old*sizeof(char),
|
||||
"ReallocTraceback: traceback_b realloc error, bye" );
|
||||
|
||||
if(direction == 'f')
|
||||
pResults->traceback=pResults->traceback_f;
|
||||
else if(direction == 'b')
|
||||
pResults->traceback=pResults->traceback_b;
|
||||
else
|
||||
fprintf(stderr,"traceback: direction is either 'f' or 'b', bye\n"),exit(4);
|
||||
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
if sig == 0, then double mem,
|
||||
otherwise readjust to sig
|
||||
*/
|
||||
void ReallocAbove(RESULTS *pResults, int32_t sig){
|
||||
|
||||
int32_t old=pResults->max_col;
|
||||
|
||||
if( ! sig )
|
||||
(pResults->max_col)*=2;
|
||||
else
|
||||
pResults->max_col=sig;
|
||||
|
||||
pResults->Top = (int32_t *)MyRealloc( (void *)pResults->Top ,sizeof(int32_t)*pResults->max_col, sizeof(int32_t)*old, "ReallocAbove: realloc error, bye" );
|
||||
pResults->F = (double *)MyRealloc( (void *)pResults->F ,sizeof(double)*pResults->max_col, sizeof(double)*old, "ReallocAbove: realloc error, bye" );
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
if sig == 0, then double mem,
|
||||
otherwise readjust to sig
|
||||
*/
|
||||
void ReallocBegEnd(RESULTS *pResults, int32_t sig){
|
||||
|
||||
int32_t old=pResults->max_row;
|
||||
|
||||
if( ! sig )
|
||||
(pResults->max_row)*=2;
|
||||
else
|
||||
pResults->max_row=sig;
|
||||
|
||||
pResults->Segment_begin = (int32_t *)MyRealloc( (void *)pResults->Segment_begin , sizeof(int32_t)*pResults->max_row ,
|
||||
sizeof(int32_t)*old, "ReallocBegEnd: Segment_begin realloc error, bye");
|
||||
|
||||
pResults->Segment_end = (int32_t *)MyRealloc( (void *)pResults->Segment_end ,sizeof(int32_t)*pResults->max_row ,
|
||||
sizeof(int32_t)*old, "ReallocBegEnd: Segment_end realloc error, bye");
|
||||
}
|
||||
|
||||
|
||||
#undef MIN_SCORES
|
||||
#undef MIN_COLS
|
||||
#undef MIN_ALIGN
|
47
src/repseek/repseek-2014.09/memory_align.h
Normal file
47
src/repseek/repseek-2014.09/memory_align.h
Normal file
@ -0,0 +1,47 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/**
|
||||
* @file align_memory.h
|
||||
* @author amikezor <gachaz@oeb.harvard.edu>
|
||||
* @date April 2004
|
||||
*
|
||||
* @brief header for all memory alloc/dealloc for the alignemnt part.
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef _align_memory_h_
|
||||
#define _align_memory_h_
|
||||
|
||||
|
||||
void mem_Blast2Align(RESULTS *pResults);
|
||||
|
||||
void free_Blast2Align(RESULTS *pResults);
|
||||
|
||||
void ReallocScores(RESULTS *pResults, int32_t sig);
|
||||
|
||||
void ReallocTraceback(RESULTS *pResults, int32_t sig, char direction);
|
||||
|
||||
void ReallocAbove(RESULTS *pResults, int32_t sig);
|
||||
|
||||
void ReallocBegEnd(RESULTS *pResults, int32_t sig);
|
||||
|
||||
|
||||
#endif /* _align_memory_h_ */
|
215
src/repseek/repseek-2014.09/output.c
Normal file
215
src/repseek/repseek-2014.09/output.c
Normal file
@ -0,0 +1,215 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/***
|
||||
file : output.c
|
||||
function : write out all information
|
||||
|
||||
created : 02 Oct 2002
|
||||
modif : July 2003 , Oct 2003, July04 (score is now a double), dec 2005
|
||||
|
||||
author : amikezor
|
||||
***/
|
||||
|
||||
#include "repseek_types.h"
|
||||
#include "KMRK_Seeds.h"
|
||||
#include "align.h"
|
||||
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
|
||||
|
||||
|
||||
/***
|
||||
Write out all repeats
|
||||
***/
|
||||
|
||||
#define FPF fprintf(fout,
|
||||
|
||||
void WriteRep(FILE *fout, Repeats AllRepeats, int8_t opt_dir, int8_t opt_inv, char opt_shape, int32_t sizeofchr)
|
||||
{
|
||||
|
||||
int32_t i;
|
||||
int32_t spacer;
|
||||
|
||||
if(!opt_dir && !opt_inv)
|
||||
fprintf(stderr, "WriteRep: need opt_dir or opt_inv\n"),exit(4);
|
||||
|
||||
if(opt_shape!='l' && opt_shape!='c')
|
||||
fprintf(stderr, "WriteRep: need opt_shape set to 'l' or 'c'\n"),exit(4);
|
||||
|
||||
if(opt_dir){
|
||||
|
||||
#define REP(i) AllRepeats.DirRep[i]
|
||||
|
||||
for(i=0; i<AllRepeats.nDirRep; i++){
|
||||
|
||||
if(REP(i).pos1 == -1)continue;
|
||||
|
||||
if(opt_shape == 'l')
|
||||
spacer = REP(i).pos2 -1 - (REP(i).pos1+REP(i).len1-1);
|
||||
else
|
||||
spacer=MIN( REP(i).pos2-1 - (REP(i).pos1+REP(i).len1-1) , sizeofchr-1 - (REP(i).pos2+REP(i).len2-1) + REP(i).pos1 );
|
||||
|
||||
FPF "%s.dir\t%ld\t%ld\t%ld\t%ld\t%ld\t",REP(i).type, (long) REP(i).pos1+1, (long) REP(i).pos2+1, (long) REP(i).len1, (long) REP(i).len2, (long) spacer);
|
||||
FPF "%ld-%ld-%ld-%.2f\t", (long) REP(i).seed_pos1+1, (long) REP(i).seed_pos2+1, (long) REP(i).seed_len, REP(i).seed_meanR);
|
||||
FPF "%.3f\t%.2f\t", 100*(float)REP(i).match/(float)REP(i).align_len, REP(i).score );
|
||||
FPF "%.2f\t%ld\t%.2f\n", REP(i).meanR, (long) REP(i).mainR, REP(i).fraction_mainR );
|
||||
|
||||
}
|
||||
|
||||
#undef REP
|
||||
}
|
||||
|
||||
if(opt_inv){
|
||||
|
||||
#define REP(i) AllRepeats.InvRep[i]
|
||||
|
||||
for(i=0; i<AllRepeats.nInvRep; i++){
|
||||
|
||||
if(REP(i).pos1 == -1)continue;
|
||||
|
||||
if(opt_shape == 'l')
|
||||
spacer = REP(i).pos2 -1 - (REP(i).pos1+REP(i).len1-1);
|
||||
else
|
||||
spacer=MIN( REP(i).pos2-1 - (REP(i).pos1+REP(i).len1-1) , sizeofchr-1 - (REP(i).pos2+REP(i).len2-1) + REP(i).pos1 );
|
||||
|
||||
|
||||
FPF "%s.inv\t%ld\t%ld\t%ld\t%ld\t%ld\t",REP(i).type,(long) REP(i).pos1+1, (long) REP(i).pos2+1, (long) REP(i).len1, (long) REP(i).len2,(long) spacer);
|
||||
FPF "%ld-%ld-%ld-%.2f\t", (long)REP(i).seed_pos1+1, (long)REP(i).seed_pos2+1, (long)REP(i).seed_len, REP(i).seed_meanR);
|
||||
FPF "%.3f\t%.2f\t", 100*(float)REP(i).match/(float)REP(i).align_len, REP(i).score );
|
||||
FPF "%.2f\t%ld\t%.2f\n", REP(i).meanR, (long) REP(i).mainR, REP(i).fraction_mainR );
|
||||
}
|
||||
|
||||
#undef REP
|
||||
}
|
||||
}
|
||||
|
||||
/***
|
||||
Write out all repeats
|
||||
**/
|
||||
void WriteRep_2seqs(FILE *fout, Repeats AllRepeats, int8_t opt_dir, int8_t opt_inv, int32_t size1, int32_t size2)
|
||||
{
|
||||
|
||||
int32_t i;
|
||||
int32_t diff2edge, diff1, diff2;
|
||||
|
||||
|
||||
if(!opt_dir && !opt_inv)
|
||||
fprintf(stderr, "WriteRep2: need opt_dir or opt_inv\n"),exit(4);
|
||||
|
||||
if(opt_dir){
|
||||
|
||||
#define REP(i) AllRepeats.DirRep[i]
|
||||
|
||||
for(i=0; i<AllRepeats.nDirRep; i++){
|
||||
|
||||
if(REP(i).pos1 == -1)continue;
|
||||
|
||||
diff1 = ( REP(i).pos1+1>size1/2 )? REP(i).pos1+1-size1/2 : REP(i).pos1+1;
|
||||
diff2 = ( REP(i).pos2> size2/2 )? REP(i).pos2 -size2/2 : REP(i).pos2;
|
||||
diff2edge = ABS(diff1 - diff2);
|
||||
|
||||
FPF "%s.dir\t%ld\t%ld\t%ld\t%ld\t%ld\t","Interseq",(long) REP(i).pos1+1, (long) REP(i).pos2+1, (long) REP(i).len1, (long) REP(i).len2, (long)diff2edge);
|
||||
FPF "%ld-%ld-%ld\t%.3f\t%.2f\t",(long)REP(i).seed_pos1+1, (long) REP(i).seed_pos2+1, (long) REP(i).seed_len,
|
||||
100*(float)REP(i).match / (float)REP(i).align_len, REP(i).score );
|
||||
FPF "%.2f\t%ld\t%.2f\n", REP(i).meanR, (long) REP(i).mainR, REP(i).fraction_mainR );
|
||||
}
|
||||
|
||||
#undef REP
|
||||
}
|
||||
|
||||
|
||||
if(opt_inv){
|
||||
|
||||
#define REP(i) AllRepeats.InvRep[i]
|
||||
|
||||
for(i=0; i<AllRepeats.nInvRep; i++){
|
||||
|
||||
if(REP(i).pos1 == -1)continue;
|
||||
|
||||
diff1 = (REP(i).pos1+1>size1/2)?REP(i).pos1+1-size1/2:REP(i).pos1+1;
|
||||
diff2 = (REP(i).pos2> size2/2)?REP(i).pos2-size2/2:REP(i).pos2;
|
||||
diff2edge = ABS(diff1 - diff2);
|
||||
|
||||
|
||||
FPF "%s.inv\t%ld\t%ld\t%ld\t%ld\t%ld\t","Interseq", (long)REP(i).pos1+1, (long) REP(i).pos2+1 , (long) REP(i).len1, (long) REP(i).len2, (long) diff2edge);
|
||||
FPF "%ld-%ld-%ld\t%.3f\t%.2f\t", (long) REP(i).seed_pos1+1, (long) REP(i).seed_pos2+1, (long) REP(i).seed_len,
|
||||
100*(float)REP(i).match / (float)REP(i).align_len, REP(i).score);
|
||||
FPF "%.2f\t%ld\t%.2f\n", REP(i).meanR, (long) REP(i).mainR, REP(i).fraction_mainR );
|
||||
}
|
||||
|
||||
#undef REP
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/***
|
||||
Write out the table_r (degree of repetition of each seq position)
|
||||
***/
|
||||
|
||||
void WriteTableR(FILE *fout, int32_t *table_r, int32_t max)
|
||||
{
|
||||
|
||||
int32_t i;
|
||||
|
||||
if( table_r == NULL )
|
||||
return;
|
||||
|
||||
for(i=0;i<max; i++)
|
||||
if(table_r[i]>=2)
|
||||
fprintf(fout,
|
||||
"TableR %ld %ld\n",
|
||||
(long) i+1,
|
||||
(long) table_r[i]);
|
||||
}
|
||||
|
||||
/***
|
||||
Write out the seeds that will be used for extension phase
|
||||
***/
|
||||
|
||||
void WriteSeeds(FILE *fout,
|
||||
AllSeeds_type *AllSeeds,
|
||||
int8_t opt_dir,
|
||||
int8_t opt_inv)
|
||||
{
|
||||
Seed_type *curSeed;
|
||||
|
||||
int32_t i; /* just a dummy counter */
|
||||
|
||||
if(!opt_dir && !opt_inv)
|
||||
fprintf(stderr, "WriteSeeds: need opt_dir or opt_inv\n"),exit(4);
|
||||
|
||||
if(opt_dir){
|
||||
for(i=0,curSeed = AllSeeds->dirSeeds; i<AllSeeds->nDirSeeds; i++,curSeed++)
|
||||
FPF "d\t%ld\t%ld\t%ld\t%.2f\n", (long) curSeed->pos1 + 1, (long) curSeed->pos2 + 1, (long) curSeed->length, curSeed->rmean);
|
||||
}
|
||||
|
||||
if(opt_inv){
|
||||
for(i=0,curSeed = AllSeeds->invSeeds;i<AllSeeds->nInvSeeds;i++,curSeed++)
|
||||
FPF "i\t%ld\t%ld\t%ld\t%.2f\n", (long) curSeed->pos1 + 1, (long) curSeed->pos2 + 1, (long) curSeed->length, curSeed->rmean);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#undef FPF
|
||||
|
||||
|
47
src/repseek/repseek-2014.09/output.h
Normal file
47
src/repseek/repseek-2014.09/output.h
Normal file
@ -0,0 +1,47 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/**
|
||||
* @file output.h
|
||||
* @author Guillaume Achaz <gachaz@oeb.harvard.edu>
|
||||
* @date April 2004
|
||||
* @modif Dec 2005
|
||||
* @brief header for output functions
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef output_h
|
||||
#define output_h
|
||||
|
||||
#include "KMRK_Seeds.h"
|
||||
|
||||
|
||||
void WriteSeeds(FILE *fout,
|
||||
AllSeeds_type *AllSeeds,
|
||||
int8_t opt_dir,
|
||||
int8_t opt_inv);
|
||||
|
||||
|
||||
void WriteRep_2seqs(FILE *fout, Repeats AllRepeats, int8_t opt_dir, int8_t opt_inv, int32_t size1, int32_t size2);
|
||||
void WriteRep(FILE *fout, Repeats AllRepeats, int8_t opt_dir, int8_t opt_inv, char opt_shape, int32_t sizeofchr);
|
||||
void WriteTableR(FILE *fout, int32_t *table_r, int32_t max);
|
||||
|
||||
|
||||
#endif /* output_h */
|
108
src/repseek/repseek-2014.09/read_seeds.c
Normal file
108
src/repseek/repseek-2014.09/read_seeds.c
Normal file
@ -0,0 +1,108 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/******
|
||||
file : read_seeds.c
|
||||
function : read seeds from a file (instead of using KMRK)
|
||||
|
||||
created : Dec 2005
|
||||
modif :
|
||||
|
||||
author : amikezor
|
||||
*****/
|
||||
|
||||
|
||||
#include "repseek_types.h"
|
||||
#include "KMRK_mask.h"
|
||||
#include "KMRK_Seeds.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
/*
|
||||
This function takes a file name
|
||||
the minimum length, if dir and/or inverted should be read
|
||||
the size of the firt sequence and the size of the second sequence (if any))
|
||||
*/
|
||||
AllSeeds_type *read_seeds(char *seed_file, int32_t lmin, int8_t opt_dir, int8_t opt_inv, int32_t size1, int32_t size2){
|
||||
|
||||
|
||||
AllSeeds_type *all_seeds=NULL; /* a seed pointer */
|
||||
FILE *fcin=NULL; /* file descriptor */
|
||||
|
||||
char direction; /* either 'd' or 'i' */
|
||||
int8_t dir; /* 1 if direct, 0 otherwise */
|
||||
|
||||
int32_t begin, /* temporary begin, end and len of a seed - tmp is needed when swap is requiered */
|
||||
end,
|
||||
len,
|
||||
tmp;
|
||||
|
||||
int32_t nseed=0; /* the seed_number */
|
||||
|
||||
|
||||
all_seeds = KMRK_allocSeeds(all_seeds, 1, opt_dir, opt_inv); /* get seeds */
|
||||
|
||||
/*all_seeds = KMRK_allocSeeds(all_seeds, 1, 1, 1);*/ /* get seeds */
|
||||
/*all_seeds = KMRK_allocSeeds(all_seeds, 1, 0, 1);*/ /* get seeds */
|
||||
|
||||
|
||||
fcin = fopen(seed_file, "r");
|
||||
if(!fcin)fprintf(stderr, "error whilst trying to read seed_file %s, please check it; bye\n", seed_file),exit(2);
|
||||
|
||||
while( fscanf(fcin, "%c%ld%ld%ld", &direction, (long *) &begin, (long *) &end, (long *) &len) == 4){
|
||||
|
||||
dir=1; /* by default it is direct -- for the switch -- */
|
||||
|
||||
switch(direction){
|
||||
|
||||
case 'i':
|
||||
if(begin>end){ tmp=begin; begin=end; end=tmp; }
|
||||
dir=0;
|
||||
|
||||
case 'd':
|
||||
if( (dir && !opt_dir) || (!dir && !opt_inv) )break;
|
||||
if(len>=lmin)
|
||||
{
|
||||
if( begin<1 ||
|
||||
( !size2 && end+len-1>size2 ) ||
|
||||
( size2 && (end+len-1>size2 || begin+len-1>size1) )
|
||||
)
|
||||
KMRK_pushSeed(all_seeds, begin-1, end-1, len, dir);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
fprintf(stderr, "direction of seed at line %d is not valid, bye",nseed+1) , exit(4);
|
||||
|
||||
}
|
||||
|
||||
while( fgetc(fcin) != '\n'); /* remove other infos at the end of line */
|
||||
|
||||
nseed++;
|
||||
}
|
||||
|
||||
|
||||
fclose(fcin);
|
||||
KMRK_compactSeeds(all_seeds);
|
||||
|
||||
return all_seeds;
|
||||
}
|
||||
|
||||
|
32
src/repseek/repseek-2014.09/read_seeds.h
Normal file
32
src/repseek/repseek-2014.09/read_seeds.h
Normal file
@ -0,0 +1,32 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/******
|
||||
file : read_seeds.h
|
||||
function : header for read seeds
|
||||
|
||||
created : Dec 2005
|
||||
modif :
|
||||
|
||||
author : amikezor
|
||||
*****/
|
||||
|
||||
#include "repseek_types.h"
|
||||
|
||||
AllSeeds_type *read_seeds(char *seed_file, int32_t lmin, int8_t opt_dir, int8_t opt_inv, int32_t size1, int32_t size2);
|
222
src/repseek/repseek-2014.09/readfst.c
Normal file
222
src/repseek/repseek-2014.09/readfst.c
Normal file
@ -0,0 +1,222 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/******
|
||||
file : readfst.c
|
||||
function : what you need to read the fst files
|
||||
|
||||
created : Oct 03 2003
|
||||
modif : may 2008 - count Xs in the sequence
|
||||
: sept 2014 - add an extra-check when reading fasta file
|
||||
|
||||
note : derived from the libiofst src, changing the malloc() to MyMalloc()
|
||||
author : amikezor
|
||||
*****/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "repseek_types.h"
|
||||
#include "sequence.h"
|
||||
#include "memory.h"
|
||||
|
||||
|
||||
static int32_t size_oneseq(FILE *fseq)
|
||||
{
|
||||
int32_t size, count=0, bonux=0, /* size of the seq, count number of letter/line, and final carac(\n or nothing) */
|
||||
begin_oct, end_oct; /* begin and and end in octet */
|
||||
|
||||
int temp; /* used for temp memory */
|
||||
|
||||
while( fgetc(fseq) != '\n'); /* get out the first line [comments] */
|
||||
|
||||
begin_oct = ftell(fseq); /* where begin the second line */
|
||||
|
||||
while( fgetc(fseq) != '\n')
|
||||
count++; /* get the size of the second line */
|
||||
|
||||
|
||||
fseek(fseq, -1, SEEK_END); /* Do we have ACGT\n[EOF] or ACGT[EOF] */
|
||||
bonux = (fgetc(fseq) == '\n')?1:0;
|
||||
|
||||
fseek(fseq,-count-bonux,SEEK_END);
|
||||
|
||||
while( fgetc(fseq) != '\n');
|
||||
|
||||
end_oct = ftell(fseq); /* where finish the ante-last line */
|
||||
|
||||
size = (end_oct-begin_oct)/(count+1);
|
||||
size*=count;
|
||||
|
||||
count = 0;
|
||||
while((temp=fgetc(fseq)) != EOF && temp != '\n' )
|
||||
count++; /* get the last line */
|
||||
|
||||
size+=count;
|
||||
|
||||
fseek(fseq,0,SEEK_SET); /* go back to the begin of the file */
|
||||
|
||||
return size;
|
||||
}
|
||||
|
||||
|
||||
static int32_t get_curfst(char *sequence, FILE *fseq, long size_seq)
|
||||
{
|
||||
|
||||
int32_t lseq;
|
||||
int letter;
|
||||
|
||||
lseq = 0;
|
||||
|
||||
while(fgetc(fseq) != '\n'); /* lecture de la premiere ligne */
|
||||
|
||||
|
||||
while ((letter=fgetc(fseq)) != EOF || letter== '>')
|
||||
if (letter !='\n')
|
||||
sequence[lseq++]=letter;
|
||||
|
||||
if(letter == '>')
|
||||
fseek(fseq, -1, SEEK_CUR);
|
||||
|
||||
if(lseq != size_seq){
|
||||
fprintf(stderr, "\n!! Your input fasta file seems not correct !!\n");
|
||||
fprintf(stderr, "\t * the first line must start with '>'.\n");
|
||||
fprintf(stderr, "\t * all sequence lines (except the last one) must have the same size.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
sequence[lseq]=0;
|
||||
return lseq;
|
||||
|
||||
}
|
||||
|
||||
static int32_t numseq(FILE *ffile)
|
||||
{
|
||||
|
||||
int32_t number=0;
|
||||
int tmp;
|
||||
|
||||
|
||||
if(fgetc(ffile) != '>')
|
||||
fprintf(stderr,"Check your fasta sequences\n"), exit(4);
|
||||
|
||||
rewind(ffile); /* go back at the begining */
|
||||
|
||||
while( ( tmp=fgetc(ffile) ) != EOF )
|
||||
if(tmp == '>'){
|
||||
number++;
|
||||
while(fgetc(ffile) != '\n');
|
||||
}
|
||||
|
||||
|
||||
rewind(ffile); /*return at the file's begining*/
|
||||
|
||||
return number;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
char *file2mem(char *file, int32_t *size) /*le pointeur size permet de recup la size */
|
||||
{
|
||||
|
||||
FILE *fseq;
|
||||
char *seq;
|
||||
|
||||
fseq = fopen(file,"r");
|
||||
if(fseq == NULL)
|
||||
fprintf(stderr,"Flux error, check the fasta seq file\n"),exit(3);
|
||||
|
||||
|
||||
if( numseq(fseq) == 1)
|
||||
*size = size_oneseq(fseq); /* return length of seq */
|
||||
else
|
||||
fprintf(stderr,"there is not exactly ONE sequence in that file\n"),exit(2);
|
||||
|
||||
seq = (char *)MyMalloc((*size+1)*sizeof(char),"get_chr: Malloc error");
|
||||
|
||||
get_curfst(seq, fseq, *size);
|
||||
|
||||
fclose(fseq);
|
||||
|
||||
return seq;
|
||||
}
|
||||
|
||||
|
||||
char *fuse2files_mem(char *file1, char *file2, int32_t *size1, int32_t *size2)
|
||||
{
|
||||
|
||||
FILE *fseq1; /* pointer to the first file */
|
||||
FILE *fseq2; /* pointer to the second one */
|
||||
char *seq; /* the sequence seq1_X_seq2 */
|
||||
|
||||
int32_t size; /* the total size = size1+size2+1 */
|
||||
|
||||
fseq1 = fopen(file1,"r");
|
||||
fseq2 = fopen(file2,"r");
|
||||
if(!fseq1 || !file2)fprintf(stderr,"twofiles2mem: Flux error, check the fasta seq files\n"),exit(3);
|
||||
|
||||
if( numseq(fseq1) == 1)
|
||||
*size1 = size_oneseq(fseq1); /* return length of seq1 */
|
||||
else
|
||||
fprintf(stderr,"twofiles2mem: there is not only 1 sequence in file %s\n", file1),exit(2);
|
||||
|
||||
|
||||
if( numseq(fseq2) == 1)
|
||||
*size2 = size_oneseq(fseq2); /* return length of seq2 */
|
||||
else
|
||||
fprintf(stderr,"twofiles2mem: there is not only 1 sequence in file %s\n", file2),exit(2);
|
||||
|
||||
size = (*size1) + 1 + (*size2);
|
||||
|
||||
seq = (char *)MyMalloc((size+1)*sizeof(char),"twofiles2mem: malloc error");
|
||||
|
||||
if (get_curfst(seq, fseq1, *size1) != *size1)
|
||||
fprintf(stderr,"twofiles2mem: sequence of file %s is not clean\n",file1),exit(4);
|
||||
|
||||
seq[ *size1 ]='X';
|
||||
|
||||
if (get_curfst(seq + (*size1)+1, fseq2, *size2) != *size2)
|
||||
fprintf(stderr,"twofiles2mem: sequence of file %s is not clean\n",file2),exit(4);
|
||||
|
||||
fclose(fseq1);
|
||||
fclose(fseq2);
|
||||
|
||||
return seq;
|
||||
}
|
||||
|
||||
char *readFastaSeq(char *file, int32_t *size, int32_t *nX)
|
||||
{
|
||||
|
||||
char *sequence;
|
||||
|
||||
/*fprintf(stderr,"get sequence... %s ",file);*/
|
||||
|
||||
sequence = file2mem(file, size);
|
||||
(void) UpperSequence(sequence); /* turn into UPPER sequence */
|
||||
nonACGTXtoN(sequence); /* set all non ACGTX into N */
|
||||
(void )CheckSeq(sequence, "ACGTX"); /* check the number of bad symbols */
|
||||
|
||||
*nX=countX(sequence);
|
||||
|
||||
|
||||
return sequence;
|
||||
}
|
||||
|
40
src/repseek/repseek-2014.09/readfst.h
Normal file
40
src/repseek/repseek-2014.09/readfst.h
Normal file
@ -0,0 +1,40 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/**
|
||||
* @file KMRK_readfst.h
|
||||
* @author Eric Coissac <coissac@inrialpes.fr>
|
||||
* @date Tue Feb 24 22:11:07 2004
|
||||
*
|
||||
* @brief Header file for fasta file reading
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#ifndef KMRK_readfst_h
|
||||
#define KMRK_readfst_h
|
||||
|
||||
char *file2mem(char *file, int32_t *size);
|
||||
|
||||
char *readFastaSeq(char *file, int32_t *size, int32_t *nX);
|
||||
|
||||
char *fuse2files_mem(char *file1, char *file2, int32_t *size1, int32_t *size2);
|
||||
|
||||
#endif /* KMRK_readfst_h */
|
226
src/repseek/repseek-2014.09/repseek_types.h
Normal file
226
src/repseek/repseek-2014.09/repseek_types.h
Normal file
@ -0,0 +1,226 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/**
|
||||
* @file repseek_types.h
|
||||
* @author Guillaume Achaz <gachaz@oeb.harvard.edu>
|
||||
* @date April 2004
|
||||
* @modif July 2004 turn scores into doubles
|
||||
* @brief definition of general types and macros for repseek
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#ifndef _REPSEEK_TYPES_
|
||||
#define _REPSEEK_TYPES_
|
||||
|
||||
/*
|
||||
Version of the program
|
||||
*/
|
||||
#define REPSEEK_VERSION "6.6"
|
||||
#define REPSEEK_DATE "Sept, 09, 2014"
|
||||
|
||||
|
||||
|
||||
/********** **********
|
||||
|
||||
General Macros
|
||||
|
||||
********** **********/
|
||||
|
||||
/*
|
||||
Macros to compare 2 or three values
|
||||
*/
|
||||
#define MAX2( A, B ) ( ((A)>(B))?(A):(B) )
|
||||
#define MAX3( A, B, C ) ( ((MAX2(A,B))>(C))?(MAX2(A,B)):(C) )
|
||||
#define MIN2( A, B ) ( ((A)<(B))?(A):(B) )
|
||||
|
||||
#define MAX( A, B ) MAX2( A, B )
|
||||
#define MIN( A, B ) MIN2( A, B )
|
||||
|
||||
/*
|
||||
Absolute values
|
||||
*/
|
||||
#define ABS(x) (( (x)>=0 )? (x) : -(x))
|
||||
|
||||
|
||||
|
||||
/********** **********
|
||||
|
||||
All types used in repseek
|
||||
|
||||
********** **********/
|
||||
|
||||
#include <stdio.h> /* The type FILE * is defined here */
|
||||
|
||||
#ifdef OSF
|
||||
typedef signed char int8_t;
|
||||
typedef short int16_t;
|
||||
typedef long int32_t;
|
||||
#else
|
||||
#include <stdint.h> /* all, the int??_t are defined in there for typical unix */
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Store informations about one STRICT repeat (seeds)
|
||||
*
|
||||
*/
|
||||
typedef struct { /* the complete seed structure */
|
||||
|
||||
int32_t pos1; /**< position of the first copy */
|
||||
int32_t pos2; /**< position of the second copy */
|
||||
|
||||
int32_t length; /**< length of the strict repeats */
|
||||
float rmean; /**< mean repeat leavel */
|
||||
|
||||
} Seed_type;
|
||||
|
||||
typedef struct { /* Just after a KMRK length X, only the 2 pos matter */
|
||||
|
||||
int32_t pos1; /**< postion of the first copy */
|
||||
int32_t pos2; /**< postion of the second copy */
|
||||
|
||||
} SmallSeed_type;
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Store informations about all strict repeat (seeds)
|
||||
*
|
||||
*/
|
||||
typedef struct {
|
||||
|
||||
int32_t cDirSeeds; /**< currently allocated space in dirSeeds array */
|
||||
int32_t nDirSeeds; /**< count of direct strict repeats */
|
||||
int32_t nFilteredDirSeeds; /**< ??? */
|
||||
|
||||
Seed_type* dirSeeds; /**< array of direct repeats */
|
||||
|
||||
int32_t cInvSeeds; /**< currently allocated space in invSeeds array */
|
||||
int32_t nInvSeeds; /**< count of inverted strict repeats */
|
||||
int32_t nFilteredInvSeeds; /**< ??? */
|
||||
|
||||
Seed_type* invSeeds; /**< array of inverted repeats */
|
||||
|
||||
} AllSeeds_type;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Store informations about one GENERIC repeat
|
||||
*
|
||||
*/
|
||||
typedef struct{
|
||||
|
||||
char type[20]; /* its name; i.e. Tandem, Palindrome, etc... */
|
||||
|
||||
int32_t pos1, pos2, /* both copies postions */
|
||||
len1, len2, /* both copies length */
|
||||
seed_pos1,seed_pos2, /* pos1 and pos2 of the originate seed */
|
||||
seed_len, /* len of the seed */
|
||||
|
||||
match, align_len; /* number of match and length of alignment */
|
||||
|
||||
double score; /* the alignment score */
|
||||
|
||||
float seed_meanR; /* the seed meanR */
|
||||
|
||||
float meanR; /* The mean R-level of the repeat */
|
||||
int32_t mainR; /* its Mode R */
|
||||
float fraction_mainR; /* the fraction of length containing the Mode R */
|
||||
|
||||
|
||||
} Rep;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Store informations about All GENERIC repeats
|
||||
*
|
||||
*/
|
||||
typedef struct {
|
||||
|
||||
int32_t nDirRep; /* Total Number of Direct Repats in Mem */
|
||||
int32_t nDirBadRep; /* Direct repeats set to -1 -- filtered out and co. */
|
||||
Rep *DirRep; /* The array of Dir Rep */
|
||||
|
||||
int32_t nInvRep; /* Total Number of Inverted Repats in Mem */
|
||||
int32_t nInvBadRep; /* Inverted Repeats set to -1 -- filtered out and co. */
|
||||
Rep *InvRep; /* The array of Inverted Rep */
|
||||
|
||||
} Repeats;
|
||||
|
||||
|
||||
#define MATRIX_SIZE 26
|
||||
|
||||
typedef struct { /******* The scoring Matrix ************/
|
||||
|
||||
double matrix[MATRIX_SIZE][MATRIX_SIZE]; /* the matrix of match/missmatch */
|
||||
double gap_open; /* value of gap-open */
|
||||
double gap_ext; /* value of gap_ext */
|
||||
double expect;
|
||||
|
||||
} SCORING;
|
||||
|
||||
|
||||
|
||||
typedef struct { /******* The Results of Alignement by Dynamik programming ************/
|
||||
|
||||
|
||||
double *scores; /* the score strings (+/- 'matrix') */
|
||||
double *pscore; /* pointer to the current score */
|
||||
|
||||
double *F; /* *F is a standard to describe the best score from top */
|
||||
|
||||
int32_t *traces; /* the path matrix - could take values 0 for diagonal, >0 for top and <0 for left */
|
||||
int32_t *Top; /* *Top is the number of steps from TOP -- needed for memorizing deletion in seq2 */
|
||||
int32_t Left;
|
||||
|
||||
|
||||
double *pBestScore; /* pointer to it */
|
||||
int32_t BestScore_row; /* its row and col */
|
||||
int32_t BestScore_col;
|
||||
|
||||
char *traceback; /* all you need for bactracking */
|
||||
char *traceback_f; /* memory for forward traceback and then check other seeds */
|
||||
char *traceback_b; /* memory needed for backward traceback - to avoid erasing the forward one */
|
||||
|
||||
int32_t alignment_len; /* guess ?? */
|
||||
int32_t matches; /* number of match (score>0 in scoring matrix) */
|
||||
|
||||
int32_t nSegment; /* number of segment */
|
||||
int32_t *Segment_begin; /* begin and end of each segment */
|
||||
int32_t *Segment_end;
|
||||
|
||||
int32_t max_scores; /* size of the matrices ; only for memory purposes */
|
||||
int32_t max_col;
|
||||
int32_t max_row;
|
||||
int32_t max_alignlen;
|
||||
|
||||
} RESULTS;
|
||||
|
||||
|
||||
|
||||
#endif /* _REPSEEK_TYPES_ */
|
||||
|
141
src/repseek/repseek-2014.09/sequence.c
Normal file
141
src/repseek/repseek-2014.09/sequence.c
Normal file
@ -0,0 +1,141 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/**
|
||||
* @file sequence.c
|
||||
* @author amikezor
|
||||
* @date 20 Dec 2002
|
||||
* @modif Oct 2003, April 2004, may 2008 - count Xs in the sequence
|
||||
*
|
||||
* @brief sequence manipulation
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include "repseek_types.h"
|
||||
|
||||
/* -------------------------------------------- */
|
||||
/* check sequence symbols */
|
||||
/* -------------------------------------------- */
|
||||
int8_t CheckSeq(char *seq, char *alpha)
|
||||
{
|
||||
int32_t ntot, nbad;
|
||||
|
||||
|
||||
for ( ntot = nbad = 0; *seq ; seq++, ntot++)
|
||||
if (! strchr(alpha, *seq))
|
||||
nbad++;
|
||||
|
||||
if (nbad){
|
||||
fprintf(stderr, "*WARNING* %d non-%s symbols on a total of %d ... ", nbad, alpha, ntot);
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* -------------------------------------------- */
|
||||
/* change all non A,C,G,T,X char into N */
|
||||
/* -------------------------------------------- */
|
||||
void nonACGTXtoN(char *seq)
|
||||
{
|
||||
char *pseq;
|
||||
|
||||
for( pseq=seq; *pseq; pseq++ )
|
||||
if (! strchr("ACGTX", *pseq) )
|
||||
*pseq='N';
|
||||
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* -------------------------------------------- */
|
||||
/* count all X in a sequence */
|
||||
/* -------------------------------------------- */
|
||||
int32_t countX(char *seq)
|
||||
{
|
||||
char *pseq;
|
||||
int32_t nX=0;
|
||||
|
||||
for( pseq=seq; *pseq; pseq++ )
|
||||
if ( *pseq == 'X' )
|
||||
nX++;
|
||||
|
||||
return nX;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/* -------------------------------------------- */
|
||||
/* uppercase sequence */
|
||||
/* -------------------------------------------- */
|
||||
|
||||
#define IS_LOWER(c) (((c) >= 'a') && ((c) <= 'z'))
|
||||
#define TO_UPPER(c) ((c) - 'a' + 'A')
|
||||
|
||||
void UpperSequence(char *seq){
|
||||
|
||||
char *cseq;
|
||||
|
||||
for (cseq = seq ; *cseq ; cseq++)
|
||||
if (IS_LOWER(*cseq))
|
||||
*cseq = TO_UPPER(*cseq);
|
||||
|
||||
}
|
||||
|
||||
#undef IS_LOWER
|
||||
#undef TO_UPPER
|
||||
|
||||
|
||||
|
||||
void invseq(char *seqsrc, char *seqdest)
|
||||
{
|
||||
|
||||
char *pseqsrc,
|
||||
*pseqinv;
|
||||
|
||||
pseqsrc=seqsrc;
|
||||
pseqinv=seqdest+strlen(seqsrc)-1;
|
||||
|
||||
while(*pseqsrc){
|
||||
|
||||
if(*pseqsrc == 'A' || *pseqsrc == 'a'){*pseqinv = 'T';}
|
||||
else if(*pseqsrc == 'C' || *pseqsrc == 'c'){*pseqinv = 'G';}
|
||||
else if(*pseqsrc == 'G' || *pseqsrc == 'g'){*pseqinv = 'C';}
|
||||
else if(*pseqsrc == 'T' || *pseqsrc == 't'){*pseqinv = 'A';}
|
||||
else if(*pseqsrc == 'X' || *pseqsrc == 'x'){*pseqinv = 'X';}
|
||||
else{*pseqinv = 'N';}
|
||||
|
||||
pseqinv--;
|
||||
pseqsrc++;
|
||||
|
||||
}
|
||||
|
||||
|
||||
seqdest[strlen(seqsrc)]=0;
|
||||
|
||||
}
|
||||
|
||||
|
45
src/repseek/repseek-2014.09/sequence.h
Normal file
45
src/repseek/repseek-2014.09/sequence.h
Normal file
@ -0,0 +1,45 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/**
|
||||
* @file KMRK_sequence.h
|
||||
* @author Eric Coissac <coissac@inrialpes.fr>
|
||||
* @date Tue Feb 24 22:22:57 2004
|
||||
*
|
||||
* @brief Header file for sequence utilities
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef KMRK_sequence_h
|
||||
#define KMRK_sequence_h
|
||||
|
||||
#include "repseek_types.h"
|
||||
|
||||
int8_t CheckSeq(char *seq, char *alpha);
|
||||
|
||||
void nonACGTXtoN(char *seq);
|
||||
|
||||
void UpperSequence(char *seq);
|
||||
|
||||
void invseq(char *seqsrc, char *seqdest);
|
||||
|
||||
int32_t countX(char *seq);
|
||||
|
||||
#endif /* KMRK_sequence_h */
|
223
src/repseek/repseek-2014.09/smin.c
Normal file
223
src/repseek/repseek-2014.09/smin.c
Normal file
@ -0,0 +1,223 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/**
|
||||
* @file smin.c
|
||||
* @author Guillaume Achaz <gachaz@gmail.com>
|
||||
* @date Jan 2006
|
||||
* @modif Mar 06 - a bug while plugin the regressions numbers-
|
||||
* @modif May 08 - do not count Xs from the sequence
|
||||
*
|
||||
* @brief from a sequence and a pvalue, get an Smin
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
#include "repseek_types.h"
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
|
||||
static float get_dGC(char * sequence, int32_t len )
|
||||
{
|
||||
int32_t letters[5] = {0,};
|
||||
int32_t i=0;
|
||||
|
||||
for(i=0;i<len; i++){
|
||||
|
||||
if (sequence[i] == 'A')
|
||||
letters[0]++;
|
||||
else if (sequence[i] == 'C')
|
||||
letters[1]++;
|
||||
else if (sequence[i] == 'G')
|
||||
letters[2]++;
|
||||
else if (sequence[i] == 'T')
|
||||
letters[3]++;
|
||||
else
|
||||
letters[4]++;
|
||||
}
|
||||
|
||||
return ABS( 50.0 - 100.0*(float)( letters[1] + letters[2] ) / (float)( len - letters[4] ) );
|
||||
}
|
||||
|
||||
static float get_dGC_2seq(char * sequence1, int32_t len1, char *sequence2, int32_t len2 )
|
||||
{
|
||||
|
||||
int32_t letters[5] = {0,};
|
||||
int32_t i=0;
|
||||
|
||||
for(i=0;i<len1; i++){
|
||||
|
||||
if (sequence1[i] == 'A')
|
||||
letters[0]++;
|
||||
else if (sequence1[i] == 'C')
|
||||
letters[1]++;
|
||||
else if (sequence1[i] == 'G')
|
||||
letters[2]++;
|
||||
else if (sequence1[i] == 'T')
|
||||
letters[3]++;
|
||||
else
|
||||
letters[4]++;
|
||||
}
|
||||
|
||||
for(i=0;i<len2; i++){
|
||||
|
||||
if (sequence2[i] == 'A')
|
||||
letters[0]++;
|
||||
else if (sequence2[i] == 'C')
|
||||
letters[1]++;
|
||||
else if (sequence2[i] == 'G')
|
||||
letters[2]++;
|
||||
else if (sequence2[i] == 'T')
|
||||
letters[3]++;
|
||||
else
|
||||
letters[4]++;
|
||||
}
|
||||
|
||||
|
||||
return ABS( 50.0 - 100.0*(float)( letters[1] + letters[2] ) / (float)( len1+len2 - letters[4] ) );
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
static double get_log_gamma(float dGC, int32_t len){
|
||||
|
||||
double log_gamma=0.0;
|
||||
log_gamma = - 0.0029688*dGC*dGC + 0.0727745*dGC - 0.0309583*log(len)*log(len) ;
|
||||
return log_gamma;
|
||||
}
|
||||
static double get_log_p(float dGC, int32_t len){
|
||||
|
||||
double log_p=0.0;
|
||||
log_p = 0.0001789 *dGC*dGC - 0.002409*dGC + 0.006838*log(len) - 1.060;
|
||||
return log_p;
|
||||
}
|
||||
|
||||
/*
|
||||
Using the formula of Watrerman & Vingron
|
||||
log(-log(F)) = log(gamma*m*n) + S*log(p)
|
||||
for unique sequence it becomes
|
||||
log(-log(F)) = log(gamma*n*(n-1)/2) + S*log(p)
|
||||
*/
|
||||
|
||||
static double get_Smin(float dGC, int32_t len, double pvalue){
|
||||
|
||||
double Smin=0.0;
|
||||
|
||||
double log_log_minusF = 0.0,
|
||||
log_gamma = 0.0,
|
||||
log_p = 0.0;
|
||||
|
||||
|
||||
log_log_minusF = log( -log( 1.00 - pvalue ) );
|
||||
log_gamma = get_log_gamma( dGC, len );
|
||||
log_p = get_log_p( dGC, len );
|
||||
|
||||
Smin = log_log_minusF - log_gamma - log( (double)len * (len-1.0) / 2.0 );
|
||||
Smin /= log_p;
|
||||
|
||||
return Smin;
|
||||
}
|
||||
|
||||
|
||||
static double get_Smin_2seq(float dGC, int32_t len1, int32_t len2, double pvalue){
|
||||
|
||||
double Smin=0.0;
|
||||
|
||||
double log_log_minusF = 0.0,
|
||||
log_gamma = 0.0,
|
||||
log_p = 0.0;
|
||||
|
||||
|
||||
log_log_minusF = log( -log( 1.00 - pvalue ) );
|
||||
log_gamma = get_log_gamma( dGC, (len1+len2)/2 ); /* average dGC and average len */
|
||||
log_p = get_log_p( dGC, (len1+len2)/2 );
|
||||
|
||||
|
||||
Smin = log_log_minusF - log_gamma - log( (double)len1 * len2 ); /* when two sequences are considered,
|
||||
the whole matrix [l1 l2] can potentially have repeats
|
||||
*/
|
||||
Smin /= log_p;
|
||||
|
||||
return Smin;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
double compute_smin(char *sequence, double pvalue, int32_t seq_len, int32_t nX){
|
||||
|
||||
double Smin=0.0;
|
||||
float dGC=0.0;
|
||||
|
||||
|
||||
dGC = get_dGC(sequence, seq_len);
|
||||
Smin = get_Smin( dGC, seq_len-nX, pvalue);
|
||||
|
||||
return Smin;
|
||||
}
|
||||
|
||||
|
||||
|
||||
double compute_smin_2seq(char *sequence1, char *sequence2, int32_t size1, int32_t size2, double pvalue, int32_t nX, int32_t nX2){
|
||||
|
||||
double Smin=0.0;
|
||||
float dGC=0.0;
|
||||
|
||||
dGC = get_dGC_2seq(sequence1, size1, sequence2, size2 );
|
||||
|
||||
Smin = get_Smin_2seq( dGC, size1-nX, size2-nX2, pvalue);
|
||||
|
||||
return Smin;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
void UpdateRepeats(Repeats *AllRepeats, double Score_min){
|
||||
|
||||
int32_t i;
|
||||
|
||||
if( AllRepeats->nDirRep )
|
||||
for(i=0;i<AllRepeats->nDirRep;i++)
|
||||
if(AllRepeats->DirRep[i].pos1 != -1 && AllRepeats->DirRep[i].score<Score_min){
|
||||
AllRepeats->DirRep[i].pos1 = -1;
|
||||
AllRepeats->DirRep[i].pos2 = -1;
|
||||
AllRepeats->DirRep[i].len1 = -1;
|
||||
AllRepeats->DirRep[i].len2 = -1;
|
||||
AllRepeats->DirRep[i].score = -1;
|
||||
AllRepeats->nDirBadRep++;
|
||||
}
|
||||
|
||||
if( AllRepeats->nInvRep )
|
||||
for(i=0;i<AllRepeats->nInvRep;i++)
|
||||
if(AllRepeats->InvRep[i].pos1 != -1 && AllRepeats->InvRep[i].score<Score_min){
|
||||
AllRepeats->InvRep[i].pos1 = -1;
|
||||
AllRepeats->InvRep[i].pos2 = -1;
|
||||
AllRepeats->InvRep[i].len1 = -1;
|
||||
AllRepeats->InvRep[i].len2 = -1;
|
||||
AllRepeats->InvRep[i].score = -1;
|
||||
AllRepeats->nInvBadRep++;
|
||||
}
|
||||
}
|
||||
|
||||
|
57
src/repseek/repseek-2014.09/smin.h
Normal file
57
src/repseek/repseek-2014.09/smin.h
Normal file
@ -0,0 +1,57 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/**
|
||||
* @file smin.h
|
||||
* @author Guillaume Achaz <gachaz@oeb.harvard.edu>
|
||||
* @date April 2004
|
||||
*
|
||||
* @brief header for all shuffling/smin process
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#ifndef _smin_h_
|
||||
#define _smin_h_
|
||||
|
||||
|
||||
/*
|
||||
From a sequence, get length and dGC
|
||||
then compute the Smin using Waterman & Vingron
|
||||
*/
|
||||
double compute_smin(char *sequence, double pvalue, int32_t size, int32_t nX);
|
||||
|
||||
|
||||
/*
|
||||
From two sequences, get average length and average dGC
|
||||
then compute the Smin using Waterman & Vingron
|
||||
*/
|
||||
|
||||
double compute_smin_2seq(char *sequence1, char *sequence2, int32_t size1, int32_t size2, double pvalue, int32_t nX, int32_t nX2);
|
||||
|
||||
|
||||
/*
|
||||
tag all repeats with a score < smin
|
||||
set their positions and length at -1
|
||||
*/
|
||||
void UpdateRepeats(Repeats *AllRepeats, double Score_min);
|
||||
|
||||
|
||||
#endif /* _smin_h_*/
|
491
src/repseek/repseek-2014.09/sort.c
Normal file
491
src/repseek/repseek-2014.09/sort.c
Normal file
@ -0,0 +1,491 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
/**
|
||||
* @file sort.c
|
||||
* @author Guillaume Achaz <gachaz@oeb.harvard.edu>
|
||||
* @date oct 03 2002
|
||||
* @modif dec 13 2002 - add the sort_repeats() and qsort_1then2_repeats()
|
||||
Oct 2003
|
||||
April 2003 - Int32->int32_t
|
||||
July 2004, change Qsort into double_Qsort
|
||||
*
|
||||
* @brief sort two arrays by values of the first one (int or Rep) - Implemented from Numrec 92 pp.333/336
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "repseek_types.h"
|
||||
#include "memory.h"
|
||||
|
||||
/***
|
||||
sort two array by the order of the first one
|
||||
Qsort is implemented following the algorithm explained in Numrec C, pp 332-335
|
||||
Under a certain size, a part of the array is sorted by the naive method (straight insertion)
|
||||
***/
|
||||
|
||||
#define NSTACK 1000 /* the number of subarray that could be memorized */
|
||||
#define SWITCH_METHOD 7 /* under that size switch to a Insertion method */
|
||||
#define SWAP(a,b) {tmp=(a); (a)=(b); (b)=tmp;}
|
||||
|
||||
static void qsort2(int32_t *array1, int32_t *array2, int32_t n){
|
||||
|
||||
|
||||
int32_t *Stack; /* where remaining values of subarrays are temporarily stocked */
|
||||
int32_t nStack=0; /* How many of these values are in the Stack. Is never odd */
|
||||
|
||||
int32_t end=n-1, /* the last value of the array to sort */
|
||||
beg=0, /* the first value --- */
|
||||
postbeg; /* the second value --- */
|
||||
|
||||
int32_t val_postbeg; /* the value of the postbeg position - the one which is used for partion-exchange */
|
||||
|
||||
int32_t demi_len; /* half of the distance between beg and end */
|
||||
|
||||
int32_t i, /* counter from postbeg to right */
|
||||
j; /* counter from end to left */
|
||||
|
||||
int32_t val1_i, /* for insertion stock temporarily a value */
|
||||
val2_i;
|
||||
|
||||
int32_t tmp; /* used for the SWAP macro */
|
||||
|
||||
|
||||
Stack = (int32_t *)MyMalloc(NSTACK*sizeof(int32_t) , "qsort2: not enough memory for Stack") ;
|
||||
|
||||
while(1){
|
||||
|
||||
if( end-beg+1 > SWITCH_METHOD){
|
||||
|
||||
demi_len = (end-beg) >> 1 ;
|
||||
postbeg = beg+1;
|
||||
|
||||
SWAP( array1[beg+demi_len], array1[postbeg] );
|
||||
SWAP( array2[beg+demi_len], array2[postbeg] );
|
||||
|
||||
if(array1[beg] > array1[postbeg]){ /* rearrange to have beg <= postbeg <= end */
|
||||
SWAP( array1[beg], array1[postbeg] );
|
||||
SWAP( array2[beg], array2[postbeg] );
|
||||
}
|
||||
|
||||
if(array1[beg] > array1[end]){
|
||||
SWAP( array1[beg], array1[end] );
|
||||
SWAP( array2[beg], array2[end] );
|
||||
}
|
||||
|
||||
if(array1[postbeg] > array1[end]){
|
||||
SWAP( array1[postbeg], array1[end] );
|
||||
SWAP( array2[postbeg], array2[end] );
|
||||
}
|
||||
|
||||
|
||||
i = postbeg;
|
||||
j = end;
|
||||
|
||||
val_postbeg = array1[postbeg];
|
||||
|
||||
while(1) /* enter the partition exchange process */
|
||||
{
|
||||
do i++; while( array1[i] < val_postbeg );
|
||||
do j--; while( array1[j] > val_postbeg );
|
||||
|
||||
if(j<i) break;
|
||||
|
||||
SWAP( array1[i], array1[j] );
|
||||
SWAP( array2[i], array2[j] );
|
||||
}
|
||||
|
||||
SWAP( array1[postbeg] , array1[j] ); /* place the postbeg value into j */
|
||||
SWAP( array2[postbeg] , array2[j] );
|
||||
|
||||
if(nStack+2 > NSTACK)
|
||||
fprintf(stderr, "qsort2: not enough Stack... sorry bye\n"),exit(1);
|
||||
|
||||
if(end-i+1 >= j-beg){
|
||||
Stack[nStack++] = i; /* stock in Stack the largest and go with the smallest */
|
||||
Stack[nStack++] = end;
|
||||
|
||||
end = j-1;
|
||||
}
|
||||
else{
|
||||
Stack[nStack++] = beg;
|
||||
Stack[nStack++] = j-1;
|
||||
|
||||
beg = i;
|
||||
}
|
||||
|
||||
}
|
||||
else{ /* Under a certain size, switch to the straight insertion method */
|
||||
|
||||
|
||||
for(i=beg+1; i <= end ; i++){
|
||||
|
||||
val1_i = array1[i];
|
||||
val2_i = array2[i];
|
||||
|
||||
for(j=i-1;j>=beg;j--){
|
||||
|
||||
if(array1[j] < val1_i)break;
|
||||
|
||||
array1[j+1] = array1[j];
|
||||
array2[j+1] = array2[j];
|
||||
}
|
||||
array1[j+1]=val1_i;
|
||||
array2[j+1]=val2_i;
|
||||
}
|
||||
|
||||
if(nStack==0)break; /* this si the end - exit the process */
|
||||
|
||||
end = Stack[--nStack]; /* go for the next round with the stacked parameters */
|
||||
beg = Stack[--nStack];
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
MyFree(Stack, NSTACK*sizeof(int32_t));
|
||||
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
A simple qsort as described in numrec
|
||||
*/
|
||||
void double_Qsort(double *array1, int32_t n){
|
||||
|
||||
|
||||
double *Stack; /* where remaining values of subarrays are temporarily stocked */
|
||||
int32_t nStack=0; /* How many of these values are in the Stack. Is never odd */
|
||||
|
||||
int32_t end=n-1, /* the last value of the array to sort */
|
||||
beg=0, /* the first value --- */
|
||||
postbeg; /* the second value --- */
|
||||
|
||||
double val_postbeg; /* the value of the postbeg position - the one which is used for partion-exchange */
|
||||
|
||||
int32_t demi_len; /* half of the distance between beg and end */
|
||||
|
||||
int32_t i, /* counter from postbeg to right */
|
||||
j; /* counter from end to left */
|
||||
|
||||
double val1_i; /* for insertion stock temporarily a value */
|
||||
|
||||
double tmp; /* used for the SWAP macro */
|
||||
|
||||
|
||||
Stack= (double *)MyMalloc(NSTACK*sizeof(double) , "Qsort: not enough memory for Stack" );
|
||||
|
||||
while(1){
|
||||
|
||||
if( end-beg+1 > SWITCH_METHOD){
|
||||
|
||||
demi_len = (end-beg) >> 1 ;
|
||||
postbeg = beg+1;
|
||||
|
||||
SWAP( array1[beg+demi_len], array1[postbeg] );
|
||||
|
||||
if(array1[beg] > array1[postbeg]){ /* rearrange to have beg <= postbeg <= end */
|
||||
SWAP( array1[beg], array1[postbeg] );
|
||||
}
|
||||
|
||||
if(array1[beg] > array1[end]){
|
||||
SWAP( array1[beg], array1[end] );
|
||||
}
|
||||
|
||||
if(array1[postbeg] > array1[end]){
|
||||
SWAP( array1[postbeg], array1[end] );
|
||||
}
|
||||
|
||||
|
||||
i = postbeg;
|
||||
j = end;
|
||||
|
||||
val_postbeg = array1[postbeg];
|
||||
|
||||
while(1) /* enter the partition exchange process */
|
||||
{
|
||||
do i++; while( array1[i] < val_postbeg );
|
||||
do j--; while( array1[j] > val_postbeg );
|
||||
|
||||
if(j<i) break;
|
||||
|
||||
SWAP( array1[i], array1[j] );
|
||||
}
|
||||
|
||||
SWAP( array1[postbeg] , array1[j] ); /* place the postbeg value into j */
|
||||
|
||||
if(nStack+2 > NSTACK)
|
||||
fprintf(stderr, "Qsort: not enough Stack... sorry bye\n"),exit(1);
|
||||
|
||||
if(end-i+1 >= j-beg){
|
||||
Stack[nStack++] = i; /* stock in Stack the largest and go with the smallest */
|
||||
Stack[nStack++] = end;
|
||||
|
||||
end = j-1;
|
||||
}
|
||||
else{
|
||||
Stack[nStack++] = beg;
|
||||
Stack[nStack++] = j-1;
|
||||
|
||||
beg = i;
|
||||
}
|
||||
|
||||
}
|
||||
else{ /* Under a certain size, switch to the straight insertion method */
|
||||
|
||||
|
||||
for(i=beg+1; i <= end ; i++){
|
||||
|
||||
val1_i = array1[i];
|
||||
|
||||
for(j=i-1;j>=beg;j--){
|
||||
|
||||
if(array1[j] < val1_i)break;
|
||||
|
||||
array1[j+1] = array1[j];
|
||||
}
|
||||
array1[j+1]=val1_i;
|
||||
}
|
||||
|
||||
if(nStack==0)break; /* this si the end - exit the process */
|
||||
|
||||
end = Stack[--nStack]; /* go for the next round with the stacked parameters */
|
||||
beg = Stack[--nStack];
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
MyFree(Stack, NSTACK*sizeof(double));
|
||||
|
||||
}
|
||||
|
||||
#undef NSTACK
|
||||
#undef SWITCH_METHOD
|
||||
#undef SWAP
|
||||
|
||||
|
||||
/*
|
||||
Sort all repeat by an array
|
||||
Just modify from qsort2
|
||||
*/
|
||||
|
||||
#define SWAP(a,b) { tmprep=(a); (a)=(b); (b)=tmprep; }
|
||||
#define NSTACK 1000 /* the number of subarray that could be memorized */
|
||||
#define SWITCH_METHOD 7 /* under that size switch to a Insertion method */
|
||||
|
||||
static int32_t GetPos1( Rep repet ){
|
||||
return repet.pos1;
|
||||
}
|
||||
|
||||
static int32_t GetPos2( Rep repet ){
|
||||
return repet.pos2;
|
||||
}
|
||||
|
||||
static void qsort_repeats(Rep *AllReps, int32_t n, int32_t pos ){
|
||||
|
||||
int32_t *Stack; /* where remaining values of subarrays are temporarily stocked */
|
||||
int32_t nStack=0; /* How many of these values are in the Stack. Is never odd */
|
||||
|
||||
int32_t end=n-1, /* the last value of the array to sort */
|
||||
beg=0, /* the first value --- */
|
||||
postbeg; /* the second value --- */
|
||||
|
||||
int32_t val_postbeg; /* the value of the postbeg position - the one which is used for partion-exchange */
|
||||
|
||||
int32_t demi_len; /* half of the distance between beg and end */
|
||||
|
||||
int32_t i, /* counter from postbeg to right */
|
||||
j; /* counter from end to left */
|
||||
|
||||
Rep val_i; /* for insertion stock temporarily a value */
|
||||
Rep tmprep; /* used for the SWAP macro */
|
||||
|
||||
int32_t (* PtrFuncPos)( Rep repet );
|
||||
|
||||
if( pos == 1)
|
||||
PtrFuncPos = GetPos1;
|
||||
else if( pos == 2)
|
||||
PtrFuncPos = GetPos2;
|
||||
else
|
||||
fprintf(stderr,"qsort_repet: pos should be 1 or 2\n"),exit(4);
|
||||
|
||||
Stack=(int32_t *)MyMalloc(NSTACK*sizeof(int32_t) , "qsort_repet: not enough memory for Stack" );
|
||||
|
||||
while(1){
|
||||
|
||||
if( end-beg+1 > SWITCH_METHOD){
|
||||
|
||||
demi_len = (end-beg) >> 1 ;
|
||||
postbeg = beg+1;
|
||||
|
||||
SWAP( AllReps[beg+demi_len], AllReps[postbeg] );
|
||||
|
||||
if( PtrFuncPos(AllReps[beg]) > PtrFuncPos(AllReps[postbeg]) ){ /* rearrange to have beg <= postbeg <= end */
|
||||
SWAP( AllReps[beg], AllReps[postbeg] );
|
||||
}
|
||||
|
||||
if( PtrFuncPos(AllReps[beg]) > PtrFuncPos(AllReps[end]) ){
|
||||
SWAP( AllReps[beg], AllReps[end] );
|
||||
}
|
||||
|
||||
if( PtrFuncPos(AllReps[postbeg]) > PtrFuncPos(AllReps[end]) ){
|
||||
SWAP( AllReps[postbeg], AllReps[end] );
|
||||
}
|
||||
|
||||
i = postbeg;
|
||||
j = end;
|
||||
|
||||
val_postbeg = PtrFuncPos(AllReps[postbeg]);
|
||||
|
||||
while(1) /* enter the partition exchange process */
|
||||
{
|
||||
do i++; while( PtrFuncPos(AllReps[i]) < val_postbeg );
|
||||
do j--; while( PtrFuncPos(AllReps[j]) > val_postbeg );
|
||||
|
||||
if(j<i) break;
|
||||
|
||||
SWAP( AllReps[i], AllReps[j] );
|
||||
}
|
||||
|
||||
SWAP( AllReps[postbeg] , AllReps[j] ); /* place the postbeg value into j */
|
||||
|
||||
if(nStack+2 > NSTACK)
|
||||
fprintf(stderr, "qsort_repet: not enough Stack... sorry bye\n"),exit(1);
|
||||
|
||||
if(end-i+1 >= j-beg){
|
||||
Stack[nStack++] = i; /* stock in Stack the largest and go with the smallest */
|
||||
Stack[nStack++] = end;
|
||||
|
||||
end = j-1;
|
||||
}
|
||||
else{
|
||||
Stack[nStack++] = beg;
|
||||
Stack[nStack++] = j-1;
|
||||
|
||||
beg = i;
|
||||
}
|
||||
|
||||
}
|
||||
else{ /* Under a certain size, switch to the straight insertion method */
|
||||
|
||||
|
||||
for(i=beg+1; i <= end ; i++){
|
||||
|
||||
val_i = AllReps[i];
|
||||
|
||||
for(j=i-1;j>=beg;j--){
|
||||
|
||||
if( PtrFuncPos(AllReps[j]) <= PtrFuncPos(val_i) )break;
|
||||
|
||||
AllReps[j+1] = AllReps[j];
|
||||
}
|
||||
AllReps[j+1]=val_i;
|
||||
}
|
||||
|
||||
if(nStack==0)break; /* this si the end - exit the process */
|
||||
|
||||
end = Stack[--nStack]; /* go for the next round with the stacked parameters */
|
||||
beg = Stack[--nStack];
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
MyFree(Stack, NSTACK*sizeof(int32_t));
|
||||
|
||||
}
|
||||
|
||||
#undef NSTACK
|
||||
#undef SWITCH_METHOD
|
||||
#undef SWAP
|
||||
|
||||
|
||||
/***
|
||||
This function sort the first array, and for equal value, use the second one
|
||||
calling the qsort2() function
|
||||
***/
|
||||
void qsort2_1then2(int32_t *array1, int32_t *array2, int32_t n){
|
||||
|
||||
int32_t i;
|
||||
int32_t beg=0, end=n;
|
||||
|
||||
char token=0;
|
||||
|
||||
qsort2(array1, array2, n);
|
||||
|
||||
for(i=1, token=0; i < n-1 ; i++){
|
||||
|
||||
if(array1[i] == array1[i-1]){
|
||||
if(!token) beg = i-1;
|
||||
token = 1;
|
||||
}
|
||||
|
||||
if(array1[i] != array1[i-1])
|
||||
if(token){
|
||||
token=0;
|
||||
end=i;
|
||||
qsort2(array2+beg, array1+beg, end-beg);
|
||||
}
|
||||
}
|
||||
|
||||
if(token){
|
||||
end=i;
|
||||
qsort2(array2+beg, array1+beg, end-beg);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/***
|
||||
This function sort by pos1 and for equal value, use pos2
|
||||
calling the qsort2_repeats() function
|
||||
***/
|
||||
|
||||
void qsort_1then2_repeats(Rep *AllReps, int32_t n){
|
||||
|
||||
int32_t i;
|
||||
int32_t beg=0, end=n;
|
||||
|
||||
char token=0;
|
||||
|
||||
qsort_repeats(AllReps, n, 1 );
|
||||
|
||||
for(i=1, token=0; i < n-1 ; i++){
|
||||
|
||||
if( AllReps[i].pos1 == AllReps[i-1].pos1 ){
|
||||
if(!token) beg = i-1;
|
||||
token = 1;
|
||||
}
|
||||
|
||||
if(AllReps[i].pos1 != AllReps[i-1].pos1)
|
||||
if(token){
|
||||
token=0;
|
||||
end=i;
|
||||
qsort_repeats(AllReps+beg, end-beg, 2 );
|
||||
}
|
||||
}
|
||||
|
||||
if(token){
|
||||
end=i;
|
||||
qsort_repeats(AllReps+beg, end-beg, 2 );
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
61
src/repseek/repseek-2014.09/sort.h
Normal file
61
src/repseek/repseek-2014.09/sort.h
Normal file
@ -0,0 +1,61 @@
|
||||
/*
|
||||
Copyright (C) 2006 G achaz, F boyer, E rocha, A viari and E coissac.
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public License
|
||||
as published by the Free Software Foundation; either version 2.1
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
for more information, please contact guillaume achaz <achaz@abi.snv.jussieu.fr>
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file sort.h
|
||||
* @author amikezor <gachaz@oeb.harvard.edu>
|
||||
* @date april 2004
|
||||
*
|
||||
* @brief header fr sort.c
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#ifndef _SORT_H_
|
||||
#define _SORT_H_
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "repseek_types.h"
|
||||
#include "memory.h"
|
||||
|
||||
|
||||
/*
|
||||
A simple qsort as described in numrec
|
||||
*/
|
||||
void double_Qsort(double *array1, int32_t n);
|
||||
|
||||
|
||||
/***
|
||||
This function sort the first array, and for equal value, use the second one
|
||||
calling the qsort2() function
|
||||
***/
|
||||
void qsort2_1then2(int32_t *array1, int32_t *array2, int32_t n);
|
||||
|
||||
|
||||
/***
|
||||
This function sort by pos1 and for equal value, use pos2
|
||||
calling the qsort2_repeats() function
|
||||
***/
|
||||
|
||||
void qsort_1then2_repeats(Rep *AllReps, int32_t n);
|
||||
|
||||
#endif /* _SORT_H_*/
|
Reference in New Issue
Block a user