Manage sequence circularity

git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPCR/trunk@166 60f365c0-8329-0410-b2a4-ec073aeeaa1d
This commit is contained in:
2008-04-28 15:49:12 +00:00
parent 4b74056af8
commit d863b7e48e
6 changed files with 134 additions and 51 deletions

View File

@ -64,7 +64,7 @@ int ispatternmatching(ecoseq_t *seq, PatternPtr pattern){
if (pattern != NULL)
{
SeqPtr apatseq = NULL;
apatseq=ecoseq2apatseq(seq,apatseq);
apatseq=ecoseq2apatseq(seq,apatseq,0);
return ManberAll(apatseq,pattern,0,0,apatseq->seqlen) > 0;
}
else return 0;

View File

@ -131,6 +131,7 @@ void printRepeat(ecoseq_t *seq,
AC = seq->AC;
seqlength = seq->SQ_length;
main_taxon = &taxonomy->taxons->taxon[seq->taxid];
taxid = main_taxon->taxid;
scientificName= main_taxon->name;
@ -184,10 +185,9 @@ void printRepeat(ecoseq_t *seq,
superkingdom_name = "###";
}
amplength = pos2-pos1;
amplifia = getSubSequence(seq->SQ,pos1,pos2);
amplength= strlen(amplifia);
if (strand=='R')
{
@ -297,9 +297,10 @@ int main(int argc, char **argv)
int32_t *ignored_taxid = NULL;
int32_t r=0;
int32_t g=0;
int32_t circular=0;
while ((carg = getopt(argc, argv, "hd:l:L:e:i:r:k")) != -1) {
while ((carg = getopt(argc, argv, "hcd:l:L:e:i:r:k")) != -1) {
switch (carg) {
/* -------------------- */
@ -359,6 +360,11 @@ int main(int argc, char **argv)
break;
/* -------------------- */
case 'c': /* considers DB sequences as circular */
/* --------------------------------- */
circular = 1;
break;
case '?': /* bad option */
/* -------------------- */
errflag++;
@ -379,6 +385,11 @@ int main(int argc, char **argv)
oligo2 = ECOMALLOC(strlen(argv[optind])+1,
"Error on oligo1 allocation");
strcpy(oligo2,argv[optind]);
if (circular)
circular = strlen(oligo1);
if (strlen(oligo2)>(size_t)circular)
circular = strlen(oligo2);
}
else
errflag++;
@ -420,6 +431,10 @@ int main(int argc, char **argv)
printf("# output in kingdom mode\n");
else
printf("# output in superkingdom mode\n");
if (circular)
printf("# DB sequences are considered as circular\n");
else
printf("# DB sequences are considered as linear\n");
printf("#\n");
taxonomy = read_taxonomy(prefix,0);
@ -458,9 +473,9 @@ int main(int argc, char **argv)
strncpy(tail,seq->SQ+seq->SQ_length-10,10);
tail[10]=0;
apatseq=ecoseq2apatseq(seq,apatseq);
apatseq=ecoseq2apatseq(seq,apatseq,circular);
o1Hits = ManberAll(apatseq,o1,0,0,apatseq->seqlen);
o1Hits = ManberAll(apatseq,o1,0,0,apatseq->seqlen+apatseq->circular);
o2cHits= 0;
if (o1Hits)
@ -473,23 +488,43 @@ int main(int argc, char **argv)
else
length= apatseq->seqlen - begin;
if (circular)
{
begin = 0;
length=apatseq->seqlen+circular;
}
o2cHits = ManberAll(apatseq,o2c,1,begin,length);
if (o2cHits)
for (i=0; i < o1Hits;i++)
{
posi = apatseq->hitpos[0]->val[i];
if (posi < apatseq->seqlen)
{
erri = apatseq->hiterr[0]->val[i];
for (j=0; j < o2cHits; j++)
{
posj =apatseq->hitpos[1]->val[j] + o2c->patlen;
errj =apatseq->hiterr[1]->val[j];
length=posj - posi + 1 - o1->patlen - o2->patlen;
posj =apatseq->hitpos[1]->val[j];
if ((!lmin || (length >= lmin)) &&
if (posj < apatseq->seqlen)
{
posj+=o2c->patlen;
// printf("coucou %d %d %d\n",posi,posj,apatseq->seqlen);
errj =apatseq->hiterr[1]->val[j];
length = 0;
if (posj > posi)
length=posj - posi - o1->patlen; /* - o2->patlen : suppressed by <EC> */
if (posj < posi)
length= posj + apatseq->seqlen - posi - o1->patlen;
if (length &&
(!lmin || (length >= lmin)) &&
(!lmax || (length <= lmax)))
printRepeat(seq,o1,o2c,'D',kingdom_mode,posi,posj,erri,errj,taxonomy);
//printf("%s\tD\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o1Hits,o2cHits,posi,posj,scname);
}
}
}
}
}
@ -506,26 +541,46 @@ int main(int argc, char **argv)
else
length= apatseq->seqlen - begin;
if (circular)
{
begin = 0;
length=apatseq->seqlen+circular;
}
o1cHits = ManberAll(apatseq,o1c,3,begin,length);
if (o1cHits)
for (i=0; i < o2Hits;i++)
{
posi = apatseq->hitpos[2]->val[i];
if (posi < apatseq->seqlen)
{
erri = apatseq->hiterr[2]->val[i];
for (j=0; j < o1cHits; j++)
{
posj=apatseq->hitpos[3]->val[j] + o1c->patlen;
posj=apatseq->hitpos[3]->val[j];
if (posj < apatseq->seqlen)
{
posj+=o1c->patlen;
errj=apatseq->hiterr[3]->val[j];
length=posj - posi + 1 - o1->patlen - o2->patlen;
if ((!lmin || (length >= lmin)) &&
length = 0;
if (posj > posi)
length=posj - posi + 1 - o2->patlen; /* - o1->patlen : suppressed by <EC> */
if (posj < posi)
length= posj + apatseq->seqlen - posi - o1->patlen;
if (length &&
(!lmin || (length >= lmin)) &&
(!lmax || (length <= lmax)))
printRepeat(seq,o2,o1c,'R',kingdom_mode,posi,posj,erri,errj,taxonomy);
//printf("%s\tR\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o2Hits,o1cHits,posi,posj,scname);
}
}
}
}
}
} /* End of taxonomic selection */

View File

@ -103,6 +103,7 @@ typedef struct { /* sequence */
Int32 seqlen; /* sequence length */
Int32 seqsiz; /* sequence buffer size */
Int32 datsiz; /* data buffer size */
Int32 circular;
UInt8 *data; /* data buffer */
char *cseq; /* sequence buffer */
StackiPtr hitpos[MAX_PATTERN]; /* stack of hit pos. */

View File

@ -251,7 +251,7 @@ int32_t delete_apatseq(SeqPtr pseq);
PatternPtr buildPattern(const char *pat, int32_t error_max);
PatternPtr complementPattern(PatternPtr pat);
SeqPtr ecoseq2apatseq(ecoseq_t *in,SeqPtr out);
SeqPtr ecoseq2apatseq(ecoseq_t *in,SeqPtr out,int32_t circular);
char *ecoComplementPattern(char *nucAcSeq);
char *ecoComplementSequence(char *nucAcSeq);

View File

@ -55,6 +55,9 @@ void EncodeSequence(SeqPtr seq)
cseq++;
}
for (i=0,cseq=seq->cseq;i < seq->circular; i++,cseq++)
*data++ = (IS_UPPER(*cseq) ? *cseq - 'A' : 0x0);
for (i = 0 ; i < MAX_PATTERN ; i++)
seq->hitpos[i]->top = seq->hiterr[i]->top = 0;
@ -63,7 +66,7 @@ void EncodeSequence(SeqPtr seq)
#undef IS_UPPER
SeqPtr ecoseq2apatseq(ecoseq_t *in,SeqPtr out)
SeqPtr ecoseq2apatseq(ecoseq_t *in,SeqPtr out,int32_t circular)
{
int i;
@ -83,20 +86,22 @@ SeqPtr ecoseq2apatseq(ecoseq_t *in,SeqPtr out)
}
}
out->name = in->AC;
out->seqsiz = out->seqlen = in->SQ_length;
out->circular = circular;
if (!out->data)
{
out->data = ECOMALLOC(out->seqlen *sizeof(UInt8),
out->data = ECOMALLOC((out->seqlen+circular) *sizeof(UInt8),
"Error in Allocation of a new Seq data member");
out->datsiz= out->seqlen;
out->datsiz= out->seqlen+circular;
}
else if (out->seqlen >= out->datsiz)
else if ((out->seqlen +circular) >= out->datsiz)
{
out->data = ECOREALLOC(out->data,out->seqlen,
out->data = ECOREALLOC(out->data,(out->seqlen+circular),
"Error during Seq data buffer realloc");
out->datsiz= out->seqlen;
out->datsiz= out->seqlen+circular;
}
out->cseq = in->SQ;

View File

@ -109,6 +109,8 @@ char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end)
static int32_t buffSize= 0;
int32_t length;
if (begin < end)
{
length = end - begin;
if (length >= buffSize)
@ -125,6 +127,26 @@ char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end)
strncpy(buffer,nucAcSeq + begin,length);
buffer[length]=0;
}
else
{
length = end + strlen(nucAcSeq) - begin;
if (length >= buffSize)
{
buffSize = length+1;
if (buffer)
buffer=ECOREALLOC(buffer,buffSize,
"Error in reallocating sub sequence buffer");
else
buffer=ECOMALLOC(buffSize,
"Error in allocating sub sequence buffer");
}
strncpy(buffer,nucAcSeq+begin,length - end);
strncpy(buffer+(length-end),nucAcSeq ,end);
buffer[length]=0;
}
return buffer;
}