From d863b7e48e456dc1b7d130779d671c7596ec5615 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Mon, 28 Apr 2008 15:49:12 +0000 Subject: [PATCH] Manage sequence circularity git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPCR/trunk@166 60f365c0-8329-0410-b2a4-ec073aeeaa1d --- src/ecogrep.c | 4 +- src/ecopcr.c | 107 ++++++++++++++++++++++++++++++---------- src/libapat/apat.h | 1 + src/libecoPCR/ecoPCR.h | 2 +- src/libecoPCR/ecoapat.c | 21 +++++--- src/libecoPCR/ecodna.c | 50 +++++++++++++------ 6 files changed, 134 insertions(+), 51 deletions(-) diff --git a/src/ecogrep.c b/src/ecogrep.c index 5b47b87..bf32500 100644 --- a/src/ecogrep.c +++ b/src/ecogrep.c @@ -64,7 +64,7 @@ int ispatternmatching(ecoseq_t *seq, PatternPtr pattern){ if (pattern != NULL) { SeqPtr apatseq = NULL; - apatseq=ecoseq2apatseq(seq,apatseq); + apatseq=ecoseq2apatseq(seq,apatseq,0); return ManberAll(apatseq,pattern,0,0,apatseq->seqlen) > 0; } else return 0; @@ -400,4 +400,4 @@ int main(int argc, char **argv){ ECOFREE(restricted_taxid,"Error in free stream"); return 0; -} \ No newline at end of file +} diff --git a/src/ecopcr.c b/src/ecopcr.c index 9d8a948..cf4c9de 100644 --- a/src/ecopcr.c +++ b/src/ecopcr.c @@ -131,6 +131,7 @@ void printRepeat(ecoseq_t *seq, AC = seq->AC; seqlength = seq->SQ_length; + main_taxon = &taxonomy->taxons->taxon[seq->taxid]; taxid = main_taxon->taxid; scientificName= main_taxon->name; @@ -184,10 +185,9 @@ void printRepeat(ecoseq_t *seq, superkingdom_name = "###"; } - amplength = pos2-pos1; amplifia = getSubSequence(seq->SQ,pos1,pos2); - + amplength= strlen(amplifia); if (strand=='R') { @@ -297,9 +297,10 @@ int main(int argc, char **argv) int32_t *ignored_taxid = NULL; int32_t r=0; int32_t g=0; + int32_t circular=0; - while ((carg = getopt(argc, argv, "hd:l:L:e:i:r:k")) != -1) { + while ((carg = getopt(argc, argv, "hcd:l:L:e:i:r:k")) != -1) { switch (carg) { /* -------------------- */ @@ -359,6 +360,11 @@ int main(int argc, char **argv) break; /* -------------------- */ + case 'c': /* stores the taxonomic id to ignore */ + /* --------------------------------- */ + circular = 1; + break; + case '?': /* bad option */ /* -------------------- */ errflag++; @@ -379,6 +385,11 @@ int main(int argc, char **argv) oligo2 = ECOMALLOC(strlen(argv[optind])+1, "Error on oligo1 allocation"); strcpy(oligo2,argv[optind]); + + if (circular) + circular = strlen(oligo1); + if (strlen(oligo2)>(size_t)circular) + circular = strlen(oligo2); } else errflag++; @@ -420,6 +431,10 @@ int main(int argc, char **argv) printf("# output in kingdom mode\n"); else printf("# output in superkingdom mode\n"); + if (circular) + printf("# DB sequences are considered as circular\n"); + else + printf("# DB sequences are considered as linear\n"); printf("#\n"); taxonomy = read_taxonomy(prefix,0); @@ -458,9 +473,9 @@ int main(int argc, char **argv) strncpy(tail,seq->SQ+seq->SQ_length-10,10); tail[10]=0; - apatseq=ecoseq2apatseq(seq,apatseq); + apatseq=ecoseq2apatseq(seq,apatseq,circular); - o1Hits = ManberAll(apatseq,o1,0,0,apatseq->seqlen); + o1Hits = ManberAll(apatseq,o1,0,0,apatseq->seqlen+apatseq->circular); o2cHits= 0; if (o1Hits) @@ -472,24 +487,44 @@ int main(int argc, char **argv) length= stktmp->val[stktmp->top-1] + o1->patlen - begin + lmax + o2->patlen; else length= apatseq->seqlen - begin; - + + if (circular) + { + begin = 0; + length=apatseq->seqlen+circular; + } o2cHits = ManberAll(apatseq,o2c,1,begin,length); if (o2cHits) for (i=0; i < o1Hits;i++) { posi = apatseq->hitpos[0]->val[i]; - erri = apatseq->hiterr[0]->val[i]; - for (j=0; j < o2cHits; j++) + + if (posi < apatseq->seqlen) { - posj =apatseq->hitpos[1]->val[j] + o2c->patlen; - errj =apatseq->hiterr[1]->val[j]; - length=posj - posi + 1 - o1->patlen - o2->patlen; - - if ((!lmin || (length >= lmin)) && - (!lmax || (length <= lmax))) - printRepeat(seq,o1,o2c,'D',kingdom_mode,posi,posj,erri,errj,taxonomy); - //printf("%s\tD\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o1Hits,o2cHits,posi,posj,scname); + erri = apatseq->hiterr[0]->val[i]; + for (j=0; j < o2cHits; j++) + { + posj =apatseq->hitpos[1]->val[j]; + + if (posj < apatseq->seqlen) + { + posj+=o2c->patlen; + // printf("coucou %d %d %d\n",posi,posj,apatseq->seqlen); + errj =apatseq->hiterr[1]->val[j]; + length = 0; + if (posj > posi) + length=posj - posi - o1->patlen; /* - o2->patlen : suppress by */ + if (posj < posi) + length= posj + apatseq->seqlen - posi - o1->patlen; + if (length && + (!lmin || (length >= lmin)) && + (!lmax || (length <= lmax))) + printRepeat(seq,o1,o2c,'D',kingdom_mode,posi,posj,erri,errj,taxonomy); + //printf("%s\tD\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o1Hits,o2cHits,posi,posj,scname); + + } + } } } } @@ -506,23 +541,43 @@ int main(int argc, char **argv) else length= apatseq->seqlen - begin; + if (circular) + { + begin = 0; + length=apatseq->seqlen+circular; + } + o1cHits = ManberAll(apatseq,o1c,3,begin,length); if (o1cHits) for (i=0; i < o2Hits;i++) { posi = apatseq->hitpos[2]->val[i]; - erri = apatseq->hiterr[2]->val[i]; - for (j=0; j < o1cHits; j++) + + if (posi < apatseq->seqlen) { - posj=apatseq->hitpos[3]->val[j] + o1c->patlen; - errj=apatseq->hiterr[3]->val[j]; - length=posj - posi + 1 - o1->patlen - o2->patlen; - - if ((!lmin || (length >= lmin)) && - (!lmax || (length <= lmax))) - printRepeat(seq,o2,o1c,'R',kingdom_mode,posi,posj,erri,errj,taxonomy); - //printf("%s\tR\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o2Hits,o1cHits,posi,posj,scname); + erri = apatseq->hiterr[2]->val[i]; + for (j=0; j < o1cHits; j++) + { + posj=apatseq->hitpos[3]->val[j]; + if (posj < apatseq->seqlen) + { + posj+=o1c->patlen; + errj=apatseq->hiterr[3]->val[j]; + + length = 0; + if (posj > posi) + length=posj - posi + 1 - o2->patlen; /* - o1->patlen : suppress by */ + if (posj < posi) + length= posj + apatseq->seqlen - posi - o1->patlen; + + if (length && + (!lmin || (length >= lmin)) && + (!lmax || (length <= lmax))) + printRepeat(seq,o2,o1c,'R',kingdom_mode,posi,posj,erri,errj,taxonomy); + //printf("%s\tR\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o2Hits,o1cHits,posi,posj,scname); + } + } } } } diff --git a/src/libapat/apat.h b/src/libapat/apat.h index 230276c..eaa06df 100644 --- a/src/libapat/apat.h +++ b/src/libapat/apat.h @@ -103,6 +103,7 @@ typedef struct { /* sequence */ Int32 seqlen; /* sequence length */ Int32 seqsiz; /* sequence buffer size */ Int32 datsiz; /* data buffer size */ + Int32 circular; UInt8 *data; /* data buffer */ char *cseq; /* sequence buffer */ StackiPtr hitpos[MAX_PATTERN]; /* stack of hit pos. */ diff --git a/src/libecoPCR/ecoPCR.h b/src/libecoPCR/ecoPCR.h index 4c26fc0..3e13b18 100644 --- a/src/libecoPCR/ecoPCR.h +++ b/src/libecoPCR/ecoPCR.h @@ -251,7 +251,7 @@ int32_t delete_apatseq(SeqPtr pseq); PatternPtr buildPattern(const char *pat, int32_t error_max); PatternPtr complementPattern(PatternPtr pat); -SeqPtr ecoseq2apatseq(ecoseq_t *in,SeqPtr out); +SeqPtr ecoseq2apatseq(ecoseq_t *in,SeqPtr out,int32_t circular); char *ecoComplementPattern(char *nucAcSeq); char *ecoComplementSequence(char *nucAcSeq); diff --git a/src/libecoPCR/ecoapat.c b/src/libecoPCR/ecoapat.c index 05d8bf4..a652b1e 100644 --- a/src/libecoPCR/ecoapat.c +++ b/src/libecoPCR/ecoapat.c @@ -54,6 +54,9 @@ void EncodeSequence(SeqPtr seq) cseq++; } + + for (i=0,cseq=seq->cseq;i < seq->circular; i++,cseq++) + *data++ = (IS_UPPER(*cseq) ? *cseq - 'A' : 0x0); for (i = 0 ; i < MAX_PATTERN ; i++) seq->hitpos[i]->top = seq->hiterr[i]->top = 0; @@ -63,7 +66,7 @@ void EncodeSequence(SeqPtr seq) #undef IS_UPPER -SeqPtr ecoseq2apatseq(ecoseq_t *in,SeqPtr out) +SeqPtr ecoseq2apatseq(ecoseq_t *in,SeqPtr out,int32_t circular) { int i; @@ -83,20 +86,22 @@ SeqPtr ecoseq2apatseq(ecoseq_t *in,SeqPtr out) } } + out->name = in->AC; out->seqsiz = out->seqlen = in->SQ_length; - + out->circular = circular; + if (!out->data) { - out->data = ECOMALLOC(out->seqlen *sizeof(UInt8), + out->data = ECOMALLOC((out->seqlen+circular) *sizeof(UInt8), "Error in Allocation of a new Seq data member"); - out->datsiz= out->seqlen; + out->datsiz= out->seqlen+circular; } - else if (out->seqlen >= out->datsiz) + else if ((out->seqlen +circular) >= out->datsiz) { - out->data = ECOREALLOC(out->data,out->seqlen, + out->data = ECOREALLOC(out->data,(out->seqlen+circular), "Error during Seq data buffer realloc"); - out->datsiz= out->seqlen; + out->datsiz= out->seqlen+circular; } out->cseq = in->SQ; @@ -191,4 +196,4 @@ PatternPtr complementPattern(PatternPtr pat) return pattern; -} \ No newline at end of file +} diff --git a/src/libecoPCR/ecodna.c b/src/libecoPCR/ecodna.c index 8fc420e..7d29a0e 100644 --- a/src/libecoPCR/ecodna.c +++ b/src/libecoPCR/ecodna.c @@ -109,22 +109,44 @@ char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end) static int32_t buffSize= 0; int32_t length; - length = end - begin; - - if (length >= buffSize) + if (begin < end) { - buffSize = length+1; - if (buffer) - buffer=ECOREALLOC(buffer,buffSize, - "Error in reallocating sub sequence buffer"); - else - buffer=ECOMALLOC(buffSize, - "Error in allocating sub sequence buffer"); - + length = end - begin; + + if (length >= buffSize) + { + buffSize = length+1; + if (buffer) + buffer=ECOREALLOC(buffer,buffSize, + "Error in reallocating sub sequence buffer"); + else + buffer=ECOMALLOC(buffSize, + "Error in allocating sub sequence buffer"); + + } + + strncpy(buffer,nucAcSeq + begin,length); + buffer[length]=0; + } + else + { + length = end + strlen(nucAcSeq) - begin; + + if (length >= buffSize) + { + buffSize = length+1; + if (buffer) + buffer=ECOREALLOC(buffer,buffSize, + "Error in reallocating sub sequence buffer"); + else + buffer=ECOMALLOC(buffSize, + "Error in allocating sub sequence buffer"); + + } + strncpy(buffer,nucAcSeq+begin,length - end); + strncpy(buffer+(length-end),nucAcSeq ,end); + buffer[length]=0; } - - strncpy(buffer,nucAcSeq + begin,length); - buffer[length]=0; return buffer; }