mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
381 lines
8.7 KiB
C
Executable File
381 lines
8.7 KiB
C
Executable File
#include <string.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include "libstki.h"
|
|
#include "apat.h"
|
|
|
|
#include "obiapat.h"
|
|
|
|
static void EncodeSequence(SeqPtr seq, const char *in);
|
|
static void UpperSequence(char *seq);
|
|
|
|
/*
|
|
* print the message given as argument and exit the program
|
|
* @param error error number
|
|
* @param message the text explaining what's going on
|
|
* @param filename the file source where the program failed
|
|
* @param linenumber the line where it has failed
|
|
* filename and linenumber are written at pre-processing
|
|
* time by a macro
|
|
*/
|
|
void* ecoError(int error,
|
|
const char* message,
|
|
const char * filename,
|
|
int linenumber,
|
|
int *errno,
|
|
char **error_msg)
|
|
{
|
|
*error_msg = malloc(1001);
|
|
snprintf(*error_msg,1000,
|
|
"Error %d in file %s line %d : %s",
|
|
error,
|
|
filename,
|
|
linenumber,
|
|
message);
|
|
|
|
*errno = error;
|
|
return NULL;
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
* @doc: DNA alphabet (IUPAC)
|
|
*/
|
|
#define LX_BIO_DNA_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZ#![]"
|
|
|
|
/*
|
|
* @doc: complementary DNA alphabet (IUPAC)
|
|
*/
|
|
#define LX_BIO_CDNA_ALPHA "TVGHEFCDIJMLKNOPQYSAABWXRZ#!]["
|
|
|
|
|
|
static char sNuc[] = LX_BIO_DNA_ALPHA;
|
|
static char sAnuc[] = LX_BIO_CDNA_ALPHA;
|
|
|
|
static char LXBioBaseComplement(char nucAc);
|
|
static char *LXBioSeqComplement(char *nucAcSeq);
|
|
static char *reverseSequence(char *str,char isPattern);
|
|
|
|
|
|
/* ---------------------------- */
|
|
|
|
char LXBioBaseComplement(char nucAc)
|
|
{
|
|
char *c;
|
|
|
|
if ((c = strchr(sNuc, nucAc)))
|
|
return sAnuc[(c - sNuc)];
|
|
else
|
|
return nucAc;
|
|
}
|
|
|
|
/* ---------------------------- */
|
|
|
|
char *LXBioSeqComplement(char *nucAcSeq)
|
|
{
|
|
char *s;
|
|
|
|
for (s = nucAcSeq ; *s ; s++)
|
|
*s = LXBioBaseComplement(*s);
|
|
|
|
return nucAcSeq;
|
|
}
|
|
|
|
|
|
char *reverseSequence(char *str,char isPattern)
|
|
{
|
|
char *sb, *se, c;
|
|
|
|
if (! str)
|
|
return str;
|
|
|
|
sb = str;
|
|
se = str + strlen(str) - 1;
|
|
|
|
while(sb <= se) {
|
|
c = *sb;
|
|
*sb++ = *se;
|
|
*se-- = c;
|
|
}
|
|
|
|
sb = str;
|
|
se = str + strlen(str) - 1;
|
|
|
|
if (isPattern)
|
|
for (;sb <= se; sb++)
|
|
{
|
|
if (*sb=='#')
|
|
{
|
|
if (*(sb+1) == '[') {
|
|
while(*sb !=']') {
|
|
*sb = *(sb+1);
|
|
sb++;
|
|
}
|
|
*sb='#';
|
|
} else {
|
|
if (((se - sb) > 2) && (*(sb+2)=='!'))
|
|
{
|
|
*sb='!';
|
|
sb+=2;
|
|
*sb='#';
|
|
}
|
|
else
|
|
{
|
|
*sb=*(sb+1);
|
|
sb++;
|
|
*sb='#';
|
|
}}
|
|
}
|
|
else if (*sb=='!')
|
|
{
|
|
*sb=*(sb-1);
|
|
*(sb-1)='!';
|
|
}
|
|
}
|
|
|
|
return str;
|
|
}
|
|
|
|
/* -------------------------------------------- */
|
|
/* lowercase sequence */
|
|
/* -------------------------------------------- */
|
|
|
|
#define IS_UPPER(c) (((c) >= 'A') && ((c) <= 'A'))
|
|
#define TO_LOWER(c) ((c) - 'A' + 'a')
|
|
|
|
char *LowerSequence(char *seq)
|
|
{
|
|
char *cseq;
|
|
|
|
for (cseq = seq ; *cseq ; cseq++)
|
|
if (IS_UPPER(*cseq)) {
|
|
*cseq = TO_LOWER(*cseq);
|
|
}
|
|
return seq;
|
|
}
|
|
|
|
#undef IS_UPPER
|
|
#undef TO_LOWER
|
|
|
|
|
|
char *ecoComplementPattern(char *nucAcSeq)
|
|
{
|
|
return reverseSequence(LXBioSeqComplement(nucAcSeq),1);
|
|
}
|
|
|
|
|
|
/* -------------------------------------------- */
|
|
/* uppercase sequence */
|
|
/* -------------------------------------------- */
|
|
|
|
#define IS_LOWER(c) (((c) >= 'a') && ((c) <= 'z'))
|
|
#define TO_UPPER(c) ((c) - 'a' + 'A')
|
|
|
|
void UpperSequence(char *seq)
|
|
{
|
|
char *cseq;
|
|
|
|
for (cseq = seq ; *cseq ; cseq++)
|
|
if (IS_LOWER(*cseq))
|
|
*cseq = TO_UPPER(*cseq);
|
|
}
|
|
|
|
#undef IS_LOWER
|
|
#undef TO_UPPER
|
|
|
|
|
|
|
|
|
|
|
|
/* -------------------------------------------- */
|
|
/* encode sequence */
|
|
/* IS_UPPER is slightly faster than isupper */
|
|
/* -------------------------------------------- */
|
|
|
|
#define IS_UPPER(c) (((c) >= 'A') && ((c) <= 'Z'))
|
|
#define IS_LOWER(c) (((c) >= 'a') && ((c) <= 'z'))
|
|
|
|
|
|
|
|
void EncodeSequence(SeqPtr seq, const char *in)
|
|
{
|
|
int i;
|
|
uint8_t *data;
|
|
const char *cseq;
|
|
char nuc;
|
|
|
|
data = seq->data;
|
|
|
|
for (i=0,cseq=in; i < seq->seqlen; i++,cseq++,data++) {
|
|
nuc = *cseq;
|
|
*data = (IS_LOWER(nuc) ? nuc - 'a' : 0x0);
|
|
}
|
|
|
|
for (i=0,cseq=in; i < seq->circular; i++,cseq++,data++) {
|
|
nuc = *cseq;
|
|
*data = (IS_LOWER(nuc) ? nuc - 'a' : 0x0);
|
|
}
|
|
|
|
for (i = 0 ; i < MAX_PATTERN ; i++)
|
|
seq->hitpos[i]->top = seq->hiterr[i]->top = 0;
|
|
|
|
}
|
|
|
|
#undef IS_UPPER
|
|
|
|
|
|
SeqPtr new_apatseq(const char *in,int32_t circular, int32_t seqlen,
|
|
SeqPtr out,
|
|
int *errno, char **errmsg)
|
|
{
|
|
// fprintf(stderr,">>>>>>>> new_apatseq\n");
|
|
int i;
|
|
|
|
if (circular != 0) circular=MAX_PAT_LEN;
|
|
|
|
if (!out)
|
|
{
|
|
out = ECOMALLOC(sizeof(Seq),
|
|
"Error in Allocation of a new Seq structure",errno,errmsg);
|
|
|
|
for (i = 0 ; i < MAX_PATTERN ; i++)
|
|
{
|
|
|
|
if (! (out->hitpos[i] = NewStacki(kMinStackiSize)))
|
|
ECOERROR(ECO_MEM_ERROR,"Error in hit stack Allocation",errno,errmsg);
|
|
|
|
if (! (out->hiterr[i] = NewStacki(kMinStackiSize)))
|
|
ECOERROR(ECO_MEM_ERROR,"Error in error stack Allocation",errno,errmsg);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
if (!out->data)
|
|
{
|
|
out->data = ECOMALLOC((seqlen+circular) *sizeof(uint8_t),
|
|
"Error in Allocation of a new Seq data member",
|
|
errno,errmsg);
|
|
out->datsiz= seqlen+circular;
|
|
}
|
|
else if ((seqlen +circular) >= out->datsiz)
|
|
{
|
|
out->data = ECOREALLOC(out->data,(seqlen+circular) *sizeof(uint8_t),
|
|
"Error during Seq data buffer realloc",
|
|
errno,errmsg);
|
|
out->datsiz= seqlen+circular;
|
|
}
|
|
|
|
out->circular = circular;
|
|
out->seqlen = seqlen;
|
|
EncodeSequence(out,in);
|
|
// fprintf(stderr,">>>>>>>> Encodage ok\n");
|
|
return out;
|
|
}
|
|
|
|
int32_t delete_apatseq(SeqPtr pseq,
|
|
int *errno, char **errmsg)
|
|
{
|
|
int i;
|
|
|
|
if (pseq) {
|
|
|
|
if (pseq->data)
|
|
ECOFREE(pseq->data,"Freeing sequence data buffer",
|
|
errno,errmsg);
|
|
|
|
for (i = 0 ; i < MAX_PATTERN ; i++) {
|
|
if (pseq->hitpos[i]) FreeStacki(pseq->hitpos[i]);
|
|
if (pseq->hiterr[i]) FreeStacki(pseq->hiterr[i]);
|
|
}
|
|
|
|
ECOFREE(pseq,"Freeing apat sequence structure",
|
|
errno,errmsg);
|
|
|
|
return 0;
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
|
|
Pattern *buildPattern(const char *pat, int32_t error_max, uint8_t hasIndel,
|
|
int *errno, char **errmsg)
|
|
{
|
|
Pattern *pattern;
|
|
int32_t patlen;
|
|
int32_t patlen2;
|
|
|
|
patlen = (int32_t)strlen(pat);
|
|
patlen2 = lenPattern(pat);
|
|
|
|
pattern = ECOMALLOC(sizeof(Pattern) + // Space for struct Pattern
|
|
sizeof(char)*patlen+1 + // Space for cpat
|
|
sizeof(uint32_t) * patlen2 + // Space for patcode
|
|
sizeof(patword_t) * ALPHA_LEN , // Space for smat
|
|
"Error in pattern allocation",
|
|
errno,errmsg);
|
|
|
|
pattern->ok = true;
|
|
pattern->hasIndel= hasIndel;
|
|
pattern->maxerr = error_max;
|
|
|
|
pattern->cpat = (char*)pattern + sizeof(Pattern);
|
|
pattern->patcode = (uint32_t*)(pattern->cpat + patlen + 1);
|
|
pattern->smat = (patword_t*)(pattern->patcode + patlen2);
|
|
|
|
strncpy(pattern->cpat,pat,patlen);
|
|
pattern->cpat[patlen]=0;
|
|
UpperSequence(pattern->cpat);
|
|
|
|
if (!CheckPattern(pattern))
|
|
ECOERROR(ECO_ASSERT_ERROR,"Error in pattern checking",errno,errmsg);
|
|
|
|
if (! EncodePattern(pattern, dna))
|
|
ECOERROR(ECO_ASSERT_ERROR,"Error in pattern encoding",errno,errmsg);
|
|
|
|
if (! CreateS(pattern, ALPHA_LEN))
|
|
ECOERROR(ECO_ASSERT_ERROR,"Error in pattern compiling",errno,errmsg);
|
|
|
|
return pattern;
|
|
|
|
}
|
|
|
|
Pattern *complementPattern(Pattern *pat, int *errno,
|
|
char **errmsg)
|
|
{
|
|
Pattern *pattern;
|
|
|
|
pattern = ECOMALLOC(sizeof(Pattern) +
|
|
sizeof(char) * strlen(pat->cpat) + 1 +
|
|
sizeof(uint32_t) * pat->patlen +
|
|
sizeof(patword_t) * ALPHA_LEN,
|
|
"Error in pattern allocation",
|
|
errno,errmsg);
|
|
|
|
pattern->ok = true;
|
|
pattern->hasIndel= pat->hasIndel;
|
|
pattern->maxerr = pat->maxerr;
|
|
pattern->patlen = pat->patlen;
|
|
|
|
pattern->cpat = (char*)pattern + sizeof(Pattern);
|
|
pattern->patcode = (uint32_t*)(pattern->cpat + strlen(pat->cpat) + 1);
|
|
pattern->smat = (patword_t*)(pattern->patcode + pat->patlen);
|
|
|
|
strcpy(pattern->cpat,pat->cpat);
|
|
|
|
ecoComplementPattern(pattern->cpat);
|
|
|
|
if (!CheckPattern(pattern))
|
|
ECOERROR(ECO_ASSERT_ERROR,"Error in pattern checking",errno,errmsg);
|
|
|
|
if (! EncodePattern(pattern, dna))
|
|
ECOERROR(ECO_ASSERT_ERROR,"Error in pattern encoding",errno,errmsg);
|
|
|
|
if (! CreateS(pattern, ALPHA_LEN))
|
|
ECOERROR(ECO_ASSERT_ERROR,"Error in pattern compiling",errno,errmsg);
|
|
|
|
return pattern;
|
|
|
|
}
|