First commit

This commit is contained in:
2022-01-13 23:27:39 +01:00
parent dab6549cad
commit f53bf1b804
93 changed files with 11042 additions and 0 deletions

View File

@ -0,0 +1,14 @@
/* ----------------------------------------------- */
/* dft_pat_seq_code.h */
/* default alphabet encoding for alpha */
/* ----------------------------------------------- */
0x00000001 /* A */, 0x00000002 /* B */, 0x00000004 /* C */,
0x00000008 /* D */, 0x00000010 /* E */, 0x00000020 /* F */,
0x00000040 /* G */, 0x00000080 /* H */, 0x00000100 /* I */,
0x00000200 /* J */, 0x00000400 /* K */, 0x00000800 /* L */,
0x00001000 /* M */, 0x00002000 /* N */, 0x00004000 /* O */,
0x00008000 /* P */, 0x00010000 /* Q */, 0x00020000 /* R */,
0x00040000 /* S */, 0x00080000 /* T */, 0x00100000 /* U */,
0x00200000 /* V */, 0x00400000 /* W */, 0x00800000 /* X */,
0x01000000 /* Y */, 0x02000000 /* Z */

View File

@ -0,0 +1,71 @@
/* ----------------------------------------------- */
/* dna_code.h */
/* alphabet encoding for dna/rna */
/* ----------------------------------------- */
/* IUPAC encoding */
/* ----------------------------------------- */
/* G/A/T/C */
/* U=T */
/* R=AG */
/* Y=CT */
/* M=AC */
/* K=GT */
/* S=CG */
/* W=AT */
/* H=ACT */
/* B=CGT */
/* V=ACG */
/* D=AGT */
/* N=ACGT */
/* X=ACGT */
/* EFIJLOPQZ not recognized */
/* ----------------------------------------- */
/* dual encoding */
/* ----------------------------------------- */
/* A=ADHMNRVW */
/* B=BCDGHKMNRSTUVWY */
/* C=BCHMNSVY */
/* D=ABDGHKMNRSTUVWY */
/* G=BDGKNRSV */
/* H=ABCDHKMNRSTUVWY */
/* K=BDGHKNRSTUVWY */
/* M=ABCDHMNRSVWY */
/* N=ABCDGHKMNRSTUVWY */
/* R=ABDGHKMNRSVW */
/* S=BCDGHKMNRSVY */
/* T=BDHKNTUWY */
/* U=BDHKNTUWY */
/* V=ABCDGHKMNRSVWY */
/* W=ABDHKMNRTUVWY */
/* X=ABCDGHKMNRSTUVWY */
/* Y=BCDHKMNSTUVWY */
/* EFIJLOPQZ not recognized */
/* ----------------------------------------------- */
#ifndef USE_DUAL
/* IUPAC */
0x00000001 /* A */, 0x00080044 /* B */, 0x00000004 /* C */,
0x00080041 /* D */, 0x00000000 /* E */, 0x00000000 /* F */,
0x00000040 /* G */, 0x00080005 /* H */, 0x00000000 /* I */,
0x00000000 /* J */, 0x00080040 /* K */, 0x00000000 /* L */,
0x00000005 /* M */, 0x00080045 /* N */, 0x00000000 /* O */,
0x00000000 /* P */, 0x00000000 /* Q */, 0x00000041 /* R */,
0x00000044 /* S */, 0x00080000 /* T */, 0x00080000 /* U */,
0x00000045 /* V */, 0x00080001 /* W */, 0x00080045 /* X */,
0x00080004 /* Y */, 0x00000000 /* Z */
#else
/* DUAL */
0x00623089 /* A */, 0x017e34ce /* B */, 0x01243086 /* C */,
0x017e34cb /* D */, 0x00000000 /* E */, 0x00000000 /* F */,
0x0026244a /* G */, 0x017e348f /* H */, 0x00000000 /* I */,
0x00000000 /* J */, 0x017e24ca /* K */, 0x00000000 /* L */,
0x0166308f /* M */, 0x017e34cf /* N */, 0x00000000 /* O */,
0x00000000 /* P */, 0x00000000 /* Q */, 0x006634cb /* R */,
0x012634ce /* S */, 0x0158248a /* T */, 0x0158248a /* U */,
0x016634cf /* V */, 0x017a348b /* W */, 0x017e34cf /* X */,
0x017c348e /* Y */, 0x00000000 /* Z */
#endif

View File

@ -0,0 +1,51 @@
/* ----------------------------------------------- */
/* prot_code.h */
/* alphabet encoding for proteins */
/* ----------------------------------------- */
/* IUPAC encoding */
/* ----------------------------------------- */
/* B=DN */
/* Z=EQ */
/* X=any - {X} */
/* JOU not recognized */
/* ----------------------------------------- */
/* dual encoding */
/* ----------------------------------------- */
/* B=BDN */
/* D=BD */
/* E=EZ */
/* N=BN */
/* Q=QZ */
/* X=any - {X} */
/* Z=EQZ */
/* JOU not recognized */
/* ----------------------------------------------- */
#ifndef USE_DUAL
/* IUPAC */
0x00000001 /* A */, 0x00002008 /* B */, 0x00000004 /* C */,
0x00000008 /* D */, 0x00000010 /* E */, 0x00000020 /* F */,
0x00000040 /* G */, 0x00000080 /* H */, 0x00000100 /* I */,
0x00000000 /* J */, 0x00000400 /* K */, 0x00000800 /* L */,
0x00001000 /* M */, 0x00002000 /* N */, 0x00000000 /* O */,
0x00008000 /* P */, 0x00010000 /* Q */, 0x00020000 /* R */,
0x00040000 /* S */, 0x00080000 /* T */, 0x00000000 /* U */,
0x00200000 /* V */, 0x00400000 /* W */, 0x037fffff /* X */,
0x01000000 /* Y */, 0x00010010 /* Z */
#else
/* DUAL */
0x00000001 /* A */, 0x0000200a /* B */, 0x00000004 /* C */,
0x0000000a /* D */, 0x02000010 /* E */, 0x00000020 /* F */,
0x00000040 /* G */, 0x00000080 /* H */, 0x00000100 /* I */,
0x00000000 /* J */, 0x00000400 /* K */, 0x00000800 /* L */,
0x00001000 /* M */, 0x00002002 /* N */, 0x00000000 /* O */,
0x00008000 /* P */, 0x02010000 /* Q */, 0x00020000 /* R */,
0x00040000 /* S */, 0x00080000 /* T */, 0x00000000 /* U */,
0x00200000 /* V */, 0x00400000 /* W */, 0x037fffff /* X */,
0x01000000 /* Y */, 0x02010010 /* Z */
#endif

View File

@ -0,0 +1,24 @@
# C sources archived into the static library.
SOURCES = apat_parse.c \
apat_search.c \
libstki.c
SRCS=$(SOURCES)
# One object file per C source.
OBJECTS= $(patsubst %.c,%.o,$(SOURCES))
LIBFILE= libapat.a
RANLIB=ranlib
# Pulls in ../global.mk (its contents are not visible from this file).
include ../global.mk
all: $(LIBFILE)
# Removes the object files and the archive.
clean:
rm -rf $(OBJECTS) $(LIBFILE)
# $? expands to the objects newer than the archive; ranlib is then run on it.
$(LIBFILE): $(OBJECTS)
ar -cr $@ $?
$(RANLIB) $@

165
pkg/obiapat/apat.h Normal file
View File

@ -0,0 +1,165 @@
/* ==================================================== */
/* Copyright (c) Atelier de BioInformatique */
/* Dec. 94 */
/* File: apat.h */
/* Purpose: pattern scan */
/* History: */
/* 28/12/94 : <Gloup> ascan first version */
/* 14/05/99 : <Gloup> last revision */
/* 07/12/21 : <Zafacs> last some cleaning for 2020 */
/* ==================================================== */
#ifndef H_apat
#define H_apat
#include <stdio.h>
#include "libstki.h"
/* ----------------------------------------------- */
/* constantes */
/* ----------------------------------------------- */
#ifndef BUFSIZ
#define BUFSIZ 1024 /* io buffer size */
#endif
#define MAX_NAME_LEN BUFSIZ /* max length of sequence name */
#define ALPHA_LEN 26 /* alphabet length */
/* *DO NOT* modify */
#define MAX_PATTERN 1 /* max # of patterns */
/* *DO NOT* modify */
#define MAX_PAT_LEN 64 /* max pattern length */
/* *DO NOT* modify */
#define MAX_PAT_ERR 64 /* max # of errors */
/* *DO NOT* modify */
#define PATMASK 0x3ffffff /* mask for 26 symbols */
/* *DO NOT* modify */
#define OBLIBIT 0x4000000 /* bit 27 to 1 -> oblig. pos */
/* *DO NOT* modify */
/* mask for position */
#define ONEMASK 0x8000000000000000 /* mask for highest position */
/* masks for Levenshtein edit */
#define OPER_IDT 0x0000000000000000 /* identity */
#define OPER_INS 0x4000000000000000 /* insertion */
#define OPER_DEL 0x8000000000000000 /* deletion */
#define OPER_SUB 0xc000000000000000 /* substitution */
#define OPER_SHFT 30 /* <unused> shift */
/* Levenshtein Opcodes */
#define SOPER_IDT 0x0 /* identity */
#define SOPER_INS 0x1 /* insertion */
#define SOPER_DEL 0x2 /* deletion */
#define SOPER_SUB 0x3 /* substitution */
/* Levenshtein Opcodes masks */
#define OPERMASK 0xc000000000000000 /* mask for Opcodes /!\ */
#define NOPERMASK 0x3fffffffffffffff /* negate of previous /!\ */
/* special chars in pattern */
#define PATCHARS "[]!#"
/* 26 letter alphabet */
/* in alphabetical order */
#define ORD_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
/* protein alphabet */
#define PROT_ALPHA "ACDEFGHIKLMNPQRSTVWY"
/* dna/rna alphabet */
#define DNA_ALPHA "ABCDGHKMNRSTUVWXY"
/* ----------------------------------------------- */
/* data structures */
/* ----------------------------------------------- */
typedef uint64_t patword_t;
/* -------------------- */
/* Selects which letter-to-bitmask table is used to encode a pattern */
/* (this is the argument of GetCode() and EncodePattern()). */
typedef enum { /* data encoding */
/* -------------------- */
alpha = 0, /* [A-Z] */
dna, /* IUPAC DNA */
protein /* IUPAC proteins */
} CodType;
/* -------------------- */
/* A sequence to scan, together with one pair of hit stacks per pattern. */
typedef struct { /* sequence */
/* -------------------- */
char *name; /* sequence name */
int32_t seqlen; /* sequence length */
int32_t seqsiz; /* sequence buffer size */
int32_t datsiz; /* data buffer size */
int32_t circular; /* added to seqlen when the Manber* scans clamp their end position */
uint8_t *data; /* data buffer */
/* each data byte is used directly as an index into Pattern.smat */
char *cseq; /* sequence buffer */
StackiPtr hitpos[MAX_PATTERN]; /* stack of hit pos. */
StackiPtr hiterr[MAX_PATTERN]; /* stack of errors */
} Seq, *SeqPtr;
/* -------------------- */
/* A search pattern in textual and encoded form. */
/* patcode and smat are written by EncodePattern() and CreateS() but are */
/* not allocated by them: the caller must provide both buffers. */
typedef struct { /* pattern */
/* -------------------- */
int32_t patlen; /* pattern length */
int32_t maxerr; /* max # of errors */
char *cpat; /* pattern string */
uint32_t *patcode; /* encoded pattern */
/* one word per pattern position: symbol mask, plus OBLIBIT if obligatory */
patword_t *smat; /* S matrix */
/* one word per alphabet symbol: bit k set when the symbol matches position patlen-1-k */
patword_t omask; /* oblig. bits mask */
bool hasIndel; /* are indels allowed */
bool ok; /* is pattern ok */
} Pattern, *PatternPtr;
/* ----------------------------------------------- */
/* prototypes */
/* ----------------------------------------------- */
/* apat_seq.c */
SeqPtr FreeSequence (SeqPtr pseq);
SeqPtr NewSequence (void);
int32_t ReadNextSequence (SeqPtr pseq);
int32_t WriteSequence (FILE *filou , SeqPtr pseq);
/* apat_parse.c */
uint32_t *GetCode (CodType ctype);
int32_t CheckPattern (Pattern *ppat);
int32_t EncodePattern (Pattern *ppat, CodType ctype);
int32_t ReadPattern (Pattern *ppat);
void PrintDebugPattern (Pattern *ppat);
int lenPattern (const char *pat);
/* apat_search.c */
int32_t CreateS (Pattern *ppat, int32_t lalpha);
int32_t ManberNoErr (Seq *pseq , Pattern *ppat, int32_t patnum,int32_t begin,int32_t length);
int32_t ManberSub (Seq *pseq , Pattern *ppat, int32_t patnum,int32_t begin,int32_t length);
int32_t ManberIndel (Seq *pseq , Pattern *ppat, int32_t patnum,int32_t begin,int32_t length);
int32_t ManberAll (Seq *pseq , Pattern *ppat, int32_t patnum,int32_t begin,int32_t length);
int32_t NwsPatAlign (Seq *pseq , Pattern *ppat, int32_t nerr ,
int32_t *reslen , int32_t *reserr);
/* apat_sys.c */
float UserCpuTime (int32_t reset);
float SysCpuTime (int32_t reset);
char *StrCpuTime (int32_t reset);
void Erreur (char *msg , int32_t stat);
int32_t AccessFile (char *path, char *mode);
#endif /* H_apat */

15
pkg/obiapat/apat_mem.h Normal file
View File

@ -0,0 +1,15 @@
#ifndef __APAT_MEM_H__
#define __APAT_MEM_H__
/* ----------------------------------------------- */
/* macros */
/* ----------------------------------------------- */
/* Typed wrappers around malloc/realloc/free.                          */
/*   NEW(typ)               one object of type typ                     */
/*   NEWN(typ, dim)         array of dim objects of type typ           */
/*   REALLOC(typ, ptr, dim) resize ptr to dim objects of type typ      */
/*   FREE(ptr)              release ptr                                */
/* Every expansion is fully parenthesized so that the result can be    */
/* indexed or dereferenced directly (e.g. NEWN(int, n)[0]) and so that */
/* FREE() accepts any expression as its argument.                      */
/* The allocation macros return NULL on failure, like malloc/realloc.  */
#define NEW(typ) ((typ*)malloc(sizeof(typ)))
#define NEWN(typ, dim) ((typ*)malloc((uint64_t)(dim) * sizeof(typ)))
#define REALLOC(typ, ptr, dim) ((typ*)realloc((void *) (ptr), (uint64_t)(dim) * sizeof(typ)))
#define FREE(ptr) free((void *) (ptr))
#endif /* __APAT_MEM_H__ */

393
pkg/obiapat/apat_parse.c Normal file
View File

@ -0,0 +1,393 @@
/* ==================================================== */
/* Copyright (c) Atelier de BioInformatique */
/* Mar. 92 */
/* File: apat_parse.c */
/* Purpose: Codage du pattern */
/* History: */
/* 00/07/94 : <Gloup> first version (stanford) */
/* 00/11/94 : <Gloup> revised for DNA/PROTEIN */
/* 30/12/94 : <Gloup> modified EncodePattern */
/* for manber search */
/* 14/05/99 : <Gloup> indels added */
/* 07/12/21 : <Zafacs> some cleaning for 2020 */
/* ==================================================== */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "apat.h"
/* -------------------- */
/* default char */
/* encodings */
/* -------------------- */
/* Default encoding: one distinct bit per letter, A = bit 0 ... Z = bit 25. */
/* Entries are looked up with (letter - 'A'). */
static uint32_t sDftCode[] = {
0x00000001 /* A */, 0x00000002 /* B */, 0x00000004 /* C */,
0x00000008 /* D */, 0x00000010 /* E */, 0x00000020 /* F */,
0x00000040 /* G */, 0x00000080 /* H */, 0x00000100 /* I */,
0x00000200 /* J */, 0x00000400 /* K */, 0x00000800 /* L */,
0x00001000 /* M */, 0x00002000 /* N */, 0x00004000 /* O */,
0x00008000 /* P */, 0x00010000 /* Q */, 0x00020000 /* R */,
0x00040000 /* S */, 0x00080000 /* T */, 0x00100000 /* U */,
0x00200000 /* V */, 0x00400000 /* W */, 0x00800000 /* X */,
0x01000000 /* Y */, 0x02000000 /* Z */
};
/* -------------------- */
/* char encodings */
/* IUPAC */
/* -------------------- */
/* IUPAC Proteins */
/* Protein encoding: same bit positions as the default table, except that */
/* B = D|N, Z = E|Q, X = every bit but its own (bit 23), and J, O, U are 0. */
static uint32_t sProtCode[] = {
0x00000001 /* A */, 0x00002008 /* B */, 0x00000004 /* C */,
0x00000008 /* D */, 0x00000010 /* E */, 0x00000020 /* F */,
0x00000040 /* G */, 0x00000080 /* H */, 0x00000100 /* I */,
0x00000000 /* J */, 0x00000400 /* K */, 0x00000800 /* L */,
0x00001000 /* M */, 0x00002000 /* N */, 0x00000000 /* O */,
0x00008000 /* P */, 0x00010000 /* Q */, 0x00020000 /* R */,
0x00040000 /* S */, 0x00080000 /* T */, 0x00000000 /* U */,
0x00200000 /* V */, 0x00400000 /* W */, 0x037fffff /* X */,
0x01000000 /* Y */, 0x00010010 /* Z */
};
/* IUPAC Dna/Rna */
/* Nucleic encoding built from four base bits: A = 0x1, C = 0x4, G = 0x40, */
/* T = 0x80000 (U shares T's bit). Every ambiguity code is the OR of the */
/* bases it stands for; N and X both cover all four bases. */
/* E, F, I, J, L, O, P, Q and Z are 0 (they match nothing). */
static uint32_t sDnaCode[] = {
0x00000001 /* A */, 0x00080044 /* B */, 0x00000004 /* C */,
0x00080041 /* D */, 0x00000000 /* E */, 0x00000000 /* F */,
0x00000040 /* G */, 0x00080005 /* H */, 0x00000000 /* I */,
0x00000000 /* J */, 0x00080040 /* K */, 0x00000000 /* L */,
0x00000005 /* M */, 0x00080045 /* N */, 0x00000000 /* O */,
0x00000000 /* P */, 0x00000000 /* Q */, 0x00000041 /* R */,
0x00000044 /* S */, 0x00080000 /* T */, 0x00080000 /* U */,
0x00000045 /* V */, 0x00080001 /* W */, 0x00080045 /* X */,
0x00080004 /* Y */, 0x00000000 /* Z */
};
/* -------------------------------------------- */
/* Reads one line from stdin into buffer and    */
/* strips every trailing '\n' / '\r'.           */
/* fgets() is given size-1, so at most size-2   */
/* characters of the line are stored.           */
/* Returns buffer, or NULL when fgets() fails   */
/* (end of input or read error).                */
/* -------------------------------------------- */
static char *sGets(char *buffer, int size) {
    size_t len;

    if (fgets(buffer, size-1, stdin) == NULL)
        return NULL;

    /* walk back from the end while the last char is a line terminator */
    for (len = strlen(buffer) ; len > 0 ; len--) {
        char last = buffer[len - 1];
        if ((last != '\n') && (last != '\r'))
            break;
        buffer[len - 1] = '\000';
    }

    return buffer;
}
/* -------------------------------------------- */
/* Returns the letter-to-bitmask table for the  */
/* requested encoding: sDnaCode for dna,        */
/* sProtCode for protein, and sDftCode for any  */
/* other value (including alpha).               */
/* -------------------------------------------- */
uint32_t *GetCode(CodType ctype)
{
    if (ctype == dna)
        return sDnaCode;

    if (ctype == protein)
        return sProtCode;

    return sDftCode;
}
/* -------------------------------------------- */
/* Syntax check of the textual pattern ppat->cpat.                   */
/* Returns 1 when the pattern is well formed, 0 otherwise.           */
/* Accepted syntax, as enforced below:                               */
/*   - plain symbols are the letters 'A'..'Z' only                   */
/*   - '[' ... ']' groups cannot be nested, cannot start with ']',   */
/*     and must all be closed                                        */
/*   - '!' is only allowed outside brackets and must be followed by  */
/*     something other than the end of string or ']'                 */
/*   - '#' is only allowed outside brackets and never as first char  */
/* Letters are tested with an explicit range rather than isupper():  */
/* isupper() is undefined for negative char values and depends on    */
/* the locale, whereas the code tables hold exactly 26 entries.      */
#define BAD_IF(tst) if (tst) return 0

int CheckPattern(Pattern *ppat)
{
    int  lev;       /* bracket nesting level (0 or 1) */
    char *pat;

    pat = ppat->cpat;

    BAD_IF (*pat == '#');

    for (lev = 0; *pat ; pat++) {

        switch (*pat) {

        case '[' :
            BAD_IF (lev);
            BAD_IF (*(pat+1) == ']');
            lev++;
            break;

        case ']' :
            lev--;
            BAD_IF (lev);
            break;

        case '!' :
            BAD_IF (lev);
            BAD_IF (! *(pat+1));
            BAD_IF (*(pat+1) == ']');
            break;

        case '#' :
            BAD_IF (lev);
            /* safe: a leading '#' has already been rejected */
            BAD_IF (*(pat-1) == '[');
            break;

        default :
            if ((*pat < 'A') || (*pat > 'Z'))
                return 0;
            break;
        }
    }

    return (lev ? 0 : 1);
}

#undef BAD_IF
/* -------------------------------------------- */
/* If the character following *pat is the obligatory marker '#', */
/* returns a pointer to that marker; otherwise returns pat.      */
static const char *skipOblig(const char *pat)
{
    if (pat[1] == '#')
        return pat + 1;

    return pat;
}
/* -------------------------------------------- */
/* Returns a pointer to the LAST character of the pattern element  */
/* starting at pat: a single symbol, a '[...]' group, optionally   */
/* preceded by '!' and/or followed by '#'.                         */
/* Returns NULL when no element can be delimited: empty string,    */
/* unterminated '[' group, or '!' with nothing after it (the last  */
/* two cases would otherwise read past the string terminator).     */
static const char *splitPattern(const char *pat)
{
    switch (*pat) {

    case '\0' :
        return NULL;

    case '[' :
        for (; *pat; pat++)
            if (*pat == ']')
                return skipOblig(pat);
        return NULL;

    case '!' :
        if (pat[1] == '\0')
            return NULL;
        return splitPattern(pat+1);
    }

    return skipOblig(pat);
}
/* -------------------------------------------- */
/* Computes the symbol mask of ONE pattern element (NUL terminated). */
/*   '[' prefix : ignored, the letters that follow are OR-ed         */
/*   '!' prefix : complement of the rest, restricted to PATMASK      */
/*   otherwise  : OR of code[letter - 'A'] over the leading run of   */
/*                letters 'A'..'Z' (stops at ']', '#' or NUL)        */
/* Letters are tested with an explicit range rather than isupper():  */
/* isupper() is undefined for negative char values and depends on    */
/* the locale, and the index must stay inside the 26-entry table.    */
static uint32_t valPattern(char *pat, uint32_t *code)
{
    uint32_t val;

    if (*pat == '[')
        return valPattern(pat+1, code);

    if (*pat == '!') {
        val = valPattern(pat+1, code);
        return (~val & PATMASK);
    }

    val = 0x0;
    while ((*pat >= 'A') && (*pat <= 'Z')) {
        val |= code[*pat - 'A'];
        pat++;
    }

    return val;
}
/* -------------------------------------------- */
/* Returns OBLIBIT when the pattern element ends with '#',        */
/* 0 otherwise. An empty string yields 0 (the original read the   */
/* byte located before the string in that case).                  */
static uint32_t obliBitPattern(char *pat)
{
    size_t len = strlen(pat);

    if (len == 0)
        return 0x0;

    return (pat[len - 1] == '#' ? OBLIBIT : 0x0);
}
/* -------------------------------------------- */
/* Counts the elements (positions) of a textual pattern.          */
/* Each element is delimited with splitPattern(); the function    */
/* returns 0 as soon as one element cannot be delimited, and also */
/* for an empty string.                                           */
int lenPattern(const char *pat)
{
    const char *cur;
    int        count = 0;

    for (cur = pat ; *cur != '\0' ; cur++) {
        cur = splitPattern(cur);    /* last char of the current element */
        if (cur == NULL)
            return 0;
        count++;
    }

    return count;
}
/* -------------------------------------------- */
/* Interface */
/* -------------------------------------------- */
/* -------------------------------------------- */
/* Encodes a pattern                            */
/*                                              */
/* Fills ppat->patlen and ppat->patcode from    */
/* ppat->cpat using the table of ctype.         */
/* ppat->patcode is NOT allocated here: it must */
/* already point to at least patlen words.      */
/* Returns the pattern length, or 0 (with       */
/* ppat->ok == false) when it cannot be split.  */
/* -------------------------------------------- */
int EncodePattern(Pattern *ppat, CodType ctype)
{
    uint32_t *table;
    char     *start, *stop, saved;
    int      npos, count;

    ppat->ok = false;

    table = GetCode(ctype);

    count = lenPattern(ppat->cpat);
    ppat->patlen = count;

    if (count <= 0)
        return 0;

    npos = 0;

    for (start = ppat->cpat ; *start ; start = stop) {

        /* stop = first char after the current element */
        stop = (char *) splitPattern(start);
        stop++;

        /* temporarily cut the string there so that the   */
        /* helpers only see the current element           */
        saved = *stop;
        *stop = '\000';

        ppat->patcode[npos] = valPattern(start, table) | obliBitPattern(start);
        npos++;

        *stop = saved;
    }

    ppat->ok = true;

    return count;
}
/* -------------------------------------------- */
/* Removes every whitespace character from s,   */
/* in place, and returns s.                     */
/* The compacted string is re-terminated (the   */
/* original left the old tail in place).        */
/* -------------------------------------------- */
static char *RemBlanks(char *s)
{
    char *src, *dst;

    for (src = dst = s ; *src ; src++) {
        /* cast: isspace() is undefined for negative char values */
        if (! isspace((unsigned char) *src))
            *dst++ = *src;
    }

    *dst = '\000';

    return s;
}
/* -------------------------------------------- */
/* Returns the number of non-whitespace         */
/* characters in s.                             */
/* -------------------------------------------- */
static uint32_t CountAlpha(char *s)
{
    uint32_t n;

    for (n = 0 ; *s ; s++) {
        /* cast: isspace() is undefined for negative char values */
        if (! isspace((unsigned char) *s))
            n++;
    }

    return n;
}
/* -------------------------------------------- */
/* Reads a pattern from stdin                   */
/*   <pattern> #mis                             */
/* lines starting with '/' are comments         */
/*                                              */
/* Comment lines and lines without any          */
/* non-blank character are skipped (with a loop */
/* rather than one recursive call, and one      */
/* BUFSIZ stack buffer, per skipped line).      */
/* A negative #mis means indels are allowed;    */
/* maxerr receives its absolute value.          */
/* Returns:                                     */
/*   1 : pattern read, ppat->cpat allocated     */
/*   0 with ppat->ok == true  : no more input   */
/*   0 with ppat->ok == false : malformed line  */
/*                              or out of memory*/
/* -------------------------------------------- */
int ReadPattern(Pattern *ppat)
{
    int  val;
    char *spac;
    char buffer[BUFSIZ];

    do {
        ppat->ok = true;

        if (! sGets(buffer, sizeof(buffer)))
            return 0;

    } while ((*buffer == '/') || (! CountAlpha(buffer)));

    /* locate the first blank: it separates pattern and #mis */
    for (spac = buffer ; *spac ; spac++)
        if ((*spac == ' ') || (*spac == '\t'))
            break;

    ppat->ok = false;

    if (! *spac)
        return 0;

    if (sscanf(spac, "%d", &val) != 1)
        return 0;

    ppat->hasIndel = (val < 0);

    ppat->maxerr = ((val >= 0) ? val : -val);

    *spac = '\000';

    (void) RemBlanks(buffer);

    if ((ppat->cpat = NEWN(char, strlen(buffer)+1)))
        strcpy(ppat->cpat, buffer);

    ppat->ok = (ppat->cpat != NULL);

    return (ppat->ok ? 1 : 0);
}
/* -------------------------------------------- */
/* Prints a pattern - Debug -                   */
/*                                              */
/* Writes the pattern string, its length and    */
/* its encoded words (four per line) to stdout. */
/* -------------------------------------------- */
void PrintDebugPattern(Pattern *ppat)
{
    int idx = 0;

    printf("Pattern : %s (length : %d)\n", ppat->cpat, ppat->patlen);

    printf("Encoding : \n\t");

    while (idx < ppat->patlen) {
        printf("0x%8.8x ", ppat->patcode[idx]);
        idx++;
        if (idx % 4 == 0)       /* a group of four is complete */
            printf("\n\t");
    }

    printf("\n");
}

337
pkg/obiapat/apat_search.c Normal file
View File

@ -0,0 +1,337 @@
/* ==================================================== */
/* Copyright (c) Atelier de BioInformatique */
/* Dec. 94 */
/* File: apat_search.c */
/* Purpose: recherche du pattern */
/* algorithme de Baeza-Yates/Gonnet */
/* Manber (agrep) */
/* History: */
/* 07/12/94 : <MFS> first version */
/* 28/12/94 : <Gloup> revised version */
/* 14/05/99 : <Gloup> last revision */
/* 07/12/21 : <Zafacs> last some cleaning for 2020 */
/* ==================================================== */
#include <stdlib.h>
#include <string.h>
#include "libstki.h"
#include "apat.h"
#define POP PopiOut
#define PUSH PushiIn
#define TOPCURS CursiToTop
#define DOWNREAD ReadiDown
#define KRONECK(x, msk) ((~x & msk) ? 0 : 1)
#define MIN(x, y) ((x) < (y) ? (x) : (y))
/* -------------------------------------------- */
/* Builds the S matrix                          */
/*                                              */
/* For each of the lalpha symbols, smat[symbol] */
/* gets one bit per pattern position accepting  */
/* that symbol; the LAST pattern position is    */
/* bit 0. omask gets the bits of the positions  */
/* flagged with OBLIBIT.                        */
/* ppat->smat is NOT allocated here: it must    */
/* already hold lalpha words.                   */
/* Always returns 1 and sets ppat->ok to true.  */
/* -------------------------------------------- */
int CreateS(Pattern *ppat, int32_t lalpha)
{
    patword_t *table, posbit, oblig;
    int32_t   sym, k, code, symbit;

    ppat->ok = false;

    table = ppat->smat;

    sym = 0;
    while (sym < lalpha)
        table[sym++] = 0x0;

    oblig  = 0x0L;
    posbit = 0x1L;

    for (k = ppat->patlen - 1 ; k >= 0 ; k--) {

        code = ppat->patcode[k];

        if (ppat->patcode[k] & OBLIBIT)
            oblig |= posbit;

        symbit = 0x1L;
        for (sym = 0 ; sym < lalpha ; sym++) {
            if (code & symbit)
                table[sym] |= posbit;
            symbit <<= 1;
        }

        posbit <<= 1;
    }

    ppat->smat  = table;
    ppat->omask = oblig;
    ppat->ok    = true;

    return 1;
}
/* -------------------------------------------- */
/* Baeza-Yates/Manber algorithm                 */
/* NoError                                      */
/*                                              */
/* Scans pseq->data from begin, up to           */
/* begin+length (clamped to seqlen+circular),   */
/* for exact occurrences of ppat.               */
/* Both hit stacks of pattern patnum are        */
/* emptied first; each hit then pushes its      */
/* start position on hitpos and 0 on hiterr.    */
/* Returns the number of hits.                  */
/*                                              */
/* The state word needs patlen+1 bits: a        */
/* pattern length outside [0, 63] cannot be     */
/* represented and yields no hit.               */
/* -------------------------------------------- */
int32_t ManberNoErr(Seq *pseq, Pattern *ppat, int patnum,int begin,int length)
{
    int32_t   pos;
    patword_t smask, r;
    uint8_t   *data;
    StackiPtr *stkpos, *stkerr;
    int32_t   end;

    end = begin + length;
    end = (end <= (size_t)(pseq->seqlen+pseq->circular)) ? end:(size_t)(pseq->seqlen+pseq->circular);

                                /* reset hit stacks     */
    stkpos = pseq->hitpos + patnum;
    EmptyStacki(stkpos[0]);
    stkerr = pseq->hiterr + patnum;
    EmptyStacki(stkerr[0]);

                                /* shift must stay below the word width */
    if ((ppat->patlen < 0) || (ppat->patlen >= (int32_t)(8 * sizeof(patword_t))))
        return (*stkpos)->top;

                                /* create local masks   */
                                /* (64-bit shift, even where long is 32 bits) */
    smask = r = (patword_t) 0x1 << ppat->patlen;

                                /* init. scan           */
    data = pseq->data + begin;

                                /* loop on text data    */
    for (pos = begin ; pos < end ; pos++) {

        r = (r >> 1) & ppat->smat[*data++];

        if (r & 0x1) {
            PUSH(stkpos, pos - ppat->patlen + 1);
            PUSH(stkerr, 0);
        }

        r |= smask;
    }

    return (*stkpos)->top;      /* aka # of hits        */
}
/* -------------------------------------------- */
/* Baeza-Yates/Manber algorithm                 */
/* Substitution only                            */
/*                                              */
/* Note : r array is stored as :                */
/* 0 0 r(0,j) r(0,j+1) r(1,j) r(1,j+1) ...      */
/*                                              */
/* Scans pseq->data from begin, up to           */
/* begin+length (clamped to seqlen+circular).   */
/* Both hit stacks of pattern patnum are        */
/* emptied first; each hit pushes its start     */
/* position on hitpos and the lowest error      */
/* level that matched on hiterr.                */
/* Returns the number of hits.                  */
/*                                              */
/* State words are patword_t, like smat and     */
/* omask (32-bit words dropped the start bit of */
/* patterns of 32 symbols or more).             */
/* A pattern length outside [0, 63] yields no   */
/* hit; maxerr is clamped to MAX_PAT_ERR so     */
/* that the r array cannot be overrun.          */
/* -------------------------------------------- */
int32_t ManberSub(Seq *pseq, Pattern *ppat, int patnum,int begin,int length)
{
    int       e, emax, found;
    uint32_t  pos;
    patword_t smask, cmask, sindx;
    patword_t *pr, r[2 * MAX_PAT_ERR + 4];  /* highest index used: 2*emax+3 */
    uint8_t   *data;
    StackiPtr *stkpos, *stkerr;
    uint32_t  end;

    end = begin + length;
    end = (end <= (size_t)(pseq->seqlen+pseq->circular)) ? end:(size_t)(pseq->seqlen+pseq->circular);

                                /* reset hit stacks     */
    stkpos = pseq->hitpos + patnum;
    EmptyStacki(stkpos[0]);
    stkerr = pseq->hiterr + patnum;
    EmptyStacki(stkerr[0]);

                                /* shift must stay below the word width */
    if ((ppat->patlen < 0) || (ppat->patlen >= (int32_t)(8 * sizeof(patword_t))))
        return (*stkpos)->top;

                                /* create local masks   */
    emax = ppat->maxerr;
    if (emax > MAX_PAT_ERR)
        emax = MAX_PAT_ERR;

    r[0] = r[1] = 0x0;

    cmask = smask = (patword_t) 0x1 << ppat->patlen;

    for (e = 0, pr = r + 3 ; e <= emax ; e++, pr += 2)
        *pr = cmask;

    cmask = ~ ppat->omask;      /* obligatory positions cannot be substituted */

                                /* init. scan           */
    data = pseq->data + begin;

                                /* loop on text data    */
    for (pos = begin ; pos < end ; pos++) {

        sindx = ppat->smat[*data++];

        for (e = found = 0, pr = r ; e <= emax ; e++, pr += 2) {

            pr[2] = pr[3] | smask;

            pr[3] = ((pr[0] >> 1) & cmask)      /* sub   */
                  | ((pr[2] >> 1) & sindx);     /* ident */

            if (pr[3] & 0x1) {                  /* found */
                if (! found) {                  /* report lowest level only */
                    PUSH(stkpos, pos - ppat->patlen + 1);
                    PUSH(stkerr, e);
                }
                found++;
            }
        }
    }

    return (*stkpos)->top;      /* aka # of hits        */
}
/* -------------------------------------------- */
/* Baeza-Yates/Manber algorithm                 */
/* Substitution + Indels                        */
/*                                              */
/* Note : r array is stored as :                */
/* 0 0 r(0,j) r(0,j+1) r(1,j) r(1,j+1) ...      */
/*                                              */
/* Warning: may return shifted pos.             */
/*                                              */
/* Scans pseq->data from begin, up to           */
/* begin+length (clamped to seqlen+circular).   */
/* Both hit stacks of pattern patnum are        */
/* emptied first, as ManberNoErr and ManberSub  */
/* do, so that the returned value really is the */
/* number of hits of this call.                 */
/*                                              */
/* State words are patword_t, like smat and     */
/* omask (32-bit words dropped the start bit of */
/* patterns of 32 symbols or more).             */
/* A pattern length outside [0, 63] yields no   */
/* hit; maxerr is clamped to MAX_PAT_ERR so     */
/* that the r array cannot be overrun.          */
/* -------------------------------------------- */
int32_t ManberIndel(Seq *pseq, Pattern *ppat, int patnum,int begin,int length)
{
    int       e, emax, found;
    uint32_t  pos;
    patword_t smask, cmask, sindx;
    patword_t *pr, r[2 * MAX_PAT_ERR + 4];  /* highest index used: 2*emax+3 */
    uint8_t   *data;
    StackiPtr *stkpos, *stkerr;
    uint32_t  end;

    end = begin + length;
    end = (end <= (size_t)(pseq->seqlen+pseq->circular)) ? end:(size_t)(pseq->seqlen+pseq->circular);

                                /* reset hit stacks     */
    stkpos = pseq->hitpos + patnum;
    EmptyStacki(stkpos[0]);
    stkerr = pseq->hiterr + patnum;
    EmptyStacki(stkerr[0]);

                                /* shift must stay below the word width */
    if ((ppat->patlen < 0) || (ppat->patlen >= (int32_t)(8 * sizeof(patword_t))))
        return (*stkpos)->top;

                                /* create local masks   */
    emax = ppat->maxerr;
    if (emax > MAX_PAT_ERR)
        emax = MAX_PAT_ERR;

    r[0] = r[1] = 0x0;

    cmask = smask = (patword_t) 0x1 << ppat->patlen;

    for (e = 0, pr = r + 3 ; e <= emax ; e++, pr += 2) {
        *pr = cmask;
        cmask = (cmask >> 1) | smask;   /* level e+1 starts with e+1 leading deletions */
    }

    cmask = ~ ppat->omask;      /* obligatory positions accept no edit */

                                /* init. scan           */
    data = pseq->data + begin;

                                /* loop on text data    */
    for (pos = begin ; pos < end ; pos++) {

        sindx = ppat->smat[*data++];

        for (e = found = 0, pr = r ; e <= emax ; e++, pr += 2) {

            pr[2] = pr[3] | smask;

            pr[3] = ((  pr[0]                   /* ins   */
                     | (pr[0] >> 1)             /* sub   */
                     | (pr[1] >> 1))            /* del   */
                     & cmask)
                  | ((pr[2] >> 1) & sindx);     /* ident */

            if (pr[3] & 0x1) {                  /* found */
                if (! found) {                  /* report lowest level only */
                    PUSH(stkpos, pos - ppat->patlen + 1);
                    PUSH(stkerr, e);
                }
                found++;
            }
        }
    }

    return (*stkpos)->top;      /* aka # of hits        */
}
/* -------------------------------------------- */
/* Baeza-Yates/Manber algorithm                 */
/* Dispatcher:                                  */
/*   maxerr == 0   -> ManberNoErr               */
/*   hasIndel      -> ManberIndel               */
/*   otherwise     -> ManberSub                 */
/* Returns whatever the selected scan returns.  */
/* -------------------------------------------- */
int32_t ManberAll(Seq *pseq, Pattern *ppat, int patnum,int begin,int length)
{
    if (ppat->maxerr != 0) {
        return ppat->hasIndel
             ? ManberIndel(pseq, ppat, patnum, begin, length)
             : ManberSub(pseq, ppat, patnum, begin, length);
    }

    return ManberNoErr(pseq, ppat, patnum, begin, length);
}
/* -------------------------------------------- */
/* NWS alignment */
/* for hit editing */
/* (with mandatory substitution at the borders) */
/* -------------------------------------------- */
/* Fills a (patlen+1) x (seqlen+1) edit-distance table, row by row, in the */
/* static buffer sTab, then scans the last row from column seqlen down to 0. */
/* For every cell whose value is <= nerr, the column index is appended to */
/* reslen and the cell value to reserr. Returns the number of cells reported. */
/* NOTE(review): nothing here checks that the table fits in sTab; this */
/* presumably relies on seqlen <= MAX_PAT_LEN+MAX_PAT_ERR and */
/* patlen <= MAX_PAT_LEN - confirm against callers. */
/* NOTE(review): the table is a static local, hence shared by all calls. */
int32_t NwsPatAlign(pseq, ppat, nerr, reslen, reserr)
Seq *pseq;
Pattern *ppat;
int32_t nerr, *reslen, *reserr;
{
uint8_t *sseq, *px;
int32_t i, j, lseq, lpat, npos, dindel, dsub,
*pc, *pi, *pd, *ps;
uint32_t amask;
static int32_t sTab[(MAX_PAT_LEN+MAX_PAT_ERR+1) * (MAX_PAT_LEN+1)];
lseq = pseq->seqlen;
/* pc = current cell, pi = cell to its left, pd = same column in the */
/* previous row, ps = previous row and previous column */
pc = sTab; /* |----|----| --> i */
pi = pc - 1; /* | ps | pd | | */
pd = pi - lseq; /* |----|----| | */
ps = pd - 1; /* | pi | pc | v j */
/* |---------| */
lseq = pseq->seqlen;
lpat = ppat->patlen;
/* px starts one byte before the data, so that for i >= 1 *px is data[i-1] */
sseq = pseq->data - 1;
/* NOTE(review): ONEMASK is a 64-bit constant while amask is 32-bit, so the */
/* value is truncated on assignment - confirm this is intended */
amask = ONEMASK >> lpat;
for (j = 0 ; j <= lpat ; j++) {
for (i = 0 , px = sseq ; i <= lseq ; i++, px++) {
if (i && j) {
/* inner cell: cheapest of an indel (left / up, +1) and the diagonal */
dindel = MIN(*pi, *pd) + 1;
dsub = *ps + KRONECK(ppat->smat[*px], amask);
*pc = MIN(dindel, dsub);
}
else if (i) /* j == 0 */
*pc = *pi + 1;
else if (j) /* i == 0 */
*pc = *pd + 1;
else /* root */
*pc = 0;
pc++;
pi++;
pd++;
ps++;
}
amask <<= 1;
}
/* step back onto the last cell of the last row */
pc--;
for (i = lseq, npos = 0 ; i >= 0 ; i--, pc--) {
if (*pc <= nerr) {
*reslen++ = i;
*reserr++ = *pc;
npos++;
}
}
return npos;
}

82
pkg/obiapat/ecoMalloc.c Normal file
View File

@ -0,0 +1,82 @@
#include "obiapat.h"
#include <stdlib.h>
static int eco_log_malloc = 0;
/* Turns on the stderr trace written by eco_malloc, eco_realloc and eco_free. */
void eco_trace_memory_allocation(void)
{
    eco_log_malloc = 1;
}
/* Turns off the stderr trace written by eco_malloc, eco_realloc and eco_free. */
void eco_untrace_memory_allocation(void)
{
    eco_log_malloc = 0;
}
/* Allocates chunksize zero-filled bytes (calloc).                      */
/*   error_message, filename, line : forwarded to ecoError() on failure */
/*   errcode, errmsg               : forwarded to ecoError() on failure */
/* Returns the new chunk; when calloc fails, ecoError() is called with  */
/* ECO_MEM_ERROR and, if it returns, NULL is returned.                  */
/* When tracing is on, the allocation is reported on stderr.            */
/* The error-code parameter is not called `errno`: that identifier is   */
/* reserved by <errno.h>, where it is a macro.                          */
void *eco_malloc(int32_t chunksize,
                 const char *error_message,
                 const char *filename,
                 int32_t line,
                 int *errcode, char **errmsg)
{
    void *chunk;

    chunk = calloc(1, chunksize);

    if (!chunk)
        ecoError(ECO_MEM_ERROR, error_message, filename, line, errcode, errmsg);

    if (eco_log_malloc)
        fprintf(stderr,
                "Memory segment located at %p of size %d is allocated (file : %s [%d])",
                chunk,
                chunksize,
                filename,
                line);

    return chunk;
}
/* Resizes chunk to newsize bytes (realloc).                            */
/*   error_message, filename, line : forwarded to ecoError() on failure */
/*   errcode, errmsg               : forwarded to ecoError() on failure */
/* Returns the resized chunk; when realloc returns NULL, ecoError() is  */
/* called with ECO_MEM_ERROR and, if it returns, NULL is returned.      */
/* When tracing is on, the reallocation is reported on stderr.          */
/* The error-code parameter is not called `errno`: that identifier is   */
/* reserved by <errno.h>, where it is a macro.                          */
void *eco_realloc(void *chunk,
                  int32_t newsize,
                  const char *error_message,
                  const char *filename,
                  int32_t line,
                  int *errcode, char **errmsg)
{
    void *newchunk;

    newchunk = realloc(chunk, newsize);

    if (!newchunk)
        ecoError(ECO_MEM_ERROR, error_message, filename, line, errcode, errmsg);

    if (eco_log_malloc)
        fprintf(stderr,
                "Old memory segment %p is reallocated at %p with a size of %d (file : %s [%d])",
                chunk,
                newchunk,
                newsize,
                filename,
                line);

    return newchunk;
}
/* Releases chunk (free).                                               */
/*   error_message, filename, line : only used by the stderr trace      */
/*   errcode, errmsg               : unused                             */
/* When tracing is on, the release is reported on stderr.               */
/* The error-code parameter is not called `errno`: that identifier is   */
/* reserved by <errno.h>, where it is a macro.                          */
void eco_free(void *chunk,
              const char *error_message,
              const char *filename,
              int32_t line,
              int *errcode, char **errmsg)
{
    (void) errcode;
    (void) errmsg;

    free(chunk);

    if (eco_log_malloc)
        fprintf(stderr,
                "Memory segment %p is released => %s (file : %s [%d])",
                chunk,
                error_message,
                filename,
                line);
}

391
pkg/obiapat/libstki.c Normal file
View File

@ -0,0 +1,391 @@
/* ==================================================== */
/* Copyright (c) Atelier de BioInformatique */
/* Mar. 92 */
/* File: libstki.c */
/* Purpose: A library to deal with 'stacks' of */
/* integers */
/* Note: 'stacks' are dynamic (i.e. size is */
/* automatically readjusted when needed) */
/* History: */
/* 00/03/92 : <Gloup> first draft */
/* 15/08/93 : <Gloup> revised version */
/* 14/05/99 : <Gloup> last revision */
/* 07/12/21 : <Zafacs> some cleaning for 2020's */
/* ==================================================== */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// #include "Gtypes.h"
#include "libstki.h"
/* ============================ */
/* Constantes et Macros locales */
/* ============================ */
#define ExpandStack(stkh) ResizeStacki((stkh), (*stkh)->size << 1)
#define ShrinkStack(stkh) ResizeStacki((stkh), (*stkh)->size >> 1)
static int16_t sStkiLastError = kStkiNoErr;
/* -------------------------------------------- */
/* error handling                               */
/* get / reset the error flag                   */
/*                                              */
/* Returns the last recorded error code; when   */
/* reset is true the flag is set back to        */
/* kStkiNoErr afterwards.                       */
/*                                              */
/* @function: StkiError                         */
/* -------------------------------------------- */
int16_t StkiError(bool reset)
{
    const int16_t last = sStkiLastError;

    if (reset)
        sStkiLastError = kStkiNoErr;

    return last;

} /* end of StkiError */
/* -------------------------------------------- */
/* creates a stack                              */
/*                                              */
/* Allocates an empty stack (top and cursor at  */
/* 0) able to hold size values.                 */
/* Returns NULL when either allocation fails;   */
/* kStkiMemErr is recorded in both cases (the   */
/* header allocation failure used to go         */
/* unrecorded).                                 */
/*                                              */
/* @function: NewStacki                         */
/* -------------------------------------------- */
StackiPtr NewStacki(int32_t size)
{
    StackiPtr stki;

    if (! (stki = NEW(Stacki))) {
        sStkiLastError = kStkiMemErr;
        return NULL;
    }

    stki->size   = size;
    stki->top    = 0;
    stki->cursor = 0;

    if ( ! (stki->val = NEWN(int32_t, size))) {
        sStkiLastError = kStkiMemErr;
        return FreeStacki(stki);
    }

    return stki;

} /* end of NewStacki */
/* -------------------------------------------- */
/* releases a stack                             */
/*                                              */
/* Frees the value buffer and the stack itself. */
/* A NULL stack is accepted. Always returns     */
/* NULL.                                        */
/*                                              */
/* @function: FreeStacki                        */
/* -------------------------------------------- */
StackiPtr FreeStacki(StackiPtr stki)
{
    if (stki == NULL)
        return NULL;

    if (stki->val != NULL)
        FREE(stki->val);

    FREE(stki);

    return NULL;

} /* end of FreeStacki */
/* -------------------------------------------- */
/* creates a vector of stacks                   */
/*                                              */
/* Allocates vectSize stacks of stackSize       */
/* values each. On failure the stacks already   */
/* created are released and NULL is returned.   */
/*                                              */
/* @function: NewStackiVector                   */
/* -------------------------------------------- */
StackiHdle NewStackiVector(int32_t vectSize, int32_t stackSize)
{
    StackiHdle vect;
    int32_t    n;

    vect = NEWN(StackiPtr, vectSize);

    if (vect == NULL) {
        sStkiLastError = kStkiMemErr;
        return NULL;
    }

    n = 0;
    while (n < vectSize) {
        vect[n] = NewStacki(stackSize);
        if (vect[n] == NULL)
            return FreeStackiVector(vect, n);   /* frees the n first ones */
        n++;
    }

    return vect;

} /* end of NewStackiVector */
/* -------------------------------------------- */
/* releases a vector of stacks                  */
/*                                              */
/* Frees the vectSize first stacks of stkh,     */
/* then the vector itself. A NULL vector is     */
/* accepted. Always returns NULL.               */
/*                                              */
/* @function: FreeStackiVector                  */
/* -------------------------------------------- */
StackiHdle FreeStackiVector(StackiHdle stkh, int32_t vectSize)
{
    int32_t n;

    if (stkh == NULL)
        return NULL;

    n = 0;
    while (n < vectSize) {
        (void) FreeStacki(stkh[n]);
        n++;
    }

    FREE(stkh);

    return NULL;

} /* end of FreeStackiVector */
/* -------------------------------------------- */
/* resizes a stack                              */
/*                                              */
/* Reallocates the value buffer of *stkh for    */
/* size values. Returns the new size, or 0 on   */
/* failure (kStkiMemErr is then recorded and    */
/* the stack is left untouched).                */
/* A size <= 0 is rejected up front: realloc    */
/* with a zero size may free the buffer and     */
/* return NULL, which would leave val dangling. */
/*                                              */
/* @function: ResizeStacki                      */
/* -------------------------------------------- */
int32_t ResizeStacki(StackiHdle stkh, int32_t size)
{
    int32_t *val;

    if (size <= 0) {
        sStkiLastError = kStkiMemErr;
        return 0;
    }

    val = REALLOC(int32_t, (*stkh)->val, size);

    if (! val) {
        sStkiLastError = kStkiMemErr;
        return 0;
    }

    (*stkh)->val  = val;
    (*stkh)->size = size;

    return size;

} /* end of ResizeStacki */
/* -------------------------------------------- */
/* push                                         */
/*                                              */
/* Appends val on top of the stack, doubling    */
/* the buffer first when it is full.            */
/* Returns false when that expansion fails,     */
/* true otherwise.                              */
/*                                              */
/* @function: PushiIn                           */
/* -------------------------------------------- */
bool PushiIn(StackiHdle stkh, int32_t val)
{
    StackiPtr stk = *stkh;

    if (stk->top >= stk->size) {
        if (! ExpandStack(stkh))
            return false;
    }

    stk->val[stk->top] = val;
    stk->top++;

    return true;

} /* end of PushiIn */
/* -------------------------------------------- */
/* pop                                          */
/*                                              */
/* Removes the top value and stores it in *val. */
/* Returns false when the stack is empty.       */
/* The buffer is halved when less than half of  */
/* it is in use and more than kMinStackiSize    */
/* values remain; a failed shrink is ignored.   */
/*                                              */
/* @function: PopiOut                           */
/* -------------------------------------------- */
bool PopiOut(StackiHdle stkh, int32_t *val)
{
    StackiPtr stk = *stkh;

    if (stk->top <= 0)
        return false;

    stk->top--;
    *val = stk->val[stk->top];

    if ((stk->top < (stk->size >> 1)) && (stk->top > kMinStackiSize))
        (void) ShrinkStack(stkh);

    return true;

} /* end of PopiOut */
/* -------------------------------------------- */
/* downward read                                */
/*                                              */
/* Moves the cursor one step down and stores    */
/* the value found there in *val.               */
/* Returns false (cursor unchanged) when the    */
/* cursor is already at the bottom.             */
/*                                              */
/* @function: ReadiDown                         */
/* -------------------------------------------- */
bool ReadiDown(StackiPtr stki, int32_t *val)
{
    if (stki->cursor > 0) {
        stki->cursor--;
        *val = stki->val[stki->cursor];
        return true;
    }

    return false;

} /* end of ReadiDown */
/* -------------------------------------------- */
/* upward read                                  */
/*                                              */
/* Stores the value under the cursor in *val    */
/* and moves the cursor one step up.            */
/* Returns false (cursor unchanged) when the    */
/* cursor has reached the top.                  */
/*                                              */
/* @function: ReadiUp                           */
/* -------------------------------------------- */
bool ReadiUp(StackiPtr stki, int32_t *val)
{
    if (stki->cursor < stki->top) {
        *val = stki->val[stki->cursor];
        stki->cursor++;
        return true;
    }

    return false;

} /* end of ReadiUp */
/* -------------------------------------------- */
/* moves the cursor to the top / to the bottom  */
/*                                              */
/* @function: CursiToTop                        */
/* @function: CursiToBottom                     */
/* -------------------------------------------- */

/* Places the cursor at top, i.e. just above the last pushed value. */
void CursiToTop(StackiPtr stki)
{
    const int32_t top = stki->top;

    stki->cursor = top;

} /* end of CursiToTop */
/* Places the cursor on the first (bottom) slot of the stack.      */
/* Defined with a prototype, like every other function of this     */
/* file (old-style parameter lists are no longer valid in C23).    */
void CursiToBottom(StackiPtr stki)
{
    stki->cursor = 0;

} /* end of CursiToBottom */
/* -------------------------------------------- */
/* swaps the values at cursor and at (top - 1)  */
/*                                              */
/* Nothing is done when the stack is empty or   */
/* when the cursor is outside [0, top - 1]: a   */
/* cursor equal to top (its value right after   */
/* CursiToTop) designates no stored value and   */
/* may lie past the end of the buffer.          */
/*                                              */
/* @function: CursiSwap                         */
/* -------------------------------------------- */
void CursiSwap(StackiPtr stki)
{
    int32_t tmp;

    if ((stki->top <= 0) || (stki->cursor < 0) || (stki->cursor >= stki->top))
        return;

    tmp = stki->val[stki->cursor];
    stki->val[stki->cursor] = stki->val[stki->top - 1];
    stki->val[stki->top - 1] = tmp;

} /* end of CursiSwap */
/* -------------------------------------------- */
/* Searches a value in the stack, reading       */
/* downward from the current cursor.            */
/* The cursor is left where the value was       */
/* found.                                       */
/*                                              */
/* Returns true when sval was found, false when */
/* the bottom was reached without finding it.   */
/*                                              */
/* @function: SearchDownStacki                  */
/* -------------------------------------------- */
bool SearchDownStacki(StackiPtr stki, int32_t sval)
{
    int32_t val;

    for (;;) {
        if (! ReadiDown(stki, &val))
            return false;           /* bottom reached */
        if (val == sval)
            return true;
    }

} /* end of SearchDownStacki */
/* -------------------------------------------- */
/* Binary search of a value in the stack.       */
/* The stack is assumed to be sorted by         */
/* increasing values.                           */
/* On success the cursor is set on the slot     */
/* holding the value; otherwise it is left      */
/* untouched.                                   */
/*                                              */
/* @function: BinSearchStacki                   */
/* -------------------------------------------- */
bool BinSearchStacki(StackiPtr stki, int32_t sval)
{
    int32_t midd, low, high, current;

    low  = 0;
    high = stki->top - 1;

    while (high >= low) {
        /* low + (high - low) / 2 cannot overflow, unlike (high + low) / 2 */
        midd = low + (high - low) / 2;
        current = stki->val[midd];

        if (current == sval) {
            stki->cursor = midd;
            return true;
        }

        /* compare directly: the difference of two int32_t values may   */
        /* overflow and report the wrong sign (e.g. INT32_MAX - (-1))   */
        if (current > sval)
            high = midd - 1;
        else
            low = midd + 1;
    }

    return false;
} /* end of BinSearchStacki */
/* -------------------------------------------- */
/* test the *physical* equality of two stacks:  */
/* same number of values and identical bytes    */
/* in the used part of the value arrays         */
/*                                              */
/* @function: SameStacki                        */
/* -------------------------------------------- */
bool SameStacki(StackiPtr stki1, StackiPtr stki2)
{
    /* stacks holding a different number of values cannot be equal */
    if (stki1->top != stki2->top)
        return false;

    /* only the first top values are compared; size and cursor are ignored */
    return memcmp(stki1->val, stki2->val,
                  stki1->top * sizeof(int32_t)) == 0;
} /* end of SameStacki */
/* -------------------------------------------- */
/* reverse, in place, the order of the values   */
/* held by a stack                              */
/* returns false when the stack is empty        */
/*                                              */
/* @function: ReverseStacki                     */
/* -------------------------------------------- */
bool ReverseStacki(StackiPtr stki)
{
    int32_t low, high, swp;

    if (stki->top <= 0)
        return false;

    /* swap the two ends and walk toward the middle */
    for (low = 0, high = stki->top - 1; low < high; low++, high--) {
        swp = stki->val[high];
        stki->val[high] = stki->val[low];
        stki->val[low] = swp;
    }

    return true;
} /* end of ReverseStacki */
/* -------------------------------------------- */
/* Remove every value from a stack by moving    */
/* the top member back to 0.                    */
/* The allocated size and the cursor are left   */
/* untouched. Always returns true.              */
/*                                              */
/* @function: EmptyStacki                       */
/* -------------------------------------------- */
bool EmptyStacki(StackiPtr stki)
{
stki->top = 0;
return true;
}

81
pkg/obiapat/libstki.h Normal file
View File

@ -0,0 +1,81 @@
/* ==================================================== */
/* Copyright (c) Atelier de BioInformatique */
/* Mar. 92 */
/* File: libstki.h */
/* Purpose: library of dynamic stacks holding */
/* integer values */
/* History: */
/* 00/03/92 : <Gloup> first draft */
/* 07/07/93 : <Gloup> complete revision */
/* 10/03/94 : <Gloup> added xxxVector funcs */
/* 14/05/99 : <Gloup> last revision */
/* 07/12/21 : <Zafacs> some last cleaning for 2020 */
/* ==================================================== */
#ifndef _H_libstki
#define _H_libstki
#include <stdint.h>
#include <stdbool.h>
#include "apat_mem.h"
/* ==================================================== */
/* Sizing constants */
/* ==================================================== */
#ifndef kMinStackiSize
#define kMinStackiSize 2 /* taille mini stack */
#endif
#define kStkiNoErr 0 /* ok */
#define kStkiMemErr 1 /* not enough memory */
#define kStkiReset true
#define kStkiGet false
/* ==================================================== */
/* Types & data structures */
/* ==================================================== */
/* -------------------- */
/* structure : stack */
/* -------------------- */
/* Stacki    : the stack record itself                              */
/* StackiPtr : pointer to a stack, used by read-only/cursor calls   */
/* StackiHdle: pointer to a StackiPtr, taken by PushiIn, PopiOut    */
/*             and ResizeStacki                                     */
typedef struct Stacki {
/* ---------------------*/
int32_t size; /* stack size */
int32_t top; /* current free pos. (= number of stored values) */
int32_t cursor; /* current cursor (index used by ReadiUp/ReadiDown) */
int32_t *val; /* values */
/* ---------------------*/
} Stacki, *StackiPtr, **StackiHdle;
/* ==================================================== */
/* Prototypes (generated by mproto) */
/* ==================================================== */
/* libstki.c */
int16_t StkiError (bool reset );
StackiPtr NewStacki (int32_t size );
StackiPtr FreeStacki (StackiPtr stki );
StackiHdle NewStackiVector (int32_t vectSize, int32_t stackSize );
StackiHdle FreeStackiVector (StackiHdle stkh , int32_t vectSize );
int32_t ResizeStacki (StackiHdle stkh , int32_t size );
bool PushiIn (StackiHdle stkh , int32_t val );
bool PopiOut (StackiHdle stkh , int32_t *val );
bool ReadiDown (StackiPtr stki , int32_t *val );
bool ReadiUp (StackiPtr stki , int32_t *val );
void CursiToTop (StackiPtr stki );
void CursiToBottom (StackiPtr stki );
void CursiSwap (StackiPtr stki );
bool SearchDownStacki (StackiPtr stki , int32_t sval );
bool BinSearchStacki (StackiPtr stki , int32_t sval );
bool SameStacki (StackiPtr stki1 , StackiPtr stki2 );
bool ReverseStacki (StackiPtr stki );
bool EmptyStacki (StackiPtr stki );
#endif /* _H_libstki */

417
pkg/obiapat/obiapat.c Normal file
View File

@ -0,0 +1,417 @@
#include <string.h>
#include <stdio.h>
#include "libstki.h"
#include "apat.h"
#include "obiapat.h"
static void EncodeSequence(SeqPtr seq);
static void UpperSequence(char *seq);
/*
 * Record an error: format a human readable message and store the
 * error code. The program is NOT terminated; the caller is expected
 * to propagate the NULL result (see the ECOERROR macro).
 *
 * @param error      error number, copied to *errno
 * @param message    the text explaining what's going on
 * @param filename   the source file where the program failed
 * @param linenumber the line where it has failed
 *                   filename and linenumber are written at
 *                   pre-processing time by a macro
 * @param errno      receives the error number (ignored when NULL)
 * @param error_msg  receives a newly allocated message that the
 *                   caller must release with free(); set to NULL
 *                   when the message could not be allocated
 *                   (ignored when NULL)
 * @return always NULL
 */
void* ecoError(int error,
               const char* message,
               const char * filename,
               int linenumber,
               int *errno,
               char **error_msg)
{
    if (error_msg) {
        /* on failure asprintf leaves its output pointer undefined: */
        /* force NULL so that callers can test and free it safely   */
        if (asprintf(error_msg,
                     "Error %d in file %s line %d : %s",
                     error,
                     filename,
                     linenumber,
                     message) < 0)
            *error_msg = NULL;
    }

    if (errno)
        *errno = error;

    return NULL;
}
/*
* @doc: DNA alphabet (IUPAC)
*/
#define LX_BIO_DNA_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZ#![]"
/*
* @doc: complementary DNA alphabet (IUPAC)
*/
#define LX_BIO_CDNA_ALPHA "TVGHEFCDIJMLKNOPQYSAABWXRZ#!]["
static char sNuc[] = LX_BIO_DNA_ALPHA;
static char sAnuc[] = LX_BIO_CDNA_ALPHA;
static char LXBioBaseComplement(char nucAc);
static char *LXBioSeqComplement(char *nucAcSeq);
static char *reverseSequence(char *str,char isPattern);
/* ---------------------------- */
/* Complement of one symbol: the symbol is looked up in sNuc and the  */
/* character at the same rank in sAnuc is returned. Symbols that are  */
/* not part of sNuc are returned unchanged.                           */
char LXBioBaseComplement(char nucAc)
{
    const char *hit = strchr(sNuc, nucAc);

    if (hit == NULL)
        return nucAc;

    return sAnuc[hit - sNuc];
}
/* ---------------------------- */
/* Complement, in place, every character of a NUL terminated string */
/* (the order of the characters is not changed) and return the      */
/* string itself.                                                   */
char *LXBioSeqComplement(char *nucAcSeq)
{
    char *cursor = nucAcSeq;

    while (*cursor != '\0') {
        *cursor = LXBioBaseComplement(*cursor);
        cursor++;
    }

    return nucAcSeq;
}
/*
 * Reverse a NUL terminated string in place and return it.
 *
 * When isPattern is non zero a second pass moves the pattern markers
 * back to the side of the symbol they decorated before the reversal:
 *   "#X"     becomes "X#"
 *   "#[...]" becomes "[...]#"
 *   "#X!"    becomes "!X#"
 *   "X!"     becomes "!X"
 *
 * A NULL or empty string is returned untouched (the former code formed
 * the pointer str - 1 for an empty string, which is undefined).
 */
char *reverseSequence(char *str,char isPattern)
{
    char *sb, *se, c;
    size_t len;

    if (! str)
        return str;

    len = strlen(str);
    if (len == 0)
        return str;

    /* plain in-place reversal */
    sb = str;
    se = str + len - 1;
    while(sb <= se) {
        c = *sb;
        *sb++ = *se;
        *se-- = c;
    }

    if (! isPattern)
        return str;

    /* NOTE(review): the fix-up below is kept exactly as it was.        */
    /*  - '#' followed by '[' scans up to ']' without checking for the   */
    /*    end of the string;                                             */
    /*  - the "#X!" case requires (se - sb) > 2, so it is not taken when */
    /*    these three characters end the string -- confirm whether       */
    /*    >= 2 was intended;                                             */
    /*  - a leading '!' reads the character before the string.           */
    sb = str;
    se = str + len - 1;
    for (;sb <= se; sb++)
    {
        if (*sb=='#')
        {
            if (*(sb+1) == '[') {
                /* shift the bracketed set one slot left, '#' goes after ']' */
                while(*sb !=']') {
                    *sb = *(sb+1);
                    sb++;
                }
                *sb='#';
            } else {
                if (((se - sb) > 2) && (*(sb+2)=='!'))
                {
                    /* "#X!" -> "!X#" */
                    *sb='!';
                    sb+=2;
                    *sb='#';
                }
                else
                {
                    /* "#X" -> "X#" */
                    *sb=*(sb+1);
                    sb++;
                    *sb='#';
                }
            }
        }
        else if (*sb=='!')
        {
            /* "X!" -> "!X" */
            *sb=*(sb-1);
            *(sb-1)='!';
        }
    }

    return str;
}
/* Reverse-complement a pattern in place: every symbol is complemented, */
/* then the string is reversed with the pattern marker fix-up enabled.  */
/* Returns its argument.                                                */
char *ecoComplementPattern(char *nucAcSeq)
{
    char *complemented = LXBioSeqComplement(nucAcSeq);

    return reverseSequence(complemented, 1);
}
/* Reverse-complement a plain sequence in place: every symbol is      */
/* complemented, then the string is reversed without any pattern      */
/* marker fix-up. Returns its argument.                               */
char *ecoComplementSequence(char *nucAcSeq)
{
    char *complemented = LXBioSeqComplement(nucAcSeq);

    return reverseSequence(complemented, 0);
}
/*
 * Extract the subsequence [begin,end[ from nucAcSeq.
 *
 * When begin >= end the extraction wraps around the end of the
 * sequence: the characters from begin to the end of nucAcSeq are
 * followed by the first end characters of nucAcSeq.
 *
 * The result lives in a static buffer shared by every call: it is
 * overwritten by the next call, must not be freed by the caller and
 * makes the function non reentrant.
 *
 * Returns NULL when the buffer cannot be (re)allocated.
 */
char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end,
                     int *errno, char **errmsg)
{
    static char *buffer = NULL;
    static int32_t buffSize= 0;
    int32_t length;

    if (begin < end)
        length = end - begin;
    else
        length = end + strlen(nucAcSeq) - begin;

    /* grow the shared buffer when it cannot hold length chars + NUL */
    if (length >= buffSize)
    {
        buffSize = length+1;
        if (buffer)
            buffer=ECOREALLOC(buffer,buffSize,
                              "Error in reallocating sub sequence buffer",errno,errmsg);
        else
            buffer=ECOMALLOC(buffSize,
                             "Error in allocating sub sequence buffer",errno,errmsg);

        /* never copy into a missing buffer; forgetting the size makes  */
        /* the next call start again from a fresh allocation            */
        if (! buffer)
        {
            buffSize = 0;
            return NULL;
        }
    }

    if (begin < end)
    {
        strncpy(buffer,nucAcSeq + begin,length);
    }
    else
    {
        /* tail of the sequence first, then its first end characters */
        strncpy(buffer,nucAcSeq+begin,length - end);
        strncpy(buffer+(length-end),nucAcSeq ,end);
    }
    buffer[length]=0;

    return buffer;
}
/* -------------------------------------------- */
/* uppercase sequence                           */
/* converts, in place, every character in the   */
/* range 'a'..'z' to its upper case counterpart */
/* all other characters are left as they are    */
/* -------------------------------------------- */
#define IS_LOWER(c) (((c) >= 'a') && ((c) <= 'z'))
#define TO_UPPER(c) ((c) - 'a' + 'A')
void UpperSequence(char *seq)
{
    while (*seq != '\0') {
        if (IS_LOWER(*seq))
            *seq = TO_UPPER(*seq);
        seq++;
    }
}
#undef IS_LOWER
#undef TO_UPPER
/* -------------------------------------------- */
/* encode sequence                              */
/* IS_UPPER is slightly faster than isupper     */
/*                                              */
/* Each character of seq->cseq is turned into   */
/* its rank in the alphabet (A=0 .. Z=25, case  */
/* insensitive, anything else = 0) and stored   */
/* in seq->data. seq->circular more codes are   */
/* then appended by reading the sequence again  */
/* from its start, and the top of every hit     */
/* and error stack is reset to 0.               */
/* -------------------------------------------- */
#define IS_UPPER(c) (((c) >= 'A') && ((c) <= 'Z'))
void EncodeSequence(SeqPtr seq)
{
    int i;
    uint8_t *data;
    char *cseq;
    char nuc;

    data = seq->data;
    cseq = seq->cseq;

    while (*cseq) {
        nuc = *cseq & (~32);   /* clear bit 5: folds a..z onto A..Z */
        *data = (IS_UPPER(nuc) ? nuc - 'A' : 0x0);
        data++;
        cseq++;
    }

    /* circular extension. The former loop kept reading past the NUL  */
    /* terminator when the sequence was shorter than seq->circular;   */
    /* wrap to the start instead, as a circular molecule would. An    */
    /* empty sequence yields zero codes.                              */
    for (i=0,cseq=seq->cseq;i < seq->circular; i++,data++) {
        if (*cseq == '\0')
            cseq = seq->cseq;
        nuc = *cseq & (~32);
        *data = (IS_UPPER(nuc) ? nuc - 'A' : 0x0);
        if (*cseq)
            cseq++;
    }

    for (i = 0 ; i < MAX_PATTERN ; i++)
        seq->hitpos[i]->top = seq->hiterr[i]->top = 0;
}
#undef IS_UPPER
/*
 * Build, or recycle, the Seq structure used by the pattern matcher.
 *
 * in       : NUL terminated sequence. The pointer is stored as is in
 *            out->cseq (no copy), so it must outlive the structure.
 * circular : non zero for a circular sequence; the encoded data is
 *            then extended by MAX_PAT_LEN positions.
 * seqlen   : length of the sequence.
 * out      : structure to recycle, or NULL to allocate a new one.
 * errno, errmsg : error reporting, see ecoError.
 *
 * Returns the structure, or NULL on allocation failure. A structure
 * created by this call is released before NULL is returned; a
 * recycled one is left to its owner, and its cseq member is
 * unchanged.
 */
SeqPtr new_apatseq(const char *in,int32_t circular, int32_t seqlen,
                   SeqPtr out,
                   int *errno, char **errmsg)
{
    int i;
    int32_t needed;
    int32_t bytes;
    bool created = false;

    if (circular != 0) circular=MAX_PAT_LEN;

    if (!out)
    {
        out = ECOMALLOC(sizeof(Seq),
                        "Error in Allocation of a new Seq structure",errno,errmsg);
        if (!out)
            return NULL;
        created = true;

        /* the fresh block is not known to be zeroed: initialise every */
        /* member that is tested below or by delete_apatseq            */
        out->data   = NULL;
        out->datsiz = 0;
        for (i = 0 ; i < MAX_PATTERN ; i++)
            out->hitpos[i] = out->hiterr[i] = NULL;

        for (i = 0 ; i < MAX_PATTERN ; i++)
        {
            if (! (out->hitpos[i] = NewStacki(kMinStackiSize))) {
                delete_apatseq(out,errno,errmsg);
                ECOERROR(ECO_MEM_ERROR,"Error in hit stack Allocation",errno,errmsg);
            }
            if (! (out->hiterr[i] = NewStacki(kMinStackiSize))) {
                delete_apatseq(out,errno,errmsg);
                ECOERROR(ECO_MEM_ERROR,"Error in error stack Allocation",errno,errmsg);
            }
        }
    }

    out->seqsiz = out->seqlen = seqlen;
    out->circular = circular;

    needed = out->seqlen + circular;

    if (!out->data || (needed >= out->datsiz))
    {
        /* always request at least one byte: a zero sized request may */
        /* legitimately come back as NULL                             */
        bytes = (needed > 0 ? needed : 1) * (int32_t) sizeof(uint8_t);

        if (out->data)
            out->data = ECOREALLOC(out->data,bytes,
                                   "Error during Seq data buffer realloc",
                                   errno,errmsg);
        else
            out->data = ECOMALLOC(bytes,
                                  "Error in Allocation of a new Seq data member",
                                  errno,errmsg);

        if (!out->data)
        {
            /* NOTE(review): assumes eco_malloc/eco_realloc report the */
            /* failure through errno/errmsg before returning NULL      */
            out->datsiz = 0;
            if (created)
                delete_apatseq(out,errno,errmsg);
            return NULL;
        }
        out->datsiz = needed;
    }

    out->cseq = (char *)in;
    EncodeSequence(out);
    return out;
}
/*
 * Release a Seq structure: its encoded data buffer, every hit and
 * error stack, then the structure itself. The cseq member is not
 * released here.
 *
 * Returns 0 when something was released, 1 when pseq is NULL.
 */
int32_t delete_apatseq(SeqPtr pseq,
                       int *errno, char **errmsg)
{
    int slot;

    if (!pseq)
        return 1;

    if (pseq->data)
        ECOFREE(pseq->data,"Freeing sequence data buffer",
                errno,errmsg);

    for (slot = 0 ; slot < MAX_PATTERN ; slot++) {
        if (pseq->hitpos[slot])
            FreeStacki(pseq->hitpos[slot]);
        if (pseq->hiterr[slot])
            FreeStacki(pseq->hiterr[slot]);
    }

    ECOFREE(pseq,"Freeing apat sequence structure",
            errno,errmsg);

    return 0;
}
/*
 * Compile a textual pattern.
 *
 * The Pattern structure, the upper-cased copy of the text (cpat), the
 * encoded pattern (patcode) and the smat table are carved out of a
 * single allocation, so the result is released with a single free.
 *
 * pat       : NUL terminated pattern text
 * error_max : stored in the maxerr member of the result
 * errno, errmsg : error reporting, see ecoError
 *
 * Returns the compiled pattern, or NULL when the allocation fails or
 * when the pattern is rejected by CheckPattern, EncodePattern or
 * CreateS; nothing is left allocated in that case.
 *
 * NOTE(review): patcode starts right after the text, at an address
 * that is not necessarily aligned for uint32_t -- confirm that every
 * target tolerates unaligned accesses.
 */
PatternPtr buildPattern(const char *pat, int32_t error_max,
                        int *errno, char **errmsg)
{
    PatternPtr pattern;
    int32_t patlen;
    int32_t patlen2;
    const char *failure = NULL;

    patlen = strlen(pat);
    patlen2 = lenPattern(pat);

    pattern = ECOMALLOC(sizeof(Pattern) +            // Space for struct Pattern
                        sizeof(char)*patlen+1 +      // Space for cpat
                        sizeof(uint32_t) * patlen2 + // Space for patcode
                        sizeof(patword_t) * ALPHA_LEN , // Space for smat
                        "Error in pattern allocation",
                        errno,errmsg);
    if (!pattern)
        return NULL;

    pattern->ok = true;
    pattern->hasIndel= false;
    pattern->maxerr = error_max;

    /* lay the three variable sized parts out behind the structure */
    pattern->cpat = (char*)pattern + sizeof(Pattern);
    pattern->patcode = (uint32_t*)(pattern->cpat + patlen + 1);
    pattern->smat = (patword_t*)(pattern->patcode + patlen2);

    strncpy(pattern->cpat,pat,patlen);
    pattern->cpat[patlen]=0;
    UpperSequence(pattern->cpat);

    if (!CheckPattern(pattern))
        failure = "Error in pattern checking";
    else if (! EncodePattern(pattern, dna))
        failure = "Error in pattern encoding";
    else if (! CreateS(pattern, ALPHA_LEN))
        failure = "Error in pattern compiling";

    if (failure) {
        /* the rejected pattern used to be leaked */
        ECOFREE(pattern,"Freeing rejected pattern",errno,errmsg);
        ECOERROR(ECO_ASSERT_ERROR,failure,errno,errmsg);
    }

    return pattern;
}
/*
 * Build the reverse complement of a compiled pattern.
 *
 * A new pattern is allocated with the same single-block layout as in
 * buildPattern; hasIndel, maxerr and patlen are copied from pat, the
 * text is copied and reverse-complemented with ecoComplementPattern,
 * then checked, encoded and compiled again.
 *
 * Returns the new pattern, or NULL when pat is NULL, when the
 * allocation fails or when the complemented text is rejected; nothing
 * is left allocated in that case. pat itself is never modified.
 */
PatternPtr complementPattern(PatternPtr pat, int *errno,
                             char **errmsg)
{
    PatternPtr pattern;
    size_t cpatlen;
    const char *failure = NULL;

    if (!pat || !pat->cpat)
        ECOERROR(ECO_ASSERT_ERROR,"Cannot complement a NULL pattern",errno,errmsg);

    cpatlen = strlen(pat->cpat);

    pattern = ECOMALLOC(sizeof(Pattern) +
                        sizeof(char) * cpatlen + 1 +
                        sizeof(uint32_t) * pat->patlen +
                        sizeof(patword_t) * ALPHA_LEN,
                        "Error in pattern allocation",
                        errno,errmsg);
    if (!pattern)
        return NULL;

    pattern->ok = true;
    pattern->hasIndel= pat->hasIndel;
    pattern->maxerr = pat->maxerr;
    pattern->patlen = pat->patlen;

    /* lay the three variable sized parts out behind the structure */
    pattern->cpat = (char*)pattern + sizeof(Pattern);
    pattern->patcode = (uint32_t*)(pattern->cpat + cpatlen + 1);
    pattern->smat = (patword_t*)(pattern->patcode + pat->patlen);

    strcpy(pattern->cpat,pat->cpat);
    ecoComplementPattern(pattern->cpat);

    if (!CheckPattern(pattern))
        failure = "Error in pattern checking";
    else if (! EncodePattern(pattern, dna))
        failure = "Error in pattern encoding";
    else if (! CreateS(pattern, ALPHA_LEN))
        failure = "Error in pattern compiling";

    if (failure) {
        /* the rejected pattern used to be leaked */
        ECOFREE(pattern,"Freeing rejected pattern",errno,errmsg);
        ECOERROR(ECO_ASSERT_ERROR,failure,errno,errmsg);
    }

    return pattern;
}

134
pkg/obiapat/obiapat.h Normal file
View File

@ -0,0 +1,134 @@
#ifndef __obiapat_h__
#define __obiapat_h__
#include <stdio.h>
#include <stdint.h>
#include "apat.h"
/*****************************************************
*
* Data type declarations
*
*****************************************************/
/*
*
* Sequence types
*
*/
/* Sequence record with fixed-size header fields followed by a       */
/* variable-length payload starting at data.                         */
/* NOTE(review): nothing in this file reads or writes this type; the */
/* field meanings below are inferred from their names -- confirm.    */
typedef struct {
int32_t taxid; /* presumably a taxonomic identifier */
char AC[20]; /* presumably the accession, at most 20 bytes */
int32_t DE_length; /* presumably the length of the description */
int32_t SQ_length; /* presumably the length of the sequence */
int32_t CSQ_length; /* presumably the length of a compressed sequence -- TODO confirm */
char data[1]; /* first byte of the trailing payload */
} ecoseqformat_t;
/* Sequence record holding its text fields through pointers.         */
/* NOTE(review): ownership of AC, DE and SQ is not visible from this */
/* header; see new_ecoseq_with_data / delete_ecoseq.                 */
typedef struct {
int32_t taxid; /* presumably a taxonomic identifier */
int32_t SQ_length; /* presumably the length of SQ */
char *AC; /* presumably the accession */
char *DE; /* presumably the description */
char *SQ; /* presumably the sequence */
} ecoseq_t;
/*****************************************************
*
* Function declarations
*
*****************************************************/
void* ecoError(int error,
const char* message,
const char * filename,
int linenumber,
int *errno,
char **error_msg);
/* Record an error through ecoError, stamped with the current file   */
/* and line, and make the ENCLOSING function return its NULL result. */
/* Only usable inside functions returning a pointer.                 */
/* Wrapped in do { } while (0) so that "ECOERROR(...);" is a single  */
/* statement, safe in an unbraced if / else.                         */
#define ECOERROR(code,message,errno,errmsg) \
    do { return ecoError((code),(message),__FILE__,__LINE__,(errno),(errmsg)); } while (0)
#define ECO_IO_ERROR (1)
#define ECO_MEM_ERROR (2)
#define ECO_ASSERT_ERROR (3)
#define ECO_NOTFOUND_ERROR (4)
/*
*
* Low level system functions
*
*/
int32_t is_big_endian();
int32_t swap_int32_t(int32_t);
void *eco_malloc(int32_t chunksize,
const char *error_message,
const char *filename,
int32_t line,
int *errno, char **errmsg);
void *eco_realloc(void *chunk,
int32_t chunksize,
const char *error_message,
const char *filename,
int32_t line,
int *errno, char **errmsg);
void eco_free(void *chunk,
const char *error_message,
const char *filename,
int32_t line,
int *errno, char **errmsg);
void eco_trace_memory_allocation();
void eco_untrace_memory_allocation();
#define ECOMALLOC(size,error_message,errno,errmsg) \
eco_malloc((size),(error_message),__FILE__,__LINE__,errno,errmsg)
#define ECOREALLOC(chunk,size,error_message,errno,errmsg) \
eco_realloc((chunk),(size),(error_message),__FILE__,__LINE__,errno,errmsg)
#define ECOFREE(chunk,error_message,errno,errmsg) \
eco_free((chunk),(error_message),__FILE__,__LINE__,errno,errmsg)
ecoseq_t *new_ecoseq();
int32_t delete_ecoseq(ecoseq_t *);
ecoseq_t *new_ecoseq_with_data( char *AC,
char *DE,
char *SQ,
int32_t taxid
);
int32_t delete_apatseq(SeqPtr pseq,
int *errno, char **errmsg);
PatternPtr buildPattern(const char *pat, int32_t error_max, int *errno, char **errmsg);
PatternPtr complementPattern(PatternPtr pat, int *errno, char **errmsg);
SeqPtr new_apatseq(const char *in,int32_t circular, int32_t seqlen,
SeqPtr out,
int *errno, char **errmsg);
char *ecoComplementPattern(char *nucAcSeq);
char *ecoComplementSequence(char *nucAcSeq);
char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end,
int *errno, char **errmsg);
#endif /* __obiapat_h__ */

168
pkg/obiapat/pattern.go Normal file
View File

@ -0,0 +1,168 @@
package obiapat
/*
#cgo CFLAGS: -g -Wall
#include <stdlib.h>
#include "obiapat.h"
*/
import "C"
import (
"errors"
"unsafe"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
)
// MAX_PAT_LEN is the C constant MAX_PAT_LEN exposed as a Go int.
var MAX_PAT_LEN = int(C.MAX_PAT_LEN)
// ApatPattern wraps a pointer to a C Pattern structure. The memory is
// owned by C and is released by Free, not by the garbage collector.
type ApatPattern struct {
pointer *C.Pattern
}
// ApatSequence wraps a pointer to a C Seq structure. The memory is
// owned by C and is released by Free, not by the garbage collector.
type ApatSequence struct {
pointer *C.Seq
}
// NilApatPattern is the ApatPattern holding no C structure; it is
// returned by MakeApatPattern together with an error.
var NilApatPattern = ApatPattern{nil}
// NilApatSequence is the ApatSequence holding no C structure; it is
// returned by MakeApatSequence together with an error.
var NilApatSequence = ApatSequence{nil}
// MakeApatPattern compiles pattern with the C function buildPattern,
// passing errormax as its error_max argument.
//
// On failure NilApatPattern is returned with an error carrying the
// message produced by the C side. On success the pattern must be
// released with Free.
func MakeApatPattern(pattern string, errormax int) (ApatPattern, error) {
	var (
		errno  C.int32_t
		errmsg *C.char
	)

	cpattern := C.CString(pattern)
	defer C.free(unsafe.Pointer(cpattern))

	ap := C.buildPattern(cpattern, C.int32_t(errormax), &errno, &errmsg)
	if ap != nil {
		return ApatPattern{pointer: ap}, nil
	}

	// The message is allocated by C: copy it to a Go string (done when
	// the return value is evaluated), then release it.
	defer C.free(unsafe.Pointer(errmsg))
	return NilApatPattern, errors.New(C.GoString(errmsg))
}
// ReverseComplement returns a new pattern built by the C function
// complementPattern from the receiver. The receiver is left untouched
// and both patterns must be released with Free.
//
// An error is returned when the receiver holds no C pattern (it used to
// be handed to C as is) or when the C side reports a failure.
func (pattern ApatPattern) ReverseComplement() (ApatPattern, error) {
	if pattern.pointer == nil {
		return ApatPattern{}, errors.New("cannot reverse complement a nil pattern")
	}

	var errno C.int32_t
	var errmsg *C.char

	ap := C.complementPattern(pattern.pointer, &errno, &errmsg)
	if ap == nil {
		message := C.GoString(errmsg)
		C.free(unsafe.Pointer(errmsg))
		return ApatPattern{}, errors.New(message)
	}

	return ApatPattern{pointer: ap}, nil
}
// String returns a Go copy of the cpat member of the C pattern.
func (p ApatPattern) String() string {
	text := p.pointer.cpat
	return C.GoString(text)
}
// Length returns the patlen member of the C pattern.
func (p ApatPattern) Length() int {
	patlen := p.pointer.patlen
	return int(patlen)
}
// Free releases the C memory of the pattern with C.free. The pattern
// must not be used afterwards.
func (p ApatPattern) Free() {
	raw := unsafe.Pointer(p.pointer)
	C.free(raw)
}
// Print hands the pattern to the C function PrintDebugPattern.
func (p ApatPattern) Print() {
	cpat := C.PatternPtr(p.pointer)
	C.PrintDebugPattern(cpat)
}
// MakeApatSequence copies sequence into C memory and wraps it in a C Seq
// structure built by new_apatseq.
//
// circular tells the C side to handle the sequence as a circular one.
// When recycle is given, the C structure of its first element is reused
// instead of allocating a new one; the buffer holding the sequence it
// was previously built from is then released.
//
// On failure NilApatSequence is returned with an error carrying the
// message produced by the C side, and the copy made for this call is
// released.
func MakeApatSequence(sequence obiseq.BioSequence, circular bool, recycle ...ApatSequence) (ApatSequence, error) {
	var errno C.int32_t
	var errmsg *C.char

	seqlen := sequence.Length()
	p := C.malloc(C.size_t(seqlen) + 1)

	ic := 0
	if circular {
		ic = 1
	}

	// copy the data into the buffer, by converting it to a Go array;
	// the copy is bounded to the seqlen bytes that were allocated
	cBuf := (*[1 << 30]byte)(p)
	copy(cBuf[:seqlen], sequence.Sequence())
	cBuf[seqlen] = 0

	var out *C.Seq
	var previous *C.char
	if len(recycle) > 0 && recycle[0].pointer != nil {
		out = recycle[0].pointer
		// NOTE(review): assumes the cseq of a recycled structure was
		// allocated by an earlier call to this function -- confirm that
		// no other code builds ApatSequence values.
		previous = out.cseq
	}

	pseq := C.new_apatseq((*C.char)(p), C.int32_t(ic), C.int32_t(seqlen),
		out,
		&errno, &errmsg)

	if pseq == nil {
		message := C.GoString(errmsg)
		C.free(unsafe.Pointer(errmsg))
		// new_apatseq stores the buffer only on success
		C.free(p)
		return NilApatSequence, errors.New(message)
	}

	// the structure now references the new buffer: the old one used to
	// be leaked on every recycling
	if previous != nil {
		C.free(unsafe.Pointer(previous))
	}

	return ApatSequence{pointer: pseq}, nil
}
// Length returns the seqlen member of the C sequence structure.
func (s ApatSequence) Length() int {
	seqlen := s.pointer.seqlen
	return int(seqlen)
}
// Free releases the C memory held by the sequence: the buffer holding
// the copy of the sequence text, which delete_apatseq does not release,
// then the structure itself through delete_apatseq.
//
// Calling Free on a sequence holding no C structure does nothing. The
// receiver is a copy, so the caller's value cannot be reset here: it
// must simply not be used after Free.
func (sequence ApatSequence) Free() {
	if sequence.pointer == nil {
		return
	}

	var errno C.int32_t
	var errmsg *C.char

	// NOTE(review): assumes cseq was allocated with C.malloc by
	// MakeApatSequence -- confirm that no other code builds
	// ApatSequence values.
	C.free(unsafe.Pointer(sequence.pointer.cseq))

	C.delete_apatseq(sequence.pointer,
		&errno, &errmsg)
}
// FindAllIndex runs the C function ManberAll for the pattern on sequence
// and returns one [start, end, errors] triple per reported hit, where
// end is start plus the pattern length.
//
// limits optionally gives the begin position (default 0) and the length
// (default sequence.Length()) handed to ManberAll; MAX_PAT_LEN is added
// to the length before the call.
//
// nil is returned when no hit is reported.
func (pattern ApatPattern) FindAllIndex(sequence ApatSequence, limits ...int) (loc [][3]int) {
	begin := 0
	length := sequence.Length()

	if len(limits) > 0 {
		begin = limits[0]
	}
	if len(limits) > 1 {
		length = limits[1]
	}

	nhits := int(C.ManberAll(sequence.pointer,
		pattern.pointer,
		0,
		C.int32_t(begin),
		C.int32_t(length+C.MAX_PAT_LEN)))

	if nhits <= 0 {
		return nil
	}

	// view the C hit stacks (positions and error counts) of pattern
	// slot 0 as Go arrays
	stktmp := (*[1 << 30]int32)(unsafe.Pointer(sequence.pointer.hitpos[0].val))
	errtmp := (*[1 << 30]int32)(unsafe.Pointer(sequence.pointer.hiterr[0].val))
	patlen := int(pattern.pointer.patlen)

	// the number of hits is known: allocate the result once
	loc = make([][3]int, nhits)
	for i := range loc {
		start := int(stktmp[i])
		loc[i] = [3]int{start, start + patlen, int(errtmp[i])}
	}

	return loc
}

370
pkg/obiapat/pcr.go Normal file
View File

@ -0,0 +1,370 @@
package obiapat
import (
"git.metabarcoding.org/lecasofts/go/oa2/pkg/goutils"
"git.metabarcoding.org/lecasofts/go/oa2/pkg/obiseq"
)
// __options__ holds the values of every PCR option. It is always
// handled through the Options wrapper.
type __options__ struct {
// lower bound accepted for an amplicon length, 0 disables the check
min_length int
// upper bound accepted for an amplicon length, 0 disables the check
max_length int
// handle the sequences as circular
circular bool
// error count handed to MakeApatPattern for the forward primer
forward_error int
// error count handed to MakeApatPattern for the reverse primer
reverse_error int
// not read in this file; defaults to 100 in MakeOptions
buffer_size int
// not read in this file; defaults to 100 in MakeOptions
batch_size int
// not read in this file; defaults to 4 in MakeOptions
parallel_workers int
}
// Options is a handle on a shared set of PCR options: copies of an
// Options value designate the same underlying values, which is what
// lets WithOption setters modify them.
type Options struct {
pointer *__options__
}
// WithOption is a setter modifying the option set it is applied to.
type WithOption func(Options)
// MinLength returns the min_length value of the option set.
func (opt Options) MinLength() int {
	return opt.pointer.min_length
}
// MaxLength returns the max_length value of the option set.
func (opt Options) MaxLength() int {
	return opt.pointer.max_length
}
// ForwardError returns the forward_error value of the option set.
func (opt Options) ForwardError() int {
	return opt.pointer.forward_error
}
// ReverseError returns the reverse_error value of the option set.
func (opt Options) ReverseError() int {
	return opt.pointer.reverse_error
}
// Circular returns the circular flag of the option set.
func (opt Options) Circular() bool {
	return opt.pointer.circular
}
// BufferSize returns the buffer_size value of the option set.
func (opt Options) BufferSize() int {
return opt.pointer.buffer_size
}
// BatchSize returns the batch_size value of the option set.
func (opt Options) BatchSize() int {
return opt.pointer.batch_size
}
// ParallelWorkers returns the parallel_workers value of the option set.
func (opt Options) ParallelWorkers() int {
return opt.pointer.parallel_workers
}
// MakeOptions builds an option set holding the default values (no
// length bound, no error allowed, linear sequences, 4 workers, batches
// and buffers of 100) and applies the setters to it, in order.
func MakeOptions(setters []WithOption) Options {
	opt := Options{
		pointer: &__options__{
			min_length:       0,
			max_length:       0,
			forward_error:    0,
			reverse_error:    0,
			circular:         false,
			parallel_workers: 4,
			batch_size:       100,
			buffer_size:      100,
		},
	}

	for i := range setters {
		setters[i](opt)
	}

	return opt
}
// OptionMinLength returns a setter storing min_length in the option set
// it is applied to.
func OptionMinLength(min_length int) WithOption {
	return func(opt Options) {
		opt.pointer.min_length = min_length
	}
}
// OptionMaxLength returns a setter storing max_length in the option set
// it is applied to.
func OptionMaxLength(max_length int) WithOption {
	return func(opt Options) {
		opt.pointer.max_length = max_length
	}
}
// OptionForwardError returns a setter storing max as the forward_error
// value of the option set it is applied to.
func OptionForwardError(max int) WithOption {
	return func(opt Options) {
		opt.pointer.forward_error = max
	}
}
// OptionReverseError returns a setter storing max as the reverse_error
// value of the option set it is applied to.
func OptionReverseError(max int) WithOption {
	return func(opt Options) {
		opt.pointer.reverse_error = max
	}
}
// OptionCircular returns a setter storing the circular flag in the
// option set it is applied to.
func OptionCircular(circular bool) WithOption {
	return func(opt Options) {
		opt.pointer.circular = circular
	}
}
// OptionBufferSize returns a setter storing size as the buffer_size
// value of the option set it is applied to.
func OptionBufferSize(size int) WithOption {
	return func(opt Options) {
		opt.pointer.buffer_size = size
	}
}
// OptionParallelWorkers returns a setter storing nworkers as the
// parallel_workers value of the option set it is applied to.
func OptionParallelWorkers(nworkers int) WithOption {
	return func(opt Options) {
		opt.pointer.parallel_workers = nworkers
	}
}
// OptionBatchSize returns a setter storing size as the batch_size value
// of the option set it is applied to.
func OptionBatchSize(size int) WithOption {
	return func(opt Options) {
		opt.pointer.batch_size = size
	}
}
// __pcr__ extracts from sequence every amplicon delimited by the two
// primers.
//
// seq is the C-encoded form of sequence; forward and reverse are the
// compiled primers, cfwd and crev their reverse complements.
//
// Two passes are made. The first one pairs matches of forward with
// matches of crev located downstream; the second one pairs matches of
// reverse with matches of cfwd and reverse-complements the amplicon so
// that it is reported in the orientation of the forward primer.
//
// An amplicon is kept when its length is strictly positive (primers
// neither touch nor overlap) and fits the MinLength/MaxLength options, a
// bound of 0 disabling the corresponding check. Each amplicon carries a
// copy of the annotations of sequence plus the primers, the matched
// sites and their error counts.
//
// Fixes over the previous version:
//   - the window searched for the second primer was derived from the
//     ERROR COUNT of the last first-primer match (index 2) instead of
//     its end position (index 1);
//   - the second pass sized that window and the circular span with the
//     length of the wrong primer;
//   - the second pass discarded the result of ReverseComplement on the
//     forward match.
//
// NOTE(review): errors returned by Subsequence are still ignored, as
// before.
func __pcr__(seq ApatSequence, sequence obiseq.BioSequence,
	forward, cfwd, reverse, crev ApatPattern,
	opt Options) obiseq.BioSequenceSlice {
	results := make(obiseq.BioSequenceSlice, 0, 10)

	seqlen := seq.Length()
	circular := opt.Circular()

	// window returns the region to scan for the second primer, given
	// the matches of the first one and the length of the second one.
	window := func(matches [][3]int, partnerLen int) (begin, length int) {
		if circular {
			return 0, seqlen + MAX_PAT_LEN
		}
		begin = matches[0][0]
		length = seqlen - begin
		if opt.MaxLength() > 0 {
			// end of the last match + longest amplicon + second primer
			last := matches[len(matches)-1]
			length = last[1] - begin + opt.MaxLength() + partnerLen
		}
		return begin, length
	}

	// span returns the length of the amplicon lying between the end of
	// fm and the start of rm, 0 when rm is not usable with fm.
	span := func(fm, rm [3]int) int {
		switch {
		case rm[1] > fm[0]:
			return rm[0] - fm[1]
		case circular:
			// rm lies before fm: the amplicon crosses the origin
			return rm[0] + seqlen - fm[1]
		default:
			return 0
		}
	}

	// accept tells whether an amplicon length passes the filters.
	accept := func(length int) bool {
		return length > 0 && // For when primers touch or overlap
			(opt.MinLength() == 0 || length >= opt.MinLength()) &&
			(opt.MaxLength() == 0 || length <= opt.MaxLength())
	}

	// First pass: forward primer on the given strand.
	if forwardMatches := forward.FindAllIndex(seq); forwardMatches != nil {
		begin, length := window(forwardMatches, reverse.Length())

		if reverseMatches := crev.FindAllIndex(seq, begin, length); reverseMatches != nil {
			for _, fm := range forwardMatches {
				// matches starting in the circular extension duplicate
				// matches found at the beginning of the sequence
				if fm[0] >= seqlen {
					continue
				}
				erri := fm[2]

				for _, rm := range reverseMatches {
					if rm[0] >= seqlen || !accept(span(fm, rm)) {
						continue
					}
					errj := rm[2]

					amplicon, _ := sequence.Subsequence(fm[1], rm[0], circular)
					annot := amplicon.Annotations()
					goutils.CopyMap(annot, sequence.Annotations())

					annot["forward_primer"] = forward.String()
					match, _ := sequence.Subsequence(fm[0], fm[1], circular)
					annot["forward_match"] = match.String()
					match.Revoke()
					annot["forward_error"] = erri

					annot["reverse_primer"] = reverse.String()
					match, _ = sequence.Subsequence(rm[0], rm[1], circular)
					match = match.ReverseComplement(true)
					annot["reverse_match"] = match.String()
					match.Revoke()
					annot["reverse_error"] = errj

					results = append(results, amplicon)
				}
			}
		}
	}

	// Second pass: reverse primer on the given strand, i.e. forward
	// primer on the opposite strand.
	if reverseMatches := reverse.FindAllIndex(seq); reverseMatches != nil {
		begin, length := window(reverseMatches, forward.Length())

		if forwardMatches := cfwd.FindAllIndex(seq, begin, length); forwardMatches != nil {
			for _, fm := range reverseMatches {
				if fm[0] >= seqlen {
					continue
				}
				erri := fm[2]

				for _, rm := range forwardMatches {
					if rm[0] >= seqlen || !accept(span(fm, rm)) {
						continue
					}
					errj := rm[2]

					amplicon, _ := sequence.Subsequence(fm[1], rm[0], circular)
					amplicon = amplicon.ReverseComplement(true)
					annot := amplicon.Annotations()
					goutils.CopyMap(annot, sequence.Annotations())

					annot["forward_primer"] = forward.String()
					match, _ := sequence.Subsequence(rm[0], rm[1], circular)
					match = match.ReverseComplement(true)
					annot["forward_match"] = match.String()
					match.Revoke()
					annot["forward_error"] = errj

					annot["reverse_primer"] = reverse.String()
					match, _ = sequence.Subsequence(fm[0], fm[1], circular)
					annot["reverse_match"] = match.String()
					match.Revoke()
					annot["reverse_error"] = erri

					results = append(results, amplicon)
				}
			}
		}
	}

	return results
}
// PCR returns the amplicons obtained from sequence with the primers
// forward and reverse, according to options.
//
// The signature offers no way to report an error, so PCR panics with an
// explicit message when a primer cannot be compiled or when the
// sequence cannot be encoded. These failures were previously ignored and
// ended in a nil dereference on the C side.
func PCR(sequence obiseq.BioSequence,
	forward, reverse string, options ...WithOption) obiseq.BioSequenceSlice {
	opt := MakeOptions(options)

	fwd, err := MakeApatPattern(forward, opt.ForwardError())
	if err != nil {
		panic("obiapat: cannot compile forward primer " + forward + ": " + err.Error())
	}
	defer fwd.Free()

	rev, err := MakeApatPattern(reverse, opt.ReverseError())
	if err != nil {
		panic("obiapat: cannot compile reverse primer " + reverse + ": " + err.Error())
	}
	defer rev.Free()

	cfwd, err := fwd.ReverseComplement()
	if err != nil {
		panic("obiapat: cannot reverse complement forward primer " + forward + ": " + err.Error())
	}
	defer cfwd.Free()

	crev, err := rev.ReverseComplement()
	if err != nil {
		panic("obiapat: cannot reverse complement reverse primer " + reverse + ": " + err.Error())
	}
	defer crev.Free()

	seq, err := MakeApatSequence(sequence, opt.Circular())
	if err != nil {
		panic("obiapat: cannot encode sequence: " + err.Error())
	}
	defer seq.Free()

	return __pcr__(seq, sequence,
		fwd, cfwd, rev, crev,
		opt)
}
// PCRSlice returns the amplicons obtained from every sequence of
// sequences with the primers forward and reverse, according to options.
// The primers are compiled once and a single C sequence structure is
// recycled from one sequence to the next.
//
// The signature offers no way to report an error, so PCRSlice panics
// with an explicit message when a primer cannot be compiled or when a
// sequence cannot be encoded. These failures were previously ignored and
// ended in a nil dereference on the C side.
func PCRSlice(sequences obiseq.BioSequenceSlice,
	forward, reverse string, options ...WithOption) obiseq.BioSequenceSlice {
	results := make(obiseq.BioSequenceSlice, 0, len(sequences))

	opt := MakeOptions(options)

	fwd, err := MakeApatPattern(forward, opt.ForwardError())
	if err != nil {
		panic("obiapat: cannot compile forward primer " + forward + ": " + err.Error())
	}
	defer fwd.Free()

	rev, err := MakeApatPattern(reverse, opt.ReverseError())
	if err != nil {
		panic("obiapat: cannot compile reverse primer " + reverse + ": " + err.Error())
	}
	defer rev.Free()

	cfwd, err := fwd.ReverseComplement()
	if err != nil {
		panic("obiapat: cannot reverse complement forward primer " + forward + ": " + err.Error())
	}
	defer cfwd.Free()

	crev, err := rev.ReverseComplement()
	if err != nil {
		panic("obiapat: cannot reverse complement reverse primer " + reverse + ": " + err.Error())
	}
	defer crev.Free()

	// seq starts without any C structure: the first call to
	// MakeApatSequence allocates one, the following calls recycle it.
	var seq ApatSequence
	defer func() {
		if seq.pointer != nil {
			seq.Free()
		}
	}()

	for _, sequence := range sequences {
		next, err := MakeApatSequence(sequence, opt.Circular(), seq)
		if err != nil {
			panic("obiapat: cannot encode sequence: " + err.Error())
		}
		seq = next

		amplicons := __pcr__(seq, sequence,
			fwd, cfwd, rev, crev,
			opt)
		results = append(results, amplicons...)
	}

	return results
}
// PCRSliceWorker returns a worker applying PCRSlice, with the given
// primers and options, to every slice of sequences it receives.
func PCRSliceWorker(forward, reverse string,
	options ...WithOption) obiseq.SeqSliceWorker {
	return func(sequences obiseq.BioSequenceSlice) obiseq.BioSequenceSlice {
		return PCRSlice(sequences, forward, reverse, options...)
	}
}