Change the package path

This commit is contained in:
2018-02-20 06:40:29 +11:00
parent 0450ebf427
commit 51f152cca4
48 changed files with 0 additions and 3 deletions

26
src/ecoError.c Normal file
View File

@@ -0,0 +1,26 @@
#include "ecoPCR.h"
#include <stdio.h>
#include <stdlib.h>
/**
 * Report a fatal error on stderr and terminate the process.
 *
 * The ECOERROR macro calls this function with __FILE__ and __LINE__
 * as the last two arguments.
 *
 * @param error       numeric error code
 * @param message     text explaining what went wrong
 * @param filename    source file in which the failure was detected
 * @param linenumber  source line at which the failure was detected
 *
 * This function never returns: it ends with abort().
 */
void ecoError(int32_t error,
              const char* message,
              const char * filename,
              int linenumber)
{
    fprintf(stderr, "Error %d in file %s line %d : %s\n", error, filename, linenumber, message);

    abort();
}

122
src/ecoIOUtils.c Normal file
View File

@@ -0,0 +1,122 @@
#include "ecoPCR.h"
#include <stdio.h>
#include <stdlib.h>
/*
 * Reverse the byte order of a 32-bit value: each of the four shifted
 * copies of x is masked so that exactly one byte survives, and the
 * four bytes are OR-ed back together.
 * The argument is evaluated four times, so it must have no side effects.
 */
#define SWAPINT32(x) ((((x) << 24) & 0xFF000000) | (((x) << 8) & 0xFF0000) | \
(((x) >> 8) & 0xFF00) | (((x) >> 24) & 0xFF))
/*
 * Probe the host byte order by looking at the lowest-addressed byte of
 * the 32-bit integer 1.
 *
 * @return the value of that first byte: 1 when the least significant
 *         byte is stored first, 0 otherwise
 *
 * NOTE(review): the result is therefore 1 on little-endian hosts, which
 * is the opposite of what the name suggests. Callers byte-swap when the
 * result is non-zero, so presumably the data files are big-endian --
 * confirm before renaming or inverting this function.
 */
int32_t is_big_endian()
{
    const int32_t probe = 1;
    const char *first_byte = (const char *)&probe;

    return (int32_t)*first_byte;
}
/*
 * Reverse the byte order of a 32-bit integer.
 *
 * @param i  value to convert
 * @return   i with its four bytes in the opposite order
 */
int32_t swap_int32_t(int32_t i)
{
    /* Work on the unsigned representation: left-shifting a signed value
       into or past the sign bit is undefined behaviour. */
    uint32_t u = (uint32_t)i;

    u = ((u << 24) & 0xFF000000u) |
        ((u <<  8) & 0x00FF0000u) |
        ((u >>  8) & 0x0000FF00u) |
        ((u >> 24) & 0x000000FFu);

    return (int32_t)u;
}
/**
 * Read the next length-prefixed record from a database file.
 *
 * A record is a 32-bit size followed by that many payload bytes. The
 * size is byte-swapped when is_big_endian() reports non-zero.
 * The payload is stored in a buffer owned by this function and reused
 * by every call: the returned pointer is only valid until the next
 * call and must not be freed by the caller.
 *
 * @param f           the open database file
 * @param recordSize  receives the payload size in bytes; must not be NULL
 *
 * @return pointer to the payload, or NULL when end of file is reached
 *         while reading the size field
 */
void *read_ecorecord(FILE *f,int32_t *recordSize)
{
    static void    *buffer     = NULL;
    static int32_t  buffersize = 0;   /* capacity of buffer, kept across calls */
    size_t          nread;

    if (!recordSize)
        ECOERROR(ECO_ASSERT_ERROR,
                 "recordSize cannot be NULL");

    nread = fread(recordSize,
                  1,
                  sizeof(int32_t),
                  f);

    if (feof(f))
        return NULL;

    if (nread != sizeof(int32_t))
        ECOERROR(ECO_IO_ERROR,"Reading record size error");

    if (is_big_endian())
        *recordSize=swap_int32_t(*recordSize);

    /* A negative size would become a huge size_t once handed to fread(). */
    if (*recordSize < 0)
        ECOERROR(ECO_IO_ERROR,"Reading record size error");

    /* Grow the shared buffer only when the record does not fit. */
    if (buffersize < *recordSize)
    {
        if (buffer)
            buffer = ECOREALLOC(buffer,*recordSize,
                                "Increase size of record buffer");
        else
            buffer = ECOMALLOC(*recordSize,
                               "Allocate record buffer");

        buffersize = *recordSize;
    }

    nread = fread(buffer,
                  1,
                  (size_t)*recordSize,
                  f);

    if (nread != (size_t)*recordSize)
        ECOERROR(ECO_IO_ERROR,"Reading record data error");

    return buffer;
}
/**
 * Open a record database and read its leading 32-bit record count.
 *
 * @param filename             path of the database file
 * @param sequencecount        receives the count stored at the start of
 *                             the file (set to 0 when the file cannot be
 *                             opened and abort_on_open_error is 0)
 * @param abort_on_open_error  non-zero: a failed fopen() is reported
 *                             through ECOERROR; zero: NULL is returned
 *
 * @return the open file, positioned just after the count, or NULL
 **/
FILE *open_ecorecorddb(const char *filename,
                       int32_t    *sequencecount,
                       int32_t     abort_on_open_error)
{
    FILE    *db = fopen(filename,"rb");
    int32_t  got;

    if (db == NULL)
    {
        if (abort_on_open_error)
            ECOERROR(ECO_IO_ERROR,"Cannot open file");
        else
        {
            *sequencecount=0;
            return NULL;
        }
    }

    got = fread(sequencecount, 1, sizeof(int32_t), db);

    if (got != sizeof(int32_t))
        ECOERROR(ECO_IO_ERROR,"Reading record size error");

    /* Same byte-order convention as the record sizes themselves. */
    if (is_big_endian())
        *sequencecount=swap_int32_t(*sequencecount);

    return db;
}

79
src/ecoMalloc.c Normal file
View File

@@ -0,0 +1,79 @@
#include "ecoPCR.h"
#include <stdlib.h>
/* Non-zero while the allocation wrappers of this file log to stderr. */
static int eco_log_malloc = 0;

/* Store the requested tracing state. */
static void eco_set_memory_tracing(int enabled)
{
    eco_log_malloc = enabled;
}

/* Switch on the logging done by eco_malloc, eco_realloc and eco_free. */
void eco_trace_memory_allocation()
{
    eco_set_memory_tracing(1);
}

/* Switch that logging off again. */
void eco_untrace_memory_allocation()
{
    eco_set_memory_tracing(0);
}
/*
 * Allocate a zero-filled memory block, or end the program through
 * ecoError(ECO_MEM_ERROR, ...) when that is not possible.
 * Usually called through the ECOMALLOC macro.
 *
 * @param chunksize      number of bytes requested
 * @param error_message  message reported if the allocation fails
 * @param filename       call-site file, used in error and trace output
 * @param line           call-site line, used in error and trace output
 *
 * @return pointer to the new block
 */
void *eco_malloc(int32_t chunksize,
                 const char *error_message,
                 const char *filename,
                 int32_t line)
{
    void *chunk = NULL;

    /* A negative size would turn into a huge size_t and is treated as a
       failure. A zero size is rounded up to one byte because calloc() may
       legitimately return NULL for it, which would look like an
       out-of-memory condition. */
    if (chunksize >= 0)
        chunk = calloc(1, chunksize > 0 ? (size_t)chunksize : 1);

    if (!chunk)
        ecoError(ECO_MEM_ERROR,error_message,filename,line);

    if (eco_log_malloc)
        fprintf(stderr,
                "Memory segment located at %p of size %d is allocated (file : %s [%d])\n",
                chunk,
                chunksize,
                filename,
                line);

    return chunk;
}
/*
 * Resize a memory block, or end the program through
 * ecoError(ECO_MEM_ERROR, ...) when that is not possible.
 * Usually called through the ECOREALLOC macro.
 *
 * @param chunk          block to resize
 * @param newsize        new size in bytes
 * @param error_message  message reported if the reallocation fails
 * @param filename       call-site file, used in error and trace output
 * @param line           call-site line, used in error and trace output
 *
 * @return pointer to the resized block (may differ from chunk)
 */
void *eco_realloc(void *chunk,
                  int32_t newsize,
                  const char *error_message,
                  const char *filename,
                  int32_t line)
{
    void *newchunk = NULL;
    char  old_address[32] = "";

    /* Format the old address while the pointer is still valid: once
       realloc() has moved the block, its former value must not be used. */
    if (eco_log_malloc)
        snprintf(old_address, sizeof(old_address), "%p", chunk);

    /* Negative sizes are failures; a zero size is rounded up to one byte
       so that a NULL result always means "out of memory". */
    if (newsize >= 0)
        newchunk = realloc(chunk, newsize > 0 ? (size_t)newsize : 1);

    if (!newchunk)
        ecoError(ECO_MEM_ERROR,error_message,filename,line);

    if (eco_log_malloc)
        fprintf(stderr,
                "Old memory segment %s is reallocated at %p with a size of %d (file : %s [%d])\n",
                old_address,
                newchunk,
                newsize,
                filename,
                line);

    return newchunk;
}
/*
 * Release a memory block. Usually called through the ECOFREE macro.
 *
 * @param chunk          block to release
 * @param error_message  free text written to the trace output
 * @param filename       call-site file, used in trace output
 * @param line           call-site line, used in trace output
 */
void eco_free(void *chunk,
              const char *error_message,
              const char *filename,
              int32_t line)
{
    /* Log first: the pointer value may not be used once it is freed. */
    if (eco_log_malloc)
        fprintf(stderr,
                "Memory segment %p is released => %s (file : %s [%d])\n",
                chunk,
                error_message,
                filename,
                line);

    free(chunk);
}

283
src/ecoPCR.h Normal file
View File

@@ -0,0 +1,283 @@
#ifndef ECOPCR_H_
#define ECOPCR_H_
#include <stdio.h>
#include <inttypes.h>
#include <R.h>
#include <Rinternals.h>
#include <Rdefines.h>
//#ifndef H_apat
//#include "../libapat/apat.h"
//#endif
/*****************************************************
*
* Data type declarations
*
*****************************************************/
/*
*
* Sequence types
*
*/
/*
 * Raw sequence record. readnext_ecoseq() overlays this structure on the
 * buffer returned by read_ecorecord(), so the field order and sizes
 * describe the record bytes and must not be changed.
 */
typedef struct {
int32_t taxid;      /* copied to ecoseq_t.taxid */
char AC[20];        /* accession, read as a C string */
int32_t DE_length;  /* number of definition bytes at the start of data */
int32_t SQ_length;  /* sequence length; SQ_length+1 bytes are allocated for ecoseq_t.SQ */
int32_t CSQ_length; /* presumably the size of the compressed sequence; only referenced by a disabled uncompress() call */
char data[1];       /* DE_length definition bytes, followed by the compressed sequence */
} ecoseqformat_t;
/*
 * In-memory sequence. AC, DE and SQ are separately allocated strings
 * that delete_ecoseq() releases together with the structure.
 */
typedef struct {
int32_t taxid;      /* taxon reference copied from the record or the constructor argument */
int32_t SQ_length;  /* sequence length as stored in the record */
char *AC;           /* accession */
char *DE;           /* definition */
char *SQ;           /* sequence */
} ecoseq_t;
/*
*
* Taxonomy taxon types
*
*/
/*
 * Raw taxon record. readnext_ecotaxon() overlays this structure on the
 * buffer returned by read_ecorecord(), so the layout must not be changed.
 */
typedef struct {
int32_t taxid;      /* taxonomic identifier */
int32_t rank;       /* copied to ecotx_t.rank */
int32_t parent;     /* index of the parent in the taxon array (turned into a pointer by read_taxonomyidx) */
int32_t namelength; /* number of bytes in name */
char name[1];       /* namelength bytes of scientific name */
} ecotxformat_t;
/*
 * One node of the in-memory taxonomy tree.
 */
typedef struct ecotxnode {
int32_t taxid;            /* taxonomic identifier */
int32_t rank;             /* index into ecorankidx_t.label */
int32_t farest;           /* longest branch below this node, as computed by read_taxonomyidx (0 for a node that is nobody's parent) */
struct ecotxnode *parent; /* parent node; the tree walkers stop at a node that is its own parent */
char *name;               /* allocated scientific name */
} ecotx_t;
/*
 * Table of all taxa. The structure is allocated with room for count
 * entries: taxon[1] is only the first element of a longer array.
 */
typedef struct {
int32_t count;      /* number of entries in taxon */
int32_t maxtaxid;   /* largest taxid met while reading */
int32_t buffersize; /* set equal to count by read_taxonomyidx */
ecotx_t taxon[1];   /* the taxa; eco_findtaxonbytaxid runs bsearch() on taxid over this array */
} ecotxidx_t;
/*
*
* Taxonomy rank types
*
*/
/*
 * Table of rank labels, allocated with room for count pointers.
 * rank_index() looks labels up with bsearch(), which requires them to be
 * sorted -- presumably the rank file stores them in that order.
 */
typedef struct {
int32_t count;  /* number of labels */
char* label[1]; /* separately allocated label strings */
} ecorankidx_t;
/*
*
* Taxonomy name types
*
*/
/*
 * Raw name record. readnext_econame() overlays this structure on the
 * buffer returned by read_ecorecord(), so the layout must not be changed.
 */
typedef struct {
int32_t is_scientificname; /* copied to econame_t.is_scientificname */
int32_t namelength;        /* number of name bytes at the start of names */
int32_t classlength;       /* number of class-name bytes following the name */
int32_t taxid;             /* used as an index into ecotxidx_t.taxon */
char names[1];             /* name bytes immediately followed by class-name bytes */
} econameformat_t;
/*
 * One name attached to a taxon, as built by readnext_econame().
 */
typedef struct {
char *name;                /* allocated name string */
char *classname;           /* allocated class-name string */
int32_t is_scientificname; /* flag copied from the record */
struct ecotxnode *taxon;   /* points into the taxon array of the taxonomy */
} econame_t;
/*
 * Table of names, allocated with room for count entries: names[1] is
 * only the first element of a longer array.
 */
typedef struct {
int32_t count;      /* number of entries in names */
econame_t names[1]; /* the names */
} econameidx_t;
/*
 * A complete taxonomy as assembled by read_taxonomy().
 */
typedef struct {
ecorankidx_t *ranks;  /* rank labels */
econameidx_t *names;  /* names; NULL when read_taxonomy() is asked not to read them */
ecotxidx_t *taxons;   /* the taxa */
} ecotaxonomy_t;
/*****************************************************
*
* Function declarations
*
*****************************************************/
/*
*
* Low level system functions
*
*/
int32_t is_big_endian();
int32_t swap_int32_t(int32_t);
void *eco_malloc(int32_t chunksize,
const char *error_message,
const char *filename,
int32_t line);
void *eco_realloc(void *chunk,
int32_t chunksize,
const char *error_message,
const char *filename,
int32_t line);
void eco_free(void *chunk,
const char *error_message,
const char *filename,
int32_t line);
void eco_trace_memory_allocation();
void eco_untrace_memory_allocation();
/*
 * Allocation wrappers: each forwards its arguments to eco_malloc,
 * eco_realloc or eco_free and adds the call site (__FILE__, __LINE__).
 */
#define ECOMALLOC(size,error_message) \
eco_malloc((size),(error_message),__FILE__,__LINE__)
#define ECOREALLOC(chunk,size,error_message) \
eco_realloc((chunk),(size),(error_message),__FILE__,__LINE__)
#define ECOFREE(chunk,error_message) \
eco_free((chunk),(error_message),__FILE__,__LINE__)
/*
*
* Error management
*
*/
/* Arguments: error code, message, source file, source line. */
void ecoError(int32_t,const char*,const char *,int);
/* Calls ecoError with the call site filled in. */
#define ECOERROR(code,message) ecoError((code),(message),__FILE__,__LINE__)
/* Error codes passed as the first argument of ecoError. */
#define ECO_IO_ERROR (1)
#define ECO_MEM_ERROR (2)
#define ECO_ASSERT_ERROR (3)
#define ECO_NOTFOUND_ERROR (4)
/*
*
* Low level Disk access functions
*
*/
FILE *open_ecorecorddb(const char *filename,
int32_t *sequencecount,
int32_t abort_on_open_error);
void *read_ecorecord(FILE *,int32_t *recordSize);
/*
* Read function in internal binary format
*/
FILE *open_ecoseqdb(const char *filename,
int32_t *sequencecount);
ecoseq_t *readnext_ecoseq(FILE *);
ecorankidx_t *read_rankidx(const char *filename);
econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy);
/**
* Read taxonomy data as formated by the ecoPCRFormat.py script.
*
* This function is normally used internally by the read_taxonomy
* function and should not be called directly.
*
* @arg filename path to the *.tdx file of the reformated db
*
* @return pointer to a taxonomy index structure
*/
ecotxidx_t *read_taxonomyidx(const char *filename,const char *filename2);
ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName);
ecotx_t *eco_findtaxonatrank(ecotx_t *taxon, int32_t rankidx);
ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy, int32_t taxid);
int eco_isundertaxon(ecotx_t *taxon, int other_taxid);
ecoseq_t *ecoseq_iterator(const char *prefix);
ecoseq_t *new_ecoseq();
int32_t delete_ecoseq(ecoseq_t *);
ecoseq_t *new_ecoseq_with_data( char *AC,
char *DE,
char *SQ,
int32_t taxid
);
int32_t delete_taxon(ecotx_t *taxon);
int32_t delete_taxonomy(ecotxidx_t *index);
int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy);
int32_t rank_index(const char* label,ecorankidx_t* ranks);
//int32_t delete_apatseq(SeqPtr pseq);
//PatternPtr buildPattern(const char *pat, int32_t error_max);
//PatternPtr complementPattern(PatternPtr pat);
//
//SeqPtr ecoseq2apatseq(ecoseq_t *in,SeqPtr out,int32_t circular);
//char *ecoComplementPattern(char *nucAcSeq);
//char *ecoComplementSequence(char *nucAcSeq);
//char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end);
ecotx_t *eco_getspecies(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
ecotx_t *eco_getgenus(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
ecotx_t *eco_getfamily(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
ecotx_t *eco_getkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
//int eco_is_taxid_ignored(int32_t *ignored_taxid, int32_t tab_len, int32_t taxid);
//int eco_is_taxid_included(ecotaxonomy_t *taxonomy, int32_t *included_taxid, int32_t tab_len, int32_t taxid);
ecotaxonomy_t *getTaxPointer(SEXP Rtaxonomy);
#endif /*ECOPCR_H_*/

156
src/ecodna.c Normal file
View File

@@ -0,0 +1,156 @@
#include <string.h>
#include "ecoPCR.h"
/*
* @doc: DNA alphabet (IUPAC)
*/
#define LX_BIO_DNA_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZ#![]"
/*
* @doc: complementary DNA alphabet (IUPAC)
*/
#define LX_BIO_CDNA_ALPHA "TVGHEFCDIJMLKNOPQYSAABWXRZ#!]["
static char sNuc[] = LX_BIO_DNA_ALPHA;
static char sAnuc[] = LX_BIO_CDNA_ALPHA;
static char LXBioBaseComplement(char nucAc);
static char *LXBioSeqComplement(char *nucAcSeq);
static char *reverseSequence(char *str,char isPattern);
/* ---------------------------- */
/*
 * Complement one symbol: the symbol is looked up in sNuc and the
 * character at the same position of sAnuc is returned.
 * A symbol that is absent from sNuc is returned unchanged.
 */
char LXBioBaseComplement(char nucAc)
{
    const char *hit = strchr(sNuc, nucAc);

    if (hit == NULL)
        return nucAc;

    return sAnuc[hit - sNuc];
}
/* ---------------------------- */
/*
 * Complement every symbol of a NUL-terminated string in place
 * (no reversal is done here).
 *
 * @return nucAcSeq itself
 */
char *LXBioSeqComplement(char *nucAcSeq)
{
    size_t pos = 0;

    while (nucAcSeq[pos] != '\0')
    {
        nucAcSeq[pos] = LXBioBaseComplement(nucAcSeq[pos]);
        pos++;
    }

    return nucAcSeq;
}
/*
 * Reverse a NUL-terminated string in place.
 *
 * When isPattern is non-zero the modifier characters are then moved
 * back to the side of the symbol they were attached to before the
 * reversal:
 *   "#X"  becomes "X#"   ('#' follows its symbol)
 *   "X!"  becomes "!X"   ('!' precedes its symbol)
 *   "#X!" becomes "!X#"
 * A modifier that has no symbol to attach to (a '#' in last position or
 * a '!' in first position of the reversed string) is left where it is.
 *
 * @param str        string to reverse; NULL is returned unchanged
 * @param isPattern  non-zero to re-attach '#' and '!' as described
 *
 * @return str
 */
char *reverseSequence(char *str,char isPattern)
{
    char   *sb, *se, c;
    size_t  len;

    if (! str)
        return str;

    len = strlen(str);

    /* Nothing to reverse; also avoids forming the pointer str - 1. */
    if (len == 0)
        return str;

    /* Plain in-place reversal. */
    for (sb = str, se = str + len - 1; sb < se; sb++, se--)
    {
        c   = *sb;
        *sb = *se;
        *se = c;
    }

    if (! isPattern)
        return str;

    se = str + len - 1;

    /* The scan includes the last character so that a modifier ending
       the reversed string is handled like any other. */
    for (sb = str; sb <= se; sb++)
    {
        if (*sb=='#')
        {
            if (((se - sb) >= 2) && (*(sb+2)=='!'))
            {
                /* "#X!" -> "!X#" : the symbol in the middle stays put */
                *sb='!';
                sb+=2;
                *sb='#';
            }
            else if (sb < se)
            {
                /* "#X" -> "X#" */
                *sb=*(sb+1);
                sb++;
                *sb='#';
            }
        }
        else if ((*sb=='!') && (sb > str))
        {
            /* "X!" -> "!X" */
            *sb=*(sb-1);
            *(sb-1)='!';
        }
    }

    return str;
}
/*
 * Reverse-complement a pattern in place: the symbols are complemented,
 * then the string is reversed with pattern modifiers re-attached.
 *
 * @return nucAcSeq itself
 */
char *ecoComplementPattern(char *nucAcSeq)
{
    char *complemented = LXBioSeqComplement(nucAcSeq);

    return reverseSequence(complemented, 1);
}
/*
 * Reverse-complement a sequence in place: the symbols are complemented,
 * then the string is reversed without any pattern-modifier handling.
 *
 * @return nucAcSeq itself
 */
char *ecoComplementSequence(char *nucAcSeq)
{
    char *complemented = LXBioSeqComplement(nucAcSeq);

    return reverseSequence(complemented, 0);
}
/*
 * Extract the subsequence [begin,end[ of nucAcSeq.
 *
 * When begin is not smaller than end the interval wraps around the end
 * of the string: the result is the part from begin to the end of the
 * string followed by the first end characters.
 *
 * The result lives in a buffer owned by this function and reused by
 * every call; the caller must not free it.
 *
 * @return pointer to the NUL-terminated subsequence
 */
char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end)
{
    static char    *buffer   = NULL;
    static int32_t  buffSize = 0;

    const int wraps = !(begin < end);
    int32_t   length;
    int32_t   tail;     /* characters taken from begin up to the end of the string */

    if (wraps)
        length = end + strlen(nucAcSeq) - begin;
    else
        length = end - begin;

    /* Make room for length characters plus the terminator. */
    if (length >= buffSize)
    {
        buffSize = length+1;
        if (buffer)
            buffer=ECOREALLOC(buffer,buffSize,
                              "Error in reallocating sub sequence buffer");
        else
            buffer=ECOMALLOC(buffSize,
                             "Error in allocating sub sequence buffer");
    }

    if (wraps)
    {
        tail = length - end;
        strncpy(buffer,nucAcSeq+begin,tail);
        strncpy(buffer+tail,nucAcSeq ,end);
    }
    else
        strncpy(buffer,nucAcSeq + begin,length);

    buffer[length]=0;

    return buffer;
}

20
src/ecofilter.c Normal file
View File

@@ -0,0 +1,20 @@
#include "ecoPCR.h"
/*
 * Tell whether a taxid is one of the restricted taxids or lies below
 * one of them in the taxonomy.
 *
 * @param taxonomy          taxonomy in which taxid is looked up
 * @param restricted_taxid  array of taxids to test against
 * @param tab_len           number of entries in restricted_taxid
 * @param taxid             taxid to test
 *
 * @return 1 when taxid matches or is under a restricted taxid,
 *         0 otherwise (including when taxid is not in the taxonomy)
 */
int eco_is_taxid_included( ecotaxonomy_t *taxonomy,
                           int32_t *restricted_taxid,
                           int32_t tab_len,
                           int32_t taxid)
{
    ecotx_t *found = eco_findtaxonbytaxid(taxonomy, taxid);
    int      idx;

    if (found == NULL)
        return 0;

    for (idx = 0; idx < tab_len; idx++)
    {
        if (found->taxid == restricted_taxid[idx])
            return 1;

        if (eco_isundertaxon(found, restricted_taxid[idx]))
            return 1;
    }

    return 0;
}

64
src/econame.c Normal file
View File

@@ -0,0 +1,64 @@
#include "ecoPCR.h"
#include <string.h>
#include <stdlib.h>
static econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy);
/*
 * Read a whole name database into memory.
 *
 * @param filename  path of the name database
 * @param taxonomy  taxonomy whose taxon table the names refer to;
 *                  its taxons member must already be loaded
 *
 * @return a newly allocated name index, or NULL when the file cannot
 *         be opened
 */
econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy)
{
    int32_t       count;
    FILE         *f;
    econameidx_t *indexname;
    int32_t       i;
    size_t        extra;

    f = open_ecorecorddb(filename,&count,0);

    if (f==NULL)
        return NULL;

    /* econameidx_t already holds one entry. */
    extra = (count > 1) ? (size_t)(count - 1) : 0;

    indexname = (econameidx_t*) ECOMALLOC(sizeof(econameidx_t) + sizeof(econame_t) * extra,"Allocate names");

    indexname->count=count;

    for (i=0; i < count; i++){
        readnext_econame(f,(indexname->names)+i,taxonomy);
    }

    /* Everything has been copied into the index: release the file. */
    fclose(f);

    return indexname;
}
/*
 * Read the next name record and fill *name from it.
 *
 * The record stores the name bytes immediately followed by the class
 * name bytes; both are copied into newly allocated, NUL-terminated
 * strings. The taxid field of the record is used as an index into the
 * taxon table of the taxonomy.
 *
 * @param f         open name database
 * @param name      entry to fill
 * @param taxonomy  taxonomy providing the taxon table
 *
 * @return name, or NULL when no record could be read
 */
econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy)
{
    econameformat_t *raw;
    int32_t          rs;
    int32_t          header;   /* bytes of the record that precede the strings */

    raw = read_ecorecord(f,&rs);

    if (!raw)
        return NULL;

    header = (int32_t)(raw->names - (char*)raw);

    if (rs < header)
        ECOERROR(ECO_IO_ERROR,"Corrupted name record");

    if (is_big_endian())
    {
        raw->is_scientificname = swap_int32_t(raw->is_scientificname);
        raw->namelength = swap_int32_t(raw->namelength);
        raw->classlength = swap_int32_t(raw->classlength);
        raw->taxid = swap_int32_t(raw->taxid);
    }

    /* Both strings must lie inside the rs bytes that were actually read. */
    if (raw->namelength  < 0 ||
        raw->classlength < 0 ||
        raw->namelength  > rs - header ||
        raw->classlength > rs - header - raw->namelength)
        ECOERROR(ECO_IO_ERROR,"Corrupted name record");

    /* The taxid field is an index into the taxon table. */
    if (raw->taxid < 0 || raw->taxid >= taxonomy->taxons->count)
        ECOERROR(ECO_IO_ERROR,"Name record refers to an unknown taxon");

    name->is_scientificname=raw->is_scientificname;

    name->name = ECOMALLOC((raw->namelength+1) * sizeof(char),"Allocate name");
    strncpy(name->name,raw->names,raw->namelength);
    name->name[raw->namelength]=0;

    name->classname = ECOMALLOC((raw->classlength+1) * sizeof(char),"Allocate classname");
    strncpy(name->classname,(raw->names+raw->namelength),raw->classlength);
    name->classname[raw->classlength]=0;

    name->taxon = taxonomy->taxons->taxon + raw->taxid;

    return name;
}

55
src/ecorank.c Normal file
View File

@@ -0,0 +1,55 @@
#include "ecoPCR.h"
#include <string.h>
#include <stdlib.h>
static int compareRankLabel(const void *label1, const void *label2);
ecorankidx_t *read_rankidx(const char *filename)
{
int32_t count;
FILE *f;
ecorankidx_t *index;
int32_t i;
int32_t rs;
char *buffer;
f = open_ecorecorddb(filename,&count,0);
if (f==NULL)
return NULL;
index = (ecorankidx_t*) ECOMALLOC(sizeof(ecorankidx_t) + sizeof(char*) * (count-1),
"Allocate rank index");
index->count=count;
for (i=0; i < count; i++)
{
buffer = read_ecorecord(f,&rs);
index->label[i]=(char*) ECOMALLOC(rs+1,
"Allocate rank label");
strncpy(index->label[i],buffer,rs);
}
return index;
}
/*
 * Find the position of a rank label in a rank index.
 *
 * The lookup uses bsearch(), so the labels have to be sorted.
 *
 * @param label  rank label to look for
 * @param ranks  rank index to search
 *
 * @return index of the label in ranks->label, or -1 when it is absent
 *         (a missing label is not treated as an error)
 */
int32_t rank_index(const char* label,ecorankidx_t* ranks)
{
    char **found = bsearch(label,
                           ranks->label,
                           ranks->count,
                           sizeof(char*),
                           compareRankLabel);

    return (found != NULL) ? (int32_t)(found - ranks->label) : -1;
}
/*
 * bsearch() comparator used by rank_index().
 *
 * @param label1  the key: a C string
 * @param label2  an array element: pointer to a C string pointer
 *
 * @return the strcmp() ordering of the key against the element
 */
int compareRankLabel(const void *label1, const void *label2)
{
    const char        *wanted = label1;
    const char *const *slot   = label2;

    return strcmp(wanted, *slot);
}

230
src/ecoseq.c Normal file
View File

@@ -0,0 +1,230 @@
#include "ecoPCR.h"
#include <stdlib.h>
#include <string.h>
#include <zlib.h>
#include <string.h>
#include <stdio.h>
#include <ctype.h>
static FILE *open_seqfile(const char *prefix,int32_t index);
/*
 * Allocate an empty sequence structure through ECOMALLOC.
 *
 * @return the new structure; release it with delete_ecoseq()
 */
ecoseq_t *new_ecoseq()
{
    return ECOMALLOC(sizeof(ecoseq_t),"Allocate new ecoseq structure");
}
/*
 * Release a sequence structure together with its AC, DE and SQ strings.
 *
 * @param seq  structure to release; may be NULL
 *
 * @return 0 when something was released, 1 when seq was NULL
 */
int32_t delete_ecoseq(ecoseq_t * seq)
{
    if (seq == NULL)
        return 1;

    if (seq->AC != NULL)
        ECOFREE(seq->AC,"Free sequence AC");

    if (seq->DE != NULL)
        ECOFREE(seq->DE,"Free sequence DE");

    if (seq->SQ != NULL)
        ECOFREE(seq->SQ,"Free sequence SQ");

    ECOFREE(seq,"Free sequence structure");

    return 0;
}
ecoseq_t *new_ecoseq_with_data( char *AC,
char *DE,
char *SQ,
int32_t taxid_idx
)
{
ecoseq_t *tmp;
int32_t lstr;
tmp = new_ecoseq();
tmp->taxid=taxid_idx;
if (AC)
{
lstr =strlen(AC);
tmp->AC=ECOMALLOC((lstr+1) * sizeof(char),
"Allocate sequence accession");
strcpy(tmp->AC,AC);
}
if (DE)
{
lstr =strlen(DE);
tmp->DE=ECOMALLOC((lstr+1) * sizeof(char),
"Allocate sequence definition");
strcpy(tmp->DE,DE);
}
if (SQ)
{
lstr =strlen(SQ);
tmp->SQ=ECOMALLOC((lstr+1) * sizeof(char),
"Allocate sequence data");
strcpy(tmp->SQ,SQ);
}
return tmp;
}
/**
 * Open a sequence database with open_ecorecorddb(), asking it to treat
 * a file that cannot be opened as a fatal error.
 *
 * @param filename       path of the sequence database
 * @param sequencecount  receives the count stored at the start of the file
 *
 * @return the open file
 **/
FILE *open_ecoseqdb(const char *filename,
                    int32_t    *sequencecount)
{
    const int32_t abort_on_open_error = 1;

    return open_ecorecorddb(filename, sequencecount, abort_on_open_error);
}
/*
 * Read the next sequence record and convert it into a newly allocated
 * ecoseq_t.
 *
 * NOTE(review): the zlib uncompress() call that used to fill SQ from
 * the compressed bytes is disabled in this file, so SQ is returned as a
 * zero-filled buffer of SQ_length+1 bytes. Re-enabling it presumably
 * requires the package to link against zlib -- confirm with the build
 * setup. Until then comp_status is fixed to Z_OK instead of being read
 * uninitialised.
 *
 * @param f  open sequence database
 *
 * @return the new sequence (release it with delete_ecoseq()), or NULL
 *         when no record could be read
 */
ecoseq_t *readnext_ecoseq(FILE *f)
{
    char              *compressed=NULL;
    ecoseqformat_t    *raw;
    ecoseq_t          *seq;
    int32_t            comp_status = Z_OK;
    unsigned long int  seqlength;
    unsigned long int  i;
    int32_t            rs;
    int32_t            header;    /* bytes of the record that precede data */
    const char        *acend;
    size_t             aclength;
    char              *c;

    raw = read_ecorecord(f,&rs);

    if (!raw)
        return NULL;

    header = (int32_t)(raw->data - (char*)raw);

    if (rs < header)
        ECOERROR(ECO_IO_ERROR,"Corrupted sequence record");

    if (is_big_endian())
    {
        raw->CSQ_length = swap_int32_t(raw->CSQ_length);
        raw->DE_length = swap_int32_t(raw->DE_length);
        raw->SQ_length = swap_int32_t(raw->SQ_length);
        raw->taxid = swap_int32_t(raw->taxid);
    }

    /* The definition must lie inside the rs bytes that were read, and a
       negative length would turn into a huge allocation or copy. */
    if (raw->DE_length < 0 ||
        raw->SQ_length < 0 ||
        raw->DE_length > rs - header)
        ECOERROR(ECO_IO_ERROR,"Corrupted sequence record");

    seq = new_ecoseq();

    seq->taxid = raw->taxid;

    /* AC is a fixed-size field: do not rely on a terminating NUL. */
    acend    = memchr(raw->AC, 0, sizeof(raw->AC));
    aclength = acend ? (size_t)(acend - raw->AC) : sizeof(raw->AC);

    seq->AC = ECOMALLOC(aclength + 1,
                        "Allocate Sequence Accesion number");
    memcpy(seq->AC,raw->AC,aclength);

    seq->DE = ECOMALLOC(raw->DE_length+1,
                        "Allocate Sequence definition");
    memcpy(seq->DE,raw->data,raw->DE_length);

    seqlength = seq->SQ_length = raw->SQ_length;

    compressed = raw->data + raw->DE_length;
    (void) compressed;   /* only needed by the disabled uncompress() call */

    seq->SQ = ECOMALLOC(seqlength+1,
                        "Allocate sequence buffer");

    /* Disabled call, kept for reference:
     *   comp_status = uncompress((unsigned char*)seq->SQ, &seqlength,
     *                            (unsigned char*)compressed, raw->CSQ_length);
     */
    if (comp_status != Z_OK)
        ECOERROR(ECO_IO_ERROR,"I cannot uncompress sequence data");

    /* toupper() needs a value representable as unsigned char. */
    for (c=seq->SQ,i=0;i<seqlength;c++,i++)
        *c=toupper((unsigned char)*c);

    return seq;
}
/**
 * Open one file of a sequence database. The file name is built as
 * "<prefix>_<index>.sdx" with the index printed on at least three
 * digits.
 *
 * @param prefix  name of the database (radical without extension)
 * @param index   number of the file to open
 *
 * @return the open file, or NULL when it cannot be opened
 */
FILE *open_seqfile(const char *prefix,int32_t index)
{
    char     filename_buffer[1024];
    int32_t  filename_length;
    FILE    *input;
    int32_t  seqcount;

    filename_length = snprintf(filename_buffer,
                               sizeof(filename_buffer),
                               "%s_%03d.sdx",
                               prefix,
                               index);

    /* snprintf() returns the length the full name would have had: a
       value equal to or above the buffer size means it was truncated,
       a negative value means the formatting itself failed. */
    if (filename_length < 0 ||
        (size_t)filename_length >= sizeof(filename_buffer))
        ECOERROR(ECO_ASSERT_ERROR,"file name is too long");

    input=open_ecorecorddb(filename_buffer,&seqcount,0);

    if (input)
        fprintf(stderr,"# Reading file %s containing %d sequences...\n",
                filename_buffer,
                seqcount);

    return input;
}
/*
 * Iterate over the sequences of a database split into numbered files.
 *
 * Call it once with the database prefix to start from file number 1,
 * then with NULL to get the following sequences. When the current file
 * is exhausted the next numbered file is opened.
 * The iteration state is kept in static variables, so only one
 * iteration can be in progress at a time.
 *
 * @param prefix  database prefix to (re)start an iteration, or NULL to
 *                continue the current one
 *
 * @return the next sequence (release it with delete_ecoseq()), or NULL
 *         when there is none
 */
ecoseq_t *ecoseq_iterator(const char *prefix)
{
    static FILE    *current_seq_file= NULL;
    static int32_t  current_file_idx = 1;
    static char     current_prefix[1024];
    ecoseq_t       *seq;

    if (prefix)
    {
        current_file_idx = 1;

        if (current_seq_file)
            fclose(current_seq_file);

        strncpy(current_prefix,prefix,1023);
        current_prefix[1023]=0;

        current_seq_file = open_seqfile(current_prefix,
                                        current_file_idx);
    }

    /* No iteration was started, the first file could not be opened, or
       a previous call already ran past the last file. */
    if (!current_seq_file)
        return NULL;

    seq = readnext_ecoseq(current_seq_file);

    if (!seq && feof(current_seq_file))
    {
        current_file_idx++;
        fclose(current_seq_file);
        current_seq_file = open_seqfile(current_prefix,
                                        current_file_idx);

        if (current_seq_file)
            seq = readnext_ecoseq(current_seq_file);
    }

    return seq;
}

437
src/ecotax.c Normal file
View File

@@ -0,0 +1,437 @@
#include "ecoPCR.h"
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <R.h>
/* Larger of two values, defined only if no MAX macro exists yet.
   Each argument may be evaluated twice, so it must have no side effects. */
#ifndef MAX
#define MAX(x,y) (((x)>(y)) ? (x):(y))
#endif
static ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon);
/*
 * Read the taxa stored in f into index->taxon[first .. last-1].
 * The parent index read from each record is replaced by a pointer into
 * the taxon table, the parent is marked as having a child (farest = 0)
 * and index->maxtaxid is kept up to date.
 */
static void read_taxon_range(FILE *f, ecotxidx_t *index, int32_t first, int32_t last)
{
    int32_t i;
    size_t  parent_idx;

    for (i = first; i < last; i++)
    {
        readnext_ecotaxon(f,&(index->taxon[i]));

        /* At this point parent still carries the index read from the file. */
        parent_idx = (size_t)index->taxon[i].parent;

        if (index->count < 0 || parent_idx >= (size_t)index->count)
            ECOERROR(ECO_IO_ERROR,"Taxon refers to an unknown parent");

        index->taxon[i].parent = index->taxon + parent_idx;
        index->taxon[i].parent->farest=0;

        if (index->taxon[i].taxid > index->maxtaxid)
            index->maxtaxid=index->taxon[i].taxid;
    }
}

/**
 * Read the taxonomy database.
 *
 * The taxa of the second file are appended after those of the first
 * one in the same table. Once all taxa are read, the farest field of
 * every node is set to the length of the longest branch below it.
 *
 * @param filename   path of the main taxon file; read_taxonomy()
 *                   passes "<prefix>.tdx"
 * @param filename2  path of an optional second taxon file;
 *                   read_taxonomy() passes "<prefix>.ldx"
 *
 * @return a newly allocated taxon index, or NULL when the main file
 *         cannot be opened
 */
ecotxidx_t *read_taxonomyidx(const char *filename,const char *filename2)
{
    int32_t           count;
    int32_t           count2;
    FILE             *f;
    FILE             *f2;
    ecotxidx_t       *index;
    struct ecotxnode *t;
    int32_t           i;
    int32_t           j;

    f = open_ecorecorddb(filename,&count,0);

    if (f==NULL) return NULL;

    /* Optional file: count2 is set to 0 when it cannot be opened. */
    f2 = open_ecorecorddb(filename2,&count2,0);

    index = (ecotxidx_t*) ECOMALLOC(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count+count2-1),
                                    "Allocate taxonomy");

    index->count=count+count2;
    index->buffersize = index->count;
    index->maxtaxid=0;

    REprintf("Readind %d taxa...\n",count);
    read_taxon_range(f,index,0,count);
    fclose(f);

    if (count2>0)
        REprintf("Readind %d local taxa...\n",count2);
    else
        REprintf("No local taxon\n");

    if (f2)
    {
        read_taxon_range(f2,index,count,index->count);
        fclose(f2);
    }

    count = index->count;

    REprintf("Computing longest branches...\n");

    for (i=0; i < count; i++){
        t=index->taxon+i;

        /* farest is still -1 only for nodes that are nobody's parent:
           start from each of them and walk up towards the root. */
        if (t->farest==-1)
        {
            t->farest=0;
            while(t->parent != t)
            {
                j = t->farest + 1;
                if (j > t->parent->farest)
                {
                    t->parent->farest = j;
                    t=t->parent;
                }
                else
                    /* A longer branch is already recorded above: jump to the
                       first table entry to leave the loop -- this assumes
                       that entry is its own parent (the root). */
                    t=index->taxon;
            }
        }
    }

    return index;
}
/*
 * Release a taxon index together with the name of every taxon in it.
 *
 * @param index  taxon index to release; may be NULL
 *
 * @return 0 when something was released, 1 when index was NULL
 */
int32_t delete_taxonomy(ecotxidx_t *index)
{
    int32_t  pos;
    ecotx_t *entry;

    if (index == NULL)
        return 1;

    for (pos = 0, entry = index->taxon; pos < index->count; pos++, entry++)
    {
        if (entry->name != NULL)
            ECOFREE(entry->name,"Free scientific name");
    }

    ECOFREE(index,"Free Taxonomy");

    return 0;
}
/*
 * Release a separately allocated taxon together with its name.
 *
 * @param taxon  taxon to release; may be NULL
 *
 * @return 0 when something was released, 1 when taxon was NULL
 */
int32_t delete_taxon(ecotx_t *taxon)
{
    if (taxon == NULL)
        return 1;

    if (taxon->name != NULL)
        ECOFREE(taxon->name,"Free scientific name");

    ECOFREE(taxon,"Free Taxon");

    return 0;
}
/**
 * Read the next taxon record and fill *taxon from it.
 *
 * The parent field is not yet a usable pointer when this function
 * returns: it carries the parent index read from the record, cast to a
 * pointer, and read_taxonomyidx() converts it afterwards. farest is
 * initialised to -1 and the name is copied into a newly allocated
 * string.
 *
 * @param f      open taxon database
 * @param taxon  structure to fill
 *
 * @return taxon, or NULL when no record could be read
 */
ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon)
{
    int32_t        record_size;
    ecotxformat_t *record = read_ecorecord(f,&record_size);

    if (record == NULL)
        return NULL;

    if (is_big_endian())
    {
        record->taxid      = swap_int32_t(record->taxid);
        record->rank       = swap_int32_t(record->rank);
        record->parent     = swap_int32_t(record->parent);
        record->namelength = swap_int32_t(record->namelength);
    }

    taxon->taxid  = record->taxid;
    taxon->rank   = record->rank;
    taxon->farest = -1;
    taxon->parent = (ecotx_t*)((size_t)record->parent);

    /* The block is zero-filled, so the extra byte terminates the name. */
    taxon->name = ECOMALLOC((record->namelength+1) * sizeof(char),
                            "Allocate taxon scientific name");
    strncpy(taxon->name,record->name,record->namelength);

    return taxon;
}
/*
 * Load a complete taxonomy from the files sharing a common prefix:
 * "<prefix>.rdx" (ranks), "<prefix>.tdx" and "<prefix>.ldx" (taxa) and,
 * on request, "<prefix>.ndx" (names).
 *
 * @param prefix               path of the database without extension
 * @param readAlternativeName  non-zero to also read the name file
 *
 * @return a newly allocated taxonomy (release it with
 *         delete_ecotaxonomy()), or NULL when the ranks or the taxa
 *         cannot be read
 */
ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName)
{
    ecotaxonomy_t *tax;
    char          *filename;
    char          *filename2;
    int            buffsize;

    tax = ECOMALLOC(sizeof(ecotaxonomy_t),
                    "Allocate taxonomy structure");

    tax->ranks =NULL;
    tax->taxons=NULL;
    tax->names =NULL;

    /* Room for the prefix, a dot, a three letter extension and the NUL. */
    buffsize = strlen(prefix)+10;

    filename = ECOMALLOC(buffsize,
                         "Allocate filename");
    filename2= ECOMALLOC(buffsize,
                         "Allocate filename");

    snprintf(filename,buffsize,"%s.rdx",prefix);

    tax->ranks = read_rankidx(filename);

    if (tax->ranks == NULL)
    {
        ECOFREE(filename,"Desallocate filename 1");
        ECOFREE(filename2,"Desallocate filename 2");

        delete_ecotaxonomy(tax);
        return NULL;
    }

    snprintf(filename,buffsize,"%s.tdx",prefix);
    snprintf(filename2,buffsize,"%s.ldx",prefix);

    tax->taxons = read_taxonomyidx(filename,filename2);

    if (tax->taxons == NULL)
    {
        /* Release each name buffer exactly once. */
        ECOFREE(filename,"Desallocate filename 1");
        ECOFREE(filename2,"Desallocate filename 2");

        delete_ecotaxonomy(tax);
        return NULL;
    }

    if (readAlternativeName)
    {
        snprintf(filename,buffsize,"%s.ndx",prefix);
        tax->names=read_nameidx(filename,tax);
    }
    else
        tax->names=NULL;

    ECOFREE(filename,"Desallocate filename 1");
    ECOFREE(filename2,"Desallocate filename 2");

    return tax;
}
/*
 * Release a taxonomy and everything it owns: the rank labels, the name
 * and class-name strings, the taxon names and the three index tables.
 *
 * @param taxonomy  taxonomy to release; may be NULL
 *
 * @return 0 when something was released, 1 when taxonomy was NULL
 */
int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy)
{
    int32_t i;

    if (!taxonomy)
        return 1;

    if (taxonomy->ranks)
    {
        /* Every label is a separate allocation made by read_rankidx(). */
        for (i=0; i < taxonomy->ranks->count; i++)
            if (taxonomy->ranks->label[i])
                ECOFREE(taxonomy->ranks->label[i],"Free rank label");

        ECOFREE(taxonomy->ranks,"Free rank index");
    }

    if (taxonomy->names)
    {
        /* Same for the strings allocated by readnext_econame(). */
        for (i=0; i < taxonomy->names->count; i++)
        {
            if (taxonomy->names->names[i].name)
                ECOFREE(taxonomy->names->names[i].name,"Free name");
            if (taxonomy->names->names[i].classname)
                ECOFREE(taxonomy->names->names[i].classname,"Free class name");
        }

        ECOFREE(taxonomy->names,"Free names index");
    }

    /* delete_taxonomy() also releases the name of every taxon. */
    if (taxonomy->taxons)
        (void) delete_taxonomy(taxonomy->taxons);

    ECOFREE(taxonomy,"Free taxonomy structure");

    return 0;
}
/*
 * Walk from a taxon towards the root until a node of the requested
 * rank is met. The starting taxon itself is a candidate.
 *
 * @param taxon    starting taxon
 * @param rankidx  wanted rank, as an index into the rank table
 *
 * @return the first node of that rank on the way up, or NULL when the
 *         root (the node that is its own parent) is reached without
 *         finding one
 */
ecotx_t *eco_findtaxonatrank(ecotx_t *taxon,
                             int32_t  rankidx)
{
    ecotx_t *node = taxon;

    while (node->rank != rankidx && node->parent != node)
        node = node->parent;

    return (node->rank == rankidx) ? node : NULL;
}
/*
 * bsearch() comparator used by eco_findtaxonbytaxid().
 *
 * @param ptaxid  the key: a taxid stored directly in the pointer value
 * @param ptaxon  an array element: pointer to an ecotx_t
 *
 * @return negative, zero or positive as the key is smaller than, equal
 *         to or larger than the taxid of the element
 */
static int bcomptaxon (const void * ptaxid, const void * ptaxon) {
    const ecotx_t *current_taxon = (const ecotx_t*)ptaxon;
    int32_t        taxid=(int32_t)((size_t)ptaxid);

    /* Compare instead of subtracting: a difference can overflow. */
    return (taxid > current_taxon->taxid) - (taxid < current_taxon->taxid);
}
/**
 * Look a taxon up by its taxonomic id.
 *
 * The taxon table is searched with bsearch(), so it has to be sorted
 * by taxid -- presumably guaranteed by the database files. The taxid is
 * handed to the comparator inside the key pointer itself.
 *
 * @param taxonomy  the taxonomy to search
 * @param taxid     the taxonomic id
 *
 * @return the matching taxon, or NULL when the taxid is not found
 **/
ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy,
                              int32_t        taxid)
{
    const void *key   = (const void *)((size_t)taxid);
    ecotxidx_t *index = taxonomy->taxons;

    return (ecotx_t*) bsearch(key,
                              (const void *)index->taxon,
                              index->count,
                              sizeof(ecotx_t),
                              bcomptaxon);
}
/**
 * Find out whether a taxon is a descendant of another taxon.
 *
 * The ancestors of taxon are visited one after the other, starting
 * with its parent. The walk stops at the first ancestor whose taxid is
 * other_taxid, at a node named "root", or at a node that is its own
 * parent.
 *
 * @param taxon        the descendant candidate
 * @param other_taxid  taxonomic id of the ancestor candidate
 *
 * @return 1 if other_taxid matches an ancestor of taxon, else 0
 **/
int eco_isundertaxon(ecotx_t *taxon,
                     int other_taxid)
{
    ecotx_t *next_parent;

    next_parent = taxon->parent;

    /* The self-parent test ends the walk even when the top node is not
       named "root"; without it such a tree would be climbed forever. */
    while ( (other_taxid != next_parent->taxid) &&
            (next_parent->parent != next_parent) &&
            (next_parent->name == NULL ||
             strcmp(next_parent->name, "root")) )
    {
        next_parent = next_parent->parent;
    }

    if (other_taxid == next_parent->taxid)
        return 1;
    else
        return 0;
}
/*
 * Return the node of rank "species" found at or above a taxon.
 *
 * The index of the rank is looked up once per taxonomy and cached in
 * static variables; passing NULL as taxonomy reuses the cached one.
 * ECOERROR is raised when no taxonomy is cached or when the rank label
 * is absent from it.
 *
 * @param taxon     starting taxon
 * @param taxonomy  taxonomy providing the rank table, or NULL
 *
 * @return the matching node, or NULL when there is none on the way up
 */
ecotx_t *eco_getspecies(ecotx_t *taxon,
                        ecotaxonomy_t *taxonomy)
{
    static ecotaxonomy_t *cached_taxonomy = NULL;
    static int32_t        cached_rank     = -1;

    const int must_refresh = (taxonomy != NULL) && (taxonomy != cached_taxonomy);

    if (must_refresh)
    {
        cached_rank     = rank_index("species",taxonomy->ranks);
        cached_taxonomy = taxonomy;
    }

    if (cached_taxonomy == NULL || cached_rank < 0)
        ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");

    return eco_findtaxonatrank(taxon,cached_rank);
}
/*
 * Return the node of rank "genus" found at or above a taxon.
 *
 * The index of the rank is looked up once per taxonomy and cached in
 * static variables; passing NULL as taxonomy reuses the cached one.
 * ECOERROR is raised when no taxonomy is cached or when the rank label
 * is absent from it.
 *
 * @param taxon     starting taxon
 * @param taxonomy  taxonomy providing the rank table, or NULL
 *
 * @return the matching node, or NULL when there is none on the way up
 */
ecotx_t *eco_getgenus(ecotx_t *taxon,
                      ecotaxonomy_t *taxonomy)
{
    static ecotaxonomy_t *cached_taxonomy = NULL;
    static int32_t        cached_rank     = -1;

    const int must_refresh = (taxonomy != NULL) && (taxonomy != cached_taxonomy);

    if (must_refresh)
    {
        cached_rank     = rank_index("genus",taxonomy->ranks);
        cached_taxonomy = taxonomy;
    }

    if (cached_taxonomy == NULL || cached_rank < 0)
        ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");

    return eco_findtaxonatrank(taxon,cached_rank);
}
/*
 * Return the node of rank "family" found at or above a taxon.
 *
 * The index of the rank is looked up once per taxonomy and cached in
 * static variables; passing NULL as taxonomy reuses the cached one.
 * ECOERROR is raised when no taxonomy is cached or when the rank label
 * is absent from it.
 *
 * @param taxon     starting taxon
 * @param taxonomy  taxonomy providing the rank table, or NULL
 *
 * @return the matching node, or NULL when there is none on the way up
 */
ecotx_t *eco_getfamily(ecotx_t *taxon,
                       ecotaxonomy_t *taxonomy)
{
    static ecotaxonomy_t *cached_taxonomy = NULL;
    static int32_t        cached_rank     = -1;

    const int must_refresh = (taxonomy != NULL) && (taxonomy != cached_taxonomy);

    if (must_refresh)
    {
        cached_rank     = rank_index("family",taxonomy->ranks);
        cached_taxonomy = taxonomy;
    }

    if (cached_taxonomy == NULL || cached_rank < 0)
        ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");

    return eco_findtaxonatrank(taxon,cached_rank);
}
/*
 * Return the node of rank "kingdom" found at or above a taxon.
 *
 * The index of the rank is looked up once per taxonomy and cached in
 * static variables; passing NULL as taxonomy reuses the cached one.
 * ECOERROR is raised when no taxonomy is cached or when the rank label
 * is absent from it.
 *
 * @param taxon     starting taxon
 * @param taxonomy  taxonomy providing the rank table, or NULL
 *
 * @return the matching node, or NULL when there is none on the way up
 */
ecotx_t *eco_getkingdom(ecotx_t *taxon,
                        ecotaxonomy_t *taxonomy)
{
    static ecotaxonomy_t *cached_taxonomy = NULL;
    static int32_t        cached_rank     = -1;

    const int must_refresh = (taxonomy != NULL) && (taxonomy != cached_taxonomy);

    if (must_refresh)
    {
        cached_rank     = rank_index("kingdom",taxonomy->ranks);
        cached_taxonomy = taxonomy;
    }

    if (cached_taxonomy == NULL || cached_rank < 0)
        ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");

    return eco_findtaxonatrank(taxon,cached_rank);
}
/*
 * Return the node of rank "superkingdom" found at or above a taxon.
 *
 * The index of the rank is looked up once per taxonomy and cached in
 * static variables; passing NULL as taxonomy reuses the cached one.
 * ECOERROR is raised when no taxonomy is cached or when the rank label
 * is absent from it.
 *
 * @param taxon     starting taxon
 * @param taxonomy  taxonomy providing the rank table, or NULL
 *
 * @return the matching node, or NULL when there is none on the way up
 */
ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,
                             ecotaxonomy_t *taxonomy)
{
    static ecotaxonomy_t *cached_taxonomy = NULL;
    static int32_t        cached_rank     = -1;

    const int must_refresh = (taxonomy != NULL) && (taxonomy != cached_taxonomy);

    if (must_refresh)
    {
        cached_rank     = rank_index("superkingdom",taxonomy->ranks);
        cached_taxonomy = taxonomy;
    }

    if (cached_taxonomy == NULL || cached_rank < 0)
        ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");

    return eco_findtaxonatrank(taxon,cached_rank);
}

835
src/robitax.c Normal file
View File

@@ -0,0 +1,835 @@
/*
* robitax.c
*
* Created on: 17 janv. 2013
* Author: coissac
*/
#include "robitax.h"
#include <unistd.h>
//#include <regex.h>
#include "slre.h"
/**
 * Return a pointer to an obitools taxonomy C structure
 * from an R instance of taxonomy.obitools.
 *
 * The function checks whether the external pointer stored in the R
 * object is NULL. If it is and the object is flagged as saved, the
 * taxonomy is loaded again from the database named by the "dbname"
 * slot, looked up from the directory of the "workingdir" slot, and the
 * external pointer is updated. An R error is raised when no taxonomy
 * can be returned, so the result is never NULL.
 *
 * @param Rtaxonomy an R object
 * @type Rtaxonomy SEXP
 *
 * @return a pointer to the C structure
 * @rtype ecotaxonomy_t *
 */
ecotaxonomy_t *getTaxPointer(SEXP Rtaxonomy)
{
    char          *pwd;
    SEXP           pointer;
    SEXP           rclass;
    SEXP           rdir;
    SEXP           rfile;
    ecotaxonomy_t *ptax;
    const char    *class;
    const char    *file;
    const char    *dir;
    int            saved;
    int            moved = 0;

    if (!IS_S4_OBJECT(Rtaxonomy) )
        error("argument not taxonomy.obitools instance");

    // We get the class name and compare it to "taxonomy.obitools"
    rclass = getAttrib(Rtaxonomy, R_ClassSymbol);
    class = CHAR(asChar(rclass));

    if (strcmp(class,"taxonomy.obitools"))
        error("argument not taxonomy.obitools instance");

    pointer = R_do_slot(Rtaxonomy,mkString("pointer"));
    saved = LOGICAL(R_do_slot(Rtaxonomy,mkString("saved")))[0];
    ptax = (ecotaxonomy_t *) R_ExternalPtrAddr(pointer);

    // If the external pointer is set to NULL we have to load
    // the taxonomy from file
    if (ptax==NULL && saved)
    {
        rfile = R_do_slot(Rtaxonomy,mkString("dbname"));
        file = CHAR(asChar(rfile));

        rdir = R_do_slot(Rtaxonomy,mkString("workingdir"));
        dir = CHAR(asChar(rdir));

        // Only move to the working directory when the current one is
        // known, otherwise there would be no way back. A failed move is
        // not fatal: the load is simply attempted from where we are.
        pwd = getcwd(NULL,0);
        if (pwd != NULL)
            moved = (chdir(dir) == 0);

        ptax = read_taxonomy(file,1);

        R_SetExternalPtrAddr(pointer,(void*)ptax);

        if (moved && chdir(pwd) != 0)
            warning("Cannot restore the working directory");

        free(pwd);

        // Callers dereference the result, so a failed load must not
        // be handed back as a NULL pointer
        if (ptax==NULL)
            error("Cannot open the taxonomy database");
    }

    if (ptax==NULL && ! saved)
        error("The taxonomy instance is no more valid and must be rebuilt");

    return ptax;
}
/*
 * Release the taxonomy held by an R external pointer and clear the
 * pointer so that it no longer refers to the released memory.
 *
 * @param Rtaxonomy  external pointer to an ecotaxonomy_t
 *
 * @return R_NilValue
 */
SEXP R_delete_taxonomy(SEXP Rtaxonomy)
{
    ecotaxonomy_t *loaded = (ecotaxonomy_t *) R_ExternalPtrAddr(Rtaxonomy);

    (void) delete_ecotaxonomy(loaded);

    R_ClearExternalPtr(Rtaxonomy);

    return R_NilValue;
}
/*
 * Finalizer with the signature R expects (void return): it forwards to
 * R_delete_taxonomy, which cannot be registered directly because
 * calling it through a pointer to a different function type is
 * undefined.
 */
static void taxonomy_finalizer(SEXP Rtaxonomy)
{
    (void) R_delete_taxonomy(Rtaxonomy);
}

/*
 * Load a taxonomy and wrap it in an R external pointer whose finalizer
 * releases it.
 *
 * @param filename    character vector; its first element is the
 *                    database prefix passed to read_taxonomy()
 * @param altenative  logical vector; its first element tells whether
 *                    the alternative names are read too
 *
 * @return the external pointer; an R error is raised when the
 *         arguments are invalid or the database cannot be read
 */
SEXP R_read_taxonomy(SEXP filename, SEXP altenative)
{
    int            alt;
    const char    *file;
    SEXP           Rtax;
    SEXP           Rtag;
    ecotaxonomy_t *taxonomy;

    if (! isString(filename) || length(filename) < 1)
        error("filename not character");

    file = CHAR(STRING_ELT(filename, 0));

    if (! isLogical(altenative) || length(altenative) < 1)
        error("altenative not logical");

    alt = LOGICAL(altenative)[0];

    /* Created and protected first: R_MakeExternalPtr() allocates, and an
       unprotected tag could be collected during that allocation. */
    Rtag = PROTECT(mkString("ROBITools NCBI Taxonomy pointer"));

    taxonomy = read_taxonomy(file,alt);

    if (! taxonomy)
        error("Cannot open the taxonomy database");

    Rtax = PROTECT(R_MakeExternalPtr(taxonomy, Rtag, R_NilValue));

    R_RegisterCFinalizerEx(Rtax, taxonomy_finalizer, TRUE);

    UNPROTECT(2);

    return Rtax;
}
/*
 * Return the scientific name of a taxon.
 *
 * @param Rtaxonomy  a taxonomy.obitools instance
 * @param Rtaxid     integer vector; only its first element is used and
 *                   it must be positive
 *
 * @return a character vector of length one holding the name, or NA
 *         when the taxid is not in the taxonomy
 */
SEXP R_get_scientific_name(SEXP Rtaxonomy,SEXP Rtaxid)
{
    ecotx_t       *taxon;
    ecotaxonomy_t *ptax;
    int            taxid;

    ptax = getTaxPointer(Rtaxonomy);

    if (! isInteger(Rtaxid))
        error("taxid not integer");

    /* Reading element 0 of an empty vector is out of bounds. */
    if (length(Rtaxid) < 1)
        error("taxid is empty");

    taxid = *INTEGER(Rtaxid);

    /* NA_integer_ is negative and is rejected here as well. */
    if (! (taxid > 0))
        error("taxid not positive");

    taxon = eco_findtaxonbytaxid(ptax, taxid);

    if (!taxon)
        return ScalarString(R_NaString);

    return mkString(taxon->name);
}
/*
 * Return the rank label of each taxid of a vector.
 *
 * @param Rtaxonomy  a taxonomy.obitools instance
 * @param Rtaxid     integer vector of taxids
 *
 * @return a character vector of the same length; an element is NA when
 *         the taxid is NA, not positive, or not in the taxonomy
 */
SEXP R_get_rank(SEXP Rtaxonomy,SEXP Rtaxid)
{
    ecotaxonomy_t *ptax = getTaxPointer(Rtaxonomy);
    ecotx_t       *taxon;
    int           *taxid;
    int            ntaxid;
    int            i;
    SEXP           results;

    if (! isInteger(Rtaxid))
        error("taxid not integer");

    ntaxid  = length(Rtaxid);
    results = PROTECT(allocVector(STRSXP, ntaxid));
    taxid   = INTEGER(Rtaxid);

    for (i=0; i < ntaxid; i++)
    {
        SEXP label = R_NaString;

        taxon = NULL;

        if (taxid[i] != NA_INTEGER && taxid[i] > 0)
            taxon = eco_findtaxonbytaxid(ptax, taxid[i]);

        /* The rank of a taxon is an index into the rank label table. */
        if (taxon)
            label = mkChar(ptax->ranks->label[taxon->rank]);

        SET_STRING_ELT(results, i, label);
    }

    UNPROTECT(1);

    return results;
}
/*
 * Return the ancestor of a taxon (or the taxon itself) that has a
 * given rank.
 *
 * @param Rtaxonomy  a taxonomy.obitools instance
 * @param Rtaxid     integer vector; its first element is the positive
 *                   taxid to start from
 * @param Rrank      character vector; its first element is the rank label
 * @param Rname      logical vector; its first element selects the
 *                   result type
 *
 * @return the scientific name (Rname true) or the taxid (Rname false)
 *         of the node found; NA of the matching type when the taxid is
 *         unknown or no node of that rank exists
 */
SEXP R_findtaxonatrank(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rrank, SEXP Rname)
{
    ecotx_t       *taxon;
    ecotx_t       *rep = NULL;
    ecotaxonomy_t *ptax;
    int            taxid;
    int            name;
    const char    *rank;
    int            rankidx;

    ptax = getTaxPointer(Rtaxonomy);

    if (! isInteger(Rtaxid))
        error("taxid not integer");

    /* Reading element 0 of an empty vector is out of bounds. */
    if (length(Rtaxid) < 1)
        error("taxid is empty");

    taxid = *INTEGER(Rtaxid);

    if (! (taxid > 0))
        error("taxid not positive");

    if (! isString(Rrank) || length(Rrank) < 1)
        error("rank not a string");

    rank=CHAR(STRING_ELT(Rrank,0));

    rankidx=rank_index(rank,ptax->ranks);

    if (rankidx < 0)
        error("unkown rank name");

    if (! isLogical(Rname) || length(Rname) < 1)
        error("name not logical");

    name = LOGICAL(Rname)[0];

    taxon = eco_findtaxonbytaxid(ptax, taxid);

    if (taxon)
        rep = eco_findtaxonatrank(taxon,rankidx);

    /* Unknown taxid and missing rank give the same typed NA. */
    if (!rep)
        return name ? ScalarString(R_NaString) : ScalarInteger(R_NaInt);

    return name ? mkString(rep->name) : ScalarInteger(rep->taxid);
}
/*
 * Return the species a taxon belongs to.
 *
 * @param Rtaxonomy  a taxonomy.obitools instance
 * @param Rtaxid     integer vector; its first element is the positive
 *                   taxid to start from
 * @param Rname      logical vector; its first element selects the
 *                   result type
 *
 * @return the scientific name (Rname true) or the taxid (Rname false)
 *         of the species; NA of the matching type when the taxid is
 *         unknown or has no node of that rank at or above it
 */
SEXP R_get_species(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
{
    ecotx_t       *taxon;
    ecotx_t       *rep = NULL;
    ecotaxonomy_t *ptax;
    int            taxid;
    int            name;

    ptax = getTaxPointer(Rtaxonomy);

    if (! isInteger(Rtaxid))
        error("taxid not integer");

    /* Reading element 0 of an empty vector is out of bounds. */
    if (length(Rtaxid) < 1)
        error("taxid is empty");

    taxid = *INTEGER(Rtaxid);

    if (! (taxid > 0))
        error("taxid not positive");

    if (! isLogical(Rname) || length(Rname) < 1)
        error("name not logical");

    name = LOGICAL(Rname)[0];

    taxon = eco_findtaxonbytaxid(ptax, taxid);

    if (taxon)
        rep = eco_getspecies(taxon,ptax);

    /* Unknown taxid and missing rank give the same typed NA. */
    if (!rep)
        return name ? ScalarString(R_NaString) : ScalarInteger(R_NaInt);

    return name ? mkString(rep->name) : ScalarInteger(rep->taxid);
}
/*
 * Return the genus a taxon belongs to.
 *
 * @param Rtaxonomy  a taxonomy.obitools instance
 * @param Rtaxid     integer vector; its first element is the positive
 *                   taxid to start from
 * @param Rname      logical vector; its first element selects the
 *                   result type
 *
 * @return the scientific name (Rname true) or the taxid (Rname false)
 *         of the genus; NA of the matching type when the taxid is
 *         unknown or has no node of that rank at or above it
 */
SEXP R_get_genus(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
{
    ecotx_t       *taxon;
    ecotx_t       *rep = NULL;
    ecotaxonomy_t *ptax;
    int            taxid;
    int            name;

    ptax = getTaxPointer(Rtaxonomy);

    if (! isInteger(Rtaxid))
        error("taxid not integer");

    /* Reading element 0 of an empty vector is out of bounds. */
    if (length(Rtaxid) < 1)
        error("taxid is empty");

    taxid = *INTEGER(Rtaxid);

    if (! (taxid > 0))
        error("taxid not positive");

    if (! isLogical(Rname) || length(Rname) < 1)
        error("name not logical");

    name = LOGICAL(Rname)[0];

    taxon = eco_findtaxonbytaxid(ptax, taxid);

    if (taxon)
        rep = eco_getgenus(taxon,ptax);

    /* Unknown taxid and missing rank give the same typed NA. */
    if (!rep)
        return name ? ScalarString(R_NaString) : ScalarInteger(R_NaInt);

    return name ? mkString(rep->name) : ScalarInteger(rep->taxid);
}
/*
 * R entry point: look up a taxid and return the taxon that eco_getfamily()
 * reports for it.
 *
 * @param Rtaxonomy  R object from which getTaxPointer() extracts the taxonomy
 * @param Rtaxid     integer vector; only its first element is used
 * @param Rname      logical vector; its first element selects the result type
 *
 * @return a length-one character vector holding the taxon name when Rname
 *         is true, otherwise a length-one integer vector holding its taxid.
 *         An NA of the matching type is returned when the taxid is not in
 *         the taxonomy or when eco_getfamily() returns NULL.
 *
 * Raises an R error when an argument has the wrong type or is empty, or
 * when the taxid is not strictly positive.
 */
SEXP R_get_family(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
{
    ecotaxonomy_t *ptax;
    ecotx_t       *taxon;
    ecotx_t       *rep = NULL;
    int            taxid;
    int            name;

    ptax = getTaxPointer(Rtaxonomy);

    if (! isInteger(Rtaxid))
        error("taxid not integer");
    /* Element 0 is read below: refuse zero-length vectors explicitly */
    if (LENGTH(Rtaxid) < 1)
        error("taxid is empty");
    taxid = INTEGER(Rtaxid)[0];
    if (! (taxid > 0))
        error("taxid not positive");

    if (! isLogical(Rname))
        error("name not logical");
    if (LENGTH(Rname) < 1)
        error("name is empty");
    name = LOGICAL(Rname)[0];

    taxon = eco_findtaxonbytaxid(ptax, taxid);
    if (taxon)
        rep = eco_getfamily(taxon,ptax);

    if (!rep)
        return name ? ScalarString(R_NaString) : ScalarInteger(R_NaInt);

    return name ? mkString(rep->name) : ScalarInteger(rep->taxid);
}
/*
 * R entry point: look up a taxid and return the taxon that eco_getkingdom()
 * reports for it.
 *
 * @param Rtaxonomy  R object from which getTaxPointer() extracts the taxonomy
 * @param Rtaxid     integer vector; only its first element is used
 * @param Rname      logical vector; its first element selects the result type
 *
 * @return a length-one character vector holding the taxon name when Rname
 *         is true, otherwise a length-one integer vector holding its taxid.
 *         An NA of the matching type is returned when the taxid is not in
 *         the taxonomy or when eco_getkingdom() returns NULL.
 *
 * Raises an R error when an argument has the wrong type or is empty, or
 * when the taxid is not strictly positive.
 */
SEXP R_get_kingdom(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
{
    ecotaxonomy_t *ptax;
    ecotx_t       *taxon;
    ecotx_t       *rep = NULL;
    int            taxid;
    int            name;

    ptax = getTaxPointer(Rtaxonomy);

    if (! isInteger(Rtaxid))
        error("taxid not integer");
    /* Element 0 is read below: refuse zero-length vectors explicitly */
    if (LENGTH(Rtaxid) < 1)
        error("taxid is empty");
    taxid = INTEGER(Rtaxid)[0];
    if (! (taxid > 0))
        error("taxid not positive");

    if (! isLogical(Rname))
        error("name not logical");
    if (LENGTH(Rname) < 1)
        error("name is empty");
    name = LOGICAL(Rname)[0];

    taxon = eco_findtaxonbytaxid(ptax, taxid);
    if (taxon)
        rep = eco_getkingdom(taxon,ptax);

    if (!rep)
        return name ? ScalarString(R_NaString) : ScalarInteger(R_NaInt);

    return name ? mkString(rep->name) : ScalarInteger(rep->taxid);
}
/*
 * R entry point: look up a taxid and return the taxon that
 * eco_getsuperkingdom() reports for it.
 *
 * @param Rtaxonomy  R object from which getTaxPointer() extracts the taxonomy
 * @param Rtaxid     integer vector; only its first element is used
 * @param Rname      logical vector; its first element selects the result type
 *
 * @return a length-one character vector holding the taxon name when Rname
 *         is true, otherwise a length-one integer vector holding its taxid.
 *         An NA of the matching type is returned when the taxid is not in
 *         the taxonomy or when eco_getsuperkingdom() returns NULL.
 *
 * Raises an R error when an argument has the wrong type or is empty, or
 * when the taxid is not strictly positive.
 */
SEXP R_get_superkingdom(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
{
    ecotaxonomy_t *ptax;
    ecotx_t       *taxon;
    ecotx_t       *rep = NULL;
    int            taxid;
    int            name;

    ptax = getTaxPointer(Rtaxonomy);

    if (! isInteger(Rtaxid))
        error("taxid not integer");
    /* Element 0 is read below: refuse zero-length vectors explicitly */
    if (LENGTH(Rtaxid) < 1)
        error("taxid is empty");
    taxid = INTEGER(Rtaxid)[0];
    if (! (taxid > 0))
        error("taxid not positive");

    if (! isLogical(Rname))
        error("name not logical");
    if (LENGTH(Rname) < 1)
        error("name is empty");
    name = LOGICAL(Rname)[0];

    taxon = eco_findtaxonbytaxid(ptax, taxid);
    if (taxon)
        rep = eco_getsuperkingdom(taxon,ptax);

    if (!rep)
        return name ? ScalarString(R_NaString) : ScalarInteger(R_NaInt);

    return name ? mkString(rep->name) : ScalarInteger(rep->taxid);
}
/*
 * R entry point: look up a taxid and return its parent taxon.
 *
 * @param Rtaxonomy  R object from which getTaxPointer() extracts the taxonomy
 * @param Rtaxid     integer vector; only its first element is used
 * @param Rname      logical vector; its first element selects the result type
 *
 * @return a length-one character vector holding the parent's name when
 *         Rname is true, otherwise a length-one integer vector holding the
 *         parent's taxid. An NA of the matching type is returned when the
 *         taxid is not in the taxonomy, when the taxon has no parent
 *         pointer, or when the parent carries the queried taxid itself.
 *
 * Raises an R error when an argument has the wrong type or is empty, or
 * when the taxid is not strictly positive.
 */
SEXP R_get_parent(SEXP Rtaxonomy,SEXP Rtaxid,SEXP Rname)
{
    ecotaxonomy_t *ptax;
    ecotx_t       *taxon;
    ecotx_t       *rep = NULL;
    int            taxid;
    int            name;

    ptax = getTaxPointer(Rtaxonomy);

    if (! isInteger(Rtaxid))
        error("taxid not integer");
    /* Element 0 is read below: refuse zero-length vectors explicitly */
    if (LENGTH(Rtaxid) < 1)
        error("taxid is empty");
    taxid = INTEGER(Rtaxid)[0];
    if (! (taxid > 0))
        error("taxid not positive");

    if (! isLogical(Rname))
        error("name not logical");
    if (LENGTH(Rname) < 1)
        error("name is empty");
    name = LOGICAL(Rname)[0];

    taxon = eco_findtaxonbytaxid(ptax, taxid);
    if (taxon)
        rep = taxon->parent;

    /*
     * A parent with the queried taxid means the taxon is its own parent;
     * a NULL parent pointer is treated the same way instead of being
     * dereferenced.
     */
    if (!rep || rep->taxid == taxid)
        return name ? ScalarString(R_NaString) : ScalarInteger(R_NaInt);

    return name ? mkString(rep->name) : ScalarInteger(rep->taxid);
}
/*
 * R entry point: check that a taxid exists in the taxonomy.
 *
 * @param Rtaxonomy  R object from which getTaxPointer() extracts the taxonomy
 * @param Rtaxid     integer vector; only its first element is used
 *
 * @return a length-one integer vector holding the taxid stored in the taxon
 *         that eco_findtaxonbytaxid() returns, or NA when the input is not
 *         strictly positive or no taxon is found.
 *
 * Raises an R error when Rtaxid is not an integer vector or is empty.
 */
SEXP R_validate_taxid(SEXP Rtaxonomy,SEXP Rtaxid)
{
    ecotaxonomy_t *ptax;
    ecotx_t       *taxon;
    int            taxid;

    ptax = getTaxPointer(Rtaxonomy);

    if (! isInteger(Rtaxid))
        error("taxid not integer");
    /* Element 0 is read below: refuse zero-length vectors explicitly */
    if (LENGTH(Rtaxid) < 1)
        error("taxid is empty");
    taxid = INTEGER(Rtaxid)[0];

    /* Non-positive values (NA_integer_ included) can never be valid */
    if (! (taxid > 0))
        return ScalarInteger(R_NaInt);

    taxon = eco_findtaxonbytaxid(ptax, taxid);
    if (!taxon)
        return ScalarInteger(R_NaInt);

    return ScalarInteger(taxon->taxid);
}
/*
 * R entry point: ask eco_isundertaxon() whether a taxon lies under a given
 * parent taxid.
 *
 * @param Rtaxonomy  R object from which getTaxPointer() extracts the taxonomy
 * @param Rtaxid     integer vector; its first element is the taxon to test
 * @param Rparent    integer vector; its first element is the candidate
 *                   ancestor taxid
 *
 * @return a length-one logical vector built from the eco_isundertaxon()
 *         result, or a length-one integer NA when either taxid is not
 *         strictly positive or is absent from the taxonomy.
 *
 * Raises an R error when Rparent or Rtaxid is not an integer vector or is
 * empty. Rparent is validated (and looked up) before Rtaxid.
 */
SEXP R_is_under_taxon(SEXP Rtaxonomy, SEXP Rtaxid, SEXP Rparent)
{
    ecotaxonomy_t *ptax;
    ecotx_t       *taxon;
    int            taxid;
    int            parent;

    ptax = getTaxPointer(Rtaxonomy);

    if (! isInteger(Rparent))
        error("parent not integer");
    /* Element 0 is read below: refuse zero-length vectors explicitly */
    if (LENGTH(Rparent) < 1)
        error("parent is empty");
    parent = INTEGER(Rparent)[0];
    if (parent <= 0)
        return ScalarInteger(R_NaInt);

    /* The parent only has to exist; the lookup result itself is not kept */
    if (!eco_findtaxonbytaxid(ptax, parent))
        return ScalarInteger(R_NaInt);

    if (! isInteger(Rtaxid))
        error("taxid not integer");
    if (LENGTH(Rtaxid) < 1)
        error("taxid is empty");
    taxid = INTEGER(Rtaxid)[0];
    if (taxid <= 0)
        return ScalarInteger(R_NaInt);

    taxon = eco_findtaxonbytaxid(ptax, taxid);
    if (!taxon)
        return ScalarInteger(R_NaInt);

    return ScalarLogical(eco_isundertaxon(taxon, parent));
}
/*
 * R entry point: return the `farest` field stored on a taxon.
 *
 * @param Rtaxonomy  R object from which getTaxPointer() extracts the taxonomy
 * @param Rtaxid     integer vector; only its first element is used
 *
 * @return a length-one integer vector holding taxon->farest, or NA when the
 *         taxid is not strictly positive or is absent from the taxonomy.
 *
 * Raises an R error when Rtaxid is not an integer vector or is empty.
 */
SEXP R_longest_path(SEXP Rtaxonomy,SEXP Rtaxid)
{
    ecotaxonomy_t *ptax;
    ecotx_t       *taxon;
    int            taxid;

    ptax = getTaxPointer(Rtaxonomy);

    if (! isInteger(Rtaxid))
        error("taxid not integer");
    /* Element 0 is read below: refuse zero-length vectors explicitly */
    if (LENGTH(Rtaxid) < 1)
        error("taxid is empty");
    taxid = INTEGER(Rtaxid)[0];
    if (taxid <= 0)
        return ScalarInteger(R_NaInt);

    taxon = eco_findtaxonbytaxid(ptax, taxid);
    if (!taxon)
        return ScalarInteger(R_NaInt);

    return ScalarInteger(taxon->farest);
}
/*
 * R entry point: list every rank label known to the taxonomy.
 *
 * @param Rtaxonomy  R object from which getTaxPointer() extracts the taxonomy
 *
 * @return a character vector with one element per entry of ptax->ranks,
 *         in the order the labels are stored
 */
SEXP R_rank_list(SEXP Rtaxonomy)
{
    ecotaxonomy_t *ptax   = getTaxPointer(Rtaxonomy);
    int            total  = ptax->ranks->count;
    SEXP           labels = PROTECT(allocVector(STRSXP, total));
    int            idx    = 0;

    /* labels stays protected while mkChar() allocates */
    while (idx < total) {
        SET_STRING_ELT(labels, idx, mkChar(ptax->ranks->label[idx]));
        idx++;
    }

    UNPROTECT(1);
    return labels;
}
/*
 * R entry point: list the taxid of every taxon stored in the taxonomy.
 *
 * @param Rtaxonomy  R object from which getTaxPointer() extracts the taxonomy
 *
 * @return an integer vector with one taxid per entry of ptax->taxons,
 *         in storage order
 */
SEXP R_taxid_list(SEXP Rtaxonomy)
{
    ecotaxonomy_t *ptax   = getTaxPointer(Rtaxonomy);
    int            total  = ptax->taxons->count;
    SEXP           result = PROTECT(allocVector(INTSXP, total));
    int           *out    = INTEGER(result);
    int            idx;

    for (idx = 0; idx < total; idx++)
        out[idx] = ptax->taxons->taxon[idx].taxid;

    UNPROTECT(1);
    return result;
}
/*
 * R entry point: return the maxtaxid value recorded in the taxonomy.
 *
 * @param Rtaxonomy  R object from which getTaxPointer() extracts the taxonomy
 *
 * @return a length-one integer vector holding ptax->taxons->maxtaxid
 */
SEXP R_max_taxid(SEXP Rtaxonomy)
{
    ecotaxonomy_t *ptax = getTaxPointer(Rtaxonomy);

    return ScalarInteger(ptax->taxons->maxtaxid);
}
/*
 * R entry point: return the number of taxa stored in the taxonomy.
 *
 * @param Rtaxonomy  R object from which getTaxPointer() extracts the taxonomy
 *
 * @return a length-one integer vector holding ptax->taxons->count
 */
SEXP R_length_taxonomy(SEXP Rtaxonomy)
{
    ecotaxonomy_t *ptax = getTaxPointer(Rtaxonomy);

    return ScalarInteger(ptax->taxons->count);
}
/*
 * Append one taxid to the growable match buffer, doubling the buffer when
 * it is full.
 *
 * @return 0 on success, -1 when the buffer could not be enlarged; in that
 *         case *buffer is left untouched and is still owned by the caller.
 */
static int ecofind_append_taxid(int32_t **buffer, size_t *capacity,
                                int32_t *count, int32_t taxid)
{
    if ((size_t) *count == *capacity) {
        size_t   grown_capacity = *capacity * 2;
        /* Keep the old pointer until realloc is known to have succeeded */
        int32_t *grown = realloc(*buffer, grown_capacity * sizeof **buffer);

        if (grown == NULL)
            return -1;
        *buffer   = grown;
        *capacity = grown_capacity;
    }

    (*buffer)[*count] = taxid;
    (*count)++;
    return 0;
}

/*
 * R entry point: return the taxids whose name matches a regular expression.
 *
 * @param Rtaxonomy     R object from which getTaxPointer() extracts the
 *                      taxonomy
 * @param Rpattern      character vector; its first element is the SLRE
 *                      pattern, matched with SLRE_IGNORE_CASE
 * @param Rrank         NULL, or a character vector whose first element is
 *                      a rank label; when given, only taxa of that rank
 *                      are kept
 * @param Ralternative  logical vector; when its first element is true and
 *                      the taxonomy has a names table, that table is
 *                      searched instead of the taxon table
 *
 * @return an integer vector of matching taxids in table order. A taxon may
 *         appear several times when several entries of the names table
 *         match. Any non-positive slre_match() result, error codes
 *         included, counts as "no match".
 *
 * Raises an R error when an argument has the wrong type or is empty, or
 * when memory for the match list cannot be obtained.
 */
SEXP R_ecofind(SEXP Rtaxonomy, SEXP Rpattern, SEXP Rrank, SEXP Ralternative)
{
    ecotaxonomy_t *ptax;
    econame_t     *name;
    const char    *pattern;
    const char    *rankname = NULL;
    const char    *candidate;
    int            re_match;
    int            rankfilter = 1;
    int            alternative;
    int           *ptaxid;
    int32_t       *buffer;
    int32_t        nummatch = 0;
    size_t         bsize = 100;
    size_t         j;
    SEXP           taxids;

    ptax = getTaxPointer(Rtaxonomy);

    if (! isString(Rpattern))
        error("pattern not a string");
    if (LENGTH(Rpattern) < 1)
        error("pattern is empty");
    pattern = CHAR(STRING_ELT(Rpattern,0));

    if (! isNull(Rrank))
    {
        if (! isString(Rrank))
            error("rank not a string");
        if (LENGTH(Rrank) < 1)
            error("rank is empty");
        rankname = CHAR(STRING_ELT(Rrank,0));
    }

    if (! isLogical(Ralternative))
        error("alternative not a logical");
    if (LENGTH(Ralternative) < 1)
        error("alternative is empty");
    alternative = LOGICAL(Ralternative)[0];

    /* Every argument is validated before the C heap is touched, so the */
    /* error() calls above cannot leak the buffer.                      */
    buffer = malloc(bsize * sizeof *buffer);
    if (buffer == NULL)
        error("Cannot allocate memory for the taxid list");

    if (alternative && ptax->names != NULL)
    {
        /* Search the names table */
        for (j = 0, name = ptax->names->names;
             j < ptax->names->count;
             name++, j++)
        {
            candidate = name->name;

            if (rankname)
                rankfilter = !(strcmp(rankname,
                                      ptax->ranks->label[name->taxon->rank]));

            re_match = slre_match(pattern, candidate,
                                  (int) strlen(candidate),
                                  NULL, 0,
                                  SLRE_IGNORE_CASE);

            if (re_match > 0 && rankfilter &&
                ecofind_append_taxid(&buffer, &bsize, &nummatch,
                                     name->taxon->taxid) < 0)
            {
                /* error() does not return: release the buffer first */
                free(buffer);
                error("Cannot allocate memory for the taxid list");
            }
        }
    }
    else
    {
        /* Search the taxon table */
        for (j = 0; j < ptax->taxons->count; j++)
        {
            candidate = ptax->taxons->taxon[j].name;

            if (rankname)
                rankfilter = !(strcmp(rankname,
                                      ptax->ranks->label[ptax->taxons->taxon[j].rank]));

            re_match = slre_match(pattern, candidate,
                                  (int) strlen(candidate),
                                  NULL, 0,
                                  SLRE_IGNORE_CASE);

            if (re_match > 0 && rankfilter &&
                ecofind_append_taxid(&buffer, &bsize, &nummatch,
                                     ptax->taxons->taxon[j].taxid) < 0)
            {
                /* error() does not return: release the buffer first */
                free(buffer);
                error("Cannot allocate memory for the taxid list");
            }
        }
    }

    /* Copy the matches into an R-owned vector and drop the scratch buffer */
    taxids = PROTECT(NEW_INTEGER(nummatch));
    ptaxid = INTEGER(taxids);
    for (j = 0; j < (size_t) nummatch; j++)
        ptaxid[j] = buffer[j];

    free(buffer);
    UNPROTECT(1);
    return taxids;
}

6
src/robitax.h Normal file
View File

@@ -0,0 +1,6 @@
#include "ecoPCR.h"
/*
 * Return the ecotaxonomy_t pointer held by the R taxonomy object.
 * Defined outside this header.
 */
ecotaxonomy_t *getTaxPointer(SEXP Rtaxonomy);
/*
 * NOTE(review): defined outside this header; presumably releases the
 * taxonomy attached to Rtaxonomy -- confirm against the implementation.
 */
SEXP R_delete_taxonomy(SEXP Rtaxonomy);

433
src/slre.c Executable file
View File

@@ -0,0 +1,433 @@
/*
* Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
* Copyright (c) 2013 Cesanta Software Limited
* All rights reserved
*
* This library is dual-licensed: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation. For the terms of this
* license, see <http://www.gnu.org/licenses/>.
*
* You are free to use this library under the terms of the GNU General
* Public License, but WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU General Public License for more details.
*
* Alternatively, you can license this library under a commercial
* license, as set out in <http://cesanta.com/products.html>.
*/
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include "slre.h"
/* Capacity of regex_info.branches: maximum number of '|' in one regex */
#define MAX_BRANCHES 100
/* Capacity of regex_info.brackets: the implicit whole-regex pair plus '(' groups */
#define MAX_BRACKETS 100
/*
 * Return error_code from the enclosing function when condition holds.
 * Expands to a bare `if`, so use it only as a full statement.
 */
#define FAIL_IF(condition, error_code) if (condition) return (error_code)
#ifndef ARRAY_SIZE
/* Element count of a true array (not valid on pointers) */
#define ARRAY_SIZE(ar) (sizeof(ar) / sizeof((ar)[0]))
#endif
/*
 * DBG((fmt, ...)) prints through printf when SLRE_DEBUG is defined and
 * expands to nothing otherwise; note the double parentheses at call sites.
 */
#ifdef SLRE_DEBUG
#define DBG(x) printf x
#else
#define DBG(x)
#endif
/*
 * One parenthesised group of the regex. Entry 0 of regex_info.brackets
 * describes the whole regex rather than an explicit '(' ... ')' pair.
 */
struct bracket_pair {
const char *ptr; /* Points to the first char after '(' in regex */
int len; /* Length of the text between '(' and ')'; -1 while the group is still open during parsing */
int branches; /* Index of this pair's first entry in regex_info.branches */
int num_branches; /* Number of '|' in this bracket pair */
};
/* One alternation point ('|') found while parsing the regex. */
struct branch {
int bracket_index; /* Index into regex_info.brackets of the pair */
/* that contains this '|' */
const char *schlong; /* points to the '|' character in the regex */
};
/*
 * Parsed form of one regular expression plus the caller's match options.
 * Filled in by foo() and read by the matching routines.
 */
struct regex_info {
/*
* Describes all bracket pairs in the regular expression.
* First entry is always present, and grabs the whole regex.
*/
struct bracket_pair brackets[MAX_BRACKETS];
int num_brackets;
/*
* Describes alternations ('|' operators) in the regular expression.
* Each branch falls into a specific bracket pair.
*/
struct branch branches[MAX_BRANCHES];
int num_branches;
/* Array of captures provided by the user; may be NULL */
struct slre_cap *caps;
int num_caps;
/* Bit flags, e.g. SLRE_IGNORE_CASE (declared in slre.h) */
int flags;
};
/*
 * Tell whether the byte at *s may legally follow a backslash.
 * The search includes the list's terminating NUL, so a NUL byte is
 * reported as a metacharacter too.
 */
static int is_metacharacter(const unsigned char *s) {
  static const char escapable[] = "^$().[]*+?|\\Ssdbfnrtv";

  return memchr(escapable, *s, sizeof escapable) != NULL;
}
/*
 * Length in bytes of the single regex operator starting at re:
 * 4 for a "\xHH" escape, 2 for any other backslash escape, 1 otherwise.
 */
static int op_len(const char *re) {
  if (re[0] != '\\') {
    return 1;
  }
  return re[1] == 'x' ? 4 : 2;
}
/*
 * Measure a character set whose body starts at re (just after '[').
 * Walks operator by operator until ']' or until re_len is reached, and
 * returns the walked length plus one, or -1 when the walk overshoots re_len.
 */
static int set_len(const char *re, int re_len) {
  int pos;

  for (pos = 0; pos < re_len && re[pos] != ']'; pos += op_len(re + pos)) {
    /* advancing is done in the loop header */
  }

  if (pos > re_len) {
    return -1;
  }
  return pos + 1;
}
/*
 * Length of the regex chunk starting at re: a whole "[...]" set when re
 * starts with '[', otherwise a single operator as measured by op_len().
 */
static int get_op_len(const char *re, int re_len) {
  if (re[0] == '[') {
    return set_len(re + 1, re_len - 1) + 1;
  }
  return op_len(re);
}
/* Return 1 when the regex byte at re is one of '*', '+' or '?', else 0. */
static int is_quantifier(const char *re) {
  switch (re[0]) {
    case '*':
    case '+':
    case '?':
      return 1;
    default:
      return 0;
  }
}
/*
 * Convert one hexadecimal digit character to its value.
 * Letters are expected in lower case: 'a' - 'W' is 10, so 'a'..'f' map
 * to 10..15.
 */
static int toi(int x) {
  if (isdigit(x)) {
    return x - '0';
  }
  return x - 'W';
}
/* Decode the two hexadecimal digits at s[0] and s[1] into one byte value. */
static int hextoi(const unsigned char *s) {
  const int high_nibble = toi(tolower(s[0]));
  const int low_nibble = toi(tolower(s[1]));

  return (high_nibble << 4) | low_nibble;
}
/*
 * Match the single regex operator at re against the single byte at s.
 *
 * Returns 1 when the byte matches, SLRE_NO_MATCH when it does not (a bare
 * '$' never matches a byte), and SLRE_INTERNAL_ERROR for a '|' operator.
 */
static int match_op(const unsigned char *re, const unsigned char *s,
                    struct regex_info *info) {
  int matched;

  switch (*re) {
    case '\\':
      /* Backslash escapes: character classes, control chars, \xHH */
      switch (re[1]) {
        case 'S': matched = !isspace(*s); break;
        case 's': matched = isspace(*s) != 0; break;
        case 'd': matched = isdigit(*s) != 0; break;
        case 'b': matched = (*s == '\b'); break;
        case 'f': matched = (*s == '\f'); break;
        case 'n': matched = (*s == '\n'); break;
        case 'r': matched = (*s == '\r'); break;
        case 't': matched = (*s == '\t'); break;
        case 'v': matched = (*s == '\v'); break;
        case 'x':
          /* \xHH: compare with the byte whose hex code is HH */
          matched = (hextoi(re + 2) == *s);
          break;
        default:
          /* Escaped literal; the escape was validated by the parser */
          matched = (re[1] == s[0]);
          break;
      }
      break;

    case '|':
      return SLRE_INTERNAL_ERROR;

    case '$':
      return SLRE_NO_MATCH;

    case '.':
      matched = 1;
      break;

    default:
      if (info->flags & SLRE_IGNORE_CASE) {
        matched = (tolower(*re) == tolower(*s));
      } else {
        matched = (*re == *s);
      }
      break;
  }

  return matched ? 1 : SLRE_NO_MATCH;
}
/*
 * Match the byte at s against a character set.
 *
 * @param re      set body, starting just after '[' (a leading '^' inverts it)
 * @param re_len  length of the set body
 * @param s       subject byte to test
 * @param info    regex state; only info->flags is read here
 *
 * @return 1 when the byte is accepted by the set, -1 otherwise
 *
 * Ranges such as "a-z" are compared case-insensitively only when
 * SLRE_IGNORE_CASE is set. The flag is tested with a bitwise AND; a logical
 * AND with swapped branches would make ranges case-sensitive exactly when
 * the caller asks to ignore case.
 */
static int match_set(const char *re, int re_len, const char *s,
                     struct regex_info *info) {
  int len = 0, result = -1, invert = re[0] == '^';

  if (invert) re++, re_len--;

  while (len <= re_len && re[len] != ']' && result <= 0) {
    /* Support character range */
    if (re[len] != '-' && re[len + 1] == '-' && re[len + 2] != ']' &&
        re[len + 2] != '\0') {
      if (info->flags & SLRE_IGNORE_CASE) {
        /* <ctype.h> functions need values representable as unsigned char */
        const int subject = tolower((unsigned char) *s);

        result = subject >= tolower((unsigned char) re[len]) &&
                 subject <= tolower((unsigned char) re[len + 2]);
      } else {
        result = *s >= re[len] && *s <= re[len + 2];
      }
      len += 3;
    } else {
      result = match_op((unsigned char *) re + len, (unsigned char *) s, info);
      len += op_len(re + len);
    }
  }

  return (!invert && result > 0) || (invert && result <= 0) ? 1 : -1;
}
static int doh(const char *s, int s_len, struct regex_info *info, int bi);
/*
 * Match one branch-free stretch of the regex against the subject.
 *
 * re/re_len : regex fragment to match
 * s/s_len   : subject text
 * info      : parsed regex state (brackets, captures, flags)
 * bi        : index of the bracket pair the fragment belongs to
 *
 * Returns the number of subject bytes consumed, or a negative SLRE_* code.
 * When a '*' or '+' quantifier is met, the remainder of the fragment is
 * matched recursively and the function returns from inside the loop.
 */
static int bar(const char *re, int re_len, const char *s, int s_len,
struct regex_info *info, int bi) {
/* i is offset in re, j is offset in s, bi is brackets index */
int i, j, n, step;
for (i = j = 0; i < re_len && j <= s_len; i += step) {
/* Handle quantifiers. Get the length of the chunk. */
/* A '(' chunk spans the whole group: its body length plus both parentheses */
step = re[i] == '(' ? info->brackets[bi + 1].len + 2 :
get_op_len(re + i, re_len - i);
DBG(("%s [%.*s] [%.*s] re_len=%d step=%d i=%d j=%d\n", __func__,
re_len - i, re + i, s_len - j, s + j, re_len, step, i, j));
FAIL_IF(is_quantifier(&re[i]), SLRE_UNEXPECTED_QUANTIFIER);
FAIL_IF(step <= 0, SLRE_INVALID_CHARACTER_SET);
if (i + step < re_len && is_quantifier(re + i + step)) {
DBG(("QUANTIFIER: [%.*s]%c [%.*s]\n", step, re + i,
re[i + step], s_len - j, s + j));
if (re[i + step] == '?') {
/* Optional chunk: consume it if it matches, otherwise consume nothing */
int result = bar(re + i, step, s + j, s_len - j, info, bi);
j += result > 0 ? result : 0;
/* Skip the '?' itself; the loop header then adds step */
i++;
} else if (re[i + step] == '+' || re[i + step] == '*') {
/* j2: subject offset after the repetitions tried so far;  */
/* nj: best end offset found for the whole remaining match */
int j2 = j, nj = j, n1, n2 = -1, ni, non_greedy = 0;
/* Points to the regexp code after the quantifier */
ni = i + step + 1;
if (ni < re_len && re[ni] == '?') {
non_greedy = 1;
ni++;
}
do {
if ((n1 = bar(re + i, step, s + j2, s_len - j2, info, bi)) > 0) {
j2 += n1;
}
if (re[i + step] == '+' && n1 < 0) break;
if (ni >= re_len) {
/* After quantifier, there is nothing */
nj = j2;
} else if ((n2 = bar(re + ni, re_len - ni, s + j2,
s_len - j2, info, bi)) >= 0) {
/* Regex after quantifier matched */
nj = j2 + n2;
}
if (nj > j && non_greedy) break;
} while (n1 > 0);
/* '*' whose chunk failed to match: try the rest with zero repetitions */
if (n1 < 0 && re[i + step] == '*' &&
(n2 = bar(re + ni, re_len - ni, s + j, s_len - j, info, bi)) > 0) {
nj = j + n2;
}
DBG(("STAR/PLUS END: %d %d %d %d %d\n", j, nj, re_len - ni, n1, n2));
FAIL_IF(re[i + step] == '+' && nj == j, SLRE_NO_MATCH);
/* If while loop body above was not executed for the * quantifier, */
/* make sure the rest of the regex matches */
FAIL_IF(nj == j && ni < re_len && n2 < 0, SLRE_NO_MATCH);
/* Returning here cause we've matched the rest of RE already */
return nj;
}
continue;
}
if (re[i] == '[') {
n = match_set(re + i + 1, re_len - (i + 2), s + j, info);
DBG(("SET %.*s [%.*s] -> %d\n", step, re + i, s_len - j, s + j, n));
FAIL_IF(n <= 0, SLRE_NO_MATCH);
j += n;
} else if (re[i] == '(') {
n = SLRE_NO_MATCH;
bi++;
FAIL_IF(bi >= info->num_brackets, SLRE_INTERNAL_ERROR);
DBG(("CAPTURING [%.*s] [%.*s] [%s]\n",
step, re + i, s_len - j, s + j, re + i + step));
if (re_len - (i + step) <= 0) {
/* Nothing follows brackets */
n = doh(s + j, s_len - j, info, bi);
} else {
/* Shrink the text offered to the group until the rest of the regex matches too */
int j2;
for (j2 = 0; j2 <= s_len - j; j2++) {
if ((n = doh(s + j, s_len - (j + j2), info, bi)) >= 0 &&
bar(re + i + step, re_len - (i + step),
s + j + n, s_len - (j + n), info, bi) >= 0) break;
}
}
DBG(("CAPTURED [%.*s] [%.*s]:%d\n", step, re + i, s_len - j, s + j, n));
FAIL_IF(n < 0, n);
/* Group bi fills caps[bi - 1]: bracket 0 is the implicit whole-regex pair */
if (info->caps != NULL) {
info->caps[bi - 1].ptr = s + j;
info->caps[bi - 1].len = n;
}
j += n;
} else if (re[i] == '^') {
FAIL_IF(j != 0, SLRE_NO_MATCH);
} else if (re[i] == '$') {
FAIL_IF(j != s_len, SLRE_NO_MATCH);
} else {
FAIL_IF(j >= s_len, SLRE_NO_MATCH);
n = match_op((unsigned char *) (re + i), (unsigned char *) (s + j), info);
FAIL_IF(n <= 0, n);
j += n;
}
}
return j;
}
/*
 * Process branch points.
 *
 * Tries the alternatives of bracket pair bi (the pieces separated by '|')
 * in regex order, handing each one to bar(). Stops at the first
 * alternative whose result is greater than zero and returns that result;
 * otherwise returns the result of the last alternative tried.
 */
static int doh(const char *s, int s_len, struct regex_info *info, int bi) {
const struct bracket_pair *b = &info->brackets[bi];
int i = 0, len, result;
const char *p;
do {
/* p: start of alternative i -- the group start, or just after the previous '|' */
p = i == 0 ? b->ptr : info->branches[b->branches + i - 1].schlong + 1;
/* len: up to the next '|', or to the end of the group for the last alternative */
len = b->num_branches == 0 ? b->len :
i == b->num_branches ? (int) (b->ptr + b->len - p) :
(int) (info->branches[b->branches + i].schlong - p);
DBG(("%s %d %d [%.*s] [%.*s]\n", __func__, bi, i, len, p, s_len, s));
result = bar(p, len, s, s_len, info, bi);
DBG(("%s <- %d\n", __func__, result));
} while (result <= 0 && i++ < b->num_branches); /* At least 1 iteration */
return result;
}
/*
 * Search the subject for the first position at which the whole regex
 * (bracket pair 0) matches.
 *
 * Returns the subject offset just past the match, or the negative code of
 * the last attempt. When the regex starts with '^', only offset 0 is tried.
 */
static int baz(const char *s, int s_len, struct regex_info *info) {
  const int anchored = (info->brackets[0].ptr[0] == '^');
  int outcome = -1;
  int start = 0;

  while (start <= s_len) {
    outcome = doh(s + start, s_len - start, info, 0);
    if (outcome >= 0) {
      return outcome + start;
    }
    if (anchored) {
      return outcome;
    }
    start++;
  }
  return outcome;
}
static void setup_branch_points(struct regex_info *info) {
int i, j;
struct branch tmp;
/* First, sort branches. Must be stable, no qsort. Use bubble algo. */
for (i = 0; i < info->num_branches; i++) {
for (j = i + 1; j < info->num_branches; j++) {
if (info->branches[i].bracket_index > info->branches[j].bracket_index) {
tmp = info->branches[i];
info->branches[i] = info->branches[j];
info->branches[j] = tmp;
}
}
}
/*
* For each bracket, set their branch points. This way, for every bracket
* (i.e. every chunk of regex) we know all branch points before matching.
*/
for (i = j = 0; i < info->num_brackets; i++) {
info->brackets[i].num_branches = 0;
info->brackets[i].branches = j;
while (j < info->num_branches && info->branches[j].bracket_index == i) {
info->brackets[i].num_branches++;
j++;
}
}
}
/*
 * Parse the regex, then run the match.
 *
 * Makes one pass over re to record every bracket pair and '|' branch in
 * info and to validate backslash escapes and bracket balance; then calls
 * setup_branch_points() and baz().
 *
 * Returns the value of baz() or a negative SLRE_* code when the regex is
 * rejected. info->num_brackets and info->num_branches must be zero and
 * info->num_caps must be set by the caller before the call.
 */
static int foo(const char *re, int re_len, const char *s, int s_len,
struct regex_info *info) {
int i, step, depth = 0;
/* First bracket captures everything */
info->brackets[0].ptr = re;
info->brackets[0].len = re_len;
info->num_brackets = 1;
/* Make a single pass over regex string, memorize brackets and branches */
for (i = 0; i < re_len; i += step) {
step = get_op_len(re + i, re_len - i);
if (re[i] == '|') {
FAIL_IF(info->num_branches >= (int) ARRAY_SIZE(info->branches),
SLRE_TOO_MANY_BRANCHES);
/* Owner of the branch: the newest bracket if it is still open (len == -1), */
/* otherwise the bracket whose index equals the current depth               */
info->branches[info->num_branches].bracket_index =
info->brackets[info->num_brackets - 1].len == -1 ?
info->num_brackets - 1 : depth;
info->branches[info->num_branches].schlong = &re[i];
info->num_branches++;
} else if (re[i] == '\\') {
/* A backslash must be followed by at least one character */
FAIL_IF(i >= re_len - 1, SLRE_INVALID_METACHARACTER);
if (re[i + 1] == 'x') {
/* Hex digit specification must follow */
FAIL_IF(re[i + 1] == 'x' && i >= re_len - 3,
SLRE_INVALID_METACHARACTER);
FAIL_IF(re[i + 1] == 'x' && !(isxdigit(re[i + 2]) &&
isxdigit(re[i + 3])), SLRE_INVALID_METACHARACTER);
} else {
FAIL_IF(!is_metacharacter((unsigned char *) re + i + 1),
SLRE_INVALID_METACHARACTER);
}
} else if (re[i] == '(') {
FAIL_IF(info->num_brackets >= (int) ARRAY_SIZE(info->brackets),
SLRE_TOO_MANY_BRACKETS);
depth++; /* Order is important here. Depth increments first. */
info->brackets[info->num_brackets].ptr = re + i + 1;
/* len == -1 marks the group as still open until its ')' is seen */
info->brackets[info->num_brackets].len = -1;
info->num_brackets++;
/* Checked only when the caller passed a positive num_caps */
FAIL_IF(info->num_caps > 0 && info->num_brackets - 1 > info->num_caps,
SLRE_CAPS_ARRAY_TOO_SMALL);
} else if (re[i] == ')') {
/* Close the newest bracket if it is still open, else the one at index depth */
int ind = info->brackets[info->num_brackets - 1].len == -1 ?
info->num_brackets - 1 : depth;
info->brackets[ind].len = (int) (&re[i] - info->brackets[ind].ptr);
DBG(("SETTING BRACKET %d [%.*s]\n",
ind, info->brackets[ind].len, info->brackets[ind].ptr));
depth--;
FAIL_IF(depth < 0, SLRE_UNBALANCED_BRACKETS);
/* An empty group "()" is reported as no match */
FAIL_IF(i > 0 && re[i - 1] == '(', SLRE_NO_MATCH);
}
}
FAIL_IF(depth != 0, SLRE_UNBALANCED_BRACKETS);
setup_branch_points(info);
return baz(s, s_len, info);
}
/*
 * Public entry point: match the NUL-terminated regexp against the first
 * s_len bytes of s.
 *
 * caps/num_caps describe the caller's capture array (caps may be NULL);
 * flags is a bit mask such as SLRE_IGNORE_CASE. The return value is the
 * one produced by foo().
 */
int slre_match(const char *regexp, const char *s, int s_len,
               struct slre_cap *caps, int num_caps, int flags) {
  struct regex_info info;
  int regexp_len;

  /* Only the scalar members need a value; foo() fills the arrays */
  info.caps = caps;
  info.num_caps = num_caps;
  info.flags = flags;
  info.num_branches = 0;
  info.num_brackets = 0;

  DBG(("========================> [%s] [%.*s]\n", regexp, s_len, s));

  regexp_len = (int) strlen(regexp);
  return foo(regexp, regexp_len, s, s_len, &info);
}

60
src/slre.h Executable file
View File

@@ -0,0 +1,60 @@
/*
* Copyright (c) 2004-2013 Sergey Lyubka <valenok@gmail.com>
* Copyright (c) 2013 Cesanta Software Limited
* All rights reserved
*
* This library is dual-licensed: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation. For the terms of this
* license, see <http://www.gnu.org/licenses/>.
*
* You are free to use this library under the terms of the GNU General
* Public License, but WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU General Public License for more details.
*
* Alternatively, you can license this library under a commercial
* license, as set out in <http://cesanta.com/products.html>.
*/
/*
* This is a regular expression library that implements a subset of Perl RE.
* Please refer to README.md for a detailed reference.
*/
#ifndef SLRE_HEADER_DEFINED
#define SLRE_HEADER_DEFINED
#ifdef __cplusplus
extern "C" {
#endif
/*
 * One captured group. The matcher stores a pointer into the subject
 * buffer, not a copy, so the text is not NUL-terminated at len.
 */
struct slre_cap {
const char *ptr; /* start of the captured text inside the subject buffer */
int len; /* number of bytes captured */
};
/*
 * Match a regular expression against a buffer.
 *
 * regexp   : NUL-terminated regular expression
 * buf      : subject text
 * buf_len  : number of bytes of buf to consider
 * caps     : array that receives one entry per '(' group, or NULL
 * num_caps : number of entries in caps
 * flags    : bit mask of the flags declared below (e.g. SLRE_IGNORE_CASE)
 *
 * Returns the offset in buf just past the end of the match, or one of the
 * negative failure codes defined below.
 */
int slre_match(const char *regexp, const char *buf, int buf_len,
struct slre_cap *caps, int num_caps, int flags);
/* Possible flags for slre_match() */
enum { SLRE_IGNORE_CASE = 1 };
/* slre_match() failure codes */
#define SLRE_NO_MATCH -1
#define SLRE_UNEXPECTED_QUANTIFIER -2
#define SLRE_UNBALANCED_BRACKETS -3
#define SLRE_INTERNAL_ERROR -4
#define SLRE_INVALID_CHARACTER_SET -5
#define SLRE_INVALID_METACHARACTER -6
#define SLRE_CAPS_ARRAY_TOO_SMALL -7
#define SLRE_TOO_MANY_BRANCHES -8
#define SLRE_TOO_MANY_BRACKETS -9
#ifdef __cplusplus
}
#endif
#endif /* SLRE_HEADER_DEFINED */