git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPCR/trunk@114 60f365c0-8329-0410-b2a4-ec073aeeaa1d

This commit is contained in:
2007-09-24 09:48:08 +00:00
parent 6cd1141130
commit bc4c7656c6
15 changed files with 1494 additions and 240 deletions

84
src/Makefile Normal file
View File

@ -0,0 +1,84 @@
EXEC=ecoPCR ecofind ecoisundertaxon
PCR_SRC= ecopcr.c
PCR_OBJ= $(patsubst %.c,%.o,$(PCR_SRC))
FIND_SRC= ecofind.c
FIND_OBJ= $(patsubst %.c,%.o,$(FIND_SRC))
IUT_SRC= ecoisundertaxon.c
IUT_OBJ= $(patsubst %.c,%.o,$(IUT_SRC))
SRCS= $(PCR_SRC) $(FIND_SRC) $(IUT_SRC)
LIB= -lecoPCR -lapat -lz -lm
LIBFILE= libapat/libapat.a \
libecoPCR/libecoPCR.a
include global.mk
all: $(EXEC)
########
#
# ecoPCR compilation
#
########
# executable compilation and link
ecoPCR: $(PCR_OBJ) $(LIBFILE)
$(CC) $(LDFLAGS) -o $@ $< $(LIBPATH) $(LIB)
########
#
# ecofind compilation
#
########
# executable compilation and link
ecofind: $(FIND_OBJ) $(LIBFILE)
$(CC) $(LDFLAGS) -o $@ $< $(LIBPATH) $(LIB)
########
#
# IsUnderTaxon compilation
#
########
# executable compilation and link
ecoisundertaxon: $(IUT_OBJ) $(LIBFILE)
$(CC) $(LDFLAGS) -o $@ $< $(LIBPATH) $(LIB)
########
#
# library compilation
#
########
libapat/libapat.a:
$(MAKE) -C libapat
libecoPCR/libecoPCR.a:
$(MAKE) -C libecoPCR
########
#
# project management
#
########
clean:
rm -f *.o
rm -f $(EXEC)
$(MAKE) -C libapat clean
$(MAKE) -C libecoPCR clean

345
src/ecofind.c Normal file
View File

@ -0,0 +1,345 @@
#include "libecoPCR/ecoPCR.h"
#include <regex.h>
#include <string.h>
#include <stdlib.h>
#include <getopt.h>
#include <stdio.h>
#define VERSION "0.1"
/**
* display the result
**/
static void printresult(ecotx_t *taxon,econame_t* name,ecotaxonomy_t *taxonomy)
{
char* rankname;
char* classname;
char* matchedname=taxon->name;
classname="scientific name";
if (name)
{
classname=name->classname;
matchedname=name->name;
}
rankname= taxonomy->ranks->label[taxon->rank];
printf("%10d \t| %15s \t|\t %-50s \t|\t %15s \t|\t %s\n",
taxon->taxid,
rankname,
matchedname,
classname,
taxon->name);
}
/**
* display header before printing any result
**/
static void printheader(void)
{
printf("# %12s \t| %15s \t|\t %-50s \t|\t %-15s \t|\t %s\n#\n",
"taxonomy id",
"taxonomy rank",
"name",
"class name",
"scientific name");
}
/**
* display son's list for given taxon
**/
static void get_son(ecotaxonomy_t *taxonomy, ecotx_t *taxon, int32_t *count, char *rankname)
{
int32_t i;
ecotx_t *current_taxon;
for ( i = 0, current_taxon = taxonomy->taxons->taxon;
i < taxonomy->taxons->count;
i++, current_taxon++)
{
if (taxon->taxid == current_taxon->parent->taxid)
{
if (rankname == NULL || !strcmp(rankname,taxonomy->ranks->label[current_taxon->rank]))
{
printresult(current_taxon, NULL, taxonomy);
(*count)++;
}
get_son(taxonomy,current_taxon,count,rankname);
}
}
}
/**
* display list of rank filter option (-l option)
**/
static void listfilteroptions(ecorankidx_t *ranks)
{
int32_t i;
printf("#\n");
for ( i=0;
i < ranks->count;
i++)
{
printf("# %s \n",ranks->label[i]);
}
printf("#\n");
}
/* ---------------------------------------- */
/* get back on given taxid taxonomic parent */
/* and display it */
/* ---------------------------------------- */
void gettaxidparents(int32_t taxid, ecotaxonomy_t *taxonomy, char *rankname)
{
ecotx_t *next_parent;
int32_t c = 0;
next_parent = eco_findtaxonbytaxid(taxonomy, taxid);
printheader();
printresult(next_parent, NULL,taxonomy);
while ( strcmp(next_parent->name, "root") )
{
next_parent = next_parent->parent;
if (rankname == NULL || !strcmp(rankname,taxonomy->ranks->label[next_parent->rank]))
{
printresult(next_parent, NULL,taxonomy);
c++;
}
}
printf("# %d parent(s) found\n#\n",c);
}
/**
* printout usage and exit
**/
#define PP fprintf(stderr,
static void ExitUsage(stat)
int stat;
{
PP "usage: ecofind [-d database] [-h] [-l] [-r taxonomic rank] [-p taxid] [-s taxid] <taxon name pattern> ... \n");
PP "type \"ecofind -h\" for help\n");
if (stat)
exit(stat);
}
#undef PP
/**
* printout help
**/
#define PP fprintf(stdout,
static void PrintHelp()
{
PP "------------------------------------------\n");
PP " ecofind Version %s\n", VERSION);
PP "------------------------------------------\n");
PP "synopsis : searching for taxonomic and rank and\n");
PP " taxonomy id for given regular expression patterns\n\n");
PP "usage: ecofind [options] <patterns>\n");
PP "------------------------------------------\n");
PP "options:\n");
PP "-a : [A]ll enable the search on all alternative names and not only scientific names.\n\n");
PP "-d : [D]atabase containing the taxonomy.\n");
PP " To match the expected format, the database\n");
PP " has to be formated first by the ecoPCRFormat.py\n");
PP " program located in the tools directory.\n");
PP " Write the database radical without any extension.\n\n");
PP "-h : [H]elp - print <this> help\n\n");
PP "-l : [L]ist all taxonomic rank available for -r option\n\n");
PP "-p : [P]arents : specifiying this option displays all parental tree's information for the given taxid.\n\n");
PP "-r : [R]estrict to given taxonomic rank\n\n");
PP "-s : [S]ons: specifiying this option displays all subtree's information for the given taxid.\n\n");
PP "arguments:\n");
PP "<taxon> name pattern bearing regular expressions\n\n");
PP "------------------------------------------\n");
PP " http://www.grenoble.prabi.fr/trac/ecoPCR/\n");
PP "------------------------------------------\n\n");
}
/* ----------------------------------------------- */
#define PATTERN_NUMBER 10
#define PATTERN_LENGHT 40
#define RESULT_LENGTH 100
int main(int argc, char **argv)
{
int32_t carg = 0;
int32_t nummatch = 0;
int32_t k,j = 0;
int32_t errflag = 0;
int32_t tax_count = 0;
int32_t alternative = 0;
char *prefix = NULL;
ecotaxonomy_t *taxonomy;
econame_t *name;
int32_t name_count;
int re_error;
int re_match;
regex_t re_preg;
int32_t uptree = 0;
int32_t subtree = 0;
char *rankname = NULL;
int32_t rankfilter = 1;
int32_t list = 0;
ecotx_t *subtree_parent;
int32_t count_son = 0;
while ((carg = getopt(argc, argv, "had:p:s:r:l")) != -1) {
switch (carg) {
case 's': /* path to the database */
sscanf(optarg,"%d",&subtree);
break;
case 'r': /* rank filter */
rankname = ECOMALLOC(strlen(optarg),"allocation rankname");
strcpy(rankname,optarg);
rankfilter = 0;
break;
case 'd': /* path to the database */
prefix = ECOMALLOC(strlen(optarg),"allocation prefix");
strcpy(prefix,optarg);
break;
case 'l': /* list rank filter options */
list = 1;
break;
case 'a': /* allow alternative names */
alternative = 1;
break;
case 'h': /* display help */
PrintHelp();
exit(0);
break;
case 'p': /* taxid */
sscanf(optarg,"%d",&uptree);
break;
case '?': /* bad option */
errflag++;
}
}
if ((argc - optind) < 1)
errflag++;
if (prefix == NULL)
{
prefix = getenv("ECOPCRDB");
if (prefix == NULL)
errflag++;
}
if (errflag && !uptree && !rankname && !subtree && !list)
ExitUsage(errflag);
/**
* load taxonomy using libecoPCR functions
**/
printf("# \n# opening %s database\n",prefix);
taxonomy = read_taxonomy(prefix,1);
tax_count = taxonomy->taxons->count;
name_count = taxonomy->names->count;
/* ---------------------------------------- */
/* list -r option possibility */
/* ---------------------------------------- */
if (list)
{
listfilteroptions(taxonomy->ranks);
return 0;
}
/* ---------------------------------------- */
/* display taxid parent if -t option */
/* specified in command line */
/* ---------------------------------------- */
if (uptree)
{
gettaxidparents(uptree,taxonomy,rankname);
return 0;
}
/* ---------------------------------------- */
/* display taxid sons if -s option */
/* specified in command line */
/* ---------------------------------------- */
if (subtree)
{
printheader();
subtree_parent = eco_findtaxonbytaxid(taxonomy,subtree);
printresult(subtree_parent, NULL,taxonomy);
get_son(taxonomy, subtree_parent,&count_son,rankname);
printf("# %d son(s) found\n#\n",count_son);
return 0;
}
printf("# %d taxons\n", tax_count);
/**
* parse taxonomy
**/
for (k=optind;k<argc;k++)
{
printf("#\n# searching for '%s' pattern\n",argv[k]);
re_error = regcomp (&re_preg, argv[k], REG_NOSUB | REG_EXTENDED | REG_ICASE);
if (re_error)
{
fprintf(stderr,"# misformed pattern '%s'\n",argv[k]);
exit(1);
}
nummatch=0;
printheader();
for (j=0,name=taxonomy->names->names;
j < name_count;
name++,j++)
{
if(rankname)
rankfilter = !(strcmp(rankname,taxonomy->ranks->label[name->taxon->rank]));
re_match = regexec (&re_preg, name->name, 0, NULL, 0);
if (!re_match && (alternative || name->is_scientificname) && rankfilter)
{
printresult(name->taxon,name,taxonomy);
nummatch++;
}
}
printf("# %d records found \n",nummatch);
regfree(&re_preg);
}
return 0;
}

403
src/ecogrep.c Normal file
View File

@ -0,0 +1,403 @@
#include "libecoPCR/ecoPCR.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <getopt.h>
#include <stdlib.h>
#include <sys/stat.h>
#define VERSION "0.1"
void getLineContent(char *stream, ecoseq_t *seq, ecoseq_t *oligoseq_1, ecoseq_t *oligoseq_2){
int i;
char *buffer;
for( i=0, buffer = strtok(stream,"|");
buffer != NULL;
i++, buffer = strtok(NULL,"|"))
{
switch (i) {
case 0:
seq->AC = strdup(buffer);
break;
case 4:
sscanf(buffer,"%d",&seq->taxid);
break;
case 13:
oligoseq_1->SQ = strdup(buffer);
oligoseq_1->SQ_length = strlen(buffer);
break;
case 15:
oligoseq_2->SQ = strdup(buffer);
oligoseq_2->SQ_length = strlen(buffer);
break;
case 18:
seq->SQ = strdup(buffer);
seq->SQ_length = strlen(buffer);
break;
default:
break;
}
}
}
void freememory(char **tab, int32_t num){
int32_t i;
for (i=0;i<num-1;i++){
ECOFREE(tab[i],"Error in freememory function");
}
}
/**
* Check if pattern match a string using mamberall libapat function
* @param line array containing sequence information
* @param pattern array containing patterns to test on the sequence
* @param numpattern number of pattern in pattern array
* @param error_max error rate allowed by the user
*
* @return int 1 if a pattern match, else 0
**/
int ispatternmatching(ecoseq_t *seq, PatternPtr pattern){
if (pattern != NULL)
{
SeqPtr apatseq = NULL;
apatseq=ecoseq2apatseq(seq,apatseq);
return ManberAll(apatseq,pattern,0,0,apatseq->seqlen) > 0;
}
else return 0;
}
/* ----------------------------------------------- */
/* printout help */
/* ----------------------------------------------- */
#define PP fprintf(stdout,
static void PrintHelp()
{
PP "\n------------------------------------------\n");
PP " ecogrep Version %s\n", VERSION);
PP "------------------------------------------\n");
PP " synopsis : filtering ecoPCR result based on\n");
PP " taxonomic id filter and regular expression pattern\n");
PP " usage: ecogrep [options] filename\n");
PP "------------------------------------------\n");
PP " options:\n");
PP " -1 : [FIRST] : compare the given pattern with direct strand oligonucleotide\n\n");
PP " -2 : [SECOND] : compare the given pattern with reverse strand oligonucleotide\n\n");
PP " -d : [D]atabase containing taxonomic information\n\n");
PP " -e : [E]rrors : max error allowed in pattern match (option-1, -2 and -p) (0 by default)\n\n");
PP " -p : [P]attern : oligonucleotide pattern\n\n");
PP " -h : [H]elp : print <this> help\n\n");
PP " -i : [I]gnore subtree under given taxonomic id\n\n");
PP " -r : [R]estrict search to subtree under given taxomic id\n\n");
PP " -v : in[V]ert the sense of matching, to select non-matching lines.\n\n");
PP " argument:\n");
PP " ecoPCR ouput file name\n");
PP "------------------------------------------\n\n");
PP " http://www.grenoble.prabi.fr/trac/ecoPCR/\n");
PP "------------------------------------------\n\n");
}
#undef PP
/* ----------------------------------------------- */
/* printout usage and exit */
/* ----------------------------------------------- */
#define PP fprintf(stderr,
static void ExitUsage(stat)
int stat;
{
PP "usage: ecogrep [-d database] [-p pattern] [-i taxid] [-r taxid] [-v] [-h] <file name>\n");
PP "type \"ecogrep -h\" for help\n");
if (stat)
exit(stat);
}
#undef PP
/* ----------------------------------------------- */
/* MAIN */
/* ----------------------------------------------- */
#define LINE_BUFF_SIZE 10000
int main(int argc, char **argv){
int32_t carg = 0;
int32_t r = 0; // number of restricted taxid
int32_t i = 0; // number of ignored taxid
int32_t v = 0; // stores if -v mode is active
int32_t k = 0; // file counter
int32_t errflag = 0;
int32_t error_max = 0; // stores the error rate allowed by the user
int32_t matchingresult = 0; // stores number of matching result
ecotaxonomy_t *taxonomy; // stores the taxonomy
ecoseq_t *seq = NULL; // stores sequence info
ecoseq_t *oligoseq_1 = NULL; // stores the oligo_1 info
ecoseq_t *oligoseq_2 = NULL; // stores the oligo_2 info
char *database = NULL; // stores the database path (for taxonomy)
char *p = NULL; // pattern for sequence
PatternPtr pattern = NULL; // stores the build pattern for sequence
char *o1 = NULL; // pattern for direct strand oligo
PatternPtr oligo_1 = NULL; // stores the build pattern for direct strand oligo
char *o2 = NULL; // pattern for reverse strand oligo
PatternPtr oligo_2 = NULL; // stores the build pattern for reverse strand oligo
int32_t *restricted_taxid = NULL; // stores the restricted taxid
int32_t *ignored_taxid = NULL; // stores the ignored taxid
FILE *file = NULL; // stores the data stream, stdin by default
char *stream = ECOMALLOC(sizeof(char *)*LINE_BUFF_SIZE,"error stream buffer allocation");
char *orig = ECOMALLOC(sizeof(char *)*LINE_BUFF_SIZE,"error orig buffer allocation");
int is_ignored = 0;
int is_included = 0;
int is_matching = 0;
int match_o1 = 0;
int match_o2 = 0;
int good = 0;
seq = new_ecoseq();
oligoseq_1 = new_ecoseq();
oligoseq_2 = new_ecoseq();
/**
* Parse commande line options
**/
while ((carg = getopt(argc, argv, "1:2:p:d:i:r:e:vh")) != -1) {
switch (carg) {
case '1':
o1 = ECOMALLOC(strlen(optarg)+1,
"Error on o1 allocation");
strcpy(o1,optarg);
break;
case '2':
o2 = ECOMALLOC(strlen(optarg)+1,
"Error on o2 allocation");
strcpy(o2,optarg);
break;
case 'd':
database = ECOMALLOC(strlen(optarg)+1,
"Error on datafile allocation");
strcpy(database,optarg);
break;
case 'i':
ignored_taxid = ECOREALLOC( ignored_taxid,
sizeof(int32_t)*(i+1),
"Error on ignored_taxid reallocation");
sscanf(optarg,"%d",&ignored_taxid[i]);
i++;
break;
case 'r':
restricted_taxid = ECOREALLOC( restricted_taxid,
sizeof(int32_t)*(r+1),
"Error on restricted_taxid reallocation");
sscanf(optarg,"%d",&restricted_taxid[r]);
r++;
break;
case 'v':
v = 1;
break;
case 'h':
PrintHelp();
exit(0);
break;
case 'e':
sscanf(optarg,"%d",&error_max);
break;
case 'p':
p = ECOMALLOC(strlen(optarg)+1,
"Error on pattern allocation");
strcpy(p,optarg);
break;
case '?':
errflag++;
}
}
/**
* Check sequence pattern length and build it in PatternPtr format
**/
if(p)
{
if (strlen(p) > 32){
printf("# Sorry, ecogrep doesn't handle pattern longer than 32 characters.\
\n# Please check it out : %s\n",p);
exit(EXIT_FAILURE);
}
else if ( (pattern = buildPattern(p,error_max)) == NULL)
exit(EXIT_FAILURE);
}
/**
* Check o1 pattern length and build it in PatternPtr format
**/
if(o1)
{
if (strlen(o1) > 32){
printf("# Sorry, ecogrep doesn't handle pattern longer than 32 characters.\
\n# Please check it out : %s\n",o1);
exit(EXIT_FAILURE);
}
else if ( (oligo_1 = buildPattern(o1,error_max)) == NULL)
exit(EXIT_FAILURE);
}
/**
* Check o2 pattern length and build it in PatternPtr format
**/
if(o2)
{
if (strlen(o2) > 32){
printf("# Sorry, ecogrep doesn't handle pattern longer than 32 characters.\
\n# Please check it out : %s\n",o2);
exit(EXIT_FAILURE);
}
else if ( (oligo_2 = buildPattern(o2,error_max)) == NULL)
exit(EXIT_FAILURE);
}
/**
* try to get the database name from environment variable
* if no database name specified in the -d option
**/
if (database == NULL)
{
database = getenv("ECOPCRDB");
if (database == NULL)
errflag++;
}
/**
* check at leat one processing is asked
* either patterns or taxid filters
**/
if ( !p && !o1 && !o2 && restricted_taxid == NULL && ignored_taxid == NULL )
{
errflag++;
}
if (errflag)
ExitUsage(errflag);
/**
* Get the taxonomy back
**/
taxonomy = read_taxonomy(database,0);
/**
* Parse the stream
**/
for (k=0 ; argc >= optind ; optind++, k++){
matchingresult = 0;
if ( (file = fopen(argv[optind], "r")) == NULL)
{
if (isatty(fileno(stdin)) == 0)
{
file = stdin;
printf("# Processing standard input...\n");
}
else
break;
}
else
printf("# Processing %s...\n",argv[optind]);
while( fgets(stream, LINE_BUFF_SIZE, file) != NULL ){
if (stream[0]!= '#')
{
stream[LINE_BUFF_SIZE-1]=0;
strcpy(orig,stream);
getLineContent(stream,seq,oligoseq_1,oligoseq_2);
/* -----------------------------------------------*/
/* is ignored if at least one option -i */
/* AND */
/* if current sequence is son of taxid */
/* -----------------------------------------------*/
is_ignored = ( (i > 0) && (eco_is_taxid_included( taxonomy,
ignored_taxid,
i,
seq->taxid))
);
/* -----------------------------------------------*/
/* is included if no -r option */
/* OR */
/* if current sequence is son of taxid */
/* -----------------------------------------------*/
is_included = ( (r == 0) || (eco_is_taxid_included( taxonomy,
restricted_taxid,
r,
seq->taxid))
);
/* ----------------------------------------------------------- */
/* match if no pattern or if pattern match current sequence */
/* ----------------------------------------------------------- */
is_matching = ( !p || (ispatternmatching(seq,pattern)));
/* ---------------------------------------------------------------------------- */
/* match if no direct oligo pattern or if pattern match current direct oligo */
/* ---------------------------------------------------------------------------- */
match_o1 = (!o1 || (ispatternmatching(oligoseq_1,oligo_1)));
/* ------------------------------------------------------------------------------- */
/* match if no revesrse oligo pattern or if pattern match current reverse oligo */
/* ------------------------------------------------------------------------------- */
match_o2 = (!o2 || (ispatternmatching(oligoseq_2,oligo_2)));
good = (is_included && is_matching && !is_ignored && match_o1 && match_o2);
if (v)
good=!good;
if ( good )
{
printf("%s",orig);
matchingresult++;
}
}
}
if ( file != stdin )
fclose(file);
printf("# %d matching result(s)\n#\n",matchingresult);
}
/**
* clean and free before leaving
**/
ECOFREE(orig,"Error in free orig");
ECOFREE(stream,"Error in free stream");
ECOFREE(ignored_taxid,"Error in free stream");
ECOFREE(restricted_taxid,"Error in free stream");
return 0;
}

123
src/ecoisundertaxon.c Normal file
View File

@ -0,0 +1,123 @@
#include "libecoPCR/ecoPCR.h"
#include <getopt.h>
#include <stdlib.h>
#define VERSION "0.1"
/* ----------------------------------------------- */
/* printout verbose mode */
/* ----------------------------------------------- */
static void printTaxon(ecotx_t *taxon){
printf("# taxid : %d | rank : %d | name : %s \n\n",taxon->taxid, taxon->rank, taxon->name);
}
/* ----------------------------------------------- */
/* printout help */
/* ----------------------------------------------- */
#define PP fprintf(stdout,
static void PrintHelp()
{
PP "\n------------------------------------------\n");
PP " ecoisundertaxon Version %s\n", VERSION);
PP "------------------------------------------\n");
PP " synopsis : searching relationship in taxonomy\n");
PP " usage: ecoisundertaxon [options] database\n");
PP "------------------------------------------\n");
PP " options:\n");
PP " -1 : [FIRST] taxomic id of the hypothetical son\n\n");
PP " -2 : [SECOND] taxonomic id of the hypothetical parent\n\n");
PP " -h : [H]elp - print <this> help\n\n");
PP " -v : [V]erbose mode. Display taxonomic information for both\n");
PP " : taxonomic id.\n\n");
PP "------------------------------------------\n");
PP " database : to match the expected format, the database\n");
PP " has to be formated first by the ecoPCRFormat.py program located.\n");
PP " in the tools directory. Type the radical only, leaving out the extension\n");
PP "------------------------------------------\n\n");
PP " https://www.grenoble.prabi.fr/trac/ecoPCR/wiki");
PP "------------------------------------------\n\n");
}
#undef PP
/* ----------------------------------------------- */
/* printout usage and exit */
/* ----------------------------------------------- */
#define PP fprintf(stderr,
static void ExitUsage(stat)
int stat;
{
PP "usage: ecoisundertaxon [-1 taxid] [-2 taxid] [-v] [-h] datafile\n");
PP "type \"ecoisundertaxon -h\" for help\n");
if (stat)
exit(stat);
}
#undef PP
/* ----------------------------------------------- */
/* MAIN */
/* ----------------------------------------------- */
int main(int argc, char **argv){
int32_t carg = 0;
int32_t taxid_1 = 0;
int32_t taxid_2 = 0;
int32_t verbose = 0;
int32_t errflag = 0;
ecotaxonomy_t *taxonomy = NULL;
ecotx_t *son = NULL;
ecotx_t *parent = NULL;
while ((carg = getopt(argc, argv, "1:2:vh")) != -1) {
switch (carg) {
case '1':
sscanf(optarg,"%d",&taxid_1);
break;
case '2':
sscanf(optarg,"%d",&taxid_2);
break;
case 'v':
verbose = 1;
break;
case 'h':
PrintHelp();
exit(0);
break;
case '?':
errflag++;
}
}
if ((argc -= optind) != 1)
errflag++;
if (errflag)
ExitUsage(errflag);
taxonomy = read_taxonomy(argv[optind],0);
son = eco_findtaxonbytaxid(taxonomy, taxid_1);
if (verbose){
parent = eco_findtaxonbytaxid(taxonomy, taxid_2);
printTaxon(son);
printTaxon(parent);
}
if (eco_isundertaxon(son, taxid_2))
printf("# taxid_1 (%d) is son of taxid_2 (%d)\n",taxid_1, taxid_2);
else
printf("# taxid_1 (%d) is NOT son of taxid_2 (%d)\n",taxid_1, taxid_2);
return 0;
}

View File

@ -8,104 +8,69 @@
#define VERSION "0.1" #define VERSION "0.1"
/* ----------------------------------------------- */ /* ----------------------------------------------- */
/* printout help */ /* ----------------------------------------------- */ /* printout help */
/* ----------------------------------------------- */
#define PP fprintf(stdout, #define PP fprintf(stdout,
static void PrintHelp() static void PrintHelp()
{ {
PP "------------------------------------------\n"); PP "------------------------------------------\n");
PP " Apat Version %s\n", VERSION); PP " ecoPCR Version %s\n", VERSION);
PP "------------------------------------------\n"); PP "------------------------------------------\n");
PP "synopsis : pattern(s) searching program\n"); PP "synopsis : searching for sequence and taxonomy hybriding with given primers\n");
PP "usage: apat [options] patfile datafile\n"); PP "usage: ecoPCR [options] <nucleotidic patterns>\n");
PP "------------------------------------------\n"); PP "------------------------------------------\n");
PP "options:\n"); PP "options:\n");
PP "-a code : [A]lphabet encoding for pattern\n"); PP "-d : [D]atabase : to match the expected format, the database\n");
PP " code is one of : \n"); PP " has to be formated first by the ecoPCRFormat.py program located.\n");
PP " dna: use IUPAC equivalences for dna/rna\n"); PP " in the tools directory.\n");
PP " prot: use IUPAC equivalences for proteins\n"); PP " ecoPCRFormat.py creates three file types :\n");
PP " alpha: no equivalences, just treat plain symbols\n"); PP " .sdx : contains the sequences\n");
PP " note: the equivalences are used in pattern only\n"); PP " .tdx : contains information concerning the taxonomy\n");
PP " *not* in sequence(s) (see note (4) below)\n"); PP " .rdx : contains the taxonomy rank\n\n");
PP " dft: alpha\n"); PP " ecoPCR needs all the file type. As a result, you have to write the\n");
PP "-c : [C]ooccurences\n"); PP " database radical without any extension. For example /ecoPCRDB/gbmam\n\n");
PP " print patterns cooccurence matrix \n"); PP "-e : [E]rror : max error allowed by oligonucleotide (0 by default)\n\n");
PP " dft: off\n"); PP "-h : [H]elp - print <this> help\n\n");
PP "-h : [H]elp - print <this> help\n"); PP "-i : [I]gnore the given taxonomy id.\n");
PP "-m : [M]ultiple occurences\n"); PP " Taxonomy id are available using the ecofind program.\n");
PP " see -q option \n"); PP " see its help typing ecofind -h for more information.\n\n");
PP " dft: off\n"); PP "-k : [K]ingdom mode : set the kingdom mode\n");
PP "-o file : [O]utput sequences\n"); PP " super kingdom mode by default.\n\n");
PP " additionaly output sequence(s) that match into\n"); PP "-l : minimum [L]ength : define the minimum amplication length. \n\n");
PP " 'file' in fasta format\n"); PP "-L : maximum [L]ength : define the maximum amplicationlength. \n\n");
PP " dft: off\n"); PP "-r : [R]estricts the search to the given taxonomic id.\n");
PP "-p : no [Print] - don't printout hits\n"); PP " Taxonomy id are available using the ecofind program.\n");
PP " when just counts are needed\n"); PP " see its help typing ecofind -h for more information.\n");
PP " dft: off\n");
PP "-q nn : [Quorum]\n");
PP " printout result if at least nn\n");
PP " different patterns are found on the sequence\n");
PP " (with -m : at least nn different <hits>)\n");
PP " dft: # of patterns read\n");
PP "-s : no [Sort] - don't sort hits before printing\n");
PP " usually hits are printed by increasing position\n");
PP " this option will list them by pattern\n");
PP " dft: off\n");
PP "-t : [T]est sequence\n");
PP " additionnaly check if sequences are uppercase\n");
PP " this is mostly used for testing\n");
PP " dft: off\n");
PP "-u : [U]pper\n");
PP " force lower->upper sequence conversion\n");
PP " without this option lowercase symbols in sequence\n");
PP " will not be considered to as matches\n");
PP " dft: off\n");
PP "-v : [V]erbose\n");
PP " just display a kind of progress clock on stderr\n");
PP " (this is only useful if you redirect stdout)\n");
PP "\n");
PP "patfile : pattern file (see below)\n");
PP "datafile : database file (see below)\n");
PP "------------------------------------------\n");
PP "pattern file format :\n");
PP " one pattern/line\n");
PP " format : <pattern> <space> #errors\n");
PP " <pattern> := pattern<symbol>\n");
PP " or !pattern<symbol>\n");
PP " or pattern<symbol>#\n");
PP " or !pattern<symbol>#\n");
PP " <symbol> := <letter>\n");
PP " or [<letter>....<letter>]\n");
PP " <letter> := uppercase letter (A-Z)\n");
PP " <number> := a positive number indicates max number of mismatches\n");
PP " a negative number indicates max number of mismatches or indels\n");
PP " # means that no error is allowed at this position\n");
PP " ! complement the <symbol>\n");
PP " [...] means that all symbols within [] are allowed\n");
PP " in addition IUPAC equivalences may be used as symbols\n");
PP " with the -a option\n");
PP "\n");
PP "example: G[DE]S#[GIV]!HP![DE]# 1\n");
PP "\n"); PP "\n");
PP "------------------------------------------\n"); PP "------------------------------------------\n");
PP "datafile contains one or more sequences in\n"); PP "first argument : oligonucleotide for direct strand\n\n");
PP "Fasta format, with *uppercase* symbols \n"); PP "second argument : oligonucleotide for reverse strand\n\n");
PP "\n");
PP "------------------------------------------\n"); PP "------------------------------------------\n");
PP "note (1): the maximum number of patterns is %d\n", MAX_PATTERN); PP "Table result description : \n");
PP "\n"); PP "column 1 : accession number\n");
PP "note (2): the maximum length for one pattern is %d\n", MAX_PAT_LEN); PP "column 2 : sequence length\n");
PP "\n"); PP "column 3 : taxonomic id\n");
PP "note (3): indels are still experimental and are :\n"); PP "column 4 : rank\n");
PP " not handled gracefully with the # syntax\n"); PP "column 5 : species taxonomic id\n");
PP " and hits are not printed very nicely\n"); PP "column 6 : scientific name\n");
PP "\n"); PP "column 7 : genus taxonomic id\n");
PP "note (4): the IUPAC equivalences (-a option) are used\n"); PP "column 8 : genus name\n");
PP " in pattern only *not* in sequence(s).\n"); PP "column 9 : family taxonomic id\n");
PP " for instance GATN (with option -a dna) is equivalent to GAT[ACGT]\n"); PP "column 10 : family name\n");
PP " and will match GATA/GATC/GATG/GATC but will not match GATN\n"); PP "column 11 : super kingdom taxonomic id\n");
PP " (nor NNNN) in sequence.\n"); PP "column 12 : super kingdom name\n");
PP "column 13 : strand (direct or reverse)\n");
PP "column 14 : first oligonucleotide\n");
PP "column 15 : number of errors for the first strand\n");
PP "column 16 : second oligonucleotide\n");
PP "column 17 : number of errors for the second strand\n");
PP "column 18 : amplification length\n");
PP "column 19 : sequence\n");
PP "column 20 : definition\n");
PP "------------------------------------------\n");
PP " http://www.grenoble.prabi.fr/trac/ecoPCR/\n");
PP "------------------------------------------\n\n");
PP "\n"); PP "\n");
} }
@ -121,10 +86,8 @@ static void PrintHelp()
static void ExitUsage(stat) static void ExitUsage(stat)
int stat; int stat;
{ {
PP "usage: apat [-a dna|prot] [-c] [-h] [-m] [-o file] [-p]\n"); PP "usage: ecoPCR [-d database] [-l value] [-L value] [-e value] [-r taxid] [-i taxid] [-k] oligo1 oligo2\n");
PP " [-q nn] [-t] [-u] [-v]\n"); PP "type \"ecoPCR -h\" for help\n");
PP " patfile datafile\n");
PP "type \"apat -h\" for help\n");
if (stat) if (stat)
exit(stat); exit(stat);
@ -306,7 +269,7 @@ int main(int argc, char **argv)
int32_t errflag=0; int32_t errflag=0;
char kingdom_mode=0; char kingdom_mode=0;
char *prefix; char *prefix = NULL;
int32_t checkedSequence = 0; int32_t checkedSequence = 0;
int32_t positiveSequence= 0; int32_t positiveSequence= 0;
@ -330,43 +293,39 @@ int main(int argc, char **argv)
int32_t erri; int32_t erri;
int32_t errj; int32_t errj;
int32_t *restricted_taxid = NULL;
int32_t *ignored_taxid = NULL;
int32_t r=0;
int32_t g=0;
while ((carg = getopt(argc, argv, "h1:2:l:L:e:k")) != -1) {
while ((carg = getopt(argc, argv, "hd:l:L:e:i:r:k")) != -1) {
switch (carg) { switch (carg) {
/* -------------------- */ /* -------------------- */
case '1': /* prenier oligo */ case 'd': /* database name */
/* -------------------- */ /* -------------------- */
oligo1 = ECOMALLOC(strlen(optarg)+1, prefix = ECOMALLOC(strlen(optarg)+1,
"Error on oligo 1 allocation"); "Error on prefix allocation");
strcpy(oligo1,optarg); strcpy(prefix,optarg);
break;
/* -------------------- */
case '2': /* coocurence option */
/* -------------------- */
oligo2 = ECOMALLOC(strlen(optarg)+1,
"Error on oligo 1 allocation");
strcpy(oligo2,optarg);
break; break;
/* -------------------- */ /* -------------------- */
case 'h': /* help */ case 'h': /* help */
/* -------------------- */ /* -------------------- */
PrintHelp(); PrintHelp();
exit(0); exit(0);
break; break;
/* -------------------- */ /* ------------------------- */
case 'l': /* lmin amplification */ case 'l': /* min amplification lenght */
/* -------------------- */ /* ------------------------- */
sscanf(optarg,"%d",&lmin); sscanf(optarg,"%d",&lmin);
break; break;
/* -------------------- */ /* -------------------------- */
case 'L': /* lmax amplification */ case 'L': /* max amplification lenght */
/* -------------------- */ /* -------------------------- */
sscanf(optarg,"%d",&lmax); sscanf(optarg,"%d",&lmax);
break; break;
@ -374,11 +333,29 @@ int main(int argc, char **argv)
case 'e': /* error max */ case 'e': /* error max */
/* -------------------- */ /* -------------------- */
sscanf(optarg,"%d",&error_max); sscanf(optarg,"%d",&error_max);
break;
/* -------------------- */
case 'k': /* set the kingdom mode */
kingdom_mode = 1; /* -------------------- */
break;
/* ------------------------------------------ */
case 'r': /* stores the restricting search taxonomic id */
/* ------------------------------------------ */
restricted_taxid = ECOREALLOC(restricted_taxid,sizeof(int32_t)*(r+1),
"Error on restricted_taxid reallocation");
sscanf(optarg,"%d",&restricted_taxid[r]);
r++;
break; break;
case 'k': /* error max */ /* --------------------------------- */
/* -------------------- */ case 'i': /* stores the taxonomic id to ignore */
kingdom_mode = 1; /* --------------------------------- */
ignored_taxid = ECOREALLOC(ignored_taxid,sizeof(int32_t)*(g+1),
"Error on excluded_taxid reallocation");
sscanf(optarg,"%d",&ignored_taxid[g]);
g++;
break; break;
/* -------------------- */ /* -------------------- */
@ -389,25 +366,43 @@ int main(int argc, char **argv)
} }
if ((argc -= optind) != 1) /**
errflag++; * check the path to the database is given as last argument
*/
if ((argc -= optind) == 2)
{
oligo1 = ECOMALLOC(strlen(argv[optind])+1,
"Error on oligo1 allocation");
strcpy(oligo1,argv[optind]);
optind++;
oligo2 = ECOMALLOC(strlen(argv[optind])+1,
"Error on oligo1 allocation");
strcpy(oligo2,argv[optind]);
}
else
errflag++;
if (prefix == NULL)
{
prefix = getenv("ECOPCRDB");
if (prefix == NULL)
errflag++;
}
if (!oligo1 || !oligo2) if (!oligo1 || !oligo2)
errflag++; errflag++;
if (errflag) if (errflag)
ExitUsage(errflag); ExitUsage(errflag);
prefix = argv[optind];
o1 = buildPattern(oligo1,error_max); o1 = buildPattern(oligo1,error_max);
o2 = buildPattern(oligo2,error_max); o2 = buildPattern(oligo2,error_max);
o1c = complementPattern(o1); o1c = complementPattern(o1);
o2c = complementPattern(o2); o2c = complementPattern(o2);
printf("#\n"); printf("#\n");
printf("# ecoPCR version %s\n",VERSION); printf("# ecoPCR version %s\n",VERSION);
printf("# direct strand oligo1 : %-32s ; oligo2c : %32s\n", o1->cpat,o2c->cpat); printf("# direct strand oligo1 : %-32s ; oligo2c : %32s\n", o1->cpat,o2c->cpat);
@ -427,102 +422,120 @@ int main(int argc, char **argv)
printf("# output in superkingdom mode\n"); printf("# output in superkingdom mode\n");
printf("#\n"); printf("#\n");
taxonomy = read_taxonomy(prefix); taxonomy = read_taxonomy(prefix,0);
seq = ecoseq_iterator(prefix); seq = ecoseq_iterator(prefix);
checkedSequence = 0; checkedSequence = 0;
positiveSequence= 0; positiveSequence= 0;
amplifiatCount = 0; amplifiatCount = 0;
while(seq) while(seq)
{ {
checkedSequence++; checkedSequence++;
/**
scname = taxonomy->taxons->taxon[seq->taxid].name; * check if current sequence should be included
strncpy(head,seq->SQ,10); **/
head[10]=0; if ( (r == 0) ||
strncpy(tail,seq->SQ+seq->SQ_length-10,10); (eco_is_taxid_included(taxonomy,
tail[10]=0; restricted_taxid,
r,
taxonomy->taxons->taxon[seq->taxid].taxid)
)
)
if ((g == 0) ||
!(eco_is_taxid_included(taxonomy,
ignored_taxid,
g,
taxonomy->taxons->taxon[seq->taxid].taxid)
)
)
{
apatseq=ecoseq2apatseq(seq,apatseq); scname = taxonomy->taxons->taxon[seq->taxid].name;
strncpy(head,seq->SQ,10);
head[10]=0;
strncpy(tail,seq->SQ+seq->SQ_length-10,10);
tail[10]=0;
o1Hits = ManberAll(apatseq,o1,0,0,apatseq->seqlen); apatseq=ecoseq2apatseq(seq,apatseq);
o2cHits= 0;
if (o1Hits)
{
stktmp = apatseq->hitpos[0];
begin = stktmp->val[0] + o1->patlen;
if (lmax)
length= stktmp->val[stktmp->top-1] + o1->patlen - begin + lmax + o2->patlen;
else
length= apatseq->seqlen - begin;
o2cHits = ManberAll(apatseq,o2c,1,begin,length); o1Hits = ManberAll(apatseq,o1,0,0,apatseq->seqlen);
o2cHits= 0;
if (o2cHits)
for (i=0; i < o1Hits;i++)
{
posi = apatseq->hitpos[0]->val[i];
erri = apatseq->hiterr[0]->val[i];
for (j=0; j < o2cHits; j++)
{
posj =apatseq->hitpos[1]->val[j] + o2c->patlen;
errj =apatseq->hiterr[1]->val[j];
length=posj - posi + 1 - o1->patlen - o2->patlen;
if ((!lmin || (length >= lmin)) &&
(!lmax || (length <= lmax)))
printRepeat(seq,o1,o2c,'D',kingdom_mode,posi,posj,erri,errj,taxonomy);
//printf("%s\tD\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o1Hits,o2cHits,posi,posj,scname);
}
}
}
o2Hits = ManberAll(apatseq,o2,2,0,apatseq->seqlen);
o1cHits= 0;
if (o2Hits)
{
stktmp = apatseq->hitpos[2];
begin = stktmp->val[0] + o2->patlen;
if (lmax)
length= stktmp->val[stktmp->top-1] + o2->patlen - begin + lmax + o1->patlen;
else
length= apatseq->seqlen - begin;
o1cHits = ManberAll(apatseq,o1c,3,begin,length); if (o1Hits)
if (o1cHits)
for (i=0; i < o2Hits;i++)
{ {
posi = apatseq->hitpos[2]->val[i]; stktmp = apatseq->hitpos[0];
erri = apatseq->hiterr[2]->val[i]; begin = stktmp->val[0] + o1->patlen;
for (j=0; j < o1cHits; j++)
{ if (lmax)
posj=apatseq->hitpos[3]->val[j] + o1c->patlen; length= stktmp->val[stktmp->top-1] + o1->patlen - begin + lmax + o2->patlen;
errj=apatseq->hiterr[3]->val[j]; else
length=posj - posi + 1 - o1->patlen - o2->patlen; length= apatseq->seqlen - begin;
if ((!lmin || (length >= lmin)) && o2cHits = ManberAll(apatseq,o2c,1,begin,length);
(!lmax || (length <= lmax)))
printRepeat(seq,o2,o1c,'R',kingdom_mode,posi,posj,erri,errj,taxonomy); if (o2cHits)
//printf("%s\tR\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o2Hits,o1cHits,posi,posj,scname); for (i=0; i < o1Hits;i++)
} {
posi = apatseq->hitpos[0]->val[i];
erri = apatseq->hiterr[0]->val[i];
for (j=0; j < o2cHits; j++)
{
posj =apatseq->hitpos[1]->val[j] + o2c->patlen;
errj =apatseq->hiterr[1]->val[j];
length=posj - posi + 1 - o1->patlen - o2->patlen;
if ((!lmin || (length >= lmin)) &&
(!lmax || (length <= lmax)))
printRepeat(seq,o1,o2c,'D',kingdom_mode,posi,posj,erri,errj,taxonomy);
//printf("%s\tD\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o1Hits,o2cHits,posi,posj,scname);
}
}
} }
}
o2Hits = ManberAll(apatseq,o2,2,0,apatseq->seqlen);
o1cHits= 0;
if (o2Hits)
{
stktmp = apatseq->hitpos[2];
begin = stktmp->val[0] + o2->patlen;
if (lmax)
length= stktmp->val[stktmp->top-1] + o2->patlen - begin + lmax + o1->patlen;
else
length= apatseq->seqlen - begin;
o1cHits = ManberAll(apatseq,o1c,3,begin,length);
if (o1cHits)
for (i=0; i < o2Hits;i++)
{
posi = apatseq->hitpos[2]->val[i];
erri = apatseq->hiterr[2]->val[i];
for (j=0; j < o1cHits; j++)
{
posj=apatseq->hitpos[3]->val[j] + o1c->patlen;
errj=apatseq->hiterr[3]->val[j];
length=posj - posi + 1 - o1->patlen - o2->patlen;
if ((!lmin || (length >= lmin)) &&
(!lmax || (length <= lmax)))
printRepeat(seq,o2,o1c,'R',kingdom_mode,posi,posj,erri,errj,taxonomy);
//printf("%s\tR\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o2Hits,o1cHits,posi,posj,scname);
}
}
}
} /* End of taxonomic selection */
delete_ecoseq(seq); delete_ecoseq(seq);
seq = ecoseq_iterator(NULL); seq = ecoseq_iterator(NULL);
} }
ECOFREE(restricted_taxid, "Error: could not free restricted_taxid\n");
ECOFREE(ignored_taxid, "Error: could not free excluded_taxid\n");
return 0; return 0;
} }

18
src/global.mk Normal file
View File

@ -0,0 +1,18 @@
MACHINE=MAC_OS_X
LIBPATH= -Llibapat -LlibecoPCR
MAKEDEPEND = gcc -D$(MACHINE) -M $(CPPFLAGS) -o $*.d $<
CC=gcc
CFLAGS= -W -Wall -O2 -g
default: all
%.o: %.c
$(CC) -D$(MACHINE) $(CFLAGS) -c -o $@ $<
%.P : %.c
$(MAKEDEPEND)
@sed 's/\($*\)\.o[ :]*/\1.o $@ : /g' < $*.d > $@; \
rm -f $*.d; [ -s $@ ] || rm -f $@
include $(SRCS:.c=.P)

23
src/libapat/Makefile Normal file
View File

@ -0,0 +1,23 @@
SOURCES = apat_parse.c \
apat_search.c \
libstki.c
SRCS=$(SOURCES)
OBJECTS= $(patsubst %.c,%.o,$(SOURCES))
LIBFILE= libapat.a
include ../global.mk
all: $(LIBFILE)
clean:
rm -rf $(OBJECTS) $(LIBFILE)
$(LIBFILE): $(OBJECTS)
ar -cr $@ $?

29
src/libecoPCR/Makefile Normal file
View File

@ -0,0 +1,29 @@
SOURCES = ecoapat.c \
ecodna.c \
ecoError.c \
ecoIOUtils.c \
ecoMalloc.c \
ecorank.c \
ecoseq.c \
ecotax.c \
ecofilter.c \
econame.c
SRCS=$(SOURCES)
OBJECTS= $(patsubst %.c,%.o,$(SOURCES))
LIBFILE= libecoPCR.a
include ../global.mk
all: $(LIBFILE)
clean:
rm -rf $(OBJECTS) $(LIBFILE)
$(LIBFILE): $(OBJECTS)
ar -cr $@ $?

View File

@ -2,7 +2,15 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
/*
* print the message given as argument and exit the program
* @param error error number
* @param message the text explaining what's going on
* @param filename the file source where the program failed
* @param linenumber the line where it has failed
* filename and linenumber are written at pre-processing
* time by a macro
*/
void ecoError(int32_t error, void ecoError(int32_t error,
const char* message, const char* message,
const char * filename, const char * filename,

View File

@ -16,20 +16,19 @@ int32_t is_big_endian()
int32_t swap_int32_t(int32_t i) int32_t swap_int32_t(int32_t i)
{ {
return SWAPINT32(i); return SWAPINT32(i);
} }
/**
* Read part of the file
* @param *f the database
* @param recordSize the size to be read
*
* @return buffer
*/
void *read_ecorecord(FILE *f,int32_t *recordSize) void *read_ecorecord(FILE *f,int32_t *recordSize)
{ {
static void *buffer =NULL; static void *buffer =NULL;
@ -79,10 +78,14 @@ void *read_ecorecord(FILE *f,int32_t *recordSize)
/**
* Open the database and check it's readable
* @param filename name of the database (.sdx, .rdx, .tbx)
* @param sequencecount buffer - pointer to variable storing the number of occurence
* @param abort_on_open_error boolean to define the behaviour in case of error
* while opening the database
* @return FILE type
**/
FILE *open_ecorecorddb(const char *filename, FILE *open_ecorecorddb(const char *filename,
int32_t *sequencecount, int32_t *sequencecount,
int32_t abort_on_open_error) int32_t abort_on_open_error)

View File

@ -56,11 +56,11 @@ typedef struct {
} ecotxformat_t; } ecotxformat_t;
typedef struct { typedef struct ecotxnode {
int32_t taxid; int32_t taxid;
int32_t rank; int32_t rank;
int32_t parent; struct ecotxnode *parent;
char *name; char *name;
} ecotx_t; } ecotx_t;
typedef struct { typedef struct {
@ -80,12 +80,42 @@ typedef struct {
char* label[1]; char* label[1];
} ecorankidx_t; } ecorankidx_t;
/*
*
* Taxonomy name types
*
*/
typedef struct { typedef struct {
int32_t is_scientificname;
int32_t namelength;
int32_t classlength;
int32_t taxid;
char names[1];
} econameformat_t;
typedef struct {
char *name;
char *classname;
int32_t is_scientificname;
struct ecotxnode *taxon;
} econame_t;
typedef struct {
int32_t count;
econame_t names[1];
} econameidx_t;
typedef struct {
ecorankidx_t *ranks; ecorankidx_t *ranks;
econameidx_t *names;
ecotxidx_t *taxons; ecotxidx_t *taxons;
} ecotaxonomy_t; } ecotaxonomy_t;
/***************************************************** /*****************************************************
* *
* Function declarations * Function declarations
@ -175,6 +205,9 @@ ecoseq_t *readnext_ecoseq(FILE *);
ecorankidx_t *read_rankidx(const char *filename); ecorankidx_t *read_rankidx(const char *filename);
econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy);
/** /**
* Read taxonomy data as formated by the ecoPCRFormat.py script. * Read taxonomy data as formated by the ecoPCRFormat.py script.
@ -189,8 +222,11 @@ ecorankidx_t *read_rankidx(const char *filename);
ecotxidx_t *read_taxonomyidx(const char *filename); ecotxidx_t *read_taxonomyidx(const char *filename);
ecotaxonomy_t *read_taxonomy(const char *prefix); ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName);
ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy, int32_t taxid);
int eco_isundertaxon(ecotx_t *taxon, int other_taxid);
ecoseq_t *ecoseq_iterator(const char *prefix); ecoseq_t *ecoseq_iterator(const char *prefix);
@ -227,4 +263,7 @@ ecotx_t *eco_getfamily(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
ecotx_t *eco_getkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy); ecotx_t *eco_getkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy); ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
int eco_is_taxid_ignored(int32_t *ignored_taxid, int32_t tab_len, int32_t taxid);
int eco_is_taxid_included(ecotaxonomy_t *taxonomy, int32_t *included_taxid, int32_t tab_len, int32_t taxid);
#endif /*ECOPCR_H_*/ #endif /*ECOPCR_H_*/

19
src/libecoPCR/ecofilter.c Normal file
View File

@ -0,0 +1,19 @@
#include "ecoPCR.h"
int eco_is_taxid_included( ecotaxonomy_t *taxonomy,
int32_t *restricted_taxid,
int32_t tab_len,
int32_t taxid)
{
int i;
ecotx_t *taxon;
taxon = eco_findtaxonbytaxid(taxonomy, taxid);
for (i=0; i < tab_len; i++)
if ( (taxon->taxid == restricted_taxid[i]) ||
(eco_isundertaxon(taxon, restricted_taxid[i])) )
return 1;
return 0;
}

61
src/libecoPCR/econame.c Normal file
View File

@ -0,0 +1,61 @@
#include "ecoPCR.h"
#include <string.h>
#include <stdlib.h>
static econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy);
econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy)
{
int32_t count;
FILE *f;
econameidx_t *indexname;
int32_t i;
f = open_ecorecorddb(filename,&count,1);
indexname = (econameidx_t*) ECOMALLOC(sizeof(econameidx_t) + sizeof(econame_t) * (count-1),"Allocate names");
indexname->count=count;
for (i=0; i < count; i++){
readnext_econame(f,(indexname->names)+i,taxonomy);
}
return indexname;
}
econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy)
{
econameformat_t *raw;
int32_t rs;
raw = read_ecorecord(f,&rs);
if (!raw)
return NULL;
if (is_big_endian())
{
raw->is_scientificname = swap_int32_t(raw->is_scientificname);
raw->namelength = swap_int32_t(raw->namelength);
raw->classlength = swap_int32_t(raw->classlength);
raw->taxid = swap_int32_t(raw->taxid);
}
name->is_scientificname=raw->is_scientificname;
name->name = ECOMALLOC((raw->namelength+1) * sizeof(char),"Allocate name");
strncpy(name->name,raw->names,raw->namelength);
name->name[raw->namelength]=0;
name->classname = ECOMALLOC((raw->classlength+1) * sizeof(char),"Allocate classname");
strncpy(name->classname,(raw->names+raw->namelength),raw->classlength);
name->classname[raw->classlength]=0;
name->taxon = taxonomy->taxons->taxon + raw->taxid;
return name;
}

View File

@ -79,7 +79,9 @@ ecoseq_t *new_ecoseq_with_data( char *AC,
} }
/**
* ?? used ??
**/
FILE *open_ecoseqdb(const char *filename, FILE *open_ecoseqdb(const char *filename,
int32_t *sequencecount) int32_t *sequencecount)
{ {
@ -140,6 +142,13 @@ ecoseq_t *readnext_ecoseq(FILE *f)
return seq; return seq;
} }
/**
* Open the sequences database (.sdx file)
* @param prefix name of the database (radical without extension)
* @param index integer
*
* @return file object
*/
FILE *open_seqfile(const char *prefix,int32_t index) FILE *open_seqfile(const char *prefix,int32_t index)
{ {
char filename_buffer[1024]; char filename_buffer[1024];
@ -153,7 +162,7 @@ FILE *open_seqfile(const char *prefix,int32_t index)
prefix, prefix,
index); index);
fprintf(stderr,"Coucou %s\n",filename_buffer); fprintf(stderr,"# Coucou %s\n",filename_buffer);
if (filename_length >= 1024) if (filename_length >= 1024)
@ -164,7 +173,7 @@ FILE *open_seqfile(const char *prefix,int32_t index)
input=open_ecorecorddb(filename_buffer,&seqcount,0); input=open_ecorecorddb(filename_buffer,&seqcount,0);
if (input) if (input)
fprintf(stderr,"Reading file %s containing %d sequences...\n", fprintf(stderr,"# Reading file %s containing %d sequences...\n",
filename_buffer, filename_buffer,
seqcount); seqcount);

View File

@ -5,7 +5,11 @@
static ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon); static ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon);
/**
* Open the taxonomy database
* @param pointer to the database (.tdx file)
* @return a ecotxidx_t structure
*/
ecotxidx_t *read_taxonomyidx(const char *filename) ecotxidx_t *read_taxonomyidx(const char *filename)
{ {
int32_t count; int32_t count;
@ -18,14 +22,15 @@ ecotxidx_t *read_taxonomyidx(const char *filename)
index = (ecotxidx_t*) ECOMALLOC(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count-1), index = (ecotxidx_t*) ECOMALLOC(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count-1),
"Allocate taxonomy"); "Allocate taxonomy");
index->count=count; index->count=count;
for (i=0; i < count; i++){
for (i=0; i < count; i++)
readnext_ecotaxon(f,&(index->taxon[i])); readnext_ecotaxon(f,&(index->taxon[i]));
index->taxon[i].parent=index->taxon + (int32_t)index->taxon[i].parent;
}
return index; return index;
} }
int32_t delete_taxonomy(ecotxidx_t *index) int32_t delete_taxonomy(ecotxidx_t *index)
{ {
int32_t i; int32_t i;
@ -61,6 +66,15 @@ int32_t delete_taxon(ecotx_t *taxon)
return 1; return 1;
} }
/**
* Read the database for a given taxon a save the data
* into the taxon structure(if any found)
* @param *f pointer to FILE type returned by fopen
* @param *taxon pointer to the structure
*
* @return a ecotx_t structure if any taxon found else NULL
*/
ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon) ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon)
{ {
@ -80,7 +94,7 @@ ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon)
raw->taxid = swap_int32_t(raw->taxid); raw->taxid = swap_int32_t(raw->taxid);
} }
taxon->parent = raw->parent; taxon->parent = (ecotx_t*)raw->parent;
taxon->taxid = raw->taxid; taxon->taxid = raw->taxid;
taxon->rank = raw->rank; taxon->rank = raw->rank;
@ -88,12 +102,12 @@ ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon)
"Allocate taxon scientific name"); "Allocate taxon scientific name");
strncpy(taxon->name,raw->name,raw->namelength); strncpy(taxon->name,raw->name,raw->namelength);
return taxon; return taxon;
} }
ecotaxonomy_t *read_taxonomy(const char *prefix) ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName)
{ {
ecotaxonomy_t *tax; ecotaxonomy_t *tax;
char *filename; char *filename;
@ -115,10 +129,19 @@ ecotaxonomy_t *read_taxonomy(const char *prefix)
tax->taxons = read_taxonomyidx(filename); tax->taxons = read_taxonomyidx(filename);
if (readAlternativeName)
{
snprintf(filename,buffsize,"%s.ndx",prefix);
tax->names=read_nameidx(filename,tax);
}
else
tax->names=NULL;
return tax; return tax;
} }
int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy) int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy)
{ {
if (taxonomy) if (taxonomy)
@ -138,20 +161,19 @@ int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy)
} }
ecotx_t *eco_findtaxonatrank(ecotx_t *taxon, ecotx_t *eco_findtaxonatrank(ecotx_t *taxon,
int32_t rankidx, int32_t rankidx)
ecotaxonomy_t *taxonomy)
{ {
ecotx_t *current_taxon; ecotx_t *current_taxon;
ecotx_t *next_taxon; ecotx_t *next_taxon;
current_taxon = taxon; current_taxon = taxon;
next_taxon = &(taxonomy->taxons->taxon[current_taxon->parent]); next_taxon = current_taxon->parent;
while ((current_taxon!=next_taxon) && while ((current_taxon!=next_taxon) && // I' am the root node
(current_taxon->rank!=rankidx)) (current_taxon->rank!=rankidx))
{ {
current_taxon = next_taxon; current_taxon = next_taxon;
next_taxon = &(taxonomy->taxons->taxon[current_taxon->parent]); next_taxon = current_taxon->parent;
} }
if (current_taxon->rank==rankidx) if (current_taxon->rank==rankidx)
@ -160,6 +182,61 @@ ecotx_t *eco_findtaxonatrank(ecotx_t *taxon,
return NULL; return NULL;
} }
/**
* Get back information concerning a taxon from a taxonomic id
* @param *taxonomy the taxonomy database
* @param taxid the taxonomic id
*
* @result a ecotx_t structure containing the taxonimic information
**/
ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy,
int32_t taxid)
{
ecotx_t *current_taxon;
int32_t taxoncount;
int32_t i;
taxoncount=taxonomy->taxons->count;
for (current_taxon=taxonomy->taxons->taxon,
i=0;
i < taxoncount;
i++,
current_taxon++){
if (current_taxon->taxid==taxid){
return current_taxon;
}
}
return (ecotx_t*)NULL;
}
/**
* Find out if taxon is son of other taxon (identified by its taxid)
* @param *taxon son taxon
* @param parent_taxid taxonomic id of the other taxon
*
* @return 1 is the other taxid math a parent taxid, else 0
**/
int eco_isundertaxon(ecotx_t *taxon,
int other_taxid)
{
ecotx_t *next_parent;
next_parent = taxon->parent;
while ( (other_taxid != next_parent->taxid) &&
(strcmp(next_parent->name, "root")) )
{
next_parent = next_parent->parent;
}
if (other_taxid == next_parent->taxid)
return 1;
else
return 0;
}
ecotx_t *eco_getspecies(ecotx_t *taxon, ecotx_t *eco_getspecies(ecotx_t *taxon,
ecotaxonomy_t *taxonomy) ecotaxonomy_t *taxonomy)
{ {
@ -175,7 +252,7 @@ ecotx_t *eco_getspecies(ecotx_t *taxon,
if (!tax || rankindex < 0) if (!tax || rankindex < 0)
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
return eco_findtaxonatrank(taxon,rankindex,tax); return eco_findtaxonatrank(taxon,rankindex);
} }
ecotx_t *eco_getgenus(ecotx_t *taxon, ecotx_t *eco_getgenus(ecotx_t *taxon,
@ -193,7 +270,7 @@ ecotx_t *eco_getgenus(ecotx_t *taxon,
if (!tax || rankindex < 0) if (!tax || rankindex < 0)
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
return eco_findtaxonatrank(taxon,rankindex,tax); return eco_findtaxonatrank(taxon,rankindex);
} }
@ -212,7 +289,7 @@ ecotx_t *eco_getfamily(ecotx_t *taxon,
if (!tax || rankindex < 0) if (!tax || rankindex < 0)
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
return eco_findtaxonatrank(taxon,rankindex,tax); return eco_findtaxonatrank(taxon,rankindex);
} }
ecotx_t *eco_getkingdom(ecotx_t *taxon, ecotx_t *eco_getkingdom(ecotx_t *taxon,
@ -230,7 +307,7 @@ ecotx_t *eco_getkingdom(ecotx_t *taxon,
if (!tax || rankindex < 0) if (!tax || rankindex < 0)
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
return eco_findtaxonatrank(taxon,rankindex,tax); return eco_findtaxonatrank(taxon,rankindex);
} }
ecotx_t *eco_getsuperkingdom(ecotx_t *taxon, ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,
@ -248,5 +325,5 @@ ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,
if (!tax || rankindex < 0) if (!tax || rankindex < 0)
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
return eco_findtaxonatrank(taxon,rankindex,tax); return eco_findtaxonatrank(taxon,rankindex);
} }