From bc4c7656c607a198de541c28e94ceea344082274 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Mon, 24 Sep 2007 09:48:08 +0000 Subject: [PATCH] git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPCR/trunk@114 60f365c0-8329-0410-b2a4-ec073aeeaa1d --- src/Makefile | 84 ++++++++ src/ecofind.c | 345 +++++++++++++++++++++++++++++++ src/ecogrep.c | 403 ++++++++++++++++++++++++++++++++++++ src/ecoisundertaxon.c | 123 +++++++++++ src/ecopcr.c | 411 +++++++++++++++++++------------------ src/global.mk | 18 ++ src/libapat/Makefile | 23 +++ src/libecoPCR/Makefile | 29 +++ src/libecoPCR/ecoError.c | 10 +- src/libecoPCR/ecoIOUtils.c | 27 +-- src/libecoPCR/ecoPCR.h | 53 ++++- src/libecoPCR/ecofilter.c | 19 ++ src/libecoPCR/econame.c | 61 ++++++ src/libecoPCR/ecoseq.c | 15 +- src/libecoPCR/ecotax.c | 113 ++++++++-- 15 files changed, 1494 insertions(+), 240 deletions(-) create mode 100644 src/Makefile create mode 100644 src/ecofind.c create mode 100644 src/ecogrep.c create mode 100644 src/ecoisundertaxon.c create mode 100644 src/global.mk create mode 100644 src/libapat/Makefile create mode 100644 src/libecoPCR/Makefile create mode 100644 src/libecoPCR/ecofilter.c create mode 100644 src/libecoPCR/econame.c diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 0000000..60baeb5 --- /dev/null +++ b/src/Makefile @@ -0,0 +1,84 @@ +EXEC=ecoPCR ecofind ecoisundertaxon + +PCR_SRC= ecopcr.c +PCR_OBJ= $(patsubst %.c,%.o,$(PCR_SRC)) + +FIND_SRC= ecofind.c +FIND_OBJ= $(patsubst %.c,%.o,$(FIND_SRC)) + +IUT_SRC= ecoisundertaxon.c +IUT_OBJ= $(patsubst %.c,%.o,$(IUT_SRC)) + +SRCS= $(PCR_SRC) $(FIND_SRC) $(IUT_SRC) + +LIB= -lecoPCR -lapat -lz -lm + +LIBFILE= libapat/libapat.a \ + libecoPCR/libecoPCR.a + + +include global.mk + +all: $(EXEC) + + +######## +# +# ecoPCR compilation +# +######## + +# executable compilation and link + +ecoPCR: $(PCR_OBJ) $(LIBFILE) + $(CC) $(LDFLAGS) -o $@ $< $(LIBPATH) $(LIB) + +######## +# +# ecofind compilation +# +######## + +# executable compilation and link + +ecofind: $(FIND_OBJ) $(LIBFILE) + $(CC) $(LDFLAGS) -o $@ $< $(LIBPATH) $(LIB) + +######## +# +# IsUnderTaxon compilation +# +######## + +# executable compilation and link + +ecoisundertaxon: $(IUT_OBJ) $(LIBFILE) + $(CC) $(LDFLAGS) -o $@ $< $(LIBPATH) $(LIB) + +######## +# +# library compilation +# +######## + +libapat/libapat.a: + $(MAKE) -C libapat + +libecoPCR/libecoPCR.a: + $(MAKE) -C libecoPCR + + +######## +# +# project management +# +######## + +clean: + rm -f *.o + rm -f $(EXEC) + $(MAKE) -C libapat clean + $(MAKE) -C libecoPCR clean + + + \ No newline at end of file diff --git a/src/ecofind.c b/src/ecofind.c new file mode 100644 index 0000000..66b4a09 --- /dev/null +++ b/src/ecofind.c @@ -0,0 +1,345 @@ +#include "libecoPCR/ecoPCR.h" +#include +#include +#include +#include +#include +#define VERSION "0.1" + +/** + * display the result + **/ +static void printresult(ecotx_t *taxon,econame_t* name,ecotaxonomy_t *taxonomy) +{ + char* rankname; + char* classname; + char* matchedname=taxon->name; + + classname="scientific name"; + if (name) + { + classname=name->classname; + matchedname=name->name; + } + + rankname= taxonomy->ranks->label[taxon->rank]; + + printf("%10d \t| %15s \t|\t %-50s \t|\t %15s \t|\t %s\n", + taxon->taxid, + rankname, + matchedname, + classname, + taxon->name); +} + +/** + * display header before printing any result + **/ +static void printheader(void) +{ + printf("# %12s \t| %15s \t|\t %-50s \t|\t %-15s \t|\t %s\n#\n", + "taxonomy id", + "taxonomy rank", + "name", + "class name", + "scientific name"); +} + +/** + * display son's list for given taxon + **/ +static void get_son(ecotaxonomy_t *taxonomy, ecotx_t *taxon, int32_t *count, char *rankname) +{ + int32_t i; + ecotx_t *current_taxon; + + for ( i = 0, current_taxon = taxonomy->taxons->taxon; + i < taxonomy->taxons->count; + i++, current_taxon++) + { + if (taxon->taxid == current_taxon->parent->taxid) + { + if (rankname == NULL || !strcmp(rankname,taxonomy->ranks->label[current_taxon->rank])) + { + printresult(current_taxon, NULL, taxonomy); + (*count)++; + } + get_son(taxonomy,current_taxon,count,rankname); + } + } +} + + + +/** + * display list of rank filter option (-l option) + **/ +static void listfilteroptions(ecorankidx_t *ranks) +{ + int32_t i; + + printf("#\n"); + + for ( i=0; + i < ranks->count; + i++) + { + printf("# %s \n",ranks->label[i]); + } + + printf("#\n"); +} + + +/* ---------------------------------------- */ +/* get back on given taxid taxonomic parent */ +/* and display it */ +/* ---------------------------------------- */ +void gettaxidparents(int32_t taxid, ecotaxonomy_t *taxonomy, char *rankname) +{ + ecotx_t *next_parent; + int32_t c = 0; + + next_parent = eco_findtaxonbytaxid(taxonomy, taxid); + + printheader(); + + printresult(next_parent, NULL,taxonomy); + + while ( strcmp(next_parent->name, "root") ) + { + next_parent = next_parent->parent; + if (rankname == NULL || !strcmp(rankname,taxonomy->ranks->label[next_parent->rank])) + { + printresult(next_parent, NULL,taxonomy); + c++; + } + } + + printf("# %d parent(s) found\n#\n",c); +} + + +/** + * printout usage and exit + **/ +#define PP fprintf(stderr, + +static void ExitUsage(stat) + int stat; +{ + PP "usage: ecofind [-d database] [-h] [-l] [-r taxonomic rank] [-p taxid] [-s taxid] ... \n"); + PP "type \"ecofind -h\" for help\n"); + if (stat) + exit(stat); +} + +#undef PP + +/** + * printout help + **/ +#define PP fprintf(stdout, + +static void PrintHelp() +{ + PP "------------------------------------------\n"); + PP " ecofind Version %s\n", VERSION); + PP "------------------------------------------\n"); + PP "synopsis : searching for taxonomic and rank and\n"); + PP " taxonomy id for given regular expression patterns\n\n"); + PP "usage: ecofind [options] \n"); + PP "------------------------------------------\n"); + PP "options:\n"); + PP "-a : [A]ll enable the search on all alternative names and not only scientific names.\n\n"); + PP "-d : [D]atabase containing the taxonomy.\n"); + PP " To match the expected format, the database\n"); + PP " has to be formated first by the ecoPCRFormat.py\n"); + PP " program located in the tools directory.\n"); + PP " Write the database radical without any extension.\n\n"); + PP "-h : [H]elp - print help\n\n"); + PP "-l : [L]ist all taxonomic rank available for -r option\n\n"); + PP "-p : [P]arents : specifiying this option displays all parental tree's information for the given taxid.\n\n"); + PP "-r : [R]estrict to given taxonomic rank\n\n"); + PP "-s : [S]ons: specifiying this option displays all subtree's information for the given taxid.\n\n"); + PP "arguments:\n"); + PP " name pattern bearing regular expressions\n\n"); + PP "------------------------------------------\n"); + PP " http://www.grenoble.prabi.fr/trac/ecoPCR/\n"); + PP "------------------------------------------\n\n"); +} + +/* ----------------------------------------------- */ + +#define PATTERN_NUMBER 10 +#define PATTERN_LENGHT 40 +#define RESULT_LENGTH 100 + +int main(int argc, char **argv) +{ + int32_t carg = 0; + int32_t nummatch = 0; + int32_t k,j = 0; + int32_t errflag = 0; + int32_t tax_count = 0; + int32_t alternative = 0; + char *prefix = NULL; + ecotaxonomy_t *taxonomy; + econame_t *name; + int32_t name_count; + + int re_error; + int re_match; + regex_t re_preg; + + int32_t uptree = 0; + int32_t subtree = 0; + char *rankname = NULL; + int32_t rankfilter = 1; + int32_t list = 0; + + ecotx_t *subtree_parent; + int32_t count_son = 0; + + + while ((carg = getopt(argc, argv, "had:p:s:r:l")) != -1) { + switch (carg) { + case 's': /* path to the database */ + sscanf(optarg,"%d",&subtree); + break; + + case 'r': /* rank filter */ + rankname = ECOMALLOC(strlen(optarg),"allocation rankname"); + strcpy(rankname,optarg); + rankfilter = 0; + break; + + case 'd': /* path to the database */ + prefix = ECOMALLOC(strlen(optarg),"allocation prefix"); + strcpy(prefix,optarg); + break; + + case 'l': /* list rank filter options */ + list = 1; + break; + + case 'a': /* allow alternative names */ + alternative = 1; + break; + + case 'h': /* display help */ + PrintHelp(); + exit(0); + break; + + case 'p': /* taxid */ + sscanf(optarg,"%d",&uptree); + break; + + case '?': /* bad option */ + errflag++; + } + } + + if ((argc - optind) < 1) + errflag++; + + if (prefix == NULL) + { + prefix = getenv("ECOPCRDB"); + if (prefix == NULL) + errflag++; + } + + if (errflag && !uptree && !rankname && !subtree && !list) + ExitUsage(errflag); + + /** + * load taxonomy using libecoPCR functions + **/ + printf("# \n# opening %s database\n",prefix); + + taxonomy = read_taxonomy(prefix,1); + tax_count = taxonomy->taxons->count; + name_count = taxonomy->names->count; + + + /* ---------------------------------------- */ + /* list -r option possibility */ + /* ---------------------------------------- */ + if (list) + { + listfilteroptions(taxonomy->ranks); + return 0; + } + + /* ---------------------------------------- */ + /* display taxid parent if -t option */ + /* specified in command line */ + /* ---------------------------------------- */ + if (uptree) + { + gettaxidparents(uptree,taxonomy,rankname); + return 0; + } + + /* ---------------------------------------- */ + /* display taxid sons if -s option */ + /* specified in command line */ + /* ---------------------------------------- */ + if (subtree) + { + printheader(); + subtree_parent = eco_findtaxonbytaxid(taxonomy,subtree); + printresult(subtree_parent, NULL,taxonomy); + get_son(taxonomy, subtree_parent,&count_son,rankname); + printf("# %d son(s) found\n#\n",count_son); + return 0; + } + + printf("# %d taxons\n", tax_count); + + /** + * parse taxonomy + **/ + for (k=optind;knames->names; + j < name_count; + name++,j++) + { + + if(rankname) + rankfilter = !(strcmp(rankname,taxonomy->ranks->label[name->taxon->rank])); + + re_match = regexec (&re_preg, name->name, 0, NULL, 0); + + if (!re_match && (alternative || name->is_scientificname) && rankfilter) + { + printresult(name->taxon,name,taxonomy); + nummatch++; + } + + } + + printf("# %d records found \n",nummatch); + regfree(&re_preg); + } + + return 0; +} + + diff --git a/src/ecogrep.c b/src/ecogrep.c new file mode 100644 index 0000000..ff712a6 --- /dev/null +++ b/src/ecogrep.c @@ -0,0 +1,403 @@ +#include "libecoPCR/ecoPCR.h" +#include +#include +#include +#include +#include +#include + + +#define VERSION "0.1" + +void getLineContent(char *stream, ecoseq_t *seq, ecoseq_t *oligoseq_1, ecoseq_t *oligoseq_2){ + + int i; + char *buffer; + + for( i=0, buffer = strtok(stream,"|"); + buffer != NULL; + i++, buffer = strtok(NULL,"|")) + { + switch (i) { + case 0: + seq->AC = strdup(buffer); + break; + case 4: + sscanf(buffer,"%d",&seq->taxid); + break; + case 13: + oligoseq_1->SQ = strdup(buffer); + oligoseq_1->SQ_length = strlen(buffer); + break; + case 15: + oligoseq_2->SQ = strdup(buffer); + oligoseq_2->SQ_length = strlen(buffer); + break; + case 18: + seq->SQ = strdup(buffer); + seq->SQ_length = strlen(buffer); + break; + default: + break; + } + } +} + + +void freememory(char **tab, int32_t num){ + int32_t i; + for (i=0;iseqlen) > 0; + } + else return 0; +} + +/* ----------------------------------------------- */ +/* printout help */ +/* ----------------------------------------------- */ +#define PP fprintf(stdout, + +static void PrintHelp() +{ + PP "\n------------------------------------------\n"); + PP " ecogrep Version %s\n", VERSION); + PP "------------------------------------------\n"); + PP " synopsis : filtering ecoPCR result based on\n"); + PP " taxonomic id filter and regular expression pattern\n"); + PP " usage: ecogrep [options] filename\n"); + PP "------------------------------------------\n"); + PP " options:\n"); + PP " -1 : [FIRST] : compare the given pattern with direct strand oligonucleotide\n\n"); + PP " -2 : [SECOND] : compare the given pattern with reverse strand oligonucleotide\n\n"); + PP " -d : [D]atabase containing taxonomic information\n\n"); + PP " -e : [E]rrors : max error allowed in pattern match (option-1, -2 and -p) (0 by default)\n\n"); + PP " -p : [P]attern : oligonucleotide pattern\n\n"); + PP " -h : [H]elp : print help\n\n"); + PP " -i : [I]gnore subtree under given taxonomic id\n\n"); + PP " -r : [R]estrict search to subtree under given taxomic id\n\n"); + PP " -v : in[V]ert the sense of matching, to select non-matching lines.\n\n"); + PP " argument:\n"); + PP " ecoPCR ouput file name\n"); + PP "------------------------------------------\n\n"); + PP " http://www.grenoble.prabi.fr/trac/ecoPCR/\n"); + PP "------------------------------------------\n\n"); +} + +#undef PP + +/* ----------------------------------------------- */ +/* printout usage and exit */ +/* ----------------------------------------------- */ + +#define PP fprintf(stderr, + +static void ExitUsage(stat) + int stat; +{ + PP "usage: ecogrep [-d database] [-p pattern] [-i taxid] [-r taxid] [-v] [-h] \n"); + PP "type \"ecogrep -h\" for help\n"); + + if (stat) + exit(stat); +} + +#undef PP + +/* ----------------------------------------------- */ +/* MAIN */ +/* ----------------------------------------------- */ + +#define LINE_BUFF_SIZE 10000 + +int main(int argc, char **argv){ + int32_t carg = 0; + int32_t r = 0; // number of restricted taxid + int32_t i = 0; // number of ignored taxid + int32_t v = 0; // stores if -v mode is active + int32_t k = 0; // file counter + int32_t errflag = 0; + int32_t error_max = 0; // stores the error rate allowed by the user + int32_t matchingresult = 0; // stores number of matching result + + ecotaxonomy_t *taxonomy; // stores the taxonomy + + ecoseq_t *seq = NULL; // stores sequence info + ecoseq_t *oligoseq_1 = NULL; // stores the oligo_1 info + ecoseq_t *oligoseq_2 = NULL; // stores the oligo_2 info + + char *database = NULL; // stores the database path (for taxonomy) + + char *p = NULL; // pattern for sequence + PatternPtr pattern = NULL; // stores the build pattern for sequence + char *o1 = NULL; // pattern for direct strand oligo + PatternPtr oligo_1 = NULL; // stores the build pattern for direct strand oligo + char *o2 = NULL; // pattern for reverse strand oligo + PatternPtr oligo_2 = NULL; // stores the build pattern for reverse strand oligo + + int32_t *restricted_taxid = NULL; // stores the restricted taxid + int32_t *ignored_taxid = NULL; // stores the ignored taxid + + FILE *file = NULL; // stores the data stream, stdin by default + char *stream = ECOMALLOC(sizeof(char *)*LINE_BUFF_SIZE,"error stream buffer allocation"); + char *orig = ECOMALLOC(sizeof(char *)*LINE_BUFF_SIZE,"error orig buffer allocation"); + + int is_ignored = 0; + int is_included = 0; + int is_matching = 0; + int match_o1 = 0; + int match_o2 = 0; + int good = 0; + + seq = new_ecoseq(); + oligoseq_1 = new_ecoseq(); + oligoseq_2 = new_ecoseq(); + + /** + * Parse commande line options + **/ + while ((carg = getopt(argc, argv, "1:2:p:d:i:r:e:vh")) != -1) { + + switch (carg) { + case '1': + o1 = ECOMALLOC(strlen(optarg)+1, + "Error on o1 allocation"); + strcpy(o1,optarg); + break; + + case '2': + o2 = ECOMALLOC(strlen(optarg)+1, + "Error on o2 allocation"); + strcpy(o2,optarg); + break; + + case 'd': + database = ECOMALLOC(strlen(optarg)+1, + "Error on datafile allocation"); + strcpy(database,optarg); + break; + + case 'i': + ignored_taxid = ECOREALLOC( ignored_taxid, + sizeof(int32_t)*(i+1), + "Error on ignored_taxid reallocation"); + sscanf(optarg,"%d",&ignored_taxid[i]); + i++; + break; + + case 'r': + restricted_taxid = ECOREALLOC( restricted_taxid, + sizeof(int32_t)*(r+1), + "Error on restricted_taxid reallocation"); + sscanf(optarg,"%d",&restricted_taxid[r]); + r++; + break; + + case 'v': + v = 1; + break; + + case 'h': + PrintHelp(); + exit(0); + break; + + case 'e': + sscanf(optarg,"%d",&error_max); + break; + + case 'p': + p = ECOMALLOC(strlen(optarg)+1, + "Error on pattern allocation"); + strcpy(p,optarg); + break; + + case '?': + errflag++; + } + } + + /** + * Check sequence pattern length and build it in PatternPtr format + **/ + if(p) + { + if (strlen(p) > 32){ + printf("# Sorry, ecogrep doesn't handle pattern longer than 32 characters.\ + \n# Please check it out : %s\n",p); + exit(EXIT_FAILURE); + } + else if ( (pattern = buildPattern(p,error_max)) == NULL) + exit(EXIT_FAILURE); + } + + + + /** + * Check o1 pattern length and build it in PatternPtr format + **/ + if(o1) + { + if (strlen(o1) > 32){ + printf("# Sorry, ecogrep doesn't handle pattern longer than 32 characters.\ + \n# Please check it out : %s\n",o1); + exit(EXIT_FAILURE); + } + else if ( (oligo_1 = buildPattern(o1,error_max)) == NULL) + exit(EXIT_FAILURE); + } + + /** + * Check o2 pattern length and build it in PatternPtr format + **/ + if(o2) + { + if (strlen(o2) > 32){ + printf("# Sorry, ecogrep doesn't handle pattern longer than 32 characters.\ + \n# Please check it out : %s\n",o2); + exit(EXIT_FAILURE); + } + else if ( (oligo_2 = buildPattern(o2,error_max)) == NULL) + exit(EXIT_FAILURE); + } + + /** + * try to get the database name from environment variable + * if no database name specified in the -d option + **/ + if (database == NULL) + { + database = getenv("ECOPCRDB"); + if (database == NULL) + errflag++; + } + + /** + * check at leat one processing is asked + * either patterns or taxid filters + **/ + if ( !p && !o1 && !o2 && restricted_taxid == NULL && ignored_taxid == NULL ) + { + errflag++; + } + if (errflag) + ExitUsage(errflag); + + /** + * Get the taxonomy back + **/ + taxonomy = read_taxonomy(database,0); + + /** + * Parse the stream + **/ + for (k=0 ; argc >= optind ; optind++, k++){ + + matchingresult = 0; + + if ( (file = fopen(argv[optind], "r")) == NULL) + { + if (isatty(fileno(stdin)) == 0) + { + file = stdin; + printf("# Processing standard input...\n"); + } + else + break; + } + else + printf("# Processing %s...\n",argv[optind]); + + while( fgets(stream, LINE_BUFF_SIZE, file) != NULL ){ + + if (stream[0]!= '#') + { + + stream[LINE_BUFF_SIZE-1]=0; + + strcpy(orig,stream); + + getLineContent(stream,seq,oligoseq_1,oligoseq_2); + + /* -----------------------------------------------*/ + /* is ignored if at least one option -i */ + /* AND */ + /* if current sequence is son of taxid */ + /* -----------------------------------------------*/ + is_ignored = ( (i > 0) && (eco_is_taxid_included( taxonomy, + ignored_taxid, + i, + seq->taxid)) + ); + + /* -----------------------------------------------*/ + /* is included if no -r option */ + /* OR */ + /* if current sequence is son of taxid */ + /* -----------------------------------------------*/ + is_included = ( (r == 0) || (eco_is_taxid_included( taxonomy, + restricted_taxid, + r, + seq->taxid)) + ); + + /* ----------------------------------------------------------- */ + /* match if no pattern or if pattern match current sequence */ + /* ----------------------------------------------------------- */ + is_matching = ( !p || (ispatternmatching(seq,pattern))); + + /* ---------------------------------------------------------------------------- */ + /* match if no direct oligo pattern or if pattern match current direct oligo */ + /* ---------------------------------------------------------------------------- */ + match_o1 = (!o1 || (ispatternmatching(oligoseq_1,oligo_1))); + + /* ------------------------------------------------------------------------------- */ + /* match if no revesrse oligo pattern or if pattern match current reverse oligo */ + /* ------------------------------------------------------------------------------- */ + match_o2 = (!o2 || (ispatternmatching(oligoseq_2,oligo_2))); + + good = (is_included && is_matching && !is_ignored && match_o1 && match_o2); + + if (v) + good=!good; + + if ( good ) + { + printf("%s",orig); + matchingresult++; + } + } + } + if ( file != stdin ) + fclose(file); + + printf("# %d matching result(s)\n#\n",matchingresult); + } + + /** + * clean and free before leaving + **/ + ECOFREE(orig,"Error in free orig"); + ECOFREE(stream,"Error in free stream"); + ECOFREE(ignored_taxid,"Error in free stream"); + ECOFREE(restricted_taxid,"Error in free stream"); + + return 0; +} \ No newline at end of file diff --git a/src/ecoisundertaxon.c b/src/ecoisundertaxon.c new file mode 100644 index 0000000..1e41659 --- /dev/null +++ b/src/ecoisundertaxon.c @@ -0,0 +1,123 @@ +#include "libecoPCR/ecoPCR.h" +#include +#include + +#define VERSION "0.1" + +/* ----------------------------------------------- */ +/* printout verbose mode */ +/* ----------------------------------------------- */ +static void printTaxon(ecotx_t *taxon){ + printf("# taxid : %d | rank : %d | name : %s \n\n",taxon->taxid, taxon->rank, taxon->name); +} + +/* ----------------------------------------------- */ +/* printout help */ +/* ----------------------------------------------- */ +#define PP fprintf(stdout, + +static void PrintHelp() +{ + PP "\n------------------------------------------\n"); + PP " ecoisundertaxon Version %s\n", VERSION); + PP "------------------------------------------\n"); + PP " synopsis : searching relationship in taxonomy\n"); + PP " usage: ecoisundertaxon [options] database\n"); + PP "------------------------------------------\n"); + PP " options:\n"); + PP " -1 : [FIRST] taxomic id of the hypothetical son\n\n"); + PP " -2 : [SECOND] taxonomic id of the hypothetical parent\n\n"); + PP " -h : [H]elp - print help\n\n"); + PP " -v : [V]erbose mode. Display taxonomic information for both\n"); + PP " : taxonomic id.\n\n"); + PP "------------------------------------------\n"); + PP " database : to match the expected format, the database\n"); + PP " has to be formated first by the ecoPCRFormat.py program located.\n"); + PP " in the tools directory. Type the radical only, leaving out the extension\n"); + PP "------------------------------------------\n\n"); + PP " https://www.grenoble.prabi.fr/trac/ecoPCR/wiki"); + PP "------------------------------------------\n\n"); +} + +#undef PP + +/* ----------------------------------------------- */ +/* printout usage and exit */ +/* ----------------------------------------------- */ + +#define PP fprintf(stderr, + +static void ExitUsage(stat) + int stat; +{ + PP "usage: ecoisundertaxon [-1 taxid] [-2 taxid] [-v] [-h] datafile\n"); + PP "type \"ecoisundertaxon -h\" for help\n"); + + if (stat) + exit(stat); +} + +#undef PP + +/* ----------------------------------------------- */ +/* MAIN */ +/* ----------------------------------------------- */ + +int main(int argc, char **argv){ + int32_t carg = 0; + int32_t taxid_1 = 0; + int32_t taxid_2 = 0; + int32_t verbose = 0; + int32_t errflag = 0; + ecotaxonomy_t *taxonomy = NULL; + ecotx_t *son = NULL; + ecotx_t *parent = NULL; + + + while ((carg = getopt(argc, argv, "1:2:vh")) != -1) { + switch (carg) { + case '1': + sscanf(optarg,"%d",&taxid_1); + break; + + case '2': + sscanf(optarg,"%d",&taxid_2); + break; + + case 'v': + verbose = 1; + break; + + case 'h': + PrintHelp(); + exit(0); + break; + + case '?': + errflag++; + } + } + + if ((argc -= optind) != 1) + errflag++; + + if (errflag) + ExitUsage(errflag); + + taxonomy = read_taxonomy(argv[optind],0); + + son = eco_findtaxonbytaxid(taxonomy, taxid_1); + + if (verbose){ + parent = eco_findtaxonbytaxid(taxonomy, taxid_2); + printTaxon(son); + printTaxon(parent); + } + + if (eco_isundertaxon(son, taxid_2)) + printf("# taxid_1 (%d) is son of taxid_2 (%d)\n",taxid_1, taxid_2); + else + printf("# taxid_1 (%d) is NOT son of taxid_2 (%d)\n",taxid_1, taxid_2); + + return 0; +} \ No newline at end of file diff --git a/src/ecopcr.c b/src/ecopcr.c index 20a9bce..9d8a948 100644 --- a/src/ecopcr.c +++ b/src/ecopcr.c @@ -8,104 +8,69 @@ #define VERSION "0.1" /* ----------------------------------------------- */ -/* printout help */ /* ----------------------------------------------- */ - +/* printout help */ +/* ----------------------------------------------- */ #define PP fprintf(stdout, static void PrintHelp() { PP "------------------------------------------\n"); - PP " Apat Version %s\n", VERSION); + PP " ecoPCR Version %s\n", VERSION); PP "------------------------------------------\n"); - PP "synopsis : pattern(s) searching program\n"); - PP "usage: apat [options] patfile datafile\n"); + PP "synopsis : searching for sequence and taxonomy hybriding with given primers\n"); + PP "usage: ecoPCR [options] \n"); PP "------------------------------------------\n"); PP "options:\n"); - PP "-a code : [A]lphabet encoding for pattern\n"); - PP " code is one of : \n"); - PP " dna: use IUPAC equivalences for dna/rna\n"); - PP " prot: use IUPAC equivalences for proteins\n"); - PP " alpha: no equivalences, just treat plain symbols\n"); - PP " note: the equivalences are used in pattern only\n"); - PP " *not* in sequence(s) (see note (4) below)\n"); - PP " dft: alpha\n"); - PP "-c : [C]ooccurences\n"); - PP " print patterns cooccurence matrix \n"); - PP " dft: off\n"); - PP "-h : [H]elp - print help\n"); - PP "-m : [M]ultiple occurences\n"); - PP " see -q option \n"); - PP " dft: off\n"); - PP "-o file : [O]utput sequences\n"); - PP " additionaly output sequence(s) that match into\n"); - PP " 'file' in fasta format\n"); - PP " dft: off\n"); - PP "-p : no [Print] - don't printout hits\n"); - PP " when just counts are needed\n"); - PP " dft: off\n"); - PP "-q nn : [Quorum]\n"); - PP " printout result if at least nn\n"); - PP " different patterns are found on the sequence\n"); - PP " (with -m : at least nn different )\n"); - PP " dft: # of patterns read\n"); - PP "-s : no [Sort] - don't sort hits before printing\n"); - PP " usually hits are printed by increasing position\n"); - PP " this option will list them by pattern\n"); - PP " dft: off\n"); - PP "-t : [T]est sequence\n"); - PP " additionnaly check if sequences are uppercase\n"); - PP " this is mostly used for testing\n"); - PP " dft: off\n"); - PP "-u : [U]pper\n"); - PP " force lower->upper sequence conversion\n"); - PP " without this option lowercase symbols in sequence\n"); - PP " will not be considered to as matches\n"); - PP " dft: off\n"); - PP "-v : [V]erbose\n"); - PP " just display a kind of progress clock on stderr\n"); - PP " (this is only useful if you redirect stdout)\n"); - PP "\n"); - PP "patfile : pattern file (see below)\n"); - PP "datafile : database file (see below)\n"); - PP "------------------------------------------\n"); - PP "pattern file format :\n"); - PP " one pattern/line\n"); - PP " format : #errors\n"); - PP " := pattern\n"); - PP " or !pattern\n"); - PP " or pattern#\n"); - PP " or !pattern#\n"); - PP " := \n"); - PP " or [....]\n"); - PP " := uppercase letter (A-Z)\n"); - PP " := a positive number indicates max number of mismatches\n"); - PP " a negative number indicates max number of mismatches or indels\n"); - PP " # means that no error is allowed at this position\n"); - PP " ! complement the \n"); - PP " [...] means that all symbols within [] are allowed\n"); - PP " in addition IUPAC equivalences may be used as symbols\n"); - PP " with the -a option\n"); - PP "\n"); - PP "example: G[DE]S#[GIV]!HP![DE]# 1\n"); + PP "-d : [D]atabase : to match the expected format, the database\n"); + PP " has to be formated first by the ecoPCRFormat.py program located.\n"); + PP " in the tools directory.\n"); + PP " ecoPCRFormat.py creates three file types :\n"); + PP " .sdx : contains the sequences\n"); + PP " .tdx : contains information concerning the taxonomy\n"); + PP " .rdx : contains the taxonomy rank\n\n"); + PP " ecoPCR needs all the file type. As a result, you have to write the\n"); + PP " database radical without any extension. For example /ecoPCRDB/gbmam\n\n"); + PP "-e : [E]rror : max error allowed by oligonucleotide (0 by default)\n\n"); + PP "-h : [H]elp - print help\n\n"); + PP "-i : [I]gnore the given taxonomy id.\n"); + PP " Taxonomy id are available using the ecofind program.\n"); + PP " see its help typing ecofind -h for more information.\n\n"); + PP "-k : [K]ingdom mode : set the kingdom mode\n"); + PP " super kingdom mode by default.\n\n"); + PP "-l : minimum [L]ength : define the minimum amplication length. \n\n"); + PP "-L : maximum [L]ength : define the maximum amplicationlength. \n\n"); + PP "-r : [R]estricts the search to the given taxonomic id.\n"); + PP " Taxonomy id are available using the ecofind program.\n"); + PP " see its help typing ecofind -h for more information.\n"); PP "\n"); PP "------------------------------------------\n"); - PP "datafile contains one or more sequences in\n"); - PP "Fasta format, with *uppercase* symbols \n"); - PP "\n"); + PP "first argument : oligonucleotide for direct strand\n\n"); + PP "second argument : oligonucleotide for reverse strand\n\n"); PP "------------------------------------------\n"); - PP "note (1): the maximum number of patterns is %d\n", MAX_PATTERN); - PP "\n"); - PP "note (2): the maximum length for one pattern is %d\n", MAX_PAT_LEN); - PP "\n"); - PP "note (3): indels are still experimental and are :\n"); - PP " not handled gracefully with the # syntax\n"); - PP " and hits are not printed very nicely\n"); - PP "\n"); - PP "note (4): the IUPAC equivalences (-a option) are used\n"); - PP " in pattern only *not* in sequence(s).\n"); - PP " for instance GATN (with option -a dna) is equivalent to GAT[ACGT]\n"); - PP " and will match GATA/GATC/GATG/GATC but will not match GATN\n"); - PP " (nor NNNN) in sequence.\n"); + PP "Table result description : \n"); + PP "column 1 : accession number\n"); + PP "column 2 : sequence length\n"); + PP "column 3 : taxonomic id\n"); + PP "column 4 : rank\n"); + PP "column 5 : species taxonomic id\n"); + PP "column 6 : scientific name\n"); + PP "column 7 : genus taxonomic id\n"); + PP "column 8 : genus name\n"); + PP "column 9 : family taxonomic id\n"); + PP "column 10 : family name\n"); + PP "column 11 : super kingdom taxonomic id\n"); + PP "column 12 : super kingdom name\n"); + PP "column 13 : strand (direct or reverse)\n"); + PP "column 14 : first oligonucleotide\n"); + PP "column 15 : number of errors for the first strand\n"); + PP "column 16 : second oligonucleotide\n"); + PP "column 17 : number of errors for the second strand\n"); + PP "column 18 : amplification length\n"); + PP "column 19 : sequence\n"); + PP "column 20 : definition\n"); + PP "------------------------------------------\n"); + PP " http://www.grenoble.prabi.fr/trac/ecoPCR/\n"); + PP "------------------------------------------\n\n"); PP "\n"); } @@ -121,10 +86,8 @@ static void PrintHelp() static void ExitUsage(stat) int stat; { - PP "usage: apat [-a dna|prot] [-c] [-h] [-m] [-o file] [-p]\n"); - PP " [-q nn] [-t] [-u] [-v]\n"); - PP " patfile datafile\n"); - PP "type \"apat -h\" for help\n"); + PP "usage: ecoPCR [-d database] [-l value] [-L value] [-e value] [-r taxid] [-i taxid] [-k] oligo1 oligo2\n"); + PP "type \"ecoPCR -h\" for help\n"); if (stat) exit(stat); @@ -306,7 +269,7 @@ int main(int argc, char **argv) int32_t errflag=0; char kingdom_mode=0; - char *prefix; + char *prefix = NULL; int32_t checkedSequence = 0; int32_t positiveSequence= 0; @@ -330,43 +293,39 @@ int main(int argc, char **argv) int32_t erri; int32_t errj; + int32_t *restricted_taxid = NULL; + int32_t *ignored_taxid = NULL; + int32_t r=0; + int32_t g=0; - while ((carg = getopt(argc, argv, "h1:2:l:L:e:k")) != -1) { + while ((carg = getopt(argc, argv, "hd:l:L:e:i:r:k")) != -1) { + switch (carg) { /* -------------------- */ - case '1': /* prenier oligo */ + case 'd': /* database name */ /* -------------------- */ - oligo1 = ECOMALLOC(strlen(optarg)+1, - "Error on oligo 1 allocation"); - strcpy(oligo1,optarg); - break; - - /* -------------------- */ - case '2': /* coocurence option */ - /* -------------------- */ - oligo2 = ECOMALLOC(strlen(optarg)+1, - "Error on oligo 1 allocation"); - strcpy(oligo2,optarg); + prefix = ECOMALLOC(strlen(optarg)+1, + "Error on prefix allocation"); + strcpy(prefix,optarg); break; /* -------------------- */ case 'h': /* help */ /* -------------------- */ - PrintHelp(); exit(0); break; - /* -------------------- */ - case 'l': /* lmin amplification */ - /* -------------------- */ + /* ------------------------- */ + case 'l': /* min amplification lenght */ + /* ------------------------- */ sscanf(optarg,"%d",&lmin); break; - /* -------------------- */ - case 'L': /* lmax amplification */ - /* -------------------- */ + /* -------------------------- */ + case 'L': /* max amplification lenght */ + /* -------------------------- */ sscanf(optarg,"%d",&lmax); break; @@ -374,11 +333,29 @@ int main(int argc, char **argv) case 'e': /* error max */ /* -------------------- */ sscanf(optarg,"%d",&error_max); + break; + + /* -------------------- */ + case 'k': /* set the kingdom mode */ + kingdom_mode = 1; /* -------------------- */ + break; + + /* ------------------------------------------ */ + case 'r': /* stores the restricting search taxonomic id */ + /* ------------------------------------------ */ + restricted_taxid = ECOREALLOC(restricted_taxid,sizeof(int32_t)*(r+1), + "Error on restricted_taxid reallocation"); + sscanf(optarg,"%d",&restricted_taxid[r]); + r++; break; - case 'k': /* error max */ - /* -------------------- */ - kingdom_mode = 1; + /* --------------------------------- */ + case 'i': /* stores the taxonomic id to ignore */ + /* --------------------------------- */ + ignored_taxid = ECOREALLOC(ignored_taxid,sizeof(int32_t)*(g+1), + "Error on excluded_taxid reallocation"); + sscanf(optarg,"%d",&ignored_taxid[g]); + g++; break; /* -------------------- */ @@ -389,25 +366,43 @@ int main(int argc, char **argv) } - if ((argc -= optind) != 1) - errflag++; - + /** + * check the path to the database is given as last argument + */ + if ((argc -= optind) == 2) + { + + oligo1 = ECOMALLOC(strlen(argv[optind])+1, + "Error on oligo1 allocation"); + strcpy(oligo1,argv[optind]); + optind++; + oligo2 = ECOMALLOC(strlen(argv[optind])+1, + "Error on oligo1 allocation"); + strcpy(oligo2,argv[optind]); + } + else + errflag++; + + if (prefix == NULL) + { + prefix = getenv("ECOPCRDB"); + if (prefix == NULL) + errflag++; + } + + if (!oligo1 || !oligo2) errflag++; if (errflag) ExitUsage(errflag); - prefix = argv[optind]; - o1 = buildPattern(oligo1,error_max); o2 = buildPattern(oligo2,error_max); o1c = complementPattern(o1); o2c = complementPattern(o2); - - printf("#\n"); printf("# ecoPCR version %s\n",VERSION); printf("# direct strand oligo1 : %-32s ; oligo2c : %32s\n", o1->cpat,o2c->cpat); @@ -427,102 +422,120 @@ int main(int argc, char **argv) printf("# output in superkingdom mode\n"); printf("#\n"); - taxonomy = read_taxonomy(prefix); - + taxonomy = read_taxonomy(prefix,0); + seq = ecoseq_iterator(prefix); - - + checkedSequence = 0; positiveSequence= 0; amplifiatCount = 0; while(seq) - { + { checkedSequence++; - - scname = taxonomy->taxons->taxon[seq->taxid].name; - strncpy(head,seq->SQ,10); - head[10]=0; - strncpy(tail,seq->SQ+seq->SQ_length-10,10); - tail[10]=0; - + /** + * check if current sequence should be included + **/ + if ( (r == 0) || + (eco_is_taxid_included(taxonomy, + restricted_taxid, + r, + taxonomy->taxons->taxon[seq->taxid].taxid) + ) + ) + if ((g == 0) || + !(eco_is_taxid_included(taxonomy, + ignored_taxid, + g, + taxonomy->taxons->taxon[seq->taxid].taxid) + ) + ) + { - apatseq=ecoseq2apatseq(seq,apatseq); + scname = taxonomy->taxons->taxon[seq->taxid].name; + strncpy(head,seq->SQ,10); + head[10]=0; + strncpy(tail,seq->SQ+seq->SQ_length-10,10); + tail[10]=0; - o1Hits = ManberAll(apatseq,o1,0,0,apatseq->seqlen); - o2cHits= 0; - - - if (o1Hits) - { - stktmp = apatseq->hitpos[0]; - begin = stktmp->val[0] + o1->patlen; - - if (lmax) - length= stktmp->val[stktmp->top-1] + o1->patlen - begin + lmax + o2->patlen; - else - length= apatseq->seqlen - begin; + apatseq=ecoseq2apatseq(seq,apatseq); - o2cHits = ManberAll(apatseq,o2c,1,begin,length); - - if (o2cHits) - for (i=0; i < o1Hits;i++) - { - posi = apatseq->hitpos[0]->val[i]; - erri = apatseq->hiterr[0]->val[i]; - for (j=0; j < o2cHits; j++) - { - posj =apatseq->hitpos[1]->val[j] + o2c->patlen; - errj =apatseq->hiterr[1]->val[j]; - length=posj - posi + 1 - o1->patlen - o2->patlen; - - if ((!lmin || (length >= lmin)) && - (!lmax || (length <= lmax))) - printRepeat(seq,o1,o2c,'D',kingdom_mode,posi,posj,erri,errj,taxonomy); - //printf("%s\tD\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o1Hits,o2cHits,posi,posj,scname); - } - } - } - - o2Hits = ManberAll(apatseq,o2,2,0,apatseq->seqlen); - o1cHits= 0; - if (o2Hits) - { - stktmp = apatseq->hitpos[2]; - begin = stktmp->val[0] + o2->patlen; - - if (lmax) - length= stktmp->val[stktmp->top-1] + o2->patlen - begin + lmax + o1->patlen; - else - length= apatseq->seqlen - begin; + o1Hits = ManberAll(apatseq,o1,0,0,apatseq->seqlen); + o2cHits= 0; - o1cHits = ManberAll(apatseq,o1c,3,begin,length); - - if (o1cHits) - for (i=0; i < o2Hits;i++) + if (o1Hits) { - posi = apatseq->hitpos[2]->val[i]; - erri = apatseq->hiterr[2]->val[i]; - for (j=0; j < o1cHits; j++) - { - posj=apatseq->hitpos[3]->val[j] + o1c->patlen; - errj=apatseq->hiterr[3]->val[j]; - length=posj - posi + 1 - o1->patlen - o2->patlen; + stktmp = apatseq->hitpos[0]; + begin = stktmp->val[0] + o1->patlen; + + if (lmax) + length= stktmp->val[stktmp->top-1] + o1->patlen - begin + lmax + o2->patlen; + else + length= apatseq->seqlen - begin; - if ((!lmin || (length >= lmin)) && - (!lmax || (length <= lmax))) - printRepeat(seq,o2,o1c,'R',kingdom_mode,posi,posj,erri,errj,taxonomy); - //printf("%s\tR\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o2Hits,o1cHits,posi,posj,scname); - } + o2cHits = ManberAll(apatseq,o2c,1,begin,length); + + if (o2cHits) + for (i=0; i < o1Hits;i++) + { + posi = apatseq->hitpos[0]->val[i]; + erri = apatseq->hiterr[0]->val[i]; + for (j=0; j < o2cHits; j++) + { + posj =apatseq->hitpos[1]->val[j] + o2c->patlen; + errj =apatseq->hiterr[1]->val[j]; + length=posj - posi + 1 - o1->patlen - o2->patlen; + + if ((!lmin || (length >= lmin)) && + (!lmax || (length <= lmax))) + printRepeat(seq,o1,o2c,'D',kingdom_mode,posi,posj,erri,errj,taxonomy); + //printf("%s\tD\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o1Hits,o2cHits,posi,posj,scname); + } + } } - } - - + + o2Hits = ManberAll(apatseq,o2,2,0,apatseq->seqlen); + o1cHits= 0; + if (o2Hits) + { + stktmp = apatseq->hitpos[2]; + begin = stktmp->val[0] + o2->patlen; + + if (lmax) + length= stktmp->val[stktmp->top-1] + o2->patlen - begin + lmax + o1->patlen; + else + length= apatseq->seqlen - begin; + + o1cHits = ManberAll(apatseq,o1c,3,begin,length); + + if (o1cHits) + for (i=0; i < o2Hits;i++) + { + posi = apatseq->hitpos[2]->val[i]; + erri = apatseq->hiterr[2]->val[i]; + for (j=0; j < o1cHits; j++) + { + posj=apatseq->hitpos[3]->val[j] + o1c->patlen; + errj=apatseq->hiterr[3]->val[j]; + length=posj - posi + 1 - o1->patlen - o2->patlen; + + if ((!lmin || (length >= lmin)) && + (!lmax || (length <= lmax))) + printRepeat(seq,o2,o1c,'R',kingdom_mode,posi,posj,erri,errj,taxonomy); + //printf("%s\tR\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o2Hits,o1cHits,posi,posj,scname); + } + } + } + + } /* End of taxonomic selection */ delete_ecoseq(seq); seq = ecoseq_iterator(NULL); } + + ECOFREE(restricted_taxid, "Error: could not free restricted_taxid\n"); + ECOFREE(ignored_taxid, "Error: could not free excluded_taxid\n"); return 0; } diff --git a/src/global.mk b/src/global.mk new file mode 100644 index 0000000..44efeca --- /dev/null +++ b/src/global.mk @@ -0,0 +1,18 @@ +MACHINE=MAC_OS_X +LIBPATH= -Llibapat -LlibecoPCR +MAKEDEPEND = gcc -D$(MACHINE) -M $(CPPFLAGS) -o $*.d $< + +CC=gcc +CFLAGS= -W -Wall -O2 -g + +default: all + +%.o: %.c + $(CC) -D$(MACHINE) $(CFLAGS) -c -o $@ $< + +%.P : %.c + $(MAKEDEPEND) + @sed 's/\($*\)\.o[ :]*/\1.o $@ : /g' < $*.d > $@; \ + rm -f $*.d; [ -s $@ ] || rm -f $@ + +include $(SRCS:.c=.P) \ No newline at end of file diff --git a/src/libapat/Makefile b/src/libapat/Makefile new file mode 100644 index 0000000..3cbd4cd --- /dev/null +++ b/src/libapat/Makefile @@ -0,0 +1,23 @@ + +SOURCES = apat_parse.c \ + apat_search.c \ + libstki.c + +SRCS=$(SOURCES) + + +OBJECTS= $(patsubst %.c,%.o,$(SOURCES)) + +LIBFILE= libapat.a + + +include ../global.mk + +all: $(LIBFILE) + +clean: + rm -rf $(OBJECTS) $(LIBFILE) + +$(LIBFILE): $(OBJECTS) + ar -cr $@ $? + diff --git a/src/libecoPCR/Makefile b/src/libecoPCR/Makefile new file mode 100644 index 0000000..c5f2033 --- /dev/null +++ b/src/libecoPCR/Makefile @@ -0,0 +1,29 @@ + +SOURCES = ecoapat.c \ + ecodna.c \ + ecoError.c \ + ecoIOUtils.c \ + ecoMalloc.c \ + ecorank.c \ + ecoseq.c \ + ecotax.c \ + ecofilter.c \ + econame.c + +SRCS=$(SOURCES) + +OBJECTS= $(patsubst %.c,%.o,$(SOURCES)) + +LIBFILE= libecoPCR.a + + +include ../global.mk + + +all: $(LIBFILE) + +clean: + rm -rf $(OBJECTS) $(LIBFILE) + +$(LIBFILE): $(OBJECTS) + ar -cr $@ $? diff --git a/src/libecoPCR/ecoError.c b/src/libecoPCR/ecoError.c index a52b9e9..00bbfa2 100644 --- a/src/libecoPCR/ecoError.c +++ b/src/libecoPCR/ecoError.c @@ -2,7 +2,15 @@ #include #include - +/* + * print the message given as argument and exit the program + * @param error error number + * @param message the text explaining what's going on + * @param filename the file source where the program failed + * @param linenumber the line where it has failed + * filename and linenumber are written at pre-processing + * time by a macro + */ void ecoError(int32_t error, const char* message, const char * filename, diff --git a/src/libecoPCR/ecoIOUtils.c b/src/libecoPCR/ecoIOUtils.c index 0f8cebc..8d7ce82 100644 --- a/src/libecoPCR/ecoIOUtils.c +++ b/src/libecoPCR/ecoIOUtils.c @@ -16,20 +16,19 @@ int32_t is_big_endian() - - - int32_t swap_int32_t(int32_t i) { return SWAPINT32(i); } - - - - - +/** + * Read part of the file + * @param *f the database + * @param recordSize the size to be read + * + * @return buffer + */ void *read_ecorecord(FILE *f,int32_t *recordSize) { static void *buffer =NULL; @@ -79,10 +78,14 @@ void *read_ecorecord(FILE *f,int32_t *recordSize) - - - - +/** + * Open the database and check it's readable + * @param filename name of the database (.sdx, .rdx, .tbx) + * @param sequencecount buffer - pointer to variable storing the number of occurence + * @param abort_on_open_error boolean to define the behaviour in case of error + * while opening the database + * @return FILE type + **/ FILE *open_ecorecorddb(const char *filename, int32_t *sequencecount, int32_t abort_on_open_error) diff --git a/src/libecoPCR/ecoPCR.h b/src/libecoPCR/ecoPCR.h index 737cd76..4c26fc0 100644 --- a/src/libecoPCR/ecoPCR.h +++ b/src/libecoPCR/ecoPCR.h @@ -56,11 +56,11 @@ typedef struct { } ecotxformat_t; -typedef struct { - int32_t taxid; - int32_t rank; - int32_t parent; - char *name; +typedef struct ecotxnode { + int32_t taxid; + int32_t rank; + struct ecotxnode *parent; + char *name; } ecotx_t; typedef struct { @@ -80,12 +80,42 @@ typedef struct { char* label[1]; } ecorankidx_t; +/* + * + * Taxonomy name types + * + */ + typedef struct { + int32_t is_scientificname; + int32_t namelength; + int32_t classlength; + int32_t taxid; + char names[1]; +} econameformat_t; + + + typedef struct { + char *name; + char *classname; + int32_t is_scientificname; + struct ecotxnode *taxon; +} econame_t; + + +typedef struct { + int32_t count; + econame_t names[1]; +} econameidx_t; + + + typedef struct { ecorankidx_t *ranks; + econameidx_t *names; ecotxidx_t *taxons; } ecotaxonomy_t; - + /***************************************************** * * Function declarations @@ -175,6 +205,9 @@ ecoseq_t *readnext_ecoseq(FILE *); ecorankidx_t *read_rankidx(const char *filename); +econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy); + + /** * Read taxonomy data as formated by the ecoPCRFormat.py script. @@ -189,8 +222,11 @@ ecorankidx_t *read_rankidx(const char *filename); ecotxidx_t *read_taxonomyidx(const char *filename); -ecotaxonomy_t *read_taxonomy(const char *prefix); +ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName); +ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy, int32_t taxid); + +int eco_isundertaxon(ecotx_t *taxon, int other_taxid); ecoseq_t *ecoseq_iterator(const char *prefix); @@ -227,4 +263,7 @@ ecotx_t *eco_getfamily(ecotx_t *taxon,ecotaxonomy_t *taxonomy); ecotx_t *eco_getkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy); ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy); +int eco_is_taxid_ignored(int32_t *ignored_taxid, int32_t tab_len, int32_t taxid); +int eco_is_taxid_included(ecotaxonomy_t *taxonomy, int32_t *included_taxid, int32_t tab_len, int32_t taxid); + #endif /*ECOPCR_H_*/ diff --git a/src/libecoPCR/ecofilter.c b/src/libecoPCR/ecofilter.c new file mode 100644 index 0000000..8a7dbb1 --- /dev/null +++ b/src/libecoPCR/ecofilter.c @@ -0,0 +1,19 @@ +#include "ecoPCR.h" + +int eco_is_taxid_included( ecotaxonomy_t *taxonomy, + int32_t *restricted_taxid, + int32_t tab_len, + int32_t taxid) +{ + int i; + ecotx_t *taxon; + + taxon = eco_findtaxonbytaxid(taxonomy, taxid); + + for (i=0; i < tab_len; i++) + if ( (taxon->taxid == restricted_taxid[i]) || + (eco_isundertaxon(taxon, restricted_taxid[i])) ) + return 1; + + return 0; +} \ No newline at end of file diff --git a/src/libecoPCR/econame.c b/src/libecoPCR/econame.c new file mode 100644 index 0000000..835d79c --- /dev/null +++ b/src/libecoPCR/econame.c @@ -0,0 +1,61 @@ +#include "ecoPCR.h" +#include +#include + +static econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy); + +econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy) +{ + + int32_t count; + FILE *f; + econameidx_t *indexname; + int32_t i; + + f = open_ecorecorddb(filename,&count,1); + + indexname = (econameidx_t*) ECOMALLOC(sizeof(econameidx_t) + sizeof(econame_t) * (count-1),"Allocate names"); + + indexname->count=count; + + for (i=0; i < count; i++){ + readnext_econame(f,(indexname->names)+i,taxonomy); + } + + return indexname; +} + +econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy) +{ + + econameformat_t *raw; + int32_t rs; + + raw = read_ecorecord(f,&rs); + + if (!raw) + return NULL; + + if (is_big_endian()) + { + raw->is_scientificname = swap_int32_t(raw->is_scientificname); + raw->namelength = swap_int32_t(raw->namelength); + raw->classlength = swap_int32_t(raw->classlength); + raw->taxid = swap_int32_t(raw->taxid); + } + + name->is_scientificname=raw->is_scientificname; + + name->name = ECOMALLOC((raw->namelength+1) * sizeof(char),"Allocate name"); + strncpy(name->name,raw->names,raw->namelength); + name->name[raw->namelength]=0; + + name->classname = ECOMALLOC((raw->classlength+1) * sizeof(char),"Allocate classname"); + strncpy(name->classname,(raw->names+raw->namelength),raw->classlength); + name->classname[raw->classlength]=0; + + name->taxon = taxonomy->taxons->taxon + raw->taxid; + + return name; +} + diff --git a/src/libecoPCR/ecoseq.c b/src/libecoPCR/ecoseq.c index cc25ad5..2ec2614 100644 --- a/src/libecoPCR/ecoseq.c +++ b/src/libecoPCR/ecoseq.c @@ -79,7 +79,9 @@ ecoseq_t *new_ecoseq_with_data( char *AC, } - +/** + * ?? used ?? + **/ FILE *open_ecoseqdb(const char *filename, int32_t *sequencecount) { @@ -140,6 +142,13 @@ ecoseq_t *readnext_ecoseq(FILE *f) return seq; } +/** + * Open the sequences database (.sdx file) + * @param prefix name of the database (radical without extension) + * @param index integer + * + * @return file object + */ FILE *open_seqfile(const char *prefix,int32_t index) { char filename_buffer[1024]; @@ -153,7 +162,7 @@ FILE *open_seqfile(const char *prefix,int32_t index) prefix, index); - fprintf(stderr,"Coucou %s\n",filename_buffer); + fprintf(stderr,"# Coucou %s\n",filename_buffer); if (filename_length >= 1024) @@ -164,7 +173,7 @@ FILE *open_seqfile(const char *prefix,int32_t index) input=open_ecorecorddb(filename_buffer,&seqcount,0); if (input) - fprintf(stderr,"Reading file %s containing %d sequences...\n", + fprintf(stderr,"# Reading file %s containing %d sequences...\n", filename_buffer, seqcount); diff --git a/src/libecoPCR/ecotax.c b/src/libecoPCR/ecotax.c index 7eb825e..89ac82b 100644 --- a/src/libecoPCR/ecotax.c +++ b/src/libecoPCR/ecotax.c @@ -5,7 +5,11 @@ static ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon); - + /** + * Open the taxonomy database + * @param pointer to the database (.tdx file) + * @return a ecotxidx_t structure + */ ecotxidx_t *read_taxonomyidx(const char *filename) { int32_t count; @@ -18,14 +22,15 @@ ecotxidx_t *read_taxonomyidx(const char *filename) index = (ecotxidx_t*) ECOMALLOC(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count-1), "Allocate taxonomy"); - index->count=count; - - for (i=0; i < count; i++) + index->count=count; + for (i=0; i < count; i++){ readnext_ecotaxon(f,&(index->taxon[i])); - + index->taxon[i].parent=index->taxon + (int32_t)index->taxon[i].parent; + } return index; } + int32_t delete_taxonomy(ecotxidx_t *index) { int32_t i; @@ -61,6 +66,15 @@ int32_t delete_taxon(ecotx_t *taxon) return 1; } + +/** + * Read the database for a given taxon a save the data + * into the taxon structure(if any found) + * @param *f pointer to FILE type returned by fopen + * @param *taxon pointer to the structure + * + * @return a ecotx_t structure if any taxon found else NULL + */ ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon) { @@ -80,7 +94,7 @@ ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon) raw->taxid = swap_int32_t(raw->taxid); } - taxon->parent = raw->parent; + taxon->parent = (ecotx_t*)raw->parent; taxon->taxid = raw->taxid; taxon->rank = raw->rank; @@ -88,12 +102,12 @@ ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon) "Allocate taxon scientific name"); strncpy(taxon->name,raw->name,raw->namelength); - + return taxon; } -ecotaxonomy_t *read_taxonomy(const char *prefix) +ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName) { ecotaxonomy_t *tax; char *filename; @@ -115,10 +129,19 @@ ecotaxonomy_t *read_taxonomy(const char *prefix) tax->taxons = read_taxonomyidx(filename); + if (readAlternativeName) + { + snprintf(filename,buffsize,"%s.ndx",prefix); + tax->names=read_nameidx(filename,tax); + } + else + tax->names=NULL; return tax; } + + int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy) { if (taxonomy) @@ -138,20 +161,19 @@ int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy) } ecotx_t *eco_findtaxonatrank(ecotx_t *taxon, - int32_t rankidx, - ecotaxonomy_t *taxonomy) + int32_t rankidx) { ecotx_t *current_taxon; ecotx_t *next_taxon; current_taxon = taxon; - next_taxon = &(taxonomy->taxons->taxon[current_taxon->parent]); + next_taxon = current_taxon->parent; - while ((current_taxon!=next_taxon) && + while ((current_taxon!=next_taxon) && // I' am the root node (current_taxon->rank!=rankidx)) { current_taxon = next_taxon; - next_taxon = &(taxonomy->taxons->taxon[current_taxon->parent]); + next_taxon = current_taxon->parent; } if (current_taxon->rank==rankidx) @@ -160,6 +182,61 @@ ecotx_t *eco_findtaxonatrank(ecotx_t *taxon, return NULL; } +/** + * Get back information concerning a taxon from a taxonomic id + * @param *taxonomy the taxonomy database + * @param taxid the taxonomic id + * + * @result a ecotx_t structure containing the taxonimic information + **/ +ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy, + int32_t taxid) +{ + ecotx_t *current_taxon; + int32_t taxoncount; + int32_t i; + + taxoncount=taxonomy->taxons->count; + + for (current_taxon=taxonomy->taxons->taxon, + i=0; + i < taxoncount; + i++, + current_taxon++){ + if (current_taxon->taxid==taxid){ + return current_taxon; + } + } + + return (ecotx_t*)NULL; +} + +/** + * Find out if taxon is son of other taxon (identified by its taxid) + * @param *taxon son taxon + * @param parent_taxid taxonomic id of the other taxon + * + * @return 1 is the other taxid math a parent taxid, else 0 + **/ +int eco_isundertaxon(ecotx_t *taxon, + int other_taxid) +{ + ecotx_t *next_parent; + + next_parent = taxon->parent; + + while ( (other_taxid != next_parent->taxid) && + (strcmp(next_parent->name, "root")) ) + { + next_parent = next_parent->parent; + } + + if (other_taxid == next_parent->taxid) + return 1; + else + return 0; +} + ecotx_t *eco_getspecies(ecotx_t *taxon, ecotaxonomy_t *taxonomy) { @@ -175,7 +252,7 @@ ecotx_t *eco_getspecies(ecotx_t *taxon, if (!tax || rankindex < 0) ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); - return eco_findtaxonatrank(taxon,rankindex,tax); + return eco_findtaxonatrank(taxon,rankindex); } ecotx_t *eco_getgenus(ecotx_t *taxon, @@ -193,7 +270,7 @@ ecotx_t *eco_getgenus(ecotx_t *taxon, if (!tax || rankindex < 0) ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); - return eco_findtaxonatrank(taxon,rankindex,tax); + return eco_findtaxonatrank(taxon,rankindex); } @@ -212,7 +289,7 @@ ecotx_t *eco_getfamily(ecotx_t *taxon, if (!tax || rankindex < 0) ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); - return eco_findtaxonatrank(taxon,rankindex,tax); + return eco_findtaxonatrank(taxon,rankindex); } ecotx_t *eco_getkingdom(ecotx_t *taxon, @@ -230,7 +307,7 @@ ecotx_t *eco_getkingdom(ecotx_t *taxon, if (!tax || rankindex < 0) ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); - return eco_findtaxonatrank(taxon,rankindex,tax); + return eco_findtaxonatrank(taxon,rankindex); } ecotx_t *eco_getsuperkingdom(ecotx_t *taxon, @@ -248,5 +325,5 @@ ecotx_t *eco_getsuperkingdom(ecotx_t *taxon, if (!tax || rankindex < 0) ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); - return eco_findtaxonatrank(taxon,rankindex,tax); + return eco_findtaxonatrank(taxon,rankindex); } \ No newline at end of file