diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 0000000..9b213b0 --- /dev/null +++ b/src/Makefile @@ -0,0 +1,58 @@ +EXEC=ecoPrimer + +PRIMER_SRC= ecoprimer.c +PRIMER_OBJ= $(patsubst %.c,%.o,$(PRIMER_SRC)) + + +SRCS= $(PRIMER_SRC) + +LIB= -lecoPCR -lecoprimer -lz -lm + +LIBFILE= libecoPCR/libecoPCR.a \ + libecoprimer/libecoprimer.a + + +include global.mk + +all: $(EXEC) + + +######## +# +# ecoPrimer compilation +# +######## + +# executable compilation and link + +ecoPrimer: $(PRIMER_OBJ) $(LIBFILE) + $(CC) $(LDFLAGS) -O5 -m64 -fast -o $@ $< $(LIBPATH) $(LIB) + + +######## +# +# library compilation +# +######## + +libecoPCR/libecoPCR.a: + $(MAKE) -C libecoPCR + +libecoprimer/libecoprimer.a: + $(MAKE) -C libecoprimer + + +######## +# +# project management +# +######## + +clean: + rm -f *.o + rm -f $(EXEC) + $(MAKE) -C libecoPCR clean + $(MAKE) -C libecoprimer clean + + + \ No newline at end of file diff --git a/src/ecoPrimer b/src/ecoPrimer new file mode 100755 index 0000000..13716c1 Binary files /dev/null and b/src/ecoPrimer differ diff --git a/src/ecoprimer.c b/src/ecoprimer.c new file mode 100644 index 0000000..b54ef97 --- /dev/null +++ b/src/ecoprimer.c @@ -0,0 +1,472 @@ +/* + * ecoprimer.c + * + * Created on: 7 nov. 2008 + * Author: coissac + */ + +#include "libecoprimer/ecoprimer.h" +#include +#include +#include +#include +#include +#include +#include + +#define VERSION "0.1" + /* TR: by default, statistics are made on species level*/ +#define DEFULTTAXONRANK "species" + +/* ----------------------------------------------- */ +/* printout help */ +/* ----------------------------------------------- */ +#define PP fprintf(stdout, + +static void PrintHelp() +{ + PP "------------------------------------------\n"); + PP " ecoPrimer Version %s\n", VERSION); + PP "------------------------------------------\n"); +} + +static void ExitUsage(int stat) +{ + PP "usage: ecoprimer [-d database] [-l value] [-L value] [-e value] [-r taxid] [-i taxid] [-R rank] [-t taxon level]\n"); + PP "type \"ecoprimer -h\" for help\n"); + + if (stat) + exit(stat); +} + +#undef PP + +void initoptions(poptions_t options) +{ + options->lmin=0; //< Amplifia minimal length + options->lmax=0; //< Amplifia maximal length + options->error_max=3; //**< maximum error count in fuzzy search + options->primer_length=18; //**< minimal length of the primers + options->restricted_taxid=NULL; //**< limit amplification below these taxid + options->ignored_taxid=NULL; //**< no amplification below these taxid + options->prefix=NULL; + options->circular=0; + options->doublestrand=1; + options->strict_quorum=0.7; + options->strict_exclude_quorum=0.1; + options->sensitivity_quorum=0.9; + options->false_positive_quorum=0.1; + options->strict_three_prime=2; + options->r=0; + options->g=0; + options->no_multi_match=FALSE; + strcpy(options->taxonrank, DEFULTTAXONRANK); /*taxon level for results, species by default*/ +} + +void printcurrenttime () +{ + time_t now; + struct tm *ts; + char buf[80]; + + /* Get the current time */ + now = time(NULL); + + /* Format and print the time, "ddd yyyy-mm-dd hh:mm:ss zzz" */ + ts = localtime(&now); + strftime(buf, sizeof(buf), "%a %Y-%m-%d %H:%M:%S %Z", ts); + fprintf(stderr,"#%d#, %s\n",now, buf); + } + +void printcurrenttimeinmilli() +{ + struct timeval tv; + struct timezone tz; + struct tm *tm; + gettimeofday(&tv, &tz); + tm=localtime(&tv.tv_sec); + fprintf(stderr, " %d:%02d:%02d %d %d \n", tm->tm_hour, tm->tm_min, + tm->tm_sec, tv.tv_usec, tv.tv_usec/1000); +} + +/*TR: Added*/ +void printpairs (pairscount_t pairs, poptions_t options, int32_t rankdbstats, uint32_t seqdbsize) +{ + uint32_t i; + uint32_t wordsize = options->primer_length; + uint32_t quorumseqs; + double sens; + double speci; + float avg; + + quorumseqs = seqdbsize * 70 / 100; + + printf("primer_1\tseq_1\tPrimer_2\tseq_2\tamplifia_count\t%s_snes\t%s_spe\tmin_l\tmax_l\tavr_l\n", options->taxonrank, options->taxonrank); + + for (i=0; i < pairs.paircount; i++) + { + if (quorumseqs > pairs.pairs[i].inexample) continue; + sens = (pairs.pairs[i].taxsetindex*1.0)/rankdbstats*100; + speci = (pairs.pairs[i].oktaxoncount*1.0)/pairs.pairs[i].taxsetindex*100; + avg = (pairs.pairs[i].mind+pairs.pairs[i].maxd)*1.0/2; + + printf("P1\t%s", ecoUnhashWord(pairs.pairs[i].w1, wordsize)); + printf("\tP2\t%s", ecoUnhashWord(pairs.pairs[i].w2, wordsize)); + printf("\t%d", pairs.pairs[i].inexample); + printf("\t%3.2f", sens); + printf("\t%3.2f", speci); + printf("\t%d", pairs.pairs[i].mind); + printf("\t%d", pairs.pairs[i].maxd); + printf("\t%3.2f\n", avg); + } +} + +/*updateseqparams: This function counts the insample and outsample sequences + * and with each sequences adds a tag of the taxon to which the sequence beongs*/ +void updateseqparams (pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxonomy, + poptions_t options, int32_t *insamples, int32_t *outsamples) +{ + uint32_t i; + int32_t taxid; + ecotx_t *tmptaxon; + + for (i=0;iisexample=isGoodTaxon(taxonomy,seqdb[i]->taxid,options); + if (seqdb[i]->isexample) + (*insamples)++; + else + (*outsamples)++; + + taxid = taxonomy->taxons->taxon[seqdb[i]->taxid].taxid; + tmptaxon = eco_findtaxonbytaxid(taxonomy, taxid); + if (tmptaxon) + tmptaxon = eco_findtaxonatrank(tmptaxon, options->taxonrankidx); + if (tmptaxon) + seqdb[i]->ranktaxonid = tmptaxon->taxid; + } +} + +void setresulttaxonrank (ecotaxonomy_t *taxonomy, poptions_t options) +{ + int32_t i; + + /*set taxon rank for which result is to be given*/ + for (i = 0; i < taxonomy->ranks->count; i++) + { + if (strcmp(taxonomy->ranks->label[i], options->taxonrank) == 0) + { + options->taxonrankidx = i; + break; + } + } + + if (i == taxonomy->ranks->count) + { + fprintf(stderr,"\nUnknown taxon level: '%s'\n", options->taxonrank); + exit(0); + } +} +/* to get db stats, totals of species, genus etc....*/ +int32_t getrankdbstats(pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxonomy, + poptions_t options) +{ + uint32_t i; + uint32_t j; + uint32_t nameslots = 500; + uint32_t namesindex = 0; + int32_t *ranktaxonids = ECOMALLOC(nameslots * sizeof(int32_t), "Error in taxon rank allocation"); + int32_t taxid; + + ecotx_t *tmptaxon; + + for (i=0;itaxons->taxon[seqdb[i]->taxid].taxid; + tmptaxon = eco_findtaxonbytaxid(taxonomy, taxid); + if (tmptaxon) + tmptaxon = eco_findtaxonatrank(tmptaxon, options->taxonrankidx); + if (tmptaxon) + { + for (j = 0; j < namesindex; j++) + { + if (tmptaxon->taxid == ranktaxonids[j]) break; + } + if (j < namesindex) continue; /* name is already in list, so no need to add it*/ + + if (namesindex == nameslots) + { + nameslots += 500; + ranktaxonids = ECOREALLOC(ranktaxonids, nameslots * sizeof(int32_t), "Cannot allocate pair rank taxon table"); + } + ranktaxonids[namesindex] = tmptaxon->taxid; + namesindex++; + } + } + ECOFREE(ranktaxonids, "free rank taxon table"); + + return namesindex; +} + +void setoktaxforspecificity (ppairscount_t pairs) +{ + uint32_t i; + uint32_t j; + uint32_t k; + uint32_t l; + int taxcount; + int32_t taxid; + + for (i = 0; i < pairs->paircount; i++) + { + for (j = 0; j < pairs->pairs[i].taxsetindex; j++) + { + for (k = 0; k < pairs->pairs[i].taxset[j].amplifiaindex; k++) + { + taxid = 0; + taxcount = 0; + for (l = 0; l < pairs->pairs[i].ampsetindex; l++) + { + /*compare only char pointers because for equal strings we have same pointer in both sets*/ + if (pairs->pairs[i].taxset[j].amplifia[k] == pairs->pairs[i].ampset[l].amplifia) + { + if (pairs->pairs[i].ampset[l].seqidindex > 1) + { + taxcount += pairs->pairs[i].ampset[l].seqidindex; + break; + } + + if (taxid != pairs->pairs[i].ampset[l].taxonids[0]) + { + if (!taxid) taxid = pairs->pairs[i].ampset[l].taxonids[0]; + taxcount++; + } + + if (taxcount > 1) break; + } + } + if (taxcount == 1) pairs->pairs[i].oktaxoncount++; + } + } + } +} + +int main(int argc, char **argv) +{ + pecodnadb_t seqdb; /* of type ecoseq_t */ + uint32_t seqdbsize=0; + ecotaxonomy_t *taxonomy; + + options_t options; + int carg; + int32_t errflag=0; + + int32_t insamples=0; + int32_t outsamples=0; + uint32_t i; + + pwordcount_t words; + pprimercount_t primers; + pairscount_t pairs; + + int32_t rankdbstats = 0; + + //printcurrenttime(); + //return 0; + + initoptions(&options); + + while ((carg = getopt(argc, argv, "hcUDSd:l:L:e:i:r:q:3:s:x:t:")) != -1) { + + switch (carg) { + /* -------------------- */ + case 'd': /* database name */ + /* -------------------- */ + options.prefix = ECOMALLOC(strlen(optarg)+1, + "Error on prefix allocation"); + strcpy(options.prefix,optarg); + break; + + /* -------------------- */ + case 'h': /* help */ + /* -------------------- */ + PrintHelp(); + exit(0); + break; + + /* ------------------------- */ + case 'l': /* min amplification lenght */ + /* ------------------------- */ + sscanf(optarg,"%d",&(options.lmin)); + break; + + /* -------------------------- */ + case 'L': /* max amplification lenght */ + /* -------------------------- */ + sscanf(optarg,"%d",&(options.lmax)); + break; + + /* -------------------- */ + case 'e': /* error max */ + /* -------------------- */ + sscanf(optarg,"%d",&(options.error_max)); + break; + + + /* ------------------------ */ + case '3': /* three prime strict match */ + /* ------------------------ */ + sscanf(optarg,"%d",&(options.strict_three_prime)); + break; + + /* -------------------- */ + case 'q': /* strict matching quorum */ + /* -------------------- */ + sscanf(optarg,"%f",&(options.strict_quorum)); + break; + + /* -------------------- */ + case 's': /* strict matching quorum */ + /* -------------------- */ + sscanf(optarg,"%f",&(options.sensitivity_quorum)); + break; + + /* -------------------- */ + case 't': /* required taxon level for results */ + /* -------------------- */ + strncpy(options.taxonrank, optarg, 19); + options.taxonrank[19] = 0; + break; + + /* -------------------- */ + case 'x': /* strict matching quorum */ + /* -------------------- */ + sscanf(optarg,"%f",&(options.false_positive_quorum)); + break; + + /* ---------------------------- */ + case 'D': /* set in double strand mode */ + /* ---------------------------- */ + options.doublestrand=1; + break; + + /* ---------------------------- */ + case 'S': /* set in single strand mode */ + /* ---------------------------- */ + options.doublestrand=0; + break; + + /* ---------------------------- */ + case 'U': /* set in single strand mode */ + /* ---------------------------- */ + options.no_multi_match=TRUE; + break; + + /* ------------------------------------------ */ + case 'r': /* stores the restricting search taxonomic id */ + /* ------------------------------------------ */ + options.restricted_taxid = ECOREALLOC(options.restricted_taxid,sizeof(int32_t)*(options.r+1), + "Error on restricted_taxid reallocation"); + sscanf(optarg,"%d",&(options.restricted_taxid[options.r])); + options.r++; + break; + + /* --------------------------------- */ + case 'i': /* stores the taxonomic id to ignore */ + /* --------------------------------- */ + options.ignored_taxid = ECOREALLOC(options.ignored_taxid,sizeof(int32_t)*(options.g+1), + "Error on excluded_taxid reallocation"); + sscanf(optarg,"%d",&(options.ignored_taxid[options.g])); + options.g++; + break; + + /* -------------------- */ + case 'c': /* sequences are circular */ + /* --------------------------------- */ + options.circular = 1; + break; + + case '?': /* bad option */ + /* -------------------- */ + errflag++; + } + + } + + fprintf(stderr,"Reading taxonomy database ..."); + taxonomy = read_taxonomy(options.prefix,0); + fprintf(stderr,"Ok\n"); + + setresulttaxonrank(taxonomy, &options); /*TR: set rank level for statistics*/ + + fprintf(stderr,"Reading sequence database ...\n"); + + seqdb = readdnadb(options.prefix,&seqdbsize); + + fprintf(stderr,"Ok\n"); + fprintf(stderr,"Sequence read : %d\n",(int32_t)seqdbsize); + + updateseqparams(seqdb, seqdbsize, taxonomy, &options, &insamples , &outsamples); + + rankdbstats = getrankdbstats(seqdb, seqdbsize, taxonomy, &options); + + fprintf(stderr,"Database is constituted of %5d examples\n",insamples); + fprintf(stderr," and %5d counterexamples\n",outsamples); + fprintf(stderr,"Total distinct %s count %d\n",options.taxonrank, rankdbstats); + + fprintf(stderr,"\nIndexing words in sequences\n"); + + printcurrenttimeinmilli(); + words = lookforStrictPrimer(seqdb,seqdbsize,insamples,&options); + printcurrenttimeinmilli(); + + fprintf(stderr,"\n Strict primer count : %d\n",words->size); + + if (options.no_multi_match) + { + (void)filterMultiStrictPrimer(words); + fprintf(stderr,"\n Strict primer with single match count : %d\n",words->size); + } + + + fprintf(stderr,"\n\n Primer sample : \n"); + for (i=0; isize); i++) + fprintf(stderr," + Primer : %s sequence count : %d\n",ecoUnhashWord(words->words[i],options.primer_length),words->strictcount[i]); + + + fprintf(stderr,"\nEncoding sequences for fuzzy pattern matching...\n"); + for (i=0;istrictcount,"Free strict primer count table"); + + primers = lookforAproxPrimer(seqdb,seqdbsize,insamples,words,&options); + + ECOFREE(words->words,"Free strict primer table"); + ECOFREE(words,"Free strict primer structure"); + fprintf(stderr,"\n\n Approximate repeats :%d \n", primers->size); + + fprintf(stderr,"\n\n Primer sample : \n"); + for (i=0; isize); i++) + fprintf(stderr," + Primer : %s example sequence count : %5d counterexample sequence count : %5d status : %s\n",ecoUnhashWord(primers->primers[i].word,options.primer_length), + primers->primers[i].inexample, + primers->primers[i].outexample, + primers->primers[i].good ? "good":"bad"); + + fprintf(stderr,"\n"); + + /*TR: Added*/ + pairs = buildPrimerPairs(seqdb, seqdbsize, primers, &options); + setoktaxforspecificity (&pairs); + + printpairs (pairs, &options, rankdbstats, seqdbsize); + + + + ECOFREE(pairs.pairs,"Free pairs table"); + + return 0; +} diff --git a/src/global.mk b/src/global.mk new file mode 100644 index 0000000..94cf61d --- /dev/null +++ b/src/global.mk @@ -0,0 +1,20 @@ +MACHINE=MAC_OS_X +LIBPATH= -Llibapat -LlibecoPCR -Llibecoprimer +MAKEDEPEND = gcc -D$(MACHINE) -M $(CPPFLAGS) -o $*.d $< + +CC=gcc +CFLAGS= -W -Wall -O5 -m64 -fast -g +#CFLAGS= -W -Wall -O0 -m64 -g +#CFLAGS= -W -Wall -O5 -fast -g + +default: all + +%.o: %.c + $(CC) -D$(MACHINE) $(CFLAGS) -c -o $@ $< + +%.P : %.c + $(MAKEDEPEND) + @sed 's/\($*\)\.o[ :]*/\1.o $@ : /g' < $*.d > $@; \ + rm -f $*.d; [ -s $@ ] || rm -f $@ + +include $(SRCS:.c=.P) \ No newline at end of file diff --git a/src/libecoPCR/Makefile b/src/libecoPCR/Makefile new file mode 100644 index 0000000..e2c1e3e --- /dev/null +++ b/src/libecoPCR/Makefile @@ -0,0 +1,30 @@ + +SOURCES = ecodna.c \ + ecoError.c \ + ecoIOUtils.c \ + ecoMalloc.c \ + ecorank.c \ + ecoseq.c \ + ecotax.c \ + ecofilter.c \ + econame.c + +SRCS=$(SOURCES) + +OBJECTS= $(patsubst %.c,%.o,$(SOURCES)) + +LIBFILE= libecoPCR.a +RANLIB= ranlib + + +include ../global.mk + + +all: $(LIBFILE) + +clean: + rm -rf $(OBJECTS) $(LIBFILE) + +$(LIBFILE): $(OBJECTS) + ar -cr $@ $? + $(RANLIB) $@ diff --git a/src/libecoPCR/ecoError.P b/src/libecoPCR/ecoError.P new file mode 100644 index 0000000..7c7ae71 --- /dev/null +++ b/src/libecoPCR/ecoError.P @@ -0,0 +1,15 @@ +ecoError.o ecoError.P : ecoError.c ecoPCR.h /usr/include/stdio.h \ + /usr/include/_types.h /usr/include/sys/_types.h \ + /usr/include/sys/cdefs.h /usr/include/machine/_types.h \ + /usr/include/i386/_types.h /usr/include/inttypes.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \ + /usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \ + /usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \ + /usr/include/machine/signal.h /usr/include/i386/signal.h \ + /usr/include/i386/_structs.h /usr/include/sys/_structs.h \ + /usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \ + /usr/include/sys/resource.h /usr/include/machine/endian.h \ + /usr/include/i386/endian.h /usr/include/sys/_endian.h \ + /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \ + /usr/include/machine/types.h /usr/include/i386/types.h diff --git a/src/libecoPCR/ecoError.c b/src/libecoPCR/ecoError.c new file mode 100644 index 0000000..00bbfa2 --- /dev/null +++ b/src/libecoPCR/ecoError.c @@ -0,0 +1,26 @@ +#include "ecoPCR.h" +#include +#include + +/* + * print the message given as argument and exit the program + * @param error error number + * @param message the text explaining what's going on + * @param filename the file source where the program failed + * @param linenumber the line where it has failed + * filename and linenumber are written at pre-processing + * time by a macro + */ +void ecoError(int32_t error, + const char* message, + const char * filename, + int linenumber) +{ + fprintf(stderr,"Error %d in file %s line %d : %s\n", + error, + filename, + linenumber, + message); + + abort(); +} diff --git a/src/libecoPCR/ecoIOUtils.P b/src/libecoPCR/ecoIOUtils.P new file mode 100644 index 0000000..fc34a70 --- /dev/null +++ b/src/libecoPCR/ecoIOUtils.P @@ -0,0 +1,15 @@ +ecoIOUtils.o ecoIOUtils.P : ecoIOUtils.c ecoPCR.h /usr/include/stdio.h \ + /usr/include/_types.h /usr/include/sys/_types.h \ + /usr/include/sys/cdefs.h /usr/include/machine/_types.h \ + /usr/include/i386/_types.h /usr/include/inttypes.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \ + /usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \ + /usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \ + /usr/include/machine/signal.h /usr/include/i386/signal.h \ + /usr/include/i386/_structs.h /usr/include/sys/_structs.h \ + /usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \ + /usr/include/sys/resource.h /usr/include/machine/endian.h \ + /usr/include/i386/endian.h /usr/include/sys/_endian.h \ + /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \ + /usr/include/machine/types.h /usr/include/i386/types.h diff --git a/src/libecoPCR/ecoIOUtils.c b/src/libecoPCR/ecoIOUtils.c new file mode 100644 index 0000000..73fb812 --- /dev/null +++ b/src/libecoPCR/ecoIOUtils.c @@ -0,0 +1,122 @@ +#include "ecoPCR.h" +#include +#include + +#define SWAPINT32(x) ((((x) << 24) & 0xFF000000) | (((x) << 8) & 0xFF0000) | \ + (((x) >> 8) & 0xFF00) | (((x) >> 24) & 0xFF)) + + +int32_t is_big_endian() +{ + int32_t i=1; + + return (int32_t)((char*)&i)[0]; +} + + + + +int32_t swap_int32_t(int32_t i) +{ + return SWAPINT32(i); +} + + +/** + * Read part of the file + * @param *f the database + * @param recordSize the size to be read + * + * @return buffer + */ +void *read_ecorecord(FILE *f,int32_t *recordSize) +{ + static void *buffer =NULL; + int32_t buffersize=0; + int32_t read; + + if (!recordSize) + ECOERROR(ECO_ASSERT_ERROR, + "recordSize cannot be NULL"); + + read = fread(recordSize, + 1, + sizeof(int32_t), + f); + + if (feof(f)) + return NULL; + + if (read != sizeof(int32_t)) + ECOERROR(ECO_IO_ERROR,"Reading record size error"); + + if (is_big_endian()) + *recordSize=swap_int32_t(*recordSize); + + if (buffersize < *recordSize) + { + if (buffer) + buffer = ECOREALLOC(buffer,*recordSize, + "Increase size of record buffer"); + else + buffer = ECOMALLOC(*recordSize, + "Allocate record buffer"); + } + + read = fread(buffer, + 1, + *recordSize, + f); + + if (read != *recordSize) + ECOERROR(ECO_IO_ERROR,"Reading record data error"); + + return buffer; +}; + + + + + +/** + * Open the database and check it's readable + * @param filename name of the database (.sdx, .rdx, .tbx) + * @param sequencecount buffer - pointer to variable storing the number of occurence + * @param abort_on_open_error boolean to define the behaviour in case of error + * while opening the database + * @return FILE type + **/ +FILE *open_ecorecorddb(const char *filename, + int32_t *sequencecount, + int32_t abort_on_open_error) +{ + FILE *f; + int32_t read; + + f = fopen(filename,"rb"); + + if (!f) + { + if (abort_on_open_error) + ECOERROR(ECO_IO_ERROR,"Cannot open file"); + else + { + *sequencecount=0; + return NULL; + } + } + + read = fread(sequencecount, + 1, + sizeof(int32_t), + f); + + if (read != sizeof(int32_t)) + ECOERROR(ECO_IO_ERROR,"Reading record size error"); + + if (is_big_endian()) + *sequencecount=swap_int32_t(*sequencecount); + + return f; +} + diff --git a/src/libecoPCR/ecoMalloc.P b/src/libecoPCR/ecoMalloc.P new file mode 100644 index 0000000..ea3767b --- /dev/null +++ b/src/libecoPCR/ecoMalloc.P @@ -0,0 +1,15 @@ +ecoMalloc.o ecoMalloc.P : ecoMalloc.c ecoPCR.h /usr/include/stdio.h \ + /usr/include/_types.h /usr/include/sys/_types.h \ + /usr/include/sys/cdefs.h /usr/include/machine/_types.h \ + /usr/include/i386/_types.h /usr/include/inttypes.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \ + /usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \ + /usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \ + /usr/include/machine/signal.h /usr/include/i386/signal.h \ + /usr/include/i386/_structs.h /usr/include/sys/_structs.h \ + /usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \ + /usr/include/sys/resource.h /usr/include/machine/endian.h \ + /usr/include/i386/endian.h /usr/include/sys/_endian.h \ + /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \ + /usr/include/machine/types.h /usr/include/i386/types.h diff --git a/src/libecoPCR/ecoMalloc.c b/src/libecoPCR/ecoMalloc.c new file mode 100644 index 0000000..f1a0d0f --- /dev/null +++ b/src/libecoPCR/ecoMalloc.c @@ -0,0 +1,94 @@ +#include "ecoPCR.h" +#include + +static int eco_log_malloc = 0; +static size_t eco_amount_malloc=0; +static size_t eco_chunk_malloc=0; + +void eco_trace_memory_allocation() +{ + eco_log_malloc=1; +} + +void eco_untrace_memory_allocation() +{ + eco_log_malloc=0; +} + +void ecoMallocedMemory() +{ + return eco_amount_malloc; +} + +void *eco_malloc(int32_t chunksize, + const char *error_message, + const char *filename, + int32_t line) +{ + void * chunk; + + chunk = calloc(1,chunksize); + + + if (!chunk) + ecoError(ECO_MEM_ERROR,error_message,filename,line); + + eco_chunk_malloc++; + + if (eco_log_malloc) + fprintf(stderr, + "Memory segment located at %p of size %d is allocated (file : %s [%d])", + chunk, + chunksize, + filename, + line); + + return chunk; +} + +void *eco_realloc(void *chunk, + int32_t newsize, + const char *error_message, + const char *filename, + int32_t line) +{ + void *newchunk; + + newchunk = realloc(chunk,newsize); + + + if (!newchunk) + ecoError(ECO_MEM_ERROR,error_message,filename,line); + + if (!chunk) + eco_chunk_malloc++; + + if (eco_log_malloc) + fprintf(stderr, + "Old memory segment %p is reallocated at %p with a size of %d (file : %s [%d])", + chunk, + newchunk, + newsize, + filename, + line); + + return newchunk; +} + +void eco_free(void *chunk, + const char *error_message, + const char *filename, + int32_t line) +{ + free(chunk); + + if (eco_log_malloc) + fprintf(stderr, + "Memory segment %p is released => %s (file : %s [%d])", + chunk, + error_message, + filename, + line); + + eco_chunk_malloc--; +} diff --git a/src/libecoPCR/ecoPCR.h b/src/libecoPCR/ecoPCR.h new file mode 100644 index 0000000..bd13942 --- /dev/null +++ b/src/libecoPCR/ecoPCR.h @@ -0,0 +1,270 @@ +#ifndef ECOPCR_H_ +#define ECOPCR_H_ + +#include +#include + +/***************************************************** + * + * Data type declarations + * + *****************************************************/ + +/* + * + * Sequence types + * + */ + +typedef struct { + + int32_t taxid; + char AC[20]; + int32_t DE_length; + int32_t SQ_length; + int32_t CSQ_length; /*what is this CSQ_length ? */ + + char data[1]; + +} ecoseqformat_t; + +typedef struct { + int32_t taxid; + int32_t SQ_length; + int32_t isexample; + char *AC; + char *DE; + char *SQ; + + int32_t ranktaxonid;/*TR: taxon id to which the sequence belongs*/ +} ecoseq_t, *pecoseq_t; + +/* + * + * Taxonomy taxon types + * + */ + + +typedef struct { + int32_t taxid; + int32_t rank; + int32_t parent; + int32_t namelength; + char name[1]; + +} ecotxformat_t; + +typedef struct ecotxnode { + int32_t taxid; + int32_t rank; + struct ecotxnode *parent; + char *name; +} ecotx_t; + +typedef struct { + int32_t count; + ecotx_t taxon[1]; +} ecotxidx_t; + + +/* + * + * Taxonomy rank types + * + */ + +typedef struct { + int32_t count; + char* label[1]; +} ecorankidx_t; + +/* + * + * Taxonomy name types + * + */ + +typedef struct { + int32_t is_scientificname; + int32_t namelength; + int32_t classlength; + int32_t taxid; + char names[1]; +} econameformat_t; + + + typedef struct { + char *name; + char *classname; + int32_t is_scientificname; + struct ecotxnode *taxon; +} econame_t; + + +typedef struct { + int32_t count; + econame_t names[1]; +} econameidx_t; + + + typedef struct { + ecorankidx_t *ranks; + econameidx_t *names; + ecotxidx_t *taxons; +} ecotaxonomy_t; + + +/***************************************************** + * + * Function declarations + * + *****************************************************/ + +/* + * + * Low level system functions + * + */ + +int32_t is_big_endian(); +int32_t swap_int32_t(int32_t); + +void *eco_malloc(int32_t chunksize, + const char *error_message, + const char *filename, + int32_t line); + + +void *eco_realloc(void *chunk, + int32_t chunksize, + const char *error_message, + const char *filename, + int32_t line); + +void eco_free(void *chunk, + const char *error_message, + const char *filename, + int32_t line); + +void eco_trace_memory_allocation(); +void eco_untrace_memory_allocation(); + +#define ECOMALLOC(size,error_message) \ + eco_malloc((size),(error_message),__FILE__,__LINE__) + +#define ECOREALLOC(chunk,size,error_message) \ + eco_realloc((chunk),(size),(error_message),__FILE__,__LINE__) + +#define ECOFREE(chunk,error_message) \ + eco_free((chunk),(error_message),__FILE__,__LINE__) + + + + +/* + * + * Error managment + * + */ + + +void ecoError(int32_t,const char*,const char *,int); + +#define ECOERROR(code,message) ecoError((code),(message),__FILE__,__LINE__) + +#define ECO_IO_ERROR (1) +#define ECO_MEM_ERROR (2) +#define ECO_ASSERT_ERROR (3) +#define ECO_NOTFOUND_ERROR (4) + + +/* + * + * Low level Disk access functions + * + */ + +FILE *open_ecorecorddb(const char *filename, + int32_t *sequencecount, + int32_t abort_on_open_error); + +void *read_ecorecord(FILE *,int32_t *recordSize); + + + +/* + * Read function in internal binary format + */ + +FILE *open_ecoseqdb(const char *filename, + int32_t *sequencecount); + +ecoseq_t *readnext_ecoseq(FILE *); + +ecorankidx_t *read_rankidx(const char *filename); + +econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy); + + + + /** + * Read taxonomy data as formated by the ecoPCRFormat.py script. + * + * This function is normaly uses internaly by the read_taxonomy + * function and should not be called directly. + * + * @arg filename path to the *.tdx file of the reformated db + * + * @return pointer to a taxonomy index structure + */ + +ecotxidx_t *read_taxonomyidx(const char *filename); + +ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName); + +ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy, int32_t taxid); + +ecotx_t *eco_findtaxonatrank(ecotx_t *taxon, int32_t rankidx); + +int eco_isundertaxon(ecotx_t *taxon, int other_taxid); + +ecoseq_t *ecoseq_iterator(const char *prefix); + + + +ecoseq_t *new_ecoseq(); +int32_t delete_ecoseq(ecoseq_t *); +ecoseq_t *new_ecoseq_with_data( char *AC, + char *DE, + char *SQ, + int32_t taxid + ); + + +int32_t delete_taxon(ecotx_t *taxon); +int32_t delete_taxonomy(ecotxidx_t *index); + + +int32_t rank_index(const char* label,ecorankidx_t* ranks); + +//int32_t delete_apatseq(SeqPtr pseq); +//PatternPtr buildPattern(const char *pat, int32_t error_max); +//PatternPtr complementPattern(PatternPtr pat); +// +//SeqPtr ecoseq2apatseq(ecoseq_t *in,SeqPtr out,int32_t circular); + +char *ecoComplementPattern(char *nucAcSeq); +char *ecoComplementSequence(char *nucAcSeq); +char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end); + +ecotx_t *eco_getspecies(ecotx_t *taxon,ecotaxonomy_t *taxonomy); +ecotx_t *eco_getgenus(ecotx_t *taxon,ecotaxonomy_t *taxonomy); +ecotx_t *eco_getfamily(ecotx_t *taxon,ecotaxonomy_t *taxonomy); +ecotx_t *eco_getkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy); +ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy); + +int eco_is_taxid_ignored(int32_t *ignored_taxid, int32_t tab_len, int32_t taxid); +int eco_is_taxid_included(ecotaxonomy_t *taxonomy, int32_t *included_taxid, int32_t tab_len, int32_t taxid); + +#endif /*ECOPCR_H_*/ diff --git a/src/libecoPCR/ecoapat.c b/src/libecoPCR/ecoapat.c new file mode 100644 index 0000000..284e579 --- /dev/null +++ b/src/libecoPCR/ecoapat.c @@ -0,0 +1,202 @@ +#include "../libapat/libstki.h" +#include "../libapat/apat.h" + +#include "ecoPCR.h" + +#include + +static void EncodeSequence(SeqPtr seq); +static void UpperSequence(char *seq); + +/* -------------------------------------------- */ +/* uppercase sequence */ +/* -------------------------------------------- */ + +#define IS_LOWER(c) (((c) >= 'a') && ((c) <= 'z')) +#define TO_UPPER(c) ((c) - 'a' + 'A') + +void UpperSequence(char *seq) +{ + char *cseq; + + for (cseq = seq ; *cseq ; cseq++) + if (IS_LOWER(*cseq)) + *cseq = TO_UPPER(*cseq); +} + +#undef IS_LOWER +#undef TO_UPPER + + + + +/* -------------------------------------------- */ +/* encode sequence */ +/* IS_UPPER is slightly faster than isupper */ +/* -------------------------------------------- */ + +#define IS_UPPER(c) (((c) >= 'A') && ((c) <= 'Z')) + + + +void EncodeSequence(SeqPtr seq) +{ + int i; + UInt8 *data; + char *cseq; + + data = seq->data; + cseq = seq->cseq; + + while (*cseq) { + + *data = (IS_UPPER(*cseq) ? *cseq - 'A' : 0x0); + data++; + cseq++; + } + + for (i=0,cseq=seq->cseq;i < seq->circular; i++,cseq++,data++) + *data = (IS_UPPER(*cseq) ? *cseq - 'A' : 0x0); + + for (i = 0 ; i < MAX_PATTERN ; i++) + seq->hitpos[i]->top = seq->hiterr[i]->top = 0; + +} + +#undef IS_UPPER + + +SeqPtr ecoseq2apatseq(ecoseq_t *in,SeqPtr out,int32_t circular) +{ + int i; + + if (!out) + { + out = ECOMALLOC(sizeof(Seq), + "Error in Allocation of a new Seq structure"); + + for (i = 0 ; i < MAX_PATTERN ; i++) + { + + if (! (out->hitpos[i] = NewStacki(kMinStackiSize))) + ECOERROR(ECO_MEM_ERROR,"Error in hit stack Allocation"); + + if (! (out->hiterr[i] = NewStacki(kMinStackiSize))) + ECOERROR(ECO_MEM_ERROR,"Error in error stack Allocation"); + } + } + + + out->name = in->AC; + out->seqsiz = out->seqlen = in->SQ_length; + out->circular = circular; + + if (!out->data) + { + out->data = ECOMALLOC((out->seqlen+circular) *sizeof(UInt8), + "Error in Allocation of a new Seq data member"); + out->datsiz= out->seqlen+circular; + } + else if ((out->seqlen +circular) >= out->datsiz) + { + out->data = ECOREALLOC(out->data,(out->seqlen+circular), + "Error during Seq data buffer realloc"); + out->datsiz= out->seqlen+circular; + } + + out->cseq = in->SQ; + + EncodeSequence(out); + + return out; +} + +int32_t delete_apatseq(SeqPtr pseq) +{ + int i; + + if (pseq) { + + if (pseq->data) + ECOFREE(pseq->data,"Freeing sequence data buffer"); + + for (i = 0 ; i < MAX_PATTERN ; i++) { + if (pseq->hitpos[i]) FreeStacki(pseq->hitpos[i]); + if (pseq->hiterr[i]) FreeStacki(pseq->hiterr[i]); + } + + ECOFREE(pseq,"Freeing apat sequence structure"); + + return 0; + } + + return 1; +} + +/* + +PatternPtr buildPattern(const char *pat, int32_t error_max) +{ + PatternPtr pattern; + int32_t patlen; + + pattern = ECOMALLOC(sizeof(Pattern), + "Error in pattern allocation"); + + pattern->ok = Vrai; + pattern->hasIndel= Faux; + pattern->maxerr = error_max; + patlen = strlen(pat); + + pattern->cpat = ECOMALLOC(sizeof(char)*patlen+1, + "Error in sequence pattern allocation"); + + strncpy(pattern->cpat,pat,patlen); + pattern->cpat[patlen]=0; + UpperSequence(pattern->cpat); + + if (!CheckPattern(pattern)) + ECOERROR(ECO_ASSERT_ERROR,"Error in pattern checking"); + + if (! EncodePattern(pattern, dna)) + ECOERROR(ECO_ASSERT_ERROR,"Error in pattern encoding"); + + if (! CreateS(pattern, ALPHA_LEN)) + ECOERROR(ECO_ASSERT_ERROR,"Error in pattern compiling"); + + return pattern; + +} + +PatternPtr complementPattern(PatternPtr pat) +{ + PatternPtr pattern; + + pattern = ECOMALLOC(sizeof(Pattern), + "Error in pattern allocation"); + + pattern->ok = Vrai; + pattern->hasIndel= pat->hasIndel; + pattern->maxerr = pat->maxerr; + pattern->patlen = pat->patlen; + + pattern->cpat = ECOMALLOC(sizeof(char)*(strlen(pat->cpat)+1), + "Error in sequence pattern allocation"); + + strcpy(pattern->cpat,pat->cpat); + + ecoComplementPattern(pattern->cpat); + + if (!CheckPattern(pattern)) + ECOERROR(ECO_ASSERT_ERROR,"Error in pattern checking"); + + if (! EncodePattern(pattern, dna)) + ECOERROR(ECO_ASSERT_ERROR,"Error in pattern encoding"); + + if (! CreateS(pattern, ALPHA_LEN)) + ECOERROR(ECO_ASSERT_ERROR,"Error in pattern compiling"); + + return pattern; + +} +*/ diff --git a/src/libecoPCR/ecodna.P b/src/libecoPCR/ecodna.P new file mode 100644 index 0000000..b9a71b9 --- /dev/null +++ b/src/libecoPCR/ecodna.P @@ -0,0 +1,5 @@ +ecodna.o ecodna.P : ecodna.c /usr/include/string.h /usr/include/_types.h \ + /usr/include/sys/_types.h /usr/include/sys/cdefs.h \ + /usr/include/machine/_types.h /usr/include/i386/_types.h ecoPCR.h \ + /usr/include/stdio.h /usr/include/inttypes.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h diff --git a/src/libecoPCR/ecodna.c b/src/libecoPCR/ecodna.c new file mode 100644 index 0000000..7d29a0e --- /dev/null +++ b/src/libecoPCR/ecodna.c @@ -0,0 +1,153 @@ +#include +#include "ecoPCR.h" + +/* + * @doc: DNA alphabet (IUPAC) + */ +#define LX_BIO_DNA_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZ#![]" + +/* + * @doc: complementary DNA alphabet (IUPAC) + */ +#define LX_BIO_CDNA_ALPHA "TVGHEFCDIJMLKNOPQYSAABWXRZ#!][" + + +static char sNuc[] = LX_BIO_DNA_ALPHA; +static char sAnuc[] = LX_BIO_CDNA_ALPHA; + +static char LXBioBaseComplement(char nucAc); +static char *LXBioSeqComplement(char *nucAcSeq); +static char *reverseSequence(char *str,char isPattern); + + +/* ---------------------------- */ + +char LXBioBaseComplement(char nucAc) +{ + char *c; + + if ((c = strchr(sNuc, nucAc))) + return sAnuc[(c - sNuc)]; + else + return nucAc; +} + +/* ---------------------------- */ + +char *LXBioSeqComplement(char *nucAcSeq) +{ + char *s; + + for (s = nucAcSeq ; *s ; s++) + *s = LXBioBaseComplement(*s); + + return nucAcSeq; +} + + +char *reverseSequence(char *str,char isPattern) +{ + char *sb, *se, c; + + if (! str) + return str; + + sb = str; + se = str + strlen(str) - 1; + + while(sb <= se) { + c = *sb; + *sb++ = *se; + *se-- = c; + } + + sb = str; + se = str + strlen(str) - 1; + + if (isPattern) + for (;sb < se; sb++) + { + if (*sb=='#') + { + if (((se - sb) > 2) && (*(sb+2)=='!')) + { + *sb='!'; + sb+=2; + *sb='#'; + } + else + { + *sb=*(sb+1); + sb++; + *sb='#'; + } + } + else if (*sb=='!') + { + *sb=*(sb-1); + *(sb-1)='!'; + } + } + + return str; +} + +char *ecoComplementPattern(char *nucAcSeq) +{ + return reverseSequence(LXBioSeqComplement(nucAcSeq),1); +} + +char *ecoComplementSequence(char *nucAcSeq) +{ + return reverseSequence(LXBioSeqComplement(nucAcSeq),0); +} + + +char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end) +{ + static char *buffer = NULL; + static int32_t buffSize= 0; + int32_t length; + + if (begin < end) + { + length = end - begin; + + if (length >= buffSize) + { + buffSize = length+1; + if (buffer) + buffer=ECOREALLOC(buffer,buffSize, + "Error in reallocating sub sequence buffer"); + else + buffer=ECOMALLOC(buffSize, + "Error in allocating sub sequence buffer"); + + } + + strncpy(buffer,nucAcSeq + begin,length); + buffer[length]=0; + } + else + { + length = end + strlen(nucAcSeq) - begin; + + if (length >= buffSize) + { + buffSize = length+1; + if (buffer) + buffer=ECOREALLOC(buffer,buffSize, + "Error in reallocating sub sequence buffer"); + else + buffer=ECOMALLOC(buffSize, + "Error in allocating sub sequence buffer"); + + } + strncpy(buffer,nucAcSeq+begin,length - end); + strncpy(buffer+(length-end),nucAcSeq ,end); + buffer[length]=0; + } + + return buffer; +} + diff --git a/src/libecoPCR/ecofilter.P b/src/libecoPCR/ecofilter.P new file mode 100644 index 0000000..d46d3e0 --- /dev/null +++ b/src/libecoPCR/ecofilter.P @@ -0,0 +1,5 @@ +ecofilter.o ecofilter.P : ecofilter.c ecoPCR.h /usr/include/stdio.h \ + /usr/include/_types.h /usr/include/sys/_types.h \ + /usr/include/sys/cdefs.h /usr/include/machine/_types.h \ + /usr/include/i386/_types.h /usr/include/inttypes.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h diff --git a/src/libecoPCR/ecofilter.c b/src/libecoPCR/ecofilter.c new file mode 100644 index 0000000..8a7dbb1 --- /dev/null +++ b/src/libecoPCR/ecofilter.c @@ -0,0 +1,19 @@ +#include "ecoPCR.h" + +int eco_is_taxid_included( ecotaxonomy_t *taxonomy, + int32_t *restricted_taxid, + int32_t tab_len, + int32_t taxid) +{ + int i; + ecotx_t *taxon; + + taxon = eco_findtaxonbytaxid(taxonomy, taxid); + + for (i=0; i < tab_len; i++) + if ( (taxon->taxid == restricted_taxid[i]) || + (eco_isundertaxon(taxon, restricted_taxid[i])) ) + return 1; + + return 0; +} \ No newline at end of file diff --git a/src/libecoPCR/econame.P b/src/libecoPCR/econame.P new file mode 100644 index 0000000..4c9946c --- /dev/null +++ b/src/libecoPCR/econame.P @@ -0,0 +1,15 @@ +econame.o econame.P : econame.c ecoPCR.h /usr/include/stdio.h /usr/include/_types.h \ + /usr/include/sys/_types.h /usr/include/sys/cdefs.h \ + /usr/include/machine/_types.h /usr/include/i386/_types.h \ + /usr/include/inttypes.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \ + /usr/include/string.h /usr/include/stdlib.h /usr/include/available.h \ + /usr/include/sys/wait.h /usr/include/sys/signal.h \ + /usr/include/sys/appleapiopts.h /usr/include/machine/signal.h \ + /usr/include/i386/signal.h /usr/include/i386/_structs.h \ + /usr/include/sys/_structs.h /usr/include/machine/_structs.h \ + /usr/include/mach/i386/_structs.h /usr/include/sys/resource.h \ + /usr/include/machine/endian.h /usr/include/i386/endian.h \ + /usr/include/sys/_endian.h /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \ + /usr/include/machine/types.h /usr/include/i386/types.h diff --git a/src/libecoPCR/econame.c b/src/libecoPCR/econame.c new file mode 100644 index 0000000..835d79c --- /dev/null +++ b/src/libecoPCR/econame.c @@ -0,0 +1,61 @@ +#include "ecoPCR.h" +#include +#include + +static econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy); + +econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy) +{ + + int32_t count; + FILE *f; + econameidx_t *indexname; + int32_t i; + + f = open_ecorecorddb(filename,&count,1); + + indexname = (econameidx_t*) ECOMALLOC(sizeof(econameidx_t) + sizeof(econame_t) * (count-1),"Allocate names"); + + indexname->count=count; + + for (i=0; i < count; i++){ + readnext_econame(f,(indexname->names)+i,taxonomy); + } + + return indexname; +} + +econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy) +{ + + econameformat_t *raw; + int32_t rs; + + raw = read_ecorecord(f,&rs); + + if (!raw) + return NULL; + + if (is_big_endian()) + { + raw->is_scientificname = swap_int32_t(raw->is_scientificname); + raw->namelength = swap_int32_t(raw->namelength); + raw->classlength = swap_int32_t(raw->classlength); + raw->taxid = swap_int32_t(raw->taxid); + } + + name->is_scientificname=raw->is_scientificname; + + name->name = ECOMALLOC((raw->namelength+1) * sizeof(char),"Allocate name"); + strncpy(name->name,raw->names,raw->namelength); + name->name[raw->namelength]=0; + + name->classname = ECOMALLOC((raw->classlength+1) * sizeof(char),"Allocate classname"); + strncpy(name->classname,(raw->names+raw->namelength),raw->classlength); + name->classname[raw->classlength]=0; + + name->taxon = taxonomy->taxons->taxon + raw->taxid; + + return name; +} + diff --git a/src/libecoPCR/ecorank.P b/src/libecoPCR/ecorank.P new file mode 100644 index 0000000..75e09b9 --- /dev/null +++ b/src/libecoPCR/ecorank.P @@ -0,0 +1,15 @@ +ecorank.o ecorank.P : ecorank.c ecoPCR.h /usr/include/stdio.h /usr/include/_types.h \ + /usr/include/sys/_types.h /usr/include/sys/cdefs.h \ + /usr/include/machine/_types.h /usr/include/i386/_types.h \ + /usr/include/inttypes.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \ + /usr/include/string.h /usr/include/stdlib.h /usr/include/available.h \ + /usr/include/sys/wait.h /usr/include/sys/signal.h \ + /usr/include/sys/appleapiopts.h /usr/include/machine/signal.h \ + /usr/include/i386/signal.h /usr/include/i386/_structs.h \ + /usr/include/sys/_structs.h /usr/include/machine/_structs.h \ + /usr/include/mach/i386/_structs.h /usr/include/sys/resource.h \ + /usr/include/machine/endian.h /usr/include/i386/endian.h \ + /usr/include/sys/_endian.h /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \ + /usr/include/machine/types.h /usr/include/i386/types.h diff --git a/src/libecoPCR/ecorank.c b/src/libecoPCR/ecorank.c new file mode 100644 index 0000000..4796088 --- /dev/null +++ b/src/libecoPCR/ecorank.c @@ -0,0 +1,52 @@ +#include "ecoPCR.h" +#include +#include + +static int compareRankLabel(const void *label1, const void *label2); + +ecorankidx_t *read_rankidx(const char *filename) +{ + int32_t count; + FILE *f; + ecorankidx_t *index; + int32_t i; + int32_t rs; + char *buffer; + + f = open_ecorecorddb(filename,&count,1); + + index = (ecorankidx_t*) ECOMALLOC(sizeof(ecorankidx_t) + sizeof(char*) * (count-1), + "Allocate rank index"); + + index->count=count; + + for (i=0; i < count; i++) + { + buffer = read_ecorecord(f,&rs); + index->label[i]=(char*) ECOMALLOC(rs+1, + "Allocate rank label"); + strncpy(index->label[i],buffer,rs); + } + + return index; +} + +int32_t rank_index(const char* label,ecorankidx_t* ranks) +{ + char **rep; + + rep = bsearch(label,ranks->label,ranks->count,sizeof(char*),compareRankLabel); + + if (rep) + return rep-ranks->label; + else + ECOERROR(ECO_NOTFOUND_ERROR,"Rank label not found"); + + return -1; +} + + +int compareRankLabel(const void *label1, const void *label2) +{ + return strcmp((const char*)label1,*(const char**)label2); +} diff --git a/src/libecoPCR/ecoseq.P b/src/libecoPCR/ecoseq.P new file mode 100644 index 0000000..6222690 --- /dev/null +++ b/src/libecoPCR/ecoseq.P @@ -0,0 +1,19 @@ +ecoseq.o ecoseq.P : ecoseq.c ecoPCR.h /usr/include/stdio.h /usr/include/_types.h \ + /usr/include/sys/_types.h /usr/include/sys/cdefs.h \ + /usr/include/machine/_types.h /usr/include/i386/_types.h \ + /usr/include/inttypes.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \ + /usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \ + /usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \ + /usr/include/machine/signal.h /usr/include/i386/signal.h \ + /usr/include/i386/_structs.h /usr/include/sys/_structs.h \ + /usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \ + /usr/include/sys/resource.h /usr/include/machine/endian.h \ + /usr/include/i386/endian.h /usr/include/sys/_endian.h \ + /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \ + /usr/include/machine/types.h /usr/include/i386/types.h \ + /usr/include/string.h /usr/include/zlib.h /usr/include/zconf.h \ + /usr/include/sys/types.h /usr/include/unistd.h \ + /usr/include/sys/unistd.h /usr/include/sys/select.h \ + /usr/include/sys/_select.h diff --git a/src/libecoPCR/ecoseq.c b/src/libecoPCR/ecoseq.c new file mode 100644 index 0000000..141db5d --- /dev/null +++ b/src/libecoPCR/ecoseq.c @@ -0,0 +1,227 @@ +#include "ecoPCR.h" +#include +#include +#include +#include +#include + +static FILE *open_seqfile(const char *prefix,int32_t index); + + +ecoseq_t *new_ecoseq() +{ + void *tmp; + + tmp = ECOMALLOC(sizeof(ecoseq_t),"Allocate new ecoseq structure"); + + return tmp; +} + +int32_t delete_ecoseq(ecoseq_t * seq) +{ + + if (seq) + { + if (seq->AC) + ECOFREE(seq->AC,"Free sequence AC"); + + if (seq->DE) + ECOFREE(seq->DE,"Free sequence DE"); + + if (seq->SQ) + ECOFREE(seq->SQ,"Free sequence SQ"); + + ECOFREE(seq,"Free sequence structure"); + + return 0; + + } + + return 1; +} + +ecoseq_t *new_ecoseq_with_data( char *AC, + char *DE, + char *SQ, + int32_t taxid_idx + ) +{ + ecoseq_t *tmp; + int32_t lstr; + tmp = new_ecoseq(); + + tmp->taxid=taxid_idx; + + if (AC) + { + lstr =strlen(AC); + tmp->AC=ECOMALLOC((lstr+1) * sizeof(char), + "Allocate sequence accession"); + strcpy(tmp->AC,AC); + } + + if (DE) + { + lstr =strlen(DE); + tmp->DE=ECOMALLOC((lstr+1) * sizeof(char), + "Allocate sequence definition"); + strcpy(tmp->DE,DE); + } + + if (SQ) + { + lstr =strlen(SQ); + tmp->SQ=ECOMALLOC((lstr+1) * sizeof(char), + "Allocate sequence data"); + strcpy(tmp->SQ,SQ); + } + + tmp->isexample=1; + + return tmp; + +} + +/** + * ?? used ?? + **/ +FILE *open_ecoseqdb(const char *filename, + int32_t *sequencecount) +{ + return open_ecorecorddb(filename,sequencecount,1); +} + +ecoseq_t *readnext_ecoseq(FILE *f) +{ + char *compressed=NULL; + + ecoseqformat_t *raw; + ecoseq_t *seq; + int32_t comp_status; + unsigned long int seqlength; + int32_t rs; + + raw = read_ecorecord(f,&rs); + + if (!raw) + return NULL; + + if (is_big_endian()) + { + raw->CSQ_length = swap_int32_t(raw->CSQ_length); + raw->DE_length = swap_int32_t(raw->DE_length); + raw->SQ_length = swap_int32_t(raw->SQ_length); + raw->taxid = swap_int32_t(raw->taxid); + } + + seq = new_ecoseq(); + + seq->taxid = raw->taxid; + + seq->AC = ECOMALLOC(strlen(raw->AC) +1, + "Allocate Sequence Accesion number"); + strncpy(seq->AC,raw->AC,strlen(raw->AC)); + + + seq->DE = ECOMALLOC(raw->DE_length+1, + "Allocate Sequence definition"); + strncpy(seq->DE,raw->data,raw->DE_length); + + seqlength = seq->SQ_length = raw->SQ_length; + + compressed = raw->data + raw->DE_length; + + seq->SQ = ECOMALLOC(seqlength+1, + "Allocate sequence buffer"); + + seq->isexample=1; + + comp_status = uncompress((unsigned char*)seq->SQ, + &seqlength, + (unsigned char*)compressed, + raw->CSQ_length); + + if (comp_status != Z_OK) + ECOERROR(ECO_IO_ERROR,"I cannot uncompress sequence data"); + + return seq; +} + +/** + * Open the sequences database (.sdx file) + * @param prefix name of the database (radical without extension) + * @param index integer + * + * @return file object + */ +FILE *open_seqfile(const char *prefix,int32_t index) +{ + char filename_buffer[1024]; + int32_t filename_length; + FILE *input; + int32_t seqcount; + + filename_length = snprintf(filename_buffer, + 1023, + "%s_%03d.sdx", + prefix, + index); + + + + if (filename_length >= 1024) + ECOERROR(ECO_ASSERT_ERROR,"file name is too long"); + + filename_buffer[filename_length]=0; + + input=open_ecorecorddb(filename_buffer,&seqcount,0); + + if (input) + fprintf(stderr,"# Reading file %s containing %d sequences...\n", + filename_buffer, + seqcount); + + return input; +} + +ecoseq_t *ecoseq_iterator(const char *prefix) +{ + static FILE *current_seq_file= NULL; + static int32_t current_file_idx = 1; + static char current_prefix[1024]; + ecoseq_t *seq; + + if (prefix) + { + current_file_idx = 1; + + if (current_seq_file) + fclose(current_seq_file); + + strncpy(current_prefix,prefix,1023); + current_prefix[1024]=0; + + current_seq_file = open_seqfile(current_prefix, + current_file_idx); + + if (!current_seq_file) + return NULL; + + } + + seq = readnext_ecoseq(current_seq_file); + + if (!seq && feof(current_seq_file)) + { + current_file_idx++; + fclose(current_seq_file); + current_seq_file = open_seqfile(current_prefix, + current_file_idx); + + + if (current_seq_file) + seq = readnext_ecoseq(current_seq_file); + } + + return seq; +} diff --git a/src/libecoPCR/ecotax.P b/src/libecoPCR/ecotax.P new file mode 100644 index 0000000..489fc97 --- /dev/null +++ b/src/libecoPCR/ecotax.P @@ -0,0 +1,15 @@ +ecotax.o ecotax.P : ecotax.c ecoPCR.h /usr/include/stdio.h /usr/include/_types.h \ + /usr/include/sys/_types.h /usr/include/sys/cdefs.h \ + /usr/include/machine/_types.h /usr/include/i386/_types.h \ + /usr/include/inttypes.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \ + /usr/include/string.h /usr/include/stdlib.h /usr/include/available.h \ + /usr/include/sys/wait.h /usr/include/sys/signal.h \ + /usr/include/sys/appleapiopts.h /usr/include/machine/signal.h \ + /usr/include/i386/signal.h /usr/include/i386/_structs.h \ + /usr/include/sys/_structs.h /usr/include/machine/_structs.h \ + /usr/include/mach/i386/_structs.h /usr/include/sys/resource.h \ + /usr/include/machine/endian.h /usr/include/i386/endian.h \ + /usr/include/sys/_endian.h /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \ + /usr/include/machine/types.h /usr/include/i386/types.h diff --git a/src/libecoPCR/ecotax.c b/src/libecoPCR/ecotax.c new file mode 100644 index 0000000..a0ade86 --- /dev/null +++ b/src/libecoPCR/ecotax.c @@ -0,0 +1,329 @@ +#include "ecoPCR.h" +#include +#include +#include + +static ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon); + + /** + * Open the taxonomy database + * @param pointer to the database (.tdx file) + * @return a ecotxidx_t structure + */ +ecotxidx_t *read_taxonomyidx(const char *filename) +{ + int32_t count; + FILE *f; + ecotxidx_t *index; + int32_t i; + + f = open_ecorecorddb(filename,&count,1); + + index = (ecotxidx_t*) ECOMALLOC(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count-1), + "Allocate taxonomy"); + + index->count=count; + for (i=0; i < count; i++){ + readnext_ecotaxon(f,&(index->taxon[i])); + index->taxon[i].parent=index->taxon + (size_t)index->taxon[i].parent; + } + return index; +} + + +int32_t delete_taxonomy(ecotxidx_t *index) +{ + int32_t i; + + if (index) + { + for (i=0; i< index->count; i++) + if (index->taxon[i].name) + ECOFREE(index->taxon[i].name,"Free scientific name"); + + ECOFREE(index,"Free Taxonomy"); + + return 0; + } + + return 1; +} + + + +int32_t delete_taxon(ecotx_t *taxon) +{ + if (taxon) + { + if (taxon->name) + ECOFREE(taxon->name,"Free scientific name"); + + ECOFREE(taxon,"Free Taxon"); + + return 0; + } + + return 1; +} + + +/** + * Read the database for a given taxon a save the data + * into the taxon structure(if any found) + * @param *f pointer to FILE type returned by fopen + * @param *taxon pointer to the structure + * + * @return a ecotx_t structure if any taxon found else NULL + */ +ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon) +{ + + ecotxformat_t *raw; + int32_t rs; + + raw = read_ecorecord(f,&rs); + + if (!raw) + return NULL; + + if (is_big_endian()) + { + raw->namelength = swap_int32_t(raw->namelength); + raw->parent = swap_int32_t(raw->parent); + raw->rank = swap_int32_t(raw->rank); + raw->taxid = swap_int32_t(raw->taxid); + } + + taxon->parent = (ecotx_t*)(size_t)raw->parent; + taxon->taxid = raw->taxid; + taxon->rank = raw->rank; + + taxon->name = ECOMALLOC((raw->namelength+1) * sizeof(char), + "Allocate taxon scientific name"); + + strncpy(taxon->name,raw->name,raw->namelength); + + return taxon; +} + + +ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName) +{ + ecotaxonomy_t *tax; + char *filename; + int buffsize; + + tax = ECOMALLOC(sizeof(ecotaxonomy_t), + "Allocate taxonomy structure"); + + buffsize = strlen(prefix)+10; + + filename = ECOMALLOC(buffsize, + "Allocate filename"); + + snprintf(filename,buffsize,"%s.rdx",prefix); + + tax->ranks = read_rankidx(filename); + + snprintf(filename,buffsize,"%s.tdx",prefix); + + tax->taxons = read_taxonomyidx(filename); + + if (readAlternativeName) + { + snprintf(filename,buffsize,"%s.ndx",prefix); + tax->names=read_nameidx(filename,tax); + } + else + tax->names=NULL; + return tax; + +} + + + +int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy) +{ + if (taxonomy) + { + if (taxonomy->ranks) + ECOFREE(taxonomy->ranks,"Free rank index"); + + if (taxonomy->taxons) + ECOFREE(taxonomy->taxons,"Free taxon index"); + + ECOFREE(taxonomy,"Free taxonomy structure"); + + return 0; + } + + return 1; +} + +ecotx_t *eco_findtaxonatrank(ecotx_t *taxon, + int32_t rankidx) +{ + ecotx_t *current_taxon; + ecotx_t *next_taxon; + + current_taxon = taxon; + next_taxon = current_taxon->parent; + + while ((current_taxon!=next_taxon) && // I' am the root node + (current_taxon->rank!=rankidx)) + { + current_taxon = next_taxon; + next_taxon = current_taxon->parent; + } + + if (current_taxon->rank==rankidx) + return current_taxon; + else + return NULL; +} + +/** + * Get back information concerning a taxon from a taxonomic id + * @param *taxonomy the taxonomy database + * @param taxid the taxonomic id + * + * @result a ecotx_t structure containing the taxonimic information + **/ +ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy, + int32_t taxid) +{ + ecotx_t *current_taxon; + int32_t taxoncount; + int32_t i; + + taxoncount=taxonomy->taxons->count; + + for (current_taxon=taxonomy->taxons->taxon, + i=0; + i < taxoncount; + i++, + current_taxon++){ + if (current_taxon->taxid==taxid){ + return current_taxon; + } + } + + return (ecotx_t*)NULL; +} + +/** + * Find out if taxon is son of other taxon (identified by its taxid) + * @param *taxon son taxon + * @param parent_taxid taxonomic id of the other taxon + * + * @return 1 is the other taxid math a parent taxid, else 0 + **/ +int eco_isundertaxon(ecotx_t *taxon, + int other_taxid) +{ + ecotx_t *next_parent; + + next_parent = taxon->parent; + + while ( (other_taxid != next_parent->taxid) && + (strcmp(next_parent->name, "root")) ) + { + next_parent = next_parent->parent; + } + + if (other_taxid == next_parent->taxid) + return 1; + else + return 0; +} + +ecotx_t *eco_getspecies(ecotx_t *taxon, + ecotaxonomy_t *taxonomy) +{ + static ecotaxonomy_t *tax=NULL; + static int32_t rankindex=-1; + + if (taxonomy && tax!=taxonomy) + { + rankindex = rank_index("species",taxonomy->ranks); + tax=taxonomy; + } + + if (!tax || rankindex < 0) + ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); + + return eco_findtaxonatrank(taxon,rankindex); +} + +ecotx_t *eco_getgenus(ecotx_t *taxon, + ecotaxonomy_t *taxonomy) +{ + static ecotaxonomy_t *tax=NULL; + static int32_t rankindex=-1; + + if (taxonomy && tax!=taxonomy) + { + rankindex = rank_index("genus",taxonomy->ranks); + tax=taxonomy; + } + + if (!tax || rankindex < 0) + ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); + + return eco_findtaxonatrank(taxon,rankindex); +} + + +ecotx_t *eco_getfamily(ecotx_t *taxon, + ecotaxonomy_t *taxonomy) +{ + static ecotaxonomy_t *tax=NULL; + static int32_t rankindex=-1; + + if (taxonomy && tax!=taxonomy) + { + rankindex = rank_index("family",taxonomy->ranks); + tax=taxonomy; + } + + if (!tax || rankindex < 0) + ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); + + return eco_findtaxonatrank(taxon,rankindex); +} + +ecotx_t *eco_getkingdom(ecotx_t *taxon, + ecotaxonomy_t *taxonomy) +{ + static ecotaxonomy_t *tax=NULL; + static int32_t rankindex=-1; + + if (taxonomy && tax!=taxonomy) + { + rankindex = rank_index("kingdom",taxonomy->ranks); + tax=taxonomy; + } + + if (!tax || rankindex < 0) + ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); + + return eco_findtaxonatrank(taxon,rankindex); +} + +ecotx_t *eco_getsuperkingdom(ecotx_t *taxon, + ecotaxonomy_t *taxonomy) +{ + static ecotaxonomy_t *tax=NULL; + static int32_t rankindex=-1; + + if (taxonomy && tax!=taxonomy) + { + rankindex = rank_index("superkingdom",taxonomy->ranks); + tax=taxonomy; + } + + if (!tax || rankindex < 0) + ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); + + return eco_findtaxonatrank(taxon,rankindex); +} diff --git a/src/libecoprimer/Makefile b/src/libecoprimer/Makefile new file mode 100644 index 0000000..d68202b --- /dev/null +++ b/src/libecoprimer/Makefile @@ -0,0 +1,34 @@ + +SOURCES = goodtaxon.c \ + readdnadb.c \ + smothsort.c \ + sortword.c \ + hashsequence.c \ + strictprimers.c \ + aproxpattern.c \ + merge.c \ + queue.c \ + libstki.c \ + sortmatch.c \ + pairs.c \ + apat_search.c + +SRCS=$(SOURCES) + +OBJECTS= $(patsubst %.c,%.o,$(SOURCES)) + +LIBFILE= libecoprimer.a +RANLIB= ranlib + + +include ../global.mk + + +all: $(LIBFILE) + +clean: + rm -rf $(OBJECTS) $(LIBFILE) + +$(LIBFILE): $(OBJECTS) + ar -cr $@ $? + $(RANLIB) $@ diff --git a/src/libecoprimer/apat.h b/src/libecoprimer/apat.h new file mode 100644 index 0000000..dd9ae06 --- /dev/null +++ b/src/libecoprimer/apat.h @@ -0,0 +1,120 @@ +/* ==================================================== */ +/* Copyright (c) Atelier de BioInformatique */ +/* Dec. 94 */ +/* File: apat.h */ +/* Purpose: pattern scan */ +/* History: */ +/* 28/12/94 : ascan first version */ +/* 14/05/99 : last revision */ +/* ==================================================== */ + + +#ifndef H_apat +#define H_apat + + +#include "libstki.h" +#include "inttypes.h" +#include "../libecoPCR/ecoPCR.h" + + +/* ----------------------------------------------- */ +/* constantes */ +/* ----------------------------------------------- */ + +#ifndef BUFSIZ +#define BUFSIZ 1024 /* io buffer size */ +#endif + +#define MAX_NAME_LEN BUFSIZ /* max length of sequence name */ + +#define ALPHA_LEN 4 /* alphabet length */ + /* *DO NOT* modify */ + +#define MAX_PATTERN 4 /* max # of patterns */ + /* *DO NOT* modify */ + +#define MAX_PAT_LEN 32 /* max pattern length */ + /* *DO NOT* modify */ + +#define MAX_PAT_ERR 32 /* max # of errors */ + /* *DO NOT* modify */ + +#define PATMASK 0x3ffffff /* mask for 26 symbols */ + /* *DO NOT* modify */ + +#define OBLIBIT 0x4000000 /* bit 27 to 1 -> oblig. pos */ + /* *DO NOT* modify */ + + /* mask for position */ +#define ONEMASK 0x80000000 /* mask for highest position */ + + /* masks for Levenhstein edit */ +#define OPER_IDT 0x00000000 /* identity */ +#define OPER_INS 0x40000000 /* insertion */ +#define OPER_DEL 0x80000000 /* deletion */ +#define OPER_SUB 0xc0000000 /* substitution */ + +#define OPER_SHFT 30 /* shift */ + + /* Levenhstein Opcodes */ +#define SOPER_IDT 0x0 /* identity */ +#define SOPER_INS 0x1 /* insertion */ +#define SOPER_DEL 0x2 /* deletion */ +#define SOPER_SUB 0x3 /* substitution */ + + /* Levenhstein Opcodes masks */ +#define OPERMASK 0xc0000000 /* mask for Opcodes */ +#define NOPERMASK 0x3fffffff /* negate of previous */ + + + +/* ----------------------------------------------- */ +/* data structures */ +/* ----------------------------------------------- */ + + +typedef uint32_t pattern_t[ALPHA_LEN], *ppattern_t; + + /* -------------------- */ +typedef struct { /* pattern */ + /* -------------------- */ +int patlen; /* pattern length */ +int maxerr; /* max # of errors */ +uint32_t omask; /* oblig. bits mask */ +bool_t circular; /* is circular sequence */ +} patternParam_t, *ppatternParam_t; + + +/* ----------------------------------------------- */ +/* macros */ +/* ----------------------------------------------- */ + +#ifndef NEW +#define NEW(typ) (typ*)malloc(sizeof(typ)) +#define NEWN(typ, dim) (typ*)malloc((unsigned long)(dim) * sizeof(typ)) +#define REALLOC(typ, ptr, dim) (typ*)realloc((void *) (ptr), (unsigned long)(dim) * sizeof(typ)) +#define FREE(ptr) free((void *) ptr) +#endif + +/* ----------------------------------------------- */ +/* prototypes */ +/* ----------------------------------------------- */ + + /* apat_search.c */ + +int32_t ManberNoErr(pecoseq_t pseq,ppattern_t pat, + ppatternParam_t param, + StackiPtr stkpos); + +int32_t ManberSub(pecoseq_t pseq,ppattern_t pat, + ppatternParam_t param, + StackiPtr stkpos); + +int32_t ManberAll(pecoseq_t pseq,ppattern_t pat, + ppatternParam_t param, + StackiPtr stkpos); + + +#endif /* H_apat */ + diff --git a/src/libecoprimer/apat_parse.c b/src/libecoprimer/apat_parse.c new file mode 100644 index 0000000..c11b3a7 --- /dev/null +++ b/src/libecoprimer/apat_parse.c @@ -0,0 +1,65 @@ +/* ==================================================== */ +/* Copyright (c) Atelier de BioInformatique */ +/* Mar. 92 */ +/* File: apat_parse.c */ +/* Purpose: Codage du pattern */ +/* History: */ +/* 00/07/94 : first version (stanford) */ +/* 00/11/94 : revised for DNA/PROTEIN */ +/* 30/12/94 : modified EncodePattern */ +/* for manber search */ +/* 14/05/99 : indels added */ +/* ==================================================== */ + +#include +#include +#include +#include + +#include "apat.h" +#include "ecoprimer.h" + + + /* IUPAC Dna */ +static int32_t sDnaCode[] = { + /* IUPAC */ + + 0x00000001 /* A */, 0x0000000E /* B */, 0x00000002 /* C */, + 0x0000000D /* D */, 0x00000000 /* E */, 0x00000000 /* F */, + 0x00000004 /* G */, 0x0000000B /* H */, 0x00000000 /* I */, + 0x00000000 /* J */, 0x0000000C /* K */, 0x00000000 /* L */, + 0x00000003 /* M */, 0x0000000F /* N */, 0x00000000 /* O */, + 0x00000000 /* P */, 0x00000000 /* Q */, 0x00000005 /* R */, + 0x00000006 /* S */, 0x00000008 /* T */, 0x00000008 /* U */, + 0x00000007 /* V */, 0x00000009 /* W */, 0x00000000 /* X */, + 0x0000000A /* Y */, 0x00000000 /* Z */ +}; + + +/* -------------------------------------------- */ +/* internal replacement of gets */ +/* -------------------------------------------- */ +static char *sGets(char *buffer, int size) { + + char *ebuf; + + if (! fgets(buffer, size-1, stdin)) + return NULL; + + /* remove trailing line feed */ + + ebuf = buffer + strlen(buffer); + + while (--ebuf >= buffer) { + if ((*ebuf == '\n') || (*ebuf == '\r')) + *ebuf = '\000'; + else + break; + } + + return buffer; +} + +/* -------------------------------------------- */ +/* Interface */ +/* -------------------------------------------- */ diff --git a/src/libecoprimer/apat_search.P b/src/libecoprimer/apat_search.P new file mode 100644 index 0000000..bfc181f --- /dev/null +++ b/src/libecoprimer/apat_search.P @@ -0,0 +1,17 @@ +apat_search.o apat_search.P : apat_search.c /usr/include/stdio.h /usr/include/_types.h \ + /usr/include/sys/_types.h /usr/include/sys/cdefs.h \ + /usr/include/machine/_types.h /usr/include/i386/_types.h \ + /usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \ + /usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \ + /usr/include/machine/signal.h /usr/include/i386/signal.h \ + /usr/include/i386/_structs.h /usr/include/sys/_structs.h \ + /usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \ + /usr/include/sys/resource.h /usr/include/machine/endian.h \ + /usr/include/i386/endian.h /usr/include/sys/_endian.h \ + /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \ + /usr/include/machine/types.h /usr/include/i386/types.h \ + /usr/include/string.h libstki.h ecotype.h apat.h \ + /usr/include/inttypes.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \ + ../libecoPCR/ecoPCR.h diff --git a/src/libecoprimer/apat_search.c b/src/libecoprimer/apat_search.c new file mode 100644 index 0000000..0ed417a --- /dev/null +++ b/src/libecoprimer/apat_search.c @@ -0,0 +1,156 @@ +/* ==================================================== */ +/* Copyright (c) Atelier de BioInformatique */ +/* Dec. 94 */ +/* File: apat_search.c */ +/* Purpose: recherche du pattern */ +/* algorithme de Baeza-Yates/Gonnet */ +/* Manber (agrep) */ +/* History: */ +/* 07/12/94 : first version */ +/* 28/12/94 : revised version */ +/* 14/05/99 : last revision */ +/* ==================================================== */ + + +#include +#include +#include + +#include "libstki.h" +#include "apat.h" + +#define POP PopiOut +#define PUSH(s,v) PushiIn(&(s),(v)) +#define TOPCURS CursiToTop +#define DOWNREAD ReadiDown + +#define KRONECK(x, msk) ((~x & msk) ? 0 : 1) +#define MIN(x, y) ((x) < (y) ? (x) : (y)) + + +/* -------------------------------------------- */ +/* Baeza-Yates/Manber algorithm */ +/* NoError */ +/* -------------------------------------------- */ +int32_t ManberNoErr(pecoseq_t pseq,ppattern_t pat, + ppatternParam_t param, + StackiPtr stkpos) +{ + int32_t pos; + uint32_t smask, r; + uint8_t *data; + int32_t end; + + end = (size_t)(pseq->SQ_length); + + if (param->circular) + end+=param->patlen - 1; + + + /* create local masks */ + smask = r = 0x1L << param->patlen; + /* init. scan */ + data = (uint8_t*)(pseq->SQ); + + /* loop on text data */ + for (pos = 0 ; pos < end ; pos++,data++) { + if (pos==pseq->SQ_length) + data=(uint8_t*)(pseq->SQ); + + if (*data < 4) + r = (r >> 1) & pat[*data]; + else + r=0; + + if (r & 0x1L) { + PUSH(stkpos, pos - param->patlen + 1); + } + + r |= smask; + } + return stkpos->top; /* aka # of hits */ +} + +/* -------------------------------------------- */ +/* Baeza-Yates/Manber algorithm */ +/* Substitution only */ +/* */ +/* Note : r array is stored as : */ +/* 0 0 r(0,j) r(0,j+1) r(1,j) r(1,j+1) ... */ +/* */ +/* -------------------------------------------- */ +int32_t ManberSub(pecoseq_t pseq,ppattern_t pat, + ppatternParam_t param, + StackiPtr stkpos) +{ + int e, found; + int32_t pos; + uint32_t smask, cmask, sindx; + uint32_t *pr, r[2 * MAX_PAT_ERR + 2]; + uint8_t *data; + int32_t end; + + end = (size_t)(pseq->SQ_length); + + if (param->circular) + end+=param->patlen - 1; + + /* create local masks */ + r[0] = r[1] = 0x0; + + cmask = smask = 0x1L << param->patlen; + + for (e = 0, pr = r + 3 ; e <= param->maxerr ; e++, pr += 2) + *pr = cmask; + + cmask = ~ param->omask; // A VOIR !!!!! omask (new) doit tre composŽ de + et - ... Ancien omask : bits + + /* init. scan */ + data = (uint8_t*)(pseq->SQ); + + /* loop on text data */ + + for (pos = 0 ; pos < end ; pos++,data++) { + if (pos==pseq->SQ_length) + data=(uint8_t*)(pseq->SQ); + + sindx = (*data==4) ? 0:pat[*data]; + + for (e = found = 0, pr = r ; e <= param->maxerr ; e++, pr += 2) { + + pr[2] = pr[3] | smask; + + pr[3] = ((pr[0] >> 1) & cmask) /* sub */ + | ((pr[2] >> 1) & sindx); /* ident */ + + if (pr[3] & 0x1L) { /* found */ + if (! found) { + PUSH(stkpos, pos - param->patlen + 1); + } + found++; + } + } + } + + return stkpos->top; /* aka # of hits */ +} + + +/* -------------------------------------------- */ +/* Baeza-Yates/Manber algorithm */ +/* API call to previous functions */ +/* -------------------------------------------- */ +int32_t ManberAll(pecoseq_t pseq,ppattern_t pat, + ppatternParam_t param, + StackiPtr stkpos) +{ + if (param->maxerr == 0) + return ManberNoErr(pseq, + pat, param, + stkpos); + else + return ManberSub(pseq, + pat, param, + stkpos); +} + diff --git a/src/libecoprimer/aproxpattern.P b/src/libecoprimer/aproxpattern.P new file mode 100644 index 0000000..1b3ebf8 --- /dev/null +++ b/src/libecoprimer/aproxpattern.P @@ -0,0 +1,17 @@ +aproxpattern.o aproxpattern.P : aproxpattern.c ecoprimer.h /usr/include/inttypes.h \ + /usr/include/sys/cdefs.h /usr/include/_types.h \ + /usr/include/sys/_types.h /usr/include/machine/_types.h \ + /usr/include/i386/_types.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \ + /usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \ + /usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \ + /usr/include/machine/signal.h /usr/include/i386/signal.h \ + /usr/include/i386/_structs.h /usr/include/sys/_structs.h \ + /usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \ + /usr/include/sys/resource.h /usr/include/machine/endian.h \ + /usr/include/i386/endian.h /usr/include/sys/_endian.h \ + /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \ + /usr/include/machine/types.h /usr/include/i386/types.h \ + /usr/include/stdio.h ecotype.h ../libecoPCR/ecoPCR.h apat.h libstki.h \ + debug.h /usr/include/math.h /usr/include/architecture/i386/math.h diff --git a/src/libecoprimer/aproxpattern.c b/src/libecoprimer/aproxpattern.c new file mode 100644 index 0000000..d924281 --- /dev/null +++ b/src/libecoprimer/aproxpattern.c @@ -0,0 +1,236 @@ +/* + * aproxpattern.c + * + * Created on: 20 nov. 2008 + * Author: coissac + */ + + +#include "ecoprimer.h" +#include "apat.h" +#include + +static uint8_t encoder[] = {0, // A + 4, // b + 1, // C + 4,4,4, // d, e, f + 2, // G + 4,4,4,4,4,4,4,4,4,4,4,4, // h,i,j,k,l,m,n,o,p,q,r,s + 3,3, // T,U + 4,4,4,4,4}; // v,w,x,y,z + + +ppattern_t buildPatternFromWord(word_t word, uint32_t patlen) +{ + static pattern_t pattern; + uint32_t i; + + for (i = 0 ; i < ALPHA_LEN ; i++) + pattern[i] = 0x0; + + for (i=0;i < patlen; i++) + { + pattern[word & 3LLU] |= 1 << i; + word>>=2; + } + + return pattern; + +} + + +#ifdef IS_UPPER(c) +#undef IS_UPPER(c) +#endif + +/* -------------------------------------------- */ +/* encode sequence */ +/* IS_UPPER is slightly faster than isupper */ +/* -------------------------------------------- */ + +#define IS_UPPER(c) (((c) >= 'A') && ((c) <= 'Z')) + +void encodeSequence(ecoseq_t *seq) +{ + int i; + uint8_t *data; + char *cseq; + + data = (uint8_t*)(seq->SQ); + cseq = seq->SQ; + + for (i=0;iSQ_length;i++,data++,cseq++) + { + *data = encoder[(IS_UPPER(*cseq) ? *cseq - 'A' : 'Z')]; + } +} + +pprimercount_t lookforAproxPrimer(pecodnadb_t database, uint32_t seqdbsize,uint32_t exampleCount, + pwordcount_t words,poptions_t options) +{ + pprimer_t data; + pprimercount_t primers; + ppattern_t pattern; + patternParam_t params; + uint32_t i; + uint32_t w; + uint32_t j; + Stacki positions; + uint32_t count=1; + uint32_t goodPrimers=0; + + uint32_t inSequenceQuorum; + uint32_t outSequenceQuorum; + bool_t conserved = TRUE; + + //poslist_t ttt; + + + inSequenceQuorum = (uint32_t)floor((float)exampleCount * options->sensitivity_quorum); + outSequenceQuorum = (uint32_t)floor((float)(seqdbsize-exampleCount) * options->false_positive_quorum); + + fprintf(stderr," Primers should be at least present in %d/%d example sequences\n",inSequenceQuorum,exampleCount); + fprintf(stderr," Primers should not be present in more than %d/%d counterexample sequences\n",outSequenceQuorum,(seqdbsize-exampleCount)); + + data = ECOMALLOC(words->size * sizeof(primer_t), + "Cannot allocate memory for fuzzy matching results"); + + params.circular = options->circular; + params.maxerr = options->error_max; + params.omask = (1 << options->strict_three_prime) -1; + params.patlen = options->primer_length; + + positions.val=NULL; + + for (i=0,w=0; i < words->size; i++) + { + data[w].word=WORD(words->words[i]); + data[w].inexample = 0; + data[w].outexample= 0; + count = 1; + + if (conserved) + { + data[w].directCount=ECOMALLOC(seqdbsize * sizeof(uint32_t), + "Cannot allocate memory for primer position"); + data[w].directPos = ECOMALLOC(seqdbsize * sizeof(poslist_t), + "Cannot allocate memory for primer position"); + data[w].reverseCount=ECOMALLOC(seqdbsize * sizeof(uint32_t), + "Cannot allocate memory for primer position"); + data[w].reversePos = ECOMALLOC(seqdbsize * sizeof(poslist_t), + "Cannot allocate memory for primer position"); + } + + pattern = buildPatternFromWord(data[w].word,options->primer_length); + positions.val=NULL; + + for (j=0; j < seqdbsize && (count < 2 || !options->no_multi_match); j++) + { + positions.cursor=0; + positions.top =0; + if (!positions.val) + { + positions.size=1; + positions.val = ECOMALLOC(sizeof(uint32_t), + "Cannot allocate memory for primer position"); + } + + + count = ManberAll(database[j],pattern,¶ms,&positions); + data[w].directCount[j]=count; + + + if (count>1) + { + data[w].directPos[j].pointer = (uint32_t*)positions.val; + positions.val=NULL; + } + else + { + data[w].directPos[j].pointer=NULL; + if (count==1) + data[w].directPos[j].value = (uint32_t)*(positions.val); + } + + + } + + pattern = buildPatternFromWord(ecoComplementWord(data[w].word,options->primer_length), + options->primer_length); + + for (j=0; j < seqdbsize && (count < 2 || !options->no_multi_match); j++) + { + positions.cursor=0; + positions.top =0; + if (!positions.val) + { + positions.size=1; + positions.val = ECOMALLOC(sizeof(uint32_t), + "Cannot allocate memory for primer position"); + } + + count = ManberAll(database[j],pattern,¶ms,&positions); + data[w].reverseCount[j]=count; + + if (count>1) + { + data[w].reversePos[j].pointer = (uint32_t*)positions.val; + positions.val=NULL; + } + else + { + data[w].reversePos[j].pointer=NULL; + if (count==1) + data[w].reversePos[j].value = (uint32_t)*(positions.val); + } + + if (database[j]->isexample) + { + data[w].inexample+=(data[w].directCount[j] || data[w].reverseCount[j])? 1:0; + } + else + { + data[w].outexample+=(data[w].directCount[j] || data[w].reverseCount[j])? 1:0; + + } + + count+=data[w].directCount[j]; + } + + data[w].good = data[w].inexample >= inSequenceQuorum && data[w].outexample <= outSequenceQuorum; + goodPrimers+=data[w].good? 1:0; + + fprintf(stderr,"Primers %5d/%d analyzed => sequence : %s in %d example and %d counterexample sequences \r", + i+1,words->size,ecoUnhashWord(data[w].word,options->primer_length), + data[w].inexample,data[w].outexample); + + + conserved=data[w].inexample >= inSequenceQuorum; + conserved=conserved && (count < 2 || !options->no_multi_match); + + if (conserved) + w++; + } + + if (positions.val) + ECOFREE(positions.val,"Free stack position pointer"); + + if (!conserved) + { + ECOFREE(data[w].directCount,"Free direct count table"); + ECOFREE(data[w].directPos,"Free direct count table"); + ECOFREE(data[w].reverseCount,"Free direct count table"); + ECOFREE(data[w].reversePos,"Free direct count table"); + } + + fprintf(stderr,"\n\nOn %d analyzed primers %d respect quorum conditions\n",words->size,goodPrimers); + fprintf(stderr,"Conserved primers for further analysis : %d/%d\n",w,words->size); + + primers = ECOMALLOC(sizeof(primercount_t),"Cannot allocate memory for primer table"); + primers->primers=ECOREALLOC(data, + w * sizeof(primer_t), + "Cannot reallocate memory for fuzzy matching results"); + primers->size=w; + + return primers; +} diff --git a/src/libecoprimer/debug.h b/src/libecoprimer/debug.h new file mode 100644 index 0000000..48f473a --- /dev/null +++ b/src/libecoprimer/debug.h @@ -0,0 +1,29 @@ +/* + * debug.h + * + * Created on: 12 nov. 2008 + * Author: coissac + */ + +#ifndef DEBUG_H_ +#define DEBUG_H_ + +#include + + +#ifdef DEBUG + +#define DEBUG_LOG(message,...) { \ + char *text; \ + (void)asprintf(&text,(message),##__VA_ARGS__); \ + fprintf(stderr,"DEBUG %s (line %d) : %s\n",__FILE__,__LINE__,(text)); \ + free(text); \ + } + +#else + +#define DEBUG_LOG(message, ...) + +#endif + +#endif /* DEBUG_H_ */ diff --git a/src/libecoprimer/ecoprimer.h b/src/libecoprimer/ecoprimer.h new file mode 100644 index 0000000..5d26573 --- /dev/null +++ b/src/libecoprimer/ecoprimer.h @@ -0,0 +1,238 @@ +/* + * epsort.h + * + * Created on: 6 nov. 2008 + * Author: coissac + */ + +#ifndef EPSORT_H_ +#define EPSORT_H_ + +#include +#include +#include +#include "ecotype.h" +#include "../libecoPCR/ecoPCR.h" +#include "apat.h" + +#define DEBUG +#include "debug.h" + +/**** + * Word format used : + * + * bit 63 : bad word -> this word should not be used + * bit 62 : multi word -> this word is not uniq in at least one seq + * bits 0-61 : hashed dna word of max size 31 pb + * code used for a : 00 + * code used for c : 01 + * code used for g : 10 + * code used for t : 11 + */ + +typedef uint64_t word_t, *pword_t; + +#define WORD(x) ((x) & 0x3FFFFFFFFFFFFFFFLLU) +#define WORD(x) ((x) & 0x3FFFFFFFFFFFFFFFLLU) + +#define ISBADWORD(x) (((x) & 0x8000000000000000LLU) >> 63) +#define SETBADWORD(x) ((x) | 0x8000000000000000LLU) +#define RESETBADWORD(x) ((x) & 0x7FFFFFFFFFFFFFFFLLU) + +#define ISMULTIWORD(x) (((x) & 0x4000000000000000LLU) >> 62) +#define SETMULTIWORD(x) ((x) | 0x4000000000000000LLU) +#define RESETMULTIWORD(x) ((x) & 0xBFFFFFFFFFFFFFFFLLU) + + +#define WORDMASK(s) ((1LLU << ((s) * 2)) -1) +#define LSHIFTWORD(x,s) (((x) << 2) & WORDMASK(s)) +#define RSHIFTWORD(x,s) (((x) & WORDMASK(s))>> 2) + +#define RAPPENDBASE(x,s,c) (LSHIFTWORD((x),(s)) | (word_t)(c)) +#define LAPPENDBASE(x,s,c) (RSHIFTWORD((x),(s)) | ((word_t)((~(c)) & 3) << (((s)-1) *2))) + + +#define ECO_ASSERT(x,message) if (!(x)) \ + { \ + fprintf(stderr,"Assertion Error in %s (line %d): %s\n", \ + __FILE__,\ + __LINE__,\ + message\ + ); \ + exit(ECO_ASSERT_ERROR); \ + } + +#define MINI(x,y) (((x) < (y)) ? (x):(y)) +#define MAXI(x,y) (((x) < (y)) ? (y):(x)) + +typedef struct { + pword_t words; + uint32_t *strictcount; + uint32_t inseqcount; + uint32_t outseqcount; + uint32_t size; +} wordcount_t, *pwordcount_t; + + +typedef union { + uint32_t *pointer; + uint32_t value; +} poslist_t, *ppostlist_t; + +typedef struct { + word_t word; + uint32_t *directCount; + ppostlist_t directPos; + + uint32_t *reverseCount; + ppostlist_t reversePos; + bool_t good; + uint32_t inexample; + uint32_t outexample; +} primer_t, *pprimer_t; + +typedef struct { + pprimer_t primers; + uint32_t size; +} primercount_t, *pprimercount_t; + +typedef struct { + word_t word; + uint32_t position; + bool_t strand; + bool_t good; /*TR: Added*/ +} primermatch_t, *pprimermatch_t; + +/*TR: Added*/ +typedef struct { + pprimermatch_t matches; + uint32_t matchcount; +} primermatchcount_t, *pprimermatchcount_t; + +typedef struct { + char *amplifia; + int32_t *taxonids; + uint32_t seqidcount; + uint32_t seqidindex; +} ampseqset_t, *pampseqset_t; + +typedef struct { + int32_t taxonid; + char **amplifia; + uint32_t amplifiacount; + uint32_t amplifiaindex; +} taxampset_t, *ptaxampset_t; + +typedef struct { + word_t w1; + word_t w2; + uint32_t inexample; /*inexample count*/ + uint32_t outexample; /*outexample count*/ + + uint32_t mind; + uint32_t maxd; + + uint32_t ampsetcount; + uint32_t ampsetindex; + pampseqset_t ampset; + + uint32_t taxsetcount; + uint32_t taxsetindex; + ptaxampset_t taxset; + + uint32_t oktaxoncount; +} pairs_t, *ppairs_t; + +/*TR: Added*/ +typedef struct { + ppairs_t pairs; + uint32_t paircount; +}pairscount_t, *ppairscount_t; + +typedef struct { + pword_t words; + uint32_t *count; + uint32_t push; + uint32_t pop; + uint32_t size; + bool_t empty; + bool_t full; +} queue_t, *pqueue_t; + +typedef struct { + pword_t words; + uint32_t *count; + uint32_t write; + uint32_t read1; + uint32_t read2; + uint32_t size; +} merge_t, *pmerge_t; + + +typedef struct { + uint32_t lmin; //**< Amplifia minimal length + uint32_t lmax; //**< Amplifia maximal length + uint32_t error_max; //**< maximum error count in fuzzy search + uint32_t primer_length; //**< minimal length of the primers + int32_t *restricted_taxid; //**< limit amplification below these taxid + int32_t *ignored_taxid; //**< no amplification below these taxid + char *prefix; + uint32_t circular; + uint32_t doublestrand; + float strict_quorum; + float strict_exclude_quorum; + float sensitivity_quorum; + float false_positive_quorum; + uint32_t strict_three_prime; + int32_t r; //**< count of restrited taxa (restricted_taxid array size) + int32_t g; //**< count of ignored taxa (ignored_taxid array size) + bool_t no_multi_match; + char taxonrank[20]; //TR to count ranks against a pair + int32_t taxonrankidx; //TR to count ranks against a pair +} options_t, *poptions_t; + +typedef ecoseq_t **pecodnadb_t; + +void sortword(pword_t table,uint32_t N); + + +pecodnadb_t readdnadb(const char *name, uint32_t *size); + +int isGoodTaxon(ecotaxonomy_t *taxonomy,int32_t taxon,poptions_t options); + +uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq); +pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint32_t doublestrand, ecoseq_t *seq,uint32_t *size); +uint32_t ecoCompactHashSequence(pword_t dest,uint32_t size); +const char* ecoUnhashWord(word_t word,uint32_t size); +word_t ecoComplementWord(word_t word,uint32_t size); +uint32_t ecoFindWord(pwordcount_t table,word_t word); + + +void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,uint32_t seqQuorum); +pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,ecoseq_t *seq); +void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t exampleCount,uint32_t seqQuorum,ecoseq_t *seq); + +pqueue_t newQueue(pqueue_t queue, uint32_t size); +pqueue_t resizeQueue(pqueue_t queue, uint32_t size); + +void pop(pqueue_t queue); +void push(pqueue_t queue, word_t word, uint32_t count); + +pqueue_t cleanQueue(pqueue_t queue); + +pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize, + uint32_t exampleCount, poptions_t options); +uint32_t filterMultiStrictPrimer(pwordcount_t strictprimers); + +void encodeSequence(ecoseq_t *seq); +ppattern_t buildPatternFromWord(word_t word, uint32_t patlen); + +pprimercount_t lookforAproxPrimer(pecodnadb_t database, uint32_t seqdbsize,uint32_t exampleCount, + pwordcount_t words,poptions_t options); + +void sortmatch(pprimermatch_t table,uint32_t N); + +/*TR: Added*/ +pairscount_t buildPrimerPairs(pecodnadb_t seqdb,uint32_t seqdbsize,pprimercount_t primers,poptions_t options); + +#endif /* EPSORT_H_ */ diff --git a/src/libecoprimer/ecotype.h b/src/libecoprimer/ecotype.h new file mode 100644 index 0000000..38ce5a8 --- /dev/null +++ b/src/libecoprimer/ecotype.h @@ -0,0 +1,14 @@ +/* + * ecotype.h + * + * Created on: 24 nov. 2008 + * Author: coissac + */ + +#ifndef ECOTYPE_H_ +#define ECOTYPE_H_ + +typedef enum { FALSE=0,TRUE=1} bool_t, *pbool_t; + + +#endif /* ECOTYPE_H_ */ diff --git a/src/libecoprimer/goodtaxon.P b/src/libecoprimer/goodtaxon.P new file mode 100644 index 0000000..a990580 --- /dev/null +++ b/src/libecoprimer/goodtaxon.P @@ -0,0 +1,17 @@ +goodtaxon.o goodtaxon.P : goodtaxon.c ecoprimer.h /usr/include/inttypes.h \ + /usr/include/sys/cdefs.h /usr/include/_types.h \ + /usr/include/sys/_types.h /usr/include/machine/_types.h \ + /usr/include/i386/_types.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \ + /usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \ + /usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \ + /usr/include/machine/signal.h /usr/include/i386/signal.h \ + /usr/include/i386/_structs.h /usr/include/sys/_structs.h \ + /usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \ + /usr/include/sys/resource.h /usr/include/machine/endian.h \ + /usr/include/i386/endian.h /usr/include/sys/_endian.h \ + /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \ + /usr/include/machine/types.h /usr/include/i386/types.h \ + /usr/include/stdio.h ecotype.h ../libecoPCR/ecoPCR.h apat.h libstki.h \ + debug.h diff --git a/src/libecoprimer/goodtaxon.c b/src/libecoprimer/goodtaxon.c new file mode 100644 index 0000000..f4d7598 --- /dev/null +++ b/src/libecoprimer/goodtaxon.c @@ -0,0 +1,27 @@ +/* + * goodtaxon.c + * + * Created on: 7 nov. 2008 + * Author: coissac + */ + + +#include "ecoprimer.h" + +int isGoodTaxon(ecotaxonomy_t *taxonomy,int32_t taxon,poptions_t options) +{ + int result; + + result=( (options->r == 0) || (eco_is_taxid_included(taxonomy, + options->restricted_taxid, + options->r, + taxonomy->taxons->taxon[taxon].taxid) + )) && + ((options->g == 0) || !(eco_is_taxid_included(taxonomy, + options->ignored_taxid, + options->g, + taxonomy->taxons->taxon[taxon].taxid) + )); + + return result; +} diff --git a/src/libecoprimer/hashsequence.P b/src/libecoprimer/hashsequence.P new file mode 100644 index 0000000..d4be603 --- /dev/null +++ b/src/libecoprimer/hashsequence.P @@ -0,0 +1,17 @@ +hashsequence.o hashsequence.P : hashsequence.c ecoprimer.h /usr/include/inttypes.h \ + /usr/include/sys/cdefs.h /usr/include/_types.h \ + /usr/include/sys/_types.h /usr/include/machine/_types.h \ + /usr/include/i386/_types.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \ + /usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \ + /usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \ + /usr/include/machine/signal.h /usr/include/i386/signal.h \ + /usr/include/i386/_structs.h /usr/include/sys/_structs.h \ + /usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \ + /usr/include/sys/resource.h /usr/include/machine/endian.h \ + /usr/include/i386/endian.h /usr/include/sys/_endian.h \ + /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \ + /usr/include/machine/types.h /usr/include/i386/types.h \ + /usr/include/stdio.h ecotype.h ../libecoPCR/ecoPCR.h apat.h libstki.h \ + debug.h diff --git a/src/libecoprimer/hashsequence.c b/src/libecoprimer/hashsequence.c new file mode 100644 index 0000000..8dfe6d4 --- /dev/null +++ b/src/libecoprimer/hashsequence.c @@ -0,0 +1,203 @@ +/* + * hashsequence.c + * + * Created on: 7 nov. 2008 + * Author: coissac + */ + + +#include "ecoprimer.h" + +static int cmpword(const void *x,const void *y); + +static int8_t encoder[] = {0, // A + -1, // b + 1, // C + -1,-1,-1, // d, e, f + 2, // G + -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // h,i,j,k,l,m,n,o,p,q,r,s + 3,3, // T,U + -1,-1,-1,-1,-1}; // v,w,x,y,z + + +uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq) +{ + uint32_t wordcount; + + wordcount = seq->SQ_length; + + if (!circular) wordcount-=wordsize-1; + + return wordcount; +} + +pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint32_t doublestrand, ecoseq_t *seq,uint32_t *size) +{ + uint32_t i=0; + uint32_t j; + char *base; + int8_t code; + int32_t error=0; + word_t word=0; + word_t antiword=0; + uint32_t lmax=0; + + (*size)=0; + + lmax = seq->SQ_length; + if (!circular) + lmax-= wordsize-1; + + if (!dest) + dest = ECOMALLOC(lmax*sizeof(word_t), + "I cannot allocate memory for sequence hashing" + ); + +// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i)); + + for (i=0, base = seq->SQ; i < wordsize && i < lmax; i++,base++) + { + error<<= 1; + + code = encoder[(*base) - 'A']; + if (code <0) + { + code = 0; + error|= 1; + } + + + word=RAPPENDBASE(word,wordsize,code); + if (doublestrand) + antiword=LAPPENDBASE(antiword,wordsize,code); + } + + if (!error && i==wordsize) + { + dest[*size]=(doublestrand) ? MINI(word,antiword):word; + (*size)++; + } + + + for (j=1; j < lmax; j++,i++,base++) + { + +// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j)); + + /* roll over the sequence for circular ones */ + if (i==(uint32_t)seq->SQ_length) base=seq->SQ; + + error<<= 1; + + code = encoder[(*base) - 'A']; + if (code <0) + { + code = 0; + error|= 1; + } + + word=RAPPENDBASE(word,wordsize,code); + if (doublestrand) + antiword=LAPPENDBASE(antiword,wordsize,code); + + if (!error) + { + dest[*size]=(doublestrand) ? MINI(word,antiword):word; + (*size)++; + } + + } + + return dest; + +} + +uint32_t ecoCompactHashSequence(pword_t table,uint32_t size) +{ + uint32_t i,j; + word_t current; + + sortword(table,size); + + current = 0; + current=SETMULTIWORD(current); /* build impossible word for the first loop cycle */ + + for (i=0,j=0; j < size;j++) + { + if (table[j]!=current) + { + current =table[j]; + table[i]=current; + i++; + } + else + table[i]=SETMULTIWORD(table[i]); + } + + return i; +} + +const char* ecoUnhashWord(word_t word,uint32_t size) +{ + static char buffer[32]; + static char decode[]="ACGT"; + + uint32_t i; + + for (i=0; i < size; i++) + { + buffer[i]=decode[(word >> (2 * (size - 1 -i))) & 3]; + } + + buffer[size]=0; + + return buffer; +} + +word_t ecoComplementWord(word_t word,uint32_t size) +{ + word_t rep=0; + uint32_t i; + +// DEBUG_LOG("%llx %llx",word,~word); + word=(~word) & WORDMASK(size); + for (i=0;i < size; i++) + { + + rep = RAPPENDBASE(rep,size,word & 3LLU); +// DEBUG_LOG("%016llx %016llx %016llx",word,word & 3LLU,rep); + word>>=2; + } +// DEBUG_LOG("Complemented = %s",ecoUnhashWord(rep,18)); + return rep; + +} + +static int cmpword(const void *x,const void *y) +{ + word_t w1 = *(pword_t)x; + word_t w2 = *(pword_t)y; + + w1 = WORD(w1); + w2 = WORD(w2); + + if (w1 < w2) + return -1; + if (w1 > w2) + return +1; + + return 0; +} + +uint32_t ecoFindWord(pwordcount_t table,word_t word) +{ + pword_t dest; + + dest = (pword_t)bsearch((const void*)&word,(const void*)table->words,table->size,sizeof(word_t),cmpword); + + if (dest) + return dest - table->words; + else + return ~0; +} + diff --git a/src/libecoprimer/libstki.P b/src/libecoprimer/libstki.P new file mode 100644 index 0000000..3b125ef --- /dev/null +++ b/src/libecoprimer/libstki.P @@ -0,0 +1,17 @@ +libstki.o libstki.P : libstki.c /usr/include/stdio.h /usr/include/_types.h \ + /usr/include/sys/_types.h /usr/include/sys/cdefs.h \ + /usr/include/machine/_types.h /usr/include/i386/_types.h \ + /usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \ + /usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \ + /usr/include/machine/signal.h /usr/include/i386/signal.h \ + /usr/include/i386/_structs.h /usr/include/sys/_structs.h \ + /usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \ + /usr/include/sys/resource.h /usr/include/machine/endian.h \ + /usr/include/i386/endian.h /usr/include/sys/_endian.h \ + /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \ + /usr/include/machine/types.h /usr/include/i386/types.h \ + /usr/include/string.h libstki.h ecotype.h ecoprimer.h \ + /usr/include/inttypes.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \ + ../libecoPCR/ecoPCR.h apat.h debug.h diff --git a/src/libecoprimer/libstki.c b/src/libecoprimer/libstki.c new file mode 100644 index 0000000..9bdebf2 --- /dev/null +++ b/src/libecoprimer/libstki.c @@ -0,0 +1,379 @@ +/* ==================================================== */ +/* Copyright (c) Atelier de BioInformatique */ +/* Mar. 92 */ +/* File: libstki.c */ +/* Purpose: A library to deal with 'stacks' of */ +/* integers */ +/* Note: 'stacks' are dynamic (i.e. size is */ +/* automatically readjusted when needed) */ +/* History: */ +/* 00/03/92 : first draft */ +/* 15/08/93 : revised version */ +/* 14/05/99 : last revision */ +/* ==================================================== */ + +#include +#include +#include + +#include "libstki.h" +#include "ecoprimer.h" + + +/* ============================ */ +/* Constantes et Macros locales */ +/* ============================ */ + +#define ExpandStack(stkh) ResizeStacki((stkh), (*stkh)->size << 1) + +#define ShrinkStack(stkh) ResizeStacki((stkh), (*stkh)->size >> 1) + + +static int16_t sStkiLastError = kStkiNoErr; + +/* -------------------------------------------- */ +/* gestion des erreurs */ +/* get/reset erreur flag */ +/* */ +/* @function: StkiError */ +/* -------------------------------------------- */ + +int16_t StkiError(bool_t reset) +{ + int16_t err; + + err = sStkiLastError; + + if (reset) + sStkiLastError = kStkiNoErr; + + return err; + +} /* end of StkiError */ + +/* -------------------------------------------- */ +/* creation d'un stack */ +/* */ +/* @function: NewStacki */ +/* -------------------------------------------- */ + +StackiPtr NewStacki(int32_t size) +{ + StackiPtr stki; + + if (! (stki = NEW(Stacki))) + return NULL; + + stki->size = size; + stki->top = 0; + stki->cursor = 0; + + if ( ! (stki->val = NEWN(int32_t, size))) { + sStkiLastError = kStkiMemErr; + return FreeStacki(stki); + } + + return stki; + +} /* end of NewStacki */ + + +/* -------------------------------------------- */ +/* liberation d'un stack */ +/* */ +/* @function: FreeStacki */ +/* -------------------------------------------- */ + +StackiPtr FreeStacki(StackiPtr stki) +{ + if (stki) { + if (stki->val) + ECOFREE(stki->val,"Free stack values"); + ECOFREE(stki,"Free stack"); + } + + return NULL; + +} /* end of FreeStacki */ + +/* -------------------------------------------- */ +/* creation d'un vecteur de stacks */ +/* */ +/* @function: NewStackiVector */ +/* -------------------------------------------- */ + +StackiHdle NewStackiVector(int32_t vectSize, int32_t stackSize) +{ + int32_t i; + StackiHdle stkh; + + if (! (stkh = NEWN(StackiPtr, vectSize))) { + sStkiLastError = kStkiMemErr; + return NULL; + } + + for (i = 0 ; i < vectSize ; i++) + if (! (stkh[i] = NewStacki(stackSize))) + return FreeStackiVector(stkh, i); + + return stkh; + +} /* end of NewStackiVector */ + + +/* -------------------------------------------- */ +/* liberation d'un vecteur de stacks */ +/* */ +/* @function: FreeStackiVector */ +/* -------------------------------------------- */ + +StackiHdle FreeStackiVector(StackiHdle stkh, int32_t vectSize) +{ + int32_t i; + + if (stkh) { + for (i = 0 ; i < vectSize ; i++) + (void) FreeStacki(stkh[i]); + ECOFREE(stkh,"Free stack vector"); + } + + return NULL; + +} /* end of FreeStackiVector */ + +/* -------------------------------------------- */ +/* resize d'un stack */ +/* */ +/* @function: ResizeStacki */ +/* -------------------------------------------- */ + +int32_t ResizeStacki(StackiHdle stkh, int32_t size) +{ + int32_t resize = 0; /* assume error */ + int32_t *val; + + if ((val = ECOREALLOC((*stkh)->val, size * sizeof(int32_t),"Cannot reallocate stack values"))) { + (*stkh)->size = resize = size; + (*stkh)->val = val; + } + + if (! resize) + sStkiLastError = kStkiMemErr; + + return resize; + +} /* end of ResizeStacki */ + +/* -------------------------------------------- */ +/* empilage(/lement) */ +/* */ +/* @function: PushiIn */ +/* -------------------------------------------- */ + +bool_t PushiIn(StackiHdle stkh, int32_t val) +{ + if (((*stkh)->top >= (*stkh)->size) && (! ExpandStack(stkh))) + return FALSE; + + (*stkh)->val[((*stkh)->top)++] = val; + + return TRUE; + +} /* end of PushiIn */ + +/* -------------------------------------------- */ +/* depilage(/lement) */ +/* */ +/* @function: PopiOut */ +/* -------------------------------------------- */ + +bool_t PopiOut(StackiHdle stkh, int32_t *val) +{ + if ((*stkh)->top <= 0) + return FALSE; + + *val = (*stkh)->val[--((*stkh)->top)]; + + if ( ((*stkh)->top < ((*stkh)->size >> 1)) + && ((*stkh)->top > kMinStackiSize)) + + (void) ShrinkStack(stkh); + + return TRUE; + +} /* end of PopiOut */ + +/* -------------------------------------------- */ +/* lecture descendante */ +/* */ +/* @function: ReadiDown */ +/* -------------------------------------------- */ + +bool_t ReadiDown(StackiPtr stki, int32_t *val) +{ + if (stki->cursor <= 0) + return FALSE; + + *val = stki->val[--(stki->cursor)]; + + return TRUE; + +} /* end of ReadiDown */ + +/* -------------------------------------------- */ +/* lecture ascendante */ +/* */ +/* @function: ReadiUp */ +/* -------------------------------------------- */ + +bool_t ReadiUp(StackiPtr stki, int32_t *val) +{ + if (stki->cursor >= stki->top) + return FALSE; + + *val = stki->val[(stki->cursor)++]; + + return TRUE; + +} /* end of ReadiUp */ + +/* -------------------------------------------- */ +/* remontee/descente du curseur */ +/* */ +/* @function: CursiToTop */ +/* @function: CursiToBottom */ +/* -------------------------------------------- */ + +void CursiToTop(StackiPtr stki) +{ + stki->cursor = stki->top; + +} /* end of CursiToTop */ + +void CursiToBottom(stki) + StackiPtr stki; +{ + stki->cursor = 0; + +} /* end of CursiToBottom */ + +/* -------------------------------------------- */ +/* echange des valeurs cursor <-> (top - 1) */ +/* */ +/* @function: CursiSwap */ +/* -------------------------------------------- */ + +void CursiSwap(StackiPtr stki) +{ + int32_t tmp; + + if ((stki->top <= 0) || (stki->cursor < 0)) + return; + + tmp = stki->val[stki->cursor]; + stki->val[stki->cursor] = stki->val[stki->top - 1]; + stki->val[stki->top - 1] = tmp; + +} /* end of CursiSwap */ + +/* -------------------------------------------- */ +/* Recherche d'une valeur en stack a partir du */ +/* curseur courant en descendant. */ +/* on laisse le curseur a l'endroit trouve */ +/* */ +/* @function: SearchDownStacki */ +/* -------------------------------------------- */ + +bool_t SearchDownStacki(StackiPtr stki, int32_t sval) +{ + int32_t val; + bool_t more; + + while ((more = ReadiDown(stki, &val))) + if (val == sval) + break; + + return more; + +} /* end of SearchDownStacki */ + +/* -------------------------------------------- */ +/* Recherche dichotomique d'une valeur en stack */ +/* le stack est suppose trie par valeurs */ +/* croissantes. */ +/* on place le curseur a l'endroit trouve */ +/* */ +/* @function: BinSearchStacki */ +/* -------------------------------------------- */ + +bool_t BinSearchStacki(StackiPtr stki, int32_t sval) +{ + int32_t midd, low, high, span; + + low = 0; + high = stki->top - 1; + + while (high >= low) { + + midd = (high + low) / 2; + + span = stki->val[midd] - sval; + + if (span == 0) { + stki->cursor = midd; + return TRUE; + } + + if (span > 0) + high = midd - 1; + else + low = midd + 1; + } + + return FALSE; + +} /* end of BinSearchStacki */ + +/* -------------------------------------------- */ +/* teste l'egalite *physique* de deux stacks */ +/* */ +/* @function: SameStacki */ +/* -------------------------------------------- */ + +bool_t SameStacki(StackiPtr stki1, StackiPtr stki2) +{ + if (stki1->top != stki2->top) + return FALSE; + + return ((memcmp(stki1->val, stki2->val, + stki1->top * sizeof(int32_t)) == 0) ? TRUE : FALSE); + +} /* end of SameStacki */ + + +/* -------------------------------------------- */ +/* inverse l'ordre des elements dans un stack */ +/* */ +/* @function: ReverseStacki */ +/* -------------------------------------------- */ + +bool_t ReverseStacki(StackiPtr stki) +{ + int32_t *t, *b, swp; + + if (stki->top <= 0) + return FALSE; + + b = stki->val; + t = b + stki->top - 1; + + while (t > b) { + swp = *t; + *t-- = *b; + *b++ = swp; + } + + return TRUE; + +} /* end of ReverseStacki */ + diff --git a/src/libecoprimer/libstki.h b/src/libecoprimer/libstki.h new file mode 100644 index 0000000..cad7d60 --- /dev/null +++ b/src/libecoprimer/libstki.h @@ -0,0 +1,89 @@ +/* ==================================================== */ +/* Copyright (c) Atelier de BioInformatique */ +/* Mar. 92 */ +/* File: libstki.h */ +/* Purpose: library of dynamic stacks holding */ +/* integer values */ +/* History: */ +/* 00/03/92 : first draft */ +/* 07/07/93 : complete revision */ +/* 10/03/94 : added xxxVector funcs */ +/* 14/05/99 : last revision */ +/* ==================================================== */ + +#ifndef _H_libstki +#define _H_libstki + + +#include "ecotype.h" + +/* ==================================================== */ +/* Constantes de dimensionnement */ +/* ==================================================== */ + +#ifndef kMinStackiSize +#define kMinStackiSize 2 /* taille mini stack */ +#endif + + +#define kStkiNoErr 0 /* ok */ +#define kStkiMemErr 1 /* not enough memory */ + +#define kStkiReset TRUE +#define kStkiGet FALSE + +/* ==================================================== */ +/* Macros standards */ +/* ==================================================== */ + +#ifndef NEW +#define NEW(typ) (typ*)malloc(sizeof(typ)) +#define NEWN(typ, dim) (typ*)malloc((uint32_t)(dim) * sizeof(typ)) +#define REALLOC(typ, ptr, dim) (typ*)realloc((void *) (ptr), (uint32_t)(dim) * sizeof(typ)) +#define FREE(ptr) free((Ptr) ptr) +#endif + + +/* ==================================================== */ +/* Types & Structures de donnees */ +/* ==================================================== */ + + /* -------------------- */ + /* structure : pile */ + /* -------------------- */ +typedef struct Stacki { + /* ---------------------*/ + int32_t size; /* stack size */ + int32_t top; /* current free pos. */ + int32_t cursor; /* current cursor */ + int32_t *val; /* values */ + /* ---------------------*/ +} Stacki, *StackiPtr, **StackiHdle; + + + +/* ==================================================== */ +/* Prototypes (generated by mproto) */ +/* ==================================================== */ + + /* libstki.c */ + +int16_t StkiError (bool_t reset ); +StackiPtr NewStacki (int32_t size ); +StackiPtr FreeStacki (StackiPtr stki ); +StackiHdle NewStackiVector (int32_t vectSize, int32_t stackSize ); +StackiHdle FreeStackiVector (StackiHdle stkh, int32_t vectSize ); +int32_t ResizeStacki (StackiHdle stkh , int32_t size ); +bool_t PushiIn (StackiHdle stkh , int32_t val ); +bool_t PopiOut (StackiHdle stkh , int32_t *val ); +bool_t ReadiDown (StackiPtr stki , int32_t *val ); +bool_t ReadiUp (StackiPtr stki , int32_t *val ); +void CursiToTop (StackiPtr stki ); +void CursiToBottom (StackiPtr stki ); +void CursiSwap (StackiPtr stki ); +bool_t SearchDownStacki (StackiPtr stki , int32_t sval ); +bool_t BinSearchStacki (StackiPtr stki , int32_t sval ); +bool_t SameStacki (StackiPtr stki1 , StackiPtr stki2 ); +bool_t ReverseStacki (StackiPtr stki ); + +#endif /* _H_libstki */ diff --git a/src/libecoprimer/mapping.c b/src/libecoprimer/mapping.c new file mode 100644 index 0000000..96c84bd --- /dev/null +++ b/src/libecoprimer/mapping.c @@ -0,0 +1,7 @@ +/* + * mapping.c + * + * Created on: 25 nov. 2008 + * Author: coissac + */ + diff --git a/src/libecoprimer/merge.P b/src/libecoprimer/merge.P new file mode 100644 index 0000000..fdb5796 --- /dev/null +++ b/src/libecoprimer/merge.P @@ -0,0 +1,17 @@ +merge.o merge.P : merge.c ecoprimer.h /usr/include/inttypes.h \ + /usr/include/sys/cdefs.h /usr/include/_types.h \ + /usr/include/sys/_types.h /usr/include/machine/_types.h \ + /usr/include/i386/_types.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \ + /usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \ + /usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \ + /usr/include/machine/signal.h /usr/include/i386/signal.h \ + /usr/include/i386/_structs.h /usr/include/sys/_structs.h \ + /usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \ + /usr/include/sys/resource.h /usr/include/machine/endian.h \ + /usr/include/i386/endian.h /usr/include/sys/_endian.h \ + /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \ + /usr/include/machine/types.h /usr/include/i386/types.h \ + /usr/include/stdio.h ecotype.h ../libecoPCR/ecoPCR.h apat.h libstki.h \ + debug.h diff --git a/src/libecoprimer/merge.c b/src/libecoprimer/merge.c new file mode 100644 index 0000000..28ec24b --- /dev/null +++ b/src/libecoprimer/merge.c @@ -0,0 +1,144 @@ +/* + * merge.c + * + * Created on: 11 nov. 2008 + * Author: coissac + */ + +#include "ecoprimer.h" + +static pmerge_t mergeInit(pmerge_t merge,pwordcount_t data,uint32_t s1,uint32_t s2); + + +static pmerge_t mergeInit(pmerge_t merge, pwordcount_t data,uint32_t s1,uint32_t s2) +{ + merge->words = data->words; + merge->count = data->strictcount; + merge->write = 0; + merge->read1 = 0; + merge->read2 = s1; + merge->size = s1+s2; + return merge; +} + + +typedef enum {S1=1,S2=2,STACK=3} source_t; + +void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,uint32_t seqQuorum) +{ + merge_t merged; + source_t source; + word_t currentword,tmpword; + uint32_t currentcount,tmpcount; + int same; + queue_t queue; + int nsame=0; + uint32_t maxcount=0; + bool_t writed=TRUE; + +// DEBUG_LOG("Coucou %p s1= %d s2= %d",data,s1,s2) + + (void)mergeInit(&merged,data,s1,s2); + (void)newQueue(&queue,MINI(s1,s2)); + + while (merged.read1 < s1 && merged.read2 < merged.size) + { + if (! queue.empty) + { + currentword = queue.words[queue.pop]; + currentcount = queue.count[queue.pop]; + source=STACK; + } + else + { + currentword = merged.words[merged.read1]; + currentcount = merged.count[merged.read1]; + source=S1; + } + + if (WORD(currentword) > WORD(merged.words[merged.read2])) + { + currentword = merged.words[merged.read2]; + currentcount = merged.count[merged.read2]; + source = S2; + } + + same = (source != S2) && (WORD(currentword) == WORD(merged.words[merged.read2])); + nsame+=same; + +// DEBUG_LOG("Merging : r1 = %d s1 = %d r2 = %d size = %d word = %s source=%u same=%u",merged.read1,s1,merged.read2-s1,merged.size,ecoUnhashWord(currentword,18),source,same) + + tmpword = merged.words[merged.write]; + tmpcount= merged.count[merged.write]; + + merged.words[merged.write] = currentword; + merged.count[merged.write] = currentcount; + + if (source != S2) + { + if (same) + { + merged.count[merged.write]+=merged.count[merged.read2]; + + if (ISMULTIWORD(currentword) || ISMULTIWORD(merged.words[merged.read2])) + merged.words[merged.write]=SETMULTIWORD(currentword); + + merged.read2++; + } + + if (source==STACK) + pop(&queue); + merged.read1++; + } + else + merged.read2++; + + if (writed && merged.read1 <= merged.write && merged.write < s1) + push(&queue,tmpword,tmpcount); + + if (merged.count[merged.write] > maxcount) + maxcount=merged.count[merged.write]; + + writed = remainingSeq + merged.count[merged.write] >= seqQuorum; + if (writed) + merged.write++; + + +// else +// DEBUG_LOG("Remove word : %s count : %d remainingSeq : %d total : %d Quorum : %d", +// ecoUnhashWord(currentword,18),merged.count[merged.write],remainingSeq,maxcount+remainingSeq,seqQuorum); + + } /* while loop */ + +// DEBUG_LOG("r1 : %d r2 : %d qsize : %d nsame : %d tot : %d write : %s count : %d source : %d size : %d pop : %d push : %d empty : %d",merged.read1,merged.read2-s1,qsize,nsame,qsize+nsame,ecoUnhashWord(currentword,18),currentcount,source,queue.size,queue.pop,queue.push,queue.empty) + + + if (merged.read2 < merged.size) + for (;merged.read2 < merged.size;merged.read2++) + { + merged.words[merged.write]=merged.words[merged.read2]; + merged.count[merged.write]=merged.count[merged.read2]; + if (remainingSeq + merged.count[merged.write] >= seqQuorum) + merged.write++; + + } + else while (! queue.empty) + { +// DEBUG_LOG("write : %s count : %d write : %d size : %d pop : %d push : %d empty : %d",ecoUnhashWord(queue.words[queue.pop],18),queue.count[queue.pop],merged.write,queue.size,queue.pop,queue.push,queue.empty) + merged.words[merged.write]=queue.words[queue.pop]; + merged.count[merged.write]=queue.count[queue.pop]; + pop(&queue); + if (remainingSeq + merged.count[merged.write] >= seqQuorum) + merged.write++; + } + + data->size = merged.write; + + cleanQueue(&queue); + +// DEBUG_LOG("Max count : %d remainingSeq : %d total : %d Quorum : %d",maxcount,remainingSeq,maxcount+remainingSeq,seqQuorum) +// DEBUG_LOG("Second word : %s",ecoUnhashWord(data->words[1],18)) +// DEBUG_LOG("Last word : %s",ecoUnhashWord(data->words[data->size-1],18)) + + +} diff --git a/src/libecoprimer/pairs.P b/src/libecoprimer/pairs.P new file mode 100644 index 0000000..61a7976 --- /dev/null +++ b/src/libecoprimer/pairs.P @@ -0,0 +1,17 @@ +pairs.o pairs.P : pairs.c ecoprimer.h /usr/include/inttypes.h \ + /usr/include/sys/cdefs.h /usr/include/_types.h \ + /usr/include/sys/_types.h /usr/include/machine/_types.h \ + /usr/include/i386/_types.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \ + /usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \ + /usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \ + /usr/include/machine/signal.h /usr/include/i386/signal.h \ + /usr/include/i386/_structs.h /usr/include/sys/_structs.h \ + /usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \ + /usr/include/sys/resource.h /usr/include/machine/endian.h \ + /usr/include/i386/endian.h /usr/include/sys/_endian.h \ + /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \ + /usr/include/machine/types.h /usr/include/i386/types.h \ + /usr/include/stdio.h ecotype.h ../libecoPCR/ecoPCR.h apat.h libstki.h \ + debug.h /usr/include/string.h diff --git a/src/libecoprimer/pairs.c b/src/libecoprimer/pairs.c new file mode 100644 index 0000000..a5308a6 --- /dev/null +++ b/src/libecoprimer/pairs.c @@ -0,0 +1,321 @@ +/* + * pairs.c + * + * Created on: 15 dŽc. 2008 + * Author: coissac + */ + +#include "ecoprimer.h" +#include + +primermatchcount_t buildPrimerPairsForOneSeq(uint32_t seqid,pprimercount_t primers,poptions_t options); + +int32_t pairinlist (ppairs_t pairlist, word_t w1, word_t w2, uint32_t size) +{ + uint32_t i; + + for (i = 0; i < size; i++) + { + if (w1 == pairlist[i].w1 && w2 == pairlist[i].w2) return i; + if (w1 == pairlist[i].w2 && w2 == pairlist[i].w1) return i; + } + return -1; +} + +char *addamplifiasetelem (ppairs_t pair, char* amplifia, int32_t taxid) +{ + uint32_t i; + uint32_t j; + char *ampused = NULL; + + if(pair->ampsetcount == 0) + { + pair->ampsetcount = 500; + pair->ampsetindex = 0; + pair->ampset = ECOMALLOC(pair->ampsetcount * sizeof(ampseqset_t),"Cannot allocate amplifia set"); + } + + for (i = 0; i < pair->ampsetindex; i++) + { + if (strcmp (pair->ampset[i].amplifia, amplifia) == 0) + { + ampused = pair->ampset[i].amplifia; + break; + } + } + + if (i == 0) + { + pair->ampset[i].seqidcount = 100; + pair->ampset[i].seqidindex = 0; + pair->ampset[i].taxonids = ECOMALLOC(pair->ampset[i].seqidcount * sizeof(uint32_t),"Cannot allocate amplifia sequence table"); + } + + if (pair->ampsetindex == pair->ampsetcount) + { + pair->ampsetcount += 500; + pair->ampset = ECOREALLOC(pair->ampset, pair->ampsetcount * sizeof(ampseqset_t), "Cannot allocate amplifia set"); + } + + if (pair->ampset[i].seqidindex == pair->ampset[i].seqidcount) + { + pair->ampset[i].seqidcount += 100; + pair->ampset[i].taxonids = ECOREALLOC(pair->ampset[i].taxonids, pair->ampset[i].seqidcount * sizeof(int32_t), "Cannot allocate amplifia sequence table"); + } + + if (pair->ampset[i].amplifia == NULL) + { + pair->ampset[i].amplifia = amplifia; + pair->ampsetindex++; + } + + for (j = 0; j < pair->ampset[i].seqidindex; j++) + { + if (pair->ampset[i].taxonids[j] == taxid) break; + } + + if (j == pair->ampset[i].seqidindex) + pair->ampset[i].taxonids[pair->ampset[i].seqidindex++] = taxid; + return ampused; +} + +void addtaxampsetelem (ppairs_t pair, int32_t taxid, char *amplifia) +{ + uint32_t i; + uint32_t j; + + if(pair->taxsetcount == 0) + { + pair->taxsetcount = 500; + pair->taxsetindex = 0; + pair->taxset = ECOMALLOC(pair->taxsetcount * sizeof(taxampset_t),"Cannot allocate taxon set"); + } + + for (i = 0; i < pair->taxsetindex; i++) + { + if (pair->taxset[i].taxonid == taxid) break; + } + + if (i == 0) + { + pair->taxset[i].amplifiacount = 100; + pair->taxset[i].amplifiaindex = 0; + pair->taxset[i].amplifia = ECOMALLOC(pair->taxset[i].amplifiacount * sizeof(char *),"Cannot allocate amplifia table"); + } + + if (pair->taxsetindex == pair->taxsetcount) + { + pair->taxsetcount += 500; + pair->taxset = ECOREALLOC(pair->taxset, pair->taxsetcount * sizeof(taxampset_t), "Cannot allocate taxon set"); + } + + if (pair->taxset[i].amplifiaindex == pair->taxset[i].amplifiacount) + { + pair->taxset[i].amplifiacount += 100; + pair->taxset[i].amplifia = ECOREALLOC(pair->taxset[i].amplifia, pair->taxset[i].amplifiacount * sizeof(char *), "Cannot allocate amplifia table"); + } + + if (pair->taxset[i].taxonid == 0) + { + pair->taxset[i].taxonid = taxid; + pair->taxsetindex++; + } + + for (j = 0; j < pair->taxset[i].amplifiaindex; j++) + { + if (strcmp(pair->taxset[i].amplifia[j], amplifia) == 0) break; + } + + if (j == pair->taxset[i].amplifiaindex) + { + pair->taxset[i].amplifia[j] = amplifia; + pair->taxset[i].amplifiaindex++; + } +} + +char *getamplifia (pecoseq_t seq, uint32_t start, uint32_t len) +{ + char *amplifia = ECOMALLOC((len + 1) * sizeof(char),"Cannot allocate amplifia"); + char *seqc = &seq->SQ[start]; + + strncpy(amplifia, seqc, len); + return amplifia; +} + + +/*TR: Added*/ +pairscount_t buildPrimerPairs(pecodnadb_t seqdb,uint32_t seqdbsize,pprimercount_t primers,poptions_t options) +{ + uint32_t i; + uint32_t j; + uint32_t k; + uint32_t d; + uint32_t strt; + uint32_t end; + uint32_t paircount = 0; + uint32_t pairslots = 500; + int32_t foundindex; + ppairs_t pairs; + pairscount_t primerpairs; + primermatchcount_t seqmatchcount; + word_t w1; + word_t w2; + char *amplifia; + char *oldamp; + + + pairs = ECOMALLOC(pairslots * sizeof(pairs_t),"Cannot allocate pairs table"); + + for (i=0; i < seqdbsize; i++) + { + seqmatchcount = buildPrimerPairsForOneSeq(i, primers, options); + if (seqmatchcount.matchcount == 0) continue; + + for (j=0; j < seqmatchcount.matchcount; j++) + { + strt = 0; + w1 = seqmatchcount.matches[j].word; + /*first word should b on direct strand*/ + if (!seqmatchcount.matches[j].strand) + w1 = ecoComplementWord(w1, options->primer_length); + else + strt = options->primer_length; + + for (k=j+1; k < seqmatchcount.matchcount; k++) + { + end = 0; + w2 = seqmatchcount.matches[k].word; + /*second word should be on reverse strand*/ + if (seqmatchcount.matches[k].strand) + w2 = ecoComplementWord(w2, options->primer_length); + else + end = options->primer_length; + + if (!(seqmatchcount.matches[j].good || seqmatchcount.matches[k].good)) continue; + if (w1 == w2) continue; + + d = seqmatchcount.matches[k].position - seqmatchcount.matches[j].position; + if (d >= options->lmin && d <= options->lmax) + { + /*get amplified string*/ + amplifia = getamplifia (seqdb[i], seqmatchcount.matches[j].position + strt, d - strt - end); + + foundindex = pairinlist(pairs, w1, w2, paircount); + if (foundindex != -1) /*pair is found*/ + { + if (seqdb[i]->isexample) + pairs[foundindex].inexample++; + else + pairs[foundindex].outexample++; + + if (pairs[foundindex].mind > d) pairs[foundindex].mind = d; + else if (pairs[foundindex].maxd < d) pairs[foundindex].maxd = d; + + oldamp = addamplifiasetelem (&pairs[foundindex], amplifia, seqdb[i]->ranktaxonid); + /*if exact same string is already in amplifia set then use that for taxon set, it will help for + * calculating the fully identified taxons i.e specificity, we will compare pointrs instead of strings + * because same string means same pointer*/ + if (oldamp) + { + ECOFREE (amplifia, "free amplifia"); + amplifia = oldamp; + } + addtaxampsetelem (&pairs[foundindex], seqdb[i]->ranktaxonid, amplifia); + + continue; + } + + if (paircount == pairslots) + { + pairslots += 500; + pairs = ECOREALLOC(pairs, pairslots * sizeof(pairs_t), "Cannot allocate pairs table"); + } + pairs[paircount].w1 = w1; + pairs[paircount].w2 = w2; + if (seqdb[i]->isexample) pairs[paircount].inexample = 1; + else pairs[paircount].outexample = 1; + pairs[paircount].mind = d; + pairs[paircount].maxd = d; + oldamp = addamplifiasetelem (&pairs[paircount], amplifia, seqdb[i]->ranktaxonid); + addtaxampsetelem (&pairs[paircount], seqdb[i]->ranktaxonid, amplifia); + + paircount++; + } + else if (d > options->lmax) + break; /*once if the distance is greater than lmax then it will keep on increasing*/ + } + } + ECOFREE(seqmatchcount.matches, "Cannot free matches table"); + } + primerpairs.pairs = ECOREALLOC(pairs, paircount * sizeof(pairs_t), "Cannot allocate pairs table"); + primerpairs.paircount = paircount; + return primerpairs; +} + +primermatchcount_t buildPrimerPairsForOneSeq(uint32_t seqid,pprimercount_t primers,poptions_t options) +{ + uint32_t i,j,k; + uint32_t matchcount=0; + pprimermatch_t matches = NULL; + primermatchcount_t seqmatchcount; + + seqmatchcount.matchcount = 0; + seqmatchcount.matches = NULL; + + for (i=0;i < primers->size; i++) + { + matchcount+=primers->primers[i].directCount[seqid]; + matchcount+=primers->primers[i].reverseCount[seqid]; + } + + if (matchcount <= 0) return seqmatchcount; + matches = ECOMALLOC(matchcount * sizeof(primermatch_t),"Cannot allocate primers match table"); + + for (i=0,j=0;i < primers->size; i++) + { + if (primers->primers[i].directCount[seqid]) + { + if (primers->primers[i].directCount[seqid]==1) + { + matches[j].word = primers->primers[i].word; + matches[j].strand=TRUE; + matches[j].good=primers->primers[i].good;/*TR: Added*/ + matches[j].position=primers->primers[i].directPos[seqid].value; + j++; + } + else for (k=0; k < primers->primers[i].directCount[seqid]; k++,j++) + { + matches[j].word = primers->primers[i].word; + matches[j].strand=TRUE; + matches[j].good=primers->primers[i].good;/*TR: Added*/ + matches[j].position=primers->primers[i].directPos[seqid].pointer[k]; + } + } + + if (primers->primers[i].reverseCount[seqid]) + { + if (primers->primers[i].reverseCount[seqid]==1) + { + matches[j].word = primers->primers[i].word; + matches[j].strand=FALSE; + matches[j].good=primers->primers[i].good;/*TR: Added*/ + matches[j].position=primers->primers[i].reversePos[seqid].value; + j++; + } + else for (k=0; k < primers->primers[i].reverseCount[seqid]; k++,j++) + { + matches[j].word = primers->primers[i].word; + matches[j].strand=FALSE; + matches[j].good=primers->primers[i].good;/*TR: Added*/ + matches[j].position=primers->primers[i].reversePos[seqid].pointer[k]; + } + } + } + + sortmatch(matches,matchcount); // sort in asscending order by position + + /*TR: Added*/ + seqmatchcount.matches = matches; + seqmatchcount.matchcount = matchcount; + return seqmatchcount; +} diff --git a/src/libecoprimer/queue.P b/src/libecoprimer/queue.P new file mode 100644 index 0000000..e7a6bc9 --- /dev/null +++ b/src/libecoprimer/queue.P @@ -0,0 +1,17 @@ +queue.o queue.P : queue.c ecoprimer.h /usr/include/inttypes.h \ + /usr/include/sys/cdefs.h /usr/include/_types.h \ + /usr/include/sys/_types.h /usr/include/machine/_types.h \ + /usr/include/i386/_types.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \ + /usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \ + /usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \ + /usr/include/machine/signal.h /usr/include/i386/signal.h \ + /usr/include/i386/_structs.h /usr/include/sys/_structs.h \ + /usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \ + /usr/include/sys/resource.h /usr/include/machine/endian.h \ + /usr/include/i386/endian.h /usr/include/sys/_endian.h \ + /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \ + /usr/include/machine/types.h /usr/include/i386/types.h \ + /usr/include/stdio.h ecotype.h ../libecoPCR/ecoPCR.h apat.h libstki.h \ + debug.h diff --git a/src/libecoprimer/queue.c b/src/libecoprimer/queue.c new file mode 100644 index 0000000..7d11b25 --- /dev/null +++ b/src/libecoprimer/queue.c @@ -0,0 +1,100 @@ +/* + * queue.c + * + * Created on: 14 nov. 2008 + * Author: coissac + */ + +#include "ecoprimer.h" + + + +pqueue_t newQueue(pqueue_t queue, uint32_t size) +{ + if (!queue) + queue = ECOMALLOC(sizeof(queue_t),"Cannot allocate queue structure"); + + queue->size=0; + + resizeQueue(queue,size); + + return queue; + +} + +pqueue_t resizeQueue(pqueue_t queue, uint32_t size) +{ + queue->pop=0; + queue->push=0; + queue->empty=TRUE; + queue->full=FALSE; + + if (!queue->size) + { + queue->count=ECOMALLOC(size * sizeof(uint32_t), + "Cannot allocate count queue array" + ); + queue->words=ECOMALLOC(size * sizeof(word_t), + "Cannot allocate word queue array" + ); + queue->size=size; + } + else if (size > queue->size) + { + queue->count=ECOREALLOC(queue->count, + size * sizeof(uint32_t), + "Cannot allocate count queue array" + ); + queue->words=ECOREALLOC(queue->words, + size * sizeof(word_t), + "Cannot allocate word queue array" + ); + + queue->size=size; + } + + return queue; +} + +pqueue_t cleanQueue(pqueue_t queue) +{ + if (queue->size) + { + if (queue->count) + ECOFREE(queue->count,"Free count queue"); + if (queue->words) + ECOFREE(queue->words,"Free words queue"); + } + + queue->size=0; + + return queue; +} + +void push(pqueue_t queue, word_t word, uint32_t count) +{ + ECO_ASSERT(!queue->full,"Queue is full"); + + queue->count[queue->push]=count; + queue->words[queue->push]=word; + + queue->push++; + + if (queue->push==queue->size) + queue->push=0; + + queue->full=queue->push==queue->pop; + queue->empty=FALSE; +} + +void pop(pqueue_t queue) +{ + ECO_ASSERT(!queue->empty,"Queue is empty"); + queue->pop++; + + if (queue->pop==queue->size) + queue->pop=0; + + queue->empty=queue->push==queue->pop; + queue->full=FALSE; +} diff --git a/src/libecoprimer/readdnadb.P b/src/libecoprimer/readdnadb.P new file mode 100644 index 0000000..9d16e11 --- /dev/null +++ b/src/libecoprimer/readdnadb.P @@ -0,0 +1,17 @@ +readdnadb.o readdnadb.P : readdnadb.c ecoprimer.h /usr/include/inttypes.h \ + /usr/include/sys/cdefs.h /usr/include/_types.h \ + /usr/include/sys/_types.h /usr/include/machine/_types.h \ + /usr/include/i386/_types.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \ + /usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \ + /usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \ + /usr/include/machine/signal.h /usr/include/i386/signal.h \ + /usr/include/i386/_structs.h /usr/include/sys/_structs.h \ + /usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \ + /usr/include/sys/resource.h /usr/include/machine/endian.h \ + /usr/include/i386/endian.h /usr/include/sys/_endian.h \ + /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \ + /usr/include/machine/types.h /usr/include/i386/types.h \ + /usr/include/stdio.h ecotype.h ../libecoPCR/ecoPCR.h apat.h libstki.h \ + debug.h diff --git a/src/libecoprimer/readdnadb.c b/src/libecoprimer/readdnadb.c new file mode 100644 index 0000000..a148510 --- /dev/null +++ b/src/libecoprimer/readdnadb.c @@ -0,0 +1,35 @@ +/* + * readdnadb.c + * + * Created on: 7 nov. 2008 + * Author: coissac + */ + +#include "ecoprimer.h" + +pecodnadb_t readdnadb(const char *name, uint32_t *size) +{ + ecoseq_t *seq; + uint32_t buffsize=100; + pecodnadb_t db; + + db = ECOMALLOC(buffsize*sizeof(ecoseq_t*),"I cannot allocate db memory"); + + + for(seq=ecoseq_iterator(name), *size=0; + seq; + seq=ecoseq_iterator(NULL), (*size)++ + ) + { + if (*size==buffsize) + { + buffsize*=2; + db = ECOREALLOC(db,buffsize*sizeof(ecoseq_t*),"I cannot allocate db memory"); + } + db[*size]=seq; + }; + + db = ECOREALLOC(db,(*size)*sizeof(ecoseq_t*),"I cannot allocate db memory"); + + return db; +} diff --git a/src/libecoprimer/smothsort.P b/src/libecoprimer/smothsort.P new file mode 100644 index 0000000..2441cad --- /dev/null +++ b/src/libecoprimer/smothsort.P @@ -0,0 +1,10 @@ +smothsort.o smothsort.P : smothsort.c /usr/include/assert.h /usr/include/sys/cdefs.h \ + /usr/include/stdio.h /usr/include/_types.h /usr/include/sys/_types.h \ + /usr/include/machine/_types.h /usr/include/i386/_types.h \ + /usr/include/sys/types.h /usr/include/sys/appleapiopts.h \ + /usr/include/machine/types.h /usr/include/i386/types.h \ + /usr/include/machine/endian.h /usr/include/i386/endian.h \ + /usr/include/sys/_endian.h /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/sys/_structs.h \ + /usr/include/inttypes.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h diff --git a/src/libecoprimer/smothsort.c b/src/libecoprimer/smothsort.c new file mode 100644 index 0000000..72ee444 --- /dev/null +++ b/src/libecoprimer/smothsort.c @@ -0,0 +1,265 @@ +/* + * This file is part of the Sofia-SIP package + * + * Copyright (C) 2005 Nokia Corporation. + * + * Contact: Pekka Pessi + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +/**@file smoothsort.c + * @brief Smoothsort implementation + * + * Smoothsort is a in-place sorting algorithm with performance of O(NlogN) + * in worst case and O(n) in best case. + * + * @sa + * "Smoothsort, an alternative for sorting in-situ", E.D. Dijkstra, EWD796a, + * <http://www.enterag.ch/hartwig/order/smoothsort.pdf>. + * + * @author Pekka Pessi + */ + + +#include +#include +#include +#include /* add sto switch from size_t to uint32_t */ + +/** Description of current stretch */ +typedef struct { + uint32_t b, c; /**< Leonardo numbers */ + unsigned long long p; /**< Concatenation codification */ +} stretch; + +/** Description of array */ +typedef struct +{ + void *m; + int (*less)(void *m, uint32_t a, uint32_t b); + void (*swap)(void *m, uint32_t a, uint32_t b); +} array; + +static inline uint32_t stretch_up(stretch s[1]) +{ + uint32_t next; + + s->p >>= 1; + + next = s->b + s->c + 1, s->c = s->b, s->b = next; + + return next; +} + +static inline uint32_t stretch_down(stretch s[1], unsigned bit) +{ + uint32_t next; + + s->p <<= 1, s->p |= bit; + + next = s->c, s->c = s->b - s->c - 1, s->b = next; + + return next; +} + +#if DEBUG_SMOOTHSORT +static char const *binary(unsigned long long p) +{ + static char binary[65]; + int i; + + if (p == 0) + return "0"; + + binary[64] = 0; + + for (i = 64; p; p >>= 1) + binary[--i] = "01"[p & 1]; + + return binary + i; +} +#else +#define DEBUG(x) ((void)0) +#endif + +/** + * Sift the root of the stretch. + * + * The low values are sifted up (towards index 0) from root. + * + * @param array description of array to sort + * @param r root of the stretch + * @param s description of current stretch + */ +static void sift(array const *array, uint32_t r, stretch s) +{ + while (s.b >= 3) { + uint32_t r2 = r - s.b + s.c; + + if (!array->less(array->m, r - 1, r2)) { + r2 = r - 1; + stretch_down(&s, 0); + } + + if (array->less(array->m, r2, r)) + break; + + DEBUG(("\tswap(%p @%zu <=> @%zu)\n", array, r, r2)); + + array->swap(array->m, r, r2); r = r2; + + stretch_down(&s, 0); + } +} + +/** Trinkle the roots of the given stretches + * + * @param array description of array to sort + * @param r root of the stretch + * @param s description of stretches to concatenate + */ +static void trinkle(array const *array, uint32_t r, stretch s) +{ + DEBUG(("trinkle(%p, %zu, (%u, %s))\n", array, r, s.b, binary(s.p))); + + while (s.p != 0) { + uint32_t r2, r3; + + while ((s.p & 1) == 0) + stretch_up(&s); + + if (s.p == 1) + break; + + r3 = r - s.b; + + if (array->less(array->m, r3, r)) + break; + + s.p--; + + if (s.b < 3) { + DEBUG(("\tswap(%p @%zu <=> @%zu b=%u)\n", array, r, r3, s.b)); + array->swap(array->m, r, r3); r = r3; + continue; + } + + r2 = r - s.b + s.c; + + if (array->less(array->m, r2, r - 1)) { + r2 = r - 1; + stretch_down(&s, 0); + } + + if (array->less(array->m, r2, r3)) { + DEBUG(("swap(%p [%zu]=[%zu])\n", array, r, r3)); + array->swap(array->m, r, r3); r = r3; + continue; + } + + DEBUG(("\tswap(%p @%zu <=> @%zu b=%u)\n", array, r, r2, s.b)); + array->swap(array->m, r, r2); r = r2; + stretch_down(&s, 0); + break; + } + + sift(array, r, s); +} + +/** Trinkles the stretches when the adjacent stretches are already trusty. + * + * @param array description of array to sort + * @param r root of the stretch + * @param stretch description of stretches to trinkle + */ +static void semitrinkle(array const *array, uint32_t r, stretch s) +{ + uint32_t r1 = r - s.c; + + DEBUG(("semitrinkle(%p, %zu, (%u, %s))\n", array, r, s.b, binary(s.p))); + + if (array->less(array->m, r, r1)) { + DEBUG(("\tswap(%p @%zu <=> @%zu b=%u)\n", array, r, r1, s.b)); + array->swap(array->m, r, r1); + trinkle(array, r1, s); + } +} + +/** Sort array using smoothsort. + * + * Sort @a N elements from array @a base starting with index @a r with smoothsort. + * + * @param base pointer to array + * @param r lowest index to sort + * @param N number of elements to sort + * @param less comparison function returning nonzero if m[a] < m[b] + * @param swap swapper function exchanging elements m[a] and m[b] + */ +void su_smoothsort(void *base, uint32_t r, uint32_t N, + int (*less)(void *m, uint32_t a, uint32_t b), + void (*swap)(void *m, uint32_t a, uint32_t b)) +{ + stretch s = { 1, 1, 1 }; + uint32_t q; + + array const array[1] = {{ base, less, swap }}; + + assert(less && swap); + + if (base == NULL || N <= 1 || less == NULL || swap == NULL) + return; + + DEBUG(("\nsmoothsort(%p, %zu)\n", array, nmemb)); + + for (q = 1; q != N; q++, r++, s.p++) { + DEBUG(("loop0 q=%zu, b=%u, p=%s \n", q, s.b, binary(s.p))); + + if ((s.p & 7) == 3) { + sift(array, r, s), stretch_up(&s), stretch_up(&s); + } + else /* if ((s.p & 3) == 1) */ { assert((s.p & 3) == 1); + if (q + s.c < N) + sift(array, r, s); + else + trinkle(array, r, s); + + while (stretch_down(&s, 0) > 1) + ; + } + } + + trinkle(array, r, s); + + for (; q > 1; q--) { + s.p--; + + DEBUG(("loop1 q=%zu: b=%u p=%s\n", q, s.b, binary(s.p))); + + if (s.b <= 1) { + while ((s.p & 1) == 0) + stretch_up(&s); + --r; + } + else /* if b >= 3 */ { + if (s.p) semitrinkle(array, r - (s.b - s.c), s); + stretch_down(&s, 1); + semitrinkle(array, --r, s); + stretch_down(&s, 1); + } + } +} diff --git a/src/libecoprimer/sortmatch.P b/src/libecoprimer/sortmatch.P new file mode 100644 index 0000000..721cd46 --- /dev/null +++ b/src/libecoprimer/sortmatch.P @@ -0,0 +1,17 @@ +sortmatch.o sortmatch.P : sortmatch.c ecoprimer.h /usr/include/inttypes.h \ + /usr/include/sys/cdefs.h /usr/include/_types.h \ + /usr/include/sys/_types.h /usr/include/machine/_types.h \ + /usr/include/i386/_types.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \ + /usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \ + /usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \ + /usr/include/machine/signal.h /usr/include/i386/signal.h \ + /usr/include/i386/_structs.h /usr/include/sys/_structs.h \ + /usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \ + /usr/include/sys/resource.h /usr/include/machine/endian.h \ + /usr/include/i386/endian.h /usr/include/sys/_endian.h \ + /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \ + /usr/include/machine/types.h /usr/include/i386/types.h \ + /usr/include/stdio.h ecotype.h ../libecoPCR/ecoPCR.h apat.h libstki.h \ + debug.h /usr/include/math.h /usr/include/architecture/i386/math.h diff --git a/src/libecoprimer/sortmatch.c b/src/libecoprimer/sortmatch.c new file mode 100644 index 0000000..f3771b7 --- /dev/null +++ b/src/libecoprimer/sortmatch.c @@ -0,0 +1,51 @@ +/* + * sortmatch.c + * + * Created on: 15 dŽc. 2008 + * Author: coissac + */ + +/* + * sortword.c + * + * + * Created on: 6 nov. 2008 + * Author: coissac + */ + +#include "ecoprimer.h" +#include + +void su_smoothsort(void *base, uint32_t r, uint32_t N, + int (*less)(void *m, uint32_t a, uint32_t b), + void (*swap)(void *m, uint32_t a, uint32_t b)); + +static int less(void *m, uint32_t a, uint32_t b); +static void swap(void *m, uint32_t a, uint32_t b); + + +void sortmatch(pprimermatch_t table,uint32_t N) +{ + su_smoothsort((void*)table,0,N,less,swap); +} + +int less(void *m, uint32_t a, uint32_t b) +{ + pprimermatch_t t; + + t = (pprimermatch_t)m; + + return t[a].position <= t[b].position; +} + +void swap(void *m, uint32_t a, uint32_t b) +{ + primermatch_t tmp; + pprimermatch_t t; + + t = (pprimermatch_t)m; + tmp = t[a]; + t[a]= t[b]; + t[b]= tmp; +} + diff --git a/src/libecoprimer/sortword.P b/src/libecoprimer/sortword.P new file mode 100644 index 0000000..57df213 --- /dev/null +++ b/src/libecoprimer/sortword.P @@ -0,0 +1,17 @@ +sortword.o sortword.P : sortword.c ecoprimer.h /usr/include/inttypes.h \ + /usr/include/sys/cdefs.h /usr/include/_types.h \ + /usr/include/sys/_types.h /usr/include/machine/_types.h \ + /usr/include/i386/_types.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \ + /usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \ + /usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \ + /usr/include/machine/signal.h /usr/include/i386/signal.h \ + /usr/include/i386/_structs.h /usr/include/sys/_structs.h \ + /usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \ + /usr/include/sys/resource.h /usr/include/machine/endian.h \ + /usr/include/i386/endian.h /usr/include/sys/_endian.h \ + /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \ + /usr/include/machine/types.h /usr/include/i386/types.h \ + /usr/include/stdio.h ecotype.h ../libecoPCR/ecoPCR.h apat.h libstki.h \ + debug.h /usr/include/math.h /usr/include/architecture/i386/math.h diff --git a/src/libecoprimer/sortword.c b/src/libecoprimer/sortword.c new file mode 100644 index 0000000..389630f --- /dev/null +++ b/src/libecoprimer/sortword.c @@ -0,0 +1,44 @@ +/* + * sortword.c + * + * + * Created on: 6 nov. 2008 + * Author: coissac + */ + +#include "ecoprimer.h" +#include + +void su_smoothsort(void *base, uint32_t r, uint32_t N, + int (*less)(void *m, uint32_t a, uint32_t b), + void (*swap)(void *m, uint32_t a, uint32_t b)); + +static int less(void *m, uint32_t a, uint32_t b); +static void swap(void *m, uint32_t a, uint32_t b); + + +void sortword(pword_t table,uint32_t N) +{ + su_smoothsort((void*)table,0,N,less,swap); +} + +int less(void *m, uint32_t a, uint32_t b) +{ + pword_t t; + + t = (pword_t)m; + + return WORD(t[a]) <= WORD(t[b]); +} + +void swap(void *m, uint32_t a, uint32_t b) +{ + word_t tmp; + pword_t t; + + t = (pword_t)m; + tmp = t[a]; + t[a]= t[b]; + t[b]= tmp; +} + diff --git a/src/libecoprimer/strictprimers.P b/src/libecoprimer/strictprimers.P new file mode 100644 index 0000000..cd887aa --- /dev/null +++ b/src/libecoprimer/strictprimers.P @@ -0,0 +1,18 @@ +strictprimers.o strictprimers.P : strictprimers.c ecoprimer.h /usr/include/inttypes.h \ + /usr/include/sys/cdefs.h /usr/include/_types.h \ + /usr/include/sys/_types.h /usr/include/machine/_types.h \ + /usr/include/i386/_types.h \ + /usr/lib/gcc/i686-apple-darwin9/4.0.1/include/stdint.h \ + /usr/include/stdlib.h /usr/include/available.h /usr/include/sys/wait.h \ + /usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \ + /usr/include/machine/signal.h /usr/include/i386/signal.h \ + /usr/include/i386/_structs.h /usr/include/sys/_structs.h \ + /usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \ + /usr/include/sys/resource.h /usr/include/machine/endian.h \ + /usr/include/i386/endian.h /usr/include/sys/_endian.h \ + /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \ + /usr/include/machine/types.h /usr/include/i386/types.h \ + /usr/include/stdio.h ecotype.h ../libecoPCR/ecoPCR.h apat.h libstki.h \ + debug.h /usr/include/string.h /usr/include/math.h \ + /usr/include/architecture/i386/math.h diff --git a/src/libecoprimer/strictprimers.c b/src/libecoprimer/strictprimers.c new file mode 100644 index 0000000..7cca4e8 --- /dev/null +++ b/src/libecoprimer/strictprimers.c @@ -0,0 +1,170 @@ +/* + * strictprimers.c + * + * Created on: 7 nov. 2008 + * Author: coissac + */ + +#include "ecoprimer.h" +#include +#include + +pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,ecoseq_t *seq) +{ + uint32_t i; + uint32_t buffsize; + //wordcount_t t; + + if (!table) + table = ECOMALLOC(sizeof(wordcount_t),"Cannot allocate memory for word count structure"); + + table->words=NULL; + table->size =0; + table->outseqcount=0; + table->inseqcount=0; + table->strictcount =0; + + if (seq) + { + table->words = ecoHashSequence(NULL,wordsize,circular,doublestrand,seq,&buffsize); + table->size = ecoCompactHashSequence(table->words,buffsize); + + table->inseqcount=1; + table->strictcount =ECOMALLOC((table->size*sizeof(uint32_t)), + "Cannot allocate memory for word count table" + ); + + for (i=0; i < table->size; i++) table->strictcount[i]=1; + } + + return table; +} + +void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t exampleCount,uint32_t seqQuorum,ecoseq_t *seq) +{ + uint32_t buffersize; + pword_t newtable; + uint32_t newsize; + uint32_t i; + + buffersize = table->size + ecoWordCount(wordsize,circular,seq); + + table->words = ECOREALLOC(table->words,buffersize*sizeof(word_t), + "Cannot allocate memory to extend word table"); + + + newtable = table->words + table->size; + +// DEBUG_LOG("Words = %x (%u) new = %x", table->words,table->size,newtable); + + (void)ecoHashSequence(newtable,wordsize,circular,doublestrand,seq,&newsize); +// DEBUG_LOG("new seq wordCount : %d",newsize); + + newsize = ecoCompactHashSequence(newtable,newsize); + +// DEBUG_LOG("compacted wordCount : %d",newsize); + buffersize = table->size + newsize; + + // resize the count buffer + + table->inseqcount++; + + + table->strictcount = ECOREALLOC(table->strictcount,buffersize*sizeof(uint32_t), + "Cannot allocate memory to extend example word count table"); + + for (i=table->size; i < buffersize; i++) + table->strictcount[i]=1; + + + + // Now we have to merge in situ the two tables + + ecomerge(table,table->size,newsize,exampleCount - table->inseqcount,seqQuorum); +// DEBUG_LOG("Dictionnary size : %d",table->size); + +} + +pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize, + uint32_t exampleCount,poptions_t options) +{ + uint32_t i; + pwordcount_t strictprimers=NULL; + uint32_t sequenceQuorum = (uint32_t)floor((float)exampleCount * options->strict_quorum); + + fprintf(stderr," Primers should be at least present in %d/%d example sequences\n",sequenceQuorum,exampleCount); + + strictprimers = initCountTable(NULL,options->primer_length, + options->circular, + options->doublestrand, + NULL); + + + for (i=0;iisexample) + { + if (strictprimers->size) + { + uint32_t s; + s = strictprimers->size; +// DEBUG_LOG("stack size : %u",s); + addSeqToWordCountTable(strictprimers,options->primer_length, + options->circular, + options->doublestrand, + exampleCount, + sequenceQuorum, + database[i]); + } + else + strictprimers = initCountTable(strictprimers,options->primer_length, + options->circular, + options->doublestrand, + database[i]); + + } + else + strictprimers->outseqcount++; + + fprintf(stderr," Indexed sequences %5d/%5d : considered words %-10d \r",(int32_t)i+1,(int32_t)seqdbsize,strictprimers->size); + +// DEBUG_LOG("First word : %s ==> %d",ecoUnhashWord(strictprimers->words[0],18),strictprimers->incount[0]) +// DEBUG_LOG("Second word : %s ==> %d",ecoUnhashWord(strictprimers->words[1],18),strictprimers->incount[1]) + } + + strictprimers->strictcount = ECOREALLOC(strictprimers->strictcount, + sizeof(uint32_t)*strictprimers->size, + "Cannot reallocate strict primer count table"); + strictprimers->words = ECOREALLOC(strictprimers->words, + sizeof(word_t)*strictprimers->size, + "Cannot reallocate strict primer table"); + + return strictprimers; +} + +uint32_t filterMultiStrictPrimer(pwordcount_t strictprimers) +{ + uint32_t i; + uint32_t w; + + for (i=0,w=0;i < strictprimers->size;i++) + { + if (w < i) + { + strictprimers->words[w]=strictprimers->words[i]; + strictprimers->strictcount[w]=strictprimers->strictcount[i]; + } + if (! ISMULTIWORD(strictprimers->words[w])) + w++; + } + + strictprimers->size=w; + strictprimers->strictcount = ECOREALLOC(strictprimers->strictcount, + sizeof(uint32_t)*strictprimers->size, + "Cannot reallocate strict primer count table"); + strictprimers->words = ECOREALLOC(strictprimers->words, + sizeof(word_t)*strictprimers->size, + "Cannot reallocate strict primer table"); + + return w; +}