Compare commits
77 Commits
ecopcr_v0.
...
refactorin
Author | SHA1 | Date | |
---|---|---|---|
b000ae68d9 | |||
2e8c634b80 | |||
aaaf31063d | |||
ddcf0c2fc4 | |||
9fe0c1ce48 | |||
8eccd87035 | |||
cacd9854f3 | |||
212c1fbb7f | |||
2e7e0f625f | |||
d6fcc7673b | |||
379ca7988b | |||
24f958ce1f | |||
36d536305e | |||
1c421f5903 | |||
c77c360782 | |||
163693a132 | |||
7f39141289 | |||
433a19fd87 | |||
dc6e7bba69 | |||
f91c7804ab | |||
d94fc0daa4 | |||
d8696f0566 | |||
388fe75522 | |||
fc26a9fda3 | |||
8455ed0a99 | |||
ab46791915 | |||
86253e3712 | |||
9f4aa93249 | |||
087ee70620 | |||
c264556629 | |||
697e16ae98 | |||
cb71eb72f0 | |||
7fbd940ba2 | |||
cdc0375356 | |||
51a9cd7399 | |||
88f0fff090 | |||
82c93c6d10 | |||
7bd4139a94 | |||
0e5b94ad80 | |||
c736f5b59d | |||
82f8b79cd2 | |||
f3b14b9a8b | |||
255ab0c670 | |||
6b28897ff5 | |||
73786523bb | |||
d1051d2123 | |||
6a8bf94854 | |||
3117f978c2 | |||
dcee664315 | |||
cd4ea3fb32 | |||
a827cc0cfe | |||
065f4fe0c6 | |||
2a433afba2 | |||
45f27e8c87 | |||
d36821796f | |||
f2d3a6c5f6 | |||
8b455b0954 | |||
38030f2298 | |||
1a768aa97e | |||
65e22a235d | |||
1673e76781 | |||
0cbe831e0e | |||
adca6d14d9 | |||
7d9ab96dc1 | |||
5094a9a9ce | |||
4fd532e05c | |||
bcfa42d009 | |||
64c1bf5151 | |||
f64bb16909 | |||
736a5b384f | |||
079d097fb2 | |||
f4cce78d05 | |||
22216dc3a0 | |||
4b40069c51 | |||
c81a591889 | |||
33984922bd | |||
e599fc29a0 |
@ -1,62 +0,0 @@
|
||||
r -1acghtgd -2gctcagctagcta /Users/coissac/encours/ecoPCR/tools/ecodb
|
||||
print put
|
||||
print out
|
||||
f ecoPCR
|
||||
r -1GGGCAATCCTGAGCCAA -2CCATTGAGTCTCTGCACCTATC -l5 -L200 -e2 /Users/coissac/encours/ecoPCR/tools/ecodb
|
||||
bt
|
||||
print label1
|
||||
up
|
||||
print label1
|
||||
print (char*)label1
|
||||
r -1GGGCAATCCTGAGCCAA -2CCATTGAGTCTCTGCACCTATC -l5 -L200 -e2 /Users/coissac/encours/ecoPCR/tools/ecodb
|
||||
bt
|
||||
up 3
|
||||
l
|
||||
b 38
|
||||
r
|
||||
r -1GGGCAATCCTGAGCCAA -2CCATTGAGTCTCTGCACCTATC -l5 -L200 -e2 /Users/coissac/encours/ecoPCR/tools/ecodb
|
||||
print label
|
||||
print ranks
|
||||
print *ranks
|
||||
print ranks->label +1
|
||||
print *ranks->label +1
|
||||
print *(ranks->label +1)
|
||||
print *(ranks->label +2)
|
||||
b ecotax.c:147
|
||||
r -1GGGCAATCCTGAGCCAA -2CCATTGAGTCTCTGCACCTATC -l5 -L200 -e2 /Users/coissac/encours/ecoPCR/tools/ecodb
|
||||
print current_taxon ;
|
||||
print current_taxon
|
||||
n
|
||||
print current_taxon ;
|
||||
print current_taxon
|
||||
print *current_taxon
|
||||
print taxonomy->taxons[11]
|
||||
print taxonomy->taxons[2952]
|
||||
print taxonomy->taxons->taxon[2952]
|
||||
print taxonomy->ranks->rank[11]
|
||||
print taxonomy->ranks->label[11]
|
||||
print taxonomy->ranks->label[3]
|
||||
b ecotax.c:147
|
||||
r -1GGGCAATCCTGAGCCAA -2CCATTGAGTCTCTGCACCTATC -l5 -L200 -e2 /Users/coissac/encours/ecoPCR/tools/ecodb
|
||||
print current_taxon
|
||||
print *current_taxon
|
||||
n
|
||||
print *current_taxon
|
||||
b ecotax.c:147
|
||||
r -1GGGCAATCCTGAGCCAA -2CCATTGAGTCTCTGCACCTATC -l5 -L200 -e2 /Users/coissac/encours/ecoPCR/tools/ecodb
|
||||
n
|
||||
print *current_taxon
|
||||
r -1GGGCAATCCTGAGCCAA -2CCATTGAGTCTCTGCACCTATC -l5 -L200 -e2 /Users/coissac/encours/ecoPCR/tools/ecodb
|
||||
n
|
||||
print *current_taxon
|
||||
b ecodna.c:70
|
||||
r -1GGGCAATCCTGAGCC#A#A# -2CCATTGAGTCTCTGCACCTA#T#C# -l5 -L200 -e2 /Users/coissac/encours/ecoPCR/tools/ecodb
|
||||
*sb
|
||||
print *sb
|
||||
print sb
|
||||
print str
|
||||
b 52
|
||||
r -1GGGCAATCCTGAGCC#A#A# -2CCATTGAGTCTCTGCACCTA#T#C# -l5 -L200 -e2 /Users/coissac/encours/ecoPCR/tools/ecodb
|
||||
print str
|
||||
r -1GGGCAATCCTGAGCC#A#A# -2CCATTGAGTCTCTGCACCTA#T#C# -l5 -L200 -e2 /Users/coissac/encours/ecoPCR/tools/ecodb
|
||||
bt
|
@ -1,46 +0,0 @@
|
||||
################################################################################
|
||||
# Automatically-generated file. Do not edit!
|
||||
################################################################################
|
||||
|
||||
-include ../makefile.init
|
||||
|
||||
RM := rm -rf
|
||||
|
||||
# All of the sources participating in the build are defined here
|
||||
-include sources.mk
|
||||
-include subdir.mk
|
||||
-include src/libecoPCR/subdir.mk
|
||||
-include src/libapat/subdir.mk
|
||||
-include src/subdir.mk
|
||||
-include objects.mk
|
||||
|
||||
ifneq ($(MAKECMDGOALS),clean)
|
||||
ifneq ($(strip $(C_DEPS)),)
|
||||
-include $(C_DEPS)
|
||||
endif
|
||||
endif
|
||||
|
||||
-include ../makefile.defs
|
||||
|
||||
# Add inputs and outputs from these tool invocations to the build variables
|
||||
|
||||
# All Target
|
||||
all: ecoPCR
|
||||
|
||||
# Tool invocations
|
||||
ecoPCR: $(OBJS) $(USER_OBJS)
|
||||
@echo 'Building target: $@'
|
||||
@echo 'Invoking: MacOS X C Linker'
|
||||
gcc -o "ecoPCR" $(OBJS) $(USER_OBJS) $(LIBS)
|
||||
@echo 'Finished building target: $@'
|
||||
@echo ' '
|
||||
|
||||
# Other Targets
|
||||
clean:
|
||||
-$(RM) $(OBJS)$(EXECUTABLES)$(C_DEPS) ecoPCR
|
||||
-@echo ' '
|
||||
|
||||
.PHONY: all clean dependents
|
||||
.SECONDARY:
|
||||
|
||||
-include ../makefile.targets
|
@ -1,7 +0,0 @@
|
||||
################################################################################
|
||||
# Automatically-generated file. Do not edit!
|
||||
################################################################################
|
||||
|
||||
LIBS := -lz
|
||||
|
||||
USER_OBJS :=
|
@ -1,19 +0,0 @@
|
||||
################################################################################
|
||||
# Automatically-generated file. Do not edit!
|
||||
################################################################################
|
||||
|
||||
C_SRCS :=
|
||||
O_SRCS :=
|
||||
ASM_SRCS :=
|
||||
S_SRCS :=
|
||||
OBJ_SRCS :=
|
||||
OBJS :=
|
||||
EXECUTABLES :=
|
||||
C_DEPS :=
|
||||
|
||||
# Every subdirectory with source files must be described here
|
||||
SUBDIRS := \
|
||||
src/libecoPCR \
|
||||
src/libapat \
|
||||
src \
|
||||
|
@ -1,24 +0,0 @@
|
||||
################################################################################
|
||||
# Automatically-generated file. Do not edit!
|
||||
################################################################################
|
||||
|
||||
# Add inputs and outputs from these tool invocations to the build variables
|
||||
C_SRCS += \
|
||||
../src/libapat/CODES/gendual.c
|
||||
|
||||
OBJS += \
|
||||
./src/libapat/CODES/gendual.o
|
||||
|
||||
C_DEPS += \
|
||||
./src/libapat/CODES/gendual.d
|
||||
|
||||
|
||||
# Each subdirectory must supply rules for building sources it contributes
|
||||
src/libapat/CODES/%.o: ../src/libapat/CODES/%.c
|
||||
@echo 'Building file: $<'
|
||||
@echo 'Invoking: GCC C Compiler'
|
||||
gcc -DMAC_OS_X -O0 -g3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.d)" -o"$@" "$<"
|
||||
@echo 'Finished building: $<'
|
||||
@echo ' '
|
||||
|
||||
|
@ -1,30 +0,0 @@
|
||||
################################################################################
|
||||
# Automatically-generated file. Do not edit!
|
||||
################################################################################
|
||||
|
||||
# Add inputs and outputs from these tool invocations to the build variables
|
||||
C_SRCS += \
|
||||
../src/libapat/apat_parse.c \
|
||||
../src/libapat/apat_search.c \
|
||||
../src/libapat/libstki.c
|
||||
|
||||
OBJS += \
|
||||
./src/libapat/apat_parse.o \
|
||||
./src/libapat/apat_search.o \
|
||||
./src/libapat/libstki.o
|
||||
|
||||
C_DEPS += \
|
||||
./src/libapat/apat_parse.d \
|
||||
./src/libapat/apat_search.d \
|
||||
./src/libapat/libstki.d
|
||||
|
||||
|
||||
# Each subdirectory must supply rules for building sources it contributes
|
||||
src/libapat/%.o: ../src/libapat/%.c
|
||||
@echo 'Building file: $<'
|
||||
@echo 'Invoking: GCC C Compiler'
|
||||
gcc -DMAC_OS_X -O0 -g3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.d)" -o"$@" "$<"
|
||||
@echo 'Finished building: $<'
|
||||
@echo ' '
|
||||
|
||||
|
@ -1,45 +0,0 @@
|
||||
################################################################################
|
||||
# Automatically-generated file. Do not edit!
|
||||
################################################################################
|
||||
|
||||
# Add inputs and outputs from these tool invocations to the build variables
|
||||
C_SRCS += \
|
||||
../src/libecoPCR/ecoError.c \
|
||||
../src/libecoPCR/ecoIOUtils.c \
|
||||
../src/libecoPCR/ecoMalloc.c \
|
||||
../src/libecoPCR/ecoapat.c \
|
||||
../src/libecoPCR/ecodna.c \
|
||||
../src/libecoPCR/ecorank.c \
|
||||
../src/libecoPCR/ecoseq.c \
|
||||
../src/libecoPCR/ecotax.c
|
||||
|
||||
OBJS += \
|
||||
./src/libecoPCR/ecoError.o \
|
||||
./src/libecoPCR/ecoIOUtils.o \
|
||||
./src/libecoPCR/ecoMalloc.o \
|
||||
./src/libecoPCR/ecoapat.o \
|
||||
./src/libecoPCR/ecodna.o \
|
||||
./src/libecoPCR/ecorank.o \
|
||||
./src/libecoPCR/ecoseq.o \
|
||||
./src/libecoPCR/ecotax.o
|
||||
|
||||
C_DEPS += \
|
||||
./src/libecoPCR/ecoError.d \
|
||||
./src/libecoPCR/ecoIOUtils.d \
|
||||
./src/libecoPCR/ecoMalloc.d \
|
||||
./src/libecoPCR/ecoapat.d \
|
||||
./src/libecoPCR/ecodna.d \
|
||||
./src/libecoPCR/ecorank.d \
|
||||
./src/libecoPCR/ecoseq.d \
|
||||
./src/libecoPCR/ecotax.d
|
||||
|
||||
|
||||
# Each subdirectory must supply rules for building sources it contributes
|
||||
src/libecoPCR/%.o: ../src/libecoPCR/%.c
|
||||
@echo 'Building file: $<'
|
||||
@echo 'Invoking: GCC C Compiler'
|
||||
gcc -DMAC_OS_X -O0 -g3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.d)" -o"$@" "$<"
|
||||
@echo 'Finished building: $<'
|
||||
@echo ' '
|
||||
|
||||
|
@ -1,24 +0,0 @@
|
||||
################################################################################
|
||||
# Automatically-generated file. Do not edit!
|
||||
################################################################################
|
||||
|
||||
# Add inputs and outputs from these tool invocations to the build variables
|
||||
C_SRCS += \
|
||||
../src/ecopcr.c
|
||||
|
||||
OBJS += \
|
||||
./src/ecopcr.o
|
||||
|
||||
C_DEPS += \
|
||||
./src/ecopcr.d
|
||||
|
||||
|
||||
# Each subdirectory must supply rules for building sources it contributes
|
||||
src/%.o: ../src/%.c
|
||||
@echo 'Building file: $<'
|
||||
@echo 'Invoking: GCC C Compiler'
|
||||
gcc -DMAC_OS_X -O0 -g3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.d)" -o"$@" "$<"
|
||||
@echo 'Finished building: $<'
|
||||
@echo ' '
|
||||
|
||||
|
@ -1,46 +0,0 @@
|
||||
################################################################################
|
||||
# Automatically-generated file. Do not edit!
|
||||
################################################################################
|
||||
|
||||
-include ../makefile.init
|
||||
|
||||
RM := rm -rf
|
||||
|
||||
# All of the sources participating in the build are defined here
|
||||
-include sources.mk
|
||||
-include subdir.mk
|
||||
-include src/libecoPCR/subdir.mk
|
||||
-include src/libapat/subdir.mk
|
||||
-include src/subdir.mk
|
||||
-include objects.mk
|
||||
|
||||
ifneq ($(MAKECMDGOALS),clean)
|
||||
ifneq ($(strip $(C_DEPS)),)
|
||||
-include $(C_DEPS)
|
||||
endif
|
||||
endif
|
||||
|
||||
-include ../makefile.defs
|
||||
|
||||
# Add inputs and outputs from these tool invocations to the build variables
|
||||
|
||||
# All Target
|
||||
all: ecoPCR
|
||||
|
||||
# Tool invocations
|
||||
ecoPCR: $(OBJS) $(USER_OBJS)
|
||||
@echo 'Building target: $@'
|
||||
@echo 'Invoking: MacOS X C Linker'
|
||||
gcc -o "ecoPCR" $(OBJS) $(USER_OBJS) $(LIBS)
|
||||
@echo 'Finished building target: $@'
|
||||
@echo ' '
|
||||
|
||||
# Other Targets
|
||||
clean:
|
||||
-$(RM) $(OBJS)$(EXECUTABLES)$(C_DEPS) ecoPCR
|
||||
-@echo ' '
|
||||
|
||||
.PHONY: all clean dependents
|
||||
.SECONDARY:
|
||||
|
||||
-include ../makefile.targets
|
@ -1,7 +0,0 @@
|
||||
################################################################################
|
||||
# Automatically-generated file. Do not edit!
|
||||
################################################################################
|
||||
|
||||
LIBS := -lz
|
||||
|
||||
USER_OBJS :=
|
@ -1,19 +0,0 @@
|
||||
################################################################################
|
||||
# Automatically-generated file. Do not edit!
|
||||
################################################################################
|
||||
|
||||
C_SRCS :=
|
||||
O_SRCS :=
|
||||
ASM_SRCS :=
|
||||
S_SRCS :=
|
||||
OBJ_SRCS :=
|
||||
OBJS :=
|
||||
EXECUTABLES :=
|
||||
C_DEPS :=
|
||||
|
||||
# Every subdirectory with source files must be described here
|
||||
SUBDIRS := \
|
||||
src/libecoPCR \
|
||||
src/libapat \
|
||||
src \
|
||||
|
@ -1,30 +0,0 @@
|
||||
################################################################################
|
||||
# Automatically-generated file. Do not edit!
|
||||
################################################################################
|
||||
|
||||
# Add inputs and outputs from these tool invocations to the build variables
|
||||
C_SRCS += \
|
||||
../src/libapat/apat_parse.c \
|
||||
../src/libapat/apat_search.c \
|
||||
../src/libapat/libstki.c
|
||||
|
||||
OBJS += \
|
||||
./src/libapat/apat_parse.o \
|
||||
./src/libapat/apat_search.o \
|
||||
./src/libapat/libstki.o
|
||||
|
||||
C_DEPS += \
|
||||
./src/libapat/apat_parse.d \
|
||||
./src/libapat/apat_search.d \
|
||||
./src/libapat/libstki.d
|
||||
|
||||
|
||||
# Each subdirectory must supply rules for building sources it contributes
|
||||
src/libapat/%.o: ../src/libapat/%.c
|
||||
@echo 'Building file: $<'
|
||||
@echo 'Invoking: GCC C Compiler'
|
||||
gcc -DMAC_OS_X -O3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.d)" -o"$@" "$<"
|
||||
@echo 'Finished building: $<'
|
||||
@echo ' '
|
||||
|
||||
|
@ -1,45 +0,0 @@
|
||||
################################################################################
|
||||
# Automatically-generated file. Do not edit!
|
||||
################################################################################
|
||||
|
||||
# Add inputs and outputs from these tool invocations to the build variables
|
||||
C_SRCS += \
|
||||
../src/libecoPCR/ecoError.c \
|
||||
../src/libecoPCR/ecoIOUtils.c \
|
||||
../src/libecoPCR/ecoMalloc.c \
|
||||
../src/libecoPCR/ecoapat.c \
|
||||
../src/libecoPCR/ecodna.c \
|
||||
../src/libecoPCR/ecorank.c \
|
||||
../src/libecoPCR/ecoseq.c \
|
||||
../src/libecoPCR/ecotax.c
|
||||
|
||||
OBJS += \
|
||||
./src/libecoPCR/ecoError.o \
|
||||
./src/libecoPCR/ecoIOUtils.o \
|
||||
./src/libecoPCR/ecoMalloc.o \
|
||||
./src/libecoPCR/ecoapat.o \
|
||||
./src/libecoPCR/ecodna.o \
|
||||
./src/libecoPCR/ecorank.o \
|
||||
./src/libecoPCR/ecoseq.o \
|
||||
./src/libecoPCR/ecotax.o
|
||||
|
||||
C_DEPS += \
|
||||
./src/libecoPCR/ecoError.d \
|
||||
./src/libecoPCR/ecoIOUtils.d \
|
||||
./src/libecoPCR/ecoMalloc.d \
|
||||
./src/libecoPCR/ecoapat.d \
|
||||
./src/libecoPCR/ecodna.d \
|
||||
./src/libecoPCR/ecorank.d \
|
||||
./src/libecoPCR/ecoseq.d \
|
||||
./src/libecoPCR/ecotax.d
|
||||
|
||||
|
||||
# Each subdirectory must supply rules for building sources it contributes
|
||||
src/libecoPCR/%.o: ../src/libecoPCR/%.c
|
||||
@echo 'Building file: $<'
|
||||
@echo 'Invoking: GCC C Compiler'
|
||||
gcc -DMAC_OS_X -O3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.d)" -o"$@" "$<"
|
||||
@echo 'Finished building: $<'
|
||||
@echo ' '
|
||||
|
||||
|
@ -1,24 +0,0 @@
|
||||
################################################################################
|
||||
# Automatically-generated file. Do not edit!
|
||||
################################################################################
|
||||
|
||||
# Add inputs and outputs from these tool invocations to the build variables
|
||||
C_SRCS += \
|
||||
../src/ecopcr.c
|
||||
|
||||
OBJS += \
|
||||
./src/ecopcr.o
|
||||
|
||||
C_DEPS += \
|
||||
./src/ecopcr.d
|
||||
|
||||
|
||||
# Each subdirectory must supply rules for building sources it contributes
|
||||
src/%.o: ../src/%.c
|
||||
@echo 'Building file: $<'
|
||||
@echo 'Invoking: GCC C Compiler'
|
||||
gcc -DMAC_OS_X -O3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.d)" -o"$@" "$<"
|
||||
@echo 'Finished building: $<'
|
||||
@echo ' '
|
||||
|
||||
|
84
src/Makefile
Normal file
84
src/Makefile
Normal file
@ -0,0 +1,84 @@
|
||||
EXEC=ecoPCR ecofind ecoisundertaxon
|
||||
|
||||
PCR_SRC= ecopcr.c
|
||||
PCR_OBJ= $(patsubst %.c,%.o,$(PCR_SRC))
|
||||
|
||||
FIND_SRC= ecofind.c
|
||||
FIND_OBJ= $(patsubst %.c,%.o,$(FIND_SRC))
|
||||
|
||||
IUT_SRC= ecoisundertaxon.c
|
||||
IUT_OBJ= $(patsubst %.c,%.o,$(IUT_SRC))
|
||||
|
||||
SRCS= $(PCR_SRC) $(FIND_SRC) $(IUT_SRC)
|
||||
|
||||
LIB= -lecoPCR -lapat -lz -lm
|
||||
|
||||
LIBFILE= libapat/libapat.a \
|
||||
libecoPCR/libecoPCR.a
|
||||
|
||||
|
||||
include global.mk
|
||||
|
||||
all: $(EXEC)
|
||||
|
||||
|
||||
########
|
||||
#
|
||||
# ecoPCR compilation
|
||||
#
|
||||
########
|
||||
|
||||
# executable compilation and link
|
||||
|
||||
ecoPCR: $(PCR_OBJ) $(LIBFILE)
|
||||
$(CC) $(LDFLAGS) -o $@ $< $(LIBPATH) $(LIB)
|
||||
|
||||
########
|
||||
#
|
||||
# ecofind compilation
|
||||
#
|
||||
########
|
||||
|
||||
# executable compilation and link
|
||||
|
||||
ecofind: $(FIND_OBJ) $(LIBFILE)
|
||||
$(CC) $(LDFLAGS) -o $@ $< $(LIBPATH) $(LIB)
|
||||
|
||||
########
|
||||
#
|
||||
# IsUnderTaxon compilation
|
||||
#
|
||||
########
|
||||
|
||||
# executable compilation and link
|
||||
|
||||
ecoisundertaxon: $(IUT_OBJ) $(LIBFILE)
|
||||
$(CC) $(LDFLAGS) -o $@ $< $(LIBPATH) $(LIB)
|
||||
|
||||
########
|
||||
#
|
||||
# library compilation
|
||||
#
|
||||
########
|
||||
|
||||
libapat/libapat.a:
|
||||
$(MAKE) -C libapat
|
||||
|
||||
libecoPCR/libecoPCR.a:
|
||||
$(MAKE) -C libecoPCR
|
||||
|
||||
|
||||
########
|
||||
#
|
||||
# project management
|
||||
#
|
||||
########
|
||||
|
||||
clean:
|
||||
rm -f *.o
|
||||
rm -f $(EXEC)
|
||||
$(MAKE) -C libapat clean
|
||||
$(MAKE) -C libecoPCR clean
|
||||
|
||||
|
||||
|
345
src/ecofind.c
Normal file
345
src/ecofind.c
Normal file
@ -0,0 +1,345 @@
|
||||
#include "libecoPCR/ecoPCR.h"
|
||||
#include <regex.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <getopt.h>
|
||||
#include <stdio.h>
|
||||
#define VERSION "0.1"
|
||||
|
||||
/**
|
||||
* display the result
|
||||
**/
|
||||
static void printresult(ecotx_t *taxon,econame_t* name,ecotaxonomy_t *taxonomy)
|
||||
{
|
||||
char* rankname;
|
||||
char* classname;
|
||||
char* matchedname=taxon->name;
|
||||
|
||||
classname="scientific name";
|
||||
if (name)
|
||||
{
|
||||
classname=name->classname;
|
||||
matchedname=name->name;
|
||||
}
|
||||
|
||||
rankname= taxonomy->ranks->label[taxon->rank];
|
||||
|
||||
printf("%10d \t| %15s \t|\t %-50s \t|\t %15s \t|\t %s\n",
|
||||
taxon->taxid,
|
||||
rankname,
|
||||
matchedname,
|
||||
classname,
|
||||
taxon->name);
|
||||
}
|
||||
|
||||
/**
|
||||
* display header before printing any result
|
||||
**/
|
||||
static void printheader(void)
|
||||
{
|
||||
printf("# %12s \t| %15s \t|\t %-50s \t|\t %-15s \t|\t %s\n#\n",
|
||||
"taxonomy id",
|
||||
"taxonomy rank",
|
||||
"name",
|
||||
"class name",
|
||||
"scientific name");
|
||||
}
|
||||
|
||||
/**
|
||||
* display son's list for given taxon
|
||||
**/
|
||||
static void get_son(ecotaxonomy_t *taxonomy, ecotx_t *taxon, int32_t *count, char *rankname)
|
||||
{
|
||||
int32_t i;
|
||||
ecotx_t *current_taxon;
|
||||
|
||||
for ( i = 0, current_taxon = taxonomy->taxons->taxon;
|
||||
i < taxonomy->taxons->count;
|
||||
i++, current_taxon++)
|
||||
{
|
||||
if (taxon->taxid == current_taxon->parent->taxid)
|
||||
{
|
||||
if (rankname == NULL || !strcmp(rankname,taxonomy->ranks->label[current_taxon->rank]))
|
||||
{
|
||||
printresult(current_taxon, NULL, taxonomy);
|
||||
(*count)++;
|
||||
}
|
||||
get_son(taxonomy,current_taxon,count,rankname);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* display list of rank filter option (-l option)
|
||||
**/
|
||||
static void listfilteroptions(ecorankidx_t *ranks)
|
||||
{
|
||||
int32_t i;
|
||||
|
||||
printf("#\n");
|
||||
|
||||
for ( i=0;
|
||||
i < ranks->count;
|
||||
i++)
|
||||
{
|
||||
printf("# %s \n",ranks->label[i]);
|
||||
}
|
||||
|
||||
printf("#\n");
|
||||
}
|
||||
|
||||
|
||||
/* ---------------------------------------- */
|
||||
/* get back on given taxid taxonomic parent */
|
||||
/* and display it */
|
||||
/* ---------------------------------------- */
|
||||
void gettaxidparents(int32_t taxid, ecotaxonomy_t *taxonomy, char *rankname)
|
||||
{
|
||||
ecotx_t *next_parent;
|
||||
int32_t c = 0;
|
||||
|
||||
next_parent = eco_findtaxonbytaxid(taxonomy, taxid);
|
||||
|
||||
printheader();
|
||||
|
||||
printresult(next_parent, NULL,taxonomy);
|
||||
|
||||
while ( strcmp(next_parent->name, "root") )
|
||||
{
|
||||
next_parent = next_parent->parent;
|
||||
if (rankname == NULL || !strcmp(rankname,taxonomy->ranks->label[next_parent->rank]))
|
||||
{
|
||||
printresult(next_parent, NULL,taxonomy);
|
||||
c++;
|
||||
}
|
||||
}
|
||||
|
||||
printf("# %d parent(s) found\n#\n",c);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* printout usage and exit
|
||||
**/
|
||||
#define PP fprintf(stderr,
|
||||
|
||||
static void ExitUsage(stat)
|
||||
int stat;
|
||||
{
|
||||
PP "usage: ecofind [-d database] [-h] [-l] [-r taxonomic rank] [-p taxid] [-s taxid] <taxon name pattern> ... \n");
|
||||
PP "type \"ecofind -h\" for help\n");
|
||||
if (stat)
|
||||
exit(stat);
|
||||
}
|
||||
|
||||
#undef PP
|
||||
|
||||
/**
|
||||
* printout help
|
||||
**/
|
||||
#define PP fprintf(stdout,
|
||||
|
||||
static void PrintHelp()
|
||||
{
|
||||
PP "------------------------------------------\n");
|
||||
PP " ecofind Version %s\n", VERSION);
|
||||
PP "------------------------------------------\n");
|
||||
PP "synopsis : searching for taxonomic and rank and\n");
|
||||
PP " taxonomy id for given regular expression patterns\n\n");
|
||||
PP "usage: ecofind [options] <patterns>\n");
|
||||
PP "------------------------------------------\n");
|
||||
PP "options:\n");
|
||||
PP "-a : [A]ll enable the search on all alternative names and not only scientific names.\n\n");
|
||||
PP "-d : [D]atabase containing the taxonomy.\n");
|
||||
PP " To match the expected format, the database\n");
|
||||
PP " has to be formated first by the ecoPCRFormat.py\n");
|
||||
PP " program located in the tools directory.\n");
|
||||
PP " Write the database radical without any extension.\n\n");
|
||||
PP "-h : [H]elp - print <this> help\n\n");
|
||||
PP "-l : [L]ist all taxonomic rank available for -r option\n\n");
|
||||
PP "-p : [P]arents : specifiying this option displays all parental tree's information for the given taxid.\n\n");
|
||||
PP "-r : [R]estrict to given taxonomic rank\n\n");
|
||||
PP "-s : [S]ons: specifiying this option displays all subtree's information for the given taxid.\n\n");
|
||||
PP "arguments:\n");
|
||||
PP "<taxon> name pattern bearing regular expressions\n\n");
|
||||
PP "------------------------------------------\n");
|
||||
PP " http://www.grenoble.prabi.fr/trac/ecoPCR/\n");
|
||||
PP "------------------------------------------\n\n");
|
||||
}
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
|
||||
#define PATTERN_NUMBER 10
|
||||
#define PATTERN_LENGHT 40
|
||||
#define RESULT_LENGTH 100
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int32_t carg = 0;
|
||||
int32_t nummatch = 0;
|
||||
int32_t k,j = 0;
|
||||
int32_t errflag = 0;
|
||||
int32_t tax_count = 0;
|
||||
int32_t alternative = 0;
|
||||
char *prefix = NULL;
|
||||
ecotaxonomy_t *taxonomy;
|
||||
econame_t *name;
|
||||
int32_t name_count;
|
||||
|
||||
int re_error;
|
||||
int re_match;
|
||||
regex_t re_preg;
|
||||
|
||||
int32_t uptree = 0;
|
||||
int32_t subtree = 0;
|
||||
char *rankname = NULL;
|
||||
int32_t rankfilter = 1;
|
||||
int32_t list = 0;
|
||||
|
||||
ecotx_t *subtree_parent;
|
||||
int32_t count_son = 0;
|
||||
|
||||
|
||||
while ((carg = getopt(argc, argv, "had:p:s:r:l")) != -1) {
|
||||
switch (carg) {
|
||||
case 's': /* path to the database */
|
||||
sscanf(optarg,"%d",&subtree);
|
||||
break;
|
||||
|
||||
case 'r': /* rank filter */
|
||||
rankname = ECOMALLOC(strlen(optarg),"allocation rankname");
|
||||
strcpy(rankname,optarg);
|
||||
rankfilter = 0;
|
||||
break;
|
||||
|
||||
case 'd': /* path to the database */
|
||||
prefix = ECOMALLOC(strlen(optarg),"allocation prefix");
|
||||
strcpy(prefix,optarg);
|
||||
break;
|
||||
|
||||
case 'l': /* list rank filter options */
|
||||
list = 1;
|
||||
break;
|
||||
|
||||
case 'a': /* allow alternative names */
|
||||
alternative = 1;
|
||||
break;
|
||||
|
||||
case 'h': /* display help */
|
||||
PrintHelp();
|
||||
exit(0);
|
||||
break;
|
||||
|
||||
case 'p': /* taxid */
|
||||
sscanf(optarg,"%d",&uptree);
|
||||
break;
|
||||
|
||||
case '?': /* bad option */
|
||||
errflag++;
|
||||
}
|
||||
}
|
||||
|
||||
if ((argc - optind) < 1)
|
||||
errflag++;
|
||||
|
||||
if (prefix == NULL)
|
||||
{
|
||||
prefix = getenv("ECOPCRDB");
|
||||
if (prefix == NULL)
|
||||
errflag++;
|
||||
}
|
||||
|
||||
if (errflag && !uptree && !rankname && !subtree && !list)
|
||||
ExitUsage(errflag);
|
||||
|
||||
/**
|
||||
* load taxonomy using libecoPCR functions
|
||||
**/
|
||||
printf("# \n# opening %s database\n",prefix);
|
||||
|
||||
taxonomy = read_taxonomy(prefix,1);
|
||||
tax_count = taxonomy->taxons->count;
|
||||
name_count = taxonomy->names->count;
|
||||
|
||||
|
||||
/* ---------------------------------------- */
|
||||
/* list -r option possibility */
|
||||
/* ---------------------------------------- */
|
||||
if (list)
|
||||
{
|
||||
listfilteroptions(taxonomy->ranks);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* ---------------------------------------- */
|
||||
/* display taxid parent if -t option */
|
||||
/* specified in command line */
|
||||
/* ---------------------------------------- */
|
||||
if (uptree)
|
||||
{
|
||||
gettaxidparents(uptree,taxonomy,rankname);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* ---------------------------------------- */
|
||||
/* display taxid sons if -s option */
|
||||
/* specified in command line */
|
||||
/* ---------------------------------------- */
|
||||
if (subtree)
|
||||
{
|
||||
printheader();
|
||||
subtree_parent = eco_findtaxonbytaxid(taxonomy,subtree);
|
||||
printresult(subtree_parent, NULL,taxonomy);
|
||||
get_son(taxonomy, subtree_parent,&count_son,rankname);
|
||||
printf("# %d son(s) found\n#\n",count_son);
|
||||
return 0;
|
||||
}
|
||||
|
||||
printf("# %d taxons\n", tax_count);
|
||||
|
||||
/**
|
||||
* parse taxonomy
|
||||
**/
|
||||
for (k=optind;k<argc;k++)
|
||||
{
|
||||
printf("#\n# searching for '%s' pattern\n",argv[k]);
|
||||
|
||||
re_error = regcomp (&re_preg, argv[k], REG_NOSUB | REG_EXTENDED | REG_ICASE);
|
||||
if (re_error)
|
||||
{
|
||||
fprintf(stderr,"# misformed pattern '%s'\n",argv[k]);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
nummatch=0;
|
||||
|
||||
printheader();
|
||||
|
||||
for (j=0,name=taxonomy->names->names;
|
||||
j < name_count;
|
||||
name++,j++)
|
||||
{
|
||||
|
||||
if(rankname)
|
||||
rankfilter = !(strcmp(rankname,taxonomy->ranks->label[name->taxon->rank]));
|
||||
|
||||
re_match = regexec (&re_preg, name->name, 0, NULL, 0);
|
||||
|
||||
if (!re_match && (alternative || name->is_scientificname) && rankfilter)
|
||||
{
|
||||
printresult(name->taxon,name,taxonomy);
|
||||
nummatch++;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
printf("# %d records found \n",nummatch);
|
||||
regfree(&re_preg);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
403
src/ecogrep.c
Normal file
403
src/ecogrep.c
Normal file
@ -0,0 +1,403 @@
|
||||
#include "libecoPCR/ecoPCR.h"
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <getopt.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/stat.h>
|
||||
|
||||
|
||||
#define VERSION "0.1"
|
||||
|
||||
void getLineContent(char *stream, ecoseq_t *seq, ecoseq_t *oligoseq_1, ecoseq_t *oligoseq_2){
|
||||
|
||||
int i;
|
||||
char *buffer;
|
||||
|
||||
for( i=0, buffer = strtok(stream,"|");
|
||||
buffer != NULL;
|
||||
i++, buffer = strtok(NULL,"|"))
|
||||
{
|
||||
switch (i) {
|
||||
case 0:
|
||||
seq->AC = strdup(buffer);
|
||||
break;
|
||||
case 4:
|
||||
sscanf(buffer,"%d",&seq->taxid);
|
||||
break;
|
||||
case 13:
|
||||
oligoseq_1->SQ = strdup(buffer);
|
||||
oligoseq_1->SQ_length = strlen(buffer);
|
||||
break;
|
||||
case 15:
|
||||
oligoseq_2->SQ = strdup(buffer);
|
||||
oligoseq_2->SQ_length = strlen(buffer);
|
||||
break;
|
||||
case 18:
|
||||
seq->SQ = strdup(buffer);
|
||||
seq->SQ_length = strlen(buffer);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void freememory(char **tab, int32_t num){
|
||||
int32_t i;
|
||||
for (i=0;i<num-1;i++){
|
||||
ECOFREE(tab[i],"Error in freememory function");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if pattern match a string using mamberall libapat function
|
||||
* @param line array containing sequence information
|
||||
* @param pattern array containing patterns to test on the sequence
|
||||
* @param numpattern number of pattern in pattern array
|
||||
* @param error_max error rate allowed by the user
|
||||
*
|
||||
* @return int 1 if a pattern match, else 0
|
||||
**/
|
||||
int ispatternmatching(ecoseq_t *seq, PatternPtr pattern){
|
||||
if (pattern != NULL)
|
||||
{
|
||||
SeqPtr apatseq = NULL;
|
||||
apatseq=ecoseq2apatseq(seq,apatseq);
|
||||
return ManberAll(apatseq,pattern,0,0,apatseq->seqlen) > 0;
|
||||
}
|
||||
else return 0;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
/* printout help */
|
||||
/* ----------------------------------------------- */
|
||||
#define PP fprintf(stdout,
|
||||
|
||||
static void PrintHelp()
|
||||
{
|
||||
PP "\n------------------------------------------\n");
|
||||
PP " ecogrep Version %s\n", VERSION);
|
||||
PP "------------------------------------------\n");
|
||||
PP " synopsis : filtering ecoPCR result based on\n");
|
||||
PP " taxonomic id filter and regular expression pattern\n");
|
||||
PP " usage: ecogrep [options] filename\n");
|
||||
PP "------------------------------------------\n");
|
||||
PP " options:\n");
|
||||
PP " -1 : [FIRST] : compare the given pattern with direct strand oligonucleotide\n\n");
|
||||
PP " -2 : [SECOND] : compare the given pattern with reverse strand oligonucleotide\n\n");
|
||||
PP " -d : [D]atabase containing taxonomic information\n\n");
|
||||
PP " -e : [E]rrors : max error allowed in pattern match (option-1, -2 and -p) (0 by default)\n\n");
|
||||
PP " -p : [P]attern : oligonucleotide pattern\n\n");
|
||||
PP " -h : [H]elp : print <this> help\n\n");
|
||||
PP " -i : [I]gnore subtree under given taxonomic id\n\n");
|
||||
PP " -r : [R]estrict search to subtree under given taxomic id\n\n");
|
||||
PP " -v : in[V]ert the sense of matching, to select non-matching lines.\n\n");
|
||||
PP " argument:\n");
|
||||
PP " ecoPCR ouput file name\n");
|
||||
PP "------------------------------------------\n\n");
|
||||
PP " http://www.grenoble.prabi.fr/trac/ecoPCR/\n");
|
||||
PP "------------------------------------------\n\n");
|
||||
}
|
||||
|
||||
#undef PP
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
/* printout usage and exit */
|
||||
/* ----------------------------------------------- */
|
||||
|
||||
#define PP fprintf(stderr,
|
||||
|
||||
static void ExitUsage(stat)
|
||||
int stat;
|
||||
{
|
||||
PP "usage: ecogrep [-d database] [-p pattern] [-i taxid] [-r taxid] [-v] [-h] <file name>\n");
|
||||
PP "type \"ecogrep -h\" for help\n");
|
||||
|
||||
if (stat)
|
||||
exit(stat);
|
||||
}
|
||||
|
||||
#undef PP
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
/* MAIN */
|
||||
/* ----------------------------------------------- */
|
||||
|
||||
#define LINE_BUFF_SIZE 10000
|
||||
|
||||
int main(int argc, char **argv){
|
||||
int32_t carg = 0;
|
||||
int32_t r = 0; // number of restricted taxid
|
||||
int32_t i = 0; // number of ignored taxid
|
||||
int32_t v = 0; // stores if -v mode is active
|
||||
int32_t k = 0; // file counter
|
||||
int32_t errflag = 0;
|
||||
int32_t error_max = 0; // stores the error rate allowed by the user
|
||||
int32_t matchingresult = 0; // stores number of matching result
|
||||
|
||||
ecotaxonomy_t *taxonomy; // stores the taxonomy
|
||||
|
||||
ecoseq_t *seq = NULL; // stores sequence info
|
||||
ecoseq_t *oligoseq_1 = NULL; // stores the oligo_1 info
|
||||
ecoseq_t *oligoseq_2 = NULL; // stores the oligo_2 info
|
||||
|
||||
char *database = NULL; // stores the database path (for taxonomy)
|
||||
|
||||
char *p = NULL; // pattern for sequence
|
||||
PatternPtr pattern = NULL; // stores the build pattern for sequence
|
||||
char *o1 = NULL; // pattern for direct strand oligo
|
||||
PatternPtr oligo_1 = NULL; // stores the build pattern for direct strand oligo
|
||||
char *o2 = NULL; // pattern for reverse strand oligo
|
||||
PatternPtr oligo_2 = NULL; // stores the build pattern for reverse strand oligo
|
||||
|
||||
int32_t *restricted_taxid = NULL; // stores the restricted taxid
|
||||
int32_t *ignored_taxid = NULL; // stores the ignored taxid
|
||||
|
||||
FILE *file = NULL; // stores the data stream, stdin by default
|
||||
char *stream = ECOMALLOC(sizeof(char *)*LINE_BUFF_SIZE,"error stream buffer allocation");
|
||||
char *orig = ECOMALLOC(sizeof(char *)*LINE_BUFF_SIZE,"error orig buffer allocation");
|
||||
|
||||
int is_ignored = 0;
|
||||
int is_included = 0;
|
||||
int is_matching = 0;
|
||||
int match_o1 = 0;
|
||||
int match_o2 = 0;
|
||||
int good = 0;
|
||||
|
||||
seq = new_ecoseq();
|
||||
oligoseq_1 = new_ecoseq();
|
||||
oligoseq_2 = new_ecoseq();
|
||||
|
||||
/**
|
||||
* Parse commande line options
|
||||
**/
|
||||
while ((carg = getopt(argc, argv, "1:2:p:d:i:r:e:vh")) != -1) {
|
||||
|
||||
switch (carg) {
|
||||
case '1':
|
||||
o1 = ECOMALLOC(strlen(optarg)+1,
|
||||
"Error on o1 allocation");
|
||||
strcpy(o1,optarg);
|
||||
break;
|
||||
|
||||
case '2':
|
||||
o2 = ECOMALLOC(strlen(optarg)+1,
|
||||
"Error on o2 allocation");
|
||||
strcpy(o2,optarg);
|
||||
break;
|
||||
|
||||
case 'd':
|
||||
database = ECOMALLOC(strlen(optarg)+1,
|
||||
"Error on datafile allocation");
|
||||
strcpy(database,optarg);
|
||||
break;
|
||||
|
||||
case 'i':
|
||||
ignored_taxid = ECOREALLOC( ignored_taxid,
|
||||
sizeof(int32_t)*(i+1),
|
||||
"Error on ignored_taxid reallocation");
|
||||
sscanf(optarg,"%d",&ignored_taxid[i]);
|
||||
i++;
|
||||
break;
|
||||
|
||||
case 'r':
|
||||
restricted_taxid = ECOREALLOC( restricted_taxid,
|
||||
sizeof(int32_t)*(r+1),
|
||||
"Error on restricted_taxid reallocation");
|
||||
sscanf(optarg,"%d",&restricted_taxid[r]);
|
||||
r++;
|
||||
break;
|
||||
|
||||
case 'v':
|
||||
v = 1;
|
||||
break;
|
||||
|
||||
case 'h':
|
||||
PrintHelp();
|
||||
exit(0);
|
||||
break;
|
||||
|
||||
case 'e':
|
||||
sscanf(optarg,"%d",&error_max);
|
||||
break;
|
||||
|
||||
case 'p':
|
||||
p = ECOMALLOC(strlen(optarg)+1,
|
||||
"Error on pattern allocation");
|
||||
strcpy(p,optarg);
|
||||
break;
|
||||
|
||||
case '?':
|
||||
errflag++;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check sequence pattern length and build it in PatternPtr format
|
||||
**/
|
||||
if(p)
|
||||
{
|
||||
if (strlen(p) > 32){
|
||||
printf("# Sorry, ecogrep doesn't handle pattern longer than 32 characters.\
|
||||
\n# Please check it out : %s\n",p);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
else if ( (pattern = buildPattern(p,error_max)) == NULL)
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Check o1 pattern length and build it in PatternPtr format
|
||||
**/
|
||||
if(o1)
|
||||
{
|
||||
if (strlen(o1) > 32){
|
||||
printf("# Sorry, ecogrep doesn't handle pattern longer than 32 characters.\
|
||||
\n# Please check it out : %s\n",o1);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
else if ( (oligo_1 = buildPattern(o1,error_max)) == NULL)
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check o2 pattern length and build it in PatternPtr format
|
||||
**/
|
||||
if(o2)
|
||||
{
|
||||
if (strlen(o2) > 32){
|
||||
printf("# Sorry, ecogrep doesn't handle pattern longer than 32 characters.\
|
||||
\n# Please check it out : %s\n",o2);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
else if ( (oligo_2 = buildPattern(o2,error_max)) == NULL)
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
/**
|
||||
* try to get the database name from environment variable
|
||||
* if no database name specified in the -d option
|
||||
**/
|
||||
if (database == NULL)
|
||||
{
|
||||
database = getenv("ECOPCRDB");
|
||||
if (database == NULL)
|
||||
errflag++;
|
||||
}
|
||||
|
||||
/**
|
||||
* check at leat one processing is asked
|
||||
* either patterns or taxid filters
|
||||
**/
|
||||
if ( !p && !o1 && !o2 && restricted_taxid == NULL && ignored_taxid == NULL )
|
||||
{
|
||||
errflag++;
|
||||
}
|
||||
if (errflag)
|
||||
ExitUsage(errflag);
|
||||
|
||||
/**
|
||||
* Get the taxonomy back
|
||||
**/
|
||||
taxonomy = read_taxonomy(database,0);
|
||||
|
||||
/**
|
||||
* Parse the stream
|
||||
**/
|
||||
for (k=0 ; argc >= optind ; optind++, k++){
|
||||
|
||||
matchingresult = 0;
|
||||
|
||||
if ( (file = fopen(argv[optind], "r")) == NULL)
|
||||
{
|
||||
if (isatty(fileno(stdin)) == 0)
|
||||
{
|
||||
file = stdin;
|
||||
printf("# Processing standard input...\n");
|
||||
}
|
||||
else
|
||||
break;
|
||||
}
|
||||
else
|
||||
printf("# Processing %s...\n",argv[optind]);
|
||||
|
||||
while( fgets(stream, LINE_BUFF_SIZE, file) != NULL ){
|
||||
|
||||
if (stream[0]!= '#')
|
||||
{
|
||||
|
||||
stream[LINE_BUFF_SIZE-1]=0;
|
||||
|
||||
strcpy(orig,stream);
|
||||
|
||||
getLineContent(stream,seq,oligoseq_1,oligoseq_2);
|
||||
|
||||
/* -----------------------------------------------*/
|
||||
/* is ignored if at least one option -i */
|
||||
/* AND */
|
||||
/* if current sequence is son of taxid */
|
||||
/* -----------------------------------------------*/
|
||||
is_ignored = ( (i > 0) && (eco_is_taxid_included( taxonomy,
|
||||
ignored_taxid,
|
||||
i,
|
||||
seq->taxid))
|
||||
);
|
||||
|
||||
/* -----------------------------------------------*/
|
||||
/* is included if no -r option */
|
||||
/* OR */
|
||||
/* if current sequence is son of taxid */
|
||||
/* -----------------------------------------------*/
|
||||
is_included = ( (r == 0) || (eco_is_taxid_included( taxonomy,
|
||||
restricted_taxid,
|
||||
r,
|
||||
seq->taxid))
|
||||
);
|
||||
|
||||
/* ----------------------------------------------------------- */
|
||||
/* match if no pattern or if pattern match current sequence */
|
||||
/* ----------------------------------------------------------- */
|
||||
is_matching = ( !p || (ispatternmatching(seq,pattern)));
|
||||
|
||||
/* ---------------------------------------------------------------------------- */
|
||||
/* match if no direct oligo pattern or if pattern match current direct oligo */
|
||||
/* ---------------------------------------------------------------------------- */
|
||||
match_o1 = (!o1 || (ispatternmatching(oligoseq_1,oligo_1)));
|
||||
|
||||
/* ------------------------------------------------------------------------------- */
|
||||
/* match if no revesrse oligo pattern or if pattern match current reverse oligo */
|
||||
/* ------------------------------------------------------------------------------- */
|
||||
match_o2 = (!o2 || (ispatternmatching(oligoseq_2,oligo_2)));
|
||||
|
||||
good = (is_included && is_matching && !is_ignored && match_o1 && match_o2);
|
||||
|
||||
if (v)
|
||||
good=!good;
|
||||
|
||||
if ( good )
|
||||
{
|
||||
printf("%s",orig);
|
||||
matchingresult++;
|
||||
}
|
||||
}
|
||||
}
|
||||
if ( file != stdin )
|
||||
fclose(file);
|
||||
|
||||
printf("# %d matching result(s)\n#\n",matchingresult);
|
||||
}
|
||||
|
||||
/**
|
||||
* clean and free before leaving
|
||||
**/
|
||||
ECOFREE(orig,"Error in free orig");
|
||||
ECOFREE(stream,"Error in free stream");
|
||||
ECOFREE(ignored_taxid,"Error in free stream");
|
||||
ECOFREE(restricted_taxid,"Error in free stream");
|
||||
|
||||
return 0;
|
||||
}
|
123
src/ecoisundertaxon.c
Normal file
123
src/ecoisundertaxon.c
Normal file
@ -0,0 +1,123 @@
|
||||
#include "libecoPCR/ecoPCR.h"
|
||||
#include <getopt.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#define VERSION "0.1"
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
/* printout verbose mode */
|
||||
/* ----------------------------------------------- */
|
||||
static void printTaxon(ecotx_t *taxon){
|
||||
printf("# taxid : %d | rank : %d | name : %s \n\n",taxon->taxid, taxon->rank, taxon->name);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
/* printout help */
|
||||
/* ----------------------------------------------- */
|
||||
#define PP fprintf(stdout,
|
||||
|
||||
static void PrintHelp()
|
||||
{
|
||||
PP "\n------------------------------------------\n");
|
||||
PP " ecoisundertaxon Version %s\n", VERSION);
|
||||
PP "------------------------------------------\n");
|
||||
PP " synopsis : searching relationship in taxonomy\n");
|
||||
PP " usage: ecoisundertaxon [options] database\n");
|
||||
PP "------------------------------------------\n");
|
||||
PP " options:\n");
|
||||
PP " -1 : [FIRST] taxomic id of the hypothetical son\n\n");
|
||||
PP " -2 : [SECOND] taxonomic id of the hypothetical parent\n\n");
|
||||
PP " -h : [H]elp - print <this> help\n\n");
|
||||
PP " -v : [V]erbose mode. Display taxonomic information for both\n");
|
||||
PP " : taxonomic id.\n\n");
|
||||
PP "------------------------------------------\n");
|
||||
PP " database : to match the expected format, the database\n");
|
||||
PP " has to be formated first by the ecoPCRFormat.py program located.\n");
|
||||
PP " in the tools directory. Type the radical only, leaving out the extension\n");
|
||||
PP "------------------------------------------\n\n");
|
||||
PP " https://www.grenoble.prabi.fr/trac/ecoPCR/wiki");
|
||||
PP "------------------------------------------\n\n");
|
||||
}
|
||||
|
||||
#undef PP
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
/* printout usage and exit */
|
||||
/* ----------------------------------------------- */
|
||||
|
||||
#define PP fprintf(stderr,
|
||||
|
||||
static void ExitUsage(stat)
|
||||
int stat;
|
||||
{
|
||||
PP "usage: ecoisundertaxon [-1 taxid] [-2 taxid] [-v] [-h] datafile\n");
|
||||
PP "type \"ecoisundertaxon -h\" for help\n");
|
||||
|
||||
if (stat)
|
||||
exit(stat);
|
||||
}
|
||||
|
||||
#undef PP
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
/* MAIN */
|
||||
/* ----------------------------------------------- */
|
||||
|
||||
int main(int argc, char **argv){
|
||||
int32_t carg = 0;
|
||||
int32_t taxid_1 = 0;
|
||||
int32_t taxid_2 = 0;
|
||||
int32_t verbose = 0;
|
||||
int32_t errflag = 0;
|
||||
ecotaxonomy_t *taxonomy = NULL;
|
||||
ecotx_t *son = NULL;
|
||||
ecotx_t *parent = NULL;
|
||||
|
||||
|
||||
while ((carg = getopt(argc, argv, "1:2:vh")) != -1) {
|
||||
switch (carg) {
|
||||
case '1':
|
||||
sscanf(optarg,"%d",&taxid_1);
|
||||
break;
|
||||
|
||||
case '2':
|
||||
sscanf(optarg,"%d",&taxid_2);
|
||||
break;
|
||||
|
||||
case 'v':
|
||||
verbose = 1;
|
||||
break;
|
||||
|
||||
case 'h':
|
||||
PrintHelp();
|
||||
exit(0);
|
||||
break;
|
||||
|
||||
case '?':
|
||||
errflag++;
|
||||
}
|
||||
}
|
||||
|
||||
if ((argc -= optind) != 1)
|
||||
errflag++;
|
||||
|
||||
if (errflag)
|
||||
ExitUsage(errflag);
|
||||
|
||||
taxonomy = read_taxonomy(argv[optind],0);
|
||||
|
||||
son = eco_findtaxonbytaxid(taxonomy, taxid_1);
|
||||
|
||||
if (verbose){
|
||||
parent = eco_findtaxonbytaxid(taxonomy, taxid_2);
|
||||
printTaxon(son);
|
||||
printTaxon(parent);
|
||||
}
|
||||
|
||||
if (eco_isundertaxon(son, taxid_2))
|
||||
printf("# taxid_1 (%d) is son of taxid_2 (%d)\n",taxid_1, taxid_2);
|
||||
else
|
||||
printf("# taxid_1 (%d) is NOT son of taxid_2 (%d)\n",taxid_1, taxid_2);
|
||||
|
||||
return 0;
|
||||
}
|
411
src/ecopcr.c
411
src/ecopcr.c
@ -8,104 +8,69 @@
|
||||
#define VERSION "0.1"
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
/* printout help */ /* ----------------------------------------------- */
|
||||
|
||||
/* printout help */
|
||||
/* ----------------------------------------------- */
|
||||
#define PP fprintf(stdout,
|
||||
|
||||
static void PrintHelp()
|
||||
{
|
||||
PP "------------------------------------------\n");
|
||||
PP " Apat Version %s\n", VERSION);
|
||||
PP " ecoPCR Version %s\n", VERSION);
|
||||
PP "------------------------------------------\n");
|
||||
PP "synopsis : pattern(s) searching program\n");
|
||||
PP "usage: apat [options] patfile datafile\n");
|
||||
PP "synopsis : searching for sequence and taxonomy hybriding with given primers\n");
|
||||
PP "usage: ecoPCR [options] <nucleotidic patterns>\n");
|
||||
PP "------------------------------------------\n");
|
||||
PP "options:\n");
|
||||
PP "-a code : [A]lphabet encoding for pattern\n");
|
||||
PP " code is one of : \n");
|
||||
PP " dna: use IUPAC equivalences for dna/rna\n");
|
||||
PP " prot: use IUPAC equivalences for proteins\n");
|
||||
PP " alpha: no equivalences, just treat plain symbols\n");
|
||||
PP " note: the equivalences are used in pattern only\n");
|
||||
PP " *not* in sequence(s) (see note (4) below)\n");
|
||||
PP " dft: alpha\n");
|
||||
PP "-c : [C]ooccurences\n");
|
||||
PP " print patterns cooccurence matrix \n");
|
||||
PP " dft: off\n");
|
||||
PP "-h : [H]elp - print <this> help\n");
|
||||
PP "-m : [M]ultiple occurences\n");
|
||||
PP " see -q option \n");
|
||||
PP " dft: off\n");
|
||||
PP "-o file : [O]utput sequences\n");
|
||||
PP " additionaly output sequence(s) that match into\n");
|
||||
PP " 'file' in fasta format\n");
|
||||
PP " dft: off\n");
|
||||
PP "-p : no [Print] - don't printout hits\n");
|
||||
PP " when just counts are needed\n");
|
||||
PP " dft: off\n");
|
||||
PP "-q nn : [Quorum]\n");
|
||||
PP " printout result if at least nn\n");
|
||||
PP " different patterns are found on the sequence\n");
|
||||
PP " (with -m : at least nn different <hits>)\n");
|
||||
PP " dft: # of patterns read\n");
|
||||
PP "-s : no [Sort] - don't sort hits before printing\n");
|
||||
PP " usually hits are printed by increasing position\n");
|
||||
PP " this option will list them by pattern\n");
|
||||
PP " dft: off\n");
|
||||
PP "-t : [T]est sequence\n");
|
||||
PP " additionnaly check if sequences are uppercase\n");
|
||||
PP " this is mostly used for testing\n");
|
||||
PP " dft: off\n");
|
||||
PP "-u : [U]pper\n");
|
||||
PP " force lower->upper sequence conversion\n");
|
||||
PP " without this option lowercase symbols in sequence\n");
|
||||
PP " will not be considered to as matches\n");
|
||||
PP " dft: off\n");
|
||||
PP "-v : [V]erbose\n");
|
||||
PP " just display a kind of progress clock on stderr\n");
|
||||
PP " (this is only useful if you redirect stdout)\n");
|
||||
PP "\n");
|
||||
PP "patfile : pattern file (see below)\n");
|
||||
PP "datafile : database file (see below)\n");
|
||||
PP "------------------------------------------\n");
|
||||
PP "pattern file format :\n");
|
||||
PP " one pattern/line\n");
|
||||
PP " format : <pattern> <space> #errors\n");
|
||||
PP " <pattern> := pattern<symbol>\n");
|
||||
PP " or !pattern<symbol>\n");
|
||||
PP " or pattern<symbol>#\n");
|
||||
PP " or !pattern<symbol>#\n");
|
||||
PP " <symbol> := <letter>\n");
|
||||
PP " or [<letter>....<letter>]\n");
|
||||
PP " <letter> := uppercase letter (A-Z)\n");
|
||||
PP " <number> := a positive number indicates max number of mismatches\n");
|
||||
PP " a negative number indicates max number of mismatches or indels\n");
|
||||
PP " # means that no error is allowed at this position\n");
|
||||
PP " ! complement the <symbol>\n");
|
||||
PP " [...] means that all symbols within [] are allowed\n");
|
||||
PP " in addition IUPAC equivalences may be used as symbols\n");
|
||||
PP " with the -a option\n");
|
||||
PP "\n");
|
||||
PP "example: G[DE]S#[GIV]!HP![DE]# 1\n");
|
||||
PP "-d : [D]atabase : to match the expected format, the database\n");
|
||||
PP " has to be formated first by the ecoPCRFormat.py program located.\n");
|
||||
PP " in the tools directory.\n");
|
||||
PP " ecoPCRFormat.py creates three file types :\n");
|
||||
PP " .sdx : contains the sequences\n");
|
||||
PP " .tdx : contains information concerning the taxonomy\n");
|
||||
PP " .rdx : contains the taxonomy rank\n\n");
|
||||
PP " ecoPCR needs all the file type. As a result, you have to write the\n");
|
||||
PP " database radical without any extension. For example /ecoPCRDB/gbmam\n\n");
|
||||
PP "-e : [E]rror : max error allowed by oligonucleotide (0 by default)\n\n");
|
||||
PP "-h : [H]elp - print <this> help\n\n");
|
||||
PP "-i : [I]gnore the given taxonomy id.\n");
|
||||
PP " Taxonomy id are available using the ecofind program.\n");
|
||||
PP " see its help typing ecofind -h for more information.\n\n");
|
||||
PP "-k : [K]ingdom mode : set the kingdom mode\n");
|
||||
PP " super kingdom mode by default.\n\n");
|
||||
PP "-l : minimum [L]ength : define the minimum amplication length. \n\n");
|
||||
PP "-L : maximum [L]ength : define the maximum amplicationlength. \n\n");
|
||||
PP "-r : [R]estricts the search to the given taxonomic id.\n");
|
||||
PP " Taxonomy id are available using the ecofind program.\n");
|
||||
PP " see its help typing ecofind -h for more information.\n");
|
||||
PP "\n");
|
||||
PP "------------------------------------------\n");
|
||||
PP "datafile contains one or more sequences in\n");
|
||||
PP "Fasta format, with *uppercase* symbols \n");
|
||||
PP "\n");
|
||||
PP "first argument : oligonucleotide for direct strand\n\n");
|
||||
PP "second argument : oligonucleotide for reverse strand\n\n");
|
||||
PP "------------------------------------------\n");
|
||||
PP "note (1): the maximum number of patterns is %d\n", MAX_PATTERN);
|
||||
PP "\n");
|
||||
PP "note (2): the maximum length for one pattern is %d\n", MAX_PAT_LEN);
|
||||
PP "\n");
|
||||
PP "note (3): indels are still experimental and are :\n");
|
||||
PP " not handled gracefully with the # syntax\n");
|
||||
PP " and hits are not printed very nicely\n");
|
||||
PP "\n");
|
||||
PP "note (4): the IUPAC equivalences (-a option) are used\n");
|
||||
PP " in pattern only *not* in sequence(s).\n");
|
||||
PP " for instance GATN (with option -a dna) is equivalent to GAT[ACGT]\n");
|
||||
PP " and will match GATA/GATC/GATG/GATC but will not match GATN\n");
|
||||
PP " (nor NNNN) in sequence.\n");
|
||||
PP "Table result description : \n");
|
||||
PP "column 1 : accession number\n");
|
||||
PP "column 2 : sequence length\n");
|
||||
PP "column 3 : taxonomic id\n");
|
||||
PP "column 4 : rank\n");
|
||||
PP "column 5 : species taxonomic id\n");
|
||||
PP "column 6 : scientific name\n");
|
||||
PP "column 7 : genus taxonomic id\n");
|
||||
PP "column 8 : genus name\n");
|
||||
PP "column 9 : family taxonomic id\n");
|
||||
PP "column 10 : family name\n");
|
||||
PP "column 11 : super kingdom taxonomic id\n");
|
||||
PP "column 12 : super kingdom name\n");
|
||||
PP "column 13 : strand (direct or reverse)\n");
|
||||
PP "column 14 : first oligonucleotide\n");
|
||||
PP "column 15 : number of errors for the first strand\n");
|
||||
PP "column 16 : second oligonucleotide\n");
|
||||
PP "column 17 : number of errors for the second strand\n");
|
||||
PP "column 18 : amplification length\n");
|
||||
PP "column 19 : sequence\n");
|
||||
PP "column 20 : definition\n");
|
||||
PP "------------------------------------------\n");
|
||||
PP " http://www.grenoble.prabi.fr/trac/ecoPCR/\n");
|
||||
PP "------------------------------------------\n\n");
|
||||
PP "\n");
|
||||
|
||||
}
|
||||
@ -121,10 +86,8 @@ static void PrintHelp()
|
||||
static void ExitUsage(stat)
|
||||
int stat;
|
||||
{
|
||||
PP "usage: apat [-a dna|prot] [-c] [-h] [-m] [-o file] [-p]\n");
|
||||
PP " [-q nn] [-t] [-u] [-v]\n");
|
||||
PP " patfile datafile\n");
|
||||
PP "type \"apat -h\" for help\n");
|
||||
PP "usage: ecoPCR [-d database] [-l value] [-L value] [-e value] [-r taxid] [-i taxid] [-k] oligo1 oligo2\n");
|
||||
PP "type \"ecoPCR -h\" for help\n");
|
||||
|
||||
if (stat)
|
||||
exit(stat);
|
||||
@ -306,7 +269,7 @@ int main(int argc, char **argv)
|
||||
int32_t errflag=0;
|
||||
char kingdom_mode=0;
|
||||
|
||||
char *prefix;
|
||||
char *prefix = NULL;
|
||||
|
||||
int32_t checkedSequence = 0;
|
||||
int32_t positiveSequence= 0;
|
||||
@ -330,43 +293,39 @@ int main(int argc, char **argv)
|
||||
int32_t erri;
|
||||
int32_t errj;
|
||||
|
||||
int32_t *restricted_taxid = NULL;
|
||||
int32_t *ignored_taxid = NULL;
|
||||
int32_t r=0;
|
||||
int32_t g=0;
|
||||
|
||||
while ((carg = getopt(argc, argv, "h1:2:l:L:e:k")) != -1) {
|
||||
|
||||
while ((carg = getopt(argc, argv, "hd:l:L:e:i:r:k")) != -1) {
|
||||
|
||||
switch (carg) {
|
||||
/* -------------------- */
|
||||
case '1': /* prenier oligo */
|
||||
case 'd': /* database name */
|
||||
/* -------------------- */
|
||||
oligo1 = ECOMALLOC(strlen(optarg)+1,
|
||||
"Error on oligo 1 allocation");
|
||||
strcpy(oligo1,optarg);
|
||||
break;
|
||||
|
||||
/* -------------------- */
|
||||
case '2': /* coocurence option */
|
||||
/* -------------------- */
|
||||
oligo2 = ECOMALLOC(strlen(optarg)+1,
|
||||
"Error on oligo 1 allocation");
|
||||
strcpy(oligo2,optarg);
|
||||
prefix = ECOMALLOC(strlen(optarg)+1,
|
||||
"Error on prefix allocation");
|
||||
strcpy(prefix,optarg);
|
||||
break;
|
||||
|
||||
/* -------------------- */
|
||||
case 'h': /* help */
|
||||
/* -------------------- */
|
||||
|
||||
PrintHelp();
|
||||
exit(0);
|
||||
break;
|
||||
|
||||
/* -------------------- */
|
||||
case 'l': /* lmin amplification */
|
||||
/* -------------------- */
|
||||
/* ------------------------- */
|
||||
case 'l': /* min amplification lenght */
|
||||
/* ------------------------- */
|
||||
sscanf(optarg,"%d",&lmin);
|
||||
break;
|
||||
|
||||
/* -------------------- */
|
||||
case 'L': /* lmax amplification */
|
||||
/* -------------------- */
|
||||
/* -------------------------- */
|
||||
case 'L': /* max amplification lenght */
|
||||
/* -------------------------- */
|
||||
sscanf(optarg,"%d",&lmax);
|
||||
break;
|
||||
|
||||
@ -374,11 +333,29 @@ int main(int argc, char **argv)
|
||||
case 'e': /* error max */
|
||||
/* -------------------- */
|
||||
sscanf(optarg,"%d",&error_max);
|
||||
break;
|
||||
|
||||
/* -------------------- */
|
||||
case 'k': /* set the kingdom mode */
|
||||
kingdom_mode = 1; /* -------------------- */
|
||||
break;
|
||||
|
||||
/* ------------------------------------------ */
|
||||
case 'r': /* stores the restricting search taxonomic id */
|
||||
/* ------------------------------------------ */
|
||||
restricted_taxid = ECOREALLOC(restricted_taxid,sizeof(int32_t)*(r+1),
|
||||
"Error on restricted_taxid reallocation");
|
||||
sscanf(optarg,"%d",&restricted_taxid[r]);
|
||||
r++;
|
||||
break;
|
||||
|
||||
case 'k': /* error max */
|
||||
/* -------------------- */
|
||||
kingdom_mode = 1;
|
||||
/* --------------------------------- */
|
||||
case 'i': /* stores the taxonomic id to ignore */
|
||||
/* --------------------------------- */
|
||||
ignored_taxid = ECOREALLOC(ignored_taxid,sizeof(int32_t)*(g+1),
|
||||
"Error on excluded_taxid reallocation");
|
||||
sscanf(optarg,"%d",&ignored_taxid[g]);
|
||||
g++;
|
||||
break;
|
||||
|
||||
/* -------------------- */
|
||||
@ -389,25 +366,43 @@ int main(int argc, char **argv)
|
||||
|
||||
}
|
||||
|
||||
if ((argc -= optind) != 1)
|
||||
errflag++;
|
||||
|
||||
/**
|
||||
* check the path to the database is given as last argument
|
||||
*/
|
||||
if ((argc -= optind) == 2)
|
||||
{
|
||||
|
||||
oligo1 = ECOMALLOC(strlen(argv[optind])+1,
|
||||
"Error on oligo1 allocation");
|
||||
strcpy(oligo1,argv[optind]);
|
||||
optind++;
|
||||
oligo2 = ECOMALLOC(strlen(argv[optind])+1,
|
||||
"Error on oligo1 allocation");
|
||||
strcpy(oligo2,argv[optind]);
|
||||
}
|
||||
else
|
||||
errflag++;
|
||||
|
||||
if (prefix == NULL)
|
||||
{
|
||||
prefix = getenv("ECOPCRDB");
|
||||
if (prefix == NULL)
|
||||
errflag++;
|
||||
}
|
||||
|
||||
|
||||
if (!oligo1 || !oligo2)
|
||||
errflag++;
|
||||
|
||||
if (errflag)
|
||||
ExitUsage(errflag);
|
||||
|
||||
prefix = argv[optind];
|
||||
|
||||
o1 = buildPattern(oligo1,error_max);
|
||||
o2 = buildPattern(oligo2,error_max);
|
||||
|
||||
o1c = complementPattern(o1);
|
||||
o2c = complementPattern(o2);
|
||||
|
||||
|
||||
|
||||
printf("#\n");
|
||||
printf("# ecoPCR version %s\n",VERSION);
|
||||
printf("# direct strand oligo1 : %-32s ; oligo2c : %32s\n", o1->cpat,o2c->cpat);
|
||||
@ -427,102 +422,120 @@ int main(int argc, char **argv)
|
||||
printf("# output in superkingdom mode\n");
|
||||
printf("#\n");
|
||||
|
||||
taxonomy = read_taxonomy(prefix);
|
||||
|
||||
taxonomy = read_taxonomy(prefix,0);
|
||||
|
||||
seq = ecoseq_iterator(prefix);
|
||||
|
||||
|
||||
|
||||
checkedSequence = 0;
|
||||
positiveSequence= 0;
|
||||
amplifiatCount = 0;
|
||||
|
||||
while(seq)
|
||||
{
|
||||
{
|
||||
checkedSequence++;
|
||||
|
||||
scname = taxonomy->taxons->taxon[seq->taxid].name;
|
||||
strncpy(head,seq->SQ,10);
|
||||
head[10]=0;
|
||||
strncpy(tail,seq->SQ+seq->SQ_length-10,10);
|
||||
tail[10]=0;
|
||||
|
||||
/**
|
||||
* check if current sequence should be included
|
||||
**/
|
||||
if ( (r == 0) ||
|
||||
(eco_is_taxid_included(taxonomy,
|
||||
restricted_taxid,
|
||||
r,
|
||||
taxonomy->taxons->taxon[seq->taxid].taxid)
|
||||
)
|
||||
)
|
||||
if ((g == 0) ||
|
||||
!(eco_is_taxid_included(taxonomy,
|
||||
ignored_taxid,
|
||||
g,
|
||||
taxonomy->taxons->taxon[seq->taxid].taxid)
|
||||
)
|
||||
)
|
||||
{
|
||||
|
||||
apatseq=ecoseq2apatseq(seq,apatseq);
|
||||
scname = taxonomy->taxons->taxon[seq->taxid].name;
|
||||
strncpy(head,seq->SQ,10);
|
||||
head[10]=0;
|
||||
strncpy(tail,seq->SQ+seq->SQ_length-10,10);
|
||||
tail[10]=0;
|
||||
|
||||
o1Hits = ManberAll(apatseq,o1,0,0,apatseq->seqlen);
|
||||
o2cHits= 0;
|
||||
|
||||
|
||||
if (o1Hits)
|
||||
{
|
||||
stktmp = apatseq->hitpos[0];
|
||||
begin = stktmp->val[0] + o1->patlen;
|
||||
|
||||
if (lmax)
|
||||
length= stktmp->val[stktmp->top-1] + o1->patlen - begin + lmax + o2->patlen;
|
||||
else
|
||||
length= apatseq->seqlen - begin;
|
||||
apatseq=ecoseq2apatseq(seq,apatseq);
|
||||
|
||||
o2cHits = ManberAll(apatseq,o2c,1,begin,length);
|
||||
|
||||
if (o2cHits)
|
||||
for (i=0; i < o1Hits;i++)
|
||||
{
|
||||
posi = apatseq->hitpos[0]->val[i];
|
||||
erri = apatseq->hiterr[0]->val[i];
|
||||
for (j=0; j < o2cHits; j++)
|
||||
{
|
||||
posj =apatseq->hitpos[1]->val[j] + o2c->patlen;
|
||||
errj =apatseq->hiterr[1]->val[j];
|
||||
length=posj - posi + 1 - o1->patlen - o2->patlen;
|
||||
|
||||
if ((!lmin || (length >= lmin)) &&
|
||||
(!lmax || (length <= lmax)))
|
||||
printRepeat(seq,o1,o2c,'D',kingdom_mode,posi,posj,erri,errj,taxonomy);
|
||||
//printf("%s\tD\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o1Hits,o2cHits,posi,posj,scname);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
o2Hits = ManberAll(apatseq,o2,2,0,apatseq->seqlen);
|
||||
o1cHits= 0;
|
||||
if (o2Hits)
|
||||
{
|
||||
stktmp = apatseq->hitpos[2];
|
||||
begin = stktmp->val[0] + o2->patlen;
|
||||
|
||||
if (lmax)
|
||||
length= stktmp->val[stktmp->top-1] + o2->patlen - begin + lmax + o1->patlen;
|
||||
else
|
||||
length= apatseq->seqlen - begin;
|
||||
o1Hits = ManberAll(apatseq,o1,0,0,apatseq->seqlen);
|
||||
o2cHits= 0;
|
||||
|
||||
o1cHits = ManberAll(apatseq,o1c,3,begin,length);
|
||||
|
||||
if (o1cHits)
|
||||
for (i=0; i < o2Hits;i++)
|
||||
if (o1Hits)
|
||||
{
|
||||
posi = apatseq->hitpos[2]->val[i];
|
||||
erri = apatseq->hiterr[2]->val[i];
|
||||
for (j=0; j < o1cHits; j++)
|
||||
{
|
||||
posj=apatseq->hitpos[3]->val[j] + o1c->patlen;
|
||||
errj=apatseq->hiterr[3]->val[j];
|
||||
length=posj - posi + 1 - o1->patlen - o2->patlen;
|
||||
stktmp = apatseq->hitpos[0];
|
||||
begin = stktmp->val[0] + o1->patlen;
|
||||
|
||||
if (lmax)
|
||||
length= stktmp->val[stktmp->top-1] + o1->patlen - begin + lmax + o2->patlen;
|
||||
else
|
||||
length= apatseq->seqlen - begin;
|
||||
|
||||
if ((!lmin || (length >= lmin)) &&
|
||||
(!lmax || (length <= lmax)))
|
||||
printRepeat(seq,o2,o1c,'R',kingdom_mode,posi,posj,erri,errj,taxonomy);
|
||||
//printf("%s\tR\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o2Hits,o1cHits,posi,posj,scname);
|
||||
}
|
||||
o2cHits = ManberAll(apatseq,o2c,1,begin,length);
|
||||
|
||||
if (o2cHits)
|
||||
for (i=0; i < o1Hits;i++)
|
||||
{
|
||||
posi = apatseq->hitpos[0]->val[i];
|
||||
erri = apatseq->hiterr[0]->val[i];
|
||||
for (j=0; j < o2cHits; j++)
|
||||
{
|
||||
posj =apatseq->hitpos[1]->val[j] + o2c->patlen;
|
||||
errj =apatseq->hiterr[1]->val[j];
|
||||
length=posj - posi + 1 - o1->patlen - o2->patlen;
|
||||
|
||||
if ((!lmin || (length >= lmin)) &&
|
||||
(!lmax || (length <= lmax)))
|
||||
printRepeat(seq,o1,o2c,'D',kingdom_mode,posi,posj,erri,errj,taxonomy);
|
||||
//printf("%s\tD\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o1Hits,o2cHits,posi,posj,scname);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
o2Hits = ManberAll(apatseq,o2,2,0,apatseq->seqlen);
|
||||
o1cHits= 0;
|
||||
if (o2Hits)
|
||||
{
|
||||
stktmp = apatseq->hitpos[2];
|
||||
begin = stktmp->val[0] + o2->patlen;
|
||||
|
||||
if (lmax)
|
||||
length= stktmp->val[stktmp->top-1] + o2->patlen - begin + lmax + o1->patlen;
|
||||
else
|
||||
length= apatseq->seqlen - begin;
|
||||
|
||||
o1cHits = ManberAll(apatseq,o1c,3,begin,length);
|
||||
|
||||
if (o1cHits)
|
||||
for (i=0; i < o2Hits;i++)
|
||||
{
|
||||
posi = apatseq->hitpos[2]->val[i];
|
||||
erri = apatseq->hiterr[2]->val[i];
|
||||
for (j=0; j < o1cHits; j++)
|
||||
{
|
||||
posj=apatseq->hitpos[3]->val[j] + o1c->patlen;
|
||||
errj=apatseq->hiterr[3]->val[j];
|
||||
length=posj - posi + 1 - o1->patlen - o2->patlen;
|
||||
|
||||
if ((!lmin || (length >= lmin)) &&
|
||||
(!lmax || (length <= lmax)))
|
||||
printRepeat(seq,o2,o1c,'R',kingdom_mode,posi,posj,erri,errj,taxonomy);
|
||||
//printf("%s\tR\t%s...%s (%d)\t%d\t%d\t%d\t%d\t%s\n",seq->AC,head,tail,seq->SQ_length,o2Hits,o1cHits,posi,posj,scname);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} /* End of taxonomic selection */
|
||||
|
||||
delete_ecoseq(seq);
|
||||
|
||||
seq = ecoseq_iterator(NULL);
|
||||
}
|
||||
|
||||
ECOFREE(restricted_taxid, "Error: could not free restricted_taxid\n");
|
||||
ECOFREE(ignored_taxid, "Error: could not free excluded_taxid\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
18
src/global.mk
Normal file
18
src/global.mk
Normal file
@ -0,0 +1,18 @@
|
||||
MACHINE=MAC_OS_X
|
||||
LIBPATH= -Llibapat -LlibecoPCR
|
||||
MAKEDEPEND = gcc -D$(MACHINE) -M $(CPPFLAGS) -o $*.d $<
|
||||
|
||||
CC=gcc
|
||||
CFLAGS= -W -Wall -O2 -g
|
||||
|
||||
default: all
|
||||
|
||||
%.o: %.c
|
||||
$(CC) -D$(MACHINE) $(CFLAGS) -c -o $@ $<
|
||||
|
||||
%.P : %.c
|
||||
$(MAKEDEPEND)
|
||||
@sed 's/\($*\)\.o[ :]*/\1.o $@ : /g' < $*.d > $@; \
|
||||
rm -f $*.d; [ -s $@ ] || rm -f $@
|
||||
|
||||
include $(SRCS:.c=.P)
|
23
src/libapat/Makefile
Normal file
23
src/libapat/Makefile
Normal file
@ -0,0 +1,23 @@
|
||||
|
||||
SOURCES = apat_parse.c \
|
||||
apat_search.c \
|
||||
libstki.c
|
||||
|
||||
SRCS=$(SOURCES)
|
||||
|
||||
|
||||
OBJECTS= $(patsubst %.c,%.o,$(SOURCES))
|
||||
|
||||
LIBFILE= libapat.a
|
||||
|
||||
|
||||
include ../global.mk
|
||||
|
||||
all: $(LIBFILE)
|
||||
|
||||
clean:
|
||||
rm -rf $(OBJECTS) $(LIBFILE)
|
||||
|
||||
$(LIBFILE): $(OBJECTS)
|
||||
ar -cr $@ $?
|
||||
|
29
src/libecoPCR/Makefile
Normal file
29
src/libecoPCR/Makefile
Normal file
@ -0,0 +1,29 @@
|
||||
|
||||
SOURCES = ecoapat.c \
|
||||
ecodna.c \
|
||||
ecoError.c \
|
||||
ecoIOUtils.c \
|
||||
ecoMalloc.c \
|
||||
ecorank.c \
|
||||
ecoseq.c \
|
||||
ecotax.c \
|
||||
ecofilter.c \
|
||||
econame.c
|
||||
|
||||
SRCS=$(SOURCES)
|
||||
|
||||
OBJECTS= $(patsubst %.c,%.o,$(SOURCES))
|
||||
|
||||
LIBFILE= libecoPCR.a
|
||||
|
||||
|
||||
include ../global.mk
|
||||
|
||||
|
||||
all: $(LIBFILE)
|
||||
|
||||
clean:
|
||||
rm -rf $(OBJECTS) $(LIBFILE)
|
||||
|
||||
$(LIBFILE): $(OBJECTS)
|
||||
ar -cr $@ $?
|
@ -2,7 +2,15 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
|
||||
/*
|
||||
* print the message given as argument and exit the program
|
||||
* @param error error number
|
||||
* @param message the text explaining what's going on
|
||||
* @param filename the file source where the program failed
|
||||
* @param linenumber the line where it has failed
|
||||
* filename and linenumber are written at pre-processing
|
||||
* time by a macro
|
||||
*/
|
||||
void ecoError(int32_t error,
|
||||
const char* message,
|
||||
const char * filename,
|
||||
|
@ -16,20 +16,19 @@ int32_t is_big_endian()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
int32_t swap_int32_t(int32_t i)
|
||||
{
|
||||
return SWAPINT32(i);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Read part of the file
|
||||
* @param *f the database
|
||||
* @param recordSize the size to be read
|
||||
*
|
||||
* @return buffer
|
||||
*/
|
||||
void *read_ecorecord(FILE *f,int32_t *recordSize)
|
||||
{
|
||||
static void *buffer =NULL;
|
||||
@ -79,10 +78,14 @@ void *read_ecorecord(FILE *f,int32_t *recordSize)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Open the database and check it's readable
|
||||
* @param filename name of the database (.sdx, .rdx, .tbx)
|
||||
* @param sequencecount buffer - pointer to variable storing the number of occurence
|
||||
* @param abort_on_open_error boolean to define the behaviour in case of error
|
||||
* while opening the database
|
||||
* @return FILE type
|
||||
**/
|
||||
FILE *open_ecorecorddb(const char *filename,
|
||||
int32_t *sequencecount,
|
||||
int32_t abort_on_open_error)
|
||||
|
@ -56,11 +56,11 @@ typedef struct {
|
||||
|
||||
} ecotxformat_t;
|
||||
|
||||
typedef struct {
|
||||
int32_t taxid;
|
||||
int32_t rank;
|
||||
int32_t parent;
|
||||
char *name;
|
||||
typedef struct ecotxnode {
|
||||
int32_t taxid;
|
||||
int32_t rank;
|
||||
struct ecotxnode *parent;
|
||||
char *name;
|
||||
} ecotx_t;
|
||||
|
||||
typedef struct {
|
||||
@ -80,12 +80,42 @@ typedef struct {
|
||||
char* label[1];
|
||||
} ecorankidx_t;
|
||||
|
||||
/*
|
||||
*
|
||||
* Taxonomy name types
|
||||
*
|
||||
*/
|
||||
|
||||
typedef struct {
|
||||
int32_t is_scientificname;
|
||||
int32_t namelength;
|
||||
int32_t classlength;
|
||||
int32_t taxid;
|
||||
char names[1];
|
||||
} econameformat_t;
|
||||
|
||||
|
||||
typedef struct {
|
||||
char *name;
|
||||
char *classname;
|
||||
int32_t is_scientificname;
|
||||
struct ecotxnode *taxon;
|
||||
} econame_t;
|
||||
|
||||
|
||||
typedef struct {
|
||||
int32_t count;
|
||||
econame_t names[1];
|
||||
} econameidx_t;
|
||||
|
||||
|
||||
typedef struct {
|
||||
ecorankidx_t *ranks;
|
||||
econameidx_t *names;
|
||||
ecotxidx_t *taxons;
|
||||
} ecotaxonomy_t;
|
||||
|
||||
|
||||
|
||||
/*****************************************************
|
||||
*
|
||||
* Function declarations
|
||||
@ -175,6 +205,9 @@ ecoseq_t *readnext_ecoseq(FILE *);
|
||||
|
||||
ecorankidx_t *read_rankidx(const char *filename);
|
||||
|
||||
econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy);
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Read taxonomy data as formated by the ecoPCRFormat.py script.
|
||||
@ -189,8 +222,11 @@ ecorankidx_t *read_rankidx(const char *filename);
|
||||
|
||||
ecotxidx_t *read_taxonomyidx(const char *filename);
|
||||
|
||||
ecotaxonomy_t *read_taxonomy(const char *prefix);
|
||||
ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName);
|
||||
|
||||
ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy, int32_t taxid);
|
||||
|
||||
int eco_isundertaxon(ecotx_t *taxon, int other_taxid);
|
||||
|
||||
ecoseq_t *ecoseq_iterator(const char *prefix);
|
||||
|
||||
@ -227,4 +263,7 @@ ecotx_t *eco_getfamily(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
|
||||
ecotx_t *eco_getkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
|
||||
ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
|
||||
|
||||
int eco_is_taxid_ignored(int32_t *ignored_taxid, int32_t tab_len, int32_t taxid);
|
||||
int eco_is_taxid_included(ecotaxonomy_t *taxonomy, int32_t *included_taxid, int32_t tab_len, int32_t taxid);
|
||||
|
||||
#endif /*ECOPCR_H_*/
|
||||
|
19
src/libecoPCR/ecofilter.c
Normal file
19
src/libecoPCR/ecofilter.c
Normal file
@ -0,0 +1,19 @@
|
||||
#include "ecoPCR.h"
|
||||
|
||||
int eco_is_taxid_included( ecotaxonomy_t *taxonomy,
|
||||
int32_t *restricted_taxid,
|
||||
int32_t tab_len,
|
||||
int32_t taxid)
|
||||
{
|
||||
int i;
|
||||
ecotx_t *taxon;
|
||||
|
||||
taxon = eco_findtaxonbytaxid(taxonomy, taxid);
|
||||
|
||||
for (i=0; i < tab_len; i++)
|
||||
if ( (taxon->taxid == restricted_taxid[i]) ||
|
||||
(eco_isundertaxon(taxon, restricted_taxid[i])) )
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
61
src/libecoPCR/econame.c
Normal file
61
src/libecoPCR/econame.c
Normal file
@ -0,0 +1,61 @@
|
||||
#include "ecoPCR.h"
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
static econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy);
|
||||
|
||||
econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy)
|
||||
{
|
||||
|
||||
int32_t count;
|
||||
FILE *f;
|
||||
econameidx_t *indexname;
|
||||
int32_t i;
|
||||
|
||||
f = open_ecorecorddb(filename,&count,1);
|
||||
|
||||
indexname = (econameidx_t*) ECOMALLOC(sizeof(econameidx_t) + sizeof(econame_t) * (count-1),"Allocate names");
|
||||
|
||||
indexname->count=count;
|
||||
|
||||
for (i=0; i < count; i++){
|
||||
readnext_econame(f,(indexname->names)+i,taxonomy);
|
||||
}
|
||||
|
||||
return indexname;
|
||||
}
|
||||
|
||||
econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy)
|
||||
{
|
||||
|
||||
econameformat_t *raw;
|
||||
int32_t rs;
|
||||
|
||||
raw = read_ecorecord(f,&rs);
|
||||
|
||||
if (!raw)
|
||||
return NULL;
|
||||
|
||||
if (is_big_endian())
|
||||
{
|
||||
raw->is_scientificname = swap_int32_t(raw->is_scientificname);
|
||||
raw->namelength = swap_int32_t(raw->namelength);
|
||||
raw->classlength = swap_int32_t(raw->classlength);
|
||||
raw->taxid = swap_int32_t(raw->taxid);
|
||||
}
|
||||
|
||||
name->is_scientificname=raw->is_scientificname;
|
||||
|
||||
name->name = ECOMALLOC((raw->namelength+1) * sizeof(char),"Allocate name");
|
||||
strncpy(name->name,raw->names,raw->namelength);
|
||||
name->name[raw->namelength]=0;
|
||||
|
||||
name->classname = ECOMALLOC((raw->classlength+1) * sizeof(char),"Allocate classname");
|
||||
strncpy(name->classname,(raw->names+raw->namelength),raw->classlength);
|
||||
name->classname[raw->classlength]=0;
|
||||
|
||||
name->taxon = taxonomy->taxons->taxon + raw->taxid;
|
||||
|
||||
return name;
|
||||
}
|
||||
|
@ -79,7 +79,9 @@ ecoseq_t *new_ecoseq_with_data( char *AC,
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* ?? used ??
|
||||
**/
|
||||
FILE *open_ecoseqdb(const char *filename,
|
||||
int32_t *sequencecount)
|
||||
{
|
||||
@ -140,6 +142,13 @@ ecoseq_t *readnext_ecoseq(FILE *f)
|
||||
return seq;
|
||||
}
|
||||
|
||||
/**
|
||||
* Open the sequences database (.sdx file)
|
||||
* @param prefix name of the database (radical without extension)
|
||||
* @param index integer
|
||||
*
|
||||
* @return file object
|
||||
*/
|
||||
FILE *open_seqfile(const char *prefix,int32_t index)
|
||||
{
|
||||
char filename_buffer[1024];
|
||||
@ -153,7 +162,7 @@ FILE *open_seqfile(const char *prefix,int32_t index)
|
||||
prefix,
|
||||
index);
|
||||
|
||||
fprintf(stderr,"Coucou %s\n",filename_buffer);
|
||||
fprintf(stderr,"# Coucou %s\n",filename_buffer);
|
||||
|
||||
|
||||
if (filename_length >= 1024)
|
||||
@ -164,7 +173,7 @@ FILE *open_seqfile(const char *prefix,int32_t index)
|
||||
input=open_ecorecorddb(filename_buffer,&seqcount,0);
|
||||
|
||||
if (input)
|
||||
fprintf(stderr,"Reading file %s containing %d sequences...\n",
|
||||
fprintf(stderr,"# Reading file %s containing %d sequences...\n",
|
||||
filename_buffer,
|
||||
seqcount);
|
||||
|
||||
|
@ -5,7 +5,11 @@
|
||||
|
||||
static ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon);
|
||||
|
||||
|
||||
/**
|
||||
* Open the taxonomy database
|
||||
* @param pointer to the database (.tdx file)
|
||||
* @return a ecotxidx_t structure
|
||||
*/
|
||||
ecotxidx_t *read_taxonomyidx(const char *filename)
|
||||
{
|
||||
int32_t count;
|
||||
@ -18,14 +22,15 @@ ecotxidx_t *read_taxonomyidx(const char *filename)
|
||||
index = (ecotxidx_t*) ECOMALLOC(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count-1),
|
||||
"Allocate taxonomy");
|
||||
|
||||
index->count=count;
|
||||
|
||||
for (i=0; i < count; i++)
|
||||
index->count=count;
|
||||
for (i=0; i < count; i++){
|
||||
readnext_ecotaxon(f,&(index->taxon[i]));
|
||||
|
||||
index->taxon[i].parent=index->taxon + (int32_t)index->taxon[i].parent;
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
|
||||
int32_t delete_taxonomy(ecotxidx_t *index)
|
||||
{
|
||||
int32_t i;
|
||||
@ -61,6 +66,15 @@ int32_t delete_taxon(ecotx_t *taxon)
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Read the database for a given taxon a save the data
|
||||
* into the taxon structure(if any found)
|
||||
* @param *f pointer to FILE type returned by fopen
|
||||
* @param *taxon pointer to the structure
|
||||
*
|
||||
* @return a ecotx_t structure if any taxon found else NULL
|
||||
*/
|
||||
ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon)
|
||||
{
|
||||
|
||||
@ -80,7 +94,7 @@ ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon)
|
||||
raw->taxid = swap_int32_t(raw->taxid);
|
||||
}
|
||||
|
||||
taxon->parent = raw->parent;
|
||||
taxon->parent = (ecotx_t*)raw->parent;
|
||||
taxon->taxid = raw->taxid;
|
||||
taxon->rank = raw->rank;
|
||||
|
||||
@ -88,12 +102,12 @@ ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon)
|
||||
"Allocate taxon scientific name");
|
||||
|
||||
strncpy(taxon->name,raw->name,raw->namelength);
|
||||
|
||||
|
||||
return taxon;
|
||||
}
|
||||
|
||||
|
||||
ecotaxonomy_t *read_taxonomy(const char *prefix)
|
||||
ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName)
|
||||
{
|
||||
ecotaxonomy_t *tax;
|
||||
char *filename;
|
||||
@ -115,10 +129,19 @@ ecotaxonomy_t *read_taxonomy(const char *prefix)
|
||||
|
||||
tax->taxons = read_taxonomyidx(filename);
|
||||
|
||||
if (readAlternativeName)
|
||||
{
|
||||
snprintf(filename,buffsize,"%s.ndx",prefix);
|
||||
tax->names=read_nameidx(filename,tax);
|
||||
}
|
||||
else
|
||||
tax->names=NULL;
|
||||
return tax;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy)
|
||||
{
|
||||
if (taxonomy)
|
||||
@ -138,20 +161,19 @@ int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy)
|
||||
}
|
||||
|
||||
ecotx_t *eco_findtaxonatrank(ecotx_t *taxon,
|
||||
int32_t rankidx,
|
||||
ecotaxonomy_t *taxonomy)
|
||||
int32_t rankidx)
|
||||
{
|
||||
ecotx_t *current_taxon;
|
||||
ecotx_t *next_taxon;
|
||||
|
||||
current_taxon = taxon;
|
||||
next_taxon = &(taxonomy->taxons->taxon[current_taxon->parent]);
|
||||
next_taxon = current_taxon->parent;
|
||||
|
||||
while ((current_taxon!=next_taxon) &&
|
||||
while ((current_taxon!=next_taxon) && // I' am the root node
|
||||
(current_taxon->rank!=rankidx))
|
||||
{
|
||||
current_taxon = next_taxon;
|
||||
next_taxon = &(taxonomy->taxons->taxon[current_taxon->parent]);
|
||||
next_taxon = current_taxon->parent;
|
||||
}
|
||||
|
||||
if (current_taxon->rank==rankidx)
|
||||
@ -160,6 +182,61 @@ ecotx_t *eco_findtaxonatrank(ecotx_t *taxon,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get back information concerning a taxon from a taxonomic id
|
||||
* @param *taxonomy the taxonomy database
|
||||
* @param taxid the taxonomic id
|
||||
*
|
||||
* @result a ecotx_t structure containing the taxonimic information
|
||||
**/
|
||||
ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy,
|
||||
int32_t taxid)
|
||||
{
|
||||
ecotx_t *current_taxon;
|
||||
int32_t taxoncount;
|
||||
int32_t i;
|
||||
|
||||
taxoncount=taxonomy->taxons->count;
|
||||
|
||||
for (current_taxon=taxonomy->taxons->taxon,
|
||||
i=0;
|
||||
i < taxoncount;
|
||||
i++,
|
||||
current_taxon++){
|
||||
if (current_taxon->taxid==taxid){
|
||||
return current_taxon;
|
||||
}
|
||||
}
|
||||
|
||||
return (ecotx_t*)NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* Find out if taxon is son of other taxon (identified by its taxid)
|
||||
* @param *taxon son taxon
|
||||
* @param parent_taxid taxonomic id of the other taxon
|
||||
*
|
||||
* @return 1 is the other taxid math a parent taxid, else 0
|
||||
**/
|
||||
int eco_isundertaxon(ecotx_t *taxon,
|
||||
int other_taxid)
|
||||
{
|
||||
ecotx_t *next_parent;
|
||||
|
||||
next_parent = taxon->parent;
|
||||
|
||||
while ( (other_taxid != next_parent->taxid) &&
|
||||
(strcmp(next_parent->name, "root")) )
|
||||
{
|
||||
next_parent = next_parent->parent;
|
||||
}
|
||||
|
||||
if (other_taxid == next_parent->taxid)
|
||||
return 1;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
ecotx_t *eco_getspecies(ecotx_t *taxon,
|
||||
ecotaxonomy_t *taxonomy)
|
||||
{
|
||||
@ -175,7 +252,7 @@ ecotx_t *eco_getspecies(ecotx_t *taxon,
|
||||
if (!tax || rankindex < 0)
|
||||
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
|
||||
|
||||
return eco_findtaxonatrank(taxon,rankindex,tax);
|
||||
return eco_findtaxonatrank(taxon,rankindex);
|
||||
}
|
||||
|
||||
ecotx_t *eco_getgenus(ecotx_t *taxon,
|
||||
@ -193,7 +270,7 @@ ecotx_t *eco_getgenus(ecotx_t *taxon,
|
||||
if (!tax || rankindex < 0)
|
||||
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
|
||||
|
||||
return eco_findtaxonatrank(taxon,rankindex,tax);
|
||||
return eco_findtaxonatrank(taxon,rankindex);
|
||||
}
|
||||
|
||||
|
||||
@ -212,7 +289,7 @@ ecotx_t *eco_getfamily(ecotx_t *taxon,
|
||||
if (!tax || rankindex < 0)
|
||||
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
|
||||
|
||||
return eco_findtaxonatrank(taxon,rankindex,tax);
|
||||
return eco_findtaxonatrank(taxon,rankindex);
|
||||
}
|
||||
|
||||
ecotx_t *eco_getkingdom(ecotx_t *taxon,
|
||||
@ -230,7 +307,7 @@ ecotx_t *eco_getkingdom(ecotx_t *taxon,
|
||||
if (!tax || rankindex < 0)
|
||||
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
|
||||
|
||||
return eco_findtaxonatrank(taxon,rankindex,tax);
|
||||
return eco_findtaxonatrank(taxon,rankindex);
|
||||
}
|
||||
|
||||
ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,
|
||||
@ -248,5 +325,5 @@ ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,
|
||||
if (!tax || rankindex < 0)
|
||||
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
|
||||
|
||||
return eco_findtaxonatrank(taxon,rankindex,tax);
|
||||
return eco_findtaxonatrank(taxon,rankindex);
|
||||
}
|
303
tools/ecoPCRFilter.py
Executable file
303
tools/ecoPCRFilter.py
Executable file
@ -0,0 +1,303 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import struct
|
||||
import sys
|
||||
import os
|
||||
import gzip
|
||||
|
||||
|
||||
#####
|
||||
#
|
||||
# Generic file function
|
||||
#
|
||||
#####
|
||||
|
||||
class Filter(object):
|
||||
|
||||
|
||||
def __init__(self,path):
|
||||
self._path = path
|
||||
self._taxonFile = "%s.tdx" % self._path
|
||||
self._ranksFile = "%s.rdx" % self._path
|
||||
self._namesFile = "%s.ndx" % self._path
|
||||
self._taxonomy, self._index, self._ranks, self._name = self.__readNodeTable()
|
||||
|
||||
|
||||
def __universalOpen(self,file):
|
||||
if isinstance(file,str):
|
||||
if file[-3:] == '.gz':
|
||||
rep = gzip.open(file)
|
||||
else:
|
||||
rep = open(file)
|
||||
else:
|
||||
rep = file
|
||||
return rep
|
||||
|
||||
def __universalTell(self,file):
|
||||
if isinstance(file, gzip.GzipFile):
|
||||
file=file.myfileobj
|
||||
return file.tell()
|
||||
|
||||
def __fileSize(self,file):
|
||||
if isinstance(file, gzip.GzipFile):
|
||||
file=file.myfileobj
|
||||
pos = file.tell()
|
||||
file.seek(0,2)
|
||||
length = file.tell()
|
||||
file.seek(pos,0)
|
||||
return length
|
||||
|
||||
def __progressBar(self,pos,max,reset=False,delta=[]):
|
||||
if reset:
|
||||
del delta[:]
|
||||
if not delta:
|
||||
delta.append(time.time())
|
||||
delta.append(time.time())
|
||||
|
||||
delta[1]=time.time()
|
||||
elapsed = delta[1]-delta[0]
|
||||
percent = float(pos)/max * 100
|
||||
remain = time.strftime('%H:%M:%S',time.gmtime(elapsed / percent * (100-percent)))
|
||||
bar = '#' * int(percent/2)
|
||||
bar+= '|/-\\-'[pos % 5]
|
||||
bar+= ' ' * (50 - int(percent/2))
|
||||
sys.stderr.write('\r%5.1f %% |%s] remain : %s' %(percent,bar,remain))
|
||||
|
||||
|
||||
|
||||
|
||||
#####
|
||||
#
|
||||
# Iterator functions
|
||||
#
|
||||
#####
|
||||
|
||||
|
||||
|
||||
def __ecoRecordIterator(self,file):
|
||||
file = self.__universalOpen(file)
|
||||
(recordCount,) = struct.unpack('> I',file.read(4))
|
||||
|
||||
for i in xrange(recordCount):
|
||||
(recordSize,)=struct.unpack('>I',file.read(4))
|
||||
record = file.read(recordSize)
|
||||
yield record
|
||||
|
||||
|
||||
def __ecoNameIterator(self):
|
||||
for record in self.__ecoRecordIterator(self._namesFile):
|
||||
lrecord = len(record)
|
||||
lnames = lrecord - 16
|
||||
(isScientificName,namelength,classLength,indextaxid,names)=struct.unpack('> I I I I %ds' % lnames, record)
|
||||
name=names[:namelength]
|
||||
classname=names[namelength:]
|
||||
yield (name,classname,indextaxid)
|
||||
|
||||
|
||||
def __ecoTaxonomicIterator(self):
|
||||
for record in self.__ecoRecordIterator(self._taxonFile):
|
||||
lrecord = len(record)
|
||||
lnames = lrecord - 16
|
||||
(taxid,rankid,parentidx,nameLength,name)=struct.unpack('> I I I I %ds' % lnames, record)
|
||||
yield (taxid,rankid,parentidx,name)
|
||||
|
||||
|
||||
def __ecoSequenceIterator(self,file):
|
||||
for record in self.__ecoRecordIterator(file):
|
||||
lrecord = len(record)
|
||||
lnames = lrecord - (4*4+20)
|
||||
(taxid,seqid,deflength,seqlength,cptseqlength,string)=struct.unpack('> I 20s I I I %ds' % lnames, record)
|
||||
de = string[:deflength]
|
||||
seq = gzip.zlib.decompress(string[deflength:])
|
||||
yield (taxid,seqid,deflength,seqlength,cptseqlength,de,seq)
|
||||
|
||||
|
||||
def __ecoRankIterator(self):
|
||||
for record in self.__ecoRecordIterator(self._ranksFile):
|
||||
yield record
|
||||
|
||||
|
||||
#####
|
||||
#
|
||||
# Indexes
|
||||
#
|
||||
#####
|
||||
|
||||
def __ecoNameIndex(self):
|
||||
indexName = [x for x in self.__ecoNameIterator()]
|
||||
return indexName
|
||||
|
||||
def __ecoRankIndex(self):
|
||||
rank = [r for r in self.__ecoRankIterator()]
|
||||
return rank
|
||||
|
||||
def __ecoTaxonomyIndex(self):
|
||||
taxonomy = []
|
||||
index = {}
|
||||
i = 0;
|
||||
for x in self.__ecoTaxonomicIterator():
|
||||
taxonomy.append(x)
|
||||
index[x[0]] = i
|
||||
i = i + 1
|
||||
return taxonomy, index
|
||||
|
||||
def __readNodeTable(self):
|
||||
taxonomy, index = self.__ecoTaxonomyIndex()
|
||||
ranks = self.__ecoRankIndex()
|
||||
name = self.__ecoNameIndex()
|
||||
return taxonomy,index,ranks,name
|
||||
|
||||
|
||||
def findTaxonByTaxid(self,taxid):
|
||||
return self._taxonomy[self._index[taxid]]
|
||||
|
||||
|
||||
|
||||
#####
|
||||
#
|
||||
# PUBLIC METHODS
|
||||
#
|
||||
#####
|
||||
|
||||
|
||||
def subTreeIterator(self, taxid):
|
||||
"return subtree for given taxonomic id "
|
||||
idx = self._index[taxid]
|
||||
yield self._taxonomy[idx]
|
||||
for t in self._taxonomy:
|
||||
if t[2] == idx:
|
||||
for subt in self.subTreeIterator(t[0]):
|
||||
yield subt
|
||||
|
||||
|
||||
def parentalTreeIterator(self, taxid):
|
||||
"""
|
||||
return parental tree for given taxonomic id starting from
|
||||
first ancester to the root.
|
||||
"""
|
||||
taxon=self.findTaxonByTaxid(taxid)
|
||||
while taxon[2]!= 0:
|
||||
yield taxon
|
||||
taxon = self._taxonomy[taxon[2]]
|
||||
yield self._taxonomy[0]
|
||||
|
||||
|
||||
def ecoPCRResultIterator(self, file):
|
||||
"iteration on ecoPCR result file"
|
||||
file = self.__universalOpen(file)
|
||||
data = ColumnFile(file,
|
||||
sep='|',
|
||||
types=(str,int,int,
|
||||
str,int,str,
|
||||
int,str,int,
|
||||
str,int,str,
|
||||
str,str,int,
|
||||
str,int,int,
|
||||
str,str),skip='#')
|
||||
|
||||
for ac, sq_len, taxid,\
|
||||
rank, sp_taxid, species,\
|
||||
ge_taxid, genus, fa_taxid,\
|
||||
family, sk_taxid, s_kgdom,\
|
||||
strand, oligo_1, error_1,\
|
||||
oligo_2, error_2, amp_len,\
|
||||
sq_des, definition in data:
|
||||
|
||||
yield {'ac':ac, 'sq_len':sq_len, 'taxid':taxid,
|
||||
'rank':rank, 'sp_taxid':sp_taxid, 'species':species,
|
||||
'ge_taxid':ge_taxid, 'genus':genus, 'fa_taxid':fa_taxid,
|
||||
'family':family, 'sk_taxid':sk_taxid, 's_kgdom':s_kgdom,
|
||||
'strand':strand, 'oligo_1':oligo_1, 'error_1':error_1,
|
||||
'oligo_2':oligo_2, 'error_2':error_2, 'amp_len':amp_len,
|
||||
'sq_des':sq_des, 'definition':definition}
|
||||
|
||||
def rankFilter(self,rankid,filter):
|
||||
return self._ranks[rankid] == filter
|
||||
|
||||
|
||||
def lastCommonTaxon(self,taxid_1, taxid_2):
|
||||
t1 = [x[0] for x in self.parentalTreeIterator(taxid_1)]
|
||||
t2 = [x[0] for x in self.parentalTreeIterator(taxid_2)]
|
||||
t1.reverse()
|
||||
t2.reverse()
|
||||
count = t1 < t2 and len(t1) or len(t2)
|
||||
for i in range(count):
|
||||
if t1[i] != t2[i]:
|
||||
return t1[i-1]
|
||||
|
||||
|
||||
|
||||
|
||||
class ColumnFile(object):
|
||||
|
||||
def __init__(self,stream,sep=None,strip=True,types=None,skip=None):
|
||||
if isinstance(stream,str):
|
||||
self._stream = open(stream)
|
||||
elif hasattr(stream,'next'):
|
||||
self._stream = stream
|
||||
else:
|
||||
raise ValueError,'stream must be string or an iterator'
|
||||
self._delimiter=sep
|
||||
self._strip=strip
|
||||
if types:
|
||||
self._types=[x for x in types]
|
||||
for i in xrange(len(self._types)):
|
||||
if self._types[i] is bool:
|
||||
self._types[i]=ColumnFile.str2bool
|
||||
else:
|
||||
self._types=None
|
||||
self._skip = skip
|
||||
|
||||
def str2bool(x):
|
||||
return bool(eval(x.strip()[0].upper(),{'T':True,'V':True,'F':False}))
|
||||
|
||||
str2bool = staticmethod(str2bool)
|
||||
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def next(self):
|
||||
ligne = self._stream.next()
|
||||
while ligne[0] == self._skip:
|
||||
ligne = self._stream.next()
|
||||
data = ligne.split(self._delimiter)
|
||||
if self._strip or self._types:
|
||||
data = [x.strip() for x in data]
|
||||
if self._types:
|
||||
it = self.endLessIterator(self._types)
|
||||
data = [x[1](x[0]) for x in ((y,it.next()) for y in data)]
|
||||
return data
|
||||
|
||||
def endLessIterator(self,endedlist):
|
||||
for x in endedlist:
|
||||
yield x
|
||||
while(1):
|
||||
yield endedlist[-1]
|
||||
|
||||
|
||||
class Table(list):
|
||||
|
||||
def __init__(self, headers, types):
|
||||
list.__init__(self)
|
||||
self.headers = headers
|
||||
self.types = types
|
||||
self.lines = []
|
||||
|
||||
def printTable(self):
|
||||
for h in self.headers:
|
||||
print "\t%s\t|" % h,
|
||||
print "\n"
|
||||
for l in self.lines:
|
||||
for c in l:
|
||||
print "\t%s\t|" % c
|
||||
print "\n"
|
||||
|
||||
def getColumn(self,n):
|
||||
print "\t%s\n" % self.header[n]
|
||||
for i in range(len(self.lines)):
|
||||
print "\t%s\n" % i[n]
|
||||
|
||||
|
||||
|
||||
|
@ -68,8 +68,7 @@ def endLessIterator(endedlist):
|
||||
yield x
|
||||
while(1):
|
||||
yield endedlist[-1]
|
||||
|
||||
|
||||
|
||||
class ColumnFile(object):
|
||||
|
||||
def __init__(self,stream,sep=None,strip=True,types=None):
|
||||
@ -135,7 +134,7 @@ def bsearchTaxon(taxonomy,taxid):
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
|
||||
|
||||
def readNodeTable(file):
|
||||
|
||||
@ -149,7 +148,6 @@ def readNodeTable(file):
|
||||
bool,bool,bool,str))
|
||||
print >>sys.stderr,"Reading taxonomy dump file..."
|
||||
taxonomy=[[n[0],n[2],n[1]] for n in nodes]
|
||||
|
||||
print >>sys.stderr,"List all taxonomy rank..."
|
||||
ranks =list(set(x[1] for x in taxonomy))
|
||||
ranks.sort()
|
||||
@ -171,15 +169,14 @@ def readNodeTable(file):
|
||||
|
||||
return taxonomy,ranks,index
|
||||
|
||||
def scientificNameIterator(file):
|
||||
def nameIterator(file):
|
||||
file = universalOpen(file)
|
||||
names = ColumnFile(file,
|
||||
sep='|',
|
||||
types=(int,str,
|
||||
str,str))
|
||||
for taxid,name,unique,classname,white in names:
|
||||
if classname == 'scientific name':
|
||||
yield taxid,name
|
||||
yield taxid,name,classname
|
||||
|
||||
def mergedNodeIterator(file):
|
||||
file = universalOpen(file)
|
||||
@ -201,8 +198,12 @@ def readTaxonomyDump(taxdir):
|
||||
taxonomy,ranks,index = readNodeTable('%s/nodes.dmp' % taxdir)
|
||||
|
||||
print >>sys.stderr,"Adding scientific name..."
|
||||
for taxid,name in scientificNameIterator('%s/names.dmp' % taxdir):
|
||||
taxonomy[index[taxid]].append(name)
|
||||
|
||||
alternativeName=[]
|
||||
for taxid,name,classname in nameIterator('%s/names.dmp' % taxdir):
|
||||
alternativeName.append((name,classname,index[taxid]))
|
||||
if classname == 'scientific name':
|
||||
taxonomy[index[taxid]].append(name)
|
||||
|
||||
print >>sys.stderr,"Adding taxid alias..."
|
||||
for taxid,current in mergedNodeIterator('%s/merged.dmp' % taxdir):
|
||||
@ -212,7 +213,7 @@ def readTaxonomyDump(taxdir):
|
||||
for taxid in deletedNodeIterator('%s/delnodes.dmp' % taxdir):
|
||||
index[taxid]=None
|
||||
|
||||
return taxonomy,ranks,index
|
||||
return taxonomy,ranks,alternativeName,index
|
||||
|
||||
|
||||
#####
|
||||
@ -267,28 +268,52 @@ def genbankEntryParser(entry):
|
||||
Tx = None
|
||||
return {'id':Id,'taxid':Tx,'definition':De,'sequence':Sq}
|
||||
|
||||
_fastaParseID = re.compile('(?<=^>)[^ ]+')
|
||||
_fastaParseDE = re.compile('(?<=^>).+',)
|
||||
_fastaParseSQ = re.compile('^[^>].+',re.MULTILINE+re.DOTALL)
|
||||
_fastaParseTX = re.compile('(?<=[[Tt]axon:) *[0-9]+ *(?=])')
|
||||
|
||||
def fastaEntryParser(entry):
|
||||
Id = _fastaParseID.findall(entry)[0]
|
||||
De = _fastaParseDE.findall(entry)[0].split(None,1)[1:]
|
||||
if not De:
|
||||
De=''
|
||||
else:
|
||||
De=De[0]
|
||||
Sq = cleanSeq(_fastaParseSQ.findall(entry)[0].upper())
|
||||
######################
|
||||
|
||||
_cleanDef = re.compile('[\nDE]')
|
||||
|
||||
def cleanDef(definition):
|
||||
return _cleanDef.sub('',definition)
|
||||
|
||||
_emblParseID = re.compile('(?<=^ID {3})[^ ]+(?=;)',re.MULTILINE)
|
||||
_emblParseDE = re.compile('(?<=^DE {3}).+?\. *$(?=[^ ])',re.MULTILINE+re.DOTALL)
|
||||
_emblParseSQ = re.compile('(?<=^ ).+?(?=^//$)',re.MULTILINE+re.DOTALL)
|
||||
_emblParseTX = re.compile('(?<= /db_xref="taxon:)[0-9]+(?=")')
|
||||
|
||||
def emblEntryParser(entry):
|
||||
Id = _emblParseID.findall(entry)[0]
|
||||
De = ' '.join(cleanDef(_emblParseDE.findall(entry)[0]).split())
|
||||
Sq = cleanSeq(_emblParseSQ.findall(entry)[0].upper())
|
||||
try:
|
||||
Tx = int(_fastaParseTX.findall(entry)[0])
|
||||
Tx = int(_emblParseTX.findall(entry)[0])
|
||||
except IndexError:
|
||||
Tx = None
|
||||
|
||||
return {'id':Id,'taxid':Tx,'definition':De,'sequence':Sq}
|
||||
|
||||
|
||||
######################
|
||||
|
||||
def parseFasta(seq):
|
||||
title = seq[0].strip()[1:].split(None,1)
|
||||
id=title[0]
|
||||
if len(title) == 2:
|
||||
field = title[1].split('; ')
|
||||
else:
|
||||
field=[]
|
||||
info = dict(x.split('=') for x in field if '=' in x)
|
||||
definition = ' '.join([x for x in field if '=' not in x])
|
||||
seq=(''.join([x.strip() for x in seq[1:]])).upper()
|
||||
return id,seq,definition,info
|
||||
|
||||
|
||||
def fastaEntryParser(entry):
|
||||
id,seq,definition,info = parseFasta(entry)
|
||||
Tx = info.get('taxid',None)
|
||||
if Tx is not None:
|
||||
Tx=int(Tx)
|
||||
return {'id':id,'taxid':Tx,'definition':definition,'sequence':seq}
|
||||
|
||||
|
||||
|
||||
def sequenceIteratorFactory(entryParser,entryIterator):
|
||||
def sequenceIterator(file):
|
||||
for entry in entryIterator(file):
|
||||
@ -381,6 +406,22 @@ def ecoRankPacker(rank):
|
||||
|
||||
return packed
|
||||
|
||||
def ecoNamePacker(name):
|
||||
|
||||
namelength = len(name[0])
|
||||
classlength= len(name[1])
|
||||
totalSize = namelength + classlength + 4 + 4 + 4 + 4
|
||||
|
||||
packed = struct.pack('> I I I I I %ds %ds' % (namelength,classlength),
|
||||
totalSize,
|
||||
int(name[1]=='scientific name'),
|
||||
namelength,
|
||||
classlength,
|
||||
name[2],
|
||||
name[0],
|
||||
name[1])
|
||||
|
||||
return packed
|
||||
|
||||
def ecoSeqWriter(file,input,taxindex,parser):
|
||||
output = open(file,'wb')
|
||||
@ -438,18 +479,40 @@ def ecoRankWriter(file,ranks):
|
||||
output.write(ecoRankPacker(rank))
|
||||
|
||||
output.close()
|
||||
|
||||
def nameCmp(n1,n2):
|
||||
name1=n1[0].upper()
|
||||
name2=n2[0].upper()
|
||||
if name1 < name2:
|
||||
return -1
|
||||
elif name1 > name2:
|
||||
return 1
|
||||
return 0
|
||||
|
||||
|
||||
def ecoNameWriter(file,names):
|
||||
output = open(file,'wb')
|
||||
output.write(struct.pack('> I',len(names)))
|
||||
|
||||
names.sort(nameCmp)
|
||||
|
||||
for name in names:
|
||||
output.write(ecoNamePacker(name))
|
||||
|
||||
output.close()
|
||||
|
||||
def ecoDBWriter(prefix,taxonomy,seqFileNames,parser):
|
||||
|
||||
ecoRankWriter('%s.rdx' % prefix, taxonomy[1])
|
||||
ecoTaxWriter('%s.tdx' % prefix, taxonomy[0])
|
||||
|
||||
ecoNameWriter('%s.ndx' % prefix, taxonomy[2])
|
||||
|
||||
filecount = 0
|
||||
for filename in seqFileNames:
|
||||
filecount+=1
|
||||
sk=ecoSeqWriter('%s_%03d.sdx' % (prefix,filecount),
|
||||
filename,
|
||||
taxonomy[2],
|
||||
taxonomy[3],
|
||||
parser)
|
||||
if sk:
|
||||
print >>sys.stderr,"Skipped entry :"
|
||||
@ -464,37 +527,58 @@ def ecoParseOptions(arguments):
|
||||
}
|
||||
|
||||
o,filenames = getopt.getopt(arguments,
|
||||
'ht:n:gf',
|
||||
'ht:n:gfe',
|
||||
['help',
|
||||
'taxonomy=',
|
||||
'name=',
|
||||
'genbank',
|
||||
'fasta'])
|
||||
'fasta',
|
||||
'embl'])
|
||||
|
||||
for name,value in o:
|
||||
if name in ('-h','--help'):
|
||||
pass
|
||||
printHelp()
|
||||
exit()
|
||||
elif name in ('-t','--taxonomy'):
|
||||
opt['taxdir']=value
|
||||
elif name in ('-n','--name'):
|
||||
opt['prefix']=value
|
||||
elif name in ('-g','--genbank'):
|
||||
opt['parser']=sequenceIteratorFactory(genbankEntryParser,
|
||||
entryIterator
|
||||
)
|
||||
entryIterator)
|
||||
|
||||
elif name in ('-f','--fasta'):
|
||||
opt['parser']=sequenceIteratorFactory(fastaEntryParser,
|
||||
fastaEntryIterator)
|
||||
|
||||
elif name in ('-e','--embl'):
|
||||
opt['parser']=sequenceIteratorFactory(emblEntryParser,
|
||||
entryIterator)
|
||||
else:
|
||||
raise ValueError,'Unknown option %s' % name
|
||||
|
||||
return opt,filenames
|
||||
|
||||
def printHelp():
|
||||
print "-----------------------------------"
|
||||
print " ecoPCRFormat.py"
|
||||
print "-----------------------------------"
|
||||
print "ecoPCRFormat.py [option] <argument>"
|
||||
print "-----------------------------------"
|
||||
print "-e --embl :[E]mbl format"
|
||||
print "-f --fasta :[F]asta format"
|
||||
print "-g --genbank :[G]enbank format"
|
||||
print "-h --help :[H]elp - print this help"
|
||||
print "-n --name :[N]ame of the new database created"
|
||||
print "-t --taxonomy :[T]axonomy - path to the taxonomy database"
|
||||
print " :bcp-like dump from GenBank taxonomy database."
|
||||
print "-----------------------------------"
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
opt,filenames = ecoParseOptions(sys.argv[1:])
|
||||
|
||||
taxonomy = readTaxonomyDump(opt['taxdir'])
|
||||
|
||||
|
||||
ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser'])
|
||||
|
||||
|
811
tools/ecoSort.py
Executable file
811
tools/ecoSort.py
Executable file
@ -0,0 +1,811 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import struct
|
||||
import sys
|
||||
import os
|
||||
import gzip
|
||||
import re
|
||||
import string
|
||||
|
||||
from reportlab.graphics.shapes import *
|
||||
from reportlab.graphics.charts.barcharts import VerticalBarChart
|
||||
from reportlab.graphics.charts.piecharts import Pie
|
||||
from reportlab.lib.styles import getSampleStyleSheet
|
||||
from reportlab.lib.units import cm
|
||||
from reportlab.lib import colors
|
||||
from reportlab.platypus import *
|
||||
|
||||
|
||||
#####
|
||||
#
|
||||
# Generic file function
|
||||
#
|
||||
#####
|
||||
|
||||
class Filter(object):
|
||||
"""
|
||||
This object provides different iterators and method :
|
||||
* findTaxonByTaxid
|
||||
* subTreeIterator
|
||||
* parentalTreeIterator
|
||||
* ecoPCRResultIterator
|
||||
* rankFilter
|
||||
* lastCommonTaxon
|
||||
|
||||
see each method for more informations
|
||||
"""
|
||||
|
||||
def __init__(self,path):
|
||||
self._path = path
|
||||
self._taxonFile = "%s.tdx" % self._path
|
||||
self._ranksFile = "%s.rdx" % self._path
|
||||
self._namesFile = "%s.ndx" % self._path
|
||||
self._taxonomy, self._index, self._ranks, self._name = self.__readNodeTable()
|
||||
|
||||
|
||||
def __universalOpen(self,file):
|
||||
if isinstance(file,str):
|
||||
if file[-3:] == '.gz':
|
||||
rep = gzip.open(file)
|
||||
else:
|
||||
rep = open(file)
|
||||
else:
|
||||
rep = file
|
||||
return rep
|
||||
|
||||
def __universalTell(self,file):
|
||||
if isinstance(file, gzip.GzipFile):
|
||||
file=file.myfileobj
|
||||
return file.tell()
|
||||
|
||||
def __fileSize(self,file):
|
||||
if isinstance(file, gzip.GzipFile):
|
||||
file=file.myfileobj
|
||||
pos = file.tell()
|
||||
file.seek(0,2)
|
||||
length = file.tell()
|
||||
file.seek(pos,0)
|
||||
return length
|
||||
|
||||
def __progressBar(self,pos,max,reset=False,delta=[]):
|
||||
if reset:
|
||||
del delta[:]
|
||||
if not delta:
|
||||
delta.append(time.time())
|
||||
delta.append(time.time())
|
||||
|
||||
delta[1]=time.time()
|
||||
elapsed = delta[1]-delta[0]
|
||||
percent = float(pos)/max * 100
|
||||
remain = time.strftime('%H:%M:%S',time.gmtime(elapsed / percent * (100-percent)))
|
||||
bar = '#' * int(percent/2)
|
||||
bar+= '|/-\\-'[pos % 5]
|
||||
bar+= ' ' * (50 - int(percent/2))
|
||||
sys.stderr.write('\r%5.1f %% |%s] remain : %s' %(percent,bar,remain))
|
||||
|
||||
|
||||
|
||||
|
||||
#####
|
||||
#
|
||||
# Iterator functions
|
||||
#
|
||||
#####
|
||||
|
||||
|
||||
|
||||
def __ecoRecordIterator(self,file):
|
||||
file = self.__universalOpen(file)
|
||||
(recordCount,) = struct.unpack('> I',file.read(4))
|
||||
|
||||
for i in xrange(recordCount):
|
||||
(recordSize,)=struct.unpack('>I',file.read(4))
|
||||
record = file.read(recordSize)
|
||||
yield record
|
||||
|
||||
|
||||
def __ecoNameIterator(self):
|
||||
for record in self.__ecoRecordIterator(self._namesFile):
|
||||
lrecord = len(record)
|
||||
lnames = lrecord - 16
|
||||
(isScientificName,namelength,classLength,indextaxid,names)=struct.unpack('> I I I I %ds' % lnames, record)
|
||||
name=names[:namelength]
|
||||
classname=names[namelength:]
|
||||
yield (name,classname,indextaxid)
|
||||
|
||||
|
||||
def __ecoTaxonomicIterator(self):
|
||||
for record in self.__ecoRecordIterator(self._taxonFile):
|
||||
lrecord = len(record)
|
||||
lnames = lrecord - 16
|
||||
(taxid,rankid,parentidx,nameLength,name)=struct.unpack('> I I I I %ds' % lnames, record)
|
||||
yield (taxid,rankid,parentidx,name)
|
||||
|
||||
|
||||
def __ecoSequenceIterator(self,file):
|
||||
for record in self.__ecoRecordIterator(file):
|
||||
lrecord = len(record)
|
||||
lnames = lrecord - (4*4+20)
|
||||
(taxid,seqid,deflength,seqlength,cptseqlength,string)=struct.unpack('> I 20s I I I %ds' % lnames, record)
|
||||
de = string[:deflength]
|
||||
seq = gzip.zlib.decompress(string[deflength:])
|
||||
yield (taxid,seqid,deflength,seqlength,cptseqlength,de,seq)
|
||||
|
||||
|
||||
def __ecoRankIterator(self):
|
||||
for record in self.__ecoRecordIterator(self._ranksFile):
|
||||
yield record
|
||||
|
||||
|
||||
#####
|
||||
#
|
||||
# Indexes
|
||||
#
|
||||
#####
|
||||
|
||||
def __ecoNameIndex(self):
|
||||
indexName = [x for x in self.__ecoNameIterator()]
|
||||
return indexName
|
||||
|
||||
def __ecoRankIndex(self):
|
||||
rank = [r for r in self.__ecoRankIterator()]
|
||||
return rank
|
||||
|
||||
def __ecoTaxonomyIndex(self):
|
||||
taxonomy = []
|
||||
index = {}
|
||||
i = 0;
|
||||
for x in self.__ecoTaxonomicIterator():
|
||||
taxonomy.append(x)
|
||||
index[x[0]] = i
|
||||
i = i + 1
|
||||
return taxonomy, index
|
||||
|
||||
def __readNodeTable(self):
|
||||
taxonomy, index = self.__ecoTaxonomyIndex()
|
||||
ranks = self.__ecoRankIndex()
|
||||
name = self.__ecoNameIndex()
|
||||
return taxonomy,index,ranks,name
|
||||
|
||||
|
||||
#####
|
||||
#
|
||||
# PUBLIC METHODS
|
||||
#
|
||||
#####
|
||||
|
||||
def findTaxonByTaxid(self,taxid):
|
||||
"""
|
||||
Returns a list containing [taxid,rankid,parent_index,nameLength,name]
|
||||
It takes one argument : a taxonomic id
|
||||
"""
|
||||
return self._taxonomy[self._index[taxid]]
|
||||
|
||||
|
||||
def subTreeIterator(self, taxid):
|
||||
"""
|
||||
Returns subtree for given taxid from first child
|
||||
to last child. It takes one argument : a taxonomic id
|
||||
"""
|
||||
idx = self._index[taxid]
|
||||
yield self._taxonomy[idx]
|
||||
for t in self._taxonomy:
|
||||
if t[2] == idx:
|
||||
for subt in self.subTreeIterator(t[0]):
|
||||
yield subt
|
||||
|
||||
|
||||
def parentalTreeIterator(self, taxid):
|
||||
"""
|
||||
return parental tree for given taxonomic id starting from
|
||||
first ancester to the root.
|
||||
"""
|
||||
taxon=self.findTaxonByTaxid(taxid)
|
||||
while taxon[2]!= 0:
|
||||
yield taxon
|
||||
taxon = self._taxonomy[taxon[2]]
|
||||
yield self._taxonomy[0]
|
||||
|
||||
|
||||
def ecoPCRResultIterator(self, file):
|
||||
"""
|
||||
iteration on ecoPCR result file"
|
||||
It returns a dictionnary
|
||||
"""
|
||||
file = self.__universalOpen(file)
|
||||
data = ColumnFile(file,
|
||||
sep='|',
|
||||
types=(str,int,int,
|
||||
str,int,str,
|
||||
int,str,int,
|
||||
str,int,str,
|
||||
str,str,int,
|
||||
str,int,int,
|
||||
str,str),skip='#')
|
||||
|
||||
|
||||
for ac, sq_len, taxid,\
|
||||
rank, sp_taxid, species,\
|
||||
ge_taxid, genus, fa_taxid,\
|
||||
family, sk_taxid, s_kgdom,\
|
||||
strand, oligo_1, error_1,\
|
||||
oligo_2, error_2, amp_len,\
|
||||
sq_des, definition in data:
|
||||
|
||||
yield {'ac':ac, 'sq_len':sq_len, 'taxid':taxid,
|
||||
'rank':rank, 'sp_taxid':sp_taxid, 'species':species,
|
||||
'ge_taxid':ge_taxid, 'genus':genus, 'fa_taxid':fa_taxid,
|
||||
'family':family, 'sk_taxid':sk_taxid, 's_kgdom':s_kgdom,
|
||||
'strand':strand, 'oligo_1':oligo_1, 'error_1':error_1,
|
||||
'oligo_2':oligo_2, 'error_2':error_2, 'amp_len':amp_len,
|
||||
'sq_des':sq_des, 'definition':definition}
|
||||
|
||||
def rankFilter(self,rankid,filter):
|
||||
"""
|
||||
boolean telling whether rankid match filter
|
||||
takes 2 arguments :
|
||||
1- rankid
|
||||
2- filter (i.e genus)
|
||||
"""
|
||||
return self._ranks[rankid] == filter
|
||||
|
||||
|
||||
def lastCommonTaxon(self,taxid_1, taxid_2):
|
||||
"""
|
||||
returns a last common parent for two given taxon.
|
||||
It starts from the root and goes down the tree
|
||||
until their parents diverge.
|
||||
It takes 2 arguments :
|
||||
1- taxid 1
|
||||
2- taxid 2
|
||||
"""
|
||||
t1 = [x[0] for x in self.parentalTreeIterator(taxid_1)]
|
||||
t2 = [x[0] for x in self.parentalTreeIterator(taxid_2)]
|
||||
t1.reverse()
|
||||
t2.reverse()
|
||||
count = t1 < t2 and len(t1) or len(t2)
|
||||
for i in range(count):
|
||||
if t1[i] != t2[i]:
|
||||
return t1[i-1]
|
||||
return t1[count-1]
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
class ColumnFile(object):
|
||||
"""
|
||||
cut an ecoPCR file into a list
|
||||
"""
|
||||
def __init__(self,stream,sep=None,strip=True,types=None,skip=None):
|
||||
if isinstance(stream,str):
|
||||
self._stream = open(stream)
|
||||
elif hasattr(stream,'next'):
|
||||
self._stream = stream
|
||||
else:
|
||||
raise ValueError,'stream must be string or an iterator'
|
||||
self._delimiter=sep
|
||||
self._strip=strip
|
||||
if types:
|
||||
self._types=[x for x in types]
|
||||
for i in xrange(len(self._types)):
|
||||
if self._types[i] is bool:
|
||||
self._types[i]=ColumnFile.str2bool
|
||||
else:
|
||||
self._types=None
|
||||
self._skip = skip
|
||||
self._oligo = {}
|
||||
|
||||
def str2bool(x):
|
||||
return bool(eval(x.strip()[0].upper(),{'T':True,'V':True,'F':False}))
|
||||
|
||||
str2bool = staticmethod(str2bool)
|
||||
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def next(self):
|
||||
ligne = self._stream.next()
|
||||
while ligne[0] == self._skip:
|
||||
ligne = self._stream.next()
|
||||
data = ligne.split(self._delimiter)
|
||||
if self._strip or self._types:
|
||||
data = [x.strip() for x in data]
|
||||
if self._types:
|
||||
it = self.endLessIterator(self._types)
|
||||
data = [x[1](x[0]) for x in ((y,it.next()) for y in data)]
|
||||
return data
|
||||
|
||||
def endLessIterator(self,endedlist):
|
||||
for x in endedlist:
|
||||
yield x
|
||||
while(1):
|
||||
yield endedlist[-1]
|
||||
|
||||
def getOligo(self,line):
|
||||
if line[2:8] == 'direct':
|
||||
r = re.compile('(?<=direct strand oligo1 : )[A-Z]+(?= *)')
|
||||
self._oligo['o1'] = r.findall(line)
|
||||
if line[2:9] == 'reverse':
|
||||
r = re.compile('(?<=reverse strand oligo2 : )[A-Z]+(?= *)')
|
||||
self._oligo['o2'] = r.findall(line)
|
||||
return None
|
||||
|
||||
|
||||
|
||||
|
||||
###########
|
||||
#
|
||||
# DATA STRUCTURE
|
||||
#
|
||||
###########
|
||||
|
||||
|
||||
class ecoTable(list):
|
||||
"""
|
||||
Data object inheriting from list
|
||||
"""
|
||||
def __init__(self, headers, types):
|
||||
list.__init__(self)
|
||||
self.headers = headers
|
||||
self.types = types
|
||||
|
||||
|
||||
def __setitem__ (self,key,value):
|
||||
"""
|
||||
method overloaded to check data types
|
||||
"""
|
||||
for e in range(len(value)):
|
||||
value[e] = self.types[e](value[e])
|
||||
list.__setitem__(self,key,value)
|
||||
|
||||
def __getitem__(self,index):
|
||||
newtable = ecoTable(self.headers,self.types)
|
||||
if isinstance(index,slice):
|
||||
newtable.extend(list.__getitem__(self,index))
|
||||
else:
|
||||
newtable.append(list.__getitem__(self,index))
|
||||
|
||||
return newtable
|
||||
|
||||
def getColumns(self,columnList):
|
||||
newhead = [self.headers[x] for x in columnList]
|
||||
newtype = [self.types[x] for x in columnList]
|
||||
newtable = ecoTable(newhead,newtype)
|
||||
for line in self:
|
||||
newtable.append([line[x] for x in columnList])
|
||||
|
||||
return newtable
|
||||
|
||||
|
||||
###########
|
||||
#
|
||||
# PARSE FUNCTIONS
|
||||
#
|
||||
###########
|
||||
|
||||
def _parseOligoResult(filter,file,strand):
|
||||
seq = {}
|
||||
|
||||
if strand == 'direct':
|
||||
key = 'oligo_1'
|
||||
elif strand == 'reverse':
|
||||
key = 'oligo_2'
|
||||
|
||||
for s in filter.ecoPCRResultIterator(file):
|
||||
o = s[key]
|
||||
taxid = s['taxid']
|
||||
if not seq.has_key(o):
|
||||
seq[o] = [1,taxid]
|
||||
else:
|
||||
seq[o][0] = seq[o][0] + 1
|
||||
seq[o][1] = filter.lastCommonTaxon(seq[o][1],taxid)
|
||||
return seq
|
||||
|
||||
|
||||
def _parseTaxonomyResult(table):
|
||||
tax = {}
|
||||
for l in table:
|
||||
taxid = l[2]
|
||||
scName = l[3]
|
||||
count = l[1]
|
||||
if not tax.has_key(taxid):
|
||||
tax[taxid] = [1,scName,count]
|
||||
else:
|
||||
tax[taxid][0] = tax[taxid][0] + 1
|
||||
tax[taxid][2] = tax[taxid][2] + count
|
||||
return tax
|
||||
|
||||
|
||||
def _sortTable(e1,e2):
|
||||
e1 = e1[1]
|
||||
e2 = e2[1]
|
||||
if e1 < e2:
|
||||
return 1
|
||||
if e1 > e2:
|
||||
return -1
|
||||
return 0
|
||||
|
||||
|
||||
def _countOligoMismatch(o1,o2):
|
||||
"""
|
||||
define mismatch between two oligonucleotids
|
||||
return number of mismatch
|
||||
"""
|
||||
mmatch = 0
|
||||
if len(o1) < len(o2):
|
||||
mmatch = int(len(o2) - len(o1))
|
||||
for i in range(len(o1)):
|
||||
try:
|
||||
if o1[i] != o2[i]:
|
||||
mmatch = mmatch + 1
|
||||
except:
|
||||
mmatch = mmatch + 1
|
||||
|
||||
return mmatch
|
||||
|
||||
###########
|
||||
#
|
||||
# TOOLS FUNCTIONS
|
||||
#
|
||||
###########
|
||||
|
||||
def customSort(table,x,y):
|
||||
"""
|
||||
|
||||
"""
|
||||
x = x-1
|
||||
y = y-1
|
||||
h = (table.headers[x],table.headers[y])
|
||||
t = (table.types[x],table.types[y])
|
||||
cTable = ecoTable(h,t)
|
||||
|
||||
tmp = {}
|
||||
|
||||
for l in table:
|
||||
if tmp.has_key(l[x]):
|
||||
tmp[l[x]] = tmp[l[x]] + l[y]
|
||||
else:
|
||||
tmp[l[x]] = l[y]
|
||||
|
||||
for k,v in tmp.items():
|
||||
cTable.append([k,v])
|
||||
|
||||
return cTable
|
||||
|
||||
|
||||
def countColumnOccurrence(table,x):
|
||||
x = x-1
|
||||
h = (table.headers[x],"count")
|
||||
t = (table.types[x],int)
|
||||
cTable = Table(h,t)
|
||||
|
||||
tmp = {}
|
||||
|
||||
for l in table:
|
||||
if tmp.has_key(l[x]):
|
||||
tmp[l[x]] = tmp[l[x]] + 1
|
||||
else:
|
||||
tmp[l[x]] = 1
|
||||
|
||||
for k,v in tmp.items():
|
||||
cTable.append([k,v])
|
||||
|
||||
return cTable
|
||||
|
||||
|
||||
def buildSpecificityTable(table):
|
||||
header = ("mismatch","taxon","count")
|
||||
type = (int,str,int)
|
||||
speTable = ecoTable(header,type)
|
||||
|
||||
tmp = {}
|
||||
for l in table:
|
||||
if not tmp.has_key(l[5]):
|
||||
tmp[l[5]] = {}
|
||||
if not tmp[l[5]].has_key(l[3]):
|
||||
tmp[l[5]][l[3]] = l[1]
|
||||
else:
|
||||
tmp[l[5]][l[3]] = tmp[l[5]][l[3]] + l[1]
|
||||
|
||||
for mismatch in tmp.items():
|
||||
for taxon,count in mismatch[1].items():
|
||||
speTable.append([mismatch[0],taxon,count])
|
||||
|
||||
return speTable
|
||||
|
||||
|
||||
def buildOligoTable(table, file, filter, oligoRef, strand='direct'):
|
||||
"""
|
||||
Fills and sorts a Table object with ecoPCR result file
|
||||
|
||||
Takes 4 arguments
|
||||
1- Table object
|
||||
2- ecoPCR result file path
|
||||
3- Filter Object
|
||||
4- the oligo used in ecoPCR
|
||||
5- the oligo type : direct or reverse
|
||||
|
||||
"""
|
||||
seq = _parseOligoResult(filter, file, strand)
|
||||
|
||||
i = 0
|
||||
for oligo, info in seq.items():
|
||||
table.append(0)
|
||||
count, lctTaxid = info[0], info[1]
|
||||
scName = filter.findTaxonByTaxid(info[1])[3]
|
||||
rank = filter._ranks[filter.findTaxonByTaxid(info[1])[1]]
|
||||
mismatch = _countOligoMismatch(oligoRef,oligo)
|
||||
table[i]=[oligo,count,lctTaxid,scName,rank,mismatch]
|
||||
i = i + 1
|
||||
|
||||
table.sort(_sortTable)
|
||||
|
||||
|
||||
def buildTaxonomicTable(table):
|
||||
"""
|
||||
Fill a Table object with a taxonomic synthesis
|
||||
"""
|
||||
taxHeaders = ("scName","numOfOligo","numOfAmpl","taxid")
|
||||
taxTypes = (str, int, int, int)
|
||||
taxTable = ecoTable(taxHeaders, taxTypes)
|
||||
|
||||
tax = _parseTaxonomyResult(table)
|
||||
|
||||
i = 0
|
||||
for taxid, info in tax.items():
|
||||
taxTable.append(0)
|
||||
numOfOligo, scName, numOfAmpl = info[0], info[1], info[2]
|
||||
taxTable[i]=[scName,numOfOligo,numOfAmpl,taxid]
|
||||
i = i + 1
|
||||
|
||||
taxTable.sort(_sortTable)
|
||||
|
||||
return taxTable
|
||||
|
||||
def _parseSequenceResult(filter,file,id):
|
||||
sequences = {}
|
||||
idIndex = {}
|
||||
|
||||
if id == 'family':
|
||||
key = 'fa_taxid'
|
||||
elif id == 'genus':
|
||||
key = 'ge_taxid'
|
||||
else:
|
||||
key = 'taxid'
|
||||
|
||||
for s in filter.ecoPCRResultIterator(file):
|
||||
seq = s['sq_des']
|
||||
id = s[key]
|
||||
if not idIndex.has_key(id):
|
||||
idIndex[id] = []
|
||||
if not sequences.has_key(seq):
|
||||
sequences[seq] = [id]
|
||||
else:
|
||||
sequences[seq].append(id)
|
||||
return sequences, idIndex
|
||||
|
||||
|
||||
def _sameValuesInList(array):
|
||||
for i in range(len(array)-1):
|
||||
if array[i] != array[i+1]:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _sortSequences(file,filter):
|
||||
|
||||
sequences, idIndex = _parseSequenceResult(filter,file,'species')
|
||||
|
||||
for s,id in sequences.items():
|
||||
if len(id) == 1 or _sameValuesInList(id):
|
||||
idIndex[id[0]].append(1)
|
||||
else:
|
||||
for e in id:
|
||||
idIndex[e].append(0)
|
||||
|
||||
|
||||
for id,values in idIndex.items():
|
||||
idIndex[id] = float(values.count(1)) / float(len(values)) * 100
|
||||
|
||||
|
||||
identified = {}
|
||||
non_identified = {}
|
||||
ambiguous = {}
|
||||
|
||||
return sequences, idIndex
|
||||
|
||||
def getIntraSpeciesDiversity(table,file,filter):
|
||||
|
||||
intraDiv = {}
|
||||
|
||||
seq, idIndex = _sortSequences(file,filter)
|
||||
|
||||
for id,percent in idIndex.items():
|
||||
if percent == 100:
|
||||
intraDiv[id] = [0,[]]
|
||||
for seq,idList in sequences.items():
|
||||
if id in idList:
|
||||
intraDiv[id][0] = intraDiv[id][0] + 1
|
||||
intraDiv[id][1].append(seq)
|
||||
|
||||
for id, values in intraDiv.items():
|
||||
table.append(id,values[0],values[1])
|
||||
|
||||
|
||||
|
||||
###########
|
||||
#
|
||||
# OUTPUT FUNCTIONS
|
||||
#
|
||||
###########
|
||||
|
||||
def printTable(table):
|
||||
"""
|
||||
Displays the content a of Table object
|
||||
Take 1 arguments
|
||||
1- Table object
|
||||
"""
|
||||
|
||||
format = ("%20s | " * len(table.headers))[:-3]
|
||||
print format % tuple([str(e) for e in table.headers ]) +"\n" + "-"*23*len(table.headers)
|
||||
for l in table:
|
||||
print format % tuple([str(e) for e in l ])
|
||||
print "# %d results" % len(table)
|
||||
|
||||
|
||||
def saveAsCSV(table,path):
|
||||
"""
|
||||
Creates a csv file from a Table object
|
||||
Takes 2 arguments
|
||||
1- Table object
|
||||
2- path of the file-to-be
|
||||
"""
|
||||
file = open(path,'w')
|
||||
file.write(','.join([str(e) for e in table.headers ]) + "\n")
|
||||
for l in table:
|
||||
file.write(','.join([str(e) for e in l ]) + "\n")
|
||||
file.close()
|
||||
|
||||
|
||||
def grepTable(table,col,pattern):
|
||||
"""
|
||||
Filters a Table object with regular expression
|
||||
Takes 3 arguments :
|
||||
1- Table object
|
||||
2- number of column to match with
|
||||
3- regular expression pattern
|
||||
|
||||
Returns a Table object
|
||||
"""
|
||||
col = col -1
|
||||
p = re.compile(pattern, re.IGNORECASE)
|
||||
out = ecoTable(table.headers,table.types)
|
||||
for l in table:
|
||||
if p.search(l[col]):
|
||||
out.append(l)
|
||||
return out
|
||||
|
||||
|
||||
###########
|
||||
#
|
||||
# GRAPH FUNCTIONS
|
||||
#
|
||||
###########
|
||||
|
||||
class EcoGraph(object):
|
||||
|
||||
def __init__(self):
|
||||
self._styles = getSampleStyleSheet()
|
||||
|
||||
self._element = []
|
||||
self._element.append(self._box(Paragraph("EcoPCR report", self._styles['Title'])))
|
||||
self._element.append(Spacer(0, 0.5 * cm))
|
||||
|
||||
def _box(self,flt, center=True):
|
||||
box_style = [('BOX', (0, 0), (-1, -1), 0.5, colors.lightgrey)]
|
||||
if center:
|
||||
box_style += [('ALIGN', (0, 0), (-1, -1), 'CENTER')]
|
||||
return Table([[flt]], style=box_style)
|
||||
|
||||
def _addChart(self,chart,title):
|
||||
drawing = Drawing(300, 250)
|
||||
drawing.add(chart)
|
||||
self._element.append(self._box(Paragraph(title, self._styles['Normal'])))
|
||||
self._element.append(self._box(drawing))
|
||||
self._element.append(Spacer(0, 0.5 * cm))
|
||||
|
||||
def _formatData(self,table):
|
||||
data, label = [],[]
|
||||
for i in range(len(table)):
|
||||
label.append(table[i][0])
|
||||
data.append(table[i][1])
|
||||
return data, label
|
||||
|
||||
def makePie(self, table, title):
|
||||
data, label = self._formatData(table)
|
||||
pie = Pie()
|
||||
pie.x = 100
|
||||
pie.y = 100
|
||||
pie.data = data
|
||||
pie.labels = label
|
||||
self._addChart(pie, title)
|
||||
|
||||
def makeHistogram(self, table, title):
|
||||
data, label = self._formatData(table)
|
||||
data = [tuple(data)]
|
||||
|
||||
histo = VerticalBarChart()
|
||||
histo.x = 10
|
||||
histo.y = 70
|
||||
histo.height = 150
|
||||
histo.width = 300
|
||||
histo.bars.strokeWidth = 1
|
||||
histo.barSpacing = 1
|
||||
histo.barLabels.dy = 5
|
||||
histo.barLabelFormat = '%d'
|
||||
histo.barLabels.fontSize = 9 - (len(data[0])/10)
|
||||
histo.data = data
|
||||
|
||||
histo.categoryAxis.labels.boxAnchor = 'e'
|
||||
histo.categoryAxis.labels.textAnchor = 'start'
|
||||
histo.categoryAxis.labels.dx = -40
|
||||
histo.categoryAxis.labels.dy = -50
|
||||
histo.categoryAxis.labels.angle = 45
|
||||
histo.categoryAxis.labels.width = 10
|
||||
histo.categoryAxis.labels.height = 4
|
||||
histo.categoryAxis.categoryNames = label
|
||||
histo.categoryAxis.strokeWidth = 1
|
||||
histo.categoryAxis.labels.fontSize = 8
|
||||
|
||||
histo.valueAxis.valueMin = min(data[0])*0.7
|
||||
histo.valueAxis.valueMax = max(data[0])
|
||||
step = (max(data[0]) - min(data[0])) / 10
|
||||
histo.valueAxis.valueStep = step > 1 and step or 1
|
||||
|
||||
self._addChart(histo, title)
|
||||
|
||||
def makeReport(self,path):
|
||||
doc = SimpleDocTemplate(path)
|
||||
doc.build(self._element)
|
||||
|
||||
|
||||
######################
|
||||
|
||||
|
||||
def init():
|
||||
file = "/Users/bessiere/Documents/workspace/ecoPCR/src/toto.tmp"
|
||||
oligo = {'o1':'ATGTTTAAAA','o2':'ATGGGGGTATTG'}
|
||||
|
||||
filter = Filter("/ecoPCRDB/gbmam")
|
||||
|
||||
headers = ("oligo", "Num", "LCT Taxid", "Sc Name", "Rank", "distance")
|
||||
types = (str, int, int, str, str, int)
|
||||
|
||||
o1Table = ecoTable(headers, types)
|
||||
o2Table = ecoTable(headers, types)
|
||||
|
||||
buildOligoTable(o1Table, file, filter, oligo['o1'], 'direct')
|
||||
buildOligoTable(o2Table, file, filter, oligo['o2'], 'reverse')
|
||||
|
||||
|
||||
taxTable = buildTaxonomicTable(o1Table)
|
||||
speTable = buildSpecificityTable(o1Table)
|
||||
|
||||
return o1Table, o2Table, taxTable
|
||||
|
||||
|
||||
|
||||
def start():
|
||||
file = "/Users/bessiere/Documents/workspace/ecoPCR/src/toto.tmp"
|
||||
filter = Filter("/ecoPCRDB/gbmam")
|
||||
|
||||
speHeaders = ("taxid","num of seq","list of seq")
|
||||
speTypes = (int,int,list)
|
||||
speTable = ecoTable(speHeaders,speTypes)
|
||||
|
||||
getIntraSpeciesDiversity(speTable, file, filter)
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user