Compare commits

78 commits:

d4fa8b43a4, b7552200bd, 791aeb0ef9, 1e3f82bb51, f1ef2caae6, 680cc4d7a2, 09f3e19f32, 08debff2ac,
e16e9bfa0c, 7fbea9d63f, 9a9fb383ad, 8c1668faa4, 92826de147, 73236c72a8, eb8d44528d, 2b3a331602,
a75191bbe6, 3cfbf8ba42, efb4fba78c, 66c0511f09, ea1ca1b6d9, 58a65f7ff4, 4315aecbf0, aedee0c154,
9e36754fef, 9262e954cf, 1f5a30b0df, 19887e9a46, e77c6e5339, 4c9c8382fe, 5d8103f4b4, ca1e9d8899,
dd08d73dda, e483b17e18, 396d6e2028, 3f488baa6f, fd13788289, 82d5e21471, 0f4f2a74fe, b3d6acae76,
89576b96fa, 9e6b924c92, 2737cf0606, 6f5f2a16f3, 088b5c09d0, f1f8562918, 1911880bb9, 494791d133,
79cadf0809, 419bda966d, 455bf63949, b625941d72, ba26734e9b, 50c26e81b9, e79738e170, 91753ace82,
f142d0e904, b4d8842f31, 1cae92e963, 3a617890ca, 40644bc85f, c192908469, b0521a7e15, b7c1640042,
5dc55c7f53, 04ee4e531c, b092497eaf, 6624181788, 81ada091bf, c5a430ea80, c308fb2edc, 796274d746,
c36bc5e838, 313dd59f5a, e869d6daed, 93b327285a, e3d922e103, b8af5dd65f
.cproject (364)
@@ -1,151 +1,221 @@

Most of this file is Eclipse CDT boilerplate that was regenerated wholesale; the visible changes, with unchanged configuration blocks omitted, are:

-<?xml version="1.0" encoding="UTF-8"?>
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>

-<extension id="org.eclipse.cdt.core.MakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>

-<configuration artifactName="ecoPrimers" buildProperties="" id="cdt.managedbuild.toolchain.gnu.macosx.base.2134184396" name="MacOSX GCC" parent="org.eclipse.cdt.build.core.emptycfg">
+<configuration artifactName="ecoPrimers" buildProperties="" description="" id="cdt.managedbuild.toolchain.gnu.macosx.base.2134184396" name="MacOSX GCC" parent="org.eclipse.cdt.build.core.emptycfg">

-<builder id="cdt.managedbuild.target.gnu.builder.macosx.base.783726363" managedBuildOn="false" name="Gnu Make Builder.MacOSX GCC" superClass="cdt.managedbuild.target.gnu.builder.macosx.base"/>
+<builder id="cdt.managedbuild.target.gnu.builder.macosx.base.783726363" keepEnvironmentInBuildfile="false" managedBuildOn="false" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.base"/>

 <tool id="cdt.managedbuild.tool.gnu.assembler.macosx.base.695639877" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.macosx.base">
+  <option id="gnu.both.asm.option.include.paths.1544375094" name="Include paths (-I)" superClass="gnu.both.asm.option.include.paths" valueType="includePath"/>

 <tool id="cdt.managedbuild.tool.gnu.c.compiler.macosx.base.1928774909" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.macosx.base">
+  <option id="gnu.c.compiler.option.include.paths.823251305" superClass="gnu.c.compiler.option.include.paths" valueType="includePath">
+    <listOptionValue builtIn="false" value="/usr/include"/>
+  </option>

+<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
+<storageModule moduleId="org.eclipse.cdt.core.language.mapping"/>

In the scannerConfiguration storage module, a per-tool entry is added that repeats the standard GCC scanner-discovery profiles with autodiscovery pointed at the managed per-project C profile:

+<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.macosx.base.2134184396;cdt.managedbuild.toolchain.gnu.macosx.base.2134184396.1840911077;cdt.managedbuild.tool.gnu.c.compiler.macosx.base.1928774909;cdt.managedbuild.tool.gnu.c.compiler.input.330854350">
+  <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
+  ... (GCCStandardMake*, GCCManagedMake* and GCCWinManagedMake* profiles, identical to the ones listed above) ...
+</scannerConfigBuildInfo>

while the IBM XL C/C++ discovery profiles of the old file are gone:

-<profile id="org.eclipse.cdt.managedbuilder.xlc.core.XLCManagedMakePerProjectProfile">
-  <runAction arguments="-E -v ${plugin_state_location}/${specs_file}" command="${XL_compilerRoot}/xlc" useDefault="true"/>
-<profile id="org.eclipse.cdt.managedbuilder.xlc.core.XLCManagedMakePerProjectProfileCPP">
-  <runAction arguments="-E -v ${plugin_state_location}/${specs_file}" command="${XL_compilerRoot}/xlC" useDefault="true"/>

A comment-owner mapping module is also added before the configuration closes:

+<storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>

The <project id="ecoPrimers.null.1292969001" name="ecoPrimers"/> build-system module and the remaining profile definitions are carried over unchanged.
.gitignore (15, vendored, new file)
@@ -0,0 +1,15 @@
+*.o
+/src/*.P
+/src/*.a
+/src/ecoPrimer
+/src/libecoPCR/*.P
+/src/libecoPCR/*.a
+/src/libecoPCR/*.o
+/src/libecoprimer/*.P
+/src/libecoprimer/*.a
+/src/libecoprimer/*.o
+/src/libthermo/*.P
+/src/libthermo/*.a
+/src/libthermo/*.o
+phylonorway.*
+src/ecoPrimers
.project (60)
@@ -5,45 +5,26 @@
 	<projects>
 	</projects>
 	<buildSpec>
+		<buildCommand>
+			<name>org.python.pydev.PyDevBuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
 		<buildCommand>
 			<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
 			<triggers>clean,full,incremental,</triggers>
 			<arguments>
-				<dictionary>
-					<key>org.eclipse.cdt.make.core.fullBuildTarget</key>
-					<value>all</value>
-				</dictionary>
 				<dictionary>
 					<key>?name?</key>
 					<value></value>
 				</dictionary>
-				<dictionary>
-					<key>org.eclipse.cdt.make.core.enableAutoBuild</key>
-					<value>false</value>
-				</dictionary>
-				<dictionary>
-					<key>org.eclipse.cdt.make.core.enableFullBuild</key>
-					<value>true</value>
-				</dictionary>
-				<dictionary>
-					<key>org.eclipse.cdt.make.core.enableCleanBuild</key>
-					<value>true</value>
-				</dictionary>
-				<dictionary>
-					<key>org.eclipse.cdt.make.core.cleanBuildTarget</key>
-					<value>clean</value>
-				</dictionary>
 				<dictionary>
 					<key>org.eclipse.cdt.make.core.append_environment</key>
 					<value>true</value>
 				</dictionary>
 				<dictionary>
-					<key>org.eclipse.cdt.make.core.contents</key>
-					<value>org.eclipse.cdt.make.core.activeConfigSettings</value>
-				</dictionary>
-				<dictionary>
-					<key>org.eclipse.cdt.make.core.useDefaultBuildCmd</key>
-					<value>true</value>
+					<key>org.eclipse.cdt.make.core.autoBuildTarget</key>
+					<value>all</value>
 				</dictionary>
 				<dictionary>
 					<key>org.eclipse.cdt.make.core.buildArguments</key>
@@ -54,13 +35,37 @@
 					<value>make</value>
 				</dictionary>
 				<dictionary>
-					<key>org.eclipse.cdt.make.core.autoBuildTarget</key>
+					<key>org.eclipse.cdt.make.core.cleanBuildTarget</key>
+					<value>clean</value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.contents</key>
+					<value>org.eclipse.cdt.make.core.activeConfigSettings</value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.enableAutoBuild</key>
+					<value>false</value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.enableCleanBuild</key>
+					<value>true</value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.enableFullBuild</key>
+					<value>true</value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.fullBuildTarget</key>
 					<value>all</value>
 				</dictionary>
 				<dictionary>
 					<key>org.eclipse.cdt.make.core.stopOnError</key>
 					<value>true</value>
 				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.useDefaultBuildCmd</key>
+					<value>true</value>
+				</dictionary>
 			</arguments>
 		</buildCommand>
 		<buildCommand>
@@ -73,5 +78,6 @@
 		<nature>org.eclipse.cdt.core.cnature</nature>
 		<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
 		<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
+		<nature>org.python.pydev.pythonNature</nature>
 	</natures>
 </projectDescription>
.pydevproject (7, new file)
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?eclipse-pydev version="1.0"?>
+
+<pydev_project>
+<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Python 2.6</pydev_property>
+<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.6</pydev_property>
+</pydev_project>
src/Makefile (33)
@@ -1,4 +1,4 @@
-EXEC=ecoPrimer
+EXEC=ecoPrimers
 
 PRIMER_SRC= ecoprimer.c
 PRIMER_OBJ= $(patsubst %.c,%.o,$(PRIMER_SRC))
@@ -6,10 +6,12 @@ PRIMER_OBJ= $(patsubst %.c,%.o,$(PRIMER_SRC))
 
 SRCS= $(PRIMER_SRC)
 
-LIB= -lecoPCR -lecoprimer -lz -lm
+LIB= -lecoprimer -lecoPCR -lthermo -lz -lm
 
 LIBFILE= libecoPCR/libecoPCR.a \
-         libecoprimer/libecoprimer.a
+         libecoprimer/libecoprimer.a \
+         libthermo/libthermo.a \
 
 
 include global.mk
@@ -25,8 +27,8 @@ all: $(EXEC)
 
 # executable compilation and link
 
-ecoPrimer: $(PRIMER_OBJ) $(LIBFILE)
-	$(CC) $(LDFLAGS) -O5 -m64 -fast -o $@ $< $(LIBPATH) $(LIB)
+ecoPrimers: $(PRIMER_OBJ) $(LIBFILE)
+	$(CC) $(LDFLAGS) -O5 -o $@ $< $(LIBPATH) $(LIB)
 
 
 ########
@@ -41,6 +43,8 @@ libecoPCR/libecoPCR.a:
 libecoprimer/libecoprimer.a:
 	$(MAKE) -C libecoprimer
 
+libthermo/libthermo.a:
+	$(MAKE) -C libthermo
 
 ########
 #
@@ -50,9 +54,26 @@ libecoprimer/libecoprimer.a:
 
 clean:
 	rm -f *.o
+	rm -f *.P
 	rm -f $(EXEC)
 	$(MAKE) -C libecoPCR clean
 	$(MAKE) -C libecoprimer clean
+	$(MAKE) -C libthermo clean
 
 
 
+########
+#
+# clean for k2 to remove .o and .P files
+#
+########
+
+k2clean:
+	rm -f *.o
+	rm -f *.P
+	rm -f libecoPCR/*.o
+	rm -f libecoPCR/*.P
+	rm -f libecoprimer/*.o
+	rm -f libecoprimer/*.P
+	rm -f libthermo/*.o
+	rm -f libthermo/*.P
src/ecoPrimer (BIN): binary file not shown
src/ecoprimer.c (899): file diff suppressed because it is too large
@@ -1,9 +1,10 @@
-MACHINE=MAC_OS_X
+MACHINE=LINUX
-LIBPATH= -Llibapat -LlibecoPCR -Llibecoprimer
+LIBPATH= -LlibecoPCR -Llibecoprimer -Llibthermo -L/usr/local/lib
 MAKEDEPEND = gcc -D$(MACHINE) -M $(CPPFLAGS) -o $*.d $<
 
 CC=gcc
-CFLAGS= -W -Wall -O5 -m64 -fast -g
+CFLAGS= -W -Wall -O5
+#CFLAGS= -W -Wall -O5 -m64 -g
 #CFLAGS= -W -Wall -O0 -m64 -g
 #CFLAGS= -W -Wall -O5 -fast -g
 
@@ -17,4 +18,4 @@ default: all
 	@sed 's/\($*\)\.o[ :]*/\1.o $@ : /g' < $*.d > $@; \
 	rm -f $*.d; [ -s $@ ] || rm -f $@
 
 include $(SRCS:.c=.P)
@@ -2,7 +2,7 @@
 #include <stdlib.h>
 
 static int eco_log_malloc = 0;
-static size_t eco_amount_malloc=0;
+//static size_t eco_amount_malloc=0;
 static size_t eco_chunk_malloc=0;
 
 void eco_trace_memory_allocation()
@@ -17,10 +17,10 @@ void eco_untrace_memory_allocation()
 
 void ecoMallocedMemory()
 {
-	return eco_amount_malloc;
+	//eco_amount_malloc;
 }
 
-void *eco_malloc(int32_t chunksize,
+void *eco_malloc(int64_t chunksize,
                  const char *error_message,
                  const char *filename,
                  int32_t line)
@@ -37,7 +37,7 @@ void *eco_malloc(int32_t chunksize,
 
 	if (eco_log_malloc)
 		fprintf(stderr,
-				"Memory segment located at %p of size %d is allocated (file : %s [%d])",
+				"Memory segment located at %p of size %lld is allocated (file : %s [%d])",
 				chunk,
 				chunksize,
 				filename,
@@ -47,25 +47,33 @@ void *eco_malloc(int32_t chunksize,
 }
 
 void *eco_realloc(void *chunk,
-                  int32_t newsize,
+                  int64_t newsize,
                   const char *error_message,
                   const char *filename,
                   int32_t line)
 {
 	void *newchunk;
 
+	if (newsize == 0)
+	{
+		if (chunk)
+			free(chunk);
+		return NULL;
+	}
+
 	newchunk = realloc(chunk,newsize);
 
 	if (!newchunk)
+	{
+		fprintf(stderr,"Requested memory : %lld\n",newsize);
 		ecoError(ECO_MEM_ERROR,error_message,filename,line);
+	}
 	if (!chunk)
 		eco_chunk_malloc++;
 
 	if (eco_log_malloc)
 		fprintf(stderr,
-				"Old memory segment %p is reallocated at %p with a size of %d (file : %s [%d])",
+				"Old memory segment %p is reallocated at %p with a size of %lld (file : %s [%d])",
 				chunk,
 				newchunk,
 				newsize,
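The zero-size branch added to eco_realloc makes the "shrink to nothing" case explicit instead of relying on realloc(p, 0), and widening the size arguments from int32_t to int64_t allows allocation requests above 2 GB. A minimal standalone sketch of the same convention (xrealloc is an illustrative name, not part of the library, and exit() stands in for the library's ecoError() call):

```c
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Grow, shrink or release a buffer; size 0 frees and returns NULL,
 * mirroring the behaviour the patch adds to eco_realloc(). */
static void *xrealloc(void *chunk, int64_t newsize)
{
    void *newchunk;

    if (newsize == 0) {            /* explicit free instead of realloc(p, 0) */
        free(chunk);
        return NULL;
    }

    newchunk = realloc(chunk, (size_t)newsize);
    if (!newchunk) {
        fprintf(stderr, "Requested memory : %lld\n", (long long)newsize);
        exit(EXIT_FAILURE);        /* the library reports through ecoError() here */
    }
    return newchunk;
}

int main(void)
{
    int64_t *v = xrealloc(NULL, 4 * sizeof *v);   /* first call behaves like malloc */
    v[0] = 42;
    v = xrealloc(v, 8 * sizeof *v);               /* grow, contents preserved */
    v = xrealloc(v, 0);                           /* release; v is now NULL */
    return v == NULL ? 0 : 1;
}
```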
|
@ -21,8 +21,8 @@ typedef struct {
|
|||||||
int32_t taxid;
|
int32_t taxid;
|
||||||
char AC[20];
|
char AC[20];
|
||||||
int32_t DE_length;
|
int32_t DE_length;
|
||||||
int32_t SQ_length;
|
uint32_t SQ_length;
|
||||||
int32_t CSQ_length; /*what is this CSQ_length ? */
|
uint32_t CSQ_length; /*what is this CSQ_length ? */
|
||||||
|
|
||||||
char data[1];
|
char data[1];
|
||||||
|
|
||||||
@ -30,7 +30,7 @@ typedef struct {
|
|||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int32_t taxid;
|
int32_t taxid;
|
||||||
int32_t SQ_length;
|
uint32_t SQ_length;
|
||||||
int32_t isexample;
|
int32_t isexample;
|
||||||
char *AC;
|
char *AC;
|
||||||
char *DE;
|
char *DE;
|
||||||
@ -130,14 +130,14 @@ typedef struct {
|
|||||||
int32_t is_big_endian();
|
int32_t is_big_endian();
|
||||||
int32_t swap_int32_t(int32_t);
|
int32_t swap_int32_t(int32_t);
|
||||||
|
|
||||||
void *eco_malloc(int32_t chunksize,
|
void *eco_malloc(int64_t chunksize,
|
||||||
const char *error_message,
|
const char *error_message,
|
||||||
const char *filename,
|
const char *filename,
|
||||||
int32_t line);
|
int32_t line);
|
||||||
|
|
||||||
|
|
||||||
void *eco_realloc(void *chunk,
|
void *eco_realloc(void *chunk,
|
||||||
int32_t chunksize,
|
int64_t chunksize,
|
||||||
const char *error_message,
|
const char *error_message,
|
||||||
const char *filename,
|
const char *filename,
|
||||||
int32_t line);
|
int32_t line);
|
||||||
@ -219,7 +219,7 @@ econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy);
|
|||||||
* @return pointer to a taxonomy index structure
|
* @return pointer to a taxonomy index structure
|
||||||
*/
|
*/
|
||||||
|
|
||||||
ecotxidx_t *read_taxonomyidx(const char *filename);
|
ecotxidx_t *read_taxonomyidx(const char *filename,const char *filename2);
|
||||||
|
|
||||||
ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName);
|
ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName);
|
||||||
|
|
||||||
|
@@ -16,4 +16,5 @@ int eco_is_taxid_included( ecotaxonomy_t *taxonomy,
 			return 1;
 
 	return 0;
 }
+
@@ -12,14 +12,14 @@ ecorankidx_t *read_rankidx(const char *filename)
 	int32_t i;
 	int32_t rs;
 	char    *buffer;
 
 	f = open_ecorecorddb(filename,&count,1);
 
 	index = (ecorankidx_t*) ECOMALLOC(sizeof(ecorankidx_t) + sizeof(char*) * (count-1),
 	                                  "Allocate rank index");
 
 	index->count=count;
 
 	for (i=0; i < count; i++)
 	{
 		buffer = read_ecorecord(f,&rs);
@@ -27,21 +27,18 @@ ecorankidx_t *read_rankidx(const char *filename)
 		                           "Allocate rank label");
 		strncpy(index->label[i],buffer,rs);
 	}
 
 	return index;
 }
 
 int32_t rank_index(const char* label,ecorankidx_t* ranks)
 {
 	char **rep;
+	fprintf(stderr,"Looking for rank -%s-... ",label);
 	rep = bsearch(label,ranks->label,ranks->count,sizeof(char*),compareRankLabel);
 
 	if (rep)
 		return rep-ranks->label;
-	else
-		ECOERROR(ECO_NOTFOUND_ERROR,"Rank label not found");
 
 	return -1;
 }
 
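With this change rank_index() reports a missing rank by returning -1 instead of aborting through ECOERROR; the eco_getsuperkingdom() change further down relies on that to fall back from "superkingdom" to "domain". A small self-contained sketch of the same bsearch-over-labels pattern (the comparator is assumed to behave like the library's compareRankLabel; the other names here are illustrative):

```c
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/* bsearch() hands the comparator a pointer to the key and a pointer to an
 * array element; the elements are char*, so the second argument is a char**. */
static int cmp_label(const void *key, const void *element)
{
    return strcmp((const char *)key, *(const char *const *)element);
}

/* Return the index of label in a sorted label array, or -1 when absent. */
static int rank_index(const char *label, char **labels, size_t count)
{
    char **hit = bsearch(label, labels, count, sizeof *labels, cmp_label);
    return hit ? (int)(hit - labels) : -1;
}

int main(void)
{
    char *ranks[] = { "class", "family", "genus", "species" };  /* must be sorted */
    printf("%d\n", rank_index("genus", ranks, 4));    /* prints 2  */
    printf("%d\n", rank_index("domain", ranks, 4));   /* prints -1 */
    return 0;
}
```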
@@ -2,6 +2,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <zlib.h>
+#include <ctype.h>
 #include <string.h>
 #include <stdio.h>
 
@@ -100,6 +101,8 @@ ecoseq_t *readnext_ecoseq(FILE *f)
 	int32_t  comp_status;
 	unsigned long int seqlength;
 	int32_t  rs;
+	char     *c;
+	int32_t  i;
 
 	raw = read_ecorecord(f,&rs);
 
@@ -144,6 +147,10 @@ ecoseq_t *readnext_ecoseq(FILE *f)
 	if (comp_status != Z_OK)
 		ECOERROR(ECO_IO_ERROR,"I cannot uncompress sequence data");
 
+	for (c=seq->SQ,i=0;i<seqlength;c++,i++)
+		*c=toupper(*c);
+
+	// fprintf(stderr,"seq name : %30s seq size : %d\n",seq->DE,seq->SQ_length);
 	return seq;
 }
 
@@ -188,7 +195,7 @@ ecoseq_t *ecoseq_iterator(const char *prefix)
 {
 	static FILE    *current_seq_file= NULL;
 	static int32_t current_file_idx = 1;
-	static char    current_prefix[1024];
+	static char    current_prefix[1025];
 	ecoseq_t       *seq;
 
 	if (prefix)
|
|||||||
* @param pointer to the database (.tdx file)
|
* @param pointer to the database (.tdx file)
|
||||||
* @return a ecotxidx_t structure
|
* @return a ecotxidx_t structure
|
||||||
*/
|
*/
|
||||||
ecotxidx_t *read_taxonomyidx(const char *filename)
|
ecotxidx_t *read_taxonomyidx(const char *filename,const char *filename2)
|
||||||
{
|
{
|
||||||
int32_t count;
|
int32_t count;
|
||||||
|
int32_t count2;
|
||||||
FILE *f;
|
FILE *f;
|
||||||
|
FILE *f2;
|
||||||
ecotxidx_t *index;
|
ecotxidx_t *index;
|
||||||
int32_t i;
|
int32_t i;
|
||||||
|
|
||||||
f = open_ecorecorddb(filename,&count,1);
|
f = open_ecorecorddb(filename,&count,1);
|
||||||
|
f2 = open_ecorecorddb(filename2,&count2,0);
|
||||||
|
|
||||||
index = (ecotxidx_t*) ECOMALLOC(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count-1),
|
|
||||||
|
|
||||||
|
index = (ecotxidx_t*) ECOMALLOC(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count+count2-1),
|
||||||
"Allocate taxonomy");
|
"Allocate taxonomy");
|
||||||
|
|
||||||
index->count=count;
|
index->count=count+count2;
|
||||||
|
|
||||||
|
fprintf(stderr,"Reading %d taxa...\n",count);
|
||||||
for (i=0; i < count; i++){
|
for (i=0; i < count; i++){
|
||||||
readnext_ecotaxon(f,&(index->taxon[i]));
|
readnext_ecotaxon(f,&(index->taxon[i]));
|
||||||
index->taxon[i].parent=index->taxon + (size_t)index->taxon[i].parent;
|
index->taxon[i].parent=index->taxon + (int32_t)index->taxon[i].parent;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (count2>0)
|
||||||
|
fprintf(stderr,"Reading %d local taxa...\n",count2);
|
||||||
|
else
|
||||||
|
fprintf(stderr,"No local taxon\n");
|
||||||
|
|
||||||
|
for (i=0; i < count2; i++){
|
||||||
|
readnext_ecotaxon(f2,&(index->taxon[count+i]));
|
||||||
|
index->taxon[count+i].parent=index->taxon + (int32_t)index->taxon[count+i].parent;
|
||||||
|
}
|
||||||
|
|
||||||
return index;
|
return index;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -94,7 +112,7 @@ ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon)
|
|||||||
raw->taxid = swap_int32_t(raw->taxid);
|
raw->taxid = swap_int32_t(raw->taxid);
|
||||||
}
|
}
|
||||||
|
|
||||||
taxon->parent = (ecotx_t*)(size_t)raw->parent;
|
taxon->parent = (ecotx_t*)raw->parent;
|
||||||
taxon->taxid = raw->taxid;
|
taxon->taxid = raw->taxid;
|
||||||
taxon->rank = raw->rank;
|
taxon->rank = raw->rank;
|
||||||
|
|
||||||
@ -111,6 +129,7 @@ ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName)
|
|||||||
{
|
{
|
||||||
ecotaxonomy_t *tax;
|
ecotaxonomy_t *tax;
|
||||||
char *filename;
|
char *filename;
|
||||||
|
char *filename2;
|
||||||
int buffsize;
|
int buffsize;
|
||||||
|
|
||||||
tax = ECOMALLOC(sizeof(ecotaxonomy_t),
|
tax = ECOMALLOC(sizeof(ecotaxonomy_t),
|
||||||
@ -120,14 +139,17 @@ ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName)
|
|||||||
|
|
||||||
filename = ECOMALLOC(buffsize,
|
filename = ECOMALLOC(buffsize,
|
||||||
"Allocate filename");
|
"Allocate filename");
|
||||||
|
filename2= ECOMALLOC(buffsize,
|
||||||
|
"Allocate filename");
|
||||||
|
|
||||||
snprintf(filename,buffsize,"%s.rdx",prefix);
|
snprintf(filename,buffsize,"%s.rdx",prefix);
|
||||||
|
|
||||||
tax->ranks = read_rankidx(filename);
|
tax->ranks = read_rankidx(filename);
|
||||||
|
|
||||||
snprintf(filename,buffsize,"%s.tdx",prefix);
|
snprintf(filename,buffsize,"%s.tdx",prefix);
|
||||||
|
snprintf(filename2,buffsize,"%s.ldx",prefix);
|
||||||
|
|
||||||
tax->taxons = read_taxonomyidx(filename);
|
tax->taxons = read_taxonomyidx(filename,filename2);
|
||||||
|
|
||||||
if (readAlternativeName)
|
if (readAlternativeName)
|
||||||
{
|
{
|
||||||
@ -319,6 +341,9 @@ ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,
|
|||||||
if (taxonomy && tax!=taxonomy)
|
if (taxonomy && tax!=taxonomy)
|
||||||
{
|
{
|
||||||
rankindex = rank_index("superkingdom",taxonomy->ranks);
|
rankindex = rank_index("superkingdom",taxonomy->ranks);
|
||||||
|
if (rankindex < 0) {
|
||||||
|
rankindex = rank_index("domain",taxonomy->ranks);
|
||||||
|
}
|
||||||
tax=taxonomy;
|
tax=taxonomy;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -10,8 +10,13 @@ SOURCES = goodtaxon.c \
|
|||||||
queue.c \
|
queue.c \
|
||||||
libstki.c \
|
libstki.c \
|
||||||
sortmatch.c \
|
sortmatch.c \
|
||||||
|
pairtree.c \
|
||||||
pairs.c \
|
pairs.c \
|
||||||
apat_search.c
|
taxstats.c \
|
||||||
|
apat_search.c \
|
||||||
|
filtering.c \
|
||||||
|
PrimerSets.c \
|
||||||
|
ahocorasick.c
|
||||||
|
|
||||||
SRCS=$(SOURCES)
|
SRCS=$(SOURCES)
|
||||||
|
|
||||||
|
src/libecoprimer/PrimerSets.c (1772, new file): diff suppressed because it is too large

src/libecoprimer/PrimerSets.h (58, new file)
@@ -0,0 +1,58 @@
+#ifndef PRIMERSETS_H_
+#define PRIMERSETS_H_
+
+#include "ecoprimer.h"
+
+#define PRIMERS_IN_SET_COUNT 10
+
+typedef struct {
+	int     *set_wellIdentifiedTaxa;
+	int32_t set_pairs[PRIMERS_IN_SET_COUNT];
+	float   set_specificity;
+	float   set_coverage;
+	float   set_lmean;
+	float   set_lcov;
+	float   set_score;
+	int32_t set_intaxa;
+	int32_t set_wi_cnt;
+}pairset;
+
+typedef struct{
+	ppair_t     *sortedpairs;
+	int32_t     sorted_count;
+	pecodnadb_t seqdb;
+	poptions_t  options;
+}SetParams;
+
+typedef struct{
+	float t_spc;   //specificity contribution
+	float t_cov;   //coverage contribution
+	float t_lmd;   //link spread difference
+	float len;     //length
+	float score;   //score
+}primerscore;
+
+void add_pair_in_set (pairset *pair_set, int32_t pset_idx, int32_t prb_idx, SetParams *pparams);
+void get_next_pair_options (int *pair_wi_count_sorted_ids, pairset *pair_set, SetParams *pparams);
+float get_links_distribution (int prb_idx, pairset *prob_set, SetParams *pparams);
+pairset build_primers_set_greedy_spc (SetParams *pparams);
+void get_set_mean_cov_stats (pairset *prob_set, SetParams *pparams);
+void some_other_set_possibilities (pairset *pair_set,
+		ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options);
+void sets_by_SimulatedAnealing (pairset *pair_set,
+		ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options);
+void sets_by_TabuSearch (pairset *pair_set,
+		ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options);
+pairset * sets_by_BruteForce (ppair_t * sortedpairs,
+		int32_t sorted_count, pecodnadb_t seqdb, poptions_t options);
+pairset * extend_set_randomly (pairset *pair_set, SetParams *params, int extend_to_cnt);
+void build_and_print_sets (ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options);
+int32_t get_next_option_increasing_cov (pairset *pair_set, SetParams *pparams);
+void reset_set_props (pairset *pair_set, SetParams *pparams);
+void primers_graph_graphviz (ppair_t * sortedpairs,
+		int32_t sorted_count, poptions_t options);
+size_t primers_changeSortedArray (ppair_t ** pairs,
+		size_t sorted_count, poptions_t options);
+size_t primers_filterWithGivenLinks (ppair_t ** pairs,
+		size_t sorted_count, poptions_t options);
+
+#endif
src/libecoprimer/ahocorasick.c (478, new executable file)
@@ -0,0 +1,478 @@

/*
 * ahocorasick.c
 *
 *  Created on: 26 March 2011
 *      Author: tiayyba
 */
#include <inttypes.h>
#include "hashencoder.h"
#include "ahocorasick.h"

void ahoc_graphKeywordTree (aho_state *root);
aho_state *groot = NULL; //just for graph testing

#define BASEATINDEX(w, l, i) (uint8_t)((((w)&(0x3LLU<<(((l)-(i))*2)))>>(((l)-(i))*2)) & 0x3LLU)

void ahoc_addOutputElement (aho_state *node, bool_t isdirect, uint32_t idx)
{
	if (!node) return;
	if (node->output.count == 0)
		node->output.out_set = ECOMALLOC(sizeof(aho_output),
				"Cannot allocate memory for aho-corasick state output element");
	else
		node->output.out_set = ECOREALLOC(node->output.out_set, (node->output.count+1)*sizeof(aho_output),
				"Cannot allocate memory for aho-corasick state output element");
	node->output.out_set[node->output.count].wordidx = idx;
	node->output.out_set[node->output.count].isdirect = isdirect;
	node->output.count++;
}

//is the passed output element in the set?
bool_t ahoc_isOutputIn (aho_state *node, aho_output ot)
{
	uint32_t i;

	for (i=0; i<node->output.count; i++)
		if (node->output.out_set[i].isdirect == ot.isdirect && node->output.out_set[i].wordidx == ot.wordidx) return TRUE;
	return FALSE;
}

//take the union of the outputs of the two nodes and put it in node1
void ahoc_unionOutputElements (aho_state *node1, aho_state *node2)
{
	uint32_t i;

	for (i=0; i<node2->output.count; i++)
		if (ahoc_isOutputIn (node1, node2->output.out_set[i]) == FALSE)
			ahoc_addOutputElement (node1, node2->output.out_set[i].isdirect, node2->output.out_set[i].wordidx);
}

void ahoc_addKeyword (aho_state *root, word_t w, bool_t isdirect, uint32_t idx, poptions_t options)
{
	uint32_t i;
	aho_state *nextnode = root;
	uint8_t basecode;
	static uint32_t state_id = 0;

	//fprintf (stderr, "%s\n", ecoUnhashWord(w, options->primer_length));
	for (i=1; i<=options->primer_length; i++)
	{
		basecode = BASEATINDEX (w, options->primer_length, i);
		//fprintf (stderr, "%d", basecode);
		if (nextnode->next[basecode] == NULL)
		{
			//add new state
			nextnode->next[basecode] = ECOMALLOC(sizeof(aho_state),
					"Cannot allocate memory for aho-corasick state");
			nextnode = nextnode->next[basecode];
			//initialize state
			nextnode->id = ++state_id;
			nextnode->next[0]=nextnode->next[1]=nextnode->next[2]=nextnode->next[3]=NULL;
			nextnode->fail = NULL;
			nextnode->output.count = 0;
		}
		else
			nextnode = nextnode->next[basecode];
	}
	//fprintf (stderr, "\n", basecode);
	//new pattern added, so record it in the node's output set
	ahoc_addOutputElement (nextnode, isdirect, idx);
}

void ahoc_buildKeywordTree (aho_state *root, pwordcount_t words, poptions_t options)
{
	uint32_t i;
	if (!root) return;

	//init root
	root->id = 0;
	root->next[0]=root->next[1]=root->next[2]=root->next[3]=NULL;
	root->fail = NULL;
	root->output.count = 0;

	//now add each word as a pattern in the keyword tree
	for (i=0; i<words->size; i++)
	{
		//add direct word
		word_t w=WORD(words->words[i]);
		ahoc_addKeyword (root, w, TRUE, i, options);

		//add reverse word
		w=ecoComplementWord(w,options->primer_length);
		ahoc_addKeyword (root, w, FALSE, i, options);
	}

	//loop on root if some base has no outgoing edge from the root
	for (i=0; i<4; i++)
		if (root->next[i] == NULL)
			root->next[i] = root;
}

void ahoc_enqueue (aho_queue *ahoqueue, aho_state *node)
{
	queue_node *q;
	if (node == NULL) return;

	q = ECOMALLOC(sizeof(queue_node),
			"Cannot allocate memory for aho-corasick queue node");
	q->state_node = node;
	q->next = NULL;

	if (ahoqueue->first == NULL)
	{
		ahoqueue->first = q;
		ahoqueue->last = q;
	}
	else
	{
		ahoqueue->last->next = q;
		ahoqueue->last = q;
	}
}

aho_state *ahoc_dequeue (aho_queue *ahoqueue)
{
	aho_state *node = NULL;
	queue_node *q;

	if (ahoqueue->first == NULL) return node;
	q = ahoqueue->first;
	ahoqueue->first = q->next;

	node = q->state_node;
	ECOFREE (q, "Cannot free memory for aho-corasick queue node");
	return node;
}

//set fail links and output sets for the keyword tree
void ahoc_updateForFailAndOutput (aho_state *root)
{
	int32_t i;
	aho_queue Q;
	aho_state *node_r;
	aho_state *node_u;
	aho_state *node_v;

	//empty queue
	Q.first = NULL;
	Q.last = NULL;

	//for us the alphabet has 4 elements, A=0, C=1, G=2 and T=3
	for (i=0; i<4; i++)
	{
		if (root->next[i] != root && root->next[i] != NULL)
		{
			root->next[i]->fail = root;
			ahoc_enqueue (&Q, root->next[i]);
		}
	}

	//while queue not empty
	while (Q.first != NULL)
	{
		node_r = ahoc_dequeue (&Q);
		for (i=0; i<4; i++)
		{
			if (node_r->next[i] != NULL)
			{
				node_u = node_r->next[i];
				ahoc_enqueue (&Q, node_u);
				node_v = node_r->fail;
				while (node_v->next[i] == NULL)
					node_v = node_v->fail;
				node_u->fail = node_v->next[i];
				ahoc_unionOutputElements (node_u, node_u->fail);
			}
		}
	}
}

void ahoc_freeKeywordTree (aho_state *node)
{
	int i;
	for (i=0; i<4; i++)
		if (node->next[i])
			ahoc_freeKeywordTree (node->next[i]);
	if (node->output.count > 0)
		ECOFREE (node->output.out_set, "Free failed for node output");
	ECOFREE (node, "Free failed for node");
}

pprimercount_t ahoc_lookforStrictPrimers (pecodnadb_t database, uint32_t seqdbsize,uint32_t exampleCount,
		pwordcount_t words,poptions_t options)
{
	aho_state automaton_root;
	aho_state *curr_state;
	//uint32_t inSequenceQuorum;
	uint32_t outSequenceQuorum;
	pprimer_t data;
	pprimercount_t primers;
	uint32_t i, j, k;
	int32_t pos;
	uint32_t lmax;
	char *base;
	int8_t code;
	uint32_t goodPrimers=0;


	//inSequenceQuorum = (uint32_t)floor((float)exampleCount * options->sensitivity_quorum);
	outSequenceQuorum = (uint32_t)floor((float)(seqdbsize-exampleCount) * options->false_positive_quorum);

	//fprintf(stderr," Primers should be at least present in %d/%d example sequences\n",inSequenceQuorum,exampleCount);
	fprintf(stderr," Primers should not be present in more than %d/%d counterexample sequences\n",outSequenceQuorum,(seqdbsize-exampleCount));

	data = ECOMALLOC(words->size * sizeof(primer_t),
			"Cannot allocate memory for fuzzy matching results");
	for (i=0; i < words->size; i++)
	{
		data[i].word=WORD(words->words[i]);
		data[i].inexample = 0;
		data[i].outexample= 0;

		data[i].directCount=ECOMALLOC(seqdbsize * sizeof(uint32_t),
				"Cannot allocate memory for primer position");
		data[i].directPos = ECOMALLOC(seqdbsize * sizeof(poslist_t),
				"Cannot allocate memory for primer position");
		data[i].reverseCount=ECOMALLOC(seqdbsize * sizeof(uint32_t),
				"Cannot allocate memory for primer position");
		data[i].reversePos = ECOMALLOC(seqdbsize * sizeof(poslist_t),
				"Cannot allocate memory for primer position");
	}

	//build keywords automaton
	ahoc_buildKeywordTree (&automaton_root, words, options);
	//set fail links and output sets
	ahoc_updateForFailAndOutput (&automaton_root);

	//debug; print keywordtree in a gv file
	//ahoc_graphKeywordTree (&automaton_root);

	//loop on each sequence, for each of its bases, and find words
	for (i=0; i < seqdbsize; i++)
	{
		if(database[i]->SQ_length <= options->primer_length) continue;

		lmax = database[i]->SQ_length;
		if (!options->circular)
			lmax += options->primer_length-1;
		curr_state = &automaton_root;

		for (j=0,base=database[i]->SQ; j<lmax; j++,base++)
		{
			if (i==(uint32_t)database[i]->SQ_length) base=database[i]->SQ;

			//code = encoder[(*base) - 'A'];
			code = *base;
			//if (iii++ < 30)
			//	fprintf (stderr, "%d:%d,", *base, code);
			if (code < 0 || code > 3)
			{
				//if error char, start from root for next character
				//+forget any incomplete words
				curr_state = &automaton_root;
				continue;
			}
			while (curr_state->next[code] == NULL) curr_state = curr_state->fail;
			curr_state = curr_state->next[code];

			//start position of primer is options->primer_length-1 chars back
			pos = j-options->primer_length+1;
			if (pos < 0) pos = database[i]->SQ_length+pos;

			//set output: if there is some output on this state then
			//+all words in the output set complete here, so increment their
			//+found properties for the current sequence
			for (k=0; k<curr_state->output.count; k++)
			{
				if (curr_state->output.out_set[k].isdirect)
					data[curr_state->output.out_set[k].wordidx].directCount[i]++;
				else
					data[curr_state->output.out_set[k].wordidx].reverseCount[i]++;

				if (options->no_multi_match)
				{
					if ((data[curr_state->output.out_set[k].wordidx].directCount[i] +
							data[curr_state->output.out_set[k].wordidx].reverseCount[i]) > 1)
						//since multi-match is not allowed, set an indication on the 1st seq position that
						//+ a multi-match was found, so that this word will be filtered out,
						//+ and because of the first position we won't have to search the whole array
						//+ to find whether it violated the no-multi-match constraint for some seq
						data[curr_state->output.out_set[k].wordidx].directCount[0] = 2;
					else
					{
						if (curr_state->output.out_set[k].isdirect)
							//direct word found at the jth position of the ith sequence
							data[curr_state->output.out_set[k].wordidx].directPos[i].value = (uint32_t)pos;
						else
							//reverse word found at the jth position of the ith sequence
							data[curr_state->output.out_set[k].wordidx].reversePos[i].value = (uint32_t)pos;
					}
				}
				else
				{
					//okay, multi-match allowed
					if (curr_state->output.out_set[k].isdirect)
					{
						if (data[curr_state->output.out_set[k].wordidx].directCount[i] == 1)
							data[curr_state->output.out_set[k].wordidx].directPos[i].value = (uint32_t)pos;
						else
						{
							//need to create or extend the positions list
							if (data[curr_state->output.out_set[k].wordidx].directCount[i] == 2)
|
||||||
|
{
|
||||||
|
//for second element, first was put in .value, so dont forget to copy that in the array too
|
||||||
|
data[curr_state->output.out_set[k].wordidx].directPos[i].pointer = ECOMALLOC(2 * sizeof(uint32_t),
|
||||||
|
"Cannot allocate memory for primer position");
|
||||||
|
data[curr_state->output.out_set[k].wordidx].directPos[i].pointer[0] = data[curr_state->output.out_set[k].wordidx].directPos[i].value;
|
||||||
|
data[curr_state->output.out_set[k].wordidx].directPos[i].pointer[1] = (uint32_t)pos;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
//for third or greater element
|
||||||
|
data[curr_state->output.out_set[k].wordidx].directPos[i].pointer = ECOREALLOC(data[curr_state->output.out_set[k].wordidx].directPos[i].pointer,
|
||||||
|
data[curr_state->output.out_set[k].wordidx].directCount[i] * sizeof(uint32_t),
|
||||||
|
"Cannot allocate memory for primer position");
|
||||||
|
data[curr_state->output.out_set[k].wordidx].directPos[i].pointer[data[curr_state->output.out_set[k].wordidx].directCount[i]-1] = (uint32_t)pos;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (data[curr_state->output.out_set[k].wordidx].reverseCount[i] == 1)
|
||||||
|
data[curr_state->output.out_set[k].wordidx].reversePos[i].value = (uint32_t)pos;
|
||||||
|
else
|
||||||
|
{
|
||||||
|
//need to create or extend the positions list
|
||||||
|
if (data[curr_state->output.out_set[k].wordidx].reverseCount[i] == 2)
|
||||||
|
{
|
||||||
|
//for second element, first was put in .value, so dont forget to copy that in the array too
|
||||||
|
data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer = ECOMALLOC(2 * sizeof(uint32_t),
|
||||||
|
"Cannot allocate memory for primer position");
|
||||||
|
data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer[0] = data[curr_state->output.out_set[k].wordidx].reversePos[i].value;
|
||||||
|
data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer[1] = (uint32_t)pos;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
//for third or greater element
|
||||||
|
data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer = ECOREALLOC(data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer,
|
||||||
|
data[curr_state->output.out_set[k].wordidx].reverseCount[i] * sizeof(uint32_t),
|
||||||
|
"Cannot allocate memory for primer position");
|
||||||
|
data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer[data[curr_state->output.out_set[k].wordidx].reverseCount[i]-1] = (uint32_t)pos;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
//dont forget to increment inexample or outexample count, but only once for a sequence
|
||||||
|
if ((data[curr_state->output.out_set[k].wordidx].directCount[i] +
|
||||||
|
data[curr_state->output.out_set[k].wordidx].reverseCount[i]) == 1)
|
||||||
|
{
|
||||||
|
if (database[i]->isexample)
|
||||||
|
data[curr_state->output.out_set[k].wordidx].inexample++;
|
||||||
|
else
|
||||||
|
data[curr_state->output.out_set[k].wordidx].outexample++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//Only thing that remains is to remove the failed words
|
||||||
|
for (i=0,j=0; i<words->size; i++)
|
||||||
|
{
|
||||||
|
fprintf(stderr,"Primers %5d/%lld analyzed => sequence : %s in %d example and %d counterexample sequences \r",
|
||||||
|
i+1,words->size,ecoUnhashWord(data[i].word,options->primer_length),
|
||||||
|
data[i].inexample,data[i].outexample);
|
||||||
|
|
||||||
|
//if (data[i].inexample < inSequenceQuorum || (data[i].directCount[0] == 2 && options->no_multi_match))
|
||||||
|
if (data[i].directCount[0] == 2 && options->no_multi_match)
|
||||||
|
{
|
||||||
|
//bad word, delete from the array
|
||||||
|
for (k=0; k<seqdbsize; k++)
|
||||||
|
{
|
||||||
|
if (data[i].directCount[k] > 1)
|
||||||
|
ECOFREE (data[i].directPos[k].pointer, "Cannot free position pointer.");
|
||||||
|
if (data[i].reverseCount[k] > 1)
|
||||||
|
ECOFREE (data[i].reversePos[k].pointer, "Cannot free position pointer.");
|
||||||
|
}
|
||||||
|
ECOFREE (data[i].directCount, "Cannot free position pointer.");
|
||||||
|
ECOFREE (data[i].directPos, "Cannot free position pointer.");
|
||||||
|
ECOFREE (data[i].reverseCount, "Cannot free position pointer.");
|
||||||
|
ECOFREE (data[i].reversePos, "Cannot free position pointer.");
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
//data[i].good = data[i].inexample >= inSequenceQuorum && data[i].outexample <= outSequenceQuorum;
|
||||||
|
data[i].good = data[i].outexample <= outSequenceQuorum;
|
||||||
|
goodPrimers+=data[i].good? 1:0;
|
||||||
|
if (j < i)
|
||||||
|
data[j] = data[i];
|
||||||
|
j++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fprintf(stderr,"\n\nOn %lld analyzed primers %d respect quorum conditions\n",words->size,goodPrimers);
|
||||||
|
fprintf(stderr,"Conserved primers for further analysis : %d/%lld\n",j,words->size);
|
||||||
|
|
||||||
|
primers = ECOMALLOC(sizeof(primercount_t),"Cannot allocate memory for primer table");
|
||||||
|
primers->primers=ECOREALLOC(data,
|
||||||
|
j * sizeof(primer_t),
|
||||||
|
"Cannot reallocate memory for fuzzy matching results");
|
||||||
|
primers->size=j;
|
||||||
|
|
||||||
|
//free memory of keyword table
|
||||||
|
for (i=0; i<4; i++)
|
||||||
|
if (automaton_root.next[i] != &automaton_root)
|
||||||
|
ahoc_freeKeywordTree (automaton_root.next[i]);
|
||||||
|
|
||||||
|
return primers;
|
||||||
|
}
|
||||||
|
|
||||||
|
void ahoc_graphPrintNodesInfo (aho_state *node, FILE* gfile)
|
||||||
|
{
|
||||||
|
uint32_t i;
|
||||||
|
fprintf (gfile, "\"%d\"[\n", node->id);
|
||||||
|
fprintf (gfile, "label=\"%d\\n", node->id);
|
||||||
|
for (i=0; i<node->output.count; i++)
|
||||||
|
fprintf (gfile, "%d%c,", node->output.out_set[i].wordidx, node->output.out_set[i].isdirect?'d':'r');
|
||||||
|
fprintf (gfile, "\"\n];\n");
|
||||||
|
|
||||||
|
for (i=0; i<4; i++)
|
||||||
|
if (node->next[i] != NULL && node->next[i] != node)
|
||||||
|
ahoc_graphPrintNodesInfo (node->next[i], gfile);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ahoc_graphPrintNodesLinks (aho_state *node, FILE* gfile)
|
||||||
|
{
|
||||||
|
uint32_t i;
|
||||||
|
static int j=0;
|
||||||
|
|
||||||
|
for (i=0; i<4; i++)
|
||||||
|
if (node->next[i] != NULL && node->next[i] != node)
|
||||||
|
{
|
||||||
|
fprintf (gfile, "\"%d\" -> \"%d\" [\n", node->id, node->next[i]->id);
|
||||||
|
fprintf (gfile, "label=\"%c\"\n];\n", "ACGT"[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (j++ < 40)
|
||||||
|
if (node->fail != NULL && node->fail != groot)
|
||||||
|
{
|
||||||
|
fprintf (gfile, "\"%d\" -> \"%d\" [\n", node->id, node->fail->id);
|
||||||
|
fprintf (gfile, "color= \"red\"\n];\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i=0; i<4; i++)
|
||||||
|
if (node->next[i] != NULL && node->next[i] != node)
|
||||||
|
ahoc_graphPrintNodesLinks (node->next[i], gfile);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ahoc_graphKeywordTree (aho_state *root)
|
||||||
|
{
|
||||||
|
FILE *gfile;
|
||||||
|
|
||||||
|
groot=root;
|
||||||
|
gfile = fopen ("keywordtree.gv", "w");
|
||||||
|
fprintf (gfile, "digraph keywordtree {\n");
|
||||||
|
ahoc_graphPrintNodesInfo (root, gfile);
|
||||||
|
ahoc_graphPrintNodesLinks (root, gfile);
|
||||||
|
fprintf (gfile, "}\n");
|
||||||
|
fclose(gfile);
|
||||||
|
}
|
||||||
|
|
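The scanning loop of ahoc_lookforStrictPrimers above relies on the classic Aho-Corasick transition idiom: follow fail links until a state with an outgoing edge for the current base exists, then take that edge and report the state's output set. The sketch below is a miniature, self-contained illustration of that idiom over the same A=0, C=1, G=2, T=3 alphabet; all names in it (mini_state, mini_new, the hard-coded patterns) are illustrative only and are not part of libecoprimer.

/* Illustrative sketch only: a tiny Aho-Corasick matcher mirroring the
 * fail-link BFS of ahoc_updateForFailAndOutput and the scan idiom of
 * ahoc_lookforStrictPrimers.                                          */
#include <stdio.h>
#include <stdlib.h>

typedef struct mini_state {
    struct mini_state *next[4];
    struct mini_state *fail;
    unsigned out;                      /* bit k set => pattern k ends here */
} mini_state;

static mini_state *mini_new(void) { return calloc(1, sizeof(mini_state)); }

int main(void)
{
    int pats[2][3] = { {0,1,2}, {1,2,3} };      /* "ACG" and "CGT"          */
    mini_state *root = mini_new(), *queue[32];
    int head = 0, tail = 0;

    /* 1. build the keyword trie */
    for (int p = 0; p < 2; p++) {
        mini_state *s = root;
        for (int i = 0; i < 3; i++) {
            if (!s->next[pats[p][i]]) s->next[pats[p][i]] = mini_new();
            s = s->next[pats[p][i]];
        }
        s->out |= 1u << p;
    }
    /* missing edges at the root loop back to the root (as the checks
     * "root->next[i] != root" above suggest the library also does)     */
    for (int c = 0; c < 4; c++)
        if (!root->next[c]) root->next[c] = root;

    /* 2. BFS fail links and output-set union */
    for (int c = 0; c < 4; c++)
        if (root->next[c] != root) {
            root->next[c]->fail = root;
            queue[tail++] = root->next[c];
        }
    while (head < tail) {
        mini_state *r = queue[head++];
        for (int c = 0; c < 4; c++)
            if (r->next[c]) {
                mini_state *u = r->next[c], *v = r->fail;
                queue[tail++] = u;
                while (!v->next[c]) v = v->fail;
                u->fail = v->next[c];
                u->out |= u->fail->out;           /* union of output sets */
            }
    }

    /* 3. scan a text with the same "while NULL follow fail" idiom */
    int text[] = {0,1,2,3,0,1,2};                 /* A C G T A C G        */
    mini_state *s = root;
    for (unsigned j = 0; j < sizeof(text)/sizeof(*text); j++) {
        while (!s->next[text[j]]) s = s->fail;
        s = s->next[text[j]];
        if (s->out) printf("pattern mask %u ends at position %u\n", s->out, j);
    }
    return 0;
}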
43  src/libecoprimer/ahocorasick.h  Executable file
@ -0,0 +1,43 @@
/*
 * ahocorasick.h
 *
 * Created on: 26 March 2011
 *     Author: tiayyba
 */

#ifndef H_ahocorasick
#define H_ahocorasick

#include "ecoprimer.h"

typedef struct aho_output_t{
    uint32_t wordidx;   //index of strict word (don't store the 64-bit word itself)
    bool_t isdirect;    //we need to find both direct and reverse words so we must know which one it is
}aho_output;

typedef struct aho_output_count_t{
    uint32_t count;
    aho_output *out_set;
}aho_output_count;

typedef struct aho_state_t{
    int32_t id;
    struct aho_state_t *next[4];    //for labels A=0,C=1,G=2 and T=3
    struct aho_state_t *fail;
    aho_output_count output;
}aho_state;

typedef struct queue_node_t {
    aho_state *state_node;
    struct queue_node_t *next;
}queue_node;

typedef struct{
    queue_node *first;
    queue_node *last;
}aho_queue;

pprimercount_t ahoc_lookforStrictPrimers (pecodnadb_t database, uint32_t seqdbsize,uint32_t exampleCount,
        pwordcount_t words,poptions_t options);
#endif /* H_ahocorasick */
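The aho_queue/queue_node pair declared above is a plain singly linked FIFO used by the fail-link BFS. The library's own ahoc_enqueue and ahoc_dequeue are defined in ahocorasick.c; the sketch below is only one plausible implementation consistent with these declarations (demo_* names are hypothetical), shown to make the queue contract explicit.

/* A plausible FIFO consistent with the aho_queue / queue_node
 * declarations above; the real ahoc_enqueue/ahoc_dequeue may differ.  */
#include <stdlib.h>
#include "ahocorasick.h"

static void demo_enqueue(aho_queue *q, aho_state *s)
{
    queue_node *n = malloc(sizeof(*n));
    if (!n) return;                        /* sketch: no error handling */
    n->state_node = s;
    n->next = NULL;
    if (q->last) q->last->next = n;        /* append at the tail        */
    else         q->first = n;             /* queue was empty           */
    q->last = n;
}

static aho_state *demo_dequeue(aho_queue *q)
{
    queue_node *n = q->first;
    aho_state  *s;
    if (!n) return NULL;                   /* empty queue               */
    s = n->state_node;
    q->first = n->next;
    if (!q->first) q->last = NULL;         /* queue became empty        */
    free(n);
    return s;
}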
131  src/libecoprimer/amplifiatree.c  Normal file
@ -0,0 +1,131 @@
/*
 * amplifiatree.c
 *
 * Created on: 7 mars 2009
 *     Author: coissac
 */

#include "ecoprimer.h"
#include <search.h>

static void cleanamplifia(pamplifia_t amplifia);
static void deleteamplifialist(pamplifialist_t list);
static int cmpamplifia(const void* p1,const void*p2);

static void cleanamplifiatlist(pamplifiacount_t list)
{
    if (list->amplifias)
        ECOFREE(list->amplifias,
                "Free amplifia list");
}

static void cleanamplifia(pamplifia_t amplifia)
{
    cleanamplifiatlist(&(amplifia->pcr));
}

static pamplifialist_t newamplifialist(pamplifialist_t parent, size_t size)
{
    pamplifialist_t tmp;

    tmp=ECOMALLOC(sizeof(amplifialist_t)+sizeof(amplifia_t)*(size-1),
            "Cannot allocate new amplifia list");

    tmp->amplifiaslots=size;
    tmp->amplifiacount=0;
    tmp->next=NULL;

    if (parent)
        parent->next=(void*)tmp;

    return tmp;
}

static void deleteamplifialist(pamplifialist_t list)
{
    size_t i;

    if (list)
    {
        if (list->next)
        {
            deleteamplifialist(list->next);
            list->next=NULL;
        }
        for (i=0; i < list->amplifiacount; i++)
            cleanamplifia((list->amplifias)+i);

        ECOFREE(list,"Delete amplifia list");
    }

}

static int cmpamplifia(const void* p1,const void*p2)
{
    pamplifia_t pr1,pr2;

    pr1=(pamplifia_t)p1;
    pr2=(pamplifia_t)p2;

    if (pr1->p1 < pr2->p1) return -1;
    if (pr1->p1 > pr2->p1) return 1;

    if (pr1->asdirect1 < pr2->asdirect1) return -1;
    if (pr1->asdirect1 > pr2->asdirect1) return 1;

    if (pr1->p2 < pr2->p2) return -1;
    if (pr1->p2 > pr2->p2) return 1;

    if (pr1->asdirect2 < pr2->asdirect2) return -1;
    if (pr1->asdirect2 > pr2->asdirect2) return 1;

    return 0;
}

pamplifia_t amplifiaintree (amplifia_t key,
        pamplifiatree_t amplifialist)
{
    if (!amplifialist->tree)
        return NULL;

    return *((pamplifia_t*)tsearch((const void *)(&key),
            &(amplifialist->tree),
            cmpamplifia
            ));
}

pamplifia_t insertamplifia(amplifia_t key,
        pamplifiatree_t list)
{
    pamplifia_t current;
    pamplifia_t found;

    if (list->last->amplifiacount==list->last->amplifiaslots)
    {
        list->last->next=newamplifialist(list,100);
        list->last=list->last->next;
    }

    current = list->last->amplifias + list->last->amplifiacount;
    *current=key;

    found = *((pamplifia_t*)tsearch((const void *)current,
            &(list->tree),
            cmpamplifia));
    if (found==current)
        list->last->amplifiacount++;

    return found;
}

pamplifiatree_t initamplifiatree(pamplifiatree_t tree)
{
    if (!tree)
        tree = ECOMALLOC(sizeof(amplifiatree_t),"Cannot allocate amplifia tree");

    tree->first=newamplifialist(NULL,500);
    tree->last=tree->first;

    tree->tree=NULL;
}
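insertamplifia above relies on the POSIX tsearch() contract: tsearch returns a pointer to the element stored in the tree, and when a key that compares equal (here, under cmpamplifia) is already present, it returns the existing element instead of inserting a new one. That is why the slot counter is advanced only when found == current. The standalone snippet below demonstrates just that contract with plain integers; it is an illustration, not library code.

/* Standalone illustration of the tsearch() behaviour insertamplifia
 * relies on: inserting an equal key returns the element already stored. */
#include <search.h>
#include <stdio.h>

static int cmpint(const void *a, const void *b)
{
    int x = *(const int *)a, y = *(const int *)b;
    return (x > y) - (x < y);
}

int main(void)
{
    void *root = NULL;
    int a = 42, b = 42;
    int *first  = *(int **)tsearch(&a, &root, cmpint);   /* inserted       */
    int *second = *(int **)tsearch(&b, &root, cmpint);   /* already there  */
    printf("same element: %s\n", first == second ? "yes" : "no");  /* yes  */
    return 0;
}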
@ -36,10 +36,10 @@ int32_t ManberNoErr(pecoseq_t pseq,ppattern_t pat,
     ppatternParam_t param,
     StackiPtr stkpos)
 {
-    int32_t pos;
+    uint32_t pos;
     uint32_t smask, r;
     uint8_t *data;
-    int32_t end;
+    uint32_t end;
 
     end = (size_t)(pseq->SQ_length);
 
@ -84,11 +84,11 @@ int32_t ManberSub(pecoseq_t pseq,ppattern_t pat,
     StackiPtr stkpos)
 {
     int e, found;
-    int32_t pos;
+    uint32_t pos;
     uint32_t smask, cmask, sindx;
     uint32_t *pr, r[2 * MAX_PAT_ERR + 2];
     uint8_t *data;
-    int32_t end;
+    uint32_t end;
 
     end = (size_t)(pseq->SQ_length);
 
@ -103,8 +103,7 @@ int32_t ManberSub(pecoseq_t pseq,ppattern_t pat,
     for (e = 0, pr = r + 3 ; e <= param->maxerr ; e++, pr += 2)
         *pr = cmask;
 
-    cmask = ~ param->omask;  // TO CHECK!!!!! the (new) omask must be composed of + and - ... Old omask: bits
+    cmask = ~ param->omask;
 
     /* init. scan */
     data = (uint8_t*)(pseq->SQ);
 
@ -39,8 +39,8 @@ ppattern_t buildPatternFromWord(word_t word, uint32_t patlen)
 }
 
 
-#ifdef IS_UPPER(c)
-#undef IS_UPPER(c)
+#ifdef IS_UPPER
+#undef IS_UPPER
 #endif
 
 /* -------------------------------------------- */
@ -59,9 +59,9 @@ void encodeSequence(ecoseq_t *seq)
     data = (uint8_t*)(seq->SQ);
     cseq = seq->SQ;
 
-    for (i=0;i<seq->SQ_length;i++,data++,cseq++)
+    for (i=0;i<(int)(seq->SQ_length);i++,data++,cseq++)
     {
-        *data = encoder[(IS_UPPER(*cseq) ? *cseq - 'A' : 'Z')];
+        *data = encoder[(IS_UPPER(*cseq) ? *cseq : 'Z') - 'A'];
     }
 }
 
@ -82,7 +82,7 @@ pprimercount_t lookforAproxPrimer(pecodnadb_t database, uint32_t seqdbsize,uint3
     uint32_t inSequenceQuorum;
     uint32_t outSequenceQuorum;
     bool_t conserved = TRUE;
 
     //poslist_t ttt;
 
 
@ -97,7 +97,8 @@ pprimercount_t lookforAproxPrimer(pecodnadb_t database, uint32_t seqdbsize,uint3
 
     params.circular = options->circular;
     params.maxerr = options->error_max;
-    params.omask = (1 << options->strict_three_prime) -1;
+    // params.omask = (1 << options->strict_three_prime) -1;
+    params.omask = 0;
     params.patlen = options->primer_length;
 
     positions.val=NULL;
@ -200,7 +201,7 @@ pprimercount_t lookforAproxPrimer(pecodnadb_t database, uint32_t seqdbsize,uint3
     data[w].good = data[w].inexample >= inSequenceQuorum && data[w].outexample <= outSequenceQuorum;
     goodPrimers+=data[w].good? 1:0;
 
-    fprintf(stderr,"Primers %5d/%d analyzed => sequence : %s in %d example and %d counterexample sequences \r",
+    fprintf(stderr,"Primers %5d/%lld analyzed => sequence : %s in %d example and %d counterexample sequences \r",
             i+1,words->size,ecoUnhashWord(data[w].word,options->primer_length),
             data[w].inexample,data[w].outexample);
 
@ -223,8 +224,8 @@ pprimercount_t lookforAproxPrimer(pecodnadb_t database, uint32_t seqdbsize,uint3
     ECOFREE(data[w].reversePos,"Free direct count table");
     }
 
-    fprintf(stderr,"\n\nOn %d analyzed primers %d respect quorum conditions\n",words->size,goodPrimers);
-    fprintf(stderr,"Conserved primers for further analysis : %d/%d\n",w,words->size);
+    fprintf(stderr,"\n\nOn %lld analyzed primers %d respect quorum conditions\n",words->size,goodPrimers);
+    fprintf(stderr,"Conserved primers for further analysis : %d/%lld\n",w,words->size);
 
     primers = ECOMALLOC(sizeof(primercount_t),"Cannot allocate memory for primer table");
     primers->primers=ECOREALLOC(data,
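The encodeSequence change above moves the subtraction of 'A' outside the conditional. A short worked note on why that matters, assuming encoder[] has one entry per letter 'A'..'Z' (as its definition in hashsequence.c further down suggests):

/* Sketch of the index arithmetic fixed in encodeSequence():
 *   before:  IS_UPPER('-') is false -> encoder['Z']        index 90, past the table
 *   after :  IS_UPPER('-') is false -> encoder['Z' - 'A']  index 25, the -1 "unknown" slot
 *   after :  IS_UPPER('C') is true  -> encoder['C' - 'A']  index  2, which encodes C as 1
 */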
@ -13,6 +13,7 @@
 #include <stdio.h>
 #include "ecotype.h"
 #include "../libecoPCR/ecoPCR.h"
+#include "../libthermo/nnparams.h"
 #include "apat.h"
 
 #define DEBUG
@ -47,6 +48,7 @@ typedef uint64_t word_t, *pword_t;
 #define WORDMASK(s) ((1LLU << ((s) * 2)) -1)
 #define LSHIFTWORD(x,s) (((x) << 2) & WORDMASK(s))
 #define RSHIFTWORD(x,s) (((x) & WORDMASK(s))>> 2)
+#define ERRORMASK(s) ((int32_t)((1LLU << (s)) -1))
 
 #define RAPPENDBASE(x,s,c) (LSHIFTWORD((x),(s)) | (word_t)(c))
 #define LAPPENDBASE(x,s,c) (RSHIFTWORD((x),(s)) | ((word_t)((~(c)) & 3) << (((s)-1) *2)))
@ -65,42 +67,61 @@ typedef uint64_t word_t, *pword_t;
 #define MINI(x,y) (((x) < (y)) ? (x):(y))
 #define MAXI(x,y) (((x) < (y)) ? (y):(x))
 
+#define FWORDSIZE (13)
+#define FWORDMASK WORDMASK(FWORDSIZE)
+#define FILTERWORD(x) ((uint32_t)((x) & FWORDMASK))
+#define CFILTERWORD(x,s) ((uint32_t)(((x) >> (((s)-FWORDSIZE)*2)) & FWORDMASK))
+
 typedef struct {
     pword_t words;
     uint32_t *strictcount;
     uint32_t inseqcount;
     uint32_t outseqcount;
-    uint32_t size;
+    uint64_t size;
 } wordcount_t, *pwordcount_t;
 
 typedef union {
     uint32_t *pointer;
     uint32_t value;
-} poslist_t, *ppostlist_t;
+} poslist_t, *pposlist_t;
 
+/**
+ * primer_t structure stores fuzzy match positions for a primer
+ * on all sequences
+ */
 typedef struct {
-    word_t word;
-    uint32_t *directCount;
-    ppostlist_t directPos;
-    uint32_t *reverseCount;
-    ppostlist_t reversePos;
-    bool_t good;
-    uint32_t inexample;
-    uint32_t outexample;
+    word_t word;             //< code for the primer
+    uint32_t *directCount;   //< Occurrence count on direct strand
+    pposlist_t directPos;    //< list of position list on direct strand
+    uint32_t *reverseCount;  //< Occurrence count on reverse strand
+    pposlist_t reversePos;   //< list of position list on reverse strand
+    bool_t good;             //< primer match more than quorum example and no
+                             //  more than counterexample quorum.
+    uint32_t inexample;      //< count of example sequences matching primer
+    uint32_t outexample;     //< count of counterexample sequences matching primer
 } primer_t, *pprimer_t;
 
+/**
+ * primercount_t structure stores fuzzy match positions for all primers
+ * on all sequences as a list of primer_t
+ */
 typedef struct {
     pprimer_t primers;
     uint32_t size;
 } primercount_t, *pprimercount_t;
 
 typedef struct {
-    word_t word;
+    pprimer_t primer;
     uint32_t position;
     bool_t strand;
-    bool_t good; /*TR: Added*/
 } primermatch_t, *pprimermatch_t;
 
 /*TR: Added*/
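A short worked example of the 2-bit packing performed by the macros above (a sketch, not library code), with A=0, C=1, G=2, T=3 and a word size s = 4:

/*   word = 0;
 *   word = RAPPENDBASE(word, 4, 0);   // A  -> 0b00000000
 *   word = RAPPENDBASE(word, 4, 1);   // C  -> 0b00000001
 *   word = RAPPENDBASE(word, 4, 2);   // G  -> 0b00000110
 *   word = RAPPENDBASE(word, 4, 3);   // T  -> 0b00011011 = 0x1B  ("ACGT")
 *
 * FWORDSIZE = 13 keeps the 13 most recent bases: FWORDMASK = 2^26 - 1, so
 * FILTERWORD(word) indexes a table of 4^13 = 67 108 864 counters, the size
 * quoted in filtering.c below.  CFILTERWORD(x,s) instead extracts the first
 * 13 bases of an s-base word by shifting the high bits down by (s-13)*2.
 */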
@ -109,6 +130,21 @@ typedef struct {
     uint32_t matchcount;
 } primermatchcount_t, *pprimermatchcount_t;
 
+typedef struct {
+    pecoseq_t sequence;
+    bool_t strand;
+    const char *amplifia;
+    int32_t length;
+    uint32_t begin;
+    uint32_t end;
+} amplifia_t, *pamplifia_t;
+
+typedef struct {
+    pamplifia_t amplifias;
+    uint32_t ampcount;
+    uint32_t ampslot;
+} amplifiacount_t, *pamplifiacount_t;
+
 typedef struct {
     char *amplifia;
     int32_t *taxonids;
@ -124,30 +160,64 @@ typedef struct {
 } taxampset_t, *ptaxampset_t;
 
 typedef struct {
-    word_t w1;
-    word_t w2;
-    uint32_t inexample; /*inexample count*/
-    uint32_t outexample; /*outexample count*/
-    uint32_t mind;
-    uint32_t maxd;
-    uint32_t ampsetcount;
-    uint32_t ampsetindex;
-    pampseqset_t ampset;
-    uint32_t taxsetcount;
-    uint32_t taxsetindex;
-    ptaxampset_t taxset;
-    uint32_t oktaxoncount;
-} pairs_t, *ppairs_t;
+    pprimer_t p1;
+    bool_t asdirect1;
+    pprimer_t p2;
+    bool_t asdirect2;
+
+    amplifiacount_t pcr;
+    uint32_t inexample;             //< example sequence count
+    uint32_t outexample;            //< counterexample sequence count
+    uint32_t intaxa;                //< example taxa count
+    uint32_t outtaxa;               //< counterexample taxa count
+    uint32_t notwellidentifiedtaxa;
+    int *wellIdentifiedSeqs;        //< an array having elements equal to total seqs;
+                                    //  values are either 0 or 1: if seq is well identified
+                                    //  it is 1, else 0
+    int *coveredSeqs;               //< an array having elements equal to total seqs, 1 if seq is covered else 0
+
+    // these statistics are relative to inexample sequences
+    uint32_t mind;                  //< minimum distance between primers
+    uint32_t maxd;                  //< maximum distance between primers
+    uint32_t sumd;                  //< distance sum
+    uint32_t amplifiacount;
+    float yule;
+    float quorumin;
+    float quorumout;
+    float bs;
+    float bc;
+    int32_t refsequence;
+    //
+    // uint32_t taxsetcount;
+    // uint32_t taxsetindex;
+    // ptaxampset_t taxset;
+    //
+    // uint32_t oktaxoncount;
+    uint32_t curseqid;
+    float p1temp;                   //strict primer1 melting temperature
+    float p1mintemp;                //approx primer1 minimum melting temperature
+    float p2temp;                   //strict primer2 melting temperature
+    float p2mintemp;                //approx primer2 minimum melting temperature
+} pair_t, *ppair_t;
 
 /*TR: Added*/
 
 typedef struct {
-    ppairs_t pairs;
-    uint32_t paircount;
-}pairscount_t, *ppairscount_t;
+    size_t paircount;
+    size_t pairslots;
+    void* next;
+    pair_t pairs[1];
+} pairlist_t, *ppairlist_t;
+
+typedef struct {
+    ppairlist_t first;
+    ppairlist_t last;
+    void *tree;
+    int32_t count;
+} pairtree_t, *ppairtree_t;
 
 typedef struct {
     pword_t words;
@ -168,15 +238,34 @@ typedef struct {
     uint32_t size;
 } merge_t, *pmerge_t;
 
+typedef struct {
+    const char *amplifia;
+    bool_t strand;
+    int32_t length;
+    int32_t taxoncount;
+    void *taxontree;
+}amptotaxon_t, *pamptotaxon_t;
+
 typedef struct {
+    int32_t taxid;
+    void *amptree;
+}taxontoamp_t, *ptaxontoamp_t;
+
+typedef struct {
+    bool_t printAC;
+    bool_t statistics;
+    bool_t filtering;
     uint32_t lmin;                  //**< Amplifia minimal length
     uint32_t lmax;                  //**< Amplifia maximal length
     uint32_t error_max;             //**< maximum error count in fuzzy search
     uint32_t primer_length;         //**< minimal length of the primers
     int32_t *restricted_taxid;      //**< limit amplification below these taxid
     int32_t *ignored_taxid;         //**< no amplification below these taxid
+    int32_t *exception_taxid;
     char *prefix;
+    char *reference;
+    pecoseq_t refseq;
+    uint32_t refseqid;
     uint32_t circular;
     uint32_t doublestrand;
     float strict_quorum;
@ -186,9 +275,26 @@ typedef struct {
     uint32_t strict_three_prime;
     int32_t r;                      //**< count of restricted taxa (restricted_taxid array size)
     int32_t g;                      //**< count of ignored taxa (ignored_taxid array size)
+    int32_t e;                      //**< count of exception taxa (exception_taxid array size)
     bool_t no_multi_match;
     char taxonrank[20];             //TR to count ranks against a pair
     int32_t taxonrankidx;           //TR to count ranks against a pair
+
+    // Some statistics useful for options filters
+
+    int32_t dbsize;
+    int32_t insamples;
+    int32_t outsamples;
+    int32_t intaxa;
+    int32_t outtaxa;
+    int saltmethod;
+    float salt;
+    PNNParams pnparm;
+    bool_t print_sets_of_primers;
+    float specificity_threshold;
+    int links_cnt;
+    float max_links_percent;
+    bool_t filter_on_links;
 } options_t, *poptions_t;
 
 typedef ecoseq_t **pecodnadb_t;
@ -196,12 +302,15 @@ typedef ecoseq_t **pecodnadb_t;
 void sortword(pword_t table,uint32_t N);
 
 
-pecodnadb_t readdnadb(const char *name, uint32_t *size);
+pecodnadb_t readdnadb(const char *name, ecotaxonomy_t *taxonomy, uint32_t *size,poptions_t options);
 
 int isGoodTaxon(ecotaxonomy_t *taxonomy,int32_t taxon,poptions_t options);
+int isExampleTaxon(ecotaxonomy_t *taxonomy,int32_t taxon,poptions_t options);
+int isCounterExampleTaxon(ecotaxonomy_t *taxonomy,int32_t taxon,poptions_t options);
 
 uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq);
-pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint32_t doublestrand, ecoseq_t *seq,uint32_t *size);
+pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint32_t doublestrand, ecoseq_t *seq,uint32_t *size,int32_t *neededWords,uint32_t neededWordCount,
+        int32_t quorum);
 uint32_t ecoCompactHashSequence(pword_t dest,uint32_t size);
 const char* ecoUnhashWord(word_t word,uint32_t size);
 word_t ecoComplementWord(word_t word,uint32_t size);
@ -209,8 +318,8 @@ uint32_t ecoFindWord(pwordcount_t table,word_t word);
 
 
 void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,uint32_t seqQuorum);
-pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,ecoseq_t *seq);
-void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t exampleCount,uint32_t seqQuorum,ecoseq_t *seq);
+pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount);
+void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t exampleCount,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount);
 
 pqueue_t newQueue(pqueue_t queue, uint32_t size);
 pqueue_t resizeQueue(pqueue_t queue, uint32_t size);
@ -232,7 +341,26 @@ pprimercount_t lookforAproxPrimer(pecodnadb_t database, uint32_t seqdbsize,uint3
 
 void sortmatch(pprimermatch_t table,uint32_t N);
 
+ppairtree_t initpairtree(ppairtree_t tree);
+ppair_t pairintree (pair_t key,ppairtree_t pairlist);
+ppair_t insertpair(pair_t key,ppairtree_t list);
+
 /*TR: Added*/
-pairscount_t buildPrimerPairs(pecodnadb_t seqdb,uint32_t seqdbsize,pprimercount_t primers,poptions_t options);
+ppairtree_t buildPrimerPairs(pecodnadb_t seqdb,uint32_t seqdbsize,pprimercount_t primers,poptions_t options);
+
+int32_t counttaxon(int32_t taxid);
+int32_t getrankdbstats(pecodnadb_t seqdb,
+        uint32_t seqdbsize,
+        ecotaxonomy_t *taxonomy,
+        poptions_t options);
+float taxonomycoverage(ppair_t pair, poptions_t options, pecodnadb_t seqdb,uint32_t seqdbsize);
+char ecoComplementChar(char base);
+void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize);
+
+int32_t *filteringSeq(pecodnadb_t database, uint32_t seqdbsize,
+        uint32_t exampleCount,poptions_t options,uint32_t *size,int32_t sequenceQuorum);
+
+void printSeqTest(pecodnadb_t seqdb,uint32_t seqdbsize);
+
 #endif /* EPSORT_H_ */
222  src/libecoprimer/filtering.c  Normal file
@ -0,0 +1,222 @@
/*
 * filtering.c
 *
 * Created on: 12 mai 2009
 *     Author: coissac
 */

#include "ecoprimer.h"
#include <string.h>
#include <math.h>

#include "hashencoder.h"

static int32_t *ecoFilteringHashSequence(int32_t *dest,
                                         uint32_t circular,
                                         uint32_t doublestrand,
                                         ecoseq_t *seq,
                                         uint32_t *size);


static int32_t *ecoFilteringHashSequence(int32_t *dest,
                                         uint32_t circular,
                                         uint32_t doublestrand,
                                         ecoseq_t *seq,
                                         uint32_t *size)
{

    /*
     * This function aims at building a vector of counts for every possible hash code
     *
     * The function must be applied once on each sequence
     *
     * The function allocates memory on the first call for the dest table
     * The function also allocates memory for the static temporary table in_last_seq and
     * the function must be called with *dest == -1 in order to free this temporary table
     *
     */
    static char *in_last_seq=NULL;
    uint32_t i=0;
    uint32_t j;
    char *base;
    int8_t code;
    int32_t error=0;
    word_t word=0;
    word_t antiword=0;
    uint32_t goodword;
    uint32_t lmax=0;

    // cleanup call;

    if (dest==(void*)-1)
    {
        if (in_last_seq) ECOFREE(in_last_seq,"Free in last seq table");
        return NULL;
    }

    /* FWORDSIZE = 13 =>  *size =    67 108 864
     * FWORDSIZE = 14 =>  *size =   268 435 456
     * FWORDSIZE = 15 =>  *size = 1 073 741 824
     */

    *size = pow(4,FWORDSIZE);

    /*
     * in_last_seq is a vector of char as it is just there to avoid counting a hash code (a DNA word)
     * twice (or more); it is set to zero (with memset) and then filled with ones for each word belonging to the sequence
     */

    if (!in_last_seq)
        in_last_seq = ECOMALLOC(*size*sizeof(char),
                "Cannot allocate filtering hash table");

    memset(in_last_seq,0,*size*sizeof(char));

    /*
     * Allocate (on first call) the memory for the table of counts
     */

    if (!dest)
    {
        dest = ECOMALLOC(*size*sizeof(int32_t),
                "Cannot allocate filtering hash table");
        memset(dest,0,*size*sizeof(int32_t));
    }

    lmax = seq->SQ_length;
    if (!circular)
        lmax-= FWORDSIZE-1;

    // DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i));

    /*
     * Compute first word of seq
     */

    for (i=0, base = seq->SQ; i < FWORDSIZE && i < lmax; i++,base++)
    {
        error<<= 1;
        error&=ERRORMASK(FWORDSIZE);

        code = encoder[(*base) - 'A'];
        if (code <0)
        {
            code = 0;
            error|= 1;
        }

        word=RAPPENDBASE(word,FWORDSIZE,code);
        if (doublestrand)
            antiword=LAPPENDBASE(antiword,FWORDSIZE,code);
    }

    if (!error && i==FWORDSIZE)
    {
        goodword=(uint32_t)((doublestrand) ? MINI(word,antiword):word);

        /*
         * FB: I don't think the test is necessary as the table has just been initialized
         */
        if (!in_last_seq[goodword])
        {
            in_last_seq[goodword]=1;
            dest[goodword]++;
        }
    }

    /*
     * compute and store counts (avoid counting a word twice) for the other words of the seq
     */

    for (j=1; j < lmax; j++,i++,base++)
    {

        // DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j));

        /* roll over the sequence for circular ones */
        if (i==(uint32_t)seq->SQ_length) base=seq->SQ;

        error<<= 1;
        error&=ERRORMASK(FWORDSIZE);

        //code = -1;
        //if((*base) >= 'A' && (*base) <= 'Z')
        code = encoder[(*base) - 'A'];
        if (code <0)
        {
            code = 0;
            error|= 1;
        }

        word=RAPPENDBASE(word,FWORDSIZE,code);
        if (doublestrand)
            antiword=LAPPENDBASE(antiword,FWORDSIZE,code);

        if (!error)
        {
            if (doublestrand)
                goodword=(uint32_t)MINI(word,antiword);
            else
                goodword=word;
            if (!in_last_seq[goodword])
            {
                in_last_seq[goodword]=1;
                dest[goodword]++;
            }
        }

    }

    return dest;

}


int32_t *filteringSeq(pecodnadb_t database, uint32_t seqdbsize,
        uint32_t exampleCount,poptions_t options,uint32_t *size,int32_t sequenceQuorum)
{
    int32_t *wordscount=NULL;
    int32_t keep=0;
    uint32_t i,j=0;

    for (i=0;i<seqdbsize;i++)
    {
        if (database[i]->isexample && database[i]->SQ_length > options->primer_length)
        {
            j++;
            wordscount=ecoFilteringHashSequence(wordscount,
                    options->circular,
                    options->doublestrand,
                    database[i],
                    size);
        }
        fprintf(stderr,"  Filtered sequences %5u/%5u          \r",j,exampleCount);

    }

    fprintf(stderr,"\n");

    for (i=0;i<*size;i++)
        if (wordscount[i] >= sequenceQuorum)
            keep++;


    (void)ecoFilteringHashSequence((int32_t*)-1,
            options->circular,
            options->doublestrand,
            NULL,
            NULL);

    fprintf(stderr,"ok\n  Considered word of size %d for filtering : %d\n",FWORDSIZE,keep);
    return wordscount;

}
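The comment block in ecoFilteringHashSequence documents the counting scheme: every distinct 13-mer hash code is counted at most once per sequence, so dest[code] ends up being the number of example sequences containing that word, which is exactly what the quorum test in filteringSeq needs. The standalone snippet below reproduces that scheme with tiny integers standing in for 13-mer hash codes; it is an illustration, not library code.

/* Self-contained illustration of the once-per-sequence counting used by
 * ecoFilteringHashSequence().                                          */
#include <stdio.h>
#include <string.h>

#define CODES 8                               /* stands in for 4^FWORDSIZE */

int main(void)
{
    int seqs[3][5] = { {1,2,2,3,1},           /* three "sequences" of codes */
                       {2,2,2,2,2},
                       {3,4,4,1,3} };
    int dest[CODES] = {0};
    char in_last_seq[CODES];

    for (int s = 0; s < 3; s++) {
        memset(in_last_seq, 0, sizeof in_last_seq);   /* reset per sequence */
        for (int j = 0; j < 5; j++) {
            int code = seqs[s][j];
            if (!in_last_seq[code]) {        /* count each code once per seq */
                in_last_seq[code] = 1;
                dest[code]++;
            }
        }
    }
    for (int c = 0; c < CODES; c++)
        if (dest[c])
            printf("code %d occurs in %d sequence(s)\n", c, dest[c]);
    /* with a quorum of 2, only codes 1, 2 and 3 would be kept              */
    return 0;
}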
|
@ -12,16 +12,53 @@ int isGoodTaxon(ecotaxonomy_t *taxonomy,int32_t taxon,poptions_t options)
|
|||||||
{
|
{
|
||||||
int result;
|
int result;
|
||||||
|
|
||||||
result=( (options->r == 0) || (eco_is_taxid_included(taxonomy,
|
result=((options->r == 0) || (eco_is_taxid_included(taxonomy,
|
||||||
options->restricted_taxid,
|
options->restricted_taxid,
|
||||||
options->r,
|
options->r,
|
||||||
taxonomy->taxons->taxon[taxon].taxid)
|
taxonomy->taxons->taxon[taxon].taxid)
|
||||||
)) &&
|
)) &&
|
||||||
((options->g == 0) || !(eco_is_taxid_included(taxonomy,
|
((options->e == 0) || !(eco_is_taxid_included(taxonomy,
|
||||||
options->ignored_taxid,
|
options->exception_taxid,
|
||||||
options->g,
|
options->e,
|
||||||
taxonomy->taxons->taxon[taxon].taxid)
|
taxonomy->taxons->taxon[taxon].taxid)
|
||||||
));
|
));
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int isExampleTaxon(ecotaxonomy_t *taxonomy,int32_t taxon,poptions_t options)
|
||||||
|
{
|
||||||
|
int result;
|
||||||
|
|
||||||
|
result=( (options->r == 0) || (eco_is_taxid_included(taxonomy,
|
||||||
|
options->restricted_taxid,
|
||||||
|
options->r,
|
||||||
|
taxonomy->taxons->taxon[taxon].taxid)
|
||||||
|
)) &&
|
||||||
|
((options->e == 0) || !(eco_is_taxid_included(taxonomy,
|
||||||
|
options->exception_taxid,
|
||||||
|
options->e,
|
||||||
|
taxonomy->taxons->taxon[taxon].taxid)
|
||||||
|
));
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int isCounterExampleTaxon(ecotaxonomy_t *taxonomy,int32_t taxon,poptions_t options)
|
||||||
|
{
|
||||||
|
int result;
|
||||||
|
|
||||||
|
result=((options->g != 0) && (eco_is_taxid_included(taxonomy,
|
||||||
|
options->ignored_taxid,
|
||||||
|
options->g,
|
||||||
|
taxonomy->taxons->taxon[taxon].taxid))
|
||||||
|
) || ((options->e != 0) && (eco_is_taxid_included(taxonomy,
|
||||||
|
options->exception_taxid,
|
||||||
|
options->e,
|
||||||
|
taxonomy->taxons->taxon[taxon].taxid))
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
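The three predicates above combine membership in the restricted (-r), ignored (-g) and exception (-e) taxid lists. The sketch below reduces them to their boolean skeletons, with below_r / below_g / below_e standing in for eco_is_taxid_included() on each list and the counts r, g, e matching the options_t fields; the taxon-group names in the example are purely hypothetical.

/* Sketch of the boolean combinations used above (illustration only).  */
#include <stdio.h>

static int is_example(int r, int e, int below_r, int below_e)
{
    return (r == 0 || below_r) && (e == 0 || !below_e);
}

static int is_counterexample(int g, int e, int below_g, int below_e)
{
    return (g != 0 && below_g) || (e != 0 && below_e);
}

int main(void)
{
    /* e.g. -r <mammals> -e <primates>: a primate taxon is below both lists */
    int r = 1, g = 0, e = 1;
    printf("primate : example=%d counter=%d\n",
           is_example(r, e, 1, 1), is_counterexample(g, e, 0, 1));  /* 0 1 */
    printf("rodent  : example=%d counter=%d\n",
           is_example(r, e, 1, 0), is_counterexample(g, e, 0, 0));  /* 1 0 */
    return 0;
}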
13  src/libecoprimer/hashencoder.h  Normal file
@ -0,0 +1,13 @@
/*
 * hashencoder.h
 *
 * Created on: 12 mai 2009
 *     Author: coissac
 */

#ifndef HASHENCODER_H_
#define HASHENCODER_H_

extern int8_t encoder[];

#endif /* HASHENCODER_H_ */
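The encoder[] table declared here is defined in hashsequence.c (see the hunk just below) and maps the offsets 'A'..'Z' to 2-bit base codes, with every non-ACGT letter marked negative. A minimal use of the shared declaration, as a sketch (it has to be linked against hashsequence.c to resolve encoder):

/* Sketch: looking bases up through the shared encoder[] table.        */
#include <stdio.h>
#include <stdint.h>
#include "hashencoder.h"

int main(void)
{
    const char *bases = "ACGTN";
    for (const char *p = bases; *p; p++) {
        int8_t code = encoder[*p - 'A'];
        printf("%c -> %d%s\n", *p, code, code < 0 ? "  (rejected)" : "");
    }
    return 0;   /* expected: A->0  C->1  G->2  T->3  N->-1 */
}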
@ -10,7 +10,9 @@
 
 static int cmpword(const void *x,const void *y);
 
-static int8_t encoder[] = {0,    // A
+#include "hashencoder.h"
+
+int8_t encoder[] = {0,           // A
                     -1,          // b
                     1,           // C
                     -1,-1,-1,    // d, e, f
@ -31,8 +33,29 @@ uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq)
     return wordcount;
 }
 
-pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint32_t doublestrand, ecoseq_t *seq,uint32_t *size)
+pword_t ecoHashSequence(pword_t dest,
+        uint32_t wordsize,
+        uint32_t circular,
+        uint32_t doublestrand,
+        ecoseq_t *seq,
+        uint32_t *size,
+        int32_t *neededWords,
+        uint32_t neededWordCount,
+        int32_t quorum)
 {
+
+    /*
+     * dest / out : words of the hashed sequence position per position
+     * wordsize / in : size of the word to be hashed (record error for that size) BUT not equal to FWORDSIZE ...
+     *                 ... the size of the word REALLY returned as a result
+     * circular / in : is the sequence circular
+     * doublestrand / in : if we have to hash on both strands of the sequence
+     * seq / in : the sequence in ecoseq format
+     * size / out : number of hashed words (size of the dest vector)
+     * neededWordCount / in : table of hash-code word counts in the full DB (used to filter words)
+     * quorum / in : minimum quorum used to filter words based on the neededWordCount table
+     */
+
     uint32_t i=0;
     uint32_t j;
     char *base;
@ -40,6 +63,7 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
     int32_t error=0;
     word_t word=0;
     word_t antiword=0;
+    word_t goodword;
     uint32_t lmax=0;
 
     (*size)=0;
@ -53,11 +77,13 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
             "I cannot allocate memory for sequence hashing"
             );
 
-    // DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i));
+    //DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i));
 
     for (i=0, base = seq->SQ; i < wordsize && i < lmax; i++,base++)
     {
         error<<= 1;
+        error&=ERRORMASK(wordsize);
+
         code = encoder[(*base) - 'A'];
         if (code <0)
@ -68,10 +94,22 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
 
         word=RAPPENDBASE(word,wordsize,code);
+
         if (doublestrand)
             antiword=LAPPENDBASE(antiword,wordsize,code);
+
+        if (neededWordCount && i>=(FWORDSIZE-1))
+        {
+
+            goodword = (doublestrand) ? MINI(FILTERWORD(word),CFILTERWORD(antiword,wordsize)):FILTERWORD(word);
+            if (neededWords[(uint32_t)goodword]<quorum)
+                error|= (1 << (FWORDSIZE-1));
+
+        }
+
     }
 
     if (!error && i==wordsize)
     {
         dest[*size]=(doublestrand) ? MINI(word,antiword):word;
@ -82,12 +120,14 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
     for (j=1; j < lmax; j++,i++,base++)
     {
 
-        // DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j));
+        //DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j));
 
         /* roll over the sequence for circular ones */
+
         if (i==(uint32_t)seq->SQ_length) base=seq->SQ;
 
         error<<= 1;
+        error&=ERRORMASK(wordsize);
 
         code = encoder[(*base) - 'A'];
         if (code <0)
@ -100,6 +140,17 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
         if (doublestrand)
             antiword=LAPPENDBASE(antiword,wordsize,code);
+
+        if (neededWordCount)
+        {
+            goodword = (doublestrand) ? MINI(FILTERWORD(word),CFILTERWORD(antiword,wordsize)):FILTERWORD(word);
+            if (neededWords[(uint32_t)goodword]<quorum)
+                error|= (1 << (FWORDSIZE-1));
+            // else
+            //     DEBUG_LOG("%s goodword = %p %d/%d (pos:%d error:%d)",seq->AC,goodword,neededWords[(uint32_t)goodword],quorum,i,error);
+
+        }
+
         if (!error)
         {
             dest[*size]=(doublestrand) ? MINI(word,antiword):word;
@ -107,24 +158,34 @@ pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint
         }
 
     }
+    //DEBUG_LOG("%s goodword = %d",seq->AC,*size);
     return dest;
 
 }
 
 uint32_t ecoCompactHashSequence(pword_t table,uint32_t size)
 {
+    /*
+     *
+     * MULTIWORD is a word occurring more than once in a sequence
+     *
+     */
+
     uint32_t i,j;
     word_t current;
+    // bool_t here=FALSE;
+
     sortword(table,size);
 
     current = 0;
     current=SETMULTIWORD(current);    /* build impossible word for the first loop cycle */
+
+    // if (strcmp(ecoUnhashWord(table[size-1],18),"GTTTGTTCAACGATTAAA")==0)
+    //     here=TRUE;
+
     for (i=0,j=0; j < size;j++)
     {
-        if (table[j]!=current)
+        if (WORD(table[j])!=current)
         {
             current =table[j];
             table[i]=current;
@ -134,6 +195,9 @@ uint32_t ecoCompactHashSequence(pword_t table,uint32_t size)
             table[i]=SETMULTIWORD(table[i]);
         }
 
+        // if (strcmp(ecoUnhashWord(WORD(table[i-1]),18),"TACGACCTCGATGTTGGA")==0)
+        //     DEBUG_LOG("winner %d",i)
+
     return i;
 }
 
@ -201,3 +265,8 @@ uint32_t ecoFindWord(pwordcount_t table,word_t word)
     return ~0;
 }
 
+char ecoComplementChar(char base)
+{
+    return (base < 4)? ~base & 3: 4;
+}
+
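A worked note on the new ecoComplementChar() just above, which complements a 2-bit base code with ~base & 3 (any code outside 0-3 returns 4):

/*   A = 0  ->  ~0 & 3 = 3  = T
 *   C = 1  ->  ~1 & 3 = 2  = G
 *   G = 2  ->  ~2 & 3 = 1  = C
 *   T = 3  ->  ~3 & 3 = 0  = A
 */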
@ -250,8 +250,7 @@ void CursiToTop(StackiPtr stki)
 
 } /* end of CursiToTop */
 
-void CursiToBottom(stki)
-    StackiPtr stki;
+void CursiToBottom(StackiPtr stki)
 {
     stki->cursor = 0;
 
@ -16,6 +16,7 @@
 
 
 #include "ecotype.h"
+#include <stdint.h>
 
 /* ==================================================== */
 /* Constantes de dimensionnement */
@ -85,5 +86,6 @@ bool_t SearchDownStacki (StackiPtr stki , int32_t sval );
 bool_t BinSearchStacki (StackiPtr stki , int32_t sval );
 bool_t SameStacki (StackiPtr stki1 , StackiPtr stki2 );
 bool_t ReverseStacki (StackiPtr stki );
+void CursiToBottom (StackiPtr stki );
 
 #endif /* _H_libstki */
@ -10,7 +10,7 @@
|
|||||||
static pmerge_t mergeInit(pmerge_t merge,pwordcount_t data,uint32_t s1,uint32_t s2);
|
static pmerge_t mergeInit(pmerge_t merge,pwordcount_t data,uint32_t s1,uint32_t s2);
|
||||||
|
|
||||||
|
|
||||||
static pmerge_t mergeInit(pmerge_t merge, pwordcount_t data,uint32_t s1,uint32_t s2)
|
static pmerge_t mergeInit(pmerge_t merge, pwordcount_t data, uint32_t s1, uint32_t s2)
|
||||||
{
|
{
|
||||||
merge->words = data->words;
|
merge->words = data->words;
|
||||||
merge->count = data->strictcount;
|
merge->count = data->strictcount;
|
||||||
@ -26,23 +26,52 @@ typedef enum {S1=1,S2=2,STACK=3} source_t;
|
|||||||
|
|
||||||
void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,uint32_t seqQuorum)
|
void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,uint32_t seqQuorum)
|
||||||
{
|
{
|
||||||
|
|
||||||
|
/*
|
||||||
|
* data / in out : the table that contains the two parts to be merged
|
||||||
|
* s1 / in : end of the first part of the table
|
||||||
|
* s2 / in : end of the second part of the table
|
||||||
|
* remainingSeq / in : the number of remaining seqs to be added to the table
|
||||||
|
* seqQuorum / in : the minimum number of sequences in which a pattern must appear
|
||||||
|
*/
|
||||||
|
|
||||||
merge_t merged;
|
merge_t merged;
|
||||||
source_t source;
|
source_t source;
|
||||||
word_t currentword,tmpword;
|
word_t currentword,tmpword;
|
||||||
uint32_t currentcount,tmpcount;
|
uint32_t currentcount,tmpcount;
|
||||||
int same;
|
int same;
|
||||||
queue_t queue;
|
queue_t queue;
|
||||||
int nsame=0;
|
// int nsame=0;
|
||||||
uint32_t maxcount=0;
|
uint32_t maxcount=0;
|
||||||
bool_t writed=TRUE;
|
bool_t writed=TRUE;
|
||||||
|
|
||||||
// DEBUG_LOG("Coucou %p s1= %d s2= %d",data,s1,s2)
|
// DEBUG_LOG("Coucou %p s1= %d s2= %d",data,s1,s2)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* init the merged structure (used only for coding convenience, never returned, allocated on the C-stack)
|
||||||
|
* note that :
|
||||||
|
* merged.words : hashcodes (initialized to data->words)
|
||||||
|
* merged.count : counts of each word (initialized to data->strictcount)
|
||||||
|
* merged.read1 : index of the first word of the first subtable (initialized to 0)
|
||||||
|
* merged.read1 : index of the first word of the first subtable (initialized to 0)
|
||||||
|
* merged.read2 : index of the first word of the second subtable (initialized to s1)
|
||||||
|
* merged.size : total size of the table (initialized to s1+s2)
|
||||||
|
*
|
||||||
|
* allocate a new stack of size min(s1, s2)
|
||||||
|
*/
|
||||||
|
|
||||||
(void)mergeInit(&merged,data,s1,s2);
|
(void)mergeInit(&merged,data,s1,s2);
|
||||||
(void)newQueue(&queue,MINI(s1,s2));
|
(void)newQueue(&queue,MINI(s1,s2));
|
||||||
|
|
||||||
while (merged.read1 < s1 && merged.read2 < merged.size)
|
|
||||||
|
/* true until
|
||||||
|
* merged.read1 == s1 AND merged.read2 == merged.size, i.e. ALL words have been processed
|
||||||
|
*/
|
||||||
|
while (merged.read1 < s1 || merged.read2 < merged.size)
|
||||||
{
|
{
|
||||||
|
/*
|
||||||
|
* initialize current{word,count} from either STACK (if not empty) or first table (S1)
|
||||||
|
*/
|
||||||
if (! queue.empty)
|
if (! queue.empty)
|
||||||
{
|
{
|
||||||
currentword = queue.words[queue.pop];
|
currentword = queue.words[queue.pop];
|
||||||
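The comments in the hunk above describe the heart of ecomerge: two sorted sub-tables share one buffer and are merged in place, with an auxiliary queue of at most min(s1, s2) entries holding displaced words, and the sequence quorum is applied as words are written back. The sketch below is not the ecoPrimers code; it only illustrates the underlying space-saving idea, merging two adjacent sorted runs with min(s1, s2) extra elements, using the simpler copy-out variant rather than the displacement queue used here (merge_adjacent_runs, the plain uint64_t keys and the omitted allocation-error handling are all assumptions of the example).

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Merge a[0..s1) and a[s1..s1+s2), both sorted ascending, in place,
 * using an auxiliary buffer of only min(s1, s2) elements. */
static void merge_adjacent_runs(uint64_t *a, size_t s1, size_t s2)
{
	size_t n = s1 + s2;

	if (s1 <= s2)
	{
		/* copy the first run out, then merge forward */
		uint64_t *buf = malloc(s1 * sizeof(*buf));
		size_t i = 0, j = s1, w = 0;

		memcpy(buf, a, s1 * sizeof(*buf));
		while (i < s1 && j < n)
			a[w++] = (buf[i] <= a[j]) ? buf[i++] : a[j++];
		while (i < s1)
			a[w++] = buf[i++];   /* run 1 leftovers; run 2 leftovers are already in place */
		free(buf);
	}
	else
	{
		/* copy the second run out, then merge backward from the end */
		uint64_t *buf = malloc(s2 * sizeof(*buf));
		size_t i = s1, j = s2, w = n;

		memcpy(buf, a + s1, s2 * sizeof(*buf));
		while (i > 0 && j > 0)
		{
			if (a[i - 1] > buf[j - 1])
				a[--w] = a[--i];
			else
				a[--w] = buf[--j];
		}
		while (j > 0)
			a[--w] = buf[--j];   /* run 2 leftovers; run 1 leftovers are already in place */
		free(buf);
	}
}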
@ -56,18 +85,34 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui
|
|||||||
source=S1;
|
source=S1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (WORD(currentword) > WORD(merged.words[merged.read2]))
|
/*
|
||||||
|
* IF there are some words in the second subtable remaining to be processed AND
|
||||||
|
* its first word is lower than current word
|
||||||
|
* THEN initialize current{word,count} from the second table (S2)
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
if (merged.read2 < merged.size &&
|
||||||
|
WORD(currentword) > WORD(merged.words[merged.read2]))
|
||||||
{
|
{
|
||||||
currentword = merged.words[merged.read2];
|
currentword = merged.words[merged.read2];
|
||||||
currentcount = merged.count[merged.read2];
|
currentcount = merged.count[merged.read2];
|
||||||
source = S2;
|
source = S2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* record if the current words of the two subtables are the same
|
||||||
|
*/
|
||||||
same = (source != S2) && (WORD(currentword) == WORD(merged.words[merged.read2]));
|
same = (source != S2) && (WORD(currentword) == WORD(merged.words[merged.read2]));
|
||||||
nsame+=same;
|
// nsame+=same;
|
||||||
|
|
||||||
// DEBUG_LOG("Merging : r1 = %d s1 = %d r2 = %d size = %d word = %s source=%u same=%u",merged.read1,s1,merged.read2-s1,merged.size,ecoUnhashWord(currentword,18),source,same)
|
// DEBUG_LOG("Merging : r1 = %d s1 = %d r2 = %d size = %d word = %s source=%u same=%u",merged.read1,s1,merged.read2-s1,merged.size,ecoUnhashWord(currentword,18),source,same)
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* merge step (AND apply the quorum property)
|
||||||
|
* update merged.read1 AND/OR merged.read2
|
||||||
|
* record the word and its count in the table
|
||||||
|
*/
|
||||||
tmpword = merged.words[merged.write];
|
tmpword = merged.words[merged.write];
|
||||||
tmpcount= merged.count[merged.write];
|
tmpcount= merged.count[merged.write];
|
||||||
|
|
||||||
@ -113,7 +158,15 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui
|
|||||||
// DEBUG_LOG("r1 : %d r2 : %d qsize : %d nsame : %d tot : %d write : %s count : %d source : %d size : %d pop : %d push : %d empty : %d",merged.read1,merged.read2-s1,qsize,nsame,qsize+nsame,ecoUnhashWord(currentword,18),currentcount,source,queue.size,queue.pop,queue.push,queue.empty)
|
// DEBUG_LOG("r1 : %d r2 : %d qsize : %d nsame : %d tot : %d write : %s count : %d source : %d size : %d pop : %d push : %d empty : %d",merged.read1,merged.read2-s1,qsize,nsame,qsize+nsame,ecoUnhashWord(currentword,18),currentcount,source,queue.size,queue.pop,queue.push,queue.empty)
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* finish the merging with words not processed (AND apply the quorum property)
|
||||||
|
* they are stored in the second subtable (IF)
|
||||||
|
* OR in the queue (ELSE)
|
||||||
|
*/
|
||||||
|
|
||||||
if (merged.read2 < merged.size)
|
if (merged.read2 < merged.size)
|
||||||
|
{
|
||||||
|
//DEBUG_LOG("end1 %d %d/%d %d/%d",merged.write,merged.read1,s1,merged.read2,merged.size);
|
||||||
for (;merged.read2 < merged.size;merged.read2++)
|
for (;merged.read2 < merged.size;merged.read2++)
|
||||||
{
|
{
|
||||||
merged.words[merged.write]=merged.words[merged.read2];
|
merged.words[merged.write]=merged.words[merged.read2];
|
||||||
@ -122,7 +175,10 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui
|
|||||||
merged.write++;
|
merged.write++;
|
||||||
|
|
||||||
}
|
}
|
||||||
else while (! queue.empty)
|
}
|
||||||
|
else {
|
||||||
|
//DEBUG_LOG("end2 %d %d/%d %d/%d",merged.write,merged.read1,s1,merged.read2,merged.size);
|
||||||
|
while (! queue.empty)
|
||||||
{
|
{
|
||||||
// DEBUG_LOG("write : %s count : %d write : %d size : %d pop : %d push : %d empty : %d",ecoUnhashWord(queue.words[queue.pop],18),queue.count[queue.pop],merged.write,queue.size,queue.pop,queue.push,queue.empty)
|
// DEBUG_LOG("write : %s count : %d write : %d size : %d pop : %d push : %d empty : %d",ecoUnhashWord(queue.words[queue.pop],18),queue.count[queue.pop],merged.write,queue.size,queue.pop,queue.push,queue.empty)
|
||||||
merged.words[merged.write]=queue.words[queue.pop];
|
merged.words[merged.write]=queue.words[queue.pop];
|
||||||
@ -131,6 +187,7 @@ void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,ui
|
|||||||
if (remainingSeq + merged.count[merged.write] >= seqQuorum)
|
if (remainingSeq + merged.count[merged.write] >= seqQuorum)
|
||||||
merged.write++;
|
merged.write++;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
data->size = merged.write;
|
data->size = merged.write;
|
||||||
|
|
||||||
|
@ -7,34 +7,41 @@
|
|||||||
|
|
||||||
#include "ecoprimer.h"
|
#include "ecoprimer.h"
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include "../libthermo/thermostats.h"
|
||||||
|
|
||||||
primermatchcount_t buildPrimerPairsForOneSeq(uint32_t seqid,pprimercount_t primers,poptions_t options);
|
static void buildPrimerPairsForOneSeq(uint32_t seqid,
|
||||||
|
pecodnadb_t seqdb,
|
||||||
|
pprimercount_t primers,
|
||||||
|
ppairtree_t pairs,
|
||||||
|
poptions_t options);
|
||||||
|
|
||||||
int32_t pairinlist (ppairs_t pairlist, word_t w1, word_t w2, uint32_t size)
|
|
||||||
{
|
|
||||||
uint32_t i;
|
|
||||||
|
|
||||||
for (i = 0; i < size; i++)
|
|
||||||
{
|
|
||||||
if (w1 == pairlist[i].w1 && w2 == pairlist[i].w2) return i;
|
|
||||||
if (w1 == pairlist[i].w2 && w2 == pairlist[i].w1) return i;
|
|
||||||
}
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
char *addamplifiasetelem (ppairs_t pair, char* amplifia, int32_t taxid)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*************************************
|
||||||
|
*
|
||||||
|
* pair collection management
|
||||||
|
*
|
||||||
|
*************************************/
|
||||||
|
|
||||||
|
#ifdef MASKEDCODE
|
||||||
|
|
||||||
|
char *addamplifiasetelem (ppair_t pair, char* amplifia, int32_t taxid)
|
||||||
{
|
{
|
||||||
uint32_t i;
|
uint32_t i;
|
||||||
uint32_t j;
|
uint32_t j;
|
||||||
char *ampused = NULL;
|
char *ampused = NULL;
|
||||||
|
|
||||||
if(pair->ampsetcount == 0)
|
if(pair->ampsetcount == 0)
|
||||||
{
|
{
|
||||||
pair->ampsetcount = 500;
|
pair->ampsetcount = 500;
|
||||||
pair->ampsetindex = 0;
|
pair->ampsetindex = 0;
|
||||||
pair->ampset = ECOMALLOC(pair->ampsetcount * sizeof(ampseqset_t),"Cannot allocate amplifia set");
|
pair->ampset = ECOMALLOC(pair->ampsetcount * sizeof(ampseqset_t),"Cannot allocate amplifia set");
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i = 0; i < pair->ampsetindex; i++)
|
for (i = 0; i < pair->ampsetindex; i++)
|
||||||
{
|
{
|
||||||
if (strcmp (pair->ampset[i].amplifia, amplifia) == 0)
|
if (strcmp (pair->ampset[i].amplifia, amplifia) == 0)
|
||||||
@ -43,43 +50,43 @@ char *addamplifiasetelem (ppairs_t pair, char* amplifia, int32_t taxid)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i == 0)
|
if (i == 0)
|
||||||
{
|
{
|
||||||
pair->ampset[i].seqidcount = 100;
|
pair->ampset[i].seqidcount = 100;
|
||||||
pair->ampset[i].seqidindex = 0;
|
pair->ampset[i].seqidindex = 0;
|
||||||
pair->ampset[i].taxonids = ECOMALLOC(pair->ampset[i].seqidcount * sizeof(uint32_t),"Cannot allocate amplifia sequence table");
|
pair->ampset[i].taxonids = ECOMALLOC(pair->ampset[i].seqidcount * sizeof(uint32_t),"Cannot allocate amplifia sequence table");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pair->ampsetindex == pair->ampsetcount)
|
if (pair->ampsetindex == pair->ampsetcount)
|
||||||
{
|
{
|
||||||
pair->ampsetcount += 500;
|
pair->ampsetcount += 500;
|
||||||
pair->ampset = ECOREALLOC(pair->ampset, pair->ampsetcount * sizeof(ampseqset_t), "Cannot allocate amplifia set");
|
pair->ampset = ECOREALLOC(pair->ampset, pair->ampsetcount * sizeof(ampseqset_t), "Cannot allocate amplifia set");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pair->ampset[i].seqidindex == pair->ampset[i].seqidcount)
|
if (pair->ampset[i].seqidindex == pair->ampset[i].seqidcount)
|
||||||
{
|
{
|
||||||
pair->ampset[i].seqidcount += 100;
|
pair->ampset[i].seqidcount += 100;
|
||||||
pair->ampset[i].taxonids = ECOREALLOC(pair->ampset[i].taxonids, pair->ampset[i].seqidcount * sizeof(int32_t), "Cannot allocate amplifia sequence table");
|
pair->ampset[i].taxonids = ECOREALLOC(pair->ampset[i].taxonids, pair->ampset[i].seqidcount * sizeof(int32_t), "Cannot allocate amplifia sequence table");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pair->ampset[i].amplifia == NULL)
|
if (pair->ampset[i].amplifia == NULL)
|
||||||
{
|
{
|
||||||
pair->ampset[i].amplifia = amplifia;
|
pair->ampset[i].amplifia = amplifia;
|
||||||
pair->ampsetindex++;
|
pair->ampsetindex++;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (j = 0; j < pair->ampset[i].seqidindex; j++)
|
for (j = 0; j < pair->ampset[i].seqidindex; j++)
|
||||||
{
|
{
|
||||||
if (pair->ampset[i].taxonids[j] == taxid) break;
|
if (pair->ampset[i].taxonids[j] == taxid) break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (j == pair->ampset[i].seqidindex)
|
if (j == pair->ampset[i].seqidindex)
|
||||||
pair->ampset[i].taxonids[pair->ampset[i].seqidindex++] = taxid;
|
pair->ampset[i].taxonids[pair->ampset[i].seqidindex++] = taxid;
|
||||||
return ampused;
|
return ampused;
|
||||||
}
|
}
|
||||||
|
|
||||||
void addtaxampsetelem (ppairs_t pair, int32_t taxid, char *amplifia)
|
void addtaxampsetelem (ppair_t pair, int32_t taxid, char *amplifia)
|
||||||
{
|
{
|
||||||
uint32_t i;
|
uint32_t i;
|
||||||
uint32_t j;
|
uint32_t j;
|
||||||
@ -90,42 +97,42 @@ void addtaxampsetelem (ppairs_t pair, int32_t taxid, char *amplifia)
|
|||||||
pair->taxsetindex = 0;
|
pair->taxsetindex = 0;
|
||||||
pair->taxset = ECOMALLOC(pair->taxsetcount * sizeof(taxampset_t),"Cannot allocate taxon set");
|
pair->taxset = ECOMALLOC(pair->taxsetcount * sizeof(taxampset_t),"Cannot allocate taxon set");
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i = 0; i < pair->taxsetindex; i++)
|
for (i = 0; i < pair->taxsetindex; i++)
|
||||||
{
|
{
|
||||||
if (pair->taxset[i].taxonid == taxid) break;
|
if (pair->taxset[i].taxonid == taxid) break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i == 0)
|
if (i == 0)
|
||||||
{
|
{
|
||||||
pair->taxset[i].amplifiacount = 100;
|
pair->taxset[i].amplifiacount = 100;
|
||||||
pair->taxset[i].amplifiaindex = 0;
|
pair->taxset[i].amplifiaindex = 0;
|
||||||
pair->taxset[i].amplifia = ECOMALLOC(pair->taxset[i].amplifiacount * sizeof(char *),"Cannot allocate amplifia table");
|
pair->taxset[i].amplifia = ECOMALLOC(pair->taxset[i].amplifiacount * sizeof(char *),"Cannot allocate amplifia table");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pair->taxsetindex == pair->taxsetcount)
|
if (pair->taxsetindex == pair->taxsetcount)
|
||||||
{
|
{
|
||||||
pair->taxsetcount += 500;
|
pair->taxsetcount += 500;
|
||||||
pair->taxset = ECOREALLOC(pair->taxset, pair->taxsetcount * sizeof(taxampset_t), "Cannot allocate taxon set");
|
pair->taxset = ECOREALLOC(pair->taxset, pair->taxsetcount * sizeof(taxampset_t), "Cannot allocate taxon set");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pair->taxset[i].amplifiaindex == pair->taxset[i].amplifiacount)
|
if (pair->taxset[i].amplifiaindex == pair->taxset[i].amplifiacount)
|
||||||
{
|
{
|
||||||
pair->taxset[i].amplifiacount += 100;
|
pair->taxset[i].amplifiacount += 100;
|
||||||
pair->taxset[i].amplifia = ECOREALLOC(pair->taxset[i].amplifia, pair->taxset[i].amplifiacount * sizeof(char *), "Cannot allocate amplifia table");
|
pair->taxset[i].amplifia = ECOREALLOC(pair->taxset[i].amplifia, pair->taxset[i].amplifiacount * sizeof(char *), "Cannot allocate amplifia table");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pair->taxset[i].taxonid == 0)
|
if (pair->taxset[i].taxonid == 0)
|
||||||
{
|
{
|
||||||
pair->taxset[i].taxonid = taxid;
|
pair->taxset[i].taxonid = taxid;
|
||||||
pair->taxsetindex++;
|
pair->taxsetindex++;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (j = 0; j < pair->taxset[i].amplifiaindex; j++)
|
for (j = 0; j < pair->taxset[i].amplifiaindex; j++)
|
||||||
{
|
{
|
||||||
if (strcmp(pair->taxset[i].amplifia[j], amplifia) == 0) break;
|
if (strcmp(pair->taxset[i].amplifia[j], amplifia) == 0) break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (j == pair->taxset[i].amplifiaindex)
|
if (j == pair->taxset[i].amplifiaindex)
|
||||||
{
|
{
|
||||||
pair->taxset[i].amplifia[j] = amplifia;
|
pair->taxset[i].amplifia[j] = amplifia;
|
||||||
@ -135,140 +142,67 @@ void addtaxampsetelem (ppairs_t pair, int32_t taxid, char *amplifia)
|
|||||||
|
|
||||||
char *getamplifia (pecoseq_t seq, uint32_t start, uint32_t len)
|
char *getamplifia (pecoseq_t seq, uint32_t start, uint32_t len)
|
||||||
{
|
{
|
||||||
|
fprintf(stderr,"start : %d length : %d\n",start,len);
|
||||||
char *amplifia = ECOMALLOC((len + 1) * sizeof(char),"Cannot allocate amplifia");
|
char *amplifia = ECOMALLOC((len + 1) * sizeof(char),"Cannot allocate amplifia");
|
||||||
char *seqc = &seq->SQ[start];
|
char *seqc = &seq->SQ[start];
|
||||||
|
|
||||||
strncpy(amplifia, seqc, len);
|
strncpy(amplifia, seqc, len);
|
||||||
return amplifia;
|
return amplifia;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
/*TR: Added*/
|
/*TR: Added*/
|
||||||
pairscount_t buildPrimerPairs(pecodnadb_t seqdb,uint32_t seqdbsize,pprimercount_t primers,poptions_t options)
|
ppairtree_t buildPrimerPairs(pecodnadb_t seqdb,uint32_t seqdbsize,pprimercount_t primers,poptions_t options)
|
||||||
{
|
{
|
||||||
uint32_t i;
|
uint32_t i;
|
||||||
uint32_t j;
|
ppairtree_t primerpairs;
|
||||||
uint32_t k;
|
|
||||||
uint32_t d;
|
|
||||||
uint32_t strt;
|
|
||||||
uint32_t end;
|
|
||||||
uint32_t paircount = 0;
|
|
||||||
uint32_t pairslots = 500;
|
|
||||||
int32_t foundindex;
|
|
||||||
ppairs_t pairs;
|
|
||||||
pairscount_t primerpairs;
|
|
||||||
primermatchcount_t seqmatchcount;
|
|
||||||
word_t w1;
|
|
||||||
word_t w2;
|
|
||||||
char *amplifia;
|
|
||||||
char *oldamp;
|
|
||||||
|
|
||||||
|
primerpairs = initpairtree(NULL);
|
||||||
pairs = ECOMALLOC(pairslots * sizeof(pairs_t),"Cannot allocate pairs table");
|
|
||||||
|
|
||||||
for (i=0; i < seqdbsize; i++)
|
for (i=0; i < seqdbsize; i++)
|
||||||
{
|
{
|
||||||
seqmatchcount = buildPrimerPairsForOneSeq(i, primers, options);
|
buildPrimerPairsForOneSeq(i, seqdb, primers, primerpairs, options);
|
||||||
if (seqmatchcount.matchcount == 0) continue;
|
|
||||||
|
|
||||||
for (j=0; j < seqmatchcount.matchcount; j++)
|
|
||||||
{
|
|
||||||
strt = 0;
|
|
||||||
w1 = seqmatchcount.matches[j].word;
|
|
||||||
/*first word should b on direct strand*/
|
|
||||||
if (!seqmatchcount.matches[j].strand)
|
|
||||||
w1 = ecoComplementWord(w1, options->primer_length);
|
|
||||||
else
|
|
||||||
strt = options->primer_length;
|
|
||||||
|
|
||||||
for (k=j+1; k < seqmatchcount.matchcount; k++)
|
|
||||||
{
|
|
||||||
end = 0;
|
|
||||||
w2 = seqmatchcount.matches[k].word;
|
|
||||||
/*second word should be on reverse strand*/
|
|
||||||
if (seqmatchcount.matches[k].strand)
|
|
||||||
w2 = ecoComplementWord(w2, options->primer_length);
|
|
||||||
else
|
|
||||||
end = options->primer_length;
|
|
||||||
|
|
||||||
if (!(seqmatchcount.matches[j].good || seqmatchcount.matches[k].good)) continue;
|
|
||||||
if (w1 == w2) continue;
|
|
||||||
|
|
||||||
d = seqmatchcount.matches[k].position - seqmatchcount.matches[j].position;
|
|
||||||
if (d >= options->lmin && d <= options->lmax)
|
|
||||||
{
|
|
||||||
/*get amplified string*/
|
|
||||||
amplifia = getamplifia (seqdb[i], seqmatchcount.matches[j].position + strt, d - strt - end);
|
|
||||||
|
|
||||||
foundindex = pairinlist(pairs, w1, w2, paircount);
|
|
||||||
if (foundindex != -1) /*pair is found*/
|
|
||||||
{
|
|
||||||
if (seqdb[i]->isexample)
|
|
||||||
pairs[foundindex].inexample++;
|
|
||||||
else
|
|
||||||
pairs[foundindex].outexample++;
|
|
||||||
|
|
||||||
if (pairs[foundindex].mind > d) pairs[foundindex].mind = d;
|
|
||||||
else if (pairs[foundindex].maxd < d) pairs[foundindex].maxd = d;
|
|
||||||
|
|
||||||
oldamp = addamplifiasetelem (&pairs[foundindex], amplifia, seqdb[i]->ranktaxonid);
|
|
||||||
/*if exact same string is already in amplifia set then use that for taxon set, it will help for
|
|
||||||
* calculating the fully identified taxons i.e specificity, we will compare pointrs instead of strings
|
|
||||||
* because same string means same pointer*/
|
|
||||||
if (oldamp)
|
|
||||||
{
|
|
||||||
ECOFREE (amplifia, "free amplifia");
|
|
||||||
amplifia = oldamp;
|
|
||||||
}
|
|
||||||
addtaxampsetelem (&pairs[foundindex], seqdb[i]->ranktaxonid, amplifia);
|
|
||||||
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (paircount == pairslots)
|
|
||||||
{
|
|
||||||
pairslots += 500;
|
|
||||||
pairs = ECOREALLOC(pairs, pairslots * sizeof(pairs_t), "Cannot allocate pairs table");
|
|
||||||
}
|
|
||||||
pairs[paircount].w1 = w1;
|
|
||||||
pairs[paircount].w2 = w2;
|
|
||||||
if (seqdb[i]->isexample) pairs[paircount].inexample = 1;
|
|
||||||
else pairs[paircount].outexample = 1;
|
|
||||||
pairs[paircount].mind = d;
|
|
||||||
pairs[paircount].maxd = d;
|
|
||||||
oldamp = addamplifiasetelem (&pairs[paircount], amplifia, seqdb[i]->ranktaxonid);
|
|
||||||
addtaxampsetelem (&pairs[paircount], seqdb[i]->ranktaxonid, amplifia);
|
|
||||||
|
|
||||||
paircount++;
|
|
||||||
}
|
|
||||||
else if (d > options->lmax)
|
|
||||||
break; /*once if the distance is greater than lmax then it will keep on increasing*/
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ECOFREE(seqmatchcount.matches, "Cannot free matches table");
|
|
||||||
}
|
}
|
||||||
primerpairs.pairs = ECOREALLOC(pairs, paircount * sizeof(pairs_t), "Cannot allocate pairs table");
|
|
||||||
primerpairs.paircount = paircount;
|
|
||||||
return primerpairs;
|
return primerpairs;
|
||||||
}
|
}
|
||||||
|
|
||||||
primermatchcount_t buildPrimerPairsForOneSeq(uint32_t seqid,pprimercount_t primers,poptions_t options)
|
#define DMAX (2000000000)
|
||||||
{
|
|
||||||
uint32_t i,j,k;
|
|
||||||
uint32_t matchcount=0;
|
|
||||||
pprimermatch_t matches = NULL;
|
|
||||||
primermatchcount_t seqmatchcount;
|
|
||||||
|
|
||||||
seqmatchcount.matchcount = 0;
|
static void buildPrimerPairsForOneSeq(uint32_t seqid,
|
||||||
seqmatchcount.matches = NULL;
|
pecodnadb_t seqdb,
|
||||||
|
pprimercount_t primers,
|
||||||
|
ppairtree_t pairs,
|
||||||
|
poptions_t options)
|
||||||
|
{
|
||||||
|
static uint32_t paircount=0;
|
||||||
|
uint32_t i,j,k;
|
||||||
|
uint32_t matchcount=0;
|
||||||
|
pprimermatch_t matches = NULL;
|
||||||
|
//primermatchcount_t seqmatchcount;
|
||||||
|
ppair_t pcurrent;
|
||||||
|
pair_t current;
|
||||||
|
pprimer_t wswp;
|
||||||
|
bool_t bswp;
|
||||||
|
size_t distance;
|
||||||
|
bool_t strand;
|
||||||
|
//char prmr[50];
|
||||||
|
//float mtemp;
|
||||||
|
word_t w1, w1a, omask = (0x1L << (options->strict_three_prime*2)) -1;
|
||||||
|
word_t w2, w2a;//, wtmp;
|
||||||
|
uint32_t bp1,bp2;
|
||||||
|
|
||||||
|
//prmr[options->primer_length] = '\0';
|
||||||
|
|
||||||
for (i=0;i < primers->size; i++)
|
for (i=0;i < primers->size; i++)
|
||||||
{
|
{
|
||||||
matchcount+=primers->primers[i].directCount[seqid];
|
matchcount+=primers->primers[i].directCount[seqid];
|
||||||
matchcount+=primers->primers[i].reverseCount[seqid];
|
matchcount+=primers->primers[i].reverseCount[seqid];
|
||||||
}
|
}
|
||||||
|
|
||||||
if (matchcount <= 0) return seqmatchcount;
|
if (matchcount <= 0)
|
||||||
|
return;
|
||||||
|
|
||||||
matches = ECOMALLOC(matchcount * sizeof(primermatch_t),"Cannot allocate primers match table");
|
matches = ECOMALLOC(matchcount * sizeof(primermatch_t),"Cannot allocate primers match table");
|
||||||
|
|
||||||
for (i=0,j=0;i < primers->size; i++)
|
for (i=0,j=0;i < primers->size; i++)
|
||||||
@ -277,17 +211,15 @@ primermatchcount_t buildPrimerPairsForOneSeq(uint32_t seqid,pprimercount_t prime
|
|||||||
{
|
{
|
||||||
if (primers->primers[i].directCount[seqid]==1)
|
if (primers->primers[i].directCount[seqid]==1)
|
||||||
{
|
{
|
||||||
matches[j].word = primers->primers[i].word;
|
matches[j].primer = primers->primers+i;
|
||||||
matches[j].strand=TRUE;
|
matches[j].strand=TRUE;
|
||||||
matches[j].good=primers->primers[i].good;/*TR: Added*/
|
|
||||||
matches[j].position=primers->primers[i].directPos[seqid].value;
|
matches[j].position=primers->primers[i].directPos[seqid].value;
|
||||||
j++;
|
j++;
|
||||||
}
|
}
|
||||||
else for (k=0; k < primers->primers[i].directCount[seqid]; k++,j++)
|
else for (k=0; k < primers->primers[i].directCount[seqid]; k++,j++)
|
||||||
{
|
{
|
||||||
matches[j].word = primers->primers[i].word;
|
matches[j].primer = primers->primers+i;
|
||||||
matches[j].strand=TRUE;
|
matches[j].strand=TRUE;
|
||||||
matches[j].good=primers->primers[i].good;/*TR: Added*/
|
|
||||||
matches[j].position=primers->primers[i].directPos[seqid].pointer[k];
|
matches[j].position=primers->primers[i].directPos[seqid].pointer[k];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -296,26 +228,233 @@ primermatchcount_t buildPrimerPairsForOneSeq(uint32_t seqid,pprimercount_t prime
|
|||||||
{
|
{
|
||||||
if (primers->primers[i].reverseCount[seqid]==1)
|
if (primers->primers[i].reverseCount[seqid]==1)
|
||||||
{
|
{
|
||||||
matches[j].word = primers->primers[i].word;
|
matches[j].primer = primers->primers+i;
|
||||||
matches[j].strand=FALSE;
|
matches[j].strand=FALSE;
|
||||||
matches[j].good=primers->primers[i].good;/*TR: Added*/
|
|
||||||
matches[j].position=primers->primers[i].reversePos[seqid].value;
|
matches[j].position=primers->primers[i].reversePos[seqid].value;
|
||||||
j++;
|
j++;
|
||||||
}
|
}
|
||||||
else for (k=0; k < primers->primers[i].reverseCount[seqid]; k++,j++)
|
else for (k=0; k < primers->primers[i].reverseCount[seqid]; k++,j++)
|
||||||
{
|
{
|
||||||
matches[j].word = primers->primers[i].word;
|
matches[j].primer = primers->primers+i;
|
||||||
matches[j].strand=FALSE;
|
matches[j].strand=FALSE;
|
||||||
matches[j].good=primers->primers[i].good;/*TR: Added*/
|
|
||||||
matches[j].position=primers->primers[i].reversePos[seqid].pointer[k];
|
matches[j].position=primers->primers[i].reversePos[seqid].pointer[k];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
sortmatch(matches,matchcount); // sort in asscending order by position
|
if (matchcount>1)
|
||||||
|
{
|
||||||
/*TR: Added*/
|
// fprintf(stderr,"\n====================================\n");
|
||||||
seqmatchcount.matches = matches;
|
|
||||||
seqmatchcount.matchcount = matchcount;
|
sortmatch(matches,matchcount); // sort in ascending order by position
|
||||||
return seqmatchcount;
|
|
||||||
|
for (i=0; i < matchcount;i++)
|
||||||
|
{
|
||||||
|
// For all primers matching the sequence
|
||||||
|
|
||||||
|
/*for(j=i+1;
|
||||||
|
(j<matchcount)
|
||||||
|
&& ((distance=matches[j].position - matches[i].position - options->primer_length) < options->lmax);
|
||||||
|
j++
|
||||||
|
)//*/
|
||||||
|
for (j=i+1; j<matchcount; j++)
|
||||||
|
{
|
||||||
|
if (matches[j].position - matches[i].position <= options->primer_length) continue;
|
||||||
|
distance = matches[j].position - matches[i].position - options->primer_length;
|
||||||
|
if (distance >= options->lmax) break;
|
||||||
|
|
||||||
|
|
||||||
|
// For all not too far primers
|
||||||
|
|
||||||
|
if ( (matches[i].primer->good || matches[j].primer->good)
|
||||||
|
&& (distance > options->lmin)
|
||||||
|
)
|
||||||
|
{
|
||||||
|
// If possible primer pair
|
||||||
|
current.p1 = matches[i].primer;
|
||||||
|
current.asdirect1=matches[i].strand;
|
||||||
|
current.p2 = matches[j].primer;
|
||||||
|
current.asdirect2= !matches[j].strand;
|
||||||
|
current.maxd=DMAX;
|
||||||
|
current.mind=DMAX;
|
||||||
|
current.sumd=0;
|
||||||
|
current.amplifiacount=0;
|
||||||
|
current.inexample=0;
|
||||||
|
current.outexample=0;
|
||||||
|
current.curseqid = 0;
|
||||||
|
current.refsequence=-1;
|
||||||
|
//current.p1temp = 100;
|
||||||
|
//current.p1mintemp = 100;
|
||||||
|
//current.p2temp = 100;
|
||||||
|
//current.p2mintemp = 100;
|
||||||
|
|
||||||
|
// Standardize the pair
|
||||||
|
strand = current.p2->word > current.p1->word;
|
||||||
|
if (!strand)
|
||||||
|
{
|
||||||
|
wswp = current.p1;
|
||||||
|
current.p1=current.p2;
|
||||||
|
current.p2=wswp;
|
||||||
|
|
||||||
|
bswp = current.asdirect1;
|
||||||
|
current.asdirect1=current.asdirect2;
|
||||||
|
current.asdirect2=bswp;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//Code to make sure that if -3 option is given then
|
||||||
|
//3' end must match up to the given number of base pairs
|
||||||
|
if (options->strict_three_prime > 0)
|
||||||
|
{
|
||||||
|
w1 = current.p1->word;
|
||||||
|
w2 = current.p2->word;
|
||||||
|
if (!current.asdirect1) //make sure that word is from 5' to 3'
|
||||||
|
w1=ecoComplementWord(w1,options->primer_length);
|
||||||
|
|
||||||
|
if (!current.asdirect2) //make sure that word is from 5' to 3'
|
||||||
|
w2=ecoComplementWord(w2,options->primer_length);
|
||||||
|
//now both w1 and w2 are from 5' to 3' end
|
||||||
|
bp1 = matches[i].position;
|
||||||
|
bp2 = matches[j].position;
|
||||||
|
if (!strand)
|
||||||
|
{
|
||||||
|
bp1 = matches[j].position;
|
||||||
|
bp2 = matches[i].position;
|
||||||
|
}
|
||||||
|
//get word of first approximate repeat
|
||||||
|
w1a = extractSite(seqdb[seqid]->SQ,bp1,options->primer_length,strand);
|
||||||
|
//get word of second approximate repeat
|
||||||
|
w2a = extractSite(seqdb[seqid]->SQ,bp2,options->primer_length,!strand);
|
||||||
|
|
||||||
|
w1 = w1 & omask; //keep only strict_three_prime bases on the right (3') end
|
||||||
|
w2 = w2 & omask; //keep only strict_three_prime bases on the right (3') end
|
||||||
|
w1a = w1a & omask; //keep only strict_three_prime bases on the right (3') end
|
||||||
|
w2a = w2a & omask; //keep only strict_three_prime bases on the right (3') end
|
||||||
|
|
||||||
|
//now check that both words and primers of amplifia have same bases on 3' end
|
||||||
|
if ((w1 ^ w1a) != 0) continue;
|
||||||
|
if ((w2 ^ w2a) != 0) continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// Look for the new pair in already seen pairs
|
||||||
|
|
||||||
|
pcurrent = insertpair(current,pairs);
|
||||||
|
|
||||||
|
|
||||||
|
if (seqdb[seqid]->isexample)
|
||||||
|
|
||||||
|
{
|
||||||
|
//pcurrent->inexample++;
|
||||||
|
pcurrent->sumd+=distance;
|
||||||
|
pcurrent->amplifiacount++;
|
||||||
|
|
||||||
|
if ((pcurrent->maxd==DMAX) || (distance > pcurrent->maxd))
|
||||||
|
pcurrent->maxd = distance;
|
||||||
|
|
||||||
|
if (distance < pcurrent->mind)
|
||||||
|
pcurrent->mind = distance;
|
||||||
|
}
|
||||||
|
//else
|
||||||
|
// pcurrent->outexample++;
|
||||||
|
|
||||||
|
//for each pair we save current sequence id in the pair
|
||||||
|
//when we see this pair for the first time in the current sequence
|
||||||
|
//because we want to increment inexample & outexample count
|
||||||
|
//only once for one sequence
|
||||||
|
if (pcurrent->curseqid != (seqid+1))
|
||||||
|
{
|
||||||
|
if (seqdb[seqid]->isexample)
|
||||||
|
pcurrent->inexample++;
|
||||||
|
else
|
||||||
|
pcurrent->outexample++;
|
||||||
|
|
||||||
|
if (pcurrent->curseqid != 0)
|
||||||
|
pcurrent->curseqid = seqid+1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*if ((pcurrent->outexample+pcurrent->inexample)==0)
|
||||||
|
{
|
||||||
|
fprintf(stderr,"pcurrent->outexample+pcurrent->inexample=0!\n");
|
||||||
|
exit(0);
|
||||||
|
}*/
|
||||||
|
|
||||||
|
if (pcurrent->curseqid == 0)//((pcurrent->outexample+pcurrent->inexample)==1)
|
||||||
|
{
|
||||||
|
pcurrent->curseqid = seqid+1;
|
||||||
|
paircount++;
|
||||||
|
pcurrent->pcr.ampslot=200;
|
||||||
|
pcurrent->pcr.ampcount=0;
|
||||||
|
pcurrent->pcr.amplifias = ECOMALLOC(sizeof(amplifia_t)*pcurrent->pcr.ampslot,
|
||||||
|
"Cannot allocate amplifia table");
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (pcurrent->pcr.ampslot==pcurrent->pcr.ampcount)
|
||||||
|
{
|
||||||
|
pcurrent->pcr.ampslot+=200;
|
||||||
|
pcurrent->pcr.amplifias = ECOREALLOC(pcurrent->pcr.amplifias,
|
||||||
|
sizeof(amplifia_t)*pcurrent->pcr.ampslot,
|
||||||
|
"Cannot allocate amplifia table");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (seqid==options->refseqid)
|
||||||
|
pcurrent->refsequence=seqid;
|
||||||
|
pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].length=distance;
|
||||||
|
pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].sequence=seqdb[seqid];
|
||||||
|
pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].strand=strand;
|
||||||
|
pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].begin=matches[i].position + options->primer_length;
|
||||||
|
pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].end= matches[j].position - 1;
|
||||||
|
|
||||||
|
if (strand)
|
||||||
|
pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].amplifia= seqdb[seqid]->SQ + matches[i].position + options->primer_length;
|
||||||
|
else
|
||||||
|
pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].amplifia= seqdb[seqid]->SQ + matches[j].position - 1 ;
|
||||||
|
|
||||||
|
|
||||||
|
/*strncpy (prmr, seqdb[seqid]->SQ + matches[i].position, options->primer_length);
|
||||||
|
mtemp = nparam_CalcSelfTM (options->pnparm, prmr, options->primer_length) - 273.0;
|
||||||
|
if (mtemp < pcurrent->p1mintemp)
|
||||||
|
pcurrent->p1mintemp = mtemp;
|
||||||
|
//fprintf (stderr, "prmr1: %s\n", seqdb[seqid]->SQ);
|
||||||
|
strncpy (prmr, seqdb[seqid]->SQ + matches[j].position, options->primer_length);
|
||||||
|
mtemp = nparam_CalcSelfTM (options->pnparm, prmr, options->primer_length) - 273.0;
|
||||||
|
if (mtemp < pcurrent->p2mintemp)
|
||||||
|
pcurrent->p2mintemp = mtemp;
|
||||||
|
//fprintf (stderr, "prmr2: %s\n", prmr);
|
||||||
|
|
||||||
|
if (pcurrent->p1temp == 100)
|
||||||
|
pcurrent->p1temp = nparam_CalcSelfTM (options->pnparm, ecoUnhashWord(pcurrent->p1->word, options->primer_length), 0) - 273.0;
|
||||||
|
if (pcurrent->p2temp == 100)
|
||||||
|
pcurrent->p2temp = nparam_CalcSelfTM (options->pnparm, ecoUnhashWord(pcurrent->p2->word, options->primer_length), 0) - 273.0;
|
||||||
|
*/
|
||||||
|
pcurrent->pcr.ampcount++;
|
||||||
|
// fprintf(stderr,"%c%c W1 : %s direct : %c",
|
||||||
|
// "bG"[(int)pcurrent->p1->good],
|
||||||
|
// "bG"[(int)pcurrent->p2->good],
|
||||||
|
// ecoUnhashWord(pcurrent->p1->word, options->primer_length),
|
||||||
|
// "><"[(int)pcurrent->asdirect1]
|
||||||
|
// );
|
||||||
|
//
|
||||||
|
// fprintf(stderr," W2 : %s direct : %c distance : %d (min/max/avg : %d/%d/%f) in/out: %d/%d %c (%d pairs)\n",
|
||||||
|
// ecoUnhashWord(pcurrent->p2->word, options->primer_length),
|
||||||
|
// "><"[(int)pcurrent->asdirect2],
|
||||||
|
// distance,
|
||||||
|
// pcurrent->mind,pcurrent->maxd,
|
||||||
|
// (pcurrent->inexample) ? (float)pcurrent->sumd/pcurrent->inexample:0.0,
|
||||||
|
// pcurrent->inexample,pcurrent->outexample,
|
||||||
|
// " N"[(pcurrent->outexample+pcurrent->inexample)==1],
|
||||||
|
// paircount
|
||||||
|
//
|
||||||
|
// );
|
||||||
|
//
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pairs->count=paircount;
|
||||||
|
|
||||||
}
|
}
|
||||||
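The strict-3' check in the hunk above keeps only the low 2·strict_three_prime bits of each 2-bit-packed word (omask) and compares the primer hash against the hash of the approximate repeat with an XOR. A small stand-alone sketch of that bit trick, assuming the same packing (two bits per base, 3' end in the least-significant bits); the helper name same_three_prime_end is hypothetical.

#include <stdint.h>

typedef uint64_t word_t;   /* assumed: 2-bit-packed DNA word, 3' base in the low bits */

/* Return 1 when the k 3'-terminal bases of two packed words are identical. */
static int same_three_prime_end(word_t w, word_t repeat, unsigned k)
{
	word_t omask = ((word_t)1 << (2 * k)) - 1;   /* keep the 2*k low-order bits */

	return ((w ^ repeat) & omask) == 0;
}

/* e.g. the loop above could then skip a pair with:
 *   if (!same_three_prime_end(w1, w1a, options->strict_three_prime)) continue;
 */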
|
136	src/libecoprimer/pairtree.c	Normal file
@@ -0,0 +1,136 @@
/*
|
||||||
|
* pairtree.c
|
||||||
|
*
|
||||||
|
* Created on: 7 mars 2009
|
||||||
|
* Author: coissac
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "ecoprimer.h"
|
||||||
|
#include <search.h>
|
||||||
|
|
||||||
|
static void cleanpair(ppair_t pair);
|
||||||
|
static void deletepairlist(ppairlist_t list);
|
||||||
|
static int cmppair(const void* p1,const void*p2);
|
||||||
|
|
||||||
|
|
||||||
|
static void cleanamplifiatlist(pamplifiacount_t list)
|
||||||
|
{
|
||||||
|
if (list->amplifias)
|
||||||
|
ECOFREE(list->amplifias,
|
||||||
|
"Free amplifia list");
|
||||||
|
}
|
||||||
|
|
||||||
|
static void cleanpair(ppair_t pair)
|
||||||
|
{
|
||||||
|
cleanamplifiatlist(&(pair->pcr));
|
||||||
|
}
|
||||||
|
|
||||||
|
static ppairlist_t newpairlist(ppairlist_t parent, size_t size)
|
||||||
|
{
|
||||||
|
ppairlist_t tmp;
|
||||||
|
|
||||||
|
tmp=ECOMALLOC(sizeof(pairlist_t)+sizeof(pair_t)*(size-1),
|
||||||
|
"Cannot allocate new pair list");
|
||||||
|
|
||||||
|
tmp->pairslots=size;
|
||||||
|
tmp->paircount=0;
|
||||||
|
tmp->next=NULL;
|
||||||
|
|
||||||
|
if (parent)
|
||||||
|
parent->next=(void*)tmp;
|
||||||
|
|
||||||
|
|
||||||
|
return tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void deletepairlist(ppairlist_t list)
|
||||||
|
{
|
||||||
|
size_t i;
|
||||||
|
|
||||||
|
if (list)
|
||||||
|
{
|
||||||
|
if (list->next)
|
||||||
|
{
|
||||||
|
deletepairlist(list->next);
|
||||||
|
list->next=NULL;
|
||||||
|
}
|
||||||
|
for (i=0; i < list->paircount; i++)
|
||||||
|
cleanpair((list->pairs)+i);
|
||||||
|
|
||||||
|
ECOFREE(list,"Delete pair list");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
static int cmppair(const void* p1,const void*p2)
|
||||||
|
{
|
||||||
|
ppair_t pr1,pr2;
|
||||||
|
|
||||||
|
pr1=(ppair_t)p1;
|
||||||
|
pr2=(ppair_t)p2;
|
||||||
|
|
||||||
|
if (pr1->p1 < pr2->p1) return -1;
|
||||||
|
if (pr1->p1 > pr2->p1) return 1;
|
||||||
|
|
||||||
|
if (pr1->asdirect1 < pr2->asdirect1) return -1;
|
||||||
|
if (pr1->asdirect1 > pr2->asdirect1) return 1;
|
||||||
|
|
||||||
|
if (pr1->p2 < pr2->p2) return -1;
|
||||||
|
if (pr1->p2 > pr2->p2) return 1;
|
||||||
|
|
||||||
|
if (pr1->asdirect2 < pr2->asdirect2) return -1;
|
||||||
|
if (pr1->asdirect2 > pr2->asdirect2) return 1;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
ppair_t pairintree (pair_t key,
|
||||||
|
ppairtree_t pairlist)
|
||||||
|
{
|
||||||
|
if (!pairlist->tree)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
return *((ppair_t*)tsearch((const void *)(&key),
|
||||||
|
&(pairlist->tree),
|
||||||
|
cmppair
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
ppair_t insertpair(pair_t key,
|
||||||
|
ppairtree_t list)
|
||||||
|
{
|
||||||
|
ppair_t current;
|
||||||
|
ppair_t found;
|
||||||
|
|
||||||
|
if (list->last->paircount==list->last->pairslots)
|
||||||
|
{
|
||||||
|
list->last->next=newpairlist(list->last,100);
|
||||||
|
list->last=list->last->next;
|
||||||
|
}
|
||||||
|
|
||||||
|
current = list->last->pairs + list->last->paircount;
|
||||||
|
*current=key;
|
||||||
|
|
||||||
|
found = *((ppair_t*)tsearch((const void *)current,
|
||||||
|
&(list->tree),
|
||||||
|
cmppair));
|
||||||
|
if (found==current)
|
||||||
|
list->last->paircount++;
|
||||||
|
|
||||||
|
return found;
|
||||||
|
}
|
||||||
|
|
||||||
|
ppairtree_t initpairtree(ppairtree_t tree)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (!tree)
|
||||||
|
tree = ECOMALLOC(sizeof(pairtree_t),"Cannot allocate pair tree");
|
||||||
|
|
||||||
|
tree->first=newpairlist(NULL,300);
|
||||||
|
tree->last=tree->first;
|
||||||
|
|
||||||
|
tree->tree=NULL;
|
||||||
|
tree->count=0;
|
||||||
|
|
||||||
|
return tree;
|
||||||
|
}
|
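pairtree.c above combines chunked node storage (pairlist_t blocks) with a POSIX tsearch() binary tree used purely as an index: a candidate pair is written into the next free slot, tsearch() either inserts it or returns the equal pair already present, and the slot is kept only when the candidate turns out to be new. Below is a minimal sketch of that insert-or-find pattern on a simplified record type; rec_t, cmprec and the fixed-size storage array are assumptions of the example (allocation failures are not handled), not ecoPrimers code.

#include <search.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct { int a, b; } rec_t;        /* simplified stand-in for pair_t */

static int cmprec(const void *x, const void *y)
{
	const rec_t *r1 = x, *r2 = y;

	if (r1->a != r2->a) return (r1->a < r2->a) ? -1 : 1;
	if (r1->b != r2->b) return (r1->b < r2->b) ? -1 : 1;
	return 0;
}

int main(void)
{
	void  *root = NULL;                /* tsearch() tree, holds pointers into storage[] */
	rec_t  storage[16];                /* nodes must keep a stable address              */
	size_t used = 0;
	rec_t  samples[] = { {1, 2}, {3, 4}, {1, 2} };
	size_t i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
	{
		rec_t *candidate = &storage[used];
		rec_t *found;

		*candidate = samples[i];
		found = *(rec_t **)tsearch(candidate, &root, cmprec);
		if (found == candidate)
			used++;                    /* new record: keep the slot, as insertpair() does */
		else
			printf("duplicate (%d,%d) maps to an existing node\n", found->a, found->b);
	}
	printf("%zu distinct records\n", used);   /* prints 2 */
	return 0;
}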
@ -7,7 +7,7 @@
|
|||||||
|
|
||||||
#include "ecoprimer.h"
|
#include "ecoprimer.h"
|
||||||
|
|
||||||
pecodnadb_t readdnadb(const char *name, uint32_t *size)
|
pecodnadb_t readdnadb(const char *name, ecotaxonomy_t *taxonomy, uint32_t *size,poptions_t options)
|
||||||
{
|
{
|
||||||
ecoseq_t *seq;
|
ecoseq_t *seq;
|
||||||
uint32_t buffsize=100;
|
uint32_t buffsize=100;
|
||||||
@ -18,18 +18,42 @@ pecodnadb_t readdnadb(const char *name, uint32_t *size)
|
|||||||
|
|
||||||
for(seq=ecoseq_iterator(name), *size=0;
|
for(seq=ecoseq_iterator(name), *size=0;
|
||||||
seq;
|
seq;
|
||||||
seq=ecoseq_iterator(NULL), (*size)++
|
seq=ecoseq_iterator(NULL)
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
if (*size==buffsize)
|
if (isExampleTaxon(taxonomy,seq->taxid,options) ||
|
||||||
{
|
isCounterExampleTaxon(taxonomy,seq->taxid,options))
|
||||||
buffsize*=2;
|
{
|
||||||
db = ECOREALLOC(db,buffsize*sizeof(ecoseq_t*),"I cannot allocate db memory");
|
if (*size==buffsize)
|
||||||
}
|
{
|
||||||
db[*size]=seq;
|
buffsize*=2;
|
||||||
|
db = ECOREALLOC(db,buffsize*sizeof(ecoseq_t*),"I cannot allocate db memory");
|
||||||
|
}
|
||||||
|
db[*size]=seq;
|
||||||
|
(*size)++;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
delete_ecoseq(seq);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
db = ECOREALLOC(db,(*size)*sizeof(ecoseq_t*),"I cannot allocate db memory");
|
db = ECOREALLOC(db,(*size)*sizeof(ecoseq_t*),"I cannot allocate db memory");
|
||||||
|
|
||||||
return db;
|
return db;
|
||||||
}
|
}
|
||||||
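readdnadb() above grows its sequence array with the usual doubling strategy (buffsize *= 2 plus ECOREALLOC) and shrinks it to the exact size at the end. A generic sketch of that pattern with the standard realloc(); push_ptr and its signature are illustrative names, not ecoPrimers API.

#include <stdlib.h>

/* Append one element to a growable array, doubling the capacity when full.
 * Returns the (possibly moved) buffer, or NULL on allocation failure. */
static void **push_ptr(void **buf, size_t *size, size_t *cap, void *elem)
{
	if (*size == *cap)
	{
		size_t newcap = *cap ? *cap * 2 : 100;
		void **tmp = realloc(buf, newcap * sizeof(*tmp));

		if (!tmp)
			return NULL;               /* caller decides how to clean up */
		buf  = tmp;
		*cap = newcap;
	}
	buf[(*size)++] = elem;
	return buf;
}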
|
|
||||||
|
|
||||||
|
void printSeqTest(pecodnadb_t seqdb,uint32_t seqdbsize)
|
||||||
|
{
|
||||||
|
uint32_t i;
|
||||||
|
char ch[11];
|
||||||
|
ch [10] = '\0';
|
||||||
|
|
||||||
|
for (i=0; i < seqdbsize; i++)
|
||||||
|
{
|
||||||
|
strncpy (ch, seqdb[i]->SQ, 10);
|
||||||
|
fprintf (stderr, "seq %d = %s\n", i, ch);
|
||||||
|
}
|
||||||
|
exit (0);
|
||||||
|
}
|
||||||
|
@ -35,6 +35,7 @@
|
|||||||
* @author Pekka Pessi <Pekka.Pessi@nokia.com>
|
* @author Pekka Pessi <Pekka.Pessi@nokia.com>
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include <stdlib.h> /* FB for NULL */
|
||||||
|
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
@ -5,16 +5,55 @@
|
|||||||
* Author: coissac
|
* Author: coissac
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#define _GNU_SOURCE
|
||||||
#include "ecoprimer.h"
|
#include "ecoprimer.h"
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
#include <sys/resource.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,ecoseq_t *seq)
|
#ifndef RUSAGE_SELF
|
||||||
|
#define RUSAGE_SELF 0
|
||||||
|
#define RUSAGE_CHILDREN -1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
static double timeval_subtract (struct timeval *x, struct timeval *y);
|
||||||
|
|
||||||
|
|
||||||
|
/* Subtract the `struct timeval' values X and Y,
|
||||||
|
Return elapsed seconds as a double. */
|
||||||
|
|
||||||
|
double timeval_subtract (struct timeval *x, struct timeval *y)
|
||||||
|
{
|
||||||
|
struct timeval result;
|
||||||
|
|
||||||
|
/* Perform the carry for the later subtraction by updating y. */
|
||||||
|
if (x->tv_usec < y->tv_usec) {
|
||||||
|
int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1;
|
||||||
|
y->tv_usec -= 1000000 * nsec;
|
||||||
|
y->tv_sec += nsec;
|
||||||
|
}
|
||||||
|
if (x->tv_usec - y->tv_usec > 1000000) {
|
||||||
|
int nsec = (x->tv_usec - y->tv_usec) / 1000000;
|
||||||
|
y->tv_usec += 1000000 * nsec;
|
||||||
|
y->tv_sec -= nsec;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Compute the time remaining to wait.
|
||||||
|
tv_usec is certainly positive. */
|
||||||
|
result.tv_sec = x->tv_sec - y->tv_sec;
|
||||||
|
result.tv_usec = x->tv_usec - y->tv_usec;
|
||||||
|
|
||||||
|
return (double)result.tv_sec + (double)result.tv_usec/1e6;
|
||||||
|
}
|
||||||
|
|
||||||
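The getrusage()/timeval_subtract() pair added above is used to log how much CPU time the indexing has consumed so far. A self-contained sketch of the same measurement, with a local tv_diff() standing in for timeval_subtract() and a placeholder work() function (both names are assumptions of the example).

#include <stdio.h>
#include <sys/resource.h>
#include <sys/time.h>

/* local stand-in for timeval_subtract(): elapsed seconds between two timevals */
static double tv_diff(const struct timeval *end, const struct timeval *start)
{
	return (double)(end->tv_sec - start->tv_sec)
	     + (double)(end->tv_usec - start->tv_usec) / 1e6;
}

static void work(void)
{
	volatile double x = 0.0;
	long i;

	for (i = 0; i < 10000000L; i++)
		x += (double)i;
}

int main(void)
{
	struct rusage start, end;

	getrusage(RUSAGE_SELF, &start);
	work();
	getrusage(RUSAGE_SELF, &end);

	/* user + system CPU time, as in the statistics log above */
	printf("CPU time: %.3f s\n",
	       tv_diff(&end.ru_utime, &start.ru_utime) +
	       tv_diff(&end.ru_stime, &start.ru_stime));
	return 0;
}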
|
pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount)
|
||||||
{
|
{
|
||||||
uint32_t i;
|
uint32_t i;
|
||||||
uint32_t buffsize;
|
uint32_t buffsize;
|
||||||
//wordcount_t t;
|
//wordcount_t t;
|
||||||
|
|
||||||
if (!table)
|
if (!table)
|
||||||
table = ECOMALLOC(sizeof(wordcount_t),"Cannot allocate memory for word count structure");
|
table = ECOMALLOC(sizeof(wordcount_t),"Cannot allocate memory for word count structure");
|
||||||
|
|
||||||
@ -26,7 +65,7 @@ pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circ
|
|||||||
|
|
||||||
if (seq)
|
if (seq)
|
||||||
{
|
{
|
||||||
table->words = ecoHashSequence(NULL,wordsize,circular,doublestrand,seq,&buffsize);
|
table->words = ecoHashSequence(NULL,wordsize,circular,doublestrand,seq,&buffsize,neededWords,neededWordCount,seqQuorum);
|
||||||
table->size = ecoCompactHashSequence(table->words,buffsize);
|
table->size = ecoCompactHashSequence(table->words,buffsize);
|
||||||
|
|
||||||
table->inseqcount=1;
|
table->inseqcount=1;
|
||||||
@ -40,7 +79,7 @@ pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circ
|
|||||||
return table;
|
return table;
|
||||||
}
|
}
|
||||||
|
|
||||||
void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t exampleCount,uint32_t seqQuorum,ecoseq_t *seq)
|
void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t exampleCount,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount)
|
||||||
{
|
{
|
||||||
uint32_t buffersize;
|
uint32_t buffersize;
|
||||||
pword_t newtable;
|
pword_t newtable;
|
||||||
@ -50,33 +89,52 @@ void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circ
|
|||||||
buffersize = table->size + ecoWordCount(wordsize,circular,seq);
|
buffersize = table->size + ecoWordCount(wordsize,circular,seq);
|
||||||
|
|
||||||
table->words = ECOREALLOC(table->words,buffersize*sizeof(word_t),
|
table->words = ECOREALLOC(table->words,buffersize*sizeof(word_t),
|
||||||
"Cannot allocate memory to extend word table");
|
"\n\nCannot allocate memory to extend word table" );
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* newtable is a pointer to the memory planned to be used for the new sequence (at most ecoWordCount new hash codes)
|
||||||
|
*/
|
||||||
newtable = table->words + table->size;
|
newtable = table->words + table->size;
|
||||||
|
|
||||||
// DEBUG_LOG("Words = %x (%u) new = %x", table->words,table->size,newtable);
|
// DEBUG_LOG("Words = %x (%u) new = %x", table->words,table->size,newtable);
|
||||||
|
|
||||||
(void)ecoHashSequence(newtable,wordsize,circular,doublestrand,seq,&newsize);
|
(void)ecoHashSequence(newtable,wordsize,circular,doublestrand,seq,&newsize,neededWords,neededWordCount,seqQuorum);
|
||||||
// DEBUG_LOG("new seq wordCount : %d",newsize);
|
// DEBUG_LOG("new seq wordCount : %d",newsize);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* at this stage, the new hash codes have been added to the table but the table is not sorted
|
||||||
|
*/
|
||||||
|
|
||||||
newsize = ecoCompactHashSequence(newtable,newsize);
|
newsize = ecoCompactHashSequence(newtable,newsize);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* new hash codes have now been sorted BUT the whole table is not.
|
||||||
|
* MULTIWORDS have been tagged (and compacted)
|
||||||
|
*/
|
||||||
|
|
||||||
// DEBUG_LOG("compacted wordCount : %d",newsize);
|
// DEBUG_LOG("compacted wordCount : %d",newsize);
|
||||||
buffersize = table->size + newsize;
|
buffersize = table->size + newsize;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* buffersize is now set to the REAL size used by the table (but the memory chunk may be larger)
|
||||||
|
*/
|
||||||
|
|
||||||
// resize the count buffer
|
// resize the count buffer
|
||||||
|
|
||||||
table->inseqcount++;
|
table->inseqcount++;
|
||||||
|
|
||||||
|
//fprintf (stderr, "\nOldAddress: %x", table->strictcount);
|
||||||
table->strictcount = ECOREALLOC(table->strictcount,buffersize*sizeof(uint32_t),
|
table->strictcount = ECOREALLOC(table->strictcount,(buffersize+5000)*sizeof(uint32_t),
|
||||||
"Cannot allocate memory to extend example word count table");
|
"Cannot allocate memory to extend example word count table");
|
||||||
|
//fprintf (stderr, " NewAddress: %x\n", table->strictcount);
|
||||||
|
|
||||||
|
|
||||||
for (i=table->size; i < buffersize; i++)
|
for (i=table->size; i < buffersize; i++)
|
||||||
table->strictcount[i]=1;
|
table->strictcount[i]=1;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* new words in the table are set to a count of ONE
|
||||||
|
*/
|
||||||
|
|
||||||
// Now we have to merge in situ the two tables
|
// Now we have to merge in situ the two tables
|
||||||
|
|
||||||
@ -88,45 +146,96 @@ void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circ
|
|||||||
pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
|
pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
|
||||||
uint32_t exampleCount,poptions_t options)
|
uint32_t exampleCount,poptions_t options)
|
||||||
{
|
{
|
||||||
uint32_t i;
|
struct rusage start;
|
||||||
|
struct rusage usage;
|
||||||
|
double seconde;
|
||||||
|
char *logfilename;
|
||||||
|
FILE *logfile;
|
||||||
|
uint32_t i, j;
|
||||||
|
bool_t first=TRUE;
|
||||||
pwordcount_t strictprimers=NULL;
|
pwordcount_t strictprimers=NULL;
|
||||||
|
uint64_t totallength=0;
|
||||||
uint32_t sequenceQuorum = (uint32_t)floor((float)exampleCount * options->strict_quorum);
|
uint32_t sequenceQuorum = (uint32_t)floor((float)exampleCount * options->strict_quorum);
|
||||||
|
int32_t *neededWords;
|
||||||
|
uint32_t neededWordCount;
|
||||||
|
|
||||||
|
fprintf(stderr,"Filtering... ");
|
||||||
|
|
||||||
|
if (options->filtering)
|
||||||
|
neededWords = filteringSeq(database,seqdbsize,exampleCount,options,&neededWordCount,(int32_t)sequenceQuorum);
|
||||||
|
else
|
||||||
|
{
|
||||||
|
neededWordCount=0;
|
||||||
|
neededWords=NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (options->statistics)
|
||||||
|
{
|
||||||
|
asprintf(&logfilename,"ecoprimer_%d.log",getpid());
|
||||||
|
logfile = fopen(logfilename,"w");
|
||||||
|
fprintf(logfile,"# seq\tlength\tsize\ttime\tspeed\n");
|
||||||
|
fclose(logfile);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
fprintf(stderr," Primers should be at least present in %d/%d example sequences\n",sequenceQuorum,exampleCount);
|
fprintf(stderr," Primers should be at least present in %d/%d example sequences\n",sequenceQuorum,exampleCount);
|
||||||
|
|
||||||
strictprimers = initCountTable(NULL,options->primer_length,
|
strictprimers = initCountTable(NULL,options->primer_length,
|
||||||
options->circular,
|
options->circular,
|
||||||
options->doublestrand,
|
options->doublestrand,
|
||||||
NULL);
|
0,
|
||||||
|
NULL,NULL,0);
|
||||||
|
|
||||||
|
|
||||||
|
getrusage(RUSAGE_SELF,&start);
|
||||||
|
|
||||||
for (i=0;i<seqdbsize;i++)
|
for (i=0;i<seqdbsize;i++)
|
||||||
{
|
{
|
||||||
if (database[i]->isexample)
|
if (database[i]->isexample && database[i]->SQ_length > options->primer_length)
|
||||||
{
|
{
|
||||||
if (strictprimers->size)
|
|
||||||
|
if (first)
|
||||||
{
|
{
|
||||||
uint32_t s;
|
strictprimers = initCountTable(strictprimers,options->primer_length,
|
||||||
s = strictprimers->size;
|
options->circular,
|
||||||
|
options->doublestrand,
|
||||||
|
sequenceQuorum,
|
||||||
|
database[i],neededWords,neededWordCount);
|
||||||
|
first=FALSE;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// uint32_t s;
|
||||||
|
// s = strictprimers->size;
|
||||||
// DEBUG_LOG("stack size : %u",s);
|
// DEBUG_LOG("stack size : %u",s);
|
||||||
addSeqToWordCountTable(strictprimers,options->primer_length,
|
addSeqToWordCountTable(strictprimers,options->primer_length,
|
||||||
options->circular,
|
options->circular,
|
||||||
options->doublestrand,
|
options->doublestrand,
|
||||||
exampleCount,
|
exampleCount,
|
||||||
sequenceQuorum,
|
sequenceQuorum,
|
||||||
database[i]);
|
database[i],neededWords,neededWordCount);
|
||||||
|
};
|
||||||
|
totallength+=database[i]->SQ_length;
|
||||||
|
getrusage(RUSAGE_SELF,&usage);
|
||||||
|
if (options->statistics)
|
||||||
|
{
|
||||||
|
asprintf(&logfilename,"ecoprimer_%d.log",getpid());
|
||||||
|
logfile = fopen(logfilename,"a");
|
||||||
|
seconde = timeval_subtract(&(usage.ru_utime),&(start.ru_utime)) +
|
||||||
|
timeval_subtract(&(usage.ru_stime),&(start.ru_stime));
|
||||||
|
fprintf(logfile,"%d\t%llu\t%llu\t%8.3f\t%8.3e\n",i,
|
||||||
|
(long long unsigned)totallength,
|
||||||
|
strictprimers->size*(sizeof(int64_t)+sizeof(int32_t)),
|
||||||
|
seconde,seconde/(double)totallength);
|
||||||
|
fclose(logfile);
|
||||||
}
|
}
|
||||||
else
|
|
||||||
strictprimers = initCountTable(strictprimers,options->primer_length,
|
|
||||||
options->circular,
|
|
||||||
options->doublestrand,
|
|
||||||
database[i]);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
strictprimers->outseqcount++;
|
strictprimers->outseqcount++;
|
||||||
|
|
||||||
fprintf(stderr," Indexed sequences %5d/%5d : considered words %-10d \r",(int32_t)i+1,(int32_t)seqdbsize,strictprimers->size);
|
fprintf(stderr," Indexed sequences %5d/%5d : considered words %-10llu \r",
|
||||||
|
(int32_t)i+1,(int32_t)seqdbsize,
|
||||||
|
(long long unsigned)strictprimers->size);
|
||||||
|
|
||||||
// DEBUG_LOG("First word : %s ==> %d",ecoUnhashWord(strictprimers->words[0],18),strictprimers->incount[0])
|
// DEBUG_LOG("First word : %s ==> %d",ecoUnhashWord(strictprimers->words[0],18),strictprimers->incount[0])
|
||||||
// DEBUG_LOG("Second word : %s ==> %d",ecoUnhashWord(strictprimers->words[1],18),strictprimers->incount[1])
|
// DEBUG_LOG("Second word : %s ==> %d",ecoUnhashWord(strictprimers->words[1],18),strictprimers->incount[1])
|
||||||
@ -139,6 +248,39 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
|
|||||||
sizeof(word_t)*strictprimers->size,
|
sizeof(word_t)*strictprimers->size,
|
||||||
"Cannot reallocate strict primer table");
|
"Cannot reallocate strict primer table");
|
||||||
|
|
||||||
|
if (neededWords){
|
||||||
|
ECOFREE(neededWords,"Clean needed word table");
|
||||||
|
}
|
||||||
|
//TR: Somehow for some primers the strictcount value is extremely large and hence invalid;
|
||||||
|
//we need to remove these primers from the list
|
||||||
|
j = strictprimers->size+1;
|
||||||
|
for (i=0; i<strictprimers->size; i++)
|
||||||
|
{
|
||||||
|
if (strictprimers->strictcount[i] > seqdbsize)
|
||||||
|
{
|
||||||
|
if (j == (strictprimers->size+1))
|
||||||
|
j = i;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (j < i && strictprimers->strictcount[i] <= seqdbsize)
|
||||||
|
{
|
||||||
|
strictprimers->words[j] = strictprimers->words[i];
|
||||||
|
strictprimers->strictcount[j] = strictprimers->strictcount[i];
|
||||||
|
j++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (j < strictprimers->size)
|
||||||
|
{
|
||||||
|
strictprimers->size = j;
|
||||||
|
strictprimers->strictcount = ECOREALLOC(strictprimers->strictcount,
|
||||||
|
sizeof(uint32_t)*strictprimers->size,
|
||||||
|
"Cannot reallocate strict primer count table");
|
||||||
|
strictprimers->words = ECOREALLOC(strictprimers->words,
|
||||||
|
sizeof(word_t)*strictprimers->size,
|
||||||
|
"Cannot reallocate strict primer table");
|
||||||
|
}
|
||||||
|
|
||||||
return strictprimers;
|
return strictprimers;
|
||||||
}
|
}
|
||||||
|
|
||||||
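The loop added at the end of lookforStrictPrimer() filters out words whose strictcount exceeds the database size, compacting the two parallel arrays (words and strictcount) in place with a read index i and a write index j. A stand-alone sketch of that two-index compaction; compact_by_count and its parameter names are illustrative, not the ecoPrimers function.

#include <stddef.h>
#include <stdint.h>

/* Drop entries whose count is implausible (count > limit), compacting the two
 * parallel arrays in place; returns the number of entries kept. */
static size_t compact_by_count(uint64_t *words, uint32_t *counts, size_t size, uint32_t limit)
{
	size_t i, j = 0;

	for (i = 0; i < size; i++)
	{
		if (counts[i] > limit)
			continue;                  /* invalid entry: skip it           */
		if (j != i)
		{
			words[j]  = words[i];      /* shift the kept entry to the left */
			counts[j] = counts[i];
		}
		j++;
	}
	return j;
}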
|
378	src/libecoprimer/taxstats.c	Normal file
@@ -0,0 +1,378 @@
/*
|
||||||
|
* taxstats.c
|
||||||
|
*
|
||||||
|
* Created on: 12 mars 2009
|
||||||
|
* Author: coissac
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <search.h>
|
||||||
|
//void tdestroy (void *root, void (*free_node)(void *nodep));
|
||||||
|
|
||||||
|
#include "ecoprimer.h"
|
||||||
|
|
||||||
|
static int cmptaxon(const void *t1, const void* t2);
|
||||||
|
|
||||||
|
void **tree_root = NULL;
|
||||||
|
int delete_passes = 0;
|
||||||
|
|
||||||
|
void delete_twalkaction (const void *node, VISIT order, int level)
|
||||||
|
{
|
||||||
|
switch (order)
|
||||||
|
{
|
||||||
|
case preorder:
|
||||||
|
delete_passes++;
|
||||||
|
break;
|
||||||
|
case postorder:
|
||||||
|
delete_passes++;
|
||||||
|
break;
|
||||||
|
case endorder:
|
||||||
|
delete_passes++;
|
||||||
|
break;
|
||||||
|
case leaf:
|
||||||
|
if (tree_root)
|
||||||
|
tdelete (node, tree_root,cmptaxon);
|
||||||
|
delete_passes++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void free_tree_nodes (void *tree)
|
||||||
|
{
|
||||||
|
while (1)
|
||||||
|
{
|
||||||
|
delete_passes = 0;
|
||||||
|
twalk (tree, delete_twalkaction);
|
||||||
|
if (delete_passes <= 1) break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static int cmptaxon(const void *t1, const void* t2)
|
||||||
|
{
|
||||||
|
const size_t taxid1=(size_t)t1;
|
||||||
|
const size_t taxid2=(size_t)t2;
|
||||||
|
|
||||||
|
// fprintf(stderr,"==> counted taxid1 : %d\n",taxid1);
|
||||||
|
// fprintf(stderr,"==> counted taxid2 : %d\n",taxid2);
|
||||||
|
|
||||||
|
if (taxid1 < taxid2)
|
||||||
|
return -1;
|
||||||
|
if (taxid1 > taxid2)
|
||||||
|
return +1;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t counttaxon(int32_t taxid)
|
||||||
|
{
|
||||||
|
static void* taxontree=NULL;
|
||||||
|
static int32_t taxoncount=0;
|
||||||
|
|
||||||
|
// fprintf(stderr,"counted taxid : %d taxontree %p\n",taxid,taxontree);
|
||||||
|
|
||||||
|
if (taxid==-1)
|
||||||
|
{
|
||||||
|
if (taxontree)
|
||||||
|
{
|
||||||
|
tree_root = (void **)&taxontree;
|
||||||
|
//free_tree_nodes (taxontree);
|
||||||
|
ECOFREE(taxontree,"Free taxon tree");
|
||||||
|
tree_root = NULL;
|
||||||
|
}
|
||||||
|
taxontree=NULL;
|
||||||
|
taxoncount=0;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if ((taxid > 0) && ((!taxontree) || (!tfind((void*)((size_t)taxid),&taxontree,cmptaxon))))
|
||||||
|
{
|
||||||
|
tsearch((void*)((size_t)taxid),&taxontree,cmptaxon);
|
||||||
|
taxoncount++;
|
||||||
|
}
|
||||||
|
return taxoncount;
|
||||||
|
}
|
||||||
|
|
||||||
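counttaxon() above counts distinct taxids by keeping a static tsearch() tree whose keys are the ids themselves cast to pointers (they are compared, never dereferenced) and by resetting the tree when called with -1. A small self-contained sketch of that id-counting trick; cmpid and the sample ids are assumptions of the example.

#include <search.h>
#include <stdio.h>
#include <stddef.h>

/* keys are ids encoded directly in the pointer value, as counttaxon() does */
static int cmpid(const void *a, const void *b)
{
	size_t x = (size_t)a, y = (size_t)b;

	return (x < y) ? -1 : (x > y) ? 1 : 0;
}

int main(void)
{
	void *tree = NULL;
	int   ids[] = { 9606, 10090, 9606, 7227, 10090 };
	int   distinct = 0;
	size_t i;

	for (i = 0; i < sizeof(ids) / sizeof(ids[0]); i++)
	{
		void *key = (void *)(size_t)ids[i];

		if (!tfind(key, &tree, cmpid))     /* not counted yet */
		{
			tsearch(key, &tree, cmpid);
			distinct++;
		}
	}
	printf("%d distinct ids\n", distinct);    /* prints 3 */
	return 0;
}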
|
int32_t getrankdbstats(pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxonomy,
|
||||||
|
poptions_t options)
|
||||||
|
{
|
||||||
|
|
||||||
|
uint32_t i;
|
||||||
|
ecotx_t *taxon;
|
||||||
|
ecotx_t *tmptaxon;
|
||||||
|
|
||||||
|
counttaxon(-1);
|
||||||
|
options->intaxa = 0;
|
||||||
|
|
||||||
|
for (i=0;i<seqdbsize;i++)
|
||||||
|
{
|
||||||
|
taxon = &(taxonomy->taxons->taxon[seqdb[i]->taxid]);
|
||||||
|
seqdb[i]->isexample=isExampleTaxon(taxonomy,seqdb[i]->taxid,options);
|
||||||
|
|
||||||
|
tmptaxon = eco_findtaxonatrank(taxon,
|
||||||
|
options->taxonrankidx);
|
||||||
|
|
||||||
|
// fprintf(stderr,"Taxid : %d %p\n",taxon->taxid,tmptaxon);
|
||||||
|
|
||||||
|
if (tmptaxon)
|
||||||
|
{
|
||||||
|
// fprintf(stderr,"orig : %d trans : %d\n",taxon->taxid,
|
||||||
|
// tmptaxon->taxid);
|
||||||
|
|
||||||
|
seqdb[i]->ranktaxonid=tmptaxon->taxid;
|
||||||
|
if (seqdb[i]->isexample)
|
||||||
|
options->intaxa = counttaxon(tmptaxon->taxid);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
seqdb[i]->ranktaxonid=-1;
|
||||||
|
}
|
||||||
|
|
||||||
|
counttaxon(-1);
|
||||||
|
options->outtaxa = 0;
|
||||||
|
|
||||||
|
for (i=0;i<seqdbsize;i++)
|
||||||
|
{
|
||||||
|
if (seqdb[i]->ranktaxonid>=0 && !seqdb[i]->isexample)
|
||||||
|
options->outtaxa = counttaxon(seqdb[i]->ranktaxonid);
|
||||||
|
}
|
||||||
|
|
||||||
|
return options->outtaxa + options->intaxa;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
float taxonomycoverage(ppair_t pair, poptions_t options, pecodnadb_t seqdb,uint32_t seqdbsize)
|
||||||
|
{
|
||||||
|
int32_t seqcount;
|
||||||
|
int32_t i;
|
||||||
|
int32_t incount=0;
|
||||||
|
int32_t outcount=0;
|
||||||
|
uint32_t j;
|
||||||
|
|
||||||
|
|
||||||
|
memset (pair->coveredSeqs, 0, seqdbsize*sizeof (int));
|
||||||
|
seqcount=pair->pcr.ampcount;
|
||||||
|
|
||||||
|
counttaxon(-1);
|
||||||
|
for (i=0; i < seqcount; i++)
|
||||||
|
if (pair->pcr.amplifias[i].sequence->isexample
|
||||||
|
&& pair->pcr.amplifias[i].sequence->ranktaxonid > 0 )
|
||||||
|
{
|
||||||
|
incount = counttaxon(pair->pcr.amplifias[i].sequence->ranktaxonid);
|
||||||
|
|
||||||
|
for (j=0; j<seqdbsize; j++)
|
||||||
|
if (pair->pcr.amplifias[i].sequence == seqdb[j])
|
||||||
|
{pair->coveredSeqs[j] = 1; break;}
|
||||||
|
}
|
||||||
|
|
||||||
|
counttaxon(-1);
|
||||||
|
for (i=0; i < seqcount; i++)
|
||||||
|
if (!pair->pcr.amplifias[i].sequence->isexample
|
||||||
|
&& pair->pcr.amplifias[i].sequence->ranktaxonid)
|
||||||
|
outcount = counttaxon(pair->pcr.amplifias[i].sequence->ranktaxonid);
|
||||||
|
|
||||||
|
|
||||||
|
pair->intaxa=incount;
|
||||||
|
pair->outtaxa=outcount;
|
||||||
|
pair->bc=(float)incount/options->intaxa;
|
||||||
|
return pair->bc;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
static int cmpamp(const void *ampf1, const void* ampf2)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
int j = 0;
|
||||||
|
int incr = 1;
|
||||||
|
char cd1;
|
||||||
|
char cd2;
|
||||||
|
int chd = 0;
|
||||||
|
int len = 0;
|
||||||
|
|
||||||
|
pamptotaxon_t pampf1 = (pamptotaxon_t) ampf1;
|
||||||
|
pamptotaxon_t pampf2 = (pamptotaxon_t) ampf2;
|
||||||
|
|
||||||
|
|
||||||
|
if (pampf1->strand != pampf2->strand)
|
||||||
|
{
|
||||||
|
incr = -1;
|
||||||
|
j = pampf1->length - 1;
|
||||||
|
|
||||||
|
if (pampf2->strand)
|
||||||
|
{
|
||||||
|
pampf1 = (pamptotaxon_t) ampf2;
|
||||||
|
pampf2 = (pamptotaxon_t) ampf1;
|
||||||
|
chd = 1;
|
||||||
|
}
|
||||||
|
//j = pampf2->length - 1; should have been here and pampf2 instead of pampf1?
|
||||||
|
}
|
||||||
|
|
||||||
|
len = (pampf1->length <= pampf2->length)? pampf1->length: pampf2->length;
|
||||||
|
|
||||||
|
for (i = 0; i < len; i++, j += incr)
|
||||||
|
{
|
||||||
|
cd1 = pampf1->amplifia[i];
|
||||||
|
if (incr == -1)
|
||||||
|
cd2 = ecoComplementChar(pampf2->amplifia[j]);
|
||||||
|
else
|
||||||
|
cd2 = pampf2->amplifia[j];
|
||||||
|
|
||||||
|
if (cd1 < cd2) return chd ? 1: -1;
|
||||||
|
if (cd2 < cd1) return chd ? -1: 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (pampf1->length > pampf2->length) return chd ? -1: 1;
|
||||||
|
if (pampf2->length > pampf1->length) return chd ? 1: -1;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}*/
|
||||||
|
|
||||||
|
|
||||||
|
static int cmpamp(const void *ampf1, const void* ampf2)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
char cd1;
|
||||||
|
char cd2;
|
||||||
|
int len = 0;
|
||||||
|
const char *ch1;
|
||||||
|
const char *ch2;
|
||||||
|
int incr1;
|
||||||
|
int incr2;
|
||||||
|
|
||||||
|
pamptotaxon_t pampf1 = (pamptotaxon_t) ampf1;
|
||||||
|
pamptotaxon_t pampf2 = (pamptotaxon_t) ampf2;
|
||||||
|
|
||||||
|
ch1 = pampf1->amplifia;
|
||||||
|
ch2 = pampf2->amplifia;
|
||||||
|
|
||||||
|
incr1 = 1;
|
||||||
|
incr2 = 1;
|
||||||
|
|
||||||
|
if (!pampf1->strand)
|
||||||
|
incr1 = -1;
|
||||||
|
if (!pampf2->strand)
|
||||||
|
incr2 = -1;
|
||||||
|
|
||||||
|
len = (pampf1->length <= pampf2->length)? pampf1->length: pampf2->length;
|
||||||
|
for (i = 0; i < len; i++)
|
||||||
|
{
|
||||||
|
cd1 = *ch1;
|
||||||
|
if (incr1 == -1)
|
||||||
|
cd1 = ecoComplementChar(*ch1);
|
||||||
|
|
||||||
|
cd2 = *ch2;
|
||||||
|
if (incr2 == -1)
|
||||||
|
cd2 = ecoComplementChar(*ch2);
|
||||||
|
|
||||||
|
if (cd1 < cd2) return -1;
|
||||||
|
if (cd2 < cd1) return 1;
|
||||||
|
|
||||||
|
ch1 += incr1;
|
||||||
|
ch2 += incr2;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (pampf1->length > pampf2->length) return 1;
|
||||||
|
if (pampf2->length > pampf1->length) return -1;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void twalkaction (const void *node, VISIT order, int level)
|
||||||
|
{
|
||||||
|
int32_t *taxid = (int32_t*)node;
|
||||||
|
//const size_t taxid=(size_t)node;
|
||||||
|
//printf ("\t%d:%p, ", *taxid, node);
|
||||||
|
counttaxon(*taxid);
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t gtxid;
|
||||||
|
void twalkaction2 (const void *node, VISIT order, int level)
|
||||||
|
{
|
||||||
|
int32_t *pt = (int32_t *) node;
|
||||||
|
gtxid = *pt;
|
||||||
|
}
|
||||||
|
|
||||||
|
void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize)
|
||||||
|
{
|
||||||
|
uint32_t i, j;
|
||||||
|
uint32_t ampfindex = 0;
|
||||||
|
int32_t taxid;
|
||||||
|
uint32_t wellidentifiedcount;
|
||||||
|
|
||||||
|
void *ampftree = NULL;
|
||||||
|
pamptotaxon_t pcurrentampf;
|
||||||
|
pamptotaxon_t *ptmp;
|
||||||
|
|
||||||
|
pamptotaxon_t ampfwithtaxtree = ECOMALLOC(sizeof(amptotaxon_t) * pair->pcr.ampcount,"Cannot allocate amplifia tree");
|
||||||
|
|
||||||
|
for (i = 0; i < pair->pcr.ampcount; i++)
|
||||||
|
{
|
||||||
|
/*populate taxon ids tree against each unique amplifia
|
||||||
|
i.e set of taxon ids for each amplifia*/
|
||||||
|
if (pair->pcr.amplifias[i].sequence->isexample)
|
||||||
|
{
|
||||||
|
ampfwithtaxtree[ampfindex].amplifia = pair->pcr.amplifias[i].amplifia;
|
||||||
|
ampfwithtaxtree[ampfindex].strand = pair->pcr.amplifias[i].strand;
|
||||||
|
ampfwithtaxtree[ampfindex].length = pair->pcr.amplifias[i].length;
|
||||||
|
pcurrentampf = &ampfwithtaxtree[ampfindex];
|
||||||
|
taxid = pair->pcr.amplifias[i].sequence->ranktaxonid;
|
||||||
|
ptmp = tfind((const void*)pcurrentampf, &ampftree, cmpamp);
|
||||||
|
if (ptmp == NULL)
|
||||||
|
{
|
||||||
|
pcurrentampf = &ampfwithtaxtree[ampfindex];
|
||||||
|
tsearch((void*)pcurrentampf,&ampftree,cmpamp);
|
||||||
|
ampfindex++;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
pcurrentampf = *ptmp;
|
||||||
|
|
||||||
|
if (tfind((void*)((size_t)taxid), &(pcurrentampf->taxontree), cmptaxon) == NULL)
|
||||||
|
{
|
||||||
|
pcurrentampf->taxoncount++;
|
||||||
|
tsearch((void*)((size_t)taxid),&(pcurrentampf->taxontree),cmptaxon);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
memset (pair->wellIdentifiedSeqs, 0, seqdbsize*sizeof (int));
|
||||||
|
//counttaxon(-1);
|
||||||
|
for (i = 0; i < ampfindex; i++)
|
||||||
|
{
|
||||||
|
if (ampfwithtaxtree[i].taxoncount > 1)
|
||||||
|
{
|
||||||
|
//printf ("\nampfwithtaxtree[i].taxoncount: %d\n", ampfwithtaxtree[i].taxoncount);
|
||||||
|
//twalk(ampfwithtaxtree[i].taxontree, twalkaction);
|
||||||
|
}
|
||||||
|
//TR 5/9/10 - added code for well identified seqs
|
||||||
|
else if(ampfwithtaxtree[i].taxoncount == 1) /*well identified*/
|
||||||
|
{
|
||||||
|
gtxid = -1;
|
||||||
|
twalk(ampfwithtaxtree[i].taxontree, twalkaction2);
|
||||||
|
|
||||||
|
if (gtxid != -1)
|
||||||
|
{
|
||||||
|
for (j = 0; j < seqdbsize; j++)
|
||||||
|
if (seqdb[j]->ranktaxonid == gtxid
|
||||||
|
&& seqdb[j]->isexample
|
||||||
|
&&(pair->p1->directCount[j] > 0
|
||||||
|
|| pair->p1->reverseCount[j] > 0)
|
||||||
|
&& (pair->p2->directCount[j] > 0
|
||||||
|
|| pair->p2->reverseCount[j] > 0))
|
||||||
|
{
|
||||||
|
pair->wellIdentifiedSeqs[j] = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
//printf ("\n");
|
||||||
|
counttaxon(-1);
|
||||||
|
wellidentifiedcount = 0;
|
||||||
|
for (j = 0; j < seqdbsize; j++)
|
||||||
|
if (pair->wellIdentifiedSeqs[j] == 1)
|
||||||
|
counttaxon(seqdb[j]->ranktaxonid);
|
||||||
|
wellidentifiedcount = counttaxon(-2);
|
||||||
|
//pair->notwellidentifiedtaxa = counttaxon(-2);
|
||||||
|
pair->notwellidentifiedtaxa = (pair->intaxa-wellidentifiedcount); //counttaxon(-2);
|
||||||
|
//pair->bs = ((float)pair->intaxa - (float)pair->notwellidentifiedtaxa) / pair->intaxa;
|
||||||
|
pair->bs = ((float)wellidentifiedcount) / (float)pair->intaxa;
|
||||||
|
|
||||||
|
ECOFREE (ampfwithtaxtree, "Free amplifia table");
|
||||||
|
|
||||||
|
}
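Taken together, taxonomycoverage() and taxonomyspecificity() reduce to two ratios: Bc = taxa amplified at the chosen rank / example taxa in the database, and Bs = taxa whose amplicon is unique to them / taxa amplified. A minimal sketch of that arithmetic with made-up counts (the variable names are illustrative, not from the patch):

#include <stdio.h>

int main(void)
{
    /* Illustrative counts only, not from a real run */
    int taxa_in_db      = 120;   /* options->intaxa     */
    int taxa_amplified  = 96;    /* pair->intaxa        */
    int taxa_identified = 81;    /* wellidentifiedcount */

    float bc = (float)taxa_amplified  / taxa_in_db;      /* coverage    */
    float bs = (float)taxa_identified / taxa_amplified;  /* specificity */

    printf("Bc = %.2f, Bs = %.2f\n", bc, bs);            /* Bc = 0.80, Bs = 0.84 */
    return 0;
}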
|
23
src/libthermo/Makefile
Normal file
23
src/libthermo/Makefile
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
|
||||||
|
SOURCES = nnparams.c \
|
||||||
|
thermostats.c
|
||||||
|
|
||||||
|
SRCS=$(SOURCES)
|
||||||
|
|
||||||
|
OBJECTS= $(patsubst %.c,%.o,$(SOURCES))
|
||||||
|
|
||||||
|
LIBFILE= libthermo.a
|
||||||
|
RANLIB= ranlib
|
||||||
|
|
||||||
|
|
||||||
|
include ../global.mk
|
||||||
|
|
||||||
|
|
||||||
|
all: $(LIBFILE)
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -rf $(OBJECTS) $(LIBFILE)
|
||||||
|
|
||||||
|
$(LIBFILE): $(OBJECTS)
|
||||||
|
ar -cr $@ $?
|
||||||
|
$(RANLIB) $@
|
609
src/libthermo/nnparams.c
Normal file
609
src/libthermo/nnparams.c
Normal file
@ -0,0 +1,609 @@
|
|||||||
|
/*
|
||||||
|
* nnparams.cpp
|
||||||
|
* PHunterLib
|
||||||
|
*
|
||||||
|
* Nearest Neighbor Model / Parameters
|
||||||
|
*
|
||||||
|
* Created by Tiayyba Riaz on 7/2/09.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <memory.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include"nnparams.h"
|
||||||
|
|
||||||
|
|
||||||
|
double forbidden_entropy;
|
||||||
|
|
||||||
|
char bpencoder[] = { 1, // A
|
||||||
|
0, // b
|
||||||
|
2, // C
|
||||||
|
0,0,0, // d, e, f
|
||||||
|
3, // G
|
||||||
|
0,0,0,0,0,0,0,0,0,0,0,0, // h,i,j,k,l,m,n,o,p,q,r,s
|
||||||
|
4,0, // T,U
|
||||||
|
0,0,0,0,0}; // v,w,x,y,z
|
||||||
|
|
||||||
|
|
||||||
|
double nparam_GetInitialEntropy(PNNParams nparm)
|
||||||
|
{
|
||||||
|
return -5.9f+nparm->rlogc;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//Retrieve Enthalpy for given NN-Pair from parameter table
|
||||||
|
double nparam_GetEnthalpy(PNNParams nparm, char x0, char x1, char y0, char y1)
|
||||||
|
{
|
||||||
|
return ndH(x0,x1,y0,y1); //xx, yx are already numbers
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//Retrieve Entropy for given NN-Pair from parameter table
|
||||||
|
double nparam_GetEntropy(PNNParams nparm, char x0, char x1, char y0, char y1)
|
||||||
|
{
|
||||||
|
//xx and yx are already numbers
|
||||||
|
char nx0=x0;//nparam_convertNum(x0);
|
||||||
|
char nx1=x1;//nparam_convertNum(x1);
|
||||||
|
char ny0=y0;//nparam_convertNum(y0);
|
||||||
|
char ny1=y1;//nparam_convertNum(y1);
|
||||||
|
double answer = ndS(nx0,nx1,ny0,ny1);
|
||||||
|
/*Salt correction Santalucia*/
|
||||||
|
if (nparm->saltMethod == SALT_METHOD_SANTALUCIA) {
|
||||||
|
if(nx0!=5 && 1<= nx1 && nx1<=4) {
|
||||||
|
answer += 0.5*nparm->kfac;
|
||||||
|
}
|
||||||
|
if(ny1!=5 && 1<= ny0 && ny0<=4) {
|
||||||
|
answer += 0.5*nparm->kfac;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/*Salt correction Owczarzy*/
|
||||||
|
if (nparm->saltMethod == SALT_METHOD_OWCZARZY) {
|
||||||
|
double logk = log(nparm->kplus);
|
||||||
|
answer += ndH(nx0,nx1,ny0,ny1)*((4.29 * nparm->gcContent-3.95)*0.00001*logk+ 0.0000094*logk*logk);
|
||||||
|
}
|
||||||
|
return answer;
|
||||||
|
}
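For the salt corrections above: the SantaLucia branch adds 0.5 * kfac per eligible stack, where nparam_InitParams() sets kfac = 0.368 * ln[K+], and the Owczarzy branch rescales the entropy by dH * ((4.29 * GC - 3.95) * 1e-5 * ln[K+] + 9.4e-6 * ln^2[K+]). A small worked example of the SantaLucia term, assuming the default 50 mM monovalent salt (DEF_SALT in nnparams.h):

#include <math.h>
#include <stdio.h>

int main(void)
{
    /* Worked example of the SantaLucia salt term at the default 50 mM K+ */
    double kplus = 0.05;
    double kfac  = 0.368 * log(kplus);        /* as in nparam_InitParams */
    printf("kfac = %.3f, per-stack correction = %.3f cal/(mol*K)\n",
           kfac, 0.5 * kfac);                 /* about -1.102 and -0.551 */
    return 0;
}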
|
||||||
|
|
||||||
|
/* PURPOSE: Return melting temperature TM for given entropy and enthalpy
|
||||||
|
* Assuming a one-state transition and using the formula
|
||||||
|
* TM = dH / (dS + R ln(Ct/4))
|
||||||
|
* entropy = dS + R ln Ct/4 (must already be included!)
|
||||||
|
* enthalpy = dH
|
||||||
|
* where
|
||||||
|
* dH = enthalpy
|
||||||
|
* dS = entropy
|
||||||
|
* R = molar gas constant
|
||||||
|
* Ct = Strand Concentration
|
||||||
|
*
|
||||||
|
* PARAMETERS:
|
||||||
|
* entropy and enthalpy
|
||||||
|
*
|
||||||
|
* RETURN VALUE:
|
||||||
|
* temperature
|
||||||
|
*/
|
||||||
|
|
||||||
|
double nparam_CalcTM(double entropy,double enthalpy)
|
||||||
|
{
|
||||||
|
double tm = 0; // absolute zero - return if model fails!
|
||||||
|
if (enthalpy>=forbidden_enthalpy) //||(entropy==-cfact))
|
||||||
|
return 0;
|
||||||
|
if (entropy<0) // avoid division by zero and model errors!
|
||||||
|
{
|
||||||
|
tm = enthalpy/entropy;// - kfac; //LKFEB
|
||||||
|
if (tm<0)
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return tm;
|
||||||
|
}
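As a sanity check of the formula quoted in the comment above (TM = dH / (dS + R ln(Ct/4)), with the concentration term already folded into the entropy before nparam_CalcTM() is called), here is a hand-computed example; the dH and dS totals are illustrative and not taken from the parameter tables:

#include <math.h>
#include <stdio.h>

#define R 1.987                               /* cal/(mol*K), as in nnparams.h */

int main(void)
{
    /* Illustrative totals for a short duplex */
    double dH = -68000.0;                     /* cal/mol                  */
    double dS = -190.0;                       /* cal/(mol*K)              */
    double Ct = 0.0000008;                    /* DEF_CONC_PRIMERS         */

    double entropy = dS + R * log(Ct / 4);    /* entropy incl. conc. term */
    double tm_k    = dH / entropy;            /* same division as nparam_CalcTM */

    printf("Tm = %.1f K = %.1f C\n", tm_k, tm_k - 273.0);   /* about 308.2 K */
    return 0;
}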
|
||||||
|
|
||||||
|
|
||||||
|
void nparam_InitParams(PNNParams nparm, double c1, double c2, double kp, int sm)
|
||||||
|
{
|
||||||
|
nparm->Ct1 = c1;
|
||||||
|
nparm->Ct2 = c2;
|
||||||
|
nparm->kplus = kp;
|
||||||
|
int maxCT = 1;
|
||||||
|
if(nparm->Ct2 > nparm->Ct1)
|
||||||
|
{
|
||||||
|
maxCT = 2;
|
||||||
|
}
|
||||||
|
double ctFactor;
|
||||||
|
if(nparm->Ct1 == nparm->Ct2)
|
||||||
|
{
|
||||||
|
ctFactor = nparm->Ct1/2;
|
||||||
|
}
|
||||||
|
else if (maxCT == 1)
|
||||||
|
{
|
||||||
|
ctFactor = nparm->Ct1-nparm->Ct2/2;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
ctFactor = nparm->Ct2-nparm->Ct1/2;
|
||||||
|
}
|
||||||
|
nparm->rlogc = R * log(ctFactor);
|
||||||
|
forbidden_entropy = nparm->rlogc;
|
||||||
|
nparm->kfac = 0.368 * log (nparm->kplus);
|
||||||
|
nparm->saltMethod = sm;
|
||||||
|
int x,y,a,b; // variables used as counters...
|
||||||
|
|
||||||
|
// Set all parameters to zero!
|
||||||
|
memset(nparm->dH,0,sizeof(nparm->dH));
|
||||||
|
memset(nparm->dS,0,sizeof(nparm->dS));
|
||||||
|
|
||||||
|
// Set all X-/Y-, -X/Y- and X-/-Y so, that TM will be VERY small!
|
||||||
|
for (x=1;x<=4;x++)
|
||||||
|
{
|
||||||
|
for (y=1;y<=4;y++)
|
||||||
|
{
|
||||||
|
ndH(0,x,y,0)=forbidden_enthalpy;
|
||||||
|
ndS(0,x,y,0)=forbidden_entropy;
|
||||||
|
ndH(x,0,0,y)=forbidden_enthalpy;
|
||||||
|
ndS(x,0,0,y)=forbidden_entropy;
|
||||||
|
ndH(x,0,y,0)=forbidden_enthalpy;
|
||||||
|
ndS(x,0,y,0)=forbidden_entropy;
|
||||||
|
// forbid X-/Y$ and X$/Y- etc., i.e. terminal must not be paired with gap!
|
||||||
|
ndH(x,5,y,0)=forbidden_enthalpy;
|
||||||
|
ndS(x,5,y,0)=forbidden_entropy;
|
||||||
|
ndH(x,0,y,5)=forbidden_enthalpy;
|
||||||
|
ndS(x,0,y,5)=forbidden_entropy;
|
||||||
|
ndH(5,x,0,y)=forbidden_enthalpy;
|
||||||
|
ndS(5,x,0,y)=forbidden_entropy;
|
||||||
|
ndH(0,x,5,y)=forbidden_enthalpy;
|
||||||
|
ndS(0,x,5,y)=forbidden_entropy;
|
||||||
|
// forbid X$/-Y etc.
|
||||||
|
ndH(x,5,0,y)=forbidden_enthalpy;
|
||||||
|
ndS(x,5,0,y)=forbidden_entropy;
|
||||||
|
ndH(x,0,5,y)=forbidden_enthalpy;
|
||||||
|
ndS(x,0,5,y)=forbidden_entropy;
|
||||||
|
ndH(5,x,y,0)=forbidden_enthalpy;
|
||||||
|
ndS(5,x,y,0)=forbidden_entropy;
|
||||||
|
ndH(0,x,y,5)=forbidden_enthalpy;
|
||||||
|
ndS(0,x,y,5)=forbidden_entropy;
|
||||||
|
|
||||||
|
}
|
||||||
|
// also, forbid x-/-- and --/x-, i.e. no two inner gaps paired
|
||||||
|
ndH(x,0,0,0)=forbidden_enthalpy;
|
||||||
|
ndS(x,0,0,0)=forbidden_entropy;
|
||||||
|
ndH(0,0,x,0)=forbidden_enthalpy;
|
||||||
|
ndS(0,0,x,0)=forbidden_entropy;
|
||||||
|
// x-/-$
|
||||||
|
ndH(x,0,0,5)=forbidden_enthalpy;
|
||||||
|
ndS(x,0,0,5)=forbidden_entropy;
|
||||||
|
ndH(5,0,0,x)=forbidden_enthalpy;
|
||||||
|
ndS(5,0,0,x)=forbidden_entropy;
|
||||||
|
ndH(0,5,x,0)=forbidden_enthalpy;
|
||||||
|
ndS(0,5,x,0)=forbidden_entropy;
|
||||||
|
ndH(0,x,5,0)=forbidden_enthalpy;
|
||||||
|
ndS(0,x,5,0)=forbidden_entropy;
|
||||||
|
}
|
||||||
|
// forbid --/--
|
||||||
|
ndH(0,0,0,0)=forbidden_enthalpy;
|
||||||
|
ndS(0,0,0,0)=forbidden_entropy;
|
||||||
|
|
||||||
|
ndH(5,0,0,0)=forbidden_enthalpy;
|
||||||
|
ndS(5,0,0,0)=forbidden_entropy;
|
||||||
|
ndH(0,0,5,0)=forbidden_enthalpy;
|
||||||
|
ndS(0,0,5,0)=forbidden_entropy;
|
||||||
|
ndH(0,5,5,0)=forbidden_enthalpy;
|
||||||
|
ndS(0,5,5,0)=forbidden_entropy;
|
||||||
|
|
||||||
|
// Interior loops (double Mismatches)
|
||||||
|
#define iloop_entropy -0.97f
|
||||||
|
#define iloop_enthalpy 0.0f
|
||||||
|
for (x=1; x<=4; x++)
|
||||||
|
for (y=1; y<=4; y++)
|
||||||
|
for (a=1; a<=4; a++)
|
||||||
|
for (b=1; b<=4; b++)
|
||||||
|
// AT and CG pair, and as A=1, C=2, G=3, T=4 this means
|
||||||
|
// we have Watson-Crick pairs if (x+a==5) and (y+b)==5.
|
||||||
|
if (!((x+a==5)||(y+b==5)))
|
||||||
|
{
|
||||||
|
// No watson-crick-pair, i.e. double mismatch!
|
||||||
|
// set enthalpy/entropy to loop expansion!
|
||||||
|
ndH(x,y,a,b) = iloop_enthalpy;
|
||||||
|
ndS(x,y,a,b) = iloop_entropy;
|
||||||
|
}
|
||||||
|
|
||||||
|
// xy/-- and --/xy (Bulge Loops of size > 1)
|
||||||
|
#define bloop_entropy -1.3f
|
||||||
|
#define bloop_enthalpy 0.0f
|
||||||
|
for (x=1; x<=4; x++)
|
||||||
|
for (y=1; y<=4; y++)
|
||||||
|
{
|
||||||
|
ndH(x,y,0,0) = bloop_enthalpy;
|
||||||
|
ndS(x,y,0,0) = bloop_entropy;
|
||||||
|
ndH(0,0,x,y) = bloop_enthalpy;
|
||||||
|
ndS(0,0,x,y) = bloop_entropy;
|
||||||
|
}
|
||||||
|
|
||||||
|
// x-/ya and xa/y- as well as -x/ay and ax/-y
|
||||||
|
// bulge opening and closing parameters with
|
||||||
|
// adjacent matches / mismatches
|
||||||
|
// obulge_mism and cbulge_mism chosen so high to avoid
|
||||||
|
// AAAAAAAAA
|
||||||
|
// T--G----T
|
||||||
|
// being better than
|
||||||
|
// AAAAAAAAA
|
||||||
|
// TG------T
|
||||||
|
#define obulge_match_H (-2.66f * 1000)
|
||||||
|
#define obulge_match_S -14.22f
|
||||||
|
#define cbulge_match_H (-2.66f * 1000)
|
||||||
|
#define cbulge_match_S -14.22f
|
||||||
|
#define obulge_mism_H (0.0f * 1000)
|
||||||
|
#define obulge_mism_S -6.45f
|
||||||
|
#define cbulge_mism_H 0.0f
|
||||||
|
#define cbulge_mism_S -6.45f
|
||||||
|
for (x=1; x<=4; x++)
|
||||||
|
for (y=1; y<=4; y++)
|
||||||
|
for (a=1; a<=4; a++)
|
||||||
|
{
|
||||||
|
if (x+y==5) // other base pair matches!
|
||||||
|
{
|
||||||
|
ndH(x,0,y,a)=obulge_match_H; // bulge opening
|
||||||
|
ndS(x,0,y,a)=obulge_match_S;
|
||||||
|
ndH(x,a,y,0)=obulge_match_H;
|
||||||
|
ndS(x,a,y,0)=obulge_match_S;
|
||||||
|
ndH(0,x,a,y)=cbulge_match_H; // bulge closing
|
||||||
|
ndS(0,x,a,y)=cbulge_match_S;
|
||||||
|
ndH(a,x,0,y)=cbulge_match_H;
|
||||||
|
ndS(a,x,0,y)=cbulge_match_S;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{ // mismatch in other base pair!
|
||||||
|
ndH(x,0,y,a)=obulge_mism_H; // bulge opening
|
||||||
|
ndS(x,0,y,a)=obulge_mism_S;
|
||||||
|
ndH(x,a,y,0)=obulge_mism_H;
|
||||||
|
ndS(x,a,y,0)=obulge_mism_S;
|
||||||
|
ndH(0,x,a,y)=cbulge_mism_H; // bulge closing
|
||||||
|
ndS(0,x,a,y)=cbulge_mism_S;
|
||||||
|
ndH(a,x,0,y)=cbulge_mism_H;
|
||||||
|
ndS(a,x,0,y)=cbulge_mism_S;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Watson-Crick pairs (note that only ten are unique, as obviously
|
||||||
|
// 5'-AG-3'/3'-TC-5' = 5'-CT-3'/3'-GA-5' etc.
|
||||||
|
ndH(1,1,4,4)=-7.6f*1000; ndS(1,1,4,4)=-21.3f; // AA/TT 04
|
||||||
|
ndH(1,2,4,3)=-8.4f*1000; ndS(1,2,4,3)=-22.4f; // AC/TG adapted GT/CA
|
||||||
|
ndH(1,3,4,2)=-7.8f*1000; ndS(1,3,4,2)=-21.0f; // AG/TC adapted CT/GA
|
||||||
|
ndH(1,4,4,1)=-7.2f*1000; ndS(1,4,4,1)=-20.4f; // AT/TA 04
|
||||||
|
ndH(2,1,3,4)=-8.5f*1000; ndS(2,1,3,4)=-22.7f; // CA/GT 04
|
||||||
|
ndH(2,2,3,3)=-8.0f*1000; ndS(2,2,3,3)=-19.9f; // CC/GG adapted GG/CC
|
||||||
|
ndH(2,3,3,2)=-10.6f*1000; ndS(2,3,3,2)=-27.2f; // CG/GC 04
|
||||||
|
ndH(2,4,3,1)=-7.8f*1000; ndS(2,4,3,1)=-21.0f; // CT/GA 04
|
||||||
|
ndH(3,1,2,4)=-8.2f*1000; ndS(3,1,2,4)=-22.2f; // GA/CT 04
|
||||||
|
ndH(3,2,2,3)=-9.8f*1000; ndS(3,2,2,3)=-24.4f; // GC/CG 04
|
||||||
|
ndH(3,3,2,2)=-8.0f*1000; ndS(3,3,2,2)=-19.9f; // GG/CC 04
|
||||||
|
ndH(3,4,2,1)=-8.4f*1000; ndS(3,4,2,1)=-22.4f; // GT/CA 04
|
||||||
|
ndH(4,1,1,4)=-7.2f*1000; ndS(4,1,1,4)=-21.3f; // TA/AT 04
|
||||||
|
ndH(4,2,1,3)=-8.2f*1000; ndS(4,2,1,3)=-22.2f; // TC/AG adapted GA/CT
|
||||||
|
ndH(4,3,1,2)=-8.5f*1000; ndS(4,3,1,2)=-22.7f; // TG/AC adapted CA/GT
|
||||||
|
ndH(4,4,1,1)=-7.6f*1000; ndS(4,4,1,1)=-21.3f; // TT/AA adapted AA/TT
|
||||||
|
|
||||||
|
// A-C Mismatches (Values for pH 7.0)
|
||||||
|
ndH(1,1,2,4)=7.6f*1000; ndS(1,1,2,4)=20.2f; // AA/CT
|
||||||
|
ndH(1,1,4,2)=2.3f*1000; ndS(1,1,4,2)=4.6f; // AA/TC
|
||||||
|
ndH(1,2,2,3)=-0.7f*1000; ndS(1,2,2,3)=-3.8f; // AC/CG
|
||||||
|
ndH(1,2,4,1)=5.3f*1000; ndS(1,2,4,1)=14.6f; // AC/TA
|
||||||
|
ndH(1,3,2,2)=0.6f*1000; ndS(1,3,2,2)=-0.6f; // AG/CC
|
||||||
|
ndH(1,4,2,1)=5.3f*1000; ndS(1,4,2,1)=14.6f; // AT/CA
|
||||||
|
ndH(2,1,1,4)=3.4f*1000; ndS(2,1,1,4)=8.0f; // CA/AT
|
||||||
|
ndH(2,1,3,2)=1.9f*1000; ndS(2,1,3,2)=3.7f; // CA/GC
|
||||||
|
ndH(2,2,1,3)=5.2f*1000; ndS(2,2,1,3)=14.2f; // CC/AG
|
||||||
|
ndH(2,2,3,1)=0.6f*1000; ndS(2,2,3,1)=-0.6f; // CC/GA
|
||||||
|
ndH(2,3,1,2)=1.9f*1000; ndS(2,3,1,2)=3.7f; // CG/AC
|
||||||
|
ndH(2,4,1,1)=2.3f*1000; ndS(2,4,1,1)=4.6f; // CT/AA
|
||||||
|
ndH(3,1,2,2)=5.2f*1000; ndS(3,1,2,2)=14.2f; // GA/CC
|
||||||
|
ndH(3,2,2,1)=-0.7f*1000; ndS(3,2,2,1)=-3.8f; // GC/CA
|
||||||
|
ndH(4,1,1,2)=3.4f*1000; ndS(4,1,1,2)=8.0f; // TA/AC
|
||||||
|
ndH(4,2,1,1)=7.6f*1000; ndS(4,2,1,1)=20.2f; // TC/AA
|
||||||
|
|
||||||
|
// C-T Mismatches
|
||||||
|
ndH(1,2,4,4)=0.7f*1000; ndS(1,2,4,4)=0.2f; // AC/TT
|
||||||
|
ndH(1,4,4,2)=-1.2f*1000; ndS(1,4,4,2)=-6.2f; // AT/TC
|
||||||
|
ndH(2,1,4,4)=1.0f*1000; ndS(2,1,4,4)=0.7f; // CA/TT
|
||||||
|
ndH(2,2,3,4)=-0.8f*1000; ndS(2,2,3,4)=-4.5f; // CC/GT
|
||||||
|
ndH(2,2,4,3)=5.2f*1000; ndS(2,2,4,3)=13.5f; // CC/TG
|
||||||
|
ndH(2,3,4,2)=-1.5f*1000; ndS(2,3,4,2)=-6.1f; // CG/TC
|
||||||
|
ndH(2,4,3,2)=-1.5f*1000; ndS(2,4,3,2)=-6.1f; // CT/GC
|
||||||
|
ndH(2,4,4,1)=-1.2f*1000; ndS(2,4,4,1)=-6.2f; // CT/TA
|
||||||
|
ndH(3,2,2,4)=2.3f*1000; ndS(3,2,2,4)=5.4f; // GC/CT
|
||||||
|
ndH(3,4,2,2)=5.2f*1000; ndS(3,4,2,2)=13.5f; // GT/CC
|
||||||
|
ndH(4,1,2,4)=1.2f*1000; ndS(4,1,2,4)=0.7f; // TA/CT
|
||||||
|
ndH(4,2,2,3)=2.3f*1000; ndS(4,2,2,3)=5.4f; // TC/CG
|
||||||
|
ndH(4,2,1,4)=1.2f*1000; ndS(4,2,1,4)=0.7f; // TC/AT
|
||||||
|
ndH(4,3,2,2)=-0.8f*1000; ndS(4,3,2,2)=-4.5f; // TG/CC
|
||||||
|
ndH(4,4,2,1)=0.7f*1000; ndS(4,4,2,1)=0.2f; // TT/CA
|
||||||
|
ndH(4,4,1,2)=1.0f*1000; ndS(4,4,1,2)=0.7f; // TT/AC
|
||||||
|
|
||||||
|
// G-A Mismatches
|
||||||
|
ndH(1,1,3,4)=3.0f*1000; ndS(1,1,3,4)=7.4f; // AA/GT
|
||||||
|
ndH(1,1,4,3)=-0.6f*1000; ndS(1,1,4,3)=-2.3f; // AA/TG
|
||||||
|
ndH(1,2,3,3)=0.5f*1000; ndS(1,2,3,3)=3.2f; // AC/GG
|
||||||
|
ndH(1,3,3,2)=-4.0f*1000; ndS(1,3,3,2)=-13.2f; // AG/GC
|
||||||
|
ndH(1,3,4,1)=-0.7f*1000; ndS(1,3,4,1)=-2.3f; // AG/TA
|
||||||
|
ndH(1,4,3,1)=-0.7f*1000; ndS(1,4,3,1)=-2.3f; // AT/GA
|
||||||
|
ndH(2,1,3,3)=-0.7f*1000; ndS(2,1,3,3)=-2.3f; // CA/GG
|
||||||
|
ndH(2,3,3,1)=-4.0f*1000; ndS(2,3,3,1)=-13.2f; // CG/GA
|
||||||
|
ndH(3,1,1,4)=0.7f*1000; ndS(3,1,1,4)=0.7f; // GA/AT
|
||||||
|
ndH(3,1,2,3)=-0.6f*1000; ndS(3,1,2,3)=-1.0f; // GA/CG
|
||||||
|
ndH(3,2,1,3)=-0.6f*1000; ndS(3,2,1,3)=-1.0f; // GC/AG
|
||||||
|
ndH(3,3,1,2)=-0.7f*1000; ndS(3,3,1,2)=-2.3f; // GG/AC
|
||||||
|
ndH(3,3,2,1)=0.5f*1000; ndS(3,3,2,1)=3.2f; // GG/CA
|
||||||
|
ndH(3,4,1,1)=-0.6f*1000; ndS(3,4,1,1)=-2.3f; // GT/AA
|
||||||
|
ndH(4,1,1,3)=0.7f*1000; ndS(4,1,1,3)=0.7f; // TA/AG
|
||||||
|
ndH(4,3,1,1)=3.0f*1000; ndS(4,3,1,1)=7.4f; // TG/AA
|
||||||
|
|
||||||
|
// G-T Mismatches
|
||||||
|
ndH(1,3,4,4)=1.0f*1000; ndS(1,3,4,4)=0.9f; // AG/TT
|
||||||
|
ndH(1,4,4,3)=-2.5f*1000; ndS(1,4,4,3)=-8.3f; // AT/TG
|
||||||
|
ndH(2,3,3,4)=-4.1f*1000; ndS(2,3,3,4)=-11.7f; // CG/GT
|
||||||
|
ndH(2,4,3,3)=-2.8f*1000; ndS(2,4,3,3)=-8.0f; // CT/GG
|
||||||
|
ndH(3,1,4,4)=-1.3f*1000; ndS(3,1,4,4)=-5.3f; // GA/TT
|
||||||
|
ndH(3,2,4,3)=-4.4f*1000; ndS(3,2,4,3)=-12.3f; // GC/TG
|
||||||
|
ndH(3,3,2,4)=3.3f*1000; ndS(3,3,2,4)=10.4f; // GG/CT
|
||||||
|
ndH(3,3,4,2)=-2.8f*1000; ndS(3,3,4,2)=-8.0f; // GG/TC
|
||||||
|
// ndH(3,3,4,4)=5.8f*1000; ndS(3,3,4,4)=16.3f; // GG/TT
|
||||||
|
ndH(3,4,2,3)=-4.4f*1000; ndS(3,4,2,3)=-12.3f; // GT/CG
|
||||||
|
ndH(3,4,4,1)=-2.5f*1000; ndS(3,4,4,1)=-8.3f; // GT/TA
|
||||||
|
// ndH(3,4,4,3)=4.1f*1000; ndS(3,4,4,3)=9.5f; // GT/TG
|
||||||
|
ndH(4,1,3,4)=-0.1f*1000; ndS(4,1,3,4)=-1.7f; // TA/GT
|
||||||
|
ndH(4,2,3,3)=3.3f*1000; ndS(4,2,3,3)=10.4f; // TC/GG
|
||||||
|
ndH(4,3,1,4)=-0.1f*1000; ndS(4,3,1,4)=-1.7f; // TG/AT
|
||||||
|
ndH(4,3,3,2)=-4.1f*1000; ndS(4,3,3,2)=-11.7f; // TG/GC
|
||||||
|
// ndH(4,3,3,4)=-1.4f*1000; ndS(4,3,3,4)=-6.2f; // TG/GT
|
||||||
|
ndH(4,4,1,3)=-1.3f*1000; ndS(4,4,1,3)=-5.3f; // TT/AG
|
||||||
|
ndH(4,4,3,1)=1.0f*1000; ndS(4,4,3,1)=0.9f; // TT/GA
|
||||||
|
// ndH(4,4,3,3)=5.8f*1000; ndS(4,4,3,3)=16.3f; // TT/GG
|
||||||
|
|
||||||
|
// A-A Mismatches
|
||||||
|
ndH(1,1,1,4)=4.7f*1000; ndS(1,1,1,4)=12.9f; // AA/AT
|
||||||
|
ndH(1,1,4,1)=1.2f*1000; ndS(1,1,4,1)=1.7f; // AA/TA
|
||||||
|
ndH(1,2,1,3)=-2.9f*1000; ndS(1,2,1,3)=-9.8f; // AC/AG
|
||||||
|
ndH(1,3,1,2)=-0.9f*1000; ndS(1,3,1,2)=-4.2f; // AG/AC
|
||||||
|
ndH(1,4,1,1)=1.2f*1000; ndS(1,4,1,1)=1.7f; // AT/AA
|
||||||
|
ndH(2,1,3,1)=-0.9f*1000; ndS(2,1,3,1)=-4.2f; // CA/GA
|
||||||
|
ndH(3,1,2,1)=-2.9f*1000; ndS(3,1,2,1)=-9.8f; // GA/CA
|
||||||
|
ndH(4,1,1,1)=4.7f*1000; ndS(4,1,1,1)=12.9f; // TA/AA
|
||||||
|
|
||||||
|
// C-C Mismatches
|
||||||
|
ndH(1,2,4,2)=0.0f*1000; ndS(1,2,4,2)=-4.4f; // AC/TC
|
||||||
|
ndH(2,1,2,4)=6.1f*1000; ndS(2,1,2,4)=16.4f; // CA/CT
|
||||||
|
ndH(2,2,2,3)=3.6f*1000; ndS(2,2,2,3)=8.9f; // CC/CG
|
||||||
|
ndH(2,2,3,2)=-1.5f*1000; ndS(2,2,3,2)=-7.2f; // CC/GC
|
||||||
|
ndH(2,3,2,2)=-1.5f*1000; ndS(2,3,2,2)=-7.2f; // CG/CC
|
||||||
|
ndH(2,4,2,1)=0.0f*1000; ndS(2,4,2,1)=-4.4f; // CT/CA
|
||||||
|
ndH(3,2,2,2)=3.6f*1000; ndS(3,2,2,2)=8.9f; // GC/CC
|
||||||
|
ndH(4,2,1,2)=6.1f*1000; ndS(4,2,1,2)=16.4f; // TC/AC
|
||||||
|
|
||||||
|
// G-G Mismatches
|
||||||
|
ndH(1,3,4,3)=-3.1f*1000; ndS(1,3,4,3)=-9.5f; // AG/TG
|
||||||
|
ndH(2,3,3,3)=-4.9f*1000; ndS(2,3,3,3)=-15.3f; // CG/GG
|
||||||
|
ndH(3,1,3,4)=1.6f*1000; ndS(3,1,3,4)=3.6f; // GA/GT
|
||||||
|
ndH(3,2,3,3)=-6.0f*1000; ndS(3,2,3,3)=-15.8f; // GC/GG
|
||||||
|
ndH(3,3,2,3)=-6.0f*1000; ndS(3,3,2,3)=-15.8f; // GG/CG
|
||||||
|
ndH(3,3,3,2)=-4.9f*1000; ndS(3,3,3,2)=-15.3f; // GG/GC
|
||||||
|
ndH(3,4,3,1)=-3.1f*1000; ndS(3,4,3,1)=-9.5f; // GT/GA
|
||||||
|
ndH(4,3,1,3)=1.6f*1000; ndS(4,3,1,3)=3.6f; // TG/AG
|
||||||
|
|
||||||
|
// T-T Mismatches
|
||||||
|
ndH(1,4,4,4)=-2.7f*1000; ndS(1,4,4,4)=-10.8f; // AT/TT
|
||||||
|
ndH(2,4,3,4)=-5.0f*1000; ndS(2,4,3,4)=-15.8f; // CT/GT
|
||||||
|
ndH(3,4,2,4)=-2.2f*1000; ndS(3,4,2,4)=-8.4f; // GT/CT
|
||||||
|
ndH(4,1,4,4)=0.2f*1000; ndS(4,1,4,4)=-1.5f; // TA/TT
|
||||||
|
ndH(4,2,4,3)=-2.2f*1000; ndS(4,2,4,3)=-8.4f; // TC/TG
|
||||||
|
ndH(4,3,4,2)=-5.0f*1000; ndS(4,3,4,2)=-15.8f; // TG/TC
|
||||||
|
ndH(4,4,1,4)=0.2f*1000; ndS(4,4,1,4)=-1.5f; // TT/AT
|
||||||
|
ndH(4,4,4,1)=-2.7f*1000; ndS(4,4,4,1)=-10.8f; // TT/TA
|
||||||
|
|
||||||
|
// Dangling Ends
|
||||||
|
ndH(5,1,1,4)=-0.7f*1000; ndS(5,1,1,4)=-0.8f; // $A/AT
|
||||||
|
ndH(5,1,2,4)=4.4f*1000; ndS(5,1,2,4)=14.9f; // $A/CT
|
||||||
|
ndH(5,1,3,4)=-1.6f*1000; ndS(5,1,3,4)=-3.6f; // $A/GT
|
||||||
|
ndH(5,1,4,4)=2.9f*1000; ndS(5,1,4,4)=10.4f; // $A/TT
|
||||||
|
ndH(5,2,1,3)=-2.1f*1000; ndS(5,2,1,3)=-3.9f; // $C/AG
|
||||||
|
ndH(5,2,2,3)=-0.2f*1000; ndS(5,2,2,3)=-0.1f; // $C/CG
|
||||||
|
ndH(5,2,3,3)=-3.9f*1000; ndS(5,2,3,3)=-11.2f; // $C/GG
|
||||||
|
ndH(5,2,4,3)=-4.4f*1000; ndS(5,2,4,3)=-13.1f; // $C/TG
|
||||||
|
ndH(5,3,1,2)=-5.9f*1000; ndS(5,3,1,2)=-16.5f; // $G/AC
|
||||||
|
ndH(5,3,2,2)=-2.6f*1000; ndS(5,3,2,2)=-7.4f; // $G/CC
|
||||||
|
ndH(5,3,3,2)=-3.2f*1000; ndS(5,3,3,2)=-10.4f; // $G/GC
|
||||||
|
ndH(5,3,4,2)=-5.2f*1000; ndS(5,3,4,2)=-15.0f; // $G/TC
|
||||||
|
ndH(5,4,1,1)=-0.5f*1000; ndS(5,4,1,1)=-1.1f; // $T/AA
|
||||||
|
ndH(5,4,2,1)=4.7f*1000; ndS(5,4,2,1)=14.2f; // $T/CA
|
||||||
|
ndH(5,4,3,1)=-4.1f*1000; ndS(5,4,3,1)=-13.1f; // $T/GA
|
||||||
|
ndH(5,4,4,1)=-3.8f*1000; ndS(5,4,4,1)=-12.6f; // $T/TA
|
||||||
|
ndH(1,5,4,1)=-2.9f*1000; ndS(1,5,4,1)=-7.6f; // A$/TA
|
||||||
|
ndH(1,5,4,2)=-4.1f*1000; ndS(1,5,4,2)=-13.0f; // A$/TC
|
||||||
|
ndH(1,5,4,3)=-4.2f*1000; ndS(1,5,4,3)=-15.0f; // A$/TG
|
||||||
|
ndH(1,5,4,4)=-0.2f*1000; ndS(1,5,4,4)=-0.5f; // A$/TT
|
||||||
|
ndH(1,1,5,4)=0.2f*1000; ndS(1,1,5,4)=2.3f; // AA/$T
|
||||||
|
ndH(1,1,4,5)=-0.5f*1000; ndS(1,1,4,5)=-1.1f; // AA/T$
|
||||||
|
ndH(1,2,5,3)=-6.3f*1000; ndS(1,2,5,3)=-17.1f; // AC/$G
|
||||||
|
ndH(1,2,4,5)=4.7f*1000; ndS(1,2,4,5)=14.2f; // AC/T$
|
||||||
|
ndH(1,3,5,2)=-3.7f*1000; ndS(1,3,5,2)=-10.0f; // AG/$C
|
||||||
|
ndH(1,3,4,5)=-4.1f*1000; ndS(1,3,4,5)=-13.1f; // AG/T$
|
||||||
|
ndH(1,4,5,1)=-2.9f*1000; ndS(1,4,5,1)=-7.6f; // AT/$A
|
||||||
|
ndH(1,4,4,5)=-3.8f*1000; ndS(1,4,4,5)=-12.6f; // AT/T$
|
||||||
|
ndH(2,5,3,1)=-3.7f*1000; ndS(2,5,3,1)=-10.0f; // C$/GA
|
||||||
|
ndH(2,5,3,2)=-4.0f*1000; ndS(2,5,3,2)=-11.9f; // C$/GC
|
||||||
|
ndH(2,5,3,3)=-3.9f*1000; ndS(2,5,3,3)=-10.9f; // C$/GG
|
||||||
|
ndH(2,5,3,4)=-4.9f*1000; ndS(2,5,3,4)=-13.8f; // C$/GT
|
||||||
|
ndH(2,1,5,4)=0.6f*1000; ndS(2,1,5,4)=3.3f; // CA/$T
|
||||||
|
ndH(2,1,3,5)=-5.9f*1000; ndS(2,1,3,5)=-16.5f; // CA/G$
|
||||||
|
ndH(2,2,5,3)=-4.4f*1000; ndS(2,2,5,3)=-12.6f; // CC/$G
|
||||||
|
ndH(2,2,3,5)=-2.6f*1000; ndS(2,2,3,5)=-7.4f; // CC/G$
|
||||||
|
ndH(2,3,5,2)=-4.0f*1000; ndS(2,3,5,2)=-11.9f; // CG/$C
|
||||||
|
ndH(2,3,3,5)=-3.2f*1000; ndS(2,3,3,5)=-10.4f; // CG/G$
|
||||||
|
ndH(2,4,5,1)=-4.1f*1000; ndS(2,4,5,1)=-13.0f; // CT/$A
|
||||||
|
ndH(2,4,3,5)=-5.2f*1000; ndS(2,4,3,5)=-15.0f; // CT/G$
|
||||||
|
ndH(3,5,2,1)=-6.3f*1000; ndS(3,5,2,1)=-17.1f; // G$/CA
|
||||||
|
ndH(3,5,2,2)=-4.4f*1000; ndS(3,5,2,2)=-12.6f; // G$/CC
|
||||||
|
ndH(3,5,2,3)=-5.1f*1000; ndS(3,5,2,3)=-14.0f; // G$/CG
|
||||||
|
ndH(3,5,2,4)=-4.0f*1000; ndS(3,5,2,4)=-10.9f; // G$/CT
|
||||||
|
ndH(3,1,5,4)=-1.1f*1000; ndS(3,1,5,4)=-1.6f; // GA/$T
|
||||||
|
ndH(3,1,2,5)=-2.1f*1000; ndS(3,1,2,5)=-3.9f; // GA/C$
|
||||||
|
ndH(3,2,5,3)=-5.1f*1000; ndS(3,2,5,3)=-14.0f; // GC/$G
|
||||||
|
ndH(3,2,2,5)=-0.2f*1000; ndS(3,2,2,5)=-0.1f; // GC/C$
|
||||||
|
ndH(3,3,5,2)=-3.9f*1000; ndS(3,3,5,2)=-10.9f; // GG/$C
|
||||||
|
ndH(3,3,2,5)=-3.9f*1000; ndS(3,3,2,5)=-11.2f; // GG/C$
|
||||||
|
ndH(3,4,5,1)=-4.2f*1000; ndS(3,4,5,1)=-15.0f; // GT/$A
|
||||||
|
ndH(3,4,2,5)=-4.4f*1000; ndS(3,4,2,5)=-13.1f; // GT/C$
|
||||||
|
ndH(4,5,1,1)=0.2f*1000; ndS(4,5,1,1)=2.3f; // T$/AA
|
||||||
|
ndH(4,5,1,2)=0.6f*1000; ndS(4,5,1,2)=3.3f; // T$/AC
|
||||||
|
ndH(4,5,1,3)=-1.1f*1000; ndS(4,5,1,3)=-1.6f; // T$/AG
|
||||||
|
ndH(4,5,1,4)=-6.9f*1000; ndS(4,5,1,4)=-20.0f; // T$/AT
|
||||||
|
ndH(4,1,5,4)=-6.9f*1000; ndS(4,1,5,4)=-20.0f; // TA/$T
|
||||||
|
ndH(4,1,1,5)=-0.7f*1000; ndS(4,1,1,5)=-0.7f; // TA/A$
|
||||||
|
ndH(4,2,5,3)=-4.0f*1000; ndS(4,2,5,3)=-10.9f; // TC/$G
|
||||||
|
ndH(4,2,1,5)=4.4f*1000; ndS(4,2,1,5)=14.9f; // TC/A$
|
||||||
|
ndH(4,3,5,2)=-4.9f*1000; ndS(4,3,5,2)=-13.8f; // TG/$C
|
||||||
|
ndH(4,3,1,5)=-1.6f*1000; ndS(4,3,1,5)=-3.6f; // TG/A$
|
||||||
|
ndH(4,4,5,1)=-0.2f*1000; ndS(4,4,5,1)=-0.5f; // TT/$A
|
||||||
|
ndH(4,4,1,5)=2.9f*1000; ndS(4,4,1,5)=10.4f; // TT/A$
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
int nparam_CountGCContent(char * seq ) {
|
||||||
|
int lseq = strlen(seq);
|
||||||
|
int k;
|
||||||
|
double count = 0;
|
||||||
|
for( k=0;k<lseq;k++) {
|
||||||
|
if (seq[k] == 'G' || seq[k] == 'C' ) {
|
||||||
|
count+=1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
void nparam_CleanSeq (const char* inseq, char* outseq, int len)
|
||||||
|
{
|
||||||
|
int seqlen = strlen (inseq);
|
||||||
|
int i, j;
|
||||||
|
|
||||||
|
if (len != 0)
|
||||||
|
seqlen = len;
|
||||||
|
|
||||||
|
outseq[0]='x';
|
||||||
|
|
||||||
|
for (i = 0, j = 0; i < seqlen && outseq[0]; i++,j++)
|
||||||
|
{
|
||||||
|
switch (inseq[i])
|
||||||
|
{
|
||||||
|
case 'a':
|
||||||
|
case '\0':
|
||||||
|
case 'A':
|
||||||
|
outseq[j] = 'A'; break;
|
||||||
|
case 'c':
|
||||||
|
case '\1':
|
||||||
|
case 'C':
|
||||||
|
outseq[j] = 'C'; break;
|
||||||
|
case 'g':
|
||||||
|
case '\2':
|
||||||
|
case 'G':
|
||||||
|
outseq[j] = 'G'; break;
|
||||||
|
case 't':
|
||||||
|
case '\3':
|
||||||
|
case 'T':
|
||||||
|
outseq[j] = 'T'; break;
|
||||||
|
default:
|
||||||
|
outseq[0]=0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
outseq[j] = '\0';
|
||||||
|
}
|
||||||
|
|
||||||
|
//Calculate TM for given sequence against its complement
|
||||||
|
double nparam_CalcSelfTM(PNNParams nparm, const char* seq, size_t len)
|
||||||
|
{
|
||||||
|
double thedH = 0;
|
||||||
|
//double thedS = nparam_GetInitialEntropy(nparm);
|
||||||
|
double thedS = -5.9f+nparm->rlogc;
|
||||||
|
double mtemp;
|
||||||
|
char c1;
|
||||||
|
char c2;
|
||||||
|
char c3;
|
||||||
|
char c4;
|
||||||
|
unsigned int i;
|
||||||
|
char nseq[50];
|
||||||
|
const char *useq = seq;
|
||||||
|
|
||||||
|
nparam_CleanSeq (seq, nseq, len);
|
||||||
|
useq = nseq;
|
||||||
|
|
||||||
|
for ( i=1;i<len;i++)
|
||||||
|
{
|
||||||
|
c1 = GETREVCODE(useq[i-1]); //nparam_getComplement(seq[i-1],1);
|
||||||
|
c2 = GETREVCODE(useq[i]); //nparam_getComplement(seq[i],1);
|
||||||
|
c3 = GETNUMCODE(useq[i-1]);
|
||||||
|
c4 = GETNUMCODE(useq[i]);
|
||||||
|
|
||||||
|
|
||||||
|
thedH += nparm->dH[(uint8_t) c3][(uint8_t)c4][(uint8_t)c1][(uint8_t)c2];//nparam_GetEnthalpy(nparm, c3,c4,c1,c2);
|
||||||
|
thedS += nparam_GetEntropy(nparm, c3,c4,c1,c2);
|
||||||
|
}
|
||||||
|
//printf("------------------\n");
|
||||||
|
mtemp = nparam_CalcTM(thedS,thedH);
|
||||||
|
//fprintf(stderr,"Enthalpy: %f, entropy: %f, seq: %s rloc=%f\n", thedH, thedS, useq, nparm->rlogc);
|
||||||
|
//exit (0);
|
||||||
|
return mtemp;
|
||||||
|
}
|
||||||
|
|
||||||
|
double nparam_CalcTwoTM(PNNParams nparm, const char* seq1, const char* seq2, size_t len)
|
||||||
|
{
|
||||||
|
double thedH = 0;
|
||||||
|
//double thedS = nparam_GetInitialEntropy(nparm);
|
||||||
|
double thedS = -5.9f+nparm->rlogc;
|
||||||
|
double mtemp;
|
||||||
|
char c1;
|
||||||
|
char c2;
|
||||||
|
char c3;
|
||||||
|
char c4;
|
||||||
|
unsigned int i;
|
||||||
|
char nseq1[50];
|
||||||
|
char nseq2[50];
|
||||||
|
char *useq1;
|
||||||
|
char *useq2;
|
||||||
|
|
||||||
|
nparam_CleanSeq (seq1, nseq1, len);
|
||||||
|
useq1 = nseq1;
|
||||||
|
|
||||||
|
nparam_CleanSeq (seq2, nseq2, len);
|
||||||
|
useq2 = nseq2;
|
||||||
|
|
||||||
|
//fprintf (stderr,"Primer : %s\n",useq);
|
||||||
|
for ( i=1;i<len;i++)
|
||||||
|
{
|
||||||
|
c1 = GETREVCODE(useq2[i-1]); //nparam_getComplement(seq[i-1],1);
|
||||||
|
c2 = GETREVCODE(useq2[i]); //nparam_getComplement(seq[i],1);
|
||||||
|
c3 = GETNUMCODE(useq1[i-1]);
|
||||||
|
c4 = GETNUMCODE(useq1[i]);
|
||||||
|
|
||||||
|
//fprintf (stderr,"Primer : %s %f %f %d %d, %d %d %f\n",useq,thedH,thedS,(int)c3,(int)c4,(int)c1,(int)c2,nparam_GetEnthalpy(nparm, c3,c4,c1,c2));
|
||||||
|
|
||||||
|
thedH += nparm->dH[(uint8_t)c3][(uint8_t)c4][(uint8_t)c1][(uint8_t)c2];//nparam_GetEnthalpy(nparm, c3,c4,c1,c2);
|
||||||
|
thedS += nparam_GetEntropy(nparm, c3,c4,c1,c2);
|
||||||
|
}
|
||||||
|
//fprintf(stderr,"------------------\n");
|
||||||
|
mtemp = nparam_CalcTM(thedS,thedH);
|
||||||
|
//if (mtemp == 0)
|
||||||
|
//{
|
||||||
|
// fprintf(stderr,"Enthalpy: %f, entropy: %f, seq: %s\n", thedH, thedS, useq);
|
||||||
|
//exit (0);
|
||||||
|
//}
|
||||||
|
return mtemp;
|
||||||
|
}
|
||||||
|
|
||||||
|
double calculateMeltingTemperatureBasic (char * seq) {
|
||||||
|
int gccount;
|
||||||
|
double temp;
|
||||||
|
int seqlen;
|
||||||
|
|
||||||
|
seqlen = strlen (seq);
|
||||||
|
gccount = nparam_CountGCContent (seq);
|
||||||
|
temp = 64.9 + 41*(gccount - 16.4)/seqlen;
|
||||||
|
return temp;
|
||||||
|
}
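A quick worked example of the basic Tm estimate above, for a hypothetical 20-mer containing 10 G or C bases:

#include <stdio.h>

int main(void)
{
    int seqlen = 20, gccount = 10;                     /* hypothetical primer */
    double temp = 64.9 + 41 * (gccount - 16.4) / seqlen;
    printf("basic Tm = %.2f C\n", temp);               /* 64.9 - 13.12 = 51.78 */
    return 0;
}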
|
67
src/libthermo/nnparams.h
Normal file
67
src/libthermo/nnparams.h
Normal file
@ -0,0 +1,67 @@
|
|||||||
|
/*
|
||||||
|
* nnparams.h
|
||||||
|
* PHunterLib
|
||||||
|
*
|
||||||
|
* Nearest Neighbor Model Parameters
|
||||||
|
*
|
||||||
|
* Created by Tiayyba Riaz on 02/07/09.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef NNPARAMS_H_
|
||||||
|
#define NNPARAMS_H_
|
||||||
|
|
||||||
|
#include <math.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
//#include "../libecoprimer/ecoprimer.h"
|
||||||
|
|
||||||
|
// following defines to simplify coding...
|
||||||
|
#define ndH(a,b,c,d) nparm->dH[(uint8_t) a][(uint8_t) b][(uint8_t) c][(uint8_t) d]
|
||||||
|
#define ndS(a,b,c,d) nparm->dS[(uint8_t) a][(uint8_t) b][(uint8_t) c][(uint8_t) d]
|
||||||
|
#define forbidden_enthalpy 1000000000000000000.0f
|
||||||
|
#define R 1.987f
|
||||||
|
#define SALT_METHOD_SANTALUCIA 1
|
||||||
|
#define SALT_METHOD_OWCZARZY 2
|
||||||
|
|
||||||
|
#define DEF_CONC_PRIMERS 0.0000008
|
||||||
|
#define DEF_CONC_SEQUENCES 0
|
||||||
|
#define DEF_SALT 0.05
|
||||||
|
|
||||||
|
#define GETNUMCODE(a) bpencoder[a - 'A']
|
||||||
|
#define GETREVCODE(a) 5-bpencoder[a - 'A']
|
||||||
|
|
||||||
|
|
||||||
|
extern double forbidden_entropy;
|
||||||
|
|
||||||
|
extern char bpencoder[]; // v,w,x,y,z
|
||||||
|
|
||||||
|
|
||||||
|
typedef struct CNNParams_st
|
||||||
|
{
|
||||||
|
double Ct1;
|
||||||
|
double Ct2;
|
||||||
|
double rlogc;
|
||||||
|
double kplus;
|
||||||
|
double kfac;
|
||||||
|
int saltMethod;
|
||||||
|
double gcContent;
|
||||||
|
double new_TM;
|
||||||
|
double dH[6][6][6][6]; // A-C-G-T + gap + initiation (dangling end, $ sign)
|
||||||
|
double dS[6][6][6][6];
|
||||||
|
}CNNParams, * PNNParams;
|
||||||
|
|
||||||
|
void nparam_InitParams(PNNParams nparm, double c1, double c2, double kp, int sm);
|
||||||
|
int nparam_CountGCContent(char * seq );
|
||||||
|
double nparam_GetEntropy(PNNParams nparm, char x0, char x1, char y0, char y1);
|
||||||
|
double nparam_GetEnthalpy(PNNParams nparm, char x0, char x1, char y0, char y1);
|
||||||
|
double nparam_CalcTM(double entropy,double enthalpy);
|
||||||
|
double nparam_CalcSelfTM(PNNParams nparm, const char* seq, size_t len);
|
||||||
|
double nparam_CalcTwoTM(PNNParams nparm, const char* seq1, const char* seq2, size_t len);
|
||||||
|
|
||||||
|
double nparam_GetInitialEntropy(PNNParams nparm) ;
|
||||||
|
double calculateMeltingTemperatureBasic (char * seq);
|
||||||
|
//void getThermoProperties (ppair_t* pairs, size_t count, poptions_t options);
|
||||||
|
|
||||||
|
#endif
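The ndH/ndS tables in nnparams.c are indexed with the 1..4 base codes produced by GETNUMCODE, and GETREVCODE returns the complementary code (5 - code), which is why a Watson-Crick stack is detected as x + a == 5. A standalone illustration of that encoding (the local code() helper simply mirrors bpencoder and is not part of the patch):

#include <stdio.h>

static int code(char b)            /* A=1, C=2, G=3, T=4, as in bpencoder */
{
    switch (b) {
        case 'A': return 1;
        case 'C': return 2;
        case 'G': return 3;
        case 'T': return 4;
        default:  return 0;        /* gap / unknown */
    }
}

int main(void)
{
    /* 5 - code(x) is the complement, so a Watson-Crick pair satisfies x + a == 5. */
    const char *bases = "ACGT";
    int i;
    for (i = 0; i < 4; i++)
        printf("%c -> %d, complement code %d\n",
               bases[i], code(bases[i]), 5 - code(bases[i]));
    return 0;
}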
|
113
src/libthermo/thermostats.c
Normal file
113
src/libthermo/thermostats.c
Normal file
@ -0,0 +1,113 @@
|
|||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <ctype.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include "thermostats.h"
|
||||||
|
|
||||||
|
word_t extractSite(char* sequence, size_t begin, size_t length, bool_t strand)
|
||||||
|
{
|
||||||
|
char *c;
|
||||||
|
char *start;
|
||||||
|
uint32_t l;
|
||||||
|
word_t site = 0;
|
||||||
|
|
||||||
|
start=sequence+begin;
|
||||||
|
if (!strand)
|
||||||
|
start+=length-1;
|
||||||
|
|
||||||
|
|
||||||
|
for (c=start,
|
||||||
|
l=0;
|
||||||
|
l<length;
|
||||||
|
l++,
|
||||||
|
c+=(strand)? 1:-1)
|
||||||
|
site = (site << 2) | ((strand)? (*c):(~*c)&3);
|
||||||
|
|
||||||
|
return site;
|
||||||
|
}
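extractSite() packs primer_length bases into a single word_t, two bits per base, and complements with (~c)&3 when reading the reverse strand; judging from nparam_CleanSeq() earlier in this diff, the in-memory codes are A=0, C=1, G=2, T=3, so (~c)&3 swaps A with T and C with G. A standalone sketch of the same packing follows; the sequence codes are made up for illustration and uint64_t stands in for the project's word_t:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t word_t;                       /* stand-in for the project's word_t */

int main(void)
{
    /* "ACGG" with A=0, C=1, G=2, T=3 (inferred from nparam_CleanSeq) */
    unsigned char site_codes[] = {0, 1, 2, 2};
    size_t length = sizeof site_codes;
    size_t i;

    word_t fwd = 0, rev = 0;
    for (i = 0; i < length; i++)
        fwd = (fwd << 2) | site_codes[i];                  /* direct strand  */
    for (i = length; i-- > 0; )
        rev = (rev << 2) | ((~site_codes[i]) & 3);         /* reverse strand */

    printf("forward word = 0x%llx, reverse-complement word = 0x%llx\n",
           (unsigned long long)fwd, (unsigned long long)rev);  /* 0x1a and 0x5b */
    return 0;
}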
|
||||||
|
|
||||||
|
void getThermoProperties (ppair_t* pairs, size_t count, poptions_t options)
|
||||||
|
{
|
||||||
|
size_t i, j;
|
||||||
|
uint32_t bp1,bp2;
|
||||||
|
word_t w1;
|
||||||
|
word_t w2;
|
||||||
|
bool_t strand;
|
||||||
|
|
||||||
|
char *sq;
|
||||||
|
char prmrd[50];
|
||||||
|
char prmrr[50];
|
||||||
|
double mtemp;
|
||||||
|
|
||||||
|
for (i = 0; i < count; i++)
|
||||||
|
{
|
||||||
|
w1 = pairs[i]->p1->word;
|
||||||
|
w2 = pairs[i]->p2->word;
|
||||||
|
|
||||||
|
if (!pairs[i]->asdirect1)
|
||||||
|
w1=ecoComplementWord(w1,options->primer_length);
|
||||||
|
|
||||||
|
if (!pairs[i]->asdirect2)
|
||||||
|
w2=ecoComplementWord(w2,options->primer_length);
|
||||||
|
|
||||||
|
strncpy(prmrd,ecoUnhashWord(w1, options->primer_length),options->primer_length);
|
||||||
|
strncpy(prmrr,ecoUnhashWord(w2, options->primer_length),options->primer_length);
|
||||||
|
prmrd[options->primer_length]=0;
|
||||||
|
prmrr[options->primer_length]=0;
|
||||||
|
pairs[i]->p1temp = nparam_CalcSelfTM (options->pnparm, prmrd, options->primer_length) - 273.0;
|
||||||
|
pairs[i]->p2temp = nparam_CalcSelfTM (options->pnparm, prmrr, options->primer_length) - 273.0;
|
||||||
|
pairs[i]->p1mintemp = 100;
|
||||||
|
pairs[i]->p2mintemp = 100;
|
||||||
|
|
||||||
|
for (j = 0; j < pairs[i]->pcr.ampcount; j++)
|
||||||
|
if (pairs[i]->pcr.amplifias[j].sequence->isexample)
|
||||||
|
{
|
||||||
|
|
||||||
|
sq = pairs[i]->pcr.amplifias[j].sequence->SQ;
|
||||||
|
strand = pairs[i]->pcr.amplifias[j].strand;
|
||||||
|
bp1 = pairs[i]->pcr.amplifias[j].begin - options->primer_length;
|
||||||
|
bp2 = pairs[i]->pcr.amplifias[j].end + 1;
|
||||||
|
|
||||||
|
if (!strand)
|
||||||
|
{
|
||||||
|
uint32_t tmp;
|
||||||
|
tmp=bp1;
|
||||||
|
bp1=bp2;
|
||||||
|
bp2=tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
// printf("%s : %s, %c",prmrd,
|
||||||
|
// ecoUnhashWord(extractSite(sq,bp1,options->primer_length,strand),options->primer_length),
|
||||||
|
// "rd"[strand]);
|
||||||
|
mtemp = nparam_CalcTwoTM(options->pnparm,
|
||||||
|
prmrd,
|
||||||
|
ecoUnhashWord(extractSite(sq,bp1,options->primer_length,strand),options->primer_length),
|
||||||
|
options->primer_length) - 273.0;
|
||||||
|
// printf(" %4.2f %4.2f\n",pairs[i]->p1temp,mtemp);
|
||||||
|
if (mtemp < pairs[i]->p1mintemp)
|
||||||
|
pairs[i]->p1mintemp = mtemp;
|
||||||
|
|
||||||
|
// printf("%s : %s, %c\n",prmrr,ecoUnhashWord(extractSite(sq,bp2,options->primer_length,!strand),options->primer_length),
|
||||||
|
// "rd"[strand]);
|
||||||
|
//
|
||||||
|
mtemp = nparam_CalcTwoTM(options->pnparm,
|
||||||
|
prmrr,
|
||||||
|
ecoUnhashWord(extractSite(sq,bp2,options->primer_length,!strand),options->primer_length),
|
||||||
|
options->primer_length) - 273.0;
|
||||||
|
if (mtemp < pairs[i]->p2mintemp)
|
||||||
|
pairs[i]->p2mintemp = mtemp;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (w2 < w1)
|
||||||
|
{
|
||||||
|
mtemp = pairs[i]->p1temp;
|
||||||
|
pairs[i]->p1temp = pairs[i]->p2temp;
|
||||||
|
pairs[i]->p2temp = mtemp;
|
||||||
|
|
||||||
|
mtemp = pairs[i]->p1mintemp;
|
||||||
|
pairs[i]->p1mintemp = pairs[i]->p2mintemp;
|
||||||
|
pairs[i]->p2mintemp = mtemp;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
9
src/libthermo/thermostats.h
Normal file
9
src/libthermo/thermostats.h
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
#ifndef THERMOSTATS_H_
|
||||||
|
#define THERMOSTATS_H_
|
||||||
|
|
||||||
|
#include "../libecoprimer/ecoprimer.h"
|
||||||
|
|
||||||
|
void getThermoProperties (ppair_t* pairs, size_t count, poptions_t options);
|
||||||
|
word_t extractSite(char* sequence, size_t begin, size_t length, bool_t strand);
|
||||||
|
|
||||||
|
#endif
|
696
tools/ecoPCRFormat.py
Executable file
696
tools/ecoPCRFormat.py
Executable file
@ -0,0 +1,696 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from __future__ import print_function
|
||||||
|
import re
|
||||||
|
import gzip
|
||||||
|
import struct
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import getopt
|
||||||
|
import zlib
|
||||||
|
|
||||||
|
_dbenable = False
|
||||||
|
|
||||||
|
#####
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# Generic file function
|
||||||
|
#
|
||||||
|
#
|
||||||
|
#####
|
||||||
|
|
||||||
|
def universalOpen(file):
|
||||||
|
if isinstance(file, str):
|
||||||
|
if file.endswith('.gz'):
|
||||||
|
if sys.version_info[0] < 3:
|
||||||
|
rep = gzip.open(file)
|
||||||
|
else:
|
||||||
|
rep = gzip.open(file, 'rt', encoding='latin-1')
|
||||||
|
else:
|
||||||
|
rep = open(file, 'r')
|
||||||
|
else:
|
||||||
|
rep = file
|
||||||
|
return rep
|
||||||
|
|
||||||
|
def universalTell(file):
|
||||||
|
if hasattr(file, 'myfileobj'): # Python 3 gzip
|
||||||
|
return file.myfileobj.tell()
|
||||||
|
elif hasattr(file, 'fileobj'): # Python 2 gzip
|
||||||
|
return file.fileobj.tell()
|
||||||
|
return file.tell()
|
||||||
|
|
||||||
|
def fileSize(file):
|
||||||
|
if hasattr(file, 'myfileobj'): # Python 3 gzip
|
||||||
|
f = file.myfileobj
|
||||||
|
elif hasattr(file, 'fileobj'): # Python 2 gzip
|
||||||
|
f = file.fileobj
|
||||||
|
else:
|
||||||
|
f = file
|
||||||
|
|
||||||
|
if not hasattr(f, 'fileno'):
|
||||||
|
return None
|
||||||
|
|
||||||
|
pos = f.tell()
|
||||||
|
f.seek(0, 2)
|
||||||
|
length = f.tell()
|
||||||
|
f.seek(pos, 0)
|
||||||
|
return length
|
||||||
|
|
||||||
|
def progressBar(pos, max_size, reset=False, delta=[]):
|
||||||
|
if reset:
|
||||||
|
del delta[:]
|
||||||
|
if not delta:
|
||||||
|
delta.append(time.time())
|
||||||
|
delta.append(time.time())
|
||||||
|
|
||||||
|
delta[1] = time.time()
|
||||||
|
elapsed = delta[1] - delta[0]
|
||||||
|
|
||||||
|
if max_size is None:
|
||||||
|
sys.stderr.write('\rRead sequences : %d ' % pos)
|
||||||
|
sys.stderr.flush()
|
||||||
|
return
|
||||||
|
|
||||||
|
percent = float(pos) / max_size * 100 if max_size > 0 else 0
|
||||||
|
|
||||||
|
if percent > 0:
|
||||||
|
remain = elapsed / percent * (100 - percent)
|
||||||
|
remain_str = time.strftime('%H:%M:%S', time.gmtime(int(remain)))
|
||||||
|
else:
|
||||||
|
remain_str = '?'
|
||||||
|
|
||||||
|
bar_length = 50
|
||||||
|
filled = int(percent / 2)
|
||||||
|
bar = '#' * filled
|
||||||
|
bar += '|/-\\'[pos % 4]
|
||||||
|
bar += ' ' * (bar_length - filled - 1)
|
||||||
|
|
||||||
|
sys.stderr.write('\r%5.1f %% |%s] remain : %s' % (percent, bar, remain_str))
|
||||||
|
sys.stderr.flush()
|
||||||
|
|
||||||
|
#####
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# NCBI Dump Taxonomy reader
|
||||||
|
#
|
||||||
|
#
|
||||||
|
#####
|
||||||
|
|
||||||
|
class endLessIterator(object):
|
||||||
|
def __init__(self, endedlist):
|
||||||
|
self.endedlist = endedlist
|
||||||
|
self.index = 0
|
||||||
|
|
||||||
|
def __iter__(self):
|
||||||
|
return self
|
||||||
|
|
||||||
|
def next(self): # Python 2
|
||||||
|
if self.index < len(self.endedlist):
|
||||||
|
item = self.endedlist[self.index]
|
||||||
|
self.index += 1
|
||||||
|
return item
|
||||||
|
else:
|
||||||
|
return self.endedlist[-1]
|
||||||
|
|
||||||
|
__next__ = next # Python 3 compatibility
|
||||||
|
|
||||||
|
class ColumnFile(object):
|
||||||
|
def __init__(self, stream, sep=None, strip=True, types=None):
|
||||||
|
if isinstance(stream, str):
|
||||||
|
self._stream = open(stream)
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
iter(stream)
|
||||||
|
self._stream = stream
|
||||||
|
except TypeError:
|
||||||
|
raise ValueError('stream must be string or an iterator')
|
||||||
|
self._delimiter = sep
|
||||||
|
self._strip = strip
|
||||||
|
self._types = types
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def str2bool(x):
|
||||||
|
val = x.strip()[0].upper()
|
||||||
|
if val in 'T1Y':
|
||||||
|
return True
|
||||||
|
elif val in 'F0N':
|
||||||
|
return False
|
||||||
|
raise ValueError("Invalid boolean value: %s" % x)
|
||||||
|
|
||||||
|
def __iter__(self):
|
||||||
|
return self
|
||||||
|
|
||||||
|
def next(self): # Python 2
|
||||||
|
ligne = next(self._stream)
|
||||||
|
data = ligne.split(self._delimiter)
|
||||||
|
if self._strip:
|
||||||
|
data = [x.strip() for x in data]
|
||||||
|
|
||||||
|
if self._types:
|
||||||
|
typed_data = []
|
||||||
|
for i, val in enumerate(data):
|
||||||
|
if i < len(self._types):
|
||||||
|
t = self._types[i]
|
||||||
|
if t is bool:
|
||||||
|
typed_data.append(self.str2bool(val))
|
||||||
|
else:
|
||||||
|
typed_data.append(t(val))
|
||||||
|
else:
|
||||||
|
typed_data.append(val)
|
||||||
|
return typed_data
|
||||||
|
return data
|
||||||
|
|
||||||
|
__next__ = next # Python 3 compatibility
|
||||||
|
|
||||||
|
def bsearchTaxon(taxonomy, taxid):
|
||||||
|
taxCount = len(taxonomy)
|
||||||
|
begin = 0
|
||||||
|
end = taxCount
|
||||||
|
oldcheck = -1
|
||||||
|
check = int(begin + (end - begin) / 2)
|
||||||
|
|
||||||
|
while check != oldcheck and check < taxCount and taxonomy[check][0] != taxid:
|
||||||
|
if taxonomy[check][0] < taxid:
|
||||||
|
begin = check
|
||||||
|
else:
|
||||||
|
end = check
|
||||||
|
|
||||||
|
oldcheck = check
|
||||||
|
check = int(begin + (end - begin) / 2)
|
||||||
|
|
||||||
|
if check < taxCount and taxonomy[check][0] == taxid:
|
||||||
|
return check
|
||||||
|
return None
|
||||||
|
|
||||||
|
def readNodeTable(file):
|
||||||
|
file = universalOpen(file)
|
||||||
|
nodes = ColumnFile(file,
|
||||||
|
sep='|',
|
||||||
|
types=(int, int, str,
|
||||||
|
str, str, bool,
|
||||||
|
int, bool, int,
|
||||||
|
bool, bool, bool, str))
|
||||||
|
print("Reading taxonomy dump file...", file=sys.stderr)
|
||||||
|
sys.stderr.flush()
|
||||||
|
|
||||||
|
taxonomy = []
|
||||||
|
for n in nodes:
|
||||||
|
taxonomy.append([n[0], n[2], n[1]])
|
||||||
|
|
||||||
|
print("List all taxonomy rank...", file=sys.stderr)
|
||||||
|
sys.stderr.flush()
|
||||||
|
|
||||||
|
ranks = list(set(x[1] for x in taxonomy))
|
||||||
|
ranks.sort()
|
||||||
|
rank_dict = {rank: idx for idx, rank in enumerate(ranks)}
|
||||||
|
|
||||||
|
print("Sorting taxons...", file=sys.stderr)
|
||||||
|
sys.stderr.flush()
|
||||||
|
taxonomy.sort(key=lambda x: x[0])
|
||||||
|
|
||||||
|
print("Indexing taxonomy...", file=sys.stderr)
|
||||||
|
sys.stderr.flush()
|
||||||
|
|
||||||
|
index = {}
|
||||||
|
for i, t in enumerate(taxonomy):
|
||||||
|
index[t[0]] = i
|
||||||
|
|
||||||
|
print("Indexing parent and rank...", file=sys.stderr)
|
||||||
|
sys.stderr.flush()
|
||||||
|
|
||||||
|
for t in taxonomy:
|
||||||
|
t[1] = rank_dict[t[1]]
|
||||||
|
t[2] = index.get(t[2], None)
|
||||||
|
|
||||||
|
return taxonomy, rank_dict, index
|
||||||
|
|
||||||
|
def nameIterator(file):
|
||||||
|
file = universalOpen(file)
|
||||||
|
names = ColumnFile(file,
|
||||||
|
sep='|',
|
||||||
|
types=(int, str, str, str))
|
||||||
|
for row in names:
|
||||||
|
if len(row) >= 4:
|
||||||
|
taxid, name, unique, classname = row[:4]
|
||||||
|
yield taxid, name, classname
|
||||||
|
|
||||||
|
def mergedNodeIterator(file):
|
||||||
|
file = universalOpen(file)
|
||||||
|
merged = ColumnFile(file,
|
||||||
|
sep='|',
|
||||||
|
types=(int, int, str))
|
||||||
|
for row in merged:
|
||||||
|
if len(row) >= 2:
|
||||||
|
taxid, current = row[:2]
|
||||||
|
yield taxid, current
|
||||||
|
|
||||||
|
def deletedNodeIterator(file):
|
||||||
|
file = universalOpen(file)
|
||||||
|
deleted = ColumnFile(file,
|
||||||
|
sep='|',
|
||||||
|
types=(int, str))
|
||||||
|
for row in deleted:
|
||||||
|
if row:
|
||||||
|
taxid = row[0]
|
||||||
|
yield taxid
|
||||||
|
|
||||||
|
def readTaxonomyDump(taxdir):
|
||||||
|
taxonomy, ranks, index = readNodeTable(taxdir + '/nodes.dmp')
|
||||||
|
|
||||||
|
# Initialize scientific name as empty string for each taxon
|
||||||
|
for tx in taxonomy:
|
||||||
|
tx.append("") # Add fourth element: scientific name (empty at start)
|
||||||
|
|
||||||
|
print("Adding scientific name...", file=sys.stderr)
|
||||||
|
sys.stderr.flush()
|
||||||
|
|
||||||
|
alternativeName = []
|
||||||
|
for taxid, name, classname in nameIterator(taxdir + '/names.dmp'):
|
||||||
|
idx = index.get(taxid, None)
|
||||||
|
if idx is not None:
|
||||||
|
# Clean and ensure ASCII only
|
||||||
|
name_clean = ''.join(c for c in name if ord(c) < 128)
|
||||||
|
alternativeName.append((name_clean, classname, idx))
|
||||||
|
if classname == 'scientific name':
|
||||||
|
taxonomy[idx][3] = name_clean
|
||||||
|
|
||||||
|
print("Adding taxid alias...", file=sys.stderr)
|
||||||
|
sys.stderr.flush()
|
||||||
|
for taxid, current in mergedNodeIterator(taxdir + '/merged.dmp'):
|
||||||
|
if current in index:
|
||||||
|
index[taxid] = index[current]
|
||||||
|
|
||||||
|
print("Adding deleted taxid...", file=sys.stderr)
|
||||||
|
sys.stderr.flush()
|
||||||
|
for taxid in deletedNodeIterator(taxdir + '/delnodes.dmp'):
|
||||||
|
index[taxid] = None
|
||||||
|
|
||||||
|
return taxonomy, ranks, alternativeName, index
|
||||||
|
|
||||||
|
#####
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# Genbank/EMBL sequence reader
|
||||||
|
#
|
||||||
|
#
|
||||||
|
#####
|
||||||
|
|
||||||
|
def entryIterator(file):
|
||||||
|
file = universalOpen(file)
|
||||||
|
rep = []
|
||||||
|
ligne = file.readline()
|
||||||
|
while ligne:
|
||||||
|
rep.append(ligne)
|
||||||
|
if ligne == '//\n':
|
||||||
|
yield ''.join(rep)
|
||||||
|
rep = []
|
||||||
|
ligne = file.readline()
|
||||||
|
if rep:
|
||||||
|
yield ''.join(rep)
|
||||||
|
|
||||||
|
def fastaEntryIterator(file):
|
||||||
|
file = universalOpen(file)
|
||||||
|
rep = []
|
||||||
|
ligne = file.readline()
|
||||||
|
while ligne:
|
||||||
|
if ligne.startswith('>') and rep:
|
||||||
|
yield ''.join(rep)
|
||||||
|
rep = []
|
||||||
|
rep.append(ligne)
|
||||||
|
ligne = file.readline()
|
||||||
|
if rep:
|
||||||
|
yield ''.join(rep)
|
||||||
|
|
||||||
|
_cleanSeq = re.compile(r'[\s\d]') # More efficient
|
||||||
|
|
||||||
|
def cleanSeq(seq):
|
||||||
|
return _cleanSeq.sub('', seq)
|
||||||
|
|
||||||
|
# GenBank patterns
|
||||||
|
_gbParseID = re.compile(r'(?<=^LOCUS {7})[^ ]+(?= )', re.MULTILINE)
|
||||||
|
_gbParseDE = re.compile(r'(?<=^DEFINITION {2}).+?\. *$', re.MULTILINE | re.DOTALL)
|
||||||
|
_gbParseSQ = re.compile(r'(?<=^ORIGIN).+?(?=^//$)', re.MULTILINE | re.DOTALL)
|
||||||
|
_gbParseTX = re.compile(r'(?<= /db_xref="taxon:)[0-9]+(?=")')
|
||||||
|
|
||||||
|
def genbankEntryParser(entry):
|
||||||
|
Id_match = _gbParseID.findall(entry)
|
||||||
|
Id = Id_match[0] if Id_match else "UNKNOWN_ID"
|
||||||
|
|
||||||
|
De_match = _gbParseDE.findall(entry)
|
||||||
|
De = ' '.join(De_match[0].split()) if De_match else ""
|
||||||
|
|
||||||
|
Sq_match = _gbParseSQ.findall(entry)
|
||||||
|
Sq = cleanSeq(Sq_match[0].upper()) if Sq_match else ""
|
||||||
|
|
||||||
|
Tx_match = _gbParseTX.findall(entry)
|
||||||
|
Tx = int(Tx_match[0]) if Tx_match else None
|
||||||
|
|
||||||
|
# Clean and ensure ASCII only
|
||||||
|
Id = ''.join(c for c in Id if ord(c) < 128)
|
||||||
|
De = ''.join(c for c in De if ord(c) < 128)
|
||||||
|
|
||||||
|
return {'id': Id, 'taxid': Tx, 'definition': De, 'sequence': Sq}
|
||||||
|
|
||||||
|
# EMBL patterns
|
||||||
|
_emblParseID = re.compile(r'(?<=^ID {3})[^ ]+(?=;)', re.MULTILINE)
|
||||||
|
_emblParseDE = re.compile(r'(?<=^DE {3}).+?\. *$', re.MULTILINE | re.DOTALL)
|
||||||
|
_emblParseSQ = re.compile(r'(?<=^ ).+?(?=^//$)', re.MULTILINE | re.DOTALL)
|
||||||
|
_emblParseTX = re.compile(r'(?<= /db_xref="taxon:)[0-9]+(?=")')
|
||||||
|
|
||||||
|
def emblEntryParser(entry):
|
||||||
|
Id_match = _emblParseID.findall(entry)
|
||||||
|
Id = Id_match[0] if Id_match else "UNKNOWN_ID"
|
||||||
|
|
||||||
|
De_match = _emblParseDE.findall(entry)
|
||||||
|
De = ' '.join(De_match[0].replace('\nDE ', ' ').split()) if De_match else ""
|
||||||
|
|
||||||
|
Sq_match = _emblParseSQ.findall(entry)
|
||||||
|
Sq = cleanSeq(Sq_match[0].upper()) if Sq_match else ""
|
||||||
|
|
||||||
|
Tx_match = _emblParseTX.findall(entry)
|
||||||
|
Tx = int(Tx_match[0]) if Tx_match else None
|
||||||
|
|
||||||
|
# Clean and ensure ASCII only
|
||||||
|
Id = ''.join(c for c in Id if ord(c) < 128)
|
||||||
|
De = ''.join(c for c in De if ord(c) < 128)
|
||||||
|
|
||||||
|
return {'id': Id, 'taxid': Tx, 'definition': De, 'sequence': Sq}
|
||||||
|
|
||||||
|
# FASTA processing
|
||||||
|
_fastaSplit = re.compile(r';\W*')
|
||||||
|
|
||||||
|
def parseFasta(seq):
|
||||||
|
lines = seq.split('\n')
|
||||||
|
if not lines:
|
||||||
|
return "", "", "", {}
|
||||||
|
|
||||||
|
header = lines[0].strip()
|
||||||
|
if header.startswith('>'):
|
||||||
|
header = header[1:]
|
||||||
|
|
||||||
|
parts = header.split(None, 1)
|
||||||
|
seq_id = parts[0] if parts else ""
|
||||||
|
description = parts[1] if len(parts) > 1 else ""
|
||||||
|
|
||||||
|
fields = _fastaSplit.split(description) if description else []
|
||||||
|
info = {}
|
||||||
|
other_desc = []
|
||||||
|
for field in fields:
|
||||||
|
if '=' in field:
|
||||||
|
key, val = field.split('=', 1)
|
||||||
|
info[key.strip()] = val.strip()
|
||||||
|
else:
|
||||||
|
other_desc.append(field)
|
||||||
|
|
||||||
|
definition = ' '.join(other_desc)
|
||||||
|
sequence = ''.join(x.strip() for x in lines[1:]).upper()
|
||||||
|
|
||||||
|
# Clean and ensure ASCII only
|
||||||
|
seq_id = ''.join(c for c in seq_id if ord(c) < 128)
|
||||||
|
definition = ''.join(c for c in definition if ord(c) < 128)
|
||||||
|
|
||||||
|
return seq_id, sequence, definition, info
|
||||||
|
|
||||||
|
def fastaEntryParser(entry):
|
||||||
|
seq_id, sequence, definition, info = parseFasta(entry)
|
||||||
|
Tx = info.get('taxid', None)
|
||||||
|
if Tx is not None:
|
||||||
|
try:
|
||||||
|
Tx = int(Tx)
|
||||||
|
except ValueError:
|
||||||
|
Tx = None
|
||||||
|
return {'id': seq_id, 'taxid': Tx, 'definition': definition, 'sequence': sequence}
|
||||||
|
|
||||||
|
def sequenceIteratorFactory(entryParser, entryIterator):
|
||||||
|
def sequenceIterator(file):
|
||||||
|
for entry in entryIterator(file):
|
||||||
|
yield entryParser(entry)
|
||||||
|
return sequenceIterator
|
||||||
|
|
||||||
|
#####
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# Binary writer (CORRECTED VERSION)
|
||||||
|
#
|
||||||
|
#
|
||||||
|
#####
|
||||||
|
|
||||||
|
def ecoSeqPacker(sq):
    # Ensure sequence is ASCII uppercase
    sequence = ''.join(c for c in sq['sequence'] if ord(c) < 128).upper()
    seq_bytes = sequence.encode('ascii')
    compactseq = zlib.compress(seq_bytes, 9)
    cptseqlength = len(compactseq)

    # Clean definition to pure ASCII
    definition = ''.join(c for c in sq['definition'] if ord(c) < 128)
    de_bytes = definition.encode('ascii')
    delength = len(de_bytes)

    # Clean ID to pure ASCII and pad to 20 bytes
    seq_id = ''.join(c for c in sq['id'] if ord(c) < 128)
    seq_id_bytes = seq_id.encode('ascii')
    if len(seq_id_bytes) > 20:
        seq_id_bytes = seq_id_bytes[:20]
    else:
        seq_id_bytes = seq_id_bytes.ljust(20, b'\0')

    # Calculate total size (4 bytes for each integer + data lengths)
    totalSize = 4 + 20 + 4 + 4 + 4 + delength + cptseqlength

    # Pack with correct byte order and field order
    packed = struct.pack('>I', totalSize)
    packed += struct.pack('>I', sq['taxid'])
    packed += seq_id_bytes
    packed += struct.pack('>I', delength)
    packed += struct.pack('>I', len(sequence))
    packed += struct.pack('>I', cptseqlength)
    packed += de_bytes
    packed += compactseq

    return packed

def ecoTaxPacker(tx):
    # Clean name to pure ASCII
    name_clean = ''.join(c for c in tx[3] if ord(c) < 128)
    name_bytes = name_clean.encode('ascii')
    namelength = len(name_bytes)
    totalSize = 4 + 4 + 4 + 4 + namelength

    packed = struct.pack('>I', totalSize)
    packed += struct.pack('>I', tx[0])  # taxid
    packed += struct.pack('>I', tx[1])  # rank index
    packed += struct.pack('>I', tx[2])  # parent index
    packed += struct.pack('>I', namelength)
    packed += name_bytes

    return packed

def ecoRankPacker(rank):
    # Clean rank to pure ASCII
    rank_clean = ''.join(c for c in rank if ord(c) < 128)
    rank_bytes = rank_clean.encode('ascii')
    namelength = len(rank_bytes)
    packed = struct.pack('>I', namelength)
    packed += rank_bytes
    return packed

def ecoNamePacker(name):
    # Clean name and class to pure ASCII
    name_clean = ''.join(c for c in name[0] if ord(c) < 128)
    class_clean = ''.join(c for c in name[1] if ord(c) < 128)

    name_bytes = name_clean.encode('ascii')
    class_bytes = class_clean.encode('ascii')
    namelength = len(name_bytes)
    classlength = len(class_bytes)
    totalSize = 4 + 4 + 4 + 4 + namelength + classlength

    packed = struct.pack('>I', totalSize)
    packed += struct.pack('>I', 1 if name[1] == 'scientific name' else 0)
    packed += struct.pack('>I', namelength)
    packed += struct.pack('>I', classlength)
    packed += struct.pack('>I', name[2])  # taxon index
    packed += name_bytes
    packed += class_bytes

    return packed

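# Note: by the time ecoSeqPacker() is called from ecoSeqWriter() below, the
# record's 'taxid' has already been replaced by the value looked up in
# taxindex (presumably the taxon's position in the .tdx table, given how the
# other packers store rank and parent indices), so the packed field refers to
# the taxonomy file rather than holding the raw NCBI taxid.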
def ecoSeqWriter(file, input_file, taxindex, parser):
    output = open(file, 'wb')
    input_stream = universalOpen(input_file)
    inputsize = fileSize(input_stream)
    entries = parser(input_stream)
    seqcount = 0
    skipped = []

    # Write placeholder for sequence count
    output.write(struct.pack('>I', 0))

    progressBar(0, inputsize, reset=True)

    for entry in entries:
        if entry['taxid'] is not None:
            try:
                # Convert to integer taxid
                taxid_int = int(entry['taxid'])
                # Lookup in taxindex
                entry['taxid'] = taxindex.get(taxid_int, None)
            except (ValueError, TypeError):
                entry['taxid'] = None

        if entry['taxid'] is not None:
            seqcount += 1
            packed = ecoSeqPacker(entry)
            output.write(packed)
        else:
            skipped.append(entry['id'])

        # Update progress
        if inputsize is not None:
            where = universalTell(input_stream)
            progressBar(where, inputsize)
        else:
            progressBar(seqcount, seqcount + 1)  # Fake progress

        sys.stderr.write(" Read sequences : %d \r" % seqcount)
        sys.stderr.flush()

    print("\n", file=sys.stderr)
    # Update sequence count at beginning of file
    output.seek(0)
    output.write(struct.pack('>I', seqcount))
    output.close()

    return skipped

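# Sanity-check sketch (illustrative only, not part of this script): an .sdx
# file written above can be walked back with the same layout, e.g.
#   with open('ecodb_001.sdx', 'rb') as f:           # hypothetical file name
#       count = struct.unpack('>I', f.read(4))[0]
#       for _ in range(count):
#           totalSize = struct.unpack('>I', f.read(4))[0]
#           record = f.read(totalSize)               # taxid, id, lengths, data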
def ecoTaxWriter(file, taxonomy):
    output = open(file, 'wb')
    output.write(struct.pack('>I', len(taxonomy)))

    for tx in taxonomy:
        output.write(ecoTaxPacker(tx))

    output.close()

def ecoRankWriter(file, ranks):
    output = open(file, 'wb')
    rank_list = sorted(ranks.keys())
    output.write(struct.pack('>I', len(rank_list)))

    for rank in rank_list:
        output.write(ecoRankPacker(rank))

    output.close()

def ecoNameWriter(file, names):
    output = open(file, 'wb')
    output.write(struct.pack('>I', len(names)))

    # Sort case-insensitive
    names.sort(key=lambda x: x[0].lower())

    for name in names:
        output.write(ecoNamePacker(name))

    output.close()

def ecoDBWriter(prefix, taxonomy, seqFileNames, parser):
    ecoRankWriter(prefix + '.rdx', taxonomy[1])
    ecoTaxWriter(prefix + '.tdx', taxonomy[0])
    ecoNameWriter(prefix + '.ndx', taxonomy[2])

    filecount = 0
    for filename in seqFileNames:
        filecount += 1
        outfile = "%s_%03d.sdx" % (prefix, filecount)
        print("\nProcessing:", filename, file=sys.stderr)
        skipped = ecoSeqWriter(outfile, filename, taxonomy[3], parser)

        if skipped:
            print("\nSkipped entries without valid taxonomy:", file=sys.stderr)
            for sid in skipped:
                print(sid, file=sys.stderr)
        else:
            print("All entries processed successfully", file=sys.stderr)

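# Files produced for a prefix such as 'ecodb' (the default): ecodb.rdx (ranks),
# ecodb.tdx (taxa), ecodb.ndx (names), plus one ecodb_001.sdx, ecodb_002.sdx,
# ... per input sequence file.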
def ecoParseOptions(arguments):
    opt = {
        'prefix': 'ecodb',
        'taxdir': 'taxdump',
        'parser': sequenceIteratorFactory(genbankEntryParser, entryIterator)
    }

    try:
        o, filenames = getopt.getopt(arguments,
                                     'ht:T:n:gfe',
                                     ['help',
                                      'taxonomy=',
                                      'taxonomy_db=',
                                      'name=',
                                      'genbank',
                                      'fasta',
                                      'embl'])
    except getopt.GetoptError as err:
        print(str(err), file=sys.stderr)
        printHelp()
        sys.exit(2)

    for name, value in o:
        if name in ('-h', '--help'):
            printHelp()
            sys.exit()
        elif name in ('-t', '--taxonomy'):
            opt['taxdir'] = value
        elif name in ('-n', '--name'):
            opt['prefix'] = value
        elif name in ('-g', '--genbank'):
            opt['parser'] = sequenceIteratorFactory(genbankEntryParser, entryIterator)
        elif name in ('-f', '--fasta'):
            opt['parser'] = sequenceIteratorFactory(fastaEntryParser, fastaEntryIterator)
        elif name in ('-e', '--embl'):
            opt['parser'] = sequenceIteratorFactory(emblEntryParser, entryIterator)
        else:
            raise ValueError('Unknown option: %s' % name)

    return opt, filenames

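# Note: '-T' / '--taxonomy_db' are declared in the getopt spec of
# ecoParseOptions() above but have no branch in the option loop, so passing
# them currently ends in the ValueError fallback.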
def printHelp():
    print("-----------------------------------")
    print(" ecoPCRFormat.py")
    print("-----------------------------------")
    print("Converts sequence databases to ecoPCR format")
    print("Usage: ecoPCRFormat.py [options] <files>")
    print("Options:")
    print(" -e, --embl               Input in EMBL format")
    print(" -f, --fasta              Input in FASTA format")
    print(" -g, --genbank            Input in GenBank format (default)")
    print(" -h, --help               Show this help message")
    print(" -n NAME, --name=NAME     Database prefix (default: ecodb)")
    print(" -t DIR, --taxonomy=DIR   Taxonomy directory (default: taxdump)")
    print("-----------------------------------")

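# Example invocation (illustrative file names): convert a FASTA file using the
# NCBI taxdump in ./taxdump and write a database with prefix 'mydb':
#   python ecoPCRFormat.py -f -t ./taxdump -n mydb sequences.fasta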
if __name__ == '__main__':
    try:
        opt, filenames = ecoParseOptions(sys.argv[1:])

        if not filenames:
            print("Error: No input files specified", file=sys.stderr)
            printHelp()
            sys.exit(1)

        print("Loading taxonomy...", file=sys.stderr)
        taxonomy = readTaxonomyDump(opt['taxdir'])

        print("Processing sequence files...", file=sys.stderr)
        ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser'])

        print("Database creation complete", file=sys.stderr)

    except Exception as e:
        print("Error: " + str(e), file=sys.stderr)
        import traceback
        traceback.print_exc()
        sys.exit(1)