Backup with sets and ahocorasick

git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/branches/ecoPrimers-2.1@298 60f365c0-8329-0410-b2a4-ec073aeeaa1d
This commit is contained in:
2011-06-01 22:42:30 +00:00
commit e2ebc357ee
64 changed files with 13604 additions and 0 deletions

221
.cproject Normal file
View File

@ -0,0 +1,221 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?>
<cproject>
<storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="cdt.managedbuild.toolchain.gnu.macosx.base.2134184396">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.toolchain.gnu.macosx.base.2134184396" moduleId="org.eclipse.cdt.core.settings" name="MacOSX GCC">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.MachO" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="ecoPrimers" buildProperties="" description="" id="cdt.managedbuild.toolchain.gnu.macosx.base.2134184396" name="MacOSX GCC" parent="org.eclipse.cdt.build.core.emptycfg">
<folderInfo id="cdt.managedbuild.toolchain.gnu.macosx.base.2134184396.1840911077" name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.macosx.base.766054112" name="cdt.managedbuild.toolchain.gnu.macosx.base" superClass="cdt.managedbuild.toolchain.gnu.macosx.base">
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.MachO" id="cdt.managedbuild.target.gnu.platform.macosx.base.2057035265" name="Debug Platform" osList="macosx" superClass="cdt.managedbuild.target.gnu.platform.macosx.base"/>
<builder id="cdt.managedbuild.target.gnu.builder.macosx.base.783726363" keepEnvironmentInBuildfile="false" managedBuildOn="false" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.base"/>
<tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.base.914103467" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.base">
<inputType id="cdt.managedbuild.tool.macosx.c.linker.input.62980206" superClass="cdt.managedbuild.tool.macosx.c.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.base.691108439" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.base"/>
<tool id="cdt.managedbuild.tool.gnu.assembler.macosx.base.695639877" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.macosx.base">
<option id="gnu.both.asm.option.include.paths.1544375094" name="Include paths (-I)" superClass="gnu.both.asm.option.include.paths" valueType="includePath"/>
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.1507665054" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.archiver.macosx.base.1786370580" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.macosx.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.macosx.base.454329831" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.macosx.base"/>
<tool id="cdt.managedbuild.tool.gnu.c.compiler.macosx.base.1928774909" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.macosx.base">
<option id="gnu.c.compiler.option.include.paths.823251305" superClass="gnu.c.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="/usr/include"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.330854350" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
<storageModule moduleId="org.eclipse.cdt.core.language.mapping"/>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile"/>
<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile">
<buildOutputProvider>
<openAction enabled="true" filePath=""/>
<parser enabled="true"/>
</buildOutputProvider>
<scannerInfoProvider id="specsFile">
<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
<parser enabled="true"/>
</scannerInfoProvider>
</profile>
<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerFileProfile">
<buildOutputProvider>
<openAction enabled="true" filePath=""/>
<parser enabled="true"/>
</buildOutputProvider>
<scannerInfoProvider id="makefileGenerator">
<runAction arguments="-f ${project_name}_scd.mk" command="make" useDefault="true"/>
<parser enabled="true"/>
</scannerInfoProvider>
</profile>
<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfile">
<buildOutputProvider>
<openAction enabled="true" filePath=""/>
<parser enabled="true"/>
</buildOutputProvider>
<scannerInfoProvider id="specsFile">
<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
<parser enabled="true"/>
</scannerInfoProvider>
</profile>
<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP">
<buildOutputProvider>
<openAction enabled="true" filePath=""/>
<parser enabled="true"/>
</buildOutputProvider>
<scannerInfoProvider id="specsFile">
<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
<parser enabled="true"/>
</scannerInfoProvider>
</profile>
<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC">
<buildOutputProvider>
<openAction enabled="true" filePath=""/>
<parser enabled="true"/>
</buildOutputProvider>
<scannerInfoProvider id="specsFile">
<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
<parser enabled="true"/>
</scannerInfoProvider>
</profile>
<profile id="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfile">
<buildOutputProvider>
<openAction enabled="true" filePath=""/>
<parser enabled="true"/>
</buildOutputProvider>
<scannerInfoProvider id="specsFile">
<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
<parser enabled="true"/>
</scannerInfoProvider>
</profile>
<profile id="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfileCPP">
<buildOutputProvider>
<openAction enabled="true" filePath=""/>
<parser enabled="true"/>
</buildOutputProvider>
<scannerInfoProvider id="specsFile">
<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
<parser enabled="true"/>
</scannerInfoProvider>
</profile>
<profile id="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfileC">
<buildOutputProvider>
<openAction enabled="true" filePath=""/>
<parser enabled="true"/>
</buildOutputProvider>
<scannerInfoProvider id="specsFile">
<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
<parser enabled="true"/>
</scannerInfoProvider>
</profile>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.macosx.base.2134184396;cdt.managedbuild.toolchain.gnu.macosx.base.2134184396.1840911077;cdt.managedbuild.tool.gnu.c.compiler.macosx.base.1928774909;cdt.managedbuild.tool.gnu.c.compiler.input.330854350">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile">
<buildOutputProvider>
<openAction enabled="true" filePath=""/>
<parser enabled="true"/>
</buildOutputProvider>
<scannerInfoProvider id="specsFile">
<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
<parser enabled="true"/>
</scannerInfoProvider>
</profile>
<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerFileProfile">
<buildOutputProvider>
<openAction enabled="true" filePath=""/>
<parser enabled="true"/>
</buildOutputProvider>
<scannerInfoProvider id="makefileGenerator">
<runAction arguments="-f ${project_name}_scd.mk" command="make" useDefault="true"/>
<parser enabled="true"/>
</scannerInfoProvider>
</profile>
<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfile">
<buildOutputProvider>
<openAction enabled="true" filePath=""/>
<parser enabled="true"/>
</buildOutputProvider>
<scannerInfoProvider id="specsFile">
<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
<parser enabled="true"/>
</scannerInfoProvider>
</profile>
<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP">
<buildOutputProvider>
<openAction enabled="true" filePath=""/>
<parser enabled="true"/>
</buildOutputProvider>
<scannerInfoProvider id="specsFile">
<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
<parser enabled="true"/>
</scannerInfoProvider>
</profile>
<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC">
<buildOutputProvider>
<openAction enabled="true" filePath=""/>
<parser enabled="true"/>
</buildOutputProvider>
<scannerInfoProvider id="specsFile">
<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
<parser enabled="true"/>
</scannerInfoProvider>
</profile>
<profile id="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfile">
<buildOutputProvider>
<openAction enabled="true" filePath=""/>
<parser enabled="true"/>
</buildOutputProvider>
<scannerInfoProvider id="specsFile">
<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
<parser enabled="true"/>
</scannerInfoProvider>
</profile>
<profile id="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfileCPP">
<buildOutputProvider>
<openAction enabled="true" filePath=""/>
<parser enabled="true"/>
</buildOutputProvider>
<scannerInfoProvider id="specsFile">
<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
<parser enabled="true"/>
</scannerInfoProvider>
</profile>
<profile id="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfileC">
<buildOutputProvider>
<openAction enabled="true" filePath=""/>
<parser enabled="true"/>
</buildOutputProvider>
<scannerInfoProvider id="specsFile">
<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
<parser enabled="true"/>
</scannerInfoProvider>
</profile>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<project id="ecoPrimers.null.1292969001" name="ecoPrimers"/>
</storageModule>
</cproject>

83
.project Normal file
View File

@ -0,0 +1,83 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>ecoPrimers</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.python.pydev.PyDevBuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
<triggers>clean,full,incremental,</triggers>
<arguments>
<dictionary>
<key>?name?</key>
<value></value>
</dictionary>
<dictionary>
<key>org.eclipse.cdt.make.core.append_environment</key>
<value>true</value>
</dictionary>
<dictionary>
<key>org.eclipse.cdt.make.core.autoBuildTarget</key>
<value>all</value>
</dictionary>
<dictionary>
<key>org.eclipse.cdt.make.core.buildArguments</key>
<value></value>
</dictionary>
<dictionary>
<key>org.eclipse.cdt.make.core.buildCommand</key>
<value>make</value>
</dictionary>
<dictionary>
<key>org.eclipse.cdt.make.core.cleanBuildTarget</key>
<value>clean</value>
</dictionary>
<dictionary>
<key>org.eclipse.cdt.make.core.contents</key>
<value>org.eclipse.cdt.make.core.activeConfigSettings</value>
</dictionary>
<dictionary>
<key>org.eclipse.cdt.make.core.enableAutoBuild</key>
<value>false</value>
</dictionary>
<dictionary>
<key>org.eclipse.cdt.make.core.enableCleanBuild</key>
<value>true</value>
</dictionary>
<dictionary>
<key>org.eclipse.cdt.make.core.enableFullBuild</key>
<value>true</value>
</dictionary>
<dictionary>
<key>org.eclipse.cdt.make.core.fullBuildTarget</key>
<value>all</value>
</dictionary>
<dictionary>
<key>org.eclipse.cdt.make.core.stopOnError</key>
<value>true</value>
</dictionary>
<dictionary>
<key>org.eclipse.cdt.make.core.useDefaultBuildCmd</key>
<value>true</value>
</dictionary>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.cdt.core.cnature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
<nature>org.python.pydev.pythonNature</nature>
</natures>
</projectDescription>

7
.pydevproject Normal file
View File

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?eclipse-pydev version="1.0"?>
<pydev_project>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Python 2.6</pydev_property>
<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.6</pydev_property>
</pydev_project>

506
Licence_CeCILL_V2-en.txt Normal file
View File

@ -0,0 +1,506 @@
CeCILL FREE SOFTWARE LICENSE AGREEMENT
Notice
This Agreement is a Free Software license agreement that is the result
of discussions between its authors in order to ensure compliance with
the two main principles guiding its drafting:
* firstly, compliance with the principles governing the distribution
of Free Software: access to source code, broad rights granted to
users,
* secondly, the election of a governing law, French law, with which
it is conformant, both as regards the law of torts and
intellectual property law, and the protection that it offers to
both authors and holders of the economic rights over software.
The authors of the CeCILL (for Ce[a] C[nrs] I[nria] L[ogiciel] L[ibre])
license are:
Commissariat <20> l'Energie Atomique - CEA, a public scientific, technical
and industrial research establishment, having its principal place of
business at 25 rue Leblanc, immeuble Le Ponant D, 75015 Paris, France.
Centre National de la Recherche Scientifique - CNRS, a public scientific
and technological establishment, having its principal place of business
at 3 rue Michel-Ange, 75794 Paris cedex 16, France.
Institut National de Recherche en Informatique et en Automatique -
INRIA, a public scientific and technological establishment, having its
principal place of business at Domaine de Voluceau, Rocquencourt, BP
105, 78153 Le Chesnay cedex, France.
Preamble
The purpose of this Free Software license agreement is to grant users
the right to modify and redistribute the software governed by this
license within the framework of an open source distribution model.
The exercising of these rights is conditional upon certain obligations
for users so as to preserve this status for all subsequent redistributions.
In consideration of access to the source code and the rights to copy,
modify and redistribute granted by the license, users are provided only
with a limited warranty and the software's author, the holder of the
economic rights, and the successive licensors only have limited liability.
In this respect, the risks associated with loading, using, modifying
and/or developing or reproducing the software by the user are brought to
the user's attention, given its Free Software status, which may make it
complicated to use, with the result that its use is reserved for
developers and experienced professionals having in-depth computer
knowledge. Users are therefore encouraged to load and test the
suitability of the software as regards their requirements in conditions
enabling the security of their systems and/or data to be ensured and,
more generally, to use and operate it in the same conditions of
security. This Agreement may be freely reproduced and published,
provided it is not altered, and that no provisions are either added or
removed herefrom.
This Agreement may apply to any or all software for which the holder of
the economic rights decides to submit the use thereof to its provisions.
Article 1 - DEFINITIONS
For the purpose of this Agreement, when the following expressions
commence with a capital letter, they shall have the following meaning:
Agreement: means this license agreement, and its possible subsequent
versions and annexes.
Software: means the software in its Object Code and/or Source Code form
and, where applicable, its documentation, "as is" when the Licensee
accepts the Agreement.
Initial Software: means the Software in its Source Code and possibly its
Object Code form and, where applicable, its documentation, "as is" when
it is first distributed under the terms and conditions of the Agreement.
Modified Software: means the Software modified by at least one
Contribution.
Source Code: means all the Software's instructions and program lines to
which access is required so as to modify the Software.
Object Code: means the binary files originating from the compilation of
the Source Code.
Holder: means the holder(s) of the economic rights over the Initial
Software.
Licensee: means the Software user(s) having accepted the Agreement.
Contributor: means a Licensee having made at least one Contribution.
Licensor: means the Holder, or any other individual or legal entity, who
distributes the Software under the Agreement.
Contribution: means any or all modifications, corrections, translations,
adaptations and/or new functions integrated into the Software by any or
all Contributors, as well as any or all Internal Modules.
Module: means a set of sources files including their documentation that
enables supplementary functions or services in addition to those offered
by the Software.
External Module: means any or all Modules, not derived from the
Software, so that this Module and the Software run in separate address
spaces, with one calling the other when they are run.
Internal Module: means any or all Module, connected to the Software so
that they both execute in the same address space.
GNU GPL: means the GNU General Public License version 2 or any
subsequent version, as published by the Free Software Foundation Inc.
Parties: mean both the Licensee and the Licensor.
These expressions may be used both in singular and plural form.
Article 2 - PURPOSE
The purpose of the Agreement is the grant by the Licensor to the
Licensee of a non-exclusive, transferable and worldwide license for the
Software as set forth in Article 5 hereinafter for the whole term of the
protection granted by the rights over said Software.
Article 3 - ACCEPTANCE
3.1 The Licensee shall be deemed as having accepted the terms and
conditions of this Agreement upon the occurrence of the first of the
following events:
* (i) loading the Software by any or all means, notably, by
downloading from a remote server, or by loading from a physical
medium;
* (ii) the first time the Licensee exercises any of the rights
granted hereunder.
3.2 One copy of the Agreement, containing a notice relating to the
characteristics of the Software, to the limited warranty, and to the
fact that its use is restricted to experienced users has been provided
to the Licensee prior to its acceptance as set forth in Article 3.1
hereinabove, and the Licensee hereby acknowledges that it has read and
understood it.
Article 4 - EFFECTIVE DATE AND TERM
4.1 EFFECTIVE DATE
The Agreement shall become effective on the date when it is accepted by
the Licensee as set forth in Article 3.1.
4.2 TERM
The Agreement shall remain in force for the entire legal term of
protection of the economic rights over the Software.
Article 5 - SCOPE OF RIGHTS GRANTED
The Licensor hereby grants to the Licensee, who accepts, the following
rights over the Software for any or all use, and for the term of the
Agreement, on the basis of the terms and conditions set forth hereinafter.
Besides, if the Licensor owns or comes to own one or more patents
protecting all or part of the functions of the Software or of its
components, the Licensor undertakes not to enforce the rights granted by
these patents against successive Licensees using, exploiting or
modifying the Software. If these patents are transferred, the Licensor
undertakes to have the transferees subscribe to the obligations set
forth in this paragraph.
5.1 RIGHT OF USE
The Licensee is authorized to use the Software, without any limitation
as to its fields of application, with it being hereinafter specified
that this comprises:
1. permanent or temporary reproduction of all or part of the Software
by any or all means and in any or all form.
2. loading, displaying, running, or storing the Software on any or
all medium.
3. entitlement to observe, study or test its operation so as to
determine the ideas and principles behind any or all constituent
elements of said Software. This shall apply when the Licensee
carries out any or all loading, displaying, running, transmission
or storage operation as regards the Software, that it is entitled
to carry out hereunder.
5.2 ENTITLEMENT TO MAKE CONTRIBUTIONS
The right to make Contributions includes the right to translate, adapt,
arrange, or make any or all modifications to the Software, and the right
to reproduce the resulting software.
The Licensee is authorized to make any or all Contributions to the
Software provided that it includes an explicit notice that it is the
author of said Contribution and indicates the date of the creation thereof.
5.3 RIGHT OF DISTRIBUTION
In particular, the right of distribution includes the right to publish,
transmit and communicate the Software to the general public on any or
all medium, and by any or all means, and the right to market, either in
consideration of a fee, or free of charge, one or more copies of the
Software by any means.
The Licensee is further authorized to distribute copies of the modified
or unmodified Software to third parties according to the terms and
conditions set forth hereinafter.
5.3.1 DISTRIBUTION OF SOFTWARE WITHOUT MODIFICATION
The Licensee is authorized to distribute true copies of the Software in
Source Code or Object Code form, provided that said distribution
complies with all the provisions of the Agreement and is accompanied by:
1. a copy of the Agreement,
2. a notice relating to the limitation of both the Licensor's
warranty and liability as set forth in Articles 8 and 9,
and that, in the event that only the Object Code of the Software is
redistributed, the Licensee allows future Licensees unhindered access to
the full Source Code of the Software by indicating how to access it, it
being understood that the additional cost of acquiring the Source Code
shall not exceed the cost of transferring the data.
5.3.2 DISTRIBUTION OF MODIFIED SOFTWARE
When the Licensee makes a Contribution to the Software, the terms and
conditions for the distribution of the resulting Modified Software
become subject to all the provisions of this Agreement.
The Licensee is authorized to distribute the Modified Software, in
source code or object code form, provided that said distribution
complies with all the provisions of the Agreement and is accompanied by:
1. a copy of the Agreement,
2. a notice relating to the limitation of both the Licensor's
warranty and liability as set forth in Articles 8 and 9,
and that, in the event that only the object code of the Modified
Software is redistributed, the Licensee allows future Licensees
unhindered access to the full source code of the Modified Software by
indicating how to access it, it being understood that the additional
cost of acquiring the source code shall not exceed the cost of
transferring the data.
5.3.3 DISTRIBUTION OF EXTERNAL MODULES
When the Licensee has developed an External Module, the terms and
conditions of this Agreement do not apply to said External Module, that
may be distributed under a separate license agreement.
5.3.4 COMPATIBILITY WITH THE GNU GPL
The Licensee can include a code that is subject to the provisions of one
of the versions of the GNU GPL in the Modified or unmodified Software,
and distribute that entire code under the terms of the same version of
the GNU GPL.
The Licensee can include the Modified or unmodified Software in a code
that is subject to the provisions of one of the versions of the GNU GPL,
and distribute that entire code under the terms of the same version of
the GNU GPL.
Article 6 - INTELLECTUAL PROPERTY
6.1 OVER THE INITIAL SOFTWARE
The Holder owns the economic rights over the Initial Software. Any or
all use of the Initial Software is subject to compliance with the terms
and conditions under which the Holder has elected to distribute its work
and no one shall be entitled to modify the terms and conditions for the
distribution of said Initial Software.
The Holder undertakes that the Initial Software will remain ruled at
least by this Agreement, for the duration set forth in Article 4.2.
6.2 OVER THE CONTRIBUTIONS
The Licensee who develops a Contribution is the owner of the
intellectual property rights over this Contribution as defined by
applicable law.
6.3 OVER THE EXTERNAL MODULES
The Licensee who develops an External Module is the owner of the
intellectual property rights over this External Module as defined by
applicable law and is free to choose the type of agreement that shall
govern its distribution.
6.4 JOINT PROVISIONS
The Licensee expressly undertakes:
1. not to remove, or modify, in any manner, the intellectual property
notices attached to the Software;
2. to reproduce said notices, in an identical manner, in the copies
of the Software modified or not.
The Licensee undertakes not to directly or indirectly infringe the
intellectual property rights of the Holder and/or Contributors on the
Software and to take, where applicable, vis-<2D>-vis its staff, any and all
measures required to ensure respect of said intellectual property rights
of the Holder and/or Contributors.
Article 7 - RELATED SERVICES
7.1 Under no circumstances shall the Agreement oblige the Licensor to
provide technical assistance or maintenance services for the Software.
However, the Licensor is entitled to offer this type of services. The
terms and conditions of such technical assistance, and/or such
maintenance, shall be set forth in a separate instrument. Only the
Licensor offering said maintenance and/or technical assistance services
shall incur liability therefor.
7.2 Similarly, any Licensor is entitled to offer to its licensees, under
its sole responsibility, a warranty, that shall only be binding upon
itself, for the redistribution of the Software and/or the Modified
Software, under terms and conditions that it is free to decide. Said
warranty, and the financial terms and conditions of its application,
shall be subject of a separate instrument executed between the Licensor
and the Licensee.
Article 8 - LIABILITY
8.1 Subject to the provisions of Article 8.2, the Licensee shall be
entitled to claim compensation for any direct loss it may have suffered
from the Software as a result of a fault on the part of the relevant
Licensor, subject to providing evidence thereof.
8.2 The Licensor's liability is limited to the commitments made under
this Agreement and shall not be incurred as a result of in particular:
(i) loss due the Licensee's total or partial failure to fulfill its
obligations, (ii) direct or consequential loss that is suffered by the
Licensee due to the use or performance of the Software, and (iii) more
generally, any consequential loss. In particular the Parties expressly
agree that any or all pecuniary or business loss (i.e. loss of data,
loss of profits, operating loss, loss of customers or orders,
opportunity cost, any disturbance to business activities) or any or all
legal proceedings instituted against the Licensee by a third party,
shall constitute consequential loss and shall not provide entitlement to
any or all compensation from the Licensor.
Article 9 - WARRANTY
9.1 The Licensee acknowledges that the scientific and technical
state-of-the-art when the Software was distributed did not enable all
possible uses to be tested and verified, nor for the presence of
possible defects to be detected. In this respect, the Licensee's
attention has been drawn to the risks associated with loading, using,
modifying and/or developing and reproducing the Software which are
reserved for experienced users.
The Licensee shall be responsible for verifying, by any or all means,
the suitability of the product for its requirements, its good working
order, and for ensuring that it shall not cause damage to either persons
or properties.
9.2 The Licensor hereby represents, in good faith, that it is entitled
to grant all the rights over the Software (including in particular the
rights set forth in Article 5).
9.3 The Licensee acknowledges that the Software is supplied "as is" by
the Licensor without any other express or tacit warranty, other than
that provided for in Article 9.2 and, in particular, without any warranty
as to its commercial value, its secured, safe, innovative or relevant
nature.
Specifically, the Licensor does not warrant that the Software is free
from any error, that it will operate without interruption, that it will
be compatible with the Licensee's own equipment and software
configuration, nor that it will meet the Licensee's requirements.
9.4 The Licensor does not either expressly or tacitly warrant that the
Software does not infringe any third party intellectual property right
relating to a patent, software or any other property right. Therefore,
the Licensor disclaims any and all liability towards the Licensee
arising out of any or all proceedings for infringement that may be
instituted in respect of the use, modification and redistribution of the
Software. Nevertheless, should such proceedings be instituted against
the Licensee, the Licensor shall provide it with technical and legal
assistance for its defense. Such technical and legal assistance shall be
decided on a case-by-case basis between the relevant Licensor and the
Licensee pursuant to a memorandum of understanding. The Licensor
disclaims any and all liability as regards the Licensee's use of the
name of the Software. No warranty is given as regards the existence of
prior rights over the name of the Software or as regards the existence
of a trademark.
Article 10 - TERMINATION
10.1 In the event of a breach by the Licensee of its obligations
hereunder, the Licensor may automatically terminate this Agreement
thirty (30) days after notice has been sent to the Licensee and has
remained ineffective.
10.2 A Licensee whose Agreement is terminated shall no longer be
authorized to use, modify or distribute the Software. However, any
licenses that it may have granted prior to termination of the Agreement
shall remain valid subject to their having been granted in compliance
with the terms and conditions hereof.
Article 11 - MISCELLANEOUS
11.1 EXCUSABLE EVENTS
Neither Party shall be liable for any or all delay, or failure to
perform the Agreement, that may be attributable to an event of force
majeure, an act of God or an outside cause, such as defective
functioning or interruptions of the electricity or telecommunications
networks, network paralysis following a virus attack, intervention by
government authorities, natural disasters, water damage, earthquakes,
fire, explosions, strikes and labor unrest, war, etc.
11.2 Any failure by either Party, on one or more occasions, to invoke
one or more of the provisions hereof, shall under no circumstances be
interpreted as being a waiver by the interested Party of its right to
invoke said provision(s) subsequently.
11.3 The Agreement cancels and replaces any or all previous agreements,
whether written or oral, between the Parties and having the same
purpose, and constitutes the entirety of the agreement between said
Parties concerning said purpose. No supplement or modification to the
terms and conditions hereof shall be effective as between the Parties
unless it is made in writing and signed by their duly authorized
representatives.
11.4 In the event that one or more of the provisions hereof were to
conflict with a current or future applicable act or legislative text,
said act or legislative text shall prevail, and the Parties shall make
the necessary amendments so as to comply with said act or legislative
text. All other provisions shall remain effective. Similarly, invalidity
of a provision of the Agreement, for any reason whatsoever, shall not
cause the Agreement as a whole to be invalid.
11.5 LANGUAGE
The Agreement is drafted in both French and English and both versions
are deemed authentic.
Article 12 - NEW VERSIONS OF THE AGREEMENT
12.1 Any person is authorized to duplicate and distribute copies of this
Agreement.
12.2 So as to ensure coherence, the wording of this Agreement is
protected and may only be modified by the authors of the License, who
reserve the right to periodically publish updates or new versions of the
Agreement, each with a separate number. These subsequent versions may
address new issues encountered by Free Software.
12.3 Any Software distributed under a given version of the Agreement may
only be subsequently distributed under the same version of the Agreement
or a subsequent version, subject to the provisions of Article 5.3.4.
Article 13 - GOVERNING LAW AND JURISDICTION
13.1 The Agreement is governed by French law. The Parties agree to
endeavor to seek an amicable solution to any disagreements or disputes
that may arise during the performance of the Agreement.
13.2 Failing an amicable solution within two (2) months as from their
occurrence, and unless emergency proceedings are necessary, the
disagreements or disputes shall be referred to the Paris Courts having
jurisdiction, by the more diligent Party.
Version 2.0 dated 2006-09-05.

512
Licence_CeCILL_V2-fr.txt Normal file
View File

@ -0,0 +1,512 @@
CONTRAT DE LICENCE DE LOGICIEL LIBRE CeCILL
Avertissement
Ce contrat est une licence de logiciel libre issue d'une concertation
entre ses auteurs afin que le respect de deux grands principes pr<70>side <20>
sa r<>daction:
* d'une part, le respect des principes de diffusion des logiciels
libres: acc<63>s au code source, droits <20>tendus conf<6E>r<EFBFBD>s aux
utilisateurs,
* d'autre part, la d<>signation d'un droit applicable, le droit
fran<61>ais, auquel elle est conforme, tant au regard du droit de la
responsabilit<69> civile que du droit de la propri<72>t<EFBFBD> intellectuelle
et de la protection qu'il offre aux auteurs et titulaires des
droits patrimoniaux sur un logiciel.
Les auteurs de la licence CeCILL (pour Ce[a] C[nrs] I[nria] L[ogiciel]
L[ibre]) sont:
Commissariat <20> l'Energie Atomique - CEA, <20>tablissement public de
recherche <20> caract<63>re scientifique, technique et industriel, dont le
si<EFBFBD>ge est situ<74> 25 rue Leblanc, immeuble Le Ponant D, 75015 Paris.
Centre National de la Recherche Scientifique - CNRS, <20>tablissement
public <20> caract<63>re scientifique et technologique, dont le si<73>ge est
situ<EFBFBD> 3 rue Michel-Ange, 75794 Paris cedex 16.
Institut National de Recherche en Informatique et en Automatique -
INRIA, <20>tablissement public <20> caract<63>re scientifique et technologique,
dont le si<73>ge est situ<74> Domaine de Voluceau, Rocquencourt, BP 105, 78153
Le Chesnay cedex.
Pr<50>ambule
Ce contrat est une licence de logiciel libre dont l'objectif est de
conf<EFBFBD>rer aux utilisateurs la libert<72> de modification et de
redistribution du logiciel r<>gi par cette licence dans le cadre d'un
mod<EFBFBD>le de diffusion en logiciel libre.
L'exercice de ces libert<72>s est assorti de certains devoirs <20> la charge
des utilisateurs afin de pr<70>server ce statut au cours des
redistributions ult<6C>rieures.
L'accessibilit<69> au code source et les droits de copie, de modification
et de redistribution qui en d<>coulent ont pour contrepartie de n'offrir
aux utilisateurs qu'une garantie limit<69>e et de ne faire peser sur
l'auteur du logiciel, le titulaire des droits patrimoniaux et les
conc<EFBFBD>dants successifs qu'une responsabilit<69> restreinte.
A cet <20>gard l'attention de l'utilisateur est attir<69>e sur les risques
associ<EFBFBD>s au chargement, <20> l'utilisation, <20> la modification et/ou au
d<EFBFBD>veloppement et <20> la reproduction du logiciel par l'utilisateur <20>tant
donn<EFBFBD> sa sp<73>cificit<69> de logiciel libre, qui peut le rendre complexe <20>
manipuler et qui le r<>serve donc <20> des d<>veloppeurs ou des
professionnels avertis poss<73>dant des connaissances informatiques
approfondies. Les utilisateurs sont donc invit<69>s <20> charger et tester
l'ad<61>quation du logiciel <20> leurs besoins dans des conditions permettant
d'assurer la s<>curit<69> de leurs syst<73>mes et/ou de leurs donn<6E>es et, plus
g<EFBFBD>n<EFBFBD>ralement, <20> l'utiliser et l'exploiter dans les m<>mes conditions de
s<EFBFBD>curit<EFBFBD>. Ce contrat peut <20>tre reproduit et diffus<75> librement, sous
r<EFBFBD>serve de le conserver en l'<27>tat, sans ajout ni suppression de clauses.
Ce contrat est susceptible de s'appliquer <20> tout logiciel dont le
titulaire des droits patrimoniaux d<>cide de soumettre l'exploitation aux
dispositions qu'il contient.
Article 1 - DEFINITIONS
Dans ce contrat, les termes suivants, lorsqu'ils seront <20>crits avec une
lettre capitale, auront la signification suivante:
Contrat: d<>signe le pr<70>sent contrat de licence, ses <20>ventuelles versions
post<EFBFBD>rieures et annexes.
Logiciel: d<>signe le logiciel sous sa forme de Code Objet et/ou de Code
Source et le cas <20>ch<63>ant sa documentation, dans leur <20>tat au moment de
l'acceptation du Contrat par le Licenci<63>.
Logiciel Initial: d<>signe le Logiciel sous sa forme de Code Source et
<EFBFBD>ventuellement de Code Objet et le cas <20>ch<63>ant sa documentation, dans
leur <20>tat au moment de leur premi<6D>re diffusion sous les termes du Contrat.
Logiciel Modifi<66>: d<>signe le Logiciel modifi<66> par au moins une
Contribution.
Code Source: d<>signe l'ensemble des instructions et des lignes de
programme du Logiciel et auquel l'acc<63>s est n<>cessaire en vue de
modifier le Logiciel.
Code Objet: d<>signe les fichiers binaires issus de la compilation du
Code Source.
Titulaire: d<>signe le ou les d<>tenteurs des droits patrimoniaux d'auteur
sur le Logiciel Initial.
Licenci<EFBFBD>: d<>signe le ou les utilisateurs du Logiciel ayant accept<70> le
Contrat.
Contributeur: d<>signe le Licenci<63> auteur d'au moins une Contribution.
Conc<EFBFBD>dant: d<>signe le Titulaire ou toute personne physique ou morale
distribuant le Logiciel sous le Contrat.
Contribution: d<>signe l'ensemble des modifications, corrections,
traductions, adaptations et/ou nouvelles fonctionnalit<69>s int<6E>gr<67>es dans
le Logiciel par tout Contributeur, ainsi que tout Module Interne.
Module: d<>signe un ensemble de fichiers sources y compris leur
documentation qui permet de r<>aliser des fonctionnalit<69>s ou services
suppl<EFBFBD>mentaires <20> ceux fournis par le Logiciel.
Module Externe: d<>signe tout Module, non d<>riv<69> du Logiciel, tel que ce
Module et le Logiciel s'ex<65>cutent dans des espaces d'adressage
diff<EFBFBD>rents, l'un appelant l'autre au moment de leur ex<65>cution.
Module Interne: d<>signe tout Module li<6C> au Logiciel de telle sorte
qu'ils s'ex<65>cutent dans le m<>me espace d'adressage.
GNU GPL: d<>signe la GNU General Public License dans sa version 2 ou
toute version ult<6C>rieure, telle que publi<6C>e par Free Software Foundation
Inc.
Parties: d<>signe collectivement le Licenci<63> et le Conc<6E>dant.
Ces termes s'entendent au singulier comme au pluriel.
Article 2 - OBJET
Le Contrat a pour objet la concession par le Conc<6E>dant au Licenci<63> d'une
licence non exclusive, cessible et mondiale du Logiciel telle que
d<EFBFBD>finie ci-apr<70>s <20> l'article 5 pour toute la dur<75>e de protection des droits
portant sur ce Logiciel.
Article 3 - ACCEPTATION
3.1 L'acceptation par le Licenci<63> des termes du Contrat est r<>put<75>e
acquise du fait du premier des faits suivants:
* (i) le chargement du Logiciel par tout moyen notamment par
t<>l<EFBFBD>chargement <20> partir d'un serveur distant ou par chargement <20>
partir d'un support physique;
* (ii) le premier exercice par le Licenci<63> de l'un quelconque des
droits conc<6E>d<EFBFBD>s par le Contrat.
3.2 Un exemplaire du Contrat, contenant notamment un avertissement
relatif aux sp<73>cificit<69>s du Logiciel, <20> la restriction de garantie et <20>
la limitation <20> un usage par des utilisateurs exp<78>riment<6E>s a <20>t<EFBFBD> mis <20>
disposition du Licenci<63> pr<70>alablement <20> son acceptation telle que
d<EFBFBD>finie <20> l'article 3.1 ci dessus et le Licenci<63> reconna<6E>t en avoir pris
connaissance.
Article 4 - ENTREE EN VIGUEUR ET DUREE
4.1 ENTREE EN VIGUEUR
Le Contrat entre en vigueur <20> la date de son acceptation par le Licenci<63>
telle que d<>finie en 3.1.
4.2 DUREE
Le Contrat produira ses effets pendant toute la dur<75>e l<>gale de
protection des droits patrimoniaux portant sur le Logiciel.
Article 5 - ETENDUE DES DROITS CONCEDES
Le Conc<6E>dant conc<6E>de au Licenci<63>, qui accepte, les droits suivants sur
le Logiciel pour toutes destinations et pour la dur<75>e du Contrat dans
les conditions ci-apr<70>s d<>taill<6C>es.
Par ailleurs, si le Conc<6E>dant d<>tient ou venait <20> d<>tenir un ou
plusieurs brevets d'invention prot<6F>geant tout ou partie des
fonctionnalit<EFBFBD>s du Logiciel ou de ses composants, il s'engage <20> ne pas
opposer les <20>ventuels droits conf<6E>r<EFBFBD>s par ces brevets aux Licenci<63>s
successifs qui utiliseraient, exploiteraient ou modifieraient le
Logiciel. En cas de cession de ces brevets, le Conc<6E>dant s'engage <20>
faire reprendre les obligations du pr<70>sent alin<69>a aux cessionnaires.
5.1 DROIT D'UTILISATION
Le Licenci<63> est autoris<69> <20> utiliser le Logiciel, sans restriction quant
aux domaines d'application, <20>tant ci-apr<70>s pr<70>cis<69> que cela comporte:
1. la reproduction permanente ou provisoire du Logiciel en tout ou
partie par tout moyen et sous toute forme.
2. le chargement, l'affichage, l'ex<65>cution, ou le stockage du
Logiciel sur tout support.
3. la possibilit<69> d'en observer, d'en <20>tudier, ou d'en tester le
fonctionnement afin de d<>terminer les id<69>es et principes qui sont
<20> la base de n'importe quel <20>l<EFBFBD>ment de ce Logiciel; et ceci,
lorsque le Licenci<63> effectue toute op<6F>ration de chargement,
d'affichage, d'ex<65>cution, de transmission ou de stockage du
Logiciel qu'il est en droit d'effectuer en vertu du Contrat.
5.2 DROIT D'APPORTER DES CONTRIBUTIONS
Le droit d'apporter des Contributions comporte le droit de traduire,
d'adapter, d'arranger ou d'apporter toute autre modification au Logiciel
et le droit de reproduire le logiciel en r<>sultant.
Le Licenci<63> est autoris<69> <20> apporter toute Contribution au Logiciel sous
r<EFBFBD>serve de mentionner, de fa<66>on explicite, son nom en tant qu'auteur de
cette Contribution et la date de cr<63>ation de celle-ci.
5.3 DROIT DE DISTRIBUTION
Le droit de distribution comporte notamment le droit de diffuser, de
transmettre et de communiquer le Logiciel au public sur tout support et
par tout moyen ainsi que le droit de mettre sur le march<63> <20> titre
on<EFBFBD>reux ou gratuit, un ou des exemplaires du Logiciel par tout proc<6F>d<EFBFBD>.
Le Licenci<63> est autoris<69> <20> distribuer des copies du Logiciel, modifi<66> ou
non, <20> des tiers dans les conditions ci-apr<70>s d<>taill<6C>es.
5.3.1 DISTRIBUTION DU LOGICIEL SANS MODIFICATION
Le Licenci<63> est autoris<69> <20> distribuer des copies conformes du Logiciel,
sous forme de Code Source ou de Code Objet, <20> condition que cette
distribution respecte les dispositions du Contrat dans leur totalit<69> et
soit accompagn<67>e:
1. d'un exemplaire du Contrat,
2. d'un avertissement relatif <20> la restriction de garantie et de
responsabilit<69> du Conc<6E>dant telle que pr<70>vue aux articles 8
et 9,
et que, dans le cas o<> seul le Code Objet du Logiciel est redistribu<62>,
le Licenci<63> permette aux futurs Licenci<63>s d'acc<63>der facilement au Code
Source complet du Logiciel en indiquant les modalit<69>s d'acc<63>s, <20>tant
entendu que le co<63>t additionnel d'acquisition du Code Source ne devra
pas exc<78>der le simple co<63>t de transfert des donn<6E>es.
5.3.2 DISTRIBUTION DU LOGICIEL MODIFIE
Lorsque le Licenci<63> apporte une Contribution au Logiciel, les conditions
de distribution du Logiciel Modifi<66> en r<>sultant sont alors soumises <20>
l'int<6E>gralit<69> des dispositions du Contrat.
Le Licenci<63> est autoris<69> <20> distribuer le Logiciel Modifi<66>, sous forme de
code source ou de code objet, <20> condition que cette distribution
respecte les dispositions du Contrat dans leur totalit<69> et soit
accompagn<EFBFBD>e:
1. d'un exemplaire du Contrat,
2. d'un avertissement relatif <20> la restriction de garantie et de
responsabilit<69> du Conc<6E>dant telle que pr<70>vue aux articles 8
et 9,
et que, dans le cas o<> seul le code objet du Logiciel Modifi<66> est
redistribu<EFBFBD>, le Licenci<63> permette aux futurs Licenci<63>s d'acc<63>der
facilement au code source complet du Logiciel Modifi<66> en indiquant les
modalit<EFBFBD>s d'acc<63>s, <20>tant entendu que le co<63>t additionnel d'acquisition
du code source ne devra pas exc<78>der le simple co<63>t de transfert des donn<6E>es.
5.3.3 DISTRIBUTION DES MODULES EXTERNES
Lorsque le Licenci<63> a d<>velopp<70> un Module Externe les conditions du
Contrat ne s'appliquent pas <20> ce Module Externe, qui peut <20>tre distribu<62>
sous un contrat de licence diff<66>rent.
5.3.4 COMPATIBILITE AVEC LA LICENCE GNU GPL
Le Licenci<63> peut inclure un code soumis aux dispositions d'une des
versions de la licence GNU GPL dans le Logiciel modifi<66> ou non et
distribuer l'ensemble sous les conditions de la m<>me version de la
licence GNU GPL.
Le Licenci<63> peut inclure le Logiciel modifi<66> ou non dans un code soumis
aux dispositions d'une des versions de la licence GNU GPL et distribuer
l'ensemble sous les conditions de la m<>me version de la licence GNU GPL.
Article 6 - PROPRIETE INTELLECTUELLE
6.1 SUR LE LOGICIEL INITIAL
Le Titulaire est d<>tenteur des droits patrimoniaux sur le Logiciel
Initial. Toute utilisation du Logiciel Initial est soumise au respect
des conditions dans lesquelles le Titulaire a choisi de diffuser son
oeuvre et nul autre n'a la facult<6C> de modifier les conditions de
diffusion de ce Logiciel Initial.
Le Titulaire s'engage <20> ce que le Logiciel Initial reste au moins r<>gi
par le Contrat et ce, pour la dur<75>e vis<69>e <20> l'article 4.2.
6.2 SUR LES CONTRIBUTIONS
Le Licenci<63> qui a d<>velopp<70> une Contribution est titulaire sur celle-ci
des droits de propri<72>t<EFBFBD> intellectuelle dans les conditions d<>finies par
la l<>gislation applicable.
6.3 SUR LES MODULES EXTERNES
Le Licenci<63> qui a d<>velopp<70> un Module Externe est titulaire sur celui-ci
des droits de propri<72>t<EFBFBD> intellectuelle dans les conditions d<>finies par
la l<>gislation applicable et reste libre du choix du contrat r<>gissant
sa diffusion.
6.4 DISPOSITIONS COMMUNES
Le Licenci<63> s'engage express<73>ment:
1. <20> ne pas supprimer ou modifier de quelque mani<6E>re que ce soit les
mentions de propri<72>t<EFBFBD> intellectuelle appos<6F>es sur le Logiciel;
2. <20> reproduire <20> l'identique lesdites mentions de propri<72>t<EFBFBD>
intellectuelle sur les copies du Logiciel modifi<66> ou non.
Le Licenci<63> s'engage <20> ne pas porter atteinte, directement ou
indirectement, aux droits de propri<72>t<EFBFBD> intellectuelle du Titulaire et/ou
des Contributeurs sur le Logiciel et <20> prendre, le cas <20>ch<63>ant, <20>
l'<27>gard de son personnel toutes les mesures n<>cessaires pour assurer le
respect des dits droits de propri<72>t<EFBFBD> intellectuelle du Titulaire et/ou
des Contributeurs.
Article 7 - SERVICES ASSOCIES
7.1 Le Contrat n'oblige en aucun cas le Conc<6E>dant <20> la r<>alisation de
prestations d'assistance technique ou de maintenance du Logiciel.
Cependant le Conc<6E>dant reste libre de proposer ce type de services. Les
termes et conditions d'une telle assistance technique et/ou d'une telle
maintenance seront alors d<>termin<69>s dans un acte s<>par<61>. Ces actes de
maintenance et/ou assistance technique n'engageront que la seule
responsabilit<EFBFBD> du Conc<6E>dant qui les propose.
7.2 De m<>me, tout Conc<6E>dant est libre de proposer, sous sa seule
responsabilit<EFBFBD>, <20> ses licenci<63>s une garantie, qui n'engagera que lui,
lors de la redistribution du Logiciel et/ou du Logiciel Modifi<66> et ce,
dans les conditions qu'il souhaite. Cette garantie et les modalit<69>s
financi<EFBFBD>res de son application feront l'objet d'un acte s<>par<61> entre le
Conc<EFBFBD>dant et le Licenci<63>.
Article 8 - RESPONSABILITE
8.1 Sous r<>serve des dispositions de l'article 8.2, le Licenci<63> a la
facult<EFBFBD>, sous r<>serve de prouver la faute du Conc<6E>dant concern<72>, de
solliciter la r<>paration du pr<70>judice direct qu'il subirait du fait du
Logiciel et dont il apportera la preuve.
8.2 La responsabilit<69> du Conc<6E>dant est limit<69>e aux engagements pris en
application du Contrat et ne saurait <20>tre engag<61>e en raison notamment:
(i) des dommages dus <20> l'inex<65>cution, totale ou partielle, de ses
obligations par le Licenci<63>, (ii) des dommages directs ou indirects
d<EFBFBD>coulant de l'utilisation ou des performances du Logiciel subis par le
Licenci<EFBFBD> et (iii) plus g<>n<EFBFBD>ralement d'un quelconque dommage indirect. En
particulier, les Parties conviennent express<73>ment que tout pr<70>judice
financier ou commercial (par exemple perte de donn<6E>es, perte de
b<EFBFBD>n<EFBFBD>fices, perte d'exploitation, perte de client<6E>le ou de commandes,
manque <20> gagner, trouble commercial quelconque) ou toute action dirig<69>e
contre le Licenci<63> par un tiers, constitue un dommage indirect et
n'ouvre pas droit <20> r<>paration par le Conc<6E>dant.
Article 9 - GARANTIE
9.1 Le Licenci<63> reconna<6E>t que l'<27>tat actuel des connaissances
scientifiques et techniques au moment de la mise en circulation du
Logiciel ne permet pas d'en tester et d'en v<>rifier toutes les
utilisations ni de d<>tecter l'existence d'<27>ventuels d<>fauts. L'attention
du Licenci<63> a <20>t<EFBFBD> attir<69>e sur ce point sur les risques associ<63>s au
chargement, <20> l'utilisation, la modification et/ou au d<>veloppement et <20>
la reproduction du Logiciel qui sont r<>serv<72>s <20> des utilisateurs avertis.
Il rel<65>ve de la responsabilit<69> du Licenci<63> de contr<74>ler, par tous
moyens, l'ad<61>quation du produit <20> ses besoins, son bon fonctionnement et
de s'assurer qu'il ne causera pas de dommages aux personnes et aux biens.
9.2 Le Conc<6E>dant d<>clare de bonne foi <20>tre en droit de conc<6E>der
l'ensemble des droits attach<63>s au Logiciel (comprenant notamment les
droits vis<69>s <20> l'article 5).
9.3 Le Licenci<63> reconna<6E>t que le Logiciel est fourni "en l'<27>tat" par le
Conc<EFBFBD>dant sans autre garantie, expresse ou tacite, que celle pr<70>vue <20>
l'article 9.2 et notamment sans aucune garantie sur sa valeur commerciale,
son caract<63>re s<>curis<69>, innovant ou pertinent.
En particulier, le Conc<6E>dant ne garantit pas que le Logiciel est exempt
d'erreur, qu'il fonctionnera sans interruption, qu'il sera compatible
avec l'<27>quipement du Licenci<63> et sa configuration logicielle ni qu'il
remplira les besoins du Licenci<63>.
9.4 Le Conc<6E>dant ne garantit pas, de mani<6E>re expresse ou tacite, que le
Logiciel ne porte pas atteinte <20> un quelconque droit de propri<72>t<EFBFBD>
intellectuelle d'un tiers portant sur un brevet, un logiciel ou sur tout
autre droit de propri<72>t<EFBFBD>. Ainsi, le Conc<6E>dant exclut toute garantie au
profit du Licenci<63> contre les actions en contrefa<66>on qui pourraient <20>tre
diligent<EFBFBD>es au titre de l'utilisation, de la modification, et de la
redistribution du Logiciel. N<>anmoins, si de telles actions sont
exerc<EFBFBD>es contre le Licenci<63>, le Conc<6E>dant lui apportera son aide
technique et juridique pour sa d<>fense. Cette aide technique et
juridique est d<>termin<69>e au cas par cas entre le Conc<6E>dant concern<72> et
le Licenci<63> dans le cadre d'un protocole d'accord. Le Conc<6E>dant d<>gage
toute responsabilit<69> quant <20> l'utilisation de la d<>nomination du
Logiciel par le Licenci<63>. Aucune garantie n'est apport<72>e quant <20>
l'existence de droits ant<6E>rieurs sur le nom du Logiciel et sur
l'existence d'une marque.
Article 10 - RESILIATION
10.1 En cas de manquement par le Licenci<63> aux obligations mises <20> sa
charge par le Contrat, le Conc<6E>dant pourra r<>silier de plein droit le
Contrat trente (30) jours apr<70>s notification adress<73>e au Licenci<63> et
rest<EFBFBD>e sans effet.
10.2 Le Licenci<63> dont le Contrat est r<>sili<6C> n'est plus autoris<69> <20>
utiliser, modifier ou distribuer le Logiciel. Cependant, toutes les
licences qu'il aura conc<6E>d<EFBFBD>es ant<6E>rieurement <20> la r<>siliation du Contrat
resteront valides sous r<>serve qu'elles aient <20>t<EFBFBD> effectu<74>es en
conformit<EFBFBD> avec le Contrat.
Article 11 - DISPOSITIONS DIVERSES
11.1 CAUSE EXTERIEURE
Aucune des Parties ne sera responsable d'un retard ou d'une d<>faillance
d'ex<65>cution du Contrat qui serait d<> <20> un cas de force majeure, un cas
fortuit ou une cause ext<78>rieure, telle que, notamment, le mauvais
fonctionnement ou les interruptions du r<>seau <20>lectrique ou de
t<EFBFBD>l<EFBFBD>communication, la paralysie du r<>seau li<6C>e <20> une attaque
informatique, l'intervention des autorit<69>s gouvernementales, les
catastrophes naturelles, les d<>g<EFBFBD>ts des eaux, les tremblements de terre,
le feu, les explosions, les gr<67>ves et les conflits sociaux, l'<27>tat de
guerre...
11.2 Le fait, par l'une ou l'autre des Parties, d'omettre en une ou
plusieurs occasions de se pr<70>valoir d'une ou plusieurs dispositions du
Contrat, ne pourra en aucun cas impliquer renonciation par la Partie
int<EFBFBD>ress<EFBFBD>e <20> s'en pr<70>valoir ult<6C>rieurement.
11.3 Le Contrat annule et remplace toute convention ant<6E>rieure, <20>crite
ou orale, entre les Parties sur le m<>me objet et constitue l'accord
entier entre les Parties sur cet objet. Aucune addition ou modification
aux termes du Contrat n'aura d'effet <20> l'<27>gard des Parties <20> moins
d'<27>tre faite par <20>crit et sign<67>e par leurs repr<70>sentants d<>ment habilit<69>s.
11.4 Dans l'hypoth<74>se o<> une ou plusieurs des dispositions du Contrat
s'av<61>rerait contraire <20> une loi ou <20> un texte applicable, existants ou
futurs, cette loi ou ce texte pr<70>vaudrait, et les Parties feraient les
amendements n<>cessaires pour se conformer <20> cette loi ou <20> ce texte.
Toutes les autres dispositions resteront en vigueur. De m<>me, la
nullit<EFBFBD>, pour quelque raison que ce soit, d'une des dispositions du
Contrat ne saurait entra<72>ner la nullit<69> de l'ensemble du Contrat.
11.5 LANGUE
Le Contrat est r<>dig<69> en langue fran<61>aise et en langue anglaise, ces
deux versions faisant <20>galement foi.
Article 12 - NOUVELLES VERSIONS DU CONTRAT
12.1 Toute personne est autoris<69>e <20> copier et distribuer des copies de
ce Contrat.
12.2 Afin d'en pr<70>server la coh<6F>rence, le texte du Contrat est prot<6F>g<EFBFBD>
et ne peut <20>tre modifi<66> que par les auteurs de la licence, lesquels se
r<EFBFBD>servent le droit de publier p<>riodiquement des mises <20> jour ou de
nouvelles versions du Contrat, qui poss<73>deront chacune un num<75>ro
distinct. Ces versions ult<6C>rieures seront susceptibles de prendre en
compte de nouvelles probl<62>matiques rencontr<74>es par les logiciels libres.
12.3 Tout Logiciel diffus<75> sous une version donn<6E>e du Contrat ne pourra
faire l'objet d'une diffusion ult<6C>rieure que sous la m<>me version du
Contrat ou une version post<73>rieure, sous r<>serve des dispositions de
l'article 5.3.4.
Article 13 - LOI APPLICABLE ET COMPETENCE TERRITORIALE
13.1 Le Contrat est r<>gi par la loi fran<61>aise. Les Parties conviennent
de tenter de r<>gler <20> l'amiable les diff<66>rends ou litiges qui
viendraient <20> se produire par suite ou <20> l'occasion du Contrat.
13.2 A d<>faut d'accord amiable dans un d<>lai de deux (2) mois <20> compter
de leur survenance et sauf situation relevant d'une proc<6F>dure d'urgence,
les diff<66>rends ou litiges seront port<72>s par la Partie la plus diligente
devant les Tribunaux comp<6D>tents de Paris.
Version 2.0 du 2006-09-05.

1
VERSION Normal file
View File

@ -0,0 +1 @@
1.0

1389
ahoc_metazoas.gv Normal file

File diff suppressed because it is too large Load Diff

15
ecoPrimerCommands Normal file
View File

@ -0,0 +1,15 @@
./ecoPrimer -d /Groups/Barcode-Leca/eubacteria-gr -l 10 -L 1000 -e 3 > euBactResults.txt
./ecoPrimer -d ChloroDB/chloroplast -l 5 -L 120 -r 58023 -e 3 > chloroVascularPlantsEric.txt
./ecoPrimer -d /Users/tiayyba/Documents/Data/mitochondrion/mitochondrion -q 0.4 -s 0.5 -l 10 -L 60 -r 1 -i 1 -T 0.2 -p > setsRoot.txt
./ecoPCR -d /Users/tiayyba/Documents/workspace/ecoPrimers/src/mitochondrion/mitochondrion -l 50 -L 120 -r 7742 TAGAACAGGCTCCTCTAG TTAGATACCCCACTATGC > 12SV5.ecoPCR
ecoTaxSpecificity -d /Users/tiayyba/Documents/workspace/ecoPrimers/src/mitochondrion/mitochondrion /Users/tiayyba/Documents/workspace/ecoPCR/src/12SV5.ecoPCR
149 ./ecoPrimer -d /Users/tiayyba/Documents/Data/mitochondrion/mitochondrion -r 40674 -E 9606 -l 10 -L 100 > MamalsNotHomoSapien.primers
150 ./ecoPrimer -d /Users/tiayyba/Documents/Data/mitochondrion/mitochondrion -q 0.4 -s 0.5 -r 40674 -E 9606 -l 10 -L 100 > MamalsNotHomoSapien.primers
151 ./ecoPrimer -d /Users/tiayyba/Documents/Data/mitochondrion/mitochondrion -q 0.5 -s 0.6 -r 40674 -E 9606 -l 10 -L 100 > MamalsNotHomoSapien1.primers

BIN
src copy.zip Normal file

Binary file not shown.

250
src/.Rhistory Normal file
View File

@ -0,0 +1,250 @@
plot(s$seq, s$size, xlab="Sequence Count", ylab="Memory used[B]", main="a. memory used vs sequence count without data mining")
abline(v = 273, col = "Blue")
text(400, 0.7e+09, "nSeq = 273")
plot(t$seq, t$size, xlab="Sequence Count", ylab="Memory used[B]", main="b. memory used vs sequence count with data mining")
abline(v = 273, col = "Blue")
text(450, 700, "nSeq = 273")
plot(s$seq, s$time, xlab="Sequence Count", ylab="Time[s]", main="c. time vs sequence count withount data mining")
abline(v = 273, col = "Blue")
text(450, 300, "nSeq = 273")
plot(t$seq, t$time, xlab="Sequence Count", ylab="Time[s]", main="d. time vs sequence count with data mining")
abline(v = 273, col = "Blue")
text(350, 20, "nSeq = 273")
par(mfrow=c(2,2))
plot(s$seq, s$size, xlab="Sequence Count", ylab="Memory used[B]", main="a. memory used vs sequence count without data mining")
abline(v = 273, col = "Blue")
text(400, 0.6e+09, "nSeq = 273")
plot(t$seq, t$size, xlab="Sequence Count", ylab="Memory used[B]", main="b. memory used vs sequence count with data mining")
abline(v = 273, col = "Blue")
text(400, 600, "nSeq = 273")
plot(s$seq, s$time, xlab="Sequence Count", ylab="Time[s]", main="c. time vs sequence count withount data mining")
abline(v = 273, col = "Blue")
text(400, 300, "nSeq = 273")
plot(t$seq, t$time, xlab="Sequence Count", ylab="Time[s]", main="d. time vs sequence count with data mining")
abline(v = 273, col = "Blue")
text(400, 15, "nSeq = 273")
par(mfrow=c(2,2))
plot(s$seq, s$size, xlab="Sequence Count", ylab="Memory used[B]", main="a. memory used vs sequence count without data mining")
abline(v = 273, col = "Blue")
text(400, 0.6e+09, "Nseq = 273")
plot(t$seq, t$size, xlab="Sequence Count", ylab="Memory used[B]", main="b. memory used vs sequence count with data mining")
abline(v = 273, col = "Blue")
text(400, 600, "Nseq = 273")
plot(s$seq, s$time, xlab="Sequence Count", ylab="Time[s]", main="c. time vs sequence count withount data mining")
abline(v = 273, col = "Blue")
text(400, 300, "Nseq = 273")
plot(t$seq, t$time, xlab="Sequence Count", ylab="Time[s]", main="d. time vs sequence count with data mining")
abline(v = 273, col = "Blue")
text(400, 15, "Nseq = 273")
s = read.table('/Users/tiayyba/Desktop/euBact/ecoprimer_71493.log', header= T)
t = read.table('/Users/tiayyba/Desktop/euBact/ecoprimer_84784.log', header= T)
par(mfrow=c(2,2))
plot(s$seq, s$size, xlab="Sequence Count", ylab="Memory used[B]", main="a. memory used vs sequence count without data mining")
plot(s$seq, s$size*1.5, xlab="Sequence Count", ylab="Memory used[B]", main="a. memory used vs sequence count without data mining")
abline(v = 273, col = "Blue")
text(450, 3e+09, "nSeq = 273")
plot(t$seq, t$size*1.5, xlab="Sequence Count", ylab="Memory used[B]", main="b. memory used vs sequence count with data mining")
abline(v = 273, col = "Blue")
text(450, 1150, "nSeq = 273")
par(mfrow=c(2,2))
plot(s$seq, s$size*1.5, xlab="Sequence Count", ylab="Memory used[B]", main="a. memory used vs sequence count without data mining")
abline(v = 273, col = "Blue")
text(450, 3e+09, "nSeq = 273")
plot(t$seq, t$size*1.5, xlab="Sequence Count", ylab="Memory used[B]", main="b. memory used vs sequence count with data mining")
abline(v = 273, col = "Blue")
text(450, 1150, "nSeq = 273")
par(mfrow=c(2,2))
plot(s$seq, s$size*1.5, xlab="Sequence Count", ylab="Memory used[B]", main="a. memory used vs sequence count without data mining")
abline(v = 273, col = "Blue")
text(450, 1.7e+09, "nSeq = 273")
par(mfrow=c(2,2))
plot(s$seq, s$size*1.5, xlab="Sequence Count", ylab="Memory used[B]", main="a. memory used vs sequence count without data mining")
abline(v = 273, col = "Blue")
text(450, 1.4e+09, "nSeq = 273")
plot(t$seq, t$size*1.5, xlab="Sequence Count", ylab="Memory used[B]", main="b. memory used vs sequence count with data mining")
abline(v = 273, col = "Blue")
text(450, 500, "nSeq = 273")
par(mfrow=c(2,2))
plot(s$seq, s$size*1.5, xlab="Sequence Count", ylab="Memory used[B]", main="a. memory used vs sequence count without data mining")
abline(v = 273, col = "Blue")
text(450, 1.4e+09, "nSeq = 273")
plot(t$seq, t$size*1.5, xlab="Sequence Count", ylab="Memory used[B]", main="b. memory used vs sequence count with data mining")
abline(v = 273, col = "Blue")
text(450, 500, "nSeq = 273")
text(450, 1000, "nSeq = 273")
par(mfrow=c(2,2))
plot(s$seq, s$size*1.5, xlab="Sequence Count", ylab="Memory used[B]", main="a. memory used vs sequence count without data mining")
abline(v = 273, col = "Blue")
text(450, 1.3e+09, "nSeq = 273")
plot(t$seq, t$size*1.5, xlab="Sequence Count", ylab="Memory used[B]", main="b. memory used vs sequence count with data mining")
abline(v = 273, col = "Blue")
text(450, 1000, "nSeq = 273")
plot(s$seq, s$time, xlab="Sequence Count", ylab="Time[s]", main="c. time vs sequence count withount data mining")
abline(v = 273, col = "Blue")
text(450, 700, "nSeq = 273")
plot(t$seq, t$time, xlab="Sequence Count", ylab="Time[s]", main="d. time vs sequence count with data mining")
abline(v = 273, col = "Blue")
text(450, 30, "nSeq = 273")
s = read.table('/Users/tiayyba/Desktop/UsedistRef1.txt', header = T)
s
plot(s$Distance, s$Count)
plot(s$Distance, log(s$Count))
plot(log(s$Distance), log(s$Count))
plot(s$Distance, log(s$Count))
s = read.table('/Users/tiayyba/Desktop/UU_results/UU_F83/plots/F83_distRef1.old.txt', header = T)
plot(s$Distance, log(s$Count))
s = read.table('/Users/tiayyba/Desktop/UsedistRef1.txt', header = T)
plot(s$Distance*100, log(s$Count))
s = read.table('/Users/tiayyba/Desktop/UU_results/UU_F83/plots/F83_distRef1.old.txt', header = T)
plot(s$Distance, log(s$Count))
s = read.table('/Users/tiayyba/Desktop/UsedistRef1.txt', header = T)
plot(s$Distance, log(s$Count))
s = read.table('/Users/tiayyba/Desktop/UU_results/UU_F83/plots/F83_distRef1.old.txt', header = T)
plot(s$Distance, log(s$Count))
s = read.table('/Users/tiayyba/Desktop/UsedistRef1.txt', header = T)
plot(s$Distance, log(s$Count))
plot(s$Distance*1000, log(s$Count))
s = read.table('/Users/tiayyba/Desktop/UsedistRef1.txt', header = T)
plot(s$Distance*1000, log(s$Count))
u = read.table('/Users/tiayyba/Desktop/UU_FdistRef2.txt', header = T)
u
max(u$Count)
max(u$Distance)
plot(u$Distance, u$Count)
plot(log(u$Distance), log(u$Count))
plot(u$Distance, u$Count, log(xy))
plot(u$Distance, u$Count, log=xy)
plot(u$Distance, u$Count, log='xy')
plot(u$Distance+1, u$Count, log='xy')
plot(u$Distance, u$Count, log='xy')
?plot
s = read.table('/Users/tiayyba/Desktop/UU_FdistRef1.txt', header = T)
plot(s$Distance, s$Count)
plot(log(s$Distance), log(s$Count))
plot(s$Distance, s$Count, log='xy')
plot(s$Distance, log(s$Count))
s = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F83/plots/F83_distRef1.old.txt', header = T)
s
sNew = s[order(s$Count),]
sNew
plot(s$Distance, s$Count)
s = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F83/plots/F83_distRef1.old.txt', header = T)
t = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F83/plots/F83_distRef2.old.txt', header = T)
s = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F83/plots/F83_distRef2.old.txt', header = T)
u = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F83/plots/F83_distRef1.old.txt', header = T)
t = data.frame(count=s$Count,uu=u$Distance,cs=s$Distance,col=(s$Distance < u$Distance)+1)
plot(s$Distance,s$Count,log='xy',col=t$col)
par(mfrow=c(1,2))
plot(s$Distance+1,s$Count,log='xy',col=t$col)
plot(u$Distance+1,u$Count,log='xy',col=t$col)
s = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F83/plots/F83_distRef2.old.txt', header = T)
u = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F83/plots/F83_distRef1.old.txt', header = T)
t = data.frame(count=s$Count,uu=u$Distance,cs=s$Distance,col=(s$Distance < u$Distance)+1)
plot(u$Distance+1,u$Count,log='xy',col=t$col)
par(mfrow=c(1,2))
plot(s$Distance+1,s$Count,log='xy',col=t$col)
plot(u$Distance+1,u$Count,log='xy',col=t$col)
s = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F82/plots/F82_distRef1.txt', header = F)
s = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F82/plots/F82_distRef1.txt', header = T)
s = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F82/plots/F82_distRef2.txt', header = T)
u = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F82/plots/F82_distRef1.txt', header = T)
t = data.frame(count=s$Count,uu=u$Distance,cs=s$Distance,col=(s$Distance < u$Distance)+1)
t = data.frame(count=u$Count,uu=u$Distance,cs=s$Distance,col=(s$Distance < u$Distance)+1)
s = read.table('/Users/tiayyba/Desktop/abc'header = F)
s = read.table('/Users/tiayyba/Desktop/abc', header = F)
plot(s)
s
s = read.table('/Users/tiayyba/Desktop/abc', header = F)
s
plot(s$V2, s$V1)
plot(s$V1, s$V2)
plot(s$V1, s$V2, xlab="sequence number", ylab="sequence count")
s = read.table('/Users/tiayyba/Desktop/abc', header = F)
s
s = read.table('/Users/tiayyba/Desktop/abc', header = F)
s
Sr tacg gcta ctag actg
par(mfrow=c(2,2))
plot(s$V1, s$V2, xlab="sequence number", ylab="sequence count", main = "sample: tacg")
plot(s$V1, s$V3, xlab="sequence number", ylab="sequence count", main = "sample: gcta")
plot(s$V1, s$V4, xlab="sequence number", ylab="sequence count", main = "sample: ctag")
plot(s$V1, s$V5, xlab="sequence number", ylab="sequence count", main = "sample: actg")
s = read.table('/Users/tiayyba/Desktop/abc', header = F)
s
plot(s$V1, s$V2, xlab="sequence number", ylab="sequence count", main = "sample: Ranunculus_acris")
Sr tacg gcta ctag actg
par(mfrow=c(2,2))
plot(s$V1, s$V2, xlab="sequence number", ylab="sequence count", main = "sample: Ranunculus_acris")
plot(s$V1, s$V3, xlab="sequence number", ylab="sequence count", main = "sample: Luzula_sudetica")
plot(s$V1, s$V4, xlab="sequence number", ylab="sequence count", main = "sample: Deschampsia_cespitosa")
plot(s$V1, s$V5, xlab="sequence number", ylab="sequence count", main = "sample: Cardamine_pratensis_paludosa")
s = read.table('/Users/tiayyba/Desktop/abc', header = F)
par(mfrow=c(2,2))
plot(s$V1, s$V2, xlab="sequence number", ylab="sequence count", main = "sample 1: Ranunculus_acris")
plot(s$V1, s$V3, xlab="sequence number", ylab="sequence count", main = "sample 2: Luzula_sudetica")
plot(s$V1, s$V4, xlab="sequence number", ylab="sequence count", main = "sample 3: Deschampsia_cespitosa")
plot(s$V1, s$V5, xlab="sequence number", ylab="sequence count", main = "sample 4: Cardamine_pratensis_paludosa")
s = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F83/plots/F83_dist.txt', header = F)
s = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F83/plots/F83_dist.txt', header = F, row.names = 1)
s = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F83/plots/F83_dist.txt', header = F, row.names=1)
s = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F83/plots/F83_dist.txt',row.names=1)
s
plot(s$count, s$uu)
plot(s$uu, s$count)
plot(s$uu, s$count, col = s$col)
plot(s$cs, s$count, col = s$col)
plot(s$cs, s$count, col = s$col, col = (s$uu < s$cs) +1)
plot(s$uu, s$count)
plot(log(s$uu), log(s$count))
plot(s$uu, log(s$count))
plot(log(s$uu), log(s$count))
plot(s$uu, s$count)
plot(s$uu, log(s$count))
plot(s$uu, log(s$count), xlab = "distance from Uncia uncia", ylab = "log(SequenceCount)")
= read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F83/plots/F83_dist.txt', header = T)
s= read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F83/plots/F83_dist.txt', header = T)
s
par(mfrow = c(1,2))
plot(s$cs, s$count, col = s$col)
plot(s$cs, s$count,log = 'xy', col = s$col)
par(mfrow = c(1,2))
plot(s$cs, s$count,log = 'xy', col = s$col)
plot(s$uu, s$count,log = 'xy', col = s$col)
score = (0.407)*0.890*exp(-sqrt(692.33/1.247))
score
score1 = (0.511)*0.904*exp(-sqrt(481/148.5))
score1
score1 = sqrt(481/148.5)
score1
score = sqrt(692.33/1.247)
score
score = (0.407)^2*0.890*sqrt(692.33/1.247)
score
score1 = (0.511)^2*0.904*sqrt(481/148.5)
score1
score = (0.407)*0.890*sqrt(692.33/1.247)
score
score1 = (0.511)*0.904*sqrt(481/148.5)
score1
score = (0.407)*0.890*692.33/sqrt(1.247)
score
score1 = (0.511)*0.904*481/sqrt(148.5)
score1
sum = 714 + 710 + 699
sum
2123/3
1760+3210+4950+2090
12010+3300+220
4400+385+55+495+715+800+450+2200+1540+1210+550
944 - 220
724 - 40
684 -10
364-278
944-657
250+220
7*9
q
quit
s = read.table('metazoas_Mdyn_T50.gv')
s = read.table('metazoas_Mdyn_T50.gv.hist.gv')
t = read.table('test.gv')

BIN
src/Documents/CalculTM.xls Normal file

Binary file not shown.

78
src/Makefile Normal file
View File

@ -0,0 +1,78 @@
EXEC=ecoPrimers
PRIMER_SRC= ecoprimer.c
PRIMER_OBJ= $(patsubst %.c,%.o,$(PRIMER_SRC))
SRCS= $(PRIMER_SRC)
LIB= -lecoprimer -lecoPCR -lthermo -lz -lm
LIBFILE= libecoPCR/libecoPCR.a \
libecoprimer/libecoprimer.a \
libthermo/libthermo.a \
include global.mk
all: $(EXEC)
########
#
# ecoPrimer compilation
#
########
# executable compilation and link
ecoPrimers: $(PRIMER_OBJ) $(LIBFILE)
$(CC) -g $(LDFLAGS) -O5 -m64 -o $@ $< $(LIBPATH) $(LIB)
########
#
# library compilation
#
########
libecoPCR/libecoPCR.a:
$(MAKE) -C libecoPCR
libecoprimer/libecoprimer.a:
$(MAKE) -C libecoprimer
libthermo/libthermo.a:
$(MAKE) -C libthermo
########
#
# project management
#
########
clean:
rm -f *.o
rm -f $(EXEC)
$(MAKE) -C libecoPCR clean
$(MAKE) -C libecoprimer clean
$(MAKE) -C libthermo clean
########
#
# clean for k2 to remove .o and .P files
#
########
k2clean:
rm -f *.o
rm -f *.P
rm -f libecoPCR/*.o
rm -f libecoPCR/*.P
rm -f libecoprimer/*.o
rm -f libecoprimer/*.P
rm -f libthermo/*.o
rm -f libthermo/*.P

BIN
src/ecoPrimers Executable file

Binary file not shown.

32
src/ecoprimer.P Normal file
View File

@ -0,0 +1,32 @@
ecoprimer.o ecoprimer.P : ecoprimer.c libecoprimer/ecoprimer.h /usr/include/inttypes.h \
/usr/include/sys/cdefs.h /usr/include/_types.h \
/usr/include/sys/_types.h /usr/include/machine/_types.h \
/usr/include/i386/_types.h \
/usr/lib/gcc/i686-apple-darwin10/4.2.1/include/stdint.h \
/usr/include/stdlib.h /usr/include/Availability.h \
/usr/include/AvailabilityInternal.h /usr/include/sys/wait.h \
/usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \
/usr/include/machine/signal.h /usr/include/i386/signal.h \
/usr/include/i386/_structs.h /usr/include/sys/_structs.h \
/usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \
/usr/include/sys/resource.h /usr/include/machine/endian.h \
/usr/include/i386/endian.h /usr/include/sys/_endian.h \
/usr/include/libkern/_OSByteOrder.h \
/usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \
/usr/include/machine/types.h /usr/include/i386/types.h \
/usr/include/i386/_types.h /usr/include/stdio.h \
/usr/include/secure/_stdio.h /usr/include/secure/_common.h \
libecoprimer/ecotype.h libecoprimer/../libecoPCR/ecoPCR.h \
libecoprimer/../libthermo/nnparams.h /usr/include/math.h \
/usr/include/architecture/i386/math.h /usr/include/string.h \
/usr/include/secure/_string.h libecoprimer/apat.h \
libecoprimer/libstki.h libecoprimer/debug.h libecoprimer/PrimerSets.h \
libecoprimer/ecoprimer.h libecoprimer/ahocorasick.h \
/usr/include/ctype.h /usr/include/runetype.h /usr/include/getopt.h \
/usr/include/unistd.h /usr/include/sys/unistd.h \
/usr/include/sys/select.h /usr/include/sys/_select.h \
/usr/include/time.h /usr/include/_structs.h /usr/include/sys/time.h \
/usr/include/dlfcn.h \
/usr/lib/gcc/i686-apple-darwin10/4.2.1/include/stdbool.h \
/usr/include/AvailabilityMacros.h libthermo/nnparams.h \
libthermo/thermostats.h libthermo/../libecoprimer/ecoprimer.h

1019
src/ecoprimer.c Executable file

File diff suppressed because it is too large Load Diff

21
src/global.mk Normal file
View File

@ -0,0 +1,21 @@
MACHINE=MAC_OS_X
LIBPATH= -Llibapat -LlibecoPCR -Llibecoprimer -Llibthermo
MAKEDEPEND = gcc -D$(MACHINE) -M $(CPPFLAGS) -o $*.d $<
CC=gcc
CFLAGS= -W -Wall -m64 -g
#CFLAGS= -W -Wall -O5 -m64 -g
#CFLAGS= -W -Wall -O0 -m64 -g
#CFLAGS= -W -Wall -O5 -fast -g
default: all
%.o: %.c
$(CC) -D$(MACHINE) $(CFLAGS) -c -o $@ $<
%.P : %.c
$(MAKEDEPEND)
@sed 's/\($*\)\.o[ :]*/\1.o $@ : /g' < $*.d > $@; \
rm -f $*.d; [ -s $@ ] || rm -f $@
include $(SRCS:.c=.P)

30
src/libecoPCR/Makefile Normal file
View File

@ -0,0 +1,30 @@
SOURCES = ecodna.c \
ecoError.c \
ecoIOUtils.c \
ecoMalloc.c \
ecorank.c \
ecoseq.c \
ecotax.c \
ecofilter.c \
econame.c
SRCS=$(SOURCES)
OBJECTS= $(patsubst %.c,%.o,$(SOURCES))
LIBFILE= libecoPCR.a
RANLIB= ranlib
include ../global.mk
all: $(LIBFILE)
clean:
rm -rf $(OBJECTS) $(LIBFILE)
$(LIBFILE): $(OBJECTS)
ar -cr $@ $?
$(RANLIB) $@

26
src/libecoPCR/ecoError.c Normal file
View File

@ -0,0 +1,26 @@
#include "ecoPCR.h"
#include <stdio.h>
#include <stdlib.h>
/*
* print the message given as argument and exit the program
* @param error error number
* @param message the text explaining what's going on
* @param filename the file source where the program failed
* @param linenumber the line where it has failed
* filename and linenumber are written at pre-processing
* time by a macro
*/
void ecoError(int32_t error,
const char* message,
const char * filename,
int linenumber)
{
fprintf(stderr,"Error %d in file %s line %d : %s\n",
error,
filename,
linenumber,
message);
abort();
}

122
src/libecoPCR/ecoIOUtils.c Normal file
View File

@ -0,0 +1,122 @@
#include "ecoPCR.h"
#include <stdio.h>
#include <stdlib.h>
#define SWAPINT32(x) ((((x) << 24) & 0xFF000000) | (((x) << 8) & 0xFF0000) | \
(((x) >> 8) & 0xFF00) | (((x) >> 24) & 0xFF))
int32_t is_big_endian()
{
int32_t i=1;
return (int32_t)((char*)&i)[0];
}
int32_t swap_int32_t(int32_t i)
{
return SWAPINT32(i);
}
/**
* Read part of the file
* @param *f the database
* @param recordSize the size to be read
*
* @return buffer
*/
void *read_ecorecord(FILE *f,int32_t *recordSize)
{
static void *buffer =NULL;
int32_t buffersize=0;
int32_t read;
if (!recordSize)
ECOERROR(ECO_ASSERT_ERROR,
"recordSize cannot be NULL");
read = fread(recordSize,
1,
sizeof(int32_t),
f);
if (feof(f))
return NULL;
if (read != sizeof(int32_t))
ECOERROR(ECO_IO_ERROR,"Reading record size error");
if (is_big_endian())
*recordSize=swap_int32_t(*recordSize);
if (buffersize < *recordSize)
{
if (buffer)
buffer = ECOREALLOC(buffer,*recordSize,
"Increase size of record buffer");
else
buffer = ECOMALLOC(*recordSize,
"Allocate record buffer");
}
read = fread(buffer,
1,
*recordSize,
f);
if (read != *recordSize)
ECOERROR(ECO_IO_ERROR,"Reading record data error");
return buffer;
};
/**
* Open the database and check it's readable
* @param filename name of the database (.sdx, .rdx, .tbx)
* @param sequencecount buffer - pointer to variable storing the number of occurence
* @param abort_on_open_error boolean to define the behaviour in case of error
* while opening the database
* @return FILE type
**/
FILE *open_ecorecorddb(const char *filename,
int32_t *sequencecount,
int32_t abort_on_open_error)
{
FILE *f;
int32_t read;
f = fopen(filename,"rb");
if (!f)
{
if (abort_on_open_error)
ECOERROR(ECO_IO_ERROR,"Cannot open file");
else
{
*sequencecount=0;
return NULL;
}
}
read = fread(sequencecount,
1,
sizeof(int32_t),
f);
if (read != sizeof(int32_t))
ECOERROR(ECO_IO_ERROR,"Reading record size error");
if (is_big_endian())
*sequencecount=swap_int32_t(*sequencecount);
return f;
}

96
src/libecoPCR/ecoMalloc.c Normal file
View File

@ -0,0 +1,96 @@
#include "ecoPCR.h"
#include <stdlib.h>
static int eco_log_malloc = 0;
static size_t eco_amount_malloc=0;
static size_t eco_chunk_malloc=0;
void eco_trace_memory_allocation()
{
eco_log_malloc=1;
}
void eco_untrace_memory_allocation()
{
eco_log_malloc=0;
}
void ecoMallocedMemory()
{
//eco_amount_malloc;
}
void *eco_malloc(int64_t chunksize,
const char *error_message,
const char *filename,
int32_t line)
{
void * chunk;
chunk = calloc(1,chunksize);
if (!chunk)
ecoError(ECO_MEM_ERROR,error_message,filename,line);
eco_chunk_malloc++;
if (eco_log_malloc)
fprintf(stderr,
"Memory segment located at %p of size %d is allocated (file : %s [%d])",
chunk,
chunksize,
filename,
line);
return chunk;
}
void *eco_realloc(void *chunk,
int64_t newsize,
const char *error_message,
const char *filename,
int32_t line)
{
void *newchunk;
newchunk = realloc(chunk,newsize);
if (!newchunk)
{
ecoError(ECO_MEM_ERROR,error_message,filename,line);
fprintf(stderr,"Requested memory : %d\n",newsize);
}
if (!chunk)
eco_chunk_malloc++;
if (eco_log_malloc)
fprintf(stderr,
"Old memory segment %p is reallocated at %p with a size of %d (file : %s [%d])",
chunk,
newchunk,
newsize,
filename,
line);
return newchunk;
}
void eco_free(void *chunk,
const char *error_message,
const char *filename,
int32_t line)
{
free(chunk);
if (eco_log_malloc)
fprintf(stderr,
"Memory segment %p is released => %s (file : %s [%d])",
chunk,
error_message,
filename,
line);
eco_chunk_malloc--;
}

270
src/libecoPCR/ecoPCR.h Normal file
View File

@ -0,0 +1,270 @@
#ifndef ECOPCR_H_
#define ECOPCR_H_
#include <stdio.h>
#include <inttypes.h>
/*****************************************************
*
* Data type declarations
*
*****************************************************/
/*
*
* Sequence types
*
*/
typedef struct {
int32_t taxid;
char AC[20];
int32_t DE_length;
int32_t SQ_length;
int32_t CSQ_length; /*what is this CSQ_length ? */
char data[1];
} ecoseqformat_t;
typedef struct {
int32_t taxid;
int32_t SQ_length;
int32_t isexample;
char *AC;
char *DE;
char *SQ;
int32_t ranktaxonid;/*TR: taxon id to which the sequence belongs*/
} ecoseq_t, *pecoseq_t;
/*
*
* Taxonomy taxon types
*
*/
typedef struct {
int32_t taxid;
int32_t rank;
int32_t parent;
int32_t namelength;
char name[1];
} ecotxformat_t;
typedef struct ecotxnode {
int32_t taxid;
int32_t rank;
struct ecotxnode *parent;
char *name;
} ecotx_t;
typedef struct {
int32_t count;
ecotx_t taxon[1];
} ecotxidx_t;
/*
*
* Taxonomy rank types
*
*/
typedef struct {
int32_t count;
char* label[1];
} ecorankidx_t;
/*
*
* Taxonomy name types
*
*/
typedef struct {
int32_t is_scientificname;
int32_t namelength;
int32_t classlength;
int32_t taxid;
char names[1];
} econameformat_t;
typedef struct {
char *name;
char *classname;
int32_t is_scientificname;
struct ecotxnode *taxon;
} econame_t;
typedef struct {
int32_t count;
econame_t names[1];
} econameidx_t;
typedef struct {
ecorankidx_t *ranks;
econameidx_t *names;
ecotxidx_t *taxons;
} ecotaxonomy_t;
/*****************************************************
*
* Function declarations
*
*****************************************************/
/*
*
* Low level system functions
*
*/
int32_t is_big_endian();
int32_t swap_int32_t(int32_t);
void *eco_malloc(int64_t chunksize,
const char *error_message,
const char *filename,
int32_t line);
void *eco_realloc(void *chunk,
int64_t chunksize,
const char *error_message,
const char *filename,
int32_t line);
void eco_free(void *chunk,
const char *error_message,
const char *filename,
int32_t line);
void eco_trace_memory_allocation();
void eco_untrace_memory_allocation();
#define ECOMALLOC(size,error_message) \
eco_malloc((size),(error_message),__FILE__,__LINE__)
#define ECOREALLOC(chunk,size,error_message) \
eco_realloc((chunk),(size),(error_message),__FILE__,__LINE__)
#define ECOFREE(chunk,error_message) \
eco_free((chunk),(error_message),__FILE__,__LINE__)
/*
*
* Error managment
*
*/
void ecoError(int32_t,const char*,const char *,int);
#define ECOERROR(code,message) ecoError((code),(message),__FILE__,__LINE__)
#define ECO_IO_ERROR (1)
#define ECO_MEM_ERROR (2)
#define ECO_ASSERT_ERROR (3)
#define ECO_NOTFOUND_ERROR (4)
/*
*
* Low level Disk access functions
*
*/
FILE *open_ecorecorddb(const char *filename,
int32_t *sequencecount,
int32_t abort_on_open_error);
void *read_ecorecord(FILE *,int32_t *recordSize);
/*
* Read function in internal binary format
*/
FILE *open_ecoseqdb(const char *filename,
int32_t *sequencecount);
ecoseq_t *readnext_ecoseq(FILE *);
ecorankidx_t *read_rankidx(const char *filename);
econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy);
/**
* Read taxonomy data as formated by the ecoPCRFormat.py script.
*
* This function is normaly uses internaly by the read_taxonomy
* function and should not be called directly.
*
* @arg filename path to the *.tdx file of the reformated db
*
* @return pointer to a taxonomy index structure
*/
ecotxidx_t *read_taxonomyidx(const char *filename);
ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName);
ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy, int32_t taxid);
ecotx_t *eco_findtaxonatrank(ecotx_t *taxon, int32_t rankidx);
int eco_isundertaxon(ecotx_t *taxon, int other_taxid);
ecoseq_t *ecoseq_iterator(const char *prefix);
ecoseq_t *new_ecoseq();
int32_t delete_ecoseq(ecoseq_t *);
ecoseq_t *new_ecoseq_with_data( char *AC,
char *DE,
char *SQ,
int32_t taxid
);
int32_t delete_taxon(ecotx_t *taxon);
int32_t delete_taxonomy(ecotxidx_t *index);
int32_t rank_index(const char* label,ecorankidx_t* ranks);
//int32_t delete_apatseq(SeqPtr pseq);
//PatternPtr buildPattern(const char *pat, int32_t error_max);
//PatternPtr complementPattern(PatternPtr pat);
//
//SeqPtr ecoseq2apatseq(ecoseq_t *in,SeqPtr out,int32_t circular);
char *ecoComplementPattern(char *nucAcSeq);
char *ecoComplementSequence(char *nucAcSeq);
char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end);
ecotx_t *eco_getspecies(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
ecotx_t *eco_getgenus(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
ecotx_t *eco_getfamily(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
ecotx_t *eco_getkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy);
int eco_is_taxid_ignored(int32_t *ignored_taxid, int32_t tab_len, int32_t taxid);
int eco_is_taxid_included(ecotaxonomy_t *taxonomy, int32_t *included_taxid, int32_t tab_len, int32_t taxid);
#endif /*ECOPCR_H_*/

202
src/libecoPCR/ecoapat.c Normal file
View File

@ -0,0 +1,202 @@
#include "../libapat/libstki.h"
#include "../libapat/apat.h"
#include "ecoPCR.h"
#include <string.h>
static void EncodeSequence(SeqPtr seq);
static void UpperSequence(char *seq);
/* -------------------------------------------- */
/* uppercase sequence */
/* -------------------------------------------- */
#define IS_LOWER(c) (((c) >= 'a') && ((c) <= 'z'))
#define TO_UPPER(c) ((c) - 'a' + 'A')
void UpperSequence(char *seq)
{
char *cseq;
for (cseq = seq ; *cseq ; cseq++)
if (IS_LOWER(*cseq))
*cseq = TO_UPPER(*cseq);
}
#undef IS_LOWER
#undef TO_UPPER
/* -------------------------------------------- */
/* encode sequence */
/* IS_UPPER is slightly faster than isupper */
/* -------------------------------------------- */
#define IS_UPPER(c) (((c) >= 'A') && ((c) <= 'Z'))
void EncodeSequence(SeqPtr seq)
{
int i;
UInt8 *data;
char *cseq;
data = seq->data;
cseq = seq->cseq;
while (*cseq) {
*data = (IS_UPPER(*cseq) ? *cseq - 'A' : 0x0);
data++;
cseq++;
}
for (i=0,cseq=seq->cseq;i < seq->circular; i++,cseq++,data++)
*data = (IS_UPPER(*cseq) ? *cseq - 'A' : 0x0);
for (i = 0 ; i < MAX_PATTERN ; i++)
seq->hitpos[i]->top = seq->hiterr[i]->top = 0;
}
#undef IS_UPPER
SeqPtr ecoseq2apatseq(ecoseq_t *in,SeqPtr out,int32_t circular)
{
int i;
if (!out)
{
out = ECOMALLOC(sizeof(Seq),
"Error in Allocation of a new Seq structure");
for (i = 0 ; i < MAX_PATTERN ; i++)
{
if (! (out->hitpos[i] = NewStacki(kMinStackiSize)))
ECOERROR(ECO_MEM_ERROR,"Error in hit stack Allocation");
if (! (out->hiterr[i] = NewStacki(kMinStackiSize)))
ECOERROR(ECO_MEM_ERROR,"Error in error stack Allocation");
}
}
out->name = in->AC;
out->seqsiz = out->seqlen = in->SQ_length;
out->circular = circular;
if (!out->data)
{
out->data = ECOMALLOC((out->seqlen+circular) *sizeof(UInt8),
"Error in Allocation of a new Seq data member");
out->datsiz= out->seqlen+circular;
}
else if ((out->seqlen +circular) >= out->datsiz)
{
out->data = ECOREALLOC(out->data,(out->seqlen+circular),
"Error during Seq data buffer realloc");
out->datsiz= out->seqlen+circular;
}
out->cseq = in->SQ;
EncodeSequence(out);
return out;
}
int32_t delete_apatseq(SeqPtr pseq)
{
int i;
if (pseq) {
if (pseq->data)
ECOFREE(pseq->data,"Freeing sequence data buffer");
for (i = 0 ; i < MAX_PATTERN ; i++) {
if (pseq->hitpos[i]) FreeStacki(pseq->hitpos[i]);
if (pseq->hiterr[i]) FreeStacki(pseq->hiterr[i]);
}
ECOFREE(pseq,"Freeing apat sequence structure");
return 0;
}
return 1;
}
/*
PatternPtr buildPattern(const char *pat, int32_t error_max)
{
PatternPtr pattern;
int32_t patlen;
pattern = ECOMALLOC(sizeof(Pattern),
"Error in pattern allocation");
pattern->ok = Vrai;
pattern->hasIndel= Faux;
pattern->maxerr = error_max;
patlen = strlen(pat);
pattern->cpat = ECOMALLOC(sizeof(char)*patlen+1,
"Error in sequence pattern allocation");
strncpy(pattern->cpat,pat,patlen);
pattern->cpat[patlen]=0;
UpperSequence(pattern->cpat);
if (!CheckPattern(pattern))
ECOERROR(ECO_ASSERT_ERROR,"Error in pattern checking");
if (! EncodePattern(pattern, dna))
ECOERROR(ECO_ASSERT_ERROR,"Error in pattern encoding");
if (! CreateS(pattern, ALPHA_LEN))
ECOERROR(ECO_ASSERT_ERROR,"Error in pattern compiling");
return pattern;
}
PatternPtr complementPattern(PatternPtr pat)
{
PatternPtr pattern;
pattern = ECOMALLOC(sizeof(Pattern),
"Error in pattern allocation");
pattern->ok = Vrai;
pattern->hasIndel= pat->hasIndel;
pattern->maxerr = pat->maxerr;
pattern->patlen = pat->patlen;
pattern->cpat = ECOMALLOC(sizeof(char)*(strlen(pat->cpat)+1),
"Error in sequence pattern allocation");
strcpy(pattern->cpat,pat->cpat);
ecoComplementPattern(pattern->cpat);
if (!CheckPattern(pattern))
ECOERROR(ECO_ASSERT_ERROR,"Error in pattern checking");
if (! EncodePattern(pattern, dna))
ECOERROR(ECO_ASSERT_ERROR,"Error in pattern encoding");
if (! CreateS(pattern, ALPHA_LEN))
ECOERROR(ECO_ASSERT_ERROR,"Error in pattern compiling");
return pattern;
}
*/

153
src/libecoPCR/ecodna.c Normal file
View File

@ -0,0 +1,153 @@
#include <string.h>
#include "ecoPCR.h"
/*
* @doc: DNA alphabet (IUPAC)
*/
#define LX_BIO_DNA_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZ#![]"
/*
* @doc: complementary DNA alphabet (IUPAC)
*/
#define LX_BIO_CDNA_ALPHA "TVGHEFCDIJMLKNOPQYSAABWXRZ#!]["
static char sNuc[] = LX_BIO_DNA_ALPHA;
static char sAnuc[] = LX_BIO_CDNA_ALPHA;
static char LXBioBaseComplement(char nucAc);
static char *LXBioSeqComplement(char *nucAcSeq);
static char *reverseSequence(char *str,char isPattern);
/* ---------------------------- */
char LXBioBaseComplement(char nucAc)
{
char *c;
if ((c = strchr(sNuc, nucAc)))
return sAnuc[(c - sNuc)];
else
return nucAc;
}
/* ---------------------------- */
char *LXBioSeqComplement(char *nucAcSeq)
{
char *s;
for (s = nucAcSeq ; *s ; s++)
*s = LXBioBaseComplement(*s);
return nucAcSeq;
}
char *reverseSequence(char *str,char isPattern)
{
char *sb, *se, c;
if (! str)
return str;
sb = str;
se = str + strlen(str) - 1;
while(sb <= se) {
c = *sb;
*sb++ = *se;
*se-- = c;
}
sb = str;
se = str + strlen(str) - 1;
if (isPattern)
for (;sb < se; sb++)
{
if (*sb=='#')
{
if (((se - sb) > 2) && (*(sb+2)=='!'))
{
*sb='!';
sb+=2;
*sb='#';
}
else
{
*sb=*(sb+1);
sb++;
*sb='#';
}
}
else if (*sb=='!')
{
*sb=*(sb-1);
*(sb-1)='!';
}
}
return str;
}
char *ecoComplementPattern(char *nucAcSeq)
{
return reverseSequence(LXBioSeqComplement(nucAcSeq),1);
}
char *ecoComplementSequence(char *nucAcSeq)
{
return reverseSequence(LXBioSeqComplement(nucAcSeq),0);
}
char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end)
{
static char *buffer = NULL;
static int32_t buffSize= 0;
int32_t length;
if (begin < end)
{
length = end - begin;
if (length >= buffSize)
{
buffSize = length+1;
if (buffer)
buffer=ECOREALLOC(buffer,buffSize,
"Error in reallocating sub sequence buffer");
else
buffer=ECOMALLOC(buffSize,
"Error in allocating sub sequence buffer");
}
strncpy(buffer,nucAcSeq + begin,length);
buffer[length]=0;
}
else
{
length = end + strlen(nucAcSeq) - begin;
if (length >= buffSize)
{
buffSize = length+1;
if (buffer)
buffer=ECOREALLOC(buffer,buffSize,
"Error in reallocating sub sequence buffer");
else
buffer=ECOMALLOC(buffSize,
"Error in allocating sub sequence buffer");
}
strncpy(buffer,nucAcSeq+begin,length - end);
strncpy(buffer+(length-end),nucAcSeq ,end);
buffer[length]=0;
}
return buffer;
}

20
src/libecoPCR/ecofilter.c Normal file
View File

@ -0,0 +1,20 @@
#include "ecoPCR.h"
int eco_is_taxid_included( ecotaxonomy_t *taxonomy,
int32_t *restricted_taxid,
int32_t tab_len,
int32_t taxid)
{
int i;
ecotx_t *taxon;
taxon = eco_findtaxonbytaxid(taxonomy, taxid);
for (i=0; i < tab_len; i++)
if ( (taxon->taxid == restricted_taxid[i]) ||
(eco_isundertaxon(taxon, restricted_taxid[i])) )
return 1;
return 0;
}

61
src/libecoPCR/econame.c Normal file
View File

@ -0,0 +1,61 @@
#include "ecoPCR.h"
#include <string.h>
#include <stdlib.h>
static econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy);
econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy)
{
int32_t count;
FILE *f;
econameidx_t *indexname;
int32_t i;
f = open_ecorecorddb(filename,&count,1);
indexname = (econameidx_t*) ECOMALLOC(sizeof(econameidx_t) + sizeof(econame_t) * (count-1),"Allocate names");
indexname->count=count;
for (i=0; i < count; i++){
readnext_econame(f,(indexname->names)+i,taxonomy);
}
return indexname;
}
econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy)
{
econameformat_t *raw;
int32_t rs;
raw = read_ecorecord(f,&rs);
if (!raw)
return NULL;
if (is_big_endian())
{
raw->is_scientificname = swap_int32_t(raw->is_scientificname);
raw->namelength = swap_int32_t(raw->namelength);
raw->classlength = swap_int32_t(raw->classlength);
raw->taxid = swap_int32_t(raw->taxid);
}
name->is_scientificname=raw->is_scientificname;
name->name = ECOMALLOC((raw->namelength+1) * sizeof(char),"Allocate name");
strncpy(name->name,raw->names,raw->namelength);
name->name[raw->namelength]=0;
name->classname = ECOMALLOC((raw->classlength+1) * sizeof(char),"Allocate classname");
strncpy(name->classname,(raw->names+raw->namelength),raw->classlength);
name->classname[raw->classlength]=0;
name->taxon = taxonomy->taxons->taxon + raw->taxid;
return name;
}

52
src/libecoPCR/ecorank.c Normal file
View File

@ -0,0 +1,52 @@
#include "ecoPCR.h"
#include <string.h>
#include <stdlib.h>
static int compareRankLabel(const void *label1, const void *label2);
ecorankidx_t *read_rankidx(const char *filename)
{
int32_t count;
FILE *f;
ecorankidx_t *index;
int32_t i;
int32_t rs;
char *buffer;
f = open_ecorecorddb(filename,&count,1);
index = (ecorankidx_t*) ECOMALLOC(sizeof(ecorankidx_t) + sizeof(char*) * (count-1),
"Allocate rank index");
index->count=count;
for (i=0; i < count; i++)
{
buffer = read_ecorecord(f,&rs);
index->label[i]=(char*) ECOMALLOC(rs+1,
"Allocate rank label");
strncpy(index->label[i],buffer,rs);
}
return index;
}
int32_t rank_index(const char* label,ecorankidx_t* ranks)
{
char **rep;
rep = bsearch(label,ranks->label,ranks->count,sizeof(char*),compareRankLabel);
if (rep)
return rep-ranks->label;
else
ECOERROR(ECO_NOTFOUND_ERROR,"Rank label not found");
return -1;
}
int compareRankLabel(const void *label1, const void *label2)
{
return strcmp((const char*)label1,*(const char**)label2);
}

233
src/libecoPCR/ecoseq.c Normal file
View File

@ -0,0 +1,233 @@
#include "ecoPCR.h"
#include <stdlib.h>
#include <string.h>
#include <zlib.h>
#include <string.h>
#include <stdio.h>
static FILE *open_seqfile(const char *prefix,int32_t index);
ecoseq_t *new_ecoseq()
{
void *tmp;
tmp = ECOMALLOC(sizeof(ecoseq_t),"Allocate new ecoseq structure");
return tmp;
}
int32_t delete_ecoseq(ecoseq_t * seq)
{
if (seq)
{
if (seq->AC)
ECOFREE(seq->AC,"Free sequence AC");
if (seq->DE)
ECOFREE(seq->DE,"Free sequence DE");
if (seq->SQ)
ECOFREE(seq->SQ,"Free sequence SQ");
ECOFREE(seq,"Free sequence structure");
return 0;
}
return 1;
}
ecoseq_t *new_ecoseq_with_data( char *AC,
char *DE,
char *SQ,
int32_t taxid_idx
)
{
ecoseq_t *tmp;
int32_t lstr;
tmp = new_ecoseq();
tmp->taxid=taxid_idx;
if (AC)
{
lstr =strlen(AC);
tmp->AC=ECOMALLOC((lstr+1) * sizeof(char),
"Allocate sequence accession");
strcpy(tmp->AC,AC);
}
if (DE)
{
lstr =strlen(DE);
tmp->DE=ECOMALLOC((lstr+1) * sizeof(char),
"Allocate sequence definition");
strcpy(tmp->DE,DE);
}
if (SQ)
{
lstr =strlen(SQ);
tmp->SQ=ECOMALLOC((lstr+1) * sizeof(char),
"Allocate sequence data");
strcpy(tmp->SQ,SQ);
}
tmp->isexample=1;
return tmp;
}
/**
* ?? used ??
**/
FILE *open_ecoseqdb(const char *filename,
int32_t *sequencecount)
{
return open_ecorecorddb(filename,sequencecount,1);
}
ecoseq_t *readnext_ecoseq(FILE *f)
{
char *compressed=NULL;
ecoseqformat_t *raw;
ecoseq_t *seq;
int32_t comp_status;
unsigned long int seqlength;
int32_t rs;
char *c;
int32_t i;
raw = read_ecorecord(f,&rs);
if (!raw)
return NULL;
if (is_big_endian())
{
raw->CSQ_length = swap_int32_t(raw->CSQ_length);
raw->DE_length = swap_int32_t(raw->DE_length);
raw->SQ_length = swap_int32_t(raw->SQ_length);
raw->taxid = swap_int32_t(raw->taxid);
}
seq = new_ecoseq();
seq->taxid = raw->taxid;
seq->AC = ECOMALLOC(strlen(raw->AC) +1,
"Allocate Sequence Accesion number");
strncpy(seq->AC,raw->AC,strlen(raw->AC));
seq->DE = ECOMALLOC(raw->DE_length+1,
"Allocate Sequence definition");
strncpy(seq->DE,raw->data,raw->DE_length);
seqlength = seq->SQ_length = raw->SQ_length;
compressed = raw->data + raw->DE_length;
seq->SQ = ECOMALLOC(seqlength+1,
"Allocate sequence buffer");
seq->isexample=1;
comp_status = uncompress((unsigned char*)seq->SQ,
&seqlength,
(unsigned char*)compressed,
raw->CSQ_length);
if (comp_status != Z_OK)
ECOERROR(ECO_IO_ERROR,"I cannot uncompress sequence data");
for (c=seq->SQ,i=0;i<seqlength;c++,i++)
*c=toupper(*c);
// fprintf(stderr,"seq name : %30s seq size : %d\n",seq->DE,seq->SQ_length);
return seq;
}
/**
* Open the sequences database (.sdx file)
* @param prefix name of the database (radical without extension)
* @param index integer
*
* @return file object
*/
FILE *open_seqfile(const char *prefix,int32_t index)
{
char filename_buffer[1024];
int32_t filename_length;
FILE *input;
int32_t seqcount;
filename_length = snprintf(filename_buffer,
1023,
"%s_%03d.sdx",
prefix,
index);
if (filename_length >= 1024)
ECOERROR(ECO_ASSERT_ERROR,"file name is too long");
filename_buffer[filename_length]=0;
input=open_ecorecorddb(filename_buffer,&seqcount,0);
if (input)
fprintf(stderr,"# Reading file %s containing %d sequences...\n",
filename_buffer,
seqcount);
return input;
}
ecoseq_t *ecoseq_iterator(const char *prefix)
{
static FILE *current_seq_file= NULL;
static int32_t current_file_idx = 1;
static char current_prefix[1024];
ecoseq_t *seq;
if (prefix)
{
current_file_idx = 1;
if (current_seq_file)
fclose(current_seq_file);
strncpy(current_prefix,prefix,1023);
current_prefix[1024]=0;
current_seq_file = open_seqfile(current_prefix,
current_file_idx);
if (!current_seq_file)
return NULL;
}
seq = readnext_ecoseq(current_seq_file);
if (!seq && feof(current_seq_file))
{
current_file_idx++;
fclose(current_seq_file);
current_seq_file = open_seqfile(current_prefix,
current_file_idx);
if (current_seq_file)
seq = readnext_ecoseq(current_seq_file);
}
return seq;
}

329
src/libecoPCR/ecotax.c Normal file
View File

@ -0,0 +1,329 @@
#include "ecoPCR.h"
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
static ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon);
/**
* Open the taxonomy database
* @param pointer to the database (.tdx file)
* @return a ecotxidx_t structure
*/
ecotxidx_t *read_taxonomyidx(const char *filename)
{
int32_t count;
FILE *f;
ecotxidx_t *index;
int32_t i;
f = open_ecorecorddb(filename,&count,1);
index = (ecotxidx_t*) ECOMALLOC(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count-1),
"Allocate taxonomy");
index->count=count;
for (i=0; i < count; i++){
readnext_ecotaxon(f,&(index->taxon[i]));
index->taxon[i].parent=index->taxon + (size_t)index->taxon[i].parent;
}
return index;
}
int32_t delete_taxonomy(ecotxidx_t *index)
{
int32_t i;
if (index)
{
for (i=0; i< index->count; i++)
if (index->taxon[i].name)
ECOFREE(index->taxon[i].name,"Free scientific name");
ECOFREE(index,"Free Taxonomy");
return 0;
}
return 1;
}
int32_t delete_taxon(ecotx_t *taxon)
{
if (taxon)
{
if (taxon->name)
ECOFREE(taxon->name,"Free scientific name");
ECOFREE(taxon,"Free Taxon");
return 0;
}
return 1;
}
/**
* Read the database for a given taxon a save the data
* into the taxon structure(if any found)
* @param *f pointer to FILE type returned by fopen
* @param *taxon pointer to the structure
*
* @return a ecotx_t structure if any taxon found else NULL
*/
ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon)
{
ecotxformat_t *raw;
int32_t rs;
raw = read_ecorecord(f,&rs);
if (!raw)
return NULL;
if (is_big_endian())
{
raw->namelength = swap_int32_t(raw->namelength);
raw->parent = swap_int32_t(raw->parent);
raw->rank = swap_int32_t(raw->rank);
raw->taxid = swap_int32_t(raw->taxid);
}
taxon->parent = (ecotx_t*)(size_t)raw->parent;
taxon->taxid = raw->taxid;
taxon->rank = raw->rank;
taxon->name = ECOMALLOC((raw->namelength+1) * sizeof(char),
"Allocate taxon scientific name");
strncpy(taxon->name,raw->name,raw->namelength);
return taxon;
}
ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName)
{
ecotaxonomy_t *tax;
char *filename;
int buffsize;
tax = ECOMALLOC(sizeof(ecotaxonomy_t),
"Allocate taxonomy structure");
buffsize = strlen(prefix)+10;
filename = ECOMALLOC(buffsize,
"Allocate filename");
snprintf(filename,buffsize,"%s.rdx",prefix);
tax->ranks = read_rankidx(filename);
snprintf(filename,buffsize,"%s.tdx",prefix);
tax->taxons = read_taxonomyidx(filename);
if (readAlternativeName)
{
snprintf(filename,buffsize,"%s.ndx",prefix);
tax->names=read_nameidx(filename,tax);
}
else
tax->names=NULL;
return tax;
}
int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy)
{
if (taxonomy)
{
if (taxonomy->ranks)
ECOFREE(taxonomy->ranks,"Free rank index");
if (taxonomy->taxons)
ECOFREE(taxonomy->taxons,"Free taxon index");
ECOFREE(taxonomy,"Free taxonomy structure");
return 0;
}
return 1;
}
ecotx_t *eco_findtaxonatrank(ecotx_t *taxon,
int32_t rankidx)
{
ecotx_t *current_taxon;
ecotx_t *next_taxon;
current_taxon = taxon;
next_taxon = current_taxon->parent;
while ((current_taxon!=next_taxon) && // I' am the root node
(current_taxon->rank!=rankidx))
{
current_taxon = next_taxon;
next_taxon = current_taxon->parent;
}
if (current_taxon->rank==rankidx)
return current_taxon;
else
return NULL;
}
/**
* Get back information concerning a taxon from a taxonomic id
* @param *taxonomy the taxonomy database
* @param taxid the taxonomic id
*
* @result a ecotx_t structure containing the taxonimic information
**/
ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy,
int32_t taxid)
{
ecotx_t *current_taxon;
int32_t taxoncount;
int32_t i;
taxoncount=taxonomy->taxons->count;
for (current_taxon=taxonomy->taxons->taxon,
i=0;
i < taxoncount;
i++,
current_taxon++){
if (current_taxon->taxid==taxid){
return current_taxon;
}
}
return (ecotx_t*)NULL;
}
/**
* Find out if taxon is son of other taxon (identified by its taxid)
* @param *taxon son taxon
* @param parent_taxid taxonomic id of the other taxon
*
* @return 1 is the other taxid math a parent taxid, else 0
**/
int eco_isundertaxon(ecotx_t *taxon,
int other_taxid)
{
ecotx_t *next_parent;
next_parent = taxon->parent;
while ( (other_taxid != next_parent->taxid) &&
(strcmp(next_parent->name, "root")) )
{
next_parent = next_parent->parent;
}
if (other_taxid == next_parent->taxid)
return 1;
else
return 0;
}
ecotx_t *eco_getspecies(ecotx_t *taxon,
ecotaxonomy_t *taxonomy)
{
static ecotaxonomy_t *tax=NULL;
static int32_t rankindex=-1;
if (taxonomy && tax!=taxonomy)
{
rankindex = rank_index("species",taxonomy->ranks);
tax=taxonomy;
}
if (!tax || rankindex < 0)
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
return eco_findtaxonatrank(taxon,rankindex);
}
ecotx_t *eco_getgenus(ecotx_t *taxon,
ecotaxonomy_t *taxonomy)
{
static ecotaxonomy_t *tax=NULL;
static int32_t rankindex=-1;
if (taxonomy && tax!=taxonomy)
{
rankindex = rank_index("genus",taxonomy->ranks);
tax=taxonomy;
}
if (!tax || rankindex < 0)
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
return eco_findtaxonatrank(taxon,rankindex);
}
ecotx_t *eco_getfamily(ecotx_t *taxon,
ecotaxonomy_t *taxonomy)
{
static ecotaxonomy_t *tax=NULL;
static int32_t rankindex=-1;
if (taxonomy && tax!=taxonomy)
{
rankindex = rank_index("family",taxonomy->ranks);
tax=taxonomy;
}
if (!tax || rankindex < 0)
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
return eco_findtaxonatrank(taxon,rankindex);
}
ecotx_t *eco_getkingdom(ecotx_t *taxon,
ecotaxonomy_t *taxonomy)
{
static ecotaxonomy_t *tax=NULL;
static int32_t rankindex=-1;
if (taxonomy && tax!=taxonomy)
{
rankindex = rank_index("kingdom",taxonomy->ranks);
tax=taxonomy;
}
if (!tax || rankindex < 0)
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
return eco_findtaxonatrank(taxon,rankindex);
}
ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,
ecotaxonomy_t *taxonomy)
{
static ecotaxonomy_t *tax=NULL;
static int32_t rankindex=-1;
if (taxonomy && tax!=taxonomy)
{
rankindex = rank_index("superkingdom",taxonomy->ranks);
tax=taxonomy;
}
if (!tax || rankindex < 0)
ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined");
return eco_findtaxonatrank(taxon,rankindex);
}

39
src/libecoprimer/Makefile Normal file
View File

@ -0,0 +1,39 @@
SOURCES = goodtaxon.c \
readdnadb.c \
smothsort.c \
sortword.c \
hashsequence.c \
strictprimers.c \
aproxpattern.c \
merge.c \
queue.c \
libstki.c \
sortmatch.c \
pairtree.c \
pairs.c \
taxstats.c \
apat_search.c \
filtering.c \
PrimerSets.c \
ahocorasick.c
SRCS=$(SOURCES)
OBJECTS= $(patsubst %.c,%.o,$(SOURCES))
LIBFILE= libecoprimer.a
RANLIB= ranlib
include ../global.mk
all: $(LIBFILE)
clean:
rm -rf $(OBJECTS) $(LIBFILE)
$(LIBFILE): $(OBJECTS)
ar -cr $@ $?
$(RANLIB) $@

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,58 @@
#ifndef PRIMERSETS_H_
#define PRIMERSETS_H_
#include "ecoprimer.h"
#define PRIMERS_IN_SET_COUNT 10
typedef struct {
int *set_wellIdentifiedTaxa;
int32_t set_pairs[PRIMERS_IN_SET_COUNT];
float set_specificity;
float set_coverage;
float set_lmean;
float set_lcov;
float set_score;
int32_t set_intaxa;
int32_t set_wi_cnt;
}pairset;
typedef struct{
ppair_t* sortedpairs;
int32_t sorted_count;
pecodnadb_t seqdb;
poptions_t options;
}SetParams;
typedef struct{
float t_spc; //specificity contribution
float t_cov; //coverage contribution
float t_lmd; //link spread difference
float len; //length
float score; //score
}primerscore;
void add_pair_in_set (pairset *pair_set, int32_t pset_idx, int32_t prb_idx, SetParams *pparams);
void get_next_pair_options (int *pair_wi_count_sorted_ids, pairset *pair_set, SetParams *pparams);
float get_links_distribution (int prb_idx, pairset *prob_set, SetParams *pparams);
pairset build_primers_set_greedy_spc (SetParams *pparams);
void get_set_mean_cov_stats (pairset *prob_set, SetParams *pparams);
void some_other_set_possibilities (pairset *pair_set,
ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options);
void sets_by_SimulatedAnealing (pairset *pair_set,
ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options);
void sets_by_TabuSearch (pairset *pair_set,
ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options);
pairset * sets_by_BruteForce (ppair_t * sortedpairs,
int32_t sorted_count, pecodnadb_t seqdb, poptions_t options);
pairset * extend_set_randomly (pairset *pair_set, SetParams *params, int extend_to_cnt);
void build_and_print_sets (ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options);
int32_t get_next_option_increasing_cov (pairset *pair_set, SetParams *pparams);
void reset_set_props (pairset *pair_set, SetParams *pparams);
void primers_graph_graphviz (ppair_t * sortedpairs,
int32_t sorted_count, poptions_t options);
size_t primers_changeSortedArray (ppair_t ** pairs,
size_t sorted_count, poptions_t options);
size_t primers_filterWithGivenLinks (ppair_t ** pairs,
size_t sorted_count, poptions_t options);
#endif

479
src/libecoprimer/ahocorasick.c Executable file
View File

@ -0,0 +1,479 @@
/*
* ahocorasick.h
*
* Created on: 26 march 2011
* Author: tiayyba
*/
#include <inttypes.h>
#include "hashencoder.h"
#include "ahocorasick.h"
void ahoc_graphKeywordTree (aho_state *root);
aho_state *groot = NULL; //just for graph testing
#define BASEATINDEX(w, l, i) (uint8_t)((((w)&(0x3LLU<<(((l)-(i))*2)))>>(((l)-(i))*2)) & 0x3LLU)
void ahoc_addOutputElement (aho_state *node, bool_t isdirect, uint32_t idx)
{
if (!node) return;
if (node->output.count == 0)
node->output.out_set = ECOMALLOC(sizeof(aho_output),
"Cannot allocate memory for aho-corasick state output element");
else
node->output.out_set = ECOREALLOC(node->output.out_set, (node->output.count+1)*sizeof(aho_output),
"Cannot allocate memory for aho-corasick state output element");
node->output.out_set[node->output.count].wordidx = idx;
node->output.out_set[node->output.count].isdirect = isdirect;
node->output.count++;
}
//is the passed output element in the set
bool_t ahoc_isOutputIn (aho_state *node, aho_output ot)
{
uint32_t i;
for (i=0; i<node->output.count; i++)
if (node->output.out_set[i].isdirect == ot.isdirect && node->output.out_set[i].wordidx == ot.wordidx) return TRUE;
return FALSE;
}
//take union of output of the two nodes and put in node1
void ahoc_unionOutputElements (aho_state *node1, aho_state *node2)
{
uint32_t i;
for (i=0; i<node2->output.count; i++)
if (ahoc_isOutputIn (node1, node2->output.out_set[i]) == FALSE)
ahoc_addOutputElement (node1, node2->output.out_set[i].isdirect, node2->output.out_set[i].wordidx);
}
void ahoc_addKeyword (aho_state *root, word_t w, bool_t isdirect, uint32_t idx, poptions_t options)
{
uint32_t i;
aho_state *nextnode = root;
uint8_t basecode;
static uint32_t state_id = 0;
//fprintf (stderr, "%s\n", ecoUnhashWord(w, options->primer_length));
for (i=1; i<=options->primer_length; i++)
{
basecode = BASEATINDEX (w, options->primer_length, i);
//fprintf (stderr, "%d", basecode);
if (nextnode->next[basecode] == NULL)
{
//add new state
nextnode->next[basecode] = ECOMALLOC(sizeof(aho_state),
"Cannot allocate memory for aho-corasick state");
nextnode = nextnode->next[basecode];
//initialize state
nextnode->id = ++state_id;
nextnode->next[0]=nextnode->next[1]=nextnode->next[2]=nextnode->next[3]=NULL;
nextnode->fail = NULL;
nextnode->output.count = 0;
}
else
nextnode = nextnode->next[basecode];
}
//fprintf (stderr, "\n", basecode);
//new pattern addess so add node ouptup element
ahoc_addOutputElement (nextnode, isdirect, idx);
}
void ahoc_buildKeywordTree (aho_state *root, pwordcount_t words, poptions_t options)
{
uint32_t i;
if (!root) return;
//init root
root->id = 0;
root->next[0]=root->next[1]=root->next[2]=root->next[3]=NULL;
root->fail = NULL;
root->output.count = 0;
//now add each word as a pattern in the keyword tree
for (i=0; i<words->size; i++)
{
//add direct word
word_t w=WORD(words->words[i]);
ahoc_addKeyword (root, w, TRUE, i, options);
//add reverse word
w=ecoComplementWord(w,options->primer_length);
ahoc_addKeyword (root, w, FALSE, i, options);
}
//loop on root if some base has no out going edge from roots
for (i=0; i<4; i++)
if (root->next[i] == NULL)
root->next[i] = root;
}
void ahoc_enqueue (aho_queue *ahoqueue, aho_state *node)
{
queue_node *q;
if (node == NULL) return;
q = ECOMALLOC(sizeof(queue_node),
"Cannot allocate memory for aho-corasick queue node");
q->state_node = node;
q->next = NULL;
if (ahoqueue->first == NULL)
{
ahoqueue->first = q;
ahoqueue->last = q;
}
else
{
ahoqueue->last->next = q;
ahoqueue->last = q;
}
}
aho_state *ahoc_dequeue (aho_queue *ahoqueue)
{
aho_state *node = NULL;
queue_node *q;
if (ahoqueue->first == NULL) return node;
q = ahoqueue->first;
ahoqueue->first = q->next;
node = q->state_node;
ECOFREE (q, "Cannot free memory for aho-corasick queue node");
return node;
}
//set fail links and output sets for the keyword tree
void ahoc_updateForFailAndOutput (aho_state *root)
{
int32_t i;
aho_queue Q;
aho_state *node_r;
aho_state *node_u;
aho_state *node_v;
//empty queue
Q.first = NULL;
Q.last = NULL;
//for us alphabet has 4 elements, A=0, C=1, G=2 and T=3
for (i=0; i<4; i++)
{
if (root->next[i] != root && root->next[i] != NULL)
{
root->next[i]->fail = root;
ahoc_enqueue (&Q, root->next[i]);
}
}
//while queue not empty
while (Q.first != NULL)
{
node_r = ahoc_dequeue (&Q);
for (i=0; i<4; i++)
{
if (node_r->next[i] != NULL)
{
node_u = node_r->next[i];
ahoc_enqueue (&Q, node_u);
node_v = node_r->fail;
while (node_v->next[i] == NULL)
node_v = node_v->fail;
node_u->fail = node_v->next[i];
ahoc_unionOutputElements (node_u, node_u->fail);
}
}
}
}
void ahoc_freeKeywordTree (aho_state *node)
{
int i;
for (i=0; i<4; i++)
if (node->next[i])
ahoc_freeKeywordTree (node->next[i]);
if (node->output.count > 0)
ECOFREE (node->output.out_set, "Free failed for node output");
ECOFREE (node, "Free failed for node");
}
pprimercount_t ahoc_lookforStrictPrimers (pecodnadb_t database, uint32_t seqdbsize,uint32_t exampleCount,
pwordcount_t words,poptions_t options)
{
aho_state automaton_root;
aho_state *curr_state;
//uint32_t inSequenceQuorum;
uint32_t outSequenceQuorum;
pprimer_t data;
pprimercount_t primers;
uint32_t i, j, k;
int32_t pos;
uint32_t lmax;
char *base;
int8_t code;
uint32_t goodPrimers=0;
static int iii=0;
//inSequenceQuorum = (uint32_t)floor((float)exampleCount * options->sensitivity_quorum);
outSequenceQuorum = (uint32_t)floor((float)(seqdbsize-exampleCount) * options->false_positive_quorum);
//fprintf(stderr," Primers should be at least present in %d/%d example sequences\n",inSequenceQuorum,exampleCount);
fprintf(stderr," Primers should not be present in more than %d/%d counterexample sequences\n",outSequenceQuorum,(seqdbsize-exampleCount));
data = ECOMALLOC(words->size * sizeof(primer_t),
"Cannot allocate memory for fuzzy matching results");
for (i=0; i < words->size; i++)
{
data[i].word=WORD(words->words[i]);
data[i].inexample = 0;
data[i].outexample= 0;
data[i].directCount=ECOMALLOC(seqdbsize * sizeof(uint32_t),
"Cannot allocate memory for primer position");
data[i].directPos = ECOMALLOC(seqdbsize * sizeof(poslist_t),
"Cannot allocate memory for primer position");
data[i].reverseCount=ECOMALLOC(seqdbsize * sizeof(uint32_t),
"Cannot allocate memory for primer position");
data[i].reversePos = ECOMALLOC(seqdbsize * sizeof(poslist_t),
"Cannot allocate memory for primer position");
}
//build keywords automaton
ahoc_buildKeywordTree (&automaton_root, words, options);
//set fail links and output sets
ahoc_updateForFailAndOutput (&automaton_root);
//debug; print keywordtree in a gv file
//ahoc_graphKeywordTree (&automaton_root);
//loop on each sequence for its each base and find words
for (i=0; i < seqdbsize; i++)
{
if(database[i]->SQ_length <= options->primer_length) continue;
lmax = database[i]->SQ_length;
if (!options->circular)
lmax += options->primer_length-1;
curr_state = &automaton_root;
for (j=0,base=database[i]->SQ; j<lmax; j++,base++)
{
if (i==(uint32_t)database[i]->SQ_length) base=database[i]->SQ;
//code = encoder[(*base) - 'A'];
code = *base;
//if (iii++ < 30)
// fprintf (stderr, "%d:%d,", *base, code);
if (code < 0 || code > 3)
{
//if error char, start from root for next character
//+forget any incomplete words
curr_state = &automaton_root;
continue;
}
while (curr_state->next[code] == NULL) curr_state = curr_state->fail;
curr_state = curr_state->next[code];
//start position of primer is options->primer_length-1 chars back
pos = j-options->primer_length+1;
if (pos < 0) pos = database[i]->SQ_length+pos;
//set output, if there is some output on this state then
//+all words in the output set complete here, so increment their
//+found properties for current sequence
for (k=0; k<curr_state->output.count; k++)
{
if (curr_state->output.out_set[k].isdirect)
data[curr_state->output.out_set[k].wordidx].directCount[i]++;
else
data[curr_state->output.out_set[k].wordidx].reverseCount[i]++;
if (options->no_multi_match)
{
if ((data[curr_state->output.out_set[k].wordidx].directCount[i] +
data[curr_state->output.out_set[k].wordidx].reverseCount[i]) > 1)
//since multimach not allowd, set an indication on 1st seq position that
//+ a multimatch was found, so that this word will be filtered out
//+ and because of first postion we wont have to search the whole array
//+ to find if it voilated nomultimatch constraint for some seq
data[curr_state->output.out_set[k].wordidx].directCount[0] = 2;
else
{
if (curr_state->output.out_set[k].isdirect)
//direct word found on jth position of ith sequence
data[curr_state->output.out_set[k].wordidx].directPos[i].value = (uint32_t)pos;
else
//reverse word found on jth position of ith sequence
data[curr_state->output.out_set[k].wordidx].reversePos[i].value = (uint32_t)pos;
}
}
else
{
//okay multi match allowed
if (curr_state->output.out_set[k].isdirect)
{
if (data[curr_state->output.out_set[k].wordidx].directCount[i] == 1)
data[curr_state->output.out_set[k].wordidx].directPos[i].value = (uint32_t)pos;
else
{
//need to create or extend the positions list
if (data[curr_state->output.out_set[k].wordidx].directCount[i] == 2)
{
//for second element, first was put in .value, so dont forget to copy that in the array too
data[curr_state->output.out_set[k].wordidx].directPos[i].pointer = ECOMALLOC(2 * sizeof(uint32_t),
"Cannot allocate memory for primer position");
data[curr_state->output.out_set[k].wordidx].directPos[i].pointer[0] = data[curr_state->output.out_set[k].wordidx].directPos[i].value;
data[curr_state->output.out_set[k].wordidx].directPos[i].pointer[1] = (uint32_t)pos;
}
else
{
//for third or greater element
data[curr_state->output.out_set[k].wordidx].directPos[i].pointer = ECOREALLOC(data[curr_state->output.out_set[k].wordidx].directPos[i].pointer,
data[curr_state->output.out_set[k].wordidx].directCount[i] * sizeof(uint32_t),
"Cannot allocate memory for primer position");
data[curr_state->output.out_set[k].wordidx].directPos[i].pointer[data[curr_state->output.out_set[k].wordidx].directCount[i]-1] = (uint32_t)pos;
}
}
}
else
{
if (data[curr_state->output.out_set[k].wordidx].reverseCount[i] == 1)
data[curr_state->output.out_set[k].wordidx].reversePos[i].value = (uint32_t)pos;
else
{
//need to create or extend the positions list
if (data[curr_state->output.out_set[k].wordidx].reverseCount[i] == 2)
{
//for second element, first was put in .value, so dont forget to copy that in the array too
data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer = ECOMALLOC(2 * sizeof(uint32_t),
"Cannot allocate memory for primer position");
data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer[0] = data[curr_state->output.out_set[k].wordidx].reversePos[i].value;
data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer[1] = (uint32_t)pos;
}
else
{
//for third or greater element
data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer = ECOREALLOC(data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer,
data[curr_state->output.out_set[k].wordidx].reverseCount[i] * sizeof(uint32_t),
"Cannot allocate memory for primer position");
data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer[data[curr_state->output.out_set[k].wordidx].reverseCount[i]-1] = (uint32_t)pos;
}
}
}
}
//dont forget to increment inexample or outexample count, but only once for a sequence
if ((data[curr_state->output.out_set[k].wordidx].directCount[i] +
data[curr_state->output.out_set[k].wordidx].reverseCount[i]) == 1)
{
if (database[i]->isexample)
data[curr_state->output.out_set[k].wordidx].inexample++;
else
data[curr_state->output.out_set[k].wordidx].outexample++;
}
}
}
}
//Only thing that remains is to remove the failed words
for (i=0,j=0; i<words->size; i++)
{
fprintf(stderr,"Primers %5d/%lld analyzed => sequence : %s in %d example and %d counterexample sequences \r",
i+1,words->size,ecoUnhashWord(data[i].word,options->primer_length),
data[i].inexample,data[i].outexample);
//if (data[i].inexample < inSequenceQuorum || (data[i].directCount[0] == 2 && options->no_multi_match))
if (data[i].directCount[0] == 2 && options->no_multi_match)
{
//bad word, delete from the array
for (k=0; k<seqdbsize; k++)
{
if (data[i].directCount[k] > 1)
ECOFREE (data[i].directPos[k].pointer, "Cannot free position pointer.");
if (data[i].reverseCount[k] > 1)
ECOFREE (data[i].reversePos[k].pointer, "Cannot free position pointer.");
}
ECOFREE (data[i].directCount, "Cannot free position pointer.");
ECOFREE (data[i].directPos, "Cannot free position pointer.");
ECOFREE (data[i].reverseCount, "Cannot free position pointer.");
ECOFREE (data[i].reversePos, "Cannot free position pointer.");
}
else
{
//data[i].good = data[i].inexample >= inSequenceQuorum && data[i].outexample <= outSequenceQuorum;
data[i].good = data[i].outexample <= outSequenceQuorum;
goodPrimers+=data[i].good? 1:0;
if (j < i)
data[j] = data[i];
j++;
}
}
fprintf(stderr,"\n\nOn %lld analyzed primers %d respect quorum conditions\n",words->size,goodPrimers);
fprintf(stderr,"Conserved primers for further analysis : %d/%lld\n",j,words->size);
primers = ECOMALLOC(sizeof(primercount_t),"Cannot allocate memory for primer table");
primers->primers=ECOREALLOC(data,
j * sizeof(primer_t),
"Cannot reallocate memory for fuzzy matching results");
primers->size=j;
//free memory of keyword table
for (i=0; i<4; i++)
if (automaton_root.next[i] != &automaton_root)
ahoc_freeKeywordTree (automaton_root.next[i]);
return primers;
}
void ahoc_graphPrintNodesInfo (aho_state *node, FILE* gfile)
{
uint32_t i;
fprintf (gfile, "\"%d\"[\n", node->id);
fprintf (gfile, "label=\"%d\\n", node->id);
for (i=0; i<node->output.count; i++)
fprintf (gfile, "%d%c,", node->output.out_set[i].wordidx, node->output.out_set[i].isdirect?'d':'r');
fprintf (gfile, "\"\n];\n");
for (i=0; i<4; i++)
if (node->next[i] != NULL && node->next[i] != node)
ahoc_graphPrintNodesInfo (node->next[i], gfile);
}
void ahoc_graphPrintNodesLinks (aho_state *node, FILE* gfile)
{
uint32_t i;
static int j=0;
for (i=0; i<4; i++)
if (node->next[i] != NULL && node->next[i] != node)
{
fprintf (gfile, "\"%d\" -> \"%d\" [\n", node->id, node->next[i]->id);
fprintf (gfile, "label=\"%c\"\n];\n", "ACGT"[i]);
}
if (j++ < 40)
if (node->fail != NULL && node->fail != groot)
{
fprintf (gfile, "\"%d\" -> \"%d\" [\n", node->id, node->fail->id);
fprintf (gfile, "color= \"red\"\n];\n");
}
for (i=0; i<4; i++)
if (node->next[i] != NULL && node->next[i] != node)
ahoc_graphPrintNodesLinks (node->next[i], gfile);
}
void ahoc_graphKeywordTree (aho_state *root)
{
FILE *gfile;
groot=root;
gfile = fopen ("keywordtree.gv", "w");
fprintf (gfile, "digraph keywordtree {\n");
ahoc_graphPrintNodesInfo (root, gfile);
ahoc_graphPrintNodesLinks (root, gfile);
fprintf (gfile, "}\n");
fclose(gfile);
}

43
src/libecoprimer/ahocorasick.h Executable file
View File

@ -0,0 +1,43 @@
/*
* ahocorasick.h
*
* Created on: 26 march 2011
* Author: tiayyba
*/
#ifndef H_ahocorasick
#define H_ahocorasick
#include "ecoprimer.h"
typedef struct aho_output_t{
uint32_t wordidx; //index of strict word (dont save the word of 64B)
bool_t isdirect; //we need to find both direct and reverse words so we must know which one is it
}aho_output;
typedef struct aho_output_count_t{
uint32_t count;
aho_output *out_set;
}aho_output_count;
typedef struct aho_state_t{
int32_t id;
struct aho_state_t *next[4]; //for labels A=0,C=1,G=2 and T=3
struct aho_state_t *fail;
aho_output_count output;
}aho_state;
typedef struct queue_node_t {
aho_state *state_node;
struct queue_node_t *next;
}queue_node;
typedef struct{
queue_node *first;
queue_node *last;
}aho_queue;
pprimercount_t ahoc_lookforStrictPrimers (pecodnadb_t database, uint32_t seqdbsize,uint32_t exampleCount,
pwordcount_t words,poptions_t options);
#endif /* H_ahocorasick */

View File

@ -0,0 +1,131 @@
/*
* amplifiatree.c
*
* Created on: 7 mars 2009
* Author: coissac
*/
#include "ecoprimer.h"
#include <search.h>
static void cleanamplifia(pamplifia_t amplifia);
static void deleteamplifialist(pamplifialist_t list);
static int cmpamplifia(const void* p1,const void*p2);
static void cleanamplifiatlist(pamplifiacount_t list)
{
if (list->amplifias)
ECOFREE(list->amplifias,
"Free amplifia list");
}
static void cleanamplifia(pamplifia_t amplifia)
{
cleanamplifiatlist(&(amplifia->pcr));
}
static pamplifialist_t newamplifialist(pamplifialist_t parent, size_t size)
{
pamplifialist_t tmp;
tmp=ECOMALLOC(sizeof(amplifialist_t)+sizeof(amplifia_t)*(size-1),
"Cannot allocate new amplifia list");
tmp->amplifiaslots=size;
tmp->amplifiacount=0;
tmp->next=NULL;
if (parent)
parent->next=(void*)tmp;
return tmp;
}
static void deleteamplifialist(pamplifialist_t list)
{
size_t i;
if (list)
{
if (list->next)
{
deleteamplifialist(list->next);
list->next=NULL;
}
for (i=0; i < list->amplifiacount; i++)
cleanamplifia((list->amplifias)+i);
ECOFREE(list,"Delete amplifia list");
}
}
static int cmpamplifia(const void* p1,const void*p2)
{
pamplifia_t pr1,pr2;
pr1=(pamplifia_t)p1;
pr2=(pamplifia_t)p2;
if (pr1->p1 < pr2->p1) return -1;
if (pr1->p1 > pr2->p1) return 1;
if (pr1->asdirect1 < pr2->asdirect1) return -1;
if (pr1->asdirect1 > pr2->asdirect1) return 1;
if (pr1->p2 < pr2->p2) return -1;
if (pr1->p2 > pr2->p2) return 1;
if (pr1->asdirect2 < pr2->asdirect2) return -1;
if (pr1->asdirect2 > pr2->asdirect2) return 1;
return 0;
}
pamplifia_t amplifiaintree (amplifia_t key,
pamplifiatree_t amplifialist)
{
if (!amplifialist->tree)
return NULL;
return *((pamplifia_t*)tsearch((const void *)(&key),
&(amplifialist->tree),
cmpamplifia
));
}
pamplifia_t insertamplifia(amplifia_t key,
pamplifiatree_t list)
{
pamplifia_t current;
pamplifia_t found;
if (list->last->amplifiacount==list->last->amplifiaslots)
{
list->last->next=newamplifialist(list,100);
list->last=list->last->next;
}
current = list->last->amplifias + list->last->amplifiacount;
*current=key;
found = *((pamplifia_t*)tsearch((const void *)current,
&(list->tree),
cmpamplifia));
if (found==current)
list->last->amplifiacount++;
return found;
}
pamplifiatree_t initamplifiatree(pamplifiatree_t tree)
{
if (!tree)
tree = ECOMALLOC(sizeof(amplifiatree_t),"Cannot allocate amplifia tree");
tree->first=newamplifialist(NULL,500);
tree->last=tree->first;
tree->tree=NULL;
}

120
src/libecoprimer/apat.h Normal file
View File

@ -0,0 +1,120 @@
/* ==================================================== */
/* Copyright (c) Atelier de BioInformatique */
/* Dec. 94 */
/* File: apat.h */
/* Purpose: pattern scan */
/* History: */
/* 28/12/94 : <Gloup> ascan first version */
/* 14/05/99 : <Gloup> last revision */
/* ==================================================== */
#ifndef H_apat
#define H_apat
#include "libstki.h"
#include "inttypes.h"
#include "../libecoPCR/ecoPCR.h"
/* ----------------------------------------------- */
/* constantes */
/* ----------------------------------------------- */
#ifndef BUFSIZ
#define BUFSIZ 1024 /* io buffer size */
#endif
#define MAX_NAME_LEN BUFSIZ /* max length of sequence name */
#define ALPHA_LEN 4 /* alphabet length */
/* *DO NOT* modify */
#define MAX_PATTERN 4 /* max # of patterns */
/* *DO NOT* modify */
#define MAX_PAT_LEN 32 /* max pattern length */
/* *DO NOT* modify */
#define MAX_PAT_ERR 32 /* max # of errors */
/* *DO NOT* modify */
#define PATMASK 0x3ffffff /* mask for 26 symbols */
/* *DO NOT* modify */
#define OBLIBIT 0x4000000 /* bit 27 to 1 -> oblig. pos */
/* *DO NOT* modify */
/* mask for position */
#define ONEMASK 0x80000000 /* mask for highest position */
/* masks for Levenhstein edit */
#define OPER_IDT 0x00000000 /* identity */
#define OPER_INS 0x40000000 /* insertion */
#define OPER_DEL 0x80000000 /* deletion */
#define OPER_SUB 0xc0000000 /* substitution */
#define OPER_SHFT 30 /* <unused> shift */
/* Levenhstein Opcodes */
#define SOPER_IDT 0x0 /* identity */
#define SOPER_INS 0x1 /* insertion */
#define SOPER_DEL 0x2 /* deletion */
#define SOPER_SUB 0x3 /* substitution */
/* Levenhstein Opcodes masks */
#define OPERMASK 0xc0000000 /* mask for Opcodes */
#define NOPERMASK 0x3fffffff /* negate of previous */
/* ----------------------------------------------- */
/* data structures */
/* ----------------------------------------------- */
typedef uint32_t pattern_t[ALPHA_LEN], *ppattern_t;
/* -------------------- */
typedef struct { /* pattern */
/* -------------------- */
int patlen; /* pattern length */
int maxerr; /* max # of errors */
uint32_t omask; /* oblig. bits mask */
bool_t circular; /* is circular sequence */
} patternParam_t, *ppatternParam_t;
/* ----------------------------------------------- */
/* macros */
/* ----------------------------------------------- */
#ifndef NEW
#define NEW(typ) (typ*)malloc(sizeof(typ))
#define NEWN(typ, dim) (typ*)malloc((unsigned long)(dim) * sizeof(typ))
#define REALLOC(typ, ptr, dim) (typ*)realloc((void *) (ptr), (unsigned long)(dim) * sizeof(typ))
#define FREE(ptr) free((void *) ptr)
#endif
/* ----------------------------------------------- */
/* prototypes */
/* ----------------------------------------------- */
/* apat_search.c */
int32_t ManberNoErr(pecoseq_t pseq,ppattern_t pat,
ppatternParam_t param,
StackiPtr stkpos);
int32_t ManberSub(pecoseq_t pseq,ppattern_t pat,
ppatternParam_t param,
StackiPtr stkpos);
int32_t ManberAll(pecoseq_t pseq,ppattern_t pat,
ppatternParam_t param,
StackiPtr stkpos);
#endif /* H_apat */

View File

@ -0,0 +1,65 @@
/* ==================================================== */
/* Copyright (c) Atelier de BioInformatique */
/* Mar. 92 */
/* File: apat_parse.c */
/* Purpose: Codage du pattern */
/* History: */
/* 00/07/94 : <Gloup> first version (stanford) */
/* 00/11/94 : <Gloup> revised for DNA/PROTEIN */
/* 30/12/94 : <Gloup> modified EncodePattern */
/* for manber search */
/* 14/05/99 : <Gloup> indels added */
/* ==================================================== */
#include <ctype.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "apat.h"
#include "ecoprimer.h"
/* IUPAC Dna */
static int32_t sDnaCode[] = {
/* IUPAC */
0x00000001 /* A */, 0x0000000E /* B */, 0x00000002 /* C */,
0x0000000D /* D */, 0x00000000 /* E */, 0x00000000 /* F */,
0x00000004 /* G */, 0x0000000B /* H */, 0x00000000 /* I */,
0x00000000 /* J */, 0x0000000C /* K */, 0x00000000 /* L */,
0x00000003 /* M */, 0x0000000F /* N */, 0x00000000 /* O */,
0x00000000 /* P */, 0x00000000 /* Q */, 0x00000005 /* R */,
0x00000006 /* S */, 0x00000008 /* T */, 0x00000008 /* U */,
0x00000007 /* V */, 0x00000009 /* W */, 0x00000000 /* X */,
0x0000000A /* Y */, 0x00000000 /* Z */
};
/* -------------------------------------------- */
/* internal replacement of gets */
/* -------------------------------------------- */
static char *sGets(char *buffer, int size) {
char *ebuf;
if (! fgets(buffer, size-1, stdin))
return NULL;
/* remove trailing line feed */
ebuf = buffer + strlen(buffer);
while (--ebuf >= buffer) {
if ((*ebuf == '\n') || (*ebuf == '\r'))
*ebuf = '\000';
else
break;
}
return buffer;
}
/* -------------------------------------------- */
/* Interface */
/* -------------------------------------------- */

View File

@ -0,0 +1,155 @@
/* ==================================================== */
/* Copyright (c) Atelier de BioInformatique */
/* Dec. 94 */
/* File: apat_search.c */
/* Purpose: recherche du pattern */
/* algorithme de Baeza-Yates/Gonnet */
/* Manber (agrep) */
/* History: */
/* 07/12/94 : <MFS> first version */
/* 28/12/94 : <Gloup> revised version */
/* 14/05/99 : <Gloup> last revision */
/* ==================================================== */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "libstki.h"
#include "apat.h"
#define POP PopiOut
#define PUSH(s,v) PushiIn(&(s),(v))
#define TOPCURS CursiToTop
#define DOWNREAD ReadiDown
#define KRONECK(x, msk) ((~x & msk) ? 0 : 1)
#define MIN(x, y) ((x) < (y) ? (x) : (y))
/* -------------------------------------------- */
/* Baeza-Yates/Manber algorithm */
/* NoError */
/* -------------------------------------------- */
int32_t ManberNoErr(pecoseq_t pseq,ppattern_t pat,
ppatternParam_t param,
StackiPtr stkpos)
{
int32_t pos;
uint32_t smask, r;
uint8_t *data;
int32_t end;
end = (size_t)(pseq->SQ_length);
if (param->circular)
end+=param->patlen - 1;
/* create local masks */
smask = r = 0x1L << param->patlen;
/* init. scan */
data = (uint8_t*)(pseq->SQ);
/* loop on text data */
for (pos = 0 ; pos < end ; pos++,data++) {
if (pos==pseq->SQ_length)
data=(uint8_t*)(pseq->SQ);
if (*data < 4)
r = (r >> 1) & pat[*data];
else
r=0;
if (r & 0x1L) {
PUSH(stkpos, pos - param->patlen + 1);
}
r |= smask;
}
return stkpos->top; /* aka # of hits */
}
/* -------------------------------------------- */
/* Baeza-Yates/Manber algorithm */
/* Substitution only */
/* */
/* Note : r array is stored as : */
/* 0 0 r(0,j) r(0,j+1) r(1,j) r(1,j+1) ... */
/* */
/* -------------------------------------------- */
int32_t ManberSub(pecoseq_t pseq,ppattern_t pat,
ppatternParam_t param,
StackiPtr stkpos)
{
int e, found;
int32_t pos;
uint32_t smask, cmask, sindx;
uint32_t *pr, r[2 * MAX_PAT_ERR + 2];
uint8_t *data;
int32_t end;
end = (size_t)(pseq->SQ_length);
if (param->circular)
end+=param->patlen - 1;
/* create local masks */
r[0] = r[1] = 0x0;
cmask = smask = 0x1L << param->patlen;
for (e = 0, pr = r + 3 ; e <= param->maxerr ; e++, pr += 2)
*pr = cmask;
cmask = ~ param->omask;
/* init. scan */
data = (uint8_t*)(pseq->SQ);
/* loop on text data */
for (pos = 0 ; pos < end ; pos++,data++) {
if (pos==pseq->SQ_length)
data=(uint8_t*)(pseq->SQ);
sindx = (*data==4) ? 0:pat[*data];
for (e = found = 0, pr = r ; e <= param->maxerr ; e++, pr += 2) {
pr[2] = pr[3] | smask;
pr[3] = ((pr[0] >> 1) & cmask) /* sub */
| ((pr[2] >> 1) & sindx); /* ident */
if (pr[3] & 0x1L) { /* found */
if (! found) {
PUSH(stkpos, pos - param->patlen + 1);
}
found++;
}
}
}
return stkpos->top; /* aka # of hits */
}
/* -------------------------------------------- */
/* Baeza-Yates/Manber algorithm */
/* API call to previous functions */
/* -------------------------------------------- */
int32_t ManberAll(pecoseq_t pseq,ppattern_t pat,
ppatternParam_t param,
StackiPtr stkpos)
{
if (param->maxerr == 0)
return ManberNoErr(pseq,
pat, param,
stkpos);
else
return ManberSub(pseq,
pat, param,
stkpos);
}

View File

@ -0,0 +1,237 @@
/*
* aproxpattern.c
*
* Created on: 20 nov. 2008
* Author: coissac
*/
#include "ecoprimer.h"
#include "apat.h"
#include <math.h>
static uint8_t encoder[] = {0, // A
4, // b
1, // C
4,4,4, // d, e, f
2, // G
4,4,4,4,4,4,4,4,4,4,4,4, // h,i,j,k,l,m,n,o,p,q,r,s
3,3, // T,U
4,4,4,4,4}; // v,w,x,y,z
ppattern_t buildPatternFromWord(word_t word, uint32_t patlen)
{
static pattern_t pattern;
uint32_t i;
for (i = 0 ; i < ALPHA_LEN ; i++)
pattern[i] = 0x0;
for (i=0;i < patlen; i++)
{
pattern[word & 3LLU] |= 1 << i;
word>>=2;
}
return pattern;
}
#ifdef IS_UPPER
#undef IS_UPPER
#endif
/* -------------------------------------------- */
/* encode sequence */
/* IS_UPPER is slightly faster than isupper */
/* -------------------------------------------- */
#define IS_UPPER(c) (((c) >= 'A') && ((c) <= 'Z'))
void encodeSequence(ecoseq_t *seq)
{
int i;
uint8_t *data;
char *cseq;
data = (uint8_t*)(seq->SQ);
cseq = seq->SQ;
for (i=0;i<seq->SQ_length;i++,data++,cseq++)
{
*data = encoder[(IS_UPPER(*cseq) ? *cseq : 'Z') - 'A'];
}
}
pprimercount_t lookforAproxPrimer(pecodnadb_t database, uint32_t seqdbsize,uint32_t exampleCount,
pwordcount_t words,poptions_t options)
{
pprimer_t data;
pprimercount_t primers;
ppattern_t pattern;
patternParam_t params;
uint32_t i;
uint32_t w;
uint32_t j;
Stacki positions;
uint32_t count=1;
uint32_t goodPrimers=0;
uint32_t inSequenceQuorum;
uint32_t outSequenceQuorum;
bool_t conserved = TRUE;
//poslist_t ttt;
inSequenceQuorum = (uint32_t)floor((float)exampleCount * options->sensitivity_quorum);
outSequenceQuorum = (uint32_t)floor((float)(seqdbsize-exampleCount) * options->false_positive_quorum);
fprintf(stderr," Primers should be at least present in %d/%d example sequences\n",inSequenceQuorum,exampleCount);
fprintf(stderr," Primers should not be present in more than %d/%d counterexample sequences\n",outSequenceQuorum,(seqdbsize-exampleCount));
data = ECOMALLOC(words->size * sizeof(primer_t),
"Cannot allocate memory for fuzzy matching results");
params.circular = options->circular;
params.maxerr = options->error_max;
// params.omask = (1 << options->strict_three_prime) -1;
params.omask = 0;
params.patlen = options->primer_length;
positions.val=NULL;
for (i=0,w=0; i < words->size; i++)
{
data[w].word=WORD(words->words[i]);
data[w].inexample = 0;
data[w].outexample= 0;
count = 1;
if (conserved)
{
data[w].directCount=ECOMALLOC(seqdbsize * sizeof(uint32_t),
"Cannot allocate memory for primer position");
data[w].directPos = ECOMALLOC(seqdbsize * sizeof(poslist_t),
"Cannot allocate memory for primer position");
data[w].reverseCount=ECOMALLOC(seqdbsize * sizeof(uint32_t),
"Cannot allocate memory for primer position");
data[w].reversePos = ECOMALLOC(seqdbsize * sizeof(poslist_t),
"Cannot allocate memory for primer position");
}
pattern = buildPatternFromWord(data[w].word,options->primer_length);
positions.val=NULL;
for (j=0; j < seqdbsize && (count < 2 || !options->no_multi_match); j++)
{
positions.cursor=0;
positions.top =0;
if (!positions.val)
{
positions.size=1;
positions.val = ECOMALLOC(sizeof(uint32_t),
"Cannot allocate memory for primer position");
}
count = ManberAll(database[j],pattern,&params,&positions);
data[w].directCount[j]=count;
if (count>1)
{
data[w].directPos[j].pointer = (uint32_t*)positions.val;
positions.val=NULL;
}
else
{
data[w].directPos[j].pointer=NULL;
if (count==1)
data[w].directPos[j].value = (uint32_t)*(positions.val);
}
}
pattern = buildPatternFromWord(ecoComplementWord(data[w].word,options->primer_length),
options->primer_length);
for (j=0; j < seqdbsize && (count < 2 || !options->no_multi_match); j++)
{
positions.cursor=0;
positions.top =0;
if (!positions.val)
{
positions.size=1;
positions.val = ECOMALLOC(sizeof(uint32_t),
"Cannot allocate memory for primer position");
}
count = ManberAll(database[j],pattern,&params,&positions);
data[w].reverseCount[j]=count;
if (count>1)
{
data[w].reversePos[j].pointer = (uint32_t*)positions.val;
positions.val=NULL;
}
else
{
data[w].reversePos[j].pointer=NULL;
if (count==1)
data[w].reversePos[j].value = (uint32_t)*(positions.val);
}
if (database[j]->isexample)
{
data[w].inexample+=(data[w].directCount[j] || data[w].reverseCount[j])? 1:0;
}
else
{
data[w].outexample+=(data[w].directCount[j] || data[w].reverseCount[j])? 1:0;
}
count+=data[w].directCount[j];
}
data[w].good = data[w].inexample >= inSequenceQuorum && data[w].outexample <= outSequenceQuorum;
goodPrimers+=data[w].good? 1:0;
fprintf(stderr,"Primers %5d/%d analyzed => sequence : %s in %d example and %d counterexample sequences \r",
i+1,words->size,ecoUnhashWord(data[w].word,options->primer_length),
data[w].inexample,data[w].outexample);
conserved=data[w].inexample >= inSequenceQuorum;
conserved=conserved && (count < 2 || !options->no_multi_match);
if (conserved)
w++;
}
if (positions.val)
ECOFREE(positions.val,"Free stack position pointer");
if (!conserved)
{
ECOFREE(data[w].directCount,"Free direct count table");
ECOFREE(data[w].directPos,"Free direct count table");
ECOFREE(data[w].reverseCount,"Free direct count table");
ECOFREE(data[w].reversePos,"Free direct count table");
}
fprintf(stderr,"\n\nOn %d analyzed primers %d respect quorum conditions\n",words->size,goodPrimers);
fprintf(stderr,"Conserved primers for further analysis : %d/%d\n",w,words->size);
primers = ECOMALLOC(sizeof(primercount_t),"Cannot allocate memory for primer table");
primers->primers=ECOREALLOC(data,
w * sizeof(primer_t),
"Cannot reallocate memory for fuzzy matching results");
primers->size=w;
return primers;
}

29
src/libecoprimer/debug.h Normal file
View File

@ -0,0 +1,29 @@
/*
* debug.h
*
* Created on: 12 nov. 2008
* Author: coissac
*/
#ifndef DEBUG_H_
#define DEBUG_H_
#include <stdio.h>
#ifdef DEBUG
#define DEBUG_LOG(message,...) { \
char *text; \
(void)asprintf(&text,(message),##__VA_ARGS__); \
fprintf(stderr,"DEBUG %s (line %d) : %s\n",__FILE__,__LINE__,(text)); \
free(text); \
}
#else
#define DEBUG_LOG(message, ...)
#endif
#endif /* DEBUG_H_ */

366
src/libecoprimer/ecoprimer.h Executable file
View File

@ -0,0 +1,366 @@
/*
* epsort.h
*
* Created on: 6 nov. 2008
* Author: coissac
*/
#ifndef EPSORT_H_
#define EPSORT_H_
#include <inttypes.h>
#include <stdlib.h>
#include <stdio.h>
#include "ecotype.h"
#include "../libecoPCR/ecoPCR.h"
#include "../libthermo/nnparams.h"
#include "apat.h"
#define DEBUG
#include "debug.h"
/****
* Word format used :
*
* bit 63 : bad word -> this word should not be used
* bit 62 : multi word -> this word is not uniq in at least one seq
* bits 0-61 : hashed dna word of max size 31 pb
* code used for a : 00
* code used for c : 01
* code used for g : 10
* code used for t : 11
*/
typedef uint64_t word_t, *pword_t;
#define WORD(x) ((x) & 0x3FFFFFFFFFFFFFFFLLU)
#define WORD(x) ((x) & 0x3FFFFFFFFFFFFFFFLLU)
#define ISBADWORD(x) (((x) & 0x8000000000000000LLU) >> 63)
#define SETBADWORD(x) ((x) | 0x8000000000000000LLU)
#define RESETBADWORD(x) ((x) & 0x7FFFFFFFFFFFFFFFLLU)
#define ISMULTIWORD(x) (((x) & 0x4000000000000000LLU) >> 62)
#define SETMULTIWORD(x) ((x) | 0x4000000000000000LLU)
#define RESETMULTIWORD(x) ((x) & 0xBFFFFFFFFFFFFFFFLLU)
#define WORDMASK(s) ((1LLU << ((s) * 2)) -1)
#define LSHIFTWORD(x,s) (((x) << 2) & WORDMASK(s))
#define RSHIFTWORD(x,s) (((x) & WORDMASK(s))>> 2)
#define ERRORMASK(s) ((int32_t)((1LLU << (s)) -1))
#define RAPPENDBASE(x,s,c) (LSHIFTWORD((x),(s)) | (word_t)(c))
#define LAPPENDBASE(x,s,c) (RSHIFTWORD((x),(s)) | ((word_t)((~(c)) & 3) << (((s)-1) *2)))
#define ECO_ASSERT(x,message) if (!(x)) \
{ \
fprintf(stderr,"Assertion Error in %s (line %d): %s\n", \
__FILE__,\
__LINE__,\
message\
); \
exit(ECO_ASSERT_ERROR); \
}
#define MINI(x,y) (((x) < (y)) ? (x):(y))
#define MAXI(x,y) (((x) < (y)) ? (y):(x))
#define FWORDSIZE (13)
#define FWORDMASK WORDMASK(FWORDSIZE)
#define FILTERWORD(x) ((uint32_t)((x) & FWORDMASK))
#define CFILTERWORD(x,s) ((uint32_t)(((x) >> (((s)-FWORDSIZE)*2)) & FWORDMASK))
typedef struct {
pword_t words;
uint32_t *strictcount;
uint32_t inseqcount;
uint32_t outseqcount;
uint64_t size;
} wordcount_t, *pwordcount_t;
typedef union {
uint32_t *pointer;
uint32_t value;
} poslist_t, *pposlist_t;
/**
* primer_t structure store fuzzy match positions for a primer
* on all sequences
*/
typedef struct {
word_t word; //< code for the primer
uint32_t *directCount; //< Occurrence count on direct strand
pposlist_t directPos; //< list of position list on direct strand
uint32_t *reverseCount; //< Occurrence count on reverse strand
pposlist_t reversePos; //< list of position list on reverse strand
bool_t good; //< primer match more than quorum example and no
// more counterexample quorum.
uint32_t inexample; //< count of example sequences matching primer
uint32_t outexample; //< count of counterexample sequences matching primer
} primer_t, *pprimer_t;
/**
* primercount_t structure store fuzzy match positions for all primers
* on all sequences as a list of primer_t
*/
typedef struct {
pprimer_t primers;
uint32_t size;
} primercount_t, *pprimercount_t;
typedef struct {
pprimer_t primer;
uint32_t position;
bool_t strand;
} primermatch_t, *pprimermatch_t;
/*TR: Added*/
typedef struct {
pprimermatch_t matches;
uint32_t matchcount;
} primermatchcount_t, *pprimermatchcount_t;
typedef struct {
pecoseq_t sequence;
bool_t strand;
const char *amplifia;
int32_t length;
uint32_t begin;
uint32_t end;
} amplifia_t, *pamplifia_t;
typedef struct {
pamplifia_t amplifias;
uint32_t ampcount;
uint32_t ampslot;
} amplifiacount_t, *pamplifiacount_t;
typedef struct {
char *amplifia;
int32_t *taxonids;
uint32_t seqidcount;
uint32_t seqidindex;
} ampseqset_t, *pampseqset_t;
typedef struct {
int32_t taxonid;
char **amplifia;
uint32_t amplifiacount;
uint32_t amplifiaindex;
} taxampset_t, *ptaxampset_t;
typedef struct {
pprimer_t p1;
bool_t asdirect1;
pprimer_t p2;
bool_t asdirect2;
amplifiacount_t pcr;
uint32_t inexample; //< example sequence count
uint32_t outexample; //< counterexample sequence count
uint32_t intaxa; //< example taxa count
uint32_t outtaxa; //< counterexample taxa count
uint32_t notwellidentifiedtaxa;
int *wellIdentifiedSeqs; //< an array having elements equla to total seqs
// values are either 0 or 1, if seq is well identified
// its 1 else 0
int *coveredSeqs; //< an array having elements equal to total seqs, 1 if seq is covered else 0
// these statistics are relative to inexample sequences
uint32_t mind; //< minimum distance between primers
uint32_t maxd; //< maximum distance between primers
uint32_t sumd; //< distance sum
uint32_t amplifiacount;
float yule;
float quorumin;
float quorumout;
float bs;
float bc;
int32_t refsequence;
//
// uint32_t taxsetcount;
// uint32_t taxsetindex;
// ptaxampset_t taxset;
//
// uint32_t oktaxoncount;
uint32_t curseqid;
float p1temp; //strict primer1 melting temperature
float p1mintemp; //approx primer1 minimum melting temperature
float p2temp; //strict primer2 melting temperature
float p2mintemp; //approx primer2 minimum melting temperature
} pair_t, *ppair_t;
/*TR: Added*/
typedef struct {
size_t paircount;
size_t pairslots;
void* next;
pair_t pairs[1];
} pairlist_t, *ppairlist_t;
typedef struct {
ppairlist_t first;
ppairlist_t last;
void *tree;
int32_t count;
} pairtree_t, *ppairtree_t;
typedef struct {
pword_t words;
uint32_t *count;
uint32_t push;
uint32_t pop;
uint32_t size;
bool_t empty;
bool_t full;
} queue_t, *pqueue_t;
typedef struct {
pword_t words;
uint32_t *count;
uint32_t write;
uint32_t read1;
uint32_t read2;
uint32_t size;
} merge_t, *pmerge_t;
typedef struct {
const char *amplifia;
bool_t strand;
int32_t length;
int32_t taxoncount;
void *taxontree;
}amptotaxon_t, *pamptotaxon_t;
typedef struct {
int32_t taxid;
void *amptree;
}taxontoamp_t, *ptaxontoamp_t;
typedef struct {
bool_t printAC;
bool_t statistics;
bool_t filtering;
uint32_t lmin; //**< Amplifia minimal length
uint32_t lmax; //**< Amplifia maximal length
uint32_t error_max; //**< maximum error count in fuzzy search
uint32_t primer_length; //**< minimal length of the primers
int32_t *restricted_taxid; //**< limit amplification below these taxid
int32_t *ignored_taxid; //**< no amplification below these taxid
int32_t *exception_taxid;
char *prefix;
char *reference;
pecoseq_t refseq;
uint32_t refseqid;
uint32_t circular;
uint32_t doublestrand;
float strict_quorum;
float strict_exclude_quorum;
float sensitivity_quorum;
float false_positive_quorum;
uint32_t strict_three_prime;
int32_t r; //**< count of restrited taxa (restricted_taxid array size)
int32_t g; //**< count of ignored taxa (ignored_taxid array size)
int32_t e; //**< count of ignored taxa (ignored_taxid array size)
bool_t no_multi_match;
char taxonrank[20]; //TR to count ranks against a pair
int32_t taxonrankidx; //TR to count ranks against a pair
// Some statistics useful for options filters
int32_t dbsize;
int32_t insamples;
int32_t outsamples;
int32_t intaxa;
int32_t outtaxa;
int saltmethod;
float salt;
PNNParams pnparm;
bool_t print_sets_of_primers;
float specificity_threshold;
int links_cnt;
float max_links_percent;
bool_t filter_on_links;
} options_t, *poptions_t;
typedef ecoseq_t **pecodnadb_t;
void sortword(pword_t table,uint32_t N);
pecodnadb_t readdnadb(const char *name, ecotaxonomy_t *taxonomy, uint32_t *size,poptions_t options);
int isGoodTaxon(ecotaxonomy_t *taxonomy,int32_t taxon,poptions_t options);
int isExampleTaxon(ecotaxonomy_t *taxonomy,int32_t taxon,poptions_t options);
int isCounterExampleTaxon(ecotaxonomy_t *taxonomy,int32_t taxon,poptions_t options);
uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq);
pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint32_t doublestrand, ecoseq_t *seq,uint32_t *size,int32_t *neededWords,uint32_t neededWordCount,
int32_t quorum);
uint32_t ecoCompactHashSequence(pword_t dest,uint32_t size);
const char* ecoUnhashWord(word_t word,uint32_t size);
word_t ecoComplementWord(word_t word,uint32_t size);
uint32_t ecoFindWord(pwordcount_t table,word_t word);
void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,uint32_t seqQuorum);
pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount);
void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t exampleCount,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount);
pqueue_t newQueue(pqueue_t queue, uint32_t size);
pqueue_t resizeQueue(pqueue_t queue, uint32_t size);
void pop(pqueue_t queue);
void push(pqueue_t queue, word_t word, uint32_t count);
pqueue_t cleanQueue(pqueue_t queue);
pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
uint32_t exampleCount, poptions_t options);
uint32_t filterMultiStrictPrimer(pwordcount_t strictprimers);
void encodeSequence(ecoseq_t *seq);
ppattern_t buildPatternFromWord(word_t word, uint32_t patlen);
pprimercount_t lookforAproxPrimer(pecodnadb_t database, uint32_t seqdbsize,uint32_t exampleCount,
pwordcount_t words,poptions_t options);
void sortmatch(pprimermatch_t table,uint32_t N);
ppairtree_t initpairtree(ppairtree_t tree);
ppair_t pairintree (pair_t key,ppairtree_t pairlist);
ppair_t insertpair(pair_t key,ppairtree_t list);
/*TR: Added*/
ppairtree_t buildPrimerPairs(pecodnadb_t seqdb,uint32_t seqdbsize,pprimercount_t primers,poptions_t options);
int32_t counttaxon(int32_t taxid);
int32_t getrankdbstats(pecodnadb_t seqdb,
uint32_t seqdbsize,
ecotaxonomy_t *taxonomy,
poptions_t options);
float taxonomycoverage(ppair_t pair, poptions_t options, pecodnadb_t seqdb,uint32_t seqdbsize);
char ecoComplementChar(char base);
void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize);
int32_t *filteringSeq(pecodnadb_t database, uint32_t seqdbsize,
uint32_t exampleCount,poptions_t options,uint32_t *size,int32_t sequenceQuorum);
void printSeqTest(pecodnadb_t seqdb,uint32_t seqdbsize);
#endif /* EPSORT_H_ */

View File

@ -0,0 +1,14 @@
/*
* ecotype.h
*
* Created on: 24 nov. 2008
* Author: coissac
*/
#ifndef ECOTYPE_H_
#define ECOTYPE_H_
typedef enum { FALSE=0,TRUE=1} bool_t, *pbool_t;
#endif /* ECOTYPE_H_ */

View File

@ -0,0 +1,188 @@
/*
* filtering.c
*
* Created on: 12 mai 2009
* Author: coissac
*/
#include "ecoprimer.h"
#include <string.h>
#include <math.h>
#include "hashencoder.h"
static int32_t *ecoFilteringHashSequence(int32_t *dest,
uint32_t circular,
uint32_t doublestrand,
ecoseq_t *seq,
uint32_t *size);
static int32_t *ecoFilteringHashSequence(int32_t *dest,
uint32_t circular,
uint32_t doublestrand,
ecoseq_t *seq,
uint32_t *size)
{
static char *in_last_seq=NULL;
uint32_t i=0;
uint32_t j;
char *base;
int8_t code;
int32_t error=0;
word_t word=0;
word_t antiword=0;
uint32_t goodword;
uint32_t lmax=0;
// run on the first call;
if (dest==(void*)-1)
{
if (in_last_seq) ECOFREE(in_last_seq,"Free in last seq table");
return NULL;
}
*size = pow(4,FWORDSIZE);
if (!in_last_seq)
in_last_seq = ECOMALLOC(*size*sizeof(char),
"Cannot allocate filtering hash table");
memset(in_last_seq,0,*size*sizeof(char));
if (!dest)
{
dest = ECOMALLOC(*size*sizeof(int32_t),
"Cannot allocate filtering hash table");
memset(dest,0,*size*sizeof(int32_t));
}
lmax = seq->SQ_length;
if (!circular)
lmax-= FWORDSIZE-1;
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i));
for (i=0, base = seq->SQ; i < FWORDSIZE && i < lmax; i++,base++)
{
error<<= 1;
error&=ERRORMASK(FWORDSIZE);
code = encoder[(*base) - 'A'];
if (code <0)
{
code = 0;
error|= 1;
}
word=RAPPENDBASE(word,FWORDSIZE,code);
if (doublestrand)
antiword=LAPPENDBASE(antiword,FWORDSIZE,code);
}
if (!error && i==FWORDSIZE)
{
goodword=(uint32_t)((doublestrand) ? MINI(word,antiword):word);
if (!in_last_seq[goodword])
{
in_last_seq[goodword]=1;
dest[goodword]++;
}
}
for (j=1; j < lmax; j++,i++,base++)
{
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j));
/* roll over the sequence for circular ones */
if (i==(uint32_t)seq->SQ_length) base=seq->SQ;
error<<= 1;
error&=ERRORMASK(FWORDSIZE);
//code = -1;
//if((*base) >= 'A' && (*base) <= 'Z')
code = encoder[(*base) - 'A'];
if (code <0)
{
code = 0;
error|= 1;
}
word=RAPPENDBASE(word,FWORDSIZE,code);
if (doublestrand)
antiword=LAPPENDBASE(antiword,FWORDSIZE,code);
if (!error)
{
if (doublestrand)
goodword=(uint32_t)MINI(word,antiword);
else
goodword=word;
if (!in_last_seq[goodword])
{
in_last_seq[goodword]=1;
dest[goodword]++;
}
}
}
return dest;
}
int32_t *filteringSeq(pecodnadb_t database, uint32_t seqdbsize,
uint32_t exampleCount,poptions_t options,uint32_t *size,int32_t sequenceQuorum)
{
int32_t *wordscount=NULL;
int32_t keep=0;
uint32_t i,j=0;
for (i=0;i<seqdbsize;i++)
{
if (database[i]->isexample && database[i]->SQ_length > options->primer_length)
{
j++;
wordscount=ecoFilteringHashSequence(wordscount,
options->circular,
options->doublestrand,
database[i],
size);
}
fprintf(stderr," Filtered sequences %5u/%5u \r",j,exampleCount);
}
fprintf(stderr,"\n");
for (i=0;i<*size;i++)
if (wordscount[i] >= sequenceQuorum)
keep++;
(void)ecoFilteringHashSequence((int32_t*)-1,
options->circular,
options->doublestrand,
NULL,
NULL);
fprintf(stderr,"ok\n Considered word of size %d for filtering : %d\n",FWORDSIZE,keep);
return wordscount;
}

View File

@ -0,0 +1,64 @@
/*
* goodtaxon.c
*
* Created on: 7 nov. 2008
* Author: coissac
*/
#include "ecoprimer.h"
int isGoodTaxon(ecotaxonomy_t *taxonomy,int32_t taxon,poptions_t options)
{
int result;
result=((options->r == 0) || (eco_is_taxid_included(taxonomy,
options->restricted_taxid,
options->r,
taxonomy->taxons->taxon[taxon].taxid)
)) &&
((options->e == 0) || !(eco_is_taxid_included(taxonomy,
options->exception_taxid,
options->e,
taxonomy->taxons->taxon[taxon].taxid)
));
return result;
}
int isExampleTaxon(ecotaxonomy_t *taxonomy,int32_t taxon,poptions_t options)
{
int result;
result=( (options->r == 0) || (eco_is_taxid_included(taxonomy,
options->restricted_taxid,
options->r,
taxonomy->taxons->taxon[taxon].taxid)
)) &&
((options->e == 0) || !(eco_is_taxid_included(taxonomy,
options->exception_taxid,
options->e,
taxonomy->taxons->taxon[taxon].taxid)
));
return result;
}
int isCounterExampleTaxon(ecotaxonomy_t *taxonomy,int32_t taxon,poptions_t options)
{
int result;
result=((options->g != 0) && (eco_is_taxid_included(taxonomy,
options->ignored_taxid,
options->g,
taxonomy->taxons->taxon[taxon].taxid))
) || ((options->e != 0) && (eco_is_taxid_included(taxonomy,
options->exception_taxid,
options->e,
taxonomy->taxons->taxon[taxon].taxid))
);
return result;
}

View File

@ -0,0 +1,21 @@
/*
* hashencoder.h
*
* Created on: 12 mai 2009
* Author: coissac
*/
#ifndef HASHENCODER_H_
#define HASHENCODER_H_
static int8_t encoder[] = {0, // A
-1, // b
1, // C
-1,-1,-1, // d, e, f
2, // G
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // h,i,j,k,l,m,n,o,p,q,r,s
3,3, // T,U
-1,-1,-1,-1,-1}; // v,w,x,y,z
#endif /* HASHENCODER_H_ */

View File

@ -0,0 +1,243 @@
/*
* hashsequence.c
*
* Created on: 7 nov. 2008
* Author: coissac
*/
#include "ecoprimer.h"
static int cmpword(const void *x,const void *y);
#include "hashencoder.h"
uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq)
{
uint32_t wordcount;
wordcount = seq->SQ_length;
if (!circular) wordcount-=wordsize-1;
return wordcount;
}
pword_t ecoHashSequence(pword_t dest,
uint32_t wordsize,
uint32_t circular,
uint32_t doublestrand,
ecoseq_t *seq,
uint32_t *size,
int32_t *neededWords,
uint32_t neededWordCount,
int32_t quorum)
{
uint32_t i=0;
uint32_t j;
char *base;
int8_t code;
int32_t error=0;
word_t word=0;
word_t antiword=0;
word_t goodword;
uint32_t lmax=0;
(*size)=0;
lmax = seq->SQ_length;
if (!circular)
lmax-= wordsize-1;
if (!dest)
dest = ECOMALLOC(lmax*sizeof(word_t),
"I cannot allocate memory for sequence hashing"
);
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i));
for (i=0, base = seq->SQ; i < wordsize && i < lmax; i++,base++)
{
error<<= 1;
error&=ERRORMASK(wordsize);
code = encoder[(*base) - 'A'];
if (code <0)
{
code = 0;
error|= 1;
}
word=RAPPENDBASE(word,wordsize,code);
if (doublestrand)
antiword=LAPPENDBASE(antiword,wordsize,code);
if (neededWordCount && i>=(FWORDSIZE-1))
{
goodword = (doublestrand) ? MINI(FILTERWORD(word),CFILTERWORD(antiword,wordsize)):FILTERWORD(word);
if (neededWords[(uint32_t)goodword]<quorum)
error|= (1 << (FWORDSIZE-1));
}
}
if (!error && i==wordsize)
{
dest[*size]=(doublestrand) ? MINI(word,antiword):word;
(*size)++;
}
for (j=1; j < lmax; j++,i++,base++)
{
// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j));
/* roll over the sequence for circular ones */
if (i==(uint32_t)seq->SQ_length) base=seq->SQ;
error<<= 1;
error&=ERRORMASK(wordsize);
code = encoder[(*base) - 'A'];
if (code <0)
{
code = 0;
error|= 1;
}
word=RAPPENDBASE(word,wordsize,code);
if (doublestrand)
antiword=LAPPENDBASE(antiword,wordsize,code);
if (neededWordCount)
{
goodword = (doublestrand) ? MINI(FILTERWORD(word),CFILTERWORD(antiword,wordsize)):FILTERWORD(word);
if (neededWords[(uint32_t)goodword]<quorum)
error|= (1 << (FWORDSIZE-1));
// else
// DEBUG_LOG("%s goodword = %p %d/%d (pos:%d error:%d)",seq->AC,goodword,neededWords[(uint32_t)goodword],quorum,i,error);
}
if (!error)
{
dest[*size]=(doublestrand) ? MINI(word,antiword):word;
(*size)++;
}
}
// DEBUG_LOG("%s goodword = %d",seq->AC,*size);
return dest;
}
uint32_t ecoCompactHashSequence(pword_t table,uint32_t size)
{
uint32_t i,j;
word_t current;
// bool_t here=FALSE;
sortword(table,size);
current = 0;
current=SETMULTIWORD(current); /* build impossible word for the first loop cycle */
// if (strcmp(ecoUnhashWord(table[size-1],18),"GTTTGTTCAACGATTAAA")==0)
// here=TRUE;
for (i=0,j=0; j < size;j++)
{
if (WORD(table[j])!=current)
{
current =table[j];
table[i]=current;
i++;
}
else
table[i]=SETMULTIWORD(table[i]);
}
// if (strcmp(ecoUnhashWord(WORD(table[i-1]),18),"TACGACCTCGATGTTGGA")==0)
// DEBUG_LOG("winner %d",i)
return i;
}
const char* ecoUnhashWord(word_t word,uint32_t size)
{
static char buffer[32];
static char decode[]="ACGT";
uint32_t i;
for (i=0; i < size; i++)
{
buffer[i]=decode[(word >> (2 * (size - 1 -i))) & 3];
}
buffer[size]=0;
return buffer;
}
word_t ecoComplementWord(word_t word,uint32_t size)
{
word_t rep=0;
uint32_t i;
// DEBUG_LOG("%llx %llx",word,~word);
word=(~word) & WORDMASK(size);
for (i=0;i < size; i++)
{
rep = RAPPENDBASE(rep,size,word & 3LLU);
// DEBUG_LOG("%016llx %016llx %016llx",word,word & 3LLU,rep);
word>>=2;
}
// DEBUG_LOG("Complemented = %s",ecoUnhashWord(rep,18));
return rep;
}
static int cmpword(const void *x,const void *y)
{
word_t w1 = *(pword_t)x;
word_t w2 = *(pword_t)y;
w1 = WORD(w1);
w2 = WORD(w2);
if (w1 < w2)
return -1;
if (w1 > w2)
return +1;
return 0;
}
uint32_t ecoFindWord(pwordcount_t table,word_t word)
{
pword_t dest;
dest = (pword_t)bsearch((const void*)&word,(const void*)table->words,table->size,sizeof(word_t),cmpword);
if (dest)
return dest - table->words;
else
return ~0;
}
char ecoComplementChar(char base)
{
return (base < 4)? !base & 3: 4;
}

379
src/libecoprimer/libstki.c Normal file
View File

@ -0,0 +1,379 @@
/* ==================================================== */
/* Copyright (c) Atelier de BioInformatique */
/* Mar. 92 */
/* File: libstki.c */
/* Purpose: A library to deal with 'stacks' of */
/* integers */
/* Note: 'stacks' are dynamic (i.e. size is */
/* automatically readjusted when needed) */
/* History: */
/* 00/03/92 : <Gloup> first draft */
/* 15/08/93 : <Gloup> revised version */
/* 14/05/99 : <Gloup> last revision */
/* ==================================================== */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "libstki.h"
#include "ecoprimer.h"
/* ============================ */
/* Constantes et Macros locales */
/* ============================ */
#define ExpandStack(stkh) ResizeStacki((stkh), (*stkh)->size << 1)
#define ShrinkStack(stkh) ResizeStacki((stkh), (*stkh)->size >> 1)
static int16_t sStkiLastError = kStkiNoErr;
/* -------------------------------------------- */
/* gestion des erreurs */
/* get/reset erreur flag */
/* */
/* @function: StkiError */
/* -------------------------------------------- */
int16_t StkiError(bool_t reset)
{
int16_t err;
err = sStkiLastError;
if (reset)
sStkiLastError = kStkiNoErr;
return err;
} /* end of StkiError */
/* -------------------------------------------- */
/* creation d'un stack */
/* */
/* @function: NewStacki */
/* -------------------------------------------- */
StackiPtr NewStacki(int32_t size)
{
StackiPtr stki;
if (! (stki = NEW(Stacki)))
return NULL;
stki->size = size;
stki->top = 0;
stki->cursor = 0;
if ( ! (stki->val = NEWN(int32_t, size))) {
sStkiLastError = kStkiMemErr;
return FreeStacki(stki);
}
return stki;
} /* end of NewStacki */
/* -------------------------------------------- */
/* liberation d'un stack */
/* */
/* @function: FreeStacki */
/* -------------------------------------------- */
StackiPtr FreeStacki(StackiPtr stki)
{
if (stki) {
if (stki->val)
ECOFREE(stki->val,"Free stack values");
ECOFREE(stki,"Free stack");
}
return NULL;
} /* end of FreeStacki */
/* -------------------------------------------- */
/* creation d'un vecteur de stacks */
/* */
/* @function: NewStackiVector */
/* -------------------------------------------- */
StackiHdle NewStackiVector(int32_t vectSize, int32_t stackSize)
{
int32_t i;
StackiHdle stkh;
if (! (stkh = NEWN(StackiPtr, vectSize))) {
sStkiLastError = kStkiMemErr;
return NULL;
}
for (i = 0 ; i < vectSize ; i++)
if (! (stkh[i] = NewStacki(stackSize)))
return FreeStackiVector(stkh, i);
return stkh;
} /* end of NewStackiVector */
/* -------------------------------------------- */
/* liberation d'un vecteur de stacks */
/* */
/* @function: FreeStackiVector */
/* -------------------------------------------- */
StackiHdle FreeStackiVector(StackiHdle stkh, int32_t vectSize)
{
int32_t i;
if (stkh) {
for (i = 0 ; i < vectSize ; i++)
(void) FreeStacki(stkh[i]);
ECOFREE(stkh,"Free stack vector");
}
return NULL;
} /* end of FreeStackiVector */
/* -------------------------------------------- */
/* resize d'un stack */
/* */
/* @function: ResizeStacki */
/* -------------------------------------------- */
int32_t ResizeStacki(StackiHdle stkh, int32_t size)
{
int32_t resize = 0; /* assume error */
int32_t *val;
if ((val = ECOREALLOC((*stkh)->val, size * sizeof(int32_t),"Cannot reallocate stack values"))) {
(*stkh)->size = resize = size;
(*stkh)->val = val;
}
if (! resize)
sStkiLastError = kStkiMemErr;
return resize;
} /* end of ResizeStacki */
/* -------------------------------------------- */
/* empilage(/lement) */
/* */
/* @function: PushiIn */
/* -------------------------------------------- */
bool_t PushiIn(StackiHdle stkh, int32_t val)
{
if (((*stkh)->top >= (*stkh)->size) && (! ExpandStack(stkh)))
return FALSE;
(*stkh)->val[((*stkh)->top)++] = val;
return TRUE;
} /* end of PushiIn */
/* -------------------------------------------- */
/* depilage(/lement) */
/* */
/* @function: PopiOut */
/* -------------------------------------------- */
bool_t PopiOut(StackiHdle stkh, int32_t *val)
{
if ((*stkh)->top <= 0)
return FALSE;
*val = (*stkh)->val[--((*stkh)->top)];
if ( ((*stkh)->top < ((*stkh)->size >> 1))
&& ((*stkh)->top > kMinStackiSize))
(void) ShrinkStack(stkh);
return TRUE;
} /* end of PopiOut */
/* -------------------------------------------- */
/* lecture descendante */
/* */
/* @function: ReadiDown */
/* -------------------------------------------- */
bool_t ReadiDown(StackiPtr stki, int32_t *val)
{
if (stki->cursor <= 0)
return FALSE;
*val = stki->val[--(stki->cursor)];
return TRUE;
} /* end of ReadiDown */
/* -------------------------------------------- */
/* lecture ascendante */
/* */
/* @function: ReadiUp */
/* -------------------------------------------- */
bool_t ReadiUp(StackiPtr stki, int32_t *val)
{
if (stki->cursor >= stki->top)
return FALSE;
*val = stki->val[(stki->cursor)++];
return TRUE;
} /* end of ReadiUp */
/* -------------------------------------------- */
/* remontee/descente du curseur */
/* */
/* @function: CursiToTop */
/* @function: CursiToBottom */
/* -------------------------------------------- */
void CursiToTop(StackiPtr stki)
{
stki->cursor = stki->top;
} /* end of CursiToTop */
void CursiToBottom(stki)
StackiPtr stki;
{
stki->cursor = 0;
} /* end of CursiToBottom */
/* -------------------------------------------- */
/* echange des valeurs cursor <-> (top - 1) */
/* */
/* @function: CursiSwap */
/* -------------------------------------------- */
void CursiSwap(StackiPtr stki)
{
int32_t tmp;
if ((stki->top <= 0) || (stki->cursor < 0))
return;
tmp = stki->val[stki->cursor];
stki->val[stki->cursor] = stki->val[stki->top - 1];
stki->val[stki->top - 1] = tmp;
} /* end of CursiSwap */
/* -------------------------------------------- */
/* Recherche d'une valeur en stack a partir du */
/* curseur courant en descendant. */
/* on laisse le curseur a l'endroit trouve */
/* */
/* @function: SearchDownStacki */
/* -------------------------------------------- */
bool_t SearchDownStacki(StackiPtr stki, int32_t sval)
{
int32_t val;
bool_t more;
while ((more = ReadiDown(stki, &val)))
if (val == sval)
break;
return more;
} /* end of SearchDownStacki */
/* -------------------------------------------- */
/* Recherche dichotomique d'une valeur en stack */
/* le stack est suppose trie par valeurs */
/* croissantes. */
/* on place le curseur a l'endroit trouve */
/* */
/* @function: BinSearchStacki */
/* -------------------------------------------- */
bool_t BinSearchStacki(StackiPtr stki, int32_t sval)
{
int32_t midd, low, high, span;
low = 0;
high = stki->top - 1;
while (high >= low) {
midd = (high + low) / 2;
span = stki->val[midd] - sval;
if (span == 0) {
stki->cursor = midd;
return TRUE;
}
if (span > 0)
high = midd - 1;
else
low = midd + 1;
}
return FALSE;
} /* end of BinSearchStacki */
/* -------------------------------------------- */
/* teste l'egalite *physique* de deux stacks */
/* */
/* @function: SameStacki */
/* -------------------------------------------- */
bool_t SameStacki(StackiPtr stki1, StackiPtr stki2)
{
if (stki1->top != stki2->top)
return FALSE;
return ((memcmp(stki1->val, stki2->val,
stki1->top * sizeof(int32_t)) == 0) ? TRUE : FALSE);
} /* end of SameStacki */
/* -------------------------------------------- */
/* inverse l'ordre des elements dans un stack */
/* */
/* @function: ReverseStacki */
/* -------------------------------------------- */
bool_t ReverseStacki(StackiPtr stki)
{
int32_t *t, *b, swp;
if (stki->top <= 0)
return FALSE;
b = stki->val;
t = b + stki->top - 1;
while (t > b) {
swp = *t;
*t-- = *b;
*b++ = swp;
}
return TRUE;
} /* end of ReverseStacki */

View File

@ -0,0 +1,89 @@
/* ==================================================== */
/* Copyright (c) Atelier de BioInformatique */
/* Mar. 92 */
/* File: libstki.h */
/* Purpose: library of dynamic stacks holding */
/* integer values */
/* History: */
/* 00/03/92 : <Gloup> first draft */
/* 07/07/93 : <Gloup> complete revision */
/* 10/03/94 : <Gloup> added xxxVector funcs */
/* 14/05/99 : <Gloup> last revision */
/* ==================================================== */
#ifndef _H_libstki
#define _H_libstki
#include "ecotype.h"
/* ==================================================== */
/* Constantes de dimensionnement */
/* ==================================================== */
#ifndef kMinStackiSize
#define kMinStackiSize 2 /* taille mini stack */
#endif
#define kStkiNoErr 0 /* ok */
#define kStkiMemErr 1 /* not enough memory */
#define kStkiReset TRUE
#define kStkiGet FALSE
/* ==================================================== */
/* Macros standards */
/* ==================================================== */
#ifndef NEW
#define NEW(typ) (typ*)malloc(sizeof(typ))
#define NEWN(typ, dim) (typ*)malloc((uint32_t)(dim) * sizeof(typ))
#define REALLOC(typ, ptr, dim) (typ*)realloc((void *) (ptr), (uint32_t)(dim) * sizeof(typ))
#define FREE(ptr) free((Ptr) ptr)
#endif
/* ==================================================== */
/* Types & Structures de donnees */
/* ==================================================== */
/* -------------------- */
/* structure : pile */
/* -------------------- */
typedef struct Stacki {
/* ---------------------*/
int32_t size; /* stack size */
int32_t top; /* current free pos. */
int32_t cursor; /* current cursor */
int32_t *val; /* values */
/* ---------------------*/
} Stacki, *StackiPtr, **StackiHdle;
/* ==================================================== */
/* Prototypes (generated by mproto) */
/* ==================================================== */
/* libstki.c */
int16_t StkiError (bool_t reset );
StackiPtr NewStacki (int32_t size );
StackiPtr FreeStacki (StackiPtr stki );
StackiHdle NewStackiVector (int32_t vectSize, int32_t stackSize );
StackiHdle FreeStackiVector (StackiHdle stkh, int32_t vectSize );
int32_t ResizeStacki (StackiHdle stkh , int32_t size );
bool_t PushiIn (StackiHdle stkh , int32_t val );
bool_t PopiOut (StackiHdle stkh , int32_t *val );
bool_t ReadiDown (StackiPtr stki , int32_t *val );
bool_t ReadiUp (StackiPtr stki , int32_t *val );
void CursiToTop (StackiPtr stki );
void CursiToBottom (StackiPtr stki );
void CursiSwap (StackiPtr stki );
bool_t SearchDownStacki (StackiPtr stki , int32_t sval );
bool_t BinSearchStacki (StackiPtr stki , int32_t sval );
bool_t SameStacki (StackiPtr stki1 , StackiPtr stki2 );
bool_t ReverseStacki (StackiPtr stki );
#endif /* _H_libstki */

View File

@ -0,0 +1,7 @@
/*
* mapping.c
*
* Created on: 25 nov. 2008
* Author: coissac
*/

152
src/libecoprimer/merge.c Normal file
View File

@ -0,0 +1,152 @@
/*
* merge.c
*
* Created on: 11 nov. 2008
* Author: coissac
*/
#include "ecoprimer.h"
static pmerge_t mergeInit(pmerge_t merge,pwordcount_t data,uint32_t s1,uint32_t s2);
static pmerge_t mergeInit(pmerge_t merge, pwordcount_t data,uint32_t s1,uint32_t s2)
{
merge->words = data->words;
merge->count = data->strictcount;
merge->write = 0;
merge->read1 = 0;
merge->read2 = s1;
merge->size = s1+s2;
return merge;
}
typedef enum {S1=1,S2=2,STACK=3} source_t;
void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,uint32_t seqQuorum)
{
merge_t merged;
source_t source;
word_t currentword,tmpword;
uint32_t currentcount,tmpcount;
int same;
queue_t queue;
int nsame=0;
uint32_t maxcount=0;
bool_t writed=TRUE;
// DEBUG_LOG("Coucou %p s1= %d s2= %d",data,s1,s2)
(void)mergeInit(&merged,data,s1,s2);
(void)newQueue(&queue,MINI(s1,s2));
while (merged.read1 < s1 || merged.read2 < merged.size)
{
if (! queue.empty)
{
currentword = queue.words[queue.pop];
currentcount = queue.count[queue.pop];
source=STACK;
}
else
{
currentword = merged.words[merged.read1];
currentcount = merged.count[merged.read1];
source=S1;
}
if (merged.read2 < merged.size &&
WORD(currentword) > WORD(merged.words[merged.read2]))
{
currentword = merged.words[merged.read2];
currentcount = merged.count[merged.read2];
source = S2;
}
same = (source != S2) && (WORD(currentword) == WORD(merged.words[merged.read2]));
nsame+=same;
// DEBUG_LOG("Merging : r1 = %d s1 = %d r2 = %d size = %d word = %s source=%u same=%u",merged.read1,s1,merged.read2-s1,merged.size,ecoUnhashWord(currentword,18),source,same)
tmpword = merged.words[merged.write];
tmpcount= merged.count[merged.write];
merged.words[merged.write] = currentword;
merged.count[merged.write] = currentcount;
if (source != S2)
{
if (same)
{
merged.count[merged.write]+=merged.count[merged.read2];
if (ISMULTIWORD(currentword) || ISMULTIWORD(merged.words[merged.read2]))
merged.words[merged.write]=SETMULTIWORD(currentword);
merged.read2++;
}
if (source==STACK)
pop(&queue);
merged.read1++;
}
else
merged.read2++;
if (writed && merged.read1 <= merged.write && merged.write < s1)
push(&queue,tmpword,tmpcount);
if (merged.count[merged.write] > maxcount)
maxcount=merged.count[merged.write];
writed = remainingSeq + merged.count[merged.write] >= seqQuorum;
if (writed)
merged.write++;
// else
// DEBUG_LOG("Remove word : %s count : %d remainingSeq : %d total : %d Quorum : %d",
// ecoUnhashWord(currentword,18),merged.count[merged.write],remainingSeq,maxcount+remainingSeq,seqQuorum);
} /* while loop */
// DEBUG_LOG("r1 : %d r2 : %d qsize : %d nsame : %d tot : %d write : %s count : %d source : %d size : %d pop : %d push : %d empty : %d",merged.read1,merged.read2-s1,qsize,nsame,qsize+nsame,ecoUnhashWord(currentword,18),currentcount,source,queue.size,queue.pop,queue.push,queue.empty)
if (merged.read2 < merged.size)
{
//DEBUG_LOG("end1 %d %d/%d %d/%d",merged.write,merged.read1,s1,merged.read2,merged.size);
for (;merged.read2 < merged.size;merged.read2++)
{
merged.words[merged.write]=merged.words[merged.read2];
merged.count[merged.write]=merged.count[merged.read2];
if (remainingSeq + merged.count[merged.write] >= seqQuorum)
merged.write++;
}
}
else {
//DEBUG_LOG("end2 %d %d/%d %d/%d",merged.write,merged.read1,s1,merged.read2,merged.size);
while (! queue.empty)
{
// DEBUG_LOG("write : %s count : %d write : %d size : %d pop : %d push : %d empty : %d",ecoUnhashWord(queue.words[queue.pop],18),queue.count[queue.pop],merged.write,queue.size,queue.pop,queue.push,queue.empty)
merged.words[merged.write]=queue.words[queue.pop];
merged.count[merged.write]=queue.count[queue.pop];
pop(&queue);
if (remainingSeq + merged.count[merged.write] >= seqQuorum)
merged.write++;
}
}
data->size = merged.write;
cleanQueue(&queue);
// DEBUG_LOG("Max count : %d remainingSeq : %d total : %d Quorum : %d",maxcount,remainingSeq,maxcount+remainingSeq,seqQuorum)
// DEBUG_LOG("Second word : %s",ecoUnhashWord(data->words[1],18))
// DEBUG_LOG("Last word : %s",ecoUnhashWord(data->words[data->size-1],18))
}

460
src/libecoprimer/pairs.c Normal file
View File

@ -0,0 +1,460 @@
/*
* pairs.c
*
* Created on: 15 d<>c. 2008
* Author: coissac
*/
#include "ecoprimer.h"
#include <string.h>
#include <stdlib.h>
#include "../libthermo/thermostats.h"
static void buildPrimerPairsForOneSeq(uint32_t seqid,
pecodnadb_t seqdb,
pprimercount_t primers,
ppairtree_t pairs,
poptions_t options);
/*************************************
*
* pair collection management
*
*************************************/
#ifdef MASKEDCODE
char *addamplifiasetelem (ppair_t pair, char* amplifia, int32_t taxid)
{
uint32_t i;
uint32_t j;
char *ampused = NULL;
if(pair->ampsetcount == 0)
{
pair->ampsetcount = 500;
pair->ampsetindex = 0;
pair->ampset = ECOMALLOC(pair->ampsetcount * sizeof(ampseqset_t),"Cannot allocate amplifia set");
}
for (i = 0; i < pair->ampsetindex; i++)
{
if (strcmp (pair->ampset[i].amplifia, amplifia) == 0)
{
ampused = pair->ampset[i].amplifia;
break;
}
}
if (i == 0)
{
pair->ampset[i].seqidcount = 100;
pair->ampset[i].seqidindex = 0;
pair->ampset[i].taxonids = ECOMALLOC(pair->ampset[i].seqidcount * sizeof(uint32_t),"Cannot allocate amplifia sequence table");
}
if (pair->ampsetindex == pair->ampsetcount)
{
pair->ampsetcount += 500;
pair->ampset = ECOREALLOC(pair->ampset, pair->ampsetcount * sizeof(ampseqset_t), "Cannot allocate amplifia set");
}
if (pair->ampset[i].seqidindex == pair->ampset[i].seqidcount)
{
pair->ampset[i].seqidcount += 100;
pair->ampset[i].taxonids = ECOREALLOC(pair->ampset[i].taxonids, pair->ampset[i].seqidcount * sizeof(int32_t), "Cannot allocate amplifia sequence table");
}
if (pair->ampset[i].amplifia == NULL)
{
pair->ampset[i].amplifia = amplifia;
pair->ampsetindex++;
}
for (j = 0; j < pair->ampset[i].seqidindex; j++)
{
if (pair->ampset[i].taxonids[j] == taxid) break;
}
if (j == pair->ampset[i].seqidindex)
pair->ampset[i].taxonids[pair->ampset[i].seqidindex++] = taxid;
return ampused;
}
void addtaxampsetelem (ppair_t pair, int32_t taxid, char *amplifia)
{
uint32_t i;
uint32_t j;
if(pair->taxsetcount == 0)
{
pair->taxsetcount = 500;
pair->taxsetindex = 0;
pair->taxset = ECOMALLOC(pair->taxsetcount * sizeof(taxampset_t),"Cannot allocate taxon set");
}
for (i = 0; i < pair->taxsetindex; i++)
{
if (pair->taxset[i].taxonid == taxid) break;
}
if (i == 0)
{
pair->taxset[i].amplifiacount = 100;
pair->taxset[i].amplifiaindex = 0;
pair->taxset[i].amplifia = ECOMALLOC(pair->taxset[i].amplifiacount * sizeof(char *),"Cannot allocate amplifia table");
}
if (pair->taxsetindex == pair->taxsetcount)
{
pair->taxsetcount += 500;
pair->taxset = ECOREALLOC(pair->taxset, pair->taxsetcount * sizeof(taxampset_t), "Cannot allocate taxon set");
}
if (pair->taxset[i].amplifiaindex == pair->taxset[i].amplifiacount)
{
pair->taxset[i].amplifiacount += 100;
pair->taxset[i].amplifia = ECOREALLOC(pair->taxset[i].amplifia, pair->taxset[i].amplifiacount * sizeof(char *), "Cannot allocate amplifia table");
}
if (pair->taxset[i].taxonid == 0)
{
pair->taxset[i].taxonid = taxid;
pair->taxsetindex++;
}
for (j = 0; j < pair->taxset[i].amplifiaindex; j++)
{
if (strcmp(pair->taxset[i].amplifia[j], amplifia) == 0) break;
}
if (j == pair->taxset[i].amplifiaindex)
{
pair->taxset[i].amplifia[j] = amplifia;
pair->taxset[i].amplifiaindex++;
}
}
char *getamplifia (pecoseq_t seq, uint32_t start, uint32_t len)
{
fprintf(stderr,"start : %d length : %d\n",start,len);
char *amplifia = ECOMALLOC((len + 1) * sizeof(char),"Cannot allocate amplifia");
char *seqc = &seq->SQ[start];
strncpy(amplifia, seqc, len);
return amplifia;
}
#endif
/*TR: Added*/
ppairtree_t buildPrimerPairs(pecodnadb_t seqdb,uint32_t seqdbsize,pprimercount_t primers,poptions_t options)
{
uint32_t i;
ppairtree_t primerpairs;
primerpairs = initpairtree(NULL);
for (i=0; i < seqdbsize; i++)
{
buildPrimerPairsForOneSeq(i, seqdb, primers, primerpairs, options);
}
return primerpairs;
}
#define DMAX (2000000000)
static void buildPrimerPairsForOneSeq(uint32_t seqid,
pecodnadb_t seqdb,
pprimercount_t primers,
ppairtree_t pairs,
poptions_t options)
{
static uint32_t paircount=0;
uint32_t i,j,k;
uint32_t matchcount=0;
pprimermatch_t matches = NULL;
//primermatchcount_t seqmatchcount;
ppair_t pcurrent;
pair_t current;
pprimer_t wswp;
bool_t bswp;
size_t distance;
bool_t strand;
//char prmr[50];
//float mtemp;
word_t w1, w1a, omask = (0x1L << (options->strict_three_prime*2)) -1;
word_t w2, w2a;//, wtmp;
uint32_t bp1,bp2;
//prmr[options->primer_length] = '\0';
for (i=0;i < primers->size; i++)
{
matchcount+=primers->primers[i].directCount[seqid];
matchcount+=primers->primers[i].reverseCount[seqid];
}
if (matchcount <= 0)
return;
matches = ECOMALLOC(matchcount * sizeof(primermatch_t),"Cannot allocate primers match table");
for (i=0,j=0;i < primers->size; i++)
{
if (primers->primers[i].directCount[seqid])
{
if (primers->primers[i].directCount[seqid]==1)
{
matches[j].primer = primers->primers+i;
matches[j].strand=TRUE;
matches[j].position=primers->primers[i].directPos[seqid].value;
j++;
}
else for (k=0; k < primers->primers[i].directCount[seqid]; k++,j++)
{
matches[j].primer = primers->primers+i;
matches[j].strand=TRUE;
matches[j].position=primers->primers[i].directPos[seqid].pointer[k];
}
}
if (primers->primers[i].reverseCount[seqid])
{
if (primers->primers[i].reverseCount[seqid]==1)
{
matches[j].primer = primers->primers+i;
matches[j].strand=FALSE;
matches[j].position=primers->primers[i].reversePos[seqid].value;
j++;
}
else for (k=0; k < primers->primers[i].reverseCount[seqid]; k++,j++)
{
matches[j].primer = primers->primers+i;
matches[j].strand=FALSE;
matches[j].position=primers->primers[i].reversePos[seqid].pointer[k];
}
}
}
if (matchcount>1)
{
// fprintf(stderr,"\n====================================\n");
sortmatch(matches,matchcount); // sort in ascending order by position
for (i=0; i < matchcount;i++)
{
// For all primers matching the sequence
/*for(j=i+1;
(j<matchcount)
&& ((distance=matches[j].position - matches[i].position - options->primer_length) < options->lmax);
j++
)//*/
for (j=i+1; j<matchcount; j++)
{
if (matches[j].position - matches[i].position <= options->primer_length) continue;
distance = matches[j].position - matches[i].position - options->primer_length;
if (distance >= options->lmax) break;
// For all not too far primers
if ( (matches[i].primer->good || matches[j].primer->good)
&& (distance > options->lmin)
)
{
// If possible primer pair
current.p1 = matches[i].primer;
current.asdirect1=matches[i].strand;
current.p2 = matches[j].primer;
current.asdirect2= !matches[j].strand;
current.maxd=DMAX;
current.mind=DMAX;
current.sumd=0;
current.amplifiacount=0;
current.inexample=0;
current.outexample=0;
current.curseqid = 0;
current.refsequence=-1;
//current.p1temp = 100;
//current.p1mintemp = 100;
//current.p2temp = 100;
//current.p2mintemp = 100;
// Standardize the pair
strand = current.p2->word > current.p1->word;
if (!strand)
{
wswp = current.p1;
current.p1=current.p2;
current.p2=wswp;
bswp = current.asdirect1;
current.asdirect1=current.asdirect2;
current.asdirect2=bswp;
}
//Code to make sure that if -3 option is given then
//3' end must match upto given number of base pairs
if (options->strict_three_prime > 0)
{
w1 = current.p1->word;
w2 = current.p2->word;
if (!current.asdirect1) //make sure that word is from 5' to 3'
w1=ecoComplementWord(w1,options->primer_length);
if (!current.asdirect2) //make sure that word is from 5' to 3'
w2=ecoComplementWord(w2,options->primer_length);
//now both w1 and w2 are from 5' to 3' end
bp1 = matches[i].position;
bp2 = matches[j].position;
if (!strand)
{
bp1 = matches[j].position;
bp2 = matches[i].position;
}
//get word of first approximate repeat
w1a = extractSite(seqdb[seqid]->SQ,bp1,options->primer_length,strand);
//get word of second approximate repeat
w2a = extractSite(seqdb[seqid]->SQ,bp2,options->primer_length,!strand);
w1 = w1 & omask; //keep only strict_three_prime bases on the right (3') end
w2 = w2 & omask; //keep only strict_three_prime bases on the right (3') end
w1a = w1a & omask; //keep only strict_three_prime bases on the right (3') end
w2a = w2a & omask; //keep only strict_three_prime bases on the right (3') end
//now check that both words and primers of amplifia have same bases on 3' end
if ((w1 ^ w1a) != 0) continue;
if ((w2 ^ w2a) != 0) continue;
}
// Look for the new pair in already seen pairs
pcurrent = insertpair(current,pairs);
if (seqdb[seqid]->isexample)
{
//pcurrent->inexample++;
pcurrent->sumd+=distance;
pcurrent->amplifiacount++;
if ((pcurrent->maxd==DMAX) || (distance > pcurrent->maxd))
pcurrent->maxd = distance;
if (distance < pcurrent->mind)
pcurrent->mind = distance;
}
//else
// pcurrent->outexample++;
//for each pair we save current sequence id in the pair
//when we see this pair for the first time in currnet sequence
//because we want to increment inexample & outexample count
//only once for one sequence
if (pcurrent->curseqid != (seqid+1))
{
if (seqdb[seqid]->isexample)
pcurrent->inexample++;
else
pcurrent->outexample++;
if (pcurrent->curseqid != 0)
pcurrent->curseqid = seqid+1;
}
/*if ((pcurrent->outexample+pcurrent->inexample)==0)
{
fprintf(stderr,"pcurrent->outexample+pcurrent->inexample=0!\n");
exit(0);
}*/
if (pcurrent->curseqid == 0)//((pcurrent->outexample+pcurrent->inexample)==1)
{
pcurrent->curseqid = seqid+1;
paircount++;
pcurrent->pcr.ampslot=200;
pcurrent->pcr.ampcount=0;
pcurrent->pcr.amplifias = ECOMALLOC(sizeof(amplifia_t)*pcurrent->pcr.ampslot,
"Cannot allocate amplifia table");
}
else
{
if (pcurrent->pcr.ampslot==pcurrent->pcr.ampcount)
{
pcurrent->pcr.ampslot+=200;
pcurrent->pcr.amplifias = ECOREALLOC(pcurrent->pcr.amplifias,
sizeof(amplifia_t)*pcurrent->pcr.ampslot,
"Cannot allocate amplifia table");
}
}
if (seqid==options->refseqid)
pcurrent->refsequence=seqid;
pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].length=distance;
pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].sequence=seqdb[seqid];
pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].strand=strand;
pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].begin=matches[i].position + options->primer_length;
pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].end= matches[j].position - 1;
if (strand)
pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].amplifia= seqdb[seqid]->SQ + matches[i].position + options->primer_length;
else
pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].amplifia= seqdb[seqid]->SQ + matches[j].position - 1 ;
/*strncpy (prmr, seqdb[seqid]->SQ + matches[i].position, options->primer_length);
mtemp = nparam_CalcSelfTM (options->pnparm, prmr, options->primer_length) - 273.0;
if (mtemp < pcurrent->p1mintemp)
pcurrent->p1mintemp = mtemp;
//fprintf (stderr, "prmr1: %s\n", seqdb[seqid]->SQ);
strncpy (prmr, seqdb[seqid]->SQ + matches[j].position, options->primer_length);
mtemp = nparam_CalcSelfTM (options->pnparm, prmr, options->primer_length) - 273.0;
if (mtemp < pcurrent->p2mintemp)
pcurrent->p2mintemp = mtemp;
//fprintf (stderr, "prmr2: %s\n", prmr);
if (pcurrent->p1temp == 100)
pcurrent->p1temp = nparam_CalcSelfTM (options->pnparm, ecoUnhashWord(pcurrent->p1->word, options->primer_length), 0) - 273.0;
if (pcurrent->p2temp == 100)
pcurrent->p2temp = nparam_CalcSelfTM (options->pnparm, ecoUnhashWord(pcurrent->p2->word, options->primer_length), 0) - 273.0;
*/
pcurrent->pcr.ampcount++;
// fprintf(stderr,"%c%c W1 : %s direct : %c",
// "bG"[(int)pcurrent->p1->good],
// "bG"[(int)pcurrent->p2->good],
// ecoUnhashWord(pcurrent->p1->word, options->primer_length),
// "><"[(int)pcurrent->asdirect1]
// );
//
// fprintf(stderr," W2 : %s direct : %c distance : %d (min/max/avg : %d/%d/%f) in/out: %d/%d %c (%d pairs)\n",
// ecoUnhashWord(pcurrent->p2->word, options->primer_length),
// "><"[(int)pcurrent->asdirect2],
// distance,
// pcurrent->mind,pcurrent->maxd,
// (pcurrent->inexample) ? (float)pcurrent->sumd/pcurrent->inexample:0.0,
// pcurrent->inexample,pcurrent->outexample,
// " N"[(pcurrent->outexample+pcurrent->inexample)==1],
// paircount
//
// );
//
}
}
}
}
pairs->count=paircount;
}

136
src/libecoprimer/pairtree.c Normal file
View File

@ -0,0 +1,136 @@
/*
* pairtree.c
*
* Created on: 7 mars 2009
* Author: coissac
*/
#include "ecoprimer.h"
#include <search.h>
static void cleanpair(ppair_t pair);
static void deletepairlist(ppairlist_t list);
static int cmppair(const void* p1,const void*p2);
static void cleanamplifiatlist(pamplifiacount_t list)
{
if (list->amplifias)
ECOFREE(list->amplifias,
"Free amplifia list");
}
static void cleanpair(ppair_t pair)
{
cleanamplifiatlist(&(pair->pcr));
}
static ppairlist_t newpairlist(ppairlist_t parent, size_t size)
{
ppairlist_t tmp;
tmp=ECOMALLOC(sizeof(pairlist_t)+sizeof(pair_t)*(size-1),
"Cannot allocate new pair list");
tmp->pairslots=size;
tmp->paircount=0;
tmp->next=NULL;
if (parent)
parent->next=(void*)tmp;
return tmp;
}
static void deletepairlist(ppairlist_t list)
{
size_t i;
if (list)
{
if (list->next)
{
deletepairlist(list->next);
list->next=NULL;
}
for (i=0; i < list->paircount; i++)
cleanpair((list->pairs)+i);
ECOFREE(list,"Delete pair list");
}
}
static int cmppair(const void* p1,const void*p2)
{
ppair_t pr1,pr2;
pr1=(ppair_t)p1;
pr2=(ppair_t)p2;
if (pr1->p1 < pr2->p1) return -1;
if (pr1->p1 > pr2->p1) return 1;
if (pr1->asdirect1 < pr2->asdirect1) return -1;
if (pr1->asdirect1 > pr2->asdirect1) return 1;
if (pr1->p2 < pr2->p2) return -1;
if (pr1->p2 > pr2->p2) return 1;
if (pr1->asdirect2 < pr2->asdirect2) return -1;
if (pr1->asdirect2 > pr2->asdirect2) return 1;
return 0;
}
ppair_t pairintree (pair_t key,
ppairtree_t pairlist)
{
if (!pairlist->tree)
return NULL;
return *((ppair_t*)tsearch((const void *)(&key),
&(pairlist->tree),
cmppair
));
}
ppair_t insertpair(pair_t key,
ppairtree_t list)
{
ppair_t current;
ppair_t found;
if (list->last->paircount==list->last->pairslots)
{
list->last->next=newpairlist(list->last,100);
list->last=list->last->next;
}
current = list->last->pairs + list->last->paircount;
*current=key;
found = *((ppair_t*)tsearch((const void *)current,
&(list->tree),
cmppair));
if (found==current)
list->last->paircount++;
return found;
}
ppairtree_t initpairtree(ppairtree_t tree)
{
if (!tree)
tree = ECOMALLOC(sizeof(pairtree_t),"Cannot allocate pair tree");
tree->first=newpairlist(NULL,300);
tree->last=tree->first;
tree->tree=NULL;
tree->count=0;
return tree;
}

100
src/libecoprimer/queue.c Normal file
View File

@ -0,0 +1,100 @@
/*
* queue.c
*
* Created on: 14 nov. 2008
* Author: coissac
*/
#include "ecoprimer.h"
pqueue_t newQueue(pqueue_t queue, uint32_t size)
{
if (!queue)
queue = ECOMALLOC(sizeof(queue_t),"Cannot allocate queue structure");
queue->size=0;
resizeQueue(queue,size);
return queue;
}
pqueue_t resizeQueue(pqueue_t queue, uint32_t size)
{
queue->pop=0;
queue->push=0;
queue->empty=TRUE;
queue->full=FALSE;
if (!queue->size)
{
queue->count=ECOMALLOC(size * sizeof(uint32_t),
"Cannot allocate count queue array"
);
queue->words=ECOMALLOC(size * sizeof(word_t),
"Cannot allocate word queue array"
);
queue->size=size;
}
else if (size > queue->size)
{
queue->count=ECOREALLOC(queue->count,
size * sizeof(uint32_t),
"Cannot allocate count queue array"
);
queue->words=ECOREALLOC(queue->words,
size * sizeof(word_t),
"Cannot allocate word queue array"
);
queue->size=size;
}
return queue;
}
pqueue_t cleanQueue(pqueue_t queue)
{
if (queue->size)
{
if (queue->count)
ECOFREE(queue->count,"Free count queue");
if (queue->words)
ECOFREE(queue->words,"Free words queue");
}
queue->size=0;
return queue;
}
void push(pqueue_t queue, word_t word, uint32_t count)
{
ECO_ASSERT(!queue->full,"Queue is full");
queue->count[queue->push]=count;
queue->words[queue->push]=word;
queue->push++;
if (queue->push==queue->size)
queue->push=0;
queue->full=queue->push==queue->pop;
queue->empty=FALSE;
}
void pop(pqueue_t queue)
{
ECO_ASSERT(!queue->empty,"Queue is empty");
queue->pop++;
if (queue->pop==queue->size)
queue->pop=0;
queue->empty=queue->push==queue->pop;
queue->full=FALSE;
}

View File

@ -0,0 +1,59 @@
/*
* readdnadb.c
*
* Created on: 7 nov. 2008
* Author: coissac
*/
#include "ecoprimer.h"
pecodnadb_t readdnadb(const char *name, ecotaxonomy_t *taxonomy, uint32_t *size,poptions_t options)
{
ecoseq_t *seq;
uint32_t buffsize=100;
pecodnadb_t db;
db = ECOMALLOC(buffsize*sizeof(ecoseq_t*),"I cannot allocate db memory");
for(seq=ecoseq_iterator(name), *size=0;
seq;
seq=ecoseq_iterator(NULL)
)
{
if (isExampleTaxon(taxonomy,seq->taxid,options) ||
isCounterExampleTaxon(taxonomy,seq->taxid,options))
{
if (*size==buffsize)
{
buffsize*=2;
db = ECOREALLOC(db,buffsize*sizeof(ecoseq_t*),"I cannot allocate db memory");
}
db[*size]=seq;
(*size)++;
}
else
{
delete_ecoseq(seq);
}
};
db = ECOREALLOC(db,(*size)*sizeof(ecoseq_t*),"I cannot allocate db memory");
return db;
}
void printSeqTest(pecodnadb_t seqdb,uint32_t seqdbsize)
{
uint32_t i;
char ch[11];
ch [10] = '\0';
for (i=0; i < seqdbsize; i++)
{
strncpy (ch, seqdb[i]->SQ, 10);
fprintf (stderr, "seq %d = %s\n", i, ch);
}
exit (0);
}

View File

@ -0,0 +1,265 @@
/*
* This file is part of the Sofia-SIP package
*
* Copyright (C) 2005 Nokia Corporation.
*
* Contact: Pekka Pessi <pekka.pessi@nokia.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* as published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA
*
*/
/**@file smoothsort.c
* @brief Smoothsort implementation
*
* Smoothsort is a in-place sorting algorithm with performance of O(NlogN)
* in worst case and O(n) in best case.
*
* @sa <a href="http://www.enterag.ch/hartwig/order/smoothsort.pdf">
* "Smoothsort, an alternative for sorting in-situ", E.D. Dijkstra, EWD796a</a>,
* &lt;http://www.enterag.ch/hartwig/order/smoothsort.pdf&gt;.
*
* @author Pekka Pessi <Pekka.Pessi@nokia.com>
*/
#include <assert.h>
#include <stdio.h>
#include <sys/types.h>
#include <inttypes.h> /* <EC> add sto switch from size_t to uint32_t */
/** Description of current stretch */
typedef struct {
uint32_t b, c; /**< Leonardo numbers */
unsigned long long p; /**< Concatenation codification */
} stretch;
/** Description of array */
typedef struct
{
void *m;
int (*less)(void *m, uint32_t a, uint32_t b);
void (*swap)(void *m, uint32_t a, uint32_t b);
} array;
static inline uint32_t stretch_up(stretch s[1])
{
uint32_t next;
s->p >>= 1;
next = s->b + s->c + 1, s->c = s->b, s->b = next;
return next;
}
static inline uint32_t stretch_down(stretch s[1], unsigned bit)
{
uint32_t next;
s->p <<= 1, s->p |= bit;
next = s->c, s->c = s->b - s->c - 1, s->b = next;
return next;
}
#if DEBUG_SMOOTHSORT
static char const *binary(unsigned long long p)
{
static char binary[65];
int i;
if (p == 0)
return "0";
binary[64] = 0;
for (i = 64; p; p >>= 1)
binary[--i] = "01"[p & 1];
return binary + i;
}
#else
#define DEBUG(x) ((void)0)
#endif
/**
* Sift the root of the stretch.
*
* The low values are sifted up (towards index 0) from root.
*
* @param array description of array to sort
* @param r root of the stretch
* @param s description of current stretch
*/
static void sift(array const *array, uint32_t r, stretch s)
{
while (s.b >= 3) {
uint32_t r2 = r - s.b + s.c;
if (!array->less(array->m, r - 1, r2)) {
r2 = r - 1;
stretch_down(&s, 0);
}
if (array->less(array->m, r2, r))
break;
DEBUG(("\tswap(%p @%zu <=> @%zu)\n", array, r, r2));
array->swap(array->m, r, r2); r = r2;
stretch_down(&s, 0);
}
}
/** Trinkle the roots of the given stretches
*
* @param array description of array to sort
* @param r root of the stretch
* @param s description of stretches to concatenate
*/
static void trinkle(array const *array, uint32_t r, stretch s)
{
DEBUG(("trinkle(%p, %zu, (%u, %s))\n", array, r, s.b, binary(s.p)));
while (s.p != 0) {
uint32_t r2, r3;
while ((s.p & 1) == 0)
stretch_up(&s);
if (s.p == 1)
break;
r3 = r - s.b;
if (array->less(array->m, r3, r))
break;
s.p--;
if (s.b < 3) {
DEBUG(("\tswap(%p @%zu <=> @%zu b=%u)\n", array, r, r3, s.b));
array->swap(array->m, r, r3); r = r3;
continue;
}
r2 = r - s.b + s.c;
if (array->less(array->m, r2, r - 1)) {
r2 = r - 1;
stretch_down(&s, 0);
}
if (array->less(array->m, r2, r3)) {
DEBUG(("swap(%p [%zu]=[%zu])\n", array, r, r3));
array->swap(array->m, r, r3); r = r3;
continue;
}
DEBUG(("\tswap(%p @%zu <=> @%zu b=%u)\n", array, r, r2, s.b));
array->swap(array->m, r, r2); r = r2;
stretch_down(&s, 0);
break;
}
sift(array, r, s);
}
/** Trinkles the stretches when the adjacent stretches are already trusty.
*
* @param array description of array to sort
* @param r root of the stretch
* @param stretch description of stretches to trinkle
*/
static void semitrinkle(array const *array, uint32_t r, stretch s)
{
uint32_t r1 = r - s.c;
DEBUG(("semitrinkle(%p, %zu, (%u, %s))\n", array, r, s.b, binary(s.p)));
if (array->less(array->m, r, r1)) {
DEBUG(("\tswap(%p @%zu <=> @%zu b=%u)\n", array, r, r1, s.b));
array->swap(array->m, r, r1);
trinkle(array, r1, s);
}
}
/** Sort array using smoothsort.
*
* Sort @a N elements from array @a base starting with index @a r with smoothsort.
*
* @param base pointer to array
* @param r lowest index to sort
* @param N number of elements to sort
* @param less comparison function returning nonzero if m[a] < m[b]
* @param swap swapper function exchanging elements m[a] and m[b]
*/
void su_smoothsort(void *base, uint32_t r, uint32_t N,
int (*less)(void *m, uint32_t a, uint32_t b),
void (*swap)(void *m, uint32_t a, uint32_t b))
{
stretch s = { 1, 1, 1 };
uint32_t q;
array const array[1] = {{ base, less, swap }};
assert(less && swap);
if (base == NULL || N <= 1 || less == NULL || swap == NULL)
return;
DEBUG(("\nsmoothsort(%p, %zu)\n", array, nmemb));
for (q = 1; q != N; q++, r++, s.p++) {
DEBUG(("loop0 q=%zu, b=%u, p=%s \n", q, s.b, binary(s.p)));
if ((s.p & 7) == 3) {
sift(array, r, s), stretch_up(&s), stretch_up(&s);
}
else /* if ((s.p & 3) == 1) */ { assert((s.p & 3) == 1);
if (q + s.c < N)
sift(array, r, s);
else
trinkle(array, r, s);
while (stretch_down(&s, 0) > 1)
;
}
}
trinkle(array, r, s);
for (; q > 1; q--) {
s.p--;
DEBUG(("loop1 q=%zu: b=%u p=%s\n", q, s.b, binary(s.p)));
if (s.b <= 1) {
while ((s.p & 1) == 0)
stretch_up(&s);
--r;
}
else /* if b >= 3 */ {
if (s.p) semitrinkle(array, r - (s.b - s.c), s);
stretch_down(&s, 1);
semitrinkle(array, --r, s);
stretch_down(&s, 1);
}
}
}

View File

@ -0,0 +1,51 @@
/*
* sortmatch.c
*
* Created on: 15 d<>c. 2008
* Author: coissac
*/
/*
* sortword.c
*
*
* Created on: 6 nov. 2008
* Author: coissac
*/
#include "ecoprimer.h"
#include <math.h>
void su_smoothsort(void *base, uint32_t r, uint32_t N,
int (*less)(void *m, uint32_t a, uint32_t b),
void (*swap)(void *m, uint32_t a, uint32_t b));
static int less(void *m, uint32_t a, uint32_t b);
static void swap(void *m, uint32_t a, uint32_t b);
void sortmatch(pprimermatch_t table,uint32_t N)
{
su_smoothsort((void*)table,0,N,less,swap);
}
int less(void *m, uint32_t a, uint32_t b)
{
pprimermatch_t t;
t = (pprimermatch_t)m;
return t[a].position <= t[b].position;
}
void swap(void *m, uint32_t a, uint32_t b)
{
primermatch_t tmp;
pprimermatch_t t;
t = (pprimermatch_t)m;
tmp = t[a];
t[a]= t[b];
t[b]= tmp;
}

View File

@ -0,0 +1,44 @@
/*
* sortword.c
*
*
* Created on: 6 nov. 2008
* Author: coissac
*/
#include "ecoprimer.h"
#include <math.h>
void su_smoothsort(void *base, uint32_t r, uint32_t N,
int (*less)(void *m, uint32_t a, uint32_t b),
void (*swap)(void *m, uint32_t a, uint32_t b));
static int less(void *m, uint32_t a, uint32_t b);
static void swap(void *m, uint32_t a, uint32_t b);
void sortword(pword_t table,uint32_t N)
{
su_smoothsort((void*)table,0,N,less,swap);
}
int less(void *m, uint32_t a, uint32_t b)
{
pword_t t;
t = (pword_t)m;
return WORD(t[a]) <= WORD(t[b]);
}
void swap(void *m, uint32_t a, uint32_t b)
{
word_t tmp;
pword_t t;
t = (pword_t)m;
tmp = t[a];
t[a]= t[b];
t[b]= tmp;
}

View File

@ -0,0 +1,264 @@
/*
* strictprimers.c
*
* Created on: 7 nov. 2008
* Author: coissac
*/
#define _GNU_SOURCE
#include "ecoprimer.h"
#include <string.h>
#include <math.h>
#include <sys/resource.h>
#include <unistd.h>
#include <stdio.h>
#ifndef RUSAGE_SELF
#define RUSAGE_SELF 0
#define RUSAGE_CHILDREN -1
#endif
static double timeval_subtract (struct timeval *x, struct timeval *y);
/* Subtract the `struct timeval' values X and Y,
Return elapsed secondes as a double. */
double timeval_subtract (struct timeval *x, struct timeval *y)
{
struct timeval result;
/* Perform the carry for the later subtraction by updating y. */
if (x->tv_usec < y->tv_usec) {
int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1;
y->tv_usec -= 1000000 * nsec;
y->tv_sec += nsec;
}
if (x->tv_usec - y->tv_usec > 1000000) {
int nsec = (x->tv_usec - y->tv_usec) / 1000000;
y->tv_usec += 1000000 * nsec;
y->tv_sec -= nsec;
}
/* Compute the time remaining to wait.
tv_usec is certainly positive. */
result.tv_sec = x->tv_sec - y->tv_sec;
result.tv_usec = x->tv_usec - y->tv_usec;
return (double)result.tv_sec + (double)result.tv_usec/1e6;
}
pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount)
{
uint32_t i;
uint32_t buffsize;
//wordcount_t t;
if (!table)
table = ECOMALLOC(sizeof(wordcount_t),"Cannot allocate memory for word count structure");
table->words=NULL;
table->size =0;
table->outseqcount=0;
table->inseqcount=0;
table->strictcount =0;
if (seq)
{
table->words = ecoHashSequence(NULL,wordsize,circular,doublestrand,seq,&buffsize,neededWords,neededWordCount,seqQuorum);
table->size = ecoCompactHashSequence(table->words,buffsize);
table->inseqcount=1;
table->strictcount =ECOMALLOC((table->size*sizeof(uint32_t)),
"Cannot allocate memory for word count table"
);
for (i=0; i < table->size; i++) table->strictcount[i]=1;
}
return table;
}
void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t exampleCount,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount)
{
uint32_t buffersize;
pword_t newtable;
uint32_t newsize;
uint32_t i;
buffersize = table->size + ecoWordCount(wordsize,circular,seq);
table->words = ECOREALLOC(table->words,buffersize*sizeof(word_t),
"\n\nCannot allocate memory to extend word table" );
newtable = table->words + table->size;
// DEBUG_LOG("Words = %x (%u) new = %x", table->words,table->size,newtable);
(void)ecoHashSequence(newtable,wordsize,circular,doublestrand,seq,&newsize,neededWords,neededWordCount,seqQuorum);
// DEBUG_LOG("new seq wordCount : %d",newsize);
newsize = ecoCompactHashSequence(newtable,newsize);
// DEBUG_LOG("compacted wordCount : %d",newsize);
buffersize = table->size + newsize;
// resize the count buffer
table->inseqcount++;
//fprintf (stderr, "\nOldAddress: %x", table->strictcount);
table->strictcount = ECOREALLOC(table->strictcount,(buffersize+5000)*sizeof(uint32_t),
"Cannot allocate memory to extend example word count table");
//fprintf (stderr, " NewAddress: %x\n", table->strictcount);
for (i=table->size; i < buffersize; i++)
table->strictcount[i]=1;
// Now we have to merge in situ the two tables
ecomerge(table,table->size,newsize,exampleCount - table->inseqcount,seqQuorum);
// DEBUG_LOG("Dictionnary size : %d",table->size);
}
pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
uint32_t exampleCount,poptions_t options)
{
struct rusage start;
struct rusage usage;
double seconde;
char *logfilename;
FILE *logfile;
uint32_t i;
bool_t first=TRUE;
pwordcount_t strictprimers=NULL;
uint64_t totallength=0;
uint32_t sequenceQuorum = (uint32_t)floor((float)exampleCount * options->strict_quorum);
int32_t *neededWords;
uint32_t neededWordCount;
fprintf(stderr,"Filtering... ");
if (options->filtering)
neededWords = filteringSeq(database,seqdbsize,exampleCount,options,&neededWordCount,(int32_t)sequenceQuorum);
else
{
neededWordCount=0;
neededWords=NULL;
}
if (options->statistics)
{
asprintf(&logfilename,"ecoprimer_%d.log",getpid());
logfile = fopen(logfilename,"w");
fprintf(logfile,"# seq\tlength\tsize\ttime\tspeed\n");
fclose(logfile);
}
fprintf(stderr," Primers should be at least present in %d/%d example sequences\n",sequenceQuorum,exampleCount);
strictprimers = initCountTable(NULL,options->primer_length,
options->circular,
options->doublestrand,
0,
NULL,NULL,0);
getrusage(RUSAGE_SELF,&start);
for (i=0;i<seqdbsize;i++)
{
if (database[i]->isexample && database[i]->SQ_length > options->primer_length)
{
if (first)
{
strictprimers = initCountTable(strictprimers,options->primer_length,
options->circular,
options->doublestrand,
sequenceQuorum,
database[i],neededWords,neededWordCount);
first=FALSE;
}
else
{
uint32_t s;
s = strictprimers->size;
// DEBUG_LOG("stack size : %u",s);
addSeqToWordCountTable(strictprimers,options->primer_length,
options->circular,
options->doublestrand,
exampleCount,
sequenceQuorum,
database[i],neededWords,neededWordCount);
};
totallength+=database[i]->SQ_length;
getrusage(RUSAGE_SELF,&usage);
if (options->statistics)
{
asprintf(&logfilename,"ecoprimer_%d.log",getpid());
logfile = fopen(logfilename,"a");
seconde = timeval_subtract(&(usage.ru_utime),&(start.ru_utime)) +
timeval_subtract(&(usage.ru_stime),&(start.ru_stime));
fprintf(logfile,"%d\t%llu\t%lu\t%8.3f\t%8.3e\n",i,
(long long unsigned)totallength,
strictprimers->size*(sizeof(int64_t)+sizeof(int32_t)),
seconde,seconde/(double)totallength);
fclose(logfile);
}
}
else
strictprimers->outseqcount++;
fprintf(stderr," Indexed sequences %5d/%5d : considered words %-10llu \r",
(int32_t)i+1,(int32_t)seqdbsize,
(long long unsigned)strictprimers->size);
// DEBUG_LOG("First word : %s ==> %d",ecoUnhashWord(strictprimers->words[0],18),strictprimers->incount[0])
// DEBUG_LOG("Second word : %s ==> %d",ecoUnhashWord(strictprimers->words[1],18),strictprimers->incount[1])
}
strictprimers->strictcount = ECOREALLOC(strictprimers->strictcount,
sizeof(uint32_t)*strictprimers->size,
"Cannot reallocate strict primer count table");
strictprimers->words = ECOREALLOC(strictprimers->words,
sizeof(word_t)*strictprimers->size,
"Cannot reallocate strict primer table");
if (neededWords)
ECOFREE(neededWords,"Clean needed word table");
return strictprimers;
}
uint32_t filterMultiStrictPrimer(pwordcount_t strictprimers)
{
uint32_t i;
uint32_t w;
for (i=0,w=0;i < strictprimers->size;i++)
{
if (w < i)
{
strictprimers->words[w]=strictprimers->words[i];
strictprimers->strictcount[w]=strictprimers->strictcount[i];
}
if (! ISMULTIWORD(strictprimers->words[w]))
w++;
}
strictprimers->size=w;
strictprimers->strictcount = ECOREALLOC(strictprimers->strictcount,
sizeof(uint32_t)*strictprimers->size,
"Cannot reallocate strict primer count table");
strictprimers->words = ECOREALLOC(strictprimers->words,
sizeof(word_t)*strictprimers->size,
"Cannot reallocate strict primer table");
return w;
}

378
src/libecoprimer/taxstats.c Normal file
View File

@ -0,0 +1,378 @@
/*
* taxstats.c
*
* Created on: 12 mars 2009
* Author: coissac
*/
#include <search.h>
//void tdestroy (void *root, void (*free_node)(void *nodep));
#include "ecoprimer.h"
static int cmptaxon(const void *t1, const void* t2);
void **tree_root = NULL;
int delete_passes = 0;
void delete_twalkaction (const void *node, VISIT order, int level)
{
switch (order)
{
case preorder:
delete_passes++;
break;
case postorder:
delete_passes++;
break;
case endorder:
delete_passes++;
break;
case leaf:
if (tree_root)
tdelete (node, tree_root,cmptaxon);
delete_passes++;
break;
}
}
void free_tree_nodes (void *tree)
{
while (1)
{
delete_passes = 0;
twalk (tree, delete_twalkaction);
if (delete_passes <= 1) break;
}
}
static int cmptaxon(const void *t1, const void* t2)
{
const size_t taxid1=(size_t)t1;
const size_t taxid2=(size_t)t2;
// fprintf(stderr,"==> counted taxid1 : %d\n",taxid1);
// fprintf(stderr,"==> counted taxid2 : %d\n",taxid2);
if (taxid1 < taxid2)
return -1;
if (taxid1 > taxid2)
return +1;
return 0;
}
int32_t counttaxon(int32_t taxid)
{
static void* taxontree=NULL;
static int32_t taxoncount=0;
// fprintf(stderr,"counted taxid : %d taxontree %p\n",taxid,taxontree);
if (taxid==-1)
{
if (taxontree)
{
tree_root = (void **)&taxontree;
//free_tree_nodes (taxontree);
ECOFREE(taxontree,"Free taxon tree");
tree_root = NULL;
}
taxontree=NULL;
taxoncount=0;
return 0;
}
if ((taxid > 0) && ((!taxontree) || (!tfind((void*)((size_t)taxid),&taxontree,cmptaxon))))
{
tsearch((void*)((size_t)taxid),&taxontree,cmptaxon);
taxoncount++;
}
return taxoncount;
}
int32_t getrankdbstats(pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxonomy,
poptions_t options)
{
uint32_t i;
ecotx_t *taxon;
ecotx_t *tmptaxon;
counttaxon(-1);
options->intaxa = 0;
for (i=0;i<seqdbsize;i++)
{
taxon = &(taxonomy->taxons->taxon[seqdb[i]->taxid]);
seqdb[i]->isexample=isExampleTaxon(taxonomy,seqdb[i]->taxid,options);
tmptaxon = eco_findtaxonatrank(taxon,
options->taxonrankidx);
// fprintf(stderr,"Taxid : %d %p\n",taxon->taxid,tmptaxon);
if (tmptaxon)
{
// fprintf(stderr,"orig : %d trans : %d\n",taxon->taxid,
// tmptaxon->taxid);
seqdb[i]->ranktaxonid=tmptaxon->taxid;
if (seqdb[i]->isexample)
options->intaxa = counttaxon(tmptaxon->taxid);
}
else
seqdb[i]->ranktaxonid=-1;
}
counttaxon(-1);
options->outtaxa = 0;
for (i=0;i<seqdbsize;i++)
{
if (seqdb[i]->ranktaxonid>=0 && !seqdb[i]->isexample)
options->outtaxa = counttaxon(seqdb[i]->ranktaxonid);
}
return options->outtaxa + options->intaxa;
}
float taxonomycoverage(ppair_t pair, poptions_t options, pecodnadb_t seqdb,uint32_t seqdbsize)
{
int32_t seqcount;
int32_t i;
int32_t incount=0;
int32_t outcount=0;
uint32_t j;
memset (pair->coveredSeqs, 0, seqdbsize*sizeof (int));
seqcount=pair->pcr.ampcount;
counttaxon(-1);
for (i=0; i < seqcount; i++)
if (pair->pcr.amplifias[i].sequence->isexample
&& pair->pcr.amplifias[i].sequence->ranktaxonid > 0 )
{
incount = counttaxon(pair->pcr.amplifias[i].sequence->ranktaxonid);
for (j=0; j<seqdbsize; j++)
if (pair->pcr.amplifias[i].sequence == seqdb[j])
{pair->coveredSeqs[j] = 1; break;}
}
counttaxon(-1);
for (i=0; i < seqcount; i++)
if (!pair->pcr.amplifias[i].sequence->isexample
&& pair->pcr.amplifias[i].sequence->ranktaxonid)
outcount = counttaxon(pair->pcr.amplifias[i].sequence->ranktaxonid);
pair->intaxa=incount;
pair->outtaxa=outcount;
pair->bc=(float)incount/options->intaxa;
return pair->bc;
}
/*
static int cmpamp(const void *ampf1, const void* ampf2)
{
int i;
int j = 0;
int incr = 1;
char cd1;
char cd2;
int chd = 0;
int len = 0;
pamptotaxon_t pampf1 = (pamptotaxon_t) ampf1;
pamptotaxon_t pampf2 = (pamptotaxon_t) ampf2;
if (pampf1->strand != pampf2->strand)
{
incr = -1;
j = pampf1->length - 1;
if (pampf2->strand)
{
pampf1 = (pamptotaxon_t) ampf2;
pampf2 = (pamptotaxon_t) ampf1;
chd = 1;
}
//j = pampf2->length - 1; should have been here and pampf2 instead of pampf1?
}
len = (pampf1->length <= pampf2->length)? pampf1->length: pampf2->length;
for (i = 0; i < len; i++, j += incr)
{
cd1 = pampf1->amplifia[i];
if (incr == -1)
cd2 = ecoComplementChar(pampf2->amplifia[j]);
else
cd2 = pampf2->amplifia[j];
if (cd1 < cd2) return chd ? 1: -1;
if (cd2 < cd1) return chd ? -1: 1;
}
if (pampf1->length > pampf2->length) return chd ? -1: 1;
if (pampf2->length > pampf1->length) return chd ? 1: -1;
return 0;
}*/
static int cmpamp(const void *ampf1, const void* ampf2)
{
int i;
char cd1;
char cd2;
int len = 0;
char *ch1;
char *ch2;
int incr1;
int incr2;
pamptotaxon_t pampf1 = (pamptotaxon_t) ampf1;
pamptotaxon_t pampf2 = (pamptotaxon_t) ampf2;
ch1 = pampf1->amplifia;
ch2 = pampf2->amplifia;
incr1 = 1;
incr2 = 1;
if (!pampf1->strand)
incr1 = -1;
if (!pampf2->strand)
incr2 = -1;
len = (pampf1->length <= pampf2->length)? pampf1->length: pampf2->length;
for (i = 0; i < len; i++)
{
cd1 = *ch1;
if (incr1 == -1)
cd1 = ecoComplementChar(*ch1);
cd2 = *ch2;
if (incr2 == -1)
cd2 = ecoComplementChar(*ch2);
if (cd1 < cd2) return -1;
if (cd2 < cd1) return 1;
ch1 += incr1;
ch2 += incr2;
}
if (pampf1->length > pampf2->length) return 1;
if (pampf2->length > pampf1->length) return -1;
return 0;
}
void twalkaction (const void *node, VISIT order, int level)
{
int32_t *taxid = (int32_t*)node;
//const size_t taxid=(size_t)node;
//printf ("\t%d:%p, ", *taxid, node);
counttaxon(*taxid);
}
int32_t gtxid;
void twalkaction2 (const void *node, VISIT order, int level)
{
int32_t *pt = (int32_t *) node;
gtxid = *pt;
}
void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize)
{
uint32_t i, j;
uint32_t ampfindex = 0;
int32_t taxid;
uint32_t wellidentifiedcount;
void *ampftree = NULL;
pamptotaxon_t pcurrentampf;
pamptotaxon_t *ptmp;
pamptotaxon_t ampfwithtaxtree = ECOMALLOC(sizeof(amptotaxon_t) * pair->pcr.ampcount,"Cannot allocate amplifia tree");
for (i = 0; i < pair->pcr.ampcount; i++)
{
/*populate taxon ids tree against each unique amplifia
i.e set of taxon ids for each amplifia*/
if (pair->pcr.amplifias[i].sequence->isexample)
{
ampfwithtaxtree[ampfindex].amplifia = pair->pcr.amplifias[i].amplifia;
ampfwithtaxtree[ampfindex].strand = pair->pcr.amplifias[i].strand;
ampfwithtaxtree[ampfindex].length = pair->pcr.amplifias[i].length;
pcurrentampf = &ampfwithtaxtree[ampfindex];
taxid = pair->pcr.amplifias[i].sequence->ranktaxonid;
ptmp = tfind((const void*)pcurrentampf, &ampftree, cmpamp);
if (ptmp == NULL)
{
pcurrentampf = &ampfwithtaxtree[ampfindex];
tsearch((void*)pcurrentampf,&ampftree,cmpamp);
ampfindex++;
}
else
pcurrentampf = *ptmp;
if (tfind((void*)((size_t)taxid), &(pcurrentampf->taxontree), cmptaxon) == NULL)
{
pcurrentampf->taxoncount++;
tsearch((void*)((size_t)taxid),&(pcurrentampf->taxontree),cmptaxon);
}
}
}
memset (pair->wellIdentifiedSeqs, 0, seqdbsize*sizeof (int));
//counttaxon(-1);
for (i = 0; i < ampfindex; i++)
{
if (ampfwithtaxtree[i].taxoncount > 1)
{
//printf ("\nampfwithtaxtree[i].taxoncount: %d\n", ampfwithtaxtree[i].taxoncount);
//twalk(ampfwithtaxtree[i].taxontree, twalkaction);
}
//TR 5/9/10 - added code for well identified seqs
else if(ampfwithtaxtree[i].taxoncount == 1) /*well identified*/
{
gtxid = -1;
twalk(ampfwithtaxtree[i].taxontree, twalkaction2);
if (gtxid != -1)
{
for (j = 0; j < seqdbsize; j++)
if (seqdb[j]->ranktaxonid == gtxid
&& seqdb[j]->isexample
&&(pair->p1->directCount[j] > 0
|| pair->p1->reverseCount[j] > 0)
&& (pair->p2->directCount[j] > 0
|| pair->p2->reverseCount[j] > 0))
{
pair->wellIdentifiedSeqs[j] = 1;
}
}
}
}
//printf ("\n");
counttaxon(-1);
wellidentifiedcount = 0;
for (j = 0; j < seqdbsize; j++)
if (pair->wellIdentifiedSeqs[j] == 1)
counttaxon(seqdb[j]->ranktaxonid);
wellidentifiedcount = counttaxon(-2);
//pair->notwellidentifiedtaxa = counttaxon(-2);
pair->notwellidentifiedtaxa = (pair->intaxa-wellidentifiedcount); //counttaxon(-2);
//pair->bs = ((float)pair->intaxa - (float)pair->notwellidentifiedtaxa) / pair->intaxa;
pair->bs = ((float)wellidentifiedcount) / (float)pair->intaxa;
ECOFREE (ampfwithtaxtree, "Free amplifia table");
}

23
src/libthermo/Makefile Normal file
View File

@ -0,0 +1,23 @@
SOURCES = nnparams.c \
thermostats.c
SRCS=$(SOURCES)
OBJECTS= $(patsubst %.c,%.o,$(SOURCES))
LIBFILE= libthermo.a
RANLIB= ranlib
include ../global.mk
all: $(LIBFILE)
clean:
rm -rf $(OBJECTS) $(LIBFILE)
$(LIBFILE): $(OBJECTS)
ar -cr $@ $?
$(RANLIB) $@

600
src/libthermo/nnparams.c Normal file
View File

@ -0,0 +1,600 @@
/*
* nnparams.cpp
* PHunterLib
*
* Nearest Neighbor Model / Parameters
*
* Created by Tiayyba Riaz on 7/2/09.
*
*/
#include <memory.h>
#include <math.h>
#include <stdio.h>
#include <string.h>
#include"nnparams.h"
double forbidden_entropy;
double nparam_GetInitialEntropy(PNNParams nparm)
{
return -5.9f+nparm->rlogc;
}
//Retrieve Enthalpy for given NN-Pair from parameter table
double nparam_GetEnthalpy(PNNParams nparm, char x0, char x1, char y0, char y1)
{
return ndH(x0,x1,y0,y1); //xx, yx are already numbers
}
//Retrieve Entropy for given NN-Pair from parameter table
double nparam_GetEntropy(PNNParams nparm, char x0, char x1, char y0, char y1)
{
//xx and yx are already numbers
char nx0=x0;//nparam_convertNum(x0);
char nx1=x1;//nparam_convertNum(x1);
char ny0=y0;//nparam_convertNum(y0);
char ny1=y1;//nparam_convertNum(y1);
double answer = ndS(nx0,nx1,ny0,ny1);
/*Salt correction Santalucia*/
if (nparm->saltMethod == SALT_METHOD_SANTALUCIA) {
if(nx0!=5 && 1<= nx1 && nx1<=4) {
answer += 0.5*nparm->kfac;
}
if(ny1!=5 && 1<= ny0 && ny0<=4) {
answer += 0.5*nparm->kfac;
}
}
/*Salt correction Owczarzy*/
if (nparm->saltMethod == SALT_METHOD_OWCZARZY) {
double logk = log(nparm->kplus);
answer += ndH(nx0,nx1,ny0,ny1)*((4.29 * nparm->gcContent-3.95)*0.00001*logk+ 0.0000094*logk*logk);
}
return answer;
}
/* PURPOSE: Return melting temperature TM for given entropy and enthalpy
* Assuming a one-state transition and using the formula
* TM = dH / (dS + R ln(Ct/4))
* entropy = dS + R ln Ct/4 (must already be included!)
* enthaklpy = dH
* where
* dH = enthalpy
* dS = entropy
* R = Boltzmann factor
* Ct = Strand Concentration
*
* PARAMETERS:
* entrypy and enthalpy
*
* RETURN VALUE:
* temperature
*/
double nparam_CalcTM(double entropy,double enthalpy)
{
double tm = 0; // absolute zero - return if model fails!
if (enthalpy>=forbidden_enthalpy) //||(entropy==-cfact))
return 0;
if (entropy<0) // avoid division by zero and model errors!
{
tm = enthalpy/entropy;// - kfac; //LKFEB
if (tm<0)
return 0;
}
return tm;
}
void nparam_InitParams(PNNParams nparm, double c1, double c2, double kp, int sm)
{
nparm->Ct1 = c1;
nparm->Ct2 = c2;
nparm->kplus = kp;
int maxCT = 1;
if(nparm->Ct2 > nparm->Ct1)
{
maxCT = 2;
}
double ctFactor;
if(nparm->Ct1 == nparm->Ct2)
{
ctFactor = nparm->Ct1/2;
}
else if (maxCT == 1)
{
ctFactor = nparm->Ct1-nparm->Ct2/2;
}
else
{
ctFactor = nparm->Ct2-nparm->Ct1/2;
}
nparm->rlogc = R * log(ctFactor);
forbidden_entropy = nparm->rlogc;
nparm->kfac = 0.368 * log (nparm->kplus);
nparm->saltMethod = sm;
int x,y,a,b; // variables used as counters...
// Set all parameters to zero!
memset(nparm->dH,0,sizeof(nparm->dH));
memset(nparm->dS,0,sizeof(nparm->dS));
// Set all X-/Y-, -X/Y- and X-/-Y so, that TM will be VERY small!
for (x=1;x<=4;x++)
{
for (y=1;y<=4;y++)
{
ndH(0,x,y,0)=forbidden_enthalpy;
ndS(0,x,y,0)=forbidden_entropy;
ndH(x,0,0,y)=forbidden_enthalpy;
ndS(x,0,0,y)=forbidden_entropy;
ndH(x,0,y,0)=forbidden_enthalpy;
ndS(x,0,y,0)=forbidden_entropy;
// forbid X-/Y$ and X$/Y- etc., i.e. terminal must not be paired with gap!
ndH(x,5,y,0)=forbidden_enthalpy;
ndS(x,5,y,0)=forbidden_entropy;
ndH(x,0,y,5)=forbidden_enthalpy;
ndS(x,0,y,5)=forbidden_entropy;
ndH(5,x,0,y)=forbidden_enthalpy;
ndS(5,x,0,y)=forbidden_entropy;
ndH(0,x,5,y)=forbidden_enthalpy;
ndS(0,x,5,y)=forbidden_entropy;
// forbid X$/-Y etc.
ndH(x,5,0,y)=forbidden_enthalpy;
ndS(x,5,0,y)=forbidden_entropy;
ndH(x,0,5,y)=forbidden_enthalpy;
ndS(x,0,5,y)=forbidden_entropy;
ndH(5,x,y,0)=forbidden_enthalpy;
ndS(5,x,y,0)=forbidden_entropy;
ndH(0,x,y,5)=forbidden_enthalpy;
ndS(0,x,y,5)=forbidden_entropy;
}
// also, forbid x-/-- and --/x-, i.e. no two inner gaps paired
ndH(x,0,0,0)=forbidden_enthalpy;
ndS(x,0,0,0)=forbidden_entropy;
ndH(0,0,x,0)=forbidden_enthalpy;
ndS(0,0,x,0)=forbidden_entropy;
// x-/-$
ndH(x,0,0,5)=forbidden_enthalpy;
ndS(x,0,0,5)=forbidden_entropy;
ndH(5,0,0,x)=forbidden_enthalpy;
ndS(5,0,0,x)=forbidden_entropy;
ndH(0,5,x,0)=forbidden_enthalpy;
ndS(x,0,0,5)=forbidden_entropy;
ndH(0,x,5,0)=forbidden_enthalpy;
ndS(0,x,5,0)=forbidden_entropy;
}
// forbid --/--
ndH(0,0,0,0)=forbidden_enthalpy;
ndS(0,0,0,0)=forbidden_entropy;
ndH(5,0,0,0)=forbidden_enthalpy;
ndS(5,0,0,0)=forbidden_entropy;
ndH(0,0,5,0)=forbidden_enthalpy;
ndS(0,0,5,0)=forbidden_entropy;
ndH(0,5,5,0)=forbidden_enthalpy;
ndS(0,5,5,0)=forbidden_entropy;
// Interior loops (double Mismatches)
#define iloop_entropy -0.97f
#define iloop_enthalpy 0.0f
for (x=1; x<=4; x++)
for (y=1; y<=4; y++)
for (a=1; a<=4; a++)
for (b=1; b<=4; b++)
// AT and CG pair, and as A=1, C=2, G=3, T=4 this means
// we have Watson-Crick pairs if (x+a==5) and (y+b)==5.
if (!((x+a==5)||(y+b==5)))
{
// No watson-crick-pair, i.e. double mismatch!
// set enthalpy/entropy to loop expansion!
ndH(x,y,a,b) = iloop_enthalpy;
ndS(x,y,a,b) = iloop_entropy;
}
// xy/-- and --/xy (Bulge Loops of size > 1)
#define bloop_entropy -1.3f
#define bloop_enthalpy 0.0f
for (x=1; x<=4; x++)
for (y=1; y<=4; y++)
{
ndH(x,y,0,0) = bloop_enthalpy;
ndS(x,y,0,0) = bloop_entropy;
ndH(0,0,x,y) = bloop_enthalpy;
ndS(0,0,x,y) = bloop_entropy;
}
// x-/ya abd xa/y- as well as -x/ay and ax/-y
// bulge opening and closing parameters with
// adjacent matches / mismatches
// obulge_mism and cbulge_mism chosen so high to avoid
// AAAAAAAAA
// T--G----T
// being better than
// AAAAAAAAA
// TG------T
#define obulge_match_H (-2.66f * 1000)
#define obulge_match_S -14.22f
#define cbulge_match_H (-2.66f * 1000)
#define cbulge_match_S -14.22f
#define obulge_mism_H (0.0f * 1000)
#define obulge_mism_S -6.45f
#define cbulge_mism_H 0.0f
#define cbulge_mism_S -6.45f
for (x=1; x<=4; x++)
for (y=1; y<=4; y++)
for (a=1; a<=4; a++)
{
if (x+y==5) // other base pair matches!
{
ndH(x,0,y,a)=obulge_match_H; // bulge opening
ndS(x,0,y,a)=obulge_match_S;
ndH(x,a,y,0)=obulge_match_H;
ndS(x,a,y,0)=obulge_match_S;
ndH(0,x,a,y)=cbulge_match_H; // bulge closing
ndS(0,x,a,y)=cbulge_match_S;
ndH(a,x,0,y)=cbulge_match_H;
ndS(a,x,0,y)=cbulge_match_S;
}
else
{ // mismatch in other base pair!
ndH(x,0,y,a)=obulge_mism_H; // bulge opening
ndS(x,0,y,a)=obulge_mism_S;
ndH(x,a,y,0)=obulge_mism_H;
ndS(x,a,y,0)=obulge_mism_S;
ndH(0,x,a,y)=cbulge_mism_H; // bulge closing
ndS(0,x,a,y)=cbulge_mism_S;
ndH(a,x,0,y)=cbulge_mism_H;
ndS(a,x,0,y)=cbulge_mism_S;
}
}
// Watson-Crick pairs (note that only ten are unique, as obviously
// 5'-AG-3'/3'-TC-5' = 5'-CT-3'/3'-GA-5' etc.
ndH(1,1,4,4)=-7.6f*1000; ndS(1,1,4,4)=-21.3f; // AA/TT 04
ndH(1,2,4,3)=-8.4f*1000; ndS(1,2,4,3)=-22.4f; // AC/TG adapted GT/CA
ndH(1,3,4,2)=-7.8f*1000; ndS(1,3,4,2)=-21.0f; // AG/TC adapted CT/GA
ndH(1,4,4,1)=-7.2f*1000; ndS(1,4,4,1)=-20.4f; // AT/TA 04
ndH(2,1,3,4)=-8.5f*1000; ndS(2,1,3,4)=-22.7f; // CA/GT 04
ndH(2,2,3,3)=-8.0f*1000; ndS(2,2,3,3)=-19.9f; // CC/GG adapted GG/CC
ndH(2,3,3,2)=-10.6f*1000; ndS(2,3,3,2)=-27.2f; // CG/GC 04
ndH(2,4,3,1)=-7.8f*1000; ndS(2,4,3,1)=-21.0f; // CT/GA 04
ndH(3,1,2,4)=-8.2f*1000; ndS(3,1,2,4)=-22.2f; // GA/CT 04
ndH(3,2,2,3)=-9.8f*1000; ndS(3,2,2,3)=-24.4f; // GC/CG 04
ndH(3,3,2,2)=-8.0f*1000; ndS(3,3,2,2)=-19.9f; // GG/CC 04
ndH(3,4,2,1)=-8.4f*1000; ndS(3,4,2,1)=-22.4f; // GT/CA 04
ndH(4,1,1,4)=-7.2f*1000; ndS(4,1,1,4)=-21.3f; // TA/AT 04
ndH(4,2,1,3)=-8.2f*1000; ndS(4,2,1,3)=-22.2f; // TC/AG adapted GA/CT
ndH(4,3,1,2)=-8.5f*1000; ndS(4,3,1,2)=-22.7f; // TG/AC adapted CA/GT
ndH(4,4,1,1)=-7.6f*1000; ndS(4,4,1,1)=-21.3f; // TT/AA adapted AA/TT
// A-C Mismatches (Values for pH 7.0)
ndH(1,1,2,4)=7.6f*1000; ndS(1,1,2,4)=20.2f; // AA/CT
ndH(1,1,4,2)=2.3f*1000; ndS(1,1,4,2)=4.6f; // AA/TC
ndH(1,2,2,3)=-0.7f*1000; ndS(1,2,2,3)=-3.8f; // AC/CG
ndH(1,2,4,1)=5.3f*1000; ndS(1,2,4,1)=14.6f; // AC/TA
ndH(1,3,2,2)=0.6f*1000; ndS(1,3,2,2)=-0.6f; // AG/CC
ndH(1,4,2,1)=5.3f*1000; ndS(1,4,2,1)=14.6f; // AT/CA
ndH(2,1,1,4)=3.4f*1000; ndS(2,1,1,4)=8.0f; // CA/AT
ndH(2,1,3,2)=1.9f*1000; ndS(2,1,3,2)=3.7f; // CA/GC
ndH(2,2,1,3)=5.2f*1000; ndS(2,2,1,3)=14.2f; // CC/AG
ndH(2,2,3,1)=0.6f*1000; ndS(2,2,3,1)=-0.6f; // CC/GA
ndH(2,3,1,2)=1.9f*1000; ndS(2,3,1,2)=3.7f; // CG/AC
ndH(2,4,1,1)=2.3f*1000; ndS(2,4,1,1)=4.6f; // CT/AA
ndH(3,1,2,2)=5.2f*1000; ndS(3,1,2,2)=14.2f; // GA/CC
ndH(3,2,2,1)=-0.7f*1000; ndS(3,2,2,1)=-3.8f; // GC/CA
ndH(4,1,1,2)=3.4f*1000; ndS(4,1,1,2)=8.0f; // TA/AC
ndH(4,2,1,1)=7.6f*1000; ndS(4,2,1,1)=20.2f; // TC/AA
// C-T Mismatches
ndH(1,2,4,4)=0.7f*1000; ndS(1,2,4,4)=0.2f; // AC/TT
ndH(1,4,4,2)=-1.2f*1000; ndS(1,4,4,2)=-6.2f; // AT/TC
ndH(2,1,4,4)=1.0f*1000; ndS(2,1,4,4)=0.7f; // CA/TT
ndH(2,2,3,4)=-0.8f*1000; ndS(2,2,3,4)=-4.5f; // CC/GT
ndH(2,2,4,3)=5.2f*1000; ndS(2,2,4,3)=13.5f; // CC/TG
ndH(2,3,4,2)=-1.5f*1000; ndS(2,3,4,2)=-6.1f; // CG/TC
ndH(2,4,3,2)=-1.5f*1000; ndS(2,4,3,2)=-6.1f; // CT/GC
ndH(2,4,4,1)=-1.2f*1000; ndS(2,4,4,1)=-6.2f; // CT/TA
ndH(3,2,2,4)=2.3f*1000; ndS(3,2,2,4)=5.4f; // GC/CT
ndH(3,4,2,2)=5.2f*1000; ndS(3,4,2,2)=13.5f; // GT/CC
ndH(4,1,2,4)=1.2f*1000; ndS(4,1,2,4)=0.7f; // TA/CT
ndH(4,2,2,3)=2.3f*1000; ndS(4,2,2,3)=5.4f; // TC/CG
ndH(4,2,1,4)=1.2f*1000; ndS(4,2,1,4)=0.7f; // TC/AT
ndH(4,3,2,2)=-0.8f*1000; ndS(4,3,2,2)=-4.5f; // TG/CC
ndH(4,4,2,1)=0.7f*1000; ndS(4,4,2,1)=0.2f; // TT/CA
ndH(4,4,1,2)=1.0f*1000; ndS(4,4,1,2)=0.7f; // TT/AC
// G-A Mismatches
ndH(1,1,3,4)=3.0f*1000; ndS(1,1,3,4)=7.4f; // AA/GT
ndH(1,1,4,3)=-0.6f*1000; ndS(1,1,4,3)=-2.3f; // AA/TG
ndH(1,2,3,3)=0.5f*1000; ndS(1,2,3,3)=3.2f; // AC/GG
ndH(1,3,3,2)=-4.0f*1000; ndS(1,3,3,2)=-13.2f; // AG/GC
ndH(1,3,4,1)=-0.7f*1000; ndS(1,3,4,1)=-2.3f; // AG/TA
ndH(1,4,3,1)=-0.7f*1000; ndS(1,4,3,1)=-2.3f; // AT/GA
ndH(2,1,3,3)=-0.7f*1000; ndS(2,1,3,3)=-2.3f; // CA/GG
ndH(2,3,3,1)=-4.0f*1000; ndS(2,3,3,1)=-13.2f; // CG/GA
ndH(3,1,1,4)=0.7f*1000; ndS(3,1,1,4)=0.7f; // GA/AT
ndH(3,1,2,3)=-0.6f*1000; ndS(3,1,2,3)=-1.0f; // GA/CG
ndH(3,2,1,3)=-0.6f*1000; ndS(3,2,1,3)=-1.0f; // GC/AG
ndH(3,3,1,2)=-0.7f*1000; ndS(3,3,1,2)=-2.3f; // GG/AC
ndH(3,3,2,1)=0.5f*1000; ndS(3,3,2,1)=3.2f; // GG/CA
ndH(3,4,1,1)=-0.6f*1000; ndS(3,4,1,1)=-2.3f; // GT/AA
ndH(4,1,1,3)=0.7f*1000; ndS(4,1,1,3)=0.7f; // TA/AG
ndH(4,3,1,1)=3.0f*1000; ndS(4,3,1,1)=7.4f; // TG/AA
// G-T Mismatches
ndH(1,3,4,4)=1.0f*1000; ndS(1,3,4,4)=0.9f; // AG/TT
ndH(1,4,4,3)=-2.5f*1000; ndS(1,4,4,3)=-8.3f; // AT/TG
ndH(2,3,3,4)=-4.1f*1000; ndS(2,3,3,4)=-11.7f; // CG/GT
ndH(2,4,3,3)=-2.8f*1000; ndS(2,4,3,3)=-8.0f; // CT/GG
ndH(3,1,4,4)=-1.3f*1000; ndS(3,1,4,4)=-5.3f; // GA/TT
ndH(3,2,4,3)=-4.4f*1000; ndS(3,2,4,3)=-12.3f; // GC/TG
ndH(3,3,2,4)=3.3f*1000; ndS(3,3,2,4)=10.4f; // GG/CT
ndH(3,3,4,2)=-2.8f*1000; ndS(3,3,4,2)=-8.0f; // GG/TC
// ndH(3,3,4,4)=5.8f*1000; ndS(3,3,4,4)=16.3f; // GG/TT
ndH(3,4,2,3)=-4.4f*1000; ndS(3,4,2,3)=-12.3f; // GT/CG
ndH(3,4,4,1)=-2.5f*1000; ndS(3,4,4,1)=-8.3f; // GT/TA
// ndH(3,4,4,3)=4.1f*1000; ndS(3,4,4,3)=9.5f; // GT/TG
ndH(4,1,3,4)=-0.1f*1000; ndS(4,1,3,4)=-1.7f; // TA/GT
ndH(4,2,3,3)=3.3f*1000; ndS(4,2,3,3)=10.4f; // TC/GG
ndH(4,3,1,4)=-0.1f*1000; ndS(4,3,1,4)=-1.7f; // TG/AT
ndH(4,3,3,2)=-4.1f*1000; ndS(4,3,3,2)=-11.7f; // TG/GC
// ndH(4,3,3,4)=-1.4f*1000; ndS(4,3,3,4)=-6.2f; // TG/GT
ndH(4,4,1,3)=-1.3f*1000; ndS(4,4,1,3)=-5.3f; // TT/AG
ndH(4,4,3,1)=1.0f*1000; ndS(4,4,3,1)=0.9f; // TT/GA
// ndH(4,4,3,3)=5.8f*1000; ndS(4,4,3,3)=16.3f; // TT/GG
// A-A Mismatches
ndH(1,1,1,4)=4.7f*1000; ndS(1,1,1,4)=12.9f; // AA/AT
ndH(1,1,4,1)=1.2f*1000; ndS(1,1,4,1)=1.7f; // AA/TA
ndH(1,2,1,3)=-2.9f*1000; ndS(1,2,1,3)=-9.8f; // AC/AG
ndH(1,3,1,2)=-0.9f*1000; ndS(1,3,1,2)=-4.2f; // AG/AC
ndH(1,4,1,1)=1.2f*1000; ndS(1,4,1,1)=1.7f; // AT/AA
ndH(2,1,3,1)=-0.9f*1000; ndS(2,1,3,1)=-4.2f; // CA/GA
ndH(3,1,2,1)=-2.9f*1000; ndS(3,1,2,1)=-9.8f; // GA/CA
ndH(4,1,1,1)=4.7f*1000; ndS(4,1,1,1)=12.9f; // TA/AA
// C-C Mismatches
ndH(1,2,4,2)=0.0f*1000; ndS(1,2,4,2)=-4.4f; // AC/TC
ndH(2,1,2,4)=6.1f*1000; ndS(2,1,2,4)=16.4f; // CA/CT
ndH(2,2,2,3)=3.6f*1000; ndS(2,2,2,3)=8.9f; // CC/CG
ndH(2,2,3,2)=-1.5f*1000; ndS(2,2,3,2)=-7.2f; // CC/GC
ndH(2,3,2,2)=-1.5f*1000; ndS(2,3,2,2)=-7.2f; // CG/CC
ndH(2,4,2,1)=0.0f*1000; ndS(2,4,2,1)=-4.4f; // CT/CA
ndH(3,2,2,2)=3.6f*1000; ndS(3,2,2,2)=8.9f; // GC/CC
ndH(4,2,1,2)=6.1f*1000; ndS(4,2,1,2)=16.4f; // TC/AC
// G-G Mismatches
ndH(1,3,4,3)=-3.1f*1000; ndS(1,3,4,3)=-9.5f; // AG/TG
ndH(2,3,3,3)=-4.9f*1000; ndS(2,3,3,3)=-15.3f; // CG/GG
ndH(3,1,3,4)=1.6f*1000; ndS(3,1,3,4)=3.6f; // GA/GT
ndH(3,2,3,3)=-6.0f*1000; ndS(3,2,3,3)=-15.8f; // GC/GG
ndH(3,3,2,3)=-6.0f*1000; ndS(3,3,2,3)=-15.8f; // GG/CG
ndH(3,3,3,2)=-4.9f*1000; ndS(3,3,3,2)=-15.3f; // GG/GC
ndH(3,4,3,1)=-3.1f*1000; ndS(3,4,3,1)=-9.5f; // GT/GA
ndH(4,3,1,3)=1.6f*1000; ndS(4,3,1,3)=3.6f; // TG/AG
// T-T Mismatches
ndH(1,4,4,4)=-2.7f*1000; ndS(1,4,4,4)=-10.8f; // AT/TT
ndH(2,4,3,4)=-5.0f*1000; ndS(2,4,3,4)=-15.8f; // CT/GT
ndH(3,4,2,4)=-2.2f*1000; ndS(3,4,2,4)=-8.4f; // GT/CT
ndH(4,1,4,4)=0.2f*1000; ndS(4,1,4,4)=-1.5f; // TA/TT
ndH(4,2,4,3)=-2.2f*1000; ndS(4,2,4,3)=-8.4f; // TC/TG
ndH(4,3,4,2)=-5.0f*1000; ndS(4,3,4,2)=-15.8f; // TG/TC
ndH(4,4,1,4)=0.2f*1000; ndS(4,4,1,4)=-1.5f; // TT/AT
ndH(4,4,4,1)=-2.7f*1000; ndS(4,4,4,1)=-10.8f; // TT/TA
// Dangling Ends
ndH(5,1,1,4)=-0.7f*1000; ndS(5,1,1,4)=-0.8f; // $A/AT
ndH(5,1,2,4)=4.4f*1000; ndS(5,1,2,4)=14.9f; // $A/CT
ndH(5,1,3,4)=-1.6f*1000; ndS(5,1,3,4)=-3.6f; // $A/GT
ndH(5,1,4,4)=2.9f*1000; ndS(5,1,4,4)=10.4f; // $A/TT
ndH(5,2,1,3)=-2.1f*1000; ndS(5,2,1,3)=-3.9f; // $C/AG
ndH(5,2,2,3)=-0.2f*1000; ndS(5,2,2,3)=-0.1f; // $C/CG
ndH(5,2,3,3)=-3.9f*1000; ndS(5,2,3,3)=-11.2f; // $C/GG
ndH(5,2,4,3)=-4.4f*1000; ndS(5,2,4,3)=-13.1f; // $C/TG
ndH(5,3,1,2)=-5.9f*1000; ndS(5,3,1,2)=-16.5f; // $G/AC
ndH(5,3,2,2)=-2.6f*1000; ndS(5,3,2,2)=-7.4f; // $G/CC
ndH(5,3,3,2)=-3.2f*1000; ndS(5,3,3,2)=-10.4f; // $G/GC
ndH(5,3,4,2)=-5.2f*1000; ndS(5,3,4,2)=-15.0f; // $G/TC
ndH(5,4,1,1)=-0.5f*1000; ndS(5,4,1,1)=-1.1f; // $T/AA
ndH(5,4,2,1)=4.7f*1000; ndS(5,4,2,1)=14.2f; // $T/CA
ndH(5,4,3,1)=-4.1f*1000; ndS(5,4,3,1)=-13.1f; // $T/GA
ndH(5,4,4,1)=-3.8f*1000; ndS(5,4,4,1)=-12.6f; // $T/TA
ndH(1,5,4,1)=-2.9f*1000; ndS(1,5,4,1)=-7.6f; // A$/TA
ndH(1,5,4,2)=-4.1f*1000; ndS(1,5,4,2)=-13.0f; // A$/TC
ndH(1,5,4,3)=-4.2f*1000; ndS(1,5,4,3)=-15.0f; // A$/TG
ndH(1,5,4,4)=-0.2f*1000; ndS(1,5,4,4)=-0.5f; // A$/TT
ndH(1,1,5,4)=0.2f*1000; ndS(1,1,5,4)=2.3f; // AA/$T
ndH(1,1,4,5)=-0.5f*1000; ndS(1,1,4,5)=-1.1f; // AA/T$
ndH(1,2,5,3)=-6.3f*1000; ndS(1,2,5,3)=-17.1f; // AC/$G
ndH(1,2,4,5)=4.7f*1000; ndS(1,2,4,5)=14.2f; // AC/T$
ndH(1,3,5,2)=-3.7f*1000; ndS(1,3,5,2)=-10.0f; // AG/$C
ndH(1,3,4,5)=-4.1f*1000; ndS(1,3,4,5)=-13.1f; // AG/T$
ndH(1,4,5,1)=-2.9f*1000; ndS(1,4,5,1)=-7.6f; // AT/$A
ndH(1,4,4,5)=-3.8f*1000; ndS(1,4,4,5)=-12.6f; // AT/T$
ndH(2,5,3,1)=-3.7f*1000; ndS(2,5,3,1)=-10.0f; // C$/GA
ndH(2,5,3,2)=-4.0f*1000; ndS(2,5,3,2)=-11.9f; // C$/GC
ndH(2,5,3,3)=-3.9f*1000; ndS(2,5,3,3)=-10.9f; // C$/GG
ndH(2,5,3,4)=-4.9f*1000; ndS(2,5,3,4)=-13.8f; // C$/GT
ndH(2,1,5,4)=0.6f*1000; ndS(2,1,5,4)=3.3f; // CA/$T
ndH(2,1,3,5)=-5.9f*1000; ndS(2,1,3,5)=-16.5f; // CA/G$
ndH(2,2,5,3)=-4.4f*1000; ndS(2,2,5,3)=-12.6f; // CC/$G
ndH(2,2,3,5)=-2.6f*1000; ndS(2,2,3,5)=-7.4f; // CC/G$
ndH(2,3,5,2)=-4.0f*1000; ndS(2,3,5,2)=-11.9f; // CG/$C
ndH(2,3,3,5)=-3.2f*1000; ndS(2,3,3,5)=-10.4f; // CG/G$
ndH(2,4,5,1)=-4.1f*1000; ndS(2,4,5,1)=-13.0f; // CT/$A
ndH(2,4,3,5)=-5.2f*1000; ndS(2,4,3,5)=-15.0f; // CT/G$
ndH(3,5,2,1)=-6.3f*1000; ndS(3,5,2,1)=-17.1f; // G$/CA
ndH(3,5,2,2)=-4.4f*1000; ndS(3,5,2,2)=-12.6f; // G$/CC
ndH(3,5,2,3)=-5.1f*1000; ndS(3,5,2,3)=-14.0f; // G$/CG
ndH(3,5,2,4)=-4.0f*1000; ndS(3,5,2,4)=-10.9f; // G$/CT
ndH(3,1,5,4)=-1.1f*1000; ndS(3,1,5,4)=-1.6f; // GA/$T
ndH(3,1,2,5)=-2.1f*1000; ndS(3,1,2,5)=-3.9f; // GA/C$
ndH(3,2,5,3)=-5.1f*1000; ndS(3,2,5,3)=-14.0f; // GC/$G
ndH(3,2,2,5)=-0.2f*1000; ndS(3,2,2,5)=-0.1f; // GC/C$
ndH(3,3,5,2)=-3.9f*1000; ndS(3,3,5,2)=-10.9f; // GG/$C
ndH(3,3,2,5)=-3.9f*1000; ndS(3,3,2,5)=-11.2f; // GG/C$
ndH(3,4,5,1)=-4.2f*1000; ndS(3,4,5,1)=-15.0f; // GT/$A
ndH(3,4,2,5)=-4.4f*1000; ndS(3,4,2,5)=-13.1f; // GT/C$
ndH(4,5,1,1)=0.2f*1000; ndS(4,5,1,1)=2.3f; // T$/AA
ndH(4,5,1,2)=0.6f*1000; ndS(4,5,1,2)=3.3f; // T$/AC
ndH(4,5,1,3)=-1.1f*1000; ndS(4,5,1,3)=-1.6f; // T$/AG
ndH(4,5,1,4)=-6.9f*1000; ndS(4,5,1,4)=-20.0f; // T$/AT
ndH(4,1,5,4)=-6.9f*1000; ndS(4,1,5,4)=-20.0f; // TA/$T
ndH(4,1,1,5)=-0.7f*1000; ndS(4,1,1,5)=-0.7f; // TA/A$
ndH(4,2,5,3)=-4.0f*1000; ndS(4,2,5,3)=-10.9f; // TC/$G
ndH(4,2,1,5)=4.4f*1000; ndS(4,2,1,5)=14.9f; // TC/A$
ndH(4,3,5,2)=-4.9f*1000; ndS(4,3,5,2)=-13.8f; // TG/$C
ndH(4,3,1,5)=-1.6f*1000; ndS(4,3,1,5)=-3.6f; // TG/A$
ndH(4,4,5,1)=-0.2f*1000; ndS(4,4,5,1)=-0.5f; // TT/$A
ndH(4,4,1,5)=2.9f*1000; ndS(4,4,1,5)=10.4f; // TT/A$
return;
}
int nparam_CountGCContent(char * seq ) {
int lseq = strlen(seq);
int k;
double count = 0;
for( k=0;k<lseq;k++) {
if (seq[k] == 'G' || seq[k] == 'C' ) {
count+=1;
}
}
return count;
}
void nparam_CleanSeq (char* inseq, char* outseq, int len)
{
int seqlen = strlen (inseq);
int i, j;
if (len != 0)
seqlen = len;
outseq[0]='x';
for (i = 0, j = 0; i < seqlen && outseq[0]; i++,j++)
{
switch (inseq[i])
{
case 'a':
case '\0':
case 'A':
outseq[j] = 'A'; break;
case 'c':
case '\1':
case 'C':
outseq[j] = 'C'; break;
case 'g':
case '\2':
case 'G':
outseq[j] = 'G'; break;
case 't':
case '\3':
case 'T':
outseq[j] = 'T'; break;
default:
outseq[0]=0;
}
}
outseq[j] = '\0';
}
//Calculate TM for given sequence against its complement
double nparam_CalcSelfTM(PNNParams nparm, char* seq, int len)
{
double thedH = 0;
//double thedS = nparam_GetInitialEntropy(nparm);
double thedS = -5.9f+nparm->rlogc;
double mtemp;
char c1;
char c2;
char c3;
char c4;
unsigned int i;
char nseq[50];
char *useq = seq;
nparam_CleanSeq (seq, nseq, len);
useq = nseq;
for ( i=1;i<len;i++)
{
c1 = GETREVCODE(useq[i-1]); //nparam_getComplement(seq[i-1],1);
c2 = GETREVCODE(useq[i]); //nparam_getComplement(seq[i],1);
c3 = GETNUMCODE(useq[i-1]);
c4 = GETNUMCODE(useq[i]);
thedH += nparm->dH[c3][c4][c1][c2];//nparam_GetEnthalpy(nparm, c3,c4,c1,c2);
thedS += nparam_GetEntropy(nparm, c3,c4,c1,c2);
}
//printf("------------------\n");
mtemp = nparam_CalcTM(thedS,thedH);
//fprintf(stderr,"Enthalpy: %f, entropy: %f, seq: %s rloc=%f\n", thedH, thedS, useq, nparm->rlogc);
//exit (0);
return mtemp;
}
double nparam_CalcTwoTM(PNNParams nparm, char* seq1, char* seq2, int len)
{
double thedH = 0;
//double thedS = nparam_GetInitialEntropy(nparm);
double thedS = -5.9f+nparm->rlogc;
double mtemp;
char c1;
char c2;
char c3;
char c4;
unsigned int i;
char nseq1[50];
char nseq2[50];
char *useq1;
char *useq2;
nparam_CleanSeq (seq1, nseq1, len);
useq1 = nseq1;
nparam_CleanSeq (seq2, nseq2, len);
useq2 = nseq2;
//fprintf (stderr,"Primer : %s\n",useq);
for ( i=1;i<len;i++)
{
c1 = GETREVCODE(useq2[i-1]); //nparam_getComplement(seq[i-1],1);
c2 = GETREVCODE(useq2[i]); //nparam_getComplement(seq[i],1);
c3 = GETNUMCODE(useq1[i-1]);
c4 = GETNUMCODE(useq1[i]);
//fprintf (stderr,"Primer : %s %f %f %d %d, %d %d %f\n",useq,thedH,thedS,(int)c3,(int)c4,(int)c1,(int)c2,nparam_GetEnthalpy(nparm, c3,c4,c1,c2));
thedH += nparm->dH[c3][c4][c1][c2];//nparam_GetEnthalpy(nparm, c3,c4,c1,c2);
thedS += nparam_GetEntropy(nparm, c3,c4,c1,c2);
}
//fprintf(stderr,"------------------\n");
mtemp = nparam_CalcTM(thedS,thedH);
//if (mtemp == 0)
//{
// fprintf(stderr,"Enthalpy: %f, entropy: %f, seq: %s\n", thedH, thedS, useq);
//exit (0);
//}
return mtemp;
}
double calculateMeltingTemperatureBasic (char * seq) {
int gccount;
double temp;
int seqlen;
seqlen = strlen (seq);
gccount = nparam_CountGCContent (seq);
temp = 64.9 + 41*(gccount - 16.4)/seqlen;
return temp;
}

72
src/libthermo/nnparams.h Normal file
View File

@ -0,0 +1,72 @@
/*
* nnparams.h
* PHunterLib
*
* Nearest Neighbor Model Parameters
*
* Created by Tiayyba Riaz on 02/07/09.
*
*/
#ifndef NNPARAMS_H_
#define NNPARAMS_H_
#include <math.h>
#include <string.h>
//#include "../libecoprimer/ecoprimer.h"
// following defines to simplify coding...
#define ndH(a,b,c,d) nparm->dH[a][b][c][d]
#define ndS(a,b,c,d) nparm->dS[a][b][c][d]
#define forbidden_enthalpy 1000000000000000000.0f
#define R 1.987f
#define SALT_METHOD_SANTALUCIA 1
#define SALT_METHOD_OWCZARZY 2
#define DEF_CONC_PRIMERS 0.0000008
#define DEF_CONC_SEQUENCES 0
#define DEF_SALT 0.05
#define GETNUMCODE(a) bpencoder[a - 'A']
#define GETREVCODE(a) 5-bpencoder[a - 'A']
extern double forbidden_entropy;
static char bpencoder[] = { 1, // A
0, // b
2, // C
0,0,0, // d, e, f
3, // G
0,0,0,0,0,0,0,0,0,0,0,0, // h,i,j,k,l,m,n,o,p,q,r,s
4,0, // T,U
0,0,0,0,0}; // v,w,x,y,z
typedef struct CNNParams_st
{
double Ct1;
double Ct2;
double rlogc;
double kplus;
double kfac;
int saltMethod;
double gcContent;
double new_TM;
double dH[6][6][6][6]; // A-C-G-T + gap + initiation (dangling end, $ sign)
double dS[6][6][6][6];
}CNNParams, * PNNParams;
void nparam_InitParams(PNNParams nparm, double c1, double c2, double kp, int sm);
int nparam_CountGCContent(char * seq );
double nparam_GetEntropy(PNNParams nparm, char x0, char x1, char y0, char y1);
double nparam_GetEnthalpy(PNNParams nparm, char x0, char x1, char y0, char y1);
double nparam_CalcTM(double entropy,double enthalpy);
double nparam_CalcSelfTM(PNNParams nparm, char* seq, int len);
double nparam_CalcTwoTM(PNNParams nparm, char* seq1, char* seq2, int len);
double nparam_GetInitialEntropy(PNNParams nparm) ;
double calculateMeltingTemperatureBasic (char * seq);
//void getThermoProperties (ppair_t* pairs, size_t count, poptions_t options);
#endif

115
src/libthermo/thermostats.c Normal file
View File

@ -0,0 +1,115 @@
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#include "thermostats.h"
word_t extractSite(char* sequence, size_t begin, size_t length, bool_t strand)
{
char *c;
char *start;
uint32_t l;
word_t site = 0;
start=sequence+begin;
if (!strand)
start+=length-1;
for (c=start,
l=0;
l<length;
l++,
c+=(strand)? 1:-1)
site = (site << 2) | ((strand)? (*c):(~*c)&3);
return site;
}
void getThermoProperties (ppair_t* pairs, size_t count, poptions_t options)
{
size_t i, j,k,l;
uint32_t bp1,bp2;
uint32_t ep1,ep2;
word_t w1;
word_t w2;
bool_t strand;
char *sq,*sq1,*sq2,*c;
char prmrd[50];
char prmrr[50];
char sqsite[50];
double mtemp;
for (i = 0; i < count; i++)
{
w1 = pairs[i]->p1->word;
w2 = pairs[i]->p2->word;
if (!pairs[i]->asdirect1)
w1=ecoComplementWord(w1,options->primer_length);
if (!pairs[i]->asdirect2)
w2=ecoComplementWord(w2,options->primer_length);
strncpy(prmrd,ecoUnhashWord(w1, options->primer_length),options->primer_length);
strncpy(prmrr,ecoUnhashWord(w2, options->primer_length),options->primer_length);
prmrd[options->primer_length]=0;
prmrr[options->primer_length]=0;
pairs[i]->p1temp = nparam_CalcSelfTM (options->pnparm, prmrd, options->primer_length) - 273.0;
pairs[i]->p2temp = nparam_CalcSelfTM (options->pnparm, prmrr, options->primer_length) - 273.0;
pairs[i]->p1mintemp = 100;
pairs[i]->p2mintemp = 100;
for (j = 0; j < pairs[i]->pcr.ampcount; j++)
if (pairs[i]->pcr.amplifias[j].sequence->isexample)
{
sq = pairs[i]->pcr.amplifias[j].sequence->SQ;
strand = pairs[i]->pcr.amplifias[j].strand;
bp1 = pairs[i]->pcr.amplifias[j].begin - options->primer_length;
bp2 = pairs[i]->pcr.amplifias[j].end + 1;
if (!strand)
{
uint32_t tmp;
tmp=bp1;
bp1=bp2;
bp2=tmp;
}
// printf("%s : %s, %c",prmrd,
// ecoUnhashWord(extractSite(sq,bp1,options->primer_length,strand),options->primer_length),
// "rd"[strand]);
mtemp = nparam_CalcTwoTM(options->pnparm,
prmrd,
ecoUnhashWord(extractSite(sq,bp1,options->primer_length,strand),options->primer_length),
options->primer_length) - 273.0;
// printf(" %4.2f %4.2f\n",pairs[i]->p1temp,mtemp);
if (mtemp < pairs[i]->p1mintemp)
pairs[i]->p1mintemp = mtemp;
// printf("%s : %s, %c\n",prmrr,ecoUnhashWord(extractSite(sq,bp2,options->primer_length,!strand),options->primer_length),
// "rd"[strand]);
//
mtemp = nparam_CalcTwoTM(options->pnparm,
prmrr,
ecoUnhashWord(extractSite(sq,bp2,options->primer_length,!strand),options->primer_length),
options->primer_length) - 273.0;
if (mtemp < pairs[i]->p2mintemp)
pairs[i]->p2mintemp = mtemp;
}
if (w2 < w1)
{
mtemp = pairs[i]->p1temp;
pairs[i]->p1temp = pairs[i]->p2temp;
pairs[i]->p2temp = mtemp;
mtemp = pairs[i]->p1mintemp;
pairs[i]->p1mintemp = pairs[i]->p2mintemp;
pairs[i]->p2mintemp = mtemp;
}
}
}

View File

@ -0,0 +1,9 @@
#ifndef THERMOSTATS_H_
#define THERMOSTATS_H_
#include "../libecoprimer/ecoprimer.h"
void getThermoProperties (ppair_t* pairs, size_t count, poptions_t options);
word_t extractSite(char* sequence, size_t begin, size_t length, bool_t strand);
#endif

651
tools/ecoPCRFormat.py Executable file
View File

@ -0,0 +1,651 @@
#!/usr/bin/env python
import re
import gzip
import struct
import sys
import time
import getopt
try:
import psycopg2
_dbenable=True
except ImportError:
_dbenable=False
#####
#
#
# Generic file function
#
#
#####
def universalOpen(file):
if isinstance(file,str):
if file[-3:] == '.gz':
rep = gzip.open(file)
else:
rep = open(file)
else:
rep = file
return rep
def universalTell(file):
if isinstance(file, gzip.GzipFile):
file=file.myfileobj
return file.tell()
def fileSize(file):
if isinstance(file, gzip.GzipFile):
file=file.myfileobj
pos = file.tell()
file.seek(0,2)
length = file.tell()
file.seek(pos,0)
return length
def progressBar(pos,max,reset=False,delta=[]):
if reset:
del delta[:]
if not delta:
delta.append(time.time())
delta.append(time.time())
delta[1]=time.time()
elapsed = delta[1]-delta[0]
percent = float(pos)/max * 100
remain = time.strftime('%H:%M:%S',time.gmtime(elapsed / percent * (100-percent)))
bar = '#' * int(percent/2)
bar+= '|/-\\-'[pos % 5]
bar+= ' ' * (50 - int(percent/2))
sys.stderr.write('\r%5.1f %% |%s] remain : %s' %(percent,bar,remain))
#####
#
#
# NCBI Dump Taxonomy reader
#
#
#####
def endLessIterator(endedlist):
for x in endedlist:
yield x
while(1):
yield endedlist[-1]
class ColumnFile(object):
def __init__(self,stream,sep=None,strip=True,types=None):
if isinstance(stream,str):
self._stream = open(stream)
elif hasattr(stream,'next'):
self._stream = stream
else:
raise ValueError,'stream must be string or an iterator'
self._delimiter=sep
self._strip=strip
if types:
self._types=[x for x in types]
for i in xrange(len(self._types)):
if self._types[i] is bool:
self._types[i]=ColumnFile.str2bool
else:
self._types=None
def str2bool(x):
return bool(eval(x.strip()[0].upper(),{'T':True,'V':True,'F':False}))
str2bool = staticmethod(str2bool)
def __iter__(self):
return self
def next(self):
ligne = self._stream.next()
data = ligne.split(self._delimiter)
if self._strip or self._types:
data = [x.strip() for x in data]
if self._types:
it = endLessIterator(self._types)
data = [x[1](x[0]) for x in ((y,it.next()) for y in data)]
return data
def taxonCmp(t1,t2):
if t1[0] < t2[0]:
return -1
elif t1[0] > t2[0]:
return +1
return 0
def bsearchTaxon(taxonomy,taxid):
taxCount = len(taxonomy)
begin = 0
end = taxCount
oldcheck=taxCount
check = begin + end / 2
while check != oldcheck and taxonomy[check][0]!=taxid :
if taxonomy[check][0] < taxid:
begin=check
else:
end=check
oldcheck=check
check = (begin + end) / 2
if taxonomy[check][0]==taxid:
return check
else:
return None
def readNodeTable(file):
file = universalOpen(file)
nodes = ColumnFile(file,
sep='|',
types=(int,int,str,
str,str,bool,
int,bool,int,
bool,bool,bool,str))
print >>sys.stderr,"Reading taxonomy dump file..."
taxonomy=[[n[0],n[2],n[1]] for n in nodes]
print >>sys.stderr,"List all taxonomy rank..."
ranks =list(set(x[1] for x in taxonomy))
ranks.sort()
ranks = dict(map(None,ranks,xrange(len(ranks))))
print >>sys.stderr,"Sorting taxons..."
taxonomy.sort(taxonCmp)
print >>sys.stderr,"Indexing taxonomy..."
index = {}
for t in taxonomy:
index[t[0]]=bsearchTaxon(taxonomy, t[0])
print >>sys.stderr,"Indexing parent and rank..."
for t in taxonomy:
t[1]=ranks[t[1]]
t[2]=index[t[2]]
return taxonomy,ranks,index
def nameIterator(file):
file = universalOpen(file)
names = ColumnFile(file,
sep='|',
types=(int,str,
str,str))
for taxid,name,unique,classname,white in names:
yield taxid,name,classname
def mergedNodeIterator(file):
file = universalOpen(file)
merged = ColumnFile(file,
sep='|',
types=(int,int,str))
for taxid,current,white in merged:
yield taxid,current
def deletedNodeIterator(file):
file = universalOpen(file)
deleted = ColumnFile(file,
sep='|',
types=(int,str))
for taxid,white in deleted:
yield taxid
def readTaxonomyDump(taxdir):
taxonomy,ranks,index = readNodeTable('%s/nodes.dmp' % taxdir)
print >>sys.stderr,"Adding scientific name..."
alternativeName=[]
for taxid,name,classname in nameIterator('%s/names.dmp' % taxdir):
alternativeName.append((name,classname,index[taxid]))
if classname == 'scientific name':
taxonomy[index[taxid]].append(name)
print >>sys.stderr,"Adding taxid alias..."
for taxid,current in mergedNodeIterator('%s/merged.dmp' % taxdir):
index[taxid]=index[current]
print >>sys.stderr,"Adding deleted taxid..."
for taxid in deletedNodeIterator('%s/delnodes.dmp' % taxdir):
index[taxid]=None
return taxonomy,ranks,alternativeName,index
def readTaxonomyDB(dbname):
connection = psycopg2.connect(database=dbname)
cursor = connection.cursor()
cursor.execute("select numid,rank,parent from ncbi_taxonomy.taxon")
taxonomy=[list(x) for x in cursor]
cursor.execute("select rank_class from ncbi_taxonomy.taxon_rank_class order by rank_class")
ranks=cursor.fetchall()
ranks = dict(map(None,(x[0] for x in ranks),xrange(len(ranks))))
print >>sys.stderr,"Sorting taxons..."
taxonomy.sort(taxonCmp)
print >>sys.stderr,"Indexing taxonomy..."
index = {}
for t in taxonomy:
index[t[0]]=bsearchTaxon(taxonomy, t[0])
print >>sys.stderr,"Indexing parent and rank..."
for t in taxonomy:
t[1]=ranks[t[1]]
try:
t[2]=index[t[2]]
except KeyError,e:
if t[2] is None and t[0]==1:
t[2]=index[t[0]]
else:
raise e
cursor.execute("select taxid,name,category from ncbi_taxonomy.name")
alternativeName=[]
for taxid,name,classname in cursor:
alternativeName.append((name,classname,index[taxid]))
if classname == 'scientific name':
taxonomy[index[taxid]].append(name)
cursor.execute("select old_numid,current_numid from ncbi_taxonomy.taxon_id_alias")
print >>sys.stderr,"Adding taxid alias..."
for taxid,current in cursor:
if current is not None:
index[taxid]=index[current]
else:
index[taxid]=None
return taxonomy,ranks,alternativeName,index
#####
#
#
# Genbank/EMBL sequence reader
#
#
#####
def entryIterator(file):
file = universalOpen(file)
rep =[]
for ligne in file:
rep.append(ligne)
if ligne == '//\n':
rep = ''.join(rep)
yield rep
rep = []
def fastaEntryIterator(file):
file = universalOpen(file)
rep =[]
for ligne in file:
if ligne[0] == '>' and rep:
rep = ''.join(rep)
yield rep
rep = []
rep.append(ligne)
if rep:
rep = ''.join(rep)
yield rep
_cleanSeq = re.compile('[ \n0-9]+')
def cleanSeq(seq):
return _cleanSeq.sub('',seq)
_gbParseID = re.compile('(?<=^LOCUS {7})[^ ]+(?= )',re.MULTILINE)
_gbParseDE = re.compile('(?<=^DEFINITION {2}).+?\. *$(?=[^ ])',re.MULTILINE+re.DOTALL)
_gbParseSQ = re.compile('(?<=^ORIGIN).+?(?=^//$)',re.MULTILINE+re.DOTALL)
_gbParseTX = re.compile('(?<= /db_xref="taxon:)[0-9]+(?=")')
def genbankEntryParser(entry):
Id = _gbParseID.findall(entry)[0]
De = ' '.join(_gbParseDE.findall(entry)[0].split())
Sq = cleanSeq(_gbParseSQ.findall(entry)[0].upper())
try:
Tx = int(_gbParseTX.findall(entry)[0])
except IndexError:
Tx = None
return {'id':Id,'taxid':Tx,'definition':De,'sequence':Sq}
######################
_cleanDef = re.compile('[\nDE]')
def cleanDef(definition):
return _cleanDef.sub('',definition)
_emblParseID = re.compile('(?<=^ID {3})[^ ]+(?=;)',re.MULTILINE)
_emblParseDE = re.compile('(?<=^DE {3}).+?\. *$(?=[^ ])',re.MULTILINE+re.DOTALL)
_emblParseSQ = re.compile('(?<=^ ).+?(?=^//$)',re.MULTILINE+re.DOTALL)
_emblParseTX = re.compile('(?<= /db_xref="taxon:)[0-9]+(?=")')
def emblEntryParser(entry):
Id = _emblParseID.findall(entry)[0]
De = ' '.join(cleanDef(_emblParseDE.findall(entry)[0]).split())
Sq = cleanSeq(_emblParseSQ.findall(entry)[0].upper())
try:
Tx = int(_emblParseTX.findall(entry)[0])
except IndexError:
Tx = None
return {'id':Id,'taxid':Tx,'definition':De,'sequence':Sq}
######################
_fastaSplit=re.compile(';\W*')
def parseFasta(seq):
seq=seq.split('\n')
title = seq[0].strip()[1:].split(None,1)
id=title[0]
if len(title) == 2:
field = _fastaSplit.split(title[1])
else:
field=[]
info = dict(x.split('=',1) for x in field if '=' in x)
definition = ' '.join([x for x in field if '=' not in x])
seq=(''.join([x.strip() for x in seq[1:]])).upper()
return id,seq,definition,info
def fastaEntryParser(entry):
id,seq,definition,info = parseFasta(entry)
Tx = info.get('taxid',None)
if Tx is not None:
Tx=int(Tx)
return {'id':id,'taxid':Tx,'definition':definition,'sequence':seq}
def sequenceIteratorFactory(entryParser,entryIterator):
def sequenceIterator(file):
for entry in entryIterator(file):
yield entryParser(entry)
return sequenceIterator
def taxonomyInfo(entry,connection):
taxid = entry['taxid']
curseur = connection.cursor()
curseur.execute("""
select taxid,species,genus,family,
taxonomy.scientificName(taxid) as sn,
taxonomy.scientificName(species) as species_sn,
taxonomy.scientificName(genus) as genus_sn,
taxonomy.scientificName(family) as family_sn
from
(
select alias as taxid,
taxonomy.getSpecies(alias) as species,
taxonomy.getGenus(alias) as genus,
taxonomy.getFamily(alias) as family
from taxonomy.aliases
where id=%d ) as tax
""" % taxid)
rep = curseur.fetchone()
entry['current_taxid']=rep[0]
entry['species']=rep[1]
entry['genus']=rep[2]
entry['family']=rep[3]
entry['scientific_name']=rep[4]
entry['species_sn']=rep[5]
entry['genus_sn']=rep[6]
entry['family_sn']=rep[7]
return entry
#####
#
#
# Binary writer
#
#
#####
def ecoSeqPacker(sq):
compactseq = gzip.zlib.compress(sq['sequence'],9)
cptseqlength = len(compactseq)
delength = len(sq['definition'])
totalSize = 4 + 20 + 4 + 4 + 4 + cptseqlength + delength
packed = struct.pack('> I I 20s I I I %ds %ds' % (delength,cptseqlength),
totalSize,
sq['taxid'],
sq['id'],
delength,
len(sq['sequence']),
cptseqlength,
sq['definition'],
compactseq)
assert len(packed) == totalSize+4, "error in sequence packing"
return packed
def ecoTaxPacker(tx):
namelength = len(tx[3])
totalSize = 4 + 4 + 4 + 4 + namelength
packed = struct.pack('> I I I I I %ds' % namelength,
totalSize,
tx[0],
tx[1],
tx[2],
namelength,
tx[3])
return packed
def ecoRankPacker(rank):
namelength = len(rank)
packed = struct.pack('> I %ds' % namelength,
namelength,
rank)
return packed
def ecoNamePacker(name):
namelength = len(name[0])
classlength= len(name[1])
totalSize = namelength + classlength + 4 + 4 + 4 + 4
packed = struct.pack('> I I I I I %ds %ds' % (namelength,classlength),
totalSize,
int(name[1]=='scientific name'),
namelength,
classlength,
name[2],
name[0],
name[1])
return packed
def ecoSeqWriter(file,input,taxindex,parser):
output = open(file,'wb')
input = universalOpen(input)
inputsize = fileSize(input)
entries = parser(input)
seqcount=0
skipped = []
output.write(struct.pack('> I',seqcount))
progressBar(1, inputsize,reset=True)
for entry in entries:
if entry['taxid'] is not None:
try:
entry['taxid']=taxindex[entry['taxid']]
except KeyError:
entry['taxid']=None
if entry['taxid'] is not None:
seqcount+=1
output.write(ecoSeqPacker(entry))
else:
skipped.append(entry['id'])
where = universalTell(input)
progressBar(where, inputsize)
print >>sys.stderr," Readed sequences : %d " % seqcount,
else:
skipped.append(entry['id'])
print >>sys.stderr
output.seek(0,0)
output.write(struct.pack('> I',seqcount))
output.close()
return skipped
def ecoTaxWriter(file,taxonomy):
output = open(file,'wb')
output.write(struct.pack('> I',len(taxonomy)))
for tx in taxonomy:
output.write(ecoTaxPacker(tx))
output.close()
def ecoRankWriter(file,ranks):
output = open(file,'wb')
output.write(struct.pack('> I',len(ranks)))
rankNames = ranks.keys()
rankNames.sort()
for rank in rankNames:
output.write(ecoRankPacker(rank))
output.close()
def nameCmp(n1,n2):
name1=n1[0].upper()
name2=n2[0].upper()
if name1 < name2:
return -1
elif name1 > name2:
return 1
return 0
def ecoNameWriter(file,names):
output = open(file,'wb')
output.write(struct.pack('> I',len(names)))
names.sort(nameCmp)
for name in names:
output.write(ecoNamePacker(name))
output.close()
def ecoDBWriter(prefix,taxonomy,seqFileNames,parser):
ecoRankWriter('%s.rdx' % prefix, taxonomy[1])
ecoTaxWriter('%s.tdx' % prefix, taxonomy[0])
ecoNameWriter('%s.ndx' % prefix, taxonomy[2])
filecount = 0
for filename in seqFileNames:
filecount+=1
sk=ecoSeqWriter('%s_%03d.sdx' % (prefix,filecount),
filename,
taxonomy[3],
parser)
if sk:
print >>sys.stderr,"Skipped entry :"
print >>sys.stderr,sk
def ecoParseOptions(arguments):
opt = {
'prefix' : 'ecodb',
'taxdir' : 'taxdump',
'parser' : sequenceIteratorFactory(genbankEntryParser,
entryIterator)
}
o,filenames = getopt.getopt(arguments,
'ht:T:n:gfe',
['help',
'taxonomy=',
'taxonomy_db=',
'name=',
'genbank',
'fasta',
'embl'])
for name,value in o:
if name in ('-h','--help'):
printHelp()
exit()
elif name in ('-t','--taxonomy'):
opt['taxmod']='dump'
opt['taxdir']=value
elif name in ('-T','--taxonomy_db'):
opt['taxmod']='db'
opt['taxdb']=value
elif name in ('-n','--name'):
opt['prefix']=value
elif name in ('-g','--genbank'):
opt['parser']=sequenceIteratorFactory(genbankEntryParser,
entryIterator)
elif name in ('-f','--fasta'):
opt['parser']=sequenceIteratorFactory(fastaEntryParser,
fastaEntryIterator)
elif name in ('-e','--embl'):
opt['parser']=sequenceIteratorFactory(emblEntryParser,
entryIterator)
else:
raise ValueError,'Unknown option %s' % name
return opt,filenames
def printHelp():
print "-----------------------------------"
print " ecoPCRFormat.py"
print "-----------------------------------"
print "ecoPCRFormat.py [option] <argument>"
print "-----------------------------------"
print "-e --embl :[E]mbl format"
print "-f --fasta :[F]asta format"
print "-g --genbank :[G]enbank format"
print "-h --help :[H]elp - print this help"
print "-n --name :[N]ame of the new database created"
print "-t --taxonomy :[T]axonomy - path to the taxonomy database"
print " :bcp-like dump from GenBank taxonomy database."
print "-----------------------------------"
if __name__ == '__main__':
opt,filenames = ecoParseOptions(sys.argv[1:])
if opt['taxmod']=='dump':
taxonomy = readTaxonomyDump(opt['taxdir'])
elif opt['taxmod']=='db':
taxonomy = readTaxonomyDB(opt['taxdb'])
ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser'])