commit e2ebc357eea44e582b468dff4290c6c688157f3b Author: Eric Coissac Date: Wed Jun 1 22:42:30 2011 +0000 Backup with sets and ahocorasick git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/branches/ecoPrimers-2.1@298 60f365c0-8329-0410-b2a4-ec073aeeaa1d diff --git a/.cproject b/.cproject new file mode 100644 index 0000000..f99343a --- /dev/null +++ b/.cproject @@ -0,0 +1,221 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.project b/.project new file mode 100644 index 0000000..50a83f9 --- /dev/null +++ b/.project @@ -0,0 +1,83 @@ + + + ecoPrimers + + + + + + org.python.pydev.PyDevBuilder + + + + + org.eclipse.cdt.managedbuilder.core.genmakebuilder + clean,full,incremental, + + + ?name? 
+ + + + org.eclipse.cdt.make.core.append_environment + true + + + org.eclipse.cdt.make.core.autoBuildTarget + all + + + org.eclipse.cdt.make.core.buildArguments + + + + org.eclipse.cdt.make.core.buildCommand + make + + + org.eclipse.cdt.make.core.cleanBuildTarget + clean + + + org.eclipse.cdt.make.core.contents + org.eclipse.cdt.make.core.activeConfigSettings + + + org.eclipse.cdt.make.core.enableAutoBuild + false + + + org.eclipse.cdt.make.core.enableCleanBuild + true + + + org.eclipse.cdt.make.core.enableFullBuild + true + + + org.eclipse.cdt.make.core.fullBuildTarget + all + + + org.eclipse.cdt.make.core.stopOnError + true + + + org.eclipse.cdt.make.core.useDefaultBuildCmd + true + + + + + org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder + + + + + + org.eclipse.cdt.core.cnature + org.eclipse.cdt.managedbuilder.core.ScannerConfigNature + org.eclipse.cdt.managedbuilder.core.managedBuildNature + org.python.pydev.pythonNature + + diff --git a/.pydevproject b/.pydevproject new file mode 100644 index 0000000..ac30bc7 --- /dev/null +++ b/.pydevproject @@ -0,0 +1,7 @@ + + + + +Python 2.6 +python 2.6 + diff --git a/Licence_CeCILL_V2-en.txt b/Licence_CeCILL_V2-en.txt new file mode 100644 index 0000000..fcc8df2 --- /dev/null +++ b/Licence_CeCILL_V2-en.txt @@ -0,0 +1,506 @@ + +CeCILL FREE SOFTWARE LICENSE AGREEMENT + + + Notice + +This Agreement is a Free Software license agreement that is the result +of discussions between its authors in order to ensure compliance with +the two main principles guiding its drafting: + + * firstly, compliance with the principles governing the distribution + of Free Software: access to source code, broad rights granted to + users, + * secondly, the election of a governing law, French law, with which + it is conformant, both as regards the law of torts and + intellectual property law, and the protection that it offers to + both authors and holders of the economic rights over software. 
+ +The authors of the CeCILL (for Ce[a] C[nrs] I[nria] L[ogiciel] L[ibre]) +license are: + +Commissariat à l'Energie Atomique - CEA, a public scientific, technical +and industrial research establishment, having its principal place of +business at 25 rue Leblanc, immeuble Le Ponant D, 75015 Paris, France. + +Centre National de la Recherche Scientifique - CNRS, a public scientific +and technological establishment, having its principal place of business +at 3 rue Michel-Ange, 75794 Paris cedex 16, France. + +Institut National de Recherche en Informatique et en Automatique - +INRIA, a public scientific and technological establishment, having its +principal place of business at Domaine de Voluceau, Rocquencourt, BP +105, 78153 Le Chesnay cedex, France. + + + Preamble + +The purpose of this Free Software license agreement is to grant users +the right to modify and redistribute the software governed by this +license within the framework of an open source distribution model. + +The exercising of these rights is conditional upon certain obligations +for users so as to preserve this status for all subsequent redistributions. + +In consideration of access to the source code and the rights to copy, +modify and redistribute granted by the license, users are provided only +with a limited warranty and the software's author, the holder of the +economic rights, and the successive licensors only have limited liability. + +In this respect, the risks associated with loading, using, modifying +and/or developing or reproducing the software by the user are brought to +the user's attention, given its Free Software status, which may make it +complicated to use, with the result that its use is reserved for +developers and experienced professionals having in-depth computer +knowledge. 
Users are therefore encouraged to load and test the +suitability of the software as regards their requirements in conditions +enabling the security of their systems and/or data to be ensured and, +more generally, to use and operate it in the same conditions of +security. This Agreement may be freely reproduced and published, +provided it is not altered, and that no provisions are either added or +removed herefrom. + +This Agreement may apply to any or all software for which the holder of +the economic rights decides to submit the use thereof to its provisions. + + + Article 1 - DEFINITIONS + +For the purpose of this Agreement, when the following expressions +commence with a capital letter, they shall have the following meaning: + +Agreement: means this license agreement, and its possible subsequent +versions and annexes. + +Software: means the software in its Object Code and/or Source Code form +and, where applicable, its documentation, "as is" when the Licensee +accepts the Agreement. + +Initial Software: means the Software in its Source Code and possibly its +Object Code form and, where applicable, its documentation, "as is" when +it is first distributed under the terms and conditions of the Agreement. + +Modified Software: means the Software modified by at least one +Contribution. + +Source Code: means all the Software's instructions and program lines to +which access is required so as to modify the Software. + +Object Code: means the binary files originating from the compilation of +the Source Code. + +Holder: means the holder(s) of the economic rights over the Initial +Software. + +Licensee: means the Software user(s) having accepted the Agreement. + +Contributor: means a Licensee having made at least one Contribution. + +Licensor: means the Holder, or any other individual or legal entity, who +distributes the Software under the Agreement. 
+ +Contribution: means any or all modifications, corrections, translations, +adaptations and/or new functions integrated into the Software by any or +all Contributors, as well as any or all Internal Modules. + +Module: means a set of sources files including their documentation that +enables supplementary functions or services in addition to those offered +by the Software. + +External Module: means any or all Modules, not derived from the +Software, so that this Module and the Software run in separate address +spaces, with one calling the other when they are run. + +Internal Module: means any or all Module, connected to the Software so +that they both execute in the same address space. + +GNU GPL: means the GNU General Public License version 2 or any +subsequent version, as published by the Free Software Foundation Inc. + +Parties: mean both the Licensee and the Licensor. + +These expressions may be used both in singular and plural form. + + + Article 2 - PURPOSE + +The purpose of the Agreement is the grant by the Licensor to the +Licensee of a non-exclusive, transferable and worldwide license for the +Software as set forth in Article 5 hereinafter for the whole term of the +protection granted by the rights over said Software. + + + Article 3 - ACCEPTANCE + +3.1 The Licensee shall be deemed as having accepted the terms and +conditions of this Agreement upon the occurrence of the first of the +following events: + + * (i) loading the Software by any or all means, notably, by + downloading from a remote server, or by loading from a physical + medium; + * (ii) the first time the Licensee exercises any of the rights + granted hereunder. 
+ +3.2 One copy of the Agreement, containing a notice relating to the +characteristics of the Software, to the limited warranty, and to the +fact that its use is restricted to experienced users has been provided +to the Licensee prior to its acceptance as set forth in Article 3.1 +hereinabove, and the Licensee hereby acknowledges that it has read and +understood it. + + + Article 4 - EFFECTIVE DATE AND TERM + + + 4.1 EFFECTIVE DATE + +The Agreement shall become effective on the date when it is accepted by +the Licensee as set forth in Article 3.1. + + + 4.2 TERM + +The Agreement shall remain in force for the entire legal term of +protection of the economic rights over the Software. + + + Article 5 - SCOPE OF RIGHTS GRANTED + +The Licensor hereby grants to the Licensee, who accepts, the following +rights over the Software for any or all use, and for the term of the +Agreement, on the basis of the terms and conditions set forth hereinafter. + +Besides, if the Licensor owns or comes to own one or more patents +protecting all or part of the functions of the Software or of its +components, the Licensor undertakes not to enforce the rights granted by +these patents against successive Licensees using, exploiting or +modifying the Software. If these patents are transferred, the Licensor +undertakes to have the transferees subscribe to the obligations set +forth in this paragraph. + + + 5.1 RIGHT OF USE + +The Licensee is authorized to use the Software, without any limitation +as to its fields of application, with it being hereinafter specified +that this comprises: + + 1. permanent or temporary reproduction of all or part of the Software + by any or all means and in any or all form. + + 2. loading, displaying, running, or storing the Software on any or + all medium. + + 3. entitlement to observe, study or test its operation so as to + determine the ideas and principles behind any or all constituent + elements of said Software. 
This shall apply when the Licensee + carries out any or all loading, displaying, running, transmission + or storage operation as regards the Software, that it is entitled + to carry out hereunder. + + + 5.2 ENTITLEMENT TO MAKE CONTRIBUTIONS + +The right to make Contributions includes the right to translate, adapt, +arrange, or make any or all modifications to the Software, and the right +to reproduce the resulting software. + +The Licensee is authorized to make any or all Contributions to the +Software provided that it includes an explicit notice that it is the +author of said Contribution and indicates the date of the creation thereof. + + + 5.3 RIGHT OF DISTRIBUTION + +In particular, the right of distribution includes the right to publish, +transmit and communicate the Software to the general public on any or +all medium, and by any or all means, and the right to market, either in +consideration of a fee, or free of charge, one or more copies of the +Software by any means. + +The Licensee is further authorized to distribute copies of the modified +or unmodified Software to third parties according to the terms and +conditions set forth hereinafter. + + + 5.3.1 DISTRIBUTION OF SOFTWARE WITHOUT MODIFICATION + +The Licensee is authorized to distribute true copies of the Software in +Source Code or Object Code form, provided that said distribution +complies with all the provisions of the Agreement and is accompanied by: + + 1. a copy of the Agreement, + + 2. a notice relating to the limitation of both the Licensor's + warranty and liability as set forth in Articles 8 and 9, + +and that, in the event that only the Object Code of the Software is +redistributed, the Licensee allows future Licensees unhindered access to +the full Source Code of the Software by indicating how to access it, it +being understood that the additional cost of acquiring the Source Code +shall not exceed the cost of transferring the data. 
+ + + 5.3.2 DISTRIBUTION OF MODIFIED SOFTWARE + +When the Licensee makes a Contribution to the Software, the terms and +conditions for the distribution of the resulting Modified Software +become subject to all the provisions of this Agreement. + +The Licensee is authorized to distribute the Modified Software, in +source code or object code form, provided that said distribution +complies with all the provisions of the Agreement and is accompanied by: + + 1. a copy of the Agreement, + + 2. a notice relating to the limitation of both the Licensor's + warranty and liability as set forth in Articles 8 and 9, + +and that, in the event that only the object code of the Modified +Software is redistributed, the Licensee allows future Licensees +unhindered access to the full source code of the Modified Software by +indicating how to access it, it being understood that the additional +cost of acquiring the source code shall not exceed the cost of +transferring the data. + + + 5.3.3 DISTRIBUTION OF EXTERNAL MODULES + +When the Licensee has developed an External Module, the terms and +conditions of this Agreement do not apply to said External Module, that +may be distributed under a separate license agreement. + + + 5.3.4 COMPATIBILITY WITH THE GNU GPL + +The Licensee can include a code that is subject to the provisions of one +of the versions of the GNU GPL in the Modified or unmodified Software, +and distribute that entire code under the terms of the same version of +the GNU GPL. + +The Licensee can include the Modified or unmodified Software in a code +that is subject to the provisions of one of the versions of the GNU GPL, +and distribute that entire code under the terms of the same version of +the GNU GPL. + + + Article 6 - INTELLECTUAL PROPERTY + + + 6.1 OVER THE INITIAL SOFTWARE + +The Holder owns the economic rights over the Initial Software. 
Any or +all use of the Initial Software is subject to compliance with the terms +and conditions under which the Holder has elected to distribute its work +and no one shall be entitled to modify the terms and conditions for the +distribution of said Initial Software. + +The Holder undertakes that the Initial Software will remain ruled at +least by this Agreement, for the duration set forth in Article 4.2. + + + 6.2 OVER THE CONTRIBUTIONS + +The Licensee who develops a Contribution is the owner of the +intellectual property rights over this Contribution as defined by +applicable law. + + + 6.3 OVER THE EXTERNAL MODULES + +The Licensee who develops an External Module is the owner of the +intellectual property rights over this External Module as defined by +applicable law and is free to choose the type of agreement that shall +govern its distribution. + + + 6.4 JOINT PROVISIONS + +The Licensee expressly undertakes: + + 1. not to remove, or modify, in any manner, the intellectual property + notices attached to the Software; + + 2. to reproduce said notices, in an identical manner, in the copies + of the Software modified or not. + +The Licensee undertakes not to directly or indirectly infringe the +intellectual property rights of the Holder and/or Contributors on the +Software and to take, where applicable, vis-à-vis its staff, any and all +measures required to ensure respect of said intellectual property rights +of the Holder and/or Contributors. + + + Article 7 - RELATED SERVICES + +7.1 Under no circumstances shall the Agreement oblige the Licensor to +provide technical assistance or maintenance services for the Software. + +However, the Licensor is entitled to offer this type of services. The +terms and conditions of such technical assistance, and/or such +maintenance, shall be set forth in a separate instrument. Only the +Licensor offering said maintenance and/or technical assistance services +shall incur liability therefor. 
+ +7.2 Similarly, any Licensor is entitled to offer to its licensees, under +its sole responsibility, a warranty, that shall only be binding upon +itself, for the redistribution of the Software and/or the Modified +Software, under terms and conditions that it is free to decide. Said +warranty, and the financial terms and conditions of its application, +shall be subject of a separate instrument executed between the Licensor +and the Licensee. + + + Article 8 - LIABILITY + +8.1 Subject to the provisions of Article 8.2, the Licensee shall be +entitled to claim compensation for any direct loss it may have suffered +from the Software as a result of a fault on the part of the relevant +Licensor, subject to providing evidence thereof. + +8.2 The Licensor's liability is limited to the commitments made under +this Agreement and shall not be incurred as a result of in particular: +(i) loss due the Licensee's total or partial failure to fulfill its +obligations, (ii) direct or consequential loss that is suffered by the +Licensee due to the use or performance of the Software, and (iii) more +generally, any consequential loss. In particular the Parties expressly +agree that any or all pecuniary or business loss (i.e. loss of data, +loss of profits, operating loss, loss of customers or orders, +opportunity cost, any disturbance to business activities) or any or all +legal proceedings instituted against the Licensee by a third party, +shall constitute consequential loss and shall not provide entitlement to +any or all compensation from the Licensor. + + + Article 9 - WARRANTY + +9.1 The Licensee acknowledges that the scientific and technical +state-of-the-art when the Software was distributed did not enable all +possible uses to be tested and verified, nor for the presence of +possible defects to be detected. 
In this respect, the Licensee's +attention has been drawn to the risks associated with loading, using, +modifying and/or developing and reproducing the Software which are +reserved for experienced users. + +The Licensee shall be responsible for verifying, by any or all means, +the suitability of the product for its requirements, its good working +order, and for ensuring that it shall not cause damage to either persons +or properties. + +9.2 The Licensor hereby represents, in good faith, that it is entitled +to grant all the rights over the Software (including in particular the +rights set forth in Article 5). + +9.3 The Licensee acknowledges that the Software is supplied "as is" by +the Licensor without any other express or tacit warranty, other than +that provided for in Article 9.2 and, in particular, without any warranty +as to its commercial value, its secured, safe, innovative or relevant +nature. + +Specifically, the Licensor does not warrant that the Software is free +from any error, that it will operate without interruption, that it will +be compatible with the Licensee's own equipment and software +configuration, nor that it will meet the Licensee's requirements. + +9.4 The Licensor does not either expressly or tacitly warrant that the +Software does not infringe any third party intellectual property right +relating to a patent, software or any other property right. Therefore, +the Licensor disclaims any and all liability towards the Licensee +arising out of any or all proceedings for infringement that may be +instituted in respect of the use, modification and redistribution of the +Software. Nevertheless, should such proceedings be instituted against +the Licensee, the Licensor shall provide it with technical and legal +assistance for its defense. Such technical and legal assistance shall be +decided on a case-by-case basis between the relevant Licensor and the +Licensee pursuant to a memorandum of understanding. 
The Licensor +disclaims any and all liability as regards the Licensee's use of the +name of the Software. No warranty is given as regards the existence of +prior rights over the name of the Software or as regards the existence +of a trademark. + + + Article 10 - TERMINATION + +10.1 In the event of a breach by the Licensee of its obligations +hereunder, the Licensor may automatically terminate this Agreement +thirty (30) days after notice has been sent to the Licensee and has +remained ineffective. + +10.2 A Licensee whose Agreement is terminated shall no longer be +authorized to use, modify or distribute the Software. However, any +licenses that it may have granted prior to termination of the Agreement +shall remain valid subject to their having been granted in compliance +with the terms and conditions hereof. + + + Article 11 - MISCELLANEOUS + + + 11.1 EXCUSABLE EVENTS + +Neither Party shall be liable for any or all delay, or failure to +perform the Agreement, that may be attributable to an event of force +majeure, an act of God or an outside cause, such as defective +functioning or interruptions of the electricity or telecommunications +networks, network paralysis following a virus attack, intervention by +government authorities, natural disasters, water damage, earthquakes, +fire, explosions, strikes and labor unrest, war, etc. + +11.2 Any failure by either Party, on one or more occasions, to invoke +one or more of the provisions hereof, shall under no circumstances be +interpreted as being a waiver by the interested Party of its right to +invoke said provision(s) subsequently. + +11.3 The Agreement cancels and replaces any or all previous agreements, +whether written or oral, between the Parties and having the same +purpose, and constitutes the entirety of the agreement between said +Parties concerning said purpose. 
No supplement or modification to the +terms and conditions hereof shall be effective as between the Parties +unless it is made in writing and signed by their duly authorized +representatives. + +11.4 In the event that one or more of the provisions hereof were to +conflict with a current or future applicable act or legislative text, +said act or legislative text shall prevail, and the Parties shall make +the necessary amendments so as to comply with said act or legislative +text. All other provisions shall remain effective. Similarly, invalidity +of a provision of the Agreement, for any reason whatsoever, shall not +cause the Agreement as a whole to be invalid. + + + 11.5 LANGUAGE + +The Agreement is drafted in both French and English and both versions +are deemed authentic. + + + Article 12 - NEW VERSIONS OF THE AGREEMENT + +12.1 Any person is authorized to duplicate and distribute copies of this +Agreement. + +12.2 So as to ensure coherence, the wording of this Agreement is +protected and may only be modified by the authors of the License, who +reserve the right to periodically publish updates or new versions of the +Agreement, each with a separate number. These subsequent versions may +address new issues encountered by Free Software. + +12.3 Any Software distributed under a given version of the Agreement may +only be subsequently distributed under the same version of the Agreement +or a subsequent version, subject to the provisions of Article 5.3.4. + + + Article 13 - GOVERNING LAW AND JURISDICTION + +13.1 The Agreement is governed by French law. The Parties agree to +endeavor to seek an amicable solution to any disagreements or disputes +that may arise during the performance of the Agreement. + +13.2 Failing an amicable solution within two (2) months as from their +occurrence, and unless emergency proceedings are necessary, the +disagreements or disputes shall be referred to the Paris Courts having +jurisdiction, by the more diligent Party. 
+ + +Version 2.0 dated 2006-09-05. diff --git a/Licence_CeCILL_V2-fr.txt b/Licence_CeCILL_V2-fr.txt new file mode 100644 index 0000000..1613fca --- /dev/null +++ b/Licence_CeCILL_V2-fr.txt @@ -0,0 +1,512 @@ + +CONTRAT DE LICENCE DE LOGICIEL LIBRE CeCILL + + + Avertissement + +Ce contrat est une licence de logiciel libre issue d'une concertation +entre ses auteurs afin que le respect de deux grands principes préside à +sa rédaction: + + * d'une part, le respect des principes de diffusion des logiciels + libres: accès au code source, droits étendus conférés aux + utilisateurs, + * d'autre part, la désignation d'un droit applicable, le droit + français, auquel elle est conforme, tant au regard du droit de la + responsabilité civile que du droit de la propriété intellectuelle + et de la protection qu'il offre aux auteurs et titulaires des + droits patrimoniaux sur un logiciel. + +Les auteurs de la licence CeCILL (pour Ce[a] C[nrs] I[nria] L[ogiciel] +L[ibre]) sont: + +Commissariat à l'Energie Atomique - CEA, établissement public de +recherche à caractère scientifique, technique et industriel, dont le +siège est situé 25 rue Leblanc, immeuble Le Ponant D, 75015 Paris. + +Centre National de la Recherche Scientifique - CNRS, établissement +public à caractère scientifique et technologique, dont le siège est +situé 3 rue Michel-Ange, 75794 Paris cedex 16. + +Institut National de Recherche en Informatique et en Automatique - +INRIA, établissement public à caractère scientifique et technologique, +dont le siège est situé Domaine de Voluceau, Rocquencourt, BP 105, 78153 +Le Chesnay cedex. + + + Préambule + +Ce contrat est une licence de logiciel libre dont l'objectif est de +conférer aux utilisateurs la liberté de modification et de +redistribution du logiciel régi par cette licence dans le cadre d'un +modèle de diffusion en logiciel libre. 
+ +L'exercice de ces libertés est assorti de certains devoirs à la charge +des utilisateurs afin de préserver ce statut au cours des +redistributions ultérieures. + +L'accessibilité au code source et les droits de copie, de modification +et de redistribution qui en découlent ont pour contrepartie de n'offrir +aux utilisateurs qu'une garantie limitée et de ne faire peser sur +l'auteur du logiciel, le titulaire des droits patrimoniaux et les +concédants successifs qu'une responsabilité restreinte. + +A cet égard l'attention de l'utilisateur est attirée sur les risques +associés au chargement, à l'utilisation, à la modification et/ou au +développement et à la reproduction du logiciel par l'utilisateur étant +donné sa spécificité de logiciel libre, qui peut le rendre complexe à +manipuler et qui le réserve donc à des développeurs ou des +professionnels avertis possédant des connaissances informatiques +approfondies. Les utilisateurs sont donc invités à charger et tester +l'adéquation du logiciel à leurs besoins dans des conditions permettant +d'assurer la sécurité de leurs systèmes et/ou de leurs données et, plus +généralement, à l'utiliser et l'exploiter dans les mêmes conditions de +sécurité. Ce contrat peut être reproduit et diffusé librement, sous +réserve de le conserver en l'état, sans ajout ni suppression de clauses. + +Ce contrat est susceptible de s'appliquer à tout logiciel dont le +titulaire des droits patrimoniaux décide de soumettre l'exploitation aux +dispositions qu'il contient. + + + Article 1 - DEFINITIONS + +Dans ce contrat, les termes suivants, lorsqu'ils seront écrits avec une +lettre capitale, auront la signification suivante: + +Contrat: désigne le présent contrat de licence, ses éventuelles versions +postérieures et annexes. + +Logiciel: désigne le logiciel sous sa forme de Code Objet et/ou de Code +Source et le cas échéant sa documentation, dans leur état au moment de +l'acceptation du Contrat par le Licencié. 
+ +Logiciel Initial: désigne le Logiciel sous sa forme de Code Source et +éventuellement de Code Objet et le cas échéant sa documentation, dans +leur état au moment de leur première diffusion sous les termes du Contrat. + +Logiciel Modifié: désigne le Logiciel modifié par au moins une +Contribution. + +Code Source: désigne l'ensemble des instructions et des lignes de +programme du Logiciel et auquel l'accès est nécessaire en vue de +modifier le Logiciel. + +Code Objet: désigne les fichiers binaires issus de la compilation du +Code Source. + +Titulaire: désigne le ou les détenteurs des droits patrimoniaux d'auteur +sur le Logiciel Initial. + +Licencié: désigne le ou les utilisateurs du Logiciel ayant accepté le +Contrat. + +Contributeur: désigne le Licencié auteur d'au moins une Contribution. + +Concédant: désigne le Titulaire ou toute personne physique ou morale +distribuant le Logiciel sous le Contrat. + +Contribution: désigne l'ensemble des modifications, corrections, +traductions, adaptations et/ou nouvelles fonctionnalités intégrées dans +le Logiciel par tout Contributeur, ainsi que tout Module Interne. + +Module: désigne un ensemble de fichiers sources y compris leur +documentation qui permet de réaliser des fonctionnalités ou services +supplémentaires à ceux fournis par le Logiciel. + +Module Externe: désigne tout Module, non dérivé du Logiciel, tel que ce +Module et le Logiciel s'exécutent dans des espaces d'adressage +différents, l'un appelant l'autre au moment de leur exécution. + +Module Interne: désigne tout Module lié au Logiciel de telle sorte +qu'ils s'exécutent dans le même espace d'adressage. + +GNU GPL: désigne la GNU General Public License dans sa version 2 ou +toute version ultérieure, telle que publiée par Free Software Foundation +Inc. + +Parties: désigne collectivement le Licencié et le Concédant. + +Ces termes s'entendent au singulier comme au pluriel. 
+ + + Article 2 - OBJET + +Le Contrat a pour objet la concession par le Concédant au Licencié d'une +licence non exclusive, cessible et mondiale du Logiciel telle que +définie ci-après à l'article 5 pour toute la durée de protection des droits +portant sur ce Logiciel. + + + Article 3 - ACCEPTATION + +3.1 L'acceptation par le Licencié des termes du Contrat est réputée +acquise du fait du premier des faits suivants: + + * (i) le chargement du Logiciel par tout moyen notamment par + téléchargement à partir d'un serveur distant ou par chargement à + partir d'un support physique; + * (ii) le premier exercice par le Licencié de l'un quelconque des + droits concédés par le Contrat. + +3.2 Un exemplaire du Contrat, contenant notamment un avertissement +relatif aux spécificités du Logiciel, à la restriction de garantie et à +la limitation à un usage par des utilisateurs expérimentés a été mis à +disposition du Licencié préalablement à son acceptation telle que +définie à l'article 3.1 ci dessus et le Licencié reconnaît en avoir pris +connaissance. + + + Article 4 - ENTREE EN VIGUEUR ET DUREE + + + 4.1 ENTREE EN VIGUEUR + +Le Contrat entre en vigueur à la date de son acceptation par le Licencié +telle que définie en 3.1. + + + 4.2 DUREE + +Le Contrat produira ses effets pendant toute la durée légale de +protection des droits patrimoniaux portant sur le Logiciel. + + + Article 5 - ETENDUE DES DROITS CONCEDES + +Le Concédant concède au Licencié, qui accepte, les droits suivants sur +le Logiciel pour toutes destinations et pour la durée du Contrat dans +les conditions ci-après détaillées. + +Par ailleurs, si le Concédant détient ou venait à détenir un ou +plusieurs brevets d'invention protégeant tout ou partie des +fonctionnalités du Logiciel ou de ses composants, il s'engage à ne pas +opposer les éventuels droits conférés par ces brevets aux Licenciés +successifs qui utiliseraient, exploiteraient ou modifieraient le +Logiciel. 
En cas de cession de ces brevets, le Concédant s'engage à +faire reprendre les obligations du présent alinéa aux cessionnaires. + + + 5.1 DROIT D'UTILISATION + +Le Licencié est autorisé à utiliser le Logiciel, sans restriction quant +aux domaines d'application, étant ci-après précisé que cela comporte: + + 1. la reproduction permanente ou provisoire du Logiciel en tout ou + partie par tout moyen et sous toute forme. + + 2. le chargement, l'affichage, l'exécution, ou le stockage du + Logiciel sur tout support. + + 3. la possibilité d'en observer, d'en étudier, ou d'en tester le + fonctionnement afin de déterminer les idées et principes qui sont + à la base de n'importe quel élément de ce Logiciel; et ceci, + lorsque le Licencié effectue toute opération de chargement, + d'affichage, d'exécution, de transmission ou de stockage du + Logiciel qu'il est en droit d'effectuer en vertu du Contrat. + + + 5.2 DROIT D'APPORTER DES CONTRIBUTIONS + +Le droit d'apporter des Contributions comporte le droit de traduire, +d'adapter, d'arranger ou d'apporter toute autre modification au Logiciel +et le droit de reproduire le logiciel en résultant. + +Le Licencié est autorisé à apporter toute Contribution au Logiciel sous +réserve de mentionner, de façon explicite, son nom en tant qu'auteur de +cette Contribution et la date de création de celle-ci. + + + 5.3 DROIT DE DISTRIBUTION + +Le droit de distribution comporte notamment le droit de diffuser, de +transmettre et de communiquer le Logiciel au public sur tout support et +par tout moyen ainsi que le droit de mettre sur le marché à titre +onéreux ou gratuit, un ou des exemplaires du Logiciel par tout procédé. + +Le Licencié est autorisé à distribuer des copies du Logiciel, modifié ou +non, à des tiers dans les conditions ci-après détaillées. 
+ + + 5.3.1 DISTRIBUTION DU LOGICIEL SANS MODIFICATION + +Le Licencié est autorisé à distribuer des copies conformes du Logiciel, +sous forme de Code Source ou de Code Objet, à condition que cette +distribution respecte les dispositions du Contrat dans leur totalité et +soit accompagnée: + + 1. d'un exemplaire du Contrat, + + 2. d'un avertissement relatif à la restriction de garantie et de + responsabilité du Concédant telle que prévue aux articles 8 + et 9, + +et que, dans le cas où seul le Code Objet du Logiciel est redistribué, +le Licencié permette aux futurs Licenciés d'accéder facilement au Code +Source complet du Logiciel en indiquant les modalités d'accès, étant +entendu que le coût additionnel d'acquisition du Code Source ne devra +pas excéder le simple coût de transfert des données. + + + 5.3.2 DISTRIBUTION DU LOGICIEL MODIFIE + +Lorsque le Licencié apporte une Contribution au Logiciel, les conditions +de distribution du Logiciel Modifié en résultant sont alors soumises à +l'intégralité des dispositions du Contrat. + +Le Licencié est autorisé à distribuer le Logiciel Modifié, sous forme de +code source ou de code objet, à condition que cette distribution +respecte les dispositions du Contrat dans leur totalité et soit +accompagnée: + + 1. d'un exemplaire du Contrat, + + 2. d'un avertissement relatif à la restriction de garantie et de + responsabilité du Concédant telle que prévue aux articles 8 + et 9, + +et que, dans le cas où seul le code objet du Logiciel Modifié est +redistribué, le Licencié permette aux futurs Licenciés d'accéder +facilement au code source complet du Logiciel Modifié en indiquant les +modalités d'accès, étant entendu que le coût additionnel d'acquisition +du code source ne devra pas excéder le simple coût de transfert des données. 
+ + + 5.3.3 DISTRIBUTION DES MODULES EXTERNES + +Lorsque le Licencié a développé un Module Externe les conditions du +Contrat ne s'appliquent pas à ce Module Externe, qui peut être distribué +sous un contrat de licence différent. + + + 5.3.4 COMPATIBILITE AVEC LA LICENCE GNU GPL + +Le Licencié peut inclure un code soumis aux dispositions d'une des +versions de la licence GNU GPL dans le Logiciel modifié ou non et +distribuer l'ensemble sous les conditions de la même version de la +licence GNU GPL. + +Le Licencié peut inclure le Logiciel modifié ou non dans un code soumis +aux dispositions d'une des versions de la licence GNU GPL et distribuer +l'ensemble sous les conditions de la même version de la licence GNU GPL. + + + Article 6 - PROPRIETE INTELLECTUELLE + + + 6.1 SUR LE LOGICIEL INITIAL + +Le Titulaire est détenteur des droits patrimoniaux sur le Logiciel +Initial. Toute utilisation du Logiciel Initial est soumise au respect +des conditions dans lesquelles le Titulaire a choisi de diffuser son +oeuvre et nul autre n'a la faculté de modifier les conditions de +diffusion de ce Logiciel Initial. + +Le Titulaire s'engage à ce que le Logiciel Initial reste au moins régi +par le Contrat et ce, pour la durée visée à l'article 4.2. + + + 6.2 SUR LES CONTRIBUTIONS + +Le Licencié qui a développé une Contribution est titulaire sur celle-ci +des droits de propriété intellectuelle dans les conditions définies par +la législation applicable. + + + 6.3 SUR LES MODULES EXTERNES + +Le Licencié qui a développé un Module Externe est titulaire sur celui-ci +des droits de propriété intellectuelle dans les conditions définies par +la législation applicable et reste libre du choix du contrat régissant +sa diffusion. + + + 6.4 DISPOSITIONS COMMUNES + +Le Licencié s'engage expressément: + + 1. à ne pas supprimer ou modifier de quelque manière que ce soit les + mentions de propriété intellectuelle apposées sur le Logiciel; + + 2. 
à reproduire à l'identique lesdites mentions de propriété + intellectuelle sur les copies du Logiciel modifié ou non. + +Le Licencié s'engage à ne pas porter atteinte, directement ou +indirectement, aux droits de propriété intellectuelle du Titulaire et/ou +des Contributeurs sur le Logiciel et à prendre, le cas échéant, à +l'égard de son personnel toutes les mesures nécessaires pour assurer le +respect des dits droits de propriété intellectuelle du Titulaire et/ou +des Contributeurs. + + + Article 7 - SERVICES ASSOCIES + +7.1 Le Contrat n'oblige en aucun cas le Concédant à la réalisation de +prestations d'assistance technique ou de maintenance du Logiciel. + +Cependant le Concédant reste libre de proposer ce type de services. Les +termes et conditions d'une telle assistance technique et/ou d'une telle +maintenance seront alors déterminés dans un acte séparé. Ces actes de +maintenance et/ou assistance technique n'engageront que la seule +responsabilité du Concédant qui les propose. + +7.2 De même, tout Concédant est libre de proposer, sous sa seule +responsabilité, à ses licenciés une garantie, qui n'engagera que lui, +lors de la redistribution du Logiciel et/ou du Logiciel Modifié et ce, +dans les conditions qu'il souhaite. Cette garantie et les modalités +financières de son application feront l'objet d'un acte séparé entre le +Concédant et le Licencié. + + + Article 8 - RESPONSABILITE + +8.1 Sous réserve des dispositions de l'article 8.2, le Licencié a la +faculté, sous réserve de prouver la faute du Concédant concerné, de +solliciter la réparation du préjudice direct qu'il subirait du fait du +Logiciel et dont il apportera la preuve. 
+ +8.2 La responsabilité du Concédant est limitée aux engagements pris en +application du Contrat et ne saurait être engagée en raison notamment: +(i) des dommages dus à l'inexécution, totale ou partielle, de ses +obligations par le Licencié, (ii) des dommages directs ou indirects +découlant de l'utilisation ou des performances du Logiciel subis par le +Licencié et (iii) plus généralement d'un quelconque dommage indirect. En +particulier, les Parties conviennent expressément que tout préjudice +financier ou commercial (par exemple perte de données, perte de +bénéfices, perte d'exploitation, perte de clientèle ou de commandes, +manque à gagner, trouble commercial quelconque) ou toute action dirigée +contre le Licencié par un tiers, constitue un dommage indirect et +n'ouvre pas droit à réparation par le Concédant. + + + Article 9 - GARANTIE + +9.1 Le Licencié reconnaît que l'état actuel des connaissances +scientifiques et techniques au moment de la mise en circulation du +Logiciel ne permet pas d'en tester et d'en vérifier toutes les +utilisations ni de détecter l'existence d'éventuels défauts. L'attention +du Licencié a été attirée sur ce point sur les risques associés au +chargement, à l'utilisation, la modification et/ou au développement et à +la reproduction du Logiciel qui sont réservés à des utilisateurs avertis. + +Il relève de la responsabilité du Licencié de contrôler, par tous +moyens, l'adéquation du produit à ses besoins, son bon fonctionnement et +de s'assurer qu'il ne causera pas de dommages aux personnes et aux biens. + +9.2 Le Concédant déclare de bonne foi être en droit de concéder +l'ensemble des droits attachés au Logiciel (comprenant notamment les +droits visés à l'article 5). + +9.3 Le Licencié reconnaît que le Logiciel est fourni "en l'état" par le +Concédant sans autre garantie, expresse ou tacite, que celle prévue à +l'article 9.2 et notamment sans aucune garantie sur sa valeur commerciale, +son caractère sécurisé, innovant ou pertinent. 
+ +En particulier, le Concédant ne garantit pas que le Logiciel est exempt +d'erreur, qu'il fonctionnera sans interruption, qu'il sera compatible +avec l'équipement du Licencié et sa configuration logicielle ni qu'il +remplira les besoins du Licencié. + +9.4 Le Concédant ne garantit pas, de manière expresse ou tacite, que le +Logiciel ne porte pas atteinte à un quelconque droit de propriété +intellectuelle d'un tiers portant sur un brevet, un logiciel ou sur tout +autre droit de propriété. Ainsi, le Concédant exclut toute garantie au +profit du Licencié contre les actions en contrefaçon qui pourraient être +diligentées au titre de l'utilisation, de la modification, et de la +redistribution du Logiciel. Néanmoins, si de telles actions sont +exercées contre le Licencié, le Concédant lui apportera son aide +technique et juridique pour sa défense. Cette aide technique et +juridique est déterminée au cas par cas entre le Concédant concerné et +le Licencié dans le cadre d'un protocole d'accord. Le Concédant dégage +toute responsabilité quant à l'utilisation de la dénomination du +Logiciel par le Licencié. Aucune garantie n'est apportée quant à +l'existence de droits antérieurs sur le nom du Logiciel et sur +l'existence d'une marque. + + + Article 10 - RESILIATION + +10.1 En cas de manquement par le Licencié aux obligations mises à sa +charge par le Contrat, le Concédant pourra résilier de plein droit le +Contrat trente (30) jours après notification adressée au Licencié et +restée sans effet. + +10.2 Le Licencié dont le Contrat est résilié n'est plus autorisé à +utiliser, modifier ou distribuer le Logiciel. Cependant, toutes les +licences qu'il aura concédées antérieurement à la résiliation du Contrat +resteront valides sous réserve qu'elles aient été effectuées en +conformité avec le Contrat. 
+ + + Article 11 - DISPOSITIONS DIVERSES + + + 11.1 CAUSE EXTERIEURE + +Aucune des Parties ne sera responsable d'un retard ou d'une défaillance +d'exécution du Contrat qui serait dû à un cas de force majeure, un cas +fortuit ou une cause extérieure, telle que, notamment, le mauvais +fonctionnement ou les interruptions du réseau électrique ou de +télécommunication, la paralysie du réseau liée à une attaque +informatique, l'intervention des autorités gouvernementales, les +catastrophes naturelles, les dégâts des eaux, les tremblements de terre, +le feu, les explosions, les grèves et les conflits sociaux, l'état de +guerre... + +11.2 Le fait, par l'une ou l'autre des Parties, d'omettre en une ou +plusieurs occasions de se prévaloir d'une ou plusieurs dispositions du +Contrat, ne pourra en aucun cas impliquer renonciation par la Partie +intéressée à s'en prévaloir ultérieurement. + +11.3 Le Contrat annule et remplace toute convention antérieure, écrite +ou orale, entre les Parties sur le même objet et constitue l'accord +entier entre les Parties sur cet objet. Aucune addition ou modification +aux termes du Contrat n'aura d'effet à l'égard des Parties à moins +d'être faite par écrit et signée par leurs représentants dûment habilités. + +11.4 Dans l'hypothèse où une ou plusieurs des dispositions du Contrat +s'avèrerait contraire à une loi ou à un texte applicable, existants ou +futurs, cette loi ou ce texte prévaudrait, et les Parties feraient les +amendements nécessaires pour se conformer à cette loi ou à ce texte. +Toutes les autres dispositions resteront en vigueur. De même, la +nullité, pour quelque raison que ce soit, d'une des dispositions du +Contrat ne saurait entraîner la nullité de l'ensemble du Contrat. + + + 11.5 LANGUE + +Le Contrat est rédigé en langue française et en langue anglaise, ces +deux versions faisant également foi. 
+ + + Article 12 - NOUVELLES VERSIONS DU CONTRAT + +12.1 Toute personne est autorisée à copier et distribuer des copies de +ce Contrat. + +12.2 Afin d'en préserver la cohérence, le texte du Contrat est protégé +et ne peut être modifié que par les auteurs de la licence, lesquels se +réservent le droit de publier périodiquement des mises à jour ou de +nouvelles versions du Contrat, qui posséderont chacune un numéro +distinct. Ces versions ultérieures seront susceptibles de prendre en +compte de nouvelles problématiques rencontrées par les logiciels libres. + +12.3 Tout Logiciel diffusé sous une version donnée du Contrat ne pourra +faire l'objet d'une diffusion ultérieure que sous la même version du +Contrat ou une version postérieure, sous réserve des dispositions de +l'article 5.3.4. + + + Article 13 - LOI APPLICABLE ET COMPETENCE TERRITORIALE + +13.1 Le Contrat est régi par la loi française. Les Parties conviennent +de tenter de régler à l'amiable les différends ou litiges qui +viendraient à se produire par suite ou à l'occasion du Contrat. + +13.2 A défaut d'accord amiable dans un délai de deux (2) mois à compter +de leur survenance et sauf situation relevant d'une procédure d'urgence, +les différends ou litiges seront portés par la Partie la plus diligente +devant les Tribunaux compétents de Paris. + + +Version 2.0 du 2006-09-05. 
diff --git a/VERSION b/VERSION new file mode 100644 index 0000000..d3827e7 --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +1.0 diff --git a/ahoc_metazoas.gv b/ahoc_metazoas.gv new file mode 100644 index 0000000..fdf1e81 --- /dev/null +++ b/ahoc_metazoas.gv @@ -0,0 +1,1389 @@ +graph primerlinks { + 21 -- 150 [label="116: 0.26: 0.17"]; + 21 -- 151 [label="116: 0.26: 0.17"]; + 21 -- 152 [label="117: 0.26: 0.17"]; + 21 -- 153 [label="116: 0.26: 0.17"]; + 21 -- 154 [label="116: 0.26: 0.17"]; + 21 -- 155 [label="116: 0.26: 0.17"]; + 21 -- 163 [label="107: 0.26: 0.18"]; + 21 -- 164 [label="102: 0.26: 0.17"]; + 21 -- 190 [label="74: 0.26: 0.15"]; + 32 -- 150 [label="111: 0.25: 0.17"]; + 32 -- 151 [label="111: 0.25: 0.17"]; + 32 -- 152 [label="112: 0.25: 0.17"]; + 32 -- 153 [label="111: 0.25: 0.17"]; + 32 -- 154 [label="111: 0.25: 0.17"]; + 32 -- 155 [label="111: 0.25: 0.17"]; + 32 -- 163 [label="102: 0.25: 0.18"]; + 32 -- 164 [label="97: 0.25: 0.17"]; + 32 -- 169 [label="110: 0.25: 0.19"]; + 32 -- 170 [label="110: 0.25: 0.20"]; + 32 -- 173 [label="109: 0.25: 0.20"]; + 32 -- 181 [label="103: 0.25: 0.15"]; + 32 -- 190 [label="72: 0.25: 0.15"]; + 80 -- 150 [label="118: 0.20: 0.17"]; + 80 -- 151 [label="118: 0.20: 0.17"]; + 80 -- 152 [label="119: 0.20: 0.17"]; + 80 -- 153 [label="118: 0.20: 0.17"]; + 80 -- 154 [label="118: 0.20: 0.17"]; + 80 -- 155 [label="118: 0.20: 0.17"]; + 80 -- 163 [label="105: 0.20: 0.18"]; + 80 -- 164 [label="102: 0.20: 0.17"]; + 80 -- 190 [label="74: 0.20: 0.15"]; + 88 -- 144 [label="125: 0.21: 0.21"]; + 88 -- 150 [label="104: 0.21: 0.17"]; + 88 -- 151 [label="104: 0.21: 0.17"]; + 88 -- 152 [label="105: 0.21: 0.17"]; + 88 -- 153 [label="104: 0.21: 0.17"]; + 88 -- 154 [label="104: 0.21: 0.17"]; + 88 -- 155 [label="104: 0.21: 0.17"]; + 88 -- 163 [label="93: 0.21: 0.18"]; + 88 -- 164 [label="90: 0.21: 0.17"]; + 88 -- 169 [label="96: 0.21: 0.19"]; + 88 -- 170 [label="96: 0.21: 0.20"]; + 88 -- 172 [label="98: 0.21: 0.20"]; + 88 -- 173 [label="95: 0.21: 0.20"]; + 
88 -- 174 [label="98: 0.21: 0.20"]; + 88 -- 176 [label="97: 0.21: 0.19"]; + 88 -- 177 [label="96: 0.21: 0.19"]; + 88 -- 178 [label="97: 0.21: 0.19"]; + 88 -- 179 [label="95: 0.21: 0.19"]; + 88 -- 181 [label="97: 0.21: 0.15"]; + 88 -- 183 [label="96: 0.21: 0.17"]; + 88 -- 184 [label="96: 0.21: 0.17"]; + 88 -- 185 [label="96: 0.21: 0.17"]; + 88 -- 186 [label="96: 0.21: 0.17"]; + 88 -- 190 [label="67: 0.21: 0.15"]; + 88 -- 191 [label="81: 0.21: 0.15"]; + 91 -- 150 [label="116: 0.19: 0.17"]; + 91 -- 151 [label="116: 0.19: 0.17"]; + 91 -- 152 [label="117: 0.19: 0.17"]; + 91 -- 153 [label="116: 0.19: 0.17"]; + 91 -- 154 [label="116: 0.19: 0.17"]; + 91 -- 155 [label="116: 0.19: 0.17"]; + 91 -- 163 [label="101: 0.19: 0.18"]; + 91 -- 164 [label="99: 0.19: 0.17"]; + 91 -- 169 [label="110: 0.19: 0.19"]; + 91 -- 170 [label="110: 0.19: 0.20"]; + 91 -- 173 [label="109: 0.19: 0.20"]; + 91 -- 190 [label="72: 0.19: 0.15"]; + 92 -- 94 [label="147: 0.19: 0.19"]; + 92 -- 95 [label="145: 0.19: 0.19"]; + 92 -- 144 [label="124: 0.19: 0.21"]; + 92 -- 150 [label="96: 0.19: 0.17"]; + 92 -- 151 [label="96: 0.19: 0.17"]; + 92 -- 152 [label="96: 0.19: 0.17"]; + 92 -- 153 [label="96: 0.19: 0.17"]; + 92 -- 154 [label="96: 0.19: 0.17"]; + 92 -- 155 [label="95: 0.19: 0.17"]; + 92 -- 163 [label="82: 0.19: 0.18"]; + 92 -- 164 [label="79: 0.19: 0.17"]; + 92 -- 169 [label="94: 0.19: 0.19"]; + 92 -- 170 [label="94: 0.19: 0.20"]; + 92 -- 172 [label="95: 0.19: 0.20"]; + 92 -- 173 [label="93: 0.19: 0.20"]; + 92 -- 174 [label="95: 0.19: 0.20"]; + 92 -- 176 [label="94: 0.19: 0.19"]; + 92 -- 177 [label="94: 0.19: 0.19"]; + 92 -- 178 [label="94: 0.19: 0.19"]; + 92 -- 179 [label="93: 0.19: 0.19"]; + 92 -- 181 [label="89: 0.19: 0.15"]; + 92 -- 183 [label="93: 0.19: 0.17"]; + 92 -- 184 [label="94: 0.19: 0.17"]; + 92 -- 185 [label="94: 0.19: 0.17"]; + 92 -- 186 [label="94: 0.19: 0.17"]; + 92 -- 188 [label="83: 0.19: 0.15"]; + 92 -- 190 [label="59: 0.19: 0.15"]; + 92 -- 191 [label="72: 0.19: 0.15"]; + 93 -- 131 
[label="126: 0.19: 0.22"]; + 93 -- 167 [label="105: 0.19: 0.20"]; + 93 -- 175 [label="102: 0.19: 0.20"]; + 93 -- 187 [label="89: 0.19: 0.15"]; + 93 -- 189 [label="86: 0.19: 0.16"]; + 94 -- 131 [label="127: 0.19: 0.22"]; + 94 -- 167 [label="106: 0.19: 0.20"]; + 94 -- 175 [label="105: 0.19: 0.20"]; + 94 -- 190 [label="86: 0.19: 0.15"]; + 95 -- 131 [label="126: 0.19: 0.22"]; + 95 -- 167 [label="106: 0.19: 0.20"]; + 95 -- 175 [label="105: 0.19: 0.20"]; + 95 -- 190 [label="87: 0.19: 0.15"]; + 96 -- 150 [label="110: 0.20: 0.17"]; + 96 -- 151 [label="110: 0.20: 0.17"]; + 96 -- 152 [label="111: 0.20: 0.17"]; + 96 -- 153 [label="110: 0.20: 0.17"]; + 96 -- 154 [label="110: 0.20: 0.17"]; + 96 -- 155 [label="110: 0.20: 0.17"]; + 96 -- 163 [label="105: 0.20: 0.18"]; + 96 -- 164 [label="101: 0.20: 0.17"]; + 96 -- 169 [label="110: 0.20: 0.19"]; + 96 -- 170 [label="110: 0.20: 0.20"]; + 96 -- 173 [label="109: 0.20: 0.20"]; + 96 -- 190 [label="68: 0.20: 0.15"]; + 97 -- 150 [label="115: 0.19: 0.17"]; + 97 -- 151 [label="115: 0.19: 0.17"]; + 97 -- 152 [label="116: 0.19: 0.17"]; + 97 -- 153 [label="115: 0.19: 0.17"]; + 97 -- 154 [label="115: 0.19: 0.17"]; + 97 -- 155 [label="115: 0.19: 0.17"]; + 97 -- 163 [label="102: 0.19: 0.18"]; + 97 -- 164 [label="100: 0.19: 0.17"]; + 97 -- 169 [label="109: 0.19: 0.19"]; + 97 -- 170 [label="109: 0.19: 0.20"]; + 97 -- 172 [label="110: 0.19: 0.20"]; + 97 -- 173 [label="108: 0.19: 0.20"]; + 97 -- 174 [label="110: 0.19: 0.20"]; + 97 -- 176 [label="109: 0.19: 0.19"]; + 97 -- 177 [label="109: 0.19: 0.19"]; + 97 -- 178 [label="109: 0.19: 0.19"]; + 97 -- 190 [label="71: 0.19: 0.15"]; + 98 -- 150 [label="114: 0.19: 0.17"]; + 98 -- 151 [label="114: 0.19: 0.17"]; + 98 -- 152 [label="115: 0.19: 0.17"]; + 98 -- 153 [label="114: 0.19: 0.17"]; + 98 -- 154 [label="114: 0.19: 0.17"]; + 98 -- 155 [label="114: 0.19: 0.17"]; + 98 -- 163 [label="101: 0.19: 0.18"]; + 98 -- 164 [label="99: 0.19: 0.17"]; + 98 -- 169 [label="109: 0.19: 0.19"]; + 98 -- 170 [label="109: 
0.19: 0.20"]; + 98 -- 172 [label="110: 0.19: 0.20"]; + 98 -- 173 [label="108: 0.19: 0.20"]; + 98 -- 174 [label="110: 0.19: 0.20"]; + 98 -- 176 [label="109: 0.19: 0.19"]; + 98 -- 177 [label="109: 0.19: 0.19"]; + 98 -- 178 [label="109: 0.19: 0.19"]; + 98 -- 190 [label="70: 0.19: 0.15"]; + 99 -- 150 [label="109: 0.20: 0.17"]; + 99 -- 151 [label="109: 0.20: 0.17"]; + 99 -- 152 [label="110: 0.20: 0.17"]; + 99 -- 153 [label="109: 0.20: 0.17"]; + 99 -- 154 [label="109: 0.20: 0.17"]; + 99 -- 155 [label="109: 0.20: 0.17"]; + 99 -- 163 [label="106: 0.20: 0.18"]; + 99 -- 164 [label="102: 0.20: 0.17"]; + 99 -- 190 [label="67: 0.20: 0.15"]; + 100 -- 150 [label="118: 0.19: 0.17"]; + 100 -- 151 [label="118: 0.19: 0.17"]; + 100 -- 152 [label="119: 0.19: 0.17"]; + 100 -- 153 [label="118: 0.19: 0.17"]; + 100 -- 154 [label="118: 0.19: 0.17"]; + 100 -- 155 [label="118: 0.19: 0.17"]; + 100 -- 163 [label="109: 0.19: 0.18"]; + 100 -- 164 [label="106: 0.19: 0.17"]; + 100 -- 190 [label="74: 0.19: 0.15"]; + 101 -- 150 [label="107: 0.20: 0.17"]; + 101 -- 151 [label="107: 0.20: 0.17"]; + 101 -- 152 [label="108: 0.20: 0.17"]; + 101 -- 153 [label="107: 0.20: 0.17"]; + 101 -- 154 [label="107: 0.20: 0.17"]; + 101 -- 155 [label="107: 0.20: 0.17"]; + 101 -- 163 [label="100: 0.20: 0.18"]; + 101 -- 164 [label="97: 0.20: 0.17"]; + 101 -- 169 [label="111: 0.20: 0.19"]; + 101 -- 170 [label="111: 0.20: 0.20"]; + 101 -- 173 [label="110: 0.20: 0.20"]; + 101 -- 190 [label="65: 0.20: 0.15"]; + 102 -- 150 [label="115: 0.18: 0.17"]; + 102 -- 151 [label="115: 0.18: 0.17"]; + 102 -- 152 [label="116: 0.18: 0.17"]; + 102 -- 153 [label="115: 0.18: 0.17"]; + 102 -- 154 [label="115: 0.18: 0.17"]; + 102 -- 155 [label="115: 0.18: 0.17"]; + 102 -- 163 [label="107: 0.18: 0.18"]; + 102 -- 164 [label="104: 0.18: 0.17"]; + 102 -- 190 [label="71: 0.18: 0.15"]; + 103 -- 150 [label="107: 0.20: 0.17"]; + 103 -- 151 [label="107: 0.20: 0.17"]; + 103 -- 152 [label="108: 0.20: 0.17"]; + 103 -- 153 [label="107: 0.20: 0.17"]; + 103 
-- 154 [label="107: 0.20: 0.17"]; + 103 -- 155 [label="107: 0.20: 0.17"]; + 103 -- 163 [label="102: 0.20: 0.18"]; + 103 -- 164 [label="99: 0.20: 0.17"]; + 103 -- 190 [label="65: 0.20: 0.15"]; + 104 -- 150 [label="107: 0.20: 0.17"]; + 104 -- 151 [label="107: 0.20: 0.17"]; + 104 -- 152 [label="108: 0.20: 0.17"]; + 104 -- 153 [label="107: 0.20: 0.17"]; + 104 -- 154 [label="107: 0.20: 0.17"]; + 104 -- 155 [label="107: 0.20: 0.17"]; + 104 -- 163 [label="101: 0.20: 0.18"]; + 104 -- 164 [label="98: 0.20: 0.17"]; + 104 -- 190 [label="65: 0.20: 0.15"]; + 105 -- 150 [label="108: 0.19: 0.17"]; + 105 -- 151 [label="108: 0.19: 0.17"]; + 105 -- 152 [label="109: 0.19: 0.17"]; + 105 -- 153 [label="108: 0.19: 0.17"]; + 105 -- 154 [label="108: 0.19: 0.17"]; + 105 -- 155 [label="108: 0.19: 0.17"]; + 105 -- 163 [label="102: 0.19: 0.18"]; + 105 -- 164 [label="99: 0.19: 0.17"]; + 105 -- 190 [label="66: 0.19: 0.15"]; + 106 -- 150 [label="107: 0.20: 0.17"]; + 106 -- 151 [label="107: 0.20: 0.17"]; + 106 -- 152 [label="108: 0.20: 0.17"]; + 106 -- 153 [label="107: 0.20: 0.17"]; + 106 -- 154 [label="107: 0.20: 0.17"]; + 106 -- 155 [label="107: 0.20: 0.17"]; + 106 -- 163 [label="99: 0.20: 0.18"]; + 106 -- 164 [label="96: 0.20: 0.17"]; + 106 -- 190 [label="65: 0.20: 0.15"]; + 107 -- 163 [label="115: 0.20: 0.18"]; + 107 -- 164 [label="113: 0.20: 0.17"]; + 107 -- 190 [label="84: 0.20: 0.15"]; + 108 -- 150 [label="115: 0.18: 0.17"]; + 108 -- 151 [label="115: 0.18: 0.17"]; + 108 -- 152 [label="116: 0.18: 0.17"]; + 108 -- 153 [label="115: 0.18: 0.17"]; + 108 -- 154 [label="115: 0.18: 0.17"]; + 108 -- 155 [label="115: 0.18: 0.17"]; + 108 -- 163 [label="103: 0.18: 0.18"]; + 108 -- 164 [label="100: 0.18: 0.17"]; + 108 -- 190 [label="71: 0.18: 0.15"]; + 109 -- 150 [label="106: 0.19: 0.17"]; + 109 -- 151 [label="106: 0.19: 0.17"]; + 109 -- 152 [label="107: 0.19: 0.17"]; + 109 -- 153 [label="106: 0.19: 0.17"]; + 109 -- 154 [label="106: 0.19: 0.17"]; + 109 -- 155 [label="106: 0.19: 0.17"]; + 109 -- 163 
[label="99: 0.19: 0.18"]; + 109 -- 164 [label="96: 0.19: 0.17"]; + 109 -- 169 [label="111: 0.19: 0.19"]; + 109 -- 170 [label="111: 0.19: 0.20"]; + 109 -- 173 [label="110: 0.19: 0.20"]; + 109 -- 190 [label="64: 0.19: 0.15"]; + 110 -- 163 [label="113: 0.21: 0.18"]; + 110 -- 164 [label="111: 0.21: 0.17"]; + 110 -- 190 [label="86: 0.21: 0.15"]; + 111 -- 163 [label="113: 0.21: 0.18"]; + 111 -- 164 [label="111: 0.21: 0.17"]; + 111 -- 190 [label="86: 0.21: 0.15"]; + 112 -- 163 [label="113: 0.21: 0.18"]; + 112 -- 164 [label="111: 0.21: 0.17"]; + 112 -- 190 [label="86: 0.21: 0.15"]; + 113 -- 150 [label="115: 0.18: 0.17"]; + 113 -- 151 [label="115: 0.18: 0.17"]; + 113 -- 152 [label="116: 0.18: 0.17"]; + 113 -- 153 [label="115: 0.18: 0.17"]; + 113 -- 154 [label="115: 0.18: 0.17"]; + 113 -- 155 [label="115: 0.18: 0.17"]; + 113 -- 163 [label="107: 0.18: 0.18"]; + 113 -- 164 [label="104: 0.18: 0.17"]; + 113 -- 190 [label="71: 0.18: 0.15"]; + 114 -- 150 [label="107: 0.19: 0.17"]; + 114 -- 151 [label="107: 0.19: 0.17"]; + 114 -- 152 [label="108: 0.19: 0.17"]; + 114 -- 153 [label="107: 0.19: 0.17"]; + 114 -- 154 [label="107: 0.19: 0.17"]; + 114 -- 155 [label="107: 0.19: 0.17"]; + 114 -- 163 [label="100: 0.19: 0.18"]; + 114 -- 164 [label="97: 0.19: 0.17"]; + 114 -- 169 [label="111: 0.19: 0.19"]; + 114 -- 170 [label="111: 0.19: 0.20"]; + 114 -- 173 [label="110: 0.19: 0.20"]; + 114 -- 190 [label="65: 0.19: 0.15"]; + 115 -- 163 [label="113: 0.20: 0.18"]; + 115 -- 164 [label="111: 0.20: 0.17"]; + 115 -- 190 [label="84: 0.20: 0.15"]; + 116 -- 150 [label="104: 0.19: 0.17"]; + 116 -- 151 [label="104: 0.19: 0.17"]; + 116 -- 152 [label="105: 0.19: 0.17"]; + 116 -- 153 [label="104: 0.19: 0.17"]; + 116 -- 154 [label="104: 0.19: 0.17"]; + 116 -- 155 [label="104: 0.19: 0.17"]; + 116 -- 163 [label="96: 0.19: 0.18"]; + 116 -- 164 [label="94: 0.19: 0.17"]; + 116 -- 169 [label="106: 0.19: 0.19"]; + 116 -- 170 [label="106: 0.19: 0.20"]; + 116 -- 172 [label="107: 0.19: 0.20"]; + 116 -- 173 
[label="105: 0.19: 0.20"]; + 116 -- 174 [label="107: 0.19: 0.20"]; + 116 -- 176 [label="106: 0.19: 0.19"]; + 116 -- 177 [label="106: 0.19: 0.19"]; + 116 -- 178 [label="106: 0.19: 0.19"]; + 116 -- 179 [label="105: 0.19: 0.19"]; + 116 -- 190 [label="62: 0.19: 0.15"]; + 117 -- 150 [label="115: 0.18: 0.17"]; + 117 -- 151 [label="115: 0.18: 0.17"]; + 117 -- 152 [label="116: 0.18: 0.17"]; + 117 -- 153 [label="115: 0.18: 0.17"]; + 117 -- 154 [label="115: 0.18: 0.17"]; + 117 -- 155 [label="115: 0.18: 0.17"]; + 117 -- 163 [label="102: 0.18: 0.18"]; + 117 -- 164 [label="100: 0.18: 0.17"]; + 117 -- 169 [label="108: 0.18: 0.19"]; + 117 -- 170 [label="108: 0.18: 0.20"]; + 117 -- 172 [label="109: 0.18: 0.20"]; + 117 -- 173 [label="107: 0.18: 0.20"]; + 117 -- 174 [label="109: 0.18: 0.20"]; + 117 -- 176 [label="108: 0.18: 0.19"]; + 117 -- 177 [label="108: 0.18: 0.19"]; + 117 -- 178 [label="108: 0.18: 0.19"]; + 117 -- 179 [label="107: 0.18: 0.19"]; + 117 -- 190 [label="71: 0.18: 0.15"]; + 118 -- 150 [label="107: 0.17: 0.17"]; + 118 -- 151 [label="107: 0.17: 0.17"]; + 118 -- 152 [label="108: 0.17: 0.17"]; + 118 -- 153 [label="107: 0.17: 0.17"]; + 118 -- 154 [label="107: 0.17: 0.17"]; + 118 -- 155 [label="107: 0.17: 0.17"]; + 118 -- 163 [label="95: 0.17: 0.18"]; + 118 -- 164 [label="93: 0.17: 0.17"]; + 118 -- 169 [label="102: 0.17: 0.19"]; + 118 -- 170 [label="102: 0.17: 0.20"]; + 118 -- 172 [label="103: 0.17: 0.20"]; + 118 -- 173 [label="101: 0.17: 0.20"]; + 118 -- 174 [label="103: 0.17: 0.20"]; + 118 -- 176 [label="102: 0.17: 0.19"]; + 118 -- 177 [label="102: 0.17: 0.19"]; + 118 -- 178 [label="102: 0.17: 0.19"]; + 118 -- 179 [label="101: 0.17: 0.19"]; + 118 -- 190 [label="66: 0.17: 0.15"]; + 119 -- 131 [label="129: 0.20: 0.22"]; + 119 -- 163 [label="113: 0.20: 0.18"]; + 119 -- 164 [label="111: 0.20: 0.17"]; + 119 -- 167 [label="112: 0.20: 0.20"]; + 119 -- 190 [label="84: 0.20: 0.15"]; + 120 -- 131 [label="123: 0.19: 0.22"]; + 120 -- 150 [label="122: 0.19: 0.17"]; + 120 -- 151 
[label="122: 0.19: 0.17"]; + 120 -- 152 [label="122: 0.19: 0.17"]; + 120 -- 153 [label="122: 0.19: 0.17"]; + 120 -- 154 [label="122: 0.19: 0.17"]; + 120 -- 155 [label="122: 0.19: 0.17"]; + 120 -- 167 [label="102: 0.19: 0.20"]; + 120 -- 175 [label="101: 0.19: 0.20"]; + 120 -- 190 [label="76: 0.19: 0.15"]; + 121 -- 131 [label="127: 0.17: 0.22"]; + 121 -- 144 [label="116: 0.17: 0.21"]; + 121 -- 150 [label="92: 0.17: 0.17"]; + 121 -- 151 [label="92: 0.17: 0.17"]; + 121 -- 152 [label="92: 0.17: 0.17"]; + 121 -- 153 [label="92: 0.17: 0.17"]; + 121 -- 154 [label="92: 0.17: 0.17"]; + 121 -- 155 [label="91: 0.17: 0.17"]; + 121 -- 163 [label="78: 0.17: 0.18"]; + 121 -- 164 [label="75: 0.17: 0.17"]; + 121 -- 169 [label="85: 0.17: 0.19"]; + 121 -- 170 [label="85: 0.17: 0.20"]; + 121 -- 172 [label="87: 0.17: 0.20"]; + 121 -- 173 [label="84: 0.17: 0.20"]; + 121 -- 174 [label="87: 0.17: 0.20"]; + 121 -- 176 [label="86: 0.17: 0.19"]; + 121 -- 177 [label="86: 0.17: 0.19"]; + 121 -- 178 [label="86: 0.17: 0.19"]; + 121 -- 179 [label="85: 0.17: 0.19"]; + 121 -- 181 [label="85: 0.17: 0.15"]; + 121 -- 183 [label="84: 0.17: 0.17"]; + 121 -- 184 [label="85: 0.17: 0.17"]; + 121 -- 185 [label="85: 0.17: 0.17"]; + 121 -- 186 [label="86: 0.17: 0.17"]; + 121 -- 188 [label="79: 0.17: 0.15"]; + 121 -- 190 [label="57: 0.17: 0.15"]; + 121 -- 191 [label="66: 0.17: 0.15"]; + 122 -- 150 [label="105: 0.19: 0.17"]; + 122 -- 151 [label="105: 0.19: 0.17"]; + 122 -- 152 [label="106: 0.19: 0.17"]; + 122 -- 153 [label="105: 0.19: 0.17"]; + 122 -- 154 [label="105: 0.19: 0.17"]; + 122 -- 155 [label="105: 0.19: 0.17"]; + 122 -- 163 [label="97: 0.19: 0.18"]; + 122 -- 164 [label="95: 0.19: 0.17"]; + 122 -- 169 [label="107: 0.19: 0.19"]; + 122 -- 170 [label="107: 0.19: 0.20"]; + 122 -- 172 [label="108: 0.19: 0.20"]; + 122 -- 173 [label="106: 0.19: 0.20"]; + 122 -- 174 [label="108: 0.19: 0.20"]; + 122 -- 176 [label="107: 0.19: 0.19"]; + 122 -- 177 [label="107: 0.19: 0.19"]; + 122 -- 178 [label="107: 0.19: 0.19"]; 
+ 122 -- 179 [label="106: 0.19: 0.19"]; + 122 -- 190 [label="63: 0.19: 0.15"]; + 123 -- 150 [label="105: 0.19: 0.17"]; + 123 -- 151 [label="105: 0.19: 0.17"]; + 123 -- 152 [label="106: 0.19: 0.17"]; + 123 -- 153 [label="105: 0.19: 0.17"]; + 123 -- 154 [label="105: 0.19: 0.17"]; + 123 -- 155 [label="105: 0.19: 0.17"]; + 123 -- 163 [label="97: 0.19: 0.18"]; + 123 -- 164 [label="95: 0.19: 0.17"]; + 123 -- 169 [label="107: 0.19: 0.19"]; + 123 -- 170 [label="107: 0.19: 0.20"]; + 123 -- 172 [label="108: 0.19: 0.20"]; + 123 -- 173 [label="106: 0.19: 0.20"]; + 123 -- 174 [label="108: 0.19: 0.20"]; + 123 -- 176 [label="107: 0.19: 0.19"]; + 123 -- 177 [label="107: 0.19: 0.19"]; + 123 -- 178 [label="107: 0.19: 0.19"]; + 123 -- 179 [label="106: 0.19: 0.19"]; + 123 -- 190 [label="63: 0.19: 0.15"]; + 124 -- 150 [label="106: 0.19: 0.17"]; + 124 -- 151 [label="106: 0.19: 0.17"]; + 124 -- 152 [label="107: 0.19: 0.17"]; + 124 -- 153 [label="106: 0.19: 0.17"]; + 124 -- 154 [label="106: 0.19: 0.17"]; + 124 -- 155 [label="106: 0.19: 0.17"]; + 124 -- 163 [label="102: 0.19: 0.18"]; + 124 -- 164 [label="99: 0.19: 0.17"]; + 124 -- 169 [label="110: 0.19: 0.19"]; + 124 -- 170 [label="110: 0.19: 0.20"]; + 124 -- 173 [label="109: 0.19: 0.20"]; + 124 -- 177 [label="109: 0.19: 0.19"]; + 124 -- 181 [label="104: 0.19: 0.15"]; + 124 -- 190 [label="64: 0.19: 0.15"]; + 125 -- 150 [label="117: 0.16: 0.17"]; + 125 -- 151 [label="117: 0.16: 0.17"]; + 125 -- 152 [label="117: 0.16: 0.17"]; + 125 -- 153 [label="117: 0.16: 0.17"]; + 125 -- 154 [label="117: 0.16: 0.17"]; + 125 -- 155 [label="116: 0.16: 0.17"]; + 125 -- 163 [label="109: 0.16: 0.18"]; + 125 -- 164 [label="106: 0.16: 0.17"]; + 125 -- 169 [label="102: 0.16: 0.19"]; + 125 -- 170 [label="102: 0.16: 0.20"]; + 125 -- 172 [label="104: 0.16: 0.20"]; + 125 -- 173 [label="101: 0.16: 0.20"]; + 125 -- 174 [label="104: 0.16: 0.20"]; + 125 -- 176 [label="103: 0.16: 0.19"]; + 125 -- 177 [label="103: 0.16: 0.19"]; + 125 -- 178 [label="103: 0.16: 0.19"]; + 125 
-- 179 [label="102: 0.16: 0.19"]; + 125 -- 183 [label="100: 0.16: 0.17"]; + 125 -- 190 [label="77: 0.16: 0.15"]; + 125 -- 191 [label="80: 0.16: 0.15"]; + 126 -- 150 [label="116: 0.16: 0.17"]; + 126 -- 151 [label="116: 0.16: 0.17"]; + 126 -- 152 [label="116: 0.16: 0.17"]; + 126 -- 153 [label="116: 0.16: 0.17"]; + 126 -- 154 [label="116: 0.16: 0.17"]; + 126 -- 155 [label="115: 0.16: 0.17"]; + 126 -- 163 [label="108: 0.16: 0.18"]; + 126 -- 164 [label="105: 0.16: 0.17"]; + 126 -- 169 [label="100: 0.16: 0.19"]; + 126 -- 170 [label="100: 0.16: 0.20"]; + 126 -- 172 [label="102: 0.16: 0.20"]; + 126 -- 173 [label="99: 0.16: 0.20"]; + 126 -- 174 [label="102: 0.16: 0.20"]; + 126 -- 176 [label="101: 0.16: 0.19"]; + 126 -- 177 [label="101: 0.16: 0.19"]; + 126 -- 178 [label="101: 0.16: 0.19"]; + 126 -- 179 [label="100: 0.16: 0.19"]; + 126 -- 181 [label="106: 0.16: 0.15"]; + 126 -- 183 [label="98: 0.16: 0.17"]; + 126 -- 184 [label="99: 0.16: 0.17"]; + 126 -- 185 [label="99: 0.16: 0.17"]; + 126 -- 190 [label="76: 0.16: 0.15"]; + 126 -- 191 [label="80: 0.16: 0.15"]; + 127 -- 150 [label="102: 0.18: 0.17"]; + 127 -- 151 [label="102: 0.18: 0.17"]; + 127 -- 152 [label="103: 0.18: 0.17"]; + 127 -- 153 [label="102: 0.18: 0.17"]; + 127 -- 154 [label="102: 0.18: 0.17"]; + 127 -- 155 [label="102: 0.18: 0.17"]; + 127 -- 163 [label="96: 0.18: 0.18"]; + 127 -- 164 [label="93: 0.18: 0.17"]; + 127 -- 169 [label="99: 0.18: 0.19"]; + 127 -- 170 [label="99: 0.18: 0.20"]; + 127 -- 172 [label="100: 0.18: 0.20"]; + 127 -- 173 [label="98: 0.18: 0.20"]; + 127 -- 174 [label="100: 0.18: 0.20"]; + 127 -- 176 [label="99: 0.18: 0.19"]; + 127 -- 177 [label="99: 0.18: 0.19"]; + 127 -- 178 [label="99: 0.18: 0.19"]; + 127 -- 179 [label="98: 0.18: 0.19"]; + 127 -- 181 [label="102: 0.18: 0.15"]; + 127 -- 183 [label="99: 0.18: 0.17"]; + 127 -- 184 [label="99: 0.18: 0.17"]; + 127 -- 185 [label="99: 0.18: 0.17"]; + 127 -- 190 [label="63: 0.18: 0.15"]; + 127 -- 191 [label="82: 0.18: 0.15"]; + 128 -- 150 [label="103: 
0.19: 0.17"]; + 128 -- 151 [label="103: 0.19: 0.17"]; + 128 -- 152 [label="104: 0.19: 0.17"]; + 128 -- 153 [label="103: 0.19: 0.17"]; + 128 -- 154 [label="103: 0.19: 0.17"]; + 128 -- 155 [label="103: 0.19: 0.17"]; + 128 -- 163 [label="94: 0.19: 0.18"]; + 128 -- 164 [label="92: 0.19: 0.17"]; + 128 -- 169 [label="104: 0.19: 0.19"]; + 128 -- 170 [label="104: 0.19: 0.20"]; + 128 -- 172 [label="105: 0.19: 0.20"]; + 128 -- 173 [label="103: 0.19: 0.20"]; + 128 -- 174 [label="105: 0.19: 0.20"]; + 128 -- 176 [label="104: 0.19: 0.19"]; + 128 -- 177 [label="104: 0.19: 0.19"]; + 128 -- 178 [label="104: 0.19: 0.19"]; + 128 -- 179 [label="103: 0.19: 0.19"]; + 128 -- 181 [label="106: 0.19: 0.15"]; + 128 -- 190 [label="61: 0.19: 0.15"]; + 129 -- 150 [label="107: 0.19: 0.17"]; + 129 -- 151 [label="107: 0.19: 0.17"]; + 129 -- 152 [label="108: 0.19: 0.17"]; + 129 -- 153 [label="107: 0.19: 0.17"]; + 129 -- 154 [label="107: 0.19: 0.17"]; + 129 -- 155 [label="107: 0.19: 0.17"]; + 129 -- 163 [label="103: 0.19: 0.18"]; + 129 -- 164 [label="100: 0.19: 0.17"]; + 129 -- 190 [label="65: 0.19: 0.15"]; + 130 -- 150 [label="108: 0.18: 0.17"]; + 130 -- 151 [label="108: 0.18: 0.17"]; + 130 -- 152 [label="109: 0.18: 0.17"]; + 130 -- 153 [label="108: 0.18: 0.17"]; + 130 -- 154 [label="108: 0.18: 0.17"]; + 130 -- 155 [label="108: 0.18: 0.17"]; + 130 -- 163 [label="103: 0.18: 0.18"]; + 130 -- 164 [label="100: 0.18: 0.17"]; + 130 -- 190 [label="66: 0.18: 0.15"]; + 131 -- 142 [label="118: 0.22: 0.18"]; + 131 -- 144 [label="114: 0.22: 0.21"]; + 131 -- 147 [label="116: 0.22: 0.18"]; + 131 -- 150 [label="101: 0.22: 0.17"]; + 131 -- 151 [label="101: 0.22: 0.17"]; + 131 -- 152 [label="102: 0.22: 0.17"]; + 131 -- 153 [label="101: 0.22: 0.17"]; + 131 -- 154 [label="101: 0.22: 0.17"]; + 131 -- 155 [label="101: 0.22: 0.17"]; + 131 -- 163 [label="103: 0.22: 0.18"]; + 131 -- 164 [label="101: 0.22: 0.17"]; + 131 -- 181 [label="87: 0.22: 0.15"]; + 131 -- 188 [label="81: 0.22: 0.15"]; + 131 -- 190 [label="69: 0.22: 
0.15"]; + 131 -- 191 [label="79: 0.22: 0.15"]; + 132 -- 150 [label="110: 0.17: 0.17"]; + 132 -- 151 [label="110: 0.17: 0.17"]; + 132 -- 152 [label="111: 0.17: 0.17"]; + 132 -- 153 [label="110: 0.17: 0.17"]; + 132 -- 154 [label="110: 0.17: 0.17"]; + 132 -- 155 [label="111: 0.17: 0.17"]; + 132 -- 163 [label="96: 0.17: 0.18"]; + 132 -- 164 [label="93: 0.17: 0.17"]; + 132 -- 169 [label="109: 0.17: 0.19"]; + 132 -- 170 [label="109: 0.17: 0.20"]; + 132 -- 172 [label="110: 0.17: 0.20"]; + 132 -- 173 [label="109: 0.17: 0.20"]; + 132 -- 174 [label="110: 0.17: 0.20"]; + 132 -- 177 [label="109: 0.17: 0.19"]; + 132 -- 178 [label="109: 0.17: 0.19"]; + 132 -- 190 [label="68: 0.17: 0.15"]; + 133 -- 150 [label="109: 0.17: 0.17"]; + 133 -- 151 [label="109: 0.17: 0.17"]; + 133 -- 152 [label="110: 0.17: 0.17"]; + 133 -- 153 [label="109: 0.17: 0.17"]; + 133 -- 154 [label="109: 0.17: 0.17"]; + 133 -- 155 [label="110: 0.17: 0.17"]; + 133 -- 163 [label="95: 0.17: 0.18"]; + 133 -- 164 [label="92: 0.17: 0.17"]; + 133 -- 169 [label="108: 0.17: 0.19"]; + 133 -- 170 [label="108: 0.17: 0.20"]; + 133 -- 172 [label="109: 0.17: 0.20"]; + 133 -- 173 [label="108: 0.17: 0.20"]; + 133 -- 174 [label="109: 0.17: 0.20"]; + 133 -- 176 [label="109: 0.17: 0.19"]; + 133 -- 177 [label="108: 0.17: 0.19"]; + 133 -- 178 [label="108: 0.17: 0.19"]; + 133 -- 179 [label="107: 0.17: 0.19"]; + 133 -- 190 [label="67: 0.17: 0.15"]; + 134 -- 150 [label="110: 0.17: 0.17"]; + 134 -- 151 [label="110: 0.17: 0.17"]; + 134 -- 152 [label="111: 0.17: 0.17"]; + 134 -- 153 [label="110: 0.17: 0.17"]; + 134 -- 154 [label="110: 0.17: 0.17"]; + 134 -- 155 [label="111: 0.17: 0.17"]; + 134 -- 163 [label="95: 0.17: 0.18"]; + 134 -- 164 [label="92: 0.17: 0.17"]; + 134 -- 169 [label="108: 0.17: 0.19"]; + 134 -- 170 [label="108: 0.17: 0.20"]; + 134 -- 172 [label="109: 0.17: 0.20"]; + 134 -- 173 [label="108: 0.17: 0.20"]; + 134 -- 174 [label="109: 0.17: 0.20"]; + 134 -- 176 [label="109: 0.17: 0.19"]; + 134 -- 177 [label="108: 0.17: 0.19"]; 
+ 134 -- 178 [label="108: 0.17: 0.19"]; + 134 -- 179 [label="107: 0.17: 0.19"]; + 134 -- 190 [label="68: 0.17: 0.15"]; + 135 -- 150 [label="110: 0.17: 0.17"]; + 135 -- 151 [label="110: 0.17: 0.17"]; + 135 -- 152 [label="111: 0.17: 0.17"]; + 135 -- 153 [label="110: 0.17: 0.17"]; + 135 -- 154 [label="110: 0.17: 0.17"]; + 135 -- 155 [label="111: 0.17: 0.17"]; + 135 -- 163 [label="94: 0.17: 0.18"]; + 135 -- 164 [label="91: 0.17: 0.17"]; + 135 -- 169 [label="108: 0.17: 0.19"]; + 135 -- 170 [label="108: 0.17: 0.20"]; + 135 -- 172 [label="109: 0.17: 0.20"]; + 135 -- 173 [label="107: 0.17: 0.20"]; + 135 -- 174 [label="109: 0.17: 0.20"]; + 135 -- 176 [label="108: 0.17: 0.19"]; + 135 -- 177 [label="108: 0.17: 0.19"]; + 135 -- 178 [label="108: 0.17: 0.19"]; + 135 -- 179 [label="107: 0.17: 0.19"]; + 135 -- 190 [label="68: 0.17: 0.15"]; + 136 -- 150 [label="104: 0.18: 0.17"]; + 136 -- 151 [label="104: 0.18: 0.17"]; + 136 -- 152 [label="105: 0.18: 0.17"]; + 136 -- 153 [label="104: 0.18: 0.17"]; + 136 -- 154 [label="104: 0.18: 0.17"]; + 136 -- 155 [label="104: 0.18: 0.17"]; + 136 -- 163 [label="100: 0.18: 0.18"]; + 136 -- 164 [label="97: 0.18: 0.17"]; + 136 -- 169 [label="111: 0.18: 0.19"]; + 136 -- 170 [label="111: 0.18: 0.20"]; + 136 -- 173 [label="110: 0.18: 0.20"]; + 136 -- 181 [label="105: 0.18: 0.15"]; + 136 -- 190 [label="62: 0.18: 0.15"]; + 137 -- 150 [label="104: 0.18: 0.17"]; + 137 -- 151 [label="104: 0.18: 0.17"]; + 137 -- 152 [label="105: 0.18: 0.17"]; + 137 -- 153 [label="104: 0.18: 0.17"]; + 137 -- 154 [label="104: 0.18: 0.17"]; + 137 -- 155 [label="104: 0.18: 0.17"]; + 137 -- 163 [label="99: 0.18: 0.18"]; + 137 -- 164 [label="96: 0.18: 0.17"]; + 137 -- 169 [label="111: 0.18: 0.19"]; + 137 -- 170 [label="111: 0.18: 0.20"]; + 137 -- 173 [label="110: 0.18: 0.20"]; + 137 -- 181 [label="105: 0.18: 0.15"]; + 137 -- 190 [label="62: 0.18: 0.15"]; + 138 -- 150 [label="106: 0.18: 0.17"]; + 138 -- 151 [label="106: 0.18: 0.17"]; + 138 -- 152 [label="107: 0.18: 0.17"]; + 138 -- 
153 [label="106: 0.18: 0.17"]; + 138 -- 154 [label="106: 0.18: 0.17"]; + 138 -- 155 [label="106: 0.18: 0.17"]; + 138 -- 163 [label="99: 0.18: 0.18"]; + 138 -- 164 [label="97: 0.18: 0.17"]; + 138 -- 169 [label="108: 0.18: 0.19"]; + 138 -- 170 [label="108: 0.18: 0.20"]; + 138 -- 172 [label="109: 0.18: 0.20"]; + 138 -- 173 [label="107: 0.18: 0.20"]; + 138 -- 174 [label="109: 0.18: 0.20"]; + 138 -- 176 [label="108: 0.18: 0.19"]; + 138 -- 177 [label="108: 0.18: 0.19"]; + 138 -- 178 [label="107: 0.18: 0.19"]; + 138 -- 179 [label="106: 0.18: 0.19"]; + 138 -- 190 [label="64: 0.18: 0.15"]; + 139 -- 150 [label="106: 0.16: 0.17"]; + 139 -- 151 [label="106: 0.16: 0.17"]; + 139 -- 152 [label="107: 0.16: 0.17"]; + 139 -- 153 [label="106: 0.16: 0.17"]; + 139 -- 154 [label="106: 0.16: 0.17"]; + 139 -- 155 [label="107: 0.16: 0.17"]; + 139 -- 163 [label="93: 0.16: 0.18"]; + 139 -- 164 [label="90: 0.16: 0.17"]; + 139 -- 169 [label="106: 0.16: 0.19"]; + 139 -- 170 [label="106: 0.16: 0.20"]; + 139 -- 172 [label="107: 0.16: 0.20"]; + 139 -- 173 [label="106: 0.16: 0.20"]; + 139 -- 174 [label="107: 0.16: 0.20"]; + 139 -- 176 [label="107: 0.16: 0.19"]; + 139 -- 177 [label="106: 0.16: 0.19"]; + 139 -- 178 [label="106: 0.16: 0.19"]; + 139 -- 179 [label="105: 0.16: 0.19"]; + 139 -- 190 [label="64: 0.16: 0.15"]; + 140 -- 150 [label="108: 0.16: 0.17"]; + 140 -- 151 [label="108: 0.16: 0.17"]; + 140 -- 152 [label="109: 0.16: 0.17"]; + 140 -- 153 [label="108: 0.16: 0.17"]; + 140 -- 154 [label="108: 0.16: 0.17"]; + 140 -- 155 [label="109: 0.16: 0.17"]; + 140 -- 163 [label="91: 0.16: 0.18"]; + 140 -- 164 [label="89: 0.16: 0.17"]; + 140 -- 169 [label="105: 0.16: 0.19"]; + 140 -- 170 [label="105: 0.16: 0.20"]; + 140 -- 172 [label="106: 0.16: 0.20"]; + 140 -- 173 [label="105: 0.16: 0.20"]; + 140 -- 174 [label="106: 0.16: 0.20"]; + 140 -- 176 [label="106: 0.16: 0.19"]; + 140 -- 177 [label="105: 0.16: 0.19"]; + 140 -- 178 [label="105: 0.16: 0.19"]; + 140 -- 179 [label="104: 0.16: 0.19"]; + 140 -- 190 
[label="66: 0.16: 0.15"]; + 141 -- 150 [label="108: 0.16: 0.17"]; + 141 -- 151 [label="108: 0.16: 0.17"]; + 141 -- 152 [label="109: 0.16: 0.17"]; + 141 -- 153 [label="108: 0.16: 0.17"]; + 141 -- 154 [label="108: 0.16: 0.17"]; + 141 -- 155 [label="109: 0.16: 0.17"]; + 141 -- 163 [label="92: 0.16: 0.18"]; + 141 -- 164 [label="90: 0.16: 0.17"]; + 141 -- 169 [label="104: 0.16: 0.19"]; + 141 -- 170 [label="104: 0.16: 0.20"]; + 141 -- 172 [label="105: 0.16: 0.20"]; + 141 -- 173 [label="104: 0.16: 0.20"]; + 141 -- 174 [label="105: 0.16: 0.20"]; + 141 -- 176 [label="105: 0.16: 0.19"]; + 141 -- 177 [label="104: 0.16: 0.19"]; + 141 -- 178 [label="104: 0.16: 0.19"]; + 141 -- 179 [label="103: 0.16: 0.19"]; + 141 -- 190 [label="66: 0.16: 0.15"]; + 142 -- 150 [label="114: 0.18: 0.17"]; + 142 -- 151 [label="114: 0.18: 0.17"]; + 142 -- 152 [label="114: 0.18: 0.17"]; + 142 -- 153 [label="114: 0.18: 0.17"]; + 142 -- 154 [label="114: 0.18: 0.17"]; + 142 -- 155 [label="114: 0.18: 0.17"]; + 142 -- 163 [label="117: 0.18: 0.18"]; + 142 -- 164 [label="114: 0.18: 0.17"]; + 142 -- 167 [label="97: 0.18: 0.20"]; + 142 -- 175 [label="96: 0.18: 0.20"]; + 142 -- 189 [label="85: 0.18: 0.16"]; + 142 -- 190 [label="70: 0.18: 0.15"]; + 143 -- 150 [label="99: 0.18: 0.17"]; + 143 -- 151 [label="99: 0.18: 0.17"]; + 143 -- 152 [label="100: 0.18: 0.17"]; + 143 -- 153 [label="99: 0.18: 0.17"]; + 143 -- 154 [label="99: 0.18: 0.17"]; + 143 -- 155 [label="99: 0.18: 0.17"]; + 143 -- 163 [label="83: 0.18: 0.18"]; + 143 -- 164 [label="81: 0.18: 0.17"]; + 143 -- 169 [label="100: 0.18: 0.19"]; + 143 -- 170 [label="100: 0.18: 0.20"]; + 143 -- 172 [label="101: 0.18: 0.20"]; + 143 -- 173 [label="99: 0.18: 0.20"]; + 143 -- 174 [label="101: 0.18: 0.20"]; + 143 -- 176 [label="100: 0.18: 0.19"]; + 143 -- 177 [label="100: 0.18: 0.19"]; + 143 -- 178 [label="100: 0.18: 0.19"]; + 143 -- 179 [label="99: 0.18: 0.19"]; + 143 -- 181 [label="102: 0.18: 0.15"]; + 143 -- 183 [label="100: 0.18: 0.17"]; + 143 -- 184 [label="100: 
0.18: 0.17"]; + 143 -- 190 [label="57: 0.18: 0.15"]; + 143 -- 191 [label="84: 0.18: 0.15"]; + 144 -- 150 [label="103: 0.21: 0.17"]; + 144 -- 151 [label="103: 0.21: 0.17"]; + 144 -- 152 [label="103: 0.21: 0.17"]; + 144 -- 153 [label="103: 0.21: 0.17"]; + 144 -- 154 [label="103: 0.21: 0.17"]; + 144 -- 155 [label="103: 0.21: 0.17"]; + 144 -- 163 [label="109: 0.21: 0.18"]; + 144 -- 164 [label="106: 0.21: 0.17"]; + 144 -- 167 [label="97: 0.21: 0.20"]; + 144 -- 169 [label="98: 0.21: 0.19"]; + 144 -- 170 [label="98: 0.21: 0.20"]; + 144 -- 172 [label="100: 0.21: 0.20"]; + 144 -- 173 [label="98: 0.21: 0.20"]; + 144 -- 174 [label="100: 0.21: 0.20"]; + 144 -- 175 [label="96: 0.21: 0.20"]; + 144 -- 176 [label="100: 0.21: 0.19"]; + 144 -- 177 [label="98: 0.21: 0.19"]; + 144 -- 178 [label="98: 0.21: 0.19"]; + 144 -- 179 [label="96: 0.21: 0.19"]; + 144 -- 181 [label="101: 0.21: 0.15"]; + 144 -- 183 [label="95: 0.21: 0.17"]; + 144 -- 184 [label="95: 0.21: 0.17"]; + 144 -- 185 [label="95: 0.21: 0.17"]; + 144 -- 186 [label="95: 0.21: 0.17"]; + 144 -- 187 [label="82: 0.21: 0.15"]; + 144 -- 188 [label="85: 0.21: 0.15"]; + 144 -- 189 [label="79: 0.21: 0.16"]; + 144 -- 190 [label="75: 0.21: 0.15"]; + 144 -- 191 [label="85: 0.21: 0.15"]; + 145 -- 150 [label="106: 0.16: 0.17"]; + 145 -- 151 [label="106: 0.16: 0.17"]; + 145 -- 152 [label="107: 0.16: 0.17"]; + 145 -- 153 [label="106: 0.16: 0.17"]; + 145 -- 154 [label="106: 0.16: 0.17"]; + 145 -- 155 [label="107: 0.16: 0.17"]; + 145 -- 163 [label="92: 0.16: 0.18"]; + 145 -- 164 [label="89: 0.16: 0.17"]; + 145 -- 169 [label="105: 0.16: 0.19"]; + 145 -- 170 [label="105: 0.16: 0.20"]; + 145 -- 172 [label="106: 0.16: 0.20"]; + 145 -- 173 [label="105: 0.16: 0.20"]; + 145 -- 174 [label="106: 0.16: 0.20"]; + 145 -- 176 [label="106: 0.16: 0.19"]; + 145 -- 177 [label="105: 0.16: 0.19"]; + 145 -- 178 [label="105: 0.16: 0.19"]; + 145 -- 179 [label="104: 0.16: 0.19"]; + 145 -- 190 [label="64: 0.16: 0.15"]; + 146 -- 150 [label="99: 0.18: 0.17"]; + 146 -- 
151 [label="99: 0.18: 0.17"]; + 146 -- 152 [label="100: 0.18: 0.17"]; + 146 -- 153 [label="99: 0.18: 0.17"]; + 146 -- 154 [label="99: 0.18: 0.17"]; + 146 -- 155 [label="99: 0.18: 0.17"]; + 146 -- 163 [label="82: 0.18: 0.18"]; + 146 -- 164 [label="80: 0.18: 0.17"]; + 146 -- 169 [label="99: 0.18: 0.19"]; + 146 -- 170 [label="99: 0.18: 0.20"]; + 146 -- 172 [label="100: 0.18: 0.20"]; + 146 -- 173 [label="98: 0.18: 0.20"]; + 146 -- 174 [label="100: 0.18: 0.20"]; + 146 -- 176 [label="99: 0.18: 0.19"]; + 146 -- 177 [label="99: 0.18: 0.19"]; + 146 -- 178 [label="99: 0.18: 0.19"]; + 146 -- 179 [label="98: 0.18: 0.19"]; + 146 -- 181 [label="102: 0.18: 0.15"]; + 146 -- 183 [label="99: 0.18: 0.17"]; + 146 -- 184 [label="99: 0.18: 0.17"]; + 146 -- 185 [label="99: 0.18: 0.17"]; + 146 -- 190 [label="57: 0.18: 0.15"]; + 146 -- 191 [label="83: 0.18: 0.15"]; + 147 -- 150 [label="113: 0.18: 0.17"]; + 147 -- 151 [label="113: 0.18: 0.17"]; + 147 -- 152 [label="113: 0.18: 0.17"]; + 147 -- 153 [label="113: 0.18: 0.17"]; + 147 -- 154 [label="113: 0.18: 0.17"]; + 147 -- 155 [label="113: 0.18: 0.17"]; + 147 -- 163 [label="114: 0.18: 0.18"]; + 147 -- 164 [label="111: 0.18: 0.17"]; + 147 -- 167 [label="96: 0.18: 0.20"]; + 147 -- 175 [label="95: 0.18: 0.20"]; + 147 -- 184 [label="100: 0.18: 0.17"]; + 147 -- 189 [label="84: 0.18: 0.16"]; + 147 -- 190 [label="70: 0.18: 0.15"]; + 148 -- 150 [label="107: 0.16: 0.17"]; + 148 -- 151 [label="107: 0.16: 0.17"]; + 148 -- 152 [label="108: 0.16: 0.17"]; + 148 -- 153 [label="107: 0.16: 0.17"]; + 148 -- 154 [label="107: 0.16: 0.17"]; + 148 -- 155 [label="108: 0.16: 0.17"]; + 148 -- 163 [label="90: 0.16: 0.18"]; + 148 -- 164 [label="88: 0.16: 0.17"]; + 148 -- 169 [label="103: 0.16: 0.19"]; + 148 -- 170 [label="103: 0.16: 0.20"]; + 148 -- 172 [label="104: 0.16: 0.20"]; + 148 -- 173 [label="103: 0.16: 0.20"]; + 148 -- 174 [label="104: 0.16: 0.20"]; + 148 -- 176 [label="104: 0.16: 0.19"]; + 148 -- 177 [label="103: 0.16: 0.19"]; + 148 -- 178 [label="103: 0.16: 
0.19"]; + 148 -- 179 [label="102: 0.16: 0.19"]; + 148 -- 190 [label="65: 0.16: 0.15"]; + 149 -- 150 [label="107: 0.16: 0.17"]; + 149 -- 151 [label="107: 0.16: 0.17"]; + 149 -- 152 [label="108: 0.16: 0.17"]; + 149 -- 153 [label="107: 0.16: 0.17"]; + 149 -- 154 [label="107: 0.16: 0.17"]; + 149 -- 155 [label="108: 0.16: 0.17"]; + 149 -- 163 [label="89: 0.16: 0.18"]; + 149 -- 164 [label="87: 0.16: 0.17"]; + 149 -- 169 [label="104: 0.16: 0.19"]; + 149 -- 170 [label="104: 0.16: 0.20"]; + 149 -- 172 [label="105: 0.16: 0.20"]; + 149 -- 173 [label="104: 0.16: 0.20"]; + 149 -- 174 [label="105: 0.16: 0.20"]; + 149 -- 176 [label="105: 0.16: 0.19"]; + 149 -- 177 [label="104: 0.16: 0.19"]; + 149 -- 178 [label="104: 0.16: 0.19"]; + 149 -- 179 [label="103: 0.16: 0.19"]; + 149 -- 190 [label="65: 0.16: 0.15"]; + 150 -- 156 [label="103: 0.17: 0.16"]; + 150 -- 157 [label="106: 0.17: 0.16"]; + 150 -- 158 [label="102: 0.17: 0.17"]; + 150 -- 159 [label="93: 0.17: 0.17"]; + 150 -- 160 [label="100: 0.17: 0.17"]; + 150 -- 161 [label="103: 0.17: 0.15"]; + 150 -- 162 [label="104: 0.17: 0.15"]; + 150 -- 163 [label="102: 0.17: 0.18"]; + 150 -- 164 [label="99: 0.17: 0.17"]; + 150 -- 165 [label="98: 0.17: 0.16"]; + 150 -- 166 [label="98: 0.17: 0.16"]; + 150 -- 167 [label="81: 0.17: 0.20"]; + 150 -- 168 [label="97: 0.17: 0.16"]; + 150 -- 169 [label="106: 0.17: 0.19"]; + 150 -- 170 [label="106: 0.17: 0.20"]; + 150 -- 171 [label="94: 0.17: 0.16"]; + 150 -- 172 [label="107: 0.17: 0.20"]; + 150 -- 173 [label="104: 0.17: 0.20"]; + 150 -- 174 [label="107: 0.17: 0.20"]; + 150 -- 175 [label="80: 0.17: 0.20"]; + 150 -- 176 [label="105: 0.17: 0.19"]; + 150 -- 177 [label="106: 0.17: 0.19"]; + 150 -- 178 [label="106: 0.17: 0.19"]; + 150 -- 179 [label="105: 0.17: 0.19"]; + 150 -- 180 [label="94: 0.17: 0.15"]; + 150 -- 182 [label="92: 0.17: 0.15"]; + 150 -- 183 [label="89: 0.17: 0.17"]; + 150 -- 184 [label="90: 0.17: 0.17"]; + 150 -- 185 [label="90: 0.17: 0.17"]; + 150 -- 186 [label="90: 0.17: 0.17"]; + 150 -- 
187 [label="76: 0.17: 0.15"]; + 150 -- 188 [label="80: 0.17: 0.15"]; + 150 -- 189 [label="62: 0.17: 0.16"]; + 150 -- 191 [label="82: 0.17: 0.15"]; + 151 -- 156 [label="103: 0.17: 0.16"]; + 151 -- 157 [label="106: 0.17: 0.16"]; + 151 -- 158 [label="102: 0.17: 0.17"]; + 151 -- 159 [label="93: 0.17: 0.17"]; + 151 -- 160 [label="100: 0.17: 0.17"]; + 151 -- 161 [label="103: 0.17: 0.15"]; + 151 -- 162 [label="104: 0.17: 0.15"]; + 151 -- 163 [label="101: 0.17: 0.18"]; + 151 -- 164 [label="98: 0.17: 0.17"]; + 151 -- 165 [label="98: 0.17: 0.16"]; + 151 -- 166 [label="98: 0.17: 0.16"]; + 151 -- 167 [label="81: 0.17: 0.20"]; + 151 -- 168 [label="97: 0.17: 0.16"]; + 151 -- 169 [label="105: 0.17: 0.19"]; + 151 -- 170 [label="105: 0.17: 0.20"]; + 151 -- 171 [label="94: 0.17: 0.16"]; + 151 -- 172 [label="106: 0.17: 0.20"]; + 151 -- 173 [label="103: 0.17: 0.20"]; + 151 -- 174 [label="106: 0.17: 0.20"]; + 151 -- 175 [label="80: 0.17: 0.20"]; + 151 -- 176 [label="104: 0.17: 0.19"]; + 151 -- 177 [label="105: 0.17: 0.19"]; + 151 -- 178 [label="105: 0.17: 0.19"]; + 151 -- 179 [label="104: 0.17: 0.19"]; + 151 -- 180 [label="94: 0.17: 0.15"]; + 151 -- 182 [label="92: 0.17: 0.15"]; + 151 -- 183 [label="88: 0.17: 0.17"]; + 151 -- 184 [label="89: 0.17: 0.17"]; + 151 -- 185 [label="89: 0.17: 0.17"]; + 151 -- 186 [label="89: 0.17: 0.17"]; + 151 -- 187 [label="76: 0.17: 0.15"]; + 151 -- 188 [label="80: 0.17: 0.15"]; + 151 -- 189 [label="62: 0.17: 0.16"]; + 151 -- 191 [label="81: 0.17: 0.15"]; + 152 -- 156 [label="104: 0.17: 0.16"]; + 152 -- 157 [label="107: 0.17: 0.16"]; + 152 -- 158 [label="103: 0.17: 0.17"]; + 152 -- 159 [label="94: 0.17: 0.17"]; + 152 -- 160 [label="101: 0.17: 0.17"]; + 152 -- 161 [label="104: 0.17: 0.15"]; + 152 -- 162 [label="105: 0.17: 0.15"]; + 152 -- 163 [label="101: 0.17: 0.18"]; + 152 -- 164 [label="98: 0.17: 0.17"]; + 152 -- 165 [label="99: 0.17: 0.16"]; + 152 -- 166 [label="99: 0.17: 0.16"]; + 152 -- 167 [label="82: 0.17: 0.20"]; + 152 -- 168 [label="98: 0.17: 
0.16"]; + 152 -- 169 [label="105: 0.17: 0.19"]; + 152 -- 170 [label="105: 0.17: 0.20"]; + 152 -- 171 [label="95: 0.17: 0.16"]; + 152 -- 172 [label="106: 0.17: 0.20"]; + 152 -- 173 [label="103: 0.17: 0.20"]; + 152 -- 174 [label="106: 0.17: 0.20"]; + 152 -- 175 [label="81: 0.17: 0.20"]; + 152 -- 176 [label="104: 0.17: 0.19"]; + 152 -- 177 [label="105: 0.17: 0.19"]; + 152 -- 178 [label="105: 0.17: 0.19"]; + 152 -- 179 [label="104: 0.17: 0.19"]; + 152 -- 180 [label="95: 0.17: 0.15"]; + 152 -- 182 [label="92: 0.17: 0.15"]; + 152 -- 183 [label="88: 0.17: 0.17"]; + 152 -- 184 [label="89: 0.17: 0.17"]; + 152 -- 185 [label="89: 0.17: 0.17"]; + 152 -- 186 [label="89: 0.17: 0.17"]; + 152 -- 187 [label="77: 0.17: 0.15"]; + 152 -- 188 [label="80: 0.17: 0.15"]; + 152 -- 189 [label="63: 0.17: 0.16"]; + 152 -- 191 [label="81: 0.17: 0.15"]; + 153 -- 156 [label="103: 0.17: 0.16"]; + 153 -- 157 [label="106: 0.17: 0.16"]; + 153 -- 158 [label="102: 0.17: 0.17"]; + 153 -- 159 [label="93: 0.17: 0.17"]; + 153 -- 160 [label="100: 0.17: 0.17"]; + 153 -- 161 [label="103: 0.17: 0.15"]; + 153 -- 162 [label="104: 0.17: 0.15"]; + 153 -- 163 [label="101: 0.17: 0.18"]; + 153 -- 164 [label="98: 0.17: 0.17"]; + 153 -- 165 [label="98: 0.17: 0.16"]; + 153 -- 166 [label="98: 0.17: 0.16"]; + 153 -- 167 [label="81: 0.17: 0.20"]; + 153 -- 168 [label="97: 0.17: 0.16"]; + 153 -- 169 [label="105: 0.17: 0.19"]; + 153 -- 170 [label="105: 0.17: 0.20"]; + 153 -- 171 [label="94: 0.17: 0.16"]; + 153 -- 172 [label="106: 0.17: 0.20"]; + 153 -- 173 [label="103: 0.17: 0.20"]; + 153 -- 174 [label="106: 0.17: 0.20"]; + 153 -- 175 [label="80: 0.17: 0.20"]; + 153 -- 176 [label="104: 0.17: 0.19"]; + 153 -- 177 [label="105: 0.17: 0.19"]; + 153 -- 178 [label="105: 0.17: 0.19"]; + 153 -- 179 [label="104: 0.17: 0.19"]; + 153 -- 180 [label="94: 0.17: 0.15"]; + 153 -- 182 [label="92: 0.17: 0.15"]; + 153 -- 183 [label="88: 0.17: 0.17"]; + 153 -- 184 [label="89: 0.17: 0.17"]; + 153 -- 185 [label="89: 0.17: 0.17"]; + 153 -- 186 
[label="89: 0.17: 0.17"]; + 153 -- 187 [label="76: 0.17: 0.15"]; + 153 -- 188 [label="80: 0.17: 0.15"]; + 153 -- 189 [label="62: 0.17: 0.16"]; + 153 -- 191 [label="81: 0.17: 0.15"]; + 154 -- 156 [label="103: 0.17: 0.16"]; + 154 -- 157 [label="106: 0.17: 0.16"]; + 154 -- 158 [label="102: 0.17: 0.17"]; + 154 -- 159 [label="93: 0.17: 0.17"]; + 154 -- 160 [label="100: 0.17: 0.17"]; + 154 -- 161 [label="103: 0.17: 0.15"]; + 154 -- 162 [label="104: 0.17: 0.15"]; + 154 -- 163 [label="101: 0.17: 0.18"]; + 154 -- 164 [label="98: 0.17: 0.17"]; + 154 -- 165 [label="98: 0.17: 0.16"]; + 154 -- 166 [label="98: 0.17: 0.16"]; + 154 -- 167 [label="81: 0.17: 0.20"]; + 154 -- 168 [label="97: 0.17: 0.16"]; + 154 -- 169 [label="105: 0.17: 0.19"]; + 154 -- 170 [label="105: 0.17: 0.20"]; + 154 -- 171 [label="94: 0.17: 0.16"]; + 154 -- 172 [label="106: 0.17: 0.20"]; + 154 -- 173 [label="103: 0.17: 0.20"]; + 154 -- 174 [label="106: 0.17: 0.20"]; + 154 -- 175 [label="80: 0.17: 0.20"]; + 154 -- 176 [label="104: 0.17: 0.19"]; + 154 -- 177 [label="105: 0.17: 0.19"]; + 154 -- 178 [label="105: 0.17: 0.19"]; + 154 -- 179 [label="104: 0.17: 0.19"]; + 154 -- 180 [label="94: 0.17: 0.15"]; + 154 -- 182 [label="92: 0.17: 0.15"]; + 154 -- 183 [label="88: 0.17: 0.17"]; + 154 -- 184 [label="89: 0.17: 0.17"]; + 154 -- 185 [label="89: 0.17: 0.17"]; + 154 -- 186 [label="89: 0.17: 0.17"]; + 154 -- 187 [label="76: 0.17: 0.15"]; + 154 -- 188 [label="80: 0.17: 0.15"]; + 154 -- 189 [label="62: 0.17: 0.16"]; + 154 -- 191 [label="81: 0.17: 0.15"]; + 155 -- 156 [label="103: 0.17: 0.16"]; + 155 -- 157 [label="107: 0.17: 0.16"]; + 155 -- 158 [label="102: 0.17: 0.17"]; + 155 -- 159 [label="93: 0.17: 0.17"]; + 155 -- 160 [label="100: 0.17: 0.17"]; + 155 -- 161 [label="104: 0.17: 0.15"]; + 155 -- 162 [label="105: 0.17: 0.15"]; + 155 -- 163 [label="100: 0.17: 0.18"]; + 155 -- 164 [label="97: 0.17: 0.17"]; + 155 -- 165 [label="99: 0.17: 0.16"]; + 155 -- 166 [label="99: 0.17: 0.16"]; + 155 -- 167 [label="81: 0.17: 0.20"]; 
+ 155 -- 168 [label="98: 0.17: 0.16"]; + 155 -- 169 [label="104: 0.17: 0.19"]; + 155 -- 170 [label="104: 0.17: 0.20"]; + 155 -- 171 [label="95: 0.17: 0.16"]; + 155 -- 172 [label="105: 0.17: 0.20"]; + 155 -- 173 [label="102: 0.17: 0.20"]; + 155 -- 174 [label="105: 0.17: 0.20"]; + 155 -- 175 [label="80: 0.17: 0.20"]; + 155 -- 176 [label="103: 0.17: 0.19"]; + 155 -- 177 [label="104: 0.17: 0.19"]; + 155 -- 178 [label="104: 0.17: 0.19"]; + 155 -- 179 [label="103: 0.17: 0.19"]; + 155 -- 180 [label="95: 0.17: 0.15"]; + 155 -- 182 [label="92: 0.17: 0.15"]; + 155 -- 183 [label="87: 0.17: 0.17"]; + 155 -- 184 [label="88: 0.17: 0.17"]; + 155 -- 185 [label="88: 0.17: 0.17"]; + 155 -- 186 [label="88: 0.17: 0.17"]; + 155 -- 187 [label="77: 0.17: 0.15"]; + 155 -- 188 [label="80: 0.17: 0.15"]; + 155 -- 189 [label="62: 0.17: 0.16"]; + 155 -- 191 [label="81: 0.17: 0.15"]; + 156 -- 163 [label="92: 0.16: 0.18"]; + 156 -- 164 [label="89: 0.16: 0.17"]; + 156 -- 169 [label="93: 0.16: 0.19"]; + 156 -- 170 [label="93: 0.16: 0.20"]; + 156 -- 172 [label="95: 0.16: 0.20"]; + 156 -- 173 [label="92: 0.16: 0.20"]; + 156 -- 174 [label="95: 0.16: 0.20"]; + 156 -- 176 [label="94: 0.16: 0.19"]; + 156 -- 177 [label="93: 0.16: 0.19"]; + 156 -- 178 [label="94: 0.16: 0.19"]; + 156 -- 179 [label="92: 0.16: 0.19"]; + 156 -- 181 [label="100: 0.16: 0.15"]; + 156 -- 183 [label="93: 0.16: 0.17"]; + 156 -- 184 [label="93: 0.16: 0.17"]; + 156 -- 185 [label="93: 0.16: 0.17"]; + 156 -- 186 [label="93: 0.16: 0.17"]; + 156 -- 190 [label="64: 0.16: 0.15"]; + 156 -- 191 [label="82: 0.16: 0.15"]; + 157 -- 163 [label="85: 0.16: 0.18"]; + 157 -- 164 [label="83: 0.16: 0.17"]; + 157 -- 169 [label="99: 0.16: 0.19"]; + 157 -- 170 [label="99: 0.16: 0.20"]; + 157 -- 172 [label="100: 0.16: 0.20"]; + 157 -- 173 [label="99: 0.16: 0.20"]; + 157 -- 174 [label="100: 0.16: 0.20"]; + 157 -- 176 [label="100: 0.16: 0.19"]; + 157 -- 177 [label="99: 0.16: 0.19"]; + 157 -- 178 [label="99: 0.16: 0.19"]; + 157 -- 179 [label="98: 0.16: 
0.19"]; + 157 -- 183 [label="100: 0.16: 0.17"]; + 157 -- 184 [label="99: 0.16: 0.17"]; + 157 -- 185 [label="99: 0.16: 0.17"]; + 157 -- 190 [label="64: 0.16: 0.15"]; + 158 -- 163 [label="96: 0.17: 0.18"]; + 158 -- 164 [label="93: 0.17: 0.17"]; + 158 -- 169 [label="98: 0.17: 0.19"]; + 158 -- 170 [label="98: 0.17: 0.20"]; + 158 -- 172 [label="100: 0.17: 0.20"]; + 158 -- 173 [label="97: 0.17: 0.20"]; + 158 -- 174 [label="100: 0.17: 0.20"]; + 158 -- 176 [label="99: 0.17: 0.19"]; + 158 -- 177 [label="98: 0.17: 0.19"]; + 158 -- 178 [label="99: 0.17: 0.19"]; + 158 -- 179 [label="97: 0.17: 0.19"]; + 158 -- 181 [label="99: 0.17: 0.15"]; + 158 -- 183 [label="98: 0.17: 0.17"]; + 158 -- 184 [label="98: 0.17: 0.17"]; + 158 -- 185 [label="98: 0.17: 0.17"]; + 158 -- 186 [label="98: 0.17: 0.17"]; + 158 -- 190 [label="62: 0.17: 0.15"]; + 158 -- 191 [label="85: 0.17: 0.15"]; + 159 -- 163 [label="79: 0.17: 0.18"]; + 159 -- 164 [label="77: 0.17: 0.17"]; + 159 -- 169 [label="94: 0.17: 0.19"]; + 159 -- 170 [label="94: 0.17: 0.20"]; + 159 -- 172 [label="95: 0.17: 0.20"]; + 159 -- 173 [label="93: 0.17: 0.20"]; + 159 -- 174 [label="95: 0.17: 0.20"]; + 159 -- 176 [label="94: 0.17: 0.19"]; + 159 -- 177 [label="94: 0.17: 0.19"]; + 159 -- 178 [label="94: 0.17: 0.19"]; + 159 -- 179 [label="93: 0.17: 0.19"]; + 159 -- 181 [label="97: 0.17: 0.15"]; + 159 -- 183 [label="94: 0.17: 0.17"]; + 159 -- 184 [label="94: 0.17: 0.17"]; + 159 -- 185 [label="94: 0.17: 0.17"]; + 159 -- 186 [label="94: 0.17: 0.17"]; + 159 -- 188 [label="88: 0.17: 0.15"]; + 159 -- 190 [label="54: 0.17: 0.15"]; + 159 -- 191 [label="76: 0.17: 0.15"]; + 160 -- 163 [label="84: 0.17: 0.18"]; + 160 -- 164 [label="82: 0.17: 0.17"]; + 160 -- 169 [label="99: 0.17: 0.19"]; + 160 -- 170 [label="99: 0.17: 0.20"]; + 160 -- 172 [label="100: 0.17: 0.20"]; + 160 -- 173 [label="98: 0.17: 0.20"]; + 160 -- 174 [label="100: 0.17: 0.20"]; + 160 -- 176 [label="99: 0.17: 0.19"]; + 160 -- 177 [label="99: 0.17: 0.19"]; + 160 -- 178 [label="99: 0.17: 
0.19"]; + 160 -- 179 [label="98: 0.17: 0.19"]; + 160 -- 181 [label="102: 0.17: 0.15"]; + 160 -- 183 [label="99: 0.17: 0.17"]; + 160 -- 184 [label="99: 0.17: 0.17"]; + 160 -- 185 [label="99: 0.17: 0.17"]; + 160 -- 190 [label="58: 0.17: 0.15"]; + 160 -- 191 [label="84: 0.17: 0.15"]; + 161 -- 163 [label="77: 0.15: 0.18"]; + 161 -- 164 [label="75: 0.15: 0.17"]; + 161 -- 169 [label="98: 0.15: 0.19"]; + 161 -- 170 [label="98: 0.15: 0.20"]; + 161 -- 172 [label="99: 0.15: 0.20"]; + 161 -- 173 [label="98: 0.15: 0.20"]; + 161 -- 174 [label="99: 0.15: 0.20"]; + 161 -- 176 [label="99: 0.15: 0.19"]; + 161 -- 177 [label="98: 0.15: 0.19"]; + 161 -- 178 [label="99: 0.15: 0.19"]; + 161 -- 179 [label="98: 0.15: 0.19"]; + 161 -- 183 [label="99: 0.15: 0.17"]; + 161 -- 184 [label="98: 0.15: 0.17"]; + 161 -- 185 [label="98: 0.15: 0.17"]; + 161 -- 186 [label="98: 0.15: 0.17"]; + 161 -- 190 [label="61: 0.15: 0.15"]; + 162 -- 163 [label="78: 0.15: 0.18"]; + 162 -- 164 [label="76: 0.15: 0.17"]; + 162 -- 169 [label="98: 0.15: 0.19"]; + 162 -- 170 [label="98: 0.15: 0.20"]; + 162 -- 172 [label="99: 0.15: 0.20"]; + 162 -- 173 [label="98: 0.15: 0.20"]; + 162 -- 174 [label="99: 0.15: 0.20"]; + 162 -- 176 [label="99: 0.15: 0.19"]; + 162 -- 177 [label="98: 0.15: 0.19"]; + 162 -- 178 [label="98: 0.15: 0.19"]; + 162 -- 179 [label="97: 0.15: 0.19"]; + 162 -- 183 [label="99: 0.15: 0.17"]; + 162 -- 184 [label="98: 0.15: 0.17"]; + 162 -- 185 [label="98: 0.15: 0.17"]; + 162 -- 186 [label="98: 0.15: 0.17"]; + 162 -- 190 [label="62: 0.15: 0.15"]; + 163 -- 165 [label="81: 0.18: 0.16"]; + 163 -- 166 [label="82: 0.18: 0.16"]; + 163 -- 167 [label="81: 0.18: 0.20"]; + 163 -- 168 [label="80: 0.18: 0.16"]; + 163 -- 169 [label="92: 0.18: 0.19"]; + 163 -- 170 [label="92: 0.18: 0.20"]; + 163 -- 171 [label="79: 0.18: 0.16"]; + 163 -- 172 [label="94: 0.18: 0.20"]; + 163 -- 173 [label="91: 0.18: 0.20"]; + 163 -- 174 [label="94: 0.18: 0.20"]; + 163 -- 175 [label="79: 0.18: 0.20"]; + 163 -- 176 [label="93: 0.18: 0.19"]; + 
163 -- 177 [label="92: 0.18: 0.19"]; + 163 -- 178 [label="91: 0.18: 0.19"]; + 163 -- 179 [label="89: 0.18: 0.19"]; + 163 -- 180 [label="73: 0.18: 0.15"]; + 163 -- 181 [label="76: 0.18: 0.15"]; + 163 -- 182 [label="72: 0.18: 0.15"]; + 163 -- 183 [label="80: 0.18: 0.17"]; + 163 -- 184 [label="80: 0.18: 0.17"]; + 163 -- 185 [label="80: 0.18: 0.17"]; + 163 -- 186 [label="80: 0.18: 0.17"]; + 163 -- 187 [label="58: 0.18: 0.15"]; + 163 -- 189 [label="57: 0.18: 0.16"]; + 163 -- 190 [label="73: 0.18: 0.15"]; + 164 -- 165 [label="78: 0.17: 0.16"]; + 164 -- 166 [label="79: 0.17: 0.16"]; + 164 -- 167 [label="79: 0.17: 0.20"]; + 164 -- 168 [label="78: 0.17: 0.16"]; + 164 -- 169 [label="90: 0.17: 0.19"]; + 164 -- 170 [label="90: 0.17: 0.20"]; + 164 -- 171 [label="76: 0.17: 0.16"]; + 164 -- 172 [label="92: 0.17: 0.20"]; + 164 -- 173 [label="89: 0.17: 0.20"]; + 164 -- 174 [label="92: 0.17: 0.20"]; + 164 -- 175 [label="77: 0.17: 0.20"]; + 164 -- 176 [label="91: 0.17: 0.19"]; + 164 -- 177 [label="90: 0.17: 0.19"]; + 164 -- 178 [label="89: 0.17: 0.19"]; + 164 -- 179 [label="87: 0.17: 0.19"]; + 164 -- 180 [label="71: 0.17: 0.15"]; + 164 -- 181 [label="74: 0.17: 0.15"]; + 164 -- 182 [label="70: 0.17: 0.15"]; + 164 -- 183 [label="78: 0.17: 0.17"]; + 164 -- 184 [label="78: 0.17: 0.17"]; + 164 -- 185 [label="78: 0.17: 0.17"]; + 164 -- 186 [label="78: 0.17: 0.17"]; + 164 -- 187 [label="56: 0.17: 0.15"]; + 164 -- 189 [label="55: 0.17: 0.16"]; + 164 -- 190 [label="70: 0.17: 0.15"]; + 165 -- 169 [label="100: 0.16: 0.19"]; + 165 -- 170 [label="100: 0.16: 0.20"]; + 165 -- 172 [label="101: 0.16: 0.20"]; + 165 -- 173 [label="99: 0.16: 0.20"]; + 165 -- 174 [label="101: 0.16: 0.20"]; + 165 -- 176 [label="100: 0.16: 0.19"]; + 165 -- 177 [label="100: 0.16: 0.19"]; + 165 -- 178 [label="100: 0.16: 0.19"]; + 165 -- 179 [label="99: 0.16: 0.19"]; + 165 -- 181 [label="102: 0.16: 0.15"]; + 165 -- 183 [label="100: 0.16: 0.17"]; + 165 -- 184 [label="100: 0.16: 0.17"]; + 165 -- 190 [label="58: 0.16: 0.15"]; + 
166 -- 169 [label="99: 0.16: 0.19"]; + 166 -- 170 [label="99: 0.16: 0.20"]; + 166 -- 172 [label="100: 0.16: 0.20"]; + 166 -- 173 [label="99: 0.16: 0.20"]; + 166 -- 174 [label="100: 0.16: 0.20"]; + 166 -- 176 [label="100: 0.16: 0.19"]; + 166 -- 177 [label="99: 0.16: 0.19"]; + 166 -- 178 [label="99: 0.16: 0.19"]; + 166 -- 179 [label="98: 0.16: 0.19"]; + 166 -- 181 [label="102: 0.16: 0.15"]; + 166 -- 183 [label="100: 0.16: 0.17"]; + 166 -- 184 [label="99: 0.16: 0.17"]; + 166 -- 185 [label="99: 0.16: 0.17"]; + 166 -- 190 [label="58: 0.16: 0.15"]; + 167 -- 181 [label="75: 0.20: 0.15"]; + 167 -- 188 [label="70: 0.20: 0.15"]; + 167 -- 190 [label="50: 0.20: 0.15"]; + 167 -- 191 [label="63: 0.20: 0.15"]; + 168 -- 169 [label="96: 0.16: 0.19"]; + 168 -- 170 [label="96: 0.16: 0.20"]; + 168 -- 172 [label="97: 0.16: 0.20"]; + 168 -- 173 [label="96: 0.16: 0.20"]; + 168 -- 174 [label="97: 0.16: 0.20"]; + 168 -- 176 [label="97: 0.16: 0.19"]; + 168 -- 177 [label="96: 0.16: 0.19"]; + 168 -- 178 [label="96: 0.16: 0.19"]; + 168 -- 179 [label="95: 0.16: 0.19"]; + 168 -- 181 [label="102: 0.16: 0.15"]; + 168 -- 183 [label="97: 0.16: 0.17"]; + 168 -- 184 [label="96: 0.16: 0.17"]; + 168 -- 185 [label="96: 0.16: 0.17"]; + 168 -- 186 [label="96: 0.16: 0.17"]; + 168 -- 190 [label="57: 0.16: 0.15"]; + 169 -- 171 [label="97: 0.19: 0.16"]; + 169 -- 180 [label="94: 0.19: 0.15"]; + 169 -- 181 [label="98: 0.19: 0.15"]; + 169 -- 182 [label="88: 0.19: 0.15"]; + 169 -- 190 [label="67: 0.19: 0.15"]; + 170 -- 171 [label="97: 0.20: 0.16"]; + 170 -- 180 [label="94: 0.20: 0.15"]; + 170 -- 181 [label="98: 0.20: 0.15"]; + 170 -- 182 [label="88: 0.20: 0.15"]; + 170 -- 190 [label="67: 0.20: 0.15"]; + 171 -- 172 [label="98: 0.16: 0.20"]; + 171 -- 173 [label="97: 0.16: 0.20"]; + 171 -- 174 [label="98: 0.16: 0.20"]; + 171 -- 176 [label="98: 0.16: 0.19"]; + 171 -- 177 [label="97: 0.16: 0.19"]; + 171 -- 178 [label="97: 0.16: 0.19"]; + 171 -- 179 [label="96: 0.16: 0.19"]; + 171 -- 181 [label="98: 0.16: 0.15"]; + 171 
-- 183 [label="98: 0.16: 0.17"]; + 171 -- 184 [label="97: 0.16: 0.17"]; + 171 -- 185 [label="97: 0.16: 0.17"]; + 171 -- 186 [label="97: 0.16: 0.17"]; + 171 -- 190 [label="54: 0.16: 0.15"]; + 172 -- 180 [label="95: 0.20: 0.15"]; + 172 -- 181 [label="100: 0.20: 0.15"]; + 172 -- 182 [label="89: 0.20: 0.15"]; + 172 -- 190 [label="68: 0.20: 0.15"]; + 173 -- 180 [label="94: 0.20: 0.15"]; + 173 -- 181 [label="97: 0.20: 0.15"]; + 173 -- 182 [label="88: 0.20: 0.15"]; + 173 -- 190 [label="65: 0.20: 0.15"]; + 174 -- 180 [label="95: 0.20: 0.15"]; + 174 -- 181 [label="100: 0.20: 0.15"]; + 174 -- 182 [label="89: 0.20: 0.15"]; + 174 -- 190 [label="68: 0.20: 0.15"]; + 175 -- 181 [label="75: 0.20: 0.15"]; + 175 -- 188 [label="70: 0.20: 0.15"]; + 175 -- 190 [label="50: 0.20: 0.15"]; + 175 -- 191 [label="63: 0.20: 0.15"]; + 176 -- 180 [label="95: 0.19: 0.15"]; + 176 -- 181 [label="99: 0.19: 0.15"]; + 176 -- 182 [label="89: 0.19: 0.15"]; + 176 -- 190 [label="66: 0.19: 0.15"]; + 177 -- 180 [label="94: 0.19: 0.15"]; + 177 -- 181 [label="99: 0.19: 0.15"]; + 177 -- 182 [label="88: 0.19: 0.15"]; + 177 -- 190 [label="67: 0.19: 0.15"]; + 178 -- 180 [label="94: 0.19: 0.15"]; + 178 -- 181 [label="99: 0.19: 0.15"]; + 178 -- 182 [label="88: 0.19: 0.15"]; + 178 -- 190 [label="67: 0.19: 0.15"]; + 179 -- 180 [label="93: 0.19: 0.15"]; + 179 -- 181 [label="98: 0.19: 0.15"]; + 179 -- 182 [label="87: 0.19: 0.15"]; + 179 -- 190 [label="66: 0.19: 0.15"]; + 180 -- 181 [label="99: 0.15: 0.15"]; + 180 -- 183 [label="95: 0.15: 0.17"]; + 180 -- 184 [label="94: 0.15: 0.17"]; + 180 -- 185 [label="94: 0.15: 0.17"]; + 180 -- 186 [label="94: 0.15: 0.17"]; + 180 -- 190 [label="54: 0.15: 0.15"]; + 180 -- 191 [label="85: 0.15: 0.15"]; + 181 -- 182 [label="93: 0.15: 0.15"]; + 181 -- 183 [label="83: 0.15: 0.17"]; + 181 -- 184 [label="83: 0.15: 0.17"]; + 181 -- 185 [label="83: 0.15: 0.17"]; + 181 -- 186 [label="84: 0.15: 0.17"]; + 181 -- 187 [label="77: 0.15: 0.15"]; + 181 -- 188 [label="81: 0.15: 0.15"]; + 181 -- 189 
[label="61: 0.15: 0.16"]; + 181 -- 190 [label="71: 0.15: 0.15"]; + 181 -- 191 [label="73: 0.15: 0.15"]; + 182 -- 183 [label="89: 0.15: 0.17"]; + 182 -- 184 [label="88: 0.15: 0.17"]; + 182 -- 185 [label="88: 0.15: 0.17"]; + 182 -- 186 [label="88: 0.15: 0.17"]; + 182 -- 188 [label="86: 0.15: 0.15"]; + 182 -- 189 [label="87: 0.15: 0.16"]; + 182 -- 190 [label="54: 0.15: 0.15"]; + 182 -- 191 [label="79: 0.15: 0.15"]; + 183 -- 190 [label="56: 0.17: 0.15"]; + 183 -- 191 [label="87: 0.17: 0.15"]; + 184 -- 190 [label="58: 0.17: 0.15"]; + 184 -- 191 [label="86: 0.17: 0.15"]; + 185 -- 190 [label="58: 0.17: 0.15"]; + 185 -- 191 [label="86: 0.17: 0.15"]; + 186 -- 190 [label="58: 0.17: 0.15"]; + 186 -- 191 [label="86: 0.17: 0.15"]; + 187 -- 188 [label="67: 0.15: 0.15"]; + 187 -- 190 [label="44: 0.15: 0.15"]; + 187 -- 191 [label="66: 0.15: 0.15"]; + 188 -- 189 [label="66: 0.15: 0.16"]; + 188 -- 190 [label="46: 0.15: 0.15"]; + 189 -- 190 [label="34: 0.16: 0.15"]; + 189 -- 191 [label="60: 0.16: 0.15"]; + 190 -- 191 [label="51: 0.15: 0.15"]; +} diff --git a/ecoPrimerCommands b/ecoPrimerCommands new file mode 100644 index 0000000..8ec7636 --- /dev/null +++ b/ecoPrimerCommands @@ -0,0 +1,15 @@ + + +./ecoPrimer -d /Groups/Barcode-Leca/eubacteria-gr -l 10 -L 1000 -e 3 > euBactResults.txt +./ecoPrimer -d ChloroDB/chloroplast -l 5 -L 120 -r 58023 -e 3 > chloroVascularPlantsEric.txt +./ecoPrimer -d /Users/tiayyba/Documents/Data/mitochondrion/mitochondrion -q 0.4 -s 0.5 -l 10 -L 60 -r 1 -i 1 -T 0.2 -p > setsRoot.txt + + + ./ecoPCR -d /Users/tiayyba/Documents/workspace/ecoPrimers/src/mitochondrion/mitochondrion -l 50 -L 120 -r 7742 TAGAACAGGCTCCTCTAG TTAGATACCCCACTATGC > 12SV5.ecoPCR + ecoTaxSpecificity -d /Users/tiayyba/Documents/workspace/ecoPrimers/src/mitochondrion/mitochondrion /Users/tiayyba/Documents/workspace/ecoPCR/src/12SV5.ecoPCR + + + + 149 ./ecoPrimer -d /Users/tiayyba/Documents/Data/mitochondrion/mitochondrion -r 40674 -E 9606 -l 10 -L 100 > MamalsNotHomoSapien.primers + 150 
./ecoPrimer -d /Users/tiayyba/Documents/Data/mitochondrion/mitochondrion -q 0.4 -s 0.5 -r 40674 -E 9606 -l 10 -L 100 > MamalsNotHomoSapien.primers + 151 ./ecoPrimer -d /Users/tiayyba/Documents/Data/mitochondrion/mitochondrion -q 0.5 -s 0.6 -r 40674 -E 9606 -l 10 -L 100 > MamalsNotHomoSapien1.primers diff --git a/src copy.zip b/src copy.zip new file mode 100644 index 0000000..dd1a43c Binary files /dev/null and b/src copy.zip differ diff --git a/src/.Rhistory b/src/.Rhistory new file mode 100644 index 0000000..4eaef7b --- /dev/null +++ b/src/.Rhistory @@ -0,0 +1,250 @@ +plot(s$seq, s$size, xlab="Sequence Count", ylab="Memory used[B]", main="a. memory used vs sequence count without data mining") +abline(v = 273, col = "Blue") +text(400, 0.7e+09, "nSeq = 273") +plot(t$seq, t$size, xlab="Sequence Count", ylab="Memory used[B]", main="b. memory used vs sequence count with data mining") +abline(v = 273, col = "Blue") +text(450, 700, "nSeq = 273") +plot(s$seq, s$time, xlab="Sequence Count", ylab="Time[s]", main="c. time vs sequence count withount data mining") +abline(v = 273, col = "Blue") +text(450, 300, "nSeq = 273") + plot(t$seq, t$time, xlab="Sequence Count", ylab="Time[s]", main="d. time vs sequence count with data mining") +abline(v = 273, col = "Blue") +text(350, 20, "nSeq = 273") +par(mfrow=c(2,2)) +plot(s$seq, s$size, xlab="Sequence Count", ylab="Memory used[B]", main="a. memory used vs sequence count without data mining") +abline(v = 273, col = "Blue") +text(400, 0.6e+09, "nSeq = 273") +plot(t$seq, t$size, xlab="Sequence Count", ylab="Memory used[B]", main="b. memory used vs sequence count with data mining") +abline(v = 273, col = "Blue") +text(400, 600, "nSeq = 273") +plot(s$seq, s$time, xlab="Sequence Count", ylab="Time[s]", main="c. time vs sequence count withount data mining") +abline(v = 273, col = "Blue") +text(400, 300, "nSeq = 273") + plot(t$seq, t$time, xlab="Sequence Count", ylab="Time[s]", main="d. 
time vs sequence count with data mining") +abline(v = 273, col = "Blue") +text(400, 15, "nSeq = 273") +par(mfrow=c(2,2)) +plot(s$seq, s$size, xlab="Sequence Count", ylab="Memory used[B]", main="a. memory used vs sequence count without data mining") +abline(v = 273, col = "Blue") +text(400, 0.6e+09, "Nseq = 273") +plot(t$seq, t$size, xlab="Sequence Count", ylab="Memory used[B]", main="b. memory used vs sequence count with data mining") +abline(v = 273, col = "Blue") +text(400, 600, "Nseq = 273") +plot(s$seq, s$time, xlab="Sequence Count", ylab="Time[s]", main="c. time vs sequence count withount data mining") +abline(v = 273, col = "Blue") +text(400, 300, "Nseq = 273") +plot(t$seq, t$time, xlab="Sequence Count", ylab="Time[s]", main="d. time vs sequence count with data mining") +abline(v = 273, col = "Blue") +text(400, 15, "Nseq = 273") +s = read.table('/Users/tiayyba/Desktop/euBact/ecoprimer_71493.log', header= T) +t = read.table('/Users/tiayyba/Desktop/euBact/ecoprimer_84784.log', header= T) +par(mfrow=c(2,2)) +plot(s$seq, s$size, xlab="Sequence Count", ylab="Memory used[B]", main="a. memory used vs sequence count without data mining") +plot(s$seq, s$size*1.5, xlab="Sequence Count", ylab="Memory used[B]", main="a. memory used vs sequence count without data mining") +abline(v = 273, col = "Blue") +text(450, 3e+09, "nSeq = 273") +plot(t$seq, t$size*1.5, xlab="Sequence Count", ylab="Memory used[B]", main="b. memory used vs sequence count with data mining") +abline(v = 273, col = "Blue") +text(450, 1150, "nSeq = 273") +par(mfrow=c(2,2)) +plot(s$seq, s$size*1.5, xlab="Sequence Count", ylab="Memory used[B]", main="a. memory used vs sequence count without data mining") +abline(v = 273, col = "Blue") +text(450, 3e+09, "nSeq = 273") +plot(t$seq, t$size*1.5, xlab="Sequence Count", ylab="Memory used[B]", main="b. 
memory used vs sequence count with data mining") + abline(v = 273, col = "Blue") +text(450, 1150, "nSeq = 273") +par(mfrow=c(2,2)) +plot(s$seq, s$size*1.5, xlab="Sequence Count", ylab="Memory used[B]", main="a. memory used vs sequence count without data mining") + abline(v = 273, col = "Blue") +text(450, 1.7e+09, "nSeq = 273") +par(mfrow=c(2,2)) +plot(s$seq, s$size*1.5, xlab="Sequence Count", ylab="Memory used[B]", main="a. memory used vs sequence count without data mining") + abline(v = 273, col = "Blue") +text(450, 1.4e+09, "nSeq = 273") +plot(t$seq, t$size*1.5, xlab="Sequence Count", ylab="Memory used[B]", main="b. memory used vs sequence count with data mining") + abline(v = 273, col = "Blue") +text(450, 500, "nSeq = 273") +par(mfrow=c(2,2)) +plot(s$seq, s$size*1.5, xlab="Sequence Count", ylab="Memory used[B]", main="a. memory used vs sequence count without data mining") +abline(v = 273, col = "Blue") +text(450, 1.4e+09, "nSeq = 273") +plot(t$seq, t$size*1.5, xlab="Sequence Count", ylab="Memory used[B]", main="b. memory used vs sequence count with data mining") +abline(v = 273, col = "Blue") +text(450, 500, "nSeq = 273") +text(450, 1000, "nSeq = 273") +par(mfrow=c(2,2)) +plot(s$seq, s$size*1.5, xlab="Sequence Count", ylab="Memory used[B]", main="a. memory used vs sequence count without data mining") +abline(v = 273, col = "Blue") +text(450, 1.3e+09, "nSeq = 273") +plot(t$seq, t$size*1.5, xlab="Sequence Count", ylab="Memory used[B]", main="b. memory used vs sequence count with data mining") +abline(v = 273, col = "Blue") +text(450, 1000, "nSeq = 273") +plot(s$seq, s$time, xlab="Sequence Count", ylab="Time[s]", main="c. time vs sequence count withount data mining") +abline(v = 273, col = "Blue") +text(450, 700, "nSeq = 273") + plot(t$seq, t$time, xlab="Sequence Count", ylab="Time[s]", main="d. 
time vs sequence count with data mining") +abline(v = 273, col = "Blue") +text(450, 30, "nSeq = 273") +s = read.table('/Users/tiayyba/Desktop/UsedistRef1.txt', header = T) +s +plot(s$Distance, s$Count) +plot(s$Distance, log(s$Count)) +plot(log(s$Distance), log(s$Count)) +plot(s$Distance, log(s$Count)) +s = read.table('/Users/tiayyba/Desktop/UU_results/UU_F83/plots/F83_distRef1.old.txt', header = T) +plot(s$Distance, log(s$Count)) +s = read.table('/Users/tiayyba/Desktop/UsedistRef1.txt', header = T) +plot(s$Distance*100, log(s$Count)) +s = read.table('/Users/tiayyba/Desktop/UU_results/UU_F83/plots/F83_distRef1.old.txt', header = T) +plot(s$Distance, log(s$Count)) +s = read.table('/Users/tiayyba/Desktop/UsedistRef1.txt', header = T) +plot(s$Distance, log(s$Count)) +s = read.table('/Users/tiayyba/Desktop/UU_results/UU_F83/plots/F83_distRef1.old.txt', header = T) +plot(s$Distance, log(s$Count)) +s = read.table('/Users/tiayyba/Desktop/UsedistRef1.txt', header = T) +plot(s$Distance, log(s$Count)) +plot(s$Distance*1000, log(s$Count)) +s = read.table('/Users/tiayyba/Desktop/UsedistRef1.txt', header = T) +plot(s$Distance*1000, log(s$Count)) +u = read.table('/Users/tiayyba/Desktop/UU_FdistRef2.txt', header = T) +u +max(u$Count) +max(u$Distance) +plot(u$Distance, u$Count) +plot(log(u$Distance), log(u$Count)) +plot(u$Distance, u$Count, log(xy)) +plot(u$Distance, u$Count, log=xy) +plot(u$Distance, u$Count, log='xy') +plot(u$Distance+1, u$Count, log='xy') +plot(u$Distance, u$Count, log='xy') +?plot +s = read.table('/Users/tiayyba/Desktop/UU_FdistRef1.txt', header = T) +plot(s$Distance, s$Count) +plot(log(s$Distance), log(s$Count)) +plot(s$Distance, s$Count, log='xy') +plot(s$Distance, log(s$Count)) +s = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F83/plots/F83_distRef1.old.txt', header = T) +s +sNew = s[order(s$Count),] +sNew +plot(s$Distance, s$Count) +s = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F83/plots/F83_distRef1.old.txt', 
header = T) +t = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F83/plots/F83_distRef2.old.txt', header = T) +s = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F83/plots/F83_distRef2.old.txt', header = T) +u = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F83/plots/F83_distRef1.old.txt', header = T) + t = data.frame(count=s$Count,uu=u$Distance,cs=s$Distance,col=(s$Distance < u$Distance)+1) +plot(s$Distance,s$Count,log='xy',col=t$col) + par(mfrow=c(1,2)) +plot(s$Distance+1,s$Count,log='xy',col=t$col) +plot(u$Distance+1,u$Count,log='xy',col=t$col) +s = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F83/plots/F83_distRef2.old.txt', header = T) +u = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F83/plots/F83_distRef1.old.txt', header = T) + t = data.frame(count=s$Count,uu=u$Distance,cs=s$Distance,col=(s$Distance < u$Distance)+1) +plot(u$Distance+1,u$Count,log='xy',col=t$col) + par(mfrow=c(1,2)) +plot(s$Distance+1,s$Count,log='xy',col=t$col) +plot(u$Distance+1,u$Count,log='xy',col=t$col) +s = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F82/plots/F82_distRef1.txt', header = F) +s = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F82/plots/F82_distRef1.txt', header = T) +s = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F82/plots/F82_distRef2.txt', header = T) +u = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F82/plots/F82_distRef1.txt', header = T) +t = data.frame(count=s$Count,uu=u$Distance,cs=s$Distance,col=(s$Distance < u$Distance)+1) +t = data.frame(count=u$Count,uu=u$Distance,cs=s$Distance,col=(s$Distance < u$Distance)+1) +s = read.table('/Users/tiayyba/Desktop/abc'header = F) +s = read.table('/Users/tiayyba/Desktop/abc', header = F) +plot(s) +s +s = read.table('/Users/tiayyba/Desktop/abc', header = F) +s +plot(s$V2, s$V1) +plot(s$V1, s$V2) +plot(s$V1, s$V2, xlab="sequence number", ylab="sequence 
count") +s = read.table('/Users/tiayyba/Desktop/abc', header = F) +s +s = read.table('/Users/tiayyba/Desktop/abc', header = F) +s +Sr tacg gcta ctag actg par(mfrow=c(2,2)) + plot(s$V1, s$V2, xlab="sequence number", ylab="sequence count", main = "sample: tacg") + plot(s$V1, s$V3, xlab="sequence number", ylab="sequence count", main = "sample: gcta") + plot(s$V1, s$V4, xlab="sequence number", ylab="sequence count", main = "sample: ctag") + plot(s$V1, s$V5, xlab="sequence number", ylab="sequence count", main = "sample: actg") +s = read.table('/Users/tiayyba/Desktop/abc', header = F) +s + plot(s$V1, s$V2, xlab="sequence number", ylab="sequence count", main = "sample: Ranunculus_acris") +Sr tacg gcta ctag actg par(mfrow=c(2,2)) + plot(s$V1, s$V2, xlab="sequence number", ylab="sequence count", main = "sample: Ranunculus_acris") + plot(s$V1, s$V3, xlab="sequence number", ylab="sequence count", main = "sample: Luzula_sudetica") + plot(s$V1, s$V4, xlab="sequence number", ylab="sequence count", main = "sample: Deschampsia_cespitosa") + plot(s$V1, s$V5, xlab="sequence number", ylab="sequence count", main = "sample: Cardamine_pratensis_paludosa") +s = read.table('/Users/tiayyba/Desktop/abc', header = F) +par(mfrow=c(2,2)) + plot(s$V1, s$V2, xlab="sequence number", ylab="sequence count", main = "sample 1: Ranunculus_acris") + plot(s$V1, s$V3, xlab="sequence number", ylab="sequence count", main = "sample 2: Luzula_sudetica") + plot(s$V1, s$V4, xlab="sequence number", ylab="sequence count", main = "sample 3: Deschampsia_cespitosa") + plot(s$V1, s$V5, xlab="sequence number", ylab="sequence count", main = "sample 4: Cardamine_pratensis_paludosa") +s = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F83/plots/F83_dist.txt', header = F) +s = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F83/plots/F83_dist.txt', header = F, row.names = 1) +s = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F83/plots/F83_dist.txt', header = F, 
row.names=1) +s = read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F83/plots/F83_dist.txt',row.names=1) +s + plot(s$count, s$uu) + plot(s$uu, s$count) + plot(s$uu, s$count, col = s$col) + plot(s$cs, s$count, col = s$col) + plot(s$cs, s$count, col = s$col, col = (s$uu < s$cs) +1) + plot(s$uu, s$count) + plot(log(s$uu), log(s$count)) + plot(s$uu, log(s$count)) + plot(log(s$uu), log(s$count)) + plot(s$uu, s$count) + plot(s$uu, log(s$count)) + plot(s$uu, log(s$count), xlab = "distance from Uncia uncia", ylab = "log(SequenceCount)") += read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F83/plots/F83_dist.txt', header = T) +s= read.table('/Users/tiayyba/Desktop/ErrModel/Borneo/UnciaUncia/UU_F83/plots/F83_dist.txt', header = T) +s +par(mfrow = c(1,2)) +plot(s$cs, s$count, col = s$col) +plot(s$cs, s$count,log = 'xy', col = s$col) +par(mfrow = c(1,2)) +plot(s$cs, s$count,log = 'xy', col = s$col) +plot(s$uu, s$count,log = 'xy', col = s$col) +score = (0.407)*0.890*exp(-sqrt(692.33/1.247)) +score +score1 = (0.511)*0.904*exp(-sqrt(481/148.5)) +score1 +score1 = sqrt(481/148.5) +score1 +score = sqrt(692.33/1.247) +score +score = (0.407)^2*0.890*sqrt(692.33/1.247) +score +score1 = (0.511)^2*0.904*sqrt(481/148.5) +score1 +score = (0.407)*0.890*sqrt(692.33/1.247) +score +score1 = (0.511)*0.904*sqrt(481/148.5) +score1 +score = (0.407)*0.890*692.33/sqrt(1.247) +score +score1 = (0.511)*0.904*481/sqrt(148.5) +score1 +sum = 714 + 710 + 699 +sum +2123/3 +1760+3210+4950+2090 +12010+3300+220 +4400+385+55+495+715+800+450+2200+1540+1210+550 +944 - 220 +724 - 40 +684 -10 +364-278 +944-657 +250+220 +7*9 +q +quit +s = read.table('metazoas_Mdyn_T50.gv') +s = read.table('metazoas_Mdyn_T50.gv.hist.gv') +t = read.table('test.gv') +hist(t$V3) +hist(s$V3) diff --git a/src/Documents/CalculTM.xls b/src/Documents/CalculTM.xls new file mode 100644 index 0000000..094eb31 Binary files /dev/null and b/src/Documents/CalculTM.xls differ diff --git a/src/Makefile b/src/Makefile new 
file mode 100644 index 0000000..2098184 --- /dev/null +++ b/src/Makefile @@ -0,0 +1,78 @@ +EXEC=ecoPrimers + +PRIMER_SRC= ecoprimer.c +PRIMER_OBJ= $(patsubst %.c,%.o,$(PRIMER_SRC)) + + +SRCS= $(PRIMER_SRC) + +LIB= -lecoprimer -lecoPCR -lthermo -lz -lm + +LIBFILE= libecoPCR/libecoPCR.a \ + libecoprimer/libecoprimer.a \ + libthermo/libthermo.a \ + + + +include global.mk + +all: $(EXEC) + + +######## +# +# ecoPrimer compilation +# +######## + +# executable compilation and link + +ecoPrimers: $(PRIMER_OBJ) $(LIBFILE) + $(CC) -g $(LDFLAGS) -O5 -m64 -o $@ $< $(LIBPATH) $(LIB) + + +######## +# +# library compilation +# +######## + +libecoPCR/libecoPCR.a: + $(MAKE) -C libecoPCR + +libecoprimer/libecoprimer.a: + $(MAKE) -C libecoprimer + +libthermo/libthermo.a: + $(MAKE) -C libthermo + +######## +# +# project management +# +######## + +clean: + rm -f *.o + rm -f $(EXEC) + $(MAKE) -C libecoPCR clean + $(MAKE) -C libecoprimer clean + $(MAKE) -C libthermo clean + + + +######## +# +# clean for k2 to remove .o and .P files +# +######## + +k2clean: + rm -f *.o + rm -f *.P + rm -f libecoPCR/*.o + rm -f libecoPCR/*.P + rm -f libecoprimer/*.o + rm -f libecoprimer/*.P + rm -f libthermo/*.o + rm -f libthermo/*.P diff --git a/src/ecoPrimers b/src/ecoPrimers new file mode 100755 index 0000000..9bbb2ea Binary files /dev/null and b/src/ecoPrimers differ diff --git a/src/ecoprimer.P b/src/ecoprimer.P new file mode 100644 index 0000000..0c50432 --- /dev/null +++ b/src/ecoprimer.P @@ -0,0 +1,32 @@ +ecoprimer.o ecoprimer.P : ecoprimer.c libecoprimer/ecoprimer.h /usr/include/inttypes.h \ + /usr/include/sys/cdefs.h /usr/include/_types.h \ + /usr/include/sys/_types.h /usr/include/machine/_types.h \ + /usr/include/i386/_types.h \ + /usr/lib/gcc/i686-apple-darwin10/4.2.1/include/stdint.h \ + /usr/include/stdlib.h /usr/include/Availability.h \ + /usr/include/AvailabilityInternal.h /usr/include/sys/wait.h \ + /usr/include/sys/signal.h /usr/include/sys/appleapiopts.h \ + /usr/include/machine/signal.h 
/usr/include/i386/signal.h \ + /usr/include/i386/_structs.h /usr/include/sys/_structs.h \ + /usr/include/machine/_structs.h /usr/include/mach/i386/_structs.h \ + /usr/include/sys/resource.h /usr/include/machine/endian.h \ + /usr/include/i386/endian.h /usr/include/sys/_endian.h \ + /usr/include/libkern/_OSByteOrder.h \ + /usr/include/libkern/i386/_OSByteOrder.h /usr/include/alloca.h \ + /usr/include/machine/types.h /usr/include/i386/types.h \ + /usr/include/i386/_types.h /usr/include/stdio.h \ + /usr/include/secure/_stdio.h /usr/include/secure/_common.h \ + libecoprimer/ecotype.h libecoprimer/../libecoPCR/ecoPCR.h \ + libecoprimer/../libthermo/nnparams.h /usr/include/math.h \ + /usr/include/architecture/i386/math.h /usr/include/string.h \ + /usr/include/secure/_string.h libecoprimer/apat.h \ + libecoprimer/libstki.h libecoprimer/debug.h libecoprimer/PrimerSets.h \ + libecoprimer/ecoprimer.h libecoprimer/ahocorasick.h \ + /usr/include/ctype.h /usr/include/runetype.h /usr/include/getopt.h \ + /usr/include/unistd.h /usr/include/sys/unistd.h \ + /usr/include/sys/select.h /usr/include/sys/_select.h \ + /usr/include/time.h /usr/include/_structs.h /usr/include/sys/time.h \ + /usr/include/dlfcn.h \ + /usr/lib/gcc/i686-apple-darwin10/4.2.1/include/stdbool.h \ + /usr/include/AvailabilityMacros.h libthermo/nnparams.h \ + libthermo/thermostats.h libthermo/../libecoprimer/ecoprimer.h diff --git a/src/ecoprimer.c b/src/ecoprimer.c new file mode 100755 index 0000000..b9c996c --- /dev/null +++ b/src/ecoprimer.c @@ -0,0 +1,1019 @@ +/* + * ecoprimer.c + * + * Created on: 7 nov. 
2008 + * Author: coissac + */ + +#include "libecoprimer/ecoprimer.h" +#include "libecoprimer/PrimerSets.h" +#include "libecoprimer/ahocorasick.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include"libthermo/nnparams.h" +#include"libthermo/thermostats.h" + + +#define VERSION "0.3" + /* TR: by default, statistics are made on species level*/ +#define DEFAULTTAXONRANK "species" + +static int cmpprintedpairs(const void* p1,const void* p2); +//float _Z27calculateMeltingTemperature_ (char * seq1, char * seq2); +pwordcount_t reduce_words_to_debug (pwordcount_t words, poptions_t options); +void print_wordwith_positions (primer_t prm, uint32_t seqdbsize, poptions_t options); + +void* lib_handle = NULL; +float (*calcMelTemp)(char*, char*); + +/* ----------------------------------------------- */ +/* printout help */ +/* ----------------------------------------------- */ +#define PP fprintf(stdout, + +static void PrintHelp() +{ + PP "------------------------------------------\n"); + PP " ecoPrimer Version %s\n", VERSION); + PP "------------------------------------------\n"); + PP "synopsis : finding primers and measureing the quality of primers and barcode region\n"); + PP "usage: ./ecoPrimer [options] \n"); + PP "------------------------------------------\n"); + PP "options:\n"); + PP "-d : [D]atabase : to match the expected format, the database\n"); + PP " has to be formated first by the ecoPCRFormat.py program located.\n"); + PP " in the ecoPCR/tools directory.\n"); + PP " ecoPCRFormat.py creates three file types :\n"); + PP " .sdx : contains the sequences\n"); + PP " .tdx : contains information concerning the taxonomy\n"); + PP " .rdx : contains the taxonomy rank\n\n"); + PP " ecoPrimer needs all the file type. As a result, you have to write the\n"); + PP " database radical without any extension. 
For example /ecoPrimerDB/fstvert\n\n"); + PP "-e : [E]rror : max error allowed by oligonucleotide (0 by default)\n\n"); + PP "-h : [H]elp - print help\n\n"); + PP "-i : [I]gnore the given taxonomy id (define the counterexample taxon set).\n\n"); + PP "-l : minimum [L]ength : define the minimum amplication length. \n\n"); + PP "-L : maximum [L]ength : define the maximum amplicationlength. \n\n"); + PP "-r : [R]estricts the search to the given taxonomic id (restrict the example taxon set).\n\n"); + PP "-E : [E]xception taxid allows to indicate than some subclade of example sequences are conterexamples.\n\n"); + PP "-c : Consider that the database sequences are [c]ircular\n\n"); + PP "-3 : Three prime strict match\n\n"); + PP "-q : Strict matching [q]uorum, percentage of the sequences in which strict primers are found. By default it is 70\n\n"); + PP "-s : [S]ensitivity quorum\n\n"); + PP "-t : required [t]axon level for results, by default the results are computed at species level\n\n"); + PP "-x : false positive quorum\n\n"); + PP "-D : set in [d]ouble strand mode\n\n"); + PP "-O : set the primer length (default 18) \n\n"); + PP "-S : Set in [s]ingle strand mode\n\n"); + PP "-m : Salt correction method for Tm computation (SANTALUCIA : 1 or OWCZARZY:2, default=1)\n\n"); + PP "-a : Salt contentration in M for Tm computation (default 0.05 M)\n\n"); + PP "-U : No multi match\n\n"); + PP "-R : Define the [R]eference sequence identifier (must be part of example set)\n\n"); + PP "-A : Print the list of all identifier of sequences present in the database\n\n"); + PP "-f : Remove data mining step during strict primer identification\n\n"); + PP "-v : Store statistic file about memory usage during strict primer identification\n\n"); + PP "-p : Print sets of primers\n\n"); + PP "-T : Ignore pairs having specificity below this Threshold\n\n"); + PP "\n"); + PP "------------------------------------------\n"); + PP "Table result description : \n"); + PP "column 1 : serial 
number\n"); + PP "column 2 : primer1\n"); + PP "column 3 : primer2\n"); + PP "column 4 : primer1 Tm without mismatch\n"); + PP "column 5 : primer1 lowest Tm against exemple sequences\n"); + PP "column 6 : primer2 Tm without mismatch\n"); + PP "column 7 : primer2 lowest Tm against exemple sequences\n"); + PP "column 8 : primer1 G+C count\n"); + PP "column 9 : primer2 G+C count\n"); + PP "column 10 : good/bad\n"); + PP "column 11 : amplified example sequence count\n"); + PP "column 12 : amplified counterexample sequence count\n"); + PP "column 13 : yule\n"); + PP "column 14 : amplified example taxa count\n"); + PP "column 15 : amplified counterexample taxa count\n"); + PP "column 16 : ratio of amplified example taxa versus all example taxa (Bc index)\n"); + PP "column 17 : unambiguously identified example taxa count\n"); + PP "column 18 : ratio of specificity unambiguously identified example taxa versus all example taxa (Bs index)\n"); + PP "column 19 : minimum amplified length\n"); + PP "column 20 : maximum amplified length\n"); + PP "column 21 : average amplified length\n"); + PP "------------------------------------------\n"); + PP " http://www.grenoble.prabi.fr/trac/ecoPrimer/\n"); + PP "------------------------------------------\n\n"); + PP "\n"); + +} + +static void ExitUsage(int stat) +{ + PP "usage: ecoprimer [-d database] [-l value] [-L value] [-e value] [-r taxid] [-i taxid] [-R rank] [-t taxon level]\n"); + PP "type \"ecoprimer -h\" for help\n"); + + if (stat) + exit(stat); +} + +#undef PP + +void initoptions(poptions_t options) +{ + options->statistics=FALSE; + options->filtering=TRUE; + options->lmin=0; //< Amplifia minimal length + options->lmax=1000; //< Amplifia maximal length + options->error_max=3; //**< maximum error count in fuzzy search + options->primer_length=18; //**< minimal length of the primers + options->restricted_taxid=NULL; //**< limit amplification below these taxid + options->ignored_taxid=NULL; //**< no amplification below these 
taxid + options->exception_taxid=NULL; //**< no amplification below these taxid + options->prefix=NULL; + options->reference=NULL; + options->refseq=NULL; + options->circular=0; + options->doublestrand=1; + options->strict_quorum=0.7; + options->strict_exclude_quorum=0.1; + options->sensitivity_quorum=0.9; + options->false_positive_quorum=0.1; + options->strict_three_prime=0; + options->r=0; + options->g=0; + options->e=0; + options->no_multi_match=FALSE; + options->pnparm = NULL; + strcpy(options->taxonrank, DEFAULTTAXONRANK); /*taxon level for results, species by default*/ + options->saltmethod = SALT_METHOD_SANTALUCIA; + options->salt = DEF_SALT; + options->printAC=FALSE; + options->print_sets_of_primers = FALSE; + options->specificity_threshold = 0.6; + options->links_cnt = 1; + options->max_links_percent = -1; /*graph only those primers having maximum 15% links*/ + options->filter_on_links = TRUE; +} + +void printapair(int32_t index,ppair_t pair, poptions_t options) +{ + bool_t asdirect1=pair->asdirect1; + bool_t asdirect2=pair->asdirect2; + bool_t asdirecttmp; + word_t w1=pair->p1->word; + word_t w2=pair->p2->word; + word_t wtmp; + bool_t good1=pair->p1->good; + bool_t good2=pair->p2->good; + bool_t goodtmp; + bool_t strand; + uint32_t i, j; + float temp; + CNNParams nnparams; + + //nparam_InitParams(&nnparams, DEF_CONC_PRIMERS,DEF_CONC_SEQUENCES,DEF_SALT,SALT_METHOD_SANTALUCIA); + + char *c; + char p1[32]; + char p2[32]; + + if (!asdirect1) + w1=ecoComplementWord(w1,options->primer_length); + + if (!asdirect2) + w2=ecoComplementWord(w2,options->primer_length); + + + if (w2 < w1) + { + wtmp=w1; + w1=w2; + w2=wtmp; + + asdirecttmp=asdirect1; + asdirect1=asdirect2; + asdirect2=asdirecttmp; + + goodtmp=good1; + good1=good2; + good2=goodtmp; + } + + //print serial number + printf("%6d\t",index); + + c = ecoUnhashWord(w1,options->primer_length); + strcpy (p1, c); + c = ecoUnhashWord(w2,options->primer_length); + strcpy (p2, c); + + //print primer1 + printf("%s\t", 
p1); + + //print primer2 + printf("%s", p2); + + //print primer1 melting temperature + printf ("\t%3.1f", pair->p1temp); + + //print minimum melting temperature of approximate versions of primer1 + printf ("\t%3.1f", pair->p1mintemp); + + //print primer2 melting temperature + printf ("\t%3.1f", pair->p2temp); + + //print minimum melting temperature of approximate versions of primer2 + printf ("\t%3.1f", pair->p2mintemp); + + //print gc contents of primer1 + printf ("\t%d",nparam_CountGCContent(p1)); + + //print gc contents of primer2 + printf ("\t%d",nparam_CountGCContent(p2)); + + //print good/bad pair indicator + printf("\t%c%c", "bG"[(int)good1],"bG"[(int)good2]); + + //print inexample count + printf("\t%d", pair->inexample); + + //print out example count + printf("\t%d", pair->outexample); + + //print yule + printf("\t%4.3f", pair->yule); + + //print in taxa count + printf("\t%d", pair->intaxa); + + //print out taxa count + printf("\t%d", pair->outtaxa); + + //print coverage + printf("\t%4.3f", (float)pair->bc); + + //print well identified taxa count + printf("\t%d", pair->intaxa - pair->notwellidentifiedtaxa); + + //print specificity + printf("\t%4.3f", pair->bs); + + //print min amplifia lenght + printf("\t%d", pair->mind); + + //print max amplifia lenght + printf("\t%d", pair->maxd); + + //print average amplifia lenght + printf("\t%3.2f", (float)pair->sumd/pair->amplifiacount); + + //print amplifia information about reference sequence if specified + if (options->refseq && pair->refsequence >=0) + { + printf("\t%s:",options->reference); + strand = pair->pcr.amplifias[pair->refsequence].strand; + + if (strand) + printf("join("); + else + printf("complement("); + + printf("%d..%d,%d..%d",pair->pcr.amplifias[pair->refsequence].begin - options->primer_length + 1, + pair->pcr.amplifias[pair->refsequence].begin, + pair->pcr.amplifias[pair->refsequence].end + 2, + pair->pcr.amplifias[pair->refsequence].end + options->primer_length + 1 + ); + printf(")"); + 
printf("\t"); + + for (c=pair->pcr.amplifias[pair->refsequence].amplifia, + i=pair->pcr.amplifias[pair->refsequence].begin; + i<=pair->pcr.amplifias[pair->refsequence].end; + i++, + c+=(strand)? 1:-1) + printf("%c","acgt"[(strand)? (*c):(~*c)&3]); + + + } + else + printf("\t\t"); + +/* j=0; + for (i=0; idbsize; i++) + if (pair->wellIdentifiedSeqs[i] == 1) + j++; + printf("%d", j);*/ + + printf("\n"); + +} + +static int cmpprintedpairs(const void* p1,const void* p2) +{ + float s1,s2; + ppair_t pair1,pair2; + + pair1=*((ppair_t*)p1); + pair2=*((ppair_t*)p2); + + s1 = pair1->yule * pair1->bs; + s2 = pair2->yule * pair2->bs; + +// fprintf(stderr,"s1 : %4.3f %4.3f %4.3f\n",pair1->yule , pair1->bs,s1); +// fprintf(stderr,"s2 : %4.3f %4.3f %4.3f\n\n",pair2->yule , pair2->bs,s2); + + if (s1 > s2) return -1; + if (s1 < s2) return 1; + return 0; +} + +uint32_t filterandsortpairs(ppair_t* sortedpairs,uint32_t count, poptions_t options, pecodnadb_t seqdb) +{ + uint32_t i,j; + float q,qfp; + + for (i=0,j=0;i < count;i++) + { + if (options->insamples) + q = (float)sortedpairs[i]->inexample/options->insamples; + else q=1.0; + + if (options->outsamples) + qfp = (float)sortedpairs[i]->outexample/options->outsamples; + else qfp=0.0; + + sortedpairs[i]->wellIdentifiedSeqs = NULL; //TR 05/09/10 - wellIdentified needed for primer sets + sortedpairs[i]->coveredSeqs = NULL; //TR 05/09/10 - wellIdentified needed for primer sets + sortedpairs[i]->quorumin = q; + sortedpairs[i]->quorumout = qfp; + sortedpairs[i]->yule = q - qfp; + sortedpairs[j]=sortedpairs[i]; + + if (q > options->sensitivity_quorum && + qfp < options->false_positive_quorum) + { + //TR 05/09/10 - wellIdentified needed for primer sets + sortedpairs[j]->wellIdentifiedSeqs = ECOMALLOC(options->dbsize * sizeof(int),"Cannot allocate well_identified_array"); + sortedpairs[j]->coveredSeqs = ECOMALLOC(options->dbsize * sizeof(int),"Cannot allocate well_identified_array"); + (void)taxonomycoverage(sortedpairs[j],options, seqdb, 
options->dbsize);
+            taxonomyspecificity(sortedpairs[j], seqdb, options->dbsize);
+            //j++;
+            //if specificity less than user provieded threshold (default 60%) then ignore this pair
+            if (sortedpairs[j]->bs >= options->specificity_threshold)
+                j++;
+        }
+
+    }
+    qsort(sortedpairs,j,sizeof(ppair_t),cmpprintedpairs);
+    return j;
+}
+
+
+/*
+ * Print a "# <what> taxon:" / "# <what> taxa:" header followed by one
+ * "# taxid : name (rank)" line per entry of taxids, then a closing "#" line.
+ *
+ * what     : leading words of the header ("Restricted to", "Ignore")
+ * taxonomy : taxonomy used to resolve each taxid to a name and a rank label
+ * taxids   : array of n taxonomic identifiers
+ * n        : number of entries in taxids; selects singular/plural wording
+ */
+static void printtaxonlist(const char *what, ecotaxonomy_t *taxonomy,
+                           const int32_t *taxids, int32_t n)
+{
+    static const char *taxon[] = {"taxon","taxa"};
+    ecotx_t           *current_taxon;
+    int32_t            k;
+
+    printf("# %s %s:\n", what, taxon[(n > 1) ? 1 : 0]);
+
+    for (k = 0; k < n; k++)
+    {
+        current_taxon = eco_findtaxonbytaxid(taxonomy, taxids[k]);
+
+        /* updateseqparams() tests the result of eco_findtaxonbytaxid()
+         * against NULL, so do the same here instead of dereferencing it */
+        if (current_taxon)
+            printf("# %d : %s (%s)\n", current_taxon->taxid,
+                   current_taxon->name,
+                   taxonomy->ranks->label[current_taxon->rank]);
+        else
+            printf("# %d : unknown taxid\n", taxids[k]);
+    }
+
+    printf("#\n");
+}
+
+/*
+ * printpairs: filter, rank and print the primer pairs.
+ *
+ * Steps, in order:
+ *   1. flatten the linked list of pair blocks in `pairs` into one array,
+ *   2. let filterandsortpairs() keep and sort the acceptable pairs and
+ *      getThermoProperties() annotate them,
+ *   3. print a "#"-prefixed report header on stdout describing the options,
+ *   4. print every retained pair with printapair(),
+ *   5. if options->filter_on_links is set, reduce the array with
+ *      primers_changeSortedArray() and print the surviving pairs again,
+ *   6. if options->print_sets_of_primers is TRUE, call build_and_print_sets().
+ *
+ * pairs    : pair container produced by buildPrimerPairs()
+ * options  : run options (quorums, taxid lists, length limits, ...)
+ * taxonomy : taxonomy used to label the restricted / ignored taxids
+ * seqdb    : sequence database, forwarded to the filtering and set builders
+ *
+ * Progress counters go to stderr, the report itself to stdout.
+ */
+void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy, pecodnadb_t seqdb)
+{
+    ppair_t*    sortedpairs;
+    ppairlist_t pl;
+    size_t      i,j;
+    size_t      count;
+
+    //printf("Index\tPrimer1\tPrimer2\tGB\tInexampleCount\tOutexampleCount\tYule\tIntaxaCount\tOuttaxaCount\tCoverage\tSpecificity\tMinAmplifiedLength\tMaxAmplifiedLength\tAvgAmplifiedLength\n");
+
+    fprintf(stderr,"Total pair count : %d\n",pairs->count);
+
+    sortedpairs = ECOMALLOC(pairs->count*sizeof(ppair_t),"Cannot Allocate ordered pairs");
+
+    /* Walk every block of the list, including the last one; starting the
+     * test on pl itself also copes with an empty list (first == NULL). */
+    j=0;
+    for (pl=pairs->first; pl; pl=pl->next)
+        for (i=0;i<pl->paircount;i++,j++)
+            sortedpairs[j]=pl->pairs+i;
+
+    count=filterandsortpairs(sortedpairs,pairs->count,options, seqdb);
+    getThermoProperties(sortedpairs, count, options);
+
+    fprintf(stderr,"Total good pair count : %u\n",(uint32_t)count);
+
+    printf("#\n");
+    printf("# ecoPrimer version %s\n",VERSION);
+    printf("# Rank level optimisation : %s\n", options->taxonrank);
+    printf("# max error count by oligonucleotide : %d\n",options->error_max);
+    printf("#\n");
+
+    if (options->r)
+        printtaxonlist("Restricted to", taxonomy, options->restricted_taxid, options->r);
+
+    /* The ignored list holds options->g entries (it is grown with
+     * options.g in main); it must not be walked with options->r. */
+    if (options->g)
+        printtaxonlist("Ignore", taxonomy, options->ignored_taxid, options->g);
+
+    printf("# strict primer quorum : %3.2f\n",options->strict_quorum);
+    printf("# example quorum : %3.2f\n",options->sensitivity_quorum);
+    if (options->g + options->r)
+        printf("# counterexample quorum : %3.2f\n",options->false_positive_quorum);
+
+    printf("#\n");
+    printf("# database : %s\n",options->prefix);
+    printf("# Database is constituted of %5d examples corresponding to %5d %s\n",options->insamples,
+           options->intaxa,options->taxonrank);
+    printf("# and %5d counterexamples corresponding to %5d %s\n",options->outsamples,
+           options->outtaxa,options->taxonrank);
+    printf("#\n");
+
+    if (options->lmin && options->lmax)
+        printf("# amplifiat length between [%d,%d] bp\n",options->lmin,options->lmax);
+    else if (options->lmin)
+        printf("# amplifiat length larger than %d bp\n",options->lmin);
+    else if (options->lmax)
+        printf("# amplifiat length smaller than %d bp\n",options->lmax);
+    if (options->circular)
+        printf("# DB sequences are considered as circular\n");
+    else
+        printf("# DB sequences are considered as linear\n");
+    printf("# Pairs having specificity less than %0.2f will be ignored\n", options->specificity_threshold);
+    printf("#\n");
+
+
+    for (i=0;i < count;i++)
+        printapair(i,sortedpairs[i],options);
+
+    if (options->filter_on_links)
+    {
+        /* count is a size_t: cast it for printing, as done for the
+         * "Total good pair count" message above */
+        fprintf (stderr, "Old size: %u, ", (uint32_t)count);
+        count = primers_changeSortedArray (&sortedpairs, count, options);
+        //count = primers_filterWithGivenLinks (&sortedpairs, count, options);
+        fprintf (stderr, "New size: %u\n", (uint32_t)count);
+
+        if (count == 0)
+        {
+            fprintf (stderr, "No pairs passed the links constraints.\n");
+            printf ("No pairs passed the links constraints.\n");
+            return;
+        }
+
+        for (i=0;i < count;i++)
+            printapair(i,sortedpairs[i],options);
+    }
+
+    if (options->print_sets_of_primers == TRUE)
+    {
+        /* Earlier experiments called build_primers_set,
+         * some_other_set_possibilities, sets_by_SimulatedAnealing,
+         * sets_by_TabuSearch and sets_by_BruteForce from here; only
+         * build_and_print_sets is active. */
+        build_and_print_sets (sortedpairs, count, seqdb, options);
+    }
+    //primers_graph_graphviz (sortedpairs, count, options);
+}
+
+
+/*updateseqparams: This function counts the insample and outsample sequences
+ * and with each sequences adds a tag of the taxon to which the sequence beongs*/
+
+void updateseqparams (pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxonomy,
+        poptions_t options, int32_t *insamples, int32_t *outsamples)
+{
+    uint32_t i;
+    int32_t taxid;
+    ecotx_t *tmptaxon;
+
+    for (i=0;iisexample=isExampleTaxon(taxonomy,seqdb[i]->taxid,options);
+        if (seqdb[i]->isexample)
+            (*insamples)++;
+        else
+            (*outsamples)++;
+
+ taxid = taxonomy->taxons->taxon[seqdb[i]->taxid].taxid; + tmptaxon = eco_findtaxonbytaxid(taxonomy, taxid); + if (tmptaxon) + tmptaxon = eco_findtaxonatrank(tmptaxon, options->taxonrankidx); + if (tmptaxon) + seqdb[i]->ranktaxonid = tmptaxon->taxid; + } +} + +void setresulttaxonrank (ecotaxonomy_t *taxonomy, poptions_t options) +{ + int32_t i; + + /*set taxon rank for which result is to be given*/ + for (i = 0; i < taxonomy->ranks->count; i++) + { + if (strcmp(taxonomy->ranks->label[i], options->taxonrank) == 0) + { + options->taxonrankidx = i; + break; + } + } + + if (i == taxonomy->ranks->count) + { + fprintf(stderr,"\nUnknown taxon level: '%s'\n", options->taxonrank); + exit(0); + } +} +/* to get db stats, totals of species, genus etc....*/ + +static void printAC(pecodnadb_t seqdb,uint32_t seqdbsize) +{ + uint32_t i; + + for (i=0; i< seqdbsize;i++) + printf("%15s (%8d bp ): %s\n",seqdb[i]->AC,seqdb[i]->SQ_length,seqdb[i]->DE); +} + +int main(int argc, char **argv) +{ + pecodnadb_t seqdb; /* of type ecoseq_t */ + uint32_t seqdbsize=0; + ecotaxonomy_t *taxonomy; + + options_t options; + int carg; + int32_t errflag=0; + + int32_t insamples=0; + int32_t outsamples=0; + uint32_t i; + + pwordcount_t words; +// pwordcount_t words2; + pprimercount_t primers; + ppairtree_t pairs; + + int32_t rankdbstats = 0; + + CNNParams nnparams; + + initoptions(&options); + + while ((carg = getopt(argc, argv, "hAfvcUDSpbE:d:l:L:e:i:r:R:q:3:s:x:t:O:m:a:T:k:M:")) != -1) { + + switch (carg) { + /* ---------------------------- */ + case 'v': /* set in single strand mode */ + /* ---------------------------- */ + options.statistics=TRUE; + break; + + /* ---------------------------- */ + case 'f': /* set in single strand mode */ + /* ---------------------------- */ + options.filtering=FALSE; + break; + + /* ---------------------------- */ + case 'A': /* set in single strand mode */ + /* ---------------------------- */ + options.printAC=TRUE; + break; + + /* -------------------- */ + case 
'd': /* database name */ + /* -------------------- */ + options.prefix = ECOMALLOC(strlen(optarg)+1, + "Error on prefix allocation"); + strcpy(options.prefix,optarg); + break; + + /* -------------------- */ + case 'h': /* help */ + /* -------------------- */ + PrintHelp(); + exit(0); + break; + + /* ------------------------- */ + case 'l': /* min amplification lenght */ + /* ------------------------- */ + sscanf(optarg,"%d",&(options.lmin)); + break; + + /* -------------------------- */ + case 'L': /* max amplification lenght */ + /* -------------------------- */ + sscanf(optarg,"%d",&(options.lmax)); + break; + + /* -------------------- */ + case 'e': /* error max */ + /* -------------------- */ + sscanf(optarg,"%d",&(options.error_max)); + break; + + + /* ------------------------ */ + case '3': /* three prime strict match */ + /* ------------------------ */ + sscanf(optarg,"%d",&(options.strict_three_prime)); + break; + + /* -------------------- */ + case 'q': /* strict matching quorum */ + /* -------------------- */ + sscanf(optarg,"%f",&(options.strict_quorum)); + break; + + /* -------------------- */ + case 's': /* strict matching quorum */ + /* -------------------- */ + sscanf(optarg,"%f",&(options.sensitivity_quorum)); + break; + + /* -------------------- */ + case 't': /* required taxon level for results */ + /* -------------------- */ + strncpy(options.taxonrank, optarg, 19); + options.taxonrank[19] = 0; + break; + + /* -------------------- */ + case 'x': /* strict matching quorum */ + /* -------------------- */ + sscanf(optarg,"%f",&(options.false_positive_quorum)); + break; + + /* ---------------------------- */ + case 'D': /* set in double strand mode */ + /* ---------------------------- */ + options.doublestrand=1; + break; + + /* ---------------------------- */ + case 'S': /* set in single strand mode */ + /* ---------------------------- */ + options.doublestrand=0; + break; + + /* ---------------------------- */ + case 'U': /* set in single strand 
mode */ + /* ---------------------------- */ + options.no_multi_match=TRUE; + break; + + /* ------------------------------------------ */ + case 'r': /* stores the restricting search taxonomic id */ + /* ------------------------------------------ */ + options.restricted_taxid = ECOREALLOC(options.restricted_taxid,sizeof(int32_t)*(options.r+1), + "Error on restricted_taxid reallocation"); + sscanf(optarg,"%d",&(options.restricted_taxid[options.r])); + options.r++; + break; + + /* ------------------------------------------ */ + case 'E': /* stores the restricting search taxonomic id */ + /* ------------------------------------------ */ + options.exception_taxid = ECOREALLOC(options.exception_taxid,sizeof(int32_t)*(options.e+1), + "Error on exception_taxid reallocation"); + sscanf(optarg,"%d",&(options.exception_taxid[options.e])); + options.e++; + break; + + /* -------------------- */ + case 'R': /* reference sequence */ + /* -------------------- */ + options.reference = ECOMALLOC(strlen(optarg)+1, + "Error on prefix allocation"); + strcpy(options.reference,optarg); + break; + + /* --------------------------------- */ + case 'i': /* stores the taxonomic id to ignore */ + /* --------------------------------- */ + options.ignored_taxid = ECOREALLOC(options.ignored_taxid,sizeof(int32_t)*(options.g+1), + "Error on excluded_taxid reallocation"); + sscanf(optarg,"%d",&(options.ignored_taxid[options.g])); + options.g++; + break; + + /* --------------------------------- */ + case 'O': /* set primer size */ + /* --------------------------------- */ + sscanf(optarg,"%d",&(options.primer_length)); + break; + + /* --------------------------------- */ + case 'm': /* set salt method */ + /* --------------------------------- */ + sscanf(optarg,"%d",&(options.saltmethod)); + break; + + /* --------------------------------- */ + case 'a': /* set salt */ + /* --------------------------------- */ + sscanf(optarg,"%f",&(options.salt)); + break; + + /* -------------------- */ + case 'c': 
/* sequences are circular */ + /* --------------------------------- */ + options.circular = 1; + break; + + /* -------------------- */ + case 'p': /* print sets of primers */ + /* --------------------------------- */ + options.print_sets_of_primers = TRUE; + break; + + /* --------------------------------- */ + case 'T': /* Ignore pairs having specificity below this Threshold */ + /* --------------------------------- */ + sscanf(optarg,"%f",&(options.specificity_threshold)); + break; + + /* --------------------------------- */ + case 'M': /* Max link percentage for graph */ + /* --------------------------------- */ + sscanf(optarg,"%f",&(options.max_links_percent)); + break; + + /* --------------------------------- */ + case 'k': /* links count */ + /* --------------------------------- */ + sscanf(optarg,"%d",&(options.links_cnt)); + break; + + case 'b': + options.filter_on_links = FALSE; + break; + + case '?': /* bad option */ + /* -------------------- */ + errflag++; + } + + } + options.pnparm = &nnparams; + if (options.saltmethod != 2) //if not SALT_METHOD_OWCZARZY + options.saltmethod = SALT_METHOD_SANTALUCIA; //then force SALT_METHOD_SANTALUCIA + + if (options.salt < 0.01 || options.salt > 0.3) //if salt value out of literature values + options.salt = DEF_SALT; //set to default + + nparam_InitParams(&nnparams, DEF_CONC_PRIMERS,DEF_CONC_SEQUENCES,options.salt,options.saltmethod); + + fprintf(stderr,"Reading taxonomy database ..."); + taxonomy = read_taxonomy(options.prefix,0); + fprintf(stderr,"Ok\n"); + + setresulttaxonrank(taxonomy, &options); /*TR: set rank level for statistics*/ + + fprintf(stderr,"Reading sequence database ...\n"); + + seqdb = readdnadb(options.prefix,taxonomy,&seqdbsize, &options); + + if (options.printAC) + { + printAC(seqdb,seqdbsize); + exit(0); + } + if (options.reference) + for (i=0; i < seqdbsize;i++) + if (strcmp(seqdb[i]->AC,options.reference)==0) + { + options.refseq=seqdb[i]; + options.refseqid=i; + fprintf(stderr,"Reference 
sequence %s identified\n",options.reference); + } + + fprintf(stderr,"Ok\n"); + fprintf(stderr,"Sequence read : %d\n",(int32_t)seqdbsize); + + updateseqparams(seqdb, seqdbsize, taxonomy, &options, &insamples , &outsamples); + options.dbsize=seqdbsize; + options.insamples=insamples; + options.outsamples=outsamples; + + rankdbstats = getrankdbstats(seqdb, seqdbsize, taxonomy, &options); + + fprintf(stderr,"Database is constituted of %5d examples corresponding to %5d %s\n",insamples, + options.intaxa,options.taxonrank); + fprintf(stderr," and %5d counterexamples corresponding to %5d %s\n",outsamples, + options.outtaxa,options.taxonrank); + fprintf(stderr,"Total distinct %s count %d\n",options.taxonrank, rankdbstats); + + fprintf(stderr,"\nIndexing words in sequences\n"); + + words = lookforStrictPrimer(seqdb,seqdbsize,insamples,&options); + fprintf(stderr,"\n Strict primer count : %d\n",words->size); + + /*/TR Testing + fprintf(stderr,"\nReducing for debugging\n"); + words = reduce_words_to_debug (words, &options); + ///*/ +// options.filtering=FALSE; +// words2= lookforStrictPrimer(seqdb,seqdbsize,insamples,&options); +// fprintf(stderr,"\n Strict primer count : %d\n",words2->size); +// +// fprintf(stderr,"\n\n Primer sample : \n"); +// for (i=0; isize; i++) +// fprintf(stderr," + Primer : %s sequence count : %d\n",ecoUnhashWord(words->words[i],options.primer_length),words->strictcount[i]); +// fprintf(stderr,"\n\n Primer sample : \n"); +// for (i=0; isize; i++) +// fprintf(stderr," + Primer : %s sequence count : %d\n",ecoUnhashWord(words2->words[i],options.primer_length),words2->strictcount[i]); + + if (options.no_multi_match) + { + (void)filterMultiStrictPrimer(words); + fprintf(stderr,"\n Strict primer with single match count : %d\n",words->size); + } + + + fprintf(stderr,"\n\n Primer sample : \n"); + for (i=0; isize); i++) + fprintf(stderr," + Primer : %s sequence count : %d\n",ecoUnhashWord(words->words[i],options.primer_length),words->strictcount[i]); + + 
fprintf(stderr,"\nEncoding sequences for fuzzy pattern matching...\n"); + for (i=0;istrictcount,"Free strict primer count table"); + + if (options.error_max == 0)//aho, if(options.error_max == 0 && 0) old + primers = ahoc_lookforStrictPrimers (seqdb,seqdbsize,insamples,words,&options); + else + primers = lookforAproxPrimer(seqdb,seqdbsize,insamples,words,&options); + + //for (i=0; isize; i++) + // print_wordwith_positions (primers->primers[i], seqdbsize, &options); + + ECOFREE(words->words,"Free strict primer table"); + ECOFREE(words,"Free strict primer structure"); + fprintf(stderr,"\n\n Approximate repeats :%d \n", primers->size); + + fprintf(stderr,"\n\n Primer sample : \n"); + for (i=0; isize); i++) + fprintf(stderr," + Primer : %s example sequence count : %5d counterexample sequence count : %5d status : %s\n",ecoUnhashWord(primers->primers[i].word,options.primer_length), + primers->primers[i].inexample, + primers->primers[i].outexample, + primers->primers[i].good ? "good":"bad"); + + fprintf(stderr,"\n"); + + + pairs = buildPrimerPairs(seqdb, seqdbsize, primers, &options); + printpairs (pairs, &options,taxonomy, seqdb); + + return 0; +} + +#define DEBUG_WORDS_CNT 14 +pwordcount_t reduce_words_to_debug (pwordcount_t words, poptions_t options) +{ + uint32_t i, k; + pwordcount_t new_words; + char *rwrd; + char dwrd[20]; + /*char *strict_words[DEBUG_WORDS_CNT] = {"GAGTCTCTGCACCTATCC", "GCAATCCTGAGCCAAATC", "ACCCCTAACCACAACTCA", + "TCCGAACCGACTGATGTT", "GAAGCTTGGGTGAAACTA", "GGAGAACCAGCTAGCTCT", "GCTGGTTCTCCCCGAAAT", + "TCGATTTGGTACCGCTCT", "AAAGGAGAGAGAGGGATT", "GGATTGCTAATCCGTTGT", "CCCCCATCGTCTCACTGG", + "TGAGGCGCAGCAGTTGAC", "GCGCTACGGCGCTGAAGT", "TTTCCTGGGAGTATGGCA"};*/ + char *strict_words[DEBUG_WORDS_CNT] = {"CTCCGGTCTGAACTCAGA", "TGTTGGATCAGGACATCC", "TAGATAGAAACCGACCTG", + "TGGTGCAGCCGCTATTAA", "AGATAGAAACTGACCTGG", "TGGTGCAGCCGCTATTAA", "CTAATGGTGCAGCCGCTA", + "TAGAAACTGACCTGGATT", "AGATAGAAACCGACCTGG", "ATGGTGCAGCCGCTATTA", "ATAGATAGAAACCGACCT", + 
"GCCGCTATTAAGGGTTCG", "GGTGCAGCCGCTATTAAG", "TAGAAACTGACCTGGATT"}; + int word_seen[DEBUG_WORDS_CNT]; + + + new_words = ECOMALLOC(sizeof(wordcount_t),"Cannot allocate memory for word count structure"); + new_words->inseqcount = words->inseqcount; + new_words->outseqcount = words->outseqcount; + new_words->size = DEBUG_WORDS_CNT; + new_words->strictcount = ECOMALLOC((new_words->size*sizeof(uint32_t)), "Cannot allocate memory for word count table"); + new_words->words = ECOMALLOC(new_words->size*sizeof(word_t), "I cannot allocate memory for debug words"); + + for (k = 0; k < DEBUG_WORDS_CNT; k++) + word_seen[k] = 0; + + for (i=0; i < words->size; i++) + { + rwrd = ecoUnhashWord(words->words[i],options->primer_length); + strcpy (dwrd, rwrd); + rwrd = ecoUnhashWord(ecoComplementWord(words->words[i],options->primer_length),options->primer_length); + for (k = 0; k < DEBUG_WORDS_CNT; k++) + { + if (strcmp (dwrd, strict_words[k]) == 0) break; + if (strcmp (rwrd, strict_words[k]) == 0) break; + } + + if (k < DEBUG_WORDS_CNT) + { + if (word_seen[k] == 0) + { + new_words->words[k] = words->words[i]; + new_words->strictcount[k] = words->strictcount[i]; + } + word_seen[k]++; + } + } + + fprintf (stderr, "Debug Words Info:\n"); + for (k = 0; k < DEBUG_WORDS_CNT; k++) + fprintf (stderr, "%s:%d\n", strict_words[k], word_seen[k]); + + + //clean input wods; + ECOFREE(words->words,"Clean word table"); + ECOFREE(words->strictcount,"Clean word count table"); + ECOFREE(words,"Clean word structure"); + + return new_words; +} + +void print_wordwith_positions (primer_t prm, uint32_t seqdbsize, poptions_t options) +{ + char *wrd; + uint32_t i, j; + char *twrd = "GCCTGTTTACCAAAAACA"; + + wrd = ecoUnhashWord(prm.word,options->primer_length); + + if (strcmp (twrd, wrd) == 0) + { + printf ("Positions for Word: %s\n", wrd); + for (i=0; i 0) + { + printf ("%d:", i); + if (prm.directCount[i] == 1) + printf ("%d", prm.directPos[i].value); + else + for (j=0; j 0) + { + printf ("%d:", i); + if 
(prm.reverseCount[i] == 1) + printf ("%d", prm.reversePos[i].value); + else + for (j=0; j $@; \ + rm -f $*.d; [ -s $@ ] || rm -f $@ + +include $(SRCS:.c=.P) diff --git a/src/libecoPCR/Makefile b/src/libecoPCR/Makefile new file mode 100644 index 0000000..e2c1e3e --- /dev/null +++ b/src/libecoPCR/Makefile @@ -0,0 +1,30 @@ + +SOURCES = ecodna.c \ + ecoError.c \ + ecoIOUtils.c \ + ecoMalloc.c \ + ecorank.c \ + ecoseq.c \ + ecotax.c \ + ecofilter.c \ + econame.c + +SRCS=$(SOURCES) + +OBJECTS= $(patsubst %.c,%.o,$(SOURCES)) + +LIBFILE= libecoPCR.a +RANLIB= ranlib + + +include ../global.mk + + +all: $(LIBFILE) + +clean: + rm -rf $(OBJECTS) $(LIBFILE) + +$(LIBFILE): $(OBJECTS) + ar -cr $@ $? + $(RANLIB) $@ diff --git a/src/libecoPCR/ecoError.c b/src/libecoPCR/ecoError.c new file mode 100644 index 0000000..00bbfa2 --- /dev/null +++ b/src/libecoPCR/ecoError.c @@ -0,0 +1,26 @@ +#include "ecoPCR.h" +#include +#include + +/* + * print the message given as argument and exit the program + * @param error error number + * @param message the text explaining what's going on + * @param filename the file source where the program failed + * @param linenumber the line where it has failed + * filename and linenumber are written at pre-processing + * time by a macro + */ +void ecoError(int32_t error, + const char* message, + const char * filename, + int linenumber) +{ + fprintf(stderr,"Error %d in file %s line %d : %s\n", + error, + filename, + linenumber, + message); + + abort(); +} diff --git a/src/libecoPCR/ecoIOUtils.c b/src/libecoPCR/ecoIOUtils.c new file mode 100644 index 0000000..73fb812 --- /dev/null +++ b/src/libecoPCR/ecoIOUtils.c @@ -0,0 +1,122 @@ +#include "ecoPCR.h" +#include +#include + +#define SWAPINT32(x) ((((x) << 24) & 0xFF000000) | (((x) << 8) & 0xFF0000) | \ + (((x) >> 8) & 0xFF00) | (((x) >> 24) & 0xFF)) + + +int32_t is_big_endian() +{ + int32_t i=1; + + return (int32_t)((char*)&i)[0]; +} + + + + +int32_t swap_int32_t(int32_t i) +{ + return SWAPINT32(i); +} + + +/** + 
* Read the next record of a database file.
+ *
+ * A record is read as a 32 bit byte count followed by that many bytes of
+ * payload. The payload is returned in a buffer owned by this function: it
+ * is reused (and possibly moved) by the next call, so callers must copy
+ * whatever they need to keep and must not free it.
+ *
+ * @param   f           the database, positioned at the start of a record
+ * @param   recordSize  receives the payload size in bytes; must not be NULL
+ *
+ * @return  the record buffer, or NULL once end of file is reached
+ *
+ * Any short read or inconsistent size is reported through ECOERROR.
+ */
+void *read_ecorecord(FILE *f,int32_t *recordSize)
+{
+    static void    *buffer     = NULL;
+    static int32_t  buffersize = 0;   /* capacity of buffer, kept across calls */
+    size_t          nread;
+
+    if (!recordSize)
+        ECOERROR(ECO_ASSERT_ERROR,
+                 "recordSize cannot be NULL");
+
+    nread = fread(recordSize,
+                  1,
+                  sizeof(int32_t),
+                  f);
+
+    if (feof(f))
+        return NULL;
+
+    if (nread != sizeof(int32_t))
+        ECOERROR(ECO_IO_ERROR,"Reading record size error");
+
+    /* NOTE: is_big_endian(), as defined earlier in this file, returns the
+     * lowest-addressed byte of the integer 1, i.e. it is non-zero on
+     * little-endian hosts. The swap therefore runs on little-endian
+     * machines - presumably because the file stores big-endian integers. */
+    if (is_big_endian())
+        *recordSize=swap_int32_t(*recordSize);
+
+    /* a negative size would become a huge size_t in the fread below */
+    if (*recordSize < 0)
+        ECOERROR(ECO_IO_ERROR,"Negative record size");
+
+    /* grow the shared buffer only when this record does not fit */
+    if (buffersize < *recordSize)
+    {
+        if (buffer)
+            buffer = ECOREALLOC(buffer,*recordSize,
+                                "Increase size of record buffer");
+        else
+            buffer = ECOMALLOC(*recordSize,
+                               "Allocate record buffer");
+
+        buffersize = *recordSize;
+    }
+
+    nread = fread(buffer,
+                  1,
+                  *recordSize,
+                  f);
+
+    if (nread != (size_t)*recordSize)
+        ECOERROR(ECO_IO_ERROR,"Reading record data error");
+
+    return buffer;
+}
+
+
+
+
+
+/**
+ * Open a database file and read its leading record count.
+ *
+ * @param   filename             name of the database (.sdx, .rdx, .tbx)
+ * @param   sequencecount        receives the 32 bit count stored at the
+ *                               start of the file, or 0 when the file could
+ *                               not be opened; must not be NULL
+ * @param   abort_on_open_error  when non-zero a file that cannot be opened
+ *                               is reported through ECOERROR; when zero
+ *                               NULL is returned instead
+ *
+ * @return  the open stream, positioned just after the count, or NULL
+ **/
+FILE *open_ecorecorddb(const char *filename,
+                       int32_t    *sequencecount,
+                       int32_t    abort_on_open_error)
+{
+    FILE   *f;
+    size_t  nread;
+
+    if (!sequencecount)
+        ECOERROR(ECO_ASSERT_ERROR,
+                 "sequencecount cannot be NULL");
+
+    f = fopen(filename,"rb");
+
+    if (!f)
+    {
+        if (abort_on_open_error)
+            ECOERROR(ECO_IO_ERROR,"Cannot open file");
+
+        *sequencecount=0;
+        return NULL;
+    }
+
+    nread = fread(sequencecount,
+                  1,
+                  sizeof(int32_t),
+                  f);
+
+    if (nread != sizeof(int32_t))
+        ECOERROR(ECO_IO_ERROR,"Reading record size error");
+
+    /* same byte-order handling as in read_ecorecord() */
+    if (is_big_endian())
+        *sequencecount=swap_int32_t(*sequencecount);
+
+    return f;
+}
+
diff --git a/src/libecoPCR/ecoMalloc.c b/src/libecoPCR/ecoMalloc.c
new file mode 100644
index 0000000..d44ce10
--- /dev/null
+++ b/src/libecoPCR/ecoMalloc.c
@@ -0,0 +1,96 @@
+#include "ecoPCR.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+/* non-zero while allocation tracing to stderr is switched on */
+static int eco_log_malloc = 0;
+/* declared for byte accounting but never updated in this file */
+static size_t eco_amount_malloc=0;
+/* number of chunks currently handed out by eco_malloc / eco_realloc */
+static size_t eco_chunk_malloc=0;
+
+/* Switch on the tracing of every allocation, reallocation and release. */
+void eco_trace_memory_allocation()
+{
+    eco_log_malloc=1;
+}
+
+/* Switch the tracing off again. */
+void eco_untrace_memory_allocation()
+{
+    eco_log_malloc=0;
+}
+
+/* Placeholder: does nothing at present. */
+void ecoMallocedMemory()
+{
+    //eco_amount_malloc;
+}
+
+/*
+ * Allocate a zero-filled chunk of memory (normally through ECOMALLOC).
+ *
+ * chunksize     : number of bytes requested
+ * error_message : text handed to ecoError() when the allocation fails
+ * filename/line : call site, filled in by the ECOMALLOC macro
+ *
+ * Returns the new chunk. On failure ecoError() is called.
+ */
+void *eco_malloc(int64_t chunksize,
+                 const char *error_message,
+                 const char *filename,
+                 int32_t line)
+{
+    void * chunk;
+
+    chunk = calloc(1,chunksize);
+
+
+    if (!chunk)
+        ecoError(ECO_MEM_ERROR,error_message,filename,line);
+
+    eco_chunk_malloc++;
+
+    /* chunksize is 64 bits wide: print it as long long, not with %d */
+    if (eco_log_malloc)
+        fprintf(stderr,
+                "Memory segment located at %p of size %lld is allocated (file : %s [%d])",
+                chunk,
+                (long long)chunksize,
+                filename,
+                line);
+
+    return chunk;
+}
+
+/*
+ * Resize a chunk of memory (normally through ECOREALLOC).
+ *
+ * chunk         : chunk to resize, or NULL to obtain a new one
+ * newsize       : number of bytes requested
+ * error_message : text handed to ecoError() when the reallocation fails
+ * filename/line : call site, filled in by the ECOREALLOC macro
+ *
+ * Returns the resized chunk, which may have moved. On failure the requested
+ * size is reported on stderr and ecoError() is called.
+ */
+void *eco_realloc(void *chunk,
+                  int64_t newsize,
+                  const char *error_message,
+                  const char *filename,
+                  int32_t line)
+{
+    void *newchunk;
+
+    newchunk = realloc(chunk,newsize);
+
+
+    if (!newchunk)
+    {
+        /* report the size first: ecoError() ends in abort(), so nothing
+         * placed after it would ever be printed */
+        fprintf(stderr,"Requested memory : %lld\n",(long long)newsize);
+        ecoError(ECO_MEM_ERROR,error_message,filename,line);
+    }
+
+    /* a NULL input means a brand new chunk was created */
+    if (!chunk)
+        eco_chunk_malloc++;
+
+    if (eco_log_malloc)
+        fprintf(stderr,
+                "Old memory segment %p is reallocated at %p with a size of %lld (file : %s [%d])",
+                chunk,
+                newchunk,
+                (long long)newsize,
+                filename,
+                line);
+
+    return newchunk;
+}
+
+/*
+ * Release a chunk of memory (normally through ECOFREE).
+ *
+ * chunk         : chunk to release; NULL is accepted and ignored by free()
+ * error_message : free text, only echoed in the trace output
+ * filename/line : call site, filled in by the ECOFREE macro
+ */
+void eco_free(void *chunk,
+              const char *error_message,
+              const char *filename,
+              int32_t line)
+{
+    free(chunk);
+
+    if (eco_log_malloc)
+        fprintf(stderr,
+                "Memory segment %p is released => %s (file : %s [%d])",
+                chunk,
+                error_message,
+                filename,
+                line);
+
+    /* releasing NULL frees nothing: do not let the unsigned counter wrap */
+    if (chunk)
+        eco_chunk_malloc--;
+}
diff --git a/src/libecoPCR/ecoPCR.h b/src/libecoPCR/ecoPCR.h
new file mode 100644
index 0000000..237ec32
--- /dev/null
+++ b/src/libecoPCR/ecoPCR.h
@@ -0,0 +1,270 @@
+#ifndef ECOPCR_H_
+#define ECOPCR_H_
+
+#include 
+#include 
+
+/*****************************************************
+ *
+ * Data type declarations
+ *
+ *****************************************************/
+
+/*
+ *
+ * 
Sequence types + * + */ + +typedef struct { + + int32_t taxid; + char AC[20]; + int32_t DE_length; + int32_t SQ_length; + int32_t CSQ_length; /*what is this CSQ_length ? */ + + char data[1]; + +} ecoseqformat_t; + +typedef struct { + int32_t taxid; + int32_t SQ_length; + int32_t isexample; + char *AC; + char *DE; + char *SQ; + + int32_t ranktaxonid;/*TR: taxon id to which the sequence belongs*/ +} ecoseq_t, *pecoseq_t; + +/* + * + * Taxonomy taxon types + * + */ + + +typedef struct { + int32_t taxid; + int32_t rank; + int32_t parent; + int32_t namelength; + char name[1]; + +} ecotxformat_t; + +typedef struct ecotxnode { + int32_t taxid; + int32_t rank; + struct ecotxnode *parent; + char *name; +} ecotx_t; + +typedef struct { + int32_t count; + ecotx_t taxon[1]; +} ecotxidx_t; + + +/* + * + * Taxonomy rank types + * + */ + +typedef struct { + int32_t count; + char* label[1]; +} ecorankidx_t; + +/* + * + * Taxonomy name types + * + */ + +typedef struct { + int32_t is_scientificname; + int32_t namelength; + int32_t classlength; + int32_t taxid; + char names[1]; +} econameformat_t; + + + typedef struct { + char *name; + char *classname; + int32_t is_scientificname; + struct ecotxnode *taxon; +} econame_t; + + +typedef struct { + int32_t count; + econame_t names[1]; +} econameidx_t; + + + typedef struct { + ecorankidx_t *ranks; + econameidx_t *names; + ecotxidx_t *taxons; +} ecotaxonomy_t; + + +/***************************************************** + * + * Function declarations + * + *****************************************************/ + +/* + * + * Low level system functions + * + */ + +int32_t is_big_endian(); +int32_t swap_int32_t(int32_t); + +void *eco_malloc(int64_t chunksize, + const char *error_message, + const char *filename, + int32_t line); + + +void *eco_realloc(void *chunk, + int64_t chunksize, + const char *error_message, + const char *filename, + int32_t line); + +void eco_free(void *chunk, + const char *error_message, + const char *filename, + int32_t 
line); + +void eco_trace_memory_allocation(); +void eco_untrace_memory_allocation(); + +#define ECOMALLOC(size,error_message) \ + eco_malloc((size),(error_message),__FILE__,__LINE__) + +#define ECOREALLOC(chunk,size,error_message) \ + eco_realloc((chunk),(size),(error_message),__FILE__,__LINE__) + +#define ECOFREE(chunk,error_message) \ + eco_free((chunk),(error_message),__FILE__,__LINE__) + + + + +/* + * + * Error managment + * + */ + + +void ecoError(int32_t,const char*,const char *,int); + +#define ECOERROR(code,message) ecoError((code),(message),__FILE__,__LINE__) + +#define ECO_IO_ERROR (1) +#define ECO_MEM_ERROR (2) +#define ECO_ASSERT_ERROR (3) +#define ECO_NOTFOUND_ERROR (4) + + +/* + * + * Low level Disk access functions + * + */ + +FILE *open_ecorecorddb(const char *filename, + int32_t *sequencecount, + int32_t abort_on_open_error); + +void *read_ecorecord(FILE *,int32_t *recordSize); + + + +/* + * Read function in internal binary format + */ + +FILE *open_ecoseqdb(const char *filename, + int32_t *sequencecount); + +ecoseq_t *readnext_ecoseq(FILE *); + +ecorankidx_t *read_rankidx(const char *filename); + +econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy); + + + + /** + * Read taxonomy data as formated by the ecoPCRFormat.py script. + * + * This function is normaly uses internaly by the read_taxonomy + * function and should not be called directly. 
+ * + * @arg filename path to the *.tdx file of the reformated db + * + * @return pointer to a taxonomy index structure + */ + +ecotxidx_t *read_taxonomyidx(const char *filename); + +ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName); + +ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy, int32_t taxid); + +ecotx_t *eco_findtaxonatrank(ecotx_t *taxon, int32_t rankidx); + +int eco_isundertaxon(ecotx_t *taxon, int other_taxid); + +ecoseq_t *ecoseq_iterator(const char *prefix); + + + +ecoseq_t *new_ecoseq(); +int32_t delete_ecoseq(ecoseq_t *); +ecoseq_t *new_ecoseq_with_data( char *AC, + char *DE, + char *SQ, + int32_t taxid + ); + + +int32_t delete_taxon(ecotx_t *taxon); +int32_t delete_taxonomy(ecotxidx_t *index); + + +int32_t rank_index(const char* label,ecorankidx_t* ranks); + +//int32_t delete_apatseq(SeqPtr pseq); +//PatternPtr buildPattern(const char *pat, int32_t error_max); +//PatternPtr complementPattern(PatternPtr pat); +// +//SeqPtr ecoseq2apatseq(ecoseq_t *in,SeqPtr out,int32_t circular); + +char *ecoComplementPattern(char *nucAcSeq); +char *ecoComplementSequence(char *nucAcSeq); +char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end); + +ecotx_t *eco_getspecies(ecotx_t *taxon,ecotaxonomy_t *taxonomy); +ecotx_t *eco_getgenus(ecotx_t *taxon,ecotaxonomy_t *taxonomy); +ecotx_t *eco_getfamily(ecotx_t *taxon,ecotaxonomy_t *taxonomy); +ecotx_t *eco_getkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy); +ecotx_t *eco_getsuperkingdom(ecotx_t *taxon,ecotaxonomy_t *taxonomy); + +int eco_is_taxid_ignored(int32_t *ignored_taxid, int32_t tab_len, int32_t taxid); +int eco_is_taxid_included(ecotaxonomy_t *taxonomy, int32_t *included_taxid, int32_t tab_len, int32_t taxid); + +#endif /*ECOPCR_H_*/ diff --git a/src/libecoPCR/ecoapat.c b/src/libecoPCR/ecoapat.c new file mode 100644 index 0000000..284e579 --- /dev/null +++ b/src/libecoPCR/ecoapat.c @@ -0,0 +1,202 @@ +#include "../libapat/libstki.h" +#include "../libapat/apat.h" + +#include 
"ecoPCR.h" + +#include + +static void EncodeSequence(SeqPtr seq); +static void UpperSequence(char *seq); + +/* -------------------------------------------- */ +/* uppercase sequence */ +/* -------------------------------------------- */ + +#define IS_LOWER(c) (((c) >= 'a') && ((c) <= 'z')) +#define TO_UPPER(c) ((c) - 'a' + 'A') + +void UpperSequence(char *seq) +{ + char *cseq; + + for (cseq = seq ; *cseq ; cseq++) + if (IS_LOWER(*cseq)) + *cseq = TO_UPPER(*cseq); +} + +#undef IS_LOWER +#undef TO_UPPER + + + + +/* -------------------------------------------- */ +/* encode sequence */ +/* IS_UPPER is slightly faster than isupper */ +/* -------------------------------------------- */ + +#define IS_UPPER(c) (((c) >= 'A') && ((c) <= 'Z')) + + + +void EncodeSequence(SeqPtr seq) +{ + int i; + UInt8 *data; + char *cseq; + + data = seq->data; + cseq = seq->cseq; + + while (*cseq) { + + *data = (IS_UPPER(*cseq) ? *cseq - 'A' : 0x0); + data++; + cseq++; + } + + for (i=0,cseq=seq->cseq;i < seq->circular; i++,cseq++,data++) + *data = (IS_UPPER(*cseq) ? *cseq - 'A' : 0x0); + + for (i = 0 ; i < MAX_PATTERN ; i++) + seq->hitpos[i]->top = seq->hiterr[i]->top = 0; + +} + +#undef IS_UPPER + + +SeqPtr ecoseq2apatseq(ecoseq_t *in,SeqPtr out,int32_t circular) +{ + int i; + + if (!out) + { + out = ECOMALLOC(sizeof(Seq), + "Error in Allocation of a new Seq structure"); + + for (i = 0 ; i < MAX_PATTERN ; i++) + { + + if (! (out->hitpos[i] = NewStacki(kMinStackiSize))) + ECOERROR(ECO_MEM_ERROR,"Error in hit stack Allocation"); + + if (! 
(out->hiterr[i] = NewStacki(kMinStackiSize))) + ECOERROR(ECO_MEM_ERROR,"Error in error stack Allocation"); + } + } + + + out->name = in->AC; + out->seqsiz = out->seqlen = in->SQ_length; + out->circular = circular; + + if (!out->data) + { + out->data = ECOMALLOC((out->seqlen+circular) *sizeof(UInt8), + "Error in Allocation of a new Seq data member"); + out->datsiz= out->seqlen+circular; + } + else if ((out->seqlen +circular) >= out->datsiz) + { + out->data = ECOREALLOC(out->data,(out->seqlen+circular), + "Error during Seq data buffer realloc"); + out->datsiz= out->seqlen+circular; + } + + out->cseq = in->SQ; + + EncodeSequence(out); + + return out; +} + +int32_t delete_apatseq(SeqPtr pseq) +{ + int i; + + if (pseq) { + + if (pseq->data) + ECOFREE(pseq->data,"Freeing sequence data buffer"); + + for (i = 0 ; i < MAX_PATTERN ; i++) { + if (pseq->hitpos[i]) FreeStacki(pseq->hitpos[i]); + if (pseq->hiterr[i]) FreeStacki(pseq->hiterr[i]); + } + + ECOFREE(pseq,"Freeing apat sequence structure"); + + return 0; + } + + return 1; +} + +/* + +PatternPtr buildPattern(const char *pat, int32_t error_max) +{ + PatternPtr pattern; + int32_t patlen; + + pattern = ECOMALLOC(sizeof(Pattern), + "Error in pattern allocation"); + + pattern->ok = Vrai; + pattern->hasIndel= Faux; + pattern->maxerr = error_max; + patlen = strlen(pat); + + pattern->cpat = ECOMALLOC(sizeof(char)*patlen+1, + "Error in sequence pattern allocation"); + + strncpy(pattern->cpat,pat,patlen); + pattern->cpat[patlen]=0; + UpperSequence(pattern->cpat); + + if (!CheckPattern(pattern)) + ECOERROR(ECO_ASSERT_ERROR,"Error in pattern checking"); + + if (! EncodePattern(pattern, dna)) + ECOERROR(ECO_ASSERT_ERROR,"Error in pattern encoding"); + + if (! 
CreateS(pattern, ALPHA_LEN)) + ECOERROR(ECO_ASSERT_ERROR,"Error in pattern compiling"); + + return pattern; + +} + +PatternPtr complementPattern(PatternPtr pat) +{ + PatternPtr pattern; + + pattern = ECOMALLOC(sizeof(Pattern), + "Error in pattern allocation"); + + pattern->ok = Vrai; + pattern->hasIndel= pat->hasIndel; + pattern->maxerr = pat->maxerr; + pattern->patlen = pat->patlen; + + pattern->cpat = ECOMALLOC(sizeof(char)*(strlen(pat->cpat)+1), + "Error in sequence pattern allocation"); + + strcpy(pattern->cpat,pat->cpat); + + ecoComplementPattern(pattern->cpat); + + if (!CheckPattern(pattern)) + ECOERROR(ECO_ASSERT_ERROR,"Error in pattern checking"); + + if (! EncodePattern(pattern, dna)) + ECOERROR(ECO_ASSERT_ERROR,"Error in pattern encoding"); + + if (! CreateS(pattern, ALPHA_LEN)) + ECOERROR(ECO_ASSERT_ERROR,"Error in pattern compiling"); + + return pattern; + +} +*/ diff --git a/src/libecoPCR/ecodna.c b/src/libecoPCR/ecodna.c new file mode 100644 index 0000000..7d29a0e --- /dev/null +++ b/src/libecoPCR/ecodna.c @@ -0,0 +1,153 @@ +#include +#include "ecoPCR.h" + +/* + * @doc: DNA alphabet (IUPAC) + */ +#define LX_BIO_DNA_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZ#![]" + +/* + * @doc: complementary DNA alphabet (IUPAC) + */ +#define LX_BIO_CDNA_ALPHA "TVGHEFCDIJMLKNOPQYSAABWXRZ#!][" + + +static char sNuc[] = LX_BIO_DNA_ALPHA; +static char sAnuc[] = LX_BIO_CDNA_ALPHA; + +static char LXBioBaseComplement(char nucAc); +static char *LXBioSeqComplement(char *nucAcSeq); +static char *reverseSequence(char *str,char isPattern); + + +/* ---------------------------- */ + +char LXBioBaseComplement(char nucAc) +{ + char *c; + + if ((c = strchr(sNuc, nucAc))) + return sAnuc[(c - sNuc)]; + else + return nucAc; +} + +/* ---------------------------- */ + +char *LXBioSeqComplement(char *nucAcSeq) +{ + char *s; + + for (s = nucAcSeq ; *s ; s++) + *s = LXBioBaseComplement(*s); + + return nucAcSeq; +} + + +char *reverseSequence(char *str,char isPattern) +{ + char *sb, *se, c; + + if (! 
str) + return str; + + sb = str; + se = str + strlen(str) - 1; + + while(sb <= se) { + c = *sb; + *sb++ = *se; + *se-- = c; + } + + sb = str; + se = str + strlen(str) - 1; + + if (isPattern) + for (;sb < se; sb++) + { + if (*sb=='#') + { + if (((se - sb) > 2) && (*(sb+2)=='!')) + { + *sb='!'; + sb+=2; + *sb='#'; + } + else + { + *sb=*(sb+1); + sb++; + *sb='#'; + } + } + else if (*sb=='!') + { + *sb=*(sb-1); + *(sb-1)='!'; + } + } + + return str; +} + +char *ecoComplementPattern(char *nucAcSeq) +{ + return reverseSequence(LXBioSeqComplement(nucAcSeq),1); +} + +char *ecoComplementSequence(char *nucAcSeq) +{ + return reverseSequence(LXBioSeqComplement(nucAcSeq),0); +} + + +char *getSubSequence(char* nucAcSeq,int32_t begin,int32_t end) +{ + static char *buffer = NULL; + static int32_t buffSize= 0; + int32_t length; + + if (begin < end) + { + length = end - begin; + + if (length >= buffSize) + { + buffSize = length+1; + if (buffer) + buffer=ECOREALLOC(buffer,buffSize, + "Error in reallocating sub sequence buffer"); + else + buffer=ECOMALLOC(buffSize, + "Error in allocating sub sequence buffer"); + + } + + strncpy(buffer,nucAcSeq + begin,length); + buffer[length]=0; + } + else + { + length = end + strlen(nucAcSeq) - begin; + + if (length >= buffSize) + { + buffSize = length+1; + if (buffer) + buffer=ECOREALLOC(buffer,buffSize, + "Error in reallocating sub sequence buffer"); + else + buffer=ECOMALLOC(buffSize, + "Error in allocating sub sequence buffer"); + + } + strncpy(buffer,nucAcSeq+begin,length - end); + strncpy(buffer+(length-end),nucAcSeq ,end); + buffer[length]=0; + } + + return buffer; +} + diff --git a/src/libecoPCR/ecofilter.c b/src/libecoPCR/ecofilter.c new file mode 100644 index 0000000..8c737c6 --- /dev/null +++ b/src/libecoPCR/ecofilter.c @@ -0,0 +1,20 @@ +#include "ecoPCR.h" + +int eco_is_taxid_included( ecotaxonomy_t *taxonomy, + int32_t *restricted_taxid, + int32_t tab_len, + int32_t taxid) +{ + int i; + ecotx_t *taxon; + + taxon = 
eco_findtaxonbytaxid(taxonomy, taxid); + + for (i=0; i < tab_len; i++) + if ( (taxon->taxid == restricted_taxid[i]) || + (eco_isundertaxon(taxon, restricted_taxid[i])) ) + return 1; + + return 0; +} + diff --git a/src/libecoPCR/econame.c b/src/libecoPCR/econame.c new file mode 100644 index 0000000..835d79c --- /dev/null +++ b/src/libecoPCR/econame.c @@ -0,0 +1,61 @@ +#include "ecoPCR.h" +#include +#include + +static econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy); + +econameidx_t *read_nameidx(const char *filename,ecotaxonomy_t *taxonomy) +{ + + int32_t count; + FILE *f; + econameidx_t *indexname; + int32_t i; + + f = open_ecorecorddb(filename,&count,1); + + indexname = (econameidx_t*) ECOMALLOC(sizeof(econameidx_t) + sizeof(econame_t) * (count-1),"Allocate names"); + + indexname->count=count; + + for (i=0; i < count; i++){ + readnext_econame(f,(indexname->names)+i,taxonomy); + } + + return indexname; +} + +econame_t *readnext_econame(FILE *f,econame_t *name,ecotaxonomy_t *taxonomy) +{ + + econameformat_t *raw; + int32_t rs; + + raw = read_ecorecord(f,&rs); + + if (!raw) + return NULL; + + if (is_big_endian()) + { + raw->is_scientificname = swap_int32_t(raw->is_scientificname); + raw->namelength = swap_int32_t(raw->namelength); + raw->classlength = swap_int32_t(raw->classlength); + raw->taxid = swap_int32_t(raw->taxid); + } + + name->is_scientificname=raw->is_scientificname; + + name->name = ECOMALLOC((raw->namelength+1) * sizeof(char),"Allocate name"); + strncpy(name->name,raw->names,raw->namelength); + name->name[raw->namelength]=0; + + name->classname = ECOMALLOC((raw->classlength+1) * sizeof(char),"Allocate classname"); + strncpy(name->classname,(raw->names+raw->namelength),raw->classlength); + name->classname[raw->classlength]=0; + + name->taxon = taxonomy->taxons->taxon + raw->taxid; + + return name; +} + diff --git a/src/libecoPCR/ecorank.c b/src/libecoPCR/ecorank.c new file mode 100644 index 0000000..4796088 --- /dev/null +++ 
b/src/libecoPCR/ecorank.c @@ -0,0 +1,52 @@ +#include "ecoPCR.h" +#include +#include + +static int compareRankLabel(const void *label1, const void *label2); + +ecorankidx_t *read_rankidx(const char *filename) +{ + int32_t count; + FILE *f; + ecorankidx_t *index; + int32_t i; + int32_t rs; + char *buffer; + + f = open_ecorecorddb(filename,&count,1); + + index = (ecorankidx_t*) ECOMALLOC(sizeof(ecorankidx_t) + sizeof(char*) * (count-1), + "Allocate rank index"); + + index->count=count; + + for (i=0; i < count; i++) + { + buffer = read_ecorecord(f,&rs); + index->label[i]=(char*) ECOMALLOC(rs+1, + "Allocate rank label"); + strncpy(index->label[i],buffer,rs); + } + + return index; +} + +int32_t rank_index(const char* label,ecorankidx_t* ranks) +{ + char **rep; + + rep = bsearch(label,ranks->label,ranks->count,sizeof(char*),compareRankLabel); + + if (rep) + return rep-ranks->label; + else + ECOERROR(ECO_NOTFOUND_ERROR,"Rank label not found"); + + return -1; +} + + +int compareRankLabel(const void *label1, const void *label2) +{ + return strcmp((const char*)label1,*(const char**)label2); +} diff --git a/src/libecoPCR/ecoseq.c b/src/libecoPCR/ecoseq.c new file mode 100644 index 0000000..1368be9 --- /dev/null +++ b/src/libecoPCR/ecoseq.c @@ -0,0 +1,233 @@ +#include "ecoPCR.h" +#include +#include +#include +#include +#include + +static FILE *open_seqfile(const char *prefix,int32_t index); + + +ecoseq_t *new_ecoseq() +{ + void *tmp; + + tmp = ECOMALLOC(sizeof(ecoseq_t),"Allocate new ecoseq structure"); + + return tmp; +} + +int32_t delete_ecoseq(ecoseq_t * seq) +{ + + if (seq) + { + if (seq->AC) + ECOFREE(seq->AC,"Free sequence AC"); + + if (seq->DE) + ECOFREE(seq->DE,"Free sequence DE"); + + if (seq->SQ) + ECOFREE(seq->SQ,"Free sequence SQ"); + + ECOFREE(seq,"Free sequence structure"); + + return 0; + + } + + return 1; +} + +ecoseq_t *new_ecoseq_with_data( char *AC, + char *DE, + char *SQ, + int32_t taxid_idx + ) +{ + ecoseq_t *tmp; + int32_t lstr; + tmp = new_ecoseq(); + + 
tmp->taxid=taxid_idx; + + if (AC) + { + lstr =strlen(AC); + tmp->AC=ECOMALLOC((lstr+1) * sizeof(char), + "Allocate sequence accession"); + strcpy(tmp->AC,AC); + } + + if (DE) + { + lstr =strlen(DE); + tmp->DE=ECOMALLOC((lstr+1) * sizeof(char), + "Allocate sequence definition"); + strcpy(tmp->DE,DE); + } + + if (SQ) + { + lstr =strlen(SQ); + tmp->SQ=ECOMALLOC((lstr+1) * sizeof(char), + "Allocate sequence data"); + strcpy(tmp->SQ,SQ); + } + + tmp->isexample=1; + + return tmp; + +} + +/** + * ?? used ?? + **/ +FILE *open_ecoseqdb(const char *filename, + int32_t *sequencecount) +{ + return open_ecorecorddb(filename,sequencecount,1); +} + +ecoseq_t *readnext_ecoseq(FILE *f) +{ + char *compressed=NULL; + + ecoseqformat_t *raw; + ecoseq_t *seq; + int32_t comp_status; + unsigned long int seqlength; + int32_t rs; + char *c; + int32_t i; + + raw = read_ecorecord(f,&rs); + + if (!raw) + return NULL; + + if (is_big_endian()) + { + raw->CSQ_length = swap_int32_t(raw->CSQ_length); + raw->DE_length = swap_int32_t(raw->DE_length); + raw->SQ_length = swap_int32_t(raw->SQ_length); + raw->taxid = swap_int32_t(raw->taxid); + } + + seq = new_ecoseq(); + + seq->taxid = raw->taxid; + + seq->AC = ECOMALLOC(strlen(raw->AC) +1, + "Allocate Sequence Accesion number"); + strncpy(seq->AC,raw->AC,strlen(raw->AC)); + + + seq->DE = ECOMALLOC(raw->DE_length+1, + "Allocate Sequence definition"); + strncpy(seq->DE,raw->data,raw->DE_length); + + seqlength = seq->SQ_length = raw->SQ_length; + + compressed = raw->data + raw->DE_length; + + seq->SQ = ECOMALLOC(seqlength+1, + "Allocate sequence buffer"); + + seq->isexample=1; + + comp_status = uncompress((unsigned char*)seq->SQ, + &seqlength, + (unsigned char*)compressed, + raw->CSQ_length); + + if (comp_status != Z_OK) + ECOERROR(ECO_IO_ERROR,"I cannot uncompress sequence data"); + + for (c=seq->SQ,i=0;iDE,seq->SQ_length); + return seq; +} + +/** + * Open the sequences database (.sdx file) + * @param prefix name of the database (radical without 
extension) + * @param index integer + * + * @return file object + */ +FILE *open_seqfile(const char *prefix,int32_t index) +{ + char filename_buffer[1024]; + int32_t filename_length; + FILE *input; + int32_t seqcount; + + filename_length = snprintf(filename_buffer, + 1023, + "%s_%03d.sdx", + prefix, + index); + + + + if (filename_length >= 1024) + ECOERROR(ECO_ASSERT_ERROR,"file name is too long"); + + filename_buffer[filename_length]=0; + + input=open_ecorecorddb(filename_buffer,&seqcount,0); + + if (input) + fprintf(stderr,"# Reading file %s containing %d sequences...\n", + filename_buffer, + seqcount); + + return input; +} + +ecoseq_t *ecoseq_iterator(const char *prefix) +{ + static FILE *current_seq_file= NULL; + static int32_t current_file_idx = 1; + static char current_prefix[1024]; + ecoseq_t *seq; + + if (prefix) + { + current_file_idx = 1; + + if (current_seq_file) + fclose(current_seq_file); + + strncpy(current_prefix,prefix,1023); + current_prefix[1024]=0; + + current_seq_file = open_seqfile(current_prefix, + current_file_idx); + + if (!current_seq_file) + return NULL; + + } + + seq = readnext_ecoseq(current_seq_file); + + if (!seq && feof(current_seq_file)) + { + current_file_idx++; + fclose(current_seq_file); + current_seq_file = open_seqfile(current_prefix, + current_file_idx); + + + if (current_seq_file) + seq = readnext_ecoseq(current_seq_file); + } + + return seq; +} diff --git a/src/libecoPCR/ecotax.c b/src/libecoPCR/ecotax.c new file mode 100644 index 0000000..a0ade86 --- /dev/null +++ b/src/libecoPCR/ecotax.c @@ -0,0 +1,329 @@ +#include "ecoPCR.h" +#include +#include +#include + +static ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon); + + /** + * Open the taxonomy database + * @param pointer to the database (.tdx file) + * @return a ecotxidx_t structure + */ +ecotxidx_t *read_taxonomyidx(const char *filename) +{ + int32_t count; + FILE *f; + ecotxidx_t *index; + int32_t i; + + f = open_ecorecorddb(filename,&count,1); + + index = (ecotxidx_t*) 
ECOMALLOC(sizeof(ecotxidx_t) + sizeof(ecotx_t) * (count-1), + "Allocate taxonomy"); + + index->count=count; + for (i=0; i < count; i++){ + readnext_ecotaxon(f,&(index->taxon[i])); + index->taxon[i].parent=index->taxon + (size_t)index->taxon[i].parent; + } + return index; +} + + +int32_t delete_taxonomy(ecotxidx_t *index) +{ + int32_t i; + + if (index) + { + for (i=0; i< index->count; i++) + if (index->taxon[i].name) + ECOFREE(index->taxon[i].name,"Free scientific name"); + + ECOFREE(index,"Free Taxonomy"); + + return 0; + } + + return 1; +} + + + +int32_t delete_taxon(ecotx_t *taxon) +{ + if (taxon) + { + if (taxon->name) + ECOFREE(taxon->name,"Free scientific name"); + + ECOFREE(taxon,"Free Taxon"); + + return 0; + } + + return 1; +} + + +/** + * Read the database for a given taxon a save the data + * into the taxon structure(if any found) + * @param *f pointer to FILE type returned by fopen + * @param *taxon pointer to the structure + * + * @return a ecotx_t structure if any taxon found else NULL + */ +ecotx_t *readnext_ecotaxon(FILE *f,ecotx_t *taxon) +{ + + ecotxformat_t *raw; + int32_t rs; + + raw = read_ecorecord(f,&rs); + + if (!raw) + return NULL; + + if (is_big_endian()) + { + raw->namelength = swap_int32_t(raw->namelength); + raw->parent = swap_int32_t(raw->parent); + raw->rank = swap_int32_t(raw->rank); + raw->taxid = swap_int32_t(raw->taxid); + } + + taxon->parent = (ecotx_t*)(size_t)raw->parent; + taxon->taxid = raw->taxid; + taxon->rank = raw->rank; + + taxon->name = ECOMALLOC((raw->namelength+1) * sizeof(char), + "Allocate taxon scientific name"); + + strncpy(taxon->name,raw->name,raw->namelength); + + return taxon; +} + + +ecotaxonomy_t *read_taxonomy(const char *prefix,int32_t readAlternativeName) +{ + ecotaxonomy_t *tax; + char *filename; + int buffsize; + + tax = ECOMALLOC(sizeof(ecotaxonomy_t), + "Allocate taxonomy structure"); + + buffsize = strlen(prefix)+10; + + filename = ECOMALLOC(buffsize, + "Allocate filename"); + + 
snprintf(filename,buffsize,"%s.rdx",prefix); + + tax->ranks = read_rankidx(filename); + + snprintf(filename,buffsize,"%s.tdx",prefix); + + tax->taxons = read_taxonomyidx(filename); + + if (readAlternativeName) + { + snprintf(filename,buffsize,"%s.ndx",prefix); + tax->names=read_nameidx(filename,tax); + } + else + tax->names=NULL; + return tax; + +} + + + +int32_t delete_ecotaxonomy(ecotaxonomy_t *taxonomy) +{ + if (taxonomy) + { + if (taxonomy->ranks) + ECOFREE(taxonomy->ranks,"Free rank index"); + + if (taxonomy->taxons) + ECOFREE(taxonomy->taxons,"Free taxon index"); + + ECOFREE(taxonomy,"Free taxonomy structure"); + + return 0; + } + + return 1; +} + +ecotx_t *eco_findtaxonatrank(ecotx_t *taxon, + int32_t rankidx) +{ + ecotx_t *current_taxon; + ecotx_t *next_taxon; + + current_taxon = taxon; + next_taxon = current_taxon->parent; + + while ((current_taxon!=next_taxon) && // I' am the root node + (current_taxon->rank!=rankidx)) + { + current_taxon = next_taxon; + next_taxon = current_taxon->parent; + } + + if (current_taxon->rank==rankidx) + return current_taxon; + else + return NULL; +} + +/** + * Get back information concerning a taxon from a taxonomic id + * @param *taxonomy the taxonomy database + * @param taxid the taxonomic id + * + * @result a ecotx_t structure containing the taxonimic information + **/ +ecotx_t *eco_findtaxonbytaxid(ecotaxonomy_t *taxonomy, + int32_t taxid) +{ + ecotx_t *current_taxon; + int32_t taxoncount; + int32_t i; + + taxoncount=taxonomy->taxons->count; + + for (current_taxon=taxonomy->taxons->taxon, + i=0; + i < taxoncount; + i++, + current_taxon++){ + if (current_taxon->taxid==taxid){ + return current_taxon; + } + } + + return (ecotx_t*)NULL; +} + +/** + * Find out if taxon is son of other taxon (identified by its taxid) + * @param *taxon son taxon + * @param parent_taxid taxonomic id of the other taxon + * + * @return 1 is the other taxid math a parent taxid, else 0 + **/ +int eco_isundertaxon(ecotx_t *taxon, + int other_taxid) +{ 
+ ecotx_t *next_parent; + + next_parent = taxon->parent; + + while ( (other_taxid != next_parent->taxid) && + (strcmp(next_parent->name, "root")) ) + { + next_parent = next_parent->parent; + } + + if (other_taxid == next_parent->taxid) + return 1; + else + return 0; +} + +ecotx_t *eco_getspecies(ecotx_t *taxon, + ecotaxonomy_t *taxonomy) +{ + static ecotaxonomy_t *tax=NULL; + static int32_t rankindex=-1; + + if (taxonomy && tax!=taxonomy) + { + rankindex = rank_index("species",taxonomy->ranks); + tax=taxonomy; + } + + if (!tax || rankindex < 0) + ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); + + return eco_findtaxonatrank(taxon,rankindex); +} + +ecotx_t *eco_getgenus(ecotx_t *taxon, + ecotaxonomy_t *taxonomy) +{ + static ecotaxonomy_t *tax=NULL; + static int32_t rankindex=-1; + + if (taxonomy && tax!=taxonomy) + { + rankindex = rank_index("genus",taxonomy->ranks); + tax=taxonomy; + } + + if (!tax || rankindex < 0) + ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); + + return eco_findtaxonatrank(taxon,rankindex); +} + + +ecotx_t *eco_getfamily(ecotx_t *taxon, + ecotaxonomy_t *taxonomy) +{ + static ecotaxonomy_t *tax=NULL; + static int32_t rankindex=-1; + + if (taxonomy && tax!=taxonomy) + { + rankindex = rank_index("family",taxonomy->ranks); + tax=taxonomy; + } + + if (!tax || rankindex < 0) + ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); + + return eco_findtaxonatrank(taxon,rankindex); +} + +ecotx_t *eco_getkingdom(ecotx_t *taxon, + ecotaxonomy_t *taxonomy) +{ + static ecotaxonomy_t *tax=NULL; + static int32_t rankindex=-1; + + if (taxonomy && tax!=taxonomy) + { + rankindex = rank_index("kingdom",taxonomy->ranks); + tax=taxonomy; + } + + if (!tax || rankindex < 0) + ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); + + return eco_findtaxonatrank(taxon,rankindex); +} + +ecotx_t *eco_getsuperkingdom(ecotx_t *taxon, + ecotaxonomy_t *taxonomy) +{ + static ecotaxonomy_t *tax=NULL; + static int32_t rankindex=-1; + + if (taxonomy && tax!=taxonomy) + { + rankindex 
= rank_index("superkingdom",taxonomy->ranks); + tax=taxonomy; + } + + if (!tax || rankindex < 0) + ECOERROR(ECO_ASSERT_ERROR,"No taxonomy defined"); + + return eco_findtaxonatrank(taxon,rankindex); +} diff --git a/src/libecoprimer/Makefile b/src/libecoprimer/Makefile new file mode 100644 index 0000000..878ea1b --- /dev/null +++ b/src/libecoprimer/Makefile @@ -0,0 +1,39 @@ + +SOURCES = goodtaxon.c \ + readdnadb.c \ + smothsort.c \ + sortword.c \ + hashsequence.c \ + strictprimers.c \ + aproxpattern.c \ + merge.c \ + queue.c \ + libstki.c \ + sortmatch.c \ + pairtree.c \ + pairs.c \ + taxstats.c \ + apat_search.c \ + filtering.c \ + PrimerSets.c \ + ahocorasick.c + +SRCS=$(SOURCES) + +OBJECTS= $(patsubst %.c,%.o,$(SOURCES)) + +LIBFILE= libecoprimer.a +RANLIB= ranlib + + +include ../global.mk + + +all: $(LIBFILE) + +clean: + rm -rf $(OBJECTS) $(LIBFILE) + +$(LIBFILE): $(OBJECTS) + ar -cr $@ $? + $(RANLIB) $@ diff --git a/src/libecoprimer/PrimerSets.c b/src/libecoprimer/PrimerSets.c new file mode 100644 index 0000000..ae4d957 --- /dev/null +++ b/src/libecoprimer/PrimerSets.c @@ -0,0 +1,1770 @@ +#include +#include +#include + +//#include "ecoprimer.h" +#include "PrimerSets.h" + +int TabuList[PRIMERS_IN_SET_COUNT]; +int next_tabu_slot = -1; +int total_pairs = -1; +//int32_t total_wi = -1; + +int32_t counttaxon(int32_t taxid); +int find_in_tabu (int index); + +int32_t count_taxons (int32_t taxid) +{ + static int32_t count = 0; + static int32_t slots = 0; + static int32_t *taxon_array = NULL; + int32_t i; + + if (taxid == -1) + { + if (taxon_array) + ECOFREE (taxon_array, "Could not free memory for taxon array"); + taxon_array = NULL; + slots = 0; + count = 0; + } + else + { + for (i = 0; i < count; i++) + { + if (taxid == taxon_array[i]) return count; + } + + if (count == slots) + { + slots += 500; + + if (taxon_array == NULL) + { + taxon_array = (int32_t *) ECOMALLOC(slots*sizeof (int32_t), + "Could not allocate memory for taxon array"); + } + else + taxon_array = 
(int32_t *) ECOREALLOC(taxon_array, slots*sizeof (int32_t), + "Could not reallocate memory for taxon array"); + } + taxon_array[count] = taxid; + count++; + } + return count; +} + +float get_set_coverage (pairset *p_set, SetParams *pparams, int32_t pidx_toexclude) +{ + int32_t i, j; + float cov; + int32_t s_intaxa = 0; + int32_t seqcount; + + //counttaxon(-1); + count_taxons (-1); + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + { + if (p_set->set_pairs[i] == -1 || pidx_toexclude == i) continue; + + seqcount=pparams->sortedpairs[p_set->set_pairs[i]]->pcr.ampcount; + for (j=0; j < seqcount; j++) + if (pparams->sortedpairs[p_set->set_pairs[i]]->pcr.amplifias[j].sequence->isexample + && pparams->sortedpairs[p_set->set_pairs[i]]->pcr.amplifias[j].sequence->ranktaxonid > 0 ) + s_intaxa = count_taxons(pparams->sortedpairs[p_set->set_pairs[i]]->pcr.amplifias[j].sequence->ranktaxonid); + + } + //fprintf(stderr, "%d/%d\n", s_intaxa, pparams->options->intaxa); + p_set->set_intaxa = s_intaxa; + cov = s_intaxa*1.0/(pparams->options->intaxa*1.0); + count_taxons (-1); + return cov; +} + +void set_cov_spc (pairset *p_set, SetParams *pparams) +{ + int32_t i; + int32_t ssp = 0; + + count_taxons (-1); + for (i = 0; i < pparams->options->dbsize; i++) + if (p_set->set_wellIdentifiedTaxa[i] == 1) + ssp = count_taxons (pparams->seqdb[i]->ranktaxonid); + + //set coverage + p_set->set_coverage = get_set_coverage (p_set, pparams, -1); + + //set specificity + p_set->set_specificity = ((float)ssp)/((float)p_set->set_intaxa); + p_set->set_wi_cnt = ssp; + count_taxons (-1); +} + +//currently used only to open dead lock +void tabu_aspiration (pairset *pair_set) +{ + int i; + int count = 0; + + for (i=0; iset_pairs[i] != -1) + count++; + if (TabuList[i] != -1) + count++; + } + + if (count == total_pairs) + for (i=0; iset_pairs[i] == id) return 0; + + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + if (pair_set->set_pairs[i] != -1) secnt++; + + //if (secnt == PRIMERS_IN_SET_COUNT) return 0; + if 
(secnt == 0) return 1; + + lnk_prcnt = 1.0; + if (secnt > pparams->options->links_cnt) + lnk_prcnt = (pparams->options->links_cnt*1.0)/(secnt*1.0); + + //TR 6/2/11: new elements must have some links with atleast one elem of set + if (get_links_distribution (id, pair_set, pparams) < lnk_prcnt) return 0; + + //if in tabu search search tabu list as well + if (next_tabu_slot != -1) + { + //effency_switch is only there to avoid tabu_aspiration execution + //each time + effency_switch++; + if ((effency_switch%5) == 0) + { + effency_switch=1; + tabu_aspiration(pair_set); + } + + i = find_in_tabu (id); + if (i != -1) return 0; + } + return 1; +} + +pairset build_primers_set_greedy_cov (SetParams *params) +{ + pairset pair_set; + int32_t i; + int32_t pset_idx; + int32_t prb_idx; + + memset (&pair_set, 0, sizeof(pairset)); + pair_set.set_wellIdentifiedTaxa = (int *) ECOMALLOC(params->options->dbsize*sizeof (int), + "Could not allocate memory for pair set"); + + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + pair_set.set_pairs[i] = -1; + + pset_idx = 0; + prb_idx = 0; + //add first pair by default, this should be the one having highiest specificty + add_pair_in_set (&pair_set, pset_idx, prb_idx, params); + pset_idx++; + + while (pset_idx < PRIMERS_IN_SET_COUNT) + { + if (pair_set.set_coverage == 1.0) break; + prb_idx = get_next_option_increasing_cov (&pair_set, params); + if (prb_idx == 0) break; + add_pair_in_set (&pair_set, pset_idx, prb_idx, params); + pset_idx++; + } + //get_set_mean_cov_stats (&pair_set, ¶ms); + reset_set_props (&pair_set, params); + + return pair_set; +} + +int32_t get_next_option_increasing_cov (pairset *pair_set, SetParams *pparams) +{ + float ini_set_cov; + float set_cov; + float cov_diff = 0.; + int32_t i, id_slot = -1, max_id = 0; + + //coverage already 1, dont proceed. 
+ if (pair_set->set_coverage == 1.0) return 0; + + //find next available slot in the set + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + if (pair_set->set_pairs[i] == -1) + { + id_slot = i; + break; + } + //set already full + if (id_slot == -1) return 0; + //save original set coverage + ini_set_cov = pair_set->set_coverage; + for (i = 1; i < pparams->sorted_count; i++) + { + if (ok_to_add (i, pair_set, pparams) == 0) continue; + pair_set->set_pairs[id_slot] = i; + set_cov = get_set_coverage (pair_set, pparams, -1); + if ((set_cov - ini_set_cov) > cov_diff) + { + cov_diff = set_cov - ini_set_cov; + max_id = i; + } + } + pair_set->set_pairs[id_slot] = -1; + return max_id; +} + + +//1. Add in set the first pair having highiest specificity +//2. For the sequences not WI by primers in set, calculate count for each pair equal to number of seqs WI by it +//3. Take the pair with highiest such count and see its links with primers already in set, +//4. If no/insufficient links, take next pair else add pair in set +//5. repeate 3,4 untill pair gets added in set +//6. 
Repeate 2 to 5 until set completes + +pairset build_primers_set_greedy_spc (SetParams *params) +{ + pairset pair_set; + int32_t i; + int32_t pset_idx; + int32_t prb_idx; + int *pair_wi_count_sorted_ids; + + memset (&pair_set, 0, sizeof(pairset)); + pair_set.set_wellIdentifiedTaxa = (int *) ECOMALLOC(params->options->dbsize*sizeof (int), + "Could not allocate memory for pair set"); + + pair_wi_count_sorted_ids = (int *) ECOMALLOC(params->sorted_count*sizeof (int), + "Could not allocate memory for pair_wi_count_sorted"); + + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + pair_set.set_pairs[i] = -1; + + pset_idx = 0; + prb_idx = 0; + //add first pair by default, this should be the one having highiest specificty + add_pair_in_set (&pair_set, pset_idx, prb_idx, params); + pset_idx++; + + while (pset_idx < PRIMERS_IN_SET_COUNT) + { + //get a sorted list of pair ids with the pair well identifying most of the remaining seqs at top + get_next_pair_options (pair_wi_count_sorted_ids, &pair_set, params); + + if (pair_wi_count_sorted_ids[0] == 0) + { + fprintf (stderr, "No further pair found, total so far %d\n", pset_idx); + break; + } + + for (prb_idx = 0; prb_idx < params->sorted_count; prb_idx++) + { + if (pair_wi_count_sorted_ids[prb_idx]) + if (ok_to_add (pair_wi_count_sorted_ids[prb_idx], &pair_set, params)) + { + //fprintf (stderr, "Oktoadd\n"); + add_pair_in_set (&pair_set, pset_idx, pair_wi_count_sorted_ids[prb_idx], params); + pset_idx++; + } + if (pset_idx == PRIMERS_IN_SET_COUNT) break; + } + + if (prb_idx == params->sorted_count) + { + fprintf (stderr, "No further pair found, total so far %d\n", pset_idx); + break; + } + + if (pair_set.set_specificity == 1.0) + { + fprintf (stderr, "Set Complete with total primers: %d\n", pset_idx); + break; + } + + } + //get_set_mean_cov_stats (&pair_set, ¶ms); + reset_set_props (&pair_set, params); + return pair_set; +} + +float get_links_distribution (int prb_idx, pairset *pair_set, SetParams *pparams) +{ + int i, j; + int 
*pair_link_count; + int *pwi; + int *pswi; + float pcnt = 0.0; + float pscnt = 0.0; + + pair_link_count = (int *) ECOMALLOC(PRIMERS_IN_SET_COUNT*sizeof (int), + "Could not allocate memory for pair_link_count"); + + pwi = pparams->sortedpairs[prb_idx]->wellIdentifiedSeqs; + //fprintf(stderr, "a,"); + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + { + pair_link_count[i] = 0; + if (pair_set->set_pairs[i] != -1) + { + //fprintf(stderr, "b,"); + pswi = pparams->sortedpairs[pair_set->set_pairs[i]]->wellIdentifiedSeqs; + for (j = 0; j < pparams->options->dbsize; j++) + if (pwi[j] == 1 && pwi[j] == pswi[j]) + pair_link_count[i] += 1; + } + } + //fprintf(stderr, "c,"); + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + { + if (pair_set->set_pairs[i] != -1) + pscnt++; + if (pair_link_count[i] > 0) + pcnt++; + } + ECOFREE (pair_link_count, "Could not free memory for pair_link_count"); + //fprintf(stderr, "d,"); + return (pcnt/pscnt); +} + + +void get_next_pair_options (int *pair_wi_count_sorted_ids, pairset *pair_set, SetParams *pparams) +{ + int *pair_count; + int32_t i, j; + int max; + int tmp; + + pair_count = (int *) ECOMALLOC(pparams->sorted_count*sizeof (int), + "Could not allocate memory for pair_count"); + + memset (pair_wi_count_sorted_ids, 0, pparams->sorted_count*sizeof(int)); + + for (i = 0; i < pparams->options->dbsize; i++) + { + if (pair_set->set_wellIdentifiedTaxa[i] == 1) continue; + + for (j = 0; j < pparams->sorted_count; j++) + { + if (pparams->sortedpairs[j]->wellIdentifiedSeqs[i] == 1) + pair_count[j] += 1; + } + } + + //set pair ids + for (j = 0; j < pparams->sorted_count; j++) + pair_wi_count_sorted_ids[j] = j; + + //set count of primers already in set to zero (it should already be zero) just for testing + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + if (pair_set->set_pairs[i] != -1) + pair_count[pair_set->set_pairs[i]] = 0; + + //sort two arrays in descending wrt count + for (i = 0; i < pparams->sorted_count - 1; i++) + { + max = i; + for (j = i + 1; j < 
pparams->sorted_count; j++) + if (pair_count[max] < pair_count[j]) + max = j; + + if (max > i) + { + tmp = pair_count[i]; + pair_count[i] = pair_count[max]; + pair_count[max] = tmp; + + tmp = pair_wi_count_sorted_ids[i]; + pair_wi_count_sorted_ids[i] = pair_wi_count_sorted_ids[max]; + pair_wi_count_sorted_ids[max] = tmp; + } + } + + for (i = 0; i < pparams->sorted_count - 1; i++) + if (pair_count[i] == 0) + pair_wi_count_sorted_ids[i] = 0; + //else + // fprintf (stderr, "%d:%d, ", i, pair_count[i]); + + ECOFREE (pair_count, "Could not free memory for pair_count"); +} + +void add_pair_in_set (pairset *pair_set, int32_t pset_idx, int32_t prb_idx, SetParams *pparams) +{ + int *pwi; + int32_t i; + + if (prb_idx < 0 || prb_idx >= pparams->sorted_count) return; + pair_set->set_pairs[pset_idx] = prb_idx; + +// fprintf (stderr, "%d:", prb_idx); + //fprintf (stderr, "%d:", pparams->sortedpairs[prb_idx]); + //fprintf (stderr, "%d:", pparams->sortedpairs[prb_idx]->wellIdentifiedSeqs); + //fprintf (stderr, "%d:%d, ", i, pair_count[i]); + + pwi = pparams->sortedpairs[prb_idx]->wellIdentifiedSeqs; + for (i = 0; i < pparams->options->dbsize; i++) + if (pwi[i] == 1) + pair_set->set_wellIdentifiedTaxa[i] = 1; + + set_cov_spc (pair_set, pparams); +} + +int isthisset (pairset *pair_set) +{ + int set1[PRIMERS_IN_SET_COUNT]; + int set2[PRIMERS_IN_SET_COUNT]; + int i,j=0,k; + for (i=0; iset_pairs[i] != -1) set2[j++]=pair_set->set_pairs[i]; + } + set1[0]=0;set1[1]=2;set1[2]=7; + if (j==3) + { + for(i=0;i<3;i++) + { + for(k=0;k<3;k++) + if (set2[k]==set1[i])break; + if (k==3)break; + } + if(i==3) return 1;//found + } + + set1[0]=0;set1[1]=2;set1[2]=37; + if (j==3) + { + for(i=0;i<3;i++) + { + for(k=0;k<3;k++) + if (set2[k]==set1[i])break; + if (k==3)break; + } + if(i==3) return 1;//found + } + return 0; +} + +void get_set_mean_cov_stats (pairset *pair_set, SetParams *pparams) +{ + int32_t i, j, k; + int interseq_vals[PRIMERS_IN_SET_COUNT*PRIMERS_IN_SET_COUNT]; + int interseq_cnt = 0; + 
double msum; + int *p1wi; + int *p2wi; + int dbg=0; + + dbg=isthisset(pair_set); + + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + { + if (pair_set->set_pairs[i] == -1) continue; + p1wi = pparams->sortedpairs[pair_set->set_pairs[i]]->wellIdentifiedSeqs; + + if(dbg) + { + printf ("\n\nWellIdentified for primer pair:%d\n",pair_set->set_pairs[i]); + for (k = 0; k < pparams->options->dbsize; k++) + if(p1wi[k]==1) + printf("%d,",k); + printf("\n"); + } + + for (j = i+1; j < PRIMERS_IN_SET_COUNT; j++) + { + if (pair_set->set_pairs[j] == -1) continue; + p2wi = pparams->sortedpairs[pair_set->set_pairs[j]]->wellIdentifiedSeqs; + interseq_vals[interseq_cnt] = 0; + + if (dbg) + { + printf ("Intersection for %d and %d:\n", pair_set->set_pairs[i], pair_set->set_pairs[j]); + } + + for (k = 0; k < pparams->options->dbsize; k++) + if (p1wi[k] == 1 && p2wi[k] == 1) + { + interseq_vals[interseq_cnt]++; + if(dbg) + printf("%d,",k); + } + if(dbg) + printf("\n"); + interseq_cnt++; + } + } + + //calculate mean + msum = 0; + pair_set->set_score = 0; + pair_set->set_lmean = 0; + pair_set->set_lcov = -1; + if (interseq_cnt == 0) return; + + for (i = 0; i < interseq_cnt; i++) + msum += interseq_vals[i]; + pair_set->set_lmean = msum/interseq_cnt; + + msum = 0; + for (i = 0; i < interseq_cnt; i++) + msum += (interseq_vals[i] - pair_set->set_lmean)*(interseq_vals[i] - pair_set->set_lmean); + pair_set->set_lcov = msum/interseq_cnt; + + if (pair_set->set_lcov != 0) + //pair_set->set_score = (pair_set->set_coverage*pair_set->set_specificity*pair_set->set_specificity*(sqrt(pair_set->set_lmean)))/sqrt(sqrt(pair_set->set_lcov)); + pair_set->set_score = (pair_set->set_coverage*pair_set->set_specificity*(pair_set->set_lmean))/sqrt(pair_set->set_lcov); +} + +void get_set_mean_cov_normalised_stats (pairset *pair_set, SetParams *pparams) +{ + int32_t i, j, k; + int interseq_vals[PRIMERS_IN_SET_COUNT*PRIMERS_IN_SET_COUNT]; + int interseq_cnt = 0; + double msum; + int *p1wi; + int *p2wi; + int dbg=0; + + 
dbg=isthisset(pair_set); + + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + { + if (pair_set->set_pairs[i] == -1) continue; + p1wi = pparams->sortedpairs[pair_set->set_pairs[i]]->wellIdentifiedSeqs; + + if(dbg) + { + printf ("\n\nWellIdentified for primer pair:%d\n",pair_set->set_pairs[i]); + for (k = 0; k < pparams->options->dbsize; k++) + if(p1wi[k]==1) + printf("%d,",k); + printf("\n"); + } + + for (j = i+1; j < PRIMERS_IN_SET_COUNT; j++) + { + if (pair_set->set_pairs[j] == -1) continue; + p2wi = pparams->sortedpairs[pair_set->set_pairs[j]]->wellIdentifiedSeqs; + interseq_vals[interseq_cnt] = 0; + + if (dbg) + { + printf ("Intersection for %d and %d:\n", pair_set->set_pairs[i], pair_set->set_pairs[j]); + } + + for (k = 0; k < pparams->options->dbsize; k++) + if (p1wi[k] == 1 && p2wi[k] == 1) + { + interseq_vals[interseq_cnt]++; + if(dbg) + printf("%d,",k); + } + if(dbg) + printf("\n"); + interseq_cnt++; + } + } + + //calculate mean + msum = 0; + pair_set->set_score = 0; + pair_set->set_lmean = 0; + pair_set->set_lcov = -1; + if (interseq_cnt == 0) return; + + for (i = 0; i < interseq_cnt; i++) + msum += interseq_vals[i]; + pair_set->set_lmean = msum/interseq_cnt; + + msum = 0; + for (i = 0; i < interseq_cnt; i++) + msum += (interseq_vals[i] - pair_set->set_lmean)*(interseq_vals[i] - pair_set->set_lmean); + pair_set->set_lcov = msum/interseq_cnt; + + if (pair_set->set_lcov != 0) + { + //normalised links + double nl = pair_set->set_lmean/sqrt (pair_set->set_lcov); + nl = nl/pparams->options->insamples; //max links cannot be more than insample value + pair_set->set_score = pair_set->set_coverage*pair_set->set_specificity*nl; + } +} + +void reset_set_props (pairset *pair_set, SetParams *pparams) +{ + int *pwi; + int i, j; + + memset (pair_set->set_wellIdentifiedTaxa, 0, pparams->options->dbsize*sizeof(int)); + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + { + if (pair_set->set_pairs[i] == -1) continue; + pwi = 
pparams->sortedpairs[pair_set->set_pairs[i]]->wellIdentifiedSeqs; + for (j = 0; j < pparams->options->dbsize; j++) + if (pwi[j] == 1) + pair_set->set_wellIdentifiedTaxa[j] = 1; + } + + set_cov_spc (pair_set, pparams); + + //TR 6/2/11: commented following, now score is just product of spc and cov + //get_set_mean_cov_stats (pair_set, pparams); + //get_set_mean_cov_normalised_stats (pair_set, pparams); + //pair_set->set_score = pair_set->set_coverage*pair_set->set_specificity; + pair_set->set_score = pair_set->set_coverage; + //pair_set->set_score = pair_set->set_specificity; +} + +void print_set_info (pairset *pair_set, SetParams *pparams) +{ + int i; + int printed1st = 0; + + //TR 6/2/11: commented following, now score is just product of spc and cov + //printf ("%4.3f\t%4.3f\t%4.3f\t%4.3f\t%4.3f\t", pair_set->set_specificity, + // pair_set->set_coverage,pair_set->set_lmean, + // sqrt(pair_set->set_lcov),pair_set->set_score); + + printf ("%4.3f\t%4.3f\t%4.3f\t", pair_set->set_coverage, + pair_set->set_specificity,pair_set->set_score); + + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + { + if (pair_set->set_pairs[i] == -1) continue; + + if (printed1st) + printf (":%d", pair_set->set_pairs[i]); + else + printf ("%d", pair_set->set_pairs[i]); + printed1st = 1; + } + printf ("\t%d\t%d", pair_set->set_intaxa, pair_set->set_wi_cnt); + printf ("\n"); +} + +void some_other_set_possibilities (pairset *pair_set, + ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options) +{ + SetParams params; + pairset tmp_pair_set; + int i, j; + int *wi; + + params.sortedpairs = sortedpairs; + params.sorted_count = sorted_count; + params.seqdb = seqdb; + params.options = options; + wi = (int *) ECOMALLOC(options->dbsize*sizeof (int), + "Could not allocate memory for pair set wi"); + //print stats for first original set + printf ("\nspecificity\tcoverage\tmean\tcovariance\tscore\tprimers\n"); + print_set_info (pair_set, ¶ms); + + for (i = 0; i < PRIMERS_IN_SET_COUNT; 
i++) + { + if (pair_set->set_pairs[i] == -1) continue; + + for (j = 0; j < sorted_count; j++) + { + if (ok_to_add (j, pair_set, ¶ms)) + { + tmp_pair_set = *pair_set; + tmp_pair_set.set_pairs[i] = j; + memset (wi, 0, options->dbsize*sizeof (int)); + tmp_pair_set.set_wellIdentifiedTaxa = wi; + reset_set_props (&tmp_pair_set, ¶ms); + print_set_info (&tmp_pair_set, ¶ms); + } + } + } + ECOFREE (wi, "Could not free memory for pair set wi"); +} + +pairset clone_set (pairset *s, int32_t dbsize) +{ + pairset clone; + + clone = *s; + clone.set_wellIdentifiedTaxa = (int *) ECOMALLOC(dbsize*sizeof (int), + "Could not allocate memory for pair set"); + memcpy (clone.set_wellIdentifiedTaxa, s->set_wellIdentifiedTaxa, dbsize*sizeof (int)); + return clone; +} + +void add_in_tabu (int index) +{ + if (next_tabu_slot == -1) return; + + TabuList[next_tabu_slot] = index; + next_tabu_slot++; + if (next_tabu_slot >= PRIMERS_IN_SET_COUNT) next_tabu_slot = 0; +} + +int find_in_tabu (int index) +{ + int i; + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + if (TabuList[i] == index) return i; + + return -1; +} + +//do random changes in the seed set to generate new set +pairset get_neighbor (pairset *set, SetParams *params) +{ + int pinset = 0; + int i, j, id, cnt; + int how_many_to_replace; + pairset nset; + int replace_idx; + + //take the seed set as next neighbour sets + nset = clone_set (set, params->options->dbsize); + //see how many elements are in this set + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + { + if (nset.set_pairs[i] != -1) pinset++; + } + + if (pinset == params->sorted_count) return nset; + + //Randomly get number of elements to be replaced + //with new unused elements + how_many_to_replace = rand ()%pinset + 1; + //replace these many elements in the seed set + for (i = 0; i < how_many_to_replace; i++) + { + do + { + //we wil choose a random unused element as new element + id = rand ()%params->sorted_count; + }while (ok_to_add (id, &nset, params) == 0); + //again choose a random 
element in the set to replace + replace_idx = rand ()%pinset+1; + cnt = 0; + for (j = 0; j < PRIMERS_IN_SET_COUNT; j++) + { + if (nset.set_pairs[j] != -1) cnt++; + if (cnt == replace_idx) + { + if (next_tabu_slot != -1) + add_in_tabu (nset.set_pairs[j]); + + nset.set_pairs[j] = id; + break; + } + } + } + reset_set_props (&nset, params); + return nset; +} + +//remove a random number of least contributing elements +//from the seed set with random elements from the remaining +pairset get_neighbor4 (pairset *set, SetParams *params) +{ + int pinset = 0; + int i, j, id, id2; + int how_many_to_replace; + pairset nset; + //int replace_idx; + float contribution[PRIMERS_IN_SET_COUNT]; + int usedids[PRIMERS_IN_SET_COUNT]; + float sscore; + float leastContri; + //float lastLeastcontri; + int k, l; + + //take the seed set as next neighbour sets + nset = clone_set (set, params->options->dbsize); + //see how many elements are in this set + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + { + if (nset.set_pairs[i] != -1) pinset++; + } + + if (pinset == params->sorted_count) return nset; + + //Randomly get number of elements to be replaced + //with new unused elements + how_many_to_replace = rand ()%pinset + 1; + + //calculate contribution of each element in the set + sscore = nset.set_score; + //fprintf (stderr, "{%f-", sscore); + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + { + contribution[i] = 10000.0; + if (nset.set_pairs[i] != -1) + { + id = nset.set_pairs[i]; + nset.set_pairs[i] = -1; + reset_set_props (&nset, params); + contribution[i] = sscore - nset.set_score; + //fprintf (stderr, "[%f;%f]", nset.set_score, contribution[i]); + nset.set_pairs[i] = id; + reset_set_props (&nset, params); + //fprintf (stderr, "%f:, ", nset.set_score); + } + usedids[i] = -1; + } + + //lastLeastcontri = 10000.0; + k=0; + //replace these many elements in the seed set + //fprintf (stderr, "} (%d) ", how_many_to_replace); + for (i = 0; i < how_many_to_replace; i++) + { + do + { + //we wil choose a 
random unused element as new element + id = rand ()%params->sorted_count; + }while (ok_to_add (id, &nset, params) == 0); + + leastContri = 10000.0; + id2 = -1; + for (j = 0; j < PRIMERS_IN_SET_COUNT; j++) + { + if (nset.set_pairs[j] == -1) continue; + for (l = 0; l < k; l++) + if (usedids[l] == j) break; + + if (leastContri > contribution[j] /*&& leastContri >= lastLeastcontri*/ && l == k) + { + leastContri = contribution[j]; + id2 = j; + //fprintf (stderr, "%f:%d, ", leastContri, id2); + } + } + + if (id2 != -1) + { + usedids[k++] = id2; + //lastLeastcontri = leastContri; + if (next_tabu_slot != -1) + add_in_tabu (nset.set_pairs[id2]); + + nset.set_pairs[id2] = id; + } + } + //fprintf (stderr, "\n"); + reset_set_props (&nset, params); + return nset; +} + +//by replacing one element of the set with next from unused +int which_one_to_replace; +int last_replaced_with; +pairset get_neighbor2 (pairset *set, SetParams *params) +{ + int pinset = 0; + int i, j; + pairset nset; + + //take the seed set as next neighbour sets + nset = clone_set (set, params->options->dbsize); + //see how many elements are in this set + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + { + if (nset.set_pairs[i] != -1) pinset++; + } + + if (pinset == params->sorted_count) return nset; + + if (which_one_to_replace == pinset) which_one_to_replace = 0; + if (last_replaced_with == params->sorted_count) last_replaced_with = 0; + + j = -1; + for (i = 0; i < pinset; i++) + { + if (nset.set_pairs[i] != -1) j++; + if (j == which_one_to_replace) break; + } + + for (j = last_replaced_with; j < params->sorted_count; j++) + if (ok_to_add (j, &nset, params) == 1) break; + + if (j < params->sorted_count) + { + if (next_tabu_slot != -1) + add_in_tabu (nset.set_pairs[i]); + + nset.set_pairs[i] = j; + reset_set_props (&nset, params); + } + last_replaced_with++; + which_one_to_replace++; + + return nset; +} + +//replace element having least contribution with the next from the list +pairset get_neighbor3 (pairset 
*set, SetParams *params) +{ + int pinset = 0; + int i, j, id; + pairset nset; + float least_contribution; + float sscore; + + //take the seed set as next neighbour sets + nset = clone_set (set, params->options->dbsize); + //see how many elements are in this set + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + { + if (nset.set_pairs[i] != -1) pinset++; + } + + if (pinset == params->sorted_count) return nset; + + if (last_replaced_with == params->sorted_count) last_replaced_with = 0; + + sscore = nset.set_score; + which_one_to_replace = -1; + least_contribution = 1000.0; //impossible + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + { + if (nset.set_pairs[i] != -1) + { + id = nset.set_pairs[i]; + nset.set_pairs[i] = -1; + reset_set_props (&nset, params); + if ((sscore - nset.set_score) < least_contribution) + { + which_one_to_replace = i; + least_contribution = sscore - nset.set_score; + } + nset.set_pairs[i] = id; + } + } + + for (j = last_replaced_with; j < params->sorted_count; j++) + if (ok_to_add (j, &nset, params) == 1) break; + + if (j < params->sorted_count && which_one_to_replace != -1) + { + if (next_tabu_slot != -1) + add_in_tabu (nset.set_pairs[which_one_to_replace]); + + nset.set_pairs[which_one_to_replace] = j; + reset_set_props (&nset, params); + } + last_replaced_with++; + + return nset; +} + +float sa_P (float score, float nscore, float tem) +{ + if (nscore >= score) return 0.95; + if (nscore < score && tem > 0.3) return 0.1; + if (nscore < score && tem > 0.1) return 0.0001; + return 0.0; +} + +void extend_set_to (pairset *pair_set, SetParams *params, int extend_to_cnt) +{ + int pinset = 0; + int i, j; + if (extend_to_cnt > PRIMERS_IN_SET_COUNT) return; + + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + if (pair_set->set_pairs[i] != -1) pinset++; + + if (pinset >= extend_to_cnt) return; + + for (i = 0; i < params->sorted_count; i++) + { + if (ok_to_add (i, pair_set, params) == 1) + { + for (j = 0; j < PRIMERS_IN_SET_COUNT; j++) + if (pair_set->set_pairs[j] 
== -1) break; + add_pair_in_set (pair_set, j, i, params); + reset_set_props (pair_set, params); + pinset++; + if (pinset >= extend_to_cnt) break; + } + } +} + +pairset * extend_set_randomly (pairset *pset, SetParams *params, int extend_to_cnt) +{ + int pinset = 0; + int i = 0; + int id; + pairset *pair_set; + + if (extend_to_cnt > PRIMERS_IN_SET_COUNT) return pset; + + if (pset == NULL) + { + pair_set = (pairset *) ECOMALLOC(sizeof (pairset), + "Could not allocate memory for pair"); + pair_set->set_wellIdentifiedTaxa = (int *) ECOMALLOC(params->options->dbsize*sizeof (int), + "Could not allocate memory for pair set WI"); + + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + pair_set->set_pairs[i] = -1; + } + else + pair_set = pset; + + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + if (pair_set->set_pairs[i] != -1) pinset++; + + if (pinset >= extend_to_cnt) return pair_set; + + i = 0; + if (pinset == 0) + { + id = rand ()%params->sorted_count; + add_pair_in_set (pair_set, i, id, params); + i++; + pinset++; + } + + while (pinset < extend_to_cnt) + { + do + { + //we wil choose a random unused element as new element + id = rand ()%params->sorted_count; + }while (ok_to_add (id, pair_set, params) == 0); + add_pair_in_set (pair_set, i, id, params); + i++; + pinset++; + } + return pair_set; +} + +void set_reduce_to_best (pairset *pair_set, SetParams *params) +{ + int original_members[PRIMERS_IN_SET_COUNT]; + int i; + int mcnt = 0; + float max_score; + int m_to_remove = -1; + int tmem; + + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + { + original_members[i] = pair_set->set_pairs[i]; + if (original_members[i] != -1) mcnt++; + } + + while (mcnt > 3) + { + max_score = pair_set->set_score; + m_to_remove = -1; + + for (i = 0; i < PRIMERS_IN_SET_COUNT; i++) + { + if (pair_set->set_pairs[i] == -1) continue; + tmem = pair_set->set_pairs[i]; + pair_set->set_pairs[i] = -1; //remove current element temporarily + reset_set_props (pair_set, params); + pair_set->set_pairs[i] = tmem; //restore 
+ + if (max_score <= pair_set->set_score) + { + max_score = pair_set->set_score; + m_to_remove = i; + } + } + + if (m_to_remove != -1) + { + pair_set->set_pairs[m_to_remove] = -1; //remove element + reset_set_props (pair_set, params); + mcnt--; + } + else + { + reset_set_props (pair_set, params); + break; + } + } +} + +float sa_temp (int k, int kmax) +{ + return ((kmax - k)*1.0)/(kmax*1.0); +} + +void sets_by_SimulatedAnealing (pairset *pair_set, + ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options) +{ + SetParams params; + pairset set, bset, nset; + float score, bscore, nscore; + int k = 0, mcnt=0; + int kmax = sorted_count*10; + float max_score = 10000.0; + float min_spc, min_cov; + float max_spc, max_cov; + double randval = 0; + + int32_t count_per_size; + int32_t size_counter = 1; + + srand ( time(NULL) ); + params.sortedpairs = sortedpairs; + params.sorted_count = sorted_count; + params.seqdb = seqdb; + params.options = options; + which_one_to_replace = 0; + last_replaced_with = 0; + next_tabu_slot = -1; //dont use tabu search props + total_pairs = sorted_count; + + if (pair_set == NULL) + { + pair_set = extend_set_randomly (NULL, ¶ms, 3); + printf("\nStart Random seed set for Simulated :\n"); + print_set_info (&pair_set, ¶ms); + } + min_spc = max_spc = pair_set->set_specificity; + min_cov = max_cov = pair_set->set_coverage; + + for (k = 0; k < PRIMERS_IN_SET_COUNT; k++) + { + if (pair_set->set_pairs[k] != -1) mcnt++; + } + count_per_size = kmax/(PRIMERS_IN_SET_COUNT-mcnt); + k = 1; + + set = clone_set (pair_set, options->dbsize); + + /*if (mcnt < 5) + { + printf ("Set before extension:\n"); + print_set_info (&set, ¶ms); + extend_set_to (&set, ¶ms, 5); + printf ("Set after extension:\n"); + print_set_info (&set, ¶ms); + set_reduce_to_best (&set, ¶ms); + printf ("Set after reduction to best size:\n"); + print_set_info (&set, ¶ms); + }*/ + mcnt++; + extend_set_to (&set, ¶ms, mcnt); + + bset = clone_set (&set, options->dbsize); + 
score = bset.set_score; + bscore = score; + nset.set_wellIdentifiedTaxa = NULL; + + //srand ( time(NULL) ); + + while (k <= kmax && score < max_score) + { + if (k == (size_counter*count_per_size)) + { + size_counter++; + mcnt++; + extend_set_to (&set, ¶ms, mcnt); + } + //nset = get_neighbor (&set, ¶ms); //all random + //nset = get_neighbor2 (&set, ¶ms); //replace next with next available + //nset = get_neighbor3 (&set, ¶ms); //replace the one with least contribution with next + nset = get_neighbor4 (&set, ¶ms); //replace randome no of least contributing elements with random elements in the remaining set + + if (nset.set_specificity < min_spc) + min_spc = nset.set_specificity; + + if (nset.set_specificity > max_spc) + max_spc = nset.set_specificity; + + if (nset.set_coverage < min_cov) + min_cov = nset.set_coverage; + + if (nset.set_coverage > max_cov) + max_cov = nset.set_coverage; + + nscore = nset.set_score; + printf ("Neighbor: "); + print_set_info (&nset, ¶ms); + + if (nscore > bscore) + { + ECOFREE (bset.set_wellIdentifiedTaxa, "Could not free memory for pair set wi"); + bset = clone_set (&nset, options->dbsize); + bscore = nscore; + printf ("best: "); + print_set_info (&nset, ¶ms); + } + + randval = (double)rand()/(double)RAND_MAX; + if (sa_P (score, nscore, sa_temp (k,kmax)) > randval) + { + ECOFREE (set.set_wellIdentifiedTaxa, "Could not free memory for pair set wi"); + set = clone_set (&nset, options->dbsize); + score = nscore; + which_one_to_replace = 0; + last_replaced_with = 0; + printf ("Seed Set: "); + print_set_info (&set, ¶ms); + } + k++; + ECOFREE (nset.set_wellIdentifiedTaxa, "Could not free memory for pair set wi"); + } + printf ("Minimum specificity: %0.3f, Maximum specificity: %0.3f, range: %0.3f\n",min_spc, max_spc, max_spc-min_spc); + printf ("Minimum coverage: %0.3f, Maximum coverage: %0.3f, range: %0.3f\n",min_cov, max_cov, max_cov-min_cov); +} + + +void sets_by_TabuSearch (pairset *pair_set, + ppair_t * sortedpairs, int32_t sorted_count, 
pecodnadb_t seqdb, poptions_t options) +{ + SetParams params; + pairset set, bset, nset; + float bscore, nscore; + int k = 0, mcnt=0; + int kmax = sorted_count*10; + float max_score = 1000.0; + float min_spc, min_cov; + float max_spc, max_cov; + + int32_t count_per_size; + int32_t size_counter = 1; + + srand ( time(NULL) ); + params.sortedpairs = sortedpairs; + params.sorted_count = sorted_count; + params.seqdb = seqdb; + params.options = options; + which_one_to_replace = 0; + last_replaced_with = 0; + next_tabu_slot = 0; //use tabu search + total_pairs = sorted_count; + + if (pair_set == NULL) + pair_set = extend_set_randomly (NULL, ¶ms, 3); + min_spc = max_spc = pair_set->set_specificity; + min_cov = max_cov = pair_set->set_coverage; + + for (k = 0; k < PRIMERS_IN_SET_COUNT; k++) + { + if (pair_set->set_pairs[k] != -1) mcnt++; + } + count_per_size = kmax/(PRIMERS_IN_SET_COUNT-mcnt); + + set = clone_set (pair_set, options->dbsize); + /*if (mcnt < 5) + { + printf ("Set before extension:\n"); + print_set_info (&set, ¶ms); + extend_set_to (&set, ¶ms, 5); + printf ("Set after extension:\n"); + print_set_info (&set, ¶ms); + set_reduce_to_best (&set, ¶ms); + printf ("Set after reduction to best size:\n"); + print_set_info (&set, ¶ms); + }*/ + mcnt++; + extend_set_to (&set, ¶ms, mcnt); + + bset = clone_set (&set, options->dbsize); + bscore = bset.set_score; + nset.set_wellIdentifiedTaxa = NULL; + + for (k = 0; k < PRIMERS_IN_SET_COUNT; k++) + TabuList[k] = -1; + + k = 1; + while (k < kmax && bscore < max_score) + { + if (k == (size_counter*count_per_size)) + { + size_counter++; + mcnt++; + extend_set_to (&set, ¶ms, mcnt); + } + + //nset = get_neighbor (&set, ¶ms); //all random + //nset = get_neighbor2 (&set, ¶ms); //replace next with next available + //nset = get_neighbor3 (&set, ¶ms); //replace the one with least contribution with next + nset = get_neighbor4 (&set, ¶ms); //replace randome no of least contributing elements with random elements in the remaining set + + if 
(nset.set_specificity < min_spc) + min_spc = nset.set_specificity; + + if (nset.set_specificity > max_spc) + max_spc = nset.set_specificity; + + if (nset.set_coverage < min_cov) + min_cov = nset.set_coverage; + + if (nset.set_coverage > max_cov) + max_cov = nset.set_coverage; + + nscore = nset.set_score; + printf ("Neighbor: "); + print_set_info (&nset, ¶ms); + + if (nscore > bscore) + { + ECOFREE (bset.set_wellIdentifiedTaxa, "Could not free memory for pair set wi"); + bset = clone_set (&nset, options->dbsize); + bscore = nscore; + printf ("best: "); + print_set_info (&nset, ¶ms); + } + ECOFREE (set.set_wellIdentifiedTaxa, "Could not free memory for pair set wi"); + set = nset; + k++; + } + printf ("Minimum specificity: %0.3f, Maximum specificity: %0.3f, range: %0.3f\n",min_spc, max_spc, max_spc-min_spc); + printf ("Minimum coverage: %0.3f, Maximum coverage: %0.3f, range: %0.3f\n",min_cov, max_cov, max_cov-min_cov); +} + +pairset *sets_by_BruteForce (ppair_t * sortedpairs, +//void sets_by_BruteForce (ppair_t * sortedpairs, + int32_t sorted_count, pecodnadb_t seqdb, poptions_t options) +{ + SetParams params; + params.sortedpairs = sortedpairs; + params.sorted_count = sorted_count; + params.seqdb = seqdb; + params.options = options; + + pairset set; + pairset *pset = NULL; + if (sorted_count < 3) + { + printf ("Too few primer pairs to find a pair set.\n"); + return NULL; + } + + set.set_wellIdentifiedTaxa = (int *) ECOMALLOC(options->dbsize*sizeof (int), + "Could not allocate memory for pair set WI"); + int32_t set_indeces_array[PRIMERS_IN_SET_COUNT]; + //int start_elements = 3; + int end_elements = 3; + int current_elements = 3; + int32_t i, j; + float maxscore = -1000.0; + int maxcount = 2000; + int counter = 0; + + if (sorted_count <= PRIMERS_IN_SET_COUNT) + end_elements = sorted_count; + + if (end_elements < sorted_count) + { + pset = (pairset *) ECOMALLOC(sizeof (pairset), + "Could not allocate memory for pair"); + pset->set_wellIdentifiedTaxa = (int *) 
ECOMALLOC(options->dbsize*sizeof (int), + "Could not allocate memory for pair set WI"); + } + + while (current_elements <= end_elements) + { + for (i=0; idbsize*sizeof (int)); + for (i=0; i maxscore) + { + maxscore = set.set_score; + printf ("best: "); + print_set_info (&set, ¶ms); + + ECOFREE (pset->set_wellIdentifiedTaxa, "Could not free memory for pair set wi"); + *pset = clone_set (&set, options->dbsize); + } + else + { + printf ("set: "); + print_set_info (&set, ¶ms); + } + + for (i=current_elements-1; i>0; i--) + { + set_indeces_array[i]++; + if (set_indeces_array[i] == (sorted_count+(i-current_elements+1))) + { + set_indeces_array[i] = set_indeces_array[i-1]+2; + for (j=i+1; j= sorted_count) + { + break; + } + if (i < current_elements) //above loop broken? + { + current_elements++; + break; + } + counter++; + if (counter > maxcount) + break; + } + if (counter > maxcount) + break; + } + return pset; +} + +void build_and_print_sets (ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options) +{ + SetParams params; + pairset pair_set; + pairset *pset; + + params.sortedpairs = sortedpairs; + params.sorted_count = sorted_count; + params.seqdb = seqdb; + params.options = options; + + pair_set = build_primers_set_greedy_spc (¶ms); + printf("Greedy algorithm results based on specificity:\n"); + print_set_info (&pair_set, ¶ms); + if (pair_set.set_wellIdentifiedTaxa) + ECOFREE (pair_set.set_wellIdentifiedTaxa, "Could not free memory for pair set wi"); + + pair_set = build_primers_set_greedy_cov (¶ms); + printf("\nGreedy algorithm results based on coverage:\n"); + print_set_info (&pair_set, ¶ms); + if (pair_set.set_wellIdentifiedTaxa) + ECOFREE (pair_set.set_wellIdentifiedTaxa, "Could not free memory for pair set wi"); + + pset = extend_set_randomly (NULL, ¶ms, 3); + printf("\nStart Random seed set:\n"); + print_set_info (pset, ¶ms); + + printf("\nResults from simulated Anealing:\n"); + sets_by_SimulatedAnealing (pset, sortedpairs, sorted_count, 
seqdb, options); + printf("\nResults from Tabu Search:\n"); + sets_by_TabuSearch (pset, sortedpairs, sorted_count, seqdb, options); +} + +void primers_graph_graphviz (ppair_t * sortedpairs, + int32_t sorted_count, poptions_t options) +{ + int32_t i, j, k, total_links; + char fileName[100] = "PrimerLinks"; + int *owi; + int *iwi; + int allowedtaxa; + FILE *of; + + srand ( time(NULL) ); + sprintf (fileName, "PrimerLinks_%d.gv", rand ()); + of = fopen (fileName, "w"); + + fprintf (of, "graph primerlinks {\n"); + for (i=0; iwellIdentifiedSeqs; + + for (j=i+1; jwellIdentifiedSeqs; + total_links = 0; + + for (k=0; kdbsize; k++) + if (owi[k] == 1 && iwi[k] == 1) + total_links++; + + //if (total_links > 0 && ((total_links*1.0) <= (options->max_links_percent*options->dbsize))) + //if (total_links > 0 && total_links <= options->max_links_percent) + + if((sortedpairs[i]->intaxa - sortedpairs[i]->notwellidentifiedtaxa) < (sortedpairs[j]->intaxa - sortedpairs[j]->notwellidentifiedtaxa)) + allowedtaxa = (sortedpairs[i]->intaxa - sortedpairs[i]->notwellidentifiedtaxa)/2; + else + allowedtaxa = (sortedpairs[j]->intaxa - sortedpairs[j]->notwellidentifiedtaxa)/2; + + + //if (total_links > 5 && (total_links <= options->max_links_percent || options->max_links_percent==-1)) + // fprintf (of, "\t%d -- %d [label=\"%d: %0.2f: %0.2f\"];\n", i, j, total_links,sortedpairs[i]->bc,sortedpairs[j]->bc ); + + allowedtaxa = options->max_links_percent; + if (total_links > 5 && total_links < allowedtaxa) + fprintf (of, "\t%d -- %d [label=\"%d: %0.2f: %0.2f\"];\n", i, j, total_links,sortedpairs[i]->bc,sortedpairs[j]->bc ); + //fprintf (of, "\t%d\t%d\t%d\n", i, j, total_links); + + //fprintf (of, "\t%d\t%d\t%d\n", i, j, total_links); + //fprintf (of, "\t%d -- %d;\n", i, j, total_links); + } + } + fprintf (of, "}\n"); + fclose (of); +} + +int32_t *addinset (int32_t *set, int32_t i, int32_t j, int32_t* slots, int32_t *index) +{ + int32_t k; + + if (*index == *slots) + { + *slots += 50; + set = 
ECOREALLOC(set, (*slots)*sizeof (int32_t), + "Could not allocate memory for index set."); + } + + if (i > -1) + { + for (k=0; k<*index; k++) + if (set[k] == i) break; + if (k== *index) + set[(*index)++] = i; + } + + if (j > -1) + { + for (k=0; k<*index; k++) + if (set[k] == j) break; + if (k== *index) + set[(*index)++] = j; + } + + return set; +} + +size_t primers_changeSortedArray (ppair_t ** pairs, + size_t sorted_count, poptions_t options) +{ + int32_t i, j, k, l, total_links; + int *owi; + int *iwi; + int allowedtaxa; + ppair_t *sortedpairs = *pairs; + bool_t passed; + + int32_t *idx_set = NULL; + int32_t slots=50, index=0; + + idx_set = ECOMALLOC(slots*sizeof (int32_t), + "Could not allocate memory for index set."); + + for (i=0; iwellIdentifiedSeqs; + passed = FALSE; + + for (j=0; jwellIdentifiedSeqs; + total_links = 0; + + for (k=0; kdbsize; k++) + if (owi[k] == 1 && iwi[k] == 1) + total_links++; + + //if (total_links > 0 && ((total_links*1.0) <= (options->max_links_percent*options->dbsize))) + //if (total_links > 0 && total_links <= options->max_links_percent) + + if((sortedpairs[i]->intaxa - sortedpairs[i]->notwellidentifiedtaxa) < (sortedpairs[j]->intaxa - sortedpairs[j]->notwellidentifiedtaxa)) + allowedtaxa = (sortedpairs[i]->intaxa - sortedpairs[i]->notwellidentifiedtaxa)/2; + else + allowedtaxa = (sortedpairs[j]->intaxa - sortedpairs[j]->notwellidentifiedtaxa)/2; + + + //if (total_links > 5 && (total_links <= options->max_links_percent || options->max_links_percent==-1)) + // fprintf (of, "\t%d -- %d [label=\"%d: %0.2f: %0.2f\"];\n", i, j, total_links,sortedpairs[i]->bc,sortedpairs[j]->bc ); + + if (options->max_links_percent > 0) + { + allowedtaxa = options->max_links_percent; + if (total_links > allowedtaxa) + passed = TRUE; + break; + } + else + if (!(total_links > 5 && total_links <= allowedtaxa)) + { + //idx_set = addinset (idx_set, i, j, &slots, &index); + passed = TRUE; + break; + } + } + if (passed == TRUE) + idx_set = addinset (idx_set, i, 
-1, &slots, &index); + } + + i=-1; + for (j=0; jwellIdentifiedSeqs, "Cannot free wi for changing sorted array"); + ECOFREE (sortedpairs[j]->pcr.amplifias, "Cannot free wi for changing sorted array"); + if (i == -1) i = j; + } + else + { + if (i != -1) + sortedpairs[i++] = sortedpairs[j]; + } + } + ECOFREE (idx_set, "Cannot free index set."); + if (i != -1) + { + *pairs = ECOREALLOC (*pairs, i*sizeof(pair_t), "Cannot free wi for changing sorted array"); + } + else i=sorted_count; + return i; +} + + +int32_t *addinset_withLinks (int32_t *set, int32_t i, int32_t* slots, int32_t *index, ppair_t *pairs, poptions_t options) +{ + int32_t j, k, total_links; + int *owi; + int *iwi; + bool_t passed = TRUE; + int allowedtaxa; + + //see if we need to extend the set array + if (*index == *slots) + { + *slots += 50; + set = ECOREALLOC(set, (*slots)*sizeof (int32_t), + "Could not allocate memory for index set."); + } + + //find no of links of current element i with all the elements + //in the set to see that they are within limit + owi = pairs[i]->coveredSeqs; + for (j=0; j<*index; j++) + { + iwi = pairs[set[j]]->coveredSeqs; + total_links = 0; + + for (k=0; kdbsize; k++) + if (owi[k] == 1 && iwi[k] == 1) + total_links++; + + //if((pairs[i]->intaxa - pairs[i]->notwellidentifiedtaxa) < (pairs[set[j]]->intaxa - pairs[set[j]]->notwellidentifiedtaxa)) + // allowedtaxa = (pairs[i]->intaxa - pairs[i]->notwellidentifiedtaxa)/2; + //else + // allowedtaxa = (pairs[set[j]]->intaxa - pairs[set[j]]->notwellidentifiedtaxa)/2; + + if(pairs[i]->intaxa < pairs[set[j]]->intaxa) + allowedtaxa = pairs[i]->intaxa/2; + else + allowedtaxa = pairs[set[j]]->intaxa/2; + + if (!(total_links > 5 && total_links <= allowedtaxa)) + passed = FALSE; + } + + //links respect the limits with all set elements + if (passed) + { + for (k=0; k<*index; k++) + if (set[k] == i) break; + if (k== *index) + set[(*index)++] = i; + } + + return set; +} + +size_t primers_filterWithGivenLinks (ppair_t ** pairs, + size_t 
sorted_count, poptions_t options) +{ + int32_t i, j, k; + ppair_t *sortedpairs = *pairs; + bool_t passed; + + int32_t *idx_set = NULL; + int32_t slots=50, index=0; + int *cov = ECOMALLOC(options->dbsize*sizeof (int), + "Could not allocate memory for index set."); + + idx_set = ECOMALLOC(slots*sizeof (int32_t), + "Could not allocate memory for index set."); + + for (i=sorted_count-1; i>=0; i--) + { + idx_set = addinset_withLinks (idx_set, i, &slots, &index, sortedpairs, options); + } + + i=-1; + for (j=0; jcoveredSeqs, "Cannot free wi for changing sorted array"); + ECOFREE (sortedpairs[j]->wellIdentifiedSeqs, "Cannot free wi for changing sorted array"); + ECOFREE (sortedpairs[j]->pcr.amplifias, "Cannot free wi for changing sorted array"); + if (i == -1) i = j; + } + else + { + if (i != -1) + sortedpairs[i++] = sortedpairs[j]; + } + } + ECOFREE (idx_set, "Cannot free index set."); + if (i != -1) + { + *pairs = ECOREALLOC (*pairs, i*sizeof(pair_t), "Cannot free wi for changing sorted array"); + } + else i=sorted_count; + + + for (j=0; jdbsize; k++) + if ((*pairs)[j]->coveredSeqs[k] == 1) + cov[k] = 1; + j=0; + for (k=0; kdbsize; k++) + if (cov[k] == 1) + j++; + fprintf (stderr, "\nALL ELEMENTS COVERAGE: (%d/%d) %0.2f\n", j, options->intaxa, j*1.0/options->intaxa); + ECOFREE (cov, "Cannot free cov"); + + return i; +} diff --git a/src/libecoprimer/PrimerSets.h b/src/libecoprimer/PrimerSets.h new file mode 100644 index 0000000..44d6294 --- /dev/null +++ b/src/libecoprimer/PrimerSets.h @@ -0,0 +1,58 @@ +#ifndef PRIMERSETS_H_ +#define PRIMERSETS_H_ + +#include "ecoprimer.h" + +#define PRIMERS_IN_SET_COUNT 10 + +typedef struct { + int *set_wellIdentifiedTaxa; + int32_t set_pairs[PRIMERS_IN_SET_COUNT]; + float set_specificity; + float set_coverage; + float set_lmean; + float set_lcov; + float set_score; + int32_t set_intaxa; + int32_t set_wi_cnt; +}pairset; + +typedef struct{ + ppair_t* sortedpairs; + int32_t sorted_count; + pecodnadb_t seqdb; + poptions_t options; 
+}SetParams; + +typedef struct{ + float t_spc; //specificity contribution + float t_cov; //coverage contribution + float t_lmd; //link spread difference + float len; //length + float score; //score +}primerscore; + +void add_pair_in_set (pairset *pair_set, int32_t pset_idx, int32_t prb_idx, SetParams *pparams); +void get_next_pair_options (int *pair_wi_count_sorted_ids, pairset *pair_set, SetParams *pparams); +float get_links_distribution (int prb_idx, pairset *prob_set, SetParams *pparams); +pairset build_primers_set_greedy_spc (SetParams *pparams); +void get_set_mean_cov_stats (pairset *prob_set, SetParams *pparams); +void some_other_set_possibilities (pairset *pair_set, + ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options); +void sets_by_SimulatedAnealing (pairset *pair_set, + ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options); +void sets_by_TabuSearch (pairset *pair_set, + ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options); +pairset * sets_by_BruteForce (ppair_t * sortedpairs, + int32_t sorted_count, pecodnadb_t seqdb, poptions_t options); +pairset * extend_set_randomly (pairset *pair_set, SetParams *params, int extend_to_cnt); +void build_and_print_sets (ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options); +int32_t get_next_option_increasing_cov (pairset *pair_set, SetParams *pparams); +void reset_set_props (pairset *pair_set, SetParams *pparams); +void primers_graph_graphviz (ppair_t * sortedpairs, + int32_t sorted_count, poptions_t options); +size_t primers_changeSortedArray (ppair_t ** pairs, + size_t sorted_count, poptions_t options); +size_t primers_filterWithGivenLinks (ppair_t ** pairs, + size_t sorted_count, poptions_t options); +#endif diff --git a/src/libecoprimer/ahocorasick.c b/src/libecoprimer/ahocorasick.c new file mode 100755 index 0000000..078be0d --- /dev/null +++ b/src/libecoprimer/ahocorasick.c @@ -0,0 +1,479 @@ 
+/* + * ahocorasick.h + * + * Created on: 26 march 2011 + * Author: tiayyba + */ +#include +#include "hashencoder.h" +#include "ahocorasick.h" + +void ahoc_graphKeywordTree (aho_state *root); +aho_state *groot = NULL; //just for graph testing + +#define BASEATINDEX(w, l, i) (uint8_t)((((w)&(0x3LLU<<(((l)-(i))*2)))>>(((l)-(i))*2)) & 0x3LLU) + +void ahoc_addOutputElement (aho_state *node, bool_t isdirect, uint32_t idx) +{ + if (!node) return; + if (node->output.count == 0) + node->output.out_set = ECOMALLOC(sizeof(aho_output), + "Cannot allocate memory for aho-corasick state output element"); + else + node->output.out_set = ECOREALLOC(node->output.out_set, (node->output.count+1)*sizeof(aho_output), + "Cannot allocate memory for aho-corasick state output element"); + node->output.out_set[node->output.count].wordidx = idx; + node->output.out_set[node->output.count].isdirect = isdirect; + node->output.count++; +} + +//is the passed output element in the set +bool_t ahoc_isOutputIn (aho_state *node, aho_output ot) +{ + uint32_t i; + + for (i=0; ioutput.count; i++) + if (node->output.out_set[i].isdirect == ot.isdirect && node->output.out_set[i].wordidx == ot.wordidx) return TRUE; + return FALSE; +} + +//take union of output of the two nodes and put in node1 +void ahoc_unionOutputElements (aho_state *node1, aho_state *node2) +{ + uint32_t i; + + for (i=0; ioutput.count; i++) + if (ahoc_isOutputIn (node1, node2->output.out_set[i]) == FALSE) + ahoc_addOutputElement (node1, node2->output.out_set[i].isdirect, node2->output.out_set[i].wordidx); +} + +void ahoc_addKeyword (aho_state *root, word_t w, bool_t isdirect, uint32_t idx, poptions_t options) +{ + uint32_t i; + aho_state *nextnode = root; + uint8_t basecode; + static uint32_t state_id = 0; + + //fprintf (stderr, "%s\n", ecoUnhashWord(w, options->primer_length)); + for (i=1; i<=options->primer_length; i++) + { + basecode = BASEATINDEX (w, options->primer_length, i); + //fprintf (stderr, "%d", basecode); + if 
(nextnode->next[basecode] == NULL) + { + //add new state + nextnode->next[basecode] = ECOMALLOC(sizeof(aho_state), + "Cannot allocate memory for aho-corasick state"); + nextnode = nextnode->next[basecode]; + //initialize state + nextnode->id = ++state_id; + nextnode->next[0]=nextnode->next[1]=nextnode->next[2]=nextnode->next[3]=NULL; + nextnode->fail = NULL; + nextnode->output.count = 0; + } + else + nextnode = nextnode->next[basecode]; + } + //fprintf (stderr, "\n", basecode); + //new pattern addess so add node ouptup element + ahoc_addOutputElement (nextnode, isdirect, idx); +} + +void ahoc_buildKeywordTree (aho_state *root, pwordcount_t words, poptions_t options) +{ + uint32_t i; + if (!root) return; + + //init root + root->id = 0; + root->next[0]=root->next[1]=root->next[2]=root->next[3]=NULL; + root->fail = NULL; + root->output.count = 0; + + //now add each word as a pattern in the keyword tree + for (i=0; isize; i++) + { + //add direct word + word_t w=WORD(words->words[i]); + ahoc_addKeyword (root, w, TRUE, i, options); + + //add reverse word + w=ecoComplementWord(w,options->primer_length); + ahoc_addKeyword (root, w, FALSE, i, options); + } + + //loop on root if some base has no out going edge from roots + for (i=0; i<4; i++) + if (root->next[i] == NULL) + root->next[i] = root; +} + +void ahoc_enqueue (aho_queue *ahoqueue, aho_state *node) +{ + queue_node *q; + if (node == NULL) return; + + q = ECOMALLOC(sizeof(queue_node), + "Cannot allocate memory for aho-corasick queue node"); + q->state_node = node; + q->next = NULL; + + if (ahoqueue->first == NULL) + { + ahoqueue->first = q; + ahoqueue->last = q; + } + else + { + ahoqueue->last->next = q; + ahoqueue->last = q; + } +} + +aho_state *ahoc_dequeue (aho_queue *ahoqueue) +{ + aho_state *node = NULL; + queue_node *q; + + if (ahoqueue->first == NULL) return node; + q = ahoqueue->first; + ahoqueue->first = q->next; + + node = q->state_node; + ECOFREE (q, "Cannot free memory for aho-corasick queue node"); + 
return node; +} + +//set fail links and output sets for the keyword tree +void ahoc_updateForFailAndOutput (aho_state *root) +{ + int32_t i; + aho_queue Q; + aho_state *node_r; + aho_state *node_u; + aho_state *node_v; + + //empty queue + Q.first = NULL; + Q.last = NULL; + + //for us alphabet has 4 elements, A=0, C=1, G=2 and T=3 + for (i=0; i<4; i++) + { + if (root->next[i] != root && root->next[i] != NULL) + { + root->next[i]->fail = root; + ahoc_enqueue (&Q, root->next[i]); + } + } + + //while queue not empty + while (Q.first != NULL) + { + node_r = ahoc_dequeue (&Q); + for (i=0; i<4; i++) + { + if (node_r->next[i] != NULL) + { + node_u = node_r->next[i]; + ahoc_enqueue (&Q, node_u); + node_v = node_r->fail; + while (node_v->next[i] == NULL) + node_v = node_v->fail; + node_u->fail = node_v->next[i]; + ahoc_unionOutputElements (node_u, node_u->fail); + } + } + } +} + +void ahoc_freeKeywordTree (aho_state *node) +{ + int i; + for (i=0; i<4; i++) + if (node->next[i]) + ahoc_freeKeywordTree (node->next[i]); + if (node->output.count > 0) + ECOFREE (node->output.out_set, "Free failed for node output"); + ECOFREE (node, "Free failed for node"); +} + +pprimercount_t ahoc_lookforStrictPrimers (pecodnadb_t database, uint32_t seqdbsize,uint32_t exampleCount, + pwordcount_t words,poptions_t options) +{ + aho_state automaton_root; + aho_state *curr_state; + //uint32_t inSequenceQuorum; + uint32_t outSequenceQuorum; + pprimer_t data; + pprimercount_t primers; + uint32_t i, j, k; + int32_t pos; + uint32_t lmax; + char *base; + int8_t code; + uint32_t goodPrimers=0; + static int iii=0; + + + //inSequenceQuorum = (uint32_t)floor((float)exampleCount * options->sensitivity_quorum); + outSequenceQuorum = (uint32_t)floor((float)(seqdbsize-exampleCount) * options->false_positive_quorum); + + //fprintf(stderr," Primers should be at least present in %d/%d example sequences\n",inSequenceQuorum,exampleCount); + fprintf(stderr," Primers should not be present in more than %d/%d 
counterexample sequences\n",outSequenceQuorum,(seqdbsize-exampleCount)); + + data = ECOMALLOC(words->size * sizeof(primer_t), + "Cannot allocate memory for fuzzy matching results"); + for (i=0; i < words->size; i++) + { + data[i].word=WORD(words->words[i]); + data[i].inexample = 0; + data[i].outexample= 0; + + data[i].directCount=ECOMALLOC(seqdbsize * sizeof(uint32_t), + "Cannot allocate memory for primer position"); + data[i].directPos = ECOMALLOC(seqdbsize * sizeof(poslist_t), + "Cannot allocate memory for primer position"); + data[i].reverseCount=ECOMALLOC(seqdbsize * sizeof(uint32_t), + "Cannot allocate memory for primer position"); + data[i].reversePos = ECOMALLOC(seqdbsize * sizeof(poslist_t), + "Cannot allocate memory for primer position"); + } + + //build keywords automaton + ahoc_buildKeywordTree (&automaton_root, words, options); + //set fail links and output sets + ahoc_updateForFailAndOutput (&automaton_root); + + //debug; print keywordtree in a gv file + //ahoc_graphKeywordTree (&automaton_root); + + //loop on each sequence for its each base and find words + for (i=0; i < seqdbsize; i++) + { + if(database[i]->SQ_length <= options->primer_length) continue; + + lmax = database[i]->SQ_length; + if (!options->circular) + lmax += options->primer_length-1; + curr_state = &automaton_root; + + for (j=0,base=database[i]->SQ; jSQ_length) base=database[i]->SQ; + + //code = encoder[(*base) - 'A']; + code = *base; + //if (iii++ < 30) + // fprintf (stderr, "%d:%d,", *base, code); + if (code < 0 || code > 3) + { + //if error char, start from root for next character + //+forget any incomplete words + curr_state = &automaton_root; + continue; + } + while (curr_state->next[code] == NULL) curr_state = curr_state->fail; + curr_state = curr_state->next[code]; + + //start position of primer is options->primer_length-1 chars back + pos = j-options->primer_length+1; + if (pos < 0) pos = database[i]->SQ_length+pos; + + //set output, if there is some output on this state then + 
//+all words in the output set complete here, so increment their + //+found properties for current sequence + for (k=0; koutput.count; k++) + { + if (curr_state->output.out_set[k].isdirect) + data[curr_state->output.out_set[k].wordidx].directCount[i]++; + else + data[curr_state->output.out_set[k].wordidx].reverseCount[i]++; + + if (options->no_multi_match) + { + if ((data[curr_state->output.out_set[k].wordidx].directCount[i] + + data[curr_state->output.out_set[k].wordidx].reverseCount[i]) > 1) + //since multimach not allowd, set an indication on 1st seq position that + //+ a multimatch was found, so that this word will be filtered out + //+ and because of first postion we wont have to search the whole array + //+ to find if it voilated nomultimatch constraint for some seq + data[curr_state->output.out_set[k].wordidx].directCount[0] = 2; + else + { + if (curr_state->output.out_set[k].isdirect) + //direct word found on jth position of ith sequence + data[curr_state->output.out_set[k].wordidx].directPos[i].value = (uint32_t)pos; + else + //reverse word found on jth position of ith sequence + data[curr_state->output.out_set[k].wordidx].reversePos[i].value = (uint32_t)pos; + } + } + else + { + //okay multi match allowed + if (curr_state->output.out_set[k].isdirect) + { + if (data[curr_state->output.out_set[k].wordidx].directCount[i] == 1) + data[curr_state->output.out_set[k].wordidx].directPos[i].value = (uint32_t)pos; + else + { + //need to create or extend the positions list + if (data[curr_state->output.out_set[k].wordidx].directCount[i] == 2) + { + //for second element, first was put in .value, so dont forget to copy that in the array too + data[curr_state->output.out_set[k].wordidx].directPos[i].pointer = ECOMALLOC(2 * sizeof(uint32_t), + "Cannot allocate memory for primer position"); + data[curr_state->output.out_set[k].wordidx].directPos[i].pointer[0] = data[curr_state->output.out_set[k].wordidx].directPos[i].value; + 
data[curr_state->output.out_set[k].wordidx].directPos[i].pointer[1] = (uint32_t)pos; + } + else + { + //for third or greater element + data[curr_state->output.out_set[k].wordidx].directPos[i].pointer = ECOREALLOC(data[curr_state->output.out_set[k].wordidx].directPos[i].pointer, + data[curr_state->output.out_set[k].wordidx].directCount[i] * sizeof(uint32_t), + "Cannot allocate memory for primer position"); + data[curr_state->output.out_set[k].wordidx].directPos[i].pointer[data[curr_state->output.out_set[k].wordidx].directCount[i]-1] = (uint32_t)pos; + } + } + } + else + { + if (data[curr_state->output.out_set[k].wordidx].reverseCount[i] == 1) + data[curr_state->output.out_set[k].wordidx].reversePos[i].value = (uint32_t)pos; + else + { + //need to create or extend the positions list + if (data[curr_state->output.out_set[k].wordidx].reverseCount[i] == 2) + { + //for second element, first was put in .value, so dont forget to copy that in the array too + data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer = ECOMALLOC(2 * sizeof(uint32_t), + "Cannot allocate memory for primer position"); + data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer[0] = data[curr_state->output.out_set[k].wordidx].reversePos[i].value; + data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer[1] = (uint32_t)pos; + } + else + { + //for third or greater element + data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer = ECOREALLOC(data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer, + data[curr_state->output.out_set[k].wordidx].reverseCount[i] * sizeof(uint32_t), + "Cannot allocate memory for primer position"); + data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer[data[curr_state->output.out_set[k].wordidx].reverseCount[i]-1] = (uint32_t)pos; + } + } + } + } + //dont forget to increment inexample or outexample count, but only once for a sequence + if ((data[curr_state->output.out_set[k].wordidx].directCount[i] + + 
data[curr_state->output.out_set[k].wordidx].reverseCount[i]) == 1) + { + if (database[i]->isexample) + data[curr_state->output.out_set[k].wordidx].inexample++; + else + data[curr_state->output.out_set[k].wordidx].outexample++; + } + } + } + } + + //Only thing that remains is to remove the failed words + for (i=0,j=0; isize; i++) + { + fprintf(stderr,"Primers %5d/%lld analyzed => sequence : %s in %d example and %d counterexample sequences \r", + i+1,words->size,ecoUnhashWord(data[i].word,options->primer_length), + data[i].inexample,data[i].outexample); + + //if (data[i].inexample < inSequenceQuorum || (data[i].directCount[0] == 2 && options->no_multi_match)) + if (data[i].directCount[0] == 2 && options->no_multi_match) + { + //bad word, delete from the array + for (k=0; k 1) + ECOFREE (data[i].directPos[k].pointer, "Cannot free position pointer."); + if (data[i].reverseCount[k] > 1) + ECOFREE (data[i].reversePos[k].pointer, "Cannot free position pointer."); + } + ECOFREE (data[i].directCount, "Cannot free position pointer."); + ECOFREE (data[i].directPos, "Cannot free position pointer."); + ECOFREE (data[i].reverseCount, "Cannot free position pointer."); + ECOFREE (data[i].reversePos, "Cannot free position pointer."); + } + else + { + //data[i].good = data[i].inexample >= inSequenceQuorum && data[i].outexample <= outSequenceQuorum; + data[i].good = data[i].outexample <= outSequenceQuorum; + goodPrimers+=data[i].good? 
1:0; + if (j < i) + data[j] = data[i]; + j++; + } + } + fprintf(stderr,"\n\nOn %lld analyzed primers %d respect quorum conditions\n",words->size,goodPrimers); + fprintf(stderr,"Conserved primers for further analysis : %d/%lld\n",j,words->size); + + primers = ECOMALLOC(sizeof(primercount_t),"Cannot allocate memory for primer table"); + primers->primers=ECOREALLOC(data, + j * sizeof(primer_t), + "Cannot reallocate memory for fuzzy matching results"); + primers->size=j; + + //free memory of keyword table + for (i=0; i<4; i++) + if (automaton_root.next[i] != &automaton_root) + ahoc_freeKeywordTree (automaton_root.next[i]); + + return primers; +} + +void ahoc_graphPrintNodesInfo (aho_state *node, FILE* gfile) +{ + uint32_t i; + fprintf (gfile, "\"%d\"[\n", node->id); + fprintf (gfile, "label=\"%d\\n", node->id); + for (i=0; ioutput.count; i++) + fprintf (gfile, "%d%c,", node->output.out_set[i].wordidx, node->output.out_set[i].isdirect?'d':'r'); + fprintf (gfile, "\"\n];\n"); + + for (i=0; i<4; i++) + if (node->next[i] != NULL && node->next[i] != node) + ahoc_graphPrintNodesInfo (node->next[i], gfile); +} + +void ahoc_graphPrintNodesLinks (aho_state *node, FILE* gfile) +{ + uint32_t i; + static int j=0; + + for (i=0; i<4; i++) + if (node->next[i] != NULL && node->next[i] != node) + { + fprintf (gfile, "\"%d\" -> \"%d\" [\n", node->id, node->next[i]->id); + fprintf (gfile, "label=\"%c\"\n];\n", "ACGT"[i]); + } + + if (j++ < 40) + if (node->fail != NULL && node->fail != groot) + { + fprintf (gfile, "\"%d\" -> \"%d\" [\n", node->id, node->fail->id); + fprintf (gfile, "color= \"red\"\n];\n"); + } + + for (i=0; i<4; i++) + if (node->next[i] != NULL && node->next[i] != node) + ahoc_graphPrintNodesLinks (node->next[i], gfile); +} + +void ahoc_graphKeywordTree (aho_state *root) +{ + FILE *gfile; + + groot=root; + gfile = fopen ("keywordtree.gv", "w"); + fprintf (gfile, "digraph keywordtree {\n"); + ahoc_graphPrintNodesInfo (root, gfile); + ahoc_graphPrintNodesLinks (root, 
gfile); + fprintf (gfile, "}\n"); + fclose(gfile); +} + diff --git a/src/libecoprimer/ahocorasick.h b/src/libecoprimer/ahocorasick.h new file mode 100755 index 0000000..097af4f --- /dev/null +++ b/src/libecoprimer/ahocorasick.h @@ -0,0 +1,43 @@ +/* + * ahocorasick.h + * + * Created on: 26 march 2011 + * Author: tiayyba + */ + +#ifndef H_ahocorasick +#define H_ahocorasick + +#include "ecoprimer.h" + +typedef struct aho_output_t{ + uint32_t wordidx; //index of strict word (dont save the word of 64B) + bool_t isdirect; //we need to find both direct and reverse words so we must know which one is it +}aho_output; + +typedef struct aho_output_count_t{ + uint32_t count; + aho_output *out_set; +}aho_output_count; + +typedef struct aho_state_t{ + int32_t id; + struct aho_state_t *next[4]; //for labels A=0,C=1,G=2 and T=3 + struct aho_state_t *fail; + aho_output_count output; +}aho_state; + +typedef struct queue_node_t { + aho_state *state_node; + struct queue_node_t *next; +}queue_node; + +typedef struct{ + queue_node *first; + queue_node *last; +}aho_queue; + +pprimercount_t ahoc_lookforStrictPrimers (pecodnadb_t database, uint32_t seqdbsize,uint32_t exampleCount, + pwordcount_t words,poptions_t options); +#endif /* H_ahocorasick */ + diff --git a/src/libecoprimer/amplifiatree.c b/src/libecoprimer/amplifiatree.c new file mode 100644 index 0000000..3698dea --- /dev/null +++ b/src/libecoprimer/amplifiatree.c @@ -0,0 +1,131 @@ +/* + * amplifiatree.c + * + * Created on: 7 mars 2009 + * Author: coissac + */ + +#include "ecoprimer.h" +#include + +static void cleanamplifia(pamplifia_t amplifia); +static void deleteamplifialist(pamplifialist_t list); +static int cmpamplifia(const void* p1,const void*p2); + + +static void cleanamplifiatlist(pamplifiacount_t list) +{ + if (list->amplifias) + ECOFREE(list->amplifias, + "Free amplifia list"); +} + +static void cleanamplifia(pamplifia_t amplifia) +{ + cleanamplifiatlist(&(amplifia->pcr)); +} + +static pamplifialist_t 
newamplifialist(pamplifialist_t parent, size_t size) +{ + pamplifialist_t tmp; + + tmp=ECOMALLOC(sizeof(amplifialist_t)+sizeof(amplifia_t)*(size-1), + "Cannot allocate new amplifia list"); + + tmp->amplifiaslots=size; + tmp->amplifiacount=0; + tmp->next=NULL; + + if (parent) + parent->next=(void*)tmp; + + return tmp; +} + +static void deleteamplifialist(pamplifialist_t list) +{ + size_t i; + + if (list) + { + if (list->next) + { + deleteamplifialist(list->next); + list->next=NULL; + } + for (i=0; i < list->amplifiacount; i++) + cleanamplifia((list->amplifias)+i); + + ECOFREE(list,"Delete amplifia list"); + } + +} + +static int cmpamplifia(const void* p1,const void*p2) +{ + pamplifia_t pr1,pr2; + + pr1=(pamplifia_t)p1; + pr2=(pamplifia_t)p2; + + if (pr1->p1 < pr2->p1) return -1; + if (pr1->p1 > pr2->p1) return 1; + + if (pr1->asdirect1 < pr2->asdirect1) return -1; + if (pr1->asdirect1 > pr2->asdirect1) return 1; + + if (pr1->p2 < pr2->p2) return -1; + if (pr1->p2 > pr2->p2) return 1; + + if (pr1->asdirect2 < pr2->asdirect2) return -1; + if (pr1->asdirect2 > pr2->asdirect2) return 1; + + return 0; +} + +pamplifia_t amplifiaintree (amplifia_t key, + pamplifiatree_t amplifialist) +{ + if (!amplifialist->tree) + return NULL; + + return *((pamplifia_t*)tsearch((const void *)(&key), + &(amplifialist->tree), + cmpamplifia + )); +} + +pamplifia_t insertamplifia(amplifia_t key, + pamplifiatree_t list) +{ + pamplifia_t current; + pamplifia_t found; + + if (list->last->amplifiacount==list->last->amplifiaslots) + { + list->last->next=newamplifialist(list,100); + list->last=list->last->next; + } + + current = list->last->amplifias + list->last->amplifiacount; + *current=key; + + found = *((pamplifia_t*)tsearch((const void *)current, + &(list->tree), + cmpamplifia)); + if (found==current) + list->last->amplifiacount++; + + return found; +} + +pamplifiatree_t initamplifiatree(pamplifiatree_t tree) +{ + if (!tree) + tree = ECOMALLOC(sizeof(amplifiatree_t),"Cannot allocate amplifia 
tree"); + + tree->first=newamplifialist(NULL,500); + tree->last=tree->first; + + tree->tree=NULL; +} diff --git a/src/libecoprimer/apat.h b/src/libecoprimer/apat.h new file mode 100644 index 0000000..dd9ae06 --- /dev/null +++ b/src/libecoprimer/apat.h @@ -0,0 +1,120 @@ +/* ==================================================== */ +/* Copyright (c) Atelier de BioInformatique */ +/* Dec. 94 */ +/* File: apat.h */ +/* Purpose: pattern scan */ +/* History: */ +/* 28/12/94 : ascan first version */ +/* 14/05/99 : last revision */ +/* ==================================================== */ + + +#ifndef H_apat +#define H_apat + + +#include "libstki.h" +#include "inttypes.h" +#include "../libecoPCR/ecoPCR.h" + + +/* ----------------------------------------------- */ +/* constantes */ +/* ----------------------------------------------- */ + +#ifndef BUFSIZ +#define BUFSIZ 1024 /* io buffer size */ +#endif + +#define MAX_NAME_LEN BUFSIZ /* max length of sequence name */ + +#define ALPHA_LEN 4 /* alphabet length */ + /* *DO NOT* modify */ + +#define MAX_PATTERN 4 /* max # of patterns */ + /* *DO NOT* modify */ + +#define MAX_PAT_LEN 32 /* max pattern length */ + /* *DO NOT* modify */ + +#define MAX_PAT_ERR 32 /* max # of errors */ + /* *DO NOT* modify */ + +#define PATMASK 0x3ffffff /* mask for 26 symbols */ + /* *DO NOT* modify */ + +#define OBLIBIT 0x4000000 /* bit 27 to 1 -> oblig. 
pos */ + /* *DO NOT* modify */ + + /* mask for position */ +#define ONEMASK 0x80000000 /* mask for highest position */ + + /* masks for Levenhstein edit */ +#define OPER_IDT 0x00000000 /* identity */ +#define OPER_INS 0x40000000 /* insertion */ +#define OPER_DEL 0x80000000 /* deletion */ +#define OPER_SUB 0xc0000000 /* substitution */ + +#define OPER_SHFT 30 /* shift */ + + /* Levenhstein Opcodes */ +#define SOPER_IDT 0x0 /* identity */ +#define SOPER_INS 0x1 /* insertion */ +#define SOPER_DEL 0x2 /* deletion */ +#define SOPER_SUB 0x3 /* substitution */ + + /* Levenhstein Opcodes masks */ +#define OPERMASK 0xc0000000 /* mask for Opcodes */ +#define NOPERMASK 0x3fffffff /* negate of previous */ + + + +/* ----------------------------------------------- */ +/* data structures */ +/* ----------------------------------------------- */ + + +typedef uint32_t pattern_t[ALPHA_LEN], *ppattern_t; + + /* -------------------- */ +typedef struct { /* pattern */ + /* -------------------- */ +int patlen; /* pattern length */ +int maxerr; /* max # of errors */ +uint32_t omask; /* oblig. 
bits mask */ +bool_t circular; /* is circular sequence */ +} patternParam_t, *ppatternParam_t; + + +/* ----------------------------------------------- */ +/* macros */ +/* ----------------------------------------------- */ + +#ifndef NEW +#define NEW(typ) (typ*)malloc(sizeof(typ)) +#define NEWN(typ, dim) (typ*)malloc((unsigned long)(dim) * sizeof(typ)) +#define REALLOC(typ, ptr, dim) (typ*)realloc((void *) (ptr), (unsigned long)(dim) * sizeof(typ)) +#define FREE(ptr) free((void *) ptr) +#endif + +/* ----------------------------------------------- */ +/* prototypes */ +/* ----------------------------------------------- */ + + /* apat_search.c */ + +int32_t ManberNoErr(pecoseq_t pseq,ppattern_t pat, + ppatternParam_t param, + StackiPtr stkpos); + +int32_t ManberSub(pecoseq_t pseq,ppattern_t pat, + ppatternParam_t param, + StackiPtr stkpos); + +int32_t ManberAll(pecoseq_t pseq,ppattern_t pat, + ppatternParam_t param, + StackiPtr stkpos); + + +#endif /* H_apat */ + diff --git a/src/libecoprimer/apat_parse.c b/src/libecoprimer/apat_parse.c new file mode 100644 index 0000000..c11b3a7 --- /dev/null +++ b/src/libecoprimer/apat_parse.c @@ -0,0 +1,65 @@ +/* ==================================================== */ +/* Copyright (c) Atelier de BioInformatique */ +/* Mar. 
92 */ +/* File: apat_parse.c */ +/* Purpose: Codage du pattern */ +/* History: */ +/* 00/07/94 : first version (stanford) */ +/* 00/11/94 : revised for DNA/PROTEIN */ +/* 30/12/94 : modified EncodePattern */ +/* for manber search */ +/* 14/05/99 : indels added */ +/* ==================================================== */ + +#include +#include +#include +#include + +#include "apat.h" +#include "ecoprimer.h" + + + /* IUPAC Dna */ +static int32_t sDnaCode[] = { + /* IUPAC */ + + 0x00000001 /* A */, 0x0000000E /* B */, 0x00000002 /* C */, + 0x0000000D /* D */, 0x00000000 /* E */, 0x00000000 /* F */, + 0x00000004 /* G */, 0x0000000B /* H */, 0x00000000 /* I */, + 0x00000000 /* J */, 0x0000000C /* K */, 0x00000000 /* L */, + 0x00000003 /* M */, 0x0000000F /* N */, 0x00000000 /* O */, + 0x00000000 /* P */, 0x00000000 /* Q */, 0x00000005 /* R */, + 0x00000006 /* S */, 0x00000008 /* T */, 0x00000008 /* U */, + 0x00000007 /* V */, 0x00000009 /* W */, 0x00000000 /* X */, + 0x0000000A /* Y */, 0x00000000 /* Z */ +}; + + +/* -------------------------------------------- */ +/* internal replacement of gets */ +/* -------------------------------------------- */ +static char *sGets(char *buffer, int size) { + + char *ebuf; + + if (! fgets(buffer, size-1, stdin)) + return NULL; + + /* remove trailing line feed */ + + ebuf = buffer + strlen(buffer); + + while (--ebuf >= buffer) { + if ((*ebuf == '\n') || (*ebuf == '\r')) + *ebuf = '\000'; + else + break; + } + + return buffer; +} + +/* -------------------------------------------- */ +/* Interface */ +/* -------------------------------------------- */ diff --git a/src/libecoprimer/apat_search.c b/src/libecoprimer/apat_search.c new file mode 100644 index 0000000..22ae905 --- /dev/null +++ b/src/libecoprimer/apat_search.c @@ -0,0 +1,155 @@ +/* ==================================================== */ +/* Copyright (c) Atelier de BioInformatique */ +/* Dec. 
94 */ +/* File: apat_search.c */ +/* Purpose: recherche du pattern */ +/* algorithme de Baeza-Yates/Gonnet */ +/* Manber (agrep) */ +/* History: */ +/* 07/12/94 : first version */ +/* 28/12/94 : revised version */ +/* 14/05/99 : last revision */ +/* ==================================================== */ + + +#include +#include +#include + +#include "libstki.h" +#include "apat.h" + +#define POP PopiOut +#define PUSH(s,v) PushiIn(&(s),(v)) +#define TOPCURS CursiToTop +#define DOWNREAD ReadiDown + +#define KRONECK(x, msk) ((~x & msk) ? 0 : 1) +#define MIN(x, y) ((x) < (y) ? (x) : (y)) + + +/* -------------------------------------------- */ +/* Baeza-Yates/Manber algorithm */ +/* NoError */ +/* -------------------------------------------- */ +int32_t ManberNoErr(pecoseq_t pseq,ppattern_t pat, + ppatternParam_t param, + StackiPtr stkpos) +{ + int32_t pos; + uint32_t smask, r; + uint8_t *data; + int32_t end; + + end = (size_t)(pseq->SQ_length); + + if (param->circular) + end+=param->patlen - 1; + + + /* create local masks */ + smask = r = 0x1L << param->patlen; + /* init. scan */ + data = (uint8_t*)(pseq->SQ); + + /* loop on text data */ + for (pos = 0 ; pos < end ; pos++,data++) { + if (pos==pseq->SQ_length) + data=(uint8_t*)(pseq->SQ); + + if (*data < 4) + r = (r >> 1) & pat[*data]; + else + r=0; + + if (r & 0x1L) { + PUSH(stkpos, pos - param->patlen + 1); + } + + r |= smask; + } + return stkpos->top; /* aka # of hits */ +} + +/* -------------------------------------------- */ +/* Baeza-Yates/Manber algorithm */ +/* Substitution only */ +/* */ +/* Note : r array is stored as : */ +/* 0 0 r(0,j) r(0,j+1) r(1,j) r(1,j+1) ... 
*/ +/* */ +/* -------------------------------------------- */ +int32_t ManberSub(pecoseq_t pseq,ppattern_t pat, + ppatternParam_t param, + StackiPtr stkpos) +{ + int e, found; + int32_t pos; + uint32_t smask, cmask, sindx; + uint32_t *pr, r[2 * MAX_PAT_ERR + 2]; + uint8_t *data; + int32_t end; + + end = (size_t)(pseq->SQ_length); + + if (param->circular) + end+=param->patlen - 1; + + /* create local masks */ + r[0] = r[1] = 0x0; + + cmask = smask = 0x1L << param->patlen; + + for (e = 0, pr = r + 3 ; e <= param->maxerr ; e++, pr += 2) + *pr = cmask; + + cmask = ~ param->omask; + /* init. scan */ + data = (uint8_t*)(pseq->SQ); + + /* loop on text data */ + + for (pos = 0 ; pos < end ; pos++,data++) { + if (pos==pseq->SQ_length) + data=(uint8_t*)(pseq->SQ); + + sindx = (*data==4) ? 0:pat[*data]; + + for (e = found = 0, pr = r ; e <= param->maxerr ; e++, pr += 2) { + + pr[2] = pr[3] | smask; + + pr[3] = ((pr[0] >> 1) & cmask) /* sub */ + | ((pr[2] >> 1) & sindx); /* ident */ + + if (pr[3] & 0x1L) { /* found */ + if (! found) { + PUSH(stkpos, pos - param->patlen + 1); + } + found++; + } + } + } + + return stkpos->top; /* aka # of hits */ +} + + +/* -------------------------------------------- */ +/* Baeza-Yates/Manber algorithm */ +/* API call to previous functions */ +/* -------------------------------------------- */ +int32_t ManberAll(pecoseq_t pseq,ppattern_t pat, + ppatternParam_t param, + StackiPtr stkpos) +{ + if (param->maxerr == 0) + return ManberNoErr(pseq, + pat, param, + stkpos); + else + return ManberSub(pseq, + pat, param, + stkpos); +} + diff --git a/src/libecoprimer/aproxpattern.c b/src/libecoprimer/aproxpattern.c new file mode 100644 index 0000000..0cf349e --- /dev/null +++ b/src/libecoprimer/aproxpattern.c @@ -0,0 +1,237 @@ +/* + * aproxpattern.c + * + * Created on: 20 nov. 
2008 + * Author: coissac + */ + + +#include "ecoprimer.h" +#include "apat.h" +#include + +static uint8_t encoder[] = {0, // A + 4, // b + 1, // C + 4,4,4, // d, e, f + 2, // G + 4,4,4,4,4,4,4,4,4,4,4,4, // h,i,j,k,l,m,n,o,p,q,r,s + 3,3, // T,U + 4,4,4,4,4}; // v,w,x,y,z + + +ppattern_t buildPatternFromWord(word_t word, uint32_t patlen) +{ + static pattern_t pattern; + uint32_t i; + + for (i = 0 ; i < ALPHA_LEN ; i++) + pattern[i] = 0x0; + + for (i=0;i < patlen; i++) + { + pattern[word & 3LLU] |= 1 << i; + word>>=2; + } + + return pattern; + +} + + +#ifdef IS_UPPER +#undef IS_UPPER +#endif + +/* -------------------------------------------- */ +/* encode sequence */ +/* IS_UPPER is slightly faster than isupper */ +/* -------------------------------------------- */ + +#define IS_UPPER(c) (((c) >= 'A') && ((c) <= 'Z')) + +void encodeSequence(ecoseq_t *seq) +{ + int i; + uint8_t *data; + char *cseq; + + data = (uint8_t*)(seq->SQ); + cseq = seq->SQ; + + for (i=0;iSQ_length;i++,data++,cseq++) + { + *data = encoder[(IS_UPPER(*cseq) ? 
*cseq : 'Z') - 'A']; + } +} + +pprimercount_t lookforAproxPrimer(pecodnadb_t database, uint32_t seqdbsize,uint32_t exampleCount, + pwordcount_t words,poptions_t options) +{ + pprimer_t data; + pprimercount_t primers; + ppattern_t pattern; + patternParam_t params; + uint32_t i; + uint32_t w; + uint32_t j; + Stacki positions; + uint32_t count=1; + uint32_t goodPrimers=0; + + uint32_t inSequenceQuorum; + uint32_t outSequenceQuorum; + bool_t conserved = TRUE; + + //poslist_t ttt; + + + inSequenceQuorum = (uint32_t)floor((float)exampleCount * options->sensitivity_quorum); + outSequenceQuorum = (uint32_t)floor((float)(seqdbsize-exampleCount) * options->false_positive_quorum); + + fprintf(stderr," Primers should be at least present in %d/%d example sequences\n",inSequenceQuorum,exampleCount); + fprintf(stderr," Primers should not be present in more than %d/%d counterexample sequences\n",outSequenceQuorum,(seqdbsize-exampleCount)); + + data = ECOMALLOC(words->size * sizeof(primer_t), + "Cannot allocate memory for fuzzy matching results"); + + params.circular = options->circular; + params.maxerr = options->error_max; +// params.omask = (1 << options->strict_three_prime) -1; + params.omask = 0; + params.patlen = options->primer_length; + + positions.val=NULL; + + for (i=0,w=0; i < words->size; i++) + { + data[w].word=WORD(words->words[i]); + data[w].inexample = 0; + data[w].outexample= 0; + count = 1; + + if (conserved) + { + data[w].directCount=ECOMALLOC(seqdbsize * sizeof(uint32_t), + "Cannot allocate memory for primer position"); + data[w].directPos = ECOMALLOC(seqdbsize * sizeof(poslist_t), + "Cannot allocate memory for primer position"); + data[w].reverseCount=ECOMALLOC(seqdbsize * sizeof(uint32_t), + "Cannot allocate memory for primer position"); + data[w].reversePos = ECOMALLOC(seqdbsize * sizeof(poslist_t), + "Cannot allocate memory for primer position"); + } + + pattern = buildPatternFromWord(data[w].word,options->primer_length); + positions.val=NULL; + + for (j=0; 
j < seqdbsize && (count < 2 || !options->no_multi_match); j++) + { + positions.cursor=0; + positions.top =0; + if (!positions.val) + { + positions.size=1; + positions.val = ECOMALLOC(sizeof(uint32_t), + "Cannot allocate memory for primer position"); + } + + + count = ManberAll(database[j],pattern,¶ms,&positions); + data[w].directCount[j]=count; + + + if (count>1) + { + data[w].directPos[j].pointer = (uint32_t*)positions.val; + positions.val=NULL; + } + else + { + data[w].directPos[j].pointer=NULL; + if (count==1) + data[w].directPos[j].value = (uint32_t)*(positions.val); + } + + + } + + pattern = buildPatternFromWord(ecoComplementWord(data[w].word,options->primer_length), + options->primer_length); + + for (j=0; j < seqdbsize && (count < 2 || !options->no_multi_match); j++) + { + positions.cursor=0; + positions.top =0; + if (!positions.val) + { + positions.size=1; + positions.val = ECOMALLOC(sizeof(uint32_t), + "Cannot allocate memory for primer position"); + } + + count = ManberAll(database[j],pattern,¶ms,&positions); + data[w].reverseCount[j]=count; + + if (count>1) + { + data[w].reversePos[j].pointer = (uint32_t*)positions.val; + positions.val=NULL; + } + else + { + data[w].reversePos[j].pointer=NULL; + if (count==1) + data[w].reversePos[j].value = (uint32_t)*(positions.val); + } + + if (database[j]->isexample) + { + data[w].inexample+=(data[w].directCount[j] || data[w].reverseCount[j])? 1:0; + } + else + { + data[w].outexample+=(data[w].directCount[j] || data[w].reverseCount[j])? 1:0; + + } + + count+=data[w].directCount[j]; + } + + data[w].good = data[w].inexample >= inSequenceQuorum && data[w].outexample <= outSequenceQuorum; + goodPrimers+=data[w].good? 
1:0; + + fprintf(stderr,"Primers %5d/%d analyzed => sequence : %s in %d example and %d counterexample sequences \r", + i+1,words->size,ecoUnhashWord(data[w].word,options->primer_length), + data[w].inexample,data[w].outexample); + + + conserved=data[w].inexample >= inSequenceQuorum; + conserved=conserved && (count < 2 || !options->no_multi_match); + + if (conserved) + w++; + } + + if (positions.val) + ECOFREE(positions.val,"Free stack position pointer"); + + if (!conserved) + { + ECOFREE(data[w].directCount,"Free direct count table"); + ECOFREE(data[w].directPos,"Free direct count table"); + ECOFREE(data[w].reverseCount,"Free direct count table"); + ECOFREE(data[w].reversePos,"Free direct count table"); + } + + fprintf(stderr,"\n\nOn %d analyzed primers %d respect quorum conditions\n",words->size,goodPrimers); + fprintf(stderr,"Conserved primers for further analysis : %d/%d\n",w,words->size); + + primers = ECOMALLOC(sizeof(primercount_t),"Cannot allocate memory for primer table"); + primers->primers=ECOREALLOC(data, + w * sizeof(primer_t), + "Cannot reallocate memory for fuzzy matching results"); + primers->size=w; + + return primers; +} diff --git a/src/libecoprimer/debug.h b/src/libecoprimer/debug.h new file mode 100644 index 0000000..48f473a --- /dev/null +++ b/src/libecoprimer/debug.h @@ -0,0 +1,29 @@ +/* + * debug.h + * + * Created on: 12 nov. 2008 + * Author: coissac + */ + +#ifndef DEBUG_H_ +#define DEBUG_H_ + +#include + + +#ifdef DEBUG + +#define DEBUG_LOG(message,...) { \ + char *text; \ + (void)asprintf(&text,(message),##__VA_ARGS__); \ + fprintf(stderr,"DEBUG %s (line %d) : %s\n",__FILE__,__LINE__,(text)); \ + free(text); \ + } + +#else + +#define DEBUG_LOG(message, ...) + +#endif + +#endif /* DEBUG_H_ */ diff --git a/src/libecoprimer/ecoprimer.h b/src/libecoprimer/ecoprimer.h new file mode 100755 index 0000000..37fefd1 --- /dev/null +++ b/src/libecoprimer/ecoprimer.h @@ -0,0 +1,366 @@ +/* + * epsort.h + * + * Created on: 6 nov. 
2008 + * Author: coissac + */ + +#ifndef EPSORT_H_ +#define EPSORT_H_ + +#include +#include +#include +#include "ecotype.h" +#include "../libecoPCR/ecoPCR.h" +#include "../libthermo/nnparams.h" +#include "apat.h" + +#define DEBUG +#include "debug.h" + +/**** + * Word format used : + * + * bit 63 : bad word -> this word should not be used + * bit 62 : multi word -> this word is not uniq in at least one seq + * bits 0-61 : hashed dna word of max size 31 pb + * code used for a : 00 + * code used for c : 01 + * code used for g : 10 + * code used for t : 11 + */ + +typedef uint64_t word_t, *pword_t; + +#define WORD(x) ((x) & 0x3FFFFFFFFFFFFFFFLLU) +#define WORD(x) ((x) & 0x3FFFFFFFFFFFFFFFLLU) + +#define ISBADWORD(x) (((x) & 0x8000000000000000LLU) >> 63) +#define SETBADWORD(x) ((x) | 0x8000000000000000LLU) +#define RESETBADWORD(x) ((x) & 0x7FFFFFFFFFFFFFFFLLU) + +#define ISMULTIWORD(x) (((x) & 0x4000000000000000LLU) >> 62) +#define SETMULTIWORD(x) ((x) | 0x4000000000000000LLU) +#define RESETMULTIWORD(x) ((x) & 0xBFFFFFFFFFFFFFFFLLU) + + +#define WORDMASK(s) ((1LLU << ((s) * 2)) -1) +#define LSHIFTWORD(x,s) (((x) << 2) & WORDMASK(s)) +#define RSHIFTWORD(x,s) (((x) & WORDMASK(s))>> 2) +#define ERRORMASK(s) ((int32_t)((1LLU << (s)) -1)) + +#define RAPPENDBASE(x,s,c) (LSHIFTWORD((x),(s)) | (word_t)(c)) +#define LAPPENDBASE(x,s,c) (RSHIFTWORD((x),(s)) | ((word_t)((~(c)) & 3) << (((s)-1) *2))) + + +#define ECO_ASSERT(x,message) if (!(x)) \ + { \ + fprintf(stderr,"Assertion Error in %s (line %d): %s\n", \ + __FILE__,\ + __LINE__,\ + message\ + ); \ + exit(ECO_ASSERT_ERROR); \ + } + +#define MINI(x,y) (((x) < (y)) ? (x):(y)) +#define MAXI(x,y) (((x) < (y)) ? 
(y):(x)) + +#define FWORDSIZE (13) +#define FWORDMASK WORDMASK(FWORDSIZE) +#define FILTERWORD(x) ((uint32_t)((x) & FWORDMASK)) +#define CFILTERWORD(x,s) ((uint32_t)(((x) >> (((s)-FWORDSIZE)*2)) & FWORDMASK)) + + + +typedef struct { + pword_t words; + uint32_t *strictcount; + uint32_t inseqcount; + uint32_t outseqcount; + uint64_t size; +} wordcount_t, *pwordcount_t; + + +typedef union { + uint32_t *pointer; + uint32_t value; +} poslist_t, *pposlist_t; + + +/** + * primer_t structure store fuzzy match positions for a primer + * on all sequences + */ + +typedef struct { + word_t word; //< code for the primer + uint32_t *directCount; //< Occurrence count on direct strand + pposlist_t directPos; //< list of position list on direct strand + + uint32_t *reverseCount; //< Occurrence count on reverse strand + pposlist_t reversePos; //< list of position list on reverse strand + + bool_t good; //< primer match more than quorum example and no + // more counterexample quorum. + + uint32_t inexample; //< count of example sequences matching primer + uint32_t outexample; //< count of counterexample sequences matching primer +} primer_t, *pprimer_t; + +/** + * primercount_t structure store fuzzy match positions for all primers + * on all sequences as a list of primer_t + */ +typedef struct { + pprimer_t primers; + uint32_t size; +} primercount_t, *pprimercount_t; + +typedef struct { + pprimer_t primer; + uint32_t position; + bool_t strand; +} primermatch_t, *pprimermatch_t; + +/*TR: Added*/ +typedef struct { + pprimermatch_t matches; + uint32_t matchcount; +} primermatchcount_t, *pprimermatchcount_t; + +typedef struct { + pecoseq_t sequence; + bool_t strand; + const char *amplifia; + int32_t length; + uint32_t begin; + uint32_t end; +} amplifia_t, *pamplifia_t; + +typedef struct { + pamplifia_t amplifias; + uint32_t ampcount; + uint32_t ampslot; +} amplifiacount_t, *pamplifiacount_t; + +typedef struct { + char *amplifia; + int32_t *taxonids; + uint32_t seqidcount; + uint32_t 
seqidindex; +} ampseqset_t, *pampseqset_t; + +typedef struct { + int32_t taxonid; + char **amplifia; + uint32_t amplifiacount; + uint32_t amplifiaindex; +} taxampset_t, *ptaxampset_t; + +typedef struct { + pprimer_t p1; + bool_t asdirect1; + pprimer_t p2; + bool_t asdirect2; + + amplifiacount_t pcr; + + uint32_t inexample; //< example sequence count + uint32_t outexample; //< counterexample sequence count + uint32_t intaxa; //< example taxa count + uint32_t outtaxa; //< counterexample taxa count + uint32_t notwellidentifiedtaxa; + + int *wellIdentifiedSeqs; //< an array having elements equla to total seqs + // values are either 0 or 1, if seq is well identified + // its 1 else 0 + int *coveredSeqs; //< an array having elements equal to total seqs, 1 if seq is covered else 0 + + // these statistics are relative to inexample sequences + + uint32_t mind; //< minimum distance between primers + uint32_t maxd; //< maximum distance between primers + uint32_t sumd; //< distance sum + uint32_t amplifiacount; + float yule; + float quorumin; + float quorumout; + float bs; + float bc; + int32_t refsequence; +// +// uint32_t taxsetcount; +// uint32_t taxsetindex; +// ptaxampset_t taxset; +// +// uint32_t oktaxoncount; + uint32_t curseqid; + float p1temp; //strict primer1 melting temperature + float p1mintemp; //approx primer1 minimum melting temperature + float p2temp; //strict primer2 melting temperature + float p2mintemp; //approx primer2 minimum melting temperature +} pair_t, *ppair_t; + +/*TR: Added*/ + +typedef struct { + size_t paircount; + size_t pairslots; + void* next; + pair_t pairs[1]; +} pairlist_t, *ppairlist_t; + +typedef struct { + ppairlist_t first; + ppairlist_t last; + void *tree; + int32_t count; +} pairtree_t, *ppairtree_t; + +typedef struct { + pword_t words; + uint32_t *count; + uint32_t push; + uint32_t pop; + uint32_t size; + bool_t empty; + bool_t full; +} queue_t, *pqueue_t; + +typedef struct { + pword_t words; + uint32_t *count; + uint32_t write; + 
uint32_t read1; + uint32_t read2; + uint32_t size; +} merge_t, *pmerge_t; + +typedef struct { + const char *amplifia; + bool_t strand; + int32_t length; + int32_t taxoncount; + void *taxontree; +}amptotaxon_t, *pamptotaxon_t; + +typedef struct { + int32_t taxid; + void *amptree; +}taxontoamp_t, *ptaxontoamp_t; + +typedef struct { + bool_t printAC; + bool_t statistics; + bool_t filtering; + uint32_t lmin; //**< Amplifia minimal length + uint32_t lmax; //**< Amplifia maximal length + uint32_t error_max; //**< maximum error count in fuzzy search + uint32_t primer_length; //**< minimal length of the primers + int32_t *restricted_taxid; //**< limit amplification below these taxid + int32_t *ignored_taxid; //**< no amplification below these taxid + int32_t *exception_taxid; + char *prefix; + char *reference; + pecoseq_t refseq; + uint32_t refseqid; + uint32_t circular; + uint32_t doublestrand; + float strict_quorum; + float strict_exclude_quorum; + float sensitivity_quorum; + float false_positive_quorum; + uint32_t strict_three_prime; + int32_t r; //**< count of restrited taxa (restricted_taxid array size) + int32_t g; //**< count of ignored taxa (ignored_taxid array size) + int32_t e; //**< count of ignored taxa (ignored_taxid array size) + bool_t no_multi_match; + char taxonrank[20]; //TR to count ranks against a pair + int32_t taxonrankidx; //TR to count ranks against a pair + + // Some statistics useful for options filters + + int32_t dbsize; + int32_t insamples; + int32_t outsamples; + int32_t intaxa; + int32_t outtaxa; + int saltmethod; + float salt; + PNNParams pnparm; + bool_t print_sets_of_primers; + float specificity_threshold; + int links_cnt; + float max_links_percent; + bool_t filter_on_links; +} options_t, *poptions_t; + +typedef ecoseq_t **pecodnadb_t; + +void sortword(pword_t table,uint32_t N); + + +pecodnadb_t readdnadb(const char *name, ecotaxonomy_t *taxonomy, uint32_t *size,poptions_t options); + +int isGoodTaxon(ecotaxonomy_t *taxonomy,int32_t 
taxon,poptions_t options); +int isExampleTaxon(ecotaxonomy_t *taxonomy,int32_t taxon,poptions_t options); +int isCounterExampleTaxon(ecotaxonomy_t *taxonomy,int32_t taxon,poptions_t options); + +uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq); +pword_t ecoHashSequence(pword_t dest, uint32_t wordsize, uint32_t circular, uint32_t doublestrand, ecoseq_t *seq,uint32_t *size,int32_t *neededWords,uint32_t neededWordCount, + int32_t quorum); +uint32_t ecoCompactHashSequence(pword_t dest,uint32_t size); +const char* ecoUnhashWord(word_t word,uint32_t size); +word_t ecoComplementWord(word_t word,uint32_t size); +uint32_t ecoFindWord(pwordcount_t table,word_t word); + + +void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,uint32_t seqQuorum); +pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount); +void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t exampleCount,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount); + +pqueue_t newQueue(pqueue_t queue, uint32_t size); +pqueue_t resizeQueue(pqueue_t queue, uint32_t size); + +void pop(pqueue_t queue); +void push(pqueue_t queue, word_t word, uint32_t count); + +pqueue_t cleanQueue(pqueue_t queue); + +pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize, + uint32_t exampleCount, poptions_t options); +uint32_t filterMultiStrictPrimer(pwordcount_t strictprimers); + +void encodeSequence(ecoseq_t *seq); +ppattern_t buildPatternFromWord(word_t word, uint32_t patlen); + +pprimercount_t lookforAproxPrimer(pecodnadb_t database, uint32_t seqdbsize,uint32_t exampleCount, + pwordcount_t words,poptions_t options); + +void sortmatch(pprimermatch_t table,uint32_t N); + +ppairtree_t initpairtree(ppairtree_t tree); +ppair_t pairintree (pair_t 
key,ppairtree_t pairlist); +ppair_t insertpair(pair_t key,ppairtree_t list); + + +/*TR: Added*/ +ppairtree_t buildPrimerPairs(pecodnadb_t seqdb,uint32_t seqdbsize,pprimercount_t primers,poptions_t options); + +int32_t counttaxon(int32_t taxid); +int32_t getrankdbstats(pecodnadb_t seqdb, + uint32_t seqdbsize, + ecotaxonomy_t *taxonomy, + poptions_t options); +float taxonomycoverage(ppair_t pair, poptions_t options, pecodnadb_t seqdb,uint32_t seqdbsize); +char ecoComplementChar(char base); +void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize); + +int32_t *filteringSeq(pecodnadb_t database, uint32_t seqdbsize, + uint32_t exampleCount,poptions_t options,uint32_t *size,int32_t sequenceQuorum); + +void printSeqTest(pecodnadb_t seqdb,uint32_t seqdbsize); + +#endif /* EPSORT_H_ */ diff --git a/src/libecoprimer/ecotype.h b/src/libecoprimer/ecotype.h new file mode 100644 index 0000000..38ce5a8 --- /dev/null +++ b/src/libecoprimer/ecotype.h @@ -0,0 +1,14 @@ +/* + * ecotype.h + * + * Created on: 24 nov. 
2008 + * Author: coissac + */ + +#ifndef ECOTYPE_H_ +#define ECOTYPE_H_ + +typedef enum { FALSE=0,TRUE=1} bool_t, *pbool_t; + + +#endif /* ECOTYPE_H_ */ diff --git a/src/libecoprimer/filtering.c b/src/libecoprimer/filtering.c new file mode 100644 index 0000000..24e1387 --- /dev/null +++ b/src/libecoprimer/filtering.c @@ -0,0 +1,188 @@ +/* + * filtering.c + * + * Created on: 12 mai 2009 + * Author: coissac + */ + +#include "ecoprimer.h" +#include +#include + +#include "hashencoder.h" + +static int32_t *ecoFilteringHashSequence(int32_t *dest, + uint32_t circular, + uint32_t doublestrand, + ecoseq_t *seq, + uint32_t *size); + + + + + +static int32_t *ecoFilteringHashSequence(int32_t *dest, + uint32_t circular, + uint32_t doublestrand, + ecoseq_t *seq, + uint32_t *size) +{ + static char *in_last_seq=NULL; + uint32_t i=0; + uint32_t j; + char *base; + int8_t code; + int32_t error=0; + word_t word=0; + word_t antiword=0; + uint32_t goodword; + uint32_t lmax=0; + + // run on the first call; + + + if (dest==(void*)-1) + { + if (in_last_seq) ECOFREE(in_last_seq,"Free in last seq table"); + return NULL; + } + + + *size = pow(4,FWORDSIZE); + + if (!in_last_seq) + in_last_seq = ECOMALLOC(*size*sizeof(char), + "Cannot allocate filtering hash table"); + + memset(in_last_seq,0,*size*sizeof(char)); + + + if (!dest) + { + dest = ECOMALLOC(*size*sizeof(int32_t), + "Cannot allocate filtering hash table"); + memset(dest,0,*size*sizeof(int32_t)); + } + + lmax = seq->SQ_length; + if (!circular) + lmax-= FWORDSIZE-1; + + + +// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i)); + + for (i=0, base = seq->SQ; i < FWORDSIZE && i < lmax; i++,base++) + { + error<<= 1; + error&=ERRORMASK(FWORDSIZE); + + code = encoder[(*base) - 'A']; + if (code <0) + { + code = 0; + error|= 1; + } + + + word=RAPPENDBASE(word,FWORDSIZE,code); + if (doublestrand) + antiword=LAPPENDBASE(antiword,FWORDSIZE,code); + } + + if (!error && i==FWORDSIZE) + { + + goodword=(uint32_t)((doublestrand) ? 
MINI(word,antiword):word); + + if (!in_last_seq[goodword]) + { + in_last_seq[goodword]=1; + dest[goodword]++; + } + } + + + for (j=1; j < lmax; j++,i++,base++) + { + +// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,j,(seq->SQ+j)); + + /* roll over the sequence for circular ones */ + if (i==(uint32_t)seq->SQ_length) base=seq->SQ; + + error<<= 1; + error&=ERRORMASK(FWORDSIZE); + + //code = -1; + //if((*base) >= 'A' && (*base) <= 'Z') + code = encoder[(*base) - 'A']; + if (code <0) + { + code = 0; + error|= 1; + } + + word=RAPPENDBASE(word,FWORDSIZE,code); + if (doublestrand) + antiword=LAPPENDBASE(antiword,FWORDSIZE,code); + + if (!error) + { + if (doublestrand) + goodword=(uint32_t)MINI(word,antiword); + else + goodword=word; + if (!in_last_seq[goodword]) + { + in_last_seq[goodword]=1; + dest[goodword]++; + } + } + + } + + return dest; + +} + + +int32_t *filteringSeq(pecodnadb_t database, uint32_t seqdbsize, + uint32_t exampleCount,poptions_t options,uint32_t *size,int32_t sequenceQuorum) +{ + int32_t *wordscount=NULL; + int32_t keep=0; + uint32_t i,j=0; + + for (i=0;iisexample && database[i]->SQ_length > options->primer_length) + { + j++; + wordscount=ecoFilteringHashSequence(wordscount, + options->circular, + options->doublestrand, + database[i], + size); + } + fprintf(stderr," Filtered sequences %5u/%5u \r",j,exampleCount); + + } + + fprintf(stderr,"\n"); + + for (i=0;i<*size;i++) + if (wordscount[i] >= sequenceQuorum) + keep++; + + + (void)ecoFilteringHashSequence((int32_t*)-1, + options->circular, + options->doublestrand, + NULL, + NULL); + + fprintf(stderr,"ok\n Considered word of size %d for filtering : %d\n",FWORDSIZE,keep); + return wordscount; + +} diff --git a/src/libecoprimer/goodtaxon.c b/src/libecoprimer/goodtaxon.c new file mode 100644 index 0000000..68f940c --- /dev/null +++ b/src/libecoprimer/goodtaxon.c @@ -0,0 +1,64 @@ +/* + * goodtaxon.c + * + * Created on: 7 nov. 
2008 + * Author: coissac + */ + + +#include "ecoprimer.h" + +int isGoodTaxon(ecotaxonomy_t *taxonomy,int32_t taxon,poptions_t options) +{ + int result; + + result=((options->r == 0) || (eco_is_taxid_included(taxonomy, + options->restricted_taxid, + options->r, + taxonomy->taxons->taxon[taxon].taxid) + )) && + ((options->e == 0) || !(eco_is_taxid_included(taxonomy, + options->exception_taxid, + options->e, + taxonomy->taxons->taxon[taxon].taxid) + )); + + return result; +} + +int isExampleTaxon(ecotaxonomy_t *taxonomy,int32_t taxon,poptions_t options) +{ + int result; + + result=( (options->r == 0) || (eco_is_taxid_included(taxonomy, + options->restricted_taxid, + options->r, + taxonomy->taxons->taxon[taxon].taxid) + )) && + ((options->e == 0) || !(eco_is_taxid_included(taxonomy, + options->exception_taxid, + options->e, + taxonomy->taxons->taxon[taxon].taxid) + )); + + return result; +} + + +int isCounterExampleTaxon(ecotaxonomy_t *taxonomy,int32_t taxon,poptions_t options) +{ + int result; + + result=((options->g != 0) && (eco_is_taxid_included(taxonomy, + options->ignored_taxid, + options->g, + taxonomy->taxons->taxon[taxon].taxid)) + ) || ((options->e != 0) && (eco_is_taxid_included(taxonomy, + options->exception_taxid, + options->e, + taxonomy->taxons->taxon[taxon].taxid)) + ); + + + return result; +} diff --git a/src/libecoprimer/hashencoder.h b/src/libecoprimer/hashencoder.h new file mode 100644 index 0000000..00540d7 --- /dev/null +++ b/src/libecoprimer/hashencoder.h @@ -0,0 +1,21 @@ +/* + * hashencoder.h + * + * Created on: 12 mai 2009 + * Author: coissac + */ + +#ifndef HASHENCODER_H_ +#define HASHENCODER_H_ + +static int8_t encoder[] = {0, // A + -1, // b + 1, // C + -1,-1,-1, // d, e, f + 2, // G + -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // h,i,j,k,l,m,n,o,p,q,r,s + 3,3, // T,U + -1,-1,-1,-1,-1}; // v,w,x,y,z + + +#endif /* HASHENCODER_H_ */ diff --git a/src/libecoprimer/hashsequence.c b/src/libecoprimer/hashsequence.c new file mode 100644 index 
0000000..2caea48 --- /dev/null +++ b/src/libecoprimer/hashsequence.c @@ -0,0 +1,243 @@ +/* + * hashsequence.c + * + * Created on: 7 nov. 2008 + * Author: coissac + */ + + +#include "ecoprimer.h" + +static int cmpword(const void *x,const void *y); + +#include "hashencoder.h" + +uint32_t ecoWordCount(uint32_t wordsize, uint32_t circular, ecoseq_t *seq) +{ + uint32_t wordcount; + + wordcount = seq->SQ_length; + + if (!circular) wordcount-=wordsize-1; + + return wordcount; +} + +pword_t ecoHashSequence(pword_t dest, + uint32_t wordsize, + uint32_t circular, + uint32_t doublestrand, + ecoseq_t *seq, + uint32_t *size, + int32_t *neededWords, + uint32_t neededWordCount, + int32_t quorum) +{ + uint32_t i=0; + uint32_t j; + char *base; + int8_t code; + int32_t error=0; + word_t word=0; + word_t antiword=0; + word_t goodword; + uint32_t lmax=0; + + (*size)=0; + + lmax = seq->SQ_length; + if (!circular) + lmax-= wordsize-1; + + if (!dest) + dest = ECOMALLOC(lmax*sizeof(word_t), + "I cannot allocate memory for sequence hashing" + ); + +// DEBUG_LOG("Sequence %s @ %d : %18.18s",seq->AC,i,(seq->SQ+i)); + + for (i=0, base = seq->SQ; i < wordsize && i < lmax; i++,base++) + { + + error<<= 1; + error&=ERRORMASK(wordsize); + + code = encoder[(*base) - 'A']; + if (code <0) + { + code = 0; + error|= 1; + } + + + word=RAPPENDBASE(word,wordsize,code); + + if (doublestrand) + antiword=LAPPENDBASE(antiword,wordsize,code); + + if (neededWordCount && i>=(FWORDSIZE-1)) + { + + goodword = (doublestrand) ? 
MINI(FILTERWORD(word),CFILTERWORD(antiword,wordsize)):FILTERWORD(word); + if (neededWords[(uint32_t)goodword]AC,j,(seq->SQ+j)); + + /* roll over the sequence for circular ones */ + + if (i==(uint32_t)seq->SQ_length) base=seq->SQ; + + error<<= 1; + error&=ERRORMASK(wordsize); + + code = encoder[(*base) - 'A']; + if (code <0) + { + code = 0; + error|= 1; + } + + word=RAPPENDBASE(word,wordsize,code); + if (doublestrand) + antiword=LAPPENDBASE(antiword,wordsize,code); + + if (neededWordCount) + { + goodword = (doublestrand) ? MINI(FILTERWORD(word),CFILTERWORD(antiword,wordsize)):FILTERWORD(word); + if (neededWords[(uint32_t)goodword]AC,goodword,neededWords[(uint32_t)goodword],quorum,i,error); + + } + + + if (!error) + { + dest[*size]=(doublestrand) ? MINI(word,antiword):word; + (*size)++; + } + + } + // DEBUG_LOG("%s goodword = %d",seq->AC,*size); + return dest; + +} + +uint32_t ecoCompactHashSequence(pword_t table,uint32_t size) +{ + uint32_t i,j; + word_t current; +// bool_t here=FALSE; + + sortword(table,size); + + current = 0; + current=SETMULTIWORD(current); /* build impossible word for the first loop cycle */ + +// if (strcmp(ecoUnhashWord(table[size-1],18),"GTTTGTTCAACGATTAAA")==0) +// here=TRUE; + + for (i=0,j=0; j < size;j++) + { + if (WORD(table[j])!=current) + { + current =table[j]; + table[i]=current; + i++; + } + else + table[i]=SETMULTIWORD(table[i]); + } + +// if (strcmp(ecoUnhashWord(WORD(table[i-1]),18),"TACGACCTCGATGTTGGA")==0) +// DEBUG_LOG("winner %d",i) + + return i; +} + +const char* ecoUnhashWord(word_t word,uint32_t size) +{ + static char buffer[32]; + static char decode[]="ACGT"; + + uint32_t i; + + for (i=0; i < size; i++) + { + buffer[i]=decode[(word >> (2 * (size - 1 -i))) & 3]; + } + + buffer[size]=0; + + return buffer; +} + +word_t ecoComplementWord(word_t word,uint32_t size) +{ + word_t rep=0; + uint32_t i; + +// DEBUG_LOG("%llx %llx",word,~word); + word=(~word) & WORDMASK(size); + for (i=0;i < size; i++) + { + + rep = 
RAPPENDBASE(rep,size,word & 3LLU); +// DEBUG_LOG("%016llx %016llx %016llx",word,word & 3LLU,rep); + word>>=2; + } +// DEBUG_LOG("Complemented = %s",ecoUnhashWord(rep,18)); + return rep; + +} + +static int cmpword(const void *x,const void *y) +{ + word_t w1 = *(pword_t)x; + word_t w2 = *(pword_t)y; + + w1 = WORD(w1); + w2 = WORD(w2); + + if (w1 < w2) + return -1; + if (w1 > w2) + return +1; + + return 0; +} + +uint32_t ecoFindWord(pwordcount_t table,word_t word) +{ + pword_t dest; + + dest = (pword_t)bsearch((const void*)&word,(const void*)table->words,table->size,sizeof(word_t),cmpword); + + if (dest) + return dest - table->words; + else + return ~0; +} + +char ecoComplementChar(char base) +{ + return (base < 4)? !base & 3: 4; +} + diff --git a/src/libecoprimer/libstki.c b/src/libecoprimer/libstki.c new file mode 100644 index 0000000..9bdebf2 --- /dev/null +++ b/src/libecoprimer/libstki.c @@ -0,0 +1,379 @@ +/* ==================================================== */ +/* Copyright (c) Atelier de BioInformatique */ +/* Mar. 92 */ +/* File: libstki.c */ +/* Purpose: A library to deal with 'stacks' of */ +/* integers */ +/* Note: 'stacks' are dynamic (i.e. 
size is */ +/* automatically readjusted when needed) */ +/* History: */ +/* 00/03/92 : first draft */ +/* 15/08/93 : revised version */ +/* 14/05/99 : last revision */ +/* ==================================================== */ + +#include +#include +#include + +#include "libstki.h" +#include "ecoprimer.h" + + +/* ============================ */ +/* Constantes et Macros locales */ +/* ============================ */ + +#define ExpandStack(stkh) ResizeStacki((stkh), (*stkh)->size << 1) + +#define ShrinkStack(stkh) ResizeStacki((stkh), (*stkh)->size >> 1) + + +static int16_t sStkiLastError = kStkiNoErr; + +/* -------------------------------------------- */ +/* gestion des erreurs */ +/* get/reset erreur flag */ +/* */ +/* @function: StkiError */ +/* -------------------------------------------- */ + +int16_t StkiError(bool_t reset) +{ + int16_t err; + + err = sStkiLastError; + + if (reset) + sStkiLastError = kStkiNoErr; + + return err; + +} /* end of StkiError */ + +/* -------------------------------------------- */ +/* creation d'un stack */ +/* */ +/* @function: NewStacki */ +/* -------------------------------------------- */ + +StackiPtr NewStacki(int32_t size) +{ + StackiPtr stki; + + if (! (stki = NEW(Stacki))) + return NULL; + + stki->size = size; + stki->top = 0; + stki->cursor = 0; + + if ( ! 
(stki->val = NEWN(int32_t, size))) { + sStkiLastError = kStkiMemErr; + return FreeStacki(stki); + } + + return stki; + +} /* end of NewStacki */ + + +/* -------------------------------------------- */ +/* liberation d'un stack */ +/* */ +/* @function: FreeStacki */ +/* -------------------------------------------- */ + +StackiPtr FreeStacki(StackiPtr stki) +{ + if (stki) { + if (stki->val) + ECOFREE(stki->val,"Free stack values"); + ECOFREE(stki,"Free stack"); + } + + return NULL; + +} /* end of FreeStacki */ + +/* -------------------------------------------- */ +/* creation d'un vecteur de stacks */ +/* */ +/* @function: NewStackiVector */ +/* -------------------------------------------- */ + +StackiHdle NewStackiVector(int32_t vectSize, int32_t stackSize) +{ + int32_t i; + StackiHdle stkh; + + if (! (stkh = NEWN(StackiPtr, vectSize))) { + sStkiLastError = kStkiMemErr; + return NULL; + } + + for (i = 0 ; i < vectSize ; i++) + if (! (stkh[i] = NewStacki(stackSize))) + return FreeStackiVector(stkh, i); + + return stkh; + +} /* end of NewStackiVector */ + + +/* -------------------------------------------- */ +/* liberation d'un vecteur de stacks */ +/* */ +/* @function: FreeStackiVector */ +/* -------------------------------------------- */ + +StackiHdle FreeStackiVector(StackiHdle stkh, int32_t vectSize) +{ + int32_t i; + + if (stkh) { + for (i = 0 ; i < vectSize ; i++) + (void) FreeStacki(stkh[i]); + ECOFREE(stkh,"Free stack vector"); + } + + return NULL; + +} /* end of FreeStackiVector */ + +/* -------------------------------------------- */ +/* resize d'un stack */ +/* */ +/* @function: ResizeStacki */ +/* -------------------------------------------- */ + +int32_t ResizeStacki(StackiHdle stkh, int32_t size) +{ + int32_t resize = 0; /* assume error */ + int32_t *val; + + if ((val = ECOREALLOC((*stkh)->val, size * sizeof(int32_t),"Cannot reallocate stack values"))) { + (*stkh)->size = resize = size; + (*stkh)->val = val; + } + + if (! 
resize) + sStkiLastError = kStkiMemErr; + + return resize; + +} /* end of ResizeStacki */ + +/* -------------------------------------------- */ +/* empilage(/lement) */ +/* */ +/* @function: PushiIn */ +/* -------------------------------------------- */ + +bool_t PushiIn(StackiHdle stkh, int32_t val) +{ + if (((*stkh)->top >= (*stkh)->size) && (! ExpandStack(stkh))) + return FALSE; + + (*stkh)->val[((*stkh)->top)++] = val; + + return TRUE; + +} /* end of PushiIn */ + +/* -------------------------------------------- */ +/* depilage(/lement) */ +/* */ +/* @function: PopiOut */ +/* -------------------------------------------- */ + +bool_t PopiOut(StackiHdle stkh, int32_t *val) +{ + if ((*stkh)->top <= 0) + return FALSE; + + *val = (*stkh)->val[--((*stkh)->top)]; + + if ( ((*stkh)->top < ((*stkh)->size >> 1)) + && ((*stkh)->top > kMinStackiSize)) + + (void) ShrinkStack(stkh); + + return TRUE; + +} /* end of PopiOut */ + +/* -------------------------------------------- */ +/* lecture descendante */ +/* */ +/* @function: ReadiDown */ +/* -------------------------------------------- */ + +bool_t ReadiDown(StackiPtr stki, int32_t *val) +{ + if (stki->cursor <= 0) + return FALSE; + + *val = stki->val[--(stki->cursor)]; + + return TRUE; + +} /* end of ReadiDown */ + +/* -------------------------------------------- */ +/* lecture ascendante */ +/* */ +/* @function: ReadiUp */ +/* -------------------------------------------- */ + +bool_t ReadiUp(StackiPtr stki, int32_t *val) +{ + if (stki->cursor >= stki->top) + return FALSE; + + *val = stki->val[(stki->cursor)++]; + + return TRUE; + +} /* end of ReadiUp */ + +/* -------------------------------------------- */ +/* remontee/descente du curseur */ +/* */ +/* @function: CursiToTop */ +/* @function: CursiToBottom */ +/* -------------------------------------------- */ + +void CursiToTop(StackiPtr stki) +{ + stki->cursor = stki->top; + +} /* end of CursiToTop */ + +void CursiToBottom(stki) + StackiPtr stki; +{ + stki->cursor = 0; + 
+} /* end of CursiToBottom */ + +/* -------------------------------------------- */ +/* echange des valeurs cursor <-> (top - 1) */ +/* */ +/* @function: CursiSwap */ +/* -------------------------------------------- */ + +void CursiSwap(StackiPtr stki) +{ + int32_t tmp; + + if ((stki->top <= 0) || (stki->cursor < 0)) + return; + + tmp = stki->val[stki->cursor]; + stki->val[stki->cursor] = stki->val[stki->top - 1]; + stki->val[stki->top - 1] = tmp; + +} /* end of CursiSwap */ + +/* -------------------------------------------- */ +/* Recherche d'une valeur en stack a partir du */ +/* curseur courant en descendant. */ +/* on laisse le curseur a l'endroit trouve */ +/* */ +/* @function: SearchDownStacki */ +/* -------------------------------------------- */ + +bool_t SearchDownStacki(StackiPtr stki, int32_t sval) +{ + int32_t val; + bool_t more; + + while ((more = ReadiDown(stki, &val))) + if (val == sval) + break; + + return more; + +} /* end of SearchDownStacki */ + +/* -------------------------------------------- */ +/* Recherche dichotomique d'une valeur en stack */ +/* le stack est suppose trie par valeurs */ +/* croissantes. 
*/ +/* on place le curseur a l'endroit trouve */ +/* */ +/* @function: BinSearchStacki */ +/* -------------------------------------------- */ + +bool_t BinSearchStacki(StackiPtr stki, int32_t sval) +{ + int32_t midd, low, high, span; + + low = 0; + high = stki->top - 1; + + while (high >= low) { + + midd = (high + low) / 2; + + span = stki->val[midd] - sval; + + if (span == 0) { + stki->cursor = midd; + return TRUE; + } + + if (span > 0) + high = midd - 1; + else + low = midd + 1; + } + + return FALSE; + +} /* end of BinSearchStacki */ + +/* -------------------------------------------- */ +/* teste l'egalite *physique* de deux stacks */ +/* */ +/* @function: SameStacki */ +/* -------------------------------------------- */ + +bool_t SameStacki(StackiPtr stki1, StackiPtr stki2) +{ + if (stki1->top != stki2->top) + return FALSE; + + return ((memcmp(stki1->val, stki2->val, + stki1->top * sizeof(int32_t)) == 0) ? TRUE : FALSE); + +} /* end of SameStacki */ + + +/* -------------------------------------------- */ +/* inverse l'ordre des elements dans un stack */ +/* */ +/* @function: ReverseStacki */ +/* -------------------------------------------- */ + +bool_t ReverseStacki(StackiPtr stki) +{ + int32_t *t, *b, swp; + + if (stki->top <= 0) + return FALSE; + + b = stki->val; + t = b + stki->top - 1; + + while (t > b) { + swp = *t; + *t-- = *b; + *b++ = swp; + } + + return TRUE; + +} /* end of ReverseStacki */ + diff --git a/src/libecoprimer/libstki.h b/src/libecoprimer/libstki.h new file mode 100644 index 0000000..cad7d60 --- /dev/null +++ b/src/libecoprimer/libstki.h @@ -0,0 +1,89 @@ +/* ==================================================== */ +/* Copyright (c) Atelier de BioInformatique */ +/* Mar. 
92 */
/* File: libstki.h                                      */
/* Purpose: library of dynamic stacks holding           */
/*          integer values                              */
/* History:                                             */
/* 00/03/92 : first draft                               */
/* 07/07/93 : complete revision                         */
/* 10/03/94 : added xxxVector funcs                     */
/* 14/05/99 : last revision                             */
/* ==================================================== */

#ifndef _H_libstki
#define _H_libstki


#include "ecotype.h"

/* ==================================================== */
/* Sizing constants                                     */
/* ==================================================== */

#ifndef kMinStackiSize
#define kMinStackiSize 2 /* minimum stack size */
#endif


#define kStkiNoErr 0 /* ok */
#define kStkiMemErr 1 /* not enough memory */

/* values for the `reset` argument of StkiError() */
#define kStkiReset TRUE
#define kStkiGet FALSE

/* ==================================================== */
/* Standard macros                                      */
/* ==================================================== */

/* allocation helpers, defined only if no other header provided NEW */
#ifndef NEW
#define NEW(typ) (typ*)malloc(sizeof(typ))
#define NEWN(typ, dim) (typ*)malloc((uint32_t)(dim) * sizeof(typ))
#define REALLOC(typ, ptr, dim) (typ*)realloc((void *) (ptr), (uint32_t)(dim) * sizeof(typ))
#define FREE(ptr) free((Ptr) ptr)
#endif


/* ==================================================== */
/* Types & data structures                              */
/* ==================================================== */

                        /* -------------------- */
                        /* structure : stack    */
                        /* -------------------- */
typedef struct Stacki {
                        /* ---------------------*/
    int32_t size;       /* stack size           */
    int32_t top;        /* current free pos.    */
    int32_t cursor;     /* current cursor       */
    int32_t *val;       /* values               */
                        /* ---------------------*/
} Stacki, *StackiPtr, **StackiHdle;



/* ==================================================== */
/* Prototypes (generated by mproto)                     */
/* ==================================================== */

                        /* libstki.c */

int16_t    StkiError        (bool_t reset );
StackiPtr  NewStacki        (int32_t size );
StackiPtr  FreeStacki       (StackiPtr stki );
StackiHdle NewStackiVector  (int32_t vectSize, int32_t stackSize );
StackiHdle FreeStackiVector (StackiHdle stkh, int32_t vectSize );
int32_t    ResizeStacki     (StackiHdle stkh , int32_t size );
bool_t     PushiIn          (StackiHdle stkh , int32_t val );
bool_t     PopiOut          (StackiHdle stkh , int32_t *val );
bool_t     ReadiDown        (StackiPtr stki , int32_t *val );
bool_t     ReadiUp          (StackiPtr stki , int32_t *val );
void       CursiToTop       (StackiPtr stki );
void       CursiToBottom    (StackiPtr stki );
void       CursiSwap        (StackiPtr stki );                      /* swap val[cursor] and val[top-1]      */
bool_t     SearchDownStacki (StackiPtr stki , int32_t sval );       /* linear search via ReadiDown          */
bool_t     BinSearchStacki  (StackiPtr stki , int32_t sval );       /* binary search, stack must be sorted  */
bool_t     SameStacki       (StackiPtr stki1 , StackiPtr stki2 );   /* same top and same values             */
bool_t     ReverseStacki    (StackiPtr stki );                      /* reverse element order in place       */

#endif /* _H_libstki */
diff --git a/src/libecoprimer/mapping.c b/src/libecoprimer/mapping.c
new file mode 100644
index 0000000..96c84bd
--- /dev/null
+++ b/src/libecoprimer/mapping.c
@@ -0,0 +1,7 @@
/*
 * mapping.c
 *
 * Created on: 25 nov. 2008
 * Author: coissac
 */

diff --git a/src/libecoprimer/merge.c b/src/libecoprimer/merge.c
new file mode 100644
index 0000000..a638ca9
--- /dev/null
+++ b/src/libecoprimer/merge.c
@@ -0,0 +1,152 @@
/*
 * merge.c
 *
 * Created on: 11 nov. 
2008 + * Author: coissac + */ + +#include "ecoprimer.h" + +static pmerge_t mergeInit(pmerge_t merge,pwordcount_t data,uint32_t s1,uint32_t s2); + + +static pmerge_t mergeInit(pmerge_t merge, pwordcount_t data,uint32_t s1,uint32_t s2) +{ + merge->words = data->words; + merge->count = data->strictcount; + merge->write = 0; + merge->read1 = 0; + merge->read2 = s1; + merge->size = s1+s2; + return merge; +} + + +typedef enum {S1=1,S2=2,STACK=3} source_t; + +void ecomerge(pwordcount_t data,uint32_t s1,uint32_t s2,uint32_t remainingSeq,uint32_t seqQuorum) +{ + merge_t merged; + source_t source; + word_t currentword,tmpword; + uint32_t currentcount,tmpcount; + int same; + queue_t queue; + int nsame=0; + uint32_t maxcount=0; + bool_t writed=TRUE; + +// DEBUG_LOG("Coucou %p s1= %d s2= %d",data,s1,s2) + + (void)mergeInit(&merged,data,s1,s2); + (void)newQueue(&queue,MINI(s1,s2)); + + + while (merged.read1 < s1 || merged.read2 < merged.size) + { + if (! queue.empty) + { + currentword = queue.words[queue.pop]; + currentcount = queue.count[queue.pop]; + source=STACK; + } + else + { + currentword = merged.words[merged.read1]; + currentcount = merged.count[merged.read1]; + source=S1; + } + + if (merged.read2 < merged.size && + WORD(currentword) > WORD(merged.words[merged.read2])) + { + currentword = merged.words[merged.read2]; + currentcount = merged.count[merged.read2]; + source = S2; + } + + same = (source != S2) && (WORD(currentword) == WORD(merged.words[merged.read2])); + nsame+=same; + +// DEBUG_LOG("Merging : r1 = %d s1 = %d r2 = %d size = %d word = %s source=%u same=%u",merged.read1,s1,merged.read2-s1,merged.size,ecoUnhashWord(currentword,18),source,same) + + tmpword = merged.words[merged.write]; + tmpcount= merged.count[merged.write]; + + merged.words[merged.write] = currentword; + merged.count[merged.write] = currentcount; + + if (source != S2) + { + if (same) + { + merged.count[merged.write]+=merged.count[merged.read2]; + + if (ISMULTIWORD(currentword) || 
ISMULTIWORD(merged.words[merged.read2])) + merged.words[merged.write]=SETMULTIWORD(currentword); + + merged.read2++; + } + + if (source==STACK) + pop(&queue); + merged.read1++; + } + else + merged.read2++; + + if (writed && merged.read1 <= merged.write && merged.write < s1) + push(&queue,tmpword,tmpcount); + + if (merged.count[merged.write] > maxcount) + maxcount=merged.count[merged.write]; + + writed = remainingSeq + merged.count[merged.write] >= seqQuorum; + if (writed) + merged.write++; + + +// else +// DEBUG_LOG("Remove word : %s count : %d remainingSeq : %d total : %d Quorum : %d", +// ecoUnhashWord(currentword,18),merged.count[merged.write],remainingSeq,maxcount+remainingSeq,seqQuorum); + + } /* while loop */ + +// DEBUG_LOG("r1 : %d r2 : %d qsize : %d nsame : %d tot : %d write : %s count : %d source : %d size : %d pop : %d push : %d empty : %d",merged.read1,merged.read2-s1,qsize,nsame,qsize+nsame,ecoUnhashWord(currentword,18),currentcount,source,queue.size,queue.pop,queue.push,queue.empty) + + + if (merged.read2 < merged.size) + { + //DEBUG_LOG("end1 %d %d/%d %d/%d",merged.write,merged.read1,s1,merged.read2,merged.size); + for (;merged.read2 < merged.size;merged.read2++) + { + merged.words[merged.write]=merged.words[merged.read2]; + merged.count[merged.write]=merged.count[merged.read2]; + if (remainingSeq + merged.count[merged.write] >= seqQuorum) + merged.write++; + + } + } + else { + //DEBUG_LOG("end2 %d %d/%d %d/%d",merged.write,merged.read1,s1,merged.read2,merged.size); + while (! 
queue.empty) + { +// DEBUG_LOG("write : %s count : %d write : %d size : %d pop : %d push : %d empty : %d",ecoUnhashWord(queue.words[queue.pop],18),queue.count[queue.pop],merged.write,queue.size,queue.pop,queue.push,queue.empty) + merged.words[merged.write]=queue.words[queue.pop]; + merged.count[merged.write]=queue.count[queue.pop]; + pop(&queue); + if (remainingSeq + merged.count[merged.write] >= seqQuorum) + merged.write++; + } + } + + data->size = merged.write; + + cleanQueue(&queue); + +// DEBUG_LOG("Max count : %d remainingSeq : %d total : %d Quorum : %d",maxcount,remainingSeq,maxcount+remainingSeq,seqQuorum) +// DEBUG_LOG("Second word : %s",ecoUnhashWord(data->words[1],18)) +// DEBUG_LOG("Last word : %s",ecoUnhashWord(data->words[data->size-1],18)) + + +} diff --git a/src/libecoprimer/pairs.c b/src/libecoprimer/pairs.c new file mode 100644 index 0000000..88c1c4b --- /dev/null +++ b/src/libecoprimer/pairs.c @@ -0,0 +1,460 @@ +/* + * pairs.c + * + * Created on: 15 dŽc. 2008 + * Author: coissac + */ + +#include "ecoprimer.h" +#include +#include +#include "../libthermo/thermostats.h" + +static void buildPrimerPairsForOneSeq(uint32_t seqid, + pecodnadb_t seqdb, + pprimercount_t primers, + ppairtree_t pairs, + poptions_t options); + + + + + + +/************************************* + * + * pair collection management + * + *************************************/ + +#ifdef MASKEDCODE + +char *addamplifiasetelem (ppair_t pair, char* amplifia, int32_t taxid) +{ + uint32_t i; + uint32_t j; + char *ampused = NULL; + + if(pair->ampsetcount == 0) + { + pair->ampsetcount = 500; + pair->ampsetindex = 0; + pair->ampset = ECOMALLOC(pair->ampsetcount * sizeof(ampseqset_t),"Cannot allocate amplifia set"); + } + + for (i = 0; i < pair->ampsetindex; i++) + { + if (strcmp (pair->ampset[i].amplifia, amplifia) == 0) + { + ampused = pair->ampset[i].amplifia; + break; + } + } + + if (i == 0) + { + pair->ampset[i].seqidcount = 100; + pair->ampset[i].seqidindex = 0; + 
pair->ampset[i].taxonids = ECOMALLOC(pair->ampset[i].seqidcount * sizeof(uint32_t),"Cannot allocate amplifia sequence table"); + } + + if (pair->ampsetindex == pair->ampsetcount) + { + pair->ampsetcount += 500; + pair->ampset = ECOREALLOC(pair->ampset, pair->ampsetcount * sizeof(ampseqset_t), "Cannot allocate amplifia set"); + } + + if (pair->ampset[i].seqidindex == pair->ampset[i].seqidcount) + { + pair->ampset[i].seqidcount += 100; + pair->ampset[i].taxonids = ECOREALLOC(pair->ampset[i].taxonids, pair->ampset[i].seqidcount * sizeof(int32_t), "Cannot allocate amplifia sequence table"); + } + + if (pair->ampset[i].amplifia == NULL) + { + pair->ampset[i].amplifia = amplifia; + pair->ampsetindex++; + } + + for (j = 0; j < pair->ampset[i].seqidindex; j++) + { + if (pair->ampset[i].taxonids[j] == taxid) break; + } + + if (j == pair->ampset[i].seqidindex) + pair->ampset[i].taxonids[pair->ampset[i].seqidindex++] = taxid; + return ampused; +} + +void addtaxampsetelem (ppair_t pair, int32_t taxid, char *amplifia) +{ + uint32_t i; + uint32_t j; + + if(pair->taxsetcount == 0) + { + pair->taxsetcount = 500; + pair->taxsetindex = 0; + pair->taxset = ECOMALLOC(pair->taxsetcount * sizeof(taxampset_t),"Cannot allocate taxon set"); + } + + for (i = 0; i < pair->taxsetindex; i++) + { + if (pair->taxset[i].taxonid == taxid) break; + } + + if (i == 0) + { + pair->taxset[i].amplifiacount = 100; + pair->taxset[i].amplifiaindex = 0; + pair->taxset[i].amplifia = ECOMALLOC(pair->taxset[i].amplifiacount * sizeof(char *),"Cannot allocate amplifia table"); + } + + if (pair->taxsetindex == pair->taxsetcount) + { + pair->taxsetcount += 500; + pair->taxset = ECOREALLOC(pair->taxset, pair->taxsetcount * sizeof(taxampset_t), "Cannot allocate taxon set"); + } + + if (pair->taxset[i].amplifiaindex == pair->taxset[i].amplifiacount) + { + pair->taxset[i].amplifiacount += 100; + pair->taxset[i].amplifia = ECOREALLOC(pair->taxset[i].amplifia, pair->taxset[i].amplifiacount * sizeof(char *), "Cannot 
allocate amplifia table"); + } + + if (pair->taxset[i].taxonid == 0) + { + pair->taxset[i].taxonid = taxid; + pair->taxsetindex++; + } + + for (j = 0; j < pair->taxset[i].amplifiaindex; j++) + { + if (strcmp(pair->taxset[i].amplifia[j], amplifia) == 0) break; + } + + if (j == pair->taxset[i].amplifiaindex) + { + pair->taxset[i].amplifia[j] = amplifia; + pair->taxset[i].amplifiaindex++; + } +} + +char *getamplifia (pecoseq_t seq, uint32_t start, uint32_t len) +{ + fprintf(stderr,"start : %d length : %d\n",start,len); + char *amplifia = ECOMALLOC((len + 1) * sizeof(char),"Cannot allocate amplifia"); + char *seqc = &seq->SQ[start]; + + strncpy(amplifia, seqc, len); + return amplifia; +} + +#endif + +/*TR: Added*/ +ppairtree_t buildPrimerPairs(pecodnadb_t seqdb,uint32_t seqdbsize,pprimercount_t primers,poptions_t options) +{ + uint32_t i; + ppairtree_t primerpairs; + + primerpairs = initpairtree(NULL); + + for (i=0; i < seqdbsize; i++) + { + buildPrimerPairsForOneSeq(i, seqdb, primers, primerpairs, options); + } + return primerpairs; +} + +#define DMAX (2000000000) + +static void buildPrimerPairsForOneSeq(uint32_t seqid, + pecodnadb_t seqdb, + pprimercount_t primers, + ppairtree_t pairs, + poptions_t options) +{ + static uint32_t paircount=0; + uint32_t i,j,k; + uint32_t matchcount=0; + pprimermatch_t matches = NULL; + //primermatchcount_t seqmatchcount; + ppair_t pcurrent; + pair_t current; + pprimer_t wswp; + bool_t bswp; + size_t distance; + bool_t strand; + //char prmr[50]; + //float mtemp; + word_t w1, w1a, omask = (0x1L << (options->strict_three_prime*2)) -1; + word_t w2, w2a;//, wtmp; + uint32_t bp1,bp2; + + //prmr[options->primer_length] = '\0'; + + for (i=0;i < primers->size; i++) + { + matchcount+=primers->primers[i].directCount[seqid]; + matchcount+=primers->primers[i].reverseCount[seqid]; + } + + if (matchcount <= 0) + return; + + matches = ECOMALLOC(matchcount * sizeof(primermatch_t),"Cannot allocate primers match table"); + + for (i=0,j=0;i < 
primers->size; i++) + { + if (primers->primers[i].directCount[seqid]) + { + if (primers->primers[i].directCount[seqid]==1) + { + matches[j].primer = primers->primers+i; + matches[j].strand=TRUE; + matches[j].position=primers->primers[i].directPos[seqid].value; + j++; + } + else for (k=0; k < primers->primers[i].directCount[seqid]; k++,j++) + { + matches[j].primer = primers->primers+i; + matches[j].strand=TRUE; + matches[j].position=primers->primers[i].directPos[seqid].pointer[k]; + } + } + + if (primers->primers[i].reverseCount[seqid]) + { + if (primers->primers[i].reverseCount[seqid]==1) + { + matches[j].primer = primers->primers+i; + matches[j].strand=FALSE; + matches[j].position=primers->primers[i].reversePos[seqid].value; + j++; + } + else for (k=0; k < primers->primers[i].reverseCount[seqid]; k++,j++) + { + matches[j].primer = primers->primers+i; + matches[j].strand=FALSE; + matches[j].position=primers->primers[i].reversePos[seqid].pointer[k]; + } + } + } + + if (matchcount>1) + { +// fprintf(stderr,"\n====================================\n"); + + sortmatch(matches,matchcount); // sort in ascending order by position + + for (i=0; i < matchcount;i++) + { + // For all primers matching the sequence + + /*for(j=i+1; + (jprimer_length) < options->lmax); + j++ + )//*/ + for (j=i+1; jprimer_length) continue; + distance = matches[j].position - matches[i].position - options->primer_length; + if (distance >= options->lmax) break; + + + // For all not too far primers + + if ( (matches[i].primer->good || matches[j].primer->good) + && (distance > options->lmin) + ) + { + // If possible primer pair + current.p1 = matches[i].primer; + current.asdirect1=matches[i].strand; + current.p2 = matches[j].primer; + current.asdirect2= !matches[j].strand; + current.maxd=DMAX; + current.mind=DMAX; + current.sumd=0; + current.amplifiacount=0; + current.inexample=0; + current.outexample=0; + current.curseqid = 0; + current.refsequence=-1; + //current.p1temp = 100; + //current.p1mintemp = 
100; + //current.p2temp = 100; + //current.p2mintemp = 100; + + // Standardize the pair + strand = current.p2->word > current.p1->word; + if (!strand) + { + wswp = current.p1; + current.p1=current.p2; + current.p2=wswp; + + bswp = current.asdirect1; + current.asdirect1=current.asdirect2; + current.asdirect2=bswp; + } + + + //Code to make sure that if -3 option is given then + //3' end must match upto given number of base pairs + if (options->strict_three_prime > 0) + { + w1 = current.p1->word; + w2 = current.p2->word; + if (!current.asdirect1) //make sure that word is from 5' to 3' + w1=ecoComplementWord(w1,options->primer_length); + + if (!current.asdirect2) //make sure that word is from 5' to 3' + w2=ecoComplementWord(w2,options->primer_length); + //now both w1 and w2 are from 5' to 3' end + bp1 = matches[i].position; + bp2 = matches[j].position; + if (!strand) + { + bp1 = matches[j].position; + bp2 = matches[i].position; + } + //get word of first approximate repeat + w1a = extractSite(seqdb[seqid]->SQ,bp1,options->primer_length,strand); + //get word of second approximate repeat + w2a = extractSite(seqdb[seqid]->SQ,bp2,options->primer_length,!strand); + + w1 = w1 & omask; //keep only strict_three_prime bases on the right (3') end + w2 = w2 & omask; //keep only strict_three_prime bases on the right (3') end + w1a = w1a & omask; //keep only strict_three_prime bases on the right (3') end + w2a = w2a & omask; //keep only strict_three_prime bases on the right (3') end + + //now check that both words and primers of amplifia have same bases on 3' end + if ((w1 ^ w1a) != 0) continue; + if ((w2 ^ w2a) != 0) continue; + } + + + + // Look for the new pair in already seen pairs + + pcurrent = insertpair(current,pairs); + + + if (seqdb[seqid]->isexample) + + { + //pcurrent->inexample++; + pcurrent->sumd+=distance; + pcurrent->amplifiacount++; + + if ((pcurrent->maxd==DMAX) || (distance > pcurrent->maxd)) + pcurrent->maxd = distance; + + if (distance < pcurrent->mind) + 
pcurrent->mind = distance; + } + //else + // pcurrent->outexample++; + + //for each pair we save current sequence id in the pair + //when we see this pair for the first time in currnet sequence + //because we want to increment inexample & outexample count + //only once for one sequence + if (pcurrent->curseqid != (seqid+1)) + { + if (seqdb[seqid]->isexample) + pcurrent->inexample++; + else + pcurrent->outexample++; + + if (pcurrent->curseqid != 0) + pcurrent->curseqid = seqid+1; + } + + /*if ((pcurrent->outexample+pcurrent->inexample)==0) + { + fprintf(stderr,"pcurrent->outexample+pcurrent->inexample=0!\n"); + exit(0); + }*/ + + if (pcurrent->curseqid == 0)//((pcurrent->outexample+pcurrent->inexample)==1) + { + pcurrent->curseqid = seqid+1; + paircount++; + pcurrent->pcr.ampslot=200; + pcurrent->pcr.ampcount=0; + pcurrent->pcr.amplifias = ECOMALLOC(sizeof(amplifia_t)*pcurrent->pcr.ampslot, + "Cannot allocate amplifia table"); + } + else + { + if (pcurrent->pcr.ampslot==pcurrent->pcr.ampcount) + { + pcurrent->pcr.ampslot+=200; + pcurrent->pcr.amplifias = ECOREALLOC(pcurrent->pcr.amplifias, + sizeof(amplifia_t)*pcurrent->pcr.ampslot, + "Cannot allocate amplifia table"); + } + } + + if (seqid==options->refseqid) + pcurrent->refsequence=seqid; + pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].length=distance; + pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].sequence=seqdb[seqid]; + pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].strand=strand; + pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].begin=matches[i].position + options->primer_length; + pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].end= matches[j].position - 1; + + if (strand) + pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].amplifia= seqdb[seqid]->SQ + matches[i].position + options->primer_length; + else + pcurrent->pcr.amplifias[pcurrent->pcr.ampcount].amplifia= seqdb[seqid]->SQ + matches[j].position - 1 ; + + + /*strncpy (prmr, seqdb[seqid]->SQ + matches[i].position, options->primer_length); + mtemp = 
nparam_CalcSelfTM (options->pnparm, prmr, options->primer_length) - 273.0; + if (mtemp < pcurrent->p1mintemp) + pcurrent->p1mintemp = mtemp; + //fprintf (stderr, "prmr1: %s\n", seqdb[seqid]->SQ); + strncpy (prmr, seqdb[seqid]->SQ + matches[j].position, options->primer_length); + mtemp = nparam_CalcSelfTM (options->pnparm, prmr, options->primer_length) - 273.0; + if (mtemp < pcurrent->p2mintemp) + pcurrent->p2mintemp = mtemp; + //fprintf (stderr, "prmr2: %s\n", prmr); + + if (pcurrent->p1temp == 100) + pcurrent->p1temp = nparam_CalcSelfTM (options->pnparm, ecoUnhashWord(pcurrent->p1->word, options->primer_length), 0) - 273.0; + if (pcurrent->p2temp == 100) + pcurrent->p2temp = nparam_CalcSelfTM (options->pnparm, ecoUnhashWord(pcurrent->p2->word, options->primer_length), 0) - 273.0; + */ + pcurrent->pcr.ampcount++; +// fprintf(stderr,"%c%c W1 : %s direct : %c", +// "bG"[(int)pcurrent->p1->good], +// "bG"[(int)pcurrent->p2->good], +// ecoUnhashWord(pcurrent->p1->word, options->primer_length), +// "><"[(int)pcurrent->asdirect1] +// ); +// +// fprintf(stderr," W2 : %s direct : %c distance : %d (min/max/avg : %d/%d/%f) in/out: %d/%d %c (%d pairs)\n", +// ecoUnhashWord(pcurrent->p2->word, options->primer_length), +// "><"[(int)pcurrent->asdirect2], +// distance, +// pcurrent->mind,pcurrent->maxd, +// (pcurrent->inexample) ? 
(float)pcurrent->sumd/pcurrent->inexample:0.0, +// pcurrent->inexample,pcurrent->outexample, +// " N"[(pcurrent->outexample+pcurrent->inexample)==1], +// paircount +// +// ); +// + + } + } + } + } + pairs->count=paircount; + +} diff --git a/src/libecoprimer/pairtree.c b/src/libecoprimer/pairtree.c new file mode 100644 index 0000000..7155104 --- /dev/null +++ b/src/libecoprimer/pairtree.c @@ -0,0 +1,136 @@ +/* + * pairtree.c + * + * Created on: 7 mars 2009 + * Author: coissac + */ + +#include "ecoprimer.h" +#include + +static void cleanpair(ppair_t pair); +static void deletepairlist(ppairlist_t list); +static int cmppair(const void* p1,const void*p2); + + +static void cleanamplifiatlist(pamplifiacount_t list) +{ + if (list->amplifias) + ECOFREE(list->amplifias, + "Free amplifia list"); +} + +static void cleanpair(ppair_t pair) +{ + cleanamplifiatlist(&(pair->pcr)); +} + +static ppairlist_t newpairlist(ppairlist_t parent, size_t size) +{ + ppairlist_t tmp; + + tmp=ECOMALLOC(sizeof(pairlist_t)+sizeof(pair_t)*(size-1), + "Cannot allocate new pair list"); + + tmp->pairslots=size; + tmp->paircount=0; + tmp->next=NULL; + + if (parent) + parent->next=(void*)tmp; + + + return tmp; +} + +static void deletepairlist(ppairlist_t list) +{ + size_t i; + + if (list) + { + if (list->next) + { + deletepairlist(list->next); + list->next=NULL; + } + for (i=0; i < list->paircount; i++) + cleanpair((list->pairs)+i); + + ECOFREE(list,"Delete pair list"); + } + +} + +static int cmppair(const void* p1,const void*p2) +{ + ppair_t pr1,pr2; + + pr1=(ppair_t)p1; + pr2=(ppair_t)p2; + + if (pr1->p1 < pr2->p1) return -1; + if (pr1->p1 > pr2->p1) return 1; + + if (pr1->asdirect1 < pr2->asdirect1) return -1; + if (pr1->asdirect1 > pr2->asdirect1) return 1; + + if (pr1->p2 < pr2->p2) return -1; + if (pr1->p2 > pr2->p2) return 1; + + if (pr1->asdirect2 < pr2->asdirect2) return -1; + if (pr1->asdirect2 > pr2->asdirect2) return 1; + + return 0; +} + +ppair_t pairintree (pair_t key, + ppairtree_t 
pairlist) +{ + if (!pairlist->tree) + return NULL; + + return *((ppair_t*)tsearch((const void *)(&key), + &(pairlist->tree), + cmppair + )); +} + +ppair_t insertpair(pair_t key, + ppairtree_t list) +{ + ppair_t current; + ppair_t found; + + if (list->last->paircount==list->last->pairslots) + { + list->last->next=newpairlist(list->last,100); + list->last=list->last->next; + } + + current = list->last->pairs + list->last->paircount; + *current=key; + + found = *((ppair_t*)tsearch((const void *)current, + &(list->tree), + cmppair)); + if (found==current) + list->last->paircount++; + + return found; +} + +ppairtree_t initpairtree(ppairtree_t tree) +{ + + if (!tree) + tree = ECOMALLOC(sizeof(pairtree_t),"Cannot allocate pair tree"); + + tree->first=newpairlist(NULL,300); + tree->last=tree->first; + + tree->tree=NULL; + tree->count=0; + + return tree; +} diff --git a/src/libecoprimer/queue.c b/src/libecoprimer/queue.c new file mode 100644 index 0000000..7d11b25 --- /dev/null +++ b/src/libecoprimer/queue.c @@ -0,0 +1,100 @@ +/* + * queue.c + * + * Created on: 14 nov. 
2008 + * Author: coissac + */ + +#include "ecoprimer.h" + + + +pqueue_t newQueue(pqueue_t queue, uint32_t size) +{ + if (!queue) + queue = ECOMALLOC(sizeof(queue_t),"Cannot allocate queue structure"); + + queue->size=0; + + resizeQueue(queue,size); + + return queue; + +} + +pqueue_t resizeQueue(pqueue_t queue, uint32_t size) +{ + queue->pop=0; + queue->push=0; + queue->empty=TRUE; + queue->full=FALSE; + + if (!queue->size) + { + queue->count=ECOMALLOC(size * sizeof(uint32_t), + "Cannot allocate count queue array" + ); + queue->words=ECOMALLOC(size * sizeof(word_t), + "Cannot allocate word queue array" + ); + queue->size=size; + } + else if (size > queue->size) + { + queue->count=ECOREALLOC(queue->count, + size * sizeof(uint32_t), + "Cannot allocate count queue array" + ); + queue->words=ECOREALLOC(queue->words, + size * sizeof(word_t), + "Cannot allocate word queue array" + ); + + queue->size=size; + } + + return queue; +} + +pqueue_t cleanQueue(pqueue_t queue) +{ + if (queue->size) + { + if (queue->count) + ECOFREE(queue->count,"Free count queue"); + if (queue->words) + ECOFREE(queue->words,"Free words queue"); + } + + queue->size=0; + + return queue; +} + +void push(pqueue_t queue, word_t word, uint32_t count) +{ + ECO_ASSERT(!queue->full,"Queue is full"); + + queue->count[queue->push]=count; + queue->words[queue->push]=word; + + queue->push++; + + if (queue->push==queue->size) + queue->push=0; + + queue->full=queue->push==queue->pop; + queue->empty=FALSE; +} + +void pop(pqueue_t queue) +{ + ECO_ASSERT(!queue->empty,"Queue is empty"); + queue->pop++; + + if (queue->pop==queue->size) + queue->pop=0; + + queue->empty=queue->push==queue->pop; + queue->full=FALSE; +} diff --git a/src/libecoprimer/readdnadb.c b/src/libecoprimer/readdnadb.c new file mode 100644 index 0000000..ced45c5 --- /dev/null +++ b/src/libecoprimer/readdnadb.c @@ -0,0 +1,59 @@ +/* + * readdnadb.c + * + * Created on: 7 nov. 
2008 + * Author: coissac + */ + +#include "ecoprimer.h" + +pecodnadb_t readdnadb(const char *name, ecotaxonomy_t *taxonomy, uint32_t *size,poptions_t options) +{ + ecoseq_t *seq; + uint32_t buffsize=100; + pecodnadb_t db; + + db = ECOMALLOC(buffsize*sizeof(ecoseq_t*),"I cannot allocate db memory"); + + + for(seq=ecoseq_iterator(name), *size=0; + seq; + seq=ecoseq_iterator(NULL) + ) + { + if (isExampleTaxon(taxonomy,seq->taxid,options) || + isCounterExampleTaxon(taxonomy,seq->taxid,options)) + { + if (*size==buffsize) + { + buffsize*=2; + db = ECOREALLOC(db,buffsize*sizeof(ecoseq_t*),"I cannot allocate db memory"); + } + db[*size]=seq; + (*size)++; + } + else + { + delete_ecoseq(seq); + } + }; + + db = ECOREALLOC(db,(*size)*sizeof(ecoseq_t*),"I cannot allocate db memory"); + + return db; +} + + +void printSeqTest(pecodnadb_t seqdb,uint32_t seqdbsize) +{ + uint32_t i; + char ch[11]; + ch [10] = '\0'; + + for (i=0; i < seqdbsize; i++) + { + strncpy (ch, seqdb[i]->SQ, 10); + fprintf (stderr, "seq %d = %s\n", i, ch); + } + exit (0); +} diff --git a/src/libecoprimer/smothsort.c b/src/libecoprimer/smothsort.c new file mode 100644 index 0000000..72ee444 --- /dev/null +++ b/src/libecoprimer/smothsort.c @@ -0,0 +1,265 @@ +/* + * This file is part of the Sofia-SIP package + * + * Copyright (C) 2005 Nokia Corporation. + * + * Contact: Pekka Pessi + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA
 *
 */

/**@file smoothsort.c
 * @brief Smoothsort implementation
 *
 * Smoothsort is a in-place sorting algorithm with performance of O(NlogN)
 * in worst case and O(n) in best case.
 *
 * @sa
 * "Smoothsort, an alternative for sorting in-situ", E.D. Dijkstra, EWD796a,
 * <http://www.enterag.ch/hartwig/order/smoothsort.pdf>.
 *
 * @author Pekka Pessi
 */


/* NOTE(review): the header names of the four #include lines below were */
/* lost when this text was extracted; restore them from the repository. */
#include
#include
#include
#include /* added to switch from size_t to uint32_t */

/** Description of current stretch */
typedef struct {
  uint32_t b, c;		/**< Leonardo numbers */
  unsigned long long p;		/**< Concatenation codification */
} stretch;

/** Description of array */
typedef struct
{
  void *m;                                      /**< array being sorted */
  int (*less)(void *m, uint32_t a, uint32_t b); /**< comparison callback */
  void (*swap)(void *m, uint32_t a, uint32_t b);/**< exchange callback */
} array;

/** Step the stretch up: shift p right by one bit and replace (b, c)
 *  by (b + c + 1, b). Returns the new b.
 */
static inline uint32_t stretch_up(stretch s[1])
{
  uint32_t next;

  s->p >>= 1;

  next = s->b + s->c + 1, s->c = s->b, s->b = next;

  return next;
}

/** Step the stretch down: shift p left by one bit, OR in @a bit, and
 *  replace (b, c) by (c, b - c - 1). Returns the new b.
 */
static inline uint32_t stretch_down(stretch s[1], unsigned bit)
{
  uint32_t next;

  s->p <<= 1, s->p |= bit;

  next = s->c, s->c = s->b - s->c - 1, s->b = next;

  return next;
}

/* NOTE(review): when DEBUG_SMOOTHSORT is set, DEBUG() is not defined in  */
/* this file, and the DEBUG() call in su_smoothsort() refers to a name    */
/* (nmemb) that is not declared there -- the debug build presumably needs */
/* fixing before use.                                                     */
#if DEBUG_SMOOTHSORT
/** Format @a p in base 2; the result points into a static buffer. */
static char const *binary(unsigned long long p)
{
  static char binary[65];
  int i;

  if (p == 0)
    return "0";

  binary[64] = 0;

  for (i = 64; p; p >>= 1)
    binary[--i] = "01"[p & 1];

  return binary + i;
}
#else
#define DEBUG(x) ((void)0)
#endif

/**
 * Sift the root of the stretch.
 *
 * The low values are sifted up (towards index 0) from root.
 *
 * @param array description of array to sort
 * @param r root of the stretch
 * @param s description of current stretch
 */
static void sift(array const *array, uint32_t r, stretch s)
{
  while (s.b >= 3) {
    /* start from the element at r - b + c, then prefer r - 1 unless */
    /* less(r - 1, r2) holds                                         */
    uint32_t r2 = r - s.b + s.c;

    if (!array->less(array->m, r - 1, r2)) {
      r2 = r - 1;
      stretch_down(&s, 0);
    }

    /* stop as soon as less(r2, r) holds */
    if (array->less(array->m, r2, r))
      break;

    DEBUG(("\tswap(%p @%zu <=> @%zu)\n", array, r, r2));

    array->swap(array->m, r, r2); r = r2;

    stretch_down(&s, 0);
  }
}

/** Trinkle the roots of the given stretches
 *
 * @param array description of array to sort
 * @param r root of the stretch
 * @param s description of stretches to concatenate
 */
static void trinkle(array const *array, uint32_t r, stretch s)
{
  DEBUG(("trinkle(%p, %zu, (%u, %s))\n", array, r, s.b, binary(s.p)));

  while (s.p != 0) {
    uint32_t r2, r3;

    /* step up until the lowest bit of p is set */
    while ((s.p & 1) == 0)
      stretch_up(&s);

    if (s.p == 1)
      break;

    /* r3: the element s.b positions before r */
    r3 = r - s.b;

    if (array->less(array->m, r3, r))
      break;

    s.p--;

    if (s.b < 3) {
      DEBUG(("\tswap(%p @%zu <=> @%zu b=%u)\n", array, r, r3, s.b));
      array->swap(array->m, r, r3); r = r3;
      continue;
    }

    r2 = r - s.b + s.c;

    if (array->less(array->m, r2, r - 1)) {
      r2 = r - 1;
      stretch_down(&s, 0);
    }

    if (array->less(array->m, r2, r3)) {
      DEBUG(("swap(%p [%zu]=[%zu])\n", array, r, r3));
      array->swap(array->m, r, r3); r = r3;
      continue;
    }

    DEBUG(("\tswap(%p @%zu <=> @%zu b=%u)\n", array, r, r2, s.b));
    array->swap(array->m, r, r2); r = r2;
    stretch_down(&s, 0);
    break;
  }

  sift(array, r, s);
}

/** Trinkles the stretches when the adjacent stretches are already trusty.
 *
 * @param array description of array to sort
 * @param r root of the stretch
 * @param s description of stretches to trinkle
 */
static void semitrinkle(array const *array, uint32_t r, stretch s)
{
  uint32_t r1 = r - s.c;

  DEBUG(("semitrinkle(%p, %zu, (%u, %s))\n", array, r, s.b, binary(s.p)));

  if (array->less(array->m, r, r1)) {
    DEBUG(("\tswap(%p @%zu <=> @%zu b=%u)\n", array, r, r1, s.b));
    array->swap(array->m, r, r1);
    trinkle(array, r1, s);
  }
}

/** Sort array using smoothsort.
 *
 * Sort @a N elements from array @a base starting with index @a r with smoothsort.
 *
 * Returns immediately when @a base is NULL, @a N <= 1, or a callback is NULL
 * (the latter also trips the assert when assertions are enabled).
 *
 * @param base  pointer to array
 * @param r     lowest index to sort
 * @param N     number of elements to sort
 * @param less  comparison function returning nonzero if m[a] < m[b]
 * @param swap  swapper function exchanging elements m[a] and m[b]
 */
void su_smoothsort(void *base, uint32_t r, uint32_t N,
		   int (*less)(void *m, uint32_t a, uint32_t b),
		   void (*swap)(void *m, uint32_t a, uint32_t b))
{
  stretch s = { 1, 1, 1 };
  uint32_t q;

  array const array[1] = {{ base, less, swap }};

  assert(less && swap);

  if (base == NULL || N <= 1 || less == NULL || swap == NULL)
    return;

  DEBUG(("\nsmoothsort(%p, %zu)\n", array, nmemb));

  /* first pass: walk forward through the array, one element per step */
  for (q = 1; q != N; q++, r++, s.p++) {
    DEBUG(("loop0 q=%zu, b=%u, p=%s \n", q, s.b, binary(s.p)));

    if ((s.p & 7) == 3) {
      sift(array, r, s), stretch_up(&s), stretch_up(&s);
    }
    else /* if ((s.p & 3) == 1) */ { assert((s.p & 3) == 1);
      if (q + s.c < N)
	sift(array, r, s);
      else
	trinkle(array, r, s);

      while (stretch_down(&s, 0) > 1)
	;
    }
  }

  trinkle(array, r, s);

  /* second pass: walk back down, one element per step */
  for (; q > 1; q--) {
    s.p--;

    DEBUG(("loop1 q=%zu: b=%u p=%s\n", q, s.b, binary(s.p)));

    if (s.b <= 1) {
      while ((s.p & 1) == 0)
	stretch_up(&s);
      --r;
    }
    else /* if b >= 3 */ {
      if (s.p) semitrinkle(array, r - (s.b - s.c), s);
      stretch_down(&s, 1);
      semitrinkle(array, --r, s);
      stretch_down(&s, 1);
    }
  }
}
diff --git 
a/src/libecoprimer/sortmatch.c b/src/libecoprimer/sortmatch.c
new file mode 100644
index 0000000..f3771b7
--- /dev/null
+++ b/src/libecoprimer/sortmatch.c
@@ -0,0 +1,51 @@
+/*
+ * sortmatch.c
+ *
+ * Created on: 15 Dec. 2008
+ * Author: coissac
+ */
+
+/*
+ * sortword.c
+ *
+ *
+ * Created on: 6 nov. 2008
+ * Author: coissac
+ */
+
+#include "ecoprimer.h"
+#include 
+
+void su_smoothsort(void *base, uint32_t r, uint32_t N,
+                   int (*less)(void *m, uint32_t a, uint32_t b),
+                   void (*swap)(void *m, uint32_t a, uint32_t b));
+
+static int less(void *m, uint32_t a, uint32_t b);
+static void swap(void *m, uint32_t a, uint32_t b);
+
+
+/* Sort the N primer matches of table in place, ordered by their position field. */
+void sortmatch(pprimermatch_t table,uint32_t N)
+{
+    su_smoothsort((void*)table,0,N,less,swap);
+}
+
+/* Comparison callback for su_smoothsort(): nonzero when match a does not lie
+ * after match b.
+ * NOTE(review): this is "<=" whereas su_smoothsort documents a strict "<"
+ * callback - confirm that equal positions are meant to compare as "less". */
+int less(void *m, uint32_t a, uint32_t b)
+{
+    pprimermatch_t matches = (pprimermatch_t)m;
+
+    return matches[a].position <= matches[b].position;
+}
+
+/* Exchange callback for su_smoothsort(): swaps the whole records a and b. */
+void swap(void *m, uint32_t a, uint32_t b)
+{
+    pprimermatch_t matches = (pprimermatch_t)m;
+    primermatch_t  saved   = matches[a];
+
+    matches[a] = matches[b];
+    matches[b] = saved;
+}
+
diff --git a/src/libecoprimer/sortword.c b/src/libecoprimer/sortword.c
new file mode 100644
index 0000000..389630f
--- /dev/null
+++ b/src/libecoprimer/sortword.c
@@ -0,0 +1,44 @@
+/*
+ * sortword.c
+ *
+ *
+ * Created on: 6 nov.
2008
+ * Author: coissac
+ */
+
+#include "ecoprimer.h"
+#include 
+
+void su_smoothsort(void *base, uint32_t r, uint32_t N,
+                   int (*less)(void *m, uint32_t a, uint32_t b),
+                   void (*swap)(void *m, uint32_t a, uint32_t b));
+
+static int less(void *m, uint32_t a, uint32_t b);
+static void swap(void *m, uint32_t a, uint32_t b);
+
+
+/* Sort the N words of table in place, ordered by their WORD() value. */
+void sortword(pword_t table,uint32_t N)
+{
+    su_smoothsort((void*)table,0,N,less,swap);
+}
+
+/* Comparison callback for su_smoothsort(): nonzero when WORD(t[a]) <= WORD(t[b]). */
+int less(void *m, uint32_t a, uint32_t b)
+{
+    pword_t t;
+
+    t = (pword_t)m;
+
+    return WORD(t[a]) <= WORD(t[b]);
+}
+
+/* Exchange callback for su_smoothsort(): swaps the words at indices a and b. */
+void swap(void *m, uint32_t a, uint32_t b)
+{
+    word_t tmp;
+    pword_t t;
+
+    t = (pword_t)m;
+    tmp = t[a];
+    t[a]= t[b];
+    t[b]= tmp;
+}
+
diff --git a/src/libecoprimer/strictprimers.c b/src/libecoprimer/strictprimers.c
new file mode 100644
index 0000000..b20587e
--- /dev/null
+++ b/src/libecoprimer/strictprimers.c
@@ -0,0 +1,264 @@
+/*
+ * strictprimers.c
+ *
+ * Created on: 7 nov. 2008
+ * Author: coissac
+ */
+
+#define _GNU_SOURCE
+#include "ecoprimer.h"
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#ifndef RUSAGE_SELF
+#define RUSAGE_SELF 0
+#define RUSAGE_CHILDREN -1
+#endif
+
+static double timeval_subtract (struct timeval *x, struct timeval *y);
+
+
+ /* Subtract the `struct timeval' value Y from X and
+    return the elapsed seconds as a double.
+    Neither X nor Y is modified: the carry is done on a private copy of Y,
+    so the same start time can safely be reused for later measurements. */
+
+double timeval_subtract (struct timeval *x, struct timeval *y)
+{
+    struct timeval result;
+    struct timeval sub = *y;   /* working copy, the caller's value stays intact */
+
+    /* Perform the carry for the later subtraction by updating sub. */
+    if (x->tv_usec < sub.tv_usec) {
+        int nsec = (sub.tv_usec - x->tv_usec) / 1000000 + 1;
+        sub.tv_usec -= 1000000 * nsec;
+        sub.tv_sec += nsec;
+    }
+    if (x->tv_usec - sub.tv_usec > 1000000) {
+        int nsec = (x->tv_usec - sub.tv_usec) / 1000000;
+        sub.tv_usec += 1000000 * nsec;
+        sub.tv_sec -= nsec;
+    }
+
+    /* Compute the elapsed time.
+       tv_usec is certainly positive. */
+    result.tv_sec = x->tv_sec - sub.tv_sec;
+    result.tv_usec = x->tv_usec - sub.tv_usec;
+
+    return (double)result.tv_sec + (double)result.tv_usec/1e6;
+ }
+
+/* Create or reset a word count table.
+ *
+ * When table is NULL a new structure is allocated. In every case all fields
+ * are reset. If seq is given, the table is then filled with the compacted
+ * words hashed from that sequence, inseqcount becomes 1 and every word
+ * starts with a strict count of 1.
+ *
+ * NOTE(review): the counters are reset even when an existing table is passed
+ * in, so an outseqcount accumulated by lookforStrictPrimer before the first
+ * example sequence is lost - confirm that this is intended.
+ *
+ * Returns the (possibly newly allocated) table.
+ */
+pwordcount_t initCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount)
+{
+    uint32_t i;
+    uint32_t buffsize;
+    //wordcount_t t;
+
+    if (!table)
+        table = ECOMALLOC(sizeof(wordcount_t),"Cannot allocate memory for word count structure");
+
+    table->words=NULL;
+    table->size =0;
+    table->outseqcount=0;
+    table->inseqcount=0;
+    table->strictcount =0;
+
+    if (seq)
+    {
+        table->words = ecoHashSequence(NULL,wordsize,circular,doublestrand,seq,&buffsize,neededWords,neededWordCount,seqQuorum);
+        table->size  = ecoCompactHashSequence(table->words,buffsize);
+
+        table->inseqcount=1;
+        table->strictcount =ECOMALLOC((table->size*sizeof(uint32_t)),
+                                      "Cannot allocate memory for word count table"
+                                     );
+
+        for (i=0; i < table->size; i++) table->strictcount[i]=1;
+    }
+
+    return table;
+}
+
+/* Hash the words of seq behind the current content of table, compact them,
+ * give each new word a count of 1 and merge both parts in place with
+ * ecomerge(). table->inseqcount is incremented. */
+void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circular, uint32_t doublestrand,uint32_t exampleCount,uint32_t seqQuorum,ecoseq_t *seq,int32_t *neededWords,uint32_t neededWordCount)
+{
+    uint32_t buffersize;
+    pword_t newtable;
+    uint32_t newsize;
+    uint32_t i;
+
+    /* room for the existing words plus every word the new sequence can yield */
+    buffersize = table->size + ecoWordCount(wordsize,circular,seq);
+
+    table->words = ECOREALLOC(table->words,buffersize*sizeof(word_t),
+                              "\n\nCannot allocate memory to extend word table" );
+
+
+    /* the new words are written right after the existing ones */
+    newtable = table->words + table->size;
+
+// DEBUG_LOG("Words = %x (%u) new = %x", table->words,table->size,newtable);
+
+    (void)ecoHashSequence(newtable,wordsize,circular,doublestrand,seq,&newsize,neededWords,neededWordCount,seqQuorum);
+// DEBUG_LOG("new seq wordCount : %d",newsize);
+
+    newsize = ecoCompactHashSequence(newtable,newsize);
+
+// DEBUG_LOG("compacted wordCount : %d",newsize);
+    buffersize = table->size + newsize;
+
+    // resize the
count buffer
+
+    table->inseqcount++;
+
+    //fprintf (stderr, "\nOldAddress: %x", table->strictcount);
+    table->strictcount = ECOREALLOC(table->strictcount,(buffersize+5000)*sizeof(uint32_t),
+                                    "Cannot allocate memory to extend example word count table");
+    //fprintf (stderr, " NewAddress: %x\n", table->strictcount);
+
+    /* every word coming from the new sequence has been seen once so far */
+    for (i=table->size; i < buffersize; i++)
+        table->strictcount[i]=1;
+
+
+
+    // Now we have to merge in situ the two tables
+
+    ecomerge(table,table->size,newsize,exampleCount - table->inseqcount,seqQuorum);
+// DEBUG_LOG("Dictionnary size : %d",table->size);
+
+}
+
+/* Build the table of candidate strict primers.
+ *
+ * Every example sequence of database that is longer than the primer length
+ * is indexed: the first one initialises the word count table, each following
+ * one is added with addSeqToWordCountTable(). All other sequences only
+ * increment outseqcount. Progress is reported on stderr.
+ *
+ * When options->statistics is set, one line per indexed sequence is appended
+ * to "ecoprimer_<pid>.log"; if that file cannot be opened a warning is
+ * printed and logging is skipped.
+ *
+ * @param database     sequences to index
+ * @param seqdbsize    number of entries in database
+ * @param exampleCount number of example sequences
+ * @param options      run options (primer_length, circular, doublestrand,
+ *                     strict_quorum, filtering and statistics are read here)
+ * @return the word count table, its buffers trimmed to the final size
+ */
+pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
+                                 uint32_t exampleCount,poptions_t options)
+{
+    struct rusage  start;
+    struct rusage  usage;
+    double         seconde;
+    char           logfilename[64];   /* "ecoprimer_<pid>.log", built once */
+    int            logstats = 0;      /* nonzero while the statistics log is usable */
+    FILE          *logfile;
+    uint32_t       i;
+    bool_t         first=TRUE;
+    pwordcount_t   strictprimers=NULL;
+    uint64_t       totallength=0;
+    uint32_t       sequenceQuorum = (uint32_t)floor((float)exampleCount * options->strict_quorum);
+    int32_t       *neededWords;
+    uint32_t       neededWordCount;
+
+    fprintf(stderr,"Filtering... ");
+
+    if (options->filtering)
+        neededWords = filteringSeq(database,seqdbsize,exampleCount,options,&neededWordCount,(int32_t)sequenceQuorum);
+    else
+    {
+        neededWordCount=0;
+        neededWords=NULL;
+    }
+
+    if (options->statistics)
+    {
+        /* a fixed buffer avoids allocating (and leaking) a new name per sequence */
+        snprintf(logfilename,sizeof(logfilename),"ecoprimer_%d.log",getpid());
+        logfile = fopen(logfilename,"w");
+        if (logfile)
+        {
+            fprintf(logfile,"# seq\tlength\tsize\ttime\tspeed\n");
+            fclose(logfile);
+            logstats = 1;
+        }
+        else
+            fprintf(stderr,"Cannot open statistics file %s, statistics disabled\n",logfilename);
+    }
+
+
+    fprintf(stderr," Primers should be at least present in %d/%d example sequences\n",sequenceQuorum,exampleCount);
+
+    /* empty table, so that outseqcount can be used before the first example */
+    strictprimers = initCountTable(NULL,options->primer_length,
+                                   options->circular,
+                                   options->doublestrand,
+                                   0,
+                                   NULL,NULL,0);
+
+
+    getrusage(RUSAGE_SELF,&start);
+
+    for (i=0;i<seqdbsize;i++)
+    {
+        if (database[i]->isexample && database[i]->SQ_length > options->primer_length)
+        {
+
+            if (first)
+            {
+                strictprimers = initCountTable(strictprimers,options->primer_length,
+                                               options->circular,
+                                               options->doublestrand,
+                                               sequenceQuorum,
+                                               database[i],neededWords,neededWordCount);
+                first=FALSE;
+            }
+            else
+            {
+                addSeqToWordCountTable(strictprimers,options->primer_length,
+                                       options->circular,
+                                       options->doublestrand,
+                                       exampleCount,
+                                       sequenceQuorum,
+                                       database[i],neededWords,neededWordCount);
+            };
+            totallength+=database[i]->SQ_length;
+            getrusage(RUSAGE_SELF,&usage);
+            if (logstats)
+            {
+                logfile = fopen(logfilename,"a");
+                if (logfile)
+                {
+                    /* user + system CPU time spent since indexing started */
+                    seconde = timeval_subtract(&(usage.ru_utime),&(start.ru_utime)) +
+                              timeval_subtract(&(usage.ru_stime),&(start.ru_stime));
+                    fprintf(logfile,"%d\t%llu\t%lu\t%8.3f\t%8.3e\n",i,
+                            (long long unsigned)totallength,
+                            (unsigned long)(strictprimers->size*(sizeof(int64_t)+sizeof(int32_t))),
+                            seconde,seconde/(double)totallength);
+                    fclose(logfile);
+                }
+            }
+        }
+        else
+            strictprimers->outseqcount++;
+
+        fprintf(stderr," Indexed sequences %5d/%5d : considered words %-10llu \r",
+                (int32_t)i+1,(int32_t)seqdbsize,
+                (long long unsigned)strictprimers->size);
+
+// DEBUG_LOG("First word : %s ==> %d",ecoUnhashWord(strictprimers->words[0],18),strictprimers->incount[0])
+// DEBUG_LOG("Second word : %s ==> %d",ecoUnhashWord(strictprimers->words[1],18),strictprimers->incount[1])
+    }
+
+    /* give back the slack kept while the table was growing */
+    /* NOTE(review): size is 0 when no example sequence was indexed - confirm
+       that ECOREALLOC accepts a zero-byte request. */
+    strictprimers->strictcount = ECOREALLOC(strictprimers->strictcount,
+                                            sizeof(uint32_t)*strictprimers->size,
+                                            "Cannot reallocate strict primer count table");
+    strictprimers->words = ECOREALLOC(strictprimers->words,
+                                      sizeof(word_t)*strictprimers->size,
+                                      "Cannot reallocate strict primer table");
+
+    if (neededWords)
+        ECOFREE(neededWords,"Clean needed word table");
+
+    return strictprimers;
+}
+
+/* Remove every word flagged by ISMULTIWORD() from the table, keeping the
+ * remaining words and their counts in their original order, then shrink
+ * both buffers. Returns the number of words kept. */
+uint32_t filterMultiStrictPrimer(pwordcount_t strictprimers)
+{
+    uint32_t i;   /* read position  */
+    uint32_t w;   /* write position */
+
+    for (i=0,w=0;i < strictprimers->size;i++)
+    {
+        if (w < i)
+        {
+            strictprimers->words[w]=strictprimers->words[i];
+            strictprimers->strictcount[w]=strictprimers->strictcount[i];
+        }
+        /* a multi word is not kept: its slot is overwritten by the next word */
+        if (! ISMULTIWORD(strictprimers->words[w]))
+            w++;
+    }
+
+    strictprimers->size=w;
+    strictprimers->strictcount = ECOREALLOC(strictprimers->strictcount,
+                                            sizeof(uint32_t)*strictprimers->size,
+                                            "Cannot reallocate strict primer count table");
+    strictprimers->words = ECOREALLOC(strictprimers->words,
+                                      sizeof(word_t)*strictprimers->size,
+                                      "Cannot reallocate strict primer table");
+
+    return w;
+}
diff --git a/src/libecoprimer/taxstats.c b/src/libecoprimer/taxstats.c
new file mode 100644
index 0000000..0898303
--- /dev/null
+++ b/src/libecoprimer/taxstats.c
@@ -0,0 +1,378 @@
+/*
+ * taxstats.c
+ *
+ * Created on: 12 mars 2009
+ * Author: coissac
+ */
+
+#include 
+//void tdestroy (void *root, void (*free_node)(void *nodep));
+
+#include "ecoprimer.h"
+
+static int cmptaxon(const void *t1, const void* t2);
+
+void **tree_root = NULL;
+int delete_passes = 0;
+
+void delete_twalkaction (const void *node, VISIT order, int level)
+{
+    switch (order)
+    {
+        case preorder:
+            delete_passes++;
+            break;
+        case postorder:
+            delete_passes++;
+            break;
+        case endorder:
+            delete_passes++;
+            break;
+
case leaf:
+            /* NOTE(review): tdelete() expects a key as first argument but is
+               handed the node pointer here; the only visible caller of
+               free_tree_nodes() is commented out - verify before enabling. */
+            if (tree_root)
+                tdelete (node, tree_root,cmptaxon);
+            delete_passes++;
+            break;
+    }
+}
+
+/* Walk tree with delete_twalkaction() again and again until a walk makes at
+ * most one visit. */
+void free_tree_nodes (void *tree)
+{
+    while (1)
+    {
+        delete_passes = 0;
+        twalk (tree, delete_twalkaction);
+        if (delete_passes <= 1) break;
+    }
+}
+
+/* Ordering callback for the taxon trees: the keys are not dereferenced, the
+ * pointer values themselves are the taxon ids and are compared as size_t. */
+static int cmptaxon(const void *t1, const void* t2)
+{
+    const size_t taxid1=(size_t)t1;
+    const size_t taxid2=(size_t)t2;
+
+    // fprintf(stderr,"==> counted taxid1 : %d\n",taxid1);
+    // fprintf(stderr,"==> counted taxid2 : %d\n",taxid2);
+
+    if (taxid1 < taxid2)
+        return -1;
+    if (taxid1 > taxid2)
+        return +1;
+    return 0;
+}
+
+/* Count distinct taxon ids with a function-local (static) tree.
+ *
+ *  taxid == -1 : forget everything counted so far and return 0
+ *  taxid  >  0 : register taxid if it is new, return the number of distinct ids
+ *  otherwise   : change nothing, return the number of distinct ids
+ *
+ * Because -1 is the reset command, callers must never pass an "unknown"
+ * rank taxon id (-1) when they only mean to count.
+ */
+int32_t counttaxon(int32_t taxid)
+{
+    static void* taxontree=NULL;
+    static int32_t taxoncount=0;
+
+    // fprintf(stderr,"counted taxid : %d taxontree %p\n",taxid,taxontree);
+
+    if (taxid==-1)
+    {
+        if (taxontree)
+        {
+            tree_root = (void **)&taxontree;
+            //free_tree_nodes (taxontree);
+            /* NOTE(review): only the root pointer is handed to ECOFREE; the
+               other nodes created by tsearch() look leaked - confirm. */
+            ECOFREE(taxontree,"Free taxon tree");
+            tree_root = NULL;
+        }
+        taxontree=NULL;
+        taxoncount=0;
+        return 0;
+    }
+
+
+    if ((taxid > 0) && ((!taxontree) || (!tfind((void*)((size_t)taxid),&taxontree,cmptaxon))))
+    {
+        tsearch((void*)((size_t)taxid),&taxontree,cmptaxon);
+        taxoncount++;
+    }
+    return taxoncount;
+}
+
+/* Flag every sequence as example / counter-example, store the id of its
+ * taxon at the requested rank (-1 when there is none) and count the distinct
+ * rank taxa of both groups into options->intaxa and options->outtaxa.
+ * Returns the sum of both counts. */
+int32_t getrankdbstats(pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *taxonomy,
+                       poptions_t options)
+{
+
+    uint32_t i;
+    ecotx_t *taxon;
+    ecotx_t *tmptaxon;
+
+    counttaxon(-1);
+    options->intaxa = 0;
+
+    for (i=0;i<seqdbsize;i++)
+    {
+        taxon = &(taxonomy->taxons->taxon[seqdb[i]->taxid]);
+        seqdb[i]->isexample=isExampleTaxon(taxonomy,seqdb[i]->taxid,options);
+
+        tmptaxon = eco_findtaxonatrank(taxon,
+                                       options->taxonrankidx);
+
+        // fprintf(stderr,"Taxid : %d %p\n",taxon->taxid,tmptaxon);
+
+        if (tmptaxon)
+        {
+            // fprintf(stderr,"orig : %d trans : %d\n",taxon->taxid,
+            //         tmptaxon->taxid);
+
+            seqdb[i]->ranktaxonid=tmptaxon->taxid;
+            if (seqdb[i]->isexample)
+                options->intaxa = counttaxon(tmptaxon->taxid);
+        }
+        else
+            seqdb[i]->ranktaxonid=-1;
+    }
+
+    counttaxon(-1);
+    options->outtaxa = 0;
+
+    for (i=0;i<seqdbsize;i++)
+    {
+        if (seqdb[i]->ranktaxonid>=0 && !seqdb[i]->isexample)
+            options->outtaxa = counttaxon(seqdb[i]->ranktaxonid);
+    }
+
+    return options->outtaxa + options->intaxa;
+}
+
+
+/* Compute the taxonomic coverage of a primer pair.
+ *
+ * Counts the distinct rank taxa among the amplified example sequences
+ * (pair->intaxa) and among the amplified counter-example sequences
+ * (pair->outtaxa), marks the amplified example sequences in
+ * pair->coveredSeqs and stores intaxa / options->intaxa in pair->bc.
+ *
+ * NOTE(review): options->intaxa is used as divisor without a zero check.
+ *
+ * @return pair->bc
+ */
+float taxonomycoverage(ppair_t pair, poptions_t options, pecodnadb_t seqdb,uint32_t seqdbsize)
+{
+    int32_t seqcount;
+    int32_t i;
+    int32_t incount=0;
+    int32_t outcount=0;
+    uint32_t j;
+
+
+    memset (pair->coveredSeqs, 0, seqdbsize*sizeof (int));
+    seqcount=pair->pcr.ampcount;
+
+    counttaxon(-1);
+    for (i=0; i < seqcount; i++)
+        if (pair->pcr.amplifias[i].sequence->isexample
+            && pair->pcr.amplifias[i].sequence->ranktaxonid > 0 )
+        {
+            incount = counttaxon(pair->pcr.amplifias[i].sequence->ranktaxonid);
+
+            /* locate the sequence in the database to flag it as covered */
+            for (j=0; j<seqdbsize; j++)
+                if (pair->pcr.amplifias[i].sequence == seqdb[j])
+                    {pair->coveredSeqs[j] = 1; break;}
+        }
+
+    counttaxon(-1);
+    for (i=0; i < seqcount; i++)
+        /* "> 0" and not just "non zero": a rank taxon id of -1 (no taxon at
+           that rank) would be taken by counttaxon() as a reset request and
+           throw away the taxa counted so far */
+        if (!pair->pcr.amplifias[i].sequence->isexample
+            && pair->pcr.amplifias[i].sequence->ranktaxonid > 0)
+            outcount = counttaxon(pair->pcr.amplifias[i].sequence->ranktaxonid);
+
+
+    pair->intaxa=incount;
+    pair->outtaxa=outcount;
+    pair->bc=(float)incount/options->intaxa;
+    return pair->bc;
+}
+
+/*
+static int cmpamp(const void *ampf1, const void* ampf2)
+{
+    int i;
+    int j = 0;
+    int incr = 1;
+    char cd1;
+    char cd2;
+    int chd = 0;
+    int len = 0;
+
+    pamptotaxon_t pampf1 = (pamptotaxon_t) ampf1;
+    pamptotaxon_t pampf2 = (pamptotaxon_t) ampf2;
+
+
+    if (pampf1->strand != pampf2->strand)
+    {
+        incr = -1;
+        j = pampf1->length - 1;
+
+        if (pampf2->strand)
+        {
+            pampf1 = (pamptotaxon_t) ampf2;
+            pampf2 = (pamptotaxon_t) ampf1;
+            chd = 1;
+        }
+        //j = pampf2->length - 1; should have been here and pampf2 instead of pampf1?
+    }
+
+    len = (pampf1->length <= pampf2->length)? pampf1->length: pampf2->length;
+
+    for (i = 0; i < len; i++, j += incr)
+    {
+        cd1 = pampf1->amplifia[i];
+        if (incr == -1)
+            cd2 = ecoComplementChar(pampf2->amplifia[j]);
+        else
+            cd2 = pampf2->amplifia[j];
+
+        if (cd1 < cd2) return chd ? 1: -1;
+        if (cd2 < cd1) return chd ?
-1: 1;
+    }
+
+    if (pampf1->length > pampf2->length) return chd ? -1: 1;
+    if (pampf2->length > pampf1->length) return chd ? 1: -1;
+
+    return 0;
+}*/
+
+
+/* Ordering callback for the amplifia tree.
+ *
+ * Both amplifias are compared character by character over the length of the
+ * shorter one; on a tie the shorter one sorts first. An amplifia whose
+ * strand flag is zero is read backwards from its amplifia pointer and each
+ * character is complemented with ecoComplementChar() before comparison.
+ */
+static int cmpamp(const void *ampf1, const void* ampf2)
+{
+    int i;
+    char cd1;
+    char cd2;
+    int len = 0;
+    char *ch1;
+    char *ch2;
+    int incr1;
+    int incr2;
+
+    pamptotaxon_t pampf1 = (pamptotaxon_t) ampf1;
+    pamptotaxon_t pampf2 = (pamptotaxon_t) ampf2;
+
+    ch1 = pampf1->amplifia;
+    ch2 = pampf2->amplifia;
+
+    incr1 = 1;
+    incr2 = 1;
+
+    if (!pampf1->strand)
+        incr1 = -1;
+    if (!pampf2->strand)
+        incr2 = -1;
+
+    len = (pampf1->length <= pampf2->length)? pampf1->length: pampf2->length;
+    for (i = 0; i < len; i++)
+    {
+        cd1 = *ch1;
+        if (incr1 == -1)
+            cd1 = ecoComplementChar(*ch1);
+
+        cd2 = *ch2;
+        if (incr2 == -1)
+            cd2 = ecoComplementChar(*ch2);
+
+        if (cd1 < cd2) return -1;
+        if (cd2 < cd1) return 1;
+
+        ch1 += incr1;
+        ch2 += incr2;
+    }
+
+    if (pampf1->length > pampf2->length) return 1;
+    if (pampf2->length > pampf1->length) return -1;
+
+    return 0;
+}
+
+/* twalk() callback: feed the taxon id stored in the visited node to counttaxon(). */
+void twalkaction (const void *node, VISIT order, int level)
+{
+    /* the node starts with a pointer to its key, and the key *is* the taxon
+       id (see cmptaxon); reading the key as a pointer first keeps this
+       independent of the byte order of the host */
+    int32_t taxid = (int32_t)(size_t)(*(void * const *)node);
+    //printf ("\t%d:%p, ", taxid, node);
+    counttaxon(taxid);
+}
+
+/* taxon id seen by the last twalkaction2() call */
+int32_t gtxid;
+
+/* twalk() callback: remember the taxon id stored in the visited node in gtxid. */
+void twalkaction2 (const void *node, VISIT order, int level)
+{
+    gtxid = (int32_t)(size_t)(*(void * const *)node);
+}
+
+/* Compute the taxonomic specificity of a primer pair.
+ *
+ * The amplifias obtained on example sequences are grouped by identical
+ * sequence (cmpamp) and the distinct rank taxon ids are collected for each
+ * group. An amplifia attached to exactly one taxon is "well identified":
+ * the example sequences of that taxon matched by both primers are flagged in
+ * pair->wellIdentifiedSeqs. The number of distinct well identified taxa then
+ * gives pair->notwellidentifiedtaxa (pair->intaxa minus that number) and
+ * pair->bs (that number divided by pair->intaxa).
+ *
+ * NOTE(review): pair->intaxa is used as divisor without a zero check.
+ */
+void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize)
+{
+    uint32_t i, j;
+    uint32_t ampfindex = 0;
+    int32_t taxid;
+    uint32_t wellidentifiedcount;
+
+    void *ampftree = NULL;
+    pamptotaxon_t pcurrentampf;
+    pamptotaxon_t *ptmp;
+
+    pamptotaxon_t ampfwithtaxtree = ECOMALLOC(sizeof(amptotaxon_t) * pair->pcr.ampcount,"Cannot allocate amplifia tree");
+
+    for (i = 0; i < pair->pcr.ampcount; i++)
+    {
+        /*populate taxon ids tree against each unique amplifia
+        i.e set of taxon ids for each amplifia*/
+        if (pair->pcr.amplifias[i].sequence->isexample)
+        {
+            /* candidate slot: it is only kept (ampfindex++) when the amplifia is new */
+            ampfwithtaxtree[ampfindex].amplifia = pair->pcr.amplifias[i].amplifia;
+            ampfwithtaxtree[ampfindex].strand = pair->pcr.amplifias[i].strand;
+            ampfwithtaxtree[ampfindex].length = pair->pcr.amplifias[i].length;
+            /* a candidate slot never owns taxa yet; do not rely on the
+               allocator having cleared the memory */
+            ampfwithtaxtree[ampfindex].taxoncount = 0;
+            ampfwithtaxtree[ampfindex].taxontree = NULL;
+            pcurrentampf = &ampfwithtaxtree[ampfindex];
+            taxid = pair->pcr.amplifias[i].sequence->ranktaxonid;
+            ptmp = tfind((const void*)pcurrentampf, &ampftree, cmpamp);
+            if (ptmp == NULL)
+            {
+                pcurrentampf = &ampfwithtaxtree[ampfindex];
+                tsearch((void*)pcurrentampf,&ampftree,cmpamp);
+                ampfindex++;
+            }
+            else
+                pcurrentampf = *ptmp;
+
+            if (tfind((void*)((size_t)taxid), &(pcurrentampf->taxontree), cmptaxon) == NULL)
+            {
+                pcurrentampf->taxoncount++;
+                tsearch((void*)((size_t)taxid),&(pcurrentampf->taxontree),cmptaxon);
+            }
+        }
+    }
+
+    memset (pair->wellIdentifiedSeqs, 0, seqdbsize*sizeof (int));
+    //counttaxon(-1);
+    for (i = 0; i < ampfindex; i++)
+    {
+        if (ampfwithtaxtree[i].taxoncount > 1)
+        {
+            //printf ("\nampfwithtaxtree[i].taxoncount: %d\n", ampfwithtaxtree[i].taxoncount);
+            //twalk(ampfwithtaxtree[i].taxontree, twalkaction);
+        }
+        //TR 5/9/10 - added code for well identified seqs
+        else if(ampfwithtaxtree[i].taxoncount == 1) /*well identified*/
+        {
+            /* fetch the single taxon id of this amplifia; -1 means "no rank taxon" */
+            gtxid = -1;
+            twalk(ampfwithtaxtree[i].taxontree, twalkaction2);
+
+            if (gtxid != -1)
+            {
+                for (j = 0; j < seqdbsize; j++)
+                    if (seqdb[j]->ranktaxonid == gtxid
+                        && seqdb[j]->isexample
+                        &&(pair->p1->directCount[j] > 0
+                           || pair->p1->reverseCount[j] > 0)
+                        && (pair->p2->directCount[j] > 0
+                            || pair->p2->reverseCount[j] > 0))
+                    {
+                        pair->wellIdentifiedSeqs[j] = 1;
+                    }
+            }
+        }
+    }
+    //printf ("\n");
+    counttaxon(-1);
+    wellidentifiedcount = 0;
+    for (j = 0; j < seqdbsize; j++)
+        if (pair->wellIdentifiedSeqs[j] == 1)
+            counttaxon(seqdb[j]->ranktaxonid);
+    /* any id that is neither -1 nor positive only reads the current count */
+    wellidentifiedcount = counttaxon(-2);
+    //pair->notwellidentifiedtaxa = counttaxon(-2);
+    pair->notwellidentifiedtaxa = (pair->intaxa-wellidentifiedcount); //counttaxon(-2);
+    //pair->bs = ((float)pair->intaxa - (float)pair->notwellidentifiedtaxa) / pair->intaxa;
+    pair->bs = ((float)wellidentifiedcount) / (float)pair->intaxa;
+
+    /* Release the nodes created by tsearch(): delete the key held by the
+       root until each tree is empty. The amplifia tree goes first because
+       its keys point into ampfwithtaxtree. */
+    while (ampftree)
+        tdelete(*(void **)ampftree, &ampftree, cmpamp);
+    for (i = 0; i < ampfindex; i++)
+        while (ampfwithtaxtree[i].taxontree)
+            tdelete(*(void **)ampfwithtaxtree[i].taxontree,
+                    &(ampfwithtaxtree[i].taxontree), cmptaxon);
+
+    ECOFREE (ampfwithtaxtree, "Free amplifia table");
+
+}
diff --git a/src/libthermo/Makefile b/src/libthermo/Makefile
new file mode 100644
index 0000000..1685cba
--- /dev/null
+++ b/src/libthermo/Makefile
@@ -0,0 +1,23 @@
+
+SOURCES = nnparams.c \
+          thermostats.c
+
+SRCS=$(SOURCES)
+
+OBJECTS= $(patsubst %.c,%.o,$(SOURCES))
+
+LIBFILE= libthermo.a
+RANLIB= ranlib
+
+
+include ../global.mk
+
+
+all: $(LIBFILE)
+
+clean:
+	rm -rf $(OBJECTS) $(LIBFILE)
+
+$(LIBFILE): $(OBJECTS)
+	ar -cr $@ $?
+	$(RANLIB) $@
diff --git a/src/libthermo/nnparams.c b/src/libthermo/nnparams.c
new file mode 100644
index 0000000..f4b4937
--- /dev/null
+++ b/src/libthermo/nnparams.c
@@ -0,0 +1,600 @@
+/*
+ * nnparams.cpp
+ * PHunterLib
+ *
+ * Nearest Neighbor Model / Parameters
+ *
+ * Created by Tiayyba Riaz on 7/2/09.
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include"nnparams.h"
+
+
+double forbidden_entropy;
+
+
+/* Entropy every duplex starts with: a constant -5.9 plus the concentration
+   term nparm->rlogc computed by nparam_InitParams(). */
+double nparam_GetInitialEntropy(PNNParams nparm)
+{
+    return -5.9f+nparm->rlogc;
+}
+
+
+//Retrieve Enthalpy for given NN-Pair from parameter table
+double nparam_GetEnthalpy(PNNParams nparm, char x0, char x1, char y0, char y1)
+{
+    return ndH(x0,x1,y0,y1); //xx, yx are already numbers
+}
+
+
+//Retrieve Entropy for given NN-Pair from parameter table
+//and apply the salt correction selected by nparm->saltMethod
+double nparam_GetEntropy(PNNParams nparm, char x0, char x1, char y0, char y1)
+{
+    //xx and yx are already numbers
+    char nx0=x0;//nparam_convertNum(x0);
+    char nx1=x1;//nparam_convertNum(x1);
+    char ny0=y0;//nparam_convertNum(y0);
+    char ny1=y1;//nparam_convertNum(y1);
+    double answer = ndS(nx0,nx1,ny0,ny1);
+    /*Salt correction Santalucia*/
+    if (nparm->saltMethod == SALT_METHOD_SANTALUCIA) {
+        if(nx0!=5 && 1<= nx1 && nx1<=4) {
+            answer += 0.5*nparm->kfac;
+        }
+        if(ny1!=5 && 1<= ny0 && ny0<=4) {
+            answer += 0.5*nparm->kfac;
+        }
+    }
+    /*Salt correction Owczarzy*/
+    if (nparm->saltMethod == SALT_METHOD_OWCZARZY) {
+        double logk = log(nparm->kplus);
+        answer += ndH(nx0,nx1,ny0,ny1)*((4.29 *
nparm->gcContent-3.95)*0.00001*logk+ 0.0000094*logk*logk);
+    }
+    return answer;
+}
+
+/* PURPOSE: Return melting temperature TM for given entropy and enthalpy
+*           Assuming a one-state transition and using the formula
+*           TM = dH / (dS + R ln(Ct/4))
+*           entropy  = dS + R ln Ct/4 (must already be included!)
+*           enthalpy = dH
+*           where
+*           dH = enthalpy
+*           dS = entropy
+*           R  = Boltzmann factor
+*           Ct = Strand Concentration
+*
+* PARAMETERS:
+*           entropy and enthalpy
+*
+* RETURN VALUE:
+*           temperature; 0 when the enthalpy reaches forbidden_enthalpy,
+*           when the entropy is not negative, or when the quotient is negative
+*/
+
+double nparam_CalcTM(double entropy,double enthalpy)
+{
+    double tm = 0; // absolute zero - return if model fails!
+    if (enthalpy>=forbidden_enthalpy) //||(entropy==-cfact))
+        return 0;
+    if (entropy<0) // avoid division by zero and model errors!
+    {
+        tm = enthalpy/entropy;// - kfac; //LKFEB
+        if (tm<0)
+            return 0;
+    }
+    return tm;
+}
+
+
+/* Initialise a nearest-neighbour parameter set.
+ *
+ * Stores the two strand concentrations, the salt concentration and the salt
+ * correction method in nparm, derives rlogc and kfac from them, then fills
+ * the dH / dS tables: first everything is cleared, then the forbidden
+ * configurations, the loop and bulge values and finally the literal
+ * parameters are written.
+ *
+ * Side effect: the global forbidden_entropy is set to nparm->rlogc.
+ *
+ * @param nparm parameter set to fill
+ * @param c1    concentration of the first strand  (stored in Ct1)
+ * @param c2    concentration of the second strand (stored in Ct2)
+ * @param kp    salt concentration (stored in kplus)
+ * @param sm    salt correction method (stored in saltMethod)
+ */
+void nparam_InitParams(PNNParams nparm, double c1, double c2, double kp, int sm)
+{
+    nparm->Ct1 = c1;
+    nparm->Ct2 = c2;
+    nparm->kplus = kp;
+    int maxCT = 1;
+    if(nparm->Ct2 > nparm->Ct1)
+    {
+        maxCT = 2;
+    }
+    /* equal concentrations: Ct/2; otherwise the larger one minus half of the smaller one */
+    double ctFactor;
+    if(nparm->Ct1 == nparm->Ct2)
+    {
+        ctFactor = nparm->Ct1/2;
+    }
+    else if (maxCT == 1)
+    {
+        ctFactor = nparm->Ct1-nparm->Ct2/2;
+    }
+    else
+    {
+        ctFactor = nparm->Ct2-nparm->Ct1/2;
+    }
+    nparm->rlogc = R * log(ctFactor);
+    forbidden_entropy = nparm->rlogc;
+    nparm->kfac = 0.368 * log (nparm->kplus);
+    nparm->saltMethod = sm;
+    int x,y,a,b; // variables used as counters...
+
+    // Set all parameters to zero!
+    memset(nparm->dH,0,sizeof(nparm->dH));
+    memset(nparm->dS,0,sizeof(nparm->dS));
+
+    // Set all X-/Y-, -X/Y- and X-/-Y so, that TM will be VERY small!
+    for (x=1;x<=4;x++)
+    {
+        for (y=1;y<=4;y++)
+        {
+            ndH(0,x,y,0)=forbidden_enthalpy;
+            ndS(0,x,y,0)=forbidden_entropy;
+            ndH(x,0,0,y)=forbidden_enthalpy;
+            ndS(x,0,0,y)=forbidden_entropy;
+            ndH(x,0,y,0)=forbidden_enthalpy;
+            ndS(x,0,y,0)=forbidden_entropy;
+            // forbid X-/Y$ and X$/Y- etc., i.e. terminal must not be paired with gap!
+            ndH(x,5,y,0)=forbidden_enthalpy;
+            ndS(x,5,y,0)=forbidden_entropy;
+            ndH(x,0,y,5)=forbidden_enthalpy;
+            ndS(x,0,y,5)=forbidden_entropy;
+            ndH(5,x,0,y)=forbidden_enthalpy;
+            ndS(5,x,0,y)=forbidden_entropy;
+            ndH(0,x,5,y)=forbidden_enthalpy;
+            ndS(0,x,5,y)=forbidden_entropy;
+            // forbid X$/-Y etc.
+            ndH(x,5,0,y)=forbidden_enthalpy;
+            ndS(x,5,0,y)=forbidden_entropy;
+            ndH(x,0,5,y)=forbidden_enthalpy;
+            ndS(x,0,5,y)=forbidden_entropy;
+            ndH(5,x,y,0)=forbidden_enthalpy;
+            ndS(5,x,y,0)=forbidden_entropy;
+            ndH(0,x,y,5)=forbidden_enthalpy;
+            ndS(0,x,y,5)=forbidden_entropy;
+
+        }
+        // also, forbid x-/-- and --/x-, i.e. no two inner gaps paired
+        ndH(x,0,0,0)=forbidden_enthalpy;
+        ndS(x,0,0,0)=forbidden_entropy;
+        ndH(0,0,x,0)=forbidden_enthalpy;
+        ndS(0,0,x,0)=forbidden_entropy;
+        // x-/-$
+        ndH(x,0,0,5)=forbidden_enthalpy;
+        ndS(x,0,0,5)=forbidden_entropy;
+        ndH(5,0,0,x)=forbidden_enthalpy;
+        ndS(5,0,0,x)=forbidden_entropy;
+        // the entropy entry must use the same indices as the enthalpy entry
+        ndH(0,5,x,0)=forbidden_enthalpy;
+        ndS(0,5,x,0)=forbidden_entropy;
+        ndH(0,x,5,0)=forbidden_enthalpy;
+        ndS(0,x,5,0)=forbidden_entropy;
+    }
+    // forbid --/--
+    ndH(0,0,0,0)=forbidden_enthalpy;
+    ndS(0,0,0,0)=forbidden_entropy;
+
+    ndH(5,0,0,0)=forbidden_enthalpy;
+    ndS(5,0,0,0)=forbidden_entropy;
+    ndH(0,0,5,0)=forbidden_enthalpy;
+    ndS(0,0,5,0)=forbidden_entropy;
+    ndH(0,5,5,0)=forbidden_enthalpy;
+    ndS(0,5,5,0)=forbidden_entropy;
+
+    // Interior loops (double Mismatches)
+    #define iloop_entropy -0.97f
+    #define iloop_enthalpy 0.0f
+    for (x=1; x<=4; x++)
+        for (y=1; y<=4; y++)
+            for (a=1; a<=4; a++)
+                for (b=1; b<=4; b++)
+                    // AT and CG pair, and as A=1, C=2, G=3, T=4 this means
+                    // we have Watson-Crick pairs if (x+a==5) and (y+b)==5.
+                    if (!((x+a==5)||(y+b==5)))
+                    {
+                        // No watson-crick-pair, i.e. double mismatch!
+                        // set enthalpy/entropy to loop expansion!
+ ndH(x,y,a,b) = iloop_enthalpy; + ndS(x,y,a,b) = iloop_entropy; + } + + // xy/-- and --/xy (Bulge Loops of size > 1) + #define bloop_entropy -1.3f + #define bloop_enthalpy 0.0f + for (x=1; x<=4; x++) + for (y=1; y<=4; y++) + { + ndH(x,y,0,0) = bloop_enthalpy; + ndS(x,y,0,0) = bloop_entropy; + ndH(0,0,x,y) = bloop_enthalpy; + ndS(0,0,x,y) = bloop_entropy; + } + + // x-/ya abd xa/y- as well as -x/ay and ax/-y + // bulge opening and closing parameters with + // adjacent matches / mismatches + // obulge_mism and cbulge_mism chosen so high to avoid + // AAAAAAAAA + // T--G----T + // being better than + // AAAAAAAAA + // TG------T + #define obulge_match_H (-2.66f * 1000) + #define obulge_match_S -14.22f + #define cbulge_match_H (-2.66f * 1000) + #define cbulge_match_S -14.22f + #define obulge_mism_H (0.0f * 1000) + #define obulge_mism_S -6.45f + #define cbulge_mism_H 0.0f + #define cbulge_mism_S -6.45f + for (x=1; x<=4; x++) + for (y=1; y<=4; y++) + for (a=1; a<=4; a++) + { + if (x+y==5) // other base pair matches! + { + ndH(x,0,y,a)=obulge_match_H; // bulge opening + ndS(x,0,y,a)=obulge_match_S; + ndH(x,a,y,0)=obulge_match_H; + ndS(x,a,y,0)=obulge_match_S; + ndH(0,x,a,y)=cbulge_match_H; // bulge closing + ndS(0,x,a,y)=cbulge_match_S; + ndH(a,x,0,y)=cbulge_match_H; + ndS(a,x,0,y)=cbulge_match_S; + } + else + { // mismatch in other base pair! + ndH(x,0,y,a)=obulge_mism_H; // bulge opening + ndS(x,0,y,a)=obulge_mism_S; + ndH(x,a,y,0)=obulge_mism_H; + ndS(x,a,y,0)=obulge_mism_S; + ndH(0,x,a,y)=cbulge_mism_H; // bulge closing + ndS(0,x,a,y)=cbulge_mism_S; + ndH(a,x,0,y)=cbulge_mism_H; + ndS(a,x,0,y)=cbulge_mism_S; + } + } + + // Watson-Crick pairs (note that only ten are unique, as obviously + // 5'-AG-3'/3'-TC-5' = 5'-CT-3'/3'-GA-5' etc. 
+ ndH(1,1,4,4)=-7.6f*1000; ndS(1,1,4,4)=-21.3f; // AA/TT 04 + ndH(1,2,4,3)=-8.4f*1000; ndS(1,2,4,3)=-22.4f; // AC/TG adapted GT/CA + ndH(1,3,4,2)=-7.8f*1000; ndS(1,3,4,2)=-21.0f; // AG/TC adapted CT/GA + ndH(1,4,4,1)=-7.2f*1000; ndS(1,4,4,1)=-20.4f; // AT/TA 04 + ndH(2,1,3,4)=-8.5f*1000; ndS(2,1,3,4)=-22.7f; // CA/GT 04 + ndH(2,2,3,3)=-8.0f*1000; ndS(2,2,3,3)=-19.9f; // CC/GG adapted GG/CC + ndH(2,3,3,2)=-10.6f*1000; ndS(2,3,3,2)=-27.2f; // CG/GC 04 + ndH(2,4,3,1)=-7.8f*1000; ndS(2,4,3,1)=-21.0f; // CT/GA 04 + ndH(3,1,2,4)=-8.2f*1000; ndS(3,1,2,4)=-22.2f; // GA/CT 04 + ndH(3,2,2,3)=-9.8f*1000; ndS(3,2,2,3)=-24.4f; // GC/CG 04 + ndH(3,3,2,2)=-8.0f*1000; ndS(3,3,2,2)=-19.9f; // GG/CC 04 + ndH(3,4,2,1)=-8.4f*1000; ndS(3,4,2,1)=-22.4f; // GT/CA 04 + ndH(4,1,1,4)=-7.2f*1000; ndS(4,1,1,4)=-21.3f; // TA/AT 04 + ndH(4,2,1,3)=-8.2f*1000; ndS(4,2,1,3)=-22.2f; // TC/AG adapted GA/CT + ndH(4,3,1,2)=-8.5f*1000; ndS(4,3,1,2)=-22.7f; // TG/AC adapted CA/GT + ndH(4,4,1,1)=-7.6f*1000; ndS(4,4,1,1)=-21.3f; // TT/AA adapted AA/TT + + // A-C Mismatches (Values for pH 7.0) + ndH(1,1,2,4)=7.6f*1000; ndS(1,1,2,4)=20.2f; // AA/CT + ndH(1,1,4,2)=2.3f*1000; ndS(1,1,4,2)=4.6f; // AA/TC + ndH(1,2,2,3)=-0.7f*1000; ndS(1,2,2,3)=-3.8f; // AC/CG + ndH(1,2,4,1)=5.3f*1000; ndS(1,2,4,1)=14.6f; // AC/TA + ndH(1,3,2,2)=0.6f*1000; ndS(1,3,2,2)=-0.6f; // AG/CC + ndH(1,4,2,1)=5.3f*1000; ndS(1,4,2,1)=14.6f; // AT/CA + ndH(2,1,1,4)=3.4f*1000; ndS(2,1,1,4)=8.0f; // CA/AT + ndH(2,1,3,2)=1.9f*1000; ndS(2,1,3,2)=3.7f; // CA/GC + ndH(2,2,1,3)=5.2f*1000; ndS(2,2,1,3)=14.2f; // CC/AG + ndH(2,2,3,1)=0.6f*1000; ndS(2,2,3,1)=-0.6f; // CC/GA + ndH(2,3,1,2)=1.9f*1000; ndS(2,3,1,2)=3.7f; // CG/AC + ndH(2,4,1,1)=2.3f*1000; ndS(2,4,1,1)=4.6f; // CT/AA + ndH(3,1,2,2)=5.2f*1000; ndS(3,1,2,2)=14.2f; // GA/CC + ndH(3,2,2,1)=-0.7f*1000; ndS(3,2,2,1)=-3.8f; // GC/CA + ndH(4,1,1,2)=3.4f*1000; ndS(4,1,1,2)=8.0f; // TA/AC + ndH(4,2,1,1)=7.6f*1000; ndS(4,2,1,1)=20.2f; // TC/AA + + // C-T Mismatches + ndH(1,2,4,4)=0.7f*1000; 
ndS(1,2,4,4)=0.2f; // AC/TT + ndH(1,4,4,2)=-1.2f*1000; ndS(1,4,4,2)=-6.2f; // AT/TC + ndH(2,1,4,4)=1.0f*1000; ndS(2,1,4,4)=0.7f; // CA/TT + ndH(2,2,3,4)=-0.8f*1000; ndS(2,2,3,4)=-4.5f; // CC/GT + ndH(2,2,4,3)=5.2f*1000; ndS(2,2,4,3)=13.5f; // CC/TG + ndH(2,3,4,2)=-1.5f*1000; ndS(2,3,4,2)=-6.1f; // CG/TC + ndH(2,4,3,2)=-1.5f*1000; ndS(2,4,3,2)=-6.1f; // CT/GC + ndH(2,4,4,1)=-1.2f*1000; ndS(2,4,4,1)=-6.2f; // CT/TA + ndH(3,2,2,4)=2.3f*1000; ndS(3,2,2,4)=5.4f; // GC/CT + ndH(3,4,2,2)=5.2f*1000; ndS(3,4,2,2)=13.5f; // GT/CC + ndH(4,1,2,4)=1.2f*1000; ndS(4,1,2,4)=0.7f; // TA/CT + ndH(4,2,2,3)=2.3f*1000; ndS(4,2,2,3)=5.4f; // TC/CG + ndH(4,2,1,4)=1.2f*1000; ndS(4,2,1,4)=0.7f; // TC/AT + ndH(4,3,2,2)=-0.8f*1000; ndS(4,3,2,2)=-4.5f; // TG/CC + ndH(4,4,2,1)=0.7f*1000; ndS(4,4,2,1)=0.2f; // TT/CA + ndH(4,4,1,2)=1.0f*1000; ndS(4,4,1,2)=0.7f; // TT/AC + + // G-A Mismatches + ndH(1,1,3,4)=3.0f*1000; ndS(1,1,3,4)=7.4f; // AA/GT + ndH(1,1,4,3)=-0.6f*1000; ndS(1,1,4,3)=-2.3f; // AA/TG + ndH(1,2,3,3)=0.5f*1000; ndS(1,2,3,3)=3.2f; // AC/GG + ndH(1,3,3,2)=-4.0f*1000; ndS(1,3,3,2)=-13.2f; // AG/GC + ndH(1,3,4,1)=-0.7f*1000; ndS(1,3,4,1)=-2.3f; // AG/TA + ndH(1,4,3,1)=-0.7f*1000; ndS(1,4,3,1)=-2.3f; // AT/GA + ndH(2,1,3,3)=-0.7f*1000; ndS(2,1,3,3)=-2.3f; // CA/GG + ndH(2,3,3,1)=-4.0f*1000; ndS(2,3,3,1)=-13.2f; // CG/GA + ndH(3,1,1,4)=0.7f*1000; ndS(3,1,1,4)=0.7f; // GA/AT + ndH(3,1,2,3)=-0.6f*1000; ndS(3,1,2,3)=-1.0f; // GA/CG + ndH(3,2,1,3)=-0.6f*1000; ndS(3,2,1,3)=-1.0f; // GC/AG + ndH(3,3,1,2)=-0.7f*1000; ndS(3,3,1,2)=-2.3f; // GG/AC + ndH(3,3,2,1)=0.5f*1000; ndS(3,3,2,1)=3.2f; // GG/CA + ndH(3,4,1,1)=-0.6f*1000; ndS(3,4,1,1)=-2.3f; // GT/AA + ndH(4,1,1,3)=0.7f*1000; ndS(4,1,1,3)=0.7f; // TA/AG + ndH(4,3,1,1)=3.0f*1000; ndS(4,3,1,1)=7.4f; // TG/AA + + // G-T Mismatches + ndH(1,3,4,4)=1.0f*1000; ndS(1,3,4,4)=0.9f; // AG/TT + ndH(1,4,4,3)=-2.5f*1000; ndS(1,4,4,3)=-8.3f; // AT/TG + ndH(2,3,3,4)=-4.1f*1000; ndS(2,3,3,4)=-11.7f; // CG/GT + ndH(2,4,3,3)=-2.8f*1000; ndS(2,4,3,3)=-8.0f; // 
CT/GG + ndH(3,1,4,4)=-1.3f*1000; ndS(3,1,4,4)=-5.3f; // GA/TT + ndH(3,2,4,3)=-4.4f*1000; ndS(3,2,4,3)=-12.3f; // GC/TG + ndH(3,3,2,4)=3.3f*1000; ndS(3,3,2,4)=10.4f; // GG/CT + ndH(3,3,4,2)=-2.8f*1000; ndS(3,3,4,2)=-8.0f; // GG/TC +// ndH(3,3,4,4)=5.8f*1000; ndS(3,3,4,4)=16.3f; // GG/TT + ndH(3,4,2,3)=-4.4f*1000; ndS(3,4,2,3)=-12.3f; // GT/CG + ndH(3,4,4,1)=-2.5f*1000; ndS(3,4,4,1)=-8.3f; // GT/TA +// ndH(3,4,4,3)=4.1f*1000; ndS(3,4,4,3)=9.5f; // GT/TG + ndH(4,1,3,4)=-0.1f*1000; ndS(4,1,3,4)=-1.7f; // TA/GT + ndH(4,2,3,3)=3.3f*1000; ndS(4,2,3,3)=10.4f; // TC/GG + ndH(4,3,1,4)=-0.1f*1000; ndS(4,3,1,4)=-1.7f; // TG/AT + ndH(4,3,3,2)=-4.1f*1000; ndS(4,3,3,2)=-11.7f; // TG/GC +// ndH(4,3,3,4)=-1.4f*1000; ndS(4,3,3,4)=-6.2f; // TG/GT + ndH(4,4,1,3)=-1.3f*1000; ndS(4,4,1,3)=-5.3f; // TT/AG + ndH(4,4,3,1)=1.0f*1000; ndS(4,4,3,1)=0.9f; // TT/GA +// ndH(4,4,3,3)=5.8f*1000; ndS(4,4,3,3)=16.3f; // TT/GG + + // A-A Mismatches + ndH(1,1,1,4)=4.7f*1000; ndS(1,1,1,4)=12.9f; // AA/AT + ndH(1,1,4,1)=1.2f*1000; ndS(1,1,4,1)=1.7f; // AA/TA + ndH(1,2,1,3)=-2.9f*1000; ndS(1,2,1,3)=-9.8f; // AC/AG + ndH(1,3,1,2)=-0.9f*1000; ndS(1,3,1,2)=-4.2f; // AG/AC + ndH(1,4,1,1)=1.2f*1000; ndS(1,4,1,1)=1.7f; // AT/AA + ndH(2,1,3,1)=-0.9f*1000; ndS(2,1,3,1)=-4.2f; // CA/GA + ndH(3,1,2,1)=-2.9f*1000; ndS(3,1,2,1)=-9.8f; // GA/CA + ndH(4,1,1,1)=4.7f*1000; ndS(4,1,1,1)=12.9f; // TA/AA + + // C-C Mismatches + ndH(1,2,4,2)=0.0f*1000; ndS(1,2,4,2)=-4.4f; // AC/TC + ndH(2,1,2,4)=6.1f*1000; ndS(2,1,2,4)=16.4f; // CA/CT + ndH(2,2,2,3)=3.6f*1000; ndS(2,2,2,3)=8.9f; // CC/CG + ndH(2,2,3,2)=-1.5f*1000; ndS(2,2,3,2)=-7.2f; // CC/GC + ndH(2,3,2,2)=-1.5f*1000; ndS(2,3,2,2)=-7.2f; // CG/CC + ndH(2,4,2,1)=0.0f*1000; ndS(2,4,2,1)=-4.4f; // CT/CA + ndH(3,2,2,2)=3.6f*1000; ndS(3,2,2,2)=8.9f; // GC/CC + ndH(4,2,1,2)=6.1f*1000; ndS(4,2,1,2)=16.4f; // TC/AC + + // G-G Mismatches + ndH(1,3,4,3)=-3.1f*1000; ndS(1,3,4,3)=-9.5f; // AG/TG + ndH(2,3,3,3)=-4.9f*1000; ndS(2,3,3,3)=-15.3f; // CG/GG + ndH(3,1,3,4)=1.6f*1000; 
ndS(3,1,3,4)=3.6f; // GA/GT + ndH(3,2,3,3)=-6.0f*1000; ndS(3,2,3,3)=-15.8f; // GC/GG + ndH(3,3,2,3)=-6.0f*1000; ndS(3,3,2,3)=-15.8f; // GG/CG + ndH(3,3,3,2)=-4.9f*1000; ndS(3,3,3,2)=-15.3f; // GG/GC + ndH(3,4,3,1)=-3.1f*1000; ndS(3,4,3,1)=-9.5f; // GT/GA + ndH(4,3,1,3)=1.6f*1000; ndS(4,3,1,3)=3.6f; // TG/AG + + // T-T Mismatches + ndH(1,4,4,4)=-2.7f*1000; ndS(1,4,4,4)=-10.8f; // AT/TT + ndH(2,4,3,4)=-5.0f*1000; ndS(2,4,3,4)=-15.8f; // CT/GT + ndH(3,4,2,4)=-2.2f*1000; ndS(3,4,2,4)=-8.4f; // GT/CT + ndH(4,1,4,4)=0.2f*1000; ndS(4,1,4,4)=-1.5f; // TA/TT + ndH(4,2,4,3)=-2.2f*1000; ndS(4,2,4,3)=-8.4f; // TC/TG + ndH(4,3,4,2)=-5.0f*1000; ndS(4,3,4,2)=-15.8f; // TG/TC + ndH(4,4,1,4)=0.2f*1000; ndS(4,4,1,4)=-1.5f; // TT/AT + ndH(4,4,4,1)=-2.7f*1000; ndS(4,4,4,1)=-10.8f; // TT/TA + + // Dangling Ends + ndH(5,1,1,4)=-0.7f*1000; ndS(5,1,1,4)=-0.8f; // $A/AT + ndH(5,1,2,4)=4.4f*1000; ndS(5,1,2,4)=14.9f; // $A/CT + ndH(5,1,3,4)=-1.6f*1000; ndS(5,1,3,4)=-3.6f; // $A/GT + ndH(5,1,4,4)=2.9f*1000; ndS(5,1,4,4)=10.4f; // $A/TT + ndH(5,2,1,3)=-2.1f*1000; ndS(5,2,1,3)=-3.9f; // $C/AG + ndH(5,2,2,3)=-0.2f*1000; ndS(5,2,2,3)=-0.1f; // $C/CG + ndH(5,2,3,3)=-3.9f*1000; ndS(5,2,3,3)=-11.2f; // $C/GG + ndH(5,2,4,3)=-4.4f*1000; ndS(5,2,4,3)=-13.1f; // $C/TG + ndH(5,3,1,2)=-5.9f*1000; ndS(5,3,1,2)=-16.5f; // $G/AC + ndH(5,3,2,2)=-2.6f*1000; ndS(5,3,2,2)=-7.4f; // $G/CC + ndH(5,3,3,2)=-3.2f*1000; ndS(5,3,3,2)=-10.4f; // $G/GC + ndH(5,3,4,2)=-5.2f*1000; ndS(5,3,4,2)=-15.0f; // $G/TC + ndH(5,4,1,1)=-0.5f*1000; ndS(5,4,1,1)=-1.1f; // $T/AA + ndH(5,4,2,1)=4.7f*1000; ndS(5,4,2,1)=14.2f; // $T/CA + ndH(5,4,3,1)=-4.1f*1000; ndS(5,4,3,1)=-13.1f; // $T/GA + ndH(5,4,4,1)=-3.8f*1000; ndS(5,4,4,1)=-12.6f; // $T/TA + ndH(1,5,4,1)=-2.9f*1000; ndS(1,5,4,1)=-7.6f; // A$/TA + ndH(1,5,4,2)=-4.1f*1000; ndS(1,5,4,2)=-13.0f; // A$/TC + ndH(1,5,4,3)=-4.2f*1000; ndS(1,5,4,3)=-15.0f; // A$/TG + ndH(1,5,4,4)=-0.2f*1000; ndS(1,5,4,4)=-0.5f; // A$/TT + ndH(1,1,5,4)=0.2f*1000; ndS(1,1,5,4)=2.3f; // AA/$T + 
ndH(1,1,4,5)=-0.5f*1000; ndS(1,1,4,5)=-1.1f; // AA/T$ + ndH(1,2,5,3)=-6.3f*1000; ndS(1,2,5,3)=-17.1f; // AC/$G + ndH(1,2,4,5)=4.7f*1000; ndS(1,2,4,5)=14.2f; // AC/T$ + ndH(1,3,5,2)=-3.7f*1000; ndS(1,3,5,2)=-10.0f; // AG/$C + ndH(1,3,4,5)=-4.1f*1000; ndS(1,3,4,5)=-13.1f; // AG/T$ + ndH(1,4,5,1)=-2.9f*1000; ndS(1,4,5,1)=-7.6f; // AT/$A + ndH(1,4,4,5)=-3.8f*1000; ndS(1,4,4,5)=-12.6f; // AT/T$ + ndH(2,5,3,1)=-3.7f*1000; ndS(2,5,3,1)=-10.0f; // C$/GA + ndH(2,5,3,2)=-4.0f*1000; ndS(2,5,3,2)=-11.9f; // C$/GC + ndH(2,5,3,3)=-3.9f*1000; ndS(2,5,3,3)=-10.9f; // C$/GG + ndH(2,5,3,4)=-4.9f*1000; ndS(2,5,3,4)=-13.8f; // C$/GT + ndH(2,1,5,4)=0.6f*1000; ndS(2,1,5,4)=3.3f; // CA/$T + ndH(2,1,3,5)=-5.9f*1000; ndS(2,1,3,5)=-16.5f; // CA/G$ + ndH(2,2,5,3)=-4.4f*1000; ndS(2,2,5,3)=-12.6f; // CC/$G + ndH(2,2,3,5)=-2.6f*1000; ndS(2,2,3,5)=-7.4f; // CC/G$ + ndH(2,3,5,2)=-4.0f*1000; ndS(2,3,5,2)=-11.9f; // CG/$C + ndH(2,3,3,5)=-3.2f*1000; ndS(2,3,3,5)=-10.4f; // CG/G$ + ndH(2,4,5,1)=-4.1f*1000; ndS(2,4,5,1)=-13.0f; // CT/$A + ndH(2,4,3,5)=-5.2f*1000; ndS(2,4,3,5)=-15.0f; // CT/G$ + ndH(3,5,2,1)=-6.3f*1000; ndS(3,5,2,1)=-17.1f; // G$/CA + ndH(3,5,2,2)=-4.4f*1000; ndS(3,5,2,2)=-12.6f; // G$/CC + ndH(3,5,2,3)=-5.1f*1000; ndS(3,5,2,3)=-14.0f; // G$/CG + ndH(3,5,2,4)=-4.0f*1000; ndS(3,5,2,4)=-10.9f; // G$/CT + ndH(3,1,5,4)=-1.1f*1000; ndS(3,1,5,4)=-1.6f; // GA/$T + ndH(3,1,2,5)=-2.1f*1000; ndS(3,1,2,5)=-3.9f; // GA/C$ + ndH(3,2,5,3)=-5.1f*1000; ndS(3,2,5,3)=-14.0f; // GC/$G + ndH(3,2,2,5)=-0.2f*1000; ndS(3,2,2,5)=-0.1f; // GC/C$ + ndH(3,3,5,2)=-3.9f*1000; ndS(3,3,5,2)=-10.9f; // GG/$C + ndH(3,3,2,5)=-3.9f*1000; ndS(3,3,2,5)=-11.2f; // GG/C$ + ndH(3,4,5,1)=-4.2f*1000; ndS(3,4,5,1)=-15.0f; // GT/$A + ndH(3,4,2,5)=-4.4f*1000; ndS(3,4,2,5)=-13.1f; // GT/C$ + ndH(4,5,1,1)=0.2f*1000; ndS(4,5,1,1)=2.3f; // T$/AA + ndH(4,5,1,2)=0.6f*1000; ndS(4,5,1,2)=3.3f; // T$/AC + ndH(4,5,1,3)=-1.1f*1000; ndS(4,5,1,3)=-1.6f; // T$/AG + ndH(4,5,1,4)=-6.9f*1000; ndS(4,5,1,4)=-20.0f; // T$/AT + 
ndH(4,1,5,4)=-6.9f*1000; ndS(4,1,5,4)=-20.0f; // TA/$T + ndH(4,1,1,5)=-0.7f*1000; ndS(4,1,1,5)=-0.7f; // TA/A$ + ndH(4,2,5,3)=-4.0f*1000; ndS(4,2,5,3)=-10.9f; // TC/$G + ndH(4,2,1,5)=4.4f*1000; ndS(4,2,1,5)=14.9f; // TC/A$ + ndH(4,3,5,2)=-4.9f*1000; ndS(4,3,5,2)=-13.8f; // TG/$C + ndH(4,3,1,5)=-1.6f*1000; ndS(4,3,1,5)=-3.6f; // TG/A$ + ndH(4,4,5,1)=-0.2f*1000; ndS(4,4,5,1)=-0.5f; // TT/$A + ndH(4,4,1,5)=2.9f*1000; ndS(4,4,1,5)=10.4f; // TT/A$ + + return; +} + +int nparam_CountGCContent(char * seq ) { + int lseq = strlen(seq); + int k; + double count = 0; + for( k=0;krlogc; + double mtemp; + char c1; + char c2; + char c3; + char c4; + unsigned int i; + char nseq[50]; + char *useq = seq; + + nparam_CleanSeq (seq, nseq, len); + useq = nseq; + + for ( i=1;idH[c3][c4][c1][c2];//nparam_GetEnthalpy(nparm, c3,c4,c1,c2); + thedS += nparam_GetEntropy(nparm, c3,c4,c1,c2); + } + //printf("------------------\n"); + mtemp = nparam_CalcTM(thedS,thedH); + //fprintf(stderr,"Enthalpy: %f, entropy: %f, seq: %s rloc=%f\n", thedH, thedS, useq, nparm->rlogc); + //exit (0); + return mtemp; +} + +double nparam_CalcTwoTM(PNNParams nparm, char* seq1, char* seq2, int len) +{ + double thedH = 0; + //double thedS = nparam_GetInitialEntropy(nparm); + double thedS = -5.9f+nparm->rlogc; + double mtemp; + char c1; + char c2; + char c3; + char c4; + unsigned int i; + char nseq1[50]; + char nseq2[50]; + char *useq1; + char *useq2; + + nparam_CleanSeq (seq1, nseq1, len); + useq1 = nseq1; + + nparam_CleanSeq (seq2, nseq2, len); + useq2 = nseq2; + + //fprintf (stderr,"Primer : %s\n",useq); + for ( i=1;idH[c3][c4][c1][c2];//nparam_GetEnthalpy(nparm, c3,c4,c1,c2); + thedS += nparam_GetEntropy(nparm, c3,c4,c1,c2); + } + //fprintf(stderr,"------------------\n"); + mtemp = nparam_CalcTM(thedS,thedH); + //if (mtemp == 0) + //{ + // fprintf(stderr,"Enthalpy: %f, entropy: %f, seq: %s\n", thedH, thedS, useq); + //exit (0); + //} + return mtemp; +} + +double calculateMeltingTemperatureBasic (char * seq) { + int 
gccount; + double temp; + int seqlen; + + seqlen = strlen (seq); + gccount = nparam_CountGCContent (seq); + temp = 64.9 + 41*(gccount - 16.4)/seqlen; + return temp; +} diff --git a/src/libthermo/nnparams.h b/src/libthermo/nnparams.h new file mode 100644 index 0000000..5520ae1 --- /dev/null +++ b/src/libthermo/nnparams.h @@ -0,0 +1,72 @@ +/* + * nnparams.h + * PHunterLib + * + * Nearest Neighbor Model Parameters + * + * Created by Tiayyba Riaz on 02/07/09. + * + */ + +#ifndef NNPARAMS_H_ +#define NNPARAMS_H_ + +#include +#include +//#include "../libecoprimer/ecoprimer.h" + +// following defines to simplify coding... +#define ndH(a,b,c,d) nparm->dH[a][b][c][d] +#define ndS(a,b,c,d) nparm->dS[a][b][c][d] +#define forbidden_enthalpy 1000000000000000000.0f +#define R 1.987f +#define SALT_METHOD_SANTALUCIA 1 +#define SALT_METHOD_OWCZARZY 2 + +#define DEF_CONC_PRIMERS 0.0000008 +#define DEF_CONC_SEQUENCES 0 +#define DEF_SALT 0.05 + +#define GETNUMCODE(a) bpencoder[a - 'A'] +#define GETREVCODE(a) 5-bpencoder[a - 'A'] + + +extern double forbidden_entropy; + +static char bpencoder[] = { 1, // A + 0, // b + 2, // C + 0,0,0, // d, e, f + 3, // G + 0,0,0,0,0,0,0,0,0,0,0,0, // h,i,j,k,l,m,n,o,p,q,r,s + 4,0, // T,U + 0,0,0,0,0}; // v,w,x,y,z + + +typedef struct CNNParams_st +{ + double Ct1; + double Ct2; + double rlogc; + double kplus; + double kfac; + int saltMethod; + double gcContent; + double new_TM; + double dH[6][6][6][6]; // A-C-G-T + gap + initiation (dangling end, $ sign) + double dS[6][6][6][6]; +}CNNParams, * PNNParams; + +void nparam_InitParams(PNNParams nparm, double c1, double c2, double kp, int sm); +int nparam_CountGCContent(char * seq ); +double nparam_GetEntropy(PNNParams nparm, char x0, char x1, char y0, char y1); +double nparam_GetEnthalpy(PNNParams nparm, char x0, char x1, char y0, char y1); +double nparam_CalcTM(double entropy,double enthalpy); +double nparam_CalcSelfTM(PNNParams nparm, char* seq, int len); +double nparam_CalcTwoTM(PNNParams nparm, char* 
seq1, char* seq2, int len); + +double nparam_GetInitialEntropy(PNNParams nparm) ; +double calculateMeltingTemperatureBasic (char * seq); +//void getThermoProperties (ppair_t* pairs, size_t count, poptions_t options); + +#endif diff --git a/src/libthermo/thermostats.c b/src/libthermo/thermostats.c new file mode 100644 index 0000000..9141e17 --- /dev/null +++ b/src/libthermo/thermostats.c @@ -0,0 +1,115 @@ +#include +#include +#include +#include +#include "thermostats.h" + +word_t extractSite(char* sequence, size_t begin, size_t length, bool_t strand) +{ + char *c; + char *start; + uint32_t l; + word_t site = 0; + + start=sequence+begin; + if (!strand) + start+=length-1; + + + for (c=start, + l=0; + lp1->word; + w2 = pairs[i]->p2->word; + + if (!pairs[i]->asdirect1) + w1=ecoComplementWord(w1,options->primer_length); + + if (!pairs[i]->asdirect2) + w2=ecoComplementWord(w2,options->primer_length); + + strncpy(prmrd,ecoUnhashWord(w1, options->primer_length),options->primer_length); + strncpy(prmrr,ecoUnhashWord(w2, options->primer_length),options->primer_length); + prmrd[options->primer_length]=0; + prmrr[options->primer_length]=0; + pairs[i]->p1temp = nparam_CalcSelfTM (options->pnparm, prmrd, options->primer_length) - 273.0; + pairs[i]->p2temp = nparam_CalcSelfTM (options->pnparm, prmrr, options->primer_length) - 273.0; + pairs[i]->p1mintemp = 100; + pairs[i]->p2mintemp = 100; + + for (j = 0; j < pairs[i]->pcr.ampcount; j++) + if (pairs[i]->pcr.amplifias[j].sequence->isexample) + { + + sq = pairs[i]->pcr.amplifias[j].sequence->SQ; + strand = pairs[i]->pcr.amplifias[j].strand; + bp1 = pairs[i]->pcr.amplifias[j].begin - options->primer_length; + bp2 = pairs[i]->pcr.amplifias[j].end + 1; + + if (!strand) + { + uint32_t tmp; + tmp=bp1; + bp1=bp2; + bp2=tmp; + } + +// printf("%s : %s, %c",prmrd, +// ecoUnhashWord(extractSite(sq,bp1,options->primer_length,strand),options->primer_length), +// "rd"[strand]); + mtemp = nparam_CalcTwoTM(options->pnparm, + prmrd, + 
ecoUnhashWord(extractSite(sq,bp1,options->primer_length,strand),options->primer_length), + options->primer_length) - 273.0; +// printf(" %4.2f %4.2f\n",pairs[i]->p1temp,mtemp); + if (mtemp < pairs[i]->p1mintemp) + pairs[i]->p1mintemp = mtemp; + +// printf("%s : %s, %c\n",prmrr,ecoUnhashWord(extractSite(sq,bp2,options->primer_length,!strand),options->primer_length), +// "rd"[strand]); +// + mtemp = nparam_CalcTwoTM(options->pnparm, + prmrr, + ecoUnhashWord(extractSite(sq,bp2,options->primer_length,!strand),options->primer_length), + options->primer_length) - 273.0; + if (mtemp < pairs[i]->p2mintemp) + pairs[i]->p2mintemp = mtemp; + } + + if (w2 < w1) + { + mtemp = pairs[i]->p1temp; + pairs[i]->p1temp = pairs[i]->p2temp; + pairs[i]->p2temp = mtemp; + + mtemp = pairs[i]->p1mintemp; + pairs[i]->p1mintemp = pairs[i]->p2mintemp; + pairs[i]->p2mintemp = mtemp; + } + + } +} diff --git a/src/libthermo/thermostats.h b/src/libthermo/thermostats.h new file mode 100644 index 0000000..40c4806 --- /dev/null +++ b/src/libthermo/thermostats.h @@ -0,0 +1,9 @@ +#ifndef THERMOSTATS_H_ +#define THERMOSTATS_H_ + +#include "../libecoprimer/ecoprimer.h" + +void getThermoProperties (ppair_t* pairs, size_t count, poptions_t options); +word_t extractSite(char* sequence, size_t begin, size_t length, bool_t strand); + +#endif \ No newline at end of file diff --git a/tools/ecoPCRFormat.py b/tools/ecoPCRFormat.py new file mode 100755 index 0000000..3884001 --- /dev/null +++ b/tools/ecoPCRFormat.py @@ -0,0 +1,651 @@ +#!/usr/bin/env python + +import re +import gzip +import struct +import sys +import time +import getopt + +try: + import psycopg2 + _dbenable=True +except ImportError: + _dbenable=False + +##### +# +# +# Generic file function +# +# +##### + +def universalOpen(file): + if isinstance(file,str): + if file[-3:] == '.gz': + rep = gzip.open(file) + else: + rep = open(file) + else: + rep = file + return rep + +def universalTell(file): + if isinstance(file, gzip.GzipFile): + 
file=file.myfileobj + return file.tell() + +def fileSize(file): + if isinstance(file, gzip.GzipFile): + file=file.myfileobj + pos = file.tell() + file.seek(0,2) + length = file.tell() + file.seek(pos,0) + return length + +def progressBar(pos,max,reset=False,delta=[]): + if reset: + del delta[:] + if not delta: + delta.append(time.time()) + delta.append(time.time()) + + delta[1]=time.time() + elapsed = delta[1]-delta[0] + percent = float(pos)/max * 100 + remain = time.strftime('%H:%M:%S',time.gmtime(elapsed / percent * (100-percent))) + bar = '#' * int(percent/2) + bar+= '|/-\\-'[pos % 5] + bar+= ' ' * (50 - int(percent/2)) + sys.stderr.write('\r%5.1f %% |%s] remain : %s' %(percent,bar,remain)) + +##### +# +# +# NCBI Dump Taxonomy reader +# +# +##### + +def endLessIterator(endedlist): + for x in endedlist: + yield x + while(1): + yield endedlist[-1] + +class ColumnFile(object): + + def __init__(self,stream,sep=None,strip=True,types=None): + if isinstance(stream,str): + self._stream = open(stream) + elif hasattr(stream,'next'): + self._stream = stream + else: + raise ValueError,'stream must be string or an iterator' + self._delimiter=sep + self._strip=strip + if types: + self._types=[x for x in types] + for i in xrange(len(self._types)): + if self._types[i] is bool: + self._types[i]=ColumnFile.str2bool + else: + self._types=None + + def str2bool(x): + return bool(eval(x.strip()[0].upper(),{'T':True,'V':True,'F':False})) + + str2bool = staticmethod(str2bool) + + + def __iter__(self): + return self + + def next(self): + ligne = self._stream.next() + data = ligne.split(self._delimiter) + if self._strip or self._types: + data = [x.strip() for x in data] + if self._types: + it = endLessIterator(self._types) + data = [x[1](x[0]) for x in ((y,it.next()) for y in data)] + return data + +def taxonCmp(t1,t2): + if t1[0] < t2[0]: + return -1 + elif t1[0] > t2[0]: + return +1 + return 0 + +def bsearchTaxon(taxonomy,taxid): + taxCount = len(taxonomy) + begin = 0 + end = taxCount 
+ oldcheck=taxCount + check = begin + end / 2 + while check != oldcheck and taxonomy[check][0]!=taxid : + if taxonomy[check][0] < taxid: + begin=check + else: + end=check + oldcheck=check + check = (begin + end) / 2 + + + if taxonomy[check][0]==taxid: + return check + else: + return None + + + +def readNodeTable(file): + + file = universalOpen(file) + + nodes = ColumnFile(file, + sep='|', + types=(int,int,str, + str,str,bool, + int,bool,int, + bool,bool,bool,str)) + print >>sys.stderr,"Reading taxonomy dump file..." + taxonomy=[[n[0],n[2],n[1]] for n in nodes] + print >>sys.stderr,"List all taxonomy rank..." + ranks =list(set(x[1] for x in taxonomy)) + ranks.sort() + ranks = dict(map(None,ranks,xrange(len(ranks)))) + + print >>sys.stderr,"Sorting taxons..." + taxonomy.sort(taxonCmp) + + print >>sys.stderr,"Indexing taxonomy..." + index = {} + for t in taxonomy: + index[t[0]]=bsearchTaxon(taxonomy, t[0]) + + print >>sys.stderr,"Indexing parent and rank..." + for t in taxonomy: + t[1]=ranks[t[1]] + t[2]=index[t[2]] + + + return taxonomy,ranks,index + +def nameIterator(file): + file = universalOpen(file) + names = ColumnFile(file, + sep='|', + types=(int,str, + str,str)) + for taxid,name,unique,classname,white in names: + yield taxid,name,classname + +def mergedNodeIterator(file): + file = universalOpen(file) + merged = ColumnFile(file, + sep='|', + types=(int,int,str)) + for taxid,current,white in merged: + yield taxid,current + +def deletedNodeIterator(file): + file = universalOpen(file) + deleted = ColumnFile(file, + sep='|', + types=(int,str)) + for taxid,white in deleted: + yield taxid + +def readTaxonomyDump(taxdir): + taxonomy,ranks,index = readNodeTable('%s/nodes.dmp' % taxdir) + + print >>sys.stderr,"Adding scientific name..." 
+ + alternativeName=[] + for taxid,name,classname in nameIterator('%s/names.dmp' % taxdir): + alternativeName.append((name,classname,index[taxid])) + if classname == 'scientific name': + taxonomy[index[taxid]].append(name) + + print >>sys.stderr,"Adding taxid alias..." + for taxid,current in mergedNodeIterator('%s/merged.dmp' % taxdir): + index[taxid]=index[current] + + print >>sys.stderr,"Adding deleted taxid..." + for taxid in deletedNodeIterator('%s/delnodes.dmp' % taxdir): + index[taxid]=None + + return taxonomy,ranks,alternativeName,index + +def readTaxonomyDB(dbname): + connection = psycopg2.connect(database=dbname) + + cursor = connection.cursor() + cursor.execute("select numid,rank,parent from ncbi_taxonomy.taxon") + taxonomy=[list(x) for x in cursor] + + cursor.execute("select rank_class from ncbi_taxonomy.taxon_rank_class order by rank_class") + ranks=cursor.fetchall() + ranks = dict(map(None,(x[0] for x in ranks),xrange(len(ranks)))) + + print >>sys.stderr,"Sorting taxons..." + taxonomy.sort(taxonCmp) + + print >>sys.stderr,"Indexing taxonomy..." + index = {} + for t in taxonomy: + index[t[0]]=bsearchTaxon(taxonomy, t[0]) + + print >>sys.stderr,"Indexing parent and rank..." + for t in taxonomy: + t[1]=ranks[t[1]] + try: + t[2]=index[t[2]] + except KeyError,e: + if t[2] is None and t[0]==1: + t[2]=index[t[0]] + else: + raise e + + cursor.execute("select taxid,name,category from ncbi_taxonomy.name") + + alternativeName=[] + for taxid,name,classname in cursor: + alternativeName.append((name,classname,index[taxid])) + if classname == 'scientific name': + taxonomy[index[taxid]].append(name) + + cursor.execute("select old_numid,current_numid from ncbi_taxonomy.taxon_id_alias") + + print >>sys.stderr,"Adding taxid alias..." 
+ for taxid,current in cursor: + if current is not None: + index[taxid]=index[current] + else: + index[taxid]=None + + + return taxonomy,ranks,alternativeName,index + +##### +# +# +# Genbank/EMBL sequence reader +# +# +##### + +def entryIterator(file): + file = universalOpen(file) + rep =[] + for ligne in file: + rep.append(ligne) + if ligne == '//\n': + rep = ''.join(rep) + yield rep + rep = [] + +def fastaEntryIterator(file): + file = universalOpen(file) + rep =[] + for ligne in file: + if ligne[0] == '>' and rep: + rep = ''.join(rep) + yield rep + rep = [] + rep.append(ligne) + if rep: + rep = ''.join(rep) + yield rep + +_cleanSeq = re.compile('[ \n0-9]+') + +def cleanSeq(seq): + return _cleanSeq.sub('',seq) + + +_gbParseID = re.compile('(?<=^LOCUS {7})[^ ]+(?= )',re.MULTILINE) +_gbParseDE = re.compile('(?<=^DEFINITION {2}).+?\. *$(?=[^ ])',re.MULTILINE+re.DOTALL) +_gbParseSQ = re.compile('(?<=^ORIGIN).+?(?=^//$)',re.MULTILINE+re.DOTALL) +_gbParseTX = re.compile('(?<= /db_xref="taxon:)[0-9]+(?=")') + +def genbankEntryParser(entry): + Id = _gbParseID.findall(entry)[0] + De = ' '.join(_gbParseDE.findall(entry)[0].split()) + Sq = cleanSeq(_gbParseSQ.findall(entry)[0].upper()) + try: + Tx = int(_gbParseTX.findall(entry)[0]) + except IndexError: + Tx = None + return {'id':Id,'taxid':Tx,'definition':De,'sequence':Sq} + +###################### + +_cleanDef = re.compile('[\nDE]') + +def cleanDef(definition): + return _cleanDef.sub('',definition) + +_emblParseID = re.compile('(?<=^ID {3})[^ ]+(?=;)',re.MULTILINE) +_emblParseDE = re.compile('(?<=^DE {3}).+?\. 
*$(?=[^ ])',re.MULTILINE+re.DOTALL) +_emblParseSQ = re.compile('(?<=^ ).+?(?=^//$)',re.MULTILINE+re.DOTALL) +_emblParseTX = re.compile('(?<= /db_xref="taxon:)[0-9]+(?=")') + +def emblEntryParser(entry): + Id = _emblParseID.findall(entry)[0] + De = ' '.join(cleanDef(_emblParseDE.findall(entry)[0]).split()) + Sq = cleanSeq(_emblParseSQ.findall(entry)[0].upper()) + try: + Tx = int(_emblParseTX.findall(entry)[0]) + except IndexError: + Tx = None + return {'id':Id,'taxid':Tx,'definition':De,'sequence':Sq} + + +###################### + +_fastaSplit=re.compile(';\W*') + +def parseFasta(seq): + seq=seq.split('\n') + title = seq[0].strip()[1:].split(None,1) + id=title[0] + if len(title) == 2: + field = _fastaSplit.split(title[1]) + else: + field=[] + info = dict(x.split('=',1) for x in field if '=' in x) + definition = ' '.join([x for x in field if '=' not in x]) + seq=(''.join([x.strip() for x in seq[1:]])).upper() + return id,seq,definition,info + + +def fastaEntryParser(entry): + id,seq,definition,info = parseFasta(entry) + Tx = info.get('taxid',None) + if Tx is not None: + Tx=int(Tx) + return {'id':id,'taxid':Tx,'definition':definition,'sequence':seq} + + +def sequenceIteratorFactory(entryParser,entryIterator): + def sequenceIterator(file): + for entry in entryIterator(file): + yield entryParser(entry) + return sequenceIterator + + +def taxonomyInfo(entry,connection): + taxid = entry['taxid'] + curseur = connection.cursor() + curseur.execute(""" + select taxid,species,genus,family, + taxonomy.scientificName(taxid) as sn, + taxonomy.scientificName(species) as species_sn, + taxonomy.scientificName(genus) as genus_sn, + taxonomy.scientificName(family) as family_sn + from + ( + select alias as taxid, + taxonomy.getSpecies(alias) as species, + taxonomy.getGenus(alias) as genus, + taxonomy.getFamily(alias) as family + from taxonomy.aliases + where id=%d ) as tax + """ % taxid) + rep = curseur.fetchone() + entry['current_taxid']=rep[0] + entry['species']=rep[1] + 
entry['genus']=rep[2] + entry['family']=rep[3] + entry['scientific_name']=rep[4] + entry['species_sn']=rep[5] + entry['genus_sn']=rep[6] + entry['family_sn']=rep[7] + return entry + +##### +# +# +# Binary writer +# +# +##### + +def ecoSeqPacker(sq): + + compactseq = gzip.zlib.compress(sq['sequence'],9) + cptseqlength = len(compactseq) + delength = len(sq['definition']) + + totalSize = 4 + 20 + 4 + 4 + 4 + cptseqlength + delength + + packed = struct.pack('> I I 20s I I I %ds %ds' % (delength,cptseqlength), + totalSize, + sq['taxid'], + sq['id'], + delength, + len(sq['sequence']), + cptseqlength, + sq['definition'], + compactseq) + + assert len(packed) == totalSize+4, "error in sequence packing" + + return packed + +def ecoTaxPacker(tx): + + namelength = len(tx[3]) + + totalSize = 4 + 4 + 4 + 4 + namelength + + packed = struct.pack('> I I I I I %ds' % namelength, + totalSize, + tx[0], + tx[1], + tx[2], + namelength, + tx[3]) + + return packed + +def ecoRankPacker(rank): + + namelength = len(rank) + + packed = struct.pack('> I %ds' % namelength, + namelength, + rank) + + return packed + +def ecoNamePacker(name): + + namelength = len(name[0]) + classlength= len(name[1]) + totalSize = namelength + classlength + 4 + 4 + 4 + 4 + + packed = struct.pack('> I I I I I %ds %ds' % (namelength,classlength), + totalSize, + int(name[1]=='scientific name'), + namelength, + classlength, + name[2], + name[0], + name[1]) + + return packed + +def ecoSeqWriter(file,input,taxindex,parser): + output = open(file,'wb') + input = universalOpen(input) + inputsize = fileSize(input) + entries = parser(input) + seqcount=0 + skipped = [] + + output.write(struct.pack('> I',seqcount)) + + progressBar(1, inputsize,reset=True) + for entry in entries: + if entry['taxid'] is not None: + try: + entry['taxid']=taxindex[entry['taxid']] + except KeyError: + entry['taxid']=None + if entry['taxid'] is not None: + seqcount+=1 + output.write(ecoSeqPacker(entry)) + else: + skipped.append(entry['id']) + where = 
universalTell(input) + progressBar(where, inputsize) + print >>sys.stderr," Readed sequences : %d " % seqcount, + else: + skipped.append(entry['id']) + + print >>sys.stderr + output.seek(0,0) + output.write(struct.pack('> I',seqcount)) + + output.close() + return skipped + + +def ecoTaxWriter(file,taxonomy): + output = open(file,'wb') + output.write(struct.pack('> I',len(taxonomy))) + + for tx in taxonomy: + output.write(ecoTaxPacker(tx)) + + output.close() + +def ecoRankWriter(file,ranks): + output = open(file,'wb') + output.write(struct.pack('> I',len(ranks))) + + rankNames = ranks.keys() + rankNames.sort() + + for rank in rankNames: + output.write(ecoRankPacker(rank)) + + output.close() + +def nameCmp(n1,n2): + name1=n1[0].upper() + name2=n2[0].upper() + if name1 < name2: + return -1 + elif name1 > name2: + return 1 + return 0 + + +def ecoNameWriter(file,names): + output = open(file,'wb') + output.write(struct.pack('> I',len(names))) + + names.sort(nameCmp) + + for name in names: + output.write(ecoNamePacker(name)) + + output.close() + +def ecoDBWriter(prefix,taxonomy,seqFileNames,parser): + + ecoRankWriter('%s.rdx' % prefix, taxonomy[1]) + ecoTaxWriter('%s.tdx' % prefix, taxonomy[0]) + ecoNameWriter('%s.ndx' % prefix, taxonomy[2]) + + filecount = 0 + for filename in seqFileNames: + filecount+=1 + sk=ecoSeqWriter('%s_%03d.sdx' % (prefix,filecount), + filename, + taxonomy[3], + parser) + if sk: + print >>sys.stderr,"Skipped entry :" + print >>sys.stderr,sk + +def ecoParseOptions(arguments): + opt = { + 'prefix' : 'ecodb', + 'taxdir' : 'taxdump', + 'parser' : sequenceIteratorFactory(genbankEntryParser, + entryIterator) + } + + o,filenames = getopt.getopt(arguments, + 'ht:T:n:gfe', + ['help', + 'taxonomy=', + 'taxonomy_db=', + 'name=', + 'genbank', + 'fasta', + 'embl']) + + for name,value in o: + if name in ('-h','--help'): + printHelp() + exit() + elif name in ('-t','--taxonomy'): + opt['taxmod']='dump' + opt['taxdir']=value + elif name in ('-T','--taxonomy_db'): 
+ opt['taxmod']='db' + opt['taxdb']=value + elif name in ('-n','--name'): + opt['prefix']=value + elif name in ('-g','--genbank'): + opt['parser']=sequenceIteratorFactory(genbankEntryParser, + entryIterator) + + elif name in ('-f','--fasta'): + opt['parser']=sequenceIteratorFactory(fastaEntryParser, + fastaEntryIterator) + + elif name in ('-e','--embl'): + opt['parser']=sequenceIteratorFactory(emblEntryParser, + entryIterator) + else: + raise ValueError,'Unknown option %s' % name + + return opt,filenames + +def printHelp(): + print "-----------------------------------" + print " ecoPCRFormat.py" + print "-----------------------------------" + print "ecoPCRFormat.py [option] " + print "-----------------------------------" + print "-e --embl :[E]mbl format" + print "-f --fasta :[F]asta format" + print "-g --genbank :[G]enbank format" + print "-h --help :[H]elp - print this help" + print "-n --name :[N]ame of the new database created" + print "-t --taxonomy :[T]axonomy - path to the taxonomy database" + print " :bcp-like dump from GenBank taxonomy database." + print "-----------------------------------" + +if __name__ == '__main__': + + opt,filenames = ecoParseOptions(sys.argv[1:]) + + if opt['taxmod']=='dump': + taxonomy = readTaxonomyDump(opt['taxdir']) + elif opt['taxmod']=='db': + taxonomy = readTaxonomyDB(opt['taxdb']) + + + ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser']) +