My complete changes on my laptop, with specificity bug fix + ahocorasick + sets

git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPrimers/trunk@393 60f365c0-8329-0410-b2a4-ec073aeeaa1d
This commit is contained in:
2012-01-03 21:05:31 +00:00
parent 19887e9a46
commit 1f5a30b0df
13 changed files with 2502 additions and 254 deletions

364
.cproject
View File

@ -1,151 +1,221 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?> <?fileVersion 4.0.0?>
<cproject> <cproject>
<storageModule moduleId="org.eclipse.cdt.core.settings"> <storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="cdt.managedbuild.toolchain.gnu.macosx.base.2134184396"> <cconfiguration id="cdt.managedbuild.toolchain.gnu.macosx.base.2134184396">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.toolchain.gnu.macosx.base.2134184396" moduleId="org.eclipse.cdt.core.settings" name="MacOSX GCC"> <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.toolchain.gnu.macosx.base.2134184396" moduleId="org.eclipse.cdt.core.settings" name="MacOSX GCC">
<externalSettings/> <externalSettings/>
<extensions> <extensions>
<extension id="org.eclipse.cdt.core.MachO" point="org.eclipse.cdt.core.BinaryParser"/> <extension id="org.eclipse.cdt.core.MachO" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/> <extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/> <extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/> <extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.MakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/> <extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
</extensions> <extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
</storageModule> </extensions>
<storageModule moduleId="cdtBuildSystem" version="4.0.0"> </storageModule>
<configuration artifactName="ecoPrimers" buildProperties="" id="cdt.managedbuild.toolchain.gnu.macosx.base.2134184396" name="MacOSX GCC" parent="org.eclipse.cdt.build.core.emptycfg"> <storageModule moduleId="cdtBuildSystem" version="4.0.0">
<folderInfo id="cdt.managedbuild.toolchain.gnu.macosx.base.2134184396.1840911077" name="/" resourcePath=""> <configuration artifactName="ecoPrimers" buildProperties="" description="" id="cdt.managedbuild.toolchain.gnu.macosx.base.2134184396" name="MacOSX GCC" parent="org.eclipse.cdt.build.core.emptycfg">
<toolChain id="cdt.managedbuild.toolchain.gnu.macosx.base.766054112" name="cdt.managedbuild.toolchain.gnu.macosx.base" superClass="cdt.managedbuild.toolchain.gnu.macosx.base"> <folderInfo id="cdt.managedbuild.toolchain.gnu.macosx.base.2134184396.1840911077" name="/" resourcePath="">
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.MachO" id="cdt.managedbuild.target.gnu.platform.macosx.base.2057035265" name="Debug Platform" osList="macosx" superClass="cdt.managedbuild.target.gnu.platform.macosx.base"/> <toolChain id="cdt.managedbuild.toolchain.gnu.macosx.base.766054112" name="cdt.managedbuild.toolchain.gnu.macosx.base" superClass="cdt.managedbuild.toolchain.gnu.macosx.base">
<builder id="cdt.managedbuild.target.gnu.builder.macosx.base.783726363" managedBuildOn="false" name="Gnu Make Builder.MacOSX GCC" superClass="cdt.managedbuild.target.gnu.builder.macosx.base"/> <targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.MachO" id="cdt.managedbuild.target.gnu.platform.macosx.base.2057035265" name="Debug Platform" osList="macosx" superClass="cdt.managedbuild.target.gnu.platform.macosx.base"/>
<tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.base.914103467" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.base"> <builder id="cdt.managedbuild.target.gnu.builder.macosx.base.783726363" keepEnvironmentInBuildfile="false" managedBuildOn="false" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.base"/>
<inputType id="cdt.managedbuild.tool.macosx.c.linker.input.62980206" superClass="cdt.managedbuild.tool.macosx.c.linker.input"> <tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.base.914103467" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.base">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/> <inputType id="cdt.managedbuild.tool.macosx.c.linker.input.62980206" superClass="cdt.managedbuild.tool.macosx.c.linker.input">
<additionalInput kind="additionalinput" paths="$(LIBS)"/> <additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
</inputType> <additionalInput kind="additionalinput" paths="$(LIBS)"/>
</tool> </inputType>
<tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.base.691108439" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.base"/> </tool>
<tool id="cdt.managedbuild.tool.gnu.assembler.macosx.base.695639877" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.macosx.base"> <tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.base.691108439" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.base"/>
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.1507665054" superClass="cdt.managedbuild.tool.gnu.assembler.input"/> <tool id="cdt.managedbuild.tool.gnu.assembler.macosx.base.695639877" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.macosx.base">
</tool> <option id="gnu.both.asm.option.include.paths.1544375094" name="Include paths (-I)" superClass="gnu.both.asm.option.include.paths" valueType="includePath"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.macosx.base.1786370580" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.macosx.base"/> <inputType id="cdt.managedbuild.tool.gnu.assembler.input.1507665054" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.macosx.base.454329831" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.macosx.base"/> </tool>
<tool id="cdt.managedbuild.tool.gnu.c.compiler.macosx.base.1928774909" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.macosx.base"> <tool id="cdt.managedbuild.tool.gnu.archiver.macosx.base.1786370580" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.macosx.base"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.330854350" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/> <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.macosx.base.454329831" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.macosx.base"/>
</tool> <tool id="cdt.managedbuild.tool.gnu.c.compiler.macosx.base.1928774909" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.macosx.base">
</toolChain> <option id="gnu.c.compiler.option.include.paths.823251305" superClass="gnu.c.compiler.option.include.paths" valueType="includePath">
</folderInfo> <listOptionValue builtIn="false" value="/usr/include"/>
</configuration> </option>
</storageModule> <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.330854350" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
<storageModule moduleId="scannerConfiguration"> </tool>
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile"/> </toolChain>
<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile"> </folderInfo>
<buildOutputProvider> </configuration>
<openAction enabled="true" filePath=""/> </storageModule>
<parser enabled="true"/> <storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</buildOutputProvider> <storageModule moduleId="org.eclipse.cdt.core.language.mapping"/>
<scannerInfoProvider id="specsFile"> <storageModule moduleId="scannerConfiguration">
<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/> <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile"/>
<parser enabled="true"/> <profile id="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile">
</scannerInfoProvider> <buildOutputProvider>
</profile> <openAction enabled="true" filePath=""/>
<profile id="org.eclipse.cdt.make.core.GCCStandardMakePerFileProfile"> <parser enabled="true"/>
<buildOutputProvider> </buildOutputProvider>
<openAction enabled="true" filePath=""/> <scannerInfoProvider id="specsFile">
<parser enabled="true"/> <runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
</buildOutputProvider> <parser enabled="true"/>
<scannerInfoProvider id="makefileGenerator"> </scannerInfoProvider>
<runAction arguments="-f ${project_name}_scd.mk" command="make" useDefault="true"/> </profile>
<parser enabled="true"/> <profile id="org.eclipse.cdt.make.core.GCCStandardMakePerFileProfile">
</scannerInfoProvider> <buildOutputProvider>
</profile> <openAction enabled="true" filePath=""/>
<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfile"> <parser enabled="true"/>
<buildOutputProvider> </buildOutputProvider>
<openAction enabled="true" filePath=""/> <scannerInfoProvider id="makefileGenerator">
<parser enabled="true"/> <runAction arguments="-f ${project_name}_scd.mk" command="make" useDefault="true"/>
</buildOutputProvider> <parser enabled="true"/>
<scannerInfoProvider id="specsFile"> </scannerInfoProvider>
<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/> </profile>
<parser enabled="true"/> <profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfile">
</scannerInfoProvider> <buildOutputProvider>
</profile> <openAction enabled="true" filePath=""/>
<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"> <parser enabled="true"/>
<buildOutputProvider> </buildOutputProvider>
<openAction enabled="true" filePath=""/> <scannerInfoProvider id="specsFile">
<parser enabled="true"/> <runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
</buildOutputProvider> <parser enabled="true"/>
<scannerInfoProvider id="specsFile"> </scannerInfoProvider>
<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/> </profile>
<parser enabled="true"/> <profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP">
</scannerInfoProvider> <buildOutputProvider>
</profile> <openAction enabled="true" filePath=""/>
<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"> <parser enabled="true"/>
<buildOutputProvider> </buildOutputProvider>
<openAction enabled="true" filePath=""/> <scannerInfoProvider id="specsFile">
<parser enabled="true"/> <runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
</buildOutputProvider> <parser enabled="true"/>
<scannerInfoProvider id="specsFile"> </scannerInfoProvider>
<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/> </profile>
<parser enabled="true"/> <profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC">
</scannerInfoProvider> <buildOutputProvider>
</profile> <openAction enabled="true" filePath=""/>
<profile id="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfile"> <parser enabled="true"/>
<buildOutputProvider> </buildOutputProvider>
<openAction enabled="true" filePath=""/> <scannerInfoProvider id="specsFile">
<parser enabled="true"/> <runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
</buildOutputProvider> <parser enabled="true"/>
<scannerInfoProvider id="specsFile"> </scannerInfoProvider>
<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/> </profile>
<parser enabled="true"/> <profile id="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfile">
</scannerInfoProvider> <buildOutputProvider>
</profile> <openAction enabled="true" filePath=""/>
<profile id="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfileCPP"> <parser enabled="true"/>
<buildOutputProvider> </buildOutputProvider>
<openAction enabled="true" filePath=""/> <scannerInfoProvider id="specsFile">
<parser enabled="true"/> <runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
</buildOutputProvider> <parser enabled="true"/>
<scannerInfoProvider id="specsFile"> </scannerInfoProvider>
<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/> </profile>
<parser enabled="true"/> <profile id="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfileCPP">
</scannerInfoProvider> <buildOutputProvider>
</profile> <openAction enabled="true" filePath=""/>
<profile id="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfileC"> <parser enabled="true"/>
<buildOutputProvider> </buildOutputProvider>
<openAction enabled="true" filePath=""/> <scannerInfoProvider id="specsFile">
<parser enabled="true"/> <runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
</buildOutputProvider> <parser enabled="true"/>
<scannerInfoProvider id="specsFile"> </scannerInfoProvider>
<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/> </profile>
<parser enabled="true"/> <profile id="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfileC">
</scannerInfoProvider> <buildOutputProvider>
</profile> <openAction enabled="true" filePath=""/>
<profile id="org.eclipse.cdt.managedbuilder.xlc.core.XLCManagedMakePerProjectProfile"> <parser enabled="true"/>
<buildOutputProvider> </buildOutputProvider>
<openAction enabled="false" filePath=""/> <scannerInfoProvider id="specsFile">
<parser enabled="false"/> <runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
</buildOutputProvider> <parser enabled="true"/>
<scannerInfoProvider id="specsFile"> </scannerInfoProvider>
<runAction arguments="-E -v ${plugin_state_location}/${specs_file}" command="${XL_compilerRoot}/xlc" useDefault="true"/> </profile>
<parser enabled="true"/> <scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.macosx.base.2134184396;cdt.managedbuild.toolchain.gnu.macosx.base.2134184396.1840911077;cdt.managedbuild.tool.gnu.c.compiler.macosx.base.1928774909;cdt.managedbuild.tool.gnu.c.compiler.input.330854350">
</scannerInfoProvider> <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</profile> <profile id="org.eclipse.cdt.make.core.GCCStandardMakePerProjectProfile">
<profile id="org.eclipse.cdt.managedbuilder.xlc.core.XLCManagedMakePerProjectProfileCPP"> <buildOutputProvider>
<buildOutputProvider> <openAction enabled="true" filePath=""/>
<openAction enabled="false" filePath=""/> <parser enabled="true"/>
<parser enabled="false"/> </buildOutputProvider>
</buildOutputProvider> <scannerInfoProvider id="specsFile">
<scannerInfoProvider id="specsFile"> <runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
<runAction arguments="-E -v ${plugin_state_location}/${specs_file}" command="${XL_compilerRoot}/xlC" useDefault="true"/> <parser enabled="true"/>
<parser enabled="true"/> </scannerInfoProvider>
</scannerInfoProvider> </profile>
</profile> <profile id="org.eclipse.cdt.make.core.GCCStandardMakePerFileProfile">
</storageModule> <buildOutputProvider>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/> <openAction enabled="true" filePath=""/>
</cconfiguration> <parser enabled="true"/>
</storageModule> </buildOutputProvider>
<storageModule moduleId="cdtBuildSystem" version="4.0.0"> <scannerInfoProvider id="makefileGenerator">
<project id="ecoPrimers.null.1292969001" name="ecoPrimers"/> <runAction arguments="-f ${project_name}_scd.mk" command="make" useDefault="true"/>
</storageModule> <parser enabled="true"/>
</scannerInfoProvider>
</profile>
<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfile">
<buildOutputProvider>
<openAction enabled="true" filePath=""/>
<parser enabled="true"/>
</buildOutputProvider>
<scannerInfoProvider id="specsFile">
<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
<parser enabled="true"/>
</scannerInfoProvider>
</profile>
<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP">
<buildOutputProvider>
<openAction enabled="true" filePath=""/>
<parser enabled="true"/>
</buildOutputProvider>
<scannerInfoProvider id="specsFile">
<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
<parser enabled="true"/>
</scannerInfoProvider>
</profile>
<profile id="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC">
<buildOutputProvider>
<openAction enabled="true" filePath=""/>
<parser enabled="true"/>
</buildOutputProvider>
<scannerInfoProvider id="specsFile">
<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
<parser enabled="true"/>
</scannerInfoProvider>
</profile>
<profile id="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfile">
<buildOutputProvider>
<openAction enabled="true" filePath=""/>
<parser enabled="true"/>
</buildOutputProvider>
<scannerInfoProvider id="specsFile">
<runAction arguments="-E -P -v -dD ${plugin_state_location}/${specs_file}" command="gcc" useDefault="true"/>
<parser enabled="true"/>
</scannerInfoProvider>
</profile>
<profile id="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfileCPP">
<buildOutputProvider>
<openAction enabled="true" filePath=""/>
<parser enabled="true"/>
</buildOutputProvider>
<scannerInfoProvider id="specsFile">
<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.cpp" command="g++" useDefault="true"/>
<parser enabled="true"/>
</scannerInfoProvider>
</profile>
<profile id="org.eclipse.cdt.managedbuilder.core.GCCWinManagedMakePerProjectProfileC">
<buildOutputProvider>
<openAction enabled="true" filePath=""/>
<parser enabled="true"/>
</buildOutputProvider>
<scannerInfoProvider id="specsFile">
<runAction arguments="-E -P -v -dD ${plugin_state_location}/specs.c" command="gcc" useDefault="true"/>
<parser enabled="true"/>
</scannerInfoProvider>
</profile>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<project id="ecoPrimers.null.1292969001" name="ecoPrimers"/>
</storageModule>
</cproject> </cproject>

View File

@ -7,6 +7,7 @@
#include "libecoprimer/ecoprimer.h" #include "libecoprimer/ecoprimer.h"
#include "libecoprimer/PrimerSets.h" #include "libecoprimer/PrimerSets.h"
#include "libecoprimer/ahocorasick.h"
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#include <ctype.h> #include <ctype.h>
@ -25,6 +26,8 @@
static int cmpprintedpairs(const void* p1,const void* p2); static int cmpprintedpairs(const void* p1,const void* p2);
//float _Z27calculateMeltingTemperature_ (char * seq1, char * seq2); //float _Z27calculateMeltingTemperature_ (char * seq1, char * seq2);
pwordcount_t reduce_words_to_debug (pwordcount_t words, poptions_t options);
void print_wordwith_positions (primer_t prm, uint32_t seqdbsize, poptions_t options);
void* lib_handle = NULL; void* lib_handle = NULL;
float (*calcMelTemp)(char*, char*); float (*calcMelTemp)(char*, char*);
@ -71,12 +74,12 @@ static void PrintHelp()
PP "-m : Salt correction method for Tm computation (SANTALUCIA : 1 or OWCZARZY:2, default=1)\n\n"); PP "-m : Salt correction method for Tm computation (SANTALUCIA : 1 or OWCZARZY:2, default=1)\n\n");
PP "-a : Salt contentration in M for Tm computation (default 0.05 M)\n\n"); PP "-a : Salt contentration in M for Tm computation (default 0.05 M)\n\n");
PP "-U : No multi match\n\n"); PP "-U : No multi match\n\n");
PP "-U : Define the [R]eference sequence identifier (must be part of example set)\n\n"); PP "-R : Define the [R]eference sequence identifier (must be part of example set)\n\n");
PP "-A : Print the list of all identifier of sequences present in the database\n\n"); PP "-A : Print the list of all identifier of sequences present in the database\n\n");
PP "-f : Remove data mining step during strict primer identification\n\n"); PP "-f : Remove data mining step during strict primer identification\n\n");
PP "-v : Store statistic file about memory usage during strict primer identification\n\n"); PP "-v : Store statistic file about memory usage during strict primer identification\n\n");
PP "-p : Print sets of primers\n\n"); PP "-p : Print sets of primers (may take several minutes after primers have been designed!)\n\n");
PP "-T : Ignore pairs having specificity below this Threshold\n\n"); PP "-T : Ignore pairs having specificity below this Threshold\n\n");
PP "\n"); PP "\n");
PP "------------------------------------------\n"); PP "------------------------------------------\n");
PP "Table result description : \n"); PP "Table result description : \n");
@ -151,6 +154,9 @@ void initoptions(poptions_t options)
options->printAC=FALSE; options->printAC=FALSE;
options->print_sets_of_primers = FALSE; options->print_sets_of_primers = FALSE;
options->specificity_threshold = 0.6; options->specificity_threshold = 0.6;
options->links_cnt = 1;
options->max_links_percent = -1; /*graph only those primers having maximum 15% links*/
options->filter_on_links = FALSE;
} }
void printapair(int32_t index,ppair_t pair, poptions_t options) void printapair(int32_t index,ppair_t pair, poptions_t options)
@ -165,7 +171,7 @@ void printapair(int32_t index,ppair_t pair, poptions_t options)
bool_t good2=pair->p2->good; bool_t good2=pair->p2->good;
bool_t goodtmp; bool_t goodtmp;
bool_t strand; bool_t strand;
uint32_t i; uint32_t i, j;
float temp; float temp;
CNNParams nnparams; CNNParams nnparams;
@ -296,6 +302,12 @@ void printapair(int32_t index,ppair_t pair, poptions_t options)
else else
printf("\t\t"); printf("\t\t");
/* j=0;
for (i=0; i<options->dbsize; i++)
if (pair->wellIdentifiedSeqs[i] == 1)
j++;
printf("%d", j);*/
printf("\n"); printf("\n");
} }
@ -335,6 +347,7 @@ uint32_t filterandsortpairs(ppair_t* sortedpairs,uint32_t count, poptions_t opti
else qfp=0.0; else qfp=0.0;
sortedpairs[i]->wellIdentifiedSeqs = NULL; //TR 05/09/10 - wellIdentified needed for primer sets sortedpairs[i]->wellIdentifiedSeqs = NULL; //TR 05/09/10 - wellIdentified needed for primer sets
sortedpairs[i]->coveredSeqs = NULL; //TR 05/09/10 - wellIdentified needed for primer sets
sortedpairs[i]->quorumin = q; sortedpairs[i]->quorumin = q;
sortedpairs[i]->quorumout = qfp; sortedpairs[i]->quorumout = qfp;
sortedpairs[i]->yule = q - qfp; sortedpairs[i]->yule = q - qfp;
@ -345,13 +358,13 @@ uint32_t filterandsortpairs(ppair_t* sortedpairs,uint32_t count, poptions_t opti
{ {
//TR 05/09/10 - wellIdentified needed for primer sets //TR 05/09/10 - wellIdentified needed for primer sets
sortedpairs[j]->wellIdentifiedSeqs = ECOMALLOC(options->dbsize * sizeof(int),"Cannot allocate well_identified_array"); sortedpairs[j]->wellIdentifiedSeqs = ECOMALLOC(options->dbsize * sizeof(int),"Cannot allocate well_identified_array");
(void)taxonomycoverage(sortedpairs[j],options); sortedpairs[j]->coveredSeqs = ECOMALLOC(options->dbsize * sizeof(int),"Cannot allocate well_identified_array");
(void)taxonomycoverage(sortedpairs[j],options, seqdb, options->dbsize);
taxonomyspecificity(sortedpairs[j], seqdb, options->dbsize); taxonomyspecificity(sortedpairs[j], seqdb, options->dbsize);
//j++; //j++;
//if specificity less than user provided threshold (default 60%) then ignore this pair //if specificity less than user provided threshold (default 60%) then ignore this pair
if (sortedpairs[j]->bs >= options->specificity_threshold) if (sortedpairs[j]->bs >= options->specificity_threshold)
j++; j++;
} }
} }
@ -369,7 +382,8 @@ void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy,
size_t count; size_t count;
char *taxon[]={"taxon","taxa"}; char *taxon[]={"taxon","taxa"};
ecotx_t *current_taxon; ecotx_t *current_taxon;
pairset pair_sets; //pairset pair_sets;
pairset *pset = NULL;
//printf("Index\tPrimer1\tPrimer2\tGB\tInexampleCount\tOutexampleCount\tYule\tIntaxaCount\tOuttaxaCount\tCoverage\tSpecificity\tMinAmplifiedLength\tMaxAmplifiedLength\tAvgAmplifiedLength\n"); //printf("Index\tPrimer1\tPrimer2\tGB\tInexampleCount\tOutexampleCount\tYule\tIntaxaCount\tOuttaxaCount\tCoverage\tSpecificity\tMinAmplifiedLength\tMaxAmplifiedLength\tAvgAmplifiedLength\n");
@ -388,7 +402,7 @@ void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy,
for (i=0;i<pl->paircount;i++,j++) for (i=0;i<pl->paircount;i++,j++)
sortedpairs[j]=pl->pairs+i; sortedpairs[j]=pl->pairs+i;
count=filterandsortpairs(sortedpairs,pairs->count,options, seqdb); count=filterandsortpairs(sortedpairs,pairs->count,options, seqdb);
getThermoProperties(sortedpairs, count, options); getThermoProperties(sortedpairs, count, options);
@ -451,15 +465,55 @@ void printpairs (ppairtree_t pairs, poptions_t options,ecotaxonomy_t *taxonomy,
printf("# DB sequences are considered as linear\n"); printf("# DB sequences are considered as linear\n");
printf("# Pairs having specificity less than %0.2f will be ignored\n", options->specificity_threshold); printf("# Pairs having specificity less than %0.2f will be ignored\n", options->specificity_threshold);
printf("#\n"); printf("#\n");
for (i=0;i < count;i++) for (i=0;i < count;i++)
printapair(i,sortedpairs[i],options); printapair(i,sortedpairs[i],options);
if (options->filter_on_links)
{
fprintf (stderr, "Old size: %d, ", count);
count = primers_changeSortedArray (&sortedpairs, count, options);
//count = primers_filterWithGivenLinks (&sortedpairs, count, options);
fprintf (stderr, "New size: %d\n", count);
if (count == 0)
{
fprintf (stderr, "No pairs passed the links constraints.\n");
printf ("No pairs passed the links constraints.\n");
return;
}
for (i=0;i < count;i++)
printapair(i,sortedpairs[i],options);
}
if (options->print_sets_of_primers == TRUE) if (options->print_sets_of_primers == TRUE)
{ {
pair_sets = build_primers_set (sortedpairs, count, seqdb, options); /*pair_sets = build_primers_set (sortedpairs, count, seqdb, options);
printf("Results from Greedy Algorithm and some other possibilities:\n");
some_other_set_possibilities (&pair_sets, sortedpairs, count, seqdb, options); some_other_set_possibilities (&pair_sets, sortedpairs, count, seqdb, options);
printf("Results from simulated Anealing:\n");
sets_by_SimulatedAnealing (&pair_sets, sortedpairs, count, seqdb, options);
printf("Results from Tabu Search:\n");
sets_by_TabuSearch (&pair_sets, sortedpairs, count, seqdb, options);*/
//pset = sets_by_BruteForce (sortedpairs, count, seqdb, options);
//if (pset)
/*/{
printf("Results from simulated Anealing:\n");
sets_by_SimulatedAnealing (pset, sortedpairs, count, seqdb, options);
printf("Results from Tabu Search:\n");
sets_by_TabuSearch (pset, sortedpairs, count, seqdb, options);
if (pset)
{
ECOFREE (pset->set_wellIdentifiedTaxa, "Could not free memory for pair set wi");
ECOFREE (pset, "Could not free memory for pair");
}
}*/
build_and_print_sets (sortedpairs, count, seqdb, options);
} }
//primers_graph_graphviz (sortedpairs, count, options);
} }
@ -545,7 +599,7 @@ int main(int argc, char **argv)
initoptions(&options); initoptions(&options);
while ((carg = getopt(argc, argv, "hAfvcUDSpE:d:l:L:e:i:r:R:q:3:s:x:t:O:m:a:T:")) != -1) { while ((carg = getopt(argc, argv, "hAfvcUDSpbE:d:l:L:e:i:r:R:q:3:s:x:t:O:m:a:T:k:M:")) != -1) {
switch (carg) { switch (carg) {
/* ---------------------------- */ /* ---------------------------- */
@ -711,15 +765,31 @@ int main(int argc, char **argv)
/* -------------------- */ /* -------------------- */
case 'p': /* print sets of primers */ case 'p': /* print sets of primers */
/* --------------------------------- */ /* --------------------------------- */
options.print_sets_of_primers = TRUE; //options.print_sets_of_primers = TRUE;
break; break;
/* --------------------------------- */
case 'T': /* Ignore pairs having specificity below this Threshold */
/* --------------------------------- */ /* --------------------------------- */
case 'T': /* Ignore pairs having specificity below this Threshold */ sscanf(optarg,"%f",&(options.specificity_threshold));
break;
/* --------------------------------- */ /* --------------------------------- */
sscanf(optarg,"%f",&(options.specificity_threshold)); case 'M': /* Max link percentage for graph */
/* --------------------------------- */
sscanf(optarg,"%f",&(options.max_links_percent));
break; break;
/* --------------------------------- */
case 'k': /* links count */
/* --------------------------------- */
sscanf(optarg,"%d",&(options.links_cnt));
break;
case 'b':
options.filter_on_links = TRUE;
break;
case '?': /* bad option */ case '?': /* bad option */
/* -------------------- */ /* -------------------- */
errflag++; errflag++;
@ -779,7 +849,11 @@ int main(int argc, char **argv)
words = lookforStrictPrimer(seqdb,seqdbsize,insamples,&options); words = lookforStrictPrimer(seqdb,seqdbsize,insamples,&options);
fprintf(stderr,"\n Strict primer count : %d\n",words->size); fprintf(stderr,"\n Strict primer count : %d\n",words->size);
/*/TR Testing
fprintf(stderr,"\nReducing for debugging\n");
words = reduce_words_to_debug (words, &options);
///*/
// options.filtering=FALSE; // options.filtering=FALSE;
// words2= lookforStrictPrimer(seqdb,seqdbsize,insamples,&options); // words2= lookforStrictPrimer(seqdb,seqdbsize,insamples,&options);
// fprintf(stderr,"\n Strict primer count : %d\n",words2->size); // fprintf(stderr,"\n Strict primer count : %d\n",words2->size);
@ -802,7 +876,6 @@ int main(int argc, char **argv)
for (i=0; i<MINI(10,words->size); i++) for (i=0; i<MINI(10,words->size); i++)
fprintf(stderr," + Primer : %s sequence count : %d\n",ecoUnhashWord(words->words[i],options.primer_length),words->strictcount[i]); fprintf(stderr," + Primer : %s sequence count : %d\n",ecoUnhashWord(words->words[i],options.primer_length),words->strictcount[i]);
fprintf(stderr,"\nEncoding sequences for fuzzy pattern matching...\n"); fprintf(stderr,"\nEncoding sequences for fuzzy pattern matching...\n");
for (i=0;i<seqdbsize;i++) for (i=0;i<seqdbsize;i++)
{ {
@ -812,7 +885,13 @@ int main(int argc, char **argv)
ECOFREE(words->strictcount,"Free strict primer count table"); ECOFREE(words->strictcount,"Free strict primer count table");
primers = lookforAproxPrimer(seqdb,seqdbsize,insamples,words,&options); if (options.error_max == 0)//aho, if(options.error_max == 0 && 0) old
primers = ahoc_lookforStrictPrimers (seqdb,seqdbsize,insamples,words,&options);
else
primers = lookforAproxPrimer(seqdb,seqdbsize,insamples,words,&options);
//for (i=0; i<primers->size; i++)
// print_wordwith_positions (primers->primers[i], seqdbsize, &options);
ECOFREE(words->words,"Free strict primer table"); ECOFREE(words->words,"Free strict primer table");
ECOFREE(words,"Free strict primer structure"); ECOFREE(words,"Free strict primer structure");
@ -833,3 +912,108 @@ int main(int argc, char **argv)
return 0; return 0;
} }
#define DEBUG_WORDS_CNT 14

/*
 * Debug helper: replaces a strict word table by the subset of its words that
 * appear in the hard-coded list below (a word matches either as decoded by
 * ecoUnhashWord or after ecoComplementWord).
 *
 * words   - table to reduce; it is released by this function (its words
 *           array, its strictcount array and the structure itself).
 * options - supplies primer_length used to decode the hashed words.
 *
 * Returns a newly allocated table. Only the words actually found are stored,
 * packed at the front, and `size` is the number of stored words, so every
 * entry in [0, size) is initialised. A per-list-entry hit count is printed on
 * stderr.
 */
pwordcount_t reduce_words_to_debug (pwordcount_t words, poptions_t options)
{
	uint32_t i, k;
	uint32_t kept = 0;
	pwordcount_t new_words;
	char *rwrd;
	char dwrd[64];
	/*char *strict_words[DEBUG_WORDS_CNT] = {"GAGTCTCTGCACCTATCC", "GCAATCCTGAGCCAAATC", "ACCCCTAACCACAACTCA",
	"TCCGAACCGACTGATGTT", "GAAGCTTGGGTGAAACTA", "GGAGAACCAGCTAGCTCT", "GCTGGTTCTCCCCGAAAT",
	"TCGATTTGGTACCGCTCT", "AAAGGAGAGAGAGGGATT", "GGATTGCTAATCCGTTGT", "CCCCCATCGTCTCACTGG",
	"TGAGGCGCAGCAGTTGAC", "GCGCTACGGCGCTGAAGT", "TTTCCTGGGAGTATGGCA"};*/
	const char *strict_words[DEBUG_WORDS_CNT] = {"CTCCGGTCTGAACTCAGA", "TGTTGGATCAGGACATCC", "TAGATAGAAACCGACCTG",
			"TGGTGCAGCCGCTATTAA", "AGATAGAAACTGACCTGG", "TGGTGCAGCCGCTATTAA", "CTAATGGTGCAGCCGCTA",
			"TAGAAACTGACCTGGATT", "AGATAGAAACCGACCTGG", "ATGGTGCAGCCGCTATTA", "ATAGATAGAAACCGACCT",
			"GCCGCTATTAAGGGTTCG", "GGTGCAGCCGCTATTAAG", "TAGAAACTGACCTGGATT"};
	int word_seen[DEBUG_WORDS_CNT];

	new_words = ECOMALLOC(sizeof(wordcount_t),"Cannot allocate memory for word count structure");
	new_words->inseqcount = words->inseqcount;
	new_words->outseqcount = words->outseqcount;
	/* capacity is the size of the debug list; the real size is set once known */
	new_words->strictcount = ECOMALLOC((DEBUG_WORDS_CNT*sizeof(uint32_t)), "Cannot allocate memory for word count table");
	new_words->words = ECOMALLOC(DEBUG_WORDS_CNT*sizeof(word_t), "I cannot allocate memory for debug words");

	for (k = 0; k < DEBUG_WORDS_CNT; k++)
		word_seen[k] = 0;

	for (i=0; i < words->size; i++)
	{
		/* the first decoded string is copied before ecoUnhashWord is called
		 * again (the original code did the same, presumably because the
		 * returned buffer is reused); the copy is bounded by the buffer size */
		rwrd = ecoUnhashWord(words->words[i],options->primer_length);
		snprintf (dwrd, sizeof dwrd, "%s", rwrd);
		rwrd = ecoUnhashWord(ecoComplementWord(words->words[i],options->primer_length),options->primer_length);

		for (k = 0; k < DEBUG_WORDS_CNT; k++)
		{
			if (strcmp (dwrd, strict_words[k]) == 0) break;
			if (strcmp (rwrd, strict_words[k]) == 0) break;
		}

		if (k < DEBUG_WORDS_CNT)
		{
			/* keep only the first occurrence of each listed word; entries are
			 * packed so that duplicated or missing list entries never leave
			 * an uninitialised slot in the returned table */
			if (word_seen[k] == 0)
			{
				new_words->words[kept] = words->words[i];
				new_words->strictcount[kept] = words->strictcount[i];
				kept++;
			}
			word_seen[k]++;
		}
	}
	new_words->size = kept;

	fprintf (stderr, "Debug Words Info:\n");
	for (k = 0; k < DEBUG_WORDS_CNT; k++)
		fprintf (stderr, "%s:%d\n", strict_words[k], word_seen[k]);

	//clean input words
	ECOFREE(words->words,"Clean word table");
	ECOFREE(words->strictcount,"Clean word count table");
	ECOFREE(words,"Clean word structure");
	return new_words;
}
/*
 * Debug helper: when the primer decodes to the hard-coded target word, prints
 * on stdout one line of direct positions and one line of reverse positions,
 * each formatted as "seqindex:pos[,pos...] " for every sequence with a hit.
 * Any other primer prints nothing.
 */
void print_wordwith_positions (primer_t prm, uint32_t seqdbsize, poptions_t options)
{
	char *target = "GCCTGTTTACCAAAAACA";
	char *decoded;
	uint32_t seq, n;

	decoded = ecoUnhashWord(prm.word,options->primer_length);
	if (strcmp (target, decoded) != 0)
		return;

	printf ("Positions for Word: %s\n", decoded);

	/* direct strand hits */
	for (seq=0; seq<seqdbsize; seq++)
	{
		if (prm.directCount[seq] <= 0)
			continue;

		printf ("%d:", seq);
		if (prm.directCount[seq] == 1)
		{
			/* a single hit is kept in .value */
			printf ("%d", prm.directPos[seq].value);
		}
		else
		{
			/* several hits are kept in the .pointer array */
			for (n=0; n<prm.directCount[seq]; n++)
				printf ("%d,", prm.directPos[seq].pointer[n]);
		}
		printf (" ");
	}
	printf ("\n");

	/* reverse strand hits */
	for (seq=0; seq<seqdbsize; seq++)
	{
		if (prm.reverseCount[seq] <= 0)
			continue;

		printf ("%d:", seq);
		if (prm.reverseCount[seq] == 1)
		{
			printf ("%d", prm.reversePos[seq].value);
		}
		else
		{
			for (n=0; n<prm.reverseCount[seq]; n++)
				printf ("%d,", prm.reversePos[seq].pointer[n]);
		}
		printf (" ");
	}
	printf ("\n");
}

View File

@ -3,7 +3,8 @@ LIBPATH= -Llibapat -LlibecoPCR -Llibecoprimer -Llibthermo
MAKEDEPEND = gcc -D$(MACHINE) -M $(CPPFLAGS) -o $*.d $< MAKEDEPEND = gcc -D$(MACHINE) -M $(CPPFLAGS) -o $*.d $<
CC=gcc CC=gcc
CFLAGS= -W -Wall -O5 -m64 CFLAGS= -W -Wall -m64 -g
#CFLAGS= -W -Wall -O5 -m64 -g
#CFLAGS= -W -Wall -O0 -m64 -g #CFLAGS= -W -Wall -O0 -m64 -g
#CFLAGS= -W -Wall -O5 -fast -g #CFLAGS= -W -Wall -O5 -fast -g

View File

@ -15,7 +15,8 @@ SOURCES = goodtaxon.c \
taxstats.c \ taxstats.c \
apat_search.c \ apat_search.c \
filtering.c \ filtering.c \
PrimerSets.c PrimerSets.c \
ahocorasick.c
SRCS=$(SOURCES) SRCS=$(SOURCES)

File diff suppressed because it is too large Load Diff

View File

@ -13,6 +13,8 @@ typedef struct {
float set_lmean; float set_lmean;
float set_lcov; float set_lcov;
float set_score; float set_score;
int32_t set_intaxa;
int32_t set_wi_cnt;
}pairset; }pairset;
typedef struct{ typedef struct{
@ -33,9 +35,24 @@ typedef struct{
void add_pair_in_set (pairset *pair_set, int32_t pset_idx, int32_t prb_idx, SetParams *pparams); void add_pair_in_set (pairset *pair_set, int32_t pset_idx, int32_t prb_idx, SetParams *pparams);
void get_next_pair_options (int *pair_wi_count_sorted_ids, pairset *pair_set, SetParams *pparams); void get_next_pair_options (int *pair_wi_count_sorted_ids, pairset *pair_set, SetParams *pparams);
float get_links_distribution (int prb_idx, pairset *prob_set, SetParams *pparams); float get_links_distribution (int prb_idx, pairset *prob_set, SetParams *pparams);
pairset build_primers_set (ppair_t* sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, pairset build_primers_set_greedy_spc (SetParams *pparams);
poptions_t options);
void get_set_mean_cov_stats (pairset *prob_set, SetParams *pparams); void get_set_mean_cov_stats (pairset *prob_set, SetParams *pparams);
void some_other_set_possibilities (pairset *pair_set, void some_other_set_possibilities (pairset *pair_set,
ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options); ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options);
void sets_by_SimulatedAnealing (pairset *pair_set,
ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options);
void sets_by_TabuSearch (pairset *pair_set,
ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options);
pairset * sets_by_BruteForce (ppair_t * sortedpairs,
int32_t sorted_count, pecodnadb_t seqdb, poptions_t options);
pairset * extend_set_randomly (pairset *pair_set, SetParams *params, int extend_to_cnt);
void build_and_print_sets (ppair_t * sortedpairs, int32_t sorted_count, pecodnadb_t seqdb, poptions_t options);
int32_t get_next_option_increasing_cov (pairset *pair_set, SetParams *pparams);
void reset_set_props (pairset *pair_set, SetParams *pparams);
void primers_graph_graphviz (ppair_t * sortedpairs,
int32_t sorted_count, poptions_t options);
size_t primers_changeSortedArray (ppair_t ** pairs,
size_t sorted_count, poptions_t options);
size_t primers_filterWithGivenLinks (ppair_t ** pairs,
size_t sorted_count, poptions_t options);
#endif #endif

479
src/libecoprimer/ahocorasick.c Executable file
View File

@ -0,0 +1,479 @@
/*
* ahocorasick.c
*
* Created on: 26 march 2011
* Author: tiayyba
*/
#include <inttypes.h>
#include "hashencoder.h"
#include "ahocorasick.h"
/* Forward declaration of the debug routine, defined later in this file, that
 * writes the keyword tree to a Graphviz file. */
void ahoc_graphKeywordTree (aho_state *root);
/* Root of the tree being dumped; set by ahoc_graphKeywordTree and read by
 * ahoc_graphPrintNodesLinks to skip fail links that point at the root. */
aho_state *groot = NULL; //just for graph testing
/* BASEATINDEX(w, l, i): 2-bit code found (l-i)*2 bits above the least
 * significant bit of w. With i running from 1 to l this walks an l-symbol
 * word from its most significant symbol (i == 1) down to its least
 * significant one (i == l). */
#define BASEATINDEX(w, l, i) (uint8_t)((((w)&(0x3LLU<<(((l)-(i))*2)))>>(((l)-(i))*2)) & 0x3LLU)
/*
 * Appends the output element (idx, isdirect) to the output set of `node`,
 * growing the set by one slot. A NULL node is ignored.
 */
void ahoc_addOutputElement (aho_state *node, bool_t isdirect, uint32_t idx)
{
	uint32_t slot;

	if (node == NULL)
		return;

	slot = node->output.count;
	if (slot != 0)
	{
		/* set already exists: extend it by one element */
		node->output.out_set = ECOREALLOC(node->output.out_set, (node->output.count+1)*sizeof(aho_output),
				"Cannot allocate memory for aho-corasick state output element");
	}
	else
	{
		/* first element of this state */
		node->output.out_set = ECOMALLOC(sizeof(aho_output),
				"Cannot allocate memory for aho-corasick state output element");
	}

	node->output.out_set[slot].isdirect = isdirect;
	node->output.out_set[slot].wordidx = idx;
	node->output.count = slot + 1;
}
/*
 * Returns TRUE when the output set of `node` already holds an element with
 * the same word index and the same direct/reverse flag as `ot`, else FALSE.
 */
bool_t ahoc_isOutputIn (aho_state *node, aho_output ot)
{
	uint32_t n = 0;

	while (n < node->output.count)
	{
		const aho_output *candidate = &node->output.out_set[n];

		if (candidate->wordidx == ot.wordidx && candidate->isdirect == ot.isdirect)
			return TRUE;
		n++;
	}
	return FALSE;
}
/*
 * Set union of the two output sets, stored in node1: every element of node2
 * that node1 does not already hold is appended to node1. node2 is unchanged.
 */
void ahoc_unionOutputElements (aho_state *node1, aho_state *node2)
{
	uint32_t n;

	for (n = 0; n < node2->output.count; n++)
	{
		if (ahoc_isOutputIn (node1, node2->output.out_set[n]) != FALSE)
			continue;	/* already present in node1 */

		ahoc_addOutputElement (node1,
				node2->output.out_set[n].isdirect,
				node2->output.out_set[n].wordidx);
	}
}
/*
 * Inserts the word `w` (options->primer_length symbols, read with
 * BASEATINDEX from position 1 to primer_length) into the keyword tree rooted
 * at `root`, creating the missing states along the path, and records
 * (idx, isdirect) as output of the state reached by the last symbol.
 * State ids come from a counter shared by all calls.
 */
void ahoc_addKeyword (aho_state *root, word_t w, bool_t isdirect, uint32_t idx, poptions_t options)
{
	static uint32_t state_id = 0;
	aho_state *cursor = root;
	aho_state *child;
	uint32_t depth;
	uint8_t symbol;
	int b;

	for (depth = 1; depth <= options->primer_length; depth++)
	{
		symbol = BASEATINDEX (w, options->primer_length, depth);
		child = cursor->next[symbol];

		if (child == NULL)
		{
			/* no edge for this symbol yet: create and initialise a state */
			child = ECOMALLOC(sizeof(aho_state),
					"Cannot allocate memory for aho-corasick state");
			child->id = ++state_id;
			for (b = 0; b < 4; b++)
				child->next[b] = NULL;
			child->fail = NULL;
			child->output.count = 0;
			cursor->next[symbol] = child;
		}
		cursor = child;
	}

	/* the whole word has been spelled: it is an output of the final state */
	ahoc_addOutputElement (cursor, isdirect, idx);
}
/*
 * Initialises `root` and inserts every word of `words` twice in the tree:
 * once as is (flagged direct) and once as returned by ecoComplementWord
 * (flagged reverse), both with the word's index in the table. Finally every
 * symbol without an edge out of the root is made to loop on the root.
 * A NULL root is ignored.
 */
void ahoc_buildKeywordTree (aho_state *root, pwordcount_t words, poptions_t options)
{
	uint32_t n;
	word_t direct;
	word_t reverse;

	if (root == NULL)
		return;

	/* empty root state */
	root->id = 0;
	root->fail = NULL;
	root->output.count = 0;
	for (n = 0; n < 4; n++)
		root->next[n] = NULL;

	/* each word is a pattern, in both orientations */
	for (n = 0; n < words->size; n++)
	{
		direct = WORD(words->words[n]);
		ahoc_addKeyword (root, direct, TRUE, n, options);

		reverse = ecoComplementWord(direct,options->primer_length);
		ahoc_addKeyword (root, reverse, FALSE, n, options);
	}

	/* symbols that start no pattern keep the automaton on the root */
	for (n = 0; n < 4; n++)
	{
		if (root->next[n] == NULL)
			root->next[n] = root;
	}
}
/*
 * Appends `node` at the tail of the FIFO queue. NULL nodes are not queued.
 */
void ahoc_enqueue (aho_queue *ahoqueue, aho_state *node)
{
	queue_node *cell;

	if (node == NULL)
		return;

	cell = ECOMALLOC(sizeof(queue_node),
			"Cannot allocate memory for aho-corasick queue node");
	cell->next = NULL;
	cell->state_node = node;

	if (ahoqueue->first != NULL)
	{
		/* non-empty queue: link after the current tail */
		ahoqueue->last->next = cell;
	}
	else
	{
		/* empty queue: the new cell is also the head */
		ahoqueue->first = cell;
	}
	ahoqueue->last = cell;
}
/*
 * Removes the head of the FIFO queue and returns the state it carried, or
 * NULL when the queue is empty. The queue cell is released; `last` is left
 * untouched (ahoc_enqueue only looks at `first` to detect an empty queue).
 */
aho_state *ahoc_dequeue (aho_queue *ahoqueue)
{
	queue_node *head = ahoqueue->first;
	aho_state *state;

	if (head == NULL)
		return NULL;

	state = head->state_node;
	ahoqueue->first = head->next;
	ECOFREE (head, "Cannot free memory for aho-corasick queue node");
	return state;
}
/*
 * Computes the fail link of every state of the keyword tree with a
 * breadth-first traversal, and merges into each state's output set the output
 * set of the state its fail link points to.
 * The fallback search relies on the root having an edge for each of the four
 * symbols (ahoc_buildKeywordTree adds self loops for the missing ones).
 */
void ahoc_updateForFailAndOutput (aho_state *root)
{
	aho_queue pending;
	aho_state *parent;
	aho_state *child;
	aho_state *fallback;
	int32_t sym;

	pending.first = NULL;
	pending.last = NULL;

	/* the four symbols are A=0, C=1, G=2 and T=3;
	 * states directly below the root fail to the root */
	for (sym = 0; sym < 4; sym++)
	{
		child = root->next[sym];
		if (child == NULL || child == root)
			continue;
		child->fail = root;
		ahoc_enqueue (&pending, child);
	}

	while (pending.first != NULL)
	{
		parent = ahoc_dequeue (&pending);

		for (sym = 0; sym < 4; sym++)
		{
			child = parent->next[sym];
			if (child == NULL)
				continue;

			ahoc_enqueue (&pending, child);

			/* climb the parent's fail chain up to a state with an edge on sym */
			for (fallback = parent->fail; fallback->next[sym] == NULL; fallback = fallback->fail)
				;

			child->fail = fallback->next[sym];
			ahoc_unionOutputElements (child, child->fail);
		}
	}
}
/*
 * Releases `node` and everything reachable through its next[] edges: the
 * children first, then the node's output set (if any), then the node itself.
 */
void ahoc_freeKeywordTree (aho_state *node)
{
	int branch = 0;

	while (branch < 4)
	{
		if (node->next[branch] != NULL)
			ahoc_freeKeywordTree (node->next[branch]);
		branch++;
	}

	if (node->output.count > 0)
		ECOFREE (node->output.out_set, "Free failed for node output");

	ECOFREE (node, "Free failed for node");
}
pprimercount_t ahoc_lookforStrictPrimers (pecodnadb_t database, uint32_t seqdbsize,uint32_t exampleCount,
pwordcount_t words,poptions_t options)
{
aho_state automaton_root;
aho_state *curr_state;
//uint32_t inSequenceQuorum;
uint32_t outSequenceQuorum;
pprimer_t data;
pprimercount_t primers;
uint32_t i, j, k;
int32_t pos;
uint32_t lmax;
char *base;
int8_t code;
uint32_t goodPrimers=0;
static int iii=0;
//inSequenceQuorum = (uint32_t)floor((float)exampleCount * options->sensitivity_quorum);
outSequenceQuorum = (uint32_t)floor((float)(seqdbsize-exampleCount) * options->false_positive_quorum);
//fprintf(stderr," Primers should be at least present in %d/%d example sequences\n",inSequenceQuorum,exampleCount);
fprintf(stderr," Primers should not be present in more than %d/%d counterexample sequences\n",outSequenceQuorum,(seqdbsize-exampleCount));
data = ECOMALLOC(words->size * sizeof(primer_t),
"Cannot allocate memory for fuzzy matching results");
for (i=0; i < words->size; i++)
{
data[i].word=WORD(words->words[i]);
data[i].inexample = 0;
data[i].outexample= 0;
data[i].directCount=ECOMALLOC(seqdbsize * sizeof(uint32_t),
"Cannot allocate memory for primer position");
data[i].directPos = ECOMALLOC(seqdbsize * sizeof(poslist_t),
"Cannot allocate memory for primer position");
data[i].reverseCount=ECOMALLOC(seqdbsize * sizeof(uint32_t),
"Cannot allocate memory for primer position");
data[i].reversePos = ECOMALLOC(seqdbsize * sizeof(poslist_t),
"Cannot allocate memory for primer position");
}
//build keywords automaton
ahoc_buildKeywordTree (&automaton_root, words, options);
//set fail links and output sets
ahoc_updateForFailAndOutput (&automaton_root);
//debug; print keywordtree in a gv file
//ahoc_graphKeywordTree (&automaton_root);
//loop on each sequence for its each base and find words
for (i=0; i < seqdbsize; i++)
{
if(database[i]->SQ_length <= options->primer_length) continue;
lmax = database[i]->SQ_length;
if (!options->circular)
lmax += options->primer_length-1;
curr_state = &automaton_root;
for (j=0,base=database[i]->SQ; j<lmax; j++,base++)
{
if (i==(uint32_t)database[i]->SQ_length) base=database[i]->SQ;
//code = encoder[(*base) - 'A'];
code = *base;
//if (iii++ < 30)
// fprintf (stderr, "%d:%d,", *base, code);
if (code < 0 || code > 3)
{
//if error char, start from root for next character
//+forget any incomplete words
curr_state = &automaton_root;
continue;
}
while (curr_state->next[code] == NULL) curr_state = curr_state->fail;
curr_state = curr_state->next[code];
//start position of primer is options->primer_length-1 chars back
pos = j-options->primer_length+1;
if (pos < 0) pos = database[i]->SQ_length+pos;
//set output, if there is some output on this state then
//+all words in the output set complete here, so increment their
//+found properties for current sequence
for (k=0; k<curr_state->output.count; k++)
{
if (curr_state->output.out_set[k].isdirect)
data[curr_state->output.out_set[k].wordidx].directCount[i]++;
else
data[curr_state->output.out_set[k].wordidx].reverseCount[i]++;
if (options->no_multi_match)
{
if ((data[curr_state->output.out_set[k].wordidx].directCount[i] +
data[curr_state->output.out_set[k].wordidx].reverseCount[i]) > 1)
//since multimach not allowd, set an indication on 1st seq position that
//+ a multimatch was found, so that this word will be filtered out
//+ and because of first postion we wont have to search the whole array
//+ to find if it voilated nomultimatch constraint for some seq
data[curr_state->output.out_set[k].wordidx].directCount[0] = 2;
else
{
if (curr_state->output.out_set[k].isdirect)
//direct word found on jth position of ith sequence
data[curr_state->output.out_set[k].wordidx].directPos[i].value = (uint32_t)pos;
else
//reverse word found on jth position of ith sequence
data[curr_state->output.out_set[k].wordidx].reversePos[i].value = (uint32_t)pos;
}
}
else
{
//okay multi match allowed
if (curr_state->output.out_set[k].isdirect)
{
if (data[curr_state->output.out_set[k].wordidx].directCount[i] == 1)
data[curr_state->output.out_set[k].wordidx].directPos[i].value = (uint32_t)pos;
else
{
//need to create or extend the positions list
if (data[curr_state->output.out_set[k].wordidx].directCount[i] == 2)
{
//for second element, first was put in .value, so dont forget to copy that in the array too
data[curr_state->output.out_set[k].wordidx].directPos[i].pointer = ECOMALLOC(2 * sizeof(uint32_t),
"Cannot allocate memory for primer position");
data[curr_state->output.out_set[k].wordidx].directPos[i].pointer[0] = data[curr_state->output.out_set[k].wordidx].directPos[i].value;
data[curr_state->output.out_set[k].wordidx].directPos[i].pointer[1] = (uint32_t)pos;
}
else
{
//for third or greater element
data[curr_state->output.out_set[k].wordidx].directPos[i].pointer = ECOREALLOC(data[curr_state->output.out_set[k].wordidx].directPos[i].pointer,
data[curr_state->output.out_set[k].wordidx].directCount[i] * sizeof(uint32_t),
"Cannot allocate memory for primer position");
data[curr_state->output.out_set[k].wordidx].directPos[i].pointer[data[curr_state->output.out_set[k].wordidx].directCount[i]-1] = (uint32_t)pos;
}
}
}
else
{
if (data[curr_state->output.out_set[k].wordidx].reverseCount[i] == 1)
data[curr_state->output.out_set[k].wordidx].reversePos[i].value = (uint32_t)pos;
else
{
//need to create or extend the positions list
if (data[curr_state->output.out_set[k].wordidx].reverseCount[i] == 2)
{
//for second element, first was put in .value, so dont forget to copy that in the array too
data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer = ECOMALLOC(2 * sizeof(uint32_t),
"Cannot allocate memory for primer position");
data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer[0] = data[curr_state->output.out_set[k].wordidx].reversePos[i].value;
data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer[1] = (uint32_t)pos;
}
else
{
//for third or greater element
data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer = ECOREALLOC(data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer,
data[curr_state->output.out_set[k].wordidx].reverseCount[i] * sizeof(uint32_t),
"Cannot allocate memory for primer position");
data[curr_state->output.out_set[k].wordidx].reversePos[i].pointer[data[curr_state->output.out_set[k].wordidx].reverseCount[i]-1] = (uint32_t)pos;
}
}
}
}
//dont forget to increment inexample or outexample count, but only once for a sequence
if ((data[curr_state->output.out_set[k].wordidx].directCount[i] +
data[curr_state->output.out_set[k].wordidx].reverseCount[i]) == 1)
{
if (database[i]->isexample)
data[curr_state->output.out_set[k].wordidx].inexample++;
else
data[curr_state->output.out_set[k].wordidx].outexample++;
}
}
}
}
//Only thing that remains is to remove the failed words
for (i=0,j=0; i<words->size; i++)
{
fprintf(stderr,"Primers %5d/%lld analyzed => sequence : %s in %d example and %d counterexample sequences \r",
i+1,words->size,ecoUnhashWord(data[i].word,options->primer_length),
data[i].inexample,data[i].outexample);
//if (data[i].inexample < inSequenceQuorum || (data[i].directCount[0] == 2 && options->no_multi_match))
if (data[i].directCount[0] == 2 && options->no_multi_match)
{
//bad word, delete from the array
for (k=0; k<seqdbsize; k++)
{
if (data[i].directCount[k] > 1)
ECOFREE (data[i].directPos[k].pointer, "Cannot free position pointer.");
if (data[i].reverseCount[k] > 1)
ECOFREE (data[i].reversePos[k].pointer, "Cannot free position pointer.");
}
ECOFREE (data[i].directCount, "Cannot free position pointer.");
ECOFREE (data[i].directPos, "Cannot free position pointer.");
ECOFREE (data[i].reverseCount, "Cannot free position pointer.");
ECOFREE (data[i].reversePos, "Cannot free position pointer.");
}
else
{
//data[i].good = data[i].inexample >= inSequenceQuorum && data[i].outexample <= outSequenceQuorum;
data[i].good = data[i].outexample <= outSequenceQuorum;
goodPrimers+=data[i].good? 1:0;
if (j < i)
data[j] = data[i];
j++;
}
}
fprintf(stderr,"\n\nOn %lld analyzed primers %d respect quorum conditions\n",words->size,goodPrimers);
fprintf(stderr,"Conserved primers for further analysis : %d/%lld\n",j,words->size);
primers = ECOMALLOC(sizeof(primercount_t),"Cannot allocate memory for primer table");
primers->primers=ECOREALLOC(data,
j * sizeof(primer_t),
"Cannot reallocate memory for fuzzy matching results");
primers->size=j;
//free memory of keyword table
for (i=0; i<4; i++)
if (automaton_root.next[i] != &automaton_root)
ahoc_freeKeywordTree (automaton_root.next[i]);
return primers;
}
/*
 * Writes to gfile the Graphviz node statement of `node` (labelled with its
 * id and its output elements, 'd' for direct and 'r' for reverse), then does
 * the same for every child. Self loops are not followed.
 */
void ahoc_graphPrintNodesInfo (aho_state *node, FILE* gfile)
{
	uint32_t n;
	aho_state *child;

	fprintf (gfile, "\"%d\"[\n", node->id);
	fprintf (gfile, "label=\"%d\\n", node->id);

	n = 0;
	while (n < node->output.count)
	{
		fprintf (gfile, "%d%c,", node->output.out_set[n].wordidx, node->output.out_set[n].isdirect?'d':'r');
		n++;
	}
	fprintf (gfile, "\"\n];\n");

	for (n = 0; n < 4; n++)
	{
		child = node->next[n];
		if (child == NULL || child == node)
			continue;
		ahoc_graphPrintNodesInfo (child, gfile);
	}
}
/*
 * Writes to gfile the Graphviz edges leaving `node`: one labelled edge per
 * child and, for the first 40 visited nodes only, a red edge for the fail
 * link unless it is NULL or points at groot. Then recurses into the children.
 * Self loops are neither printed nor followed.
 */
void ahoc_graphPrintNodesLinks (aho_state *node, FILE* gfile)
{
	static int visited=0;
	uint32_t n;
	aho_state *child;

	for (n = 0; n < 4; n++)
	{
		child = node->next[n];
		if (child == NULL || child == node)
			continue;
		fprintf (gfile, "\"%d\" -> \"%d\" [\n", node->id, child->id);
		fprintf (gfile, "label=\"%c\"\n];\n", "ACGT"[n]);
	}

	/* the counter is bumped on every call, whatever the fail link is */
	if (visited++ < 40 && node->fail != NULL && node->fail != groot)
	{
		fprintf (gfile, "\"%d\" -> \"%d\" [\n", node->id, node->fail->id);
		fprintf (gfile, "color= \"red\"\n];\n");
	}

	for (n = 0; n < 4; n++)
	{
		child = node->next[n];
		if (child == NULL || child == node)
			continue;
		ahoc_graphPrintNodesLinks (child, gfile);
	}
}
/*
 * Debug helper: dumps the keyword tree rooted at `root` as a Graphviz digraph
 * in the file "keywordtree.gv" of the current directory (nodes first, then
 * links). Records `root` in groot for the link printer.
 * If the file cannot be opened, a message is printed on stderr and nothing is
 * written.
 */
void ahoc_graphKeywordTree (aho_state *root)
{
	FILE *gfile;

	groot=root;

	gfile = fopen ("keywordtree.gv", "w");
	if (gfile == NULL)
	{
		/* fprintf/fclose on a NULL stream would be undefined behaviour */
		fprintf (stderr, "Cannot open keywordtree.gv for writing\n");
		return;
	}

	fprintf (gfile, "digraph keywordtree {\n");
	ahoc_graphPrintNodesInfo (root, gfile);
	ahoc_graphPrintNodesLinks (root, gfile);
	fprintf (gfile, "}\n");
	fclose(gfile);
}

43
src/libecoprimer/ahocorasick.h Executable file
View File

@ -0,0 +1,43 @@
/*
* ahocorasick.h
*
* Created on: 26 march 2011
* Author: tiayyba
*/
#ifndef H_ahocorasick
#define H_ahocorasick
#include "ecoprimer.h"
/* One element of a state's output set: identifies a word that ends at that state. */
typedef struct aho_output_t{
uint32_t wordidx; //index of the word in the strict word table (the index is stored instead of the word itself)
bool_t isdirect; //both direct and reverse words are searched, so this tells which of the two this element is
}aho_output;
/* Output set of a state: a counted array of output elements. */
typedef struct aho_output_count_t{
uint32_t count; //number of elements in out_set
aho_output *out_set; //array of count elements; only allocated once count is greater than 0
}aho_output_count;
/* One state (node) of the Aho-Corasick keyword tree. */
typedef struct aho_state_t{
int32_t id; //state number, 0 for the root
struct aho_state_t *next[4]; //for labels A=0,C=1,G=2 and T=3
struct aho_state_t *fail; //state to fall back to when there is no edge for the current symbol
aho_output_count output; //words recognised when this state is reached
}aho_state;
/* Cell of the singly linked FIFO queue of states. */
typedef struct queue_node_t {
aho_state *state_node; //state carried by this cell
struct queue_node_t *next; //next cell in the queue, NULL for the last one
}queue_node;
/* FIFO queue of states; it is empty when first is NULL. */
typedef struct{
queue_node *first; //head: next cell to be dequeued
queue_node *last; //tail: cell after which new cells are linked
}aho_queue;
/* Exact search of the strict words (direct and reverse) in the seqdbsize
 * sequences of database with an Aho-Corasick automaton; returns the table of
 * conserved primers with their per sequence counts and positions. */
pprimercount_t ahoc_lookforStrictPrimers (pecodnadb_t database, uint32_t seqdbsize,uint32_t exampleCount,
pwordcount_t words,poptions_t options);
#endif /* H_ahocorasick */

View File

@ -176,6 +176,7 @@ typedef struct {
int *wellIdentifiedSeqs; //< an array having elements equla to total seqs int *wellIdentifiedSeqs; //< an array having elements equla to total seqs
// values are either 0 or 1, if seq is well identified // values are either 0 or 1, if seq is well identified
// its 1 else 0 // its 1 else 0
int *coveredSeqs; //< an array having elements equal to total seqs, 1 if seq is covered else 0
// these statistics are relative to inexample sequences // these statistics are relative to inexample sequences
@ -291,6 +292,9 @@ typedef struct {
PNNParams pnparm; PNNParams pnparm;
bool_t print_sets_of_primers; bool_t print_sets_of_primers;
float specificity_threshold; float specificity_threshold;
int links_cnt;
float max_links_percent;
bool_t filter_on_links;
} options_t, *poptions_t; } options_t, *poptions_t;
typedef ecoseq_t **pecodnadb_t; typedef ecoseq_t **pecodnadb_t;
@ -350,7 +354,7 @@ int32_t getrankdbstats(pecodnadb_t seqdb,
uint32_t seqdbsize, uint32_t seqdbsize,
ecotaxonomy_t *taxonomy, ecotaxonomy_t *taxonomy,
poptions_t options); poptions_t options);
float taxonomycoverage(ppair_t pair, poptions_t options); float taxonomycoverage(ppair_t pair, poptions_t options, pecodnadb_t seqdb,uint32_t seqdbsize);
char ecoComplementChar(char base); char ecoComplementChar(char base);
void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize); void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize);

View File

@ -114,6 +114,8 @@ static int32_t *ecoFilteringHashSequence(int32_t *dest,
error<<= 1; error<<= 1;
error&=ERRORMASK(FWORDSIZE); error&=ERRORMASK(FWORDSIZE);
//code = -1;
//if((*base) >= 'A' && (*base) <= 'Z')
code = encoder[(*base) - 'A']; code = encoder[(*base) - 'A'];
if (code <0) if (code <0)
{ {
@ -154,7 +156,7 @@ int32_t *filteringSeq(pecodnadb_t database, uint32_t seqdbsize,
for (i=0;i<seqdbsize;i++) for (i=0;i<seqdbsize;i++)
{ {
if (database[i]->isexample) if (database[i]->isexample && database[i]->SQ_length > options->primer_length)
{ {
j++; j++;
wordscount=ecoFilteringHashSequence(wordscount, wordscount=ecoFilteringHashSequence(wordscount,

View File

@ -179,7 +179,7 @@ static void buildPrimerPairsForOneSeq(uint32_t seqid,
uint32_t i,j,k; uint32_t i,j,k;
uint32_t matchcount=0; uint32_t matchcount=0;
pprimermatch_t matches = NULL; pprimermatch_t matches = NULL;
primermatchcount_t seqmatchcount; //primermatchcount_t seqmatchcount;
ppair_t pcurrent; ppair_t pcurrent;
pair_t current; pair_t current;
pprimer_t wswp; pprimer_t wswp;
@ -189,9 +189,9 @@ static void buildPrimerPairsForOneSeq(uint32_t seqid,
//char prmr[50]; //char prmr[50];
//float mtemp; //float mtemp;
word_t w1, w1a, omask = (0x1L << (options->strict_three_prime*2)) -1; word_t w1, w1a, omask = (0x1L << (options->strict_three_prime*2)) -1;
word_t w2, w2a, wtmp; word_t w2, w2a;//, wtmp;
uint32_t bp1,bp2; uint32_t bp1,bp2;
//prmr[options->primer_length] = '\0'; //prmr[options->primer_length] = '\0';
for (i=0;i < primers->size; i++) for (i=0;i < primers->size; i++)
@ -252,16 +252,17 @@ static void buildPrimerPairsForOneSeq(uint32_t seqid,
{ {
// For all primers matching the sequence // For all primers matching the sequence
//for(j=i+1; /*for(j=i+1;
// (j<matchcount) (j<matchcount)
// && ((distance=matches[j].position - matches[i].position - options->primer_length) < options->lmax); && ((distance=matches[j].position - matches[i].position - options->primer_length) < options->lmax);
// j++ j++
// ) )//*/
for (j=i+1; j<matchcount; j++) for (j=i+1; j<matchcount; j++)
{ {
if (matches[j].position - matches[i].position <= options->primer_length) continue; if (matches[j].position - matches[i].position <= options->primer_length) continue;
distance = matches[j].position - matches[i].position - options->primer_length; distance = matches[j].position - matches[i].position - options->primer_length;
if (distance >= options->lmax) break; if (distance >= options->lmax) break;
// For all not too far primers // For all not too far primers
@ -269,9 +270,7 @@ static void buildPrimerPairsForOneSeq(uint32_t seqid,
&& (distance > options->lmin) && (distance > options->lmin)
) )
{ {
// If possible primer pair // If possible primer pair
current.p1 = matches[i].primer; current.p1 = matches[i].primer;
current.asdirect1=matches[i].strand; current.asdirect1=matches[i].strand;
current.p2 = matches[j].primer; current.p2 = matches[j].primer;
@ -456,7 +455,6 @@ static void buildPrimerPairsForOneSeq(uint32_t seqid,
} }
} }
} }
pairs->count=paircount; pairs->count=paircount;
} }

View File

@ -108,10 +108,11 @@ void addSeqToWordCountTable(pwordcount_t table, uint32_t wordsize, uint32_t circ
table->inseqcount++; table->inseqcount++;
//fprintf (stderr, "\nOldAddress: %x", table->strictcount);
table->strictcount = ECOREALLOC(table->strictcount,buffersize*sizeof(uint32_t), table->strictcount = ECOREALLOC(table->strictcount,(buffersize+5000)*sizeof(uint32_t),
"Cannot allocate memory to extend example word count table"); "Cannot allocate memory to extend example word count table");
//fprintf (stderr, " NewAddress: %x\n", table->strictcount);
for (i=table->size; i < buffersize; i++) for (i=table->size; i < buffersize; i++)
table->strictcount[i]=1; table->strictcount[i]=1;
@ -172,7 +173,7 @@ pwordcount_t lookforStrictPrimer(pecodnadb_t database, uint32_t seqdbsize,
for (i=0;i<seqdbsize;i++) for (i=0;i<seqdbsize;i++)
{ {
if (database[i]->isexample) if (database[i]->isexample && database[i]->SQ_length > options->primer_length)
{ {
if (first) if (first)

View File

@ -6,10 +6,46 @@
*/ */
#include <search.h> #include <search.h>
//void tdestroy (void *root, void (*free_node)(void *nodep));
#include "ecoprimer.h" #include "ecoprimer.h"
static int cmptaxon(const void *t1, const void* t2); static int cmptaxon(const void *t1, const void* t2);
void **tree_root = NULL;
int delete_passes = 0;
/*
 * twalk() action used to empty a tsearch() tree: counts every visit in
 * delete_passes and deletes the visited node when it is a leaf (only when
 * tree_root points at the tree's root variable).
 *
 * twalk() hands over the address of the tree node, whose first member is the
 * stored key; tdelete() expects that key, so the node is dereferenced before
 * the call (passing the node address itself never matches any stored key).
 *
 * NOTE(review): this removes nodes while twalk() is still traversing the
 * tree - confirm that the C library in use tolerates it.
 */
void delete_twalkaction (const void *node, VISIT order, int level)
{
	(void)level;

	switch (order)
	{
	case preorder:
	case postorder:
	case endorder:
		delete_passes++;
		break;
	case leaf:
		if (tree_root)
			tdelete (*(void * const *)node, tree_root,cmptaxon);
		delete_passes++;
		break;
	default:
		break;
	}
}
void free_tree_nodes (void *tree)
{
while (1)
{
delete_passes = 0;
twalk (tree, delete_twalkaction);
if (delete_passes <= 1) break;
}
}
static int cmptaxon(const void *t1, const void* t2) static int cmptaxon(const void *t1, const void* t2)
{ {
const size_t taxid1=(size_t)t1; const size_t taxid1=(size_t)t1;
@ -35,7 +71,12 @@ int32_t counttaxon(int32_t taxid)
if (taxid==-1) if (taxid==-1)
{ {
if (taxontree) if (taxontree)
{
tree_root = (void **)&taxontree;
//free_tree_nodes (taxontree);
ECOFREE(taxontree,"Free taxon tree"); ECOFREE(taxontree,"Free taxon tree");
tree_root = NULL;
}
taxontree=NULL; taxontree=NULL;
taxoncount=0; taxoncount=0;
return 0; return 0;
@ -97,22 +138,30 @@ int32_t getrankdbstats(pecodnadb_t seqdb, uint32_t seqdbsize, ecotaxonomy_t *tax
} }
float taxonomycoverage(ppair_t pair, poptions_t options) float taxonomycoverage(ppair_t pair, poptions_t options, pecodnadb_t seqdb,uint32_t seqdbsize)
{ {
int32_t seqcount; int32_t seqcount;
int32_t i; int32_t i;
int32_t incount=0; int32_t incount=0;
int32_t outcount=0; int32_t outcount=0;
uint32_t j;
memset (pair->coveredSeqs, 0, seqdbsize*sizeof (int));
seqcount=pair->pcr.ampcount; seqcount=pair->pcr.ampcount;
counttaxon(-1); counttaxon(-1);
for (i=0; i < seqcount; i++) for (i=0; i < seqcount; i++)
if (pair->pcr.amplifias[i].sequence->isexample if (pair->pcr.amplifias[i].sequence->isexample
&& pair->pcr.amplifias[i].sequence->ranktaxonid > 0 ) && pair->pcr.amplifias[i].sequence->ranktaxonid > 0 )
{
incount = counttaxon(pair->pcr.amplifias[i].sequence->ranktaxonid); incount = counttaxon(pair->pcr.amplifias[i].sequence->ranktaxonid);
for (j=0; j<seqdbsize; j++)
if (pair->pcr.amplifias[i].sequence == seqdb[j])
{pair->coveredSeqs[j] = 1; break;}
}
counttaxon(-1); counttaxon(-1);
for (i=0; i < seqcount; i++) for (i=0; i < seqcount; i++)
if (!pair->pcr.amplifias[i].sequence->isexample if (!pair->pcr.amplifias[i].sequence->isexample
@ -145,12 +194,14 @@ static int cmpamp(const void *ampf1, const void* ampf2)
{ {
incr = -1; incr = -1;
j = pampf1->length - 1; j = pampf1->length - 1;
if (pampf2->strand) if (pampf2->strand)
{ {
pampf1 = (pamptotaxon_t) ampf2; pampf1 = (pamptotaxon_t) ampf2;
pampf2 = (pamptotaxon_t) ampf1; pampf2 = (pamptotaxon_t) ampf1;
chd = 1; chd = 1;
} }
//j = pampf2->length - 1; should have been here and pampf2 instead of pampf1?
} }
len = (pampf1->length <= pampf2->length)? pampf1->length: pampf2->length; len = (pampf1->length <= pampf2->length)? pampf1->length: pampf2->length;
@ -173,6 +224,7 @@ static int cmpamp(const void *ampf1, const void* ampf2)
return 0; return 0;
}*/ }*/
static int cmpamp(const void *ampf1, const void* ampf2) static int cmpamp(const void *ampf1, const void* ampf2)
{ {
int i; int i;
@ -183,10 +235,10 @@ static int cmpamp(const void *ampf1, const void* ampf2)
char *ch2; char *ch2;
int incr1; int incr1;
int incr2; int incr2;
pamptotaxon_t pampf1 = (pamptotaxon_t) ampf1; pamptotaxon_t pampf1 = (pamptotaxon_t) ampf1;
pamptotaxon_t pampf2 = (pamptotaxon_t) ampf2; pamptotaxon_t pampf2 = (pamptotaxon_t) ampf2;
ch1 = pampf1->amplifia; ch1 = pampf1->amplifia;
ch2 = pampf2->amplifia; ch2 = pampf2->amplifia;
@ -218,7 +270,7 @@ static int cmpamp(const void *ampf1, const void* ampf2)
if (pampf1->length > pampf2->length) return 1; if (pampf1->length > pampf2->length) return 1;
if (pampf2->length > pampf1->length) return -1; if (pampf2->length > pampf1->length) return -1;
return 0; return 0;
} }
@ -242,6 +294,8 @@ void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize)
uint32_t i, j; uint32_t i, j;
uint32_t ampfindex = 0; uint32_t ampfindex = 0;
int32_t taxid; int32_t taxid;
uint32_t wellidentifiedcount;
void *ampftree = NULL; void *ampftree = NULL;
pamptotaxon_t pcurrentampf; pamptotaxon_t pcurrentampf;
pamptotaxon_t *ptmp; pamptotaxon_t *ptmp;
@ -278,11 +332,14 @@ void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize)
} }
memset (pair->wellIdentifiedSeqs, 0, seqdbsize*sizeof (int)); memset (pair->wellIdentifiedSeqs, 0, seqdbsize*sizeof (int));
counttaxon(-1); //counttaxon(-1);
for (i = 0; i < ampfindex; i++) for (i = 0; i < ampfindex; i++)
{ {
if (ampfwithtaxtree[i].taxoncount > 1) if (ampfwithtaxtree[i].taxoncount > 1)
twalk(ampfwithtaxtree[i].taxontree, twalkaction); {
//printf ("\nampfwithtaxtree[i].taxoncount: %d\n", ampfwithtaxtree[i].taxoncount);
//twalk(ampfwithtaxtree[i].taxontree, twalkaction);
}
//TR 5/9/10 - added code for well identified seqs //TR 5/9/10 - added code for well identified seqs
else if(ampfwithtaxtree[i].taxoncount == 1) /*well identified*/ else if(ampfwithtaxtree[i].taxoncount == 1) /*well identified*/
{ {
@ -293,6 +350,7 @@ void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize)
{ {
for (j = 0; j < seqdbsize; j++) for (j = 0; j < seqdbsize; j++)
if (seqdb[j]->ranktaxonid == gtxid if (seqdb[j]->ranktaxonid == gtxid
&& seqdb[j]->isexample
&&(pair->p1->directCount[j] > 0 &&(pair->p1->directCount[j] > 0
|| pair->p1->reverseCount[j] > 0) || pair->p1->reverseCount[j] > 0)
&& (pair->p2->directCount[j] > 0 && (pair->p2->directCount[j] > 0
@ -303,10 +361,18 @@ void taxonomyspecificity (ppair_t pair, pecodnadb_t seqdb,uint32_t seqdbsize)
} }
} }
} }
//printf ("\n");
pair->notwellidentifiedtaxa = counttaxon(-2); counttaxon(-1);
pair->bs = ((float)pair->intaxa - (float)pair->notwellidentifiedtaxa) / pair->intaxa; wellidentifiedcount = 0;
for (j = 0; j < seqdbsize; j++)
if (pair->wellIdentifiedSeqs[j] == 1)
counttaxon(seqdb[j]->ranktaxonid);
wellidentifiedcount = counttaxon(-2);
//pair->notwellidentifiedtaxa = counttaxon(-2);
pair->notwellidentifiedtaxa = (pair->intaxa-wellidentifiedcount); //counttaxon(-2);
//pair->bs = ((float)pair->intaxa - (float)pair->notwellidentifiedtaxa) / pair->intaxa;
pair->bs = ((float)wellidentifiedcount) / (float)pair->intaxa;
ECOFREE (ampfwithtaxtree, "Free amplifia table"); ECOFREE (ampfwithtaxtree, "Free amplifia table");
} }