diff --git a/config/default.conf b/config/default.conf index 672d1b1..ff863ec 100644 --- a/config/default.conf +++ b/config/default.conf @@ -103,13 +103,16 @@ TAR = tar # PRTDIR : port dependent files location (libraries and binaries) # BINDIR : port binaries # LIBDIR : port libraries +# INCDIR : port includes # PRTDIR = $(CFGDIR)../ports/$(PORTNAME) -BINDIR = $(PRTDIR)/bin +BINDIR = $(abspath $(PRTDIR))/bin -LIBDIR = $(PRTDIR)/lib +LIBDIR = $(abspath $(PRTDIR))/lib + +INCDIR = $(abspath $(PRTDIR))/include # ------------------------------------ # default gmake variable in implicit rules diff --git a/config/ports/i386-darwin.conf b/config/ports/i386-darwin.conf index b8a7999..dcff9ed 100644 --- a/config/ports/i386-darwin.conf +++ b/config/ports/i386-darwin.conf @@ -18,9 +18,15 @@ # General compilation flags # ------------------------------------ +CC = /usr/bin/gcc +CXX = /usr/bin/g++ +CXXPP = /usr/bin/cpp +CPP = /usr/bin/cpp + # # MACHDEF : define machine and OS specific flags # +MACHINE = MACOSX MACHDEF = -DLX_TARGET_MACINTEL -DLITTLE_ENDIAN -DMACOSX diff --git a/config/ports/i386-linux.conf b/config/ports/i386-linux.conf index d90af22..e031e64 100755 --- a/config/ports/i386-linux.conf +++ b/config/ports/i386-linux.conf @@ -22,7 +22,7 @@ # MACHDEF : define machine and OS specific flags # -MACHDEF = -DLX_TARGET_LINUX -DLITTLE_ENDIAN +MACHDEF = -DLX_TARGET_LINUX -DLITTLE_ENDIAN -DLINUX # # MATH_LIBS : machine specific math librairies diff --git a/config/targets/package.targ b/config/targets/package.targ index f5918b8..40550dc 100644 --- a/config/targets/package.targ +++ b/config/targets/package.targ @@ -15,6 +15,9 @@ PKGDIR ?= build.$(PORTNAME) PRTPATH = $(abspath $(PRTDIR)) +PRTPATH_BIN = $(PRTPATH)/bin +PKG_CONFIG = $(PRTPATH)/bin/pkg-config + # # Rules # @@ -28,7 +31,17 @@ pkg.expand:: test -f $(PKGDIR)/configure || $(TAR) zxf $(PKGTAR) -C $(PKGDIR) --strip-components 1 pkg.make:: pkg.expand - test -f $(PKGDIR)/Makefile || (cd $(PKGDIR) && ./configure 
--prefix=$(PRTPATH)) + echo $(PKG_CONFIG) + test -f $(PKGDIR)/Makefile || (export PATH="$(PRTPATH_BIN):$$PATH" && \ + export PKG_CONFIG=$(PKG_CONFIG) && \ + export CC="$(CC)" && \ + export CXX="$(CXX)" && \ + export CPP="$(CPP)" && \ + export CXXPP="$(CXXPP)" && \ + export CFLAGS="$(CFLAGS)" && \ + export LDFLAGS="$(LDFLAGS)" && \ + cd $(PKGDIR) && \ + ./configure --prefix=$(PRTPATH) $(CONFIGURE_OPTIONS)) $(MAKE) -C $(PKGDIR) pkg.install:: pkg.make diff --git a/src/Makefile b/src/Makefile index f3b5492..a60f4ef 100755 --- a/src/Makefile +++ b/src/Makefile @@ -17,12 +17,18 @@ # include ../config/auto.conf -DIRS = exonerate \ +DIRS = aragorn \ + clustalo \ + exonerate \ + hmmer3 \ kimono \ + muscle \ + ncbiblast \ prokov \ + repseek \ sequtils \ - aragorn \ - ncbiblast + sumaclust \ + sumatra include ../config/targets/propagate.targ diff --git a/src/clustalo/Makefile b/src/clustalo/Makefile new file mode 100755 index 0000000..f7d29e7 --- /dev/null +++ b/src/clustalo/Makefile @@ -0,0 +1,29 @@ +# --------------------------------------------------------------- +# $Id: $ +# --------------------------------------------------------------- +# @file: Makefile +# @desc: makefile for lxpack +# +# @history: +# @history: +# @+ : Apr 97 : Created +# @+ : Mar 02 : Updated for LXxware +# +# @note: should be processed with gnu compatible make +# @note: helixware_compatible +# +# @end: +# --------------------------------------------------------------- +# +include ../../config/auto.conf + +DIRS = argtable \ + clustalo + +include ../../config/targets/propagate.targ + +include ../../config/targets/help.targ + +all:: + $(MAKE) ACTION=$@ _action + diff --git a/src/clustalo/argtable/Makefile b/src/clustalo/argtable/Makefile new file mode 100644 index 0000000..ad86dad --- /dev/null +++ b/src/clustalo/argtable/Makefile @@ -0,0 +1,24 @@ +# --------------------------------------------------------------- +# $Id: $ +# --------------------------------------------------------------- +# @file: 
Makefile +# @desc: makefile for package argtable2 +# +# @history: +# @+ : Sept 15 : Adapted to ORG.Annot +# +# @note: should be processed with gnu compatible make +# @note: helixware_compatible +# +# @end: +# --------------------------------------------------------------- +# + +include ../../../config/auto.conf + +PKG = argtable2-13 + +include $(CFGDIR)targets/package.targ + +include $(CFGDIR)targets/help.targ + diff --git a/src/clustalo/argtable/argtable2-13.tgz b/src/clustalo/argtable/argtable2-13.tgz new file mode 100644 index 0000000..367fdbb Binary files /dev/null and b/src/clustalo/argtable/argtable2-13.tgz differ diff --git a/src/clustalo/clustalo/Makefile b/src/clustalo/clustalo/Makefile new file mode 100644 index 0000000..eded7e2 --- /dev/null +++ b/src/clustalo/clustalo/Makefile @@ -0,0 +1,24 @@ +# --------------------------------------------------------------- +# $Id: $ +# --------------------------------------------------------------- +# @file: Makefile +# @desc: makefile for package clustal-omega +# +# @history: +# @+ : Sept 15 : Adapted to ORG.Annot +# +# @note: should be processed with gnu compatible make +# @note: helixware_compatible +# +# @end: +# --------------------------------------------------------------- +# + +include ../../../config/auto.conf + +PKG = clustal-omega-1.2.1 + +include $(CFGDIR)targets/package.targ + +include $(CFGDIR)targets/help.targ + diff --git a/src/clustalo/clustalo/clustal-omega-1.2.1.tgz b/src/clustalo/clustalo/clustal-omega-1.2.1.tgz new file mode 100644 index 0000000..60b27aa Binary files /dev/null and b/src/clustalo/clustalo/clustal-omega-1.2.1.tgz differ diff --git a/src/exonerate/.DS_Store b/src/exonerate/.DS_Store index 5008ddf..b064811 100644 Binary files a/src/exonerate/.DS_Store and b/src/exonerate/.DS_Store differ diff --git a/src/exonerate/Makefile b/src/exonerate/Makefile old mode 100644 new mode 100755 index d83448d..3869d99 --- a/src/exonerate/Makefile +++ b/src/exonerate/Makefile @@ -2,10 +2,12 @@ # $Id: $ #
--------------------------------------------------------------- # @file: Makefile -# @desc: makefile for package exonerate +# @desc: makefile for lxpack # # @history: -# @+ : Sept 15 : Adapted to ORG.Annot +# @history: +# @+ : Apr 97 : Created +# @+ : Mar 02 : Updated for LXxware # # @note: should be processed with gnu compatible make # @note: helixware_compatible @@ -13,12 +15,18 @@ # @end: # --------------------------------------------------------------- # - include ../../config/auto.conf -PKG = exonerate-2.2.0 +DIRS = pkg-config \ + libffi \ + gettext \ + glib2 \ + exonerate -include $(CFGDIR)targets/package.targ +include ../../config/targets/propagate.targ -include $(CFGDIR)targets/help.targ +include ../../config/targets/help.targ + +all:: + $(MAKE) ACTION=$@ _action diff --git a/src/exonerate/exonerate-2.2.0.tgz b/src/exonerate/exonerate-2.2.0.tgz deleted file mode 100644 index 76c321d..0000000 Binary files a/src/exonerate/exonerate-2.2.0.tgz and /dev/null differ diff --git a/src/exonerate/exonerate/.DS_Store b/src/exonerate/exonerate/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/src/exonerate/exonerate/.DS_Store differ diff --git a/src/exonerate/exonerate/Makefile b/src/exonerate/exonerate/Makefile new file mode 100644 index 0000000..e852763 --- /dev/null +++ b/src/exonerate/exonerate/Makefile @@ -0,0 +1,24 @@ +# --------------------------------------------------------------- +# $Id: $ +# --------------------------------------------------------------- +# @file: Makefile +# @desc: makefile for package exonerate +# +# @history: +# @+ : Sept 15 : Adapted to ORG.Annot +# +# @note: should be processed with gnu compatible make +# @note: helixware_compatible +# +# @end: +# --------------------------------------------------------------- +# + +include ../../../config/auto.conf + +PKG = exonerate-2.2.0_EC + +include $(CFGDIR)targets/package.targ + +include $(CFGDIR)targets/help.targ + diff --git 
a/src/exonerate/exonerate/exonerate-2.2.0_EC.tgz b/src/exonerate/exonerate/exonerate-2.2.0_EC.tgz new file mode 100644 index 0000000..b7bb7d6 Binary files /dev/null and b/src/exonerate/exonerate/exonerate-2.2.0_EC.tgz differ diff --git a/src/exonerate/gettext/Makefile b/src/exonerate/gettext/Makefile new file mode 100644 index 0000000..e2455c7 --- /dev/null +++ b/src/exonerate/gettext/Makefile @@ -0,0 +1,24 @@ +# --------------------------------------------------------------- +# $Id: $ +# --------------------------------------------------------------- +# @file: Makefile +# @desc: makefile for package gettext +# +# @history: +# @+ : Sept 15 : Adapted to ORG.Annot +# +# @note: should be processed with gnu compatible make +# @note: helixware_compatible +# +# @end: +# --------------------------------------------------------------- +# + +include ../../../config/auto.conf + +PKG = gettext-0.19 + +include $(CFGDIR)targets/package.targ + +include $(CFGDIR)targets/help.targ + diff --git a/src/exonerate/gettext/gettext-0.19.tgz b/src/exonerate/gettext/gettext-0.19.tgz new file mode 100644 index 0000000..a3760bc Binary files /dev/null and b/src/exonerate/gettext/gettext-0.19.tgz differ diff --git a/src/exonerate/glib2/Makefile b/src/exonerate/glib2/Makefile new file mode 100644 index 0000000..1fe1c1b --- /dev/null +++ b/src/exonerate/glib2/Makefile @@ -0,0 +1,25 @@ +# --------------------------------------------------------------- +# $Id: $ +# --------------------------------------------------------------- +# @file: Makefile +# @desc: makefile for package glib2 +# +# @history: +# @+ : Sept 15 : Adapted to ORG.Annot +# +# @note: should be processed with gnu compatible make +# @note: helixware_compatible +# +# @end: +# --------------------------------------------------------------- +# + +include ../../../config/auto.conf + +PKG = glib-2.44.1 +CONFIGURE_OPTIONS = --disable-dtrace + +include $(CFGDIR)targets/package.targ + +include $(CFGDIR)targets/help.targ + diff --git
a/src/exonerate/glib2/glib-2.44.1.tgz b/src/exonerate/glib2/glib-2.44.1.tgz new file mode 100644 index 0000000..26af174 Binary files /dev/null and b/src/exonerate/glib2/glib-2.44.1.tgz differ diff --git a/src/exonerate/libffi/.DS_Store b/src/exonerate/libffi/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/src/exonerate/libffi/.DS_Store differ diff --git a/src/exonerate/libffi/Makefile b/src/exonerate/libffi/Makefile new file mode 100644 index 0000000..63664a9 --- /dev/null +++ b/src/exonerate/libffi/Makefile @@ -0,0 +1,24 @@ +# --------------------------------------------------------------- +# $Id: $ +# --------------------------------------------------------------- +# @file: Makefile +# @desc: makefile for package libffi +# +# @history: +# @+ : Sept 15 : Adapted to ORG.Annot +# +# @note: should be processed with gnu compatible make +# @note: helixware_compatible +# +# @end: +# --------------------------------------------------------------- +# + +include ../../../config/auto.conf + +PKG = libffi-3.2.1 + +include $(CFGDIR)targets/package.targ + +include $(CFGDIR)targets/help.targ + diff --git a/src/exonerate/libffi/libffi-3.2.1.tgz b/src/exonerate/libffi/libffi-3.2.1.tgz new file mode 100644 index 0000000..37d2ec3 Binary files /dev/null and b/src/exonerate/libffi/libffi-3.2.1.tgz differ diff --git a/src/exonerate/pkg-config/Makefile b/src/exonerate/pkg-config/Makefile new file mode 100644 index 0000000..34f1961 --- /dev/null +++ b/src/exonerate/pkg-config/Makefile @@ -0,0 +1,25 @@ +# --------------------------------------------------------------- +# $Id: $ +# --------------------------------------------------------------- +# @file: Makefile +# @desc: makefile for package pkg-config +# +# @history: +# @+ : Sept 15 : Adapted to ORG.Annot +# +# @note: should be processed with gnu compatible make +# @note: helixware_compatible +# +# @end: +# --------------------------------------------------------------- +# + +include
../../../config/auto.conf + +PKG = pkg-config-0.29 +CONFIGURE_OPTIONS= --with-internal-glib + +include $(CFGDIR)targets/package.targ + +include $(CFGDIR)targets/help.targ + diff --git a/src/exonerate/pkg-config/pkg-config-0.29.tgz b/src/exonerate/pkg-config/pkg-config-0.29.tgz new file mode 100644 index 0000000..4ef6eb2 Binary files /dev/null and b/src/exonerate/pkg-config/pkg-config-0.29.tgz differ diff --git a/src/hmmer3/Makefile b/src/hmmer3/Makefile new file mode 100755 index 0000000..165f386 --- /dev/null +++ b/src/hmmer3/Makefile @@ -0,0 +1,28 @@ +# --------------------------------------------------------------- +# $Id: $ +# --------------------------------------------------------------- +# @file: Makefile +# @desc: makefile for lxpack +# +# @history: +# @history: +# @+ : Apr 97 : Created +# @+ : Mar 02 : Updated for LXxware +# +# @note: should be processed with gnu compatible make +# @note: helixware_compatible +# +# @end: +# --------------------------------------------------------------- +# +include ../../config/auto.conf + +DIRS = hmmer3 + +include ../../config/targets/propagate.targ + +include ../../config/targets/help.targ + +all:: + $(MAKE) ACTION=$@ _action + diff --git a/src/hmmer3/hmmer-3.1b1.tar b/src/hmmer3/hmmer-3.1b1.tar deleted file mode 100644 index d1ff1f8..0000000 Binary files a/src/hmmer3/hmmer-3.1b1.tar and /dev/null differ diff --git a/src/hmmer3/hmmer3/Makefile b/src/hmmer3/hmmer3/Makefile new file mode 100644 index 0000000..dcccf17 --- /dev/null +++ b/src/hmmer3/hmmer3/Makefile @@ -0,0 +1,24 @@ +# --------------------------------------------------------------- +# $Id: $ +# --------------------------------------------------------------- +# @file: Makefile +# @desc: makefile for package hmmer3 +# +# @history: +# @+ : Sept 15 : Adapted to ORG.Annot +# +# @note: should be processed with gnu compatible make +# @note: helixware_compatible +# +# @end: +# --------------------------------------------------------------- +# + +include
../../../config/auto.conf + +PKG = hmmer-3.1b1 + +include $(CFGDIR)targets/package.targ + +include $(CFGDIR)targets/help.targ + diff --git a/src/hmmer3/hmmer3/hmmer-3.1b1.tgz b/src/hmmer3/hmmer3/hmmer-3.1b1.tgz new file mode 100644 index 0000000..e850af4 Binary files /dev/null and b/src/hmmer3/hmmer3/hmmer-3.1b1.tgz differ diff --git a/src/kimono/Makefile b/src/kimono/Makefile index 730be01..c9e9752 100755 --- a/src/kimono/Makefile +++ b/src/kimono/Makefile @@ -1,4 +1,4 @@ -# --------------------------------------------------------------- +#--------------------------------------------------------------- # $Id: $ # --------------------------------------------------------------- # @file: Makefile @@ -25,9 +25,6 @@ include ../../config/targets/help.targ all:: $(MAKE) ACTION=$@ _action - test -d $(PRTDIR) || mkdir $(PRTDIR) - test -d $(BINDIR) || mkdir $(BINDIR) - \cp -f lxpack/ports/$(PORTNAME)/bin/* $(BINDIR) clean:: $(MAKE) -C lxpack portclean diff --git a/src/kimono/lxpack/Makefile b/src/kimono/lxpack/Makefile index bda90a8..18c29f1 100755 --- a/src/kimono/lxpack/Makefile +++ b/src/kimono/lxpack/Makefile @@ -15,14 +15,14 @@ # @end: # --------------------------------------------------------------- # -include ./config/auto.conf +include ../../../config/auto.conf DIRS = src \ tests -include ./config/targets/propagate.targ +include ../../../config/targets/propagate.targ -include ./config/targets/help.targ +include ../../../config/targets/help.targ portclean:: $(MAKE) ACTION=$@ _action diff --git a/src/kimono/lxpack/config/README.txt b/src/kimono/lxpack/config/README.txt deleted file mode 100755 index dbcb92a..0000000 --- a/src/kimono/lxpack/config/README.txt +++ /dev/null @@ -1,51 +0,0 @@ - -$Id: README.txt 1825 2013-02-26 09:39:47Z viari $ - -This directory contains Makefile machine specific configuration files -(and default targets to help you writing Makefile's) - -These headers should be used with GNU make or compatible - -# -# portname -# - -To check your port, 
issue : - - ./guess_port - - if output is 'unknown ::' then you should : - - add a port entry in guess_port for :: - - create a ports/.conf configuration file - (the best is to start from another port file, - choose whatever looks closest) - -# -# configuration flags -# - -auto.conf : the main configuration file : - - determine the machine port thru 'guess_port' shell - - include 'default.conf' file - - include the machine specific 'ports/.conf' file - -default.conf : default configuration (included by 'auto.conf') - -ports/.conf : machine specific configuration (included by 'auto.conf') - -# -# utility targets -# - -targets/help.targ : target for standard help - -targets/propagate.targ : target for propagating targets to subdirectories - -targets/package.targ : default targets for standard package with 'configure' - -targets/empty.targ : default empty targets (defined as double colon rules) - -targets/lxbin.targ : default make targets for standard lx binary (without libraries) - -targets/debug.targ : target to print debug information (for dev.) 
- diff --git a/src/kimono/lxpack/config/auto.conf b/src/kimono/lxpack/config/auto.conf deleted file mode 100644 index 265ad82..0000000 --- a/src/kimono/lxpack/config/auto.conf +++ /dev/null @@ -1,54 +0,0 @@ -# -# $Id: auto.conf 1825 2013-02-26 09:39:47Z viari $ -# -# auto.conf -# auto configuration file using guess_port -# -# this file is included in Makefile -# - -# -# default shell for gnu-make -# - -SHELL = /bin/sh - -# -# CFGDIR : location of config files = this file directory location -# -# CFGPRT : port name (as returned by guess_port) -# - -# because builtin 'lastword' is missing in gnu-make 3.80 - -lastword = $(word $(words $1), $1) - -CFGDIR := $(dir $(call lastword, $(MAKEFILE_LIST))) - -CFGPRT := $(shell $(CFGDIR)guess_port) - -# check if port is correctly defined - -ifneq (1, $(words $(CFGPRT))) - entry := $(call lastword, $(CFGPRT)) - $(error port is undefined - add entry for "$(entry)" in configuration file -) -endif - -# -# PORTNAME : port name to use : default is CFGPRT but may be futher modified -# by machine specific configuration - -PORTNAME = $(CFGPRT) - -# -# default configuration -# may be overriden by machine dependant definitions below -# - -include $(CFGDIR)default.conf - -# -# machine dependant definitions -# - -include $(CFGDIR)ports/$(CFGPRT).conf diff --git a/src/kimono/lxpack/config/default.conf b/src/kimono/lxpack/config/default.conf deleted file mode 100644 index 672d1b1..0000000 --- a/src/kimono/lxpack/config/default.conf +++ /dev/null @@ -1,124 +0,0 @@ -# -# $Id: default.conf 2007 2013-12-03 14:21:39Z viari $ -# -# default.conf -# default configuration flags -# maybe further redefined by machine specific configuration -# -# this file is included by auto.conf -# - -# ------------------------------------ -# General compilation flags -# ------------------------------------ - -# -# MACHDEF : define machine and OS specific flags -# - -MACHDEF = - -# -# CC : (ansi C) compiler command to use -# you may add some machine specific flags 
(like -arch ...) -# in the .conf configuration file -# - -CC = gcc - -# -# default compiler optimizer flag -# - -OPTIM = -O - -# -# CC_LIBS : additionnal machine specific $(CC) libraries -# like '-lC' on some machines -# - -CC_LIBS = - -# -# MALLOC_LIBS : machine specific malloc librairies -# like '-lmalloc' on SGI -# - -MALLOC_LIBS = - -# -# MATH_LIBS : machine specific math librairies -# like '-lm' on Solaris -# - -MATH_LIBS = - -# -# LINT : looks like LINT command does not exist anymore -# here is a rough replacement -# - -LINT = gcc -S -Wall -Wno-format-y2k -W -Wstrict-prototypes \ - -Wmissing-prototypes -Wpointer-arith -Wreturn-type \ - -Wcast-qual -Wwrite-strings -Wswitch -Wshadow \ - -Wcast-align -Wbad-function-cast -Wchar-subscripts \ - -Winline -Wnested-externs -Wredundant-decls - -# ------------------------------------ -# General system commands -# ------------------------------------ - -# -# AR : AR archive command -# ARFLAGS : $(AR) archiving flags -# ARXFLAGS : $(AR) extraction flags -# - -AR = ar -ARFLAGS = rcv -ARXFLAGS = xv - -# -# RANLIB : ranlib command -# - -RANLIB = ranlib - -# -# DIFF : diff command -# - -DIFF = diff - -# -# TAR : tar command -# - -TAR = tar - -# ------------------------------------ -# Default locations -# ------------------------------------ -# -# PRTDIR : port dependent files location (libraries and binaries) -# BINDIR : port binaries -# LIBDIR : port libraries -# - -PRTDIR = $(CFGDIR)../ports/$(PORTNAME) - -BINDIR = $(PRTDIR)/bin - -LIBDIR = $(PRTDIR)/lib - -# ------------------------------------ -# default gmake variable in implicit rules -# ------------------------------------ - -CFLAGS = $(OPTIM) $(MACHDEF) -I$(INCDIR) - -LDFLAGS = -L$(LIBDIR) -L. 
- -LDLIBS = $(LIBS) $(MALLOC_LIBS) $(MATH_LIBS) $(CC_LIBS) - -LINTFLAGS = $(MACHDEF) -I$(INCDIR) diff --git a/src/kimono/lxpack/config/guess_port b/src/kimono/lxpack/config/guess_port deleted file mode 100755 index 56e0ae3..0000000 --- a/src/kimono/lxpack/config/guess_port +++ /dev/null @@ -1,33 +0,0 @@ -#! /bin/sh -# -# $Id: guess_port 1825 2013-02-26 09:39:47Z viari $ -# -# @file: guess_port -# @desc: attempt to guess the portname -# @usage: guess_port -# -# @history: -# @+ Nov. 2000 first draft adapted from GNU config.guess -# @+ Feb. 2010 moved to sh -# - -mach=`uname -m` -syst=`uname -s` -rels=`uname -r` - -case ${mach}:${syst}:${rels} in - - alpha:OSF1:* ) echo alpha-osf1;; - sun4*:SunOS:5.* ) echo sparc-solaris;; - i86pc:SunOS:5.* ) echo i386-solaris;; - sun4*:SunOS:* ) echo sparc-sunos;; - Power*:Darwin:* ) echo ppc-darwin;; - i*86:Linux:* ) echo i386-linux;; - x*86*:Linux:* ) echo i386-linux;; - i*86:Darwin:* ) echo i386-darwin;; - IP*:IRIX*:* ) echo mips-irix;; - i*86:MINGW32*:* ) echo x86-mingw32;; - - *) echo unknown ${mach}:${syst}:${rels}; exit 1;; -esac -exit 0 diff --git a/src/kimono/lxpack/config/ports/i386-darwin.conf b/src/kimono/lxpack/config/ports/i386-darwin.conf deleted file mode 100644 index b8a7999..0000000 --- a/src/kimono/lxpack/config/ports/i386-darwin.conf +++ /dev/null @@ -1,26 +0,0 @@ -# -# $Id: i386-darwin.conf 1825 2013-02-26 09:39:47Z viari $ -# -# i386-darwin.conf -# configuration file for MacOS-X/Intel-Based/Darwin 1.2 with gcc compiler -# this file is included in Makefile -# -# system (uname -srp) : Darwin 8.7.1 i386 -# compiler (cc --version) : i686-apple-darwin8-gcc-4.0.1 -# -# check tags -# @uname:uname -srp:Darwin 8.7.1 i386 -# @cc:cc --version:i686-apple-darwin8-gcc-4.0.1 -# -# - -# ------------------------------------ -# General compilation flags -# ------------------------------------ - -# -# MACHDEF : define machine and OS specific flags -# - -MACHDEF = -DLX_TARGET_MACINTEL -DLITTLE_ENDIAN -DMACOSX - diff --git 
a/src/kimono/lxpack/config/ports/i386-linux.conf b/src/kimono/lxpack/config/ports/i386-linux.conf deleted file mode 100755 index d90af22..0000000 --- a/src/kimono/lxpack/config/ports/i386-linux.conf +++ /dev/null @@ -1,32 +0,0 @@ -# -# $Id: i386-linux.conf 1825 2013-02-26 09:39:47Z viari $ -# -# i386-linux.conf -# configuration file for linux ix86 with GNU gcc compiler -# this file is included in Makefile -# -# system (uname -srp) : Linux 2.2.14-5.0 unknown -# compiler (gcc --version) : egcs-2.91.66 -# -# check tags -# @uname:uname -srp:Linux 2.2.14-5.0 unknown -# @cc:cc --version:egcs-2.91.66 -# -# - -# ------------------------------------ -# General compilation flags -# ------------------------------------ - -# -# MACHDEF : define machine and OS specific flags -# - -MACHDEF = -DLX_TARGET_LINUX -DLITTLE_ENDIAN - -# -# MATH_LIBS : machine specific math librairies -# - -MATH_LIBS = -lm - diff --git a/src/kimono/lxpack/config/ports/ppc-darwin.conf b/src/kimono/lxpack/config/ports/ppc-darwin.conf deleted file mode 100755 index 553345b..0000000 --- a/src/kimono/lxpack/config/ports/ppc-darwin.conf +++ /dev/null @@ -1,32 +0,0 @@ -# -# $Id: ppc-darwin.conf 1825 2013-02-26 09:39:47Z viari $ -# -# ppc-darwin.conf -# configuration file for MacOS-X/Darwin 1.2 with native cc compiler -# this file is included in Makefile -# -# system (uname -srp) : Darwin 1.2 powerpc -# compiler (cc --version) : 2.7.2.1 -# -# check tags -# @uname:uname -srp:Darwin 1.2 powerpc -# @cc:cc --version:2.7.2.1 -# -# - -# ------------------------------------ -# General compilation flags -# ------------------------------------ - -# -# MACHDEF : define machine and OS specific flags -# - -MACHDEF = -DLX_TARGET_MACPPC -DBIG_ENDIAN - -# -# CC : name of (ansi C) compiler to use -# - -CC = cc -arch ppc - diff --git a/src/kimono/lxpack/config/ports/sparc-solaris.conf b/src/kimono/lxpack/config/ports/sparc-solaris.conf deleted file mode 100755 index 46ce21a..0000000 --- 
a/src/kimono/lxpack/config/ports/sparc-solaris.conf +++ /dev/null @@ -1,31 +0,0 @@ -# -# $Id: sparc-solaris.conf 1825 2013-02-26 09:39:47Z viari $ -# -# sparc-solaris.conf -# configuration file for sparc solaris with GNU gcc compiler -# this file is included in Makefile -# -# system (uname -srp) : SunOS 5.8 sparc -# compiler (gcc --version) : 2.95.2 -# -# check tags -# @uname:uname -srp:SunOS 5.8 sparc -# @cc:cc --version:2.95.2 -# -# - -# ------------------------------------ -# General compilation flags -# ------------------------------------ - -# -# MACHDEF : define machine and OS specific flags -# - -MACHDEF = -DLX_TARGET_SOLARIS -DBIG_ENDIAN - -# -# MATH_LIBS : machine specific math librairies -# - -MATH_LIBS = -lm diff --git a/src/kimono/lxpack/config/ports/x86-mingw32.conf b/src/kimono/lxpack/config/ports/x86-mingw32.conf deleted file mode 100644 index 0dfea5d..0000000 --- a/src/kimono/lxpack/config/ports/x86-mingw32.conf +++ /dev/null @@ -1,54 +0,0 @@ -# -# $Id: x86-mingw32.conf 1825 2013-02-26 09:39:47Z viari $ -# -# x86-mingw32 -# configuration file for MinGW with GNU gcc compiler. 
-# -# this file is included in Makefile -# -# - -# -# rename PORTNAME safely since MinGW produce pure win32 executables -# without dll's -# - -PORTNAME = x86-win32 - -# ------------------------------------ -# General compilation flags -# ------------------------------------ - -# -# CC_LIBS : additionnal machine specific $(CC) libraries -# -# libiberty is needed for some system extensions (like mkstemps) -# - -CC_LIBS = -liberty - -# -# MACHDEF : define machine and OS specific flags -# -# -DDLMALLOC : use dlmalloc instead of malloc (which does not have mallinfo) -# -posix is a new replacement for several MinGW32 flags, including: -# -D__USE_MINGW_ANSI_STDIO : mingw gcc flag to recognize the C99 "%zu" format -# - -MACHDEF = -posix -DLX_TARGET_WIN32 -DWIN_MINGW -DDLMALLOC -DLITTLE_ENDIAN - -# -# MATH_LIBS : machine specific math librairies -# - -MATH_LIBS = -lm - -# ------------------------------------ -# General system commands -# ------------------------------------ - -# -# DIFF : diff command / should ignore cr on windows -# - -DIFF = diff --strip-trailing-cr diff --git a/src/kimono/lxpack/config/targets/debug.targ b/src/kimono/lxpack/config/targets/debug.targ deleted file mode 100644 index c25e3ca..0000000 --- a/src/kimono/lxpack/config/targets/debug.targ +++ /dev/null @@ -1,25 +0,0 @@ -# -# $Id: help.targ 1825 2013-02-26 09:39:47Z viari $ -# -# debug.targ -# -# target to print debug information (dev. 
only) -# -# it defines the following targets: -# -# debug : -# print debug -# -# it requires auto.conf -# - -.PHONY: debug - -debug:: - @echo "+ PORTNAME: $(PORTNAME)" - @echo "+ CFGPRT: $(CFGPRT)" - @echo "+ CFGDIR: $(CFGDIR)" - @echo "+ PRTDIR: $(PRTDIR)" - @echo "+ MACHDEF: $(MACHDEF)" - - diff --git a/src/kimono/lxpack/config/targets/empty.targ b/src/kimono/lxpack/config/targets/empty.targ deleted file mode 100644 index 9642422..0000000 --- a/src/kimono/lxpack/config/targets/empty.targ +++ /dev/null @@ -1,24 +0,0 @@ -# -# $Id: $ -# -# epty.targ -# -# default empty targets (defined as double colon rules) -# -# - -# -# Rules -# - -.PHONY: all test clean portclean help - -all:: - -test:: - -clean:: - -portclean:: clean - -test:: diff --git a/src/kimono/lxpack/config/targets/help.targ b/src/kimono/lxpack/config/targets/help.targ deleted file mode 100644 index f0128a5..0000000 --- a/src/kimono/lxpack/config/targets/help.targ +++ /dev/null @@ -1,23 +0,0 @@ -# -# $Id: help.targ 1825 2013-02-26 09:39:47Z viari $ -# -# help.targ -# -# default target to print help -# -# it defines the following targets: -# -# help : -# print help -# - -.PHONY: help - -help:: - @ echo "basic usage: make [+]" - @ echo "valid :" - @ echo " all : compile everything for current port [default target]" - @ echo " clean : local cleanup" - @ echo " portclean : cleanup distribution for current port" - @ echo " test : run tests on current port" - @ echo " help : print this help" diff --git a/src/kimono/lxpack/config/targets/lxbin.targ b/src/kimono/lxpack/config/targets/lxbin.targ deleted file mode 100644 index 4e6dbe9..0000000 --- a/src/kimono/lxpack/config/targets/lxbin.targ +++ /dev/null @@ -1,51 +0,0 @@ -# -# $Id: $ -# -# lxbin.targ -# -# default make targets for standard lx binary -# -# you should define the 'PROGS' and 'OSRC' variables -# and optionnaly 'LIBS' if binaries have to be linked with libraries -# -# note: if main source code for binary PROG is PROG.c, there is nothing to do, -# else 
(e.g. if it involves several sources files) you should also add local -# file dependencies. e.g under the form: -# -# mymain: $(OBJ) mymain_base.c mymain_help.c -# $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) $(LDLIBS) -# -# -# 'auto.conf' should have been included -# - -OBJ = $(OSRC:.c=.o) - -INCDIR = ../include - -# -# Rules -# - -.PHONY: all prelib install test clean portclean - -all:: prelib $(PROGS) install - @echo "+++++++++++ binaries $(PROGS) done" - -prelib:: - test -d $(PRTDIR) || mkdir $(PRTDIR) # because some linker may complain - test -d $(LIBDIR) || mkdir $(LIBDIR) # if -L$(LIBDIR) does not exist - -install:: - test -d $(PRTDIR) || mkdir $(PRTDIR) - test -d $(BINDIR) || mkdir $(BINDIR) - -for f in $(PROGS) ; do \cp -f $$f $(BINDIR) ; done - -test:: - -clean:: - -\rm -f *.o cvstatic* *% *.bak so_loc* - -\rm -f $(PROGS) - -portclean:: clean - -(! test -d $(BINDIR)) || (cd $(BINDIR) && \rm -f $(PROGS)) diff --git a/src/kimono/lxpack/config/targets/lxlib.targ b/src/kimono/lxpack/config/targets/lxlib.targ deleted file mode 100644 index 1be65c3..0000000 --- a/src/kimono/lxpack/config/targets/lxlib.targ +++ /dev/null @@ -1,43 +0,0 @@ -# -# $Id: $ -# -# lxlib.targ -# -# default make targets for standard lx library -# -# you should define the 'LOCLIB' and 'OSRC' variables -# -# 'auto.conf' should have been included -# - -OBJ = $(OSRC:.c=.o) - -INCDIR = ../include - -# -# Rules -# - -.PHONY: all lib install test clean portclean - -all:: lib install - @echo "+++++++++++ library $(LOCLIB) done" - -lib:: $(OBJ) - $(AR) $(ARFLAGS) $(LOCLIB) $(OBJ) - $(RANLIB) $(LOCLIB) - -install:: - test -d $(PRTDIR) || mkdir $(PRTDIR) - test -d $(LIBDIR) || mkdir $(LIBDIR) - \cp -f $(LOCLIB) $(LIBDIR) - $(RANLIB) $(LIBDIR)/$(LOCLIB) - -test:: - -clean:: - -\rm -f *.o cvstatic* *% *.bak so_loc* - -\rm -f $(LOCLIB) - -portclean:: clean - -(! 
test -d $(LIBDIR)) || (cd $(LIBDIR) && \rm -f $(LOCLIB)) diff --git a/src/kimono/lxpack/config/targets/package.targ b/src/kimono/lxpack/config/targets/package.targ deleted file mode 100644 index f5918b8..0000000 --- a/src/kimono/lxpack/config/targets/package.targ +++ /dev/null @@ -1,48 +0,0 @@ -# -# $Id: package.targ 1825 2013-02-26 09:39:47Z viari $ -# -# package.targ -# -# default make targets for standard package with configure -# -# you should define the 'PKG' variable -# (and optionaly 'PKGTAR', 'PKGDIR') -# - -PKGTAR ?= $(PKG).tgz - -PKGDIR ?= build.$(PORTNAME) - -PRTPATH = $(abspath $(PRTDIR)) - -# -# Rules -# - -.PHONY: all clean test portclean pkg pkg.expand pkg.make pkg.install - -all:: pkg - -pkg.expand:: - test -d $(PKGDIR) || mkdir $(PKGDIR) - test -f $(PKGDIR)/configure || $(TAR) zxf $(PKGTAR) -C $(PKGDIR) --strip-components 1 - -pkg.make:: pkg.expand - test -f $(PKGDIR)/Makefile || (cd $(PKGDIR) && ./configure --prefix=$(PRTPATH)) - $(MAKE) -C $(PKGDIR) - -pkg.install:: pkg.make - $(MAKE) -C $(PKGDIR) install - -pkg:: pkg.install - @echo "+++++++++++ package $(PKG) done" - -test:: - (! test -d $(PKGDIR)) || $(MAKE) -C $(PKGDIR) test - -clean:: - (! test -d $(PKGDIR)) || $(MAKE) -C $(PKGDIR) clean - -portclean:: - (! test -d $(PKGDIR)) || $(MAKE) -C $(PKGDIR) distclean - (! 
test -d $(PKGDIR)) || \rm -r $(PKGDIR) diff --git a/src/kimono/lxpack/config/targets/propagate.targ b/src/kimono/lxpack/config/targets/propagate.targ deleted file mode 100644 index 2e9df18..0000000 --- a/src/kimono/lxpack/config/targets/propagate.targ +++ /dev/null @@ -1,30 +0,0 @@ -# -# $Id: propagate.targ 1825 2013-02-26 09:39:47Z viari $ -# -# propagate.targ -# -# default make targets for library containers -# -# you should define the 'DIRS' variable -# -# It will propagate 'MAKE ' to all -# directories listed in DIRS -# - -# -# Rules -# - -.PHONY: all _action $(DIRS) - -.DEFAULT: - $(MAKE) ACTION=$@ _action - -all:: - $(MAKE) ACTION=all _action - -_action: $(DIRS) - @echo "$(ACTION) done" - -$(DIRS): - $(MAKE) -C $@ $(ACTION) diff --git a/src/kimono/lxpack/src/Makefile b/src/kimono/lxpack/src/Makefile index ce3d46b..d45b2cb 100644 --- a/src/kimono/lxpack/src/Makefile +++ b/src/kimono/lxpack/src/Makefile @@ -13,7 +13,7 @@ # --------------------------------------------------------------- # -include ../config/auto.conf +include ../../../../config/auto.conf PROGS = kimono kimfit @@ -29,8 +29,8 @@ OSRC = fasta_io.c \ kim_genetic.c \ kim_codonskew.c -include ../config/targets/lxbin.targ -include ../config/targets/help.targ +include ../../../../config/targets/lxbin.targ +include ../../../../config/targets/help.targ # # file dependencies diff --git a/src/kimono/lxpack/tests/Makefile b/src/kimono/lxpack/tests/Makefile index 0842294..830788f 100755 --- a/src/kimono/lxpack/tests/Makefile +++ b/src/kimono/lxpack/tests/Makefile @@ -16,7 +16,7 @@ # --------------------------------------------------------------- # -include ../config/targets/empty.targ +include ../../../../config/targets/empty.targ clean:: -\rm -f *.tst diff --git a/src/muscle/Makefile b/src/muscle/Makefile new file mode 100755 index 0000000..6e300a0 --- /dev/null +++ b/src/muscle/Makefile @@ -0,0 +1,30 @@ +#--------------------------------------------------------------- +# $Id: $ +# 
--------------------------------------------------------------- +# @file: Makefile +# @desc: makefile for lxpack +# +# @history: +# @history: +# @+ : Apr 97 : Created +# @+ : Mar 02 : Updated for LXxware +# +# @note: should be processed with gnu compatible make +# @note: helixware_compatible +# +# @end: +# --------------------------------------------------------------- +# +include ../../config/auto.conf + +DIRS = muscle3.8.31 + +include ../../config/targets/propagate.targ + +include ../../config/targets/help.targ + +all:: + $(MAKE) ACTION=$@ _action + +clean:: + $(MAKE) -C lxpack portclean diff --git a/src/muscle/muscle3.8.31/Makefile b/src/muscle/muscle3.8.31/Makefile new file mode 100755 index 0000000..d16c191 --- /dev/null +++ b/src/muscle/muscle3.8.31/Makefile @@ -0,0 +1,30 @@ +#--------------------------------------------------------------- +# $Id: $ +# --------------------------------------------------------------- +# @file: Makefile +# @desc: makefile for lxpack +# +# @history: +# @history: +# @+ : Apr 97 : Created +# @+ : Mar 02 : Updated for LXxware +# +# @note: should be processed with gnu compatible make +# @note: helixware_compatible +# +# @end: +# --------------------------------------------------------------- +# +include ../../../config/auto.conf + +DIRS = src + +include ../../../config/targets/propagate.targ + +include ../../../config/targets/help.targ + +all:: + $(MAKE) ACTION=$@ _action + +clean:: + $(MAKE) -C lxpack portclean diff --git a/src/muscle/muscle3.8.31/src/Makefile b/src/muscle/muscle3.8.31/src/Makefile new file mode 100644 index 0000000..9631dab --- /dev/null +++ b/src/muscle/muscle3.8.31/src/Makefile @@ -0,0 +1,11 @@ +include ../../../../config/auto.conf + +all: muscle install + +muscle: + chmod +x ./mk + (export CXX=$(CXX) && ./mk) + +install: + cp muscle $(BINDIR) + diff --git a/src/muscle/muscle3.8.31/src/README.txt b/src/muscle/muscle3.8.31/src/README.txt new file mode 100644 index 0000000..8e4afd2 --- /dev/null +++ 
b/src/muscle/muscle3.8.31/src/README.txt @@ -0,0 +1,27 @@ +MUSCLE v3.0 source code README +------------------------------ + +http://www.drive5.com/muscle + +This version of MUSCLE was built and tested on two platforms: +Windows XP and Red Hat Linux 8.0. + +On Windows, I used Microsoft Visual C++ .Net, which I find +to be the best C++ compile / edit / test environment I've +tried on any platform. The Microsoft project file is +muscle.vcproj. + +The Linux make file is Makefile. This is a very simple-minded +make file (because I am a Linux development novice), so should +be easy to understand. By default, it uses shared libraries, +but I found this to give problems when copying between +different Linux versions. The fix was to use the linker +flag -lm static (commented out), which gives a much bigger +but more portable binary. The posted binary was linked with +static libraries. + +The source code was not written to be maintained by anyone +but me, so the usual apologies and caveats apply. 
+ +Bob Edgar, +January 2004 diff --git a/src/muscle/muscle3.8.31/src/aligngivenpath.cpp b/src/muscle/muscle3.8.31/src/aligngivenpath.cpp new file mode 100644 index 0000000..a130605 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/aligngivenpath.cpp @@ -0,0 +1,802 @@ +#include "muscle.h" +#include "msa.h" +#include "pwpath.h" +#include "profile.h" + +#define TRACE 0 + +static void LogPP(const ProfPos &PP) + { + Log("ResidueGroup %u\n", PP.m_uResidueGroup); + Log("AllGaps %d\n", PP.m_bAllGaps); + Log("Occ %.3g\n", PP.m_fOcc); + Log("LL=%.3g LG=%.3g GL=%.3g GG=%.3g\n", PP.m_LL, PP.m_LG, PP.m_GL, PP.m_GG); + Log("Freqs "); + for (unsigned i = 0; i < 20; ++i) + if (PP.m_fcCounts[i] > 0) + Log("%c=%.3g ", LetterToChar(i), PP.m_fcCounts[i]); + Log("\n"); + } + +static void AssertProfPosEq(const ProfPos *PA, const ProfPos *PB, unsigned i) + { + const ProfPos &PPA = PA[i]; + const ProfPos &PPB = PB[i]; +#define eq(x) if (PPA.m_##x != PPB.m_##x) { LogPP(PPA); LogPP(PPB); Quit("AssertProfPosEq." #x); } +#define be(x) if (!BTEq(PPA.m_##x, PPB.m_##x)) { LogPP(PPA); LogPP(PPB); Quit("AssertProfPosEq." #x); } + eq(bAllGaps) + eq(uResidueGroup) + + be(LL) + be(LG) + be(GL) + be(GG) + be(fOcc) + be(scoreGapOpen) + be(scoreGapClose) + + for (unsigned j = 0; j < 20; ++j) + { +#define eqj(x) if (PPA.m_##x != PPB.m_##x) Quit("AssertProfPosEq j=%u " #x, j); +#define bej(x) if (!BTEq(PPA.m_##x, PPB.m_##x)) Quit("AssertProfPosEq j=%u " #x, j); + bej(fcCounts[j]); +// eqj(uSortOrder[j]) // may differ due to ties, don't check? 
+ bej(AAScores[j]) +#undef eqj +#undef bej + } +#undef eq +#undef be + } + +void AssertProfsEq(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB) + { + if (uLengthA != uLengthB) + Quit("AssertProfsEq: lengths differ %u %u", uLengthA, uLengthB); + for (unsigned i = 0; i < uLengthB; ++i) + AssertProfPosEq(PA, PB, i); + } + +#if DEBUG +static void ValidateProf(const ProfPos *Prof, unsigned uLength) + { + for (unsigned i = 0; i < uLength; ++i) + { + const ProfPos &PP = Prof[i]; + + FCOUNT s1 = PP.m_LL + PP.m_LG + PP.m_GL + PP.m_GG; + assert(BTEq(s1, 1.0)); + + if (i > 0) + { + const ProfPos &PPPrev = Prof[i-1]; + FCOUNT s2 = PPPrev.m_LL + PPPrev.m_GL; + FCOUNT s3 = PP.m_LL + PP.m_LG; + assert(BTEq(s2, s3)); + } + if (i < uLength - 1) + { + const ProfPos &PPNext = Prof[i+1]; + FCOUNT s4 = PP.m_LL + PP.m_GL; + FCOUNT s5 = PPNext.m_LL + PPNext.m_LG; + assert(BTEq(s4, s5)); + } + } + } +#else +#define ValidateProf(Prof, Length) /* empty */ +#endif + +static void ScoresFromFreqsPos(ProfPos *Prof, unsigned uLength, unsigned uPos) + { + ProfPos &PP = Prof[uPos]; + SortCounts(PP.m_fcCounts, PP.m_uSortOrder); + PP.m_uResidueGroup = ResidueGroupFromFCounts(PP.m_fcCounts); + +// "Occupancy" + PP.m_fOcc = PP.m_LL + PP.m_GL; + +// Frequency of gap-opens in this position (i) +// Gap open = letter in i-1 and gap in i +// = iff LG in i + FCOUNT fcOpen = PP.m_LG; + +// Frequency of gap-closes in this position +// Gap close = gap in i and letter in i+1 +// = iff GL in i+1 + FCOUNT fcClose; + if (uPos + 1 < uLength) + fcClose = Prof[uPos + 1].m_GL; + else + fcClose = PP.m_GG + PP.m_LG; + + PP.m_scoreGapOpen = (SCORE) ((1.0 - fcOpen)*g_scoreGapOpen/2.0); + PP.m_scoreGapClose = (SCORE) ((1.0 - fcClose)*g_scoreGapOpen/2.0); +#if DOUBLE_AFFINE + PP.m_scoreGapOpen2 = (SCORE) ((1.0 - fcOpen)*g_scoreGapOpen2/2.0); + PP.m_scoreGapClose2 = (SCORE) ((1.0 - fcClose)*g_scoreGapOpen2/2.0); +#endif + + for (unsigned i = 0; i < g_AlphaSize; ++i) + { + SCORE scoreSum = 0; + 
for (unsigned j = 0; j < g_AlphaSize; ++j) + scoreSum += PP.m_fcCounts[j]*(*g_ptrScoreMatrix)[i][j]; + PP.m_AAScores[i] = scoreSum; + } + } + +void ProfScoresFromFreqs(ProfPos *Prof, unsigned uLength) + { + for (unsigned i = 0; i < uLength; ++i) + ScoresFromFreqsPos(Prof, uLength, i); + } + +static void AppendDelete(const MSA &msaA, unsigned &uColIndexA, + unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined, + unsigned &uColIndexCombined) + { +#if TRACE + Log("AppendDelete ColIxA=%u ColIxCmb=%u\n", + uColIndexA, uColIndexCombined); +#endif + for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) + { + char c = msaA.GetChar(uSeqIndexA, uColIndexA); + msaCombined.SetChar(uSeqIndexA, uColIndexCombined, c); + } + + for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) + msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, '-'); + + ++uColIndexCombined; + ++uColIndexA; + } + +static void AppendInsert(const MSA &msaB, unsigned &uColIndexB, + unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined, + unsigned &uColIndexCombined) + { +#if TRACE + Log("AppendInsert ColIxB=%u ColIxCmb=%u\n", + uColIndexB, uColIndexCombined); +#endif + for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) + msaCombined.SetChar(uSeqIndexA, uColIndexCombined, '-'); + + for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) + { + char c = msaB.GetChar(uSeqIndexB, uColIndexB); + msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, c); + } + + ++uColIndexCombined; + ++uColIndexB; + } + +static void AppendTplInserts(const MSA &msaA, unsigned &uColIndexA, unsigned uColCountA, + const MSA &msaB, unsigned &uColIndexB, unsigned uColCountB, unsigned uSeqCountA, + unsigned uSeqCountB, MSA &msaCombined, unsigned &uColIndexCombined) + { +#if TRACE + Log("AppendTplInserts ColIxA=%u ColIxB=%u ColIxCmb=%u\n", + uColIndexA, uColIndexB, uColIndexCombined); +#endif + const unsigned uLengthA = msaA.GetColCount(); + const 
unsigned uLengthB = msaB.GetColCount(); + + unsigned uNewColCount = uColCountA; + if (uColCountB > uNewColCount) + uNewColCount = uColCountB; + + for (unsigned n = 0; n < uColCountA; ++n) + { + for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) + { + char c = msaA.GetChar(uSeqIndexA, uColIndexA + n); + c = UnalignChar(c); + msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, c); + } + } + for (unsigned n = uColCountA; n < uNewColCount; ++n) + { + for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) + msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, '.'); + } + + for (unsigned n = 0; n < uColCountB; ++n) + { + for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) + { + char c = msaB.GetChar(uSeqIndexB, uColIndexB + n); + c = UnalignChar(c); + msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, c); + } + } + for (unsigned n = uColCountB; n < uNewColCount; ++n) + { + for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) + msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, '.'); + } + + uColIndexCombined += uNewColCount; + uColIndexA += uColCountA; + uColIndexB += uColCountB; + } + +static void AppendMatch(const MSA &msaA, unsigned &uColIndexA, const MSA &msaB, + unsigned &uColIndexB, unsigned uSeqCountA, unsigned uSeqCountB, + MSA &msaCombined, unsigned &uColIndexCombined) + { +#if TRACE + Log("AppendMatch ColIxA=%u ColIxB=%u ColIxCmb=%u\n", + uColIndexA, uColIndexB, uColIndexCombined); +#endif + + for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) + { + char c = msaA.GetChar(uSeqIndexA, uColIndexA); + msaCombined.SetChar(uSeqIndexA, uColIndexCombined, c); + } + + for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) + { + char c = msaB.GetChar(uSeqIndexB, uColIndexB); + msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, c); + } + + ++uColIndexA; + ++uColIndexB; + ++uColIndexCombined; + } + +void 
AlignTwoMSAsGivenPath(const PWPath &Path, const MSA &msaA, const MSA &msaB, + MSA &msaCombined) + { + msaCombined.Clear(); + +#if TRACE + Log("FastAlignProfiles\n"); + Log("Template A:\n"); + msaA.LogMe(); + Log("Template B:\n"); + msaB.LogMe(); +#endif + + const unsigned uColCountA = msaA.GetColCount(); + const unsigned uColCountB = msaB.GetColCount(); + + const unsigned uSeqCountA = msaA.GetSeqCount(); + const unsigned uSeqCountB = msaB.GetSeqCount(); + + msaCombined.SetSeqCount(uSeqCountA + uSeqCountB); + +// Copy sequence names into combined MSA + for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) + { + msaCombined.SetSeqName(uSeqIndexA, msaA.GetSeqName(uSeqIndexA)); + msaCombined.SetSeqId(uSeqIndexA, msaA.GetSeqId(uSeqIndexA)); + } + + for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) + { + msaCombined.SetSeqName(uSeqCountA + uSeqIndexB, msaB.GetSeqName(uSeqIndexB)); + msaCombined.SetSeqId(uSeqCountA + uSeqIndexB, msaB.GetSeqId(uSeqIndexB)); + } + + unsigned uColIndexA = 0; + unsigned uColIndexB = 0; + unsigned uColIndexCombined = 0; + const unsigned uEdgeCount = Path.GetEdgeCount(); + for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) + { + const PWEdge &Edge = Path.GetEdge(uEdgeIndex); +#if TRACE + Log("\nEdge %u %c%u.%u\n", + uEdgeIndex, + Edge.cType, + Edge.uPrefixLengthA, + Edge.uPrefixLengthB); +#endif + const char cType = Edge.cType; + const unsigned uPrefixLengthA = Edge.uPrefixLengthA; + unsigned uColCountA = 0; + if (uPrefixLengthA > 0) + { + const unsigned uNodeIndexA = uPrefixLengthA - 1; + const unsigned uTplColIndexA = uNodeIndexA; + if (uTplColIndexA > uColIndexA) + uColCountA = uTplColIndexA - uColIndexA; + } + + const unsigned uPrefixLengthB = Edge.uPrefixLengthB; + unsigned uColCountB = 0; + if (uPrefixLengthB > 0) + { + const unsigned uNodeIndexB = uPrefixLengthB - 1; + const unsigned uTplColIndexB = uNodeIndexB; + if (uTplColIndexB > uColIndexB) + uColCountB = uTplColIndexB - 
uColIndexB; + } + +// TODO: This code looks like a hangover from HMM estimation -- can we delete it? + assert(uColCountA == 0); + assert(uColCountB == 0); + AppendTplInserts(msaA, uColIndexA, uColCountA, msaB, uColIndexB, uColCountB, + uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); + + switch (cType) + { + case 'M': + { + assert(uPrefixLengthA > 0); + assert(uPrefixLengthB > 0); + const unsigned uColA = uPrefixLengthA - 1; + const unsigned uColB = uPrefixLengthB - 1; + assert(uColIndexA == uColA); + assert(uColIndexB == uColB); + AppendMatch(msaA, uColIndexA, msaB, uColIndexB, uSeqCountA, uSeqCountB, + msaCombined, uColIndexCombined); + break; + } + case 'D': + { + assert(uPrefixLengthA > 0); + const unsigned uColA = uPrefixLengthA - 1; + assert(uColIndexA == uColA); + AppendDelete(msaA, uColIndexA, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); + break; + } + case 'I': + { + assert(uPrefixLengthB > 0); + const unsigned uColB = uPrefixLengthB - 1; + assert(uColIndexB == uColB); + AppendInsert(msaB, uColIndexB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); + break; + } + default: + assert(false); + } + } + unsigned uInsertColCountA = uColCountA - uColIndexA; + unsigned uInsertColCountB = uColCountB - uColIndexB; + +// TODO: This code looks like a hangover from HMM estimation -- can we delete it? 
+ assert(uInsertColCountA == 0); + assert(uInsertColCountB == 0); + AppendTplInserts(msaA, uColIndexA, uInsertColCountA, msaB, uColIndexB, + uInsertColCountB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); + + assert(msaCombined.GetColCount() == uEdgeCount); + } + +static const ProfPos PPStart = + { + false, //m_bAllGaps; + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // m_uSortOrder[21]; + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // m_fcCounts[20]; + 1.0, // m_LL; + 0.0, // m_LG; + 0.0, // m_GL; + 0.0, // m_GG; + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // m_AAScores + 0, // m_uResidueGroup; + 1.0, // m_fOcc; + 0.0, // m_fcStartOcc; + 0.0, // m_fcEndOcc; + 0.0, // m_scoreGapOpen; + 0.0, // m_scoreGapClose; + }; + +// MM +// Ai–1 Ai Out +// X X LL LL +// X - LG LG +// - X GL GL +// - - GG GG +// +// Bj–1 Bj +// X X LL LL +// X - LG LG +// - X GL GL +// - - GG GG +static void SetGapsMM( + const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, + const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, + ProfPos *POut, unsigned uColIndexOut) + { + const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart; + const ProfPos &PPB = uPrefixLengthB > 0 ? PB[uPrefixLengthB-1] : PPStart; + ProfPos &PPO = POut[uColIndexOut]; + + PPO.m_LL = wA*PPA.m_LL + wB*PPB.m_LL; + PPO.m_LG = wA*PPA.m_LG + wB*PPB.m_LG; + PPO.m_GL = wA*PPA.m_GL + wB*PPB.m_GL; + PPO.m_GG = wA*PPA.m_GG + wB*PPB.m_GG; + } + +// MD +// Ai–1 Ai Out +// X X LL LL +// X - LG LG +// - X GL GL +// - - GG GG +// +// Bj (-) +// X - ?L LG +// - - ?G GG +static void SetGapsMD( + const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, + const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, + ProfPos *POut, unsigned uColIndexOut) + { + const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart; + const ProfPos &PPB = uPrefixLengthB > 0 ? 
PB[uPrefixLengthB-1] : PPStart; + ProfPos &PPO = POut[uColIndexOut]; + + PPO.m_LL = wA*PPA.m_LL; + PPO.m_LG = wA*PPA.m_LG + wB*(PPB.m_LL + PPB.m_GL); + PPO.m_GL = wA*PPA.m_GL; + PPO.m_GG = wA*PPA.m_GG + wB*(PPB.m_LG + PPB.m_GG); + } + +// DD +// Ai–1 Ai Out +// X X LL LL +// X - LG LG +// - X GL GL +// - - GG GG +// +// (-) (-) +// - - ?? GG +static void SetGapsDD( + const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, + const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, + ProfPos *POut, unsigned uColIndexOut) + { + const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart; + ProfPos &PPO = POut[uColIndexOut]; + + PPO.m_LL = wA*PPA.m_LL; + PPO.m_LG = wA*PPA.m_LG; + PPO.m_GL = wA*PPA.m_GL; + PPO.m_GG = wA*PPA.m_GG + wB; + } + +// MI +// Ai (-) Out +// X - ?L LG +// - - ?G GG + +// Bj–1 Bj +// X X LL LL +// X - LG LG +// - X GL GL +// - - GG GG +static void SetGapsMI( + const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, + const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, + ProfPos *POut, unsigned uColIndexOut) + { + const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart; + const ProfPos &PPB = uPrefixLengthB > 0 ? PB[uPrefixLengthB-1] : PPStart; + ProfPos &PPO = POut[uColIndexOut]; + + PPO.m_LL = wB*PPB.m_LL; + PPO.m_LG = wB*PPB.m_LG + wA*(PPA.m_LL + PPA.m_GL); + PPO.m_GL = wB*PPB.m_GL; + PPO.m_GG = wB*PPB.m_GG + wA*(PPA.m_LG + PPA.m_GG); + } + +// DM +// Ai–1 Ai Out +// X X LL LL +// X - LG LG +// - X GL GL +// - - GG GG +// +// (-) Bj +// - X ?L GL +// - - ?G GG +static void SetGapsDM( + const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, + const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, + ProfPos *POut, unsigned uColIndexOut) + { + const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart; + const ProfPos &PPB = uPrefixLengthB > 0 ? 
PB[uPrefixLengthB-1] : PPStart; + ProfPos &PPO = POut[uColIndexOut]; + + PPO.m_LL = wA*PPA.m_LL; + PPO.m_LG = wA*PPA.m_LG; + PPO.m_GL = wA*PPA.m_GL + wB*(PPB.m_LL + PPB.m_GL); + PPO.m_GG = wA*PPA.m_GG + wB*(PPB.m_LG + PPB.m_GG); + } + +// IM +// (-) Ai Out +// - X ?L GL +// - - ?G GG + +// Bj–1 Bj +// X X LL LL +// X - LG LG +// - X GL GL +// - - GG GG +static void SetGapsIM( + const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, + const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, + ProfPos *POut, unsigned uColIndexOut) + { + const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart; + const ProfPos &PPB = uPrefixLengthB > 0 ? PB[uPrefixLengthB-1] : PPStart; + ProfPos &PPO = POut[uColIndexOut]; + + PPO.m_LL = wB*PPB.m_LL; + PPO.m_LG = wB*PPB.m_LG; + PPO.m_GL = wB*PPB.m_GL + wA*(PPA.m_LL + PPA.m_GL); + PPO.m_GG = wB*PPB.m_GG + wA*(PPA.m_LG + PPA.m_GG); + } + +// ID +// (-) Ai Out +// - X ?L GL +// - - ?G GG + +// Bj (-) +// X - ?L LG +// - - ?G GG +static void SetGapsID( + const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, + const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, + ProfPos *POut, unsigned uColIndexOut) + { + const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart; + const ProfPos &PPB = uPrefixLengthB > 0 ? PB[uPrefixLengthB-1] : PPStart; + ProfPos &PPO = POut[uColIndexOut]; + + PPO.m_LL = 0; + PPO.m_LG = wB*PPB.m_GL + wB*PPB.m_LL; + PPO.m_GL = wA*PPA.m_GL + wA*PPA.m_LL; + PPO.m_GG = wA*(PPA.m_LG + PPA.m_GG) + wB*(PPB.m_LG + PPB.m_GG); + } + +// DI +// Ai (-) Out +// X - ?L LG +// - - ?G GG + +// (-) Bj +// - X ?L GL +// - - ?G GG +static void SetGapsDI( + const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, + const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, + ProfPos *POut, unsigned uColIndexOut) + { + const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart; + const ProfPos &PPB = uPrefixLengthB > 0 ? 
PB[uPrefixLengthB-1] : PPStart; + ProfPos &PPO = POut[uColIndexOut]; + + PPO.m_LL = 0; + PPO.m_LG = wA*PPA.m_GL + wA*PPA.m_LL; + PPO.m_GL = wB*PPB.m_GL + wB*PPB.m_LL; + PPO.m_GG = wA*(PPA.m_LG + PPA.m_GG) + wB*(PPB.m_LG + PPB.m_GG); + } + +// II +// (-) (-) Out +// - - ?? GG + +// Bj–1 Bj +// X X LL LL +// X - LG LG +// - X GL GL +// - - GG GG +static void SetGapsII( + const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, + const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, + ProfPos *POut, unsigned uColIndexOut) + { + const ProfPos &PPB = uPrefixLengthB > 0 ? PB[uPrefixLengthB-1] : PPStart; + ProfPos &PPO = POut[uColIndexOut]; + + PPO.m_LL = wB*PPB.m_LL; + PPO.m_LG = wB*PPB.m_LG; + PPO.m_GL = wB*PPB.m_GL; + PPO.m_GG = wB*PPB.m_GG + wA; + } + +static void SetFreqs( + const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, + const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, + ProfPos *POut, unsigned uColIndexOut) + { + const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart; + const ProfPos &PPB = uPrefixLengthB > 0 ? 
PB[uPrefixLengthB-1] : PPStart; + ProfPos &PPO = POut[uColIndexOut]; + + if (g_bNormalizeCounts) + { + const FCOUNT fA = PPA.m_fOcc*wA/(wA + wB); + const FCOUNT fB = PPB.m_fOcc*wB/(wA + wB); + FCOUNT fTotal = 0; + for (unsigned i = 0; i < 20; ++i) + { + const FCOUNT f = fA*PPA.m_fcCounts[i] + fB*PPB.m_fcCounts[i]; + PPO.m_fcCounts[i] = f; + fTotal += f; + } + if (fTotal > 0) + for (unsigned i = 0; i < 20; ++i) + PPO.m_fcCounts[i] /= fTotal; + } + else + { + for (unsigned i = 0; i < 20; ++i) + PPO.m_fcCounts[i] = wA*PPA.m_fcCounts[i] + wB*PPB.m_fcCounts[i]; + } + } + +void AlignTwoProfsGivenPath(const PWPath &Path, + const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA, + const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB, + ProfPos **ptrPOut, unsigned *ptruLengthOut) + { +#if TRACE + Log("AlignTwoProfsGivenPath wA=%.3g wB=%.3g Path=\n", wA, wB); + Path.LogMe(); +#endif + assert(BTEq(wA + wB, 1.0)); + + unsigned uColIndexA = 0; + unsigned uColIndexB = 0; + unsigned uColIndexOut = 0; + const unsigned uEdgeCount = Path.GetEdgeCount(); + ProfPos *POut = new ProfPos[uEdgeCount]; + char cPrevType = 'M'; + for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) + { + const PWEdge &Edge = Path.GetEdge(uEdgeIndex); + const char cType = Edge.cType; + + const unsigned uPrefixLengthA = Edge.uPrefixLengthA; + const unsigned uPrefixLengthB = Edge.uPrefixLengthB; + +#if TRACE + Log("\nEdge %u %c%u.%u ColA=%u ColB=%u\n", + uEdgeIndex, + Edge.cType, + Edge.uPrefixLengthA, + Edge.uPrefixLengthB, + uColIndexA, + uColIndexB); +#endif + + POut[uColIndexOut].m_bAllGaps = false; + switch (cType) + { + case 'M': + { + assert(uPrefixLengthA > 0); + assert(uPrefixLengthB > 0); + SetFreqs( + PA, uPrefixLengthA, wA, + PB, uPrefixLengthB, wB, + POut, uColIndexOut); + switch (cPrevType) + { + case 'M': + SetGapsMM( + PA, uPrefixLengthA, wA, + PB, uPrefixLengthB, wB, + POut, uColIndexOut); + break; + case 'D': + SetGapsDM( + PA, uPrefixLengthA, wA, + PB, uPrefixLengthB, wB, + 
POut, uColIndexOut); + break; + case 'I': + SetGapsIM( + PA, uPrefixLengthA, wA, + PB, uPrefixLengthB, wB, + POut, uColIndexOut); + break; + default: + Quit("Bad cPrevType"); + } + ++uColIndexA; + ++uColIndexB; + ++uColIndexOut; + break; + } + case 'D': + { + assert(uPrefixLengthA > 0); + SetFreqs( + PA, uPrefixLengthA, wA, + PB, uPrefixLengthB, 0, + POut, uColIndexOut); + switch (cPrevType) + { + case 'M': + SetGapsMD( + PA, uPrefixLengthA, wA, + PB, uPrefixLengthB, wB, + POut, uColIndexOut); + break; + case 'D': + SetGapsDD( + PA, uPrefixLengthA, wA, + PB, uPrefixLengthB, wB, + POut, uColIndexOut); + break; + case 'I': + SetGapsID( + PA, uPrefixLengthA, wA, + PB, uPrefixLengthB, wB, + POut, uColIndexOut); + break; + default: + Quit("Bad cPrevType"); + } + ++uColIndexA; + ++uColIndexOut; + break; + } + case 'I': + { + assert(uPrefixLengthB > 0); + SetFreqs( + PA, uPrefixLengthA, 0, + PB, uPrefixLengthB, wB, + POut, uColIndexOut); + switch (cPrevType) + { + case 'M': + SetGapsMI( + PA, uPrefixLengthA, wA, + PB, uPrefixLengthB, wB, + POut, uColIndexOut); + break; + case 'D': + SetGapsDI( + PA, uPrefixLengthA, wA, + PB, uPrefixLengthB, wB, + POut, uColIndexOut); + break; + case 'I': + SetGapsII( + PA, uPrefixLengthA, wA, + PB, uPrefixLengthB, wB, + POut, uColIndexOut); + break; + default: + Quit("Bad cPrevType"); + } + ++uColIndexB; + ++uColIndexOut; + break; + } + default: + assert(false); + } + cPrevType = cType; + } + assert(uColIndexOut == uEdgeCount); + + ProfScoresFromFreqs(POut, uEdgeCount); + ValidateProf(POut, uEdgeCount); + + *ptrPOut = POut; + *ptruLengthOut = uEdgeCount; + +#if TRACE + Log("AlignTwoProfsGivenPath:\n"); + ListProfile(POut, uEdgeCount, 0); +#endif + } diff --git a/src/muscle/muscle3.8.31/src/aligngivenpathsw.cpp b/src/muscle/muscle3.8.31/src/aligngivenpathsw.cpp new file mode 100644 index 0000000..d3e03c1 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/aligngivenpathsw.cpp @@ -0,0 +1,237 @@ +#include "muscle.h" +#include "msa.h" +#include 
"pwpath.h" +#include "profile.h" + +#define TRACE 0 + +static void AppendDelete(const MSA &msaA, unsigned &uColIndexA, + unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined, + unsigned &uColIndexCombined) + { +#if TRACE + Log("AppendDelete ColIxA=%u ColIxCmb=%u\n", + uColIndexA, uColIndexCombined); +#endif + for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) + { + char c = msaA.GetChar(uSeqIndexA, uColIndexA); + msaCombined.SetChar(uSeqIndexA, uColIndexCombined, c); + } + + for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) + msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, '-'); + + ++uColIndexCombined; + ++uColIndexA; + } + +static void AppendInsert(const MSA &msaB, unsigned &uColIndexB, + unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined, + unsigned &uColIndexCombined) + { +#if TRACE + Log("AppendInsert ColIxB=%u ColIxCmb=%u\n", + uColIndexB, uColIndexCombined); +#endif + for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) + msaCombined.SetChar(uSeqIndexA, uColIndexCombined, '-'); + + for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) + { + char c = msaB.GetChar(uSeqIndexB, uColIndexB); + msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, c); + } + + ++uColIndexCombined; + ++uColIndexB; + } + +static void AppendUnalignedTerminals(const MSA &msaA, unsigned &uColIndexA, unsigned uColCountA, + const MSA &msaB, unsigned &uColIndexB, unsigned uColCountB, unsigned uSeqCountA, + unsigned uSeqCountB, MSA &msaCombined, unsigned &uColIndexCombined) + { +#if TRACE + Log("AppendUnalignedTerminals ColIxA=%u ColIxB=%u ColIxCmb=%u\n", + uColIndexA, uColIndexB, uColIndexCombined); +#endif + const unsigned uLengthA = msaA.GetColCount(); + const unsigned uLengthB = msaB.GetColCount(); + + unsigned uNewColCount = uColCountA; + if (uColCountB > uNewColCount) + uNewColCount = uColCountB; + + for (unsigned n = 0; n < uColCountA; ++n) + { + for (unsigned uSeqIndexA = 0; 
uSeqIndexA < uSeqCountA; ++uSeqIndexA) + { + char c = msaA.GetChar(uSeqIndexA, uColIndexA + n); + c = UnalignChar(c); + msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, c); + } + } + for (unsigned n = uColCountA; n < uNewColCount; ++n) + { + for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) + msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, '.'); + } + + for (unsigned n = 0; n < uColCountB; ++n) + { + for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) + { + char c = msaB.GetChar(uSeqIndexB, uColIndexB + n); + c = UnalignChar(c); + msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, c); + } + } + for (unsigned n = uColCountB; n < uNewColCount; ++n) + { + for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) + msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, '.'); + } + + uColIndexCombined += uNewColCount; + uColIndexA += uColCountA; + uColIndexB += uColCountB; + } + +static void AppendMatch(const MSA &msaA, unsigned &uColIndexA, const MSA &msaB, + unsigned &uColIndexB, unsigned uSeqCountA, unsigned uSeqCountB, + MSA &msaCombined, unsigned &uColIndexCombined) + { +#if TRACE + Log("AppendMatch ColIxA=%u ColIxB=%u ColIxCmb=%u\n", + uColIndexA, uColIndexB, uColIndexCombined); +#endif + + for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) + { + char c = msaA.GetChar(uSeqIndexA, uColIndexA); + msaCombined.SetChar(uSeqIndexA, uColIndexCombined, c); + } + + for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) + { + char c = msaB.GetChar(uSeqIndexB, uColIndexB); + msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, c); + } + + ++uColIndexA; + ++uColIndexB; + ++uColIndexCombined; + } + +void AlignTwoMSAsGivenPathSW(const PWPath &Path, const MSA &msaA, const MSA &msaB, + MSA &msaCombined) + { + msaCombined.Clear(); + +#if TRACE + Log("AlignTwoMSAsGivenPathSW\n"); + Log("Template A:\n"); + msaA.LogMe(); + Log("Template B:\n"); + 
msaB.LogMe(); +#endif + + const unsigned uColCountA = msaA.GetColCount(); + const unsigned uColCountB = msaB.GetColCount(); + + const unsigned uSeqCountA = msaA.GetSeqCount(); + const unsigned uSeqCountB = msaB.GetSeqCount(); + + msaCombined.SetSeqCount(uSeqCountA + uSeqCountB); + +// Copy sequence names into combined MSA + for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) + { + msaCombined.SetSeqName(uSeqIndexA, msaA.GetSeqName(uSeqIndexA)); + msaCombined.SetSeqId(uSeqIndexA, msaA.GetSeqId(uSeqIndexA)); + } + + for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) + { + msaCombined.SetSeqName(uSeqCountA + uSeqIndexB, msaB.GetSeqName(uSeqIndexB)); + msaCombined.SetSeqId(uSeqCountA + uSeqIndexB, msaB.GetSeqId(uSeqIndexB)); + } + + unsigned uColIndexA = 0; + unsigned uColIndexB = 0; + unsigned uColIndexCombined = 0; + const unsigned uEdgeCount = Path.GetEdgeCount(); + for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) + { + const PWEdge &Edge = Path.GetEdge(uEdgeIndex); +#if TRACE + Log("\nEdge %u %c%u.%u\n", + uEdgeIndex, + Edge.cType, + Edge.uPrefixLengthA, + Edge.uPrefixLengthB); +#endif + const char cType = Edge.cType; + const unsigned uPrefixLengthA = Edge.uPrefixLengthA; + unsigned uColCountA = 0; + if (uPrefixLengthA > 0) + { + const unsigned uNodeIndexA = uPrefixLengthA - 1; + const unsigned uTplColIndexA = uNodeIndexA; + if (uTplColIndexA > uColIndexA) + uColCountA = uTplColIndexA - uColIndexA; + } + + const unsigned uPrefixLengthB = Edge.uPrefixLengthB; + unsigned uColCountB = 0; + if (uPrefixLengthB > 0) + { + const unsigned uNodeIndexB = uPrefixLengthB - 1; + const unsigned uTplColIndexB = uNodeIndexB; + if (uTplColIndexB > uColIndexB) + uColCountB = uTplColIndexB - uColIndexB; + } + + AppendUnalignedTerminals(msaA, uColIndexA, uColCountA, msaB, uColIndexB, uColCountB, + uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); + + switch (cType) + { + case 'M': + { + assert(uPrefixLengthA > 0); + 
assert(uPrefixLengthB > 0); + const unsigned uColA = uPrefixLengthA - 1; + const unsigned uColB = uPrefixLengthB - 1; + assert(uColIndexA == uColA); + assert(uColIndexB == uColB); + AppendMatch(msaA, uColIndexA, msaB, uColIndexB, uSeqCountA, uSeqCountB, + msaCombined, uColIndexCombined); + break; + } + case 'D': + { + assert(uPrefixLengthA > 0); + const unsigned uColA = uPrefixLengthA - 1; + assert(uColIndexA == uColA); + AppendDelete(msaA, uColIndexA, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); + break; + } + case 'I': + { + assert(uPrefixLengthB > 0); + const unsigned uColB = uPrefixLengthB - 1; + assert(uColIndexB == uColB); + AppendInsert(msaB, uColIndexB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); + break; + } + default: + assert(false); + } + } + unsigned uInsertColCountA = uColCountA - uColIndexA; + unsigned uInsertColCountB = uColCountB - uColIndexB; + + AppendUnalignedTerminals(msaA, uColIndexA, uInsertColCountA, msaB, uColIndexB, + uInsertColCountB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); + } diff --git a/src/muscle/muscle3.8.31/src/aligntwomsas.cpp b/src/muscle/muscle3.8.31/src/aligntwomsas.cpp new file mode 100644 index 0000000..d428c93 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/aligntwomsas.cpp @@ -0,0 +1,41 @@ +#include "muscle.h" +#include "msa.h" +#include "profile.h" +#include "pwpath.h" +#include "textfile.h" +#include "timing.h" + +SCORE AlignTwoMSAs(const MSA &msa1, const MSA &msa2, MSA &msaOut, PWPath &Path, + bool bLockLeft, bool bLockRight) + { + const unsigned uLengthA = msa1.GetColCount(); + const unsigned uLengthB = msa2.GetColCount(); + + ProfPos *PA = ProfileFromMSA(msa1); + ProfPos *PB = ProfileFromMSA(msa2); + + if (bLockLeft) + { + PA[0].m_scoreGapOpen = MINUS_INFINITY; + PB[0].m_scoreGapOpen = MINUS_INFINITY; + } + + if (bLockRight) + { + PA[uLengthA-1].m_scoreGapClose = MINUS_INFINITY; + PB[uLengthB-1].m_scoreGapClose = MINUS_INFINITY; + } + + float r = (float) uLengthA/ (float) 
(uLengthB + 1); // +1 to prevent div 0 + if (r < 1) + r = 1/r; + + SCORE Score = GlobalAlign(PA, uLengthA, PB, uLengthB, Path); + + AlignTwoMSAsGivenPath(Path, msa1, msa2, msaOut); + + delete[] PA; + delete[] PB; + + return Score; + } diff --git a/src/muscle/muscle3.8.31/src/aligntwoprofs.cpp b/src/muscle/muscle3.8.31/src/aligntwoprofs.cpp new file mode 100644 index 0000000..dc42c63 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/aligntwoprofs.cpp @@ -0,0 +1,31 @@ +#include "muscle.h" +#include "msa.h" +#include "profile.h" +#include "pwpath.h" + +SCORE GlobalAlign4(ProfPos *PA, unsigned uLengthA, ProfPos *PB, + unsigned uLengthB, PWPath &Path); + +SCORE AlignTwoProfs( + const ProfPos *PA, unsigned uLengthA, WEIGHT wA, + const ProfPos *PB, unsigned uLengthB, WEIGHT wB, + PWPath &Path, ProfPos **ptrPout, unsigned *ptruLengthOut) + { + assert(uLengthA < 100000); + assert(uLengthB < 100000); + + float r = (float) uLengthA/ (float) (uLengthB + 1); // +1 to prevent div 0 + if (r < 1) + r = 1/r; + + SCORE Score = GlobalAlign(PA, uLengthA, PB, uLengthB, Path); + + AlignTwoProfsGivenPath(Path, PA, uLengthB, wA/(wA + wB), PB, uLengthB, wB/(wA + wB), + ptrPout, ptruLengthOut); + +#if HYDRO + if (ALPHA_Amino == g_Alpha) + Hydro(*ptrPout, *ptruLengthOut); +#endif + return Score; + } diff --git a/src/muscle/muscle3.8.31/src/aln.cpp b/src/muscle/muscle3.8.31/src/aln.cpp new file mode 100644 index 0000000..4452bc8 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/aln.cpp @@ -0,0 +1,170 @@ +#include "muscle.h" +#include +#include +#include "msa.h" +#include "textfile.h" + +const unsigned uCharsPerLine = 60; +const int MIN_NAME = 10; +const int MAX_NAME = 32; + +static char GetAlnConsensusChar(const MSA &a, unsigned uColIndex); + +void MSA::ToAlnFile(TextFile &File) const + { + if (g_bClwStrict) + File.PutString("CLUSTAL W (1.81) multiple sequence alignment\n"); + else + { + File.PutString("MUSCLE (" + SHORT_VERSION ")" + " multiple sequence alignment\n"); + File.PutString("\n"); + } 
+ + int iLongestNameLength = 0; + for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) + { + const char *ptrName = GetSeqName(uSeqIndex); + const char *ptrBlank = strchr(ptrName, ' '); + int iLength; + if (0 != ptrBlank) + iLength = (int) (ptrBlank - ptrName); + else + iLength = (int) strlen(ptrName); + if (iLength > iLongestNameLength) + iLongestNameLength = iLength; + } + if (iLongestNameLength > MAX_NAME) + iLongestNameLength = MAX_NAME; + if (iLongestNameLength < MIN_NAME) + iLongestNameLength = MIN_NAME; + + unsigned uLineCount = (GetColCount() - 1)/uCharsPerLine + 1; + for (unsigned uLineIndex = 0; uLineIndex < uLineCount; ++uLineIndex) + { + File.PutString("\n"); + unsigned uStartColIndex = uLineIndex*uCharsPerLine; + unsigned uEndColIndex = uStartColIndex + uCharsPerLine - 1; + if (uEndColIndex >= GetColCount()) + uEndColIndex = GetColCount() - 1; + char Name[MAX_NAME+1]; + for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) + { + const char *ptrName = GetSeqName(uSeqIndex); + const char *ptrBlank = strchr(ptrName, ' '); + int iLength; + if (0 != ptrBlank) + iLength = (int) (ptrBlank - ptrName); + else + iLength = (int) strlen(ptrName); + if (iLength > MAX_NAME) + iLength = MAX_NAME; + memset(Name, ' ', MAX_NAME); + memcpy(Name, ptrName, iLength); + Name[iLongestNameLength] = 0; + + File.PutFormat("%s ", Name); + for (unsigned uColIndex = uStartColIndex; uColIndex <= uEndColIndex; + ++uColIndex) + { + const char c = GetChar(uSeqIndex, uColIndex); + File.PutFormat("%c", toupper(c)); + } + File.PutString("\n"); + } + + memset(Name, ' ', MAX_NAME); + Name[iLongestNameLength] = 0; + File.PutFormat("%s ", Name); + for (unsigned uColIndex = uStartColIndex; uColIndex <= uEndColIndex; + ++uColIndex) + { + const char c = GetAlnConsensusChar(*this, uColIndex); + File.PutChar(c); + } + File.PutString("\n"); + } + } + +static char GetAlnConsensusChar(const MSA &a, unsigned uColIndex) + { + const unsigned uSeqCount = a.GetSeqCount(); + 
unsigned BitMap = 0; + unsigned Count = 0; + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + unsigned uLetter = a.GetLetterEx(uSeqIndex, uColIndex); + assert(uLetter < 32); + unsigned Bit = (1 << uLetter); + if (!(BitMap & Bit)) + ++Count; + BitMap |= Bit; + } + +// '*' indicates positions which have a single, fully conserved residue + if (1 == Count) + return '*'; + + if (ALPHA_Amino != g_Alpha) + return ' '; + +#define B(a) (1 << AX_##a) +#define S2(a, b) S(B(a) | B(b)) +#define S3(a, b, c) S(B(a) | B(b) | B(c)) +#define S4(a, b, c, d) S(B(a) | B(b) | B(c) | B(d)) +#define S(w) if (0 == (BitMap & ~(w)) && (BitMap & (w)) != 0) return ':'; + +#define W3(a, b, c) W(B(a) | B(b) | B(c)) +#define W4(a, b, c, d) W(B(a) | B(b) | B(c) | B(d)) +#define W5(a, b, c, d, e) W(B(a) | B(b) | B(c) | B(d) | B(e)) +#define W6(a, b, c, d, e, f) W(B(a) | B(b) | B(c) | B(d) | B(e) | B(f)) +#define W(w) if (0 == (BitMap & ~(w)) && (BitMap & (w)) != 0) return '.'; + +// ':' indicates that one of the following 'strong' +// groups is fully conserved +// STA +// NEQK +// NHQK +// NDEQ +// QHRK +// MILV +// MILF +// HY +// FYW +// + S3(S, T, A) + S4(N, E, Q, K) + S4(N, H, Q, K) + S4(N, D, E, Q) + S4(M, I, L, V) + S4(M, I, L, F) + S2(H, Y) + S3(F, Y, W) + +// '.' 
indicates that one of the following 'weaker' +// groups is fully conserved +// CSA +// ATV +// SAG +// STNK +// STPA +// SGND +// SNDEQK +// NDEQHK +// NEQHRK +// FVLIM +// HFY + W3(C, S, A) + W3(A, T, V) + W3(S, A, G) + W4(S, T, N, K) + W4(S, T, P, A) + W4(S, G, N, D) + W6(S, N, D, E, Q, K) + W6(N, W, Q, H, R, K) + W5(F, V, L, I, M) + W3(H, F, Y) + + return ' '; + } diff --git a/src/muscle/muscle3.8.31/src/alpha.cpp b/src/muscle/muscle3.8.31/src/alpha.cpp new file mode 100644 index 0000000..91a4498 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/alpha.cpp @@ -0,0 +1,283 @@ +#include "muscle.h" +#include + +/*** +From Bioperl docs: +Extended DNA / RNA alphabet +------------------------------------------ +Symbol Meaning Nucleic Acid +------------------------------------------ + A A Adenine + C C Cytosine + G G Guanine + T T Thymine + U U Uracil + M A or C + R A or G + W A or T + S C or G + Y C or T + K G or T + V A or C or G + H A or C or T + D A or G or T + B C or G or T + X G or A or T or C + N G or A or T or C + +IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE: + Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030. 
+***/ + +unsigned g_CharToLetter[MAX_CHAR]; +unsigned g_CharToLetterEx[MAX_CHAR]; + +char g_LetterToChar[MAX_ALPHA]; +char g_LetterExToChar[MAX_ALPHA_EX]; + +char g_UnalignChar[MAX_CHAR]; +char g_AlignChar[MAX_CHAR]; + +bool g_IsWildcardChar[MAX_CHAR]; +bool g_IsResidueChar[MAX_CHAR]; + +ALPHA g_Alpha = ALPHA_Undefined; +unsigned g_AlphaSize = 0; + +#define Res(c, Letter) \ + { \ + const unsigned char Upper = (unsigned char) toupper(c); \ + const unsigned char Lower = (unsigned char) tolower(c); \ + g_CharToLetter[Upper] = Letter; \ + g_CharToLetter[Lower] = Letter; \ + g_CharToLetterEx[Upper] = Letter; \ + g_CharToLetterEx[Lower] = Letter; \ + g_LetterToChar[Letter] = Upper; \ + g_LetterExToChar[Letter] = Upper; \ + g_IsResidueChar[Upper] = true; \ + g_IsResidueChar[Lower] = true; \ + g_AlignChar[Upper] = Upper; \ + g_AlignChar[Lower] = Upper; \ + g_UnalignChar[Upper] = Lower; \ + g_UnalignChar[Lower] = Lower; \ + } + +#define Wild(c, Letter) \ + { \ + const unsigned char Upper = (unsigned char) toupper(c); \ + const unsigned char Lower = (unsigned char) tolower(c); \ + g_CharToLetterEx[Upper] = Letter; \ + g_CharToLetterEx[Lower] = Letter; \ + g_LetterExToChar[Letter] = Upper; \ + g_IsResidueChar[Upper] = true; \ + g_IsResidueChar[Lower] = true; \ + g_AlignChar[Upper] = Upper; \ + g_AlignChar[Lower] = Upper; \ + g_UnalignChar[Upper] = Lower; \ + g_UnalignChar[Lower] = Lower; \ + g_IsWildcardChar[Lower] = true; \ + g_IsWildcardChar[Upper] = true; \ + } + +static unsigned GetAlphaSize(ALPHA Alpha) + { + switch (Alpha) + { + case ALPHA_Amino: + return 20; + + case ALPHA_RNA: + case ALPHA_DNA: + return 4; + } + Quit("Invalid Alpha=%d", Alpha); + return 0; + } + +static void InitArrays() + { + memset(g_CharToLetter, 0xff, sizeof(g_CharToLetter)); + memset(g_CharToLetterEx, 0xff, sizeof(g_CharToLetterEx)); + + memset(g_LetterToChar, '?', sizeof(g_LetterToChar)); + memset(g_LetterExToChar, '?', sizeof(g_LetterExToChar)); + + memset(g_AlignChar, '?', 
sizeof(g_UnalignChar)); + memset(g_UnalignChar, '?', sizeof(g_UnalignChar)); + + memset(g_IsWildcardChar, 0, sizeof(g_IsWildcardChar)); + } + +static void SetGapChar(char c) + { + unsigned char u = (unsigned char) c; + + g_CharToLetterEx[u] = AX_GAP; + g_LetterExToChar[AX_GAP] = u; + g_AlignChar[u] = u; + g_UnalignChar[u] = u; + } + +static void SetAlphaDNA() + { + Res('A', NX_A) + Res('C', NX_C) + Res('G', NX_G) + Res('T', NX_T) + Wild('M', NX_M) + Wild('R', NX_R) + Wild('W', NX_W) + Wild('S', NX_S) + Wild('Y', NX_Y) + Wild('K', NX_K) + Wild('V', NX_V) + Wild('H', NX_H) + Wild('D', NX_D) + Wild('B', NX_B) + Wild('X', NX_X) + Wild('N', NX_N) + } + +static void SetAlphaRNA() + { + Res('A', NX_A) + Res('C', NX_C) + Res('G', NX_G) + Res('U', NX_U) + Res('T', NX_T) + Wild('M', NX_M) + Wild('R', NX_R) + Wild('W', NX_W) + Wild('S', NX_S) + Wild('Y', NX_Y) + Wild('K', NX_K) + Wild('V', NX_V) + Wild('H', NX_H) + Wild('D', NX_D) + Wild('B', NX_B) + Wild('X', NX_X) + Wild('N', NX_N) + } + +static void SetAlphaAmino() + { + Res('A', AX_A) + Res('C', AX_C) + Res('D', AX_D) + Res('E', AX_E) + Res('F', AX_F) + Res('G', AX_G) + Res('H', AX_H) + Res('I', AX_I) + Res('K', AX_K) + Res('L', AX_L) + Res('M', AX_M) + Res('N', AX_N) + Res('P', AX_P) + Res('Q', AX_Q) + Res('R', AX_R) + Res('S', AX_S) + Res('T', AX_T) + Res('V', AX_V) + Res('W', AX_W) + Res('Y', AX_Y) + + Wild('B', AX_B) + Wild('X', AX_X) + Wild('Z', AX_Z) + } + +void SetAlpha(ALPHA Alpha) + { + InitArrays(); + + SetGapChar('.'); + SetGapChar('-'); + + switch (Alpha) + { + case ALPHA_Amino: + SetAlphaAmino(); + break; + + case ALPHA_DNA: + SetAlphaDNA(); + + case ALPHA_RNA: + SetAlphaRNA(); + break; + + default: + Quit("Invalid Alpha=%d", Alpha); + } + + g_AlphaSize = GetAlphaSize(Alpha); + g_Alpha = Alpha; + + if (g_bVerbose) + Log("Alphabet %s\n", ALPHAToStr(g_Alpha)); + } + +char GetWildcardChar() + { + switch (g_Alpha) + { + case ALPHA_Amino: + return 'X'; + + case ALPHA_DNA: + case ALPHA_RNA: + return 'N'; + + 
default: + Quit("Invalid Alpha=%d", g_Alpha); + } + return '?'; + } + +bool IsNucleo(char c) + { + return strchr("ACGTURYNacgturyn", c) != 0; + } + +bool IsDNA(char c) + { + return strchr("AGCTNagctn", c) != 0; + } + +bool IsRNA(char c) + { + return strchr("AGCUNagcun", c) != 0; + } + +static char InvalidLetters[256]; +static int InvalidLetterCount = 0; + +void ClearInvalidLetterWarning() + { + memset(InvalidLetters, 0, 256); + } + +void InvalidLetterWarning(char c, char w) + { + InvalidLetters[(unsigned char) c] = 1; + ++InvalidLetterCount; + } + +void ReportInvalidLetters() + { + if (0 == InvalidLetterCount) + return; + + char Str[257]; + memset(Str, 0, 257); + + int n = 0; + for (int i = 0; i < 256; ++i) + { + if (InvalidLetters[i]) + Str[n++] = (char) i; + } + Warning("Assuming %s (see -seqtype option), invalid letters found: %s", + ALPHAToStr(g_Alpha), Str); + } diff --git a/src/muscle/muscle3.8.31/src/alpha.h b/src/muscle/muscle3.8.31/src/alpha.h new file mode 100644 index 0000000..8a7af32 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/alpha.h @@ -0,0 +1,106 @@ +#ifndef alpha_h +#define alpha_h + +bool StrHasAmino(const char *Str); +bool StrHasGap(const char *Str); +void ClearInvalidLetterWarning(); +void InvalidLetterWarning(char c, char w); +void ReportInvalidLetters(); + +extern unsigned g_CharToLetter[]; +extern unsigned g_CharToLetterEx[]; + +extern char g_LetterToChar[]; +extern char g_LetterExToChar[]; + +extern char g_UnalignChar[]; +extern char g_AlignChar[]; + +extern bool g_IsWildcardChar[]; +extern bool g_IsResidueChar[]; + +#define CharToLetter(c) (g_CharToLetter[(unsigned char) (c)]) +#define CharToLetterEx(c) (g_CharToLetterEx[(unsigned char) (c)]) + +#define LetterToChar(u) (g_LetterToChar[u]) +#define LetterExToChar(u) (g_LetterExToChar[u]) + +#define IsResidueChar(c) (g_IsResidueChar[(unsigned char) (c)]) +#define IsGapChar(c) ('-' == (c) || '.' 
== (c)) +#define IsWildcardChar(c) (g_IsWildcardChar[(unsigned char) (c)]) + +#define AlignChar(c) (g_AlignChar[(unsigned char) (c)]) +#define UnalignChar(c) (g_UnalignChar[(unsigned char) (c)]) + +// AX=Amino alphabet with eXtensions (B, Z and X) +enum AX + { + AX_A, + AX_C, + AX_D, + AX_E, + AX_F, + AX_G, + AX_H, + AX_I, + AX_K, + AX_L, + AX_M, + AX_N, + AX_P, + AX_Q, + AX_R, + AX_S, + AX_T, + AX_V, + AX_W, + AX_Y, + + AX_X, // Any + + AX_B, // D or N + AX_Z, // E or Q + + AX_GAP, + }; +const unsigned AX_COUNT = AX_GAP + 1; + +// NX=Nucleotide alphabet with extensions +enum NX + { + NX_A, + NX_C, + NX_G, + NX_T, + NX_U = NX_T, + + NX_M, // AC + NX_R, // AG + NX_W, // AT + NX_S, // CG + NX_Y, // CT + NX_K, // GT + NX_V, // ACG + NX_H, // ACT + NX_D, // AGT + NX_B, // CGT + NX_X, // GATC + NX_N, // GATC + NX_GAP + }; +const unsigned NX_COUNT = NX_GAP + 1; + +const unsigned MAX_ALPHA = 20; +const unsigned MAX_ALPHA_EX = AX_COUNT; +const unsigned MAX_CHAR = 256; + +extern ALPHA g_Alpha; +extern unsigned g_AlphaSize; + +void SetAlpha(ALPHA Alpha); +char GetWildcardChar(); +bool IsNucleo(char c); +bool IsDNA(char c); +bool IsRNA(char c); + +#endif // alpha_h diff --git a/src/muscle/muscle3.8.31/src/anchors.cpp b/src/muscle/muscle3.8.31/src/anchors.cpp new file mode 100644 index 0000000..8e78aba --- /dev/null +++ b/src/muscle/muscle3.8.31/src/anchors.cpp @@ -0,0 +1,218 @@ +#include "muscle.h" +#include "msa.h" +#include "objscore.h" + +#define TRACE 0 + +static void WindowSmooth(const SCORE Score[], unsigned uCount, unsigned uWindowLength, + SCORE SmoothScore[], double dCeil) + { +#define Ceil(x) ((SCORE) ((x) > dCeil ? 
dCeil : (x))) + + if (1 != uWindowLength%2) + Quit("WindowSmooth=%u must be odd", uWindowLength); + + if (uCount <= uWindowLength) + { + for (unsigned i = 0; i < uCount; ++i) + SmoothScore[i] = 0; + return; + } + + const unsigned w2 = uWindowLength/2; + for (unsigned i = 0; i < w2; ++i) + { + SmoothScore[i] = 0; + SmoothScore[uCount - i - 1] = 0; + } + + SCORE scoreWindowTotal = 0; + for (unsigned i = 0; i < uWindowLength; ++i) + { + scoreWindowTotal += Ceil(Score[i]); + } + + for (unsigned i = w2; ; ++i) + { + SmoothScore[i] = scoreWindowTotal/uWindowLength; + if (i == uCount - w2 - 1) + break; + + scoreWindowTotal -= Ceil(Score[i - w2]); + scoreWindowTotal += Ceil(Score[i + w2 + 1]); + } +#undef Ceil + } + +// Find columns that score above the given threshold. +// A range of scores is defined between the average +// and the maximum. The threshold is a fraction 0.0 .. 1.0 +// within that range, where 0.0 is the average score +// and 1.0 is the maximum score. +// "Grade" is by analogy with grading on a curve. +static void FindBestColsGrade(const SCORE Score[], unsigned uCount, + double dThreshold, unsigned BestCols[], unsigned *ptruBestColCount) + { + SCORE scoreTotal = 0; + for (unsigned uIndex = 0; uIndex < uCount; ++uIndex) + scoreTotal += Score[uIndex]; + const SCORE scoreAvg = scoreTotal / uCount; + + SCORE scoreMax = MINUS_INFINITY; + for (unsigned uIndex = 0; uIndex < uCount; ++uIndex) + if (Score[uIndex] > scoreMax) + scoreMax = Score[uIndex]; + + unsigned uBestColCount = 0; + for (unsigned uIndex = 0; uIndex < uCount; ++uIndex) + { + const SCORE s = Score[uIndex]; + const double dHeight = (s - scoreAvg)/(scoreMax - scoreAvg); + if (dHeight >= dThreshold) + { + BestCols[uBestColCount] = uIndex; + ++uBestColCount; + } + } + *ptruBestColCount = uBestColCount; + } + +// Best col only if all following criteria satisfied: +// (1) Score >= min +// (2) Smoothed score >= min +// (3) No gaps. 
+static void FindBestColsCombo(const MSA &msa, const SCORE Score[],
+  const SCORE SmoothScore[], double dMinScore, double dMinSmoothScore,
+  unsigned BestCols[], unsigned *ptruBestColCount)
+    {
+    // Columns are visited left to right, so BestCols is filled in ascending
+    // column order; the number of entries goes to *ptruBestColCount.
+    const unsigned uTotalCols = msa.GetColCount();
+    unsigned uKept = 0;
+    for (unsigned uCol = 0; uCol != uTotalCols; ++uCol)
+        {
+        // Both thresholds are tested first; ColumnHasGap is only called for
+        // columns that pass them.
+        const bool bScoresPass =
+          !(Score[uCol] < dMinScore) && !(SmoothScore[uCol] < dMinSmoothScore);
+        if (bScoresPass && !msa.ColumnHasGap(uCol))
+            BestCols[uKept++] = uCol;
+        }
+    *ptruBestColCount = uKept;
+    }
+
+// Logs one line per column: column index, the character of every sequence
+// in that column, raw and smoothed score, and a marker for each entry of
+// BestCols equal to the column.
+static void ListBestCols(const MSA &msa, const SCORE Score[], const SCORE SmoothScore[],
+  unsigned BestCols[], unsigned uBestColCount)
+    {
+    const unsigned uCols = msa.GetColCount();
+    const unsigned uSeqs = msa.GetSeqCount();
+
+    // Heading: last decimal digit of each sequence index.
+    Log("Col ");
+    unsigned uSeq = 0;
+    while (uSeq < uSeqs)
+        {
+        Log("%u", uSeq%10);
+        ++uSeq;
+        }
+    Log(" ");
+
+    for (unsigned uCol = 0; uCol < uCols; ++uCol)
+        {
+        Log("%3u ", uCol);
+        for (uSeq = 0; uSeq < uSeqs; ++uSeq)
+            Log("%c", msa.GetChar(uSeq, uCol));
+
+        Log(" %10.3f", Score[uCol]);
+        Log(" %10.3f", SmoothScore[uCol]);
+
+        for (unsigned uBest = 0; uBest < uBestColCount; ++uBest)
+            {
+            if (uCol != BestCols[uBest])
+                continue;
+            Log(" <-- Best");
+            }
+        Log("\n");
+        }
+    }
+
+// If two best columns are found within a window, choose
+// the highest-scoring. If more than two, choose the one
+// closest to the center of the window.
+// Scores is indexed by column; BestCols holds uBestColCount column indexes
+// and is expected to be in ascending order (the window test below subtracts
+// unsigned values). A window starts at each not-yet-consumed best column and
+// spans uWindowLength columns; one anchor per window is written to
+// AnchorCols and the number of anchors to *ptruAnchorColCount.
+static void MergeBestCols(const SCORE Scores[], const unsigned BestCols[],
+  unsigned uBestColCount, unsigned uWindowLength, unsigned AnchorCols[],
+  unsigned *ptruAnchorColCount)
+    {
+    unsigned uAnchorColCount = 0;
+    for (unsigned n = 0; n < uBestColCount; /* update inside loop */)
+        {
+        const unsigned uBestColIndex = BestCols[n];
+
+        // Number of further best columns inside the window opened by n.
+        unsigned uCountWithinWindow = 0;
+        for (unsigned i = n + 1; i < uBestColCount; ++i)
+            {
+            if (BestCols[i] - uBestColIndex >= uWindowLength)
+                break;
+            ++uCountWithinWindow;
+            }
+
+        unsigned uAnchorCol = uBestColIndex;
+        if (1 == uCountWithinWindow)
+            {
+            // Two columns in the window: keep the higher-scoring one; on a
+            // tie the later column wins, as before.
+            const unsigned uBestColIndex2 = BestCols[n+1];
+            if (!(Scores[uBestColIndex] > Scores[uBestColIndex2]))
+                uAnchorCol = uBestColIndex2;
+            }
+        else if (uCountWithinWindow > 1)
+            {
+            // More than two: keep the column nearest the window center.
+            // The distance used to be measured from the window start and the
+            // last column of the window was never examined, so the result
+            // was always BestCols[n+1]. Every column of the window, n
+            // included, is a candidate; ties go to the leftmost one.
+            const unsigned uWindowCenter = uBestColIndex + uWindowLength/2;
+            unsigned uClosestDist = uWindowLength;
+            for (unsigned i = n; i <= n + uCountWithinWindow; ++i)
+                {
+                const unsigned uColIndex = BestCols[i];
+                const unsigned uDist = uColIndex > uWindowCenter ?
+                  uColIndex - uWindowCenter : uWindowCenter - uColIndex;
+                if (uDist < uClosestDist)
+                    {
+                    uAnchorCol = uColIndex;
+                    uClosestDist = uDist;
+                    }
+                }
+            }
+        AnchorCols[uAnchorColCount] = uAnchorCol;
+        ++uAnchorColCount;
+
+        // Skip every column consumed by this window.
+        n += uCountWithinWindow + 1;
+        }
+    *ptruAnchorColCount = uAnchorColCount;
+    }
+
+void FindAnchorCols(const MSA &msa, unsigned AnchorCols[],
+  unsigned *ptruAnchorColCount)
+    {
+    const unsigned uColCount = msa.GetColCount();
+    if (uColCount < 16)
+        {
+        *ptruAnchorColCount = 0;
+        return;
+        }
+
+    SCORE *MatchScore = new SCORE[uColCount];
+    SCORE *SmoothScore = new SCORE[uColCount];
+    unsigned *BestCols = new unsigned[uColCount];
+
+    GetLetterScores(msa, MatchScore);
+    WindowSmooth(MatchScore, uColCount, g_uSmoothWindowLength, SmoothScore,
+      g_dSmoothScoreCeil);
+
+    unsigned uBestColCount;
+    FindBestColsCombo(msa, MatchScore, SmoothScore, g_dMinBestColScore,
g_dMinSmoothScore, + BestCols, &uBestColCount); + +#if TRACE + ListBestCols(msa, MatchScore, SmoothScore, BestCols, uBestColCount); +#endif + + MergeBestCols(MatchScore, BestCols, uBestColCount, g_uAnchorSpacing, AnchorCols, + ptruAnchorColCount); + + delete[] MatchScore; + delete[] SmoothScore; + delete[] BestCols; + } diff --git a/src/muscle/muscle3.8.31/src/bittraceback.cpp b/src/muscle/muscle3.8.31/src/bittraceback.cpp new file mode 100644 index 0000000..9165985 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/bittraceback.cpp @@ -0,0 +1,206 @@ +#include "muscle.h" +#include "pwpath.h" + +#define TRACE 0 + +static char XlatEdgeType(char c) + { + if ('E' == c) + return 'D'; + if ('J' == c) + return 'I'; + return c; + } + +static const char *BitsToStr(char Bits) + { + static char Str[] = "xM xD xI"; + + switch (Bits & BIT_xM) + { + case BIT_MM: + Str[0] = 'M'; + break; + case BIT_DM: + Str[0] = 'D'; + break; + case BIT_IM: + Str[0] = 'I'; + break; + } + + switch (Bits & BIT_xD) + { + case BIT_MD: + Str[3] = 'M'; + break; + case BIT_DD: + Str[3] = 'D'; + break; + } + + switch (Bits & BIT_xI) + { + case BIT_MI: + Str[6] = 'M'; + break; + case BIT_II: + Str[6] = 'I'; + break; + } + + return Str; + } + +static inline char XChar(char Bits, char cType) + { + switch (cType) + { + case 'M': + { + switch (Bits & BIT_xM) + { + case BIT_MM: + return 'M'; + case BIT_DM: + return 'D'; + case BIT_IM: + return 'I'; +#if DOUBLE_AFFINE + case BIT_EM: + return 'E'; + case BIT_JM: + return 'J'; +#endif + } + Quit("Huh!?"); + return '?'; + } + case 'D': + { + switch (Bits & BIT_xD) + { + case BIT_MD: + return 'M'; + case BIT_DD: + return 'D'; + } + Quit("Huh!?"); + return '?'; + } + case 'I': + { + switch (Bits & BIT_xI) + { + case BIT_MI: + return 'M'; + case BIT_II: + return 'I'; + } + Quit("Huh!?"); + return '?'; + } +#if DOUBLE_AFFINE + case 'E': + { + switch (Bits & BIT_xE) + { + case BIT_ME: + return 'M'; + case BIT_EE: + return 'E'; + } + Quit("Huh!?"); + return '?'; + } + 
case 'J': + { + switch (Bits & BIT_xJ) + { + case BIT_MJ: + return 'M'; + case BIT_JJ: + return 'J'; + } + Quit("Huh!?"); + return '?'; + } +#endif + default: + Quit("Huh?"); + return '?'; + } + } + +void BitTraceBack(char **TraceBack, unsigned uLengthA, unsigned uLengthB, + char LastEdge, PWPath &Path) + { +#if TRACE + Log("BitTraceBack\n"); +#endif + Path.Clear(); + + PWEdge Edge; + Edge.uPrefixLengthA = uLengthA; + Edge.uPrefixLengthB = uLengthB; + char Bits = TraceBack[uLengthA][uLengthB]; + Edge.cType = LastEdge; + for (;;) + { +#if TRACE + Log("Prepend %c%d.%d\n", Edge.cType, Edge.uPrefixLengthA, Edge.uPrefixLengthB); +#endif + char cSave = Edge.cType; + Edge.cType = XlatEdgeType(cSave); + Path.PrependEdge(Edge); + Edge.cType = cSave; + + unsigned PLA = Edge.uPrefixLengthA; + unsigned PLB = Edge.uPrefixLengthB; + char Bits = TraceBack[PLA][PLB]; + char NextEdgeType = XChar(Bits, Edge.cType); +#if TRACE + Log("XChar(%s, %c) = %c\n", BitsToStr(Bits), Edge.cType, NextEdgeType); +#endif + switch (Edge.cType) + { + case 'M': + { + if (Edge.uPrefixLengthA == 0) + Quit("BitTraceBack MA=0"); + if (Edge.uPrefixLengthB == 0) + Quit("BitTraceBack MA=0"); + --(Edge.uPrefixLengthA); + --(Edge.uPrefixLengthB); + break; + } + case 'D': + case 'E': + { + if (Edge.uPrefixLengthA == 0) + Quit("BitTraceBack DA=0"); + --(Edge.uPrefixLengthA); + break; + } + case 'I': + case 'J': + { + if (Edge.uPrefixLengthB == 0) + Quit("BitTraceBack IB=0"); + --(Edge.uPrefixLengthB); + break; + } + default: + Quit("BitTraceBack: Invalid edge %c", Edge); + } + + if (0 == Edge.uPrefixLengthA && 0 == Edge.uPrefixLengthB) + break; + + Edge.cType = NextEdgeType; + } + +#if TRACE + Path.LogMe(); +#endif + } diff --git a/src/muscle/muscle3.8.31/src/blosum62.cpp b/src/muscle/muscle3.8.31/src/blosum62.cpp new file mode 100644 index 0000000..64124b9 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/blosum62.cpp @@ -0,0 +1,28 @@ +#include "muscle.h" + +int BLOSUM62[20][20] = + { +// A C D E F G H I K L M N 
P Q R S T V W Y + { 4, 0, -2, -1, -2, 0, -2, -1, -1, -1, -1, -2, -1, -1, -1, 1, 0, 0, -3, -2}, // A + { 0, 9, -3, -4, -2, -3, -3, -1, -3, -1, -1, -3, -3, -3, -3, -1, -1, -1, -2, -2}, // C + {-2, -3, 6, 2, -3, -1, -1, -3, -1, -4, -3, 1, -1, 0, -2, 0, -1, -3, -4, -3}, // D + {-1, -4, 2, 5, -3, -2, 0, -3, 1, -3, -2, 0, -1, 2, 0, 0, -1, -2, -3, -2}, // E + {-2, -2, -3, -3, 6, -3, -1, 0, -3, 0, 0, -3, -4, -3, -3, -2, -2, -1, 1, 3}, // F + { 0, -3, -1, -2, -3, 6, -2, -4, -2, -4, -3, 0, -2, -2, -2, 0, -2, -3, -2, -3}, // G + {-2, -3, -1, 0, -1, -2, 8, -3, -1, -3, -2, 1, -2, 0, 0, -1, -2, -3, -2, 2}, // H + {-1, -1, -3, -3, 0, -4, -3, 4, -3, 2, 1, -3, -3, -3, -3, -2, -1, 3, -3, -1}, // I + {-1, -3, -1, 1, -3, -2, -1, -3, 5, -2, -1, 0, -1, 1, 2, 0, -1, -2, -3, -2}, // K + {-1, -1, -4, -3, 0, -4, -3, 2, -2, 4, 2, -3, -3, -2, -2, -2, -1, 1, -2, -1}, // L + {-1, -1, -3, -2, 0, -3, -2, 1, -1, 2, 5, -2, -2, 0, -1, -1, -1, 1, -1, -1}, // M + {-2, -3, 1, 0, -3, 0, 1, -3, 0, -3, -2, 6, -2, 0, 0, 1, 0, -3, -4, -2}, // N + {-1, -3, -1, -1, -4, -2, -2, -3, -1, -3, -2, -2, 7, -1, -2, -1, -1, -2, -4, -3}, // P + {-1, -3, 0, 2, -3, -2, 0, -3, 1, -2, 0, 0, -1, 5, 1, 0, -1, -2, -2, -1}, // Q + {-1, -3, -2, 0, -3, -2, 0, -3, 2, -2, -1, 0, -2, 1, 5, -1, -1, -3, -3, -2}, // R + { 1, -1, 0, 0, -2, 0, -1, -2, 0, -2, -1, 1, -1, 0, -1, 4, 1, -2, -3, -2}, // S + { 0, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1, 0, -1, -1, -1, 1, 5, 0, -2, -2}, // T + { 0, -1, -3, -2, -1, -3, -3, 3, -2, 1, 1, -3, -2, -2, -3, -2, 0, 4, -3, -1}, // V + {-3, -2, -4, -3, 1, -2, -2, -3, -3, -2, -1, -4, -4, -2, -3, -3, -2, -3, 11, 2}, // W + {-2, -2, -3, -2, 3, -3, 2, -1, -2, -1, -1, -2, -3, -1, -2, -2, -2, -1, 2, 7}, // Y + }; + +double BLOSUM62_Expected = -0.5209; diff --git a/src/muscle/muscle3.8.31/src/blosumla.cpp b/src/muscle/muscle3.8.31/src/blosumla.cpp new file mode 100644 index 0000000..42be10f --- /dev/null +++ b/src/muscle/muscle3.8.31/src/blosumla.cpp @@ -0,0 +1,118 @@ +#include "muscle.h" + +#define GAPVAL 0.3 
+#define GAPGAPVAL 5.0 + +// Blosum62 log-average factor matrix +static float Blosum62LA[20][20] = + { +#define v(x) ((float) x) +#define S_ROW(n, c, A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y) \ + { v(A), v(C), v(D), v(E), v(F), v(G), v(H), v(I), v(K), v(L), v(M), v(N), v(P), v(Q), \ + v(R), v(S), v(T), v(V), v(W), v(Y) }, + +// Blosum62 log average matrix +// A C D E F +// G H I K L +// M N P Q R +// S T V W Y +S_ROW( 0, 'A', 3.9029401, 0.8679881, 0.5446049, 0.7412640, 0.4648942, + 1.0568696, 0.5693654, 0.6324813, 0.7753898, 0.6019460, + 0.7231498, 0.5883077, 0.7541214, 0.7568035, 0.6126988, + 1.4721037, 0.9844022, 0.9364584, 0.4165484, 0.5426125) + +S_ROW( 1, 'C', 0.8679881, 19.5765802, 0.3014542, 0.2859347, 0.4389910, + 0.4203886, 0.3550472, 0.6534589, 0.3491296, 0.6422760, + 0.6113537, 0.3978026, 0.3795628, 0.3657796, 0.3089379, + 0.7384148, 0.7405530, 0.7558448, 0.4499807, 0.4342013) + +S_ROW( 2, 'D', 0.5446049, 0.3014542, 7.3979253, 1.6878109, 0.2989696, + 0.6343015, 0.6785593, 0.3390155, 0.7840905, 0.2866128, + 0.3464547, 1.5538520, 0.5987177, 0.8970811, 0.5732000, + 0.9135051, 0.6947898, 0.3365004, 0.2321050, 0.3456829) + +S_ROW( 3, 'E', 0.7412640, 0.2859347, 1.6878109, 5.4695276, 0.3307441, + 0.4812675, 0.9600400, 0.3305223, 1.3082782, 0.3728734, + 0.5003421, 0.9112983, 0.6792027, 1.9017376, 0.9607983, + 0.9503570, 0.7414260, 0.4289431, 0.3743021, 0.4964664) + +S_ROW( 4, 'F', 0.4648942, 0.4389910, 0.2989696, 0.3307441, 8.1287983, + 0.3406407, 0.6519893, 0.9457698, 0.3440433, 1.1545978, + 1.0043715, 0.3542882, 0.2874440, 0.3339729, 0.3807263, + 0.4399736, 0.4816930, 0.7450894, 1.3743775, 2.7693817) + +S_ROW( 5, 'G', 1.0568696, 0.4203886, 0.6343015, 0.4812675, 0.3406407, + 6.8763075, 0.4929663, 0.2750096, 0.5888716, 0.2845039, + 0.3954865, 0.8637114, 0.4773858, 0.5386498, 0.4499840, + 0.9035965, 0.5792712, 0.3369551, 0.4216898, 0.3487141) + +S_ROW( 6, 'H', 0.5693654, 0.3550472, 0.6785593, 0.9600400, 0.6519893, + 0.4929663, 13.5060070, 
0.3262878, 0.7788884, 0.3806759, + 0.5841316, 1.2220028, 0.4728797, 1.1679835, 0.9170473, + 0.7367319, 0.5575021, 0.3394474, 0.4440859, 1.7979036) + +S_ROW( 7, 'I', 0.6324813, 0.6534589, 0.3390155, 0.3305223, 0.9457698, + 0.2750096, 0.3262878, 3.9979299, 0.3963730, 1.6944349, + 1.4777449, 0.3279345, 0.3846629, 0.3829375, 0.3547509, + 0.4431634, 0.7798163, 2.4175121, 0.4088732, 0.6303898) + +S_ROW( 8, 'K', 0.7753898, 0.3491296, 0.7840905, 1.3082782, 0.3440433, + 0.5888716, 0.7788884, 0.3963730, 4.7643359, 0.4282702, + 0.6253033, 0.9398419, 0.7037741, 1.5543233, 2.0768092, + 0.9319192, 0.7929060, 0.4565429, 0.3589319, 0.5321784) + +S_ROW( 9, 'L', 0.6019460, 0.6422760, 0.2866128, 0.3728734, 1.1545978, + 0.2845039, 0.3806759, 1.6944349, 0.4282702, 3.7966214, + 1.9942957, 0.3100430, 0.3711219, 0.4773261, 0.4739194, + 0.4288939, 0.6603292, 1.3142355, 0.5680359, 0.6920589) + +S_ROW(10, 'M', 0.7231498, 0.6113537, 0.3464547, 0.5003421, 1.0043715, + 0.3954865, 0.5841316, 1.4777449, 0.6253033, 1.9942957, + 6.4814549, 0.4745299, 0.4238960, 0.8642486, 0.6226249, + 0.5985578, 0.7938018, 1.2689365, 0.6103022, 0.7083636) + +S_ROW(11, 'N', 0.5883077, 0.3978026, 1.5538520, 0.9112983, 0.3542882, + 0.8637114, 1.2220028, 0.3279345, 0.9398419, 0.3100430, + 0.4745299, 7.0940964, 0.4999337, 1.0005835, 0.8586298, + 1.2315289, 0.9841525, 0.3690340, 0.2777841, 0.4860309) + +S_ROW(12, 'P', 0.7541214, 0.3795628, 0.5987177, 0.6792027, 0.2874440, + 0.4773858, 0.4728797, 0.3846629, 0.7037741, 0.3711219, + 0.4238960, 0.4999337, 12.8375452, 0.6412803, 0.4815348, + 0.7555033, 0.6888962, 0.4430825, 0.2818321, 0.3635216) + +S_ROW(13, 'Q', 0.7568035, 0.3657796, 0.8970811, 1.9017376, 0.3339729, + 0.5386498, 1.1679835, 0.3829375, 1.5543233, 0.4773261, + 0.8642486, 1.0005835, 0.6412803, 6.2444210, 1.4057958, + 0.9655559, 0.7913219, 0.4667781, 0.5093584, 0.6110951) + +S_ROW(14, 'R', 0.6126988, 0.3089379, 0.5732000, 0.9607983, 0.3807263, + 0.4499840, 0.9170473, 0.3547509, 2.0768092, 0.4739194, + 0.6226249, 
0.8586298, 0.4815348, 1.4057958, 6.6655769,
+    0.7671661, 0.6777544, 0.4200721, 0.3951049, 0.5559652)
+
+S_ROW(15, 'S', 1.4721037, 0.7384148, 0.9135051, 0.9503570, 0.4399736,
+    0.9035965, 0.7367319, 0.4431634, 0.9319192, 0.4288939,
+    0.5985578, 1.2315289, 0.7555033, 0.9655559, 0.7671661,
+    3.8428476, 1.6139205, 0.5652240, 0.3853031, 0.5575206)
+
+S_ROW(16, 'T', 0.9844022, 0.7405530, 0.6947898, 0.7414260, 0.4816930,
+    0.5792712, 0.5575021, 0.7798163, 0.7929060, 0.6603292,
+    0.7938018, 0.9841525, 0.6888962, 0.7913219, 0.6777544,
+    1.6139205, 4.8321048, 0.9809432, 0.4309317, 0.5731577)
+
+S_ROW(17, 'V', 0.9364584, 0.7558448, 0.3365004, 0.4289431, 0.7450894,
+    0.3369551, 0.3394474, 2.4175121, 0.4565429, 1.3142355,
+    1.2689365, 0.3690340, 0.4430825, 0.4667781, 0.4200721,
+    0.5652240, 0.9809432, 3.6921553, 0.3744576, 0.6580390)
+
+S_ROW(18, 'W', 0.4165484, 0.4499807, 0.2321050, 0.3743021, 1.3743775,
+    0.4216898, 0.4440859, 0.4088732, 0.3589319, 0.5680359,
+    0.6103022, 0.2777841, 0.2818321, 0.5093584, 0.3951049,
+    0.3853031, 0.4309317, 0.3744576, 38.1077830, 2.1098056)
+
+S_ROW(19, 'Y', 0.5426125, 0.4342013, 0.3456829, 0.4964664, 2.7693817,
+    0.3487141, 1.7979036, 0.6303898, 0.5321784, 0.6920589,
+    0.7083636, 0.4860309, 0.3635216, 0.6110951, 0.5559652,
+    0.5575206, 0.5731577, 0.6580390, 2.1098056, 9.8322054)
+    };
diff --git a/src/muscle/muscle3.8.31/src/clust.cpp b/src/muscle/muscle3.8.31/src/clust.cpp
new file mode 100644
index 0000000..97e9d7c
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/clust.cpp
@@ -0,0 +1,666 @@
+#include "muscle.h"
+#include "clust.h"
+#include "clustset.h"
+// NOTE(review): the header name of the following #include is missing in this
+// copy of the patch (the angle-bracket text appears to have been stripped);
+// restore it from the upstream file before building.
+#include
+
+#define TRACE 0
+
+// Construct an empty clustering. Every pointer member is nulled so that the
+// delete[] calls in the destructor are well defined even when Create() was
+// never called on this object.
+Clust::Clust()
+    {
+    m_JoinStyle = JOIN_Undefined;
+    m_Nodes = 0;
+    m_ClusterIndexToNodeIndex = 0;
+    m_NodeIndexToClusterIndex = 0;
+    m_uLeafCount = 0;
+    m_uNodeCount = 0;
+    m_uClusterCount = 0;
+    m_uTriangularMatrixSize = 0;
+    m_dDist = 0;
+    m_ptrSet = 0;
+    m_ptrClusterList = 0;
+    }
+
+// Release the node array, the distance matrix and the index map.
+Clust::~Clust()
+    {
+    delete[] m_Nodes;
+    delete[] m_dDist;
+    delete[] m_ClusterIndexToNodeIndex;
+    }
+
+// Build the complete tree for the leaves supplied by Set.
+//   Set     source of the leaf count and of leaf-to-leaf distances
+//   Method  selects the join rule and the linkage used for merged clusters
+// Quits on an unknown Method or when there are fewer than two leaves.
+void Clust::Create(ClustSet &Set, CLUSTER Method)
+    {
+    m_ptrSet = &Set;
+
+    // Also allocates the triangular distance matrix.
+    SetLeafCount(Set.GetLeafCount());
+
+    switch (Method)
+        {
+    case CLUSTER_UPGMA:
+        m_JoinStyle = JOIN_NearestNeighbor;
+        m_CentroidStyle = LINKAGE_Avg;
+        break;
+
+    case CLUSTER_UPGMAMax:
+        m_JoinStyle = JOIN_NearestNeighbor;
+        m_CentroidStyle = LINKAGE_Max;
+        break;
+
+    case CLUSTER_UPGMAMin:
+        m_JoinStyle = JOIN_NearestNeighbor;
+        m_CentroidStyle = LINKAGE_Min;
+        break;
+
+    case CLUSTER_UPGMB:
+        m_JoinStyle = JOIN_NearestNeighbor;
+        m_CentroidStyle = LINKAGE_Biased;
+        break;
+
+    case CLUSTER_NeighborJoining:
+        m_JoinStyle = JOIN_NeighborJoining;
+        m_CentroidStyle = LINKAGE_NeighborJoining;
+        break;
+
+    default:
+        Quit("Clust::Create, invalid method %d", Method);
+        }
+
+    if (m_uLeafCount <= 1)
+        Quit("Clust::Create: no leaves");
+
+    // A binary tree over N leaves has N-1 internal nodes.
+    m_uNodeCount = 2*m_uLeafCount - 1;
+
+    // Release storage left over from an earlier Create() on this object
+    // (both pointers are null on a freshly constructed Clust).
+    delete[] m_Nodes;
+    delete[] m_ClusterIndexToNodeIndex;
+    m_Nodes = new ClustNode[m_uNodeCount];
+    m_ClusterIndexToNodeIndex = new unsigned[m_uLeafCount];
+
+    // Leaves occupy indexes [0, m_uLeafCount); each starts as a cluster of
+    // size one. Internal nodes start empty and are filled in by JoinNodes.
+    m_ptrClusterList = 0;
+    for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex)
+        {
+        ClustNode &Node = m_Nodes[uNodeIndex];
+        Node.m_uIndex = uNodeIndex;
+        if (uNodeIndex < m_uLeafCount)
+            {
+            Node.m_uSize = 1;
+            Node.m_uLeafIndexes = new unsigned[1];
+            Node.m_uLeafIndexes[0] = uNodeIndex;
+            AddToClusterList(uNodeIndex);
+            }
+        else
+            Node.m_uSize = 0;
+        }
+
+// Compute initial distance matrix between leaves
+    SetProgressDesc("Build dist matrix");
+    unsigned uPairIndex = 0;
+    const unsigned uPairCount = (m_uLeafCount*(m_uLeafCount - 1))/2;
+    for (unsigned i = 0; i < m_uLeafCount; ++i)
+        for (unsigned j = 0; j < i; ++j)
+            {
+            const float dDist = (float) m_ptrSet->ComputeDist(*this, i, j);
+            SetDist(i, j, dDist);
+            // Report progress only every 10000 pairs.
+            if (0 == uPairIndex%10000)
+                Progress(uPairIndex, uPairCount);
+            ++uPairIndex;
+            }
+    ProgressStepsDone();
+
+// Call CreateCluster once for each internal node in the tree
+    SetProgressDesc("Build guide tree");
+    m_uClusterCount = m_uLeafCount;
+    const unsigned uInternalNodeCount = m_uNodeCount - m_uLeafCount;
+    for (unsigned uNodeIndex = m_uLeafCount; uNodeIndex < m_uNodeCount; ++uNodeIndex)
+        {
+        unsigned i = uNodeIndex + 1 - m_uLeafCount;
+        Progress(i, uInternalNodeCount);
+        CreateCluster();
+        }
+    ProgressStepsDone();
+    }
+
+// Perform one merge: pick the pair to join, create the parent node and
+// compute the parent's distance to every cluster still on the active list.
+void Clust::CreateCluster()
+    {
+    unsigned uLeftNodeIndex;
+    unsigned uRightNodeIndex;
+    float dLeftLength;
+    float dRightLength;
+    ChooseJoin(&uLeftNodeIndex, &uRightNodeIndex, &dLeftLength, &dRightLength);
+
+    // With m_uNodeCount = 2*leaves - 1 this yields m_uLeafCount on the first
+    // merge and grows by one per merge as m_uClusterCount shrinks.
+    const unsigned uNewNodeIndex = m_uNodeCount - m_uClusterCount + 1;
+
+    JoinNodes(uLeftNodeIndex, uRightNodeIndex, dLeftLength, dRightLength,
+      uNewNodeIndex);
+
+#if TRACE
+    Log("Merge New=%u L=%u R=%u Ld=%7.2g Rd=%7.2g\n",
+      uNewNodeIndex, uLeftNodeIndex, uRightNodeIndex, dLeftLength, dRightLength);
+#endif
+
+// Compute distances to other clusters
+    --m_uClusterCount;
+    for (unsigned uNodeIndex = GetFirstCluster(); uNodeIndex != uInsane;
+      uNodeIndex = GetNextCluster(uNodeIndex))
+        {
+        if (uNodeIndex == uLeftNodeIndex || uNodeIndex == uRightNodeIndex)
+            continue;
+
+        if (uNewNodeIndex == uNodeIndex)
+            continue;
+
+        const float dDist = ComputeDist(uNewNodeIndex, uNodeIndex);
+        SetDist(uNewNodeIndex, uNodeIndex, dDist);
+        }
+
+    // This second pass only does work when REDLACK is defined non-zero.
+    for (unsigned uNodeIndex = GetFirstCluster(); uNodeIndex != uInsane;
+      uNodeIndex = GetNextCluster(uNodeIndex))
+        {
+        if (uNodeIndex == uLeftNodeIndex || uNodeIndex == uRightNodeIndex)
+            continue;
+
+        if (uNewNodeIndex == uNodeIndex)
+            continue;
+
+#if REDLACK
+        const float dMetric = ComputeMetric(uNewNodeIndex, uNodeIndex);
+        InsertMetric(uNewNodeIndex, uNodeIndex, dMetric);
+#endif
+        }
+    }
+
+// Dispatch on m_JoinStyle to choose the next pair of clusters to merge and
+// the branch lengths from the new parent to each child.
+void Clust::ChooseJoin(unsigned *ptruLeftIndex, unsigned *ptruRightIndex,
+  float *ptrdLeftLength, float *ptrdRightLength)
+    {
+    switch (m_JoinStyle)
+        {
+    case JOIN_NearestNeighbor:
+        ChooseJoinNearestNeighbor(ptruLeftIndex, ptruRightIndex, ptrdLeftLength,
+          ptrdRightLength);
+        return;
+    case JOIN_NeighborJoining:
+        ChooseJoinNeighborJoining(ptruLeftIndex, ptruRightIndex, ptrdLeftLength,
+          ptrdRightLength);
+        return;
+        }
+    Quit("Clust::ChooseJoin, Invalid join style %u", m_JoinStyle);
+    }
+
+// Join the pair with the smallest metric. Each branch length is half the
+// pair distance minus the height already accumulated below that child.
+void Clust::ChooseJoinNearestNeighbor(unsigned *ptruLeftIndex,
+  unsigned *ptruRightIndex, float *ptrdLeftLength, float *ptrdRightLength)
+    {
+    unsigned uMinLeftNodeIndex;
+    unsigned uMinRightNodeIndex;
+    GetMinMetric(&uMinLeftNodeIndex, &uMinRightNodeIndex);
+
+    float dMinDist = GetDist(uMinLeftNodeIndex, uMinRightNodeIndex);
+
+    const float dLeftHeight = GetHeight(uMinLeftNodeIndex);
+    const float dRightHeight = GetHeight(uMinRightNodeIndex);
+
+    *ptruLeftIndex = uMinLeftNodeIndex;
+    *ptruRightIndex = uMinRightNodeIndex;
+    *ptrdLeftLength = dMinDist/2 - dLeftHeight;
+    *ptrdRightLength = dMinDist/2 - dRightHeight;
+    }
+
+// Join the pair with the smallest metric and derive the two branch lengths
+// from the pair distance and the Calc_r() value of each side.
+void Clust::ChooseJoinNeighborJoining(unsigned *ptruLeftIndex,
+  unsigned *ptruRightIndex, float *ptrdLeftLength, float *ptrdRightLength)
+    {
+    // Earlier inline search, superseded by GetMinMetric() below.
+    //unsigned uMinLeftNodeIndex = uInsane;
+    //unsigned uMinRightNodeIndex = uInsane;
+    //float dMinD = PLUS_INFINITY;
+    //for (unsigned i = GetFirstCluster(); i != uInsane; i = GetNextCluster(i))
+    //    {
+    //    const float ri = Calc_r(i);
+    //    for (unsigned j = GetNextCluster(i); j != uInsane; j = GetNextCluster(j))
+    //        {
+    //        const float rj = Calc_r(j);
+    //        const float dij = GetDist(i, j);
+    //        const float Dij = dij - (ri + rj);
+    //        if (Dij < dMinD)
+    //            {
+    //            dMinD = Dij;
+    //            uMinLeftNodeIndex = i;
+    //            uMinRightNodeIndex = j;
+    //            }
+    //        }
+    //    }
+
+    unsigned uMinLeftNodeIndex;
+    unsigned uMinRightNodeIndex;
+    GetMinMetric(&uMinLeftNodeIndex, &uMinRightNodeIndex);
+
+    const float dDistLR = GetDist(uMinLeftNodeIndex, uMinRightNodeIndex);
+    const float rL = Calc_r(uMinLeftNodeIndex);
+    const float rR = Calc_r(uMinRightNodeIndex);
+
+    const float dLeftLength = (dDistLR + rL - rR)/2;
+    const float dRightLength = (dDistLR - rL + rR)/2;
+
+    *ptruLeftIndex = uMinLeftNodeIndex;
+    *ptruRightIndex = uMinRightNodeIndex;
+    *ptrdLeftLength = dLeftLength;
+    *ptrdRightLength = dRightLength;
+    }
+
+// Make node uNodeIndex the parent of uLeftIndex and uRightIndex: record the
+// branch lengths, wire up the tree pointers, concatenate the children's leaf
+// lists into the parent, and replace the children by the parent on the
+// active cluster list.
+void Clust::JoinNodes(unsigned uLeftIndex, unsigned uRightIndex, float dLeftLength,
+  float dRightLength, unsigned uNodeIndex)
+    {
+    ClustNode &Parent = m_Nodes[uNodeIndex];
+    ClustNode &Left = m_Nodes[uLeftIndex];
+    ClustNode &Right = m_Nodes[uRightIndex];
+
+    Left.m_dLength = dLeftLength;
+    Right.m_dLength = dRightLength;
+
+    Parent.m_ptrLeft = &Left;
+    Parent.m_ptrRight = &Right;
+
+    Left.m_ptrParent = &Parent;
+    Right.m_ptrParent = &Parent;
+
+    const unsigned uLeftSize = Left.m_uSize;
+    const unsigned uRightSize = Right.m_uSize;
+    const unsigned uParentSize = uLeftSize + uRightSize;
+    Parent.m_uSize = uParentSize;
+
+    // The parent must not have been joined before.
+    assert(0 == Parent.m_uLeafIndexes);
+    Parent.m_uLeafIndexes = new unsigned[uParentSize];
+
+    // Left leaves first, then right leaves.
+    const unsigned uLeftBytes = uLeftSize*sizeof(unsigned);
+    const unsigned uRightBytes = uRightSize*sizeof(unsigned);
+    memcpy(Parent.m_uLeafIndexes, Left.m_uLeafIndexes, uLeftBytes);
+    memcpy(Parent.m_uLeafIndexes + uLeftSize, Right.m_uLeafIndexes, uRightBytes);
+
+    DeleteFromClusterList(uLeftIndex);
+    DeleteFromClusterList(uRightIndex);
+    AddToClusterList(uNodeIndex);
+    }
+
+// Sum of distances from uNodeIndex to every other active cluster, divided by
+// (cluster count - 2). Returns 0 when exactly two clusters remain, which
+// avoids the division by zero.
+float Clust::Calc_r(unsigned uNodeIndex) const
+    {
+    const unsigned uClusterCount = GetClusterCount();
+    if (2 == uClusterCount)
+        return 0;
+
+    float dSum = 0;
+    for (unsigned i = GetFirstCluster(); i != uInsane; i = GetNextCluster(i))
+        {
+        if (i == uNodeIndex)
+            continue;
+        dSum += GetDist(uNodeIndex, i);
+        }
+    return dSum/(uClusterCount - 2);
+    }
+
+// Distance between the freshly joined node uNewNodeIndex and another cluster
+// uNodeIndex, computed with the linkage selected by m_CentroidStyle.
+float Clust::ComputeDist(unsigned uNewNodeIndex, unsigned uNodeIndex)
+    {
+    switch (m_CentroidStyle)
+        {
+    case LINKAGE_Avg:
+        return ComputeDistAverageLinkage(uNewNodeIndex, uNodeIndex);
+
+    case LINKAGE_Min:
+        return ComputeDistMinLinkage(uNewNodeIndex, uNodeIndex);
+
+    case LINKAGE_Max:
+        return ComputeDistMaxLinkage(uNewNodeIndex, uNodeIndex);
+
+    case LINKAGE_Biased:
+        return ComputeDistMAFFT(uNewNodeIndex, uNodeIndex);
+
+    case LINKAGE_NeighborJoining:
+        return ComputeDistNeighborJoining(uNewNodeIndex, uNodeIndex);
+        }
+    Quit("Clust::ComputeDist, invalid centroid style %u", m_CentroidStyle);
+    return (float) g_dNAN;
+    }
+
+// Linkage: the smaller of the two child distances to uNodeIndex.
+float Clust::ComputeDistMinLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex)
+    {
+    const unsigned uLeftNodeIndex = GetLeftIndex(uNewNodeIndex);
+    const unsigned uRightNodeIndex = GetRightIndex(uNewNodeIndex);
+    const float dDistL = GetDist(uLeftNodeIndex, uNodeIndex);
+    const float dDistR = GetDist(uRightNodeIndex, uNodeIndex);
+    return (dDistL < dDistR ? dDistL : dDistR);
+    }
+
+// Linkage: the larger of the two child distances to uNodeIndex.
+float Clust::ComputeDistMaxLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex)
+    {
+    const unsigned uLeftNodeIndex = GetLeftIndex(uNewNodeIndex);
+    const unsigned uRightNodeIndex = GetRightIndex(uNewNodeIndex);
+    const float dDistL = GetDist(uLeftNodeIndex, uNodeIndex);
+    const float dDistR = GetDist(uRightNodeIndex, uNodeIndex);
+    return (dDistL > dDistR ? dDistL : dDistR);
+    }
+
+// Linkage: the plain mean of the two child distances to uNodeIndex
+// (cluster sizes are not used as weights).
+float Clust::ComputeDistAverageLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex)
+    {
+    const unsigned uLeftNodeIndex = GetLeftIndex(uNewNodeIndex);
+    const unsigned uRightNodeIndex = GetRightIndex(uNewNodeIndex);
+    const float dDistL = GetDist(uLeftNodeIndex, uNodeIndex);
+    const float dDistR = GetDist(uRightNodeIndex, uNodeIndex);
+    return (dDistL + dDistR)/2;
+    }
+
+// Linkage for neighbor joining: (d(L,k) + d(R,k) - d(L,R))/2, where L and R
+// are the children of the new node and k is uNodeIndex.
+float Clust::ComputeDistNeighborJoining(unsigned uNewNodeIndex, unsigned uNodeIndex)
+    {
+    const unsigned uLeftNodeIndex = GetLeftIndex(uNewNodeIndex);
+    const unsigned uRightNodeIndex = GetRightIndex(uNewNodeIndex);
+    const float dDistLR = GetDist(uLeftNodeIndex, uRightNodeIndex);
+    const float dDistL = GetDist(uLeftNodeIndex, uNodeIndex);
+    const float dDistR = GetDist(uRightNodeIndex, uNodeIndex);
+    const float dDist = (dDistL + dDistR - dDistLR)/2;
+    return dDist;
+    }
+
+// This is a mysterious variant of UPGMA reverse-engineered from MAFFT source.
+// Result = min(dL, dR)*(1 - g_dSUEFF) + mean(dL, dR)*g_dSUEFF, where dL and
+// dR are the distances from the two children of uNewNodeIndex to uNodeIndex.
+float Clust::ComputeDistMAFFT(unsigned uNewNodeIndex, unsigned uNodeIndex)
+    {
+    const unsigned uLeftNodeIndex = GetLeftIndex(uNewNodeIndex);
+    const unsigned uRightNodeIndex = GetRightIndex(uNewNodeIndex);
+
+    // NOTE(review): dDistLR is fetched but does not enter the result.
+    const float dDistLR = GetDist(uLeftNodeIndex, uRightNodeIndex);
+    const float dDistL = GetDist(uLeftNodeIndex, uNodeIndex);
+    const float dDistR = GetDist(uRightNodeIndex, uNodeIndex);
+    const float dMinDistLR = (dDistL < dDistR ? dDistL : dDistR);
+    const float dSumDistLR = dDistL + dDistR;
+    const float dDist = dMinDistLR*(1 - g_dSUEFF) + dSumDistLR*g_dSUEFF/2;
+    return dDist;
+    }
+
+// Number of clusters currently on the active list.
+unsigned Clust::GetClusterCount() const
+    {
+    return m_uClusterCount;
+    }
+
+// Dump the counts, the lower-triangular distance matrix and one row per node
+// (size, parent, children, branch length, leaf name, root marker) to the log.
+void Clust::LogMe() const
+    {
+    Log("Clust %u leaves, %u nodes, %u clusters.\n",
+      m_uLeafCount, m_uNodeCount, m_uClusterCount);
+
+    Log("Distance matrix\n");
+    const unsigned uNodeCount = GetNodeCount();
+    Log(" ");
+    for (unsigned i = 0; i < uNodeCount - 1; ++i)
+        Log(" %7u", i);
+    Log("\n");
+
+    Log(" ");
+    for (unsigned i = 0; i < uNodeCount - 1; ++i)
+        Log(" ------");
+    Log("\n");
+
+    for (unsigned i = 0; i < uNodeCount - 1; ++i)
+        {
+        Log("%4u: ", i);
+        for (unsigned j = 0; j < i; ++j)
+            Log(" %7.2g", GetDist(i, j));
+        Log("\n");
+        }
+
+    Log("\n");
+    Log("Node Size Prnt Left Rght Length Name\n");
+    Log("---- ---- ---- ---- ---- ------ ----\n");
+    for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex)
+        {
+        const ClustNode &Node = m_Nodes[uNodeIndex];
+        Log("%4u %4u", uNodeIndex, Node.m_uSize);
+        if (0 != Node.m_ptrParent)
+            Log(" %4u", Node.m_ptrParent->m_uIndex);
+        else
+            Log(" ");
+
+        if (0 != Node.m_ptrLeft)
+            Log(" %4u", Node.m_ptrLeft->m_uIndex);
+        else
+            Log(" ");
+
+        if (0 != Node.m_ptrRight)
+            Log(" %4u", Node.m_ptrRight->m_uIndex);
+        else
+            Log(" ");
+
+        // The last node is the root and has no branch above it.
+        if (uNodeIndex != m_uNodeCount - 1)
+            Log(" %7.3g", Node.m_dLength);
+        if (IsLeaf(uNodeIndex))
+            {
+            const char *ptrName = GetNodeName(uNodeIndex);
+            if (0 != ptrName)
+                Log(" %s", ptrName);
+            }
+        if (GetRootNodeIndex() == uNodeIndex)
+            Log(" [ROOT]");
+        Log("\n");
+        }
+    }
+
+// Bounds-checked access to a node; quits on an out-of-range index.
+const ClustNode &Clust::GetNode(unsigned uNodeIndex) const
+    {
+    if (uNodeIndex >= m_uNodeCount)
+        Quit("ClustNode::GetNode(%u) %u", uNodeIndex, m_uNodeCount);
+    return m_Nodes[uNodeIndex];
+    }
+
+// Leaves occupy the first m_uLeafCount node indexes.
+bool Clust::IsLeaf(unsigned uNodeIndex) const
+    {
+    return uNodeIndex < m_uLeafCount;
+    }
+
+// Number of leaves recorded for the given node.
+unsigned Clust::GetClusterSize(unsigned uNodeIndex) const
+    {
+    const ClustNode &Node = GetNode(uNodeIndex);
+    return Node.m_uSize;
+    }
+
+// Index of the left child; quits if the node has none.
+unsigned Clust::GetLeftIndex(unsigned uNodeIndex) const
+    {
+    const ClustNode &Node = GetNode(uNodeIndex);
+    if (0 == Node.m_ptrLeft)
+        Quit("Clust::GetLeftIndex: leaf");
+    return Node.m_ptrLeft->m_uIndex;
+    }
+
+// Index of the right child; quits if the node has none.
+unsigned Clust::GetRightIndex(unsigned uNodeIndex) const
+    {
+    const ClustNode &Node = GetNode(uNodeIndex);
+    if (0 == Node.m_ptrRight)
+        Quit("Clust::GetRightIndex: leaf");
+    return Node.m_ptrRight->m_uIndex;
+    }
+
+// Length of the branch from the node up to its parent.
+float Clust::GetLength(unsigned uNodeIndex) const
+    {
+    const ClustNode &Node = GetNode(uNodeIndex);
+    return Node.m_dLength;
+    }
+
+// Record the leaf count and allocate the triangular distance matrix sized
+// for all 2*leaves - 1 nodes. Quits when fewer than two leaves are given.
+void Clust::SetLeafCount(unsigned uLeafCount)
+    {
+    if (uLeafCount <= 1)
+        Quit("Clust::SetLeafCount(%u)", uLeafCount);
+
+    m_uLeafCount = uLeafCount;
+    const unsigned uNodeCount = GetNodeCount();
+
+// Triangular matrix size excluding diagonal (all zeros in our case).
+    // NOTE(review): the product is evaluated in unsigned arithmetic and can
+    // wrap for very large node counts.
+    m_uTriangularMatrixSize = (uNodeCount*(uNodeCount - 1))/2;
+    // Free any matrix from a previous call so repeated use does not leak
+    // (m_dDist is null on a freshly constructed object).
+    delete[] m_dDist;
+    m_dDist = new float[m_uTriangularMatrixSize];
+    }
+
+// Number of leaves set by SetLeafCount.
+unsigned Clust::GetLeafCount() const
+    {
+    return m_uLeafCount;
+    }
+
+// Map an unordered pair of distinct node indexes to its slot in the
+// triangular matrix: smaller + larger*(larger - 1)/2. Quits when either
+// index is out of range. Equal indexes (the diagonal) have no slot of their
+// own and must not be passed.
+unsigned Clust::VectorIndex(unsigned uIndex1, unsigned uIndex2) const
+    {
+    const unsigned uNodeCount = GetNodeCount();
+    if (uIndex1 >= uNodeCount || uIndex2 >= uNodeCount)
+        Quit("DistVectorIndex(%u,%u) %u", uIndex1, uIndex2, uNodeCount);
+    unsigned v;
+    if (uIndex1 >= uIndex2)
+        v = uIndex2 + (uIndex1*(uIndex1 - 1))/2;
+    else
+        v = uIndex1 + (uIndex2*(uIndex2 - 1))/2;
+    assert(v < m_uTriangularMatrixSize);
+    return v;
+    }
+
+// Stored distance between two nodes (order of the arguments is irrelevant).
+float Clust::GetDist(unsigned uIndex1, unsigned uIndex2) const
+    {
+    unsigned v = VectorIndex(uIndex1, uIndex2);
+    return m_dDist[v];
+    }
+
+// Store the distance between two nodes (order of the arguments is irrelevant).
+void Clust::SetDist(unsigned uIndex1, unsigned uIndex2, float dDist)
+    {
+    unsigned v = VectorIndex(uIndex1, uIndex2);
+    m_dDist[v] = dDist;
+    }
+
+// Height of a node: 0 for a leaf, otherwise the mean over both children of
+// (branch length to the child + height of the child). Recursive.
+float Clust::GetHeight(unsigned uNodeIndex) const
+    {
+    if (IsLeaf(uNodeIndex))
+        return 0;
+
+    const unsigned uLeftIndex = GetLeftIndex(uNodeIndex);
+    const unsigned uRightIndex = GetRightIndex(uNodeIndex);
+    const float dLeftLength = GetLength(uLeftIndex);
+    const float dRightLength = GetLength(uRightIndex);
+    const float dLeftHeight = dLeftLength + GetHeight(uLeftIndex);
+    const float dRightHeight = dRightLength + GetHeight(uRightIndex);
+    return (dLeftHeight + dRightHeight)/2;
+    }
+
+// Name of a leaf as reported by the ClustSet; quits for internal nodes.
+const char *Clust::GetNodeName(unsigned uNodeIndex) const
+    {
+    if (!IsLeaf(uNodeIndex))
+        Quit("Clust::GetNodeName, is not leaf");
+    return m_ptrSet->GetLeafName(uNodeIndex);
+    }
+
+// Id of a leaf as reported by the ClustSet; 0 for internal nodes.
+unsigned Clust::GetNodeId(unsigned uNodeIndex) const
+    {
+    if (uNodeIndex >= GetLeafCount())
+        return 0;
+    return m_ptrSet->GetLeafId(uNodeIndex);
+    }
+
+// Node index of the uLeafIndex'th leaf stored for cluster uNodeIndex.
+// Quits when uLeafIndex or the stored value is out of range.
+unsigned Clust::GetLeaf(unsigned uNodeIndex, unsigned uLeafIndex) const
+    {
+    const ClustNode &Node = GetNode(uNodeIndex);
+    const unsigned uLeafCount = Node.m_uSize;
+    if (uLeafIndex >= uLeafCount)
+        Quit("Clust::GetLeaf, invalid index");
+    const unsigned uIndex = Node.m_uLeafIndexes[uLeafIndex];
+    if (uIndex >= m_uNodeCount)
+        Quit("Clust::GetLeaf, index out of range");
+    return uIndex;
+    }
+
+// Index of the head of the active cluster list, or uInsane if it is empty.
+unsigned Clust::GetFirstCluster() const
+    {
+    if (0 == m_ptrClusterList)
+        return uInsane;
+    return m_ptrClusterList->m_uIndex;
+    }
+
+// Index of the cluster following uIndex on the active list, or uInsane at
+// the end. uIndex itself is not range-checked here.
+unsigned Clust::GetNextCluster(unsigned uIndex) const
+    {
+    ClustNode *ptrNode = &m_Nodes[uIndex];
+    if (0 == ptrNode->m_ptrNextCluster)
+        return uInsane;
+    return ptrNode->m_ptrNextCluster->m_uIndex;
+    }
+
+// Unlink a node from the doubly linked active cluster list and clear its
+// own list pointers.
+void Clust::DeleteFromClusterList(unsigned uNodeIndex)
+    {
+    assert(uNodeIndex < m_uNodeCount);
+    ClustNode *ptrNode = &m_Nodes[uNodeIndex];
+    ClustNode *ptrPrev = ptrNode->m_ptrPrevCluster;
+    ClustNode *ptrNext = ptrNode->m_ptrNextCluster;
+
+    if (0 != ptrNext)
+        ptrNext->m_ptrPrevCluster = ptrPrev;
+    if (0 == ptrPrev)
+        {
+        // No predecessor: the node must be the list head.
+        assert(m_ptrClusterList == ptrNode);
+        m_ptrClusterList = ptrNext;
+        }
+    else
+        ptrPrev->m_ptrNextCluster = ptrNext;
+
+    ptrNode->m_ptrNextCluster = 0;
+    ptrNode->m_ptrPrevCluster = 0;
+    }
+
+// Push a node onto the front of the active cluster list.
+void Clust::AddToClusterList(unsigned uNodeIndex)
+    {
+    assert(uNodeIndex < m_uNodeCount);
+    ClustNode *ptrNode = &m_Nodes[uNodeIndex];
+
+    if (0 != m_ptrClusterList)
+        m_ptrClusterList->m_ptrPrevCluster = ptrNode;
+
+    ptrNode->m_ptrNextCluster = m_ptrClusterList;
+    ptrNode->m_ptrPrevCluster = 0;
+
+    m_ptrClusterList = ptrNode;
+    }
+
+// Join metric for a pair of clusters, dispatched on m_JoinStyle; the pair
+// with the smallest metric is merged next.
+float Clust::ComputeMetric(unsigned uIndex1, unsigned uIndex2) const
+    {
+    switch (m_JoinStyle)
+        {
+    case JOIN_NearestNeighbor:
+        return ComputeMetricNearestNeighbor(uIndex1, uIndex2);
+
+    case JOIN_NeighborJoining:
+        return ComputeMetricNeighborJoining(uIndex1, uIndex2);
+        }
+    Quit("Clust::ComputeMetric");
+    return 0;
+    }
+
+// Neighbor-joining metric: d(i,j) - (r(i) + r(j)), with r from Calc_r().
+float Clust::ComputeMetricNeighborJoining(unsigned i, unsigned j) const
+    {
+    float ri = Calc_r(i);
+    float rj = Calc_r(j);
+    float dij = GetDist(i, j);
+    float dMetric = dij - (ri + rj);
+    return (float) dMetric;
+    }
+
+// Nearest-neighbor metric: simply the stored distance.
+float Clust::ComputeMetricNearestNeighbor(unsigned i, unsigned j) const
+    {
+    return (float) GetDist(i, j);
+    }
+
+// Scan every unordered pair of active clusters and report the pair with the
+// smallest metric through the two out-parameters; returns that metric.
+// Both indexes stay uInsane if fewer than two clusters are on the list.
+float Clust::GetMinMetricBruteForce(unsigned *ptruIndex1, unsigned *ptruIndex2) const
+    {
+    unsigned uMinLeftNodeIndex = uInsane;
+    unsigned uMinRightNodeIndex = uInsane;
+    float dMinMetric = PLUS_INFINITY;
+    for (unsigned uLeftNodeIndex = GetFirstCluster(); uLeftNodeIndex != uInsane;
+      uLeftNodeIndex = GetNextCluster(uLeftNodeIndex))
+        {
+        for (unsigned uRightNodeIndex = GetNextCluster(uLeftNodeIndex);
+          uRightNodeIndex != uInsane;
+          uRightNodeIndex = GetNextCluster(uRightNodeIndex))
+            {
+            float dMetric = ComputeMetric(uLeftNodeIndex, uRightNodeIndex);
+            if (dMetric < dMinMetric)
+                {
+                dMinMetric = dMetric;
+                uMinLeftNodeIndex = uLeftNodeIndex;
+                uMinRightNodeIndex = uRightNodeIndex;
+                }
+            }
+        }
+    *ptruIndex1 = uMinLeftNodeIndex;
+    *ptruIndex2 = uMinRightNodeIndex;
+    return dMinMetric;
+    }
+
+// Currently a thin wrapper around the brute-force search.
+float Clust::GetMinMetric(unsigned *ptruIndex1, unsigned *ptruIndex2) const
+    {
+    return GetMinMetricBruteForce(ptruIndex1, ptruIndex2);
+    }
diff --git a/src/muscle/muscle3.8.31/src/clust.h b/src/muscle/muscle3.8.31/src/clust.h
new file mode 100644
index 0000000..4d86af6
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/clust.h
@@ -0,0 +1,148 @@
+#ifndef Clust_h
+#define Clust_h
+
+class Clust;
+class ClustNode;
+class ClustSet;
+class Phylip;
+class SortedNode;
+
+const unsigned RB_NIL = ((unsigned) 0xfff0);
+
+// One node of the tree built by Clust. Owns its m_uLeafIndexes array.
+// NOTE(review): copying a ClustNode would make two objects delete the same
+// array; the visible code only creates nodes through new[].
+class ClustNode
+    {
+public:
+    ClustNode()
+        {
+        m_uIndex = uInsane;
+        m_uSize = uInsane;
+        m_dLength = (float) dInsane;
+        m_ptrLeft = 0;
+        m_ptrRight = 0;
+        m_ptrParent = 0;
+        m_ptrNextCluster = 0;
+        m_ptrPrevCluster = 0;
+        m_uLeafIndexes = 0;
+        }
+    ~ClustNode()
+        {
+        delete[] m_uLeafIndexes;
+        }
+    unsigned m_uIndex;              // position of this node in Clust::m_Nodes
+    unsigned m_uSize;               // number of entries in m_uLeafIndexes
+    float m_dLength;                // branch length up to the parent
+    ClustNode *m_ptrLeft;
+    ClustNode *m_ptrRight;
+    ClustNode *m_ptrParent;
+    ClustNode *m_ptrNextCluster;    // links of the active cluster list
+    ClustNode *m_ptrPrevCluster;
+    unsigned *m_uLeafIndexes;       // node indexes of the leaves below
+    };
+
+// Builds a binary tree over the leaves of a ClustSet by repeatedly joining
+// the pair of clusters with the smallest metric (see Create).
+class Clust
+    {
+public:
+    Clust();
+    virtual ~Clust();
+
+    void Create(ClustSet &Set, CLUSTER Method);
+
+    unsigned GetLeafCount() const;
+
+    unsigned GetClusterCount() const;
+    unsigned GetClusterSize(unsigned uNodeIndex) const;
+    unsigned GetLeaf(unsigned uClusterIndex, unsigned uLeafIndex) const;
+
+    unsigned GetNodeCount() const { return 2*m_uLeafCount - 1; }
+    const ClustNode &GetRoot() const { return m_Nodes[GetRootNodeIndex()]; }
+    unsigned GetRootNodeIndex() const { return m_uNodeCount - 1; }
+
+    const ClustNode &GetNode(unsigned uNodeIndex) const;
+    bool IsLeaf(unsigned uNodeIndex) const;
+    unsigned GetLeftIndex(unsigned uNodeIndex) const;
+    unsigned GetRightIndex(unsigned uNodeIndex) const;
+    float GetLength(unsigned uNodeIndex) const;
+    float GetHeight(unsigned uNodeIndex) const;
+    const char *GetNodeName(unsigned uNodeIndex) const;
+    unsigned GetNodeId(unsigned uNodeIndex) const;
+
+    JOIN GetJoinStyle() const { return m_JoinStyle; }
+    LINKAGE GetCentroidStyle() const { return m_CentroidStyle; }
+
+    void SetDist(unsigned uIndex1, unsigned uIndex2, float dDist);
+    float GetDist(unsigned uIndex1, unsigned uIndex2) const;
+
+    void ToPhylip(Phylip &tree);
+
+    void LogMe() const;
+
+//private:
+    void SetLeafCount(unsigned uLeafCount);
+
+    void CreateCluster();
+    void JoinNodes(unsigned uLeftNodeIndex, unsigned uRightNodeIndex,
+      float dLeftLength, float dRightLength, unsigned uNewNodeIndex);
+
+    void ChooseJoin(unsigned *ptruLeftIndex, unsigned *ptruRightIndex,
+      float *ptrdLeftLength, float *ptrdRightLength);
+    void ChooseJoinNeighborJoining(unsigned *ptruLeftIndex, unsigned *ptruRightIndex,
+      float *ptrdLeftLength, float *ptrdRightLength);
+    void ChooseJoinNearestNeighbor(unsigned *ptruLeftIndex, unsigned *ptruRightIndex,
+      float *ptrdLeftLength, float *ptrdRightLength);
+
+    float ComputeDist(unsigned uNewNodeIndex, unsigned uNodeIndex);
+    float ComputeDistAverageLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex);
+    float ComputeDistMinLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex);
+    float ComputeDistMaxLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex);
+    float ComputeDistNeighborJoining(unsigned uNewNewIndex, unsigned uNodeIndex);
+    float ComputeDistMAFFT(unsigned uNewNewIndex, unsigned uNodeIndex);
+
+    float Calc_r(unsigned uNodeIndex) const;
+
+    unsigned VectorIndex(unsigned uIndex1, unsigned uIndex2) const;
+
+    unsigned GetFirstCluster() const;
+    unsigned GetNextCluster(unsigned uNodeIndex) const;
+
+    float ComputeMetric(unsigned uIndex1, unsigned uIndex2) const;
+    float ComputeMetricNearestNeighbor(unsigned i, unsigned j) const;
+    float ComputeMetricNeighborJoining(unsigned i, unsigned j) const;
+
+    void InitMetric(unsigned uMaxNodeIndex);
+    void InsertMetric(unsigned uIndex1, unsigned uIndex2, float dMetric);
+    float GetMinMetric(unsigned *ptruIndex1, unsigned *ptruIndex2) const;
+    float GetMinMetricBruteForce(unsigned *ptruIndex1, unsigned *ptruIndex2) const;
+    void DeleteMetric(unsigned uIndex);
+    void DeleteMetric(unsigned uIndex1, unsigned uIndex2);
+    void ListMetric() const;
+
+    void DeleteFromClusterList(unsigned uNodeIndex);
+    void AddToClusterList(unsigned uNodeIndex);
+
+    void RBDelete(unsigned RBNode);
+    unsigned RBInsert(unsigned i, unsigned j, float fMetric);
+
+    unsigned RBNext(unsigned RBNode) const;
+    unsigned RBPrev(unsigned RBNode) const;
+    unsigned RBMin(unsigned RBNode) const;
+    unsigned RBMax(unsigned RBNode) const;
+
+    void ValidateRB(const char szMsg[] = 0) const;
+    void ValidateRBNode(unsigned Node, const char szMsg[]) const;
+
+//private:
+    JOIN m_JoinStyle;                       // how the next pair is chosen
+    LINKAGE m_CentroidStyle;                // how merged distances are computed
+    ClustNode *m_Nodes;                     // owned; m_uNodeCount entries
+    unsigned *m_ClusterIndexToNodeIndex;    // owned; m_uLeafCount entries
+    unsigned *m_NodeIndexToClusterIndex;
+    unsigned m_uLeafCount;
+    unsigned m_uNodeCount;
+    unsigned m_uClusterCount;               // clusters still to be merged
+    unsigned m_uTriangularMatrixSize;       // entries in m_dDist
+    float *m_dDist;                         // owned; triangular distance matrix
+    ClustSet *m_ptrSet;                     // not owned
+    ClustNode *m_ptrClusterList;            // head of the active cluster list
+    };
+
+#endif // Clust_h
diff --git a/src/muscle/muscle3.8.31/src/cluster.cpp b/src/muscle/muscle3.8.31/src/cluster.cpp
new file mode 100644
index 0000000..9701138
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/cluster.cpp
@@ -0,0 +1,339 @@
+#include "muscle.h"
+#include "cluster.h"
+#include "distfunc.h"
+
+// Smaller of two floats.
+static inline float Min(float d1, float d2)
+    {
+    return d1 < d2 ? d1 : d2;
+    }
+
+// Larger of two floats.
+static inline float Max(float d1, float d2)
+    {
+    return d1 > d2 ? d1 : d2;
+    }
+
+// Arithmetic mean of two floats, computed in double precision.
+static inline float Mean(float d1, float d2)
+    {
+    return (float) ((d1 + d2)/2.0);
+    }
+
+#if _DEBUG
+// Debug-only consistency check: the disjoint list must be a well-formed
+// doubly linked list without cycles, and its length must equal the number of
+// parentless nodes among the first uNodeCount nodes. Quits on any violation.
+void ClusterTree::Validate(unsigned uNodeCount)
+    {
+    unsigned n;
+    ClusterNode *pNode;
+    unsigned uDisjointListCount = 0;
+    for (pNode = m_ptrDisjoints; pNode; pNode = pNode->GetNextDisjoint())
+        {
+        ClusterNode *pPrev = pNode->GetPrevDisjoint();
+        ClusterNode *pNext = pNode->GetNextDisjoint();
+        if (0 != pPrev)
+            {
+            if (pPrev->GetNextDisjoint() != pNode)
+                {
+                Log("Prev->This mismatch, prev=\n");
+                pPrev->LogMe();
+                Log("This=\n");
+                pNode->LogMe();
+                Quit("ClusterTree::Validate()");
+                }
+            }
+        else
+            {
+            // Only the list head may lack a predecessor.
+            if (pNode != m_ptrDisjoints)
+                {
+                Log("[%u]->prev = 0 but != m_ptrDisjoints=%d\n",
+                  pNode->GetIndex(),
+                  m_ptrDisjoints ? m_ptrDisjoints->GetIndex() : 0xffffffff);
+                pNode->LogMe();
+                Quit("ClusterTree::Validate()");
+                }
+            }
+        if (0 != pNext)
+            {
+            if (pNext->GetPrevDisjoint() != pNode)
+                {
+                Log("Next->This mismatch, next=\n");
+                pNext->LogMe();
+                Log("This=\n");
+                pNode->LogMe();
+                Quit("ClusterTree::Validate()");
+                }
+            }
+        ++uDisjointListCount;
+        // More entries than nodes means the list loops back on itself.
+        if (uDisjointListCount > m_uNodeCount)
+            Quit("Loop in disjoint list");
+        }
+
+    unsigned uParentlessNodeCount = 0;
+    for (n = 0; n < uNodeCount; ++n)
+        if (0 == m_Nodes[n].GetParent())
+            ++uParentlessNodeCount;
+
+    if (uDisjointListCount != uParentlessNodeCount)
+        Quit("Disjoints = %u Parentless = %u\n", uDisjointListCount,
+          uParentlessNodeCount);
+    }
+#else // !_DEBUG
+#define Validate(uNodeCount) // empty
+#endif
+
+// Log one line describing this node: weights, neighbour indexes and the
+// indexes of all leaves below it.
+void ClusterNode::LogMe() const
+    {
+    unsigned uClusterSize = GetClusterSize();
+    Log("[%02u] w=%5.3f CW=%5.3f LBW=%5.3f RBW=%5.3f LWT=%5.3f RWT=%5.3f L=%02d R=%02d P=%02d NxDj=%02d PvDj=%02d Sz=%02d {",
+      m_uIndex,
+      m_dWeight,
+      GetClusterWeight(),
+      GetLeftBranchWeight(),
+      GetRightBranchWeight(),
+      GetLeftWeight(),
+      GetRightWeight(),
+      m_ptrLeft ? m_ptrLeft->GetIndex() : 0xffffffff,
+      m_ptrRight ? m_ptrRight->GetIndex() : 0xffffffff,
+      m_ptrParent ? m_ptrParent->GetIndex() : 0xffffffff,
+      m_ptrNextDisjoint ? m_ptrNextDisjoint->GetIndex() : 0xffffffff,
+      m_ptrPrevDisjoint ? m_ptrPrevDisjoint->GetIndex() : 0xffffffff,
+      uClusterSize);
+    for (unsigned i = 0; i < uClusterSize; ++i)
+        Log(" %u", GetClusterLeaf(i)->GetIndex());
+    Log(" }\n");
+    }
+
+// How many leaves in the sub-tree under this node?
+// A node without children counts as one leaf. Recursive.
+unsigned ClusterNode::GetClusterSize() const
+    {
+    unsigned uLeafCount = 0;
+
+    if (0 == m_ptrLeft && 0 == m_ptrRight)
+        return 1;
+
+    if (0 != m_ptrLeft)
+        uLeafCount += m_ptrLeft->GetClusterSize();
+    if (0 != m_ptrRight)
+        uLeafCount += m_ptrRight->GetClusterSize();
+    assert(uLeafCount > 0);
+    return uLeafCount;
+    }
+
+// Sum of the weights of this node and of every node below it. Recursive.
+double ClusterNode::GetClusterWeight() const
+    {
+    double dWeight = 0.0;
+    if (0 != m_ptrLeft)
+        dWeight += m_ptrLeft->GetClusterWeight();
+    if (0 != m_ptrRight)
+        dWeight += m_ptrRight->GetClusterWeight();
+    return dWeight + GetWeight();
+    }
+
+// This node's weight minus the left child's weight; 0 without a left child.
+double ClusterNode::GetLeftBranchWeight() const
+    {
+    const ClusterNode *ptrLeft = GetLeft();
+    if (0 == ptrLeft)
+        return 0.0;
+
+    return GetWeight() - ptrLeft->GetWeight();
+    }
+
+// This node's weight minus the right child's weight; 0 without a right child.
+double ClusterNode::GetRightBranchWeight() const
+    {
+    const ClusterNode *ptrRight = GetRight();
+    if (0 == ptrRight)
+        return 0.0;
+
+    return GetWeight() - ptrRight->GetWeight();
+    }
+
+// Cluster weight of the right sub-tree plus this node's own weight;
+// 0 without a right child.
+double ClusterNode::GetRightWeight() const
+    {
+    const ClusterNode *ptrRight = GetRight();
+    if (0 == ptrRight)
+        return 0.0;
+    return ptrRight->GetClusterWeight() + GetWeight();
+    }
+
+// Cluster weight of the left sub-tree plus this node's own weight;
+// 0 without a left child.
+double ClusterNode::GetLeftWeight() const
+    {
+    const ClusterNode *ptrLeft = GetLeft();
+    if (0 == ptrLeft)
+        return 0.0;
+    return ptrLeft->GetClusterWeight() + GetWeight();
+    }
+
+// Return n'th leaf in the sub-tree under this node.
+// Leaves are numbered left to right: indexes below the left sub-tree's leaf
+// count select from the left child, the rest from the right child.
+const ClusterNode *ClusterNode::GetClusterLeaf(unsigned uLeafIndex) const
+    {
+    // A node without children is a leaf and is its own only member.
+    if (0 == m_ptrLeft && 0 == m_ptrRight)
+        return this;
+
+    // With a single child the whole cluster lives in that child's sub-tree.
+    // Descending (rather than returning this internal node) keeps the result
+    // consistent with GetClusterSize(), which counts the child's leaves.
+    if (0 == m_ptrRight)
+        return m_ptrLeft->GetClusterLeaf(uLeafIndex);
+    if (0 == m_ptrLeft)
+        return m_ptrRight->GetClusterLeaf(uLeafIndex);
+
+    const unsigned uLeftLeafCount = m_ptrLeft->GetClusterSize();
+    if (uLeafIndex < uLeftLeafCount)
+        return m_ptrLeft->GetClusterLeaf(uLeafIndex);
+    return m_ptrRight->GetClusterLeaf(uLeafIndex - uLeftLeafCount);
+    }
+
+// Unlink a node from the doubly linked list of disjoint (parentless)
+// clusters, updating the list head when the node was first.
+void ClusterTree::DeleteFromDisjoints(ClusterNode *ptrNode)
+    {
+    ClusterNode *ptrPrev = ptrNode->GetPrevDisjoint();
+    ClusterNode *ptrNext = ptrNode->GetNextDisjoint();
+
+    if (0 != ptrPrev)
+        ptrPrev->SetNextDisjoint(ptrNext);
+    else
+        m_ptrDisjoints = ptrNext;
+
+    if (0 != ptrNext)
+        ptrNext->SetPrevDisjoint(ptrPrev);
+
+#if _DEBUG
+// not algorithmically necessary, but improves clarity
+// and supports Validate().
+    ptrNode->SetPrevDisjoint(0);
+    ptrNode->SetNextDisjoint(0);
+#endif
+    }
+
+// Push a node onto the front of the disjoint list.
+void ClusterTree::AddToDisjoints(ClusterNode *ptrNode)
+    {
+    ptrNode->SetNextDisjoint(m_ptrDisjoints);
+    ptrNode->SetPrevDisjoint(0);
+    if (0 != m_ptrDisjoints)
+        m_ptrDisjoints->SetPrevDisjoint(ptrNode);
+    m_ptrDisjoints = ptrNode;
+    }
+
+// Construct an empty tree with every member in a defined state.
+ClusterTree::ClusterTree()
+    {
+    m_ptrDisjoints = 0;
+    m_Nodes = 0;
+    m_uNodeCount = 0;
+    m_uLeafCount = 0;
+    }
+
+// Release the node array.
+ClusterTree::~ClusterTree()
+    {
+    delete[] m_Nodes;
+    }
+
+// Log the head of the disjoint list followed by one line per node.
+void ClusterTree::LogMe() const
+    {
+    Log("Disjoints=%d\n", m_ptrDisjoints ? m_ptrDisjoints->GetIndex() : 0xffffffff);
+    for (unsigned i = 0; i < m_uNodeCount; ++i)
+        {
+        m_Nodes[i].LogMe();
+        }
+    }
+
+// The root is the last node of the array.
+// NOTE(review): there is no check for an empty tree (m_uNodeCount == 0);
+// callers must only use this after a successful Create().
+ClusterNode *ClusterTree::GetRoot() const
+    {
+    return &m_Nodes[m_uNodeCount - 1];
+    }
+
+// This is the UPGMA algorithm as described in Durbin et al. p166.
+void ClusterTree::Create(const DistFunc &Dist) + { + unsigned i; + m_uLeafCount = Dist.GetCount(); + m_uNodeCount = 2*m_uLeafCount - 1; + + delete[] m_Nodes; + m_Nodes = new ClusterNode[m_uNodeCount]; + + for (i = 0; i < m_uNodeCount; ++i) + m_Nodes[i].SetIndex(i); + + for (i = 0; i < m_uLeafCount - 1; ++i) + m_Nodes[i].SetNextDisjoint(&m_Nodes[i+1]); + + for (i = 1; i < m_uLeafCount; ++i) + m_Nodes[i].SetPrevDisjoint(&m_Nodes[i-1]); + + m_ptrDisjoints = &m_Nodes[0]; + +// Log("Initial state\n"); +// LogMe(); +// Log("\n"); + + DistFunc ClusterDist; + ClusterDist.SetCount(m_uNodeCount); + double dMaxDist = 0.0; + for (i = 0; i < m_uLeafCount; ++i) + for (unsigned j = 0; j < m_uLeafCount; ++j) + { + float dDist = Dist.GetDist(i, j); + ClusterDist.SetDist(i, j, dDist); + } + + Validate(m_uLeafCount); + +// Iteration. N-1 joins needed to create a binary tree from N leaves. + for (unsigned uJoinIndex = m_uLeafCount; uJoinIndex < m_uNodeCount; + ++uJoinIndex) + { + // Find closest pair of clusters + unsigned uIndexClosest1; + unsigned uIndexClosest2; + bool bFound = false; + double dDistClosest = 9e99; + for (ClusterNode *ptrNode1 = m_ptrDisjoints; ptrNode1; + ptrNode1 = ptrNode1->GetNextDisjoint()) + { + for (ClusterNode *ptrNode2 = ptrNode1->GetNextDisjoint(); ptrNode2; + ptrNode2 = ptrNode2->GetNextDisjoint()) + { + unsigned i1 = ptrNode1->GetIndex(); + unsigned i2 = ptrNode2->GetIndex(); + double dDist = ClusterDist.GetDist(i1, i2); + if (dDist < dDistClosest) + { + bFound = true; + dDistClosest = dDist; + uIndexClosest1 = i1; + uIndexClosest2 = i2; + } + } + } + assert(bFound); + + ClusterNode &Join = m_Nodes[uJoinIndex]; + ClusterNode &Child1 = m_Nodes[uIndexClosest1]; + ClusterNode &Child2 = m_Nodes[uIndexClosest2]; + + Join.SetLeft(&Child1); + Join.SetRight(&Child2); + Join.SetWeight(dDistClosest); + + Child1.SetParent(&Join); + Child2.SetParent(&Join); + + DeleteFromDisjoints(&Child1); + DeleteFromDisjoints(&Child2); + AddToDisjoints(&Join); + +// Log("After 
join %d %d\n", uIndexClosest1, uIndexClosest2); +// LogMe(); + + // Calculate distance of every remaining disjoint cluster to the + // new cluster created by the join + for (ClusterNode *ptrNode = m_ptrDisjoints; ptrNode; + ptrNode = ptrNode->GetNextDisjoint()) + { + unsigned uNodeIndex = ptrNode->GetIndex(); + float dDist1 = ClusterDist.GetDist(uNodeIndex, uIndexClosest1); + float dDist2 = ClusterDist.GetDist(uNodeIndex, uIndexClosest2); + float dDist = Min(dDist1, dDist2); + ClusterDist.SetDist(uJoinIndex, uNodeIndex, dDist); + } + Validate(uJoinIndex+1); + } + GetRoot()->GetClusterWeight(); +// LogMe(); + } diff --git a/src/muscle/muscle3.8.31/src/cluster.h b/src/muscle/muscle3.8.31/src/cluster.h new file mode 100644 index 0000000..520dce1 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/cluster.h @@ -0,0 +1,86 @@ +class DistFunc; + +class ClusterNode + { + friend class ClusterTree; +public: + ClusterNode() + { + m_dWeight = 0.0; + m_dWeight2 = 0.0; + m_ptrLeft = 0; + m_ptrRight = 0; + m_ptrParent = 0; + m_uIndex = 0; + m_ptrPrevDisjoint = 0; + m_ptrNextDisjoint = 0; + } + ~ClusterNode() {} + +public: + unsigned GetIndex() const { return m_uIndex; } + ClusterNode *GetLeft() const { return m_ptrLeft; } + ClusterNode *GetRight() const { return m_ptrRight; } + ClusterNode *GetParent() const { return m_ptrParent; } + double GetWeight() const { return m_dWeight; } + + const ClusterNode *GetClusterLeaf(unsigned uLeafIndex) const; + unsigned GetClusterSize() const; + double GetClusterWeight() const; + double GetLeftBranchWeight() const; + double GetRightBranchWeight() const; + double GetLeftWeight() const; + double GetRightWeight() const; + + void LogMe() const; + + double GetWeight2() const { return m_dWeight2; } + void SetWeight2(double dWeight2) { m_dWeight2 = dWeight2; } + +protected: + void SetIndex(unsigned uIndex) { m_uIndex = uIndex; } + void SetWeight(double dWeight) { m_dWeight = dWeight; } + void SetLeft(ClusterNode *ptrLeft) { m_ptrLeft = ptrLeft; } + void 
class Clust;

// Abstract source of leaves and distances for a clustering run.
// Implementations supply the leaf count, per-leaf name and id, and the
// distance between two nodes; JoinNodes lets an implementation compute
// the branch lengths when two nodes are merged.
class ClustSet
	{
public:
// Objects are used through ClustSet pointers/references, so the
// destructor must be virtual for deletion through the base to be valid.
	virtual ~ClustSet() {}

// Number of leaves in the set.
	virtual unsigned GetLeafCount() = 0;

// Distance between nodes uNodeIndex1 and uNodeIndex2 of clustering C.
	virtual double ComputeDist(const Clust &C, unsigned uNodeIndex1,
	  unsigned uNodeIndex2) = 0;

// Join uLeftNodeIndex and uRightNodeIndex into uJoinedNodeIndex; the
// two branch lengths are returned through the pointer arguments.
	virtual void JoinNodes(const Clust &C, unsigned uLeftNodeIndex,
	  unsigned uRightNodeIndex, unsigned uJoinedNodeIndex,
	  double *ptrdLeftLength, double *ptrdRightLength) = 0;

// Name and id of the leaf stored at uNodeIndex.
	virtual const char *GetLeafName(unsigned uNodeIndex) = 0;
	virtual unsigned GetLeafId(unsigned uNodeIndex) = 0;
	};
// ClustSet implementation backed by a DistFunc.
// Every query is forwarded to the DistFunc given to the constructor.
class ClustSetDF : public ClustSet
	{
public:
// Stores the address of DF; no copy is made, so DF has to stay alive
// for as long as this object is used.
	ClustSetDF(const DistFunc &DF) :
		m_ptrDF(&DF)
		{
		}

public:
// Leaf count is the element count of the distance function.
	virtual unsigned GetLeafCount()
		{
		return m_ptrDF->GetCount();
		}
// Name stored in the distance function for uNodeIndex.
	virtual const char *GetLeafName(unsigned uNodeIndex)
		{
		return m_ptrDF->GetName(uNodeIndex);
		}
// Id stored in the distance function for uNodeIndex.
	virtual unsigned GetLeafId(unsigned uNodeIndex)
		{
		return m_ptrDF->GetId(uNodeIndex);
		}
// Not supported by this implementation: reports a fatal error via Quit.
	virtual void JoinNodes(const Clust &C, unsigned uLeftNodeIndex,
	  unsigned uRightNodeIndex, unsigned uJoinedNodeIndex,
	  double *ptrdLeftLength, double *ptrdRightLength)
		{
		Quit("ClustSetDF::JoinNodes, should never be called");
		}
// Distance is looked up directly in the distance function; C is unused.
	virtual double ComputeDist(const Clust &C, unsigned uNodeIndex1,
	  unsigned uNodeIndex2)
		{
		return m_ptrDF->GetDist(uNodeIndex1, uNodeIndex2);
		}

private:
// Distance function supplied at construction (not deleted here).
	const DistFunc *m_ptrDF;
	};
// ClustSet implementation backed by an MSA and an MSADist object.
// Leaf information comes from the alignment; distances are computed on
// demand by the MSADist object.
class ClustSetMSA : public ClustSet
	{
public:
// Stores the addresses of msa and MD; no copies are made, so both have
// to stay alive for as long as this object is used.
	ClustSetMSA(const MSA &msa, MSADist &MD) :
		m_ptrMSA(&msa),
		m_ptrMSADist(&MD)
		{
		}

public:
// One leaf per sequence in the alignment.
	virtual unsigned GetLeafCount()
		{
		return m_ptrMSA->GetSeqCount();
		}
// Leaf name is the name of sequence uNodeIndex.
	virtual const char *GetLeafName(unsigned uNodeIndex)
		{
		return m_ptrMSA->GetSeqName(uNodeIndex);
		}
// Leaf id is the id of sequence uNodeIndex.
	virtual unsigned GetLeafId(unsigned uNodeIndex)
		{
		return m_ptrMSA->GetSeqId(uNodeIndex);
		}
// Not supported by this implementation: reports a fatal error via Quit.
	virtual void JoinNodes(const Clust &C, unsigned uLeftNodeIndex,
	  unsigned uRightNodeIndex, unsigned uJoinedNodeIndex,
	  double *ptrdLeftLength, double *ptrdRightLength)
		{
		Quit("ClustSetMSA::JoinNodes, should never be called");
		}
// Distance between two sequences of the alignment, delegated to the
// MSADist object; C is unused.
	virtual double ComputeDist(const Clust &C, unsigned uNodeIndex1,
	  unsigned uNodeIndex2)
		{
		return m_ptrMSADist->ComputeDist(*m_ptrMSA, uNodeIndex1, uNodeIndex2);
		}

public:
// Declared here, defined elsewhere.
	const MSA &GetMSA();

private:
// Alignment and distance calculator supplied at construction
// (neither is deleted here).
	const MSA *m_ptrMSA;
	MSADist *m_ptrMSADist;
	};
+ + 0.2 + -----A 0.1 + -x ------- B 0.7 + --------y ----------- C + 0.3 ----------z + 0.4 -------------- D + 0.8 + +Edge Length Leaves Strength +---- ----- ------ -------- +xy 0.3 3 0.1 +xA 0.2 1 0.2 +yz 0.4 2 0.2 +yB 0.1 1 0.1 +zC 0.7 1 0.7 +zD 0.8 1 0.8 + +Leaf Path Strengths Weight +---- ---- --------- ------ +A xA 0.2 0.2 +B xy-yB 0.1 + 0.1 0.2 +C xy-yz-zC 0.1 + 0.2 + 0.7 1.0 +D xy-yz-zD 0.1 + 0.2 + 0.8 1.1 + +***/ + +#define TRACE 0 + +static unsigned CountLeaves(const Tree &tree, unsigned uNodeIndex, + unsigned LeavesUnderNode[]) + { + if (tree.IsLeaf(uNodeIndex)) + { + LeavesUnderNode[uNodeIndex] = 1; + return 1; + } + + const unsigned uLeft = tree.GetLeft(uNodeIndex); + const unsigned uRight = tree.GetRight(uNodeIndex); + const unsigned uRightCount = CountLeaves(tree, uRight, LeavesUnderNode); + const unsigned uLeftCount = CountLeaves(tree, uLeft, LeavesUnderNode); + const unsigned uCount = uRightCount + uLeftCount; + LeavesUnderNode[uNodeIndex] = uCount; + return uCount; + } + +void CalcClustalWWeights(const Tree &tree, WEIGHT Weights[]) + { +#if TRACE + Log("CalcClustalWWeights\n"); + tree.LogMe(); +#endif + + const unsigned uLeafCount = tree.GetLeafCount(); + if (0 == uLeafCount) + return; + else if (1 == uLeafCount) + { + Weights[0] = (WEIGHT) 1.0; + return; + } + else if (2 == uLeafCount) + { + Weights[0] = (WEIGHT) 0.5; + Weights[1] = (WEIGHT) 0.5; + return; + } + + if (!tree.IsRooted()) + Quit("CalcClustalWWeights requires rooted tree"); + + const unsigned uNodeCount = tree.GetNodeCount(); + unsigned *LeavesUnderNode = new unsigned[uNodeCount]; + memset(LeavesUnderNode, 0, uNodeCount*sizeof(unsigned)); + + const unsigned uRootNodeIndex = tree.GetRootNodeIndex(); + unsigned uLeavesUnderRoot = CountLeaves(tree, uRootNodeIndex, LeavesUnderNode); + if (uLeavesUnderRoot != uLeafCount) + Quit("WeightsFromTreee: Internal error, root count %u %u", + uLeavesUnderRoot, uLeafCount); + +#if TRACE + Log("Node Leaves Length Strength\n"); + Log("---- ------ 
-------- --------\n"); + // 1234 123456 12345678 12345678 +#endif + + double *Strengths = new double[uNodeCount]; + for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) + { + if (tree.IsRoot(uNodeIndex)) + { + Strengths[uNodeIndex] = 0.0; + continue; + } + const unsigned uParent = tree.GetParent(uNodeIndex); + const double dLength = tree.GetEdgeLength(uNodeIndex, uParent); + const unsigned uLeaves = LeavesUnderNode[uNodeIndex]; + const double dStrength = dLength / (double) uLeaves; + Strengths[uNodeIndex] = dStrength; +#if TRACE + Log("%4u %6u %8g %8g\n", uNodeIndex, uLeaves, dLength, dStrength); +#endif + } + +#if TRACE + Log("\n"); + Log(" Seq Path..Weight\n"); + Log("-------------------- ------------\n"); +#endif + for (unsigned n = 0; n < uLeafCount; ++n) + { + const unsigned uLeafNodeIndex = tree.LeafIndexToNodeIndex(n); +#if TRACE + Log("%20.20s %4u ", tree.GetLeafName(uLeafNodeIndex), uLeafNodeIndex); +#endif + if (!tree.IsLeaf(uLeafNodeIndex)) + Quit("CalcClustalWWeights: leaf"); + + double dWeight = 0; + unsigned uNode = uLeafNodeIndex; + while (!tree.IsRoot(uNode)) + { + dWeight += Strengths[uNode]; + uNode = tree.GetParent(uNode); +#if TRACE + Log("->%u(%g)", uNode, Strengths[uNode]); +#endif + } + if (dWeight < 0.0001) + { +#if TRACE + Log("zero->one"); +#endif + dWeight = 1.0; + } + Weights[n] = (WEIGHT) dWeight; +#if TRACE + Log(" = %g\n", dWeight); +#endif + } + + delete[] Strengths; + delete[] LeavesUnderNode; + + Normalize(Weights, uLeafCount); + } + +void MSA::SetClustalWWeights(const Tree &tree) + { + const unsigned uSeqCount = GetSeqCount(); + const unsigned uLeafCount = tree.GetLeafCount(); + + WEIGHT *Weights = new WEIGHT[uSeqCount]; + + CalcClustalWWeights(tree, Weights); + + for (unsigned n = 0; n < uLeafCount; ++n) + { + const WEIGHT w = Weights[n]; + const unsigned uLeafNodeIndex = tree.LeafIndexToNodeIndex(n); + const unsigned uId = tree.GetLeafId(uLeafNodeIndex); + const unsigned uSeqIndex = GetSeqIndex(uId); +#if DEBUG + 
if (GetSeqName(uSeqIndex) != tree.GetLeafName(uLeafNodeIndex)) + Quit("MSA::SetClustalWWeights: names don't match"); +#endif + SetSeqWeight(uSeqIndex, w); + } + NormalizeWeights((WEIGHT) 1.0); + + delete[] Weights; + } diff --git a/src/muscle/muscle3.8.31/src/color.cpp b/src/muscle/muscle3.8.31/src/color.cpp new file mode 100644 index 0000000..f0aa00f --- /dev/null +++ b/src/muscle/muscle3.8.31/src/color.cpp @@ -0,0 +1,189 @@ +#include "muscle.h" +#include "msa.h" + +static int Blosum62[23][23] = + { +// A B C D E F G H I K L M N P Q R S T V W X Y Z + +4, -2, +0, -2, -1, -2, +0, -2, -1, -1, -1, -1, -2, -1, -1, -1, +1, +0, +0, -3, -1, -2, -1, // A + -2, +6, -3, +6, +2, -3, -1, -1, -3, -1, -4, -3, +1, -1, +0, -2, +0, -1, -3, -4, -1, -3, +2, // B + +0, -3, +9, -3, -4, -2, -3, -3, -1, -3, -1, -1, -3, -3, -3, -3, -1, -1, -1, -2, -1, -2, -4, // C + -2, +6, -3, +6, +2, -3, -1, -1, -3, -1, -4, -3, +1, -1, +0, -2, +0, -1, -3, -4, -1, -3, +2, // D + -1, +2, -4, +2, +5, -3, -2, +0, -3, +1, -3, -2, +0, -1, +2, +0, +0, -1, -2, -3, -1, -2, +5, // E + + -2, -3, -2, -3, -3, +6, -3, -1, +0, -3, +0, +0, -3, -4, -3, -3, -2, -2, -1, +1, -1, +3, -3, // F + +0, -1, -3, -1, -2, -3, +6, -2, -4, -2, -4, -3, +0, -2, -2, -2, +0, -2, -3, -2, -1, -3, -2, // G + -2, -1, -3, -1, +0, -1, -2, +8, -3, -1, -3, -2, +1, -2, +0, +0, -1, -2, -3, -2, -1, +2, +0, // H + -1, -3, -1, -3, -3, +0, -4, -3, +4, -3, +2, +1, -3, -3, -3, -3, -2, -1, +3, -3, -1, -1, -3, // I + -1, -1, -3, -1, +1, -3, -2, -1, -3, +5, -2, -1, +0, -1, +1, +2, +0, -1, -2, -3, -1, -2, +1, // K + + -1, -4, -1, -4, -3, +0, -4, -3, +2, -2, +4, +2, -3, -3, -2, -2, -2, -1, +1, -2, -1, -1, -3, // L + -1, -3, -1, -3, -2, +0, -3, -2, +1, -1, +2, +5, -2, -2, +0, -1, -1, -1, +1, -1, -1, -1, -2, // M + -2, +1, -3, +1, +0, -3, +0, +1, -3, +0, -3, -2, +6, -2, +0, +0, +1, +0, -3, -4, -1, -2, +0, // N + -1, -1, -3, -1, -1, -4, -2, -2, -3, -1, -3, -2, -2, +7, -1, -2, -1, -1, -2, -4, -1, -3, -1, // P + -1, +0, -3, +0, +2, -3, -2, +0, -3, +1, -2, +0, +0, 
// Maps 'A'..'Z' (by offset from 'A') to a row/column index of the
// 23x23 Blosum62 table, or -1 for letters that have no entry (J, O).
// U shares the index of T.
static const int toi_tab[26] =
	{
	0,	// A
	1,	// B
	2,	// C
	3,	// D
	4,	// E
	5,	// F
	6,	// G
	7,	// H
	8,	// I
	-1,	// J
	9,	// K
	10,	// L
	11,	// M
	12,	// N
	-1,	// O
	13,	// P
	14,	// Q
	15,	// R
	16,	// S
	17,	// T
	17,	// U
	18,	// V
	19,	// W
	20,	// X
	21,	// Y
	22,	// Z
	};

// Convert a residue character (either case) to its Blosum62 index.
// Returns -1 for any character without an entry: J, O and everything
// that is not a letter A-Z. Callers must check for a negative result
// before indexing a table with it.
static int toi(char c)
	{
// toupper requires a value representable as unsigned char; a plain
// char may be negative.
	const int iUpper = toupper((unsigned char) c);
	if (iUpper < 'A' || iUpper > 'Z')
		return -1;
	return toi_tab[iUpper - 'A'];
	}
+ 5x3x2 AB pairs + 8x7 = 5x4 + 3x2 + 5x3x2 pairs of letters +***/ +static double BlosumScoreCol(const MSA &a, unsigned uColIndex) + { + int iCounts[23]; + memset(iCounts, 0, sizeof(iCounts)); + const unsigned uSeqCount = a.GetSeqCount(); + unsigned uCharCount = 0; + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + char c = a.GetChar(uSeqIndex, uColIndex); + if (IsGapChar(c)) + continue; + int iChar = toi(c); + ++iCounts[iChar]; + ++uCharCount; + } + if (uCharCount < 2) + return -9; + int iTotalScore = 0; + for (int i1 = 0; i1 < 23; ++i1) + { + int iCounts1 = iCounts[i1]; + iTotalScore += iCounts1*(iCounts1 - 1)*Blosum62[i1][i1]; + for (int i2 = i1 + 1; i2 < 23; ++i2) + iTotalScore += iCounts[i2]*iCounts1*2*Blosum62[i1][i2]; + } + int iPairCount = uCharCount*(uCharCount - 1); + return (double) iTotalScore / (double) iPairCount; + } + +/*** +Consider a column with 5 As and 3 Bs. +A residue of type Q scores: + 5xAQ + 3xBQ +***/ +static void AssignColorsCol(const MSA &a, unsigned uColIndex, int **Colors) + { + int iCounts[23]; + memset(iCounts, 0, sizeof(iCounts)); + const unsigned uSeqCount = a.GetSeqCount(); + unsigned uCharCount = 0; + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + char c = a.GetChar(uSeqIndex, uColIndex); + if (IsGapChar(c)) + continue; + int iChar = toi(c); + ++iCounts[iChar]; + ++uCharCount; + } + int iMostConservedType = -1; + int iMostConservedCount = -1; + for (unsigned i = 0; i < 23; ++i) + { + if (iCounts[i] > iMostConservedCount) + { + iMostConservedType = i; + iMostConservedCount = iCounts[i]; + } + } + + double dColScore = BlosumScoreCol(a, uColIndex); + int c; + if (dColScore >= 3.0) + c = 3; + //else if (dColScore >= 1.0) + // c = 2; + else if (dColScore >= 0.2) + c = 1; + else + c = 0; + + int Color[23]; + for (unsigned uLetter = 0; uLetter < 23; ++uLetter) + { + double dScore = Blosum62[uLetter][iMostConservedType]; + if (dScore >= dColScore) + Color[uLetter] = c; + else + Color[uLetter] = 
0; + } + + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + char c = a.GetChar(uSeqIndex, uColIndex); + if (IsGapChar(c)) + { + Colors[uSeqIndex][uColIndex] = 0; + continue; + } + int iLetter = toi(c); + if (iLetter >= 0 && iLetter < 23) + Colors[uSeqIndex][uColIndex] = Color[iLetter]; + else + Colors[uSeqIndex][uColIndex] = 0; + } + } + +void AssignColors(const MSA &a, int **Colors) + { + const unsigned uColCount = a.GetColCount(); + for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) + AssignColorsCol(a, uColIndex, Colors); + } diff --git a/src/muscle/muscle3.8.31/src/cons.cpp b/src/muscle/muscle3.8.31/src/cons.cpp new file mode 100644 index 0000000..c1507c5 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/cons.cpp @@ -0,0 +1,118 @@ +/*** +Conservation value for a column in an MSA is defined as the number +of times the most common letter appears divided by the number of +sequences. +***/ + +#include "muscle.h" +#include "msa.h" +#include + +double MSA::GetAvgCons() const + { + assert(GetSeqCount() > 0); + double dSum = 0; + unsigned uNonGapColCount = 0; + for (unsigned uColIndex = 0; uColIndex < GetColCount(); ++uColIndex) + { + if (!IsGapColumn(uColIndex)) + { + dSum += GetCons(uColIndex); + ++uNonGapColCount; + } + } + assert(uNonGapColCount > 0); + double dAvg = dSum / uNonGapColCount; + assert(dAvg > 0 && dAvg <= 1); + return dAvg; + } + +double MSA::GetCons(unsigned uColIndex) const + { + unsigned Counts[MAX_ALPHA]; + for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter) + Counts[uLetter] = 0; + + unsigned uMaxCount = 0; + for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) + { + if (IsGap(uSeqIndex, uColIndex)) + continue; + char c = GetChar(uSeqIndex, uColIndex); + c = toupper(c); + if ('X' == c || 'B' == c || 'Z' == c) + continue; + unsigned uLetter = GetLetter(uSeqIndex, uColIndex); + unsigned uCount = Counts[uLetter] + 1; + if (uCount > uMaxCount) + uMaxCount = uCount; + Counts[uLetter] = uCount; 
+ } + +// Cons is undefined for all-gap column + if (0 == uMaxCount) + { +// assert(false); + return 1; + } + + double dCons = (double) uMaxCount / (double) GetSeqCount(); + assert(dCons > 0 && dCons <= 1); + return dCons; + } + +// Perecent identity of a pair of sequences. +// Positions with one or both gapped are ignored. +double MSA::GetPctIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const + { + const unsigned uColCount = GetColCount(); + unsigned uPosCount = 0; + unsigned uSameCount = 0; + for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) + { + const char c1 = GetChar(uSeqIndex1, uColIndex); + const char c2 = GetChar(uSeqIndex2, uColIndex); + if (IsGapChar(c1) || IsGapChar(c2)) + continue; + if (c1 == c2) + ++uSameCount; + ++uPosCount; + } + if (0 == uPosCount) + return 0; + return (double) uSameCount / (double) uPosCount; + } + +// Perecent group identity of a pair of sequences. +// Positions with one or both gapped are ignored. +double MSA::GetPctGroupIdentityPair(unsigned uSeqIndex1, + unsigned uSeqIndex2) const + { + extern unsigned ResidueGroup[]; + + const unsigned uColCount = GetColCount(); + unsigned uPosCount = 0; + unsigned uSameCount = 0; + for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) + { + if (IsGap(uSeqIndex1, uColIndex)) + continue; + if (IsGap(uSeqIndex2, uColIndex)) + continue; + if (IsWildcard(uSeqIndex1, uColIndex)) + continue; + if (IsWildcard(uSeqIndex2, uColIndex)) + continue; + + const unsigned uLetter1 = GetLetter(uSeqIndex1, uColIndex); + const unsigned uLetter2 = GetLetter(uSeqIndex2, uColIndex); + const unsigned uGroup1 = ResidueGroup[uLetter1]; + const unsigned uGroup2 = ResidueGroup[uLetter2]; + if (uGroup1 == uGroup2) + ++uSameCount; + ++uPosCount; + } + if (0 == uPosCount) + return 0; + return (double) uSameCount / (double) uPosCount; + } diff --git a/src/muscle/muscle3.8.31/src/diaglist.cpp b/src/muscle/muscle3.8.31/src/diaglist.cpp new file mode 100644 index 0000000..4f7ea86 --- 
/dev/null +++ b/src/muscle/muscle3.8.31/src/diaglist.cpp @@ -0,0 +1,378 @@ +#include "muscle.h" +#include "diaglist.h" +#include "pwpath.h" + +#define MAX(x, y) ((x) > (y) ? (x) : (y)) +#define MIN(x, y) ((x) < (y) ? (x) : (y)) + +void DiagList::Add(const Diag &d) + { + if (m_uCount == MAX_DIAGS) + Quit("DiagList::Add, overflow %u", m_uCount); + m_Diags[m_uCount] = d; + ++m_uCount; + } + +void DiagList::Add(unsigned uStartPosA, unsigned uStartPosB, unsigned uLength) + { + Diag d; + d.m_uStartPosA = uStartPosA; + d.m_uStartPosB = uStartPosB; + d.m_uLength = uLength; + Add(d); + } + +const Diag &DiagList::Get(unsigned uIndex) const + { + if (uIndex >= m_uCount) + Quit("DiagList::Get(%u), count=%u", uIndex, m_uCount); + return m_Diags[uIndex]; + } + +void DiagList::LogMe() const + { + Log("DiagList::LogMe, count=%u\n", m_uCount); + Log(" n StartA StartB Length\n"); + Log("--- ------ ------ ------\n"); + for (unsigned n = 0; n < m_uCount; ++n) + { + const Diag &d = m_Diags[n]; + Log("%3u %6u %6u %6u\n", + n, d.m_uStartPosA, d.m_uStartPosB, d.m_uLength); + } + } + +void DiagList::FromPath(const PWPath &Path) + { + Clear(); + + const unsigned uEdgeCount = Path.GetEdgeCount(); + unsigned uLength = 0; + unsigned uStartPosA; + unsigned uStartPosB; + for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) + { + const PWEdge &Edge = Path.GetEdge(uEdgeIndex); + + // Typical cases + if (Edge.cType == 'M') + { + if (0 == uLength) + { + uStartPosA = Edge.uPrefixLengthA - 1; + uStartPosB = Edge.uPrefixLengthB - 1; + } + ++uLength; + } + else + { + if (uLength >= g_uMinDiagLength) + Add(uStartPosA, uStartPosB, uLength); + uLength = 0; + } + } + +// Special case for last edge + if (uLength >= g_uMinDiagLength) + Add(uStartPosA, uStartPosB, uLength); + } + +bool DiagList::NonZeroIntersection(const Diag &d) const + { + for (unsigned n = 0; n < m_uCount; ++n) + { + const Diag &d2 = m_Diags[n]; + if (DiagOverlap(d, d2) > 0) + return true; + } + return false; + } + +// 
DialogOverlap returns the length of the overlapping +// section of the two diagonals along the diagonals +// themselves; in other words, the length of +// the intersection of the two sets of cells in +// the matrix. +unsigned DiagOverlap(const Diag &d1, const Diag &d2) + { +// Determine where the diagonals intersect the A +// axis (extending them if required). If they +// intersect at different points, they do not +// overlap. Coordinates on a diagonal are +// given by B = A + c where c is the value of +// A at the intersection with the A axis. +// Hence, c = B - A for any point on the diagonal. + int c1 = (int) d1.m_uStartPosB - (int) d1.m_uStartPosA; + int c2 = (int) d2.m_uStartPosB - (int) d2.m_uStartPosA; + if (c1 != c2) + return 0; + + assert(DiagOverlapA(d1, d2) == DiagOverlapB(d1, d2)); + return DiagOverlapA(d1, d2); + } + +// DialogOverlapA returns the length of the overlapping +// section of the projection of the two diagonals onto +// the A axis. +unsigned DiagOverlapA(const Diag &d1, const Diag &d2) + { + unsigned uMaxStart = MAX(d1.m_uStartPosA, d2.m_uStartPosA); + unsigned uMinEnd = MIN(d1.m_uStartPosA + d1.m_uLength - 1, + d2.m_uStartPosA + d2.m_uLength - 1); + + int iLength = (int) uMinEnd - (int) uMaxStart + 1; + if (iLength < 0) + return 0; + return (unsigned) iLength; + } + +// DialogOverlapB returns the length of the overlapping +// section of the projection of the two diagonals onto +// the B axis. +unsigned DiagOverlapB(const Diag &d1, const Diag &d2) + { + unsigned uMaxStart = MAX(d1.m_uStartPosB, d2.m_uStartPosB); + unsigned uMinEnd = MIN(d1.m_uStartPosB + d1.m_uLength - 1, + d2.m_uStartPosB + d2.m_uLength - 1); + + int iLength = (int) uMinEnd - (int) uMaxStart + 1; + if (iLength < 0) + return 0; + return (unsigned) iLength; + } + +// Returns true if the two diagonals can be on the +// same path through the DP matrix. If DiagCompatible +// returns false, they cannot be in the same path +// and hence "contradict" each other. 
+bool DiagCompatible(const Diag &d1, const Diag &d2) + { + if (DiagOverlap(d1, d2) > 0) + return true; + return 0 == DiagOverlapA(d1, d2) && 0 == DiagOverlapB(d1, d2); + } + +// Returns the length of the "break" between two diagonals. +unsigned DiagBreak(const Diag &d1, const Diag &d2) + { + int c1 = (int) d1.m_uStartPosB - (int) d1.m_uStartPosA; + int c2 = (int) d2.m_uStartPosB - (int) d2.m_uStartPosA; + if (c1 != c2) + return 0; + + int iMaxStart = MAX(d1.m_uStartPosA, d2.m_uStartPosA); + int iMinEnd = MIN(d1.m_uStartPosA + d1.m_uLength - 1, + d2.m_uStartPosA + d1.m_uLength - 1); + int iBreak = iMaxStart - iMinEnd - 1; + if (iBreak < 0) + return 0; + return (unsigned) iBreak; + } + +// Merge diagonals that are continuations of each other with +// short breaks of up to length g_uMaxDiagBreak. +// In a sorted list of diagonals, we only have to check +// consecutive entries. +void MergeDiags(DiagList &DL) + { + return; +#if DEBUG + if (!DL.IsSorted()) + Quit("MergeDiags: !IsSorted"); +#endif + +// TODO: Fix this! 
+// Breaks must be with no offset (no gaps) + const unsigned uCount = DL.GetCount(); + if (uCount <= 1) + return; + + DiagList NewList; + + Diag MergedDiag; + const Diag *ptrPrev = &DL.Get(0); + for (unsigned i = 1; i < uCount; ++i) + { + const Diag *ptrDiag = &DL.Get(i); + unsigned uBreakLength = DiagBreak(*ptrPrev, *ptrDiag); + if (uBreakLength <= g_uMaxDiagBreak) + { + MergedDiag.m_uStartPosA = ptrPrev->m_uStartPosA; + MergedDiag.m_uStartPosB = ptrPrev->m_uStartPosB; + MergedDiag.m_uLength = ptrPrev->m_uLength + ptrDiag->m_uLength + + uBreakLength; + ptrPrev = &MergedDiag; + } + else + { + NewList.Add(*ptrPrev); + ptrPrev = ptrDiag; + } + } + NewList.Add(*ptrPrev); + DL.Copy(NewList); + } + +void DiagList::DeleteIncompatible() + { + assert(IsSorted()); + + if (m_uCount < 2) + return; + + bool *bFlagForDeletion = new bool[m_uCount]; + for (unsigned i = 0; i < m_uCount; ++i) + bFlagForDeletion[i] = false; + + for (unsigned i = 0; i < m_uCount; ++i) + { + const Diag &di = m_Diags[i]; + for (unsigned j = i + 1; j < m_uCount; ++j) + { + const Diag &dj = m_Diags[j]; + + // Verify sorted correctly + assert(di.m_uStartPosA <= dj.m_uStartPosA); + + // If two diagonals are incompatible and + // one is is much longer than the other, + // keep the longer one. + if (!DiagCompatible(di, dj)) + { + if (di.m_uLength > dj.m_uLength*4) + bFlagForDeletion[j] = true; + else if (dj.m_uLength > di.m_uLength*4) + bFlagForDeletion[i] = true; + else + { + bFlagForDeletion[i] = true; + bFlagForDeletion[j] = true; + } + } + } + } + + for (unsigned i = 0; i < m_uCount; ++i) + { + const Diag &di = m_Diags[i]; + if (bFlagForDeletion[i]) + continue; + + for (unsigned j = i + 1; j < m_uCount; ++j) + { + const Diag &dj = m_Diags[j]; + if (bFlagForDeletion[j]) + continue; + + // Verify sorted correctly + assert(di.m_uStartPosA <= dj.m_uStartPosA); + + // If sort order in B different from sorted order in A, + // either diags are incompatible or we detected a repeat + // or permutation. 
+ if (di.m_uStartPosB >= dj.m_uStartPosB || !DiagCompatible(di, dj)) + { + bFlagForDeletion[i] = true; + bFlagForDeletion[j] = true; + } + } + } + + unsigned uNewCount = 0; + Diag *NewDiags = new Diag[m_uCount]; + for (unsigned i = 0; i < m_uCount; ++i) + { + if (bFlagForDeletion[i]) + continue; + + const Diag &d = m_Diags[i]; + NewDiags[uNewCount] = d; + ++uNewCount; + } + memcpy(m_Diags, NewDiags, uNewCount*sizeof(Diag)); + m_uCount = uNewCount; + delete[] NewDiags; + } + +void DiagList::Copy(const DiagList &DL) + { + Clear(); + unsigned uCount = DL.GetCount(); + for (unsigned i = 0; i < uCount; ++i) + Add(DL.Get(i)); + } + +// Check if sorted in increasing order of m_uStartPosA +bool DiagList::IsSorted() const + { + return true; + unsigned uCount = GetCount(); + for (unsigned i = 1; i < uCount; ++i) + if (m_Diags[i-1].m_uStartPosA > m_Diags[i].m_uStartPosA) + return false; + return true; + } + +// Sort in increasing order of m_uStartPosA +// Dumb bubble sort, but don't care about speed +// because don't get long lists. 
// Fixed-capacity list of diagonals (at most MAX_DIAGS entries).
// Storage is an in-object array, so clearing the list only resets the
// element count.
class DiagList
	{
public:
// Creates an empty list.
	DiagList()
		{
		m_uCount = 0;
		}
	~DiagList()
		{
		Free();
		}

public:
// Creation
	// Empties the list.
	void Clear()
		{
		Free();
		}
	// Replaces the contents with the diagonals of an alignment path.
	void FromPath(const PWPath &Path);
	// Appends a diagonal.
	void Add(const Diag &d);
	void Add(unsigned uStartPosA, unsigned uStartPosB, unsigned uLength);
	// Removes diagonals that cannot be part of one common path.
	void DeleteIncompatible();

// Accessors
	// Number of diagonals currently stored.
	unsigned GetCount() const
		{
		return m_uCount;
		}
	// Diagonal at position uIndex.
	const Diag &Get(unsigned uIndex) const;

// Operations
	// Sorts in increasing order of m_uStartPosA.
	void Sort();
	// Replaces the contents with a copy of DL.
	void Copy(const DiagList &DL);

// Query
	// returns true iff given diagonal is included in the list
	// in whole or in part.
	bool NonZeroIntersection(const Diag &d) const;
	// Intended to report whether the list is sorted by m_uStartPosA.
	bool IsSorted() const;

// Diagnostics
	// Writes the list to the log.
	void LogMe() const;

private:
	// Nothing to release; resetting the count empties the list.
	void Free()
		{
		m_uCount = 0;
		}

private:
	// Number of valid entries at the front of m_Diags.
	unsigned m_uCount;
	Diag m_Diags[MAX_DIAGS];
	};
++uSeqIndex2) + { + unsigned uLetter2 = msa.GetLetterEx(uSeqIndex2, uColIndex); + if (uLetter2 >= g_AlphaSize) + continue; + WEIGHT w2 = msa.GetSeqWeight(uSeqIndex2); + BruteScore += w1*w2*Mx[uLetter1][uLetter2]; + } + } +#endif + + double N = 0; + for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) + { + WEIGHT w = msa.GetSeqWeight(uSeqIndex1); + N += w; + } + if (N <= 0) + return 0; + + FCOUNT Freqs[20]; + memset(Freqs, 0, sizeof(Freqs)); + SCORE Score = 0; + for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) + { + unsigned uLetter = msa.GetLetterEx(uSeqIndex1, uColIndex); + if (uLetter >= g_AlphaSize) + continue; + WEIGHT w = msa.GetSeqWeight(uSeqIndex1); + Freqs[uLetter] += w; + Score -= w*w*Mx[uLetter][uLetter]; + } + + for (unsigned uLetter1 = 0; uLetter1 < g_AlphaSize; ++uLetter1) + { + const FCOUNT f1 = Freqs[uLetter1]; + Score += f1*f1*Mx[uLetter1][uLetter1]; + for (unsigned uLetter2 = uLetter1 + 1; uLetter2 < g_AlphaSize; ++uLetter2) + { + const FCOUNT f2 = Freqs[uLetter2]; + Score += 2*f1*f2*Mx[uLetter1][uLetter2]; + } + } + Score /= 2; +#if BRUTE_LETTERS + assert(BTEq(BruteScore, Score)); +#endif + return Score; + } + +static SCORE ScoreLetters(const MSA &msa, const unsigned Edges[], + unsigned uEdgeCount) + { + const unsigned uSeqCount = msa.GetSeqCount(); + const unsigned uColCount = msa.GetColCount(); + +// Letters + SCORE Score = 0; + for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) + { + const unsigned uColIndex = Edges[uEdgeIndex]; + assert(uColIndex < uColCount); + Score += ScoreColLetters(msa, uColIndex); + } + return Score; + } + +void GetLetterScores(const MSA &msa, SCORE Scores[]) + { + const unsigned uColCount = msa.GetColCount(); + for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) + Scores[uColIndex] = ScoreColLetters(msa, uColIndex); + } + +SCORE DiffObjScore( + const MSA &msa1, const PWPath &Path1, const unsigned Edges1[], unsigned uEdgeCount1, + const MSA &msa2, const 
PWPath &Path2, const unsigned Edges2[], unsigned uEdgeCount2) + { +#if TRACE + { + Log("============DiffObjScore===========\n"); + Log("msa1:\n"); + msa1.LogMe(); + Log("\n"); + Log("Cols1: "); + for (unsigned i = 0; i < uEdgeCount1; ++i) + Log(" %u", Edges1[i]); + Log("\n\n"); + Log("msa2:\n"); + msa2.LogMe(); + Log("Cols2: "); + for (unsigned i = 0; i < uEdgeCount2; ++i) + Log(" %u", Edges2[i]); + Log("\n\n"); + } +#endif + +#if COMPARE_3_52 + extern SCORE g_SPScoreLetters; + extern SCORE g_SPScoreGaps; + SCORE SP1 = ObjScoreSP(msa1); + SCORE SPLetters1 = g_SPScoreLetters; + SCORE SPGaps1 = g_SPScoreGaps; + + SCORE SP2 = ObjScoreSP(msa2); + SCORE SPLetters2 = g_SPScoreLetters; + SCORE SPGaps2 = g_SPScoreGaps; + SCORE SPDiffLetters = SPLetters2 - SPLetters1; + SCORE SPDiffGaps = SPGaps2 - SPGaps1; + SCORE SPDiff = SPDiffLetters + SPDiffGaps; +#endif + + SCORE Letters1 = ScoreLetters(msa1, Edges1, uEdgeCount1); + SCORE Letters2 = ScoreLetters(msa2, Edges2, uEdgeCount2); + + SCORE Gaps1 = ScoreGaps(msa1, Edges1, uEdgeCount1); + SCORE Gaps2 = ScoreGaps(msa2, Edges2, uEdgeCount2); + + SCORE DiffLetters = Letters2 - Letters1; + SCORE DiffGaps = Gaps2 - Gaps1; + SCORE Diff = DiffLetters + DiffGaps; + +#if COMPARE_3_52 + Log("ObjScoreSP Letters1=%.4g Letters2=%.4g DiffLetters=%.4g\n", + SPLetters1, SPLetters2, SPDiffLetters); + + Log("DiffObjScore Letters1=%.4g Letters2=%.4g DiffLetters=%.4g\n", + Letters1, Letters2, DiffLetters); + + Log("ObjScoreSP Gaps1=%.4g Gaps2=%.4g DiffGaps=%.4g\n", + SPGaps1, SPGaps2, SPDiffGaps); + + Log("DiffObjScore Gaps1=%.4g Gaps2=%.4g DiffGaps=%.4g\n", + Gaps1, Gaps2, DiffGaps); + + Log("SP diff=%.4g DiffObjScore Diff=%.4g\n", SPDiff, Diff); +#endif + + return Diff; + } diff --git a/src/muscle/muscle3.8.31/src/diffpaths.cpp b/src/muscle/muscle3.8.31/src/diffpaths.cpp new file mode 100644 index 0000000..632ada6 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/diffpaths.cpp @@ -0,0 +1,114 @@ +#include "muscle.h" +#include "pwpath.h" + +#define 
TRACE 0 + +void DiffPaths(const PWPath &p1, const PWPath &p2, unsigned Edges1[], + unsigned *ptruDiffCount1, unsigned Edges2[], unsigned *ptruDiffCount2) + { +#if TRACE + Log("DiffPaths\n"); + Log("p1="); + p1.LogMe(); + Log("p2="); + p2.LogMe(); +#endif + const unsigned uEdgeCount1 = p1.GetEdgeCount(); + const unsigned uEdgeCount2 = p2.GetEdgeCount(); + + unsigned uDiffCount1 = 0; + unsigned uDiffCount2 = 0; + unsigned uEdgeIndex1 = 0; + unsigned uEdgeIndex2 = 0; + const PWEdge *Edge1 = &p1.GetEdge(uEdgeIndex1); + const PWEdge *Edge2 = &p2.GetEdge(uEdgeIndex2); + for (;;) + { + unsigned uEdgeIndexTop1 = uEdgeIndex1; + unsigned uEdgeIndexTop2 = uEdgeIndex2; + Edge1 = &p1.GetEdge(uEdgeIndex1); + Edge2 = &p2.GetEdge(uEdgeIndex2); +#if TRACE + Log("e1[%u] PLA%u PLB%u %c, e2[%u] PLA%u PLB %u %c DC1=%u DC2=%u\n", + uEdgeIndex1, Edge1->uPrefixLengthA, Edge1->uPrefixLengthB, Edge1->cType, + uEdgeIndex2, Edge2->uPrefixLengthA, Edge2->uPrefixLengthB, Edge2->cType, + uDiffCount1, uDiffCount2); +#endif + if (Edge1->uPrefixLengthA == Edge2->uPrefixLengthA && + Edge1->uPrefixLengthB == Edge2->uPrefixLengthB) + { + if (!Edge1->Equal(*Edge2)) + { + Edges1[uDiffCount1++] = uEdgeIndex1; + Edges2[uDiffCount2++] = uEdgeIndex2; + } + ++uEdgeIndex1; + ++uEdgeIndex2; + } + + else if (Edge2->uPrefixLengthA < Edge1->uPrefixLengthA || + Edge2->uPrefixLengthB < Edge1->uPrefixLengthB) + Edges2[uDiffCount2++] = uEdgeIndex2++; + + else if (Edge1->uPrefixLengthA < Edge2->uPrefixLengthA || + Edge1->uPrefixLengthB < Edge2->uPrefixLengthB) + Edges1[uDiffCount1++] = uEdgeIndex1++; + + if (uEdgeCount1 == uEdgeIndex1) + { + while (uEdgeIndex2 < uEdgeCount2) + Edges2[uDiffCount2++] = uEdgeIndex2++; + goto Done; + } + if (uEdgeCount2 == uEdgeIndex2) + { + while (uEdgeIndex1 < uEdgeCount1) + Edges1[uDiffCount1++] = uEdgeIndex1++; + goto Done; + } + if (uEdgeIndex1 == uEdgeIndexTop1 && uEdgeIndex2 == uEdgeIndexTop2) + Quit("DiffPaths stuck"); + } +Done:; +#if TRACE + Log("DiffCount1=%u (%u %u)\n", 
uDiffCount1, uEdgeCount1, uEdgeCount2); + Log("Diffs1="); + for (unsigned i = 0; i < uDiffCount1; ++i) + { + const PWEdge e = p1.GetEdge(Edges1[i]); + Log(" %u=%c%u.%u", Edges1[i], e.cType, e.uPrefixLengthA, e.uPrefixLengthB); + } + Log("\n"); + Log("DiffCount2=%u\n", uDiffCount2); + Log("Diffs2="); + for (unsigned i = 0; i < uDiffCount2; ++i) + { + const PWEdge e = p2.GetEdge(Edges2[i]); + Log(" %u=%c%u.%u", Edges2[i], e.cType, e.uPrefixLengthA, e.uPrefixLengthB); + } + Log("\n"); +#endif + *ptruDiffCount1 = uDiffCount1; + *ptruDiffCount2 = uDiffCount2; + } + +void TestDiffPaths() + { + PWPath p1; + PWPath p2; + + p1.AppendEdge('M', 1, 1); + p1.AppendEdge('M', 2, 2); + p1.AppendEdge('M', 3, 3); + + p2.AppendEdge('M', 1, 1); + p2.AppendEdge('D', 2, 1); + p2.AppendEdge('I', 2, 2); + p2.AppendEdge('M', 3, 3); + + unsigned Edges1[64]; + unsigned Edges2[64]; + unsigned uDiffCount1; + unsigned uDiffCount2; + DiffPaths(p1, p2, Edges1, &uDiffCount1, Edges2, &uDiffCount2); + } diff --git a/src/muscle/muscle3.8.31/src/difftrees.cpp b/src/muscle/muscle3.8.31/src/difftrees.cpp new file mode 100644 index 0000000..7445d22 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/difftrees.cpp @@ -0,0 +1,381 @@ +#include "muscle.h" +#include "tree.h" + +#define TRACE 0 + +/*** +Algorithm to compare two trees, X and Y. + +A node x in X and node y in Y are defined to be +similar iff the set of leaves in the subtree under +x is identical to the set of leaves under y. + +A node is defined to be dissimilar iff it is not +similar to any node in the other tree. + +Nodes x and y are defined to be married iff every +node in the subtree under x is similar to a node +in the subtree under y. Married nodes are considered +to be equal. The subtrees under two married nodes can +at most differ by exchanges of left and right branches, +which we do not consider to be significant here. + +A node is defined to be a bachelor iff it is not +married. 
If a node is a bachelor, then it has a +dissimilar node in its subtree, and it follows +immediately from the definition of marriage that its +parent is also a bachelor. Hence all nodes on the path +from a bachelor node to the root are bachelors. + +We assume the trees have the same set of leaves, so +every leaf is trivially both similar and married to +the same leaf in the opposite tree. Bachelor nodes +are therefore always internal (i.e., non-leaf) nodes. + +A node is defined to be a diff iff (a) it is married +and (b) its parent is a bachelor. The subtree under +a diff is maximally similar to the other tree. (In +other words, you cannot extend the subtree without +adding a bachelor). + +The set of diffs is the subset of the two trees that +we consider to be identical. + +Example: + + -----A + -----k + ----j -----B +--i -----C + ------D + + + -----A + -----p + ----n -----B +--m -----D + ------C + + +The following pairs of internal nodes are similar. + + Nodes Set of leaves + ----- ------------- + k,p A,B + i,m A,B,C,D + +Bachelors in the first tree are i and j, bachelors +in the second tree are m and n. + +Nodes k and p are married, but i and m are not (because j +and n are bachelors). The diffs are C, D and k. + +The set of bachelor nodes can be viewed as the internal +nodes of a tree, the leaves of which are diffs. (To see +that there can't be disjoint subtrees, note that the path +from a diff to the root is all bachelor nodes, so there is +always a path between two diffs that goes through the root). +We call this tree the "diffs tree". + +There is a simple O(N) algorithm to build the diffs tree. +To achieve O(N) we avoid traversing a given subtree multiple +times and also avoid comparing lists of leaves. + +We visit nodes in depth-first post-order (i.e., a node is visited +before its parent). + +If either child of a node is a bachelor, we flag it as +a bachelor. 
+ +If both children of the node we are visiting are married, +we check whether the spouses of those children have the +same parent in the other tree. If the parents are different, +the current node is a bachelor. If they have the same parent, +then the node we are visiting is the spouse of that parent. +We assign this newly identified married couple a unique integer +id. The id of a node is in one-to-one correspondence with the +set of leaves in its subtree. Two nodes have the same set of +leaves iff they have the same id. Bachelor nodes do not get +an id. +***/ + +static void BuildDiffs(const Tree &tree, unsigned uTreeNodeIndex, + const bool bIsDiff[], Tree &Diffs, unsigned uDiffsNodeIndex, + unsigned IdToDiffsLeafNodeIndex[]) + { +#if TRACE + Log("BuildDiffs(TreeNode=%u IsDiff=%d IsLeaf=%d)\n", + uTreeNodeIndex, bIsDiff[uTreeNodeIndex], tree.IsLeaf(uTreeNodeIndex)); +#endif + if (bIsDiff[uTreeNodeIndex]) + { + unsigned uLeafCount = tree.GetLeafCount(); + unsigned *Leaves = new unsigned[uLeafCount]; + GetLeaves(tree, uTreeNodeIndex, Leaves, &uLeafCount); + for (unsigned n = 0; n < uLeafCount; ++n) + { + const unsigned uLeafNodeIndex = Leaves[n]; + const unsigned uId = tree.GetLeafId(uLeafNodeIndex); + if (uId >= tree.GetLeafCount()) + Quit("BuildDiffs, id out of range"); + IdToDiffsLeafNodeIndex[uId] = uDiffsNodeIndex; +#if TRACE + Log(" Leaf id=%u DiffsNode=%u\n", uId, uDiffsNodeIndex); +#endif + } + delete[] Leaves; + return; + } + + if (tree.IsLeaf(uTreeNodeIndex)) + Quit("BuildDiffs: should never reach leaf"); + + const unsigned uTreeLeft = tree.GetLeft(uTreeNodeIndex); + const unsigned uTreeRight = tree.GetRight(uTreeNodeIndex); + + const unsigned uDiffsLeft = Diffs.AppendBranch(uDiffsNodeIndex); + const unsigned uDiffsRight = uDiffsLeft + 1; + + BuildDiffs(tree, uTreeLeft, bIsDiff, Diffs, uDiffsLeft, IdToDiffsLeafNodeIndex); + BuildDiffs(tree, uTreeRight, bIsDiff, Diffs, uDiffsRight, IdToDiffsLeafNodeIndex); + } + +void DiffTrees(const Tree &Tree1, const 
Tree &Tree2, Tree &Diffs, + unsigned IdToDiffsLeafNodeIndex[]) + { +#if TRACE + Log("Tree1:\n"); + Tree1.LogMe(); + Log("\n"); + Log("Tree2:\n"); + Tree2.LogMe(); +#endif + + if (!Tree1.IsRooted() || !Tree2.IsRooted()) + Quit("DiffTrees: requires rooted trees"); + + const unsigned uNodeCount = Tree1.GetNodeCount(); + const unsigned uNodeCount2 = Tree2.GetNodeCount(); + + const unsigned uLeafCount = Tree1.GetLeafCount(); + const unsigned uLeafCount2 = Tree2.GetLeafCount(); + assert(uLeafCount == uLeafCount2); + + if (uNodeCount != uNodeCount2) + Quit("DiffTrees: different node counts"); + +// Allocate tables so we can convert tree node index to +// and from the unique id with a O(1) lookup. + unsigned *NodeIndexToId1 = new unsigned[uNodeCount]; + unsigned *IdToNodeIndex2 = new unsigned[uNodeCount]; + + bool *bIsBachelor1 = new bool[uNodeCount]; + bool *bIsDiff1 = new bool[uNodeCount]; + + for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) + { + NodeIndexToId1[uNodeIndex] = uNodeCount; + bIsBachelor1[uNodeIndex] = false; + bIsDiff1[uNodeIndex] = false; + + // Use uNodeCount as value meaning "not set". + IdToNodeIndex2[uNodeIndex] = uNodeCount; + } + +// Initialize node index <-> id lookup tables + for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) + { + if (Tree1.IsLeaf(uNodeIndex)) + { + const unsigned uId = Tree1.GetLeafId(uNodeIndex); + if (uId >= uNodeCount) + Quit("Diff trees requires existing leaf ids in range 0 .. (N-1)"); + NodeIndexToId1[uNodeIndex] = uId; + } + + if (Tree2.IsLeaf(uNodeIndex)) + { + const unsigned uId = Tree2.GetLeafId(uNodeIndex); + if (uId >= uNodeCount) + Quit("Diff trees requires existing leaf ids in range 0 .. (N-1)"); + IdToNodeIndex2[uId] = uNodeIndex; + } + } + +// Validity check. 
This verifies that the ids +// pre-assigned to the leaves in Tree1 are unique +// (note that the id= uNodeCount) + { + Log("NewNode=%u uOld=%u > uNodeCount=%u\n", + uNewNodeIndex, uOld, uNodeCount); + Quit("Diff check failed"); + } + unsigned uIdNew = NewTree.GetLeafId(uNewNodeIndex); + unsigned uIdOld = OldTree.GetLeafId(uOld); + if (uIdNew != uIdOld) + { + Log("NewNode=%u uOld=%u IdNew=%u IdOld=%u\n", + uNewNodeIndex, uOld, uIdNew, uIdOld); + Quit("Diff check failed"); + } + continue; + } + + if (NODE_CHANGED == uOld) + continue; + + unsigned uNewLeft = NewTree.GetLeft(uNewNodeIndex); + unsigned uNewRight = NewTree.GetRight(uNewNodeIndex); + + unsigned uOldLeft = OldTree.GetLeft(uOld); + unsigned uOldRight = OldTree.GetRight(uOld); + + unsigned uNewLeftPartner = NewNodeIndexToOldNodeIndex[uNewLeft]; + unsigned uNewRightPartner = NewNodeIndexToOldNodeIndex[uNewRight]; + + bool bSameNotRotated = (uNewLeftPartner == uOldLeft && uNewRightPartner == uOldRight); + bool bSameRotated = (uNewLeftPartner == uOldRight && uNewRightPartner == uOldLeft); + if (!bSameNotRotated && !bSameRotated) + { + Log("NewNode=%u NewL=%u NewR=%u\n", uNewNodeIndex, uNewLeft, uNewRight); + Log("OldNode=%u OldL=%u OldR=%u\n", uOld, uOldLeft, uOldRight); + Log("NewLPartner=%u NewRPartner=%u\n", uNewLeftPartner, uNewRightPartner); + Quit("Diff check failed"); + } + } + } +#endif + } diff --git a/src/muscle/muscle3.8.31/src/distcalc.cpp b/src/muscle/muscle3.8.31/src/distcalc.cpp new file mode 100644 index 0000000..80058f5 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/distcalc.cpp @@ -0,0 +1,89 @@ +#include "muscle.h" +#include "distfunc.h" +#include "distcalc.h" +#include "msa.h" + +void DistCalcDF::Init(const DistFunc &DF) + { + m_ptrDF = &DF; + } + +void DistCalcDF::CalcDistRange(unsigned i, dist_t Dist[]) const + { + for (unsigned j = 0; j < i; ++j) + Dist[j] = m_ptrDF->GetDist(i, j); + } + +unsigned DistCalcDF::GetCount() const + { + return m_ptrDF->GetCount(); + } + +unsigned 
DistCalcDF::GetId(unsigned i) const + { + return m_ptrDF->GetId(i); + } + +const char *DistCalcDF::GetName(unsigned i) const + { + return m_ptrDF->GetName(i); + } + +void DistCalcMSA::Init(const MSA &msa, DISTANCE Distance) + { + m_ptrMSA = &msa; + m_Distance = Distance; + } + +void DistCalcMSA::CalcDistRange(unsigned i, dist_t Dist[]) const + { + for (unsigned j = 0; j < i; ++j) + { + switch (m_Distance) + { + case DISTANCE_PctIdKimura: + { + const float PctId = (float) m_ptrMSA->GetPctIdentityPair(i, j); + Dist[j] = (float) KimuraDist(PctId); + break; + } + case DISTANCE_PctIdLog: + { + const float PctId = (float) m_ptrMSA->GetPctIdentityPair(i, j); + Dist[j] = (float) PctIdToMAFFTDist(PctId); + break; + } + case DISTANCE_ScoreDist: + { + double GetScoreDist(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2); + Dist[j] = (float) GetScoreDist(*m_ptrMSA, i, j); + continue; + } + case DISTANCE_Edit: + { + const float PctId = (float) m_ptrMSA->GetPctIdentityPair(i, j); + if (PctId > 1.0) + Quit("Internal error, DISTANCE_Edit, pct id=%.3g", PctId); + Dist[j] = (float) 1.0 - PctId; + break; + } + default: + Quit("DistCalcMSA: Invalid DISTANCE_%u", m_Distance); + } + } + } + +unsigned DistCalcMSA::GetCount() const + { + return m_ptrMSA->GetSeqCount(); + } + +unsigned DistCalcMSA::GetId(unsigned i) const + { + return m_ptrMSA->GetSeqId(i); + } + +const char *DistCalcMSA::GetName(unsigned i) const + { + return m_ptrMSA->GetSeqName(i); + } diff --git a/src/muscle/muscle3.8.31/src/distcalc.h b/src/muscle/muscle3.8.31/src/distcalc.h new file mode 100644 index 0000000..6d1de55 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/distcalc.h @@ -0,0 +1,45 @@ +#ifndef DistCalc_h +#define DistCalc_h + +typedef float dist_t; +const dist_t BIG_DIST = (dist_t) 1e29; + +class DistFunc; + +class DistCalc + { +public: + virtual void CalcDistRange(unsigned i, dist_t Dist[]) const = 0; + virtual unsigned GetCount() const = 0; + virtual unsigned GetId(unsigned i) const = 0; + virtual 
const char *GetName(unsigned i) const = 0; + }; + +class DistCalcDF : public DistCalc + { +public: + void Init(const DistFunc &DF); + virtual void CalcDistRange(unsigned i, dist_t Dist[]) const; + virtual unsigned GetCount() const; + virtual unsigned GetId(unsigned i) const; + virtual const char *GetName(unsigned i) const; + +private: + const DistFunc *m_ptrDF; + }; + +class DistCalcMSA : public DistCalc + { +public: + void Init(const MSA &msa, DISTANCE Distance); + virtual void CalcDistRange(unsigned i, dist_t Dist[]) const; + virtual unsigned GetCount() const; + virtual unsigned GetId(unsigned i) const; + virtual const char *GetName(unsigned i) const; + +private: + const MSA *m_ptrMSA; + DISTANCE m_Distance; + }; + +#endif // DistCalc_h diff --git a/src/muscle/muscle3.8.31/src/distfunc.cpp b/src/muscle/muscle3.8.31/src/distfunc.cpp new file mode 100644 index 0000000..b435fb0 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/distfunc.cpp @@ -0,0 +1,113 @@ +#include "muscle.h" +#include "distfunc.h" +#include + +DistFunc::DistFunc() + { + m_Dists = 0; + m_uCount = 0; + m_uCacheCount = 0; + m_Names = 0; + m_Ids = 0; + } + +DistFunc::~DistFunc() + { + if (0 != m_Names) + { + for (unsigned i = 0; i < m_uCount; ++i) + free(m_Names[i]); + } + delete[] m_Dists; + delete[] m_Names; + delete[] m_Ids; + } + +float DistFunc::GetDist(unsigned uIndex1, unsigned uIndex2) const + { + return m_Dists[VectorIndex(uIndex1, uIndex2)]; + } + +unsigned DistFunc::GetCount() const + { + return m_uCount; + } + +void DistFunc::SetCount(unsigned uCount) + { + m_uCount = uCount; + if (uCount <= m_uCacheCount) + return; + delete[] m_Dists; + m_Dists = new float[VectorLength()]; + m_Names = new char *[m_uCount]; + m_Ids = new unsigned[m_uCount]; + m_uCacheCount = uCount; + + memset(m_Names, 0, m_uCount*sizeof(char *)); + memset(m_Ids, 0xff, m_uCount*sizeof(unsigned)); + memset(m_Dists, 0, VectorLength()*sizeof(float)); + } + +void DistFunc::SetDist(unsigned uIndex1, unsigned uIndex2, float 
dDist) + { + m_Dists[VectorIndex(uIndex1, uIndex2)] = dDist; + m_Dists[VectorIndex(uIndex2, uIndex1)] = dDist; + } + +unsigned DistFunc::VectorIndex(unsigned uIndex1, unsigned uIndex2) const + { + assert(uIndex1 < m_uCount && uIndex2 < m_uCount); + return uIndex1*m_uCount + uIndex2; + } + +unsigned DistFunc::VectorLength() const + { + return m_uCount*m_uCount; + } + +void DistFunc::SetName(unsigned uIndex, const char szName[]) + { + assert(uIndex < m_uCount); + m_Names[uIndex] = strsave(szName); + } + +void DistFunc::SetId(unsigned uIndex, unsigned uId) + { + assert(uIndex < m_uCount); + m_Ids[uIndex] = uId; + } + +const char *DistFunc::GetName(unsigned uIndex) const + { + assert(uIndex < m_uCount); + return m_Names[uIndex]; + } + +unsigned DistFunc::GetId(unsigned uIndex) const + { + assert(uIndex < m_uCount); + return m_Ids[uIndex]; + } + +void DistFunc::LogMe() const + { + Log("DistFunc::LogMe count=%u\n", m_uCount); + Log(" "); + for (unsigned i = 0; i < m_uCount; ++i) + Log(" %7u", i); + Log("\n"); + + Log(" "); + for (unsigned i = 0; i < m_uCount; ++i) + Log(" %7.7s", m_Names[i] ? m_Names[i] : ""); + Log("\n"); + + for (unsigned i = 0; i < m_uCount; ++i) + { + Log("%4u %10.10s : ", i, m_Names[i] ? 
m_Names[i] : ""); + for (unsigned j = 0; j <= i; ++j) + Log(" %7.4g", GetDist(i, j)); + Log("\n"); + } + } diff --git a/src/muscle/muscle3.8.31/src/distfunc.h b/src/muscle/muscle3.8.31/src/distfunc.h new file mode 100644 index 0000000..95a9f24 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/distfunc.h @@ -0,0 +1,36 @@ +#ifndef DistFunc_h +#define DistFunc_h + +class DistFunc + { +public: + DistFunc(); + virtual ~DistFunc(); + +public: + virtual void SetCount(unsigned uCount); + virtual void SetDist(unsigned uIndex1, unsigned uIndex2, float dDist); + + void SetName(unsigned uIndex, const char szName[]); + void SetId(unsigned uIndex, unsigned uId); + const char *GetName(unsigned uIndex) const; + unsigned GetId(unsigned uIndex) const; + + virtual float GetDist(unsigned uIndex1, unsigned uIndex2) const; + virtual unsigned GetCount() const; + + void LogMe() const; + +protected: + unsigned VectorIndex(unsigned uIndex, unsigned uIndex2) const; + unsigned VectorLength() const; + +private: + unsigned m_uCount; + unsigned m_uCacheCount; + float *m_Dists; + char **m_Names; + unsigned *m_Ids; + }; + +#endif // DistFunc_h diff --git a/src/muscle/muscle3.8.31/src/distpwkimura.cpp b/src/muscle/muscle3.8.31/src/distpwkimura.cpp new file mode 100644 index 0000000..cd666a3 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/distpwkimura.cpp @@ -0,0 +1,45 @@ +#include "muscle.h" +#include "distfunc.h" +#include "msa.h" +#include "seqvect.h" +#include "pwpath.h" + +void DistPWKimura(const SeqVect &v, DistFunc &DF) + { + SEQWEIGHT SeqWeightSave = GetSeqWeightMethod(); + SetSeqWeightMethod(SEQWEIGHT_Henikoff); + + const unsigned uSeqCount = v.Length(); + DF.SetCount(uSeqCount); + + const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2; + unsigned uCount = 0; + SetProgressDesc("PWKimura distance"); + for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) + { + const Seq &s1 = v.GetSeq(uSeqIndex1); + MSA msa1; + msa1.FromSeq(s1); + for (unsigned uSeqIndex2 = 0; uSeqIndex2 < 
uSeqIndex1; ++uSeqIndex2) + { + if (0 == uCount%20) + Progress(uCount, uPairCount); + ++uCount; + const Seq &s2 = v.GetSeq(uSeqIndex2); + MSA msa2; + msa2.FromSeq(s2); + + PWPath Path; + MSA msaOut; + AlignTwoMSAs(msa1, msa2, msaOut, Path, false, false); + + double dPctId = msaOut.GetPctIdentityPair(0, 1); + float f = (float) KimuraDist(dPctId); + + DF.SetDist(uSeqIndex1, uSeqIndex2, f); + } + } + ProgressStepsDone(); + + SetSeqWeightMethod(SeqWeightSave); + } diff --git a/src/muscle/muscle3.8.31/src/domuscle.cpp b/src/muscle/muscle3.8.31/src/domuscle.cpp new file mode 100644 index 0000000..73c5534 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/domuscle.cpp @@ -0,0 +1,299 @@ +#include "muscle.h" +#include "textfile.h" +#include "seqvect.h" +#include "distfunc.h" +#include "msa.h" +#include "tree.h" +#include "profile.h" +#include "timing.h" + +static char g_strUseTreeWarning[] = +"\n******** WARNING ****************\n" +"\nYou specified the -usetree option.\n" +"Note that a good evolutionary tree may NOT be a good\n" +"guide tree for multiple alignment. For more details,\n" +"please refer to the user guide. 
To disable this\n" +"warning, use -usetree_nowarn .\n\n"; + +void DoMuscle() + { + SetOutputFileName(g_pstrOutFileName); + SetInputFileName(g_pstrInFileName); + + SetMaxIters(g_uMaxIters); + SetSeqWeightMethod(g_SeqWeight1); + + TextFile fileIn(g_pstrInFileName); + SeqVect v; + v.FromFASTAFile(fileIn); + const unsigned uSeqCount = v.Length(); + + if (0 == uSeqCount) + Quit("No sequences in input file"); + + ALPHA Alpha = ALPHA_Undefined; + switch (g_SeqType) + { + case SEQTYPE_Auto: + Alpha = v.GuessAlpha(); + break; + + case SEQTYPE_Protein: + Alpha = ALPHA_Amino; + break; + + case SEQTYPE_DNA: + Alpha = ALPHA_DNA; + break; + + case SEQTYPE_RNA: + Alpha = ALPHA_RNA; + break; + + default: + Quit("Invalid seq type"); + } + SetAlpha(Alpha); + v.FixAlpha(); + + PTR_SCOREMATRIX UserMatrix = 0; + if (0 != g_pstrMatrixFileName) + { + const char *FileName = g_pstrMatrixFileName; + const char *Path = getenv("MUSCLE_MXPATH"); + if (Path != 0) + { + size_t n = strlen(Path) + 1 + strlen(FileName) + 1; + char *NewFileName = new char[n]; + sprintf(NewFileName, "%s/%s", Path, FileName); + FileName = NewFileName; + } + TextFile File(FileName); + UserMatrix = ReadMx(File); + g_Alpha = ALPHA_Amino; + g_PPScore = PPSCORE_SP; + } + + SetPPScore(); + + if (0 != UserMatrix) + g_ptrScoreMatrix = UserMatrix; + + unsigned uMaxL = 0; + unsigned uTotL = 0; + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + unsigned L = v.GetSeq(uSeqIndex).Length(); + uTotL += L; + if (L > uMaxL) + uMaxL = L; + } + + SetIter(1); + g_bDiags = g_bDiags1; + SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount); + + SetMuscleSeqVect(v); + + MSA::SetIdCount(uSeqCount); + +// Initialize sequence ids. +// From this point on, ids must somehow propogate from here. 
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + v.SetSeqId(uSeqIndex, uSeqIndex); + + if (0 == uSeqCount) + Quit("Input file '%s' has no sequences", g_pstrInFileName); + if (1 == uSeqCount) + { + TextFile fileOut(g_pstrOutFileName, true); + v.ToFile(fileOut); + return; + } + + if (uSeqCount > 1) + MHackStart(v); + +// First iteration + Tree GuideTree; + if (0 != g_pstrUseTreeFileName) + { + // Discourage users... + if (!g_bUseTreeNoWarn) + fprintf(stderr, "%s", g_strUseTreeWarning); + + // Read tree from file + TextFile TreeFile(g_pstrUseTreeFileName); + GuideTree.FromFile(TreeFile); + + // Make sure tree is rooted + if (!GuideTree.IsRooted()) + Quit("User tree must be rooted"); + + if (GuideTree.GetLeafCount() != uSeqCount) + Quit("User tree does not match input sequences"); + + const unsigned uNodeCount = GuideTree.GetNodeCount(); + for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) + { + if (!GuideTree.IsLeaf(uNodeIndex)) + continue; + const char *LeafName = GuideTree.GetLeafName(uNodeIndex); + unsigned uSeqIndex; + bool SeqFound = v.FindName(LeafName, &uSeqIndex); + if (!SeqFound) + Quit("Label %s in tree does not match sequences", LeafName); + unsigned uId = v.GetSeqIdFromName(LeafName); + GuideTree.SetLeafId(uNodeIndex, uId); + } + } + else + TreeFromSeqVect(v, GuideTree, g_Cluster1, g_Distance1, g_Root1, + g_pstrDistMxFileName1); + + const char *Tree1 = ValueOpt("Tree1"); + if (0 != Tree1) + { + TextFile f(Tree1, true); + GuideTree.ToFile(f); + if (g_bClusterOnly) + return; + } + + SetMuscleTree(GuideTree); + ValidateMuscleIds(GuideTree); + + MSA msa; + ProgNode *ProgNodes = 0; + if (g_bLow) + ProgNodes = ProgressiveAlignE(v, GuideTree, msa); + else + ProgressiveAlign(v, GuideTree, msa); + SetCurrentAlignment(msa); + + if (0 != g_pstrComputeWeightsFileName) + { + extern void OutWeights(const char *FileName, const MSA &msa); + SetMSAWeightsMuscle(msa); + OutWeights(g_pstrComputeWeightsFileName, msa); + return; + } + + 
ValidateMuscleIds(msa); + + if (1 == g_uMaxIters || 2 == uSeqCount) + { + //TextFile fileOut(g_pstrOutFileName, true); + //MHackEnd(msa); + //msa.ToFile(fileOut); + MuscleOutput(msa); + return; + } + + if (0 == g_pstrUseTreeFileName) + { + g_bDiags = g_bDiags2; + SetIter(2); + + if (g_bLow) + { + if (0 != g_uMaxTreeRefineIters) + RefineTreeE(msa, v, GuideTree, ProgNodes); + } + else + RefineTree(msa, GuideTree); + + const char *Tree2 = ValueOpt("Tree2"); + if (0 != Tree2) + { + TextFile f(Tree2, true); + GuideTree.ToFile(f); + } + } + + SetSeqWeightMethod(g_SeqWeight2); + SetMuscleTree(GuideTree); + + if (g_bAnchors) + RefineVert(msa, GuideTree, g_uMaxIters - 2); + else + RefineHoriz(msa, GuideTree, g_uMaxIters - 2, false, false); + +#if 0 +// Refining by subfamilies is disabled as it didn't give better +// results. I tried doing this before and after RefineHoriz. +// Should get back to this as it seems like this should work. + RefineSubfams(msa, GuideTree, g_uMaxIters - 2); +#endif + + ValidateMuscleIds(msa); + ValidateMuscleIds(GuideTree); + + //TextFile fileOut(g_pstrOutFileName, true); + //MHackEnd(msa); + //msa.ToFile(fileOut); + MuscleOutput(msa); + } + +void Run() + { + SetStartTime(); + Log("Started %s\n", GetTimeAsStr()); + for (int i = 0; i < g_argc; ++i) + Log("%s ", g_argv[i]); + Log("\n"); + +#if TIMING + TICKS t1 = GetClockTicks(); +#endif + if (g_bRefine) + Refine(); + else if (g_bRefineW) + { + extern void DoRefineW(); + DoRefineW(); + } + else if (g_bProfDB) + ProfDB(); + else if (g_bSW) + Local(); + else if (0 != g_pstrSPFileName) + DoSP(); + else if (g_bProfile) + Profile(); + else if (g_bPPScore) + PPScore(); + else if (g_bPAS) + ProgAlignSubFams(); + else if (g_bMakeTree) + { + extern void DoMakeTree(); + DoMakeTree(); + } + else + DoMuscle(); + +#if TIMING + extern TICKS g_ticksDP; + extern TICKS g_ticksObjScore; + TICKS t2 = GetClockTicks(); + TICKS TotalTicks = t2 - t1; + TICKS ticksOther = TotalTicks - g_ticksDP - g_ticksObjScore; + double 
dSecs = TicksToSecs(TotalTicks); + double PctDP = (double) g_ticksDP*100.0/(double) TotalTicks; + double PctOS = (double) g_ticksObjScore*100.0/(double) TotalTicks; + double PctOther = (double) ticksOther*100.0/(double) TotalTicks; + Log(" Ticks Secs Pct\n"); + Log(" ============ ======= =====\n"); + Log("DP %12ld %7.2f %5.1f%%\n", + (long) g_ticksDP, TicksToSecs(g_ticksDP), PctDP); + Log("OS %12ld %7.2f %5.1f%%\n", + (long) g_ticksObjScore, TicksToSecs(g_ticksObjScore), PctOS); + Log("Other %12ld %7.2f %5.1f%%\n", + (long) ticksOther, TicksToSecs(ticksOther), PctOther); + Log("Total %12ld %7.2f 100.0%%\n", (long) TotalTicks, dSecs); +#endif + + ListDiagSavings(); + Log("Finished %s\n", GetTimeAsStr()); + } diff --git a/src/muscle/muscle3.8.31/src/dosp.cpp b/src/muscle/muscle3.8.31/src/dosp.cpp new file mode 100644 index 0000000..8a8632e --- /dev/null +++ b/src/muscle/muscle3.8.31/src/dosp.cpp @@ -0,0 +1,60 @@ +#include "muscle.h" +#include "textfile.h" +#include "msa.h" +#include "objscore.h" +#include "tree.h" +#include "profile.h" + +void DoSP() + { + TextFile f(g_pstrSPFileName); + + MSA a; + a.FromFile(f); + + ALPHA Alpha = ALPHA_Undefined; + switch (g_SeqType) + { + case SEQTYPE_Auto: + Alpha = a.GuessAlpha(); + break; + + case SEQTYPE_Protein: + Alpha = ALPHA_Amino; + break; + + case SEQTYPE_DNA: + Alpha = ALPHA_DNA; + break; + + case SEQTYPE_RNA: + Alpha = ALPHA_RNA; + break; + + default: + Quit("Invalid SeqType"); + } + SetAlpha(Alpha); + a.FixAlpha(); + + SetPPScore(); + + const unsigned uSeqCount = a.GetSeqCount(); + if (0 == uSeqCount) + Quit("No sequences in input file %s", g_pstrSPFileName); + + MSA::SetIdCount(uSeqCount); + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + a.SetSeqId(uSeqIndex, uSeqIndex); + + SetSeqWeightMethod(g_SeqWeight1); + Tree tree; + TreeFromMSA(a, tree, g_Cluster2, g_Distance2, g_Root2); + SetMuscleTree(tree); + SetMSAWeightsMuscle((MSA &) a); + + SCORE SP = ObjScoreSP(a); + + Log("File=%s;SP=%.4g\n", 
g_pstrSPFileName, SP); + fprintf(stderr, "File=%s;SP=%.4g\n", g_pstrSPFileName, SP); + } diff --git a/src/muscle/muscle3.8.31/src/dpregionlist.h b/src/muscle/muscle3.8.31/src/dpregionlist.h new file mode 100644 index 0000000..a83e110 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/dpregionlist.h @@ -0,0 +1,73 @@ +#ifndef DPRegionList_h +#define DPRegionList_h + +#include "diaglist.h" + +enum DPREGIONTYPE + { + DPREGIONTYPE_Unknown, + DPREGIONTYPE_Diag, + DPREGIONTYPE_Rect + }; + +struct DPRegion + { + DPREGIONTYPE m_Type; + union + { + Diag m_Diag; + Rect m_Rect; + }; + }; + +const unsigned MAX_DPREGIONS = 1024; + +class DPRegionList + { +public: + DPRegionList() + { + m_uCount = 0; + } + ~DPRegionList() + { + Free(); + } + +public: +// Creation + void Clear() + { + Free(); + } + void Add(const DPRegion &r); + +// Accessors + unsigned GetCount() const + { + return m_uCount; + } + const DPRegion &Get(unsigned uIndex) const + { + assert(uIndex < m_uCount); + return m_DPRegions[uIndex]; + } + +// Diagnostics + void LogMe() const; + +private: + void Free() + { + m_uCount = 0; + } + +private: + unsigned m_uCount; + DPRegion m_DPRegions[MAX_DPREGIONS]; + }; + +void DiagListToDPRegionList(const DiagList &DL, DPRegionList &RL, + unsigned uLengthA, unsigned uLengthB); + +#endif // DPRegionList_h diff --git a/src/muscle/muscle3.8.31/src/dpreglist.cpp b/src/muscle/muscle3.8.31/src/dpreglist.cpp new file mode 100644 index 0000000..5979761 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/dpreglist.cpp @@ -0,0 +1,108 @@ +#include "muscle.h" +#include "dpreglist.h" + +unsigned DPRegionList::GetDPArea() const + { + unsigned uArea = 0; + for (unsigned i = 0; i < m_uCount; ++i) + { + const DPRegion &r = m_DPRegions[i]; + if (DPREGIONTYPE_Rect == r.m_Type) + uArea += r.m_Rect.m_uLengthA*r.m_Rect.m_uLengthB; + } + return uArea; + } + +void DPRegionList::Add(const DPRegion &r) + { + if (m_uCount == MAX_DPREGIONS) + Quit("DPRegionList::Add, overflow %d", m_uCount); + 
m_DPRegions[m_uCount] = r; + ++m_uCount; + } + +void DPRegionList::LogMe() const + { + Log("DPRegionList::LogMe, count=%u\n", m_uCount); + Log("Region Type StartA StartB EndA EndB\n"); + Log("------ ---- ------ ------ ---- ----\n"); + for (unsigned i = 0; i < m_uCount; ++i) + { + const DPRegion &r = m_DPRegions[i]; + Log("%6u ", i); + if (DPREGIONTYPE_Diag == r.m_Type) + Log("Diag %6u %6u %6u %6u\n", + r.m_Diag.m_uStartPosA, + r.m_Diag.m_uStartPosB, + r.m_Diag.m_uStartPosA + r.m_Diag.m_uLength - 1, + r.m_Diag.m_uStartPosB + r.m_Diag.m_uLength - 1); + else if (DPREGIONTYPE_Rect == r.m_Type) + Log("Rect %6u %6u %6u %6u\n", + r.m_Rect.m_uStartPosA, + r.m_Rect.m_uStartPosB, + r.m_Rect.m_uStartPosA + r.m_Rect.m_uLengthA - 1, + r.m_Rect.m_uStartPosB + r.m_Rect.m_uLengthB - 1); + else + Log(" *** ERROR *** Type=%u\n", r.m_Type); + } + } + +void DiagListToDPRegionList(const DiagList &DL, DPRegionList &RL, + unsigned uLengthA, unsigned uLengthB) + { + if (g_uDiagMargin > g_uMinDiagLength/2) + Quit("Invalid parameters, diagmargin=%d must be <= 2*diaglength=%d", + g_uDiagMargin, g_uMinDiagLength); + + unsigned uStartPosA = 0; + unsigned uStartPosB = 0; + const unsigned uDiagCount = DL.GetCount(); + DPRegion r; + for (unsigned uDiagIndex = 0; uDiagIndex < uDiagCount; ++uDiagIndex) + { + const Diag &d = DL.Get(uDiagIndex); + assert(d.m_uLength >= g_uMinDiagLength); + const unsigned uStartVertexA = d.m_uStartPosA + g_uDiagMargin - 1; + const unsigned uStartVertexB = d.m_uStartPosB + g_uDiagMargin - 1; + const unsigned uEndVertexA = d.m_uStartPosA + d.m_uLength - g_uDiagMargin; + const unsigned uEndVertexB = d.m_uStartPosB + d.m_uLength - g_uDiagMargin; + + r.m_Type = DPREGIONTYPE_Rect; + r.m_Rect.m_uStartPosA = uStartPosA; + r.m_Rect.m_uStartPosB = uStartPosB; + + assert(uStartVertexA + 1 >= uStartPosA); + assert(uStartVertexB + 1 >= uStartPosB); + r.m_Rect.m_uLengthA = uStartVertexA + 1 - uStartPosA; + r.m_Rect.m_uLengthB = uStartVertexB + 1 - uStartPosB; + RL.Add(r); + + if 
(uEndVertexA > uStartVertexA + 1) + { + const unsigned uDiagLengthMinusCaps = uEndVertexA - uStartVertexA - 1; + + r.m_Type = DPREGIONTYPE_Diag; + r.m_Diag.m_uStartPosA = uStartVertexA + 1; + r.m_Diag.m_uStartPosB = uStartVertexB + 1; + assert(uEndVertexA - uStartVertexA == uEndVertexB - uStartVertexB); + r.m_Diag.m_uLength = uEndVertexA - uStartVertexA - 1; + RL.Add(r); + } + + uStartPosA = uEndVertexA; + uStartPosB = uEndVertexB; + } + + assert((int) uLengthA - (int) uStartPosA >= (int) g_uDiagMargin); + assert((int) uLengthB - (int) uStartPosB >= (int) g_uDiagMargin); + + r.m_Type = DPREGIONTYPE_Rect; + r.m_Rect.m_uStartPosA = uStartPosA; + r.m_Rect.m_uStartPosB = uStartPosB; + + assert(uLengthA >= uStartPosA); + assert(uLengthB >= uStartPosB); + r.m_Rect.m_uLengthA = uLengthA - uStartPosA; + r.m_Rect.m_uLengthB = uLengthB - uStartPosB; + RL.Add(r); + } diff --git a/src/muscle/muscle3.8.31/src/dpreglist.h b/src/muscle/muscle3.8.31/src/dpreglist.h new file mode 100644 index 0000000..5ab8317 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/dpreglist.h @@ -0,0 +1,76 @@ +#ifndef dpreglist_h +#define dpreglist_h + +#include "diaglist.h" + +enum DPREGIONTYPE + { + DPREGIONTYPE_Unknown, + DPREGIONTYPE_Diag, + DPREGIONTYPE_Rect + }; + +struct DPRegion + { + DPREGIONTYPE m_Type; + union + { + Diag m_Diag; + Rect m_Rect; + }; + }; + +const unsigned MAX_DPREGIONS = 1024; + +class DPRegionList + { +public: + DPRegionList() + { + m_uCount = 0; + } + ~DPRegionList() + { + Free(); + } + +public: +// Creation + void Clear() + { + Free(); + } + void Add(const DPRegion &r); + +// Accessors + unsigned GetCount() const + { + return m_uCount; + } + + const DPRegion &Get(unsigned uIndex) const + { + assert(uIndex < m_uCount); + return m_DPRegions[uIndex]; + } + + unsigned GetDPArea() const; + +// Diagnostics + void LogMe() const; + +private: + void Free() + { + m_uCount = 0; + } + +private: + unsigned m_uCount; + DPRegion m_DPRegions[MAX_DPREGIONS]; + }; + +void 
DiagListToDPRegionList(const DiagList &DL, DPRegionList &RL,
+	unsigned uLengthA, unsigned uLengthB);
+
+#endif // dpreglist_h
diff --git a/src/muscle/muscle3.8.31/src/drawtree.cpp b/src/muscle/muscle3.8.31/src/drawtree.cpp
new file mode 100644
index 0000000..dacfc9a
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/drawtree.cpp
@@ -0,0 +1,41 @@
+#include "muscle.h"
+#include "tree.h"
+
+/***
+Simple tree drawing algorithm.
+
+y coordinate of node is index in depth-first traversal.
+x coordinate is distance from root.
+***/
+// Number of parent links between uNodeIndex and the root (0 for the root itself).
+static unsigned DistFromRoot(const Tree &tree, unsigned uNodeIndex)
+	{
+	const unsigned uRoot = tree.GetRootNodeIndex();
+	unsigned uDist = 0;
+	while (uNodeIndex != uRoot)
+		{
+		++uDist;
+		uNodeIndex = tree.GetParent(uNodeIndex);
+		}
+	return uDist;
+	}
+// Log the subtree under uNodeIndex: left child first, then this node, then right child.
+static void DrawNode(const Tree &tree, unsigned uNodeIndex)
+	{
+	if (!tree.IsLeaf(uNodeIndex))
+		DrawNode(tree, tree.GetLeft(uNodeIndex));
+
+	unsigned uDist = DistFromRoot(tree, uNodeIndex);
+	for (unsigned i = 0; i < 5*uDist; ++i)	// indent by 5 spaces per level below the root
+		Log(" ");
+	Log("%u\n", uNodeIndex);	// uNodeIndex is unsigned, so %u rather than %d
+
+	if (!tree.IsLeaf(uNodeIndex))
+		DrawNode(tree, tree.GetRight(uNodeIndex));
+	}
+// Log the whole tree, one node index per line, starting from the root.
+void DrawTree(const Tree &tree)
+	{
+	unsigned uRoot = tree.GetRootNodeIndex();
+	DrawNode(tree, uRoot);
+	}
diff --git a/src/muscle/muscle3.8.31/src/edgelist.cpp b/src/muscle/muscle3.8.31/src/edgelist.cpp
new file mode 100644
index 0000000..4fbac86
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/edgelist.cpp
@@ -0,0 +1,88 @@
+#include "muscle.h"
+#include "edgelist.h"
+// EdgeList keeps (uNode1, uNode2) pairs in two parallel arrays that start out unallocated.
+EdgeList::EdgeList()
+	{
+	m_uNode1 = 0;
+	m_uNode2 = 0;
+	m_uCount = 0;
+	m_uCacheSize = 0;
+	}
+
+EdgeList::~EdgeList()
+	{
+	Clear();
+	}
+// Free both arrays and return to the empty, unallocated state.
+void EdgeList::Clear()
+	{
+	delete[] m_uNode1;
+	delete[] m_uNode2;
+	m_uNode1 = 0;
+	m_uNode2 = 0;
+	m_uCount = 0;
+	m_uCacheSize = 0;
+	}
+// Append the pair (uNode1, uNode2), growing the arrays only when they are full.
+void EdgeList::Add(unsigned uNode1, unsigned uNode2)
+	{
+	if (m_uCount >= m_uCacheSize)	// full, or nothing allocated yet: grow before storing
+		Expand();
+	m_uNode1[m_uCount] = uNode1;
+	m_uNode2[m_uCount] = uNode2;
+	++m_uCount;
+	}
+
+unsigned EdgeList::GetCount() const
+	{
+	return m_uCount;
+	}
+// Copy edge number uIndex into *ptruNode1 / *ptruNode2; an index >= GetCount() is reported via Quit().
+void EdgeList::GetEdge(unsigned uIndex, unsigned *ptruNode1, unsigned *ptruNode2) const
+	{
+	if (uIndex >= m_uCount)	// slots at m_uCount and beyond were never stored
+		Quit("EdgeList::GetEdge(%u) count=%u", uIndex, m_uCount);
+	*ptruNode1 = m_uNode1[uIndex];
+	*ptruNode2 = m_uNode2[uIndex];
+	}
+// Replace the contents of this list with a copy of every edge in rhs.
+void EdgeList::Copy(const EdgeList &rhs)
+	{
+	Clear();	// NOTE: this also empties rhs when &rhs == this
+	const unsigned uCount = rhs.GetCount();
+	for (unsigned n = 0; n < uCount; ++n)
+		{
+		unsigned uNode1;
+		unsigned uNode2;
+		rhs.GetEdge(n, &uNode1, &uNode2);
+		Add(uNode1, uNode2);
+		}
+	}
+// Grow both arrays by 512 entries, keeping the m_uCount edges already stored.
+void EdgeList::Expand()
+	{
+	unsigned uNewCacheSize = m_uCacheSize + 512;
+	unsigned *NewNode1 = new unsigned[uNewCacheSize];
+	unsigned *NewNode2 = new unsigned[uNewCacheSize];
+	if (m_uCount > 0)
+		{
+		memcpy(NewNode1, m_uNode1, m_uCount*sizeof(unsigned));
+		memcpy(NewNode2, m_uNode2, m_uCount*sizeof(unsigned));
+		}
+	delete[] m_uNode1;
+	delete[] m_uNode2;
+	m_uNode1 = NewNode1;
+	m_uNode2 = NewNode2;
+	m_uCacheSize = uNewCacheSize;
+	}
+// Log every edge on one line as "a->b", separated by single spaces.
+void EdgeList::LogMe() const
+	{
+	for (unsigned n = 0; n < m_uCount; ++n)
+		{
+		if (n > 0)
+			Log(" ");
+		Log("%u->%u", m_uNode1[n], m_uNode2[n]);
+		}
+	Log("\n");
+	}
diff --git a/src/muscle/muscle3.8.31/src/edgelist.h b/src/muscle/muscle3.8.31/src/edgelist.h
new file mode 100644
index 0000000..b318027
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/edgelist.h
@@ -0,0 +1,28 @@
+#ifndef EdgeList_h
+#define EdgeList_h
+// Growable list of (uNode1, uNode2) index pairs stored in two parallel heap arrays.
+class EdgeList
+	{
+public:
+	EdgeList();
+	virtual ~EdgeList();
+
+public:
+	void Clear();
+	void Add(unsigned uNode1, unsigned uNode2);
+	unsigned GetCount() const;
+	void GetEdge(unsigned uIndex, unsigned *ptruNode1, unsigned *ptruNode2) const;
+	void Copy(const EdgeList &rhs);
+	void LogMe() const;
+
+private:
+	void Expand();
+// The arrays below are owned raw pointers and no copy constructor/assignment is declared here.
+private:
+	unsigned m_uCount;	// number of edges stored
+	unsigned m_uCacheSize;	// allocated length of each array
+	unsigned *m_uNode1;
+	unsigned *m_uNode2;
+	};
+
+#endif // EdgeList_h
diff --git a/src/muscle/muscle3.8.31/src/enumopts.cpp b/src/muscle/muscle3.8.31/src/enumopts.cpp
new file mode 100644 index 0000000..d504a88 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/enumopts.cpp @@ -0,0 +1,8 @@ +#include "muscle.h" +#include "enumopts.h" + +#define s(t) EnumOpt t##_Opts[] = { +#define c(t, x) #x, t##_##x, +#define e(t) 0, 0 }; + +#include "enums.h" diff --git a/src/muscle/muscle3.8.31/src/enumopts.h b/src/muscle/muscle3.8.31/src/enumopts.h new file mode 100644 index 0000000..e3962d5 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/enumopts.h @@ -0,0 +1,16 @@ +#ifndef enumopts_h +#define enumopts_h + +struct EnumOpt + { + const char *pstrOpt; + int iValue; + }; + +#define s(t) extern EnumOpt t##_Opts[]; +#define c(t, x) /* empty */ +#define e(t) /* empty */ +#include "enums.h" + + +#endif // enumopts_h diff --git a/src/muscle/muscle3.8.31/src/enums.h b/src/muscle/muscle3.8.31/src/enums.h new file mode 100644 index 0000000..beaf7e6 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/enums.h @@ -0,0 +1,98 @@ +// enums.h +// Define enum types. +// Exploit macro hacks to avoid lots of repetitive typing. +// Generally I am opposed to macro hacks because of the +// highly obscure code that results, but in this case it +// makes maintenance much easier and less error-prone. +// The idea is that this file can be included in different +// places with different definitions of s (Start), c (Case) +// and e (End). See types.h. 
+ +s(ALPHA) +c(ALPHA, Amino) +c(ALPHA, DNA) +c(ALPHA, RNA) +e(ALPHA) + +s(SEQTYPE) +c(SEQTYPE, Protein) +c(SEQTYPE, DNA) +c(SEQTYPE, RNA) +c(SEQTYPE, Auto) +e(SEQTYPE) + +s(ROOT) +c(ROOT, Pseudo) +c(ROOT, MidLongestSpan) +c(ROOT, MinAvgLeafDist) +e(ROOT) + +s(CLUSTER) +c(CLUSTER, UPGMA) +c(CLUSTER, UPGMAMax) +c(CLUSTER, UPGMAMin) +c(CLUSTER, UPGMB) +c(CLUSTER, NeighborJoining) +e(CLUSTER) + +s(JOIN) +c(JOIN, NearestNeighbor) +c(JOIN, NeighborJoining) +e(JOIN) + +s(LINKAGE) +c(LINKAGE, Min) +c(LINKAGE, Avg) +c(LINKAGE, Max) +c(LINKAGE, NeighborJoining) +c(LINKAGE, Biased) +e(LINKAGE) + +s(DISTANCE) +c(DISTANCE, Kmer6_6) +c(DISTANCE, Kmer20_3) +c(DISTANCE, Kmer20_4) +c(DISTANCE, Kbit20_3) +c(DISTANCE, Kmer4_6) +c(DISTANCE, PctIdKimura) +c(DISTANCE, PctIdLog) +c(DISTANCE, PWKimura) +c(DISTANCE, PWScoreDist) +c(DISTANCE, ScoreDist) +c(DISTANCE, Edit) +e(DISTANCE) + +s(PPSCORE) +c(PPSCORE, LE) +c(PPSCORE, SP) +c(PPSCORE, SV) +c(PPSCORE, SPN) +e(PPSCORE) + +s(SEQWEIGHT) +c(SEQWEIGHT, None) +c(SEQWEIGHT, Henikoff) +c(SEQWEIGHT, HenikoffPB) +c(SEQWEIGHT, GSC) +c(SEQWEIGHT, ClustalW) +c(SEQWEIGHT, ThreeWay) +e(SEQWEIGHT) + +s(OBJSCORE) +c(OBJSCORE, SP) // Sum of Pairs of sequences +c(OBJSCORE, DP) // Dynamic Programming score +c(OBJSCORE, XP) // Cross Pairs = sum of pairs between two MSAs +c(OBJSCORE, PS) // sum of Prof-Seq score for all seqs in MSA +c(OBJSCORE, SPF) // sum of pairs, fast approximation +c(OBJSCORE, SPM) // sp if <= 100 seqs, spf otherwise +e(OBJSCORE) + +s(TERMGAPS) +c(TERMGAPS, Full) +c(TERMGAPS, Half) +c(TERMGAPS, Ext) +e(TERMGAPS) + +#undef s +#undef c +#undef e diff --git a/src/muscle/muscle3.8.31/src/enumtostr.cpp b/src/muscle/muscle3.8.31/src/enumtostr.cpp new file mode 100644 index 0000000..8b085a9 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/enumtostr.cpp @@ -0,0 +1,16 @@ +#include "muscle.h" +#include + +static char szMsg[64]; + +// Define XXXToStr(XXX x) functions for each enum type XXX. 
+#define s(t) const char *t##ToStr(t x) { switch (x) { case t##_Undefined: return "Undefined"; +#define c(t, x) case t##_##x: return #x; +#define e(t) } sprintf(szMsg, #t "_%d", x); return szMsg; } +#include "enums.h" + +// Define StrToXXX(const char *Str) functions for each enum type XXX. +#define s(t) t StrTo##t(const char *Str) { if (0) ; +#define c(t, x) else if (0 == stricmp(#x, Str)) return t##_##x; +#define e(t) Quit("Invalid value %s for type %s", Str, #t); return t##_Undefined; } +#include "enums.h" diff --git a/src/muscle/muscle3.8.31/src/estring.cpp b/src/muscle/muscle3.8.31/src/estring.cpp new file mode 100644 index 0000000..ef7a576 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/estring.cpp @@ -0,0 +1,689 @@ +#include "muscle.h" +#include "pwpath.h" +#include "estring.h" +#include "seq.h" +#include "msa.h" + +/*** +An "estring" is an edit string that operates on a sequence. +An estring is represented as a vector of integers. +It is interpreted in order of increasing suffix. +A positive value n means copy n letters. +A negative value -n means insert n indels. +Zero marks the end of the vector. +Consecutive entries must have opposite sign, i.e. the +shortest possible representation must be used. + +A "tpair" is a traceback path for a pairwise alignment +represented as two estrings, one for each sequence. 
+***/ + +#define c2(c,d) (((unsigned char) c) << 8 | (unsigned char) d) + +unsigned LengthEstring(const short es[]) + { + unsigned i = 0; + while (*es++ != 0) + ++i; + return i; + } + +short *EstringNewCopy(const short es[]) + { + unsigned n = LengthEstring(es) + 1; + short *esNew = new short[n]; + memcpy(esNew, es, n*sizeof(short)); + return esNew; + } + +void LogEstring(const short es[]) + { + Log("<"); + for (unsigned i = 0; es[i] != 0; ++i) + { + if (i > 0) + Log(" "); + Log("%d", es[i]); + } + Log(">"); + } + +static bool EstringsEq(const short es1[], const short es2[]) + { + for (;;) + { + if (*es1 != *es2) + return false; + if (0 == *es1) + break; + ++es1; + ++es2; + } + return true; + } + +static void EstringCounts(const short es[], unsigned *ptruSymbols, + unsigned *ptruIndels) + { + unsigned uSymbols = 0; + unsigned uIndels = 0; + for (unsigned i = 0; es[i] != 0; ++i) + { + short n = es[i]; + if (n > 0) + uSymbols += n; + else if (n < 0) + uIndels += -n; + } + *ptruSymbols = uSymbols; + *ptruIndels = uIndels; + } + +static char *EstringOp(const short es[], const char s[]) + { + unsigned uSymbols; + unsigned uIndels; + EstringCounts(es, &uSymbols, &uIndels); + assert((unsigned) strlen(s) == uSymbols); + char *sout = new char[uSymbols + uIndels + 1]; + char *psout = sout; + for (;;) + { + int n = *es++; + if (0 == n) + break; + if (n > 0) + for (int i = 0; i < n; ++i) + *psout++ = *s++; + else + for (int i = 0; i < -n; ++i) + *psout++ = '-'; + } + assert(0 == *s); + *psout = 0; + return sout; + } + +void EstringOp(const short es[], const Seq &sIn, Seq &sOut) + { +#if DEBUG + unsigned uSymbols; + unsigned uIndels; + EstringCounts(es, &uSymbols, &uIndels); + assert(sIn.Length() == uSymbols); +#endif + sOut.Clear(); + sOut.SetName(sIn.GetName()); + int p = 0; + for (;;) + { + int n = *es++; + if (0 == n) + break; + if (n > 0) + for (int i = 0; i < n; ++i) + { + char c = sIn[p++]; + sOut.push_back(c); + } + else + for (int i = 0; i < -n; ++i) + 
sOut.push_back('-'); + } + } + +unsigned EstringOp(const short es[], const Seq &sIn, MSA &a) + { + unsigned uSymbols; + unsigned uIndels; + EstringCounts(es, &uSymbols, &uIndels); + assert(sIn.Length() == uSymbols); + + unsigned uColCount = uSymbols + uIndels; + + a.Clear(); + a.SetSize(1, uColCount); + + a.SetSeqName(0, sIn.GetName()); + a.SetSeqId(0, sIn.GetId()); + + unsigned p = 0; + unsigned uColIndex = 0; + for (;;) + { + int n = *es++; + if (0 == n) + break; + if (n > 0) + for (int i = 0; i < n; ++i) + { + char c = sIn[p++]; + a.SetChar(0, uColIndex++, c); + } + else + for (int i = 0; i < -n; ++i) + a.SetChar(0, uColIndex++, '-'); + } + assert(uColIndex == uColCount); + return uColCount; + } + +void PathToEstrings(const PWPath &Path, short **ptresA, short **ptresB) + { +// First pass to determine size of estrings esA and esB + const unsigned uEdgeCount = Path.GetEdgeCount(); + if (0 == uEdgeCount) + { + short *esA = new short[1]; + short *esB = new short[1]; + esA[0] = 0; + esB[0] = 0; + *ptresA = esA; + *ptresB = esB; + return; + } + + unsigned iLengthA = 1; + unsigned iLengthB = 1; + const char cFirstEdgeType = Path.GetEdge(0).cType; + char cPrevEdgeType = cFirstEdgeType; + for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex) + { + const PWEdge &Edge = Path.GetEdge(uEdgeIndex); + char cEdgeType = Edge.cType; + + switch (c2(cPrevEdgeType, cEdgeType)) + { + case c2('M', 'M'): + case c2('D', 'D'): + case c2('I', 'I'): + break; + + case c2('D', 'M'): + case c2('M', 'D'): + ++iLengthB; + break; + + case c2('I', 'M'): + case c2('M', 'I'): + ++iLengthA; + break; + + case c2('I', 'D'): + case c2('D', 'I'): + ++iLengthB; + ++iLengthA; + break; + + default: + assert(false); + } + cPrevEdgeType = cEdgeType; + } + +// Pass2 for seq A + { + short *esA = new short[iLengthA+1]; + unsigned iA = 0; + switch (Path.GetEdge(0).cType) + { + case 'M': + case 'D': + esA[0] = 1; + break; + + case 'I': + esA[0] = -1; + break; + + default: + assert(false); + } + + 
char cPrevEdgeType = cFirstEdgeType; + for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex) + { + const PWEdge &Edge = Path.GetEdge(uEdgeIndex); + char cEdgeType = Edge.cType; + + switch (c2(cPrevEdgeType, cEdgeType)) + { + case c2('M', 'M'): + case c2('D', 'D'): + case c2('D', 'M'): + case c2('M', 'D'): + ++(esA[iA]); + break; + + case c2('I', 'D'): + case c2('I', 'M'): + ++iA; + esA[iA] = 1; + break; + + case c2('M', 'I'): + case c2('D', 'I'): + ++iA; + esA[iA] = -1; + break; + + case c2('I', 'I'): + --(esA[iA]); + break; + + default: + assert(false); + } + + cPrevEdgeType = cEdgeType; + } + assert(iA == iLengthA - 1); + esA[iLengthA] = 0; + *ptresA = esA; + } + + { +// Pass2 for seq B + short *esB = new short[iLengthB+1]; + unsigned iB = 0; + switch (Path.GetEdge(0).cType) + { + case 'M': + case 'I': + esB[0] = 1; + break; + + case 'D': + esB[0] = -1; + break; + + default: + assert(false); + } + + char cPrevEdgeType = cFirstEdgeType; + for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex) + { + const PWEdge &Edge = Path.GetEdge(uEdgeIndex); + char cEdgeType = Edge.cType; + + switch (c2(cPrevEdgeType, cEdgeType)) + { + case c2('M', 'M'): + case c2('I', 'I'): + case c2('I', 'M'): + case c2('M', 'I'): + ++(esB[iB]); + break; + + case c2('D', 'I'): + case c2('D', 'M'): + ++iB; + esB[iB] = 1; + break; + + case c2('M', 'D'): + case c2('I', 'D'): + ++iB; + esB[iB] = -1; + break; + + case c2('D', 'D'): + --(esB[iB]); + break; + + default: + assert(false); + } + + cPrevEdgeType = cEdgeType; + } + assert(iB == iLengthB - 1); + esB[iLengthB] = 0; + *ptresB = esB; + } + +#if DEBUG + { + const PWEdge &LastEdge = Path.GetEdge(uEdgeCount - 1); + unsigned uSymbols; + unsigned uIndels; + EstringCounts(*ptresA, &uSymbols, &uIndels); + assert(uSymbols == LastEdge.uPrefixLengthA); + assert(uSymbols + uIndels == uEdgeCount); + + EstringCounts(*ptresB, &uSymbols, &uIndels); + assert(uSymbols == LastEdge.uPrefixLengthB); + assert(uSymbols + uIndels == 
uEdgeCount); + + PWPath TmpPath; + EstringsToPath(*ptresA, *ptresB, TmpPath); + TmpPath.AssertEqual(Path); + } +#endif + } + +void EstringsToPath(const short esA[], const short esB[], PWPath &Path) + { + Path.Clear(); + unsigned iA = 0; + unsigned iB = 0; + int nA = esA[iA++]; + int nB = esB[iB++]; + unsigned uPrefixLengthA = 0; + unsigned uPrefixLengthB = 0; + for (;;) + { + char cType; + if (nA > 0) + { + if (nB > 0) + { + cType = 'M'; + --nA; + --nB; + } + else if (nB < 0) + { + cType = 'D'; + --nA; + ++nB; + } + else + assert(false); + } + else if (nA < 0) + { + if (nB > 0) + { + cType = 'I'; + ++nA; + --nB; + } + else + assert(false); + } + else + assert(false); + + switch (cType) + { + case 'M': + ++uPrefixLengthA; + ++uPrefixLengthB; + break; + case 'D': + ++uPrefixLengthA; + break; + case 'I': + ++uPrefixLengthB; + break; + } + + PWEdge Edge; + Edge.cType = cType; + Edge.uPrefixLengthA = uPrefixLengthA; + Edge.uPrefixLengthB = uPrefixLengthB; + Path.AppendEdge(Edge); + + if (nA == 0) + { + if (0 == esA[iA]) + { + assert(0 == esB[iB]); + break; + } + nA = esA[iA++]; + } + if (nB == 0) + nB = esB[iB++]; + } + } + +/*** +Multiply two estrings to make a third estring. +The product of two estrings e1*e2 is defined to be +the estring that produces the same result as applying +e1 then e2. Multiplication is not commutative. In fact, +the reversed order is undefined unless both estrings +consist of a single, identical, positive entry. +A primary motivation for using estrings is that +multiplication is very fast, reducing the time +needed to construct the root alignment. 
+ +Example + + <-1,3>(XXX) = -XXX + <2,-1,2>(-XXX) = -X-XX + +Therefore, + + <-1,3>*<2,-1,2> = <-1,1,-1,2> +***/ + +static bool CanMultiplyEstrings(const short es1[], const short es2[]) + { + unsigned uSymbols1; + unsigned uSymbols2; + unsigned uIndels1; + unsigned uIndels2; + EstringCounts(es1, &uSymbols1, &uIndels1); + EstringCounts(es2, &uSymbols2, &uIndels2); + return uSymbols1 + uIndels1 == uSymbols2; + } + +static inline void AppendGaps(short esp[], int &ip, int n) + { + if (-1 == ip) + esp[++ip] = n; + else if (esp[ip] < 0) + esp[ip] += n; + else + esp[++ip] = n; + } + +static inline void AppendSymbols(short esp[], int &ip, int n) + { + if (-1 == ip) + esp[++ip] = n; + else if (esp[ip] > 0) + esp[ip] += n; + else + esp[++ip] = n; + } + +void MulEstrings(const short es1[], const short es2[], short esp[]) + { + assert(CanMultiplyEstrings(es1, es2)); + + unsigned i1 = 0; + int ip = -1; + int n1 = es1[i1++]; + for (unsigned i2 = 0; ; ++i2) + { + int n2 = es2[i2]; + if (0 == n2) + break; + if (n2 > 0) + { + for (;;) + { + if (n1 < 0) + { + if (n2 > -n1) + { + AppendGaps(esp, ip, n1); + n2 += n1; + n1 = es1[i1++]; + } + else if (n2 == -n1) + { + AppendGaps(esp, ip, n1); + n1 = es1[i1++]; + break; + } + else + { + assert(n2 < -n1); + AppendGaps(esp, ip, -n2); + n1 += n2; + break; + } + } + else + { + assert(n1 > 0); + if (n2 > n1) + { + AppendSymbols(esp, ip, n1); + n2 -= n1; + n1 = es1[i1++]; + } + else if (n2 == n1) + { + AppendSymbols(esp, ip, n1); + n1 = es1[i1++]; + break; + } + else + { + assert(n2 < n1); + AppendSymbols(esp, ip, n2); + n1 -= n2; + break; + } + } + } + } + else + { + assert(n2 < 0); + AppendGaps(esp, ip, n2); + } + } + esp[++ip] = 0; + +#if DEBUG + { + int MaxLen = (int) (LengthEstring(es1) + LengthEstring(es2) + 1); + assert(ip < MaxLen); + if (ip >= 2) + for (int i = 0; i < ip - 2; ++i) + { + if (!(esp[i] > 0 && esp[i+1] < 0 || esp[i] < 0 && esp[i+1] > 0)) + { + Log("Bad result of MulEstring: "); + LogEstring(esp); + Quit("Assert failed 
(alternating signs)"); + } + } + unsigned uSymbols1; + unsigned uSymbols2; + unsigned uSymbolsp; + unsigned uIndels1; + unsigned uIndels2; + unsigned uIndelsp; + EstringCounts(es1, &uSymbols1, &uIndels1); + EstringCounts(es2, &uSymbols2, &uIndels2); + EstringCounts(esp, &uSymbolsp, &uIndelsp); + if (uSymbols1 + uIndels1 != uSymbols2) + { + Log("Bad result of MulEstring: "); + LogEstring(esp); + Quit("Assert failed (counts1 %u %u %u)", + uSymbols1, uIndels1, uSymbols2); + } + } +#endif + } + +static void test(const short es1[], const short es2[], const short esa[]) + { + unsigned uSymbols1; + unsigned uSymbols2; + unsigned uIndels1; + unsigned uIndels2; + EstringCounts(es1, &uSymbols1, &uIndels1); + EstringCounts(es2, &uSymbols2, &uIndels2); + + char s[4096]; + memset(s, 'X', sizeof(s)); + s[uSymbols1] = 0; + + char *s1 = EstringOp(es1, s); + char *s12 = EstringOp(es2, s1); + + memset(s, 'X', sizeof(s)); + s[uSymbols2] = 0; + char *s2 = EstringOp(es2, s); + + Log("%s * %s = %s\n", s1, s2, s12); + + LogEstring(es1); + Log(" * "); + LogEstring(es2); + Log(" = "); + LogEstring(esa); + Log("\n"); + + short esp[4096]; + MulEstrings(es1, es2, esp); + LogEstring(esp); + if (!EstringsEq(esp, esa)) + Log(" *ERROR* "); + Log("\n"); + + memset(s, 'X', sizeof(s)); + s[uSymbols1] = 0; + char *sp = EstringOp(esp, s); + Log("%s\n", sp); + Log("\n==========\n\n"); + } + +void TestEstrings() + { + SetListFileName("c:\\tmp\\muscle.log", false); + //{ + //short es1[] = { -1, 1, -1, 0 }; + //short es2[] = { 1, -1, 2, 0 }; + //short esa[] = { -2, 1, -1, 0 }; + //test(es1, es2, esa); + //} + //{ + //short es1[] = { 2, -1, 2, 0 }; + //short es2[] = { 1, -1, 3, -1, 1, 0 }; + //short esa[] = { 1, -1, 1, -1, 1, -1, 1, 0 }; + //test(es1, es2, esa); + //} + //{ + //short es1[] = { -1, 3, 0 }; + //short es2[] = { 2, -1, 2, 0 }; + //short esa[] = { -1, 1, -1, 2, 0 }; + //test(es1, es2, esa); + //} + //{ + //short es1[] = { -1, 1, -1, 1, 0}; + //short es2[] = { 4, 0 }; + //short esa[] = { -1, 1, 
-1, 1, 0}; + //test(es1, es2, esa); + //} + //{ + //short es1[] = { 1, -1, 1, -1, 0}; + //short es2[] = { 4, 0 }; + //short esa[] = { 1, -1, 1, -1, 0}; + //test(es1, es2, esa); + //} + //{ + //short es1[] = { 1, -1, 1, -1, 0}; + //short es2[] = { -1, 4, -1, 0 }; + //short esa[] = { -1, 1, -1, 1, -2, 0}; + //test(es1, es2, esa); + //} + { + short es1[] = { 106, -77, 56, -2, 155, -3, 123, -2, 0}; + short es2[] = { 50, -36, 34, -3, 12, -6, 1, -6, 18, -17, 60, -5, 349, -56, 0 }; + short esa[] = { 0 }; + test(es1, es2, esa); + } + exit(0); + } diff --git a/src/muscle/muscle3.8.31/src/estring.h b/src/muscle/muscle3.8.31/src/estring.h new file mode 100644 index 0000000..be354f8 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/estring.h @@ -0,0 +1,13 @@ +#ifndef pathsum_h +#define pathsum_h + +void PathToEstrings(const PWPath &Path, short **ptresA, short **ptresB); +void EstringsToPath(const short esA[], const short esB[], PWPath &Path); +void MulEstrings(const short es1[], const short es2[], short esp[]); +void EstringOp(const short es[], const Seq &sIn, Seq &sOut); +unsigned EstringOp(const short es[], const Seq &sIn, MSA &a); +void LogEstring(const short es[]); +unsigned LengthEstring(const short es[]); +short *EstringNewCopy(const short es[]); + +#endif // pathsum_h diff --git a/src/muscle/muscle3.8.31/src/fasta.cpp b/src/muscle/muscle3.8.31/src/fasta.cpp new file mode 100644 index 0000000..0eee1c5 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/fasta.cpp @@ -0,0 +1,56 @@ +#include "muscle.h" +#include +#include +#include "msa.h" +#include "textfile.h" + +const unsigned FASTA_BLOCK = 60; + +void MSA::FromFASTAFile(TextFile &File) + { + Clear(); + + FILE *f = File.GetStdioFile(); + + unsigned uSeqCount = 0; + unsigned uColCount = uInsane; + for (;;) + { + char *Label; + unsigned uSeqLength; + char *SeqData = GetFastaSeq(f, &uSeqLength, &Label, false); + if (0 == SeqData) + break; + AppendSeq(SeqData, uSeqLength, Label); + } + } + +void MSA::ToFASTAFile(TextFile &File) 
const + { + const unsigned uColCount = GetColCount(); + assert(uColCount > 0); + const unsigned uLinesPerSeq = (GetColCount() - 1)/FASTA_BLOCK + 1; + const unsigned uSeqCount = GetSeqCount(); + + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + File.PutString(">"); + File.PutString(GetSeqName(uSeqIndex)); + File.PutString("\n"); + + unsigned n = 0; + for (unsigned uLine = 0; uLine < uLinesPerSeq; ++uLine) + { + unsigned uLetters = uColCount - uLine*FASTA_BLOCK; + if (uLetters > FASTA_BLOCK) + uLetters = FASTA_BLOCK; + for (unsigned i = 0; i < uLetters; ++i) + { + char c = GetChar(uSeqIndex, n); + File.PutChar(c); + ++n; + } + File.PutChar('\n'); + } + } + } diff --git a/src/muscle/muscle3.8.31/src/fasta2.cpp b/src/muscle/muscle3.8.31/src/fasta2.cpp new file mode 100644 index 0000000..98b0fa3 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/fasta2.cpp @@ -0,0 +1,114 @@ +#include "muscle.h" +#include +#include + +//const int BUFFER_BYTES = 16*1024; +const int BUFFER_BYTES = 128; +const int CR = '\r'; +const int NL = '\n'; + +#define ADD(c) \ + { \ + if (Pos >= BufferLength) \ + { \ + const int NewBufferLength = BufferLength + BUFFER_BYTES; \ + char *NewBuffer = new char[NewBufferLength]; \ + memcpy(NewBuffer, Buffer, BufferLength); \ + delete[] Buffer; \ + Buffer = NewBuffer; \ + BufferLength = NewBufferLength; \ + } \ + Buffer[Pos++] = c; \ + } + +// Get next sequence from file. 
+// Reads one record: returns a new[]-allocated buffer of *ptrSeqLength sequence characters
+// (NOT nul-terminated) and stores a new[]-allocated, nul-terminated label in *ptrLabel.
+// Returns 0 at end of file. Records that contain no sequence characters are skipped.
+char *GetFastaSeq(FILE *f, unsigned *ptrSeqLength, char **ptrLabel, bool DeleteGaps)
+	{
+	unsigned BufferLength = 0;
+	unsigned Pos = 0;
+	char *Buffer = 0;
+
+	int c = fgetc(f);
+	if (EOF == c)
+		return 0;
+	if ('>' != c)
+		Quit("Invalid file format, expected '>' to start FASTA label");
+
+	for (;;)
+		{
+		int c = fgetc(f);
+		if (EOF == c)
+			Quit("End-of-file or input error in FASTA label");
+		if (NL == c || CR == c)	// NL or CR terminates label
+			break;
+		ADD(c)	// All other characters added to label
+		}
+	ADD(0)	// Nul-terminate label
+	*ptrLabel = Buffer;
+
+	BufferLength = 0;	// start a fresh buffer for the sequence; the label is now held by *ptrLabel
+	Pos = 0;
+	Buffer = 0;
+	int PreviousChar = NL;
+	for (;;)
+		{
+		int c = fgetc(f);
+		if (EOF == c)
+			{
+			if (feof(f))
+				break;
+			else if (ferror(f))
+				Quit("Error reading FASTA file, ferror=TRUE feof=FALSE errno=%d %s",
+				  errno, strerror(errno));
+			else
+				Quit("Error reading FASTA file, fgetc=EOF feof=FALSE ferror=FALSE errno=%d %s",
+				  errno, strerror(errno));
+			}
+
+		if ('>' == c)
+			{
+			if (NL == PreviousChar || CR == PreviousChar)	// '>' at start of a line: next record begins
+				{
+				ungetc(c, f);
+				break;
+				}
+			else
+				Quit("Unexpected '>' in FASTA sequence data");
+			}
+		else if (isspace(c))
+			;
+		else if (IsGapChar(c))
+			{
+			if (!DeleteGaps)
+				ADD(c)
+			}
+		else if (isalpha(c))
+			{
+			c = toupper(c);
+			ADD(c)
+			}
+		else if (isprint(c))
+			{
+			Warning("Invalid character '%c' in FASTA sequence data, ignored", c);
+			continue;
+			}
+		else
+			{
+			Warning("Invalid byte hex %02x in FASTA sequence data, ignored", (unsigned char) c);
+			continue;
+			}
+		PreviousChar = c;
+		}
+
+	if (0 == Pos)
+		{
+		delete[] *ptrLabel;	// empty record: release its label, the recursive call stores a new one
+		*ptrLabel = 0;
+		return GetFastaSeq(f, ptrSeqLength, ptrLabel, DeleteGaps);
+		}
+	*ptrSeqLength = Pos;
+	return Buffer;
+	}
diff --git a/src/muscle/muscle3.8.31/src/fastclust.cpp b/src/muscle/muscle3.8.31/src/fastclust.cpp
new file mode 100644
index 0000000..b77d4fc
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/fastclust.cpp
@@ -0,0 +1,77 @@
+#include "muscle.h"
+#include "seqvect.h"
+#include "distfunc.h"
+#include "clust.h"
+#include "clustsetdf.h"
+#include
"tree.h" +#include "clust.h" +#include "distcalc.h" +#include + +static void TreeFromSeqVect_NJ(const DistFunc &DF, CLUSTER Cluster, Tree &tree) + { + ClustSetDF CSD(DF); + + Clust C; + C.Create(CSD, Cluster); + + tree.FromClust(C); + } + +static void TreeFromSeqVect_UPGMA(const DistFunc &DF, CLUSTER Cluster, Tree &tree) + { + LINKAGE Linkage = LINKAGE_Undefined; + switch (Cluster) + { + case CLUSTER_UPGMA: + Linkage = LINKAGE_Avg; + break; + case CLUSTER_UPGMAMin: + Linkage = LINKAGE_Min; + break; + case CLUSTER_UPGMAMax: + Linkage = LINKAGE_Max; + break; + case CLUSTER_UPGMB: + Linkage = LINKAGE_Biased; + break; + default: + Quit("TreeFromSeqVect_UPGMA, CLUSTER_%u not supported", Cluster); + } + + DistCalcDF DC; + DC.Init(DF); + UPGMA2(DC, tree, Linkage); + } + +static void SaveDF(const SeqVect &v, DistFunc &d, const char *FileName) + { + FILE *f = fopen(FileName, "w"); + if (f == 0) + Quit("Cannot create %s", FileName); + + unsigned n = v.GetSeqCount(); + fprintf(f, "%u\n", n); + for (unsigned i = 0; i < n; ++i) + { + fprintf(f, "%10.10s ", v.GetSeqName(i)); + for (unsigned j = 0; j < i; ++j) + fprintf(f, " %9g", d.GetDist(i, j)); + fprintf(f, "\n"); + } + fclose(f); + } + +void TreeFromSeqVect(const SeqVect &v, Tree &tree, CLUSTER Cluster, + DISTANCE Distance, ROOT Root, const char *SaveFileName) + { + DistFunc DF; + DistUnaligned(v, Distance, DF); + if (SaveFileName != 0) + SaveDF(v, DF, SaveFileName); + if (CLUSTER_NeighborJoining == Cluster) + TreeFromSeqVect_NJ(DF, Cluster, tree); + else + TreeFromSeqVect_UPGMA(DF, Cluster, tree); + FixRoot(tree, Root); + } diff --git a/src/muscle/muscle3.8.31/src/fastdist.cpp b/src/muscle/muscle3.8.31/src/fastdist.cpp new file mode 100644 index 0000000..a9271c6 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/fastdist.cpp @@ -0,0 +1,56 @@ +#include "muscle.h" +#include "distfunc.h" +#include "seqvect.h" + +void DistPWScoreDist(const SeqVect &v, DistFunc &DF); + +void DistUnaligned(const SeqVect &v, DISTANCE DistMethod, 
DistFunc &DF) + { + const unsigned uSeqCount = v.Length(); + + switch (DistMethod) + { + case DISTANCE_Kmer6_6: + DistKmer6_6(v, DF); + break; + + case DISTANCE_Kmer20_3: + DistKmer20_3(v, DF); + break; + + case DISTANCE_Kmer20_4: + FastDistKmer(v, DF); + break; + + case DISTANCE_Kbit20_3: + DistKbit20_3(v, DF); + break; + + case DISTANCE_Kmer4_6: + DistKmer4_6(v, DF); + break; + + case DISTANCE_PWKimura: + DistPWKimura(v, DF); + break; + + case DISTANCE_PWScoreDist: + DistPWScoreDist(v, DF); + break; + + default: + Quit("DistUnaligned, unsupported distance method %d", DistMethod); + } + +// const char **SeqNames = (const char **) malloc(uSeqCount*sizeof(char *)); + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + const Seq &s = *(v[uSeqIndex]); + + const char *ptrName = s.GetName(); + unsigned uId = s.GetId(); + + DF.SetName(uSeqIndex, ptrName); + DF.SetId(uSeqIndex, uId); + } + } diff --git a/src/muscle/muscle3.8.31/src/fastdistjones.cpp b/src/muscle/muscle3.8.31/src/fastdistjones.cpp new file mode 100644 index 0000000..bacc485 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/fastdistjones.cpp @@ -0,0 +1,206 @@ +#include "muscle.h" +#include "distfunc.h" +#include "seqvect.h" +#include + +const unsigned TRIPLE_COUNT = 20*20*20; + +struct TripleCount + { + unsigned m_uSeqCount; // How many sequences have this triple? + unsigned short *m_Counts; // m_Counts[s] = nr of times triple found in seq s + }; +static TripleCount *TripleCounts; + +// WARNING: Sequences MUST be stripped of gaps and upper case! 
+// Fills DF with a triplet-based score for every pair of sequences in v: the sum, over
+// all 3-letter words, of min(occurrences in A, occurrences in B), divided by (shorter
+// length - 2). Pairs that share no word, or include a sequence shorter than 3 letters,
+// get 1.0; the diagonal is 0. Words containing a letter code >= 20 are not counted.
+void DistKmer20_3(const SeqVect &v, DistFunc &DF)
+	{
+	const unsigned uSeqCount = v.Length();
+
+	DF.SetCount(uSeqCount);
+	if (0 == uSeqCount)
+		return;
+	for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)	// DF first serves as the shared-word accumulator
+		{
+		DF.SetDist(uSeq1, uSeq1, 0);
+		for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
+			DF.SetDist(uSeq1, uSeq2, 0);
+		}
+
+	const unsigned uTripleArrayBytes = TRIPLE_COUNT*sizeof(TripleCount);
+	TripleCounts = (TripleCount *) malloc(uTripleArrayBytes);
+	if (0 == TripleCounts)
+		Quit("Not enough memory (TripleCounts)");
+	memset(TripleCounts, 0, uTripleArrayBytes);
+
+	for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
+		{
+		TripleCount &tc = *(TripleCounts + uWord);
+		const unsigned uBytes = uSeqCount*sizeof(short);
+		tc.m_Counts = (unsigned short *) malloc(uBytes);
+		if (0 == tc.m_Counts)
+			Quit("Not enough memory (TripleCounts)");
+		memset(tc.m_Counts, 0, uBytes);
+		}
+
+	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+		{
+		Seq &s = *(v[uSeqIndex]);
+		const unsigned uSeqLength = s.Length();
+		for (unsigned uPos = 0; uPos + 2 < uSeqLength; ++uPos)	// "uSeqLength - 2" would wrap for length < 2
+			{
+			const unsigned uLetter1 = CharToLetterEx(s[uPos]);
+			if (uLetter1 >= 20)
+				continue;
+			const unsigned uLetter2 = CharToLetterEx(s[uPos+1]);
+			if (uLetter2 >= 20)
+				continue;
+			const unsigned uLetter3 = CharToLetterEx(s[uPos+2]);
+			if (uLetter3 >= 20)
+				continue;
+
+			const unsigned uWord = uLetter1 + uLetter2*20 + uLetter3*20*20;
+			assert(uWord < TRIPLE_COUNT);
+			TripleCount &tc = *(TripleCounts + uWord);
+			if (0 == tc.m_Counts[uSeqIndex])	// first time this word is seen in this sequence
+				++(tc.m_uSeqCount);
+			++(tc.m_Counts[uSeqIndex]);
+			}
+		}
+
+#if TRACE
+	{
+	Log("TripleCounts\n");
+	unsigned uGrandTotal = 0;
+	for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
+		{
+		const TripleCount &tc = *(TripleCounts + uWord);
+		if (0 == tc.m_uSeqCount)
+			continue;
+
+		const unsigned uLetter3 = uWord/(20*20);
+		const unsigned uLetter2 = (uWord - uLetter3*20*20)/20;
+		const unsigned uLetter1 = uWord%20;
+		Log("Word %6u %c%c%c %6u",
+		  uWord,
+		  LetterToCharAmino(uLetter1),
+		  LetterToCharAmino(uLetter2),
+		  LetterToCharAmino(uLetter3),
+		  tc.m_uSeqCount);
+
+		unsigned uSeqCountWithThisWord = 0;
+		for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+			{
+			const unsigned uCount = tc.m_Counts[uSeqIndex];
+			if (uCount > 0)
+				{
+				++uSeqCountWithThisWord;
+				Log(" %u=%u", uSeqIndex, uCount);
+				uGrandTotal += uCount;
+				}
+			}
+		if (uSeqCountWithThisWord != tc.m_uSeqCount)
+			Log(" *** SQ ERROR *** %u %u", tc.m_uSeqCount, uSeqCountWithThisWord);
+		Log("\n");
+		}
+
+	unsigned uTotalBySeqLength = 0;
+	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+		{
+		Seq &s = *(v[uSeqIndex]);
+		const unsigned uSeqLength = s.Length();
+		uTotalBySeqLength += uSeqLength - 2;
+		}
+	if (uGrandTotal != uTotalBySeqLength)
+		Log("*** TOTALS DISAGREE *** %u %u\n", uGrandTotal, uTotalBySeqLength);
+	}
+#endif
+
+	const unsigned uSeqListBytes = uSeqCount*sizeof(unsigned);
+	unsigned *SeqList = (unsigned *) malloc(uSeqListBytes);	// indexes of the sequences containing the current word
+	if (0 == SeqList)
+		Quit("Not enough memory (SeqList)");
+
+	for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
+		{
+		const TripleCount &tc = *(TripleCounts + uWord);
+		if (0 == tc.m_uSeqCount)
+			continue;
+		unsigned uSeqCountFound = 0;
+		memset(SeqList, 0, uSeqListBytes);
+		for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+			{
+			if (tc.m_Counts[uSeqIndex] > 0)
+				{
+				SeqList[uSeqCountFound] = uSeqIndex;
+				++uSeqCountFound;
+				if (uSeqCountFound == tc.m_uSeqCount)
+					break;
+				}
+			}
+		assert(uSeqCountFound == tc.m_uSeqCount);
+
+		for (unsigned uSeq1 = 0; uSeq1 < uSeqCountFound; ++uSeq1)
+			{
+			const unsigned uSeqIndex1 = SeqList[uSeq1];
+			const unsigned uCount1 = tc.m_Counts[uSeqIndex1];
+			for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
+				{
+				const unsigned uSeqIndex2 = SeqList[uSeq2];
+				const unsigned uCount2 = tc.m_Counts[uSeqIndex2];
+				const unsigned uMinCount = uCount1 < uCount2 ? uCount1 : uCount2;
+				const double d = DF.GetDist(uSeqIndex1, uSeqIndex2);
+				DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (d + uMinCount));
+				}
+			}
+		}
+	free(SeqList);	// malloc'ed above, so free() rather than delete[]
+	for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
+		free(TripleCounts[uWord].m_Counts);
+	free(TripleCounts);
+
+	unsigned uDone = 0;
+	const unsigned uTotal = (uSeqCount*(uSeqCount - 1))/2;
+	for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
+		{
+		DF.SetDist(uSeq1, uSeq1, 0.0);
+		const Seq &s1 = *(v[uSeq1]);
+		const unsigned uLength1 = s1.Length();
+		for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
+			{
+			if (uDone%1000 == 0)
+				Progress(uDone, uTotal);
+			++uDone;	// count every pair, including those that take a "continue" below
+			const Seq &s2 = *(v[uSeq2]);
+			const unsigned uLength2 = s2.Length();
+			unsigned uMinLength = uLength1 < uLength2 ? uLength1 : uLength2;
+			if (uMinLength < 3)
+				{
+				DF.SetDist(uSeq1, uSeq2, 1.0);
+				continue;
+				}
+
+			const double dTripleCount = DF.GetDist(uSeq1, uSeq2);
+			if (dTripleCount == 0)
+				{
+				DF.SetDist(uSeq1, uSeq2, 1.0);
+				continue;
+				}
+			double dNormalizedTripletScore = dTripleCount/(uMinLength - 2);
+			// A pairwise-identity estimate, exp(0.3912*log(score)) capped at 1, used to be
+			// derived here (stored as 1 - estimate); the normalized score itself is stored now.
+			DF.SetDist(uSeq1, uSeq2, (float) dNormalizedTripletScore);
+
+#if TRACE
+			Log("%s - %s Triplet count = %g Lengths %u, %u Normalized score = %g\n",
+			  s1.GetName(), s2.GetName(), dTripleCount, uLength1, uLength2,
+			  dNormalizedTripletScore);
+#endif
+			}
+		}
+	ProgressStepsDone();
+	}
diff --git a/src/muscle/muscle3.8.31/src/fastdistkbit.cpp b/src/muscle/muscle3.8.31/src/fastdistkbit.cpp
new file mode 100644
index 0000000..60ac1f8
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/fastdistkbit.cpp
@@ -0,0 +1,109 @@
+#include "muscle.h"
+#include "distfunc.h"
+#include "seqvect.h"
+#include
+
+#define MIN(x, y) ((x) < (y) ?
(x) : (y)) + +static void SetKmerBitVector(const Seq &s, byte Bits[]) + { + const unsigned uLength = s.Length(); + const unsigned k = 3; // kmer length + unsigned i = 0; + unsigned c = 0; + unsigned h = 0; + for (unsigned j = 0; j < k - 1; ++j) + { + unsigned x = CharToLetterEx(s[i++]); + if (x <= AX_Y) + c = c*20 + x; + else + { + c = 0; + h = j + 1; + } + } + for ( ; i < uLength; ++i) + { + unsigned x = CharToLetterEx(s[i++]); + if (x <= AX_Y) + c = (c*20 + x)%8000; + else + { + c = 0; + h = i + k; + } + if (i >= h) + { + unsigned ByteOffset = c/8; + unsigned BitOffset = c%8; + Bits[ByteOffset] |= (1 << BitOffset); + } + } + } + +static unsigned CommonBitCount(const byte Bits1[], const byte Bits2[]) + { + const byte * const p1end = Bits1 + 1000; + const byte *p2 = Bits2; + + unsigned uCount = 0; + for (const byte *p1 = Bits1; p1 != p1end; ++p1) + { + // Here is a cute trick for efficiently counting the + // bits common between two bytes by combining them into + // a single word. + unsigned b = *p1 | (*p2 << 8); + while (b != 0) + { + if (b & 0x101) + ++uCount; + b >>= 1; + } + ++p2; + } + return uCount; + } + +void DistKbit20_3(const SeqVect &v, DistFunc &DF) + { + const unsigned uSeqCount = v.Length(); + DF.SetCount(uSeqCount); + +// There are 20^3 = 8,000 distinct kmers in the 20-letter alphabet. +// For each sequence, we create a bit vector of length 8,000, i.e. +// 1,000 bytes, having one bit per kmer. The bit is set to 1 if the +// kmer is present in the sequence. 
+ const unsigned uBytes = uSeqCount*1000; + byte *BitVector = new byte[uBytes]; + memset(BitVector, 0, uBytes); + + SetProgressDesc("K-bit distance matrix"); + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + SetKmerBitVector(*v[uSeqIndex], BitVector + uSeqIndex*1000); + + unsigned uDone = 0; + const unsigned uTotal = (uSeqCount*(uSeqCount - 1))/2; + for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) + { + const byte *Bits1 = BitVector + uSeqIndex1*1000; + const unsigned uLength1 = v[uSeqIndex1]->Length(); + for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqIndex1; ++uSeqIndex2) + { + const byte *Bits2 = BitVector + uSeqIndex2*1000; + const unsigned uLength2 = v[uSeqIndex2]->Length(); + const float fCount = (float) CommonBitCount(Bits1, Bits2); + + // Distance measure = K / min(L1, L2) + // K is number of distinct kmers that are found in both sequences + const float fDist = fCount / MIN(uLength1, uLength2); + DF.SetDist(uSeqIndex1, uSeqIndex2, fDist); + if (uDone%10000 == 0) + Progress(uDone, uTotal); + ++uDone; + } + } + ProgressStepsDone(); + + delete[] BitVector; + } diff --git a/src/muscle/muscle3.8.31/src/fastdistkmer.cpp b/src/muscle/muscle3.8.31/src/fastdistkmer.cpp new file mode 100644 index 0000000..569a75b --- /dev/null +++ b/src/muscle/muscle3.8.31/src/fastdistkmer.cpp @@ -0,0 +1,247 @@ +#include "muscle.h" +#include "msa.h" +#include "seqvect.h" +#include "seq.h" +#include "distfunc.h" +#include + +#define TRACE 0 + +/*** +Some candidate alphabets considered because they +have high correlations and small table sizes. +Correlation coefficent is between k-mer distance +and %id D measured from a CLUSTALW alignment. +Table size is N^k where N is size of alphabet. +A is standard (uncompressed) amino alphabet. 
+ + Correlation +Alpha N k Table Size all 25-50% +----- -- - ---------- ---- ------ +A 20 3 8,000 0.943 0.575 +A 20 4 160,000 0.962 0.685 << +LiA 14 4 38,416 0.966 0.645 +SEB 14 4 38,416 0.964 0.634 +LiA 13 4 28,561 0.965 0.640 +LiA 12 4 20,736 0.963 0.620 +LiA 10 5 100,000 0.964 0.652 + +We select A with k=4 because it has the best +correlations. The only drawback is a large table +size, but space is readily available and the only +additional time cost is in resetting the table to +zero, which can be done quickly with memset or by +keeping a list of the k-mers that were found (should +test to see which is faster, and may vary by compiler +and processor type). It also has the minor advantage +that we don't need to convert the alphabet. + +Fractional identity d is estimated as follows. + + F = fractional k-mer count + if F is 0: F = 0.01 + Y = log(0.02 + F) + d = -4.1 + 4.12*Y + +The constant 0.02 was chosen to make the relationship +between Y and D linear. The constants -4.1 and 4.12 +were chosen to fit a straight line to the scatterplot +of Y vs D. +***/ + +#define MIN(x, y) (((x) < (y)) ? 
(x) : (y)) + +const unsigned K = 4; +const unsigned N = 20; +const unsigned N_2 = 20*20; +const unsigned N_3 = 20*20*20; +const unsigned N_4 = 20*20*20*20; + +const unsigned TABLE_SIZE = N_4; + +// For debug output +const char *KmerToStr(unsigned Kmer) + { + static char s[5]; + + unsigned c3 = (Kmer/N_3)%N; + unsigned c2 = (Kmer/N_2)%N; + unsigned c1 = (Kmer/N)%N; + unsigned c0 = Kmer%N; + + s[0] = LetterToChar(c3); + s[1] = LetterToChar(c2); + s[2] = LetterToChar(c1); + s[3] = LetterToChar(c0); + return s; + } + +void CountKmers(const byte s[], unsigned uSeqLength, byte KmerCounts[]) + { +#if TRACE + Log("CountKmers\n"); +#endif + memset(KmerCounts, 0, TABLE_SIZE*sizeof(byte)); + + const byte *ptrKmerStart = s; + const byte *ptrKmerEnd = s + 4; + const byte *ptrSeqEnd = s + uSeqLength; + + unsigned c3 = s[0]*N_3; + unsigned c2 = s[1]*N_2; + unsigned c1 = s[2]*N; + unsigned c0 = s[3]; + + unsigned Kmer = c3 + c2 + c1 + c0; + + for (;;) + { + assert(Kmer < TABLE_SIZE); + +#if TRACE + Log("Kmer=%d=%s\n", Kmer, KmerToStr(Kmer)); +#endif + ++(KmerCounts[Kmer]); + + if (ptrKmerEnd == ptrSeqEnd) + break; + + // Compute k-mer as function of previous k-mer: + // 1. Subtract first letter from previous k-mer. + // 2. Multiply by N. + // 3. Add next letter. 
+ c3 = (*ptrKmerStart++) * N_3; + Kmer = (Kmer - c3)*N; + Kmer += *ptrKmerEnd++; + } + } + +unsigned CommonKmerCount(const byte Seq[], unsigned uSeqLength, + const byte KmerCounts1[], const byte Seq2[], unsigned uSeqLength2) + { + byte KmerCounts2[TABLE_SIZE]; + CountKmers(Seq2, uSeqLength2, KmerCounts2); + + const byte *ptrKmerStart = Seq; + const byte *ptrKmerEnd = Seq + 4; + const byte *ptrSeqEnd = Seq + uSeqLength; + + unsigned c3 = Seq[0]*N_3; + unsigned c2 = Seq[1]*N_2; + unsigned c1 = Seq[2]*N; + unsigned c0 = Seq[3]; + + unsigned Kmer = c3 + c2 + c1 + c0; + + unsigned uCommonCount = 0; + for (;;) + { + assert(Kmer < TABLE_SIZE); + + const byte Count1 = KmerCounts1[Kmer]; + const byte Count2 = KmerCounts2[Kmer]; + + uCommonCount += MIN(Count1, Count2); + + // Hack so we don't double-count + KmerCounts2[Kmer] = 0; + + if (ptrKmerEnd == ptrSeqEnd) + break; + + // Compute k-mer as function of previous k-mer: + // 1. Subtract first letter from previous k-mer. + // 2. Multiply by N. + // 3. Add next letter. + c3 = (*ptrKmerStart++) * N_3; + Kmer = (Kmer - c3)*N; + Kmer += *ptrKmerEnd++; + } + return uCommonCount; + } + +static void SeqToLetters(const Seq &s, byte Letters[]) + { + const unsigned uSeqLength = s.Length(); + for (unsigned uCol = 0; uCol < uSeqLength; ++uCol) + { + char c = s.GetChar(uCol); + // Ugly hack. My k-mer counting code isn't wild-card + // aware. Arbitrarily replace wildcards by a specific + // amino acid. 
+ if (IsWildcardChar(c)) + c = 'A'; + *Letters++ = CharToLetter(c); + } + } + +void FastDistKmer(const SeqVect &v, DistFunc &DF) + { + byte KmerCounts[TABLE_SIZE]; + + const unsigned uSeqCount = v.GetSeqCount(); + + DF.SetCount(uSeqCount); + if (0 == uSeqCount) + return; + +// Initialize distance matrix to zero + for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) + { + DF.SetDist(uSeq1, uSeq1, 0); + for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) + DF.SetDist(uSeq1, uSeq2, 0); + } + + unsigned uMaxLength = 0; + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + const Seq &s = v.GetSeq(uSeqIndex); + unsigned uSeqLength = s.Length(); + if (uSeqLength > uMaxLength) + uMaxLength = uSeqLength; + } + if (0 == uMaxLength) + return; + + byte *Seq1Letters = new byte[uMaxLength]; + byte *Seq2Letters = new byte[uMaxLength]; + + for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount - 1; ++uSeqIndex1) + { + const Seq &s1 = v.GetSeq(uSeqIndex1); + const unsigned uSeqLength1 = s1.Length(); + + SeqToLetters(s1, Seq1Letters); + CountKmers(Seq1Letters, uSeqLength1, KmerCounts); + + for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount; + ++uSeqIndex2) + { + const Seq &s2 = v.GetSeq(uSeqIndex2); + const unsigned uSeqLength2 = s2.Length(); + + SeqToLetters(s2, Seq2Letters); + + unsigned uCommonKmerCount = CommonKmerCount(Seq1Letters, uSeqLength1, + KmerCounts, Seq2Letters, uSeqLength2); + + unsigned uMinLength = MIN(uSeqLength1, uSeqLength2); + double F = (double) uCommonKmerCount / (uMinLength - K + 1); + if (0.0 == F) + F = 0.01; + double Y = log(0.02 + F); + double EstimatedPctId = Y/4.12 + 0.995; + double KD = KimuraDist(EstimatedPctId); +// DF.SetDist(uSeqIndex1, uSeqIndex2, (float) KD); + DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (1 - F)); +#if TRACE + Log("CommonCount=%u, MinLength=%u, F=%6.4f Y=%6.4f, %%id=%6.4f, KimuraDist=%8.4f\n", + uCommonKmerCount, uMinLength, F, Y, EstimatedPctId, KD); +#endif + } + } + + delete[] Seq1Letters; + 
delete[] Seq2Letters; + } diff --git a/src/muscle/muscle3.8.31/src/fastdistmafft.cpp b/src/muscle/muscle3.8.31/src/fastdistmafft.cpp new file mode 100644 index 0000000..642236e --- /dev/null +++ b/src/muscle/muscle3.8.31/src/fastdistmafft.cpp @@ -0,0 +1,290 @@ +#include "muscle.h" +#include "distfunc.h" +#include "seqvect.h" +#include + +#define TRACE 0 + +#define MIN(x, y) (((x) < (y)) ? (x) : (y)) +#define MAX(x, y) (((x) > (y)) ? (x) : (y)) + +const unsigned TUPLE_COUNT = 6*6*6*6*6*6; +static unsigned char Count1[TUPLE_COUNT]; +static unsigned char Count2[TUPLE_COUNT]; + +// Amino acid groups according to MAFFT (sextet5) +// 0 = A G P S T +// 1 = I L M V +// 2 = N D Q E B Z +// 3 = R H K +// 4 = F W Y +// 5 = C +// 6 = X . - U +unsigned ResidueGroup[] = + { + 0, // AX_A, + 5, // AX_C, + 2, // AX_D, + 2, // AX_E, + 4, // AX_F, + 0, // AX_G, + 3, // AX_H, + 1, // AX_I, + 3, // AX_K, + 1, // AX_L, + 1, // AX_M, + 2, // AX_N, + 0, // AX_P, + 2, // AX_Q, + 3, // AX_R, + 0, // AX_S, + 0, // AX_T, + 1, // AX_V, + 4, // AX_W, + 4, // AX_Y, + + 2, // AX_B, // D or N + 2, // AX_Z, // E or Q + 0, // AX_X, // Unknown // ******** TODO ************* + // This isn't the correct way of avoiding group 6 + 0 // AX_GAP, // ******** TODO ****************** + }; +unsigned uResidueGroupCount = sizeof(ResidueGroup)/sizeof(ResidueGroup[0]); + +static char *TupleToStr(int t) + { + static char s[7]; + int t1, t2, t3, t4, t5, t6; + + t1 = t%6; + t2 = (t/6)%6; + t3 = (t/(6*6))%6; + t4 = (t/(6*6*6))%6; + t5 = (t/(6*6*6*6))%6; + t6 = (t/(6*6*6*6*6))%6; + + s[5] = '0' + t1; + s[4] = '0' + t2; + s[3] = '0' + t3; + s[2] = '0' + t4; + s[1] = '0' + t5; + s[0] = '0' + t6; + return s; + } + +static unsigned GetTuple(const unsigned uLetters[], unsigned n) + { + assert(uLetters[n] < uResidueGroupCount); + assert(uLetters[n+1] < uResidueGroupCount); + assert(uLetters[n+2] < uResidueGroupCount); + assert(uLetters[n+3] < uResidueGroupCount); + assert(uLetters[n+4] < uResidueGroupCount); + 
assert(uLetters[n+5] < uResidueGroupCount); + + unsigned u1 = ResidueGroup[uLetters[n]]; + unsigned u2 = ResidueGroup[uLetters[n+1]]; + unsigned u3 = ResidueGroup[uLetters[n+2]]; + unsigned u4 = ResidueGroup[uLetters[n+3]]; + unsigned u5 = ResidueGroup[uLetters[n+4]]; + unsigned u6 = ResidueGroup[uLetters[n+5]]; + + return u6 + u5*6 + u4*6*6 + u3*6*6*6 + u2*6*6*6*6 + u1*6*6*6*6*6; + } + +static void CountTuples(const unsigned L[], unsigned uTupleCount, unsigned char Count[]) + { + memset(Count, 0, TUPLE_COUNT*sizeof(unsigned char)); + for (unsigned n = 0; n < uTupleCount; ++n) + { + const unsigned uTuple = GetTuple(L, n); + ++(Count[uTuple]); + } + } + +static void ListCount(const unsigned char Count[]) + { + for (unsigned n = 0; n < TUPLE_COUNT; ++n) + { + if (0 == Count[n]) + continue; + Log("%s %u\n", TupleToStr(n), Count[n]); + } + } + +void DistKmer6_6(const SeqVect &v, DistFunc &DF) + { + const unsigned uSeqCount = v.Length(); + + DF.SetCount(uSeqCount); + if (0 == uSeqCount) + return; + +// Initialize distance matrix to zero + for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) + { + DF.SetDist(uSeq1, uSeq1, 0); + for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) + DF.SetDist(uSeq1, uSeq2, 0); + } + +// Convert to letters + unsigned **Letters = new unsigned *[uSeqCount]; + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + Seq &s = *(v[uSeqIndex]); + const unsigned uSeqLength = s.Length(); + unsigned *L = new unsigned[uSeqLength]; + Letters[uSeqIndex] = L; + for (unsigned n = 0; n < uSeqLength; ++n) + { + char c = s[n]; + L[n] = CharToLetterEx(c); + assert(L[n] < uResidueGroupCount); + } + } + + unsigned **uCommonTupleCount = new unsigned *[uSeqCount]; + for (unsigned n = 0; n < uSeqCount; ++n) + { + uCommonTupleCount[n] = new unsigned[uSeqCount]; + memset(uCommonTupleCount[n], 0, uSeqCount*sizeof(unsigned)); + } + + const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2; + unsigned uCount = 0; + for (unsigned uSeq1 = 0; uSeq1 < 
uSeqCount; ++uSeq1) + { + Seq &seq1 = *(v[uSeq1]); + const unsigned uSeqLength1 = seq1.Length(); + if (uSeqLength1 < 5) + continue; + + const unsigned uTupleCount = uSeqLength1 - 5; + const unsigned *L = Letters[uSeq1]; + CountTuples(L, uTupleCount, Count1); +#if TRACE + { + Log("Seq1=%d\n", uSeq1); + Log("Groups:\n"); + for (unsigned n = 0; n < uSeqLength1; ++n) + Log("%u", ResidueGroup[L[n]]); + Log("\n"); + + Log("Tuples:\n"); + ListCount(Count1); + } +#endif + + SetProgressDesc("K-mer dist pass 1"); + for (unsigned uSeq2 = 0; uSeq2 <= uSeq1; ++uSeq2) + { + if (0 == uCount%500) + Progress(uCount, uPairCount); + ++uCount; + Seq &seq2 = *(v[uSeq2]); + const unsigned uSeqLength2 = seq2.Length(); + if (uSeqLength2 < 5) + { + if (uSeq1 == uSeq2) + DF.SetDist(uSeq1, uSeq2, 0); + else + DF.SetDist(uSeq1, uSeq2, 1); + continue; + } + + // First pass through seq 2 to count tuples + const unsigned uTupleCount = uSeqLength2 - 5; + const unsigned *L = Letters[uSeq2]; + CountTuples(L, uTupleCount, Count2); +#if TRACE + Log("Seq2=%d Counts=\n", uSeq2); + ListCount(Count2); +#endif + + // Second pass to accumulate sum of shared tuples + // MAFFT defines this as the sum over unique tuples + // in seq2 of the minimum of the number of tuples found + // in the two sequences. + unsigned uSum = 0; + for (unsigned n = 0; n < uTupleCount; ++n) + { + const unsigned uTuple = GetTuple(L, n); + uSum += MIN(Count1[uTuple], Count2[uTuple]); + + // This is a hack to make sure each unique tuple counted only once. 
+ Count2[uTuple] = 0; + } +#if TRACE + { + Seq &s1 = *(v[uSeq1]); + Seq &s2 = *(v[uSeq2]); + const char *pName1 = s1.GetName(); + const char *pName2 = s2.GetName(); + Log("Common count %s(%d) - %s(%d) =%u\n", + pName1, uSeq1, pName2, uSeq2, uSum); + } +#endif + uCommonTupleCount[uSeq1][uSeq2] = uSum; + uCommonTupleCount[uSeq2][uSeq1] = uSum; + } + } + ProgressStepsDone(); + + uCount = 0; + SetProgressDesc("K-mer dist pass 2"); + for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) + { + Seq &s1 = *(v[uSeq1]); + const char *pName1 = s1.GetName(); + + double dCommonTupleCount11 = uCommonTupleCount[uSeq1][uSeq1]; + if (0 == dCommonTupleCount11) + dCommonTupleCount11 = 1; + + DF.SetDist(uSeq1, uSeq1, 0); + for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) + { + if (0 == uCount%500) + Progress(uCount, uPairCount); + ++uCount; + + double dCommonTupleCount22 = uCommonTupleCount[uSeq2][uSeq2]; + if (0 == dCommonTupleCount22) + dCommonTupleCount22 = 1; + + const double dDist1 = 3.0*(dCommonTupleCount11 - uCommonTupleCount[uSeq1][uSeq2]) + /dCommonTupleCount11; + const double dDist2 = 3.0*(dCommonTupleCount22 - uCommonTupleCount[uSeq1][uSeq2]) + /dCommonTupleCount22; + + // dMinDist is the value used for tree-building in MAFFT + const double dMinDist = MIN(dDist1, dDist2); + DF.SetDist(uSeq1, uSeq2, (float) dMinDist); + + //const double dEstimatedPctId = TupleDistToEstimatedPctId(dMinDist); + //g_dfPwId.SetDist(uSeq1, uSeq2, dEstimatedPctId); + // **** TODO **** why does this make score slightly worse?? 
+ //const double dKimuraDist = KimuraDist(dEstimatedPctId); + //DF.SetDist(uSeq1, uSeq2, dKimuraDist); + } + } + ProgressStepsDone(); + + for (unsigned n = 0; n < uSeqCount; ++n) + delete[] uCommonTupleCount[n]; + delete[] uCommonTupleCount; + delete[] Letters; + } + +double PctIdToMAFFTDist(double dPctId) + { + if (dPctId < 0.05) + dPctId = 0.05; + double dDist = -log(dPctId); + return dDist; + } + +double PctIdToHeightMAFFT(double dPctId) + { + return PctIdToMAFFTDist(dPctId); + } diff --git a/src/muscle/muscle3.8.31/src/fastdistnuc.cpp b/src/muscle/muscle3.8.31/src/fastdistnuc.cpp new file mode 100644 index 0000000..a7a7e7f --- /dev/null +++ b/src/muscle/muscle3.8.31/src/fastdistnuc.cpp @@ -0,0 +1,265 @@ +#include "muscle.h" +#include "distfunc.h" +#include "seqvect.h" +#include + +#define TRACE 0 + +#define MIN(x, y) (((x) < (y)) ? (x) : (y)) +#define MAX(x, y) (((x) > (y)) ? (x) : (y)) + +const unsigned TUPLE_COUNT = 6*6*6*6*6*6; +static unsigned char Count1[TUPLE_COUNT]; +static unsigned char Count2[TUPLE_COUNT]; + +// Nucleotide groups according to MAFFT (sextet5) +// 0 = A +// 1 = C +// 2 = G +// 3 = T +// 4 = other + +static unsigned ResidueGroup[] = + { + 0, // NX_A, + 1, // NX_C, + 2, // NX_G, + 3, // NX_T/U + 4, // NX_N, + 4, // NX_R, + 4, // NX_Y, + 4, // NX_GAP + }; +static unsigned uResidueGroupCount = sizeof(ResidueGroup)/sizeof(ResidueGroup[0]); + +static char *TupleToStr(int t) + { + static char s[7]; + int t1, t2, t3, t4, t5, t6; + + t1 = t%6; + t2 = (t/6)%6; + t3 = (t/(6*6))%6; + t4 = (t/(6*6*6))%6; + t5 = (t/(6*6*6*6))%6; + t6 = (t/(6*6*6*6*6))%6; + + s[5] = '0' + t1; + s[4] = '0' + t2; + s[3] = '0' + t3; + s[2] = '0' + t4; + s[1] = '0' + t5; + s[0] = '0' + t6; + return s; + } + +static unsigned GetTuple(const unsigned uLetters[], unsigned n) + { + assert(uLetters[n] < uResidueGroupCount); + assert(uLetters[n+1] < uResidueGroupCount); + assert(uLetters[n+2] < uResidueGroupCount); + assert(uLetters[n+3] < uResidueGroupCount); + 
assert(uLetters[n+4] < uResidueGroupCount); + assert(uLetters[n+5] < uResidueGroupCount); + + unsigned u1 = ResidueGroup[uLetters[n]]; + unsigned u2 = ResidueGroup[uLetters[n+1]]; + unsigned u3 = ResidueGroup[uLetters[n+2]]; + unsigned u4 = ResidueGroup[uLetters[n+3]]; + unsigned u5 = ResidueGroup[uLetters[n+4]]; + unsigned u6 = ResidueGroup[uLetters[n+5]]; + + return u6 + u5*6 + u4*6*6 + u3*6*6*6 + u2*6*6*6*6 + u1*6*6*6*6*6; + } + +static void CountTuples(const unsigned L[], unsigned uTupleCount, unsigned char Count[]) + { + memset(Count, 0, TUPLE_COUNT*sizeof(unsigned char)); + for (unsigned n = 0; n < uTupleCount; ++n) + { + const unsigned uTuple = GetTuple(L, n); + ++(Count[uTuple]); + } + } + +static void ListCount(const unsigned char Count[]) + { + for (unsigned n = 0; n < TUPLE_COUNT; ++n) + { + if (0 == Count[n]) + continue; + Log("%s %u\n", TupleToStr(n), Count[n]); + } + } + +void DistKmer4_6(const SeqVect &v, DistFunc &DF) + { + if (ALPHA_DNA != g_Alpha && ALPHA_RNA != g_Alpha) + Quit("DistKmer4_6 requires nucleo alphabet"); + + const unsigned uSeqCount = v.Length(); + + DF.SetCount(uSeqCount); + if (0 == uSeqCount) + return; + +// Initialize distance matrix to zero + for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) + { + DF.SetDist(uSeq1, uSeq1, 0); + for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) + DF.SetDist(uSeq1, uSeq2, 0); + } + +// Convert to letters + unsigned **Letters = new unsigned *[uSeqCount]; + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + Seq &s = *(v[uSeqIndex]); + const unsigned uSeqLength = s.Length(); + unsigned *L = new unsigned[uSeqLength]; + Letters[uSeqIndex] = L; + for (unsigned n = 0; n < uSeqLength; ++n) + { + char c = s[n]; + L[n] = CharToLetterEx(c); + if (L[n] >= 4) + L[n] = 4; + } + } + + unsigned **uCommonTupleCount = new unsigned *[uSeqCount]; + for (unsigned n = 0; n < uSeqCount; ++n) + { + uCommonTupleCount[n] = new unsigned[uSeqCount]; + memset(uCommonTupleCount[n], 0, 
uSeqCount*sizeof(unsigned)); + } + + const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2; + unsigned uCount = 0; + for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) + { + Seq &seq1 = *(v[uSeq1]); + const unsigned uSeqLength1 = seq1.Length(); + if (uSeqLength1 < 5) + continue; + + const unsigned uTupleCount = uSeqLength1 - 5; + const unsigned *L = Letters[uSeq1]; + CountTuples(L, uTupleCount, Count1); +#if TRACE + { + Log("Seq1=%d\n", uSeq1); + Log("Groups:\n"); + for (unsigned n = 0; n < uSeqLength1; ++n) + Log("%u", ResidueGroup[L[n]]); + Log("\n"); + + Log("Tuples:\n"); + ListCount(Count1); + } +#endif + + SetProgressDesc("K-mer dist pass 1"); + for (unsigned uSeq2 = 0; uSeq2 <= uSeq1; ++uSeq2) + { + if (0 == uCount%500) + Progress(uCount, uPairCount); + ++uCount; + Seq &seq2 = *(v[uSeq2]); + const unsigned uSeqLength2 = seq2.Length(); + if (uSeqLength2 < 5) + { + if (uSeq1 == uSeq2) + DF.SetDist(uSeq1, uSeq2, 0); + else + DF.SetDist(uSeq1, uSeq2, 1); + continue; + } + + // First pass through seq 2 to count tuples + const unsigned uTupleCount = uSeqLength2 - 5; + const unsigned *L = Letters[uSeq2]; + CountTuples(L, uTupleCount, Count2); +#if TRACE + Log("Seq2=%d Counts=\n", uSeq2); + ListCount(Count2); +#endif + + // Second pass to accumulate sum of shared tuples + // MAFFT defines this as the sum over unique tuples + // in seq2 of the minimum of the number of tuples found + // in the two sequences. + unsigned uSum = 0; + for (unsigned n = 0; n < uTupleCount; ++n) + { + const unsigned uTuple = GetTuple(L, n); + uSum += MIN(Count1[uTuple], Count2[uTuple]); + + // This is a hack to make sure each unique tuple counted only once. 
+ Count2[uTuple] = 0; + } +#if TRACE + { + Seq &s1 = *(v[uSeq1]); + Seq &s2 = *(v[uSeq2]); + const char *pName1 = s1.GetName(); + const char *pName2 = s2.GetName(); + Log("Common count %s(%d) - %s(%d) =%u\n", + pName1, uSeq1, pName2, uSeq2, uSum); + } +#endif + uCommonTupleCount[uSeq1][uSeq2] = uSum; + uCommonTupleCount[uSeq2][uSeq1] = uSum; + } + } + ProgressStepsDone(); + + uCount = 0; + SetProgressDesc("K-mer dist pass 2"); + for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) + { + Seq &s1 = *(v[uSeq1]); + const char *pName1 = s1.GetName(); + + double dCommonTupleCount11 = uCommonTupleCount[uSeq1][uSeq1]; + if (0 == dCommonTupleCount11) + dCommonTupleCount11 = 1; + + DF.SetDist(uSeq1, uSeq1, 0); + for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) + { + if (0 == uCount%500) + Progress(uCount, uPairCount); + ++uCount; + + double dCommonTupleCount22 = uCommonTupleCount[uSeq2][uSeq2]; + if (0 == dCommonTupleCount22) + dCommonTupleCount22 = 1; + + const double dDist1 = 3.0*(dCommonTupleCount11 - uCommonTupleCount[uSeq1][uSeq2]) + /dCommonTupleCount11; + const double dDist2 = 3.0*(dCommonTupleCount22 - uCommonTupleCount[uSeq1][uSeq2]) + /dCommonTupleCount22; + + // dMinDist is the value used for tree-building in MAFFT + const double dMinDist = MIN(dDist1, dDist2); + DF.SetDist(uSeq1, uSeq2, (float) dMinDist); + + //const double dEstimatedPctId = TupleDistToEstimatedPctId(dMinDist); + //g_dfPwId.SetDist(uSeq1, uSeq2, dEstimatedPctId); + // **** TODO **** why does this make score slightly worse?? 
+ //const double dKimuraDist = KimuraDist(dEstimatedPctId); + //DF.SetDist(uSeq1, uSeq2, dKimuraDist); + } + } + ProgressStepsDone(); + + for (unsigned n = 0; n < uSeqCount; ++n) + { + delete[] uCommonTupleCount[n]; + delete[] Letters[n]; + } + delete[] uCommonTupleCount; + delete[] Letters; + } diff --git a/src/muscle/muscle3.8.31/src/fastscorepath2.cpp b/src/muscle/muscle3.8.31/src/fastscorepath2.cpp new file mode 100644 index 0000000..37a5b3b --- /dev/null +++ b/src/muscle/muscle3.8.31/src/fastscorepath2.cpp @@ -0,0 +1,165 @@ +#include "muscle.h" +#include "profile.h" +#include "pwpath.h" + +SCORE FastScorePath2(const ProfPos *PA, unsigned uLengthA, + const ProfPos *PB, unsigned uLengthB, const PWPath &Path) + { + const unsigned uEdgeCount = Path.GetEdgeCount(); + Log("Edge SS PLA PLB Match Gap Total\n"); + Log("---- -- --- --- ----- --- -----\n"); + char cType = 'S'; + SCORE scoreTotal = 0; + for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) + { + const PWEdge &Edge = Path.GetEdge(uEdgeIndex); + const char cPrevType = cType; + cType = Edge.cType; + const unsigned uPrefixLengthA = Edge.uPrefixLengthA; + const unsigned uPrefixLengthB = Edge.uPrefixLengthB; + bool bGap = false; + bool bMatch = false; + SCORE scoreGap = 0; + SCORE scoreMatch = 0; + + switch (cType) + { + case 'M': + { + if (0 == uPrefixLengthA || 0 == uPrefixLengthB) + Quit("FastScorePath2, M zero length"); + + const ProfPos &PPA = PA[uPrefixLengthA - 1]; + const ProfPos &PPB = PB[uPrefixLengthB - 1]; + + bMatch = true; + scoreMatch = ScoreProfPos2(PPA, PPB); + + if ('D' == cPrevType) + { + bGap = true; + assert(uPrefixLengthA > 1); + scoreGap = PA[uPrefixLengthA-2].m_scoreGapClose; + } + else if ('I' == cPrevType) + { + bGap = true; + assert(uPrefixLengthB > 1); + scoreGap = PB[uPrefixLengthB-2].m_scoreGapClose; + } + break; + } + + case 'D': + { + if (0 == uPrefixLengthA) + Quit("FastScorePath2, D zero length"); + + const ProfPos &PPA = PA[uPrefixLengthA - 1]; + bGap = true; + 
switch (cPrevType) + { + case 'S': + scoreGap = PPA.m_scoreGapOpen; + break; + case 'M': + scoreGap = PPA.m_scoreGapOpen; + break; + case 'D': +// scoreGap = g_scoreGapExtend; + scoreGap = 0; + break; + case 'I': + Quit("FastScorePath2 DI"); + } + break; + } + + case 'I': + { + if (0 == uPrefixLengthB) + Quit("FastScorePath2, I zero length"); + + const ProfPos &PPB = PB[uPrefixLengthB - 1]; + bGap = true; + switch (cPrevType) + { + case 'S': + scoreGap = PPB.m_scoreGapOpen; + break; + case 'M': + scoreGap = PPB.m_scoreGapOpen; + break; + case 'I': + scoreGap = 0; +// scoreGap = g_scoreGapExtend; + break; + case 'D': + Quit("FastScorePath2 DI"); + } + break; + } + + case 'U': + { + Quit("FastScorePath2 U"); + } + + default: + Quit("FastScorePath2: invalid type %c", cType); + } + + Log("%4u %c%c %4u %4u ", uEdgeIndex, cPrevType, cType, + uPrefixLengthA, uPrefixLengthB); + if (bMatch) + Log("%7.1f ", scoreMatch); + else + Log(" "); + if (bGap) + Log("%7.1f ", scoreGap); + else + Log(" "); + SCORE scoreEdge = scoreMatch + scoreGap; + scoreTotal += scoreEdge; + Log("%7.1f %7.1f", scoreEdge, scoreTotal); + Log("\n"); + } + + SCORE scoreGap = 0; +// if (!g_bTermGapsHalf) + switch (cType) + { + case 'M': + scoreGap = 0; + break; + + case 'D': + { + const ProfPos &LastPPA = PA[uLengthA - 1]; + scoreGap = LastPPA.m_scoreGapClose; + break; + } + + case 'I': + { + const ProfPos &LastPPB = PB[uLengthB - 1]; + scoreGap = LastPPB.m_scoreGapClose; + break; + } + + case 'U': + Quit("Unaligned regions not supported"); + + case 'S': + break; + + default: + Quit("Invalid type %c", cType); + } + + Log(" %cE %4u %4u %7.1f\n", cType, uLengthA, uLengthB, scoreGap); + scoreTotal += scoreGap; + + Log("Total = %g\n", scoreTotal); + return scoreTotal; + } diff --git a/src/muscle/muscle3.8.31/src/finddiags.cpp b/src/muscle/muscle3.8.31/src/finddiags.cpp new file mode 100644 index 0000000..0ddacb3 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/finddiags.cpp @@ -0,0 +1,161 @@ +#include 
"muscle.h" +#include "profile.h" +#include "diaglist.h" + +#define TRACE 0 + +const unsigned KTUP = 5; +const unsigned KTUPS = 6*6*6*6*6; +static unsigned TuplePos[KTUPS]; + +static char *TupleToStr(int t) + { + static char s[7]; + int t1, t2, t3, t4, t5; + + t1 = t%6; + t2 = (t/6)%6; + t3 = (t/(6*6))%6; + t4 = (t/(6*6*6))%6; + t5 = (t/(6*6*6*6))%6; + + s[4] = '0' + t1; + s[3] = '0' + t2; + s[2] = '0' + t3; + s[1] = '0' + t4; + s[0] = '0' + t5; + return s; + } + +static unsigned GetTuple(const ProfPos *PP, unsigned uPos) + { + const unsigned t0 = PP[uPos].m_uResidueGroup; + if (RESIDUE_GROUP_MULTIPLE == t0) + return EMPTY; + + const unsigned t1 = PP[uPos+1].m_uResidueGroup; + if (RESIDUE_GROUP_MULTIPLE == t1) + return EMPTY; + + const unsigned t2 = PP[uPos+2].m_uResidueGroup; + if (RESIDUE_GROUP_MULTIPLE == t2) + return EMPTY; + + const unsigned t3 = PP[uPos+3].m_uResidueGroup; + if (RESIDUE_GROUP_MULTIPLE == t3) + return EMPTY; + + const unsigned t4 = PP[uPos+4].m_uResidueGroup; + if (RESIDUE_GROUP_MULTIPLE == t4) + return EMPTY; + + return t0 + t1*6 + t2*6*6 + t3*6*6*6 + t4*6*6*6*6; + } + +void FindDiags(const ProfPos *PX, unsigned uLengthX, const ProfPos *PY, + unsigned uLengthY, DiagList &DL) + { + if (ALPHA_Amino != g_Alpha) + Quit("FindDiags: requires amino acid alphabet"); + + DL.Clear(); + + if (uLengthX < 12 || uLengthY < 12) + return; + +// Set A to shorter profile, B to longer + const ProfPos *PA; + const ProfPos *PB; + unsigned uLengthA; + unsigned uLengthB; + bool bSwap; + if (uLengthX < uLengthY) + { + bSwap = false; + PA = PX; + PB = PY; + uLengthA = uLengthX; + uLengthB = uLengthY; + } + else + { + bSwap = true; + PA = PY; + PB = PX; + uLengthA = uLengthY; + uLengthB = uLengthX; + } + +// Build tuple map for the longer profile, B + if (uLengthB < KTUP) + Quit("FindDiags: profile too short"); + + memset(TuplePos, EMPTY, sizeof(TuplePos)); + + for (unsigned uPos = 0; uPos < uLengthB - KTUP; ++uPos) + { + const unsigned uTuple = GetTuple(PB, uPos); + 
if (EMPTY == uTuple) + continue; + TuplePos[uTuple] = uPos; + } + +// Find matches + for (unsigned uPosA = 0; uPosA < uLengthA - KTUP; ++uPosA) + { + const unsigned uTuple = GetTuple(PA, uPosA); + if (EMPTY == uTuple) + continue; + const unsigned uPosB = TuplePos[uTuple]; + if (EMPTY == uPosB) + continue; + + // This tuple is found in both profiles + unsigned uStartPosA = uPosA; + unsigned uStartPosB = uPosB; + + // Try to extend the match forwards + unsigned uEndPosA = uPosA + KTUP - 1; + unsigned uEndPosB = uPosB + KTUP - 1; + for (;;) + { + if (uLengthA - 1 == uEndPosA || uLengthB - 1 == uEndPosB) + break; + const unsigned uAAGroupA = PA[uEndPosA+1].m_uResidueGroup; + if (RESIDUE_GROUP_MULTIPLE == uAAGroupA) + break; + const unsigned uAAGroupB = PB[uEndPosB+1].m_uResidueGroup; + if (RESIDUE_GROUP_MULTIPLE == uAAGroupB) + break; + if (uAAGroupA != uAAGroupB) + break; + ++uEndPosA; + ++uEndPosB; + } + uPosA = uEndPosA; + +#if TRACE + { + Log("Match: A %4u-%4u ", uStartPosA, uEndPosA); + for (unsigned n = uStartPosA; n <= uEndPosA; ++n) + Log("%c", 'A' + PA[n].m_uResidueGroup); + Log("\n"); + Log(" B %4u-%4u ", uStartPosB, uEndPosB); + for (unsigned n = uStartPosB; n <= uEndPosB; ++n) + Log("%c", 'A' + PB[n].m_uResidueGroup); + Log("\n"); + } +#endif + + const unsigned uLength = uEndPosA - uStartPosA + 1; + assert(uEndPosB - uStartPosB + 1 == uLength); + + if (uLength >= g_uMinDiagLength) + { + if (bSwap) + DL.Add(uStartPosB, uStartPosA, uLength); + else + DL.Add(uStartPosA, uStartPosB, uLength); + } + } + } diff --git a/src/muscle/muscle3.8.31/src/finddiagsn.cpp b/src/muscle/muscle3.8.31/src/finddiagsn.cpp new file mode 100644 index 0000000..58a2fc3 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/finddiagsn.cpp @@ -0,0 +1,152 @@ +#include "muscle.h" +#include "profile.h" +#include "diaglist.h" + +#define TRACE 0 + +#define pow4(i) (1 << (2*i)) // 4^i = 2^(2*i) +const unsigned K = 7; +const unsigned KTUPS = pow4(K); +static unsigned TuplePos[KTUPS]; + +static char 
*TupleToStr(int t) + { + static char s[K]; + + for (int i = 0; i < K; ++i) + { + unsigned Letter = (t/(pow4(i)))%4; + assert(Letter >= 0 && Letter < 4); + s[K-i-1] = LetterToChar(Letter); + } + + return s; + } + +static unsigned GetTuple(const ProfPos *PP, unsigned uPos) + { + unsigned t = 0; + + for (unsigned i = 0; i < K; ++i) + { + const unsigned uLetter = PP[uPos+i].m_uResidueGroup; + if (RESIDUE_GROUP_MULTIPLE == uLetter) + return EMPTY; + t = t*4 + uLetter; + } + + return t; + } + +void FindDiagsNuc(const ProfPos *PX, unsigned uLengthX, const ProfPos *PY, + unsigned uLengthY, DiagList &DL) + { + if (ALPHA_DNA != g_Alpha && ALPHA_RNA != g_Alpha) + Quit("FindDiagsNuc: requires nucleo alphabet"); + + DL.Clear(); + +// 16 is arbitrary slop, no principled reason for this. + if (uLengthX < K + 16 || uLengthY < K + 16) + return; + +// Set A to shorter profile, B to longer + const ProfPos *PA; + const ProfPos *PB; + unsigned uLengthA; + unsigned uLengthB; + bool bSwap; + if (uLengthX < uLengthY) + { + bSwap = false; + PA = PX; + PB = PY; + uLengthA = uLengthX; + uLengthB = uLengthY; + } + else + { + bSwap = true; + PA = PY; + PB = PX; + uLengthA = uLengthY; + uLengthB = uLengthX; + } + +#if TRACE + Log("FindDiagsNuc(LengthA=%d LengthB=%d\n", uLengthA, uLengthB); +#endif + +// Build tuple map for the longer profile, B + if (uLengthB < K) + Quit("FindDiags: profile too short"); + + memset(TuplePos, EMPTY, sizeof(TuplePos)); + + for (unsigned uPos = 0; uPos < uLengthB - K; ++uPos) + { + const unsigned uTuple = GetTuple(PB, uPos); + if (EMPTY == uTuple) + continue; + TuplePos[uTuple] = uPos; + } + +// Find matches + for (unsigned uPosA = 0; uPosA < uLengthA - K; ++uPosA) + { + const unsigned uTuple = GetTuple(PA, uPosA); + if (EMPTY == uTuple) + continue; + const unsigned uPosB = TuplePos[uTuple]; + if (EMPTY == uPosB) + continue; + + // This tuple is found in both profiles + unsigned uStartPosA = uPosA; + unsigned uStartPosB = uPosB; + + // Try to extend the match 
forwards + unsigned uEndPosA = uPosA + K - 1; + unsigned uEndPosB = uPosB + K - 1; + for (;;) + { + if (uLengthA - 1 == uEndPosA || uLengthB - 1 == uEndPosB) + break; + const unsigned uAAGroupA = PA[uEndPosA+1].m_uResidueGroup; + if (RESIDUE_GROUP_MULTIPLE == uAAGroupA) + break; + const unsigned uAAGroupB = PB[uEndPosB+1].m_uResidueGroup; + if (RESIDUE_GROUP_MULTIPLE == uAAGroupB) + break; + if (uAAGroupA != uAAGroupB) + break; + ++uEndPosA; + ++uEndPosB; + } + uPosA = uEndPosA; + +#if TRACE + { + Log("Match: A %4u-%4u ", uStartPosA, uEndPosA); + for (unsigned n = uStartPosA; n <= uEndPosA; ++n) + Log("%c", LetterToChar(PA[n].m_uResidueGroup)); + Log("\n"); + Log(" B %4u-%4u ", uStartPosB, uEndPosB); + for (unsigned n = uStartPosB; n <= uEndPosB; ++n) + Log("%c", LetterToChar(PB[n].m_uResidueGroup)); + Log("\n"); + } +#endif + + const unsigned uLength = uEndPosA - uStartPosA + 1; + assert(uEndPosB - uStartPosB + 1 == uLength); + + if (uLength >= g_uMinDiagLength) + { + if (bSwap) + DL.Add(uStartPosB, uStartPosA, uLength); + else + DL.Add(uStartPosA, uStartPosB, uLength); + } + } + } diff --git a/src/muscle/muscle3.8.31/src/gapscoredimer.h b/src/muscle/muscle3.8.31/src/gapscoredimer.h new file mode 100644 index 0000000..08059ab --- /dev/null +++ b/src/muscle/muscle3.8.31/src/gapscoredimer.h @@ -0,0 +1,69 @@ +// source code generated by dimer.py + +static SCORE GapScoreMM(const ProfPos &PPA, const ProfPos &PPB) + { + return + g_scoreGapOpen*(PPA.m_LL*PPB.m_LG + PPA.m_LG*PPB.m_LL + PPA.m_LG*PPB.m_GL + PPA.m_GL*PPB.m_LG) + + g_scoreGapExtend*(PPA.m_LL*PPB.m_GG + PPA.m_GG*PPB.m_LL) + + g_scoreGapAmbig*(PPA.m_GL*PPB.m_GG + PPA.m_GG*PPB.m_GL); + } + +static SCORE GapScoreMD(const ProfPos &PPA, const ProfPos &PPB) + { + return + g_scoreGapOpen*(PPA.m_LL*PPB.m_LL + PPA.m_LL*PPB.m_GL + PPA.m_GL*PPB.m_LL + PPA.m_GL*PPB.m_GL) + + g_scoreGapExtend*(PPA.m_LL*PPB.m_LG + PPA.m_LL*PPB.m_GG) + + g_scoreGapAmbig*(PPA.m_GL*PPB.m_LG + PPA.m_GL*PPB.m_GG); + } + +static SCORE 
GapScoreMI(const ProfPos &PPA, const ProfPos &PPB) + { + return + g_scoreGapOpen*(PPA.m_LL*PPB.m_LL + PPA.m_LL*PPB.m_GL + PPA.m_GL*PPB.m_LL + PPA.m_GL*PPB.m_GL) + + g_scoreGapExtend*(PPA.m_LG*PPB.m_LL + PPA.m_GG*PPB.m_LL) + + g_scoreGapAmbig*(PPA.m_LG*PPB.m_GL + PPA.m_GG*PPB.m_GL); + } + +static SCORE GapScoreDM(const ProfPos &PPA, const ProfPos &PPB) + { + return + g_scoreGapOpen*(PPA.m_LG*PPB.m_LL + PPA.m_LG*PPB.m_GL) + + g_scoreGapExtend*(PPA.m_LL*PPB.m_LG + PPA.m_LL*PPB.m_GG) + + g_scoreGapAmbig*(PPA.m_GL*PPB.m_LG + PPA.m_GL*PPB.m_GG + PPA.m_GG*PPB.m_LL + PPA.m_GG*PPB.m_GL); + } + +static SCORE GapScoreDD(const ProfPos &PPA, const ProfPos &PPB) + { + return + g_scoreGapExtend*(PPA.m_LL*PPB.m_LL + PPA.m_LL*PPB.m_LG + PPA.m_LL*PPB.m_GL + PPA.m_LL*PPB.m_GG) + + g_scoreGapAmbig*(PPA.m_GL*PPB.m_LL + PPA.m_GL*PPB.m_LG + PPA.m_GL*PPB.m_GL + PPA.m_GL*PPB.m_GG); + } + +static SCORE GapScoreDI(const ProfPos &PPA, const ProfPos &PPB) + { + return + g_scoreGapOpen*(PPA.m_LL*PPB.m_LL + PPA.m_LL*PPB.m_GL + PPA.m_GL*PPB.m_LL + PPA.m_GL*PPB.m_GL) + + g_scoreGapAmbig*(PPA.m_LG*PPB.m_LL + PPA.m_LG*PPB.m_GL + PPA.m_GG*PPB.m_LL + PPA.m_GG*PPB.m_GL); + } + +static SCORE GapScoreIM(const ProfPos &PPA, const ProfPos &PPB) + { + return + g_scoreGapOpen*(PPA.m_LL*PPB.m_LG + PPA.m_GL*PPB.m_LG) + + g_scoreGapExtend*(PPA.m_LG*PPB.m_LL + PPA.m_GG*PPB.m_LL) + + g_scoreGapAmbig*(PPA.m_LL*PPB.m_GG + PPA.m_LG*PPB.m_GL + PPA.m_GL*PPB.m_GG + PPA.m_GG*PPB.m_GL); + } + +static SCORE GapScoreID(const ProfPos &PPA, const ProfPos &PPB) + { + return + g_scoreGapOpen*(PPA.m_LL*PPB.m_LL + PPA.m_LL*PPB.m_GL + PPA.m_GL*PPB.m_LL + PPA.m_GL*PPB.m_GL) + + g_scoreGapAmbig*(PPA.m_LL*PPB.m_LG + PPA.m_LL*PPB.m_GG + PPA.m_GL*PPB.m_LG + PPA.m_GL*PPB.m_GG); + } + +static SCORE GapScoreII(const ProfPos &PPA, const ProfPos &PPB) + { + return + g_scoreGapExtend*(PPA.m_LL*PPB.m_LL + PPA.m_LG*PPB.m_LL + PPA.m_GL*PPB.m_LL + PPA.m_GG*PPB.m_LL) + + g_scoreGapAmbig*(PPA.m_LL*PPB.m_GL + PPA.m_LG*PPB.m_GL + PPA.m_GL*PPB.m_GL 
+ PPA.m_GG*PPB.m_GL); + } diff --git a/src/muscle/muscle3.8.31/src/gatest.cpp b/src/muscle/muscle3.8.31/src/gatest.cpp new file mode 100644 index 0000000..35b6ffc --- /dev/null +++ b/src/muscle/muscle3.8.31/src/gatest.cpp @@ -0,0 +1,32 @@ +#include "muscle.h" +#include "pwpath.h" +#include "timing.h" +#include "textfile.h" +#include "msa.h" +#include "profile.h" + +SCORE GlobalAlign(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, PWPath &Path) + { + if (g_bDiags) + return GlobalAlignDiags(PA, uLengthA, PB, uLengthB, Path); + else + return GlobalAlignNoDiags(PA, uLengthA, PB, uLengthB, Path); + } + +SCORE GlobalAlignNoDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, PWPath &Path) + { + switch (g_PPScore) + { + case PPSCORE_LE: + return GlobalAlignLA(PA, uLengthA, PB, uLengthB, Path); + + case PPSCORE_SP: + return GlobalAlignNS(PA, uLengthA, PB, uLengthB, Path); + + case PPSCORE_SV: + return GlobalAlignSimple(PA, uLengthA, PB, uLengthB, Path); + } + return 0; + } diff --git a/src/muscle/muscle3.8.31/src/glbalign.cpp b/src/muscle/muscle3.8.31/src/glbalign.cpp new file mode 100644 index 0000000..a22d6a1 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/glbalign.cpp @@ -0,0 +1,165 @@ +#include "muscle.h" +#include "pwpath.h" +#include "timing.h" +#include "textfile.h" +#include "msa.h" +#include "profile.h" + +#if !VER_3_52 + +#define COMPARE_SIMPLE 0 + +#if TIMING +TICKS g_ticksDP = 0; +#endif + +#if 1 +extern bool g_bKeepSimpleDP; +SCORE NWSmall(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, PWPath &Path); +SCORE NWDASmall(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, PWPath &Path); +SCORE NWDASimple(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, PWPath &Path); +SCORE NWDASimple2(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, PWPath &Path); +SCORE GlobalAlignSimple(const ProfPos *PA, 
unsigned uLengthA, const ProfPos *PB,
+  unsigned uLengthB, PWPath &Path);
+
+// In this build configuration the "no diagonals" entry point simply
+// forwards to GlobalAlign.
+SCORE GlobalAlignNoDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+  unsigned uLengthB, PWPath &Path)
+	{
+	return GlobalAlign(PA, uLengthA, PB, uLengthB, Path);
+	}
+
+#if COMPARE_SIMPLE
+
+// Self-checking variant: aligns with both GlobalAlignSimple and NWSmall
+// and calls Quit() if the two paths differ. Returns the NWSmall score.
+// Note that g_bKeepSimpleDP is set to true and not restored.
+SCORE GlobalAlign(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+  unsigned uLengthB, PWPath &Path)
+	{
+#if TIMING
+	TICKS t1 = GetClockTicks();
+#endif
+	g_bKeepSimpleDP = true;
+	PWPath SimplePath;
+	GlobalAlignSimple(PA, uLengthA, PB, uLengthB, SimplePath);
+
+	SCORE Score = NWSmall(PA, uLengthA, PB, uLengthB, Path);
+
+	if (!Path.Equal(SimplePath))
+		{
+		Log("Simple:\n");
+		SimplePath.LogMe();
+		Log("Small:\n");
+		Path.LogMe();
+		Quit("Paths differ");
+		}
+
+#if TIMING
+	TICKS t2 = GetClockTicks();
+	g_ticksDP += (t2 - t1);
+#endif
+	return Score;
+	}
+
+#else // COMPARE_SIMPLE
+
+// Aligns two profiles with NWSmall and returns its score. When TIMING
+// is enabled the elapsed clock ticks are accumulated in g_ticksDP.
+SCORE GlobalAlign(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+  unsigned uLengthB, PWPath &Path)
+	{
+#if TIMING
+	TICKS t1 = GetClockTicks();
+#endif
+	SCORE Score = NWSmall(PA, uLengthA, PB, uLengthB, Path);
+#if TIMING
+	TICKS t2 = GetClockTicks();
+	g_ticksDP += (t2 - t1);
+#endif
+	return Score;
+	}
+
+#endif
+
+#else // 1
+
+// Everything from here to the matching #endif is compiled out by the
+// "#if 1" above.
+
+// Replaces Path with uLengthB 'I' edges (prefix length in A fixed at 0).
+static void AllInserts(PWPath &Path, unsigned uLengthB)
+	{
+	Path.Clear();
+	PWEdge Edge;
+	Edge.cType = 'I';
+	Edge.uPrefixLengthA = 0;
+	for (unsigned uPrefixLengthB = 1; uPrefixLengthB <= uLengthB; ++uPrefixLengthB)
+		{
+		Edge.uPrefixLengthB = uPrefixLengthB;
+		Path.AppendEdge(Edge);
+		}
+	}
+
+// Replaces Path with uLengthA 'D' edges (prefix length in B fixed at 0).
+static void AllDeletes(PWPath &Path, unsigned uLengthA)
+	{
+	Path.Clear();
+	PWEdge Edge;
+	Edge.cType = 'D';
+	Edge.uPrefixLengthB = 0;
+	for (unsigned uPrefixLengthA = 1; uPrefixLengthA <= uLengthA; ++uPrefixLengthA)
+		{
+		Edge.uPrefixLengthA = uPrefixLengthA;
+		Path.AppendEdge(Edge);
+		}
+	}
+
+// Aligns two profiles. An empty A or B yields an all-insert or
+// all-delete path with score 0; otherwise dispatches on g_bDiags.
+SCORE GlobalAlign(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+  unsigned uLengthB, PWPath &Path)
+	{
+#if TIMING
+	TICKS t1 = GetClockTicks();
+#endif
+	if (0 == uLengthA)
+		{
+		AllInserts(Path, uLengthB);
+		return 0;
+		}
+	else if (0 == uLengthB)
+		{
+		AllDeletes(Path, uLengthA);
+		return 0;
+		}
+
+	SCORE Score = 0;
+	if (g_bDiags)
+		Score = GlobalAlignDiags(PA, uLengthA, PB, uLengthB, Path);
+	else
+		Score = GlobalAlignNoDiags(PA, uLengthA, PB, uLengthB, Path);
+#if TIMING
+	TICKS t2 = GetClockTicks();
+	g_ticksDP += (t2 - t1);
+#endif
+	return Score;
+	}
+
+// Selects the aligner: GlobalAlignDimer when g_bDimer is set, otherwise
+// by g_PPScore. Calls Quit() for an unrecognised g_PPScore.
+SCORE GlobalAlignNoDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+  unsigned uLengthB, PWPath &Path)
+	{
+	if (g_bDimer)
+		return GlobalAlignDimer(PA, uLengthA, PB, uLengthB, Path);
+
+	switch (g_PPScore)
+		{
+	case PPSCORE_LE:
+		return GlobalAlignLE(PA, uLengthA, PB, uLengthB, Path);
+
+	case PPSCORE_SP:
+	case PPSCORE_SV:
+		return GlobalAlignSP(PA, uLengthA, PB, uLengthB, Path);
+
+	case PPSCORE_SPN:
+		return GlobalAlignSPN(PA, uLengthA, PB, uLengthB, Path);
+		}
+
+	Quit("Invalid PP score (GlobalAlignNoDiags)");
+	return 0;
+	}
+
+#endif
+
+#endif // !VER_3_52
diff --git a/src/muscle/muscle3.8.31/src/glbalign352.cpp b/src/muscle/muscle3.8.31/src/glbalign352.cpp
new file mode 100644
index 0000000..6305c7c
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/glbalign352.cpp
@@ -0,0 +1,55 @@
+#include "muscle.h"
+#include "pwpath.h"
+#include "timing.h"
+#include "textfile.h"
+#include "msa.h"
+#include "profile.h"
+
+// This file only contributes code when VER_3_52 is non-zero.
+#if VER_3_52
+
+#if TIMING
+TICKS g_ticksDP = 0;
+#endif
+
+// Aligns two profiles, dispatching on g_bDiags, and returns the score
+// of the selected aligner. When TIMING is enabled the elapsed clock
+// ticks are accumulated in g_ticksDP.
+SCORE GlobalAlign(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+  unsigned uLengthB, PWPath &Path)
+	{
+#if TIMING
+	TICKS t1 = GetClockTicks();
+#endif
+	SCORE Score = 0;
+	if (g_bDiags)
+		Score = GlobalAlignDiags(PA, uLengthA, PB, uLengthB, Path);
+	else
+		Score = GlobalAlignNoDiags(PA, uLengthA, PB, uLengthB, Path);
+#if TIMING
+	TICKS t2 = GetClockTicks();
+	g_ticksDP += (t2 - t1);
+#endif
+	return Score;
+	}
+
+SCORE GlobalAlignNoDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+  unsigned uLengthB, PWPath &Path)
+	{
+	if (g_bDimer)
+		return
GlobalAlignDimer(PA, uLengthA, PB, uLengthB, Path); + + switch (g_PPScore) + { + case PPSCORE_LE: + return GlobalAlignLE(PA, uLengthA, PB, uLengthB, Path); + + case PPSCORE_SP: + case PPSCORE_SV: + return GlobalAlignSP(PA, uLengthA, PB, uLengthB, Path); + + case PPSCORE_SPN: + return GlobalAlignSPN(PA, uLengthA, PB, uLengthB, Path); + } + + Quit("Invalid PP score (GlobalAlignNoDiags)"); + return 0; + } + +#endif // VER_3_52 diff --git a/src/muscle/muscle3.8.31/src/glbaligndiag.cpp b/src/muscle/muscle3.8.31/src/glbaligndiag.cpp new file mode 100644 index 0000000..ff6f035 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/glbaligndiag.cpp @@ -0,0 +1,172 @@ +#include "muscle.h" +#include "dpreglist.h" +#include "diaglist.h" +#include "pwpath.h" +#include "profile.h" +#include "timing.h" + +#define TRACE 0 +#define TRACE_PATH 0 +#define LIST_DIAGS 0 + +static double g_dDPAreaWithoutDiags = 0.0; +static double g_dDPAreaWithDiags = 0.0; + +static void OffsetPath(PWPath &Path, unsigned uOffsetA, unsigned uOffsetB) + { + const unsigned uEdgeCount = Path.GetEdgeCount(); + for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) + { + const PWEdge &Edge = Path.GetEdge(uEdgeIndex); + + // Nasty hack -- poke new values back into path, circumventing class + PWEdge &NonConstEdge = (PWEdge &) Edge; + NonConstEdge.uPrefixLengthA += uOffsetA; + NonConstEdge.uPrefixLengthB += uOffsetB; + } + } + +static void DiagToPath(const Diag &d, PWPath &Path) + { + Path.Clear(); + const unsigned uLength = d.m_uLength; + for (unsigned i = 0; i < uLength; ++i) + { + PWEdge Edge; + Edge.cType = 'M'; + Edge.uPrefixLengthA = d.m_uStartPosA + i + 1; + Edge.uPrefixLengthB = d.m_uStartPosB + i + 1; + Path.AppendEdge(Edge); + } + } + +static void AppendRegPath(PWPath &Path, const PWPath &RegPath) + { + const unsigned uRegEdgeCount = RegPath.GetEdgeCount(); + for (unsigned uRegEdgeIndex = 0; uRegEdgeIndex < uRegEdgeCount; ++uRegEdgeIndex) + { + const PWEdge &RegEdge = 
RegPath.GetEdge(uRegEdgeIndex); + Path.AppendEdge(RegEdge); + } + } + +SCORE GlobalAlignDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, PWPath &Path) + { +#if LIST_DIAGS + TICKS t1 = GetClockTicks(); +#endif + + DiagList DL; + + if (ALPHA_Amino == g_Alpha) + FindDiags(PA, uLengthA, PB, uLengthB, DL); + else if (ALPHA_DNA == g_Alpha || ALPHA_RNA == g_Alpha) + FindDiagsNuc(PA, uLengthA, PB, uLengthB, DL); + else + Quit("GlobalAlignDiags: bad alpha"); + +#if TRACE + Log("GlobalAlignDiags, diag list:\n"); + DL.LogMe(); +#endif + + DL.Sort(); + DL.DeleteIncompatible(); + +#if TRACE + Log("After DeleteIncompatible:\n"); + DL.LogMe(); +#endif + + MergeDiags(DL); + +#if TRACE + Log("After MergeDiags:\n"); + DL.LogMe(); +#endif + + DPRegionList RL; + DiagListToDPRegionList(DL, RL, uLengthA, uLengthB); + +#if TRACE + Log("RegionList:\n"); + RL.LogMe(); +#endif + +#if LIST_DIAGS + { + TICKS t2 = GetClockTicks(); + unsigned uArea = RL.GetDPArea(); + Log("ticks=%ld\n", (long) (t2 - t1)); + Log("area=%u\n", uArea); + } +#endif + + g_dDPAreaWithoutDiags += uLengthA*uLengthB; + + double dDPAreaWithDiags = 0.0; + const unsigned uRegionCount = RL.GetCount(); + for (unsigned uRegionIndex = 0; uRegionIndex < uRegionCount; ++uRegionIndex) + { + const DPRegion &r = RL.Get(uRegionIndex); + + PWPath RegPath; + if (DPREGIONTYPE_Diag == r.m_Type) + { + DiagToPath(r.m_Diag, RegPath); +#if TRACE_PATH + Log("DiagToPath, path=\n"); + RegPath.LogMe(); +#endif + } + else if (DPREGIONTYPE_Rect == r.m_Type) + { + const unsigned uRegStartPosA = r.m_Rect.m_uStartPosA; + const unsigned uRegStartPosB = r.m_Rect.m_uStartPosB; + const unsigned uRegLengthA = r.m_Rect.m_uLengthA; + const unsigned uRegLengthB = r.m_Rect.m_uLengthB; + const ProfPos *RegPA = PA + uRegStartPosA; + const ProfPos *RegPB = PB + uRegStartPosB; + + dDPAreaWithDiags += uRegLengthA*uRegLengthB; + GlobalAlignNoDiags(RegPA, uRegLengthA, RegPB, uRegLengthB, RegPath); +#if TRACE_PATH + 
Log("GlobalAlignNoDiags RegPath=\n"); + RegPath.LogMe(); +#endif + OffsetPath(RegPath, uRegStartPosA, uRegStartPosB); +#if TRACE_PATH + Log("After offset path, RegPath=\n"); + RegPath.LogMe(); +#endif + } + else + Quit("GlobalAlignDiags, Invalid region type %u", r.m_Type); + + AppendRegPath(Path, RegPath); +#if TRACE_PATH + Log("After AppendPath, path="); + Path.LogMe(); +#endif + } + +#if TRACE + { + double dDPAreaWithoutDiags = uLengthA*uLengthB; + Log("DP area with diags %.3g without %.3g pct saved %.3g %%\n", + dDPAreaWithDiags, dDPAreaWithoutDiags, (1.0 - dDPAreaWithDiags/dDPAreaWithoutDiags)*100.0); + } +#endif + g_dDPAreaWithDiags += dDPAreaWithDiags; + return 0; + } + +void ListDiagSavings() + { + if (!g_bVerbose || !g_bDiags) + return; + double dAreaSaved = g_dDPAreaWithoutDiags - g_dDPAreaWithDiags; + double dPct = dAreaSaved*100.0/g_dDPAreaWithoutDiags; + Log("DP area saved by diagonals %-4.1f%%\n", dPct); + } diff --git a/src/muscle/muscle3.8.31/src/glbalignla.cpp b/src/muscle/muscle3.8.31/src/glbalignla.cpp new file mode 100644 index 0000000..f28f487 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/glbalignla.cpp @@ -0,0 +1,432 @@ +#include "muscle.h" +#include "profile.h" +#include "pwpath.h" + +#define OCC 1 + +struct DP_MEMORY + { + unsigned uLength; + SCORE *GapOpenA; + SCORE *GapOpenB; + SCORE *GapCloseA; + SCORE *GapCloseB; + SCORE *MPrev; + SCORE *MCurr; + SCORE *MWork; + SCORE *DPrev; + SCORE *DCurr; + SCORE *DWork; + SCORE **ScoreMxB; +#if OCC + FCOUNT *OccA; + FCOUNT *OccB; +#endif + unsigned **SortOrderA; + unsigned *uDeletePos; + FCOUNT **FreqsA; + int **TraceBack; + }; + +static struct DP_MEMORY DPM; + +static void AllocDPMem(unsigned uLengthA, unsigned uLengthB) + { +// Max prefix length + unsigned uLength = (uLengthA > uLengthB ? uLengthA : uLengthB) + 1; + if (uLength < DPM.uLength) + return; + +// Add 256 to allow for future expansion and +// round up to next multiple of 32. 
+ uLength += 256; + uLength += 32 - uLength%32; + + const unsigned uOldLength = DPM.uLength; + if (uOldLength > 0) + { + for (unsigned i = 0; i < uOldLength; ++i) + { + delete[] DPM.TraceBack[i]; + delete[] DPM.FreqsA[i]; + delete[] DPM.SortOrderA[i]; + } + for (unsigned n = 0; n < 20; ++n) + delete[] DPM.ScoreMxB[n]; + + delete[] DPM.MPrev; + delete[] DPM.MCurr; + delete[] DPM.MWork; + delete[] DPM.DPrev; + delete[] DPM.DCurr; + delete[] DPM.DWork; + delete[] DPM.uDeletePos; + delete[] DPM.GapOpenA; + delete[] DPM.GapOpenB; + delete[] DPM.GapCloseA; + delete[] DPM.GapCloseB; + delete[] DPM.SortOrderA; + delete[] DPM.FreqsA; + delete[] DPM.ScoreMxB; + delete[] DPM.TraceBack; +#if OCC + delete[] DPM.OccA; + delete[] DPM.OccB; +#endif + } + + DPM.uLength = uLength; + + DPM.GapOpenA = new SCORE[uLength]; + DPM.GapOpenB = new SCORE[uLength]; + DPM.GapCloseA = new SCORE[uLength]; + DPM.GapCloseB = new SCORE[uLength]; +#if OCC + DPM.OccA = new FCOUNT[uLength]; + DPM.OccB = new FCOUNT[uLength]; +#endif + + DPM.SortOrderA = new unsigned*[uLength]; + DPM.FreqsA = new FCOUNT*[uLength]; + DPM.ScoreMxB = new SCORE*[20]; + DPM.MPrev = new SCORE[uLength]; + DPM.MCurr = new SCORE[uLength]; + DPM.MWork = new SCORE[uLength]; + + DPM.DPrev = new SCORE[uLength]; + DPM.DCurr = new SCORE[uLength]; + DPM.DWork = new SCORE[uLength]; + DPM.uDeletePos = new unsigned[uLength]; + + DPM.TraceBack = new int*[uLength]; + + for (unsigned uLetter = 0; uLetter < 20; ++uLetter) + DPM.ScoreMxB[uLetter] = new SCORE[uLength]; + + for (unsigned i = 0; i < uLength; ++i) + { + DPM.SortOrderA[i] = new unsigned[20]; + DPM.FreqsA[i] = new FCOUNT[20]; + DPM.TraceBack[i] = new int[uLength]; + } + } + +SCORE GlobalAlignLA(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, PWPath &Path) + { + const unsigned uPrefixCountA = uLengthA + 1; + const unsigned uPrefixCountB = uLengthB + 1; + + AllocDPMem(uLengthA, uLengthB); + + SCORE *GapOpenA = DPM.GapOpenA; + SCORE *GapOpenB = 
DPM.GapOpenB; + SCORE *GapCloseA = DPM.GapCloseA; + SCORE *GapCloseB = DPM.GapCloseB; + + unsigned **SortOrderA = DPM.SortOrderA; + FCOUNT **FreqsA = DPM.FreqsA; + SCORE **ScoreMxB = DPM.ScoreMxB; + SCORE *MPrev = DPM.MPrev; + SCORE *MCurr = DPM.MCurr; + SCORE *MWork = DPM.MWork; + + SCORE *DPrev = DPM.DPrev; + SCORE *DCurr = DPM.DCurr; + SCORE *DWork = DPM.DWork; + +#if OCC + FCOUNT *OccA = DPM.OccA; + FCOUNT *OccB = DPM.OccB; +#endif + + unsigned *uDeletePos = DPM.uDeletePos; + + int **TraceBack = DPM.TraceBack; + + for (unsigned i = 0; i < uLengthA; ++i) + { + GapOpenA[i] = PA[i].m_scoreGapOpen; + GapCloseA[i] = PA[i].m_scoreGapClose; +#if OCC + OccA[i] = PA[i].m_fOcc; +#endif + + for (unsigned uLetter = 0; uLetter < 20; ++uLetter) + { + SortOrderA[i][uLetter] = PA[i].m_uSortOrder[uLetter]; + FreqsA[i][uLetter] = PA[i].m_fcCounts[uLetter]; + } + } + + for (unsigned j = 0; j < uLengthB; ++j) + { + GapOpenB[j] = PB[j].m_scoreGapOpen; + GapCloseB[j] = PB[j].m_scoreGapClose; +#if OCC + OccB[j] = PB[j].m_fOcc; +#endif + } + + for (unsigned uLetter = 0; uLetter < 20; ++uLetter) + { + for (unsigned j = 0; j < uLengthB; ++j) + ScoreMxB[uLetter][j] = PB[j].m_AAScores[uLetter]; + } + + for (unsigned i = 0; i < uPrefixCountA; ++i) + memset(TraceBack[i], 0, uPrefixCountB*sizeof(int)); + +// Special case for i=0 + unsigned **ptrSortOrderA = SortOrderA; + FCOUNT **ptrFreqsA = FreqsA; + assert(ptrSortOrderA == &(SortOrderA[0])); + assert(ptrFreqsA == &(FreqsA[0])); + TraceBack[0][0] = 0; + + SCORE scoreSum = 0; + unsigned *ptrSortOrderAi = SortOrderA[0]; + const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20; + FCOUNT *ptrFreqsAi = FreqsA[0]; + for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) + { + const unsigned uLetter = *ptrSortOrderAi; + const FCOUNT fcLetter = ptrFreqsAi[uLetter]; + if (0 == fcLetter) + break; + scoreSum += fcLetter*ScoreMxB[uLetter][0]; + } + if (0 == scoreSum) + MPrev[0] = -2.5; + else + { +#if OCC + MPrev[0] = (logf(scoreSum) - 
g_scoreCenter)*OccA[0]*OccB[0]; +#else + MPrev[0] = (logf(scoreSum) - g_scoreCenter); +#endif + } + +// D(0,0) is -infinity (requires I->D). + DPrev[0] = MINUS_INFINITY; + + for (unsigned j = 1; j < uLengthB; ++j) + { + // Only way to get M(0, j) looks like this: + // A ----X + // B XXXXX + // 0 j + // So gap-open at j=0, gap-close at j-1. + SCORE scoreSum = 0; + unsigned *ptrSortOrderAi = SortOrderA[0]; + const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20; + FCOUNT *ptrFreqsAi = FreqsA[0]; + for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) + { + const unsigned uLetter = *ptrSortOrderAi; + const FCOUNT fcLetter = ptrFreqsAi[uLetter]; + if (0 == fcLetter) + break; + scoreSum += fcLetter*ScoreMxB[uLetter][j]; + } + if (0 == scoreSum) + MPrev[j] = -2.5; + else + { +#if OCC + MPrev[j] = (logf(scoreSum) - g_scoreCenter)*OccA[0]*OccB[j] + + GapOpenB[0] + GapCloseB[j-1]; +#else + MPrev[j] = (logf(scoreSum) - g_scoreCenter) + + GapOpenB[0] + GapCloseB[j-1]; +#endif + } + TraceBack[0][j] = -(int) j; + + // Assume no D->I transitions, then can't be a delete if only + // one letter from A. 
+ DPrev[j] = MINUS_INFINITY; + } + + SCORE IPrev_j_1; + for (unsigned i = 1; i < uLengthA; ++i) + { + ++ptrSortOrderA; + ++ptrFreqsA; + assert(ptrSortOrderA == &(SortOrderA[i])); + assert(ptrFreqsA == &(FreqsA[i])); + + SCORE *ptrMCurr_j = MCurr; + memset(ptrMCurr_j, 0, uLengthB*sizeof(SCORE)); + const FCOUNT *FreqsAi = *ptrFreqsA; + + const unsigned *SortOrderAi = *ptrSortOrderA; + const unsigned *ptrSortOrderAiEnd = SortOrderAi + 20; + const SCORE *ptrMCurrMax = MCurr + uLengthB; + for (const unsigned *ptrSortOrderAi = SortOrderAi; + ptrSortOrderAi != ptrSortOrderAiEnd; + ++ptrSortOrderAi) + { + const unsigned uLetter = *ptrSortOrderAi; + SCORE *NSBR_Letter = ScoreMxB[uLetter]; + const FCOUNT fcLetter = FreqsAi[uLetter]; + if (0 == fcLetter) + break; + SCORE *ptrNSBR = NSBR_Letter; + for (SCORE *ptrMCurr = MCurr; ptrMCurr != ptrMCurrMax; ++ptrMCurr) + *ptrMCurr += fcLetter*(*ptrNSBR++); + } + +#if OCC + const FCOUNT OccAi = OccA[i]; +#endif + for (unsigned j = 0; j < uLengthB; ++j) + { + if (MCurr[j] == 0) + MCurr[j] = -2.5; + else +#if OCC + MCurr[j] = (logf(MCurr[j]) - g_scoreCenter)*OccAi*OccB[j]; +#else + MCurr[j] = (logf(MCurr[j]) - g_scoreCenter); +#endif + } + + ptrMCurr_j = MCurr; + unsigned *ptrDeletePos = uDeletePos; + + // Special case for j=0 + // Only way to get M(i, 0) looks like this: + // 0 i + // A XXXXX + // B ----X + // So gap-open at i=0, gap-close at i-1. 
+ assert(ptrMCurr_j == &(MCurr[0])); + *ptrMCurr_j += GapOpenA[0] + GapCloseA[i-1]; + + ++ptrMCurr_j; + + int *ptrTraceBack_ij = TraceBack[i]; + *ptrTraceBack_ij++ = (int) i; + + SCORE *ptrMPrev_j = MPrev; + SCORE *ptrDPrev = DPrev; + SCORE d = *ptrDPrev; + SCORE DNew = *ptrMPrev_j + GapOpenA[i]; + if (DNew > d) + { + d = DNew; + *ptrDeletePos = i; + } + + SCORE *ptrDCurr = DCurr; + + assert(ptrDCurr == &(DCurr[0])); + *ptrDCurr = d; + + // Can't have an insert if no letters from B + IPrev_j_1 = MINUS_INFINITY; + + unsigned uInsertPos; + const SCORE scoreGapOpenAi = GapOpenA[i]; + const SCORE scoreGapCloseAi_1 = GapCloseA[i-1]; + + for (unsigned j = 1; j < uLengthB; ++j) + { + // Here, MPrev_j is preserved from previous + // iteration so with current i,j is M[i-1][j-1] + SCORE MPrev_j = *ptrMPrev_j; + SCORE INew = MPrev_j + GapOpenB[j]; + if (INew > IPrev_j_1) + { + IPrev_j_1 = INew; + uInsertPos = j; + } + + SCORE scoreMax = MPrev_j; + + assert(ptrDPrev == &(DPrev[j-1])); + SCORE scoreD = *ptrDPrev++ + scoreGapCloseAi_1; + if (scoreD > scoreMax) + { + scoreMax = scoreD; + assert(ptrDeletePos == &(uDeletePos[j-1])); + *ptrTraceBack_ij = (int) i - (int) *ptrDeletePos; + assert(*ptrTraceBack_ij > 0); + } + ++ptrDeletePos; + + SCORE scoreI = IPrev_j_1 + GapCloseB[j-1]; + if (scoreI > scoreMax) + { + scoreMax = scoreI; + *ptrTraceBack_ij = (int) uInsertPos - (int) j; + assert(*ptrTraceBack_ij < 0); + } + + assert(ptrSortOrderA == &(SortOrderA[i])); + assert(ptrFreqsA == &(FreqsA[i])); + + *ptrMCurr_j += scoreMax; + assert(ptrMCurr_j == &(MCurr[j])); + ++ptrMCurr_j; + + MPrev_j = *(++ptrMPrev_j); + assert(ptrDPrev == &(DPrev[j])); + SCORE d = *ptrDPrev; + SCORE DNew = MPrev_j + scoreGapOpenAi; + if (DNew > d) + { + d = DNew; + assert(ptrDeletePos == &uDeletePos[j]); + *ptrDeletePos = i; + } + assert(ptrDCurr + 1 == &(DCurr[j])); + *(++ptrDCurr) = d; + + ++ptrTraceBack_ij; + } + + Rotate(MPrev, MCurr, MWork); + Rotate(DPrev, DCurr, DWork); + } + +// Special case for 
i=uLengthA + SCORE IPrev = MINUS_INFINITY; + + unsigned uInsertPos; + + for (unsigned j = 1; j < uLengthB; ++j) + { + SCORE INew = MPrev[j-1] + GapOpenB[j]; + if (INew > IPrev) + { + uInsertPos = j; + IPrev = INew; + } + } + +// Special case for i=uLengthA, j=uLengthB + SCORE scoreMax = MPrev[uLengthB-1]; + int iTraceBack = 0; + + SCORE scoreD = DPrev[uLengthB-1] + GapCloseA[uLengthA-1]; + if (scoreD > scoreMax) + { + scoreMax = scoreD; + iTraceBack = (int) uLengthA - (int) uDeletePos[uLengthB-1]; + } + + SCORE scoreI = IPrev + GapCloseB[uLengthB-1]; + if (scoreI > scoreMax) + { + scoreMax = scoreI; + iTraceBack = (int) uInsertPos - (int) uLengthB; + } + + TraceBack[uLengthA][uLengthB] = iTraceBack; + + TraceBackToPath(TraceBack, uLengthA, uLengthB, Path); + + return scoreMax; + } diff --git a/src/muscle/muscle3.8.31/src/glbalignle.cpp b/src/muscle/muscle3.8.31/src/glbalignle.cpp new file mode 100644 index 0000000..136ac82 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/glbalignle.cpp @@ -0,0 +1,435 @@ +#include "muscle.h" +#include "profile.h" +#include "pwpath.h" + +#define OCC 1 + +struct DP_MEMORY + { + unsigned uLength; + SCORE *GapOpenA; + SCORE *GapOpenB; + SCORE *GapCloseA; + SCORE *GapCloseB; + SCORE *MPrev; + SCORE *MCurr; + SCORE *MWork; + SCORE *DPrev; + SCORE *DCurr; + SCORE *DWork; + SCORE **ScoreMxB; +#if OCC + FCOUNT *OccA; + FCOUNT *OccB; +#endif + unsigned **SortOrderA; + unsigned *uDeletePos; + FCOUNT **FreqsA; + int **TraceBack; + }; + +static struct DP_MEMORY DPM; + +static void AllocDPMem(unsigned uLengthA, unsigned uLengthB) + { +// Max prefix length + unsigned uLength = (uLengthA > uLengthB ? uLengthA : uLengthB) + 1; + if (uLength < DPM.uLength) + return; + +// Add 256 to allow for future expansion and +// round up to next multiple of 32. 
+	uLength += 256;
+	uLength += 32 - uLength%32;
+
+	// Release the buffers from the previous, smaller allocation.
+	const unsigned uOldLength = DPM.uLength;
+	if (uOldLength > 0)
+		{
+		for (unsigned i = 0; i < uOldLength; ++i)
+			{
+			delete[] DPM.TraceBack[i];
+			delete[] DPM.FreqsA[i];
+			delete[] DPM.SortOrderA[i];
+			}
+		for (unsigned n = 0; n < 20; ++n)
+			delete[] DPM.ScoreMxB[n];
+
+		delete[] DPM.MPrev;
+		delete[] DPM.MCurr;
+		delete[] DPM.MWork;
+		delete[] DPM.DPrev;
+		delete[] DPM.DCurr;
+		delete[] DPM.DWork;
+		delete[] DPM.uDeletePos;
+		delete[] DPM.GapOpenA;
+		delete[] DPM.GapOpenB;
+		delete[] DPM.GapCloseA;
+		delete[] DPM.GapCloseB;
+		delete[] DPM.SortOrderA;
+		delete[] DPM.FreqsA;
+		delete[] DPM.ScoreMxB;
+		delete[] DPM.TraceBack;
+#if OCC
+		delete[] DPM.OccA;
+		delete[] DPM.OccB;
+#endif
+		}
+
+	// Record the new capacity, then allocate every buffer at that size.
+	DPM.uLength = uLength;
+
+	DPM.GapOpenA = new SCORE[uLength];
+	DPM.GapOpenB = new SCORE[uLength];
+	DPM.GapCloseA = new SCORE[uLength];
+	DPM.GapCloseB = new SCORE[uLength];
+#if OCC
+	DPM.OccA = new FCOUNT[uLength];
+	DPM.OccB = new FCOUNT[uLength];
+#endif
+
+	DPM.SortOrderA = new unsigned*[uLength];
+	DPM.FreqsA = new FCOUNT*[uLength];
+	DPM.ScoreMxB = new SCORE*[20];
+	DPM.MPrev = new SCORE[uLength];
+	DPM.MCurr = new SCORE[uLength];
+	DPM.MWork = new SCORE[uLength];
+
+	DPM.DPrev = new SCORE[uLength];
+	DPM.DCurr = new SCORE[uLength];
+	DPM.DWork = new SCORE[uLength];
+	DPM.uDeletePos = new unsigned[uLength];
+
+	DPM.TraceBack = new int*[uLength];
+
+	// One score row per letter (20 letters), uLength columns each.
+	for (unsigned uLetter = 0; uLetter < 20; ++uLetter)
+		DPM.ScoreMxB[uLetter] = new SCORE[uLength];
+
+	// Per-row arrays: 20 entries for sort order and counts, and a full
+	// uLength-wide traceback row.
+	for (unsigned i = 0; i < uLength; ++i)
+		{
+		DPM.SortOrderA[i] = new unsigned[20];
+		DPM.FreqsA[i] = new FCOUNT[20];
+		DPM.TraceBack[i] = new int[uLength];
+		}
+	}
+
+SCORE GlobalAlignLE(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+  unsigned uLengthB, PWPath &Path)
+	{
+	SetTermGaps(PA, uLengthA);
+	SetTermGaps(PB, uLengthB);
+
+	const unsigned uPrefixCountA = uLengthA + 1;
+	const unsigned uPrefixCountB = uLengthB + 1;
+
+	// Ensure the shared DPM scratch buffers are large enough.
+	AllocDPMem(uLengthA, uLengthB);
+
+
SCORE *GapOpenA = DPM.GapOpenA; + SCORE *GapOpenB = DPM.GapOpenB; + SCORE *GapCloseA = DPM.GapCloseA; + SCORE *GapCloseB = DPM.GapCloseB; + + unsigned **SortOrderA = DPM.SortOrderA; + FCOUNT **FreqsA = DPM.FreqsA; + SCORE **ScoreMxB = DPM.ScoreMxB; + SCORE *MPrev = DPM.MPrev; + SCORE *MCurr = DPM.MCurr; + SCORE *MWork = DPM.MWork; + + SCORE *DPrev = DPM.DPrev; + SCORE *DCurr = DPM.DCurr; + SCORE *DWork = DPM.DWork; + +#if OCC + FCOUNT *OccA = DPM.OccA; + FCOUNT *OccB = DPM.OccB; +#endif + + unsigned *uDeletePos = DPM.uDeletePos; + + int **TraceBack = DPM.TraceBack; + + for (unsigned i = 0; i < uLengthA; ++i) + { + GapOpenA[i] = PA[i].m_scoreGapOpen; + GapCloseA[i] = PA[i].m_scoreGapClose; +#if OCC + OccA[i] = PA[i].m_fOcc; +#endif + + for (unsigned uLetter = 0; uLetter < 20; ++uLetter) + { + SortOrderA[i][uLetter] = PA[i].m_uSortOrder[uLetter]; + FreqsA[i][uLetter] = PA[i].m_fcCounts[uLetter]; + } + } + + for (unsigned j = 0; j < uLengthB; ++j) + { + GapOpenB[j] = PB[j].m_scoreGapOpen; + GapCloseB[j] = PB[j].m_scoreGapClose; +#if OCC + OccB[j] = PB[j].m_fOcc; +#endif + } + + for (unsigned uLetter = 0; uLetter < 20; ++uLetter) + { + for (unsigned j = 0; j < uLengthB; ++j) + ScoreMxB[uLetter][j] = PB[j].m_AAScores[uLetter]; + } + + for (unsigned i = 0; i < uPrefixCountA; ++i) + memset(TraceBack[i], 0, uPrefixCountB*sizeof(int)); + +// Special case for i=0 + unsigned **ptrSortOrderA = SortOrderA; + FCOUNT **ptrFreqsA = FreqsA; + assert(ptrSortOrderA == &(SortOrderA[0])); + assert(ptrFreqsA == &(FreqsA[0])); + TraceBack[0][0] = 0; + + SCORE scoreSum = 0; + unsigned *ptrSortOrderAi = SortOrderA[0]; + const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20; + FCOUNT *ptrFreqsAi = FreqsA[0]; + for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) + { + const unsigned uLetter = *ptrSortOrderAi; + const FCOUNT fcLetter = ptrFreqsAi[uLetter]; + if (0 == fcLetter) + break; + scoreSum += fcLetter*ScoreMxB[uLetter][0]; + } + if (0 == scoreSum) + MPrev[0] = -2.5; + else + 
{ +#if OCC + MPrev[0] = (logf(scoreSum) - g_scoreCenter)*OccA[0]*OccB[0]; +#else + MPrev[0] = (logf(scoreSum) - g_scoreCenter); +#endif + } + +// D(0,0) is -infinity (requires I->D). + DPrev[0] = MINUS_INFINITY; + + for (unsigned j = 1; j < uLengthB; ++j) + { + // Only way to get M(0, j) looks like this: + // A ----X + // B XXXXX + // 0 j + // So gap-open at j=0, gap-close at j-1. + SCORE scoreSum = 0; + unsigned *ptrSortOrderAi = SortOrderA[0]; + const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20; + FCOUNT *ptrFreqsAi = FreqsA[0]; + for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) + { + const unsigned uLetter = *ptrSortOrderAi; + const FCOUNT fcLetter = ptrFreqsAi[uLetter]; + if (0 == fcLetter) + break; + scoreSum += fcLetter*ScoreMxB[uLetter][j]; + } + if (0 == scoreSum) + MPrev[j] = -2.5; + else + { +#if OCC + MPrev[j] = (logf(scoreSum) - g_scoreCenter)*OccA[0]*OccB[j] + + GapOpenB[0] + GapCloseB[j-1]; +#else + MPrev[j] = (logf(scoreSum) - g_scoreCenter) + + GapOpenB[0] + GapCloseB[j-1]; +#endif + } + TraceBack[0][j] = -(int) j; + + // Assume no D->I transitions, then can't be a delete if only + // one letter from A. 
+ DPrev[j] = MINUS_INFINITY; + } + + SCORE IPrev_j_1; + for (unsigned i = 1; i < uLengthA; ++i) + { + ++ptrSortOrderA; + ++ptrFreqsA; + assert(ptrSortOrderA == &(SortOrderA[i])); + assert(ptrFreqsA == &(FreqsA[i])); + + SCORE *ptrMCurr_j = MCurr; + memset(ptrMCurr_j, 0, uLengthB*sizeof(SCORE)); + const FCOUNT *FreqsAi = *ptrFreqsA; + + const unsigned *SortOrderAi = *ptrSortOrderA; + const unsigned *ptrSortOrderAiEnd = SortOrderAi + 20; + const SCORE *ptrMCurrMax = MCurr + uLengthB; + for (const unsigned *ptrSortOrderAi = SortOrderAi; + ptrSortOrderAi != ptrSortOrderAiEnd; + ++ptrSortOrderAi) + { + const unsigned uLetter = *ptrSortOrderAi; + SCORE *NSBR_Letter = ScoreMxB[uLetter]; + const FCOUNT fcLetter = FreqsAi[uLetter]; + if (0 == fcLetter) + break; + SCORE *ptrNSBR = NSBR_Letter; + for (SCORE *ptrMCurr = MCurr; ptrMCurr != ptrMCurrMax; ++ptrMCurr) + *ptrMCurr += fcLetter*(*ptrNSBR++); + } + +#if OCC + const FCOUNT OccAi = OccA[i]; +#endif + for (unsigned j = 0; j < uLengthB; ++j) + { + if (MCurr[j] == 0) + MCurr[j] = -2.5; + else +#if OCC + MCurr[j] = (logf(MCurr[j]) - g_scoreCenter)*OccAi*OccB[j]; +#else + MCurr[j] = (logf(MCurr[j]) - g_scoreCenter); +#endif + } + + ptrMCurr_j = MCurr; + unsigned *ptrDeletePos = uDeletePos; + + // Special case for j=0 + // Only way to get M(i, 0) looks like this: + // 0 i + // A XXXXX + // B ----X + // So gap-open at i=0, gap-close at i-1. 
+ assert(ptrMCurr_j == &(MCurr[0])); + *ptrMCurr_j += GapOpenA[0] + GapCloseA[i-1]; + + ++ptrMCurr_j; + + int *ptrTraceBack_ij = TraceBack[i]; + *ptrTraceBack_ij++ = (int) i; + + SCORE *ptrMPrev_j = MPrev; + SCORE *ptrDPrev = DPrev; + SCORE d = *ptrDPrev; + SCORE DNew = *ptrMPrev_j + GapOpenA[i]; + if (DNew > d) + { + d = DNew; + *ptrDeletePos = i; + } + + SCORE *ptrDCurr = DCurr; + + assert(ptrDCurr == &(DCurr[0])); + *ptrDCurr = d; + + // Can't have an insert if no letters from B + IPrev_j_1 = MINUS_INFINITY; + + unsigned uInsertPos = 0; + const SCORE scoreGapOpenAi = GapOpenA[i]; + const SCORE scoreGapCloseAi_1 = GapCloseA[i-1]; + + for (unsigned j = 1; j < uLengthB; ++j) + { + // Here, MPrev_j is preserved from previous + // iteration so with current i,j is M[i-1][j-1] + SCORE MPrev_j = *ptrMPrev_j; + SCORE INew = MPrev_j + GapOpenB[j]; + if (INew > IPrev_j_1) + { + IPrev_j_1 = INew; + uInsertPos = j; + } + + SCORE scoreMax = MPrev_j; + + assert(ptrDPrev == &(DPrev[j-1])); + SCORE scoreD = *ptrDPrev++ + scoreGapCloseAi_1; + if (scoreD > scoreMax) + { + scoreMax = scoreD; + assert(ptrDeletePos == &(uDeletePos[j-1])); + *ptrTraceBack_ij = (int) i - (int) *ptrDeletePos; + assert(*ptrTraceBack_ij > 0); + } + ++ptrDeletePos; + + SCORE scoreI = IPrev_j_1 + GapCloseB[j-1]; + if (scoreI > scoreMax) + { + scoreMax = scoreI; + *ptrTraceBack_ij = (int) uInsertPos - (int) j; + assert(*ptrTraceBack_ij < 0); + } + + assert(ptrSortOrderA == &(SortOrderA[i])); + assert(ptrFreqsA == &(FreqsA[i])); + + *ptrMCurr_j += scoreMax; + assert(ptrMCurr_j == &(MCurr[j])); + ++ptrMCurr_j; + + MPrev_j = *(++ptrMPrev_j); + assert(ptrDPrev == &(DPrev[j])); + SCORE d = *ptrDPrev; + SCORE DNew = MPrev_j + scoreGapOpenAi; + if (DNew > d) + { + d = DNew; + assert(ptrDeletePos == &uDeletePos[j]); + *ptrDeletePos = i; + } + assert(ptrDCurr + 1 == &(DCurr[j])); + *(++ptrDCurr) = d; + + ++ptrTraceBack_ij; + } + + Rotate(MPrev, MCurr, MWork); + Rotate(DPrev, DCurr, DWork); + } + +// Special case for 
i=uLengthA + SCORE IPrev = MINUS_INFINITY; + + unsigned uInsertPos = 0; // defined even when the scan below never runs (uLengthB == 1) + + for (unsigned j = 1; j < uLengthB; ++j) + { + SCORE INew = MPrev[j-1] + GapOpenB[j]; + if (INew > IPrev) + { + uInsertPos = j; + IPrev = INew; + } + } + +// Special case for i=uLengthA, j=uLengthB + SCORE scoreMax = MPrev[uLengthB-1]; + int iTraceBack = 0; + + SCORE scoreD = DPrev[uLengthB-1] + GapCloseA[uLengthA-1]; + if (scoreD > scoreMax) + { + scoreMax = scoreD; + iTraceBack = (int) uLengthA - (int) uDeletePos[uLengthB-1]; + } + + SCORE scoreI = IPrev + GapCloseB[uLengthB-1]; + if (scoreI > scoreMax) + { + scoreMax = scoreI; + iTraceBack = (int) uInsertPos - (int) uLengthB; + } + + TraceBack[uLengthA][uLengthB] = iTraceBack; + + TraceBackToPath(TraceBack, uLengthA, uLengthB, Path); + + return scoreMax; + } diff --git a/src/muscle/muscle3.8.31/src/glbalignns.cpp b/src/muscle/muscle3.8.31/src/glbalignns.cpp new file mode 100644 index 0000000..45827db --- /dev/null +++ b/src/muscle/muscle3.8.31/src/glbalignns.cpp @@ -0,0 +1,374 @@ +#include "muscle.h" +#include "profile.h" +#include "pwpath.h" + +struct DP_MEMORY + { + unsigned uLength; + SCORE *GapOpenA; + SCORE *GapOpenB; + SCORE *GapCloseA; + SCORE *GapCloseB; + SCORE *MPrev; + SCORE *MCurr; + SCORE *MWork; + SCORE *DPrev; + SCORE *DCurr; + SCORE *DWork; + SCORE **ScoreMxB; + unsigned **SortOrderA; + unsigned *uDeletePos; + FCOUNT **FreqsA; + int **TraceBack; + }; + +static struct DP_MEMORY DPM; + +static void AllocDPMem(unsigned uLengthA, unsigned uLengthB) + { +// Max prefix length + unsigned uLength = (uLengthA > uLengthB ? uLengthA : uLengthB) + 1; + if (uLength < DPM.uLength) + return; + +// Add 256 to allow for future expansion and +// round up to next multiple of 32. 
+ uLength += 256; + uLength += 32 - uLength%32; + + const unsigned uOldLength = DPM.uLength; + if (uOldLength > 0) + { + for (unsigned i = 0; i < uOldLength; ++i) + { + delete[] DPM.TraceBack[i]; + delete[] DPM.FreqsA[i]; + delete[] DPM.SortOrderA[i]; + } + for (unsigned n = 0; n < 20; ++n) + delete[] DPM.ScoreMxB[n]; + + delete[] DPM.MPrev; + delete[] DPM.MCurr; + delete[] DPM.MWork; + delete[] DPM.DPrev; + delete[] DPM.DCurr; + delete[] DPM.DWork; + delete[] DPM.uDeletePos; + delete[] DPM.GapOpenA; + delete[] DPM.GapOpenB; + delete[] DPM.GapCloseA; + delete[] DPM.GapCloseB; + delete[] DPM.SortOrderA; + delete[] DPM.FreqsA; + delete[] DPM.ScoreMxB; + delete[] DPM.TraceBack; + } + + DPM.uLength = uLength; + + DPM.GapOpenA = new SCORE[uLength]; + DPM.GapOpenB = new SCORE[uLength]; + DPM.GapCloseA = new SCORE[uLength]; + DPM.GapCloseB = new SCORE[uLength]; + + DPM.SortOrderA = new unsigned*[uLength]; + DPM.FreqsA = new FCOUNT*[uLength]; + DPM.ScoreMxB = new SCORE*[20]; + DPM.MPrev = new SCORE[uLength]; + DPM.MCurr = new SCORE[uLength]; + DPM.MWork = new SCORE[uLength]; + + DPM.DPrev = new SCORE[uLength]; + DPM.DCurr = new SCORE[uLength]; + DPM.DWork = new SCORE[uLength]; + DPM.uDeletePos = new unsigned[uLength]; + + DPM.TraceBack = new int*[uLength]; + + for (unsigned uLetter = 0; uLetter < 20; ++uLetter) + DPM.ScoreMxB[uLetter] = new SCORE[uLength]; + + for (unsigned i = 0; i < uLength; ++i) + { + DPM.SortOrderA[i] = new unsigned[20]; + DPM.FreqsA[i] = new FCOUNT[20]; + DPM.TraceBack[i] = new int[uLength]; + } + } + +SCORE GlobalAlignNS(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, PWPath &Path) + { + const unsigned uPrefixCountA = uLengthA + 1; + const unsigned uPrefixCountB = uLengthB + 1; + + AllocDPMem(uLengthA, uLengthB); + + SCORE *GapOpenA = DPM.GapOpenA; + SCORE *GapOpenB = DPM.GapOpenB; + SCORE *GapCloseA = DPM.GapCloseA; + SCORE *GapCloseB = DPM.GapCloseB; + + unsigned **SortOrderA = DPM.SortOrderA; + FCOUNT **FreqsA = 
DPM.FreqsA; + SCORE **ScoreMxB = DPM.ScoreMxB; + SCORE *MPrev = DPM.MPrev; + SCORE *MCurr = DPM.MCurr; + SCORE *MWork = DPM.MWork; + + SCORE *DPrev = DPM.DPrev; + SCORE *DCurr = DPM.DCurr; + SCORE *DWork = DPM.DWork; + unsigned *uDeletePos = DPM.uDeletePos; + + int **TraceBack = DPM.TraceBack; + + for (unsigned i = 0; i < uLengthA; ++i) + { + GapOpenA[i] = PA[i].m_scoreGapOpen; + GapCloseA[i] = PA[i].m_scoreGapClose; + + for (unsigned uLetter = 0; uLetter < 20; ++uLetter) + { + SortOrderA[i][uLetter] = PA[i].m_uSortOrder[uLetter]; + FreqsA[i][uLetter] = PA[i].m_fcCounts[uLetter]; + } + } + + for (unsigned j = 0; j < uLengthB; ++j) + { + GapOpenB[j] = PB[j].m_scoreGapOpen; + GapCloseB[j] = PB[j].m_scoreGapClose; + } + + for (unsigned uLetter = 0; uLetter < 20; ++uLetter) + { + for (unsigned j = 0; j < uLengthB; ++j) + ScoreMxB[uLetter][j] = PB[j].m_AAScores[uLetter]; + } + + for (unsigned i = 0; i < uPrefixCountA; ++i) + memset(TraceBack[i], 0, uPrefixCountB*sizeof(int)); + +// Special case for i=0 + unsigned **ptrSortOrderA = SortOrderA; + FCOUNT **ptrFreqsA = FreqsA; + assert(ptrSortOrderA == &(SortOrderA[0])); + assert(ptrFreqsA == &(FreqsA[0])); + TraceBack[0][0] = 0; + + SCORE scoreSum = 0; + unsigned *ptrSortOrderAi = SortOrderA[0]; + const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20; + FCOUNT *ptrFreqsAi = FreqsA[0]; + for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) + { + const unsigned uLetter = *ptrSortOrderAi; + const FCOUNT fcLetter = ptrFreqsAi[uLetter]; + if (0 == fcLetter) + break; + scoreSum += fcLetter*ScoreMxB[uLetter][0]; + } + MPrev[0] = scoreSum - g_scoreCenter; + +// D(0,0) is -infinity (requires I->D). + DPrev[0] = MINUS_INFINITY; + + for (unsigned j = 1; j < uLengthB; ++j) + { + // Only way to get M(0, j) looks like this: + // A ----X + // B XXXXX + // 0 j + // So gap-open at j=0, gap-close at j-1. 
+ SCORE scoreSum = 0; + unsigned *ptrSortOrderAi = SortOrderA[0]; + const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20; + FCOUNT *ptrFreqsAi = FreqsA[0]; + for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) + { + const unsigned uLetter = *ptrSortOrderAi; + const FCOUNT fcLetter = ptrFreqsAi[uLetter]; + if (0 == fcLetter) + break; + scoreSum += fcLetter*ScoreMxB[uLetter][j]; + } + MPrev[j] = scoreSum - g_scoreCenter + GapOpenB[0] + GapCloseB[j-1]; + TraceBack[0][j] = -(int) j; + + // Assume no D->I transitions, then can't be a delete if only + // one letter from A. + DPrev[j] = MINUS_INFINITY; + } + + SCORE IPrev_j_1; + for (unsigned i = 1; i < uLengthA; ++i) + { + ++ptrSortOrderA; + ++ptrFreqsA; + assert(ptrSortOrderA == &(SortOrderA[i])); + assert(ptrFreqsA == &(FreqsA[i])); + + SCORE *ptrMCurr_j = MCurr; + memset(ptrMCurr_j, 0, uLengthB*sizeof(SCORE)); + const FCOUNT *FreqsAi = *ptrFreqsA; + + const unsigned *SortOrderAi = *ptrSortOrderA; + const unsigned *ptrSortOrderAiEnd = SortOrderAi + 20; + const SCORE *ptrMCurrMax = MCurr + uLengthB; + for (const unsigned *ptrSortOrderAi = SortOrderAi; + ptrSortOrderAi != ptrSortOrderAiEnd; + ++ptrSortOrderAi) + { + const unsigned uLetter = *ptrSortOrderAi; + SCORE *NSBR_Letter = ScoreMxB[uLetter]; + const FCOUNT fcLetter = FreqsAi[uLetter]; + if (0 == fcLetter) + break; + SCORE *ptrNSBR = NSBR_Letter; + for (SCORE *ptrMCurr = MCurr; ptrMCurr != ptrMCurrMax; ++ptrMCurr) + *ptrMCurr += fcLetter*(*ptrNSBR++); + } + + for (unsigned j = 0; j < uLengthB; ++j) + MCurr[j] -= g_scoreCenter; + + ptrMCurr_j = MCurr; + unsigned *ptrDeletePos = uDeletePos; + + // Special case for j=0 + // Only way to get M(i, 0) looks like this: + // 0 i + // A XXXXX + // B ----X + // So gap-open at i=0, gap-close at i-1. 
+ assert(ptrMCurr_j == &(MCurr[0])); + *ptrMCurr_j += GapOpenA[0] + GapCloseA[i-1]; + + ++ptrMCurr_j; + + int *ptrTraceBack_ij = TraceBack[i]; + *ptrTraceBack_ij++ = (int) i; + + SCORE *ptrMPrev_j = MPrev; + SCORE *ptrDPrev = DPrev; + SCORE d = *ptrDPrev; + SCORE DNew = *ptrMPrev_j + GapOpenA[i]; + if (DNew > d) + { + d = DNew; + *ptrDeletePos = i; + } + + SCORE *ptrDCurr = DCurr; + + assert(ptrDCurr == &(DCurr[0])); + *ptrDCurr = d; + + // Can't have an insert if no letters from B + IPrev_j_1 = MINUS_INFINITY; + + unsigned uInsertPos = 0; // same defined default as GlobalAlignLE; never read uninitialized + const SCORE scoreGapOpenAi = GapOpenA[i]; + const SCORE scoreGapCloseAi_1 = GapCloseA[i-1]; + + for (unsigned j = 1; j < uLengthB; ++j) + { + // Here, MPrev_j is preserved from previous + // iteration so with current i,j is M[i-1][j-1] + SCORE MPrev_j = *ptrMPrev_j; + SCORE INew = MPrev_j + GapOpenB[j]; + if (INew > IPrev_j_1) + { + IPrev_j_1 = INew; + uInsertPos = j; + } + + SCORE scoreMax = MPrev_j; + + assert(ptrDPrev == &(DPrev[j-1])); + SCORE scoreD = *ptrDPrev++ + scoreGapCloseAi_1; + if (scoreD > scoreMax) + { + scoreMax = scoreD; + assert(ptrDeletePos == &(uDeletePos[j-1])); + *ptrTraceBack_ij = (int) i - (int) *ptrDeletePos; + assert(*ptrTraceBack_ij > 0); + } + ++ptrDeletePos; + + SCORE scoreI = IPrev_j_1 + GapCloseB[j-1]; + if (scoreI > scoreMax) + { + scoreMax = scoreI; + *ptrTraceBack_ij = (int) uInsertPos - (int) j; + assert(*ptrTraceBack_ij < 0); + } + + assert(ptrSortOrderA == &(SortOrderA[i])); + assert(ptrFreqsA == &(FreqsA[i])); + + *ptrMCurr_j += scoreMax; + assert(ptrMCurr_j == &(MCurr[j])); + ++ptrMCurr_j; + + MPrev_j = *(++ptrMPrev_j); + assert(ptrDPrev == &(DPrev[j])); + SCORE d = *ptrDPrev; + SCORE DNew = MPrev_j + scoreGapOpenAi; + if (DNew > d) + { + d = DNew; + assert(ptrDeletePos == &uDeletePos[j]); + *ptrDeletePos = i; + } + assert(ptrDCurr + 1 == &(DCurr[j])); + *(++ptrDCurr) = d; + + ++ptrTraceBack_ij; + } + + Rotate(MPrev, MCurr, MWork); + Rotate(DPrev, DCurr, DWork); + } + +// Special case for 
i=uLengthA + SCORE IPrev = MINUS_INFINITY; + + unsigned uInsertPos = 0; // defined even when the scan below never runs (uLengthB == 1) + + for (unsigned j = 1; j < uLengthB; ++j) + { + SCORE INew = MPrev[j-1] + GapOpenB[j]; + if (INew > IPrev) + { + uInsertPos = j; + IPrev = INew; + } + } + +// Special case for i=uLengthA, j=uLengthB + SCORE scoreMax = MPrev[uLengthB-1]; + int iTraceBack = 0; + + SCORE scoreD = DPrev[uLengthB-1] + GapCloseA[uLengthA-1]; + if (scoreD > scoreMax) + { + scoreMax = scoreD; + iTraceBack = (int) uLengthA - (int) uDeletePos[uLengthB-1]; + } + + SCORE scoreI = IPrev + GapCloseB[uLengthB-1]; + if (scoreI > scoreMax) + { + scoreMax = scoreI; + iTraceBack = (int) uInsertPos - (int) uLengthB; + } + + TraceBack[uLengthA][uLengthB] = iTraceBack; + + TraceBackToPath(TraceBack, uLengthA, uLengthB, Path); + + return scoreMax; + } diff --git a/src/muscle/muscle3.8.31/src/glbalignsimple.cpp b/src/muscle/muscle3.8.31/src/glbalignsimple.cpp new file mode 100644 index 0000000..7106755 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/glbalignsimple.cpp @@ -0,0 +1,368 @@ +#include "muscle.h" +#include <math.h> +#include "pwpath.h" +#include "profile.h" +#include <stdio.h> + +#define TRACE 0 + +#if 1 // SINGLE_AFFINE + +extern bool g_bKeepSimpleDP; +extern SCORE *g_DPM; +extern SCORE *g_DPD; +extern SCORE *g_DPI; +extern char *g_TBM; +extern char *g_TBD; +extern char *g_TBI; + +static const char *LocalScoreToStr(SCORE s) + { + static char str[16]; + if (s < -100000) + return " *"; + sprintf(str, "%6.1f", s); + return str; + } + +static void ListTB(const char *TBM_, const ProfPos *PA, const ProfPos *PB, + unsigned uPrefixCountA, unsigned uPrefixCountB) + { + Log(" "); + for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) + { + char c = ' '; + if (uPrefixLengthB > 0) + c = ConsensusChar(PB[uPrefixLengthB - 1]); + Log(" %4u:%c", uPrefixLengthB, c); + } + Log("\n"); + for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) + { + char c = ' '; + if (uPrefixLengthA > 0) + c = 
ConsensusChar(PA[uPrefixLengthA - 1]); + Log("%4u:%c ", uPrefixLengthA, c); + for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) + Log(" %6c", TBM(uPrefixLengthA, uPrefixLengthB)); + Log("\n"); + } + } + +static void ListDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB, + unsigned uPrefixCountA, unsigned uPrefixCountB) + { + Log(" "); + for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) + { + char c = ' '; + if (uPrefixLengthB > 0) + c = ConsensusChar(PB[uPrefixLengthB - 1]); + Log(" %4u:%c", uPrefixLengthB, c); + } + Log("\n"); + for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) + { + char c = ' '; + if (uPrefixLengthA > 0) + c = ConsensusChar(PA[uPrefixLengthA - 1]); + Log("%4u:%c ", uPrefixLengthA, c); + for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) + Log(" %s", LocalScoreToStr(DPM(uPrefixLengthA, uPrefixLengthB))); + Log("\n"); + } + } + +SCORE GlobalAlignSimple(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, PWPath &Path) + { + assert(uLengthB > 0 && uLengthA > 0); + + SetTermGaps(PA, uLengthA); + SetTermGaps(PB, uLengthB); + + const unsigned uPrefixCountA = uLengthA + 1; + const unsigned uPrefixCountB = uLengthB + 1; + +// Allocate DP matrices + const size_t LM = uPrefixCountA*uPrefixCountB; + SCORE *DPL_ = new SCORE[LM]; + SCORE *DPM_ = new SCORE[LM]; + SCORE *DPD_ = new SCORE[LM]; + SCORE *DPI_ = new SCORE[LM]; + + char *TBM_ = new char[LM]; + char *TBD_ = new char[LM]; + char *TBI_ = new char[LM]; + + memset(TBM_, '?', LM); + memset(TBD_, '?', LM); + memset(TBI_, '?', LM); + + DPM(0, 0) = 0; + DPD(0, 0) = MINUS_INFINITY; + DPI(0, 0) = MINUS_INFINITY; + + DPM(1, 0) = MINUS_INFINITY; + DPD(1, 0) = PA[0].m_scoreGapOpen; + TBD(1, 0) = 'D'; + DPI(1, 0) = MINUS_INFINITY; + + DPM(0, 1) = MINUS_INFINITY; + DPD(0, 1) = MINUS_INFINITY; + DPI(0, 1) = PB[0].m_scoreGapOpen; + TBI(0, 1) = 'I'; + 
+// Empty prefix of B is special case + for (unsigned uPrefixLengthA = 2; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) + { + // M=LetterA+LetterB, impossible with empty prefix + DPM(uPrefixLengthA, 0) = MINUS_INFINITY; + + // D=LetterA+GapB + DPD(uPrefixLengthA, 0) = DPD(uPrefixLengthA - 1, 0) + g_scoreGapExtend; + TBD(uPrefixLengthA, 0) = 'D'; + + // I=GapA+LetterB, impossible with empty prefix + DPI(uPrefixLengthA, 0) = MINUS_INFINITY; + } + +// Empty prefix of A is special case + for (unsigned uPrefixLengthB = 2; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) + { + // M=LetterA+LetterB, impossible with empty prefix + DPM(0, uPrefixLengthB) = MINUS_INFINITY; + + // D=LetterA+GapB, impossible with empty prefix + DPD(0, uPrefixLengthB) = MINUS_INFINITY; + + // I=GapA+LetterB + DPI(0, uPrefixLengthB) = DPI(0, uPrefixLengthB - 1) + g_scoreGapExtend; + TBI(0, uPrefixLengthB) = 'I'; + } + +// Special case to agree with NWFast, no D-I transitions so... + DPD(uLengthA, 0) = MINUS_INFINITY; +// DPI(0, uLengthB) = MINUS_INFINITY; + +// ============ +// Main DP loop +// ============ + SCORE scoreGapCloseB = MINUS_INFINITY; + for (unsigned uPrefixLengthB = 1; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) + { + const ProfPos &PPB = PB[uPrefixLengthB - 1]; + + SCORE scoreGapCloseA = MINUS_INFINITY; + for (unsigned uPrefixLengthA = 1; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) + { + const ProfPos &PPA = PA[uPrefixLengthA - 1]; + + { + // Match M=LetterA+LetterB + SCORE scoreLL = ScoreProfPos2(PPA, PPB); + DPL(uPrefixLengthA, uPrefixLengthB) = scoreLL; + + SCORE scoreMM = DPM(uPrefixLengthA-1, uPrefixLengthB-1); + SCORE scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseA; + SCORE scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseB; + + SCORE scoreBest; + if (scoreMM >= scoreDM && scoreMM >= scoreIM) + { + scoreBest = scoreMM; + TBM(uPrefixLengthA, uPrefixLengthB) = 'M'; + } + else if (scoreDM >= scoreMM && scoreDM >= scoreIM) + { 
+ scoreBest = scoreDM; + TBM(uPrefixLengthA, uPrefixLengthB) = 'D'; + } + else + { + assert(scoreIM >= scoreMM && scoreIM >= scoreDM); + scoreBest = scoreIM; + TBM(uPrefixLengthA, uPrefixLengthB) = 'I'; + } + DPM(uPrefixLengthA, uPrefixLengthB) = scoreBest + scoreLL; + } + + { + // Delete D=LetterA+GapB + SCORE scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) + + PA[uPrefixLengthA-1].m_scoreGapOpen; + SCORE scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB) + g_scoreGapExtend; + + SCORE scoreBest; + if (scoreMD >= scoreDD) + { + scoreBest = scoreMD; + TBD(uPrefixLengthA, uPrefixLengthB) = 'M'; + } + else + { + assert(scoreDD >= scoreMD); + scoreBest = scoreDD; + TBD(uPrefixLengthA, uPrefixLengthB) = 'D'; + } + DPD(uPrefixLengthA, uPrefixLengthB) = scoreBest; + } + + // Insert I=GapA+LetterB + { + SCORE scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) + + PB[uPrefixLengthB - 1].m_scoreGapOpen; + SCORE scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1) + g_scoreGapExtend; + + SCORE scoreBest; + if (scoreMI >= scoreII) + { + scoreBest = scoreMI; + TBI(uPrefixLengthA, uPrefixLengthB) = 'M'; + } + else + { + assert(scoreII > scoreMI); + scoreBest = scoreII; + TBI(uPrefixLengthA, uPrefixLengthB) = 'I'; + } + DPI(uPrefixLengthA, uPrefixLengthB) = scoreBest; + } + + scoreGapCloseA = PPA.m_scoreGapClose; + } + scoreGapCloseB = PPB.m_scoreGapClose; + } + +#if TRACE + Log("\n"); + Log("Simple DPL:\n"); + ListDP(DPL_, PA, PB, uPrefixCountA, uPrefixCountB); + Log("\n"); + Log("Simple DPM:\n"); + ListDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB); + Log("\n"); + Log("Simple DPD:\n"); + ListDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB); + Log("\n"); + Log("Simple DPI:\n"); + ListDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB); + Log("\n"); + Log("Simple TBM:\n"); + ListTB(TBM_, PA, PB, uPrefixCountA, uPrefixCountB); + Log("\n"); + Log("Simple TBD:\n"); + ListTB(TBD_, PA, PB, uPrefixCountA, uPrefixCountB); + Log("\n"); + Log("Simple TBI:\n"); + ListTB(TBI_, PA, PB, uPrefixCountA, 
uPrefixCountB); +#endif + +// Trace-back +// ========== + Path.Clear(); + +// Find last edge + SCORE M = DPM(uLengthA, uLengthB); + SCORE D = DPD(uLengthA, uLengthB) + PA[uLengthA-1].m_scoreGapClose; + SCORE I = DPI(uLengthA, uLengthB) + PB[uLengthB-1].m_scoreGapClose; + char cEdgeType = '?'; + + SCORE BestScore = MINUS_INFINITY; + if (M >= D && M >= I) + { + cEdgeType = 'M'; + BestScore = M; + } + else if (D >= M && D >= I) + { + cEdgeType = 'D'; + BestScore = D; + } + else + { + assert(I >= M && I >= D); + cEdgeType = 'I'; + BestScore = I; + } + +#if TRACE + Log("Simple: MAB=%.4g DAB=%.4g IAB=%.4g best=%c\n", M, D, I, cEdgeType); +#endif + + unsigned PLA = uLengthA; + unsigned PLB = uLengthB; + for (;;) + { + PWEdge Edge; + Edge.cType = cEdgeType; + Edge.uPrefixLengthA = PLA; + Edge.uPrefixLengthB = PLB; +#if TRACE + Log("Prepend %c%d.%d\n", Edge.cType, PLA, PLB); +#endif + Path.PrependEdge(Edge); + + switch (cEdgeType) + { + case 'M': + assert(PLA > 0); + assert(PLB > 0); + cEdgeType = TBM(PLA, PLB); + --PLA; + --PLB; + break; + + case 'D': + assert(PLA > 0); + cEdgeType = TBD(PLA, PLB); + --PLA; + break; + + case 'I': + assert(PLB > 0); + cEdgeType = TBI(PLA, PLB); + --PLB; + break; + + default: + Quit("Invalid edge %c", cEdgeType); + } + if (0 == PLA && 0 == PLB) + break; + } + Path.Validate(); + +// SCORE Score = TraceBack(PA, uLengthA, PB, uLengthB, DPM_, DPD_, DPI_, Path); + +#if TRACE + SCORE scorePath = FastScorePath2(PA, uLengthA, PB, uLengthB, Path); + Path.LogMe(); + Log("Score = %s Path = %s\n", LocalScoreToStr(BestScore), LocalScoreToStr(scorePath)); +#endif + + if (g_bKeepSimpleDP) + { + g_DPM = DPM_; + g_DPD = DPD_; + g_DPI = DPI_; + + g_TBM = TBM_; + g_TBD = TBD_; + g_TBI = TBI_; + } + else + { + delete[] DPM_; + delete[] DPD_; + delete[] DPI_; + + delete[] TBM_; + delete[] TBD_; + delete[] TBI_; + } + delete[] DPL_; // letter-score scratch matrix is never published via a g_* global, so free it on both paths + return BestScore; + } + +#endif // SINGLE_AFFINE diff --git a/src/muscle/muscle3.8.31/src/glbalignsp.cpp 
b/src/muscle/muscle3.8.31/src/glbalignsp.cpp new file mode 100644 index 0000000..af652cc --- /dev/null +++ b/src/muscle/muscle3.8.31/src/glbalignsp.cpp @@ -0,0 +1,374 @@ +#include "muscle.h" +#include "profile.h" +#include "pwpath.h" + +struct DP_MEMORY + { + unsigned uLength; + SCORE *GapOpenA; + SCORE *GapOpenB; + SCORE *GapCloseA; + SCORE *GapCloseB; + SCORE *MPrev; + SCORE *MCurr; + SCORE *MWork; + SCORE *DPrev; + SCORE *DCurr; + SCORE *DWork; + SCORE **ScoreMxB; + unsigned **SortOrderA; + unsigned *uDeletePos; + FCOUNT **FreqsA; + int **TraceBack; + }; + +static struct DP_MEMORY DPM; + +static void AllocDPMem(unsigned uLengthA, unsigned uLengthB) + { +// Max prefix length + unsigned uLength = (uLengthA > uLengthB ? uLengthA : uLengthB) + 1; + if (uLength < DPM.uLength) + return; + +// Add 256 to allow for future expansion and +// round up to next multiple of 32. + uLength += 256; + uLength += 32 - uLength%32; + + const unsigned uOldLength = DPM.uLength; + if (uOldLength > 0) + { + for (unsigned i = 0; i < uOldLength; ++i) + { + delete[] DPM.TraceBack[i]; + delete[] DPM.FreqsA[i]; + delete[] DPM.SortOrderA[i]; + } + for (unsigned n = 0; n < 20; ++n) + delete[] DPM.ScoreMxB[n]; + + delete[] DPM.MPrev; + delete[] DPM.MCurr; + delete[] DPM.MWork; + delete[] DPM.DPrev; + delete[] DPM.DCurr; + delete[] DPM.DWork; + delete[] DPM.uDeletePos; + delete[] DPM.GapOpenA; + delete[] DPM.GapOpenB; + delete[] DPM.GapCloseA; + delete[] DPM.GapCloseB; + delete[] DPM.SortOrderA; + delete[] DPM.FreqsA; + delete[] DPM.ScoreMxB; + delete[] DPM.TraceBack; + } + + DPM.uLength = uLength; + + DPM.GapOpenA = new SCORE[uLength]; + DPM.GapOpenB = new SCORE[uLength]; + DPM.GapCloseA = new SCORE[uLength]; + DPM.GapCloseB = new SCORE[uLength]; + + DPM.SortOrderA = new unsigned*[uLength]; + DPM.FreqsA = new FCOUNT*[uLength]; + DPM.ScoreMxB = new SCORE*[20]; + DPM.MPrev = new SCORE[uLength]; + DPM.MCurr = new SCORE[uLength]; + DPM.MWork = new SCORE[uLength]; + + DPM.DPrev = new SCORE[uLength]; 
+ DPM.DCurr = new SCORE[uLength]; + DPM.DWork = new SCORE[uLength]; + DPM.uDeletePos = new unsigned[uLength]; + + DPM.TraceBack = new int*[uLength]; + + for (unsigned uLetter = 0; uLetter < 20; ++uLetter) + DPM.ScoreMxB[uLetter] = new SCORE[uLength]; + + for (unsigned i = 0; i < uLength; ++i) + { + DPM.SortOrderA[i] = new unsigned[20]; + DPM.FreqsA[i] = new FCOUNT[20]; + DPM.TraceBack[i] = new int[uLength]; + } + } + +SCORE GlobalAlignSP(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, PWPath &Path) + { + const unsigned uPrefixCountA = uLengthA + 1; + const unsigned uPrefixCountB = uLengthB + 1; + + AllocDPMem(uLengthA, uLengthB); + + SCORE *GapOpenA = DPM.GapOpenA; + SCORE *GapOpenB = DPM.GapOpenB; + SCORE *GapCloseA = DPM.GapCloseA; + SCORE *GapCloseB = DPM.GapCloseB; + + unsigned **SortOrderA = DPM.SortOrderA; + FCOUNT **FreqsA = DPM.FreqsA; + SCORE **ScoreMxB = DPM.ScoreMxB; + SCORE *MPrev = DPM.MPrev; + SCORE *MCurr = DPM.MCurr; + SCORE *MWork = DPM.MWork; + + SCORE *DPrev = DPM.DPrev; + SCORE *DCurr = DPM.DCurr; + SCORE *DWork = DPM.DWork; + unsigned *uDeletePos = DPM.uDeletePos; + + int **TraceBack = DPM.TraceBack; + + for (unsigned i = 0; i < uLengthA; ++i) + { + GapOpenA[i] = PA[i].m_scoreGapOpen; + GapCloseA[i] = PA[i].m_scoreGapClose; + + for (unsigned uLetter = 0; uLetter < 20; ++uLetter) + { + SortOrderA[i][uLetter] = PA[i].m_uSortOrder[uLetter]; + FreqsA[i][uLetter] = PA[i].m_fcCounts[uLetter]; + } + } + + for (unsigned j = 0; j < uLengthB; ++j) + { + GapOpenB[j] = PB[j].m_scoreGapOpen; + GapCloseB[j] = PB[j].m_scoreGapClose; + } + + for (unsigned uLetter = 0; uLetter < 20; ++uLetter) + { + for (unsigned j = 0; j < uLengthB; ++j) + ScoreMxB[uLetter][j] = PB[j].m_AAScores[uLetter]; + } + + for (unsigned i = 0; i < uPrefixCountA; ++i) + memset(TraceBack[i], 0, uPrefixCountB*sizeof(int)); + +// Special case for i=0 + unsigned **ptrSortOrderA = SortOrderA; + FCOUNT **ptrFreqsA = FreqsA; + assert(ptrSortOrderA == 
&(SortOrderA[0])); + assert(ptrFreqsA == &(FreqsA[0])); + TraceBack[0][0] = 0; + + SCORE scoreSum = 0; + unsigned *ptrSortOrderAi = SortOrderA[0]; + const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20; + FCOUNT *ptrFreqsAi = FreqsA[0]; + for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) + { + const unsigned uLetter = *ptrSortOrderAi; + const FCOUNT fcLetter = ptrFreqsAi[uLetter]; + if (0 == fcLetter) + break; + scoreSum += fcLetter*ScoreMxB[uLetter][0]; + } + MPrev[0] = scoreSum - g_scoreCenter; + +// D(0,0) is -infinity (requires I->D). + DPrev[0] = MINUS_INFINITY; + + for (unsigned j = 1; j < uLengthB; ++j) + { + // Only way to get M(0, j) looks like this: + // A ----X + // B XXXXX + // 0 j + // So gap-open at j=0, gap-close at j-1. + SCORE scoreSum = 0; + unsigned *ptrSortOrderAi = SortOrderA[0]; + const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20; + FCOUNT *ptrFreqsAi = FreqsA[0]; + for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) + { + const unsigned uLetter = *ptrSortOrderAi; + const FCOUNT fcLetter = ptrFreqsAi[uLetter]; + if (0 == fcLetter) + break; + scoreSum += fcLetter*ScoreMxB[uLetter][j]; + } + MPrev[j] = scoreSum - g_scoreCenter + GapOpenB[0] + GapCloseB[j-1]; + TraceBack[0][j] = -(int) j; + + // Assume no D->I transitions, then can't be a delete if only + // one letter from A. 
+ DPrev[j] = MINUS_INFINITY; + } + + SCORE IPrev_j_1; + for (unsigned i = 1; i < uLengthA; ++i) + { + ++ptrSortOrderA; + ++ptrFreqsA; + assert(ptrSortOrderA == &(SortOrderA[i])); + assert(ptrFreqsA == &(FreqsA[i])); + + SCORE *ptrMCurr_j = MCurr; + memset(ptrMCurr_j, 0, uLengthB*sizeof(SCORE)); + const FCOUNT *FreqsAi = *ptrFreqsA; + + const unsigned *SortOrderAi = *ptrSortOrderA; + const unsigned *ptrSortOrderAiEnd = SortOrderAi + 20; + const SCORE *ptrMCurrMax = MCurr + uLengthB; + for (const unsigned *ptrSortOrderAi = SortOrderAi; + ptrSortOrderAi != ptrSortOrderAiEnd; + ++ptrSortOrderAi) + { + const unsigned uLetter = *ptrSortOrderAi; + SCORE *NSBR_Letter = ScoreMxB[uLetter]; + const FCOUNT fcLetter = FreqsAi[uLetter]; + if (0 == fcLetter) + break; + SCORE *ptrNSBR = NSBR_Letter; + for (SCORE *ptrMCurr = MCurr; ptrMCurr != ptrMCurrMax; ++ptrMCurr) + *ptrMCurr += fcLetter*(*ptrNSBR++); + } + + for (unsigned j = 0; j < uLengthB; ++j) + MCurr[j] -= g_scoreCenter; + + ptrMCurr_j = MCurr; + unsigned *ptrDeletePos = uDeletePos; + + // Special case for j=0 + // Only way to get M(i, 0) looks like this: + // 0 i + // A XXXXX + // B ----X + // So gap-open at i=0, gap-close at i-1. 
+ assert(ptrMCurr_j == &(MCurr[0])); + *ptrMCurr_j += GapOpenA[0] + GapCloseA[i-1]; + + ++ptrMCurr_j; + + int *ptrTraceBack_ij = TraceBack[i]; + *ptrTraceBack_ij++ = (int) i; + + SCORE *ptrMPrev_j = MPrev; + SCORE *ptrDPrev = DPrev; + SCORE d = *ptrDPrev; + SCORE DNew = *ptrMPrev_j + GapOpenA[i]; + if (DNew > d) + { + d = DNew; + *ptrDeletePos = i; + } + + SCORE *ptrDCurr = DCurr; + + assert(ptrDCurr == &(DCurr[0])); + *ptrDCurr = d; + + // Can't have an insert if no letters from B + IPrev_j_1 = MINUS_INFINITY; + + unsigned uInsertPos; + const SCORE scoreGapOpenAi = GapOpenA[i]; + const SCORE scoreGapCloseAi_1 = GapCloseA[i-1]; + + for (unsigned j = 1; j < uLengthB; ++j) + { + // Here, MPrev_j is preserved from previous + // iteration so with current i,j is M[i-1][j-1] + SCORE MPrev_j = *ptrMPrev_j; + SCORE INew = MPrev_j + GapOpenB[j]; + if (INew > IPrev_j_1) + { + IPrev_j_1 = INew; + uInsertPos = j; + } + + SCORE scoreMax = MPrev_j; + + assert(ptrDPrev == &(DPrev[j-1])); + SCORE scoreD = *ptrDPrev++ + scoreGapCloseAi_1; + if (scoreD > scoreMax) + { + scoreMax = scoreD; + assert(ptrDeletePos == &(uDeletePos[j-1])); + *ptrTraceBack_ij = (int) i - (int) *ptrDeletePos; + assert(*ptrTraceBack_ij > 0); + } + ++ptrDeletePos; + + SCORE scoreI = IPrev_j_1 + GapCloseB[j-1]; + if (scoreI > scoreMax) + { + scoreMax = scoreI; + *ptrTraceBack_ij = (int) uInsertPos - (int) j; + assert(*ptrTraceBack_ij < 0); + } + + assert(ptrSortOrderA == &(SortOrderA[i])); + assert(ptrFreqsA == &(FreqsA[i])); + + *ptrMCurr_j += scoreMax; + assert(ptrMCurr_j == &(MCurr[j])); + ++ptrMCurr_j; + + MPrev_j = *(++ptrMPrev_j); + assert(ptrDPrev == &(DPrev[j])); + SCORE d = *ptrDPrev; + SCORE DNew = MPrev_j + scoreGapOpenAi; + if (DNew > d) + { + d = DNew; + assert(ptrDeletePos == &uDeletePos[j]); + *ptrDeletePos = i; + } + assert(ptrDCurr + 1 == &(DCurr[j])); + *(++ptrDCurr) = d; + + ++ptrTraceBack_ij; + } + + Rotate(MPrev, MCurr, MWork); + Rotate(DPrev, DCurr, DWork); + } + +// Special case for 
i=uLengthA + SCORE IPrev = MINUS_INFINITY; + + unsigned uInsertPos; + + for (unsigned j = 1; j < uLengthB; ++j) + { + SCORE INew = MPrev[j-1] + GapOpenB[j]; + if (INew > IPrev) + { + uInsertPos = j; + IPrev = INew; + } + } + +// Special case for i=uLengthA, j=uLengthB + SCORE scoreMax = MPrev[uLengthB-1]; + int iTraceBack = 0; + + SCORE scoreD = DPrev[uLengthB-1] + GapCloseA[uLengthA-1]; + if (scoreD > scoreMax) + { + scoreMax = scoreD; + iTraceBack = (int) uLengthA - (int) uDeletePos[uLengthB-1]; + } + + SCORE scoreI = IPrev + GapCloseB[uLengthB-1]; + if (scoreI > scoreMax) + { + scoreMax = scoreI; + iTraceBack = (int) uInsertPos - (int) uLengthB; + } + + TraceBack[uLengthA][uLengthB] = iTraceBack; + + TraceBackToPath(TraceBack, uLengthA, uLengthB, Path); + + return scoreMax; + } diff --git a/src/muscle/muscle3.8.31/src/glbalignspn.cpp b/src/muscle/muscle3.8.31/src/glbalignspn.cpp new file mode 100644 index 0000000..0ddc727 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/glbalignspn.cpp @@ -0,0 +1,409 @@ +#include "muscle.h" +#include "profile.h" +#include "pwpath.h" + +struct DP_MEMORY + { + unsigned uLength; + SCORE *GapOpenA; + SCORE *GapOpenB; + SCORE *GapCloseA; + SCORE *GapCloseB; + SCORE *MPrev; + SCORE *MCurr; + SCORE *MWork; + SCORE *DPrev; + SCORE *DCurr; + SCORE *DWork; + SCORE **ScoreMxB; + unsigned **SortOrderA; + unsigned *uDeletePos; + FCOUNT **FreqsA; + int **TraceBack; + }; + +static struct DP_MEMORY DPM; + +void FreeDPMemSPN() + { + const unsigned uOldLength = DPM.uLength; + if (0 == uOldLength) + return; + + for (unsigned i = 0; i < uOldLength; ++i) + { + delete[] DPM.TraceBack[i]; + delete[] DPM.FreqsA[i]; + delete[] DPM.SortOrderA[i]; + } + for (unsigned n = 0; n < 4; ++n) + delete[] DPM.ScoreMxB[n]; + + delete[] DPM.MPrev; + delete[] DPM.MCurr; + delete[] DPM.MWork; + delete[] DPM.DPrev; + delete[] DPM.DCurr; + delete[] DPM.DWork; + delete[] DPM.uDeletePos; + delete[] DPM.GapOpenA; + delete[] DPM.GapOpenB; + delete[] DPM.GapCloseA; + delete[] 
DPM.GapCloseB; + delete[] DPM.SortOrderA; + delete[] DPM.FreqsA; + delete[] DPM.ScoreMxB; + delete[] DPM.TraceBack; + } + +static void AllocDPMem(unsigned uLengthA, unsigned uLengthB) + { +// Max prefix length + unsigned uLength = (uLengthA > uLengthB ? uLengthA : uLengthB) + 1; + if (uLength < DPM.uLength) + return; + +// Add 256 to allow for future expansion and +// round up to next multiple of 32. + uLength += 256; + uLength += 32 - uLength%32; + + const unsigned uOldLength = DPM.uLength; + if (uOldLength > 0) + { + for (unsigned i = 0; i < uOldLength; ++i) + { + delete[] DPM.TraceBack[i]; + delete[] DPM.FreqsA[i]; + delete[] DPM.SortOrderA[i]; + } + for (unsigned n = 0; n < 4; ++n) + delete[] DPM.ScoreMxB[n]; + + delete[] DPM.MPrev; + delete[] DPM.MCurr; + delete[] DPM.MWork; + delete[] DPM.DPrev; + delete[] DPM.DCurr; + delete[] DPM.DWork; + delete[] DPM.uDeletePos; + delete[] DPM.GapOpenA; + delete[] DPM.GapOpenB; + delete[] DPM.GapCloseA; + delete[] DPM.GapCloseB; + delete[] DPM.SortOrderA; + delete[] DPM.FreqsA; + delete[] DPM.ScoreMxB; + delete[] DPM.TraceBack; + } + + DPM.uLength = uLength; + + DPM.GapOpenA = new SCORE[uLength]; + DPM.GapOpenB = new SCORE[uLength]; + DPM.GapCloseA = new SCORE[uLength]; + DPM.GapCloseB = new SCORE[uLength]; + + DPM.SortOrderA = new unsigned*[uLength]; + DPM.FreqsA = new FCOUNT*[uLength]; + DPM.ScoreMxB = new SCORE*[4]; + DPM.MPrev = new SCORE[uLength]; + DPM.MCurr = new SCORE[uLength]; + DPM.MWork = new SCORE[uLength]; + + DPM.DPrev = new SCORE[uLength]; + DPM.DCurr = new SCORE[uLength]; + DPM.DWork = new SCORE[uLength]; + DPM.uDeletePos = new unsigned[uLength]; + + DPM.TraceBack = new int*[uLength]; + + for (unsigned uLetter = 0; uLetter < 4; ++uLetter) + DPM.ScoreMxB[uLetter] = new SCORE[uLength]; + + for (unsigned i = 0; i < uLength; ++i) + { + DPM.SortOrderA[i] = new unsigned[4]; + DPM.FreqsA[i] = new FCOUNT[4]; + DPM.TraceBack[i] = new int[uLength]; + } + } + +SCORE GlobalAlignSPN(const ProfPos *PA, unsigned uLengthA, 
const ProfPos *PB, + unsigned uLengthB, PWPath &Path) + { + if (ALPHA_DNA != g_Alpha || ALPHA_RNA == g_Alpha) + Quit("GlobalAlignSPN: must be nucleo"); + + const unsigned uPrefixCountA = uLengthA + 1; + const unsigned uPrefixCountB = uLengthB + 1; + + AllocDPMem(uLengthA, uLengthB); + + SCORE *GapOpenA = DPM.GapOpenA; + SCORE *GapOpenB = DPM.GapOpenB; + SCORE *GapCloseA = DPM.GapCloseA; + SCORE *GapCloseB = DPM.GapCloseB; + + unsigned **SortOrderA = DPM.SortOrderA; + FCOUNT **FreqsA = DPM.FreqsA; + SCORE **ScoreMxB = DPM.ScoreMxB; + SCORE *MPrev = DPM.MPrev; + SCORE *MCurr = DPM.MCurr; + SCORE *MWork = DPM.MWork; + + SCORE *DPrev = DPM.DPrev; + SCORE *DCurr = DPM.DCurr; + SCORE *DWork = DPM.DWork; + unsigned *uDeletePos = DPM.uDeletePos; + + int **TraceBack = DPM.TraceBack; + + for (unsigned i = 0; i < uLengthA; ++i) + { + GapOpenA[i] = PA[i].m_scoreGapOpen; + GapCloseA[i] = PA[i].m_scoreGapClose; + + for (unsigned uLetter = 0; uLetter < 4; ++uLetter) + { + SortOrderA[i][uLetter] = PA[i].m_uSortOrder[uLetter]; + FreqsA[i][uLetter] = PA[i].m_fcCounts[uLetter]; + } + } + + for (unsigned j = 0; j < uLengthB; ++j) + { + GapOpenB[j] = PB[j].m_scoreGapOpen; + GapCloseB[j] = PB[j].m_scoreGapClose; + } + + for (unsigned uLetter = 0; uLetter < 4; ++uLetter) + { + for (unsigned j = 0; j < uLengthB; ++j) + ScoreMxB[uLetter][j] = PB[j].m_AAScores[uLetter]; + } + + for (unsigned i = 0; i < uPrefixCountA; ++i) + memset(TraceBack[i], 0, uPrefixCountB*sizeof(int)); + +// Special case for i=0 + unsigned **ptrSortOrderA = SortOrderA; + FCOUNT **ptrFreqsA = FreqsA; + assert(ptrSortOrderA == &(SortOrderA[0])); + assert(ptrFreqsA == &(FreqsA[0])); + TraceBack[0][0] = 0; + + SCORE scoreSum = 0; + unsigned *ptrSortOrderAi = SortOrderA[0]; + const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 4; + FCOUNT *ptrFreqsAi = FreqsA[0]; + for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) + { + const unsigned uLetter = *ptrSortOrderAi; + const FCOUNT fcLetter = ptrFreqsAi[uLetter]; + if 
(0 == fcLetter) + break; + scoreSum += fcLetter*ScoreMxB[uLetter][0]; + } + MPrev[0] = scoreSum - g_scoreCenter; + +// D(0,0) is -infinity (requires I->D). + DPrev[0] = MINUS_INFINITY; + + for (unsigned j = 1; j < uLengthB; ++j) + { + // Only way to get M(0, j) looks like this: + // A ----X + // B XXXXX + // 0 j + // So gap-open at j=0, gap-close at j-1. + SCORE scoreSum = 0; + unsigned *ptrSortOrderAi = SortOrderA[0]; + const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 4; + FCOUNT *ptrFreqsAi = FreqsA[0]; + for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) + { + const unsigned uLetter = *ptrSortOrderAi; + const FCOUNT fcLetter = ptrFreqsAi[uLetter]; + if (0 == fcLetter) + break; + scoreSum += fcLetter*ScoreMxB[uLetter][j]; + } + MPrev[j] = scoreSum - g_scoreCenter + GapOpenB[0] + GapCloseB[j-1]; + TraceBack[0][j] = -(int) j; + + // Assume no D->I transitions, then can't be a delete if only + // one letter from A. + DPrev[j] = MINUS_INFINITY; + } + + SCORE IPrev_j_1; + for (unsigned i = 1; i < uLengthA; ++i) + { + ++ptrSortOrderA; + ++ptrFreqsA; + assert(ptrSortOrderA == &(SortOrderA[i])); + assert(ptrFreqsA == &(FreqsA[i])); + + SCORE *ptrMCurr_j = MCurr; + memset(ptrMCurr_j, 0, uLengthB*sizeof(SCORE)); + const FCOUNT *FreqsAi = *ptrFreqsA; + + const unsigned *SortOrderAi = *ptrSortOrderA; + const unsigned *ptrSortOrderAiEnd = SortOrderAi + 4; + const SCORE *ptrMCurrMax = MCurr + uLengthB; + for (const unsigned *ptrSortOrderAi = SortOrderAi; + ptrSortOrderAi != ptrSortOrderAiEnd; + ++ptrSortOrderAi) + { + const unsigned uLetter = *ptrSortOrderAi; + SCORE *NSBR_Letter = ScoreMxB[uLetter]; + const FCOUNT fcLetter = FreqsAi[uLetter]; + if (0 == fcLetter) + break; + SCORE *ptrNSBR = NSBR_Letter; + for (SCORE *ptrMCurr = MCurr; ptrMCurr != ptrMCurrMax; ++ptrMCurr) + *ptrMCurr += fcLetter*(*ptrNSBR++); + } + + for (unsigned j = 0; j < uLengthB; ++j) + MCurr[j] -= g_scoreCenter; + + ptrMCurr_j = MCurr; + unsigned *ptrDeletePos = uDeletePos; + + // Special 
case for j=0 + // Only way to get M(i, 0) looks like this: + // 0 i + // A XXXXX + // B ----X + // So gap-open at i=0, gap-close at i-1. + assert(ptrMCurr_j == &(MCurr[0])); + *ptrMCurr_j += GapOpenA[0] + GapCloseA[i-1]; + + ++ptrMCurr_j; + + int *ptrTraceBack_ij = TraceBack[i]; + *ptrTraceBack_ij++ = (int) i; + + SCORE *ptrMPrev_j = MPrev; + SCORE *ptrDPrev = DPrev; + SCORE d = *ptrDPrev; + SCORE DNew = *ptrMPrev_j + GapOpenA[i]; + if (DNew > d) + { + d = DNew; + *ptrDeletePos = i; + } + + SCORE *ptrDCurr = DCurr; + + assert(ptrDCurr == &(DCurr[0])); + *ptrDCurr = d; + + // Can't have an insert if no letters from B + IPrev_j_1 = MINUS_INFINITY; + + unsigned uInsertPos; + const SCORE scoreGapOpenAi = GapOpenA[i]; + const SCORE scoreGapCloseAi_1 = GapCloseA[i-1]; + + for (unsigned j = 1; j < uLengthB; ++j) + { + // Here, MPrev_j is preserved from previous + // iteration so with current i,j is M[i-1][j-1] + SCORE MPrev_j = *ptrMPrev_j; + SCORE INew = MPrev_j + GapOpenB[j]; + if (INew > IPrev_j_1) + { + IPrev_j_1 = INew; + uInsertPos = j; + } + + SCORE scoreMax = MPrev_j; + + assert(ptrDPrev == &(DPrev[j-1])); + SCORE scoreD = *ptrDPrev++ + scoreGapCloseAi_1; + if (scoreD > scoreMax) + { + scoreMax = scoreD; + assert(ptrDeletePos == &(uDeletePos[j-1])); + *ptrTraceBack_ij = (int) i - (int) *ptrDeletePos; + assert(*ptrTraceBack_ij > 0); + } + ++ptrDeletePos; + + SCORE scoreI = IPrev_j_1 + GapCloseB[j-1]; + if (scoreI > scoreMax) + { + scoreMax = scoreI; + *ptrTraceBack_ij = (int) uInsertPos - (int) j; + assert(*ptrTraceBack_ij < 0); + } + + assert(ptrSortOrderA == &(SortOrderA[i])); + assert(ptrFreqsA == &(FreqsA[i])); + + *ptrMCurr_j += scoreMax; + assert(ptrMCurr_j == &(MCurr[j])); + ++ptrMCurr_j; + + MPrev_j = *(++ptrMPrev_j); + assert(ptrDPrev == &(DPrev[j])); + SCORE d = *ptrDPrev; + SCORE DNew = MPrev_j + scoreGapOpenAi; + if (DNew > d) + { + d = DNew; + assert(ptrDeletePos == &uDeletePos[j]); + *ptrDeletePos = i; + } + assert(ptrDCurr + 1 == &(DCurr[j])); + 
*(++ptrDCurr) = d; + + ++ptrTraceBack_ij; + } + + Rotate(MPrev, MCurr, MWork); + Rotate(DPrev, DCurr, DWork); + } + +// Special case for i=uLengthA + SCORE IPrev = MINUS_INFINITY; + + unsigned uInsertPos; + + for (unsigned j = 1; j < uLengthB; ++j) + { + SCORE INew = MPrev[j-1] + GapOpenB[j]; + if (INew > IPrev) + { + uInsertPos = j; + IPrev = INew; + } + } + +// Special case for i=uLengthA, j=uLengthB + SCORE scoreMax = MPrev[uLengthB-1]; + int iTraceBack = 0; + + SCORE scoreD = DPrev[uLengthB-1] + GapCloseA[uLengthA-1]; + if (scoreD > scoreMax) + { + scoreMax = scoreD; + iTraceBack = (int) uLengthA - (int) uDeletePos[uLengthB-1]; + } + + SCORE scoreI = IPrev + GapCloseB[uLengthB-1]; + if (scoreI > scoreMax) + { + scoreMax = scoreI; + iTraceBack = (int) uInsertPos - (int) uLengthB; + } + + TraceBack[uLengthA][uLengthB] = iTraceBack; + + TraceBackToPath(TraceBack, uLengthA, uLengthB, Path); + + return scoreMax; + } diff --git a/src/muscle/muscle3.8.31/src/glbalignss.cpp b/src/muscle/muscle3.8.31/src/glbalignss.cpp new file mode 100644 index 0000000..8909c09 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/glbalignss.cpp @@ -0,0 +1,318 @@ +#include "muscle.h" +#include "profile.h" +#include "pwpath.h" +#include "seq.h" + +extern SCOREMATRIX VTML_SP; + +// #define SUBST(i, j) Subst(seqA, seqB, i, j) +#define SUBST(i, j) MxRowA[i][seqB.GetLetter(j)] + +static SCORE Subst(const Seq &seqA, const Seq &seqB, unsigned i, unsigned j) + { + assert(i < seqA.Length()); + assert(j < seqB.Length()); + + unsigned uLetterA = seqA.GetLetter(i); + unsigned uLetterB = seqB.GetLetter(j); + return VTML_SP[uLetterA][uLetterB] + g_scoreCenter; + } + +struct DP_MEMORY + { + unsigned uLength; + SCORE *MPrev; + SCORE *MCurr; + SCORE *MWork; + SCORE *DPrev; + SCORE *DCurr; + SCORE *DWork; + SCORE **MxRowA; + unsigned *LettersB; + unsigned *uDeletePos; + int **TraceBack; + }; + +static struct DP_MEMORY DPM; + +static void AllocDPMem(unsigned uLengthA, unsigned uLengthB) + { +// Max prefix 
length + unsigned uLength = (uLengthA > uLengthB ? uLengthA : uLengthB) + 1; + if (uLength < DPM.uLength) + return; + +// Add 256 to allow for future expansion and +// round up to next multiple of 32. + uLength += 256; + uLength += 32 - uLength%32; + + const unsigned uOldLength = DPM.uLength; + if (uOldLength > 0) + { + for (unsigned i = 0; i < uOldLength; ++i) + delete[] DPM.TraceBack[i]; + + delete[] DPM.MPrev; + delete[] DPM.MCurr; + delete[] DPM.MWork; + delete[] DPM.DPrev; + delete[] DPM.DCurr; + delete[] DPM.DWork; + delete[] DPM.MxRowA; + delete[] DPM.LettersB; + delete[] DPM.uDeletePos; + delete[] DPM.TraceBack; + } + + DPM.uLength = uLength; + + DPM.MPrev = new SCORE[uLength]; + DPM.MCurr = new SCORE[uLength]; + DPM.MWork = new SCORE[uLength]; + + DPM.DPrev = new SCORE[uLength]; + DPM.DCurr = new SCORE[uLength]; + DPM.DWork = new SCORE[uLength]; + DPM.MxRowA = new SCORE *[uLength]; + DPM.LettersB = new unsigned[uLength]; + DPM.uDeletePos = new unsigned[uLength]; + + DPM.TraceBack = new int*[uLength]; + + for (unsigned i = 0; i < uLength; ++i) + DPM.TraceBack[i] = new int[uLength]; + } + +static void RowFromSeq(const Seq &s, SCORE *Row[]) + { + const unsigned uLength = s.Length(); + for (unsigned i = 0; i < uLength; ++i) + { + char c = s.GetChar(i); + unsigned uLetter = CharToLetter(c); + if (uLetter < 20) + Row[i] = VTML_SP[uLetter]; + else + Row[i] = VTML_SP[AX_X]; + } + } + +static void LettersFromSeq(const Seq &s, unsigned Letters[]) + { + const unsigned uLength = s.Length(); + for (unsigned i = 0; i < uLength; ++i) + { + char c = s.GetChar(i); + unsigned uLetter = CharToLetter(c); + if (uLetter < 20) + Letters[i] = uLetter; + else + Letters[i] = AX_X; + } + } + +SCORE GlobalAlignSS(const Seq &seqA, const Seq &seqB, PWPath &Path) + { + const unsigned uLengthA = seqA.Length(); + const unsigned uLengthB = seqB.Length(); + const unsigned uPrefixCountA = uLengthA + 1; + const unsigned uPrefixCountB = uLengthB + 1; + + AllocDPMem(uLengthA, uLengthB); + + 
SCORE *MPrev = DPM.MPrev; + SCORE *MCurr = DPM.MCurr; + SCORE *MWork = DPM.MWork; + + SCORE *DPrev = DPM.DPrev; + SCORE *DCurr = DPM.DCurr; + SCORE *DWork = DPM.DWork; + SCORE **MxRowA = DPM.MxRowA; + unsigned *LettersB = DPM.LettersB; + + RowFromSeq(seqA, MxRowA); + LettersFromSeq(seqB, LettersB); + + unsigned *uDeletePos = DPM.uDeletePos; + + int **TraceBack = DPM.TraceBack; + +#if DEBUG + for (unsigned i = 0; i < uPrefixCountA; ++i) + memset(TraceBack[i], 0, uPrefixCountB*sizeof(int)); +#endif + +// Special case for i=0 + TraceBack[0][0] = 0; + MPrev[0] = MxRowA[0][LettersB[0]]; + +// D(0,0) is -infinity (requires I->D). + DPrev[0] = MINUS_INFINITY; + + for (unsigned j = 1; j < uLengthB; ++j) + { + unsigned uLetterB = LettersB[j]; + + // Only way to get M(0, j) looks like this: + // A ----X + // B XXXXX + // 0 j + // So gap-open at j=0, gap-close at j-1. + MPrev[j] = MxRowA[0][uLetterB] + g_scoreGapOpen/2; // term gaps half + TraceBack[0][j] = -(int) j; + + // Assume no D->I transitions, then can't be a delete if only + // one letter from A. + DPrev[j] = MINUS_INFINITY; + } + + SCORE IPrev_j_1; + for (unsigned i = 1; i < uLengthA; ++i) + { + SCORE *ptrMCurr_j = MCurr; + memset(ptrMCurr_j, 0, uLengthB*sizeof(SCORE)); + + const SCORE *RowA = MxRowA[i]; + const SCORE *ptrRowA = MxRowA[i]; + const SCORE *ptrMCurrEnd = ptrMCurr_j + uLengthB; + unsigned *ptrLettersB = LettersB; + for (; ptrMCurr_j != ptrMCurrEnd; ++ptrMCurr_j) + { + *ptrMCurr_j = RowA[*ptrLettersB]; + ++ptrLettersB; + } + + unsigned *ptrDeletePos = uDeletePos; + + // Special case for j=0 + // Only way to get M(i, 0) looks like this: + // 0 i + // A XXXXX + // B ----X + // So gap-open at i=0, gap-close at i-1. 
+ ptrMCurr_j = MCurr; + assert(ptrMCurr_j == &(MCurr[0])); + *ptrMCurr_j += g_scoreGapOpen/2; // term gaps half + + ++ptrMCurr_j; + + int *ptrTraceBack_ij = TraceBack[i]; + *ptrTraceBack_ij++ = (int) i; + + SCORE *ptrMPrev_j = MPrev; + SCORE *ptrDPrev = DPrev; + SCORE d = *ptrDPrev; + SCORE DNew = *ptrMPrev_j + g_scoreGapOpen; + if (DNew > d) + { + d = DNew; + *ptrDeletePos = i; + } + + SCORE *ptrDCurr = DCurr; + + assert(ptrDCurr == &(DCurr[0])); + *ptrDCurr = d; + + // Can't have an insert if no letters from B + IPrev_j_1 = MINUS_INFINITY; + + unsigned uInsertPos; + + for (unsigned j = 1; j < uLengthB; ++j) + { + // Here, MPrev_j is preserved from previous + // iteration so with current i,j is M[i-1][j-1] + SCORE MPrev_j = *ptrMPrev_j; + SCORE INew = MPrev_j + g_scoreGapOpen; + if (INew > IPrev_j_1) + { + IPrev_j_1 = INew; + uInsertPos = j; + } + + SCORE scoreMax = MPrev_j; + + assert(ptrDPrev == &(DPrev[j-1])); + SCORE scoreD = *ptrDPrev++; + if (scoreD > scoreMax) + { + scoreMax = scoreD; + assert(ptrDeletePos == &(uDeletePos[j-1])); + *ptrTraceBack_ij = (int) i - (int) *ptrDeletePos; + assert(*ptrTraceBack_ij > 0); + } + ++ptrDeletePos; + + SCORE scoreI = IPrev_j_1; + if (scoreI > scoreMax) + { + scoreMax = scoreI; + *ptrTraceBack_ij = (int) uInsertPos - (int) j; + assert(*ptrTraceBack_ij < 0); + } + + *ptrMCurr_j += scoreMax; + assert(ptrMCurr_j == &(MCurr[j])); + ++ptrMCurr_j; + + MPrev_j = *(++ptrMPrev_j); + assert(ptrDPrev == &(DPrev[j])); + SCORE d = *ptrDPrev; + SCORE DNew = MPrev_j + g_scoreGapOpen; + if (DNew > d) + { + d = DNew; + assert(ptrDeletePos == &uDeletePos[j]); + *ptrDeletePos = i; + } + assert(ptrDCurr + 1 == &(DCurr[j])); + *(++ptrDCurr) = d; + + ++ptrTraceBack_ij; + } + + Rotate(MPrev, MCurr, MWork); + Rotate(DPrev, DCurr, DWork); + } + +// Special case for i=uLengthA + SCORE IPrev = MINUS_INFINITY; + + unsigned uInsertPos; + + for (unsigned j = 1; j < uLengthB; ++j) + { + SCORE INew = MPrev[j-1]; + if (INew > IPrev) + { + uInsertPos = j; 
+ IPrev = INew; + } + } + +// Special case for i=uLengthA, j=uLengthB + SCORE scoreMax = MPrev[uLengthB-1]; + int iTraceBack = 0; + + SCORE scoreD = DPrev[uLengthB-1] - g_scoreGapOpen/2; // term gaps half + if (scoreD > scoreMax) + { + scoreMax = scoreD; + iTraceBack = (int) uLengthA - (int) uDeletePos[uLengthB-1]; + } + + SCORE scoreI = IPrev - g_scoreGapOpen/2; + if (scoreI > scoreMax) + { + scoreMax = scoreI; + iTraceBack = (int) uInsertPos - (int) uLengthB; + } + + TraceBack[uLengthA][uLengthB] = iTraceBack; + + TraceBackToPath(TraceBack, uLengthA, uLengthB, Path); + + return scoreMax; + } diff --git a/src/muscle/muscle3.8.31/src/glbalndimer.cpp b/src/muscle/muscle3.8.31/src/glbalndimer.cpp new file mode 100644 index 0000000..8ef3a6c --- /dev/null +++ b/src/muscle/muscle3.8.31/src/glbalndimer.cpp @@ -0,0 +1,390 @@ +#include "muscle.h" +#include +#include // for sprintf +#include "pwpath.h" +#include "profile.h" +#include "gapscoredimer.h" + +#define TRACE 0 + +static SCORE TraceBackDimer( const SCORE *DPM_, const SCORE *DPD_, const SCORE *DPI_, + const char *TBM_, const char *TBD_, const char *TBI_, + unsigned uLengthA, unsigned uLengthB, PWPath &Path); + +static const char *LocalScoreToStr(SCORE s) + { + static char str[16]; + if (MINUS_INFINITY == s) + return " *"; + sprintf(str, "%6.3g", s); + return str; + } + +#if TRACE +static void ListDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB, + unsigned uPrefixCountA, unsigned uPrefixCountB) + { + Log(" "); + for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) + { + char c = ' '; + if (uPrefixLengthB > 0) + c = ConsensusChar(PB[uPrefixLengthB - 1]); + Log(" %4u:%c", uPrefixLengthB, c); + } + Log("\n"); + for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) + { + char c = ' '; + if (uPrefixLengthA > 0) + c = ConsensusChar(PA[uPrefixLengthA - 1]); + Log("%4u:%c ", uPrefixLengthA, c); + for (unsigned uPrefixLengthB = 0; uPrefixLengthB < 
uPrefixCountB; ++uPrefixLengthB) + Log(" %s", LocalScoreToStr(DPM(uPrefixLengthA, uPrefixLengthB))); + Log("\n"); + } + } + +static void ListTB(const char *TBM_, const ProfPos *PA, const ProfPos *PB, + unsigned uPrefixCountA, unsigned uPrefixCountB) + { + Log(" "); + for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) + Log("%2d", uPrefixLengthB); + Log("\n"); + Log(" "); + for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) + { + char c = ' '; + if (uPrefixLengthB > 0) + c = ConsensusChar(PB[uPrefixLengthB - 1]); + Log(" %c", c); + } + Log("\n"); + for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) + { + char c = ' '; + if (uPrefixLengthA > 0) + c = ConsensusChar(PA[uPrefixLengthA - 1]); + Log("%4u:%c ", uPrefixLengthA, c); + for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) + Log(" %c", TBM(uPrefixLengthA, uPrefixLengthB)); + Log("\n"); + } + } +#endif // TRACE + +static ProfPos PPTerm; +static bool InitializePPTerm() + { + PPTerm.m_bAllGaps = false; + PPTerm.m_LL = 1; + PPTerm.m_LG = 0; + PPTerm.m_GL = 0; + PPTerm.m_GG = 0; + PPTerm.m_fOcc = 1; + return true; + } +static bool PPTermInitialized = InitializePPTerm(); + +static SCORE ScoreProfPosDimerLE(const ProfPos &PPA, const ProfPos &PPB) + { + SCORE Score = 0; + for (unsigned n = 0; n < 20; ++n) + { + const unsigned uLetter = PPA.m_uSortOrder[n]; + const FCOUNT fcLetter = PPA.m_fcCounts[uLetter]; + if (0 == fcLetter) + break; + Score += fcLetter*PPB.m_AAScores[uLetter]; + } + if (0 == Score) + return -2.5; + SCORE logScore = logf(Score); + return (SCORE) (logScore*(PPA.m_fOcc * PPB.m_fOcc)); + } + +static SCORE ScoreProfPosDimerPSP(const ProfPos &PPA, const ProfPos &PPB) + { + SCORE Score = 0; + for (unsigned n = 0; n < 20; ++n) + { + const unsigned uLetter = PPA.m_uSortOrder[n]; + const FCOUNT fcLetter = PPA.m_fcCounts[uLetter]; + if (0 == fcLetter) + break; + Score += 
fcLetter*PPB.m_AAScores[uLetter]; + } + return Score; + } + +static SCORE ScoreProfPosDimer(const ProfPos &PPA, const ProfPos &PPB) + { + switch (g_PPScore) + { + case PPSCORE_LE: + return ScoreProfPosDimerLE(PPA, PPB); + + case PPSCORE_SP: + case PPSCORE_SV: + return ScoreProfPosDimerPSP(PPA, PPB); + } + Quit("Invalid g_PPScore"); + return 0; + } + +// Global alignment dynamic programming +// This variant optimizes the profile-profile SP score under the +// dimer approximation. +SCORE GlobalAlignDimer(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, PWPath &Path) + { + assert(uLengthB > 0 && uLengthA > 0); + + const unsigned uPrefixCountA = uLengthA + 1; + const unsigned uPrefixCountB = uLengthB + 1; + +// Allocate DP matrices + const size_t LM = uPrefixCountA*uPrefixCountB; + SCORE *DPM_ = new SCORE[LM]; + SCORE *DPD_ = new SCORE[LM]; + SCORE *DPI_ = new SCORE[LM]; + + char *TBM_ = new char[LM]; + char *TBD_ = new char[LM]; + char *TBI_ = new char[LM]; + + DPM(0, 0) = 0; + DPD(0, 0) = MINUS_INFINITY; + DPI(0, 0) = MINUS_INFINITY; + + TBM(0, 0) = 'S'; + TBD(0, 0) = '?'; + TBI(0, 0) = '?'; + + DPM(1, 0) = MINUS_INFINITY; + DPD(1, 0) = GapScoreMD(PA[0], PPTerm); + DPI(1, 0) = MINUS_INFINITY; + + TBM(1, 0) = '?'; + TBD(1, 0) = 'S'; + TBI(1, 0) = '?'; + + DPM(0, 1) = MINUS_INFINITY; + DPD(0, 1) = MINUS_INFINITY; + DPI(0, 1) = GapScoreMI(PPTerm, PB[0]); + + TBM(0, 1) = '?'; + TBD(0, 1) = '?'; + TBI(0, 1) = 'S'; + +// Empty prefix of B is special case + for (unsigned uPrefixLengthA = 2; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) + { + // M=LetterA+LetterB, impossible with empty prefix + DPM(uPrefixLengthA, 0) = MINUS_INFINITY; + TBM(uPrefixLengthA, 0) = '?'; + + // D=LetterA+GapB + DPD(uPrefixLengthA, 0) = DPD(uPrefixLengthA - 1, 0) + + GapScoreDD(PA[uPrefixLengthA - 1], PPTerm); + TBD(uPrefixLengthA, 0) = 'D'; + + // I=GapA+LetterB, impossible with empty prefix + DPI(uPrefixLengthA, 0) = MINUS_INFINITY; + TBI(uPrefixLengthA, 0) = 
'?'; + } + +// Empty prefix of A is special case + for (unsigned uPrefixLengthB = 2; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) + { + // M=LetterA+LetterB, impossible with empty prefix + DPM(0, uPrefixLengthB) = MINUS_INFINITY; + TBM(0, uPrefixLengthB) = '?'; + + // D=LetterA+GapB, impossible with empty prefix + DPD(0, uPrefixLengthB) = MINUS_INFINITY; + TBD(0, uPrefixLengthB) = '?'; + + // I=GapA+LetterB + DPI(0, uPrefixLengthB) = DPI(0, uPrefixLengthB - 1) + + GapScoreII(PPTerm, PB[uPrefixLengthB - 1]); + TBI(0, uPrefixLengthB) = 'I'; + } + +// ============ +// Main DP loop +// ============ + for (unsigned uPrefixLengthB = 1; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) + { + const ProfPos &PPB = PB[uPrefixLengthB - 1]; + for (unsigned uPrefixLengthA = 1; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) + { + const ProfPos &PPA = PA[uPrefixLengthA - 1]; + { + // Match M=LetterA+LetterB + SCORE scoreLL = ScoreProfPosDimer(PPA, PPB); + + SCORE scoreMM = DPM(uPrefixLengthA-1, uPrefixLengthB-1) + GapScoreMM(PPA, PPB); + SCORE scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + GapScoreDM(PPA, PPB); + SCORE scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + GapScoreIM(PPA, PPB); + + SCORE scoreBest = scoreMM; + char c = 'M'; + if (scoreDM > scoreBest) + { + scoreBest = scoreDM; + c = 'D'; + } + if (scoreIM > scoreBest) + { + scoreBest = scoreIM; + c = 'I'; + } + + DPM(uPrefixLengthA, uPrefixLengthB) = scoreBest + scoreLL; + TBM(uPrefixLengthA, uPrefixLengthB) = c; + } + { + // Delete D=LetterA+GapB + SCORE scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) + GapScoreMD(PPA, PPB); + SCORE scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB) + GapScoreDD(PPA, PPB); + SCORE scoreID = DPI(uPrefixLengthA-1, uPrefixLengthB) + GapScoreID(PPA, PPB); + + SCORE scoreBest = scoreMD; + char c = 'M'; + if (scoreDD > scoreBest) + { + scoreBest = scoreDD; + c = 'D'; + } + if (scoreID > scoreBest) + { + scoreBest = scoreID; + c = 'I'; + } + + DPD(uPrefixLengthA, uPrefixLengthB) 
= scoreBest; + TBD(uPrefixLengthA, uPrefixLengthB) = c; + } + { + // Insert I=GapA+LetterB + SCORE scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) + GapScoreMI(PPA, PPB); + SCORE scoreDI = DPD(uPrefixLengthA, uPrefixLengthB-1) + GapScoreDI(PPA, PPB); + SCORE scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1) + GapScoreII(PPA, PPB); + + SCORE scoreBest = scoreMI; + char c = 'M'; + if (scoreDI > scoreBest) + { + scoreBest = scoreDI; + c = 'D'; + } + if (scoreII > scoreBest) + { + scoreBest = scoreII; + c = 'I'; + } + + DPI(uPrefixLengthA, uPrefixLengthB) = scoreBest; + TBI(uPrefixLengthA, uPrefixLengthB) = c; + } + } + } + +#if TRACE + Log("DPM:\n"); + ListDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB); + Log("DPD:\n"); + ListDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB); + Log("DPI:\n"); + ListDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB); + Log("TBM:\n"); + ListTB(TBM_, PA, PB, uPrefixCountA, uPrefixCountB); + Log("TBD:\n"); + ListTB(TBD_, PA, PB, uPrefixCountA, uPrefixCountB); + Log("TBI:\n"); + ListTB(TBI_, PA, PB, uPrefixCountA, uPrefixCountB); +#endif + + SCORE Score = TraceBackDimer(DPM_, DPD_, DPI_, TBM_, TBD_, TBI_, + uLengthA, uLengthB, Path); + +#if TRACE + Log("GlobalAlignDimer score = %.3g\n", Score); +#endif + + delete[] DPM_; + delete[] DPD_; + delete[] DPI_; + + delete[] TBM_; + delete[] TBD_; + delete[] TBI_; + + return Score; + } + +static SCORE TraceBackDimer( const SCORE *DPM_, const SCORE *DPD_, const SCORE *DPI_, + const char *TBM_, const char *TBD_, const char *TBI_, + unsigned uLengthA, unsigned uLengthB, PWPath &Path) + { + const unsigned uPrefixCountA = uLengthA + 1; + + unsigned uPrefixLengthA = uLengthA; + unsigned uPrefixLengthB = uLengthB; + + char cEdge = 'M'; + SCORE scoreMax = DPM(uLengthA, uLengthB); + if (DPD(uLengthA, uLengthB) > scoreMax) + { + scoreMax = DPD(uLengthA, uLengthB); + cEdge = 'D'; + } + if (DPI(uLengthA, uLengthB) > scoreMax) + { + scoreMax = DPI(uLengthA, uLengthB); + cEdge = 'I'; + } + + for (;;) + { + if (0 == 
uPrefixLengthA && 0 == uPrefixLengthB) + break; + + PWEdge Edge; + Edge.cType = cEdge; + Edge.uPrefixLengthA = uPrefixLengthA; + Edge.uPrefixLengthB = uPrefixLengthB; + Path.PrependEdge(Edge); + +#if TRACE + Log("PLA=%u PLB=%u Edge=%c\n", uPrefixLengthA, uPrefixLengthB, cEdge); +#endif + switch (cEdge) + { + case 'M': + assert(uPrefixLengthA > 0 && uPrefixLengthB > 0); + cEdge = TBM(uPrefixLengthA, uPrefixLengthB); + --uPrefixLengthA; + --uPrefixLengthB; + break; + case 'D': + assert(uPrefixLengthA > 0); + cEdge = TBD(uPrefixLengthA, uPrefixLengthB); + --uPrefixLengthA; + break; + case 'I': + assert(uPrefixLengthB > 0); + cEdge = TBI(uPrefixLengthA, uPrefixLengthB); + --uPrefixLengthB; + break; + default: + Quit("Invalid edge PLA=%u PLB=%u %c", uPrefixLengthA, uPrefixLengthB, cEdge); + } + } +#if TRACE + Path.LogMe(); +#endif + return scoreMax; + } diff --git a/src/muscle/muscle3.8.31/src/globals.cpp b/src/muscle/muscle3.8.31/src/globals.cpp new file mode 100644 index 0000000..b1a51a0 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/globals.cpp @@ -0,0 +1,289 @@ +#if WIN32 +#include +#include +#endif + +#include "muscle.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef MAX_PATH +#define MAX_PATH 260 +#endif + +static char g_strListFileName[MAX_PATH]; +static bool g_bListFileAppend = false; + +static SEQWEIGHT g_SeqWeight = SEQWEIGHT_Undefined; + +void SetSeqWeightMethod(SEQWEIGHT Method) + { + g_SeqWeight = Method; + } + +SEQWEIGHT GetSeqWeightMethod() + { + return g_SeqWeight; + } + +void SetListFileName(const char *ptrListFileName, bool bAppend) + { + assert(strlen(ptrListFileName) < MAX_PATH); + strcpy(g_strListFileName, ptrListFileName); + g_bListFileAppend = bAppend; + } + +void Log(const char szFormat[], ...) 
+ { + if (0 == g_strListFileName[0]) + return; + + static FILE *f = NULL; + const char *mode; + if (g_bListFileAppend) + mode = "a"; + else + mode = "w"; + if (NULL == f) + f = _fsopen(g_strListFileName, mode, _SH_DENYNO); + if (NULL == f) + { + perror(g_strListFileName); + exit(EXIT_NotStarted); + } + + char szStr[4096]; + va_list ArgList; + va_start(ArgList, szFormat); + vsprintf(szStr, szFormat, ArgList); + fprintf(f, "%s", szStr); + fflush(f); + } + +const char *GetTimeAsStr() + { + static char szStr[32]; + time_t t; + time(&t); + struct tm *ptmCurrentTime = localtime(&t); + strcpy(szStr, asctime(ptmCurrentTime)); + assert('\n' == szStr[24]); + szStr[24] = 0; + return szStr; + } + +// Exit immediately with error message, printf-style. +void Quit(const char szFormat[], ...) + { + va_list ArgList; + char szStr[4096]; + + va_start(ArgList, szFormat); + vsprintf(szStr, szFormat, ArgList); + + fprintf(stderr, "\n*** ERROR *** %s\n", szStr); + + Log("\n*** FATAL ERROR *** "); + Log("%s\n", szStr); + Log("Stopped %s\n", GetTimeAsStr()); + +#ifdef WIN32 + if (IsDebuggerPresent()) + { + int iBtn = MessageBox(NULL, szStr, "muscle", MB_ICONERROR | MB_OKCANCEL); + if (IDCANCEL == iBtn) + Break(); + } +#endif + exit(EXIT_FatalError); + } + +void Warning(const char szFormat[], ...) 
+ { + va_list ArgList; + char szStr[4096]; + + va_start(ArgList, szFormat); + vsprintf(szStr, szFormat, ArgList); + + fprintf(stderr, "\n*** WARNING *** %s\n", szStr); + Log("\n*** WARNING *** %s\n", szStr); + } + +// Remove leading and trailing blanks from string +void TrimBlanks(char szStr[]) + { + TrimLeadingBlanks(szStr); + TrimTrailingBlanks(szStr); + } + +void TrimLeadingBlanks(char szStr[]) + { + size_t n = strlen(szStr); + while (szStr[0] == ' ') + { + memmove(szStr, szStr+1, n); + szStr[--n] = 0; + } + } + +void TrimTrailingBlanks(char szStr[]) + { + size_t n = strlen(szStr); + while (n > 0 && szStr[n-1] == ' ') + szStr[--n] = 0; + } + +bool Verbose() + { + return true; + } + +SCORE StrToScore(const char *pszStr) + { + return (SCORE) atof(pszStr); + } + +void StripWhitespace(char szStr[]) + { + unsigned uOutPos = 0; + unsigned uInPos = 0; + while (char c = szStr[uInPos++]) + if (' ' != c && '\t' != c && '\n' != c && '\r' != c) + szStr[uOutPos++] = c; + szStr[uOutPos] = 0; + } + +void StripGaps(char szStr[]) + { + unsigned uOutPos = 0; + unsigned uInPos = 0; + while (char c = szStr[uInPos++]) + if ('-' != c) + szStr[uOutPos++] = c; + szStr[uOutPos] = 0; + } + +bool IsValidSignedInteger(const char *Str) + { + if (0 == strlen(Str)) + return false; + if ('+' == *Str || '-' == *Str) + ++Str; + while (char c = *Str++) + if (!isdigit(c)) + return false; + return true; + } + +bool IsValidInteger(const char *Str) + { + if (0 == strlen(Str)) + return false; + while (char c = *Str++) + if (!isdigit(c)) + return false; + return true; + } + +// Is c valid as first character in an identifier? +bool isidentf(char c) + { + return isalpha(c) || '_' == c; + } + +// Is c valid character in an identifier? 
+bool isident(char c) + { + return isalpha(c) || isdigit(c) || '_' == c; + } + +bool IsValidIdentifier(const char *Str) + { + if (!isidentf(Str[0])) + return false; + while (char c = *Str++) + if (!isident(c)) + return false; + return true; + } + +void SetLogFile() + { + const char *strFileName = ValueOpt("loga"); + if (0 != strFileName) + g_bListFileAppend = true; + else + strFileName = ValueOpt("log"); + if (0 == strFileName) + return; + strcpy(g_strListFileName, strFileName); + } + +// Get filename, stripping any extension and directory parts. +void NameFromPath(const char szPath[], char szName[], unsigned uBytes) + { + if (0 == uBytes) + return; + const char *pstrLastSlash = strrchr(szPath, '/'); + const char *pstrLastBackslash = strrchr(szPath, '\\'); + const char *pstrLastDot = strrchr(szPath, '.'); + const char *pstrLastSep = pstrLastSlash > pstrLastBackslash ? + pstrLastSlash : pstrLastBackslash; + const char *pstrBegin = pstrLastSep ? pstrLastSep + 1 : szPath; + const char *pstrEnd = pstrLastDot ? pstrLastDot - 1 : szPath + strlen(szPath); + unsigned uNameLength = (unsigned) (pstrEnd - pstrBegin + 1); + if (uNameLength > uBytes - 1) + uNameLength = uBytes - 1; + memcpy(szName, pstrBegin, uNameLength); + szName[uNameLength] = 0; + } + +char *strsave(const char *s) + { + char *ptrCopy = strdup(s); + if (0 == ptrCopy) + Quit("Out of memory"); + return ptrCopy; + } + +bool IsValidFloatChar(char c) + { + return isdigit(c) || '.' == c || 'e' == c || 'E' == c || 'd' == c || + 'D' == c || '.' 
== c || '+' == c || '-' == c; + } + +void Call_MY_ASSERT(const char *file, int line, bool b, const char *msg) + { + if (b) + return; + Quit("%s(%d): MY_ASSERT(%s)", file, line, msg); + } + +static size_t g_MemTotal; + +void MemPlus(size_t Bytes, char *Where) + { + g_MemTotal += Bytes; + Log("+%10u %6u %6u %s\n", + (unsigned) Bytes, + (unsigned) GetMemUseMB(), + (unsigned) (g_MemTotal/1000000), + Where); + } + +void MemMinus(size_t Bytes, char *Where) + { + g_MemTotal -= Bytes; + Log("-%10u %6u %6u %s\n", + (unsigned) Bytes, + (unsigned) GetMemUseMB(), + (unsigned) (g_MemTotal/1000000), + Where); + } diff --git a/src/muscle/muscle3.8.31/src/globalslinux.cpp b/src/muscle/muscle3.8.31/src/globalslinux.cpp new file mode 100644 index 0000000..f414fa9 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/globalslinux.cpp @@ -0,0 +1,163 @@ +#include "muscle.h" + +#if defined(__linux__) +#include +#include +#include +#include +#include +#include + +const int ONE_MB = 1000000; +const int MEM_WARNING_THRESHOLD = 20*ONE_MB; + +double GetNAN() + { + static unsigned long nan[2]={0xffffffff, 0x7fffffff}; + double dNAN = *( double* )nan; + return dNAN; + } + +double g_dNAN = GetNAN(); + +void chkmem(const char szMsg[]) + { + //assert(_CrtCheckMemory()); + } + +void Break() + { + //DebugBreak(); + } + +static char szCmdLine[4096]; + +void *ptrStartBreak = sbrk(0); + +const char *GetCmdLine() + { + return szCmdLine; + } + +double GetMemUseMB() + { + static char statm[64]; + static int PageSize; + if (0 == statm[0]) + { + PageSize = sysconf(_SC_PAGESIZE); + pid_t pid = getpid(); + sprintf(statm, "/proc/%d/statm", (int) pid); + } + + int fd = open(statm, O_RDONLY); + if (-1 == fd) + return -1; + char Buffer[64]; + int n = read(fd, Buffer, sizeof(Buffer) - 1); + close(fd); + fd = -1; + + if (n <= 0) + { + static bool Warned = false; + if (!Warned) + { + Warned = true; + Warning("*Warning* Cannot read %s errno=%d %s", + statm, errno, strerror(errno)); + } + return 0; + } + Buffer[n] = 0; + 
int Pages = atoi(Buffer); + + return ((double) Pages * (double) PageSize)/1e6; + } + +void SaveCmdLine(int argc, char *argv[]) + { + for (int i = 0; i < argc; ++i) + { + if (i > 0) + strcat(szCmdLine, " "); + strcat(szCmdLine, argv[i]); + } + } + +double dPeakMemUseMB = 0; + +double GetPeakMemUseMB() + { + CheckMemUse(); + return dPeakMemUseMB; + } + +double GetCPUGHz() + { + double dGHz = 2.5; + const char *e = getenv("CPUGHZ"); + if (0 != e) + dGHz = atof(e); + return dGHz; + } + +void CheckMemUse() + { + double dMB = GetMemUseMB(); + if (dMB > dPeakMemUseMB) + dPeakMemUseMB = dMB; + } + +double GetRAMSizeMB() + { + const double DEFAULT_RAM = 500; + static double RAMMB = 0; + if (RAMMB != 0) + return RAMMB; + + int fd = open("/proc/meminfo", O_RDONLY); + if (-1 == fd) + { + static bool Warned = false; + if (!Warned) + { + Warned = true; + Warning("*Warning* Cannot open /proc/meminfo errno=%d %s", + errno, strerror(errno)); + } + return DEFAULT_RAM; + } + char Buffer[1024]; + int n = read(fd, Buffer, sizeof(Buffer) - 1); + close(fd); + fd = -1; + + if (n <= 0) + { + static bool Warned = false; + if (!Warned) + { + Warned = true; + Warning("*Warning* Cannot read /proc/meminfo errno=%d %s", + errno, strerror(errno)); + } + return DEFAULT_RAM; + } + Buffer[n] = 0; + char *pMem = strstr(Buffer, "MemTotal: "); + if (0 == pMem) + { + static bool Warned = false; + if (!Warned) + { + Warned = true; + Warning("*Warning* 'MemTotal:' not found in /proc/meminfo"); + } + return DEFAULT_RAM; + } + int Bytes = atoi(pMem+9)*1000; + return ((double) Bytes)/1e6; + } + +#endif // !WIN32 diff --git a/src/muscle/muscle3.8.31/src/globalsosx.cpp b/src/muscle/muscle3.8.31/src/globalsosx.cpp new file mode 100644 index 0000000..a93385e --- /dev/null +++ b/src/muscle/muscle3.8.31/src/globalsosx.cpp @@ -0,0 +1,92 @@ +#ifdef __MACH__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include + +const double DEFAULT_RAM = 1e9; +const double DEFAULT_MEM_USE = 1e6; + +double GetNAN() + { + static unsigned long nan[2]={0xffffffff, 0x7fffffff}; + double dNAN = *( double* )nan; + return dNAN; + } + +double g_dNAN = GetNAN(); + + +double GetRAMSize() + { + static double CACHED_RAM = 0; + if (CACHED_RAM != 0) + return CACHED_RAM; + + uint64_t MemPages = 0; + size_t Len = sizeof(MemPages); + if (sysctlbyname("hw.memsize", &MemPages, &Len, NULL, 0) < 0) + return DEFAULT_RAM; + return (double) MemPages; + } + +double GetRAMSizeMB() + { + return GetRAMSize()/1e6; + } + +static double g_uPeakMemUseBytes; + +double GetMaxMemUseBytes() + { + return g_uPeakMemUseBytes; + } + +double GetPeakMemUseBytes() + { + return GetMaxMemUseBytes(); + } + +double GetMemUseBytes() + { + task_t mytask = mach_task_self(); + struct task_basic_info ti; + memset((void *) &ti, 0, sizeof(ti)); + mach_msg_type_number_t count = TASK_BASIC_INFO_COUNT; + kern_return_t ok = task_info(mytask, TASK_BASIC_INFO, (task_info_t) &ti, &count); + if (ok == KERN_INVALID_ARGUMENT) + return DEFAULT_MEM_USE; + + if (ok != KERN_SUCCESS) + return DEFAULT_MEM_USE; + + double uBytes = (double ) ti.resident_size; + if (uBytes > g_uPeakMemUseBytes) + g_uPeakMemUseBytes = uBytes; + return uBytes; + } + +double GetMemUseMB() + { + return GetMemUseBytes()/1e6; + } + +void OSInit() + { + } + +#endif // __MACH__ diff --git a/src/muscle/muscle3.8.31/src/globalsother.cpp b/src/muscle/muscle3.8.31/src/globalsother.cpp new file mode 100644 index 0000000..a1acf0a --- /dev/null +++ b/src/muscle/muscle3.8.31/src/globalsother.cpp @@ -0,0 +1,62 @@ +#include "muscle.h" + +#if !defined(__linux__) && !defined(_MSC_VER) && !defined(__MACH__) + +double GetNAN() + { + return 0.0; + } + +double g_dNAN = GetNAN(); + +void chkmem(const char szMsg[]) + { + } + +void Break() + { + } + +char szCmdLine[4096]; + +const char *GetCmdLine() + { + return "muscle"; + } + +double GetMemUseMB() + { + return 100.0; + } + +void 
SaveCmdLine(int argc, char *argv[]) + { + for (int i = 0; i < argc; ++i) + { + if (i > 0) + strcat(szCmdLine, " "); + strcat(szCmdLine, argv[i]); + } + } + +double GetPeakMemUseMB() + { + return 100.0; + } + +double GetCPUGHz() + { + return 2.0; + } + +void CheckMemUse() + { + } + +double GetRAMSizeMB() + { + return 500.0; + } + +#endif + diff --git a/src/muscle/muscle3.8.31/src/globalswin32.cpp b/src/muscle/muscle3.8.31/src/globalswin32.cpp new file mode 100644 index 0000000..c690108 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/globalswin32.cpp @@ -0,0 +1,100 @@ +#include "muscle.h" + +#if WIN32 +#include +#include +#include +#include +#include + +void DebugPrintf(const char *szFormat, ...) + { + va_list ArgList; + char szStr[4096]; + + va_start(ArgList, szFormat); + vsprintf(szStr, szFormat, ArgList); + + OutputDebugString(szStr); + } + +double GetNAN() + { + static unsigned long nan[2]={0xffffffff, 0x7fffffff}; + double dNAN = *( double* )nan; + assert(_isnan(dNAN)); + return dNAN; + } + +double g_dNAN = GetNAN(); + +void chkmem(const char szMsg[]) + { + if (!_CrtCheckMemory()) + Quit("chkmem(%s)", szMsg); + } + +void Break() + { + if (IsDebuggerPresent()) + DebugBreak(); + } + +const char *GetCmdLine() + { + return GetCommandLine(); + } + +static unsigned uPeakMemUseBytes; + +double GetRAMSizeMB() + { + MEMORYSTATUS MS; + GlobalMemoryStatus(&MS); + return MS.dwAvailPhys/1e6; + } + +double GetMemUseMB() + { + HANDLE hProc = GetCurrentProcess(); + PROCESS_MEMORY_COUNTERS PMC; + BOOL bOk = GetProcessMemoryInfo(hProc, &PMC, sizeof(PMC)); + assert(bOk); + //printf("GetMemUseMB()\n"); + //printf("%12u PageFaultCount\n", (unsigned) PMC.PageFaultCount); + //printf("%12u PagefileUsage\n", (unsigned) PMC.PagefileUsage); + //printf("%12u PeakPagefileUsage\n", (unsigned) PMC.PeakPagefileUsage); + //printf("%12u WorkingSetSize\n", (unsigned) PMC.WorkingSetSize); + //printf("%12u PeakWorkingSetSize\n", (unsigned) PMC.PeakWorkingSetSize); + //printf("%12u 
QuotaPagedPoolUsage\n", (unsigned) PMC.QuotaPagedPoolUsage); + //printf("%12u QuotaPeakPagedPoolUsage\n", (unsigned) PMC.QuotaPeakPagedPoolUsage); + //printf("%12u QuotaNonPagedPoolUsage\n", (unsigned) PMC.QuotaNonPagedPoolUsage); + //printf("%12u QuotaPeakNonPagedPoolUsage\n", (unsigned) PMC.QuotaPeakNonPagedPoolUsage); + unsigned uBytes = (unsigned) PMC.WorkingSetSize; + if (uBytes > uPeakMemUseBytes) + uPeakMemUseBytes = uBytes; + return (uBytes + 500000.0)/1000000.0; + } + +double GetPeakMemUseMB() + { + return (uPeakMemUseBytes + 500000.0)/1000000.0; + } + +void CheckMemUse() + { +// Side-effect: sets peak usage in uPeakMemUseBytes + GetMemUseMB(); + } + +double GetCPUGHz() + { + double dGHz = 2.5; + const char *e = getenv("CPUGHZ"); + if (0 != e) + dGHz = atof(e); + if (dGHz < 0.1 || dGHz > 1000.0) + Quit("Invalid value '%s' for environment variable CPUGHZ", e); + return dGHz; + } +#endif // WIN32 diff --git a/src/muscle/muscle3.8.31/src/gonnet.cpp b/src/muscle/muscle3.8.31/src/gonnet.cpp new file mode 100644 index 0000000..5d447c8 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/gonnet.cpp @@ -0,0 +1,499 @@ +#include "muscle.h" +#include "gonnet.h" + +#define ROW(A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y) \ + { A/4.0, C/4.0, D/4.0, E/4.0, F/4.0, G/4.0, H/4.0, I/4.0, K/4.0, L/4.0, M/4.0, N/4.0, P/4.0, Q/4.0, R/4.0, S/4.0, T/4.0, V/4.0, W/4.0, Y/4.0 }, + +static double Gonnet80[20][20] = + { +// A C D E F G H I K L +// M N P Q R S T V W Y +ROW( 1990, 1140, 930, 1070, 600, 1130, 850, 810, 940, 810, + 980, 900, 1080, 1020, 880, 1380, 1190, 1180, 370, 590) // A + +ROW( 1140, 2780, 310, 300, 850, 630, 810, 700, 360, 690, + 850, 690, 310, 480, 640, 1090, 900, 1030, 810, 920) // C + +ROW( 930, 310, 2200, 1550, 130, 980, 1070, 180, 1030, 150, + 360, 1450, 820, 1150, 800, 1100, 1000, 350, 0, 550) // D + +ROW( 1070, 300, 1550, 2120, 220, 770, 1070, 510, 1280, 490, + 710, 1110, 890, 1470, 1010, 1050, 970, 730, 260, 500) // E + +ROW( 600, 850, 130, 220, 
2380, 90, 980, 1090, 350, 1310, + 1270, 490, 310, 540, 340, 470, 620, 930, 1400, 1730) // F + +ROW( 1130, 630, 980, 770, 90, 2210, 710, 100, 740, 200, + 410, 1060, 660, 800, 810, 1080, 720, 380, 430, 300) // G + +ROW( 850, 810, 1070, 1070, 980, 710, 2510, 600, 1120, 670, + 860, 1330, 790, 1380, 1140, 990, 1000, 590, 810, 1450) // H + +ROW( 810, 700, 180, 510, 1090, 100, 600, 2100, 650, 1460, + 1490, 530, 490, 640, 530, 620, 960, 1650, 610, 770) // I + +ROW( 940, 360, 1030, 1280, 350, 740, 1120, 650, 2090, 660, + 870, 1220, 870, 1410, 1570, 1040, 1090, 700, 350, 640) // K + +ROW( 810, 690, 150, 490, 1310, 200, 670, 1460, 660, 2010, + 1550, 450, 660, 850, 660, 600, 750, 1270, 800, 890) // L + +ROW( 980, 850, 360, 710, 1270, 410, 860, 1490, 870, 1550, + 2410, 620, 460, 1050, 710, 830, 990, 1250, 790, 870) // M + +ROW( 900, 690, 1450, 1110, 490, 1060, 1330, 530, 1220, 450, + 620, 2210, 760, 1180, 1020, 1290, 1170, 550, 380, 850) // N + +ROW( 1080, 310, 820, 890, 310, 660, 790, 490, 870, 660, + 460, 760, 2380, 1000, 790, 1100, 1040, 670, 120, 480) // P + +ROW( 1020, 480, 1150, 1470, 540, 800, 1380, 640, 1410, 850, + 1050, 1180, 1000, 2190, 1350, 1090, 1060, 730, 620, 710) // Q + +ROW( 880, 640, 800, 1010, 340, 810, 1140, 530, 1570, 660, + 710, 1020, 790, 1350, 2210, 970, 970, 640, 830, 740) // R + +ROW( 1380, 1090, 1100, 1050, 470, 1080, 990, 620, 1040, 600, + 830, 1290, 1100, 1090, 970, 2020, 1490, 810, 520, 780) // S + +ROW( 1190, 900, 1000, 970, 620, 720, 1000, 960, 1090, 750, + 990, 1170, 1040, 1060, 970, 1490, 2050, 1150, 370, 660) // T + +ROW( 1180, 1030, 350, 730, 930, 380, 590, 1650, 700, 1270, + 1250, 550, 670, 730, 640, 810, 1150, 2040, 440, 770) // V + +ROW( 370, 810, 0, 260, 1400, 430, 810, 610, 350, 800, + 790, 380, 120, 620, 830, 520, 370, 440, 2970, 1470) // W + +ROW( 590, 920, 550, 500, 1730, 300, 1450, 770, 640, 890, + 870, 850, 480, 710, 740, 780, 660, 770, 1470, 2470) // Y + }; + +static double Gonnet120[20][20] = + { +// A C D E F G H I K L +// M N P 
Q R S T V W Y +ROW( 1550, 950, 780, 870, 480, 930, 700, 690, 770, 660, + 790, 760, 900, 840, 730, 1120, 980, 960, 280, 480) // A + +ROW( 950, 2400, 270, 280, 700, 510, 650, 600, 320, 570, + 700, 550, 280, 400, 510, 890, 750, 850, 670, 760) // C + +ROW( 780, 270, 1780, 1310, 90, 820, 890, 160, 880, 140, + 320, 1220, 680, 970, 690, 910, 830, 310, 0, 430) // D + +ROW( 870, 280, 1310, 1680, 180, 650, 900, 410, 1070, 390, + 560, 950, 740, 1210, 860, 870, 810, 580, 180, 400) // E + +ROW( 480, 700, 90, 180, 1980, 40, 820, 930, 290, 1110, + 1070, 380, 240, 430, 280, 380, 490, 790, 1230, 1510) // F + +ROW( 930, 510, 820, 650, 40, 1860, 590, 90, 620, 140, + 310, 890, 550, 660, 660, 900, 610, 310, 300, 220) // G + +ROW( 700, 650, 890, 900, 820, 590, 2060, 480, 940, 540, + 680, 1100, 650, 1130, 950, 820, 820, 490, 680, 1220) // H + +ROW( 690, 600, 160, 410, 930, 90, 480, 1680, 520, 1240, + 1250, 410, 400, 530, 430, 520, 790, 1380, 500, 650) // I + +ROW( 770, 320, 880, 1070, 290, 620, 940, 520, 1650, 520, + 690, 1010, 720, 1160, 1320, 860, 900, 570, 280, 520) // K + +ROW( 660, 570, 140, 390, 1110, 140, 540, 1240, 520, 1620, + 1300, 350, 520, 660, 520, 490, 620, 1090, 670, 760) // L + +ROW( 790, 700, 320, 560, 1070, 310, 680, 1250, 690, 1300, + 1910, 500, 400, 820, 580, 670, 800, 1060, 650, 740) // M + +ROW( 760, 550, 1220, 950, 380, 890, 1100, 410, 1010, 350, + 500, 1760, 640, 970, 860, 1060, 960, 460, 280, 680) // N + +ROW( 900, 280, 680, 740, 240, 550, 650, 400, 720, 520, + 400, 640, 2010, 820, 660, 910, 860, 540, 70, 370) // P + +ROW( 840, 400, 970, 1210, 430, 660, 1130, 530, 1160, 660, + 820, 970, 820, 1700, 1120, 890, 870, 600, 470, 580) // Q + +ROW( 730, 510, 690, 860, 280, 660, 950, 430, 1320, 520, + 580, 860, 660, 1120, 1790, 810, 800, 520, 660, 590) // R + +ROW( 1120, 890, 910, 870, 380, 900, 820, 520, 860, 490, + 670, 1060, 910, 890, 810, 1560, 1220, 680, 390, 610) // S + +ROW( 980, 750, 830, 810, 490, 610, 820, 790, 900, 620, + 800, 960, 860, 870, 800, 1220, 1600, 
930, 290, 540) // T + +ROW( 960, 850, 310, 580, 790, 310, 490, 1380, 570, 1090, + 1060, 460, 540, 600, 520, 680, 930, 1610, 370, 630) // V + +ROW( 280, 670, 0, 180, 1230, 300, 680, 500, 280, 670, + 650, 280, 70, 470, 660, 390, 290, 370, 2620, 1290) // W + +ROW( 480, 760, 430, 400, 1510, 220, 1220, 650, 520, 760, + 740, 680, 370, 580, 590, 610, 540, 630, 1290, 2070) // Y + }; + +static SCORE Gonnet160[20][20] = + { +// A C D E F G H I K L +// M N P Q R S T V W Y +ROW( 1240, 810, 670, 740, 400, 800, 600, 600, 660, 560, + 660, 660, 770, 710, 620, 940, 830, 790, 230, 410) // A + +ROW( 810, 2130, 250, 260, 600, 440, 550, 530, 300, 490, + 590, 470, 260, 360, 430, 760, 640, 720, 570, 650) // C + +ROW( 670, 250, 1480, 1120, 80, 710, 770, 160, 770, 130, + 280, 1040, 590, 840, 620, 780, 720, 290, 0, 360) // D + +ROW( 740, 260, 1120, 1370, 160, 570, 770, 350, 910, 330, + 470, 830, 640, 1010, 750, 750, 700, 480, 140, 340) // E + +ROW( 400, 600, 80, 160, 1690, 20, 710, 810, 250, 970, + 920, 310, 200, 370, 250, 330, 420, 700, 1100, 1340) // F + +ROW( 800, 440, 710, 570, 20, 1600, 510, 80, 540, 110, + 260, 760, 480, 570, 570, 770, 540, 260, 230, 180) // G + +ROW( 600, 550, 770, 770, 710, 510, 1710, 410, 800, 460, + 570, 930, 560, 950, 810, 700, 700, 430, 590, 1050) // H + +ROW( 600, 530, 160, 350, 810, 80, 410, 1370, 430, 1080, + 1070, 340, 350, 460, 370, 450, 660, 1180, 440, 580) // I + +ROW( 660, 300, 770, 910, 250, 540, 800, 430, 1330, 440, + 570, 860, 620, 980, 1130, 740, 760, 480, 240, 430) // K + +ROW( 560, 490, 130, 330, 970, 110, 460, 1080, 440, 1350, + 1120, 300, 430, 540, 430, 420, 540, 950, 580, 670) // L + +ROW( 660, 590, 280, 470, 920, 260, 570, 1070, 570, 1120, + 1540, 420, 360, 660, 490, 550, 670, 920, 560, 650) // M + +ROW( 660, 470, 1040, 830, 310, 760, 930, 340, 860, 300, + 420, 1430, 560, 830, 740, 890, 810, 400, 230, 560) // N + +ROW( 770, 260, 590, 640, 200, 480, 560, 350, 620, 430, + 360, 560, 1740, 700, 570, 780, 740, 460, 40, 300) // P + +ROW( 710, 360, 
840, 1010, 370, 570, 950, 460, 980, 540, + 660, 830, 700, 1340, 950, 760, 740, 510, 380, 490) // Q + +ROW( 620, 430, 620, 750, 250, 570, 810, 370, 1130, 430, + 490, 740, 570, 950, 1490, 690, 690, 440, 540, 490) // R + +ROW( 940, 760, 780, 750, 330, 770, 700, 450, 740, 420, + 550, 890, 780, 760, 690, 1220, 1010, 580, 310, 500) // S + +ROW( 830, 640, 720, 700, 420, 540, 700, 660, 760, 540, + 670, 810, 740, 740, 690, 1010, 1280, 780, 240, 460) // T + +ROW( 790, 720, 290, 480, 700, 260, 430, 1180, 480, 950, + 920, 400, 460, 510, 440, 580, 780, 1310, 330, 540) // V + +ROW( 230, 570, 0, 140, 1100, 230, 590, 440, 240, 580, + 560, 230, 40, 380, 540, 310, 240, 330, 2360, 1160) // W + +ROW( 410, 650, 360, 340, 1340, 180, 1050, 580, 430, 670, + 650, 560, 300, 490, 490, 500, 460, 540, 1160, 1780) // Y + }; + +double Gonnet16[21][21] = + { +// A C D E F G H I K L +// M N P Q R S T V W Y +ROW( 124, 81, 67, 74, 40, 80, 60, 60, 66, 56, + 66, 66, 77, 71, 62, 94, 83, 79, 23, 41) // A + +ROW( 81, 213, 25, 26, 60, 44, 55, 53, 30, 49, + 59, 47, 26, 36, 43, 76, 64, 72, 57, 65) // C + +ROW( 67, 25, 148, 112, 8, 71, 77, 16, 77, 13, + 28, 104, 59, 84, 62, 78, 72, 29, 0, 36) // D + +ROW( 74, 26, 112, 137, 16, 57, 77, 35, 91, 33, + 47, 83, 64, 101, 75, 75, 70, 48, 14, 34) // E + +ROW( 40, 60, 8, 16, 169, 2, 71, 81, 25, 97, + 92, 31, 20, 37, 25, 33, 42, 70, 110, 134) // F + +ROW( 80, 44, 71, 57, 2, 160, 51, 8, 54, 11, + 26, 76, 48, 57, 57, 77, 54, 26, 23, 18) // G + +ROW( 60, 55, 77, 77, 71, 51, 171, 41, 80, 46, + 57, 93, 56, 95, 81, 70, 70, 43, 59, 105) // H + +ROW( 60, 53, 16, 35, 81, 8, 41, 137, 43, 108, + 107, 34, 35, 46, 37, 45, 66, 118, 44, 58) // I + +ROW( 66, 30, 77, 91, 25, 54, 80, 43, 133, 44, + 57, 86, 62, 98, 113, 74, 76, 48, 24, 43) // K + +ROW( 56, 49, 13, 33, 97, 11, 46, 108, 44, 135, + 112, 30, 43, 54, 43, 42, 54, 95, 58, 67) // L + +ROW( 66, 59, 28, 47, 92, 26, 57, 107, 57, 112, + 154, 42, 36, 66, 49, 55, 67, 92, 56, 65) // M + +ROW( 66, 47, 104, 83, 31, 76, 93, 34, 86, 30, + 
42, 143, 56, 83, 74, 89, 81, 40, 23, 56) // N + +ROW( 77, 26, 59, 64, 20, 48, 56, 35, 62, 43, + 36, 56, 174, 70, 57, 78, 74, 46, 4, 30) // P + +ROW( 71, 36, 84, 101, 37, 57, 95, 46, 98, 54, + 66, 83, 70, 134, 95, 76, 74, 51, 38, 49) // Q + +ROW( 62, 43, 62, 75, 25, 57, 81, 37, 113, 43, + 49, 74, 57, 95, 149, 69, 69, 44, 54, 49) // R + +ROW( 94, 76, 78, 75, 33, 77, 70, 45, 74, 42, + 55, 89, 78, 76, 69, 122, 101, 58, 31, 50) // S + +ROW( 83, 64, 72, 70, 42, 54, 70, 66, 76, 54, + 67, 81, 74, 74, 69, 101, 128, 78, 24, 46) // T + +ROW( 79, 72, 29, 48, 70, 26, 43, 118, 48, 95, + 92, 40, 46, 51, 44, 58, 78, 131, 33, 54) // V + +ROW( 23, 57, 0, 14, 110, 23, 59, 44, 24, 58, + 56, 23, 4, 38, 54, 31, 24, 33, 236, 116) // W + +ROW( 41, 65, 36, 34, 134, 18, 105, 58, 43, 67, + 65, 56, 30, 49, 49, 50, 46, 54, 116, 178) // Y + }; + +static double Gonnet250[20][20] = + { +// A C D E F G H I K L +// M N P Q R S T V W Y +ROW( 760, 570, 490, 520, 290, 570, 440, 440, 480, 400, + 450, 490, 550, 500, 460, 630, 580, 530, 160, 300) // A + +ROW( 570, 1670, 200, 220, 440, 320, 390, 410, 240, 370, + 430, 340, 210, 280, 300, 530, 470, 520, 420, 470) // C + +ROW( 490, 200, 990, 790, 70, 530, 560, 140, 570, 120, + 220, 740, 450, 610, 490, 570, 520, 230, 0, 240) // D + +ROW( 520, 220, 790, 880, 130, 440, 560, 250, 640, 240, + 320, 610, 470, 690, 560, 540, 510, 330, 90, 250) // E + +ROW( 290, 440, 70, 130, 1220, 0, 510, 620, 190, 720, + 680, 210, 140, 260, 200, 240, 300, 530, 880, 1030) // F + +ROW( 570, 320, 530, 440, 0, 1180, 380, 70, 410, 80, + 170, 560, 360, 420, 420, 560, 410, 190, 120, 120) // G + +ROW( 440, 390, 560, 560, 510, 380, 1120, 300, 580, 330, + 390, 640, 410, 640, 580, 500, 490, 320, 440, 740) // H + +ROW( 440, 410, 140, 250, 620, 70, 300, 920, 310, 800, + 770, 240, 260, 330, 280, 340, 460, 830, 340, 450) // I + +ROW( 480, 240, 570, 640, 190, 410, 580, 310, 840, 310, + 380, 600, 460, 670, 790, 530, 530, 350, 170, 310) // K + +ROW( 400, 370, 120, 240, 720, 80, 330, 800, 310, 920, + 
800, 220, 290, 360, 300, 310, 390, 700, 450, 520) // L + +ROW( 450, 430, 220, 320, 680, 170, 390, 770, 380, 800, + 950, 300, 280, 420, 350, 380, 460, 680, 420, 500) // M + +ROW( 490, 340, 740, 610, 210, 560, 640, 240, 600, 220, + 300, 900, 430, 590, 550, 610, 570, 300, 160, 380) // N + +ROW( 550, 210, 450, 470, 140, 360, 410, 260, 460, 290, + 280, 430, 1280, 500, 430, 560, 530, 340, 20, 210) // P + +ROW( 500, 280, 610, 690, 260, 420, 640, 330, 670, 360, + 420, 590, 500, 790, 670, 540, 520, 370, 250, 350) // Q + +ROW( 460, 300, 490, 560, 200, 420, 580, 280, 790, 300, + 350, 550, 430, 670, 990, 500, 500, 320, 360, 340) // R + +ROW( 630, 530, 570, 540, 240, 560, 500, 340, 530, 310, + 380, 610, 560, 540, 500, 740, 670, 420, 190, 330) // S + +ROW( 580, 470, 520, 510, 300, 410, 490, 460, 530, 390, + 460, 570, 530, 520, 500, 670, 770, 520, 170, 330) // T + +ROW( 530, 520, 230, 330, 530, 190, 320, 830, 350, 700, + 680, 300, 340, 370, 320, 420, 520, 860, 260, 410) // V + +ROW( 160, 420, 0, 90, 880, 120, 440, 340, 170, 450, + 420, 160, 20, 250, 360, 190, 170, 260, 1940, 930) // W + +ROW( 300, 470, 240, 250, 1030, 120, 740, 450, 310, 520, + 500, 380, 210, 350, 340, 330, 330, 410, 930, 1300) // Y + }; + +static double Gonnet350[20][20] = + { +// A C D E F G H I K L +// M N P Q R S T V W Y +ROW( 450, 390, 350, 360, 210, 400, 310, 310, 340, 280, + 310, 350, 380, 350, 330, 410, 390, 350, 110, 210) // A + +ROW( 390, 1280, 160, 180, 320, 230, 270, 300, 190, 280, + 310, 240, 170, 210, 220, 360, 330, 370, 310, 340) // C + +ROW( 350, 160, 640, 540, 50, 390, 400, 110, 410, 100, + 160, 500, 330, 430, 370, 400, 370, 170, 0, 170) // D + +ROW( 360, 180, 540, 550, 100, 330, 390, 180, 440, 170, + 220, 440, 350, 460, 410, 380, 360, 230, 60, 180) // E + +ROW( 210, 320, 50, 100, 860, 0, 360, 460, 140, 530, + 490, 150, 100, 190, 150, 170, 220, 400, 700, 770) // F + +ROW( 400, 230, 390, 330, 0, 860, 280, 60, 310, 50, + 120, 400, 280, 310, 310, 400, 300, 140, 50, 80) // G + +ROW( 310, 270, 400, 
390, 360, 280, 680, 220, 400, 240, + 270, 430, 300, 420, 410, 350, 340, 240, 320, 500) // H + +ROW( 310, 300, 110, 180, 460, 60, 220, 620, 220, 570, + 540, 170, 190, 240, 200, 240, 320, 570, 260, 340) // I + +ROW( 340, 190, 410, 440, 140, 310, 400, 220, 530, 210, + 260, 420, 330, 450, 530, 370, 370, 250, 120, 210) // K + +ROW( 280, 280, 100, 170, 530, 50, 240, 570, 210, 630, + 560, 160, 200, 240, 210, 220, 280, 510, 340, 400) // L + +ROW( 310, 310, 160, 220, 490, 120, 270, 540, 260, 560, + 580, 210, 210, 280, 240, 260, 310, 490, 320, 370) // M + +ROW( 350, 240, 500, 440, 150, 400, 430, 170, 420, 160, + 210, 550, 320, 410, 390, 410, 390, 220, 110, 250) // N + +ROW( 380, 170, 330, 350, 100, 280, 300, 190, 330, 200, + 210, 320, 910, 350, 310, 390, 370, 240, 10, 150) // P + +ROW( 350, 210, 430, 460, 190, 310, 420, 240, 450, 240, + 280, 410, 350, 470, 450, 370, 360, 260, 160, 240) // Q + +ROW( 330, 220, 370, 410, 150, 310, 410, 200, 530, 210, + 240, 390, 310, 450, 630, 360, 350, 230, 230, 230) // R + +ROW( 410, 360, 400, 380, 170, 400, 350, 240, 370, 220, + 260, 410, 390, 370, 360, 450, 430, 290, 130, 230) // S + +ROW( 390, 330, 370, 360, 220, 300, 340, 320, 370, 280, + 310, 390, 370, 360, 350, 430, 460, 350, 120, 230) // T + +ROW( 350, 370, 170, 230, 400, 140, 240, 570, 250, 510, + 490, 220, 240, 260, 230, 290, 350, 560, 210, 310) // V + +ROW( 110, 310, 0, 60, 700, 50, 320, 260, 120, 340, + 320, 110, 10, 160, 230, 130, 120, 210, 1590, 740) // W + +ROW( 210, 340, 170, 180, 770, 80, 500, 340, 210, 400, + 370, 250, 150, 240, 230, 230, 230, 310, 740, 920) // Y + }; + +const t_ROW *GetGonnetMatrix(unsigned N) + { + switch (N) + { + case 80: + return Gonnet80; + case 120: + return Gonnet120; + //case 16: + // return Gonnet16; + //case 160: + // return Gonnet160; + case 250: + return Gonnet250; + case 350: + return Gonnet350; + } + Quit("Invalid Gonnet%u", N); + return 0; + } + +//SCORE GetGonnetGapOpen(unsigned N) +// { +// switch (N) +// { +// case 80: +// return -639; +// 
case 120: +// return -863; +// case 160: +// return -611; +// case 250: +// return -308; +// case 350: +// return -158; +// } +// Quit("Invalid Gonnet%u", N); +// return 0; +// } + +SCORE GetGonnetGapOpen(unsigned N) + { + switch (N) + { + case 80: + return -1000; + case 120: + return -800; + case 160: + return -700; + case 250: + return -200; + case 350: + return -175; + } + Quit("Invalid Gonnet%u", N); + return 0; + } + +SCORE GetGonnetGapExtend(unsigned N) + { + switch (N) + { + case 80: + return 350; + case 120: + return 200; + case 160: + return 175; + case 250: + return 20; + case 350: + return 20; + } + Quit("Invalid Gonnet%u", N); + return 0; + } + +//double GonnetLookup[400][400]; +// +//static bool InitGonnetLookup() +// { +// for (unsigned i = 0; i < 400; ++i) +// { +// const unsigned A1 = i/20; +// const unsigned A2 = i%20; +// for (unsigned j = 0; j <= i; ++j) +// { +// const unsigned B1 = j/20; +// const unsigned B2 = j%20; +// +// const double s00 = Gonnet16[A1][B1]; +// const double s01 = Gonnet16[A1][B2]; +// const double s10 = Gonnet16[A2][B1]; +// const double s11 = Gonnet16[A2][B2]; +// +// GonnetLookup[i][j] = GonnetLookup[j][i] = (s00 + s01 + s10 + s11)/4; +// } +// } +// return true; +// } +// +//static bool bGonnetLookupInitialized = InitGonnetLookup(); diff --git a/src/muscle/muscle3.8.31/src/gonnet.h b/src/muscle/muscle3.8.31/src/gonnet.h new file mode 100644 index 0000000..2e55c4a --- /dev/null +++ b/src/muscle/muscle3.8.31/src/gonnet.h @@ -0,0 +1,12 @@ +#ifndef Gonnet_h +#define Gonnet_h + +typedef double t_ROW[20]; + +const t_ROW *GetGonnetMatrix(unsigned N); +SCORE GetGonnetGapOpen(unsigned N); +SCORE GetGonnetGapExtend(unsigned N); + +extern double GonnetLookup[400][400]; + +#endif // Gonnet_h diff --git a/src/muscle/muscle3.8.31/src/gotowt.cpp b/src/muscle/muscle3.8.31/src/gotowt.cpp new file mode 100644 index 0000000..e69de29 diff --git a/src/muscle/muscle3.8.31/src/henikoffweight.cpp b/src/muscle/muscle3.8.31/src/henikoffweight.cpp 
new file mode 100644 index 0000000..53f3b34 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/henikoffweight.cpp @@ -0,0 +1,84 @@ +#include "muscle.h" +#include "msa.h" + +/*** +Compute Henikoff weights. +Steven Henikoff and Jorja G. Henikoff (1994), Position-based sequence weights. +J. Mol. Biol., 243(4):574-578. + +Award each different residue an equal share of the weight, and then to divide up +that weight equally among the sequences sharing the same residue. So if in a +position of a multiple alignment, r different residues are represented, a residue +represented in only one sequence contributes a score of 1/r to that sequence, whereas a +residue represented in s sequences contributes a score of 1/rs to each of the s +sequences. For each sequence, the contributions from each position are summed to give +a sequence weight. + +See also HenikoffWeightPB. +***/ + +void MSA::CalcHenikoffWeightsCol(unsigned uColIndex) const + { + const unsigned uSeqCount = GetSeqCount(); + +// Compute letter counts in this column + unsigned uLetterCount[MAX_ALPHA]; + memset(uLetterCount, 0, sizeof(uLetterCount)); + unsigned uDifferentLetterCount = 0; + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex); + if (uLetter >= 20) + continue; + unsigned uNewCount = uLetterCount[uLetter] + 1; + uLetterCount[uLetter] = uNewCount; + if (1 == uNewCount) + ++uDifferentLetterCount; + } + +// Compute weight contributions + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex); + if (uLetter >= 20) + continue; + const unsigned uCount = uLetterCount[uLetter]; + unsigned uDenom = uCount*uDifferentLetterCount; + if (uDenom == 0) + continue; + m_Weights[uSeqIndex] += (WEIGHT) (1.0/uDenom); + } + } + +void MSA::SetHenikoffWeights() const + { + const unsigned uColCount = GetColCount(); + const unsigned uSeqCount = GetSeqCount(); + + if (0 == uSeqCount) + return; + 
else if (1 == uSeqCount) + { + m_Weights[0] = (WEIGHT) 1.0; + return; + } + else if (2 == uSeqCount) + { + m_Weights[0] = (WEIGHT) 0.5; + m_Weights[1] = (WEIGHT) 0.5; + return; + } + + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + m_Weights[uSeqIndex] = 0.0; + + for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) + CalcHenikoffWeightsCol(uColIndex); + +// Set all-gap seqs weight to 0 + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + if (IsGapSeq(uSeqIndex)) + m_Weights[uSeqIndex] = 0.0; + + Normalize(m_Weights, uSeqCount); + } diff --git a/src/muscle/muscle3.8.31/src/henikoffweightpb.cpp b/src/muscle/muscle3.8.31/src/henikoffweightpb.cpp new file mode 100644 index 0000000..5b31315 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/henikoffweightpb.cpp @@ -0,0 +1,124 @@ +#include "muscle.h" +#include "msa.h" + +/*** +Compute Henikoff weights. +Steven Henikoff and Jorja G. Henikoff (1994), Position-based sequence weights. +J. Mol. Biol., 243(4):574-578. + +Award each different residue an equal share of the weight, and then to divide up +that weight equally among the sequences sharing the same residue. So if in a +position of a multiple alignment, r different residues are represented, a residue +represented in only one sequence contributes a score of 1/r to that sequence, whereas a +residue represented in s sequences contributes a score of 1/rs to each of the s +sequences. For each sequence, the contributions from each position are summed to give +a sequence weight. + +Here we use the variant from PSI-BLAST, which (a) treats gaps as a 21st letter, +and (b) ignores columns that are perfectly conserved. 
+ +>>> WARNING -- I SUSPECT THIS DOESN'T WORK CORRECTLY <<< +***/ + +void MSA::CalcHenikoffWeightsColPB(unsigned uColIndex) const + { + const unsigned uSeqCount = GetSeqCount(); + +// Compute letter counts in this column + unsigned uLetterCount[MAX_ALPHA+1]; + memset(uLetterCount, 0, (MAX_ALPHA+1)*sizeof(unsigned)); + unsigned uLetter; + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex)) + uLetter = MAX_ALPHA; + else + uLetter = GetLetter(uSeqIndex, uColIndex); + ++(uLetterCount[uLetter]); + } + +// Check for special case of perfect conservation + for (unsigned uLetter = 0; uLetter < MAX_ALPHA+1; ++uLetter) + { + unsigned uCount = uLetterCount[uLetter]; + if (uCount > 0) + { + // Perfectly conserved? + if (uCount == uSeqCount) + return; + else + // If count > 0 but less than nr. sequences, can't be conserved + break; + } + } + +// Compute weight contributions + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + unsigned uLetter; + if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex)) + uLetter = MAX_ALPHA; + else + uLetter = GetLetter(uSeqIndex, uColIndex); + const unsigned uCount = uLetterCount[uLetter]; + m_Weights[uSeqIndex] += (WEIGHT) (1.0/uCount); + } + } + +bool MSA::IsGapSeq(unsigned uSeqIndex) const + { + const unsigned uColCount = GetColCount(); + for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) + if (!IsGap(uSeqIndex, uColIndex)) + return false; + return true; + } + +void MSA::SetUniformWeights() const + { + const unsigned uSeqCount = GetSeqCount(); + if (0 == uSeqCount) + return; + + const WEIGHT w = (WEIGHT) (1.0 / uSeqCount); + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + m_Weights[uSeqIndex] = w; + } + +void MSA::SetHenikoffWeightsPB() const + { + const unsigned uColCount = GetColCount(); + const unsigned uSeqCount = GetSeqCount(); + + if (0 == uSeqCount) + return; + else if (1 == uSeqCount) 
+ { + m_Weights[0] = 1.0; + return; + } + else if (2 == uSeqCount) + { + m_Weights[0] = (WEIGHT) 0.5; + m_Weights[1] = (WEIGHT) 0.5; + return; + } + + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + m_Weights[uSeqIndex] = 0.0; + + for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) + CalcHenikoffWeightsColPB(uColIndex); + +// Set all-gap seqs weight to 0 + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + if (IsGapSeq(uSeqIndex)) + m_Weights[uSeqIndex] = 0.0; + +// Check for special case of identical sequences, which will cause all +// columns to be skipped becasue they're perfectly conserved. + if (VectorIsZero(m_Weights, uSeqCount)) + VectorSet(m_Weights, uSeqCount, 1.0); + + Normalize(m_Weights, uSeqCount); + } diff --git a/src/muscle/muscle3.8.31/src/html.cpp b/src/muscle/muscle3.8.31/src/html.cpp new file mode 100644 index 0000000..8ef9438 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/html.cpp @@ -0,0 +1,136 @@ +#include "muscle.h" +#include +#include +#include "msa.h" +#include "textfile.h" + +const unsigned uCharsPerLine = 60; +const int MIN_NAME = 10; +const int MAX_NAME = 32; + +extern void AssignColors(const MSA &a, int **Colors); + +static int **MakeColors(const MSA &a) + { + const unsigned uSeqCount = a.GetSeqCount(); + const unsigned uColCount = a.GetColCount(); + + int **Colors = new int *[uSeqCount]; + for (unsigned i = 0; i < uSeqCount; ++i) + { + Colors[i] = new int[uColCount]; + memset(Colors[i], 0, uColCount*sizeof(int)); + } + AssignColors(a, Colors); + return Colors; + } + +static void ChangeColor(TextFile &File, int From, int To) + { + if (From == To) + return; + +#define COLOR_WHITE "FFFFFF" +#define COLOR_GRAY "C0C0C0" +#define COLOR_BLACK "000000" +#define COLOR_RED "FF0000" +#define COLOR_GREEN "00FF00" +#define COLOR_BLUE "5590FF" +#define COLOR_LIGHTBLUE "77FFFF" + +#define X(c) File.PutString(""); + switch (To) + { + case 0: + X(COLOR_WHITE) + break; + case 1: + X(COLOR_GRAY) + break; + 
case 2: + X(COLOR_BLUE) + break; + case 3: + X(COLOR_LIGHTBLUE) + break; + } + } + +#define COLOR_WINDOW "FFEEE0" + +void MSA::ToHTMLFile(TextFile &File) const + { + File.PutString("\n"); + File.PutString("\n"); + File.PutString("
");
+
+	const unsigned uSeqCount = GetSeqCount();
+	const unsigned uColCount = GetColCount();
+
+	// Color class for every (sequence, column) cell. MakeColors allocates the
+	// table with new[]; it is released below once the last row is written.
+	int **Colors = MakeColors(*this);
+
+	// Width of the label column: length of the longest name up to its first
+	// blank, clamped to the range MIN_NAME..MAX_NAME.
+	int iLongestNameLength = 0;
+	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+		{
+		const char *ptrName = GetSeqName(uSeqIndex);
+		const char *ptrBlank = strchr(ptrName, ' ');
+		int iLength;
+		if (0 != ptrBlank)
+			iLength = (int) (ptrBlank - ptrName);
+		else
+			iLength = (int) strlen(ptrName);
+		if (iLength > iLongestNameLength)
+			iLongestNameLength = iLength;
+		}
+	if (iLongestNameLength > MAX_NAME)
+		iLongestNameLength = MAX_NAME;
+	if (iLongestNameLength < MIN_NAME)
+		iLongestNameLength = MIN_NAME;
+
+	// Number of uCharsPerLine-wide blocks. An alignment with no columns gets
+	// no blocks: evaluating (0 - 1)/uCharsPerLine in unsigned arithmetic
+	// would wrap around and send the loops below far outside the color table.
+	const unsigned uLineCount =
+	  (0 == uColCount) ? 0 : (uColCount - 1)/uCharsPerLine + 1;
+	int CurrentColor = -1;
+	for (unsigned uLineIndex = 0; uLineIndex < uLineCount; ++uLineIndex)
+		{
+		File.PutString("\n");
+		// Inclusive column range covered by this block; the last block may
+		// be shorter than uCharsPerLine.
+		unsigned uStartColIndex = uLineIndex*uCharsPerLine;
+		unsigned uEndColIndex = uStartColIndex + uCharsPerLine - 1;
+		if (uEndColIndex >= uColCount)
+			uEndColIndex = uColCount - 1;
+		char Name[MAX_NAME+1];
+		for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+			{
+			// Label = name up to the first blank, truncated to MAX_NAME and
+			// space-padded to the common column width.
+			const char *ptrName = GetSeqName(uSeqIndex);
+			const char *ptrBlank = strchr(ptrName, ' ');
+			int iLength;
+			if (0 != ptrBlank)
+				iLength = (int) (ptrBlank - ptrName);
+			else
+				iLength = (int) strlen(ptrName);
+			if (iLength > MAX_NAME)
+				iLength = MAX_NAME;
+			memset(Name, ' ', MAX_NAME);
+			memcpy(Name, ptrName, iLength);
+			Name[iLongestNameLength] = 0;
+
+			// Reset per row so the first residue always triggers a color
+			// switch (ChangeColor does nothing when From == To).
+			CurrentColor = -1;
+			// NOTE(review): the empty string literals written around the
+			// name look like markup that was lost from this copy of the
+			// file -- confirm against the upstream source.
+			File.PutString("");
+			File.PutFormat("%s      ", Name);
+			File.PutString("");
+			for (unsigned uColIndex = uStartColIndex; uColIndex <= uEndColIndex;
+			  ++uColIndex)
+				{
+				const int Color = Colors[uSeqIndex][uColIndex];
+				ChangeColor(File, CurrentColor, Color);
+				CurrentColor = Color;
+				const char c = GetChar(uSeqIndex, uColIndex);
+				// Color 0 cells are written in lower case, all others in
+				// upper case. The <ctype.h> functions require a value
+				// representable as unsigned char, hence the cast.
+				if (Color == 0)
+					File.PutFormat("%c", tolower((unsigned char) c));
+				else
+					File.PutFormat("%c", toupper((unsigned char) c));
+				}
+			File.PutString("\n");
+			}
+		}
+
+	// Release the color table; nothing after this point reads it.
+	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+		delete[] Colors[uSeqIndex];
+	delete[] Colors;
+
+	File.PutString("\n");
+	File.PutString("
\n"); + File.PutString("\n"); + File.PutString("\n"); + } diff --git a/src/muscle/muscle3.8.31/src/hydro.cpp b/src/muscle/muscle3.8.31/src/hydro.cpp new file mode 100644 index 0000000..30167bd --- /dev/null +++ b/src/muscle/muscle3.8.31/src/hydro.cpp @@ -0,0 +1,42 @@ +#include "muscle.h" +#include "profile.h" + +extern void TomHydro(ProfPos *Prof, unsigned Length); + +// Apply hydrophobicity heuristic to a profile +void Hydro(ProfPos *Prof, unsigned uLength) + { + if (ALPHA_Amino != g_Alpha) + return; + + if (g_bTomHydro) + { + TomHydro(Prof, uLength); + return; + } + + if (0 == g_uHydrophobicRunLength) + return; + + if (uLength <= g_uHydrophobicRunLength) + return; + + unsigned uRunLength = 0; + unsigned L2 = g_uHydrophobicRunLength/2; + for (unsigned uColIndex = L2; uColIndex < uLength - L2; ++uColIndex) + { + ProfPos &PP = Prof[uColIndex]; + bool bHydro = IsHydrophobic(PP.m_fcCounts); + if (bHydro) + { + ++uRunLength; + if (uRunLength >= g_uHydrophobicRunLength) + { + Prof[uColIndex-L2].m_scoreGapOpen *= (SCORE) g_dHydroFactor; + Prof[uColIndex-L2].m_scoreGapClose *= (SCORE) g_dHydroFactor; + } + } + else + uRunLength = 0; + } + } diff --git a/src/muscle/muscle3.8.31/src/intmath.cpp b/src/muscle/muscle3.8.31/src/intmath.cpp new file mode 100644 index 0000000..40c25bb --- /dev/null +++ b/src/muscle/muscle3.8.31/src/intmath.cpp @@ -0,0 +1,354 @@ +#include "muscle.h" +#include + +PROB ScoreToProb(SCORE Score) + { + if (MINUS_INFINITY >= Score) + return 0.0; + return (PROB) pow(2.0, (double) Score/INTSCALE); + } + +//#if 0 +//static const double log2e = log2(exp(1.0)); +// +//double lnTolog2(double ln) +// { +// return ln*log2e; +// } +// +//double log2(double x) +// { +// if (0 == x) +// return MINUS_INFINITY; +// +// static const double dInvLn2 = 1.0/log(2.0); +//// Multiply by inverse of log(2) just in case multiplication +//// is faster than division. 
+// return log(x)*dInvLn2; +// } +//#endif + +//SCORE ProbToScore(PROB Prob) +// { +// if (0.0 == Prob) +// return MINUS_INFINITY; +//// return (SCORE) floor(INTSCALE*log2(Prob)); +// return (SCORE) log2(Prob); +// } + +WEIGHT DoubleToWeight(double d) + { + assert(d >= 0); + return (WEIGHT) (INTSCALE*d); + } + +double WeightToDouble(WEIGHT w) + { + return (double) w / (double) INTSCALE; + } + +SCORE DoubleToScore(double d) + { + return (SCORE)(d*(double) INTSCALE); + } + +bool ScoreEq(SCORE s1, SCORE s2) + { + return BTEq(s1, s2); + } + +static bool BTEq2(BASETYPE b1, BASETYPE b2) + { + double diff = fabs(b1 - b2); + if (diff < 0.0001) + return true; + double sum = fabs(b1) + fabs(b2); + return diff/sum < 0.005; + } + +bool BTEq(double b1, double b2) + { + return BTEq2((BASETYPE) b1, (BASETYPE) b2); + } + +//const double dLn2 = log(2.0); + +//// pow2(x)=2^x +//double pow2(double x) +// { +// if (MINUS_INFINITY == x) +// return 0; +// return exp(x*dLn2); +// } + +//// lp2(x) = log2(1 + 2^-x), x >= 0 +//double lp2(double x) +// { +// return log2(1 + pow2(-x)); +// } + +// SumLog(x, y) = log2(2^x + 2^y) +//SCORE SumLog(SCORE x, SCORE y) +// { +// return (SCORE) log2(pow2(x) + pow2(y)); +// } +// +//// SumLog(x, y, z) = log2(2^x + 2^y + 2^z) +//SCORE SumLog(SCORE x, SCORE y, SCORE z) +// { +// return (SCORE) log2(pow2(x) + pow2(y) + pow2(z)); +// } +// +//// SumLog(w, x, y, z) = log2(2^w + 2^x + 2^y + 2^z) +//SCORE SumLog(SCORE w, SCORE x, SCORE y, SCORE z) +// { +// return (SCORE) log2(pow2(w) + pow2(x) + pow2(y) + pow2(z)); +// } + +//SCORE lp2Fast(SCORE x) +// { +// assert(x >= 0); +// const int iTableSize = 1000; +// const double dRange = 20.0; +// const double dScale = dRange/iTableSize; +// static SCORE dValue[iTableSize]; +// static bool bInit = false; +// if (!bInit) +// { +// for (int i = 0; i < iTableSize; ++i) +// dValue[i] = (SCORE) lp2(i*dScale); +// bInit = true; +// } +// if (x >= dRange) +// return 0.0; +// int i = (int) (x/dScale); +// assert(i >= 0 && 
i < iTableSize); +// SCORE dResult = dValue[i]; +// assert(BTEq(dResult, lp2(x))); +// return dResult; +// } +// +//// SumLog(x, y) = log2(2^x + 2^y) +//SCORE SumLogFast(SCORE x, SCORE y) +// { +// if (MINUS_INFINITY == x) +// { +// if (MINUS_INFINITY == y) +// return MINUS_INFINITY; +// return y; +// } +// else if (MINUS_INFINITY == y) +// return x; +// +// SCORE dResult; +// if (x > y) +// dResult = x + lp2Fast(x-y); +// else +// dResult = y + lp2Fast(y-x); +// assert(SumLog(x, y) == dResult); +// return dResult; +// } +// +//SCORE SumLogFast(SCORE x, SCORE y, SCORE z) +// { +// SCORE dResult = SumLogFast(x, SumLogFast(y, z)); +// assert(SumLog(x, y, z) == dResult); +// return dResult; +// } + +//SCORE SumLogFast(SCORE w, SCORE x, SCORE y, SCORE z) +// { +// SCORE dResult = SumLogFast(SumLogFast(w, x), SumLogFast(y, z)); +// assert(SumLog(w, x, y, z) == dResult); +// return dResult; +// } + +double VecSum(const double v[], unsigned n) + { + double dSum = 0.0; + for (unsigned i = 0; i < n; ++i) + dSum += v[i]; + return dSum; + } + +void Normalize(PROB p[], unsigned n) + { + unsigned i; + PROB dSum = 0.0; + for (i = 0; i < n; ++i) + dSum += p[i]; + if (0.0 == dSum) + Quit("Normalize, sum=0"); + for (i = 0; i < n; ++i) + p[i] /= dSum; + } + +void NormalizeUnlessZero(PROB p[], unsigned n) + { + unsigned i; + PROB dSum = 0.0; + for (i = 0; i < n; ++i) + dSum += p[i]; + if (0.0 == dSum) + return; + for (i = 0; i < n; ++i) + p[i] /= dSum; + } + +void Normalize(PROB p[], unsigned n, double dRequiredTotal) + { + unsigned i; + double dSum = 0.0; + for (i = 0; i < n; ++i) + dSum += p[i]; + if (0.0 == dSum) + Quit("Normalize, sum=0"); + double dFactor = dRequiredTotal / dSum; + for (i = 0; i < n; ++i) + p[i] *= (PROB) dFactor; + } + +bool VectorIsZero(const double dValues[], unsigned n) + { + for (unsigned i = 0; i < n; ++i) + if (dValues[i] != 0.0) + return false; + return true; + } + +void VectorSet(double dValues[], unsigned n, double d) + { + for (unsigned i = 0; i < n; 
++i) + dValues[i] = d; + } + +bool VectorIsZero(const float dValues[], unsigned n) + { + for (unsigned i = 0; i < n; ++i) + if (dValues[i] != 0.0) + return false; + return true; + } + +void VectorSet(float dValues[], unsigned n, float d) + { + for (unsigned i = 0; i < n; ++i) + dValues[i] = d; + } + +double Correl(const double P[], const double Q[], unsigned uCount) + { + double dSumP = 0.0; + double dSumQ = 0.0; + for (unsigned n = 0; n < uCount; ++n) + { + dSumP += P[n]; + dSumQ += Q[n]; + } + const double dMeanP = dSumP/uCount; + const double dMeanQ = dSumQ/uCount; + + double dSum1 = 0.0; + double dSum2 = 0.0; + double dSum3 = 0.0; + for (unsigned n = 0; n < uCount; ++n) + { + const double dDiffP = P[n] - dMeanP; + const double dDiffQ = Q[n] - dMeanQ; + dSum1 += dDiffP*dDiffQ; + dSum2 += dDiffP*dDiffP; + dSum3 += dDiffQ*dDiffQ; + } + if (0 == dSum1) + return 0; + const double dCorrel = dSum1 / sqrt(dSum2*dSum3); + return dCorrel; + } + +float Correl(const float P[], const float Q[], unsigned uCount) + { + float dSumP = 0.0; + float dSumQ = 0.0; + for (unsigned n = 0; n < uCount; ++n) + { + dSumP += P[n]; + dSumQ += Q[n]; + } + const float dMeanP = dSumP/uCount; + const float dMeanQ = dSumQ/uCount; + + float dSum1 = 0.0; + float dSum2 = 0.0; + float dSum3 = 0.0; + for (unsigned n = 0; n < uCount; ++n) + { + const float dDiffP = P[n] - dMeanP; + const float dDiffQ = Q[n] - dMeanQ; + dSum1 += dDiffP*dDiffQ; + dSum2 += dDiffP*dDiffP; + dSum3 += dDiffQ*dDiffQ; + } + if (0 == dSum1) + return 0; + const float dCorrel = dSum1 / (float) sqrt(dSum2*dSum3); + return dCorrel; + } + +// Simple (but slow) function to compute Pearson ranks +// that allows for ties. Correctness and simplicity +// are priorities over speed here. 
+void Rank(const float P[], float Ranks[], unsigned uCount) + { + for (unsigned n = 0; n < uCount; ++n) + { + unsigned uNumberGreater = 0; + unsigned uNumberEqual = 0; + unsigned uNumberLess = 0; + double dValue = P[n]; + for (unsigned i = 0; i < uCount; ++i) + { + double v = P[i]; + if (v == dValue) + ++uNumberEqual; + else if (v < dValue) + ++uNumberLess; + else + ++uNumberGreater; + } + assert(uNumberEqual >= 1); + assert(uNumberEqual + uNumberLess + uNumberGreater == uCount); + Ranks[n] = (float) (1 + uNumberLess + (uNumberEqual - 1)/2.0); + } + } + +void Rank(const double P[], double Ranks[], unsigned uCount) + { + for (unsigned n = 0; n < uCount; ++n) + { + unsigned uNumberGreater = 0; + unsigned uNumberEqual = 0; + unsigned uNumberLess = 0; + double dValue = P[n]; + for (unsigned i = 0; i < uCount; ++i) + { + double v = P[i]; + if (v == dValue) + ++uNumberEqual; + else if (v < dValue) + ++uNumberLess; + else + ++uNumberGreater; + } + assert(uNumberEqual >= 1); + assert(uNumberEqual + uNumberLess + uNumberGreater == uCount); + Ranks[n] = (double) (1 + uNumberLess + (uNumberEqual - 1)/2.0); + } + } + +FCOUNT SumCounts(const FCOUNT Counts[]) + { + FCOUNT Sum = 0; + for (int i = 0; i < 20; ++i) + Sum += Counts[i]; + return Sum; + } diff --git a/src/muscle/muscle3.8.31/src/intmath.h b/src/muscle/muscle3.8.31/src/intmath.h new file mode 100644 index 0000000..b5f6d35 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/intmath.h @@ -0,0 +1,210 @@ +// IntMath.h: Header for doing fractional math with integers for speed. + +#ifndef IntMath_h +#define IntMath_h + +typedef float BASETYPE; +//typedef double BASETYPE; + +// Scaling factor used to store certain floating point +// values as integers to a few significant figures. +//const int INTSCALE = 1000; +const int INTSCALE = 1; + +// Type for a probability in range 0.0 to 1.0. +typedef BASETYPE PROB; + +// Type for an log-odds integer score. +// Stored as log2(PROB)*INTSCALE. 
+//typedef int SCORE; +typedef BASETYPE SCORE; + +// Type for a weight. +// Stored as w*INTSCALE where w is in range 0.0 to 1.0. +//typedef unsigned WEIGHT; +typedef BASETYPE WEIGHT; + +// Type for a fractional weighted count stored as n*WEIGHT/N +// where n=measured count (integer >= 0) and N is total for +// the distribution (e.g., n=number of residues of a given +// type in a column, N=number of residues in the column). +// Hence values in an FCOUNT variable range from 0..INTSCALE +// as an integer, representing "true" values 0.0 to 1.0. +//typedef unsigned FCOUNT; +typedef BASETYPE FCOUNT; + +// Representation of -infinity. Value should +// be large and negative, but not so large +// that adding a few of them overflows. +// TODO: Multiplied by 10 to work around bug +// when aligning Bali 1ckaA in ref4, which is +// so long that B->Mmax got to -infinity, causing +// traceback to fail. +//const int MINUS_INFINITY = -10000000; +const BASETYPE MINUS_INFINITY = (BASETYPE) -1e37; +const BASETYPE PLUS_INFINITY = (BASETYPE) 1e37; + +// Probability relative to a null model +typedef double RPROB; + +PROB ScoreToProb(SCORE Score); +SCORE ProbToScore(PROB Prob); +SCORE DoubleToScore(double d); +WEIGHT DoubleToWeight(double d); +double WeightToDouble(WEIGHT w); +SCORE MulScoreWeight(SCORE Score, WEIGHT Weight); +bool ScoreEq(SCORE s1, SCORE s2); +bool BTEq(double b1, double b2); + +static double ScoreToDouble(SCORE Score) + { + return (double) Score / (double) INTSCALE; + } + +#if 0 +// In-line assembler for Result = (x*y)/z +// Note that imul and idiv will do 64-bit arithmetic +// on 32-bit operands, so this shouldn't overflow +// Can't write this efficiently in C/C++ (would +// often overlow 32 bits). 
+#define MulDivAssign(Result, x, y, z) \ + { \ + int X = (x); \ + int Y = (y); \ + int Z = (z); \ + _asm mov eax,X \ + _asm imul Y \ + _asm mov ecx,Z \ + _asm idiv ecx \ + _asm mov Result,eax \ + } +#else +#define MulDivAssign(Result, x, y, z) Result = (((x)*(y))/(z)) +#endif + +#define MulScoreWeight(r, s, w) MulDivAssign(r, s, w, INTSCALE) +#define MulWeightWCount(r, wt, wc) MulDivAssign(r, wt, wc, INTSCALE) +#define MulFCountScore(r, fc, sc) MulDivAssign(r, fc, sc, INTSCALE) + +#if _DEBUG + +static inline SCORE Add2(SCORE a, SCORE b) + { + if (MINUS_INFINITY == a) + return MINUS_INFINITY; + if (MINUS_INFINITY == b) + return MINUS_INFINITY; + SCORE sum = a + b; + if (sum < MINUS_INFINITY) + return MINUS_INFINITY; +// assert(sum < OVERFLOW_WARN); + return sum; + } + +static inline SCORE Add3(SCORE a, SCORE b, SCORE c) + { + return Add2(Add2(a, b), c); + } + +static inline SCORE Add4(SCORE a, SCORE b, SCORE c, SCORE d) + { + return Add2(Add2(a, b), Add2(c, d)); + } + +static inline SCORE Add5(SCORE a, SCORE b, SCORE c, SCORE d, SCORE e) + { + return Add3(Add2(a, b), Add2(c, d), e); + } + +static inline SCORE Add6(SCORE a, SCORE b, SCORE c, SCORE d, SCORE e, SCORE f) + { + return Add3(Add2(a, b), Add2(c, d), Add2(e, f)); + } + +static inline SCORE Add7(SCORE a, SCORE b, SCORE c, SCORE d, SCORE e, SCORE f, SCORE g) + { + return Add4(Add2(a, b), Add2(c, d), Add2(e, f), g); + } + +static inline SCORE Mul2(SCORE a, SCORE b) + { + if (MINUS_INFINITY == a) + return MINUS_INFINITY; + if (MINUS_INFINITY == b) + return MINUS_INFINITY; + //__int64 prod = (__int64) a * (__int64) b; + //assert((SCORE) prod == prod); + //return (SCORE) prod; + return a*b; + } + +static inline SCORE Sub2(SCORE a, SCORE b) + { + if (MINUS_INFINITY == a) + return MINUS_INFINITY; + if (MINUS_INFINITY == b) + return MINUS_INFINITY; + SCORE diff = a - b; + if (diff < MINUS_INFINITY) + return MINUS_INFINITY; +// assert(diff < OVERFLOW_WARN); + return diff; + } + +static inline SCORE Div2(SCORE a, int 
b) + { + if (MINUS_INFINITY == a) + return MINUS_INFINITY; + return a/b; + } + +//static inline SCORE MulScoreWeight(SCORE s, WEIGHT w) +// { +// SCORE Prod = s*(SCORE) w; +// assert(Prod < OVERFLOW_WARN); +// extern void Log(const char Format[], ...); +// if (Prod/(SCORE) w != s) +// Log("**WARRNING MulScoreWeight Prod=%d w=%d Prod/w=%d s=%d\n", +// Prod, +// w, +// Prod/(SCORE) w, +// s); +// assert(Prod/ (SCORE) w == s); +// return Prod/INTSCALE; +// } +// +//static inline WCOUNT MulWeightWCount(WEIGHT wt, WCOUNT wc) +// { +// return (wt*wc)/INTSCALE; +// } + +#else +#define Add2(a, b) ((a) + (b)) +#define Sub2(a, b) ((MINUS_INFINITY == (a)) ? MINUS_INFINITY : ((a) - (b))) +#define Div2(a, b) ((MINUS_INFINITY == (a)) ? MINUS_INFINITY : ((a) / (b))) +#define Add3(a, b, c) ((a) + (b) + (c)) +#define Add4(a, b, c, d) ((a) + (b) + (c) + (d)) +#define Add5(a, b, c, d, e) ((a) + (b) + (c) + (d) + (e)) +#define Add6(a, b, c, d, e, f) ((a) + (b) + (c) + (d) + (e) + (f)) +#define Add7(a, b, c, d, e, f, g) ((a) + (b) + (c) + (d) + (e) + (f) + (g)) +//#define MulScoreWeight(s, w) (((s)*(SCORE) (w))/INTSCALE) +#define Mul2(a, b) ((a)*(b)) +#endif + +//static inline SCORE MulFCountScore(FCOUNT fc, SCORE sc) +// { +//// Fast way to say "if (fc >= 2^15 || sc >= 2^15)": +// if ((fc | sc) & 0xffff1000) +// { +// SCORE Score = ((fc+5)/10)*sc; +// assert(Score < assert); +// OVERFLOW_WARN(Score > MINUS_INFINITY); +// return Score/(INTSCALE/10); +// } +// SCORE Score = fc*sc; +// assert(Score < OVERFLOW_WARN); +// assert(Score > MINUS_INFINITY); +// return Score/INTSCALE; +// } + +#endif // IntMath_h diff --git a/src/muscle/muscle3.8.31/src/local.cpp b/src/muscle/muscle3.8.31/src/local.cpp new file mode 100644 index 0000000..c4d7a37 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/local.cpp @@ -0,0 +1,100 @@ +#include "muscle.h" +#include "textfile.h" +#include "msa.h" +#include "profile.h" +#include "pwpath.h" +#include "tree.h" + +#define TRACE 0 + +static void 
MSAFromFileName(const char *FileName, MSA &a) + { + TextFile File(FileName); + a.FromFile(File); + } + +static ProfPos *ProfileFromMSALocal(MSA &msa, Tree &tree) + { + const unsigned uSeqCount = msa.GetSeqCount(); + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + msa.SetSeqId(uSeqIndex, uSeqIndex); + + TreeFromMSA(msa, tree, g_Cluster1, g_Distance1, g_Root1); + SetMuscleTree(tree); + return ProfileFromMSA(msa); + } + +void Local() + { + if (0 == g_pstrFileName1 || 0 == g_pstrFileName2) + Quit("Must specify both -in1 and -in2 for -sw"); + + SetSeqWeightMethod(g_SeqWeight1); + + MSA msa1; + MSA msa2; + + MSAFromFileName(g_pstrFileName1, msa1); + MSAFromFileName(g_pstrFileName2, msa2); + + ALPHA Alpha = ALPHA_Undefined; + switch (g_SeqType) + { + case SEQTYPE_Auto: + Alpha = msa1.GuessAlpha(); + break; + + case SEQTYPE_Protein: + Alpha = ALPHA_Amino; + break; + + case SEQTYPE_DNA: + Alpha = ALPHA_DNA; + break; + + case SEQTYPE_RNA: + Alpha = ALPHA_RNA; + break; + + default: + Quit("Invalid SeqType"); + } + SetAlpha(Alpha); + + msa1.FixAlpha(); + msa2.FixAlpha(); + + if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha) + SetPPScore(PPSCORE_SPN); + + const unsigned uSeqCount1 = msa1.GetSeqCount(); + const unsigned uSeqCount2 = msa2.GetSeqCount(); + const unsigned uMaxSeqCount = (uSeqCount1 > uSeqCount2 ? 
uSeqCount1 : uSeqCount2); + MSA::SetIdCount(uMaxSeqCount); + + unsigned uLength1 = msa1.GetColCount(); + unsigned uLength2 = msa2.GetColCount(); + + Tree tree1; + Tree tree2; + + ProfPos *Prof1 = ProfileFromMSALocal(msa1, tree1); + ProfPos *Prof2 = ProfileFromMSALocal(msa2, tree2); + + PWPath Path; + SW(Prof1, uLength1, Prof2, uLength2, Path); + +#if TRACE + Path.LogMe(); +#endif + + MSA msaOut; + AlignTwoMSAsGivenPathSW(Path, msa1, msa2, msaOut); + +#if TRACE + msaOut.LogMe(); +#endif + + TextFile fileOut(g_pstrOutFileName, true); + msaOut.ToFile(fileOut); + } diff --git a/src/muscle/muscle3.8.31/src/main.cpp b/src/muscle/muscle3.8.31/src/main.cpp new file mode 100644 index 0000000..7993c7b --- /dev/null +++ b/src/muscle/muscle3.8.31/src/main.cpp @@ -0,0 +1,72 @@ +//@@TODO reconcile /muscle with /muscle3.6 + +#include "muscle.h" +#include +#ifdef WIN32 +#include // for SetPriorityClass() +#include // for isatty() +#else +#include // for isatty() +#endif + +const char *MUSCLE_LONG_VERSION = "MUSCLE v" SHORT_VERSION "." +#include "svnversion.h" +" by Robert C. Edgar"; + +int g_argc; +char **g_argv; + +int main(int argc, char **argv) + { +#if WIN32 +// Multi-tasking does not work well in CPU-bound +// console apps running under Win32. +// Reducing the process priority allows GUI apps +// to run responsively in parallel. + SetPriorityClass(GetCurrentProcess(), BELOW_NORMAL_PRIORITY_CLASS); +#endif + g_argc = argc; + g_argv = argv; + + SetNewHandler(); + SetStartTime(); + ProcessArgVect(argc - 1, argv + 1); + SetParams(); + SetLogFile(); + + //extern void TestSubFams(const char *); + //TestSubFams(g_pstrInFileName); + //return 0; + + if (g_bVersion) + { + printf("%s\n", MUSCLE_LONG_VERSION); + exit(EXIT_SUCCESS); + } + + if (!g_bQuiet) + Credits(); + + if (MissingCommand() && isatty(0)) + { + Usage(); + exit(EXIT_SUCCESS); + } + + if (g_bCatchExceptions) + { + try + { + Run(); + } + catch (...) 
+ { + OnException(); + exit(EXIT_Except); + } + } + else + Run(); + + exit(EXIT_Success); + } diff --git a/src/muscle/muscle3.8.31/src/make.err b/src/muscle/muscle3.8.31/src/make.err new file mode 100644 index 0000000..e69de29 diff --git a/src/muscle/muscle3.8.31/src/make.out b/src/muscle/muscle3.8.31/src/make.out new file mode 100644 index 0000000..fc8af25 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/make.out @@ -0,0 +1,2 @@ +g++ -O3 -march=pentiumpro -mcpu=pentiumpro -funroll-loops -Winline -DNDEBUG=1 -o muscle aligngivenpath.o aligngivenpathsw.o aligntwomsas.o aligntwoprofs.o alpha.o anchors.o blosumla.o clust.o cluster.o clwwt.o cons.o diaglist.o difftrees.o difftreese.o distcalc.o distfunc.o domuscle.o dosp.o dpreglist.o edgelist.o enumopts.o enumtostr.o estring.o fasta.o fastclust.o fastdist.o fastdistjones.o fastdistkbit.o fastdistkmer.o fastdistmafft.o fastscorepath2.o finddiags.o glbalign.o glbaligndiag.o glbalignle.o glbalignsimple.o glbalignsp.o globals.o globalslinux.o globalswin32.o gonnet.o gotowt.o henikoffweight.o henikoffweightpb.o hydro.o intmath.o local.o main.o makerootmsa.o mpam200.o msa.o msa2.o msadistkimura.o msf.o objscore.o objscore2.o onexception.o options.o pam200mafft.o params.o phy.o phy2.o phy3.o phy4.o phyfromclust.o phyfromfile.o phytofile.o posgap.o profile.o profilefrommsa.o progalign.o progress.o progressivealign.o pwpath.o realigndiffs.o realigndiffse.o refine.o refinehoriz.o refinesubfams.o refinetree.o refinetreee.o refinevert.o savebest.o scorehistory.o scoremx.o seq.o seqvect.o setblosumweights.o setgscweights.o setnewhandler.o sw.o textfile.o threewaywt.o traceback.o tracebackopt.o tracebacksw.o treefrommsa.o typetostr.o upgma2.o usage.o validateids.o vtml2.o -lm -static +strip muscle diff --git a/src/muscle/muscle3.8.31/src/makerootmsa.cpp b/src/muscle/muscle3.8.31/src/makerootmsa.cpp new file mode 100644 index 0000000..e83c3c1 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/makerootmsa.cpp @@ -0,0 +1,231 @@ +#include 
"muscle.h" +#include "tree.h" +#include "seqvect.h" +#include "profile.h" +#include "msa.h" +#include "pwpath.h" +#include "estring.h" + +#define TRACE 0 +#define VALIDATE 0 + +static void PathSeq(const Seq &s, const PWPath &Path, bool bRight, Seq &sOut) + { + short *esA; + short *esB; + PathToEstrings(Path, &esA, &esB); + + const unsigned uSeqLength = s.Length(); + const unsigned uEdgeCount = Path.GetEdgeCount(); + + sOut.Clear(); + sOut.SetName(s.GetName()); + unsigned uPos = 0; + for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) + { + const PWEdge &Edge = Path.GetEdge(uEdgeIndex); + char cType = Edge.cType; + if (bRight) + { + if (cType == 'I') + cType = 'D'; + else if (cType == 'D') + cType = 'I'; + } + switch (cType) + { + case 'M': + sOut.AppendChar(s[uPos++]); + break; + case 'D': + sOut.AppendChar('-'); + break; + case 'I': + sOut.AppendChar(s[uPos++]); + break; + default: + Quit("PathSeq, invalid edge type %c", cType); + } + } + } + +#if VALIDATE + +static void MakeRootSeq(const Seq &s, const Tree &GuideTree, unsigned uLeafNodeIndex, + const ProgNode Nodes[], Seq &sRoot) + { + sRoot.Copy(s); + unsigned uNodeIndex = uLeafNodeIndex; + for (;;) + { + unsigned uParent = GuideTree.GetParent(uNodeIndex); + if (NULL_NEIGHBOR == uParent) + break; + bool bRight = (GuideTree.GetLeft(uParent) == uNodeIndex); + uNodeIndex = uParent; + const PWPath &Path = Nodes[uNodeIndex].m_Path; + Seq sTmp; + PathSeq(sRoot, Path, bRight, sTmp); + sTmp.SetId(0); + sRoot.Copy(sTmp); + } + } + +#endif // VALIDATE + +static short *MakeRootSeqE(const Seq &s, const Tree &GuideTree, unsigned uLeafNodeIndex, + const ProgNode Nodes[], Seq &sRoot, short *Estring1, short *Estring2) + { + short *EstringCurr = Estring1; + short *EstringNext = Estring2; + + const unsigned uSeqLength = s.Length(); + EstringCurr[0] = uSeqLength; + EstringCurr[1] = 0; + + unsigned uNodeIndex = uLeafNodeIndex; + for (;;) + { + unsigned uParent = GuideTree.GetParent(uNodeIndex); + if (NULL_NEIGHBOR 
== uParent) + break; + bool bRight = (GuideTree.GetLeft(uParent) == uNodeIndex); + uNodeIndex = uParent; + const PWPath &Path = Nodes[uNodeIndex].m_Path; + const short *EstringNode = bRight ? + Nodes[uNodeIndex].m_EstringL : Nodes[uNodeIndex].m_EstringR; + + MulEstrings(EstringCurr, EstringNode, EstringNext); +#if TRACE + Log("\n"); + Log("Curr="); + LogEstring(EstringCurr); + Log("\n"); + Log("Node="); + LogEstring(EstringNode); + Log("\n"); + Log("Prod="); + LogEstring(EstringNext); + Log("\n"); +#endif + short *EstringTmp = EstringNext; + EstringNext = EstringCurr; + EstringCurr = EstringTmp; + } + EstringOp(EstringCurr, s, sRoot); + +#if TRACE + Log("Root estring="); + LogEstring(EstringCurr); + Log("\n"); + Log("Root seq="); + sRoot.LogMe(); +#endif + return EstringCurr; + } + +static unsigned GetFirstNodeIndex(const Tree &tree) + { + if (g_bStable) + return 0; + return tree.FirstDepthFirstNode(); + } + +static unsigned GetNextNodeIndex(const Tree &tree, unsigned uPrevNodeIndex) + { + if (g_bStable) + { + const unsigned uNodeCount = tree.GetNodeCount(); + unsigned uNodeIndex = uPrevNodeIndex; + for (;;) + { + ++uNodeIndex; + if (uNodeIndex >= uNodeCount) + return NULL_NEIGHBOR; + if (tree.IsLeaf(uNodeIndex)) + return uNodeIndex; + } + } + unsigned uNodeIndex = uPrevNodeIndex; + for (;;) + { + uNodeIndex = tree.NextDepthFirstNode(uNodeIndex); + if (NULL_NEIGHBOR == uNodeIndex || tree.IsLeaf(uNodeIndex)) + return uNodeIndex; + } + } + +void MakeRootMSA(const SeqVect &v, const Tree &GuideTree, ProgNode Nodes[], + MSA &a) + { +#if TRACE + Log("MakeRootMSA Tree="); + GuideTree.LogMe(); +#endif + const unsigned uSeqCount = v.GetSeqCount(); + unsigned uColCount = uInsane; + unsigned uSeqIndex = 0; + const unsigned uTreeNodeCount = GuideTree.GetNodeCount(); + const unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex(); + const PWPath &RootPath = Nodes[uRootNodeIndex].m_Path; + const unsigned uRootColCount = RootPath.GetEdgeCount(); + const unsigned uEstringSize = 
uRootColCount + 1; + short *Estring1 = new short[uEstringSize]; + short *Estring2 = new short[uEstringSize]; + SetProgressDesc("Root alignment"); + + unsigned uTreeNodeIndex = GetFirstNodeIndex(GuideTree); + do + { + Progress(uSeqIndex, uSeqCount); + + unsigned uId = GuideTree.GetLeafId(uTreeNodeIndex); + const Seq &s = *(v[uId]); + + Seq sRootE; + short *es = MakeRootSeqE(s, GuideTree, uTreeNodeIndex, Nodes, sRootE, + Estring1, Estring2); + Nodes[uTreeNodeIndex].m_EstringL = EstringNewCopy(es); + +#if VALIDATE + Seq sRoot; + MakeRootSeq(s, GuideTree, uTreeNodeIndex, Nodes, sRoot); + if (!sRoot.Eq(sRootE)) + { + Log("sRoot="); + sRoot.LogMe(); + Log("sRootE="); + sRootE.LogMe(); + Quit("Root seqs differ"); + } +#if TRACE + Log("MakeRootSeq=\n"); + sRoot.LogMe(); +#endif +#endif + + if (uInsane == uColCount) + { + uColCount = sRootE.Length(); + a.SetSize(uSeqCount, uColCount); + } + else + { + assert(uColCount == sRootE.Length()); + } + a.SetSeqName(uSeqIndex, s.GetName()); + a.SetSeqId(uSeqIndex, uId); + for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) + a.SetChar(uSeqIndex, uColIndex, sRootE[uColIndex]); + ++uSeqIndex; + + uTreeNodeIndex = GetNextNodeIndex(GuideTree, uTreeNodeIndex); + } + while (NULL_NEIGHBOR != uTreeNodeIndex); + + delete[] Estring1; + delete[] Estring2; + + ProgressStepsDone(); + assert(uSeqIndex == uSeqCount); + } diff --git a/src/muscle/muscle3.8.31/src/makerootmsab.cpp b/src/muscle/muscle3.8.31/src/makerootmsab.cpp new file mode 100644 index 0000000..24ca2e1 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/makerootmsab.cpp @@ -0,0 +1,62 @@ +#include "muscle.h" +#include "tree.h" +#include "profile.h" +#include "msa.h" +#include "seqvect.h" +#include "pwpath.h" + +static void DoSeq(Seq &s, unsigned uSeqIndex, const ProfPos *RootProf, + unsigned uRootProfLength, MSA &msaOut) + { + MSA msaSeq; + msaSeq.FromSeq(s); + const unsigned uSeqLength = s.Length(); + + MSA msaDummy; + msaDummy.SetSize(1, uRootProfLength); + 
msaDummy.SetSeqId(0, 0); + msaDummy.SetSeqName(0, "Dummy0"); + for (unsigned uColIndex = 0; uColIndex < uRootProfLength; ++uColIndex) + msaDummy.SetChar(0, uColIndex, '?'); + + ProfPos *SeqProf = ProfileFromMSA(msaSeq); + for (unsigned uColIndex = 0; uColIndex < uSeqLength; ++uColIndex) + { + ProfPos &PP = SeqProf[uColIndex]; + PP.m_scoreGapOpen = MINUS_INFINITY; + PP.m_scoreGapClose = MINUS_INFINITY; + } + + ProfPos *ProfOut; + unsigned uLengthOut; + PWPath Path; + AlignTwoProfs(SeqProf, uSeqLength, 1.0, RootProf, uRootProfLength, 1.0, + Path, &ProfOut, &uLengthOut); + assert(uLengthOut = uRootProfLength); + delete[] ProfOut; + + MSA msaCombined; + AlignTwoMSAsGivenPath(Path, msaSeq, msaDummy, msaCombined); + + msaCombined.LogMe(); + msaOut.SetSeqName(uSeqIndex, s.GetName()); + msaOut.SetSeqId(uSeqIndex, s.GetId()); + for (unsigned uColIndex = 0; uColIndex < uRootProfLength; ++uColIndex) + msaOut.SetChar(uSeqIndex, uColIndex, msaCombined.GetChar(0, uColIndex)); + } + +// Steven Brenner's O(NL^2) proposal for creating a root alignment +// Align each sequence to the profile at the root. +// Compare the e-string solution, which is O(NL log N). 
+void MakeRootMSABrenner(SeqVect &v, const Tree &GuideTree, ProgNode Nodes[], + MSA &a) + { + const unsigned uSeqCount = v.Length(); + const unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex(); + const ProfPos *RootProfile = Nodes[uRootNodeIndex].m_Prof; + const unsigned uRootColCount = Nodes[uRootNodeIndex].m_uLength; + a.SetSize(uSeqCount, uRootColCount); + + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + DoSeq(*v[uSeqIndex], uSeqIndex, RootProfile, uRootColCount, a); + } diff --git a/src/muscle/muscle3.8.31/src/maketree.cpp b/src/muscle/muscle3.8.31/src/maketree.cpp new file mode 100644 index 0000000..6411dd1 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/maketree.cpp @@ -0,0 +1,38 @@ +#include "muscle.h" +#include "msa.h" +#include "textfile.h" +#include "tree.h" + +void DoMakeTree() + { + if (g_pstrInFileName == 0 || g_pstrOutFileName == 0) + Quit("-maketree requires -in and -out "); + + SetStartTime(); + + SetSeqWeightMethod(g_SeqWeight1); + + TextFile MSAFile(g_pstrInFileName); + + MSA msa; + msa.FromFile(MSAFile); + + unsigned uSeqCount = msa.GetSeqCount(); + MSA::SetIdCount(uSeqCount); + +// Initialize sequence ids. +// From this point on, ids must somehow propogate from here. + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + msa.SetSeqId(uSeqIndex, uSeqIndex); + SetMuscleInputMSA(msa); + + Progress("%u sequences", uSeqCount); + + Tree tree; + TreeFromMSA(msa, tree, g_Cluster2, g_Distance2, g_Root2); + + TextFile TreeFile(g_pstrOutFileName, true); + tree.ToFile(TreeFile); + + Progress("Tree created"); + } diff --git a/src/muscle/muscle3.8.31/src/mhack.cpp b/src/muscle/muscle3.8.31/src/mhack.cpp new file mode 100644 index 0000000..5a38f90 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/mhack.cpp @@ -0,0 +1,64 @@ +#include "muscle.h" +#include "seqvect.h" +#include "msa.h" + +/*** +Methionine hack. +Most proteins start with M. 
+This results in odd-looking alignments with the terminal Ms aligned followed +immediately by gaps. +Hack this by treating terminal M like X. +***/ + +static bool *M; + +void MHackStart(SeqVect &v) + { + if (ALPHA_Amino != g_Alpha) + return; + + const unsigned uSeqCount = v.Length(); + M = new bool[uSeqCount]; + memset(M, 0, uSeqCount*sizeof(bool)); + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + Seq &s = v.GetSeq(uSeqIndex); + if (0 == s.Length()) + continue; + unsigned uId = s.GetId(); + if (s[0] == 'M' || s[0] == 'm') + { + M[uId] = true; + s[0] = 'X'; + } + } + } + +void MHackEnd(MSA &msa) + { + if (ALPHA_Amino != g_Alpha) + return; + if (0 == M) + return; + + const unsigned uSeqCount = msa.GetSeqCount(); + const unsigned uColCount = msa.GetColCount(); + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + unsigned uId = msa.GetSeqId(uSeqIndex); + if (M[uId]) + { + for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) + { + if (!msa.IsGap(uSeqIndex, uColIndex)) + { + msa.SetChar(uSeqIndex, uColIndex, 'M'); + break; + } + } + } + } + + delete[] M; + M = 0; + } diff --git a/src/muscle/muscle3.8.31/src/mk b/src/muscle/muscle3.8.31/src/mk new file mode 100755 index 0000000..7fa37b3 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/mk @@ -0,0 +1,21 @@ +#!/bin/bash +CPPNames='aligngivenpath aligngivenpathsw aligntwomsas aligntwoprofs aln alpha anchors bittraceback blosum62 blosumla clust cluster clwwt color cons diaglist diffobjscore diffpaths difftrees difftreese distcalc distfunc distpwkimura domuscle dosp dpreglist drawtree edgelist enumopts enumtostr estring fasta fasta2 fastclust fastdist fastdistjones fastdistkbit fastdistkmer fastdistmafft fastdistnuc fastscorepath2 finddiags finddiagsn glbalign glbalign352 glbaligndiag glbalignle glbalignsimple glbalignsp glbalignspn glbalignss glbalndimer globals globalslinux globalsosx globalsother globalswin32 gonnet henikoffweight henikoffweightpb html hydro intmath 
local main makerootmsa makerootmsab maketree mhack mpam200 msa msa2 msadistkimura msf muscle muscleout nucmx nwdasimple nwdasimple2 nwdasmall nwrec nwsmall objscore objscore2 objscoreda onexception options outweights pam200mafft params phy phy2 phy3 phy4 phyfromclust phyfromfile physeq phytofile posgap ppscore profdb profile profilefrommsa progalign progress progressivealign pwpath readmx realigndiffs realigndiffse refine refinehoriz refinesubfams refinetree refinetreee refinevert refinew savebest scoredist scoregaps scorehistory scorepp seq seqvect setblosumweights setgscweights setnewhandler spfast sptest stabilize subfam subfams sw termgaps textfile threewaywt tomhydro traceback tracebackopt tracebacksw treefrommsa typetostr upgma2 usage validateids vtml2 writescorefile' +ObjNames='aligngivenpath.o aligngivenpathsw.o aligntwomsas.o aligntwoprofs.o aln.o alpha.o anchors.o bittraceback.o blosum62.o blosumla.o clust.o cluster.o clwwt.o color.o cons.o diaglist.o diffobjscore.o diffpaths.o difftrees.o difftreese.o distcalc.o distfunc.o distpwkimura.o domuscle.o dosp.o dpreglist.o drawtree.o edgelist.o enumopts.o enumtostr.o estring.o fasta.o fasta2.o fastclust.o fastdist.o fastdistjones.o fastdistkbit.o fastdistkmer.o fastdistmafft.o fastdistnuc.o fastscorepath2.o finddiags.o finddiagsn.o glbalign.o glbalign352.o glbaligndiag.o glbalignle.o glbalignsimple.o glbalignsp.o glbalignspn.o glbalignss.o glbalndimer.o globals.o globalslinux.o globalsosx.o globalsother.o globalswin32.o gonnet.o henikoffweight.o henikoffweightpb.o html.o hydro.o intmath.o local.o main.o makerootmsa.o makerootmsab.o maketree.o mhack.o mpam200.o msa.o msa2.o msadistkimura.o msf.o muscle.o muscleout.o nucmx.o nwdasimple.o nwdasimple2.o nwdasmall.o nwrec.o nwsmall.o objscore.o objscore2.o objscoreda.o onexception.o options.o outweights.o pam200mafft.o params.o phy.o phy2.o phy3.o phy4.o phyfromclust.o phyfromfile.o physeq.o phytofile.o posgap.o ppscore.o profdb.o profile.o profilefrommsa.o 
progalign.o progress.o progressivealign.o pwpath.o readmx.o realigndiffs.o realigndiffse.o refine.o refinehoriz.o refinesubfams.o refinetree.o refinetreee.o refinevert.o refinew.o savebest.o scoredist.o scoregaps.o scorehistory.o scorepp.o seq.o seqvect.o setblosumweights.o setgscweights.o setnewhandler.o spfast.o sptest.o stabilize.o subfam.o subfams.o sw.o termgaps.o textfile.o threewaywt.o tomhydro.o traceback.o tracebackopt.o tracebacksw.o treefrommsa.o typetostr.o upgma2.o usage.o validateids.o vtml2.o writescorefile.o' + +rm -f *.o muscle.make.stdout.txt muscle.make.stderr.txt +for CPPName in $CPPNames +do + echo $CPPName >> /dev/tty + $CXX $ENV_GCC_OPTS -c -O3 -msse2 -mfpmath=sse -D_FILE_OFFSET_BITS=64 -DNDEBUG=1 $CPPName.cpp -o $CPPName.o >> muscle.make.stdout.txt 2>> muscle.make.stderr.txt +done + +LINK_OPTS= +if [ `uname -s` == Linux ] ; then + LINK_OPTS=-static +fi +$CXX $LINK_OPTS $ENV_LINK_OPTS -g -o muscle $ObjNames >> muscle.make.stdout.txt 2>> muscle.make.stderr.txt +tail muscle.make.stderr.txt + +strip muscle +ls -lh muscle +sum muscle diff --git a/src/muscle/muscle3.8.31/src/mpam200.cpp b/src/muscle/muscle3.8.31/src/mpam200.cpp new file mode 100644 index 0000000..daa3a74 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/mpam200.cpp @@ -0,0 +1,107 @@ +#include "muscle.h" + +const float PAM_200_CENTER = (float) 20.0; + +#define v(x) ((float) x + PAM_200_CENTER) +#define ROW(A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y) \ + { v(A), v(C), v(D), v(E), v(F), v(G), v(H), v(I), v(K), v(L), \ + v(M), v(N), v(P), v(Q), v(R), v(S), v(T), v(V), v(W), v(Y) }, + +float PAM200[32][32] = + { +// A C D E F G H I K L +// M N P Q R S T V W Y +ROW( 388, -0, 34, 32, -202, 159, -88, 89, -55, -67, + 19, 86, 186, -34, -32, 237, 273, 171, -326, -239) // A +ROW( -0, 1170, -248, -315, 74, -14, 43, -151, -204, -196, + -132, -49, -142, -215, 29, 165, -7, -69, 179, 313) // C +ROW( 34, -248, 625, 496, -419, 148, 78, -245, 55, -361, + -255, 332, -169, 122, -64, 45, 
-13, -167, -438, -148) // D +ROW( 32, -315, 496, 610, -480, 125, 25, -245, 175, -327, + -242, 166, -141, 279, 34, -30, -56, -150, -386, -305) // E +ROW( -202, 74, -419, -480, 888, -407, 62, 80, -443, 320, + 67, -236, -180, -294, -327, -51, -173, 31, -1, 584) // F +ROW( 159, -14, 148, 125, -407, 662, -114, -216, -34, -324, + -246, 79, -77, -68, 97, 155, 21, -93, -58, -349) // G +ROW( -88, 43, 78, 25, 62, -114, 766, -205, 144, -92, + -152, 238, 66, 368, 257, 35, -35, -217, -201, 468) // H +ROW( 89, -151, -245, -245, 80, -216, -205, 554, -224, 288, + 391, -114, -115, -222, -208, -19, 162, 469, -274, -153) // I +ROW( -55, -204, 55, 175, -443, -34, 144, -224, 632, -249, + -118, 186, -86, 315, 466, 2, 19, -227, -216, -264) // K +ROW( -67, -196, -361, -327, 320, -324, -92, 288, -249, 591, + 369, -223, 53, -86, -170, -69, -41, 239, -66, -29) // L +ROW( 19, -132, -255, -242, 67, -246, -152, 391, -118, 369, + 756, -131, -98, -124, -129, -49, 129, 331, -229, -182) // M +ROW( 86, -49, 332, 166, -236, 79, 238, -114, 186, -223, + -131, 516, -21, 88, 73, 240, 168, -118, -379, -8) // N +ROW( 186, -142, -169, -141, -180, -77, 66, -115, -86, 53, + -98, -21, 736, 122, 5, 221, 139, -75, -373, -226) // P +ROW( -34, -215, 122, 279, -294, -68, 368, -222, 315, -86, + -124, 88, 122, 635, 301, -13, -35, -195, -243, -73) // Q +ROW( -32, 29, -64, 34, -327, 97, 257, -208, 466, -170, + -129, 73, 5, 301, 606, 28, -4, -201, 104, -133) // R +ROW( 237, 165, 45, -30, -51, 155, 35, -19, 2, -69, + -49, 240, 221, -13, 28, 353, 259, 8, -213, -55) // S +ROW( 273, -7, -13, -56, -173, 21, -35, 162, 19, -41, + 129, 168, 139, -35, -4, 259, 422, 143, -343, -190) // T +ROW( 171, -69, -167, -150, 31, -93, -217, 469, -227, 239, + 331, -118, -75, -195, -201, 8, 143, 505, -245, -197) // V +ROW( -326, 179, -438, -386, -1, -58, -201, -274, -216, -66, + -229, -379, -373, -243, 104, -213, -343, -245, 1475, 63) // W +ROW( -239, 313, -148, -305, 584, -349, 468, -153, -264, -29, + -182, -8, -226, -73, -133, -55, -190, 
-197, 63, 979) // Y + }; + +#undef v +#define v(x) ((float) x) +#define RNC(A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y) \ + { v(A), v(C), v(D), v(E), v(F), v(G), v(H), v(I), v(K), v(L), \ + v(M), v(N), v(P), v(Q), v(R), v(S), v(T), v(V), v(W), v(Y) }, + +float PAM200NoCenter[32][32] = + + { +// A C D E F G H I K L +// M N P Q R S T V W Y +RNC( 388, -0, 34, 32, -202, 159, -88, 89, -55, -67, + 19, 86, 186, -34, -32, 237, 273, 171, -326, -239) // A +RNC( -0, 1170, -248, -315, 74, -14, 43, -151, -204, -196, + -132, -49, -142, -215, 29, 165, -7, -69, 179, 313) // C +RNC( 34, -248, 625, 496, -419, 148, 78, -245, 55, -361, + -255, 332, -169, 122, -64, 45, -13, -167, -438, -148) // D +RNC( 32, -315, 496, 610, -480, 125, 25, -245, 175, -327, + -242, 166, -141, 279, 34, -30, -56, -150, -386, -305) // E +RNC( -202, 74, -419, -480, 888, -407, 62, 80, -443, 320, + 67, -236, -180, -294, -327, -51, -173, 31, -1, 584) // F +RNC( 159, -14, 148, 125, -407, 662, -114, -216, -34, -324, + -246, 79, -77, -68, 97, 155, 21, -93, -58, -349) // G +RNC( -88, 43, 78, 25, 62, -114, 766, -205, 144, -92, + -152, 238, 66, 368, 257, 35, -35, -217, -201, 468) // H +RNC( 89, -151, -245, -245, 80, -216, -205, 554, -224, 288, + 391, -114, -115, -222, -208, -19, 162, 469, -274, -153) // I +RNC( -55, -204, 55, 175, -443, -34, 144, -224, 632, -249, + -118, 186, -86, 315, 466, 2, 19, -227, -216, -264) // K +RNC( -67, -196, -361, -327, 320, -324, -92, 288, -249, 591, + 369, -223, 53, -86, -170, -69, -41, 239, -66, -29) // L +RNC( 19, -132, -255, -242, 67, -246, -152, 391, -118, 369, + 756, -131, -98, -124, -129, -49, 129, 331, -229, -182) // M +RNC( 86, -49, 332, 166, -236, 79, 238, -114, 186, -223, + -131, 516, -21, 88, 73, 240, 168, -118, -379, -8) // N +RNC( 186, -142, -169, -141, -180, -77, 66, -115, -86, 53, + -98, -21, 736, 122, 5, 221, 139, -75, -373, -226) // P +RNC( -34, -215, 122, 279, -294, -68, 368, -222, 315, -86, + -124, 88, 122, 635, 301, -13, -35, -195, -243, -73) // Q +RNC( 
-32, 29, -64, 34, -327, 97, 257, -208, 466, -170, + -129, 73, 5, 301, 606, 28, -4, -201, 104, -133) // R +RNC( 237, 165, 45, -30, -51, 155, 35, -19, 2, -69, + -49, 240, 221, -13, 28, 353, 259, 8, -213, -55) // S +RNC( 273, -7, -13, -56, -173, 21, -35, 162, 19, -41, + 129, 168, 139, -35, -4, 259, 422, 143, -343, -190) // T +RNC( 171, -69, -167, -150, 31, -93, -217, 469, -227, 239, + 331, -118, -75, -195, -201, 8, 143, 505, -245, -197) // V +RNC( -326, 179, -438, -386, -1, -58, -201, -274, -216, -66, + -229, -379, -373, -243, 104, -213, -343, -245, 1475, 63) // W +RNC( -239, 313, -148, -305, 584, -349, 468, -153, -264, -29, + -182, -8, -226, -73, -133, -55, -190, -197, 63, 979) // Y + }; diff --git a/src/muscle/muscle3.8.31/src/msa.cpp b/src/muscle/muscle3.8.31/src/msa.cpp new file mode 100644 index 0000000..30a3fa6 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/msa.cpp @@ -0,0 +1,851 @@ +#include "muscle.h" +#include "msa.h" +#include "textfile.h" +#include "seq.h" +#include + +const unsigned DEFAULT_SEQ_LENGTH = 500; + +unsigned MSA::m_uIdCount = 0; + +MSA::MSA() + { + m_uSeqCount = 0; + m_uColCount = 0; + + m_szSeqs = 0; + m_szNames = 0; + m_Weights = 0; + + m_IdToSeqIndex = 0; + m_SeqIndexToId = 0; + + m_uCacheSeqCount = 0; + m_uCacheSeqLength = 0; + } + +MSA::~MSA() + { + Free(); + } + +void MSA::Free() + { + for (unsigned n = 0; n < m_uSeqCount; ++n) + { + delete[] m_szSeqs[n]; + delete[] m_szNames[n]; + } + + delete[] m_szSeqs; + delete[] m_szNames; + delete[] m_Weights; + delete[] m_IdToSeqIndex; + delete[] m_SeqIndexToId; + + m_uSeqCount = 0; + m_uColCount = 0; + + m_szSeqs = 0; + m_szNames = 0; + m_Weights = 0; + + m_IdToSeqIndex = 0; + m_SeqIndexToId = 0; + } + +void MSA::SetSize(unsigned uSeqCount, unsigned uColCount) + { + Free(); + + m_uSeqCount = uSeqCount; + m_uCacheSeqLength = uColCount; + m_uColCount = 0; + + if (0 == uSeqCount && 0 == uColCount) + return; + + m_szSeqs = new char *[uSeqCount]; + m_szNames = new char *[uSeqCount]; + m_Weights = new 
WEIGHT[uSeqCount]; + + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + m_szSeqs[uSeqIndex] = new char[uColCount+1]; + m_szNames[uSeqIndex] = 0; +#if DEBUG + m_Weights[uSeqIndex] = BTInsane; + memset(m_szSeqs[uSeqIndex], '?', uColCount); +#endif + m_szSeqs[uSeqIndex][uColCount] = 0; + } + + if (m_uIdCount > 0) + { + m_IdToSeqIndex = new unsigned[m_uIdCount]; + m_SeqIndexToId = new unsigned[m_uSeqCount]; +#if DEBUG + memset(m_IdToSeqIndex, 0xff, m_uIdCount*sizeof(unsigned)); + memset(m_SeqIndexToId, 0xff, m_uSeqCount*sizeof(unsigned)); +#endif + } + } + +void MSA::LogMe() const + { + if (0 == GetColCount()) + { + Log("MSA empty\n"); + return; + } + + const unsigned uColsPerLine = 50; + unsigned uLinesPerSeq = (GetColCount() - 1)/uColsPerLine + 1; + for (unsigned n = 0; n < uLinesPerSeq; ++n) + { + unsigned i; + unsigned iStart = n*uColsPerLine; + unsigned iEnd = GetColCount(); + if (iEnd - iStart + 1 > uColsPerLine) + iEnd = iStart + uColsPerLine; + Log(" "); + for (i = iStart; i < iEnd; ++i) + Log("%u", i%10); + Log("\n"); + Log(" "); + for (i = iStart; i + 9 < iEnd; i += 10) + Log("%-10u", i); + if (n == uLinesPerSeq - 1) + Log(" %-10u", GetColCount()); + Log("\n"); + for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) + { + Log("%12.12s", m_szNames[uSeqIndex]); + if (m_Weights[uSeqIndex] != BTInsane) + Log(" (%5.3f)", m_Weights[uSeqIndex]); + else + Log(" "); + Log(" "); + for (i = iStart; i < iEnd; ++i) + Log("%c", GetChar(uSeqIndex, i)); + if (0 != m_SeqIndexToId) + Log(" [%5u]", m_SeqIndexToId[uSeqIndex]); + Log("\n"); + } + Log("\n\n"); + } + } + +char MSA::GetChar(unsigned uSeqIndex, unsigned uIndex) const + { +// TODO: Performance cost? 
+ if (uSeqIndex >= m_uSeqCount || uIndex >= m_uColCount) + Quit("MSA::GetChar(%u/%u,%u/%u)", + uSeqIndex, m_uSeqCount, uIndex, m_uColCount); + + char c = m_szSeqs[uSeqIndex][uIndex]; +// assert(IsLegalChar(c)); + return c; + } + +unsigned MSA::GetLetter(unsigned uSeqIndex, unsigned uIndex) const + { +// TODO: Performance cost? + char c = GetChar(uSeqIndex, uIndex); + unsigned uLetter = CharToLetter(c); + if (uLetter >= 20) + { + char c = ' '; + if (uSeqIndex < m_uSeqCount && uIndex < m_uColCount) + c = m_szSeqs[uSeqIndex][uIndex]; + Quit("MSA::GetLetter(%u/%u, %u/%u)='%c'/%u", + uSeqIndex, m_uSeqCount, uIndex, m_uColCount, c, uLetter); + } + return uLetter; + } + +unsigned MSA::GetLetterEx(unsigned uSeqIndex, unsigned uIndex) const + { +// TODO: Performance cost? + char c = GetChar(uSeqIndex, uIndex); + unsigned uLetter = CharToLetterEx(c); + return uLetter; + } + +void MSA::SetSeqName(unsigned uSeqIndex, const char szName[]) + { + if (uSeqIndex >= m_uSeqCount) + Quit("MSA::SetSeqName(%u, %s), count=%u", uSeqIndex, m_uSeqCount); + delete[] m_szNames[uSeqIndex]; + int n = (int) strlen(szName) + 1; + m_szNames[uSeqIndex] = new char[n]; + memcpy(m_szNames[uSeqIndex], szName, n); + } + +const char *MSA::GetSeqName(unsigned uSeqIndex) const + { + if (uSeqIndex >= m_uSeqCount) + Quit("MSA::GetSeqName(%u), count=%u", uSeqIndex, m_uSeqCount); + return m_szNames[uSeqIndex]; + } + +bool MSA::IsGap(unsigned uSeqIndex, unsigned uIndex) const + { + char c = GetChar(uSeqIndex, uIndex); + return IsGapChar(c); + } + +bool MSA::IsWildcard(unsigned uSeqIndex, unsigned uIndex) const + { + char c = GetChar(uSeqIndex, uIndex); + return IsWildcardChar(c); + } + +void MSA::SetChar(unsigned uSeqIndex, unsigned uIndex, char c) + { + if (uSeqIndex >= m_uSeqCount || uIndex > m_uCacheSeqLength) + Quit("MSA::SetChar(%u,%u)", uSeqIndex, uIndex); + + if (uIndex == m_uCacheSeqLength) + { + const unsigned uNewCacheSeqLength = m_uCacheSeqLength + DEFAULT_SEQ_LENGTH; + for (unsigned n = 0; n < 
m_uSeqCount; ++n) + { + char *ptrNewSeq = new char[uNewCacheSeqLength+1]; + memcpy(ptrNewSeq, m_szSeqs[n], m_uCacheSeqLength); + memset(ptrNewSeq + m_uCacheSeqLength, '?', DEFAULT_SEQ_LENGTH); + ptrNewSeq[uNewCacheSeqLength] = 0; + delete[] m_szSeqs[n]; + m_szSeqs[n] = ptrNewSeq; + } + + m_uColCount = uIndex; + m_uCacheSeqLength = uNewCacheSeqLength; + } + + if (uIndex >= m_uColCount) + m_uColCount = uIndex + 1; + m_szSeqs[uSeqIndex][uIndex] = c; + } + +void MSA::GetSeq(unsigned uSeqIndex, Seq &seq) const + { + assert(uSeqIndex < m_uSeqCount); + + seq.Clear(); + + for (unsigned n = 0; n < m_uColCount; ++n) + if (!IsGap(uSeqIndex, n)) + { + char c = GetChar(uSeqIndex, n); + if (!isalpha(c)) + Quit("Invalid character '%c' in sequence", c); + c = toupper(c); + seq.push_back(c); + } + const char *ptrName = GetSeqName(uSeqIndex); + seq.SetName(ptrName); + } + +bool MSA::HasGap() const + { + for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) + for (unsigned n = 0; n < GetColCount(); ++n) + if (IsGap(uSeqIndex, n)) + return true; + return false; + } + +bool MSA::IsLegalLetter(unsigned uLetter) const + { + return uLetter < 20; + } + +void MSA::SetSeqCount(unsigned uSeqCount) + { + Free(); + SetSize(uSeqCount, DEFAULT_SEQ_LENGTH); + } + +void MSA::CopyCol(unsigned uFromCol, unsigned uToCol) + { + assert(uFromCol < GetColCount()); + assert(uToCol < GetColCount()); + if (uFromCol == uToCol) + return; + + for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) + { + const char c = GetChar(uSeqIndex, uFromCol); + SetChar(uSeqIndex, uToCol, c); + } + } + +void MSA::Copy(const MSA &msa) + { + Free(); + const unsigned uSeqCount = msa.GetSeqCount(); + const unsigned uColCount = msa.GetColCount(); + SetSize(uSeqCount, uColCount); + + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + SetSeqName(uSeqIndex, msa.GetSeqName(uSeqIndex)); + const unsigned uId = msa.GetSeqId(uSeqIndex); + SetSeqId(uSeqIndex, uId); + for (unsigned uColIndex 
= 0; uColIndex < uColCount; ++uColIndex) + { + const char c = msa.GetChar(uSeqIndex, uColIndex); + SetChar(uSeqIndex, uColIndex, c); + } + } + } + +bool MSA::IsGapColumn(unsigned uColIndex) const + { + assert(GetSeqCount() > 0); + for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) + if (!IsGap(uSeqIndex, uColIndex)) + return false; + return true; + } + +bool MSA::GetSeqIndex(const char *ptrSeqName, unsigned *ptruSeqIndex) const + { + for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) + if (0 == stricmp(ptrSeqName, GetSeqName(uSeqIndex))) + { + *ptruSeqIndex = uSeqIndex; + return true; + } + return false; + } + +void MSA::DeleteCol(unsigned uColIndex) + { + assert(uColIndex < m_uColCount); + size_t n = m_uColCount - uColIndex; + if (n > 0) + { + for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) + { + char *ptrSeq = m_szSeqs[uSeqIndex]; + memmove(ptrSeq + uColIndex, ptrSeq + uColIndex + 1, n); + } + } + --m_uColCount; + } + +void MSA::DeleteColumns(unsigned uColIndex, unsigned uColCount) + { + for (unsigned n = 0; n < uColCount; ++n) + DeleteCol(uColIndex); + } + +void MSA::FromFile(TextFile &File) + { + FromFASTAFile(File); + } + +// Weights sum to 1, WCounts sum to NIC +WEIGHT MSA::GetSeqWeight(unsigned uSeqIndex) const + { + assert(uSeqIndex < m_uSeqCount); + WEIGHT w = m_Weights[uSeqIndex]; + if (w == wInsane) + Quit("Seq weight not set"); + return w; + } + +void MSA::SetSeqWeight(unsigned uSeqIndex, WEIGHT w) const + { + assert(uSeqIndex < m_uSeqCount); + m_Weights[uSeqIndex] = w; + } + +void MSA::NormalizeWeights(WEIGHT wDesiredTotal) const + { + WEIGHT wTotal = 0; + for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) + wTotal += m_Weights[uSeqIndex]; + + if (0 == wTotal) + return; + + const WEIGHT f = wDesiredTotal/wTotal; + for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) + m_Weights[uSeqIndex] *= f; + } + +void MSA::CalcWeights() const + { + Quit("Calc weights not 
implemented"); + } + +static void FmtChar(char c, unsigned uWidth) + { + Log("%c", c); + for (unsigned n = 0; n < uWidth - 1; ++n) + Log(" "); + } + +static void FmtInt(unsigned u, unsigned uWidth) + { + static char szStr[1024]; + assert(uWidth < sizeof(szStr)); + if (u > 0) + sprintf(szStr, "%u", u); + else + strcpy(szStr, "."); + Log(szStr); + unsigned n = (unsigned) strlen(szStr); + if (n < uWidth) + for (unsigned i = 0; i < uWidth - n; ++i) + Log(" "); + } + +static void FmtInt0(unsigned u, unsigned uWidth) + { + static char szStr[1024]; + assert(uWidth < sizeof(szStr)); + sprintf(szStr, "%u", u); + Log(szStr); + unsigned n = (unsigned) strlen(szStr); + if (n < uWidth) + for (unsigned i = 0; i < uWidth - n; ++i) + Log(" "); + } + +static void FmtPad(unsigned n) + { + for (unsigned i = 0; i < n; ++i) + Log(" "); + } + +void MSA::FromSeq(const Seq &s) + { + unsigned uSeqLength = s.Length(); + SetSize(1, uSeqLength); + SetSeqName(0, s.GetName()); + if (0 != m_SeqIndexToId) + SetSeqId(0, s.GetId()); + for (unsigned n = 0; n < uSeqLength; ++n) + SetChar(0, n, s[n]); + } + +unsigned MSA::GetCharCount(unsigned uSeqIndex, unsigned uColIndex) const + { + assert(uSeqIndex < GetSeqCount()); + assert(uColIndex < GetColCount()); + + unsigned uCol = 0; + for (unsigned n = 0; n <= uColIndex; ++n) + if (!IsGap(uSeqIndex, n)) + ++uCol; + return uCol; + } + +void MSA::CopySeq(unsigned uToSeqIndex, const MSA &msaFrom, unsigned uFromSeqIndex) + { + assert(uToSeqIndex < m_uSeqCount); + const unsigned uColCount = msaFrom.GetColCount(); + assert(m_uColCount == uColCount || + (0 == m_uColCount && uColCount <= m_uCacheSeqLength)); + + memcpy(m_szSeqs[uToSeqIndex], msaFrom.GetSeqBuffer(uFromSeqIndex), uColCount); + SetSeqName(uToSeqIndex, msaFrom.GetSeqName(uFromSeqIndex)); + if (0 == m_uColCount) + m_uColCount = uColCount; + } + +const char *MSA::GetSeqBuffer(unsigned uSeqIndex) const + { + assert(uSeqIndex < m_uSeqCount); + return m_szSeqs[uSeqIndex]; + } + +void 
MSA::DeleteSeq(unsigned uSeqIndex)
+	{
+// Removes sequence uSeqIndex, closes the hole in the sequence and name
+// arrays, and discards all weights.
+// NOTE(review): m_SeqIndexToId / m_IdToSeqIndex are not touched here, so any
+// id mapping is stale after this call -- confirm callers do not rely on it.
+	assert(uSeqIndex < m_uSeqCount);
+
+// Both buffers are allocated with new char[] (SetSize, SetSeqName,
+// AppendSeq contract), so they must be released with delete[].
+	delete[] m_szSeqs[uSeqIndex];
+	delete[] m_szNames[uSeqIndex];
+
+// Only the entries after the removed one slide down. The previous count
+// (m_uSeqCount - uSeqIndex) copied one pointer too many and read past the
+// end of both arrays.
+	const unsigned uBytesToMove = (m_uSeqCount - uSeqIndex - 1)*sizeof(char *);
+	if (uBytesToMove > 0)
+		{
+		memmove(m_szSeqs + uSeqIndex, m_szSeqs + uSeqIndex + 1, uBytesToMove);
+		memmove(m_szNames + uSeqIndex, m_szNames + uSeqIndex + 1, uBytesToMove);
+		}
+
+	--m_uSeqCount;
+
+	delete[] m_Weights;
+	m_Weights = 0;
+	}
+
+bool MSA::IsEmptyCol(unsigned uColIndex) const
+	{
+	const unsigned uSeqCount = GetSeqCount();
+	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+		if (!IsGap(uSeqIndex, uColIndex))
+			return false;
+	return true;
+	}
+
+//void MSA::DeleteEmptyCols(bool bProgress)
+//	{
+//	unsigned uColCount = GetColCount();
+//	for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
+//		{
+//		if (IsEmptyCol(uColIndex))
+//			{
+//			if (bProgress)
+//				{
+//				Log("Deleting col %u of %u\n", uColIndex, uColCount);
+//				printf("Deleting col %u of %u\n", uColIndex, uColCount);
+//				}
+//			DeleteCol(uColIndex);
+//			--uColCount;
+//			}
+//		}
+//	}
+
+unsigned MSA::AlignedColIndexToColIndex(unsigned uAlignedColIndex) const
+	{
+	Quit("MSA::AlignedColIndexToColIndex not implemented");
+	return 0;
+	}
+
+WEIGHT MSA::GetTotalSeqWeight() const
+	{
+	WEIGHT wTotal = 0;
+	const unsigned uSeqCount = GetSeqCount();
+	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+		wTotal += m_Weights[uSeqIndex];
+	return wTotal;
+	}
+
+bool MSA::SeqsEq(const MSA &a1, unsigned uSeqIndex1, const MSA &a2,
+  unsigned uSeqIndex2)
+	{
+	Seq s1;
+	Seq s2;
+
+	a1.GetSeq(uSeqIndex1, s1);
+	a2.GetSeq(uSeqIndex2, s2);
+
+	s1.StripGaps();
+	s2.StripGaps();
+
+	return s1.EqIgnoreCase(s2);
+	}
+
+unsigned MSA::GetSeqLength(unsigned uSeqIndex) const
+	{
+	assert(uSeqIndex < GetSeqCount());
+
+	const unsigned uColCount = GetColCount();
+	unsigned uLength = 0;
+	for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
+		if
(!IsGap(uSeqIndex, uColIndex)) + ++uLength; + return uLength; + } + +void MSA::GetPWID(unsigned uSeqIndex1, unsigned uSeqIndex2, double *ptrPWID, + unsigned *ptruPosCount) const + { + assert(uSeqIndex1 < GetSeqCount()); + assert(uSeqIndex2 < GetSeqCount()); + + unsigned uSameCount = 0; + unsigned uPosCount = 0; + const unsigned uColCount = GetColCount(); + for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) + { + char c1 = GetChar(uSeqIndex1, uColIndex); + if (IsGapChar(c1)) + continue; + char c2 = GetChar(uSeqIndex2, uColIndex); + if (IsGapChar(c2)) + continue; + ++uPosCount; + if (c1 == c2) + ++uSameCount; + } + *ptruPosCount = uPosCount; + if (uPosCount > 0) + *ptrPWID = 100.0 * (double) uSameCount / (double) uPosCount; + else + *ptrPWID = 0; + } + +void MSA::UnWeight() + { + for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) + m_Weights[uSeqIndex] = BTInsane; + } + +unsigned MSA::UniqueResidueTypes(unsigned uColIndex) const + { + assert(uColIndex < GetColCount()); + + unsigned Counts[MAX_ALPHA]; + memset(Counts, 0, sizeof(Counts)); + const unsigned uSeqCount = GetSeqCount(); + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex)) + continue; + const unsigned uLetter = GetLetter(uSeqIndex, uColIndex); + ++(Counts[uLetter]); + } + unsigned uUniqueCount = 0; + for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter) + if (Counts[uLetter] > 0) + ++uUniqueCount; + return uUniqueCount; + } + +double MSA::GetOcc(unsigned uColIndex) const + { + unsigned uGapCount = 0; + for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) + if (IsGap(uSeqIndex, uColIndex)) + ++uGapCount; + unsigned uSeqCount = GetSeqCount(); + return (double) (uSeqCount - uGapCount) / (double) uSeqCount; + } + +void MSA::ToFile(TextFile &File) const + { + if (g_bMSF) + ToMSFFile(File); + else if (g_bAln) + ToAlnFile(File); + else if (g_bHTML) + ToHTMLFile(File); + 
else if (g_bPHYS) + ToPhySequentialFile(File); + else if (g_bPHYI) + ToPhyInterleavedFile(File); + else + ToFASTAFile(File); + if (0 != g_pstrScoreFileName) + WriteScoreFile(*this); + } + +bool MSA::ColumnHasGap(unsigned uColIndex) const + { + const unsigned uSeqCount = GetSeqCount(); + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + if (IsGap(uSeqIndex, uColIndex)) + return true; + return false; + } + +void MSA::SetIdCount(unsigned uIdCount) + { + //if (m_uIdCount != 0) + // Quit("MSA::SetIdCount: may only be called once"); + + if (m_uIdCount > 0) + { + if (uIdCount > m_uIdCount) + Quit("MSA::SetIdCount: cannot increase count"); + return; + } + m_uIdCount = uIdCount; + } + +void MSA::SetSeqId(unsigned uSeqIndex, unsigned uId) + { + assert(uSeqIndex < m_uSeqCount); + assert(uId < m_uIdCount); + if (0 == m_SeqIndexToId) + { + if (0 == m_uIdCount) + Quit("MSA::SetSeqId, SetIdCount has not been called"); + m_IdToSeqIndex = new unsigned[m_uIdCount]; + m_SeqIndexToId = new unsigned[m_uSeqCount]; + + memset(m_IdToSeqIndex, 0xff, m_uIdCount*sizeof(unsigned)); + memset(m_SeqIndexToId, 0xff, m_uSeqCount*sizeof(unsigned)); + } + m_SeqIndexToId[uSeqIndex] = uId; + m_IdToSeqIndex[uId] = uSeqIndex; + } + +unsigned MSA::GetSeqIndex(unsigned uId) const + { + assert(uId < m_uIdCount); + assert(0 != m_IdToSeqIndex); + unsigned uSeqIndex = m_IdToSeqIndex[uId]; + assert(uSeqIndex < m_uSeqCount); + return uSeqIndex; + } + +bool MSA::GetSeqIndex(unsigned uId, unsigned *ptruIndex) const + { + for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) + { + if (uId == m_SeqIndexToId[uSeqIndex]) + { + *ptruIndex = uSeqIndex; + return true; + } + } + return false; + } + +unsigned MSA::GetSeqId(unsigned uSeqIndex) const + { + assert(uSeqIndex < m_uSeqCount); + unsigned uId = m_SeqIndexToId[uSeqIndex]; + assert(uId < m_uIdCount); + return uId; + } + +bool MSA::WeightsSet() const + { + return BTInsane != m_Weights[0]; + } + +void MSASubsetByIds(const MSA &msaIn, 
const unsigned Ids[], unsigned uIdCount, + MSA &msaOut) + { + const unsigned uColCount = msaIn.GetColCount(); + msaOut.SetSize(uIdCount, uColCount); + for (unsigned uSeqIndexOut = 0; uSeqIndexOut < uIdCount; ++uSeqIndexOut) + { + const unsigned uId = Ids[uSeqIndexOut]; + + const unsigned uSeqIndexIn = msaIn.GetSeqIndex(uId); + const char *ptrName = msaIn.GetSeqName(uSeqIndexIn); + + msaOut.SetSeqId(uSeqIndexOut, uId); + msaOut.SetSeqName(uSeqIndexOut, ptrName); + + for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) + { + const char c = msaIn.GetChar(uSeqIndexIn, uColIndex); + msaOut.SetChar(uSeqIndexOut, uColIndex, c); + } + } + } + +// Caller must allocate ptrSeq and ptrLabel as new char[n]. +void MSA::AppendSeq(char *ptrSeq, unsigned uSeqLength, char *ptrLabel) + { + if (m_uSeqCount > m_uCacheSeqCount) + Quit("Internal error MSA::AppendSeq"); + if (m_uSeqCount == m_uCacheSeqCount) + ExpandCache(m_uSeqCount + 4, uSeqLength); + m_szSeqs[m_uSeqCount] = ptrSeq; + m_szNames[m_uSeqCount] = ptrLabel; + ++m_uSeqCount; + } + +void MSA::ExpandCache(unsigned uSeqCount, unsigned uColCount) + { + if (m_IdToSeqIndex != 0 || m_SeqIndexToId != 0 || uSeqCount < m_uSeqCount) + Quit("Internal error MSA::ExpandCache"); + + if (m_uSeqCount > 0 && uColCount != m_uColCount) + Quit("Internal error MSA::ExpandCache, ColCount changed"); + + char **NewSeqs = new char *[uSeqCount]; + char **NewNames = new char *[uSeqCount]; + WEIGHT *NewWeights = new WEIGHT[uSeqCount]; + + for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) + { + NewSeqs[uSeqIndex] = m_szSeqs[uSeqIndex]; + NewNames[uSeqIndex] = m_szNames[uSeqIndex]; + NewWeights[uSeqIndex] = m_Weights[uSeqIndex]; + } + + for (unsigned uSeqIndex = m_uSeqCount; uSeqIndex < uSeqCount; ++uSeqIndex) + { + char *Seq = new char[uColCount]; + NewSeqs[uSeqIndex] = Seq; +#if DEBUG + memset(Seq, '?', uColCount); +#endif + } + + delete[] m_szSeqs; + delete[] m_szNames; + delete[] m_Weights; + + m_szSeqs = NewSeqs; + 
m_szNames = NewNames; + m_Weights = NewWeights; + + m_uCacheSeqCount = uSeqCount; + m_uCacheSeqLength = uColCount; + m_uColCount = uColCount; + } + +void MSA::FixAlpha() + { + ClearInvalidLetterWarning(); + for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) + { + for (unsigned uColIndex = 0; uColIndex < m_uColCount; ++uColIndex) + { + char c = GetChar(uSeqIndex, uColIndex); + if (!IsResidueChar(c) && !IsGapChar(c)) + { + char w = GetWildcardChar(); + // Warning("Invalid letter '%c', replaced by '%c'", c, w); + InvalidLetterWarning(c, w); + SetChar(uSeqIndex, uColIndex, w); + } + } + } + ReportInvalidLetters(); + } + +ALPHA MSA::GuessAlpha() const + { +// If at least MIN_NUCLEO_PCT of the first CHAR_COUNT non-gap +// letters belong to the nucleotide alphabet, guess nucleo. +// Otherwise amino. + const unsigned CHAR_COUNT = 100; + const unsigned MIN_NUCLEO_PCT = 95; + + const unsigned uSeqCount = GetSeqCount(); + const unsigned uColCount = GetColCount(); + if (0 == uSeqCount) + return ALPHA_Amino; + + unsigned uDNACount = 0; + unsigned uRNACount = 0; + unsigned uTotal = 0; + unsigned i = 0; + for (;;) + { + unsigned uSeqIndex = i/uColCount; + if (uSeqIndex >= uSeqCount) + break; + unsigned uColIndex = i%uColCount; + ++i; + char c = GetChar(uSeqIndex, uColIndex); + if (IsGapChar(c)) + continue; + if (IsDNA(c)) + ++uDNACount; + if (IsRNA(c)) + ++uRNACount; + ++uTotal; + if (uTotal >= CHAR_COUNT) + break; + } + if (uTotal != 0 && ((uRNACount*100)/uTotal) >= MIN_NUCLEO_PCT) + return ALPHA_RNA; + if (uTotal != 0 && ((uDNACount*100)/uTotal) >= MIN_NUCLEO_PCT) + return ALPHA_DNA; + return ALPHA_Amino; + } diff --git a/src/muscle/muscle3.8.31/src/msa.h b/src/muscle/muscle3.8.31/src/msa.h new file mode 100644 index 0000000..c023989 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/msa.h @@ -0,0 +1,179 @@ +#ifndef MSA_h +#define MSA_h + +const int MAX_SEQ_NAME = 63; +struct PathEdge; +class TextFile; +class Seq; +class ClusterNode; +class NodeCounts; +class 
DataBuffer; + +class MSA + { +public: + MSA(); + virtual ~MSA(); + +public: +// Ways to create an MSA + void FromFile(TextFile &File); + void FromFASTAFile(TextFile &File); + void FromSeq(const Seq &s); + + void ToFile(TextFile &File) const; + void ToFASTAFile(TextFile &File) const; + void ToMSFFile(TextFile &File, const char *ptrComment = 0) const; + void ToAlnFile(TextFile &File) const; + void ToHTMLFile(TextFile &File) const; + void ToPhySequentialFile(TextFile &File) const; + void ToPhyInterleavedFile(TextFile &File) const; + + void SetSize(unsigned uSeqCount, unsigned uColCount); + void SetSeqCount(unsigned uSeqCount); + char GetChar(unsigned uSeqIndex, unsigned uIndex) const; + unsigned GetLetter(unsigned uSeqIndex, unsigned uIndex) const; + unsigned GetLetterEx(unsigned uSeqIndex, unsigned uIndex) const; + const char *GetSeqName(unsigned uSeqIndex) const; + unsigned GetSeqId(unsigned uSeqIndex) const; + unsigned GetSeqIndex(unsigned uId) const; + bool GetSeqIndex(unsigned uId, unsigned *ptruIndex) const; + double GetOcc(unsigned uColIndex) const; + void GetFractionalWeightedCounts(unsigned uColIndex, bool bNormalize, + FCOUNT fcCounts[], FCOUNT *ptrfcGapStart, FCOUNT *ptrfcGapEnd, + FCOUNT *fcGapExtend, FCOUNT *ptrfOcc, + FCOUNT *fcLL, FCOUNT *fcLG, FCOUNT *fcGL, FCOUNT *fcGG) const; + bool IsGap(unsigned uSeqIndex, unsigned uColIndex) const; + bool IsWildcard(unsigned uSeqIndex, unsigned uColIndex) const; + bool IsGapColumn(unsigned uColIndex) const; + bool ColumnHasGap(unsigned uColIndex) const; + bool IsGapSeq(unsigned uSeqIndex) const; + + void SetChar(unsigned uSeqIndex, unsigned uColIndex, char c); + void SetSeqName(unsigned uSeqIndex, const char szName[]); + void SetSeqId(unsigned uSeqIndex, unsigned uId); + bool HasGap() const; + bool IsLegalLetter(unsigned uLetter) const; + void GetSeq(unsigned uSeqIndex, Seq &seq) const; + void Copy(const MSA &msa); + double GetCons(unsigned uColIndex) const; + double GetAvgCons() const; + double 
GetPctIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const; + bool GetSeqIndex(const char *ptrSeqName, unsigned *ptruSeqIndex) const; + void DeleteCol(unsigned uColIndex); + void DeleteColumns(unsigned uColIndex, unsigned uColCount); + void CopySeq(unsigned uToSeqIndex, const MSA &msaFrom, unsigned uFromSeqIndex); + void DeleteSeq(unsigned uSeqIndex); +// void DeleteEmptyCols(bool bProgress = false); + bool IsEmptyCol(unsigned uColIndex) const; + + WEIGHT GetSeqWeight(unsigned uSeqIndex) const; + WEIGHT GetTotalSeqWeight() const; + void SetSeqWeight(unsigned uSeqIndex, WEIGHT w) const; + void NormalizeWeights(WEIGHT wTotal) const; + bool WeightsSet() const; + + unsigned GetGCGCheckSum(unsigned uSeqIndex) const; + + ALPHA GuessAlpha() const; + void FixAlpha(); + + unsigned UniqueResidueTypes(unsigned uColIndex) const; + + void UnWeight(); + + void GetNodeCounts(unsigned uAlignedColIndex, NodeCounts &Counts) const; + void ValidateBreakMatrices() const; + unsigned GetCharCount(unsigned uSeqIndex, unsigned uColIndex) const; + const char *GetSeqBuffer(unsigned uSeqIndex) const; + unsigned AlignedColIndexToColIndex(unsigned uAlignedColIndex) const; + unsigned GetSeqLength(unsigned uSeqIndex) const; + void GetPWID(unsigned uSeqIndex1, unsigned uSeqIndex2, double *ptrdPWID, + unsigned *ptruPosCount) const; + + void GetPairMap(unsigned uSeqIndex1, unsigned uSeqIndex2, int iMap1[], + int iMap2[]) const; + + void LogMe() const; + void ListWeights() const; + + void GapInfoToDataBuffer(DataBuffer &Buffer) const; + void GapInfoFromDataBuffer(const DataBuffer &Buffer); + double GetPctGroupIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const; + + void Clear() + { + Free(); + } + unsigned GetSeqCount() const + { + return m_uSeqCount; + } + unsigned GetColCount() const + { + return m_uColCount; + } + + static bool SeqsEq(const MSA &a1, unsigned uSeqIndex1, const MSA &a2, + unsigned uSeqIndex2); + + static void SetIdCount(unsigned uIdCount); + +private: + friend void 
SetMSAWeightsMuscle(MSA &msa); + friend void SetThreeWayWeightsMuscle(MSA &msa); + void SetHenikoffWeightsPB() const; + void SetHenikoffWeights() const; + void SetGSCWeights() const; + void SetUniformWeights() const; + void SetClustalWWeights(const Tree &tree); + + void Free(); + void AppendSeq(char *ptrSeq, unsigned uSeqLength, char *ptrLabel); + void ExpandCache(unsigned uSeqCount, unsigned uColCount); + void CalcWeights() const; + void GetNameFromFASTAAnnotationLine(const char szLine[], + char szName[], unsigned uBytes); + void CopyCol(unsigned uFromCol, unsigned uToCol); + unsigned CalcBLOSUMWeights(ClusterTree &BlosumCluster) const; + void SetBLOSUMSubtreeWeight(const ClusterNode *ptrNode, double dWeight) const; + unsigned SetBLOSUMNodeWeight(const ClusterNode *ptrNode, double dMinDist) const; + void SetSubtreeWeight2(const ClusterNode *ptrNode) const; + void SetSubtreeGSCWeight(ClusterNode *ptrNode) const; + + void CalcHenikoffWeightsColPB(unsigned uColIndex) const; + void CalcHenikoffWeightsCol(unsigned uColIndex) const; + +private: + unsigned m_uSeqCount; + unsigned m_uColCount; + unsigned m_uCacheSeqLength; + unsigned m_uCacheSeqCount; + char **m_szSeqs; + char **m_szNames; + + static unsigned m_uIdCount; + + unsigned *m_IdToSeqIndex; + unsigned *m_SeqIndexToId; + + WEIGHT *m_Weights; + }; + +void SeqVectFromMSA(const MSA &msa, SeqVect &v); +void DeleteGappedCols(MSA &msa); +void MSAFromColRange(const MSA &msaIn, unsigned uFromColIndex, unsigned uColCount, + MSA &msaOut); +void MSACat(const MSA &msa1, const MSA &msa2, MSA &msaCat); +void MSAAppend(MSA &msa1, const MSA &msa2); +void MSAFromSeqSubset(const MSA &msaIn, const unsigned uSeqIndexes[], unsigned uSeqCount, + MSA &msaOut); +void AssertMSAEq(const MSA &msa1, const MSA &msa2); +void AssertMSAEqIgnoreCaseAndGaps(const MSA &msa1, const MSA &msa2); +void MSASubsetByIds(const MSA &msaIn, const unsigned Ids[], unsigned uIdCount, + MSA &msaOut); +void SetMSAWeightsMuscle(MSA &msa); +void 
SetClustalWWeightsMuscle(MSA &msa); +void SetThreeWayWeightsMuscle(MSA &msa); + +#endif // MSA_h diff --git a/src/muscle/muscle3.8.31/src/msa2.cpp b/src/muscle/muscle3.8.31/src/msa2.cpp new file mode 100644 index 0000000..c82b9d7 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/msa2.cpp @@ -0,0 +1,531 @@ +#include "muscle.h" +#include "msa.h" +#include "seqvect.h" +#include "profile.h" +#include "tree.h" + +// These global variables are a hack to allow the tree +// dependent iteration code to communicate the edge +// used to divide the tree. The three-way weighting +// scheme needs to know this edge in order to compute +// sequence weights. +static const Tree *g_ptrMuscleTree = 0; +unsigned g_uTreeSplitNode1 = NULL_NEIGHBOR; +unsigned g_uTreeSplitNode2 = NULL_NEIGHBOR; + +void MSA::GetFractionalWeightedCounts(unsigned uColIndex, bool bNormalize, + FCOUNT fcCounts[], FCOUNT *ptrfcGapStart, FCOUNT *ptrfcGapEnd, + FCOUNT *ptrfcGapExtend, FCOUNT *ptrfOcc, + FCOUNT *ptrfcLL, FCOUNT *ptrfcLG, FCOUNT *ptrfcGL, FCOUNT *ptrfcGG) const + { + const unsigned uSeqCount = GetSeqCount(); + const unsigned uColCount = GetColCount(); + + memset(fcCounts, 0, g_AlphaSize*sizeof(FCOUNT)); + WEIGHT wTotal = 0; + FCOUNT fGap = 0; + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + const WEIGHT w = GetSeqWeight(uSeqIndex); + if (IsGap(uSeqIndex, uColIndex)) + { + fGap += w; + continue; + } + else if (IsWildcard(uSeqIndex, uColIndex)) + { + const unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex); + switch (g_Alpha) + { + case ALPHA_Amino: + switch (uLetter) + { + case AX_B: // D or N + fcCounts[AX_D] += w/2; + fcCounts[AX_N] += w/2; + break; + case AX_Z: // E or Q + fcCounts[AX_E] += w/2; + fcCounts[AX_Q] += w/2; + break; + default: // any + { + const FCOUNT f = w/20; + for (unsigned uLetter = 0; uLetter < 20; ++uLetter) + fcCounts[uLetter] += f; + break; + } + } + break; + + case ALPHA_DNA: + case ALPHA_RNA: + switch (uLetter) + { + case AX_R: // G or A + 
fcCounts[NX_G] += w/2; + fcCounts[NX_A] += w/2; + break; + case AX_Y: // C or T/U + fcCounts[NX_C] += w/2; + fcCounts[NX_T] += w/2; + break; + default: // any + const FCOUNT f = w/20; + for (unsigned uLetter = 0; uLetter < 4; ++uLetter) + fcCounts[uLetter] += f; + break; + } + break; + + default: + Quit("Alphabet %d not supported", g_Alpha); + } + continue; + } + unsigned uLetter = GetLetter(uSeqIndex, uColIndex); + fcCounts[uLetter] += w; + wTotal += w; + } + *ptrfOcc = (float) (1.0 - fGap); + + if (bNormalize && wTotal > 0) + { + if (wTotal > 1.001) + Quit("wTotal=%g\n", wTotal); + for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter) + fcCounts[uLetter] /= wTotal; +// AssertNormalized(fcCounts); + } + + FCOUNT fcStartCount = 0; + if (uColIndex == 0) + { + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + if (IsGap(uSeqIndex, uColIndex)) + fcStartCount += GetSeqWeight(uSeqIndex); + } + else + { + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex - 1)) + fcStartCount += GetSeqWeight(uSeqIndex); + } + + FCOUNT fcEndCount = 0; + if (uColCount - 1 == uColIndex) + { + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + if (IsGap(uSeqIndex, uColIndex)) + fcEndCount += GetSeqWeight(uSeqIndex); + } + else + { + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex + 1)) + fcEndCount += GetSeqWeight(uSeqIndex); + } + + FCOUNT LL = 0; + FCOUNT LG = 0; + FCOUNT GL = 0; + FCOUNT GG = 0; + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + WEIGHT w = GetSeqWeight(uSeqIndex); + bool bLetterHere = !IsGap(uSeqIndex, uColIndex); + bool bLetterPrev = (uColIndex == 0 || !IsGap(uSeqIndex, uColIndex - 1)); + if (bLetterHere) + { + if (bLetterPrev) + LL += w; + else + GL += w; + } + else + { + if (bLetterPrev) + LG += w; + else + GG += w; + } + } + + FCOUNT fcExtendCount = 
0; + if (uColIndex > 0 && uColIndex < GetColCount() - 1) + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + if (IsGap(uSeqIndex, uColIndex) && IsGap(uSeqIndex, uColIndex - 1) && + IsGap(uSeqIndex, uColIndex + 1)) + fcExtendCount += GetSeqWeight(uSeqIndex); + + *ptrfcLL = LL; + *ptrfcLG = LG; + *ptrfcGL = GL; + *ptrfcGG = GG; + *ptrfcGapStart = fcStartCount; + *ptrfcGapEnd = fcEndCount; + *ptrfcGapExtend = fcExtendCount; + } + +// Return true if the given column has no gaps and all +// its residues are in the same biochemical group. +bool MSAColIsConservative(const MSA &msa, unsigned uColIndex) + { + extern unsigned ResidueGroup[]; + + const unsigned uSeqCount = msa.GetColCount(); + if (0 == uSeqCount) + Quit("MSAColIsConservative: empty alignment"); + + if (msa.IsGap(0, uColIndex)) + return false; + + unsigned uLetter = msa.GetLetterEx(0, uColIndex); + const unsigned uGroup = ResidueGroup[uLetter]; + + for (unsigned uSeqIndex = 1; uSeqIndex < uSeqCount; ++uSeqIndex) + { + if (msa.IsGap(uSeqIndex, uColIndex)) + return false; + uLetter = msa.GetLetter(uSeqIndex, uColIndex); + if (ResidueGroup[uLetter] != uGroup) + return false; + } + return true; + } + +void MSAFromSeqRange(const MSA &msaIn, unsigned uFromSeqIndex, unsigned uSeqCount, + MSA &msaOut) + { + const unsigned uColCount = msaIn.GetColCount(); + msaOut.SetSize(uSeqCount, uColCount); + + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + const char *ptrName = msaIn.GetSeqName(uFromSeqIndex + uSeqIndex); + msaOut.SetSeqName(uSeqIndex, ptrName); + + for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) + { + const char c = msaIn.GetChar(uFromSeqIndex + uSeqIndex, uColIndex); + msaOut.SetChar(uSeqIndex, uColIndex, c); + } + } + } + +void MSAFromColRange(const MSA &msaIn, unsigned uFromColIndex, unsigned uColCount, + MSA &msaOut) + { + const unsigned uSeqCount = msaIn.GetSeqCount(); + const unsigned uInColCount = msaIn.GetColCount(); + + if (uFromColIndex + 
uColCount - 1 > uInColCount) + Quit("MSAFromColRange, out of bounds"); + + msaOut.SetSize(uSeqCount, uColCount); + + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + const char *ptrName = msaIn.GetSeqName(uSeqIndex); + unsigned uId = msaIn.GetSeqId(uSeqIndex); + msaOut.SetSeqName(uSeqIndex, ptrName); + msaOut.SetSeqId(uSeqIndex, uId); + + for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) + { + const char c = msaIn.GetChar(uSeqIndex, uFromColIndex + uColIndex); + msaOut.SetChar(uSeqIndex, uColIndex, c); + } + } + } + +void SeqVectFromMSA(const MSA &msa, SeqVect &v) + { + v.Clear(); + const unsigned uSeqCount = msa.GetSeqCount(); + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + Seq s; + msa.GetSeq(uSeqIndex, s); + + s.StripGaps(); + //if (0 == s.Length()) + // continue; + + const char *ptrName = msa.GetSeqName(uSeqIndex); + s.SetName(ptrName); + + v.AppendSeq(s); + } + } + +void DeleteGappedCols(MSA &msa) + { + unsigned uColIndex = 0; + for (;;) + { + if (uColIndex >= msa.GetColCount()) + break; + if (msa.IsGapColumn(uColIndex)) + msa.DeleteCol(uColIndex); + else + ++uColIndex; + } + } + +void MSAFromSeqSubset(const MSA &msaIn, const unsigned uSeqIndexes[], unsigned uSeqCount, + MSA &msaOut) + { + const unsigned uColCount = msaIn.GetColCount(); + msaOut.SetSize(uSeqCount, uColCount); + for (unsigned uSeqIndexOut = 0; uSeqIndexOut < uSeqCount; ++uSeqIndexOut) + { + unsigned uSeqIndexIn = uSeqIndexes[uSeqIndexOut]; + const char *ptrName = msaIn.GetSeqName(uSeqIndexIn); + unsigned uId = msaIn.GetSeqId(uSeqIndexIn); + msaOut.SetSeqName(uSeqIndexOut, ptrName); + msaOut.SetSeqId(uSeqIndexOut, uId); + for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) + { + const char c = msaIn.GetChar(uSeqIndexIn, uColIndex); + msaOut.SetChar(uSeqIndexOut, uColIndex, c); + } + } + } + +void AssertMSAEqIgnoreCaseAndGaps(const MSA &msa1, const MSA &msa2) + { + const unsigned uSeqCount1 = msa1.GetSeqCount(); + const 
unsigned uSeqCount2 = msa2.GetSeqCount(); + if (uSeqCount1 != uSeqCount2) + Quit("Seq count differs"); + + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount1; ++uSeqIndex) + { + Seq seq1; + msa1.GetSeq(uSeqIndex, seq1); + + unsigned uId = msa1.GetSeqId(uSeqIndex); + unsigned uSeqIndex2 = msa2.GetSeqIndex(uId); + + Seq seq2; + msa2.GetSeq(uSeqIndex2, seq2); + + if (!seq1.EqIgnoreCaseAndGaps(seq2)) + { + Log("Input:\n"); + seq1.LogMe(); + Log("Output:\n"); + seq2.LogMe(); + Quit("Seq %s differ ", msa1.GetSeqName(uSeqIndex)); + } + } + } + +void AssertMSAEq(const MSA &msa1, const MSA &msa2) + { + const unsigned uSeqCount1 = msa1.GetSeqCount(); + const unsigned uSeqCount2 = msa2.GetSeqCount(); + if (uSeqCount1 != uSeqCount2) + Quit("Seq count differs"); + + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount1; ++uSeqIndex) + { + Seq seq1; + msa1.GetSeq(uSeqIndex, seq1); + + unsigned uId = msa1.GetSeqId(uSeqIndex); + unsigned uSeqIndex2 = msa2.GetSeqIndex(uId); + + Seq seq2; + msa2.GetSeq(uSeqIndex2, seq2); + + if (!seq1.Eq(seq2)) + { + Log("Input:\n"); + seq1.LogMe(); + Log("Output:\n"); + seq2.LogMe(); + Quit("Seq %s differ ", msa1.GetSeqName(uSeqIndex)); + } + } + } + +void SetMSAWeightsMuscle(MSA &msa) + { + SEQWEIGHT Method = GetSeqWeightMethod(); + switch (Method) + { + case SEQWEIGHT_None: + msa.SetUniformWeights(); + return; + + case SEQWEIGHT_Henikoff: + msa.SetHenikoffWeights(); + return; + + case SEQWEIGHT_HenikoffPB: + msa.SetHenikoffWeightsPB(); + return; + + case SEQWEIGHT_GSC: + msa.SetGSCWeights(); + return; + + case SEQWEIGHT_ClustalW: + SetClustalWWeightsMuscle(msa); + return; + + case SEQWEIGHT_ThreeWay: + SetThreeWayWeightsMuscle(msa); + return; + } + Quit("SetMSAWeightsMuscle, Invalid method=%d", Method); + } + +static WEIGHT *g_MuscleWeights; +static unsigned g_uMuscleIdCount; + +WEIGHT GetMuscleSeqWeightById(unsigned uId) + { + if (0 == g_MuscleWeights) + Quit("g_MuscleWeights = 0"); + if (uId >= g_uMuscleIdCount) + 
Quit("GetMuscleSeqWeightById(%u): count=%u", + uId, g_uMuscleIdCount); + + return g_MuscleWeights[uId]; + } + +void SetMuscleTree(const Tree &tree) + { + g_ptrMuscleTree = &tree; + + if (SEQWEIGHT_ClustalW != GetSeqWeightMethod()) + return; + + delete[] g_MuscleWeights; + + const unsigned uLeafCount = tree.GetLeafCount(); + g_uMuscleIdCount = uLeafCount; + g_MuscleWeights = new WEIGHT[uLeafCount]; + CalcClustalWWeights(tree, g_MuscleWeights); + } + +void SetClustalWWeightsMuscle(MSA &msa) + { + if (0 == g_MuscleWeights) + Quit("g_MuscleWeights = 0"); + const unsigned uSeqCount = msa.GetSeqCount(); + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + const unsigned uId = msa.GetSeqId(uSeqIndex); + if (uId >= g_uMuscleIdCount) + Quit("SetClustalWWeightsMuscle: id out of range"); + msa.SetSeqWeight(uSeqIndex, g_MuscleWeights[uId]); + } + msa.NormalizeWeights((WEIGHT) 1.0); + } + +#define LOCAL_VERBOSE 0 + +void SetThreeWayWeightsMuscle(MSA &msa) + { + if (NULL_NEIGHBOR == g_uTreeSplitNode1 || NULL_NEIGHBOR == g_uTreeSplitNode2) + { + msa.SetHenikoffWeightsPB(); + return; + } + + const unsigned uMuscleSeqCount = g_ptrMuscleTree->GetLeafCount(); + WEIGHT *Weights = new WEIGHT[uMuscleSeqCount]; + + CalcThreeWayWeights(*g_ptrMuscleTree, g_uTreeSplitNode1, g_uTreeSplitNode2, + Weights); + + const unsigned uMSASeqCount = msa.GetSeqCount(); + for (unsigned uSeqIndex = 0; uSeqIndex < uMSASeqCount; ++uSeqIndex) + { + const unsigned uId = msa.GetSeqId(uSeqIndex); + if (uId >= uMuscleSeqCount) + Quit("SetThreeWayWeightsMuscle: id out of range"); + msa.SetSeqWeight(uSeqIndex, Weights[uId]); + } +#if LOCAL_VERBOSE + { + Log("SetThreeWayWeightsMuscle\n"); + for (unsigned n = 0; n < uMSASeqCount; ++n) + { + const unsigned uId = msa.GetSeqId(n); + Log("%20.20s %6.3f\n", msa.GetSeqName(n), Weights[uId]); + } + } +#endif + msa.NormalizeWeights((WEIGHT) 1.0); + + delete[] Weights; + } + +// Append msa2 at the end of msa1 +void MSAAppend(MSA &msa1, const MSA &msa2) + 
{ + const unsigned uSeqCount = msa1.GetSeqCount(); + + const unsigned uColCount1 = msa1.GetColCount(); + const unsigned uColCount2 = msa2.GetColCount(); + const unsigned uColCountCat = uColCount1 + uColCount2; + + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + unsigned uId = msa1.GetSeqId(uSeqIndex); + unsigned uSeqIndex2 = msa2.GetSeqIndex(uId); + for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) + { + const char c = msa2.GetChar(uSeqIndex2, uColIndex); + msa1.SetChar(uSeqIndex, uColCount1 + uColIndex, c); + } + } + } + +// "Catenate" two MSAs (by bad analogy with UNIX cat command). +// msa1 and msa2 must have same sequence names, but possibly +// in a different order. +// msaCat is the combined alignment produce by appending +// sequences in msa2 to sequences in msa1. +void MSACat(const MSA &msa1, const MSA &msa2, MSA &msaCat) + { + const unsigned uSeqCount = msa1.GetSeqCount(); + + const unsigned uColCount1 = msa1.GetColCount(); + const unsigned uColCount2 = msa2.GetColCount(); + const unsigned uColCountCat = uColCount1 + uColCount2; + + msaCat.SetSize(uSeqCount, uColCountCat); + + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + for (unsigned uColIndex = 0; uColIndex < uColCount1; ++uColIndex) + { + const char c = msa1.GetChar(uSeqIndex, uColIndex); + msaCat.SetChar(uSeqIndex, uColIndex, c); + } + + const char *ptrSeqName = msa1.GetSeqName(uSeqIndex); + unsigned uSeqIndex2; + msaCat.SetSeqName(uSeqIndex, ptrSeqName); + bool bFound = msa2.GetSeqIndex(ptrSeqName, &uSeqIndex2); + if (bFound) + { + for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) + { + const char c = msa2.GetChar(uSeqIndex2, uColIndex); + msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, c); + } + } + else + { + for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) + msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, '-'); + } + } + } diff --git a/src/muscle/muscle3.8.31/src/msadist.h 
b/src/muscle/muscle3.8.31/src/msadist.h new file mode 100644 index 0000000..8dcafcd --- /dev/null +++ b/src/muscle/muscle3.8.31/src/msadist.h @@ -0,0 +1,39 @@ +#ifndef MSADist_h +#define MSADist_h + +#include + +double GetScoreDist(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2); + +class MSADist + { +public: + MSADist(DISTANCE Distance) + { + m_Distance = Distance; + } + + double ComputeDist(const MSA &msa, unsigned uSeqIndex1, unsigned uSeqIndex2) + { + if (m_Distance == DISTANCE_ScoreDist) + return GetScoreDist(msa, uSeqIndex1, uSeqIndex2); + + double dPctId = msa.GetPctIdentityPair(uSeqIndex1, uSeqIndex2); + switch(m_Distance) + { + case DISTANCE_PctIdKimura: + return KimuraDist(dPctId); + case DISTANCE_PctIdLog: + if (dPctId < 0.05) + dPctId = 0.05; + return -log(dPctId); + } + Quit("MSADist::ComputeDist, invalid DISTANCE_%u", m_Distance); + return 0; + } + +private: + DISTANCE m_Distance; + }; + +#endif // MSADist_h diff --git a/src/muscle/muscle3.8.31/src/msadistkimura.cpp b/src/muscle/muscle3.8.31/src/msadistkimura.cpp new file mode 100644 index 0000000..6903c83 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/msadistkimura.cpp @@ -0,0 +1,88 @@ +#include "muscle.h" +#include "msa.h" +#include + +// "Standard" NJ distance: the Kimura measure. +// This is defined to be: +// +// log_e(1 - p - p*p/5) +// +// where p is the fraction of residues that differ, i.e.: +// +// p = (1 - fractional_conservation) +// +// This measure is infinite for p = 0.8541 and is considered +// unreliable for p >= 0.75 (according to the ClustalW docs). +// ClustalW uses a table lookup for values > 0.75. +// The following table was copied from the ClustalW file dayhoff.h. 
+ +static int dayhoff_pams[]={ + 195, /* 75.0% observed d; 195 PAMs estimated = 195% estimated d */ + 196, /* 75.1% observed d; 196 PAMs estimated */ + 197, 198, 199, 200, 200, 201, 202, 203, + 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, + 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, + 223, 224, 226, 227, 228, 229, 230, 231, 232, 233, + 234, 236, 237, 238, 239, 240, 241, 243, 244, 245, + 246, 248, 249, 250, /* 250 PAMs = 80.3% observed d */ + 252, 253, 254, 255, 257, 258, + 260, 261, 262, 264, 265, 267, 268, 270, 271, 273, + 274, 276, 277, 279, 281, 282, 284, 285, 287, 289, + 291, 292, 294, 296, 298, 299, 301, 303, 305, 307, + 309, 311, 313, 315, 317, 319, 321, 323, 325, 328, + 330, 332, 335, 337, 339, 342, 344, 347, 349, 352, + 354, 357, 360, 362, 365, 368, 371, 374, 377, 380, + 383, 386, 389, 393, 396, 399, 403, 407, 410, 414, + 418, 422, 426, 430, 434, 438, 442, 447, 451, 456, + 461, 466, 471, 476, 482, 487, 493, 498, 504, 511, + 517, 524, 531, 538, 545, 553, 560, 569, 577, 586, + 595, 605, 615, 626, 637, 649, 661, 675, 688, 703, + 719, 736, 754, 775, 796, 819, 845, 874, 907, 945, + /* 92.9% observed; 945 PAMs */ + 988 /* 93.0% observed; 988 PAMs */ +}; +static int iTableEntries = sizeof(dayhoff_pams)/sizeof(dayhoff_pams[0]); + +double KimuraDist(double dPctId) + { + double p = 1 - dPctId; +// Typical case: use Kimura's empirical formula + if (p < 0.75) + return -log(1 - p - (p*p)/5); + +// Per ClustalW, return 10.0 for anything over 93% + if (p > 0.93) + return 10.0; + +// If p >= 0.75, use table lookup + assert(p <= 1 && p >= 0.75); +// Thanks for Michael Hoel for pointing out a bug +// in the table index calculation in versions <= 3.52. 
// Invert Kimura's formula: given a distance d = -log(1 - p - p*p/5), recover
// p (the fraction of differing residues) and return the fractional
// identity 1 - p.
//
// Rearranged, p solves 0.2*p*p + p - (1 - exp(-d)) = 0; the root taken is
// the one with the positive square root.
double KimuraDistToPctId(double dKimuraDist)
{
    const double dQuadCoef = 0.2;
    const double dLinCoef = 1;
    const double dRhs = 1.0 - exp(-dKimuraDist);

    const double dDiscriminant = dLinCoef*dLinCoef + 4*dQuadCoef*dRhs;
    const double dDiffFraction = (-dLinCoef + sqrt(dDiscriminant))/(2*dQuadCoef);

    return 1 - dDiffFraction;
}
const int MAX_NAME = 63;

const unsigned uCharsPerLine = 50;
const unsigned uCharsPerBlock = 10;

// Return Name truncated at its first blank or tab, or at MAX_NAME
// characters, whichever comes first, then padded with blanks and cut to
// exactly PadLength characters (PadLength is clamped to 0..MAX_NAME).
//
// The result lives in a static buffer that is overwritten by the next call.
//
// Fix: the copy length is now limited to MAX_NAME and PadLength is clamped.
// Previously a name longer than MAX_NAME was copied in full and overran the
// buffer, and an out-of-range PadLength wrote the terminator outside it.
static const char *GetPaddedName(const char *Name, int PadLength)
{
    static char PaddedName[MAX_NAME+1];

    if (PadLength < 0)
        PadLength = 0;
    if (PadLength > MAX_NAME)
        PadLength = MAX_NAME;

    memset(PaddedName, ' ', MAX_NAME);
    size_t n = strcspn(Name, " \t");
    if (n > (size_t) MAX_NAME)
        n = (size_t) MAX_NAME;
    memcpy(PaddedName, Name, n);
    PaddedName[PadLength] = 0;
    return PaddedName;
}

// Return a pointer to the first character of s that also occurs in t, or 0
// if s contains none of the characters of t.
//
// Fix: the previous strcspn-based version returned 0 when the match was the
// very first character of s, and a pointer to the terminating NUL (instead
// of 0) when there was no match at all.
static const char *strfind(const char *s, const char *t)
{
    return strpbrk(s, t);
}
'N' : 'A'; + File.PutFormat(" MSF: %u Type: %c Check: 0000 ..\n\n", + GetColCount(), seqtype); + + int iLongestNameLength = 0; + for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) + { + const char *Name = GetSeqName(uSeqIndex); + const char *PaddedName = GetPaddedName(Name, MAX_NAME); + int iLength = (int) strcspn(PaddedName, " \t"); + if (iLength > iLongestNameLength) + iLongestNameLength = iLength; + } + + for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) + { + const char *Name = GetSeqName(uSeqIndex); + const char *PaddedName = GetPaddedName(Name, iLongestNameLength); + File.PutFormat(" Name: %s", PaddedName); + File.PutFormat(" Len: %u Check: %5u Weight: %g\n", + GetColCount(), GetGCGCheckSum(uSeqIndex), GetSeqWeight(uSeqIndex)); + } + File.PutString("\n//\n"); + if (0 == GetColCount()) + return; + + unsigned uLineCount = (GetColCount() - 1)/uCharsPerLine + 1; + for (unsigned uLineIndex = 0; uLineIndex < uLineCount; ++uLineIndex) + { + File.PutString("\n"); + unsigned uStartColIndex = uLineIndex*uCharsPerLine; + unsigned uEndColIndex = uStartColIndex + uCharsPerLine - 1; + if (uEndColIndex >= GetColCount()) + uEndColIndex = GetColCount() - 1; + for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) + { + const char *Name = GetSeqName(uSeqIndex); + const char *PaddedName = GetPaddedName(Name, iLongestNameLength); + File.PutFormat("%s ", PaddedName); + for (unsigned uColIndex = uStartColIndex; uColIndex <= uEndColIndex; + ++uColIndex) + { + if (0 == uColIndex%uCharsPerBlock) + File.PutString(" "); + char c = GetChar(uSeqIndex, uColIndex); + File.PutFormat("%c", c); + } + File.PutString("\n"); + } + } + } diff --git a/src/muscle/muscle3.8.31/src/muscle.cpp b/src/muscle/muscle3.8.31/src/muscle.cpp new file mode 100644 index 0000000..2a118bf --- /dev/null +++ b/src/muscle/muscle3.8.31/src/muscle.cpp @@ -0,0 +1,130 @@ +#include "muscle.h" +#include "msa.h" +#include "seqvect.h" +#include "msa.h" +#include "tree.h" 
+#include "profile.h" + +void MUSCLE(SeqVect &v, MSA &msaOut) + { + const unsigned uSeqCount = v.Length(); + + if (0 == uSeqCount) + Quit("No sequences in input file"); + + ALPHA Alpha = ALPHA_Undefined; + switch (g_SeqType) + { + case SEQTYPE_Auto: + Alpha = v.GuessAlpha(); + break; + + case SEQTYPE_Protein: + Alpha = ALPHA_Amino; + break; + + case SEQTYPE_RNA: + Alpha = ALPHA_RNA; + break; + + case SEQTYPE_DNA: + Alpha = ALPHA_DNA; + break; + + default: + Quit("Invalid seq type"); + } + SetAlpha(Alpha); + v.FixAlpha(); + + if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha) + { + SetPPScore(PPSCORE_SPN); + g_Distance1 = DISTANCE_Kmer4_6; + } + + unsigned uMaxL = 0; + unsigned uTotL = 0; + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + unsigned L = v.GetSeq(uSeqIndex).Length(); + uTotL += L; + if (L > uMaxL) + uMaxL = L; + } + + SetIter(1); + g_bDiags = g_bDiags1; + SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount); + + MSA::SetIdCount(uSeqCount); + +//// Initialize sequence ids. +//// From this point on, ids must somehow propogate from here. 
+// for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) +// v.SetSeqId(uSeqIndex, uSeqIndex); + + if (uSeqCount > 1) + MHackStart(v); + + if (0 == uSeqCount) + { + msaOut.Clear(); + return; + } + + if (1 == uSeqCount && ALPHA_Amino == Alpha) + { + const Seq &s = v.GetSeq(0); + msaOut.FromSeq(s); + return; + } + +// First iteration + Tree GuideTree; + TreeFromSeqVect(v, GuideTree, g_Cluster1, g_Distance1, g_Root1); + + SetMuscleTree(GuideTree); + + ProgNode *ProgNodes = 0; + if (g_bLow) + ProgNodes = ProgressiveAlignE(v, GuideTree, msaOut); + else + ProgressiveAlign(v, GuideTree, msaOut); + SetCurrentAlignment(msaOut); + + if (1 == g_uMaxIters || 2 == uSeqCount) + { + MHackEnd(msaOut); + return; + } + + g_bDiags = g_bDiags2; + SetIter(2); + + if (g_bLow) + { + if (0 != g_uMaxTreeRefineIters) + RefineTreeE(msaOut, v, GuideTree, ProgNodes); + } + else + RefineTree(msaOut, GuideTree); + + extern void DeleteProgNode(ProgNode &Node); + const unsigned uNodeCount = GuideTree.GetNodeCount(); + for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) + DeleteProgNode(ProgNodes[uNodeIndex]); + + delete[] ProgNodes; + ProgNodes = 0; + + SetSeqWeightMethod(g_SeqWeight2); + SetMuscleTree(GuideTree); + + if (g_bAnchors) + RefineVert(msaOut, GuideTree, g_uMaxIters - 2); + else + RefineHoriz(msaOut, GuideTree, g_uMaxIters - 2, false, false); + + MHackEnd(msaOut); + } diff --git a/src/muscle/muscle3.8.31/src/muscle.h b/src/muscle/muscle3.8.31/src/muscle.h new file mode 100644 index 0000000..30c7b6d --- /dev/null +++ b/src/muscle/muscle3.8.31/src/muscle.h @@ -0,0 +1,330 @@ +#if DEBUG && !_DEBUG +#define _DEBUG 1 +#endif + +#if _DEBUG && !DEBUG +#define DEBUG 1 +#endif + +#if _MSC_VER +#define TIMING 0 +#endif + +#define VER_3_52 0 + +#ifdef _MSC_VER // Miscrosoft compiler +#pragma warning(disable : 4800) // int-bool conversion +#pragma warning(disable : 4996) // deprecated names like strdup, isatty. 
+#endif + +extern const char *MUSCLE_LONG_VERSION; +#define SHORT_VERSION "3.8" + +#include +#include +#include +#include +#include + +#define DOUBLE_AFFINE 0 +#define SINGLE_AFFINE 1 +#define PAF 0 + +#include "types.h" +#include "intmath.h" +#include "alpha.h" +#include "params.h" + +#ifndef _WIN32 +#define stricmp strcasecmp +#define strnicmp strncasecmp +#define _snprintf snprintf +#define _fsopen(name, mode, share) fopen((name), (mode)) +#endif + +#if DEBUG +#undef assert +#define assert(b) Call_MY_ASSERT(__FILE__, __LINE__, b, #b) +void Call_MY_ASSERT(const char *file, int line, bool b, const char *msg); +#else +#define assert(exp) ((void)0) +#endif + +extern int g_argc; +extern char **g_argv; + +#define Rotate(a, b, c) { SCORE *tmp = a; a = b; b = c; c = tmp; } + +const double VERY_LARGE_DOUBLE = 1e20; + +extern unsigned g_uTreeSplitNode1; +extern unsigned g_uTreeSplitNode2; + +// Number of elements in array a[] +#define countof(a) (sizeof(a)/sizeof(a[0])) + +// Maximum of two of any type +#define Max2(a, b) ((a) > (b) ? (a) : (b)) + +// Maximum of three of any type +#define Max3(a, b, c) Max2(Max2(a, b), c) + +// Minimum of two of any type +#define Min2(a, b) ((a) < (b) ? (a) : (b)) + +// Maximum of four of any type +#define Max4(a, b, c, d) Max2(Max2(a, b), Max2(c, d)) + +const double VERY_NEGATIVE_DOUBLE = -9e29; +const float VERY_NEGATIVE_FLOAT = (float) -9e29; + +const double BLOSUM_DIST = 0.62; // todo settable + +// insane value for uninitialized variables +const unsigned uInsane = 8888888; +const int iInsane = 8888888; +const SCORE scoreInsane = 8888888; +const char cInsane = (char) 0xcd; // int 3 instruction, used e.g. for unint. 
memory +const double dInsane = VERY_NEGATIVE_DOUBLE; +const float fInsane = VERY_NEGATIVE_FLOAT; +const char INVALID_STATE = '*'; +const BASETYPE BTInsane = (BASETYPE) dInsane; +const WEIGHT wInsane = BTInsane; + +extern double g_dNAN; + +extern unsigned long g_tStart; + +void Quit(const char szFormat[], ...); +void Warning(const char szFormat[], ...); +void TrimBlanks(char szStr[]); +void TrimLeadingBlanks(char szStr[]); +void TrimTrailingBlanks(char szStr[]); +void Log(const char szFormat[], ...); +bool Verbose(); +const char *ScoreToStr(SCORE Score); +const char *ScoreToStrL(SCORE Score); +SCORE StrToScore(const char *pszStr); +void Break(); + +double VecSum(const double v[], unsigned n); +bool IsValidInteger(const char *Str); +bool IsValidSignedInteger(const char *Str); +bool IsValidIdentifier(const char *Str); +bool IsValidFloatChar(char c); +bool isident(char c); +bool isidentf(char c); + +void TreeFromSeqVect(const SeqVect &c, Tree &tree, CLUSTER Cluster, + DISTANCE Distance, ROOT Root, const char *SaveFileName = 0); +void TreeFromMSA(const MSA &msa, Tree &tree, CLUSTER Cluster, + DISTANCE Distance, ROOT Root, const char *SaveFileName = 0); + +void StripGaps(char szStr[]); +void StripWhitespace(char szStr[]); +const char *GetTimeAsStr(); +unsigned CalcBLOSUMWeights(MSA &Aln, ClusterTree &BlosumCluster); +void CalcGSCWeights(MSA &Aln, const ClusterTree &BlosumCluster); +void AssertNormalized(const PROB p[]); +void AssertNormalizedOrZero(const PROB p[]); +void AssertNormalized(const double p[]); +bool VectorIsZero(const double dValues[], unsigned n); +void VectorSet(double dValues[], unsigned n, double d); +bool VectorIsZero(const float dValues[], unsigned n); +void VectorSet(float dValues[], unsigned n, float d); + +// @@TODO should be "not linux" +#if _WIN32 +double log2(double x); // Defined in on Linux +#endif + +double pow2(double x); +double lnTolog2(double ln); + +double lp2(double x); +SCORE SumLog(SCORE x, SCORE y); +SCORE SumLog(SCORE x, SCORE y, 
SCORE z); +SCORE SumLog(SCORE w, SCORE x, SCORE y, SCORE z); + +double lp2Fast(double x); +double SumLogFast(double x, double y); +double SumLogFast(double x, double y, double z); +double SumLogFast(double w, double x, double y, double z); + +void chkmem(const char szMsg[] = ""); + +void Normalize(PROB p[], unsigned n); +void Normalize(PROB p[], unsigned n, double dRequiredTotal); +void NormalizeUnlessZero(PROB p[], unsigned n); + +void DebugPrintf(const char szFormat[], ...); +void SetListFileName(const char *ptrListFileName, bool bAppend); +void ModelFromAlign(const char *strInputFileName, const char *strModelFileName, + double dMaxNIC); +double GetMemUseMB(); +double GetRAMSizeMB(); +double GetPeakMemUseMB(); +void CheckMemUse(); +const char *ElapsedTimeAsString(); +char *SecsToHHMMSS(long lSecs, char szStr[]); +double GetCPUGHz(); +SCORE GetBlosum62(unsigned uLetterA, unsigned uLetterB); +SCORE GetBlosum62d(unsigned uLetterA, unsigned uLetterB); +SCORE GetBlosum50(unsigned uLetterA, unsigned uLetterB); +void AssertNormalizedDist(const PROB p[], unsigned N); +void CmdLineError(const char *Format, ...); +void Fatal(const char *Format, ...); +void InitCmd(); +void ExecCommandLine(int argc, char *argv[]); +void DoCmd(); +void SetLogFile(); +void NameFromPath(const char szPath[], char szName[], unsigned uBytes); +char *strsave(const char *s); +void DistKmer20_3(const SeqVect &v, DistFunc &DF); +void DistKbit20_3(const SeqVect &v, DistFunc &DF); +void DistKmer6_6(const SeqVect &v, DistFunc &DF); +void DistKmer4_6(const SeqVect &v, DistFunc &DF); +void DistPWKimura(const SeqVect &v, DistFunc &DF); +void FastDistKmer(const SeqVect &v, DistFunc &DF); +void DistUnaligned(const SeqVect &v, DISTANCE DistMethod, DistFunc &DF); +double PctIdToMAFFTDist(double dPctId); +double KimuraDist(double dPctId); +void SetFastParams(); +void AssertProfsEq(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB); +void ValidateMuscleIds(const MSA &msa); +void 
ValidateMuscleIds(const Tree &tree); +void TraceBackToPath(int **TraceBack, unsigned uLengthA, + unsigned uLengthB, PWPath &Path); +void BitTraceBack(char **TraceBack, unsigned uLengthA, unsigned uLengthB, + char LastEdge, PWPath &Path); +SCORE AlignTwoMSAs(const MSA &msa1, const MSA &msa2, MSA &msaOut, PWPath &Path, + bool bLockLeft = false, bool bLockRight = false); +SCORE AlignTwoProfs( + const ProfPos *PA, unsigned uLengthA, WEIGHT wA, + const ProfPos *PB, unsigned uLengthB, WEIGHT wB, + PWPath &Path, ProfPos **ptrPout, unsigned *ptruLengthOut); +void AlignTwoProfsGivenPath(const PWPath &Path, + const ProfPos *PA, unsigned uLengthA, WEIGHT wA, + const ProfPos *PB, unsigned uLengthB, WEIGHT wB, + ProfPos **ptrPOut, unsigned *ptruLengthOut); +void AlignTwoMSAsGivenPathSW(const PWPath &Path, const MSA &msaA, const MSA &msaB, + MSA &msaCombined); +void AlignTwoMSAsGivenPath(const PWPath &Path, const MSA &msaA, const MSA &msaB, + MSA &msaCombined); +SCORE FastScorePath2(const ProfPos *PA, unsigned uLengthA, + const ProfPos *PB, unsigned uLengthB, const PWPath &Path); +SCORE GlobalAlignDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, PWPath &Path); +SCORE GlobalAlignSimple(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, PWPath &Path); +SCORE GlobalAlignSP(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, PWPath &Path); +SCORE GlobalAlignSPN(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, PWPath &Path); +SCORE GlobalAlignLE(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, PWPath &Path); +void CalcThreeWayWeights(const Tree &tree, unsigned uNode1, unsigned uNode2, + WEIGHT *Weights); +SCORE GlobalAlignSS(const Seq &seqA, const Seq &seqB, PWPath &Path); +bool RefineHoriz(MSA &msaIn, const Tree &tree, unsigned uIters, bool bLockLeft, bool bLockRight); +bool RefineVert(MSA &msaIn, const Tree &tree, unsigned uIters); +SCORE 
GlobalAlignNoDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, PWPath &Path); + +void SetInputFileName(const char *pstrFileName); +void SetIter(unsigned uIter); +void IncIter(); +void SetMaxIters(unsigned uMaxIters); +void Progress(unsigned uStep, unsigned uTotalSteps); +void Progress(const char *szFormat, ...); +void SetStartTime(); +void ProgressStepsDone(); +void SetProgressDesc(const char szDesc[]); +void SetSeqStats(unsigned uSeqCount, unsigned uMaxL, unsigned uAvgL); + +void SetNewHandler(); +void SaveCurrentAlignment(); +void SetCurrentAlignment(MSA &msa); +void SetOutputFileName(const char *out); + +#if DEBUG +void SetMuscleSeqVect(SeqVect &v); +void SetMuscleInputMSA(MSA &msa); +void ValidateMuscleIds(const MSA &msa); +void ValidateMuscleIds(const Tree &tree); +#else +#define SetMuscleSeqVect(x) /* empty */ +#define SetMuscleInputMSA(x) /* empty */ +#define ValidateMuscleIds(x) /* empty */ +#endif + +void ProcessArgVect(int argc, char *argv[]); +void ProcessArgStr(const char *Str); +void Usage(); +void SetParams(); + +void SortCounts(const FCOUNT fcCounts[], unsigned SortOrder[]); +unsigned ResidueGroupFromFCounts(const FCOUNT fcCounts[]); +FCOUNT SumCounts(const FCOUNT Counts[]); + +bool FlagOpt(const char *Name); +const char *ValueOpt(const char *Name); +void DoMuscle(); +void ProfDB(); +void DoSP(); +void ProgAlignSubFams(); +void Run(); +void ListParams(); +void OnException(); +void SetSeqWeightMethod(SEQWEIGHT Method); +SEQWEIGHT GetSeqWeightMethod(); +WEIGHT GetMuscleSeqWeightById(unsigned uId); +void ListDiagSavings(); +void CheckMaxTime(); +const char *MaxSecsToStr(); +unsigned long GetStartTime(); + +void ProgressiveAlign(const SeqVect &v, const Tree &GuideTree, MSA &a); +ProgNode *ProgressiveAlignE(const SeqVect &v, const Tree &GuideTree, MSA &a); + +void CalcDistRangeKmer6_6(const MSA &msa, unsigned uRow, float Dist[]); +void CalcDistRangeKmer20_3(const MSA &msa, unsigned uRow, float Dist[]); +void 
CalcDistRangeKmer20_4(const MSA &msa, unsigned uRow, float Dist[]); +void CalcDistRangePctIdKimura(const MSA &msa, unsigned uRow, float Dist[]); +void CalcDistRangePctIdLog(const MSA &msa, unsigned uRow, float Dist[]); + +void MakeRootMSA(const SeqVect &v, const Tree &GuideTree, ProgNode Nodes[], MSA &a); +void MakeRootMSABrenner(SeqVect &v, const Tree &GuideTree, ProgNode Nodes[], MSA &a); + +void Refine(); +void Local(); +void Profile(); +void PPScore(); +void UPGMA2(const DistCalc &DC, Tree &tree, LINKAGE Linkage); + +char *GetFastaSeq(FILE *f, unsigned *ptrSeqLength, char **ptrLabel, + bool DeleteGaps = true); +SCORE SW(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, PWPath &Path); +void TraceBackSW(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, const SCORE *DPM_, const SCORE *DPD_, const SCORE *DPI_, + unsigned uPrefixLengthAMax, unsigned uPrefixLengthBMax, PWPath &Path); +void DiffPaths(const PWPath &p1, const PWPath &p2, unsigned Edges1[], + unsigned *ptruDiffCount1, unsigned Edges2[], unsigned *ptruDiffCount2); +void SetPPScore(bool bRespectFlagOpts = true); +void SetPPScore(PPSCORE p); +SCORE GlobalAlignDimer(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, PWPath &Path); +bool MissingCommand(); +void Credits(); +void ProfileProfile(MSA &msa1, MSA &msa2, MSA &msaOut); +void MHackStart(SeqVect &v); +void MHackEnd(MSA &msa); +void WriteScoreFile(const MSA &msa); +char ConsensusChar(const ProfPos &PP); +void Stabilize(const MSA &msa, MSA &msaStable); +void MuscleOutput(MSA &msa); +PTR_SCOREMATRIX ReadMx(TextFile &File); +void MemPlus(size_t Bytes, char *Where); +void MemMinus(size_t Bytes, char *Where); diff --git a/src/muscle/muscle3.8.31/src/muscle.vcproj b/src/muscle/muscle3.8.31/src/muscle.vcproj new file mode 100644 index 0000000..226dac2 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/muscle.vcproj @@ -0,0 +1,922 @@ + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/muscle/muscle3.8.31/src/muscle21 b/src/muscle/muscle3.8.31/src/muscle21 new file mode 100644 index 0000000..45b995c Binary files /dev/null and b/src/muscle/muscle3.8.31/src/muscle21 differ diff --git a/src/muscle/muscle3.8.31/src/muscle21.vcproj b/src/muscle/muscle3.8.31/src/muscle21.vcproj new file mode 100644 index 0000000..eadf4a8 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/muscle21.vcproj @@ -0,0 +1,423 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/muscle/muscle3.8.31/src/muscleout.cpp b/src/muscle/muscle3.8.31/src/muscleout.cpp new file mode 100644 index 0000000..d4c451c --- /dev/null +++ b/src/muscle/muscle3.8.31/src/muscleout.cpp @@ -0,0 +1,116 @@ +#include "muscle.h" +#include "msa.h" +#include "params.h" 
+#include "textfile.h" + +static void DoOutput(MSA &msa) + { + bool AnyOutput = false; + +// Value options + if (g_pstrFASTAOutFileName) + { + TextFile File(g_pstrFASTAOutFileName, true); + msa.ToFASTAFile(File); + AnyOutput = true; + } + + if (g_pstrMSFOutFileName) + { + TextFile File(g_pstrMSFOutFileName, true); + msa.ToMSFFile(File); + AnyOutput = true; + } + + if (g_pstrClwOutFileName) + { + TextFile File(g_pstrClwOutFileName, true); + msa.ToAlnFile(File); + AnyOutput = true; + } + + if (g_pstrClwStrictOutFileName) + { + g_bClwStrict = true; + TextFile File(g_pstrClwStrictOutFileName, true); + msa.ToAlnFile(File); + AnyOutput = true; + } + + if (g_pstrHTMLOutFileName) + { + TextFile File(g_pstrHTMLOutFileName, true); + msa.ToHTMLFile(File); + AnyOutput = true; + } + + if (g_pstrPHYIOutFileName) + { + TextFile File(g_pstrPHYIOutFileName, true); + msa.ToPhyInterleavedFile(File); + AnyOutput = true; + } + + if (g_pstrPHYSOutFileName) + { + TextFile File(g_pstrPHYSOutFileName, true); + msa.ToPhySequentialFile(File); + AnyOutput = true; + } + +// Flag options, at most one used (because only one -out filename) + TextFile fileOut(g_pstrOutFileName, true); + if (g_bFASTA) + { + msa.ToFASTAFile(fileOut); + AnyOutput = true; + } + else if (g_bMSF) + { + msa.ToMSFFile(fileOut); + AnyOutput = true; + } + else if (g_bAln) + { + msa.ToAlnFile(fileOut); + AnyOutput = true; + } + else if (g_bHTML) + { + msa.ToHTMLFile(fileOut); + AnyOutput = true; + } + else if (g_bPHYI) + { + msa.ToPhyInterleavedFile(fileOut); + AnyOutput = true; + } + else if (g_bPHYS) + { + msa.ToPhySequentialFile(fileOut); + AnyOutput = true; + } + +// If -out option was given but no flags, output as FASTA + if (!AnyOutput) + msa.ToFASTAFile(fileOut); + + fileOut.Close(); + + if (0 != g_pstrScoreFileName) + WriteScoreFile(msa); + } + +void MuscleOutput(MSA &msa) + { + MHackEnd(msa); + if (g_bStable) + { + MSA msaStable; + Stabilize(msa, msaStable); + msa.Clear(); // save memory + DoOutput(msaStable); + } + 
else + DoOutput(msa); + } diff --git a/src/muscle/muscle3.8.31/src/nucmx.cpp b/src/muscle/muscle3.8.31/src/nucmx.cpp new file mode 100644 index 0000000..8593e9e --- /dev/null +++ b/src/muscle/muscle3.8.31/src/nucmx.cpp @@ -0,0 +1,23 @@ +#include "muscle.h" + +// BLASTZ default parameters +// open 400, extend 30, matrix as below + +const float NUC_EXTEND = 30; +const float NUC_SP_CENTER = 2*NUC_EXTEND; + +#define v(x) ((float) x + NUC_SP_CENTER) +#define ROW(A, C, G, T) \ + { v(A), v(C), v(G), v(T) }, + +float NUC_SP[32][32] = + { +// A C G T +ROW( 91, -114, -31, -123) // A + +ROW( -114, 100, -125, -31) // C + +ROW( -31, -125, 100, -114) // G + +ROW( -123, -31, -114, 91) // T + }; diff --git a/src/muscle/muscle3.8.31/src/nwdasimple.cpp b/src/muscle/muscle3.8.31/src/nwdasimple.cpp new file mode 100644 index 0000000..b4012d9 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/nwdasimple.cpp @@ -0,0 +1,494 @@ +#include "muscle.h" +#include +#include "pwpath.h" +#include "profile.h" +#include + +#define TRACE 0 + +bool g_bKeepSimpleDP; +SCORE *g_DPM; +SCORE *g_DPD; +SCORE *g_DPE; +SCORE *g_DPI; +SCORE *g_DPJ; +char *g_TBM; +char *g_TBD; +char *g_TBE; +char *g_TBI; +char *g_TBJ; + +#if DOUBLE_AFFINE + +static char XlatEdgeType(char c) + { + if ('E' == c) + return 'D'; + if ('J' == c) + return 'I'; + return c; + } + +static const char *LocalScoreToStr(SCORE s) + { + static char str[16]; + if (s < -100000) + return " *"; + sprintf(str, "%6.1f", s); + return str; + } + +static void ListTB(const char *TBM_, const ProfPos *PA, const ProfPos *PB, + unsigned uPrefixCountA, unsigned uPrefixCountB) + { + Log(" "); + for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) + { + char c = ' '; + if (uPrefixLengthB > 0) + c = ConsensusChar(PB[uPrefixLengthB - 1]); + Log(" %4u:%c", uPrefixLengthB, c); + } + Log("\n"); + for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) + { + char c = ' '; + if (uPrefixLengthA > 0) + c = 
ConsensusChar(PA[uPrefixLengthA - 1]); + Log("%4u:%c ", uPrefixLengthA, c); + for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) + Log(" %6c", TBM(uPrefixLengthA, uPrefixLengthB)); + Log("\n"); + } + } + +static void ListDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB, + unsigned uPrefixCountA, unsigned uPrefixCountB) + { + Log(" "); + for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) + { + char c = ' '; + if (uPrefixLengthB > 0) + c = ConsensusChar(PB[uPrefixLengthB - 1]); + Log(" %4u:%c", uPrefixLengthB, c); + } + Log("\n"); + for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) + { + char c = ' '; + if (uPrefixLengthA > 0) + c = ConsensusChar(PA[uPrefixLengthA - 1]); + Log("%4u:%c ", uPrefixLengthA, c); + for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) + Log(" %s", LocalScoreToStr(DPM(uPrefixLengthA, uPrefixLengthB))); + Log("\n"); + } + } + +SCORE NWDASimple(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, PWPath &Path) + { + assert(uLengthB > 0 && uLengthA > 0); + + const unsigned uPrefixCountA = uLengthA + 1; + const unsigned uPrefixCountB = uLengthB + 1; + +// Allocate DP matrices + const size_t LM = uPrefixCountA*uPrefixCountB; + SCORE *DPL_ = new SCORE[LM]; + SCORE *DPM_ = new SCORE[LM]; + SCORE *DPD_ = new SCORE[LM]; + SCORE *DPE_ = new SCORE[LM]; + SCORE *DPI_ = new SCORE[LM]; + SCORE *DPJ_ = new SCORE[LM]; + + char *TBM_ = new char[LM]; + char *TBD_ = new char[LM]; + char *TBE_ = new char[LM]; + char *TBI_ = new char[LM]; + char *TBJ_ = new char[LM]; + + memset(TBM_, '?', LM); + memset(TBD_, '?', LM); + memset(TBE_, '?', LM); + memset(TBI_, '?', LM); + memset(TBJ_, '?', LM); + + DPM(0, 0) = 0; + DPD(0, 0) = MINUS_INFINITY; + DPE(0, 0) = MINUS_INFINITY; + DPI(0, 0) = MINUS_INFINITY; + DPJ(0, 0) = MINUS_INFINITY; + + DPM(1, 0) = MINUS_INFINITY; + DPD(1, 0) = PA[0].m_scoreGapOpen; + 
DPE(1, 0) = PA[0].m_scoreGapOpen2; + TBD(1, 0) = 'D'; + TBE(1, 0) = 'E'; + DPI(1, 0) = MINUS_INFINITY; + DPJ(1, 0) = MINUS_INFINITY; + + DPM(0, 1) = MINUS_INFINITY; + DPD(0, 1) = MINUS_INFINITY; + DPE(0, 1) = MINUS_INFINITY; + DPI(0, 1) = PB[0].m_scoreGapOpen; + DPJ(0, 1) = PB[0].m_scoreGapOpen2; + TBI(0, 1) = 'I'; + TBJ(0, 1) = 'J'; + +// Empty prefix of B is special case + for (unsigned uPrefixLengthA = 2; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) + { + DPM(uPrefixLengthA, 0) = MINUS_INFINITY; + + DPD(uPrefixLengthA, 0) = DPD(uPrefixLengthA - 1, 0) + g_scoreGapExtend; + DPE(uPrefixLengthA, 0) = DPE(uPrefixLengthA - 1, 0) + g_scoreGapExtend2; + + TBD(uPrefixLengthA, 0) = 'D'; + TBE(uPrefixLengthA, 0) = 'E'; + + DPI(uPrefixLengthA, 0) = MINUS_INFINITY; + DPJ(uPrefixLengthA, 0) = MINUS_INFINITY; + } + +// Empty prefix of A is special case + for (unsigned uPrefixLengthB = 2; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) + { + DPM(0, uPrefixLengthB) = MINUS_INFINITY; + + DPD(0, uPrefixLengthB) = MINUS_INFINITY; + DPE(0, uPrefixLengthB) = MINUS_INFINITY; + + DPI(0, uPrefixLengthB) = DPI(0, uPrefixLengthB - 1) + g_scoreGapExtend; + DPJ(0, uPrefixLengthB) = DPJ(0, uPrefixLengthB - 1) + g_scoreGapExtend2; + + TBI(0, uPrefixLengthB) = 'I'; + TBJ(0, uPrefixLengthB) = 'J'; + } + +// Special case to agree with NWFast, no D-I transitions so... 
+ DPD(uLengthA, 0) = MINUS_INFINITY; + DPE(uLengthA, 0) = MINUS_INFINITY; +// DPI(0, uLengthB) = MINUS_INFINITY; +// DPJ(0, uLengthB) = MINUS_INFINITY; + +// ============ +// Main DP loop +// ============ + SCORE scoreGapCloseB = MINUS_INFINITY; + SCORE scoreGapClose2B = MINUS_INFINITY; + for (unsigned uPrefixLengthB = 1; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) + { + const ProfPos &PPB = PB[uPrefixLengthB - 1]; + + SCORE scoreGapCloseA = MINUS_INFINITY; + SCORE scoreGapClose2A = MINUS_INFINITY; + for (unsigned uPrefixLengthA = 1; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) + { + const ProfPos &PPA = PA[uPrefixLengthA - 1]; + + { + // Match M=LetterA+LetterB + SCORE scoreLL = ScoreProfPos2(PPA, PPB); + DPL(uPrefixLengthA, uPrefixLengthB) = scoreLL; + + SCORE scoreMM = DPM(uPrefixLengthA-1, uPrefixLengthB-1); + SCORE scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseA; + SCORE scoreEM = DPE(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapClose2A; + SCORE scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseB; + SCORE scoreJM = DPJ(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapClose2B; + + SCORE scoreBest; + if (scoreMM >= scoreDM && scoreMM >= scoreEM && scoreMM >= scoreIM && scoreMM >= scoreJM) + { + scoreBest = scoreMM; + TBM(uPrefixLengthA, uPrefixLengthB) = 'M'; + } + else if (scoreDM >= scoreMM && scoreDM >= scoreEM && scoreDM >= scoreIM && scoreDM >= scoreJM) + { + scoreBest = scoreDM; + TBM(uPrefixLengthA, uPrefixLengthB) = 'D'; + } + else if (scoreEM >= scoreMM && scoreEM >= scoreDM && scoreEM >= scoreIM && scoreEM >= scoreJM) + { + scoreBest = scoreEM; + TBM(uPrefixLengthA, uPrefixLengthB) = 'E'; + } + else if (scoreIM >= scoreMM && scoreIM >= scoreDM && scoreIM >= scoreEM && scoreIM >= scoreJM) + { + scoreBest = scoreIM; + TBM(uPrefixLengthA, uPrefixLengthB) = 'I'; + } + else + { + assert(scoreJM >= scoreMM && scoreJM >= scoreDM && scoreJM >= scoreEM && scoreJM >= scoreIM); + scoreBest = scoreJM; + 
TBM(uPrefixLengthA, uPrefixLengthB) = 'J'; + } + DPM(uPrefixLengthA, uPrefixLengthB) = scoreBest + scoreLL; + } + + { + // Delete D=LetterA+GapB + SCORE scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) + + PA[uPrefixLengthA-1].m_scoreGapOpen; + SCORE scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB) + g_scoreGapExtend; + + SCORE scoreBest; + if (scoreMD >= scoreDD) + { + scoreBest = scoreMD; + TBD(uPrefixLengthA, uPrefixLengthB) = 'M'; + } + else + { + assert(scoreDD >= scoreMD); + scoreBest = scoreDD; + TBD(uPrefixLengthA, uPrefixLengthB) = 'D'; + } + DPD(uPrefixLengthA, uPrefixLengthB) = scoreBest; + } + + { + // Delete E=LetterA+GapB + SCORE scoreME = DPM(uPrefixLengthA-1, uPrefixLengthB) + + PA[uPrefixLengthA-1].m_scoreGapOpen2; + SCORE scoreEE = DPE(uPrefixLengthA-1, uPrefixLengthB) + g_scoreGapExtend2; + + SCORE scoreBest; + if (scoreME >= scoreEE) + { + scoreBest = scoreME; + TBE(uPrefixLengthA, uPrefixLengthB) = 'M'; + } + else + { + assert(scoreEE >= scoreME); + scoreBest = scoreEE; + TBE(uPrefixLengthA, uPrefixLengthB) = 'E'; + } + DPE(uPrefixLengthA, uPrefixLengthB) = scoreBest; + } + + // Insert I=GapA+LetterB + { + SCORE scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) + + PB[uPrefixLengthB - 1].m_scoreGapOpen; + SCORE scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1) + g_scoreGapExtend; + + SCORE scoreBest; + if (scoreMI >= scoreII) + { + scoreBest = scoreMI; + TBI(uPrefixLengthA, uPrefixLengthB) = 'M'; + } + else + { + assert(scoreII > scoreMI); + scoreBest = scoreII; + TBI(uPrefixLengthA, uPrefixLengthB) = 'I'; + } + DPI(uPrefixLengthA, uPrefixLengthB) = scoreBest; + } + + // Insert J=GapA+LetterB + { + SCORE scoreMJ = DPM(uPrefixLengthA, uPrefixLengthB-1) + + PB[uPrefixLengthB - 1].m_scoreGapOpen2; + SCORE scoreJJ = DPJ(uPrefixLengthA, uPrefixLengthB-1) + g_scoreGapExtend2; + + SCORE scoreBest; + if (scoreMJ >= scoreJJ) + { + scoreBest = scoreMJ; + TBJ(uPrefixLengthA, uPrefixLengthB) = 'M'; + } + else + { + assert(scoreJJ > scoreMJ); + scoreBest = 
scoreJJ; + TBJ(uPrefixLengthA, uPrefixLengthB) = 'J'; + } + DPJ(uPrefixLengthA, uPrefixLengthB) = scoreBest; + } + + scoreGapCloseA = PPA.m_scoreGapClose; + scoreGapClose2A = PPA.m_scoreGapClose2; + } + scoreGapCloseB = PPB.m_scoreGapClose; + scoreGapClose2B = PPB.m_scoreGapClose2; + } + +#if TRACE + Log("\n"); + Log("DA Simple DPL:\n"); + ListDP(DPL_, PA, PB, uPrefixCountA, uPrefixCountB); + Log("\n"); + Log("DA Simple DPM:\n"); + ListDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB); + Log("\n"); + Log("DA Simple DPD:\n"); + ListDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB); + Log("\n"); + Log("DA Simple DPE:\n"); + ListDP(DPE_, PA, PB, uPrefixCountA, uPrefixCountB); + Log("\n"); + Log("DA Simple DPI:\n"); + ListDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB); + Log("\n"); + Log("DA Simple DPJ:\n"); + ListDP(DPJ_, PA, PB, uPrefixCountA, uPrefixCountB); + Log("\n"); + Log("DA Simple TBM:\n"); + ListTB(TBM_, PA, PB, uPrefixCountA, uPrefixCountB); + Log("\n"); + Log("DA Simple TBD:\n"); + ListTB(TBD_, PA, PB, uPrefixCountA, uPrefixCountB); + Log("\n"); + Log("DA Simple TBE:\n"); + ListTB(TBE_, PA, PB, uPrefixCountA, uPrefixCountB); + Log("\n"); + Log("DA Simple TBI:\n"); + ListTB(TBI_, PA, PB, uPrefixCountA, uPrefixCountB); + Log("\n"); + Log("DA Simple TBJ:\n"); + ListTB(TBJ_, PA, PB, uPrefixCountA, uPrefixCountB); +#endif + +// Trace-back +// ========== + Path.Clear(); + +// Find last edge + SCORE M = DPM(uLengthA, uLengthB); + SCORE D = DPD(uLengthA, uLengthB) + PA[uLengthA-1].m_scoreGapClose; + SCORE E = DPE(uLengthA, uLengthB) + PA[uLengthA-1].m_scoreGapClose2; + SCORE I = DPI(uLengthA, uLengthB) + PB[uLengthB-1].m_scoreGapClose; + SCORE J = DPJ(uLengthA, uLengthB) + PB[uLengthB-1].m_scoreGapClose2; + char cEdgeType = '?'; + + SCORE BestScore = M; + cEdgeType = 'M'; + if (D > BestScore) + { + cEdgeType = 'D'; + BestScore = D; + } + if (E > BestScore) + { + cEdgeType = 'E'; + BestScore = E; + } + if (I > BestScore) + { + cEdgeType = 'I'; + BestScore = I; + } + if 
(J > BestScore) + { + cEdgeType = 'J'; + BestScore = J; + } + +#if TRACE + Log("DA Simple: MAB=%.4g DAB=%.4g EAB=%.4g IAB=%.4g JAB=%.4g best=%c\n", + M, D, E, I, J, cEdgeType); +#endif + + unsigned PLA = uLengthA; + unsigned PLB = uLengthB; + for (;;) + { + PWEdge Edge; + Edge.cType = XlatEdgeType(cEdgeType); + Edge.uPrefixLengthA = PLA; + Edge.uPrefixLengthB = PLB; +#if TRACE + Log("Prepend %c%d.%d\n", Edge.cType, PLA, PLB); +#endif + Path.PrependEdge(Edge); + + switch (cEdgeType) + { + case 'M': + assert(PLA > 0); + assert(PLB > 0); + cEdgeType = TBM(PLA, PLB); + --PLA; + --PLB; + break; + + case 'D': + assert(PLA > 0); + cEdgeType = TBD(PLA, PLB); + --PLA; + break; + + case 'E': + assert(PLA > 0); + cEdgeType = TBE(PLA, PLB); + --PLA; + break; + + case 'I': + assert(PLB > 0); + cEdgeType = TBI(PLA, PLB); + --PLB; + break; + + case 'J': + assert(PLB > 0); + cEdgeType = TBJ(PLA, PLB); + --PLB; + break; + + default: + Quit("Invalid edge %c", cEdgeType); + } + if (0 == PLA && 0 == PLB) + break; + } + Path.Validate(); + +// SCORE Score = TraceBack(PA, uLengthA, PB, uLengthB, DPM_, DPD_, DPI_, Path); + +#if TRACE + SCORE scorePath = FastScorePath2(PA, uLengthA, PB, uLengthB, Path); + Path.LogMe(); + Log("Score = %s Path = %s\n", LocalScoreToStr(BestScore), LocalScoreToStr(scorePath)); +#endif + + if (g_bKeepSimpleDP) + { + g_DPM = DPM_; + g_DPD = DPD_; + g_DPE = DPE_; + g_DPI = DPI_; + g_DPJ = DPJ_; + + g_TBM = TBM_; + g_TBD = TBD_; + g_TBE = TBE_; + g_TBI = TBI_; + g_TBJ = TBJ_; + } + else + { + delete[] DPM_; + delete[] DPD_; + delete[] DPE_; + delete[] DPI_; + delete[] DPJ_; + + delete[] TBM_; + delete[] TBD_; + delete[] TBE_; + delete[] TBI_; + delete[] TBJ_; + } + + return BestScore; + } + +#endif // DOUBLE_AFFINE diff --git a/src/muscle/muscle3.8.31/src/nwdasimple2.cpp b/src/muscle/muscle3.8.31/src/nwdasimple2.cpp new file mode 100644 index 0000000..9551dc7 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/nwdasimple2.cpp @@ -0,0 +1,549 @@ +#include "muscle.h" 
+#include "pwpath.h"
+#include "profile.h"
+
+#if DOUBLE_AFFINE
+
+#define TRACE 0
+
+extern bool g_bKeepSimpleDP;
+extern SCORE *g_DPM;
+extern SCORE *g_DPD;
+extern SCORE *g_DPE;
+extern SCORE *g_DPI;
+extern SCORE *g_DPJ;
+extern char *g_TBM;
+extern char *g_TBD;
+extern char *g_TBE;
+extern char *g_TBI;
+extern char *g_TBJ;
+
+// Map the second-affine edge letters onto the letters stored in a path edge:
+// 'E' becomes 'D', 'J' becomes 'I', anything else is returned unchanged.
+static char XlatEdgeType(char c)
+	{
+	if ('E' == c)
+		return 'D';
+	if ('J' == c)
+		return 'I';
+	return c;
+	}
+
+// Format a score for the trace dumps. Values below -100000 are shown as a
+// star. The returned pointer refers to a static buffer that is overwritten
+// by the next call.
+static const char *LocalScoreToStr(SCORE s)
+	{
+	static char str[16];
+	if (s < -100000)
+		return " *";
+	// Bounded write: "%6.1f" sets a minimum width only, so a large score
+	// would run past the end of str with an unbounded sprintf.
+	snprintf(str, sizeof(str), "%6.1f", s);
+	return str;
+	}
+
+// Log one score matrix, one row per prefix of A, with consensus letters as
+// row and column headers. The parameter must be named DPM_ because the
+// DPM() accessor macro refers to that name.
+static void ListDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB,
+  unsigned uPrefixCountA, unsigned uPrefixCountB)
+	{
+	Log(" ");
+	for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+		{
+		char c = ' ';
+		if (uPrefixLengthB > 0)
+			c = ConsensusChar(PB[uPrefixLengthB - 1]);
+		Log(" %4u:%c", uPrefixLengthB, c);
+		}
+	Log("\n");
+	for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+		{
+		char c = ' ';
+		if (uPrefixLengthA > 0)
+			c = ConsensusChar(PA[uPrefixLengthA - 1]);
+		Log("%4u:%c ", uPrefixLengthA, c);
+		for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+			Log(" %s", LocalScoreToStr(DPM(uPrefixLengthA, uPrefixLengthB)));
+		Log("\n");
+		}
+	}
+
+// Log one trace-back matrix in the same layout as ListDP. The parameter
+// must be named TBM_ because the TBM() accessor macro refers to that name.
+static void ListTB(const char *TBM_, const ProfPos *PA, const ProfPos *PB,
+  unsigned uPrefixCountA, unsigned uPrefixCountB)
+	{
+	Log(" ");
+	for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+		{
+		char c = ' ';
+		if (uPrefixLengthB > 0)
+			c = ConsensusChar(PB[uPrefixLengthB - 1]);
+		Log(" %4u:%c", uPrefixLengthB, c);
+		}
+	Log("\n");
+	for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+		{
+		char c = ' ';
+		if (uPrefixLengthA > 0)
+			c = ConsensusChar(PA[uPrefixLengthA - 1]);
+		Log("%4u:%c ", uPrefixLengthA, c);
+		for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+			Log(" %6c", TBM(uPrefixLengthA, uPrefixLengthB));
+		Log("\n");
+		}
+	}
+
+// Like ListDP, but subtracts (i + j)*g_scoreGapExtend from each cell before
+// printing it. Not called anywhere in this file.
+static void ListDPM(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB,
+  unsigned uPrefixCountA, unsigned uPrefixCountB)
+	{
+	Log(" ");
+	for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+		{
+		char c = ' ';
+		if (uPrefixLengthB > 0)
+			c = ConsensusChar(PB[uPrefixLengthB - 1]);
+		Log(" %4u:%c", uPrefixLengthB, c);
+		}
+	Log("\n");
+	for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+		{
+		char c = ' ';
+		if (uPrefixLengthA > 0)
+			c = ConsensusChar(PA[uPrefixLengthA - 1]);
+		Log("%4u:%c ", uPrefixLengthA, c);
+		for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+			{
+			SCORE x = (uPrefixLengthA + uPrefixLengthB)*g_scoreGapExtend;
+			SCORE s = DPM(uPrefixLengthA, uPrefixLengthB) - x;
+			Log(" %s", LocalScoreToStr(s));
+			}
+		Log("\n");
+		}
+	}
+
+extern SCORE ScoreProfPos2(const ProfPos &PP, const ProfPos &PPB);
+
+// Global alignment of two profiles with five full-size DP matrices:
+// M (letter/letter), D and E (letter in A against gap in B, scored with the
+// first and second gap parameters), I and J (the same with A and B swapped).
+//
+// PA, uLengthA	profile A and its length; must be non-empty.
+// PB, uLengthB	profile B and its length; must be non-empty.
+// Path		cleared, then filled with the traced-back path. E and J
+//		edges are stored as D and I.
+// Returns the best of the five scores in the final cell.
+//
+// When g_bKeepSimpleDP is set, the five score matrices and five trace-back
+// matrices are published through the g_DP* / g_TB* globals instead of being
+// freed; nothing in this function releases them afterwards.
+SCORE NWDASimple2(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+  unsigned uLengthB, PWPath &Path)
+	{
+	assert(uLengthB > 0 && uLengthA > 0);
+
+	const unsigned uPrefixCountA = uLengthA + 1;
+	const unsigned uPrefixCountB = uLengthB + 1;
+
+// Allocate DP matrices
+	// Widen before multiplying so the product is computed in size_t.
+	const size_t LM = (size_t) uPrefixCountA*uPrefixCountB;
+	SCORE *DPM_ = new SCORE[LM];
+	SCORE *DPD_ = new SCORE[LM];
+	SCORE *DPE_ = new SCORE[LM];
+	SCORE *DPI_ = new SCORE[LM];
+	SCORE *DPJ_ = new SCORE[LM];
+	SCORE *DPL_ = new SCORE[LM];
+
+	char *TBM_ = new char[LM];
+	char *TBD_ = new char[LM];
+	char *TBE_ = new char[LM];
+	char *TBI_ = new char[LM];
+	char *TBJ_ = new char[LM];
+
+	memset(DPM_, 0, LM*sizeof(SCORE));
+	memset(DPD_, 0, LM*sizeof(SCORE));
+	memset(DPE_, 0, LM*sizeof(SCORE));
+	memset(DPI_, 0, LM*sizeof(SCORE));
+	memset(DPJ_, 0, LM*sizeof(SCORE));
+
+#if TRACE
+	// DPL_ is read only by the trace dump below. The main loop never
+	// writes row 0 or column 0, so clear the matrix before it is listed.
+	memset(DPL_, 0, LM*sizeof(SCORE));
+#endif
+
+	memset(TBM_, '?', LM);
+	memset(TBD_, '?', LM);
+	memset(TBE_, '?', LM);
+	memset(TBI_, '?', LM);
+	memset(TBJ_, '?', LM);
+
+	DPM(0, 0) = 0;
+	DPD(0, 0) = MINUS_INFINITY;
+	DPE(0, 0) = MINUS_INFINITY;
+	DPI(0, 0) = MINUS_INFINITY;
+	DPJ(0, 0) = MINUS_INFINITY;
+
+	DPM(1, 0) = MINUS_INFINITY;
+	DPD(1, 0) = PA[0].m_scoreGapOpen;
+	DPE(1, 0) = PA[0].m_scoreGapOpen2;
+	DPI(1, 0) = MINUS_INFINITY;
+	DPJ(1, 0) = MINUS_INFINITY;
+
+	DPM(0, 1) = MINUS_INFINITY;
+	DPD(0, 1) = MINUS_INFINITY;
+	DPE(0, 1) = MINUS_INFINITY;
+	DPI(0, 1) = PB[0].m_scoreGapOpen;
+	DPJ(0, 1) = PB[0].m_scoreGapOpen2;
+
+// Empty prefix of B is special case
+	for (unsigned uPrefixLengthA = 2; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+		{
+	// M=LetterA+LetterB, impossible with empty prefix
+		DPM(uPrefixLengthA, 0) = MINUS_INFINITY;
+
+	// D=LetterA+GapB
+		DPD(uPrefixLengthA, 0) = DPD(uPrefixLengthA - 1, 0) + g_scoreGapExtend;
+		TBD(uPrefixLengthA, 0) = 'D';
+
+		DPE(uPrefixLengthA, 0) = DPE(uPrefixLengthA - 1, 0) + g_scoreGapExtend2;
+		TBE(uPrefixLengthA, 0) = 'E';
+
+	// I=GapA+LetterB, impossible with empty prefix
+		DPI(uPrefixLengthA, 0) = MINUS_INFINITY;
+		DPJ(uPrefixLengthA, 0) = MINUS_INFINITY;
+		}
+
+// Empty prefix of A is special case
+	for (unsigned uPrefixLengthB = 2; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+		{
+	// M=LetterA+LetterB, impossible with empty prefix
+		DPM(0, uPrefixLengthB) = MINUS_INFINITY;
+
+	// D=LetterA+GapB, impossible with empty prefix
+		DPD(0, uPrefixLengthB) = MINUS_INFINITY;
+		DPE(0, uPrefixLengthB) = MINUS_INFINITY;
+
+	// I=GapA+LetterB
+		DPI(0, uPrefixLengthB) = DPI(0, uPrefixLengthB - 1) + g_scoreGapExtend;
+		TBI(0, uPrefixLengthB) = 'I';
+
+		DPJ(0, uPrefixLengthB) = DPJ(0, uPrefixLengthB - 1) + g_scoreGapExtend2;
+		TBJ(0, uPrefixLengthB) = 'J';
+		}
+
+// ============
+// Main DP loop
+// ============
+	for (unsigned uPrefixLengthB = 1; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+		{
+		const ProfPos &PPB = PB[uPrefixLengthB - 1];
+		SCORE scoreGapCloseB;
+		if (uPrefixLengthB == 1)
+			scoreGapCloseB = MINUS_INFINITY;
+		else
+			scoreGapCloseB = PB[uPrefixLengthB-2].m_scoreGapClose;
+
+		SCORE scoreGapClose2B;
+		if (uPrefixLengthB == 1)
+			scoreGapClose2B = MINUS_INFINITY;
+		else
+			scoreGapClose2B = PB[uPrefixLengthB-2].m_scoreGapClose2;
+
+		for (unsigned uPrefixLengthA = 1; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+			{
+			const ProfPos &PPA = PA[uPrefixLengthA - 1];
+
+			{
+		// Match M=LetterA+LetterB
+			SCORE scoreLL = ScoreProfPos2(PPA, PPB);
+			DPL(uPrefixLengthA, uPrefixLengthB) = scoreLL;
+
+			SCORE scoreGapCloseA;
+			if (uPrefixLengthA == 1)
+				scoreGapCloseA = MINUS_INFINITY;
+			else
+				scoreGapCloseA = PA[uPrefixLengthA-2].m_scoreGapClose;
+
+			SCORE scoreGapClose2A;
+			if (uPrefixLengthA == 1)
+				scoreGapClose2A = MINUS_INFINITY;
+			else
+				scoreGapClose2A = PA[uPrefixLengthA-2].m_scoreGapClose2;
+
+			SCORE scoreMM = DPM(uPrefixLengthA-1, uPrefixLengthB-1);
+			SCORE scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseA;
+			SCORE scoreEM = DPE(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapClose2A;
+			SCORE scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseB;
+			SCORE scoreJM = DPJ(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapClose2B;
+			SCORE scoreBest;
+			if (scoreMM >= scoreDM && scoreMM >= scoreIM && scoreMM >= scoreEM && scoreMM >= scoreJM)
+				{
+				scoreBest = scoreMM;
+				TBM(uPrefixLengthA, uPrefixLengthB) = 'M';
+				}
+			else if (scoreDM >= scoreMM && scoreDM >= scoreIM && scoreDM >= scoreEM && scoreDM >= scoreJM)
+				{
+				scoreBest = scoreDM;
+				TBM(uPrefixLengthA, uPrefixLengthB) = 'D';
+				}
+			else if (scoreEM >= scoreMM && scoreEM >= scoreIM && scoreEM >= scoreDM && scoreEM >= scoreJM)
+				{
+				scoreBest = scoreEM;
+				TBM(uPrefixLengthA, uPrefixLengthB) = 'E';
+				}
+			else if (scoreIM >= scoreMM && scoreIM >= scoreDM && scoreIM >= scoreEM && scoreIM >= scoreJM)
+				{
+				scoreBest = scoreIM;
+				TBM(uPrefixLengthA, uPrefixLengthB) = 'I';
+				}
+			else if (scoreJM >= scoreMM && scoreJM >= scoreDM && scoreJM >= scoreEM && scoreJM >= scoreIM)
+				{
+				scoreBest = scoreJM;
+				TBM(uPrefixLengthA, uPrefixLengthB) = 'J';
+				}
+			else
+				Quit("Max failed (M)");
+
+			DPM(uPrefixLengthA, uPrefixLengthB) = scoreBest + scoreLL;
+			}
+
+			{
+		// Delete D=LetterA+GapB
+			SCORE scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) +
+			  PA[uPrefixLengthA-1].m_scoreGapOpen;
+			SCORE scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB) +
+			  g_scoreGapExtend;
+
+			SCORE scoreBest;
+			if (scoreMD >= scoreDD)
+				{
+				scoreBest = scoreMD;
+				TBD(uPrefixLengthA, uPrefixLengthB) = 'M';
+				}
+			else
+				{
+				assert(scoreDD >= scoreMD);
+				scoreBest = scoreDD;
+				TBD(uPrefixLengthA, uPrefixLengthB) = 'D';
+				}
+			DPD(uPrefixLengthA, uPrefixLengthB) = scoreBest;
+			}
+
+			{
+		// Delete E=LetterA+GapB
+			SCORE scoreME = DPM(uPrefixLengthA-1, uPrefixLengthB) +
+			  PA[uPrefixLengthA-1].m_scoreGapOpen2;
+			SCORE scoreEE = DPE(uPrefixLengthA-1, uPrefixLengthB) +
+			  g_scoreGapExtend2;
+
+			SCORE scoreBest;
+			if (scoreME >= scoreEE)
+				{
+				scoreBest = scoreME;
+				TBE(uPrefixLengthA, uPrefixLengthB) = 'M';
+				}
+			else
+				{
+				assert(scoreEE >= scoreME);
+				scoreBest = scoreEE;
+				TBE(uPrefixLengthA, uPrefixLengthB) = 'E';
+				}
+			DPE(uPrefixLengthA, uPrefixLengthB) = scoreBest;
+			}
+
+		// Insert I=GapA+LetterB
+			{
+			SCORE scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) +
+			  PB[uPrefixLengthB-1].m_scoreGapOpen;
+			SCORE scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1) +
+			  g_scoreGapExtend;
+
+			SCORE scoreBest;
+			if (scoreMI >= scoreII)
+				{
+				scoreBest = scoreMI;
+				TBI(uPrefixLengthA, uPrefixLengthB) = 'M';
+				}
+			else
+				{
+				assert(scoreII > scoreMI);
+				scoreBest = scoreII;
+				TBI(uPrefixLengthA, uPrefixLengthB) = 'I';
+				}
+			DPI(uPrefixLengthA, uPrefixLengthB) = scoreBest;
+			}
+
+		// Insert J=GapA+LetterB
+			{
+			SCORE scoreMJ = DPM(uPrefixLengthA, uPrefixLengthB-1) +
+			  PB[uPrefixLengthB-1].m_scoreGapOpen2;
+			SCORE scoreJJ = DPJ(uPrefixLengthA, uPrefixLengthB-1) +
+			  g_scoreGapExtend2;
+
+			// NOTE(review): ties go to 'J' here, whereas the D, E and I
+			// recursions above give ties to 'M' - confirm which is intended.
+			SCORE scoreBest;
+			if (scoreMJ > scoreJJ)
+				{
+				scoreBest = scoreMJ;
+				TBJ(uPrefixLengthA, uPrefixLengthB) = 'M';
+				}
+			else
+				{
+				assert(scoreJJ >= scoreMJ);
+				scoreBest = scoreJJ;
+				TBJ(uPrefixLengthA, uPrefixLengthB) = 'J';
+				}
+			DPJ(uPrefixLengthA, uPrefixLengthB) = scoreBest;
+			}
+			}
+		}
+
+// Special case: close gaps at end of alignment
+	DPD(uLengthA, uLengthB) += PA[uLengthA-1].m_scoreGapClose;
+	DPE(uLengthA, uLengthB) += PA[uLengthA-1].m_scoreGapClose2;
+
+	DPI(uLengthA, uLengthB) += PB[uLengthB-1].m_scoreGapClose;
+	DPJ(uLengthA, uLengthB) += PB[uLengthB-1].m_scoreGapClose2;
+
+#if TRACE
+	Log("DPL:\n");
+	ListDP(DPL_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("DPM:\n");
+	ListDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("DPD:\n");
+	ListDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("DPE:\n");
+	ListDP(DPE_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("DPI:\n");
+	ListDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("DPJ:\n");
+	ListDP(DPJ_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("TBM:\n");
+	ListTB(TBM_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("TBD:\n");
+	ListTB(TBD_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("TBE:\n");
+	ListTB(TBE_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("TBI:\n");
+	ListTB(TBI_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("TBJ:\n");
+	ListTB(TBJ_, PA, PB, uPrefixCountA, uPrefixCountB);
+#endif
+
+// ==========
+// Trace-back
+// ==========
+
+	Path.Clear();
+
+// Find last edge
+	char cEdgeType = '?';
+	SCORE BestScore = MINUS_INFINITY;
+	SCORE M = DPM(uLengthA, uLengthB);
+	SCORE D = DPD(uLengthA, uLengthB);
+	SCORE E = DPE(uLengthA, uLengthB);
+	SCORE I = DPI(uLengthA, uLengthB);
+	SCORE J = DPJ(uLengthA, uLengthB);
+
+	if (M >= D && M >= E && M >= I && M >= J)
+		{
+		cEdgeType = 'M';
+		BestScore = M;
+		}
+	else if (D >= M && D >= E && D >= I && D >= J)
+		{
+		cEdgeType = 'D';
+		BestScore = D;
+		}
+	else if (E >= M && E >= D && E >= I && E >= J)
+		{
+		cEdgeType = 'E';
+		BestScore = E;
+		}
+	else if (I >= M && I >= D && I >= E && I >= J)
+		{
+		cEdgeType = 'I';
+		BestScore = I;
+		}
+	else if (J >= M && J >= D && J >= E && J >= I)
+		{
+		cEdgeType = 'J';
+		BestScore = J;
+		}
+	else
+		Quit("Bad max");
+
+	unsigned PLA = uLengthA;
+	unsigned PLB = uLengthB;
+	unsigned ECount = 0;
+	unsigned JCount = 0;
+	for (;;)
+		{
+#if TRACE
+		Log("TraceBack: %c%u.%u\n", cEdgeType, PLA, PLB);
+#endif
+		PWEdge Edge;
+		Edge.cType = XlatEdgeType(cEdgeType);
+		Edge.uPrefixLengthA = PLA;
+		Edge.uPrefixLengthB = PLB;
+		Path.PrependEdge(Edge);
+
+		// Read the predecessor state from the matrix of the current
+		// state, then step back by the letters that state consumed.
+		switch (cEdgeType)
+			{
+		case 'M':
+			assert(PLA > 0);
+			assert(PLB > 0);
+			cEdgeType = TBM(PLA, PLB);
+			--PLA;
+			--PLB;
+			break;
+
+		case 'D':
+			assert(PLA > 0);
+			cEdgeType = TBD(PLA, PLB);
+			--PLA;
+			break;
+
+		case 'E':
+			++ECount;
+			assert(PLA > 0);
+			cEdgeType = TBE(PLA, PLB);
+			--PLA;
+			break;
+
+		case 'I':
+			assert(PLB > 0);
+			cEdgeType = TBI(PLA, PLB);
+			--PLB;
+			break;
+
+		case 'J':
+			++JCount;
+			assert(PLB > 0);
+			cEdgeType = TBJ(PLA, PLB);
+			--PLB;
+			break;
+
+		default:
+			Quit("Invalid edge %c", cEdgeType);
+			}
+		if (0 == PLA && 0 == PLB)
+			break;
+		}
+	//if (ECount > 0 || JCount > 0)
+	//	fprintf(stderr, "E=%d J=%d\n", ECount, JCount);
+	Path.Validate();
+	if (Path.GetMatchCount() + Path.GetDeleteCount() != uLengthA)
+		Quit("Path count A");
+	if (Path.GetMatchCount() + Path.GetInsertCount() != uLengthB)
+		Quit("Path count B");
+
+	// DPL_ is never handed to a global, so it is released on both paths.
+	delete[] DPL_;
+
+	if (g_bKeepSimpleDP)
+		{
+		g_DPM = DPM_;
+		g_DPD = DPD_;
+		g_DPE = DPE_;
+		g_DPI = DPI_;
+		g_DPJ = DPJ_;
+
+		g_TBM = TBM_;
+		g_TBD = TBD_;
+		g_TBE = TBE_;
+		g_TBI = TBI_;
+		g_TBJ = TBJ_;
+		}
+	else
+		{
+		delete[] DPM_;
+		delete[] DPD_;
+		delete[] DPE_;
+		delete[] DPI_;
+		delete[] DPJ_;
+
+		delete[] TBM_;
+		delete[] TBD_;
+		delete[] TBE_;
+		delete[] TBI_;
+		delete[] TBJ_;
+		}
+
+#if TRACE
+	Log("BestScore=%.6g\n", BestScore);
+#endif
+	return BestScore;
+	}
+
+#endif // DOUBLE_AFFINE
diff --git a/src/muscle/muscle3.8.31/src/nwdasmall.cpp b/src/muscle/muscle3.8.31/src/nwdasmall.cpp
new file mode 100644
index 0000000..5f12706
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/nwdasmall.cpp
@@ -0,0 +1,947 @@
+#include "muscle.h"
+#include <math.h>
+#include "pwpath.h"
+#include "profile.h"
+#include <stdio.h>
+
+#if DOUBLE_AFFINE
+
+// NW double affine small memory, term gaps fully penalized
+// (so up to caller to adjust in profile if desired).
+
+#define TRACE 0
+
+#define MIN(x, y) ((x) < (y) ? (x) : (y))
+
+#if TRACE
+extern bool g_bKeepSimpleDP;
+extern SCORE *g_DPM;
+extern SCORE *g_DPD;
+extern SCORE *g_DPE;
+extern SCORE *g_DPI;
+extern SCORE *g_DPJ;
+extern char *g_TBM;
+extern char *g_TBD;
+extern char *g_TBE;
+extern char *g_TBI;
+extern char *g_TBJ;
+#endif
+
+// Under TRACE, allocate full-size score and trace-back matrices so the
+// result can be compared against the matrices published by the simple
+// implementation. Nothing in this file frees them.
+#if TRACE
+#define ALLOC_TRACE() \
+	const SCORE UNINIT = MINUS_INFINITY; \
+	const size_t LM = uPrefixCountA*uPrefixCountB; \
+	\
+	SCORE *DPM_ = new SCORE[LM]; \
+	SCORE *DPD_ = new SCORE[LM]; \
+	SCORE *DPE_ = new SCORE[LM]; \
+	SCORE *DPI_ = new SCORE[LM]; \
+	SCORE *DPJ_ = new SCORE[LM]; \
+	\
+	char *TBM_ = new char[LM]; \
+	char *TBD_ = new char[LM]; \
+	char *TBE_ = new char[LM]; \
+	char *TBI_ = new char[LM]; \
+	char *TBJ_ = new char[LM]; \
+	\
+	memset(TBM_, '?', LM); \
+	memset(TBD_, '?', LM); \
+	memset(TBE_, '?', LM); \
+	memset(TBI_, '?', LM); \
+	memset(TBJ_, '?', LM); \
+	\
+	for (unsigned i = 0; i <= uLengthA; ++i) \
+		for (unsigned j = 0; j <= uLengthB; ++j) \
+			{ \
+			DPM(i, j) = UNINIT; \
+			DPD(i, j) = UNINIT; \
+			DPE(i, j) = UNINIT; \
+			DPI(i, j) = UNINIT; \
+			DPJ(i, j) = UNINIT; \
+			}
+#else
+#define ALLOC_TRACE()
+#endif
+
+// Writers for the TRACE-only full matrices; they expand to nothing in a
+// normal build.
+#if TRACE
+#define SetDPM(i, j, x) DPM(i, j) = x
+#define SetDPD(i, j, x) DPD(i, j) = x
+#define SetDPE(i, j, x) DPE(i, j) = x
+#define SetDPI(i, j, x) DPI(i, j) = x
+#define SetDPJ(i, j, x) DPJ(i, j) = x
+#define SetTBM(i, j, x) TBM(i, j) = x
+#define SetTBD(i, j, x) TBD(i, j) = x
+#define SetTBE(i, j, x) TBE(i, j) = x
+#define SetTBI(i, j, x) TBI(i, j) = x
+#define SetTBJ(i, j, x) TBJ(i, j) = x
+#else
+#define SetDPM(i, j, x) /* empty */
+#define SetDPD(i, j, x) /* empty */
+#define SetDPE(i, j, x) /* empty */
+#define SetDPI(i, j, x) /* empty */
+#define SetDPJ(i, j, x) /* empty */
+#define SetTBM(i, j, x) /* empty */
+#define SetTBD(i, j, x) /* empty */
+#define SetTBE(i, j, x) /* empty */
+#define SetTBI(i, j, x) /* empty */
+#define SetTBJ(i, j, x) /* empty */
+#endif
+
+// D(i,j): extend the delete held in DRow[j], or open one from MPrev[j].
+// Only the "opened from M" choice sets a trace-back bit.
+#define RECURSE_D(i, j) \
+	{ \
+	SCORE DD = DRow[j] + e; \
+	SCORE MD = MPrev[j] + PA[i-1].m_scoreGapOpen;\
+	if (DD > MD) \
+		{ \
+		DRow[j] = DD; \
+		SetTBD(i, j, 'D'); \
+		} \
+	else \
+		{ \
+		DRow[j] = MD; \
+		SetBitTBD(TB, i, j, 'M'); \
+		SetTBD(i, j, 'M'); \
+		} \
+	SetDPD(i, j, DRow[j]); \
+	}
+
+// E(i,j): same as RECURSE_D with the second gap parameters.
+#define RECURSE_E(i, j) \
+	{ \
+	SCORE EE = ERow[j] + e2; \
+	SCORE ME = MPrev[j] + PA[i-1].m_scoreGapOpen2;\
+	if (EE > ME) \
+		{ \
+		ERow[j] = EE; \
+		SetTBE(i, j, 'E'); \
+		} \
+	else \
+		{ \
+		ERow[j] = ME; \
+		SetBitTBE(TB, i, j, 'M'); \
+		SetTBE(i, j, 'M'); \
+		} \
+	SetDPE(i, j, ERow[j]); \
+	}
+
+#define RECURSE_D_ATerm(j) RECURSE_D(uLengthA, j)
+#define RECURSE_E_ATerm(j) RECURSE_E(uLengthA, j)
+
+// The B-terminal forms take the row index; the column is fixed at uLengthB.
+#define RECURSE_D_BTerm(i) RECURSE_D(i, uLengthB)
+#define RECURSE_E_BTerm(i) RECURSE_E(i, uLengthB)
+
+// I(i,j): extend the running insert score Iij, or open one from MCurr[j-1].
+#define RECURSE_I(i, j) \
+	{ \
+	Iij += e; \
+	SCORE MI = MCurr[j-1] + PB[j-1].m_scoreGapOpen;\
+	if (MI >= Iij) \
+		{ \
+		Iij = MI; \
+		SetBitTBI(TB, i, j, 'M'); \
+		SetTBI(i, j, 'M'); \
+		} \
+	else \
+		SetTBI(i, j, 'I'); \
+	SetDPI(i, j, Iij); \
+	}
+
+// J(i,j): same as RECURSE_I with the second gap parameters. An extended
+// J edge is recorded as 'J' in the trace matrix.
+#define RECURSE_J(i, j) \
+	{ \
+	Jij += e2; \
+	SCORE MJ = MCurr[j-1] + PB[j-1].m_scoreGapOpen2;\
+	if (MJ >= Jij) \
+		{ \
+		Jij = MJ; \
+		SetBitTBJ(TB, i, j, 'M'); \
+		SetTBJ(i, j, 'M'); \
+		} \
+	else \
+		SetTBJ(i, j, 'J'); \
+	SetDPJ(i, j, Jij); \
+	}
+
+#define RECURSE_I_ATerm(j) RECURSE_I(uLengthA, j)
+#define RECURSE_J_ATerm(j) RECURSE_J(uLengthA, j)
+
+#define RECURSE_I_BTerm(i) RECURSE_I(i, uLengthB)
+#define RECURSE_J_BTerm(i) RECURSE_J(i, uLengthB)
+
+// M(i+1,j+1): MNext[j+1] already holds the letter/letter score; add the
+// best of the five predecessor states at (i,j) and record which one won.
+#define RECURSE_M(i, j) \
+	{ \
+	SCORE Best = MCurr[j]; /* MM */ \
+	SetTBM(i+1, j+1, 'M'); \
+	SetBitTBM(TB, i+1, j+1, 'M'); \
+	\
+	SCORE DM = DRow[j] + PA[i-1].m_scoreGapClose; \
+	if (DM > Best) \
+		{ \
+		Best = DM; \
+		SetTBM(i+1, j+1, 'D'); \
+		SetBitTBM(TB, i+1, j+1, 'D'); \
+		} \
+	\
+	SCORE EM = ERow[j] + PA[i-1].m_scoreGapClose2; \
+	if (EM > Best) \
+		{ \
+		Best = EM; \
+		SetTBM(i+1, j+1, 'E'); \
+		SetBitTBM(TB, i+1, j+1, 'E'); \
+		} \
+	\
+	SCORE IM = Iij + PB[j-1].m_scoreGapClose; \
+	if (IM > Best) \
+		{ \
+		Best = IM; \
+		SetTBM(i+1, j+1, 'I'); \
+		SetBitTBM(TB, i+1, j+1, 'I'); \
+		} \
+	\
+	SCORE JM = Jij + PB[j-1].m_scoreGapClose2; \
+	if (JM > Best) \
+		{ \
+		Best = JM; \
+		SetTBM(i+1, j+1, 'J'); \
+		SetBitTBM(TB, i+1, j+1, 'J'); \
+		} \
+	MNext[j+1] += Best; \
+	SetDPM(i+1, j+1, MNext[j+1]); \
+	}
+
+#if TRACE
+// Approximate score equality for the trace comparison: both values below
+// -100000, absolute difference under 0.0001, or relative difference under
+// 0.005.
+static bool LocalEq(BASETYPE b1, BASETYPE b2)
+	{
+	if (b1 < -100000 && b2 < -100000)
+		return true;
+	double diff = fabs(b1 - b2);
+	if (diff < 0.0001)
+		return true;
+	double sum = fabs(b1) + fabs(b2);
+	return diff/sum < 0.005;
+	}
+
+// Decode the predecessor of an M cell from its packed trace-back bits.
+static char Get_M_Char(char Bits)
+	{
+	switch (Bits & BIT_xM)
+		{
+	case BIT_MM:
+		return 'M';
+	case BIT_DM:
+		return 'D';
+	case BIT_EM:
+		return 'E';
+	case BIT_IM:
+		return 'I';
+	case BIT_JM:
+		return 'J';
+		}
+	Quit("Huh?");
+	return '?';
+	}
+
+// For the four gap states, a set bit means "opened from M"; a clear bit
+// means the gap was extended.
+static char Get_D_Char(char Bits)
+	{
+	return (Bits & BIT_xD) ? 'M' : 'D';
+	}
+
+static char Get_E_Char(char Bits)
+	{
+	return (Bits & BIT_xE) ? 'M' : 'E';
+	}
+
+static char Get_I_Char(char Bits)
+	{
+	return (Bits & BIT_xI) ? 'M' : 'I';
+	}
+
+static char Get_J_Char(char Bits)
+	{
+	return (Bits & BIT_xJ) ? 'M' : 'J';
+	}
+
+// Compare a published matrix (g_DP) with the one built here (DPD_) cell by
+// cell, logging the first difference. A null g_DP is logged and treated as
+// equal. The odd parameter names are required by the DPM()/DPD() macros.
+static bool DPEq(char c, SCORE *g_DP, SCORE *DPD_,
+  unsigned uPrefixCountA, unsigned uPrefixCountB)
+	{
+	if (0 == g_DP)
+		{
+		Log("***DPDIFF*** DP%c=NULL\n", c);
+		return true;
+		}
+
+	SCORE *DPM_ = g_DP;
+	for (unsigned i = 0; i < uPrefixCountA; ++i)
+		for (unsigned j = 0; j < uPrefixCountB; ++j)
+			if (!LocalEq(DPM(i, j), DPD(i, j)))
+				{
+				Log("***DPDIFF*** DP%c(%d, %d) Simple = %.2g, Small = %.2g\n",
+				  c, i, j, DPM(i, j), DPD(i, j));
+				return false;
+				}
+	return true;
+	}
+
+// Compare the packed trace-back bits against the published character
+// matrices, one state at a time, stopping at the first mismatch per state.
+// Cells marked '?' or whose score is at or below -100000 are ignored.
+static bool CompareTB(char **TB, char *TBM_, char *TBD_, char *TBE_, char *TBI_, char *TBJ_,
+  unsigned uPrefixCountA, unsigned uPrefixCountB)
+	{
+	if (!g_bKeepSimpleDP)
+		return true;
+	SCORE *DPM_ = g_DPM;
+	bool Eq = true;
+	for (unsigned i = 0; i < uPrefixCountA; ++i)
+		for (unsigned j = 0; j < uPrefixCountB; ++j)
+			{
+			char c1 = TBM(i, j);
+			char c2 = Get_M_Char(TB[i][j]);
+			if (c1 != '?' && c1 != c2 && DPM(i, j) > -100000)
+				{
+				Log("TBM(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2);
+				Eq = false;
+				goto D;
+				}
+			}
+
+D:
+	SCORE *DPD_ = g_DPD;
+	for (unsigned i = 0; i < uPrefixCountA; ++i)
+		for (unsigned j = 0; j < uPrefixCountB; ++j)
+			{
+			char c1 = TBD(i, j);
+			char c2 = Get_D_Char(TB[i][j]);
+			if (c1 != '?' && c1 != c2 && DPD(i, j) > -100000)
+				{
+				Log("TBD(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2);
+				Eq = false;
+				goto E;
+				}
+			}
+E:
+	SCORE *DPE_ = g_DPE;
+	if (0 == TBE_)
+		goto I;
+	for (unsigned i = 0; i < uPrefixCountA; ++i)
+		for (unsigned j = 0; j < uPrefixCountB; ++j)
+			{
+			char c1 = TBE(i, j);
+			char c2 = Get_E_Char(TB[i][j]);
+			if (c1 != '?' && c1 != c2 && DPE(i, j) > -100000)
+				{
+				Log("TBE(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2);
+				Eq = false;
+				goto I;
+				}
+			}
+I:
+	SCORE *DPI_ = g_DPI;
+	for (unsigned i = 0; i < uPrefixCountA; ++i)
+		for (unsigned j = 0; j < uPrefixCountB; ++j)
+			{
+			char c1 = TBI(i, j);
+			char c2 = Get_I_Char(TB[i][j]);
+			if (c1 != '?' && c1 != c2 && DPI(i, j) > -100000)
+				{
+				Log("TBI(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2);
+				Eq = false;
+				goto J;
+				}
+			}
+J:
+	SCORE *DPJ_ = g_DPJ;
+	if (0 == DPJ_)
+		goto Done;
+	for (unsigned i = 0; i < uPrefixCountA; ++i)
+		for (unsigned j = 0; j < uPrefixCountB; ++j)
+			{
+			char c1 = TBJ(i, j);
+			char c2 = Get_J_Char(TB[i][j]);
+			if (c1 != '?' && c1 != c2 && DPJ(i, j) > -100000)
+				{
+				Log("TBJ(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2);
+				Eq = false;
+				goto Done;
+				}
+			}
+Done:
+	if (Eq)
+		Log("TB success\n");
+	return Eq;
+	}
+
+// Format a score for the trace dumps. Values below -100000 are shown as a
+// star. The returned pointer refers to a static buffer that is overwritten
+// by the next call.
+static const char *LocalScoreToStr(SCORE s)
+	{
+	static char str[16];
+	if (s < -100000)
+		return " *";
+	// Bounded write: "%6.1f" sets a minimum width only, so a large score
+	// would run past the end of str with an unbounded sprintf.
+	snprintf(str, sizeof(str), "%6.1f", s);
+	return str;
+	}
+
+// Log one score matrix, one row per prefix of A, with consensus letters as
+// row and column headers.
+static void LogDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB,
+  unsigned uPrefixCountA, unsigned uPrefixCountB)
+	{
+	Log(" ");
+	for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+		{
+		char c = ' ';
+		if (uPrefixLengthB > 0)
+			c = ConsensusChar(PB[uPrefixLengthB - 1]);
+		Log(" %4u:%c", uPrefixLengthB, c);
+		}
+	Log("\n");
+	for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+		{
+		char c = ' ';
+		if (uPrefixLengthA > 0)
+			c = ConsensusChar(PA[uPrefixLengthA - 1]);
+		Log("%4u:%c ", uPrefixLengthA, c);
+		for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+			Log(" %s", LocalScoreToStr(DPM(uPrefixLengthA, uPrefixLengthB)));
+		Log("\n");
+		}
+	}
+
+// Log the packed trace-back matrix five times, decoded once per state.
+static void LogBitTB(char **TB, const ProfPos *PA, const ProfPos *PB,
+  unsigned uPrefixCountA, unsigned uPrefixCountB)
+	{
+	Log(" ");
+	for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+		{
+		char c = ' ';
+		if (uPrefixLengthB > 0)
+			c = ConsensusChar(PB[uPrefixLengthB - 1]);
+		Log(" %4u:%c", uPrefixLengthB, c);
+		}
+	Log("\n");
+	Log("Bit TBM:\n");
+	for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+		{
+		char c = ' ';
+		if (uPrefixLengthA > 0)
+			c = ConsensusChar(PA[uPrefixLengthA - 1]);
+		Log("%4u:%c ", uPrefixLengthA, c);
+		for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+			{
+			char c = Get_M_Char(TB[uPrefixLengthA][uPrefixLengthB]);
+			Log(" %6c", c);
+			}
+		Log("\n");
+		}
+
+	Log("\n");
+	Log("Bit TBD:\n");
+	for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+		{
+		char c = ' ';
+		if (uPrefixLengthA > 0)
+			c = ConsensusChar(PA[uPrefixLengthA - 1]);
+		Log("%4u:%c ", uPrefixLengthA, c);
+		for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+			{
+			char c = Get_D_Char(TB[uPrefixLengthA][uPrefixLengthB]);
+			Log(" %6c", c);
+			}
+		Log("\n");
+		}
+
+	Log("\n");
+	Log("Bit TBE:\n");
+	for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+		{
+		char c = ' ';
+		if (uPrefixLengthA > 0)
+			c = ConsensusChar(PA[uPrefixLengthA - 1]);
+		Log("%4u:%c ", uPrefixLengthA, c);
+		for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+			{
+			char c = Get_E_Char(TB[uPrefixLengthA][uPrefixLengthB]);
+			Log(" %6c", c);
+			}
+		Log("\n");
+		}
+
+	Log("\n");
+	Log("Bit TBI:\n");
+	for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+		{
+		char c = ' ';
+		if (uPrefixLengthA > 0)
+			c = ConsensusChar(PA[uPrefixLengthA - 1]);
+		Log("%4u:%c ", uPrefixLengthA, c);
+		for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+			{
+			char c = Get_I_Char(TB[uPrefixLengthA][uPrefixLengthB]);
+			Log(" %6c", c);
+			}
+		Log("\n");
+		}
+
+	Log("\n");
+	Log("Bit TBJ:\n");
+	for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+		{
+		char c = ' ';
+		if (uPrefixLengthA > 0)
+			c = ConsensusChar(PA[uPrefixLengthA - 1]);
+		Log("%4u:%c ", uPrefixLengthA, c);
+		for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+			{
+			char c = Get_J_Char(TB[uPrefixLengthA][uPrefixLengthB]);
+			Log(" %6c", c);
+			}
+		Log("\n");
+		}
+	}
+
+// Log one character trace-back matrix in the same layout as LogDP.
+static void ListTB(char *TBM_, const ProfPos *PA, const ProfPos *PB,
+  unsigned uPrefixCountA, unsigned uPrefixCountB)
+	{
+	Log(" ");
+	for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+		{
+		char c = ' ';
+		if (uPrefixLengthB > 0)
+			c = ConsensusChar(PB[uPrefixLengthB - 1]);
+		Log(" %4u:%c", uPrefixLengthB, c);
+		}
+	Log("\n");
+	for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+		{
+		char c = ' ';
+		if (uPrefixLengthA > 0)
+			c = ConsensusChar(PA[uPrefixLengthA - 1]);
+		Log("%4u:%c ", uPrefixLengthA, c);
+		for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+			{
+			char c = TBM(uPrefixLengthA, uPrefixLengthB);
+			Log(" %6c", c);
+			}
+		Log("\n");
+		}
+	}
+
+// Render the five decoded predecessors of one packed trace-back byte.
+// The returned pointer refers to a static buffer.
+static const char *BitsToStr(char Bits)
+	{
+	static char Str[32];
+
+	sprintf(Str, "%cM %cD %cE %cI %cJ",
+	  Get_M_Char(Bits),
+	  Get_D_Char(Bits),
+	  Get_E_Char(Bits),
+	  Get_I_Char(Bits),
+	  Get_J_Char(Bits));
+	return Str;
+	}
+#endif // TRACE
+
+// Record the predecessor state c of M(i,j) in the packed trace-back byte,
+// replacing whatever M bits were there. Quits on an unknown state letter.
+static inline void SetBitTBM(char **TB, unsigned i, unsigned j, char c)
+	{
+	char Bit;
+	switch (c)
+		{
+	case 'M':
+		Bit = BIT_MM;
+		break;
+	case 'D':
+		Bit = BIT_DM;
+		break;
+#if DOUBLE_AFFINE
+	case 'E':
+		Bit = BIT_EM;
+		break;
+	case 'I':
+		Bit = BIT_IM;
+		break;
+	case 'J':
+		Bit = BIT_JM;
+		break;
+#endif
+	default:
+		Quit("Huh?!");
+		}
+	TB[i][j] &= ~BIT_xM;
+	TB[i][j] |= Bit;
+	}
+
+// Record whether D(i,j) was opened from M ('M') or extended ('D').
+static inline void SetBitTBD(char **TB, unsigned i, unsigned j, char c)
+	{
+	char Bit;
+	switch (c)
+		{
+	case 'M':
+		Bit = BIT_MD;
+		break;
+	case 'D':
+		Bit = BIT_DD;
+		break;
+	default:
+		Quit("Huh?!");
+		}
+	TB[i][j] &= ~BIT_xD;
+	TB[i][j] |= Bit;
+	}
+
+// Record whether I(i,j) was opened from M ('M') or extended ('I').
+static inline void SetBitTBI(char **TB, unsigned i, unsigned j, char c)
+	{
+	char Bit;
+	switch (c)
+		{
+	case 'M':
+		Bit = BIT_MI;
+		break;
+	case 'I':
+		Bit = BIT_II;
+		break;
+	default:
+		Quit("Huh?!");
+		}
+	TB[i][j] &= ~BIT_xI;
+	TB[i][j] |= Bit;
+	}
+
+#if DOUBLE_AFFINE
+// Record whether E(i,j) was opened from M ('M') or extended ('E').
+static inline void SetBitTBE(char **TB, unsigned i, unsigned j, char c)
+	{
+	char Bit;
+	switch (c)
+		{
+	case 'M':
+		Bit = BIT_ME;
+		break;
+	case 'E':
+		Bit = BIT_EE;
+		break;
+	default:
+		Quit("Huh?!");
+		}
+	TB[i][j] &= ~BIT_xE;
+	TB[i][j] |= Bit;
+	}
+
+// Record whether J(i,j) was opened from M ('M') or extended ('J').
+static inline void SetBitTBJ(char **TB, unsigned i, unsigned j, char c)
+	{
+	char Bit;
+	switch (c)
+		{
+	case 'M':
+		Bit = BIT_MJ;
+		break;
+	case 'J':
+		Bit = BIT_JJ;
+		break;
+	default:
+		Quit("Huh?!");
+		}
+	TB[i][j] &= ~BIT_xJ;
+	TB[i][j] |= Bit;
+	}
+#endif
+
+// Under TRACE, dump every matrix and compare it with the published ones.
+#if TRACE
+#define LogMatrices() \
+	{ \
+	Log("Bit DPM:\n"); \
+	LogDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB); \
+	Log("Bit DPD:\n"); \
+	LogDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB); \
+	Log("Bit DPE:\n"); \
+	LogDP(DPE_, PA, PB, uPrefixCountA, uPrefixCountB); \
+	Log("Bit DPI:\n"); \
+	LogDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB); \
+	Log("Bit DPJ:\n"); \
+	LogDP(DPJ_, PA, PB, uPrefixCountA, uPrefixCountB); \
+	Log("Bit TB:\n"); \
+	LogBitTB(TB, PA, PB, uPrefixCountA, uPrefixCountB); \
+	bool Same; \
+	Same = DPEq('M', g_DPM, DPM_, uPrefixCountA, uPrefixCountB);\
+	if (Same) \
+		Log("DPM success\n"); \
+	Same = DPEq('D', g_DPD, DPD_, uPrefixCountA, uPrefixCountB);\
+	if (Same) \
+		Log("DPD success\n"); \
+	Same = DPEq('E', g_DPE, DPE_, uPrefixCountA, uPrefixCountB);\
+	if (Same) \
+		Log("DPE success\n"); \
+	Same = DPEq('I', g_DPI, DPI_, uPrefixCountA, uPrefixCountB);\
+	if (Same) \
+		Log("DPI success\n"); \
+	Same = DPEq('J', g_DPJ, DPJ_, uPrefixCountA, uPrefixCountB);\
+	if (Same) \
+		Log("DPJ success\n"); \
+	CompareTB(TB, g_TBM, g_TBD, g_TBE, g_TBI, g_TBJ, uPrefixCountA, uPrefixCountB);\
+	}
+#else
+#define LogMatrices() /* empty */
+#endif
+
+// Double-affine global alignment that keeps only a few score rows
+// (MPrev/MCurr/MNext, DRow, ERow) plus one packed trace-back byte per cell.
+//
+// PA, uLengthA	profile A and its length; must be non-empty.
+// PB, uLengthB	profile B and its length; must be non-empty.
+// Path		filled by BitTraceBack from the packed trace-back matrix.
+// Always returns 0; the best score is computed to choose the final edge
+// but is not returned.
+//
+// NOTE(review): the terminal gap open/close scores of both profiles are
+// negated in place through a cast that drops const, and are not negated
+// back before returning - confirm that callers expect this side effect.
+SCORE NWDASmall(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+  unsigned uLengthB, PWPath &Path)
+	{
+	assert(uLengthB > 0 && uLengthA > 0);
+
+	ProfPos *pa0 = (ProfPos *) PA;
+	ProfPos *pb0 = (ProfPos *) PB;
+	ProfPos *paa = (ProfPos *) (PA + uLengthA - 1);
+	ProfPos *pbb = (ProfPos *) (PB + uLengthB - 1);
+
+	pa0->m_scoreGapOpen *= -1;
+	pb0->m_scoreGapOpen *= -1;
+
+	paa->m_scoreGapClose *= -1;
+	pbb->m_scoreGapClose *= -1;
+
+	pa0->m_scoreGapOpen2 *= -1;
+	pb0->m_scoreGapOpen2 *= -1;
+	paa->m_scoreGapClose2 *= -1;
+	pbb->m_scoreGapClose2 *= -1;
+
+	const unsigned uPrefixCountA = uLengthA + 1;
+	const unsigned uPrefixCountB = uLengthB + 1;
+	const SCORE e = g_scoreGapExtend;
+
+	const SCORE e2 = g_scoreGapExtend2;
+
+	ALLOC_TRACE()
+
+	SCORE *MCurr = new SCORE[uPrefixCountB];
+	SCORE *MNext = new SCORE[uPrefixCountB];
+	SCORE *MPrev = new SCORE[uPrefixCountB];
+	SCORE *DRow = new SCORE[uPrefixCountB];
+	SCORE *ERow = new SCORE[uPrefixCountB];
+
+	char **TB = new char *[uPrefixCountA];
+	for (unsigned i = 0; i < uPrefixCountA; ++i)
+		{
+		TB[i] = new char [uPrefixCountB];
+		memset(TB[i], 0, uPrefixCountB);
+		}
+
+	SCORE Iij = MINUS_INFINITY;
+	SetDPI(0, 0, Iij);
+
+	SCORE Jij = MINUS_INFINITY;
+	SetDPJ(0, 0, Jij);
+
+	Iij = PB[0].m_scoreGapOpen;
+	SetDPI(0, 1, Iij);
+
+	Jij = PB[0].m_scoreGapOpen2;
+	SetDPJ(0, 1, Jij);
+
+	for (unsigned j = 2; j <= uLengthB; ++j)
+		{
+		Iij += e;
+		Jij += e2;
+
+		SetDPI(0, j, Iij);
+		SetDPJ(0, j, Jij);
+
+		SetTBI(0, j, 'I');
+		SetTBJ(0, j, 'J');
+		}
+
+	for (unsigned j = 0; j <= uLengthB; ++j)
+		{
+		DRow[j] = MINUS_INFINITY;
+		ERow[j] = MINUS_INFINITY;
+
+		SetDPD(0, j, DRow[j]);
+		SetDPE(0, j, ERow[j]);
+
+		SetTBD(0, j, 'D');
+		SetTBE(0, j, 'E');
+		}
+
+	MPrev[0] = 0;
+	SetDPM(0, 0, MPrev[0]);
+	for (unsigned j = 1; j <= uLengthB; ++j)
+		{
+		MPrev[j] = MINUS_INFINITY;
+		SetDPM(0, j, MPrev[j]);
+		}
+
+	MCurr[0] = MINUS_INFINITY;
+	SetDPM(1, 0, MCurr[0]);
+
+	MCurr[1] = ScoreProfPos2(PA[0], PB[0]);
+	SetDPM(1, 1, MCurr[1]);
+	SetBitTBM(TB, 1, 1, 'M');
+	SetTBM(1, 1, 'M');
+
+	// Row i=1: a match at (1,j) can only follow an insert run covering
+	// B[0..j-2], scored with either set of gap parameters.
+	for (unsigned j = 2; j <= uLengthB; ++j)
+		{
+		SCORE M = ScoreProfPos2(PA[0], PB[j-1]) + PB[0].m_scoreGapOpen +
+		  (j - 2)*e + PB[j-2].m_scoreGapClose;
+		SCORE M2 = ScoreProfPos2(PA[0], PB[j-1]) + PB[0].m_scoreGapOpen2 +
+		  (j - 2)*e2 + PB[j-2].m_scoreGapClose2;
+
+		if (M >= M2)
+			{
+			MCurr[j] = M;
+			SetBitTBM(TB, 1, j, 'I');
+			SetTBM(1, j, 'I');
+			}
+		else
+			{
+			MCurr[j] = M2;
+			SetBitTBM(TB, 1, j, 'J');
+			SetTBM(1, j, 'J');
+			}
+		SetDPM(1, j, MCurr[j]);
+		}
+
+// Main DP loop
+	for (unsigned i = 1; i < uLengthA; ++i)
+		{
+		Iij = MINUS_INFINITY;
+		Jij = MINUS_INFINITY;
+		SetDPI(i, 0, Iij);
+		SetDPJ(i, 0, Jij);
+
+		DRow[0] = PA[0].m_scoreGapOpen + (i - 1)*e;
+		ERow[0] = PA[0].m_scoreGapOpen2 + (i - 1)*e2;
+		SetDPD(i, 0, DRow[0]);
+		SetDPE(i, 0, ERow[0]);
+
+		MCurr[0] = MINUS_INFINITY;
+		if (i == 1)
+			{
+			MCurr[1] = ScoreProfPos2(PA[0], PB[0]);
+			SetBitTBM(TB, i, 1, 'M');
+			SetTBM(i, 1, 'M');
+			}
+		else
+			{
+			SCORE M = ScoreProfPos2(PA[i-1], PB[0]) + PA[0].m_scoreGapOpen +
+			  (i - 2)*e + PA[i-2].m_scoreGapClose;
+			SCORE M2 = ScoreProfPos2(PA[i-1], PB[0]) + PA[0].m_scoreGapOpen2 +
+			  (i - 2)*e2 + PA[i-2].m_scoreGapClose2;
+			if (M >= M2)
+				{
+				MCurr[1] = M;
+				SetBitTBM(TB, i, 1, 'D');
+				SetTBM(i, 1, 'D');
+				}
+			else
+				{
+				MCurr[1] = M2;
+				SetBitTBM(TB, i, 1, 'E');
+				SetTBM(i, 1, 'E');
+				}
+			}
+		SetDPM(i, 0, MCurr[0]);
+		SetDPM(i, 1, MCurr[1]);
+
+		// Seed the next M row with the letter/letter scores; RECURSE_M
+		// adds the best predecessor to each entry.
+		for (unsigned j = 1; j < uLengthB; ++j)
+			MNext[j+1] = ScoreProfPos2(PA[i], PB[j]);
+
+		for (unsigned j = 1; j < uLengthB; ++j)
+			{
+			RECURSE_D(i, j)
+			RECURSE_E(i, j)
+			RECURSE_I(i, j)
+			RECURSE_J(i, j)
+			RECURSE_M(i, j)
+			}
+	// Special case for j=uLengthB
+		RECURSE_D_BTerm(i)
+		RECURSE_E_BTerm(i)
+		RECURSE_I_BTerm(i)
+		RECURSE_J_BTerm(i)
+
+	// Prev := Curr, Curr := Next, Next := Prev
+		Rotate(MPrev, MCurr, MNext);
+		}
+
+// Special case for i=uLengthA
+	MCurr[0] = MINUS_INFINITY;
+	if (uLengthA == 1)
+		{
+		// The main loop did not run, so MCurr is still row 1. There is no
+		// delete run before the match, and uLengthA - 2 would wrap.
+		MCurr[1] = ScoreProfPos2(PA[0], PB[0]);
+		SetBitTBM(TB, 1, 1, 'M');
+		SetTBM(1, 1, 'M');
+		}
+	else
+		{
+		SCORE M = ScoreProfPos2(PA[uLengthA-1], PB[0]) + (uLengthA - 2)*e +
+		  PA[0].m_scoreGapOpen + PA[uLengthA-2].m_scoreGapClose;
+		// Second gap parameters, mirroring the M2 term of the main loop.
+		SCORE M2 = ScoreProfPos2(PA[uLengthA-1], PB[0]) + (uLengthA - 2)*e2 +
+		  PA[0].m_scoreGapOpen2 + PA[uLengthA-2].m_scoreGapClose2;
+		if (M >= M2)
+			{
+			MCurr[1] = M;
+			SetBitTBM(TB, uLengthA, 1, 'D');
+			SetTBM(uLengthA, 1, 'D');
+			}
+		else
+			{
+			MCurr[1] = M2;
+			SetBitTBM(TB, uLengthA, 1, 'E');
+			SetTBM(uLengthA, 1, 'E');
+			}
+		}
+	SetDPM(uLengthA, 0, MCurr[0]);
+	SetDPM(uLengthA, 1, MCurr[1]);
+
+	DRow[0] = MINUS_INFINITY;
+	ERow[0] = MINUS_INFINITY;
+
+	SetDPD(uLengthA, 0, DRow[0]);
+	SetDPE(uLengthA, 0, ERow[0]);
+
+	for (unsigned j = 1; j <= uLengthB; ++j)
+		{
+		RECURSE_D_ATerm(j);
+		RECURSE_E_ATerm(j);
+		}
+
+	Iij = MINUS_INFINITY;
+	Jij = MINUS_INFINITY;
+
+	for (unsigned j = 1; j <= uLengthB; ++j)
+		{
+		RECURSE_I_ATerm(j)
+		RECURSE_J_ATerm(j)
+		}
+
+	LogMatrices();
+
+	SCORE MAB = MCurr[uLengthB];
+	SCORE DAB = DRow[uLengthB] + PA[uLengthA-1].m_scoreGapClose;
+	SCORE EAB = ERow[uLengthB] + PA[uLengthA-1].m_scoreGapClose2;
+	SCORE IAB = Iij + PB[uLengthB-1].m_scoreGapClose;
+	SCORE JAB = Jij + PB[uLengthB-1].m_scoreGapClose2;
+
+	// Pick the final state; ties keep the earlier candidate (M, D, E, I, J).
+	SCORE Score = MAB;
+	char cEdgeType = 'M';
+	if (DAB > Score)
+		{
+		Score = DAB;
+		cEdgeType = 'D';
+		}
+	if (EAB > Score)
+		{
+		Score = EAB;
+		cEdgeType = 'E';
+		}
+	if (IAB > Score)
+		{
+		Score = IAB;
+		cEdgeType = 'I';
+		}
+	if (JAB > Score)
+		{
+		Score = JAB;
+		cEdgeType = 'J';
+		}
+
+#if TRACE
+	Log(" Small: MAB=%.4g DAB=%.4g EAB=%.4g IAB=%.4g JAB=%.4g best=%c\n",
+	  MAB, DAB, EAB, IAB, JAB, cEdgeType);
+#endif
+
+	BitTraceBack(TB, uLengthA, uLengthB, cEdgeType, Path);
+
+	// NOTE(review): assumes DEBUG is the project's debug-build macro - confirm.
+#if DEBUG
+	Path.Validate();
+#endif
+
+	delete[] MCurr;
+	delete[] MNext;
+	delete[] MPrev;
+	delete[] DRow;
+	delete[] ERow;
+	for (unsigned i = 0; i < uPrefixCountA; ++i)
+		delete[] TB[i];
+	delete[] TB;
+
+	return 0;
+	}
+#endif // DOUBLE_AFFINE
diff --git a/src/muscle/muscle3.8.31/src/nwrec.cpp b/src/muscle/muscle3.8.31/src/nwrec.cpp
new file mode 100644
index 0000000..ff759cf
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/nwrec.cpp
@@ -0,0 +1,137 @@
+/***
+Needleman-Wunch recursions
+
+Notation: i,j are prefix lengths so are in
+ranges i = [0,|A|] and j = [0,|B|].
+ +Profile positions are in ranges [0,|A|-1] +and [0,|B|-1] so prefix length i corresponds +to position (i-1) in the profile, and similarly +for j. + +Terminal gap scoring +-------------------- +Terminal gaps are scored as with open [close] +penalties only at the left [right] terminal, +as follows: + + 0 i + | | + A XXXXX... + B ---XX... + + i |A|-1 + | | + A ...XXXXX + B ...XX--- + +In these examples, open / close penalty at position +i is included, but close / open penalty at |A|-1 / +0 is not included. + +This is implemented by setting the open [close] +penalty to zero in the first [last] position of +each profile. + +Consider adding a column to a sub-alignment. After the +column is added, there are i letters from A and j letters +from B. + +The column starts a left-terminal gap if: + Delete with i=1, j=0 or + Insert with i=0, j=1. + +The column ends a left-terminal gap if: + Match following Delete with j=1, or + Match following Insert with i=1. + +The column starts a right-terminal gap if: + Delete following a Match and i=|A|, or + Insert following a Match and j=|B|. + +The column ends a right-terminal gap if: + Match with i=|A|, j=|B| following Delete or Insert. + +RECURSION RELATIONS +=================== + + i-1 + | +DD A ..X X + B ..- - + +MD A ..X X + B ..X - + +D(i,j) = max + D(i-1,j) + e + M(i-1,j) + goA(i-1) +Valid for: + i = [1,|A|-1] + j = [1,|B|] + +I(i,j) By symmetry with D(i,j). + + i-2 + | i-1 + | | +MM A ..X X + B ..X X + +DM A ..X X + B ..- X + +IM A ..- X + B ..X X + | | + | j-1 + j-2 + +M(i,j) = L(i-1,j-1) + max + M(i-1,j-1) + D(i-1,j-1) + gcA(i-2) + I(i-1,j-1) + gcB(j-2) +Valid for: + i = [2,|A|] + j = [2,|B|] + +Equivalently: + +M(i+1,j+1) = L(i,j) + max + M(i,j) + D(i,j) + gcA(i-1) + I(i,j) + gcB(j-1) + +Valid for: + i = [1,|A|-1] + j = [1,|B|-1] + +Boundary conditions +=================== + +A XXXX +B ---- + D(0,0) = -infinity + + D(i,0) = ie + i = [1,|A|] + + D(0,j) = -infinity + j = [0,|B|] + +I(0,0), I(0,j) and I(i,0) by symmetry with D. 
+ + M(0,0) = 0 + M(i,0) = -infinity, i > 0 + M(0,j) = -infinity, j > 0 + +A X +B - + D(1,0) = e + D(1,j) = -infinity, j = [1,|B|] + (assuming no I-D allowed). + + D(0,1) = -infinity + D(1,1) = -infinity + D(i,1) = max. +***/ diff --git a/src/muscle/muscle3.8.31/src/nwsmall.cpp b/src/muscle/muscle3.8.31/src/nwsmall.cpp new file mode 100644 index 0000000..655882c --- /dev/null +++ b/src/muscle/muscle3.8.31/src/nwsmall.cpp @@ -0,0 +1,664 @@ +#include "muscle.h" +#include +#include "pwpath.h" +#include "profile.h" +#include + +// NW small memory + +#define TRACE 0 + +#if TRACE +extern bool g_bKeepSimpleDP; +extern SCORE *g_DPM; +extern SCORE *g_DPD; +extern SCORE *g_DPI; +extern char *g_TBM; +extern char *g_TBD; +extern char *g_TBI; +#endif + +#if TRACE +#define ALLOC_TRACE() \ + const SCORE UNINIT = MINUS_INFINITY; \ + const size_t LM = uPrefixCountA*uPrefixCountB; \ + \ + SCORE *DPM_ = new SCORE[LM]; \ + SCORE *DPD_ = new SCORE[LM]; \ + SCORE *DPI_ = new SCORE[LM]; \ + \ + char *TBM_ = new char[LM]; \ + char *TBD_ = new char[LM]; \ + char *TBI_ = new char[LM]; \ + \ + memset(TBM_, '?', LM); \ + memset(TBD_, '?', LM); \ + memset(TBI_, '?', LM); \ + \ + for (unsigned i = 0; i <= uLengthA; ++i) \ + for (unsigned j = 0; j <= uLengthB; ++j) \ + { \ + DPM(i, j) = UNINIT; \ + DPD(i, j) = UNINIT; \ + DPI(i, j) = UNINIT; \ + } +#else +#define ALLOC_TRACE() +#endif + +#if TRACE +#define SetDPM(i, j, x) DPM(i, j) = x +#define SetDPD(i, j, x) DPD(i, j) = x +#define SetDPI(i, j, x) DPI(i, j) = x +#define SetTBM(i, j, x) TBM(i, j) = x +#define SetTBD(i, j, x) TBD(i, j) = x +#define SetTBI(i, j, x) TBI(i, j) = x +#else +#define SetDPM(i, j, x) /* empty */ +#define SetDPD(i, j, x) /* empty */ +#define SetDPI(i, j, x) /* empty */ +#define SetTBM(i, j, x) /* empty */ +#define SetTBD(i, j, x) /* empty */ +#define SetTBI(i, j, x) /* empty */ +#endif + +#define RECURSE_D(i, j) \ + { \ + SCORE DD = DRow[j] + e; \ + SCORE MD = MPrev[j] + PA[i-1].m_scoreGapOpen;\ + if (DD > MD) \ + { \ + 
DRow[j] = DD; \ + SetTBD(i, j, 'D'); \ + } \ + else \ + { \ + DRow[j] = MD; \ + /* SetBitTBD(TB, i, j, 'M'); */ \ + TBRow[j] &= ~BIT_xD; \ + TBRow[j] |= BIT_MD; \ + SetTBD(i, j, 'M'); \ + } \ + SetDPD(i, j, DRow[j]); \ + } + +#define RECURSE_D_ATerm(j) RECURSE_D(uLengthA, j) +#define RECURSE_D_BTerm(j) RECURSE_D(i, uLengthB) + +#define RECURSE_I(i, j) \ + { \ + Iij += e; \ + SCORE MI = MCurr[j-1] + PB[j-1].m_scoreGapOpen;\ + if (MI >= Iij) \ + { \ + Iij = MI; \ + /* SetBitTBI(TB, i, j, 'M'); */ \ + TBRow[j] &= ~BIT_xI; \ + TBRow[j] |= BIT_MI; \ + SetTBI(i, j, 'M'); \ + } \ + else \ + SetTBI(i, j, 'I'); \ + SetDPI(i, j, Iij); \ + } + +#define RECURSE_I_ATerm(j) RECURSE_I(uLengthA, j) +#define RECURSE_I_BTerm(j) RECURSE_I(i, uLengthB) + +#define RECURSE_M(i, j) \ + { \ + SCORE DM = DRow[j] + PA[i-1].m_scoreGapClose; \ + SCORE IM = Iij + PB[j-1].m_scoreGapClose; \ + SCORE MM = MCurr[j]; \ + TB[i+1][j+1] &= ~BIT_xM; \ + if (MM >= DM && MM >= IM) \ + { \ + MNext[j+1] += MM; \ + SetDPM(i+1, j+1, MNext[j+1]); \ + SetTBM(i+1, j+1, 'M'); \ + /* SetBitTBM(TB, i+1, j+1, 'M'); */ \ + TB[i+1][j+1] |= BIT_MM; \ + } \ + else if (DM >= MM && DM >= IM) \ + { \ + MNext[j+1] += DM; \ + SetDPM(i+1, j+1, MNext[j+1]); \ + SetTBM(i+1, j+1, 'D'); \ + /* SetBitTBM(TB, i+1, j+1, 'D'); */ \ + TB[i+1][j+1] |= BIT_DM; \ + } \ + else \ + { \ + assert(IM >= MM && IM >= DM); \ + MNext[j+1] += IM; \ + SetDPM(i+1, j+1, MNext[j+1]); \ + SetTBM(i+1, j+1, 'I'); \ + /* SetBitTBM(TB, i+1, j+1, 'I'); */ \ + TB[i+1][j+1] |= BIT_IM; \ + } \ + } + +#if TRACE +static bool LocalEq(BASETYPE b1, BASETYPE b2) + { + if (b1 < -100000 && b2 < -100000) + return true; + double diff = fabs(b1 - b2); + if (diff < 0.0001) + return true; + double sum = fabs(b1) + fabs(b2); + return diff/sum < 0.005; + } + +static char Get_M_Char(char Bits) + { + switch (Bits & BIT_xM) + { + case BIT_MM: + return 'M'; + case BIT_DM: + return 'D'; + case BIT_IM: + return 'I'; + } + Quit("Huh?"); + return '?'; + } + +static char 
Get_D_Char(char Bits) + { + return (Bits & BIT_xD) ? 'M' : 'D'; + } + +static char Get_I_Char(char Bits) + { + return (Bits & BIT_xI) ? 'M' : 'I'; + } + +static bool DPEq(char c, SCORE *g_DP, SCORE *DPD_, + unsigned uPrefixCountA, unsigned uPrefixCountB) + { + SCORE *DPM_ = g_DP; + for (unsigned i = 0; i < uPrefixCountA; ++i) + for (unsigned j = 0; j < uPrefixCountB; ++j) + if (!LocalEq(DPM(i, j), DPD(i, j))) + { + Log("***DPDIFF*** DP%c(%d, %d) Simple = %.2g, Fast = %.2g\n", + c, i, j, DPM(i, j), DPD(i, j)); + return false; + } + return true; + } + +static bool CompareTB(char **TB, char *TBM_, char *TBD_, char *TBI_, + unsigned uPrefixCountA, unsigned uPrefixCountB) + { + SCORE *DPM_ = g_DPM; + bool Eq = true; + for (unsigned i = 0; i < uPrefixCountA; ++i) + for (unsigned j = 0; j < uPrefixCountB; ++j) + { + char c1 = TBM(i, j); + char c2 = Get_M_Char(TB[i][j]); + if (c1 != '?' && c1 != c2 && DPM(i, j) > -100000) + { + Log("TBM(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2); + Eq = false; + goto D; + } + } + +D: + SCORE *DPD_ = g_DPD; + for (unsigned i = 0; i < uPrefixCountA; ++i) + for (unsigned j = 0; j < uPrefixCountB; ++j) + { + char c1 = TBD(i, j); + char c2 = Get_D_Char(TB[i][j]); + if (c1 != '?' && c1 != c2 && DPD(i, j) > -100000) + { + Log("TBD(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2); + Eq = false; + goto I; + } + } +I: + SCORE *DPI_ = g_DPI; + for (unsigned i = 0; i < uPrefixCountA; ++i) + for (unsigned j = 0; j < uPrefixCountB; ++j) + { + char c1 = TBI(i, j); + char c2 = Get_I_Char(TB[i][j]); + if (c1 != '?' 
&& c1 != c2 && DPI(i, j) > -100000) + { + Log("TBI(%d, %d) Simple = %c, NW = %c\n", i, j, c1, c2); + Eq = false; + goto Done; + } + } +Done: + if (Eq) + Log("TB success\n"); + return Eq; + } + +static const char *LocalScoreToStr(SCORE s) + { + static char str[16]; + if (s < -100000) + return " *"; + sprintf(str, "%6.1f", s); + return str; + } + +static void LogDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB, + unsigned uPrefixCountA, unsigned uPrefixCountB) + { + Log(" "); + for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) + { + char c = ' '; + if (uPrefixLengthB > 0) + c = ConsensusChar(PB[uPrefixLengthB - 1]); + Log(" %4u:%c", uPrefixLengthB, c); + } + Log("\n"); + for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) + { + char c = ' '; + if (uPrefixLengthA > 0) + c = ConsensusChar(PA[uPrefixLengthA - 1]); + Log("%4u:%c ", uPrefixLengthA, c); + for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) + Log(" %s", LocalScoreToStr(DPM(uPrefixLengthA, uPrefixLengthB))); + Log("\n"); + } + } + +static void LogBitTB(char **TB, const ProfPos *PA, const ProfPos *PB, + unsigned uPrefixCountA, unsigned uPrefixCountB) + { + Log(" "); + for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) + { + char c = ' '; + if (uPrefixLengthB > 0) + c = ConsensusChar(PB[uPrefixLengthB - 1]); + Log(" %4u:%c", uPrefixLengthB, c); + } + Log("\n"); + Log("Bit TBM:\n"); + for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) + { + char c = ' '; + if (uPrefixLengthA > 0) + c = ConsensusChar(PA[uPrefixLengthA - 1]); + Log("%4u:%c ", uPrefixLengthA, c); + for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) + { + char c = Get_M_Char(TB[uPrefixLengthA][uPrefixLengthB]); + Log(" %6c", c); + } + Log("\n"); + } + + Log("\n"); + Log("Bit TBD:\n"); + for (unsigned uPrefixLengthA = 0; uPrefixLengthA < 
uPrefixCountA; ++uPrefixLengthA)
+		{
+		char c = ' ';
+		if (uPrefixLengthA > 0)
+			c = ConsensusChar(PA[uPrefixLengthA - 1]);
+		Log("%4u:%c ", uPrefixLengthA, c);
+		for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+			{
+			char c = Get_D_Char(TB[uPrefixLengthA][uPrefixLengthB]);
+			Log(" %6c", c);
+			}
+		Log("\n");
+		}
+
+	Log("\n");
+	Log("Bit TBI:\n");
+	for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+		{
+		char c = ' ';
+		if (uPrefixLengthA > 0)
+			c = ConsensusChar(PA[uPrefixLengthA - 1]);
+		Log("%4u:%c ", uPrefixLengthA, c);
+		for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+			{
+			char c = Get_I_Char(TB[uPrefixLengthA][uPrefixLengthB]);
+			Log(" %6c", c);
+			}
+		Log("\n");
+		}
+	}
+
+// Log the character traceback matrix read through the TBM(i, j) macro:
+// a header row labelled with the consensus character of each B position,
+// then one row per prefix length of A labelled the same way.
+static void ListTB(char *TBM_, const ProfPos *PA, const ProfPos *PB,
+  unsigned uPrefixCountA, unsigned uPrefixCountB)
+	{
+	Log(" ");
+	for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+		{
+		char c = ' ';
+		if (uPrefixLengthB > 0)
+			c = ConsensusChar(PB[uPrefixLengthB - 1]);
+		Log(" %4u:%c", uPrefixLengthB, c);
+		}
+	Log("\n");
+	for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+		{
+		char c = ' ';
+		if (uPrefixLengthA > 0)
+			c = ConsensusChar(PA[uPrefixLengthA - 1]);
+		Log("%4u:%c ", uPrefixLengthA, c);
+		for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+			{
+			char c = TBM(uPrefixLengthA, uPrefixLengthB);
+			Log(" %6c", c);
+			}
+		Log("\n");
+		}
+	}
+
+// Format the M, D and I traceback characters decoded from Bits as
+// "xM xD xI" (8 characters plus the terminator, which is what Str holds).
+// Returns a pointer to a static buffer that the next call overwrites.
+static const char *BitsToStr(char Bits)
+	{
+	static char Str[9];
+
+	sprintf(Str, "%cM %cD %cI",
+	  Get_M_Char(Bits),
+	  Get_D_Char(Bits),
+	  Get_I_Char(Bits));
+	// The function used to fall off its end without returning a value.
+	return Str;
+	}
+#endif // TRACE
+
+static inline void SetBitTBM(char **TB, unsigned i, unsigned j, char c)
+	{
+	char Bit;
+	switch (c)
+		{
+	case 'M':
+		Bit = BIT_MM;
+		break;
+	case 'D':
+		Bit = BIT_DM;
+		break;
+	case 'I':
+		Bit = BIT_IM;
+		break;
+	default:
+		Quit("Huh?!");
+		}
+	TB[i][j] 
&= ~BIT_xM; + TB[i][j] |= Bit; + } + +static inline void SetBitTBD(char **TB, unsigned i, unsigned j, char c) + { + char Bit; + switch (c) + { + case 'M': + Bit = BIT_MD; + break; + case 'D': + Bit = BIT_DD; + break; + default: + Quit("Huh?!"); + } + TB[i][j] &= ~BIT_xD; + TB[i][j] |= Bit; + } + +static inline void SetBitTBI(char **TB, unsigned i, unsigned j, char c) + { + char Bit; + switch (c) + { + case 'M': + Bit = BIT_MI; + break; + case 'I': + Bit = BIT_II; + break; + default: + Quit("Huh?!"); + } + TB[i][j] &= ~BIT_xI; + TB[i][j] |= Bit; + } + +#if TRACE +#define LogMatrices() \ + { \ + Log("Bit DPM:\n"); \ + LogDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB); \ + Log("Bit DPD:\n"); \ + LogDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB); \ + Log("Bit DPI:\n"); \ + LogDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB); \ + Log("Bit TB:\n"); \ + LogBitTB(TB, PA, PB, uPrefixCountA, uPrefixCountB); \ + bool Same; \ + Same = DPEq('M', g_DPM, DPM_, uPrefixCountA, uPrefixCountB);\ + if (Same) \ + Log("DPM success\n"); \ + Same = DPEq('D', g_DPD, DPD_, uPrefixCountA, uPrefixCountB);\ + if (Same) \ + Log("DPD success\n"); \ + Same = DPEq('I', g_DPI, DPI_, uPrefixCountA, uPrefixCountB);\ + if (Same) \ + Log("DPI success\n"); \ + CompareTB(TB, g_TBM, g_TBD, g_TBI, uPrefixCountA, uPrefixCountB);\ + } +#else +#define LogMatrices() /* empty */ +#endif + +static unsigned uCachePrefixCountB; +static unsigned uCachePrefixCountA; +static SCORE *CacheMCurr; +static SCORE *CacheMNext; +static SCORE *CacheMPrev; +static SCORE *CacheDRow; +static char **CacheTB; + +static void AllocCache(unsigned uPrefixCountA, unsigned uPrefixCountB) + { + if (uPrefixCountA <= uCachePrefixCountA && uPrefixCountB <= uCachePrefixCountB) + return; + + delete[] CacheMCurr; + delete[] CacheMNext; + delete[] CacheMPrev; + delete[] CacheDRow; + for (unsigned i = 0; i < uCachePrefixCountA; ++i) + delete[] CacheTB[i]; + delete[] CacheTB; + + uCachePrefixCountA = uPrefixCountA + 1024; + uCachePrefixCountB = 
uPrefixCountB + 1024; + + CacheMCurr = new SCORE[uCachePrefixCountB]; + CacheMNext = new SCORE[uCachePrefixCountB]; + CacheMPrev = new SCORE[uCachePrefixCountB]; + CacheDRow = new SCORE[uCachePrefixCountB]; + + CacheTB = new char *[uCachePrefixCountA]; + for (unsigned i = 0; i < uCachePrefixCountA; ++i) + CacheTB[i] = new char [uCachePrefixCountB]; + } + +SCORE NWSmall(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, PWPath &Path) + { + if (0 == uLengthB || 0 == uLengthA ) + Quit("Internal error, NWSmall: length=0"); + + SetTermGaps(PA, uLengthA); + SetTermGaps(PB, uLengthB); + + const unsigned uPrefixCountA = uLengthA + 1; + const unsigned uPrefixCountB = uLengthB + 1; + const SCORE e = g_scoreGapExtend; + + ALLOC_TRACE() + + AllocCache(uPrefixCountA, uPrefixCountB); + + SCORE *MCurr = CacheMCurr; + SCORE *MNext = CacheMNext; + SCORE *MPrev = CacheMPrev; + SCORE *DRow = CacheDRow; + + char **TB = CacheTB; + for (unsigned i = 0; i < uPrefixCountA; ++i) + memset(TB[i], 0, uPrefixCountB); + + SCORE Iij = MINUS_INFINITY; + SetDPI(0, 0, Iij); + + Iij = PB[0].m_scoreGapOpen; + SetDPI(0, 1, Iij); + + for (unsigned j = 2; j <= uLengthB; ++j) + { + Iij += e; + SetDPI(0, j, Iij); + SetTBI(0, j, 'I'); + } + + for (unsigned j = 0; j <= uLengthB; ++j) + { + DRow[j] = MINUS_INFINITY; + SetDPD(0, j, DRow[j]); + SetTBD(0, j, 'D'); + } + + MPrev[0] = 0; + SetDPM(0, 0, MPrev[0]); + for (unsigned j = 1; j <= uLengthB; ++j) + { + MPrev[j] = MINUS_INFINITY; + SetDPM(0, j, MPrev[j]); + } + + MCurr[0] = MINUS_INFINITY; + SetDPM(1, 0, MCurr[0]); + + MCurr[1] = ScoreProfPos2(PA[0], PB[0]); + SetDPM(1, 1, MCurr[1]); + SetBitTBM(TB, 1, 1, 'M'); + SetTBM(1, 1, 'M'); + + for (unsigned j = 2; j <= uLengthB; ++j) + { + MCurr[j] = ScoreProfPos2(PA[0], PB[j-1]) + PB[0].m_scoreGapOpen + + (j - 2)*e + PB[j-2].m_scoreGapClose; + SetDPM(1, j, MCurr[j]); + SetBitTBM(TB, 1, j, 'I'); + SetTBM(1, j, 'I'); + } + +// Main DP loop + for (unsigned i = 1; i < uLengthA; ++i) + { + 
char *TBRow = TB[i]; + + Iij = MINUS_INFINITY; + SetDPI(i, 0, Iij); + + DRow[0] = PA[0].m_scoreGapOpen + (i - 1)*e; + SetDPD(i, 0, DRow[0]); + + MCurr[0] = MINUS_INFINITY; + if (i == 1) + { + MCurr[1] = ScoreProfPos2(PA[0], PB[0]); + SetBitTBM(TB, i, 1, 'M'); + SetTBM(i, 1, 'M'); + } + else + { + MCurr[1] = ScoreProfPos2(PA[i-1], PB[0]) + PA[0].m_scoreGapOpen + + (i - 2)*e + PA[i-2].m_scoreGapClose; + SetBitTBM(TB, i, 1, 'D'); + SetTBM(i, 1, 'D'); + } + SetDPM(i, 0, MCurr[0]); + SetDPM(i, 1, MCurr[1]); + + for (unsigned j = 1; j < uLengthB; ++j) + MNext[j+1] = ScoreProfPos2(PA[i], PB[j]); + + for (unsigned j = 1; j < uLengthB; ++j) + { + RECURSE_D(i, j) + RECURSE_I(i, j) + RECURSE_M(i, j) + } + // Special case for j=uLengthB + RECURSE_D_BTerm(i) + RECURSE_I_BTerm(i) + + // Prev := Curr, Curr := Next, Next := Prev + Rotate(MPrev, MCurr, MNext); + } + +// Special case for i=uLengthA + char *TBRow = TB[uLengthA]; + MCurr[0] = MINUS_INFINITY; + if (uLengthA > 1) + MCurr[1] = ScoreProfPos2(PA[uLengthA-1], PB[0]) + (uLengthA - 2)*e + + PA[0].m_scoreGapOpen + PA[uLengthA-2].m_scoreGapClose; + else + MCurr[1] = ScoreProfPos2(PA[uLengthA-1], PB[0]) + PA[0].m_scoreGapOpen + + PA[0].m_scoreGapClose; + SetBitTBM(TB, uLengthA, 1, 'D'); + SetTBM(uLengthA, 1, 'D'); + SetDPM(uLengthA, 0, MCurr[0]); + SetDPM(uLengthA, 1, MCurr[1]); + + DRow[0] = MINUS_INFINITY; + SetDPD(uLengthA, 0, DRow[0]); + for (unsigned j = 1; j <= uLengthB; ++j) + RECURSE_D_ATerm(j); + + Iij = MINUS_INFINITY; + for (unsigned j = 1; j <= uLengthB; ++j) + RECURSE_I_ATerm(j) + + LogMatrices(); + + SCORE MAB = MCurr[uLengthB]; + SCORE DAB = DRow[uLengthB]; + SCORE IAB = Iij; + + SCORE Score = MAB; + char cEdgeType = 'M'; + if (DAB > Score) + { + Score = DAB; + cEdgeType = 'D'; + } + if (IAB > Score) + { + Score = IAB; + cEdgeType = 'I'; + } + +#if TRACE + Log(" Fast: MAB=%.4g DAB=%.4g IAB=%.4g best=%c\n", + MAB, DAB, IAB, cEdgeType); +#endif + + BitTraceBack(TB, uLengthA, uLengthB, cEdgeType, Path); + +#if DBEUG 
+	Path.Validate();
+#endif
+
+	return 0;
+	}
diff --git a/src/muscle/muscle3.8.31/src/objscore.cpp b/src/muscle/muscle3.8.31/src/objscore.cpp
new file mode 100644
index 0000000..6af91d9
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/objscore.cpp
@@ -0,0 +1,127 @@
+#include "muscle.h"
+#include "msa.h"
+#include "objscore.h"
+#include "profile.h"
+#include "timing.h"
+
+#if TIMING
+TICKS g_ticksObjScore = 0;
+#endif
+
+// Compute the objective score of msa using the function selected by
+// g_ObjScore. OBJSCORE_SPM is resolved here: XP for at most 100
+// sequences, SPF otherwise. DP and XP score the two index subsets
+// against each other; SP, SPF and PS score the whole alignment.
+SCORE ObjScore(const MSA &msa, const unsigned SeqIndexes1[],
+  unsigned uSeqCount1, const unsigned SeqIndexes2[], unsigned uSeqCount2)
+	{
+#if TIMING
+	TICKS t1 = GetClockTicks();
+#endif
+	const unsigned uSeqCount = msa.GetSeqCount();
+
+	OBJSCORE OS = g_ObjScore;
+	if (g_ObjScore == OBJSCORE_SPM)
+		{
+		if (uSeqCount <= 100)
+			OS = OBJSCORE_XP;
+		else
+			OS = OBJSCORE_SPF;
+		}
+
+	MSA msa1;
+	MSA msa2;
+
+	switch (OS)
+		{
+	case OBJSCORE_DP:
+	case OBJSCORE_XP:
+		MSAFromSeqSubset(msa, SeqIndexes1, uSeqCount1, msa1);
+		MSAFromSeqSubset(msa, SeqIndexes2, uSeqCount2, msa2);
+
+		SetMSAWeightsMuscle(msa1);
+		SetMSAWeightsMuscle(msa2);
+		break;
+
+	case OBJSCORE_SP:
+	case OBJSCORE_SPF:
+	case OBJSCORE_PS:
+		// Yuck -- casting away const (design flaw)
+		SetMSAWeightsMuscle((MSA &) msa);
+		break;
+		}
+
+	SCORE Score = 0;
+	switch (OS)
+		{
+	case OBJSCORE_SP:
+		Score = ObjScoreSP(msa);
+		break;
+
+	case OBJSCORE_DP:
+		Score = ObjScoreDP(msa1, msa2);
+		break;
+
+	case OBJSCORE_XP:
+		Score = ObjScoreXP(msa1, msa2);
+		break;
+
+	case OBJSCORE_PS:
+		Score = ObjScorePS(msa);
+		break;
+
+	case OBJSCORE_SPF:
+		Score = ObjScoreSPDimer(msa);
+		break;
+
+	default:
+		Quit("Invalid g_ObjScore=%d", g_ObjScore);
+		}
+#if TIMING
+	TICKS t2 = GetClockTicks();
+	g_ticksObjScore += (t2 - t1);
+#endif
+	return Score;
+	}
+
+// Score msa for two groups of sequences identified by sequence id.
+// The ids are translated to sequence indexes with msa.GetSeqIndex().
+// The score comes from ObjScoreDA when DOUBLE_AFFINE is set and from
+// ObjScore otherwise. The temporary index arrays are released before
+// returning in both configurations.
+SCORE ObjScoreIds(const MSA &msa, const unsigned Ids1[],
+  unsigned uCount1, const unsigned Ids2[], unsigned uCount2)
+	{
+#if TIMING
+	TICKS t1 = GetClockTicks();
+#endif
+	unsigned *SeqIndexes1 = new unsigned[uCount1];
+	unsigned *SeqIndexes2 = new unsigned[uCount2];
+
+	for (unsigned n = 0; n < uCount1; ++n)
+		SeqIndexes1[n] = msa.GetSeqIndex(Ids1[n]);
+
+	for (unsigned n = 0; n < uCount2; ++n)
+		SeqIndexes2[n] = msa.GetSeqIndex(Ids2[n]);
+
+#if DOUBLE_AFFINE
+	// This scorer is called with the whole alignment only; the index
+	// arrays are not passed to it.
+	extern SCORE ObjScoreDA(const MSA &msa, SCORE *ptrLetters, SCORE *ptrGaps);
+	SCORE Letters, Gaps;
+	SCORE dObjScore = ObjScoreDA(msa, &Letters, &Gaps);
+#else
+	SCORE dObjScore = ObjScore(msa, SeqIndexes1, uCount1, SeqIndexes2, uCount2);
+#endif
+
+	// Free on every path: these deletes used to sit inside the
+	// DOUBLE_AFFINE branch, so the default build leaked both arrays.
+	delete[] SeqIndexes1;
+	delete[] SeqIndexes2;
+
+#if TIMING
+	TICKS t2 = GetClockTicks();
+	g_ticksObjScore += (t2 - t1);
+#endif
+	return dObjScore;
+	}
diff --git a/src/muscle/muscle3.8.31/src/objscore.h b/src/muscle/muscle3.8.31/src/objscore.h
new file mode 100644
index 0000000..d769644
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/objscore.h
@@ -0,0 +1,30 @@
+#ifndef ObjScore_h
+#define ObjScore_h
+
+SCORE ScoreSeqPairGaps(const MSA &msa1, unsigned uSeqIndex1,
+  const MSA &msa2, unsigned uSeqIndex2);
+SCORE ScoreSeqPairLetters(const MSA &msa1, unsigned uSeqIndex1,
+  const MSA &msa2, unsigned uSeqIndex2);
+SCORE ScoreGaps(const MSA &msa, const unsigned Cols[], unsigned ColCount);
+
+SCORE ObjScore(const MSA &msa, const unsigned SeqIndexes1[],
+  unsigned uSeqCount1, const unsigned SeqIndexes2[], unsigned uSeqCount2);
+
+SCORE ObjScoreIds(const MSA &msa, const unsigned Ids1[],
+  unsigned uCount1, const unsigned Ids2[], unsigned uCount2);
+
+void GetLetterScores(const MSA &msa, SCORE LetterScores[]);
+
+SCORE ObjScoreDP(const MSA &msa1, const MSA &msa2, SCORE MatchScore[] = 0);
+SCORE ObjScorePS(const MSA &msa, SCORE MatchScore[] = 0);
+SCORE ObjScoreSP(const MSA &msa, SCORE MatchScore[] = 0);
+SCORE ObjScoreXP(const MSA &msa, const MSA &msa2);
+SCORE ObjScoreSPDimer(const MSA &msa);
+SCORE ObjScoreDP_Profs(const ProfPos *PA, const ProfPos *PB, unsigned uColCount,
+  SCORE MatchScore[] = 0);
+
+SCORE DiffObjScore(
+  const MSA &msa1, const PWPath &Path1, const unsigned Edges1[], unsigned uEdgeCount1,
+  
const MSA &msa2, const PWPath &Path2, const unsigned Edges2[], unsigned uEdgeCount2); + +#endif // ObjScore_h diff --git a/src/muscle/muscle3.8.31/src/objscore2.cpp b/src/muscle/muscle3.8.31/src/objscore2.cpp new file mode 100644 index 0000000..bac5b5a --- /dev/null +++ b/src/muscle/muscle3.8.31/src/objscore2.cpp @@ -0,0 +1,522 @@ +#include "muscle.h" +#include "msa.h" +#include "profile.h" +#include "objscore.h" + +#define TRACE 0 +#define TRACE_SEQPAIR 0 +#define TEST_SPFAST 0 + +extern SCOREMATRIX VTML_LA; +extern SCOREMATRIX PAM200; +extern SCOREMATRIX PAM200NoCenter; +extern SCOREMATRIX VTML_SP; +extern SCOREMATRIX VTML_SPNoCenter; +extern SCOREMATRIX NUC_SP; + +SCORE g_SPScoreLetters; +SCORE g_SPScoreGaps; + +static SCORE TermGapScore(bool Gap) + { + switch (g_TermGaps) + { + case TERMGAPS_Full: + return 0; + + case TERMGAPS_Half: + if (Gap) + return g_scoreGapOpen/2; + return 0; + + case TERMGAPS_Ext: + if (Gap) + return g_scoreGapExtend; + return 0; + } + Quit("TermGapScore?!"); + return 0; + } + +SCORE ScoreSeqPairLetters(const MSA &msa1, unsigned uSeqIndex1, + const MSA &msa2, unsigned uSeqIndex2) + { + const unsigned uColCount = msa1.GetColCount(); + const unsigned uColCount2 = msa2.GetColCount(); + if (uColCount != uColCount2) + Quit("ScoreSeqPairLetters, different lengths"); + +#if TRACE_SEQPAIR + { + Log("\n"); + Log("ScoreSeqPairLetters\n"); + MSA msaTmp; + msaTmp.SetSize(2, uColCount); + msaTmp.CopySeq(0, msa1, uSeqIndex1); + msaTmp.CopySeq(1, msa2, uSeqIndex2); + msaTmp.LogMe(); + } +#endif + + SCORE scoreLetters = 0; + SCORE scoreGaps = 0; + bool bGapping1 = false; + bool bGapping2 = false; + + unsigned uColStart = 0; + bool bLeftTermGap = false; + for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) + { + bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex); + bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex); + if (!bGap1 || !bGap2) + { + if (bGap1 || bGap2) + bLeftTermGap = true; + uColStart = uColIndex; + break; + } + } + + unsigned uColEnd 
= uColCount - 1; + bool bRightTermGap = false; + for (int iColIndex = (int) uColCount - 1; iColIndex >= 0; --iColIndex) + { + bool bGap1 = msa1.IsGap(uSeqIndex1, iColIndex); + bool bGap2 = msa2.IsGap(uSeqIndex2, iColIndex); + if (!bGap1 || !bGap2) + { + if (bGap1 || bGap2) + bRightTermGap = true; + uColEnd = (unsigned) iColIndex; + break; + } + } + +#if TRACE_SEQPAIR + Log("LeftTermGap=%d RightTermGap=%d\n", bLeftTermGap, bRightTermGap); +#endif + + for (unsigned uColIndex = uColStart; uColIndex <= uColEnd; ++uColIndex) + { + unsigned uLetter1 = msa1.GetLetterEx(uSeqIndex1, uColIndex); + if (uLetter1 >= g_AlphaSize) + continue; + unsigned uLetter2 = msa2.GetLetterEx(uSeqIndex2, uColIndex); + if (uLetter2 >= g_AlphaSize) + continue; + + SCORE scoreMatch = (*g_ptrScoreMatrix)[uLetter1][uLetter2]; + scoreLetters += scoreMatch; + } + return scoreLetters; + } + +SCORE ScoreSeqPairGaps(const MSA &msa1, unsigned uSeqIndex1, + const MSA &msa2, unsigned uSeqIndex2) + { + const unsigned uColCount = msa1.GetColCount(); + const unsigned uColCount2 = msa2.GetColCount(); + if (uColCount != uColCount2) + Quit("ScoreSeqPairGaps, different lengths"); + +#if TRACE_SEQPAIR + { + Log("\n"); + Log("ScoreSeqPairGaps\n"); + MSA msaTmp; + msaTmp.SetSize(2, uColCount); + msaTmp.CopySeq(0, msa1, uSeqIndex1); + msaTmp.CopySeq(1, msa2, uSeqIndex2); + msaTmp.LogMe(); + } +#endif + + SCORE scoreGaps = 0; + bool bGapping1 = false; + bool bGapping2 = false; + + unsigned uColStart = 0; + bool bLeftTermGap = false; + for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) + { + bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex); + bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex); + if (!bGap1 || !bGap2) + { + if (bGap1 || bGap2) + bLeftTermGap = true; + uColStart = uColIndex; + break; + } + } + + unsigned uColEnd = uColCount - 1; + bool bRightTermGap = false; + for (int iColIndex = (int) uColCount - 1; iColIndex >= 0; --iColIndex) + { + bool bGap1 = msa1.IsGap(uSeqIndex1, iColIndex); + bool 
bGap2 = msa2.IsGap(uSeqIndex2, iColIndex); + if (!bGap1 || !bGap2) + { + if (bGap1 || bGap2) + bRightTermGap = true; + uColEnd = (unsigned) iColIndex; + break; + } + } + +#if TRACE_SEQPAIR + Log("LeftTermGap=%d RightTermGap=%d\n", bLeftTermGap, bRightTermGap); +#endif + + for (unsigned uColIndex = uColStart; uColIndex <= uColEnd; ++uColIndex) + { + bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex); + bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex); + + if (bGap1 && bGap2) + continue; + + if (bGap1) + { + if (!bGapping1) + { +#if TRACE_SEQPAIR + Log("Gap open seq 1 col %d\n", uColIndex); +#endif + if (uColIndex == uColStart) + scoreGaps += TermGapScore(true); + else + scoreGaps += g_scoreGapOpen; + bGapping1 = true; + } + else + scoreGaps += g_scoreGapExtend; + continue; + } + + else if (bGap2) + { + if (!bGapping2) + { +#if TRACE_SEQPAIR + Log("Gap open seq 2 col %d\n", uColIndex); +#endif + if (uColIndex == uColStart) + scoreGaps += TermGapScore(true); + else + scoreGaps += g_scoreGapOpen; + bGapping2 = true; + } + else + scoreGaps += g_scoreGapExtend; + continue; + } + + bGapping1 = false; + bGapping2 = false; + } + + if (bGapping1 || bGapping2) + { + scoreGaps -= g_scoreGapOpen; + scoreGaps += TermGapScore(true); + } + return scoreGaps; + } + +// The usual sum-of-pairs objective score: sum the score +// of the alignment of each pair of sequences. 
+SCORE ObjScoreSP(const MSA &msa, SCORE MatchScore[])
+	{
+	// Every pair of sequences (first < second) contributes its letter score
+	// plus its gap score, multiplied by the product of the two sequence
+	// weights. The weighted letter and gap parts are also accumulated in
+	// g_SPScoreLetters and g_SPScoreGaps. When MatchScore is supplied it is
+	// only reset to zero here, one entry per column.
+#if TRACE
+	Log("==================ObjScoreSP==============\n");
+	Log("msa=\n");
+	msa.LogMe();
+#endif
+	g_SPScoreLetters = 0;
+	g_SPScoreGaps = 0;
+
+	if (MatchScore != 0)
+		{
+		const unsigned uCols = msa.GetColCount();
+		for (unsigned uCol = 0; uCol < uCols; ++uCol)
+			MatchScore[uCol] = 0;
+		}
+
+	const unsigned uSeqs = msa.GetSeqCount();
+	SCORE scoreSum = 0;
+#if TRACE
+	Log("Seq1 Seq2 wt1 wt2 Letters Gaps Unwt.Score Wt.Score Total\n");
+	Log("---- ---- ------ ------ ---------- ---------- ---------- ---------- ----------\n");
+#endif
+	for (unsigned uFirst = 0; uFirst < uSeqs; ++uFirst)
+		{
+		const WEIGHT wFirst = msa.GetSeqWeight(uFirst);
+		for (unsigned uSecond = uFirst + 1; uSecond < uSeqs; ++uSecond)
+			{
+			const WEIGHT wSecond = msa.GetSeqWeight(uSecond);
+			const WEIGHT wPair = wFirst*wSecond;
+
+			const SCORE scoreLetters = ScoreSeqPairLetters(msa, uFirst, msa, uSecond);
+			const SCORE scoreGaps = ScoreSeqPairGaps(msa, uFirst, msa, uSecond);
+			const SCORE scorePair = scoreLetters + scoreGaps;
+
+			scoreSum += wPair*scorePair;
+			g_SPScoreLetters += wPair*scoreLetters;
+			g_SPScoreGaps += wPair*scoreGaps;
+#if TRACE
+			Log("%4d %4d %6.3f %6.3f %10.2f %10.2f %10.2f %10.2f %10.2f >%s >%s\n",
+			  uFirst,
+			  uSecond,
+			  wFirst,
+			  wSecond,
+			  scoreLetters,
+			  scoreGaps,
+			  scorePair,
+			  scorePair*wFirst*wSecond,
+			  scoreSum,
+			  msa.GetSeqName(uFirst),
+			  msa.GetSeqName(uSecond));
+#endif
+			}
+		}
+#if TEST_SPFAST
+	{
+	// Cross-check the brute-force sum against the fast implementation.
+	SCORE f = ObjScoreSPFast(msa);
+	Log("Fast = %.6g\n", f);
+	Log("Brute = %.6g\n", scoreSum);
+	if (BTEq(f, scoreSum))
+		Log("Agree\n");
+	else
+		Log("** DISAGREE **\n");
+	}
+#endif
+	// The total is returned as a weighted sum, not a per-pair average.
+	return scoreSum;
+	}
+
+// Objective score defined as the dynamic programming score.
+// Input is two alignments, which must be of the same length.
+// Result is the same profile-profile score that is optimized
+// by dynamic programming.
+// The two profiles built here are handed to ObjScoreDP_Profs,
+// which releases them.
+SCORE ObjScoreDP(const MSA &msa1, const MSA &msa2, SCORE MatchScore[])
+	{
+	const unsigned uColCount = msa1.GetColCount();
+	if (msa2.GetColCount() != uColCount)
+		Quit("ObjScoreDP, must be same length");
+
+	const ProfPos *PA = ProfileFromMSA(msa1);
+	const ProfPos *PB = ProfileFromMSA(msa2);
+
+	return ObjScoreDP_Profs(PA, PB, uColCount, MatchScore);
+	}
+
+// Sum the column-by-column score of two aligned profiles that both
+// have uColCount columns.
+//
+// For each column:
+//   - both profiles all-gap: contributes nothing;
+//   - one profile all-gap: adds the other profile's gap-close score when
+//     the run of all-gap columns ends here and its gap-open score when
+//     the run starts here (columns inside a run add nothing);
+//   - otherwise: adds ScoreProfPos2 of the two positions.
+// When MatchScore is not null, MatchScore[col] receives the match part of
+// each column (zero for gapped columns).
+//
+// Ownership: PA and PB are released with delete[] before returning, so
+// they must have been allocated with new[] and must not be used by the
+// caller afterwards.
+SCORE ObjScoreDP_Profs(const ProfPos *PA, const ProfPos *PB, unsigned uColCount,
+  SCORE MatchScore[])
+	{
+	// Tracing state defined elsewhere; declared once instead of on every
+	// loop iteration.
+	extern bool g_bTracePPScore;
+	extern MSA *g_ptrPPScoreMSA1;
+	extern MSA *g_ptrPPScoreMSA2;
+
+	SCORE scoreTotal = 0;
+
+	for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
+		{
+		const ProfPos &PPA = PA[uColIndex];
+		const ProfPos &PPB = PB[uColIndex];
+
+		SCORE scoreGap = 0;
+		SCORE scoreMatch = 0;
+		// If gapped column...
+		if (PPA.m_bAllGaps && PPB.m_bAllGaps)
+			scoreGap = 0;
+		else if (PPA.m_bAllGaps)
+			{
+			// The boundary tests short-circuit before the neighbour is read.
+			if (uColCount - 1 == uColIndex || !PA[uColIndex+1].m_bAllGaps)
+				scoreGap = PPB.m_scoreGapClose;
+			if (0 == uColIndex || !PA[uColIndex-1].m_bAllGaps)
+				scoreGap += PPB.m_scoreGapOpen;
+			}
+		else if (PPB.m_bAllGaps)
+			{
+			if (uColCount - 1 == uColIndex || !PB[uColIndex+1].m_bAllGaps)
+				scoreGap = PPA.m_scoreGapClose;
+			if (0 == uColIndex || !PB[uColIndex-1].m_bAllGaps)
+				scoreGap += PPA.m_scoreGapOpen;
+			}
+		else
+			scoreMatch = ScoreProfPos2(PPA, PPB);
+
+		if (0 != MatchScore)
+			MatchScore[uColIndex] = scoreMatch;
+
+		scoreTotal += scoreMatch + scoreGap;
+
+		if (g_bTracePPScore)
+			{
+			const MSA &msa1 = *g_ptrPPScoreMSA1;
+			const MSA &msa2 = *g_ptrPPScoreMSA2;
+			const unsigned uSeqCount1 = msa1.GetSeqCount();
+			const unsigned uSeqCount2 = msa2.GetSeqCount();
+
+			for (unsigned n = 0; n < uSeqCount1; ++n)
+				Log("%c", msa1.GetChar(n, uColIndex));
+			Log(" ");
+			for (unsigned n = 0; n < uSeqCount2; ++n)
+				Log("%c", msa2.GetChar(n, uColIndex));
+			Log(" %10.3f", scoreMatch);
+			if (scoreGap != 0)
+				Log(" %10.3f", scoreGap);
+			Log("\n");
+			}
+		}
+
+	delete[] PA;
+	delete[] PB;
+
+	return scoreTotal;
+	}
+
+// Objective score defined as the sum of profile-sequence
+// scores for each sequence in the alignment. The profile
+// is computed from the entire alignment, so this includes
+// the score of each sequence against itself. This is to
+// avoid recomputing the profile each time, so we reduce
+// complexity but introduce a questionable approximation.
+// The goal is to see if we can exploit the apparent
+// improvement in performance of log-expectation score
+// over the usual sum-of-pairs by optimizing this
+// objective score in the iterative refinement stage.
+SCORE ObjScorePS(const MSA &msa, SCORE MatchScore[])
+	{
+	// Quits unless the profile-profile score is PPSCORE_LE.
+	if (g_PPScore != PPSCORE_LE)
+		Quit("FastScoreMSA_LASimple: LA");
+
+	const unsigned uSeqs = msa.GetSeqCount();
+	const unsigned uCols = msa.GetColCount();
+
+	const ProfPos *Profile = ProfileFromMSA(msa);
+
+	if (MatchScore != 0)
+		{
+		for (unsigned uCol = 0; uCol < uCols; ++uCol)
+			MatchScore[uCol] = 0;
+		}
+
+	SCORE scoreSum = 0;
+	for (unsigned uSeq = 0; uSeq < uSeqs; ++uSeq)
+		{
+		const WEIGHT wSeq = msa.GetSeqWeight(uSeq);
+		SCORE scoreThisSeq = 0;
+		for (unsigned uCol = 0; uCol < uCols; ++uCol)
+			{
+			const ProfPos &Pos = Profile[uCol];
+			if (msa.IsGap(uSeq, uCol))
+				{
+				// A gap run opens at column 0 or after a non-gap column and
+				// closes at the last column or before a non-gap column.
+				// Columns strictly inside a run add nothing.
+				const bool bOpens = (uCol == 0 || !msa.IsGap(uSeq, uCol - 1));
+				const bool bCloses = (uCol == uCols - 1 || !msa.IsGap(uSeq, uCol + 1));
+
+				if (bOpens)
+					scoreThisSeq += Pos.m_scoreGapOpen;
+				if (bCloses)
+					scoreThisSeq += Pos.m_scoreGapClose;
+				continue;
+				}
+
+			// Wildcards contribute no letter score.
+			if (msa.IsWildcard(uSeq, uCol))
+				continue;
+
+			const unsigned uLetter = msa.GetLetter(uSeq, uCol);
+			const SCORE scoreLetter = Pos.m_AAScores[uLetter];
+			// MatchScore gets the weighted letter score; the per-sequence
+			// sum is weighted once, after the column loop.
+			if (MatchScore != 0)
+				MatchScore[uCol] += wSeq*scoreLetter;
+			scoreThisSeq += scoreLetter;
+			}
+		scoreSum += wSeq*scoreThisSeq;
+		}
+
+	delete[] Profile;
+	return scoreSum;
+	}
+
+// The XP score is the sum of the score of each pair of
+// sequences between two profiles which are aligned to
+// each other. Notice that for two given profiles aligned
+// in different ways, the difference in XP score must be
+// the same as the difference in SP score because the
+// score of a pair of sequences in one profile doesn't
+// depend on the alignment.
+SCORE ObjScoreXP(const MSA &msa1, const MSA &msa2)	// weighted sum of pair scores over every (msa1 sequence, msa2 sequence) pair
+	{
+	const unsigned uColCount1 = msa1.GetColCount();
+	const unsigned uColCount2 = msa2.GetColCount();
+	if (uColCount1 != uColCount2)
+		Quit("ObjScoreXP, alignment lengths differ %u %u", uColCount1, uColCount2);
+
+	const unsigned uSeqCount1 = msa1.GetSeqCount();
+	const unsigned uSeqCount2 = msa2.GetSeqCount();
+
+#if TRACE
+	Log(" Score Weight Weight Total\n");
+	Log("---------- ------ ------ ----------\n");
+#endif
+
+	SCORE scoreTotal = 0;
+	unsigned uPairCount = 0;
+	for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount1; ++uSeqIndex1)
+		{
+		const WEIGHT w1 = msa1.GetSeqWeight(uSeqIndex1);
+		for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqCount2; ++uSeqIndex2)
+			{
+			const WEIGHT w2 = msa2.GetSeqWeight(uSeqIndex2);
+			const WEIGHT w = w1*w2;	// not read afterwards; the product is recomputed below
+			SCORE scoreLetters = ScoreSeqPairLetters(msa1, uSeqIndex1, msa2, uSeqIndex2);
+			SCORE scoreGaps = ScoreSeqPairGaps(msa1, uSeqIndex1, msa2, uSeqIndex2);
+			SCORE scorePair = scoreLetters + scoreGaps;
+			scoreTotal += w1*w2*scorePair;
+			++uPairCount;
+#if TRACE
+			Log("%10.2f %6.3f %6.3f %10.2f >%s >%s\n",
+			  scorePair,
+			  w1,
+			  w2,
+			  scorePair*w1*w2,
+			  msa1.GetSeqName(uSeqIndex1),
+			  msa2.GetSeqName(uSeqIndex2));
+#endif
+			}
+		}
+	if (0 == uPairCount)	// reached when either alignment has no sequences
+		Quit("0 == uPairCount");
+
+#if TRACE
+	Log("msa1=\n");
+	msa1.LogMe();
+	Log("msa2=\n");
+	msa2.LogMe();
+	Log("XP=%g\n", scoreTotal);
+#endif
+//	return scoreTotal / uPairCount;
+	return scoreTotal;	// raw weighted sum; the per-pair average above is disabled
+	}
diff --git a/src/muscle/muscle3.8.31/src/objscoreda.cpp b/src/muscle/muscle3.8.31/src/objscoreda.cpp
new file mode 100644
index 0000000..3f2978e
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/objscoreda.cpp
@@ -0,0 +1,289 @@
+#include "muscle.h"
+#include "msa.h"
+#include "profile.h"
+#include "objscore.h"
+
+#if DOUBLE_AFFINE
+
+#define TRACE 0
+#define TEST_SPFAST 0
+
+static SCORE GapPenalty(unsigned uLength, bool Term, SCORE g, SCORE e)	// affine penalty: g for the first column, e for each further column
+	{
+	//if (Term)
+	//	{
+	//	switch (g_TermGap)
+	//		{
+	//	case TERMGAP_Full:
+	//		return g + (uLength - 1)*e;
+
+	//	case TERMGAP_Half:
+	//		return g/2 + (uLength - 1)*e;
+
+	//	case TERMGAP_Ext:
+	//		return uLength*e;
+	//		}
+	//	Quit("Bad termgap");
+	//	}
+	//else
+	//	return g + (uLength - 1)*e;
+	//return MINUS_INFINITY;
+	return g + (uLength - 1)*e;	// Term is ignored: the terminal-gap variants above are commented out
+	}
+
+static SCORE GapPenalty(unsigned uLength, bool Term)	// larger of the two affine penalties (open/extend vs open2/extend2)
+	{
+	SCORE s1 = GapPenalty(uLength, Term, g_scoreGapOpen, g_scoreGapExtend);
+#if DOUBLE_AFFINE
+	SCORE s2 = GapPenalty(uLength, Term, g_scoreGapOpen2, g_scoreGapExtend2);
+	if (s1 > s2)
+		return s1;
+	return s2;
+#else
+	return s1;
+#endif
+	}
+
+static const MSA *g_ptrMSA1;	// set by ScoreSeqPair so that LogGap can re-read the pair being scored
+static const MSA *g_ptrMSA2;
+static unsigned g_uSeqIndex1;
+static unsigned g_uSeqIndex2;
+
+static void LogGap(unsigned uStart, unsigned uEnd, unsigned uGapLength,	// trace helper: draws columns uStart..uEnd of the current pair
+  bool bNTerm, bool bCTerm)
+	{
+	Log("%16.16s ", "");
+	for (unsigned i = 0; i < uStart; ++i)
+		Log(" ");
+	unsigned uMyLength = 0;
+	for (unsigned i = uStart; i <= uEnd; ++i)
+		{
+		bool bGap1 = g_ptrMSA1->IsGap(g_uSeqIndex1, i);
+		bool bGap2 = g_ptrMSA2->IsGap(g_uSeqIndex2, i);
+		if (!bGap1 && !bGap2)
+			Quit("Error -- neither gapping");
+		if (bGap1 && bGap2)
+			Log(".");	// gapped in both sequences: drawn but not counted
+		else
+			{
+			++uMyLength;
+			Log("-");
+			}
+		}
+	SCORE s = GapPenalty(uGapLength, bNTerm || bCTerm);
+	Log(" L=%d N%d C%d s=%.3g", uGapLength, bNTerm, bCTerm, s);
+	Log("\n");
+	if (uMyLength != uGapLength)	// cross-check the caller's run length against the recount
+		Quit("Lengths differ");
+
+	}
+
+static SCORE ScoreSeqPair(const MSA &msa1, unsigned uSeqIndex1,	// returns letters + gaps; the two parts are also stored via ptrLetters / ptrGaps
+  const MSA &msa2, unsigned uSeqIndex2, SCORE *ptrLetters, SCORE *ptrGaps)
+	{
+	g_ptrMSA1 = &msa1;	// stash the pair in file-scope statics for LogGap
+	g_ptrMSA2 = &msa2;
+	g_uSeqIndex1 = uSeqIndex1;
+	g_uSeqIndex2 = uSeqIndex2;
+
+	const unsigned uColCount = msa1.GetColCount();
+	const unsigned uColCount2 = msa2.GetColCount();
+	if (uColCount != uColCount2)
+		Quit("ScoreSeqPair, different lengths");
+
+#if TRACE
+	Log("ScoreSeqPair\n");
+	Log("%16.16s ", msa1.GetSeqName(uSeqIndex1));
+	for (unsigned i = 0; i < uColCount; ++i)
+		Log("%c", msa1.GetChar(uSeqIndex1, i));
+	Log("\n");
+	Log("%16.16s ", msa2.GetSeqName(uSeqIndex2));
+	for (unsigned i = 0; i < uColCount; ++i)
+		Log("%c", msa2.GetChar(uSeqIndex2, i));	// second row is read from msa2, the same alignment its name came from
+	Log("\n");
+#endif
+
+	SCORE scoreTotal = 0;
+
+// Substitution scores
+	unsigned uFirstLetter1 = uInsane;	// column of the first / last non-gap in each sequence; uInsane until one is seen
+	unsigned uFirstLetter2 = uInsane;
+	unsigned uLastLetter1 = uInsane;
+	unsigned uLastLetter2 = uInsane;
+	for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
+		{
+		bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex);
+		bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex);
+		bool bWildcard1 = msa1.IsWildcard(uSeqIndex1, uColIndex);
+		bool bWildcard2 = msa2.IsWildcard(uSeqIndex2, uColIndex);
+
+		if (!bGap1)
+			{
+			if (uInsane == uFirstLetter1)
+				uFirstLetter1 = uColIndex;
+			uLastLetter1 = uColIndex;
+			}
+		if (!bGap2)
+			{
+			if (uInsane == uFirstLetter2)
+				uFirstLetter2 = uColIndex;
+			uLastLetter2 = uColIndex;
+			}
+
+		if (bGap1 || bGap2 || bWildcard1 || bWildcard2)
+			continue;	// only letter-vs-letter columns get a substitution score
+
+		unsigned uLetter1 = msa1.GetLetter(uSeqIndex1, uColIndex);
+		unsigned uLetter2 = msa2.GetLetter(uSeqIndex2, uColIndex);
+
+		SCORE scoreMatch = (*g_ptrScoreMatrix)[uLetter1][uLetter2];
+		scoreTotal += scoreMatch;
+#if TRACE
+		Log("%c <-> %c = %7.1f %10.1f\n",
+		  msa1.GetChar(uSeqIndex1, uColIndex),
+		  msa2.GetChar(uSeqIndex2, uColIndex),
+		  scoreMatch,
+		  scoreTotal);
+#endif
+		}
+
+	*ptrLetters = scoreTotal;	// everything added after this point is gap penalty
+
+// Gap penalties
+	unsigned uGapLength = uInsane;
+	unsigned uGapStartCol = uInsane;
+	bool bGapping1 = false;	// a gap run is currently open in sequence 1 / sequence 2
+	bool bGapping2 = false;
+
+	for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
+		{
+		bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex);
+		bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex);
+
+		if (bGap1 && bGap2)
+			continue;	// gapped in both: neither extends nor ends a run
+
+		if (bGapping1)
+			{
+			if (bGap1)
+				++uGapLength;
+			else
+				{
+				bGapping1 = false;	// run in sequence 1 ended at the previous column
+				bool bNTerm = (uFirstLetter2 == uGapStartCol);
+				bool bCTerm = (uLastLetter2 + 1 == uColIndex);
+				SCORE scoreGap = GapPenalty(uGapLength, bNTerm || bCTerm);
+				scoreTotal += scoreGap;
+#if TRACE
+				LogGap(uGapStartCol, uColIndex - 1, uGapLength, bNTerm, bCTerm);
+				Log("GAP %7.1f %10.1f\n",
+				  scoreGap,
+				  scoreTotal);
+#endif
+				}
+			continue;
+			}
+		else
+			{
+			if (bGap1)
+				{
+				uGapStartCol = uColIndex;
+				bGapping1 = true;
+				uGapLength = 1;
+				continue;
+				}
+			}
+
+		if (bGapping2)
+			{
+			if (bGap2)
+				++uGapLength;
+			else
+				{
+				bGapping2 = false;	// run in sequence 2 ended at the previous column
+				bool bNTerm = (uFirstLetter1 == uGapStartCol);
+				bool bCTerm = (uLastLetter1 + 1 == uColIndex);
+				SCORE scoreGap = GapPenalty(uGapLength, bNTerm || bCTerm);
+				scoreTotal += scoreGap;
+#if TRACE
+				LogGap(uGapStartCol, uColIndex - 1, uGapLength, bNTerm, bCTerm);
+				Log("GAP %7.1f %10.1f\n",
+				  scoreGap,
+				  scoreTotal);
+#endif
+				}
+			}
+		else
+			{
+			if (bGap2)
+				{
+				uGapStartCol = uColIndex;
+				bGapping2 = true;
+				uGapLength = 1;
+				}
+			}
+		}
+
+	if (bGapping1 || bGapping2)	// a run was still open when the columns ran out
+		{
+		SCORE scoreGap = GapPenalty(uGapLength, true);
+		scoreTotal += scoreGap;
+#if TRACE
+		LogGap(uGapStartCol, uColCount - 1, uGapLength, false, true);
+		Log("GAP %7.1f %10.1f\n",
+		  scoreGap,
+		  scoreTotal);
+#endif
+		}
+	*ptrGaps = scoreTotal - *ptrLetters;
+	return scoreTotal;
+	}
+
+// The usual sum-of-pairs objective score: sum the score
+// of the alignment of each pair of sequences.
+SCORE ObjScoreDA(const MSA &msa, SCORE *ptrLetters, SCORE *ptrGaps)	// weighted totals are also stored via ptrLetters / ptrGaps (both written unconditionally)
+	{
+	const unsigned uSeqCount = msa.GetSeqCount();
+	SCORE scoreTotal = 0;
+	unsigned uPairCount = 0;
+#if TRACE
+	msa.LogMe();
+	Log(" Score Weight Weight Total\n");
+	Log("---------- ------ ------ ----------\n");
+#endif
+	SCORE TotalLetters = 0;
+	SCORE TotalGaps = 0;
+	for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1)
+		{
+		const WEIGHT w1 = msa.GetSeqWeight(uSeqIndex1);
+		for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount; ++uSeqIndex2)	// each unordered pair is visited once
+			{
+			const WEIGHT w2 = msa.GetSeqWeight(uSeqIndex2);
+			const WEIGHT w = w1*w2;	// not read afterwards; the product is recomputed below
+			SCORE Letters;
+			SCORE Gaps;
+			SCORE scorePair = ScoreSeqPair(msa, uSeqIndex1, msa, uSeqIndex2,
+			  &Letters, &Gaps);
+			scoreTotal += w1*w2*scorePair;
+			TotalLetters += w1*w2*Letters;
+			TotalGaps += w1*w2*Gaps;
+			++uPairCount;	// counted but not used for normalisation in this function
+#if TRACE
+			Log("%10.2f %6.3f %6.3f %10.2f %d=%s %d=%s\n",
+			  scorePair,
+			  w1,
+			  w2,
+			  scorePair*w1*w2,
+			  uSeqIndex1,
+			  msa.GetSeqName(uSeqIndex1),
+			  uSeqIndex2,
+			  msa.GetSeqName(uSeqIndex2));
+#endif
+			}
+		}
+	*ptrLetters = TotalLetters;
+	*ptrGaps = TotalGaps;
+	return scoreTotal;
+	}
+
+#endif // DOUBLE_AFFINE
diff --git a/src/muscle/muscle3.8.31/src/onexception.cpp b/src/muscle/muscle3.8.31/src/onexception.cpp
new file mode 100644
index 0000000..f86b31f
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/onexception.cpp
@@ -0,0 +1,15 @@
+#include "muscle.h"
+#include <stdio.h>
+
+static char szOnExceptionMessage[] =
+	{
+	"\nFatal error, exception caught.\n"
+	};
+
+void OnException()	// reports the fatal message on stderr and in the log, then exits with EXIT_Except
+	{
+	fprintf(stderr, "%s", szOnExceptionMessage);
+	Log("%s", szOnExceptionMessage);
+	Log("Finished %s\n", GetTimeAsStr());
+	exit(EXIT_Except);
+	}
diff --git a/src/muscle/muscle3.8.31/src/options.cpp b/src/muscle/muscle3.8.31/src/options.cpp
new file mode 100644
index 0000000..902218d
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/options.cpp
@@ -0,0 +1,241 @@
+#include "muscle.h"
+#include <stdio.h>
+
+struct VALUE_OPT	// option that takes a value: "-Name value"
+	{
+	const char *m_pstrName;
+	const char *m_pstrValue;	// 0 until the option has been seen
+	};
+
+struct FLAG_OPT	// option that is present or absent: "-Name"
+	{
+	const char *m_pstrName;
+	bool m_bSet;
+	};
+
+static VALUE_OPT ValueOpts[] =	// names are matched with stricmp; the first matching row wins
+	{
+	"in",	0,
+	"in1",	0,
+	"in2",	0,
+	"out",	0,
+	"MaxIters",	0,
+	"MaxHours",	0,
+	"GapOpen",	0,
+	"GapOpen2",	0,
+	"GapExtend",	0,
+	"GapExtend2",	0,
+	"GapAmbig",	0,
+	"Center",	0,
+	"SmoothScoreCeil",	0,
+	"MinBestColScore",	0,
+	"MinSmoothScore",	0,
+	"ObjScore",	0,
+	"SmoothWindow",	0,
+	"RefineWindow",	0,
+	"FromWindow",	0,
+	"ToWindow",	0,
+	"SaveWindow",	0,
+	"WindowOffset",	0,
+	"FirstWindow",	0,
+	"AnchorSpacing",	0,
+	"Log",	0,
+	"LogA",	0,
+	"MaxTrees",	0,
+	"SUEFF",	0,
+	"Distance",	0,
+	"Distance1",	0,
+	"Distance2",	0,
+	"Weight",	0,
+	"Weight1",	0,
+	"Weight2",	0,
+	"Cluster",	0,
+	"Cluster1",	0,
+	"Cluster2",	0,
+	"Root1",	0,
+	"Root2",	0,
+	"Tree1",	0,
+	"Tree2",	0,
+	"UseTree",	0,
+	"UseTree_NoWarn",	0,
+	"DiagLength",	0,
+	"DiagMargin",	0,
+	"DiagBreak",	0,
+	"Hydro",	0,
+	"HydroFactor",	0,
+	"SPScore",	0,
+	"SeqType",	0,
+	"MaxMB",	0,
+	"ComputeWeights",	0,
+	"MaxSubFam",	0,
+	"ScoreFile",	0,
+	"TermGaps",	0,
+	"FASTAOut",	0,
+	"CLWOut",	0,
+	"CLWStrictOut",	0,
+	"HTMLOut",	0,
+	"MSFOut",	0,
+	"PHYIOut",	0,
+	"PHYSOut",	0,
+	"Matrix",	0,
+	"DistMx1",	0,
+	"DistMx2",	0,
+	"Weight",	0,	// duplicate of the "Weight" row above; never reached because lookups stop at the first match
+	};
+static int ValueOptCount = sizeof(ValueOpts)/sizeof(ValueOpts[0]);
+
+static FLAG_OPT FlagOpts[] =
+	{
+	"LE",	false,
+	"SP",	false,
+	"SV",	false,
+	"SPN",	false,
+	"Core",	false,
+	"NoCore",	false,
+	"Diags1",	false,
+	"Diags2",	false,
+	"Diags",	false,
+	"Quiet",	false,
+	"MSF",	false,
+	"Verbose",	false,
+	"Anchors",	false,
+	"NoAnchors",	false,
+	"Refine",	false,
+	"RefineW",	false,
+	"SW",	false,
+	"Profile",	false,
+	"PPScore",	false,
+	"ClusterOnly",	false,
+	"Brenner",	false,
+	"Dimer",	false,
+	"clw",	false,
+	"clwstrict",	false,
+	"HTML",	false,
+	"Version",	false,
+	"Stable",	false,
+	"Group",	false,
+	"FASTA",	false,
+	"ProfDB",	false,
+	"PAS",	false,
+	"PHYI",	false,
+	"PHYS",	false,
+	"TomHydro",	false,
+	"MakeTree",	false,
+	};
+static int FlagOptCount = sizeof(FlagOpts)/sizeof(FlagOpts[0]);
+
+static bool TestSetFlagOpt(const char *Arg)	// marks the flag as set; returns false if Arg is not a known flag
+	{
+	for (int i = 0; i < FlagOptCount; ++i)
+		if (!stricmp(Arg, FlagOpts[i].m_pstrName))
+			{
+			FlagOpts[i].m_bSet = true;
+			return true;
+			}
+	return false;
+	}
+
+static bool TestSetValueOpt(const char *Arg, const char *Value)	// returns false if Arg is not a known value option
+	{
+	for (int i = 0; i < ValueOptCount; ++i)
+		if (!stricmp(Arg, ValueOpts[i].m_pstrName))
+			{
+			if (0 == Value)	// known option given as the last argument, with nothing after it
+				{
+				fprintf(stderr, "Option -%s must have value\n", Arg);
+				exit(EXIT_NotStarted);
+				}
+			ValueOpts[i].m_pstrValue = strsave(Value);	// presumably a private copy: ProcessArgStr frees the buffer Value may point into
+			return true;
+			}
+	return false;
+	}
+
+bool FlagOpt(const char *Name)	// current state of a flag; an unknown name is reported through Quit
+	{
+	for (int i = 0; i < FlagOptCount; ++i)
+		if (!stricmp(Name, FlagOpts[i].m_pstrName))
+			return FlagOpts[i].m_bSet;
+	Quit("FlagOpt(%s) invalid", Name);
+	return false;
+	}
+
+const char *ValueOpt(const char *Name)	// stored value, or 0 if the option was never given; an unknown name is reported through Quit
+	{
+	for (int i = 0; i < ValueOptCount; ++i)
+		if (!stricmp(Name, ValueOpts[i].m_pstrName))
+			return ValueOpts[i].m_pstrValue;
+	Quit("ValueOpt(%s) invalid", Name);
+	return 0;
+	}
+
+void ProcessArgVect(int argc, char *argv[])	// every element, including argv[0], is parsed as an option
+	{
+	for (int iArgIndex = 0; iArgIndex < argc; )
+		{
+		const char *Arg = argv[iArgIndex];
+		if (Arg[0] != '-')
+			{
+			fprintf(stderr, "Command-line option \"%s\" must start with '-'\n", Arg);
+			exit(EXIT_NotStarted);
+			}
+		const char *ArgName = Arg + 1;
+		if (TestSetFlagOpt(ArgName))	// flags are tried first and consume one argument
+			{
+			++iArgIndex;
+			continue;
+			}
+
+		char *Value = 0;
+		if (iArgIndex < argc - 1)
+			Value = argv[iArgIndex+1];
+		if (TestSetValueOpt(ArgName, Value))	// value options consume the name and the value
+			{
+			iArgIndex += 2;
+			continue;
+			}
+		fprintf(stderr, "Invalid command line option \"%s\"\n", ArgName);
+		Usage();
+		exit(EXIT_NotStarted);
+		}
+	}
+
+void ProcessArgStr(const char *ArgStr)	// splits ArgStr on whitespace and parses the pieces as options; a null ArgStr is a no-op
+	{
+	const int MAX_ARGS = 64;
+	char *argv[MAX_ARGS];
+
+	if (0 == ArgStr)
+		return;
+
+// Modifiable copy
+	char *StrCopy = strsave(ArgStr);
+
+	int argc = 0;
+	bool bInArg = false;
+	char *Str = StrCopy;
+	while (char c = *Str)
+		{
+		if (isspace((unsigned char) c))	// cast: isspace is undefined for negative char values
+			{
+			*Str = 0;	// terminate the current token in place
+			bInArg = false;
+			}
+		else if (!bInArg)
+			{
+			bInArg = true;
+			if (argc >= MAX_ARGS)
+				Quit("Too many args in MUSCLE_CMDLINE");
+			argv[argc++] = Str;	// token starts here; it points into StrCopy
+			}
+		Str++;
+		}
+	ProcessArgVect(argc, argv);
+	free(StrCopy);	// argv[] dangles from here on; stored values were duplicated by TestSetValueOpt
+	}
+
+void ListFlagOpts()	// logs every flag name with its current state (0 or 1)
+	{
+	for (int i = 0; i < FlagOptCount; ++i)
+		Log("%s %d\n", FlagOpts[i].m_pstrName, FlagOpts[i].m_bSet);
+	}
diff --git a/src/muscle/muscle3.8.31/src/outweights.cpp b/src/muscle/muscle3.8.31/src/outweights.cpp
new file mode 100644
index 0000000..58f0a20
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/outweights.cpp
@@ -0,0 +1,17 @@
+#include "muscle.h"
+#include "msa.h"
+
+void OutWeights(const char *FileName, const MSA &msa)	// writes one "name<TAB>weight" line per sequence to FileName
+	{
+	FILE *f = fopen(FileName, "w");
+	if (0 == f)
+		Quit("Cannot open '%s'", FileName);
+	const unsigned uSeqCount = msa.GetSeqCount();
+	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+		{
+		const char *Id = msa.GetSeqName(uSeqIndex);
+		const WEIGHT w = msa.GetSeqWeight(uSeqIndex);
+		fprintf(f, "%s\t%.3g\n", Id, w);
+		}
+	fclose(f);
+	}
diff --git a/src/muscle/muscle3.8.31/src/pam200mafft.cpp b/src/muscle/muscle3.8.31/src/pam200mafft.cpp
new file mode 100644
index 0000000..9ef24a4
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/pam200mafft.cpp
@@ -0,0 +1,32 @@
+#include "muscle.h"
+
+// Adjusted PAM200 scoring matrix as used by default in MAFFT.
+// Katoh, Misawa, Kuma and Miyata (2002), NAR 30(14), 3059-3066.
+ +static float PAM200[23][23] = + { +// A C D E F G H I K L M N P Q R S T V W Y B Z X + 408, 20, 54, 52, -182, 179, -68, 109, -35, -47, 39, 106, 206, -14, -12, 257, 293, 191, -306, -219, 0, 0, 0, // A + 20, 1190, -228, -295, 94, 6, 63, -131, -184, -176, -112, -29, -122, -195, 49, 185, 13, -49, 199, 333, 0, 0, 0, // C + 54, -228, 645, 516, -399, 168, 98, -225, 75, -341, -235, 352, -149, 142, -44, 65, 7, -147, -418, -128, 0, 0, 0, // D + 52, -295, 516, 630, -460, 145, 45, -225, 195, -307, -222, 186, -121, 299, 54, -10, -36, -130, -366, -285, 0, 0, 0, // E + -182, 94, -399, -460, 908, -387, 82, 100, -423, 340, 87, -216, -160, -274, -307, -31, -153, 51, 19, 604, 0, 0, 0, // F + 179, 6, 168, 145, -387, 682, -94, -196, -14, -304, -226, 99, -57, -48, 117, 175, 41, -73, -38, -329, 0, 0, 0, // G + -68, 63, 98, 45, 82, -94, 786, -185, 164, -72, -132, 258, 86, 388, 277, 55, -15, -197, -181, 488, 0, 0, 0, // H + 109, -131, -225, -225, 100, -196, -185, 574, -204, 308, 411, -94, -95, -202, -188, 1, 182, 489, -254, -133, 0, 0, 0, // I + -35, -184, 75, 195, -423, -14, 164, -204, 652, -229, -98, 206, -66, 335, 486, 22, 39, -207, -196, -244, 0, 0, 0, // K + -47, -176, -341, -307, 340, -304, -72, 308, -229, 611, 389, -203, 73, -66, -150, -49, -21, 259, -46, -9, 0, 0, 0, // L + 39, -112, -235, -222, 87, -226, -132, 411, -98, 389, 776, -111, -78, -104, -109, -29, 149, 351, -209, -162, 0, 0, 0, // M + 106, -29, 352, 186, -216, 99, 258, -94, 206, -203, -111, 536, -1, 108, 93, 260, 188, -98, -359, 12, 0, 0, 0, // N + 206, -122, -149, -121, -160, -57, 86, -95, -66, 73, -78, -1, 756, 142, 25, 241, 159, -55, -353, -206, 0, 0, 0, // P + -14, -195, 142, 299, -274, -48, 388, -202, 335, -66, -104, 108, 142, 655, 321, 7, -15, -175, -223, -53, 0, 0, 0, // Q + -12, 49, -44, 54, -307, 117, 277, -188, 486, -150, -109, 93, 25, 321, 626, 48, 16, -181, 124, -113, 0, 0, 0, // R + 257, 185, 65, -10, -31, 175, 55, 1, 22, -49, -29, 260, 241, 7, 48, 373, 279, 28, -193, -35, 0, 0, 0, // S + 293, 13, 7, -36, 
-153, 41, -15, 182, 39, -21, 149, 188, 159, -15, 16, 279, 442, 163, -323, -170, 0, 0, 0, // T + 191, -49, -147, -130, 51, -73, -197, 489, -207, 259, 351, -98, -55, -175, -181, 28, 163, 525, -225, -177, 0, 0, 0, // V + -306, 199, -418, -366, 19, -38, -181, -254, -196, -46, -209, -359, -353, -223, 124, -193, -323, -225, 1495, 83, 0, 0, 0, // W + -219, 333, -128, -285, 604, -329, 488, -133, -244, -9, -162, 12, -206, -53, -113, -35, -170, -177, 83, 999, 0, 0, 0, // Y + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Z + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // X + }; diff --git a/src/muscle/muscle3.8.31/src/params.cpp b/src/muscle/muscle3.8.31/src/params.cpp new file mode 100644 index 0000000..ec4e221 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/params.cpp @@ -0,0 +1,631 @@ +#include "muscle.h" +#include "objscore.h" +#include "profile.h" +#include "enumopts.h" + +const double DEFAULT_MAX_MB_FRACT = 0.8; + +SCORE g_scoreCenter = 0; +SCORE g_scoreGapExtend = 0; +SCORE g_scoreGapOpen2 = MINUS_INFINITY; +SCORE g_scoreGapExtend2 = MINUS_INFINITY; +SCORE g_scoreGapAmbig = 0; +SCORE g_scoreAmbigFactor = 0; + +extern SCOREMATRIX VTML_LA; +extern SCOREMATRIX PAM200; +extern SCOREMATRIX PAM200NoCenter; +extern SCOREMATRIX VTML_SP; +extern SCOREMATRIX VTML_SPNoCenter; +extern SCOREMATRIX NUC_SP; + +PTR_SCOREMATRIX g_ptrScoreMatrix; + +const char *g_pstrInFileName = "-"; +const char *g_pstrOutFileName = "-"; +const char *g_pstrFASTAOutFileName = 0; +const char *g_pstrMSFOutFileName = 0; +const char *g_pstrClwOutFileName = 0; +const char *g_pstrClwStrictOutFileName = 0; +const char *g_pstrHTMLOutFileName = 0; +const char *g_pstrPHYIOutFileName = 0; +const char *g_pstrPHYSOutFileName = 0; +const char *g_pstrDistMxFileName1 = 0; +const char *g_pstrDistMxFileName2 = 0; + +const char *g_pstrFileName1 = 0; +const char *g_pstrFileName2 = 0; + +const char 
*g_pstrSPFileName = 0; +const char *g_pstrMatrixFileName = 0; + +const char *g_pstrUseTreeFileName = 0; +bool g_bUseTreeNoWarn = false; + +const char *g_pstrComputeWeightsFileName; +const char *g_pstrScoreFileName; + +const char *g_pstrProf1FileName = 0; +const char *g_pstrProf2FileName = 0; + +unsigned g_uSmoothWindowLength = 7; +unsigned g_uAnchorSpacing = 32; +unsigned g_uMaxTreeRefineIters = 1; + +unsigned g_uRefineWindow = 200; +unsigned g_uWindowFrom = 0; +unsigned g_uWindowTo = 0; +unsigned g_uSaveWindow = uInsane; +unsigned g_uWindowOffset = 0; + +unsigned g_uMaxSubFamCount = 5; + +unsigned g_uHydrophobicRunLength = 5; +float g_dHydroFactor = (float) 1.2; + +unsigned g_uMinDiagLength = 24; // TODO alpha -- should depend on alphabet? +unsigned g_uMaxDiagBreak = 1; +unsigned g_uDiagMargin = 5; + +float g_dSUEFF = (float) 0.1; + +bool g_bPrecompiledCenter = true; +bool g_bNormalizeCounts = false; +bool g_bDiags1 = false; +bool g_bDiags2 = false; +bool g_bAnchors = true; +bool g_bQuiet = false; +bool g_bVerbose = false; +bool g_bRefine = false; +bool g_bRefineW = false; +bool g_bProfDB = false; +bool g_bLow = false; +bool g_bSW = false; +bool g_bClusterOnly = false; +bool g_bProfile = false; +bool g_bPPScore = false; +bool g_bBrenner = false; +bool g_bDimer = false; +bool g_bVersion = false; +bool g_bStable = false; +bool g_bFASTA = false; +bool g_bPAS = false; +bool g_bTomHydro = false; +bool g_bMakeTree = false; + +#if DEBUG +bool g_bCatchExceptions = false; +#else +bool g_bCatchExceptions = true; +#endif + +bool g_bMSF = false; +bool g_bAln = false; +bool g_bClwStrict = false; +bool g_bHTML = false; +bool g_bPHYI = false; +bool g_bPHYS = false; + +unsigned g_uMaxIters = 8; +unsigned long g_ulMaxSecs = 0; +unsigned g_uMaxMB = 500; + +PPSCORE g_PPScore = PPSCORE_LE; +OBJSCORE g_ObjScore = OBJSCORE_SPM; + +SEQWEIGHT g_SeqWeight1 = SEQWEIGHT_ClustalW; +SEQWEIGHT g_SeqWeight2 = SEQWEIGHT_ClustalW; + +DISTANCE g_Distance1 = DISTANCE_Kmer6_6; +DISTANCE g_Distance2 
= DISTANCE_PctIdKimura; + +CLUSTER g_Cluster1 = CLUSTER_UPGMB; +CLUSTER g_Cluster2 = CLUSTER_UPGMB; + +ROOT g_Root1 = ROOT_Pseudo; +ROOT g_Root2 = ROOT_Pseudo; + +bool g_bDiags; + +SEQTYPE g_SeqType = SEQTYPE_Auto; + +TERMGAPS g_TermGaps = TERMGAPS_Half; + +//------------------------------------------------------ +// These parameters depending on the chosen prof-prof +// score (g_PPScore), initialized to "Undefined". +float g_dSmoothScoreCeil = fInsane; +float g_dMinBestColScore = fInsane; +float g_dMinSmoothScore = fInsane; +SCORE g_scoreGapOpen = fInsane; +//------------------------------------------------------ + +static unsigned atou(const char *s) + { + return (unsigned) atoi(s); + } + +const char *MaxSecsToStr() + { + if (0 == g_ulMaxSecs) + return "(No limit)"; + return SecsToStr(g_ulMaxSecs); + } + +void ListParams() + { + Log("\n"); + Log("%s\n", MUSCLE_LONG_VERSION); + Log("http://www.drive5.com/muscle\n"); + Log("\n"); + Log("Profile-profile score %s\n", PPSCOREToStr(g_PPScore)); + Log("Max iterations %u\n", g_uMaxIters); + Log("Max trees %u\n", g_uMaxTreeRefineIters); + Log("Max time %s\n", MaxSecsToStr()); + Log("Max MB %u\n", g_uMaxMB); + Log("Gap open %g\n", g_scoreGapOpen); + Log("Gap extend (dimer) %g\n", g_scoreGapExtend); + Log("Gap ambig factor %g\n", g_scoreAmbigFactor); + Log("Gap ambig penalty %g\n", g_scoreGapAmbig); + Log("Center (LE) %g\n", g_scoreCenter); + Log("Term gaps %s\n", TERMGAPSToStr(g_TermGaps)); + + Log("Smooth window length %u\n", g_uSmoothWindowLength); + Log("Refine window length %u\n", g_uRefineWindow); + Log("Min anchor spacing %u\n", g_uAnchorSpacing); + Log("Min diag length (lambda) %u\n", g_uMinDiagLength); + Log("Diag margin (mu) %u\n", g_uDiagMargin); + Log("Min diag break %u\n", g_uMaxDiagBreak); + Log("Hydrophobic window %u\n", g_uHydrophobicRunLength); + + Log("Hydrophobic gap factor %g\n", g_dHydroFactor); + Log("Smooth score ceiling %g\n", g_dSmoothScoreCeil); + Log("Min best col score %g\n", g_dMinBestColScore); 
+ Log("Min anchor score %g\n", g_dMinSmoothScore); + Log("SUEFF %g\n", g_dSUEFF); + + Log("Brenner root MSA %s\n", BoolToStr(g_bBrenner)); + Log("Normalize counts %s\n", BoolToStr(g_bNormalizeCounts)); + Log("Diagonals (1) %s\n", BoolToStr(g_bDiags1)); + Log("Diagonals (2) %s\n", BoolToStr(g_bDiags2)); + Log("Anchors %s\n", BoolToStr(g_bAnchors)); + Log("MSF output format %s\n", BoolToStr(g_bMSF)); + Log("Phylip interleaved %s\n", BoolToStr(g_bPHYI)); + Log("Phylip sequential %s\n", BoolToStr(g_bPHYS)); + Log("ClustalW output format %s\n", BoolToStr(g_bAln)); + Log("Catch exceptions %s\n", BoolToStr(g_bCatchExceptions)); + Log("Quiet %s\n", BoolToStr(g_bQuiet)); + Log("Refine %s\n", BoolToStr(g_bRefine)); + Log("ProdfDB %s\n", BoolToStr(g_bProfDB)); + Log("Low complexity profiles %s\n", BoolToStr(g_bLow)); + + Log("Objective score %s\n", OBJSCOREToStr(g_ObjScore)); + + Log("Distance method (1) %s\n", DISTANCEToStr(g_Distance1)); + Log("Clustering method (1) %s\n", CLUSTERToStr(g_Cluster1)); + Log("Root method (1) %s\n", ROOTToStr(g_Root1)); + Log("Sequence weighting (1) %s\n", SEQWEIGHTToStr(g_SeqWeight1)); + + Log("Distance method (2) %s\n", DISTANCEToStr(g_Distance2)); + Log("Clustering method (2) %s\n", CLUSTERToStr(g_Cluster2)); + Log("Root method (2) %s\n", ROOTToStr(g_Root2)); + Log("Sequence weighting (2) %s\n", SEQWEIGHTToStr(g_SeqWeight2)); + + Log("\n"); + } + +static void SetDefaultsLE() + { + g_ptrScoreMatrix = &VTML_LA; + + //g_scoreGapOpen = (SCORE) -3.00; + //g_scoreCenter = (SCORE) -0.55; + g_scoreGapOpen = (SCORE) -2.9; + g_scoreCenter = (SCORE) -0.52; + + g_bNormalizeCounts = true; + + //g_dSmoothScoreCeil = 5.0; + //g_dMinBestColScore = 4.0; + //g_dMinSmoothScore = 2.0; + g_dSmoothScoreCeil = 3.0; + g_dMinBestColScore = 2.0; + g_dMinSmoothScore = 1.0; + + g_Distance1 = DISTANCE_Kmer6_6; + g_Distance2 = DISTANCE_PctIdKimura; + } + +static void SetDefaultsSP() + { + g_ptrScoreMatrix = &PAM200; + + g_scoreGapOpen = -1439; + g_scoreCenter = 0.0; // 
center pre-added into score mx + + g_bNormalizeCounts = false; + + g_dSmoothScoreCeil = 200.0; + g_dMinBestColScore = 300.0; + g_dMinSmoothScore = 125.0; + + g_Distance1 = DISTANCE_Kmer6_6; + g_Distance2 = DISTANCE_PctIdKimura; + } + +static void SetDefaultsSV() + { + g_ptrScoreMatrix = &VTML_SP; + + g_scoreGapOpen = -300; + g_scoreCenter = 0.0; // center pre-added into score mx + + g_bNormalizeCounts = false; + + g_dSmoothScoreCeil = 90.0; + g_dMinBestColScore = 130.0; + g_dMinSmoothScore = 40.0; + + g_Distance1 = DISTANCE_Kmer6_6; + g_Distance2 = DISTANCE_PctIdKimura; + } + +//static void SetDefaultsSPN() +// { +// g_ptrScoreMatrix = &NUC_SP; +// +// g_scoreGapOpen = -400; +// g_scoreCenter = 0.0; // center pre-added into score mx +// +// g_bNormalizeCounts = false; +// +// g_dSmoothScoreCeil = 999.0; // disable +// g_dMinBestColScore = 90; +// g_dMinSmoothScore = 90; +// +// g_Distance1 = DISTANCE_Kmer4_6; +// g_Distance2 = DISTANCE_PctIdKimura; +// } + +static void SetDefaultsSPN_DNA() + { + g_ptrScoreMatrix = &NUC_SP; + + g_scoreGapOpen = -400; + g_scoreCenter = 0.0; // center pre-added into score mx + g_scoreGapExtend = 0.0; + + g_bNormalizeCounts = false; + + g_dSmoothScoreCeil = 999.0; // disable + g_dMinBestColScore = 90; + g_dMinSmoothScore = 90; + + g_Distance1 = DISTANCE_Kmer4_6; + g_Distance2 = DISTANCE_PctIdKimura; + } + +static void SetDefaultsSPN_RNA() + { + g_ptrScoreMatrix = &NUC_SP; + + g_scoreGapOpen = -420; + g_scoreCenter = -300; // total center = NUC_EXTEND - 300 + g_scoreGapExtend = 0.0; + + g_bNormalizeCounts = false; + + g_dSmoothScoreCeil = 999.0; // disable + g_dMinBestColScore = 90; + g_dMinSmoothScore = 90; + + g_Distance1 = DISTANCE_Kmer4_6; + g_Distance2 = DISTANCE_PctIdKimura; + } + +static void FlagParam(const char *OptName, bool *ptrParam, bool bValueIfFlagSet) + { + bool bIsSet = FlagOpt(OptName); + if (bIsSet) + *ptrParam = bValueIfFlagSet; + } + +static void StrParam(const char *OptName, const char **ptrptrParam) + { + const 
char *opt = ValueOpt(OptName); + if (0 != opt) + *ptrptrParam = opt; + } + +static void FloatParam(const char *OptName, float *ptrParam) + { + const char *opt = ValueOpt(OptName); + if (0 != opt) + *ptrParam = (float) atof(opt); + } + +static void UintParam(const char *OptName, unsigned *ptrParam) + { + const char *opt = ValueOpt(OptName); + if (0 != opt) + *ptrParam = atou(opt); + } + +static void EnumParam(const char *OptName, EnumOpt *Opts, int *Param) + { + const char *Value = ValueOpt(OptName); + if (0 == Value) + return; + + for (;;) + { + if (0 == Opts->pstrOpt) + Quit("Invalid parameter -%s %s", OptName, Value); + if (0 == stricmp(Value, Opts->pstrOpt)) + { + *Param = Opts->iValue; + return; + } + ++Opts; + } + } + +static void SetPPDefaultParams() + { + switch (g_PPScore) + { + case PPSCORE_SP: + SetDefaultsSP(); + break; + + case PPSCORE_LE: + SetDefaultsLE(); + break; + + case PPSCORE_SV: + SetDefaultsSV(); + break; + + case PPSCORE_SPN: + switch (g_Alpha) + { + case ALPHA_DNA: + SetDefaultsSPN_DNA(); + break; + case ALPHA_RNA: + SetDefaultsSPN_RNA(); + break; + default: + Quit("Invalid alpha %d", g_Alpha); + } + break; + + default: + Quit("Invalid g_PPScore"); + } + } + +static void SetPPCommandLineParams() + { + FloatParam("GapOpen", &g_scoreGapOpen); + FloatParam("GapOpen2", &g_scoreGapOpen2); + FloatParam("GapExtend", &g_scoreGapExtend); + FloatParam("GapExtend2", &g_scoreGapExtend2); + FloatParam("GapAmbig", &g_scoreAmbigFactor); + FloatParam("Center", &g_scoreCenter); + FloatParam("SmoothScoreCeil", &g_dSmoothScoreCeil); + FloatParam("MinBestColScore", &g_dMinBestColScore); + FloatParam("MinSmoothScore", &g_dMinSmoothScore); + + EnumParam("Distance", DISTANCE_Opts, (int *) &g_Distance1); + EnumParam("Distance", DISTANCE_Opts, (int *) &g_Distance2); + + EnumParam("Distance1", DISTANCE_Opts, (int *) &g_Distance1); + EnumParam("Distance2", DISTANCE_Opts, (int *) &g_Distance2); + } + +void SetPPScore(bool bRespectFlagOpts) + { + if (bRespectFlagOpts) + 
{ + if (FlagOpt("SP")) + g_PPScore = PPSCORE_SP; + else if (FlagOpt("LE")) + g_PPScore = PPSCORE_LE; + else if (FlagOpt("SV")) + g_PPScore = PPSCORE_SV; + else if (FlagOpt("SPN")) + g_PPScore = PPSCORE_SPN; + } + + switch (g_PPScore) + { + case PPSCORE_LE: + case PPSCORE_SP: + case PPSCORE_SV: + if (ALPHA_RNA == g_Alpha || ALPHA_DNA == g_Alpha) + g_PPScore = PPSCORE_SPN; + break; + case PPSCORE_SPN: + if (ALPHA_Amino == g_Alpha) + g_PPScore = PPSCORE_LE; + break; + } + + SetPPDefaultParams(); + SetPPCommandLineParams(); + + if (g_bVerbose) + ListParams(); + } + +void SetPPScore(PPSCORE p) + { + g_PPScore = p; + SetPPScore(true); + } + +static void SetMaxSecs() + { + float fMaxHours = 0.0; + FloatParam("MaxHours", &fMaxHours); + if (0.0 == fMaxHours) + return; + g_ulMaxSecs = (unsigned long) (fMaxHours*60*60); + } + +static bool CanDoLowComplexity() + { + if (g_SeqWeight1 != SEQWEIGHT_ClustalW) + return false; + if (1 == g_uMaxIters) + return true; + return g_SeqWeight2 == SEQWEIGHT_ClustalW; + } + +bool MissingCommand() + { + if (strcmp(g_pstrInFileName, "-")) + return false; + if (0 != g_pstrFileName1) + return false; + if (0 != g_pstrSPFileName) + return false; + return true; + } + +void SetParams() + { + SetMaxSecs(); + + StrParam("in", &g_pstrInFileName); + StrParam("out", &g_pstrOutFileName); + + StrParam("FASTAOut", &g_pstrFASTAOutFileName); + StrParam("ClwOut", &g_pstrClwOutFileName); + StrParam("ClwStrictOut", &g_pstrClwStrictOutFileName); + StrParam("HTMLOut", &g_pstrHTMLOutFileName); + StrParam("PHYIOut", &g_pstrPHYIOutFileName); + StrParam("PHYSOut", &g_pstrPHYSOutFileName); + StrParam("MSFOut", &g_pstrMSFOutFileName); + + StrParam("in1", &g_pstrFileName1); + StrParam("in2", &g_pstrFileName2); + + StrParam("Matrix", &g_pstrMatrixFileName); + StrParam("SPScore", &g_pstrSPFileName); + + StrParam("UseTree_NoWarn", &g_pstrUseTreeFileName); + if (0 != g_pstrUseTreeFileName) + g_bUseTreeNoWarn = true; + + StrParam("UseTree", &g_pstrUseTreeFileName); + 
StrParam("ComputeWeights", &g_pstrComputeWeightsFileName); + StrParam("ScoreFile", &g_pstrScoreFileName); + StrParam("DistMx1", &g_pstrDistMxFileName1); + StrParam("DistMx2", &g_pstrDistMxFileName2); + + FlagParam("Core", &g_bCatchExceptions, false); + FlagParam("NoCore", &g_bCatchExceptions, true); + + FlagParam("Diags1", &g_bDiags1, true); + FlagParam("Diags2", &g_bDiags2, true); + + bool Diags = false; + FlagParam("Diags", &Diags, true); + if (Diags) + { + g_bDiags1 = true; + g_bDiags2 = true; + } + + FlagParam("Anchors", &g_bAnchors, true); + FlagParam("NoAnchors", &g_bAnchors, false); + + FlagParam("Quiet", &g_bQuiet, true); + FlagParam("Verbose", &g_bVerbose, true); + FlagParam("Version", &g_bVersion, true); + FlagParam("Stable", &g_bStable, true); + FlagParam("Group", &g_bStable, false); + FlagParam("Refine", &g_bRefine, true); + FlagParam("RefineW", &g_bRefineW, true); + FlagParam("ProfDB", &g_bProfDB, true); + FlagParam("SW", &g_bSW, true); + FlagParam("ClusterOnly", &g_bClusterOnly, true); + FlagParam("Profile", &g_bProfile, true); + FlagParam("PPScore", &g_bPPScore, true); + FlagParam("Brenner", &g_bBrenner, true); + FlagParam("Dimer", &g_bDimer, true); + + FlagParam("MSF", &g_bMSF, true); + FlagParam("PHYI", &g_bPHYI, true); + FlagParam("PHYS", &g_bPHYS, true); + FlagParam("clw", &g_bAln, true); + FlagParam("HTML", &g_bHTML, true); + FlagParam("FASTA", &g_bFASTA, true); + FlagParam("PAS", &g_bPAS, true); + FlagParam("MakeTree", &g_bMakeTree, true); + + if (g_bStable) + Quit("-stable not supported in this version of muscle"); + + bool b = false; + FlagParam("clwstrict", &b, true); + if (b) + { + g_bAln = true; + g_bClwStrict = true; + } + + UintParam("MaxIters", &g_uMaxIters); + UintParam("MaxTrees", &g_uMaxTreeRefineIters); + UintParam("SmoothWindow", &g_uSmoothWindowLength); + UintParam("RefineWindow", &g_uRefineWindow); + UintParam("FromWindow", &g_uWindowFrom); + UintParam("ToWindow", &g_uWindowTo); + UintParam("SaveWindow", &g_uSaveWindow); + 
UintParam("WindowOffset", &g_uWindowOffset); + UintParam("AnchorSpacing", &g_uAnchorSpacing); + UintParam("DiagLength", &g_uMinDiagLength); + UintParam("DiagMargin", &g_uDiagMargin); + UintParam("DiagBreak", &g_uMaxDiagBreak); + UintParam("MaxSubFam", &g_uMaxSubFamCount); + + UintParam("Hydro", &g_uHydrophobicRunLength); + FlagParam("TomHydro", &g_bTomHydro, true); + if (g_bTomHydro) + g_uHydrophobicRunLength = 0; + + FloatParam("SUEFF", &g_dSUEFF); + FloatParam("HydroFactor", &g_dHydroFactor); + + EnumParam("ObjScore", OBJSCORE_Opts, (int *) &g_ObjScore); + EnumParam("TermGaps", TERMGAPS_Opts, (int *) &g_TermGaps); + + EnumParam("Weight", SEQWEIGHT_Opts, (int *) &g_SeqWeight1); + EnumParam("Weight", SEQWEIGHT_Opts, (int *) &g_SeqWeight2); + + EnumParam("Weight1", SEQWEIGHT_Opts, (int *) &g_SeqWeight1); + EnumParam("Weight2", SEQWEIGHT_Opts, (int *) &g_SeqWeight2); + + EnumParam("Cluster", CLUSTER_Opts, (int *) &g_Cluster1); + EnumParam("Cluster", CLUSTER_Opts, (int *) &g_Cluster2); + + EnumParam("Cluster1", CLUSTER_Opts, (int *) &g_Cluster1); + EnumParam("Cluster2", CLUSTER_Opts, (int *) &g_Cluster2); + + EnumParam("Root1", ROOT_Opts, (int *) &g_Root1); + EnumParam("Root2", ROOT_Opts, (int *) &g_Root2); + + EnumParam("SeqType", SEQTYPE_Opts, (int *) &g_SeqType); + + g_scoreGapAmbig = g_scoreGapOpen*g_scoreAmbigFactor; + g_bLow = CanDoLowComplexity(); + + if (g_bDimer) + g_bPrecompiledCenter = false; + + UintParam("MaxMB", &g_uMaxMB); + if (0 == ValueOpt("MaxMB")) + g_uMaxMB = (unsigned) (GetRAMSizeMB()*DEFAULT_MAX_MB_FRACT); + } diff --git a/src/muscle/muscle3.8.31/src/params.h b/src/muscle/muscle3.8.31/src/params.h new file mode 100644 index 0000000..aefea32 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/params.h @@ -0,0 +1,118 @@ +#ifndef params_h +#define params_h + +extern const char *g_pstrInFileName; +extern const char *g_pstrOutFileName; + +extern const char *g_pstrFASTAOutFileName; +extern const char *g_pstrMSFOutFileName; +extern const char 
*g_pstrClwOutFileName; +extern const char *g_pstrClwStrictOutFileName; +extern const char *g_pstrHTMLOutFileName; +extern const char *g_pstrPHYIOutFileName; +extern const char *g_pstrPHYSOutFileName; +extern const char *g_pstrDistMxFileName1; +extern const char *g_pstrDistMxFileName2; + +extern const char *g_pstrFileName1; +extern const char *g_pstrFileName2; + +extern const char *g_pstrSPFileName; +extern const char *g_pstrMatrixFileName; + +extern const char *g_pstrUseTreeFileName; +extern bool g_bUseTreeNoWarn; + +extern const char *g_pstrComputeWeightsFileName; +extern const char *g_pstrScoreFileName; + +extern SCORE g_scoreGapOpen; +extern SCORE g_scoreCenter; +extern SCORE g_scoreGapExtend; +extern SCORE g_scoreGapAmbig; + +#if DOUBLE_AFFINE +extern SCORE g_scoreGapOpen2; +extern SCORE g_scoreGapExtend2; +#endif + +extern unsigned g_uSmoothWindowLength; +extern unsigned g_uAnchorSpacing; +extern unsigned g_uMaxTreeRefineIters; + +extern unsigned g_uMinDiagLength; +extern unsigned g_uMaxDiagBreak; +extern unsigned g_uDiagMargin; + +extern unsigned g_uRefineWindow; +extern unsigned g_uWindowFrom; +extern unsigned g_uWindowTo; +extern unsigned g_uSaveWindow; +extern unsigned g_uWindowOffset; + +extern unsigned g_uMaxSubFamCount; + +extern unsigned g_uHydrophobicRunLength; +extern float g_dHydroFactor; + +extern float g_dSmoothScoreCeil; +extern float g_dMinBestColScore; +extern float g_dMinSmoothScore; +extern float g_dSUEFF; + +extern bool g_bPrecompiledCenter; +extern bool g_bNormalizeCounts; +extern bool g_bDiags1; +extern bool g_bDiags2; +extern bool g_bDiags; +extern bool g_bAnchors; +extern bool g_bCatchExceptions; + +extern bool g_bMSF; +extern bool g_bAln; +extern bool g_bClwStrict; +extern bool g_bHTML; +extern bool g_bPHYI; +extern bool g_bPHYS; + +extern bool g_bQuiet; +extern bool g_bVerbose; +extern bool g_bRefine; +extern bool g_bRefineW; +extern bool g_bRefineX; +extern bool g_bLow; +extern bool g_bSW; +extern bool g_bClusterOnly; +extern bool 
g_bProfile; +extern bool g_bProfDB; +extern bool g_bPPScore; +extern bool g_bBrenner; +extern bool g_bDimer; +extern bool g_bVersion; +extern bool g_bStable; +extern bool g_bFASTA; +extern bool g_bPAS; +extern bool g_bTomHydro; +extern bool g_bMakeTree; + +extern PPSCORE g_PPScore; +extern OBJSCORE g_ObjScore; + +extern DISTANCE g_Distance1; +extern CLUSTER g_Cluster1; +extern ROOT g_Root1; +extern SEQWEIGHT g_SeqWeight1; + +extern DISTANCE g_Distance2; +extern CLUSTER g_Cluster2; +extern ROOT g_Root2; +extern SEQWEIGHT g_SeqWeight2; + +extern unsigned g_uMaxIters; +extern unsigned long g_ulMaxSecs; +extern unsigned g_uMaxMB; + +extern SEQTYPE g_SeqType; +extern TERMGAPS g_TermGaps; + +#endif // params_h diff --git a/src/muscle/muscle3.8.31/src/phy.cpp b/src/muscle/muscle3.8.31/src/phy.cpp new file mode 100644 index 0000000..aaec915 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/phy.cpp @@ -0,0 +1,1130 @@ +#include "muscle.h" +#include "tree.h" +#include + +#define TRACE 0 + +/*** +Node has 0 to 3 neighbors: + 0 neighbors: singleton root + 1 neighbor: leaf, neighbor is parent + 2 neighbors: non-singleton root + 3 neighbors: internal node (other than root) + +Minimal rooted tree is single node. +Minimal unrooted tree is single edge. +Leaf node always has nulls in neighbors 2 and 3, neighbor 1 is parent. +When tree is rooted, neighbor 1=parent, 2=left, 3=right. 
+***/ + +void Tree::AssertAreNeighbors(unsigned uNodeIndex1, unsigned uNodeIndex2) const + { + if (uNodeIndex1 >= m_uNodeCount || uNodeIndex2 >= m_uNodeCount) + Quit("AssertAreNeighbors(%u,%u), are %u nodes", + uNodeIndex1, uNodeIndex2, m_uNodeCount); + + if (m_uNeighbor1[uNodeIndex1] != uNodeIndex2 && + m_uNeighbor2[uNodeIndex1] != uNodeIndex2 && + m_uNeighbor3[uNodeIndex1] != uNodeIndex2) + { + LogMe(); + Quit("AssertAreNeighbors(%u,%u) failed", uNodeIndex1, uNodeIndex2); + } + + if (m_uNeighbor1[uNodeIndex2] != uNodeIndex1 && + m_uNeighbor2[uNodeIndex2] != uNodeIndex1 && + m_uNeighbor3[uNodeIndex2] != uNodeIndex1) + { + LogMe(); + Quit("AssertAreNeighbors(%u,%u) failed", uNodeIndex1, uNodeIndex2); + } + + bool Has12 = HasEdgeLength(uNodeIndex1, uNodeIndex2); + bool Has21 = HasEdgeLength(uNodeIndex2, uNodeIndex1); + if (Has12 != Has21) + { + HasEdgeLength(uNodeIndex1, uNodeIndex2); + HasEdgeLength(uNodeIndex2, uNodeIndex1); + LogMe(); + Log("HasEdgeLength(%u, %u)=%c HasEdgeLength(%u, %u)=%c\n", + uNodeIndex1, + uNodeIndex2, + Has12 ? 'T' : 'F', + uNodeIndex2, + uNodeIndex1, + Has21 ? 
'T' : 'F'); + + Quit("Tree::AssertAreNeighbors, HasEdgeLength not symmetric"); + } + + if (Has12) + { + double d12 = GetEdgeLength(uNodeIndex1, uNodeIndex2); + double d21 = GetEdgeLength(uNodeIndex2, uNodeIndex1); + if (d12 != d21) + { + LogMe(); + Quit("Tree::AssertAreNeighbors, Edge length disagrees %u-%u=%.3g, %u-%u=%.3g", + uNodeIndex1, uNodeIndex2, d12, + uNodeIndex2, uNodeIndex1, d21); + } + } + } + +void Tree::ValidateNode(unsigned uNodeIndex) const + { + if (uNodeIndex >= m_uNodeCount) + Quit("ValidateNode(%u), %u nodes", uNodeIndex, m_uNodeCount); + + const unsigned uNeighborCount = GetNeighborCount(uNodeIndex); + + if (2 == uNeighborCount) + { + if (!m_bRooted) + { + LogMe(); + Quit("Tree::ValidateNode: Node %u has two neighbors, tree is not rooted", + uNodeIndex); + } + if (uNodeIndex != m_uRootNodeIndex) + { + LogMe(); + Quit("Tree::ValidateNode: Node %u has two neighbors, but not root node=%u", + uNodeIndex, m_uRootNodeIndex); + } + } + + const unsigned n1 = m_uNeighbor1[uNodeIndex]; + const unsigned n2 = m_uNeighbor2[uNodeIndex]; + const unsigned n3 = m_uNeighbor3[uNodeIndex]; + + if (NULL_NEIGHBOR == n2 && NULL_NEIGHBOR != n3) + { + LogMe(); + Quit("Tree::ValidateNode, n2=null, n3!=null", uNodeIndex); + } + if (NULL_NEIGHBOR == n3 && NULL_NEIGHBOR != n2) + { + LogMe(); + Quit("Tree::ValidateNode, n3=null, n2!=null", uNodeIndex); + } + + if (n1 != NULL_NEIGHBOR) + AssertAreNeighbors(uNodeIndex, n1); + if (n2 != NULL_NEIGHBOR) + AssertAreNeighbors(uNodeIndex, n2); + if (n3 != NULL_NEIGHBOR) + AssertAreNeighbors(uNodeIndex, n3); + + if (n1 != NULL_NEIGHBOR && (n1 == n2 || n1 == n3)) + { + LogMe(); + Quit("Tree::ValidateNode, duplicate neighbors in node %u", uNodeIndex); + } + if (n2 != NULL_NEIGHBOR && (n2 == n1 || n2 == n3)) + { + LogMe(); + Quit("Tree::ValidateNode, duplicate neighbors in node %u", uNodeIndex); + } + if (n3 != NULL_NEIGHBOR && (n3 == n1 || n3 == n2)) + { + LogMe(); + Quit("Tree::ValidateNode, duplicate neighbors in node %u", 
uNodeIndex); + } + + if (IsRooted()) + { + if (NULL_NEIGHBOR == GetParent(uNodeIndex)) + { + if (uNodeIndex != m_uRootNodeIndex) + { + LogMe(); + Quit("Tree::ValiateNode(%u), no parent", uNodeIndex); + } + } + else if (GetLeft(GetParent(uNodeIndex)) != uNodeIndex && + GetRight(GetParent(uNodeIndex)) != uNodeIndex) + { + LogMe(); + Quit("Tree::ValidateNode(%u), parent / child mismatch", uNodeIndex); + } + } + } + +void Tree::Validate() const + { + for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) + ValidateNode(uNodeIndex); + } + +bool Tree::IsEdge(unsigned uNodeIndex1, unsigned uNodeIndex2) const + { + assert(uNodeIndex1 < m_uNodeCount && uNodeIndex2 < m_uNodeCount); + + return m_uNeighbor1[uNodeIndex1] == uNodeIndex2 || + m_uNeighbor2[uNodeIndex1] == uNodeIndex2 || + m_uNeighbor3[uNodeIndex1] == uNodeIndex2; + } + +double Tree::GetEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2) const + { + assert(uNodeIndex1 < m_uNodeCount && uNodeIndex2 < m_uNodeCount); + if (!HasEdgeLength(uNodeIndex1, uNodeIndex2)) + { + LogMe(); + Quit("Missing edge length in tree %u-%u", uNodeIndex1, uNodeIndex2); + } + + if (m_uNeighbor1[uNodeIndex1] == uNodeIndex2) + return m_dEdgeLength1[uNodeIndex1]; + else if (m_uNeighbor2[uNodeIndex1] == uNodeIndex2) + return m_dEdgeLength2[uNodeIndex1]; + assert(m_uNeighbor3[uNodeIndex1] == uNodeIndex2); + return m_dEdgeLength3[uNodeIndex1]; + } + +void Tree::ExpandCache() + { + const unsigned uNodeCount = 100; + unsigned uNewCacheCount = m_uCacheCount + uNodeCount; + unsigned *uNewNeighbor1 = new unsigned[uNewCacheCount]; + unsigned *uNewNeighbor2 = new unsigned[uNewCacheCount]; + unsigned *uNewNeighbor3 = new unsigned[uNewCacheCount]; + + unsigned *uNewIds = new unsigned[uNewCacheCount]; + memset(uNewIds, 0xff, uNewCacheCount*sizeof(unsigned)); + + double *dNewEdgeLength1 = new double[uNewCacheCount]; + double *dNewEdgeLength2 = new double[uNewCacheCount]; + double *dNewEdgeLength3 = new double[uNewCacheCount]; + double 
*dNewHeight = new double[uNewCacheCount]; + + bool *bNewHasEdgeLength1 = new bool[uNewCacheCount]; + bool *bNewHasEdgeLength2 = new bool[uNewCacheCount]; + bool *bNewHasEdgeLength3 = new bool[uNewCacheCount]; + bool *bNewHasHeight = new bool[uNewCacheCount]; + + char **ptrNewName = new char *[uNewCacheCount]; + memset(ptrNewName, 0, uNewCacheCount*sizeof(char *)); + + if (m_uCacheCount > 0) + { + const unsigned uUnsignedBytes = m_uCacheCount*sizeof(unsigned); + memcpy(uNewNeighbor1, m_uNeighbor1, uUnsignedBytes); + memcpy(uNewNeighbor2, m_uNeighbor2, uUnsignedBytes); + memcpy(uNewNeighbor3, m_uNeighbor3, uUnsignedBytes); + + memcpy(uNewIds, m_Ids, uUnsignedBytes); + + const unsigned uEdgeBytes = m_uCacheCount*sizeof(double); + memcpy(dNewEdgeLength1, m_dEdgeLength1, uEdgeBytes); + memcpy(dNewEdgeLength2, m_dEdgeLength2, uEdgeBytes); + memcpy(dNewEdgeLength3, m_dEdgeLength3, uEdgeBytes); + memcpy(dNewHeight, m_dHeight, uEdgeBytes); + + const unsigned uBoolBytes = m_uCacheCount*sizeof(bool); + memcpy(bNewHasEdgeLength1, m_bHasEdgeLength1, uBoolBytes); + memcpy(bNewHasEdgeLength2, m_bHasEdgeLength2, uBoolBytes); + memcpy(bNewHasEdgeLength3, m_bHasEdgeLength3, uBoolBytes); + memcpy(bNewHasHeight, m_bHasHeight, uBoolBytes); + + const unsigned uNameBytes = m_uCacheCount*sizeof(char *); + memcpy(ptrNewName, m_ptrName, uNameBytes); + + delete[] m_uNeighbor1; + delete[] m_uNeighbor2; + delete[] m_uNeighbor3; + + delete[] m_Ids; + + delete[] m_dEdgeLength1; + delete[] m_dEdgeLength2; + delete[] m_dEdgeLength3; + delete[] m_dHeight; // old heights were copied into dNewHeight above; release them like the other arrays + delete[] m_bHasEdgeLength1; + delete[] m_bHasEdgeLength2; + delete[] m_bHasEdgeLength3; + delete[] m_bHasHeight; + + delete[] m_ptrName; + } + m_uCacheCount = uNewCacheCount; + m_uNeighbor1 = uNewNeighbor1; + m_uNeighbor2 = uNewNeighbor2; + m_uNeighbor3 = uNewNeighbor3; + m_Ids = uNewIds; + m_dEdgeLength1 = dNewEdgeLength1; + m_dEdgeLength2 = dNewEdgeLength2; + m_dEdgeLength3 = dNewEdgeLength3; + m_dHeight = dNewHeight; + m_bHasEdgeLength1 = bNewHasEdgeLength1; + 
m_bHasEdgeLength2 = bNewHasEdgeLength2; + m_bHasEdgeLength3 = bNewHasEdgeLength3; + m_bHasHeight = bNewHasHeight; + m_ptrName = ptrNewName; + } + +// Creates tree with single node, no edges. +// Root node always has index 0. +void Tree::CreateRooted() + { + Clear(); + ExpandCache(); + m_uNodeCount = 1; + + m_uNeighbor1[0] = NULL_NEIGHBOR; + m_uNeighbor2[0] = NULL_NEIGHBOR; + m_uNeighbor3[0] = NULL_NEIGHBOR; + + m_bHasEdgeLength1[0] = false; + m_bHasEdgeLength2[0] = false; + m_bHasEdgeLength3[0] = false; + m_bHasHeight[0] = false; + + m_uRootNodeIndex = 0; + m_bRooted = true; + +#if DEBUG + Validate(); +#endif + } + +// Creates unrooted tree with single edge. +// Nodes for that edge are always 0 and 1. +void Tree::CreateUnrooted(double dEdgeLength) + { + Clear(); + ExpandCache(); + + m_uNeighbor1[0] = 1; + m_uNeighbor2[0] = NULL_NEIGHBOR; + m_uNeighbor3[0] = NULL_NEIGHBOR; + + m_uNeighbor1[1] = 0; + m_uNeighbor2[1] = NULL_NEIGHBOR; + m_uNeighbor3[1] = NULL_NEIGHBOR; + + m_dEdgeLength1[0] = dEdgeLength; + m_dEdgeLength1[1] = dEdgeLength; + + m_bHasEdgeLength1[0] = true; + m_bHasEdgeLength1[1] = true; + + m_bRooted = false; + +#if DEBUG + Validate(); +#endif + } + +void Tree::SetLeafName(unsigned uNodeIndex, const char *ptrName) + { + assert(uNodeIndex < m_uNodeCount); + assert(IsLeaf(uNodeIndex)); + free(m_ptrName[uNodeIndex]); + m_ptrName[uNodeIndex] = strsave(ptrName); + } + +void Tree::SetLeafId(unsigned uNodeIndex, unsigned uId) + { + assert(uNodeIndex < m_uNodeCount); + assert(IsLeaf(uNodeIndex)); + m_Ids[uNodeIndex] = uId; + } + +const char *Tree::GetLeafName(unsigned uNodeIndex) const + { + assert(uNodeIndex < m_uNodeCount); + assert(IsLeaf(uNodeIndex)); + return m_ptrName[uNodeIndex]; + } + +unsigned Tree::GetLeafId(unsigned uNodeIndex) const + { + assert(uNodeIndex < m_uNodeCount); + assert(IsLeaf(uNodeIndex)); + return m_Ids[uNodeIndex]; + } + +// Append a new branch. +// This adds two new nodes and joins them to an existing leaf node. 
+// Return value is k, new nodes have indexes k and k+1 respectively. +unsigned Tree::AppendBranch(unsigned uExistingLeafIndex) + { + if (0 == m_uNodeCount) + Quit("Tree::AppendBranch: tree has not been created"); + +#if DEBUG + assert(uExistingLeafIndex < m_uNodeCount); + if (!IsLeaf(uExistingLeafIndex)) + { + LogMe(); + Quit("AppendBranch(%u): not leaf", uExistingLeafIndex); + } +#endif + + if (m_uNodeCount >= m_uCacheCount - 2) + ExpandCache(); + + const unsigned uNewLeaf1 = m_uNodeCount; + const unsigned uNewLeaf2 = m_uNodeCount + 1; + + m_uNodeCount += 2; + + assert(m_uNeighbor2[uExistingLeafIndex] == NULL_NEIGHBOR); + assert(m_uNeighbor3[uExistingLeafIndex] == NULL_NEIGHBOR); + + m_uNeighbor2[uExistingLeafIndex] = uNewLeaf1; + m_uNeighbor3[uExistingLeafIndex] = uNewLeaf2; + + m_uNeighbor1[uNewLeaf1] = uExistingLeafIndex; + m_uNeighbor1[uNewLeaf2] = uExistingLeafIndex; + + m_uNeighbor2[uNewLeaf1] = NULL_NEIGHBOR; + m_uNeighbor2[uNewLeaf2] = NULL_NEIGHBOR; + + m_uNeighbor3[uNewLeaf1] = NULL_NEIGHBOR; + m_uNeighbor3[uNewLeaf2] = NULL_NEIGHBOR; + + m_dEdgeLength2[uExistingLeafIndex] = 0; + m_dEdgeLength3[uExistingLeafIndex] = 0; + + m_dEdgeLength1[uNewLeaf1] = 0; + m_dEdgeLength2[uNewLeaf1] = 0; + m_dEdgeLength3[uNewLeaf1] = 0; + + m_dEdgeLength1[uNewLeaf2] = 0; + m_dEdgeLength2[uNewLeaf2] = 0; + m_dEdgeLength3[uNewLeaf2] = 0; + + m_bHasEdgeLength1[uNewLeaf1] = false; + m_bHasEdgeLength2[uNewLeaf1] = false; + m_bHasEdgeLength3[uNewLeaf1] = false; + + m_bHasEdgeLength1[uNewLeaf2] = false; + m_bHasEdgeLength2[uNewLeaf2] = false; + m_bHasEdgeLength3[uNewLeaf2] = false; + + m_bHasHeight[uNewLeaf1] = false; + m_bHasHeight[uNewLeaf2] = false; + + m_Ids[uNewLeaf1] = uInsane; + m_Ids[uNewLeaf2] = uInsane; + return uNewLeaf1; + } + +void Tree::LogMe() const + { + Log("Tree::LogMe %u nodes, ", m_uNodeCount); + + if (IsRooted()) + { + Log("rooted.\n"); + Log("\n"); + Log("Index Parnt LengthP Left LengthL Right LengthR Id Name\n"); + Log("----- ----- ------- ---- ------- 
----- ------- ----- ----\n"); + } + else + { + Log("unrooted.\n"); + Log("\n"); + Log("Index Nbr_1 Length1 Nbr_2 Length2 Nbr_3 Length3 Id Name\n"); + Log("----- ----- ------- ----- ------- ----- ------- ----- ----\n"); + } + + for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) + { + Log("%5u ", uNodeIndex); + const unsigned n1 = m_uNeighbor1[uNodeIndex]; + const unsigned n2 = m_uNeighbor2[uNodeIndex]; + const unsigned n3 = m_uNeighbor3[uNodeIndex]; + if (NULL_NEIGHBOR != n1) + { + Log("%5u ", n1); + if (m_bHasEdgeLength1[uNodeIndex]) + Log("%7.4f ", m_dEdgeLength1[uNodeIndex]); + else + Log(" * "); + } + else + Log(" "); + + if (NULL_NEIGHBOR != n2) + { + Log("%5u ", n2); + if (m_bHasEdgeLength2[uNodeIndex]) + Log("%7.4f ", m_dEdgeLength2[uNodeIndex]); + else + Log(" * "); + } + else + Log(" "); + + if (NULL_NEIGHBOR != n3) + { + Log("%5u ", n3); + if (m_bHasEdgeLength3[uNodeIndex]) + Log("%7.4f ", m_dEdgeLength3[uNodeIndex]); + else + Log(" * "); + } + else + Log(" "); + + if (m_Ids != 0 && IsLeaf(uNodeIndex)) + { + unsigned uId = m_Ids[uNodeIndex]; + if (uId == uInsane) + Log(" *"); + else + Log("%5u", uId); + } + else + Log(" "); + + if (m_bRooted && uNodeIndex == m_uRootNodeIndex) + Log(" [ROOT] "); + const char *ptrName = m_ptrName[uNodeIndex]; + if (ptrName != 0) + Log(" %s", ptrName); + Log("\n"); + } + } + +void Tree::SetEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2, + double dLength) + { + assert(uNodeIndex1 < m_uNodeCount && uNodeIndex2 < m_uNodeCount); + assert(IsEdge(uNodeIndex1, uNodeIndex2)); + + if (m_uNeighbor1[uNodeIndex1] == uNodeIndex2) + { + m_dEdgeLength1[uNodeIndex1] = dLength; + m_bHasEdgeLength1[uNodeIndex1] = true; + } + else if (m_uNeighbor2[uNodeIndex1] == uNodeIndex2) + { + m_dEdgeLength2[uNodeIndex1] = dLength; + m_bHasEdgeLength2[uNodeIndex1] = true; + + } + else + { + assert(m_uNeighbor3[uNodeIndex1] == uNodeIndex2); + m_dEdgeLength3[uNodeIndex1] = dLength; + m_bHasEdgeLength3[uNodeIndex1] = true; + } + + 
if (m_uNeighbor1[uNodeIndex2] == uNodeIndex1) + { + m_dEdgeLength1[uNodeIndex2] = dLength; + m_bHasEdgeLength1[uNodeIndex2] = true; + } + else if (m_uNeighbor2[uNodeIndex2] == uNodeIndex1) + { + m_dEdgeLength2[uNodeIndex2] = dLength; + m_bHasEdgeLength2[uNodeIndex2] = true; + } + else + { + assert(m_uNeighbor3[uNodeIndex2] == uNodeIndex1); + m_dEdgeLength3[uNodeIndex2] = dLength; + m_bHasEdgeLength3[uNodeIndex2] = true; + } + } + +unsigned Tree::UnrootFromFile() + { +#if TRACE + Log("Before unroot:\n"); + LogMe(); +#endif + + if (!m_bRooted) + Quit("Tree::Unroot, not rooted"); + +// Convention: root node is always node zero + assert(IsRoot(0)); + assert(NULL_NEIGHBOR == m_uNeighbor1[0]); + + const unsigned uThirdNode = m_uNodeCount++; + + m_uNeighbor1[0] = uThirdNode; + m_uNeighbor1[uThirdNode] = 0; + + m_uNeighbor2[uThirdNode] = NULL_NEIGHBOR; + m_uNeighbor3[uThirdNode] = NULL_NEIGHBOR; + + m_dEdgeLength1[0] = 0; + m_dEdgeLength1[uThirdNode] = 0; + m_bHasEdgeLength1[uThirdNode] = true; + + m_bRooted = false; + +#if TRACE + Log("After unroot:\n"); + LogMe(); +#endif + + return uThirdNode; + } + +// In an unrooted tree, equivalent of GetLeft/Right is +// GetFirst/SecondNeighbor. +// uNeighborIndex must be a known neighbor of uNodeIndex. +// This is the way to find the other two neighbor nodes of +// an internal node. +// The labeling as "First" and "Second" neighbor is arbitrary. +// Calling these functions on a leaf returns NULL_NEIGHBOR, as +// for GetLeft/Right. 
+unsigned Tree::GetFirstNeighbor(unsigned uNodeIndex, unsigned uNeighborIndex) const + { + assert(uNodeIndex < m_uNodeCount); + assert(uNeighborIndex < m_uNodeCount); + assert(IsEdge(uNodeIndex, uNeighborIndex)); + + for (unsigned n = 0; n < 3; ++n) + { + unsigned uNeighbor = GetNeighbor(uNodeIndex, n); + if (NULL_NEIGHBOR != uNeighbor && uNeighborIndex != uNeighbor) + return uNeighbor; + } + return NULL_NEIGHBOR; + } + +unsigned Tree::GetSecondNeighbor(unsigned uNodeIndex, unsigned uNeighborIndex) const + { + assert(uNodeIndex < m_uNodeCount); + assert(uNeighborIndex < m_uNodeCount); + assert(IsEdge(uNodeIndex, uNeighborIndex)); + + bool bFoundOne = false; + for (unsigned n = 0; n < 3; ++n) + { + unsigned uNeighbor = GetNeighbor(uNodeIndex, n); + if (NULL_NEIGHBOR != uNeighbor && uNeighborIndex != uNeighbor) + { + if (bFoundOne) + return uNeighbor; + else + bFoundOne = true; + } + } + return NULL_NEIGHBOR; + } + +// Compute the number of leaves in the sub-tree defined by an edge +// in an unrooted tree. Conceptually, the tree is cut at this edge, +// and uNodeIndex2 considered the root of the sub-tree. +unsigned Tree::GetLeafCountUnrooted(unsigned uNodeIndex1, unsigned uNodeIndex2, + double *ptrdTotalDistance) const + { + assert(!IsRooted()); + + if (IsLeaf(uNodeIndex2)) + { + *ptrdTotalDistance = GetEdgeLength(uNodeIndex1, uNodeIndex2); + return 1; + } + +// Recurse down the rooted sub-tree defined by cutting the edge +// and considering uNodeIndex2 as the root. 
+ const unsigned uLeft = GetFirstNeighbor(uNodeIndex2, uNodeIndex1); + const unsigned uRight = GetSecondNeighbor(uNodeIndex2, uNodeIndex1); + + double dLeftDistance; + double dRightDistance; + + const unsigned uLeftCount = GetLeafCountUnrooted(uNodeIndex2, uLeft, + &dLeftDistance); + const unsigned uRightCount = GetLeafCountUnrooted(uNodeIndex2, uRight, + &dRightDistance); + + *ptrdTotalDistance = dLeftDistance + dRightDistance; + return uLeftCount + uRightCount; + } + +void Tree::RootUnrootedTree(ROOT Method) + { + assert(!IsRooted()); +#if TRACE + Log("Tree::RootUnrootedTree, before:"); + LogMe(); +#endif + + unsigned uNode1; + unsigned uNode2; + double dLength1; + double dLength2; + FindRoot(*this, &uNode1, &uNode2, &dLength1, &dLength2, Method); + + if (m_uNodeCount == m_uCacheCount) + ExpandCache(); + m_uRootNodeIndex = m_uNodeCount++; + + double dEdgeLength = GetEdgeLength(uNode1, uNode2); + + m_uNeighbor1[m_uRootNodeIndex] = NULL_NEIGHBOR; + m_uNeighbor2[m_uRootNodeIndex] = uNode1; + m_uNeighbor3[m_uRootNodeIndex] = uNode2; + + if (m_uNeighbor1[uNode1] == uNode2) + m_uNeighbor1[uNode1] = m_uRootNodeIndex; + else if (m_uNeighbor2[uNode1] == uNode2) + m_uNeighbor2[uNode1] = m_uRootNodeIndex; + else + { + assert(m_uNeighbor3[uNode1] == uNode2); + m_uNeighbor3[uNode1] = m_uRootNodeIndex; + } + + if (m_uNeighbor1[uNode2] == uNode1) + m_uNeighbor1[uNode2] = m_uRootNodeIndex; + else if (m_uNeighbor2[uNode2] == uNode1) + m_uNeighbor2[uNode2] = m_uRootNodeIndex; + else + { + assert(m_uNeighbor3[uNode2] == uNode1); + m_uNeighbor3[uNode2] = m_uRootNodeIndex; + } + + OrientParent(uNode1, m_uRootNodeIndex); + OrientParent(uNode2, m_uRootNodeIndex); + + SetEdgeLength(m_uRootNodeIndex, uNode1, dLength1); + SetEdgeLength(m_uRootNodeIndex, uNode2, dLength2); + + m_bHasHeight[m_uRootNodeIndex] = false; + + m_ptrName[m_uRootNodeIndex] = 0; + + m_bRooted = true; + +#if TRACE + Log("\nPhy::RootUnrootedTree, after:"); + LogMe(); +#endif + + Validate(); + } + +bool 
Tree::HasEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2) const + { + assert(uNodeIndex1 < m_uNodeCount); + assert(uNodeIndex2 < m_uNodeCount); + assert(IsEdge(uNodeIndex1, uNodeIndex2)); + + if (m_uNeighbor1[uNodeIndex1] == uNodeIndex2) + return m_bHasEdgeLength1[uNodeIndex1]; + else if (m_uNeighbor2[uNodeIndex1] == uNodeIndex2) + return m_bHasEdgeLength2[uNodeIndex1]; + assert(m_uNeighbor3[uNodeIndex1] == uNodeIndex2); + return m_bHasEdgeLength3[uNodeIndex1]; + } + +void Tree::OrientParent(unsigned uNodeIndex, unsigned uParentNodeIndex) + { + if (NULL_NEIGHBOR == uNodeIndex) + return; + + if (m_uNeighbor1[uNodeIndex] == uParentNodeIndex) + ; + else if (m_uNeighbor2[uNodeIndex] == uParentNodeIndex) + { + double dEdgeLength2 = m_dEdgeLength2[uNodeIndex]; + m_uNeighbor2[uNodeIndex] = m_uNeighbor1[uNodeIndex]; + m_dEdgeLength2[uNodeIndex] = m_dEdgeLength1[uNodeIndex]; + m_uNeighbor1[uNodeIndex] = uParentNodeIndex; + m_dEdgeLength1[uNodeIndex] = dEdgeLength2; + } + else + { + assert(m_uNeighbor3[uNodeIndex] == uParentNodeIndex); + double dEdgeLength3 = m_dEdgeLength3[uNodeIndex]; + m_uNeighbor3[uNodeIndex] = m_uNeighbor1[uNodeIndex]; + m_dEdgeLength3[uNodeIndex] = m_dEdgeLength1[uNodeIndex]; + m_uNeighbor1[uNodeIndex] = uParentNodeIndex; + m_dEdgeLength1[uNodeIndex] = dEdgeLength3; + } + + OrientParent(m_uNeighbor2[uNodeIndex], uNodeIndex); + OrientParent(m_uNeighbor3[uNodeIndex], uNodeIndex); + } + +unsigned Tree::FirstDepthFirstNode() const + { + assert(IsRooted()); + +// Descend via left branches until we hit a leaf + unsigned uNodeIndex = m_uRootNodeIndex; + while (!IsLeaf(uNodeIndex)) + uNodeIndex = GetLeft(uNodeIndex); + return uNodeIndex; + } + +unsigned Tree::FirstDepthFirstNodeR() const + { + assert(IsRooted()); + +// Descend via right branches until we hit a leaf + unsigned uNodeIndex = m_uRootNodeIndex; + while (!IsLeaf(uNodeIndex)) + uNodeIndex = GetRight(uNodeIndex); + return uNodeIndex; + } + +unsigned Tree::NextDepthFirstNode(unsigned uNodeIndex) 
const + { +#if TRACE + Log("NextDepthFirstNode(%3u) ", uNodeIndex); +#endif + + assert(IsRooted()); + assert(uNodeIndex < m_uNodeCount); + + if (IsRoot(uNodeIndex)) + { +#if TRACE + Log(">> Node %u is root, end of traversal\n", uNodeIndex); +#endif + return NULL_NEIGHBOR; + } + + unsigned uParent = GetParent(uNodeIndex); + if (GetRight(uParent) == uNodeIndex) + { +#if TRACE + Log(">> Is right branch, return parent=%u\n", uParent); +#endif + return uParent; + } + + uNodeIndex = GetRight(uParent); +#if TRACE + Log(">> Descend left from right sibling=%u ... ", uNodeIndex); +#endif + while (!IsLeaf(uNodeIndex)) + uNodeIndex = GetLeft(uNodeIndex); + +#if TRACE + Log("bottom out at leaf=%u\n", uNodeIndex); +#endif + return uNodeIndex; + } + +unsigned Tree::NextDepthFirstNodeR(unsigned uNodeIndex) const + { +#if TRACE + Log("NextDepthFirstNode(%3u) ", uNodeIndex); +#endif + + assert(IsRooted()); + assert(uNodeIndex < m_uNodeCount); + + if (IsRoot(uNodeIndex)) + { +#if TRACE + Log(">> Node %u is root, end of traversal\n", uNodeIndex); +#endif + return NULL_NEIGHBOR; + } + + unsigned uParent = GetParent(uNodeIndex); + if (GetLeft(uParent) == uNodeIndex) + { +#if TRACE + Log(">> Is left branch, return parent=%u\n", uParent); +#endif + return uParent; + } + + uNodeIndex = GetLeft(uParent); +#if TRACE + Log(">> Descend right from left sibling=%u ... 
", uNodeIndex); +#endif + while (!IsLeaf(uNodeIndex)) + uNodeIndex = GetRight(uNodeIndex); + +#if TRACE + Log("bottom out at leaf=%u\n", uNodeIndex); +#endif + return uNodeIndex; + } + +void Tree::UnrootByDeletingRoot() + { + assert(IsRooted()); + assert(m_uNodeCount >= 3); + + const unsigned uLeft = GetLeft(m_uRootNodeIndex); + const unsigned uRight = GetRight(m_uRootNodeIndex); + + m_uNeighbor1[uLeft] = uRight; + m_uNeighbor1[uRight] = uLeft; + + bool bHasEdgeLength = HasEdgeLength(m_uRootNodeIndex, uLeft) && + HasEdgeLength(m_uRootNodeIndex, uRight); + if (bHasEdgeLength) + { + double dEdgeLength = GetEdgeLength(m_uRootNodeIndex, uLeft) + + GetEdgeLength(m_uRootNodeIndex, uRight); + m_dEdgeLength1[uLeft] = dEdgeLength; + m_dEdgeLength1[uRight] = dEdgeLength; + } + +// Remove root node entry from arrays + const unsigned uMoveCount = m_uNodeCount - m_uRootNodeIndex; + const unsigned uUnsBytes = uMoveCount*sizeof(unsigned); + memmove(m_uNeighbor1 + m_uRootNodeIndex, m_uNeighbor1 + m_uRootNodeIndex + 1, + uUnsBytes); + memmove(m_uNeighbor2 + m_uRootNodeIndex, m_uNeighbor2 + m_uRootNodeIndex + 1, + uUnsBytes); + memmove(m_uNeighbor3 + m_uRootNodeIndex, m_uNeighbor3 + m_uRootNodeIndex + 1, + uUnsBytes); + + const unsigned uDoubleBytes = uMoveCount*sizeof(double); + memmove(m_dEdgeLength1 + m_uRootNodeIndex, m_dEdgeLength1 + m_uRootNodeIndex + 1, + uDoubleBytes); + memmove(m_dEdgeLength2 + m_uRootNodeIndex, m_dEdgeLength2 + m_uRootNodeIndex + 1, + uDoubleBytes); + memmove(m_dEdgeLength3 + m_uRootNodeIndex, m_dEdgeLength3 + m_uRootNodeIndex + 1, + uDoubleBytes); + + const unsigned uBoolBytes = uMoveCount*sizeof(bool); + memmove(m_bHasEdgeLength1 + m_uRootNodeIndex, m_bHasEdgeLength1 + m_uRootNodeIndex + 1, + uBoolBytes); + memmove(m_bHasEdgeLength2 + m_uRootNodeIndex, m_bHasEdgeLength2 + m_uRootNodeIndex + 1, + uBoolBytes); + memmove(m_bHasEdgeLength3 + m_uRootNodeIndex, m_bHasEdgeLength3 + m_uRootNodeIndex + 1, + uBoolBytes); + + const unsigned uPtrBytes = 
uMoveCount*sizeof(char *); + memmove(m_ptrName + m_uRootNodeIndex, m_ptrName + m_uRootNodeIndex + 1, uPtrBytes); + + --m_uNodeCount; + m_bRooted = false; + +// Fix up table entries + for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) + { +#define DEC(x) if (x != NULL_NEIGHBOR && x > m_uRootNodeIndex) --x; + DEC(m_uNeighbor1[uNodeIndex]) + DEC(m_uNeighbor2[uNodeIndex]) + DEC(m_uNeighbor3[uNodeIndex]) +#undef DEC + } + + Validate(); + } + +unsigned Tree::GetLeafParent(unsigned uNodeIndex) const + { + assert(IsLeaf(uNodeIndex)); + + if (IsRooted()) + return GetParent(uNodeIndex); + + if (m_uNeighbor1[uNodeIndex] != NULL_NEIGHBOR) + return m_uNeighbor1[uNodeIndex]; + if (m_uNeighbor2[uNodeIndex] != NULL_NEIGHBOR) + return m_uNeighbor2[uNodeIndex]; + return m_uNeighbor3[uNodeIndex]; + } + +// TODO: This is not efficient for large trees, should cache. +double Tree::GetNodeHeight(unsigned uNodeIndex) const + { + if (!IsRooted()) + Quit("Tree::GetNodeHeight: undefined unless rooted tree"); + + if (IsLeaf(uNodeIndex)) + return 0.0; + + if (m_bHasHeight[uNodeIndex]) + return m_dHeight[uNodeIndex]; + + const unsigned uLeft = GetLeft(uNodeIndex); + const unsigned uRight = GetRight(uNodeIndex); + double dLeftLength = GetEdgeLength(uNodeIndex, uLeft); + double dRightLength = GetEdgeLength(uNodeIndex, uRight); + + if (dLeftLength < 0) + dLeftLength = 0; + if (dRightLength < 0) + dRightLength = 0; + + const double dLeftHeight = dLeftLength + GetNodeHeight(uLeft); + const double dRightHeight = dRightLength + GetNodeHeight(uRight); + const double dHeight = (dLeftHeight + dRightHeight)/2; + m_bHasHeight[uNodeIndex] = true; + m_dHeight[uNodeIndex] = dHeight; + return dHeight; + } + +unsigned Tree::GetNeighborSubscript(unsigned uNodeIndex, unsigned uNeighborIndex) const + { + assert(uNodeIndex < m_uNodeCount); + assert(uNeighborIndex < m_uNodeCount); + if (uNeighborIndex == m_uNeighbor1[uNodeIndex]) + return 0; + if (uNeighborIndex == m_uNeighbor2[uNodeIndex]) + 
return 1; + if (uNeighborIndex == m_uNeighbor3[uNodeIndex]) + return 2; + return NULL_NEIGHBOR; + } + +unsigned Tree::GetNeighbor(unsigned uNodeIndex, unsigned uNeighborSubscript) const + { + switch (uNeighborSubscript) + { + case 0: + return m_uNeighbor1[uNodeIndex]; + case 1: + return m_uNeighbor2[uNodeIndex]; + case 2: + return m_uNeighbor3[uNodeIndex]; + } + Quit("Tree::GetNeighbor, sub=%u", uNeighborSubscript); + return NULL_NEIGHBOR; + } + +// TODO: check if this is a performance issue, could cache a lookup table +unsigned Tree::LeafIndexToNodeIndex(unsigned uLeafIndex) const + { + const unsigned uNodeCount = GetNodeCount(); + unsigned uLeafCount = 0; + for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) + { + if (IsLeaf(uNodeIndex)) + { + if (uLeafCount == uLeafIndex) + return uNodeIndex; + else + ++uLeafCount; + } + } + Quit("LeafIndexToNodeIndex: out of range"); + return 0; + } + +unsigned Tree::GetLeafNodeIndex(const char *ptrName) const + { + const unsigned uNodeCount = GetNodeCount(); + for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) + { + if (!IsLeaf(uNodeIndex)) + continue; + const char *ptrLeafName = GetLeafName(uNodeIndex); + if (0 == strcmp(ptrName, ptrLeafName)) + return uNodeIndex; + } + Quit("Tree::GetLeafNodeIndex, name not found"); + return 0; + } + +void Tree::Copy(const Tree &tree) + { + const unsigned uNodeCount = tree.GetNodeCount(); + InitCache(uNodeCount); + + m_uNodeCount = uNodeCount; + + const size_t UnsignedBytes = uNodeCount*sizeof(unsigned); + const size_t DoubleBytes = uNodeCount*sizeof(double); + const size_t BoolBytes = uNodeCount*sizeof(bool); + + memcpy(m_uNeighbor1, tree.m_uNeighbor1, UnsignedBytes); + memcpy(m_uNeighbor2, tree.m_uNeighbor2, UnsignedBytes); + memcpy(m_uNeighbor3, tree.m_uNeighbor3, UnsignedBytes); + + memcpy(m_Ids, tree.m_Ids, UnsignedBytes); + + memcpy(m_dEdgeLength1, tree.m_dEdgeLength1, DoubleBytes); + memcpy(m_dEdgeLength2, tree.m_dEdgeLength2, DoubleBytes); + 
memcpy(m_dEdgeLength3, tree.m_dEdgeLength3, DoubleBytes); + + memcpy(m_dHeight, tree.m_dHeight, DoubleBytes); + + memcpy(m_bHasEdgeLength1, tree.m_bHasEdgeLength1, BoolBytes); + memcpy(m_bHasEdgeLength2, tree.m_bHasEdgeLength2, BoolBytes); + memcpy(m_bHasEdgeLength3, tree.m_bHasEdgeLength3, BoolBytes); + + memcpy(m_bHasHeight, tree.m_bHasHeight, BoolBytes); + + m_uRootNodeIndex = tree.m_uRootNodeIndex; + m_bRooted = tree.m_bRooted; + + for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) + { + if (tree.IsLeaf(uNodeIndex)) + { + const char *ptrName = tree.GetLeafName(uNodeIndex); + m_ptrName[uNodeIndex] = strsave(ptrName); + } + else + m_ptrName[uNodeIndex] = 0; + } + +#if DEBUG + Validate(); +#endif + } + +// Create rooted tree from a vector description. +// Node indexes are 0..N-1 for leaves, N..2N-2 for +// internal nodes. +// Vector subscripts are i-N and have values for +// internal nodes only, but those values are node +// indexes 0..2N-2. So e.g. if N=6 and Left[2]=1, +// this means that the third internal node (node index 8) +// has the second leaf (node index 1) as its left child. +// uRoot gives the vector subscript of the root, so add N +// to get the node index. 
+void Tree::Create(unsigned uLeafCount, unsigned uRoot, const unsigned Left[], + const unsigned Right[], const float LeftLength[], const float RightLength[], + const unsigned LeafIds[], char **LeafNames) + { + Clear(); + + m_uNodeCount = 2*uLeafCount - 1; + InitCache(m_uNodeCount); + + for (unsigned uNodeIndex = 0; uNodeIndex < uLeafCount; ++uNodeIndex) + { + m_Ids[uNodeIndex] = LeafIds[uNodeIndex]; + m_ptrName[uNodeIndex] = strsave(LeafNames[uNodeIndex]); + } + + for (unsigned uNodeIndex = uLeafCount; uNodeIndex < m_uNodeCount; ++uNodeIndex) + { + unsigned v = uNodeIndex - uLeafCount; + unsigned uLeft = Left[v]; + unsigned uRight = Right[v]; + float fLeft = LeftLength[v]; + float fRight = RightLength[v]; + + m_uNeighbor2[uNodeIndex] = uLeft; + m_uNeighbor3[uNodeIndex] = uRight; + + m_bHasEdgeLength2[uNodeIndex] = true; + m_bHasEdgeLength3[uNodeIndex] = true; + + m_dEdgeLength2[uNodeIndex] = fLeft; + m_dEdgeLength3[uNodeIndex] = fRight; + + m_uNeighbor1[uLeft] = uNodeIndex; + m_uNeighbor1[uRight] = uNodeIndex; + + m_dEdgeLength1[uLeft] = fLeft; + m_dEdgeLength1[uRight] = fRight; + + m_bHasEdgeLength1[uLeft] = true; + m_bHasEdgeLength1[uRight] = true; + } + + m_bRooted = true; + m_uRootNodeIndex = uRoot + uLeafCount; + + Validate(); + } diff --git a/src/muscle/muscle3.8.31/src/phy2.cpp b/src/muscle/muscle3.8.31/src/phy2.cpp new file mode 100644 index 0000000..2abc5c7 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/phy2.cpp @@ -0,0 +1,282 @@ +#include "muscle.h" +#include "tree.h" + +#define TRACE 0 + +// Return false when done +bool PhyEnumEdges(const Tree &tree, PhyEnumEdgeState &ES) + { + unsigned uNode1 = uInsane; + + if (!ES.m_bInit) + { + if (tree.GetNodeCount() <= 1) + { + ES.m_uNodeIndex1 = NULL_NEIGHBOR; + ES.m_uNodeIndex2 = NULL_NEIGHBOR; + return false; + } + uNode1 = tree.FirstDepthFirstNode(); + ES.m_bInit = true; + } + else + { + uNode1 = tree.NextDepthFirstNode(ES.m_uNodeIndex1); + if (NULL_NEIGHBOR == uNode1) + return false; + if (tree.IsRooted() && 
tree.IsRoot(uNode1)) + { + uNode1 = tree.NextDepthFirstNode(uNode1); + if (NULL_NEIGHBOR == uNode1) + return false; + } + } + unsigned uNode2 = tree.GetParent(uNode1); + + ES.m_uNodeIndex1 = uNode1; + ES.m_uNodeIndex2 = uNode2; + return true; + } + +bool PhyEnumEdgesR(const Tree &tree, PhyEnumEdgeState &ES) + { + unsigned uNode1 = uInsane; + + if (!ES.m_bInit) + { + if (tree.GetNodeCount() <= 1) + { + ES.m_uNodeIndex1 = NULL_NEIGHBOR; + ES.m_uNodeIndex2 = NULL_NEIGHBOR; + return false; + } + uNode1 = tree.FirstDepthFirstNodeR(); + ES.m_bInit = true; + } + else + { + uNode1 = tree.NextDepthFirstNodeR(ES.m_uNodeIndex1); + if (NULL_NEIGHBOR == uNode1) + return false; + if (tree.IsRooted() && tree.IsRoot(uNode1)) + { + uNode1 = tree.NextDepthFirstNode(uNode1); + if (NULL_NEIGHBOR == uNode1) + return false; + } + } + unsigned uNode2 = tree.GetParent(uNode1); + + ES.m_uNodeIndex1 = uNode1; + ES.m_uNodeIndex2 = uNode2; + return true; + } + +static void GetLeavesSubtree(const Tree &tree, unsigned uNodeIndex1, + const unsigned uNodeIndex2, unsigned Leaves[], unsigned *ptruCount) + { + if (tree.IsLeaf(uNodeIndex1)) + { + Leaves[*ptruCount] = uNodeIndex1; + ++(*ptruCount); + return; + } + + const unsigned uLeft = tree.GetFirstNeighbor(uNodeIndex1, uNodeIndex2); + const unsigned uRight = tree.GetSecondNeighbor(uNodeIndex1, uNodeIndex2); + if (NULL_NEIGHBOR != uLeft) + GetLeavesSubtree(tree, uLeft, uNodeIndex1, Leaves, ptruCount); + if (NULL_NEIGHBOR != uRight) + GetLeavesSubtree(tree, uRight, uNodeIndex1, Leaves, ptruCount); + } + +static void PhyGetLeaves(const Tree &tree, unsigned uNodeIndex1, unsigned uNodeIndex2, + unsigned Leaves[], unsigned *ptruCount) + { + *ptruCount = 0; + GetLeavesSubtree(tree, uNodeIndex1, uNodeIndex2, Leaves, ptruCount); + } + +bool PhyEnumBiParts(const Tree &tree, PhyEnumEdgeState &ES, + unsigned Leaves1[], unsigned *ptruCount1, + unsigned Leaves2[], unsigned *ptruCount2) + { + bool bOk = PhyEnumEdges(tree, ES); + if (!bOk) + { + *ptruCount1 = 0; 
+ *ptruCount2 = 0; + return false; + } + +// Special case: in a rooted tree, both edges from the root +// give the same bipartition, so skip one of them. + if (tree.IsRooted() && tree.IsRoot(ES.m_uNodeIndex2) + && tree.GetRight(ES.m_uNodeIndex2) == ES.m_uNodeIndex1) + { + bOk = PhyEnumEdges(tree, ES); + if (!bOk) + return false; + } + + PhyGetLeaves(tree, ES.m_uNodeIndex1, ES.m_uNodeIndex2, Leaves1, ptruCount1); + PhyGetLeaves(tree, ES.m_uNodeIndex2, ES.m_uNodeIndex1, Leaves2, ptruCount2); + + if (*ptruCount1 + *ptruCount2 != tree.GetLeafCount()) + Quit("PhyEnumBiParts %u + %u != %u", + *ptruCount1, *ptruCount2, tree.GetLeafCount()); +#if DEBUG + { + for (unsigned i = 0; i < *ptruCount1; ++i) + { + if (!tree.IsLeaf(Leaves1[i])) + Quit("PhyEnumByParts: not leaf"); + for (unsigned j = 0; j < *ptruCount2; ++j) + { + if (!tree.IsLeaf(Leaves2[j])) + Quit("PhyEnumByParts: not leaf"); + if (Leaves1[i] == Leaves2[j]) + Quit("PhyEnumByParts: dupe"); + } + } + } +#endif + + return true; + } + +#if 0 +void TestBiPart() + { + SetListFileName("c:\\tmp\\lobster.log", false); + Tree tree; + TextFile fileIn("c:\\tmp\\test.phy"); + tree.FromFile(fileIn); + tree.LogMe(); + + const unsigned uNodeCount = tree.GetNodeCount(); + unsigned *Leaves1 = new unsigned[uNodeCount]; + unsigned *Leaves2 = new unsigned[uNodeCount]; + + PhyEnumEdgeState ES; + bool bDone = false; + for (;;) + { + unsigned uCount1 = uInsane; + unsigned uCount2 = uInsane; + bool bOk = PhyEnumBiParts(tree, ES, Leaves1, &uCount1, Leaves2, &uCount2); + Log("PEBP=%d ES.Init=%d ES.ni1=%d ES.ni2=%d\n", + bOk, + ES.m_bInit, + ES.m_uNodeIndex1, + ES.m_uNodeIndex2); + if (!bOk) + break; + Log("\n"); + Log("Part1: "); + for (unsigned n = 0; n < uCount1; ++n) + Log(" %d(%s)", Leaves1[n], tree.GetLeafName(Leaves1[n])); + Log("\n"); + Log("Part2: "); + for (unsigned n = 0; n < uCount2; ++n) + Log(" %d(%s)", Leaves2[n], tree.GetLeafName(Leaves2[n])); + Log("\n"); + } + } +#endif + +static void GetLeavesSubtreeExcluding(const Tree 
&tree, unsigned uNodeIndex, + unsigned uExclude, unsigned Leaves[], unsigned *ptruCount) + { + if (uNodeIndex == uExclude) + return; + + if (tree.IsLeaf(uNodeIndex)) + { + Leaves[*ptruCount] = uNodeIndex; + ++(*ptruCount); + return; + } + + const unsigned uLeft = tree.GetLeft(uNodeIndex); + const unsigned uRight = tree.GetRight(uNodeIndex); + if (NULL_NEIGHBOR != uLeft) + GetLeavesSubtreeExcluding(tree, uLeft, uExclude, Leaves, ptruCount); + if (NULL_NEIGHBOR != uRight) + GetLeavesSubtreeExcluding(tree, uRight, uExclude, Leaves, ptruCount); + } + +void GetLeavesExcluding(const Tree &tree, unsigned uNodeIndex, + unsigned uExclude, unsigned Leaves[], unsigned *ptruCount) + { + *ptruCount = 0; + GetLeavesSubtreeExcluding(tree, uNodeIndex, uExclude, Leaves, ptruCount); + } + +void GetInternalNodesInHeightOrder(const Tree &tree, unsigned NodeIndexes[]) + { + const unsigned uNodeCount = tree.GetNodeCount(); + if (uNodeCount < 3) + Quit("GetInternalNodesInHeightOrder: %u nodes, none are internal", + uNodeCount); + const unsigned uInternalNodeCount = (uNodeCount - 1)/2; + double *Heights = new double[uInternalNodeCount]; + + unsigned uIndex = 0; + for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) + { + if (tree.IsLeaf(uNodeIndex)) + continue; + NodeIndexes[uIndex] = uNodeIndex; + Heights[uIndex] = tree.GetNodeHeight(uNodeIndex); + ++uIndex; + } + if (uIndex != uInternalNodeCount) + Quit("Internal error: GetInternalNodesInHeightOrder"); + +// Simple but slow bubble sort (probably don't care about speed here) + bool bDone = false; + while (!bDone) + { + bDone = true; + for (unsigned i = 0; i < uInternalNodeCount - 1; ++i) + { + if (Heights[i] > Heights[i+1]) + { + double dTmp = Heights[i]; + Heights[i] = Heights[i+1]; + Heights[i+1] = dTmp; + + unsigned uTmp = NodeIndexes[i]; + NodeIndexes[i] = NodeIndexes[i+1]; + NodeIndexes[i+1] = uTmp; + bDone = false; + } + } + } +#if TRACE + Log("Internal node index Height\n"); + Log("------------------- --------\n"); 
+ // 1234567890123456789 123456789 + for (unsigned n = 0; n < uInternalNodeCount; ++n) + Log("%19u %9.3f\n", NodeIndexes[n], Heights[n]); +#endif + delete[] Heights; + } + +void ApplyMinEdgeLength(Tree &tree, double dMinEdgeLength) + { + const unsigned uNodeCount = tree.GetNodeCount(); + for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) + { + const unsigned uNeighborCount = tree.GetNeighborCount(uNodeIndex); + for (unsigned n = 0; n < uNeighborCount; ++n) + { + const unsigned uNeighborNodeIndex = tree.GetNeighbor(uNodeIndex, n); + if (!tree.HasEdgeLength(uNodeIndex, uNeighborNodeIndex)) + continue; + if (tree.GetEdgeLength(uNodeIndex, uNeighborNodeIndex) < dMinEdgeLength) + tree.SetEdgeLength(uNodeIndex, uNeighborNodeIndex, dMinEdgeLength); + } + } + } diff --git a/src/muscle/muscle3.8.31/src/phy3.cpp b/src/muscle/muscle3.8.31/src/phy3.cpp new file mode 100644 index 0000000..615372e --- /dev/null +++ b/src/muscle/muscle3.8.31/src/phy3.cpp @@ -0,0 +1,469 @@ +#include "muscle.h" +#include "tree.h" +#include "edgelist.h" + +#define TRACE 0 + +struct EdgeInfo + { + EdgeInfo() + { + m_bSet = false; + } +// Is data in this structure valid (i.e, has been set)? 
+ bool m_bSet; + +// Node at start of this edge + unsigned m_uNode1; + +// Node at end of this edge + unsigned m_uNode2; + +// Maximum distance from Node2 to a leaf + double m_dMaxDistToLeaf; + +// Sum of distances from Node2 to all leaves under Node2 + double m_dTotalDistToLeaves; + +// Next node on path from Node2 to most distant leaf + unsigned m_uMaxStep; + +// Most distant leaf from Node2 (used for debugging only) + unsigned m_uMostDistantLeaf; + +// Number of leaves under Node2 + unsigned m_uLeafCount; + }; + +static void RootByMidLongestSpan(const Tree &tree, EdgeInfo **EIs, + unsigned *ptruNode1, unsigned *ptruNode2, + double *ptrdLength1, double *ptrdLength2); +static void RootByMinAvgLeafDist(const Tree &tree, EdgeInfo **EIs, + unsigned *ptruNode1, unsigned *ptruNode2, + double *ptrdLength1, double *ptrdLength2); + +static void ListEIs(EdgeInfo **EIs, unsigned uNodeCount) + { + Log("Node1 Node2 MaxDist TotDist MostDist LeafCount Step\n"); + Log("----- ----- ------- ------- -------- --------- ----\n"); + // 12345 12345 1234567 1234567 12345678 123456789 + + for (unsigned uNode = 0; uNode < uNodeCount; ++uNode) + for (unsigned uNeighbor = 0; uNeighbor < 3; ++uNeighbor) + { + const EdgeInfo &EI = EIs[uNode][uNeighbor]; + if (!EI.m_bSet) + continue; + Log("%5u %5u %7.3g %7.3g %8u %9u", + EI.m_uNode1, + EI.m_uNode2, + EI.m_dMaxDistToLeaf, + EI.m_dTotalDistToLeaves, + EI.m_uMostDistantLeaf, + EI.m_uLeafCount); + if (NULL_NEIGHBOR != EI.m_uMaxStep) + Log(" %4u", EI.m_uMaxStep); + Log("\n"); + } + } + +static void CalcInfo(const Tree &tree, unsigned uNode1, unsigned uNode2, EdgeInfo **EIs) + { + const unsigned uNeighborIndex = tree.GetNeighborSubscript(uNode1, uNode2); + EdgeInfo &EI = EIs[uNode1][uNeighborIndex]; + EI.m_uNode1 = uNode1; + EI.m_uNode2 = uNode2; + + if (tree.IsLeaf(uNode2)) + { + EI.m_dMaxDistToLeaf = 0; + EI.m_dTotalDistToLeaves = 0; + EI.m_uMaxStep = NULL_NEIGHBOR; + EI.m_uMostDistantLeaf = uNode2; + EI.m_uLeafCount = 1; + EI.m_bSet = true; + 
return; + } + + double dMaxDistToLeaf = -1e29; + double dTotalDistToLeaves = 0.0; + unsigned uLeafCount = 0; + unsigned uMostDistantLeaf = NULL_NEIGHBOR; + unsigned uMaxStep = NULL_NEIGHBOR; + + const unsigned uNeighborCount = tree.GetNeighborCount(uNode2); + for (unsigned uSub = 0; uSub < uNeighborCount; ++uSub) + { + const unsigned uNode3 = tree.GetNeighbor(uNode2, uSub); + if (uNode3 == uNode1) + continue; + const EdgeInfo &EINext = EIs[uNode2][uSub]; + if (!EINext.m_bSet) + Quit("CalcInfo: internal error, dist %u->%u not known", + uNode2, uNode3); + + + uLeafCount += EINext.m_uLeafCount; + + const double dEdgeLength = tree.GetEdgeLength(uNode2, uNode3); + const double dTotalDist = EINext.m_dTotalDistToLeaves + + EINext.m_uLeafCount*dEdgeLength; + dTotalDistToLeaves += dTotalDist; + + const double dDist = EINext.m_dMaxDistToLeaf + dEdgeLength; + if (dDist > dMaxDistToLeaf) + { + dMaxDistToLeaf = dDist; + uMostDistantLeaf = EINext.m_uMostDistantLeaf; + uMaxStep = uNode3; + } + } + if (NULL_NEIGHBOR == uMaxStep || NULL_NEIGHBOR == uMostDistantLeaf || + 0 == uLeafCount) + Quit("CalcInfo: internal error 2"); + + const double dThisDist = tree.GetEdgeLength(uNode1, uNode2); + EI.m_dMaxDistToLeaf = dMaxDistToLeaf; + EI.m_dTotalDistToLeaves = dTotalDistToLeaves; + EI.m_uMaxStep = uMaxStep; + EI.m_uMostDistantLeaf = uMostDistantLeaf; + EI.m_uLeafCount = uLeafCount; + EI.m_bSet = true; + } + +static bool Known(const Tree &tree, EdgeInfo **EIs, unsigned uNodeFrom, + unsigned uNodeTo) + { + const unsigned uSub = tree.GetNeighborSubscript(uNodeFrom, uNodeTo); + return EIs[uNodeFrom][uSub].m_bSet; + } + +static bool AllKnownOut(const Tree &tree, EdgeInfo **EIs, unsigned uNodeFrom, + unsigned uNodeTo) + { + const unsigned uNeighborCount = tree.GetNeighborCount(uNodeTo); + for (unsigned uSub = 0; uSub < uNeighborCount; ++uSub) + { + unsigned uNeighborIndex = tree.GetNeighbor(uNodeTo, uSub); + if (uNeighborIndex == uNodeFrom) + continue; + if (!EIs[uNodeTo][uSub].m_bSet) + 
return false; + } + return true; + } + +void FindRoot(const Tree &tree, unsigned *ptruNode1, unsigned *ptruNode2, + double *ptrdLength1, double *ptrdLength2, + ROOT RootMethod) + { +#if TRACE + tree.LogMe(); +#endif + if (tree.IsRooted()) + Quit("FindRoot: tree already rooted"); + + const unsigned uNodeCount = tree.GetNodeCount(); + const unsigned uLeafCount = tree.GetLeafCount(); + + if (uNodeCount < 2) + Quit("Root: don't support trees with < 2 edges"); + + EdgeInfo **EIs = new EdgeInfo *[uNodeCount]; + for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) + EIs[uNodeIndex] = new EdgeInfo[3]; + + EdgeList Edges; + for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) + if (tree.IsLeaf(uNodeIndex)) + { + unsigned uParent = tree.GetNeighbor1(uNodeIndex); + Edges.Add(uParent, uNodeIndex); + } + +#if TRACE + Log("Edges: "); + Edges.LogMe(); +#endif + +// Main loop: iterate until all distances known + double dAllMaxDist = -1e20; + unsigned uMaxFrom = NULL_NEIGHBOR; + unsigned uMaxTo = NULL_NEIGHBOR; + for (;;) + { + EdgeList NextEdges; + +#if TRACE + Log("\nTop of main loop\n"); + Log("Edges: "); + Edges.LogMe(); + Log("MDs:\n"); + ListEIs(EIs, uNodeCount); +#endif + + // For all edges + const unsigned uEdgeCount = Edges.GetCount(); + if (0 == uEdgeCount) + break; + for (unsigned n = 0; n < uEdgeCount; ++n) + { + unsigned uNodeFrom; + unsigned uNodeTo; + Edges.GetEdge(n, &uNodeFrom, &uNodeTo); + + CalcInfo(tree, uNodeFrom, uNodeTo, EIs); +#if TRACE + Log("Edge %u -> %u\n", uNodeFrom, uNodeTo); +#endif + const unsigned uNeighborCount = tree.GetNeighborCount(uNodeFrom); + for (unsigned i = 0; i < uNeighborCount; ++i) + { + const unsigned uNeighborIndex = tree.GetNeighbor(uNodeFrom, i); + if (!Known(tree, EIs, uNeighborIndex, uNodeFrom) && + AllKnownOut(tree, EIs, uNeighborIndex, uNodeFrom)) + NextEdges.Add(uNeighborIndex, uNodeFrom); + } + } + Edges.Copy(NextEdges); + } + +#if TRACE + ListEIs(EIs, uNodeCount); +#endif + + switch 
(RootMethod) + { + case ROOT_MidLongestSpan: + RootByMidLongestSpan(tree, EIs, ptruNode1, ptruNode2, + ptrdLength1, ptrdLength2); + break; + + case ROOT_MinAvgLeafDist: + RootByMinAvgLeafDist(tree, EIs, ptruNode1, ptruNode2, + ptrdLength1, ptrdLength2); + break; + + default: + Quit("Invalid RootMethod=%d", RootMethod); + } + + for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) + delete[] EIs[uNodeIndex]; + delete[] EIs; + } + +static void RootByMidLongestSpan(const Tree &tree, EdgeInfo **EIs, + unsigned *ptruNode1, unsigned *ptruNode2, + double *ptrdLength1, double *ptrdLength2) + { + const unsigned uNodeCount = tree.GetNodeCount(); + + unsigned uLeaf1 = NULL_NEIGHBOR; + unsigned uMostDistantLeaf = NULL_NEIGHBOR; + double dMaxDist = -VERY_LARGE_DOUBLE; + for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) + { + if (!tree.IsLeaf(uNodeIndex)) + continue; + + const unsigned uNode2 = tree.GetNeighbor1(uNodeIndex); + if (NULL_NEIGHBOR == uNode2) + Quit("RootByMidLongestSpan: internal error 0"); + const double dEdgeLength = tree.GetEdgeLength(uNodeIndex, uNode2); + const EdgeInfo &EI = EIs[uNodeIndex][0]; + if (!EI.m_bSet) + Quit("RootByMidLongestSpan: internal error 1"); + if (EI.m_uNode1 != uNodeIndex || EI.m_uNode2 != uNode2) + Quit("RootByMidLongestSpan: internal error 2"); + const double dSpanLength = dEdgeLength + EI.m_dMaxDistToLeaf; + if (dSpanLength > dMaxDist) + { + dMaxDist = dSpanLength; + uLeaf1 = uNodeIndex; + uMostDistantLeaf = EI.m_uMostDistantLeaf; + } + } + + if (NULL_NEIGHBOR == uLeaf1) + Quit("RootByMidLongestSpan: internal error 3"); + + const double dTreeHeight = dMaxDist/2.0; + unsigned uNode1 = uLeaf1; + unsigned uNode2 = tree.GetNeighbor1(uLeaf1); + double dAccumSpanLength = 0; + +#if TRACE + Log("RootByMidLongestSpan: span=%u", uLeaf1); +#endif + + for (;;) + { + const double dEdgeLength = tree.GetEdgeLength(uNode1, uNode2); +#if TRACE + Log("->%u(%g;%g)", uNode2, dEdgeLength, dAccumSpanLength); +#endif + if 
(dAccumSpanLength + dEdgeLength >= dTreeHeight) + { + *ptruNode1 = uNode1; + *ptruNode2 = uNode2; + *ptrdLength1 = dTreeHeight - dAccumSpanLength; + *ptrdLength2 = dEdgeLength - *ptrdLength1; +#if TRACE + { + const EdgeInfo &EI = EIs[uLeaf1][0]; + Log("...\n"); + Log("Midpoint: Leaf1=%u Leaf2=%u Node1=%u Node2=%u Length1=%g Length2=%g\n", + uLeaf1, EI.m_uMostDistantLeaf, *ptruNode1, *ptruNode2, *ptrdLength1, *ptrdLength2); + } +#endif + return; + } + + if (tree.IsLeaf(uNode2)) + Quit("RootByMidLongestSpan: internal error 4"); + + dAccumSpanLength += dEdgeLength; + const unsigned uSub = tree.GetNeighborSubscript(uNode1, uNode2); + const EdgeInfo &EI = EIs[uNode1][uSub]; + if (!EI.m_bSet) + Quit("RootByMidLongestSpan: internal error 5"); + + uNode1 = uNode2; + uNode2 = EI.m_uMaxStep; + } + } + +/*** +Root by balancing average distance to leaves. +The root is a point p such that the average +distance to leaves to the left of p is the +same as the to the right. + +This is the method used by CLUSTALW, which +was originally used in PROFILEWEIGHT: + + Thompson et al. (1994) CABIOS (10) 1, 19-29. +***/ + +static void RootByMinAvgLeafDist(const Tree &tree, EdgeInfo **EIs, + unsigned *ptruNode1, unsigned *ptruNode2, + double *ptrdLength1, double *ptrdLength2) + { + const unsigned uNodeCount = tree.GetNodeCount(); + const unsigned uLeafCount = tree.GetLeafCount(); + unsigned uNode1 = NULL_NEIGHBOR; + unsigned uNode2 = NULL_NEIGHBOR; + double dMinHeight = VERY_LARGE_DOUBLE; + double dBestLength1 = VERY_LARGE_DOUBLE; + double dBestLength2 = VERY_LARGE_DOUBLE; + + for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) + { + const unsigned uNeighborCount = tree.GetNeighborCount(uNodeIndex); + for (unsigned uSub = 0; uSub < uNeighborCount; ++uSub) + { + const unsigned uNeighborIndex = tree.GetNeighbor(uNodeIndex, uSub); + + // Avoid visiting same edge a second time in reversed order. 
+ if (uNeighborIndex < uNodeIndex) + continue; + + const unsigned uSubRev = tree.GetNeighborSubscript(uNeighborIndex, uNodeIndex); + if (NULL_NEIGHBOR == uSubRev) + Quit("RootByMinAvgLeafDist, internal error 1"); + + // Get info for edges Node1->Node2 and Node2->Node1 (reversed) + const EdgeInfo &EI = EIs[uNodeIndex][uSub]; + const EdgeInfo &EIRev = EIs[uNeighborIndex][uSubRev]; + + if (EI.m_uNode1 != uNodeIndex || EI.m_uNode2 != uNeighborIndex || + EIRev.m_uNode1 != uNeighborIndex || EIRev.m_uNode2 != uNodeIndex) + Quit("RootByMinAvgLeafDist, internal error 2"); + if (!EI.m_bSet) + Quit("RootByMinAvgLeafDist, internal error 3"); + if (uLeafCount != EI.m_uLeafCount + EIRev.m_uLeafCount) + Quit("RootByMinAvgLeafDist, internal error 4"); + + const double dEdgeLength = tree.GetEdgeLength(uNodeIndex, uNeighborIndex); + if (dEdgeLength != tree.GetEdgeLength(uNeighborIndex, uNodeIndex)) + Quit("RootByMinAvgLeafDist, internal error 5"); + + // Consider point p on edge 12 in tree (1=Node, 2=Neighbor). + // + // ----- ---- + // | | + // 1----p--2 + // | | + // ----- ---- + // + // Define: + // ADLp = average distance to leaves to left of point p. + // ADRp = average distance to leaves to right of point p. + // L = edge length = distance 12 + // x = distance 1p + // So distance p2 = L - x. + // Average distance from p to leaves on left of p is: + // ADLp = ADL1 + x + // Average distance from p to leaves on right of p is: + // ADRp = ADR2 + (L - x) + // To be a root, we require these two distances to be equal, + // ADLp = ADRp + // ADL1 + x = ADR2 + (L - x) + // Solving for x, + // x = (ADR2 - ADL1 + L)/2 + // If 0 <= x <= L, we can place the root on edge 12. 
+ + const double ADL1 = EI.m_dTotalDistToLeaves / EI.m_uLeafCount; + const double ADR2 = EIRev.m_dTotalDistToLeaves / EIRev.m_uLeafCount; + + const double x = (ADR2 - ADL1 + dEdgeLength)/2.0; + if (x >= 0 && x <= dEdgeLength) + { + const double dLength1 = x; + const double dLength2 = dEdgeLength - x; + const double dHeight1 = EI.m_dMaxDistToLeaf + dLength1; + const double dHeight2 = EIRev.m_dMaxDistToLeaf + dLength2; + const double dHeight = dHeight1 >= dHeight2 ? dHeight1 : dHeight2; +#if TRACE + Log("Candidate root Node1=%u Node2=%u Height=%g\n", + uNodeIndex, uNeighborIndex, dHeight); +#endif + if (dHeight < dMinHeight) + { + uNode1 = uNodeIndex; + uNode2 = uNeighborIndex; + dBestLength1 = dLength1; + dBestLength2 = dLength2; + dMinHeight = dHeight; + } + } + } + } + + if (NULL_NEIGHBOR == uNode1 || NULL_NEIGHBOR == uNode2) + Quit("RootByMinAvgLeafDist, internal error 6"); + +#if TRACE + Log("Best root Node1=%u Node2=%u Length1=%g Length2=%g Height=%g\n", + uNode1, uNode2, dBestLength1, dBestLength2, dMinHeight); +#endif + + *ptruNode1 = uNode1; + *ptruNode2 = uNode2; + *ptrdLength1 = dBestLength1; + *ptrdLength2 = dBestLength2; + } + +void FixRoot(Tree &tree, ROOT Method) + { + if (!tree.IsRooted()) + Quit("FixRoot: expecting rooted tree"); + + // Pseudo-root: keep root assigned by clustering + if (ROOT_Pseudo == Method) + return; + + tree.UnrootByDeletingRoot(); + tree.RootUnrootedTree(Method); + } diff --git a/src/muscle/muscle3.8.31/src/phy4.cpp b/src/muscle/muscle3.8.31/src/phy4.cpp new file mode 100644 index 0000000..88269fa --- /dev/null +++ b/src/muscle/muscle3.8.31/src/phy4.cpp @@ -0,0 +1,295 @@ +#include "muscle.h" +#include "tree.h" +#include + +#define TRACE 0 + +void ClusterByHeight(const Tree &tree, double dMaxHeight, unsigned Subtrees[], + unsigned *ptruSubtreeCount) + { + if (!tree.IsRooted()) + Quit("ClusterByHeight: requires rooted tree"); + +#if TRACE + Log("ClusterByHeight, max height=%g\n", dMaxHeight); +#endif + + unsigned uSubtreeCount = 
0; + const unsigned uNodeCount = tree.GetNodeCount(); + for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) + { + if (tree.IsRoot(uNodeIndex)) + continue; + unsigned uParent = tree.GetParent(uNodeIndex); + double dHeight = tree.GetNodeHeight(uNodeIndex); + double dParentHeight = tree.GetNodeHeight(uParent); + +#if TRACE + Log("Node %3u Height %5.2f ParentHeight %5.2f\n", + uNodeIndex, dHeight, dParentHeight); +#endif + if (dParentHeight > dMaxHeight && dHeight <= dMaxHeight) + { + Subtrees[uSubtreeCount] = uNodeIndex; +#if TRACE + Log("Subtree[%u]=%u\n", uSubtreeCount, uNodeIndex); +#endif + ++uSubtreeCount; + } + } + *ptruSubtreeCount = uSubtreeCount; + } + +static void ClusterBySubfamCount_Iteration(const Tree &tree, unsigned Subfams[], + unsigned uCount) + { +// Find highest child node of current set of subfamilies. + double dHighestHeight = -1e20; + int iParentSubscript = -1; + + for (int n = 0; n < (int) uCount; ++n) + { + const unsigned uNodeIndex = Subfams[n]; + if (tree.IsLeaf(uNodeIndex)) + continue; + + const unsigned uLeft = tree.GetLeft(uNodeIndex); + const double dHeightLeft = tree.GetNodeHeight(uLeft); + if (dHeightLeft > dHighestHeight) + { + dHighestHeight = dHeightLeft; + iParentSubscript = n; + } + + const unsigned uRight = tree.GetRight(uNodeIndex); + const double dHeightRight = tree.GetNodeHeight(uRight); + if (dHeightRight > dHighestHeight) + { + dHighestHeight = dHeightRight; + iParentSubscript = n; + } + } + + if (-1 == iParentSubscript) + Quit("CBSFCIter: failed to find highest child"); + + const unsigned uNodeIndex = Subfams[iParentSubscript]; + const unsigned uLeft = tree.GetLeft(uNodeIndex); + const unsigned uRight = tree.GetRight(uNodeIndex); + +// Delete parent by replacing with left child + Subfams[iParentSubscript] = uLeft; + +// Append right child to list + Subfams[uCount] = uRight; + +#if TRACE + { + Log("Iter %3u:", uCount); + for (unsigned n = 0; n < uCount; ++n) + Log(" %u", Subfams[n]); + Log("\n"); + } +#endif 
+ } + +// Divide a tree containing N leaves into k families by +// cutting the tree at a horizontal line at some height. +// Each internal node defines a height for the cut, +// considering all internal nodes enumerates all distinct +// cuts. Visit internal nodes in decreasing order of height. +// Visiting the node corresponds to moving the horizontal +// line down to cut the tree at the height of that node. +// We consider the cut to be "infinitesimally below" +// the node, so the effect is to remove the current node +// from the list of subfamilies and add its two children. +// We must visit a parent before its children (so care may +// be needed to handle zero edge lengths properly). +// We assume that N is small, and write dumb O(N^2) code. +// More efficient strategies are possible for large N +// by maintaining a list of nodes sorted by height. +// On return Subfams[] holds one node index per subfamily and +// *ptruSubfamCount holds the number of entries written. +void ClusterBySubfamCount(const Tree &tree, unsigned uSubfamCount, + unsigned Subfams[], unsigned *ptruSubfamCount) + { + const unsigned uNodeCount = tree.GetNodeCount(); + const unsigned uLeafCount = (uNodeCount + 1)/2; + +// Special case: empty tree + if (0 == uNodeCount) + { + *ptruSubfamCount = 0; + return; + } + +// Special case: at least as many subfamilies requested as leaves, +// so one entry per leaf is returned. +// NOTE(review): Subfams[n] = n assumes leaves are nodes 0..uLeafCount-1 -- confirm. + if (uSubfamCount >= uLeafCount) + { + for (unsigned n = 0; n < uLeafCount; ++n) + Subfams[n] = n; + *ptruSubfamCount = uLeafCount; + return; + } + +// Initialize list of subfamilies to be root + Subfams[0] = tree.GetRootNodeIndex(); + +// Iterate: each pass replaces one subfamily by its two children, +// growing the list from i to i+1 entries. + for (unsigned i = 1; i < uSubfamCount; ++i) + ClusterBySubfamCount_Iteration(tree, Subfams, i); + + *ptruSubfamCount = uSubfamCount; + } + +// Append the leaves below uNodeIndex to Leaves[] (left subtree first), +// advancing uLeafCount for each leaf stored. +static void GetLeavesRecurse(const Tree &tree, unsigned uNodeIndex, + unsigned Leaves[], unsigned &uLeafCount /* in-out */) + { + if (tree.IsLeaf(uNodeIndex)) + { + Leaves[uLeafCount] = uNodeIndex; + ++uLeafCount; + return; + } + + const unsigned uLeft = tree.GetLeft(uNodeIndex); + const unsigned uRight = tree.GetRight(uNodeIndex); + + GetLeavesRecurse(tree, uLeft, Leaves,
uLeafCount); + GetLeavesRecurse(tree, uRight, Leaves, uLeafCount); + } + +void GetLeaves(const Tree &tree, unsigned uNodeIndex, unsigned Leaves[], + unsigned *ptruLeafCount) + { + unsigned uLeafCount = 0; + GetLeavesRecurse(tree, uNodeIndex, Leaves, uLeafCount); + *ptruLeafCount = uLeafCount; + } + +void Tree::PruneTree(const Tree &tree, unsigned Subfams[], + unsigned uSubfamCount) + { + if (!tree.IsRooted()) + Quit("Tree::PruneTree: requires rooted tree"); + + Clear(); + + m_uNodeCount = 2*uSubfamCount - 1; + InitCache(m_uNodeCount); + + const unsigned uUnprunedNodeCount = tree.GetNodeCount(); + + unsigned *uUnprunedToPrunedIndex = new unsigned[uUnprunedNodeCount]; + unsigned *uPrunedToUnprunedIndex = new unsigned[m_uNodeCount]; + + for (unsigned n = 0; n < uUnprunedNodeCount; ++n) + uUnprunedToPrunedIndex[n] = NULL_NEIGHBOR; + + for (unsigned n = 0; n < m_uNodeCount; ++n) + uPrunedToUnprunedIndex[n] = NULL_NEIGHBOR; + +// Create mapping between unpruned and pruned node indexes + unsigned uInternalNodeIndex = uSubfamCount; + for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex) + { + unsigned uUnprunedNodeIndex = Subfams[uSubfamIndex]; + uUnprunedToPrunedIndex[uUnprunedNodeIndex] = uSubfamIndex; + uPrunedToUnprunedIndex[uSubfamIndex] = uUnprunedNodeIndex; + for (;;) + { + uUnprunedNodeIndex = tree.GetParent(uUnprunedNodeIndex); + if (tree.IsRoot(uUnprunedNodeIndex)) + break; + + // Already visited this node? 
+ if (NULL_NEIGHBOR != uUnprunedToPrunedIndex[uUnprunedNodeIndex]) + break; + + uUnprunedToPrunedIndex[uUnprunedNodeIndex] = uInternalNodeIndex; + uPrunedToUnprunedIndex[uInternalNodeIndex] = uUnprunedNodeIndex; + + ++uInternalNodeIndex; + } + } + + const unsigned uUnprunedRootIndex = tree.GetRootNodeIndex(); + uUnprunedToPrunedIndex[uUnprunedRootIndex] = uInternalNodeIndex; + uPrunedToUnprunedIndex[uInternalNodeIndex] = uUnprunedRootIndex; + +#if TRACE + { + Log("Pruned to unpruned:\n"); + for (unsigned i = 0; i < m_uNodeCount; ++i) + Log(" [%u]=%u", i, uPrunedToUnprunedIndex[i]); + Log("\n"); + Log("Unpruned to pruned:\n"); + for (unsigned i = 0; i < uUnprunedNodeCount; ++i) + { + unsigned n = uUnprunedToPrunedIndex[i]; + if (n != NULL_NEIGHBOR) + Log(" [%u]=%u", i, n); + } + Log("\n"); + } +#endif + + if (uInternalNodeIndex != m_uNodeCount - 1) + Quit("Tree::PruneTree, Internal error"); + +// Nodes 0, 1 ... are the leaves + for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex) + { + char szName[32]; + sprintf(szName, "Subfam_%u", uSubfamIndex + 1); + m_ptrName[uSubfamIndex] = strsave(szName); + } + + for (unsigned uPrunedNodeIndex = uSubfamCount; uPrunedNodeIndex < m_uNodeCount; + ++uPrunedNodeIndex) + { + unsigned uUnprunedNodeIndex = uPrunedToUnprunedIndex[uPrunedNodeIndex]; + + const unsigned uUnprunedLeft = tree.GetLeft(uUnprunedNodeIndex); + const unsigned uUnprunedRight = tree.GetRight(uUnprunedNodeIndex); + + const unsigned uPrunedLeft = uUnprunedToPrunedIndex[uUnprunedLeft]; + const unsigned uPrunedRight = uUnprunedToPrunedIndex[uUnprunedRight]; + + const double dLeftLength = + tree.GetEdgeLength(uUnprunedNodeIndex, uUnprunedLeft); + const double dRightLength = + tree.GetEdgeLength(uUnprunedNodeIndex, uUnprunedRight); + + m_uNeighbor2[uPrunedNodeIndex] = uPrunedLeft; + m_uNeighbor3[uPrunedNodeIndex] = uPrunedRight; + + m_dEdgeLength1[uPrunedLeft] = dLeftLength; + m_dEdgeLength1[uPrunedRight] = dRightLength; + + 
m_uNeighbor1[uPrunedLeft] = uPrunedNodeIndex; + m_uNeighbor1[uPrunedRight] = uPrunedNodeIndex; + + m_bHasEdgeLength1[uPrunedLeft] = true; + m_bHasEdgeLength1[uPrunedRight] = true; + + m_dEdgeLength2[uPrunedNodeIndex] = dLeftLength; + m_dEdgeLength3[uPrunedNodeIndex] = dRightLength; + + m_bHasEdgeLength2[uPrunedNodeIndex] = true; + m_bHasEdgeLength3[uPrunedNodeIndex] = true; + } + + m_uRootNodeIndex = uUnprunedToPrunedIndex[uUnprunedRootIndex]; + + m_bRooted = true; + + Validate(); + + delete[] uUnprunedToPrunedIndex; + } + +void LeafIndexesToIds(const Tree &tree, const unsigned Leaves[], unsigned uCount, + unsigned Ids[]) + { + for (unsigned n = 0; n < uCount; ++n) + Ids[n] = tree.GetLeafId(Leaves[n]); + } diff --git a/src/muscle/muscle3.8.31/src/phyfromclust.cpp b/src/muscle/muscle3.8.31/src/phyfromclust.cpp new file mode 100644 index 0000000..2f44fa7 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/phyfromclust.cpp @@ -0,0 +1,95 @@ +#include "muscle.h" +#include "tree.h" +#include "clust.h" + +void Tree::InitCache(unsigned uCacheCount) + { + m_uCacheCount = uCacheCount; + + m_uNeighbor1 = new unsigned[m_uCacheCount]; + m_uNeighbor2 = new unsigned[m_uCacheCount]; + m_uNeighbor3 = new unsigned[m_uCacheCount]; + + m_Ids = new unsigned[m_uCacheCount]; + + m_dEdgeLength1 = new double[m_uCacheCount]; + m_dEdgeLength2 = new double[m_uCacheCount]; + m_dEdgeLength3 = new double[m_uCacheCount]; + m_dHeight = new double[m_uCacheCount]; + + m_bHasEdgeLength1 = new bool[m_uCacheCount]; + m_bHasEdgeLength2 = new bool[m_uCacheCount]; + m_bHasEdgeLength3 = new bool[m_uCacheCount]; + m_bHasHeight = new bool[m_uCacheCount]; + + m_ptrName = new char *[m_uCacheCount]; + + for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) + { + m_uNeighbor1[uNodeIndex] = NULL_NEIGHBOR; + m_uNeighbor2[uNodeIndex] = NULL_NEIGHBOR; + m_uNeighbor3[uNodeIndex] = NULL_NEIGHBOR; + m_bHasEdgeLength1[uNodeIndex] = false; + m_bHasEdgeLength2[uNodeIndex] = false; + 
m_bHasEdgeLength3[uNodeIndex] = false; + m_bHasHeight[uNodeIndex] = false; + m_dEdgeLength1[uNodeIndex] = dInsane; + m_dEdgeLength2[uNodeIndex] = dInsane; + m_dEdgeLength3[uNodeIndex] = dInsane; + m_dHeight[uNodeIndex] = dInsane; + m_ptrName[uNodeIndex] = 0; + m_Ids[uNodeIndex] = uInsane; + } + } + +void Tree::FromClust(Clust &C) + { + Clear(); + + m_uNodeCount = C.GetNodeCount(); + InitCache(m_uNodeCount); + +// Cluster is always rooted. An unrooted cluster +// is represented by a pseudo-root, which we fix later. + m_bRooted = true; + const unsigned uRoot = C.GetRootNodeIndex(); + m_uRootNodeIndex = uRoot; + m_uNeighbor1[uRoot] = NULL_NEIGHBOR; + m_bHasEdgeLength1[uRoot] = false; + + for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) + { + if (C.IsLeaf(uNodeIndex)) + { + const char *ptrName = C.GetNodeName(uNodeIndex); + m_ptrName[uNodeIndex] = strsave(ptrName); + m_Ids[uNodeIndex] = C.GetNodeId(uNodeIndex); + continue; + } + + const unsigned uLeft = C.GetLeftIndex(uNodeIndex); + const unsigned uRight = C.GetRightIndex(uNodeIndex); + + const double dLeftLength = C.GetLength(uLeft); + const double dRightLength = C.GetLength(uRight); + + m_uNeighbor2[uNodeIndex] = uLeft; + m_uNeighbor3[uNodeIndex] = uRight; + + m_dEdgeLength1[uLeft] = dLeftLength; + m_dEdgeLength1[uRight] = dRightLength; + + m_uNeighbor1[uLeft] = uNodeIndex; + m_uNeighbor1[uRight] = uNodeIndex; + + m_bHasEdgeLength1[uLeft] = true; + m_bHasEdgeLength1[uRight] = true; + + m_dEdgeLength2[uNodeIndex] = dLeftLength; + m_dEdgeLength3[uNodeIndex] = dRightLength; + + m_bHasEdgeLength2[uNodeIndex] = true; + m_bHasEdgeLength3[uNodeIndex] = true; + } + Validate(); + } diff --git a/src/muscle/muscle3.8.31/src/phyfromfile.cpp b/src/muscle/muscle3.8.31/src/phyfromfile.cpp new file mode 100644 index 0000000..12f78b4 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/phyfromfile.cpp @@ -0,0 +1,272 @@ +#include "muscle.h" +#include "tree.h" +#include "textfile.h" + +#define TRACE 0 + +// Tokens in 
Newick files are: +// ( ) : , ; +// string +// 'string' +// "string" +// [ comment ] +// +// We can't safely distinguish between identifiers and floating point +// numbers at the lexical level (because identifiers may be numeric, +// or start with digits), so both edge lengths and identifiers are +// returned as strings. + +const char *Tree::NTTStr(NEWICK_TOKEN_TYPE NTT) const + { + switch (NTT) + { +#define c(x) case NTT_##x: return #x; + c(Unknown) + c(Lparen) + c(Rparen) + c(Colon) + c(Comma) + c(Semicolon) + c(String) + c(SingleQuotedString) + c(DoubleQuotedString) + c(Comment) +#undef c + } + return "??"; + } + +NEWICK_TOKEN_TYPE Tree::GetToken(TextFile &File, char szToken[], unsigned uBytes) const + { +// Skip leading white space + File.SkipWhite(); + + char c; + File.GetCharX(c); + +// In case a single-character token + szToken[0] = c; + szToken[1] = 0; + + unsigned uBytesCopied = 0; + NEWICK_TOKEN_TYPE TT; + switch (c) + { + case '(': + return NTT_Lparen; + + case ')': + return NTT_Rparen; + + case ':': + return NTT_Colon; + + case ';': + return NTT_Semicolon; + + case ',': + return NTT_Comma; + + case '\'': + TT = NTT_SingleQuotedString; + File.GetCharX(c); + break; + + case '"': + TT = NTT_DoubleQuotedString; + File.GetCharX(c); + break; + + case '[': + TT = NTT_Comment; + break; + + default: + TT = NTT_String; + break; + } + + for (;;) + { + if (TT != NTT_Comment) + { + if (uBytesCopied < uBytes - 2) + { + szToken[uBytesCopied++] = c; + szToken[uBytesCopied] = 0; + } + else + Quit("Tree::GetToken: input buffer too small, token so far='%s'", szToken); + } + bool bEof = File.GetChar(c); + if (bEof) + return TT; + + switch (TT) + { + case NTT_String: + if (0 != strchr("():;,", c)) + { + File.PushBack(c); + return NTT_String; + } + if (isspace(c)) + return NTT_String; + break; + + case NTT_SingleQuotedString: + if ('\'' == c) + return NTT_String; + break; + + case NTT_DoubleQuotedString: + if ('"' == c) + return NTT_String; + break; + + case NTT_Comment: + 
if (']' == c) + return GetToken(File, szToken, uBytes); + break; + + default: + Quit("Tree::GetToken, invalid TT=%u", TT); + } + } + } + +// NOTE: this hack must come after definition of Tree::GetToken. +#if TRACE +#define GetToken GetTokenVerbose +#endif + +void Tree::FromFile(TextFile &File) + { +// Assume rooted. +// If we discover that it is unrooted, will convert on the fly. + CreateRooted(); + + double dEdgeLength; + bool bEdgeLength = GetGroupFromFile(File, 0, &dEdgeLength); + +// Next token should be either ';' for rooted tree or ',' for unrooted. + char szToken[16]; + NEWICK_TOKEN_TYPE NTT = GetToken(File, szToken, sizeof(szToken)); + +// If rooted, all done. + if (NTT_Semicolon == NTT) + { + if (bEdgeLength) + Log(" *** Warning *** edge length on root group in Newick file %s\n", + File.GetFileName()); + Validate(); + return; + } + + if (NTT_Comma != NTT) + Quit("Tree::FromFile, expected ';' or ',', got '%s'", szToken); + + const unsigned uThirdNode = UnrootFromFile(); + bEdgeLength = GetGroupFromFile(File, uThirdNode, &dEdgeLength); + if (bEdgeLength) + SetEdgeLength(0, uThirdNode, dEdgeLength); + Validate(); + } + +// Return true if edge length for this group. +bool Tree::GetGroupFromFile(TextFile &File, unsigned uNodeIndex, + double *ptrdEdgeLength) + { + char szToken[1024]; + NEWICK_TOKEN_TYPE NTT = GetToken(File, szToken, sizeof(szToken)); + +// Group is either leaf name or (left, right). + if (NTT_String == NTT) + { + SetLeafName(uNodeIndex, szToken); +#if TRACE + Log("Group is leaf '%s'\n", szToken); +#endif + } + else if (NTT_Lparen == NTT) + { + const unsigned uLeft = AppendBranch(uNodeIndex); + const unsigned uRight = uLeft + 1; + + // Left sub-group... 
+#if TRACE + Log("Got '(', group is compound, expect left sub-group\n"); +#endif + double dEdgeLength; + bool bLeftLength = GetGroupFromFile(File, uLeft, &dEdgeLength); +#if TRACE + if (bLeftLength) + Log("Edge length for left sub-group: %.3g\n", dEdgeLength); + else + Log("No edge length for left sub-group\n"); +#endif + if (bLeftLength) + SetEdgeLength(uNodeIndex, uLeft, dEdgeLength); + + // ... then comma ... +#if TRACE + Log("Expect comma\n"); +#endif + NTT = GetToken(File, szToken, sizeof(szToken)); + if (NTT_Comma != NTT) + Quit("Tree::GetGroupFromFile, expected ',', got '%s'", szToken); + + // ...then right sub-group... +#if TRACE + Log("Expect right sub-group\n"); +#endif + bool bRightLength = GetGroupFromFile(File, uRight, &dEdgeLength); + if (bRightLength) + SetEdgeLength(uNodeIndex, uRight, dEdgeLength); + +#if TRACE + if (bRightLength) + Log("Edge length for right sub-group: %.3g\n", dEdgeLength); + else + Log("No edge length for right sub-group\n"); +#endif + + // ... then closing parenthesis. +#if TRACE + Log("Expect closing parenthesis (or comma if > 2-ary)\n"); +#endif + NTT = GetToken(File, szToken, sizeof(szToken)); + if (NTT_Rparen == NTT) + ; + else if (NTT_Comma == NTT) + { + File.PushBack(','); + return false; + } + else + Quit("Tree::GetGroupFromFile, expected ')' or ',', got '%s'", szToken); + } + else + Quit("Tree::GetGroupFromFile, expected '(' or leaf name, got '%s'", + szToken); + +// Group may optionally be followed by edge length. 
+ bool bEof = File.SkipWhiteX(); + if (bEof) + return false; + char c; + File.GetCharX(c); +#if TRACE + Log("Character following group, could be colon, is '%c'\n", c); +#endif + if (':' == c) + { + NTT = GetToken(File, szToken, sizeof(szToken)); + if (NTT_String != NTT) + Quit("Tree::GetGroupFromFile, expected edge length, got '%s'", szToken); + *ptrdEdgeLength = atof(szToken); + return true; + } + File.PushBack(c); + return false; + } diff --git a/src/muscle/muscle3.8.31/src/physeq.cpp b/src/muscle/muscle3.8.31/src/physeq.cpp new file mode 100644 index 0000000..522f8b1 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/physeq.cpp @@ -0,0 +1,128 @@ +#include "muscle.h" +#include "msa.h" +#include "textfile.h" + +const int BLOCKSIZE = 60; + +static char FixChar(char c) + { + switch (c) + { + case '(': + case ')': + case '[': + case ']': + case ':': + case ';': + case ',': + return '_'; + } + if (!isprint(c)) + return '_'; + return c; + } + +static void FixName(char Name[]) + { + while (char c = *Name) + *Name++ = FixChar(c); + } + +void MSA::ToPhySequentialFile(TextFile &File) const + { + const unsigned SeqCount = GetSeqCount(); + const unsigned ColCount = GetColCount(); + + File.PutFormat("%d %d\n", SeqCount, ColCount); + + if (0 == ColCount) + return; + + for (unsigned Seq = 0; Seq < SeqCount; ++Seq) + { + char Name[11]; + const char *ptrName = GetSeqName(Seq); + size_t n = strlen(ptrName); + if (n > 10) + n = 10; + memcpy(Name, ptrName, n); + Name[n] = 0; + FixName(Name); + File.PutFormat("%-10.10s", Name); + + int BlockIndex = 0; + int Col = 0; + for (;;) + { + const unsigned MaxCols = (BlockIndex == 0) ? 
(BLOCKSIZE - 10) : BLOCKSIZE; + for (unsigned ColsThisBlock = 0; ColsThisBlock < MaxCols; ++ColsThisBlock) + { + if (Col == ColCount) + break; + if (ColsThisBlock%10 == 0 && (BlockIndex == 0 || ColsThisBlock > 0)) + File.PutChar(' '); + char c = GetChar(Seq, Col); + if (isalpha(c)) + c = toupper(c); + File.PutChar(c); + ++Col; + } + File.PutChar('\n'); + if (Col == ColCount) + break; + ++BlockIndex; + } + } + } + +void MSA::ToPhyInterleavedFile(TextFile &File) const + { + const unsigned SeqCount = GetSeqCount(); + const unsigned ColCount = GetColCount(); + + File.PutFormat("%d %d\n", SeqCount, ColCount); + + if (0 == ColCount) + return; + + int Col = 0; + for (;;) + { + const unsigned ColBlockStart = Col; + const unsigned MaxCols = (ColBlockStart == 0) ? (BLOCKSIZE - 10) : BLOCKSIZE; + + for (unsigned Seq = 0; Seq < SeqCount; ++Seq) + { + if (0 == ColBlockStart) + { + char Name[11]; + const char *ptrName = GetSeqName(Seq); + size_t n = strlen(ptrName); + if (n > 10) + n = 10; + memcpy(Name, ptrName, n); + Name[n] = 0; + FixName(Name); + File.PutFormat("%-10.10s", Name); + } + + Col = ColBlockStart; + for (unsigned ColsThisBlock = 0; ColsThisBlock < MaxCols; ++ColsThisBlock) + { + if (Col == ColCount) + break; + if (ColsThisBlock%10 == 0 && (0 == ColBlockStart || ColsThisBlock > 0)) + File.PutChar(' '); + char c = GetChar(Seq, Col); + if (isalpha(c)) + c = toupper(c); + File.PutChar(c); + ++Col; + } + File.PutChar('\n'); + } + if (Col == ColCount) + break; + File.PutChar('\n'); + } + } diff --git a/src/muscle/muscle3.8.31/src/phytofile.cpp b/src/muscle/muscle3.8.31/src/phytofile.cpp new file mode 100644 index 0000000..76fb368 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/phytofile.cpp @@ -0,0 +1,86 @@ +#include "muscle.h" +#include "tree.h" +#include "textfile.h" + +unsigned Tree::GetAnyNonLeafNode() const + { + for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) + if (!IsLeaf(uNodeIndex)) + return uNodeIndex; + return NULL_NEIGHBOR; + } + +void 
Tree::ToFile(TextFile &File) const + { + if (IsRooted()) + { + ToFileNodeRooted(File, m_uRootNodeIndex); + File.PutString(";\n"); + return; + } + +// Unrooted. +// The three neighbors of an arbitrary internal node are written as a +// top-level three-way group. +// NOTE(review): GetAnyNonLeafNode() can return NULL_NEIGHBOR (no internal +// node found); that value is used unchecked as an array index below. + unsigned uNodeIndex = GetAnyNonLeafNode(); + + File.PutString("(\n"); + ToFileNodeUnrooted(File, m_uNeighbor1[uNodeIndex], uNodeIndex); + File.PutString(",\n"); + ToFileNodeUnrooted(File, m_uNeighbor2[uNodeIndex], uNodeIndex); + File.PutString(",\n"); + ToFileNodeUnrooted(File, m_uNeighbor3[uNodeIndex], uNodeIndex); + File.PutString(");\n"); + } + +// Write the subtree reached from uParent through uNodeIndex: a leaf is +// written as its name, an internal node as a parenthesised pair of its two +// other neighbors; ":length" follows when the edge to uParent has a length. +void Tree::ToFileNodeUnrooted(TextFile &File, unsigned uNodeIndex, unsigned uParent) const + { + assert(!IsRooted()); + + bool bGroup = !IsLeaf(uNodeIndex); + if (bGroup) + File.PutString("(\n"); + + if (IsLeaf(uNodeIndex)) + File.PutString(GetName(uNodeIndex)); + else + { + ToFileNodeUnrooted(File, GetFirstNeighbor(uNodeIndex, uParent), uNodeIndex); + File.PutString(",\n"); + ToFileNodeUnrooted(File, GetSecondNeighbor(uNodeIndex, uParent), uNodeIndex); + } + + if (bGroup) + File.PutString(")"); + + if (HasEdgeLength(uNodeIndex, uParent)) + File.PutFormat(":%g", GetEdgeLength(uNodeIndex, uParent)); + File.PutString("\n"); + } + +// Write the subtree rooted at uNodeIndex: internal nodes (and the root, even +// if it is a leaf) are parenthesised; every non-root node is followed by +// ":length" when the edge to its parent has a length. +void Tree::ToFileNodeRooted(TextFile &File, unsigned uNodeIndex) const + { + assert(IsRooted()); + + bool bGroup = !IsLeaf(uNodeIndex) || IsRoot(uNodeIndex); + if (bGroup) + File.PutString("(\n"); + + if (IsLeaf(uNodeIndex)) + File.PutString(GetName(uNodeIndex)); + else + { + ToFileNodeRooted(File, GetLeft(uNodeIndex)); + File.PutString(",\n"); + ToFileNodeRooted(File, GetRight(uNodeIndex)); + } + + if (bGroup) + File.PutString(")"); + + if (!IsRoot(uNodeIndex)) + { + unsigned uParent = GetParent(uNodeIndex); + if (HasEdgeLength(uNodeIndex, uParent)) + File.PutFormat(":%g", GetEdgeLength(uNodeIndex, uParent)); + } + File.PutString("\n"); + } diff --git a/src/muscle/muscle3.8.31/src/posgap.cpp b/src/muscle/muscle3.8.31/src/posgap.cpp new file mode 100644 index 0000000..863edc6 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/posgap.cpp @@
-0,0 +1,141 @@ +#include "muscle.h" + +//// Pascarella and Argos gap factors +//// after Table 1 in Thompson et al. ClustalW NAR paper. +//static double PAFFacs[20] = +// { +// 1.13, // A +// 1.13, // C +// 0.96, // D +// 1.31, // E +// 1.20, // F +// 0.61, // G +// 1.00, // H +// 1.32, // I +// 0.96, // K +// 1.21, // L +// 1.29, // M +// 0.62, // N +// 0.74, // P +// 1.07, // Q +// 0.72, // R +// 0.76, // S +// 0.89, // T +// 1.25, // V +// 1.00, // Y +// 1.23, // W +// }; +// +//// (Not used: does not appear to work well). +//SCORE PAFactor(const FCOUNT fcCounts[]) +// { +// if (ALPHA_Amino != g_Alpha) +// Quit("PAFFactor: requires amino acid sequence"); +// +// FCOUNT fLetterCount = 0; +// double dSum = 0; +// for (unsigned uLetter = 0; uLetter < 20; ++uLetter) +// { +// const FCOUNT fCount = fcCounts[uLetter]; +// dSum += fCount*PAFFacs[uLetter]; +// fLetterCount += fCount; +// } +// if (0 == fLetterCount) +// return 0.5; +// return (SCORE) (dSum/fLetterCount); +// } + +//static bool Hydrophilic[20] = +// { +// false, // A +// false, // C +// true, // D +// true, // E +// false, // F +// true, // G +// false, // H +// false, // I +// true, // K +// false, // L +// false, // M +// true, // N +// true, // P +// true, // Q +// true, // R +// true, // S +// false, // T +// false, // V +// false, // Y +// false, // W +// }; +// +//bool IsHydrophilic(const FCOUNT fcCounts[]) +// { +// if (ALPHA_Amino != g_Alpha) +// Quit("IsHydrophilic: requires amino acid sequence"); +// +// for (unsigned uLetter = 0; uLetter < 20; ++uLetter) +// if (fcCounts[uLetter] > 0 && !Hydrophilic[uLetter]) +// return false; +// return true; +// } +// +//bool IsHydrophilic(const unsigned uCounts[]) +// { +// if (ALPHA_Amino != g_Alpha) +// Quit("IsHydrophilic: requires amino acid sequence"); +// +// for (unsigned uLetter = 0; uLetter < 20; ++uLetter) +// if (uCounts[uLetter] > 0 && !Hydrophilic[uLetter]) +// return false; +// return true; +// } + +// LIVCATMFYWHK +// Venn Pascarella B&T Me
+// L y y y +// I y y y +// V y y y +// C y n +// A y y y +// T N n +// M y y y +// F y y y +// Y n n +// W y n +// H n n +// K n n +// Hydrophobicity flag per letter index; the trailing comment on each +// entry names the amino acid that slot stands for. +static bool Hydrophobic[20] = + { + true, // A + true, // C + false, // D + false, // E + true, // F + false, // G + true, // H + true, // I + false, // K + true, // L + true, // M + false, // N + false, // P + false, // Q + false, // R + false, // S + true, // T + true, // V + true, // Y + true, // W + }; + +// Returns true when every letter with a positive count in fcCounts[0..19] +// is flagged in Hydrophobic[] (so an all-zero column also returns true). +// Quits unless the current alphabet is amino acid. +bool IsHydrophobic(const FCOUNT fcCounts[]) + { + if (ALPHA_Amino != g_Alpha) + Quit("IsHydrophobic: requires amino acid sequence"); + + for (unsigned uLetter = 0; uLetter < 20; ++uLetter) + if (fcCounts[uLetter] > 0.0 && !Hydrophobic[uLetter]) + return false; + return true; + } diff --git a/src/muscle/muscle3.8.31/src/ppscore.cpp b/src/muscle/muscle3.8.31/src/ppscore.cpp new file mode 100644 index 0000000..813b8a2 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/ppscore.cpp @@ -0,0 +1,93 @@ +#include "muscle.h" +#include "textfile.h" +#include "msa.h" +#include "tree.h" +#include "profile.h" +#include "objscore.h" + +bool g_bTracePPScore = false; +MSA *g_ptrPPScoreMSA1 = 0; +MSA *g_ptrPPScoreMSA2 = 0; + +// Sets each sequence id to its own index, builds a tree for the alignment +// with TreeFromMSA, installs it via SetMuscleTree, and returns +// ProfileFromMSA(msa). +static ProfPos *ProfileFromMSALocal(MSA &msa, Tree &tree) + { + const unsigned uSeqCount = msa.GetSeqCount(); + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + msa.SetSeqId(uSeqIndex, uSeqIndex); + + TreeFromMSA(msa, tree, g_Cluster2, g_Distance2, g_Root1); + SetMuscleTree(tree); + return ProfileFromMSA(msa); + } + +// Entry point for -ppscore: reads the two alignments named by -in1 and -in2, +// which must have the same number of columns. +void PPScore() + { + if (0 == g_pstrFileName1 || 0 == g_pstrFileName2) + Quit("-ppscore needs -in1 and -in2"); + + SetSeqWeightMethod(g_SeqWeight1); + + TextFile file1(g_pstrFileName1); + TextFile file2(g_pstrFileName2); + + MSA msa1; + MSA msa2; + + msa1.FromFile(file1); + msa2.FromFile(file2); + + const unsigned uLength1 = msa1.GetColCount(); + const unsigned uLength2 = msa2.GetColCount(); + + if (uLength1 != uLength2) + Quit("Profiles must have the same length"); + + ALPHA Alpha =
ALPHA_Undefined; + switch (g_SeqType) + { + case SEQTYPE_Auto: + Alpha = msa1.GuessAlpha(); + break; + + case SEQTYPE_Protein: + Alpha = ALPHA_Amino; + break; + + case SEQTYPE_DNA: + Alpha = ALPHA_DNA; + break; + + case SEQTYPE_RNA: + Alpha = ALPHA_RNA; + break; + + default: + Quit("Invalid SeqType"); + } + SetAlpha(Alpha); + + msa1.FixAlpha(); + msa2.FixAlpha(); + + if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha) + SetPPScore(PPSCORE_SPN); + + const unsigned uSeqCount1 = msa1.GetSeqCount(); + const unsigned uSeqCount2 = msa2.GetSeqCount(); + const unsigned uMaxSeqCount = (uSeqCount1 > uSeqCount2 ? uSeqCount1 : uSeqCount2); + MSA::SetIdCount(uMaxSeqCount); + + Tree tree1; + Tree tree2; + ProfPos *Prof1 = ProfileFromMSALocal(msa1, tree1); + ProfPos *Prof2 = ProfileFromMSALocal(msa2, tree2); + + g_bTracePPScore = true; + g_ptrPPScoreMSA1 = &msa1; + g_ptrPPScoreMSA2 = &msa2; + + SCORE Score = ObjScoreDP_Profs(Prof1, Prof2, uLength1); + + Log("Score=%.4g\n", Score); + printf("Score=%.4g\n", Score); + } diff --git a/src/muscle/muscle3.8.31/src/profdb.cpp b/src/muscle/muscle3.8.31/src/profdb.cpp new file mode 100644 index 0000000..f1b6595 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/profdb.cpp @@ -0,0 +1,54 @@ +#include "muscle.h" +#include "textfile.h" +#include "seqvect.h" +#include "distfunc.h" +#include "msa.h" +#include "tree.h" +#include "clust.h" +#include "profile.h" +#include "clustsetmsa.h" + +void ProfDB() + { + SetOutputFileName(g_pstrOutFileName); + SetInputFileName(g_pstrFileName2); + SetStartTime(); + + TextFile file1(g_pstrFileName1); + TextFile file2(g_pstrFileName2); + + SetMaxIters(g_uMaxIters); + SetSeqWeightMethod(g_SeqWeight1); + + TextFile fileIn(g_pstrFileName1); + MSA msa1; + msa1.FromFile(fileIn); + + const unsigned uSeqCount1 = msa1.GetSeqCount(); + if (0 == uSeqCount1) + Quit("No sequences in input alignment"); + + SeqVect v; + v.FromFASTAFile(file2); + const unsigned uSeqCount2 = v.Length(); + if (0 == uSeqCount2) + Quit("No sequences in 
input alignment"); + + MSA::SetIdCount(uSeqCount1 + uSeqCount2); + SetProgressDesc("Align sequence database to profile"); + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount2; ++uSeqIndex) + { + Progress(uSeqIndex, uSeqCount2); + Seq &s = *(v[uSeqIndex]); + s.SetId(0); + MSA msaTmp; + msaTmp.FromSeq(s); + MSA msaOut; + ProfileProfile(msa1, msaTmp, msaOut); + msa1.Copy(msaOut); + } + ProgressStepsDone(); + + TextFile fileOut(g_pstrOutFileName, true); + msa1.ToFile(fileOut); + } diff --git a/src/muscle/muscle3.8.31/src/profile.cpp b/src/muscle/muscle3.8.31/src/profile.cpp new file mode 100644 index 0000000..f02b381 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/profile.cpp @@ -0,0 +1,147 @@ +#include "muscle.h" +#include "textfile.h" +#include "msa.h" +#include "tree.h" +#include "profile.h" +#include "objscore.h" + +bool TreeNeededForWeighting(SEQWEIGHT s) + { + switch (s) + { + case SEQWEIGHT_ClustalW: + case SEQWEIGHT_ThreeWay: + return true; + default: + return false; + } + } + +static ProfPos *ProfileFromMSALocal(MSA &msa, Tree &tree) + { + const unsigned uSeqCount = msa.GetSeqCount(); + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + msa.SetSeqId(uSeqIndex, uSeqIndex); + + if (TreeNeededForWeighting(g_SeqWeight2)) + { + TreeFromMSA(msa, tree, g_Cluster2, g_Distance2, g_Root1); + SetMuscleTree(tree); + } + return ProfileFromMSA(msa); + } + +void ProfileProfile(MSA &msa1, MSA &msa2, MSA &msaOut) + { + //ALPHA Alpha = ALPHA_Undefined; + //switch (g_SeqType) + // { + //case SEQTYPE_Auto: + // Alpha = msa1.GuessAlpha(); + // break; + + //case SEQTYPE_Protein: + // Alpha = ALPHA_Amino; + // break; + + //case SEQTYPE_DNA: + // Alpha = ALPHA_DNA; + // break; + + //case SEQTYPE_RNA: + // Alpha = ALPHA_RNA; + // break; + + //default: + // Quit("Invalid SeqType"); + // } + //SetAlpha(Alpha); + + //msa1.FixAlpha(); + //msa2.FixAlpha(); + + unsigned uLength1; + unsigned uLength2; + + uLength1 = msa1.GetColCount(); + uLength2 = msa2.GetColCount(); + + 
Tree tree1; + Tree tree2; + ProfPos *Prof1 = ProfileFromMSALocal(msa1, tree1); + ProfPos *Prof2 = ProfileFromMSALocal(msa2, tree2); + + PWPath Path; + ProfPos *ProfOut; + unsigned uLengthOut; + Progress("Aligning profiles"); + AlignTwoProfs(Prof1, uLength1, 1.0, Prof2, uLength2, 1.0, Path, &ProfOut, &uLengthOut); + + Progress("Building output"); + AlignTwoMSAsGivenPath(Path, msa1, msa2, msaOut); + } + +// Do profile-profile alignment +void Profile() + { + if (0 == g_pstrFileName1 || 0 == g_pstrFileName2) + Quit("-profile needs -in1 and -in2"); + + SetSeqWeightMethod(g_SeqWeight1); + + TextFile file1(g_pstrFileName1); + TextFile file2(g_pstrFileName2); + + MSA msa1; + MSA msa2; + MSA msaOut; + + Progress("Reading %s", g_pstrFileName1); + msa1.FromFile(file1); + Progress("%u seqs %u cols", msa1.GetSeqCount(), msa1.GetColCount()); + + Progress("Reading %s", g_pstrFileName2); + msa2.FromFile(file2); + Progress("%u seqs %u cols", msa2.GetSeqCount(), msa2.GetColCount()); + + ALPHA Alpha = ALPHA_Undefined; + switch (g_SeqType) + { + case SEQTYPE_Auto: + Alpha = msa1.GuessAlpha(); + break; + + case SEQTYPE_Protein: + Alpha = ALPHA_Amino; + break; + + case SEQTYPE_DNA: + Alpha = ALPHA_DNA; + break; + + case SEQTYPE_RNA: + Alpha = ALPHA_RNA; + break; + + default: + Quit("Invalid seq type"); + } + SetAlpha(Alpha); + + msa1.FixAlpha(); + msa2.FixAlpha(); + + SetPPScore(); + if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha) + SetPPScore(PPSCORE_SPN); + + const unsigned uSeqCount1 = msa1.GetSeqCount(); + const unsigned uSeqCount2 = msa2.GetSeqCount(); + const unsigned uSumSeqCount = uSeqCount1 + uSeqCount2; + MSA::SetIdCount(uSumSeqCount); + + ProfileProfile(msa1, msa2, msaOut); + + Progress("Writing output"); + MuscleOutput(msaOut); + } diff --git a/src/muscle/muscle3.8.31/src/profile.h b/src/muscle/muscle3.8.31/src/profile.h new file mode 100644 index 0000000..e0b68bc --- /dev/null +++ b/src/muscle/muscle3.8.31/src/profile.h @@ -0,0 +1,126 @@ +#ifndef FastProf2_h +#define 
FastProf2_h + +#include "msa.h" +#include "pwpath.h" +#include <math.h> // for log function + +class DiagList; +class WeightList; + +// Data held for one position of a profile (ListProfile prints one row +// per ProfPos). +struct ProfPos + { + bool m_bAllGaps; + unsigned m_uSortOrder[21]; + FCOUNT m_fcCounts[20]; + FCOUNT m_LL; + FCOUNT m_LG; + FCOUNT m_GL; + FCOUNT m_GG; + SCORE m_AAScores[20]; + unsigned m_uResidueGroup; + FCOUNT m_fOcc; + FCOUNT m_fcStartOcc; + FCOUNT m_fcEndOcc; + SCORE m_scoreGapOpen; + SCORE m_scoreGapClose; +#if DOUBLE_AFFINE + SCORE m_scoreGapOpen2; + SCORE m_scoreGapClose2; +#endif +// SCORE m_scoreGapExtend; + }; + +// Per-node working data: an MSA, its profile, a path and two e-strings. +// The constructor nulls the three pointers only; m_uLength and m_Weight +// are left unset until assigned by the user of the node. +struct ProgNode + { + ProgNode() + { + m_Prof = 0; + m_EstringL = 0; + m_EstringR = 0; + } + MSA m_MSA; + ProfPos *m_Prof; + PWPath m_Path; + short *m_EstringL; + short *m_EstringR; + unsigned m_uLength; + WEIGHT m_Weight; + }; + +extern unsigned ResidueGroup[]; +const unsigned RESIDUE_GROUP_MULTIPLE = (unsigned) ~0; + +extern PTR_SCOREMATRIX g_ptrScoreMatrix; + +ProfPos *ProfileFromMSA(const MSA &a); + +SCORE TraceBack(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, const SCORE *DPM_, const SCORE *DPD_, const SCORE *DPI_, + PWPath &Path); +SCORE GlobalAlign(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, PWPath &Path); +void ProgressiveAlign(const SeqVect &v, const Tree &tree, MSA &a); +SCORE MSAPairSP(const MSA &msa1, const MSA &msa2); + +void AlignTwoMSAsGivenPath(const PWPath &Path, const MSA &msaA, const MSA &msaB, + MSA &msaCombined); + +void ListProfile(const ProfPos *Prof, unsigned uLength, const MSA *ptrMSA = 0); +SCORE ScoreProfPos2(const ProfPos &PPA, const ProfPos &PPB); +SCORE FastScorePath2(const ProfPos *PA, unsigned uLengthA, + const ProfPos *PB, unsigned uLengthB, const PWPath &Path); +bool IsHydrophilic(const FCOUNT fcCounts[]); +int PAM200_Letter(unsigned uLetter1, unsigned uLetter2); +SCORE AverageMatchScore(const PWPath &Path, unsigned uEdgeIndex, + unsigned uWindowLength); +void WindowSmooth(const SCORE Score[], unsigned uCount, unsigned
uWindowLength, + SCORE SmoothScore[], double dCeil = 9e29); +SCORE FastScoreMSA_LA(const MSA &msa, SCORE MatchScore[] = 0); +SCORE FastScoreMSA_NS(const MSA &msa, SCORE MatchScore[] = 0); +SCORE FastScoreMSA_SP(const MSA &msa, SCORE MatchScore[] = 0); +bool RefineMSA(MSA &msa, const Tree &tree); +SCORE MSAQScore(const MSA &msa, SCORE MatchScore[] = 0); +bool RefineBiParts(MSA &msa, const Tree &tree, bool R); +void FindAnchorCols(const MSA &msa, unsigned AnchorCols[], + unsigned *ptruAnchorColCount); +double PctIdToHeight(double dPctId); +double PctIdToHeightKimura(double dPctId); +double PctIdToHeightMAFFT(double dPctId); +double PctIdToMAFFTDist(double dPctId); +bool RefineBlocks(MSA &msa, const Tree &tree); +bool RefineSubfams(MSA &msaIn, const Tree &tree, unsigned uIters); +void SetMuscleTree(const Tree &tree); +void CalcClustalWWeights(const Tree &tree, WEIGHT Weights[]); +void RealignDiffs(const MSA &msaIn, const Tree &Diffs, + const unsigned IdToDiffsTreeNodeIndex[], MSA &msaOut); +void RealignDiffsE(const MSA &msaIn, const SeqVect &v, + const Tree &NewTree, const Tree &OldTree, + const unsigned uNewNodeIndexToOldNodeIndex[], + MSA &msaOut, ProgNode *OldProgNodes); +void RefineTree(MSA &msa, Tree &tree); +void RefineTreeE(MSA &msa, const SeqVect &v, Tree &tree, ProgNode *ProgNodes); +bool IsHydrophobic(const FCOUNT fcCounts[]); +void Hydro(ProfPos *Prof, unsigned uLength); +void SetTermGaps(const ProfPos *Prof, unsigned uLength); + +// Macros to simulate 2D matrices +#define DPL(PLA, PLB) DPL_[(PLB)*uPrefixCountA + (PLA)] +#define DPM(PLA, PLB) DPM_[(PLB)*uPrefixCountA + (PLA)] +#define DPD(PLA, PLB) DPD_[(PLB)*uPrefixCountA + (PLA)] +#define DPE(PLA, PLB) DPE_[(PLB)*uPrefixCountA + (PLA)] +#define DPI(PLA, PLB) DPI_[(PLB)*uPrefixCountA + (PLA)] +#define DPJ(PLA, PLB) DPJ_[(PLB)*uPrefixCountA + (PLA)] +#define DPU(PLA, PLB) DPU_[(PLB)*uPrefixCountA + (PLA)] +#define TBM(PLA, PLB) TBM_[(PLB)*uPrefixCountA + (PLA)] +#define TBD(PLA, PLB) 
TBD_[(PLB)*uPrefixCountA + (PLA)] +#define TBE(PLA, PLB) TBE_[(PLB)*uPrefixCountA + (PLA)] +#define TBI(PLA, PLB) TBI_[(PLB)*uPrefixCountA + (PLA)] +#define TBJ(PLA, PLB) TBJ_[(PLB)*uPrefixCountA + (PLA)] + +SCORE ScoreProfPos2LA(const ProfPos &PPA, const ProfPos &PPB); +SCORE ScoreProfPos2NS(const ProfPos &PPA, const ProfPos &PPB); +SCORE ScoreProfPos2SP(const ProfPos &PPA, const ProfPos &PPB); +SCORE ScoreProfPos2SPN(const ProfPos &PPA, const ProfPos &PPB); + +#endif // FastProf_h diff --git a/src/muscle/muscle3.8.31/src/profilefrommsa.cpp b/src/muscle/muscle3.8.31/src/profilefrommsa.cpp new file mode 100644 index 0000000..1894f54 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/profilefrommsa.cpp @@ -0,0 +1,318 @@ +#include "muscle.h" +#include "msa.h" +#include "profile.h" + +#define TRACE 0 + +static void LogF(FCOUNT f) + { + if (f > -0.00001 && f < 0.00001) + Log(" "); + else + Log(" %5.3f", f); + } + +static const char *LocalScoreToStr(SCORE s) + { + static char str[16]; + if (s < -1e10 || s > 1e10) + return " *"; + sprintf(str, "%5.1f", s); + return str; + } + +#if DOUBLE_AFFINE +void ListProfile(const ProfPos *Prof, unsigned uLength, const MSA *ptrMSA) + { + Log(" Pos Occ LL LG GL GG Open Close Open2 Clos2\n"); + Log(" --- --- -- -- -- -- ---- ----- ----- -----\n"); + for (unsigned n = 0; n < uLength; ++n) + { + const ProfPos &PP = Prof[n]; + Log("%5u", n); + LogF(PP.m_fOcc); + LogF(PP.m_LL); + LogF(PP.m_LG); + LogF(PP.m_GL); + LogF(PP.m_GG); + Log(" %s", LocalScoreToStr(-PP.m_scoreGapOpen)); + Log(" %s", LocalScoreToStr(-PP.m_scoreGapClose)); + Log(" %s", LocalScoreToStr(-PP.m_scoreGapOpen2)); + Log(" %s", LocalScoreToStr(-PP.m_scoreGapClose2)); + if (0 != ptrMSA) + { + const unsigned uSeqCount = ptrMSA->GetSeqCount(); + Log(" "); + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + Log("%c", ptrMSA->GetChar(uSeqIndex, n)); + } + Log("\n"); + } + + Log("\n"); + Log(" Pos G"); + for (unsigned n = 0; n < g_AlphaSize; ++n) + Log(" %c", 
LetterExToChar(n)); + Log("\n"); + Log(" --- -"); + for (unsigned n = 0; n < g_AlphaSize; ++n) + Log(" -----"); + Log("\n"); + + for (unsigned n = 0; n < uLength; ++n) + { + const ProfPos &PP = Prof[n]; + Log("%5u", n); + if (-1 == PP.m_uResidueGroup) + Log(" -", PP.m_uResidueGroup); + else + Log(" %d", PP.m_uResidueGroup); + + for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter) + { + FCOUNT f = PP.m_fcCounts[uLetter]; + if (f == 0.0) + Log(" "); + else + Log(" %5.3f", f); + } + if (0 != ptrMSA) + { + const unsigned uSeqCount = ptrMSA->GetSeqCount(); + Log(" "); + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + Log("%c", ptrMSA->GetChar(uSeqIndex, n)); + } + Log("\n"); + } + } +#endif // DOUBLE_AFFINE + +#if SINGLE_AFFINE +void ListProfile(const ProfPos *Prof, unsigned uLength, const MSA *ptrMSA) + { + Log(" Pos Occ LL LG GL GG Open Close\n"); + Log(" --- --- -- -- -- -- ---- -----\n"); + for (unsigned n = 0; n < uLength; ++n) + { + const ProfPos &PP = Prof[n]; + Log("%5u", n); + LogF(PP.m_fOcc); + LogF(PP.m_LL); + LogF(PP.m_LG); + LogF(PP.m_GL); + LogF(PP.m_GG); + Log(" %5.1f", -PP.m_scoreGapOpen); + Log(" %5.1f", -PP.m_scoreGapClose); + if (0 != ptrMSA) + { + const unsigned uSeqCount = ptrMSA->GetSeqCount(); + Log(" "); + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + Log("%c", ptrMSA->GetChar(uSeqIndex, n)); + } + Log("\n"); + } + + Log("\n"); + Log(" Pos G"); + for (unsigned n = 0; n < g_AlphaSize; ++n) + Log(" %c", LetterExToChar(n)); + Log("\n"); + Log(" --- -"); + for (unsigned n = 0; n < g_AlphaSize; ++n) + Log(" -----"); + Log("\n"); + + for (unsigned n = 0; n < uLength; ++n) + { + const ProfPos &PP = Prof[n]; + Log("%5u", n); + if (-1 == PP.m_uResidueGroup) + Log(" -", PP.m_uResidueGroup); + else + Log(" %d", PP.m_uResidueGroup); + + for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter) + { + FCOUNT f = PP.m_fcCounts[uLetter]; + if (f == 0.0) + Log(" "); + else + Log(" %5.3f", f); + } + if (0 != 
ptrMSA) + { + const unsigned uSeqCount = ptrMSA->GetSeqCount(); + Log(" "); + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + Log("%c", ptrMSA->GetChar(uSeqIndex, n)); + } + Log("\n"); + } + } +#endif + +void SortCounts(const FCOUNT fcCounts[], unsigned SortOrder[]) + { + static unsigned InitialSortOrder[MAX_ALPHA] = + { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 + }; + memcpy(SortOrder, InitialSortOrder, g_AlphaSize*sizeof(unsigned)); + + bool bAny = true; + while (bAny) + { + bAny = false; + for (unsigned n = 0; n < g_AlphaSize - 1; ++n) + { + unsigned i1 = SortOrder[n]; + unsigned i2 = SortOrder[n+1]; + if (fcCounts[i1] < fcCounts[i2]) + { + SortOrder[n+1] = i1; + SortOrder[n] = i2; + bAny = true; + } + } + } + } + +static unsigned AminoGroupFromFCounts(const FCOUNT fcCounts[]) + { + bool bAny = false; + unsigned uConsensusResidueGroup = RESIDUE_GROUP_MULTIPLE; + for (unsigned uLetter = 0; uLetter < 20; ++uLetter) + { + if (0 == fcCounts[uLetter]) + continue; + const unsigned uResidueGroup = ResidueGroup[uLetter]; + if (bAny) + { + if (uResidueGroup != uConsensusResidueGroup) + return RESIDUE_GROUP_MULTIPLE; + } + else + { + bAny = true; + uConsensusResidueGroup = uResidueGroup; + } + } + return uConsensusResidueGroup; + } + +static unsigned NucleoGroupFromFCounts(const FCOUNT fcCounts[]) + { + bool bAny = false; + unsigned uConsensusResidueGroup = RESIDUE_GROUP_MULTIPLE; + for (unsigned uLetter = 0; uLetter < 4; ++uLetter) + { + if (0 == fcCounts[uLetter]) + continue; + const unsigned uResidueGroup = uLetter; + if (bAny) + { + if (uResidueGroup != uConsensusResidueGroup) + return RESIDUE_GROUP_MULTIPLE; + } + else + { + bAny = true; + uConsensusResidueGroup = uResidueGroup; + } + } + return uConsensusResidueGroup; + } + +unsigned ResidueGroupFromFCounts(const FCOUNT fcCounts[]) + { + switch (g_Alpha) + { + case ALPHA_Amino: + return AminoGroupFromFCounts(fcCounts); + + case ALPHA_DNA: + case ALPHA_RNA: + return 
NucleoGroupFromFCounts(fcCounts); + } + Quit("ResidueGroupFromFCounts: bad alpha"); + return 0; + } + +ProfPos *ProfileFromMSA(const MSA &a) + { + const unsigned uSeqCount = a.GetSeqCount(); + const unsigned uColCount = a.GetColCount(); + +// Yuck -- cast away const (inconsistent design here). + SetMSAWeightsMuscle((MSA &) a); + + ProfPos *Pos = new ProfPos[uColCount]; + + unsigned uHydrophobicRunLength = 0; + for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) + { + ProfPos &PP = Pos[uColIndex]; + + PP.m_bAllGaps = a.IsGapColumn(uColIndex); + + FCOUNT fcGapStart; + FCOUNT fcGapEnd; + FCOUNT fcGapExtend; + FCOUNT fOcc; + a.GetFractionalWeightedCounts(uColIndex, g_bNormalizeCounts, PP.m_fcCounts, + &fcGapStart, &fcGapEnd, &fcGapExtend, &fOcc, + &PP.m_LL, &PP.m_LG, &PP.m_GL, &PP.m_GG); + PP.m_fOcc = fOcc; + + SortCounts(PP.m_fcCounts, PP.m_uSortOrder); + + PP.m_uResidueGroup = ResidueGroupFromFCounts(PP.m_fcCounts); + + for (unsigned i = 0; i < g_AlphaSize; ++i) + { + SCORE scoreSum = 0; + for (unsigned j = 0; j < g_AlphaSize; ++j) + scoreSum += PP.m_fcCounts[j]*(*g_ptrScoreMatrix)[i][j]; + PP.m_AAScores[i] = scoreSum; + } + + SCORE sStartOcc = (SCORE) (1.0 - fcGapStart); + SCORE sEndOcc = (SCORE) (1.0 - fcGapEnd); + + PP.m_fcStartOcc = sStartOcc; + PP.m_fcEndOcc = sEndOcc; + + PP.m_scoreGapOpen = sStartOcc*g_scoreGapOpen/2; + PP.m_scoreGapClose = sEndOcc*g_scoreGapOpen/2; +#if DOUBLE_AFFINE + PP.m_scoreGapOpen2 = sStartOcc*g_scoreGapOpen2/2; + PP.m_scoreGapClose2 = sEndOcc*g_scoreGapOpen2/2; +#endif +// PP.m_scoreGapExtend = (SCORE) ((1.0 - fcGapExtend)*scoreGapExtend); + +#if PAF + if (ALHPA_Amino == g_Alpha && sStartOcc > 0.5) + { + extern SCORE PAFactor(const FCOUNT fcCounts[]); + SCORE paf = PAFactor(PP.m_fcCounts); + PP.m_scoreGapOpen *= paf; + PP.m_scoreGapClose *= paf; + } +#endif + } + +#if HYDRO + if (ALPHA_Amino == g_Alpha) + Hydro(Pos, uColCount); +#endif + +#if TRACE + { + Log("ProfileFromMSA\n"); + ListProfile(Pos, uColCount, &a); + } 
+#endif + return Pos; + } diff --git a/src/muscle/muscle3.8.31/src/progalign.cpp b/src/muscle/muscle3.8.31/src/progalign.cpp new file mode 100644 index 0000000..f42f598 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/progalign.cpp @@ -0,0 +1,206 @@ +#include "muscle.h" +#include "tree.h" +#include "seqvect.h" +#include "profile.h" +#include "msa.h" +#include "pwpath.h" +#include "distfunc.h" +#include "textfile.h" +#include "estring.h" + +#define TRACE 0 +#define VALIDATE 0 +#define TRACE_LENGTH_DELTA 0 + +static void LogLeafNames(const Tree &tree, unsigned uNodeIndex) + { + const unsigned uNodeCount = tree.GetNodeCount(); + unsigned *Leaves = new unsigned[uNodeCount]; + unsigned uLeafCount; + GetLeaves(tree, uNodeIndex, Leaves, &uLeafCount); + for (unsigned i = 0; i < uLeafCount; ++i) + { + if (i > 0) + Log(","); + Log("%s", tree.GetLeafName(Leaves[i])); + } + delete[] Leaves; + } + +ProgNode *ProgressiveAlignE(const SeqVect &v, const Tree &GuideTree, MSA &a) + { + assert(GuideTree.IsRooted()); + +#if TRACE + Log("GuideTree:\n"); + GuideTree.LogMe(); +#endif + + const unsigned uSeqCount = v.Length(); + const unsigned uNodeCount = 2*uSeqCount - 1; + const unsigned uIterCount = uSeqCount - 1; + + WEIGHT *Weights = new WEIGHT[uSeqCount]; + CalcClustalWWeights(GuideTree, Weights); + + ProgNode *ProgNodes = new ProgNode[uNodeCount]; + + unsigned uJoin = 0; + unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode(); + SetProgressDesc("Align node"); + do + { + if (GuideTree.IsLeaf(uTreeNodeIndex)) + { + if (uTreeNodeIndex >= uNodeCount) + Quit("TreeNodeIndex=%u NodeCount=%u\n", uTreeNodeIndex, uNodeCount); + ProgNode &Node = ProgNodes[uTreeNodeIndex]; + unsigned uId = GuideTree.GetLeafId(uTreeNodeIndex); + if (uId >= uSeqCount) + Quit("Seq index out of range"); + const Seq &s = *(v[uId]); + Node.m_MSA.FromSeq(s); + Node.m_MSA.SetSeqId(0, uId); + Node.m_uLength = Node.m_MSA.GetColCount(); + Node.m_Weight = Weights[uId]; + // TODO: Term gaps settable + Node.m_Prof = 
ProfileFromMSA(Node.m_MSA); + Node.m_EstringL = 0; + Node.m_EstringR = 0; +#if TRACE + Log("Leaf id=%u\n", uId); + Log("MSA=\n"); + Node.m_MSA.LogMe(); + Log("Profile (from MSA)=\n"); + ListProfile(Node.m_Prof, Node.m_uLength, &Node.m_MSA); +#endif + } + else + { + Progress(uJoin, uSeqCount - 1); + ++uJoin; + + const unsigned uMergeNodeIndex = uTreeNodeIndex; + ProgNode &Parent = ProgNodes[uMergeNodeIndex]; + + const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex); + const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex); + + if (g_bVerbose) + { + Log("Align: ("); + LogLeafNames(GuideTree, uLeft); + Log(") ("); + LogLeafNames(GuideTree, uRight); + Log(")\n"); + } + + ProgNode &Node1 = ProgNodes[uLeft]; + ProgNode &Node2 = ProgNodes[uRight]; + +#if TRACE + Log("AlignTwoMSAs:\n"); +#endif + AlignTwoProfs( + Node1.m_Prof, Node1.m_uLength, Node1.m_Weight, + Node2.m_Prof, Node2.m_uLength, Node2.m_Weight, + Parent.m_Path, + &Parent.m_Prof, &Parent.m_uLength); +#if TRACE_LENGTH_DELTA + { + unsigned L = Node1.m_uLength; + unsigned R = Node2.m_uLength; + unsigned P = Parent.m_Path.GetEdgeCount(); + unsigned Max = L > R ? 
L : R; + unsigned d = P - Max; + Log("LD%u;%u;%u;%u\n", L, R, P, d); + } +#endif + PathToEstrings(Parent.m_Path, &Parent.m_EstringL, &Parent.m_EstringR); + + Parent.m_Weight = Node1.m_Weight + Node2.m_Weight; + +#if VALIDATE + { +#if TRACE + Log("AlignTwoMSAs:\n"); +#endif + PWPath TmpPath; + AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, TmpPath); + ProfPos *P1 = ProfileFromMSA(Node1.m_MSA, true); + ProfPos *P2 = ProfileFromMSA(Node2.m_MSA, true); + unsigned uLength = Parent.m_MSA.GetColCount(); + ProfPos *TmpProf = ProfileFromMSA(Parent.m_MSA, true); + +#if TRACE + Log("Node1 MSA=\n"); + Node1.m_MSA.LogMe(); + + Log("Node1 prof=\n"); + ListProfile(Node1.m_Prof, Node1.m_MSA.GetColCount(), &Node1.m_MSA); + Log("Node1 prof (from MSA)=\n"); + ListProfile(P1, Node1.m_MSA.GetColCount(), &Node1.m_MSA); + + AssertProfsEq(Node1.m_Prof, Node1.m_uLength, P1, Node1.m_MSA.GetColCount()); + + Log("Node2 prof=\n"); + ListProfile(Node2.m_Prof, Node2.m_MSA.GetColCount(), &Node2.m_MSA); + + Log("Node2 MSA=\n"); + Node2.m_MSA.LogMe(); + + Log("Node2 prof (from MSA)=\n"); + ListProfile(P2, Node2.m_MSA.GetColCount(), &Node2.m_MSA); + + AssertProfsEq(Node2.m_Prof, Node2.m_uLength, P2, Node2.m_MSA.GetColCount()); + + TmpPath.AssertEqual(Parent.m_Path); + + Log("Parent MSA=\n"); + Parent.m_MSA.LogMe(); + + Log("Parent prof=\n"); + ListProfile(Parent.m_Prof, Parent.m_uLength, &Parent.m_MSA); + + Log("Parent prof (from MSA)=\n"); + ListProfile(TmpProf, Parent.m_MSA.GetColCount(), &Parent.m_MSA); + +#endif // TRACE + AssertProfsEq(Parent.m_Prof, Parent.m_uLength, + TmpProf, Parent.m_MSA.GetColCount()); + delete[] P1; + delete[] P2; + delete[] TmpProf; + } +#endif // VALIDATE + + Node1.m_MSA.Clear(); + Node2.m_MSA.Clear(); + + // Don't delete profiles, may need them for tree refinement. 
+ //delete[] Node1.m_Prof; + //delete[] Node2.m_Prof; + //Node1.m_Prof = 0; + //Node2.m_Prof = 0; + } + uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex); + } + while (NULL_NEIGHBOR != uTreeNodeIndex); + ProgressStepsDone(); + + if (g_bBrenner) + MakeRootMSABrenner((SeqVect &) v, GuideTree, ProgNodes, a); + else + MakeRootMSA(v, GuideTree, ProgNodes, a); + +#if VALIDATE + { + unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex(); + const ProgNode &RootProgNode = ProgNodes[uRootNodeIndex]; + AssertMSAEq(a, RootProgNode.m_MSA); + } +#endif + + delete[] Weights; + return ProgNodes; + } diff --git a/src/muscle/muscle3.8.31/src/progress.cpp b/src/muscle/muscle3.8.31/src/progress.cpp new file mode 100644 index 0000000..a7fa01f --- /dev/null +++ b/src/muscle/muscle3.8.31/src/progress.cpp @@ -0,0 +1,172 @@ +#include "muscle.h" +#include +#include + +// Functions that provide visible feedback to the user +// that progress is being made. + +static unsigned g_uIter = 0; // Main MUSCLE iteration 1, 2.. 
+static unsigned g_uLocalMaxIters = 0; // Max iters +static FILE *g_fProgress = stderr; // Default to standard error +static char g_strFileName[32]; // File name +static time_t g_tLocalStart; // Start time +static char g_strDesc[32]; // Description +static bool g_bWipeDesc = false; +static int g_nPrevDescLength; +static unsigned g_uTotalSteps; + +const char *ElapsedTimeAsStr() + { + time_t Now = time(0); + unsigned long ElapsedSecs = (unsigned long) (Now - g_tLocalStart); + return SecsToStr(ElapsedSecs); + } + +const char *MemToStr(double MB) + { + if (MB < 0) + return ""; + + static char Str[16]; + static double MaxMB = 0; + static double RAMMB = 0; + + if (RAMMB == 0) + RAMMB = GetRAMSizeMB(); + + if (MB > MaxMB) + MaxMB = MB; + double Pct = (MaxMB*100.0)/RAMMB; + if (Pct > 100) + Pct = 100; + sprintf(Str, "%.0f MB(%.0f%%)", MaxMB, Pct); + return Str; + } + +void SetInputFileName(const char *pstrFileName) + { + NameFromPath(pstrFileName, g_strFileName, sizeof(g_strFileName)); + } + +void SetSeqStats(unsigned uSeqCount, unsigned uMaxL, unsigned uAvgL) + { + if (g_bQuiet) + return; + + fprintf(g_fProgress, "%s %u seqs, max length %u, avg length %u\n", + g_strFileName, uSeqCount, uMaxL, uAvgL); + if (g_bVerbose) + Log("%u seqs, max length %u, avg length %u\n", + uSeqCount, uMaxL, uAvgL); + } + +void SetStartTime() + { + time(&g_tLocalStart); + } + +unsigned long GetStartTime() + { + return (unsigned long) g_tLocalStart; + } + +void SetIter(unsigned uIter) + { + g_uIter = uIter; + } + +void IncIter() + { + ++g_uIter; + } + +void SetMaxIters(unsigned uMaxIters) + { + g_uLocalMaxIters = uMaxIters; + } + +void SetProgressDesc(const char szDesc[]) + { + strncpy(g_strDesc, szDesc, sizeof(g_strDesc)); + g_strDesc[sizeof(g_strDesc) - 1] = 0; + } + +static void Wipe(int n) + { + for (int i = 0; i < n; ++i) + fprintf(g_fProgress, " "); + } + +void Progress(const char *szFormat, ...) 
+ { + CheckMaxTime(); + + if (g_bQuiet) + return; + + double MB = GetMemUseMB(); + + char szStr[4096]; + va_list ArgList; + va_start(ArgList, szFormat); + vsprintf(szStr, szFormat, ArgList); + + fprintf(g_fProgress, "%8.8s %12s %s", + ElapsedTimeAsStr(), + MemToStr(MB), + szStr); + + fprintf(g_fProgress, "\n"); + fflush(g_fProgress); + } + +void Progress(unsigned uStep, unsigned uTotalSteps) + { + CheckMaxTime(); + + if (g_bQuiet) + return; + + double dPct = ((uStep + 1)*100.0)/uTotalSteps; + double MB = GetMemUseMB(); + fprintf(g_fProgress, "%8.8s %12s Iter %3u %6.2f%% %s", + ElapsedTimeAsStr(), + MemToStr(MB), + g_uIter, + dPct, + g_strDesc); + + if (g_bWipeDesc) + { + int n = g_nPrevDescLength - (int) strlen(g_strDesc); + Wipe(n); + g_bWipeDesc = false; + } + + fprintf(g_fProgress, "\r"); + + g_uTotalSteps = uTotalSteps; + } + +void ProgressStepsDone() + { + CheckMaxTime(); + + if (g_bVerbose) + { + double MB = GetMemUseMB(); + Log("Elapsed time %8.8s Peak memory use %12s Iteration %3u %s\n", + ElapsedTimeAsStr(), + MemToStr(MB), + g_uIter, + g_strDesc); + } + + if (g_bQuiet) + return; + + Progress(g_uTotalSteps - 1, g_uTotalSteps); + fprintf(g_fProgress, "\n"); + g_bWipeDesc = true; + g_nPrevDescLength = (int) strlen(g_strDesc); + } diff --git a/src/muscle/muscle3.8.31/src/progressivealign.cpp b/src/muscle/muscle3.8.31/src/progressivealign.cpp new file mode 100644 index 0000000..c2feb3b --- /dev/null +++ b/src/muscle/muscle3.8.31/src/progressivealign.cpp @@ -0,0 +1,76 @@ +#include "muscle.h" +#include +#include "tree.h" +#include "seqvect.h" +#include "profile.h" +#include "msa.h" +#include "pwpath.h" +#include "distfunc.h" + +#define TRACE 0 + +void ProgressiveAlign(const SeqVect &v, const Tree &GuideTree, MSA &a) + { + assert(GuideTree.IsRooted()); + +#if TRACE + Log("GuideTree:\n"); + GuideTree.LogMe(); +#endif + + const unsigned uSeqCount = v.Length(); + const unsigned uNodeCount = 2*uSeqCount - 1; + + ProgNode *ProgNodes = new ProgNode[uNodeCount]; + + 
unsigned uJoin = 0; + unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode(); + SetProgressDesc("Align node"); + do + { + if (GuideTree.IsLeaf(uTreeNodeIndex)) + { + if (uTreeNodeIndex >= uNodeCount) + Quit("TreeNodeIndex=%u NodeCount=%u\n", uTreeNodeIndex, uNodeCount); + ProgNode &Node = ProgNodes[uTreeNodeIndex]; + unsigned uId = GuideTree.GetLeafId(uTreeNodeIndex); + if (uId >= uSeqCount) + Quit("Seq index out of range"); + const Seq &s = *(v[uId]); + Node.m_MSA.FromSeq(s); + Node.m_MSA.SetSeqId(0, uId); + Node.m_uLength = Node.m_MSA.GetColCount(); + } + else + { + Progress(uJoin, uSeqCount - 1); + ++uJoin; + + const unsigned uMergeNodeIndex = uTreeNodeIndex; + ProgNode &Parent = ProgNodes[uMergeNodeIndex]; + + const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex); + const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex); + + ProgNode &Node1 = ProgNodes[uLeft]; + ProgNode &Node2 = ProgNodes[uRight]; + + PWPath Path; + AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, Path); + Parent.m_uLength = Parent.m_MSA.GetColCount(); + + Node1.m_MSA.Clear(); + Node2.m_MSA.Clear(); + } + uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex); + } + while (NULL_NEIGHBOR != uTreeNodeIndex); + ProgressStepsDone(); + + unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex(); + const ProgNode &RootProgNode = ProgNodes[uRootNodeIndex]; + a.Copy(RootProgNode.m_MSA); + + delete[] ProgNodes; + ProgNodes = 0; + } diff --git a/src/muscle/muscle3.8.31/src/pwpath.cpp b/src/muscle/muscle3.8.31/src/pwpath.cpp new file mode 100644 index 0000000..0ab3449 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/pwpath.cpp @@ -0,0 +1,386 @@ +#include "muscle.h" +#include "pwpath.h" +#include "seq.h" +#include "textfile.h" +#include "msa.h" + +PWPath::PWPath() + { + m_uArraySize = 0; + m_uEdgeCount = 0; + m_Edges = 0; + } + +PWPath::~PWPath() + { + Clear(); + } + +void PWPath::Clear() + { + delete[] m_Edges; + m_Edges = 0; + m_uArraySize = 0; + m_uEdgeCount = 0; + } + +void 
PWPath::ExpandPath(unsigned uAdditionalEdgeCount) + { + PWEdge *OldPath = m_Edges; + unsigned uEdgeCount = m_uArraySize + uAdditionalEdgeCount; + + m_Edges = new PWEdge[uEdgeCount]; + m_uArraySize = uEdgeCount; + if (m_uEdgeCount > 0) + memcpy(m_Edges, OldPath, m_uEdgeCount*sizeof(PWEdge)); + delete[] OldPath; + } + +void PWPath::AppendEdge(const PWEdge &Edge) + { + if (0 == m_uArraySize || m_uEdgeCount + 1 == m_uArraySize) + ExpandPath(200); + + m_Edges[m_uEdgeCount] = Edge; + ++m_uEdgeCount; + } + +void PWPath::AppendEdge(char cType, unsigned uPrefixLengthA, unsigned uPrefixLengthB) + { + PWEdge e; + e.uPrefixLengthA = uPrefixLengthA; + e.uPrefixLengthB = uPrefixLengthB; + e.cType = cType; + AppendEdge(e); + } + +void PWPath::PrependEdge(const PWEdge &Edge) + { + if (0 == m_uArraySize || m_uEdgeCount + 1 == m_uArraySize) + ExpandPath(1000); + if (m_uEdgeCount > 0) + memmove(m_Edges + 1, m_Edges, sizeof(PWEdge)*m_uEdgeCount); + m_Edges[0] = Edge; + ++m_uEdgeCount; + } + +const PWEdge &PWPath::GetEdge(unsigned uEdgeIndex) const + { + assert(uEdgeIndex < m_uEdgeCount); + return m_Edges[uEdgeIndex]; + } + +void PWPath::Validate() const + { + const unsigned uEdgeCount = GetEdgeCount(); + if (0 == uEdgeCount) + return; + const PWEdge &FirstEdge = GetEdge(0); + const PWEdge &LastEdge = GetEdge(uEdgeCount - 1); + unsigned uStartA = FirstEdge.uPrefixLengthA; + unsigned uStartB = FirstEdge.uPrefixLengthB; + if (FirstEdge.cType != 'I') + --uStartA; + if (FirstEdge.cType != 'D') + --uStartB; + + unsigned uPrefixLengthA = FirstEdge.uPrefixLengthA; + unsigned uPrefixLengthB = FirstEdge.uPrefixLengthB; + for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex) + { + const PWEdge &Edge = GetEdge(uEdgeIndex); + switch (Edge.cType) + { + case 'M': + if (uPrefixLengthA + 1 != Edge.uPrefixLengthA) + Quit("PWPath::Validate MA %u", uPrefixLengthA); + if (uPrefixLengthB + 1 != Edge.uPrefixLengthB) + Quit("PWPath::Validate MB %u", uPrefixLengthB); + ++uPrefixLengthA; + 
++uPrefixLengthB; + break; + case 'D': + if (uPrefixLengthA + 1 != Edge.uPrefixLengthA) + Quit("PWPath::Validate DA %u", uPrefixLengthA); + if (uPrefixLengthB != Edge.uPrefixLengthB) + Quit("PWPath::Validate DB %u", uPrefixLengthB); + ++uPrefixLengthA; + break; + case 'I': + if (uPrefixLengthA != Edge.uPrefixLengthA) + Quit("PWPath::Validate IA %u", uPrefixLengthA); + if (uPrefixLengthB + 1 != Edge.uPrefixLengthB) + Quit("PWPath::Validate IB %u", uPrefixLengthB); + ++uPrefixLengthB; + break; + } + } + } + +void PWPath::LogMe() const + { + for (unsigned uEdgeIndex = 0; uEdgeIndex < GetEdgeCount(); ++uEdgeIndex) + { + const PWEdge &Edge = GetEdge(uEdgeIndex); + if (uEdgeIndex > 0) + Log(" "); + Log("%c%d.%d", + Edge.cType, + Edge.uPrefixLengthA, + Edge.uPrefixLengthB); + if ((uEdgeIndex > 0 && uEdgeIndex%10 == 0) || + uEdgeIndex == GetEdgeCount() - 1) + Log("\n"); + } + } + +void PWPath::Copy(const PWPath &Path) + { + Clear(); + const unsigned uEdgeCount = Path.GetEdgeCount(); + for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) + { + const PWEdge &Edge = Path.GetEdge(uEdgeIndex); + AppendEdge(Edge); + } + } + +void PWPath::FromMSAPair(const MSA &msaA, const MSA &msaB) + { + const unsigned uColCount = msaA.GetColCount(); + if (uColCount != msaB.GetColCount()) + Quit("PWPath::FromMSAPair, lengths differ"); + + Clear(); + + unsigned uPrefixLengthA = 0; + unsigned uPrefixLengthB = 0; + for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) + { + bool bIsGapA = msaA.IsGapColumn(uColIndex); + bool bIsGapB = msaB.IsGapColumn(uColIndex); + + PWEdge Edge; + char cType; + if (!bIsGapA && !bIsGapB) + { + cType = 'M'; + ++uPrefixLengthA; + ++uPrefixLengthB; + } + else if (bIsGapA && !bIsGapB) + { + cType = 'I'; + ++uPrefixLengthB; + } + else if (!bIsGapA && bIsGapB) + { + cType = 'D'; + ++uPrefixLengthA; + } + else + { + assert(bIsGapB && bIsGapA); + continue; + } + + Edge.cType = cType; + Edge.uPrefixLengthA = uPrefixLengthA; + Edge.uPrefixLengthB = 
uPrefixLengthB; + AppendEdge(Edge); + } + } + +// Very similar to HMMPath::FromFile, should consolidate. +void PWPath::FromFile(TextFile &File) + { + Clear(); + char szToken[1024]; + File.GetTokenX(szToken, sizeof(szToken)); + if (0 != strcmp(szToken, "Path")) + Quit("Invalid path file (Path)"); + + File.GetTokenX(szToken, sizeof(szToken)); + if (0 != strcmp(szToken, "edges")) + Quit("Invalid path file (edges)"); + + File.GetTokenX(szToken, sizeof(szToken)); + if (!IsValidInteger(szToken)) + Quit("Invalid path file (edges value)"); + + const unsigned uEdgeCount = (unsigned) atoi(szToken); + unsigned uEdgeIndex = 0; + for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) + { + // index + File.GetTokenX(szToken, sizeof(szToken)); + if (!IsValidInteger(szToken)) + Quit("Invalid path file, invalid index '%s'", szToken); + unsigned n = (unsigned) atoi(szToken); + if (n != uEdgeIndex) + Quit("Invalid path file, expecting edge %u got %u", uEdgeIndex, n); + + // type + File.GetTokenX(szToken, sizeof(szToken)); + if (1 != strlen(szToken)) + Quit("Invalid path file, expecting state, got '%s'", szToken); + const char cType = szToken[0]; + if ('M' != cType && 'D' != cType && cType != 'I' && 'S' != cType) + Quit("Invalid path file, expecting state, got '%c'", cType); + + // prefix length A + File.GetTokenX(szToken, sizeof(szToken)); + if (!IsValidInteger(szToken)) + Quit("Invalid path file, bad prefix length A '%s'", szToken); + const unsigned uPrefixLengthA = (unsigned) atoi(szToken); + + // prefix length B + File.GetTokenX(szToken, sizeof(szToken)); + if (!IsValidInteger(szToken)) + Quit("Invalid path file, bad prefix length B '%s'", szToken); + const unsigned uPrefixLengthB = (unsigned) atoi(szToken); + + PWEdge Edge; + Edge.cType = cType; + Edge.uPrefixLengthA = uPrefixLengthA; + Edge.uPrefixLengthB = uPrefixLengthB; + AppendEdge(Edge); + } + File.GetTokenX(szToken, sizeof(szToken)); + if (0 != strcmp(szToken, "//")) + Quit("Invalid path file (//)"); + } + 
+void PWPath::ToFile(TextFile &File) const + { + const unsigned uEdgeCount = GetEdgeCount(); + + File.PutString("Path\n"); + File.PutFormat("edges %u\n", uEdgeCount); + for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) + { + const PWEdge &Edge = GetEdge(uEdgeIndex); + File.PutFormat("%u %c %u %u\n", + uEdgeIndex, + Edge.cType, + Edge.uPrefixLengthA, + Edge.uPrefixLengthB); + } + File.PutString("//\n"); + } + +void PWPath::AssertEqual(const PWPath &Path) const + { + const unsigned uEdgeCount = GetEdgeCount(); + if (uEdgeCount != Path.GetEdgeCount()) + { + Log("PWPath::AssertEqual, this=\n"); + LogMe(); + Log("\nOther path=\n"); + Path.LogMe(); + Log("\n"); + Quit("PWPath::AssertEqual, Edge count different %u %u\n", + uEdgeCount, Path.GetEdgeCount()); + } + + for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) + { + const PWEdge &e1 = GetEdge(uEdgeIndex); + const PWEdge &e2 = Path.GetEdge(uEdgeIndex); + if (e1.cType != e2.cType || e1.uPrefixLengthA != e2.uPrefixLengthA || + e1.uPrefixLengthB != e2.uPrefixLengthB) + { + Log("PWPath::AssertEqual, this=\n"); + LogMe(); + Log("\nOther path=\n"); + Path.LogMe(); + Log("\n"); + Log("This edge %c%u.%u, other edge %c%u.%u\n", + e1.cType, e1.uPrefixLengthA, e1.uPrefixLengthB, + e2.cType, e2.uPrefixLengthA, e2.uPrefixLengthB); + Quit("PWPath::AssertEqual, edge %u different\n", uEdgeIndex); + } + } + } + +bool PWPath::Equal(const PWPath &Path) const + { + const unsigned uEdgeCount = GetEdgeCount(); + if (uEdgeCount != Path.GetEdgeCount()) + return false; + + for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) + { + const PWEdge &e1 = GetEdge(uEdgeIndex); + const PWEdge &e2 = Path.GetEdge(uEdgeIndex); + if (e1.cType != e2.cType || e1.uPrefixLengthA != e2.uPrefixLengthA || + e1.uPrefixLengthB != e2.uPrefixLengthB) + return false; + } + return true; + } + +unsigned PWPath::GetMatchCount() const + { + unsigned uMatchCount = 0; + const unsigned uEdgeCount = GetEdgeCount(); + for 
(unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) + { + const PWEdge &e = GetEdge(uEdgeIndex); + if ('M' == e.cType) + ++uMatchCount; + } + return uMatchCount; + } + +unsigned PWPath::GetInsertCount() const + { + unsigned uInsertCount = 0; + const unsigned uEdgeCount = GetEdgeCount(); + for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) + { + const PWEdge &e = GetEdge(uEdgeIndex); + if ('I' == e.cType) + ++uInsertCount; + } + return uInsertCount; + } + +unsigned PWPath::GetDeleteCount() const + { + unsigned uDeleteCount = 0; + const unsigned uEdgeCount = GetEdgeCount(); + for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) + { + const PWEdge &e = GetEdge(uEdgeIndex); + if ('D' == e.cType) + ++uDeleteCount; + } + return uDeleteCount; + } + +void PWPath::FromStr(const char Str[]) + { + Clear(); + unsigned uPrefixLengthA = 0; + unsigned uPrefixLengthB = 0; + while (char c = *Str++) + { + switch (c) + { + case 'M': + ++uPrefixLengthA; + ++uPrefixLengthB; + break; + case 'D': + ++uPrefixLengthA; + break; + case 'I': + ++uPrefixLengthB; + break; + default: + Quit("PWPath::FromStr, invalid state %c", c); + } + AppendEdge(c, uPrefixLengthA, uPrefixLengthB); + } + } diff --git a/src/muscle/muscle3.8.31/src/pwpath.h b/src/muscle/muscle3.8.31/src/pwpath.h new file mode 100644 index 0000000..69628e3 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/pwpath.h @@ -0,0 +1,100 @@ +#ifndef PWPath_h +#define PWPath_h + +/*** +Each PWEdge in a PWPath specifies a column in a pair-wise (PW) alignment. +"Path" is by analogy with the path through an HMM. +Edge types are: + + 'M' LetterA + LetterB + 'D' LetterA + GapB + 'I' GapB + LetterA + +The mnemomic is Match, Delete, Insert (with respect to A). +Here is a global alignment of sequences A and B. 
+ + A: AMQT-F + B: -M-TIF + +The path for this example is: + + Edge cType uPrefixLengthA uPrefixLengthB + 0 D 1 0 + 1 M 2 1 + 2 D 3 1 + 3 M 4 2 + 4 I 4 3 + 5 M 5 4 + +Given the starting positions in each alignment (e.g., column zero for +a global alignment), the prefix length fields are redundant; they are +included only for convenience and as a sanity check, we are not trying +to optimize for speed or space here. We use prefix lengths rather than +column indexes because of the problem of representing the special case +of a gap in the first position. +***/ + +class Seq; +class MSA; +class SatchmoParams; +class PW; +class TextFile; +class PWScore; + +class PWEdge + { +public: + char cType; + unsigned uPrefixLengthA; + unsigned uPrefixLengthB; + + bool Equal(const PWEdge &e) const + { + return uPrefixLengthA == e.uPrefixLengthA && + uPrefixLengthB == e.uPrefixLengthB && + cType == e.cType; + } + }; + +class PWPath + { +// Disable compiler defaults +private: + PWPath &operator=(const PWPath &rhs); + PWPath(const PWPath &rhs); + +public: + PWPath(); + virtual ~PWPath(); + +public: + void Clear(); + void FromStr(const char Str[]); + void Copy(const PWPath &Path); + void AppendEdge(const PWEdge &Edge); + void AppendEdge(char cType, unsigned uPrefixLengthA, unsigned uPrefixLengthB); + void PrependEdge(const PWEdge &Edge); + unsigned GetEdgeCount() const { return m_uEdgeCount; } + const PWEdge &GetEdge(unsigned uEdgeIndex) const; + void Validate(const PWScore &PWS) const; + void Validate() const; + void LogMe() const; + void FromFile(TextFile &File); + void ToFile(TextFile &File) const; + void FromMSAPair(const MSA &msaA, const MSA &msaB); + void AssertEqual(const PWPath &Path) const; + bool Equal(const PWPath &Path) const; + unsigned GetMatchCount() const; + unsigned GetDeleteCount() const; + unsigned GetInsertCount() const; + +private: + void ExpandPath(unsigned uAdditionalEdgeCount); + +private: + unsigned m_uEdgeCount; + unsigned m_uArraySize; + PWEdge *m_Edges; + }; + 
+#endif // PWPath_h diff --git a/src/muscle/muscle3.8.31/src/readmx.cpp b/src/muscle/muscle3.8.31/src/readmx.cpp new file mode 100644 index 0000000..faf7085 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/readmx.cpp @@ -0,0 +1,158 @@ +#include "muscle.h" +#include "textfile.h" + +#define TRACE 0 + +const int MAX_LINE = 4096; +const int MAX_HEADINGS = 32; +static char Heading[MAX_HEADINGS]; +static unsigned HeadingCount = 0; +static float Mx[32][32]; + +static void LogMx() + { + Log("Matrix\n"); + Log(" "); + for (int i = 0; i < 20; ++i) + Log(" %c", LetterToChar(i)); + Log("\n"); + + for (int i = 0; i < 20; ++i) + { + Log("%c ", LetterToChar(i)); + for (int j = 0; j < 20; ++j) + Log("%5.1f", Mx[i][j]); + Log("\n"); + } + Log("\n"); + } + +static unsigned MxCharToLetter(char c) + { + for (unsigned Letter = 0; Letter < HeadingCount; ++Letter) + if (Heading[Letter] == c) + return Letter; + Quit("Letter '%c' has no heading", c); + return 0; + } + +PTR_SCOREMATRIX ReadMx(TextFile &File) + { +// Find column headers + char Line[MAX_LINE]; + for (;;) + { + bool EndOfFile = File.GetLine(Line, sizeof(Line)); + if (EndOfFile) + Quit("Premature EOF in matrix file"); + + if (Line[0] == '#') + continue; + else if (Line[0] == ' ') + break; + else + Quit("Invalid line in matrix file: '%s'", Line); + } + +// Read column headers + HeadingCount = 0; + for (char *p = Line; *p; ++p) + { + char c = *p; + if (!isspace(c)) + Heading[HeadingCount++] = c; + } + + if (HeadingCount > 0 && Heading[HeadingCount-1] == '*') + --HeadingCount; + + if (HeadingCount < 20) + Quit("Error in matrix file: < 20 headers, line='%s'", Line); + +#if TRACE + { + Log("ReadMx\n"); + Log("%d headings: ", HeadingCount); + for (unsigned i = 0; i < HeadingCount; ++i) + Log("%c", Heading[i]); + Log("\n"); + } +#endif + +// Zero out matrix + for (int i = 0; i < MAX_ALPHA; ++i) + for (int j = 0; j < MAX_ALPHA; ++j) + Mx[i][j] = 0.0; + +// Read data lines + for (unsigned RowIndex = 0; RowIndex < HeadingCount; 
++RowIndex) + { + bool EndOfFile = File.GetTrimLine(Line, sizeof(Line)); + if (EndOfFile) + Quit("Premature EOF in matrix file"); +#if TRACE + Log("Line=%s\n", Line); +#endif + if (Line[0] == '#') + continue; + + char c = Line[0]; +#if TRACE + Log("Row char=%c\n", c); +#endif + if (!IsResidueChar(c)) + continue; + unsigned RowLetter = CharToLetter(c); + if (RowLetter >= 20) + continue; +#if TRACE + Log("Row letter = %u\n", RowLetter); +#endif + + char *p = Line + 1; + char *maxp = p + strlen(Line); + for (unsigned Col = 0; Col < HeadingCount - 1; ++Col) + { + if (p >= maxp) + Quit("Too few fields in line of matrix file: '%s'", Line); + while (isspace(*p)) + ++p; + char *Value = p; + while (!isspace(*p)) + ++p; + float v = (float) atof(Value); + char HeaderChar = Heading[Col]; + if (IsResidueChar(HeaderChar)) + { + unsigned ColLetter = CharToLetter(HeaderChar); + if (ColLetter >= 20) + continue; + Mx[RowLetter][ColLetter] = v; + } + p += 1; + } + } + +// Sanity check for symmetry + for (int i = 0; i < 20; ++i) + for (int j = 0; j < i; ++j) + { + if (Mx[i][j] != Mx[j][i]) + { + Warning("Matrix is not symmetrical, %c->%c=%g, %c->%c=%g", + CharToLetter(i), + CharToLetter(j), + Mx[i][j], + CharToLetter(j), + CharToLetter(i), + Mx[j][i]); + goto ExitLoop; + } + } +ExitLoop:; + + if (g_bVerbose) + LogMx(); + + return &Mx; + } diff --git a/src/muscle/muscle3.8.31/src/realigndiffs.cpp b/src/muscle/muscle3.8.31/src/realigndiffs.cpp new file mode 100644 index 0000000..d0bac98 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/realigndiffs.cpp @@ -0,0 +1,115 @@ +#include "muscle.h" +#include "msa.h" +#include "tree.h" +#include "profile.h" +#include "pwpath.h" + +#define TRACE 0 + +// Progressive alignment according to a diffs tree. 
+
+// Build the alignment for one node of the diffs tree: collect the ids
+// that IdToDiffsTreeNodeIndex maps to uDiffsNodeIndex, copy those rows
+// of msaIn into Node.m_MSA, then run DeleteGappedCols on the result.
+// Quits if no id maps to the node. (Diffs itself is not used here.)
+static void MakeNode(const MSA &msaIn, const Tree &Diffs, unsigned uDiffsNodeIndex,
+  const unsigned IdToDiffsTreeNodeIndex[], ProgNode &Node)
+	{
+	const unsigned uSeqCount = msaIn.GetSeqCount();
+
+	unsigned *Ids = new unsigned[uSeqCount];
+
+	unsigned uSeqsInDiffCount = 0;
+	for (unsigned uId = 0; uId < uSeqCount; ++uId)
+		{
+		if (IdToDiffsTreeNodeIndex[uId] == uDiffsNodeIndex)
+			{
+			Ids[uSeqsInDiffCount] = uId;
+			++uSeqsInDiffCount;
+			}
+		}
+	if (0 == uSeqsInDiffCount)
+		Quit("MakeNode: no seqs in diff");
+
+	MSASubsetByIds(msaIn, Ids, uSeqsInDiffCount, Node.m_MSA);
+
+#if DEBUG
+	ValidateMuscleIds(Node.m_MSA);
+#endif
+
+	DeleteGappedCols(Node.m_MSA);
+	delete[] Ids;
+	}
+
+// Walk the rooted tree Diffs depth-first. Each leaf gets the subset of
+// msaIn mapped to it (MakeNode); each internal node gets the result of
+// AlignTwoMSAs on its two children, whose alignments are then cleared.
+// The alignment held by the root is copied to msaOut.
+// Quits if the tree has an even number of nodes.
+void RealignDiffs(const MSA &msaIn, const Tree &Diffs,
+  const unsigned IdToDiffsTreeNodeIndex[], MSA &msaOut)
+	{
+	assert(Diffs.IsRooted());
+
+#if TRACE
+	Log("RealignDiffs\n");
+	Log("Diff tree:\n");
+	Diffs.LogMe();
+#endif
+
+	const unsigned uNodeCount = Diffs.GetNodeCount();
+	if (uNodeCount%2 == 0)
+		Quit("RealignDiffs: Expected odd number of nodes");
+
+	const unsigned uMergeCount = (uNodeCount - 1)/2;
+
+	ProgNode *ProgNodes = new ProgNode[uNodeCount];
+
+	unsigned uJoin = 0;
+	SetProgressDesc("Refine tree");
+	for (unsigned uDiffsNodeIndex = Diffs.FirstDepthFirstNode();
+	  NULL_NEIGHBOR != uDiffsNodeIndex;
+	  uDiffsNodeIndex = Diffs.NextDepthFirstNode(uDiffsNodeIndex))
+		{
+		if (Diffs.IsLeaf(uDiffsNodeIndex))
+			{
+		// The assert is repeated as a run-time check so that the
+		// bound is also enforced when asserts are compiled out.
+			assert(uDiffsNodeIndex < uNodeCount);
+			if (uDiffsNodeIndex >= uNodeCount)
+				Quit("TreeNodeIndex=%u NodeCount=%u\n", uDiffsNodeIndex, uNodeCount);
+
+			ProgNode &Node = ProgNodes[uDiffsNodeIndex];
+			MakeNode(msaIn, Diffs, uDiffsNodeIndex, IdToDiffsTreeNodeIndex, Node);
+
+			Node.m_uLength = Node.m_MSA.GetColCount();
+			}
+		else
+			{
+			Progress(uJoin, uMergeCount);
+			++uJoin;
+			const unsigned uMergeNodeIndex = uDiffsNodeIndex;
+			ProgNode &Parent = ProgNodes[uMergeNodeIndex];
+
+			const unsigned uLeft = Diffs.GetLeft(uDiffsNodeIndex);
+			const unsigned uRight = Diffs.GetRight(uDiffsNodeIndex);
+
+			ProgNode &Node1 = ProgNodes[uLeft];
+			ProgNode &Node2 = ProgNodes[uRight];
+
+			PWPath Path;
+			AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, Path);
+
+#if TRACE
+			{
+			Log("Combined:\n");
+			Parent.m_MSA.LogMe();
+			}
+#endif
+
+		// The children are no longer needed once merged.
+			Node1.m_MSA.Clear();
+			Node2.m_MSA.Clear();
+			}
+		}
+	ProgressStepsDone();
+
+	unsigned uRootNodeIndex = Diffs.GetRootNodeIndex();
+	const ProgNode &RootProgNode = ProgNodes[uRootNodeIndex];
+	msaOut.Copy(RootProgNode.m_MSA);
+
+#if DEBUG
+	AssertMSAEqIgnoreCaseAndGaps(msaIn, msaOut);
+#endif
+
+	delete[] ProgNodes;
+	ProgNodes = 0;
+	}
diff --git a/src/muscle/muscle3.8.31/src/realigndiffse.cpp b/src/muscle/muscle3.8.31/src/realigndiffse.cpp
new file mode 100644
index 0000000..559c333
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/realigndiffse.cpp
@@ -0,0 +1,142 @@
+#include "muscle.h"
+#include "msa.h"
+#include "tree.h"
+#include "profile.h"
+#include "pwpath.h"
+#include "seqvect.h"
+#include "estring.h"
+
+#define TRACE 0
+
+// Free the profile and both edit strings owned by Node and reset the
+// three pointers to null.
+void DeleteProgNode(ProgNode &Node)
+	{
+	delete[] Node.m_Prof;
+	delete[] Node.m_EstringL;
+	delete[] Node.m_EstringR;
+
+	Node.m_Prof = 0;
+	Node.m_EstringL = 0;
+	Node.m_EstringR = 0;
+	}
+
+// Move profile, edit strings, length and weight from OldNode to
+// NewNode, exchanging the left and right edit strings when bSwapLR is
+// set. OldNode's pointers are nulled afterwards so the arrays have a
+// single owner.
+static void MakeNode(ProgNode &OldNode, ProgNode &NewNode, bool bSwapLR)
+	{
+	if (bSwapLR)
+		{
+		NewNode.m_EstringL = OldNode.m_EstringR;
+		NewNode.m_EstringR = OldNode.m_EstringL;
+		}
+	else
+		{
+		NewNode.m_EstringL = OldNode.m_EstringL;
+		NewNode.m_EstringR = OldNode.m_EstringR;
+		}
+	NewNode.m_Prof = OldNode.m_Prof;
+	NewNode.m_uLength = OldNode.m_uLength;
+	NewNode.m_Weight = OldNode.m_Weight;
+
+	OldNode.m_Prof = 0;
+	OldNode.m_EstringL = 0;
+	OldNode.m_EstringR = 0;
+	}
+
+// Re-align after a tree change, reusing what can be reused.
+// Pass 1: every node of NewTree that maps to an old node (entry is not
+// NODE_CHANGED) takes over that old node's data from OldProgNodes.
+// Pass 2: nodes marked NODE_CHANGED are recomputed depth-first with
+// AlignTwoProfs from their two children.
+// The root alignment is then written to msaOut and all new nodes are
+// released.
+void RealignDiffsE(const MSA &msaIn, const SeqVect &v,
+  const Tree &NewTree, const Tree &OldTree,
+  const unsigned uNewNodeIndexToOldNodeIndex[],
+  MSA &msaOut, ProgNode *OldProgNodes)
+	{
+	assert(OldProgNodes != 0);
+
+	const unsigned uNodeCount = NewTree.GetNodeCount();
+	if (uNodeCount%2 == 0)
+		Quit("RealignDiffs: Expected odd number of nodes");
+
+	const unsigned uMergeCount = (uNodeCount - 1)/2;
+	ProgNode *NewProgNodes = new ProgNode[uNodeCount];
+
+	for (unsigned uNewNodeIndex = 0; uNewNodeIndex < uNodeCount; ++uNewNodeIndex)
+		{
+		if (NODE_CHANGED == uNewNodeIndexToOldNodeIndex[uNewNodeIndex])
+			continue;
+
+		unsigned uOldNodeIndex = uNewNodeIndexToOldNodeIndex[uNewNodeIndex];
+		assert(uNewNodeIndex < uNodeCount);
+		assert(uOldNodeIndex < uNodeCount);
+
+		ProgNode &NewNode = NewProgNodes[uNewNodeIndex];
+		ProgNode &OldNode = OldProgNodes[uOldNodeIndex];
+		bool bSwapLR = false;
+		if (!NewTree.IsLeaf(uNewNodeIndex))
+			{
+		// The same pair of children may appear in the opposite
+		// order in the new tree; detect that so the edit strings
+		// can be exchanged.
+			unsigned uNewLeft = NewTree.GetLeft(uNewNodeIndex);
+			unsigned uNewRight = NewTree.GetRight(uNewNodeIndex);
+			unsigned uOld = uNewNodeIndexToOldNodeIndex[uNewNodeIndex];
+			unsigned uOldLeft = OldTree.GetLeft(uOld);
+			unsigned uOldRight = OldTree.GetRight(uOld);
+			assert(uOldLeft < uNodeCount && uOldRight < uNodeCount);
+			if (uOldLeft != uNewNodeIndexToOldNodeIndex[uNewLeft])
+				{
+				assert(uOldLeft == uNewNodeIndexToOldNodeIndex[uNewRight]);
+				bSwapLR = true;
+				}
+			}
+		MakeNode(OldNode, NewNode, bSwapLR);
+#if TRACE
+		Log("MakeNode old=%u new=%u swap=%d length=%u weight=%.3g\n",
+		  uOldNodeIndex, uNewNodeIndex, bSwapLR, NewNode.m_uLength, NewNode.m_Weight);
+#endif
+		}
+
+	unsigned uJoin = 0;
+	SetProgressDesc("Refine tree");
+	for (unsigned uNewNodeIndex = NewTree.FirstDepthFirstNode();
+	  NULL_NEIGHBOR != uNewNodeIndex;
+	  uNewNodeIndex = NewTree.NextDepthFirstNode(uNewNodeIndex))
+		{
+		if (NODE_CHANGED != uNewNodeIndexToOldNodeIndex[uNewNodeIndex])
+			continue;
+
+		Progress(uJoin, uMergeCount - 1);
+		++uJoin;
+
+		const unsigned uMergeNodeIndex = uNewNodeIndex;
+		ProgNode &Parent = NewProgNodes[uMergeNodeIndex];
+
+		const unsigned uLeft = NewTree.GetLeft(uNewNodeIndex);
+		const unsigned uRight = NewTree.GetRight(uNewNodeIndex);
+
+		ProgNode &Node1 = NewProgNodes[uLeft];
+		ProgNode &Node2 = NewProgNodes[uRight];
+
+		AlignTwoProfs(
+		  Node1.m_Prof, Node1.m_uLength, Node1.m_Weight,
+		  Node2.m_Prof, Node2.m_uLength, Node2.m_Weight,
+		  Parent.m_Path,
+		  &Parent.m_Prof, &Parent.m_uLength);
+		PathToEstrings(Parent.m_Path, &Parent.m_EstringL, &Parent.m_EstringR);
+
+		Parent.m_Weight = Node1.m_Weight + Node2.m_Weight;
+
+	// Child profiles are freed as soon as they have been merged.
+		delete[] Node1.m_Prof;
+		delete[] Node2.m_Prof;
+
+		Node1.m_Prof = 0;
+		Node2.m_Prof = 0;
+		}
+
+	ProgressStepsDone();
+
+	if (g_bBrenner)
+		MakeRootMSABrenner((SeqVect &) v, NewTree, NewProgNodes, msaOut);
+	else
+		MakeRootMSA(v, NewTree, NewProgNodes, msaOut);
+
+#if DEBUG
+	AssertMSAEqIgnoreCaseAndGaps(msaIn, msaOut);
+#endif
+
+	for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
+		DeleteProgNode(NewProgNodes[uNodeIndex]);
+
+	delete[] NewProgNodes;
+	}
diff --git a/src/muscle/muscle3.8.31/src/redblack.cpp b/src/muscle/muscle3.8.31/src/redblack.cpp
new file mode 100644
index 0000000..7194653
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/redblack.cpp
@@ -0,0 +1,471 @@
+#include "muscle.h"
+#include "clust.h"
+
+// Record the metric for the pair (uIndex1, uIndex2) in the tree.
+void Clust::InsertMetric(unsigned uIndex1, unsigned uIndex2, float dMetric)
+	{
+	RBInsert(uIndex1, uIndex2, dMetric);
+	}
+
+// Remove the metric between uIndex and every other cluster returned by
+// the GetFirstCluster / GetNextCluster iteration.
+void Clust::DeleteMetric(unsigned uIndex)
+	{
+	for (unsigned uNodeIndex = GetFirstCluster(); uNodeIndex != uInsane;
+	  uNodeIndex = GetNextCluster(uNodeIndex))
+		{
+		if (uIndex == uNodeIndex)
+			continue;
+		DeleteMetric(uIndex, uNodeIndex);
+		}
+	}
+
+// Allocate the parallel arrays that hold the tree, one slot per entry
+// of the triangular matrix, and start with an empty tree.
+// (uMaxNodeIndex is not used in the body.)
+void Clust::InitMetric(unsigned uMaxNodeIndex)
+	{
+	m_uRBNodeCount = m_uTriangularMatrixSize;
+	m_RBParent = new unsigned[m_uRBNodeCount];
+	m_RBLeft = new unsigned[m_uRBNodeCount];
+	m_RBRight = new unsigned[m_uRBNodeCount];
+	m_RBi = new ushort[m_uRBNodeCount];
+	m_RBj = new ushort[m_uRBNodeCount];
+	m_RBMetric = new float[m_uRBNodeCount];
+	m_RBColor = new bool[m_uRBNodeCount];
+	m_RBRoot = RB_NIL;
+
+#if DEBUG
+	{
+// Initialize fields to invalid values so we have a chance to
+// catch attempts to use them if they're not properly set.
+	unsigned InvalidNode = m_uRBNodeCount + 1;
+	for (unsigned Node = 0; Node < m_uRBNodeCount; ++Node)
+		{
+		m_RBParent[Node] = InvalidNode;
+		m_RBLeft[Node] = InvalidNode;
+		m_RBRight[Node] = InvalidNode;
+		m_RBi[Node] = InvalidNode;
+		m_RBj[Node] = InvalidNode;
+		}
+	}
+#endif
+	}
+
+// Log every node in key order (RBMin, then repeated RBNext). The walk
+// is cut off with a "LOOP" marker if it visits more nodes than exist.
+void Clust::ListMetric() const
+	{
+	Log("Red-black tree root=%u\n", m_RBRoot);
+	Log("\n");
+	Log(" Node Parent Left Right Color i j Metric\n");
+	Log("----- ------ ----- ----- ----- ----- ----- ------\n");
+
+	if (RB_NIL == m_RBRoot)
+		return;
+
+	unsigned Count = 0;
+	unsigned Start = RBMin(m_RBRoot);
+	for (unsigned Node = Start; RB_NIL != Node; Node = RBNext(Node))
+		{
+		Log("%5u", Node);
+
+		if (RB_NIL != m_RBParent[Node])
+			Log(" %6u", m_RBParent[Node]);
+		else
+			Log(" ");
+
+		if (RB_NIL != m_RBLeft[Node])
+			Log(" %5u", m_RBLeft[Node]);
+		else
+			Log(" ");
+
+		if (RB_NIL != m_RBRight[Node])
+			Log(" %5u", m_RBRight[Node]);
+		else
+			Log(" ");
+
+		Log(" %s %5u %5u %g\n",
+		  m_RBColor[Node] ? " Red" : "Black",
+		  m_RBi[Node],
+		  m_RBj[Node],
+		  m_RBMetric[Node]);
+
+		if (++Count > m_uRBNodeCount)
+			{
+			Log(" ** LOOP ** \n");
+			break;
+			}
+		}
+	}
+
+// If there is a left subtree, predecessor is the
+// largest key found under the left branch. Otherwise,
+// is first node in path to root that is a right child.
+// Returns RB_NIL if Node has no predecessor.
+unsigned Clust::RBPrev(unsigned Node) const
+	{
+	assert(Node < m_uRBNodeCount);
+
+	unsigned Left = m_RBLeft[Node];
+	if (RB_NIL != Left)
+		return RBMax(Left);
+
+	for (;;)
+		{
+		unsigned Parent = m_RBParent[Node];
+		if (RB_NIL == Parent)
+			return RB_NIL;
+		if (m_RBRight[Parent] == Node)
+			return Parent;
+		Node = Parent;
+		}
+	}
+
+// If there is a right subtree, successor is the
+// smallest key found under the right branch. Otherwise,
+// is first node in path to root that is a left child.
+unsigned Clust::RBNext(unsigned Node) const + { + if (Node >= m_uRBNodeCount) + Quit("RBNext(%u)", Node); + assert(Node < m_uRBNodeCount); + + unsigned Right = m_RBRight[Node]; + if (RB_NIL != Right) + return RBMin(Right); + + for (;;) + { + unsigned Parent = m_RBParent[Node]; + if (RB_NIL == Parent) + return RB_NIL; + if (m_RBLeft[Parent] == Node) + return Parent; + Node = Parent; + } + } + +// Minimum is in leftmost leaf +unsigned Clust::RBMin(unsigned RBNode) const + { + assert(RB_NIL != RBNode); + for (;;) + { + unsigned Left = m_RBLeft[RBNode]; + if (RB_NIL == Left) + return RBNode; + RBNode = Left; + } + } + +// Maximum is in rightmost leaf +unsigned Clust::RBMax(unsigned RBNode) const + { + assert(RB_NIL != RBNode); + for (;;) + { + unsigned Right = m_RBRight[RBNode]; + if (RB_NIL == Right) + return RBNode; + RBNode = Right; + } + } + +void Clust::DeleteMetric(unsigned uIndex1, unsigned uIndex2) + { + unsigned RBNode = (unsigned) VectorIndex(uIndex1, uIndex2); + RBDelete(RBNode); + } + +void Clust::RBDelete(unsigned Node) + { +#if DEBUG + ValidateRB(); + //Log("@@ Before RBDelete(%u)\n", Node); + //ListMetric(); +#endif + + unsigned Left = m_RBLeft[Node]; + unsigned Right = m_RBRight[Node]; + unsigned Parent = m_RBParent[Node]; + +// If one or two nil children, splice out this node. + if (RB_NIL == Left || RB_NIL == Right) + { +// Log("@@ One child\n"); + // Child is non-NIL child, or NIL if none. + unsigned Child = (Left != RB_NIL ? Left : Right); + + // Special case if root + if (RB_NIL == Parent) + { + assert(Node == m_RBRoot); + m_RBRoot = Child; + if (RB_NIL != Child) + m_RBParent[Child] = RB_NIL; + return; + } + + // Typical case. 
+ // Update parent->child link + if (m_RBLeft[Parent] == Node) + m_RBLeft[Parent] = Child; + else + { + assert(m_RBRight[Parent] == Node); + m_RBRight[Parent] = Child; + } + + // Update child->parent link + if (RB_NIL != Child) + m_RBParent[Child] = Parent; + +#if DEBUG + //Log("@@ After RBDelete(%u)\n", Node); + //ListMetric(); + ValidateRB(); +#endif + return; + } + + //Log("@@ RBDelete(%u) Tricky case\n", Node); + //ListMetric(); + +// Trickier case, node has two children. + assert(Left != RB_NIL && Right != RB_NIL); + +// We're going to splice out successor node from its +// current position and insert it in place of node +// to be deleted. + +// Successor cannot be nil because there is a right child. + unsigned Next = RBNext(Node); + assert(Next != RB_NIL); + +// The successor of a node with two children is +// guaranteed to have no more than one child. + unsigned NextLeft = m_RBLeft[Next]; + unsigned NextRight = m_RBRight[Next]; + assert(RB_NIL == NextLeft || RB_NIL == NextRight); + +// Successor of node with two children cannot be the root. + unsigned NextParent = m_RBParent[Next]; + assert(RB_NIL != NextParent); + +// Ugly special case if successor is right child + if (Next == Right) + { +#if DEBUG + //Log("@@ Before RBDelete(%u) (tricky next==right)\n", Node); + //ListMetric(); +#endif + m_RBParent[Next] = Parent; + + if (RB_NIL == Parent) + { + m_RBRoot = Next; + m_RBParent[Next] = RB_NIL; + } + else + { + if (m_RBLeft[Parent] == Node) + m_RBLeft[Parent] = Next; + else + { + assert(m_RBRight[Parent] == Node); + m_RBRight[Parent] = Next; + } + } + + m_RBLeft[Next] = Left; + + if (RB_NIL != Left) + m_RBParent[Left] = Next; + +#if DEBUG + //Log("@@ After RBDelete(%u) (tricky next==right)\n", Node); + //ListMetric(); + ValidateRB(); +#endif + return; + } + +// Set NextChild either to the one child of successor, or nil. + unsigned NextChild = (NextLeft != RB_NIL ? 
NextLeft : NextRight); + +// Splice successor from its current position + if (m_RBLeft[NextParent] == Next) + m_RBLeft[NextParent] = NextChild; + else + { + assert(m_RBRight[NextParent] == Next); + m_RBRight[NextParent] = NextChild; + } + + if (RB_NIL != NextChild) + m_RBParent[NextChild] = NextParent; + +// Insert successor into position currently held by node +// to be deleted. + if (RB_NIL == Parent) + { + m_RBRoot = Next; + m_RBParent[Next] = RB_NIL; + } + else + { + if (m_RBLeft[Parent] == Node) + m_RBLeft[Parent] = Next; + else + { + assert(m_RBRight[Parent] == Node); + m_RBRight[Parent] = Next; + } + } + + m_RBLeft[Next] = Left; + m_RBRight[Next] = Right; + m_RBParent[Next] = Parent; + + m_RBParent[Left] = Next; + m_RBParent[Right] = Next; + +#if DEBUG + //Log("@@ After RBDelete(%u)\n", Node); + //ListMetric(); + ValidateRB(); +#endif + } + +unsigned Clust::RBInsert(unsigned i, unsigned j, float fMetric) + { +#if DEBUG + ValidateRB(); +#endif + + unsigned NewNode = VectorIndex(i, j); + m_RBMetric[NewNode] = fMetric; + m_RBi[NewNode] = i; + m_RBj[NewNode] = j; + +// New node is always inserted as a leaf. +// Proof that this is possible is found in algorithm +// textbooks (I forget the argument). 
+ m_RBLeft[NewNode] = RB_NIL; + m_RBRight[NewNode] = RB_NIL; + + unsigned NewParent = RB_NIL; + unsigned Node = m_RBRoot; + + unsigned uCount = 0; + while (RB_NIL != Node) + { + NewParent = Node; + if (fMetric < m_RBMetric[Node]) + Node = m_RBLeft[Node]; + else + Node = m_RBRight[Node]; + ++uCount; + if (uCount > m_uRBNodeCount) + Quit("Infinite loop in RBInsert"); + } + + m_RBParent[NewNode] = NewParent; + if (RB_NIL == NewParent) + m_RBRoot = NewNode; + else + { + if (fMetric < m_RBMetric[NewParent]) + m_RBLeft[NewParent] = NewNode; + else + m_RBRight[NewParent] = NewNode; + } + +#if DEBUG + { + unsigned Next = RBNext(NewNode); + if (Next != RB_NIL) + assert(NewNode == RBPrev(Next)); + unsigned Prev = RBPrev(NewNode); + if (Prev != RB_NIL) + assert(NewNode == RBNext(Prev)); + ValidateRB(); + } +#endif + return NewNode; + } + +void Clust::ValidateRBNode(unsigned Node, const char szMsg[]) const + { + if (RB_NIL == Node) + return; + + unsigned Parent = m_RBParent[Node]; + unsigned Left = m_RBLeft[Node]; + unsigned Right = m_RBRight[Node]; + + unsigned Next = RBNext(Node); + unsigned Prev = RBPrev(Node); + + if (RB_NIL != Next && RBPrev(Next) != Node) + { + ListMetric(); + Quit("ValidateRB(%s) Node=%u Next=%u Prev(Next)=%u", + szMsg, Node, Next, RBPrev(Next)); + } + + if (RB_NIL != Prev && RBNext(Prev) != Node) + { + ListMetric(); + Quit("ValidateRB(%s) Node=%u Prev=%u Next(Prev)=%u", + szMsg, Node, Prev, RBNext(Prev)); + } + + if (RB_NIL != Parent) + { + if (m_RBLeft[Parent] != Node && m_RBRight[Parent] != Node) + { + ListMetric(); + Quit("ValidateRB(%s): Parent %u not linked to child %u\n", + szMsg, Parent, Node); + } + } + + if (RB_NIL != Left) + { + if (m_RBParent[Left] != Node) + { + ListMetric(); + Quit("ValidateRB(%s): Left child %u not linked to parent %u\n", + szMsg, Left, Node); + } + } + + if (RB_NIL != Right) + { + if (m_RBParent[Right] != Node) + { + ListMetric(); + Quit("ValidateRB(%s): Right child %u not linked to parent %u\n", + szMsg, Right, Node); + 
} + } + + ValidateRBNode(Left, szMsg); + ValidateRBNode(Right, szMsg); + } + +void Clust::ValidateRB(const char szMsg[]) const + { + if (RB_NIL == m_RBRoot) + return; + + ValidateRBNode(m_RBRoot, szMsg); + + unsigned Node = RBMin(m_RBRoot); + for (;;) + { + unsigned Next = RBNext(Node); + if (RB_NIL == Next) + break; + if (m_RBMetric[Node] > m_RBMetric[Next]) + { + ListMetric(); + Quit("ValidateRBNode(%s): metric out of order %u=%g %u=%g", + szMsg, Node, m_RBMetric[Node], Next, m_RBMetric[Next]); + } + Node = Next; + } + } diff --git a/src/muscle/muscle3.8.31/src/refine.cpp b/src/muscle/muscle3.8.31/src/refine.cpp new file mode 100644 index 0000000..94f0046 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/refine.cpp @@ -0,0 +1,80 @@ +#include "muscle.h" +#include "textfile.h" +#include "seqvect.h" +#include "distfunc.h" +#include "msa.h" +#include "tree.h" +#include "clust.h" +#include "profile.h" +#include "clustsetmsa.h" + +void Refine() + { + SetOutputFileName(g_pstrOutFileName); + SetInputFileName(g_pstrInFileName); + SetStartTime(); + + SetMaxIters(g_uMaxIters); + SetSeqWeightMethod(g_SeqWeight1); + + TextFile fileIn(g_pstrInFileName); + MSA msa; + msa.FromFile(fileIn); + + const unsigned uSeqCount = msa.GetSeqCount(); + if (0 == uSeqCount) + Quit("No sequences in input file"); + + ALPHA Alpha = ALPHA_Undefined; + switch (g_SeqType) + { + case SEQTYPE_Auto: + Alpha = msa.GuessAlpha(); + break; + + case SEQTYPE_Protein: + Alpha = ALPHA_Amino; + break; + + case SEQTYPE_DNA: + Alpha = ALPHA_DNA; + break; + + case SEQTYPE_RNA: + Alpha = ALPHA_RNA; + break; + + default: + Quit("Invalid SeqType"); + } + SetAlpha(Alpha); + msa.FixAlpha(); + + SetPPScore(); + if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha) + SetPPScore(PPSCORE_SPN); + + MSA::SetIdCount(uSeqCount); + +// Initialize sequence ids. +// From this point on, ids must somehow propogate from here. 
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + msa.SetSeqId(uSeqIndex, uSeqIndex); + SetMuscleInputMSA(msa); + + Tree GuideTree; + TreeFromMSA(msa, GuideTree, g_Cluster2, g_Distance2, g_Root2); + SetMuscleTree(GuideTree); + + if (g_bAnchors) + RefineVert(msa, GuideTree, g_uMaxIters); + else + RefineHoriz(msa, GuideTree, g_uMaxIters, false, false); + + ValidateMuscleIds(msa); + ValidateMuscleIds(GuideTree); + +// TextFile fileOut(g_pstrOutFileName, true); +// msa.ToFile(fileOut); + MuscleOutput(msa); + } diff --git a/src/muscle/muscle3.8.31/src/refinehoriz.cpp b/src/muscle/muscle3.8.31/src/refinehoriz.cpp new file mode 100644 index 0000000..cbb04fb --- /dev/null +++ b/src/muscle/muscle3.8.31/src/refinehoriz.cpp @@ -0,0 +1,288 @@ +#include "muscle.h" +#include "tree.h" +#include "msa.h" +#include "pwpath.h" +#include "profile.h" +#include "scorehistory.h" +#include "objscore.h" + +unsigned g_uRefineHeightSubtree; +unsigned g_uRefineHeightSubtreeTotal; + +#define TRACE 0 +#define DIFFOBJSCORE 0 + +static bool TryRealign(MSA &msaIn, const Tree &tree, const unsigned Leaves1[], + unsigned uCount1, const unsigned Leaves2[], unsigned uCount2, + SCORE *ptrscoreBefore, SCORE *ptrscoreAfter, + bool bLockLeft, bool bLockRight) + { +#if TRACE + Log("TryRealign, msaIn=\n"); + msaIn.LogMe(); +#endif + + const unsigned uSeqCount = msaIn.GetSeqCount(); + + unsigned *Ids1 = new unsigned[uSeqCount]; + unsigned *Ids2 = new unsigned[uSeqCount]; + + LeafIndexesToIds(tree, Leaves1, uCount1, Ids1); + LeafIndexesToIds(tree, Leaves2, uCount2, Ids2); + + MSA msa1; + MSA msa2; + + MSASubsetByIds(msaIn, Ids1, uCount1, msa1); + MSASubsetByIds(msaIn, Ids2, uCount2, msa2); + +#if DEBUG + ValidateMuscleIds(msa1); + ValidateMuscleIds(msa2); +#endif + +// Computing the objective score may be expensive for +// large numbers of sequences. As a speed optimization, +// we check whether the alignment changes. 
If it does +// not change, there is no need to compute the objective +// score. We test for the alignment changing by comparing +// the Viterbi paths before and after re-aligning. + PWPath pathBefore; + pathBefore.FromMSAPair(msa1, msa2); + + DeleteGappedCols(msa1); + DeleteGappedCols(msa2); + + if (0 == msa1.GetColCount() || 0 == msa2.GetColCount()) + return false; + + MSA msaRealigned; + PWPath pathAfter; + + AlignTwoMSAs(msa1, msa2, msaRealigned, pathAfter, bLockLeft, bLockRight); + + bool bAnyChanges = !pathAfter.Equal(pathBefore); + unsigned uDiffCount1; + unsigned uDiffCount2; + static unsigned Edges1[10000]; + static unsigned Edges2[10000]; + DiffPaths(pathBefore, pathAfter, Edges1, &uDiffCount1, Edges2, &uDiffCount2); + +#if TRACE + Log("TryRealign, msa1=\n"); + msa1.LogMe(); + Log("\nmsa2=\n"); + msa2.LogMe(); + Log("\nRealigned (changes %s)=\n", bAnyChanges ? "TRUE" : "FALSE"); + msaRealigned.LogMe(); +#endif + + if (!bAnyChanges) + { + *ptrscoreBefore = 0; + *ptrscoreAfter = 0; + return false; + } + + SetMSAWeightsMuscle(msaIn); + SetMSAWeightsMuscle(msaRealigned); + +#if DIFFOBJSCORE + const SCORE scoreDiff = DiffObjScore(msaIn, pathBefore, Edges1, uDiffCount1, + msaRealigned, pathAfter, Edges2, uDiffCount2); + bool bAccept = (scoreDiff > 0); + *ptrscoreBefore = 0; + *ptrscoreAfter = scoreDiff; + //const SCORE scoreBefore = ObjScoreIds(msaIn, Ids1, uCount1, Ids2, uCount2); + //const SCORE scoreAfter = ObjScoreIds(msaRealigned, Ids1, uCount1, Ids2, uCount2); + //Log("Diff = %.3g %.3g\n", scoreDiff, scoreAfter - scoreBefore); +#else + const SCORE scoreBefore = ObjScoreIds(msaIn, Ids1, uCount1, Ids2, uCount2); + const SCORE scoreAfter = ObjScoreIds(msaRealigned, Ids1, uCount1, Ids2, uCount2); + + bool bAccept = (scoreAfter > scoreBefore); + +#if TRACE + Log("Score %g -> %g Accept %s\n", scoreBefore, scoreAfter, bAccept ? 
"TRUE" : "FALSE"); +#endif + + *ptrscoreBefore = scoreBefore; + *ptrscoreAfter = scoreAfter; +#endif + + if (bAccept) + msaIn.Copy(msaRealigned); + delete[] Ids1; + delete[] Ids2; + return bAccept; + } + +static void RefineHeightParts(MSA &msaIn, const Tree &tree, + const unsigned InternalNodeIndexes[], bool bReversed, bool bRight, + unsigned uIter, + ScoreHistory &History, + bool *ptrbAnyChanges, bool *ptrbOscillating, bool bLockLeft, bool bLockRight) + { + *ptrbOscillating = false; + + const unsigned uSeqCount = msaIn.GetSeqCount(); + const unsigned uInternalNodeCount = uSeqCount - 1; + + unsigned *Leaves1 = new unsigned[uSeqCount]; + unsigned *Leaves2 = new unsigned[uSeqCount]; + + const unsigned uRootNodeIndex = tree.GetRootNodeIndex(); + bool bAnyAccepted = false; + for (unsigned i = 0; i < uInternalNodeCount; ++i) + { + const unsigned uInternalNodeIndex = InternalNodeIndexes[i]; + unsigned uNeighborNodeIndex; + if (tree.IsRoot(uInternalNodeIndex) && !bRight) + continue; + else if (bRight) + uNeighborNodeIndex = tree.GetRight(uInternalNodeIndex); + else + uNeighborNodeIndex = tree.GetLeft(uInternalNodeIndex); + + g_uTreeSplitNode1 = uInternalNodeIndex; + g_uTreeSplitNode2 = uNeighborNodeIndex; + + unsigned uCount1; + unsigned uCount2; + + GetLeaves(tree, uNeighborNodeIndex, Leaves1, &uCount1); + GetLeavesExcluding(tree, uRootNodeIndex, uNeighborNodeIndex, + Leaves2, &uCount2); + +#if TRACE + Log("\nRefineHeightParts node %u\n", uInternalNodeIndex); + Log("Group1="); + for (unsigned n = 0; n < uCount1; ++n) + Log(" %u(%s)", Leaves1[n], tree.GetName(Leaves1[n])); + Log("\n"); + Log("Group2="); + for (unsigned n = 0; n < uCount2; ++n) + Log(" %u(%s)", Leaves2[n], tree.GetName(Leaves2[n])); + Log("\n"); +#endif + + SCORE scoreBefore; + SCORE scoreAfter; + bool bAccepted = TryRealign(msaIn, tree, Leaves1, uCount1, Leaves2, uCount2, + &scoreBefore, &scoreAfter, bLockLeft, bLockRight); + SetCurrentAlignment(msaIn); + + ++g_uRefineHeightSubtree; + 
Progress(g_uRefineHeightSubtree, g_uRefineHeightSubtreeTotal); + +#if TRACE + if (uIter > 0) + Log("Before %g %g\n", scoreBefore, + History.GetScore(uIter - 1, uInternalNodeIndex, bReversed, bRight)); +#endif + SCORE scoreMax = scoreAfter > scoreBefore? scoreAfter : scoreBefore; + bool bRepeated = History.SetScore(uIter, uInternalNodeIndex, bRight, scoreMax); + if (bRepeated) + { + *ptrbOscillating = true; + break; + } + + if (bAccepted) + bAnyAccepted = true; + } + + delete[] Leaves1; + delete[] Leaves2; + + *ptrbAnyChanges = bAnyAccepted; + } + +// Return true if any changes made +bool RefineHoriz(MSA &msaIn, const Tree &tree, unsigned uIters, bool bLockLeft, + bool bLockRight) + { +#if TRACE + tree.LogMe(); +#endif + + if (!tree.IsRooted()) + Quit("RefineHeight: requires rooted tree"); + + const unsigned uSeqCount = msaIn.GetSeqCount(); + if (uSeqCount < 3) + return false; + + const unsigned uInternalNodeCount = uSeqCount - 1; + unsigned *InternalNodeIndexes = new unsigned[uInternalNodeCount]; + unsigned *InternalNodeIndexesR = new unsigned[uInternalNodeCount]; + + GetInternalNodesInHeightOrder(tree, InternalNodeIndexes); + + ScoreHistory History(uIters, 2*uSeqCount - 1); + + bool bAnyChangesAnyIter = false; + for (unsigned n = 0; n < uInternalNodeCount; ++n) + InternalNodeIndexesR[uInternalNodeCount - 1 - n] = InternalNodeIndexes[n]; + + for (unsigned uIter = 0; uIter < uIters; ++uIter) + { + bool bAnyChangesThisIter = false; + IncIter(); + SetProgressDesc("Refine biparts"); + g_uRefineHeightSubtree = 0; + g_uRefineHeightSubtreeTotal = uInternalNodeCount*2 - 1; + + bool bReverse = (uIter%2 != 0); + unsigned *Internals; + if (bReverse) + Internals = InternalNodeIndexesR; + else + Internals = InternalNodeIndexes; + + bool bOscillating; + for (unsigned i = 0; i < 2; ++i) + { + bool bAnyChanges = false; + bool bRight; + switch (i) + { + case 0: + bRight = true; + break; + case 1: + bRight = false; + break; + default: + Quit("RefineHeight default case"); + } + 
RefineHeightParts(msaIn, tree, Internals, bReverse, bRight, + uIter, + History, + &bAnyChanges, &bOscillating, bLockLeft, bLockRight); + if (bOscillating) + { + ProgressStepsDone(); + goto Osc; + } + if (bAnyChanges) + { + bAnyChangesThisIter = true; + bAnyChangesAnyIter = true; + } + } + + ProgressStepsDone(); + if (bOscillating) + break; + + if (!bAnyChangesThisIter) + break; + } + +Osc: + delete[] InternalNodeIndexes; + delete[] InternalNodeIndexesR; + + return bAnyChangesAnyIter; + } diff --git a/src/muscle/muscle3.8.31/src/refinesubfams.cpp b/src/muscle/muscle3.8.31/src/refinesubfams.cpp new file mode 100644 index 0000000..600e394 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/refinesubfams.cpp @@ -0,0 +1,212 @@ +#include "muscle.h" +#include "msa.h" +#include "tree.h" +#include "clust.h" +#include "profile.h" +#include "pwpath.h" + +#define TRACE 0 + +static void ProgressiveAlignSubfams(const Tree &tree, const unsigned Subfams[], + unsigned uSubfamCount, const MSA SubfamMSAs[], MSA &msa); + +// Identify subfamilies in a tree. +// Returns array of internal node indexes, one for each subfamily. +// First try is to select groups by height (which should approximate +// minimum percent identity), if this gives too many subfamilies then +// we cut at a point that gives the maximum allowed number of subfams. 
+static void GetSubfams(const Tree &tree, double dMaxHeight, + unsigned uMaxSubfamCount, unsigned **ptrptrSubfams, unsigned *ptruSubfamCount) + { + const unsigned uNodeCount = tree.GetNodeCount(); + + unsigned *Subfams = new unsigned[uNodeCount]; + + unsigned uSubfamCount; + ClusterByHeight(tree, dMaxHeight, Subfams, &uSubfamCount); + + if (uSubfamCount > uMaxSubfamCount) + ClusterBySubfamCount(tree, uMaxSubfamCount, Subfams, &uSubfamCount); + + *ptrptrSubfams = Subfams; + *ptruSubfamCount = uSubfamCount; + } + +static void LogSubfams(const Tree &tree, const unsigned Subfams[], + unsigned uSubfamCount) + { + const unsigned uNodeCount = tree.GetNodeCount(); + Log("%u subfamilies found\n", uSubfamCount); + Log("Subfam Sequence\n"); + Log("------ --------\n"); + unsigned *Leaves = new unsigned[uNodeCount]; + for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex) + { + unsigned uSubfamNodeIndex = Subfams[uSubfamIndex]; + unsigned uLeafCount; + GetLeaves(tree, uSubfamNodeIndex, Leaves, &uLeafCount); + for (unsigned uLeafIndex = 0; uLeafIndex < uLeafCount; ++uLeafIndex) + Log("%6u %s\n", uSubfamIndex + 1, tree.GetLeafName(Leaves[uLeafIndex])); + Log("\n"); + } + delete[] Leaves; + } + +bool RefineSubfams(MSA &msa, const Tree &tree, unsigned uIters) + { + const unsigned uSeqCount = msa.GetSeqCount(); + if (uSeqCount < 3) + return false; + + const double dMaxHeight = 0.6; + const unsigned uMaxSubfamCount = 16; + const unsigned uNodeCount = tree.GetNodeCount(); + + unsigned *Subfams; + unsigned uSubfamCount; + GetSubfams(tree, dMaxHeight, uMaxSubfamCount, &Subfams, &uSubfamCount); + assert(uSubfamCount <= uSeqCount); + + if (g_bVerbose) + LogSubfams(tree, Subfams, uSubfamCount); + + MSA *SubfamMSAs = new MSA[uSubfamCount]; + unsigned *Leaves = new unsigned[uSeqCount]; + unsigned *Ids = new unsigned[uSeqCount]; + + bool bAnyChanges = false; + for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex) + { + unsigned uSubfam = 
Subfams[uSubfamIndex]; + unsigned uLeafCount; + GetLeaves(tree, uSubfam, Leaves, &uLeafCount); + assert(uLeafCount <= uSeqCount); + + LeafIndexesToIds(tree, Leaves, uLeafCount, Ids); + + MSA &msaSubfam = SubfamMSAs[uSubfamIndex]; + MSASubsetByIds(msa, Ids, uLeafCount, msaSubfam); + DeleteGappedCols(msaSubfam); + +#if TRACE + Log("Subfam %u MSA=\n", uSubfamIndex); + msaSubfam.LogMe(); +#endif + + if (msaSubfam.GetSeqCount() <= 2) + continue; + + // TODO ///////////////////////////////////////// + // Try using existing tree, may actually hurt to + // re-estimate, may also be a waste of CPU & mem. + ///////////////////////////////////////////////// + Tree SubfamTree; + TreeFromMSA(msaSubfam, SubfamTree, g_Cluster2, g_Distance2, g_Root2); + + bool bAnyChangesThisSubfam; + if (g_bAnchors) + bAnyChangesThisSubfam = RefineVert(msaSubfam, SubfamTree, uIters); + else + bAnyChangesThisSubfam = RefineHoriz(msaSubfam, SubfamTree, uIters, false, false); +#if TRACE + Log("Subfam %u Changed %d\n", uSubfamIndex, bAnyChangesThisSubfam); +#endif + if (bAnyChangesThisSubfam) + bAnyChanges = true; + } + + if (bAnyChanges) + ProgressiveAlignSubfams(tree, Subfams, uSubfamCount, SubfamMSAs, msa); + + delete[] Leaves; delete[] Ids; /* Ids is allocated together with Leaves above; it was previously never released */ + delete[] Subfams; + delete[] SubfamMSAs; + + return bAnyChanges; + } + +static void ProgressiveAlignSubfams(const Tree &tree, const unsigned Subfams[], + unsigned uSubfamCount, const MSA SubfamMSAs[], MSA &msa) + { + const unsigned uNodeCount = tree.GetNodeCount(); + + bool *Ready = new bool[uNodeCount]; + MSA **MSAs = new MSA *[uNodeCount]; + for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) + { + Ready[uNodeIndex] = false; + MSAs[uNodeIndex] = 0; + } + + for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex) + { + unsigned uNodeIndex = Subfams[uSubfamIndex]; + Ready[uNodeIndex] = true; + MSA *ptrMSA = new MSA; + // TODO: Wasteful copy, needs re-design + ptrMSA->Copy(SubfamMSAs[uSubfamIndex]); + MSAs[uNodeIndex] = ptrMSA; + } + + 
for (unsigned uNodeIndex = tree.FirstDepthFirstNode(); + NULL_NEIGHBOR != uNodeIndex; + uNodeIndex = tree.NextDepthFirstNode(uNodeIndex)) + { + if (tree.IsLeaf(uNodeIndex)) + continue; + + unsigned uRight = tree.GetRight(uNodeIndex); + unsigned uLeft = tree.GetLeft(uNodeIndex); + if (!Ready[uRight] || !Ready[uLeft]) + continue; + + MSA *ptrLeft = MSAs[uLeft]; + MSA *ptrRight = MSAs[uRight]; + assert(ptrLeft != 0 && ptrRight != 0); + + MSA *ptrParent = new MSA; + + PWPath Path; + AlignTwoMSAs(*ptrLeft, *ptrRight, *ptrParent, Path); + + MSAs[uNodeIndex] = ptrParent; + Ready[uNodeIndex] = true; + Ready[uLeft] = false; + Ready[uRight] = false; + + delete MSAs[uLeft]; + delete MSAs[uRight]; + MSAs[uLeft] = 0; + MSAs[uRight] = 0; + } + +#if DEBUG + { + unsigned uReadyCount = 0; + for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) + { + if (Ready[uNodeIndex]) + { + assert(tree.IsRoot(uNodeIndex)); + ++uReadyCount; + assert(0 != MSAs[uNodeIndex]); + } + else + assert(0 == MSAs[uNodeIndex]); + } + assert(1 == uReadyCount); + } +#endif + + const unsigned uRoot = tree.GetRootNodeIndex(); + MSA *ptrRootAlignment = MSAs[uRoot]; + + msa.Copy(*ptrRootAlignment); + + delete ptrRootAlignment; delete[] MSAs; delete[] Ready; /* the per-node bookkeeping arrays are not used past this point and were previously leaked */ + +#if TRACE + Log("After refine subfamilies, root alignment=\n"); + msa.LogMe(); +#endif + } diff --git a/src/muscle/muscle3.8.31/src/refinetree.cpp b/src/muscle/muscle3.8.31/src/refinetree.cpp new file mode 100644 index 0000000..313bbed --- /dev/null +++ b/src/muscle/muscle3.8.31/src/refinetree.cpp @@ -0,0 +1,59 @@ +#include "muscle.h" +#include "msa.h" +#include "tree.h" +#include "profile.h" +#include + +void RefineTree(MSA &msa, Tree &tree) + { + const unsigned uSeqCount = msa.GetSeqCount(); + if (tree.GetLeafCount() != uSeqCount) + Quit("Refine tree, tree has different number of nodes"); + + if (uSeqCount < 3) + return; + +#if DEBUG + ValidateMuscleIds(msa); + ValidateMuscleIds(tree); +#endif + + unsigned *IdToDiffsLeafNodeIndex = new unsigned[uSeqCount]; + unsigned 
uDiffsCount = uSeqCount; + Tree Tree2; + for (unsigned uIter = 0; uIter < g_uMaxTreeRefineIters; ++uIter) + { + TreeFromMSA(msa, Tree2, g_Cluster2, g_Distance2, g_Root2, g_pstrDistMxFileName2); + +#if DEBUG + ValidateMuscleIds(Tree2); +#endif + + Tree Diffs; + DiffTrees(Tree2, tree, Diffs, IdToDiffsLeafNodeIndex); + + tree.Copy(Tree2); + + const unsigned uNewDiffsNodeCount = Diffs.GetNodeCount(); + const unsigned uNewDiffsCount = (uNewDiffsNodeCount - 1)/2; + + if (0 == uNewDiffsCount || uNewDiffsCount >= uDiffsCount) + { + ProgressStepsDone(); + break; + } + uDiffsCount = uNewDiffsCount; + + MSA msa2; + RealignDiffs(msa, Diffs, IdToDiffsLeafNodeIndex, msa2); + +#if DEBUG + ValidateMuscleIds(msa2); +#endif + + msa.Copy(msa2); + SetCurrentAlignment(msa); + } + + delete[] IdToDiffsLeafNodeIndex; + } diff --git a/src/muscle/muscle3.8.31/src/refinetreee.cpp b/src/muscle/muscle3.8.31/src/refinetreee.cpp new file mode 100644 index 0000000..6d8afc6 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/refinetreee.cpp @@ -0,0 +1,51 @@ +#include "muscle.h" +#include "msa.h" +#include "tree.h" +#include "profile.h" +#include + +#define TRACE 0 + +void RefineTreeE(MSA &msa, const SeqVect &v, Tree &tree, ProgNode *ProgNodes) + { + const unsigned uSeqCount = msa.GetSeqCount(); + if (tree.GetLeafCount() != uSeqCount) + Quit("Refine tree, tree has different number of nodes"); + + if (uSeqCount < 3) + return; + +#if DEBUG + ValidateMuscleIds(msa); + ValidateMuscleIds(tree); +#endif + + const unsigned uNodeCount = tree.GetNodeCount(); + unsigned *uNewNodeIndexToOldNodeIndex= new unsigned[uNodeCount]; + + Tree Tree2; + TreeFromMSA(msa, Tree2, g_Cluster2, g_Distance2, g_Root2, g_pstrDistMxFileName2); + +#if DEBUG + ValidateMuscleIds(Tree2); +#endif + + DiffTreesE(Tree2, tree, uNewNodeIndexToOldNodeIndex); + + unsigned uRoot = Tree2.GetRootNodeIndex(); + if (NODE_CHANGED == uNewNodeIndexToOldNodeIndex[uRoot]) + { + MSA msa2; + RealignDiffsE(msa, v, Tree2, tree, uNewNodeIndexToOldNodeIndex, 
msa2, ProgNodes); + tree.Copy(Tree2); + msa.Copy(msa2); +#if DEBUG + ValidateMuscleIds(msa2); +#endif + } + + delete[] uNewNodeIndexToOldNodeIndex; + + SetCurrentAlignment(msa); + ProgressStepsDone(); + } diff --git a/src/muscle/muscle3.8.31/src/refinevert.cpp b/src/muscle/muscle3.8.31/src/refinevert.cpp new file mode 100644 index 0000000..0655651 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/refinevert.cpp @@ -0,0 +1,159 @@ +#include "muscle.h" +#include "profile.h" +#include "msa.h" +#include "pwpath.h" +#include "seqvect.h" +#include "clust.h" +#include "tree.h" + +#define TRACE 0 + +struct Range + { + unsigned m_uBestColLeft; + unsigned m_uBestColRight; + }; + +static void ListVertSavings(unsigned uColCount, unsigned uAnchorColCount, + const Range *Ranges, unsigned uRangeCount) + { + if (!g_bVerbose || !g_bAnchors) + return; + double dTotalArea = (double) uColCount*uColCount; /* widen before multiplying: the unsigned product wraps for very wide alignments */ + double dArea = 0.0; + for (unsigned i = 0; i < uRangeCount; ++i) + { + unsigned uLength = Ranges[i].m_uBestColRight - Ranges[i].m_uBestColLeft; + dArea += (double) uLength*uLength; /* same widening as dTotalArea so both areas are computed in double */ + } + double dPct = (dTotalArea - dArea)*100.0/dTotalArea; + Log("Anchor columns found %u\n", uAnchorColCount); + Log("DP area saved by anchors %-4.1f%%\n", dPct); + } + +static void ColsToRanges(const unsigned BestCols[], unsigned uBestColCount, + unsigned uColCount, Range Ranges[]) + { +// N best columns produces N+1 vertical blocks. 
+ const unsigned uRangeCount = uBestColCount + 1; + for (unsigned uIndex = 0; uIndex < uRangeCount ; ++uIndex) + { + unsigned uBestColLeft = 0; + if (uIndex > 0) + uBestColLeft = BestCols[uIndex-1]; + + unsigned uBestColRight = uColCount; + if (uIndex < uBestColCount) + uBestColRight = BestCols[uIndex]; + + Ranges[uIndex].m_uBestColLeft = uBestColLeft; + Ranges[uIndex].m_uBestColRight = uBestColRight; + } + } + +// Return true if any changes made +bool RefineVert(MSA &msaIn, const Tree &tree, unsigned uIters) + { + bool bAnyChanges = false; + + const unsigned uColCountIn = msaIn.GetColCount(); + const unsigned uSeqCountIn = msaIn.GetSeqCount(); + + if (uColCountIn < 3 || uSeqCountIn < 3) + return false; + + unsigned *AnchorCols = new unsigned[uColCountIn]; + unsigned uAnchorColCount; + SetMSAWeightsMuscle(msaIn); + FindAnchorCols(msaIn, AnchorCols, &uAnchorColCount); + + const unsigned uRangeCount = uAnchorColCount + 1; + Range *Ranges = new Range[uRangeCount]; + +#if TRACE + Log("%u ranges\n", uRangeCount); +#endif + + ColsToRanges(AnchorCols, uAnchorColCount, uColCountIn, Ranges); + ListVertSavings(uColCountIn, uAnchorColCount, Ranges, uRangeCount); + +#if TRACE + { + Log("Anchor cols: "); + for (unsigned i = 0; i < uAnchorColCount; ++i) + Log(" %u", AnchorCols[i]); + Log("\n"); + + Log("Ranges:\n"); + for (unsigned i = 0; i < uRangeCount; ++i) + Log("%4u - %4u\n", Ranges[i].m_uBestColLeft, Ranges[i].m_uBestColRight); + } +#endif + + delete[] AnchorCols; + + MSA msaOut; + msaOut.SetSize(uSeqCountIn, 0); + + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCountIn; ++uSeqIndex) + { + const char *ptrName = msaIn.GetSeqName(uSeqIndex); + unsigned uId = msaIn.GetSeqId(uSeqIndex); + msaOut.SetSeqName(uSeqIndex, ptrName); + msaOut.SetSeqId(uSeqIndex, uId); + } + + for (unsigned uRangeIndex = 0; uRangeIndex < uRangeCount; ++uRangeIndex) + { + MSA msaRange; + + const Range &r = Ranges[uRangeIndex]; + + const unsigned uFromColIndex = r.m_uBestColLeft; + const unsigned 
uRangeColCount = r.m_uBestColRight - uFromColIndex; + + if (0 == uRangeColCount) + continue; + else if (1 == uRangeColCount) + { + MSAFromColRange(msaIn, uFromColIndex, 1, msaRange); + MSAAppend(msaOut, msaRange); + continue; + } + MSAFromColRange(msaIn, uFromColIndex, uRangeColCount, msaRange); + +#if TRACE + Log("\n-------------\n"); + Log("Range %u - %u count=%u\n", r.m_uBestColLeft, r.m_uBestColRight, uRangeColCount); + Log("Before:\n"); + msaRange.LogMe(); +#endif + + bool bLockLeft = (0 != uRangeIndex); + bool bLockRight = (uRangeCount - 1 != uRangeIndex); + bool bAnyChangesThisBlock = RefineHoriz(msaRange, tree, uIters, bLockLeft, bLockRight); + bAnyChanges = (bAnyChanges || bAnyChangesThisBlock); + +#if TRACE + Log("After:\n"); + msaRange.LogMe(); +#endif + + MSAAppend(msaOut, msaRange); + +#if TRACE + Log("msaOut after Cat:\n"); + msaOut.LogMe(); +#endif + } + +#if DEBUG +// Sanity check + AssertMSAEqIgnoreCaseAndGaps(msaIn, msaOut); +#endif + + delete[] Ranges; + if (bAnyChanges) + msaIn.Copy(msaOut); + return bAnyChanges; + } diff --git a/src/muscle/muscle3.8.31/src/refinew.cpp b/src/muscle/muscle3.8.31/src/refinew.cpp new file mode 100644 index 0000000..671e591 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/refinew.cpp @@ -0,0 +1,227 @@ +#include "muscle.h" +#include "msa.h" +#include "seqvect.h" +#include "textfile.h" + +#define MEMDEBUG 0 + +#if MEMDEBUG +#include +#endif + +void MUSCLE(SeqVect &v, MSA &msaOut); + +// Append msa2 at the end of msa1 +void AppendMSA(MSA &msa1, const MSA &msa2) + { + const unsigned uSeqCount = msa1.GetSeqCount(); + + const unsigned uColCount1 = msa1.GetColCount(); + const unsigned uColCount2 = msa2.GetColCount(); + + const unsigned uColCountCat = uColCount1 + uColCount2; + + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + unsigned uId = msa1.GetSeqId(uSeqIndex); + unsigned uSeqIndex2; + bool bFound = msa2.GetSeqIndex(uId, &uSeqIndex2); + if (bFound) + { + for (unsigned uColIndex = 0; uColIndex < 
uColCount2; ++uColIndex) + { + const char c = msa2.GetChar(uSeqIndex2, uColIndex); + msa1.SetChar(uSeqIndex, uColCount1 + uColIndex, c); + } + } + else + { + for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) + msa1.SetChar(uSeqIndex, uColCount1 + uColIndex, '-'); + } + } + } + +static void SeqFromMSACols(const MSA &msa, unsigned uSeqIndex, unsigned uColFrom, + unsigned uColTo, Seq &s) + { + s.Clear(); + s.SetName(msa.GetSeqName(uSeqIndex)); + s.SetId(msa.GetSeqId(uSeqIndex)); + for (unsigned uColIndex = uColFrom; uColIndex <= uColTo; ++uColIndex) + { + char c = msa.GetChar(uSeqIndex, uColIndex); + if (!IsGapChar(c)) + s.AppendChar(c); + } + } + +static void SeqVectFromMSACols(const MSA &msa, unsigned uColFrom, unsigned uColTo, + SeqVect &v) + { + v.Clear(); + const unsigned uSeqCount = msa.GetSeqCount(); + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + Seq s; + SeqFromMSACols(msa, uSeqIndex, uColFrom, uColTo, s); + v.AppendSeq(s); + } + } + +void RefineW(const MSA &msaIn, MSA &msaOut) + { + const unsigned uSeqCount = msaIn.GetSeqCount(); + const unsigned uColCount = msaIn.GetColCount(); + +// Reserve same nr seqs, 20% more cols + const unsigned uReserveColCount = (uColCount*120)/100; + msaOut.SetSize(uSeqCount, uReserveColCount); + + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + msaOut.SetSeqName(uSeqIndex, msaIn.GetSeqName(uSeqIndex)); + msaOut.SetSeqId(uSeqIndex, msaIn.GetSeqId(uSeqIndex)); + } + + const unsigned uWindowCount = (uColCount + g_uRefineWindow - 1)/g_uRefineWindow; + if (0 == g_uWindowTo) + g_uWindowTo = uWindowCount - 1; + +#if MEMDEBUG + _CrtSetBreakAlloc(1560); +#endif + + if (g_uWindowOffset > 0) + { + MSA msaTmp; + MSAFromColRange(msaIn, 0, g_uWindowOffset, msaOut); + } + + fprintf(stderr, "\n"); + for (unsigned uWindowIndex = g_uWindowFrom; uWindowIndex <= g_uWindowTo; ++uWindowIndex) + { + fprintf(stderr, "Window %d of %d \r", uWindowIndex, uWindowCount); + const unsigned 
uColFrom = g_uWindowOffset + uWindowIndex*g_uRefineWindow; + unsigned uColTo = uColFrom + g_uRefineWindow - 1; + if (uColTo >= uColCount) + uColTo = uColCount - 1; + assert(uColTo >= uColFrom); + + SeqVect v; + SeqVectFromMSACols(msaIn, uColFrom, uColTo, v); + +#if MEMDEBUG + _CrtMemState s1; + _CrtMemCheckpoint(&s1); +#endif + + MSA msaTmp; + MUSCLE(v, msaTmp); + AppendMSA(msaOut, msaTmp); + if (uWindowIndex == g_uSaveWindow) + { + MSA msaInTmp; + unsigned uOutCols = msaOut.GetColCount(); + unsigned un = uColTo - uColFrom + 1; + MSAFromColRange(msaIn, uColFrom, un, msaInTmp); + + char fn[256]; + sprintf(fn, "win%d_inaln.tmp", uWindowIndex); + TextFile fIn(fn, true); + msaInTmp.ToFile(fIn); + + sprintf(fn, "win%d_inseqs.tmp", uWindowIndex); + TextFile fv(fn, true); + v.ToFile(fv); + + sprintf(fn, "win%d_outaln.tmp", uWindowIndex); + TextFile fOut(fn, true); + msaTmp.ToFile(fOut); + } + +#if MEMDEBUG + void FreeDPMemSPN(); + FreeDPMemSPN(); + + _CrtMemState s2; + _CrtMemCheckpoint(&s2); + + _CrtMemState s; + _CrtMemDifference(&s, &s1, &s2); + + _CrtMemDumpStatistics(&s); + _CrtMemDumpAllObjectsSince(&s1); + exit(1); +#endif +//#if DEBUG +// AssertMSAEqIgnoreCaseAndGaps(msaInTmp, msaTmp); +//#endif + } + fprintf(stderr, "\n"); + +// AssertMSAEqIgnoreCaseAndGaps(msaIn, msaOut);//@@uncomment! + } + +void DoRefineW() + { + SetOutputFileName(g_pstrOutFileName); + SetInputFileName(g_pstrInFileName); + SetStartTime(); + + SetMaxIters(g_uMaxIters); + SetSeqWeightMethod(g_SeqWeight1); + + TextFile fileIn(g_pstrInFileName); + MSA msa; + msa.FromFile(fileIn); + + const unsigned uSeqCount = msa.GetSeqCount(); + if (0 == uSeqCount) + Quit("No sequences in input file"); + + MSA::SetIdCount(uSeqCount); + +// Initialize sequence ids. +// From this point on, ids must somehow propogate from here. 
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + msa.SetSeqId(uSeqIndex, uSeqIndex); + SetMuscleInputMSA(msa); + + ALPHA Alpha = ALPHA_Undefined; + switch (g_SeqType) + { + case SEQTYPE_Auto: + Alpha = msa.GuessAlpha(); + break; + + case SEQTYPE_Protein: + Alpha = ALPHA_Amino; + break; + + case SEQTYPE_DNA: + Alpha = ALPHA_DNA; + break; + + case SEQTYPE_RNA: + Alpha = ALPHA_RNA; + break; + + default: + Quit("Invalid SeqType"); + } + SetAlpha(Alpha); + msa.FixAlpha(); + + if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha) + SetPPScore(PPSCORE_SPN); + + MSA msaOut; + RefineW(msa, msaOut); + +// ValidateMuscleIds(msa); + +// TextFile fileOut(g_pstrOutFileName, true); +// msaOut.ToFile(fileOut); + MuscleOutput(msaOut); + } diff --git a/src/muscle/muscle3.8.31/src/releases.txt b/src/muscle/muscle3.8.31/src/releases.txt new file mode 100644 index 0000000..cd6e5cd --- /dev/null +++ b/src/muscle/muscle3.8.31/src/releases.txt @@ -0,0 +1,13 @@ +ver=2.01 rev=1 +ver=2.10 rev=3 +ver=3.00 rev=5 +ver=3.20 rev=7 +ver=3.30 rev=9 +ver=3.41 rev=11 +ver=3.40 rev=12 +ver=3.51 rev=14 +ver=3.52 rev=16 +ver=3.50 rev=17 +ver=3.60 rev=19 +ver=3.70 rev=21 +ver=3.80 rev=22 diff --git a/src/muscle/muscle3.8.31/src/savebest.cpp b/src/muscle/muscle3.8.31/src/savebest.cpp new file mode 100644 index 0000000..24757f2 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/savebest.cpp @@ -0,0 +1,66 @@ +#include "muscle.h" +#include "msa.h" +#include "textfile.h" +#include + +MSA *ptrBestMSA; +static const char *pstrOutputFileName; + +void SetOutputFileName(const char *out) + { + pstrOutputFileName = out; + } + +void SetCurrentAlignment(MSA &msa) + { + ptrBestMSA = &msa; + } + +void SaveCurrentAlignment() + { + static bool bCalled = false; + if (bCalled) + { + fprintf(stderr, + "\nRecursive call to SaveCurrentAlignment, giving up attempt to save.\n"); + exit(EXIT_FatalError); + } + + if (0 == ptrBestMSA) + { + fprintf(stderr, "\nAlignment not completed, cannot save.\n"); + Log("Alignment not 
completed, cannot save.\n"); + exit(EXIT_FatalError); + } + + if (0 == pstrOutputFileName) + { + fprintf(stderr, "\nOutput file name not specified, cannot save.\n"); + exit(EXIT_FatalError); + } + + fprintf(stderr, "\nSaving current alignment ...\n"); + + TextFile fileOut(pstrOutputFileName, true); + ptrBestMSA->ToFASTAFile(fileOut); + + fprintf(stderr, "Current alignment saved to \"%s\".\n", pstrOutputFileName); + Log("Current alignment saved to \"%s\".\n", pstrOutputFileName); + } + +void CheckMaxTime() + { + if (0 == g_ulMaxSecs) + return; + + time_t Now = time(0); + time_t ElapsedSecs = Now - GetStartTime(); + if (ElapsedSecs <= (time_t) g_ulMaxSecs) + return; + + Log("Max time %s exceeded, elapsed seconds = %lu\n", + MaxSecsToStr(), (unsigned long) ElapsedSecs); /* was "%ul" (i.e. %u plus a literal 'l') fed a time_t; cast so the argument matches %lu */ + + SaveCurrentAlignment(); + exit(EXIT_Success); + } diff --git a/src/muscle/muscle3.8.31/src/scoredist.cpp b/src/muscle/muscle3.8.31/src/scoredist.cpp new file mode 100644 index 0000000..f065a22 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/scoredist.cpp @@ -0,0 +1,128 @@ +#include +#include +#include "muscle.h" +#include "msa.h" +#include "distfunc.h" +#include "msa.h" +#include "seqvect.h" +#include "pwpath.h" + +// ScoreDist +// E. Sonnhammer & V. Hollich, Scoredist: A simple and robust protein sequence +// distance estimator, BMC Bioinformatics 2005, 6:108. 
+ +extern int BLOSUM62[20][20]; +extern double BLOSUM62_Expected; + +static const double Dayhoff_CalibrationFactor = 1.3370; +static const double JTT_CalibrationFactor = 1.2873; +static const double MV_CalibrationFactor = 1.1775; +static const double LARGE_D = 3.0; + +static double CalibrationFactor = JTT_CalibrationFactor; + + +// Similarity score +static double Sigma(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2, + unsigned *ptrLength) + { + unsigned Length = 0; + double Score = 0; + const unsigned ColCount = msa.GetColCount(); + for (unsigned ColIndex = 0; ColIndex < ColCount; ++ColIndex) + { + unsigned Letter1 = msa.GetLetterEx(SeqIndex1, ColIndex); + unsigned Letter2 = msa.GetLetterEx(SeqIndex2, ColIndex); + if (Letter1 >= 20 || Letter2 >= 20) + continue; + ++Length; + Score += BLOSUM62[Letter1][Letter2]; + } + + *ptrLength = Length; + return Score; + } + +// Normalized score +static double Sigma_N(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2) + { + unsigned Length = UINT_MAX; + double Score = Sigma(msa, SeqIndex1, SeqIndex2, &Length); + double RandomScore = Length*BLOSUM62_Expected; + return Score - RandomScore; + } + +// Upper limit +static double Sigma_U(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2, + unsigned *ptrLength) + { + double Score11 = Sigma(msa, SeqIndex1, SeqIndex1, ptrLength); + double Score22 = Sigma(msa, SeqIndex2, SeqIndex2, ptrLength); + return (Score11 + Score22)/2; + } + +// Normalized upper limit +static double Sigma_UN(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2) + { + unsigned Length = UINT_MAX; + double Score = Sigma_U(msa, SeqIndex1, SeqIndex2, &Length); + double RandomScore = Length*BLOSUM62_Expected; + return Score - RandomScore; + } + +double GetScoreDist(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2) + { + if (g_Alpha != ALPHA_Amino) + Quit("Scoredist is only for amino acid sequences"); + + double s_N = Sigma_N(msa, SeqIndex1, SeqIndex2); + double s_UN = Sigma_UN(msa, SeqIndex1, 
SeqIndex2); + double d = 0.0; + if (s_UN != 0) + { + double Ratio = s_N/s_UN; + if (Ratio < 0.001) + d = LARGE_D; + else + d = -log(Ratio); + } + return d*CalibrationFactor; + } + +void DistPWScoreDist(const SeqVect &v, DistFunc &DF) + { + SEQWEIGHT SeqWeightSave = GetSeqWeightMethod(); + SetSeqWeightMethod(SEQWEIGHT_Henikoff); + + const unsigned uSeqCount = v.Length(); + DF.SetCount(uSeqCount); + + const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2; + unsigned uCount = 0; + SetProgressDesc("PW ScoreDist"); + for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) + { + const Seq &s1 = v.GetSeq(uSeqIndex1); + MSA msa1; + msa1.FromSeq(s1); + for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqIndex1; ++uSeqIndex2) + { + if (0 == uCount%20) + Progress(uCount, uPairCount); + ++uCount; + const Seq &s2 = v.GetSeq(uSeqIndex2); + MSA msa2; + msa2.FromSeq(s2); + + PWPath Path; + MSA msaOut; + AlignTwoMSAs(msa1, msa2, msaOut, Path, false, false); + + float d = (float) GetScoreDist(msaOut, 0, 1); + DF.SetDist(uSeqIndex1, uSeqIndex2, d); + } + } + ProgressStepsDone(); + + SetSeqWeightMethod(SeqWeightSave); + } diff --git a/src/muscle/muscle3.8.31/src/scoregaps.cpp b/src/muscle/muscle3.8.31/src/scoregaps.cpp new file mode 100644 index 0000000..1c31a8b --- /dev/null +++ b/src/muscle/muscle3.8.31/src/scoregaps.cpp @@ -0,0 +1,201 @@ +#include "muscle.h" +#include "msa.h" +#include "objscore.h" + +#define TRACE 0 + +struct GAPINFO + { + GAPINFO *Next; + unsigned Start; + unsigned End; + }; + +static GAPINFO **g_Gaps; +static GAPINFO *g_FreeList; +static unsigned g_MaxSeqCount; +static unsigned g_MaxColCount; +static unsigned g_ColCount; +static bool *g_ColDiff; + +static GAPINFO *NewGapInfo() + { + if (0 == g_FreeList) + { + const int NEWCOUNT = 256; + GAPINFO *NewList = new GAPINFO[NEWCOUNT]; + g_FreeList = &NewList[0]; + for (int i = 0; i < NEWCOUNT-1; ++i) + NewList[i].Next = &NewList[i+1]; + NewList[NEWCOUNT-1].Next = 0; + } + GAPINFO *GI = g_FreeList; + 
g_FreeList = g_FreeList->Next; + return GI; + } + +static void FreeGapInfo(GAPINFO *GI) + { + GI->Next = g_FreeList; + g_FreeList = GI; + } + +// TODO: This could be much faster, no need to look +// at all columns. +static void FindIntersectingGaps(const MSA &msa, unsigned SeqIndex) + { + const unsigned ColCount = msa.GetColCount(); + bool InGap = false; + bool Intersects = false; + unsigned Start = uInsane; + for (unsigned Col = 0; Col <= ColCount; ++Col) + { + bool Gap = ((Col != ColCount) && msa.IsGap(SeqIndex, Col)); + if (Gap) + { + if (!InGap) + { + InGap = true; + Start = Col; + } + if (g_ColDiff[Col]) + Intersects = true; + } + else if (InGap) + { + InGap = false; + if (Intersects) + { + GAPINFO *GI = NewGapInfo(); + GI->Start = Start; + GI->End = Col - 1; + GI->Next = g_Gaps[SeqIndex]; + g_Gaps[SeqIndex] = GI; + } + Intersects = false; + } + } + } + +static SCORE Penalty(unsigned Length, bool Term) + { + if (0 == Length) + return 0; + SCORE s1 = g_scoreGapOpen + g_scoreGapExtend*(Length - 1); +#if DOUBLE_AFFINE + SCORE s2 = g_scoreGapOpen2 + g_scoreGapExtend2*(Length - 1); + if (s1 > s2) + return s1; + return s2; +#else + return s1; +#endif + } + +//static SCORE ScorePair(unsigned Seq1, unsigned Seq2) +// { +//#if TRACE +// { +// Log("ScorePair(%d,%d)\n", Seq1, Seq2); +// Log("Gaps seq 1: "); +// for (GAPINFO *GI = g_Gaps[Seq1]; GI; GI = GI->Next) +// Log(" %d-%d", GI->Start, GI->End); +// Log("\n"); +// Log("Gaps seq 2: "); +// for (GAPINFO *GI = g_Gaps[Seq2]; GI; GI = GI->Next) +// Log(" %d-%d", GI->Start, GI->End); +// Log("\n"); +// } +//#endif +// return 0; +// } + +SCORE ScoreGaps(const MSA &msa, const unsigned DiffCols[], unsigned DiffColCount) + { +#if TRACE + { + Log("ScoreGaps\n"); + Log("DiffCols "); + for (unsigned i = 0; i < DiffColCount; ++i) + Log(" %u", DiffCols[i]); + Log("\n"); + Log("msa=\n"); + msa.LogMe(); + Log("\n"); + } +#endif + const unsigned SeqCount = msa.GetSeqCount(); + const unsigned ColCount = msa.GetColCount(); + g_ColCount 
= ColCount; + + if (SeqCount > g_MaxSeqCount) + { + delete[] g_Gaps; + g_MaxSeqCount = SeqCount + 256; + g_Gaps = new GAPINFO *[g_MaxSeqCount]; + } + memset(g_Gaps, 0, SeqCount*sizeof(GAPINFO *)); + + if (ColCount > g_MaxColCount) + { + delete[] g_ColDiff; + g_MaxColCount = ColCount + 256; + g_ColDiff = new bool[g_MaxColCount]; + } + + memset(g_ColDiff, 0, g_ColCount*sizeof(bool)); + for (unsigned i = 0; i < DiffColCount; ++i) + { + unsigned Col = DiffCols[i]; + assert(Col < ColCount); + g_ColDiff[Col] = true; + } + + for (unsigned SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) + FindIntersectingGaps(msa, SeqIndex); + +#if TRACE + { + Log("\n"); + Log("Intersecting gaps:\n"); + Log(" "); + for (unsigned Col = 0; Col < ColCount; ++Col) + Log("%c", g_ColDiff[Col] ? '*' : ' '); + Log("\n"); + Log(" "); + for (unsigned Col = 0; Col < ColCount; ++Col) + Log("%d", Col%10); + Log("\n"); + for (unsigned Seq = 0; Seq < SeqCount; ++Seq) + { + Log("%3d: ", Seq); + for (unsigned Col = 0; Col < ColCount; ++Col) + Log("%c", msa.GetChar(Seq, Col)); + Log(" :: "); + for (GAPINFO *GI = g_Gaps[Seq]; GI; GI = GI->Next) + Log(" (%d,%d)", GI->Start, GI->End); + Log(" >%s\n", msa.GetSeqName(Seq)); + } + Log("\n"); + } +#endif + + SCORE Score = 0; + for (unsigned Seq1 = 0; Seq1 < SeqCount; ++Seq1) + { + const WEIGHT w1 = msa.GetSeqWeight(Seq1); + for (unsigned Seq2 = Seq1 + 1; Seq2 < SeqCount; ++Seq2) + { + const WEIGHT w2 = msa.GetSeqWeight(Seq2); +// const SCORE Pair = ScorePair(Seq1, Seq2); + const SCORE Pair = ScoreSeqPairGaps(msa, Seq1, msa, Seq2); + Score += w1*w2*Pair; +#if TRACE + Log("Seq1=%u Seq2=%u ScorePair=%.4g w1=%.4g w2=%.4g Sum=%.4g\n", + Seq1, Seq2, Pair, w1, w2, Score); +#endif + } + } + + return Score; + } diff --git a/src/muscle/muscle3.8.31/src/scorehistory.cpp b/src/muscle/muscle3.8.31/src/scorehistory.cpp new file mode 100644 index 0000000..7345a45 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/scorehistory.cpp @@ -0,0 +1,101 @@ +#include "muscle.h" +#include 
"scorehistory.h" +#include + +#define TRACE 0 + +ScoreHistory::ScoreHistory(unsigned uIters, unsigned uNodeCount) + { + m_uNodeCount = uNodeCount; + m_uIters = uIters; + + m_Score = new SCORE *[uIters]; + m_bScoreSet = new bool *[uIters]; + for (unsigned n = 0; n < uIters; ++n) + { + m_Score[n] = new SCORE[uNodeCount*2]; + m_bScoreSet[n] = new bool[uNodeCount*2]; + memset(m_bScoreSet[n], 0, uNodeCount*2*sizeof(bool)); + } + } + +ScoreHistory::~ScoreHistory() + { + for (unsigned n = 0; n < m_uIters; ++n) + { + delete[] m_Score[n]; + delete[] m_bScoreSet[n]; + } + delete[] m_Score; + delete[] m_bScoreSet; + } + +bool ScoreHistory::SetScore(unsigned uIter, unsigned uNodeIndex, bool bRight, SCORE Score) + { +#if TRACE + Log("ScoreHistory::SetScore(Iter=%u Node=%u Right=%d Score=%g)\n", + uIter, uNodeIndex, bRight, Score); +#endif + if (uIter >= m_uIters) + Quit("ScoreHistory::SetScore-1"); + if (uNodeIndex >= m_uNodeCount) + Quit("ScoreHistory::SetScore-2"); + + const unsigned uIndex = uNodeIndex*2 + bRight; + for (unsigned n = 1; n < uIter; ++n) + { + const unsigned uPrevIter = n - 1; + if (!m_bScoreSet[uPrevIter][uIndex]) + { + LogMe(); + Quit("ScoreHistory::SetScore-3"); + } + if (m_Score[uPrevIter][uIndex] == Score) + { + ProgressStepsDone(); +#if TRACE + Log("Oscillating\n"); +#endif + return true; + } + } + m_Score[uIter][uIndex] = Score; + m_bScoreSet[uIter][uIndex] = true; + return false; + } + +void ScoreHistory::LogMe() const + { + Log("ScoreHistory\n"); + Log("Iter Node Right Score\n"); + Log("---- ---- ----- ---------\n"); + for (unsigned uIter = 0; uIter < m_uIters; ++uIter) + { + bool bAnySet = false; + for (unsigned n = 0; n < m_uNodeCount*2; ++n) + if (m_bScoreSet[uIter][n]) + { + bAnySet = true; + break; + } + if (!bAnySet) + return; + for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex) + { + const unsigned uBase = 2*uNodeIndex; + if (m_bScoreSet[uIter][uBase]) + Log("%4u %4u F %9.3f\n", uIter, uNodeIndex, m_Score[uIter][uBase]); + 
if (m_bScoreSet[uIter][uBase+1]) + Log("%4u %4u T %9.3f\n", uIter, uNodeIndex, m_Score[uIter][uBase+1]); + } + } + } + +SCORE ScoreHistory::GetScore(unsigned uIter, unsigned uNodeIndex, + bool bReverse, bool bRight) const + { + const unsigned uIndex = uNodeIndex*2 + bRight; + if (!m_bScoreSet[uIter][uIndex]) + Quit("ScoreHistory::GetScore"); + return m_Score[uIter][uIndex]; + } diff --git a/src/muscle/muscle3.8.31/src/scorehistory.h b/src/muscle/muscle3.8.31/src/scorehistory.h new file mode 100644 index 0000000..d375ff6 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/scorehistory.h @@ -0,0 +1,21 @@ +#ifndef ScoreHistory_h +#define ScoreHistory_h + +class ScoreHistory + { +public: + ScoreHistory(unsigned uIters, unsigned uInternalNodeCount); + ~ScoreHistory(); + bool SetScore(unsigned uIter, unsigned uInternalNodeIndex, bool bRight, SCORE Score); + void LogMe() const; + SCORE GetScore(unsigned uIter, unsigned uInternalNodeIndex, bool bReversed, + bool bRight) const; + +private: + SCORE **m_Score; + bool **m_bScoreSet; + unsigned m_uIters; + unsigned m_uNodeCount; + }; + +#endif // ScoreHistory_h diff --git a/src/muscle/muscle3.8.31/src/scoremx.cpp b/src/muscle/muscle3.8.31/src/scoremx.cpp new file mode 100644 index 0000000..ca36c4d --- /dev/null +++ b/src/muscle/muscle3.8.31/src/scoremx.cpp @@ -0,0 +1,45 @@ +#include "muscle.h" +#include "profile.h" + +extern SCOREMATRIX VTML_LA; +extern SCOREMATRIX PAM200; +extern SCOREMATRIX PAM200NoCenter; +extern SCOREMATRIX VTML_SP; +extern SCOREMATRIX VTML_SPNoCenter; +extern SCOREMATRIX NUC_SP; + +PTR_SCOREMATRIX g_ptrScoreMatrix; + +void SetScoreMatrix() + { + switch (g_PPScore) + { + case PPSCORE_LE: + g_ptrScoreMatrix = &VTML_LA; + break; + + case PPSCORE_SP: + if (g_bPrecompiledCenter) + g_ptrScoreMatrix = &PAM200; + else + g_ptrScoreMatrix = &PAM200NoCenter; + break; + + case PPSCORE_SV: + if (g_bPrecompiledCenter) + g_ptrScoreMatrix = &VTML_SP; + else + g_ptrScoreMatrix = &VTML_SPNoCenter; + break; + + case PPSCORE_SPN: 
+ if (g_bPrecompiledCenter) + g_ptrScoreMatrix = &NUC_SP; + else + Quit("SPN requires precompiled center"); + break; + + default: + Quit("Invalid g_PPScore"); + } + } diff --git a/src/muscle/muscle3.8.31/src/scorepp.cpp b/src/muscle/muscle3.8.31/src/scorepp.cpp new file mode 100644 index 0000000..541e7ff --- /dev/null +++ b/src/muscle/muscle3.8.31/src/scorepp.cpp @@ -0,0 +1,104 @@ +#include "muscle.h" +#include "profile.h" + +char ConsensusChar(const ProfPos &PP) + { + unsigned uMostCommonLetter = 0; + FCOUNT fcMostCommon = PP.m_fcCounts[0]; + bool bMoreThanOneLetter = false; + bool bAnyLetter = false; + for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter) + { + const FCOUNT fc = PP.m_fcCounts[uLetter]; + if (fc > 0) + { + if (bAnyLetter) + bMoreThanOneLetter = true; + bAnyLetter = true; + } + if (fc > fcMostCommon) + { + uMostCommonLetter = uLetter; + fcMostCommon = fc; + } + } + if (!bAnyLetter) + return '-'; + char c = LetterToChar(uMostCommonLetter); + if (bMoreThanOneLetter) + return UnalignChar(c); + return c; + } + +SCORE ScoreProfPos2LA(const ProfPos &PPA, const ProfPos &PPB) + { + SCORE Score = 0; + for (unsigned n = 0; n < 20; ++n) + { + const unsigned uLetter = PPA.m_uSortOrder[n]; + const FCOUNT fcLetter = PPA.m_fcCounts[uLetter]; + if (0 == fcLetter) + break; + Score += fcLetter*PPB.m_AAScores[uLetter]; + } + if (0 == Score) + return -2.5; + SCORE logScore = logf(Score); + return (SCORE) ((logScore - g_scoreCenter)*(PPA.m_fOcc * PPB.m_fOcc)); + } + +SCORE ScoreProfPos2NS(const ProfPos &PPA, const ProfPos &PPB) + { + SCORE Score = 0; + for (unsigned n = 0; n < 20; ++n) + { + const unsigned uLetter = PPA.m_uSortOrder[n]; + const FCOUNT fcLetter = PPA.m_fcCounts[uLetter]; + if (0 == fcLetter) + break; + Score += fcLetter*PPB.m_AAScores[uLetter]; + } + return Score - g_scoreCenter; + } + +SCORE ScoreProfPos2SP(const ProfPos &PPA, const ProfPos &PPB) + { + SCORE Score = 0; + for (unsigned n = 0; n < 20; ++n) + { + const unsigned uLetter = 
PPA.m_uSortOrder[n]; + const FCOUNT fcLetter = PPA.m_fcCounts[uLetter]; + if (0 == fcLetter) + break; + Score += fcLetter*PPB.m_AAScores[uLetter]; + } + return Score - g_scoreCenter; + } + +SCORE ScoreProfPos2SPN(const ProfPos &PPA, const ProfPos &PPB) + { + SCORE Score = 0; + for (unsigned n = 0; n < 4; ++n) + { + const unsigned uLetter = PPA.m_uSortOrder[n]; + const FCOUNT fcLetter = PPA.m_fcCounts[uLetter]; + if (0 == fcLetter) + break; + Score += fcLetter*PPB.m_AAScores[uLetter]; + } + return Score - g_scoreCenter; + } + +SCORE ScoreProfPos2(const ProfPos &PPA, const ProfPos &PPB) + { + if (PPSCORE_SP == g_PPScore) + return ScoreProfPos2NS(PPA, PPB); + else if (PPSCORE_LE == g_PPScore) + return ScoreProfPos2LA(PPA, PPB); + else if (PPSCORE_SV == g_PPScore) + return ScoreProfPos2SP(PPA, PPB); + else if (PPSCORE_SPN == g_PPScore) + return ScoreProfPos2SPN(PPA, PPB); + Quit("Invalid g_PPScore"); + return 0; + } diff --git a/src/muscle/muscle3.8.31/src/seq.cpp b/src/muscle/muscle3.8.31/src/seq.cpp new file mode 100644 index 0000000..e682758 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/seq.cpp @@ -0,0 +1,342 @@ +#include "muscle.h" +#include "seq.h" +#include "textfile.h" +#include "msa.h" +//#include + +const size_t MAX_FASTA_LINE = 16000; + +void Seq::SetName(const char *ptrName) + { + delete[] m_ptrName; + size_t n = strlen(ptrName) + 1; + m_ptrName = new char[n]; + strcpy(m_ptrName, ptrName); + } + +void Seq::ToFASTAFile(TextFile &File) const + { + File.PutFormat(">%s\n", m_ptrName); + unsigned uColCount = Length(); + for (unsigned n = 0; n < uColCount; ++n) + { + if (n > 0 && n%60 == 0) + File.PutString("\n"); + File.PutChar(at(n)); + } + File.PutString("\n"); + } + +// Return true on end-of-file +bool Seq::FromFASTAFile(TextFile &File) + { + Clear(); + + char szLine[MAX_FASTA_LINE]; + bool bEof = File.GetLine(szLine, sizeof(szLine)); + if (bEof) + return true; + if ('>' != szLine[0]) + Quit("Expecting '>' in FASTA file %s line %u", + File.GetFileName(), 
File.GetLineNr()); + + size_t n = strlen(szLine); + if (1 == n) + Quit("Missing annotation following '>' in FASTA file %s line %u", + File.GetFileName(), File.GetLineNr()); + + m_ptrName = new char[n]; + strcpy(m_ptrName, szLine + 1); + + TEXTFILEPOS Pos = File.GetPos(); + for (;;) + { + bEof = File.GetLine(szLine, sizeof(szLine)); + if (bEof) + { + if (0 == size()) + { + Quit("Empty sequence in FASTA file %s line %u", + File.GetFileName(), File.GetLineNr()); + return true; + } + return false; + } + if ('>' == szLine[0]) + { + if (0 == size()) + Quit("Empty sequence in FASTA file %s line %u", + File.GetFileName(), File.GetLineNr()); + // Rewind to beginning of this line, it's the start of the + // next sequence. + File.SetPos(Pos); + return false; + } + const char *ptrChar = szLine; + while (char c = *ptrChar++) + { + if (isspace(c)) + continue; + if (IsGapChar(c)) + continue; + if (!IsResidueChar(c)) + { + if (isprint(c)) + { + char w = GetWildcardChar(); + Warning("Invalid residue '%c' in FASTA file %s line %d, replaced by '%c'", + c, File.GetFileName(), File.GetLineNr(), w); + c = w; + } + else + Quit("Invalid byte hex %02x in FASTA file %s line %d", + (unsigned char) c, File.GetFileName(), File.GetLineNr()); + } + c = toupper(c); + push_back(c); + } + Pos = File.GetPos(); + } + } + +void Seq::ExtractUngapped(MSA &msa) const + { + msa.Clear(); + unsigned uColCount = Length(); + msa.SetSize(1, 1); + unsigned uUngappedPos = 0; + for (unsigned n = 0; n < uColCount; ++n) + { + char c = at(n); + if (!IsGapChar(c)) + msa.SetChar(0, uUngappedPos++, c); + } + msa.SetSeqName(0, m_ptrName); + } + +void Seq::Copy(const Seq &rhs) + { + clear(); + const unsigned uLength = rhs.Length(); + for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex) + push_back(rhs.at(uColIndex)); + const char *ptrName = rhs.GetName(); + size_t n = strlen(ptrName) + 1; + m_ptrName = new char[n]; + strcpy(m_ptrName, ptrName); + SetId(rhs.GetId()); + } + +void Seq::CopyReversed(const Seq &rhs) 
+ { + clear(); + const unsigned uLength = rhs.Length(); + const unsigned uBase = rhs.Length() - 1; + for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex) + push_back(rhs.at(uBase - uColIndex)); + const char *ptrName = rhs.GetName(); + size_t n = strlen(ptrName) + 1; + m_ptrName = new char[n]; + strcpy(m_ptrName, ptrName); + } + +void Seq::StripGaps() + { + for (CharVect::iterator p = begin(); p != end(); ) + { + char c = *p; + if (IsGapChar(c)) + erase(p); + else + ++p; + } + } + +void Seq::StripGapsAndWhitespace() + { + for (CharVect::iterator p = begin(); p != end(); ) + { + char c = *p; + if (isspace(c) || IsGapChar(c)) + erase(p); + else + ++p; + } + } + +void Seq::ToUpper() + { + for (CharVect::iterator p = begin(); p != end(); ++p) + { + char c = *p; + if (islower(c)) + *p = toupper(c); + } + } + +unsigned Seq::GetLetter(unsigned uIndex) const + { + assert(uIndex < Length()); + char c = operator[](uIndex); + return CharToLetter(c); + } + +bool Seq::EqIgnoreCase(const Seq &s) const + { + const unsigned n = Length(); + if (n != s.Length()) + return false; + for (unsigned i = 0; i < n; ++i) + { + const char c1 = at(i); + const char c2 = s.at(i); + if (IsGapChar(c1)) + { + if (!IsGapChar(c2)) + return false; + } + else + { + if (toupper(c1) != toupper(c2)) + return false; + } + } + return true; + } + +bool Seq::Eq(const Seq &s) const + { + const unsigned n = Length(); + if (n != s.Length()) + return false; + for (unsigned i = 0; i < n; ++i) + { + const char c1 = at(i); + const char c2 = s.at(i); + if (c1 != c2) + return false; + } + return true; + } + +bool Seq::EqIgnoreCaseAndGaps(const Seq &s) const + { + const unsigned uThisLength = Length(); + const unsigned uOtherLength = s.Length(); + + unsigned uThisPos = 0; + unsigned uOtherPos = 0; + + int cThis; + int cOther; + for (;;) + { + if (uThisPos == uThisLength && uOtherPos == uOtherLength) + break; + + // Set cThis to next non-gap character in this string + // or -1 if end-of-string. 
+ for (;;) + { + if (uThisPos == uThisLength) + { + cThis = -1; + break; + } + else + { + cThis = at(uThisPos); + ++uThisPos; + if (!IsGapChar(cThis)) + { + cThis = toupper(cThis); + break; + } + } + } + + // Set cOther to next non-gap character in s + // or -1 if end-of-string. + for (;;) + { + if (uOtherPos == uOtherLength) + { + cOther = -1; + break; + } + else + { + cOther = s.at(uOtherPos); + ++uOtherPos; + if (!IsGapChar(cOther)) + { + cOther = toupper(cOther); + break; + } + } + } + + // Compare characters are corresponding ungapped position + if (cThis != cOther) + return false; + } + return true; + } + +unsigned Seq::GetUngappedLength() const + { + unsigned uUngappedLength = 0; + for (CharVect::const_iterator p = begin(); p != end(); ++p) + { + char c = *p; + if (!IsGapChar(c)) + ++uUngappedLength; + } + return uUngappedLength; + } + +void Seq::LogMe() const + { + Log(">%s\n", m_ptrName); + const unsigned n = Length(); + for (unsigned i = 0; i < n; ++i) + Log("%c", at(i)); + Log("\n"); + } + +void Seq::FromString(const char *pstrSeq, const char *pstrName) + { + clear(); + const unsigned uLength = (unsigned) strlen(pstrSeq); + for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex) + push_back(pstrSeq[uColIndex]); + size_t n = strlen(pstrName) + 1; + m_ptrName = new char[n]; + strcpy(m_ptrName, pstrName); + } + +bool Seq::HasGap() const + { + for (CharVect::const_iterator p = begin(); p != end(); ++p) + { + char c = *p; + if (IsGapChar(c)) + return true; + } + return false; + } + +void Seq::FixAlpha() + { + for (CharVect::iterator p = begin(); p != end(); ++p) + { + char c = *p; + if (!IsResidueChar(c)) + { + char w = GetWildcardChar(); + // Warning("Invalid residue '%c', replaced by '%c'", c, w); + InvalidLetterWarning(c, w); + *p = w; + } + } + } diff --git a/src/muscle/muscle3.8.31/src/seq.h b/src/muscle/muscle3.8.31/src/seq.h new file mode 100644 index 0000000..2e8fb7d --- /dev/null +++ b/src/muscle/muscle3.8.31/src/seq.h @@ -0,0 +1,85 @@ 
+#ifndef Seq_h
+#define Seq_h
+
+#include <vector>
+
+class TextFile;
+class MSA;
+
+typedef std::vector<char> CharVect;
+
+// A named sequence. The characters of the sequence are the elements
+// of the CharVect base class; the name is a heap-allocated C string
+// owned by this object (released with delete[] in the destructor and
+// in Clear()).
+class Seq : public CharVect
+	{
+public:
+	Seq()
+		{
+		m_ptrName = 0;
+	// Mark the id as "not set" so that GetId() can detect a
+	// missing SetId() instead of reading an indeterminate value.
+		m_uId = uInsane;
+	// Start with moderate size to avoid
+	// thrashing the heap.
+		reserve(200);
+		}
+	virtual ~Seq()
+		{
+		delete[] m_ptrName;
+		}
+
+private:
+// Not implemented; prevent use of copy c'tor and assignment.
+	Seq(const Seq &);
+	Seq &operator=(const Seq &);
+
+public:
+// Remove all characters, free the name and mark the id as not set.
+	void Clear()
+		{
+		clear();
+		delete[] m_ptrName;
+		m_ptrName = 0;
+		m_uId = uInsane;
+		}
+// Returns the stored name pointer; 0 if no name has been set.
+	const char *GetName() const
+		{
+		return m_ptrName;
+		}
+// Returns the id; calls Quit() if the id is still uInsane (not set).
+	unsigned GetId() const
+		{
+		if (uInsane == m_uId)
+			Quit("Seq::GetId, id not set");
+		return m_uId;
+		}
+	void SetId(unsigned uId) { m_uId = uId; }
+
+	bool FromFASTAFile(TextFile &File);
+	void ToFASTAFile(TextFile &File) const;
+	void ExtractUngapped(MSA &msa) const;
+
+	void FromString(const char *pstrSeq, const char *pstrName);
+	void Copy(const Seq &rhs);
+	void CopyReversed(const Seq &rhs);
+	void StripGaps();
+	void StripGapsAndWhitespace();
+	void ToUpper();
+	void SetName(const char *ptrName);
+	unsigned GetLetter(unsigned uIndex) const;
+	unsigned Length() const { return (unsigned) size(); }
+	bool Eq(const Seq &s) const;
+	bool EqIgnoreCase(const Seq &s) const;
+	bool EqIgnoreCaseAndGaps(const Seq &s) const;
+	bool HasGap() const;
+	unsigned GetUngappedLength() const;
+	void LogMe() const;
+	char GetChar(unsigned uIndex) const { return operator[](uIndex); }
+	void SetChar(unsigned uIndex, char c) { operator[](uIndex) = c; }
+	void AppendChar(char c) { push_back(c); }
+	void FixAlpha();
+
+// Outside Windows builds at() is replaced by unchecked operator[].
+#ifndef _WIN32
+	reference at(size_type i) { return operator[](i); }
+	const_reference at(size_type i) const { return operator[](i); }
+#endif
+
+private:
+	char *m_ptrName;	// owned, allocated with new[]; 0 when unset
+	unsigned m_uId;		// uInsane when unset
+	};
+
+#endif	// Seq.h
diff --git a/src/muscle/muscle3.8.31/src/seqvect.cpp b/src/muscle/muscle3.8.31/src/seqvect.cpp
new file mode 100644
index 0000000..52b46fc
--- /dev/null
+++ 
b/src/muscle/muscle3.8.31/src/seqvect.cpp
@@ -0,0 +1,290 @@
+#include "muscle.h"
+#include "seqvect.h"
+#include "textfile.h"
+#include "msa.h"
+
+const size_t MAX_FASTA_LINE = 16000;
+
+// The vector owns the Seq objects it points to.
+SeqVect::~SeqVect()
+	{
+	Clear();
+	}
+
+// Delete every owned Seq and empty the vector, so that no dangling
+// pointers are left behind for a later push_back(), Length() or the
+// destructor's own Clear() call.
+void SeqVect::Clear()
+	{
+	for (size_t n = 0; n < size(); ++n)
+		delete (*this)[n];
+	clear();
+	}
+
+// Write all sequences, in order, using Seq::ToFASTAFile.
+void SeqVect::ToFASTAFile(TextFile &File) const
+	{
+	unsigned uSeqCount = Length();
+	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+		{
+		Seq *ptrSeq = at(uSeqIndex);
+		ptrSeq->ToFASTAFile(File);
+		}
+	}
+
+// Replace the contents with the records that GetFastaSeq() reads from
+// the file's stdio stream; reading stops when it returns a null pointer.
+void SeqVect::FromFASTAFile(TextFile &File)
+	{
+	Clear();
+
+	FILE *f = File.GetStdioFile();
+	for (;;)
+		{
+		char *Label;
+		unsigned uLength;
+		char *SeqData = GetFastaSeq(f, &uLength, &Label);
+		if (0 == SeqData)
+			return;
+		Seq *ptrSeq = new Seq;
+
+		for (unsigned i = 0; i < uLength; ++i)
+			{
+			char c = SeqData[i];
+			ptrSeq->push_back(c);
+			}
+
+		ptrSeq->SetName(Label);
+		push_back(ptrSeq);
+
+	// NOTE(review): assumes GetFastaSeq allocates both buffers
+	// with new[] -- confirm against its definition.
+		delete[] SeqData;
+		delete[] Label;
+		}
+	}
+
+// Copy the sequences into msa, one row each, padding rows shorter than
+// the longest sequence with '.' on the right. An empty vector clears msa.
+void SeqVect::PadToMSA(MSA &msa)
+	{
+	unsigned uSeqCount = Length();
+	if (0 == uSeqCount)
+		{
+		msa.Clear();
+		return;
+		}
+
+	unsigned uLongestSeqLength = 0;
+	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+		{
+		Seq *ptrSeq = at(uSeqIndex);
+		unsigned uColCount = ptrSeq->Length();
+		if (uColCount > uLongestSeqLength)
+			uLongestSeqLength = uColCount;
+		}
+	msa.SetSize(uSeqCount, uLongestSeqLength);
+	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+		{
+		Seq *ptrSeq = at(uSeqIndex);
+		msa.SetSeqName(uSeqIndex, ptrSeq->GetName());
+		unsigned uColCount = ptrSeq->Length();
+		unsigned uColIndex;
+		for (uColIndex = 0; uColIndex < uColCount; ++uColIndex)
+			{
+			char c = ptrSeq->at(uColIndex);
+			msa.SetChar(uSeqIndex, uColIndex, c);
+			}
+		while (uColIndex < uLongestSeqLength)
+			msa.SetChar(uSeqIndex, uColIndex++, '.');
+		}
+	}
+
+// Replace the contents with deep copies of the sequences in rhs.
+// The sequences previously held are deleted (Clear()), not merely
+// forgotten, because this vector owns them.
+void SeqVect::Copy(const SeqVect &rhs)
+	{
+	Clear();
+	unsigned uSeqCount = rhs.Length();
+	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+		{
+		Seq *ptrSeq = rhs.at(uSeqIndex);
+		Seq *ptrSeqCopy = new Seq;
+		ptrSeqCopy->Copy(*ptrSeq);
+		push_back(ptrSeqCopy);
+		}
+	}
+
+// Apply Seq::StripGaps to every sequence.
+void SeqVect::StripGaps()
+	{
+	unsigned uSeqCount = Length();
+	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+		{
+		Seq *ptrSeq = at(uSeqIndex);
+		ptrSeq->StripGaps();
+		}
+	}
+
+// Apply Seq::StripGapsAndWhitespace to every sequence.
+void SeqVect::StripGapsAndWhitespace()
+	{
+	unsigned uSeqCount = Length();
+	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+		{
+		Seq *ptrSeq = at(uSeqIndex);
+		ptrSeq->StripGapsAndWhitespace();
+		}
+	}
+
+// Apply Seq::ToUpper to every sequence.
+void SeqVect::ToUpper()
+	{
+	unsigned uSeqCount = Length();
+	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+		{
+		Seq *ptrSeq = at(uSeqIndex);
+		ptrSeq->ToUpper();
+		}
+	}
+
+// Look up a sequence by name using stricmp. On success stores the
+// index of the first match in *ptruIndex and returns true; otherwise
+// returns false and leaves *ptruIndex untouched.
+bool SeqVect::FindName(const char *ptrName, unsigned *ptruIndex) const
+	{
+	unsigned uSeqCount = Length();
+	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+		{
+		const Seq *ptrSeq = at(uSeqIndex);
+		if (0 == stricmp(ptrSeq->GetName(), ptrName))
+			{
+			*ptruIndex = uSeqIndex;
+			return true;
+			}
+		}
+	return false;
+	}
+
+// Append a copy of s; the copy is owned by this vector.
+void SeqVect::AppendSeq(const Seq &s)
+	{
+	Seq *ptrSeqCopy = new Seq;
+	ptrSeqCopy->Copy(s);
+	push_back(ptrSeqCopy);
+	}
+
+// Log every sequence with Seq::LogMe.
+void SeqVect::LogMe() const
+	{
+	unsigned uSeqCount = Length();
+	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+		{
+		const Seq *ptrSeq = at(uSeqIndex);
+		ptrSeq->LogMe();
+		}
+	}
+
+const char *SeqVect::GetSeqName(unsigned uSeqIndex) const
+	{
+	assert(uSeqIndex < size());
+	const Seq *ptrSeq = at(uSeqIndex);
+	return ptrSeq->GetName();
+	}
+
+unsigned SeqVect::GetSeqId(unsigned uSeqIndex) const
+	{
+	assert(uSeqIndex < size());
+	const Seq *ptrSeq = at(uSeqIndex);
+	return ptrSeq->GetId();
+	}
+
+// Return the id of the first sequence whose name equals Name
+// (case-sensitive strcmp); calls Quit() if there is none.
+unsigned SeqVect::GetSeqIdFromName(const char *Name) const
+	{
+	const unsigned uSeqCount = GetSeqCount();
+	for (unsigned i = 0; i < uSeqCount; ++i)
+		{
+		if (!strcmp(Name, GetSeqName(i)))
+			return GetSeqId(i);
+		}
+	Quit("SeqVect::GetSeqIdFromName(%s): not found", Name);
+	return 0;
+	}
+
+// Return the first sequence whose id equals uId; calls Quit() if
+// there is none.
+Seq &SeqVect::GetSeqById(unsigned uId)
+	{
+	const unsigned uSeqCount = GetSeqCount();
+	for (unsigned i = 0; i < uSeqCount; ++i)
+		{
+		if (GetSeqId(i) == uId)
+			return GetSeq(i);
+		}
+	Quit("SeqVect::GetSeqById(%u): not found", uId);
+// NOTE(review): only reached if Quit() returns, which it presumably
+// never does; the null reference merely satisfies the return type.
+	return (Seq &) *((Seq *) 0);
+	}
+
+unsigned SeqVect::GetSeqLength(unsigned uSeqIndex) const
+	{
+	assert(uSeqIndex < size());
+	const Seq *ptrSeq = at(uSeqIndex);
+	return ptrSeq->Length();
+	}
+
+Seq &SeqVect::GetSeq(unsigned uSeqIndex)
+	{
+	assert(uSeqIndex < size());
+	return *at(uSeqIndex);
+	}
+
+const Seq &SeqVect::GetSeq(unsigned uSeqIndex) const
+	{
+	assert(uSeqIndex < size());
+	return *at(uSeqIndex);
+	}
+
+void SeqVect::SetSeqId(unsigned uSeqIndex, unsigned uId)
+	{
+	assert(uSeqIndex < size());
+	Seq *ptrSeq = at(uSeqIndex);
+	return ptrSeq->SetId(uId);
+	}
+
+ALPHA SeqVect::GuessAlpha() const
+	{
+// If at least MIN_NUCLEO_PCT of the first CHAR_COUNT non-gap
+// letters belong to the nucleotide alphabet, guess nucleo.
+// Otherwise amino.
+	const unsigned CHAR_COUNT = 100;
+	const unsigned MIN_NUCLEO_PCT = 95;
+
+	const unsigned uSeqCount = GetSeqCount();
+	if (0 == uSeqCount)
+		return ALPHA_Amino;
+
+	unsigned uSeqIndex = 0;
+	unsigned uPos = 0;
+	unsigned uSeqLength = GetSeqLength(0);
+	unsigned uDNACount = 0;
+	unsigned uRNACount = 0;
+	unsigned uTotal = 0;
+	const Seq *ptrSeq = &GetSeq(0);
+	for (;;)
+		{
+	// Advance to the next sequence that still has characters left;
+	// leave both loops once all sequences are exhausted.
+		while (uPos >= uSeqLength)
+			{
+			++uSeqIndex;
+			if (uSeqIndex >= uSeqCount)
+				break;
+			ptrSeq = &GetSeq(uSeqIndex);
+			uSeqLength = ptrSeq->Length();
+			uPos = 0;
+			}
+		if (uSeqIndex >= uSeqCount)
+			break;
+		char c = ptrSeq->at(uPos++);
+		if (IsGapChar(c))
+			continue;
+		if (IsDNA(c))
+			++uDNACount;
+		if (IsRNA(c))
+			++uRNACount;
+		++uTotal;
+		if (uTotal >= CHAR_COUNT)
+			break;
+		}
+// DNA is tested first, so it wins when both percentages qualify.
+	if (uTotal != 0 && ((uDNACount*100)/uTotal) >= MIN_NUCLEO_PCT)
+		return ALPHA_DNA;
+	if (uTotal != 0 && ((uRNACount*100)/uTotal) >= MIN_NUCLEO_PCT)
+		return ALPHA_RNA;
+	return ALPHA_Amino;
+	}
+
+// Apply Seq::FixAlpha to every sequence, bracketed by the
+// invalid-letter warning reset and report calls.
+void SeqVect::FixAlpha()
+	{
+	ClearInvalidLetterWarning();
+	unsigned uSeqCount = Length();
+	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+		{
+		Seq *ptrSeq = at(uSeqIndex);
+		ptrSeq->FixAlpha();
+		}
+	ReportInvalidLetters();
+	}
diff --git a/src/muscle/muscle3.8.31/src/seqvect.h b/src/muscle/muscle3.8.31/src/seqvect.h
new file mode 100644
index 0000000..fb20993
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/seqvect.h
@@ -0,0 +1,63 @@
+#ifndef SeqVect_h
+#define SeqVect_h
+
+#include <vector>
+#include "seq.h"
+
+typedef std::vector<Seq *> SeqVectBase;
+
+// A vector of heap-allocated sequences. The vector owns the Seq
+// objects: they are deleted by Clear() and by the destructor.
+class SeqVect : public SeqVectBase
+	{
+public:
+	SeqVect() {}
+	virtual ~SeqVect();
+
+private:
+// Not implemented; prevent use of copy c'tor and assignment.
+	SeqVect(const SeqVect &);
+	SeqVect &operator=(const SeqVect &);
+
+public:
+	void FromFile(TextFile &File)
+		{
+		FromFASTAFile(File);
+		}
+
+	void FromFASTAFile(TextFile &File);
+	void ToFASTAFile(TextFile &File) const;
+
+	void ToFile(TextFile &File) const
+		{
+		ToFASTAFile(File);
+		}
+
+	void PadToMSA(MSA &msa);
+	void Copy(const SeqVect &rhs);
+	void StripGaps();
+	void StripGapsAndWhitespace();
+	void ToUpper();
+	void Clear();
+	unsigned Length() const { return (unsigned) size(); }
+	unsigned GetSeqCount() const { return (unsigned) size(); }
+	void AppendSeq(const Seq &s);
+	bool FindName(const char *ptrName, unsigned *ptruIndex) const;
+	void LogMe() const;
+	const char *GetSeqName(unsigned uSeqIndex) const;
+	unsigned GetSeqId(unsigned uSeqIndex) const;
+	unsigned GetSeqIdFromName(const char *Name) const;
+	unsigned GetSeqLength(unsigned uSeqIndex) const;
+	void SetSeqId(unsigned uSeqIndex, unsigned uId);
+	Seq &GetSeq(unsigned uIndex);
+	Seq &GetSeqById(unsigned uId);
+	const Seq &GetSeq(unsigned uIndex) const;
+
+	ALPHA GuessAlpha() const;
+	void FixAlpha();
+
+// Outside Windows builds at() is replaced by unchecked operator[].
+#ifndef _WIN32
+	reference at(size_type i) { return operator[](i); }
+	const_reference at(size_type i) const { return operator[](i); }
+#endif
+	};
+
+#endif	// SeqVect_h
diff --git a/src/muscle/muscle3.8.31/src/setblosumweights.cpp b/src/muscle/muscle3.8.31/src/setblosumweights.cpp
new file mode 100644
index 0000000..404f3c2
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/setblosumweights.cpp
@@ -0,0 +1,131 @@
+/***
+Code for implementing HMMer's "BLOSUM weighting" algorithm.
+
+The algorithm was deduced by reverse-engineering the HMMer code.
+
+The HMMer documentation refers to BLOSUM weighting as "Henikoff
+simple filter weighting"
+
+The name BLOSUM implied to me that HMMer would be using a
+substitution probability matrix to compute distances, but this
+turned out not to be the case. 
+
+It is notable, not to say puzzling, that the HMMer BLOSUM weighting
+algorithm is guaranteed to produce an integral NIC (number-of-independent-
+counts, also known as effective sequence count). Presumably Eddy must
+have known this, though he doesn't comment on it and he computes & stores
+the value in a float.
+
+Here's the algorithm:
+
+Distances between two sequences are based on the average of a simple
+binary equal (one) / not equal (zero) at each position. The only thing
+that has anything to do with BLOSUM in this calculation is an obscure
+(to me) constant value of 0.62. The sequences are clustered using this
+distance. If the pairwise identity (fraction of identical positions)
+is less than 0.62, they get assigned to disjoint clusters; the final
+number of disjoint clusters is the NIC. This makes some intuitive sense:
+I would interpret this by saying that if a set of sequences are close
+enough they count as one sequence. The weight for each sequence within a
+disjoint cluster is then determined to be 1 / (clustersize), from which it
+follows that the sum of all weights is equal to the number of disjoint
+clusters and is thus guaranteed to be an integer value. It is exactly this
+sum that HMMer uses for the NIC, by default.
+
+The individual BLOSUM sequence weights are not used for anything else in
+HMMer, unless you specify that BLOSUM weighting should override the default
+GSC weighting. GSC weighting uses a different clustering algorithm to
+determine relative weights. The BLOSUM NIC is then distributed over the
+GSC tree according to those relative weights.
+***/
+
+#include "muscle.h"
+#include "msa.h"
+#include "cluster.h"
+#include "distfunc.h"
+
+// Set weights of all sequences in the subtree under given node.
+void MSA::SetBLOSUMSubtreeWeight(const ClusterNode *ptrNode, double dWeight) const
+	{
+// Walks the subtree rooted at ptrNode and stores dWeight, converted
+// with DoubleToWeight, in m_Weights for every leaf found; a null
+// node is ignored.
+	if (ptrNode == 0)
+		return;
+
+	const ClusterNode *pRightChild = ptrNode->GetRight();
+	const ClusterNode *pLeftChild = ptrNode->GetLeft();
+	const bool bIsLeaf = (pRightChild == 0 && pLeftChild == 0);
+
+// Internal node: descend into the left child, then the right child.
+	if (!bIsLeaf)
+		{
+		SetBLOSUMSubtreeWeight(pLeftChild, dWeight);
+		SetBLOSUMSubtreeWeight(pRightChild, dWeight);
+		return;
+		}
+
+// Leaf: its index selects the slot in the weights array.
+	const unsigned uLeafIndex = ptrNode->GetIndex();
+	m_Weights[uLeafIndex] = DoubleToWeight(dWeight);
+	}
+
+// Traverse a subtree looking for clusters where all
+// the leaves are sufficiently similar that they
+// should be weighted as a group, i.e. given a weight
+// of 1/N where N is the cluster size. The idea is
+// to avoid sample bias where we have closely related
+// sequences in the input alignment.
+// The weight at a node is the distance between
+// the two closest sequences in the left and right
+// subtrees under that node. "Sufficiently similar"
+// is defined as being where that minimum distance
+// is less than the dMinDist threshold. I don't know
+// why the clustering is done using a minimum rather
+// than a maximum or average, either of which would
+// seem more natural to me.
+// Return value is number of groups under this node.
+// A "group" is the cluster found under a node with a
+// weight less than the minimum.
+unsigned MSA::SetBLOSUMNodeWeight(const ClusterNode *ptrNode, double dMinDist) const
+	{
+	if (ptrNode == 0)
+		return 0;
+
+// A node whose weight is below the threshold forms one group;
+// otherwise the groups are counted separately in each child.
+	const bool bIsGroup = ptrNode->GetWeight() < dMinDist;
+	if (!bIsGroup)
+		{
+		const ClusterNode *pLeftChild = ptrNode->GetLeft();
+		const ClusterNode *pRightChild = ptrNode->GetRight();
+
+		unsigned uGroupCount = SetBLOSUMNodeWeight(pLeftChild, dMinDist);
+		uGroupCount += SetBLOSUMNodeWeight(pRightChild, dMinDist);
+		return uGroupCount;
+		}
+
+// Every leaf of the group receives the same share 1/N.
+	const unsigned uLeafCount = ptrNode->GetClusterSize();
+	assert(uLeafCount > 0);
+	SetBLOSUMSubtreeWeight(ptrNode, 1.0 / uLeafCount);
+	return 1;
+	}
+
+// Builds the cluster tree from pairwise identities, assigns the
+// BLOSUM weights and returns the group count, i.e. the effective
+// number of distinctly different sequences.
+unsigned MSA::CalcBLOSUMWeights(ClusterTree &BlosumCluster) const
+	{
+// Distance between two sequences is 1 - (fractional identity).
+	DistFunc DF;
+	const unsigned uCount = GetSeqCount();
+	DF.SetCount(uCount);
+	for (unsigned uFirst = 0; uFirst < uCount; ++uFirst)
+		{
+		for (unsigned uSecond = uFirst + 1; uSecond < uCount; ++uSecond)
+			{
+			const double dIdentity = GetPctIdentityPair(uFirst, uSecond);
+			assert(dIdentity >= 0.0 && dIdentity <= 1.0);
+			const float fDistance = (float) (1.0 - dIdentity);
+			DF.SetDist(uFirst, uSecond, fDistance);
+			}
+		}
+
+// Cluster using that distance function.
+	BlosumCluster.Create(DF);
+
+// The number of groups is HMMer's "effective sequence count".
+	const double dMinDist = 1.0 - BLOSUM_DIST;
+	return SetBLOSUMNodeWeight(BlosumCluster.GetRoot(), dMinDist);
+	}
diff --git a/src/muscle/muscle3.8.31/src/setgscweights.cpp b/src/muscle/muscle3.8.31/src/setgscweights.cpp
new file mode 100644
index 0000000..7e7e787
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/setgscweights.cpp
@@ -0,0 +1,195 @@
+/***
+Gerstein/Sonnhammer/Chothia ad hoc sequence weighting.
+The algorithm was deduced by reverse-engineering the
+HMMer code.
+
+I used an alternative representation that I prefer over
+HMMer's. The HMMer code is full of tree manipulations
+that do something to the left child and then the equivalent
+thing to the right child. 
It was clear that there must be +a re-formulation that does everything once for each node, +which would reduce the number of operations expressed +in the code by a factor of two. This gives a more elegant +and less error-prone way to code it. + +These notes explain the correspondence between my design +and Eddy's. + +HMMer stores a data structure phylo_s for each non-leaf +node in the cluster tree. This structure contains the +following fields: + + diff Weight of the node + lblen Left branch length + rblen Right branch length + +The lblen and rblen branch lengths are calculated as: + + this.lblen = this.diff - left.diff + this.rblen = this.diff - right.diff + +My code stores one ClusterNode data structure per node +in the cluster tree, including leaves. I store only the +weight. I can recover the HMMer branch length fields +in a trivial O(1) calculation as follows: + + lblen = Node.GetWeight() - Node.GetLeft()->GetWeight() + rblen = Node.GetWeight() - Node.GetRight()->GetWeight() + +For the GSC weights calculation, HMMer constructs the +following vectors, which have entries for all nodes, +including leaves: + + lwt Left weight + rwt Right weight + +The "left weight" is calculated as the sum of the weights in +all the nodes reachable through the left branch, including +the node itself. (This is not immediately obvious from the +code, which does the calculation using branch lengths rather +than weights, but this is an equivalent, and to my mind clearer, +statement of what they are). Similarly, the "right weight" is +the sum of all weights reachable via the right branch. I define +the "cluster weight" to be the summed weight of all nodes in the +subtree under the node, including the node itself. I provide +a function Node.GetClusterWeight() which calculates the cluster +weight using a O(ln N) recursion through the tree. 
The lwt and
+rwt values can be recovered as follows:
+
+	lwt = Node.GetLeft()->GetClusterWeight()
+			+ Node.GetWeight()
+
+	rwt = Node.GetRight()->GetClusterWeight()
+			+ Node.GetWeight()
+
+HMMer calculates a further vector fwt as follows.
+
+	this.fwt = parent.fwt * parent.lwt / (parent.lwt + parent.rwt)
+
+This applies to nodes reached via a left branch; for nodes reached
+via a right branch:
+
+	this.fwt = parent.fwt * parent.rwt / (parent.lwt + parent.rwt)
+
+The values of fwt at the leaf nodes are the final GSC weights.
+We derive the various terms using our equivalents.
+
+	parent.lwt	= Parent.GetLeft()->GetClusterWeight()
+				+ Parent.GetWeight()
+
+	parent.rwt	= Parent.GetRight()->GetClusterWeight()
+				+ Parent.GetWeight()
+
+	parent.lwt + parent.rwt =
+				{ Parent.GetLeft()->GetClusterWeight()
+				+ Parent.GetRight()->GetClusterWeight()
+				+ Parent.GetWeight() }
+				+ Parent.GetWeight()
+
+We recognize the term {...} as the cluster weight of the
+parent, so
+
+	parent.lwt + parent.rwt
+				= Parent.GetClusterWeight()
+				+ Parent.GetWeight()
+
+As you would expect, repeating this exercise for parent.rwt gives
+exactly the same expression.
+
+The GSC weights (fwt) are stored in the Weight2 field of the cluster
+tree, the Weight field stores the original (BLOSUM) weights used
+as input to this algorithm.
+***/
+
+#include "muscle.h"
+#include "msa.h"
+#include "cluster.h"
+#include "distfunc.h"
+
+// Set weights of all sequences in the subtree under given node.
+void MSA::SetSubtreeWeight2(const ClusterNode *ptrNode) const + { + if (0 == ptrNode) + return; + + const ClusterNode *ptrRight = ptrNode->GetRight(); + const ClusterNode *ptrLeft = ptrNode->GetLeft(); + +// If leaf, set weight + if (0 == ptrRight && 0 == ptrLeft) + { + unsigned uIndex = ptrNode->GetIndex(); + double dWeight = ptrNode->GetWeight2(); + WEIGHT w = DoubleToWeight(dWeight); + m_Weights[uIndex] = w; + return; + } + +// Otherwise, recursively set subtrees + SetSubtreeWeight2(ptrLeft); + SetSubtreeWeight2(ptrRight); + } + +void MSA::SetSubtreeGSCWeight(ClusterNode *ptrNode) const + { + if (0 == ptrNode) + return; + + ClusterNode *ptrParent = ptrNode->GetParent(); + double dParentWeight2 = ptrParent->GetWeight2(); + double dParentClusterWeight = ptrParent->GetClusterWeight(); + if (0.0 == dParentClusterWeight) + { + double dThisClusterSize = ptrNode->GetClusterSize(); + double dParentClusterSize = ptrParent->GetClusterSize(); + double dWeight2 = + dParentWeight2*dThisClusterSize/dParentClusterSize; + ptrNode->SetWeight2(dWeight2); + } + else + { + // Could cache cluster weights for better performance. + // We calculate cluster weight of each node twice, so this + // would give x2 improvement. + // As weighting is not very expensive, we don't care. + double dThisClusterWeight = ptrNode->GetClusterWeight(); + double dParentWeight = ptrParent->GetWeight(); + + double dNum = dThisClusterWeight + dParentWeight; + double dDenom = dParentClusterWeight + dParentWeight; + double dWeight2 = dParentWeight2*(dNum/dDenom); + + ptrNode->SetWeight2(dWeight2); + } + + SetSubtreeGSCWeight(ptrNode->GetLeft()); + SetSubtreeGSCWeight(ptrNode->GetRight()); + } + +void MSA::SetGSCWeights() const + { + ClusterTree CT; + CalcBLOSUMWeights(CT); + +// Calculate weights and store in tree. + ClusterNode *ptrRoot = CT.GetRoot(); + ptrRoot->SetWeight2(1.0); + SetSubtreeGSCWeight(ptrRoot->GetLeft()); + SetSubtreeGSCWeight(ptrRoot->GetRight()); + +// Copy weights from tree to MSA. 
+ SetSubtreeWeight2(ptrRoot); + } + +void MSA::ListWeights() const + { + const unsigned uSeqCount = GetSeqCount(); + Log("Weights:\n"); + WEIGHT wTotal = 0; + for (unsigned n = 0; n < uSeqCount; ++n) + { + wTotal += GetSeqWeight(n); + Log("%6.3f %s\n", GetSeqWeight(n), GetSeqName(n)); + } + Log("Total weights = %6.3f, should be 1.0\n", wTotal); + } diff --git a/src/muscle/muscle3.8.31/src/setnewhandler.cpp b/src/muscle/muscle3.8.31/src/setnewhandler.cpp new file mode 100644 index 0000000..73fd68c --- /dev/null +++ b/src/muscle/muscle3.8.31/src/setnewhandler.cpp @@ -0,0 +1,26 @@ +#include "muscle.h" +#include +#include + +const int ONE_MB = 1024*1024; +const size_t RESERVE_BYTES = 8*ONE_MB; +static void *EmergencyReserve = 0; + +void OnOutOfMemory() + { + free(EmergencyReserve); + fprintf(stderr, "\n*** OUT OF MEMORY ***\n"); + fprintf(stderr, "Memory allocated so far %g MB\n", GetMemUseMB()); + extern MSA *ptrBestMSA; + if (ptrBestMSA == 0) + fprintf(stderr, "No alignment generated\n"); + else + SaveCurrentAlignment(); + exit(EXIT_FatalError); + } + +void SetNewHandler() + { + EmergencyReserve = malloc(RESERVE_BYTES); + std::set_new_handler(OnOutOfMemory); + } diff --git a/src/muscle/muscle3.8.31/src/spfast.cpp b/src/muscle/muscle3.8.31/src/spfast.cpp new file mode 100644 index 0000000..51e0dba --- /dev/null +++ b/src/muscle/muscle3.8.31/src/spfast.cpp @@ -0,0 +1,269 @@ +#include "muscle.h" +#include "profile.h" + +#define TRACE 0 + +enum + { + LL = 0, + LG = 1, + GL = 2, + GG = 3, + }; + +static const char *GapTypeToStr(int GapType) + { + switch (GapType) + { + case LL: return "LL"; + case LG: return "LG"; + case GL: return "GL"; + case GG: return "GG"; + } + Quit("Invalid gap type"); + return "?"; + } + +static SCORE GapScoreMatrix[4][4]; + +static void InitGapScoreMatrix() + { + const SCORE t = (SCORE) 0.2; + + GapScoreMatrix[LL][LL] = 0; + GapScoreMatrix[LL][LG] = g_scoreGapOpen; + GapScoreMatrix[LL][GL] = 0; + GapScoreMatrix[LL][GG] = 0; + + 
GapScoreMatrix[LG][LL] = g_scoreGapOpen; + GapScoreMatrix[LG][LG] = 0; + GapScoreMatrix[LG][GL] = g_scoreGapOpen; + GapScoreMatrix[LG][GG] = t*g_scoreGapOpen; // approximation! + + GapScoreMatrix[GL][LL] = 0; + GapScoreMatrix[GL][LG] = g_scoreGapOpen; + GapScoreMatrix[GL][GL] = 0; + GapScoreMatrix[GL][GG] = 0; + + GapScoreMatrix[GG][LL] = 0; + GapScoreMatrix[GG][LG] = t*g_scoreGapOpen; // approximation! + GapScoreMatrix[GG][GL] = 0; + GapScoreMatrix[GG][GG] = 0; + + for (int i = 0; i < 4; ++i) + for (int j = 0; j < i; ++j) + if (GapScoreMatrix[i][j] != GapScoreMatrix[j][i]) + Quit("GapScoreMatrix not symmetrical"); + } + +static SCORE SPColBrute(const MSA &msa, unsigned uColIndex) + { + SCORE Sum = 0; + const unsigned uSeqCount = msa.GetSeqCount(); + for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) + { + const WEIGHT w1 = msa.GetSeqWeight(uSeqIndex1); + unsigned uLetter1 = msa.GetLetterEx(uSeqIndex1, uColIndex); + if (uLetter1 >= 20) + continue; + for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqIndex1; ++uSeqIndex2) + { + const WEIGHT w2 = msa.GetSeqWeight(uSeqIndex2); + unsigned uLetter2 = msa.GetLetterEx(uSeqIndex2, uColIndex); + if (uLetter2 >= 20) + continue; + SCORE t = w1*w2*(*g_ptrScoreMatrix)[uLetter1][uLetter2]; +#if TRACE + Log("Check %c %c w1=%.3g w2=%.3g Mx=%.3g t=%.3g\n", + LetterToCharAmino(uLetter1), + LetterToCharAmino(uLetter2), + w1, + w2, + (*g_ptrScoreMatrix)[uLetter1][uLetter2], + t); +#endif + Sum += t; + } + } + return Sum; + } + +static SCORE SPGapFreqs(const FCOUNT Freqs[]) + { +#if TRACE + Log("Freqs="); + for (unsigned i = 0; i < 4; ++i) + if (Freqs[i] != 0) + Log(" %s=%.3g", GapTypeToStr(i), Freqs[i]); + Log("\n"); +#endif + + SCORE TotalOffDiag = 0; + SCORE TotalDiag = 0; + for (unsigned i = 0; i < 4; ++i) + { + const FCOUNT fi = Freqs[i]; + if (0 == fi) + continue; + const float *Row = GapScoreMatrix[i]; + SCORE diagt = fi*fi*Row[i]; + TotalDiag += diagt; +#if TRACE + Log("SPFGaps %s %s + Mx=%.3g TotalDiag += %.3g\n", 
+ GapTypeToStr(i), + GapTypeToStr(i), + Row[i], + diagt); +#endif + SCORE Sum = 0; + for (unsigned j = 0; j < i; ++j) + { + SCORE t = Freqs[j]*Row[j]; +#if TRACE + if (Freqs[j] != 0) + Log("SPFGaps %s %s + Mx=%.3g Sum += %.3g\n", + GapTypeToStr(i), + GapTypeToStr(j), + Row[j], + fi*t); +#endif + Sum += t; + } + TotalOffDiag += fi*Sum; + } +#if TRACE + Log("SPFGap TotalOffDiag=%.3g + TotalDiag=%.3g = %.3g\n", + TotalOffDiag, TotalDiag, TotalOffDiag + TotalDiag); +#endif + return TotalOffDiag*2 + TotalDiag; + } + +static SCORE SPFreqs(const FCOUNT Freqs[]) + { +#if TRACE + Log("Freqs="); + for (unsigned i = 0; i < 20; ++i) + if (Freqs[i] != 0) + Log(" %c=%.3g", LetterToCharAmino(i), Freqs[i]); + Log("\n"); +#endif + + SCORE TotalOffDiag = 0; + SCORE TotalDiag = 0; + for (unsigned i = 0; i < 20; ++i) + { + const FCOUNT fi = Freqs[i]; + if (0 == fi) + continue; + const float *Row = (*g_ptrScoreMatrix)[i]; + SCORE diagt = fi*fi*Row[i]; + TotalDiag += diagt; +#if TRACE + Log("SPF %c %c + Mx=%.3g TotalDiag += %.3g\n", + LetterToCharAmino(i), + LetterToCharAmino(i), + Row[i], + diagt); +#endif + SCORE Sum = 0; + for (unsigned j = 0; j < i; ++j) + { + SCORE t = Freqs[j]*Row[j]; +#if TRACE + if (Freqs[j] != 0) + Log("SPF %c %c + Mx=%.3g Sum += %.3g\n", + LetterToCharAmino(i), + LetterToCharAmino(j), + Row[j], + fi*t); +#endif + Sum += t; + } + TotalOffDiag += fi*Sum; + } +#if TRACE + Log("SPF TotalOffDiag=%.3g + TotalDiag=%.3g = %.3g\n", + TotalOffDiag, TotalDiag, TotalOffDiag + TotalDiag); +#endif + return TotalOffDiag*2 + TotalDiag; + } + +static SCORE ObjScoreSPCol(const MSA &msa, unsigned uColIndex) + { + FCOUNT Freqs[20]; + FCOUNT GapFreqs[4]; + + memset(Freqs, 0, sizeof(Freqs)); + memset(GapFreqs, 0, sizeof(GapFreqs)); + + const unsigned uSeqCount = msa.GetSeqCount(); +#if TRACE + Log("Weights="); + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + Log(" %u=%.3g", uSeqIndex, msa.GetSeqWeight(uSeqIndex)); + Log("\n"); +#endif + SCORE SelfOverCount = 0; 
+ SCORE GapSelfOverCount = 0; + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + WEIGHT w = msa.GetSeqWeight(uSeqIndex); + + bool bGapThisCol = msa.IsGap(uSeqIndex, uColIndex); + bool bGapPrevCol = (uColIndex == 0 ? false : msa.IsGap(uSeqIndex, uColIndex - 1)); + int GapType = bGapThisCol + 2*bGapPrevCol; + assert(GapType >= 0 && GapType < 4); + GapFreqs[GapType] += w; + SCORE gapt = w*w*GapScoreMatrix[GapType][GapType]; + GapSelfOverCount += gapt; + + if (bGapThisCol) + continue; + unsigned uLetter = msa.GetLetterEx(uSeqIndex, uColIndex); + if (uLetter >= 20) + continue; + Freqs[uLetter] += w; + SCORE t = w*w*(*g_ptrScoreMatrix)[uLetter][uLetter]; +#if TRACE + Log("FastCol compute freqs & SelfOverCount %c w=%.3g M=%.3g SelfOverCount += %.3g\n", + LetterToCharAmino(uLetter), w, (*g_ptrScoreMatrix)[uLetter][uLetter], t); +#endif + SelfOverCount += t; + } + SCORE SPF = SPFreqs(Freqs); + SCORE Col = SPF - SelfOverCount; + + SCORE SPFGaps = SPGapFreqs(GapFreqs); + SCORE ColGaps = SPFGaps - GapSelfOverCount; +#if TRACE + Log("SPF=%.3g - SelfOverCount=%.3g = %.3g\n", SPF, SelfOverCount, Col); + Log("SPFGaps=%.3g - GapsSelfOverCount=%.3g = %.3g\n", SPFGaps, GapSelfOverCount, ColGaps); +#endif + return Col + ColGaps; + } + +SCORE ObjScoreSPDimer(const MSA &msa) + { + static bool bGapScoreMatrixInit = false; + if (!bGapScoreMatrixInit) + InitGapScoreMatrix(); + + SCORE Total = 0; + const unsigned uSeqCount = msa.GetSeqCount(); + const unsigned uColCount = msa.GetColCount(); + for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) + { + SCORE Col = ObjScoreSPCol(msa, uColIndex); +#if TRACE + { + SCORE ColCheck = SPColBrute(msa, uColIndex); + Log("FastCol=%.3g CheckCol=%.3g\n", Col, ColCheck); + } +#endif + Total += Col; + } +#if TRACE + Log("Total/2 = %.3g (final result from fast)\n", Total/2); +#endif + return Total/2; + } diff --git a/src/muscle/muscle3.8.31/src/sptest.cpp b/src/muscle/muscle3.8.31/src/sptest.cpp new file mode 100644 index 
0000000..ba06ecf
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/sptest.cpp
@@ -0,0 +1,199 @@
+#include "muscle.h"
+#include "objscore.h"
+#include "msa.h"
+#include "textfile.h"
+#include "pwpath.h"
+
+// Number of 'D'/'I' column pairs injected into each random test path.
+const unsigned INDELS = 1;
+
+// Choose two distinct interior positions (1 .. L-2) of Str that still hold
+// 'M' and return them through pi1 / pi2; the end columns are never picked.
+// Quits when fewer than two such positions exist: the unguarded loops spun
+// forever in that case, and evaluated rand()%0 when L was 2.
+static void GetPos(const char Str[], unsigned L, int *pi1, int *pi2)
+	{
+	unsigned uFree = 0;
+	for (unsigned i = 1; i + 1 < L; ++i)
+		if (Str[i] == 'M')
+			++uFree;
+	if (uFree < 2)
+		Quit("GetPos: fewer than two free match columns");
+
+	int i1;
+	for (;;)
+		{
+		i1 = rand()%(L-2) + 1;
+		if (Str[i1] == 'M')
+			break;
+		}
+	int i2;
+	for (;;)
+		{
+		i2 = rand()%(L-2) + 1;
+		if (i1 != i2 && Str[i2] == 'M')
+			break;
+		}
+	*pi1 = i1;
+	*pi2 = i2;
+	}
+
+// Fill Str with a path of uSeqLength + uIndelCount columns: all 'M', then
+// uIndelCount random interior columns flipped to 'D' and as many to 'I'.
+// Str must have room for uSeqLength + uIndelCount + 1 chars.
+static void MakePath(unsigned uSeqLength, unsigned uIndelCount, char Str[])
+	{
+	unsigned uPathLength = uSeqLength + uIndelCount;
+	for (unsigned i = 0; i < uPathLength; ++i)
+		Str[i] = 'M';
+
+	for (unsigned i = 0; i < uIndelCount; ++i)
+		{
+		int i1, i2;
+		GetPos(Str, uPathLength, &i1, &i2);
+		Str[i1] = 'D';
+		Str[i2] = 'I';
+		}
+
+	Str[uPathLength] = 0;
+	Log("MakePath=%s\n", Str);
+	}
+
+// Developer harness: reads two alignments from hard-coded c:\tmp files,
+// merges them along two random paths and logs the ObjScoreSP / ObjScoreXP
+// values of both merges together with their differences.
+void SPTest()
+	{
+	SetPPScore(PPSCORE_SV);
+
+	SetListFileName("c:\\tmp\\muscle.log", false);
+
+	TextFile file1("c:\\tmp\\msa1.afa");
+	TextFile file2("c:\\tmp\\msa2.afa");
+
+	MSA msa1;
+	MSA msa2;
+
+	msa1.FromFile(file1);
+	msa2.FromFile(file2);
+
+	Log("msa1=\n");
+	msa1.LogMe();
+	Log("msa2=\n");
+	msa2.LogMe();
+
+	const unsigned uColCount = msa1.GetColCount();
+	if (msa2.GetColCount() != uColCount)
+		Quit("Different lengths");
+
+	const unsigned uSeqCount1 = msa1.GetSeqCount();
+	const unsigned uSeqCount2 = msa2.GetSeqCount();
+	const unsigned uSeqCount = uSeqCount1 + uSeqCount2;
+
+	MSA::SetIdCount(uSeqCount);
+
+	for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount1; ++uSeqIndex1)
+		{
+		msa1.SetSeqWeight(uSeqIndex1, 1.0);
+		msa1.SetSeqId(uSeqIndex1, uSeqIndex1);
+		}
+
+	for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqCount2; ++uSeqIndex2)
+		{
+		msa2.SetSeqWeight(uSeqIndex2, 1.0);
+		msa2.SetSeqId(uSeqIndex2, uSeqCount1 + uSeqIndex2);
+		}
+
+	MSA alnA;
+	MSA alnB;
+
+	char *strPathA = new char[uColCount + INDELS + 1];	// sized to the path; was char[1024]
+	char *strPathB = new char[uColCount + INDELS + 1];
+	MakePath(uColCount, INDELS, strPathA);
+	MakePath(uColCount, INDELS, strPathB);
+
+	PWPath PathA;
+	PWPath PathB;
+	PathA.FromStr(strPathA);
+	PathB.FromStr(strPathB);
+
+	Log("PathA=\n");
+	PathA.LogMe();
+	Log("PathB=\n");
+	PathB.LogMe();
+
+	AlignTwoMSAsGivenPath(PathA, msa1, msa2, alnA);
+	AlignTwoMSAsGivenPath(PathB, msa1, msa2, alnB);
+
+	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
+		{
+		alnA.SetSeqWeight(uSeqIndex, 1.0);
+		alnB.SetSeqWeight(uSeqIndex, 1.0);
+		}
+
+	unsigned *Seqs1 = new unsigned[uSeqCount1];	// sized to the input; was unsigned[1024]
+	unsigned *Seqs2 = new unsigned[uSeqCount2];
+
+	for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount1; ++uSeqIndex1)
+		Seqs1[uSeqIndex1] = uSeqIndex1;
+
+	for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqCount2; ++uSeqIndex2)
+		Seqs2[uSeqIndex2] = uSeqCount1 + uSeqIndex2;
+
+	MSA msaA1;
+	MSA msaA2;
+	MSA msaB1;
+	MSA msaB2;
+	MSAFromSeqSubset(alnA, Seqs1, uSeqCount1, msaA1);
+	MSAFromSeqSubset(alnB, Seqs1, uSeqCount1, msaB1);
+	MSAFromSeqSubset(alnA, Seqs2, uSeqCount2, msaA2);
+	MSAFromSeqSubset(alnB, Seqs2, uSeqCount2, msaB2);
+
+	for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount1; ++uSeqIndex1)
+		{
+		msaA1.SetSeqWeight(uSeqIndex1, 1.0);
+		msaB1.SetSeqWeight(uSeqIndex1, 1.0);
+		}
+
+	for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqCount2; ++uSeqIndex2)
+		{
+		msaA2.SetSeqWeight(uSeqIndex2, 1.0);
+		msaB2.SetSeqWeight(uSeqIndex2, 1.0);
+		}
+
+	Log("msaA1=\n");
+	msaA1.LogMe();
+
+	Log("msaB1=\n");
+	msaB1.LogMe();
+
+	Log("msaA2=\n");
+	msaA2.LogMe();
+
+	Log("msaB2=\n");
+	msaB2.LogMe();
+
+	Log("alnA=\n");
+	alnA.LogMe();
+
+	Log("AlnB=\n");
+	alnB.LogMe();
+
+	Log("\nSPA\n---\n");
+	SCORE SPA = ObjScoreSP(alnA);
+	Log("\nSPB\n---\n");
+	SCORE SPB = ObjScoreSP(alnB);
+
+	Log("\nXPA\n---\n");
+	SCORE XPA = ObjScoreXP(msaA1, msaA2);
+	Log("\nXPB\n---\n");
+	SCORE XPB = ObjScoreXP(msaB1, msaB2);
+
+	Log("SPA=%.4g SPB=%.4g Diff=%.4g\n", SPA, SPB, SPA - SPB);
+	Log("XPA=%.4g XPB=%.4g Diff=%.4g\n", XPA, XPB, XPA - XPB);
+
+	delete[] strPathA;
+	delete[] strPathB;
+	delete[] Seqs1;
+	delete[] Seqs2;
+	}
diff --git a/src/muscle/muscle3.8.31/src/stabilize.cpp 
b/src/muscle/muscle3.8.31/src/stabilize.cpp
new file mode 100644
index 0000000..b6b2569
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/stabilize.cpp
@@ -0,0 +1,27 @@
+#include "muscle.h"
+#include "msa.h"
+
+// Copy msa into msaStable with rows reordered so that row i holds the
+// sequence whose id is i (name and every column character are copied).
+//
+// The id is recorded against row uId, the row that actually receives the
+// sequence. It used to be recorded against uSeqIndex, the row in the source
+// alignment, which mislabels rows whenever the two orders differ.
+void Stabilize(const MSA &msa, MSA &msaStable)
+	{
+	const unsigned uSeqCount = msa.GetSeqCount();
+	const unsigned uColCount = msa.GetColCount();
+
+	msaStable.SetSize(uSeqCount, uColCount);
+	for (unsigned uId = 0; uId < uSeqCount; ++uId)
+		{
+		const unsigned uSeqIndex = msa.GetSeqIndex(uId);	// source row holding id uId
+		msaStable.SetSeqName(uId, msa.GetSeqName(uSeqIndex));
+		msaStable.SetSeqId(uId, uId);
+		for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
+			{
+			const char c = msa.GetChar(uSeqIndex, uColIndex);
+			msaStable.SetChar(uId, uColIndex, c);
+			}
+		}
+	}
diff --git a/src/muscle/muscle3.8.31/src/subfam.cpp b/src/muscle/muscle3.8.31/src/subfam.cpp
new file mode 100644
index 0000000..e720cfa
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/subfam.cpp
@@ -0,0 +1,409 @@
+#include "muscle.h"
+#include "tree.h"
+#include "textfile.h" // for test only
+#include "msa.h"
+#include "seqvect.h"
+#include "profile.h"
+#ifndef _MSC_VER
+#include <unistd.h> // for unlink
+#endif
+
+#define TRACE 0
+
+/***
+Find subfamilies from tree by following criteria:
+
+(a) number of leaves <= max,
+(b) is monophyletic, i.e. most recent common ancestor is parent
+of no more than one subfamily.
+***/ + +static unsigned SubFamRecurse(const Tree &tree, unsigned uNodeIndex, unsigned uMaxLeafCount, + unsigned SubFams[], unsigned &uSubFamCount) + { + if (tree.IsLeaf(uNodeIndex)) + return 1; + + unsigned uLeft = tree.GetLeft(uNodeIndex); + unsigned uRight = tree.GetRight(uNodeIndex); + unsigned uLeftCount = SubFamRecurse(tree, uLeft, uMaxLeafCount, SubFams, uSubFamCount); + unsigned uRightCount = SubFamRecurse(tree, uRight, uMaxLeafCount, SubFams, uSubFamCount); + + unsigned uLeafCount = uLeftCount + uRightCount; + if (uLeftCount + uRightCount > uMaxLeafCount) + { + if (uLeftCount <= uMaxLeafCount) + SubFams[uSubFamCount++] = uLeft; + if (uRightCount <= uMaxLeafCount) + SubFams[uSubFamCount++] = uRight; + } + else if (tree.IsRoot(uNodeIndex)) + { + if (uSubFamCount != 0) + Quit("Error in SubFamRecurse"); + SubFams[uSubFamCount++] = uNodeIndex; + } + + return uLeafCount; + } + +void SubFam(const Tree &tree, unsigned uMaxLeafCount, unsigned SubFams[], unsigned *ptruSubFamCount) + { + *ptruSubFamCount = 0; + SubFamRecurse(tree, tree.GetRootNodeIndex(), uMaxLeafCount, SubFams, *ptruSubFamCount); + +#if TRACE + { + Log("\n"); + Log("Tree:\n"); + tree.LogMe(); + //void DrawTree(const Tree &tree); + //DrawTree(tree); + Log("\n"); + Log("%d subfams:\n", *ptruSubFamCount); + for (unsigned i = 0; i < *ptruSubFamCount; ++i) + Log(" %d=%d", i, SubFams[i]); + Log("\n"); + } +#endif + } + +//unsigned SubFams[9999]; +//unsigned uSubFamCount; +// +//static unsigned DistFromRoot(const Tree &tree, unsigned uNodeIndex) +// { +// const unsigned uRoot = tree.GetRootNodeIndex(); +// unsigned uDist = 0; +// while (uNodeIndex != uRoot) +// { +// ++uDist; +// uNodeIndex = tree.GetParent(uNodeIndex); +// } +// return uDist; +// } +// +//static void DrawNode(const Tree &tree, unsigned uNodeIndex) +// { +// if (!tree.IsLeaf(uNodeIndex)) +// DrawNode(tree, tree.GetLeft(uNodeIndex)); +// +// unsigned uDist = DistFromRoot(tree, uNodeIndex); +// for (unsigned i = 0; i < 5*uDist; ++i) +// 
Log(" "); +// Log("%d", uNodeIndex); +// for (unsigned i = 0; i < uSubFamCount; ++i) +// if (uNodeIndex == SubFams[i]) +// { +// Log("*"); +// break; +// } +// Log("\n"); +// +// if (!tree.IsLeaf(uNodeIndex)) +// DrawNode(tree, tree.GetRight(uNodeIndex)); +// } +// +//static void DrawTree(const Tree &tree) +// { +// unsigned uRoot = tree.GetRootNodeIndex(); +// DrawNode(tree, uRoot); +// } +// +//void TestSubFams(const char *FileName) +// { +// Tree tree; +// TextFile f(FileName); +// tree.FromFile(f); +// SubFam(tree, 5, SubFams, &uSubFamCount); +// DrawTree(tree); +// } + +static void SetInFam(const Tree &tree, unsigned uNodeIndex, bool NodeInSubFam[]) + { + if (tree.IsLeaf(uNodeIndex)) + return; + unsigned uLeft = tree.GetLeft(uNodeIndex); + unsigned uRight = tree.GetRight(uNodeIndex); + NodeInSubFam[uLeft] = true; + NodeInSubFam[uRight] = true; + + SetInFam(tree, uLeft, NodeInSubFam); + SetInFam(tree, uRight, NodeInSubFam); + } + +void AlignSubFam(SeqVect &vAll, const Tree &GuideTree, unsigned uNodeIndex, + MSA &msaOut) + { + const unsigned uSeqCount = vAll.GetSeqCount(); + + const char *InTmp = "asf_in.tmp"; + const char *OutTmp = "asf_out.tmp"; + + unsigned *Leaves = new unsigned[uSeqCount]; + unsigned uLeafCount; + GetLeaves(GuideTree, uNodeIndex, Leaves, &uLeafCount); + + SeqVect v; + for (unsigned i = 0; i < uLeafCount; ++i) + { + unsigned uLeafNodeIndex = Leaves[i]; + unsigned uId = GuideTree.GetLeafId(uLeafNodeIndex); + Seq &s = vAll.GetSeqById(uId); + v.AppendSeq(s); + } + +#if TRACE + { + Log("Align subfam[node=%d, size=%d] ", uNodeIndex, uLeafCount); + for (unsigned i = 0; i < uLeafCount; ++i) + Log(" %s", v.GetSeqName(i)); + Log("\n"); + } +#endif + + TextFile fIn(InTmp, true); + + v.ToFASTAFile(fIn); + fIn.Close(); + + char CmdLine[4096]; + sprintf(CmdLine, "probcons %s > %s 2> /dev/null", InTmp, OutTmp); +// sprintf(CmdLine, "muscle -in %s -out %s -maxiters 1", InTmp, OutTmp); + int NotUsed = system(CmdLine); + + TextFile fOut(OutTmp); + 
msaOut.FromFile(fOut); + + for (unsigned uSeqIndex = 0; uSeqIndex < uLeafCount; ++uSeqIndex) + { + const char *Name = msaOut.GetSeqName(uSeqIndex); + unsigned uId = vAll.GetSeqIdFromName(Name); + msaOut.SetSeqId(uSeqIndex, uId); + } + + unlink(InTmp); + unlink(OutTmp); + + delete[] Leaves; + } + +void ProgAlignSubFams() + { + MSA msaOut; + + SetOutputFileName(g_pstrOutFileName); + SetInputFileName(g_pstrInFileName); + + SetMaxIters(g_uMaxIters); + SetSeqWeightMethod(g_SeqWeight1); + + TextFile fileIn(g_pstrInFileName); + SeqVect v; + v.FromFASTAFile(fileIn); + const unsigned uSeqCount = v.Length(); + + if (0 == uSeqCount) + Quit("No sequences in input file"); + + ALPHA Alpha = ALPHA_Undefined; + switch (g_SeqType) + { + case SEQTYPE_Auto: + Alpha = v.GuessAlpha(); + break; + + case SEQTYPE_Protein: + Alpha = ALPHA_Amino; + break; + + case SEQTYPE_DNA: + Alpha = ALPHA_DNA; + break; + + case SEQTYPE_RNA: + Alpha = ALPHA_RNA; + break; + + default: + Quit("Invalid seq type"); + } + SetAlpha(Alpha); + v.FixAlpha(); + + PTR_SCOREMATRIX UserMatrix = 0; + if (0 != g_pstrMatrixFileName) + { + const char *FileName = g_pstrMatrixFileName; + const char *Path = getenv("MUSCLE_MXPATH"); + if (Path != 0) + { + size_t n = strlen(Path) + 1 + strlen(FileName) + 1; + char *NewFileName = new char[n]; + sprintf(NewFileName, "%s/%s", Path, FileName); + FileName = NewFileName; + } + TextFile File(FileName); + UserMatrix = ReadMx(File); + g_Alpha = ALPHA_Amino; + g_PPScore = PPSCORE_SP; + } + + SetPPScore(); + + if (0 != UserMatrix) + g_ptrScoreMatrix = UserMatrix; + + if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha) + { + SetPPScore(PPSCORE_SPN); + g_Distance1 = DISTANCE_Kmer4_6; + } + + unsigned uMaxL = 0; + unsigned uTotL = 0; + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + unsigned L = v.GetSeq(uSeqIndex).Length(); + uTotL += L; + if (L > uMaxL) + uMaxL = L; + } + + SetIter(1); + g_bDiags = g_bDiags1; + SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount); + + 
SetMuscleSeqVect(v); + + MSA::SetIdCount(uSeqCount); + +// Initialize sequence ids. +// From this point on, ids must somehow propogate from here. + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + v.SetSeqId(uSeqIndex, uSeqIndex); + + if (uSeqCount > 1) + MHackStart(v); + + if (0 == uSeqCount) + { + msaOut.Clear(); + return; + } + + if (1 == uSeqCount && ALPHA_Amino == Alpha) + { + const Seq &s = v.GetSeq(0); + msaOut.FromSeq(s); + return; + } + + Tree GuideTree; + TreeFromSeqVect(v, GuideTree, g_Cluster1, g_Distance1, g_Root1); + SetMuscleTree(GuideTree); + + MSA msa; + if (g_bLow) + { + ProgNode *ProgNodes = 0; + ProgNodes = ProgressiveAlignE(v, GuideTree, msa); + delete[] ProgNodes; + } + else + ProgressiveAlign(v, GuideTree, msa); + SetCurrentAlignment(msa); + TreeFromMSA(msa, GuideTree, g_Cluster2, g_Distance2, g_Root2); + SetMuscleTree(GuideTree); + + unsigned *SubFams = new unsigned[uSeqCount]; + unsigned uSubFamCount; + SubFam(GuideTree, g_uMaxSubFamCount, SubFams, &uSubFamCount); + + SetProgressDesc("Align node"); + const unsigned uNodeCount = 2*uSeqCount - 1; + + ProgNode *ProgNodes = new ProgNode[uNodeCount]; + bool *NodeIsSubFam = new bool[uNodeCount]; + bool *NodeInSubFam = new bool[uNodeCount]; + + for (unsigned i = 0; i < uNodeCount; ++i) + { + NodeIsSubFam[i] = false; + NodeInSubFam[i] = false; + } + + for (unsigned i = 0; i < uSubFamCount; ++i) + { + unsigned uNodeIndex = SubFams[i]; + assert(uNodeIndex < uNodeCount); + NodeIsSubFam[uNodeIndex] = true; + SetInFam(GuideTree, uNodeIndex, NodeInSubFam); + } + + unsigned uJoin = 0; + unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode(); + do + { + if (NodeIsSubFam[uTreeNodeIndex]) + { +#if TRACE + Log("Node %d: align subfam\n", uTreeNodeIndex); +#endif + ProgNode &Node = ProgNodes[uTreeNodeIndex]; + AlignSubFam(v, GuideTree, uTreeNodeIndex, Node.m_MSA); + Node.m_uLength = Node.m_MSA.GetColCount(); + } + else if (!NodeInSubFam[uTreeNodeIndex]) + { +#if TRACE + Log("Node %d: align 
two subfams\n", uTreeNodeIndex); +#endif + Progress(uJoin, uSubFamCount - 1); + ++uJoin; + + const unsigned uMergeNodeIndex = uTreeNodeIndex; + ProgNode &Parent = ProgNodes[uMergeNodeIndex]; + + const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex); + const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex); + + ProgNode &Node1 = ProgNodes[uLeft]; + ProgNode &Node2 = ProgNodes[uRight]; + + PWPath Path; + AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, Path); + Parent.m_uLength = Parent.m_MSA.GetColCount(); + + Node1.m_MSA.Clear(); + Node2.m_MSA.Clear(); + } + else + { +#if TRACE + Log("Node %d: in subfam\n", uTreeNodeIndex); +#endif + ; + } + uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex); + } + while (NULL_NEIGHBOR != uTreeNodeIndex); + ProgressStepsDone(); + + unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex(); + ProgNode &RootProgNode = ProgNodes[uRootNodeIndex]; + + TextFile fOut(g_pstrOutFileName, true); + MHackEnd(RootProgNode.m_MSA); + RootProgNode.m_MSA.ToFile(fOut); + + delete[] NodeInSubFam; + delete[] NodeIsSubFam; + delete[] ProgNodes; + delete[] SubFams; + + ProgNodes = 0; + NodeInSubFam = 0; + NodeIsSubFam = 0; + SubFams = 0; + } diff --git a/src/muscle/muscle3.8.31/src/subfams.cpp b/src/muscle/muscle3.8.31/src/subfams.cpp new file mode 100644 index 0000000..62cad34 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/subfams.cpp @@ -0,0 +1,65 @@ +#include "muscle.h" +#include "distfunc.h" + +const float INFINITY = float(1e29); +const unsigned NILL = uInsane; + +static float *ShortestPathEstimate; +static unsigned *Predecessor; + +static void GetMostDistantPair(DistFunc &DF, unsigned *ptrIndex1, unsigned *ptrIndex2) + { + const unsigned uNodeCount = DF.GetCount(); + if (uNodeCount < 2) + Quit("GetMostDistantPair: < 2 seqs"); + + float MaxDist = -1; + unsigned Index1 = uInsane; + unsigned Index2 = uInsane; + for (unsigned i = 0; i < uNodeCount; ++i) + { + for (unsigned j = i + 1; j < uNodeCount; ++j) + { + float d = 
DF.GetDist(i, j); + if (d > MaxDist) + { + MaxDist = d; + Index1 = i; + Index2 = j; + } + } + } + + assert(Index1 != uInsane); + assert(Index2 != uInsane); + + *ptrIndex1 = Index1; + *ptrIndex2 = Index2; + } + +static void InitializeSingleSource(DistFunc &DF, unsigned uIndex) + { + const unsigned uNodeCount = 0; + + for (unsigned i = 0; i < uNodeCount; ++i) + { + ShortestPathEstimate[i] = INFINITY; + Predecessor[i] = NILL; + } + ShortestPathEstimate[uIndex] = 0; + } + +static void Relax(DistFunc &DF, unsigned u, unsigned v) + { + float w = DF.GetDist(u, v); + float d = ShortestPathEstimate[u] + w; + if (ShortestPathEstimate[v] > d) + { + ShortestPathEstimate[v] = d; + Predecessor[v] = u; + } + } + +void ShortestPath(DistFunc &DF, unsigned uIndex) + { + } diff --git a/src/muscle/muscle3.8.31/src/svnmods.h b/src/muscle/muscle3.8.31/src/svnmods.h new file mode 100644 index 0000000..548d4e1 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/svnmods.h @@ -0,0 +1 @@ +"export" diff --git a/src/muscle/muscle3.8.31/src/svnversion.h b/src/muscle/muscle3.8.31/src/svnversion.h new file mode 100644 index 0000000..0caeafe --- /dev/null +++ b/src/muscle/muscle3.8.31/src/svnversion.h @@ -0,0 +1 @@ +"31" diff --git a/src/muscle/muscle3.8.31/src/sw.cpp b/src/muscle/muscle3.8.31/src/sw.cpp new file mode 100644 index 0000000..81ef445 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/sw.cpp @@ -0,0 +1,206 @@ +#include "muscle.h" +#include +#include "pwpath.h" +#include "profile.h" +#include + +// Textbook Smith-Waterman affine gap implementation. 
+
+#define TRACE 0
+
+static const char *LocalScoreToStr(SCORE s)
+	{	// Format s for the trace tables; returns a static buffer, so not reentrant.
+	static char str[16];
+	if (MINUS_INFINITY == s)
+		return " *";
+	snprintf(str, sizeof(str), "%6.2f", s);	// bounded: a large score overflowed sprintf
+	return str;
+	}
+
+static void ListDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB,
+  unsigned uPrefixCountA, unsigned uPrefixCountB)
+	{	// Log one DP matrix: a header row of B consensus chars, then one row per A prefix.
+	Log(" ");
+	for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+		{
+		char c = ' ';
+		if (uPrefixLengthB > 0)
+			c = ConsensusChar(PB[uPrefixLengthB - 1]);
+		Log(" %4u:%c", uPrefixLengthB, c);
+		}
+	Log("\n");
+	for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+		{
+		char c = ' ';
+		if (uPrefixLengthA > 0)
+			c = ConsensusChar(PA[uPrefixLengthA - 1]);
+		Log("%4u:%c ", uPrefixLengthA, c);
+		for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+			Log(" %s", LocalScoreToStr(DPM(uPrefixLengthA, uPrefixLengthB)));
+		Log("\n");
+		}
+	}
+
+SCORE SW(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
+  unsigned uLengthB, PWPath &Path)
+	{	// Local alignment of two profiles; fills Path via TraceBackSW and returns the best cell score.
+	assert(uLengthB > 0 && uLengthA > 0);
+
+	const unsigned uPrefixCountA = uLengthA + 1;
+	const unsigned uPrefixCountB = uLengthB + 1;
+
+// Allocate DP matrices (widen before multiplying so the product cannot wrap in unsigned)
+	const size_t LM = (size_t) uPrefixCountA*uPrefixCountB;
+	SCORE *DPM_ = new SCORE[LM];
+	SCORE *DPD_ = new SCORE[LM];
+	SCORE *DPI_ = new SCORE[LM];
+
+	DPM(0, 0) = 0;
+	DPD(0, 0) = MINUS_INFINITY;
+	DPI(0, 0) = MINUS_INFINITY;
+
+	DPM(1, 0) = MINUS_INFINITY;
+	DPD(1, 0) = MINUS_INFINITY;
+	DPI(1, 0) = MINUS_INFINITY;
+
+	DPM(0, 1) = MINUS_INFINITY;
+	DPD(0, 1) = MINUS_INFINITY;
+	DPI(0, 1) = MINUS_INFINITY;
+
+// Empty prefix of B is special case
+	for (unsigned uPrefixLengthA = 2; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+		{
+		// M=LetterA+LetterB, impossible with empty prefix
+		DPM(uPrefixLengthA, 0) = MINUS_INFINITY;
+
+		// D=LetterA+GapB, never optimal in local alignment with gap penalties
+		DPD(uPrefixLengthA, 0) = MINUS_INFINITY;
+
+		// I=GapA+LetterB, impossible with empty prefix
+		DPI(uPrefixLengthA, 0) = MINUS_INFINITY;
+		}
+
+// Empty prefix of A is special case
+	for (unsigned uPrefixLengthB = 2; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+		{
+		// M=LetterA+LetterB, impossible with empty prefix
+		DPM(0, uPrefixLengthB) = MINUS_INFINITY;
+
+		// D=LetterA+GapB, impossible with empty prefix
+		DPD(0, uPrefixLengthB) = MINUS_INFINITY;
+
+		// I=GapA+LetterB, never optimal in local alignment with gap penalties
+		DPI(0, uPrefixLengthB) = MINUS_INFINITY;
+		}
+
+	SCORE scoreMax = MINUS_INFINITY;
+	unsigned uPrefixLengthAMax = uInsane;
+	unsigned uPrefixLengthBMax = uInsane;
+
+// ============
+// Main DP loop
+// ============
+	SCORE scoreGapCloseB = MINUS_INFINITY;
+	for (unsigned uPrefixLengthB = 1; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB)
+		{
+		const ProfPos &PPB = PB[uPrefixLengthB - 1];
+
+		SCORE scoreGapCloseA = MINUS_INFINITY;
+		for (unsigned uPrefixLengthA = 1; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA)
+			{
+			const ProfPos &PPA = PA[uPrefixLengthA - 1];
+
+			{
+			// Match M=LetterA+LetterB
+			SCORE scoreLL = ScoreProfPos2(PPA, PPB);
+
+			SCORE scoreMM = DPM(uPrefixLengthA-1, uPrefixLengthB-1);
+			SCORE scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseA;
+			SCORE scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + scoreGapCloseB;
+
+			SCORE scoreBest;
+			if (scoreMM >= scoreDM && scoreMM >= scoreIM)
+				scoreBest = scoreMM;
+			else if (scoreDM >= scoreMM && scoreDM >= scoreIM)
+				scoreBest = scoreDM;
+			else
+				{
+				assert(scoreIM >= scoreMM && scoreIM >= scoreDM);
+				scoreBest = scoreIM;
+				}
+			if (scoreBest < 0)	// local alignment: a negative prefix is dropped
+				scoreBest = 0;
+			scoreBest += scoreLL;
+			if (scoreBest > scoreMax)
+				{
+				scoreMax = scoreBest;
+				uPrefixLengthAMax = uPrefixLengthA;
+				uPrefixLengthBMax = uPrefixLengthB;
+				}
+			DPM(uPrefixLengthA, uPrefixLengthB) = scoreBest;
+			}
+
+			{
+			// Delete D=LetterA+GapB
+			SCORE scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) +
+			  PA[uPrefixLengthA-1].m_scoreGapOpen;
+			SCORE scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB);
+
+			SCORE scoreBest;
+			if (scoreMD >= scoreDD)
+				scoreBest = scoreMD;
+			else
+				{
+				assert(scoreDD >= scoreMD);
+				scoreBest = scoreDD;
+				}
+			DPD(uPrefixLengthA, uPrefixLengthB) = scoreBest;
+			}
+
+			// Insert I=GapA+LetterB
+			{
+			SCORE scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) +
+			  PB[uPrefixLengthB - 1].m_scoreGapOpen;
+			SCORE scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1);
+
+			SCORE scoreBest;
+			if (scoreMI >= scoreII)
+				scoreBest = scoreMI;
+			else
+				{
+				assert(scoreII > scoreMI);
+				scoreBest = scoreII;
+				}
+			DPI(uPrefixLengthA, uPrefixLengthB) = scoreBest;
+			}
+
+			scoreGapCloseA = PPA.m_scoreGapClose;
+			}
+		scoreGapCloseB = PPB.m_scoreGapClose;
+		}
+
+#if TRACE
+	Log("DPM:\n");
+	ListDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB);	// loop indices are out of scope here
+	Log("DPD:\n");
+	ListDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB);
+	Log("DPI:\n");
+	ListDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB);
+#endif
+
+	assert(scoreMax == DPM(uPrefixLengthAMax, uPrefixLengthBMax));
+	TraceBackSW(PA, uLengthA, PB, uLengthB, DPM_, DPD_, DPI_,
+	  uPrefixLengthAMax, uPrefixLengthBMax, Path);
+
+#if TRACE
+	SCORE scorePath = FastScorePath2(PA, uLengthA, PB, uLengthB, Path);
+	Path.LogMe();
+	Log("Score = %s Path = %s\n", LocalScoreToStr(scoreMax), LocalScoreToStr(scorePath));
+#endif
+
+	delete[] DPM_;
+	delete[] DPD_;
+	delete[] DPI_;
+
+	return scoreMax;
+	}
diff --git a/src/muscle/muscle3.8.31/src/termgaps.cpp b/src/muscle/muscle3.8.31/src/termgaps.cpp
new file mode 100644
index 0000000..f65f231
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/termgaps.cpp
@@ -0,0 +1,42 @@
+#include "muscle.h"
+#include "profile.h"
+
+// Adjust the terminal gap scores of a profile in place according to
+// g_TermGaps: Full leaves them alone, Half zeroes the first position's
+// gap-open and the last position's gap-close, Ext negates them. Entries
+// equal to MINUS_INFINITY are never touched.
+// NOTE: writes through Prof although the parameter is declared const.
+void SetTermGaps(const ProfPos *Prof, unsigned uLength)
+	{
+	if (0 == uLength)
+		return;
+
+	ProfPos *First = (ProfPos *) Prof;
+	ProfPos *Last = (ProfPos *) (Prof + uLength - 1);
+
+	switch (g_TermGaps)
+		{
+	case TERMGAPS_Full:
+		break;
+
+	case TERMGAPS_Half:
+		// -infinity check for lock left/right
+		if (First->m_scoreGapOpen != MINUS_INFINITY)
+			First->m_scoreGapOpen = 0;
+
+		if (uLength > 1 && Last->m_scoreGapClose != MINUS_INFINITY)
+			Last->m_scoreGapClose = 0;
+		break;	// used to fall into TERMGAPS_Ext, which only turned 0 into -0
+
+	case TERMGAPS_Ext:
+		if (First->m_scoreGapOpen != MINUS_INFINITY)
+			First->m_scoreGapOpen *= -1;
+
+		if (uLength > 1 && Last->m_scoreGapClose != MINUS_INFINITY)
+			Last->m_scoreGapClose *= -1;
+		break;
+
+	default:
+		Quit("Invalid g_TermGaps");
+		}
+	}
diff --git a/src/muscle/muscle3.8.31/src/textfile.cpp b/src/muscle/muscle3.8.31/src/textfile.cpp
new file mode 100644
index 0000000..dbe68e2
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/textfile.cpp
@@ -0,0 +1,359 @@
+#include "muscle.h"
+#include "textfile.h"
+#include <errno.h>	// header name was missing here; errno is used below
+
+TextFile::TextFile(const char szFileName[], bool bWrite)
+	{	// "-" selects stdout (write) or stdin (read); any other name is fopen'd in binary mode.
+	FILE *ptrFile = 0;
+	if (bWrite)
+		{
+		if (0 == strcmp(szFileName, "-"))
+			ptrFile = stdout;
+		else
+			ptrFile = fopen(szFileName, "wb");
+		}
+	else
+		{
+		if (0 == strcmp(szFileName, "-"))
+			ptrFile = stdin;
+		else
+			ptrFile = fopen(szFileName, "rb");
+		}
+	if (0 == ptrFile)
+		Quit("Cannot open '%s' errno=%d\n", szFileName, errno);
+	Init(ptrFile, szFileName);
+	}
+
+void TextFile::Init(FILE *ptrFile, const char *ptrFileName)
+	{	// Shared constructor body: takes a private copy of the name and resets the read state.
+	m_ptrFile = ptrFile;
+	m_ptrName = strdup(ptrFileName);
+	m_uLineNr = 1;
+	m_uColNr = 0;
+	m_bLastCharWasEOL = true;
+	m_cPushedBack = -1;
+#if DEBUG
+	setbuf(m_ptrFile, 0);
+#endif
+	}
+
+TextFile::TextFile(FILE *ptrFile, const char *ptrFileName)
+	{	// The caller's name was ignored in favour of "-"; a null name still maps to "-".
+	Init(ptrFile, ptrFileName != 0 ? ptrFileName : "-");
+	}
+
+TextFile::~TextFile()
+	{	// Standard streams are left open; everything else is closed here.
+	if (m_ptrFile &&
+	  m_ptrFile != stdin && m_ptrFile != stdout && m_ptrFile != stderr)
+		fclose(m_ptrFile);
+	free(m_ptrName);
+	}
+
+// Get line from file.
+// Return true if end-of-file, quit if line too long.
+bool TextFile::GetLine(char szLine[], unsigned uBytes)
+	{
+	if (0 == uBytes)
+		Quit("TextFile::GetLine, buffer zero size");
+
+	memset(szLine, 0, (size_t) uBytes);	// zero-fill, so the result is always terminated
+
+	unsigned uBytesCopied = 0;
+
+// Loop until end of line or end of file.
+	for (;;)
+		{
+		char c;
+		bool bEof = GetChar(c);
+		if (bEof)
+			return true;
+		if ('\r' == c)
+			continue;
+		if ('\n' == c)
+			return false;
+		if (uBytesCopied < uBytes - 1)
+			szLine[uBytesCopied++] = (char) c;
+		else
+			Quit("TextFile::GetLine: input buffer too small, line %u",
+			  m_uLineNr);
+		}
+	}
+
+// As GetLine, but trim leading and trailing blanks; skip empty lines
+bool TextFile::GetTrimLine(char szLine[], unsigned uBytes)
+	{
+	if (uBytes == 0)
+		Quit("GetTrimLine");
+	for (;;)
+		{
+		bool bEOF = GetLine(szLine, uBytes);
+		if (bEOF)
+			return true;
+		TrimBlanks(szLine);
+		if (0 != szLine[0])
+			break;
+		}
+	return false;
+	}
+
+void TextFile::Rewind()
+	{	// Back to the start of the file with the same read state Init() establishes.
+	fseek(m_ptrFile, 0, SEEK_SET);
+	m_uLineNr = 1;
+	m_uColNr = 0;
+	m_bLastCharWasEOL = true;
+	m_cPushedBack = -1;	// a char pushed back before the seek must not be replayed
+	}
+
+void TextFile::PutChar(char c)
+	{
+	int i = fputc(c, m_ptrFile);
+	assert(i == (int) (unsigned char) c);	// fputc returns the byte as unsigned char
+	if ('\n' == c)
+		{
+		++m_uLineNr;
+		m_uColNr = 1;
+		}
+	else
+		++m_uColNr;
+	}
+
+void TextFile::PutString(const char szLine[])
+	{
+	int iError = fputs(szLine, m_ptrFile);
+	assert(iError >= 0);
+	}
+
+void TextFile::PutFormat(const char szFormat[], ...)
+	{	// printf-style write straight to the stream: no fixed 4096-byte staging buffer to overflow.
+	va_list ArgList;
+	va_start(ArgList, szFormat);
+	int iError = vfprintf(m_ptrFile, szFormat, ArgList);
+	va_end(ArgList);
+	assert(iError >= 0);
+	}
+
+void TextFile::GetLineX(char szLine[], unsigned uBytes)
+	{	// As GetLine, but end-of-file is fatal.
+	if (uBytes == 0)
+		Quit("GetLineX");
+	bool bEof = GetLine(szLine, uBytes);
+	if (bEof)
+		Quit("end-of-file in GetLineX");
+	}
+
+bool TextFile::GetToken(char szToken[], unsigned uBytes, const char szCharTokens[])
+	{	// Read the next token; returns true when end-of-file was reached.
+// Skip leading white space
+	char c;
+	for (;;)
+		{
+		bool bEof = GetChar(c);
+		if (bEof)
+			return true;
+		if (!isspace((unsigned char) c))	// plain char may be negative: undefined for isspace
+			break;
+		}
+
+// Check for special case single-character tokens
+	if (0 != strchr(szCharTokens, c))
+		{
+		assert(uBytes >= 2);
+		szToken[0] = c;
+		szToken[1] = 0;
+		return false;
+		}
+
+// Loop until token terminated by white space, EOF or special
+	unsigned uBytesCopied = 0;
+	for (;;)
+		{
+		if (uBytesCopied < uBytes - 1)
+			szToken[uBytesCopied++] = c;
+		else
+			Quit("TextFile::GetToken: input buffer too small, line %u",
+			  m_uLineNr);
+		bool bEof = GetChar(c);
+		if (bEof)
+			{
+			szToken[uBytesCopied] = 0;
+			return true;
+			}
+		// Check for special case single-character tokens
+		if (0 != strchr(szCharTokens, c))
+			{
+			PushBack((unsigned char) c);	// keep byte 0xFF distinct from the -1 "empty" marker
+			assert(uBytesCopied > 0 && uBytesCopied < uBytes);
+			szToken[uBytesCopied] = 0;
+			return false;
+			}
+		if (isspace((unsigned char) c))
+			{
+			assert(uBytesCopied > 0 && uBytesCopied < uBytes);
+			szToken[uBytesCopied] = 0;
+			return false;
+			}
+		}
+	}
+
+void TextFile::GetTokenX(char szToken[], unsigned uBytes, const char szCharTokens[])
+	{	// As GetToken, but end-of-file is fatal.
+	bool bEof = GetToken(szToken, uBytes, szCharTokens);
+	if (bEof)
+		Quit("End-of-file in GetTokenX");
+	}
+
+void TextFile::Skip()
+	{	// Consume the rest of the current line, which is expected to be white space only.
+	for (;;)
+		{
+		char c;
+		bool bEof = GetChar(c);
+		if (bEof || '\n' == c)
+			return;
+		assert(isspace((unsigned char) c));
+		}
+	}
+
+#ifdef _WIN32
+
+TEXTFILEPOS TextFile::GetPos()
+	{
+	fpos_t p;
+	int i = fgetpos(m_ptrFile, &p);
+	assert(0 == i);
+	assert(p >= 0);
+	TEXTFILEPOS Pos;
+	Pos.uOffset = (unsigned) p;
+	Pos.uLineNr = m_uLineNr;
+	Pos.uColNr = m_uColNr;
+	return Pos;
+	}
+
+void TextFile::SetPos(TEXTFILEPOS Pos)
+	{
+	fpos_t p = (fpos_t) Pos.uOffset;
+	int i = fsetpos(m_ptrFile, &p);
+	assert(0 == i);
+	m_uLineNr = Pos.uLineNr;
+	m_uColNr = Pos.uColNr;
+	}
+
+#else
+
+TEXTFILEPOS TextFile::GetPos()
+	{
+	TEXTFILEPOS Pos;
+	Pos.uOffset = ftell(m_ptrFile);
+	Pos.uLineNr = m_uLineNr;
+	Pos.uColNr = m_uColNr;
+	return Pos;
+	}
+
+void TextFile::SetPos(TEXTFILEPOS Pos)
+	{
+	fseek(m_ptrFile, Pos.uOffset, SEEK_SET);
+	m_uLineNr = Pos.uLineNr;
+	m_uColNr = Pos.uColNr;
+	}
+
+#endif
+
+bool TextFile::GetChar(char &c)
+	{	// Next char, honouring a pushed-back one first; returns true at end-of-file.
+	if (-1 != m_cPushedBack)
+		{
+		c = (char) m_cPushedBack;
+		m_cPushedBack = -1;
+		return false;
+		}
+
+	int ic = fgetc(m_ptrFile);
+	if (ic < 0)
+		{
+		if (feof(m_ptrFile))
+			{
+			// Hack to fix up a non-empty text file that is missing
+			// an end-of-line character in the last line.
+			if (!m_bLastCharWasEOL && m_uLineNr > 0)
+				{
+				c = '\n';
+				m_bLastCharWasEOL = true;
+				return false;
+				}
+			return true;
+			}
+		Quit("TextFile::GetChar, error %s", strerror(errno));
+		}
+	c = (char) ic;
+	if ('\n' == c)
+		{
+		m_bLastCharWasEOL = true;
+		++m_uLineNr;
+		m_uColNr = 1;
+		}
+	else
+		{
+		m_bLastCharWasEOL = false;
+		++m_uColNr;
+		}
+	return false;
+	}
+
+void TextFile::GetCharX(char &c)
+	{	// As GetChar, but end-of-file is fatal.
+	bool bEof = GetChar(c);
+	if (bEof)
+		Quit("End-of-file in GetCharX");
+	}
+
+void TextFile::GetNonblankChar(char &c)
+	{	// Next char that is not white space; end-of-file is fatal.
+	do
+		{
+		bool bEof = GetChar(c);
+		if (bEof)
+			Quit("End-of-file in GetNonblankChar");
+		}
+	while (isspace((unsigned char) c));
+	}
+
+void TextFile::SkipLine()
+	{	// Advance past the next '\n' unless the last char read already ended a line.
+	if (m_bLastCharWasEOL)
+		return;
+	for (;;)
+		{
+		char c;
+		bool bEof = GetChar(c);
+		if (bEof)
+			Quit("End-of-file in SkipLine");
+		if ('\n' == c)
+			break;
+		}
+	}
+
+void TextFile::SkipWhite()
+	{	// As SkipWhiteX, but end-of-file is fatal.
+	bool bEof = SkipWhiteX();
+	if (bEof)
+		Quit("End-of-file skipping white space");
+	}
+
+bool TextFile::SkipWhiteX()
+	{	// Consume white space; the first other char is pushed back. True at end-of-file.
+	for (;;)
+		{
+		char c;
+		bool bEof = GetChar(c);
+		if (bEof)
+			return true;
+		if (!isspace((unsigned char) c))
+			{
+			PushBack((unsigned char) c);
+			break;
+			}
+		}
+	return false;
+	}
diff --git a/src/muscle/muscle3.8.31/src/textfile.h b/src/muscle/muscle3.8.31/src/textfile.h
new file mode 100644
index 0000000..9d5c508
--- /dev/null
+++ b/src/muscle/muscle3.8.31/src/textfile.h
@@ -0,0 +1,80 @@
+#ifndef TextFile_h
+#define TextFile_h
+
+#include <stdio.h>
+
+// Position snapshot exchanged by TextFile::GetPos / TextFile::SetPos.
+struct TEXTFILEPOS
+	{
+	unsigned uOffset;
+	unsigned uLineNr;
+	unsigned uColNr;
+	};
+
+const unsigned TextFileBufferSize = 256;
+
+// Thin wrapper over a stdio FILE that tracks line and column numbers and
+// supports a one-character push-back. The file name "-" stands for
+// stdin / stdout.
+class TextFile
+	{
+private:
+// no default c'tor, not implemented
+	TextFile();
+
+public:
+	virtual ~TextFile();
+
+	TextFile(const char szFileName[], bool bWrite = false);
+	TextFile(FILE *ptrFile, const char *ptrFileName = "-");
+	void Close()
+		{
+		// Tolerate a second call: fclose(0) is undefined.
+		if (m_ptrFile != 0)
+			fclose(m_ptrFile);
+		m_ptrFile = 0;
+		}
+
+	bool GetLine(char szLine[], unsigned uBytes);
+	bool GetTrimLine(char szLine[], unsigned uBytes);
+	void GetLineX(char szLine[], unsigned uBytes);
+
+	bool GetToken(char szToken[], unsigned uBytes, const char szCharTokens[] = "{}");
+	void GetTokenX(char szToken[], unsigned uBytes, const char szCharTokens[] = "{}");
+
+	void Skip();
+	void SkipLine();
+	void SkipWhite();
+	bool SkipWhiteX();
+	void Rewind();
+	TEXTFILEPOS GetPos();
+	void SetPos(TEXTFILEPOS Pos);
+	bool GetChar(char &c);
+	void GetCharX(char &c);
+	void GetNonblankChar(char &c);
+
+	unsigned GetLineNr() { return m_uLineNr; }
+
+	void PutString(const char szLine[]);
+	void PutFormat(const char szFormat[], ...);
+	void PutChar(char c);
+
+	const char *GetFileName() { return m_ptrName; }
+
+	void PushBack(int c) { m_cPushedBack = c; }	// -1 means "nothing pushed back"
+
+	FILE *GetStdioFile() const { return m_ptrFile; }
+
+private:
+	void Init(FILE *ptrFile, const char *ptrFileName);
+
+private:
+	FILE *m_ptrFile;
+	unsigned m_uLineNr;
+	unsigned m_uColNr;
+	char *m_ptrName;	// strdup'd in Init, freed in the destructor
+	bool m_bLastCharWasEOL;
+	int m_cPushedBack;
+	};
+
+#endif // TextFile_h
diff --git a/src/muscle/muscle3.8.31/src/threewaywt.cpp b/src/muscle/muscle3.8.31/src/threewaywt.cpp
new file mode 100644
index 0000000..cf4c0ce
--- /dev/null
+++ 
b/src/muscle/muscle3.8.31/src/threewaywt.cpp @@ -0,0 +1,342 @@ +#include "muscle.h" +#include "tree.h" +#include + +#define TRACE 0 + +/*** +Sequence weights derived from a tree using Gotoh's +three-way method. + + Gotoh (1995) CABIOS 11(5), 543-51. + +Each edge e is assigned a weight w(e). + +Consider first a tree with three leaves A,B and C +having branch lengths a, b and c, as follows. + + B + | + b + | + A---a---R---c---C + +The internal node is denoted by R. + +Define: + + S = (ab + ca + ab) + x = bc(a + b)(a + c) + y = a(b + c)FS + +Here F is a tunable normalization factor which is +approximately 1.0. Then the edge weight for AR +is computed as: + + w(AR) = sqrt(x/y) + +Similar expressions for the other edges follow by +symmetry. + +For a tree with more than three edges, the weight +of an edge that ends in a leaf is computed from +the three-way tree that includes the edge and +its two neighbors. The weight of an internal edge +is computed as the product of the weights for that +edge derived from the two three-way subtrees that +include that edge. + +For example, consider the following tree. + + B + | + A--R--V--C + | + D + +Here, w(RV) is computed as the product of the +two values for w(RV) derived from the three-way +trees with leaves ABV and RCD respectively. + +The calculation is done using "Gotoh lengths", +not the real edge lengths. + +The Gotoh length G of a directed edge is calculated +recursively as: + + G = d + LR/(L + R) + +where d is the length of the edge, and L and R are +the Gotoh lengths of the left and right edges adjoining +the terminal end of the edge. If the edge terminates on +a leaf, then G=d. + +Pairwise sequence weights are computed as the +product of edge weights on the path that connects +their leaves. + +If the tree is split into two subtrees by deleting +a given edge e, then the pairwise weights factorize. 
+For operations on profiles formed from the two +subtrees, it is possible to assign a weight to a +sequence as the product of edge weights on a path +from e to its leaf. +***/ + +// The xxxUnrooted functions present a rooted tree as +// if it had been unrooted by deleting the root node. +static unsigned GetFirstNeighborUnrooted(const Tree &tree, unsigned uNode1, + unsigned uNode2) + { + if (tree.IsRoot(uNode1) || tree.IsRoot(uNode2)) + Quit("GetFirstNeighborUnrooted, should never be called with root"); + if (!tree.IsEdge(uNode1, uNode2)) + { + if (!tree.IsRoot(tree.GetParent(uNode1)) || + !tree.IsRoot(tree.GetParent(uNode2))) + Quit("GetFirstNeighborUnrooted, not edge"); + const unsigned uRoot = tree.GetRootNodeIndex(); + return tree.GetFirstNeighbor(uNode1, uRoot); + } + + unsigned uNeighbor = tree.GetFirstNeighbor(uNode1, uNode2); + if (tree.IsRoot(uNeighbor)) + return tree.GetFirstNeighbor(uNeighbor, uNode1); + return uNeighbor; + } + +static unsigned GetSecondNeighborUnrooted(const Tree &tree, unsigned uNode1, + unsigned uNode2) + { + if (tree.IsRoot(uNode1) || tree.IsRoot(uNode2)) + Quit("GetFirstNeighborUnrooted, should never be called with root"); + if (!tree.IsEdge(uNode1, uNode2)) + { + if (!tree.IsRoot(tree.GetParent(uNode1)) || + !tree.IsRoot(tree.GetParent(uNode2))) + Quit("GetFirstNeighborUnrooted, not edge"); + const unsigned uRoot = tree.GetRootNodeIndex(); + return tree.GetSecondNeighbor(uNode1, uRoot); + } + + unsigned uNeighbor = tree.GetSecondNeighbor(uNode1, uNode2); + if (tree.IsRoot(uNeighbor)) + return tree.GetFirstNeighbor(uNeighbor, uNode1); + return uNeighbor; + } + +static unsigned GetNeighborUnrooted(const Tree &tree, unsigned uNode1, + unsigned uSub) + { + unsigned uNeighbor = tree.GetNeighbor(uNode1, uSub); + if (tree.IsRoot(uNeighbor)) + return tree.GetFirstNeighbor(uNeighbor, uNode1); + return uNeighbor; + } + +static unsigned GetNeighborSubscriptUnrooted(const Tree &tree, unsigned uNode1, + unsigned uNode2) + { + if 
(tree.IsEdge(uNode1, uNode2)) + return tree.GetNeighborSubscript(uNode1, uNode2); + if (!tree.IsRoot(tree.GetParent(uNode1)) || + !tree.IsRoot(tree.GetParent(uNode2))) + Quit("GetNeighborSubscriptUnrooted, not edge"); + for (unsigned uSub = 0; uSub < 3; ++uSub) + if (GetNeighborUnrooted(tree, uNode1, uSub) == uNode2) + return uSub; + Quit("GetNeighborSubscriptUnrooted, not a neighbor"); + return NULL_NEIGHBOR; + } + +static double GetEdgeLengthUnrooted(const Tree &tree, unsigned uNode1, + unsigned uNode2) + { + if (tree.IsRoot(uNode1) || tree.IsRoot(uNode2)) + Quit("GetEdgeLengthUnrooted, should never be called with root"); + if (!tree.IsEdge(uNode1, uNode2)) + { + if (!tree.IsRoot(tree.GetParent(uNode1)) || + !tree.IsRoot(tree.GetParent(uNode2))) + Quit("GetEdgeLengthUnrooted, not edge"); + + const unsigned uRoot = tree.GetRootNodeIndex(); + return tree.GetEdgeLength(uNode1, uRoot) + + tree.GetEdgeLength(uNode2, uRoot); + } + return tree.GetEdgeLength(uNode1, uNode2); + } + +double GetGotohLength(const Tree &tree, unsigned R, unsigned A) + { + double dThis = GetEdgeLengthUnrooted(tree, R, A); + +// Enforce non-negative edge lengths + if (dThis < 0) + dThis = 0; + + if (tree.IsLeaf(A)) + return dThis; + + const unsigned uFirst = GetFirstNeighborUnrooted(tree, A, R); + const unsigned uSecond = GetSecondNeighborUnrooted(tree, A, R); + const double dFirst = GetGotohLength(tree, A, uFirst); + const double dSecond = GetGotohLength(tree, A, uSecond); + const double dSum = dFirst + dSecond; + const double dThird = dSum == 0 ? 0 : (dFirst*dSecond)/dSum; + return dThis + dThird; + } + +// Return weight of edge A-R in three-way subtree that has +// leaves A,B,C and internal node R. 
+static double GotohWeightThreeWay(const Tree &tree, unsigned A, + unsigned B, unsigned C, unsigned R) + { + const double F = 1.0; + + if (tree.IsLeaf(R)) + Quit("GotohThreeWay: R must be internal node"); + + double a = GetGotohLength(tree, R, A); + double b = GetGotohLength(tree, R, B); + double c = GetGotohLength(tree, R, C); + + double S = b*c + c*a + a*b; + double x = b*c*(a + b)*(a + c); + double y = a*(b + c)*F*S; + +// y is zero iff all three branch lengths are zero. + if (y < 0.001) + return 1.0; + return sqrt(x/y); + } + +static double GotohWeightEdge(const Tree &tree, unsigned uNodeIndex1, + unsigned uNodeIndex2) + { + double w1 = 1.0; + double w2 = 1.0; + if (!tree.IsLeaf(uNodeIndex1)) + { + unsigned R = uNodeIndex1; + unsigned A = uNodeIndex2; + unsigned B = GetFirstNeighborUnrooted(tree, R, A); + unsigned C = GetSecondNeighborUnrooted(tree, R, A); + w1 = GotohWeightThreeWay(tree, A, B, C, R); + } + if (!tree.IsLeaf(uNodeIndex2)) + { + unsigned R = uNodeIndex2; + unsigned A = uNodeIndex1; + unsigned B = GetFirstNeighborUnrooted(tree, R, A); + unsigned C = GetSecondNeighborUnrooted(tree, R, A); + w2 = GotohWeightThreeWay(tree, A, B, C, R); + } + return w1*w2; + } + +void CalcThreeWayEdgeWeights(const Tree &tree, WEIGHT **EdgeWeights) + { + const unsigned uNodeCount = tree.GetNodeCount(); + for (unsigned uNodeIndex1 = 0; uNodeIndex1 < uNodeCount; ++uNodeIndex1) + { + if (tree.IsRoot(uNodeIndex1)) + continue; + for (unsigned uSub1 = 0; uSub1 < 3; ++uSub1) + { + const unsigned uNodeIndex2 = GetNeighborUnrooted(tree, uNodeIndex1, uSub1); + if (NULL_NEIGHBOR == uNodeIndex2) + continue; + + // Avoid computing same edge twice in reversed order + if (uNodeIndex2 < uNodeIndex1) + continue; + + const WEIGHT w = (WEIGHT) GotohWeightEdge(tree, uNodeIndex1, uNodeIndex2); + const unsigned uSub2 = GetNeighborSubscriptUnrooted(tree, uNodeIndex2, uNodeIndex1); +#if DEBUG + { + assert(uNodeIndex2 == GetNeighborUnrooted(tree, uNodeIndex1, uSub1)); + assert(uNodeIndex1 == 
GetNeighborUnrooted(tree, uNodeIndex2, uSub2)); + const WEIGHT wRev = (WEIGHT) GotohWeightEdge(tree, uNodeIndex2, uNodeIndex1); + if (!BTEq(w, wRev)) + Quit("CalcThreeWayWeights: rev check failed %g %g", + w, wRev); + } +#endif + EdgeWeights[uNodeIndex1][uSub1] = w; + EdgeWeights[uNodeIndex2][uSub2] = w; + } + } + } + +static void SetSeqWeights(const Tree &tree, unsigned uNode1, unsigned uNode2, + double dPathWeight, WEIGHT *Weights) + { + if (tree.IsRoot(uNode1) || tree.IsRoot(uNode2)) + Quit("SetSeqWeights, should never be called with root"); + + const double dThisLength = GetEdgeLengthUnrooted(tree, uNode1, uNode2); + if (tree.IsLeaf(uNode2)) + { + const unsigned Id = tree.GetLeafId(uNode2); + Weights[Id] = (WEIGHT) (dPathWeight + dThisLength); + return; + } + const unsigned uFirst = GetFirstNeighborUnrooted(tree, uNode2, uNode1); + const unsigned uSecond = GetSecondNeighborUnrooted(tree, uNode2, uNode1); + dPathWeight *= dThisLength; + SetSeqWeights(tree, uNode2, uFirst, dPathWeight, Weights); + SetSeqWeights(tree, uNode2, uSecond, dPathWeight, Weights); + } + +void CalcThreeWayWeights(const Tree &tree, unsigned uNode1, unsigned uNode2, + WEIGHT *Weights) + { +#if TRACE + Log("CalcThreeWayEdgeWeights\n"); + tree.LogMe(); +#endif + + if (tree.IsRoot(uNode1)) + uNode1 = tree.GetFirstNeighbor(uNode1, uNode2); + else if (tree.IsRoot(uNode2)) + uNode2 = tree.GetFirstNeighbor(uNode2, uNode1); + const unsigned uNodeCount = tree.GetNodeCount(); + WEIGHT **EdgeWeights = new WEIGHT *[uNodeCount]; + for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) + EdgeWeights[uNodeIndex] = new WEIGHT[3]; + + CalcThreeWayEdgeWeights(tree, EdgeWeights); + +#if TRACE + { + Log("Node1 Node2 Length Gotoh EdgeWt\n"); + Log("----- ----- ------ ------ ------\n"); + for (unsigned uNodeIndex1 = 0; uNodeIndex1 < uNodeCount; ++uNodeIndex1) + { + if (tree.IsRoot(uNodeIndex1)) + continue; + for (unsigned uSub1 = 0; uSub1 < 3; ++uSub1) + { + const unsigned uNodeIndex2 = 
GetNeighborUnrooted(tree, uNodeIndex1, uSub1); + if (NULL_NEIGHBOR == uNodeIndex2) + continue; + if (uNodeIndex2 < uNodeIndex1) + continue; + const WEIGHT ew = EdgeWeights[uNodeIndex1][uSub1]; + const double d = GetEdgeLengthUnrooted(tree, uNodeIndex1, uNodeIndex2); + const double g = GetGotohLength(tree, uNodeIndex1, uNodeIndex2); + Log("%5u %5u %6.3f %6.3f %6.3f\n", uNodeIndex1, uNodeIndex2, d, g, ew); + } + } + } +#endif + + SetSeqWeights(tree, uNode1, uNode2, 0.0, Weights); + SetSeqWeights(tree, uNode2, uNode1, 0.0, Weights); + + for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) + delete[] EdgeWeights[uNodeIndex]; + delete[] EdgeWeights; + } diff --git a/src/muscle/muscle3.8.31/src/timing.h b/src/muscle/muscle3.8.31/src/timing.h new file mode 100644 index 0000000..d747bd7 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/timing.h @@ -0,0 +1,24 @@ +#if WIN32 + +typedef unsigned __int64 TICKS; + +#pragma warning(disable:4035) +inline TICKS GetClockTicks() + { + _asm + { + _emit 0x0f + _emit 0x31 + } + } + +#define StartTimer() __int64 t1__ = GetClockTicks() + +#define GetElapsedTicks() (GetClockTicks() - t1__) + +static double TicksToSecs(TICKS t) + { + return (__int64) t/2.5e9; + } + +#endif // WIN32 diff --git a/src/muscle/muscle3.8.31/src/tomhydro.cpp b/src/muscle/muscle3.8.31/src/tomhydro.cpp new file mode 100644 index 0000000..88b664b --- /dev/null +++ b/src/muscle/muscle3.8.31/src/tomhydro.cpp @@ -0,0 +1,109 @@ +#include "muscle.h" +#include "profile.h" + +// Original: +//HYDROPHILIC_CONTEXT 0 6 -0.3969495574 +//HYDROPHILIC_CONTEXT 1 6 -0.9407126603 +//HYDROPHILIC_CONTEXT 2 6 -0.4968150972 +//HYDROPHILIC_CONTEXT 3 6 -0.271646023 +//HYDROPHILIC_CONTEXT 4 6 0.006990406416 +//HYDROPHILIC_CONTEXT 5 6 0.1381111256 +//HYDROPHILIC_CONTEXT 6 6 0.2541439872 + +// Blosum62: +//HYDROPHILIC_CONTEXT 0 6 -0.2448419585 +//HYDROPHILIC_CONTEXT 1 6 -0.8734889946 +//HYDROPHILIC_CONTEXT 2 6 -0.5724336598 +//HYDROPHILIC_CONTEXT 3 6 -0.2670439975 
+//HYDROPHILIC_CONTEXT 4 6 0.004844647323 +//HYDROPHILIC_CONTEXT 5 6 0.1812057148 +//HYDROPHILIC_CONTEXT 6 6 0.1036540864 + +static SCORE Factors[7] = + { + (SCORE) -0.2448419585, + (SCORE) -0.8734889946, + (SCORE) -0.5724336598, + (SCORE) -0.2670439975, + (SCORE) 0.004844647323, + (SCORE) 0.1812057148, + (SCORE) 0.1036540864 + }; + +static bool Hydrophilic[20] = + { + false, // A + false, // C + true, // D + true, // E + false, // F + true, // G + false, // H + false, // I + true, // K + false, // L + false, // M + true, // N + true, // P + true, // Q + true, // R + true, // S + false, // T + false, // V + false, // Y + false, // W + }; + +bool IsHydrophilic(const FCOUNT fcCounts[]) + { + for (unsigned uLetter = 0; uLetter < 20; ++uLetter) + if (fcCounts[uLetter] > 0.0 && Hydrophilic[uLetter]) + return false; + return true; + } + +static double HydrophilicFraction(const FCOUNT fcCounts[]) + { + double TotalAll = 0.0; + double TotalHydrophilic = 0.0; + for (unsigned uLetter = 0; uLetter < 20; ++uLetter) + { + FCOUNT Freq = fcCounts[uLetter]; + TotalAll += Freq; + if (Hydrophilic[uLetter]) + TotalHydrophilic += Freq; + } + return TotalHydrophilic / TotalAll; + } + +void TomHydro(ProfPos *Prof, unsigned uLength) + { + if (ALPHA_Amino != g_Alpha) + return; + if (uLength < 6) + return; + + for (unsigned uColIndex = 3; uColIndex < uLength - 2; ++uColIndex) + { + // 6-residue window: + // xxxxxx + // AARNCARNGTAGCATNAC + // AARN----------TNAC + + double dCount = 0.0; + for (unsigned uColIndexW = uColIndex - 3; uColIndexW < uColIndex + 3; + ++uColIndexW) + { + const ProfPos &PP = Prof[uColIndexW]; + dCount += HydrophilicFraction(PP.m_fcCounts); + } + // Round to nearest integer + unsigned uCount = (unsigned) (dCount + 0.5); + if (uCount > 6) + uCount = 6; + SCORE dFactor = Factors[uCount]; + ProfPos &PP = Prof[uColIndex]; + PP.m_scoreGapOpen += dFactor; + PP.m_scoreGapClose += dFactor; + } + } diff --git a/src/muscle/muscle3.8.31/src/traceback.cpp 
b/src/muscle/muscle3.8.31/src/traceback.cpp new file mode 100644 index 0000000..4d8533b --- /dev/null +++ b/src/muscle/muscle3.8.31/src/traceback.cpp @@ -0,0 +1,208 @@ +#include "muscle.h" +#include "profile.h" +#include "pwpath.h" +#include + +#define TRACE 0 + +#define EQ(a, b) (fabs(a-b) < 0.1) + +SCORE TraceBack(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, const SCORE *DPM_, const SCORE *DPD_, const SCORE *DPI_, + PWPath &Path) + { +#if TRACE + Log("\n"); + Log("TraceBack LengthA=%u LengthB=%u\n", uLengthA, uLengthB); +#endif + assert(uLengthB > 0 && uLengthA > 0); + + const unsigned uPrefixCountA = uLengthA + 1; + const unsigned uPrefixCountB = uLengthB + 1; + + Path.Clear(); + + unsigned uPrefixLengthA = uLengthA; + unsigned uPrefixLengthB = uLengthB; + + const SCORE scoreM = DPM(uPrefixLengthA, uPrefixLengthB); + SCORE scoreD = DPD(uPrefixLengthA, uPrefixLengthB); + SCORE scoreI = DPI(uPrefixLengthA, uPrefixLengthB); + + const ProfPos &LastPPA = PA[uLengthA - 1]; + const ProfPos &LastPPB = PB[uLengthB - 1]; + + scoreD += LastPPA.m_scoreGapClose; + scoreI += LastPPB.m_scoreGapClose; + + char cEdgeType = cInsane; + SCORE scoreMax; + if (scoreM >= scoreD && scoreM >= scoreI) + { + scoreMax = scoreM; + cEdgeType = 'M'; + } + else if (scoreD >= scoreM && scoreD >= scoreI) + { + scoreMax = scoreD; + cEdgeType = 'D'; + } + else + { + assert(scoreI >= scoreM && scoreI >= scoreD); + scoreMax = scoreI; + cEdgeType = 'I'; + } + + for (;;) + { + if ('S' == cEdgeType) + break; + + PWEdge Edge; + Edge.cType = cEdgeType; + Edge.uPrefixLengthA = uPrefixLengthA; + Edge.uPrefixLengthB = uPrefixLengthB; + Path.PrependEdge(Edge); + + char cPrevEdgeType; + unsigned uPrevPrefixLengthA = uPrefixLengthA; + unsigned uPrevPrefixLengthB = uPrefixLengthB; + + switch (cEdgeType) + { + case 'M': + { + assert(uPrefixLengthA > 0); + assert(uPrefixLengthB > 0); + const ProfPos &PPA = PA[uPrefixLengthA - 1]; + const ProfPos &PPB = PB[uPrefixLengthB - 1]; + + 
const SCORE Score = DPM(uPrefixLengthA, uPrefixLengthB); + const SCORE scoreMatch = ScoreProfPos2(PPA, PPB); + + SCORE scoreSM; + if (1 == uPrefixLengthA && 1 == uPrefixLengthB) + scoreSM = scoreMatch; + else + scoreSM = MINUS_INFINITY; + + SCORE scoreMM = MINUS_INFINITY; + SCORE scoreDM = MINUS_INFINITY; + SCORE scoreIM = MINUS_INFINITY; + if (uPrefixLengthA > 1 && uPrefixLengthB > 1) + scoreMM = DPM(uPrefixLengthA-1, uPrefixLengthB-1) + scoreMatch; + if (uPrefixLengthA > 1) + { + SCORE scoreTransDM = PA[uPrefixLengthA-2].m_scoreGapClose; + scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + scoreTransDM + scoreMatch; + } + if (uPrefixLengthB > 1) + { + SCORE scoreTransIM = PB[uPrefixLengthB-2].m_scoreGapClose; + scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + scoreTransIM + scoreMatch; + } + + if (EQ(scoreMM, Score)) + cPrevEdgeType = 'M'; + else if (EQ(scoreDM, Score)) + cPrevEdgeType = 'D'; + else if (EQ(scoreIM, Score)) + cPrevEdgeType = 'I'; + else if (EQ(scoreSM, Score)) + cPrevEdgeType = 'S'; + else + Quit("TraceBack: failed to match M score=%g M=%g D=%g I=%g S=%g", + Score, scoreMM, scoreDM, scoreIM, scoreSM); + + --uPrevPrefixLengthA; + --uPrevPrefixLengthB; + break; + } + + case 'D': + { + assert(uPrefixLengthA > 0); + const SCORE Score = DPD(uPrefixLengthA, uPrefixLengthB); + + SCORE scoreMD = MINUS_INFINITY; + SCORE scoreDD = MINUS_INFINITY; + SCORE scoreSD = MINUS_INFINITY; + if (uPrefixLengthB == 0) + { + if (uPrefixLengthA == 1) + scoreSD = PA[0].m_scoreGapOpen; + else + scoreSD = DPD(uPrefixLengthA - 1, 0); + } + if (uPrefixLengthA > 1) + { + const ProfPos &PPA = PA[uPrefixLengthA - 1]; + SCORE scoreTransMD = PPA.m_scoreGapOpen; + scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) + scoreTransMD; + scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB); + } + + if (EQ(Score, scoreMD)) + cPrevEdgeType = 'M'; + else if (EQ(Score, scoreDD)) + cPrevEdgeType = 'D'; + else if (EQ(Score, scoreSD)) + cPrevEdgeType = 'S'; + else + Quit("TraceBack: failed to match 
D"); + + --uPrevPrefixLengthA; + break; + } + + case 'I': + { + assert(uPrefixLengthB > 0); + const SCORE Score = DPI(uPrefixLengthA, uPrefixLengthB); + + SCORE scoreMI = MINUS_INFINITY; + SCORE scoreII = MINUS_INFINITY; + SCORE scoreSI = MINUS_INFINITY; + if (uPrefixLengthA == 0) + { + if (uPrefixLengthB == 1) + scoreSI = PB[0].m_scoreGapOpen; + else + scoreSI = DPI(0, uPrefixLengthB - 1); + } + if (uPrefixLengthB > 1) + { + const ProfPos &PPB = PB[uPrefixLengthB - 1]; + SCORE scoreTransMI = PPB.m_scoreGapOpen; + scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) + scoreTransMI; + scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1); + } + + if (EQ(Score, scoreMI)) + cPrevEdgeType = 'M'; + else if (EQ(Score, scoreII)) + cPrevEdgeType = 'I'; + else if (EQ(Score, scoreSI)) + cPrevEdgeType = 'S'; + else + Quit("TraceBack: failed to match I"); + + --uPrevPrefixLengthB; + break; + } + + default: + assert(false); + } +#if TRACE + Log("Edge %c%c%u.%u", cPrevEdgeType, cEdgeType, uPrefixLengthA, uPrefixLengthB); + Log("\n"); +#endif + cEdgeType = cPrevEdgeType; + uPrefixLengthA = uPrevPrefixLengthA; + uPrefixLengthB = uPrevPrefixLengthB; + } + + return scoreMax; + } diff --git a/src/muscle/muscle3.8.31/src/tracebackopt.cpp b/src/muscle/muscle3.8.31/src/tracebackopt.cpp new file mode 100644 index 0000000..9df1132 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/tracebackopt.cpp @@ -0,0 +1,73 @@ +#include "muscle.h" +#include "pwpath.h" + +void TraceBackToPath(int **TraceBack, unsigned uLengthA, + unsigned uLengthB, PWPath &Path) + { + Path.Clear(); + + PWEdge Edge; + Edge.uPrefixLengthA = uLengthA; + Edge.uPrefixLengthB = uLengthB; + + for (;;) + { + if (0 == Edge.uPrefixLengthA && 0 == Edge.uPrefixLengthB) + break; + + int iDelta = TraceBack[Edge.uPrefixLengthA][Edge.uPrefixLengthB]; +#if TRACE + Log("TraceBack[%u][%u] = %d\n", + Edge.uPrefixLengthA, Edge.uPrefixLengthB, iDelta); +#endif + if (0 == iDelta) + { + assert(Edge.uPrefixLengthA > 0); + assert(Edge.uPrefixLengthB > 0); + 
+ Edge.cType = 'M'; + Path.PrependEdge(Edge); + --(Edge.uPrefixLengthA); + --(Edge.uPrefixLengthB); + continue; + } + else if (iDelta > 0) + { + Edge.cType = 'D'; + while (iDelta-- > 0) + { + assert(Edge.uPrefixLengthA > 0); + + Path.PrependEdge(Edge); + --(Edge.uPrefixLengthA); + } + } + else if (iDelta < 0) + { + Edge.cType = 'I'; + while (iDelta++ < 0) + { + assert(Edge.uPrefixLengthB > 0); + + Path.PrependEdge(Edge); + --(Edge.uPrefixLengthB); + } + } + + if (0 == Edge.uPrefixLengthA && 0 == Edge.uPrefixLengthB) + break; + + assert(Edge.uPrefixLengthA > 0); + assert(Edge.uPrefixLengthB > 0); + + Edge.cType = 'M'; + Path.PrependEdge(Edge); + --(Edge.uPrefixLengthA); + --(Edge.uPrefixLengthB); + } + +#if TRACE + Log("TraceBackToPath "); + Path.LogMe(); +#endif + } diff --git a/src/muscle/muscle3.8.31/src/tracebacksw.cpp b/src/muscle/muscle3.8.31/src/tracebacksw.cpp new file mode 100644 index 0000000..301ad65 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/tracebacksw.cpp @@ -0,0 +1,186 @@ +#include "muscle.h" +#include "profile.h" +#include "pwpath.h" +#include + +#define TRACE 0 + +#define EQ(a, b) (fabs(a-b) < 0.1) + +void TraceBackSW(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, const SCORE *DPM_, const SCORE *DPD_, const SCORE *DPI_, + unsigned uPrefixLengthAMax, unsigned uPrefixLengthBMax, PWPath &Path) + { +#if TRACE + Log("\n"); + Log("TraceBackSW LengthA=%u LengthB=%u PLAMax=%u PLBMax=%u\n", + uLengthA, uLengthB, uPrefixLengthAMax, uPrefixLengthBMax); +#endif + assert(uLengthB > 0 && uLengthA > 0); + + const unsigned uPrefixCountA = uLengthA + 1; + const unsigned uPrefixCountB = uLengthB + 1; + + Path.Clear(); + + unsigned uPrefixLengthA = uPrefixLengthAMax; + unsigned uPrefixLengthB = uPrefixLengthBMax; + + SCORE scoreMax = DPM(uPrefixLengthA, uPrefixLengthB); + char cEdgeType = 'M'; + + for (;;) + { + if ('S' == cEdgeType) + break; + + PWEdge Edge; + Edge.cType = cEdgeType; + Edge.uPrefixLengthA = uPrefixLengthA; + 
Edge.uPrefixLengthB = uPrefixLengthB; + Path.PrependEdge(Edge); + + char cPrevEdgeType; + unsigned uPrevPrefixLengthA = uPrefixLengthA; + unsigned uPrevPrefixLengthB = uPrefixLengthB; + + switch (cEdgeType) + { + case 'M': + { + assert(uPrefixLengthA > 0); + assert(uPrefixLengthB > 0); + const ProfPos &PPA = PA[uPrefixLengthA - 1]; + const ProfPos &PPB = PB[uPrefixLengthB - 1]; + + const SCORE Score = DPM(uPrefixLengthA, uPrefixLengthB); + const SCORE scoreMatch = ScoreProfPos2(PPA, PPB); + + SCORE scoreSM; + if (1 == uPrefixLengthA && 1 == uPrefixLengthB) + scoreSM = scoreMatch; + else + scoreSM = MINUS_INFINITY; + + SCORE scoreMM = MINUS_INFINITY; + SCORE scoreDM = MINUS_INFINITY; + SCORE scoreIM = MINUS_INFINITY; + if (uPrefixLengthA > 1 && uPrefixLengthB > 1) + { + SCORE scoreTrans = DPM(uPrefixLengthA-1, uPrefixLengthB-1); + scoreMM = scoreTrans + scoreMatch; + } + if (uPrefixLengthA > 1) + { + SCORE scoreTransDM = PA[uPrefixLengthA-2].m_scoreGapClose; + scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + scoreTransDM + scoreMatch; + } + if (uPrefixLengthB > 1) + { + SCORE scoreTransIM = PB[uPrefixLengthB-2].m_scoreGapClose; + scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + scoreTransIM + scoreMatch; + } + + if (EQ(scoreMM, Score)) + cPrevEdgeType = 'M'; + else if (EQ(scoreDM, Score)) + cPrevEdgeType = 'D'; + else if (EQ(scoreIM, Score)) + cPrevEdgeType = 'I'; + else if (EQ(scoreSM, Score)) + cPrevEdgeType = 'S'; + else if (EQ(scoreMatch, Score)) + cPrevEdgeType = 'S'; + else + Quit("TraceBack2: failed to match M score=%g M=%g D=%g I=%g S=%g", + Score, scoreMM, scoreDM, scoreIM, scoreSM); + + --uPrevPrefixLengthA; + --uPrevPrefixLengthB; + break; + } + + case 'D': + { + assert(uPrefixLengthA > 0); + const SCORE Score = DPD(uPrefixLengthA, uPrefixLengthB); + + SCORE scoreMD = MINUS_INFINITY; + SCORE scoreDD = MINUS_INFINITY; + SCORE scoreSD = MINUS_INFINITY; + if (uPrefixLengthB == 0) + { + if (uPrefixLengthA == 1) + scoreSD = PA[0].m_scoreGapOpen; + else + 
scoreSD = DPD(uPrefixLengthA - 1, 0); + } + if (uPrefixLengthA > 1) + { + const ProfPos &PPA = PA[uPrefixLengthA - 1]; + SCORE scoreTransMD = PPA.m_scoreGapOpen; + scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) + scoreTransMD; + scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB); + } + + if (EQ(Score, scoreMD)) + cPrevEdgeType = 'M'; + else if (EQ(Score, scoreDD)) + cPrevEdgeType = 'D'; + else if (EQ(Score, scoreSD)) + cPrevEdgeType = 'S'; + else + Quit("TraceBack2: failed to match D"); + + --uPrevPrefixLengthA; + break; + } + + case 'I': + { + assert(uPrefixLengthB > 0); + const SCORE Score = DPI(uPrefixLengthA, uPrefixLengthB); + + SCORE scoreMI = MINUS_INFINITY; + SCORE scoreII = MINUS_INFINITY; + SCORE scoreSI = MINUS_INFINITY; + if (uPrefixLengthA == 0) + { + if (uPrefixLengthB == 1) + scoreSI = PB[0].m_scoreGapOpen; + else + scoreSI = DPI(0, uPrefixLengthB - 1); + } + if (uPrefixLengthB > 1) + { + const ProfPos &PPB = PB[uPrefixLengthB - 1]; + SCORE scoreTransMI = PPB.m_scoreGapOpen; + scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) + scoreTransMI; + scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1); + } + + if (EQ(Score, scoreMI)) + cPrevEdgeType = 'M'; + else if (EQ(Score, scoreII)) + cPrevEdgeType = 'I'; + else if (EQ(Score, scoreSI)) + cPrevEdgeType = 'S'; + else + Quit("TraceBack2: failed to match I"); + + --uPrevPrefixLengthB; + break; + } + + default: + assert(false); + } +#if TRACE + Log("Edge %c%c%u.%u", cPrevEdgeType, cEdgeType, uPrefixLengthA, uPrefixLengthB); + Log("\n"); +#endif + cEdgeType = cPrevEdgeType; + uPrefixLengthA = uPrevPrefixLengthA; + uPrefixLengthB = uPrevPrefixLengthB; + } + } diff --git a/src/muscle/muscle3.8.31/src/tree.h b/src/muscle/muscle3.8.31/src/tree.h new file mode 100644 index 0000000..0f5e29e --- /dev/null +++ b/src/muscle/muscle3.8.31/src/tree.h @@ -0,0 +1,339 @@ +#ifndef tree_h +#define tree_h + +#include + +class Clust; + +const unsigned NULL_NEIGHBOR = UINT_MAX; + +enum NEWICK_TOKEN_TYPE + { + NTT_Unknown, + +// Returned 
from Tree::GetToken: + NTT_Lparen, + NTT_Rparen, + NTT_Colon, + NTT_Comma, + NTT_Semicolon, + NTT_String, + +// Following are never returned from Tree::GetToken: + NTT_SingleQuotedString, + NTT_DoubleQuotedString, + NTT_Comment + }; + +class Tree + { +public: + Tree() + { + m_uNodeCount = 0; + m_uCacheCount = 0; + m_uNeighbor1 = 0; + m_uNeighbor2 = 0; + m_uNeighbor3 = 0; + m_dEdgeLength1 = 0; + m_dEdgeLength2 = 0; + m_dEdgeLength3 = 0; + m_dHeight = 0; + m_bHasEdgeLength1 = 0; + m_bHasEdgeLength2 = 0; + m_bHasEdgeLength3 = 0; + m_bHasHeight = 0; + m_ptrName = 0; + m_Ids = 0; + } + virtual ~Tree() + { + Clear(); + } + + void Clear() + { + for (unsigned n = 0; n < m_uNodeCount; ++n) + free(m_ptrName[n]); + + m_uNodeCount = 0; + m_uCacheCount = 0; + + delete[] m_uNeighbor1; + delete[] m_uNeighbor2; + delete[] m_uNeighbor3; + delete[] m_dEdgeLength1; + delete[] m_dEdgeLength2; + delete[] m_dEdgeLength3; + delete[] m_bHasEdgeLength1; + delete[] m_bHasEdgeLength2; + delete[] m_bHasEdgeLength3; + delete[] m_ptrName; + delete[] m_Ids; + delete[] m_bHasHeight; + delete[] m_dHeight; + + m_uNeighbor1 = 0; + m_uNeighbor2 = 0; + m_uNeighbor3 = 0; + m_dEdgeLength1 = 0; + m_dEdgeLength2 = 0; + m_dEdgeLength3 = 0; + m_ptrName = 0; + m_Ids = 0; + m_uRootNodeIndex = 0; + m_bHasHeight = 0; + m_dHeight = 0; + + m_bRooted = false; + } + +// Creation and manipulation + void CreateRooted(); + void CreateUnrooted(double dEdgeLength); + + void FromFile(TextFile &File); + void FromClust(Clust &C); + + void Copy(const Tree &tree); + + void Create(unsigned uLeafCount, unsigned uRoot, const unsigned Left[], + const unsigned Right[], const float LeftLength[], const float RightLength[], + const unsigned LeafIds[], char *LeafNames[]); + unsigned AppendBranch(unsigned uExistingNodeIndex); + void SetLeafName(unsigned uNodeIndex, const char *ptrName); + void SetLeafId(unsigned uNodeIndex, unsigned uId); + void SetEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2, + double dLength); + + void 
RootUnrootedTree(unsigned uNodeIndex1, unsigned uNodeIndex2); + void RootUnrootedTree(ROOT Method); + void UnrootByDeletingRoot(); + +// Saving to file + void ToFile(TextFile &File) const; + +// Accessor functions + unsigned GetNodeCount() const + { + return m_uNodeCount; + } + + unsigned GetLeafCount() const + { + if (m_bRooted) + { + assert(m_uNodeCount%2 == 1); + return (m_uNodeCount + 1)/2; + } + else + { + assert(m_uNodeCount%2 == 0); + return (m_uNodeCount + 2)/2; + } + } + + unsigned GetNeighbor(unsigned uNodeIndex, unsigned uNeighborSubscript) const; + + unsigned GetNeighbor1(unsigned uNodeIndex) const + { + assert(uNodeIndex < m_uNodeCount); + return m_uNeighbor1[uNodeIndex]; + } + + unsigned GetNeighbor2(unsigned uNodeIndex) const + { + assert(uNodeIndex < m_uNodeCount); + return m_uNeighbor2[uNodeIndex]; + } + + unsigned GetNeighbor3(unsigned uNodeIndex) const + { + assert(uNodeIndex < m_uNodeCount); + return m_uNeighbor3[uNodeIndex]; + } + + unsigned GetParent(unsigned uNodeIndex) const + { + assert(m_bRooted && uNodeIndex < m_uNodeCount); + return m_uNeighbor1[uNodeIndex]; + } + + bool IsRooted() const + { + return m_bRooted; + } + + unsigned GetLeft(unsigned uNodeIndex) const + { + assert(m_bRooted && uNodeIndex < m_uNodeCount); + return m_uNeighbor2[uNodeIndex]; + } + + unsigned GetRight(unsigned uNodeIndex) const + { + assert(m_bRooted && uNodeIndex < m_uNodeCount); + return m_uNeighbor3[uNodeIndex]; + } + + const char *GetName(unsigned uNodeIndex) const + { + assert(uNodeIndex < m_uNodeCount); + return m_ptrName[uNodeIndex]; + } + + unsigned GetRootNodeIndex() const + { + assert(m_bRooted); + return m_uRootNodeIndex; + } + + unsigned GetNeighborCount(unsigned uNodeIndex) const + { + const unsigned n1 = m_uNeighbor1[uNodeIndex]; + const unsigned n2 = m_uNeighbor2[uNodeIndex]; + const unsigned n3 = m_uNeighbor3[uNodeIndex]; + return (NULL_NEIGHBOR != n1) + (NULL_NEIGHBOR != n2) + (NULL_NEIGHBOR != n3); + } + + bool IsLeaf(unsigned uNodeIndex) const + 
{ + assert(uNodeIndex < m_uNodeCount); + if (1 == m_uNodeCount) + return true; + return 1 == GetNeighborCount(uNodeIndex); + } + + bool IsRoot(unsigned uNodeIndex) const + { + return IsRooted() && m_uRootNodeIndex == uNodeIndex; + } + + unsigned GetLeafId(unsigned uNodeIndex) const; + unsigned GetLeafNodeIndex(const char *ptrName) const; + bool IsEdge(unsigned uNodeIndex1, unsigned uNodeIndex2) const; + bool HasEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2) const; + double GetEdgeLength(unsigned uNodeIndex1, unsigned uNodeIndex2) const; + const char *GetLeafName(unsigned uNodeIndex) const; + unsigned GetNeighborSubscript(unsigned uNodeIndex, unsigned uNeighborIndex) const; + double GetNodeHeight(unsigned uNodeIndex) const; + +// Depth-first traversal + unsigned FirstDepthFirstNode() const; + unsigned NextDepthFirstNode(unsigned uNodeIndex) const; + + unsigned FirstDepthFirstNodeR() const; + unsigned NextDepthFirstNodeR(unsigned uNodeIndex) const; + +// Equivalent of GetLeft/Right in unrooted tree, works in rooted tree too. 
+ unsigned GetFirstNeighbor(unsigned uNodeIndex, unsigned uNeighborIndex) const; + unsigned GetSecondNeighbor(unsigned uNodeIndex, unsigned uNeighborIndex) const; + +// Getting parent node in unrooted tree defined iff leaf + unsigned GetLeafParent(unsigned uNodeIndex) const; + +// Misc + const char *NTTStr(NEWICK_TOKEN_TYPE NTT) const; + void FindCenterByLongestSpan(unsigned *ptrNodeIndex1, + unsigned *ptrNodeIndex2) const; + void PruneTree(const Tree &tree, unsigned Subfams[], + unsigned uSubfamCount); + unsigned LeafIndexToNodeIndex(unsigned uLeafIndex) const; + +// Debugging & trouble-shooting support + void Validate() const; + void ValidateNode(unsigned uNodeIndex) const; + void AssertAreNeighbors(unsigned uNodeIndex1, unsigned uNodeIndex2) const; + void LogMe() const; + +private: + unsigned UnrootFromFile(); + NEWICK_TOKEN_TYPE GetTokenVerbose(TextFile &File, char szToken[], + unsigned uBytes) const + { + NEWICK_TOKEN_TYPE NTT = GetToken(File, szToken, uBytes); + Log("GetToken %10.10s %s\n", NTTStr(NTT), szToken); + return NTT; + } + + void InitCache(unsigned uCacheCount); + void ExpandCache(); + NEWICK_TOKEN_TYPE GetToken(TextFile &File, char szToken[], unsigned uBytes) const; + bool GetGroupFromFile(TextFile &File, unsigned uNodeIndex, double *ptrdEdgeLength); + unsigned GetLeafCountUnrooted(unsigned uNodeIndex1, unsigned uNodeIndex2, + double *ptrdTotalDistance) const; + void ToFileNodeRooted(TextFile &File, unsigned uNodeIndex) const; + void ToFileNodeUnrooted(TextFile &File, unsigned uNodeIndex, unsigned uParent) const; + void OrientParent(unsigned uNodeIndex, unsigned uParentNodeIndex); + double FromClustNode(const Clust &C, unsigned uClustNodeIndex, unsigned uPhyNodeIndex); + unsigned GetAnyNonLeafNode() const; + +// Yuck. Data is made public for the convenience of Tree::Copy. +// There has to be a better way. 
+public: + unsigned m_uNodeCount; + unsigned m_uCacheCount; + unsigned *m_uNeighbor1; + unsigned *m_uNeighbor2; + unsigned *m_uNeighbor3; + double *m_dEdgeLength1; + double *m_dEdgeLength2; + double *m_dEdgeLength3; + double *m_dHeight; + bool *m_bHasEdgeLength1; + bool *m_bHasEdgeLength2; + bool *m_bHasEdgeLength3; + bool *m_bHasHeight; + unsigned *m_Ids; + char **m_ptrName; + bool m_bRooted; + unsigned m_uRootNodeIndex; + }; + +struct PhyEnumEdgeState + { + PhyEnumEdgeState() + { + m_bInit = false; + m_uNodeIndex1 = NULL_NEIGHBOR; + m_uNodeIndex2 = NULL_NEIGHBOR; + } + bool m_bInit; + unsigned m_uNodeIndex1; + unsigned m_uNodeIndex2; + }; + +const unsigned NODE_CHANGED = (unsigned) (~0); + +extern bool PhyEnumBiParts(const Tree &tree, PhyEnumEdgeState &ES, + unsigned Leaves1[], unsigned *ptruCount1, + unsigned Leaves2[], unsigned *ptruCount2); +extern bool PhyEnumBiPartsR(const Tree &tree, PhyEnumEdgeState &ES, + unsigned Leaves1[], unsigned *ptruCount1, + unsigned Leaves2[], unsigned *ptruCount2); +extern void ClusterByHeight(const Tree &tree, double dMaxHeight, unsigned Subtrees[], + unsigned *ptruSubtreeCount); +void ClusterBySubfamCount(const Tree &tree, unsigned uSubfamCount, + unsigned Subfams[], unsigned *ptruSubfamCount); +void GetLeaves(const Tree &tree, unsigned uNodeIndex, unsigned Leaves[], + unsigned *ptruLeafCount); +void GetLeavesExcluding(const Tree &tree, unsigned uNodeIndex, + unsigned uExclude, unsigned Leaves[], unsigned *ptruCount); +void GetInternalNodesInHeightOrder(const Tree &tree, unsigned NodeIndexes[]); +void ApplyMinEdgeLength(Tree &tree, double dMinEdgeLength); +void LeafIndexesToLeafNames(const Tree &tree, const unsigned Leaves[], unsigned uCount, + char *Names[]); +void LeafIndexesToIds(const Tree &tree, const unsigned Leaves[], unsigned uCount, + unsigned Ids[]); +void MSASeqSubset(const MSA &msaIn, char *Names[], unsigned uSeqCount, + MSA &msaOut); +void DiffTrees(const Tree &Tree1, const Tree &Tree2, Tree &Diffs, + unsigned 
IdToDiffsLeafNodeIndex[]); +void DiffTreesE(const Tree &NewTree, const Tree &OldTree, + unsigned NewNodeIndexToOldNodeIndex[]); +void FindRoot(const Tree &tree, unsigned *ptruNode1, unsigned *ptruNode2, + double *ptrdLength1, double *ptrdLength2, + ROOT RootMethod); +void FixRoot(Tree &tree, ROOT RootMethod); + +#endif // tree_h diff --git a/src/muscle/muscle3.8.31/src/treefrommsa.cpp b/src/muscle/muscle3.8.31/src/treefrommsa.cpp new file mode 100644 index 0000000..36d3166 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/treefrommsa.cpp @@ -0,0 +1,97 @@ +#include "muscle.h" +#include "msa.h" +#include "tree.h" +#include "clust.h" +#include "clustsetmsa.h" +#include "distcalc.h" + +static void SaveMSADist(const MSA &msa, MSADist &d, const char *FileName) + { + FILE *f = fopen(FileName, "w"); + if (f == 0) + Quit("Cannot create %s", FileName); + + unsigned n = msa.GetSeqCount(); + for (unsigned i = 0; i < n; ++i) + { + fprintf(f, "%10.10s ", msa.GetSeqName(i)); + for (unsigned j = 0; j < n; ++j) + fprintf(f, " %9g", d.ComputeDist(msa, i, j)); + fprintf(f, "\n"); + } + fclose(f); + } + +static void TreeFromMSA_NJ(const MSA &msa, Tree &tree, CLUSTER Cluster, + DISTANCE Distance, const char *SaveFileName) + { + MSADist MD(Distance); + ClustSetMSA Set(msa, MD); + + if (SaveFileName != 0) + SaveMSADist(msa, MD, SaveFileName); + + Clust C; + C.Create(Set, Cluster); + + tree.FromClust(C); + } + +static void SaveDC(const DistCalcMSA &DC, const char *FileName) + { + FILE *f = fopen(FileName, "w"); + if (f == 0) + Quit("Cannot create %s", FileName); + + unsigned n = DC.GetCount(); + fprintf(f, "%u\n", n); + float *Dist = new float[n]; + for (unsigned i = 0; i < n; ++i) + { + fprintf(f, "%10.10s ", DC.GetName(i)); + DC.CalcDistRange(i, Dist); + for (unsigned j = 0; j < i; ++j) + fprintf(f, " %9g", Dist[j]); + fprintf(f, "\n"); + } + fclose(f); + } + +static void TreeFromMSA_UPGMA(const MSA &msa, Tree &tree, CLUSTER Cluster, + DISTANCE Distance, const char *SaveFileName) + { + 
LINKAGE Linkage = LINKAGE_Undefined; + switch (Cluster) + { + case CLUSTER_UPGMA: + Linkage = LINKAGE_Avg; + break; + case CLUSTER_UPGMAMin: + Linkage = LINKAGE_Min; + break; + case CLUSTER_UPGMAMax: + Linkage = LINKAGE_Max; + break; + case CLUSTER_UPGMB: + Linkage = LINKAGE_Biased; + break; + default: + Quit("TreeFromMSA_UPGMA, CLUSTER_%u not supported", Cluster); + } + + DistCalcMSA DC; + DC.Init(msa, Distance); + if (SaveFileName != 0) + SaveDC(DC, SaveFileName); + UPGMA2(DC, tree, Linkage); + } + +void TreeFromMSA(const MSA &msa, Tree &tree, CLUSTER Cluster, + DISTANCE Distance, ROOT Root, const char *SaveFileName) + { + if (CLUSTER_NeighborJoining == Cluster) + TreeFromMSA_NJ(msa, tree, Cluster, Distance, SaveFileName); + else + TreeFromMSA_UPGMA(msa, tree, Cluster, Distance, SaveFileName); + FixRoot(tree, Root); + } diff --git a/src/muscle/muscle3.8.31/src/types.h b/src/muscle/muscle3.8.31/src/types.h new file mode 100644 index 0000000..b93eb84 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/types.h @@ -0,0 +1,117 @@ +#ifndef types_h +#define types_h + +typedef unsigned char byte; +typedef unsigned short ushort; + +typedef float SCOREMATRIX[32][32]; +typedef SCOREMATRIX *PTR_SCOREMATRIX; + +class MSA; +class Seq; +class ClusterTree; +class DistFunc; +class TextFile; +class PWPath; +class Tree; +class SeqVect; +class DistCalc; + +struct ProgNode; +struct ProfPos; + +#if SINGLE_AFFINE +// Compress M, D and I trace-back matrices into 4 bits +enum + { + BIT_MM = 0x00, + BIT_DM = 0x01, + BIT_IM = 0x02, + BIT_xM = 0x03, + + BIT_DD = 0x00, + BIT_MD = 0x04, + // ID not allowed + BIT_xD = 0x04, + + BIT_II = 0x00, + BIT_MI = 0x08, + // DI not allowed + BIT_xI = 0x08, + }; + +#endif + +#if DOUBLE_AFFINE +// Compress M, D, E, I and J trace-back matrices into 7 bits +enum + { + BIT_MM = 0x00, + BIT_DM = 0x01, + BIT_EM = 0x02, + BIT_IM = 0x03, + BIT_JM = 0x04, + BIT_xM = 0x07, + + BIT_DD = 0x00, + BIT_MD = 0x08, + // [EIJ]D not allowed + BIT_xD = 0x08, + + BIT_EE = 0x00, 
+ BIT_ME = 0x10, + // [DIJ]E not allowed + BIT_xE = 0x10, + + BIT_II = 0x00, + BIT_MI = 0x20, + // [EDJ]I not allowed + BIT_xI = 0x20, + + BIT_JJ = 0x00, + BIT_MJ = 0x40, + // [EDI]J not allowed + BIT_xJ = 0x40, + }; +#endif + +enum EXIT + { + EXIT_Success = 0, + EXIT_NotStarted = 1, + EXIT_FatalError = 2, + EXIT_Except = 3, + }; + +enum NODECMP + { + NODECMP_Undefined = 0, + NODECMP_Same = 0, // equivalent to node in old tree + NODECMP_Diff = 1, // equivalent & parent is changed + NODECMP_Changed = 2 // no equivalent node in old tree + }; + +// Declare enums using macro hacks (see enums.h). +#define s(t) enum t { t##_Undefined = 0, +#define c(t, x) t##_##x, +#define e(t) }; +#include "enums.h" + +// Declare conversion function XXXToStr(XXX x) +// for each enum type XXX. +#define s(t) const char *t##ToStr(t x); +#define c(t, x) /* empty */ +#define e(t) /* empty */ +#include "enums.h" + +// Declare conversion function StrToXXX(const char *Str) +// for each enum type XXX. +#define s(t) t StrTo##t(const char *Str); +#define c(t, x) /* empty */ +#define e(t) /* empty */ +#include "enums.h" + +const char *BoolToStr(bool b); +const char *SecsToStr(unsigned long Secs); + +#endif // types_h diff --git a/src/muscle/muscle3.8.31/src/typetostr.cpp b/src/muscle/muscle3.8.31/src/typetostr.cpp new file mode 100644 index 0000000..2c0afdd --- /dev/null +++ b/src/muscle/muscle3.8.31/src/typetostr.cpp @@ -0,0 +1,58 @@ +#include "muscle.h" +#include + +const char *SecsToStr(unsigned long Secs) + { + static char Str[16]; + long hh, mm, ss; + + hh = Secs/(60*60); + mm = (Secs/60)%60; + ss = Secs%60; + + sprintf(Str, "%02ld:%02ld:%02ld", hh, mm, ss); + return Str; + } + +const char *BoolToStr(bool b) + { + return b ? "True" : "False"; + } + +const char *ScoreToStr(SCORE Score) + { + if (MINUS_INFINITY >= Score) + return " *"; +// Hack to use "circular" buffer so when called multiple +// times in a printf-like argument list it works OK. 
+ const int iBufferCount = 16; + const int iBufferLength = 16; + static char szStr[iBufferCount*iBufferLength]; + static int iBufferIndex = 0; + iBufferIndex = (iBufferIndex + 1)%iBufferCount; + char *pStr = szStr + iBufferIndex*iBufferLength; + sprintf(pStr, "%8g", Score); + return pStr; + } + +// Left-justified version of ScoreToStr +const char *ScoreToStrL(SCORE Score) + { + if (MINUS_INFINITY >= Score) + return "*"; +// Hack to use "circular" buffer so when called multiple +// times in a printf-like argument list it works OK. + const int iBufferCount = 16; + const int iBufferLength = 16; + static char szStr[iBufferCount*iBufferLength]; + static int iBufferIndex = 0; + iBufferIndex = (iBufferIndex + 1)%iBufferCount; + char *pStr = szStr + iBufferIndex*iBufferLength; + sprintf(pStr, "%.3g", Score); + return pStr; + } + +const char *WeightToStr(WEIGHT w) + { + return ScoreToStr(w); + } diff --git a/src/muscle/muscle3.8.31/src/unixio.h b/src/muscle/muscle3.8.31/src/unixio.h new file mode 100644 index 0000000..c2e243f --- /dev/null +++ b/src/muscle/muscle3.8.31/src/unixio.h @@ -0,0 +1,11 @@ +#ifdef WIN32 +#include +#include +#else +#include +#include +#endif + +#if !defined(WIN32) && !defined(O_BINARY) +#define O_BINARY 0 +#endif diff --git a/src/muscle/muscle3.8.31/src/upgma2.cpp b/src/muscle/muscle3.8.31/src/upgma2.cpp new file mode 100644 index 0000000..941b9d2 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/upgma2.cpp @@ -0,0 +1,395 @@ +#include "muscle.h" +#include "tree.h" +#include "distcalc.h" + +// UPGMA clustering in O(N^2) time and space. + +#define TRACE 0 + +#define MIN(x, y) ((x) < (y) ? (x) : (y)) +#define MAX(x, y) ((x) > (y) ? (x) : (y)) +#define AVG(x, y) (((x) + (y))/2) + +static unsigned g_uLeafCount; +static unsigned g_uTriangleSize; +static unsigned g_uInternalNodeCount; +static unsigned g_uInternalNodeIndex; + +// Triangular distance matrix is g_Dist, which is allocated +// as a one-dimensional vector of length g_uTriangleSize. 
+// TriangleSubscript(i,j) maps row,column=i,j to the subscript +// into this vector. +// Row / column coordinates are a bit messy. +// Initially they are leaf indexes 0..N-1. +// But each time we create a new node (=new cluster, new subtree), +// we re-use one of the two rows that become available (the children +// of the new node). This saves memory. +// We keep track of this through the g_uNodeIndex vector. +static dist_t *g_Dist; + +// Distance to nearest neighbor in row i of distance matrix. +// Subscript is distance matrix row. +static dist_t *g_MinDist; + +// Nearest neighbor to row i of distance matrix. +// Subscript is distance matrix row. +static unsigned *g_uNearestNeighbor; + +// Node index of row i in distance matrix. +// Node indexes are 0..N-1 for leaves, N..2N-2 for internal nodes. +// Subscript is distance matrix row. +static unsigned *g_uNodeIndex; + +// The following vectors are defined on internal nodes, +// subscripts are internal node index 0..N-2. +// For g_uLeft/Right, value is the node index 0 .. 2N-2 +// because a child can be internal or leaf. 
+static unsigned *g_uLeft; +static unsigned *g_uRight; +static dist_t *g_Height; +static dist_t *g_LeftLength; +static dist_t *g_RightLength; + +static inline unsigned TriangleSubscript(unsigned uIndex1, unsigned uIndex2) + { +#if DEBUG + if (uIndex1 >= g_uLeafCount || uIndex2 >= g_uLeafCount) + Quit("TriangleSubscript(%u,%u) %u", uIndex1, uIndex2, g_uLeafCount); +#endif + unsigned v; + if (uIndex1 >= uIndex2) + v = uIndex2 + (uIndex1*(uIndex1 - 1))/2; + else + v = uIndex1 + (uIndex2*(uIndex2 - 1))/2; + assert(v < (g_uLeafCount*(g_uLeafCount - 1))/2); + return v; + } + +static void ListState() + { + Log("Dist matrix\n"); + Log(" "); + for (unsigned i = 0; i < g_uLeafCount; ++i) + { + if (uInsane == g_uNodeIndex[i]) + continue; + Log(" %5u", g_uNodeIndex[i]); + } + Log("\n"); + + for (unsigned i = 0; i < g_uLeafCount; ++i) + { + if (uInsane == g_uNodeIndex[i]) + continue; + Log("%5u ", g_uNodeIndex[i]); + for (unsigned j = 0; j < g_uLeafCount; ++j) + { + if (uInsane == g_uNodeIndex[j]) + continue; + if (i == j) + Log(" "); + else + { + unsigned v = TriangleSubscript(i, j); + Log("%5.2g ", g_Dist[v]); + } + } + Log("\n"); + } + + Log("\n"); + Log(" i Node NrNb Dist\n"); + Log("----- ----- ----- --------\n"); + for (unsigned i = 0; i < g_uLeafCount; ++i) + { + if (uInsane == g_uNodeIndex[i]) + continue; + Log("%5u %5u %5u %8.3f\n", + i, + g_uNodeIndex[i], + g_uNearestNeighbor[i], + g_MinDist[i]); + } + + Log("\n"); + Log(" Node L R Height LLength RLength\n"); + Log("----- ----- ----- ------ ------- -------\n"); + for (unsigned i = 0; i <= g_uInternalNodeIndex; ++i) + Log("%5u %5u %5u %6.2g %6.2g %6.2g\n", + i, + g_uLeft[i], + g_uRight[i], + g_Height[i], + g_LeftLength[i], + g_RightLength[i]); + } + +void UPGMA2(const DistCalc &DC, Tree &tree, LINKAGE Linkage) + { + g_uLeafCount = DC.GetCount(); + + g_uTriangleSize = (g_uLeafCount*(g_uLeafCount - 1))/2; + g_uInternalNodeCount = g_uLeafCount - 1; + + g_Dist = new dist_t[g_uTriangleSize]; + + g_uNodeIndex = new 
unsigned[g_uLeafCount]; + g_uNearestNeighbor = new unsigned[g_uLeafCount]; + g_MinDist = new dist_t[g_uLeafCount]; + unsigned *Ids = new unsigned [g_uLeafCount]; + char **Names = new char *[g_uLeafCount]; + + g_uLeft = new unsigned[g_uInternalNodeCount]; + g_uRight = new unsigned[g_uInternalNodeCount]; + g_Height = new dist_t[g_uInternalNodeCount]; + g_LeftLength = new dist_t[g_uInternalNodeCount]; + g_RightLength = new dist_t[g_uInternalNodeCount]; + + for (unsigned i = 0; i < g_uLeafCount; ++i) + { + g_MinDist[i] = BIG_DIST; + g_uNodeIndex[i] = i; + g_uNearestNeighbor[i] = uInsane; + Ids[i] = DC.GetId(i); + Names[i] = strsave(DC.GetName(i)); + } + + for (unsigned i = 0; i < g_uInternalNodeCount; ++i) + { + g_uLeft[i] = uInsane; + g_uRight[i] = uInsane; + g_LeftLength[i] = BIG_DIST; + g_RightLength[i] = BIG_DIST; + g_Height[i] = BIG_DIST; + } + +// Compute initial NxN triangular distance matrix. +// Store minimum distance for each full (not triangular) row. +// Loop from 1, not 0, because "row" is 0, 1 ... i-1, +// so nothing to do when i=0. 
+ for (unsigned i = 1; i < g_uLeafCount; ++i) + { + dist_t *Row = g_Dist + TriangleSubscript(i, 0); + DC.CalcDistRange(i, Row); + for (unsigned j = 0; j < i; ++j) + { + const dist_t d = Row[j]; + if (d < g_MinDist[i]) + { + g_MinDist[i] = d; + g_uNearestNeighbor[i] = j; + } + if (d < g_MinDist[j]) + { + g_MinDist[j] = d; + g_uNearestNeighbor[j] = i; + } + } + } + +#if TRACE + Log("Initial state:\n"); + ListState(); +#endif + + for (g_uInternalNodeIndex = 0; g_uInternalNodeIndex < g_uLeafCount - 1; + ++g_uInternalNodeIndex) + { +#if TRACE + Log("\n"); + Log("Internal node index %5u\n", g_uInternalNodeIndex); + Log("-------------------------\n"); +#endif + + // Find nearest neighbors + unsigned Lmin = uInsane; + unsigned Rmin = uInsane; + dist_t dtMinDist = BIG_DIST; + for (unsigned j = 0; j < g_uLeafCount; ++j) + { + if (uInsane == g_uNodeIndex[j]) + continue; + + dist_t d = g_MinDist[j]; + if (d < dtMinDist) + { + dtMinDist = d; + Lmin = j; + Rmin = g_uNearestNeighbor[j]; + assert(uInsane != Rmin); + assert(uInsane != g_uNodeIndex[Rmin]); + } + } + + assert(Lmin != uInsane); + assert(Rmin != uInsane); + assert(dtMinDist != BIG_DIST); + +#if TRACE + Log("Nearest neighbors Lmin %u[=%u] Rmin %u[=%u] dist %.3g\n", + Lmin, + g_uNodeIndex[Lmin], + Rmin, + g_uNodeIndex[Rmin], + dtMinDist); +#endif + + // Compute distances to new node + // New node overwrites row currently assigned to Lmin + dist_t dtNewMinDist = BIG_DIST; + unsigned uNewNearestNeighbor = uInsane; + for (unsigned j = 0; j < g_uLeafCount; ++j) + { + if (j == Lmin || j == Rmin) + continue; + if (uInsane == g_uNodeIndex[j]) + continue; + + const unsigned vL = TriangleSubscript(Lmin, j); + const unsigned vR = TriangleSubscript(Rmin, j); + const dist_t dL = g_Dist[vL]; + const dist_t dR = g_Dist[vR]; + dist_t dtNewDist; + + switch (Linkage) + { + case LINKAGE_Avg: + dtNewDist = AVG(dL, dR); + break; + + case LINKAGE_Min: + dtNewDist = MIN(dL, dR); + break; + + case LINKAGE_Max: + dtNewDist = MAX(dL, dR); + 
break; + + case LINKAGE_Biased: + dtNewDist = g_dSUEFF*AVG(dL, dR) + (1 - g_dSUEFF)*MIN(dL, dR); + break; + + default: + Quit("UPGMA2: Invalid LINKAGE_%u", Linkage); + } + + // Nasty special case. + // If nearest neighbor of j is Lmin or Rmin, then make the new + // node (which overwrites the row currently occupied by Lmin) + // the nearest neighbor. This situation can occur when there are + // equal distances in the matrix. If we don't make this fix, + // the nearest neighbor pointer for j would become invalid. + // (We don't need to test for == Lmin, because in that case + // the net change needed is zero due to the change in row + // numbering). + if (g_uNearestNeighbor[j] == Rmin) + g_uNearestNeighbor[j] = Lmin; + +#if TRACE + Log("New dist to %u = (%u/%.3g + %u/%.3g)/2 = %.3g\n", + j, Lmin, dL, Rmin, dR, dtNewDist); +#endif + g_Dist[vL] = dtNewDist; + if (dtNewDist < dtNewMinDist) + { + dtNewMinDist = dtNewDist; + uNewNearestNeighbor = j; + } + } + + assert(g_uInternalNodeIndex < g_uLeafCount - 1 || BIG_DIST != dtNewMinDist); + assert(g_uInternalNodeIndex < g_uLeafCount - 1 || uInsane != uNewNearestNeighbor); + + const unsigned v = TriangleSubscript(Lmin, Rmin); + const dist_t dLR = g_Dist[v]; + const dist_t dHeightNew = dLR/2; + const unsigned uLeft = g_uNodeIndex[Lmin]; + const unsigned uRight = g_uNodeIndex[Rmin]; + const dist_t HeightLeft = + uLeft < g_uLeafCount ? 0 : g_Height[uLeft - g_uLeafCount]; + const dist_t HeightRight = + uRight < g_uLeafCount ? 
0 : g_Height[uRight - g_uLeafCount]; + + g_uLeft[g_uInternalNodeIndex] = uLeft; + g_uRight[g_uInternalNodeIndex] = uRight; + g_LeftLength[g_uInternalNodeIndex] = dHeightNew - HeightLeft; + g_RightLength[g_uInternalNodeIndex] = dHeightNew - HeightRight; + g_Height[g_uInternalNodeIndex] = dHeightNew; + + // Row for left child overwritten by row for new node + g_uNodeIndex[Lmin] = g_uLeafCount + g_uInternalNodeIndex; + g_uNearestNeighbor[Lmin] = uNewNearestNeighbor; + g_MinDist[Lmin] = dtNewMinDist; + + // Delete row for right child + g_uNodeIndex[Rmin] = uInsane; + +#if TRACE + Log("\nInternalNodeIndex=%u Lmin=%u Rmin=%u\n", + g_uInternalNodeIndex, Lmin, Rmin); + ListState(); +#endif + } + + unsigned uRoot = g_uLeafCount - 2; + tree.Create(g_uLeafCount, uRoot, g_uLeft, g_uRight, g_LeftLength, g_RightLength, + Ids, Names); + +#if TRACE + tree.LogMe(); +#endif + + delete[] g_Dist; + + delete[] g_uNodeIndex; + delete[] g_uNearestNeighbor; + delete[] g_MinDist; + delete[] g_Height; + + delete[] g_uLeft; + delete[] g_uRight; + delete[] g_LeftLength; + delete[] g_RightLength; + + for (unsigned i = 0; i < g_uLeafCount; ++i) + free(Names[i]); + delete[] Names; + delete[] Ids; + } + +class DistCalcTest : public DistCalc + { + virtual void CalcDistRange(unsigned i, dist_t Dist[]) const + { + static dist_t TestDist[5][5] = + { + 0, 2, 14, 14, 20, + 2, 0, 14, 14, 20, + 14, 14, 0, 4, 20, + 14, 14, 4, 0, 20, + 20, 20, 20, 20, 0, + }; + for (unsigned j = 0; j < i; ++j) + Dist[j] = TestDist[i][j]; + } + virtual unsigned GetCount() const + { + return 5; + } + virtual unsigned GetId(unsigned i) const + { + return i; + } + virtual const char *GetName(unsigned i) const + { + return "name"; + } + }; + +void Test() + { + SetListFileName("c:\\tmp\\lobster.log", false); + DistCalcTest DC; + Tree tree; + UPGMA2(DC, tree, LINKAGE_Avg); + } diff --git a/src/muscle/muscle3.8.31/src/usage.cpp b/src/muscle/muscle3.8.31/src/usage.cpp new file mode 100644 index 0000000..39a49f6 --- /dev/null +++ 
b/src/muscle/muscle3.8.31/src/usage.cpp @@ -0,0 +1,44 @@ +#include "muscle.h" +#include + +void Credits() + { + static bool Displayed = false; + if (Displayed) + return; + + fprintf(stderr, "\n%s\n\n", MUSCLE_LONG_VERSION); + fprintf(stderr, "http://www.drive5.com/muscle\n"); + fprintf(stderr, "This software is donated to the public domain.\n"); + fprintf(stderr, "Please cite: Edgar, R.C. Nucleic Acids Res 32(5), 1792-97.\n\n"); + Displayed = true; + } + +void Usage() + { + Credits(); + fprintf(stderr, +"\n" +"Basic usage\n" +"\n" +" muscle -in -out \n" +"\n" +"Common options (for a complete list please see the User Guide):\n" +"\n" +" -in Input file in FASTA format (default stdin)\n" +" -out Output alignment in FASTA format (default stdout)\n" +" -diags Find diagonals (faster for similar sequences)\n" +" -maxiters Maximum number of iterations (integer, default 16)\n" +" -maxhours Maximum time to iterate in hours (default no limit)\n" +" -html Write output in HTML format (default FASTA)\n" +" -msf Write output in GCG MSF format (default FASTA)\n" +" -clw Write output in CLUSTALW format (default FASTA)\n" +" -clwstrict As -clw, with 'CLUSTAL W (1.81)' header\n" +" -log[a] Log to file (append if -loga, overwrite if -log)\n" +" -quiet Do not write progress messages to stderr\n" +" -version Display version information and exit\n" +"\n" +"Without refinement (very fast, avg accuracy similar to T-Coffee): -maxiters 2\n" +"Fastest possible (amino acids): -maxiters 1 -diags -sv -distance1 kbit20_3\n" +"Fastest possible (nucleotides): -maxiters 1 -diags\n"); + } diff --git a/src/muscle/muscle3.8.31/src/validateids.cpp b/src/muscle/muscle3.8.31/src/validateids.cpp new file mode 100644 index 0000000..b1d9e3d --- /dev/null +++ b/src/muscle/muscle3.8.31/src/validateids.cpp @@ -0,0 +1,112 @@ +#include "muscle.h" +#include "msa.h" +#include "tree.h" +#include "seqvect.h" + +#if DEBUG +static SeqVect *g_ptrMuscleSeqVect = 0; +static MSA MuscleInputMSA; + +void SetMuscleInputMSA(MSA 
&msa) + { + MuscleInputMSA.Copy(msa); + } + +void SetMuscleSeqVect(SeqVect &v) + { + g_ptrMuscleSeqVect = &v; + } + +void ValidateMuscleIdsSeqVect(const MSA &msa) + { + const unsigned uSeqCount = msa.GetSeqCount(); + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + const unsigned uId = msa.GetSeqId(uSeqIndex); + const char *ptrNameMSA = msa.GetSeqName(uSeqIndex); + const char *ptrName = g_ptrMuscleSeqVect->GetSeqName(uId); + if (0 != strcmp(ptrNameMSA, ptrName)) + Quit("ValidateMuscleIdsSeqVect, names don't match"); + } + } + +void ValidateMuscleIdsMSA(const MSA &msa) + { + const unsigned uSeqCount = msa.GetSeqCount(); + for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) + { + const unsigned uId = msa.GetSeqId(uSeqIndex); + const char *ptrNameMSA = msa.GetSeqName(uSeqIndex); + const char *ptrName = MuscleInputMSA.GetSeqName(uId); + if (0 != strcmp(ptrNameMSA, ptrName)) + { + Log("Input MSA:\n"); + MuscleInputMSA.LogMe(); + Log("MSA being tested:\n"); + msa.LogMe(); + Log("Id=%u\n", uId); + Log("Input name=%s\n", ptrName); + Log("Test name=%s\n", ptrNameMSA); + Quit("ValidateMuscleIdsMSA, names don't match"); + } + } + } + +void ValidateMuscleIds(const MSA &msa) + { + if (0 != g_ptrMuscleSeqVect) + ValidateMuscleIdsSeqVect(msa); + else if (0 != MuscleInputMSA.GetSeqCount()) + ValidateMuscleIdsMSA(msa); + else + Quit("ValidateMuscleIds, ptrMuscleSeqVect=0 && 0 == MuscleInputMSA.SeqCount()"); + + } + +void ValidateMuscleIdsSeqVect(const Tree &tree) + { + const unsigned uSeqCount = g_ptrMuscleSeqVect->GetSeqCount(); + const unsigned uNodeCount = tree.GetNodeCount(); + for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) + { + if (!tree.IsLeaf(uNodeIndex)) + continue; + const unsigned uId = tree.GetLeafId(uNodeIndex); + if (uId >= uSeqCount) + { + tree.LogMe(); + Quit("Leaf with node index %u has id=%u, there are %u seqs", + uNodeIndex, uId, uSeqCount); + } + const char *ptrNameTree = tree.GetLeafName(uNodeIndex); + 
const char *ptrName = g_ptrMuscleSeqVect->GetSeqName(uId); + if (0 != strcmp(ptrNameTree, ptrName)) + Quit("ValidateMuscleIds: names don't match"); + } + } + +void ValidateMuscleIdsMSA(const Tree &tree) + { + const unsigned uNodeCount = tree.GetNodeCount(); + for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) + { + if (!tree.IsLeaf(uNodeIndex)) + continue; + const unsigned uId = tree.GetLeafId(uNodeIndex); + const char *ptrNameTree = tree.GetLeafName(uNodeIndex); + const char *ptrName = MuscleInputMSA.GetSeqName(uId); + if (0 != strcmp(ptrNameTree, ptrName)) + Quit("ValidateMuscleIds: names don't match"); + } + } + +void ValidateMuscleIds(const Tree &tree) + { + if (0 != g_ptrMuscleSeqVect) + ValidateMuscleIdsSeqVect(tree); + else if (0 != MuscleInputMSA.GetSeqCount()) + ValidateMuscleIdsMSA(tree); + else + Quit("ValidateMuscleIds, ptrMuscleSeqVect=0 && 0 == MuscleInputMSA.SeqCount"); + } +#endif diff --git a/src/muscle/muscle3.8.31/src/version.txt b/src/muscle/muscle3.8.31/src/version.txt new file mode 100644 index 0000000..bd28b9c --- /dev/null +++ b/src/muscle/muscle3.8.31/src/version.txt @@ -0,0 +1 @@ +3.9 diff --git a/src/muscle/muscle3.8.31/src/vtml2.cpp b/src/muscle/muscle3.8.31/src/vtml2.cpp new file mode 100644 index 0000000..ba8dc67 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/vtml2.cpp @@ -0,0 +1,145 @@ +#include "muscle.h" + +// Note: We use 32x32 arrays rather than 20x20 as this may give the compiler +// optimizer an opportunity to make subscript arithmetic more efficient +// (multiplying by 32 is same as shifting left by 5 bits). 
+ +#define v(x) ((float) x) +#define ROW(A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y) \ + { v(A), v(C), v(D), v(E), v(F), v(G), v(H), v(I), v(K), v(L), v(M), v(N), v(P), v(Q), \ + v(R), v(S), v(T), v(V), v(W), v(Y) }, + + +// A C D E F G H I K L +// M N P Q R S T V W Y +// VTML200 +float VTML_LA[32][32] = + { +ROW( 2.25080, 1.31180, 0.82704, 0.88740, 0.55520, 1.09860, 0.71673, 0.80805, 0.81213, 0.68712, + 0.79105, 0.86777, 0.99328, 0.86644, 0.72821, 1.33924, 1.20373, 1.05956, 0.38107, 0.54373) // A + +ROW( 1.31180,15.79469, 0.39862, 0.42329, 0.49882, 0.65541, 0.67100, 0.97185, 0.46414, 0.55673, + 0.90230, 0.63236, 0.54479, 0.47895, 0.56465, 1.18490, 0.99069, 1.21604, 0.28988, 0.91338) // C + +ROW( 0.82704, 0.39862, 4.18833, 2.06850, 0.25194, 0.90937, 1.01617, 0.32860, 1.03391, 0.31300, + 0.42498, 1.80888, 0.81307, 1.20043, 0.63712, 1.03001, 0.88191, 0.43557, 0.26313, 0.37947) // D + +ROW( 0.88740, 0.42329, 2.06850, 3.08354, 0.33456, 0.77183, 0.94536, 0.43151, 1.35989, 0.45579, + 0.53423, 1.15745, 0.82832, 1.66752, 0.84500, 0.98693, 0.88132, 0.54047, 0.24519, 0.52025) // E + +ROW( 0.55520, 0.49882, 0.25194, 0.33456, 6.08351, 0.30140, 1.02191, 1.10969, 0.37069, 1.50587, + 1.41207, 0.42850, 0.41706, 0.48113, 0.41970, 0.56867, 0.57172, 0.91256, 2.02494, 3.44675) // F + +ROW( 1.09860, 0.65541, 0.90937, 0.77183, 0.30140, 5.62829, 0.64191, 0.28432, 0.67874, 0.30549, + 0.37739, 1.01012, 0.60851, 0.65996, 0.63660, 1.03448, 0.68435, 0.40728, 0.36034, 0.35679) // G + +ROW( 0.71673, 0.67100, 1.01617, 0.94536, 1.02191, 0.64191, 6.05494, 0.50783, 1.03822, 0.60887, + 0.55685, 1.28619, 0.72275, 1.41503, 1.24635, 0.93344, 0.83543, 0.54817, 0.81780, 1.81552) // H + +ROW( 0.80805, 0.97185, 0.32860, 0.43151, 1.10969, 0.28432, 0.50783, 3.03766, 0.49310, 1.88886, + 1.75039, 0.44246, 0.44431, 0.53213, 0.48153, 0.55603, 0.88168, 2.37367, 0.68494, 0.70035) // I + +ROW( 0.81213, 0.46414, 1.03391, 1.35989, 0.37069, 0.67874, 1.03822, 0.49310, 2.72883, 0.52739, + 0.68244, 
1.15671, 0.82911, 1.51333, 2.33521, 0.93858, 0.92730, 0.55467, 0.39944, 0.52549) // K + +ROW( 0.68712, 0.55673, 0.31300, 0.45579, 1.50587, 0.30549, 0.60887, 1.88886, 0.52739, 3.08540, + 2.14480, 0.43539, 0.53630, 0.62771, 0.53025, 0.53468, 0.69924, 1.50372, 0.82822, 0.89854) // L + +ROW( 0.79105, 0.90230, 0.42498, 0.53423, 1.41207, 0.37739, 0.55685, 1.75039, 0.68244, 2.14480, + 4.04057, 0.55603, 0.48415, 0.76770, 0.66775, 0.62409, 0.87759, 1.42742, 0.52278, 0.72067) // M + +ROW( 0.86777, 0.63236, 1.80888, 1.15745, 0.42850, 1.01012, 1.28619, 0.44246, 1.15671, 0.43539, + 0.55603, 3.36000, 0.69602, 1.13490, 0.98603, 1.31366, 1.11252, 0.50603, 0.35810, 0.68349) // N + +ROW( 0.99328, 0.54479, 0.81307, 0.82832, 0.41706, 0.60851, 0.72275, 0.44431, 0.82911, 0.53630, + 0.48415, 0.69602, 7.24709, 0.90276, 0.74827, 1.03719, 0.83014, 0.56795, 0.37867, 0.33127) // P + +ROW( 0.86644, 0.47895, 1.20043, 1.66752, 0.48113, 0.65996, 1.41503, 0.53213, 1.51333, 0.62771, + 0.76770, 1.13490, 0.90276, 2.86937, 1.50116, 0.99561, 0.93103, 0.61085, 0.29926, 0.51971) // Q + +ROW( 0.72821, 0.56465, 0.63712, 0.84500, 0.41970, 0.63660, 1.24635, 0.48153, 2.33521, 0.53025, + 0.66775, 0.98603, 0.74827, 1.50116, 4.28698, 0.84662, 0.80673, 0.51422, 0.47569, 0.59592) // R + +ROW( 1.33924, 1.18490, 1.03001, 0.98693, 0.56867, 1.03448, 0.93344, 0.55603, 0.93858, 0.53468, + 0.62409, 1.31366, 1.03719, 0.99561, 0.84662, 2.13816, 1.52911, 0.67767, 0.45129, 0.66767) // S + +ROW( 1.20373, 0.99069, 0.88191, 0.88132, 0.57172, 0.68435, 0.83543, 0.88168, 0.92730, 0.69924, + 0.87759, 1.11252, 0.83014, 0.93103, 0.80673, 1.52911, 2.58221, 0.98702, 0.31541, 0.57954) // T + +ROW( 1.05956, 1.21604, 0.43557, 0.54047, 0.91256, 0.40728, 0.54817, 2.37367, 0.55467, 1.50372, + 1.42742, 0.50603, 0.56795, 0.61085, 0.51422, 0.67767, 0.98702, 2.65580, 0.43419, 0.63805) // V + +ROW( 0.38107, 0.28988, 0.26313, 0.24519, 2.02494, 0.36034, 0.81780, 0.68494, 0.39944, 0.82822, + 0.52278, 0.35810, 0.37867, 0.29926, 0.47569, 0.45129, 
0.31541, 0.43419,31.39564, 2.51433) // W + +ROW( 0.54373, 0.91338, 0.37947, 0.52025, 3.44675, 0.35679, 1.81552, 0.70035, 0.52549, 0.89854, + 0.72067, 0.68349, 0.33127, 0.51971, 0.59592, 0.66767, 0.57954, 0.63805, 2.51433, 7.50693) // Y + }; + +const float VTML_SP_CENTER = (float) 22.0; + +#undef ROW +#undef v +#define v(x) ((float) (x + VTML_SP_CENTER)) +#define ROW(A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y, X) \ + { v(A), v(C), v(D), v(E), v(F), v(G), v(H), v(I), v(K), v(L), v(M), v(N), v(P), v(Q), \ + v(R), v(S), v(T), v(V), v(W), v(Y), v(X) }, + +// VTML 240 +float VTML_SP[32][32] = + { +// A C D E F G H I K L M N P Q R S T V W Y X +ROW( 58, 23, -12, -7, -44, 10, -23, -14, -14, -27, -17, -8, 1, -9, -22, 23, 15, 5, -74, -45, 0) // A +ROW( 23, 224, -67, -63, -50, -30, -29, 1, -56, -41, -6, -33, -44, -53, -43, 15, 2, 18, -93, -6, 0) // C +ROW( -12, -67, 111, 59,-104, -4, 4, -84, 6, -88, -65, 48, -13, 18, -29, 5, -7, -63,-105, -73, 0) // D +ROW( -7, -63, 59, 85, -83, -17, -1, -63, 25, -60, -47, 15, -12, 40, -8, 1, -7, -47,-108, -51, 0) // E +ROW( -44, -50,-104, -83, 144, -93, 4, 12, -74, 36, 30, -64, -67, -56, -65, -43, -41, -3, 63, 104, 0) // F +ROW( 10, -30, -4, -17, -93, 140, -32, -95, -27, -91, -75, 4, -36, -29, -32, 5, -26, -68, -80, -79, 0) // G +ROW( -23, -29, 4, -1, 4, -32, 137, -50, 6, -37, -42, 21, -23, 27, 19, -4, -12, -44, -13, 48, 0) // H +ROW( -14, 1, -84, -63, 12, -95, -50, 86, -53, 53, 47, -62, -60, -47, -55, -43, -8, 69, -27, -24, 0) // I +ROW( -14, -56, 6, 25, -74, -27, 6, -53, 75, -48, -30, 13, -12, 34, 68, -3, -4, -44, -71, -49, 0) // K +ROW( -27, -41, -88, -60, 36, -91, -37, 53, -48, 88, 62, -63, -48, -36, -48, -47, -25, 36, -11, -4, 0) // L +ROW( -17, -6, -65, -47, 30, -75, -42, 47, -30, 62, 103, -45, -54, -21, -31, -35, -9, 31, -46, -20, 0) // M +ROW( -8, -33, 48, 15, -64, 4, 21, -62, 13, -63, -45, 89, -25, 12, 2, 22, 10, -51, -79, -29, 0) // N +ROW( 1, -44, -13, -12, -67, -36, -23, -60, -12, -48, -54, -25, 160, -6, -20, 5, 
-12, -42, -76, -83, 0) // P +ROW( -9, -53, 18, 40, -56, -29, 27, -47, 34, -36, -21, 12, -6, 75, 34, 1, -4, -37, -92, -48, 0) // Q +ROW( -22, -43, -29, -8, -65, -32, 19, -55, 68, -48, -31, 2, -20, 34, 113, -10, -14, -49, -58, -39, 0) // R +ROW( 23, 15, 5, 1, -43, 5, -4, -43, -3, -47, -35, 22, 5, 1, -10, 53, 32, -28, -62, -31, 0) // S +ROW( 15, 2, -7, -7, -41, -26, -12, -8, -4, -25, -9, 10, -12, -4, -14, 32, 68, 0, -87, -40, 0) // T +ROW( 5, 18, -63, -47, -3, -68, -44, 69, -44, 36, 31, -51, -42, -37, -49, -28, 0, 74, -61, -32, 0) // V +ROW( -74, -93,-105,-108, 63, -80, -13, -27, -71, -11, -46, -79, -76, -92, -58, -62, -87, -61, 289, 81, 0) // W +ROW( -45, -6, -73, -51, 104, -79, 48, -24, -49, -4, -20, -29, -83, -48, -39, -31, -40, -32, 81, 162, 0) // Y +ROW( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) // X + }; + +#undef v +#define v(x) ((float) (x)) +#define RNC(A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y, X) \ + { v(A), v(C), v(D), v(E), v(F), v(G), v(H), v(I), v(K), v(L), v(M), v(N), v(P), v(Q), \ + v(R), v(S), v(T), v(V), v(W), v(Y), v(X) }, + +float VTML_SPNoCenter[32][32] = + { +// A C D E F G H I K L M N P Q R S T V W Y X +RNC( 58, 23, -12, -7, -44, 10, -23, -14, -14, -27, -17, -8, 1, -9, -22, 23, 15, 5, -74, -45, 0) // A +RNC( 23, 224, -67, -63, -50, -30, -29, 1, -56, -41, -6, -33, -44, -53, -43, 15, 2, 18, -93, -6, 0) // C +RNC( -12, -67, 111, 59,-104, -4, 4, -84, 6, -88, -65, 48, -13, 18, -29, 5, -7, -63,-105, -73, 0) // D +RNC( -7, -63, 59, 85, -83, -17, -1, -63, 25, -60, -47, 15, -12, 40, -8, 1, -7, -47,-108, -51, 0) // E +RNC( -44, -50,-104, -83, 144, -93, 4, 12, -74, 36, 30, -64, -67, -56, -65, -43, -41, -3, 63, 104, 0) // F +RNC( 10, -30, -4, -17, -93, 140, -32, -95, -27, -91, -75, 4, -36, -29, -32, 5, -26, -68, -80, -79, 0) // G +RNC( -23, -29, 4, -1, 4, -32, 137, -50, 6, -37, -42, 21, -23, 27, 19, -4, -12, -44, -13, 48, 0) // H +RNC( -14, 1, -84, -63, 12, -95, -50, 86, -53, 53, 47, -62, -60, -47, -55, -43, -8, 69, 
-27, -24, 0) // I +RNC( -14, -56, 6, 25, -74, -27, 6, -53, 75, -48, -30, 13, -12, 34, 68, -3, -4, -44, -71, -49, 0) // K +RNC( -27, -41, -88, -60, 36, -91, -37, 53, -48, 88, 62, -63, -48, -36, -48, -47, -25, 36, -11, -4, 0) // L +RNC( -17, -6, -65, -47, 30, -75, -42, 47, -30, 62, 103, -45, -54, -21, -31, -35, -9, 31, -46, -20, 0) // M +RNC( -8, -33, 48, 15, -64, 4, 21, -62, 13, -63, -45, 89, -25, 12, 2, 22, 10, -51, -79, -29, 0) // N +RNC( 1, -44, -13, -12, -67, -36, -23, -60, -12, -48, -54, -25, 160, -6, -20, 5, -12, -42, -76, -83, 0) // P +RNC( -9, -53, 18, 40, -56, -29, 27, -47, 34, -36, -21, 12, -6, 75, 34, 1, -4, -37, -92, -48, 0) // Q +RNC( -22, -43, -29, -8, -65, -32, 19, -55, 68, -48, -31, 2, -20, 34, 113, -10, -14, -49, -58, -39, 0) // R +RNC( 23, 15, 5, 1, -43, 5, -4, -43, -3, -47, -35, 22, 5, 1, -10, 53, 32, -28, -62, -31, 0) // S +RNC( 15, 2, -7, -7, -41, -26, -12, -8, -4, -25, -9, 10, -12, -4, -14, 32, 68, 0, -87, -40, 0) // T +RNC( 5, 18, -63, -47, -3, -68, -44, 69, -44, 36, 31, -51, -42, -37, -49, -28, 0, 74, -61, -32, 0) // V +RNC( -74, -93,-105,-108, 63, -80, -13, -27, -71, -11, -46, -79, -76, -92, -58, -62, -87, -61, 289, 81, 0) // W +RNC( -45, -6, -73, -51, 104, -79, 48, -24, -49, -4, -20, -29, -83, -48, -39, -31, -40, -32, 81, 162, 0) // Y +RNC( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) // X + }; diff --git a/src/muscle/muscle3.8.31/src/writescorefile.cpp b/src/muscle/muscle3.8.31/src/writescorefile.cpp new file mode 100644 index 0000000..17f7a81 --- /dev/null +++ b/src/muscle/muscle3.8.31/src/writescorefile.cpp @@ -0,0 +1,69 @@ +#include "muscle.h" +#include "msa.h" +#include + +extern float VTML_SP[32][32]; +extern float NUC_SP[32][32]; + +static double GetColScore(const MSA &msa, unsigned uCol) + { + const unsigned uSeqCount = msa.GetSeqCount(); + unsigned uPairCount = 0; + double dSum = 0.0; + for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) + { + if (msa.IsGap(uSeq1, uCol)) + continue; + unsigned uLetter1 = 
msa.GetLetterEx(uSeq1, uCol); + if (uLetter1 >= g_AlphaSize) + continue; + for (unsigned uSeq2 = uSeq1 + 1; uSeq2 < uSeqCount; ++uSeq2) + { + if (msa.IsGap(uSeq2, uCol)) + continue; + unsigned uLetter2 = msa.GetLetterEx(uSeq2, uCol); + if (uLetter2 >= g_AlphaSize) + continue; + double Score; + switch (g_Alpha) + { + case ALPHA_Amino: + Score = VTML_SP[uLetter1][uLetter2]; + break; + case ALPHA_DNA: + case ALPHA_RNA: + Score = NUC_SP[uLetter1][uLetter2]; + break; + default: + Quit("GetColScore: invalid alpha=%d", g_Alpha); + } + dSum += Score; + ++uPairCount; + } + } + if (0 == uPairCount) + return 0; + return dSum / uPairCount; + } + +void WriteScoreFile(const MSA &msa) + { + FILE *f = fopen(g_pstrScoreFileName, "w"); + if (0 == f) + Quit("Cannot open score file '%s' errno=%d", g_pstrScoreFileName, errno); + + const unsigned uColCount = msa.GetColCount(); + const unsigned uSeqCount = msa.GetSeqCount(); + for (unsigned uCol = 0; uCol < uColCount; ++uCol) + { + double Score = GetColScore(msa, uCol); + fprintf(f, "%10.3f ", Score); + for (unsigned uSeq = 0; uSeq < uSeqCount; ++uSeq) + { + char c = msa.GetChar(uSeq, uCol); + fprintf(f, "%c", c); + } + fprintf(f, "\n"); + } + fclose(f); + } diff --git a/src/prokov/Makefile b/src/prokov/Makefile index 730be01..2669b78 100755 --- a/src/prokov/Makefile +++ b/src/prokov/Makefile @@ -25,9 +25,6 @@ include ../../config/targets/help.targ all:: $(MAKE) ACTION=$@ _action - test -d $(PRTDIR) || mkdir $(PRTDIR) - test -d $(BINDIR) || mkdir $(BINDIR) - \cp -f lxpack/ports/$(PORTNAME)/bin/* $(BINDIR) clean:: $(MAKE) -C lxpack portclean diff --git a/src/prokov/lxpack/Makefile b/src/prokov/lxpack/Makefile index 11584cf..baab2d7 100755 --- a/src/prokov/lxpack/Makefile +++ b/src/prokov/lxpack/Makefile @@ -15,14 +15,14 @@ # @end: # --------------------------------------------------------------- # -include ./config/auto.conf +include ../../../config/auto.conf DIRS = src \ tests -include ./config/targets/propagate.targ +include 
../../../config/targets/propagate.targ -include ./config/targets/help.targ +include ../../../config/targets/help.targ portclean:: $(MAKE) ACTION=$@ _action diff --git a/src/prokov/lxpack/config/README.txt b/src/prokov/lxpack/config/README.txt deleted file mode 100755 index dbcb92a..0000000 --- a/src/prokov/lxpack/config/README.txt +++ /dev/null @@ -1,51 +0,0 @@ - -$Id: README.txt 1825 2013-02-26 09:39:47Z viari $ - -This directory contains Makefile machine specific configuration files -(and default targets to help you writing Makefile's) - -These headers should be used with GNU make or compatible - -# -# portname -# - -To check your port, issue : - - ./guess_port - - if output is 'unknown ::' then you should : - - add a port entry in guess_port for :: - - create a ports/.conf configuration file - (the best is to start from another port file, - choose whatever looks closest) - -# -# configuration flags -# - -auto.conf : the main configuration file : - - determine the machine port thru 'guess_port' shell - - include 'default.conf' file - - include the machine specific 'ports/.conf' file - -default.conf : default configuration (included by 'auto.conf') - -ports/.conf : machine specific configuration (included by 'auto.conf') - -# -# utility targets -# - -targets/help.targ : target for standard help - -targets/propagate.targ : target for propagating targets to subdirectories - -targets/package.targ : default targets for standard package with 'configure' - -targets/empty.targ : default empty targets (defined as double colon rules) - -targets/lxbin.targ : default make targets for standard lx binary (without libraries) - -targets/debug.targ : target to print debug information (for dev.) 
- diff --git a/src/prokov/lxpack/config/auto.conf b/src/prokov/lxpack/config/auto.conf deleted file mode 100644 index 265ad82..0000000 --- a/src/prokov/lxpack/config/auto.conf +++ /dev/null @@ -1,54 +0,0 @@ -# -# $Id: auto.conf 1825 2013-02-26 09:39:47Z viari $ -# -# auto.conf -# auto configuration file using guess_port -# -# this file is included in Makefile -# - -# -# default shell for gnu-make -# - -SHELL = /bin/sh - -# -# CFGDIR : location of config files = this file directory location -# -# CFGPRT : port name (as returned by guess_port) -# - -# because builtin 'lastword' is missing in gnu-make 3.80 - -lastword = $(word $(words $1), $1) - -CFGDIR := $(dir $(call lastword, $(MAKEFILE_LIST))) - -CFGPRT := $(shell $(CFGDIR)guess_port) - -# check if port is correctly defined - -ifneq (1, $(words $(CFGPRT))) - entry := $(call lastword, $(CFGPRT)) - $(error port is undefined - add entry for "$(entry)" in configuration file -) -endif - -# -# PORTNAME : port name to use : default is CFGPRT but may be futher modified -# by machine specific configuration - -PORTNAME = $(CFGPRT) - -# -# default configuration -# may be overriden by machine dependant definitions below -# - -include $(CFGDIR)default.conf - -# -# machine dependant definitions -# - -include $(CFGDIR)ports/$(CFGPRT).conf diff --git a/src/prokov/lxpack/config/default.conf b/src/prokov/lxpack/config/default.conf deleted file mode 100644 index 672d1b1..0000000 --- a/src/prokov/lxpack/config/default.conf +++ /dev/null @@ -1,124 +0,0 @@ -# -# $Id: default.conf 2007 2013-12-03 14:21:39Z viari $ -# -# default.conf -# default configuration flags -# maybe further redefined by machine specific configuration -# -# this file is included by auto.conf -# - -# ------------------------------------ -# General compilation flags -# ------------------------------------ - -# -# MACHDEF : define machine and OS specific flags -# - -MACHDEF = - -# -# CC : (ansi C) compiler command to use -# you may add some machine specific flags 
(like -arch ...) -# in the .conf configuration file -# - -CC = gcc - -# -# default compiler optimizer flag -# - -OPTIM = -O - -# -# CC_LIBS : additionnal machine specific $(CC) libraries -# like '-lC' on some machines -# - -CC_LIBS = - -# -# MALLOC_LIBS : machine specific malloc librairies -# like '-lmalloc' on SGI -# - -MALLOC_LIBS = - -# -# MATH_LIBS : machine specific math librairies -# like '-lm' on Solaris -# - -MATH_LIBS = - -# -# LINT : looks like LINT command does not exist anymore -# here is a rough replacement -# - -LINT = gcc -S -Wall -Wno-format-y2k -W -Wstrict-prototypes \ - -Wmissing-prototypes -Wpointer-arith -Wreturn-type \ - -Wcast-qual -Wwrite-strings -Wswitch -Wshadow \ - -Wcast-align -Wbad-function-cast -Wchar-subscripts \ - -Winline -Wnested-externs -Wredundant-decls - -# ------------------------------------ -# General system commands -# ------------------------------------ - -# -# AR : AR archive command -# ARFLAGS : $(AR) archiving flags -# ARXFLAGS : $(AR) extraction flags -# - -AR = ar -ARFLAGS = rcv -ARXFLAGS = xv - -# -# RANLIB : ranlib command -# - -RANLIB = ranlib - -# -# DIFF : diff command -# - -DIFF = diff - -# -# TAR : tar command -# - -TAR = tar - -# ------------------------------------ -# Default locations -# ------------------------------------ -# -# PRTDIR : port dependent files location (libraries and binaries) -# BINDIR : port binaries -# LIBDIR : port libraries -# - -PRTDIR = $(CFGDIR)../ports/$(PORTNAME) - -BINDIR = $(PRTDIR)/bin - -LIBDIR = $(PRTDIR)/lib - -# ------------------------------------ -# default gmake variable in implicit rules -# ------------------------------------ - -CFLAGS = $(OPTIM) $(MACHDEF) -I$(INCDIR) - -LDFLAGS = -L$(LIBDIR) -L. 
- -LDLIBS = $(LIBS) $(MALLOC_LIBS) $(MATH_LIBS) $(CC_LIBS) - -LINTFLAGS = $(MACHDEF) -I$(INCDIR) diff --git a/src/prokov/lxpack/config/guess_port b/src/prokov/lxpack/config/guess_port deleted file mode 100755 index 56e0ae3..0000000 --- a/src/prokov/lxpack/config/guess_port +++ /dev/null @@ -1,33 +0,0 @@ -#! /bin/sh -# -# $Id: guess_port 1825 2013-02-26 09:39:47Z viari $ -# -# @file: guess_port -# @desc: attempt to guess the portname -# @usage: guess_port -# -# @history: -# @+ Nov. 2000 first draft adapted from GNU config.guess -# @+ Feb. 2010 moved to sh -# - -mach=`uname -m` -syst=`uname -s` -rels=`uname -r` - -case ${mach}:${syst}:${rels} in - - alpha:OSF1:* ) echo alpha-osf1;; - sun4*:SunOS:5.* ) echo sparc-solaris;; - i86pc:SunOS:5.* ) echo i386-solaris;; - sun4*:SunOS:* ) echo sparc-sunos;; - Power*:Darwin:* ) echo ppc-darwin;; - i*86:Linux:* ) echo i386-linux;; - x*86*:Linux:* ) echo i386-linux;; - i*86:Darwin:* ) echo i386-darwin;; - IP*:IRIX*:* ) echo mips-irix;; - i*86:MINGW32*:* ) echo x86-mingw32;; - - *) echo unknown ${mach}:${syst}:${rels}; exit 1;; -esac -exit 0 diff --git a/src/prokov/lxpack/config/ports/i386-darwin.conf b/src/prokov/lxpack/config/ports/i386-darwin.conf deleted file mode 100644 index b8a7999..0000000 --- a/src/prokov/lxpack/config/ports/i386-darwin.conf +++ /dev/null @@ -1,26 +0,0 @@ -# -# $Id: i386-darwin.conf 1825 2013-02-26 09:39:47Z viari $ -# -# i386-darwin.conf -# configuration file for MacOS-X/Intel-Based/Darwin 1.2 with gcc compiler -# this file is included in Makefile -# -# system (uname -srp) : Darwin 8.7.1 i386 -# compiler (cc --version) : i686-apple-darwin8-gcc-4.0.1 -# -# check tags -# @uname:uname -srp:Darwin 8.7.1 i386 -# @cc:cc --version:i686-apple-darwin8-gcc-4.0.1 -# -# - -# ------------------------------------ -# General compilation flags -# ------------------------------------ - -# -# MACHDEF : define machine and OS specific flags -# - -MACHDEF = -DLX_TARGET_MACINTEL -DLITTLE_ENDIAN -DMACOSX - diff --git 
a/src/prokov/lxpack/config/ports/i386-linux.conf b/src/prokov/lxpack/config/ports/i386-linux.conf deleted file mode 100755 index d90af22..0000000 --- a/src/prokov/lxpack/config/ports/i386-linux.conf +++ /dev/null @@ -1,32 +0,0 @@ -# -# $Id: i386-linux.conf 1825 2013-02-26 09:39:47Z viari $ -# -# i386-linux.conf -# configuration file for linux ix86 with GNU gcc compiler -# this file is included in Makefile -# -# system (uname -srp) : Linux 2.2.14-5.0 unknown -# compiler (gcc --version) : egcs-2.91.66 -# -# check tags -# @uname:uname -srp:Linux 2.2.14-5.0 unknown -# @cc:cc --version:egcs-2.91.66 -# -# - -# ------------------------------------ -# General compilation flags -# ------------------------------------ - -# -# MACHDEF : define machine and OS specific flags -# - -MACHDEF = -DLX_TARGET_LINUX -DLITTLE_ENDIAN - -# -# MATH_LIBS : machine specific math librairies -# - -MATH_LIBS = -lm - diff --git a/src/prokov/lxpack/config/ports/ppc-darwin.conf b/src/prokov/lxpack/config/ports/ppc-darwin.conf deleted file mode 100755 index 553345b..0000000 --- a/src/prokov/lxpack/config/ports/ppc-darwin.conf +++ /dev/null @@ -1,32 +0,0 @@ -# -# $Id: ppc-darwin.conf 1825 2013-02-26 09:39:47Z viari $ -# -# ppc-darwin.conf -# configuration file for MacOS-X/Darwin 1.2 with native cc compiler -# this file is included in Makefile -# -# system (uname -srp) : Darwin 1.2 powerpc -# compiler (cc --version) : 2.7.2.1 -# -# check tags -# @uname:uname -srp:Darwin 1.2 powerpc -# @cc:cc --version:2.7.2.1 -# -# - -# ------------------------------------ -# General compilation flags -# ------------------------------------ - -# -# MACHDEF : define machine and OS specific flags -# - -MACHDEF = -DLX_TARGET_MACPPC -DBIG_ENDIAN - -# -# CC : name of (ansi C) compiler to use -# - -CC = cc -arch ppc - diff --git a/src/prokov/lxpack/config/ports/sparc-solaris.conf b/src/prokov/lxpack/config/ports/sparc-solaris.conf deleted file mode 100755 index 46ce21a..0000000 --- 
a/src/prokov/lxpack/config/ports/sparc-solaris.conf +++ /dev/null @@ -1,31 +0,0 @@ -# -# $Id: sparc-solaris.conf 1825 2013-02-26 09:39:47Z viari $ -# -# sparc-solaris.conf -# configuration file for sparc solaris with GNU gcc compiler -# this file is included in Makefile -# -# system (uname -srp) : SunOS 5.8 sparc -# compiler (gcc --version) : 2.95.2 -# -# check tags -# @uname:uname -srp:SunOS 5.8 sparc -# @cc:cc --version:2.95.2 -# -# - -# ------------------------------------ -# General compilation flags -# ------------------------------------ - -# -# MACHDEF : define machine and OS specific flags -# - -MACHDEF = -DLX_TARGET_SOLARIS -DBIG_ENDIAN - -# -# MATH_LIBS : machine specific math librairies -# - -MATH_LIBS = -lm diff --git a/src/prokov/lxpack/config/ports/x86-mingw32.conf b/src/prokov/lxpack/config/ports/x86-mingw32.conf deleted file mode 100644 index 0dfea5d..0000000 --- a/src/prokov/lxpack/config/ports/x86-mingw32.conf +++ /dev/null @@ -1,54 +0,0 @@ -# -# $Id: x86-mingw32.conf 1825 2013-02-26 09:39:47Z viari $ -# -# x86-mingw32 -# configuration file for MinGW with GNU gcc compiler. 
-# -# this file is included in Makefile -# -# - -# -# rename PORTNAME safely since MinGW produce pure win32 executables -# without dll's -# - -PORTNAME = x86-win32 - -# ------------------------------------ -# General compilation flags -# ------------------------------------ - -# -# CC_LIBS : additionnal machine specific $(CC) libraries -# -# libiberty is needed for some system extensions (like mkstemps) -# - -CC_LIBS = -liberty - -# -# MACHDEF : define machine and OS specific flags -# -# -DDLMALLOC : use dlmalloc instead of malloc (which does not have mallinfo) -# -posix is a new replacement for several MinGW32 flags, including: -# -D__USE_MINGW_ANSI_STDIO : mingw gcc flag to recognize the C99 "%zu" format -# - -MACHDEF = -posix -DLX_TARGET_WIN32 -DWIN_MINGW -DDLMALLOC -DLITTLE_ENDIAN - -# -# MATH_LIBS : machine specific math librairies -# - -MATH_LIBS = -lm - -# ------------------------------------ -# General system commands -# ------------------------------------ - -# -# DIFF : diff command / should ignore cr on windows -# - -DIFF = diff --strip-trailing-cr diff --git a/src/prokov/lxpack/config/targets/debug.targ b/src/prokov/lxpack/config/targets/debug.targ deleted file mode 100644 index c25e3ca..0000000 --- a/src/prokov/lxpack/config/targets/debug.targ +++ /dev/null @@ -1,25 +0,0 @@ -# -# $Id: help.targ 1825 2013-02-26 09:39:47Z viari $ -# -# debug.targ -# -# target to print debug information (dev. 
only) -# -# it defines the following targets: -# -# debug : -# print debug -# -# it requires auto.conf -# - -.PHONY: debug - -debug:: - @echo "+ PORTNAME: $(PORTNAME)" - @echo "+ CFGPRT: $(CFGPRT)" - @echo "+ CFGDIR: $(CFGDIR)" - @echo "+ PRTDIR: $(PRTDIR)" - @echo "+ MACHDEF: $(MACHDEF)" - - diff --git a/src/prokov/lxpack/config/targets/empty.targ b/src/prokov/lxpack/config/targets/empty.targ deleted file mode 100644 index 9642422..0000000 --- a/src/prokov/lxpack/config/targets/empty.targ +++ /dev/null @@ -1,24 +0,0 @@ -# -# $Id: $ -# -# epty.targ -# -# default empty targets (defined as double colon rules) -# -# - -# -# Rules -# - -.PHONY: all test clean portclean help - -all:: - -test:: - -clean:: - -portclean:: clean - -test:: diff --git a/src/prokov/lxpack/config/targets/help.targ b/src/prokov/lxpack/config/targets/help.targ deleted file mode 100644 index f0128a5..0000000 --- a/src/prokov/lxpack/config/targets/help.targ +++ /dev/null @@ -1,23 +0,0 @@ -# -# $Id: help.targ 1825 2013-02-26 09:39:47Z viari $ -# -# help.targ -# -# default target to print help -# -# it defines the following targets: -# -# help : -# print help -# - -.PHONY: help - -help:: - @ echo "basic usage: make [+]" - @ echo "valid :" - @ echo " all : compile everything for current port [default target]" - @ echo " clean : local cleanup" - @ echo " portclean : cleanup distribution for current port" - @ echo " test : run tests on current port" - @ echo " help : print this help" diff --git a/src/prokov/lxpack/config/targets/lxbin.targ b/src/prokov/lxpack/config/targets/lxbin.targ deleted file mode 100644 index 4e6dbe9..0000000 --- a/src/prokov/lxpack/config/targets/lxbin.targ +++ /dev/null @@ -1,51 +0,0 @@ -# -# $Id: $ -# -# lxbin.targ -# -# default make targets for standard lx binary -# -# you should define the 'PROGS' and 'OSRC' variables -# and optionnaly 'LIBS' if binaries have to be linked with libraries -# -# note: if main source code for binary PROG is PROG.c, there is nothing to do, -# else 
(e.g. if it involves several sources files) you should also add local -# file dependencies. e.g under the form: -# -# mymain: $(OBJ) mymain_base.c mymain_help.c -# $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) $(LDLIBS) -# -# -# 'auto.conf' should have been included -# - -OBJ = $(OSRC:.c=.o) - -INCDIR = ../include - -# -# Rules -# - -.PHONY: all prelib install test clean portclean - -all:: prelib $(PROGS) install - @echo "+++++++++++ binaries $(PROGS) done" - -prelib:: - test -d $(PRTDIR) || mkdir $(PRTDIR) # because some linker may complain - test -d $(LIBDIR) || mkdir $(LIBDIR) # if -L$(LIBDIR) does not exist - -install:: - test -d $(PRTDIR) || mkdir $(PRTDIR) - test -d $(BINDIR) || mkdir $(BINDIR) - -for f in $(PROGS) ; do \cp -f $$f $(BINDIR) ; done - -test:: - -clean:: - -\rm -f *.o cvstatic* *% *.bak so_loc* - -\rm -f $(PROGS) - -portclean:: clean - -(! test -d $(BINDIR)) || (cd $(BINDIR) && \rm -f $(PROGS)) diff --git a/src/prokov/lxpack/config/targets/lxlib.targ b/src/prokov/lxpack/config/targets/lxlib.targ deleted file mode 100644 index 1be65c3..0000000 --- a/src/prokov/lxpack/config/targets/lxlib.targ +++ /dev/null @@ -1,43 +0,0 @@ -# -# $Id: $ -# -# lxlib.targ -# -# default make targets for standard lx library -# -# you should define the 'LOCLIB' and 'OSRC' variables -# -# 'auto.conf' should have been included -# - -OBJ = $(OSRC:.c=.o) - -INCDIR = ../include - -# -# Rules -# - -.PHONY: all lib install test clean portclean - -all:: lib install - @echo "+++++++++++ library $(LOCLIB) done" - -lib:: $(OBJ) - $(AR) $(ARFLAGS) $(LOCLIB) $(OBJ) - $(RANLIB) $(LOCLIB) - -install:: - test -d $(PRTDIR) || mkdir $(PRTDIR) - test -d $(LIBDIR) || mkdir $(LIBDIR) - \cp -f $(LOCLIB) $(LIBDIR) - $(RANLIB) $(LIBDIR)/$(LOCLIB) - -test:: - -clean:: - -\rm -f *.o cvstatic* *% *.bak so_loc* - -\rm -f $(LOCLIB) - -portclean:: clean - -(! 
test -d $(LIBDIR)) || (cd $(LIBDIR) && \rm -f $(LOCLIB)) diff --git a/src/prokov/lxpack/config/targets/package.targ b/src/prokov/lxpack/config/targets/package.targ deleted file mode 100644 index f5918b8..0000000 --- a/src/prokov/lxpack/config/targets/package.targ +++ /dev/null @@ -1,48 +0,0 @@ -# -# $Id: package.targ 1825 2013-02-26 09:39:47Z viari $ -# -# package.targ -# -# default make targets for standard package with configure -# -# you should define the 'PKG' variable -# (and optionaly 'PKGTAR', 'PKGDIR') -# - -PKGTAR ?= $(PKG).tgz - -PKGDIR ?= build.$(PORTNAME) - -PRTPATH = $(abspath $(PRTDIR)) - -# -# Rules -# - -.PHONY: all clean test portclean pkg pkg.expand pkg.make pkg.install - -all:: pkg - -pkg.expand:: - test -d $(PKGDIR) || mkdir $(PKGDIR) - test -f $(PKGDIR)/configure || $(TAR) zxf $(PKGTAR) -C $(PKGDIR) --strip-components 1 - -pkg.make:: pkg.expand - test -f $(PKGDIR)/Makefile || (cd $(PKGDIR) && ./configure --prefix=$(PRTPATH)) - $(MAKE) -C $(PKGDIR) - -pkg.install:: pkg.make - $(MAKE) -C $(PKGDIR) install - -pkg:: pkg.install - @echo "+++++++++++ package $(PKG) done" - -test:: - (! test -d $(PKGDIR)) || $(MAKE) -C $(PKGDIR) test - -clean:: - (! test -d $(PKGDIR)) || $(MAKE) -C $(PKGDIR) clean - -portclean:: - (! test -d $(PKGDIR)) || $(MAKE) -C $(PKGDIR) distclean - (! 
test -d $(PKGDIR)) || \rm -r $(PKGDIR) diff --git a/src/prokov/lxpack/config/targets/propagate.targ b/src/prokov/lxpack/config/targets/propagate.targ deleted file mode 100644 index 2e9df18..0000000 --- a/src/prokov/lxpack/config/targets/propagate.targ +++ /dev/null @@ -1,30 +0,0 @@ -# -# $Id: propagate.targ 1825 2013-02-26 09:39:47Z viari $ -# -# propagate.targ -# -# default make targets for library containers -# -# you should define the 'DIRS' variable -# -# It will propagate 'MAKE ' to all -# directories listed in DIRS -# - -# -# Rules -# - -.PHONY: all _action $(DIRS) - -.DEFAULT: - $(MAKE) ACTION=$@ _action - -all:: - $(MAKE) ACTION=all _action - -_action: $(DIRS) - @echo "$(ACTION) done" - -$(DIRS): - $(MAKE) -C $@ $(ACTION) diff --git a/src/prokov/lxpack/src/Makefile b/src/prokov/lxpack/src/Makefile index f6a1696..0059643 100644 --- a/src/prokov/lxpack/src/Makefile +++ b/src/prokov/lxpack/src/Makefile @@ -14,7 +14,7 @@ # --------------------------------------------------------------- # -include ../config/auto.conf +include ../../../../config/auto.conf PROGS = prokov_learn prokov_curve \ prokov_score prokov_cds \ diff --git a/src/prokov/lxpack/tests/Makefile b/src/prokov/lxpack/tests/Makefile index c6f7224..f8fc16b 100755 --- a/src/prokov/lxpack/tests/Makefile +++ b/src/prokov/lxpack/tests/Makefile @@ -16,7 +16,7 @@ # --------------------------------------------------------------- # -include ../config/targets/empty.targ +include ../../../../config/targets/empty.targ clean:: -\rm -f *.bak diff --git a/src/repseek/Makefile b/src/repseek/Makefile new file mode 100755 index 0000000..6e6269e --- /dev/null +++ b/src/repseek/Makefile @@ -0,0 +1,30 @@ +# --------------------------------------------------------------- +# $Id: $ +# --------------------------------------------------------------- +# @file: Makefile +# @desc: makefile for lxpack +# +# @history: +# @history: +# @+ : Apr 97 : Created +# @+ : Mar 02 : Updated for LXxware +# +# @note: should be processed with 
gnu compatible make +# @note: helixware_compatible +# +# @end: +# --------------------------------------------------------------- +# +include ../../config/auto.conf + +DIRS = repseek-2014.09 + +include ../../config/targets/propagate.targ + +include ../../config/targets/help.targ + +all:: + $(MAKE) ACTION=$@ _action + +clean:: + $(MAKE) -C lxpack portclean diff --git a/src/repseek/repseek-2014.09/Makefile b/src/repseek/repseek-2014.09/Makefile index dc4c9a8..626d26d 100644 --- a/src/repseek/repseek-2014.09/Makefile +++ b/src/repseek/repseek-2014.09/Makefile @@ -10,44 +10,18 @@ # MACHINE = LINUX # MACHINE = OSF -MACHINE = MACOSX +include ../../../config/auto.conf -# ???: MALLOC=-lmalloc -# others: MALLOC = -# macosx debug: MALLOC = -lMallocDebug -MALLOC= - # SGI: PROTO= PROTO=1 # others: PROTO= PROTO=0 PROTO= PROTO=0 -# SGI: RANLIB= touch -# others: RANLIB= ranlib - -RANLIB= ranlib - -# Users can choose: - -#CC= cc -CC= gcc - - -CFLAGS= -O4 -Wall -#CFLAGS= -O2 -#CFLAGS= -pg -#CFLAGS= -g - -INSTALLDIR = $$HOME/bin - ##### defined SHELL = bash -LDFLAGS = -lm $(MALLOC) -#LDFLAGS = -g -lm $(MALLOC) - SRC = sort.c\ help.c\ output.c\ @@ -77,24 +51,18 @@ OBJ = $(SRC:.c=.o) ## Rules -default: - @echo "++ Repeats Search Engines ++" - @echo "edit Makefile and set MACHINE, RANLIB, PROTO, CC, CFLAGS and eventually MALLOC" - @echo " " - @echo "To compile: make repseek" - @echo "To clean: make clean" - +all: repseek install %.o: %.c - $(CC) $(CFLAGS) -D$(MACHINE) -c -o $@ $<; + $(CC) $(CFLAGS) -c -o $@ $<; repseek: $(OBJ) main_repseek.c - $(CC) $(CFLAGS) -D$(MACHINE) -o $@ $(OBJ) main_repseek.c $(LDFLAGS); + $(CC) $(CFLAGS) -o $@ $(OBJ) main_repseek.c $(LDFLAGS); install: repseek - cp repseek $(INSTALLDIR) + cp repseek $(BINDIR) clean: diff --git a/src/sequtils/Makefile b/src/sequtils/Makefile index 730be01..2669b78 100755 --- a/src/sequtils/Makefile +++ b/src/sequtils/Makefile @@ -25,9 +25,6 @@ include ../../config/targets/help.targ all:: $(MAKE) ACTION=$@ _action - test -d 
$(PRTDIR) || mkdir $(PRTDIR) - test -d $(BINDIR) || mkdir $(BINDIR) - \cp -f lxpack/ports/$(PORTNAME)/bin/* $(BINDIR) clean:: $(MAKE) -C lxpack portclean diff --git a/src/sequtils/lxpack/.DS_Store b/src/sequtils/lxpack/.DS_Store index ed42efc..5008ddf 100644 Binary files a/src/sequtils/lxpack/.DS_Store and b/src/sequtils/lxpack/.DS_Store differ diff --git a/src/sequtils/lxpack/Makefile b/src/sequtils/lxpack/Makefile index c87bac8..6bfb540 100755 --- a/src/sequtils/lxpack/Makefile +++ b/src/sequtils/lxpack/Makefile @@ -15,13 +15,13 @@ # @end: # --------------------------------------------------------------- # -include ./config/auto.conf +include ../../../config/auto.conf DIRS = src -include ./config/targets/propagate.targ +include ../../../config/targets/propagate.targ -include ./config/targets/help.targ +include ../../../config/targets/help.targ portclean:: $(MAKE) ACTION=$@ _action diff --git a/src/sequtils/lxpack/config/README.txt b/src/sequtils/lxpack/config/README.txt deleted file mode 100755 index dbcb92a..0000000 --- a/src/sequtils/lxpack/config/README.txt +++ /dev/null @@ -1,51 +0,0 @@ - -$Id: README.txt 1825 2013-02-26 09:39:47Z viari $ - -This directory contains Makefile machine specific configuration files -(and default targets to help you writing Makefile's) - -These headers should be used with GNU make or compatible - -# -# portname -# - -To check your port, issue : - - ./guess_port - - if output is 'unknown ::' then you should : - - add a port entry in guess_port for :: - - create a ports/.conf configuration file - (the best is to start from another port file, - choose whatever looks closest) - -# -# configuration flags -# - -auto.conf : the main configuration file : - - determine the machine port thru 'guess_port' shell - - include 'default.conf' file - - include the machine specific 'ports/.conf' file - -default.conf : default configuration (included by 'auto.conf') - -ports/.conf : machine specific configuration (included by 'auto.conf') - -# -# 
utility targets -# - -targets/help.targ : target for standard help - -targets/propagate.targ : target for propagating targets to subdirectories - -targets/package.targ : default targets for standard package with 'configure' - -targets/empty.targ : default empty targets (defined as double colon rules) - -targets/lxbin.targ : default make targets for standard lx binary (without libraries) - -targets/debug.targ : target to print debug information (for dev.) - diff --git a/src/sequtils/lxpack/config/auto.conf b/src/sequtils/lxpack/config/auto.conf deleted file mode 100644 index 265ad82..0000000 --- a/src/sequtils/lxpack/config/auto.conf +++ /dev/null @@ -1,54 +0,0 @@ -# -# $Id: auto.conf 1825 2013-02-26 09:39:47Z viari $ -# -# auto.conf -# auto configuration file using guess_port -# -# this file is included in Makefile -# - -# -# default shell for gnu-make -# - -SHELL = /bin/sh - -# -# CFGDIR : location of config files = this file directory location -# -# CFGPRT : port name (as returned by guess_port) -# - -# because builtin 'lastword' is missing in gnu-make 3.80 - -lastword = $(word $(words $1), $1) - -CFGDIR := $(dir $(call lastword, $(MAKEFILE_LIST))) - -CFGPRT := $(shell $(CFGDIR)guess_port) - -# check if port is correctly defined - -ifneq (1, $(words $(CFGPRT))) - entry := $(call lastword, $(CFGPRT)) - $(error port is undefined - add entry for "$(entry)" in configuration file -) -endif - -# -# PORTNAME : port name to use : default is CFGPRT but may be futher modified -# by machine specific configuration - -PORTNAME = $(CFGPRT) - -# -# default configuration -# may be overriden by machine dependant definitions below -# - -include $(CFGDIR)default.conf - -# -# machine dependant definitions -# - -include $(CFGDIR)ports/$(CFGPRT).conf diff --git a/src/sequtils/lxpack/config/default.conf b/src/sequtils/lxpack/config/default.conf deleted file mode 100644 index 672d1b1..0000000 --- a/src/sequtils/lxpack/config/default.conf +++ /dev/null @@ -1,124 +0,0 @@ -# -# $Id: 
default.conf 2007 2013-12-03 14:21:39Z viari $ -# -# default.conf -# default configuration flags -# maybe further redefined by machine specific configuration -# -# this file is included by auto.conf -# - -# ------------------------------------ -# General compilation flags -# ------------------------------------ - -# -# MACHDEF : define machine and OS specific flags -# - -MACHDEF = - -# -# CC : (ansi C) compiler command to use -# you may add some machine specific flags (like -arch ...) -# in the .conf configuration file -# - -CC = gcc - -# -# default compiler optimizer flag -# - -OPTIM = -O - -# -# CC_LIBS : additionnal machine specific $(CC) libraries -# like '-lC' on some machines -# - -CC_LIBS = - -# -# MALLOC_LIBS : machine specific malloc librairies -# like '-lmalloc' on SGI -# - -MALLOC_LIBS = - -# -# MATH_LIBS : machine specific math librairies -# like '-lm' on Solaris -# - -MATH_LIBS = - -# -# LINT : looks like LINT command does not exist anymore -# here is a rough replacement -# - -LINT = gcc -S -Wall -Wno-format-y2k -W -Wstrict-prototypes \ - -Wmissing-prototypes -Wpointer-arith -Wreturn-type \ - -Wcast-qual -Wwrite-strings -Wswitch -Wshadow \ - -Wcast-align -Wbad-function-cast -Wchar-subscripts \ - -Winline -Wnested-externs -Wredundant-decls - -# ------------------------------------ -# General system commands -# ------------------------------------ - -# -# AR : AR archive command -# ARFLAGS : $(AR) archiving flags -# ARXFLAGS : $(AR) extraction flags -# - -AR = ar -ARFLAGS = rcv -ARXFLAGS = xv - -# -# RANLIB : ranlib command -# - -RANLIB = ranlib - -# -# DIFF : diff command -# - -DIFF = diff - -# -# TAR : tar command -# - -TAR = tar - -# ------------------------------------ -# Default locations -# ------------------------------------ -# -# PRTDIR : port dependent files location (libraries and binaries) -# BINDIR : port binaries -# LIBDIR : port libraries -# - -PRTDIR = $(CFGDIR)../ports/$(PORTNAME) - -BINDIR = $(PRTDIR)/bin - -LIBDIR = $(PRTDIR)/lib - -# 
------------------------------------ -# default gmake variable in implicit rules -# ------------------------------------ - -CFLAGS = $(OPTIM) $(MACHDEF) -I$(INCDIR) - -LDFLAGS = -L$(LIBDIR) -L. - -LDLIBS = $(LIBS) $(MALLOC_LIBS) $(MATH_LIBS) $(CC_LIBS) - -LINTFLAGS = $(MACHDEF) -I$(INCDIR) diff --git a/src/sequtils/lxpack/config/guess_port b/src/sequtils/lxpack/config/guess_port deleted file mode 100755 index 56e0ae3..0000000 --- a/src/sequtils/lxpack/config/guess_port +++ /dev/null @@ -1,33 +0,0 @@ -#! /bin/sh -# -# $Id: guess_port 1825 2013-02-26 09:39:47Z viari $ -# -# @file: guess_port -# @desc: attempt to guess the portname -# @usage: guess_port -# -# @history: -# @+ Nov. 2000 first draft adapted from GNU config.guess -# @+ Feb. 2010 moved to sh -# - -mach=`uname -m` -syst=`uname -s` -rels=`uname -r` - -case ${mach}:${syst}:${rels} in - - alpha:OSF1:* ) echo alpha-osf1;; - sun4*:SunOS:5.* ) echo sparc-solaris;; - i86pc:SunOS:5.* ) echo i386-solaris;; - sun4*:SunOS:* ) echo sparc-sunos;; - Power*:Darwin:* ) echo ppc-darwin;; - i*86:Linux:* ) echo i386-linux;; - x*86*:Linux:* ) echo i386-linux;; - i*86:Darwin:* ) echo i386-darwin;; - IP*:IRIX*:* ) echo mips-irix;; - i*86:MINGW32*:* ) echo x86-mingw32;; - - *) echo unknown ${mach}:${syst}:${rels}; exit 1;; -esac -exit 0 diff --git a/src/sequtils/lxpack/config/ports/i386-darwin.conf b/src/sequtils/lxpack/config/ports/i386-darwin.conf deleted file mode 100644 index b8a7999..0000000 --- a/src/sequtils/lxpack/config/ports/i386-darwin.conf +++ /dev/null @@ -1,26 +0,0 @@ -# -# $Id: i386-darwin.conf 1825 2013-02-26 09:39:47Z viari $ -# -# i386-darwin.conf -# configuration file for MacOS-X/Intel-Based/Darwin 1.2 with gcc compiler -# this file is included in Makefile -# -# system (uname -srp) : Darwin 8.7.1 i386 -# compiler (cc --version) : i686-apple-darwin8-gcc-4.0.1 -# -# check tags -# @uname:uname -srp:Darwin 8.7.1 i386 -# @cc:cc --version:i686-apple-darwin8-gcc-4.0.1 -# -# - -# ------------------------------------ -# 
General compilation flags -# ------------------------------------ - -# -# MACHDEF : define machine and OS specific flags -# - -MACHDEF = -DLX_TARGET_MACINTEL -DLITTLE_ENDIAN -DMACOSX - diff --git a/src/sequtils/lxpack/config/ports/i386-linux.conf b/src/sequtils/lxpack/config/ports/i386-linux.conf deleted file mode 100755 index d90af22..0000000 --- a/src/sequtils/lxpack/config/ports/i386-linux.conf +++ /dev/null @@ -1,32 +0,0 @@ -# -# $Id: i386-linux.conf 1825 2013-02-26 09:39:47Z viari $ -# -# i386-linux.conf -# configuration file for linux ix86 with GNU gcc compiler -# this file is included in Makefile -# -# system (uname -srp) : Linux 2.2.14-5.0 unknown -# compiler (gcc --version) : egcs-2.91.66 -# -# check tags -# @uname:uname -srp:Linux 2.2.14-5.0 unknown -# @cc:cc --version:egcs-2.91.66 -# -# - -# ------------------------------------ -# General compilation flags -# ------------------------------------ - -# -# MACHDEF : define machine and OS specific flags -# - -MACHDEF = -DLX_TARGET_LINUX -DLITTLE_ENDIAN - -# -# MATH_LIBS : machine specific math librairies -# - -MATH_LIBS = -lm - diff --git a/src/sequtils/lxpack/config/ports/ppc-darwin.conf b/src/sequtils/lxpack/config/ports/ppc-darwin.conf deleted file mode 100755 index 553345b..0000000 --- a/src/sequtils/lxpack/config/ports/ppc-darwin.conf +++ /dev/null @@ -1,32 +0,0 @@ -# -# $Id: ppc-darwin.conf 1825 2013-02-26 09:39:47Z viari $ -# -# ppc-darwin.conf -# configuration file for MacOS-X/Darwin 1.2 with native cc compiler -# this file is included in Makefile -# -# system (uname -srp) : Darwin 1.2 powerpc -# compiler (cc --version) : 2.7.2.1 -# -# check tags -# @uname:uname -srp:Darwin 1.2 powerpc -# @cc:cc --version:2.7.2.1 -# -# - -# ------------------------------------ -# General compilation flags -# ------------------------------------ - -# -# MACHDEF : define machine and OS specific flags -# - -MACHDEF = -DLX_TARGET_MACPPC -DBIG_ENDIAN - -# -# CC : name of (ansi C) compiler to use -# - -CC = cc -arch ppc - 
diff --git a/src/sequtils/lxpack/config/ports/sparc-solaris.conf b/src/sequtils/lxpack/config/ports/sparc-solaris.conf deleted file mode 100755 index 46ce21a..0000000 --- a/src/sequtils/lxpack/config/ports/sparc-solaris.conf +++ /dev/null @@ -1,31 +0,0 @@ -# -# $Id: sparc-solaris.conf 1825 2013-02-26 09:39:47Z viari $ -# -# sparc-solaris.conf -# configuration file for sparc solaris with GNU gcc compiler -# this file is included in Makefile -# -# system (uname -srp) : SunOS 5.8 sparc -# compiler (gcc --version) : 2.95.2 -# -# check tags -# @uname:uname -srp:SunOS 5.8 sparc -# @cc:cc --version:2.95.2 -# -# - -# ------------------------------------ -# General compilation flags -# ------------------------------------ - -# -# MACHDEF : define machine and OS specific flags -# - -MACHDEF = -DLX_TARGET_SOLARIS -DBIG_ENDIAN - -# -# MATH_LIBS : machine specific math librairies -# - -MATH_LIBS = -lm diff --git a/src/sequtils/lxpack/config/ports/x86-mingw32.conf b/src/sequtils/lxpack/config/ports/x86-mingw32.conf deleted file mode 100644 index 0dfea5d..0000000 --- a/src/sequtils/lxpack/config/ports/x86-mingw32.conf +++ /dev/null @@ -1,54 +0,0 @@ -# -# $Id: x86-mingw32.conf 1825 2013-02-26 09:39:47Z viari $ -# -# x86-mingw32 -# configuration file for MinGW with GNU gcc compiler. 
-# -# this file is included in Makefile -# -# - -# -# rename PORTNAME safely since MinGW produce pure win32 executables -# without dll's -# - -PORTNAME = x86-win32 - -# ------------------------------------ -# General compilation flags -# ------------------------------------ - -# -# CC_LIBS : additionnal machine specific $(CC) libraries -# -# libiberty is needed for some system extensions (like mkstemps) -# - -CC_LIBS = -liberty - -# -# MACHDEF : define machine and OS specific flags -# -# -DDLMALLOC : use dlmalloc instead of malloc (which does not have mallinfo) -# -posix is a new replacement for several MinGW32 flags, including: -# -D__USE_MINGW_ANSI_STDIO : mingw gcc flag to recognize the C99 "%zu" format -# - -MACHDEF = -posix -DLX_TARGET_WIN32 -DWIN_MINGW -DDLMALLOC -DLITTLE_ENDIAN - -# -# MATH_LIBS : machine specific math librairies -# - -MATH_LIBS = -lm - -# ------------------------------------ -# General system commands -# ------------------------------------ - -# -# DIFF : diff command / should ignore cr on windows -# - -DIFF = diff --strip-trailing-cr diff --git a/src/sequtils/lxpack/config/targets/debug.targ b/src/sequtils/lxpack/config/targets/debug.targ deleted file mode 100644 index c25e3ca..0000000 --- a/src/sequtils/lxpack/config/targets/debug.targ +++ /dev/null @@ -1,25 +0,0 @@ -# -# $Id: help.targ 1825 2013-02-26 09:39:47Z viari $ -# -# debug.targ -# -# target to print debug information (dev. 
only) -# -# it defines the following targets: -# -# debug : -# print debug -# -# it requires auto.conf -# - -.PHONY: debug - -debug:: - @echo "+ PORTNAME: $(PORTNAME)" - @echo "+ CFGPRT: $(CFGPRT)" - @echo "+ CFGDIR: $(CFGDIR)" - @echo "+ PRTDIR: $(PRTDIR)" - @echo "+ MACHDEF: $(MACHDEF)" - - diff --git a/src/sequtils/lxpack/config/targets/empty.targ b/src/sequtils/lxpack/config/targets/empty.targ deleted file mode 100644 index 9642422..0000000 --- a/src/sequtils/lxpack/config/targets/empty.targ +++ /dev/null @@ -1,24 +0,0 @@ -# -# $Id: $ -# -# epty.targ -# -# default empty targets (defined as double colon rules) -# -# - -# -# Rules -# - -.PHONY: all test clean portclean help - -all:: - -test:: - -clean:: - -portclean:: clean - -test:: diff --git a/src/sequtils/lxpack/config/targets/help.targ b/src/sequtils/lxpack/config/targets/help.targ deleted file mode 100644 index f0128a5..0000000 --- a/src/sequtils/lxpack/config/targets/help.targ +++ /dev/null @@ -1,23 +0,0 @@ -# -# $Id: help.targ 1825 2013-02-26 09:39:47Z viari $ -# -# help.targ -# -# default target to print help -# -# it defines the following targets: -# -# help : -# print help -# - -.PHONY: help - -help:: - @ echo "basic usage: make [+]" - @ echo "valid :" - @ echo " all : compile everything for current port [default target]" - @ echo " clean : local cleanup" - @ echo " portclean : cleanup distribution for current port" - @ echo " test : run tests on current port" - @ echo " help : print this help" diff --git a/src/sequtils/lxpack/config/targets/lxbin.targ b/src/sequtils/lxpack/config/targets/lxbin.targ deleted file mode 100644 index 4e6dbe9..0000000 --- a/src/sequtils/lxpack/config/targets/lxbin.targ +++ /dev/null @@ -1,51 +0,0 @@ -# -# $Id: $ -# -# lxbin.targ -# -# default make targets for standard lx binary -# -# you should define the 'PROGS' and 'OSRC' variables -# and optionnaly 'LIBS' if binaries have to be linked with libraries -# -# note: if main source code for binary PROG is PROG.c, there is 
nothing to do, -# else (e.g. if it involves several sources files) you should also add local -# file dependencies. e.g under the form: -# -# mymain: $(OBJ) mymain_base.c mymain_help.c -# $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) $(LDLIBS) -# -# -# 'auto.conf' should have been included -# - -OBJ = $(OSRC:.c=.o) - -INCDIR = ../include - -# -# Rules -# - -.PHONY: all prelib install test clean portclean - -all:: prelib $(PROGS) install - @echo "+++++++++++ binaries $(PROGS) done" - -prelib:: - test -d $(PRTDIR) || mkdir $(PRTDIR) # because some linker may complain - test -d $(LIBDIR) || mkdir $(LIBDIR) # if -L$(LIBDIR) does not exist - -install:: - test -d $(PRTDIR) || mkdir $(PRTDIR) - test -d $(BINDIR) || mkdir $(BINDIR) - -for f in $(PROGS) ; do \cp -f $$f $(BINDIR) ; done - -test:: - -clean:: - -\rm -f *.o cvstatic* *% *.bak so_loc* - -\rm -f $(PROGS) - -portclean:: clean - -(! test -d $(BINDIR)) || (cd $(BINDIR) && \rm -f $(PROGS)) diff --git a/src/sequtils/lxpack/config/targets/lxlib.targ b/src/sequtils/lxpack/config/targets/lxlib.targ deleted file mode 100644 index 1be65c3..0000000 --- a/src/sequtils/lxpack/config/targets/lxlib.targ +++ /dev/null @@ -1,43 +0,0 @@ -# -# $Id: $ -# -# lxlib.targ -# -# default make targets for standard lx library -# -# you should define the 'LOCLIB' and 'OSRC' variables -# -# 'auto.conf' should have been included -# - -OBJ = $(OSRC:.c=.o) - -INCDIR = ../include - -# -# Rules -# - -.PHONY: all lib install test clean portclean - -all:: lib install - @echo "+++++++++++ library $(LOCLIB) done" - -lib:: $(OBJ) - $(AR) $(ARFLAGS) $(LOCLIB) $(OBJ) - $(RANLIB) $(LOCLIB) - -install:: - test -d $(PRTDIR) || mkdir $(PRTDIR) - test -d $(LIBDIR) || mkdir $(LIBDIR) - \cp -f $(LOCLIB) $(LIBDIR) - $(RANLIB) $(LIBDIR)/$(LOCLIB) - -test:: - -clean:: - -\rm -f *.o cvstatic* *% *.bak so_loc* - -\rm -f $(LOCLIB) - -portclean:: clean - -(! 
test -d $(LIBDIR)) || (cd $(LIBDIR) && \rm -f $(LOCLIB)) diff --git a/src/sequtils/lxpack/config/targets/package.targ b/src/sequtils/lxpack/config/targets/package.targ deleted file mode 100644 index f5918b8..0000000 --- a/src/sequtils/lxpack/config/targets/package.targ +++ /dev/null @@ -1,48 +0,0 @@ -# -# $Id: package.targ 1825 2013-02-26 09:39:47Z viari $ -# -# package.targ -# -# default make targets for standard package with configure -# -# you should define the 'PKG' variable -# (and optionaly 'PKGTAR', 'PKGDIR') -# - -PKGTAR ?= $(PKG).tgz - -PKGDIR ?= build.$(PORTNAME) - -PRTPATH = $(abspath $(PRTDIR)) - -# -# Rules -# - -.PHONY: all clean test portclean pkg pkg.expand pkg.make pkg.install - -all:: pkg - -pkg.expand:: - test -d $(PKGDIR) || mkdir $(PKGDIR) - test -f $(PKGDIR)/configure || $(TAR) zxf $(PKGTAR) -C $(PKGDIR) --strip-components 1 - -pkg.make:: pkg.expand - test -f $(PKGDIR)/Makefile || (cd $(PKGDIR) && ./configure --prefix=$(PRTPATH)) - $(MAKE) -C $(PKGDIR) - -pkg.install:: pkg.make - $(MAKE) -C $(PKGDIR) install - -pkg:: pkg.install - @echo "+++++++++++ package $(PKG) done" - -test:: - (! test -d $(PKGDIR)) || $(MAKE) -C $(PKGDIR) test - -clean:: - (! test -d $(PKGDIR)) || $(MAKE) -C $(PKGDIR) clean - -portclean:: - (! test -d $(PKGDIR)) || $(MAKE) -C $(PKGDIR) distclean - (! 
test -d $(PKGDIR)) || \rm -r $(PKGDIR) diff --git a/src/sequtils/lxpack/config/targets/propagate.targ b/src/sequtils/lxpack/config/targets/propagate.targ deleted file mode 100644 index 2e9df18..0000000 --- a/src/sequtils/lxpack/config/targets/propagate.targ +++ /dev/null @@ -1,30 +0,0 @@ -# -# $Id: propagate.targ 1825 2013-02-26 09:39:47Z viari $ -# -# propagate.targ -# -# default make targets for library containers -# -# you should define the 'DIRS' variable -# -# It will propagate 'MAKE ' to all -# directories listed in DIRS -# - -# -# Rules -# - -.PHONY: all _action $(DIRS) - -.DEFAULT: - $(MAKE) ACTION=$@ _action - -all:: - $(MAKE) ACTION=all _action - -_action: $(DIRS) - @echo "$(ACTION) done" - -$(DIRS): - $(MAKE) -C $@ $(ACTION) diff --git a/src/sequtils/lxpack/src/Abisrc/Makefile b/src/sequtils/lxpack/src/Abisrc/Makefile index f665eda..4aa8e23 100644 --- a/src/sequtils/lxpack/src/Abisrc/Makefile +++ b/src/sequtils/lxpack/src/Abisrc/Makefile @@ -17,15 +17,15 @@ # --------------------------------------------------------------- # -include ../../config/auto.conf +include ../../../../../config/auto.conf OSRC = libaabi.c LOCLIB = libaabi.a -include ../../config/targets/lxlib.targ +include ../../../../../config/targets/lxlib.targ -include ../../config/targets/help.targ +include ../../../../../config/targets/help.targ INCDIR = ../../include diff --git a/src/sequtils/lxpack/src/Biosrc/Makefile b/src/sequtils/lxpack/src/Biosrc/Makefile index 0a51135..31ff9be 100644 --- a/src/sequtils/lxpack/src/Biosrc/Makefile +++ b/src/sequtils/lxpack/src/Biosrc/Makefile @@ -17,15 +17,15 @@ # --------------------------------------------------------------- # -include ../../config/auto.conf +include ../../../../../config/auto.conf OSRC = string.c \ bioseq.c LOCLIB = libbio.a -include ../../config/targets/lxlib.targ +include ../../../../../config/targets/lxlib.targ -include ../../config/targets/help.targ +include ../../../../../config/targets/help.targ INCDIR = ../../include diff --git 
a/src/sequtils/lxpack/src/Fastasrc/Makefile b/src/sequtils/lxpack/src/Fastasrc/Makefile index 15fd8b2..8b87b22 100644 --- a/src/sequtils/lxpack/src/Fastasrc/Makefile +++ b/src/sequtils/lxpack/src/Fastasrc/Makefile @@ -17,15 +17,15 @@ # --------------------------------------------------------------- # -include ../../config/auto.conf +include ../../../../../config/auto.conf OSRC = libfasta.c LOCLIB = libfasta.a -include ../../config/targets/lxlib.targ +include ../../../../../config/targets/lxlib.targ -include ../../config/targets/help.targ +include ../../../../../config/targets/help.targ INCDIR = ../../include diff --git a/src/sequtils/lxpack/src/Makefile b/src/sequtils/lxpack/src/Makefile index a1e969c..7931d61 100644 --- a/src/sequtils/lxpack/src/Makefile +++ b/src/sequtils/lxpack/src/Makefile @@ -21,7 +21,7 @@ DIRS = Abisrc \ Fastasrc \ Utilsrc -include ../config/targets/propagate.targ +include ../../../../config/targets/propagate.targ -include ../config/targets/help.targ +include ../../../../config/targets/help.targ diff --git a/src/sequtils/lxpack/src/Utilsrc/Makefile b/src/sequtils/lxpack/src/Utilsrc/Makefile index 676a680..2167c47 100644 --- a/src/sequtils/lxpack/src/Utilsrc/Makefile +++ b/src/sequtils/lxpack/src/Utilsrc/Makefile @@ -16,7 +16,7 @@ # --------------------------------------------------------------- # -include ../../config/auto.conf +include ../../../../../config/auto.conf # # Machine independant flags @@ -38,8 +38,8 @@ PROGS = util_complinv \ OSRC = $(PROGS:=.c) -include ../../config/targets/lxbin.targ -include ../../config/targets/help.targ +include ../../../../../config/targets/lxbin.targ +include ../../../../../config/targets/help.targ INCDIR = ../../include diff --git a/src/sumatra-1.0.10/Licence_CeCILL_V2-en.txt b/src/sumaclust/sumaclust_v1.0.10/Licence_CeCILL_V2-en.txt similarity index 100% rename from src/sumatra-1.0.10/Licence_CeCILL_V2-en.txt rename to src/sumaclust/sumaclust_v1.0.10/Licence_CeCILL_V2-en.txt diff --git 
a/src/sumatra-1.0.10/Licence_CeCILL_V2-fr.txt b/src/sumaclust/sumaclust_v1.0.10/Licence_CeCILL_V2-fr.txt similarity index 100% rename from src/sumatra-1.0.10/Licence_CeCILL_V2-fr.txt rename to src/sumaclust/sumaclust_v1.0.10/Licence_CeCILL_V2-fr.txt diff --git a/src/sumaclust/sumaclust_v1.0.10/Makefile b/src/sumaclust/sumaclust_v1.0.10/Makefile new file mode 100644 index 0000000..74dce9f --- /dev/null +++ b/src/sumaclust/sumaclust_v1.0.10/Makefile @@ -0,0 +1,55 @@ +EXEC=sumaclust + +SUMACLUST_SRC= sumaclust.c \ + mtcompare_sumaclust.c + + +SUMACLUST_OBJ= $(patsubst %.c,%.o,$(SUMACLUST_SRC)) + + +SRCS= $(SUMACLUST_SRC) + +LIB= -lfasta -llcs -lfile -lutils -lm #-ll + + +include ./global.mk + +all: $(EXEC) install + + +######## +# +# sumaclust compilation +# +######## + +# executable compilation and link + +#ifeq ($(CC),gcc) +# LFLAGS = -fopenmp +#else +# LFLAGS = +#endif + +sumaclust: $(SUMACLUST_OBJ) $(LIBFASTA) $(LIBLCS) $(LIBFILE) $(LIBUTILS) + $(CC) $(LIBFASTAPATH) $(LIBLCSPATH) $(LIBFILEPATH) $(LIBUTILSPATH) $(LDFLAGS) -o $@ $(LFLAGS) $(SUMACLUST_OBJ) $(LIB) + +######## +# +# project management +# +######## + +clean: + rm -f *.o + rm -f *.P + rm -f $(EXEC) + $(MAKE) -C ./sumalibs/libfasta clean + $(MAKE) -C ./sumalibs/liblcs clean + $(MAKE) -C ./sumalibs/libfile clean + $(MAKE) -C ./sumalibs/libutils clean + +install: + cp $(EXEC) $(BINDIR) + + diff --git a/src/sumatra-1.0.10/global.mk b/src/sumaclust/sumaclust_v1.0.10/global.mk similarity index 74% rename from src/sumatra-1.0.10/global.mk rename to src/sumaclust/sumaclust_v1.0.10/global.mk index 9b7651f..303ca46 100644 --- a/src/sumatra-1.0.10/global.mk +++ b/src/sumaclust/sumaclust_v1.0.10/global.mk @@ -1,3 +1,4 @@ +include ../../../config/auto.conf LIBFASTAPATH = -L./sumalibs/libfasta LIBLCSPATH = -L./sumalibs/liblcs @@ -9,21 +10,19 @@ LIBLCS = ./sumalibs/liblcs/liblcs.a LIBFILE = ./sumalibs/libfile/libfile.a LIBUTILS = ./sumalibs/libutils/libutils.a -CC=gcc -LDFLAGS= -ifeq ($(CC),gcc) - CFLAGS = -O3 -s 
-DOMP_SUPPORT -fopenmp -w -else - CFLAGS = -O3 -w -endif +#ifeq ($(CC),gcc) +# CFLAGS = -O3 -s -DOMP_SUPPORT -fopenmp -w +#else +# CFLAGS = -O3 -w +#endif default: all %.o: %.c - $(CC) $(CFLAGS) -c -o $@ $< $(LIB) + $(CC) $(CFLAGS) -c -o $@ $< ######## @@ -42,4 +41,5 @@ default: all $(MAKE) -C ./sumalibs/libfile ./sumalibs/libutils/libutils.a: - $(MAKE) -C ./sumalibs/libutils \ No newline at end of file + $(MAKE) -C ./sumalibs/libutils + diff --git a/src/sumaclust/sumaclust_v1.0.10/mtcompare_sumaclust.c b/src/sumaclust/sumaclust_v1.0.10/mtcompare_sumaclust.c new file mode 100644 index 0000000..da9e5ab --- /dev/null +++ b/src/sumaclust/sumaclust_v1.0.10/mtcompare_sumaclust.c @@ -0,0 +1,334 @@ +/* + * mtcompare_cumaclust.c + * + * Author: Celine Mercier + * + */ + + +#ifdef OMP_SUPPORT +#include +#endif + +#include +#include +#include +#include + +#include "./sumalibs/libfasta/sequence.h" +#include "./sumalibs/libutils/utilities.h" +#include "./sumalibs/liblcs/upperband.h" +#include "./sumalibs/liblcs/sse_banded_LCS_alignment.h" + +#include "sumaclust.h" + + + +static double computeScore(void* c, int32_t seed_number, int32_t i, int thread_number,int32_t maxcount) +{ + thread_control_t *control=(thread_control_t*)c; + fastaSeqPtr* db = control->db; + fastaSeqPtr db_i = db[i]; + fastaSeqPtr db_seed_number = db[seed_number]; + int LCSmin; + double score; + + score = control->worstscore; + + if (db_i->count <= maxcount) + filters(db_i, db_seed_number, + control->threshold, + control->normalize, + control->reference, + control->lcsmode, + &score, + &LCSmin); + + if (score == -1.0) + score = alignForSumathings(db_seed_number->sequence, control->iseqs1[thread_number], + db_i->sequence, control->iseqs2[thread_number], + db_seed_number->length, db_i->length, + control->normalize, control->reference, + control->lcsmode, control->addresses[thread_number], + control->sizeForSeqs, LCSmin); + + return score; +} + + +inline void putSeqInClusterMT(void *c, int32_t center_idx, 
int32_t seq, double score) +{ + // saves a sequence as belonging to a cluster and its score with the seed + + thread_control_t *control=(thread_control_t*)c; + fastaSeqPtr* db = control->db; + fastaSeqPtr pseq = db[seq]; + + + pseq->center = db+center_idx; + pseq->center_index = center_idx; // saves cluster + pseq->score = score; // saves score with the seed + pseq->cluster_center = FALSE; +} + + +int64_t timevaldiff(struct timeval *starttime, struct timeval *finishtime) /* elapsed time from starttime to finishtime, in microseconds */ +{ + int64_t msec; /* despite its name, this holds microseconds */ + msec=(int64_t)(finishtime->tv_sec-starttime->tv_sec)*1000000; /* widen before scaling seconds to microseconds */ + msec+=(finishtime->tv_usec-starttime->tv_usec); /* tv_usec is already in microseconds: dividing it by 1000000 truncated the sub-second part to 0 */ + return msec; +} + +void computeOneSeed(void* c) +{ + thread_control_t *control=(thread_control_t*)c; + BOOL found; + int32_t seed_number; + int32_t nextone=control->n; + int64_t elapsedtime; + struct timeval current; + struct timeval start; + + seed_number = control->next; + found = FALSE; + + //printf("\n seed = %d, n = %d", seed_number, control->n); + + #ifdef OMP_SUPPORT + omp_set_num_threads(control->threads_number); + #endif + + gettimeofday(&start,NULL); + + #ifdef OMP_SUPPORT + #pragma omp parallel default(none) \ + firstprivate(found) \ + firstprivate(seed_number) \ + firstprivate(control) \ + shared(nextone) + #endif + + { + int32_t i; + double score; + int32_t current_seed; + #ifdef OMP_SUPPORT + int thread_id=omp_get_thread_num(); + #else + int thread_id=0; + #endif + int nseq = control->n; + BOOL fast = control->fast; + BOOL lcsmode = control->lcsmode; + int normalize = control->normalize; + double threshold = control->threshold; + BOOL first = TRUE; + BOOL not_already_in_a_cluster; + BOOL threshold_bad; + int32_t priv_nextone=control->n; + int32_t maxcount = (double)(control->db[seed_number]->count) * control->max_ratio; + + fastaSeqPtr* db = control->db; + + #ifdef OMP_SUPPORT + #pragma omp for schedule(dynamic,10) + #endif + + for (i=seed_number+1; \ + i < nseq; \ + i++) + { + + current_seed = db[i]->center_index; + not_already_in_a_cluster =
current_seed == i; // At the beginning all the sequences are their own center + + if ((! fast) || not_already_in_a_cluster) + { + score = computeScore((void*)control, seed_number, i, thread_id,maxcount); // computes LCS score or 0 if k-mer filter not passed + + if (lcsmode || normalize) + threshold_bad = (score < threshold); + else + threshold_bad = (score > threshold); + + if (threshold_bad) // similarity under threshold + { + if (!found && not_already_in_a_cluster && (i < priv_nextone)) + { + priv_nextone=i; // saves potential next seed +// *potential_nexts_list = i; + + found = TRUE; // saves the fact that a next seed + // has been found for this thread + } + } + else if (not_already_in_a_cluster || \ + ((! fast) && \ + (db[i]->score < score))) + { // if seq matching with current seed : + // clustering with seed if seq doesn't belong to any cluster yet + // OR in exact mode and the score is better with this seed + if (! lcsmode && normalize) + score = 1.0 - score; + putSeqInClusterMT((void*)control, seed_number, i, score); // saves new seed for this seq + } + } // if ((! fast) || on_current_seed) + + } // for (i=seed_number+1;... 
+ + #ifdef OMP_SUPPORT + #pragma omp flush(nextone) + #endif + if (priv_nextone < nextone) + #ifdef OMP_SUPPORT + #pragma omp critical + #endif + if (priv_nextone < nextone) + nextone=priv_nextone; + + } + + gettimeofday(&current,NULL); + elapsedtime = timevaldiff(&start,&current); + control->elapsedtime+=elapsedtime; + + control->next=nextone; + + if (control->next < (control->n)-1) + (control->seeds_counter)++; + else if (control->next == (control->n)-1) + { + control->stop = TRUE; + (control->seeds_counter)++; + } + else if (control->next == control->n) + control->stop = TRUE; +} + + +void initializeCentersAndScores(void *c) +{ + // Initializing the scores table for each seq : + + thread_control_t *control = (thread_control_t*) c; + int32_t i; + fastaSeqPtr *db_i; + int scoremax; + + if (control->normalize && control->lcsmode) + scoremax = 1.0; + else if (!control->lcsmode) + scoremax = 0.0; + else + scoremax = (*(control->db))->length; + + for (i=0, db_i = control->db; + i <= control->n-1; + i++,db_i++) + { + (*db_i)->center = (control->db)+i; + (*db_i)->center_index = i; + (*db_i)->score = scoremax; + (*db_i)->cluster_center = TRUE; + } +} + + +void freeEverything(void *c) +{ + thread_control_t *control=(thread_control_t*)c; + int i; + + // free(control->potential_nexts_list); + if ((control->reference == ALILEN) && (control->normalize || !control->lcsmode)) + { + for (i=0; i < control->threads_number; i++) + free(control->addresses[i]); + free(control->addresses); + } + free(control->iseqs1); + free(control->iseqs2); +} + + +int mt_compare_sumaclust(fastaSeqPtr* db, int n, BOOL fast, double threshold, BOOL normalize, + int reference, BOOL lcsmode, int threads_number, double max_ratio) +{ + thread_control_t control; + int32_t i; + int lmax, lmin; + + if (lcsmode || normalize) + fprintf(stderr,"Clustering sequences when similarity >= %lf\n", threshold); + else + fprintf(stderr,"Clustering sequences when distance <= %lf\n", threshold); + + fprintf(stderr,"Aligning and
clustering... \n"); + + #ifdef OMP_SUPPORT + control.threads_number = omp_get_max_threads(); + #else + control.threads_number = 1; + #endif + if (threads_number < control.threads_number) + control.threads_number = threads_number; + + calculateMaxAndMinLen(db, n, &lmax, &lmin); + + control.addresses = (int16_t**) malloc(control.threads_number*sizeof(int16_t*)); + control.iseqs1 = (int16_t**) malloc(control.threads_number*sizeof(int16_t*)); + control.iseqs2 = (int16_t**) malloc(control.threads_number*sizeof(int16_t*)); + + for (i=0; i < control.threads_number; i++) + control.sizeForSeqs = prepareTablesForSumathings(lmax, lmin, threshold, normalize, reference, lcsmode, (control.addresses)+i, (control.iseqs1)+i, (control.iseqs2)+i); + + control.db = db; + control.next = 0; + control.normalize = normalize; + control.reference = reference; + control.threshold = threshold; + control.max_ratio = max_ratio; + control.lcsmode = lcsmode; + control.stop = FALSE; + control.fast = fast; + control.seeds_counter = 1; +// control.potential_nexts_list = (int*) calloc(control.threads_number, sizeof(int)); + control.n = n; + + if (lcsmode || normalize) + control.worstscore = 0.0; + else + control.worstscore = lmax; + + control.elapsedtime=0; + + fprintf(stderr, "%d threads running\n", control.threads_number); + + // initialize scores table : + initializeCentersAndScores(&control); + + while (control.stop == FALSE) + { + if ((control.next)%100 == 0) + { + float p = ((float)(control.next)/(float)n)*100; + fprintf(stderr,"\rDone : %f %% ",p); + } + computeOneSeed(&control); + + } + + for (i=0; i < control.threads_number; i++) + { + free((*((control.iseqs1)+i))-(control.sizeForSeqs)+lmax); + free((*((control.iseqs2)+i))-(control.sizeForSeqs)+lmax); + } + + freeEverything(&control); + fprintf(stderr,"\rDone : 100 %% %d clusters created. 
\n\n", control.seeds_counter); + fprintf(stderr,"Pure computation time %f \n\n", (double)control.elapsedtime/1000000.); + + + return(control.seeds_counter); +} diff --git a/src/sumaclust/sumaclust_v1.0.10/mtcompare_sumaclust.h b/src/sumaclust/sumaclust_v1.0.10/mtcompare_sumaclust.h new file mode 100644 index 0000000..b5476c3 --- /dev/null +++ b/src/sumaclust/sumaclust_v1.0.10/mtcompare_sumaclust.h @@ -0,0 +1,15 @@ +/* + * mtcompare.h + * + * Created on: 12 mars 2013 + * Author: celinemercier + */ + +#ifndef MTCOMPARE_H_ +#define MTCOMPARE_H_ + +int mt_compare_sumaclust(fastaSeqPtr* db, int n, BOOL fast, double threshold, BOOL normalize, + int reference, BOOL lcsmode, int threads_number, double max_ratio); + + +#endif /* MTCOMPARE_H_ */ diff --git a/src/sumaclust/sumaclust_v1.0.10/sumaclust.c b/src/sumaclust/sumaclust_v1.0.10/sumaclust.c new file mode 100644 index 0000000..a86a72f --- /dev/null +++ b/src/sumaclust/sumaclust_v1.0.10/sumaclust.c @@ -0,0 +1,1086 @@ +/** + * FileName: sumaclust.c + * Author: Celine Mercier + * Description: star clustering of DNA sequences + * **/ + + +#include +#include +#include +#include +#include +#include + +#include "./sumalibs/libutils/utilities.h" +#include "./sumalibs/libfasta/sequence.h" +#include "./sumalibs/libfasta/fasta_header_parser.h" +#include "./sumalibs/libfasta/fasta_header_handler.h" +#include "./sumalibs/libfasta/fasta_seq_writer.h" +#include "./sumalibs/liblcs/upperband.h" +#include "./sumalibs/liblcs/sse_banded_LCS_alignment.h" + +#include "mtcompare_sumaclust.h" +#include "sumaclust.h" + +#define VERSION "1.0.10" + + +/* ----------------------------------------------- */ +/* printout help */ +/* ----------------------------------------------- */ + +#define PP fprintf(stdout, + + +static void PrintHelp() +{ + PP "------------------------------------------------------------\n"); + PP " SUMACLUST Version %s\n", VERSION); + PP "------------------------------------------------------------\n"); + PP " Synopsis : star 
clustering of sequences.\n"); + PP " Usage: sumaclust [options] \n"); + PP "------------------------------------------------------------\n"); + PP " Options:\n"); + PP " -h : [H]elp - print help\n\n"); + PP " -l : Reference sequence length is the shortest. \n\n"); + PP " -L : Reference sequence length is the largest. \n\n"); + PP " -a : Reference sequence length is the alignment length (default). \n\n"); + PP " -n : Score is normalized by reference sequence length (default).\n\n"); + PP " -r : Raw score, not normalized. \n\n"); + PP " -d : Score is expressed in distance (default : score is expressed in similarity). \n\n"); + PP " -t ##.## : Score threshold for clustering. If the score is normalized and expressed in similarity (default),\n"); + PP " it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized\n"); + PP " and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%.\n"); + PP " If the score is not normalized and expressed in similarity, it is the length of the\n"); + PP " Longest Common Subsequence. If the score is not normalized and expressed in distance,\n"); + PP " it is (reference length - LCS length).\n"); + PP " Only sequences with a similarity above ##.## with the center sequence of a cluster\n"); + PP " are assigned to that cluster. Default: 0.97.\n\n"); + PP " -e : Exact option : A sequence is assigned to the cluster with the center sequence presenting the\n"); + PP " highest similarity score > threshold, as opposed to the default 'fast' option where a sequence is\n"); + PP " assigned to the first cluster found with a center sequence presenting a score > threshold.\n\n"); + PP " -R ## : Maximum ratio between the counts of two sequences so that the less abundant one can be considered\n"); + PP " as a variant of the more abundant one. Default: 1.0.\n\n"); + PP " -p ## : Multithreading with ## threads using openMP.\n\n"); + PP " -s #### : Sorting by ####. 
Must be 'None' for no sorting, or a key in the fasta header of each sequence,\n");
+	PP " except for the count that can be computed (default : sorting by count).\n\n");
+	PP " -o : Sorting is in ascending order (default : descending).\n\n");
+	PP " -g : n's are replaced with a's (default: sequences with n's are discarded).\n\n");
+	PP " -B ### : Output of the OTU table in BIOM format is activated, and written to file ###.\n\n");
+	PP " -O ### : Output of the OTU map (observation map) is activated, and written to file ###.\n\n");
+	PP " -F ### : Output in FASTA format is written to file ### instead of standard output.\n\n");
+	PP " -f : Output in FASTA format is deactivated.\n");
+	PP "\n");
+	PP "------------------------------------------------------------\n");
+	PP " Argument : the nucleotide dataset to cluster\n");
+	PP "------------------------------------------------------------\n");
+	PP " http://metabarcoding.org/sumatra\n");
+	PP "------------------------------------------------------------\n\n");
+}
+
+#undef PP
+
+/* ----------------------------------------------- */
+/* printout usage and exit                         */
+/* ----------------------------------------------- */
+
+#define PP fprintf(stderr,
+
+
+/* Prints the usage summary on stderr; exits with status stat when it is non-zero. */
+static void ExitUsage(int stat)
+{
+	PP "usage: sumaclust [-l|L|a|n|r|d|e|o|g|f] [-t threshold_value] [-s sorting_key] [-R maximum_ratio] [-p number_of_threads]\n");
+	PP "[-B file_name_for_BIOM-formatted_output] [-O file_name_for_OTU_table-formatted_output] [-F file_name_for_FASTA-formatted_output] dataset\n");
+	PP "type \"sumaclust -h\" for help\n");
+
+	if (stat)
+		exit(stat);
+}
+
+#undef PP
+
+
+/* Header key used by every comparator below. */
+static char* sortingKey="count";
+
+
+/*
+ * Numeric value of the sorting key in the header of seq.
+ * Exits with an error message when the key is absent, because passing
+ * the NULL returned by getItemFromHeader() to atof() is undefined.
+ */
+static double sortingKeyValue(fastaSeqPtr seq)
+{
+	char* value;
+
+	value = getItemFromHeader(sortingKey, seq->header);
+	if (value == NULL)
+	{
+		fprintf(stderr, "\nERROR: '%s' not in sequence header(s).\n\n", sortingKey);
+		exit(1);
+	}
+	return(atof(value));
+}
+
+
+/* Three-way comparison of two key values, largest first. */
+static int compareDescending(double r1, double r2)
+{
+	if (r2 > r1)
+		return(1);
+	if (r2 < r1)
+		return(-1);
+	return(0);
+}
+
+
+/*
+ * qsort() comparator for an array of fastaSeqPtr: descending order of
+ * the sorting key. s1 and s2 point to array elements.
+ */
+static int sortSeqsP(const void *s1, const void *s2)
+{
+	double r1;
+	double r2;
+
+	// each operand must be read from its own element
+	r1 = sortingKeyValue(*(const fastaSeqPtr *) s1);
+	r2 = sortingKeyValue(*(const fastaSeqPtr *) s2);
+
+	return(compareDescending(r1, r2));
+}
+
+
+/*
+ * qsort() comparator for an array of fastaSeqPtr: ascending order of
+ * the sorting key. s1 and s2 point to array elements.
+ */
+static int reverseSortSeqsP(const void *s1, const void *s2)
+{
+	double r1;
+	double r2;
+
+	r1 = sortingKeyValue(*(const fastaSeqPtr *) s1);
+	r2 = sortingKeyValue(*(const fastaSeqPtr *) s2);
+
+	return(compareDescending(r2, r1));
+}
+
+
+/*
+ * qsort() comparator for an array of fastaSeq: orders by sequence
+ * string, then, for identical sequences, by descending sorting key.
+ */
+int uniqSeqsDoubleSortFunction(const void *s1, const void *s2)
+{
+	int c;
+
+	c = strcmp(((fastaSeqPtr) s1)->sequence, ((fastaSeqPtr) s2)->sequence);
+	if (c == 0)
+	{
+		double r1 = sortingKeyValue((fastaSeqPtr) s1);
+		double r2 = sortingKeyValue((fastaSeqPtr) s2);
+
+		c = compareDescending(r1, r2);
+	}
+	return(c);
+}
+
+
+/*
+ * qsort() comparator for an array of fastaSeq: orders by sequence
+ * string, then, for identical sequences, by ascending sorting key.
+ */
+int uniqSeqsDoubleReverseSortFunction(const void *s1, const void *s2)
+{
+	int c;
+
+	c = strcmp(((fastaSeqPtr) s1)->sequence, ((fastaSeqPtr) s2)->sequence);
+	if (c == 0)
+	{
+		double r1 = sortingKeyValue((fastaSeqPtr) s1);
+		double r2 = sortingKeyValue((fastaSeqPtr) s2);
+
+		c = compareDescending(r2, r1);
+	}
+	return(c);
+}
+
+
+void printInBIOMformat(fastaSeqPtr* uniqSeqs, int count, int numberOfCenters, char* biomFile_name)
+{
+	int i, j, n;
+	FILE* biomFile;
+	struct tm* tm_info;
+	time_t timer;
+	char buffer_date[20];
+	fastaSeqPtr* c;
+	fastaSeqPtr* seq;
+	int id_len;
+	int row_number;
+	BOOL first_center = TRUE;
+
+	int buffer_col_rows;
+	int buffer_col_rows_1;
+	int buffer_col_rows_2;
+
+	buffer_col_rows = 29;
+	buffer_col_rows_1 = 9;
+	buffer_col_rows_2 = 20;
+
+ n = 0; + + biomFile = fopen(biomFile_name, "w"); + if (biomFile == NULL) + fprintf(stderr, "\nCan't open BIOM output file.\n"); //, %s outputFilename); + + for (i=0; iaccession_id); + j=0; + + if ((*seq)->cluster_center) // center sequence + { + n++; + (*seq)->cluster_weight_unique_ids = 1; + + if (first_center) + { + (*seq)->columns_BIOM_size = id_len + buffer_col_rows; + (*seq)->columns_BIOM = (char*) malloc(((*seq)->columns_BIOM_size)*sizeof(char)); + strcpy((*seq)->columns_BIOM, "{\"id\": \""); + first_center = FALSE; + } + else + { + (*seq)->columns_BIOM_size = id_len + buffer_col_rows + 1; + (*seq)->columns_BIOM = (char*) malloc(((*seq)->columns_BIOM_size)*sizeof(char)); + strcpy((*seq)->columns_BIOM, ",{\"id\": \""); + } + + memcpy((*seq)->columns_BIOM + (*seq)->columns_BIOM_size - id_len - buffer_col_rows_2 - 1, (*seq)->accession_id, id_len); + memcpy((*seq)->columns_BIOM + (*seq)->columns_BIOM_size - buffer_col_rows_2 - 1, "\", \"metadata\": null}", buffer_col_rows_2+1); + + if ((*seq)->next != NULL) // not last sequence + { + for (j=1; ((((*seq)+j)->next != NULL) && (((*seq)+j)->uniqHead == FALSE)); j++) // identical sequences + { + id_len = strlen((*(seq)+j)->accession_id); + n++; + + (*seq)->cluster_weight_unique_ids++; + (*seq)->columns_BIOM_size = (*seq)->columns_BIOM_size + id_len + buffer_col_rows; + (*seq)->columns_BIOM = realloc((*seq)->columns_BIOM, ((*seq)->columns_BIOM_size) * sizeof(char)); + memcpy((*seq)->columns_BIOM + (*seq)->columns_BIOM_size - buffer_col_rows - id_len - 1, ",{\"id\": \"", buffer_col_rows_1); + memcpy((*seq)->columns_BIOM + (*seq)->columns_BIOM_size - id_len - buffer_col_rows_2 - 1, (*(seq)+j)->accession_id, id_len); + memcpy((*seq)->columns_BIOM + (*seq)->columns_BIOM_size - buffer_col_rows_2 - 1, "\", \"metadata\": null}", buffer_col_rows_2+1); + } + if ((((*seq)+j)->next == NULL) && (((*seq)+j)->uniqHead == FALSE)) // last sequence + { + id_len = strlen((*(seq)+j)->accession_id); + n++; + + 
(*seq)->cluster_weight_unique_ids++; + (*seq)->columns_BIOM_size = (*seq)->columns_BIOM_size + id_len + buffer_col_rows; + (*seq)->columns_BIOM = realloc((*seq)->columns_BIOM, ((*seq)->columns_BIOM_size) * sizeof(char)); + memcpy((*seq)->columns_BIOM + (*seq)->columns_BIOM_size - buffer_col_rows - id_len - 1, ",{\"id\": \"", buffer_col_rows_1); + memcpy((*seq)->columns_BIOM + (*seq)->columns_BIOM_size - id_len - buffer_col_rows_2 - 1, (*(seq)+j)->accession_id, id_len); + memcpy((*seq)->columns_BIOM + (*seq)->columns_BIOM_size - buffer_col_rows_2 - 1, "\", \"metadata\": null}", buffer_col_rows_2+1); + } + } + } + else // not a center sequence + { + n++; + + c = (*seq)->center; + + id_len = strlen((*seq)->accession_id); + n++; + + (*c)->cluster_weight_unique_ids++; + (*c)->columns_BIOM_size = (*c)->columns_BIOM_size + id_len + buffer_col_rows; + (*c)->columns_BIOM = realloc((*c)->columns_BIOM, ((*c)->columns_BIOM_size) * sizeof(char)); + memcpy((*c)->columns_BIOM + (*c)->columns_BIOM_size - buffer_col_rows - id_len - 1, ",{\"id\": \"", buffer_col_rows_1); + memcpy((*c)->columns_BIOM + (*c)->columns_BIOM_size - id_len - buffer_col_rows_2 - 1, (*seq)->accession_id, id_len); + memcpy((*c)->columns_BIOM + (*c)->columns_BIOM_size - buffer_col_rows_2 - 1, "\", \"metadata\": null}", buffer_col_rows_2+1); + + if ((*seq)->next != NULL) // not last sequence + { + for (j=1; ((((*seq)+j)->next != NULL) && (((*seq)+j)->uniqHead == FALSE)); j++) // identical sequences + { + id_len = strlen((*(seq)+j)->accession_id); + n++; + + (*c)->cluster_weight_unique_ids++; + (*c)->columns_BIOM_size = (*c)->columns_BIOM_size + id_len + buffer_col_rows; + (*c)->columns_BIOM = realloc((*c)->columns_BIOM, ((*c)->columns_BIOM_size) * sizeof(char)); + memcpy((*c)->columns_BIOM + (*c)->columns_BIOM_size - buffer_col_rows - id_len - 1, ",{\"id\": \"", buffer_col_rows_1); + memcpy((*c)->columns_BIOM + (*c)->columns_BIOM_size - id_len - buffer_col_rows_2 - 1, (*(seq)+j)->accession_id, id_len); + 
memcpy((*c)->columns_BIOM + (*c)->columns_BIOM_size - buffer_col_rows_2 - 1, "\", \"metadata\": null}", buffer_col_rows_2+1); + } + + if ((((*seq)+j)->next == NULL) && (((*seq)+j)->uniqHead == FALSE)) // last sequence + { + id_len = strlen((*(seq)+j)->accession_id); + n++; + + (*c)->cluster_weight_unique_ids++; + (*c)->columns_BIOM_size = (*c)->columns_BIOM_size + id_len + buffer_col_rows; + (*c)->columns_BIOM = realloc((*c)->columns_BIOM, ((*c)->columns_BIOM_size) * sizeof(char)); + memcpy((*c)->columns_BIOM + (*c)->columns_BIOM_size - buffer_col_rows - id_len - 1, ",{\"id\": \"", buffer_col_rows_1); + memcpy((*c)->columns_BIOM + (*c)->columns_BIOM_size - id_len - buffer_col_rows_2 - 1, (*(seq)+j)->accession_id, id_len); + memcpy((*c)->columns_BIOM + (*c)->columns_BIOM_size - buffer_col_rows_2 - 1, "\", \"metadata\": null}", buffer_col_rows_2+1); + } + } + } + } + + time(&timer); + tm_info = localtime(&timer); + strftime(buffer_date, 20, "%Y-%m-%dT%H:%M:%S", tm_info); + + fprintf(biomFile, "{\"id\": \"None\",\"format\": \"Biological Observation Matrix 1.0.0\"," + "\"format_url\": \"http://biom-format.org\",\"type\": \"OTU table\"," + "\"generated_by\": \"SUMACLUST %s\",\"date\": \"%s\",\"matrix_type\": \"sparse\"," + "\"matrix_element_type\": \"int\",\"shape\": [%d, %d],", + VERSION, buffer_date, numberOfCenters, n); + + // print data + + row_number = 0; + n = 0; + + fprintf(biomFile, "\"data\": ["); + + for (i=0; icluster_center) // center sequence + { + for (j=0; j<(*seq)->cluster_weight_unique_ids; j++) + { + if ((row_number == (numberOfCenters - 1)) && (j == ((*seq)->cluster_weight_unique_ids - 1))) // last seq to print + fprintf(biomFile, "[%d,%d,1]],", row_number, n); + else + fprintf(biomFile, "[%d,%d,1],", row_number, n); + n++; + } + row_number++; + } + } + // end data + + // Print rows + + first_center = TRUE; + + for (i=0; icluster_center) // center sequence + { + if (first_center) + { + fprintf(biomFile, "\"rows\": [{\"id\": \"%s\", \"metadata\": 
null}", (*seq)->accession_id); + first_center = FALSE; + } + else + fprintf(biomFile, ",{\"id\": \"%s\", \"metadata\": null}", (*seq)->accession_id); + } + } + + // Print columns + + fprintf(biomFile, "],\"columns\": ["); + for (i=0; icluster_center) // center sequence + fprintf(biomFile, (*seq)->columns_BIOM); + } + fprintf(biomFile, "]}"); + + fclose(biomFile); +} + + +void printInOTUtableFormat(fastaSeqPtr* uniqSeqs, int count, char* OTUtableFile_name) +{ + int i, j; + FILE* OTUtableFile; + fastaSeqPtr* c; + fastaSeqPtr* seq; + int id_len; + + OTUtableFile = fopen(OTUtableFile_name, "w"); + if (OTUtableFile == NULL) + fprintf(stderr, "\nCan't open OTU table output file.\n"); //, %s outputFilename); + + for (i=0; iaccession_id); + j=0; + + + if ((*seq)->cluster_center) // center sequence + { + (*seq)->line_OTU_table_size = id_len*2 + 2; + (*seq)->line_OTU_table = (char*) malloc(((*seq)->line_OTU_table_size)*sizeof(char)); + strcpy((*seq)->line_OTU_table, (*seq)->accession_id); + memcpy((*seq)->line_OTU_table + (*seq)->line_OTU_table_size - id_len - 2, "\t", 1); + memcpy((*seq)->line_OTU_table + (*seq)->line_OTU_table_size - id_len - 1, (*seq)->accession_id, id_len); + memcpy((*seq)->line_OTU_table + (*seq)->line_OTU_table_size - 1, "\0", 1); + + if ((*seq)->next != NULL) // not last sequence + { + for (j=1; ((((*seq)+j)->next != NULL) && (((*seq)+j)->uniqHead == FALSE)); j++) // identical sequences + { + id_len = strlen((*(seq)+j)->accession_id); + + (*seq)->line_OTU_table_size = (*seq)->line_OTU_table_size + id_len + 1; + (*seq)->line_OTU_table = realloc((*seq)->line_OTU_table, ((*seq)->line_OTU_table_size) * sizeof(char)); + memcpy((*seq)->line_OTU_table + (*seq)->line_OTU_table_size - id_len - 2, "\t", 1); + memcpy((*seq)->line_OTU_table + (*seq)->line_OTU_table_size - id_len - 1, (*(seq)+j)->accession_id, id_len); + memcpy((*seq)->line_OTU_table + (*seq)->line_OTU_table_size - 1, "\0", 1); + } + + if ((((*seq)+j)->next == NULL) && (((*seq)+j)->uniqHead == 
FALSE)) // last sequence + { + id_len = strlen((*(seq)+j)->accession_id); + + (*seq)->line_OTU_table_size = (*seq)->line_OTU_table_size + id_len + 1; + (*seq)->line_OTU_table = realloc((*seq)->line_OTU_table, ((*seq)->line_OTU_table_size) * sizeof(char)); + memcpy((*seq)->line_OTU_table + (*seq)->line_OTU_table_size - id_len - 2, "\t", 1); + memcpy((*seq)->line_OTU_table + (*seq)->line_OTU_table_size - id_len - 1, (*(seq)+j)->accession_id, id_len); + memcpy((*seq)->line_OTU_table + (*seq)->line_OTU_table_size - 1, "\0", 1); + } + } + } + else // not a center sequence + { + c = (*seq)->center; + + (*c)->line_OTU_table_size = (*c)->line_OTU_table_size + id_len + 1; + (*c)->line_OTU_table = realloc((*c)->line_OTU_table, ((*c)->line_OTU_table_size) * sizeof(char)); + memcpy((*c)->line_OTU_table + (*c)->line_OTU_table_size - id_len - 2, "\t", 1); + memcpy((*c)->line_OTU_table + (*c)->line_OTU_table_size - id_len - 1, (*seq)->accession_id, id_len); + memcpy((*c)->line_OTU_table + (*c)->line_OTU_table_size - 1, "\0", 1); + + if ((*seq)->next != NULL) // not last sequence + { + for (j=1; ((((*seq)+j)->next != NULL) && (((*seq)+j)->uniqHead == FALSE)); j++) // identical sequences + { + id_len = strlen((*(seq)+j)->accession_id); + + (*c)->line_OTU_table_size = (*c)->line_OTU_table_size + id_len + 1; + (*c)->line_OTU_table = realloc((*c)->line_OTU_table, ((*c)->line_OTU_table_size) * sizeof(char)); + memcpy((*c)->line_OTU_table + (*c)->line_OTU_table_size - id_len - 2, "\t", 1); + memcpy((*c)->line_OTU_table + (*c)->line_OTU_table_size - id_len - 1, (*(seq)+j)->accession_id, id_len); + memcpy((*c)->line_OTU_table + (*c)->line_OTU_table_size - 1, "\0", 1); + } + + if ((((*seq)+j)->next == NULL) && (((*seq)+j)->uniqHead == FALSE)) // last sequence + { + id_len = strlen((*(seq)+j)->accession_id); + + (*c)->line_OTU_table_size = (*c)->line_OTU_table_size + id_len + 1; + (*c)->line_OTU_table = realloc((*c)->line_OTU_table, ((*c)->line_OTU_table_size) * sizeof(char)); + 
memcpy((*c)->line_OTU_table + (*c)->line_OTU_table_size - id_len - 2, "\t", 1); + memcpy((*c)->line_OTU_table + (*c)->line_OTU_table_size - id_len - 1, (*(seq)+j)->accession_id, id_len); + memcpy((*c)->line_OTU_table + (*c)->line_OTU_table_size - 1, "\0", 1); + } + } + } + } + + // Print rows + + for (i=0; icluster_center) // center sequence + { + fprintf(OTUtableFile, (*seq)->line_OTU_table); + fprintf(OTUtableFile, "\n"); + } + } + + fclose(OTUtableFile); +} + + +void printSeq(fastaSeqPtr* seq, fastaSeqPtr* center, double score, FILE* output) +{ + int i; + + char* score_n; + char* score_v; + char* cluster_n; + char* cluster_v; + char* center_n; + char* center_true; + char* center_false; + int id_size; + + score_n = (char*) malloc(14*sizeof(char)); + score_v = (char*) malloc(20*sizeof(char)); + + strcpy(score_n, "cluster_score"); + sprintf(score_v,"%f", score); + + id_size = strlen((*center)->accession_id); + + cluster_n = (char*) malloc(8*sizeof(char)); + cluster_v = (char*) malloc((id_size+1)*sizeof(char)); + + strcpy(cluster_n, "cluster"); + strcpy(cluster_v, (*center)->accession_id); + + center_n = (char*) malloc(15*sizeof(char)); + strcpy(center_n, "cluster_center"); + center_true = (char*) malloc(5*sizeof(char)); + strcpy(center_true, "True"); + center_false = (char*) malloc(6*sizeof(char)); + strcpy(center_false, "False"); + + (*seq)->header = table_header_add_field((*seq)->header, cluster_n, cluster_v); + (*seq)->header = table_header_add_field((*seq)->header, score_n, score_v); + if ((*seq)->cluster_center) + (*seq)->header = table_header_add_field((*seq)->header, center_n, center_true); + else + (*seq)->header = table_header_add_field((*seq)->header, center_n, center_false); + + printOnlyHeaderFromTable((*seq)->header, output); + printOnlySeqFromFastaSeqPtr((*seq), output); + + if ((*seq)->next != NULL) + { + for (i=1; ((((*seq)+i)->next != NULL) && (((*seq)+i)->uniqHead == FALSE)); i++) + { + ((*seq)+i)->header = 
table_header_add_field(((*seq)+i)->header, cluster_n, cluster_v); + ((*seq)+i)->header = table_header_add_field(((*seq)+i)->header, score_n, score_v); + ((*seq)+i)->header = table_header_add_field(((*seq)+i)->header, center_n, center_false); + + printOnlyHeaderFromTable(((*seq)+i)->header, output); + printOnlySeqFromFastaSeqPtr(((*seq)+i), output); + } + + if ((((*seq)+i)->next == NULL) && (((*seq)+i)->uniqHead == FALSE)) // last sequence + { + ((*seq)+i)->header = table_header_add_field(((*seq)+i)->header, cluster_n, cluster_v); + ((*seq)+i)->header = table_header_add_field(((*seq)+i)->header, score_n, score_v); + ((*seq)+i)->header = table_header_add_field(((*seq)+i)->header, center_n, center_false); + + printOnlyHeaderFromTable(((*seq)+i)->header, output); + printOnlySeqFromFastaSeqPtr(((*seq)+i), output); + } + } +} + + +void putSeqInCluster(fastaSeqPtr* seq, fastaSeqPtr* center, double score) +{ + (*seq)->center = center; + (*seq)->score = score; +} + + +int compare(fastaSeqPtr* db, int n, BOOL fastOption, double threshold, BOOL normalize, int reference, BOOL lcsmode, + double max_ratio) +{ + double score; + double scoremax; + double worstscore; + BOOL toCluster; + static BOOL first=TRUE; + int32_t i,j,k; + int center; + float p; + BOOL found; + int lmax, lmin; + int16_t* address; + int16_t* iseq1; + int16_t* iseq2; + int l1; + int l2; + char* s1; + char* s2; + int sizeForSeqs; + int LCSmin; + + if (lcsmode || normalize) + fprintf(stderr,"Clustering sequences when similarity >= %lf\n", threshold); + else + fprintf(stderr,"Clustering sequences when distance <= %lf\n", threshold); + + fprintf(stderr,"Aligning and clustering... 
\n"); + + int* centers = (int*) malloc(n * sizeof(int)); + + for (i=0; i < n; i++) + centers[i] = -1; + + k=0; + found = FALSE; + + calculateMaxAndMinLen(db, n, &lmax, &lmin); + + sizeForSeqs = prepareTablesForSumathings(lmax, lmin, threshold, normalize, reference, lcsmode, &address, &iseq1, &iseq2); + + if (lcsmode || normalize) + worstscore = 0.0; + else + worstscore = lmax; + + for (i=0; i < n; i++) + { + if (i%100 == 0) + { + p = (i/(float)n)*100; + fprintf(stderr,"\rDone : %f %% %d clusters created",p,k); + } + + if (first) + { + first = FALSE; + if (normalize && lcsmode) + score = 1.0; + else if (!lcsmode) + score = 0.0; + else + score = (*(db+i))->length; + (*(db+i))->cluster_center = TRUE; + putSeqInCluster(db+i, db+i, score); + centers[k] = i; + k++; + } + + else + { + scoremax = worstscore; + center = 0; + found = FALSE; + toCluster = FALSE; + j=0; + + s1 = (*(db+i))->sequence; + l1 = (*(db+i))->length; + + while (((found == FALSE) && (centers[j] != -1) && (fastOption == TRUE)) || ((fastOption == FALSE) && (centers[j] != -1))) + { + score = worstscore; + + if ((double) ((*(db+i))->count) / (double) ((*(db+centers[j]))->count) <= max_ratio) + { + filters((*(db+i)), (*(db+centers[j])), threshold, normalize, reference, lcsmode, &score, &LCSmin); + } + + if (score == -1.0) + { + s2 = (*(db+centers[j]))->sequence; + l2 = (*(db+centers[j]))->length; + + score = alignForSumathings(s1, iseq1, s2, iseq2, l1, l2, normalize, reference, lcsmode, address, sizeForSeqs, LCSmin); + } + + if (((score >= threshold) && (lcsmode || normalize) && (score > scoremax)) || ((!lcsmode && !normalize) && (score <= threshold) && (score < scoremax))) + { + toCluster = TRUE; + scoremax = score; + center = centers[j]; + if (fastOption == TRUE) + found = TRUE; + } + j++; + } + + if (toCluster) + { + if (!lcsmode && normalize) + scoremax = 1.0 - scoremax; + (*(db+i))->cluster_center = FALSE; + putSeqInCluster(db+i, db+center, scoremax); + } + else + { + if (normalize && lcsmode) + score = 
1.0; + else if (!lcsmode) + score = 0.0; + else + score = (*(db+i))->length; + (*(db+i))->cluster_center = TRUE; + putSeqInCluster(db+i, db+i, score); + centers[k] = i; + k++; + } + } + } + fprintf(stderr,"\rDone : 100 %% %d clusters created. \n",k); + + free(centers); + + free(iseq1-sizeForSeqs+lmax); + free(iseq2-sizeForSeqs+lmax); + + if (normalize && reference == ALILEN) + free(address); + + return(k); +} + + +void computeClusterWeights(fastaSeqPtr* uniqSeqs, int n) +{ + int i,j; + fastaSeqPtr* seq; + fastaSeqPtr* center; + char* cluster_weight_n; + char* cluster_weight_v; + int cluster_weight; + + for (i=0; icluster_center) + (*seq)->cluster_weight = (*seq)->count; + else + { + center = (*seq)->center; + ((*center)->cluster_weight)+=(*seq)->count; + } + } + + for (i=0; icluster_center) + cluster_weight = (*seq)->cluster_weight; + else + { + center = (*seq)->center; + cluster_weight = (*center)->cluster_weight; + } + cluster_weight_n = (char*) malloc(15*sizeof(char)); + cluster_weight_v = (char*) malloc(20*sizeof(char)); + strcpy(cluster_weight_n, "cluster_weight"); + sprintf(cluster_weight_v,"%d", cluster_weight); + (*seq)->header = table_header_add_field((*seq)->header, cluster_weight_n, cluster_weight_v); + + if ((*seq)->next != NULL) // not the last sequence + { + for (j=1; ((((*seq)+j)->next != NULL) && (((*seq)+j)->uniqHead == FALSE)); j++) + (*(seq)+j)->header = table_header_add_field((*(seq)+j)->header, cluster_weight_n, cluster_weight_v); + + if ((((*seq)+j)->next == NULL) && (((*seq)+j)->uniqHead == FALSE)) // last sequence + (*(seq)+j)->header = table_header_add_field((*(seq)+j)->header, cluster_weight_n, cluster_weight_v); + } + } +} + + +int main(int argc, char** argv) +{ + + int32_t carg = 0; + int32_t errflag = 0; + char* sort; + double threshold = 0.97; + double max_ratio = 1.0; + BOOL lcsmode = TRUE; + BOOL fastOption = TRUE; + BOOL normalize = TRUE; + BOOL reverse = FALSE; + BOOL onlyATGC = TRUE; + int reference = ALILEN; + int ndb = 0; + int 
nproc = 1; + BOOL printBIOM = FALSE; + BOOL printOTUtable = FALSE; + BOOL printFASTA = TRUE; + BOOL printFASTAtofile = FALSE; + FILE* FASTA_output = stdout; + fastaSeqCount db; + int i,n; + fastaSeqPtr* uniqSeqs; + char* biomFile_name; + char* OTUtableFile_name; + char* FASTA_file_name; + int numberOfCenters; + + + sort = malloc(1024*sizeof(char)); + strcpy(sort, "count"); + + biomFile_name = malloc(1024*sizeof(char)); + OTUtableFile_name = malloc(1024*sizeof(char)); + FASTA_file_name = malloc(1024*sizeof(char)); + + + while ((carg = getopt(argc, argv, "hlLanrdet:p:s:ogB:O:R:fF:")) != -1) { + switch (carg) { + /* -------------------- */ + case 'h': /* help */ + /* -------------------- */ + PrintHelp(); + exit(0); + break; + + /* -------------------------------------------------- */ + case 'l': /* Normalize LCS/Error by the shortest sequence length*/ + /* -------------------------------------------------- */ + reference=MINLEN; + break; + + /* -------------------------------------------------- */ + case 'L': /* Normalize LCS/Error by the largest sequence length */ + /* -------------------------------------------------- */ + reference=MAXLEN; + break; + + /* -------------------------------------------------- */ + case 'a': /* Normalize LCS/Error by the alignment length */ + /* -------------------------------------------------- */ + reference=ALILEN; + break; + + /* -------------------------------------------------- */ + case 'n': /* Normalize LCS by the reference length */ + /* -------------------------------------------------- */ + normalize=TRUE; + break; + + /* -------------------------------------------------- */ + case 'r': /* No normalization */ + /* -------------------------------------------------- */ + normalize=FALSE; + break; + + /* -------------------------------------------------- */ + case 'd': /* Score is expressed in distance */ + /* -------------------------------------------------- */ + lcsmode=FALSE; + break; + + /* 
---------------------------------------------------------------------------------------------------------- */ + case 'e': /* center with the best score > threshold is chosen, otherwise first center with a score > threshold */ + /* ---------------------------------------------------------------------------------------------------------- */ + fastOption=FALSE; + break; + + /* ------------------------------------------------------------------- */ + case 't': /* Clusters only pairs with similarity higher than (threshold) */ + /* ------------------------------------------------------------------- */ + sscanf(optarg,"%lf",&threshold); + break; + + + /* ------------------------------------------------------------------- */ + case 'R': /* maximum ratio between counts of two sequences connected by an edge */ + /* ------------------------------------------------------------------- */ + sscanf(optarg,"%lf",&max_ratio); + break; + + /* -------------------------------------------------- */ + case 'p': /* number of processors to use */ + /* -------------------------------------------------- */ + sscanf(optarg,"%d",&nproc); + break; + + /* -------------------------------------------------- */ + case 's': /* Sorting option */ + /* -------------------------------------------------- */ + sscanf(optarg, "%s", sort); + sortingKey = sort; + break; + + /* -------------------------------------------------- */ + case 'o': /* reverse sorting */ + /* -------------------------------------------------- */ + reverse=TRUE; + break; + + /* -------------------------------------------------- */ + case 'g': /* replace n's with a's in sequences */ + /* -------------------------------------------------- */ + onlyATGC=FALSE; + break; + + /* -------------------------------------------------- */ + case 'B': /* file name to print results in BIOM format */ + /* -------------------------------------------------- */ + sscanf(optarg, "%s", biomFile_name); + printBIOM=TRUE; + break; + + /* 
-------------------------------------------------- */ + case 'O': /* file name to print results in OTU table format */ + /* -------------------------------------------------- */ + sscanf(optarg, "%s", OTUtableFile_name); + printOTUtable=TRUE; + break; + + /* -------------------------------------------------- */ + case 'f': /* don't print results in FASTA format */ + /* -------------------------------------------------- */ + printFASTA=FALSE; + break; + + /* ---------------------------------------------- */ + case 'F': /* file name to print results in FASTA format */ + /* ---------------------------------------------- */ + sscanf(optarg, "%s", FASTA_file_name); + printFASTAtofile=TRUE; + break; + + + case '?': /* invalid option */ + errflag++; + break; + } + } + + ndb = argc - optind; + if (ndb != 1) + errflag++; + + if (errflag) + ExitUsage(errflag); + + fprintf(stderr,"===========================================================\n"); + fprintf(stderr," SUMACLUST version %s\n",VERSION); +#ifdef __SSE2__ + fprintf(stderr," Alignment using SSE2 instructions.\n"); +#else + fprintf(stderr," Alignment using standard code, SSE2 unavailable.\n"); +#endif + fprintf(stderr,"===========================================================\n"); + + if ((threshold == 0.0) || (normalize && (threshold > 1.0))) + { + fprintf(stderr, "\nERROR: Please specify a threshold > 0, and < 1 when scores are normalized.\n\n"); + exit(1); + } + + fprintf(stderr,"Reading dataset..."); + db = seq_readAllSeq2(argv[optind], TRUE, onlyATGC); + fprintf(stderr,"\n%d sequences\n",db.count); + + if (db.count == 0) + { + fprintf(stderr, "\nNo valid sequences. 
Exiting program.\n\n"); + exit(1); + } + + if (!onlyATGC) + (void)cleanDB(db); + + if (!lcsmode && normalize) + threshold = 1.0 - threshold; + + if (threshold > 0) + (void)hashDB(db); + + addCounts(&db); + + // first sorting of sequences to have good unique heads + + if ((strcmp(sortingKey, "None") != 0) && (strcmp(sortingKey, "none") != 0)) + { + if (reverse == FALSE) + qsort((void*) db.fastaSeqs, db.count, sizeof(fastaSeq), uniqSeqsDoubleSortFunction); + else + qsort((void*) db.fastaSeqs, db.count, sizeof(fastaSeq), uniqSeqsDoubleReverseSortFunction); + } + + // getting the vector of unique seqs + uniqSeqs = (fastaSeqPtr*) malloc((db.count)*sizeof(fastaSeqPtr)); + n = uniqSeqsVector(&db, &uniqSeqs); + uniqSeqs = realloc(uniqSeqs, n*sizeof(fastaSeqPtr)); + + // putting a flag on the last sequence + for (i=0; i<(db.count-1); i++) + ((db.fastaSeqs)+i)->next = (db.fastaSeqs)+i-1; + ((db.fastaSeqs)+(db.count)-1)->next = NULL; + + // sorting unique sequences + if (strcmp(sortingKey, "count") == 0) + { + fprintf(stderr,"Sorting sequences by count...\n", n); + if (reverse == FALSE) + qsort((void*) uniqSeqs, n, sizeof(fastaSeqPtr), sortSeqsWithCounts); + else + qsort((void*) uniqSeqs, n, sizeof(fastaSeqPtr), reverseSortSeqsWithCounts); + } + else if ((strcmp(sortingKey, "None") != 0) && (strcmp(sortingKey, "none") != 0)) + { + fprintf(stderr,"Sorting sequences by %s...\n", sortingKey); + if (reverse == FALSE) + qsort((void*) uniqSeqs, n, sizeof(fastaSeqPtr), sortSeqsP); + else + qsort((void*) uniqSeqs, n, sizeof(fastaSeqPtr), reverseSortSeqsP); + } + + if (max_ratio > 0) + fprintf(stderr,"Maximum ratio between the counts of two sequences to connect them: %lf\n", max_ratio); + + // Computing + if (nproc==1) + numberOfCenters = compare(uniqSeqs, n, fastOption, threshold, normalize, reference, lcsmode, max_ratio); + + else + numberOfCenters = mt_compare_sumaclust(uniqSeqs, n, fastOption, threshold, normalize, reference, lcsmode, nproc, max_ratio); + + // Computing cluster 
weights + computeClusterWeights(uniqSeqs, n); + + // Printing results + + // FASTA file + if (printFASTA) + { + + if (printFASTAtofile) + { + FASTA_output = fopen(FASTA_file_name, "w"); + if (FASTA_output == NULL) + fprintf(stderr, "\nCan't open FASTA output file.\n"); //, %s outputFilename); + } + + for (i=0; icenter, (*(uniqSeqs+i))->score, FASTA_output); + } + fprintf(stderr,"Done.\n"); + } + + // BIOM file + if (printBIOM) + { + fprintf(stderr,"Printing results in BIOM format...\n"); + printInBIOMformat(uniqSeqs, n, numberOfCenters, biomFile_name); + fprintf(stderr,"Done.\n"); + } + + // OTU table file + if (printOTUtable) + { + fprintf(stderr,"Printing results in OTU table format...\n"); + printInOTUtableFormat(uniqSeqs, n, OTUtableFile_name); + fprintf(stderr,"Done.\n"); + } + + // Freeing + for (i=0; i < db.count; i++) + { + free(((db.fastaSeqs)[i]).table); + free_header_table(((db.fastaSeqs)[i]).header); + } + free(db.fastaSeqs); + free(sort); + free(uniqSeqs); + + return(0); + +} diff --git a/src/sumaclust/sumaclust_v1.0.10/sumaclust.h b/src/sumaclust/sumaclust_v1.0.10/sumaclust.h new file mode 100644 index 0000000..873b641 --- /dev/null +++ b/src/sumaclust/sumaclust_v1.0.10/sumaclust.h @@ -0,0 +1,34 @@ +/* + * sumaclust.h + * + * Created on: april 2, 2012 + * Author: mercier + */ + + +#ifndef SUMACLUST_H_ +#define SUMACLUST_H_ +/* thread_control_t: presumably the state shared by the worker threads of the multi-threaded comparison that main() starts through mt_compare_sumaclust() - confirm in the implementation. NOTE(review): db, n, normalize, reference, lcsmode, fast, threshold, max_ratio and threads_number look like copies of the arguments main() passes to that call; the roles of the remaining fields are not visible from this header. This header includes nothing itself, so the includer must already have declared int16_t, int32_t and int64_t (stdint.h) as well as the project types BOOL and fastaSeqPtr. */ +typedef struct { + int32_t next; + int32_t threads_number; + int* potential_nexts_list; + fastaSeqPtr* db; + int n; + int normalize; + int reference; + BOOL lcsmode; + BOOL fast; + double threshold; + BOOL stop; + int sizeForSeqs; + int16_t** addresses; + int16_t** iseqs1; + int16_t** iseqs2; + int seeds_counter; + double worstscore; + double max_ratio; + int64_t elapsedtime; +} thread_control_t; + +#endif /* SUMACLUST_H_ */ diff --git a/src/sumaclust/sumaclust_v1.0.10/sumaclust_user_manual.md b/src/sumaclust/sumaclust_v1.0.10/sumaclust_user_manual.md new file mode 100644 index 0000000..e46072a --- /dev/null +++
b/src/sumaclust/sumaclust_v1.0.10/sumaclust_user_manual.md @@ -0,0 +1,143 @@ +# Sumaclust: fast and exact clustering of sequences + +[metabarcoding.org/sumaclust](metabarcoding.org/sumaclust) + + +## Introduction + +With the development of next-generation sequencing, efficient tools are needed to handle millions of sequences in reasonable amounts of time. +Sumaclust is a program developed by the [LECA](http://www-leca.ujf-grenoble.fr/?lang=en). +Sumaclust aims to cluster sequences in a way that is fast and exact at the same time. This tool has been developed to be adapted to the type of data generated by DNA metabarcoding, i.e. entirely sequenced, short markers. +Sumaclust clusters sequences using the same clustering algorithm as UCLUST and CD-HIT. This algorithm is mainly useful to detect the 'erroneous' sequences created during amplification and sequencing protocols, deriving from 'true' sequences. +Currently, Sumaclust is available as a program that you can download and install on Unix-like machines. + +## Download and installation of Sumaclust + +### Download + +Sumaclust can be downloaded from the metabarcoding.org GitLab. +The archive of the latest tagged version can be downloaded on the GitLab wiki page: + +[https://git.metabarcoding.org/obitools/sumaclust/wikis/home](https://git.metabarcoding.org/obitools/sumaclust/wikis/home) + +The versions downloaded this way are for Unix-like systems compatible with SIMD SSE2 instructions and POSIX threads. Pre-compiled versions of GCC for OS X can be found [here](http://hpc.sourceforge.net/), that might be helpful if you encounter problems compiling the programs. Send an email at for other versions, or if you have any inquiries. 
+ +### Installation + +Untar the archive, go into the newly created directory and compile: + +``` +tar -zxvf sumaclust_v[x.x.xx].tar.gz +cd sumaclust_v[x.x.xx] +make +``` + +You can compile Sumaclust with `clang`, which deactivates `OpenMP`, with: + +``` +make CC=clang +``` + +## Documentation + +Sumaclust clusters sequences using the same clustering algorithm as UCLUST and CD-HIT. This algorithm is mainly useful to detect the "erroneous" sequences created during amplification and sequencing protocols, deriving from "true" sequences. + +### Using Sumaclust + +#### Input + +Input file must be in FASTA format. + +#### Usage + +``` +sumaclust [-l|L|a|n|r|d|e|o|g|f] [-t threshold_value] [-s sorting_key] [-R maximum_ratio] [-p number_of_threads] [-B file_name_for_BIOM-formatted_output] [-O file_name_for_OTU_table-formatted_output] [-F file_name_for_FASTA-formatted_output] dataset +``` + +Argument: the sequence dataset to cluster. + +For help : + +``` +sumaclust -h +``` + +#### Examples + +``` +sumaclust -t 0.97 my_dataset.fasta > clusters_of_seqs_with_similarity_>_97%.fasta +``` + +``` +sumaclust -d -r -t 2 my_dataset.fasta > clusters_of_seqs_with_distance_<=_2_differences.fasta +``` + +#### Options + +``` +-h : [H]elp - print the help +-l : Reference sequence length is the shortest. +-L : Reference sequence length is the largest. +-a : Reference sequence length is the alignment length (default). +-n : Score is normalized by reference sequence length (default). +-r : Raw score, not normalized. +-d : Score is expressed in distance (default : score is expressed in similarity). +-t ##.## : Score threshold for clustering. If the score is normalized and expressed in similarity (default), it is an identity, e.g. 0.95 for an identity of 95%. If the score is normalized and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%. If the score is not normalized and expressed in similarity, it is the length of the Longest Common Subsequence.
If the score is not normalized and expressed in distance, it is (reference length - LCS length). Only sequences with a similarity above ##.## with the representative sequence of a cluster are assigned to that cluster. Default: 0.97. +-e : Exact option : A sequence is assigned to the cluster with the representative sequence presenting the highest similarity score > threshold, as opposed to the default 'fast' option where a sequence is assigned to the first cluster found with a representative sequence presenting a score > threshold. +-R ## : Maximum ratio between the counts of two sequences so that the less abundant one can be considered as a variant of the more abundant one. Default: 1.0. +-p ## : Multithreading with ## threads using openMP. +-s #### : Sorting by ####. Must be 'None' for no sorting, or a key in the fasta header of each sequence, except for the count that can be computed (default : sorting by count). +-o : Sorting is in ascending order (default: descending). +-g : n's are replaced with a's (default: sequences with n's are discarded). +-B ### : Output of the OTU table in BIOM format is activated, and written to file ###. +-O ### : Output of the OTU map (observation map) is activated, and written to file ###. +-F ### : Output in FASTA format is written to file ### instead of standard output. +-f : Output in FASTA format is deactivated. +``` + +#### Output + +Sumaclust's default output is in fasta format. There are four fields added in the headers of all sequences. Those fields are of the form [key=value;]. The four keys are `cluster`, `cluster_score`, `cluster_center` and `cluster_weight` and their values correspond respectively to the identifier of the center of the sequence's cluster, the similarity score of the sequence with this center, a boolean indicating whether the sequence is the center of its cluster, and the total number of sequences in the cluster to which the sequence belongs. 
+ +Example where `seq_1` is a cluster center and `seq_2` is clustered with `seq_1`: + +``` +>seq_1 species=Heracleum maximum; count=3; cluster=seq_1; cluster_score=1.0; cluster_center=True; cluster_weight=5; atcctattttccaaaaacaaacaaaggcccagaaggtgaaaaaag +>seq_2 species=Cnidium cnidiifolium; count=2; cluster=seq_1; cluster_score=0.955556; cluster_center=False; cluster_weight=5; atcctattttccaaaaacaacaaaggcccataaggtgaaaaaag +``` + +There is a possibility to print the clusters in BIOM format with the `-B` option, and/or in OTU map (observation map) format with the `-O` option. The FASTA output can then be deactivated with the `-f` option. The FASTA output is written to the standard output by default, but can be written to a file using the `-F` option. +In the following examples, the first one prints results in FASTA and BIOM formats, and the second one prints results in BIOM and OTU map formats: + +``` +sumaclust -B clusters_of_seqs_with_similarity_>_97%.biom my_dataset.fasta > clusters_of_seqs_with_similarity_>_97%.fasta +``` +``` +sumaclust -f -B clusters_of_seqs_with_similarity_>_97%.biom -O clusters_of_seqs_with_similarity_>_97%.txt my_dataset.fasta +``` + +### How SUMACLUST works + +#### Clustering algorithm + +Sumaclust clusters sequences using the same clustering algorithm as UCLUST and CD-HIT. The problem is defined as follows: + +Sumaclust browses through the dataset, in the order in which the sequences have been sorted with the -s option. By default, sequences are sorted by decreasing abundance, because this enables to identify 'true' and 'erroneous' sequences the best, as 'true' sequences tend to end up as cluster centers. The first sequence of the ordered list is considered the center of the first cluster. Each sequence, following the ordered list, is compared with the centers of the existing clusters, respecting the initial list's order.
If the similarity of the query sequence with a center is above a chosen threshold, and their abundance ratio is below the maximum ratio chosen, the sequence is grouped in the cluster of this center. Otherwise, a new cluster is created with the query sequence as the center. + +#### About the abundance ratio + +An edge is created between a query sequence and a center sequence only if their abundance ratio, i.e. the query sequence’s count divided by the center sequence’s count, is below the maximum ratio chosen with the `-R` option. This can prevent sequences that are very abundant, and therefore likely true sequences, from being considered a variant of another true sequence that is only a little more abundant and very close to them. + +#### Similarity computation + +##### Similarity index + +A good way to evaluate the similarities between full-length sequences is to use indices based on the length of the Longest Common Subsequence (LCS), and in particular, a good similarity index is the length of the LCS divided by the length of the shortest alignment representing this LCS, giving an identity percentage. This is the similarity index used by Sumatra by default. Other similarity indices are available through the options. + +##### Fast computation of the similarity + +*Lossless k-mer filter.* Since we are usually interested in highly similar sequences, Sumatra uses similarity thresholds under which similarities are not reported. A lossless filtering step enables to only align couples of sequences that potentially have an identity greater than the chosen threshold. This filter is based on the number of overlapping k-mers that the sequences must share in order to have an identity at least equal to the threshold. With typical DNA metabarcoding datasets (a few million sequences of 50-300 bp and threshold around 90-95% id), we empirically determined that the most efficient filtering was achieved with 4-mers and 5-mers.
+ +*Alignment within a diagonal band.* Alignments are computed using a Needleman-Wunsch algorithm. In the scoring system used, matches are rewarded by one point, and mismatches and insertions/deletions are not penalised. The computation of the length of the LCS and the length of the alignment by the NWS algorithm has a quadratic complexity in time. It is responsible for most of the computation time. At high identity thresholds, the alignment computation can be done only in a diagonal band of the alignment matrix, gaining a considerable amount of time depending on the threshold. + +*Parallelization.* There are two levels of parallelization implemented in Sumatra. Both the filtering and the alignment steps are optimized with the use of Single Instruction Multiple Data instructions (SIMD). Since 4-mers enable to work easily with SIMD instructions, we implemented a 4-mer filter. Moreover, the program can be run on multiple threads. diff --git a/src/sumaclust/sumaclust_v1.0.10/sumaclust_user_manual.pdf b/src/sumaclust/sumaclust_v1.0.10/sumaclust_user_manual.pdf new file mode 100644 index 0000000..de53660 Binary files /dev/null and b/src/sumaclust/sumaclust_v1.0.10/sumaclust_user_manual.pdf differ diff --git a/src/sumatra-1.0.10/sumalibs/Licence_CeCILL_V2-en.txt b/src/sumaclust/sumaclust_v1.0.10/sumalibs/Licence_CeCILL_V2-en.txt similarity index 100% rename from src/sumatra-1.0.10/sumalibs/Licence_CeCILL_V2-en.txt rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/Licence_CeCILL_V2-en.txt diff --git a/src/sumatra-1.0.10/sumalibs/Licence_CeCILL_V2-fr.txt b/src/sumaclust/sumaclust_v1.0.10/sumalibs/Licence_CeCILL_V2-fr.txt similarity index 100% rename from src/sumatra-1.0.10/sumalibs/Licence_CeCILL_V2-fr.txt rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/Licence_CeCILL_V2-fr.txt diff --git a/src/sumaclust/sumaclust_v1.0.10/sumalibs/global.mk b/src/sumaclust/sumaclust_v1.0.10/sumalibs/global.mk new file mode 100644 index 0000000..e5a0dfb --- /dev/null +++
b/src/sumaclust/sumaclust_v1.0.10/sumalibs/global.mk @@ -0,0 +1,7 @@ +include ../../../../../config/auto.conf + + +default: all + +%.o: %.c + $(CC) $(CFLAGS) -c -o $@ $< diff --git a/src/sumatra-1.0.10/sumalibs/libfasta/Makefile b/src/sumaclust/sumaclust_v1.0.10/sumalibs/libfasta/Makefile similarity index 100% rename from src/sumatra-1.0.10/sumalibs/libfasta/Makefile rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/libfasta/Makefile diff --git a/src/sumatra-1.0.10/sumalibs/libfasta/fasta_header_handler.c b/src/sumaclust/sumaclust_v1.0.10/sumalibs/libfasta/fasta_header_handler.c similarity index 100% rename from src/sumatra-1.0.10/sumalibs/libfasta/fasta_header_handler.c rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/libfasta/fasta_header_handler.c diff --git a/src/sumatra-1.0.10/sumalibs/libfasta/fasta_header_handler.h b/src/sumaclust/sumaclust_v1.0.10/sumalibs/libfasta/fasta_header_handler.h similarity index 100% rename from src/sumatra-1.0.10/sumalibs/libfasta/fasta_header_handler.h rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/libfasta/fasta_header_handler.h diff --git a/src/sumatra-1.0.10/sumalibs/libfasta/fasta_header_parser.c b/src/sumaclust/sumaclust_v1.0.10/sumalibs/libfasta/fasta_header_parser.c similarity index 100% rename from src/sumatra-1.0.10/sumalibs/libfasta/fasta_header_parser.c rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/libfasta/fasta_header_parser.c diff --git a/src/sumatra-1.0.10/sumalibs/libfasta/fasta_header_parser.h b/src/sumaclust/sumaclust_v1.0.10/sumalibs/libfasta/fasta_header_parser.h similarity index 100% rename from src/sumatra-1.0.10/sumalibs/libfasta/fasta_header_parser.h rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/libfasta/fasta_header_parser.h diff --git a/src/sumatra-1.0.10/sumalibs/libfasta/fasta_header_parser.l b/src/sumaclust/sumaclust_v1.0.10/sumalibs/libfasta/fasta_header_parser.l similarity index 100% rename from src/sumatra-1.0.10/sumalibs/libfasta/fasta_header_parser.l rename to 
src/sumaclust/sumaclust_v1.0.10/sumalibs/libfasta/fasta_header_parser.l diff --git a/src/sumatra-1.0.10/sumalibs/libfasta/fasta_seq_writer.c b/src/sumaclust/sumaclust_v1.0.10/sumalibs/libfasta/fasta_seq_writer.c similarity index 100% rename from src/sumatra-1.0.10/sumalibs/libfasta/fasta_seq_writer.c rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/libfasta/fasta_seq_writer.c diff --git a/src/sumatra-1.0.10/sumalibs/libfasta/fasta_seq_writer.h b/src/sumaclust/sumaclust_v1.0.10/sumalibs/libfasta/fasta_seq_writer.h similarity index 100% rename from src/sumatra-1.0.10/sumalibs/libfasta/fasta_seq_writer.h rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/libfasta/fasta_seq_writer.h diff --git a/src/sumatra-1.0.10/sumalibs/libfasta/header_mem_handler.c b/src/sumaclust/sumaclust_v1.0.10/sumalibs/libfasta/header_mem_handler.c similarity index 100% rename from src/sumatra-1.0.10/sumalibs/libfasta/header_mem_handler.c rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/libfasta/header_mem_handler.c diff --git a/src/sumatra-1.0.10/sumalibs/libfasta/header_mem_handler.h b/src/sumaclust/sumaclust_v1.0.10/sumalibs/libfasta/header_mem_handler.h similarity index 100% rename from src/sumatra-1.0.10/sumalibs/libfasta/header_mem_handler.h rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/libfasta/header_mem_handler.h diff --git a/src/sumatra-1.0.10/sumalibs/libfasta/sequence.c b/src/sumaclust/sumaclust_v1.0.10/sumalibs/libfasta/sequence.c similarity index 100% rename from src/sumatra-1.0.10/sumalibs/libfasta/sequence.c rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/libfasta/sequence.c diff --git a/src/sumatra-1.0.10/sumalibs/libfasta/sequence.h b/src/sumaclust/sumaclust_v1.0.10/sumalibs/libfasta/sequence.h similarity index 100% rename from src/sumatra-1.0.10/sumalibs/libfasta/sequence.h rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/libfasta/sequence.h diff --git a/src/sumatra-1.0.10/sumalibs/libfile/Makefile b/src/sumaclust/sumaclust_v1.0.10/sumalibs/libfile/Makefile 
similarity index 100% rename from src/sumatra-1.0.10/sumalibs/libfile/Makefile rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/libfile/Makefile diff --git a/src/sumatra-1.0.10/sumalibs/libfile/fileHandling.c b/src/sumaclust/sumaclust_v1.0.10/sumalibs/libfile/fileHandling.c similarity index 100% rename from src/sumatra-1.0.10/sumalibs/libfile/fileHandling.c rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/libfile/fileHandling.c diff --git a/src/sumatra-1.0.10/sumalibs/libfile/fileHandling.h b/src/sumaclust/sumaclust_v1.0.10/sumalibs/libfile/fileHandling.h similarity index 100% rename from src/sumatra-1.0.10/sumalibs/libfile/fileHandling.h rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/libfile/fileHandling.h diff --git a/src/sumatra-1.0.10/sumalibs/liblcs/Makefile b/src/sumaclust/sumaclust_v1.0.10/sumalibs/liblcs/Makefile similarity index 100% rename from src/sumatra-1.0.10/sumalibs/liblcs/Makefile rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/liblcs/Makefile diff --git a/src/sumatra-1.0.10/sumalibs/liblcs/_lcs.ext.1.c b/src/sumaclust/sumaclust_v1.0.10/sumalibs/liblcs/_lcs.ext.1.c similarity index 100% rename from src/sumatra-1.0.10/sumalibs/liblcs/_lcs.ext.1.c rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/liblcs/_lcs.ext.1.c diff --git a/src/sumatra-1.0.10/sumalibs/liblcs/_lcs.ext.2.c b/src/sumaclust/sumaclust_v1.0.10/sumalibs/liblcs/_lcs.ext.2.c similarity index 100% rename from src/sumatra-1.0.10/sumalibs/liblcs/_lcs.ext.2.c rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/liblcs/_lcs.ext.2.c diff --git a/src/sumatra-1.0.10/sumalibs/liblcs/_lcs.ext.3.c b/src/sumaclust/sumaclust_v1.0.10/sumalibs/liblcs/_lcs.ext.3.c similarity index 100% rename from src/sumatra-1.0.10/sumalibs/liblcs/_lcs.ext.3.c rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/liblcs/_lcs.ext.3.c diff --git a/src/sumatra-1.0.10/sumalibs/liblcs/_lcs.h b/src/sumaclust/sumaclust_v1.0.10/sumalibs/liblcs/_lcs.h similarity index 100% rename from 
src/sumatra-1.0.10/sumalibs/liblcs/_lcs.h rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/liblcs/_lcs.h diff --git a/src/sumatra-1.0.10/sumalibs/liblcs/_lcs_fast.h b/src/sumaclust/sumaclust_v1.0.10/sumalibs/liblcs/_lcs_fast.h similarity index 100% rename from src/sumatra-1.0.10/sumalibs/liblcs/_lcs_fast.h rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/liblcs/_lcs_fast.h diff --git a/src/sumatra-1.0.10/sumalibs/liblcs/banded_LCS_alignment.c b/src/sumaclust/sumaclust_v1.0.10/sumalibs/liblcs/banded_LCS_alignment.c similarity index 100% rename from src/sumatra-1.0.10/sumalibs/liblcs/banded_LCS_alignment.c rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/liblcs/banded_LCS_alignment.c diff --git a/src/sumatra-1.0.10/sumalibs/liblcs/banded_LCS_alignment.h b/src/sumaclust/sumaclust_v1.0.10/sumalibs/liblcs/banded_LCS_alignment.h similarity index 100% rename from src/sumatra-1.0.10/sumalibs/liblcs/banded_LCS_alignment.h rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/liblcs/banded_LCS_alignment.h diff --git a/src/sumatra-1.0.10/sumalibs/liblcs/sse_banded_LCS_alignment.c b/src/sumaclust/sumaclust_v1.0.10/sumalibs/liblcs/sse_banded_LCS_alignment.c similarity index 100% rename from src/sumatra-1.0.10/sumalibs/liblcs/sse_banded_LCS_alignment.c rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/liblcs/sse_banded_LCS_alignment.c diff --git a/src/sumatra-1.0.10/sumalibs/liblcs/sse_banded_LCS_alignment.h b/src/sumaclust/sumaclust_v1.0.10/sumalibs/liblcs/sse_banded_LCS_alignment.h similarity index 100% rename from src/sumatra-1.0.10/sumalibs/liblcs/sse_banded_LCS_alignment.h rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/liblcs/sse_banded_LCS_alignment.h diff --git a/src/sumatra-1.0.10/sumalibs/liblcs/upperband.c b/src/sumaclust/sumaclust_v1.0.10/sumalibs/liblcs/upperband.c similarity index 100% rename from src/sumatra-1.0.10/sumalibs/liblcs/upperband.c rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/liblcs/upperband.c diff --git 
a/src/sumatra-1.0.10/sumalibs/liblcs/upperband.h b/src/sumaclust/sumaclust_v1.0.10/sumalibs/liblcs/upperband.h similarity index 100% rename from src/sumatra-1.0.10/sumalibs/liblcs/upperband.h rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/liblcs/upperband.h diff --git a/src/sumatra-1.0.10/sumalibs/libsse/_sse.h b/src/sumaclust/sumaclust_v1.0.10/sumalibs/libsse/_sse.h similarity index 100% rename from src/sumatra-1.0.10/sumalibs/libsse/_sse.h rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/libsse/_sse.h diff --git a/src/sumatra-1.0.10/sumalibs/libutils/Makefile b/src/sumaclust/sumaclust_v1.0.10/sumalibs/libutils/Makefile similarity index 100% rename from src/sumatra-1.0.10/sumalibs/libutils/Makefile rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/libutils/Makefile diff --git a/src/sumatra-1.0.10/sumalibs/libutils/debug.c b/src/sumaclust/sumaclust_v1.0.10/sumalibs/libutils/debug.c similarity index 100% rename from src/sumatra-1.0.10/sumalibs/libutils/debug.c rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/libutils/debug.c diff --git a/src/sumatra-1.0.10/sumalibs/libutils/debug.h b/src/sumaclust/sumaclust_v1.0.10/sumalibs/libutils/debug.h similarity index 100% rename from src/sumatra-1.0.10/sumalibs/libutils/debug.h rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/libutils/debug.h diff --git a/src/sumatra-1.0.10/sumalibs/libutils/utilities.c b/src/sumaclust/sumaclust_v1.0.10/sumalibs/libutils/utilities.c similarity index 100% rename from src/sumatra-1.0.10/sumalibs/libutils/utilities.c rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/libutils/utilities.c diff --git a/src/sumatra-1.0.10/sumalibs/libutils/utilities.h b/src/sumaclust/sumaclust_v1.0.10/sumalibs/libutils/utilities.h similarity index 100% rename from src/sumatra-1.0.10/sumalibs/libutils/utilities.h rename to src/sumaclust/sumaclust_v1.0.10/sumalibs/libutils/utilities.h diff --git a/src/sumatra-1.0.10/sumalibs/global.mk b/src/sumatra-1.0.10/sumalibs/global.mk deleted file mode 100644 index 
c58123e..0000000 --- a/src/sumatra-1.0.10/sumalibs/global.mk +++ /dev/null @@ -1,10 +0,0 @@ - -CC=gcc -LDFLAGS= - -CFLAGS = -O3 -w - -default: all - -%.o: %.c - $(CC) $(CFLAGS) -c -o $@ $< $(LIB) diff --git a/src/sumatra/Makefile b/src/sumatra/Makefile new file mode 100755 index 0000000..a052de3 --- /dev/null +++ b/src/sumatra/Makefile @@ -0,0 +1,30 @@ +# --------------------------------------------------------------- +# $Id: $ +# --------------------------------------------------------------- +# @file: Makefile +# @desc: makefile for lxpack +# +# @history: +# @history: +# @+ : Apr 97 : Created +# @+ : Mar 02 : Updated for LXxware +# +# @note: should be processed with gnu compatible make +# @note: helixware_compatible +# +# @end: +# --------------------------------------------------------------- +# +include ../../config/auto.conf + +DIRS = sumatra-1.0.10 + +include ../../config/targets/propagate.targ + +include ../../config/targets/help.targ + +all:: + $(MAKE) ACTION=$@ _action + +clean:: + $(MAKE) -C lxpack portclean diff --git a/src/sumatra/sumatra-1.0.10/Licence_CeCILL_V2-en.txt b/src/sumatra/sumatra-1.0.10/Licence_CeCILL_V2-en.txt new file mode 100644 index 0000000..fcc8df2 --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/Licence_CeCILL_V2-en.txt @@ -0,0 +1,506 @@ + +CeCILL FREE SOFTWARE LICENSE AGREEMENT + + + Notice + +This Agreement is a Free Software license agreement that is the result +of discussions between its authors in order to ensure compliance with +the two main principles guiding its drafting: + + * firstly, compliance with the principles governing the distribution + of Free Software: access to source code, broad rights granted to + users, + * secondly, the election of a governing law, French law, with which + it is conformant, both as regards the law of torts and + intellectual property law, and the protection that it offers to + both authors and holders of the economic rights over software. 
+ +The authors of the CeCILL (for Ce[a] C[nrs] I[nria] L[ogiciel] L[ibre]) +license are: + +Commissariat à l'Energie Atomique - CEA, a public scientific, technical +and industrial research establishment, having its principal place of +business at 25 rue Leblanc, immeuble Le Ponant D, 75015 Paris, France. + +Centre National de la Recherche Scientifique - CNRS, a public scientific +and technological establishment, having its principal place of business +at 3 rue Michel-Ange, 75794 Paris cedex 16, France. + +Institut National de Recherche en Informatique et en Automatique - +INRIA, a public scientific and technological establishment, having its +principal place of business at Domaine de Voluceau, Rocquencourt, BP +105, 78153 Le Chesnay cedex, France. + + + Preamble + +The purpose of this Free Software license agreement is to grant users +the right to modify and redistribute the software governed by this +license within the framework of an open source distribution model. + +The exercising of these rights is conditional upon certain obligations +for users so as to preserve this status for all subsequent redistributions. + +In consideration of access to the source code and the rights to copy, +modify and redistribute granted by the license, users are provided only +with a limited warranty and the software's author, the holder of the +economic rights, and the successive licensors only have limited liability. + +In this respect, the risks associated with loading, using, modifying +and/or developing or reproducing the software by the user are brought to +the user's attention, given its Free Software status, which may make it +complicated to use, with the result that its use is reserved for +developers and experienced professionals having in-depth computer +knowledge.
Users are therefore encouraged to load and test the +suitability of the software as regards their requirements in conditions +enabling the security of their systems and/or data to be ensured and, +more generally, to use and operate it in the same conditions of +security. This Agreement may be freely reproduced and published, +provided it is not altered, and that no provisions are either added or +removed herefrom. + +This Agreement may apply to any or all software for which the holder of +the economic rights decides to submit the use thereof to its provisions. + + + Article 1 - DEFINITIONS + +For the purpose of this Agreement, when the following expressions +commence with a capital letter, they shall have the following meaning: + +Agreement: means this license agreement, and its possible subsequent +versions and annexes. + +Software: means the software in its Object Code and/or Source Code form +and, where applicable, its documentation, "as is" when the Licensee +accepts the Agreement. + +Initial Software: means the Software in its Source Code and possibly its +Object Code form and, where applicable, its documentation, "as is" when +it is first distributed under the terms and conditions of the Agreement. + +Modified Software: means the Software modified by at least one +Contribution. + +Source Code: means all the Software's instructions and program lines to +which access is required so as to modify the Software. + +Object Code: means the binary files originating from the compilation of +the Source Code. + +Holder: means the holder(s) of the economic rights over the Initial +Software. + +Licensee: means the Software user(s) having accepted the Agreement. + +Contributor: means a Licensee having made at least one Contribution. + +Licensor: means the Holder, or any other individual or legal entity, who +distributes the Software under the Agreement. 
+ +Contribution: means any or all modifications, corrections, translations, +adaptations and/or new functions integrated into the Software by any or +all Contributors, as well as any or all Internal Modules. + +Module: means a set of sources files including their documentation that +enables supplementary functions or services in addition to those offered +by the Software. + +External Module: means any or all Modules, not derived from the +Software, so that this Module and the Software run in separate address +spaces, with one calling the other when they are run. + +Internal Module: means any or all Module, connected to the Software so +that they both execute in the same address space. + +GNU GPL: means the GNU General Public License version 2 or any +subsequent version, as published by the Free Software Foundation Inc. + +Parties: mean both the Licensee and the Licensor. + +These expressions may be used both in singular and plural form. + + + Article 2 - PURPOSE + +The purpose of the Agreement is the grant by the Licensor to the +Licensee of a non-exclusive, transferable and worldwide license for the +Software as set forth in Article 5 hereinafter for the whole term of the +protection granted by the rights over said Software. + + + Article 3 - ACCEPTANCE + +3.1 The Licensee shall be deemed as having accepted the terms and +conditions of this Agreement upon the occurrence of the first of the +following events: + + * (i) loading the Software by any or all means, notably, by + downloading from a remote server, or by loading from a physical + medium; + * (ii) the first time the Licensee exercises any of the rights + granted hereunder. 
+ +3.2 One copy of the Agreement, containing a notice relating to the +characteristics of the Software, to the limited warranty, and to the +fact that its use is restricted to experienced users has been provided +to the Licensee prior to its acceptance as set forth in Article 3.1 +hereinabove, and the Licensee hereby acknowledges that it has read and +understood it. + + + Article 4 - EFFECTIVE DATE AND TERM + + + 4.1 EFFECTIVE DATE + +The Agreement shall become effective on the date when it is accepted by +the Licensee as set forth in Article 3.1. + + + 4.2 TERM + +The Agreement shall remain in force for the entire legal term of +protection of the economic rights over the Software. + + + Article 5 - SCOPE OF RIGHTS GRANTED + +The Licensor hereby grants to the Licensee, who accepts, the following +rights over the Software for any or all use, and for the term of the +Agreement, on the basis of the terms and conditions set forth hereinafter. + +Besides, if the Licensor owns or comes to own one or more patents +protecting all or part of the functions of the Software or of its +components, the Licensor undertakes not to enforce the rights granted by +these patents against successive Licensees using, exploiting or +modifying the Software. If these patents are transferred, the Licensor +undertakes to have the transferees subscribe to the obligations set +forth in this paragraph. + + + 5.1 RIGHT OF USE + +The Licensee is authorized to use the Software, without any limitation +as to its fields of application, with it being hereinafter specified +that this comprises: + + 1. permanent or temporary reproduction of all or part of the Software + by any or all means and in any or all form. + + 2. loading, displaying, running, or storing the Software on any or + all medium. + + 3. entitlement to observe, study or test its operation so as to + determine the ideas and principles behind any or all constituent + elements of said Software. 
This shall apply when the Licensee + carries out any or all loading, displaying, running, transmission + or storage operation as regards the Software, that it is entitled + to carry out hereunder. + + + 5.2 ENTITLEMENT TO MAKE CONTRIBUTIONS + +The right to make Contributions includes the right to translate, adapt, +arrange, or make any or all modifications to the Software, and the right +to reproduce the resulting software. + +The Licensee is authorized to make any or all Contributions to the +Software provided that it includes an explicit notice that it is the +author of said Contribution and indicates the date of the creation thereof. + + + 5.3 RIGHT OF DISTRIBUTION + +In particular, the right of distribution includes the right to publish, +transmit and communicate the Software to the general public on any or +all medium, and by any or all means, and the right to market, either in +consideration of a fee, or free of charge, one or more copies of the +Software by any means. + +The Licensee is further authorized to distribute copies of the modified +or unmodified Software to third parties according to the terms and +conditions set forth hereinafter. + + + 5.3.1 DISTRIBUTION OF SOFTWARE WITHOUT MODIFICATION + +The Licensee is authorized to distribute true copies of the Software in +Source Code or Object Code form, provided that said distribution +complies with all the provisions of the Agreement and is accompanied by: + + 1. a copy of the Agreement, + + 2. a notice relating to the limitation of both the Licensor's + warranty and liability as set forth in Articles 8 and 9, + +and that, in the event that only the Object Code of the Software is +redistributed, the Licensee allows future Licensees unhindered access to +the full Source Code of the Software by indicating how to access it, it +being understood that the additional cost of acquiring the Source Code +shall not exceed the cost of transferring the data. 
+ + + 5.3.2 DISTRIBUTION OF MODIFIED SOFTWARE + +When the Licensee makes a Contribution to the Software, the terms and +conditions for the distribution of the resulting Modified Software +become subject to all the provisions of this Agreement. + +The Licensee is authorized to distribute the Modified Software, in +source code or object code form, provided that said distribution +complies with all the provisions of the Agreement and is accompanied by: + + 1. a copy of the Agreement, + + 2. a notice relating to the limitation of both the Licensor's + warranty and liability as set forth in Articles 8 and 9, + +and that, in the event that only the object code of the Modified +Software is redistributed, the Licensee allows future Licensees +unhindered access to the full source code of the Modified Software by +indicating how to access it, it being understood that the additional +cost of acquiring the source code shall not exceed the cost of +transferring the data. + + + 5.3.3 DISTRIBUTION OF EXTERNAL MODULES + +When the Licensee has developed an External Module, the terms and +conditions of this Agreement do not apply to said External Module, that +may be distributed under a separate license agreement. + + + 5.3.4 COMPATIBILITY WITH THE GNU GPL + +The Licensee can include a code that is subject to the provisions of one +of the versions of the GNU GPL in the Modified or unmodified Software, +and distribute that entire code under the terms of the same version of +the GNU GPL. + +The Licensee can include the Modified or unmodified Software in a code +that is subject to the provisions of one of the versions of the GNU GPL, +and distribute that entire code under the terms of the same version of +the GNU GPL. + + + Article 6 - INTELLECTUAL PROPERTY + + + 6.1 OVER THE INITIAL SOFTWARE + +The Holder owns the economic rights over the Initial Software. 
Any or +all use of the Initial Software is subject to compliance with the terms +and conditions under which the Holder has elected to distribute its work +and no one shall be entitled to modify the terms and conditions for the +distribution of said Initial Software. + +The Holder undertakes that the Initial Software will remain ruled at +least by this Agreement, for the duration set forth in Article 4.2. + + + 6.2 OVER THE CONTRIBUTIONS + +The Licensee who develops a Contribution is the owner of the +intellectual property rights over this Contribution as defined by +applicable law. + + + 6.3 OVER THE EXTERNAL MODULES + +The Licensee who develops an External Module is the owner of the +intellectual property rights over this External Module as defined by +applicable law and is free to choose the type of agreement that shall +govern its distribution. + + + 6.4 JOINT PROVISIONS + +The Licensee expressly undertakes: + + 1. not to remove, or modify, in any manner, the intellectual property + notices attached to the Software; + + 2. to reproduce said notices, in an identical manner, in the copies + of the Software modified or not. + +The Licensee undertakes not to directly or indirectly infringe the +intellectual property rights of the Holder and/or Contributors on the +Software and to take, where applicable, vis-à-vis its staff, any and all +measures required to ensure respect of said intellectual property rights +of the Holder and/or Contributors. + + + Article 7 - RELATED SERVICES + +7.1 Under no circumstances shall the Agreement oblige the Licensor to +provide technical assistance or maintenance services for the Software. + +However, the Licensor is entitled to offer this type of services. The +terms and conditions of such technical assistance, and/or such +maintenance, shall be set forth in a separate instrument. Only the +Licensor offering said maintenance and/or technical assistance services +shall incur liability therefor.
+ +7.2 Similarly, any Licensor is entitled to offer to its licensees, under +its sole responsibility, a warranty, that shall only be binding upon +itself, for the redistribution of the Software and/or the Modified +Software, under terms and conditions that it is free to decide. Said +warranty, and the financial terms and conditions of its application, +shall be subject of a separate instrument executed between the Licensor +and the Licensee. + + + Article 8 - LIABILITY + +8.1 Subject to the provisions of Article 8.2, the Licensee shall be +entitled to claim compensation for any direct loss it may have suffered +from the Software as a result of a fault on the part of the relevant +Licensor, subject to providing evidence thereof. + +8.2 The Licensor's liability is limited to the commitments made under +this Agreement and shall not be incurred as a result of in particular: +(i) loss due the Licensee's total or partial failure to fulfill its +obligations, (ii) direct or consequential loss that is suffered by the +Licensee due to the use or performance of the Software, and (iii) more +generally, any consequential loss. In particular the Parties expressly +agree that any or all pecuniary or business loss (i.e. loss of data, +loss of profits, operating loss, loss of customers or orders, +opportunity cost, any disturbance to business activities) or any or all +legal proceedings instituted against the Licensee by a third party, +shall constitute consequential loss and shall not provide entitlement to +any or all compensation from the Licensor. + + + Article 9 - WARRANTY + +9.1 The Licensee acknowledges that the scientific and technical +state-of-the-art when the Software was distributed did not enable all +possible uses to be tested and verified, nor for the presence of +possible defects to be detected. 
In this respect, the Licensee's +attention has been drawn to the risks associated with loading, using, +modifying and/or developing and reproducing the Software which are +reserved for experienced users. + +The Licensee shall be responsible for verifying, by any or all means, +the suitability of the product for its requirements, its good working +order, and for ensuring that it shall not cause damage to either persons +or properties. + +9.2 The Licensor hereby represents, in good faith, that it is entitled +to grant all the rights over the Software (including in particular the +rights set forth in Article 5). + +9.3 The Licensee acknowledges that the Software is supplied "as is" by +the Licensor without any other express or tacit warranty, other than +that provided for in Article 9.2 and, in particular, without any warranty +as to its commercial value, its secured, safe, innovative or relevant +nature. + +Specifically, the Licensor does not warrant that the Software is free +from any error, that it will operate without interruption, that it will +be compatible with the Licensee's own equipment and software +configuration, nor that it will meet the Licensee's requirements. + +9.4 The Licensor does not either expressly or tacitly warrant that the +Software does not infringe any third party intellectual property right +relating to a patent, software or any other property right. Therefore, +the Licensor disclaims any and all liability towards the Licensee +arising out of any or all proceedings for infringement that may be +instituted in respect of the use, modification and redistribution of the +Software. Nevertheless, should such proceedings be instituted against +the Licensee, the Licensor shall provide it with technical and legal +assistance for its defense. Such technical and legal assistance shall be +decided on a case-by-case basis between the relevant Licensor and the +Licensee pursuant to a memorandum of understanding. 
The Licensor +disclaims any and all liability as regards the Licensee's use of the +name of the Software. No warranty is given as regards the existence of +prior rights over the name of the Software or as regards the existence +of a trademark. + + + Article 10 - TERMINATION + +10.1 In the event of a breach by the Licensee of its obligations +hereunder, the Licensor may automatically terminate this Agreement +thirty (30) days after notice has been sent to the Licensee and has +remained ineffective. + +10.2 A Licensee whose Agreement is terminated shall no longer be +authorized to use, modify or distribute the Software. However, any +licenses that it may have granted prior to termination of the Agreement +shall remain valid subject to their having been granted in compliance +with the terms and conditions hereof. + + + Article 11 - MISCELLANEOUS + + + 11.1 EXCUSABLE EVENTS + +Neither Party shall be liable for any or all delay, or failure to +perform the Agreement, that may be attributable to an event of force +majeure, an act of God or an outside cause, such as defective +functioning or interruptions of the electricity or telecommunications +networks, network paralysis following a virus attack, intervention by +government authorities, natural disasters, water damage, earthquakes, +fire, explosions, strikes and labor unrest, war, etc. + +11.2 Any failure by either Party, on one or more occasions, to invoke +one or more of the provisions hereof, shall under no circumstances be +interpreted as being a waiver by the interested Party of its right to +invoke said provision(s) subsequently. + +11.3 The Agreement cancels and replaces any or all previous agreements, +whether written or oral, between the Parties and having the same +purpose, and constitutes the entirety of the agreement between said +Parties concerning said purpose. 
No supplement or modification to the +terms and conditions hereof shall be effective as between the Parties +unless it is made in writing and signed by their duly authorized +representatives. + +11.4 In the event that one or more of the provisions hereof were to +conflict with a current or future applicable act or legislative text, +said act or legislative text shall prevail, and the Parties shall make +the necessary amendments so as to comply with said act or legislative +text. All other provisions shall remain effective. Similarly, invalidity +of a provision of the Agreement, for any reason whatsoever, shall not +cause the Agreement as a whole to be invalid. + + + 11.5 LANGUAGE + +The Agreement is drafted in both French and English and both versions +are deemed authentic. + + + Article 12 - NEW VERSIONS OF THE AGREEMENT + +12.1 Any person is authorized to duplicate and distribute copies of this +Agreement. + +12.2 So as to ensure coherence, the wording of this Agreement is +protected and may only be modified by the authors of the License, who +reserve the right to periodically publish updates or new versions of the +Agreement, each with a separate number. These subsequent versions may +address new issues encountered by Free Software. + +12.3 Any Software distributed under a given version of the Agreement may +only be subsequently distributed under the same version of the Agreement +or a subsequent version, subject to the provisions of Article 5.3.4. + + + Article 13 - GOVERNING LAW AND JURISDICTION + +13.1 The Agreement is governed by French law. The Parties agree to +endeavor to seek an amicable solution to any disagreements or disputes +that may arise during the performance of the Agreement. + +13.2 Failing an amicable solution within two (2) months as from their +occurrence, and unless emergency proceedings are necessary, the +disagreements or disputes shall be referred to the Paris Courts having +jurisdiction, by the more diligent Party. 
+ + +Version 2.0 dated 2006-09-05. diff --git a/src/sumatra/sumatra-1.0.10/Licence_CeCILL_V2-fr.txt b/src/sumatra/sumatra-1.0.10/Licence_CeCILL_V2-fr.txt new file mode 100644 index 0000000..1613fca --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/Licence_CeCILL_V2-fr.txt @@ -0,0 +1,512 @@ + +CONTRAT DE LICENCE DE LOGICIEL LIBRE CeCILL + + + Avertissement + +Ce contrat est une licence de logiciel libre issue d'une concertation +entre ses auteurs afin que le respect de deux grands principes préside à +sa rédaction: + + * d'une part, le respect des principes de diffusion des logiciels + libres: accès au code source, droits étendus conférés aux + utilisateurs, + * d'autre part, la désignation d'un droit applicable, le droit + français, auquel elle est conforme, tant au regard du droit de la + responsabilité civile que du droit de la propriété intellectuelle + et de la protection qu'il offre aux auteurs et titulaires des + droits patrimoniaux sur un logiciel. + +Les auteurs de la licence CeCILL (pour Ce[a] C[nrs] I[nria] L[ogiciel] +L[ibre]) sont: + +Commissariat à l'Energie Atomique - CEA, établissement public de +recherche à caractère scientifique, technique et industriel, dont le +siège est situé 25 rue Leblanc, immeuble Le Ponant D, 75015 Paris. + +Centre National de la Recherche Scientifique - CNRS, établissement +public à caractère scientifique et technologique, dont le siège est +situé 3 rue Michel-Ange, 75794 Paris cedex 16. + +Institut National de Recherche en Informatique et en Automatique - +INRIA, établissement public à caractère scientifique et technologique, +dont le siège est situé Domaine de Voluceau, Rocquencourt, BP 105, 78153 +Le Chesnay cedex. + + + Préambule + +Ce contrat est une licence de logiciel libre dont l'objectif est de +conférer aux utilisateurs la liberté de modification et de +redistribution du logiciel régi par cette licence dans le cadre d'un +modèle de diffusion en logiciel libre.
+ +L'exercice de ces libertés est assorti de certains devoirs à la charge +des utilisateurs afin de préserver ce statut au cours des +redistributions ultérieures. + +L'accessibilité au code source et les droits de copie, de modification +et de redistribution qui en découlent ont pour contrepartie de n'offrir +aux utilisateurs qu'une garantie limitée et de ne faire peser sur +l'auteur du logiciel, le titulaire des droits patrimoniaux et les +concédants successifs qu'une responsabilité restreinte. + +A cet égard l'attention de l'utilisateur est attirée sur les risques +associés au chargement, à l'utilisation, à la modification et/ou au +développement et à la reproduction du logiciel par l'utilisateur étant +donné sa spécificité de logiciel libre, qui peut le rendre complexe à +manipuler et qui le réserve donc à des développeurs ou des +professionnels avertis possédant des connaissances informatiques +approfondies. Les utilisateurs sont donc invités à charger et tester +l'adéquation du logiciel à leurs besoins dans des conditions permettant +d'assurer la sécurité de leurs systèmes et/ou de leurs données et, plus +généralement, à l'utiliser et l'exploiter dans les mêmes conditions de +sécurité. Ce contrat peut être reproduit et diffusé librement, sous +réserve de le conserver en l'état, sans ajout ni suppression de clauses. + +Ce contrat est susceptible de s'appliquer à tout logiciel dont le +titulaire des droits patrimoniaux décide de soumettre l'exploitation aux +dispositions qu'il contient. + + + Article 1 - DEFINITIONS + +Dans ce contrat, les termes suivants, lorsqu'ils seront écrits avec une +lettre capitale, auront la signification suivante: + +Contrat: désigne le présent contrat de licence, ses éventuelles versions +postérieures et annexes. + +Logiciel: désigne le logiciel sous sa forme de Code Objet et/ou de Code +Source et le cas échéant sa documentation, dans leur état au moment de +l'acceptation du Contrat par le Licencié.
+ +Logiciel Initial: désigne le Logiciel sous sa forme de Code Source et +éventuellement de Code Objet et le cas échéant sa documentation, dans +leur état au moment de leur première diffusion sous les termes du Contrat. + +Logiciel Modifié: désigne le Logiciel modifié par au moins une +Contribution. + +Code Source: désigne l'ensemble des instructions et des lignes de +programme du Logiciel et auquel l'accès est nécessaire en vue de +modifier le Logiciel. + +Code Objet: désigne les fichiers binaires issus de la compilation du +Code Source. + +Titulaire: désigne le ou les détenteurs des droits patrimoniaux d'auteur +sur le Logiciel Initial. + +Licencié: désigne le ou les utilisateurs du Logiciel ayant accepté le +Contrat. + +Contributeur: désigne le Licencié auteur d'au moins une Contribution. + +Concédant: désigne le Titulaire ou toute personne physique ou morale +distribuant le Logiciel sous le Contrat. + +Contribution: désigne l'ensemble des modifications, corrections, +traductions, adaptations et/ou nouvelles fonctionnalités intégrées dans +le Logiciel par tout Contributeur, ainsi que tout Module Interne. + +Module: désigne un ensemble de fichiers sources y compris leur +documentation qui permet de réaliser des fonctionnalités ou services +supplémentaires à ceux fournis par le Logiciel. + +Module Externe: désigne tout Module, non dérivé du Logiciel, tel que ce +Module et le Logiciel s'exécutent dans des espaces d'adressage +différents, l'un appelant l'autre au moment de leur exécution. + +Module Interne: désigne tout Module lié au Logiciel de telle sorte +qu'ils s'exécutent dans le même espace d'adressage. + +GNU GPL: désigne la GNU General Public License dans sa version 2 ou +toute version ultérieure, telle que publiée par Free Software Foundation +Inc. + +Parties: désigne collectivement le Licencié et le Concédant. + +Ces termes s'entendent au singulier comme au pluriel.
+ + + Article 2 - OBJET + +Le Contrat a pour objet la concession par le Concédant au Licencié d'une +licence non exclusive, cessible et mondiale du Logiciel telle que +définie ci-après à l'article 5 pour toute la durée de protection des droits +portant sur ce Logiciel. + + + Article 3 - ACCEPTATION + +3.1 L'acceptation par le Licencié des termes du Contrat est réputée +acquise du fait du premier des faits suivants: + + * (i) le chargement du Logiciel par tout moyen notamment par + téléchargement à partir d'un serveur distant ou par chargement à + partir d'un support physique; + * (ii) le premier exercice par le Licencié de l'un quelconque des + droits concédés par le Contrat. + +3.2 Un exemplaire du Contrat, contenant notamment un avertissement +relatif aux spécificités du Logiciel, à la restriction de garantie et à +la limitation à un usage par des utilisateurs expérimentés a été mis à +disposition du Licencié préalablement à son acceptation telle que +définie à l'article 3.1 ci dessus et le Licencié reconnaît en avoir pris +connaissance. + + + Article 4 - ENTREE EN VIGUEUR ET DUREE + + + 4.1 ENTREE EN VIGUEUR + +Le Contrat entre en vigueur à la date de son acceptation par le Licencié +telle que définie en 3.1. + + + 4.2 DUREE + +Le Contrat produira ses effets pendant toute la durée légale de +protection des droits patrimoniaux portant sur le Logiciel. + + + Article 5 - ETENDUE DES DROITS CONCEDES + +Le Concédant concède au Licencié, qui accepte, les droits suivants sur +le Logiciel pour toutes destinations et pour la durée du Contrat dans +les conditions ci-après détaillées. + +Par ailleurs, si le Concédant détient ou venait à détenir un ou +plusieurs brevets d'invention protégeant tout ou partie des +fonctionnalités du Logiciel ou de ses composants, il s'engage à ne pas +opposer les éventuels droits conférés par ces brevets aux Licenciés +successifs qui utiliseraient, exploiteraient ou modifieraient le +Logiciel.
En cas de cession de ces brevets, le Concédant s'engage à +faire reprendre les obligations du présent alinéa aux cessionnaires. + + + 5.1 DROIT D'UTILISATION + +Le Licencié est autorisé à utiliser le Logiciel, sans restriction quant +aux domaines d'application, étant ci-après précisé que cela comporte: + + 1. la reproduction permanente ou provisoire du Logiciel en tout ou + partie par tout moyen et sous toute forme. + + 2. le chargement, l'affichage, l'exécution, ou le stockage du + Logiciel sur tout support. + + 3. la possibilité d'en observer, d'en étudier, ou d'en tester le + fonctionnement afin de déterminer les idées et principes qui sont + à la base de n'importe quel élément de ce Logiciel; et ceci, + lorsque le Licencié effectue toute opération de chargement, + d'affichage, d'exécution, de transmission ou de stockage du + Logiciel qu'il est en droit d'effectuer en vertu du Contrat. + + + 5.2 DROIT D'APPORTER DES CONTRIBUTIONS + +Le droit d'apporter des Contributions comporte le droit de traduire, +d'adapter, d'arranger ou d'apporter toute autre modification au Logiciel +et le droit de reproduire le logiciel en résultant. + +Le Licencié est autorisé à apporter toute Contribution au Logiciel sous +réserve de mentionner, de façon explicite, son nom en tant qu'auteur de +cette Contribution et la date de création de celle-ci. + + + 5.3 DROIT DE DISTRIBUTION + +Le droit de distribution comporte notamment le droit de diffuser, de +transmettre et de communiquer le Logiciel au public sur tout support et +par tout moyen ainsi que le droit de mettre sur le marché à titre +onéreux ou gratuit, un ou des exemplaires du Logiciel par tout procédé. + +Le Licencié est autorisé à distribuer des copies du Logiciel, modifié ou +non, à des tiers dans les conditions ci-après détaillées.
+ + + 5.3.1 DISTRIBUTION DU LOGICIEL SANS MODIFICATION + +Le Licencié est autorisé à distribuer des copies conformes du Logiciel, +sous forme de Code Source ou de Code Objet, à condition que cette +distribution respecte les dispositions du Contrat dans leur totalité et +soit accompagnée: + + 1. d'un exemplaire du Contrat, + + 2. d'un avertissement relatif à la restriction de garantie et de + responsabilité du Concédant telle que prévue aux articles 8 + et 9, + +et que, dans le cas où seul le Code Objet du Logiciel est redistribué, +le Licencié permette aux futurs Licenciés d'accéder facilement au Code +Source complet du Logiciel en indiquant les modalités d'accès, étant +entendu que le coût additionnel d'acquisition du Code Source ne devra +pas excéder le simple coût de transfert des données. + + + 5.3.2 DISTRIBUTION DU LOGICIEL MODIFIE + +Lorsque le Licencié apporte une Contribution au Logiciel, les conditions +de distribution du Logiciel Modifié en résultant sont alors soumises à +l'intégralité des dispositions du Contrat. + +Le Licencié est autorisé à distribuer le Logiciel Modifié, sous forme de +code source ou de code objet, à condition que cette distribution +respecte les dispositions du Contrat dans leur totalité et soit +accompagnée: + + 1. d'un exemplaire du Contrat, + + 2. d'un avertissement relatif à la restriction de garantie et de + responsabilité du Concédant telle que prévue aux articles 8 + et 9, + +et que, dans le cas où seul le code objet du Logiciel Modifié est +redistribué, le Licencié permette aux futurs Licenciés d'accéder +facilement au code source complet du Logiciel Modifié en indiquant les +modalités d'accès, étant entendu que le coût additionnel d'acquisition +du code source ne devra pas excéder le simple coût de transfert des données.
+ + + 5.3.3 DISTRIBUTION DES MODULES EXTERNES + +Lorsque le Licencié a développé un Module Externe les conditions du +Contrat ne s'appliquent pas à ce Module Externe, qui peut être distribué +sous un contrat de licence différent. + + + 5.3.4 COMPATIBILITE AVEC LA LICENCE GNU GPL + +Le Licencié peut inclure un code soumis aux dispositions d'une des +versions de la licence GNU GPL dans le Logiciel modifié ou non et +distribuer l'ensemble sous les conditions de la même version de la +licence GNU GPL. + +Le Licencié peut inclure le Logiciel modifié ou non dans un code soumis +aux dispositions d'une des versions de la licence GNU GPL et distribuer +l'ensemble sous les conditions de la même version de la licence GNU GPL. + + + Article 6 - PROPRIETE INTELLECTUELLE + + + 6.1 SUR LE LOGICIEL INITIAL + +Le Titulaire est détenteur des droits patrimoniaux sur le Logiciel +Initial. Toute utilisation du Logiciel Initial est soumise au respect +des conditions dans lesquelles le Titulaire a choisi de diffuser son +oeuvre et nul autre n'a la faculté de modifier les conditions de +diffusion de ce Logiciel Initial. + +Le Titulaire s'engage à ce que le Logiciel Initial reste au moins régi +par le Contrat et ce, pour la durée visée à l'article 4.2. + + + 6.2 SUR LES CONTRIBUTIONS + +Le Licencié qui a développé une Contribution est titulaire sur celle-ci +des droits de propriété intellectuelle dans les conditions définies par +la législation applicable. + + + 6.3 SUR LES MODULES EXTERNES + +Le Licencié qui a développé un Module Externe est titulaire sur celui-ci +des droits de propriété intellectuelle dans les conditions définies par +la législation applicable et reste libre du choix du contrat régissant +sa diffusion. + + + 6.4 DISPOSITIONS COMMUNES + +Le Licencié s'engage expressément: + + 1. à ne pas supprimer ou modifier de quelque manière que ce soit les + mentions de propriété intellectuelle apposées sur le Logiciel; + + 2.
à reproduire à l'identique lesdites mentions de propriété + intellectuelle sur les copies du Logiciel modifié ou non. + +Le Licencié s'engage à ne pas porter atteinte, directement ou +indirectement, aux droits de propriété intellectuelle du Titulaire et/ou +des Contributeurs sur le Logiciel et à prendre, le cas échéant, à +l'égard de son personnel toutes les mesures nécessaires pour assurer le +respect des dits droits de propriété intellectuelle du Titulaire et/ou +des Contributeurs. + + + Article 7 - SERVICES ASSOCIES + +7.1 Le Contrat n'oblige en aucun cas le Concédant à la réalisation de +prestations d'assistance technique ou de maintenance du Logiciel. + +Cependant le Concédant reste libre de proposer ce type de services. Les +termes et conditions d'une telle assistance technique et/ou d'une telle +maintenance seront alors déterminés dans un acte séparé. Ces actes de +maintenance et/ou assistance technique n'engageront que la seule +responsabilité du Concédant qui les propose. + +7.2 De même, tout Concédant est libre de proposer, sous sa seule +responsabilité, à ses licenciés une garantie, qui n'engagera que lui, +lors de la redistribution du Logiciel et/ou du Logiciel Modifié et ce, +dans les conditions qu'il souhaite. Cette garantie et les modalités +financières de son application feront l'objet d'un acte séparé entre le +Concédant et le Licencié. + + + Article 8 - RESPONSABILITE + +8.1 Sous réserve des dispositions de l'article 8.2, le Licencié a la +faculté, sous réserve de prouver la faute du Concédant concerné, de +solliciter la réparation du préjudice direct qu'il subirait du fait du +Logiciel et dont il apportera la preuve.
+ +8.2 La responsabilité du Concédant est limitée aux engagements pris en +application du Contrat et ne saurait être engagée en raison notamment: +(i) des dommages dus à l'inexécution, totale ou partielle, de ses +obligations par le Licencié, (ii) des dommages directs ou indirects +découlant de l'utilisation ou des performances du Logiciel subis par le +Licencié et (iii) plus généralement d'un quelconque dommage indirect. En +particulier, les Parties conviennent expressément que tout préjudice +financier ou commercial (par exemple perte de données, perte de +bénéfices, perte d'exploitation, perte de clientèle ou de commandes, +manque à gagner, trouble commercial quelconque) ou toute action dirigée +contre le Licencié par un tiers, constitue un dommage indirect et +n'ouvre pas droit à réparation par le Concédant. + + + Article 9 - GARANTIE + +9.1 Le Licencié reconnaît que l'état actuel des connaissances +scientifiques et techniques au moment de la mise en circulation du +Logiciel ne permet pas d'en tester et d'en vérifier toutes les +utilisations ni de détecter l'existence d'éventuels défauts. L'attention +du Licencié a été attirée sur ce point sur les risques associés au +chargement, à l'utilisation, la modification et/ou au développement et à +la reproduction du Logiciel qui sont réservés à des utilisateurs avertis. + +Il relève de la responsabilité du Licencié de contrôler, par tous +moyens, l'adéquation du produit à ses besoins, son bon fonctionnement et +de s'assurer qu'il ne causera pas de dommages aux personnes et aux biens. + +9.2 Le Concédant déclare de bonne foi être en droit de concéder +l'ensemble des droits attachés au Logiciel (comprenant notamment les +droits visés à l'article 5). + +9.3 Le Licencié reconnaît que le Logiciel est fourni "en l'état" par le +Concédant sans autre garantie, expresse ou tacite, que celle prévue à +l'article 9.2 et notamment sans aucune garantie sur sa valeur commerciale, +son caractère sécurisé, innovant ou pertinent.
+ +En particulier, le Concédant ne garantit pas que le Logiciel est exempt +d'erreur, qu'il fonctionnera sans interruption, qu'il sera compatible +avec l'équipement du Licencié et sa configuration logicielle ni qu'il +remplira les besoins du Licencié. + +9.4 Le Concédant ne garantit pas, de manière expresse ou tacite, que le +Logiciel ne porte pas atteinte à un quelconque droit de propriété +intellectuelle d'un tiers portant sur un brevet, un logiciel ou sur tout +autre droit de propriété. Ainsi, le Concédant exclut toute garantie au +profit du Licencié contre les actions en contrefaçon qui pourraient être +diligentées au titre de l'utilisation, de la modification, et de la +redistribution du Logiciel. Néanmoins, si de telles actions sont +exercées contre le Licencié, le Concédant lui apportera son aide +technique et juridique pour sa défense. Cette aide technique et +juridique est déterminée au cas par cas entre le Concédant concerné et +le Licencié dans le cadre d'un protocole d'accord. Le Concédant dégage +toute responsabilité quant à l'utilisation de la dénomination du +Logiciel par le Licencié. Aucune garantie n'est apportée quant à +l'existence de droits antérieurs sur le nom du Logiciel et sur +l'existence d'une marque. + + + Article 10 - RESILIATION + +10.1 En cas de manquement par le Licencié aux obligations mises à sa +charge par le Contrat, le Concédant pourra résilier de plein droit le +Contrat trente (30) jours après notification adressée au Licencié et +restée sans effet. + +10.2 Le Licencié dont le Contrat est résilié n'est plus autorisé à +utiliser, modifier ou distribuer le Logiciel. Cependant, toutes les +licences qu'il aura concédées antérieurement à la résiliation du Contrat +resteront valides sous réserve qu'elles aient été effectuées en +conformité avec le Contrat.
+ + + Article 11 - DISPOSITIONS DIVERSES + + + 11.1 CAUSE EXTERIEURE + +Aucune des Parties ne sera responsable d'un retard ou d'une défaillance +d'exécution du Contrat qui serait dû à un cas de force majeure, un cas +fortuit ou une cause extérieure, telle que, notamment, le mauvais +fonctionnement ou les interruptions du réseau électrique ou de +télécommunication, la paralysie du réseau liée à une attaque +informatique, l'intervention des autorités gouvernementales, les +catastrophes naturelles, les dégâts des eaux, les tremblements de terre, +le feu, les explosions, les grèves et les conflits sociaux, l'état de +guerre... + +11.2 Le fait, par l'une ou l'autre des Parties, d'omettre en une ou +plusieurs occasions de se prévaloir d'une ou plusieurs dispositions du +Contrat, ne pourra en aucun cas impliquer renonciation par la Partie +intéressée à s'en prévaloir ultérieurement. + +11.3 Le Contrat annule et remplace toute convention antérieure, écrite +ou orale, entre les Parties sur le même objet et constitue l'accord +entier entre les Parties sur cet objet. Aucune addition ou modification +aux termes du Contrat n'aura d'effet à l'égard des Parties à moins +d'être faite par écrit et signée par leurs représentants dûment habilités. + +11.4 Dans l'hypothèse où une ou plusieurs des dispositions du Contrat +s'avèrerait contraire à une loi ou à un texte applicable, existants ou +futurs, cette loi ou ce texte prévaudrait, et les Parties feraient les +amendements nécessaires pour se conformer à cette loi ou à ce texte. +Toutes les autres dispositions resteront en vigueur. De même, la +nullité, pour quelque raison que ce soit, d'une des dispositions du +Contrat ne saurait entraîner la nullité de l'ensemble du Contrat. + + + 11.5 LANGUE + +Le Contrat est rédigé en langue française et en langue anglaise, ces +deux versions faisant également foi.
+ + + Article 12 - NOUVELLES VERSIONS DU CONTRAT + +12.1 Toute personne est autorisée à copier et distribuer des copies de +ce Contrat. + +12.2 Afin d'en préserver la cohérence, le texte du Contrat est protégé +et ne peut être modifié que par les auteurs de la licence, lesquels se +réservent le droit de publier périodiquement des mises à jour ou de +nouvelles versions du Contrat, qui posséderont chacune un numéro +distinct. Ces versions ultérieures seront susceptibles de prendre en +compte de nouvelles problématiques rencontrées par les logiciels libres. + +12.3 Tout Logiciel diffusé sous une version donnée du Contrat ne pourra +faire l'objet d'une diffusion ultérieure que sous la même version du +Contrat ou une version postérieure, sous réserve des dispositions de +l'article 5.3.4. + + + Article 13 - LOI APPLICABLE ET COMPETENCE TERRITORIALE + +13.1 Le Contrat est régi par la loi française. Les Parties conviennent +de tenter de régler à l'amiable les différends ou litiges qui +viendraient à se produire par suite ou à l'occasion du Contrat. + +13.2 A défaut d'accord amiable dans un délai de deux (2) mois à compter +de leur survenance et sauf situation relevant d'une procédure d'urgence, +les différends ou litiges seront portés par la Partie la plus diligente +devant les Tribunaux compétents de Paris. + + +Version 2.0 du 2006-09-05.
diff --git a/src/sumatra-1.0.10/Makefile b/src/sumatra/sumatra-1.0.10/Makefile similarity index 72% rename from src/sumatra-1.0.10/Makefile rename to src/sumatra/sumatra-1.0.10/Makefile index 8736691..9f43190 100644 --- a/src/sumatra-1.0.10/Makefile +++ b/src/sumatra/sumatra-1.0.10/Makefile @@ -1,3 +1,5 @@ + + EXEC=sumatra SUMATRA_SRC= sumatra.c \ @@ -8,11 +10,11 @@ SUMATRA_OBJ= $(patsubst %.c,%.o,$(SUMATRA_SRC)) SRCS= $(SUMATRA_SRC) -LIB= -lfasta -llcs -lfile -lutils -lz -lm +LIB= -lfasta -llcs -lfile -lutils -lz -lm -lpthread include ./global.mk -all: $(EXEC) +all: $(EXEC) install ######## @@ -24,7 +26,7 @@ all: $(EXEC) # executable compilation and link sumatra: $(SUMATRA_OBJ) $(LIBFASTA) $(LIBLCS) $(LIBFILE) $(LIBUTILS) - $(CC) $(LDFLAGS) -o $@ -pthread $(SUMATRA_OBJ) $(LIBFASTAPATH) $(LIBLCSPATH) $(LIBFILEPATH) $(LIBUTILSPATH) $(LIB) + $(CC) $(LIBFASTAPATH) $(LIBLCSPATH) $(LIBFILEPATH) $(LIBUTILSPATH) $(LDFLAGS) -o $@ $(SUMATRA_OBJ) $(LIB) ######## # @@ -41,4 +43,5 @@ clean: $(MAKE) -C ./sumalibs/libfile clean $(MAKE) -C ./sumalibs/libutils clean - +install: + cp $(EXEC) $(BINDIR) diff --git a/src/sumatra/sumatra-1.0.10/global.mk b/src/sumatra/sumatra-1.0.10/global.mk new file mode 100644 index 0000000..303ca46 --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/global.mk @@ -0,0 +1,45 @@ +include ../../../config/auto.conf + +LIBFASTAPATH = -L./sumalibs/libfasta +LIBLCSPATH = -L./sumalibs/liblcs +LIBFILEPATH = -L./sumalibs/libfile +LIBUTILSPATH = -L./sumalibs/libutils + +LIBFASTA = ./sumalibs/libfasta/libfasta.a +LIBLCS = ./sumalibs/liblcs/liblcs.a +LIBFILE = ./sumalibs/libfile/libfile.a +LIBUTILS = ./sumalibs/libutils/libutils.a + + + +#ifeq ($(CC),gcc) +# CFLAGS = -O3 -s -DOMP_SUPPORT -fopenmp -w +#else +# CFLAGS = -O3 -w +#endif + + +default: all + +%.o: %.c + $(CC) $(CFLAGS) -c -o $@ $< + + +######## +# +# libraries compilation +# +######## + +./sumalibs/libfasta/libfasta.a: + $(MAKE) -C ./sumalibs/libfasta + +./sumalibs/liblcs/liblcs.a: + $(MAKE) -C 
./sumalibs/liblcs + +./sumalibs/libfile/libfile.a: + $(MAKE) -C ./sumalibs/libfile + +./sumalibs/libutils/libutils.a: + $(MAKE) -C ./sumalibs/libutils + diff --git a/src/sumatra-1.0.10/mtcompare_sumatra.c b/src/sumatra/sumatra-1.0.10/mtcompare_sumatra.c similarity index 100% rename from src/sumatra-1.0.10/mtcompare_sumatra.c rename to src/sumatra/sumatra-1.0.10/mtcompare_sumatra.c diff --git a/src/sumatra-1.0.10/mtcompare_sumatra.h b/src/sumatra/sumatra-1.0.10/mtcompare_sumatra.h similarity index 100% rename from src/sumatra-1.0.10/mtcompare_sumatra.h rename to src/sumatra/sumatra-1.0.10/mtcompare_sumatra.h diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/Licence_CeCILL_V2-en.txt b/src/sumatra/sumatra-1.0.10/sumalibs/Licence_CeCILL_V2-en.txt new file mode 100644 index 0000000..fcc8df2 --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/Licence_CeCILL_V2-en.txt @@ -0,0 +1,506 @@ + +CeCILL FREE SOFTWARE LICENSE AGREEMENT + + + Notice + +This Agreement is a Free Software license agreement that is the result +of discussions between its authors in order to ensure compliance with +the two main principles guiding its drafting: + + * firstly, compliance with the principles governing the distribution + of Free Software: access to source code, broad rights granted to + users, + * secondly, the election of a governing law, French law, with which + it is conformant, both as regards the law of torts and + intellectual property law, and the protection that it offers to + both authors and holders of the economic rights over software. + +The authors of the CeCILL (for Ce[a] C[nrs] I[nria] L[ogiciel] L[ibre]) +license are: + +Commissariat à l'Energie Atomique - CEA, a public scientific, technical +and industrial research establishment, having its principal place of +business at 25 rue Leblanc, immeuble Le Ponant D, 75015 Paris, France.
+ +Centre National de la Recherche Scientifique - CNRS, a public scientific +and technological establishment, having its principal place of business +at 3 rue Michel-Ange, 75794 Paris cedex 16, France. + +Institut National de Recherche en Informatique et en Automatique - +INRIA, a public scientific and technological establishment, having its +principal place of business at Domaine de Voluceau, Rocquencourt, BP +105, 78153 Le Chesnay cedex, France. + + + Preamble + +The purpose of this Free Software license agreement is to grant users +the right to modify and redistribute the software governed by this +license within the framework of an open source distribution model. + +The exercising of these rights is conditional upon certain obligations +for users so as to preserve this status for all subsequent redistributions. + +In consideration of access to the source code and the rights to copy, +modify and redistribute granted by the license, users are provided only +with a limited warranty and the software's author, the holder of the +economic rights, and the successive licensors only have limited liability. + +In this respect, the risks associated with loading, using, modifying +and/or developing or reproducing the software by the user are brought to +the user's attention, given its Free Software status, which may make it +complicated to use, with the result that its use is reserved for +developers and experienced professionals having in-depth computer +knowledge. Users are therefore encouraged to load and test the +suitability of the software as regards their requirements in conditions +enabling the security of their systems and/or data to be ensured and, +more generally, to use and operate it in the same conditions of +security. This Agreement may be freely reproduced and published, +provided it is not altered, and that no provisions are either added or +removed herefrom. 
+ +This Agreement may apply to any or all software for which the holder of +the economic rights decides to submit the use thereof to its provisions. + + + Article 1 - DEFINITIONS + +For the purpose of this Agreement, when the following expressions +commence with a capital letter, they shall have the following meaning: + +Agreement: means this license agreement, and its possible subsequent +versions and annexes. + +Software: means the software in its Object Code and/or Source Code form +and, where applicable, its documentation, "as is" when the Licensee +accepts the Agreement. + +Initial Software: means the Software in its Source Code and possibly its +Object Code form and, where applicable, its documentation, "as is" when +it is first distributed under the terms and conditions of the Agreement. + +Modified Software: means the Software modified by at least one +Contribution. + +Source Code: means all the Software's instructions and program lines to +which access is required so as to modify the Software. + +Object Code: means the binary files originating from the compilation of +the Source Code. + +Holder: means the holder(s) of the economic rights over the Initial +Software. + +Licensee: means the Software user(s) having accepted the Agreement. + +Contributor: means a Licensee having made at least one Contribution. + +Licensor: means the Holder, or any other individual or legal entity, who +distributes the Software under the Agreement. + +Contribution: means any or all modifications, corrections, translations, +adaptations and/or new functions integrated into the Software by any or +all Contributors, as well as any or all Internal Modules. + +Module: means a set of sources files including their documentation that +enables supplementary functions or services in addition to those offered +by the Software. 
+ +External Module: means any or all Modules, not derived from the +Software, so that this Module and the Software run in separate address +spaces, with one calling the other when they are run. + +Internal Module: means any or all Module, connected to the Software so +that they both execute in the same address space. + +GNU GPL: means the GNU General Public License version 2 or any +subsequent version, as published by the Free Software Foundation Inc. + +Parties: mean both the Licensee and the Licensor. + +These expressions may be used both in singular and plural form. + + + Article 2 - PURPOSE + +The purpose of the Agreement is the grant by the Licensor to the +Licensee of a non-exclusive, transferable and worldwide license for the +Software as set forth in Article 5 hereinafter for the whole term of the +protection granted by the rights over said Software. + + + Article 3 - ACCEPTANCE + +3.1 The Licensee shall be deemed as having accepted the terms and +conditions of this Agreement upon the occurrence of the first of the +following events: + + * (i) loading the Software by any or all means, notably, by + downloading from a remote server, or by loading from a physical + medium; + * (ii) the first time the Licensee exercises any of the rights + granted hereunder. + +3.2 One copy of the Agreement, containing a notice relating to the +characteristics of the Software, to the limited warranty, and to the +fact that its use is restricted to experienced users has been provided +to the Licensee prior to its acceptance as set forth in Article 3.1 +hereinabove, and the Licensee hereby acknowledges that it has read and +understood it. + + + Article 4 - EFFECTIVE DATE AND TERM + + + 4.1 EFFECTIVE DATE + +The Agreement shall become effective on the date when it is accepted by +the Licensee as set forth in Article 3.1. + + + 4.2 TERM + +The Agreement shall remain in force for the entire legal term of +protection of the economic rights over the Software. 
+ + + Article 5 - SCOPE OF RIGHTS GRANTED + +The Licensor hereby grants to the Licensee, who accepts, the following +rights over the Software for any or all use, and for the term of the +Agreement, on the basis of the terms and conditions set forth hereinafter. + +Besides, if the Licensor owns or comes to own one or more patents +protecting all or part of the functions of the Software or of its +components, the Licensor undertakes not to enforce the rights granted by +these patents against successive Licensees using, exploiting or +modifying the Software. If these patents are transferred, the Licensor +undertakes to have the transferees subscribe to the obligations set +forth in this paragraph. + + + 5.1 RIGHT OF USE + +The Licensee is authorized to use the Software, without any limitation +as to its fields of application, with it being hereinafter specified +that this comprises: + + 1. permanent or temporary reproduction of all or part of the Software + by any or all means and in any or all form. + + 2. loading, displaying, running, or storing the Software on any or + all medium. + + 3. entitlement to observe, study or test its operation so as to + determine the ideas and principles behind any or all constituent + elements of said Software. This shall apply when the Licensee + carries out any or all loading, displaying, running, transmission + or storage operation as regards the Software, that it is entitled + to carry out hereunder. + + + 5.2 ENTITLEMENT TO MAKE CONTRIBUTIONS + +The right to make Contributions includes the right to translate, adapt, +arrange, or make any or all modifications to the Software, and the right +to reproduce the resulting software. + +The Licensee is authorized to make any or all Contributions to the +Software provided that it includes an explicit notice that it is the +author of said Contribution and indicates the date of the creation thereof. 
+ + + 5.3 RIGHT OF DISTRIBUTION + +In particular, the right of distribution includes the right to publish, +transmit and communicate the Software to the general public on any or +all medium, and by any or all means, and the right to market, either in +consideration of a fee, or free of charge, one or more copies of the +Software by any means. + +The Licensee is further authorized to distribute copies of the modified +or unmodified Software to third parties according to the terms and +conditions set forth hereinafter. + + + 5.3.1 DISTRIBUTION OF SOFTWARE WITHOUT MODIFICATION + +The Licensee is authorized to distribute true copies of the Software in +Source Code or Object Code form, provided that said distribution +complies with all the provisions of the Agreement and is accompanied by: + + 1. a copy of the Agreement, + + 2. a notice relating to the limitation of both the Licensor's + warranty and liability as set forth in Articles 8 and 9, + +and that, in the event that only the Object Code of the Software is +redistributed, the Licensee allows future Licensees unhindered access to +the full Source Code of the Software by indicating how to access it, it +being understood that the additional cost of acquiring the Source Code +shall not exceed the cost of transferring the data. + + + 5.3.2 DISTRIBUTION OF MODIFIED SOFTWARE + +When the Licensee makes a Contribution to the Software, the terms and +conditions for the distribution of the resulting Modified Software +become subject to all the provisions of this Agreement. + +The Licensee is authorized to distribute the Modified Software, in +source code or object code form, provided that said distribution +complies with all the provisions of the Agreement and is accompanied by: + + 1. a copy of the Agreement, + + 2. 
a notice relating to the limitation of both the Licensor's + warranty and liability as set forth in Articles 8 and 9, + +and that, in the event that only the object code of the Modified +Software is redistributed, the Licensee allows future Licensees +unhindered access to the full source code of the Modified Software by +indicating how to access it, it being understood that the additional +cost of acquiring the source code shall not exceed the cost of +transferring the data. + + + 5.3.3 DISTRIBUTION OF EXTERNAL MODULES + +When the Licensee has developed an External Module, the terms and +conditions of this Agreement do not apply to said External Module, that +may be distributed under a separate license agreement. + + + 5.3.4 COMPATIBILITY WITH THE GNU GPL + +The Licensee can include a code that is subject to the provisions of one +of the versions of the GNU GPL in the Modified or unmodified Software, +and distribute that entire code under the terms of the same version of +the GNU GPL. + +The Licensee can include the Modified or unmodified Software in a code +that is subject to the provisions of one of the versions of the GNU GPL, +and distribute that entire code under the terms of the same version of +the GNU GPL. + + + Article 6 - INTELLECTUAL PROPERTY + + + 6.1 OVER THE INITIAL SOFTWARE + +The Holder owns the economic rights over the Initial Software. Any or +all use of the Initial Software is subject to compliance with the terms +and conditions under which the Holder has elected to distribute its work +and no one shall be entitled to modify the terms and conditions for the +distribution of said Initial Software. + +The Holder undertakes that the Initial Software will remain ruled at +least by this Agreement, for the duration set forth in Article 4.2. + + + 6.2 OVER THE CONTRIBUTIONS + +The Licensee who develops a Contribution is the owner of the +intellectual property rights over this Contribution as defined by +applicable law. 
+ + + 6.3 OVER THE EXTERNAL MODULES + +The Licensee who develops an External Module is the owner of the +intellectual property rights over this External Module as defined by +applicable law and is free to choose the type of agreement that shall +govern its distribution. + + + 6.4 JOINT PROVISIONS + +The Licensee expressly undertakes: + + 1. not to remove, or modify, in any manner, the intellectual property + notices attached to the Software; + + 2. to reproduce said notices, in an identical manner, in the copies + of the Software modified or not. + +The Licensee undertakes not to directly or indirectly infringe the +intellectual property rights of the Holder and/or Contributors on the +Software and to take, where applicable, vis-à-vis its staff, any and all +measures required to ensure respect of said intellectual property rights +of the Holder and/or Contributors. + + + Article 7 - RELATED SERVICES + +7.1 Under no circumstances shall the Agreement oblige the Licensor to +provide technical assistance or maintenance services for the Software. + +However, the Licensor is entitled to offer this type of services. The +terms and conditions of such technical assistance, and/or such +maintenance, shall be set forth in a separate instrument. Only the +Licensor offering said maintenance and/or technical assistance services +shall incur liability therefor. + +7.2 Similarly, any Licensor is entitled to offer to its licensees, under +its sole responsibility, a warranty, that shall only be binding upon +itself, for the redistribution of the Software and/or the Modified +Software, under terms and conditions that it is free to decide. Said +warranty, and the financial terms and conditions of its application, +shall be subject of a separate instrument executed between the Licensor +and the Licensee.
+ + + Article 8 - LIABILITY + +8.1 Subject to the provisions of Article 8.2, the Licensee shall be +entitled to claim compensation for any direct loss it may have suffered +from the Software as a result of a fault on the part of the relevant +Licensor, subject to providing evidence thereof. + +8.2 The Licensor's liability is limited to the commitments made under +this Agreement and shall not be incurred as a result of in particular: +(i) loss due the Licensee's total or partial failure to fulfill its +obligations, (ii) direct or consequential loss that is suffered by the +Licensee due to the use or performance of the Software, and (iii) more +generally, any consequential loss. In particular the Parties expressly +agree that any or all pecuniary or business loss (i.e. loss of data, +loss of profits, operating loss, loss of customers or orders, +opportunity cost, any disturbance to business activities) or any or all +legal proceedings instituted against the Licensee by a third party, +shall constitute consequential loss and shall not provide entitlement to +any or all compensation from the Licensor. + + + Article 9 - WARRANTY + +9.1 The Licensee acknowledges that the scientific and technical +state-of-the-art when the Software was distributed did not enable all +possible uses to be tested and verified, nor for the presence of +possible defects to be detected. In this respect, the Licensee's +attention has been drawn to the risks associated with loading, using, +modifying and/or developing and reproducing the Software which are +reserved for experienced users. + +The Licensee shall be responsible for verifying, by any or all means, +the suitability of the product for its requirements, its good working +order, and for ensuring that it shall not cause damage to either persons +or properties. + +9.2 The Licensor hereby represents, in good faith, that it is entitled +to grant all the rights over the Software (including in particular the +rights set forth in Article 5). 
+ +9.3 The Licensee acknowledges that the Software is supplied "as is" by +the Licensor without any other express or tacit warranty, other than +that provided for in Article 9.2 and, in particular, without any warranty +as to its commercial value, its secured, safe, innovative or relevant +nature. + +Specifically, the Licensor does not warrant that the Software is free +from any error, that it will operate without interruption, that it will +be compatible with the Licensee's own equipment and software +configuration, nor that it will meet the Licensee's requirements. + +9.4 The Licensor does not either expressly or tacitly warrant that the +Software does not infringe any third party intellectual property right +relating to a patent, software or any other property right. Therefore, +the Licensor disclaims any and all liability towards the Licensee +arising out of any or all proceedings for infringement that may be +instituted in respect of the use, modification and redistribution of the +Software. Nevertheless, should such proceedings be instituted against +the Licensee, the Licensor shall provide it with technical and legal +assistance for its defense. Such technical and legal assistance shall be +decided on a case-by-case basis between the relevant Licensor and the +Licensee pursuant to a memorandum of understanding. The Licensor +disclaims any and all liability as regards the Licensee's use of the +name of the Software. No warranty is given as regards the existence of +prior rights over the name of the Software or as regards the existence +of a trademark. + + + Article 10 - TERMINATION + +10.1 In the event of a breach by the Licensee of its obligations +hereunder, the Licensor may automatically terminate this Agreement +thirty (30) days after notice has been sent to the Licensee and has +remained ineffective. + +10.2 A Licensee whose Agreement is terminated shall no longer be +authorized to use, modify or distribute the Software. 
However, any +licenses that it may have granted prior to termination of the Agreement +shall remain valid subject to their having been granted in compliance +with the terms and conditions hereof. + + + Article 11 - MISCELLANEOUS + + + 11.1 EXCUSABLE EVENTS + +Neither Party shall be liable for any or all delay, or failure to +perform the Agreement, that may be attributable to an event of force +majeure, an act of God or an outside cause, such as defective +functioning or interruptions of the electricity or telecommunications +networks, network paralysis following a virus attack, intervention by +government authorities, natural disasters, water damage, earthquakes, +fire, explosions, strikes and labor unrest, war, etc. + +11.2 Any failure by either Party, on one or more occasions, to invoke +one or more of the provisions hereof, shall under no circumstances be +interpreted as being a waiver by the interested Party of its right to +invoke said provision(s) subsequently. + +11.3 The Agreement cancels and replaces any or all previous agreements, +whether written or oral, between the Parties and having the same +purpose, and constitutes the entirety of the agreement between said +Parties concerning said purpose. No supplement or modification to the +terms and conditions hereof shall be effective as between the Parties +unless it is made in writing and signed by their duly authorized +representatives. + +11.4 In the event that one or more of the provisions hereof were to +conflict with a current or future applicable act or legislative text, +said act or legislative text shall prevail, and the Parties shall make +the necessary amendments so as to comply with said act or legislative +text. All other provisions shall remain effective. Similarly, invalidity +of a provision of the Agreement, for any reason whatsoever, shall not +cause the Agreement as a whole to be invalid. 
+ + + 11.5 LANGUAGE + +The Agreement is drafted in both French and English and both versions +are deemed authentic. + + + Article 12 - NEW VERSIONS OF THE AGREEMENT + +12.1 Any person is authorized to duplicate and distribute copies of this +Agreement. + +12.2 So as to ensure coherence, the wording of this Agreement is +protected and may only be modified by the authors of the License, who +reserve the right to periodically publish updates or new versions of the +Agreement, each with a separate number. These subsequent versions may +address new issues encountered by Free Software. + +12.3 Any Software distributed under a given version of the Agreement may +only be subsequently distributed under the same version of the Agreement +or a subsequent version, subject to the provisions of Article 5.3.4. + + + Article 13 - GOVERNING LAW AND JURISDICTION + +13.1 The Agreement is governed by French law. The Parties agree to +endeavor to seek an amicable solution to any disagreements or disputes +that may arise during the performance of the Agreement. + +13.2 Failing an amicable solution within two (2) months as from their +occurrence, and unless emergency proceedings are necessary, the +disagreements or disputes shall be referred to the Paris Courts having +jurisdiction, by the more diligent Party. + + +Version 2.0 dated 2006-09-05. 
diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/Licence_CeCILL_V2-fr.txt b/src/sumatra/sumatra-1.0.10/sumalibs/Licence_CeCILL_V2-fr.txt new file mode 100644 index 0000000..1613fca --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/Licence_CeCILL_V2-fr.txt @@ -0,0 +1,512 @@ + +CONTRAT DE LICENCE DE LOGICIEL LIBRE CeCILL + + + Avertissement + +Ce contrat est une licence de logiciel libre issue d'une concertation +entre ses auteurs afin que le respect de deux grands principes préside à +sa rédaction: + + * d'une part, le respect des principes de diffusion des logiciels + libres: accès au code source, droits étendus conférés aux + utilisateurs, + * d'autre part, la désignation d'un droit applicable, le droit + français, auquel elle est conforme, tant au regard du droit de la + responsabilité civile que du droit de la propriété intellectuelle + et de la protection qu'il offre aux auteurs et titulaires des + droits patrimoniaux sur un logiciel. + +Les auteurs de la licence CeCILL (pour Ce[a] C[nrs] I[nria] L[ogiciel] +L[ibre]) sont: + +Commissariat à l'Energie Atomique - CEA, établissement public de +recherche à caractère scientifique, technique et industriel, dont le +siège est situé 25 rue Leblanc, immeuble Le Ponant D, 75015 Paris. + +Centre National de la Recherche Scientifique - CNRS, établissement +public à caractère scientifique et technologique, dont le siège est +situé 3 rue Michel-Ange, 75794 Paris cedex 16. + +Institut National de Recherche en Informatique et en Automatique - +INRIA, établissement public à caractère scientifique et technologique, +dont le siège est situé Domaine de Voluceau, Rocquencourt, BP 105, 78153 +Le Chesnay cedex. + + + Préambule + +Ce contrat est une licence de logiciel libre dont l'objectif est de +conférer aux utilisateurs la liberté de modification et de +redistribution du logiciel régi par cette licence dans le cadre d'un +modèle de diffusion en logiciel libre.
+ +L'exercice de ces libertés est assorti de certains devoirs à la charge +des utilisateurs afin de préserver ce statut au cours des +redistributions ultérieures. + +L'accessibilité au code source et les droits de copie, de modification +et de redistribution qui en découlent ont pour contrepartie de n'offrir +aux utilisateurs qu'une garantie limitée et de ne faire peser sur +l'auteur du logiciel, le titulaire des droits patrimoniaux et les +concédants successifs qu'une responsabilité restreinte. + +A cet égard l'attention de l'utilisateur est attirée sur les risques +associés au chargement, à l'utilisation, à la modification et/ou au +développement et à la reproduction du logiciel par l'utilisateur étant +donné sa spécificité de logiciel libre, qui peut le rendre complexe à +manipuler et qui le réserve donc à des développeurs ou des +professionnels avertis possédant des connaissances informatiques +approfondies. Les utilisateurs sont donc invités à charger et tester +l'adéquation du logiciel à leurs besoins dans des conditions permettant +d'assurer la sécurité de leurs systèmes et/ou de leurs données et, plus +généralement, à l'utiliser et l'exploiter dans les mêmes conditions de +sécurité. Ce contrat peut être reproduit et diffusé librement, sous +réserve de le conserver en l'état, sans ajout ni suppression de clauses. + +Ce contrat est susceptible de s'appliquer à tout logiciel dont le +titulaire des droits patrimoniaux décide de soumettre l'exploitation aux +dispositions qu'il contient. + + + Article 1 - DEFINITIONS + +Dans ce contrat, les termes suivants, lorsqu'ils seront écrits avec une +lettre capitale, auront la signification suivante: + +Contrat: désigne le présent contrat de licence, ses éventuelles versions +postérieures et annexes. + +Logiciel: désigne le logiciel sous sa forme de Code Objet et/ou de Code +Source et le cas échéant sa documentation, dans leur état au moment de +l'acceptation du Contrat par le Licencié.
+ +Logiciel Initial: désigne le Logiciel sous sa forme de Code Source et +éventuellement de Code Objet et le cas échéant sa documentation, dans +leur état au moment de leur première diffusion sous les termes du Contrat. + +Logiciel Modifié: désigne le Logiciel modifié par au moins une +Contribution. + +Code Source: désigne l'ensemble des instructions et des lignes de +programme du Logiciel et auquel l'accès est nécessaire en vue de +modifier le Logiciel. + +Code Objet: désigne les fichiers binaires issus de la compilation du +Code Source. + +Titulaire: désigne le ou les détenteurs des droits patrimoniaux d'auteur +sur le Logiciel Initial. + +Licencié: désigne le ou les utilisateurs du Logiciel ayant accepté le +Contrat. + +Contributeur: désigne le Licencié auteur d'au moins une Contribution. + +Concédant: désigne le Titulaire ou toute personne physique ou morale +distribuant le Logiciel sous le Contrat. + +Contribution: désigne l'ensemble des modifications, corrections, +traductions, adaptations et/ou nouvelles fonctionnalités intégrées dans +le Logiciel par tout Contributeur, ainsi que tout Module Interne. + +Module: désigne un ensemble de fichiers sources y compris leur +documentation qui permet de réaliser des fonctionnalités ou services +supplémentaires à ceux fournis par le Logiciel. + +Module Externe: désigne tout Module, non dérivé du Logiciel, tel que ce +Module et le Logiciel s'exécutent dans des espaces d'adressage +différents, l'un appelant l'autre au moment de leur exécution. + +Module Interne: désigne tout Module lié au Logiciel de telle sorte +qu'ils s'exécutent dans le même espace d'adressage. + +GNU GPL: désigne la GNU General Public License dans sa version 2 ou +toute version ultérieure, telle que publiée par Free Software Foundation +Inc. + +Parties: désigne collectivement le Licencié et le Concédant. + +Ces termes s'entendent au singulier comme au pluriel. 
+ + + Article 2 - OBJET + +Le Contrat a pour objet la concession par le Concédant au Licencié d'une +licence non exclusive, cessible et mondiale du Logiciel telle que +définie ci-après à l'article 5 pour toute la durée de protection des droits +portant sur ce Logiciel. + + + Article 3 - ACCEPTATION + +3.1 L'acceptation par le Licencié des termes du Contrat est réputée +acquise du fait du premier des faits suivants: + + * (i) le chargement du Logiciel par tout moyen notamment par + téléchargement à partir d'un serveur distant ou par chargement à + partir d'un support physique; + * (ii) le premier exercice par le Licencié de l'un quelconque des + droits concédés par le Contrat. + +3.2 Un exemplaire du Contrat, contenant notamment un avertissement +relatif aux spécificités du Logiciel, à la restriction de garantie et à +la limitation à un usage par des utilisateurs expérimentés a été mis à +disposition du Licencié préalablement à son acceptation telle que +définie à l'article 3.1 ci dessus et le Licencié reconnaît en avoir pris +connaissance. + + + Article 4 - ENTREE EN VIGUEUR ET DUREE + + + 4.1 ENTREE EN VIGUEUR + +Le Contrat entre en vigueur à la date de son acceptation par le Licencié +telle que définie en 3.1. + + + 4.2 DUREE + +Le Contrat produira ses effets pendant toute la durée légale de +protection des droits patrimoniaux portant sur le Logiciel. + + + Article 5 - ETENDUE DES DROITS CONCEDES + +Le Concédant concède au Licencié, qui accepte, les droits suivants sur +le Logiciel pour toutes destinations et pour la durée du Contrat dans +les conditions ci-après détaillées. + +Par ailleurs, si le Concédant détient ou venait à détenir un ou +plusieurs brevets d'invention protégeant tout ou partie des +fonctionnalités du Logiciel ou de ses composants, il s'engage à ne pas +opposer les éventuels droits conférés par ces brevets aux Licenciés +successifs qui utiliseraient, exploiteraient ou modifieraient le +Logiciel. 
En cas de cession de ces brevets, le Concédant s'engage à +faire reprendre les obligations du présent alinéa aux cessionnaires. + + + 5.1 DROIT D'UTILISATION + +Le Licencié est autorisé à utiliser le Logiciel, sans restriction quant +aux domaines d'application, étant ci-après précisé que cela comporte: + + 1. la reproduction permanente ou provisoire du Logiciel en tout ou + partie par tout moyen et sous toute forme. + + 2. le chargement, l'affichage, l'exécution, ou le stockage du + Logiciel sur tout support. + + 3. la possibilité d'en observer, d'en étudier, ou d'en tester le + fonctionnement afin de déterminer les idées et principes qui sont + à la base de n'importe quel élément de ce Logiciel; et ceci, + lorsque le Licencié effectue toute opération de chargement, + d'affichage, d'exécution, de transmission ou de stockage du + Logiciel qu'il est en droit d'effectuer en vertu du Contrat. + + + 5.2 DROIT D'APPORTER DES CONTRIBUTIONS + +Le droit d'apporter des Contributions comporte le droit de traduire, +d'adapter, d'arranger ou d'apporter toute autre modification au Logiciel +et le droit de reproduire le logiciel en résultant. + +Le Licencié est autorisé à apporter toute Contribution au Logiciel sous +réserve de mentionner, de façon explicite, son nom en tant qu'auteur de +cette Contribution et la date de création de celle-ci. + + + 5.3 DROIT DE DISTRIBUTION + +Le droit de distribution comporte notamment le droit de diffuser, de +transmettre et de communiquer le Logiciel au public sur tout support et +par tout moyen ainsi que le droit de mettre sur le marché à titre +onéreux ou gratuit, un ou des exemplaires du Logiciel par tout procédé. + +Le Licencié est autorisé à distribuer des copies du Logiciel, modifié ou +non, à des tiers dans les conditions ci-après détaillées. 
+ + + 5.3.1 DISTRIBUTION DU LOGICIEL SANS MODIFICATION + +Le Licencié est autorisé à distribuer des copies conformes du Logiciel, +sous forme de Code Source ou de Code Objet, à condition que cette +distribution respecte les dispositions du Contrat dans leur totalité et +soit accompagnée: + + 1. d'un exemplaire du Contrat, + + 2. d'un avertissement relatif à la restriction de garantie et de + responsabilité du Concédant telle que prévue aux articles 8 + et 9, + +et que, dans le cas où seul le Code Objet du Logiciel est redistribué, +le Licencié permette aux futurs Licenciés d'accéder facilement au Code +Source complet du Logiciel en indiquant les modalités d'accès, étant +entendu que le coût additionnel d'acquisition du Code Source ne devra +pas excéder le simple coût de transfert des données. + + + 5.3.2 DISTRIBUTION DU LOGICIEL MODIFIE + +Lorsque le Licencié apporte une Contribution au Logiciel, les conditions +de distribution du Logiciel Modifié en résultant sont alors soumises à +l'intégralité des dispositions du Contrat. + +Le Licencié est autorisé à distribuer le Logiciel Modifié, sous forme de +code source ou de code objet, à condition que cette distribution +respecte les dispositions du Contrat dans leur totalité et soit +accompagnée: + + 1. d'un exemplaire du Contrat, + + 2. d'un avertissement relatif à la restriction de garantie et de + responsabilité du Concédant telle que prévue aux articles 8 + et 9, + +et que, dans le cas où seul le code objet du Logiciel Modifié est +redistribué, le Licencié permette aux futurs Licenciés d'accéder +facilement au code source complet du Logiciel Modifié en indiquant les +modalités d'accès, étant entendu que le coût additionnel d'acquisition +du code source ne devra pas excéder le simple coût de transfert des données. 
+ + + 5.3.3 DISTRIBUTION DES MODULES EXTERNES + +Lorsque le Licencié a développé un Module Externe les conditions du +Contrat ne s'appliquent pas à ce Module Externe, qui peut être distribué +sous un contrat de licence différent. + + + 5.3.4 COMPATIBILITE AVEC LA LICENCE GNU GPL + +Le Licencié peut inclure un code soumis aux dispositions d'une des +versions de la licence GNU GPL dans le Logiciel modifié ou non et +distribuer l'ensemble sous les conditions de la même version de la +licence GNU GPL. + +Le Licencié peut inclure le Logiciel modifié ou non dans un code soumis +aux dispositions d'une des versions de la licence GNU GPL et distribuer +l'ensemble sous les conditions de la même version de la licence GNU GPL. + + + Article 6 - PROPRIETE INTELLECTUELLE + + + 6.1 SUR LE LOGICIEL INITIAL + +Le Titulaire est détenteur des droits patrimoniaux sur le Logiciel +Initial. Toute utilisation du Logiciel Initial est soumise au respect +des conditions dans lesquelles le Titulaire a choisi de diffuser son +oeuvre et nul autre n'a la faculté de modifier les conditions de +diffusion de ce Logiciel Initial. + +Le Titulaire s'engage à ce que le Logiciel Initial reste au moins régi +par le Contrat et ce, pour la durée visée à l'article 4.2. + + + 6.2 SUR LES CONTRIBUTIONS + +Le Licencié qui a développé une Contribution est titulaire sur celle-ci +des droits de propriété intellectuelle dans les conditions définies par +la législation applicable. + + + 6.3 SUR LES MODULES EXTERNES + +Le Licencié qui a développé un Module Externe est titulaire sur celui-ci +des droits de propriété intellectuelle dans les conditions définies par +la législation applicable et reste libre du choix du contrat régissant +sa diffusion. + + + 6.4 DISPOSITIONS COMMUNES + +Le Licencié s'engage expressément: + + 1. à ne pas supprimer ou modifier de quelque manière que ce soit les + mentions de propriété intellectuelle apposées sur le Logiciel; + + 2. 
à reproduire à l'identique lesdites mentions de propriété + intellectuelle sur les copies du Logiciel modifié ou non. + +Le Licencié s'engage à ne pas porter atteinte, directement ou +indirectement, aux droits de propriété intellectuelle du Titulaire et/ou +des Contributeurs sur le Logiciel et à prendre, le cas échéant, à +l'égard de son personnel toutes les mesures nécessaires pour assurer le +respect des dits droits de propriété intellectuelle du Titulaire et/ou +des Contributeurs. + + + Article 7 - SERVICES ASSOCIES + +7.1 Le Contrat n'oblige en aucun cas le Concédant à la réalisation de +prestations d'assistance technique ou de maintenance du Logiciel. + +Cependant le Concédant reste libre de proposer ce type de services. Les +termes et conditions d'une telle assistance technique et/ou d'une telle +maintenance seront alors déterminés dans un acte séparé. Ces actes de +maintenance et/ou assistance technique n'engageront que la seule +responsabilité du Concédant qui les propose. + +7.2 De même, tout Concédant est libre de proposer, sous sa seule +responsabilité, à ses licenciés une garantie, qui n'engagera que lui, +lors de la redistribution du Logiciel et/ou du Logiciel Modifié et ce, +dans les conditions qu'il souhaite. Cette garantie et les modalités +financières de son application feront l'objet d'un acte séparé entre le +Concédant et le Licencié. + + + Article 8 - RESPONSABILITE + +8.1 Sous réserve des dispositions de l'article 8.2, le Licencié a la +faculté, sous réserve de prouver la faute du Concédant concerné, de +solliciter la réparation du préjudice direct qu'il subirait du fait du +Logiciel et dont il apportera la preuve. 
+ +8.2 La responsabilité du Concédant est limitée aux engagements pris en +application du Contrat et ne saurait être engagée en raison notamment: +(i) des dommages dus à l'inexécution, totale ou partielle, de ses +obligations par le Licencié, (ii) des dommages directs ou indirects +découlant de l'utilisation ou des performances du Logiciel subis par le +Licencié et (iii) plus généralement d'un quelconque dommage indirect. En +particulier, les Parties conviennent expressément que tout préjudice +financier ou commercial (par exemple perte de données, perte de +bénéfices, perte d'exploitation, perte de clientèle ou de commandes, +manque à gagner, trouble commercial quelconque) ou toute action dirigée +contre le Licencié par un tiers, constitue un dommage indirect et +n'ouvre pas droit à réparation par le Concédant. + + + Article 9 - GARANTIE + +9.1 Le Licencié reconnaît que l'état actuel des connaissances +scientifiques et techniques au moment de la mise en circulation du +Logiciel ne permet pas d'en tester et d'en vérifier toutes les +utilisations ni de détecter l'existence d'éventuels défauts. L'attention +du Licencié a été attirée sur ce point sur les risques associés au +chargement, à l'utilisation, la modification et/ou au développement et à +la reproduction du Logiciel qui sont réservés à des utilisateurs avertis. + +Il relève de la responsabilité du Licencié de contrôler, par tous +moyens, l'adéquation du produit à ses besoins, son bon fonctionnement et +de s'assurer qu'il ne causera pas de dommages aux personnes et aux biens. + +9.2 Le Concédant déclare de bonne foi être en droit de concéder +l'ensemble des droits attachés au Logiciel (comprenant notamment les +droits visés à l'article 5). + +9.3 Le Licencié reconnaît que le Logiciel est fourni "en l'état" par le +Concédant sans autre garantie, expresse ou tacite, que celle prévue à +l'article 9.2 et notamment sans aucune garantie sur sa valeur commerciale, +son caractère sécurisé, innovant ou pertinent. 
+ +En particulier, le Concédant ne garantit pas que le Logiciel est exempt +d'erreur, qu'il fonctionnera sans interruption, qu'il sera compatible +avec l'équipement du Licencié et sa configuration logicielle ni qu'il +remplira les besoins du Licencié. + +9.4 Le Concédant ne garantit pas, de manière expresse ou tacite, que le +Logiciel ne porte pas atteinte à un quelconque droit de propriété +intellectuelle d'un tiers portant sur un brevet, un logiciel ou sur tout +autre droit de propriété. Ainsi, le Concédant exclut toute garantie au +profit du Licencié contre les actions en contrefaçon qui pourraient être +diligentées au titre de l'utilisation, de la modification, et de la +redistribution du Logiciel. Néanmoins, si de telles actions sont +exercées contre le Licencié, le Concédant lui apportera son aide +technique et juridique pour sa défense. Cette aide technique et +juridique est déterminée au cas par cas entre le Concédant concerné et +le Licencié dans le cadre d'un protocole d'accord. Le Concédant dégage +toute responsabilité quant à l'utilisation de la dénomination du +Logiciel par le Licencié. Aucune garantie n'est apportée quant à +l'existence de droits antérieurs sur le nom du Logiciel et sur +l'existence d'une marque. + + + Article 10 - RESILIATION + +10.1 En cas de manquement par le Licencié aux obligations mises à sa +charge par le Contrat, le Concédant pourra résilier de plein droit le +Contrat trente (30) jours après notification adressée au Licencié et +restée sans effet. + +10.2 Le Licencié dont le Contrat est résilié n'est plus autorisé à +utiliser, modifier ou distribuer le Logiciel. Cependant, toutes les +licences qu'il aura concédées antérieurement à la résiliation du Contrat +resteront valides sous réserve qu'elles aient été effectuées en +conformité avec le Contrat. 
+ + + Article 11 - DISPOSITIONS DIVERSES + + + 11.1 CAUSE EXTERIEURE + +Aucune des Parties ne sera responsable d'un retard ou d'une défaillance +d'exécution du Contrat qui serait dû à un cas de force majeure, un cas +fortuit ou une cause extérieure, telle que, notamment, le mauvais +fonctionnement ou les interruptions du réseau électrique ou de +télécommunication, la paralysie du réseau liée à une attaque +informatique, l'intervention des autorités gouvernementales, les +catastrophes naturelles, les dégâts des eaux, les tremblements de terre, +le feu, les explosions, les grèves et les conflits sociaux, l'état de +guerre... + +11.2 Le fait, par l'une ou l'autre des Parties, d'omettre en une ou +plusieurs occasions de se prévaloir d'une ou plusieurs dispositions du +Contrat, ne pourra en aucun cas impliquer renonciation par la Partie +intéressée à s'en prévaloir ultérieurement. + +11.3 Le Contrat annule et remplace toute convention antérieure, écrite +ou orale, entre les Parties sur le même objet et constitue l'accord +entier entre les Parties sur cet objet. Aucune addition ou modification +aux termes du Contrat n'aura d'effet à l'égard des Parties à moins +d'être faite par écrit et signée par leurs représentants dûment habilités. + +11.4 Dans l'hypothèse où une ou plusieurs des dispositions du Contrat +s'avèrerait contraire à une loi ou à un texte applicable, existants ou +futurs, cette loi ou ce texte prévaudrait, et les Parties feraient les +amendements nécessaires pour se conformer à cette loi ou à ce texte. +Toutes les autres dispositions resteront en vigueur. De même, la +nullité, pour quelque raison que ce soit, d'une des dispositions du +Contrat ne saurait entraîner la nullité de l'ensemble du Contrat. + + + 11.5 LANGUE + +Le Contrat est rédigé en langue française et en langue anglaise, ces +deux versions faisant également foi. 
+ + + Article 12 - NOUVELLES VERSIONS DU CONTRAT + +12.1 Toute personne est autorisée à copier et distribuer des copies de +ce Contrat. + +12.2 Afin d'en préserver la cohérence, le texte du Contrat est protégé +et ne peut être modifié que par les auteurs de la licence, lesquels se +réservent le droit de publier périodiquement des mises à jour ou de +nouvelles versions du Contrat, qui posséderont chacune un numéro +distinct. Ces versions ultérieures seront susceptibles de prendre en +compte de nouvelles problématiques rencontrées par les logiciels libres. + +12.3 Tout Logiciel diffusé sous une version donnée du Contrat ne pourra +faire l'objet d'une diffusion ultérieure que sous la même version du +Contrat ou une version postérieure, sous réserve des dispositions de +l'article 5.3.4. + + + Article 13 - LOI APPLICABLE ET COMPETENCE TERRITORIALE + +13.1 Le Contrat est régi par la loi française. Les Parties conviennent +de tenter de régler à l'amiable les différends ou litiges qui +viendraient à se produire par suite ou à l'occasion du Contrat. + +13.2 A défaut d'accord amiable dans un délai de deux (2) mois à compter +de leur survenance et sauf situation relevant d'une procédure d'urgence, +les différends ou litiges seront portés par la Partie la plus diligente +devant les Tribunaux compétents de Paris. + + +Version 2.0 du 2006-09-05. 
diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/global.mk b/src/sumatra/sumatra-1.0.10/sumalibs/global.mk new file mode 100644 index 0000000..e5a0dfb --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/global.mk @@ -0,0 +1,7 @@ +include ../../../../../config/auto.conf + + +default: all + +%.o: %.c + $(CC) $(CFLAGS) -c -o $@ $< diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/Makefile b/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/Makefile new file mode 100644 index 0000000..97cf5ec --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/Makefile @@ -0,0 +1,33 @@ + +SOURCES = fasta_header_parser.c \ + fasta_seq_writer.c \ + fasta_header_handler.c \ + header_mem_handler.c \ + sequence.c + +SRCS=$(SOURCES) + + +OBJECTS= $(patsubst %.c,%.o,$(SOURCES)) + +LIBFILE = libfasta.a +RANLIB = ranlib + + +include ../global.mk + +all: $(LIBFILE) + +fasta_header_parser.c: fasta_header_parser.l + flex -Pheader_yy -t $< > $@ + +dic_parser.c: dic_parser.l + lex -Phashtable_yy -t $< > $@ + +clean: + rm -rf $(OBJECTS) $(LIBFILE) + rm -f *.a + +$(LIBFILE): $(OBJECTS) + ar -cr $@ $? 
+ $(RANLIB) $@ diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/fasta_header_handler.c b/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/fasta_header_handler.c new file mode 100644 index 0000000..f57d8c7 --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/fasta_header_handler.c @@ -0,0 +1,126 @@ +#include +#include +#include +#include "sequence.h" +#include "fasta_header_parser.h" +#include "fasta_header_handler.h" + + +char* char_header_add_field(char* header, char* name, char* value) +{ + int lheader = strlen(header); + header = (char*) realloc(header, (lheader+strlen(name)+strlen(value)+4)*sizeof(char)); + if (header[lheader-1] == '.') + { + strcpy(header+lheader-1,";"); + strcpy(header+lheader," "); + strcpy(header+lheader+1,name); + strcpy(header+lheader+1+strlen(name),"="); + strcpy(header+lheader+1+strlen(name)+1,value); + } + else + { + strcpy(header+lheader,";"); + strcpy(header+lheader+1," "); + strcpy(header+lheader+2,name); + strcpy(header+lheader+2+strlen(name),"="); + strcpy(header+lheader+2+strlen(name)+1,value); + } + return header; +} + + +char* fastaSeqPtr_header_add_field(fastaSeqPtr seq, char* name, char* value) +{ + int lheader = strlen(seq->rawheader); + int i; + char* buffer; + char* rawheader; + + rawheader = (char*) malloc((lheader+strlen(name)+strlen(value)+5)*sizeof(char)); + strcpy(rawheader, seq->rawheader); + + buffer = calloc(lheader, sizeof(char)); + + i=0; + + while ((rawheader[i] != ' ') && (rawheader[i] != 0)) + i++; + + if (rawheader[i] == ' ') + strcpy(buffer, rawheader+i); + else + strcpy(rawheader+i, " "); + + i++; + + strcpy(rawheader+i,name); + strcpy(rawheader+i+strlen(name),"="); + strcpy(rawheader+i+strlen(name)+1,value); + strcpy(rawheader+i+strlen(name)+1+strlen(value),";"); + strcpy(rawheader+i+strlen(name)+1+strlen(value)+1, buffer); + + free(buffer); + + return(rawheader); +} + + +element_from_header* table_header_add_field(element_from_header* header, char* name, char* value) +{ + int nbf; + nbf = 
atoi(header[0].value); + nbf++; + header = (element_from_header*) realloc(header, (nbf+1)*sizeof(element_from_header)); + header[nbf].name = (char*) malloc((1+strlen(name))*sizeof(char)); + strcpy(header[nbf].name, name); + header[nbf].value = (char*) malloc((1+strlen(value))*sizeof(char)); + strcpy(header[nbf].value, value); + sprintf(header[0].value, "%d", nbf); + return(header); +} + + +void free_header_table(element_from_header* header) +{ + int i; + int nbf = atoi(header[0].value); + + for (i = 0; i <= nbf; i++) + { + free((header[i]).name); + free((header[i]).value); + } + free(header); +} + + +char* getItemFromHeader(char* name, element_from_header* header) +{ + char* value = 0; + int nbf; + int i; + nbf = atoi(header[0].value); + for (i = 1; i <= nbf; i++) + { + if (strcmp(header[i].name,name)==0) + value = header[i].value; + } + return value; +} + + +void changeValue(element_from_header* header, char* name, char* newValue) +{ + int i; + int nbf = atoi(header[0].value); + + for (i = 1; i <= nbf; i++) + { + if (strcmp(header[i].name, name)==0) + { + header[i].value = realloc(header[i].value, (1+strlen(newValue))*sizeof(char)); + strcpy(header[i].value, newValue); + } + } +} diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/fasta_header_handler.h b/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/fasta_header_handler.h new file mode 100644 index 0000000..e68b81a --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/fasta_header_handler.h @@ -0,0 +1,23 @@ + +#ifndef FASTA_HEADER_HANDLER_H_ +#define FASTA_HEADER_HANDLER_H_ + + +#include "sequence.h" + + +char* char_header_add_field(char*,char*,char*); + +char* fastaSeqPtr_header_add_field(fastaSeqPtr seq, char* name, char* value); + +element_from_header* table_header_add_dic(element_from_header* header, char* name, struct hashtable *hashtab); + +element_from_header* table_header_add_field(element_from_header* header, char* name, char* value); + +void free_header_table(element_from_header*); + 
+char* getItemFromHeader(char*, element_from_header*); + +void changeValue(element_from_header* header, char* name, char* newValue); + +#endif diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/fasta_header_parser.c b/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/fasta_header_parser.c new file mode 100644 index 0000000..ed53557 --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/fasta_header_parser.c @@ -0,0 +1,1954 @@ + +#line 3 "" + +#define YY_INT_ALIGNED short int + +/* A lexical scanner generated by flex */ + +#define yy_create_buffer header_yy_create_buffer +#define yy_delete_buffer header_yy_delete_buffer +#define yy_flex_debug header_yy_flex_debug +#define yy_init_buffer header_yy_init_buffer +#define yy_flush_buffer header_yy_flush_buffer +#define yy_load_buffer_state header_yy_load_buffer_state +#define yy_switch_to_buffer header_yy_switch_to_buffer +#define yyin header_yyin +#define yyleng header_yyleng +#define yylex header_yylex +#define yylineno header_yylineno +#define yyout header_yyout +#define yyrestart header_yyrestart +#define yytext header_yytext +#define yywrap header_yywrap +#define yyalloc header_yyalloc +#define yyrealloc header_yyrealloc +#define yyfree header_yyfree + +#define FLEX_SCANNER +#define YY_FLEX_MAJOR_VERSION 2 +#define YY_FLEX_MINOR_VERSION 5 +#define YY_FLEX_SUBMINOR_VERSION 37 +#if YY_FLEX_SUBMINOR_VERSION > 0 +#define FLEX_BETA +#endif + +/* First, we deal with platform-specific or compiler-specific issues. */ + +/* begin standard C headers. */ +#include +#include +#include +#include + +/* end standard C headers. */ + +/* flex integer type definitions */ + +#ifndef FLEXINT_H +#define FLEXINT_H + +/* C99 systems have . Non-C99 systems may or may not. */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + +/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, + * if you want the limit (max/min) macros for int types. 
+ */ +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif + +#include +typedef int8_t flex_int8_t; +typedef uint8_t flex_uint8_t; +typedef int16_t flex_int16_t; +typedef uint16_t flex_uint16_t; +typedef int32_t flex_int32_t; +typedef uint32_t flex_uint32_t; +#else +typedef signed char flex_int8_t; +typedef short int flex_int16_t; +typedef int flex_int32_t; +typedef unsigned char flex_uint8_t; +typedef unsigned short int flex_uint16_t; +typedef unsigned int flex_uint32_t; + +/* Limits of integral types. */ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32767-1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483647-1) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255U) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535U) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +#endif /* ! C99 */ + +#endif /* ! FLEXINT_H */ + +#ifdef __cplusplus + +/* The "const" storage-class-modifier is valid. */ +#define YY_USE_CONST + +#else /* ! __cplusplus */ + +/* C99 requires __STDC__ to be defined as 1. */ +#if defined (__STDC__) + +#define YY_USE_CONST + +#endif /* defined (__STDC__) */ +#endif /* ! __cplusplus */ + +#ifdef YY_USE_CONST +#define yyconst const +#else +#define yyconst +#endif + +/* Returned upon end-of-file. */ +#define YY_NULL 0 + +/* Promotes a possibly negative, possibly signed char to an unsigned + * integer for use as an array index. If the signed char is negative, + * we want to instead treat it as an 8-bit unsigned char, hence the + * double cast. + */ +#define YY_SC_TO_UI(c) ((unsigned int) (unsigned char) c) + +/* Enter a start condition. This macro really ought to take a parameter, + * but we do it the disgusting crufty way forced on us by the ()-less + * definition of BEGIN. 
+ */ +#define BEGIN (yy_start) = 1 + 2 * + +/* Translate the current start state into a value that can be later handed + * to BEGIN to return to the state. The YYSTATE alias is for lex + * compatibility. + */ +#define YY_START (((yy_start) - 1) / 2) +#define YYSTATE YY_START + +/* Action number for EOF rule of a given start state. */ +#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1) + +/* Special action meaning "start processing a new file". */ +#define YY_NEW_FILE header_yyrestart(header_yyin ) + +#define YY_END_OF_BUFFER_CHAR 0 + +/* Size of default input buffer. */ +#ifndef YY_BUF_SIZE +#define YY_BUF_SIZE 16384 +#endif + +/* The state buf must be large enough to hold one state per character in the main buffer. + */ +#define YY_STATE_BUF_SIZE ((YY_BUF_SIZE + 2) * sizeof(yy_state_type)) + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef size_t yy_size_t; +#endif + +extern yy_size_t header_yyleng; + +extern FILE *header_yyin, *header_yyout; + +#define EOB_ACT_CONTINUE_SCAN 0 +#define EOB_ACT_END_OF_FILE 1 +#define EOB_ACT_LAST_MATCH 2 + + #define YY_LESS_LINENO(n) + +/* Return all but the first "n" matched characters back to the input stream. */ +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up header_yytext. 
*/ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + *yy_cp = (yy_hold_char); \ + YY_RESTORE_YY_MORE_OFFSET \ + (yy_c_buf_p) = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \ + YY_DO_BEFORE_ACTION; /* set up header_yytext again */ \ + } \ + while ( 0 ) + +#define unput(c) yyunput( c, (yytext_ptr) ) + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + yy_size_t yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + yy_size_t yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. */ + + /* Whether to try to fill the input buffer when we reach the + * end of it. + */ + int yy_fill_buffer; + + int yy_buffer_status; + +#define YY_BUFFER_NEW 0 +#define YY_BUFFER_NORMAL 1 + /* When an EOF's been seen but there's still some text to process + * then we mark the buffer as YY_EOF_PENDING, to indicate that we + * shouldn't try reading from the input source any more. We might + * still have a bunch of tokens to match, though, because of + * possible backing-up. 
+ * + * When we actually see the EOF, we change the status to "new" + * (via header_yyrestart()), so that the user can continue scanning by + * just pointing header_yyin at a new input file. + */ +#define YY_BUFFER_EOF_PENDING 2 + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +/* Stack of input buffers. */ +static size_t yy_buffer_stack_top = 0; /**< index of top of stack. */ +static size_t yy_buffer_stack_max = 0; /**< capacity of stack. */ +static YY_BUFFER_STATE * yy_buffer_stack = 0; /**< Stack as an array. */ + +/* We provide macros for accessing buffer states in case in the + * future we want to put the buffer states in a more general + * "scanner state". + * + * Returns the top of the stack, or NULL. + */ +#define YY_CURRENT_BUFFER ( (yy_buffer_stack) \ + ? (yy_buffer_stack)[(yy_buffer_stack_top)] \ + : NULL) + +/* Same as previous macro, but useful when we know that the buffer stack is not + * NULL or when we need an lvalue. For internal use only. + */ +#define YY_CURRENT_BUFFER_LVALUE (yy_buffer_stack)[(yy_buffer_stack_top)] + +/* yy_hold_char holds the character lost when header_yytext is formed. */ +static char yy_hold_char; +static yy_size_t yy_n_chars; /* number of characters read into yy_ch_buf */ +yy_size_t header_yyleng; + +/* Points to current character in buffer. */ +static char *yy_c_buf_p = (char *) 0; +static int yy_init = 0; /* whether we need to initialize */ +static int yy_start = 0; /* start state number */ + +/* Flag which is used to allow header_yywrap()'s to do buffer switches + * instead of setting up a fresh header_yyin. A bit of a hack ... 
+ */ +static int yy_did_buffer_switch_on_eof; + +void header_yyrestart (FILE *input_file ); +void header_yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ); +YY_BUFFER_STATE header_yy_create_buffer (FILE *file,int size ); +void header_yy_delete_buffer (YY_BUFFER_STATE b ); +void header_yy_flush_buffer (YY_BUFFER_STATE b ); +void header_yypush_buffer_state (YY_BUFFER_STATE new_buffer ); +void header_yypop_buffer_state (void ); + +static void header_yyensure_buffer_stack (void ); +static void header_yy_load_buffer_state (void ); +static void header_yy_init_buffer (YY_BUFFER_STATE b,FILE *file ); + +#define YY_FLUSH_BUFFER header_yy_flush_buffer(YY_CURRENT_BUFFER ) + +YY_BUFFER_STATE header_yy_scan_buffer (char *base,yy_size_t size ); +YY_BUFFER_STATE header_yy_scan_string (yyconst char *yy_str ); +YY_BUFFER_STATE header_yy_scan_bytes (yyconst char *bytes,yy_size_t len ); + +void *header_yyalloc (yy_size_t ); +void *header_yyrealloc (void *,yy_size_t ); +void header_yyfree (void * ); + +#define yy_new_buffer header_yy_create_buffer + +#define yy_set_interactive(is_interactive) \ + { \ + if ( ! YY_CURRENT_BUFFER ){ \ + header_yyensure_buffer_stack (); \ + YY_CURRENT_BUFFER_LVALUE = \ + header_yy_create_buffer(header_yyin,YY_BUF_SIZE ); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \ + } + +#define yy_set_bol(at_bol) \ + { \ + if ( ! 
YY_CURRENT_BUFFER ){\ + header_yyensure_buffer_stack (); \ + YY_CURRENT_BUFFER_LVALUE = \ + header_yy_create_buffer(header_yyin,YY_BUF_SIZE ); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \ + } + +#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol) + +/* Begin user sect3 */ + +typedef unsigned char YY_CHAR; + +FILE *header_yyin = (FILE *) 0, *header_yyout = (FILE *) 0; + +typedef int yy_state_type; + +extern int header_yylineno; + +int header_yylineno = 1; + +extern char *header_yytext; +#define yytext_ptr header_yytext + +static yy_state_type yy_get_previous_state (void ); +static yy_state_type yy_try_NUL_trans (yy_state_type current_state ); +static int yy_get_next_buffer (void ); +static void yy_fatal_error (yyconst char msg[] ); + +/* Done after the current pattern has been matched and before the + * corresponding action - sets up header_yytext. + */ +#define YY_DO_BEFORE_ACTION \ + (yytext_ptr) = yy_bp; \ + header_yyleng = (size_t) (yy_cp - yy_bp); \ + (yy_hold_char) = *yy_cp; \ + *yy_cp = '\0'; \ + (yy_c_buf_p) = yy_cp; + +#define YY_NUM_RULES 12 +#define YY_END_OF_BUFFER 13 +/* This struct is not used in this scanner, + but its presence is necessary. 
*/ +struct yy_trans_info + { + flex_int32_t yy_verify; + flex_int32_t yy_nxt; + }; +static yyconst flex_int16_t yy_accept[29] = + { 0, + 0, 0, 0, 0, 0, 0, 0, 0, 13, 12, + 3, 2, 1, 5, 4, 7, 6, 9, 8, 10, + 11, 3, 2, 5, 4, 9, 8, 0 + } ; + +static yyconst flex_int32_t yy_ec[256] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 3, 3, 1, 3, 3, 3, 3, + 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 4, 1, + 5, 6, 1, 1, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 1, 3, 1, 3, 1, 3, 3, 3, 3, + + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1 + } ; + +static yyconst flex_int32_t yy_meta[7] = + { 0, + 1, 2, 3, 4, 4, 1 + } ; + +static yyconst flex_int16_t yy_base[35] = + { 0, + 0, 0, 22, 21, 6, 0, 12, 0, 26, 29, + 0, 0, 29, 0, 0, 29, 29, 0, 0, 29, + 29, 0, 0, 0, 0, 0, 0, 29, 23, 16, + 22, 20, 20, 18 + } ; + +static yyconst flex_int16_t yy_def[35] = + { 0, + 28, 1, 1, 1, 28, 5, 28, 7, 28, 28, + 29, 30, 28, 31, 32, 28, 28, 33, 34, 28, + 28, 29, 30, 31, 32, 33, 34, 0, 28, 28, + 28, 28, 28, 28 + } ; + +static yyconst flex_int16_t yy_nxt[36] = + { 0, + 10, 11, 12, 12, 12, 13, 10, 14, 15, 16, + 17, 10, 10, 18, 19, 20, 21, 10, 23, 23, + 27, 26, 25, 24, 22, 28, 10, 10, 9, 28, + 28, 28, 28, 28, 28 + } ; + +static yyconst flex_int16_t yy_chk[36] = + { 0, + 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, + 5, 5, 7, 7, 7, 7, 7, 7, 30, 30, + 34, 33, 32, 31, 29, 9, 4, 3, 28, 28, + 28, 28, 28, 28, 28 + } ; + +static yy_state_type yy_last_accepting_state; 
+static char *yy_last_accepting_cpos; + +extern int header_yy_flex_debug; +int header_yy_flex_debug = 0; + +/* The intent behind this definition is that it'll catch + * any uses of REJECT which flex missed. + */ +#define REJECT reject_used_but_not_detected +#define yymore() yymore_used_but_not_detected +#define YY_MORE_ADJ 0 +#define YY_RESTORE_YY_MORE_OFFSET +char *header_yytext; +#line 1 "fasta_header_parser.l" +/* + * Add -ll in Makefile if you modify this file to convert to .c + */ + + + +#line 10 "fasta_header_parser.l" + +#include +#include +#include "header_mem_handler.h" +#include "fasta_header_handler.h" + +#define MEMALLOCATED 10 +#define BUFFER 5 + +#define YY_DECL int header_parser(int *nbf, int *memory_allocated, element_from_header **p_header) + + +#line 502 "" + +#define INITIAL 0 +#define REGID 1 +#define REGNAME 2 +#define REGVAL 3 + +#ifndef YY_NO_UNISTD_H +/* Special case for "unistd.h", since it is non-ANSI. We include it way + * down here because we want the user's section 1 to have been scanned first. + * The user has a chance to override it with an option. + */ +#include +#endif + +#ifndef YY_EXTRA_TYPE +#define YY_EXTRA_TYPE void * +#endif + +static int yy_init_globals (void ); + +/* Accessor methods to globals. + These are made visible to non-reentrant scanners for convenience. */ + +int header_yylex_destroy (void ); + +int header_yyget_debug (void ); + +void header_yyset_debug (int debug_flag ); + +YY_EXTRA_TYPE header_yyget_extra (void ); + +void header_yyset_extra (YY_EXTRA_TYPE user_defined ); + +FILE *header_yyget_in (void ); + +void header_yyset_in (FILE * in_str ); + +FILE *header_yyget_out (void ); + +void header_yyset_out (FILE * out_str ); + +yy_size_t header_yyget_leng (void ); + +char *header_yyget_text (void ); + +int header_yyget_lineno (void ); + +void header_yyset_lineno (int line_number ); + +/* Macros after this point can all be overridden by user definitions in + * section 1. 
+ */ + +#ifndef YY_SKIP_YYWRAP +#ifdef __cplusplus +extern "C" int header_yywrap (void ); +#else +extern int header_yywrap (void ); +#endif +#endif + + static void yyunput (int c,char *buf_ptr ); + +#ifndef yytext_ptr +static void yy_flex_strncpy (char *,yyconst char *,int ); +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (yyconst char * ); +#endif + +#ifndef YY_NO_INPUT + +#ifdef __cplusplus +static int yyinput (void ); +#else +static int input (void ); +#endif + +#endif + +/* Amount of stuff to slurp up with each read. */ +#ifndef YY_READ_BUF_SIZE +#define YY_READ_BUF_SIZE 8192 +#endif + +/* Copy whatever the last rule matched to the standard output. */ +#ifndef ECHO +/* This used to be an fputs(), but since the string might contain NUL's, + * we now use fwrite(). + */ +#define ECHO do { if (fwrite( header_yytext, header_yyleng, 1, header_yyout )) {} } while (0) +#endif + +/* Gets input and stuffs it into "buf". number of characters read, or YY_NULL, + * is returned in "result". + */ +#ifndef YY_INPUT +#define YY_INPUT(buf,result,max_size) \ + if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \ + { \ + int c = '*'; \ + size_t n; \ + for ( n = 0; n < max_size && \ + (c = getc( header_yyin )) != EOF && c != '\n'; ++n ) \ + buf[n] = (char) c; \ + if ( c == '\n' ) \ + buf[n++] = (char) c; \ + if ( c == EOF && ferror( header_yyin ) ) \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + result = n; \ + } \ + else \ + { \ + errno=0; \ + while ( (result = fread(buf, 1, max_size, header_yyin))==0 && ferror(header_yyin)) \ + { \ + if( errno != EINTR) \ + { \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + break; \ + } \ + errno=0; \ + clearerr(header_yyin); \ + } \ + }\ +\ + +#endif + +/* No semi-colon after return; correct usage is to write "yyterminate();" - + * we don't want an extra ';' after the "return" because that will cause + * some compilers to complain about unreachable statements. 
+ */ +#ifndef yyterminate +#define yyterminate() return YY_NULL +#endif + +/* Number of entries by which start-condition stack grows. */ +#ifndef YY_START_STACK_INCR +#define YY_START_STACK_INCR 25 +#endif + +/* Report a fatal error. */ +#ifndef YY_FATAL_ERROR +#define YY_FATAL_ERROR(msg) yy_fatal_error( msg ) +#endif + +/* end tables serialization structures and prototypes */ + +/* Default declaration of generated scanner - a define so the user can + * easily add parameters. + */ +#ifndef YY_DECL +#define YY_DECL_IS_OURS 1 + +extern int header_yylex (void); + +#define YY_DECL int header_yylex (void) +#endif /* !YY_DECL */ + +/* Code executed at the beginning of each rule, after header_yytext and header_yyleng + * have been set up. + */ +#ifndef YY_USER_ACTION +#define YY_USER_ACTION +#endif + +/* Code executed at the end of each rule. */ +#ifndef YY_BREAK +#define YY_BREAK break; +#endif + +#define YY_RULE_SETUP \ + YY_USER_ACTION + +/** The main scanner function which does all the work. + */ +YY_DECL +{ + register yy_state_type yy_current_state; + register char *yy_cp, *yy_bp; + register int yy_act; + +#line 32 "fasta_header_parser.l" + + + int i; + int size_needed; + int free_size; + char* field; + + +#line 696 "" + + if ( !(yy_init) ) + { + (yy_init) = 1; + +#ifdef YY_USER_INIT + YY_USER_INIT; +#endif + + if ( ! (yy_start) ) + (yy_start) = 1; /* first start state */ + + if ( ! header_yyin ) + header_yyin = stdin; + + if ( ! header_yyout ) + header_yyout = stdout; + + if ( ! YY_CURRENT_BUFFER ) { + header_yyensure_buffer_stack (); + YY_CURRENT_BUFFER_LVALUE = + header_yy_create_buffer(header_yyin,YY_BUF_SIZE ); + } + + header_yy_load_buffer_state( ); + } + + while ( 1 ) /* loops until end-of-file is reached */ + { + yy_cp = (yy_c_buf_p); + + /* Support of header_yytext. */ + *yy_cp = (yy_hold_char); + + /* yy_bp points to the position in yy_ch_buf of the start of + * the current run. 
+ */ + yy_bp = yy_cp; + + yy_current_state = (yy_start); +yy_match: + do + { + register YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)]; + if ( yy_accept[yy_current_state] ) + { + (yy_last_accepting_state) = yy_current_state; + (yy_last_accepting_cpos) = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 29 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + ++yy_cp; + } + while ( yy_base[yy_current_state] != 29 ); + +yy_find_action: + yy_act = yy_accept[yy_current_state]; + if ( yy_act == 0 ) + { /* have to back up */ + yy_cp = (yy_last_accepting_cpos); + yy_current_state = (yy_last_accepting_state); + yy_act = yy_accept[yy_current_state]; + } + + YY_DO_BEFORE_ACTION; + +do_action: /* This label is used only to access EOF actions. */ + + switch ( yy_act ) + { /* beginning of action switch */ + case 0: /* must back up */ + /* undo the effects of YY_DO_BEFORE_ACTION */ + *yy_cp = (yy_hold_char); + yy_cp = (yy_last_accepting_cpos); + yy_current_state = (yy_last_accepting_state); + goto yy_find_action; + +case 1: +YY_RULE_SETUP +#line 40 "fasta_header_parser.l" +{ + /*printf("\n{SUP},%s",header_yytext);*/ + BEGIN(REGID); + } + YY_BREAK +case 2: +YY_RULE_SETUP +#line 45 "fasta_header_parser.l" +{ + i=0; + + field = malloc_field(&free_size); + (*p_header)[*nbf].name = (char*) malloc(3*sizeof(char)); + strcpy(((*p_header)[*nbf]).name,"id"); + + size_needed = strlen(header_yytext)+1; + (*p_header)[*nbf].value = (char*) malloc(sizeof(char)*size_needed); + strcpy(((*p_header)[*nbf]).value,header_yytext); + + (*nbf)++; + } + YY_BREAK +case 3: +YY_RULE_SETUP +#line 60 "fasta_header_parser.l" +{ + BEGIN(REGNAME); + } + YY_BREAK +case 4: +YY_RULE_SETUP +#line 64 "fasta_header_parser.l" +{ + /*fprintf(stderr,"\n{WORD} **%s**",header_yytext);*/ + field = 
store_in_field(field,header_yytext,&free_size,&i); + } + YY_BREAK +case 5: +YY_RULE_SETUP +#line 69 "fasta_header_parser.l" +{ + /*fprintf(stderr,"\n{SPACE} **%s**",header_yytext);*/ + if (i != 0) + field = store_in_field(field,header_yytext,&free_size,&i); + } + YY_BREAK +case 6: +YY_RULE_SETUP +#line 75 "fasta_header_parser.l" +{ + /*fprintf(stderr,"\n{EQUAL},%s",header_yytext);*/ + field = store_in_header_table(field, &((*p_header)[*nbf].name), &free_size, &i); + BEGIN(REGVAL); + } + YY_BREAK +case 7: +YY_RULE_SETUP +#line 81 "fasta_header_parser.l" +{ + /*fprintf(stderr,"\n{SEP},%s",header_yytext);*/ + (*p_header)[*nbf].name = (char*) malloc(19*sizeof(char)); + strcpy((*p_header)[*nbf].name,"definition"); + field = store_in_header_table(field, &((*p_header)[*nbf].value), &free_size, &i); + p_header = check_and_realloc_mem_in_header_table(p_header, nbf, memory_allocated); + BEGIN(REGNAME); + } + YY_BREAK +case 8: +YY_RULE_SETUP +#line 90 "fasta_header_parser.l" +{ + /*fprintf(stderr,"\n{WORD} **%s**\n",header_yytext);*/ + field = store_in_field(field,header_yytext,&free_size,&i); + } + YY_BREAK +case 9: +YY_RULE_SETUP +#line 95 "fasta_header_parser.l" +{ + /*fprintf(stderr,"\n{SPACE} **%s**\n",header_yytext);*/ + field = store_in_field(field,header_yytext,&free_size,&i); + } + YY_BREAK +case 10: +YY_RULE_SETUP +#line 100 "fasta_header_parser.l" +{ + /*fprintf(stderr,"\n{SEP},%s\n",header_yytext);*/ + + field = store_in_header_table(field, &((*p_header)[*nbf].value), &free_size, &i); + p_header = check_and_realloc_mem_in_header_table(p_header, nbf, memory_allocated); + BEGIN(REGNAME); + } + YY_BREAK +case 11: +YY_RULE_SETUP +#line 109 "fasta_header_parser.l" +{ + /*fprintf(stderr, "\nWarning : separator ';' probably missing in header after %s",(*p_header)[*nbf].name);*/ + } + YY_BREAK +case YY_STATE_EOF(REGVAL): +#line 113 "fasta_header_parser.l" +{ + field = store_in_header_table(field, &((*p_header)[*nbf].value), &free_size, &i); + p_header = 
check_and_realloc_mem_in_header_table(p_header, nbf, memory_allocated); + end_header_table(p_header, *nbf); + + free(field); + BEGIN(INITIAL); + return 0; + } + YY_BREAK +case YY_STATE_EOF(REGNAME): +#line 123 "fasta_header_parser.l" +{ + /*(*p_header)[*nbf].name = (char*) malloc(sizeof(char)*19); + strcpy((*p_header)[*nbf].name,"other_informations"); + field = store_in_header_table(field, &((*p_header)[*nbf].value), &free_size, &i); + p_header = check_and_realloc_mem_in_header_table(p_header, nbf, memory_allocated); + */ + end_header_table(p_header, *nbf); + + free(field); + BEGIN(INITIAL); + return 0; + } + YY_BREAK +case 12: +YY_RULE_SETUP +#line 136 "fasta_header_parser.l" +ECHO; + YY_BREAK +#line 915 "" +case YY_STATE_EOF(INITIAL): +case YY_STATE_EOF(REGID): + yyterminate(); + + case YY_END_OF_BUFFER: + { + /* Amount of text matched not including the EOB char. */ + int yy_amount_of_matched_text = (int) (yy_cp - (yytext_ptr)) - 1; + + /* Undo the effects of YY_DO_BEFORE_ACTION. */ + *yy_cp = (yy_hold_char); + YY_RESTORE_YY_MORE_OFFSET + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW ) + { + /* We're scanning a new file or input source. It's + * possible that this happened because the user + * just pointed header_yyin at a new source and called + * header_yylex(). If so, then we have to assure + * consistency between YY_CURRENT_BUFFER and our + * globals. Here is the right place to do so, because + * this is the first action (other than possibly a + * back-up) that will match for the new input source. + */ + (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + YY_CURRENT_BUFFER_LVALUE->yy_input_file = header_yyin; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL; + } + + /* Note that here we test for yy_c_buf_p "<=" to the position + * of the first EOB in the buffer, since yy_c_buf_p will + * already have been incremented past the NUL character + * (since all states make transitions on EOB to the + * end-of-buffer state). 
Contrast this with the test + * in input(). + */ + if ( (yy_c_buf_p) <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] ) + { /* This was really a NUL. */ + yy_state_type yy_next_state; + + (yy_c_buf_p) = (yytext_ptr) + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( ); + + /* Okay, we're now positioned to make the NUL + * transition. We couldn't have + * yy_get_previous_state() go ahead and do it + * for us because it doesn't know how to deal + * with the possibility of jamming (and we don't + * want to build jamming into it because then it + * will run more slowly). + */ + + yy_next_state = yy_try_NUL_trans( yy_current_state ); + + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + + if ( yy_next_state ) + { + /* Consume the NUL. */ + yy_cp = ++(yy_c_buf_p); + yy_current_state = yy_next_state; + goto yy_match; + } + + else + { + yy_cp = (yy_c_buf_p); + goto yy_find_action; + } + } + + else switch ( yy_get_next_buffer( ) ) + { + case EOB_ACT_END_OF_FILE: + { + (yy_did_buffer_switch_on_eof) = 0; + + if ( header_yywrap( ) ) + { + /* Note: because we've taken care in + * yy_get_next_buffer() to have set up + * header_yytext, we can now set up + * yy_c_buf_p so that if some total + * hoser (like flex itself) wants to + * call the scanner after we return the + * YY_NULL, it'll still work - another + * YY_NULL will get returned. + */ + (yy_c_buf_p) = (yytext_ptr) + YY_MORE_ADJ; + + yy_act = YY_STATE_EOF(YY_START); + goto do_action; + } + + else + { + if ( ! 
(yy_did_buffer_switch_on_eof) ) + YY_NEW_FILE; + } + break; + } + + case EOB_ACT_CONTINUE_SCAN: + (yy_c_buf_p) = + (yytext_ptr) + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( ); + + yy_cp = (yy_c_buf_p); + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + goto yy_match; + + case EOB_ACT_LAST_MATCH: + (yy_c_buf_p) = + &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)]; + + yy_current_state = yy_get_previous_state( ); + + yy_cp = (yy_c_buf_p); + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + goto yy_find_action; + } + break; + } + + default: + YY_FATAL_ERROR( + "fatal flex scanner internal error--no action found" ); + } /* end of action switch */ + } /* end of scanning one token */ +} /* end of header_yylex */ + +/* yy_get_next_buffer - try to read in a new buffer + * + * Returns a code representing an action: + * EOB_ACT_LAST_MATCH - + * EOB_ACT_CONTINUE_SCAN - continue scanning from current position + * EOB_ACT_END_OF_FILE - end of file + */ +static int yy_get_next_buffer (void) +{ + register char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf; + register char *source = (yytext_ptr); + register int number_to_move, i; + int ret_val; + + if ( (yy_c_buf_p) > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] ) + YY_FATAL_ERROR( + "fatal flex scanner internal error--end of buffer missed" ); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 ) + { /* Don't try to fill the buffer, so this is an EOF. */ + if ( (yy_c_buf_p) - (yytext_ptr) - YY_MORE_ADJ == 1 ) + { + /* We matched a single character, the EOB, so + * treat this as a final EOF. + */ + return EOB_ACT_END_OF_FILE; + } + + else + { + /* We matched some text prior to the EOB, first + * process it. + */ + return EOB_ACT_LAST_MATCH; + } + } + + /* Try to read more data. */ + + /* First move last chars to start of buffer. 
*/ + number_to_move = (int) ((yy_c_buf_p) - (yytext_ptr)) - 1; + + for ( i = 0; i < number_to_move; ++i ) + *(dest++) = *(source++); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING ) + /* don't do the read, it's not guaranteed to return an EOF, + * just force an EOF + */ + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars) = 0; + + else + { + yy_size_t num_to_read = + YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1; + + while ( num_to_read <= 0 ) + { /* Not enough room in the buffer - grow it. */ + + /* just a shorter name for the current buffer */ + YY_BUFFER_STATE b = YY_CURRENT_BUFFER_LVALUE; + + int yy_c_buf_p_offset = + (int) ((yy_c_buf_p) - b->yy_ch_buf); + + if ( b->yy_is_our_buffer ) + { + yy_size_t new_size = b->yy_buf_size * 2; + + if ( new_size <= 0 ) + b->yy_buf_size += b->yy_buf_size / 8; + else + b->yy_buf_size *= 2; + + b->yy_ch_buf = (char *) + /* Include room in for 2 EOB chars. */ + header_yyrealloc((void *) b->yy_ch_buf,b->yy_buf_size + 2 ); + } + else + /* Can't grow it, we don't own it. */ + b->yy_ch_buf = 0; + + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( + "fatal error - scanner input buffer overflow" ); + + (yy_c_buf_p) = &b->yy_ch_buf[yy_c_buf_p_offset]; + + num_to_read = YY_CURRENT_BUFFER_LVALUE->yy_buf_size - + number_to_move - 1; + + } + + if ( num_to_read > YY_READ_BUF_SIZE ) + num_to_read = YY_READ_BUF_SIZE; + + /* Read in more data. 
*/ + YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]), + (yy_n_chars), num_to_read ); + + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + if ( (yy_n_chars) == 0 ) + { + if ( number_to_move == YY_MORE_ADJ ) + { + ret_val = EOB_ACT_END_OF_FILE; + header_yyrestart(header_yyin ); + } + + else + { + ret_val = EOB_ACT_LAST_MATCH; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = + YY_BUFFER_EOF_PENDING; + } + } + + else + ret_val = EOB_ACT_CONTINUE_SCAN; + + if ((yy_size_t) ((yy_n_chars) + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) { + /* Extend the array by 50%, plus the number we really need. */ + yy_size_t new_size = (yy_n_chars) + number_to_move + ((yy_n_chars) >> 1); + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) header_yyrealloc((void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf,new_size ); + if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" ); + } + + (yy_n_chars) += number_to_move; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] = YY_END_OF_BUFFER_CHAR; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] = YY_END_OF_BUFFER_CHAR; + + (yytext_ptr) = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0]; + + return ret_val; +} + +/* yy_get_previous_state - get the state just before the EOB char was reached */ + + static yy_state_type yy_get_previous_state (void) +{ + register yy_state_type yy_current_state; + register char *yy_cp; + + yy_current_state = (yy_start); + + for ( yy_cp = (yytext_ptr) + YY_MORE_ADJ; yy_cp < (yy_c_buf_p); ++yy_cp ) + { + register YY_CHAR yy_c = (*yy_cp ? 
yy_ec[YY_SC_TO_UI(*yy_cp)] : 1); + if ( yy_accept[yy_current_state] ) + { + (yy_last_accepting_state) = yy_current_state; + (yy_last_accepting_cpos) = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 29 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + } + + return yy_current_state; +} + +/* yy_try_NUL_trans - try to make a transition on the NUL character + * + * synopsis + * next_state = yy_try_NUL_trans( current_state ); + */ + static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state ) +{ + register int yy_is_jam; + register char *yy_cp = (yy_c_buf_p); + + register YY_CHAR yy_c = 1; + if ( yy_accept[yy_current_state] ) + { + (yy_last_accepting_state) = yy_current_state; + (yy_last_accepting_cpos) = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 29 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + yy_is_jam = (yy_current_state == 28); + + return yy_is_jam ? 0 : yy_current_state; +} + + static void yyunput (int c, register char * yy_bp ) +{ + register char *yy_cp; + + yy_cp = (yy_c_buf_p); + + /* undo effects of setting up header_yytext */ + *yy_cp = (yy_hold_char); + + if ( yy_cp < YY_CURRENT_BUFFER_LVALUE->yy_ch_buf + 2 ) + { /* need to shift things up to make room */ + /* +2 for EOB chars. 
*/ + register yy_size_t number_to_move = (yy_n_chars) + 2; + register char *dest = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[ + YY_CURRENT_BUFFER_LVALUE->yy_buf_size + 2]; + register char *source = + &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]; + + while ( source > YY_CURRENT_BUFFER_LVALUE->yy_ch_buf ) + *--dest = *--source; + + yy_cp += (int) (dest - source); + yy_bp += (int) (dest - source); + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = + (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_buf_size; + + if ( yy_cp < YY_CURRENT_BUFFER_LVALUE->yy_ch_buf + 2 ) + YY_FATAL_ERROR( "flex scanner push-back overflow" ); + } + + *--yy_cp = (char) c; + + (yytext_ptr) = yy_bp; + (yy_hold_char) = *yy_cp; + (yy_c_buf_p) = yy_cp; +} + +#ifndef YY_NO_INPUT +#ifdef __cplusplus + static int yyinput (void) +#else + static int input (void) +#endif + +{ + int c; + + *(yy_c_buf_p) = (yy_hold_char); + + if ( *(yy_c_buf_p) == YY_END_OF_BUFFER_CHAR ) + { + /* yy_c_buf_p now points to the character we want to return. + * If this occurs *before* the EOB characters, then it's a + * valid NUL; if not, then we've hit the end of the buffer. + */ + if ( (yy_c_buf_p) < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] ) + /* This was really a NUL. */ + *(yy_c_buf_p) = '\0'; + + else + { /* need more input */ + yy_size_t offset = (yy_c_buf_p) - (yytext_ptr); + ++(yy_c_buf_p); + + switch ( yy_get_next_buffer( ) ) + { + case EOB_ACT_LAST_MATCH: + /* This happens because yy_g_n_b() + * sees that we've accumulated a + * token and flags that we need to + * try matching the token before + * proceeding. But for input(), + * there's no matching to consider. + * So convert the EOB_ACT_LAST_MATCH + * to EOB_ACT_END_OF_FILE. + */ + + /* Reset buffer status. */ + header_yyrestart(header_yyin ); + + /*FALLTHROUGH*/ + + case EOB_ACT_END_OF_FILE: + { + if ( header_yywrap( ) ) + return EOF; + + if ( ! 
(yy_did_buffer_switch_on_eof) ) + YY_NEW_FILE; +#ifdef __cplusplus + return yyinput(); +#else + return input(); +#endif + } + + case EOB_ACT_CONTINUE_SCAN: + (yy_c_buf_p) = (yytext_ptr) + offset; + break; + } + } + } + + c = *(unsigned char *) (yy_c_buf_p); /* cast for 8-bit char's */ + *(yy_c_buf_p) = '\0'; /* preserve header_yytext */ + (yy_hold_char) = *++(yy_c_buf_p); + + return c; +} +#endif /* ifndef YY_NO_INPUT */ + +/** Immediately switch to a different input stream. + * @param input_file A readable stream. + * + * @note This function does not reset the start condition to @c INITIAL . + */ + void header_yyrestart (FILE * input_file ) +{ + + if ( ! YY_CURRENT_BUFFER ){ + header_yyensure_buffer_stack (); + YY_CURRENT_BUFFER_LVALUE = + header_yy_create_buffer(header_yyin,YY_BUF_SIZE ); + } + + header_yy_init_buffer(YY_CURRENT_BUFFER,input_file ); + header_yy_load_buffer_state( ); +} + +/** Switch to a different input buffer. + * @param new_buffer The new input buffer. + * + */ + void header_yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ) +{ + + /* TODO. We should be able to replace this entire function body + * with + * header_yypop_buffer_state(); + * header_yypush_buffer_state(new_buffer); + */ + header_yyensure_buffer_stack (); + if ( YY_CURRENT_BUFFER == new_buffer ) + return; + + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *(yy_c_buf_p) = (yy_hold_char); + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p); + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + YY_CURRENT_BUFFER_LVALUE = new_buffer; + header_yy_load_buffer_state( ); + + /* We don't actually know whether we did this switch during + * EOF (header_yywrap()) processing, but the only time this flag + * is looked at is after header_yywrap() is called, so it's safe + * to go ahead and always set it. 
+ */ + (yy_did_buffer_switch_on_eof) = 1; +} + +static void header_yy_load_buffer_state (void) +{ + (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + (yytext_ptr) = (yy_c_buf_p) = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos; + header_yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file; + (yy_hold_char) = *(yy_c_buf_p); +} + +/** Allocate and initialize an input buffer state. + * @param file A readable stream. + * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE. + * + * @return the allocated buffer state. + */ + YY_BUFFER_STATE header_yy_create_buffer (FILE * file, int size ) +{ + YY_BUFFER_STATE b; + + b = (YY_BUFFER_STATE) header_yyalloc(sizeof( struct yy_buffer_state ) ); + if ( ! b ) + YY_FATAL_ERROR( "out of dynamic memory in header_yy_create_buffer()" ); + + b->yy_buf_size = size; + + /* yy_ch_buf has to be 2 characters longer than the size given because + * we need to put in 2 end-of-buffer characters. + */ + b->yy_ch_buf = (char *) header_yyalloc(b->yy_buf_size + 2 ); + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in header_yy_create_buffer()" ); + + b->yy_is_our_buffer = 1; + + header_yy_init_buffer(b,file ); + + return b; +} + +/** Destroy the buffer. + * @param b a buffer created with header_yy_create_buffer() + * + */ + void header_yy_delete_buffer (YY_BUFFER_STATE b ) +{ + + if ( ! b ) + return; + + if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */ + YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0; + + if ( b->yy_is_our_buffer ) + header_yyfree((void *) b->yy_ch_buf ); + + header_yyfree((void *) b ); +} + +/* Initializes or reinitializes a buffer. + * This function is sometimes called more than once on the same buffer, + * such as during a header_yyrestart() or at EOF. 
+ */ + static void header_yy_init_buffer (YY_BUFFER_STATE b, FILE * file ) + +{ + int oerrno = errno; + + header_yy_flush_buffer(b ); + + b->yy_input_file = file; + b->yy_fill_buffer = 1; + + /* If b is the current buffer, then header_yy_init_buffer was _probably_ + * called from header_yyrestart() or through yy_get_next_buffer. + * In that case, we don't want to reset the lineno or column. + */ + if (b != YY_CURRENT_BUFFER){ + b->yy_bs_lineno = 1; + b->yy_bs_column = 0; + } + + b->yy_is_interactive = file ? (isatty( fileno(file) ) > 0) : 0; + + errno = oerrno; +} + +/** Discard all buffered characters. On the next scan, YY_INPUT will be called. + * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER. + * + */ + void header_yy_flush_buffer (YY_BUFFER_STATE b ) +{ + if ( ! b ) + return; + + b->yy_n_chars = 0; + + /* We always need two end-of-buffer characters. The first causes + * a transition to the end-of-buffer state. The second causes + * a jam in that state. + */ + b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR; + b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR; + + b->yy_buf_pos = &b->yy_ch_buf[0]; + + b->yy_at_bol = 1; + b->yy_buffer_status = YY_BUFFER_NEW; + + if ( b == YY_CURRENT_BUFFER ) + header_yy_load_buffer_state( ); +} + +/** Pushes the new state onto the stack. The new state becomes + * the current state. This function will allocate the stack + * if necessary. + * @param new_buffer The new state. + * + */ +void header_yypush_buffer_state (YY_BUFFER_STATE new_buffer ) +{ + if (new_buffer == NULL) + return; + + header_yyensure_buffer_stack(); + + /* This block is copied from header_yy_switch_to_buffer. */ + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *(yy_c_buf_p) = (yy_hold_char); + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p); + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + /* Only push if top exists. Otherwise, replace top. 
*/ + if (YY_CURRENT_BUFFER) + (yy_buffer_stack_top)++; + YY_CURRENT_BUFFER_LVALUE = new_buffer; + + /* copied from header_yy_switch_to_buffer. */ + header_yy_load_buffer_state( ); + (yy_did_buffer_switch_on_eof) = 1; +} + +/** Removes and deletes the top of the stack, if present. + * The next element becomes the new top. + * + */ +void header_yypop_buffer_state (void) +{ + if (!YY_CURRENT_BUFFER) + return; + + header_yy_delete_buffer(YY_CURRENT_BUFFER ); + YY_CURRENT_BUFFER_LVALUE = NULL; + if ((yy_buffer_stack_top) > 0) + --(yy_buffer_stack_top); + + if (YY_CURRENT_BUFFER) { + header_yy_load_buffer_state( ); + (yy_did_buffer_switch_on_eof) = 1; + } +} + +/* Allocates the stack if it does not exist. + * Guarantees space for at least one push. + */ +static void header_yyensure_buffer_stack (void) +{ + yy_size_t num_to_alloc; + + if (!(yy_buffer_stack)) { + + /* First allocation is just for 2 elements, since we don't know if this + * scanner will even need a stack. We use 2 instead of 1 to avoid an + * immediate realloc on the next call. + */ + num_to_alloc = 1; + (yy_buffer_stack) = (struct yy_buffer_state**)header_yyalloc + (num_to_alloc * sizeof(struct yy_buffer_state*) + ); + if ( ! (yy_buffer_stack) ) + YY_FATAL_ERROR( "out of dynamic memory in header_yyensure_buffer_stack()" ); + + memset((yy_buffer_stack), 0, num_to_alloc * sizeof(struct yy_buffer_state*)); + + (yy_buffer_stack_max) = num_to_alloc; + (yy_buffer_stack_top) = 0; + return; + } + + if ((yy_buffer_stack_top) >= ((yy_buffer_stack_max)) - 1){ + + /* Increase the buffer to prepare for a possible push. */ + int grow_size = 8 /* arbitrary grow size */; + + num_to_alloc = (yy_buffer_stack_max) + grow_size; + (yy_buffer_stack) = (struct yy_buffer_state**)header_yyrealloc + ((yy_buffer_stack), + num_to_alloc * sizeof(struct yy_buffer_state*) + ); + if ( ! 
(yy_buffer_stack) ) + YY_FATAL_ERROR( "out of dynamic memory in header_yyensure_buffer_stack()" ); + + /* zero only the new slots.*/ + memset((yy_buffer_stack) + (yy_buffer_stack_max), 0, grow_size * sizeof(struct yy_buffer_state*)); + (yy_buffer_stack_max) = num_to_alloc; + } +} + +/** Setup the input buffer state to scan directly from a user-specified character buffer. + * @param base the character buffer + * @param size the size in bytes of the character buffer + * + * @return the newly allocated buffer state object. + */ +YY_BUFFER_STATE header_yy_scan_buffer (char * base, yy_size_t size ) +{ + YY_BUFFER_STATE b; + + if ( size < 2 || + base[size-2] != YY_END_OF_BUFFER_CHAR || + base[size-1] != YY_END_OF_BUFFER_CHAR ) + /* They forgot to leave room for the EOB's. */ + return 0; + + b = (YY_BUFFER_STATE) header_yyalloc(sizeof( struct yy_buffer_state ) ); + if ( ! b ) + YY_FATAL_ERROR( "out of dynamic memory in header_yy_scan_buffer()" ); + + b->yy_buf_size = size - 2; /* "- 2" to take care of EOB's */ + b->yy_buf_pos = b->yy_ch_buf = base; + b->yy_is_our_buffer = 0; + b->yy_input_file = 0; + b->yy_n_chars = b->yy_buf_size; + b->yy_is_interactive = 0; + b->yy_at_bol = 1; + b->yy_fill_buffer = 0; + b->yy_buffer_status = YY_BUFFER_NEW; + + header_yy_switch_to_buffer(b ); + + return b; +} + +/** Setup the input buffer state to scan a string. The next call to header_yylex() will + * scan from a @e copy of @a str. + * @param yystr a NUL-terminated string to scan + * + * @return the newly allocated buffer state object. + * @note If you want to scan bytes that may contain NUL values, then use + * header_yy_scan_bytes() instead. + */ +YY_BUFFER_STATE header_yy_scan_string (yyconst char * yystr ) +{ + + return header_yy_scan_bytes(yystr,strlen(yystr) ); +} + +/** Setup the input buffer state to scan the given bytes. The next call to header_yylex() will + * scan from a @e copy of @a bytes. 
+ * @param yybytes the byte buffer to scan + * @param _yybytes_len the number of bytes in the buffer pointed to by @a bytes. + * + * @return the newly allocated buffer state object. + */ +YY_BUFFER_STATE header_yy_scan_bytes (yyconst char * yybytes, yy_size_t _yybytes_len ) +{ + YY_BUFFER_STATE b; + char *buf; + yy_size_t n; + int i; + + /* Get memory for full buffer, including space for trailing EOB's. */ + n = _yybytes_len + 2; + buf = (char *) header_yyalloc(n ); + if ( ! buf ) + YY_FATAL_ERROR( "out of dynamic memory in header_yy_scan_bytes()" ); + + for ( i = 0; i < _yybytes_len; ++i ) + buf[i] = yybytes[i]; + + buf[_yybytes_len] = buf[_yybytes_len+1] = YY_END_OF_BUFFER_CHAR; + + b = header_yy_scan_buffer(buf,n ); + if ( ! b ) + YY_FATAL_ERROR( "bad buffer in header_yy_scan_bytes()" ); + + /* It's okay to grow etc. this buffer, and we should throw it + * away when we're done. + */ + b->yy_is_our_buffer = 1; + + return b; +} + +#ifndef YY_EXIT_FAILURE +#define YY_EXIT_FAILURE 2 +#endif + +static void yy_fatal_error (yyconst char* msg ) +{ + (void) fprintf( stderr, "%s\n", msg ); + exit( YY_EXIT_FAILURE ); +} + +/* Redefine yyless() so it works in section 3 code. */ + +#undef yyless +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up header_yytext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + header_yytext[header_yyleng] = (yy_hold_char); \ + (yy_c_buf_p) = header_yytext + yyless_macro_arg; \ + (yy_hold_char) = *(yy_c_buf_p); \ + *(yy_c_buf_p) = '\0'; \ + header_yyleng = yyless_macro_arg; \ + } \ + while ( 0 ) + +/* Accessor methods (get/set functions) to struct members. */ + +/** Get the current line number. + * + */ +int header_yyget_lineno (void) +{ + + return header_yylineno; +} + +/** Get the input stream. + * + */ +FILE *header_yyget_in (void) +{ + return header_yyin; +} + +/** Get the output stream. + * + */ +FILE *header_yyget_out (void) +{ + return header_yyout; +} + +/** Get the length of the current token. 
+ * + */ +yy_size_t header_yyget_leng (void) +{ + return header_yyleng; +} + +/** Get the current token. + * + */ + +char *header_yyget_text (void) +{ + return header_yytext; +} + +/** Set the current line number. + * @param line_number + * + */ +void header_yyset_lineno (int line_number ) +{ + + header_yylineno = line_number; +} + +/** Set the input stream. This does not discard the current + * input buffer. + * @param in_str A readable stream. + * + * @see header_yy_switch_to_buffer + */ +void header_yyset_in (FILE * in_str ) +{ + header_yyin = in_str ; +} + +void header_yyset_out (FILE * out_str ) +{ + header_yyout = out_str ; +} + +int header_yyget_debug (void) +{ + return header_yy_flex_debug; +} + +void header_yyset_debug (int bdebug ) +{ + header_yy_flex_debug = bdebug ; +} + +static int yy_init_globals (void) +{ + /* Initialization is the same as for the non-reentrant scanner. + * This function is called from header_yylex_destroy(), so don't allocate here. + */ + + (yy_buffer_stack) = 0; + (yy_buffer_stack_top) = 0; + (yy_buffer_stack_max) = 0; + (yy_c_buf_p) = (char *) 0; + (yy_init) = 0; + (yy_start) = 0; + +/* Defined in main.c */ +#ifdef YY_STDINIT + header_yyin = stdin; + header_yyout = stdout; +#else + header_yyin = (FILE *) 0; + header_yyout = (FILE *) 0; +#endif + + /* For future reference: Set errno on error, since we are called by + * header_yylex_init() + */ + return 0; +} + +/* header_yylex_destroy is for both reentrant and non-reentrant scanners. */ +int header_yylex_destroy (void) +{ + + /* Pop the buffer stack, destroying each element. */ + while(YY_CURRENT_BUFFER){ + header_yy_delete_buffer(YY_CURRENT_BUFFER ); + YY_CURRENT_BUFFER_LVALUE = NULL; + header_yypop_buffer_state(); + } + + /* Destroy the stack itself. */ + header_yyfree((yy_buffer_stack) ); + (yy_buffer_stack) = NULL; + + /* Reset the globals. This is important in a non-reentrant scanner so the next time + * header_yylex() is called, initialization will occur. 
*/ + yy_init_globals( ); + + return 0; +} + +/* + * Internal utility routines. + */ + +#ifndef yytext_ptr +static void yy_flex_strncpy (char* s1, yyconst char * s2, int n ) +{ + register int i; + for ( i = 0; i < n; ++i ) + s1[i] = s2[i]; +} +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (yyconst char * s ) +{ + register int n; + for ( n = 0; s[n]; ++n ) + ; + + return n; +} +#endif + +void *header_yyalloc (yy_size_t size ) +{ + return (void *) malloc( size ); +} + +void *header_yyrealloc (void * ptr, yy_size_t size ) +{ + /* The cast to (char *) in the following accommodates both + * implementations that use char* generic pointers, and those + * that use void* generic pointers. It works with the latter + * because both ANSI C and C++ allow castless assignment from + * any pointer type to void*, and deal with argument conversions + * as though doing an assignment. + */ + return (void *) realloc( (char *) ptr, size ); +} + +void header_yyfree (void * ptr ) +{ + free( (char *) ptr ); /* see header_yyrealloc() for (char *) cast */ +} + +#define YYTABLES_NAME "yytables" + +#line 136 "fasta_header_parser.l" + + + +int header_yywrap() +{ + return 1; +} + +element_from_header* header_parser_main(char *h) +{ + int nbfields,memory_allocated; + element_from_header* header; + char* nbfields_n; + char* nbfields_v; + + nbfields_n = (char*) malloc(9*sizeof(char)); + nbfields_v = (char*) malloc(5*sizeof(char)); + + memory_allocated=MEMALLOCATED; + + nbfields=1; + + strcpy(nbfields_n, "nbfields"); + strcpy(nbfields_v, "1"); + + header = (element_from_header*) malloc(memory_allocated * sizeof(element_from_header)); + + header[0].name = nbfields_n; + header[0].value = nbfields_v; + + YY_BUFFER_STATE state; + + state=header_yy_scan_string(h); + + header_parser(&nbfields, &memory_allocated, &header); + + header_yy_delete_buffer(state); + + return header; +} + + + + + diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/fasta_header_parser.h 
b/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/fasta_header_parser.h new file mode 100644 index 0000000..985b460 --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/fasta_header_parser.h @@ -0,0 +1,13 @@ + +#ifndef FASTA_HEADER_PARSER_H_ +#define FASTA_HEADER_PARSER_H_ + +typedef struct { + char *name; + void *value; +}element_from_header; + +element_from_header* header_parser_main(char*); + + +#endif diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/fasta_header_parser.l b/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/fasta_header_parser.l new file mode 100644 index 0000000..e379d4e --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/fasta_header_parser.l @@ -0,0 +1,178 @@ +/* + * Add -ll in Makefile if you modify this file to convert to .c + */ + +%x REGID +%x REGNAME +%x REGVAL + +%{ + +#include +#include +#include "header_mem_handler.h" +#include "fasta_header_handler.h" + +#define MEMALLOCATED 10 +#define BUFFER 5 + +#define YY_DECL int header_parser(int *nbf, int *memory_allocated, element_from_header **p_header) + + +%} + +WORD [[:alnum:]:\-.{},'_()\#\[\]\|\&\"\'\/\%\+]+ +WORDID [[:alnum:]:\-.{},'_()\#\[\]\|\&\"\'\/\%\+=;]+ +SUP > +EOL \n +SEP ; +SPACE [[:blank:]]+ +EQUAL = + +%% + + int i; + int size_needed; + int free_size; + char* field; + + +{SUP} { + /*printf("\n{SUP},%s",yytext);*/ + BEGIN(REGID); + } + +{WORDID} { + i=0; + + field = malloc_field(&free_size); + (*p_header)[*nbf].name = (char*) malloc(3*sizeof(char)); + strcpy(((*p_header)[*nbf]).name,"id"); + + size_needed = strlen(yytext)+1; + (*p_header)[*nbf].value = (char*) malloc(sizeof(char)*size_needed); + strcpy(((*p_header)[*nbf]).value,yytext); + + (*nbf)++; + } + + +{SPACE} { + BEGIN(REGNAME); + } + +{WORD} { + /*fprintf(stderr,"\n{WORD} **%s**",yytext);*/ + field = store_in_field(field,yytext,&free_size,&i); + } + +{SPACE} { + /*fprintf(stderr,"\n{SPACE} **%s**",yytext);*/ + if (i != 0) + field = store_in_field(field,yytext,&free_size,&i); + } + +{EQUAL} { + 
/*fprintf(stderr,"\n{EQUAL},%s",yytext);*/ + field = store_in_header_table(field, &((*p_header)[*nbf].name), &free_size, &i); + BEGIN(REGVAL); + } + +{SEP} { + /*fprintf(stderr,"\n{SEP},%s",yytext);*/ + (*p_header)[*nbf].name = (char*) malloc(19*sizeof(char)); + strcpy((*p_header)[*nbf].name,"definition"); + field = store_in_header_table(field, &((*p_header)[*nbf].value), &free_size, &i); + p_header = check_and_realloc_mem_in_header_table(p_header, nbf, memory_allocated); + BEGIN(REGNAME); + } + +{WORD} { + /*fprintf(stderr,"\n{WORD} **%s**\n",yytext);*/ + field = store_in_field(field,yytext,&free_size,&i); + } + +{SPACE} { + /*fprintf(stderr,"\n{SPACE} **%s**\n",yytext);*/ + field = store_in_field(field,yytext,&free_size,&i); + } + +{SEP} { + /*fprintf(stderr,"\n{SEP},%s\n",yytext);*/ + + field = store_in_header_table(field, &((*p_header)[*nbf].value), &free_size, &i); + p_header = check_and_realloc_mem_in_header_table(p_header, nbf, memory_allocated); + BEGIN(REGNAME); + } + + +{EQUAL} { + /*fprintf(stderr, "\nWarning : separator ';' probably missing in header after %s",(*p_header)[*nbf].name);*/ + } + +<> { + field = store_in_header_table(field, &((*p_header)[*nbf].value), &free_size, &i); + p_header = check_and_realloc_mem_in_header_table(p_header, nbf, memory_allocated); + end_header_table(p_header, *nbf); + + free(field); + BEGIN(INITIAL); + return 0; + } + +<> { + /*(*p_header)[*nbf].name = (char*) malloc(sizeof(char)*19); + strcpy((*p_header)[*nbf].name,"other_informations"); + field = store_in_header_table(field, &((*p_header)[*nbf].value), &free_size, &i); + p_header = check_and_realloc_mem_in_header_table(p_header, nbf, memory_allocated); + */ + end_header_table(p_header, *nbf); + + free(field); + BEGIN(INITIAL); + return 0; + } + +%% + +int header_yywrap() +{ + return 1; +} + +element_from_header* header_parser_main(char *h) +{ + int nbfields,memory_allocated; + element_from_header* header; + char* nbfields_n; + char* nbfields_v; + + nbfields_n = 
(char*) malloc(9*sizeof(char)); + nbfields_v = (char*) malloc(5*sizeof(char)); + + memory_allocated=MEMALLOCATED; + + nbfields=1; + + strcpy(nbfields_n, "nbfields"); + strcpy(nbfields_v, "1"); + + header = (element_from_header*) malloc(memory_allocated * sizeof(element_from_header)); + + header[0].name = nbfields_n; + header[0].value = nbfields_v; + + YY_BUFFER_STATE state; + + state=yy_scan_string(h); + + header_parser(&nbfields, &memory_allocated, &header); + + yy_delete_buffer(state); + + return header; +} + + + + diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/fasta_seq_writer.c b/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/fasta_seq_writer.c new file mode 100644 index 0000000..ccee773 --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/fasta_seq_writer.c @@ -0,0 +1,76 @@ +#include +#include +#include +#include "sequence.h" +#include "fasta_header_parser.h" + + +void printOnlySeqFromFastaSeqPtr(fastaSeqPtr seq, FILE* output) +{ + char nuc; + int n=60; + int l = strlen(seq->sequence); + for (n=60; nsequence[n]; + seq->sequence[n]=0; + fprintf(output,"%s\n",seq->sequence+n-60); + seq->sequence[n]=nuc; + } + fprintf(output,"%s\n",seq->sequence+n-60); +} + + +void printOnlySeqFromChar(char* seq, FILE* output) +{ + char nuc; + int n=60; + int l = strlen(seq); + for (n=60; n%s\n",seq->rawheader); +} + + +void printOnlyHeaderFromTable(element_from_header* header, FILE* output) +{ + int i; + int nbf; + + nbf = atoi(header[0].value); + + fprintf(output,">%s ",header[1].value); + + for (i = 2; i <= nbf; i++) + { + if (strcmp(header[i].name, "definition") != 0) + { + fprintf(output,"%s",header[i].name); + fprintf(output,"="); + fprintf(output,"%s; ",header[i].value); + } + } + + if (strcmp(header[nbf].name, "definition") == 0) + fprintf(output,"%s; ",header[nbf].value); + + fprintf(output,"\n"); +} + + +void printHeaderAndSeqFromFastaSeqPtr(fastaSeqPtr seq, FILE* output) +{ + printOnlyHeaderFromFastaSeqPtr(seq, output); + 
printOnlySeqFromFastaSeqPtr(seq, output); +} diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/fasta_seq_writer.h b/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/fasta_seq_writer.h new file mode 100644 index 0000000..39f8212 --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/fasta_seq_writer.h @@ -0,0 +1,19 @@ + +#ifndef FASTA_SEQ_WRITER_H_ +#define FASTA_SEQ_WRITER_H_ + +#include "sequence.h" + + +void printOnlySeqFromFastaSeqPtr(fastaSeqPtr, FILE*); + +void printOnlySeqFromChar(char*, FILE*); + +void printOnlyHeaderFromFastaSeqPtr(fastaSeqPtr, FILE*); + +void printOnlyHeaderFromTable(element_from_header*, FILE*); + +void printHeaderAndSeqFromFastaSeqPtr(fastaSeqPtr, FILE*); + + +#endif diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/header_mem_handler.c b/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/header_mem_handler.c new file mode 100644 index 0000000..a0b8e7c --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/header_mem_handler.c @@ -0,0 +1,93 @@ +#include +#include +#include "header_mem_handler.h" +#include + +#define FIELD_BUFFER 1024 + + +char* malloc_field(int *free_size) +{ + char* field = (char*) malloc(sizeof(char) * FIELD_BUFFER); + field[0] = 0; + (*free_size) = FIELD_BUFFER; + return field; +} + +int check_mem_field(int size_needed) +{ + int number_of_chunks_to_alloc; + number_of_chunks_to_alloc = size_needed / FIELD_BUFFER + 1; + return number_of_chunks_to_alloc; +} + +char* realloc_field(int number_of_chunks_to_alloc, char* field) +{ + int size_needed; + size_needed = number_of_chunks_to_alloc * FIELD_BUFFER; + field = realloc(field, (size_needed)*sizeof(char)); + return field; +} + +char* check_and_realloc_field(char* field, int size_needed, int* free_size) +{ + size_needed = size_needed + strlen(field); + int number_of_chunks_to_alloc = check_mem_field(size_needed); + if (strlen(field)>0) + field = realloc_field(number_of_chunks_to_alloc, field); + else + { + free(field); + field = 
malloc(number_of_chunks_to_alloc * FIELD_BUFFER); + } + (*free_size) = number_of_chunks_to_alloc*FIELD_BUFFER - size_needed + 1; + return field; +} + + +char* store_in_field(char* field, char* yytext, int* free_size, int* i) +{ + int size_needed; + size_needed = strlen(yytext)+1; + if (size_needed > (*free_size)) + field = check_and_realloc_field(field, size_needed, free_size); + else + (*free_size) = (*free_size) - size_needed + 1; + strcpy(&(field[(*i)]),yytext); + (*i) = (*i)+size_needed-1; + return field; +} + + +char* store_in_header_table(char* field, char** storing_place, int* free_size, int* i) +{ + int size_needed; + size_needed = strlen(field)+1; + *storing_place = (char*) malloc(size_needed*sizeof(char)); + strcpy(*storing_place,field); + (*i)=0; + free(field); + field = malloc_field(free_size); + return field; +} + + +element_from_header** check_and_realloc_mem_in_header_table(element_from_header** p_header, int* nbf, int* memory_allocated) +{ + (*nbf)++; + + if (*nbf == *memory_allocated) + { + (*memory_allocated)++; + *p_header = (element_from_header*) realloc(*p_header, (*memory_allocated) * sizeof(element_from_header)); + } + + return p_header; +} + +void end_header_table(element_from_header** p_header, int nbf) +{ + nbf = nbf - 1; + //fprintf(stderr, "nbf = %d", nbf); + sprintf((*p_header)->value, "%d", nbf); +} diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/header_mem_handler.h b/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/header_mem_handler.h new file mode 100644 index 0000000..bfb591b --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/header_mem_handler.h @@ -0,0 +1,22 @@ +#ifndef HEADER_MEM_HANDLER_H_ +#define HEADER_MEM_HANDLER_H_ + +#include "fasta_header_parser.h" + +char* malloc_field(int*); + +int check_mem_field(int); + +char* realloc_field(int, char*); + +char* check_and_realloc_field(char*, int, int*); + +char* store_in_field(char*, char*, int*, int*); + +char* store_in_header_table(char*, char**, int*, int*); 
+ +element_from_header** check_and_realloc_mem_in_header_table(element_from_header**, int*, int*); + +void end_header_table(element_from_header** p_header, int nbf); + +#endif diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/sequence.c b/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/sequence.c new file mode 100644 index 0000000..4f903d9 --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/sequence.c @@ -0,0 +1,450 @@ +/** + * FileName: sequence.c + * Authors: Tiayyba Riaz, Celine Mercier + * Description: C file for sequence reading and parsing + * **/ + +#include +#include +#include +#include + +#include "../libutils/utilities.h" +#include "sequence.h" +#include "../libfile/fileHandling.h" +#include "fasta_header_handler.h" +#include "fasta_header_parser.h" + + +/* + * Function Name: seq_getNext(FILE *fp, char *fieldDelim) + * Description: Gets the next sequence from file by calling another function, passes the sequence + * to other function to get the header elements and nucleotide suquence into a strcuture of + * type fastaSeq and returns a pointer to this newly populated structure. 
+ */ + +fastaSeqPtr seq_getNext(FILE *fp, char *fieldDelim, BOOL isStandardSeq, BOOL onlyATGC) +{ + char *seq; + char *header; + char *strTemp; + fastaSeqPtr seqElem; + int seqLen; + + seq = seq_readNextFromFilebyLine(fp); + if (seq == NULL) return NULL; + + /* Find header separator \n, if not found return NULL */ + strTemp = strchr(seq, '\n'); + if(strTemp == NULL) + return NULL; + + seqLen = strlen(strTemp); + header = (char*) util_malloc(1+(strlen(seq) - seqLen)*sizeof(char), __FILE__, __LINE__); + + /* Separate header in header variable */ + strncpy(header, seq, strTemp - seq); + header[strTemp - seq] = '\0'; + /* Get memory for new sequence structure element */ + seqElem = (fastaSeqPtr) util_malloc(sizeof(fastaSeq), __FILE__, __LINE__); + /* Parse header and assign values to structure fields */ + seq_fillHeader(header, fieldDelim, seqElem); + /* Get clean sequence and assign to structure field */ + if (isStandardSeq) + if (onlyATGC) + seq_fillSeqOnlyATGC(strTemp, seqElem, seqLen); + else + seq_fillSeq(strTemp, seqElem, seqLen); + else + seq_fillDigitSeq(strTemp, seqElem, seqLen); + /* Type cast the char * seq to void pointer and deallocate the memory pointed by this */ + util_free((void *)seq); + /* Return new sequence structure element */ + return seqElem; +} + + +char *seq_readNextFromFilebyLine(FILE* fp) +{ + char newc = '\0'; + BOOL seqCompleted = FALSE; + int length = 500; + int32_t len; + char tempstr[length]; + char* buffer; + + if (feof(fp)) return NULL; + newc = file_nextChar(fp); + if (newc != '>') ungetc(newc, fp); + + buffer = util_malloc(1*sizeof(char), __FILE__, __LINE__); + buffer[0] = '\0'; + + while(!seqCompleted) + { + newc = file_nextChar(fp); + if(newc == '>' || newc == '\0') + { + seqCompleted = TRUE; + if (newc == '>') + ungetc(newc, fp); // Make sure next time we start from sequence delimiter > + } + else + { + ungetc(newc, fp); + if(file_nextLine( fp, tempstr, length) != NULL) + { + len = strlen(tempstr) + strlen(buffer) + 1; + buffer = 
util_realloc(buffer, len, __FILE__, __LINE__); + strcat(buffer, tempstr); + } + else + { + seqCompleted = TRUE; + } + } + } + return buffer; +} + + +/* + * Function Name: seq_fillHeader(char* header, char *fieldDelim, fastaSeqPtr seqElem) + */ +void seq_fillHeader(char* header, char *fieldDelim, fastaSeqPtr seqElem) +{ + char* IdEnd; + int IdSize; + + seqElem->rawheader = strdup(header); + + IdEnd = strchr(header, ' '); + if (IdEnd == NULL) + IdSize = strlen(header); + else + IdSize = strlen(header) - strlen(IdEnd); + + seqElem->accession_id = (char*) util_malloc(1+IdSize*sizeof(char), __FILE__, __LINE__); + + strncpy(seqElem->accession_id, header, IdSize); + + (seqElem->accession_id)[IdSize] = '\0'; +} + + +/* + * Function Name: seq_fillSeq(char *seq, fastaSeqPtr seqElem) + * Description: Parses the whole sequences for actual nucleotide sequences and stores that + * sequence in the field of structure 'seqElem' . + */ +void seq_fillSeq(char *seq, fastaSeqPtr seqElem, int seqLen) +{ + char* seqTemp; + char c; + int32_t index = 0, seqIndex = 0, len = strlen(seq); + char* seqAlphabets = "acgtACGT-nN"; + + seqTemp = (char*) util_malloc(seqLen*sizeof(char), __FILE__, __LINE__); + + while (index < len) + { + c = seq[index++]; + if (strchr(seqAlphabets, c) != NULL) + seqTemp[seqIndex++] = tolower(c); + } + seqTemp[seqIndex] = '\0'; + seqElem->length=seqIndex; + seqElem->sequence = strdup(seqTemp); +} + + +void seq_fillSeqOnlyATGC(char *seq, fastaSeqPtr seqElem, int seqLen) +{ + char* seqTemp; + char c; + int32_t index = 0, seqIndex = 0, len = strlen(seq); + char* seqAlphabets = "acgtACGT"; + int notAllATGC = 0; + + seqTemp = (char*) util_malloc(seqLen*sizeof(char), __FILE__, __LINE__); + + while (index < len) + { + c = seq[index++]; + if (strchr(seqAlphabets, c) != NULL) + seqTemp[seqIndex++] = tolower(c); + else if (c != '\n') + notAllATGC = 1; + } + + if (notAllATGC) + seqTemp[0] = '\0'; + else + { + seqTemp[seqIndex] = '\0'; + seqElem->length=seqIndex; + } + 
seqElem->sequence = strdup(seqTemp); +} + + +void seq_fillDigitSeq(char *seq, fastaSeqPtr seqElem, int seqLen) +{ + char* seqTemp; + char c; + int32_t index = 0, seqIndex = 0, len = strlen(seq); + + seqTemp = (char*) util_malloc(seqLen*sizeof(char), __FILE__, __LINE__); + + while (index < len) + { + c = seq[index++]; + if ((c >= '0' && c <= '9') || c == ' ') + seqTemp[seqIndex++] = c; + /*else + { + printf("Error in input file"); + exit(0); + }*/ + } + seqTemp[seqIndex] = '\0'; + seqElem->sequence = strdup(seqTemp); +} + + +fastaSeqCount seq_readAllSeq2(char *fileName, BOOL isStandardSeq, BOOL onlyATGC) +{ + FILE* fp; + fastaSeqPtr seqPtr; + fastaSeqPtr seqPtrAr; + + int32_t counter = 0; + int32_t slots = 1000; + fastaSeqCount allseqs; + int32_t discarded=0; + + fp = file_open(fileName, TRUE); + + if (fp == NULL) + { + fprintf(stderr, "\nCould not open file.\n"); + exit(1); + } + + exitIfEmptyFile(fp); + + seqPtrAr = (fastaSeqPtr) util_malloc(slots*sizeof(fastaSeq), __FILE__, __LINE__); + + seqPtr = seq_getNext(fp, " ", isStandardSeq, onlyATGC); + + while (seqPtr != NULL) + { + if (counter == slots) + { + slots += 1000; + seqPtrAr = (fastaSeqPtr)util_realloc(seqPtrAr, slots*sizeof(fastaSeq), __FILE__, __LINE__); + } + + if ((seqPtr->sequence)[0] != '\0') + seqPtrAr[counter++] = *seqPtr; + else + discarded++; + + util_free((void *)seqPtr); + seqPtr = seq_getNext(fp, " ", isStandardSeq, onlyATGC); + } + fclose(fp); + + if (counter != slots) + seqPtrAr = (fastaSeqPtr)util_realloc(seqPtrAr, counter*sizeof(fastaSeq), __FILE__, __LINE__); + + allseqs.count = counter; + allseqs.fastaSeqs = seqPtrAr; + + if (discarded) + fprintf(stderr, "\nDiscarded %d sequences that did not contain only 'AaTtGgCc' characters.", discarded); + + return allseqs; +} + + +int32_t seq_findSeqByAccId (char *accid, fastaSeqCountPtr allseqs) +{ + int32_t i; + + for (i = 0; i < allseqs->count; i++) + { + if (strcmp (accid, allseqs->fastaSeqs[i].accession_id) == 0) + return i; + } + return -1; +} + 
+ +void seq_printSeqs (fastaSeqCountPtr allseq) +{ + int32_t i; + + for (i = 0; i < allseq->count; i++) + //for (i = 0; i < 4; i++) + { + if (allseq->fastaSeqs[i].sequence == NULL) continue; + if (allseq->fastaSeqs[i].rawheader) + printf (">%s\n", allseq->fastaSeqs[i].rawheader); + else + printf (">%s\n", allseq->fastaSeqs[i].accession_id); + printf ("%s\n", allseq->fastaSeqs[i].sequence); + } +} + + +int cleanDB(fastaSeqCount db) // replace not a/t/g/c with a's +{ + int32_t i; + char *seq; + BOOL changed; + int32_t seqchanged=0; + int32_t nucchanged=0; + + fprintf(stderr,"Cleaning dataset..."); + + for (i=0; i < db.count;i++) + { + + changed=FALSE; + for (seq = db.fastaSeqs[i].sequence; *seq!=0; seq++) + { + if (*seq!='a' && *seq!='c' && *seq!='g' && *seq!='t') + { + changed=TRUE; + nucchanged++; + *seq='a'; + } + } + if (changed) + seqchanged++; + } + + if (seqchanged) + fprintf(stderr," : %d nucleotides substituted in %d sequences\n",nucchanged,seqchanged); + else + fprintf(stderr," : Done\n"); + + return(db.count); +} + + +void addCounts(fastaSeqCount* db) +{ + int s; + char* count; + element_from_header* header; + char* count_n; + char* count_v; + + count_n = (char*) malloc(6*sizeof(char)); + count_v = (char*) malloc(2*sizeof(char)); + + strcpy(count_n, "count"); + strcpy(count_v, "1"); + + for (s=0; s < db->count; s++) + { + header = header_parser_main(db->fastaSeqs[s].rawheader); + count = getItemFromHeader("count", header); + if (count == 0) // no count field + { + header = table_header_add_field(header, count_n, count_v); + db->fastaSeqs[s].count = 1; + } + else + db->fastaSeqs[s].count = atoi(count); + db->fastaSeqs[s].header = header; + } +} + + +int uniqSeqsVector(fastaSeqCount* db, fastaSeqPtr** uniqSeqs) +{ + int i, j, k; + *(*(uniqSeqs)) = db->fastaSeqs; + db->fastaSeqs[0].uniqHead = TRUE; + + i = 0; + k = 1; + + for (j=1; j < db->count; j++) + { + if (strcmp(db->fastaSeqs[i].sequence, db->fastaSeqs[j].sequence) == 0) + { + db->fastaSeqs[i].count += 
db->fastaSeqs[j].count; + db->fastaSeqs[j].uniqHead = FALSE; + } + else + { + db->fastaSeqs[j].uniqHead = TRUE; + *(*(uniqSeqs)+k) = (db->fastaSeqs)+j; + k++; + i = j; + } + } + return(k); +} + + +void calculateMaxAndMinLen(fastaSeqPtr* db, int n, int* lmax, int* lmin) +{ + int i; + int l; + + *lmax = 0; + for (i=0; i < n; i++) + { + l = (*(db+i))->length; + if (l > *lmax) + *lmax = l; + } + + *lmin = *lmax; + for (i=0; i < n; i++) + { + l = (*(db+i))->length; + if (l < *lmin) + *lmin = l; + } +} + + +void calculateMaxAndMinLenDB(fastaSeqCount db, int* lmax, int* lmin) +{ + int i; + int l; + + *lmax = 0; + for (i=0; i < db.count; i++) + { + l = ((db.fastaSeqs)+i)->length; + if (l > *lmax) + *lmax = l; + } + + *lmin = *lmax; + for (i=0; i < db.count; i++) + { + l = ((db.fastaSeqs)+i)->length;; + if (l < *lmin) + *lmin = l; + } +} + + +int sortSeqsWithCounts(const void **s1, const void **s2) +{ + return(((fastaSeqPtr) *s2)->count - ((fastaSeqPtr) *s1)->count); +} + + +int reverseSortSeqsWithCounts(const void **s1, const void **s2) +{ + return(((fastaSeqPtr) *s1)->count - ((fastaSeqPtr) *s2)->count); +} diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/sequence.h b/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/sequence.h new file mode 100644 index 0000000..fa2d782 --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/libfasta/sequence.h @@ -0,0 +1,64 @@ +/** + * FileName: sequence.h + * Authors: Tiayyba Riaz, Celine Mercier + * Description: Prototypes and other declarations for sequences + * **/ +#ifndef SEQUENCE_H_ +#define SEQUENCE_H_ + +#include +#include +#include "../libutils/utilities.h" +#include "fasta_header_parser.h" + + +typedef struct { + char* accession_id; // identifier + char *rawheader; // not parsed header + element_from_header* header; // parsed header + char *sequence; // DNA sequence itself + int32_t length; // DNA sequence's length + int32_t count; // abundance of the sequence + unsigned char *table; // 4mer occurrence table build using 
function buildTable + int32_t over; // count of 4mer with occurrences greater than 255 (overflow) + struct fastaSeqPtr* next; // next unique sequence for example + BOOL cluster_center; // whether the sequence is a cluster center or not + int32_t cluster_weight; // cluster weight when sequence is cluster center + int32_t cluster_weight_unique_ids; // cluster weight when sequence is cluster center, counting the number sequence records + double score; // score with cluster center for example + struct fastaSeqPtr* center; // pointer to the sequence's cluster center + int32_t center_index; // index of the sequence's cluster center + BOOL uniqHead; // whether the sequence is a unique head or not + char* columns_BIOM; // to print in BIOM format + int columns_BIOM_size; // size allocated for columns_BIOM + char* line_OTU_table; // to print in OTU table format + int line_OTU_table_size; // size allocated for line_OTU_table + struct hashtable *sample_counts; // sample counts for sumaclean +}fastaSeq,*fastaSeqPtr; + + +typedef struct { + int32_t count; + fastaSeqPtr fastaSeqs; +}fastaSeqCount, *fastaSeqCountPtr; + + +fastaSeqPtr seq_getNext(FILE *fp, char *fieldDelim, BOOL isStandardSeq, BOOL onlyATGC); +char *seq_readNextFromFilebyLine(FILE* fp); +void seq_fillSeq(char *seq, fastaSeqPtr seqElem, int seqLen); +void seq_fillSeqOnlyATGC(char *seq, fastaSeqPtr seqElem, int seqLen); +void seq_fillDigitSeq(char *seq, fastaSeqPtr seqElem, int seqLen); +void seq_fillHeader(char* header, char *fieldDelim, fastaSeqPtr seqElem); +fastaSeqCount seq_readAllSeq2(char *fileName, BOOL isStandardSeq, BOOL onlyATGC); +int32_t seq_findSeqByAccId (char *accid, fastaSeqCountPtr allseqs); +void seq_printSeqs (fastaSeqCountPtr allseq); +int cleanDB(fastaSeqCount); +void addCounts(fastaSeqCount* db); +int uniqSeqsVector(fastaSeqCount* db, fastaSeqPtr** uniqSeqs); +void calculateMaxAndMinLen(fastaSeqPtr* db, int n, int* lmax, int* lmin); +void calculateMaxAndMinLenDB(fastaSeqCount db, int* lmax, 
int* lmin); +int sortSeqsWithCounts(const void **s1, const void **s2); +int reverseSortSeqsWithCounts(const void **s1, const void **s2); +void readSampleCounts(fastaSeqCount* db, char* key_name); + +#endif /*SEQUENCE_H_*/ diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/libfile/Makefile b/src/sumatra/sumatra-1.0.10/sumalibs/libfile/Makefile new file mode 100644 index 0000000..fc12708 --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/libfile/Makefile @@ -0,0 +1,25 @@ + +SOURCES = fileHandling.c + + +SRCS=$(SOURCES) + + +OBJECTS= $(patsubst %.c,%.o,$(SOURCES)) + +LIBFILE= libfile.a +RANLIB=ranlib + + +include ../global.mk + +all: $(LIBFILE) + +clean: + rm -rf $(OBJECTS) $(LIBFILE) + rm -f *.P + rm -f *.a + +$(LIBFILE): $(OBJECTS) + ar -cr $@ $? + $(RANLIB) $@ diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/libfile/fileHandling.c b/src/sumatra/sumatra-1.0.10/sumalibs/libfile/fileHandling.c new file mode 100644 index 0000000..af05b9a --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/libfile/fileHandling.c @@ -0,0 +1,88 @@ +/** + * FileName: fileHandling.c + * Authors: Tiayyba Riaz, Celine Mercier + * Description: C file for file handling functions + * **/ + +#include +#include +#include + +#include "../libutils/utilities.h" + +/* + * Function Name: fileOpen(char* fileName, BOOL abortOnError) + * Description: Opens the file and returns the pointer to file object + */ +FILE *file_open(char* fileName, BOOL abortOnError) +{ + FILE* fp; + + if (fileName == NULL && abortOnError) + ERRORABORT(FILE_OPENING_ERROR, "File name not given."); + + if (fileName == NULL) + return NULL; + + fp = fopen(fileName, "r"); + return fp; +} + +FILE *file_openrw(char* fileName, BOOL abortOnError) +{ + FILE* fp; + + if (fileName == NULL && abortOnError) + ERRORABORT(FILE_OPENING_ERROR, "File name not given."); + + if (fileName == NULL) + return NULL; + + fp = fopen(fileName, "w+"); + return fp; +} + +/* + * Function Name: fileNextChar(FILE* fp) + * Description: Reads the file and 
returns next character, if file is null or its end of file, returns \Ż. + */ +char file_nextChar(FILE* fp) +{ + if (fp == NULL) + return '\0'; + + if(feof(fp)) + return '\0'; + + return (char) fgetc(fp); +} + +/* + * Function Name: *fileNextLine(FILE *fp, char *buffer, int32_t bufferSize) + * Description: Reads the file and returns next line, if file is null or its end of file, returns \Ż. + */ +char *file_nextLine(FILE *fp, char *buffer, int32_t bufferSize) +{ + if(fp == NULL) + return NULL; + + if(feof(fp)) + return NULL; + + return fgets(buffer, bufferSize, fp); +} + + +void exitIfEmptyFile(FILE *file) +{ + long savedOffset = ftell(file); + fseek(file, 0, SEEK_END); + + if (ftell(file) == 0) + { + fprintf(stderr, "\nInput file is empty.\n"); + exit(1); + } + fseek(file, savedOffset, SEEK_SET); +} + diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/libfile/fileHandling.h b/src/sumatra/sumatra-1.0.10/sumalibs/libfile/fileHandling.h new file mode 100644 index 0000000..334d8c3 --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/libfile/fileHandling.h @@ -0,0 +1,20 @@ +/** + * FileName: fileHandling.h + * Authors: Tiayyba Riaz, Celine Mercier + * Description: Header file for file handling functions + * **/ + + +#ifndef FILEHANDLING_H_ +#define FILEHANDLING_H_ + +#include "../libutils/utilities.h" +/* Prototypes */ + +FILE *file_open(char* fileName, BOOL abortOnError); +char file_nextChar(FILE* fp); +char *file_nextLine(FILE *fp, char *buffer, int32_t bufferSize); +FILE *file_openrw(char* fileName, BOOL abortOnError); +void exitIfEmptyFile(FILE *file); + +#endif /*FILEHANDLING_H_*/ diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/Makefile b/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/Makefile new file mode 100644 index 0000000..43a787c --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/Makefile @@ -0,0 +1,25 @@ + +SOURCES = sse_banded_LCS_alignment.c \ + upperband.c + +SRCS=$(SOURCES) + + +OBJECTS= $(patsubst %.c,%.o,$(SOURCES)) + +LIBFILE= 
liblcs.a +RANLIB=ranlib + + +include ../global.mk + +all: $(LIBFILE) + +clean: + rm -rf $(OBJECTS) $(LIBFILE) + rm -f *.P + rm -f *.a + +$(LIBFILE): $(OBJECTS) + ar -cr $@ $? + $(RANLIB) $@ diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/_lcs.ext.1.c b/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/_lcs.ext.1.c new file mode 100644 index 0000000..eeb1a21 --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/_lcs.ext.1.c @@ -0,0 +1,168 @@ +#include "_lcs.h" + +#include +#include +#include + +#include + + + +// Allocate a band allowing to align sequences of length : 'length' + +column_t* allocateColumn(int length,column_t *column, bool mode8bits) +{ + int size; + bool newc = false; + + // The band length should be equal to the length + // of the sequence + 7 for taking into account its + // shape + + size = (length+1) * ((mode8bits) ? sizeof(int8_t):sizeof(int16_t)); + + + // If the pointer to the old column is NULL we allocate + // a new column + + if (column==NULL) + { + + column = malloc(sizeof(column_t)); + if (!column) + return NULL; + + column->size = 0; + column->data.shrt=NULL; + column->score.shrt=NULL; + newc = true; + } + + // Otherwise we check if its size is sufficient + // or if it should be extended + + if (size > column->size) + { + int16_t *old = column->data.shrt; + int16_t *olds= column->score.shrt; + + column->data.shrt = malloc(size); + column->score.shrt= malloc(size); + + if (column->data.shrt==NULL || column->score.shrt==NULL) + { + fprintf(stderr,"Allocation Error on column for a size of %d\n" , size); + column->data.shrt = old; + column->score.shrt= olds; + + if (newc) + { + free(column); + column=NULL; + return NULL; + } + return NULL; + } + else + column->size = size; + } + + return column; +} + +void freeColumn(column_p column) +{ + if (column) + { + if (column->data.shrt) + free(column->data.shrt); + + if (column->score.shrt) + free(column->score.shrt); + + free(column); + } +} + +int fastLCSScore(const char* seq1, const 
char* seq2,column_pp column,int32_t* lpath) +{ + return fastLCSScore16(seq1,seq2,column,lpath); +} + +int simpleLCS(const char* seq1, const char* seq2,column_pp ppcolumn,int32_t* lpath) +{ + int lseq1,lseq2; // length of the both sequences + int lcs; + int itmp; // tmp variables for swap + const char* stmp; // + int32_t *score; + int32_t *path; + column_t *column; + int32_t i,j; + int32_t sl,su,sd; + int32_t pl,pu,pd; + + // Made seq1 the longest sequences + lseq1=strlen(seq1); + lseq2=strlen(seq2); + + if (lseq1 < lseq2) + { + itmp=lseq1; + lseq1=lseq2; + lseq2=itmp; + + stmp=seq1; + seq1=seq2; + seq2=stmp; + } + + lseq1++; + lseq2++; + + // a band sized to the smallest sequence is allocated + + if (ppcolumn) + column = *ppcolumn; + else + column=NULL; + + column = allocateColumn(lseq1*2,column,0); + score = (int32_t*) column->score.shrt; + path = (int32_t*) column->data.shrt; + + memset(score,0,lseq1 * sizeof(int32_t)); + + for (j=0; j < lseq1; j++) + path[j]=j; + + for (i=1; i< lseq2; i++) + { + sl=0; + pl=i; + for (j=1; j < lseq1; j++) + { + sd=score[j-1] + (seq2[i-1]==seq1[j-1] ? 
1:0); + pd=path[j-1] + 1; + + su=score[j]; + pu=path[j] + 1; + + score[j-1]=sl; + + if (su > sl) sl=su, pl=pu; + if (sd > sl) sl=sd, pl=pd; + } + } + + lcs = sl; + if(lpath) *lpath=pl; + + if (ppcolumn) + *ppcolumn=column; + else + freeColumn(column); + + return lcs; +} + diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/_lcs.ext.2.c b/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/_lcs.ext.2.c new file mode 100644 index 0000000..381dc6a --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/_lcs.ext.2.c @@ -0,0 +1,34 @@ +#include "_lcs.h" + +#include +#include +#include + +#include + + + + +#define VSIZE (8) +#define VTYPE vInt16 +#define STYPE int16_t +#define CMENB shrt +#define VMODE false +#define FASTLCSSCORE fastLCSScore16 +#define INSERT_REG _MM_INSERT_EPI16 +#define EXTRACT_REG _MM_EXTRACT_EPI16 +#define EQUAL_REG _MM_CMPEQ_EPI16 +#define GREATER_REG _MM_CMPGT_EPI16 +#define SMALLER_REG _MM_CMPLT_EPI16 +#define ADD_REG _MM_ADD_EPI16 +#define SUB_REG _MM_SUB_EPI16 +#define AND_REG _MM_AND_SI128 +#define ANDNOT_REG _MM_ANDNOT_SI128 +#define OR_REG _MM_OR_SI128 +#define SET_CONST _MM_SET1_EPI16 +#define GET_MAX _MM_MAX_EPI16 +#define GET_MIN _MM_MIN_EPI16 +#define MIN_SCORE INT16_MIN +#define MAX_SCORE 32000 + +#include "_lcs_fast.h" diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/_lcs.ext.3.c b/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/_lcs.ext.3.c new file mode 100644 index 0000000..5c3a150 --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/_lcs.ext.3.c @@ -0,0 +1,34 @@ +#include "_lcs.h" + +#include +#include +#include + +#include + + + + +#define VSIZE (16) +#define VTYPE vInt8 +#define STYPE int8_t +#define CMENB byte +#define VMODE true +#define FASTLCSSCORE fastLCSScore8 +#define INSERT_REG _MM_INSERT_EPI8 +#define EXTRACT_REG _MM_EXTRACT_EPI8 +#define EQUAL_REG _MM_CMPEQ_EPI8 +#define GREATER_REG _MM_CMPGT_EPI8 +#define SMALLER_REG _MM_CMPLT_EPI8 +#define ADD_REG _MM_ADD_EPI8 +#define SUB_REG _MM_SUB_EPI8 +#define 
AND_REG _MM_AND_SI128 +#define ANDNOT_REG _MM_ANDNOT_SI128 +#define OR_REG _MM_OR_SI128 +#define SET_CONST _MM_SET1_EPI8 +#define GET_MAX _MM_MAX_EPI8 +#define GET_MIN _MM_MIN_EPI8 +#define MIN_SCORE INT8_MIN +#define MAX_SCORE 127 + +#include "_lcs_fast.h" diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/_lcs.h b/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/_lcs.h new file mode 100644 index 0000000..cfc032f --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/_lcs.h @@ -0,0 +1,29 @@ +#include "../libsse/_sse.h" + +#define bool char +#define false (1==0) +#define true (1==1) + +typedef struct { + int16_t size; + + union { int16_t *shrt; + int8_t *byte; + } data; + + union { int16_t *shrt; + int8_t *byte; + } score; + + +} column_t, **column_pp, *column_p; + +column_p allocateColumn(int length,column_t *column, bool mode8bits); + +void freeColumn(column_p column); + +int fastLCSScore16(const char* seq1, const char* seq2,column_pp ppcolumn,int32_t* lpath); +int fastLCSScore8(const char* seq1, const char* seq2,column_pp ppcolumn,int32_t* lpath); +int simpleLCS(const char* seq1, const char* seq2,column_pp ppcolumn,int32_t* lpath); + +int fastLCSScore(const char* seq1, const char* seq2,column_pp column,int32_t* lpath); diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/_lcs_fast.h b/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/_lcs_fast.h new file mode 100644 index 0000000..3d0ac00 --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/_lcs_fast.h @@ -0,0 +1,597 @@ + +/* + * Print a SSE register for debug purpose + */ + +#ifdef __SSE2__ + +static void printreg(VTYPE r) +{ + STYPE a0,a1,a2,a3,a4,a5,a6,a7; +#if VMODE + STYPE a8,a9,a10,a11,a12,a13,a14,a15; +#endif + + a0= EXTRACT_REG(r,0); + a1= EXTRACT_REG(r,1); + a2= EXTRACT_REG(r,2); + a3= EXTRACT_REG(r,3); + a4= EXTRACT_REG(r,4); + a5= EXTRACT_REG(r,5); + a6= EXTRACT_REG(r,6); + a7= EXTRACT_REG(r,7); +#if VMODE + a8= EXTRACT_REG(r,8); + a9= EXTRACT_REG(r,9); + a10= EXTRACT_REG(r,10); + a11= 
EXTRACT_REG(r,11); + a12= EXTRACT_REG(r,12); + a13= EXTRACT_REG(r,13); + a14= EXTRACT_REG(r,14); + a15= EXTRACT_REG(r,15); +#endif + +printf( "a00 :-> %7d %7d %7d %7d " + " %7d %7d %7d %7d " +#if VMODE + "%7d %7d %7d %7d " + " %7d %7d %7d %7d " +#endif + "\n" + , a0,a1,a2,a3,a4,a5,a6,a7 +#if VMODE + , a8,a9,a10,a11,a12,a13,a14,a15 +#endif +); +} + +/* + * set position p of a SSE register with the value v + */ + +static inline VTYPE insert_reg(VTYPE r, STYPE v, int p) +{ + switch (p) { + case 0: return INSERT_REG(r,v,0); + case 1: return INSERT_REG(r,v,1); + case 2: return INSERT_REG(r,v,2); + case 3: return INSERT_REG(r,v,3); + case 4: return INSERT_REG(r,v,4); + case 5: return INSERT_REG(r,v,5); + case 6: return INSERT_REG(r,v,6); + case 7: return INSERT_REG(r,v,7); +#if VMODE + case 8: return INSERT_REG(r,v,8); + case 9: return INSERT_REG(r,v,9); + case 10: return INSERT_REG(r,v,10); + case 11: return INSERT_REG(r,v,11); + case 12: return INSERT_REG(r,v,12); + case 13: return INSERT_REG(r,v,13); + case 14: return INSERT_REG(r,v,14); + case 15: return INSERT_REG(r,v,15); +#endif + } + return _MM_SETZERO_SI128(); +} + +static inline STYPE extract_reg(VTYPE r, int p) +{ + switch (p) { + case 0: return EXTRACT_REG(r,0); + case 1: return EXTRACT_REG(r,1); + case 2: return EXTRACT_REG(r,2); + case 3: return EXTRACT_REG(r,3); + case 4: return EXTRACT_REG(r,4); + case 5: return EXTRACT_REG(r,5); + case 6: return EXTRACT_REG(r,6); + case 7: return EXTRACT_REG(r,7); +#if VMODE + case 8: return EXTRACT_REG(r,8); + case 9: return EXTRACT_REG(r,9); + case 10: return EXTRACT_REG(r,10); + case 11: return EXTRACT_REG(r,11); + case 12: return EXTRACT_REG(r,12); + case 13: return EXTRACT_REG(r,13); + case 14: return EXTRACT_REG(r,14); + case 15: return EXTRACT_REG(r,15); +#endif + } + return 0; +} + +#define GET_H_SYMBOLE(s,p) ((p && p < lseq1) ? (s)[(p)-1]:255) +#define GET_V_SYMBOLE(s,p) ((p && p < lseq2) ? 
(s)[(p)-1]:0) + +#define LSHIFT_SCORE(r) { r = _MM_SLLI_SI128((r),sizeof(STYPE)); } +#define SET_H_SYMBOLE(r,p,s) { r = insert_reg((r),(STYPE)GET_H_SYMBOLE(seq1,(s)),(p)); } +#define PUSH_V_SYMBOLE(r,s) { r = insert_reg(_MM_SLLI_SI128((r),sizeof(STYPE)),(STYPE)GET_V_SYMBOLE(seq2,(s)),0); } +#define EQUAL(f1,f2) _MM_AND_SI128(EQUAL_REG((f1),(f2)),SET_CONST(1)) + +int FASTLCSSCORE(const char* seq1, const char* seq2,column_pp ppcolumn,int32_t* lpath) +{ + int lseq1,lseq2; // length of the both sequences + + int itmp; // tmp variables for swap + const char* stmp; // + + int nbands; // Number of bands of width eight in the score matrix + int lastband; // width of the last band + + // Register for scanning the score matrix + VTYPE minus1; + VTYPE minus2; + VTYPE current; + + VTYPE left; + VTYPE top; + VTYPE diag; + + + VTYPE sminus1; + VTYPE sminus2; + VTYPE scurrent; + + VTYPE sleft; + VTYPE stop; + VTYPE sdiag; + + VTYPE way; + VTYPE onevect; + VTYPE maxvect; + + VTYPE fhseq; // The fragment of the horizontal sequence + // to consider for aligment + VTYPE fvseq; // The fragment of the horizontal sequence + // to consider for aligment + VTYPE match; + + int band; + int line; + int limit; + + int lcs; + + int h; + int i; + + column_t *column; + + + // Made seq1 the longest sequences + lseq1=strlen(seq1); + lseq2=strlen(seq2); + + if (lseq1 < 10 || lseq2 < 10) + return simpleLCS(seq1,seq2,ppcolumn,lpath); + + if (lseq1 < lseq2) + { + itmp=lseq1; + lseq1=lseq2; + lseq2=itmp; + + stmp=seq1; + seq1=seq2; + seq2=stmp; + } + + // we add one to both lengths for taking into + // account the extra line and column in the score + // matrix + + lseq1++; + lseq2++; + + // a band sized to the smallest sequence is allocated + + if (ppcolumn) + column = *ppcolumn; + else + column=NULL; + + column = allocateColumn(lseq2,column,VMODE); + + // Check memory allocation + if (column == NULL) + return -1; + + for (i=0; idata.CMENB[i]=MIN_SCORE; + column->score.CMENB[i]=-1; + } + + nbands = 
lseq1 / VSIZE; // You have VSIZE element in one SSE register + // Alignment will be realized in nbands + + lastband = lseq1 - (nbands * VSIZE); // plus one of width lastband except if + // lastband==0 + + if (lastband) nbands++; + else lastband=VSIZE; + + lastband--; + +// printf("seq1 : %s seq2 : %s\n",seq1,seq2); + + + minus2 = SET_CONST(MIN_SCORE); + minus1 = _MM_SETZERO_SI128(); + + sminus1= _MM_SETZERO_SI128(); + sminus2= _MM_SETZERO_SI128(); + onevect= SET_CONST(1); + maxvect= SET_CONST(MAX_SCORE); + + h=0; + + fhseq = _MM_SETZERO_SI128(); + fvseq = _MM_SETZERO_SI128(); + + // + // Beginning of the first band + // + + for (line = 0; line < VSIZE; line++,h++) // avant VSIZE - 1 + { +// printf("line= %4d h= %4d\n",line,h); + SET_H_SYMBOLE(fhseq,line,h) + PUSH_V_SYMBOLE(fvseq,line) + minus2 = insert_reg(minus2,0,h); + minus1 = insert_reg(minus1,MIN_SCORE,line); // 0 avant + match = EQUAL(fhseq,fvseq); + + if (lpath) + { + sminus2 = insert_reg(sminus2,line-1,line); // Je ne suis pas certain de l'initialisation + sminus1 = insert_reg(sminus1,0,line); + } + +// printreg(fvseq); +// printreg(fhseq); +// printreg(match); +// printf("================================\n"); + + current = minus1; // The best score is the upper one + // It cannot be the best as set to MIN_SCORE + + left = minus1; + +// printf("Vert = "); printreg(current); + + + LSHIFT_SCORE(minus1) // I shift minus1 so now I'll compare with the left position + minus1=insert_reg(minus1,(column)->data.CMENB[line],0); + + top=minus1; + + if (lpath) + { + sleft=sminus1; // I store the path length corresponding to the upper path + LSHIFT_SCORE(sminus1) // I shift to prepare the score coming from the left side + sminus1=insert_reg(sminus1,(column)->score.CMENB[line],0); + stop=sminus1; + sdiag=sminus2; + + } + +// printf("Horz = "); printreg(minus1); + + current = GET_MAX(current,minus1); // Look for the best between upper and left + +// printf("BstHV= "); printreg(current); +// +// printf("Diag = "); 
printreg(ADD_REG(minus2,match)); + + diag=minus2; + + // minus2 = ; // Minus2 contains the diagonal score, so I add the match reward + // Diag score are setup to 0 so this one will win on the first iteration + current = GET_MAX(current,ADD_REG(minus2,match)); + + if (lpath) + { +// printf("\n"); +// printf("current: "); +// printreg(current); +// printf("current: "); +// printreg(SUB_REG(current,match)); +// printf("diag : "); +// printreg(diag); +// printf("left : "); +// printreg(left); +// printf("top : "); +// printreg(top); + + + way = EQUAL_REG(SUB_REG(current,match),diag); + scurrent= OR_REG(AND_REG(way,sdiag), + ANDNOT_REG(way,maxvect)); +// printf("sdiag : "); +// printreg(scurrent); + way = EQUAL_REG(current,left); + scurrent= GET_MIN(scurrent,OR_REG(AND_REG(way,sleft), + ANDNOT_REG(way,maxvect))); + +// printf("sleft : "); +// printreg(scurrent); + way = EQUAL_REG(current,top); + scurrent= GET_MIN(scurrent,OR_REG(AND_REG(way,stop), + ANDNOT_REG(way,maxvect))); +// printf("stop : "); +// printreg(scurrent); + + scurrent= ADD_REG(scurrent,onevect); + + sminus2=sminus1; + sminus1=scurrent; + } +// printf("line %d :Best = ",line); printreg(current); +// +// printf("================================\n"); + + minus2=minus1; + minus1=current; + +// printf("min2 = "); printreg(minus2); +// printf("min1 = "); printreg(minus1); +// printf("================================\n"); + +// printf("\n"); +// printf("sdiag : "); +// printreg(sminus2); +// printf("scur : "); +// printreg(scurrent); +// printf("current: "); +// printreg(current); +// printf("%8s\n",seq1); +// printf("%8s\n",seq2); +// printf("================================\n"); + + + } ///// <<<<<<<<------- Fin du debut de la premiere bande + + +// printf("================================\n"); + + (column)->data.CMENB[lseq2-VSIZE+line]=EXTRACT_REG(current,VSIZE-1); + + + if (lpath) + (column)->score.CMENB[lseq2-VSIZE+line]=EXTRACT_REG(scurrent,VSIZE-1); + + + + for (band=0; band < nbands; band++) + { +// 
SET_H_SYMBOLE(fhseq,line,h) +// minus2 = insert_reg(minus2,0,line); +// minus1 = insert_reg(minus1,MIN_SCORE,line); // 0 avant +// h++; + + for (; line < lseq2; line++) + { +// printf("Je tourne avec line= %d \n",line); + PUSH_V_SYMBOLE(fvseq,line) + + match = EQUAL(fhseq,fvseq); + +// printreg(fvseq); +// printreg(fhseq); +// printreg(match); +// printf("================================\n"); + + current = minus1; + + left = minus1; + + // Store the last current score in extra column + (column)->data.CMENB[line-VSIZE]=EXTRACT_REG(current,VSIZE-1); + LSHIFT_SCORE(minus1) + minus1=insert_reg(minus1,(column)->data.CMENB[line],0); + + top = minus1; + +// printf("Vert = "); printreg(current); + + if (lpath) + { + sleft= sminus1; + (column)->score.CMENB[line-VSIZE]=EXTRACT_REG(scurrent,VSIZE-1); + LSHIFT_SCORE(sminus1) + sminus1=insert_reg(sminus1,(column)->score.CMENB[line],0); + stop=sminus1; + sdiag=sminus2; + } + +// printf("line = %d --> get = %d\n",line,(column)->data.CMENB[line]); + +// printf("Horz = "); printreg(minus1); + + current = GET_MAX(current,minus1); + + diag=minus2; + + current = GET_MAX(current,ADD_REG(minus2,match)); + + if (lpath) + { +// printf("\n"); +// printf("current: "); +// printreg(current); +// printf("current: "); +// printreg(SUB_REG(current,match)); +// printf("diag : "); +// printreg(diag); +// printf("left : "); +// printreg(left); +// printf("top : "); +// printreg(top); + + way = EQUAL_REG(SUB_REG(current,match),diag); + scurrent= OR_REG(AND_REG(way,sdiag), + ANDNOT_REG(way,maxvect)); + +// printf("sdiag : "); +// printreg(scurrent); + + way = EQUAL_REG(current,left); + scurrent= GET_MIN(scurrent,OR_REG(AND_REG(way,sleft), + ANDNOT_REG(way,maxvect))); + +// printf("sleft : "); +// printreg(scurrent); + + way = EQUAL_REG(current,top); + scurrent= GET_MIN(scurrent,OR_REG(AND_REG(way,stop), + ANDNOT_REG(way,maxvect))); + +// printf("stop : "); +// printreg(scurrent); + + scurrent= ADD_REG(scurrent,onevect); + + sminus2=sminus1; + 
sminus1=scurrent; + } + + minus2=minus1; + minus1=current; + +// printf("\n"); +// printf("sdiag : "); +// printreg(sminus2); +// printf("scur : "); +// printreg(scurrent); +// printf("current: "); +// printreg(current); +// printf("%8s\n",seq1); +// printf("%8s\n",seq2); + } +// printf("================================\n"); + + // end of the band and beginnig of the next one + + limit=(band==(nbands-1)) ? lastband:VSIZE; + + for (line = 0; line < limit; line++,h++) + { +// printf("Je fini avec line= %d \n",line); + + SET_H_SYMBOLE(fhseq,line,h) + PUSH_V_SYMBOLE(fvseq,line) + + + minus2 = insert_reg(minus2,MIN_SCORE,line); + minus1 = insert_reg(minus1,MIN_SCORE,line); + current = minus1; + left=minus1; + + match = EQUAL(fhseq,fvseq); + + if (lpath) + { + sminus2 = insert_reg(sminus2,lseq2-VSIZE+line,line); + sminus1 = insert_reg(sminus1,h,line); + sleft= sminus1; + } + + +// printf("\n"); +// printf("fhseq = "); printreg(fhseq); +// printf("fvseq = "); printreg(fvseq); +// printf("----------------------------------------------------------------\n"); +// printf("match = "); printreg(match); + + + (column)->data.CMENB[lseq2-VSIZE+line]=EXTRACT_REG(current,VSIZE-1); + LSHIFT_SCORE(minus1) + minus1=insert_reg(minus1,(column)->data.CMENB[line],0); + top=minus1; + + current = GET_MAX(current,minus1); + + if (lpath) + { + (column)->score.CMENB[lseq2-VSIZE+line]=EXTRACT_REG(scurrent,VSIZE-1); + LSHIFT_SCORE(sminus1) + sminus1=insert_reg(sminus1,(column)->score.CMENB[line],0); + stop=sminus1; + sdiag=sminus2; + + way = EQUAL_REG(current,minus1); + + scurrent= OR_REG(AND_REG(way,sminus1), + ANDNOT_REG(way,scurrent)); + } + + + diag=minus2; + + current = GET_MAX(current,ADD_REG(minus2,match)); + + if (lpath) + { + way = EQUAL_REG(SUB_REG(current,match),diag); + scurrent= OR_REG(AND_REG(way,sdiag), + ANDNOT_REG(way,maxvect)); + + way = EQUAL_REG(current,left); + scurrent= GET_MIN(scurrent,OR_REG(AND_REG(way,sleft), + ANDNOT_REG(way,maxvect))); + + way = EQUAL_REG(current,top); 
+ scurrent= GET_MIN(scurrent,OR_REG(AND_REG(way,stop), + ANDNOT_REG(way,maxvect))); + + scurrent= ADD_REG(scurrent,onevect); + + sminus2=sminus1; + sminus1=scurrent; + } + +// printf("currt = "); printreg(current); + + minus2=minus1; + minus1=current; + +// printf("\n"); +// printf("sdiag : "); +// printreg(sminus2); +// printf("scur : "); +// printreg(scurrent); +// printf("current: "); +// printreg(current); +// printf("%8s\n",seq1); +// printf("%8s\n",seq2); + +// printf("Je stocke line= %d la valeur %d\n",lseq2-VSIZE+line,(column)->data.CMENB[lseq2-VSIZE+line]); + } + + } + +// printf("\n"); +// printf("line = %d, h= %d, lastband = %d\n",line,h,lastband); +// printf("currt = "); printreg(current); + lcs = extract_reg(current,lastband); + + if(lpath) + *lpath= extract_reg(scurrent,lastband); +// printf("lastband = %d (%d) lcs = %d\n",lastband,lseq2,lcs); + + if (ppcolumn) + *ppcolumn=column; + else + freeColumn(column); + + return lcs; +} + +#else +int FASTLCSSCORE(const char* seq1, const char* seq2,column_pp ppcolumn,int32_t* lpath) +{ + return simpleLCS(seq1,seq2,ppcolumn,lpath); +} + +#endif /* __SSE2__ */ + diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/banded_LCS_alignment.c b/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/banded_LCS_alignment.c new file mode 100644 index 0000000..0fae829 --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/banded_LCS_alignment.c @@ -0,0 +1,211 @@ +/* + * banded_LCS_alignment.c + * + * Created on: 7 nov. 
2012 + * Author: merciece + */ + +#include +#include +#include +#include "../libutils/utilities.h" + + +typedef struct { + int score; + int l_path; +}infos; + + +int calculateScore(char nuc1, char nuc2) +{ + return(nuc1 == nuc2); +} + +infos** banded_align(char *seq1, char *seq2, int l1, int l2, int bandLengthRight, int bandLengthLeft) +{ + int i, j; + //int c; + //double id; + int start, end; + int diag_score, delete, insert, mismatch; + int l_path, l_path_i, l_path_d; + int bestScore; + int mismatch_margin; + int stop; + int diag_index; + infos **matrix; + + l1++; + l2++; + mismatch_margin = bandLengthLeft; // the biggest one + diag_index = l1-l2; // diagonal index + stop=0; + + //fprintf(stderr,"\nseq1 = %s, seq2=%s, bandLengthR = %d, bandLengthL = %d", seq1, seq2, bandLengthRight, bandLengthLeft); + + // Matrix initialization~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + matrix = (infos**) malloc(l1 * sizeof(infos*)); + for (i = 0; i < l1; i++) + matrix[i] = (infos*) malloc(l2 * sizeof(infos)); + + for (i = 0; i < l1; i++) + for (j = 0; j < l2; j++) + { + matrix[i][j].score = 0; + matrix[i][j].l_path = 0; + } + + for (i = 0; i < l1; i++) + matrix[i][0].l_path = i; + + for (j = 0; j < l2; j++) + matrix[0][j].l_path = j; + + // Matrix initialized~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + for (i = 1; i < l1; i++) + { + start = i - bandLengthLeft; + if (start < 1) + start = 1; + end = i+bandLengthRight+1; + if (end > l2) + end = l2; + + for (j = start; j < end; j++) + { + delete = matrix[i-1][j].score; + l_path_d = matrix[i-1][j].l_path + 1; + insert = matrix[i][j-1].score; + l_path_i = matrix[i][j-1].l_path + 1; + mismatch = 0; + + diag_score = calculateScore(seq1[i-1], seq2[j-1]); + bestScore = matrix[i-1][j-1].score + diag_score; + l_path = matrix[i-1][j-1].l_path + 1; + if (diag_score == 0) // mismatch + mismatch = 1; + + if ((insert > bestScore) || ((insert == bestScore) && (l_path_i < l_path))) + { + bestScore = matrix[i][j-1].score; + l_path = l_path_i; + mismatch = 0; + } 
+ + if ((delete > bestScore) || ((delete == bestScore) && (l_path_d < l_path))) + { + bestScore = delete; + l_path = l_path_d; + mismatch = 0; + } + + /*if (((i-j) - diag_index == 0) && (mismatch == 1)) + { + //fprintf(stderr, "\nR = %d, L = %d\n", bandLengthRight, bandLengthLeft); + if (bandLengthRight+bandLengthLeft == 0) + { + stop = 1; + //fprintf(stderr, "\nBREAKING LOOPS\n"); + break; + } + if (bandLengthRight != 0) + bandLengthRight = bandLengthRight - 1; + if (bandLengthLeft != 0) + bandLengthLeft = bandLengthLeft - 1; + }*/ + + (matrix[i][j]).score = bestScore; + (matrix[i][j]).l_path = l_path; + } + + //if ((bandLengthRight + bandLengthLeft == 0) && ((matrix[i][j].l_path - matrix[i][j].score) > mismatch_margin)) + if (stop==1) + break; + } + return(matrix); +} + + +void calculateBandLength(int l1, int l2, double threshold, int* bandLengthRight, int* bandLengthLeft) +{ + (*bandLengthLeft) = round(-l1 * threshold + l1); + (*bandLengthRight) = round(-l1 * threshold + l2); + +// fprintf(stderr,"\nR=%d, L=%d", (*bandLengthRight), (*bandLengthLeft)); +} + + +double calculateId(infos** matrix, int len1, int len2) +{ + double id; + int l_ali; + int l_lcs; + + l_lcs = matrix[len1][len2].score; + l_ali = matrix[len1][len2].l_path; + + if (l_lcs == 0) + id = 0.0; + else + id = (double) l_lcs / (double) l_ali; + + //fprintf(stderr, "\n%d, %d\n", l_lcs, l_ali); + return(id); +} + + +double banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, double threshold, BOOL n, int ref, BOOL lcsmode, int16_t* address) +{ + double id; + int bandLengthRight, bandLengthLeft; + int i,j; + + char* s1; + char* s2; + + s1 = (char*) malloc(l1*sizeof(char)+1); + s2 = (char*) malloc(l2*sizeof(char)+1); + + for (i=l1-1, j=0; i>=0, j +#include +#include +#include +#include "../libutils/utilities.h" +#include "../libsse/_sse.h" + + + +/*static void printreg(__m128i r) +{ + int16_t a0,a1,a2,a3,a4,a5,a6,a7; + + a0= _MM_EXTRACT_EPI16(r,0); + a1= _MM_EXTRACT_EPI16(r,1); + a2= 
_MM_EXTRACT_EPI16(r,2); + a3= _MM_EXTRACT_EPI16(r,3); + a4= _MM_EXTRACT_EPI16(r,4); + a5= _MM_EXTRACT_EPI16(r,5); + a6= _MM_EXTRACT_EPI16(r,6); + a7= _MM_EXTRACT_EPI16(r,7); + +fprintf(stderr, "a00 :-> %7d %7d %7d %7d " + " %7d %7d %7d %7d " + "\n" + , a0,a1,a2,a3,a4,a5,a6,a7 + ); +} +*/ + +static inline int extract_reg(__m128i r, int p) +{ + switch (p) { + case 0: return(_MM_EXTRACT_EPI16(r,0)); + case 1: return(_MM_EXTRACT_EPI16(r,1)); + case 2: return(_MM_EXTRACT_EPI16(r,2)); + case 3: return(_MM_EXTRACT_EPI16(r,3)); + case 4: return(_MM_EXTRACT_EPI16(r,4)); + case 5: return(_MM_EXTRACT_EPI16(r,5)); + case 6: return(_MM_EXTRACT_EPI16(r,6)); + case 7: return(_MM_EXTRACT_EPI16(r,7)); + } + return(0); +} + + +void sse_banded_align_lcs_and_ali_len(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal, int16_t* address, double* lcs_length, int* ali_length) +{ + register int j; + int k1, k2; + int max, diff; + int l_reg, l_loc; + int line; + int numberOfRegistersPerLine; + int numberOfRegistersFor3Lines; + + BOOL even_line; + BOOL odd_line; + BOOL even_BLL; + BOOL odd_BLL; + + um128* SSEregisters; + um128* p_diag; + um128* p_gap1; + um128* p_gap2; + um128* p_diag_j; + um128* p_gap1_j; + um128* p_gap2_j; + um128 current; + + um128* l_ali_SSEregisters; + um128* p_l_ali_diag; + um128* p_l_ali_gap1; + um128* p_l_ali_gap2; + um128* p_l_ali_diag_j; + um128* p_l_ali_gap1_j; + um128* p_l_ali_gap2_j; + um128 l_ali_current; + + um128 nucs1; + um128 nucs2; + um128 scores; + + um128 boolean_reg; + + // Initialisations + + odd_BLL = bandLengthLeft & 1; + even_BLL = !odd_BLL; + + max = INT16_MAX - l1; + + numberOfRegistersPerLine = bandLengthTotal / 8; + numberOfRegistersFor3Lines = 3 * numberOfRegistersPerLine; + + SSEregisters = (um128*) calloc(numberOfRegistersFor3Lines * 2, sizeof(um128)); + l_ali_SSEregisters = SSEregisters + numberOfRegistersFor3Lines; + + // preparer registres SSE + + for (j=0; ji, scores.i); + + // Computing alignment length 
+ + l_ali_current.i = p_l_ali_diag_j->i; + boolean_reg.i = _MM_CMPGT_EPI16(p_gap1_j->i, current.i); + l_ali_current.i = _MM_OR_SI128( + _MM_AND_SI128(p_l_ali_gap1_j->i, boolean_reg.i), + _MM_ANDNOT_SI128(boolean_reg.i, l_ali_current.i)); + current.i = _MM_OR_SI128( + _MM_AND_SI128(p_gap1_j->i, boolean_reg.i), + _MM_ANDNOT_SI128(boolean_reg.i, current.i)); + boolean_reg.i = _MM_AND_SI128( + _MM_CMPEQ_EPI16(p_gap1_j->i, current.i), + _MM_CMPLT_EPI16(p_l_ali_gap1_j->i, l_ali_current.i)); + l_ali_current.i = _MM_OR_SI128( + _MM_AND_SI128(p_l_ali_gap1_j->i, boolean_reg.i), + _MM_ANDNOT_SI128(boolean_reg.i, l_ali_current.i)); + current.i = _MM_OR_SI128( + _MM_AND_SI128(p_gap1_j->i, boolean_reg.i), + _MM_ANDNOT_SI128(boolean_reg.i, current.i)); + boolean_reg.i = _MM_CMPGT_EPI16(p_gap2_j->i, current.i); + l_ali_current.i = _MM_OR_SI128( + _MM_AND_SI128(p_l_ali_gap2_j->i, boolean_reg.i), + _MM_ANDNOT_SI128(boolean_reg.i, l_ali_current.i)); + current.i = _MM_OR_SI128( + _MM_AND_SI128(p_gap2_j->i, boolean_reg.i), + _MM_ANDNOT_SI128(boolean_reg.i, current.i)); + boolean_reg.i = _MM_AND_SI128( + _MM_CMPEQ_EPI16(p_gap2_j->i, current.i), + _MM_CMPLT_EPI16(p_l_ali_gap2_j->i, l_ali_current.i)); + l_ali_current.i = _MM_OR_SI128( + _MM_AND_SI128(p_l_ali_gap2_j->i, boolean_reg.i), + _MM_ANDNOT_SI128(boolean_reg.i, l_ali_current.i)); + current.i = _MM_OR_SI128( + _MM_AND_SI128(p_gap2_j->i, boolean_reg.i), + _MM_ANDNOT_SI128(boolean_reg.i, current.i)); + + +/* fprintf(stderr, "\nline = %d", line); + fprintf(stderr, "\nDiag, r %d : ", j); + printreg((*(p_diag_j)).i); + fprintf(stderr, "Gap1 : "); + printreg((*(p_gap1_j)).i); + fprintf(stderr, "Gap2 : "); + printreg((*(p_gap2_j)).i); + fprintf(stderr, "current : "); + printreg(current.i); + fprintf(stderr, "L ALI\nDiag r %d : ", j); + printreg((*(p_l_ali_diag_j)).i); + fprintf(stderr, "Gap1 : "); + printreg((*(p_l_ali_gap1_j)).i); + fprintf(stderr, "Gap2 : "); + printreg((*(p_l_ali_gap2_j)).i); + fprintf(stderr, "current : "); + 
printreg(l_ali_current.i); +*/ + + // diag = gap1 and gap1 = current + p_diag_j->i = p_gap1_j->i; + p_gap1_j->i = current.i; + + // l_ali_diag = l_ali_gap1 and l_ali_gap1 = l_ali_current+1 + p_l_ali_diag_j->i = p_l_ali_gap1_j->i; + p_l_ali_gap1_j->i = _MM_ADD_EPI16(l_ali_current.i, _MM_SET1_EPI16(1)); + } + + // shifts for gap2, to do only once all the registers of a line have been computed Copier gap2 puis le charger depuis la copie? + + for (j=0; j < numberOfRegistersPerLine; j++) + { + if ((odd_line && even_BLL) || (even_line && odd_BLL)) + { + p_gap2[j].i = _MM_LOADU_SI128((p_gap1[j].s16)-1); + p_l_ali_gap2[j].i = _MM_LOADU_SI128((p_l_ali_gap1[j].s16)-1); + if (j == 0) + { + p_gap2[j].i = _MM_INSERT_EPI16(p_gap2[j].i, 0, 0); + p_l_ali_gap2[j].i = _MM_INSERT_EPI16(p_l_ali_gap2[j].i, max, 0); + } + } + else + { + p_gap2[j].i = _MM_LOADU_SI128(p_gap1[j].s16+1); + p_l_ali_gap2[j].i = _MM_LOADU_SI128(p_l_ali_gap1[j].s16+1); + if (j == numberOfRegistersPerLine - 1) + { + p_gap2[j].i = _MM_INSERT_EPI16(p_gap2[j].i, 0, 7); + p_l_ali_gap2[j].i = _MM_INSERT_EPI16(p_l_ali_gap2[j].i, max, 7); + } + } + } + // end shifts for gap2 + + } + +/* /// Recovering LCS and alignment lengths \\\ */ + + // finding the location of the results in the registers : + diff = l1-l2; + if ((diff & 1) && odd_BLL) + l_loc = (int) floor((double)(bandLengthLeft) / (double)2) - floor((double)(diff) / (double)2); + else + l_loc = (int) floor((double)(bandLengthLeft) / (double)2) - ceil((double)(diff) / (double)2); + + l_reg = (int)floor((double)l_loc/(double)8.0); + //fprintf(stderr, "\nl_reg = %d, l_loc = %d\n", l_reg, l_loc); + l_loc = l_loc - l_reg*8; + + // extracting the results from the registers : + *lcs_length = extract_reg(p_gap1[l_reg].i, l_loc); + *ali_length = extract_reg(p_l_ali_gap1[l_reg].i, l_loc) - 1; + + // freeing the registers + free(SSEregisters); +} + + +double sse_banded_align_just_lcs(int16_t* seq1, int16_t* seq2, int l1, int l2, int bandLengthLeft, int bandLengthTotal) +{ + 
register int j; + int k1, k2; + int diff; + int l_reg, l_loc; + int16_t l_lcs; + int line; + int numberOfRegistersPerLine; + int numberOfRegistersFor3Lines; + + BOOL even_line; + BOOL odd_line; + BOOL even_BLL; + BOOL odd_BLL; + + um128* SSEregisters; + um128* p_diag; + um128* p_gap1; + um128* p_gap2; + um128* p_diag_j; + um128* p_gap1_j; + um128* p_gap2_j; + um128 current; + + um128 nucs1; + um128 nucs2; + um128 scores; + + // Initialisations + + odd_BLL = bandLengthLeft & 1; + even_BLL = !odd_BLL; + + numberOfRegistersPerLine = bandLengthTotal / 8; + numberOfRegistersFor3Lines = 3 * numberOfRegistersPerLine; + + SSEregisters = malloc(numberOfRegistersFor3Lines * sizeof(um128)); + + // preparer registres SSE + + for (j=0; j 0) + { + if (normalize) + { + if (reference == MINLEN) + LCSmin = threshold*l2; + else // ref = maxlen or alilen + LCSmin = threshold*l1; + } + else if (lcsmode) + LCSmin = threshold; + else if ((reference == MINLEN)) // not lcsmode + LCSmin = l2 - threshold; + else // not lcsmode and ref = maxlen or alilen + LCSmin = l1 - threshold; + } + else + LCSmin = 0; + + return(LCSmin); +} + + +int calculateSSEBandLength(int bandLengthRight, int bandLengthLeft) +{ +// *bandLengthTotal= (double) floor(bandLengthRight + bandLengthLeft) / 2.0 + 1; + int bandLengthTotal= (double)(bandLengthRight + bandLengthLeft) / 2.0 + 1.0; + + return (bandLengthTotal & (~ (int)7)) + (( bandLengthTotal & (int)7) ? 8:0); // Calcule le multiple de 8 superieur +} + + +int calculateSizeToAllocate(int maxLen, int minLen, int LCSmin) +{ + int size; + int notUsed; + + calculateBandLengths(maxLen, minLen, ¬Used, &size, LCSmin); // max size = max left band length * 2 + + //fprintf(stderr, "\nsize for address before %8 = %d", size); + + size*= 2; + size = (size & (~ (int)7)) + (( size & (int)7) ? 
8:0); // Calcule le multiple de 8 superieur + size*= 3; + size+= 16; + + //fprintf(stderr, "\nsize for address = %d", size); + + return(size*sizeof(int16_t)); +} + + +void iniSeq(int16_t* seq, int size, int16_t iniValue) +{ + int16_t *target=seq; + int16_t *end = target + (size_t)size; + + for (; target < end; target++) + *target = iniValue; +} + + +void putSeqInSeq(int16_t* seq, char* s, int l, BOOL reverse) +{ + int16_t *target=seq; + int16_t *end = target + (size_t)l; + char *source=s; + + if (reverse) + for (source=s + (size_t)l-1; target < end; target++, source--) + *target=*source; + else + for (; target < end; source++,target++) + *target=*source; +} + + +void initializeAddressWithGaps(int16_t* address, int bandLengthTotal, int bandLengthLeft, int l1) +{ + int i; + int address_00, x_address_10, address_01, address_01_shifted; + int numberOfRegistersPerLine; + int bm; + int value=INT16_MAX-l1; + + numberOfRegistersPerLine = bandLengthTotal / 8; + bm = bandLengthLeft%2; + + for (i=0; i < (3*numberOfRegistersPerLine*8); i++) + address[i] = value; + + // 0,0 set to 1 and 0,1 and 1,0 set to 2 + + address_00 = bandLengthLeft / 2; + + x_address_10 = address_00 + bm - 1; + address_01 = numberOfRegistersPerLine*8 + x_address_10; + + address_01_shifted = numberOfRegistersPerLine*16 + address_00 - bm; + + // fill address_00, address_01,+1, address_01_shifted,+1 + + address[address_00] = 1; + address[address_01] = 2; + address[address_01+1] = 2; + address[address_01_shifted] = 2; + address[address_01_shifted+1] = 2; +} + + +double sse_banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, BOOL normalize, int reference, BOOL lcsmode, int16_t* address, int LCSmin) +{ + double id; + int bandLengthRight, bandLengthLeft, bandLengthTotal; + int ali_length; + + //fprintf(stderr, "\nl1 = %d, l2 = %d\n", l1, l2); + + calculateBandLengths(l1, l2, &bandLengthRight, &bandLengthLeft, LCSmin); + + //fprintf(stderr, "\nBLL = %d, BLR = %d, LCSmin = %d\n", bandLengthLeft, 
bandLengthRight, LCSmin); + + bandLengthTotal = calculateSSEBandLength(bandLengthRight, bandLengthLeft); + + //fprintf(stderr, "\nBLT = %d\n", bandLengthTotal); + + if ((reference == ALILEN) && (normalize || !lcsmode)) + { + initializeAddressWithGaps(address, bandLengthTotal, bandLengthLeft, l1); + sse_banded_align_lcs_and_ali_len(seq1, seq2, l1, l2, bandLengthLeft, bandLengthTotal, address, &id, &ali_length); + } + else + id = sse_banded_align_just_lcs(seq1, seq2, l1, l2, bandLengthLeft, bandLengthTotal); + + //fprintf(stderr, "\nid before normalizations = %f", id); + + //fprintf(stderr, "\nlcs = %f, ali = %d\n", id, ali_length); + + if (!lcsmode && !normalize) + switch(reference) { + case ALILEN: id = ali_length - id; + break; + case MAXLEN: id = l1 - id; + break; + case MINLEN: id = l2 - id; + } + + //fprintf(stderr, "\n2>>> %f, %d\n", id, ali_length); + if (normalize) + switch(reference) { + case ALILEN: id = id / (double) ali_length; + break; + case MAXLEN: id = id / (double) l1; + break; + case MINLEN: id = id / (double) l2; + } + + //fprintf(stderr, "\nid = %f\n", id); + return(id); +} + + +double generic_sse_banded_lcs_align(char* seq1, char* seq2, double threshold, BOOL normalize, int reference, BOOL lcsmode, int16_t** address, int* buffer_size, int16_t** iseq1, + int16_t** iseq2, int* buffer_sizeS) +{ + double id; + int l1; + int l2; + int lmax, lmin; + int sizeToAllocateForBand; + int maxBLL, notUsed; + int sizeToAllocateForSeqs; + int LCSmin; + + l1 = strlen(seq1); + l2 = strlen(seq2); + + if (l2 > l1) + { + lmax = l1; + lmin = l2; + } + else + { + lmax = l2; + lmin = l1; + } + + if (!lcsmode && (normalize==TRUE)) + { + threshold = 1.0 - threshold; + } + + LCSmin = calculateLCSmin(lmax, lmin, threshold, normalize, reference, lcsmode); + +// Allocating space for matrix band if the alignment must be computed + + if ((reference == ALILEN) && ((lcsmode && normalize) || (!lcsmode))) // checking if alignment must be computed + { + sizeToAllocateForBand = 
calculateSizeToAllocate(lmax, lmin, LCSmin); + + if (sizeToAllocateForBand > (*buffer_size)) + { + // reallocating if needed + address = reallocA16Address(*address, sizeToAllocateForBand); + } + } + +// Allocating space for the int16_t arrays representing the sequences + + calculateBandLengths(lmax, lmin, ¬Used, &maxBLL, LCSmin); + + sizeToAllocateForSeqs = 2*maxBLL+lmax; + + if (sizeToAllocateForSeqs > *buffer_sizeS) + { + (*(iseq1)) = realloc((*(iseq1)), sizeToAllocateForSeqs*sizeof(int16_t)); + (*(iseq2)) = realloc((*(iseq2)), sizeToAllocateForSeqs*sizeof(int16_t)); + } + + iniSeq(*(iseq1), maxBLL, 0); + iniSeq(*(iseq2), maxBLL, 255); + *(iseq1) = *(iseq1)+maxBLL; + *(iseq2) = *(iseq2)+maxBLL; + + // longest seq must be first argument of sse_align function + if (l2 > l1) + { + putSeqInSeq((*(iseq1)), seq2, l2, TRUE); + putSeqInSeq((*(iseq2)), seq1, l1, FALSE); + id = sse_banded_lcs_align(*(iseq1), *(iseq2), l2, l1, normalize, reference, lcsmode, *address, LCSmin); + } + else + { + putSeqInSeq((*(iseq1)), seq1, l1, TRUE); + putSeqInSeq((*(iseq2)), seq2, l2, FALSE); + id = sse_banded_lcs_align(*(iseq1), *(iseq2), l1, l2, normalize, reference, lcsmode, *address, LCSmin); + } + + return(id); +} + + +int prepareTablesForSumathings(int lmax, int lmin, double threshold, BOOL normalize, int reference, BOOL lcsmode, + int16_t** address, int16_t** iseq1, int16_t** iseq2) +{ + int sizeToAllocateForBand; + int maxBLL; + int notUsed; + int sizeToAllocateForSeqs; + int LCSmin; + + LCSmin = calculateLCSmin(lmax, lmin, threshold, normalize, reference, lcsmode); + + // Allocating space for matrix band if the alignment must be computed + + if ((reference == ALILEN) && (normalize || !lcsmode)) // checking if alignment must be computed + { + sizeToAllocateForBand = calculateSizeToAllocate(lmax, lmin, LCSmin); + (*(address)) = getA16Address(sizeToAllocateForBand); + } + + // Allocating space for the int16_t arrays representing the sequences + + calculateBandLengths(lmax, lmin, 
¬Used, &maxBLL, LCSmin); + + sizeToAllocateForSeqs = 2*maxBLL+lmax; + (*(iseq1)) = malloc(sizeToAllocateForSeqs*sizeof(int16_t)); + (*(iseq2)) = malloc(sizeToAllocateForSeqs*sizeof(int16_t)); + + iniSeq(*(iseq1), maxBLL, 0); + iniSeq(*(iseq2), maxBLL, 255); + *(iseq1) = *(iseq1)+maxBLL; + *(iseq2) = *(iseq2)+maxBLL; + + return(maxBLL+lmax); +} + + +double alignForSumathings(char* seq1, int16_t* iseq1, char* seq2, int16_t* iseq2, int l1, int l2, + BOOL normalize, int reference, BOOL lcsmode, int16_t* address, int sizeForSeqs, int LCSmin) +{ + double id; + + iniSeq(iseq1, sizeForSeqs, 0); + iniSeq(iseq2, sizeForSeqs, 255); + + if (l2 > l1) + { + putSeqInSeq(iseq1, seq2, l2, TRUE); + putSeqInSeq(iseq2, seq1, l1, FALSE); + id = sse_banded_lcs_align(iseq1, iseq2, l2, l1, normalize, reference, lcsmode, address, LCSmin); + } + else + { + putSeqInSeq(iseq1, seq1, l1, TRUE); + putSeqInSeq(iseq2, seq2, l2, FALSE); + id = sse_banded_lcs_align(iseq1, iseq2, l1, l2, normalize, reference, lcsmode, address, LCSmin); + } + + return(id); +} + diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/sse_banded_LCS_alignment.h b/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/sse_banded_LCS_alignment.h new file mode 100644 index 0000000..95f50b0 --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/sse_banded_LCS_alignment.h @@ -0,0 +1,24 @@ +/* + * sse_banded_LCS_alignment.h + * + * Created on: november 29, 2012 + * Author: mercier + */ + +#ifndef SSE_BANDED_LCS_ALIGNMENT_H_ +#define SSE_BANDED_LCS_ALIGNMENT_H_ +#include + +double sse_banded_lcs_align(int16_t* seq1, int16_t* seq2, int l1, int l2, BOOL normalize, int reference, BOOL lcsmode, int16_t* address, int LCSmin); +int calculateSizeToAllocate(int maxLen, int minLen, int LCSmin); +void calculateThresholdFromErrorNumber(int error, int length, double* threshold); +void iniSeq(int16_t* seq, int size, int16_t iniValue); +void putSeqInSeq(int16_t* seq, char* s, int l, BOOL reverse); +double generic_sse_banded_lcs_align(char* 
seq1, char* seq2, double threshold, BOOL normalize, int reference, BOOL lcsmode, int16_t** address, int* buffer_size, int16_t** iseq1, + int16_t** iseq2, int* buffer_sizeS); +int prepareTablesForSumathings(int lmax, int lmin, double threshold, BOOL normalize, int reference, BOOL lcsmode, + int16_t** address, int16_t** iseq1, int16_t** iseq2); +double alignForSumathings(char* seq1, int16_t* iseq1, char* seq2, int16_t* iseq2, int l1, int l2, BOOL normalize, + int reference, BOOL lcsmode, int16_t* address, int sizeForSeqs, int LCSmin); +int calculateLCSmin(int l1, int l2, double threshold, BOOL normalize, int reference, BOOL lcsmode); +#endif diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/upperband.c b/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/upperband.c new file mode 100644 index 0000000..4948bd0 --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/upperband.c @@ -0,0 +1,382 @@ +#include "../libsse/_sse.h" +#include +#include +#include "../libutils/utilities.h" +#include "../libfasta/sequence.h" +#include "sse_banded_LCS_alignment.h" + + +inline static uchar_v hash4m128(uchar_v frag) +{ + uchar_v words; + + vUInt8 mask_03= _MM_SET1_EPI8(0x03); // charge le registre avec 16x le meme octet + vUInt8 mask_FC= _MM_SET1_EPI8(0xFC); + + frag.m = _MM_SRLI_EPI64(frag.m,1); // shift logic a droite sur 2 x 64 bits + frag.m = _MM_AND_SI128(frag.m,mask_03); // and sur les 128 bits + + + words.m= _MM_SLLI_EPI64(frag.m,2); + words.m= _MM_AND_SI128(words.m,mask_FC); + frag.m = _MM_SRLI_SI128(frag.m,1); + words.m= _MM_OR_SI128(words.m,frag.m); + + words.m= _MM_SLLI_EPI64(words.m,2); + words.m= _MM_AND_SI128(words.m,mask_FC); + frag.m = _MM_SRLI_SI128(frag.m,1); + words.m= _MM_OR_SI128(words.m,frag.m); + + words.m= _MM_SLLI_EPI64(words.m,2); + words.m= _MM_AND_SI128(words.m,mask_FC); + frag.m = _MM_SRLI_SI128(frag.m,1); + words.m= _MM_OR_SI128(words.m,frag.m); + + return words; +} + +#ifdef __SSE2__ + +inline static int anyzerom128(vUInt8 data) +{ + vUInt8 
mask_00= _MM_SETZERO_SI128(); + uint64_v tmp; + tmp.m = _MM_CMPEQ_EPI8(data,mask_00); + return (int)(tmp.c[0]!=0 || tmp.c[1]!=0); +} + +#else + +inline static int anyzerom128(vUInt8 data) +{ + int i; + um128 tmp; + tmp.i = data; + for (i=0;i<8;i++) + if (tmp.s8[i]==0) + return 1; + return 0; +} + +#endif + +inline static void dumpm128(unsigned short *table,vUInt8 data) +{ + memcpy(table,&data,16); +} + +/** + * Compute 4mer occurrence table from a DNA sequence + * + * sequence : a pointer to the null terminated nuc sequence + * table : a pointer to a 256 cells unisgned char table for + * storing the occurrence table + * count : pointer to an int value used as a return value + * containing the global word counted + * + * returns the number of words observed in the sequence with a + * count greater than 255. + */ + +int buildTable(const char* sequence, unsigned char *table, int *count) +{ + int overflow = 0; + int wc=0; + int i; + vUInt8 mask_00= _MM_SETZERO_SI128(); + + uchar_v frag; + uchar_v words; + uchar_v zero; + + char* s; + + s=(char*)sequence; + + memset(table,0,256*sizeof(unsigned char)); + + // encode ascii sequence with A : 00 C : 01 T: 10 G : 11 + + for(frag.m=_MM_LOADU_SI128((vUInt8*)s); + ! 
anyzerom128(frag.m); + s+=12,frag.m=_MM_LOADU_SI128((vUInt8*)s)) + { + words= hash4m128(frag); + + // printf("%d %d %d %d\n",words.c[0],words.c[1],words.c[2],words.c[3]); + + if (table[words.c[0]]<255) table[words.c[0]]++; else overflow++; + if (table[words.c[1]]<255) table[words.c[1]]++; else overflow++; + if (table[words.c[2]]<255) table[words.c[2]]++; else overflow++; + if (table[words.c[3]]<255) table[words.c[3]]++; else overflow++; + if (table[words.c[4]]<255) table[words.c[4]]++; else overflow++; + if (table[words.c[5]]<255) table[words.c[5]]++; else overflow++; + if (table[words.c[6]]<255) table[words.c[6]]++; else overflow++; + if (table[words.c[7]]<255) table[words.c[7]]++; else overflow++; + if (table[words.c[8]]<255) table[words.c[8]]++; else overflow++; + if (table[words.c[9]]<255) table[words.c[9]]++; else overflow++; + if (table[words.c[10]]<255) table[words.c[10]]++; else overflow++; + if (table[words.c[11]]<255) table[words.c[11]]++; else overflow++; + + wc+=12; + } + + zero.m=_MM_CMPEQ_EPI8(frag.m,mask_00); + //printf("frag=%d %d %d %d\n",frag.c[0],frag.c[1],frag.c[2],frag.c[3]); + //printf("zero=%d %d %d %d\n",zero.c[0],zero.c[1],zero.c[2],zero.c[3]); + words = hash4m128(frag); + + if (zero.c[0]+zero.c[1]+zero.c[2]+zero.c[3]==0) + for(i=0;zero.c[i+3]==0;i++,wc++) + if (table[words.c[i]]<255) table[words.c[i]]++; else overflow++; + + if (count) *count=wc; + return overflow; +} + +static inline vUInt16 partialminsum(vUInt8 ft1,vUInt8 ft2) +{ + vUInt8 mini; + vUInt16 minilo; + vUInt16 minihi; + vUInt8 mask_00= _MM_SETZERO_SI128(); + + mini = _MM_MIN_EPU8(ft1,ft2); + minilo = _MM_UNPACKLO_EPI8(mini,mask_00); + minihi = _MM_UNPACKHI_EPI8(mini,mask_00); + + return _MM_ADDS_EPU16(minilo,minihi); +} + +int compareTable(unsigned char *t1, int over1, unsigned char* t2, int over2) +{ + vUInt8 ft1; + vUInt8 ft2; + vUInt8 *table1=(vUInt8*)t1; + vUInt8 *table2=(vUInt8*)t2; + ushort_v summini; + int i; + int total; + + ft1 = _MM_LOADU_SI128(table1); + ft2 = 
_MM_LOADU_SI128(table2); + summini.m = partialminsum(ft1,ft2); + table1++; + table2++; + + + for (i=1;i<16;i++,table1++,table2++) + { + ft1 = _MM_LOADU_SI128(table1); + ft2 = _MM_LOADU_SI128(table2); + summini.m = _MM_ADDS_EPU16(summini.m,partialminsum(ft1,ft2)); + + } + + // Finishing the sum process + + summini.m = _MM_ADDS_EPU16(summini.m,_MM_SRLI_SI128(summini.m,8)); // sum the 4 firsts with the 4 lasts + summini.m = _MM_ADDS_EPU16(summini.m,_MM_SRLI_SI128(summini.m,4)); + + total = summini.c[0]+summini.c[1]; + total+= (over1 < over2) ? over1:over2; + + return total; +} + +int threshold4(int wordcount,double identity) +{ + int error; + int lmax; + + wordcount+=3; + error = (int)floor((double)wordcount * ((double)1.0-identity)); + lmax = (wordcount - error) / (error + 1); + if (lmax < 4) + return 0; + return (lmax - 3) \ + * (error + 1) \ + + ((wordcount - error) % (error + 1)); +} + + +int thresholdLCS4(int32_t reflen,int32_t lcs) +{ + int nbfrag; + int smin; + int R; + int common; + + nbfrag = (reflen - lcs)*2 + 1; + smin = lcs/nbfrag; + R = lcs - smin * nbfrag; + common = MAX(smin - 2,0) * R + MAX(smin - 3,0) * (nbfrag - R); + return common; +} + + +int hashDB(fastaSeqCount db) +{ + int32_t i; + int32_t count; + + fprintf(stderr,"Indexing dataset..."); + + for (i=0; i < db.count;i++) + { + db.fastaSeqs[i].table = util_malloc((256)*sizeof(unsigned char), __FILE__, __LINE__); + db.fastaSeqs[i].over = buildTable((const char*)(db.fastaSeqs[i].sequence), + db.fastaSeqs[i].table, + &count); + } + + fprintf(stderr," : Done\n"); + + return db.count; +} + + +BOOL isPossible(fastaSeqPtr seq1, fastaSeqPtr seq2, double threshold, BOOL normalize, int reference, BOOL lcsmode) +{ + int32_t reflen; + int32_t maxlen; + int32_t lcs; + int32_t mincount; + + if (seq1->length < 12 || seq2->length < 12) + return TRUE; + + maxlen = MAX(seq1->length,seq2->length); + + if (reference==ALILEN || reference==MAXLEN) + reflen = maxlen; + else + reflen = MIN(seq1->length,seq2->length); + + 
if (normalize) + { + if (! lcsmode) + threshold = 1. - threshold; + + lcs = (int32_t)ceil((double)reflen * threshold); + } + else + { + if (! lcsmode) + threshold = reflen - threshold; + lcs = (int32_t) threshold; + } + + if (lcs > MIN(seq1->length,seq2->length)) + return FALSE; + + mincount = thresholdLCS4(maxlen,lcs); + + return compareTable(seq1->table,seq1->over,seq2->table,seq2->over) >=mincount; +} + + +BOOL isPossibleSumathings(fastaSeqPtr seq1, fastaSeqPtr seq2, int l1, int l2, double threshold, BOOL normalize, int reference, BOOL lcsmode) +{ // optimized version of the filter for sumaclust and sumatra + + int32_t reflen; + int32_t lcs; + int32_t mincount; + + if (l1 < 12 || l2 < 12) + return TRUE; + + if (reference==ALILEN || reference==MAXLEN) + reflen = l1; + else + reflen = l2; + + if (normalize) + lcs = (int32_t)ceil((double)reflen * threshold); + else + { + if (! lcsmode) + threshold = reflen - threshold; + lcs = (int32_t) threshold; + } + + mincount = thresholdLCS4(l1,lcs); + + return compareTable(seq1->table,seq1->over,seq2->table,seq2->over) >=mincount; +} + + +void filters(fastaSeqPtr seq1, fastaSeqPtr seq2, double threshold, BOOL normalize, int reference, BOOL lcsmode, double* score, int* LCSmin) +{ // score takes value -1 if filters are passed. score must be initialized in calling function. 
+ int l1; + int l2; + + l1 = seq1->length; + l2 = seq2->length; + + if (l1 >= l2) + { + *LCSmin = calculateLCSmin(l1, l2, threshold, normalize, reference, lcsmode); + if (l2 >= *LCSmin) + { + if (isPossibleSumathings(seq1, seq2, l1, l2, threshold, normalize, reference, lcsmode)) // 4-mers filter + *score = -1.0; + } + } + else + { + *LCSmin = calculateLCSmin(l2, l1, threshold, normalize, reference, lcsmode); + if (l1 >= *LCSmin) + { + if (isPossibleSumathings(seq2, seq1, l2, l1, threshold, normalize, reference, lcsmode)) // 4-mers filter + *score = -1.0; + } + } +} + + +void filtersSumatra(fastaSeqPtr seq1, fastaSeqPtr seq2, double threshold, BOOL normalize, int reference, BOOL lcsmode, double* score, int* LCSmin) +{ // score takes value -2 if filters are not passed, -1 if filters are passed and >= 0 with max score if the 2 sequences are identical. + + int l1; + int l2; + l1 = seq1->length; + + *score = -2.0; + + if (strcmp(seq1->sequence, seq2->sequence) == 0) // the 2 sequences are identical + { + if (lcsmode && normalize) + *score = 1.0; + else if (!lcsmode) + *score = 0.0; + else + *score = l1; + } + + else if (threshold != 0) + { + l2 = seq2->length; + + if (l1 >= l2) + { + *LCSmin = calculateLCSmin(l1, l2, threshold, normalize, reference, lcsmode); + if (l2 >= *LCSmin) + { + if (isPossibleSumathings(seq1, seq2, l1, l2, threshold, normalize, reference, lcsmode)) // 4-mers filter + *score = -1.0; + } + } + else + { + *LCSmin = calculateLCSmin(l2, l1, threshold, normalize, reference, lcsmode); + if (l1 >= *LCSmin) + { + if (isPossibleSumathings(seq2, seq1, l2, l1, threshold, normalize, reference, lcsmode)) // 4-mers filter + *score = -1.0; + } + } + } + else + *LCSmin = 0; +} diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/upperband.h b/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/upperband.h new file mode 100644 index 0000000..cded693 --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/liblcs/upperband.h @@ -0,0 +1,18 @@ + +#ifndef UPPERBAND_H_ 
+#define UPPERBAND_H_ + + +int buildTable(const char *sequence, unsigned char *table, int *count); +int compareTable(unsigned char *t1, int over1, unsigned char* t2, int over2); +int threshold4(int wordcount,double identity); +int thresholdLCS4(int32_t reflen,int32_t lcs); + + +int hashDB(fastaSeqCount); +BOOL isPossible(fastaSeqPtr, fastaSeqPtr, BOOL, int, double, BOOL); +BOOL isPossibleSumathings(fastaSeqPtr seq1, fastaSeqPtr seq2, int l1, int l2, double threshold, BOOL normalize, int reference, BOOL lcsmode); +void filters(fastaSeqPtr seq1, fastaSeqPtr seq2, double threshold, BOOL normalize, int reference, BOOL lcsmode, double* score, int* LCSmin); +void filtersSumatra(fastaSeqPtr seq1, fastaSeqPtr seq2, double threshold, BOOL normalize, int reference, BOOL lcsmode, double* score, int* LCSmin); +#endif + diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/libsse/_sse.h b/src/sumatra/sumatra-1.0.10/sumalibs/libsse/_sse.h new file mode 100644 index 0000000..8754721 --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/libsse/_sse.h @@ -0,0 +1,961 @@ +#ifndef _SSE_H_ +#define _SSE_H_ + +#include + +#include +#ifdef __SSE2__ +#include +#else +typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__)); +#endif /* __SSE2__ */ + +#ifndef MAX +#define MAX(x,y) (((x)>(y)) ? (x):(y)) +#define MIN(x,y) (((x)<(y)) ? 
(x):(y)) +#endif + +#define ALIGN __attribute__((aligned(16))) +typedef __m128i vUInt8; +typedef __m128i vInt8; + +typedef __m128i vUInt16; +typedef __m128i vInt16; + +typedef __m128i vUInt64; + +typedef union +{ + __m128i i; + int64_t s64[ 2]; + int16_t s16[ 8]; + int8_t s8 [16]; + uint8_t u8 [16]; + uint16_t u16[8 ]; + uint32_t u32[4 ]; + uint64_t u64[2 ]; +} um128; + +typedef union + { + vUInt8 m; + uint8_t c[16]; + } uchar_v; + +typedef union + { + vUInt16 m; + uint16_t c[8]; + } ushort_v; + +typedef union + { + vUInt64 m; + uint64_t c[2]; + } uint64_v; + + +#ifdef __SSE2__ + +static inline int8_t _s2_extract_epi8(__m128i r, const int p) +{ +#define ACTIONP(r,x) return _mm_extract_epi16(r,x) & 0xFF +#define ACTIONI(r,x) return _mm_extract_epi16(r,x) >> 8 + switch (p) { + case 0: ACTIONP(r,0); + case 1: ACTIONI(r,0); + case 2: ACTIONP(r,1); + case 3: ACTIONI(r,1); + case 4: ACTIONP(r,2); + case 5: ACTIONI(r,2); + case 6: ACTIONP(r,3); + case 7: ACTIONI(r,3); + case 8: ACTIONP(r,4); + case 9: ACTIONI(r,4); + case 10: ACTIONP(r,5); + case 11: ACTIONI(r,5); + case 12: ACTIONP(r,6); + case 13: ACTIONI(r,6); + case 14: ACTIONP(r,7); + case 15: ACTIONI(r,7); + } +#undef ACTIONP +#undef ACTIONI + + return 0; +} + +static inline __m128i _s2_max_epi8(__m128i a, __m128i b) +{ + __m128i mask = _mm_cmpgt_epi8( a, b ); + a = _mm_and_si128 (a,mask ); + b = _mm_andnot_si128(mask,b); + return _mm_or_si128(a,b); +} + +static inline __m128i _s2_min_epi8(__m128i a, __m128i b) +{ + __m128i mask = _mm_cmplt_epi8( a, b ); + a = _mm_and_si128 (a,mask ); + b = _mm_andnot_si128(mask,b); + return _mm_or_si128(a,b); +} + +static inline __m128i _s2_insert_epi8(__m128i r, int b, const int p) +{ +#define ACTIONP(r,x) return _mm_insert_epi16(r,(_mm_extract_epi16(r,x) & 0xFF00) | (b & 0x00FF),x) +#define ACTIONI(r,x) return _mm_insert_epi16(r,(_mm_extract_epi16(r,x) & 0x00FF) | ((b << 8)& 0xFF00),x) + switch (p) { + case 0: ACTIONP(r,0); + case 1: ACTIONI(r,0); + case 2: ACTIONP(r,1); + case 
3: ACTIONI(r,1); + case 4: ACTIONP(r,2); + case 5: ACTIONI(r,2); + case 6: ACTIONP(r,3); + case 7: ACTIONI(r,3); + case 8: ACTIONP(r,4); + case 9: ACTIONI(r,4); + case 10: ACTIONP(r,5); + case 11: ACTIONI(r,5); + case 12: ACTIONP(r,6); + case 13: ACTIONI(r,6); + case 14: ACTIONP(r,7); + case 15: ACTIONI(r,7); + } +#undef ACTIONP +#undef ACTIONI + + return _mm_setzero_si128(); +} + +// Fill a SSE Register with 16 time the same 8bits integer value +#define _MM_SET1_EPI8(x) _mm_set1_epi8(x) +#define _MM_INSERT_EPI8(r,x,i) _s2_insert_epi8((r),(x),(i)) +#define _MM_CMPEQ_EPI8(x,y) _mm_cmpeq_epi8((x),(y)) +#define _MM_CMPGT_EPI8(x,y) _mm_cmpgt_epi8((x),(y)) +#define _MM_CMPLT_EPI8(x,y) _mm_cmplt_epi8((x),(y)) +#define _MM_MAX_EPI8(x,y) _s2_max_epi8((x),(y)) +#define _MM_MIN_EPI8(x,y) _s2_min_epi8((x),(y)) +#define _MM_ADD_EPI8(x,y) _mm_add_epi8((x),(y)) +#define _MM_SUB_EPI8(x,y) _mm_sub_epi8((x),(y)) +#define _MM_EXTRACT_EPI8(r,p) _s2_extract_epi8((r),(p)) + +#define _MM_MIN_EPU8(x,y) _mm_min_epu8((x),(y)) + +// Fill a SSE Register with 8 time the same 16bits integer value +#define _MM_SET1_EPI16(x) _mm_set1_epi16(x) + +#define _MM_INSERT_EPI16(r,x,i) _mm_insert_epi16((r),(x),(i)) +#define _MM_CMPEQ_EPI16(x,y) _mm_cmpeq_epi16((x),(y)) +#define _MM_CMPGT_EPI16(x,y) _mm_cmpgt_epi16((x),(y)) +#define _MM_CMPGT_EPU16(x,y) _mm_cmpgt_epu16((x),(y)) // n'existe pas ?? 
+#define _MM_CMPLT_EPI16(x,y) _mm_cmplt_epi16((x),(y)) +#define _MM_MAX_EPI16(x,y) _mm_max_epi16((x),(y)) +#define _MM_MIN_EPI16(x,y) _mm_min_epi16((x),(y)) +#define _MM_ADD_EPI16(x,y) _mm_add_epi16((x),(y)) +#define _MM_SUB_EPI16(x,y) _mm_sub_epi16((x),(y)) +#define _MM_EXTRACT_EPI16(r,p) _mm_extract_epi16((r),(p)) +#define _MM_UNPACKLO_EPI8(a,b) _mm_unpacklo_epi8((a),(b)) +#define _MM_UNPACKHI_EPI8(a,b) _mm_unpackhi_epi8((a),(b)) +#define _MM_ADDS_EPU16(x,y) _mm_adds_epu16((x),(y)) + +// Multiplication +#define _MM_MULLO_EPI16(x,y) _mm_mullo_epi16((x), (y)) + +#define _MM_SRLI_EPI64(r,x) _mm_srli_epi64((r),(x)) +#define _MM_SLLI_EPI64(r,x) _mm_slli_epi64((r),(x)) + +// Set a SSE Register to 0 +#define _MM_SETZERO_SI128() _mm_setzero_si128() + +#define _MM_AND_SI128(x,y) _mm_and_si128((x),(y)) +#define _MM_ANDNOT_SI128(x,y) _mm_andnot_si128((x),(y)) +#define _MM_OR_SI128(x,y) _mm_or_si128((x),(y)) +#define _MM_XOR_SI128(x,y) _mm_xor_si128((x),(y)) +#define _MM_SLLI_SI128(r,s) _mm_slli_si128((r),(s)) +#define _MM_SRLI_SI128(r,s) _mm_srli_si128((r),(s)) + +// Load a SSE register from an unaligned address +#define _MM_LOADU_SI128(x) _mm_loadu_si128(x) + +// Load a SSE register from an aligned address (/!\ not defined when SSE not available) +#define _MM_LOAD_SI128(x) _mm_load_si128(x) + +// #define _MM_UNPACKLO_EPI8(x,y) _mm_unpacklo_epi8((x),(y)) + +#else /* __SSE2__ Not defined */ + +static inline __m128i _em_set1_epi8(int x) +{ + um128 a; + + x&=0xFF; + a.s8[0]=x; + a.s8[1]=x; + a.u16[1]=a.u16[0]; + a.u32[1]=a.u32[0]; + a.u64[1]=a.u64[0]; + + return a.i; +} + +static inline __m128i _em_insert_epi8(__m128i r, int x, const int i) +{ + um128 a; + a.i=r; + a.s8[i]=x & 0xFF; + return a.i; +} + +static inline __m128i _em_cmpeq_epi8(__m128i a, __m128i b) +{ + um128 x; + um128 y; + um128 r; + + x.i=a; + y.i=b; + +#define R(z) r.s8[z]=(x.s8[z]==y.s8[z]) ? 
0xFF:0 + R(0); + R(1); + R(2); + R(3); + R(4); + R(5); + R(6); + R(7); + R(8); + R(9); + R(10); + R(11); + R(12); + R(13); + R(14); + R(15); +#undef R + + return r.i; +} + +static inline __m128i _em_cmpgt_epi8(__m128i a, __m128i b) +{ + um128 x; + um128 y; + um128 r; + + x.i=a; + y.i=b; + +#define R(z) r.s8[z]=(x.s8[z]>y.s8[z]) ? 0xFF:0 + R(0); + R(1); + R(2); + R(3); + R(4); + R(5); + R(6); + R(7); + R(8); + R(9); + R(10); + R(11); + R(12); + R(13); + R(14); + R(15); +#undef R + + return r.i; +} + +static inline __m128i _em_cmplt_epi8(__m128i a, __m128i b) +{ + um128 x; + um128 y; + um128 r; + + x.i=a; + y.i=b; + +#define R(z) r.s8[z]=(x.s8[z]y.s16[z]) ? 0xFFFF:0 + R(0); + R(1); + R(2); + R(3); + R(4); + R(5); + R(6); + R(7); +#undef R + + return r.i; +} + +static inline __m128i _em_cmplt_epi16(__m128i a, __m128i b) +{ + um128 x; + um128 y; + um128 r; + + x.i=a; + y.i=b; + +#define R(z) r.s16[z]=(x.s16[z]>=b; + x.s64[1]>>=b; + + return x.i; +} + +static inline __m128i _em_slli_epi64(__m128i a, int b) +{ + um128 x; + + x.i=a; + + x.s64[0]<<=b; + x.s64[1]<<=b; + + return x.i; +} + +static inline __m128i _em_setzero_si128() +{ + um128 x; + + x.s64[0]=x.s64[1]=0; + + return x.i; +} + +static inline __m128i _em_and_si128(__m128i a, __m128i b) +{ + um128 x; + um128 y; + um128 r; + + x.i=a; + y.i=b; + + +#define R(z) r.u64[z]=x.u64[z] & y.u64[z] + R(0); + R(1); +#undef R + + return r.i; +} + +static inline __m128i _em_andnot_si128(__m128i a, __m128i b) +{ + um128 x; + um128 y; + um128 r; + + x.i=a; + y.i=b; + + +#define R(z) r.u64[z]=(~x.u64[z]) & y.u64[z] + R(0); + R(1); +#undef R + + return r.i; +} + +static inline __m128i _em_or_si128(__m128i a, __m128i b) +{ + um128 x; + um128 y; + um128 r; + + x.i=a; + y.i=b; + +#define R(z) r.u64[z]=x.u64[z] | y.u64[z] + R(0); + R(1); +#undef R + + return r.i; +} + +static inline __m128i _em_xor_si128(__m128i a, __m128i b) +{ + um128 x; + um128 y; + um128 r; + + x.i=a; + y.i=b; + +#define R(z) r.u64[z]=x.u64[z] ^ y.u64[z] + R(0); + 
R(1); +#undef R + + return r.i; +} + +static inline __m128i _em_slli_si128(__m128i a, int b) +{ + um128 x; + + x.i=a; + +#define R(z) x.u8[z]=(z>=b) ? x.u8[z-b]:0 + R(15); + R(14); + R(13); + R(12); + R(11); + R(10); + R(9); + R(8); + R(7); + R(6); + R(5); + R(4); + R(3); + R(2); + R(1); + R(0); +#undef R + + return x.i; +} + +static inline __m128i _em_srli_si128(__m128i a, int b) +{ + um128 x; + + x.i=a; + +#define R(z) x.u8[z]=((b+z) > 15) ? 0:x.u8[z+b] + R(0); + R(1); + R(2); + R(3); + R(4); + R(5); + R(6); + R(7); + R(8); + R(9); + R(10); + R(11); + R(12); + R(13); + R(14); + R(15); +#undef R + + return x.i; +} + +inline static __m128i _em_loadu_si128(__m128i const *P) +{ + um128 tmp; + um128 *pp=(um128*)P; + + tmp.u8[0]=(*pp).u8[0]; + tmp.u8[1]=(*pp).u8[1]; + tmp.u8[2]=(*pp).u8[2]; + tmp.u8[3]=(*pp).u8[3]; + tmp.u8[4]=(*pp).u8[4]; + tmp.u8[5]=(*pp).u8[5]; + tmp.u8[6]=(*pp).u8[6]; + tmp.u8[7]=(*pp).u8[7]; + tmp.u8[8]=(*pp).u8[8]; + tmp.u8[9]=(*pp).u8[9]; + tmp.u8[10]=(*pp).u8[10]; + tmp.u8[11]=(*pp).u8[11]; + tmp.u8[12]=(*pp).u8[12]; + tmp.u8[13]=(*pp).u8[13]; + tmp.u8[14]=(*pp).u8[14]; + tmp.u8[15]=(*pp).u8[15]; + return tmp.i; +} + + +#define _MM_SET1_EPI8(x) _em_set1_epi8(x) +#define _MM_INSERT_EPI8(r,x,i) _em_insert_epi8((r),(x),(i)) +#define _MM_CMPEQ_EPI8(x,y) _em_cmpeq_epi8((x),(y)) +#define _MM_CMPGT_EPI8(x,y) _em_cmpgt_epi8((x),(y)) +#define _MM_CMPLT_EPI8(x,y) _em_cmplt_epi8((x),(y)) +#define _MM_MAX_EPI8(x,y) _em_max_epi8((x),(y)) +#define _MM_MIN_EPI8(x,y) _em_min_epi8((x),(y)) +#define _MM_ADD_EPI8(x,y) _em_add_epi8((x),(y)) +#define _MM_SUB_EPI8(x,y) _em_sub_epi8((x),(y)) +#define _MM_EXTRACT_EPI8(r,p) _em_extract_epi8((r),(p)) + +#define _MM_MIN_EPU8(x,y) _em_min_epu8((x),(y)) + +#define _MM_SET1_EPI16(x) _em_set1_epi16(x) +#define _MM_INSERT_EPI16(r,x,i) _em_insert_epi16((r),(x),(i)) +#define _MM_CMPEQ_EPI16(x,y) _em_cmpeq_epi16((x),(y)) +#define _MM_CMPGT_EPI16(x,y) _em_cmpgt_epi16((x),(y)) +#define _MM_CMPLT_EPI16(x,y) _em_cmplt_epi16((x),(y)) 
+#define _MM_MAX_EPI16(x,y) _em_max_epi16((x),(y)) +#define _MM_MIN_EPI16(x,y) _em_min_epi16((x),(y)) +#define _MM_ADD_EPI16(x,y) _em_add_epi16((x),(y)) +#define _MM_SUB_EPI16(x,y) _em_sub_epi16((x),(y)) +#define _MM_EXTRACT_EPI16(r,p) _em_extract_epi16((r),(p)) +#define _MM_UNPACKLO_EPI8(a,b) _em_unpacklo_epi8((a),(b)) +#define _MM_UNPACKHI_EPI8(a,b) _em_unpackhi_epi8((a),(b)) +#define _MM_ADDS_EPU16(x,y) _em_adds_epu16((x),(y)) + +#define _MM_SRLI_EPI64(r,x) _em_srli_epi64((r),(x)) +#define _MM_SLLI_EPI64(r,x) _em_slli_epi64((r),(x)) + +#define _MM_SETZERO_SI128() _em_setzero_si128() + +#define _MM_AND_SI128(x,y) _em_and_si128((x),(y)) +#define _MM_ANDNOT_SI128(x,y) _em_andnot_si128((x),(y)) +#define _MM_OR_SI128(x,y) _em_or_si128((x),(y)) +#define _MM_XOR_SI128(x,y) _em_xor_si128((x),(y)) +#define _MM_SLLI_SI128(r,s) _em_slli_si128((r),(s)) +#define _MM_SRLI_SI128(r,s) _em_srli_si128((r),(s)) + +#define _MM_LOADU_SI128(x) _em_loadu_si128(x) +#define _MM_LOAD_SI128(x) _em_loadu_si128(x) + + +#endif /* __SSE2__ */ + +#define _MM_NOT_SI128(x) _MM_XOR_SI128((x),(_MM_SET1_EPI8(0xFFFF))) + +#endif diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/libutils/Makefile b/src/sumatra/sumatra-1.0.10/sumalibs/libutils/Makefile new file mode 100644 index 0000000..8428d77 --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/libutils/Makefile @@ -0,0 +1,25 @@ + +SOURCES = utilities.c \ + debug.c + +SRCS=$(SOURCES) + + +OBJECTS= $(patsubst %.c,%.o,$(SOURCES)) + +LIBFILE= libutils.a +RANLIB=ranlib + + +include ../global.mk + +all: $(LIBFILE) + +clean: + rm -rf $(OBJECTS) $(LIBFILE) + rm -f *.P + rm -f *.a + +$(LIBFILE): $(OBJECTS) + ar -cr $@ $? + $(RANLIB) $@ diff --git a/src/sumatra/sumatra-1.0.10/sumalibs/libutils/debug.c b/src/sumatra/sumatra-1.0.10/sumalibs/libutils/debug.c new file mode 100644 index 0000000..b0131d5 --- /dev/null +++ b/src/sumatra/sumatra-1.0.10/sumalibs/libutils/debug.c @@ -0,0 +1,32 @@ +/* + * debug.c + * + * Created on: 4 sept. 
/*
 * Function Name: int2bin(int64_t i, size_t bits)
 * Description:   Formats the `bits` least-significant bits of i as a string of
 *                '0'/'1' characters, most-significant bit first.
 *
 * Returns a pointer to a static 65-byte buffer that is overwritten by the next
 * call, or NULL when bits is greater than 64.
 */
char *int2bin(int64_t i, size_t bits)
{
    static char str[65];
    uint64_t u;

    if (bits > 64)
        return NULL;

    /*
     * Convert the value instead of punning it through `unsigned *`: that cast
     * read only sizeof(unsigned) bytes of the 64-bit argument. Converting a
     * signed value to uint64_t is well defined (reduction modulo 2^64), keeps
     * every bit, and makes the right shifts below operate on an unsigned type.
     */
    u = (uint64_t)i;

    str[bits] = '\0';
    while (bits > 0) {
        bits--;
        str[bits] = (u & 1u) ? '1' : '0';
        u >>= 1;
    }

    return str;
}
errorAbort(MEM_ALLOC_ERROR,"Could not allocate memory.",filename,line); + + return chunk; +} + +/* + * Function Name: util_realloc(void *chunk, int32_t newsize, const char *filename, int32_t line) + * Description: Overloading realloc funstion, changes the size of the memory object pointed to by chunk + * to the size specified by newsize. If memory cannot be allocated, gives the error on stderr and aborts. + */ +void *util_realloc(void *chunk, size_t newsize, const char *filename, int32_t line) +{ + void *newchunk; + + newchunk = realloc(chunk,newsize); + + if (!newchunk) + { + errorAbort(MEM_ALLOC_ERROR,"Could not allocate memory.",filename,line); + } + + return newchunk; +} + +/* + * Function Name: util_free(void *chunk) + * Description: Returns the memory specified by chunk back to operating syste. + */ +void util_free(void *chunk) +{ + free(chunk); +} + +BOOL util_findInArr(int32_t tempArr[], int seqNo, int32_t noOfSeqs) +{ + int index; + + for(index = 0; index < noOfSeqs; index++) + { + if(tempArr[index] == seqNo) return TRUE; + } + + return FALSE; +} + + +/** + * + * String handling utilities + * + **/ + +/* + * Function Name: str_chopAtDelim(char *dest, char *src, char *delim, BOOL includeDelim) + * Description: chops the string startig from source to the delimeter specified. + */ +char *str_chopAtDelim(char *dest, char *src, char *delim, BOOL includeDelim) +{ + char *temp; + int32_t len; + + /* returns a pointer to the first occurance of delim in src*/ + temp = strstr(src, delim); + + if (temp == NULL) + return NULL; + + if (includeDelim) + { + /* temp - src + strlen(delim) -> a string between src and delimeter including delimeter*/ + len = temp - src + strlen(delim); + strncpy(dest, src, len); + } + else + { + len = temp - src; + strncpy(dest, src, temp - src); + } + dest[len] = '\0'; + + return dest; +} + +/* + * Function Name: str_sepNameValue(char *name, char *value, char *src, char *delim) + * Description: . 
/*
 * Function Name: str_sepNameValue(char *name, char *value, char *src, char *delim)
 * Description:   Splits src at the first occurrence of delim. The text before
 *                the delimiter is stored in name and the text after it in
 *                value. When delim does not occur, name receives all of src
 *                and value becomes the empty string.
 *
 * No buffer sizes are passed in, so nothing is bounds-checked here: name and
 * value must be large enough for the pieces plus their terminators.
 */
void str_sepNameValue(char *name, char *value, char *src, char *delim)
{
    char *hit = strstr(src, delim);

    if (hit == NULL) {
        strcpy(name, src);
        value[0] = '\0';
        return;
    }

    /*
     * strncpy() with a count shorter than the source does not write a
     * terminator, which left name unterminated. Copy the prefix and
     * terminate it explicitly.
     */
    size_t nameLen = (size_t)(hit - src);
    memcpy(name, src, nameLen);
    name[nameLen] = '\0';

    strcpy(value, hit + strlen(delim));
}

/*
 * Function Name: str_isSpace(char ch)
 * Description:   Returns 1 when ch is a blank, a tab or a newline, 0 otherwise.
 */
int str_isSpace (char ch)
{
    return (ch == ' ' || ch == '\t' || ch == '\n') ? 1 : 0;
}

/*
 * Function Name: str_removeSpaces(char *src)
 * Description:   Removes, in place, the characters accepted by str_isSpace()
 *                from the start and the end of the string. A NULL pointer is
 *                ignored.
 */
void str_removeSpaces(char *src)
{
    size_t first;
    size_t past;

    /* Test for NULL before strlen(): the length used to be taken first. */
    if (src == NULL)
        return;

    first = 0;
    past = strlen(src);

    while (first < past && str_isSpace(src[first]))
        first++;
    while (past > first && str_isSpace(src[past - 1]))
        past--;

    /*
     * first == past means the string held nothing but whitespace; it becomes
     * empty whatever the whitespace characters were (previously only a
     * remaining ' ' was cleared, a remaining tab or newline was kept).
     */
    if (first > 0)
        memmove(src, src + first, past - first);
    src[past - first] = '\0';
}
/*
 * Function Name: str_strrstr(char *src, char *delim)
 * Description:   Returns a pointer to the last occurrence of the string delim
 *                in src, or NULL when delim does not occur.
 */
char *str_strrstr(char *src, char *delim)
{
    char *last = NULL;
    char *hit = strstr(src, delim);

    while (hit != NULL) {
        last = hit;
        /*
         * An empty delim matches everywhere, including at the terminator;
         * stop there instead of searching from one byte past the string.
         */
        if (*hit == '\0')
            break;
        hit = strstr(hit + 1, delim);
    }

    return last;
}

/* Alignment, in bytes, provided by getA16Address() and reallocA16Address(). */
enum { A16_ALIGNMENT = 16 };

/* Rounds a non-negative byte count up to a non-zero multiple of A16_ALIGNMENT. */
static size_t a16_roundUp(int size)
{
    size_t bytes = (size_t)size;

    if (bytes == 0)
        bytes = 1;
    return (bytes + ((size_t)A16_ALIGNMENT - 1)) & ~((size_t)A16_ALIGNMENT - 1);
}

/*
 * Function Name: getA16Address(int size)
 * Description:   Allocates at least size bytes whose address is a multiple
 *                of 16. Returns NULL when size is negative or the allocation
 *                fails.
 *
 * The block comes from aligned_alloc(), so the returned pointer is the start
 * of the allocation and may be passed to free() or reallocA16Address(). The
 * previous version advanced a malloc() result until it was aligned, which
 * shortened the usable area and produced a pointer that could not be freed.
 * NOTE(review): aligned_alloc() is C11; confirm every supported port has it.
 */
void *getA16Address(int size)
{
    if (size < 0)
        return NULL;

    return aligned_alloc((size_t)A16_ALIGNMENT, a16_roundUp(size));
}

/*
 * Function Name: reallocA16Address(void **address, int size)
 * Description:   Resizes the block stored in *address to at least size bytes,
 *                keeps its start on a multiple of 16 and stores the (possibly
 *                moved) block back in *address. *address may be NULL, in which
 *                case a new block is allocated.
 *
 * Returns address itself (NULL when address is NULL). On failure, or when size
 * is negative, the old block is released and *address is set to NULL.
 */
void **reallocA16Address(void **address, int size)
{
    size_t bytes;
    void *block;

    if (address == NULL)
        return NULL;

    if (size < 0) {
        free(*address);
        *address = NULL;
        return address;
    }

    bytes = a16_roundUp(size);

    /*
     * Resize the block itself. The old code passed `address` (the location of
     * the pointer) to realloc() instead of `*address`.
     */
    block = realloc(*address, bytes);
    if (block == NULL) {
        free(*address);
        *address = NULL;
        return address;
    }

    /*
     * realloc() does not promise 16-byte alignment. If the new block is
     * misaligned, move the data into an aligned block rather than bumping the
     * pointer, so the result stays valid for free()/realloc().
     */
    if (((uintptr_t)block % (uintptr_t)A16_ALIGNMENT) != 0) {
        void *aligned = aligned_alloc((size_t)A16_ALIGNMENT, bytes);

        if (aligned != NULL)
            memcpy(aligned, block, bytes);
        free(block);
        block = aligned;
    }

    *address = block;
    return address;
}
(x):(y)) +#endif + +typedef char BOOL; +#define TRUE (3==3) +#define FALSE (!TRUE) +#define ALILEN (0) +#define MAXLEN (1) +#define MINLEN (2) + + +/* Error Codes */ +#define FILE_OPENING_ERROR (1) +#define MEM_ALLOC_ERROR (2) + +/* Prototypes */ +void errorAbort(int32_t code, char* errorMsg, char* fileName, int32_t lineNumber); +char *str_strrstr(char *src, char *delim); +void str_removeSpaces(char *src); +void str_sepNameValue(char *name, char *value, char *src, char *delim); +char *str_chopAtDelim(char *dest, char *src, char *delim, BOOL includeDelim); +void util_free(void *chunk); +void *util_realloc(void *chunk, size_t newsize, const char *filename, int32_t line); +void *util_malloc(size_t chunksize, const char *filename, int32_t line); +BOOL util_findInArr(int32_t tempArr[], int seqNo, int32_t noOfSeqs); +void* getA16Address(int size); +void** reallocA16Address(void** address, int size); + +/* Macros */ +#define ERRORABORT(code, msg) errorAbort((code), (msg), __FILE__, __LINE__) + +#endif /*UTILITIES_H_*/ + + + diff --git a/src/sumatra-1.0.10/sumatra.c b/src/sumatra/sumatra-1.0.10/sumatra.c similarity index 100% rename from src/sumatra-1.0.10/sumatra.c rename to src/sumatra/sumatra-1.0.10/sumatra.c diff --git a/src/sumatra-1.0.10/sumatra.h b/src/sumatra/sumatra-1.0.10/sumatra.h similarity index 100% rename from src/sumatra-1.0.10/sumatra.h rename to src/sumatra/sumatra-1.0.10/sumatra.h diff --git a/src/sumatra-1.0.10/sumatra_user_manual.md b/src/sumatra/sumatra-1.0.10/sumatra_user_manual.md similarity index 100% rename from src/sumatra-1.0.10/sumatra_user_manual.md rename to src/sumatra/sumatra-1.0.10/sumatra_user_manual.md diff --git a/src/sumatra-1.0.10/sumatra_user_manual.pdf b/src/sumatra/sumatra-1.0.10/sumatra_user_manual.pdf similarity index 100% rename from src/sumatra-1.0.10/sumatra_user_manual.pdf rename to src/sumatra/sumatra-1.0.10/sumatra_user_manual.pdf