Patch compilation of binaries

Former-commit-id: 688670c339643a282bdeabafafff3b451be83cb6
Former-commit-id: 60d3e42d2af73515fea50d9c97dc4eacda9c8abb
This commit is contained in:
2015-10-02 23:20:33 +02:00
parent c89d3cc24f
commit 6f0ba35953
391 changed files with 47675 additions and 2155 deletions

View File

@ -103,13 +103,16 @@ TAR = tar
# PRTDIR : port dependent files location (libraries and binaries)
# BINDIR : port binaries
# LIBDIR : port libraries
# INCDIR : port includes
#
PRTDIR = $(CFGDIR)../ports/$(PORTNAME)
BINDIR = $(PRTDIR)/bin
BINDIR = $(abspath $(PRTDIR))/bin
LIBDIR = $(PRTDIR)/lib
LIBDIR = $(abspath $(PRTDIR))/lib
INCDIR = $(abspath $(PRTDIR))/include
# ------------------------------------
# default gmake variable in implicit rules

View File

@ -18,9 +18,15 @@
# General compilation flags
# ------------------------------------
CC = /usr/bin/gcc
CXX = /usr/bin/g++
CXXPP = /usr/bin/cpp
CPP = /usr/bin/cpp
#
# MACHDEF : define machine and OS specific flags
#
MACHINE = MACOSX
MACHDEF = -DLX_TARGET_MACINTEL -DLITTLE_ENDIAN -DMACOSX

View File

@ -22,7 +22,7 @@
# MACHDEF : define machine and OS specific flags
#
MACHDEF = -DLX_TARGET_LINUX -DLITTLE_ENDIAN
MACHDEF = -DLX_TARGET_LINUX -DLITTLE_ENDIAN -DLINUX
#
# MATH_LIBS : machine specific math librairies

View File

@ -15,6 +15,9 @@ PKGDIR ?= build.$(PORTNAME)
PRTPATH = $(abspath $(PRTDIR))
PRTPATH_BIN = $(PRTPATH)/bin
PKG_CONFIG = $(PRTPATH)/bin/pkg-config
#
# Rules
#
@ -28,7 +31,17 @@ pkg.expand::
test -f $(PKGDIR)/configure || $(TAR) zxf $(PKGTAR) -C $(PKGDIR) --strip-components 1
pkg.make:: pkg.expand
test -f $(PKGDIR)/Makefile || (cd $(PKGDIR) && ./configure --prefix=$(PRTPATH))
echo $(PKG_CONFIG)
test -f $(PKGDIR)/Makefile || (export PATH="$(PRTPATH_BIN):$$PATH" && \
export PKG_CONFIG=$(PKG_CONFIG) && \
export CC="$(CC)" && \
export CXX="$(CXX)" && \
export CPP="$(CPP)" && \
export CXXPP="$(CXXPP)" && \
export CFLAGS="$(CFLAGS)" && \
export LDFLAGS="$(LDFLAGS)" && \
cd $(PKGDIR) && \
./configure --prefix=$(PRTPATH) $(CONFIGURE_OPTIONS))
$(MAKE) -C $(PKGDIR)
pkg.install:: pkg.make

View File

@ -17,12 +17,18 @@
#
include ../config/auto.conf
DIRS = exonerate \
DIRS = aragorn \
clustalo \
exonerate \
hmmer3 \
kimono \
muscle \
ncbiblast \
prokov \
repseek \
sequtils \
aragorn \
ncbiblast
sumaclust \
sumatra
include ../config/targets/propagate.targ

29
src/clustalo/Makefile Executable file
View File

@ -0,0 +1,29 @@
# ---------------------------------------------------------------
# $Id: $
# ---------------------------------------------------------------
# @file: Makefile
# @desc: makefile for lxpack
#
# @history:
# @history:
# @+ <Gloup> : Apr 97 : Created
# @+ <Gloup> : Mar 02 : Updated for LXxware
#
# @note: should be processed with gnu compatible make
# @note: helixware_compatible
#
# @end:
# ---------------------------------------------------------------
#
include ../../config/auto.conf
DIRS = argtable \
clustalo
include ../../config/targets/propagate.targ
include ../../config/targets/help.targ
all::
$(MAKE) ACTION=$@ _action

View File

@ -0,0 +1,24 @@
# ---------------------------------------------------------------
# $Id: $
# ---------------------------------------------------------------
# @file: Makefile
# @desc: makefile for package exonerate
#
# @history:
# @+ <Gloup> : Sept 15 : Adapted to ORG.Annot
#
# @note: should be processed with gnu compatible make
# @note: helixware_compatible
#
# @end:
# ---------------------------------------------------------------
#
include ../../../config/auto.conf
PKG = argtable2-13
include $(CFGDIR)targets/package.targ
include $(CFGDIR)targets/help.targ

Binary file not shown.

View File

@ -0,0 +1,24 @@
# ---------------------------------------------------------------
# $Id: $
# ---------------------------------------------------------------
# @file: Makefile
# @desc: makefile for package exonerate
#
# @history:
# @+ <Gloup> : Sept 15 : Adapted to ORG.Annot
#
# @note: should be processed with gnu compatible make
# @note: helixware_compatible
#
# @end:
# ---------------------------------------------------------------
#
include ../../../config/auto.conf
PKG = clustal-omega-1.2.1
include $(CFGDIR)targets/package.targ
include $(CFGDIR)targets/help.targ

Binary file not shown.

Binary file not shown.

20
src/exonerate/Makefile Normal file → Executable file
View File

@ -2,10 +2,12 @@
# $Id: $
# ---------------------------------------------------------------
# @file: Makefile
# @desc: makefile for package exonerate
# @desc: makefile for lxpack
#
# @history:
# @+ <Gloup> : Sept 15 : Adapted to ORG.Annot
# @history:
# @+ <Gloup> : Apr 97 : Created
# @+ <Gloup> : Mar 02 : Updated for LXxware
#
# @note: should be processed with gnu compatible make
# @note: helixware_compatible
@ -13,12 +15,18 @@
# @end:
# ---------------------------------------------------------------
#
include ../../config/auto.conf
PKG = exonerate-2.2.0
DIRS = pkg-config \
libffi \
gettext \
glib2 \
exonerate
include $(CFGDIR)targets/package.targ
include ../../config/targets/propagate.targ
include $(CFGDIR)targets/help.targ
include ../../config/targets/help.targ
all::
$(MAKE) ACTION=$@ _action

Binary file not shown.

BIN
src/exonerate/exonerate/.DS_Store vendored Normal file

Binary file not shown.

View File

@ -0,0 +1,24 @@
# ---------------------------------------------------------------
# $Id: $
# ---------------------------------------------------------------
# @file: Makefile
# @desc: makefile for package exonerate
#
# @history:
# @+ <Gloup> : Sept 15 : Adapted to ORG.Annot
#
# @note: should be processed with gnu compatible make
# @note: helixware_compatible
#
# @end:
# ---------------------------------------------------------------
#
include ../../../config/auto.conf
PKG = exonerate-2.2.0_EC
include $(CFGDIR)targets/package.targ
include $(CFGDIR)targets/help.targ

Binary file not shown.

View File

@ -0,0 +1,24 @@
# ---------------------------------------------------------------
# $Id: $
# ---------------------------------------------------------------
# @file: Makefile
# @desc: makefile for package exonerate
#
# @history:
# @+ <Gloup> : Sept 15 : Adapted to ORG.Annot
#
# @note: should be processed with gnu compatible make
# @note: helixware_compatible
#
# @end:
# ---------------------------------------------------------------
#
include ../../../config/auto.conf
PKG = gettext-0.19
include $(CFGDIR)targets/package.targ
include $(CFGDIR)targets/help.targ

Binary file not shown.

View File

@ -0,0 +1,25 @@
# ---------------------------------------------------------------
# $Id: $
# ---------------------------------------------------------------
# @file: Makefile
# @desc: makefile for package exonerate
#
# @history:
# @+ <Gloup> : Sept 15 : Adapted to ORG.Annot
#
# @note: should be processed with gnu compatible make
# @note: helixware_compatible
#
# @end:
# ---------------------------------------------------------------
#
include ../../../config/auto.conf
PKG = glib-2.44.1
CONFIGURE_OPTIONS = --disable-dtrace
include $(CFGDIR)targets/package.targ
include $(CFGDIR)targets/help.targ

Binary file not shown.

BIN
src/exonerate/libffi/.DS_Store vendored Normal file

Binary file not shown.

View File

@ -0,0 +1,24 @@
# ---------------------------------------------------------------
# $Id: $
# ---------------------------------------------------------------
# @file: Makefile
# @desc: makefile for package exonerate
#
# @history:
# @+ <Gloup> : Sept 15 : Adapted to ORG.Annot
#
# @note: should be processed with gnu compatible make
# @note: helixware_compatible
#
# @end:
# ---------------------------------------------------------------
#
include ../../../config/auto.conf
PKG = libffi-3.2.1
include $(CFGDIR)targets/package.targ
include $(CFGDIR)targets/help.targ

Binary file not shown.

View File

@ -0,0 +1,25 @@
# ---------------------------------------------------------------
# $Id: $
# ---------------------------------------------------------------
# @file: Makefile
# @desc: makefile for package exonerate
#
# @history:
# @+ <Gloup> : Sept 15 : Adapted to ORG.Annot
#
# @note: should be processed with gnu compatible make
# @note: helixware_compatible
#
# @end:
# ---------------------------------------------------------------
#
include ../../../config/auto.conf
PKG = pkg-config-0.29
CONFIGURE_OPTIONS= --with-internal-glib
include $(CFGDIR)targets/package.targ
include $(CFGDIR)targets/help.targ

Binary file not shown.

28
src/hmmer3/Makefile Executable file
View File

@ -0,0 +1,28 @@
# ---------------------------------------------------------------
# $Id: $
# ---------------------------------------------------------------
# @file: Makefile
# @desc: makefile for lxpack
#
# @history:
# @history:
# @+ <Gloup> : Apr 97 : Created
# @+ <Gloup> : Mar 02 : Updated for LXxware
#
# @note: should be processed with gnu compatible make
# @note: helixware_compatible
#
# @end:
# ---------------------------------------------------------------
#
include ../../config/auto.conf
DIRS = hmmer3
include ../../config/targets/propagate.targ
include ../../config/targets/help.targ
all::
$(MAKE) ACTION=$@ _action

Binary file not shown.

View File

@ -0,0 +1,24 @@
# ---------------------------------------------------------------
# $Id: $
# ---------------------------------------------------------------
# @file: Makefile
# @desc: makefile for package exonerate
#
# @history:
# @+ <Gloup> : Sept 15 : Adapted to ORG.Annot
#
# @note: should be processed with gnu compatible make
# @note: helixware_compatible
#
# @end:
# ---------------------------------------------------------------
#
include ../../../config/auto.conf
PKG = hmmer-3.1b1
include $(CFGDIR)targets/package.targ
include $(CFGDIR)targets/help.targ

Binary file not shown.

View File

@ -1,4 +1,4 @@
# ---------------------------------------------------------------
#---------------------------------------------------------------
# $Id: $
# ---------------------------------------------------------------
# @file: Makefile
@ -25,9 +25,6 @@ include ../../config/targets/help.targ
all::
$(MAKE) ACTION=$@ _action
test -d $(PRTDIR) || mkdir $(PRTDIR)
test -d $(BINDIR) || mkdir $(BINDIR)
\cp -f lxpack/ports/$(PORTNAME)/bin/* $(BINDIR)
clean::
$(MAKE) -C lxpack portclean

View File

@ -15,14 +15,14 @@
# @end:
# ---------------------------------------------------------------
#
include ./config/auto.conf
include ../../../config/auto.conf
DIRS = src \
tests
include ./config/targets/propagate.targ
include ../../../config/targets/propagate.targ
include ./config/targets/help.targ
include ../../../config/targets/help.targ
portclean::
$(MAKE) ACTION=$@ _action

View File

@ -1,51 +0,0 @@
$Id: README.txt 1825 2013-02-26 09:39:47Z viari $
This directory contains Makefile machine specific configuration files
(and default targets to help you writing Makefile's)
These headers should be used with GNU make or compatible
#
# portname
#
To check your port, issue :
./guess_port
if output is 'unknown <mach>:<sys>:<rel>' then you should :
- add a port entry in guess_port for <mach>:<sys>:<rel>
- create a ports/<port>.conf configuration file
(the best is to start from another port file,
choose whatever looks closest)
#
# configuration flags
#
auto.conf : the main configuration file :
- determine the machine port thru 'guess_port' shell
- include 'default.conf' file
- include the machine specific 'ports/<port>.conf' file
default.conf : default configuration (included by 'auto.conf')
ports/<port>.conf : machine specific configuration (included by 'auto.conf')
#
# utility targets
#
targets/help.targ : target for standard help
targets/propagate.targ : target for propagating targets to subdirectories
targets/package.targ : default targets for standard package with 'configure'
targets/empty.targ : default empty targets (defined as double colon rules)
targets/lxbin.targ : default make targets for standard lx binary (without libraries)
targets/debug.targ : target to print debug information (for dev.)

View File

@ -1,54 +0,0 @@
#
# $Id: auto.conf 1825 2013-02-26 09:39:47Z viari $
#
# auto.conf
# auto configuration file using guess_port
#
# this file is included in Makefile
#
#
# default shell for gnu-make
#
SHELL = /bin/sh
#
# CFGDIR : location of config files = this file directory location
#
# CFGPRT : port name (as returned by guess_port)
#
# because builtin 'lastword' is missing in gnu-make 3.80
lastword = $(word $(words $1), $1)
CFGDIR := $(dir $(call lastword, $(MAKEFILE_LIST)))
CFGPRT := $(shell $(CFGDIR)guess_port)
# check if port is correctly defined
ifneq (1, $(words $(CFGPRT)))
entry := $(call lastword, $(CFGPRT))
$(error port is undefined - add entry for "$(entry)" in configuration file -)
endif
#
# PORTNAME : port name to use : default is CFGPRT but may be futher modified
# by machine specific configuration
PORTNAME = $(CFGPRT)
#
# default configuration
# may be overriden by machine dependant definitions below
#
include $(CFGDIR)default.conf
#
# machine dependant definitions
#
include $(CFGDIR)ports/$(CFGPRT).conf

View File

@ -1,124 +0,0 @@
#
# $Id: default.conf 2007 2013-12-03 14:21:39Z viari $
#
# default.conf
# default configuration flags
# maybe further redefined by machine specific configuration
#
# this file is included by auto.conf
#
# ------------------------------------
# General compilation flags
# ------------------------------------
#
# MACHDEF : define machine and OS specific flags
#
MACHDEF =
#
# CC : (ansi C) compiler command to use
# you may add some machine specific flags (like -arch ...)
# in the <machine>.conf configuration file
#
CC = gcc
#
# default compiler optimizer flag
#
OPTIM = -O
#
# CC_LIBS : additionnal machine specific $(CC) libraries
# like '-lC' on some machines
#
CC_LIBS =
#
# MALLOC_LIBS : machine specific malloc librairies
# like '-lmalloc' on SGI
#
MALLOC_LIBS =
#
# MATH_LIBS : machine specific math librairies
# like '-lm' on Solaris
#
MATH_LIBS =
#
# LINT : looks like LINT command does not exist anymore
# here is a rough replacement
#
LINT = gcc -S -Wall -Wno-format-y2k -W -Wstrict-prototypes \
-Wmissing-prototypes -Wpointer-arith -Wreturn-type \
-Wcast-qual -Wwrite-strings -Wswitch -Wshadow \
-Wcast-align -Wbad-function-cast -Wchar-subscripts \
-Winline -Wnested-externs -Wredundant-decls
# ------------------------------------
# General system commands
# ------------------------------------
#
# AR : AR archive command
# ARFLAGS : $(AR) archiving flags
# ARXFLAGS : $(AR) extraction flags
#
AR = ar
ARFLAGS = rcv
ARXFLAGS = xv
#
# RANLIB : ranlib command
#
RANLIB = ranlib
#
# DIFF : diff command
#
DIFF = diff
#
# TAR : tar command
#
TAR = tar
# ------------------------------------
# Default locations
# ------------------------------------
#
# PRTDIR : port dependent files location (libraries and binaries)
# BINDIR : port binaries
# LIBDIR : port libraries
#
PRTDIR = $(CFGDIR)../ports/$(PORTNAME)
BINDIR = $(PRTDIR)/bin
LIBDIR = $(PRTDIR)/lib
# ------------------------------------
# default gmake variable in implicit rules
# ------------------------------------
CFLAGS = $(OPTIM) $(MACHDEF) -I$(INCDIR)
LDFLAGS = -L$(LIBDIR) -L.
LDLIBS = $(LIBS) $(MALLOC_LIBS) $(MATH_LIBS) $(CC_LIBS)
LINTFLAGS = $(MACHDEF) -I$(INCDIR)

View File

@ -1,33 +0,0 @@
#! /bin/sh
#
# $Id: guess_port 1825 2013-02-26 09:39:47Z viari $
#
# @file: guess_port
# @desc: attempt to guess the portname
# @usage: guess_port
#
# @history:
# @+ <Gloup> Nov. 2000 first draft adapted from GNU config.guess
# @+ <Gloup> Feb. 2010 moved to sh
#
mach=`uname -m`
syst=`uname -s`
rels=`uname -r`
case ${mach}:${syst}:${rels} in
alpha:OSF1:* ) echo alpha-osf1;;
sun4*:SunOS:5.* ) echo sparc-solaris;;
i86pc:SunOS:5.* ) echo i386-solaris;;
sun4*:SunOS:* ) echo sparc-sunos;;
Power*:Darwin:* ) echo ppc-darwin;;
i*86:Linux:* ) echo i386-linux;;
x*86*:Linux:* ) echo i386-linux;;
i*86:Darwin:* ) echo i386-darwin;;
IP*:IRIX*:* ) echo mips-irix;;
i*86:MINGW32*:* ) echo x86-mingw32;;
*) echo unknown ${mach}:${syst}:${rels}; exit 1;;
esac
exit 0

View File

@ -1,26 +0,0 @@
#
# $Id: i386-darwin.conf 1825 2013-02-26 09:39:47Z viari $
#
# i386-darwin.conf
# configuration file for MacOS-X/Intel-Based/Darwin 1.2 with gcc compiler
# this file is included in Makefile
#
# system (uname -srp) : Darwin 8.7.1 i386
# compiler (cc --version) : i686-apple-darwin8-gcc-4.0.1
#
# check tags
# @uname:uname -srp:Darwin 8.7.1 i386
# @cc:cc --version:i686-apple-darwin8-gcc-4.0.1
#
#
# ------------------------------------
# General compilation flags
# ------------------------------------
#
# MACHDEF : define machine and OS specific flags
#
MACHDEF = -DLX_TARGET_MACINTEL -DLITTLE_ENDIAN -DMACOSX

View File

@ -1,32 +0,0 @@
#
# $Id: i386-linux.conf 1825 2013-02-26 09:39:47Z viari $
#
# i386-linux.conf
# configuration file for linux ix86 with GNU gcc compiler
# this file is included in Makefile
#
# system (uname -srp) : Linux 2.2.14-5.0 unknown
# compiler (gcc --version) : egcs-2.91.66
#
# check tags
# @uname:uname -srp:Linux 2.2.14-5.0 unknown
# @cc:cc --version:egcs-2.91.66
#
#
# ------------------------------------
# General compilation flags
# ------------------------------------
#
# MACHDEF : define machine and OS specific flags
#
MACHDEF = -DLX_TARGET_LINUX -DLITTLE_ENDIAN
#
# MATH_LIBS : machine specific math librairies
#
MATH_LIBS = -lm

View File

@ -1,32 +0,0 @@
#
# $Id: ppc-darwin.conf 1825 2013-02-26 09:39:47Z viari $
#
# ppc-darwin.conf
# configuration file for MacOS-X/Darwin 1.2 with native cc compiler
# this file is included in Makefile
#
# system (uname -srp) : Darwin 1.2 powerpc
# compiler (cc --version) : 2.7.2.1
#
# check tags
# @uname:uname -srp:Darwin 1.2 powerpc
# @cc:cc --version:2.7.2.1
#
#
# ------------------------------------
# General compilation flags
# ------------------------------------
#
# MACHDEF : define machine and OS specific flags
#
MACHDEF = -DLX_TARGET_MACPPC -DBIG_ENDIAN
#
# CC : name of (ansi C) compiler to use
#
CC = cc -arch ppc

View File

@ -1,31 +0,0 @@
#
# $Id: sparc-solaris.conf 1825 2013-02-26 09:39:47Z viari $
#
# sparc-solaris.conf
# configuration file for sparc solaris with GNU gcc compiler
# this file is included in Makefile
#
# system (uname -srp) : SunOS 5.8 sparc
# compiler (gcc --version) : 2.95.2
#
# check tags
# @uname:uname -srp:SunOS 5.8 sparc
# @cc:cc --version:2.95.2
#
#
# ------------------------------------
# General compilation flags
# ------------------------------------
#
# MACHDEF : define machine and OS specific flags
#
MACHDEF = -DLX_TARGET_SOLARIS -DBIG_ENDIAN
#
# MATH_LIBS : machine specific math librairies
#
MATH_LIBS = -lm

View File

@ -1,54 +0,0 @@
#
# $Id: x86-mingw32.conf 1825 2013-02-26 09:39:47Z viari $
#
# x86-mingw32
# configuration file for MinGW with GNU gcc compiler.
#
# this file is included in Makefile
#
#
#
# rename PORTNAME safely since MinGW produce pure win32 executables
# without dll's
#
PORTNAME = x86-win32
# ------------------------------------
# General compilation flags
# ------------------------------------
#
# CC_LIBS : additionnal machine specific $(CC) libraries
#
# libiberty is needed for some system extensions (like mkstemps)
#
CC_LIBS = -liberty
#
# MACHDEF : define machine and OS specific flags
#
# -DDLMALLOC : use dlmalloc instead of malloc (which does not have mallinfo)
# -posix is a new replacement for several MinGW32 flags, including:
# -D__USE_MINGW_ANSI_STDIO : mingw gcc flag to recognize the C99 "%zu" format
#
MACHDEF = -posix -DLX_TARGET_WIN32 -DWIN_MINGW -DDLMALLOC -DLITTLE_ENDIAN
#
# MATH_LIBS : machine specific math librairies
#
MATH_LIBS = -lm
# ------------------------------------
# General system commands
# ------------------------------------
#
# DIFF : diff command / should ignore cr on windows
#
DIFF = diff --strip-trailing-cr

View File

@ -1,25 +0,0 @@
#
# $Id: help.targ 1825 2013-02-26 09:39:47Z viari $
#
# debug.targ
#
# target to print debug information (dev. only)
#
# it defines the following targets:
#
# debug :
# print debug
#
# it requires auto.conf
#
.PHONY: debug
debug::
@echo "+ PORTNAME: $(PORTNAME)"
@echo "+ CFGPRT: $(CFGPRT)"
@echo "+ CFGDIR: $(CFGDIR)"
@echo "+ PRTDIR: $(PRTDIR)"
@echo "+ MACHDEF: $(MACHDEF)"

View File

@ -1,24 +0,0 @@
#
# $Id: $
#
# epty.targ
#
# default empty targets (defined as double colon rules)
#
#
#
# Rules
#
.PHONY: all test clean portclean help
all::
test::
clean::
portclean:: clean
test::

View File

@ -1,23 +0,0 @@
#
# $Id: help.targ 1825 2013-02-26 09:39:47Z viari $
#
# help.targ
#
# default target to print help
#
# it defines the following targets:
#
# help :
# print help
#
.PHONY: help
help::
@ echo "basic usage: make [<action>+]"
@ echo "valid <action> :"
@ echo " all : compile everything for current port [default target]"
@ echo " clean : local cleanup"
@ echo " portclean : cleanup distribution for current port"
@ echo " test : run tests on current port"
@ echo " help : print this help"

View File

@ -1,51 +0,0 @@
#
# $Id: $
#
# lxbin.targ
#
# default make targets for standard lx binary
#
# you should define the 'PROGS' and 'OSRC' variables
# and optionnaly 'LIBS' if binaries have to be linked with libraries
#
# note: if main source code for binary PROG is PROG.c, there is nothing to do,
# else (e.g. if it involves several sources files) you should also add local
# file dependencies. e.g under the form:
#
# mymain: $(OBJ) mymain_base.c mymain_help.c
# $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) $(LDLIBS)
#
#
# 'auto.conf' should have been included
#
OBJ = $(OSRC:.c=.o)
INCDIR = ../include
#
# Rules
#
.PHONY: all prelib install test clean portclean
all:: prelib $(PROGS) install
@echo "+++++++++++ binaries $(PROGS) done"
prelib::
test -d $(PRTDIR) || mkdir $(PRTDIR) # because some linker may complain
test -d $(LIBDIR) || mkdir $(LIBDIR) # if -L$(LIBDIR) does not exist
install::
test -d $(PRTDIR) || mkdir $(PRTDIR)
test -d $(BINDIR) || mkdir $(BINDIR)
-for f in $(PROGS) ; do \cp -f $$f $(BINDIR) ; done
test::
clean::
-\rm -f *.o cvstatic* *% *.bak so_loc*
-\rm -f $(PROGS)
portclean:: clean
-(! test -d $(BINDIR)) || (cd $(BINDIR) && \rm -f $(PROGS))

View File

@ -1,43 +0,0 @@
#
# $Id: $
#
# lxlib.targ
#
# default make targets for standard lx library
#
# you should define the 'LOCLIB' and 'OSRC' variables
#
# 'auto.conf' should have been included
#
OBJ = $(OSRC:.c=.o)
INCDIR = ../include
#
# Rules
#
.PHONY: all lib install test clean portclean
all:: lib install
@echo "+++++++++++ library $(LOCLIB) done"
lib:: $(OBJ)
$(AR) $(ARFLAGS) $(LOCLIB) $(OBJ)
$(RANLIB) $(LOCLIB)
install::
test -d $(PRTDIR) || mkdir $(PRTDIR)
test -d $(LIBDIR) || mkdir $(LIBDIR)
\cp -f $(LOCLIB) $(LIBDIR)
$(RANLIB) $(LIBDIR)/$(LOCLIB)
test::
clean::
-\rm -f *.o cvstatic* *% *.bak so_loc*
-\rm -f $(LOCLIB)
portclean:: clean
-(! test -d $(LIBDIR)) || (cd $(LIBDIR) && \rm -f $(LOCLIB))

View File

@ -1,48 +0,0 @@
#
# $Id: package.targ 1825 2013-02-26 09:39:47Z viari $
#
# package.targ
#
# default make targets for standard package with configure
#
# you should define the 'PKG' variable
# (and optionaly 'PKGTAR', 'PKGDIR')
#
PKGTAR ?= $(PKG).tgz
PKGDIR ?= build.$(PORTNAME)
PRTPATH = $(abspath $(PRTDIR))
#
# Rules
#
.PHONY: all clean test portclean pkg pkg.expand pkg.make pkg.install
all:: pkg
pkg.expand::
test -d $(PKGDIR) || mkdir $(PKGDIR)
test -f $(PKGDIR)/configure || $(TAR) zxf $(PKGTAR) -C $(PKGDIR) --strip-components 1
pkg.make:: pkg.expand
test -f $(PKGDIR)/Makefile || (cd $(PKGDIR) && ./configure --prefix=$(PRTPATH))
$(MAKE) -C $(PKGDIR)
pkg.install:: pkg.make
$(MAKE) -C $(PKGDIR) install
pkg:: pkg.install
@echo "+++++++++++ package $(PKG) done"
test::
(! test -d $(PKGDIR)) || $(MAKE) -C $(PKGDIR) test
clean::
(! test -d $(PKGDIR)) || $(MAKE) -C $(PKGDIR) clean
portclean::
(! test -d $(PKGDIR)) || $(MAKE) -C $(PKGDIR) distclean
(! test -d $(PKGDIR)) || \rm -r $(PKGDIR)

View File

@ -1,30 +0,0 @@
#
# $Id: propagate.targ 1825 2013-02-26 09:39:47Z viari $
#
# propagate.targ
#
# default make targets for library containers
#
# you should define the 'DIRS' variable
#
# It will propagate 'MAKE <target>' to all
# directories listed in DIRS
#
#
# Rules
#
.PHONY: all _action $(DIRS)
.DEFAULT:
$(MAKE) ACTION=$@ _action
all::
$(MAKE) ACTION=all _action
_action: $(DIRS)
@echo "$(ACTION) done"
$(DIRS):
$(MAKE) -C $@ $(ACTION)

View File

@ -13,7 +13,7 @@
# ---------------------------------------------------------------
#
include ../config/auto.conf
include ../../../../config/auto.conf
PROGS = kimono kimfit
@ -29,8 +29,8 @@ OSRC = fasta_io.c \
kim_genetic.c \
kim_codonskew.c
include ../config/targets/lxbin.targ
include ../config/targets/help.targ
include ../../../../config/targets/lxbin.targ
include ../../../../config/targets/help.targ
#
# file dependencies

View File

@ -16,7 +16,7 @@
# ---------------------------------------------------------------
#
include ../config/targets/empty.targ
include ../../../../config/targets/empty.targ
clean::
-\rm -f *.tst

30
src/muscle/Makefile Executable file
View File

@ -0,0 +1,30 @@
#---------------------------------------------------------------
# $Id: $
# ---------------------------------------------------------------
# @file: Makefile
# @desc: makefile for lxpack
#
# @history:
# @history:
# @+ <Gloup> : Apr 97 : Created
# @+ <Gloup> : Mar 02 : Updated for LXxware
#
# @note: should be processed with gnu compatible make
# @note: helixware_compatible
#
# @end:
# ---------------------------------------------------------------
#
include ../../config/auto.conf
DIRS = muscle3.8.31
include ../../config/targets/propagate.targ
include ../../config/targets/help.targ
all::
$(MAKE) ACTION=$@ _action
clean::
$(MAKE) -C lxpack portclean

View File

@ -0,0 +1,30 @@
#---------------------------------------------------------------
# $Id: $
# ---------------------------------------------------------------
# @file: Makefile
# @desc: makefile for lxpack
#
# @history:
# @history:
# @+ <Gloup> : Apr 97 : Created
# @+ <Gloup> : Mar 02 : Updated for LXxware
#
# @note: should be processed with gnu compatible make
# @note: helixware_compatible
#
# @end:
# ---------------------------------------------------------------
#
include ../../../config/auto.conf
DIRS = src
include ../../../config/targets/propagate.targ
include ../../../config/targets/help.targ
all::
$(MAKE) ACTION=$@ _action
clean::
$(MAKE) -C lxpack portclean

View File

@ -0,0 +1,11 @@
include ../../../../config/auto.conf
all: muscle install
muscle:
chmod +x ./mk
(export CXX=$(CXX) && ./mk)
install:
cp muscle $(BINDIR)

View File

@ -0,0 +1,27 @@
MUSCLE v3.0 source code README
------------------------------
http://www.drive5.com/muscle
This version of MUSCLE was built and tested on two platforms:
Windows XP and Red Hat Linux 8.0.
On Windows, I used Microsoft Visual C++ .Net, which I find
to be the best C++ compile / edit / test environment I've
tried on any platform. The Microsoft project file is
muscle.vcproj.
The Linux make file is Makefile. This is a very simple-minded
make file (because I am a Linux development novice), so should
be easy to understand. By default, it uses shared libraries,
but I found this to give problems when copying between
different Linux versions. The fix was to use the linker
flag -lm static (commented out), which gives a much bigger
but more portable binary. The posted binary was linked with
static libraries.
The source code was not written to be maintained by anyone
but me, so the usual apologies and caveats apply.
Bob Edgar,
January 2004

View File

@ -0,0 +1,802 @@
#include "muscle.h"
#include "msa.h"
#include "pwpath.h"
#include "profile.h"
#define TRACE 0
static void LogPP(const ProfPos &PP)
{
Log("ResidueGroup %u\n", PP.m_uResidueGroup);
Log("AllGaps %d\n", PP.m_bAllGaps);
Log("Occ %.3g\n", PP.m_fOcc);
Log("LL=%.3g LG=%.3g GL=%.3g GG=%.3g\n", PP.m_LL, PP.m_LG, PP.m_GL, PP.m_GG);
Log("Freqs ");
for (unsigned i = 0; i < 20; ++i)
if (PP.m_fcCounts[i] > 0)
Log("%c=%.3g ", LetterToChar(i), PP.m_fcCounts[i]);
Log("\n");
}
static void AssertProfPosEq(const ProfPos *PA, const ProfPos *PB, unsigned i)
{
const ProfPos &PPA = PA[i];
const ProfPos &PPB = PB[i];
#define eq(x) if (PPA.m_##x != PPB.m_##x) { LogPP(PPA); LogPP(PPB); Quit("AssertProfPosEq." #x); }
#define be(x) if (!BTEq(PPA.m_##x, PPB.m_##x)) { LogPP(PPA); LogPP(PPB); Quit("AssertProfPosEq." #x); }
eq(bAllGaps)
eq(uResidueGroup)
be(LL)
be(LG)
be(GL)
be(GG)
be(fOcc)
be(scoreGapOpen)
be(scoreGapClose)
for (unsigned j = 0; j < 20; ++j)
{
#define eqj(x) if (PPA.m_##x != PPB.m_##x) Quit("AssertProfPosEq j=%u " #x, j);
#define bej(x) if (!BTEq(PPA.m_##x, PPB.m_##x)) Quit("AssertProfPosEq j=%u " #x, j);
bej(fcCounts[j]);
// eqj(uSortOrder[j]) // may differ due to ties, don't check?
bej(AAScores[j])
#undef eqj
#undef bej
}
#undef eq
#undef be
}
void AssertProfsEq(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
unsigned uLengthB)
{
if (uLengthA != uLengthB)
Quit("AssertProfsEq: lengths differ %u %u", uLengthA, uLengthB);
for (unsigned i = 0; i < uLengthB; ++i)
AssertProfPosEq(PA, PB, i);
}
#if DEBUG
static void ValidateProf(const ProfPos *Prof, unsigned uLength)
{
for (unsigned i = 0; i < uLength; ++i)
{
const ProfPos &PP = Prof[i];
FCOUNT s1 = PP.m_LL + PP.m_LG + PP.m_GL + PP.m_GG;
assert(BTEq(s1, 1.0));
if (i > 0)
{
const ProfPos &PPPrev = Prof[i-1];
FCOUNT s2 = PPPrev.m_LL + PPPrev.m_GL;
FCOUNT s3 = PP.m_LL + PP.m_LG;
assert(BTEq(s2, s3));
}
if (i < uLength - 1)
{
const ProfPos &PPNext = Prof[i+1];
FCOUNT s4 = PP.m_LL + PP.m_GL;
FCOUNT s5 = PPNext.m_LL + PPNext.m_LG;
assert(BTEq(s4, s5));
}
}
}
#else
#define ValidateProf(Prof, Length) /* empty */
#endif
static void ScoresFromFreqsPos(ProfPos *Prof, unsigned uLength, unsigned uPos)
{
ProfPos &PP = Prof[uPos];
SortCounts(PP.m_fcCounts, PP.m_uSortOrder);
PP.m_uResidueGroup = ResidueGroupFromFCounts(PP.m_fcCounts);
// "Occupancy"
PP.m_fOcc = PP.m_LL + PP.m_GL;
// Frequency of gap-opens in this position (i)
// Gap open = letter in i-1 and gap in i
// = iff LG in i
FCOUNT fcOpen = PP.m_LG;
// Frequency of gap-closes in this position
// Gap close = gap in i and letter in i+1
// = iff GL in i+1
FCOUNT fcClose;
if (uPos + 1 < uLength)
fcClose = Prof[uPos + 1].m_GL;
else
fcClose = PP.m_GG + PP.m_LG;
PP.m_scoreGapOpen = (SCORE) ((1.0 - fcOpen)*g_scoreGapOpen/2.0);
PP.m_scoreGapClose = (SCORE) ((1.0 - fcClose)*g_scoreGapOpen/2.0);
#if DOUBLE_AFFINE
PP.m_scoreGapOpen2 = (SCORE) ((1.0 - fcOpen)*g_scoreGapOpen2/2.0);
PP.m_scoreGapClose2 = (SCORE) ((1.0 - fcClose)*g_scoreGapOpen2/2.0);
#endif
for (unsigned i = 0; i < g_AlphaSize; ++i)
{
SCORE scoreSum = 0;
for (unsigned j = 0; j < g_AlphaSize; ++j)
scoreSum += PP.m_fcCounts[j]*(*g_ptrScoreMatrix)[i][j];
PP.m_AAScores[i] = scoreSum;
}
}
void ProfScoresFromFreqs(ProfPos *Prof, unsigned uLength)
{
for (unsigned i = 0; i < uLength; ++i)
ScoresFromFreqsPos(Prof, uLength, i);
}
static void AppendDelete(const MSA &msaA, unsigned &uColIndexA,
unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined,
unsigned &uColIndexCombined)
{
#if TRACE
Log("AppendDelete ColIxA=%u ColIxCmb=%u\n",
uColIndexA, uColIndexCombined);
#endif
for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
{
char c = msaA.GetChar(uSeqIndexA, uColIndexA);
msaCombined.SetChar(uSeqIndexA, uColIndexCombined, c);
}
for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, '-');
++uColIndexCombined;
++uColIndexA;
}
static void AppendInsert(const MSA &msaB, unsigned &uColIndexB,
unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined,
unsigned &uColIndexCombined)
{
#if TRACE
Log("AppendInsert ColIxB=%u ColIxCmb=%u\n",
uColIndexB, uColIndexCombined);
#endif
for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
msaCombined.SetChar(uSeqIndexA, uColIndexCombined, '-');
for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
{
char c = msaB.GetChar(uSeqIndexB, uColIndexB);
msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, c);
}
++uColIndexCombined;
++uColIndexB;
}
static void AppendTplInserts(const MSA &msaA, unsigned &uColIndexA, unsigned uColCountA,
const MSA &msaB, unsigned &uColIndexB, unsigned uColCountB, unsigned uSeqCountA,
unsigned uSeqCountB, MSA &msaCombined, unsigned &uColIndexCombined)
{
#if TRACE
Log("AppendTplInserts ColIxA=%u ColIxB=%u ColIxCmb=%u\n",
uColIndexA, uColIndexB, uColIndexCombined);
#endif
const unsigned uLengthA = msaA.GetColCount();
const unsigned uLengthB = msaB.GetColCount();
unsigned uNewColCount = uColCountA;
if (uColCountB > uNewColCount)
uNewColCount = uColCountB;
for (unsigned n = 0; n < uColCountA; ++n)
{
for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
{
char c = msaA.GetChar(uSeqIndexA, uColIndexA + n);
c = UnalignChar(c);
msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, c);
}
}
for (unsigned n = uColCountA; n < uNewColCount; ++n)
{
for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, '.');
}
for (unsigned n = 0; n < uColCountB; ++n)
{
for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
{
char c = msaB.GetChar(uSeqIndexB, uColIndexB + n);
c = UnalignChar(c);
msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, c);
}
}
for (unsigned n = uColCountB; n < uNewColCount; ++n)
{
for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, '.');
}
uColIndexCombined += uNewColCount;
uColIndexA += uColCountA;
uColIndexB += uColCountB;
}
static void AppendMatch(const MSA &msaA, unsigned &uColIndexA, const MSA &msaB,
unsigned &uColIndexB, unsigned uSeqCountA, unsigned uSeqCountB,
MSA &msaCombined, unsigned &uColIndexCombined)
{
#if TRACE
Log("AppendMatch ColIxA=%u ColIxB=%u ColIxCmb=%u\n",
uColIndexA, uColIndexB, uColIndexCombined);
#endif
for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
{
char c = msaA.GetChar(uSeqIndexA, uColIndexA);
msaCombined.SetChar(uSeqIndexA, uColIndexCombined, c);
}
for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
{
char c = msaB.GetChar(uSeqIndexB, uColIndexB);
msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, c);
}
++uColIndexA;
++uColIndexB;
++uColIndexCombined;
}
void AlignTwoMSAsGivenPath(const PWPath &Path, const MSA &msaA, const MSA &msaB,
MSA &msaCombined)
{
msaCombined.Clear();
#if TRACE
Log("FastAlignProfiles\n");
Log("Template A:\n");
msaA.LogMe();
Log("Template B:\n");
msaB.LogMe();
#endif
const unsigned uColCountA = msaA.GetColCount();
const unsigned uColCountB = msaB.GetColCount();
const unsigned uSeqCountA = msaA.GetSeqCount();
const unsigned uSeqCountB = msaB.GetSeqCount();
msaCombined.SetSeqCount(uSeqCountA + uSeqCountB);
// Copy sequence names into combined MSA
for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
{
msaCombined.SetSeqName(uSeqIndexA, msaA.GetSeqName(uSeqIndexA));
msaCombined.SetSeqId(uSeqIndexA, msaA.GetSeqId(uSeqIndexA));
}
for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
{
msaCombined.SetSeqName(uSeqCountA + uSeqIndexB, msaB.GetSeqName(uSeqIndexB));
msaCombined.SetSeqId(uSeqCountA + uSeqIndexB, msaB.GetSeqId(uSeqIndexB));
}
unsigned uColIndexA = 0;
unsigned uColIndexB = 0;
unsigned uColIndexCombined = 0;
const unsigned uEdgeCount = Path.GetEdgeCount();
for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex)
{
const PWEdge &Edge = Path.GetEdge(uEdgeIndex);
#if TRACE
Log("\nEdge %u %c%u.%u\n",
uEdgeIndex,
Edge.cType,
Edge.uPrefixLengthA,
Edge.uPrefixLengthB);
#endif
const char cType = Edge.cType;
const unsigned uPrefixLengthA = Edge.uPrefixLengthA;
unsigned uColCountA = 0;
if (uPrefixLengthA > 0)
{
const unsigned uNodeIndexA = uPrefixLengthA - 1;
const unsigned uTplColIndexA = uNodeIndexA;
if (uTplColIndexA > uColIndexA)
uColCountA = uTplColIndexA - uColIndexA;
}
const unsigned uPrefixLengthB = Edge.uPrefixLengthB;
unsigned uColCountB = 0;
if (uPrefixLengthB > 0)
{
const unsigned uNodeIndexB = uPrefixLengthB - 1;
const unsigned uTplColIndexB = uNodeIndexB;
if (uTplColIndexB > uColIndexB)
uColCountB = uTplColIndexB - uColIndexB;
}
// TODO: This code looks like a hangover from HMM estimation -- can we delete it?
assert(uColCountA == 0);
assert(uColCountB == 0);
AppendTplInserts(msaA, uColIndexA, uColCountA, msaB, uColIndexB, uColCountB,
uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined);
switch (cType)
{
case 'M':
{
assert(uPrefixLengthA > 0);
assert(uPrefixLengthB > 0);
const unsigned uColA = uPrefixLengthA - 1;
const unsigned uColB = uPrefixLengthB - 1;
assert(uColIndexA == uColA);
assert(uColIndexB == uColB);
AppendMatch(msaA, uColIndexA, msaB, uColIndexB, uSeqCountA, uSeqCountB,
msaCombined, uColIndexCombined);
break;
}
case 'D':
{
assert(uPrefixLengthA > 0);
const unsigned uColA = uPrefixLengthA - 1;
assert(uColIndexA == uColA);
AppendDelete(msaA, uColIndexA, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined);
break;
}
case 'I':
{
assert(uPrefixLengthB > 0);
const unsigned uColB = uPrefixLengthB - 1;
assert(uColIndexB == uColB);
AppendInsert(msaB, uColIndexB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined);
break;
}
default:
assert(false);
}
}
unsigned uInsertColCountA = uColCountA - uColIndexA;
unsigned uInsertColCountB = uColCountB - uColIndexB;
// TODO: This code looks like a hangover from HMM estimation -- can we delete it?
assert(uInsertColCountA == 0);
assert(uInsertColCountB == 0);
AppendTplInserts(msaA, uColIndexA, uInsertColCountA, msaB, uColIndexB,
uInsertColCountB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined);
assert(msaCombined.GetColCount() == uEdgeCount);
}
static const ProfPos PPStart =
{
false, //m_bAllGaps;
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // m_uSortOrder[21];
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // m_fcCounts[20];
1.0, // m_LL;
0.0, // m_LG;
0.0, // m_GL;
0.0, // m_GG;
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // m_ALScores
0, // m_uResidueGroup;
1.0, // m_fOcc;
0.0, // m_fcStartOcc;
0.0, // m_fcEndOcc;
0.0, // m_scoreGapOpen;
0.0, // m_scoreGapClose;
};
// MM
// Ai<41>1 Ai Out
// X X LL LL
// X - LG LG
// - X GL GL
// - - GG GG
//
// Bj<42>1 Bj
// X X LL LL
// X - LG LG
// - X GL GL
// - - GG GG
static void SetGapsMM(
const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA,
const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB,
ProfPos *POut, unsigned uColIndexOut)
{
const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart;
const ProfPos &PPB = uPrefixLengthB > 0 ? PB[uPrefixLengthB-1] : PPStart;
ProfPos &PPO = POut[uColIndexOut];
PPO.m_LL = wA*PPA.m_LL + wB*PPB.m_LL;
PPO.m_LG = wA*PPA.m_LG + wB*PPB.m_LG;
PPO.m_GL = wA*PPA.m_GL + wB*PPB.m_GL;
PPO.m_GG = wA*PPA.m_GG + wB*PPB.m_GG;
}
// MD
// Ai<41>1 Ai Out
// X X LL LL
// X - LG LG
// - X GL GL
// - - GG GG
//
// Bj (-)
// X - ?L LG
// - - ?G GG
static void SetGapsMD(
const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA,
const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB,
ProfPos *POut, unsigned uColIndexOut)
{
const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart;
const ProfPos &PPB = uPrefixLengthB > 0 ? PB[uPrefixLengthB-1] : PPStart;
ProfPos &PPO = POut[uColIndexOut];
PPO.m_LL = wA*PPA.m_LL;
PPO.m_LG = wA*PPA.m_LG + wB*(PPB.m_LL + PPB.m_GL);
PPO.m_GL = wA*PPA.m_GL;
PPO.m_GG = wA*PPA.m_GG + wB*(PPB.m_LG + PPB.m_GG);
}
// DD
// Ai<41>1 Ai Out
// X X LL LL
// X - LG LG
// - X GL GL
// - - GG GG
//
// (-) (-)
// - - ?? GG
static void SetGapsDD(
const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA,
const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB,
ProfPos *POut, unsigned uColIndexOut)
{
const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart;
ProfPos &PPO = POut[uColIndexOut];
PPO.m_LL = wA*PPA.m_LL;
PPO.m_LG = wA*PPA.m_LG;
PPO.m_GL = wA*PPA.m_GL;
PPO.m_GG = wA*PPA.m_GG + wB;
}
// MI
// Ai (-) Out
// X - ?L LG
// - - ?G GG
// Bj<42>1 Bj
// X X LL LL
// X - LG LG
// - X GL GL
// - - GG GG
static void SetGapsMI(
const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA,
const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB,
ProfPos *POut, unsigned uColIndexOut)
{
const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart;
const ProfPos &PPB = uPrefixLengthB > 0 ? PB[uPrefixLengthB-1] : PPStart;
ProfPos &PPO = POut[uColIndexOut];
PPO.m_LL = wB*PPB.m_LL;
PPO.m_LG = wB*PPB.m_LG + wA*(PPA.m_LL + PPA.m_GL);
PPO.m_GL = wB*PPB.m_GL;
PPO.m_GG = wB*PPB.m_GG + wA*(PPA.m_LG + PPA.m_GG);
}
// DM
// Ai<41>1 Ai Out
// X X LL LL
// X - LG LG
// - X GL GL
// - - GG GG
//
// (-) Bj
// - X ?L GL
// - - ?G GG
static void SetGapsDM(
const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA,
const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB,
ProfPos *POut, unsigned uColIndexOut)
{
const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart;
const ProfPos &PPB = uPrefixLengthB > 0 ? PB[uPrefixLengthB-1] : PPStart;
ProfPos &PPO = POut[uColIndexOut];
PPO.m_LL = wA*PPA.m_LL;
PPO.m_LG = wA*PPA.m_LG;
PPO.m_GL = wA*PPA.m_GL + wB*(PPB.m_LL + PPB.m_GL);
PPO.m_GG = wA*PPA.m_GG + wB*(PPB.m_LG + PPB.m_GG);
}
// IM
// (-) Ai Out
// - X ?L GL
// - - ?G GG
// Bj<42>1 Bj
// X X LL LL
// X - LG LG
// - X GL GL
// - - GG GG
static void SetGapsIM(
const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA,
const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB,
ProfPos *POut, unsigned uColIndexOut)
{
const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart;
const ProfPos &PPB = uPrefixLengthB > 0 ? PB[uPrefixLengthB-1] : PPStart;
ProfPos &PPO = POut[uColIndexOut];
PPO.m_LL = wB*PPB.m_LL;
PPO.m_LG = wB*PPB.m_LG;
PPO.m_GL = wB*PPB.m_GL + wA*(PPA.m_LL + PPA.m_GL);
PPO.m_GG = wB*PPB.m_GG + wA*(PPA.m_LG + PPA.m_GG);
}
// ID
// (-) Ai Out
// - X ?L GL
// - - ?G GG
// Bj (-)
// X - ?L LG
// - - ?G GG
static void SetGapsID(
const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA,
const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB,
ProfPos *POut, unsigned uColIndexOut)
{
const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart;
const ProfPos &PPB = uPrefixLengthB > 0 ? PB[uPrefixLengthB-1] : PPStart;
ProfPos &PPO = POut[uColIndexOut];
PPO.m_LL = 0;
PPO.m_LG = wB*PPB.m_GL + wB*PPB.m_LL;
PPO.m_GL = wA*PPA.m_GL + wA*PPA.m_LL;
PPO.m_GG = wA*(PPA.m_LG + PPA.m_GG) + wB*(PPB.m_LG + PPB.m_GG);
}
// DI
// Ai (-) Out
// X - ?L LG
// - - ?G GG
// (-) Bj
// - X ?L GL
// - - ?G GG
static void SetGapsDI(
const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA,
const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB,
ProfPos *POut, unsigned uColIndexOut)
{
const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart;
const ProfPos &PPB = uPrefixLengthB > 0 ? PB[uPrefixLengthB-1] : PPStart;
ProfPos &PPO = POut[uColIndexOut];
PPO.m_LL = 0;
PPO.m_LG = wA*PPA.m_GL + wA*PPA.m_LL;
PPO.m_GL = wB*PPB.m_GL + wB*PPB.m_LL;
PPO.m_GG = wA*(PPA.m_LG + PPA.m_GG) + wB*(PPB.m_LG + PPB.m_GG);
}
// II
// (-) (-) Out
// - - ?? GG
// Bj<42>1 Bj
// X X LL LL
// X - LG LG
// - X GL GL
// - - GG GG
static void SetGapsII(
const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA,
const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB,
ProfPos *POut, unsigned uColIndexOut)
{
const ProfPos &PPB = uPrefixLengthB > 0 ? PB[uPrefixLengthB-1] : PPStart;
ProfPos &PPO = POut[uColIndexOut];
PPO.m_LL = wB*PPB.m_LL;
PPO.m_LG = wB*PPB.m_LG;
PPO.m_GL = wB*PPB.m_GL;
PPO.m_GG = wB*PPB.m_GG + wA;
}
static void SetFreqs(
const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA,
const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB,
ProfPos *POut, unsigned uColIndexOut)
{
const ProfPos &PPA = uPrefixLengthA > 0 ? PA[uPrefixLengthA-1] : PPStart;
const ProfPos &PPB = uPrefixLengthB > 0 ? PB[uPrefixLengthB-1] : PPStart;
ProfPos &PPO = POut[uColIndexOut];
if (g_bNormalizeCounts)
{
const FCOUNT fA = PPA.m_fOcc*wA/(wA + wB);
const FCOUNT fB = PPB.m_fOcc*wB/(wA + wB);
FCOUNT fTotal = 0;
for (unsigned i = 0; i < 20; ++i)
{
const FCOUNT f = fA*PPA.m_fcCounts[i] + fB*PPB.m_fcCounts[i];
PPO.m_fcCounts[i] = f;
fTotal += f;
}
if (fTotal > 0)
for (unsigned i = 0; i < 20; ++i)
PPO.m_fcCounts[i] /= fTotal;
}
else
{
for (unsigned i = 0; i < 20; ++i)
PPO.m_fcCounts[i] = wA*PPA.m_fcCounts[i] + wB*PPB.m_fcCounts[i];
}
}
void AlignTwoProfsGivenPath(const PWPath &Path,
const ProfPos *PA, unsigned uPrefixLengthA, WEIGHT wA,
const ProfPos *PB, unsigned uPrefixLengthB, WEIGHT wB,
ProfPos **ptrPOut, unsigned *ptruLengthOut)
{
#if TRACE
Log("AlignTwoProfsGivenPath wA=%.3g wB=%.3g Path=\n", wA, wB);
Path.LogMe();
#endif
assert(BTEq(wA + wB, 1.0));
unsigned uColIndexA = 0;
unsigned uColIndexB = 0;
unsigned uColIndexOut = 0;
const unsigned uEdgeCount = Path.GetEdgeCount();
ProfPos *POut = new ProfPos[uEdgeCount];
char cPrevType = 'M';
for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex)
{
const PWEdge &Edge = Path.GetEdge(uEdgeIndex);
const char cType = Edge.cType;
const unsigned uPrefixLengthA = Edge.uPrefixLengthA;
const unsigned uPrefixLengthB = Edge.uPrefixLengthB;
#if TRACE
Log("\nEdge %u %c%u.%u ColA=%u ColB=%u\n",
uEdgeIndex,
Edge.cType,
Edge.uPrefixLengthA,
Edge.uPrefixLengthB,
uColIndexA,
uColIndexB);
#endif
POut[uColIndexOut].m_bAllGaps = false;
switch (cType)
{
case 'M':
{
assert(uPrefixLengthA > 0);
assert(uPrefixLengthB > 0);
SetFreqs(
PA, uPrefixLengthA, wA,
PB, uPrefixLengthB, wB,
POut, uColIndexOut);
switch (cPrevType)
{
case 'M':
SetGapsMM(
PA, uPrefixLengthA, wA,
PB, uPrefixLengthB, wB,
POut, uColIndexOut);
break;
case 'D':
SetGapsDM(
PA, uPrefixLengthA, wA,
PB, uPrefixLengthB, wB,
POut, uColIndexOut);
break;
case 'I':
SetGapsIM(
PA, uPrefixLengthA, wA,
PB, uPrefixLengthB, wB,
POut, uColIndexOut);
break;
default:
Quit("Bad cPrevType");
}
++uColIndexA;
++uColIndexB;
++uColIndexOut;
break;
}
case 'D':
{
assert(uPrefixLengthA > 0);
SetFreqs(
PA, uPrefixLengthA, wA,
PB, uPrefixLengthB, 0,
POut, uColIndexOut);
switch (cPrevType)
{
case 'M':
SetGapsMD(
PA, uPrefixLengthA, wA,
PB, uPrefixLengthB, wB,
POut, uColIndexOut);
break;
case 'D':
SetGapsDD(
PA, uPrefixLengthA, wA,
PB, uPrefixLengthB, wB,
POut, uColIndexOut);
break;
case 'I':
SetGapsID(
PA, uPrefixLengthA, wA,
PB, uPrefixLengthB, wB,
POut, uColIndexOut);
break;
default:
Quit("Bad cPrevType");
}
++uColIndexA;
++uColIndexOut;
break;
}
case 'I':
{
assert(uPrefixLengthB > 0);
SetFreqs(
PA, uPrefixLengthA, 0,
PB, uPrefixLengthB, wB,
POut, uColIndexOut);
switch (cPrevType)
{
case 'M':
SetGapsMI(
PA, uPrefixLengthA, wA,
PB, uPrefixLengthB, wB,
POut, uColIndexOut);
break;
case 'D':
SetGapsDI(
PA, uPrefixLengthA, wA,
PB, uPrefixLengthB, wB,
POut, uColIndexOut);
break;
case 'I':
SetGapsII(
PA, uPrefixLengthA, wA,
PB, uPrefixLengthB, wB,
POut, uColIndexOut);
break;
default:
Quit("Bad cPrevType");
}
++uColIndexB;
++uColIndexOut;
break;
}
default:
assert(false);
}
cPrevType = cType;
}
assert(uColIndexOut == uEdgeCount);
ProfScoresFromFreqs(POut, uEdgeCount);
ValidateProf(POut, uEdgeCount);
*ptrPOut = POut;
*ptruLengthOut = uEdgeCount;
#if TRACE
Log("AlignTwoProfsGivenPath:\n");
ListProfile(POut, uEdgeCount, 0);
#endif
}

View File

@ -0,0 +1,237 @@
#include "muscle.h"
#include "msa.h"
#include "pwpath.h"
#include "profile.h"
#define TRACE 0
static void AppendDelete(const MSA &msaA, unsigned &uColIndexA,
unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined,
unsigned &uColIndexCombined)
{
#if TRACE
Log("AppendDelete ColIxA=%u ColIxCmb=%u\n",
uColIndexA, uColIndexCombined);
#endif
for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
{
char c = msaA.GetChar(uSeqIndexA, uColIndexA);
msaCombined.SetChar(uSeqIndexA, uColIndexCombined, c);
}
for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, '-');
++uColIndexCombined;
++uColIndexA;
}
static void AppendInsert(const MSA &msaB, unsigned &uColIndexB,
unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined,
unsigned &uColIndexCombined)
{
#if TRACE
Log("AppendInsert ColIxB=%u ColIxCmb=%u\n",
uColIndexB, uColIndexCombined);
#endif
for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
msaCombined.SetChar(uSeqIndexA, uColIndexCombined, '-');
for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
{
char c = msaB.GetChar(uSeqIndexB, uColIndexB);
msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, c);
}
++uColIndexCombined;
++uColIndexB;
}
static void AppendUnalignedTerminals(const MSA &msaA, unsigned &uColIndexA, unsigned uColCountA,
const MSA &msaB, unsigned &uColIndexB, unsigned uColCountB, unsigned uSeqCountA,
unsigned uSeqCountB, MSA &msaCombined, unsigned &uColIndexCombined)
{
#if TRACE
Log("AppendUnalignedTerminals ColIxA=%u ColIxB=%u ColIxCmb=%u\n",
uColIndexA, uColIndexB, uColIndexCombined);
#endif
const unsigned uLengthA = msaA.GetColCount();
const unsigned uLengthB = msaB.GetColCount();
unsigned uNewColCount = uColCountA;
if (uColCountB > uNewColCount)
uNewColCount = uColCountB;
for (unsigned n = 0; n < uColCountA; ++n)
{
for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
{
char c = msaA.GetChar(uSeqIndexA, uColIndexA + n);
c = UnalignChar(c);
msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, c);
}
}
for (unsigned n = uColCountA; n < uNewColCount; ++n)
{
for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, '.');
}
for (unsigned n = 0; n < uColCountB; ++n)
{
for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
{
char c = msaB.GetChar(uSeqIndexB, uColIndexB + n);
c = UnalignChar(c);
msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, c);
}
}
for (unsigned n = uColCountB; n < uNewColCount; ++n)
{
for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, '.');
}
uColIndexCombined += uNewColCount;
uColIndexA += uColCountA;
uColIndexB += uColCountB;
}
static void AppendMatch(const MSA &msaA, unsigned &uColIndexA, const MSA &msaB,
unsigned &uColIndexB, unsigned uSeqCountA, unsigned uSeqCountB,
MSA &msaCombined, unsigned &uColIndexCombined)
{
#if TRACE
Log("AppendMatch ColIxA=%u ColIxB=%u ColIxCmb=%u\n",
uColIndexA, uColIndexB, uColIndexCombined);
#endif
for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
{
char c = msaA.GetChar(uSeqIndexA, uColIndexA);
msaCombined.SetChar(uSeqIndexA, uColIndexCombined, c);
}
for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
{
char c = msaB.GetChar(uSeqIndexB, uColIndexB);
msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined, c);
}
++uColIndexA;
++uColIndexB;
++uColIndexCombined;
}
void AlignTwoMSAsGivenPathSW(const PWPath &Path, const MSA &msaA, const MSA &msaB,
MSA &msaCombined)
{
msaCombined.Clear();
#if TRACE
Log("AlignTwoMSAsGivenPathSW\n");
Log("Template A:\n");
msaA.LogMe();
Log("Template B:\n");
msaB.LogMe();
#endif
const unsigned uColCountA = msaA.GetColCount();
const unsigned uColCountB = msaB.GetColCount();
const unsigned uSeqCountA = msaA.GetSeqCount();
const unsigned uSeqCountB = msaB.GetSeqCount();
msaCombined.SetSeqCount(uSeqCountA + uSeqCountB);
// Copy sequence names into combined MSA
for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA)
{
msaCombined.SetSeqName(uSeqIndexA, msaA.GetSeqName(uSeqIndexA));
msaCombined.SetSeqId(uSeqIndexA, msaA.GetSeqId(uSeqIndexA));
}
for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB)
{
msaCombined.SetSeqName(uSeqCountA + uSeqIndexB, msaB.GetSeqName(uSeqIndexB));
msaCombined.SetSeqId(uSeqCountA + uSeqIndexB, msaB.GetSeqId(uSeqIndexB));
}
unsigned uColIndexA = 0;
unsigned uColIndexB = 0;
unsigned uColIndexCombined = 0;
const unsigned uEdgeCount = Path.GetEdgeCount();
for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex)
{
const PWEdge &Edge = Path.GetEdge(uEdgeIndex);
#if TRACE
Log("\nEdge %u %c%u.%u\n",
uEdgeIndex,
Edge.cType,
Edge.uPrefixLengthA,
Edge.uPrefixLengthB);
#endif
const char cType = Edge.cType;
const unsigned uPrefixLengthA = Edge.uPrefixLengthA;
unsigned uColCountA = 0;
if (uPrefixLengthA > 0)
{
const unsigned uNodeIndexA = uPrefixLengthA - 1;
const unsigned uTplColIndexA = uNodeIndexA;
if (uTplColIndexA > uColIndexA)
uColCountA = uTplColIndexA - uColIndexA;
}
const unsigned uPrefixLengthB = Edge.uPrefixLengthB;
unsigned uColCountB = 0;
if (uPrefixLengthB > 0)
{
const unsigned uNodeIndexB = uPrefixLengthB - 1;
const unsigned uTplColIndexB = uNodeIndexB;
if (uTplColIndexB > uColIndexB)
uColCountB = uTplColIndexB - uColIndexB;
}
AppendUnalignedTerminals(msaA, uColIndexA, uColCountA, msaB, uColIndexB, uColCountB,
uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined);
switch (cType)
{
case 'M':
{
assert(uPrefixLengthA > 0);
assert(uPrefixLengthB > 0);
const unsigned uColA = uPrefixLengthA - 1;
const unsigned uColB = uPrefixLengthB - 1;
assert(uColIndexA == uColA);
assert(uColIndexB == uColB);
AppendMatch(msaA, uColIndexA, msaB, uColIndexB, uSeqCountA, uSeqCountB,
msaCombined, uColIndexCombined);
break;
}
case 'D':
{
assert(uPrefixLengthA > 0);
const unsigned uColA = uPrefixLengthA - 1;
assert(uColIndexA == uColA);
AppendDelete(msaA, uColIndexA, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined);
break;
}
case 'I':
{
assert(uPrefixLengthB > 0);
const unsigned uColB = uPrefixLengthB - 1;
assert(uColIndexB == uColB);
AppendInsert(msaB, uColIndexB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined);
break;
}
default:
assert(false);
}
}
unsigned uInsertColCountA = uColCountA - uColIndexA;
unsigned uInsertColCountB = uColCountB - uColIndexB;
AppendUnalignedTerminals(msaA, uColIndexA, uInsertColCountA, msaB, uColIndexB,
uInsertColCountB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined);
}

View File

@ -0,0 +1,41 @@
#include "muscle.h"
#include "msa.h"
#include "profile.h"
#include "pwpath.h"
#include "textfile.h"
#include "timing.h"
SCORE AlignTwoMSAs(const MSA &msa1, const MSA &msa2, MSA &msaOut, PWPath &Path,
bool bLockLeft, bool bLockRight)
{
const unsigned uLengthA = msa1.GetColCount();
const unsigned uLengthB = msa2.GetColCount();
ProfPos *PA = ProfileFromMSA(msa1);
ProfPos *PB = ProfileFromMSA(msa2);
if (bLockLeft)
{
PA[0].m_scoreGapOpen = MINUS_INFINITY;
PB[0].m_scoreGapOpen = MINUS_INFINITY;
}
if (bLockRight)
{
PA[uLengthA-1].m_scoreGapClose = MINUS_INFINITY;
PB[uLengthB-1].m_scoreGapClose = MINUS_INFINITY;
}
float r = (float) uLengthA/ (float) (uLengthB + 1); // +1 to prevent div 0
if (r < 1)
r = 1/r;
SCORE Score = GlobalAlign(PA, uLengthA, PB, uLengthB, Path);
AlignTwoMSAsGivenPath(Path, msa1, msa2, msaOut);
delete[] PA;
delete[] PB;
return Score;
}

View File

@ -0,0 +1,31 @@
#include "muscle.h"
#include "msa.h"
#include "profile.h"
#include "pwpath.h"
SCORE GlobalAlign4(ProfPos *PA, unsigned uLengthA, ProfPos *PB,
unsigned uLengthB, PWPath &Path);
SCORE AlignTwoProfs(
const ProfPos *PA, unsigned uLengthA, WEIGHT wA,
const ProfPos *PB, unsigned uLengthB, WEIGHT wB,
PWPath &Path, ProfPos **ptrPout, unsigned *ptruLengthOut)
{
assert(uLengthA < 100000);
assert(uLengthB < 100000);
float r = (float) uLengthA/ (float) (uLengthB + 1); // +1 to prevent div 0
if (r < 1)
r = 1/r;
SCORE Score = GlobalAlign(PA, uLengthA, PB, uLengthB, Path);
AlignTwoProfsGivenPath(Path, PA, uLengthB, wA/(wA + wB), PB, uLengthB, wB/(wA + wB),
ptrPout, ptruLengthOut);
#if HYDRO
if (ALPHA_Amino == g_Alpha)
Hydro(*ptrPout, *ptruLengthOut);
#endif
return Score;
}

View File

@ -0,0 +1,170 @@
#include "muscle.h"
#include <stdio.h>
#include <ctype.h>
#include "msa.h"
#include "textfile.h"
const unsigned uCharsPerLine = 60;
const int MIN_NAME = 10;
const int MAX_NAME = 32;
static char GetAlnConsensusChar(const MSA &a, unsigned uColIndex);
void MSA::ToAlnFile(TextFile &File) const
{
if (g_bClwStrict)
File.PutString("CLUSTAL W (1.81) multiple sequence alignment\n");
else
{
File.PutString("MUSCLE ("
SHORT_VERSION ")"
" multiple sequence alignment\n");
File.PutString("\n");
}
int iLongestNameLength = 0;
for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)
{
const char *ptrName = GetSeqName(uSeqIndex);
const char *ptrBlank = strchr(ptrName, ' ');
int iLength;
if (0 != ptrBlank)
iLength = (int) (ptrBlank - ptrName);
else
iLength = (int) strlen(ptrName);
if (iLength > iLongestNameLength)
iLongestNameLength = iLength;
}
if (iLongestNameLength > MAX_NAME)
iLongestNameLength = MAX_NAME;
if (iLongestNameLength < MIN_NAME)
iLongestNameLength = MIN_NAME;
unsigned uLineCount = (GetColCount() - 1)/uCharsPerLine + 1;
for (unsigned uLineIndex = 0; uLineIndex < uLineCount; ++uLineIndex)
{
File.PutString("\n");
unsigned uStartColIndex = uLineIndex*uCharsPerLine;
unsigned uEndColIndex = uStartColIndex + uCharsPerLine - 1;
if (uEndColIndex >= GetColCount())
uEndColIndex = GetColCount() - 1;
char Name[MAX_NAME+1];
for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)
{
const char *ptrName = GetSeqName(uSeqIndex);
const char *ptrBlank = strchr(ptrName, ' ');
int iLength;
if (0 != ptrBlank)
iLength = (int) (ptrBlank - ptrName);
else
iLength = (int) strlen(ptrName);
if (iLength > MAX_NAME)
iLength = MAX_NAME;
memset(Name, ' ', MAX_NAME);
memcpy(Name, ptrName, iLength);
Name[iLongestNameLength] = 0;
File.PutFormat("%s ", Name);
for (unsigned uColIndex = uStartColIndex; uColIndex <= uEndColIndex;
++uColIndex)
{
const char c = GetChar(uSeqIndex, uColIndex);
File.PutFormat("%c", toupper(c));
}
File.PutString("\n");
}
memset(Name, ' ', MAX_NAME);
Name[iLongestNameLength] = 0;
File.PutFormat("%s ", Name);
for (unsigned uColIndex = uStartColIndex; uColIndex <= uEndColIndex;
++uColIndex)
{
const char c = GetAlnConsensusChar(*this, uColIndex);
File.PutChar(c);
}
File.PutString("\n");
}
}
static char GetAlnConsensusChar(const MSA &a, unsigned uColIndex)
{
const unsigned uSeqCount = a.GetSeqCount();
unsigned BitMap = 0;
unsigned Count = 0;
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
{
unsigned uLetter = a.GetLetterEx(uSeqIndex, uColIndex);
assert(uLetter < 32);
unsigned Bit = (1 << uLetter);
if (!(BitMap & Bit))
++Count;
BitMap |= Bit;
}
// '*' indicates positions which have a single, fully conserved residue
if (1 == Count)
return '*';
if (ALPHA_Amino != g_Alpha)
return ' ';
#define B(a) (1 << AX_##a)
#define S2(a, b) S(B(a) | B(b))
#define S3(a, b, c) S(B(a) | B(b) | B(c))
#define S4(a, b, c, d) S(B(a) | B(b) | B(c) | B(d))
#define S(w) if (0 == (BitMap & ~(w)) && (BitMap & (w)) != 0) return ':';
#define W3(a, b, c) W(B(a) | B(b) | B(c))
#define W4(a, b, c, d) W(B(a) | B(b) | B(c) | B(d))
#define W5(a, b, c, d, e) W(B(a) | B(b) | B(c) | B(d) | B(e))
#define W6(a, b, c, d, e, f) W(B(a) | B(b) | B(c) | B(d) | B(e) | B(f))
#define W(w) if (0 == (BitMap & ~(w)) && (BitMap & (w)) != 0) return '.';
// ':' indicates that one of the following 'strong'
// groups is fully conserved
// STA
// NEQK
// NHQK
// NDEQ
// QHRK
// MILV
// MILF
// HY
// FYW
//
S3(S, T, A)
S4(N, E, Q, K)
S4(N, H, Q, K)
S4(N, D, E, Q)
S4(M, I, L, V)
S4(M, I, L, F)
S2(H, Y)
S3(F, Y, W)
// '.' indicates that one of the following 'weaker'
// groups is fully conserved
// CSA
// ATV
// SAG
// STNK
// STPA
// SGND
// SNDEQK
// NDEQHK
// NEQHRK
// FVLIM
// HFY
W3(C, S, A)
W3(A, T, V)
W3(S, A, G)
W4(S, T, N, K)
W4(S, T, P, A)
W4(S, G, N, D)
W6(S, N, D, E, Q, K)
W6(N, W, Q, H, R, K)
W5(F, V, L, I, M)
W3(H, F, Y)
return ' ';
}

View File

@ -0,0 +1,283 @@
#include "muscle.h"
#include <ctype.h>
/***
From Bioperl docs:
Extended DNA / RNA alphabet
------------------------------------------
Symbol Meaning Nucleic Acid
------------------------------------------
A A Adenine
C C Cytosine
G G Guanine
T T Thymine
U U Uracil
M A or C
R A or G
W A or T
S C or G
Y C or T
K G or T
V A or C or G
H A or C or T
D A or G or T
B C or G or T
X G or A or T or C
N G or A or T or C
IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE:
Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030.
***/
unsigned g_CharToLetter[MAX_CHAR];
unsigned g_CharToLetterEx[MAX_CHAR];
char g_LetterToChar[MAX_ALPHA];
char g_LetterExToChar[MAX_ALPHA_EX];
char g_UnalignChar[MAX_CHAR];
char g_AlignChar[MAX_CHAR];
bool g_IsWildcardChar[MAX_CHAR];
bool g_IsResidueChar[MAX_CHAR];
ALPHA g_Alpha = ALPHA_Undefined;
unsigned g_AlphaSize = 0;
#define Res(c, Letter) \
{ \
const unsigned char Upper = (unsigned char) toupper(c); \
const unsigned char Lower = (unsigned char) tolower(c); \
g_CharToLetter[Upper] = Letter; \
g_CharToLetter[Lower] = Letter; \
g_CharToLetterEx[Upper] = Letter; \
g_CharToLetterEx[Lower] = Letter; \
g_LetterToChar[Letter] = Upper; \
g_LetterExToChar[Letter] = Upper; \
g_IsResidueChar[Upper] = true; \
g_IsResidueChar[Lower] = true; \
g_AlignChar[Upper] = Upper; \
g_AlignChar[Lower] = Upper; \
g_UnalignChar[Upper] = Lower; \
g_UnalignChar[Lower] = Lower; \
}
#define Wild(c, Letter) \
{ \
const unsigned char Upper = (unsigned char) toupper(c); \
const unsigned char Lower = (unsigned char) tolower(c); \
g_CharToLetterEx[Upper] = Letter; \
g_CharToLetterEx[Lower] = Letter; \
g_LetterExToChar[Letter] = Upper; \
g_IsResidueChar[Upper] = true; \
g_IsResidueChar[Lower] = true; \
g_AlignChar[Upper] = Upper; \
g_AlignChar[Lower] = Upper; \
g_UnalignChar[Upper] = Lower; \
g_UnalignChar[Lower] = Lower; \
g_IsWildcardChar[Lower] = true; \
g_IsWildcardChar[Upper] = true; \
}
static unsigned GetAlphaSize(ALPHA Alpha)
{
switch (Alpha)
{
case ALPHA_Amino:
return 20;
case ALPHA_RNA:
case ALPHA_DNA:
return 4;
}
Quit("Invalid Alpha=%d", Alpha);
return 0;
}
static void InitArrays()
{
memset(g_CharToLetter, 0xff, sizeof(g_CharToLetter));
memset(g_CharToLetterEx, 0xff, sizeof(g_CharToLetterEx));
memset(g_LetterToChar, '?', sizeof(g_LetterToChar));
memset(g_LetterExToChar, '?', sizeof(g_LetterExToChar));
memset(g_AlignChar, '?', sizeof(g_UnalignChar));
memset(g_UnalignChar, '?', sizeof(g_UnalignChar));
memset(g_IsWildcardChar, 0, sizeof(g_IsWildcardChar));
}
static void SetGapChar(char c)
{
unsigned char u = (unsigned char) c;
g_CharToLetterEx[u] = AX_GAP;
g_LetterExToChar[AX_GAP] = u;
g_AlignChar[u] = u;
g_UnalignChar[u] = u;
}
static void SetAlphaDNA()
{
Res('A', NX_A)
Res('C', NX_C)
Res('G', NX_G)
Res('T', NX_T)
Wild('M', NX_M)
Wild('R', NX_R)
Wild('W', NX_W)
Wild('S', NX_S)
Wild('Y', NX_Y)
Wild('K', NX_K)
Wild('V', NX_V)
Wild('H', NX_H)
Wild('D', NX_D)
Wild('B', NX_B)
Wild('X', NX_X)
Wild('N', NX_N)
}
static void SetAlphaRNA()
{
Res('A', NX_A)
Res('C', NX_C)
Res('G', NX_G)
Res('U', NX_U)
Res('T', NX_T)
Wild('M', NX_M)
Wild('R', NX_R)
Wild('W', NX_W)
Wild('S', NX_S)
Wild('Y', NX_Y)
Wild('K', NX_K)
Wild('V', NX_V)
Wild('H', NX_H)
Wild('D', NX_D)
Wild('B', NX_B)
Wild('X', NX_X)
Wild('N', NX_N)
}
static void SetAlphaAmino()
{
Res('A', AX_A)
Res('C', AX_C)
Res('D', AX_D)
Res('E', AX_E)
Res('F', AX_F)
Res('G', AX_G)
Res('H', AX_H)
Res('I', AX_I)
Res('K', AX_K)
Res('L', AX_L)
Res('M', AX_M)
Res('N', AX_N)
Res('P', AX_P)
Res('Q', AX_Q)
Res('R', AX_R)
Res('S', AX_S)
Res('T', AX_T)
Res('V', AX_V)
Res('W', AX_W)
Res('Y', AX_Y)
Wild('B', AX_B)
Wild('X', AX_X)
Wild('Z', AX_Z)
}
void SetAlpha(ALPHA Alpha)
{
InitArrays();
SetGapChar('.');
SetGapChar('-');
switch (Alpha)
{
case ALPHA_Amino:
SetAlphaAmino();
break;
case ALPHA_DNA:
SetAlphaDNA();
case ALPHA_RNA:
SetAlphaRNA();
break;
default:
Quit("Invalid Alpha=%d", Alpha);
}
g_AlphaSize = GetAlphaSize(Alpha);
g_Alpha = Alpha;
if (g_bVerbose)
Log("Alphabet %s\n", ALPHAToStr(g_Alpha));
}
char GetWildcardChar()
{
switch (g_Alpha)
{
case ALPHA_Amino:
return 'X';
case ALPHA_DNA:
case ALPHA_RNA:
return 'N';
default:
Quit("Invalid Alpha=%d", g_Alpha);
}
return '?';
}
bool IsNucleo(char c)
{
return strchr("ACGTURYNacgturyn", c) != 0;
}
bool IsDNA(char c)
{
return strchr("AGCTNagctn", c) != 0;
}
bool IsRNA(char c)
{
return strchr("AGCUNagcun", c) != 0;
}
static char InvalidLetters[256];
static int InvalidLetterCount = 0;
void ClearInvalidLetterWarning()
{
memset(InvalidLetters, 0, 256);
}
void InvalidLetterWarning(char c, char w)
{
InvalidLetters[(unsigned char) c] = 1;
++InvalidLetterCount;
}
void ReportInvalidLetters()
{
if (0 == InvalidLetterCount)
return;
char Str[257];
memset(Str, 0, 257);
int n = 0;
for (int i = 0; i < 256; ++i)
{
if (InvalidLetters[i])
Str[n++] = (char) i;
}
Warning("Assuming %s (see -seqtype option), invalid letters found: %s",
ALPHAToStr(g_Alpha), Str);
}

View File

@ -0,0 +1,106 @@
#ifndef alpha_h
#define alpha_h
bool StrHasAmino(const char *Str);
bool StrHasGap(const char *Str);
void ClearInvalidLetterWarning();
void InvalidLetterWarning(char c, char w);
void ReportInvalidLetters();
extern unsigned g_CharToLetter[];
extern unsigned g_CharToLetterEx[];
extern char g_LetterToChar[];
extern char g_LetterExToChar[];
extern char g_UnalignChar[];
extern char g_AlignChar[];
extern bool g_IsWildcardChar[];
extern bool g_IsResidueChar[];
#define CharToLetter(c) (g_CharToLetter[(unsigned char) (c)])
#define CharToLetterEx(c) (g_CharToLetterEx[(unsigned char) (c)])
#define LetterToChar(u) (g_LetterToChar[u])
#define LetterExToChar(u) (g_LetterExToChar[u])
#define IsResidueChar(c) (g_IsResidueChar[(unsigned char) (c)])
#define IsGapChar(c) ('-' == (c) || '.' == (c))
#define IsWildcardChar(c) (g_IsWildcardChar[(unsigned char) (c)])
#define AlignChar(c) (g_AlignChar[(unsigned char) (c)])
#define UnalignChar(c) (g_UnalignChar[(unsigned char) (c)])
// AX=Amino alphabet with eXtensions (B, Z and X)
enum AX
{
AX_A,
AX_C,
AX_D,
AX_E,
AX_F,
AX_G,
AX_H,
AX_I,
AX_K,
AX_L,
AX_M,
AX_N,
AX_P,
AX_Q,
AX_R,
AX_S,
AX_T,
AX_V,
AX_W,
AX_Y,
AX_X, // Any
AX_B, // D or N
AX_Z, // E or Q
AX_GAP,
};
const unsigned AX_COUNT = AX_GAP + 1;
// NX=Nucleotide alphabet with extensions
enum NX
{
NX_A,
NX_C,
NX_G,
NX_T,
NX_U = NX_T,
NX_M, // AC
NX_R, // AG
NX_W, // AT
NX_S, // CG
NX_Y, // CT
NX_K, // GT
NX_V, // ACG
NX_H, // ACT
NX_D, // AGT
NX_B, // CGT
NX_X, // GATC
NX_N, // GATC
NX_GAP
};
const unsigned NX_COUNT = NX_GAP + 1;
const unsigned MAX_ALPHA = 20;
const unsigned MAX_ALPHA_EX = AX_COUNT;
const unsigned MAX_CHAR = 256;
extern ALPHA g_Alpha;
extern unsigned g_AlphaSize;
void SetAlpha(ALPHA Alpha);
char GetWildcardChar();
bool IsNucleo(char c);
bool IsDNA(char c);
bool IsRNA(char c);
#endif // alpha_h

View File

@ -0,0 +1,218 @@
#include "muscle.h"
#include "msa.h"
#include "objscore.h"
#define TRACE 0
static void WindowSmooth(const SCORE Score[], unsigned uCount, unsigned uWindowLength,
SCORE SmoothScore[], double dCeil)
{
#define Ceil(x) ((SCORE) ((x) > dCeil ? dCeil : (x)))
if (1 != uWindowLength%2)
Quit("WindowSmooth=%u must be odd", uWindowLength);
if (uCount <= uWindowLength)
{
for (unsigned i = 0; i < uCount; ++i)
SmoothScore[i] = 0;
return;
}
const unsigned w2 = uWindowLength/2;
for (unsigned i = 0; i < w2; ++i)
{
SmoothScore[i] = 0;
SmoothScore[uCount - i - 1] = 0;
}
SCORE scoreWindowTotal = 0;
for (unsigned i = 0; i < uWindowLength; ++i)
{
scoreWindowTotal += Ceil(Score[i]);
}
for (unsigned i = w2; ; ++i)
{
SmoothScore[i] = scoreWindowTotal/uWindowLength;
if (i == uCount - w2 - 1)
break;
scoreWindowTotal -= Ceil(Score[i - w2]);
scoreWindowTotal += Ceil(Score[i + w2 + 1]);
}
#undef Ceil
}
// Find columns that score above the given threshold.
// A range of scores is defined between the average
// and the maximum. The threshold is a fraction 0.0 .. 1.0
// within that range, where 0.0 is the average score
// and 1.0 is the maximum score.
// "Grade" is by analogy with grading on a curve.
static void FindBestColsGrade(const SCORE Score[], unsigned uCount,
double dThreshold, unsigned BestCols[], unsigned *ptruBestColCount)
{
SCORE scoreTotal = 0;
for (unsigned uIndex = 0; uIndex < uCount; ++uIndex)
scoreTotal += Score[uIndex];
const SCORE scoreAvg = scoreTotal / uCount;
SCORE scoreMax = MINUS_INFINITY;
for (unsigned uIndex = 0; uIndex < uCount; ++uIndex)
if (Score[uIndex] > scoreMax)
scoreMax = Score[uIndex];
unsigned uBestColCount = 0;
for (unsigned uIndex = 0; uIndex < uCount; ++uIndex)
{
const SCORE s = Score[uIndex];
const double dHeight = (s - scoreAvg)/(scoreMax - scoreAvg);
if (dHeight >= dThreshold)
{
BestCols[uBestColCount] = uIndex;
++uBestColCount;
}
}
*ptruBestColCount = uBestColCount;
}
// Best col only if all following criteria satisfied:
// (1) Score >= min
// (2) Smoothed score >= min
// (3) No gaps.
static void FindBestColsCombo(const MSA &msa, const SCORE Score[],
const SCORE SmoothScore[], double dMinScore, double dMinSmoothScore,
unsigned BestCols[], unsigned *ptruBestColCount)
{
const unsigned uColCount = msa.GetColCount();
unsigned uBestColCount = 0;
for (unsigned uIndex = 0; uIndex < uColCount; ++uIndex)
{
if (Score[uIndex] < dMinScore)
continue;
if (SmoothScore[uIndex] < dMinSmoothScore)
continue;
if (msa.ColumnHasGap(uIndex))
continue;
BestCols[uBestColCount] = uIndex;
++uBestColCount;
}
*ptruBestColCount = uBestColCount;
}
static void ListBestCols(const MSA &msa, const SCORE Score[], const SCORE SmoothScore[],
unsigned BestCols[], unsigned uBestColCount)
{
const unsigned uColCount = msa.GetColCount();
const unsigned uSeqCount = msa.GetSeqCount();
Log("Col ");
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
Log("%u", uSeqIndex%10);
Log(" ");
for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
{
Log("%3u ", uColIndex);
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
Log("%c", msa.GetChar(uSeqIndex, uColIndex));
Log(" %10.3f", Score[uColIndex]);
Log(" %10.3f", SmoothScore[uColIndex]);
for (unsigned i = 0; i < uBestColCount; ++i)
if (BestCols[i] == uColIndex)
Log(" <-- Best");
Log("\n");
}
}
// If two best columns are found within a window, choose
// the highest-scoring. If more than two, choose the one
// closest to the center of the window.
static void MergeBestCols(const SCORE Scores[], const unsigned BestCols[],
unsigned uBestColCount, unsigned uWindowLength, unsigned AnchorCols[],
unsigned *ptruAnchorColCount)
{
unsigned uAnchorColCount = 0;
for (unsigned n = 0; n < uBestColCount; /* update inside loop */)
{
unsigned uBestColIndex = BestCols[n];
unsigned uCountWithinWindow = 0;
for (unsigned i = n + 1; i < uBestColCount; ++i)
{
unsigned uBestColIndex2 = BestCols[i];
if (uBestColIndex2 - uBestColIndex >= uWindowLength)
break;
++uCountWithinWindow;
}
unsigned uAnchorCol = uBestColIndex;
if (1 == uCountWithinWindow)
{
unsigned uBestColIndex2 = BestCols[n+1];
if (Scores[uBestColIndex] > Scores[uBestColIndex2])
uAnchorCol = uBestColIndex;
else
uAnchorCol = uBestColIndex2;
}
else if (uCountWithinWindow > 1)
{
unsigned uWindowCenter = uBestColIndex + uWindowLength/2;
int iClosestDist = uWindowLength;
unsigned uClosestCol = uBestColIndex;
for (unsigned i = n + 1; i < n + uCountWithinWindow; ++i)
{
unsigned uColIndex = BestCols[i];
int iDist = uColIndex - uBestColIndex;
if (iDist < 0)
iDist = -iDist;
if (iDist < iClosestDist)
{
uClosestCol = uColIndex;
iClosestDist = iDist;
}
}
uAnchorCol = uClosestCol;
}
AnchorCols[uAnchorColCount] = uAnchorCol;
++uAnchorColCount;
n += uCountWithinWindow + 1;
}
*ptruAnchorColCount = uAnchorColCount;
}
void FindAnchorCols(const MSA &msa, unsigned AnchorCols[],
unsigned *ptruAnchorColCount)
{
const unsigned uColCount = msa.GetColCount();
if (uColCount < 16)
{
*ptruAnchorColCount = 0;
return;
}
SCORE *MatchScore = new SCORE[uColCount];
SCORE *SmoothScore = new SCORE[uColCount];
unsigned *BestCols = new unsigned[uColCount];
GetLetterScores(msa, MatchScore);
WindowSmooth(MatchScore, uColCount, g_uSmoothWindowLength, SmoothScore,
g_dSmoothScoreCeil);
unsigned uBestColCount;
FindBestColsCombo(msa, MatchScore, SmoothScore, g_dMinBestColScore, g_dMinSmoothScore,
BestCols, &uBestColCount);
#if TRACE
ListBestCols(msa, MatchScore, SmoothScore, BestCols, uBestColCount);
#endif
MergeBestCols(MatchScore, BestCols, uBestColCount, g_uAnchorSpacing, AnchorCols,
ptruAnchorColCount);
delete[] MatchScore;
delete[] SmoothScore;
delete[] BestCols;
}

View File

@ -0,0 +1,206 @@
#include "muscle.h"
#include "pwpath.h"
#define TRACE 0
static char XlatEdgeType(char c)
{
if ('E' == c)
return 'D';
if ('J' == c)
return 'I';
return c;
}
static const char *BitsToStr(char Bits)
{
static char Str[] = "xM xD xI";
switch (Bits & BIT_xM)
{
case BIT_MM:
Str[0] = 'M';
break;
case BIT_DM:
Str[0] = 'D';
break;
case BIT_IM:
Str[0] = 'I';
break;
}
switch (Bits & BIT_xD)
{
case BIT_MD:
Str[3] = 'M';
break;
case BIT_DD:
Str[3] = 'D';
break;
}
switch (Bits & BIT_xI)
{
case BIT_MI:
Str[6] = 'M';
break;
case BIT_II:
Str[6] = 'I';
break;
}
return Str;
}
static inline char XChar(char Bits, char cType)
{
switch (cType)
{
case 'M':
{
switch (Bits & BIT_xM)
{
case BIT_MM:
return 'M';
case BIT_DM:
return 'D';
case BIT_IM:
return 'I';
#if DOUBLE_AFFINE
case BIT_EM:
return 'E';
case BIT_JM:
return 'J';
#endif
}
Quit("Huh!?");
return '?';
}
case 'D':
{
switch (Bits & BIT_xD)
{
case BIT_MD:
return 'M';
case BIT_DD:
return 'D';
}
Quit("Huh!?");
return '?';
}
case 'I':
{
switch (Bits & BIT_xI)
{
case BIT_MI:
return 'M';
case BIT_II:
return 'I';
}
Quit("Huh!?");
return '?';
}
#if DOUBLE_AFFINE
case 'E':
{
switch (Bits & BIT_xE)
{
case BIT_ME:
return 'M';
case BIT_EE:
return 'E';
}
Quit("Huh!?");
return '?';
}
case 'J':
{
switch (Bits & BIT_xJ)
{
case BIT_MJ:
return 'M';
case BIT_JJ:
return 'J';
}
Quit("Huh!?");
return '?';
}
#endif
default:
Quit("Huh?");
return '?';
}
}
void BitTraceBack(char **TraceBack, unsigned uLengthA, unsigned uLengthB,
char LastEdge, PWPath &Path)
{
#if TRACE
Log("BitTraceBack\n");
#endif
Path.Clear();
PWEdge Edge;
Edge.uPrefixLengthA = uLengthA;
Edge.uPrefixLengthB = uLengthB;
char Bits = TraceBack[uLengthA][uLengthB];
Edge.cType = LastEdge;
for (;;)
{
#if TRACE
Log("Prepend %c%d.%d\n", Edge.cType, Edge.uPrefixLengthA, Edge.uPrefixLengthB);
#endif
char cSave = Edge.cType;
Edge.cType = XlatEdgeType(cSave);
Path.PrependEdge(Edge);
Edge.cType = cSave;
unsigned PLA = Edge.uPrefixLengthA;
unsigned PLB = Edge.uPrefixLengthB;
char Bits = TraceBack[PLA][PLB];
char NextEdgeType = XChar(Bits, Edge.cType);
#if TRACE
Log("XChar(%s, %c) = %c\n", BitsToStr(Bits), Edge.cType, NextEdgeType);
#endif
switch (Edge.cType)
{
case 'M':
{
if (Edge.uPrefixLengthA == 0)
Quit("BitTraceBack MA=0");
if (Edge.uPrefixLengthB == 0)
Quit("BitTraceBack MA=0");
--(Edge.uPrefixLengthA);
--(Edge.uPrefixLengthB);
break;
}
case 'D':
case 'E':
{
if (Edge.uPrefixLengthA == 0)
Quit("BitTraceBack DA=0");
--(Edge.uPrefixLengthA);
break;
}
case 'I':
case 'J':
{
if (Edge.uPrefixLengthB == 0)
Quit("BitTraceBack IB=0");
--(Edge.uPrefixLengthB);
break;
}
default:
Quit("BitTraceBack: Invalid edge %c", Edge);
}
if (0 == Edge.uPrefixLengthA && 0 == Edge.uPrefixLengthB)
break;
Edge.cType = NextEdgeType;
}
#if TRACE
Path.LogMe();
#endif
}

View File

@ -0,0 +1,28 @@
#include "muscle.h"
int BLOSUM62[20][20] =
{
// A C D E F G H I K L M N P Q R S T V W Y
{ 4, 0, -2, -1, -2, 0, -2, -1, -1, -1, -1, -2, -1, -1, -1, 1, 0, 0, -3, -2}, // A
{ 0, 9, -3, -4, -2, -3, -3, -1, -3, -1, -1, -3, -3, -3, -3, -1, -1, -1, -2, -2}, // C
{-2, -3, 6, 2, -3, -1, -1, -3, -1, -4, -3, 1, -1, 0, -2, 0, -1, -3, -4, -3}, // D
{-1, -4, 2, 5, -3, -2, 0, -3, 1, -3, -2, 0, -1, 2, 0, 0, -1, -2, -3, -2}, // E
{-2, -2, -3, -3, 6, -3, -1, 0, -3, 0, 0, -3, -4, -3, -3, -2, -2, -1, 1, 3}, // F
{ 0, -3, -1, -2, -3, 6, -2, -4, -2, -4, -3, 0, -2, -2, -2, 0, -2, -3, -2, -3}, // G
{-2, -3, -1, 0, -1, -2, 8, -3, -1, -3, -2, 1, -2, 0, 0, -1, -2, -3, -2, 2}, // H
{-1, -1, -3, -3, 0, -4, -3, 4, -3, 2, 1, -3, -3, -3, -3, -2, -1, 3, -3, -1}, // I
{-1, -3, -1, 1, -3, -2, -1, -3, 5, -2, -1, 0, -1, 1, 2, 0, -1, -2, -3, -2}, // K
{-1, -1, -4, -3, 0, -4, -3, 2, -2, 4, 2, -3, -3, -2, -2, -2, -1, 1, -2, -1}, // L
{-1, -1, -3, -2, 0, -3, -2, 1, -1, 2, 5, -2, -2, 0, -1, -1, -1, 1, -1, -1}, // M
{-2, -3, 1, 0, -3, 0, 1, -3, 0, -3, -2, 6, -2, 0, 0, 1, 0, -3, -4, -2}, // N
{-1, -3, -1, -1, -4, -2, -2, -3, -1, -3, -2, -2, 7, -1, -2, -1, -1, -2, -4, -3}, // P
{-1, -3, 0, 2, -3, -2, 0, -3, 1, -2, 0, 0, -1, 5, 1, 0, -1, -2, -2, -1}, // Q
{-1, -3, -2, 0, -3, -2, 0, -3, 2, -2, -1, 0, -2, 1, 5, -1, -1, -3, -3, -2}, // R
{ 1, -1, 0, 0, -2, 0, -1, -2, 0, -2, -1, 1, -1, 0, -1, 4, 1, -2, -3, -2}, // S
{ 0, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1, 0, -1, -1, -1, 1, 5, 0, -2, -2}, // T
{ 0, -1, -3, -2, -1, -3, -3, 3, -2, 1, 1, -3, -2, -2, -3, -2, 0, 4, -3, -1}, // V
{-3, -2, -4, -3, 1, -2, -2, -3, -3, -2, -1, -4, -4, -2, -3, -3, -2, -3, 11, 2}, // W
{-2, -2, -3, -2, 3, -3, 2, -1, -2, -1, -1, -2, -3, -1, -2, -2, -2, -1, 2, 7}, // Y
};
double BLOSUM62_Expected = -0.5209;

View File

@ -0,0 +1,118 @@
#include "muscle.h"
#define GAPVAL 0.3
#define GAPGAPVAL 5.0
// Blosum62 log-average factor matrix
static float Blosum62LA[20][20] =
{
#define v(x) ((float) x)
#define S_ROW(n, c, A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y) \
{ v(A), v(C), v(D), v(E), v(F), v(G), v(H), v(I), v(K), v(L), v(M), v(N), v(P), v(Q), \
v(R), v(S), v(T), v(V), v(W), v(Y) },
// Blosum62 log average matrix
// A C D E F
// G H I K L
// M N P Q R
// S T V W Y
S_ROW( 0, 'A', 3.9029401, 0.8679881, 0.5446049, 0.7412640, 0.4648942,
1.0568696, 0.5693654, 0.6324813, 0.7753898, 0.6019460,
0.7231498, 0.5883077, 0.7541214, 0.7568035, 0.6126988,
1.4721037, 0.9844022, 0.9364584, 0.4165484, 0.5426125)
S_ROW( 1, 'C', 0.8679881, 19.5765802, 0.3014542, 0.2859347, 0.4389910,
0.4203886, 0.3550472, 0.6534589, 0.3491296, 0.6422760,
0.6113537, 0.3978026, 0.3795628, 0.3657796, 0.3089379,
0.7384148, 0.7405530, 0.7558448, 0.4499807, 0.4342013)
S_ROW( 2, 'D', 0.5446049, 0.3014542, 7.3979253, 1.6878109, 0.2989696,
0.6343015, 0.6785593, 0.3390155, 0.7840905, 0.2866128,
0.3464547, 1.5538520, 0.5987177, 0.8970811, 0.5732000,
0.9135051, 0.6947898, 0.3365004, 0.2321050, 0.3456829)
S_ROW( 3, 'E', 0.7412640, 0.2859347, 1.6878109, 5.4695276, 0.3307441,
0.4812675, 0.9600400, 0.3305223, 1.3082782, 0.3728734,
0.5003421, 0.9112983, 0.6792027, 1.9017376, 0.9607983,
0.9503570, 0.7414260, 0.4289431, 0.3743021, 0.4964664)
S_ROW( 4, 'F', 0.4648942, 0.4389910, 0.2989696, 0.3307441, 8.1287983,
0.3406407, 0.6519893, 0.9457698, 0.3440433, 1.1545978,
1.0043715, 0.3542882, 0.2874440, 0.3339729, 0.3807263,
0.4399736, 0.4816930, 0.7450894, 1.3743775, 2.7693817)
S_ROW( 5, 'G', 1.0568696, 0.4203886, 0.6343015, 0.4812675, 0.3406407,
6.8763075, 0.4929663, 0.2750096, 0.5888716, 0.2845039,
0.3954865, 0.8637114, 0.4773858, 0.5386498, 0.4499840,
0.9035965, 0.5792712, 0.3369551, 0.4216898, 0.3487141)
S_ROW( 6, 'H', 0.5693654, 0.3550472, 0.6785593, 0.9600400, 0.6519893,
0.4929663, 13.5060070, 0.3262878, 0.7788884, 0.3806759,
0.5841316, 1.2220028, 0.4728797, 1.1679835, 0.9170473,
0.7367319, 0.5575021, 0.3394474, 0.4440859, 1.7979036)
S_ROW( 7, 'I', 0.6324813, 0.6534589, 0.3390155, 0.3305223, 0.9457698,
0.2750096, 0.3262878, 3.9979299, 0.3963730, 1.6944349,
1.4777449, 0.3279345, 0.3846629, 0.3829375, 0.3547509,
0.4431634, 0.7798163, 2.4175121, 0.4088732, 0.6303898)
S_ROW( 8, 'K', 0.7753898, 0.3491296, 0.7840905, 1.3082782, 0.3440433,
0.5888716, 0.7788884, 0.3963730, 4.7643359, 0.4282702,
0.6253033, 0.9398419, 0.7037741, 1.5543233, 2.0768092,
0.9319192, 0.7929060, 0.4565429, 0.3589319, 0.5321784)
S_ROW( 9, 'L', 0.6019460, 0.6422760, 0.2866128, 0.3728734, 1.1545978,
0.2845039, 0.3806759, 1.6944349, 0.4282702, 3.7966214,
1.9942957, 0.3100430, 0.3711219, 0.4773261, 0.4739194,
0.4288939, 0.6603292, 1.3142355, 0.5680359, 0.6920589)
S_ROW(10, 'M', 0.7231498, 0.6113537, 0.3464547, 0.5003421, 1.0043715,
0.3954865, 0.5841316, 1.4777449, 0.6253033, 1.9942957,
6.4814549, 0.4745299, 0.4238960, 0.8642486, 0.6226249,
0.5985578, 0.7938018, 1.2689365, 0.6103022, 0.7083636)
S_ROW(11, 'N', 0.5883077, 0.3978026, 1.5538520, 0.9112983, 0.3542882,
0.8637114, 1.2220028, 0.3279345, 0.9398419, 0.3100430,
0.4745299, 7.0940964, 0.4999337, 1.0005835, 0.8586298,
1.2315289, 0.9841525, 0.3690340, 0.2777841, 0.4860309)
S_ROW(12, 'P', 0.7541214, 0.3795628, 0.5987177, 0.6792027, 0.2874440,
0.4773858, 0.4728797, 0.3846629, 0.7037741, 0.3711219,
0.4238960, 0.4999337, 12.8375452, 0.6412803, 0.4815348,
0.7555033, 0.6888962, 0.4430825, 0.2818321, 0.3635216)
S_ROW(13, 'Q', 0.7568035, 0.3657796, 0.8970811, 1.9017376, 0.3339729,
0.5386498, 1.1679835, 0.3829375, 1.5543233, 0.4773261,
0.8642486, 1.0005835, 0.6412803, 6.2444210, 1.4057958,
0.9655559, 0.7913219, 0.4667781, 0.5093584, 0.6110951)
S_ROW(14, 'R', 0.6126988, 0.3089379, 0.5732000, 0.9607983, 0.3807263,
0.4499840, 0.9170473, 0.3547509, 2.0768092, 0.4739194,
0.6226249, 0.8586298, 0.4815348, 1.4057958, 6.6655769,
0.7671661, 0.6777544, 0.4200721, 0.3951049, 0.5559652)
S_ROW(15, 'S', 1.4721037, 0.7384148, 0.9135051, 0.9503570, 0.4399736,
0.9035965, 0.7367319, 0.4431634, 0.9319192, 0.4288939,
0.5985578, 1.2315289, 0.7555033, 0.9655559, 0.7671661,
3.8428476, 1.6139205, 0.5652240, 0.3853031, 0.5575206)
S_ROW(16, 'T', 0.9844022, 0.7405530, 0.6947898, 0.7414260, 0.4816930,
0.5792712, 0.5575021, 0.7798163, 0.7929060, 0.6603292,
0.7938018, 0.9841525, 0.6888962, 0.7913219, 0.6777544,
1.6139205, 4.8321048, 0.9809432, 0.4309317, 0.5731577)
S_ROW(17, 'V', 0.9364584, 0.7558448, 0.3365004, 0.4289431, 0.7450894,
0.3369551, 0.3394474, 2.4175121, 0.4565429, 1.3142355,
1.2689365, 0.3690340, 0.4430825, 0.4667781, 0.4200721,
0.5652240, 0.9809432, 3.6921553, 0.3744576, 0.6580390)
S_ROW(18, 'W', 0.4165484, 0.4499807, 0.2321050, 0.3743021, 1.3743775,
0.4216898, 0.4440859, 0.4088732, 0.3589319, 0.5680359,
0.6103022, 0.2777841, 0.2818321, 0.5093584, 0.3951049,
0.3853031, 0.4309317, 0.3744576, 38.1077830, 2.1098056)
S_ROW(19, 'Y', 0.5426125, 0.4342013, 0.3456829, 0.4964664, 2.7693817,
0.3487141, 1.7979036, 0.6303898, 0.5321784, 0.6920589,
0.7083636, 0.4860309, 0.3635216, 0.6110951, 0.5559652,
0.5575206, 0.5731577, 0.6580390, 2.1098056, 9.8322054)
};

View File

@ -0,0 +1,666 @@
#include "muscle.h"
#include "clust.h"
#include "clustset.h"
#include <stdio.h>
#define TRACE 0
Clust::Clust()
{
m_Nodes = 0;
m_uNodeCount = 0;
m_uLeafCount = 0;
m_uClusterCount = 0;
m_JoinStyle = JOIN_Undefined;
m_dDist = 0;
m_uLeafCount = 0;
m_ptrSet = 0;
}
Clust::~Clust()
{
delete[] m_Nodes;
delete[] m_dDist;
delete[] m_ClusterIndexToNodeIndex;
}
void Clust::Create(ClustSet &Set, CLUSTER Method)
{
m_ptrSet = &Set;
SetLeafCount(Set.GetLeafCount());
switch (Method)
{
case CLUSTER_UPGMA:
m_JoinStyle = JOIN_NearestNeighbor;
m_CentroidStyle = LINKAGE_Avg;
break;
case CLUSTER_UPGMAMax:
m_JoinStyle = JOIN_NearestNeighbor;
m_CentroidStyle = LINKAGE_Max;
break;
case CLUSTER_UPGMAMin:
m_JoinStyle = JOIN_NearestNeighbor;
m_CentroidStyle = LINKAGE_Min;
break;
case CLUSTER_UPGMB:
m_JoinStyle = JOIN_NearestNeighbor;
m_CentroidStyle = LINKAGE_Biased;
break;
case CLUSTER_NeighborJoining:
m_JoinStyle = JOIN_NeighborJoining;
m_CentroidStyle = LINKAGE_NeighborJoining;
break;
default:
Quit("Clust::Create, invalid method %d", Method);
}
if (m_uLeafCount <= 1)
Quit("Clust::Create: no leaves");
m_uNodeCount = 2*m_uLeafCount - 1;
m_Nodes = new ClustNode[m_uNodeCount];
m_ClusterIndexToNodeIndex = new unsigned[m_uLeafCount];
m_ptrClusterList = 0;
for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex)
{
ClustNode &Node = m_Nodes[uNodeIndex];
Node.m_uIndex = uNodeIndex;
if (uNodeIndex < m_uLeafCount)
{
Node.m_uSize = 1;
Node.m_uLeafIndexes = new unsigned[1];
Node.m_uLeafIndexes[0] = uNodeIndex;
AddToClusterList(uNodeIndex);
}
else
Node.m_uSize = 0;
}
// Compute initial distance matrix between leaves
SetProgressDesc("Build dist matrix");
unsigned uPairIndex = 0;
const unsigned uPairCount = (m_uLeafCount*(m_uLeafCount - 1))/2;
for (unsigned i = 0; i < m_uLeafCount; ++i)
for (unsigned j = 0; j < i; ++j)
{
const float dDist = (float) m_ptrSet->ComputeDist(*this, i, j);
SetDist(i, j, dDist);
if (0 == uPairIndex%10000)
Progress(uPairIndex, uPairCount);
++uPairIndex;
}
ProgressStepsDone();
// Call CreateCluster once for each internal node in the tree
SetProgressDesc("Build guide tree");
m_uClusterCount = m_uLeafCount;
const unsigned uInternalNodeCount = m_uNodeCount - m_uLeafCount;
for (unsigned uNodeIndex = m_uLeafCount; uNodeIndex < m_uNodeCount; ++uNodeIndex)
{
unsigned i = uNodeIndex + 1 - m_uLeafCount;
Progress(i, uInternalNodeCount);
CreateCluster();
}
ProgressStepsDone();
}
void Clust::CreateCluster()
{
unsigned uLeftNodeIndex;
unsigned uRightNodeIndex;
float dLeftLength;
float dRightLength;
ChooseJoin(&uLeftNodeIndex, &uRightNodeIndex, &dLeftLength, &dRightLength);
const unsigned uNewNodeIndex = m_uNodeCount - m_uClusterCount + 1;
JoinNodes(uLeftNodeIndex, uRightNodeIndex, dLeftLength, dRightLength,
uNewNodeIndex);
#if TRACE
Log("Merge New=%u L=%u R=%u Ld=%7.2g Rd=%7.2g\n",
uNewNodeIndex, uLeftNodeIndex, uRightNodeIndex, dLeftLength, dRightLength);
#endif
// Compute distances to other clusters
--m_uClusterCount;
for (unsigned uNodeIndex = GetFirstCluster(); uNodeIndex != uInsane;
uNodeIndex = GetNextCluster(uNodeIndex))
{
if (uNodeIndex == uLeftNodeIndex || uNodeIndex == uRightNodeIndex)
continue;
if (uNewNodeIndex == uNodeIndex)
continue;
const float dDist = ComputeDist(uNewNodeIndex, uNodeIndex);
SetDist(uNewNodeIndex, uNodeIndex, dDist);
}
for (unsigned uNodeIndex = GetFirstCluster(); uNodeIndex != uInsane;
uNodeIndex = GetNextCluster(uNodeIndex))
{
if (uNodeIndex == uLeftNodeIndex || uNodeIndex == uRightNodeIndex)
continue;
if (uNewNodeIndex == uNodeIndex)
continue;
#if REDLACK
const float dMetric = ComputeMetric(uNewNodeIndex, uNodeIndex);
InsertMetric(uNewNodeIndex, uNodeIndex, dMetric);
#endif
}
}
void Clust::ChooseJoin(unsigned *ptruLeftIndex, unsigned *ptruRightIndex,
float *ptrdLeftLength, float *ptrdRightLength)
{
switch (m_JoinStyle)
{
case JOIN_NearestNeighbor:
ChooseJoinNearestNeighbor(ptruLeftIndex, ptruRightIndex, ptrdLeftLength,
ptrdRightLength);
return;
case JOIN_NeighborJoining:
ChooseJoinNeighborJoining(ptruLeftIndex, ptruRightIndex, ptrdLeftLength,
ptrdRightLength);
return;
}
Quit("Clust::ChooseJoin, Invalid join style %u", m_JoinStyle);
}
void Clust::ChooseJoinNearestNeighbor(unsigned *ptruLeftIndex,
unsigned *ptruRightIndex, float *ptrdLeftLength, float *ptrdRightLength)
{
const unsigned uClusterCount = GetClusterCount();
unsigned uMinLeftNodeIndex;
unsigned uMinRightNodeIndex;
GetMinMetric(&uMinLeftNodeIndex, &uMinRightNodeIndex);
float dMinDist = GetDist(uMinLeftNodeIndex, uMinRightNodeIndex);
const float dLeftHeight = GetHeight(uMinLeftNodeIndex);
const float dRightHeight = GetHeight(uMinRightNodeIndex);
*ptruLeftIndex = uMinLeftNodeIndex;
*ptruRightIndex = uMinRightNodeIndex;
*ptrdLeftLength = dMinDist/2 - dLeftHeight;
*ptrdRightLength = dMinDist/2 - dRightHeight;
}
void Clust::ChooseJoinNeighborJoining(unsigned *ptruLeftIndex,
unsigned *ptruRightIndex, float *ptrdLeftLength, float *ptrdRightLength)
{
const unsigned uClusterCount = GetClusterCount();
//unsigned uMinLeftNodeIndex = uInsane;
//unsigned uMinRightNodeIndex = uInsane;
//float dMinD = PLUS_INFINITY;
//for (unsigned i = GetFirstCluster(); i != uInsane; i = GetNextCluster(i))
// {
// const float ri = Calc_r(i);
// for (unsigned j = GetNextCluster(i); j != uInsane; j = GetNextCluster(j))
// {
// const float rj = Calc_r(j);
// const float dij = GetDist(i, j);
// const float Dij = dij - (ri + rj);
// if (Dij < dMinD)
// {
// dMinD = Dij;
// uMinLeftNodeIndex = i;
// uMinRightNodeIndex = j;
// }
// }
// }
unsigned uMinLeftNodeIndex;
unsigned uMinRightNodeIndex;
GetMinMetric(&uMinLeftNodeIndex, &uMinRightNodeIndex);
const float dDistLR = GetDist(uMinLeftNodeIndex, uMinRightNodeIndex);
const float rL = Calc_r(uMinLeftNodeIndex);
const float rR = Calc_r(uMinRightNodeIndex);
const float dLeftLength = (dDistLR + rL - rR)/2;
const float dRightLength = (dDistLR - rL + rR)/2;
*ptruLeftIndex = uMinLeftNodeIndex;
*ptruRightIndex = uMinRightNodeIndex;
*ptrdLeftLength = dLeftLength;
*ptrdRightLength = dRightLength;
}
void Clust::JoinNodes(unsigned uLeftIndex, unsigned uRightIndex, float dLeftLength,
float dRightLength, unsigned uNodeIndex)
{
ClustNode &Parent = m_Nodes[uNodeIndex];
ClustNode &Left = m_Nodes[uLeftIndex];
ClustNode &Right = m_Nodes[uRightIndex];
Left.m_dLength = dLeftLength;
Right.m_dLength = dRightLength;
Parent.m_ptrLeft = &Left;
Parent.m_ptrRight = &Right;
Left.m_ptrParent = &Parent;
Right.m_ptrParent = &Parent;
const unsigned uLeftSize = Left.m_uSize;
const unsigned uRightSize = Right.m_uSize;
const unsigned uParentSize = uLeftSize + uRightSize;
Parent.m_uSize = uParentSize;
assert(0 == Parent.m_uLeafIndexes);
Parent.m_uLeafIndexes = new unsigned[uParentSize];
const unsigned uLeftBytes = uLeftSize*sizeof(unsigned);
const unsigned uRightBytes = uRightSize*sizeof(unsigned);
memcpy(Parent.m_uLeafIndexes, Left.m_uLeafIndexes, uLeftBytes);
memcpy(Parent.m_uLeafIndexes + uLeftSize, Right.m_uLeafIndexes, uRightBytes);
DeleteFromClusterList(uLeftIndex);
DeleteFromClusterList(uRightIndex);
AddToClusterList(uNodeIndex);
}
float Clust::Calc_r(unsigned uNodeIndex) const
{
const unsigned uClusterCount = GetClusterCount();
if (2 == uClusterCount)
return 0;
float dSum = 0;
for (unsigned i = GetFirstCluster(); i != uInsane; i = GetNextCluster(i))
{
if (i == uNodeIndex)
continue;
dSum += GetDist(uNodeIndex, i);
}
return dSum/(uClusterCount - 2);
}
float Clust::ComputeDist(unsigned uNewNodeIndex, unsigned uNodeIndex)
{
switch (m_CentroidStyle)
{
case LINKAGE_Avg:
return ComputeDistAverageLinkage(uNewNodeIndex, uNodeIndex);
case LINKAGE_Min:
return ComputeDistMinLinkage(uNewNodeIndex, uNodeIndex);
case LINKAGE_Max:
return ComputeDistMaxLinkage(uNewNodeIndex, uNodeIndex);
case LINKAGE_Biased:
return ComputeDistMAFFT(uNewNodeIndex, uNodeIndex);
case LINKAGE_NeighborJoining:
return ComputeDistNeighborJoining(uNewNodeIndex, uNodeIndex);
}
Quit("Clust::ComputeDist, invalid centroid style %u", m_CentroidStyle);
return (float) g_dNAN;
}
float Clust::ComputeDistMinLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex)
{
const unsigned uLeftNodeIndex = GetLeftIndex(uNewNodeIndex);
const unsigned uRightNodeIndex = GetRightIndex(uNewNodeIndex);
const float dDistL = GetDist(uLeftNodeIndex, uNodeIndex);
const float dDistR = GetDist(uRightNodeIndex, uNodeIndex);
return (dDistL < dDistR ? dDistL : dDistR);
}
float Clust::ComputeDistMaxLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex)
{
const unsigned uLeftNodeIndex = GetLeftIndex(uNewNodeIndex);
const unsigned uRightNodeIndex = GetRightIndex(uNewNodeIndex);
const float dDistL = GetDist(uLeftNodeIndex, uNodeIndex);
const float dDistR = GetDist(uRightNodeIndex, uNodeIndex);
return (dDistL > dDistR ? dDistL : dDistR);
}
float Clust::ComputeDistAverageLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex)
{
const unsigned uLeftNodeIndex = GetLeftIndex(uNewNodeIndex);
const unsigned uRightNodeIndex = GetRightIndex(uNewNodeIndex);
const float dDistL = GetDist(uLeftNodeIndex, uNodeIndex);
const float dDistR = GetDist(uRightNodeIndex, uNodeIndex);
return (dDistL + dDistR)/2;
}
float Clust::ComputeDistNeighborJoining(unsigned uNewNodeIndex, unsigned uNodeIndex)
{
const unsigned uLeftNodeIndex = GetLeftIndex(uNewNodeIndex);
const unsigned uRightNodeIndex = GetRightIndex(uNewNodeIndex);
const float dDistLR = GetDist(uLeftNodeIndex, uRightNodeIndex);
const float dDistL = GetDist(uLeftNodeIndex, uNodeIndex);
const float dDistR = GetDist(uRightNodeIndex, uNodeIndex);
const float dDist = (dDistL + dDistR - dDistLR)/2;
return dDist;
}
// This is a mysterious variant of UPGMA reverse-engineered from MAFFT source.
float Clust::ComputeDistMAFFT(unsigned uNewNodeIndex, unsigned uNodeIndex)
{
const unsigned uLeftNodeIndex = GetLeftIndex(uNewNodeIndex);
const unsigned uRightNodeIndex = GetRightIndex(uNewNodeIndex);
const float dDistLR = GetDist(uLeftNodeIndex, uRightNodeIndex);
const float dDistL = GetDist(uLeftNodeIndex, uNodeIndex);
const float dDistR = GetDist(uRightNodeIndex, uNodeIndex);
const float dMinDistLR = (dDistL < dDistR ? dDistL : dDistR);
const float dSumDistLR = dDistL + dDistR;
const float dDist = dMinDistLR*(1 - g_dSUEFF) + dSumDistLR*g_dSUEFF/2;
return dDist;
}
unsigned Clust::GetClusterCount() const
{
return m_uClusterCount;
}
void Clust::LogMe() const
{
Log("Clust %u leaves, %u nodes, %u clusters.\n",
m_uLeafCount, m_uNodeCount, m_uClusterCount);
Log("Distance matrix\n");
const unsigned uNodeCount = GetNodeCount();
Log(" ");
for (unsigned i = 0; i < uNodeCount - 1; ++i)
Log(" %7u", i);
Log("\n");
Log(" ");
for (unsigned i = 0; i < uNodeCount - 1; ++i)
Log(" ------");
Log("\n");
for (unsigned i = 0; i < uNodeCount - 1; ++i)
{
Log("%4u: ", i);
for (unsigned j = 0; j < i; ++j)
Log(" %7.2g", GetDist(i, j));
Log("\n");
}
Log("\n");
Log("Node Size Prnt Left Rght Length Name\n");
Log("---- ---- ---- ---- ---- ------ ----\n");
for (unsigned uNodeIndex = 0; uNodeIndex < m_uNodeCount; ++uNodeIndex)
{
const ClustNode &Node = m_Nodes[uNodeIndex];
Log("%4u %4u", uNodeIndex, Node.m_uSize);
if (0 != Node.m_ptrParent)
Log(" %4u", Node.m_ptrParent->m_uIndex);
else
Log(" ");
if (0 != Node.m_ptrLeft)
Log(" %4u", Node.m_ptrLeft->m_uIndex);
else
Log(" ");
if (0 != Node.m_ptrRight)
Log(" %4u", Node.m_ptrRight->m_uIndex);
else
Log(" ");
if (uNodeIndex != m_uNodeCount - 1)
Log(" %7.3g", Node.m_dLength);
if (IsLeaf(uNodeIndex))
{
const char *ptrName = GetNodeName(uNodeIndex);
if (0 != ptrName)
Log(" %s", ptrName);
}
if (GetRootNodeIndex() == uNodeIndex)
Log(" [ROOT]");
Log("\n");
}
}
const ClustNode &Clust::GetNode(unsigned uNodeIndex) const
{
if (uNodeIndex >= m_uNodeCount)
Quit("ClustNode::GetNode(%u) %u", uNodeIndex, m_uNodeCount);
return m_Nodes[uNodeIndex];
}
bool Clust::IsLeaf(unsigned uNodeIndex) const
{
return uNodeIndex < m_uLeafCount;
}
unsigned Clust::GetClusterSize(unsigned uNodeIndex) const
{
const ClustNode &Node = GetNode(uNodeIndex);
return Node.m_uSize;
}
unsigned Clust::GetLeftIndex(unsigned uNodeIndex) const
{
const ClustNode &Node = GetNode(uNodeIndex);
if (0 == Node.m_ptrLeft)
Quit("Clust::GetLeftIndex: leaf");
return Node.m_ptrLeft->m_uIndex;
}
unsigned Clust::GetRightIndex(unsigned uNodeIndex) const
{
const ClustNode &Node = GetNode(uNodeIndex);
if (0 == Node.m_ptrRight)
Quit("Clust::GetRightIndex: leaf");
return Node.m_ptrRight->m_uIndex;
}
float Clust::GetLength(unsigned uNodeIndex) const
{
const ClustNode &Node = GetNode(uNodeIndex);
return Node.m_dLength;
}
void Clust::SetLeafCount(unsigned uLeafCount)
{
if (uLeafCount <= 1)
Quit("Clust::SetLeafCount(%u)", uLeafCount);
m_uLeafCount = uLeafCount;
const unsigned uNodeCount = GetNodeCount();
// Triangular matrix size excluding diagonal (all zeros in our case).
m_uTriangularMatrixSize = (uNodeCount*(uNodeCount - 1))/2;
m_dDist = new float[m_uTriangularMatrixSize];
}
unsigned Clust::GetLeafCount() const
{
return m_uLeafCount;
}
unsigned Clust::VectorIndex(unsigned uIndex1, unsigned uIndex2) const
{
const unsigned uNodeCount = GetNodeCount();
if (uIndex1 >= uNodeCount || uIndex2 >= uNodeCount)
Quit("DistVectorIndex(%u,%u) %u", uIndex1, uIndex2, uNodeCount);
unsigned v;
if (uIndex1 >= uIndex2)
v = uIndex2 + (uIndex1*(uIndex1 - 1))/2;
else
v = uIndex1 + (uIndex2*(uIndex2 - 1))/2;
assert(v < m_uTriangularMatrixSize);
return v;
}
float Clust::GetDist(unsigned uIndex1, unsigned uIndex2) const
{
unsigned v = VectorIndex(uIndex1, uIndex2);
return m_dDist[v];
}
void Clust::SetDist(unsigned uIndex1, unsigned uIndex2, float dDist)
{
unsigned v = VectorIndex(uIndex1, uIndex2);
m_dDist[v] = dDist;
}
float Clust::GetHeight(unsigned uNodeIndex) const
{
if (IsLeaf(uNodeIndex))
return 0;
const unsigned uLeftIndex = GetLeftIndex(uNodeIndex);
const unsigned uRightIndex = GetRightIndex(uNodeIndex);
const float dLeftLength = GetLength(uLeftIndex);
const float dRightLength = GetLength(uRightIndex);
const float dLeftHeight = dLeftLength + GetHeight(uLeftIndex);
const float dRightHeight = dRightLength + GetHeight(uRightIndex);
return (dLeftHeight + dRightHeight)/2;
}
const char *Clust::GetNodeName(unsigned uNodeIndex) const
{
if (!IsLeaf(uNodeIndex))
Quit("Clust::GetNodeName, is not leaf");
return m_ptrSet->GetLeafName(uNodeIndex);
}
unsigned Clust::GetNodeId(unsigned uNodeIndex) const
{
if (uNodeIndex >= GetLeafCount())
return 0;
return m_ptrSet->GetLeafId(uNodeIndex);
}
unsigned Clust::GetLeaf(unsigned uNodeIndex, unsigned uLeafIndex) const
{
const ClustNode &Node = GetNode(uNodeIndex);
const unsigned uLeafCount = Node.m_uSize;
if (uLeafIndex >= uLeafCount)
Quit("Clust::GetLeaf, invalid index");
const unsigned uIndex = Node.m_uLeafIndexes[uLeafIndex];
if (uIndex >= m_uNodeCount)
Quit("Clust::GetLeaf, index out of range");
return uIndex;
}
unsigned Clust::GetFirstCluster() const
{
if (0 == m_ptrClusterList)
return uInsane;
return m_ptrClusterList->m_uIndex;
}
unsigned Clust::GetNextCluster(unsigned uIndex) const
{
ClustNode *ptrNode = &m_Nodes[uIndex];
if (0 == ptrNode->m_ptrNextCluster)
return uInsane;
return ptrNode->m_ptrNextCluster->m_uIndex;
}
void Clust::DeleteFromClusterList(unsigned uNodeIndex)
{
assert(uNodeIndex < m_uNodeCount);
ClustNode *ptrNode = &m_Nodes[uNodeIndex];
ClustNode *ptrPrev = ptrNode->m_ptrPrevCluster;
ClustNode *ptrNext = ptrNode->m_ptrNextCluster;
if (0 != ptrNext)
ptrNext->m_ptrPrevCluster = ptrPrev;
if (0 == ptrPrev)
{
assert(m_ptrClusterList == ptrNode);
m_ptrClusterList = ptrNext;
}
else
ptrPrev->m_ptrNextCluster = ptrNext;
ptrNode->m_ptrNextCluster = 0;
ptrNode->m_ptrPrevCluster = 0;
}
void Clust::AddToClusterList(unsigned uNodeIndex)
{
assert(uNodeIndex < m_uNodeCount);
ClustNode *ptrNode = &m_Nodes[uNodeIndex];
if (0 != m_ptrClusterList)
m_ptrClusterList->m_ptrPrevCluster = ptrNode;
ptrNode->m_ptrNextCluster = m_ptrClusterList;
ptrNode->m_ptrPrevCluster = 0;
m_ptrClusterList = ptrNode;
}
float Clust::ComputeMetric(unsigned uIndex1, unsigned uIndex2) const
{
switch (m_JoinStyle)
{
case JOIN_NearestNeighbor:
return ComputeMetricNearestNeighbor(uIndex1, uIndex2);
case JOIN_NeighborJoining:
return ComputeMetricNeighborJoining(uIndex1, uIndex2);
}
Quit("Clust::ComputeMetric");
return 0;
}
float Clust::ComputeMetricNeighborJoining(unsigned i, unsigned j) const
{
float ri = Calc_r(i);
float rj = Calc_r(j);
float dij = GetDist(i, j);
float dMetric = dij - (ri + rj);
return (float) dMetric;
}
float Clust::ComputeMetricNearestNeighbor(unsigned i, unsigned j) const
{
return (float) GetDist(i, j);
}
float Clust::GetMinMetricBruteForce(unsigned *ptruIndex1, unsigned *ptruIndex2) const
{
unsigned uMinLeftNodeIndex = uInsane;
unsigned uMinRightNodeIndex = uInsane;
float dMinMetric = PLUS_INFINITY;
for (unsigned uLeftNodeIndex = GetFirstCluster(); uLeftNodeIndex != uInsane;
uLeftNodeIndex = GetNextCluster(uLeftNodeIndex))
{
for (unsigned uRightNodeIndex = GetNextCluster(uLeftNodeIndex);
uRightNodeIndex != uInsane;
uRightNodeIndex = GetNextCluster(uRightNodeIndex))
{
float dMetric = ComputeMetric(uLeftNodeIndex, uRightNodeIndex);
if (dMetric < dMinMetric)
{
dMinMetric = dMetric;
uMinLeftNodeIndex = uLeftNodeIndex;
uMinRightNodeIndex = uRightNodeIndex;
}
}
}
*ptruIndex1 = uMinLeftNodeIndex;
*ptruIndex2 = uMinRightNodeIndex;
return dMinMetric;
}
float Clust::GetMinMetric(unsigned *ptruIndex1, unsigned *ptruIndex2) const
{
return GetMinMetricBruteForce(ptruIndex1, ptruIndex2);
}

View File

@ -0,0 +1,148 @@
#ifndef Clust_h
#define Clust_h
class Clust;
class ClustNode;
class ClustSet;
class Phylip;
class SortedNode;
const unsigned RB_NIL = ((unsigned) 0xfff0);
class ClustNode
{
public:
ClustNode()
{
m_uIndex = uInsane;
m_uSize = uInsane;
m_dLength = (float) dInsane;
m_ptrLeft = 0;
m_ptrRight = 0;
m_ptrParent = 0;
m_ptrNextCluster = 0;
m_ptrPrevCluster = 0;
m_uLeafIndexes = 0;
}
~ClustNode()
{
delete[] m_uLeafIndexes;
}
unsigned m_uIndex;
unsigned m_uSize;
float m_dLength;
ClustNode *m_ptrLeft;
ClustNode *m_ptrRight;
ClustNode *m_ptrParent;
ClustNode *m_ptrNextCluster;
ClustNode *m_ptrPrevCluster;
unsigned *m_uLeafIndexes;
};
class Clust
{
public:
Clust();
virtual ~Clust();
void Create(ClustSet &Set, CLUSTER Method);
unsigned GetLeafCount() const;
unsigned GetClusterCount() const;
unsigned GetClusterSize(unsigned uNodeIndex) const;
unsigned GetLeaf(unsigned uClusterIndex, unsigned uLeafIndex) const;
unsigned GetNodeCount() const { return 2*m_uLeafCount - 1; }
const ClustNode &GetRoot() const { return m_Nodes[GetRootNodeIndex()]; }
unsigned GetRootNodeIndex() const { return m_uNodeCount - 1; }
const ClustNode &GetNode(unsigned uNodeIndex) const;
bool IsLeaf(unsigned uNodeIndex) const;
unsigned GetLeftIndex(unsigned uNodeIndex) const;
unsigned GetRightIndex(unsigned uNodeIndex) const;
float GetLength(unsigned uNodeIndex) const;
float GetHeight(unsigned uNodeIndex) const;
const char *GetNodeName(unsigned uNodeIndex) const;
unsigned GetNodeId(unsigned uNodeIndex) const;
JOIN GetJoinStyle() const { return m_JoinStyle; }
LINKAGE GetCentroidStyle() const { return m_CentroidStyle; }
void SetDist(unsigned uIndex1, unsigned uIndex2, float dDist);
float GetDist(unsigned uIndex1, unsigned uIndex2) const;
void ToPhylip(Phylip &tree);
void LogMe() const;
//private:
void SetLeafCount(unsigned uLeafCount);
void CreateCluster();
void JoinNodes(unsigned uLeftNodeIndex, unsigned uRightNodeIndex,
float dLeftLength, float dRightLength, unsigned uNewNodeIndex);
void ChooseJoin(unsigned *ptruLeftIndex, unsigned *ptruRightIndex,
float *ptrdLeftLength, float *ptrdRightLength);
void ChooseJoinNeighborJoining(unsigned *ptruLeftIndex, unsigned *ptruRightIndex,
float *ptrdLeftLength, float *ptrdRightLength);
void ChooseJoinNearestNeighbor(unsigned *ptruLeftIndex, unsigned *ptruRightIndex,
float *ptrdLeftLength, float *ptrdRightLength);
float ComputeDist(unsigned uNewNodeIndex, unsigned uNodeIndex);
float ComputeDistAverageLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex);
float ComputeDistMinLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex);
float ComputeDistMaxLinkage(unsigned uNewNodeIndex, unsigned uNodeIndex);
float ComputeDistNeighborJoining(unsigned uNewNewIndex, unsigned uNodeIndex);
float ComputeDistMAFFT(unsigned uNewNewIndex, unsigned uNodeIndex);
float Calc_r(unsigned uNodeIndex) const;
unsigned VectorIndex(unsigned uIndex1, unsigned uIndex2) const;
unsigned GetFirstCluster() const;
unsigned GetNextCluster(unsigned uNodeIndex) const;
float ComputeMetric(unsigned uIndex1, unsigned uIndex2) const;
float ComputeMetricNearestNeighbor(unsigned i, unsigned j) const;
float ComputeMetricNeighborJoining(unsigned i, unsigned j) const;
void InitMetric(unsigned uMaxNodeIndex);
void InsertMetric(unsigned uIndex1, unsigned uIndex2, float dMetric);
float GetMinMetric(unsigned *ptruIndex1, unsigned *ptruIndex2) const;
float GetMinMetricBruteForce(unsigned *ptruIndex1, unsigned *ptruIndex2) const;
void DeleteMetric(unsigned uIndex);
void DeleteMetric(unsigned uIndex1, unsigned uIndex2);
void ListMetric() const;
void DeleteFromClusterList(unsigned uNodeIndex);
void AddToClusterList(unsigned uNodeIndex);
void RBDelete(unsigned RBNode);
unsigned RBInsert(unsigned i, unsigned j, float fMetric);
unsigned RBNext(unsigned RBNode) const;
unsigned RBPrev(unsigned RBNode) const;
unsigned RBMin(unsigned RBNode) const;
unsigned RBMax(unsigned RBNode) const;
void ValidateRB(const char szMsg[] = 0) const;
void ValidateRBNode(unsigned Node, const char szMsg[]) const;
//private:
JOIN m_JoinStyle;
LINKAGE m_CentroidStyle;
ClustNode *m_Nodes;
unsigned *m_ClusterIndexToNodeIndex;
unsigned *m_NodeIndexToClusterIndex;
unsigned m_uLeafCount;
unsigned m_uNodeCount;
unsigned m_uClusterCount;
unsigned m_uTriangularMatrixSize;
float *m_dDist;
ClustSet *m_ptrSet;
ClustNode *m_ptrClusterList;
};
#endif // Clust_h

View File

@ -0,0 +1,339 @@
#include "muscle.h"
#include "cluster.h"
#include "distfunc.h"
static inline float Min(float d1, float d2)
{
return d1 < d2 ? d1 : d2;
}
static inline float Max(float d1, float d2)
{
return d1 > d2 ? d1 : d2;
}
static inline float Mean(float d1, float d2)
{
return (float) ((d1 + d2)/2.0);
}
#if _DEBUG
void ClusterTree::Validate(unsigned uNodeCount)
{
unsigned n;
ClusterNode *pNode;
unsigned uDisjointListCount = 0;
for (pNode = m_ptrDisjoints; pNode; pNode = pNode->GetNextDisjoint())
{
ClusterNode *pPrev = pNode->GetPrevDisjoint();
ClusterNode *pNext = pNode->GetNextDisjoint();
if (0 != pPrev)
{
if (pPrev->GetNextDisjoint() != pNode)
{
Log("Prev->This mismatch, prev=\n");
pPrev->LogMe();
Log("This=\n");
pNode->LogMe();
Quit("ClusterTree::Validate()");
}
}
else
{
if (pNode != m_ptrDisjoints)
{
Log("[%u]->prev = 0 but != m_ptrDisjoints=%d\n",
pNode->GetIndex(),
m_ptrDisjoints ? m_ptrDisjoints->GetIndex() : 0xffffffff);
pNode->LogMe();
Quit("ClusterTree::Validate()");
}
}
if (0 != pNext)
{
if (pNext->GetPrevDisjoint() != pNode)
{
Log("Next->This mismatch, next=\n");
pNext->LogMe();
Log("This=\n");
pNode->LogMe();
Quit("ClusterTree::Validate()");
}
}
++uDisjointListCount;
if (uDisjointListCount > m_uNodeCount)
Quit("Loop in disjoint list");
}
unsigned uParentlessNodeCount = 0;
for (n = 0; n < uNodeCount; ++n)
if (0 == m_Nodes[n].GetParent())
++uParentlessNodeCount;
if (uDisjointListCount != uParentlessNodeCount)
Quit("Disjoints = %u Parentless = %u\n", uDisjointListCount,
uParentlessNodeCount);
}
#else // !_DEBUG
#define Validate(uNodeCount) // empty
#endif
void ClusterNode::LogMe() const
{
unsigned uClusterSize = GetClusterSize();
Log("[%02u] w=%5.3f CW=%5.3f LBW=%5.3f RBW=%5.3f LWT=%5.3f RWT=%5.3f L=%02d R=%02d P=%02d NxDj=%02d PvDj=%02d Sz=%02d {",
m_uIndex,
m_dWeight,
GetClusterWeight(),
GetLeftBranchWeight(),
GetRightBranchWeight(),
GetLeftWeight(),
GetRightWeight(),
m_ptrLeft ? m_ptrLeft->GetIndex() : 0xffffffff,
m_ptrRight ? m_ptrRight->GetIndex() : 0xffffffff,
m_ptrParent ? m_ptrParent->GetIndex() : 0xffffffff,
m_ptrNextDisjoint ? m_ptrNextDisjoint->GetIndex() : 0xffffffff,
m_ptrPrevDisjoint ? m_ptrPrevDisjoint->GetIndex() : 0xffffffff,
uClusterSize);
for (unsigned i = 0; i < uClusterSize; ++i)
Log(" %u", GetClusterLeaf(i)->GetIndex());
Log(" }\n");
}
// How many leaves in the sub-tree under this node?
unsigned ClusterNode::GetClusterSize() const
{
unsigned uLeafCount = 0;
if (0 == m_ptrLeft && 0 == m_ptrRight)
return 1;
if (0 != m_ptrLeft)
uLeafCount += m_ptrLeft->GetClusterSize();
if (0 != m_ptrRight)
uLeafCount += m_ptrRight->GetClusterSize();
assert(uLeafCount > 0);
return uLeafCount;
}
double ClusterNode::GetClusterWeight() const
{
double dWeight = 0.0;
if (0 != m_ptrLeft)
dWeight += m_ptrLeft->GetClusterWeight();
if (0 != m_ptrRight)
dWeight += m_ptrRight->GetClusterWeight();
return dWeight + GetWeight();
}
double ClusterNode::GetLeftBranchWeight() const
{
const ClusterNode *ptrLeft = GetLeft();
if (0 == ptrLeft)
return 0.0;
return GetWeight() - ptrLeft->GetWeight();
}
double ClusterNode::GetRightBranchWeight() const
{
const ClusterNode *ptrRight = GetRight();
if (0 == ptrRight)
return 0.0;
return GetWeight() - ptrRight->GetWeight();
}
double ClusterNode::GetRightWeight() const
{
const ClusterNode *ptrRight = GetRight();
if (0 == ptrRight)
return 0.0;
return ptrRight->GetClusterWeight() + GetWeight();
}
double ClusterNode::GetLeftWeight() const
{
const ClusterNode *ptrLeft = GetLeft();
if (0 == ptrLeft)
return 0.0;
return ptrLeft->GetClusterWeight() + GetWeight();
}
// Return n'th leaf in the sub-tree under this node.
const ClusterNode *ClusterNode::GetClusterLeaf(unsigned uLeafIndex) const
{
if (0 != m_ptrLeft)
{
if (0 == m_ptrRight)
return this;
unsigned uLeftLeafCount = m_ptrLeft->GetClusterSize();
if (uLeafIndex < uLeftLeafCount)
return m_ptrLeft->GetClusterLeaf(uLeafIndex);
assert(uLeafIndex >= uLeftLeafCount);
return m_ptrRight->GetClusterLeaf(uLeafIndex - uLeftLeafCount);
}
if (0 == m_ptrRight)
return this;
return m_ptrRight->GetClusterLeaf(uLeafIndex);
}
void ClusterTree::DeleteFromDisjoints(ClusterNode *ptrNode)
{
ClusterNode *ptrPrev = ptrNode->GetPrevDisjoint();
ClusterNode *ptrNext = ptrNode->GetNextDisjoint();
if (0 != ptrPrev)
ptrPrev->SetNextDisjoint(ptrNext);
else
m_ptrDisjoints = ptrNext;
if (0 != ptrNext)
ptrNext->SetPrevDisjoint(ptrPrev);
#if _DEBUG
// not algorithmically necessary, but improves clarity
// and supports Validate().
ptrNode->SetPrevDisjoint(0);
ptrNode->SetNextDisjoint(0);
#endif
}
void ClusterTree::AddToDisjoints(ClusterNode *ptrNode)
{
ptrNode->SetNextDisjoint(m_ptrDisjoints);
ptrNode->SetPrevDisjoint(0);
if (0 != m_ptrDisjoints)
m_ptrDisjoints->SetPrevDisjoint(ptrNode);
m_ptrDisjoints = ptrNode;
}
ClusterTree::ClusterTree()
{
m_ptrDisjoints = 0;
m_Nodes = 0;
m_uNodeCount = 0;
}
ClusterTree::~ClusterTree()
{
delete[] m_Nodes;
}
void ClusterTree::LogMe() const
{
Log("Disjoints=%d\n", m_ptrDisjoints ? m_ptrDisjoints->GetIndex() : 0xffffffff);
for (unsigned i = 0; i < m_uNodeCount; ++i)
{
m_Nodes[i].LogMe();
}
}
ClusterNode *ClusterTree::GetRoot() const
{
return &m_Nodes[m_uNodeCount - 1];
}
// This is the UPGMA algorithm as described in Durbin et al. p166.
void ClusterTree::Create(const DistFunc &Dist)
{
unsigned i;
m_uLeafCount = Dist.GetCount();
m_uNodeCount = 2*m_uLeafCount - 1;
delete[] m_Nodes;
m_Nodes = new ClusterNode[m_uNodeCount];
for (i = 0; i < m_uNodeCount; ++i)
m_Nodes[i].SetIndex(i);
for (i = 0; i < m_uLeafCount - 1; ++i)
m_Nodes[i].SetNextDisjoint(&m_Nodes[i+1]);
for (i = 1; i < m_uLeafCount; ++i)
m_Nodes[i].SetPrevDisjoint(&m_Nodes[i-1]);
m_ptrDisjoints = &m_Nodes[0];
// Log("Initial state\n");
// LogMe();
// Log("\n");
DistFunc ClusterDist;
ClusterDist.SetCount(m_uNodeCount);
double dMaxDist = 0.0;
for (i = 0; i < m_uLeafCount; ++i)
for (unsigned j = 0; j < m_uLeafCount; ++j)
{
float dDist = Dist.GetDist(i, j);
ClusterDist.SetDist(i, j, dDist);
}
Validate(m_uLeafCount);
// Iteration. N-1 joins needed to create a binary tree from N leaves.
for (unsigned uJoinIndex = m_uLeafCount; uJoinIndex < m_uNodeCount;
++uJoinIndex)
{
// Find closest pair of clusters
unsigned uIndexClosest1;
unsigned uIndexClosest2;
bool bFound = false;
double dDistClosest = 9e99;
for (ClusterNode *ptrNode1 = m_ptrDisjoints; ptrNode1;
ptrNode1 = ptrNode1->GetNextDisjoint())
{
for (ClusterNode *ptrNode2 = ptrNode1->GetNextDisjoint(); ptrNode2;
ptrNode2 = ptrNode2->GetNextDisjoint())
{
unsigned i1 = ptrNode1->GetIndex();
unsigned i2 = ptrNode2->GetIndex();
double dDist = ClusterDist.GetDist(i1, i2);
if (dDist < dDistClosest)
{
bFound = true;
dDistClosest = dDist;
uIndexClosest1 = i1;
uIndexClosest2 = i2;
}
}
}
assert(bFound);
ClusterNode &Join = m_Nodes[uJoinIndex];
ClusterNode &Child1 = m_Nodes[uIndexClosest1];
ClusterNode &Child2 = m_Nodes[uIndexClosest2];
Join.SetLeft(&Child1);
Join.SetRight(&Child2);
Join.SetWeight(dDistClosest);
Child1.SetParent(&Join);
Child2.SetParent(&Join);
DeleteFromDisjoints(&Child1);
DeleteFromDisjoints(&Child2);
AddToDisjoints(&Join);
// Log("After join %d %d\n", uIndexClosest1, uIndexClosest2);
// LogMe();
// Calculate distance of every remaining disjoint cluster to the
// new cluster created by the join
for (ClusterNode *ptrNode = m_ptrDisjoints; ptrNode;
ptrNode = ptrNode->GetNextDisjoint())
{
unsigned uNodeIndex = ptrNode->GetIndex();
float dDist1 = ClusterDist.GetDist(uNodeIndex, uIndexClosest1);
float dDist2 = ClusterDist.GetDist(uNodeIndex, uIndexClosest2);
float dDist = Min(dDist1, dDist2);
ClusterDist.SetDist(uJoinIndex, uNodeIndex, dDist);
}
Validate(uJoinIndex+1);
}
GetRoot()->GetClusterWeight();
// LogMe();
}

View File

@ -0,0 +1,86 @@
class DistFunc;
class ClusterNode
{
friend class ClusterTree;
public:
ClusterNode()
{
m_dWeight = 0.0;
m_dWeight2 = 0.0;
m_ptrLeft = 0;
m_ptrRight = 0;
m_ptrParent = 0;
m_uIndex = 0;
m_ptrPrevDisjoint = 0;
m_ptrNextDisjoint = 0;
}
~ClusterNode() {}
public:
unsigned GetIndex() const { return m_uIndex; }
ClusterNode *GetLeft() const { return m_ptrLeft; }
ClusterNode *GetRight() const { return m_ptrRight; }
ClusterNode *GetParent() const { return m_ptrParent; }
double GetWeight() const { return m_dWeight; }
const ClusterNode *GetClusterLeaf(unsigned uLeafIndex) const;
unsigned GetClusterSize() const;
double GetClusterWeight() const;
double GetLeftBranchWeight() const;
double GetRightBranchWeight() const;
double GetLeftWeight() const;
double GetRightWeight() const;
void LogMe() const;
double GetWeight2() const { return m_dWeight2; }
void SetWeight2(double dWeight2) { m_dWeight2 = dWeight2; }
protected:
void SetIndex(unsigned uIndex) { m_uIndex = uIndex; }
void SetWeight(double dWeight) { m_dWeight = dWeight; }
void SetLeft(ClusterNode *ptrLeft) { m_ptrLeft = ptrLeft; }
void SetRight(ClusterNode *ptrRight) { m_ptrRight = ptrRight; }
void SetParent(ClusterNode *ptrParent) { m_ptrParent = ptrParent; }
void SetNextDisjoint(ClusterNode *ptrNode) { m_ptrNextDisjoint = ptrNode; }
void SetPrevDisjoint(ClusterNode *ptrNode) { m_ptrPrevDisjoint = ptrNode; }
ClusterNode *GetNextDisjoint() { return m_ptrNextDisjoint; }
ClusterNode *GetPrevDisjoint() { return m_ptrPrevDisjoint; }
private:
double m_dWeight;
double m_dWeight2;
unsigned m_uIndex;
ClusterNode *m_ptrLeft;
ClusterNode *m_ptrRight;
ClusterNode *m_ptrParent;
ClusterNode *m_ptrNextDisjoint;
ClusterNode *m_ptrPrevDisjoint;
};
class ClusterTree
{
public:
ClusterTree();
virtual ~ClusterTree();
void Create(const DistFunc &DF);
ClusterNode *GetRoot() const;
void LogMe() const;
protected:
void Join(ClusterNode *ptrNode1, ClusterNode *ptrNode2,
ClusterNode *ptrJoin);
void AddToDisjoints(ClusterNode *ptrNode);
void DeleteFromDisjoints(ClusterNode *ptrNode);
void Validate(unsigned uNodeCount);
private:
ClusterNode *m_ptrDisjoints;
ClusterNode *m_Nodes;
unsigned m_uNodeCount;
unsigned m_uLeafCount;
};

View File

@ -0,0 +1,21 @@
#ifndef ClustSet_h
#define ClustSet_h
enum JOIN;
enum LINKAGE;
class Clust;
class ClustSet
{
public:
virtual unsigned GetLeafCount() = 0;
virtual double ComputeDist(const Clust &C, unsigned uNodeIndex1,
unsigned uNodeIndex2) = 0;
virtual void JoinNodes(const Clust &C, unsigned uLeftNodeIndex,
unsigned uRightNodeIndex, unsigned uJoinedNodeIndex,
double *ptrdLeftLength, double *ptrdRightLength) = 0;
virtual const char *GetLeafName(unsigned uNodeIndex) = 0;
virtual unsigned GetLeafId(unsigned uNodeIndex) = 0;
};
#endif // ClustSet_h

View File

@ -0,0 +1,48 @@
#ifndef ClustSetDF_h
#define ClustSetDF_h
class MSA;
class Clust;
#include "clustset.h"
#include "distfunc.h"
#include "msa.h"
class ClustSetDF : public ClustSet
{
public:
ClustSetDF(const DistFunc &DF) :
m_ptrDF(&DF)
{
}
public:
virtual unsigned GetLeafCount()
{
return m_ptrDF->GetCount();
}
virtual const char *GetLeafName(unsigned uNodeIndex)
{
return m_ptrDF->GetName(uNodeIndex);
}
virtual unsigned GetLeafId(unsigned uNodeIndex)
{
return m_ptrDF->GetId(uNodeIndex);
}
virtual void JoinNodes(const Clust &C, unsigned uLeftNodeIndex,
unsigned uRightNodeIndex, unsigned uJoinedNodeIndex,
double *ptrdLeftLength, double *ptrdRightLength)
{
Quit("ClustSetDF::JoinNodes, should never be called");
}
virtual double ComputeDist(const Clust &C, unsigned uNodeIndex1,
unsigned uNodeIndex2)
{
return m_ptrDF->GetDist(uNodeIndex1, uNodeIndex2);
}
private:
const DistFunc *m_ptrDF;
};
#endif // ClustSetDF_h

View File

@ -0,0 +1,55 @@
#ifndef ClustSetMSA_h
#define ClustSetMSA_h
class MSA;
class Clust;
#include "clustset.h"
#include "msadist.h"
// Distance matrix based set.
// Computes distances between leaves, never between
// joined clusters (leaves this to distance matrix method).
class ClustSetMSA : public ClustSet
{
public:
ClustSetMSA(const MSA &msa, MSADist &MD) :
m_ptrMSA(&msa),
m_ptrMSADist(&MD)
{
}
public:
virtual unsigned GetLeafCount()
{
return m_ptrMSA->GetSeqCount();
}
virtual const char *GetLeafName(unsigned uNodeIndex)
{
return m_ptrMSA->GetSeqName(uNodeIndex);
}
virtual unsigned GetLeafId(unsigned uNodeIndex)
{
return m_ptrMSA->GetSeqId(uNodeIndex);
}
virtual void JoinNodes(const Clust &C, unsigned uLeftNodeIndex,
unsigned uRightNodeIndex, unsigned uJoinedNodeIndex,
double *ptrdLeftLength, double *ptrdRightLength)
{
Quit("ClustSetMSA::JoinNodes, should never be called");
}
virtual double ComputeDist(const Clust &C, unsigned uNodeIndex1,
unsigned uNodeIndex2)
{
return m_ptrMSADist->ComputeDist(*m_ptrMSA, uNodeIndex1, uNodeIndex2);
}
public:
const MSA &GetMSA();
private:
const MSA *m_ptrMSA;
MSADist *m_ptrMSADist;
};
#endif // ClustSetMSA_h

View File

@ -0,0 +1,190 @@
#include "muscle.h"
#include "tree.h"
#include "msa.h"
/***
Compute weights by the CLUSTALW method.
Thompson, Higgins and Gibson (1994), CABIOS (10) 19-29;
see also CLUSTALW paper.
Weights are computed from the edge lengths of a rooted tree.
Define the strength of an edge to be its length divided by the number
of leaves under that edge. The weight of a sequence is then the sum
of edge strengths on the path from the root to the leaf.
Example.
0.2
-----A 0.1
-x ------- B 0.7
--------y ----------- C
0.3 ----------z
0.4 -------------- D
0.8
Edge Length Leaves Strength
---- ----- ------ --------
xy 0.3 3 0.1
xA 0.2 1 0.2
yz 0.4 2 0.2
yB 0.1 1 0.1
zC 0.7 1 0.7
zD 0.8 1 0.8
Leaf Path Strengths Weight
---- ---- --------- ------
A xA 0.2 0.2
B xy-yB 0.1 + 0.1 0.2
C xy-yz-zC 0.1 + 0.2 + 0.7 1.0
D xy-yz-zD 0.1 + 0.2 + 0.8 1.1
***/
#define TRACE 0
static unsigned CountLeaves(const Tree &tree, unsigned uNodeIndex,
unsigned LeavesUnderNode[])
{
if (tree.IsLeaf(uNodeIndex))
{
LeavesUnderNode[uNodeIndex] = 1;
return 1;
}
const unsigned uLeft = tree.GetLeft(uNodeIndex);
const unsigned uRight = tree.GetRight(uNodeIndex);
const unsigned uRightCount = CountLeaves(tree, uRight, LeavesUnderNode);
const unsigned uLeftCount = CountLeaves(tree, uLeft, LeavesUnderNode);
const unsigned uCount = uRightCount + uLeftCount;
LeavesUnderNode[uNodeIndex] = uCount;
return uCount;
}
void CalcClustalWWeights(const Tree &tree, WEIGHT Weights[])
{
#if TRACE
Log("CalcClustalWWeights\n");
tree.LogMe();
#endif
const unsigned uLeafCount = tree.GetLeafCount();
if (0 == uLeafCount)
return;
else if (1 == uLeafCount)
{
Weights[0] = (WEIGHT) 1.0;
return;
}
else if (2 == uLeafCount)
{
Weights[0] = (WEIGHT) 0.5;
Weights[1] = (WEIGHT) 0.5;
return;
}
if (!tree.IsRooted())
Quit("CalcClustalWWeights requires rooted tree");
const unsigned uNodeCount = tree.GetNodeCount();
unsigned *LeavesUnderNode = new unsigned[uNodeCount];
memset(LeavesUnderNode, 0, uNodeCount*sizeof(unsigned));
const unsigned uRootNodeIndex = tree.GetRootNodeIndex();
unsigned uLeavesUnderRoot = CountLeaves(tree, uRootNodeIndex, LeavesUnderNode);
if (uLeavesUnderRoot != uLeafCount)
Quit("WeightsFromTreee: Internal error, root count %u %u",
uLeavesUnderRoot, uLeafCount);
#if TRACE
Log("Node Leaves Length Strength\n");
Log("---- ------ -------- --------\n");
// 1234 123456 12345678 12345678
#endif
double *Strengths = new double[uNodeCount];
for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
{
if (tree.IsRoot(uNodeIndex))
{
Strengths[uNodeIndex] = 0.0;
continue;
}
const unsigned uParent = tree.GetParent(uNodeIndex);
const double dLength = tree.GetEdgeLength(uNodeIndex, uParent);
const unsigned uLeaves = LeavesUnderNode[uNodeIndex];
const double dStrength = dLength / (double) uLeaves;
Strengths[uNodeIndex] = dStrength;
#if TRACE
Log("%4u %6u %8g %8g\n", uNodeIndex, uLeaves, dLength, dStrength);
#endif
}
#if TRACE
Log("\n");
Log(" Seq Path..Weight\n");
Log("-------------------- ------------\n");
#endif
for (unsigned n = 0; n < uLeafCount; ++n)
{
const unsigned uLeafNodeIndex = tree.LeafIndexToNodeIndex(n);
#if TRACE
Log("%20.20s %4u ", tree.GetLeafName(uLeafNodeIndex), uLeafNodeIndex);
#endif
if (!tree.IsLeaf(uLeafNodeIndex))
Quit("CalcClustalWWeights: leaf");
double dWeight = 0;
unsigned uNode = uLeafNodeIndex;
while (!tree.IsRoot(uNode))
{
dWeight += Strengths[uNode];
uNode = tree.GetParent(uNode);
#if TRACE
Log("->%u(%g)", uNode, Strengths[uNode]);
#endif
}
if (dWeight < 0.0001)
{
#if TRACE
Log("zero->one");
#endif
dWeight = 1.0;
}
Weights[n] = (WEIGHT) dWeight;
#if TRACE
Log(" = %g\n", dWeight);
#endif
}
delete[] Strengths;
delete[] LeavesUnderNode;
Normalize(Weights, uLeafCount);
}
void MSA::SetClustalWWeights(const Tree &tree)
{
const unsigned uSeqCount = GetSeqCount();
const unsigned uLeafCount = tree.GetLeafCount();
WEIGHT *Weights = new WEIGHT[uSeqCount];
CalcClustalWWeights(tree, Weights);
for (unsigned n = 0; n < uLeafCount; ++n)
{
const WEIGHT w = Weights[n];
const unsigned uLeafNodeIndex = tree.LeafIndexToNodeIndex(n);
const unsigned uId = tree.GetLeafId(uLeafNodeIndex);
const unsigned uSeqIndex = GetSeqIndex(uId);
#if DEBUG
if (GetSeqName(uSeqIndex) != tree.GetLeafName(uLeafNodeIndex))
Quit("MSA::SetClustalWWeights: names don't match");
#endif
SetSeqWeight(uSeqIndex, w);
}
NormalizeWeights((WEIGHT) 1.0);
delete[] Weights;
}

View File

@ -0,0 +1,189 @@
#include "muscle.h"
#include "msa.h"
static int Blosum62[23][23] =
{
// A B C D E F G H I K L M N P Q R S T V W X Y Z
+4, -2, +0, -2, -1, -2, +0, -2, -1, -1, -1, -1, -2, -1, -1, -1, +1, +0, +0, -3, -1, -2, -1, // A
-2, +6, -3, +6, +2, -3, -1, -1, -3, -1, -4, -3, +1, -1, +0, -2, +0, -1, -3, -4, -1, -3, +2, // B
+0, -3, +9, -3, -4, -2, -3, -3, -1, -3, -1, -1, -3, -3, -3, -3, -1, -1, -1, -2, -1, -2, -4, // C
-2, +6, -3, +6, +2, -3, -1, -1, -3, -1, -4, -3, +1, -1, +0, -2, +0, -1, -3, -4, -1, -3, +2, // D
-1, +2, -4, +2, +5, -3, -2, +0, -3, +1, -3, -2, +0, -1, +2, +0, +0, -1, -2, -3, -1, -2, +5, // E
-2, -3, -2, -3, -3, +6, -3, -1, +0, -3, +0, +0, -3, -4, -3, -3, -2, -2, -1, +1, -1, +3, -3, // F
+0, -1, -3, -1, -2, -3, +6, -2, -4, -2, -4, -3, +0, -2, -2, -2, +0, -2, -3, -2, -1, -3, -2, // G
-2, -1, -3, -1, +0, -1, -2, +8, -3, -1, -3, -2, +1, -2, +0, +0, -1, -2, -3, -2, -1, +2, +0, // H
-1, -3, -1, -3, -3, +0, -4, -3, +4, -3, +2, +1, -3, -3, -3, -3, -2, -1, +3, -3, -1, -1, -3, // I
-1, -1, -3, -1, +1, -3, -2, -1, -3, +5, -2, -1, +0, -1, +1, +2, +0, -1, -2, -3, -1, -2, +1, // K
-1, -4, -1, -4, -3, +0, -4, -3, +2, -2, +4, +2, -3, -3, -2, -2, -2, -1, +1, -2, -1, -1, -3, // L
-1, -3, -1, -3, -2, +0, -3, -2, +1, -1, +2, +5, -2, -2, +0, -1, -1, -1, +1, -1, -1, -1, -2, // M
-2, +1, -3, +1, +0, -3, +0, +1, -3, +0, -3, -2, +6, -2, +0, +0, +1, +0, -3, -4, -1, -2, +0, // N
-1, -1, -3, -1, -1, -4, -2, -2, -3, -1, -3, -2, -2, +7, -1, -2, -1, -1, -2, -4, -1, -3, -1, // P
-1, +0, -3, +0, +2, -3, -2, +0, -3, +1, -2, +0, +0, -1, +5, +1, +0, -1, -2, -2, -1, -1, +2, // Q
-1, -2, -3, -2, +0, -3, -2, +0, -3, +2, -2, -1, +0, -2, +1, +5, -1, -1, -3, -3, -1, -2, +0, // R
+1, +0, -1, +0, +0, -2, +0, -1, -2, +0, -2, -1, +1, -1, +0, -1, +4, +1, -2, -3, -1, -2, +0, // S
+0, -1, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1, +0, -1, -1, -1, +1, +5, +0, -2, -1, -2, -1, // T
+0, -3, -1, -3, -2, -1, -3, -3, +3, -2, +1, +1, -3, -2, -2, -3, -2, +0, +4, -3, -1, -1, -2, // V
-3, -4, -2, -4, -3, +1, -2, -2, -3, -3, -2, -1, -4, -4, -2, -3, -3, -2, -3,+11, -1, +2, -3, // W
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // X
-2, -3, -2, -3, -2, +3, -3, +2, -1, -2, -1, -1, -2, -3, -1, -2, -2, -2, -1, +2, -1, +7, -2, // Y
-1, +2, -4, +2, +5, -3, -2, +0, -3, +1, -3, -2, +0, -1, +2, +0, +0, -1, -2, -3, -1, -2, +5, // Z
};
static int toi_tab[26] =
{
0, // A
1, // B
2, // C
3, // D
4, // E
5, // F
6, // G
7, // H
8, // I
-1, // J
9, // K
10, // L
11, // M
12, // N
-1, // O
13, // P
14, // Q
15, // R
16, // S
17, // T
17, // U
18, // V
19, // W
20, // X
21, // Y
22, // Z
};
static int toi(char c)
{
c = toupper(c);
return toi_tab[c - 'A'];
}
static int BlosumScore(char c1, char c2)
{
int i1 = toi(c1);
int i2 = toi(c2);
return Blosum62[i1][i2];
}
/***
Consider a column with 5 As and 3 Bs.
There are:
5x4 pairs of As.
3x2 pairs of Bs.
5x3x2 AB pairs
8x7 = 5x4 + 3x2 + 5x3x2 pairs of letters
***/
static double BlosumScoreCol(const MSA &a, unsigned uColIndex)
{
int iCounts[23];
memset(iCounts, 0, sizeof(iCounts));
const unsigned uSeqCount = a.GetSeqCount();
unsigned uCharCount = 0;
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
{
char c = a.GetChar(uSeqIndex, uColIndex);
if (IsGapChar(c))
continue;
int iChar = toi(c);
++iCounts[iChar];
++uCharCount;
}
if (uCharCount < 2)
return -9;
int iTotalScore = 0;
for (int i1 = 0; i1 < 23; ++i1)
{
int iCounts1 = iCounts[i1];
iTotalScore += iCounts1*(iCounts1 - 1)*Blosum62[i1][i1];
for (int i2 = i1 + 1; i2 < 23; ++i2)
iTotalScore += iCounts[i2]*iCounts1*2*Blosum62[i1][i2];
}
int iPairCount = uCharCount*(uCharCount - 1);
return (double) iTotalScore / (double) iPairCount;
}
/***
Consider a column with 5 As and 3 Bs.
A residue of type Q scores:
5xAQ + 3xBQ
***/
static void AssignColorsCol(const MSA &a, unsigned uColIndex, int **Colors)
{
int iCounts[23];
memset(iCounts, 0, sizeof(iCounts));
const unsigned uSeqCount = a.GetSeqCount();
unsigned uCharCount = 0;
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
{
char c = a.GetChar(uSeqIndex, uColIndex);
if (IsGapChar(c))
continue;
int iChar = toi(c);
++iCounts[iChar];
++uCharCount;
}
int iMostConservedType = -1;
int iMostConservedCount = -1;
for (unsigned i = 0; i < 23; ++i)
{
if (iCounts[i] > iMostConservedCount)
{
iMostConservedType = i;
iMostConservedCount = iCounts[i];
}
}
double dColScore = BlosumScoreCol(a, uColIndex);
int c;
if (dColScore >= 3.0)
c = 3;
//else if (dColScore >= 1.0)
// c = 2;
else if (dColScore >= 0.2)
c = 1;
else
c = 0;
int Color[23];
for (unsigned uLetter = 0; uLetter < 23; ++uLetter)
{
double dScore = Blosum62[uLetter][iMostConservedType];
if (dScore >= dColScore)
Color[uLetter] = c;
else
Color[uLetter] = 0;
}
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
{
char c = a.GetChar(uSeqIndex, uColIndex);
if (IsGapChar(c))
{
Colors[uSeqIndex][uColIndex] = 0;
continue;
}
int iLetter = toi(c);
if (iLetter >= 0 && iLetter < 23)
Colors[uSeqIndex][uColIndex] = Color[iLetter];
else
Colors[uSeqIndex][uColIndex] = 0;
}
}
void AssignColors(const MSA &a, int **Colors)
{
const unsigned uColCount = a.GetColCount();
for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
AssignColorsCol(a, uColIndex, Colors);
}

View File

@ -0,0 +1,118 @@
/***
Conservation value for a column in an MSA is defined as the number
of times the most common letter appears divided by the number of
sequences.
***/
#include "muscle.h"
#include "msa.h"
#include <math.h>
double MSA::GetAvgCons() const
{
assert(GetSeqCount() > 0);
double dSum = 0;
unsigned uNonGapColCount = 0;
for (unsigned uColIndex = 0; uColIndex < GetColCount(); ++uColIndex)
{
if (!IsGapColumn(uColIndex))
{
dSum += GetCons(uColIndex);
++uNonGapColCount;
}
}
assert(uNonGapColCount > 0);
double dAvg = dSum / uNonGapColCount;
assert(dAvg > 0 && dAvg <= 1);
return dAvg;
}
double MSA::GetCons(unsigned uColIndex) const
{
unsigned Counts[MAX_ALPHA];
for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter)
Counts[uLetter] = 0;
unsigned uMaxCount = 0;
for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)
{
if (IsGap(uSeqIndex, uColIndex))
continue;
char c = GetChar(uSeqIndex, uColIndex);
c = toupper(c);
if ('X' == c || 'B' == c || 'Z' == c)
continue;
unsigned uLetter = GetLetter(uSeqIndex, uColIndex);
unsigned uCount = Counts[uLetter] + 1;
if (uCount > uMaxCount)
uMaxCount = uCount;
Counts[uLetter] = uCount;
}
// Cons is undefined for all-gap column
if (0 == uMaxCount)
{
// assert(false);
return 1;
}
double dCons = (double) uMaxCount / (double) GetSeqCount();
assert(dCons > 0 && dCons <= 1);
return dCons;
}
// Perecent identity of a pair of sequences.
// Positions with one or both gapped are ignored.
double MSA::GetPctIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const
{
const unsigned uColCount = GetColCount();
unsigned uPosCount = 0;
unsigned uSameCount = 0;
for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
{
const char c1 = GetChar(uSeqIndex1, uColIndex);
const char c2 = GetChar(uSeqIndex2, uColIndex);
if (IsGapChar(c1) || IsGapChar(c2))
continue;
if (c1 == c2)
++uSameCount;
++uPosCount;
}
if (0 == uPosCount)
return 0;
return (double) uSameCount / (double) uPosCount;
}
// Perecent group identity of a pair of sequences.
// Positions with one or both gapped are ignored.
double MSA::GetPctGroupIdentityPair(unsigned uSeqIndex1,
unsigned uSeqIndex2) const
{
extern unsigned ResidueGroup[];
const unsigned uColCount = GetColCount();
unsigned uPosCount = 0;
unsigned uSameCount = 0;
for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
{
if (IsGap(uSeqIndex1, uColIndex))
continue;
if (IsGap(uSeqIndex2, uColIndex))
continue;
if (IsWildcard(uSeqIndex1, uColIndex))
continue;
if (IsWildcard(uSeqIndex2, uColIndex))
continue;
const unsigned uLetter1 = GetLetter(uSeqIndex1, uColIndex);
const unsigned uLetter2 = GetLetter(uSeqIndex2, uColIndex);
const unsigned uGroup1 = ResidueGroup[uLetter1];
const unsigned uGroup2 = ResidueGroup[uLetter2];
if (uGroup1 == uGroup2)
++uSameCount;
++uPosCount;
}
if (0 == uPosCount)
return 0;
return (double) uSameCount / (double) uPosCount;
}

View File

@ -0,0 +1,378 @@
#include "muscle.h"
#include "diaglist.h"
#include "pwpath.h"
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#define MIN(x, y) ((x) < (y) ? (x) : (y))
void DiagList::Add(const Diag &d)
{
if (m_uCount == MAX_DIAGS)
Quit("DiagList::Add, overflow %u", m_uCount);
m_Diags[m_uCount] = d;
++m_uCount;
}
void DiagList::Add(unsigned uStartPosA, unsigned uStartPosB, unsigned uLength)
{
Diag d;
d.m_uStartPosA = uStartPosA;
d.m_uStartPosB = uStartPosB;
d.m_uLength = uLength;
Add(d);
}
const Diag &DiagList::Get(unsigned uIndex) const
{
if (uIndex >= m_uCount)
Quit("DiagList::Get(%u), count=%u", uIndex, m_uCount);
return m_Diags[uIndex];
}
void DiagList::LogMe() const
{
Log("DiagList::LogMe, count=%u\n", m_uCount);
Log(" n StartA StartB Length\n");
Log("--- ------ ------ ------\n");
for (unsigned n = 0; n < m_uCount; ++n)
{
const Diag &d = m_Diags[n];
Log("%3u %6u %6u %6u\n",
n, d.m_uStartPosA, d.m_uStartPosB, d.m_uLength);
}
}
void DiagList::FromPath(const PWPath &Path)
{
Clear();
const unsigned uEdgeCount = Path.GetEdgeCount();
unsigned uLength = 0;
unsigned uStartPosA;
unsigned uStartPosB;
for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex)
{
const PWEdge &Edge = Path.GetEdge(uEdgeIndex);
// Typical cases
if (Edge.cType == 'M')
{
if (0 == uLength)
{
uStartPosA = Edge.uPrefixLengthA - 1;
uStartPosB = Edge.uPrefixLengthB - 1;
}
++uLength;
}
else
{
if (uLength >= g_uMinDiagLength)
Add(uStartPosA, uStartPosB, uLength);
uLength = 0;
}
}
// Special case for last edge
if (uLength >= g_uMinDiagLength)
Add(uStartPosA, uStartPosB, uLength);
}
bool DiagList::NonZeroIntersection(const Diag &d) const
{
for (unsigned n = 0; n < m_uCount; ++n)
{
const Diag &d2 = m_Diags[n];
if (DiagOverlap(d, d2) > 0)
return true;
}
return false;
}
// DialogOverlap returns the length of the overlapping
// section of the two diagonals along the diagonals
// themselves; in other words, the length of
// the intersection of the two sets of cells in
// the matrix.
unsigned DiagOverlap(const Diag &d1, const Diag &d2)
{
// Determine where the diagonals intersect the A
// axis (extending them if required). If they
// intersect at different points, they do not
// overlap. Coordinates on a diagonal are
// given by B = A + c where c is the value of
// A at the intersection with the A axis.
// Hence, c = B - A for any point on the diagonal.
int c1 = (int) d1.m_uStartPosB - (int) d1.m_uStartPosA;
int c2 = (int) d2.m_uStartPosB - (int) d2.m_uStartPosA;
if (c1 != c2)
return 0;
assert(DiagOverlapA(d1, d2) == DiagOverlapB(d1, d2));
return DiagOverlapA(d1, d2);
}
// DialogOverlapA returns the length of the overlapping
// section of the projection of the two diagonals onto
// the A axis.
unsigned DiagOverlapA(const Diag &d1, const Diag &d2)
{
unsigned uMaxStart = MAX(d1.m_uStartPosA, d2.m_uStartPosA);
unsigned uMinEnd = MIN(d1.m_uStartPosA + d1.m_uLength - 1,
d2.m_uStartPosA + d2.m_uLength - 1);
int iLength = (int) uMinEnd - (int) uMaxStart + 1;
if (iLength < 0)
return 0;
return (unsigned) iLength;
}
// DialogOverlapB returns the length of the overlapping
// section of the projection of the two diagonals onto
// the B axis.
unsigned DiagOverlapB(const Diag &d1, const Diag &d2)
{
unsigned uMaxStart = MAX(d1.m_uStartPosB, d2.m_uStartPosB);
unsigned uMinEnd = MIN(d1.m_uStartPosB + d1.m_uLength - 1,
d2.m_uStartPosB + d2.m_uLength - 1);
int iLength = (int) uMinEnd - (int) uMaxStart + 1;
if (iLength < 0)
return 0;
return (unsigned) iLength;
}
// Returns true if the two diagonals can be on the
// same path through the DP matrix. If DiagCompatible
// returns false, they cannot be in the same path
// and hence "contradict" each other.
bool DiagCompatible(const Diag &d1, const Diag &d2)
{
if (DiagOverlap(d1, d2) > 0)
return true;
return 0 == DiagOverlapA(d1, d2) && 0 == DiagOverlapB(d1, d2);
}
// Returns the length of the "break" between two diagonals.
unsigned DiagBreak(const Diag &d1, const Diag &d2)
{
int c1 = (int) d1.m_uStartPosB - (int) d1.m_uStartPosA;
int c2 = (int) d2.m_uStartPosB - (int) d2.m_uStartPosA;
if (c1 != c2)
return 0;
int iMaxStart = MAX(d1.m_uStartPosA, d2.m_uStartPosA);
int iMinEnd = MIN(d1.m_uStartPosA + d1.m_uLength - 1,
d2.m_uStartPosA + d1.m_uLength - 1);
int iBreak = iMaxStart - iMinEnd - 1;
if (iBreak < 0)
return 0;
return (unsigned) iBreak;
}
// Merge diagonals that are continuations of each other with
// short breaks of up to length g_uMaxDiagBreak.
// In a sorted list of diagonals, we only have to check
// consecutive entries.
void MergeDiags(DiagList &DL)
{
return;
#if DEBUG
if (!DL.IsSorted())
Quit("MergeDiags: !IsSorted");
#endif
// TODO: Fix this!
// Breaks must be with no offset (no gaps)
const unsigned uCount = DL.GetCount();
if (uCount <= 1)
return;
DiagList NewList;
Diag MergedDiag;
const Diag *ptrPrev = &DL.Get(0);
for (unsigned i = 1; i < uCount; ++i)
{
const Diag *ptrDiag = &DL.Get(i);
unsigned uBreakLength = DiagBreak(*ptrPrev, *ptrDiag);
if (uBreakLength <= g_uMaxDiagBreak)
{
MergedDiag.m_uStartPosA = ptrPrev->m_uStartPosA;
MergedDiag.m_uStartPosB = ptrPrev->m_uStartPosB;
MergedDiag.m_uLength = ptrPrev->m_uLength + ptrDiag->m_uLength
+ uBreakLength;
ptrPrev = &MergedDiag;
}
else
{
NewList.Add(*ptrPrev);
ptrPrev = ptrDiag;
}
}
NewList.Add(*ptrPrev);
DL.Copy(NewList);
}
void DiagList::DeleteIncompatible()
{
assert(IsSorted());
if (m_uCount < 2)
return;
bool *bFlagForDeletion = new bool[m_uCount];
for (unsigned i = 0; i < m_uCount; ++i)
bFlagForDeletion[i] = false;
for (unsigned i = 0; i < m_uCount; ++i)
{
const Diag &di = m_Diags[i];
for (unsigned j = i + 1; j < m_uCount; ++j)
{
const Diag &dj = m_Diags[j];
// Verify sorted correctly
assert(di.m_uStartPosA <= dj.m_uStartPosA);
// If two diagonals are incompatible and
// one is is much longer than the other,
// keep the longer one.
if (!DiagCompatible(di, dj))
{
if (di.m_uLength > dj.m_uLength*4)
bFlagForDeletion[j] = true;
else if (dj.m_uLength > di.m_uLength*4)
bFlagForDeletion[i] = true;
else
{
bFlagForDeletion[i] = true;
bFlagForDeletion[j] = true;
}
}
}
}
for (unsigned i = 0; i < m_uCount; ++i)
{
const Diag &di = m_Diags[i];
if (bFlagForDeletion[i])
continue;
for (unsigned j = i + 1; j < m_uCount; ++j)
{
const Diag &dj = m_Diags[j];
if (bFlagForDeletion[j])
continue;
// Verify sorted correctly
assert(di.m_uStartPosA <= dj.m_uStartPosA);
// If sort order in B different from sorted order in A,
// either diags are incompatible or we detected a repeat
// or permutation.
if (di.m_uStartPosB >= dj.m_uStartPosB || !DiagCompatible(di, dj))
{
bFlagForDeletion[i] = true;
bFlagForDeletion[j] = true;
}
}
}
unsigned uNewCount = 0;
Diag *NewDiags = new Diag[m_uCount];
for (unsigned i = 0; i < m_uCount; ++i)
{
if (bFlagForDeletion[i])
continue;
const Diag &d = m_Diags[i];
NewDiags[uNewCount] = d;
++uNewCount;
}
memcpy(m_Diags, NewDiags, uNewCount*sizeof(Diag));
m_uCount = uNewCount;
delete[] NewDiags;
}
void DiagList::Copy(const DiagList &DL)
{
Clear();
unsigned uCount = DL.GetCount();
for (unsigned i = 0; i < uCount; ++i)
Add(DL.Get(i));
}
// Check if sorted in increasing order of m_uStartPosA
bool DiagList::IsSorted() const
{
return true;
unsigned uCount = GetCount();
for (unsigned i = 1; i < uCount; ++i)
if (m_Diags[i-1].m_uStartPosA > m_Diags[i].m_uStartPosA)
return false;
return true;
}
// Sort in increasing order of m_uStartPosA
// Dumb bubble sort, but don't care about speed
// because don't get long lists.
void DiagList::Sort()
{
if (m_uCount < 2)
return;
bool bContinue = true;
while (bContinue)
{
bContinue = false;
for (unsigned i = 0; i < m_uCount - 1; ++i)
{
if (m_Diags[i].m_uStartPosA > m_Diags[i+1].m_uStartPosA)
{
Diag Tmp = m_Diags[i];
m_Diags[i] = m_Diags[i+1];
m_Diags[i+1] = Tmp;
bContinue = true;
}
}
}
}
//void TestDiag()
// {
// Diag d1;
// Diag d2;
// Diag d3;
//
// d1.m_uStartPosA = 0;
// d1.m_uStartPosB = 1;
// d1.m_uLength = 32;
//
// d2.m_uStartPosA = 55;
// d2.m_uStartPosB = 70;
// d2.m_uLength = 36;
//
// d3.m_uStartPosA = 102;
// d3.m_uStartPosB = 122;
// d3.m_uLength = 50;
//
// DiagList DL;
// DL.Add(d1);
// DL.Add(d2);
// DL.Add(d3);
//
// Log("Before DeleteIncompatible:\n");
// DL.LogMe();
// DL.DeleteIncompatible();
//
// Log("After DeleteIncompatible:\n");
// DL.LogMe();
//
// MergeDiags(DL);
// Log("After Merge:\n");
// DL.LogMe();
//
// DPRegionList RL;
// DiagListToDPRegionList(DL, RL, 200, 200);
// RL.LogMe();
// }

View File

@ -0,0 +1,89 @@
#ifndef diaglist_h
#define diaglist_h
const unsigned EMPTY = (unsigned) ~0;
const unsigned MAX_DIAGS = 1024;
struct Diag
{
unsigned m_uStartPosA;
unsigned m_uStartPosB;
unsigned m_uLength;
};
struct Rect
{
unsigned m_uStartPosA;
unsigned m_uStartPosB;
unsigned m_uLengthA;
unsigned m_uLengthB;
};
class DiagList
{
public:
DiagList()
{
m_uCount = 0;
}
~DiagList()
{
Free();
}
public:
// Creation
void Clear()
{
Free();
}
void FromPath(const PWPath &Path);
void Add(const Diag &d);
void Add(unsigned uStartPosA, unsigned uStartPosB, unsigned uLength);
void DeleteIncompatible();
// Accessors
unsigned GetCount() const
{
return m_uCount;
}
const Diag &Get(unsigned uIndex) const;
// Operations
void Sort();
void Copy(const DiagList &DL);
// Query
// returns true iff given diagonal is included in the list
// in whole or in part.
bool NonZeroIntersection(const Diag &d) const;
bool IsSorted() const;
// Diagnostics
void LogMe() const;
private:
void Free()
{
m_uCount = 0;
}
private:
unsigned m_uCount;
Diag m_Diags[MAX_DIAGS];
};
unsigned DiagOverlap(const Diag &d1, const Diag &d2);
unsigned DiagOverlapA(const Diag &d1, const Diag &d2);
unsigned DiagOverlapB(const Diag &d1, const Diag &d2);
unsigned DiagBreak(const Diag &d1, const Diag &d2);
bool DiagCompatible(const Diag &d1, const Diag &d2);
void CheckDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
unsigned uLengthB, const MSA &msaA, const MSA &msaB, const PWPath &Path);
void FindDiags(const ProfPos *PX, unsigned uLengthX, const ProfPos *PY,
unsigned uLengthY, DiagList &DL);
void FindDiagsNuc(const ProfPos *PX, unsigned uLengthX, const ProfPos *PY,
unsigned uLengthY, DiagList &DL);
void MergeDiags(DiagList &DL);
#endif // diaglist_h

View File

@ -0,0 +1,162 @@
#include "muscle.h"
#include "msa.h"
#include "objscore.h"
#include "profile.h"
#define TRACE 0
#define COMPARE_3_52 0
#define BRUTE_LETTERS 0
static SCORE ScoreColLetters(const MSA &msa, unsigned uColIndex)
{
SCOREMATRIX &Mx = *g_ptrScoreMatrix;
const unsigned uSeqCount = msa.GetSeqCount();
#if BRUTE_LETTERS
SCORE BruteScore = 0;
for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1)
{
unsigned uLetter1 = msa.GetLetterEx(uSeqIndex1, uColIndex);
if (uLetter1 >= g_AlphaSize)
continue;
WEIGHT w1 = msa.GetSeqWeight(uSeqIndex1);
for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount; ++uSeqIndex2)
{
unsigned uLetter2 = msa.GetLetterEx(uSeqIndex2, uColIndex);
if (uLetter2 >= g_AlphaSize)
continue;
WEIGHT w2 = msa.GetSeqWeight(uSeqIndex2);
BruteScore += w1*w2*Mx[uLetter1][uLetter2];
}
}
#endif
double N = 0;
for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1)
{
WEIGHT w = msa.GetSeqWeight(uSeqIndex1);
N += w;
}
if (N <= 0)
return 0;
FCOUNT Freqs[20];
memset(Freqs, 0, sizeof(Freqs));
SCORE Score = 0;
for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1)
{
unsigned uLetter = msa.GetLetterEx(uSeqIndex1, uColIndex);
if (uLetter >= g_AlphaSize)
continue;
WEIGHT w = msa.GetSeqWeight(uSeqIndex1);
Freqs[uLetter] += w;
Score -= w*w*Mx[uLetter][uLetter];
}
for (unsigned uLetter1 = 0; uLetter1 < g_AlphaSize; ++uLetter1)
{
const FCOUNT f1 = Freqs[uLetter1];
Score += f1*f1*Mx[uLetter1][uLetter1];
for (unsigned uLetter2 = uLetter1 + 1; uLetter2 < g_AlphaSize; ++uLetter2)
{
const FCOUNT f2 = Freqs[uLetter2];
Score += 2*f1*f2*Mx[uLetter1][uLetter2];
}
}
Score /= 2;
#if BRUTE_LETTERS
assert(BTEq(BruteScore, Score));
#endif
return Score;
}
static SCORE ScoreLetters(const MSA &msa, const unsigned Edges[],
unsigned uEdgeCount)
{
const unsigned uSeqCount = msa.GetSeqCount();
const unsigned uColCount = msa.GetColCount();
// Letters
SCORE Score = 0;
for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex)
{
const unsigned uColIndex = Edges[uEdgeIndex];
assert(uColIndex < uColCount);
Score += ScoreColLetters(msa, uColIndex);
}
return Score;
}
void GetLetterScores(const MSA &msa, SCORE Scores[])
{
const unsigned uColCount = msa.GetColCount();
for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
Scores[uColIndex] = ScoreColLetters(msa, uColIndex);
}
SCORE DiffObjScore(
const MSA &msa1, const PWPath &Path1, const unsigned Edges1[], unsigned uEdgeCount1,
const MSA &msa2, const PWPath &Path2, const unsigned Edges2[], unsigned uEdgeCount2)
{
#if TRACE
{
Log("============DiffObjScore===========\n");
Log("msa1:\n");
msa1.LogMe();
Log("\n");
Log("Cols1: ");
for (unsigned i = 0; i < uEdgeCount1; ++i)
Log(" %u", Edges1[i]);
Log("\n\n");
Log("msa2:\n");
msa2.LogMe();
Log("Cols2: ");
for (unsigned i = 0; i < uEdgeCount2; ++i)
Log(" %u", Edges2[i]);
Log("\n\n");
}
#endif
#if COMPARE_3_52
extern SCORE g_SPScoreLetters;
extern SCORE g_SPScoreGaps;
SCORE SP1 = ObjScoreSP(msa1);
SCORE SPLetters1 = g_SPScoreLetters;
SCORE SPGaps1 = g_SPScoreGaps;
SCORE SP2 = ObjScoreSP(msa2);
SCORE SPLetters2 = g_SPScoreLetters;
SCORE SPGaps2 = g_SPScoreGaps;
SCORE SPDiffLetters = SPLetters2 - SPLetters1;
SCORE SPDiffGaps = SPGaps2 - SPGaps1;
SCORE SPDiff = SPDiffLetters + SPDiffGaps;
#endif
SCORE Letters1 = ScoreLetters(msa1, Edges1, uEdgeCount1);
SCORE Letters2 = ScoreLetters(msa2, Edges2, uEdgeCount2);
SCORE Gaps1 = ScoreGaps(msa1, Edges1, uEdgeCount1);
SCORE Gaps2 = ScoreGaps(msa2, Edges2, uEdgeCount2);
SCORE DiffLetters = Letters2 - Letters1;
SCORE DiffGaps = Gaps2 - Gaps1;
SCORE Diff = DiffLetters + DiffGaps;
#if COMPARE_3_52
Log("ObjScoreSP Letters1=%.4g Letters2=%.4g DiffLetters=%.4g\n",
SPLetters1, SPLetters2, SPDiffLetters);
Log("DiffObjScore Letters1=%.4g Letters2=%.4g DiffLetters=%.4g\n",
Letters1, Letters2, DiffLetters);
Log("ObjScoreSP Gaps1=%.4g Gaps2=%.4g DiffGaps=%.4g\n",
SPGaps1, SPGaps2, SPDiffGaps);
Log("DiffObjScore Gaps1=%.4g Gaps2=%.4g DiffGaps=%.4g\n",
Gaps1, Gaps2, DiffGaps);
Log("SP diff=%.4g DiffObjScore Diff=%.4g\n", SPDiff, Diff);
#endif
return Diff;
}

View File

@ -0,0 +1,114 @@
#include "muscle.h"
#include "pwpath.h"
#define TRACE 0
void DiffPaths(const PWPath &p1, const PWPath &p2, unsigned Edges1[],
unsigned *ptruDiffCount1, unsigned Edges2[], unsigned *ptruDiffCount2)
{
#if TRACE
Log("DiffPaths\n");
Log("p1=");
p1.LogMe();
Log("p2=");
p2.LogMe();
#endif
const unsigned uEdgeCount1 = p1.GetEdgeCount();
const unsigned uEdgeCount2 = p2.GetEdgeCount();
unsigned uDiffCount1 = 0;
unsigned uDiffCount2 = 0;
unsigned uEdgeIndex1 = 0;
unsigned uEdgeIndex2 = 0;
const PWEdge *Edge1 = &p1.GetEdge(uEdgeIndex1);
const PWEdge *Edge2 = &p2.GetEdge(uEdgeIndex2);
for (;;)
{
unsigned uEdgeIndexTop1 = uEdgeIndex1;
unsigned uEdgeIndexTop2 = uEdgeIndex2;
Edge1 = &p1.GetEdge(uEdgeIndex1);
Edge2 = &p2.GetEdge(uEdgeIndex2);
#if TRACE
Log("e1[%u] PLA%u PLB%u %c, e2[%u] PLA%u PLB %u %c DC1=%u DC2=%u\n",
uEdgeIndex1, Edge1->uPrefixLengthA, Edge1->uPrefixLengthB, Edge1->cType,
uEdgeIndex2, Edge2->uPrefixLengthA, Edge2->uPrefixLengthB, Edge2->cType,
uDiffCount1, uDiffCount2);
#endif
if (Edge1->uPrefixLengthA == Edge2->uPrefixLengthA &&
Edge1->uPrefixLengthB == Edge2->uPrefixLengthB)
{
if (!Edge1->Equal(*Edge2))
{
Edges1[uDiffCount1++] = uEdgeIndex1;
Edges2[uDiffCount2++] = uEdgeIndex2;
}
++uEdgeIndex1;
++uEdgeIndex2;
}
else if (Edge2->uPrefixLengthA < Edge1->uPrefixLengthA ||
Edge2->uPrefixLengthB < Edge1->uPrefixLengthB)
Edges2[uDiffCount2++] = uEdgeIndex2++;
else if (Edge1->uPrefixLengthA < Edge2->uPrefixLengthA ||
Edge1->uPrefixLengthB < Edge2->uPrefixLengthB)
Edges1[uDiffCount1++] = uEdgeIndex1++;
if (uEdgeCount1 == uEdgeIndex1)
{
while (uEdgeIndex2 < uEdgeCount2)
Edges2[uDiffCount2++] = uEdgeIndex2++;
goto Done;
}
if (uEdgeCount2 == uEdgeIndex2)
{
while (uEdgeIndex1 < uEdgeCount1)
Edges1[uDiffCount1++] = uEdgeIndex1++;
goto Done;
}
if (uEdgeIndex1 == uEdgeIndexTop1 && uEdgeIndex2 == uEdgeIndexTop2)
Quit("DiffPaths stuck");
}
Done:;
#if TRACE
Log("DiffCount1=%u (%u %u)\n", uDiffCount1, uEdgeCount1, uEdgeCount2);
Log("Diffs1=");
for (unsigned i = 0; i < uDiffCount1; ++i)
{
const PWEdge e = p1.GetEdge(Edges1[i]);
Log(" %u=%c%u.%u", Edges1[i], e.cType, e.uPrefixLengthA, e.uPrefixLengthB);
}
Log("\n");
Log("DiffCount2=%u\n", uDiffCount2);
Log("Diffs2=");
for (unsigned i = 0; i < uDiffCount2; ++i)
{
const PWEdge e = p2.GetEdge(Edges2[i]);
Log(" %u=%c%u.%u", Edges2[i], e.cType, e.uPrefixLengthA, e.uPrefixLengthB);
}
Log("\n");
#endif
*ptruDiffCount1 = uDiffCount1;
*ptruDiffCount2 = uDiffCount2;
}
void TestDiffPaths()
{
PWPath p1;
PWPath p2;
p1.AppendEdge('M', 1, 1);
p1.AppendEdge('M', 2, 2);
p1.AppendEdge('M', 3, 3);
p2.AppendEdge('M', 1, 1);
p2.AppendEdge('D', 2, 1);
p2.AppendEdge('I', 2, 2);
p2.AppendEdge('M', 3, 3);
unsigned Edges1[64];
unsigned Edges2[64];
unsigned uDiffCount1;
unsigned uDiffCount2;
DiffPaths(p1, p2, Edges1, &uDiffCount1, Edges2, &uDiffCount2);
}

View File

@ -0,0 +1,381 @@
#include "muscle.h"
#include "tree.h"
#define TRACE 0
/***
Algorithm to compare two trees, X and Y.
A node x in X and node y in Y are defined to be
similar iff the set of leaves in the subtree under
x is identical to the set of leaves under y.
A node is defined to be dissimilar iff it is not
similar to any node in the other tree.
Nodes x and y are defined to be married iff every
node in the subtree under x is similar to a node
in the subtree under y. Married nodes are considered
to be equal. The subtrees under two married nodes can
at most differ by exchanges of left and right branches,
which we do not consider to be significant here.
A node is defined to be a bachelor iff it is not
married. If a node is a bachelor, then it has a
dissimilar node in its subtree, and it follows
immediately from the definition of marriage that its
parent is also a bachelor. Hence all nodes on the path
from a bachelor node to the root are bachelors.
We assume the trees have the same set of leaves, so
every leaf is trivially both similar and married to
the same leaf in the opposite tree. Bachelor nodes
are therefore always internal (i.e., non-leaf) nodes.
A node is defined to be a diff iff (a) it is married
and (b) its parent is a bachelor. The subtree under
a diff is maximally similar to the other tree. (In
other words, you cannot extend the subtree without
adding a bachelor).
The set of diffs is the subset of the two trees that
we consider to be identical.
Example:
-----A
-----k
----j -----B
--i -----C
------D
-----A
-----p
----n -----B
--m -----D
------C
The following pairs of internal nodes are similar.
Nodes Set of leaves
----- -------------
k,p A,B
i,m A,B,C,D
Bachelors in the first tree are i and j, bachelors
in the second tree are m and n.
Node k and p are married, but i and m are not (because j
and n are bachelors). The diffs are C, D and k.
The set of bachelor nodes can be viewed as the internal
nodes of a tree, the leaves of which are diffs. (To see
that there can't be disjoint subtrees, note that the path
from a diff to a root is all bachelor nodes, so there is
always a path between two diffs that goes through the root).
We call this tree the "diffs tree".
There is a simple O(N) algorithm to build the diffs tree.
To achieve O(N) we avoid traversing a given subtree multiple
times and also avoid comparing lists of leaves.
We visit nodes in depth-first order (i.e., a node is visited
before its parent).
If either child of a node is a bachelor, we flag it as
a bachelor.
If both children of the node we are visiting are married,
we check whether the spouses of those children have the
same parent in the other tree. If the parents are different,
the current node is a bachelor. If they have the same parent,
then the node we are visiting is the spouse of that parent.
We assign this newly identified married couple a unique integer
id. The id of a node is in one-to-one correspondence with the
set of leaves in its subtree. Two nodes have the same set of
leaves iff they have the same id. Bachelor nodes do not get
an id.
***/
static void BuildDiffs(const Tree &tree, unsigned uTreeNodeIndex,
const bool bIsDiff[], Tree &Diffs, unsigned uDiffsNodeIndex,
unsigned IdToDiffsLeafNodeIndex[])
{
#if TRACE
Log("BuildDiffs(TreeNode=%u IsDiff=%d IsLeaf=%d)\n",
uTreeNodeIndex, bIsDiff[uTreeNodeIndex], tree.IsLeaf(uTreeNodeIndex));
#endif
if (bIsDiff[uTreeNodeIndex])
{
unsigned uLeafCount = tree.GetLeafCount();
unsigned *Leaves = new unsigned[uLeafCount];
GetLeaves(tree, uTreeNodeIndex, Leaves, &uLeafCount);
for (unsigned n = 0; n < uLeafCount; ++n)
{
const unsigned uLeafNodeIndex = Leaves[n];
const unsigned uId = tree.GetLeafId(uLeafNodeIndex);
if (uId >= tree.GetLeafCount())
Quit("BuildDiffs, id out of range");
IdToDiffsLeafNodeIndex[uId] = uDiffsNodeIndex;
#if TRACE
Log(" Leaf id=%u DiffsNode=%u\n", uId, uDiffsNodeIndex);
#endif
}
delete[] Leaves;
return;
}
if (tree.IsLeaf(uTreeNodeIndex))
Quit("BuildDiffs: should never reach leaf");
const unsigned uTreeLeft = tree.GetLeft(uTreeNodeIndex);
const unsigned uTreeRight = tree.GetRight(uTreeNodeIndex);
const unsigned uDiffsLeft = Diffs.AppendBranch(uDiffsNodeIndex);
const unsigned uDiffsRight = uDiffsLeft + 1;
BuildDiffs(tree, uTreeLeft, bIsDiff, Diffs, uDiffsLeft, IdToDiffsLeafNodeIndex);
BuildDiffs(tree, uTreeRight, bIsDiff, Diffs, uDiffsRight, IdToDiffsLeafNodeIndex);
}
void DiffTrees(const Tree &Tree1, const Tree &Tree2, Tree &Diffs,
unsigned IdToDiffsLeafNodeIndex[])
{
#if TRACE
Log("Tree1:\n");
Tree1.LogMe();
Log("\n");
Log("Tree2:\n");
Tree2.LogMe();
#endif
if (!Tree1.IsRooted() || !Tree2.IsRooted())
Quit("DiffTrees: requires rooted trees");
const unsigned uNodeCount = Tree1.GetNodeCount();
const unsigned uNodeCount2 = Tree2.GetNodeCount();
const unsigned uLeafCount = Tree1.GetLeafCount();
const unsigned uLeafCount2 = Tree2.GetLeafCount();
assert(uLeafCount == uLeafCount2);
if (uNodeCount != uNodeCount2)
Quit("DiffTrees: different node counts");
// Allocate tables so we can convert tree node index to
// and from the unique id with a O(1) lookup.
unsigned *NodeIndexToId1 = new unsigned[uNodeCount];
unsigned *IdToNodeIndex2 = new unsigned[uNodeCount];
bool *bIsBachelor1 = new bool[uNodeCount];
bool *bIsDiff1 = new bool[uNodeCount];
for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
{
NodeIndexToId1[uNodeIndex] = uNodeCount;
bIsBachelor1[uNodeIndex] = false;
bIsDiff1[uNodeIndex] = false;
// Use uNodeCount as value meaning "not set".
IdToNodeIndex2[uNodeIndex] = uNodeCount;
}
// Initialize node index <-> id lookup tables
for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
{
if (Tree1.IsLeaf(uNodeIndex))
{
const unsigned uId = Tree1.GetLeafId(uNodeIndex);
if (uId >= uNodeCount)
Quit("Diff trees requires existing leaf ids in range 0 .. (N-1)");
NodeIndexToId1[uNodeIndex] = uId;
}
if (Tree2.IsLeaf(uNodeIndex))
{
const unsigned uId = Tree2.GetLeafId(uNodeIndex);
if (uId >= uNodeCount)
Quit("Diff trees requires existing leaf ids in range 0 .. (N-1)");
IdToNodeIndex2[uId] = uNodeIndex;
}
}
// Validity check. This verifies that the ids
// pre-assigned to the leaves in Tree1 are unique
// (note that the id<N check above does not rule
// out two leaves having duplicate ids).
for (unsigned uId = 0; uId < uLeafCount; ++uId)
{
unsigned uNodeIndex2 = IdToNodeIndex2[uId];
if (uNodeCount == uNodeIndex2)
Quit("DiffTrees, check 2");
}
// Ids assigned to internal nodes are N, N+1 ...
// An internal node id uniquely identifies a set
// of two or more leaves.
unsigned uInternalNodeId = uLeafCount;
// Depth-first traversal of tree.
// The order guarantees that a node is visited before
// its parent is visited.
for (unsigned uNodeIndex1 = Tree1.FirstDepthFirstNode();
NULL_NEIGHBOR != uNodeIndex1;
uNodeIndex1 = Tree1.NextDepthFirstNode(uNodeIndex1))
{
#if TRACE
Log("Main loop: Node1=%u IsLeaf=%d IsBachelor=%d\n",
uNodeIndex1,
Tree1.IsLeaf(uNodeIndex1),
bIsBachelor1[uNodeIndex1]);
#endif
// Leaves are trivial; nothing to do.
if (Tree1.IsLeaf(uNodeIndex1) || bIsBachelor1[uNodeIndex1])
continue;
// If either child is a bachelor, flag
// this node as a bachelor and continue.
unsigned uLeft1 = Tree1.GetLeft(uNodeIndex1);
if (bIsBachelor1[uLeft1])
{
bIsBachelor1[uNodeIndex1] = true;
continue;
}
unsigned uRight1 = Tree1.GetRight(uNodeIndex1);
if (bIsBachelor1[uRight1])
{
bIsBachelor1[uNodeIndex1] = true;
continue;
}
// Both children are married.
// Married nodes are guaranteed to have an id.
unsigned uIdLeft = NodeIndexToId1[uLeft1];
unsigned uIdRight = NodeIndexToId1[uRight1];
if (uIdLeft == uNodeCount || uIdRight == uNodeCount)
Quit("DiffTrees, check 5");
// uLeft2 is the spouse of uLeft1, and similarly for uRight2.
unsigned uLeft2 = IdToNodeIndex2[uIdLeft];
unsigned uRight2 = IdToNodeIndex2[uIdRight];
if (uLeft2 == uNodeCount || uRight2 == uNodeCount)
Quit("DiffTrees, check 6");
// If the spouses of uLeft1 and uRight1 have the same
// parent, then this parent is the spouse of uNodeIndex1.
// Otherwise, uNodeIndex1 is a diff.
unsigned uParentLeft2 = Tree2.GetParent(uLeft2);
unsigned uParentRight2 = Tree2.GetParent(uRight2);
#if TRACE
Log("L1=%u R1=%u L2=%u R2=%u PL2=%u PR2=%u\n",
uLeft1,
uRight1,
uLeft2,
uRight2,
uParentLeft2,
uParentRight2);
#endif
if (uParentLeft2 == uParentRight2)
{
NodeIndexToId1[uNodeIndex1] = uInternalNodeId;
IdToNodeIndex2[uInternalNodeId] = uParentLeft2;
++uInternalNodeId;
}
else
bIsBachelor1[uNodeIndex1] = true;
}
unsigned uDiffCount = 0;
for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
{
if (bIsBachelor1[uNodeIndex])
continue;
if (Tree1.IsRoot(uNodeIndex))
{
// Special case: if no bachelors, consider the
// root a diff.
if (!bIsBachelor1[uNodeIndex])
bIsDiff1[uNodeIndex] = true;
continue;
}
const unsigned uParent = Tree1.GetParent(uNodeIndex);
if (bIsBachelor1[uParent])
{
bIsDiff1[uNodeIndex] = true;
++uDiffCount;
}
}
#if TRACE
Log("Tree1:\n");
Log("Node Id Bach Diff Name\n");
Log("---- ---- ---- ---- ----\n");
for (unsigned n = 0; n < uNodeCount; ++n)
{
Log("%4u %4u %d %d",
n,
NodeIndexToId1[n],
bIsBachelor1[n],
bIsDiff1[n]);
if (Tree1.IsLeaf(n))
Log(" %s", Tree1.GetLeafName(n));
Log("\n");
}
Log("\n");
Log("Tree2:\n");
Log("Node Id Name\n");
Log("---- ---- ----\n");
for (unsigned n = 0; n < uNodeCount; ++n)
{
Log("%4u ", n);
if (Tree2.IsLeaf(n))
Log(" %s", Tree2.GetLeafName(n));
Log("\n");
}
#endif
Diffs.CreateRooted();
const unsigned uDiffsRootIndex = Diffs.GetRootNodeIndex();
const unsigned uRootIndex1 = Tree1.GetRootNodeIndex();
for (unsigned n = 0; n < uLeafCount; ++n)
IdToDiffsLeafNodeIndex[n] = uNodeCount;
BuildDiffs(Tree1, uRootIndex1, bIsDiff1, Diffs, uDiffsRootIndex,
IdToDiffsLeafNodeIndex);
#if TRACE
Log("\n");
Log("Diffs:\n");
Diffs.LogMe();
Log("\n");
Log("IdToDiffsLeafNodeIndex:");
for (unsigned n = 0; n < uLeafCount; ++n)
{
if (n%16 == 0)
Log("\n");
else
Log(" ");
Log("%u=%u", n, IdToDiffsLeafNodeIndex[n]);
}
Log("\n");
#endif
for (unsigned n = 0; n < uLeafCount; ++n)
if (IdToDiffsLeafNodeIndex[n] == uNodeCount)
Quit("TreeDiffs check 7");
delete[] NodeIndexToId1;
delete[] IdToNodeIndex2;
delete[] bIsBachelor1;
delete[] bIsDiff1;
}

View File

@ -0,0 +1,235 @@
#include "muscle.h"
#include "tree.h"
#define TRACE 0
/***
Algorithm to compare two trees, X and Y.
A node x in X and node y in Y are defined to be
similar iff the set of leaves in the subtree under
x is identical to the set of leaves under y.
A node is defined to be changed iff it is not
similar to any node in the other tree.
Nodes x and y are defined to be married iff every
node in the subtree under x is similar to a node
in the subtree under y. Married nodes are considered
to be equal. The subtrees under two married nodes can
at most differ by exchanges of left and right branches,
which we do not consider to be significant here.
A node is changed iff it is not married. If a node is
changed, then it has a dissimilar node in its subtree,
and it follows immediately from the definition of marriage
that its parent is also a bachelor. Hence all nodes on the
path from a changed node to the root are changed.
We assume the trees have the same set of leaves, so
every leaf is trivially both similar and married to
the same leaf in the opposite tree. Changed nodes
are therefore always internal (i.e., non-leaf) nodes.
Example:
-----A
-----k
----j -----B
--i -----C
------D
-----A
-----p
----n -----B
--m -----D
------C
The following pairs of internal nodes are similar.
Nodes Set of leaves
----- -------------
k,p A,B
i,m A,B,C,D
Changed nodes in the first tree are i and j, changed nodes
in the second tree are m and n.
Node k and p are married, but i and m are not (because j
and n are changed). The diffs are C, D and k.
To achieve O(N) we avoid traversing a given subtree multiple
times and also avoid comparing lists of leaves.
We visit nodes in depth-first order (i.e., a node is visited
before its parent).
If either child of a node is changed, we flag it as changed.
If both children of the node we are visiting are married,
we check whether the spouses of those children have the
same parent in the other tree. If the parents are different,
the current node is a bachelor. If they have the same parent,
then the node we are visiting is the spouse of that parent.
We assign this newly identified married couple a unique integer
id. The id of a node is in one-to-one correspondence with the
set of leaves in its subtree. Two nodes have the same set of
leaves iff they have the same id. Changed nodes do not get
an id.
***/
void DiffTreesE(const Tree &NewTree, const Tree &OldTree,
unsigned NewNodeIndexToOldNodeIndex[])
{
#if TRACE
Log("DiffTreesE NewTree:\n");
NewTree.LogMe();
Log("\n");
Log("OldTree:\n");
OldTree.LogMe();
#endif
if (!NewTree.IsRooted() || !OldTree.IsRooted())
Quit("DiffTrees: requires rooted trees");
const unsigned uNodeCount = NewTree.GetNodeCount();
const unsigned uOldNodeCount = OldTree.GetNodeCount();
const unsigned uLeafCount = NewTree.GetLeafCount();
const unsigned uOldLeafCount = OldTree.GetLeafCount();
if (uNodeCount != uOldNodeCount || uLeafCount != uOldLeafCount)
Quit("DiffTreesE: different node counts");
{
unsigned *IdToOldNodeIndex = new unsigned[uNodeCount];
for (unsigned uOldNodeIndex = 0; uOldNodeIndex < uNodeCount; ++uOldNodeIndex)
{
if (OldTree.IsLeaf(uOldNodeIndex))
{
unsigned Id = OldTree.GetLeafId(uOldNodeIndex);
IdToOldNodeIndex[Id] = uOldNodeIndex;
}
}
// Initialize NewNodeIndexToOldNodeIndex[]
// All internal nodes are marked as changed, but may be updated later.
for (unsigned uNewNodeIndex = 0; uNewNodeIndex < uNodeCount; ++uNewNodeIndex)
{
if (NewTree.IsLeaf(uNewNodeIndex))
{
unsigned uId = NewTree.GetLeafId(uNewNodeIndex);
assert(uId < uLeafCount);
unsigned uOldNodeIndex = IdToOldNodeIndex[uId];
assert(uOldNodeIndex < uNodeCount);
NewNodeIndexToOldNodeIndex[uNewNodeIndex] = uOldNodeIndex;
}
else
NewNodeIndexToOldNodeIndex[uNewNodeIndex] = NODE_CHANGED;
}
delete[] IdToOldNodeIndex;
}
// Depth-first traversal of tree.
// The order guarantees that a node is visited before
// its parent is visited.
for (unsigned uNewNodeIndex = NewTree.FirstDepthFirstNode();
NULL_NEIGHBOR != uNewNodeIndex;
uNewNodeIndex = NewTree.NextDepthFirstNode(uNewNodeIndex))
{
if (NewTree.IsLeaf(uNewNodeIndex))
continue;
// If either child is changed, flag this node as changed and continue.
unsigned uNewLeft = NewTree.GetLeft(uNewNodeIndex);
unsigned uOldLeft = NewNodeIndexToOldNodeIndex[uNewLeft];
if (NODE_CHANGED == uOldLeft)
{
NewNodeIndexToOldNodeIndex[uNewLeft] = NODE_CHANGED;
continue;
}
unsigned uNewRight = NewTree.GetRight(uNewNodeIndex);
unsigned uOldRight = NewNodeIndexToOldNodeIndex[uNewRight];
if (NODE_CHANGED == NewNodeIndexToOldNodeIndex[uNewRight])
{
NewNodeIndexToOldNodeIndex[uNewRight] = NODE_CHANGED;
continue;
}
unsigned uOldParentLeft = OldTree.GetParent(uOldLeft);
unsigned uOldParentRight = OldTree.GetParent(uOldRight);
if (uOldParentLeft == uOldParentRight)
NewNodeIndexToOldNodeIndex[uNewNodeIndex] = uOldParentLeft;
else
NewNodeIndexToOldNodeIndex[uNewNodeIndex] = NODE_CHANGED;
}
#if TRACE
{
Log("NewToOld ");
for (unsigned uNewNodeIndex = 0; uNewNodeIndex < uNodeCount; ++uNewNodeIndex)
{
Log(" [%3u]=", uNewNodeIndex);
if (NODE_CHANGED == NewNodeIndexToOldNodeIndex[uNewNodeIndex])
Log(" X");
else
Log("%3u", NewNodeIndexToOldNodeIndex[uNewNodeIndex]);
if ((uNewNodeIndex+1)%8 == 0)
Log("\n ");
}
Log("\n");
}
#endif
#if DEBUG
{
for (unsigned uNewNodeIndex = 0; uNewNodeIndex < uNodeCount; ++uNewNodeIndex)
{
unsigned uOld = NewNodeIndexToOldNodeIndex[uNewNodeIndex];
if (NewTree.IsLeaf(uNewNodeIndex))
{
if (uOld >= uNodeCount)
{
Log("NewNode=%u uOld=%u > uNodeCount=%u\n",
uNewNodeIndex, uOld, uNodeCount);
Quit("Diff check failed");
}
unsigned uIdNew = NewTree.GetLeafId(uNewNodeIndex);
unsigned uIdOld = OldTree.GetLeafId(uOld);
if (uIdNew != uIdOld)
{
Log("NewNode=%u uOld=%u IdNew=%u IdOld=%u\n",
uNewNodeIndex, uOld, uIdNew, uIdOld);
Quit("Diff check failed");
}
continue;
}
if (NODE_CHANGED == uOld)
continue;
unsigned uNewLeft = NewTree.GetLeft(uNewNodeIndex);
unsigned uNewRight = NewTree.GetRight(uNewNodeIndex);
unsigned uOldLeft = OldTree.GetLeft(uOld);
unsigned uOldRight = OldTree.GetRight(uOld);
unsigned uNewLeftPartner = NewNodeIndexToOldNodeIndex[uNewLeft];
unsigned uNewRightPartner = NewNodeIndexToOldNodeIndex[uNewRight];
bool bSameNotRotated = (uNewLeftPartner == uOldLeft && uNewRightPartner == uOldRight);
bool bSameRotated = (uNewLeftPartner == uOldRight && uNewRightPartner == uOldLeft);
if (!bSameNotRotated && !bSameRotated)
{
Log("NewNode=%u NewL=%u NewR=%u\n", uNewNodeIndex, uNewLeft, uNewRight);
Log("OldNode=%u OldL=%u OldR=%u\n", uOld, uOldLeft, uOldRight);
Log("NewLPartner=%u NewRPartner=%u\n", uNewLeftPartner, uNewRightPartner);
Quit("Diff check failed");
}
}
}
#endif
}

View File

@ -0,0 +1,89 @@
#include "muscle.h"
#include "distfunc.h"
#include "distcalc.h"
#include "msa.h"
void DistCalcDF::Init(const DistFunc &DF)
{
m_ptrDF = &DF;
}
void DistCalcDF::CalcDistRange(unsigned i, dist_t Dist[]) const
{
for (unsigned j = 0; j < i; ++j)
Dist[j] = m_ptrDF->GetDist(i, j);
}
unsigned DistCalcDF::GetCount() const
{
return m_ptrDF->GetCount();
}
unsigned DistCalcDF::GetId(unsigned i) const
{
return m_ptrDF->GetId(i);
}
const char *DistCalcDF::GetName(unsigned i) const
{
return m_ptrDF->GetName(i);
}
void DistCalcMSA::Init(const MSA &msa, DISTANCE Distance)
{
m_ptrMSA = &msa;
m_Distance = Distance;
}
void DistCalcMSA::CalcDistRange(unsigned i, dist_t Dist[]) const
{
for (unsigned j = 0; j < i; ++j)
{
switch (m_Distance)
{
case DISTANCE_PctIdKimura:
{
const float PctId = (float) m_ptrMSA->GetPctIdentityPair(i, j);
Dist[j] = (float) KimuraDist(PctId);
break;
}
case DISTANCE_PctIdLog:
{
const float PctId = (float) m_ptrMSA->GetPctIdentityPair(i, j);
Dist[j] = (float) PctIdToMAFFTDist(PctId);
break;
}
case DISTANCE_ScoreDist:
{
double GetScoreDist(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2);
Dist[j] = (float) GetScoreDist(*m_ptrMSA, i, j);
continue;
}
case DISTANCE_Edit:
{
const float PctId = (float) m_ptrMSA->GetPctIdentityPair(i, j);
if (PctId > 1.0)
Quit("Internal error, DISTANCE_Edit, pct id=%.3g", PctId);
Dist[j] = (float) 1.0 - PctId;
break;
}
default:
Quit("DistCalcMSA: Invalid DISTANCE_%u", m_Distance);
}
}
}
unsigned DistCalcMSA::GetCount() const
{
return m_ptrMSA->GetSeqCount();
}
unsigned DistCalcMSA::GetId(unsigned i) const
{
return m_ptrMSA->GetSeqId(i);
}
const char *DistCalcMSA::GetName(unsigned i) const
{
return m_ptrMSA->GetSeqName(i);
}

View File

@ -0,0 +1,45 @@
#ifndef DistCalc_h
#define DistCalc_h
typedef float dist_t;
const dist_t BIG_DIST = (dist_t) 1e29;
class DistFunc;
class DistCalc
{
public:
virtual void CalcDistRange(unsigned i, dist_t Dist[]) const = 0;
virtual unsigned GetCount() const = 0;
virtual unsigned GetId(unsigned i) const = 0;
virtual const char *GetName(unsigned i) const = 0;
};
class DistCalcDF : public DistCalc
{
public:
void Init(const DistFunc &DF);
virtual void CalcDistRange(unsigned i, dist_t Dist[]) const;
virtual unsigned GetCount() const;
virtual unsigned GetId(unsigned i) const;
virtual const char *GetName(unsigned i) const;
private:
const DistFunc *m_ptrDF;
};
class DistCalcMSA : public DistCalc
{
public:
void Init(const MSA &msa, DISTANCE Distance);
virtual void CalcDistRange(unsigned i, dist_t Dist[]) const;
virtual unsigned GetCount() const;
virtual unsigned GetId(unsigned i) const;
virtual const char *GetName(unsigned i) const;
private:
const MSA *m_ptrMSA;
DISTANCE m_Distance;
};
#endif // DistCalc_h

View File

@ -0,0 +1,113 @@
#include "muscle.h"
#include "distfunc.h"
#include <assert.h>
DistFunc::DistFunc()
{
m_Dists = 0;
m_uCount = 0;
m_uCacheCount = 0;
m_Names = 0;
m_Ids = 0;
}
DistFunc::~DistFunc()
{
if (0 != m_Names)
{
for (unsigned i = 0; i < m_uCount; ++i)
free(m_Names[i]);
}
delete[] m_Dists;
delete[] m_Names;
delete[] m_Ids;
}
float DistFunc::GetDist(unsigned uIndex1, unsigned uIndex2) const
{
return m_Dists[VectorIndex(uIndex1, uIndex2)];
}
unsigned DistFunc::GetCount() const
{
return m_uCount;
}
void DistFunc::SetCount(unsigned uCount)
{
m_uCount = uCount;
if (uCount <= m_uCacheCount)
return;
delete[] m_Dists;
m_Dists = new float[VectorLength()];
m_Names = new char *[m_uCount];
m_Ids = new unsigned[m_uCount];
m_uCacheCount = uCount;
memset(m_Names, 0, m_uCount*sizeof(char *));
memset(m_Ids, 0xff, m_uCount*sizeof(unsigned));
memset(m_Dists, 0, VectorLength()*sizeof(float));
}
void DistFunc::SetDist(unsigned uIndex1, unsigned uIndex2, float dDist)
{
m_Dists[VectorIndex(uIndex1, uIndex2)] = dDist;
m_Dists[VectorIndex(uIndex2, uIndex1)] = dDist;
}
unsigned DistFunc::VectorIndex(unsigned uIndex1, unsigned uIndex2) const
{
assert(uIndex1 < m_uCount && uIndex2 < m_uCount);
return uIndex1*m_uCount + uIndex2;
}
unsigned DistFunc::VectorLength() const
{
return m_uCount*m_uCount;
}
void DistFunc::SetName(unsigned uIndex, const char szName[])
{
assert(uIndex < m_uCount);
m_Names[uIndex] = strsave(szName);
}
void DistFunc::SetId(unsigned uIndex, unsigned uId)
{
assert(uIndex < m_uCount);
m_Ids[uIndex] = uId;
}
const char *DistFunc::GetName(unsigned uIndex) const
{
assert(uIndex < m_uCount);
return m_Names[uIndex];
}
unsigned DistFunc::GetId(unsigned uIndex) const
{
assert(uIndex < m_uCount);
return m_Ids[uIndex];
}
void DistFunc::LogMe() const
{
Log("DistFunc::LogMe count=%u\n", m_uCount);
Log(" ");
for (unsigned i = 0; i < m_uCount; ++i)
Log(" %7u", i);
Log("\n");
Log(" ");
for (unsigned i = 0; i < m_uCount; ++i)
Log(" %7.7s", m_Names[i] ? m_Names[i] : "");
Log("\n");
for (unsigned i = 0; i < m_uCount; ++i)
{
Log("%4u %10.10s : ", i, m_Names[i] ? m_Names[i] : "");
for (unsigned j = 0; j <= i; ++j)
Log(" %7.4g", GetDist(i, j));
Log("\n");
}
}

View File

@ -0,0 +1,36 @@
#ifndef DistFunc_h
#define DistFunc_h
class DistFunc
{
public:
DistFunc();
virtual ~DistFunc();
public:
virtual void SetCount(unsigned uCount);
virtual void SetDist(unsigned uIndex1, unsigned uIndex2, float dDist);
void SetName(unsigned uIndex, const char szName[]);
void SetId(unsigned uIndex, unsigned uId);
const char *GetName(unsigned uIndex) const;
unsigned GetId(unsigned uIndex) const;
virtual float GetDist(unsigned uIndex1, unsigned uIndex2) const;
virtual unsigned GetCount() const;
void LogMe() const;
protected:
unsigned VectorIndex(unsigned uIndex, unsigned uIndex2) const;
unsigned VectorLength() const;
private:
unsigned m_uCount;
unsigned m_uCacheCount;
float *m_Dists;
char **m_Names;
unsigned *m_Ids;
};
#endif // DistFunc_h

View File

@ -0,0 +1,45 @@
#include "muscle.h"
#include "distfunc.h"
#include "msa.h"
#include "seqvect.h"
#include "pwpath.h"
void DistPWKimura(const SeqVect &v, DistFunc &DF)
{
SEQWEIGHT SeqWeightSave = GetSeqWeightMethod();
SetSeqWeightMethod(SEQWEIGHT_Henikoff);
const unsigned uSeqCount = v.Length();
DF.SetCount(uSeqCount);
const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2;
unsigned uCount = 0;
SetProgressDesc("PWKimura distance");
for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1)
{
const Seq &s1 = v.GetSeq(uSeqIndex1);
MSA msa1;
msa1.FromSeq(s1);
for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqIndex1; ++uSeqIndex2)
{
if (0 == uCount%20)
Progress(uCount, uPairCount);
++uCount;
const Seq &s2 = v.GetSeq(uSeqIndex2);
MSA msa2;
msa2.FromSeq(s2);
PWPath Path;
MSA msaOut;
AlignTwoMSAs(msa1, msa2, msaOut, Path, false, false);
double dPctId = msaOut.GetPctIdentityPair(0, 1);
float f = (float) KimuraDist(dPctId);
DF.SetDist(uSeqIndex1, uSeqIndex2, f);
}
}
ProgressStepsDone();
SetSeqWeightMethod(SeqWeightSave);
}

View File

@ -0,0 +1,299 @@
#include "muscle.h"
#include "textfile.h"
#include "seqvect.h"
#include "distfunc.h"
#include "msa.h"
#include "tree.h"
#include "profile.h"
#include "timing.h"
static char g_strUseTreeWarning[] =
"\n******** WARNING ****************\n"
"\nYou specified the -usetree option.\n"
"Note that a good evolutionary tree may NOT be a good\n"
"guide tree for multiple alignment. For more details,\n"
"please refer to the user guide. To disable this\n"
"warning, use -usetree_nowarn <treefilename>.\n\n";
void DoMuscle()
{
SetOutputFileName(g_pstrOutFileName);
SetInputFileName(g_pstrInFileName);
SetMaxIters(g_uMaxIters);
SetSeqWeightMethod(g_SeqWeight1);
TextFile fileIn(g_pstrInFileName);
SeqVect v;
v.FromFASTAFile(fileIn);
const unsigned uSeqCount = v.Length();
if (0 == uSeqCount)
Quit("No sequences in input file");
ALPHA Alpha = ALPHA_Undefined;
switch (g_SeqType)
{
case SEQTYPE_Auto:
Alpha = v.GuessAlpha();
break;
case SEQTYPE_Protein:
Alpha = ALPHA_Amino;
break;
case SEQTYPE_DNA:
Alpha = ALPHA_DNA;
break;
case SEQTYPE_RNA:
Alpha = ALPHA_RNA;
break;
default:
Quit("Invalid seq type");
}
SetAlpha(Alpha);
v.FixAlpha();
PTR_SCOREMATRIX UserMatrix = 0;
if (0 != g_pstrMatrixFileName)
{
const char *FileName = g_pstrMatrixFileName;
const char *Path = getenv("MUSCLE_MXPATH");
if (Path != 0)
{
size_t n = strlen(Path) + 1 + strlen(FileName) + 1;
char *NewFileName = new char[n];
sprintf(NewFileName, "%s/%s", Path, FileName);
FileName = NewFileName;
}
TextFile File(FileName);
UserMatrix = ReadMx(File);
g_Alpha = ALPHA_Amino;
g_PPScore = PPSCORE_SP;
}
SetPPScore();
if (0 != UserMatrix)
g_ptrScoreMatrix = UserMatrix;
unsigned uMaxL = 0;
unsigned uTotL = 0;
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
{
unsigned L = v.GetSeq(uSeqIndex).Length();
uTotL += L;
if (L > uMaxL)
uMaxL = L;
}
SetIter(1);
g_bDiags = g_bDiags1;
SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount);
SetMuscleSeqVect(v);
MSA::SetIdCount(uSeqCount);
// Initialize sequence ids.
// From this point on, ids must somehow propogate from here.
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
v.SetSeqId(uSeqIndex, uSeqIndex);
if (0 == uSeqCount)
Quit("Input file '%s' has no sequences", g_pstrInFileName);
if (1 == uSeqCount)
{
TextFile fileOut(g_pstrOutFileName, true);
v.ToFile(fileOut);
return;
}
if (uSeqCount > 1)
MHackStart(v);
// First iteration
Tree GuideTree;
if (0 != g_pstrUseTreeFileName)
{
// Discourage users...
if (!g_bUseTreeNoWarn)
fprintf(stderr, "%s", g_strUseTreeWarning);
// Read tree from file
TextFile TreeFile(g_pstrUseTreeFileName);
GuideTree.FromFile(TreeFile);
// Make sure tree is rooted
if (!GuideTree.IsRooted())
Quit("User tree must be rooted");
if (GuideTree.GetLeafCount() != uSeqCount)
Quit("User tree does not match input sequences");
const unsigned uNodeCount = GuideTree.GetNodeCount();
for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
{
if (!GuideTree.IsLeaf(uNodeIndex))
continue;
const char *LeafName = GuideTree.GetLeafName(uNodeIndex);
unsigned uSeqIndex;
bool SeqFound = v.FindName(LeafName, &uSeqIndex);
if (!SeqFound)
Quit("Label %s in tree does not match sequences", LeafName);
unsigned uId = v.GetSeqIdFromName(LeafName);
GuideTree.SetLeafId(uNodeIndex, uId);
}
}
else
TreeFromSeqVect(v, GuideTree, g_Cluster1, g_Distance1, g_Root1,
g_pstrDistMxFileName1);
const char *Tree1 = ValueOpt("Tree1");
if (0 != Tree1)
{
TextFile f(Tree1, true);
GuideTree.ToFile(f);
if (g_bClusterOnly)
return;
}
SetMuscleTree(GuideTree);
ValidateMuscleIds(GuideTree);
MSA msa;
ProgNode *ProgNodes = 0;
if (g_bLow)
ProgNodes = ProgressiveAlignE(v, GuideTree, msa);
else
ProgressiveAlign(v, GuideTree, msa);
SetCurrentAlignment(msa);
if (0 != g_pstrComputeWeightsFileName)
{
extern void OutWeights(const char *FileName, const MSA &msa);
SetMSAWeightsMuscle(msa);
OutWeights(g_pstrComputeWeightsFileName, msa);
return;
}
ValidateMuscleIds(msa);
if (1 == g_uMaxIters || 2 == uSeqCount)
{
//TextFile fileOut(g_pstrOutFileName, true);
//MHackEnd(msa);
//msa.ToFile(fileOut);
MuscleOutput(msa);
return;
}
if (0 == g_pstrUseTreeFileName)
{
g_bDiags = g_bDiags2;
SetIter(2);
if (g_bLow)
{
if (0 != g_uMaxTreeRefineIters)
RefineTreeE(msa, v, GuideTree, ProgNodes);
}
else
RefineTree(msa, GuideTree);
const char *Tree2 = ValueOpt("Tree2");
if (0 != Tree2)
{
TextFile f(Tree2, true);
GuideTree.ToFile(f);
}
}
SetSeqWeightMethod(g_SeqWeight2);
SetMuscleTree(GuideTree);
if (g_bAnchors)
RefineVert(msa, GuideTree, g_uMaxIters - 2);
else
RefineHoriz(msa, GuideTree, g_uMaxIters - 2, false, false);
#if 0
// Refining by subfamilies is disabled as it didn't give better
// results. I tried doing this before and after RefineHoriz.
// Should get back to this as it seems like this should work.
RefineSubfams(msa, GuideTree, g_uMaxIters - 2);
#endif
ValidateMuscleIds(msa);
ValidateMuscleIds(GuideTree);
//TextFile fileOut(g_pstrOutFileName, true);
//MHackEnd(msa);
//msa.ToFile(fileOut);
MuscleOutput(msa);
}
void Run()
{
SetStartTime();
Log("Started %s\n", GetTimeAsStr());
for (int i = 0; i < g_argc; ++i)
Log("%s ", g_argv[i]);
Log("\n");
#if TIMING
TICKS t1 = GetClockTicks();
#endif
if (g_bRefine)
Refine();
else if (g_bRefineW)
{
extern void DoRefineW();
DoRefineW();
}
else if (g_bProfDB)
ProfDB();
else if (g_bSW)
Local();
else if (0 != g_pstrSPFileName)
DoSP();
else if (g_bProfile)
Profile();
else if (g_bPPScore)
PPScore();
else if (g_bPAS)
ProgAlignSubFams();
else if (g_bMakeTree)
{
extern void DoMakeTree();
DoMakeTree();
}
else
DoMuscle();
#if TIMING
extern TICKS g_ticksDP;
extern TICKS g_ticksObjScore;
TICKS t2 = GetClockTicks();
TICKS TotalTicks = t2 - t1;
TICKS ticksOther = TotalTicks - g_ticksDP - g_ticksObjScore;
double dSecs = TicksToSecs(TotalTicks);
double PctDP = (double) g_ticksDP*100.0/(double) TotalTicks;
double PctOS = (double) g_ticksObjScore*100.0/(double) TotalTicks;
double PctOther = (double) ticksOther*100.0/(double) TotalTicks;
Log(" Ticks Secs Pct\n");
Log(" ============ ======= =====\n");
Log("DP %12ld %7.2f %5.1f%%\n",
(long) g_ticksDP, TicksToSecs(g_ticksDP), PctDP);
Log("OS %12ld %7.2f %5.1f%%\n",
(long) g_ticksObjScore, TicksToSecs(g_ticksObjScore), PctOS);
Log("Other %12ld %7.2f %5.1f%%\n",
(long) ticksOther, TicksToSecs(ticksOther), PctOther);
Log("Total %12ld %7.2f 100.0%%\n", (long) TotalTicks, dSecs);
#endif
ListDiagSavings();
Log("Finished %s\n", GetTimeAsStr());
}

View File

@ -0,0 +1,60 @@
#include "muscle.h"
#include "textfile.h"
#include "msa.h"
#include "objscore.h"
#include "tree.h"
#include "profile.h"
void DoSP()
{
TextFile f(g_pstrSPFileName);
MSA a;
a.FromFile(f);
ALPHA Alpha = ALPHA_Undefined;
switch (g_SeqType)
{
case SEQTYPE_Auto:
Alpha = a.GuessAlpha();
break;
case SEQTYPE_Protein:
Alpha = ALPHA_Amino;
break;
case SEQTYPE_DNA:
Alpha = ALPHA_DNA;
break;
case SEQTYPE_RNA:
Alpha = ALPHA_RNA;
break;
default:
Quit("Invalid SeqType");
}
SetAlpha(Alpha);
a.FixAlpha();
SetPPScore();
const unsigned uSeqCount = a.GetSeqCount();
if (0 == uSeqCount)
Quit("No sequences in input file %s", g_pstrSPFileName);
MSA::SetIdCount(uSeqCount);
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
a.SetSeqId(uSeqIndex, uSeqIndex);
SetSeqWeightMethod(g_SeqWeight1);
Tree tree;
TreeFromMSA(a, tree, g_Cluster2, g_Distance2, g_Root2);
SetMuscleTree(tree);
SetMSAWeightsMuscle((MSA &) a);
SCORE SP = ObjScoreSP(a);
Log("File=%s;SP=%.4g\n", g_pstrSPFileName, SP);
fprintf(stderr, "File=%s;SP=%.4g\n", g_pstrSPFileName, SP);
}

View File

@ -0,0 +1,73 @@
#ifndef DPRegionList_h
#define DPRegionList_h
#include "diaglist.h"
enum DPREGIONTYPE
{
DPREGIONTYPE_Unknown,
DPREGIONTYPE_Diag,
DPREGIONTYPE_Rect
};
struct DPRegion
{
DPREGIONTYPE m_Type;
union
{
Diag m_Diag;
Rect m_Rect;
};
};
const unsigned MAX_DPREGIONS = 1024;
class DPRegionList
{
public:
DPRegionList()
{
m_uCount = 0;
}
~DPRegionList()
{
Free();
}
public:
// Creation
void Clear()
{
Free();
}
void Add(const DPRegion &r);
// Accessors
unsigned GetCount() const
{
return m_uCount;
}
const DPRegion &Get(unsigned uIndex) const
{
assert(uIndex < m_uCount);
return m_DPRegions[uIndex];
}
// Diagnostics
void LogMe() const;
private:
void Free()
{
m_uCount = 0;
}
private:
unsigned m_uCount;
DPRegion m_DPRegions[MAX_DPREGIONS];
};
void DiagListToDPRegionList(const DiagList &DL, DPRegionList &RL,
unsigned uLengthA, unsigned uLengthB);
#endif // DPRegionList_h

View File

@ -0,0 +1,108 @@
#include "muscle.h"
#include "dpreglist.h"
unsigned DPRegionList::GetDPArea() const
{
unsigned uArea = 0;
for (unsigned i = 0; i < m_uCount; ++i)
{
const DPRegion &r = m_DPRegions[i];
if (DPREGIONTYPE_Rect == r.m_Type)
uArea += r.m_Rect.m_uLengthA*r.m_Rect.m_uLengthB;
}
return uArea;
}
void DPRegionList::Add(const DPRegion &r)
{
if (m_uCount == MAX_DPREGIONS)
Quit("DPRegionList::Add, overflow %d", m_uCount);
m_DPRegions[m_uCount] = r;
++m_uCount;
}
void DPRegionList::LogMe() const
{
Log("DPRegionList::LogMe, count=%u\n", m_uCount);
Log("Region Type StartA StartB EndA EndB\n");
Log("------ ---- ------ ------ ---- ----\n");
for (unsigned i = 0; i < m_uCount; ++i)
{
const DPRegion &r = m_DPRegions[i];
Log("%6u ", i);
if (DPREGIONTYPE_Diag == r.m_Type)
Log("Diag %6u %6u %6u %6u\n",
r.m_Diag.m_uStartPosA,
r.m_Diag.m_uStartPosB,
r.m_Diag.m_uStartPosA + r.m_Diag.m_uLength - 1,
r.m_Diag.m_uStartPosB + r.m_Diag.m_uLength - 1);
else if (DPREGIONTYPE_Rect == r.m_Type)
Log("Rect %6u %6u %6u %6u\n",
r.m_Rect.m_uStartPosA,
r.m_Rect.m_uStartPosB,
r.m_Rect.m_uStartPosA + r.m_Rect.m_uLengthA - 1,
r.m_Rect.m_uStartPosB + r.m_Rect.m_uLengthB - 1);
else
Log(" *** ERROR *** Type=%u\n", r.m_Type);
}
}
void DiagListToDPRegionList(const DiagList &DL, DPRegionList &RL,
unsigned uLengthA, unsigned uLengthB)
{
if (g_uDiagMargin > g_uMinDiagLength/2)
Quit("Invalid parameters, diagmargin=%d must be <= 2*diaglength=%d",
g_uDiagMargin, g_uMinDiagLength);
unsigned uStartPosA = 0;
unsigned uStartPosB = 0;
const unsigned uDiagCount = DL.GetCount();
DPRegion r;
for (unsigned uDiagIndex = 0; uDiagIndex < uDiagCount; ++uDiagIndex)
{
const Diag &d = DL.Get(uDiagIndex);
assert(d.m_uLength >= g_uMinDiagLength);
const unsigned uStartVertexA = d.m_uStartPosA + g_uDiagMargin - 1;
const unsigned uStartVertexB = d.m_uStartPosB + g_uDiagMargin - 1;
const unsigned uEndVertexA = d.m_uStartPosA + d.m_uLength - g_uDiagMargin;
const unsigned uEndVertexB = d.m_uStartPosB + d.m_uLength - g_uDiagMargin;
r.m_Type = DPREGIONTYPE_Rect;
r.m_Rect.m_uStartPosA = uStartPosA;
r.m_Rect.m_uStartPosB = uStartPosB;
assert(uStartVertexA + 1 >= uStartPosA);
assert(uStartVertexB + 1 >= uStartPosB);
r.m_Rect.m_uLengthA = uStartVertexA + 1 - uStartPosA;
r.m_Rect.m_uLengthB = uStartVertexB + 1 - uStartPosB;
RL.Add(r);
if (uEndVertexA > uStartVertexA + 1)
{
const unsigned uDiagLengthMinusCaps = uEndVertexA - uStartVertexA - 1;
r.m_Type = DPREGIONTYPE_Diag;
r.m_Diag.m_uStartPosA = uStartVertexA + 1;
r.m_Diag.m_uStartPosB = uStartVertexB + 1;
assert(uEndVertexA - uStartVertexA == uEndVertexB - uStartVertexB);
r.m_Diag.m_uLength = uEndVertexA - uStartVertexA - 1;
RL.Add(r);
}
uStartPosA = uEndVertexA;
uStartPosB = uEndVertexB;
}
assert((int) uLengthA - (int) uStartPosA >= (int) g_uDiagMargin);
assert((int) uLengthB - (int) uStartPosB >= (int) g_uDiagMargin);
r.m_Type = DPREGIONTYPE_Rect;
r.m_Rect.m_uStartPosA = uStartPosA;
r.m_Rect.m_uStartPosB = uStartPosB;
assert(uLengthA >= uStartPosA);
assert(uLengthB >= uStartPosB);
r.m_Rect.m_uLengthA = uLengthA - uStartPosA;
r.m_Rect.m_uLengthB = uLengthB - uStartPosB;
RL.Add(r);
}

View File

@ -0,0 +1,76 @@
#ifndef dpreglist_h
#define dpreglist_h
#include "diaglist.h"
enum DPREGIONTYPE
{
DPREGIONTYPE_Unknown,
DPREGIONTYPE_Diag,
DPREGIONTYPE_Rect
};
struct DPRegion
{
DPREGIONTYPE m_Type;
union
{
Diag m_Diag;
Rect m_Rect;
};
};
const unsigned MAX_DPREGIONS = 1024;
class DPRegionList
{
public:
DPRegionList()
{
m_uCount = 0;
}
~DPRegionList()
{
Free();
}
public:
// Creation
void Clear()
{
Free();
}
void Add(const DPRegion &r);
// Accessors
unsigned GetCount() const
{
return m_uCount;
}
const DPRegion &Get(unsigned uIndex) const
{
assert(uIndex < m_uCount);
return m_DPRegions[uIndex];
}
unsigned GetDPArea() const;
// Diagnostics
void LogMe() const;
private:
void Free()
{
m_uCount = 0;
}
private:
unsigned m_uCount;
DPRegion m_DPRegions[MAX_DPREGIONS];
};
void DiagListToDPRegionList(const DiagList &DL, DPRegionList &RL,
unsigned uLengthA, unsigned uLengthB);
#endif // dpreglist_h

View File

@ -0,0 +1,41 @@
#include "muscle.h"
#include "tree.h"
/***
Simple tree drawing algorithm.
y coordinate of node is index in depth-first traversal.
x coordinate is distance from root.
***/
static unsigned DistFromRoot(const Tree &tree, unsigned uNodeIndex)
{
const unsigned uRoot = tree.GetRootNodeIndex();
unsigned uDist = 0;
while (uNodeIndex != uRoot)
{
++uDist;
uNodeIndex = tree.GetParent(uNodeIndex);
}
return uDist;
}
static void DrawNode(const Tree &tree, unsigned uNodeIndex)
{
if (!tree.IsLeaf(uNodeIndex))
DrawNode(tree, tree.GetLeft(uNodeIndex));
unsigned uDist = DistFromRoot(tree, uNodeIndex);
for (unsigned i = 0; i < 5*uDist; ++i)
Log(" ");
Log("%d\n", uNodeIndex);
if (!tree.IsLeaf(uNodeIndex))
DrawNode(tree, tree.GetRight(uNodeIndex));
}
void DrawTree(const Tree &tree)
{
unsigned uRoot = tree.GetRootNodeIndex();
DrawNode(tree, uRoot);
}

View File

@ -0,0 +1,88 @@
#include "muscle.h"
#include "edgelist.h"
EdgeList::EdgeList()
{
m_uNode1 = 0;
m_uNode2 = 0;
m_uCount = 0;
m_uCacheSize = 0;
}
EdgeList::~EdgeList()
{
Clear();
}
void EdgeList::Clear()
{
delete[] m_uNode1;
delete[] m_uNode2;
m_uNode1 = 0;
m_uNode2 = 0;
m_uCount = 0;
m_uCacheSize = 0;
}
void EdgeList::Add(unsigned uNode1, unsigned uNode2)
{
if (m_uCount <= m_uCacheSize)
Expand();
m_uNode1[m_uCount] = uNode1;
m_uNode2[m_uCount] = uNode2;
++m_uCount;
}
unsigned EdgeList::GetCount() const
{
return m_uCount;
}
void EdgeList::GetEdge(unsigned uIndex, unsigned *ptruNode1, unsigned *ptruNode2) const
{
if (uIndex > m_uCount)
Quit("EdgeList::GetEdge(%u) count=%u", uIndex, m_uCount);
*ptruNode1 = m_uNode1[uIndex];
*ptruNode2 = m_uNode2[uIndex];
}
void EdgeList::Copy(const EdgeList &rhs)
{
Clear();
const unsigned uCount = rhs.GetCount();
for (unsigned n = 0; n < uCount; ++n)
{
unsigned uNode1;
unsigned uNode2;
rhs.GetEdge(n, &uNode1, &uNode2);
Add(uNode1, uNode2);
}
}
void EdgeList::Expand()
{
unsigned uNewCacheSize = m_uCacheSize + 512;
unsigned *NewNode1 = new unsigned[uNewCacheSize];
unsigned *NewNode2 = new unsigned[uNewCacheSize];
if (m_uCount > 0)
{
memcpy(NewNode1, m_uNode1, m_uCount*sizeof(unsigned));
memcpy(NewNode2, m_uNode2, m_uCount*sizeof(unsigned));
}
delete[] m_uNode1;
delete[] m_uNode2;
m_uNode1 = NewNode1;
m_uNode2 = NewNode2;
m_uCacheSize = uNewCacheSize;
}
void EdgeList::LogMe() const
{
for (unsigned n = 0; n < m_uCount; ++n)
{
if (n > 0)
Log(" ");
Log("%u->%u", m_uNode1[n], m_uNode2[n]);
}
Log("\n");
}

View File

@ -0,0 +1,28 @@
#ifndef EdgeList_h
#define EdgeList_h
class EdgeList
{
public:
EdgeList();
virtual ~EdgeList();
public:
void Clear();
void Add(unsigned uNode1, unsigned uNode2);
unsigned GetCount() const;
void GetEdge(unsigned uIndex, unsigned *ptruNode1, unsigned *ptruNode2) const;
void Copy(const EdgeList &rhs);
void LogMe() const;
private:
void Expand();
private:
unsigned m_uCount;
unsigned m_uCacheSize;
unsigned *m_uNode1;
unsigned *m_uNode2;
};
#endif // EdgeList_h

View File

@ -0,0 +1,8 @@
#include "muscle.h"
#include "enumopts.h"
#define s(t) EnumOpt t##_Opts[] = {
#define c(t, x) #x, t##_##x,
#define e(t) 0, 0 };
#include "enums.h"

View File

@ -0,0 +1,16 @@
#ifndef enumopts_h
#define enumopts_h
struct EnumOpt
{
const char *pstrOpt;
int iValue;
};
#define s(t) extern EnumOpt t##_Opts[];
#define c(t, x) /* empty */
#define e(t) /* empty */
#include "enums.h"
#endif // enumopts_h

View File

@ -0,0 +1,98 @@
// enums.h
// Define enum types.
// Exploit macro hacks to avoid lots of repetetive typing.
// Generally I am opposed to macro hacks because of the
// highly obscure code that results, but in this case it
// makes maintenance much easier and less error-prone.
// The idea is that this file can be included in different
// places with different definitions of s (Start), c (Case)
// and e (End). See types.h.
s(ALPHA)
c(ALPHA, Amino)
c(ALPHA, DNA)
c(ALPHA, RNA)
e(ALPHA)
s(SEQTYPE)
c(SEQTYPE, Protein)
c(SEQTYPE, DNA)
c(SEQTYPE, RNA)
c(SEQTYPE, Auto)
e(SEQTYPE)
s(ROOT)
c(ROOT, Pseudo)
c(ROOT, MidLongestSpan)
c(ROOT, MinAvgLeafDist)
e(ROOT)
s(CLUSTER)
c(CLUSTER, UPGMA)
c(CLUSTER, UPGMAMax)
c(CLUSTER, UPGMAMin)
c(CLUSTER, UPGMB)
c(CLUSTER, NeighborJoining)
e(CLUSTER)
s(JOIN)
c(JOIN, NearestNeighbor)
c(JOIN, NeighborJoining)
e(JOIN)
s(LINKAGE)
c(LINKAGE, Min)
c(LINKAGE, Avg)
c(LINKAGE, Max)
c(LINKAGE, NeighborJoining)
c(LINKAGE, Biased)
e(LINKAGE)
s(DISTANCE)
c(DISTANCE, Kmer6_6)
c(DISTANCE, Kmer20_3)
c(DISTANCE, Kmer20_4)
c(DISTANCE, Kbit20_3)
c(DISTANCE, Kmer4_6)
c(DISTANCE, PctIdKimura)
c(DISTANCE, PctIdLog)
c(DISTANCE, PWKimura)
c(DISTANCE, PWScoreDist)
c(DISTANCE, ScoreDist)
c(DISTANCE, Edit)
e(DISTANCE)
s(PPSCORE)
c(PPSCORE, LE)
c(PPSCORE, SP)
c(PPSCORE, SV)
c(PPSCORE, SPN)
e(PPSCORE)
s(SEQWEIGHT)
c(SEQWEIGHT, None)
c(SEQWEIGHT, Henikoff)
c(SEQWEIGHT, HenikoffPB)
c(SEQWEIGHT, GSC)
c(SEQWEIGHT, ClustalW)
c(SEQWEIGHT, ThreeWay)
e(SEQWEIGHT)
s(OBJSCORE)
c(OBJSCORE, SP) // Sum of Pairs of sequences
c(OBJSCORE, DP) // Dynamic Programming score
c(OBJSCORE, XP) // Cross Pairs = sum of pairs between two MSAs
c(OBJSCORE, PS) // sum of Prof-Seq score for all seqs in MSA
c(OBJSCORE, SPF) // sum of pairs, fast approximation
c(OBJSCORE, SPM) // sp if <= 100 seqs, spf otherwise
e(OBJSCORE)
s(TERMGAPS)
c(TERMGAPS, Full)
c(TERMGAPS, Half)
c(TERMGAPS, Ext)
e(TERMGAPS)
#undef s
#undef c
#undef e

View File

@ -0,0 +1,16 @@
#include "muscle.h"
#include <stdio.h>
static char szMsg[64];
// Define XXXToStr(XXX x) functions for each enum type XXX.
#define s(t) const char *t##ToStr(t x) { switch (x) { case t##_Undefined: return "Undefined";
#define c(t, x) case t##_##x: return #x;
#define e(t) } sprintf(szMsg, #t "_%d", x); return szMsg; }
#include "enums.h"
// Define StrToXXX(const char *Str) functions for each enum type XXX.
#define s(t) t StrTo##t(const char *Str) { if (0) ;
#define c(t, x) else if (0 == stricmp(#x, Str)) return t##_##x;
#define e(t) Quit("Invalid value %s for type %s", Str, #t); return t##_Undefined; }
#include "enums.h"

View File

@ -0,0 +1,689 @@
#include "muscle.h"
#include "pwpath.h"
#include "estring.h"
#include "seq.h"
#include "msa.h"
/***
An "estring" is an edit string that operates on a sequence.
An estring is represented as a vector of integers.
It is interpreted in order of increasing suffix.
A positive value n means copy n letters.
A negative value -n means insert n indels.
Zero marks the end of the vector.
Consecutive entries must have opposite sign, i.e. the
shortest possible representation must be used.
A "tpair" is a traceback path for a pairwise alignment
represented as two estrings, one for each sequence.
***/
#define c2(c,d) (((unsigned char) c) << 8 | (unsigned char) d)
unsigned LengthEstring(const short es[])
{
unsigned i = 0;
while (*es++ != 0)
++i;
return i;
}
short *EstringNewCopy(const short es[])
{
unsigned n = LengthEstring(es) + 1;
short *esNew = new short[n];
memcpy(esNew, es, n*sizeof(short));
return esNew;
}
void LogEstring(const short es[])
{
Log("<");
for (unsigned i = 0; es[i] != 0; ++i)
{
if (i > 0)
Log(" ");
Log("%d", es[i]);
}
Log(">");
}
static bool EstringsEq(const short es1[], const short es2[])
{
for (;;)
{
if (*es1 != *es2)
return false;
if (0 == *es1)
break;
++es1;
++es2;
}
return true;
}
static void EstringCounts(const short es[], unsigned *ptruSymbols,
unsigned *ptruIndels)
{
unsigned uSymbols = 0;
unsigned uIndels = 0;
for (unsigned i = 0; es[i] != 0; ++i)
{
short n = es[i];
if (n > 0)
uSymbols += n;
else if (n < 0)
uIndels += -n;
}
*ptruSymbols = uSymbols;
*ptruIndels = uIndels;
}
static char *EstringOp(const short es[], const char s[])
{
unsigned uSymbols;
unsigned uIndels;
EstringCounts(es, &uSymbols, &uIndels);
assert((unsigned) strlen(s) == uSymbols);
char *sout = new char[uSymbols + uIndels + 1];
char *psout = sout;
for (;;)
{
int n = *es++;
if (0 == n)
break;
if (n > 0)
for (int i = 0; i < n; ++i)
*psout++ = *s++;
else
for (int i = 0; i < -n; ++i)
*psout++ = '-';
}
assert(0 == *s);
*psout = 0;
return sout;
}
void EstringOp(const short es[], const Seq &sIn, Seq &sOut)
{
#if DEBUG
unsigned uSymbols;
unsigned uIndels;
EstringCounts(es, &uSymbols, &uIndels);
assert(sIn.Length() == uSymbols);
#endif
sOut.Clear();
sOut.SetName(sIn.GetName());
int p = 0;
for (;;)
{
int n = *es++;
if (0 == n)
break;
if (n > 0)
for (int i = 0; i < n; ++i)
{
char c = sIn[p++];
sOut.push_back(c);
}
else
for (int i = 0; i < -n; ++i)
sOut.push_back('-');
}
}
unsigned EstringOp(const short es[], const Seq &sIn, MSA &a)
{
unsigned uSymbols;
unsigned uIndels;
EstringCounts(es, &uSymbols, &uIndels);
assert(sIn.Length() == uSymbols);
unsigned uColCount = uSymbols + uIndels;
a.Clear();
a.SetSize(1, uColCount);
a.SetSeqName(0, sIn.GetName());
a.SetSeqId(0, sIn.GetId());
unsigned p = 0;
unsigned uColIndex = 0;
for (;;)
{
int n = *es++;
if (0 == n)
break;
if (n > 0)
for (int i = 0; i < n; ++i)
{
char c = sIn[p++];
a.SetChar(0, uColIndex++, c);
}
else
for (int i = 0; i < -n; ++i)
a.SetChar(0, uColIndex++, '-');
}
assert(uColIndex == uColCount);
return uColCount;
}
void PathToEstrings(const PWPath &Path, short **ptresA, short **ptresB)
{
// First pass to determine size of estrings esA and esB
const unsigned uEdgeCount = Path.GetEdgeCount();
if (0 == uEdgeCount)
{
short *esA = new short[1];
short *esB = new short[1];
esA[0] = 0;
esB[0] = 0;
*ptresA = esA;
*ptresB = esB;
return;
}
unsigned iLengthA = 1;
unsigned iLengthB = 1;
const char cFirstEdgeType = Path.GetEdge(0).cType;
char cPrevEdgeType = cFirstEdgeType;
for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex)
{
const PWEdge &Edge = Path.GetEdge(uEdgeIndex);
char cEdgeType = Edge.cType;
switch (c2(cPrevEdgeType, cEdgeType))
{
case c2('M', 'M'):
case c2('D', 'D'):
case c2('I', 'I'):
break;
case c2('D', 'M'):
case c2('M', 'D'):
++iLengthB;
break;
case c2('I', 'M'):
case c2('M', 'I'):
++iLengthA;
break;
case c2('I', 'D'):
case c2('D', 'I'):
++iLengthB;
++iLengthA;
break;
default:
assert(false);
}
cPrevEdgeType = cEdgeType;
}
// Pass2 for seq A
{
short *esA = new short[iLengthA+1];
unsigned iA = 0;
switch (Path.GetEdge(0).cType)
{
case 'M':
case 'D':
esA[0] = 1;
break;
case 'I':
esA[0] = -1;
break;
default:
assert(false);
}
char cPrevEdgeType = cFirstEdgeType;
for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex)
{
const PWEdge &Edge = Path.GetEdge(uEdgeIndex);
char cEdgeType = Edge.cType;
switch (c2(cPrevEdgeType, cEdgeType))
{
case c2('M', 'M'):
case c2('D', 'D'):
case c2('D', 'M'):
case c2('M', 'D'):
++(esA[iA]);
break;
case c2('I', 'D'):
case c2('I', 'M'):
++iA;
esA[iA] = 1;
break;
case c2('M', 'I'):
case c2('D', 'I'):
++iA;
esA[iA] = -1;
break;
case c2('I', 'I'):
--(esA[iA]);
break;
default:
assert(false);
}
cPrevEdgeType = cEdgeType;
}
assert(iA == iLengthA - 1);
esA[iLengthA] = 0;
*ptresA = esA;
}
{
// Pass2 for seq B
short *esB = new short[iLengthB+1];
unsigned iB = 0;
switch (Path.GetEdge(0).cType)
{
case 'M':
case 'I':
esB[0] = 1;
break;
case 'D':
esB[0] = -1;
break;
default:
assert(false);
}
char cPrevEdgeType = cFirstEdgeType;
for (unsigned uEdgeIndex = 1; uEdgeIndex < uEdgeCount; ++uEdgeIndex)
{
const PWEdge &Edge = Path.GetEdge(uEdgeIndex);
char cEdgeType = Edge.cType;
switch (c2(cPrevEdgeType, cEdgeType))
{
case c2('M', 'M'):
case c2('I', 'I'):
case c2('I', 'M'):
case c2('M', 'I'):
++(esB[iB]);
break;
case c2('D', 'I'):
case c2('D', 'M'):
++iB;
esB[iB] = 1;
break;
case c2('M', 'D'):
case c2('I', 'D'):
++iB;
esB[iB] = -1;
break;
case c2('D', 'D'):
--(esB[iB]);
break;
default:
assert(false);
}
cPrevEdgeType = cEdgeType;
}
assert(iB == iLengthB - 1);
esB[iLengthB] = 0;
*ptresB = esB;
}
#if DEBUG
{
const PWEdge &LastEdge = Path.GetEdge(uEdgeCount - 1);
unsigned uSymbols;
unsigned uIndels;
EstringCounts(*ptresA, &uSymbols, &uIndels);
assert(uSymbols == LastEdge.uPrefixLengthA);
assert(uSymbols + uIndels == uEdgeCount);
EstringCounts(*ptresB, &uSymbols, &uIndels);
assert(uSymbols == LastEdge.uPrefixLengthB);
assert(uSymbols + uIndels == uEdgeCount);
PWPath TmpPath;
EstringsToPath(*ptresA, *ptresB, TmpPath);
TmpPath.AssertEqual(Path);
}
#endif
}
void EstringsToPath(const short esA[], const short esB[], PWPath &Path)
{
Path.Clear();
unsigned iA = 0;
unsigned iB = 0;
int nA = esA[iA++];
int nB = esB[iB++];
unsigned uPrefixLengthA = 0;
unsigned uPrefixLengthB = 0;
for (;;)
{
char cType;
if (nA > 0)
{
if (nB > 0)
{
cType = 'M';
--nA;
--nB;
}
else if (nB < 0)
{
cType = 'D';
--nA;
++nB;
}
else
assert(false);
}
else if (nA < 0)
{
if (nB > 0)
{
cType = 'I';
++nA;
--nB;
}
else
assert(false);
}
else
assert(false);
switch (cType)
{
case 'M':
++uPrefixLengthA;
++uPrefixLengthB;
break;
case 'D':
++uPrefixLengthA;
break;
case 'I':
++uPrefixLengthB;
break;
}
PWEdge Edge;
Edge.cType = cType;
Edge.uPrefixLengthA = uPrefixLengthA;
Edge.uPrefixLengthB = uPrefixLengthB;
Path.AppendEdge(Edge);
if (nA == 0)
{
if (0 == esA[iA])
{
assert(0 == esB[iB]);
break;
}
nA = esA[iA++];
}
if (nB == 0)
nB = esB[iB++];
}
}
/***
Multiply two estrings to make a third estring.
The product of two estrings e1*e2 is defined to be
the estring that produces the same result as applying
e1 then e2. Multiplication is not commutative. In fact,
the reversed order is undefined unless both estrings
consist of a single, identical, positive entry.
A primary motivation for using estrings is that
multiplication is very fast, reducing the time
needed to construct the root alignment.
Example
<-1,3>(XXX) = -XXX
<2,-1,2>(-XXX) = -X-XX
Therefore,
<-1,3>*<2,-1,2> = <-1,1,-1,2>
***/
static bool CanMultiplyEstrings(const short es1[], const short es2[])
{
unsigned uSymbols1;
unsigned uSymbols2;
unsigned uIndels1;
unsigned uIndels2;
EstringCounts(es1, &uSymbols1, &uIndels1);
EstringCounts(es2, &uSymbols2, &uIndels2);
return uSymbols1 + uIndels1 == uSymbols2;
}
static inline void AppendGaps(short esp[], int &ip, int n)
{
if (-1 == ip)
esp[++ip] = n;
else if (esp[ip] < 0)
esp[ip] += n;
else
esp[++ip] = n;
}
static inline void AppendSymbols(short esp[], int &ip, int n)
{
if (-1 == ip)
esp[++ip] = n;
else if (esp[ip] > 0)
esp[ip] += n;
else
esp[++ip] = n;
}
void MulEstrings(const short es1[], const short es2[], short esp[])
{
assert(CanMultiplyEstrings(es1, es2));
unsigned i1 = 0;
int ip = -1;
int n1 = es1[i1++];
for (unsigned i2 = 0; ; ++i2)
{
int n2 = es2[i2];
if (0 == n2)
break;
if (n2 > 0)
{
for (;;)
{
if (n1 < 0)
{
if (n2 > -n1)
{
AppendGaps(esp, ip, n1);
n2 += n1;
n1 = es1[i1++];
}
else if (n2 == -n1)
{
AppendGaps(esp, ip, n1);
n1 = es1[i1++];
break;
}
else
{
assert(n2 < -n1);
AppendGaps(esp, ip, -n2);
n1 += n2;
break;
}
}
else
{
assert(n1 > 0);
if (n2 > n1)
{
AppendSymbols(esp, ip, n1);
n2 -= n1;
n1 = es1[i1++];
}
else if (n2 == n1)
{
AppendSymbols(esp, ip, n1);
n1 = es1[i1++];
break;
}
else
{
assert(n2 < n1);
AppendSymbols(esp, ip, n2);
n1 -= n2;
break;
}
}
}
}
else
{
assert(n2 < 0);
AppendGaps(esp, ip, n2);
}
}
esp[++ip] = 0;
#if DEBUG
{
int MaxLen = (int) (LengthEstring(es1) + LengthEstring(es2) + 1);
assert(ip < MaxLen);
if (ip >= 2)
for (int i = 0; i < ip - 2; ++i)
{
if (!(esp[i] > 0 && esp[i+1] < 0 || esp[i] < 0 && esp[i+1] > 0))
{
Log("Bad result of MulEstring: ");
LogEstring(esp);
Quit("Assert failed (alternating signs)");
}
}
unsigned uSymbols1;
unsigned uSymbols2;
unsigned uSymbolsp;
unsigned uIndels1;
unsigned uIndels2;
unsigned uIndelsp;
EstringCounts(es1, &uSymbols1, &uIndels1);
EstringCounts(es2, &uSymbols2, &uIndels2);
EstringCounts(esp, &uSymbolsp, &uIndelsp);
if (uSymbols1 + uIndels1 != uSymbols2)
{
Log("Bad result of MulEstring: ");
LogEstring(esp);
Quit("Assert failed (counts1 %u %u %u)",
uSymbols1, uIndels1, uSymbols2);
}
}
#endif
}
static void test(const short es1[], const short es2[], const short esa[])
{
unsigned uSymbols1;
unsigned uSymbols2;
unsigned uIndels1;
unsigned uIndels2;
EstringCounts(es1, &uSymbols1, &uIndels1);
EstringCounts(es2, &uSymbols2, &uIndels2);
char s[4096];
memset(s, 'X', sizeof(s));
s[uSymbols1] = 0;
char *s1 = EstringOp(es1, s);
char *s12 = EstringOp(es2, s1);
memset(s, 'X', sizeof(s));
s[uSymbols2] = 0;
char *s2 = EstringOp(es2, s);
Log("%s * %s = %s\n", s1, s2, s12);
LogEstring(es1);
Log(" * ");
LogEstring(es2);
Log(" = ");
LogEstring(esa);
Log("\n");
short esp[4096];
MulEstrings(es1, es2, esp);
LogEstring(esp);
if (!EstringsEq(esp, esa))
Log(" *ERROR* ");
Log("\n");
memset(s, 'X', sizeof(s));
s[uSymbols1] = 0;
char *sp = EstringOp(esp, s);
Log("%s\n", sp);
Log("\n==========\n\n");
}
void TestEstrings()
{
SetListFileName("c:\\tmp\\muscle.log", false);
//{
//short es1[] = { -1, 1, -1, 0 };
//short es2[] = { 1, -1, 2, 0 };
//short esa[] = { -2, 1, -1, 0 };
//test(es1, es2, esa);
//}
//{
//short es1[] = { 2, -1, 2, 0 };
//short es2[] = { 1, -1, 3, -1, 1, 0 };
//short esa[] = { 1, -1, 1, -1, 1, -1, 1, 0 };
//test(es1, es2, esa);
//}
//{
//short es1[] = { -1, 3, 0 };
//short es2[] = { 2, -1, 2, 0 };
//short esa[] = { -1, 1, -1, 2, 0 };
//test(es1, es2, esa);
//}
//{
//short es1[] = { -1, 1, -1, 1, 0};
//short es2[] = { 4, 0 };
//short esa[] = { -1, 1, -1, 1, 0};
//test(es1, es2, esa);
//}
//{
//short es1[] = { 1, -1, 1, -1, 0};
//short es2[] = { 4, 0 };
//short esa[] = { 1, -1, 1, -1, 0};
//test(es1, es2, esa);
//}
//{
//short es1[] = { 1, -1, 1, -1, 0};
//short es2[] = { -1, 4, -1, 0 };
//short esa[] = { -1, 1, -1, 1, -2, 0};
//test(es1, es2, esa);
//}
{
short es1[] = { 106, -77, 56, -2, 155, -3, 123, -2, 0};
short es2[] = { 50, -36, 34, -3, 12, -6, 1, -6, 18, -17, 60, -5, 349, -56, 0 };
short esa[] = { 0 };
test(es1, es2, esa);
}
exit(0);
}

View File

@ -0,0 +1,13 @@
#ifndef pathsum_h
#define pathsum_h
void PathToEstrings(const PWPath &Path, short **ptresA, short **ptresB);
void EstringsToPath(const short esA[], const short esB[], PWPath &Path);
void MulEstrings(const short es1[], const short es2[], short esp[]);
void EstringOp(const short es[], const Seq &sIn, Seq &sOut);
unsigned EstringOp(const short es[], const Seq &sIn, MSA &a);
void LogEstring(const short es[]);
unsigned LengthEstring(const short es[]);
short *EstringNewCopy(const short es[]);
#endif // pathsum_h

View File

@ -0,0 +1,56 @@
#include "muscle.h"
#include <stdio.h>
#include <ctype.h>
#include "msa.h"
#include "textfile.h"
const unsigned FASTA_BLOCK = 60;
void MSA::FromFASTAFile(TextFile &File)
{
Clear();
FILE *f = File.GetStdioFile();
unsigned uSeqCount = 0;
unsigned uColCount = uInsane;
for (;;)
{
char *Label;
unsigned uSeqLength;
char *SeqData = GetFastaSeq(f, &uSeqLength, &Label, false);
if (0 == SeqData)
break;
AppendSeq(SeqData, uSeqLength, Label);
}
}
void MSA::ToFASTAFile(TextFile &File) const
{
const unsigned uColCount = GetColCount();
assert(uColCount > 0);
const unsigned uLinesPerSeq = (GetColCount() - 1)/FASTA_BLOCK + 1;
const unsigned uSeqCount = GetSeqCount();
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
{
File.PutString(">");
File.PutString(GetSeqName(uSeqIndex));
File.PutString("\n");
unsigned n = 0;
for (unsigned uLine = 0; uLine < uLinesPerSeq; ++uLine)
{
unsigned uLetters = uColCount - uLine*FASTA_BLOCK;
if (uLetters > FASTA_BLOCK)
uLetters = FASTA_BLOCK;
for (unsigned i = 0; i < uLetters; ++i)
{
char c = GetChar(uSeqIndex, n);
File.PutChar(c);
++n;
}
File.PutChar('\n');
}
}
}

Some files were not shown because too many files have changed in this diff Show More