14 Commits

10 changed files with 64 additions and 61 deletions

2
.gitmodules vendored
View File

@ -1,3 +1,3 @@
[submodule "sumalibs"] [submodule "sumalibs"]
path = sumalibs path = sumalibs
url = https://git.metabarcoding.org/obitools/sumalibs.git url = https://forge.metabarcoding.org/obitools/sumalibs.git

View File

@ -1,14 +1,17 @@
EXEC=sumatra PREFIX=/usr/local
SUMATRA_SRC= sumatra.c \ CFLAGS=-I$(PREFIX)/include
mtcompare_sumatra.c
SUMATRA_OBJ= $(patsubst %.c,%.o,$(SUMATRA_SRC))
EXEC = sumatra
SRCS= $(SUMATRA_SRC) SUMATRA_SRC = sumatra.c \
mtcompare_sumatra.c
LIB= -lfasta -llcs -lfile -lutils -lz -lm SUMATRA_OBJ = $(patsubst %.c,%.o,$(SUMATRA_SRC))
SRCS = $(SUMATRA_SRC)
LIB = -lsuma -lz -lm
include ./global.mk include ./global.mk
@ -23,9 +26,9 @@ all: $(EXEC)
# executable compilation and link # executable compilation and link
sumatra: $(SUMATRA_OBJ) $(LIBFASTA) $(LIBLCS) $(LIBFILE) $(LIBUTILS) sumatra: $(SUMATRA_OBJ) ./sumalibs/libsuma.a
$(CC) $(LDFLAGS) -o $@ -pthread $(SUMATRA_OBJ) $(LIBFASTAPATH) $(LIBLCSPATH) $(LIBFILEPATH) $(LIBUTILSPATH) $(LIB) $(CC) $(LDFLAGS) -o $@ -pthread $(SUMATRA_OBJ) $(LIBSUMAPATH) $(LIB)
######## ########
# #
# project management # project management
@ -33,12 +36,10 @@ sumatra: $(SUMATRA_OBJ) $(LIBFASTA) $(LIBLCS) $(LIBFILE) $(LIBUTILS)
######## ########
clean: clean:
rm -f *.o rm -f $(SUMATRA_OBJ)
rm -f *.P
rm -f $(EXEC) rm -f $(EXEC)
$(MAKE) -C ./sumalibs/libfasta clean $(MAKE) -C ./sumalibs clean
$(MAKE) -C ./sumalibs/liblcs clean
$(MAKE) -C ./sumalibs/libfile clean
$(MAKE) -C ./sumalibs/libutils clean
install: all
install -d $(DESTDIR)$(PREFIX)/bin/
install -m 755 $(EXEC) $(DESTDIR)$(PREFIX)/bin/

1
README.md Normal file
View File

@ -0,0 +1 @@
[See the wiki](https://git.metabarcoding.org/obitools/sumatra/wikis/home)

View File

@ -1,22 +1,15 @@
LIBSUMAPATH = -L./sumalibs
LIBFASTAPATH = -L./sumalibs/libfasta LIBSUMA = ./sumalibs/libsuma.a
LIBLCSPATH = -L./sumalibs/liblcs
LIBFILEPATH = -L./sumalibs/libfile
LIBUTILSPATH = -L./sumalibs/libutils
LIBFASTA = ./sumalibs/libfasta/libfasta.a
LIBLCS = ./sumalibs/liblcs/liblcs.a
LIBFILE = ./sumalibs/libfile/libfile.a
LIBUTILS = ./sumalibs/libutils/libutils.a
CC=gcc CC=gcc
LDFLAGS= LDFLAGS=
ifeq ($(CC),gcc) ifeq ($(CC),gcc)
CFLAGS = -O3 -s -DOMP_SUPPORT -fopenmp -w CFLAGS = -I sumalibs -O3 -s -DOMP_SUPPORT -w
else else
CFLAGS = -O3 -w CFLAGS = -I sumalibs -O3 -w
endif endif
@ -32,14 +25,5 @@ default: all
# #
######## ########
./sumalibs/libfasta/libfasta.a: ./sumalibs/libsuma.a:
$(MAKE) -C ./sumalibs/libfasta $(MAKE) -C ./sumalibs
./sumalibs/liblcs/liblcs.a:
$(MAKE) -C ./sumalibs/liblcs
./sumalibs/libfile/libfile.a:
$(MAKE) -C ./sumalibs/libfile
./sumalibs/libutils/libutils.a:
$(MAKE) -C ./sumalibs/libutils

View File

@ -10,10 +10,10 @@
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#include "sumatra.h" #include "sumatra.h"
#include "./sumalibs/libfasta/sequence.h" #include "libfasta/sequence.h"
#include "./sumalibs/libutils/utilities.h" #include "libutils/utilities.h"
#include "./sumalibs/liblcs/upperband.h" #include "liblcs/upperband.h"
#include "./sumalibs/liblcs/sse_banded_LCS_alignment.h" #include "liblcs/sse_banded_LCS_alignment.h"
typedef struct { typedef struct {

View File

@ -14,13 +14,13 @@
#include <sys/time.h> #include <sys/time.h>
#include "./sumalibs/libfasta/sequence.h" #include "libfasta/sequence.h"
#include "./sumalibs/liblcs/upperband.h" #include "liblcs/upperband.h"
#include "./sumalibs/liblcs/sse_banded_LCS_alignment.h" #include "liblcs/sse_banded_LCS_alignment.h"
#include "./sumalibs/libutils/utilities.h" #include "libutils/utilities.h"
#include "mtcompare_sumatra.h" #include "mtcompare_sumatra.h"
#define VERSION "1.0.10" #define VERSION "1.0.36"
/* ----------------------------------------------- */ /* ----------------------------------------------- */
@ -57,7 +57,9 @@ static void PrintHelp()
PP " -g : n's are replaced with a's (default: sequences with n's are discarded).\n"); PP " -g : n's are replaced with a's (default: sequences with n's are discarded).\n");
PP " -x : Adds four extra columns with the count and length of both sequences.\n"); PP " -x : Adds four extra columns with the count and length of both sequences.\n");
PP "-----------------------------------------------------------------------------------------------------------------------------\n"); PP "-----------------------------------------------------------------------------------------------------------------------------\n");
PP " First argument : the nucleotide dataset to analyze\n\n"); PP " First argument : the nucleotide dataset to analyze (or nothing \n");
PP " if there is only one dataset and the standard \n");
PP " input should be used). \n\n");
PP " Second argument : optionally the second nucleotide dataset\n"); PP " Second argument : optionally the second nucleotide dataset\n");
PP "-----------------------------------------------------------------------------------------------------------------------------\n"); PP "-----------------------------------------------------------------------------------------------------------------------------\n");
PP " Results table description : \n"); PP " Results table description : \n");
@ -84,7 +86,7 @@ static void PrintHelp()
static void ExitUsage(stat) static void ExitUsage(stat)
int stat; int stat;
{ {
PP "usage: sumatra [-l|L|a|n|r|d|g|x] [-t threshold_value] [-p number of threads] dataset1 [dataset2]\n"); PP "usage: sumatra [-l|L|a|n|r|d|g|x] [-t threshold_value] [-p number of threads] [dataset1] [dataset2]\n");
PP "type \"sumatra -h\" for help\n"); PP "type \"sumatra -h\" for help\n");
if (stat) if (stat)
@ -212,7 +214,7 @@ int compare1(fastaSeqCount db1, double threshold, BOOL normalize, int reference,
BOOL always = TRUE; BOOL always = TRUE;
int64_t pairs = (int64_t)(db1.count - 1) * (int64_t)db1.count /2; int64_t pairs = (int64_t)(db1.count - 1) * (int64_t)db1.count /2;
BOOL print; BOOL print;
double score; double score, scoreG;
int32_t i,j; int32_t i,j;
char* s1; char* s1;
char* s2; char* s2;
@ -237,7 +239,7 @@ int compare1(fastaSeqCount db1, double threshold, BOOL normalize, int reference,
calculateMaxAndMinLenDB(db1, &lmax, &lmin); calculateMaxAndMinLenDB(db1, &lmax, &lmin);
sizeForSeqs = prepareTablesForSumathings(lmax, lmin, threshold, normalize, reference, lcsmode, &address, &iseq1, &iseq2); sizeForSeqs = prepareTablesForSumathings(lmax, lmin, threshold, normalize, reference, lcsmode, &address, &iseq1, &iseq2);
for (i=0; i < db1.count; i++) // ...?? for (i=0; i < db1.count; i++) // ...?? db1.count - 1 probably
for (j=i+1; j < db1.count; j++) for (j=i+1; j < db1.count; j++)
{ {
print = FALSE; print = FALSE;
@ -250,7 +252,23 @@ int compare1(fastaSeqCount db1, double threshold, BOOL normalize, int reference,
l1 = (db1.fastaSeqs+i)->length; l1 = (db1.fastaSeqs+i)->length;
s2 = (db1.fastaSeqs+j)->sequence; s2 = (db1.fastaSeqs+j)->sequence;
l2 = (db1.fastaSeqs+j)->length; l2 = (db1.fastaSeqs+j)->length;
/* fprintf(stderr, "\n%s", s1);
fprintf(stderr, "\n%s", s2);
fprintf(stderr, "\n%f", threshold);
fprintf(stderr, "\n%d", normalize);
fprintf(stderr, "\n%d", reference);
fprintf(stderr, "\n%d\n", lcsmode);
*/
// score = generic_sse_banded_lcs_align(s1, s2, threshold, normalize, reference, lcsmode);
// fprintf(stderr, "\nscore generic = %f", scoreG);
score = alignForSumathings(s1, iseq1, s2, iseq2, l1, l2, normalize, reference, lcsmode, address, sizeForSeqs, LCSmin); score = alignForSumathings(s1, iseq1, s2, iseq2, l1, l2, normalize, reference, lcsmode, address, sizeForSeqs, LCSmin);
// fprintf(stderr, "\nscore = %f\n", score);
// if (scoreG != score)
// {
// fprintf(stderr, "\nscores differents\n");
// exit(1);
// }
print = always || (((normalize || lcsmode) && (score >= threshold)) || ((!lcsmode && !normalize) && (score <= threshold))); print = always || (((normalize || lcsmode) && (score >= threshold)) || ((!lcsmode && !normalize) && (score <= threshold)));
if (print && !lcsmode && normalize) if (print && !lcsmode && normalize)
score = 1.0 - score; score = 1.0 - score;
@ -320,7 +338,7 @@ int compare2(fastaSeqCount db1, fastaSeqCount db2, double threshold, BOOL normal
score = alignForSumathings(s1, iseq1, s2, iseq2, l1, l2, normalize, reference, lcsmode, address, sizeForSeqs, LCSmin); score = alignForSumathings(s1, iseq1, s2, iseq2, l1, l2, normalize, reference, lcsmode, address, sizeForSeqs, LCSmin);
print = always || (((normalize || lcsmode) && (score >= threshold)) || ((!lcsmode && !normalize) && (score <= threshold))); print = always || (((normalize || lcsmode) && (score >= threshold)) || ((!lcsmode && !normalize) && (score <= threshold)));
if (print && !lcsmode && normalize) if (print && !lcsmode && normalize)
score = 1.0 - score; score = 1.0 - score; // TODO isn't that already done?
} }
printResults(db1.fastaSeqs+i, db2.fastaSeqs+j, score, extradata, pairs, print); printResults(db1.fastaSeqs+i, db2.fastaSeqs+j, score, extradata, pairs, print);
} }
@ -425,8 +443,6 @@ int main(int argc, char **argv)
} }
ndb = argc - optind; ndb = argc - optind;
if (ndb < 1)
errflag++;
if (errflag) if (errflag)
ExitUsage(errflag); ExitUsage(errflag);

View File

@ -8,7 +8,7 @@
#ifndef SUMATRA_H_ #ifndef SUMATRA_H_
#define SUMATRA_H_ #define SUMATRA_H_
#include "./sumalibs/libfasta/sequence.h" #include "libfasta/sequence.h"
void printResults(fastaSeqPtr seq1, fastaSeqPtr seq2, double score, BOOL extradata, int64_t pairs, BOOL print); void printResults(fastaSeqPtr seq1, fastaSeqPtr seq2, double score, BOOL extradata, int64_t pairs, BOOL print);

View File

@ -29,7 +29,8 @@ Untar the archive, go into the newly created directory and compile:
``` ```
tar zxvf sumatra_v[x.x.xx].tar.gz tar zxvf sumatra_v[x.x.xx].tar.gz
cd sumatra_v[x.x.xx] cd sumatra_v[x.x.xx]
make make -C sumalibs install
make install
``` ```
## Documentation ## Documentation
@ -40,12 +41,12 @@ Sumatra computes the pairwise alignment scores from one dataset or between two d
#### Input #### Input
Files must be in FASTA format. If there is one dataset, the input can be either the standard input (stdin) or a file in FASTA format. If there are two datasets to compare, the input must be two files in FASTA format.
#### Usage #### Usage
``` ```
sumatra [-l|L|a|n|r|d|g|x] [-t threshold_value] [-p number of threads] dataset1 [dataset2] sumatra [-l|L|a|n|r|d|g|x] [-t threshold_value] [-p number of threads] [dataset1] [dataset2]
``` ```
First argument: the sequence dataset in fasta format to analyse. First argument: the sequence dataset in fasta format to analyse.

Binary file not shown.