501 lines
16 KiB
C
501 lines
16 KiB
C
/**
|
|
* FileName: sumatra.c
|
|
* Authors: Eric Coissac, Celine Mercier
|
|
* Description: computation of pairwise similarities of DNA sequences
|
|
* **/
|
|
|
|
#include "sumatra.h"
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <unistd.h>
|
|
#include <math.h>
|
|
#include <string.h>
|
|
|
|
#include <sys/time.h>
|
|
|
|
#include "./sumalibs/libfasta/sequence.h"
|
|
#include "./sumalibs/liblcs/upperband.h"
|
|
#include "./sumalibs/liblcs/sse_banded_LCS_alignment.h"
|
|
#include "./sumalibs/libutils/utilities.h"
|
|
#include "mtcompare_sumatra.h"
|
|
|
|
#define VERSION "1.0.10"
|
|
|
|
|
|
/* ----------------------------------------------- */
|
|
/* printout help */
|
|
/* ----------------------------------------------- */
|
|
#define PP fprintf(stdout,
|
|
|
|
static void PrintHelp()
|
|
{
|
|
PP "-----------------------------------------------------------------------------------------------------------------------------\n");
|
|
PP " SUMATRA Version %s\n", VERSION);
|
|
PP "-----------------------------------------------------------------------------------------------------------------------------\n");
|
|
PP " Synopsis : sumatra computes all the pairwise LCS (Longest Common Subsequence) scores\n");
|
|
PP " of one nucleotide dataset or between two nucleotide datasets.\n");
|
|
PP " Usage: sumatra [options] <dataset1> [dataset2]\n");
|
|
PP "-----------------------------------------------------------------------------------------------------------------------------\n");
|
|
PP " Options:\n\n");
|
|
PP " -h : [H]elp - print <this> help\n\n");
|
|
PP " -l : Reference sequence length is the shortest. \n\n");
|
|
PP " -L : Reference sequence length is the largest. \n\n");
|
|
PP " -a : Reference sequence length is the alignment length (default). \n\n");
|
|
PP " -n : Score is normalized by reference sequence length (default).\n\n");
|
|
PP " -r : Raw score, not normalized. \n\n");
|
|
PP " -d : Score is expressed in distance (default: score is expressed in similarity). \n\n");
|
|
PP " -t ##.## : Score threshold. If the score is normalized and expressed in similarity (default),\n");
|
|
PP " it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized\n");
|
|
PP " and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%.\n");
|
|
PP " If the score is not normalized and expressed in similarity, it is the length of the\n");
|
|
PP " Longest Common Subsequence. If the score is not normalized and expressed in distance,\n");
|
|
PP " it is (reference length - LCS length).\n");
|
|
PP " Only sequence pairs with a similarity above ##.## are printed. Default: 0.00 \n");
|
|
PP " (no threshold).\n\n");
|
|
PP " -p ## : Number of threads used for computation (default=1).\n\n");
|
|
PP " -g : n's are replaced with a's (default: sequences with n's are discarded).\n");
|
|
PP " -x : Adds four extra columns with the count and length of both sequences.\n");
|
|
PP "-----------------------------------------------------------------------------------------------------------------------------\n");
|
|
PP " First argument : the nucleotide dataset to analyze\n\n");
|
|
PP " Second argument : optionally the second nucleotide dataset\n");
|
|
PP "-----------------------------------------------------------------------------------------------------------------------------\n");
|
|
PP " Results table description : \n");
|
|
PP " column 1 : Identifier sequence 1\n");
|
|
PP " column 2 : Identifier sequence 2\n");
|
|
PP " column 3 : Score\n");
|
|
PP " column 4 : Count of sequence 1 (only with option -x)\n");
|
|
PP " column 5 : Count of sequence 2 (only with option -x)\n");
|
|
PP " column 6 : Length of sequence 1 (only with option -x)\n");
|
|
PP " column 7 : Length of sequence 2 (only with option -x)\n");
|
|
PP "-----------------------------------------------------------------------------------------------------------------------------\n");
|
|
PP " http://metabarcoding.org/sumatra\n");
|
|
PP "-----------------------------------------------------------------------------------------------------------------------------\n\n");
|
|
}
|
|
|
|
#undef PP
|
|
|
|
/* ----------------------------------------------- */
|
|
/* printout usage and exit */
|
|
/* ----------------------------------------------- */
|
|
|
|
#define PP fprintf(stderr,
|
|
|
|
static void ExitUsage(stat)
|
|
int stat;
|
|
{
|
|
PP "usage: sumatra [-l|L|a|n|r|d|g|x] [-t threshold_value] [-p number of threads] dataset1 [dataset2]\n");
|
|
PP "type \"sumatra -h\" for help\n");
|
|
|
|
if (stat)
|
|
exit(stat);
|
|
}
|
|
|
|
#undef PP
|
|
|
|
|
|
void printResults(fastaSeqPtr seq1,fastaSeqPtr seq2,
|
|
double score,
|
|
BOOL extradata,
|
|
int64_t pairs,
|
|
BOOL print)
|
|
{
|
|
static struct timeval start;
|
|
static struct timeval lastprint;
|
|
static BOOL first=TRUE;
|
|
static uint64_t aligned=0;
|
|
|
|
struct timeval current;
|
|
double fraction;
|
|
time_t fulltime;
|
|
time_t remaintime;
|
|
double elapsedtime;
|
|
int32_t day;
|
|
int32_t hour;
|
|
int32_t minute;
|
|
int32_t seconde;
|
|
|
|
|
|
aligned++;
|
|
|
|
if (first)
|
|
{
|
|
first=FALSE;
|
|
gettimeofday(&start,NULL);
|
|
lastprint=start;
|
|
}
|
|
|
|
gettimeofday(¤t,NULL);
|
|
|
|
if (current.tv_sec!=lastprint.tv_sec)
|
|
{
|
|
lastprint=current;
|
|
fraction = (double)aligned/(double)pairs;
|
|
elapsedtime = difftime(current.tv_sec,start.tv_sec);
|
|
fulltime = elapsedtime / fraction;
|
|
remaintime = (time_t)difftime(fulltime,(time_t)elapsedtime);
|
|
|
|
|
|
fprintf(stderr,
|
|
"Computed %lld / %lld -> %5.2lf%%",
|
|
aligned, pairs, fraction*100.
|
|
);
|
|
seconde = fulltime % 60;
|
|
minute = fulltime / 60;
|
|
hour = minute / 60;
|
|
minute = minute % 60;
|
|
day = hour / 24;
|
|
hour = hour % 24;
|
|
if (day)
|
|
fprintf(stderr,
|
|
", estimated computation time = %3d days %02d:%02d:%02d",
|
|
day,
|
|
hour,
|
|
minute,
|
|
seconde
|
|
);
|
|
else
|
|
fprintf(stderr,
|
|
", estimated computation time = %02d:%02d:%02d",
|
|
hour,
|
|
minute,
|
|
seconde
|
|
);
|
|
|
|
seconde = remaintime % 60;
|
|
minute = remaintime / 60;
|
|
hour = minute / 60;
|
|
minute = minute % 60;
|
|
day = hour / 24;
|
|
hour = hour % 24;
|
|
if (day)
|
|
fprintf(stderr,
|
|
", about %3d days %02d:%02d:%02d remaining \r",
|
|
day,
|
|
hour,
|
|
minute,
|
|
seconde
|
|
);
|
|
else
|
|
fprintf(stderr,
|
|
", about %02d:%02d:%02d remaining \r",
|
|
|
|
hour,
|
|
minute,
|
|
seconde
|
|
);
|
|
|
|
}
|
|
|
|
if (print)
|
|
{
|
|
|
|
if (extradata)
|
|
printf("%s\t%s\t%lf\t%d\t%d\t%d\t%d\n", seq1->accession_id,
|
|
seq2->accession_id,
|
|
score,
|
|
seq1->count,
|
|
seq2->count,
|
|
seq1->length,
|
|
seq2->length
|
|
);
|
|
else
|
|
printf("%s\t%s\t%lf\n", seq1->accession_id,
|
|
seq2->accession_id,
|
|
score);
|
|
}
|
|
}
|
|
|
|
|
|
int compare1(fastaSeqCount db1, double threshold, BOOL normalize, int reference, BOOL lcsmode, BOOL extradata)
|
|
{
|
|
BOOL always = TRUE;
|
|
int64_t pairs = (int64_t)(db1.count - 1) * (int64_t)db1.count /2;
|
|
BOOL print;
|
|
double score;
|
|
int32_t i,j;
|
|
char* s1;
|
|
char* s2;
|
|
int l1;
|
|
int l2;
|
|
int16_t* iseq1;
|
|
int16_t* iseq2;
|
|
int16_t* address;
|
|
int sizeForSeqs;
|
|
int lmax, lmin;
|
|
int LCSmin;
|
|
|
|
fprintf(stderr,"Pairwise alignments of one dataset against itself\n");
|
|
fprintf(stderr,"Count of alignments to do : %lld\n",pairs);
|
|
|
|
if (threshold > 0)
|
|
{
|
|
fprintf(stderr,"Computing for scores > %lf\n",threshold);
|
|
always = FALSE;
|
|
}
|
|
|
|
calculateMaxAndMinLenDB(db1, &lmax, &lmin);
|
|
sizeForSeqs = prepareTablesForSumathings(lmax, lmin, threshold, normalize, reference, lcsmode, &address, &iseq1, &iseq2);
|
|
|
|
for (i=0; i < db1.count; i++) // ...??
|
|
for (j=i+1; j < db1.count; j++)
|
|
{
|
|
print = FALSE;
|
|
filtersSumatra(db1.fastaSeqs+i, db1.fastaSeqs+j, threshold, normalize, reference, lcsmode, &score, &LCSmin);
|
|
if (score >= 0) // identical sequences
|
|
print = TRUE;
|
|
else if (always || (score == -1.0)) // filter passed or no threshold
|
|
{
|
|
s1 = (db1.fastaSeqs+i)->sequence;
|
|
l1 = (db1.fastaSeqs+i)->length;
|
|
s2 = (db1.fastaSeqs+j)->sequence;
|
|
l2 = (db1.fastaSeqs+j)->length;
|
|
score = alignForSumathings(s1, iseq1, s2, iseq2, l1, l2, normalize, reference, lcsmode, address, sizeForSeqs, LCSmin);
|
|
print = always || (((normalize || lcsmode) && (score >= threshold)) || ((!lcsmode && !normalize) && (score <= threshold)));
|
|
if (print && !lcsmode && normalize)
|
|
score = 1.0 - score;
|
|
}
|
|
printResults(db1.fastaSeqs+i, db1.fastaSeqs+j, score, extradata, pairs, print);
|
|
}
|
|
|
|
free(iseq1-sizeForSeqs+lmax);
|
|
free(iseq2-sizeForSeqs+lmax);
|
|
return 0;
|
|
}
|
|
|
|
|
|
int compare2(fastaSeqCount db1, fastaSeqCount db2, double threshold, BOOL normalize, int reference, BOOL lcsmode, BOOL extradata)
|
|
{
|
|
BOOL always = TRUE;
|
|
int64_t pairs = (int64_t)(db1.count) * (int64_t)(db2.count);
|
|
BOOL print;
|
|
double score;
|
|
int32_t i,j;
|
|
char* s1;
|
|
char* s2;
|
|
int l1;
|
|
int l2;
|
|
int16_t* iseq1;
|
|
int16_t* iseq2;
|
|
int16_t* address;
|
|
int sizeForSeqs;
|
|
int lmax;
|
|
int lmax1;
|
|
int lmin;
|
|
int lmin1;
|
|
int LCSmin;
|
|
|
|
fprintf(stderr,"Pairwise alignments of two datasets\n");
|
|
fprintf(stderr,"Count of alignments to do : %lld\n",pairs);
|
|
|
|
if (threshold > 0)
|
|
{
|
|
fprintf(stderr,"Computing for scores > %lf\n",threshold);
|
|
always = FALSE;
|
|
}
|
|
|
|
calculateMaxAndMinLenDB(db1, &lmax, &lmin);
|
|
calculateMaxAndMinLenDB(db2, &lmax1, &lmin1);
|
|
|
|
if (lmax1 > lmax)
|
|
lmax = lmax1;
|
|
if (lmin1 < lmin)
|
|
lmin = lmin1;
|
|
|
|
sizeForSeqs = prepareTablesForSumathings(lmax, lmin, threshold, normalize, reference, lcsmode, &address, &iseq1, &iseq2);
|
|
|
|
for (i=0; i < db1.count; i++)
|
|
for (j=0; j < db2.count; j++)
|
|
{
|
|
print = FALSE;
|
|
filtersSumatra(db1.fastaSeqs+i, db2.fastaSeqs+j, threshold, normalize, reference, lcsmode, &score, &LCSmin);
|
|
if (score >= 0) // identical sequences
|
|
print = TRUE;
|
|
else if (always || (score == -1.0)) // filter passed or no threshold
|
|
{
|
|
s1 = (db1.fastaSeqs+i)->sequence;
|
|
l1 = (db1.fastaSeqs+i)->length;
|
|
s2 = (db2.fastaSeqs+j)->sequence;
|
|
l2 = (db2.fastaSeqs+j)->length;
|
|
score = alignForSumathings(s1, iseq1, s2, iseq2, l1, l2, normalize, reference, lcsmode, address, sizeForSeqs, LCSmin);
|
|
print = always || (((normalize || lcsmode) && (score >= threshold)) || ((!lcsmode && !normalize) && (score <= threshold)));
|
|
if (print && !lcsmode && normalize)
|
|
score = 1.0 - score;
|
|
}
|
|
printResults(db1.fastaSeqs+i, db2.fastaSeqs+j, score, extradata, pairs, print);
|
|
}
|
|
|
|
free(iseq1-sizeForSeqs+lmax);
|
|
free(iseq2-sizeForSeqs+lmax);
|
|
return 0;
|
|
}
|
|
|
|
|
|
int main(int argc, char **argv)
|
|
{
|
|
|
|
int32_t carg = 0;
|
|
int32_t errflag = 0;
|
|
BOOL normalize = TRUE;
|
|
BOOL lcsmode = TRUE;
|
|
int reference = ALILEN;
|
|
BOOL extradata = FALSE;
|
|
BOOL onlyATGC = TRUE;
|
|
double threshold = 0.0;
|
|
int ndb = 0;
|
|
int nproc = 1;
|
|
fastaSeqCount db1;
|
|
fastaSeqCount db2;
|
|
|
|
while ((carg = getopt(argc, argv, "hlLanrdt:p:gx")) != -1) {
|
|
switch (carg) {
|
|
/* -------------------- */
|
|
case 'h': /* help */
|
|
/* -------------------- */
|
|
PrintHelp();
|
|
exit(0);
|
|
break;
|
|
|
|
/* -------------------------------------------------- */
|
|
case 'l': /* Normalize LCS/Error by the shortest sequence length*/
|
|
/* -------------------------------------------------- */
|
|
reference=MINLEN;
|
|
break;
|
|
|
|
/* -------------------------------------------------- */
|
|
case 'L': /* Normalize LCS/Error by the largest sequence length */
|
|
/* -------------------------------------------------- */
|
|
reference=MAXLEN;
|
|
break;
|
|
|
|
/* -------------------------------------------------- */
|
|
case 'a': /* Normalize LCS/Error by the alignment length */
|
|
/* -------------------------------------------------- */
|
|
reference=ALILEN;
|
|
break;
|
|
|
|
/* -------------------------------------------------- */
|
|
case 'n': /* Normalize LCS by the reference length */
|
|
/* -------------------------------------------------- */
|
|
normalize=TRUE;
|
|
break;
|
|
|
|
/* -------------------------------------------------- */
|
|
case 'r': /* No normalization */
|
|
/* -------------------------------------------------- */
|
|
normalize=FALSE;
|
|
break;
|
|
|
|
/* -------------------------------------------------- */
|
|
case 'd': /* Score is expressed in distance */
|
|
/* -------------------------------------------------- */
|
|
lcsmode=FALSE;
|
|
break;
|
|
|
|
/* ------------------------------------------------------------------- */
|
|
case 't': /* Prints only pairs with similarity higher than (threshold) */
|
|
/* ------------------------------------------------------------------- */
|
|
sscanf(optarg,"%lf",&threshold);
|
|
break;
|
|
|
|
/* -------------------------------------------------- */
|
|
case 'p': /* number of processors to use */
|
|
/* -------------------------------------------------- */
|
|
sscanf(optarg,"%d",&nproc);
|
|
break;
|
|
|
|
/* -------------------------------------------------- */
|
|
case 'x': /* Print extra data (node weight, lseq1, lseq2) */
|
|
/* -------------------------------------------------- */
|
|
extradata=TRUE;
|
|
break;
|
|
|
|
|
|
/* -------------------------------------------------- */
|
|
case 'g': /* replace n's with a's in sequences */
|
|
/* -------------------------------------------------- */
|
|
onlyATGC=FALSE;
|
|
break;
|
|
|
|
|
|
case '?': /* bad option */
|
|
errflag++;
|
|
break;
|
|
}
|
|
}
|
|
|
|
ndb = argc - optind;
|
|
if (ndb < 1)
|
|
errflag++;
|
|
|
|
if (errflag)
|
|
ExitUsage(errflag);
|
|
|
|
fprintf(stderr,"===============================================================\n");
|
|
fprintf(stderr," SUMATRA version %s\n",VERSION);
|
|
#ifdef __SSE2__
|
|
fprintf(stderr,"Alignment using SSE2 code\n");
|
|
#else
|
|
fprintf(stderr,"Alignment using standard code, SSE2 unavailable\n");
|
|
#endif
|
|
fprintf(stderr,"===============================================================\n");
|
|
|
|
|
|
if (normalize && (threshold > 1.0))
|
|
{
|
|
fprintf(stderr, "\nERROR : Please specify a threshold included between 0.0 and 1.0 when normalizing scores.\n\n");
|
|
exit(1);
|
|
}
|
|
|
|
fprintf(stderr,"Reading dataset 1...");
|
|
db1 = seq_readAllSeq2(argv[optind], TRUE, onlyATGC);
|
|
fprintf(stderr,"\n%d sequences\n",db1.count);
|
|
|
|
if (!onlyATGC)
|
|
(void)cleanDB(db1);
|
|
|
|
addCounts(&db1);
|
|
|
|
if (threshold>0)
|
|
(void)hashDB(db1);
|
|
|
|
optind++;
|
|
|
|
if (ndb == 2)
|
|
{
|
|
fprintf(stderr,"Reading dataset 2...");
|
|
db2 = seq_readAllSeq2(argv[optind], TRUE, onlyATGC);
|
|
fprintf(stderr,"\n%d sequences\n",db2.count);
|
|
|
|
if (!onlyATGC)
|
|
(void)cleanDB(db2);
|
|
|
|
addCounts(&db2);
|
|
|
|
if (threshold>0)
|
|
(void)hashDB(db2);
|
|
}
|
|
|
|
if (!lcsmode && normalize && (threshold > 0))
|
|
threshold = 1.0 - threshold;
|
|
|
|
if (ndb==1)
|
|
{
|
|
if (nproc==1)
|
|
compare1(db1, threshold, normalize, reference, lcsmode, extradata);
|
|
else
|
|
mt_compare_sumatra(&db1, NULL, threshold, normalize, reference, lcsmode, extradata, nproc);
|
|
}
|
|
else
|
|
{
|
|
if (nproc==1)
|
|
compare2(db1, db2, threshold, normalize, reference, lcsmode, extradata);
|
|
else
|
|
mt_compare_sumatra(&db1, &db2, threshold, normalize, reference, lcsmode, extradata, nproc);
|
|
}
|
|
|
|
fprintf(stderr,"\nDone.\n\n");
|
|
return 0;
|
|
|
|
}
|