sumatra version 1.0.02
This commit is contained in:
500
sumatra.c
Normal file
500
sumatra.c
Normal file
@ -0,0 +1,500 @@
|
||||
/**
|
||||
* FileName: sumatra.c
|
||||
* Authors: Eric Coissac, Celine Mercier
|
||||
* Description: computation of pairwise similarities of DNA sequences
|
||||
* **/
|
||||
|
||||
#include "sumatra.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <math.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <sys/time.h>
|
||||
|
||||
#include "../sumalibs/libfasta/sequence.h"
|
||||
#include "../sumalibs/liblcs/upperband.h"
|
||||
#include "../sumalibs/liblcs/sse_banded_LCS_alignment.h"
|
||||
#include "../sumalibs/libutils/utilities.h"
|
||||
#include "mtcompare_sumatra.h"
|
||||
|
||||
#define VERSION "1.0.02"
|
||||
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
/* printout help */
|
||||
/* ----------------------------------------------- */
|
||||
#define PP fprintf(stdout,
|
||||
|
||||
static void PrintHelp()
|
||||
{
|
||||
PP "-----------------------------------------------------------------------------------------------------------------------------\n");
|
||||
PP " SUMATRA Version %s\n", VERSION);
|
||||
PP "-----------------------------------------------------------------------------------------------------------------------------\n");
|
||||
PP " Synopsis : sumatra computes all the pairwise LCS (Longest Common Subsequence) scores\n");
|
||||
PP " of one nucleotide dataset or between two nucleotide datasets.\n");
|
||||
PP " Usage: sumatra [options] <dataset1> [dataset2]\n");
|
||||
PP "-----------------------------------------------------------------------------------------------------------------------------\n");
|
||||
PP " Options:\n\n");
|
||||
PP " -h : [H]elp - print <this> help\n\n");
|
||||
PP " -l : Reference sequence length is the shortest. \n\n");
|
||||
PP " -L : Reference sequence length is the largest. \n\n");
|
||||
PP " -a : Reference sequence length is the alignment length (default). \n\n");
|
||||
PP " -n : Score is normalized by reference sequence length (default).\n\n");
|
||||
PP " -r : Raw score, not normalized. \n\n");
|
||||
PP " -d : Score is expressed in distance (default: score is expressed in similarity). \n\n");
|
||||
PP " -t ##.## : Score threshold. If the score is normalized and expressed in similarity (default),\n");
|
||||
PP " it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized\n");
|
||||
PP " and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%.\n");
|
||||
PP " If the score is not normalized and expressed in similarity, it is the length of the\n");
|
||||
PP " Longest Common Subsequence. If the score is not normalized and expressed in distance,\n");
|
||||
PP " it is (reference length - LCS length).\n");
|
||||
PP " Only sequence pairs with a similarity above ##.## are printed. Default: 0.00 \n");
|
||||
PP " (no threshold).\n\n");
|
||||
PP " -p ## : Number of threads used for computation (default=1).\n\n");
|
||||
PP " -g : n's are replaced with a's (default: sequences with n's are discarded).\n");
|
||||
PP " -x : Adds four extra columns with the count and length of both sequences.\n");
|
||||
PP "-----------------------------------------------------------------------------------------------------------------------------\n");
|
||||
PP " First argument : the nucleotide dataset to analyze\n\n");
|
||||
PP " Second argument : optionally the second nucleotide dataset\n");
|
||||
PP "-----------------------------------------------------------------------------------------------------------------------------\n");
|
||||
PP " Results table description : \n");
|
||||
PP " column 1 : Identifier sequence 1\n");
|
||||
PP " column 2 : Identifier sequence 2\n");
|
||||
PP " column 3 : Score\n");
|
||||
PP " column 4 : Count of sequence 1 (only with option -x)\n");
|
||||
PP " column 5 : Count of sequence 2 (only with option -x)\n");
|
||||
PP " column 6 : Length of sequence 1 (only with option -x)\n");
|
||||
PP " column 7 : Length of sequence 2 (only with option -x)\n");
|
||||
PP "-----------------------------------------------------------------------------------------------------------------------------\n");
|
||||
PP " http://metabarcoding.org/sumatra\n");
|
||||
PP "-----------------------------------------------------------------------------------------------------------------------------\n\n");
|
||||
}
|
||||
|
||||
#undef PP
|
||||
|
||||
/* ----------------------------------------------- */
|
||||
/* printout usage and exit */
|
||||
/* ----------------------------------------------- */
|
||||
|
||||
#define PP fprintf(stderr,
|
||||
|
||||
static void ExitUsage(stat)
|
||||
int stat;
|
||||
{
|
||||
PP "usage: sumatra [-l|L|a|n|r|d|g|x] [-t threshold_value] [-p number of threads] dataset1 [dataset2]\n");
|
||||
PP "type \"sumatra -h\" for help\n");
|
||||
|
||||
if (stat)
|
||||
exit(stat);
|
||||
}
|
||||
|
||||
#undef PP
|
||||
|
||||
|
||||
void printResults(fastaSeqPtr seq1,fastaSeqPtr seq2,
|
||||
double score,
|
||||
BOOL extradata,
|
||||
int64_t pairs,
|
||||
BOOL print)
|
||||
{
|
||||
static struct timeval start;
|
||||
static struct timeval lastprint;
|
||||
static BOOL first=TRUE;
|
||||
static uint64_t aligned=0;
|
||||
|
||||
struct timeval current;
|
||||
double fraction;
|
||||
time_t fulltime;
|
||||
time_t remaintime;
|
||||
double elapsedtime;
|
||||
int32_t day;
|
||||
int32_t hour;
|
||||
int32_t minute;
|
||||
int32_t seconde;
|
||||
|
||||
|
||||
aligned++;
|
||||
|
||||
if (first)
|
||||
{
|
||||
first=FALSE;
|
||||
gettimeofday(&start,NULL);
|
||||
lastprint=start;
|
||||
}
|
||||
|
||||
gettimeofday(¤t,NULL);
|
||||
|
||||
if (current.tv_sec!=lastprint.tv_sec)
|
||||
{
|
||||
lastprint=current;
|
||||
fraction = (double)aligned/(double)pairs;
|
||||
elapsedtime = difftime(current.tv_sec,start.tv_sec);
|
||||
fulltime = elapsedtime / fraction;
|
||||
remaintime = (time_t)difftime(fulltime,(time_t)elapsedtime);
|
||||
|
||||
|
||||
fprintf(stderr,
|
||||
"Computed %lld / %lld -> %5.2lf%%",
|
||||
aligned, pairs, fraction*100.
|
||||
);
|
||||
seconde = fulltime % 60;
|
||||
minute = fulltime / 60;
|
||||
hour = minute / 60;
|
||||
minute = minute % 60;
|
||||
day = hour / 24;
|
||||
hour = hour % 24;
|
||||
if (day)
|
||||
fprintf(stderr,
|
||||
", estimated computation time = %3d days %02d:%02d:%02d",
|
||||
day,
|
||||
hour,
|
||||
minute,
|
||||
seconde
|
||||
);
|
||||
else
|
||||
fprintf(stderr,
|
||||
", estimated computation time = %02d:%02d:%02d",
|
||||
hour,
|
||||
minute,
|
||||
seconde
|
||||
);
|
||||
|
||||
seconde = remaintime % 60;
|
||||
minute = remaintime / 60;
|
||||
hour = minute / 60;
|
||||
minute = minute % 60;
|
||||
day = hour / 24;
|
||||
hour = hour % 24;
|
||||
if (day)
|
||||
fprintf(stderr,
|
||||
", about %3d days %02d:%02d:%02d remaining \r",
|
||||
day,
|
||||
hour,
|
||||
minute,
|
||||
seconde
|
||||
);
|
||||
else
|
||||
fprintf(stderr,
|
||||
", about %02d:%02d:%02d remaining \r",
|
||||
|
||||
hour,
|
||||
minute,
|
||||
seconde
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
if (print)
|
||||
{
|
||||
|
||||
if (extradata)
|
||||
printf("%s\t%s\t%lf\t%d\t%d\t%d\t%d\n", seq1->accession_id,
|
||||
seq2->accession_id,
|
||||
score,
|
||||
seq1->count,
|
||||
seq2->count,
|
||||
seq1->length,
|
||||
seq2->length
|
||||
);
|
||||
else
|
||||
printf("%s\t%s\t%lf\n", seq1->accession_id,
|
||||
seq2->accession_id,
|
||||
score);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int compare1(fastaSeqCount db1, double threshold, BOOL normalize, int reference, BOOL lcsmode, BOOL extradata)
|
||||
{
|
||||
BOOL always = TRUE;
|
||||
int64_t pairs = (int64_t)(db1.count - 1) * (int64_t)db1.count /2;
|
||||
BOOL print;
|
||||
double score;
|
||||
int32_t i,j;
|
||||
char* s1;
|
||||
char* s2;
|
||||
int l1;
|
||||
int l2;
|
||||
int16_t* iseq1;
|
||||
int16_t* iseq2;
|
||||
int16_t* address;
|
||||
int sizeForSeqs;
|
||||
int lmax, lmin;
|
||||
int LCSmin;
|
||||
|
||||
fprintf(stderr,"Pairwise alignments of one dataset against itself\n");
|
||||
fprintf(stderr,"Count of alignments to do : %lld\n",pairs);
|
||||
|
||||
if (threshold > 0)
|
||||
{
|
||||
fprintf(stderr,"Computing for scores > %lf\n",threshold);
|
||||
always = FALSE;
|
||||
}
|
||||
|
||||
calculateMaxAndMinLenDB(db1, &lmax, &lmin);
|
||||
sizeForSeqs = prepareTablesForSumathings(lmax, lmin, threshold, normalize, reference, lcsmode, &address, &iseq1, &iseq2);
|
||||
|
||||
for (i=0; i < db1.count; i++) // ...??
|
||||
for (j=i+1; j < db1.count; j++)
|
||||
{
|
||||
print = FALSE;
|
||||
filtersSumatra(db1.fastaSeqs+i, db1.fastaSeqs+j, threshold, normalize, reference, lcsmode, &score, &LCSmin);
|
||||
if (score >= 0) // identical sequences
|
||||
print = TRUE;
|
||||
else if (always || (score == -1.0)) // filter passed or no threshold
|
||||
{
|
||||
s1 = (db1.fastaSeqs+i)->sequence;
|
||||
l1 = (db1.fastaSeqs+i)->length;
|
||||
s2 = (db1.fastaSeqs+j)->sequence;
|
||||
l2 = (db1.fastaSeqs+j)->length;
|
||||
score = alignForSumathings(s1, iseq1, s2, iseq2, l1, l2, normalize, reference, lcsmode, address, sizeForSeqs, LCSmin);
|
||||
print = always || (((normalize || lcsmode) && (score >= threshold)) || ((!lcsmode && !normalize) && (score <= threshold)));
|
||||
if (print && !lcsmode && normalize)
|
||||
score = 1.0 - score;
|
||||
}
|
||||
printResults(db1.fastaSeqs+i, db1.fastaSeqs+j, score, extradata, pairs, print);
|
||||
}
|
||||
|
||||
free(iseq1-sizeForSeqs+lmax);
|
||||
free(iseq2-sizeForSeqs+lmax);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int compare2(fastaSeqCount db1, fastaSeqCount db2, double threshold, BOOL normalize, int reference, BOOL lcsmode, BOOL extradata)
|
||||
{
|
||||
BOOL always = TRUE;
|
||||
int64_t pairs = (int64_t)(db1.count) * (int64_t)(db2.count);
|
||||
BOOL print;
|
||||
double score;
|
||||
int32_t i,j;
|
||||
char* s1;
|
||||
char* s2;
|
||||
int l1;
|
||||
int l2;
|
||||
int16_t* iseq1;
|
||||
int16_t* iseq2;
|
||||
int16_t* address;
|
||||
int sizeForSeqs;
|
||||
int lmax;
|
||||
int lmax1;
|
||||
int lmin;
|
||||
int lmin1;
|
||||
int LCSmin;
|
||||
|
||||
fprintf(stderr,"Pairwise alignments of two datasets\n");
|
||||
fprintf(stderr,"Count of alignments to do : %lld\n",pairs);
|
||||
|
||||
if (threshold > 0)
|
||||
{
|
||||
fprintf(stderr,"Computing for scores > %lf\n",threshold);
|
||||
always = FALSE;
|
||||
}
|
||||
|
||||
calculateMaxAndMinLenDB(db1, &lmax, &lmin);
|
||||
calculateMaxAndMinLenDB(db2, &lmax1, &lmin1);
|
||||
|
||||
if (lmax1 > lmax)
|
||||
lmax = lmax1;
|
||||
if (lmin1 < lmin)
|
||||
lmin = lmin1;
|
||||
|
||||
sizeForSeqs = prepareTablesForSumathings(lmax, lmin, threshold, normalize, reference, lcsmode, &address, &iseq1, &iseq2);
|
||||
|
||||
for (i=0; i < db1.count; i++)
|
||||
for (j=0; j < db2.count; j++)
|
||||
{
|
||||
print = FALSE;
|
||||
filtersSumatra(db1.fastaSeqs+i, db2.fastaSeqs+j, threshold, normalize, reference, lcsmode, &score, &LCSmin);
|
||||
if (score >= 0) // identical sequences
|
||||
print = TRUE;
|
||||
else if (always || (score == -1.0)) // filter passed or no threshold
|
||||
{
|
||||
s1 = (db1.fastaSeqs+i)->sequence;
|
||||
l1 = (db1.fastaSeqs+i)->length;
|
||||
s2 = (db2.fastaSeqs+j)->sequence;
|
||||
l2 = (db2.fastaSeqs+j)->length;
|
||||
score = alignForSumathings(s1, iseq1, s2, iseq2, l1, l2, normalize, reference, lcsmode, address, sizeForSeqs, LCSmin);
|
||||
print = always || (((normalize || lcsmode) && (score >= threshold)) || ((!lcsmode && !normalize) && (score <= threshold)));
|
||||
if (print && !lcsmode && normalize)
|
||||
score = 1.0 - score;
|
||||
}
|
||||
printResults(db1.fastaSeqs+i, db2.fastaSeqs+j, score, extradata, pairs, print);
|
||||
}
|
||||
|
||||
free(iseq1-sizeForSeqs+lmax);
|
||||
free(iseq2-sizeForSeqs+lmax);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
|
||||
int32_t carg = 0;
|
||||
int32_t errflag = 0;
|
||||
BOOL normalize = TRUE;
|
||||
BOOL lcsmode = TRUE;
|
||||
int reference = ALILEN;
|
||||
BOOL extradata = FALSE;
|
||||
BOOL onlyATGC = TRUE;
|
||||
double threshold = 0.0;
|
||||
int ndb = 0;
|
||||
int nproc = 1;
|
||||
fastaSeqCount db1;
|
||||
fastaSeqCount db2;
|
||||
|
||||
while ((carg = getopt(argc, argv, "hlLanrdt:p:gx")) != -1) {
|
||||
switch (carg) {
|
||||
/* -------------------- */
|
||||
case 'h': /* help */
|
||||
/* -------------------- */
|
||||
PrintHelp();
|
||||
exit(0);
|
||||
break;
|
||||
|
||||
/* -------------------------------------------------- */
|
||||
case 'l': /* Normalize LCS/Error by the shortest sequence length*/
|
||||
/* -------------------------------------------------- */
|
||||
reference=MINLEN;
|
||||
break;
|
||||
|
||||
/* -------------------------------------------------- */
|
||||
case 'L': /* Normalize LCS/Error by the largest sequence length */
|
||||
/* -------------------------------------------------- */
|
||||
reference=MAXLEN;
|
||||
break;
|
||||
|
||||
/* -------------------------------------------------- */
|
||||
case 'a': /* Normalize LCS/Error by the alignment length */
|
||||
/* -------------------------------------------------- */
|
||||
reference=ALILEN;
|
||||
break;
|
||||
|
||||
/* -------------------------------------------------- */
|
||||
case 'n': /* Normalize LCS by the reference length */
|
||||
/* -------------------------------------------------- */
|
||||
normalize=TRUE;
|
||||
break;
|
||||
|
||||
/* -------------------------------------------------- */
|
||||
case 'r': /* No normalization */
|
||||
/* -------------------------------------------------- */
|
||||
normalize=FALSE;
|
||||
break;
|
||||
|
||||
/* -------------------------------------------------- */
|
||||
case 'd': /* Score is expressed in distance */
|
||||
/* -------------------------------------------------- */
|
||||
lcsmode=FALSE;
|
||||
break;
|
||||
|
||||
/* ------------------------------------------------------------------- */
|
||||
case 't': /* Prints only pairs with similarity higher than (threshold) */
|
||||
/* ------------------------------------------------------------------- */
|
||||
sscanf(optarg,"%lf",&threshold);
|
||||
break;
|
||||
|
||||
/* -------------------------------------------------- */
|
||||
case 'p': /* number of processors to use */
|
||||
/* -------------------------------------------------- */
|
||||
sscanf(optarg,"%d",&nproc);
|
||||
break;
|
||||
|
||||
/* -------------------------------------------------- */
|
||||
case 'x': /* Print extra data (node weight, lseq1, lseq2) */
|
||||
/* -------------------------------------------------- */
|
||||
extradata=TRUE;
|
||||
break;
|
||||
|
||||
|
||||
/* -------------------------------------------------- */
|
||||
case 'g': /* replace n's with a's in sequences */
|
||||
/* -------------------------------------------------- */
|
||||
onlyATGC=FALSE;
|
||||
break;
|
||||
|
||||
|
||||
case '?': /* bad option */
|
||||
errflag++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
ndb = argc - optind;
|
||||
if (ndb < 1)
|
||||
errflag++;
|
||||
|
||||
if (errflag)
|
||||
ExitUsage(errflag);
|
||||
|
||||
fprintf(stderr,"===============================================================\n");
|
||||
fprintf(stderr," SUMATRA version %s\n",VERSION);
|
||||
#ifdef __SSE2__
|
||||
fprintf(stderr,"Alignment using SSE2 code\n");
|
||||
#else
|
||||
fprintf(stderr,"Alignment using standard code, SSE2 unavailable\n");
|
||||
#endif
|
||||
fprintf(stderr,"===============================================================\n");
|
||||
|
||||
|
||||
if (normalize && (threshold > 1.0))
|
||||
{
|
||||
fprintf(stderr, "\nERROR : Please specify a threshold included between 0.0 and 1.0 when normalizing scores.\n\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
fprintf(stderr,"Reading dataset 1...");
|
||||
db1 = seq_readAllSeq2(argv[optind], TRUE, onlyATGC);
|
||||
fprintf(stderr,"\n%d sequences\n",db1.count);
|
||||
|
||||
if (!onlyATGC)
|
||||
(void)cleanDB(db1);
|
||||
|
||||
addCounts(&db1);
|
||||
|
||||
if (threshold>0)
|
||||
(void)hashDB(db1);
|
||||
|
||||
optind++;
|
||||
|
||||
if (ndb == 2)
|
||||
{
|
||||
fprintf(stderr,"Reading dataset 2...");
|
||||
db2 = seq_readAllSeq2(argv[optind], TRUE, onlyATGC);
|
||||
fprintf(stderr,"\n%d sequences\n",db2.count);
|
||||
|
||||
if (!onlyATGC)
|
||||
(void)cleanDB(db2);
|
||||
|
||||
addCounts(&db2);
|
||||
|
||||
if (threshold>0)
|
||||
(void)hashDB(db2);
|
||||
}
|
||||
|
||||
if (!lcsmode && normalize && (threshold > 0))
|
||||
threshold = 1.0 - threshold;
|
||||
|
||||
if (ndb==1)
|
||||
{
|
||||
if (nproc==1)
|
||||
compare1(db1, threshold, normalize, reference, lcsmode, extradata);
|
||||
else
|
||||
mt_compare_sumatra(&db1, NULL, threshold, normalize, reference, lcsmode, extradata, nproc);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (nproc==1)
|
||||
compare2(db1, db2, threshold, normalize, reference, lcsmode, extradata);
|
||||
else
|
||||
mt_compare_sumatra(&db1, &db2, threshold, normalize, reference, lcsmode, extradata, nproc);
|
||||
}
|
||||
|
||||
fprintf(stderr,"\nDone.\n\n");
|
||||
return 0;
|
||||
|
||||
}
|
Reference in New Issue
Block a user