Files
sumatra/sumatra.c
2015-07-16 13:59:44 +02:00

501 lines
16 KiB
C

/**
* FileName: sumatra.c
* Authors: Eric Coissac, Celine Mercier
* Description: computation of pairwise similarities of DNA sequences
* **/
#include "sumatra.h"
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include <string.h>
#include <sys/time.h>
#include "../sumalibs/libfasta/sequence.h"
#include "../sumalibs/liblcs/upperband.h"
#include "../sumalibs/liblcs/sse_banded_LCS_alignment.h"
#include "../sumalibs/libutils/utilities.h"
#include "mtcompare_sumatra.h"
#define VERSION "1.0.02"
/* ----------------------------------------------- */
/* printout help */
/* ----------------------------------------------- */
#define PP fprintf(stdout,
static void PrintHelp()
{
PP "-----------------------------------------------------------------------------------------------------------------------------\n");
PP " SUMATRA Version %s\n", VERSION);
PP "-----------------------------------------------------------------------------------------------------------------------------\n");
PP " Synopsis : sumatra computes all the pairwise LCS (Longest Common Subsequence) scores\n");
PP " of one nucleotide dataset or between two nucleotide datasets.\n");
PP " Usage: sumatra [options] <dataset1> [dataset2]\n");
PP "-----------------------------------------------------------------------------------------------------------------------------\n");
PP " Options:\n\n");
PP " -h : [H]elp - print <this> help\n\n");
PP " -l : Reference sequence length is the shortest. \n\n");
PP " -L : Reference sequence length is the largest. \n\n");
PP " -a : Reference sequence length is the alignment length (default). \n\n");
PP " -n : Score is normalized by reference sequence length (default).\n\n");
PP " -r : Raw score, not normalized. \n\n");
PP " -d : Score is expressed in distance (default: score is expressed in similarity). \n\n");
PP " -t ##.## : Score threshold. If the score is normalized and expressed in similarity (default),\n");
PP " it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized\n");
PP " and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%.\n");
PP " If the score is not normalized and expressed in similarity, it is the length of the\n");
PP " Longest Common Subsequence. If the score is not normalized and expressed in distance,\n");
PP " it is (reference length - LCS length).\n");
PP " Only sequence pairs with a similarity above ##.## are printed. Default: 0.00 \n");
PP " (no threshold).\n\n");
PP " -p ## : Number of threads used for computation (default=1).\n\n");
PP " -g : n's are replaced with a's (default: sequences with n's are discarded).\n");
PP " -x : Adds four extra columns with the count and length of both sequences.\n");
PP "-----------------------------------------------------------------------------------------------------------------------------\n");
PP " First argument : the nucleotide dataset to analyze\n\n");
PP " Second argument : optionally the second nucleotide dataset\n");
PP "-----------------------------------------------------------------------------------------------------------------------------\n");
PP " Results table description : \n");
PP " column 1 : Identifier sequence 1\n");
PP " column 2 : Identifier sequence 2\n");
PP " column 3 : Score\n");
PP " column 4 : Count of sequence 1 (only with option -x)\n");
PP " column 5 : Count of sequence 2 (only with option -x)\n");
PP " column 6 : Length of sequence 1 (only with option -x)\n");
PP " column 7 : Length of sequence 2 (only with option -x)\n");
PP "-----------------------------------------------------------------------------------------------------------------------------\n");
PP " http://metabarcoding.org/sumatra\n");
PP "-----------------------------------------------------------------------------------------------------------------------------\n\n");
}
#undef PP
/* ----------------------------------------------- */
/* printout usage and exit */
/* ----------------------------------------------- */
#define PP fprintf(stderr,
static void ExitUsage(stat)
int stat;
{
PP "usage: sumatra [-l|L|a|n|r|d|g|x] [-t threshold_value] [-p number of threads] dataset1 [dataset2]\n");
PP "type \"sumatra -h\" for help\n");
if (stat)
exit(stat);
}
#undef PP
void printResults(fastaSeqPtr seq1,fastaSeqPtr seq2,
double score,
BOOL extradata,
int64_t pairs,
BOOL print)
{
static struct timeval start;
static struct timeval lastprint;
static BOOL first=TRUE;
static uint64_t aligned=0;
struct timeval current;
double fraction;
time_t fulltime;
time_t remaintime;
double elapsedtime;
int32_t day;
int32_t hour;
int32_t minute;
int32_t seconde;
aligned++;
if (first)
{
first=FALSE;
gettimeofday(&start,NULL);
lastprint=start;
}
gettimeofday(&current,NULL);
if (current.tv_sec!=lastprint.tv_sec)
{
lastprint=current;
fraction = (double)aligned/(double)pairs;
elapsedtime = difftime(current.tv_sec,start.tv_sec);
fulltime = elapsedtime / fraction;
remaintime = (time_t)difftime(fulltime,(time_t)elapsedtime);
fprintf(stderr,
"Computed %lld / %lld -> %5.2lf%%",
aligned, pairs, fraction*100.
);
seconde = fulltime % 60;
minute = fulltime / 60;
hour = minute / 60;
minute = minute % 60;
day = hour / 24;
hour = hour % 24;
if (day)
fprintf(stderr,
", estimated computation time = %3d days %02d:%02d:%02d",
day,
hour,
minute,
seconde
);
else
fprintf(stderr,
", estimated computation time = %02d:%02d:%02d",
hour,
minute,
seconde
);
seconde = remaintime % 60;
minute = remaintime / 60;
hour = minute / 60;
minute = minute % 60;
day = hour / 24;
hour = hour % 24;
if (day)
fprintf(stderr,
", about %3d days %02d:%02d:%02d remaining \r",
day,
hour,
minute,
seconde
);
else
fprintf(stderr,
", about %02d:%02d:%02d remaining \r",
hour,
minute,
seconde
);
}
if (print)
{
if (extradata)
printf("%s\t%s\t%lf\t%d\t%d\t%d\t%d\n", seq1->accession_id,
seq2->accession_id,
score,
seq1->count,
seq2->count,
seq1->length,
seq2->length
);
else
printf("%s\t%s\t%lf\n", seq1->accession_id,
seq2->accession_id,
score);
}
}
int compare1(fastaSeqCount db1, double threshold, BOOL normalize, int reference, BOOL lcsmode, BOOL extradata)
{
BOOL always = TRUE;
int64_t pairs = (int64_t)(db1.count - 1) * (int64_t)db1.count /2;
BOOL print;
double score;
int32_t i,j;
char* s1;
char* s2;
int l1;
int l2;
int16_t* iseq1;
int16_t* iseq2;
int16_t* address;
int sizeForSeqs;
int lmax, lmin;
int LCSmin;
fprintf(stderr,"Pairwise alignments of one dataset against itself\n");
fprintf(stderr,"Count of alignments to do : %lld\n",pairs);
if (threshold > 0)
{
fprintf(stderr,"Computing for scores > %lf\n",threshold);
always = FALSE;
}
calculateMaxAndMinLenDB(db1, &lmax, &lmin);
sizeForSeqs = prepareTablesForSumathings(lmax, lmin, threshold, normalize, reference, lcsmode, &address, &iseq1, &iseq2);
for (i=0; i < db1.count; i++) // ...??
for (j=i+1; j < db1.count; j++)
{
print = FALSE;
filtersSumatra(db1.fastaSeqs+i, db1.fastaSeqs+j, threshold, normalize, reference, lcsmode, &score, &LCSmin);
if (score >= 0) // identical sequences
print = TRUE;
else if (always || (score == -1.0)) // filter passed or no threshold
{
s1 = (db1.fastaSeqs+i)->sequence;
l1 = (db1.fastaSeqs+i)->length;
s2 = (db1.fastaSeqs+j)->sequence;
l2 = (db1.fastaSeqs+j)->length;
score = alignForSumathings(s1, iseq1, s2, iseq2, l1, l2, normalize, reference, lcsmode, address, sizeForSeqs, LCSmin);
print = always || (((normalize || lcsmode) && (score >= threshold)) || ((!lcsmode && !normalize) && (score <= threshold)));
if (print && !lcsmode && normalize)
score = 1.0 - score;
}
printResults(db1.fastaSeqs+i, db1.fastaSeqs+j, score, extradata, pairs, print);
}
free(iseq1-sizeForSeqs+lmax);
free(iseq2-sizeForSeqs+lmax);
return 0;
}
int compare2(fastaSeqCount db1, fastaSeqCount db2, double threshold, BOOL normalize, int reference, BOOL lcsmode, BOOL extradata)
{
BOOL always = TRUE;
int64_t pairs = (int64_t)(db1.count) * (int64_t)(db2.count);
BOOL print;
double score;
int32_t i,j;
char* s1;
char* s2;
int l1;
int l2;
int16_t* iseq1;
int16_t* iseq2;
int16_t* address;
int sizeForSeqs;
int lmax;
int lmax1;
int lmin;
int lmin1;
int LCSmin;
fprintf(stderr,"Pairwise alignments of two datasets\n");
fprintf(stderr,"Count of alignments to do : %lld\n",pairs);
if (threshold > 0)
{
fprintf(stderr,"Computing for scores > %lf\n",threshold);
always = FALSE;
}
calculateMaxAndMinLenDB(db1, &lmax, &lmin);
calculateMaxAndMinLenDB(db2, &lmax1, &lmin1);
if (lmax1 > lmax)
lmax = lmax1;
if (lmin1 < lmin)
lmin = lmin1;
sizeForSeqs = prepareTablesForSumathings(lmax, lmin, threshold, normalize, reference, lcsmode, &address, &iseq1, &iseq2);
for (i=0; i < db1.count; i++)
for (j=0; j < db2.count; j++)
{
print = FALSE;
filtersSumatra(db1.fastaSeqs+i, db2.fastaSeqs+j, threshold, normalize, reference, lcsmode, &score, &LCSmin);
if (score >= 0) // identical sequences
print = TRUE;
else if (always || (score == -1.0)) // filter passed or no threshold
{
s1 = (db1.fastaSeqs+i)->sequence;
l1 = (db1.fastaSeqs+i)->length;
s2 = (db2.fastaSeqs+j)->sequence;
l2 = (db2.fastaSeqs+j)->length;
score = alignForSumathings(s1, iseq1, s2, iseq2, l1, l2, normalize, reference, lcsmode, address, sizeForSeqs, LCSmin);
print = always || (((normalize || lcsmode) && (score >= threshold)) || ((!lcsmode && !normalize) && (score <= threshold)));
if (print && !lcsmode && normalize)
score = 1.0 - score;
}
printResults(db1.fastaSeqs+i, db2.fastaSeqs+j, score, extradata, pairs, print);
}
free(iseq1-sizeForSeqs+lmax);
free(iseq2-sizeForSeqs+lmax);
return 0;
}
int main(int argc, char **argv)
{
int32_t carg = 0;
int32_t errflag = 0;
BOOL normalize = TRUE;
BOOL lcsmode = TRUE;
int reference = ALILEN;
BOOL extradata = FALSE;
BOOL onlyATGC = TRUE;
double threshold = 0.0;
int ndb = 0;
int nproc = 1;
fastaSeqCount db1;
fastaSeqCount db2;
while ((carg = getopt(argc, argv, "hlLanrdt:p:gx")) != -1) {
switch (carg) {
/* -------------------- */
case 'h': /* help */
/* -------------------- */
PrintHelp();
exit(0);
break;
/* -------------------------------------------------- */
case 'l': /* Normalize LCS/Error by the shortest sequence length*/
/* -------------------------------------------------- */
reference=MINLEN;
break;
/* -------------------------------------------------- */
case 'L': /* Normalize LCS/Error by the largest sequence length */
/* -------------------------------------------------- */
reference=MAXLEN;
break;
/* -------------------------------------------------- */
case 'a': /* Normalize LCS/Error by the alignment length */
/* -------------------------------------------------- */
reference=ALILEN;
break;
/* -------------------------------------------------- */
case 'n': /* Normalize LCS by the reference length */
/* -------------------------------------------------- */
normalize=TRUE;
break;
/* -------------------------------------------------- */
case 'r': /* No normalization */
/* -------------------------------------------------- */
normalize=FALSE;
break;
/* -------------------------------------------------- */
case 'd': /* Score is expressed in distance */
/* -------------------------------------------------- */
lcsmode=FALSE;
break;
/* ------------------------------------------------------------------- */
case 't': /* Prints only pairs with similarity higher than (threshold) */
/* ------------------------------------------------------------------- */
sscanf(optarg,"%lf",&threshold);
break;
/* -------------------------------------------------- */
case 'p': /* number of processors to use */
/* -------------------------------------------------- */
sscanf(optarg,"%d",&nproc);
break;
/* -------------------------------------------------- */
case 'x': /* Print extra data (node weight, lseq1, lseq2) */
/* -------------------------------------------------- */
extradata=TRUE;
break;
/* -------------------------------------------------- */
case 'g': /* replace n's with a's in sequences */
/* -------------------------------------------------- */
onlyATGC=FALSE;
break;
case '?': /* bad option */
errflag++;
break;
}
}
ndb = argc - optind;
if (ndb < 1)
errflag++;
if (errflag)
ExitUsage(errflag);
fprintf(stderr,"===============================================================\n");
fprintf(stderr," SUMATRA version %s\n",VERSION);
#ifdef __SSE2__
fprintf(stderr,"Alignment using SSE2 code\n");
#else
fprintf(stderr,"Alignment using standard code, SSE2 unavailable\n");
#endif
fprintf(stderr,"===============================================================\n");
if (normalize && (threshold > 1.0))
{
fprintf(stderr, "\nERROR : Please specify a threshold included between 0.0 and 1.0 when normalizing scores.\n\n");
exit(1);
}
fprintf(stderr,"Reading dataset 1...");
db1 = seq_readAllSeq2(argv[optind], TRUE, onlyATGC);
fprintf(stderr,"\n%d sequences\n",db1.count);
if (!onlyATGC)
(void)cleanDB(db1);
addCounts(&db1);
if (threshold>0)
(void)hashDB(db1);
optind++;
if (ndb == 2)
{
fprintf(stderr,"Reading dataset 2...");
db2 = seq_readAllSeq2(argv[optind], TRUE, onlyATGC);
fprintf(stderr,"\n%d sequences\n",db2.count);
if (!onlyATGC)
(void)cleanDB(db2);
addCounts(&db2);
if (threshold>0)
(void)hashDB(db2);
}
if (!lcsmode && normalize && (threshold > 0))
threshold = 1.0 - threshold;
if (ndb==1)
{
if (nproc==1)
compare1(db1, threshold, normalize, reference, lcsmode, extradata);
else
mt_compare_sumatra(&db1, NULL, threshold, normalize, reference, lcsmode, extradata, nproc);
}
else
{
if (nproc==1)
compare2(db1, db2, threshold, normalize, reference, lcsmode, extradata);
else
mt_compare_sumatra(&db1, &db2, threshold, normalize, reference, lcsmode, extradata, nproc);
}
fprintf(stderr,"\nDone.\n\n");
return 0;
}