sumatra version 1.0.02

2015-07-16 13:59:44 +02:00
commit 627c4e77ba
9 changed files with 1952 additions and 0 deletions
--- a/sumatra.c
+++ b/sumatra.c
@ -0,0 +1,500 @@
+/**
+ * FileName:    sumatra.c
+ * Authors:      Eric Coissac, Celine Mercier
+ * Description: computation of pairwise similarities of DNA sequences
+ * **/
+
+#include "sumatra.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <math.h>
+#include <string.h>
+
+#include <sys/time.h>
+
+#include "../sumalibs/libfasta/sequence.h"
+#include "../sumalibs/liblcs/upperband.h"
+#include "../sumalibs/liblcs/sse_banded_LCS_alignment.h"
+#include "../sumalibs/libutils/utilities.h"
+#include "mtcompare_sumatra.h"
+
+#define VERSION "1.0.02"
+
+
+/* ----------------------------------------------- */
+/* printout help                                   */
+/* ----------------------------------------------- */
+#define PP fprintf(stdout,
+
+static void PrintHelp()
+{
+        PP      "-----------------------------------------------------------------------------------------------------------------------------\n");
+        PP      " SUMATRA Version %s\n", VERSION);
+        PP      "-----------------------------------------------------------------------------------------------------------------------------\n");
+        PP      " Synopsis : sumatra computes all the pairwise LCS (Longest Common Subsequence) scores\n");
+        PP		" of one nucleotide dataset or between two nucleotide datasets.\n");
+        PP      " Usage: sumatra [options] <dataset1> [dataset2]\n");
+        PP      "-----------------------------------------------------------------------------------------------------------------------------\n");
+        PP      " Options:\n\n");
+        PP      " -h       : [H]elp - print <this> help\n\n");
+        PP      " -l       : Reference sequence length is the shortest. \n\n");
+        PP      " -L       : Reference sequence length is the largest. \n\n");
+        PP      " -a       : Reference sequence length is the alignment length (default). \n\n");
+        PP      " -n       : Score is normalized by reference sequence length (default).\n\n");
+        PP      " -r       : Raw score, not normalized. \n\n");
+        PP      " -d       : Score is expressed in distance (default: score is expressed in similarity). \n\n");
+        PP      " -t ##.## : Score threshold. If the score is normalized and expressed in similarity (default),\n");
+        PP		"            it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized\n");
+        PP		"            and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%.\n");
+        PP		"            If the score is not normalized and expressed in similarity, it is the length of the\n");
+        PP		"            Longest Common Subsequence. If the score is not normalized and expressed in distance,\n");
+        PP		"            it is (reference length - LCS length).\n");
+        PP		"            Only sequence pairs with a similarity above ##.## are printed. Default: 0.00 \n");
+        PP		"            (no threshold).\n\n");
+        PP      " -p ##    : Number of threads used for computation (default=1).\n\n");
+        PP      " -g       : n's are replaced with a's (default: sequences with n's are discarded).\n");
+        PP      " -x       : Adds four extra columns with the count and length of both sequences.\n");
+        PP      "-----------------------------------------------------------------------------------------------------------------------------\n");
+        PP      " First argument  : the nucleotide dataset to analyze\n\n");
+        PP      " Second argument : optionally the second nucleotide dataset\n");
+        PP      "-----------------------------------------------------------------------------------------------------------------------------\n");
+        PP      " Results table description : \n");
+        PP      " column 1 : Identifier sequence 1\n");
+        PP      " column 2 : Identifier sequence 2\n");
+        PP      " column 3 : Score\n");
+        PP      " column 4 : Count of sequence 1  (only with option -x)\n");
+        PP      " column 5 : Count of sequence 2  (only with option -x)\n");
+        PP      " column 6 : Length of sequence 1 (only with option -x)\n");
+        PP      " column 7 : Length of sequence 2 (only with option -x)\n");
+        PP      "-----------------------------------------------------------------------------------------------------------------------------\n");
+        PP		" http://metabarcoding.org/sumatra\n");
+        PP      "-----------------------------------------------------------------------------------------------------------------------------\n\n");
+}
+
+#undef PP
+
+/* ----------------------------------------------- */
+/* printout usage and exit                         */
+/* ----------------------------------------------- */
+
+#define PP fprintf(stderr,
+
+static void ExitUsage(stat)
+        int stat;
+{
+        PP      "usage: sumatra [-l|L|a|n|r|d|g|x] [-t threshold_value] [-p number of threads] dataset1 [dataset2]\n");
+        PP      "type \"sumatra -h\" for help\n");
+
+        if (stat)
+            exit(stat);
+}
+
+#undef  PP
+
+
+void printResults(fastaSeqPtr seq1,fastaSeqPtr seq2,
+			      double score,
+			      BOOL extradata,
+			      int64_t pairs,
+			      BOOL print)
+{
+	static struct timeval start;
+	static struct timeval lastprint;
+	static BOOL first=TRUE;
+	static uint64_t aligned=0;
+
+	struct timeval current;
+	double  fraction;
+	time_t fulltime;
+	time_t remaintime;
+	double elapsedtime;
+	int32_t day;
+	int32_t hour;
+	int32_t minute;
+	int32_t seconde;
+
+
+	aligned++;
+
+	if (first)
+	{
+		first=FALSE;
+		gettimeofday(&start,NULL);
+		lastprint=start;
+	}
+
+	gettimeofday(&current,NULL);
+
+	if (current.tv_sec!=lastprint.tv_sec)
+	{
+		lastprint=current;
+		fraction = (double)aligned/(double)pairs;
+		elapsedtime = difftime(current.tv_sec,start.tv_sec);
+		fulltime = elapsedtime / fraction;
+		remaintime = (time_t)difftime(fulltime,(time_t)elapsedtime);
+
+
+		fprintf(stderr,
+				"Computed %lld / %lld -> %5.2lf%%",
+				aligned, pairs, fraction*100.
+				);
+		seconde = fulltime % 60;
+		minute  = fulltime / 60;
+		hour    = minute / 60;
+		minute  = minute % 60;
+		day     = hour / 24;
+		hour    = hour % 24;
+		if (day)
+		fprintf(stderr,
+				", estimated computation time = %3d days %02d:%02d:%02d",
+				day,
+				hour,
+				minute,
+				seconde
+				);
+		else
+			fprintf(stderr,
+					", estimated computation time = %02d:%02d:%02d",
+					hour,
+					minute,
+					seconde
+					);
+
+		seconde = remaintime % 60;
+		minute  = remaintime / 60;
+		hour    = minute / 60;
+		minute  = minute % 60;
+		day     = hour / 24;
+		hour    = hour % 24;
+		if (day)
+		fprintf(stderr,
+				", about %3d days %02d:%02d:%02d remaining                  \r",
+				day,
+				hour,
+				minute,
+				seconde
+				);
+		else
+			fprintf(stderr,
+					", about %02d:%02d:%02d remaining                  \r",
+
+					hour,
+					minute,
+					seconde
+					);
+
+	}
+
+	if (print)
+	{
+
+	if (extradata)
+		printf("%s\t%s\t%lf\t%d\t%d\t%d\t%d\n", seq1->accession_id,
+										seq2->accession_id,
+										score,
+										seq1->count,
+										seq2->count,
+										seq1->length,
+										seq2->length
+		      );
+	else
+		printf("%s\t%s\t%lf\n", seq1->accession_id,
+				seq2->accession_id,
+				score);
+	}
+}
+
+
+int compare1(fastaSeqCount db1, double threshold, BOOL normalize, int reference, BOOL lcsmode, BOOL extradata)
+{
+	BOOL     always = TRUE;
+	int64_t  pairs = (int64_t)(db1.count - 1) * (int64_t)db1.count /2;
+	BOOL     print;
+	double   score;
+	int32_t  i,j;
+	char* s1;
+	char* s2;
+	int l1;
+	int l2;
+	int16_t* iseq1;
+	int16_t* iseq2;
+	int16_t* address;
+	int sizeForSeqs;
+	int lmax, lmin;
+	int LCSmin;
+
+	fprintf(stderr,"Pairwise alignments of one dataset against itself\n");
+	fprintf(stderr,"Count of alignments to do : %lld\n",pairs);
+
+	if (threshold > 0)
+	{
+		fprintf(stderr,"Computing for scores > %lf\n",threshold);
+		always = FALSE;
+	}
+
+	calculateMaxAndMinLenDB(db1, &lmax, &lmin);
+	sizeForSeqs = prepareTablesForSumathings(lmax, lmin, threshold, normalize, reference, lcsmode, &address, &iseq1, &iseq2);
+
+	for (i=0; i < db1.count; i++) // ...??
+		for (j=i+1; j < db1.count; j++)
+		{
+			print = FALSE;
+			filtersSumatra(db1.fastaSeqs+i, db1.fastaSeqs+j, threshold, normalize, reference, lcsmode, &score, &LCSmin);
+			if (score >= 0) // identical sequences
+				print = TRUE;
+			else if (always || (score == -1.0))	// filter passed or no threshold
+			{
+				s1 = (db1.fastaSeqs+i)->sequence;
+				l1 = (db1.fastaSeqs+i)->length;
+				s2 = (db1.fastaSeqs+j)->sequence;
+				l2 = (db1.fastaSeqs+j)->length;
+				score = alignForSumathings(s1, iseq1, s2, iseq2, l1, l2, normalize, reference, lcsmode, address, sizeForSeqs, LCSmin);
+				print = always || (((normalize || lcsmode) && (score >= threshold)) || ((!lcsmode && !normalize) && (score <= threshold)));
+				if (print && !lcsmode && normalize)
+					score = 1.0 - score;
+			}
+			printResults(db1.fastaSeqs+i, db1.fastaSeqs+j, score, extradata, pairs, print);
+		}
+
+	free(iseq1-sizeForSeqs+lmax);
+	free(iseq2-sizeForSeqs+lmax);
+	return 0;
+}
+
+
+int compare2(fastaSeqCount db1, fastaSeqCount db2, double threshold, BOOL normalize, int reference, BOOL lcsmode, BOOL extradata)
+{
+	BOOL     always = TRUE;
+	int64_t  pairs = (int64_t)(db1.count) * (int64_t)(db2.count);
+	BOOL     print;
+	double   score;
+	int32_t  i,j;
+	char* s1;
+	char* s2;
+	int l1;
+	int l2;
+	int16_t* iseq1;
+	int16_t* iseq2;
+	int16_t* address;
+	int sizeForSeqs;
+	int lmax;
+	int lmax1;
+	int lmin;
+	int lmin1;
+	int LCSmin;
+
+	fprintf(stderr,"Pairwise alignments of two datasets\n");
+	fprintf(stderr,"Count of alignments to do : %lld\n",pairs);
+
+	if (threshold > 0)
+	{
+		fprintf(stderr,"Computing for scores > %lf\n",threshold);
+		always = FALSE;
+	}
+
+	calculateMaxAndMinLenDB(db1, &lmax, &lmin);
+	calculateMaxAndMinLenDB(db2, &lmax1, &lmin1);
+
+	if (lmax1 > lmax)
+		lmax = lmax1;
+	if (lmin1 < lmin)
+		lmin = lmin1;
+
+	sizeForSeqs = prepareTablesForSumathings(lmax, lmin, threshold, normalize, reference, lcsmode, &address, &iseq1, &iseq2);
+
+	for (i=0; i < db1.count; i++)
+		for (j=0; j < db2.count; j++)
+		{
+			print = FALSE;
+			filtersSumatra(db1.fastaSeqs+i, db2.fastaSeqs+j, threshold, normalize, reference, lcsmode, &score, &LCSmin);
+			if (score >= 0) // identical sequences
+				print = TRUE;
+			else if (always || (score == -1.0))	// filter passed or no threshold
+			{
+				s1 = (db1.fastaSeqs+i)->sequence;
+				l1 = (db1.fastaSeqs+i)->length;
+				s2 = (db2.fastaSeqs+j)->sequence;
+				l2 = (db2.fastaSeqs+j)->length;
+				score = alignForSumathings(s1, iseq1, s2, iseq2, l1, l2, normalize, reference, lcsmode, address, sizeForSeqs, LCSmin);
+				print = always || (((normalize || lcsmode) && (score >= threshold)) || ((!lcsmode && !normalize) && (score <= threshold)));
+				if (print && !lcsmode && normalize)
+					score = 1.0 - score;
+			}
+			printResults(db1.fastaSeqs+i, db2.fastaSeqs+j, score, extradata, pairs, print);
+		}
+
+	free(iseq1-sizeForSeqs+lmax);
+	free(iseq2-sizeForSeqs+lmax);
+	return 0;
+}
+
+
+int main(int argc, char **argv)
+{
+
+	int32_t     	carg		= 0;
+	int32_t         errflag     = 0;
+	BOOL            normalize   = TRUE;
+	BOOL            lcsmode     = TRUE;
+	int             reference   = ALILEN;
+	BOOL            extradata   = FALSE;
+	BOOL			onlyATGC	   = TRUE;
+	double          threshold   = 0.0;
+	int             ndb         = 0;
+	int				nproc       = 1;
+	fastaSeqCount   db1;
+	fastaSeqCount   db2;
+
+	while ((carg = getopt(argc, argv, "hlLanrdt:p:gx")) != -1) {
+	    switch (carg) {
+								   /* -------------------- */
+	    case 'h':                  /* help                 */
+								   /* -------------------- */
+	    	PrintHelp();
+	    	exit(0);
+	    	break;
+
+									  /* -------------------------------------------------- */
+	    case 'l':               	  /* Normalize LCS/Error by the shortest sequence length*/
+									  /* -------------------------------------------------- */
+	    	reference=MINLEN;
+	        break;
+
+									  /* -------------------------------------------------- */
+	    case 'L':               	  /* Normalize LCS/Error by the largest sequence length */
+									  /* -------------------------------------------------- */
+	    	reference=MAXLEN;
+	        break;
+
+	          	  	  	  	  	  	  /* -------------------------------------------------- */
+	    case 'a':               	  /* Normalize LCS/Error by the alignment length        */
+	    	   	   	   	   	   	   	  /* -------------------------------------------------- */
+	    	reference=ALILEN;
+	    	break;
+
+									  /* -------------------------------------------------- */
+	    case 'n':               	  /* Normalize LCS by the reference length              */
+									  /* -------------------------------------------------- */
+	    	normalize=TRUE;
+	    	break;
+
+	    								/* -------------------------------------------------- */
+		case 'r':   					/* No normalization     				              */
+					  	  	  	  	  	/* -------------------------------------------------- */
+			normalize=FALSE;
+			break;
+
+									  /* -------------------------------------------------- */
+	    case 'd':               	  /* Score is expressed in distance                  */
+								      /* -------------------------------------------------- */
+	    	lcsmode=FALSE;
+	    	break;
+
+						/* ------------------------------------------------------------------- */
+		case 't':   	/* Prints only pairs with similarity higher than (threshold)         */
+						/* ------------------------------------------------------------------- */
+	    	sscanf(optarg,"%lf",&threshold);
+	    	break;
+
+							 /* -------------------------------------------------- */
+	    case 'p':            /* number of processors to use                        */
+							 /* -------------------------------------------------- */
+	    	sscanf(optarg,"%d",&nproc);
+			break;
+
+								  /* -------------------------------------------------- */
+	    case 'x':              	  /* Print extra data (node weight, lseq1, lseq2)       */
+								  /* -------------------------------------------------- */
+	    	extradata=TRUE;
+			break;
+
+
+									/* -------------------------------------------------- */
+		case 'g':   				/* replace n's with a's in sequences                  */
+									/* -------------------------------------------------- */
+			onlyATGC=FALSE;
+			break;
+
+
+	    case '?':               	/* bad option   	        */
+	    	errflag++;
+	    	break;
+	     }
+	}
+
+	ndb = argc - optind;
+	if (ndb < 1)
+        errflag++;
+
+	if (errflag)
+		ExitUsage(errflag);
+
+    fprintf(stderr,"===============================================================\n");
+	fprintf(stderr," SUMATRA version %s\n",VERSION);
+#ifdef __SSE2__
+	fprintf(stderr,"Alignment using SSE2 code\n");
+#else
+	fprintf(stderr,"Alignment using standard code, SSE2 unavailable\n");
+#endif
+	fprintf(stderr,"===============================================================\n");
+
+
+	if (normalize && (threshold > 1.0))
+	{
+		fprintf(stderr, "\nERROR : Please specify a threshold included between 0.0 and 1.0 when normalizing scores.\n\n");
+		exit(1);
+	}
+
+	fprintf(stderr,"Reading dataset 1...");
+	db1 = seq_readAllSeq2(argv[optind], TRUE, onlyATGC);
+	fprintf(stderr,"\n%d sequences\n",db1.count);
+
+	if (!onlyATGC)
+		(void)cleanDB(db1);
+
+	addCounts(&db1);
+
+	if (threshold>0)
+		(void)hashDB(db1);
+
+	optind++;
+
+	if (ndb == 2)
+	{
+	    fprintf(stderr,"Reading dataset 2...");
+		db2 = seq_readAllSeq2(argv[optind], TRUE, onlyATGC);
+		fprintf(stderr,"\n%d sequences\n",db2.count);
+
+		if (!onlyATGC)
+			(void)cleanDB(db2);
+
+		addCounts(&db2);
+
+		if (threshold>0)
+			(void)hashDB(db2);
+	}
+
+	if (!lcsmode && normalize && (threshold > 0))
+		threshold = 1.0 - threshold;
+
+	if (ndb==1)
+	{
+		if (nproc==1)
+			compare1(db1, threshold, normalize, reference, lcsmode, extradata);
+		else
+		    mt_compare_sumatra(&db1, NULL, threshold, normalize, reference, lcsmode, extradata, nproc);
+	}
+	else
+	{
+		if (nproc==1)
+			compare2(db1, db2, threshold, normalize, reference, lcsmode, extradata);
+		else
+		    mt_compare_sumatra(&db1, &db2, threshold, normalize, reference, lcsmode, extradata, nproc);
+	}
+
+	fprintf(stderr,"\nDone.\n\n");
+	return 0;
+
+}