2015-07-16 14:41:51 +02:00
|
|
|
/**
|
|
|
|
* FileName: sumaclust.c
|
|
|
|
* Author: Celine Mercier
|
|
|
|
* Description: star clustering of DNA sequences
|
|
|
|
* **/
|
|
|
|
|
|
|
|
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <string.h>
|
|
|
|
#include <unistd.h>
|
|
|
|
#include <math.h>
|
|
|
|
#include <sys/time.h>
|
|
|
|
|
2020-04-01 15:15:15 +02:00
|
|
|
#include "sumalibs/libutils/utilities.h"
|
|
|
|
#include "sumalibs/libfasta/sequence.h"
|
|
|
|
#include "sumalibs/libfasta/fasta_header_parser.h"
|
|
|
|
#include "sumalibs/libfasta/fasta_header_handler.h"
|
|
|
|
#include "sumalibs/libfasta/fasta_seq_writer.h"
|
|
|
|
#include "sumalibs/liblcs/upperband.h"
|
|
|
|
#include "sumalibs/liblcs/sse_banded_LCS_alignment.h"
|
2015-07-16 14:41:51 +02:00
|
|
|
|
|
|
|
#include "mtcompare_sumaclust.h"
|
|
|
|
#include "sumaclust.h"
|
|
|
|
|
2020-04-01 15:15:15 +02:00
|
|
|
#define VERSION "1.0.35"
|
2015-07-16 14:41:51 +02:00
|
|
|
|
|
|
|
|
|
|
|
/* ----------------------------------------------- */
|
|
|
|
/* printout help */
|
|
|
|
/* ----------------------------------------------- */
|
|
|
|
|
|
|
|
#define PP fprintf(stdout,
|
|
|
|
|
|
|
|
|
|
|
|
static void PrintHelp()
|
|
|
|
{
|
|
|
|
PP "------------------------------------------------------------\n");
|
|
|
|
PP " SUMACLUST Version %s\n", VERSION);
|
|
|
|
PP "------------------------------------------------------------\n");
|
|
|
|
PP " Synopsis : star clustering of sequences.\n");
|
|
|
|
PP " Usage: sumaclust [options] <dataset>\n");
|
|
|
|
PP "------------------------------------------------------------\n");
|
|
|
|
PP " Options:\n");
|
|
|
|
PP " -h : [H]elp - print <this> help\n\n");
|
|
|
|
PP " -l : Reference sequence length is the shortest. \n\n");
|
|
|
|
PP " -L : Reference sequence length is the largest. \n\n");
|
|
|
|
PP " -a : Reference sequence length is the alignment length (default). \n\n");
|
|
|
|
PP " -n : Score is normalized by reference sequence length (default).\n\n");
|
|
|
|
PP " -r : Raw score, not normalized. \n\n");
|
|
|
|
PP " -d : Score is expressed in distance (default : score is expressed in similarity). \n\n");
|
|
|
|
PP " -t ##.## : Score threshold for clustering. If the score is normalized and expressed in similarity (default),\n");
|
|
|
|
PP " it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized\n");
|
|
|
|
PP " and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%.\n");
|
|
|
|
PP " If the score is not normalized and expressed in similarity, it is the length of the\n");
|
|
|
|
PP " Longest Common Subsequence. If the score is not normalized and expressed in distance,\n");
|
|
|
|
PP " it is (reference length - LCS length).\n");
|
|
|
|
PP " Only sequences with a similarity above ##.## with the center sequence of a cluster\n");
|
|
|
|
PP " are assigned to that cluster. Default: 0.97.\n\n");
|
|
|
|
PP " -e : Exact option : A sequence is assigned to the cluster with the center sequence presenting the\n");
|
|
|
|
PP " highest similarity score > threshold, as opposed to the default 'fast' option where a sequence is\n");
|
|
|
|
PP " assigned to the first cluster found with a center sequence presenting a score > threshold.\n\n");
|
|
|
|
PP " -R ## : Maximum ratio between the counts of two sequences so that the less abundant one can be considered\n");
|
|
|
|
PP " as a variant of the more abundant one. Default: 1.0.\n\n");
|
|
|
|
PP " -p ## : Multithreading with ## threads using openMP.\n\n");
|
|
|
|
PP " -s #### : Sorting by ####. Must be 'None' for no sorting, or a key in the fasta header of each sequence,\n");
|
|
|
|
PP " except for the count that can be computed (default : sorting by count).\n\n");
|
|
|
|
PP " -o : Sorting is in ascending order (default : descending).\n\n");
|
|
|
|
PP " -g : n's are replaced with a's (default: sequences with n's are discarded).\n\n");
|
|
|
|
PP " -B ### : Output of the OTU table in BIOM format is activated, and written to file ###.\n\n");
|
|
|
|
PP " -O ### : Output of the OTU map (observation map) is activated, and written to file ###.\n\n");
|
|
|
|
PP " -F ### : Output in FASTA format is written to file ### instead of standard output.\n\n");
|
|
|
|
PP " -f : Output in FASTA format is deactivated.\n");
|
|
|
|
PP "\n");
|
|
|
|
PP "------------------------------------------------------------\n");
|
2016-02-22 15:56:46 +01:00
|
|
|
PP " Argument : the nucleotide dataset to cluster (or nothing \n");
|
|
|
|
PP " if the standard input should be used). \n");
|
2015-07-16 14:41:51 +02:00
|
|
|
PP "------------------------------------------------------------\n");
|
2016-02-22 15:56:46 +01:00
|
|
|
PP " http://metabarcoding.org/sumaclust\n");
|
2015-07-16 14:41:51 +02:00
|
|
|
PP "------------------------------------------------------------\n\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
#undef PP
|
|
|
|
|
|
|
|
/* ----------------------------------------------- */
|
|
|
|
/* printout usage and exit */
|
|
|
|
/* ----------------------------------------------- */
|
|
|
|
|
|
|
|
#define PP fprintf(stderr,
|
|
|
|
|
|
|
|
|
|
|
|
static void ExitUsage(stat)
|
|
|
|
int stat;
|
|
|
|
{
|
|
|
|
PP "usage: sumaclust [-l|L|a|n|r|d|e|o|g|f] [-t threshold_value] [-s sorting_key] [-R maximum_ratio] [-p number_of_threads]\n");
|
|
|
|
PP "[-B file_name_for_BIOM-formatted_output] [-O file_name_for_OTU_table-formatted_output] [-F file_name_for_FASTA-formatted_output] dataset\n");
|
|
|
|
PP "type \"sumaclust -h\" for help\n");
|
|
|
|
|
|
|
|
if (stat)
|
|
|
|
exit(stat);
|
|
|
|
}
|
|
|
|
|
|
|
|
#undef PP
|
|
|
|
|
|
|
|
|
|
|
|
static char* sortingKey="count";
|
|
|
|
|
|
|
|
static int sortSeqsP(const void **s1, const void **s2)
|
|
|
|
{
|
|
|
|
int res;
|
|
|
|
double r1;
|
|
|
|
double r2;
|
|
|
|
|
|
|
|
r1 = atof(getItemFromHeader(sortingKey, ((fastaSeqPtr) *s2)->header));
|
|
|
|
r2 = atof(getItemFromHeader(sortingKey, ((fastaSeqPtr) *s2)->header));
|
|
|
|
if (r2 > r1)
|
|
|
|
res = 1;
|
|
|
|
else if (r2 < r1)
|
|
|
|
res = -1;
|
|
|
|
else
|
|
|
|
res = 0;
|
|
|
|
|
|
|
|
return(res);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static int reverseSortSeqsP(const void **s1, const void **s2)
|
|
|
|
{
|
|
|
|
int res;
|
|
|
|
double r1;
|
|
|
|
double r2;
|
|
|
|
|
|
|
|
r1 = atof(getItemFromHeader(sortingKey, ((fastaSeqPtr) *s2)->header));
|
|
|
|
r2 = atof(getItemFromHeader(sortingKey, ((fastaSeqPtr) *s2)->header));
|
|
|
|
|
|
|
|
if (r1 > r2)
|
|
|
|
res = 1;
|
|
|
|
else if (r1 < r2)
|
|
|
|
res = -1;
|
|
|
|
else
|
|
|
|
res = 0;
|
|
|
|
|
|
|
|
return(res);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int uniqSeqsDoubleSortFunction(const void *s1, const void *s2)
|
|
|
|
{
|
|
|
|
int c;
|
|
|
|
char* str_r1;
|
|
|
|
double r1;
|
|
|
|
double r2;
|
|
|
|
|
|
|
|
c = strcmp(((fastaSeqPtr) s1)->sequence, ((fastaSeqPtr) s2)->sequence);
|
|
|
|
if (c == 0)
|
|
|
|
{
|
|
|
|
str_r1 = getItemFromHeader(sortingKey, ((fastaSeqPtr) s1)->header);
|
|
|
|
if (str_r1 == NULL)
|
|
|
|
{
|
|
|
|
fprintf(stderr, "\nERROR: '%s' not in sequence header(s).\n\n", sortingKey);
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
r1 = atof(str_r1);
|
|
|
|
r2 = atof(getItemFromHeader(sortingKey, ((fastaSeqPtr) s2)->header));
|
|
|
|
|
|
|
|
if (r2 > r1)
|
|
|
|
c = 1;
|
|
|
|
else if (r2 < r1)
|
|
|
|
c = -1;
|
|
|
|
else
|
|
|
|
c = 0;
|
|
|
|
}
|
|
|
|
return(c);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int uniqSeqsDoubleReverseSortFunction(const void *s1, const void *s2)
|
|
|
|
{
|
|
|
|
int c;
|
|
|
|
char* str_r1;
|
|
|
|
double r1;
|
|
|
|
double r2;
|
|
|
|
|
|
|
|
c = strcmp(((fastaSeqPtr) s1)->sequence, ((fastaSeqPtr) s2)->sequence);
|
|
|
|
if (c == 0)
|
|
|
|
{
|
|
|
|
str_r1 = getItemFromHeader(sortingKey, ((fastaSeqPtr) s1)->header);
|
|
|
|
if (str_r1 == NULL)
|
|
|
|
{
|
|
|
|
fprintf(stderr, "\nERROR: '%s' not in sequence header(s).\n\n", sortingKey);
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
r1 = atof(str_r1);
|
|
|
|
r2 = atof(getItemFromHeader(sortingKey, ((fastaSeqPtr) s2)->header));
|
|
|
|
|
|
|
|
if (r1 > r2)
|
|
|
|
c = 1;
|
|
|
|
else if (r1 < r2)
|
|
|
|
c = -1;
|
|
|
|
else
|
|
|
|
c = 0;
|
|
|
|
}
|
|
|
|
return(c);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void printInBIOMformat(fastaSeqPtr* uniqSeqs, int count, int numberOfCenters, char* biomFile_name)
|
|
|
|
{
|
|
|
|
int i, j, n;
|
|
|
|
FILE* biomFile;
|
|
|
|
struct tm* tm_info;
|
|
|
|
time_t timer;
|
|
|
|
char buffer_date[20];
|
|
|
|
fastaSeqPtr* c;
|
|
|
|
fastaSeqPtr* seq;
|
|
|
|
int id_len;
|
|
|
|
int row_number;
|
|
|
|
BOOL first_center = TRUE;
|
|
|
|
|
|
|
|
int buffer_col_rows;
|
|
|
|
int buffer_col_rows_1;
|
|
|
|
int buffer_col_rows_2;
|
|
|
|
|
|
|
|
buffer_col_rows = 29;
|
|
|
|
buffer_col_rows_1 = 9;
|
|
|
|
buffer_col_rows_2 = 20;
|
|
|
|
|
|
|
|
n = 0;
|
|
|
|
|
|
|
|
biomFile = fopen(biomFile_name, "w");
|
|
|
|
if (biomFile == NULL)
|
|
|
|
fprintf(stderr, "\nCan't open BIOM output file.\n"); //, %s outputFilename);
|
|
|
|
|
|
|
|
for (i=0; i<count; i++) // Loop to store columns
|
|
|
|
{
|
|
|
|
seq = uniqSeqs+i;
|
|
|
|
id_len = strlen((*seq)->accession_id);
|
|
|
|
j=0;
|
|
|
|
|
|
|
|
if ((*seq)->cluster_center) // center sequence
|
|
|
|
{
|
|
|
|
n++;
|
|
|
|
(*seq)->cluster_weight_unique_ids = 1;
|
|
|
|
|
|
|
|
if (first_center)
|
|
|
|
{
|
|
|
|
(*seq)->columns_BIOM_size = id_len + buffer_col_rows;
|
|
|
|
(*seq)->columns_BIOM = (char*) malloc(((*seq)->columns_BIOM_size)*sizeof(char));
|
|
|
|
strcpy((*seq)->columns_BIOM, "{\"id\": \"");
|
|
|
|
first_center = FALSE;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
(*seq)->columns_BIOM_size = id_len + buffer_col_rows + 1;
|
|
|
|
(*seq)->columns_BIOM = (char*) malloc(((*seq)->columns_BIOM_size)*sizeof(char));
|
|
|
|
strcpy((*seq)->columns_BIOM, ",{\"id\": \"");
|
|
|
|
}
|
|
|
|
|
|
|
|
memcpy((*seq)->columns_BIOM + (*seq)->columns_BIOM_size - id_len - buffer_col_rows_2 - 1, (*seq)->accession_id, id_len);
|
|
|
|
memcpy((*seq)->columns_BIOM + (*seq)->columns_BIOM_size - buffer_col_rows_2 - 1, "\", \"metadata\": null}", buffer_col_rows_2+1);
|
|
|
|
|
|
|
|
if ((*seq)->next != NULL) // not last sequence
|
|
|
|
{
|
|
|
|
for (j=1; ((((*seq)+j)->next != NULL) && (((*seq)+j)->uniqHead == FALSE)); j++) // identical sequences
|
|
|
|
{
|
|
|
|
id_len = strlen((*(seq)+j)->accession_id);
|
|
|
|
n++;
|
|
|
|
|
|
|
|
(*seq)->cluster_weight_unique_ids++;
|
|
|
|
(*seq)->columns_BIOM_size = (*seq)->columns_BIOM_size + id_len + buffer_col_rows;
|
|
|
|
(*seq)->columns_BIOM = realloc((*seq)->columns_BIOM, ((*seq)->columns_BIOM_size) * sizeof(char));
|
|
|
|
memcpy((*seq)->columns_BIOM + (*seq)->columns_BIOM_size - buffer_col_rows - id_len - 1, ",{\"id\": \"", buffer_col_rows_1);
|
|
|
|
memcpy((*seq)->columns_BIOM + (*seq)->columns_BIOM_size - id_len - buffer_col_rows_2 - 1, (*(seq)+j)->accession_id, id_len);
|
|
|
|
memcpy((*seq)->columns_BIOM + (*seq)->columns_BIOM_size - buffer_col_rows_2 - 1, "\", \"metadata\": null}", buffer_col_rows_2+1);
|
|
|
|
}
|
|
|
|
if ((((*seq)+j)->next == NULL) && (((*seq)+j)->uniqHead == FALSE)) // last sequence
|
|
|
|
{
|
|
|
|
id_len = strlen((*(seq)+j)->accession_id);
|
|
|
|
n++;
|
|
|
|
|
|
|
|
(*seq)->cluster_weight_unique_ids++;
|
|
|
|
(*seq)->columns_BIOM_size = (*seq)->columns_BIOM_size + id_len + buffer_col_rows;
|
|
|
|
(*seq)->columns_BIOM = realloc((*seq)->columns_BIOM, ((*seq)->columns_BIOM_size) * sizeof(char));
|
|
|
|
memcpy((*seq)->columns_BIOM + (*seq)->columns_BIOM_size - buffer_col_rows - id_len - 1, ",{\"id\": \"", buffer_col_rows_1);
|
|
|
|
memcpy((*seq)->columns_BIOM + (*seq)->columns_BIOM_size - id_len - buffer_col_rows_2 - 1, (*(seq)+j)->accession_id, id_len);
|
|
|
|
memcpy((*seq)->columns_BIOM + (*seq)->columns_BIOM_size - buffer_col_rows_2 - 1, "\", \"metadata\": null}", buffer_col_rows_2+1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else // not a center sequence
|
|
|
|
{
|
|
|
|
n++;
|
|
|
|
|
|
|
|
c = (*seq)->center;
|
|
|
|
|
|
|
|
id_len = strlen((*seq)->accession_id);
|
|
|
|
n++;
|
|
|
|
|
|
|
|
(*c)->cluster_weight_unique_ids++;
|
|
|
|
(*c)->columns_BIOM_size = (*c)->columns_BIOM_size + id_len + buffer_col_rows;
|
|
|
|
(*c)->columns_BIOM = realloc((*c)->columns_BIOM, ((*c)->columns_BIOM_size) * sizeof(char));
|
|
|
|
memcpy((*c)->columns_BIOM + (*c)->columns_BIOM_size - buffer_col_rows - id_len - 1, ",{\"id\": \"", buffer_col_rows_1);
|
|
|
|
memcpy((*c)->columns_BIOM + (*c)->columns_BIOM_size - id_len - buffer_col_rows_2 - 1, (*seq)->accession_id, id_len);
|
|
|
|
memcpy((*c)->columns_BIOM + (*c)->columns_BIOM_size - buffer_col_rows_2 - 1, "\", \"metadata\": null}", buffer_col_rows_2+1);
|
|
|
|
|
|
|
|
if ((*seq)->next != NULL) // not last sequence
|
|
|
|
{
|
|
|
|
for (j=1; ((((*seq)+j)->next != NULL) && (((*seq)+j)->uniqHead == FALSE)); j++) // identical sequences
|
|
|
|
{
|
|
|
|
id_len = strlen((*(seq)+j)->accession_id);
|
|
|
|
n++;
|
|
|
|
|
|
|
|
(*c)->cluster_weight_unique_ids++;
|
|
|
|
(*c)->columns_BIOM_size = (*c)->columns_BIOM_size + id_len + buffer_col_rows;
|
|
|
|
(*c)->columns_BIOM = realloc((*c)->columns_BIOM, ((*c)->columns_BIOM_size) * sizeof(char));
|
|
|
|
memcpy((*c)->columns_BIOM + (*c)->columns_BIOM_size - buffer_col_rows - id_len - 1, ",{\"id\": \"", buffer_col_rows_1);
|
|
|
|
memcpy((*c)->columns_BIOM + (*c)->columns_BIOM_size - id_len - buffer_col_rows_2 - 1, (*(seq)+j)->accession_id, id_len);
|
|
|
|
memcpy((*c)->columns_BIOM + (*c)->columns_BIOM_size - buffer_col_rows_2 - 1, "\", \"metadata\": null}", buffer_col_rows_2+1);
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((((*seq)+j)->next == NULL) && (((*seq)+j)->uniqHead == FALSE)) // last sequence
|
|
|
|
{
|
|
|
|
id_len = strlen((*(seq)+j)->accession_id);
|
|
|
|
n++;
|
|
|
|
|
|
|
|
(*c)->cluster_weight_unique_ids++;
|
|
|
|
(*c)->columns_BIOM_size = (*c)->columns_BIOM_size + id_len + buffer_col_rows;
|
|
|
|
(*c)->columns_BIOM = realloc((*c)->columns_BIOM, ((*c)->columns_BIOM_size) * sizeof(char));
|
|
|
|
memcpy((*c)->columns_BIOM + (*c)->columns_BIOM_size - buffer_col_rows - id_len - 1, ",{\"id\": \"", buffer_col_rows_1);
|
|
|
|
memcpy((*c)->columns_BIOM + (*c)->columns_BIOM_size - id_len - buffer_col_rows_2 - 1, (*(seq)+j)->accession_id, id_len);
|
|
|
|
memcpy((*c)->columns_BIOM + (*c)->columns_BIOM_size - buffer_col_rows_2 - 1, "\", \"metadata\": null}", buffer_col_rows_2+1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
time(&timer);
|
|
|
|
tm_info = localtime(&timer);
|
|
|
|
strftime(buffer_date, 20, "%Y-%m-%dT%H:%M:%S", tm_info);
|
|
|
|
|
|
|
|
fprintf(biomFile, "{\"id\": \"None\",\"format\": \"Biological Observation Matrix 1.0.0\","
|
|
|
|
"\"format_url\": \"http://biom-format.org\",\"type\": \"OTU table\","
|
|
|
|
"\"generated_by\": \"SUMACLUST %s\",\"date\": \"%s\",\"matrix_type\": \"sparse\","
|
|
|
|
"\"matrix_element_type\": \"int\",\"shape\": [%d, %d],",
|
|
|
|
VERSION, buffer_date, numberOfCenters, n);
|
|
|
|
|
|
|
|
// print data
|
|
|
|
|
|
|
|
row_number = 0;
|
|
|
|
n = 0;
|
|
|
|
|
|
|
|
fprintf(biomFile, "\"data\": [");
|
|
|
|
|
|
|
|
for (i=0; i<count; i++)
|
|
|
|
{
|
|
|
|
seq = uniqSeqs+i;
|
|
|
|
if ((*seq)->cluster_center) // center sequence
|
|
|
|
{
|
|
|
|
for (j=0; j<(*seq)->cluster_weight_unique_ids; j++)
|
|
|
|
{
|
|
|
|
if ((row_number == (numberOfCenters - 1)) && (j == ((*seq)->cluster_weight_unique_ids - 1))) // last seq to print
|
|
|
|
fprintf(biomFile, "[%d,%d,1]],", row_number, n);
|
|
|
|
else
|
|
|
|
fprintf(biomFile, "[%d,%d,1],", row_number, n);
|
|
|
|
n++;
|
|
|
|
}
|
|
|
|
row_number++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// end data
|
|
|
|
|
|
|
|
// Print rows
|
|
|
|
|
|
|
|
first_center = TRUE;
|
|
|
|
|
|
|
|
for (i=0; i<count; i++)
|
|
|
|
{
|
|
|
|
seq = uniqSeqs+i;
|
|
|
|
if ((*seq)->cluster_center) // center sequence
|
|
|
|
{
|
|
|
|
if (first_center)
|
|
|
|
{
|
|
|
|
fprintf(biomFile, "\"rows\": [{\"id\": \"%s\", \"metadata\": null}", (*seq)->accession_id);
|
|
|
|
first_center = FALSE;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
fprintf(biomFile, ",{\"id\": \"%s\", \"metadata\": null}", (*seq)->accession_id);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Print columns
|
|
|
|
|
|
|
|
fprintf(biomFile, "],\"columns\": [");
|
|
|
|
for (i=0; i<count; i++)
|
|
|
|
{
|
|
|
|
seq = uniqSeqs+i;
|
|
|
|
if ((*seq)->cluster_center) // center sequence
|
|
|
|
fprintf(biomFile, (*seq)->columns_BIOM);
|
|
|
|
}
|
|
|
|
fprintf(biomFile, "]}");
|
|
|
|
|
|
|
|
fclose(biomFile);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void printInOTUtableFormat(fastaSeqPtr* uniqSeqs, int count, char* OTUtableFile_name)
|
|
|
|
{
|
|
|
|
int i, j;
|
|
|
|
FILE* OTUtableFile;
|
|
|
|
fastaSeqPtr* c;
|
|
|
|
fastaSeqPtr* seq;
|
|
|
|
int id_len;
|
|
|
|
|
|
|
|
OTUtableFile = fopen(OTUtableFile_name, "w");
|
|
|
|
if (OTUtableFile == NULL)
|
|
|
|
fprintf(stderr, "\nCan't open OTU table output file.\n"); //, %s outputFilename);
|
|
|
|
|
|
|
|
for (i=0; i<count; i++)
|
|
|
|
{
|
|
|
|
seq = uniqSeqs+i;
|
|
|
|
id_len = strlen((*seq)->accession_id);
|
|
|
|
j=0;
|
|
|
|
|
|
|
|
|
|
|
|
if ((*seq)->cluster_center) // center sequence
|
|
|
|
{
|
|
|
|
(*seq)->line_OTU_table_size = id_len*2 + 2;
|
|
|
|
(*seq)->line_OTU_table = (char*) malloc(((*seq)->line_OTU_table_size)*sizeof(char));
|
|
|
|
strcpy((*seq)->line_OTU_table, (*seq)->accession_id);
|
|
|
|
memcpy((*seq)->line_OTU_table + (*seq)->line_OTU_table_size - id_len - 2, "\t", 1);
|
|
|
|
memcpy((*seq)->line_OTU_table + (*seq)->line_OTU_table_size - id_len - 1, (*seq)->accession_id, id_len);
|
|
|
|
memcpy((*seq)->line_OTU_table + (*seq)->line_OTU_table_size - 1, "\0", 1);
|
|
|
|
|
|
|
|
if ((*seq)->next != NULL) // not last sequence
|
|
|
|
{
|
|
|
|
for (j=1; ((((*seq)+j)->next != NULL) && (((*seq)+j)->uniqHead == FALSE)); j++) // identical sequences
|
|
|
|
{
|
|
|
|
id_len = strlen((*(seq)+j)->accession_id);
|
|
|
|
|
|
|
|
(*seq)->line_OTU_table_size = (*seq)->line_OTU_table_size + id_len + 1;
|
|
|
|
(*seq)->line_OTU_table = realloc((*seq)->line_OTU_table, ((*seq)->line_OTU_table_size) * sizeof(char));
|
|
|
|
memcpy((*seq)->line_OTU_table + (*seq)->line_OTU_table_size - id_len - 2, "\t", 1);
|
|
|
|
memcpy((*seq)->line_OTU_table + (*seq)->line_OTU_table_size - id_len - 1, (*(seq)+j)->accession_id, id_len);
|
|
|
|
memcpy((*seq)->line_OTU_table + (*seq)->line_OTU_table_size - 1, "\0", 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((((*seq)+j)->next == NULL) && (((*seq)+j)->uniqHead == FALSE)) // last sequence
|
|
|
|
{
|
|
|
|
id_len = strlen((*(seq)+j)->accession_id);
|
|
|
|
|
|
|
|
(*seq)->line_OTU_table_size = (*seq)->line_OTU_table_size + id_len + 1;
|
|
|
|
(*seq)->line_OTU_table = realloc((*seq)->line_OTU_table, ((*seq)->line_OTU_table_size) * sizeof(char));
|
|
|
|
memcpy((*seq)->line_OTU_table + (*seq)->line_OTU_table_size - id_len - 2, "\t", 1);
|
|
|
|
memcpy((*seq)->line_OTU_table + (*seq)->line_OTU_table_size - id_len - 1, (*(seq)+j)->accession_id, id_len);
|
|
|
|
memcpy((*seq)->line_OTU_table + (*seq)->line_OTU_table_size - 1, "\0", 1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else // not a center sequence
|
|
|
|
{
|
|
|
|
c = (*seq)->center;
|
|
|
|
|
|
|
|
(*c)->line_OTU_table_size = (*c)->line_OTU_table_size + id_len + 1;
|
|
|
|
(*c)->line_OTU_table = realloc((*c)->line_OTU_table, ((*c)->line_OTU_table_size) * sizeof(char));
|
|
|
|
memcpy((*c)->line_OTU_table + (*c)->line_OTU_table_size - id_len - 2, "\t", 1);
|
|
|
|
memcpy((*c)->line_OTU_table + (*c)->line_OTU_table_size - id_len - 1, (*seq)->accession_id, id_len);
|
|
|
|
memcpy((*c)->line_OTU_table + (*c)->line_OTU_table_size - 1, "\0", 1);
|
|
|
|
|
|
|
|
if ((*seq)->next != NULL) // not last sequence
|
|
|
|
{
|
|
|
|
for (j=1; ((((*seq)+j)->next != NULL) && (((*seq)+j)->uniqHead == FALSE)); j++) // identical sequences
|
|
|
|
{
|
|
|
|
id_len = strlen((*(seq)+j)->accession_id);
|
|
|
|
|
|
|
|
(*c)->line_OTU_table_size = (*c)->line_OTU_table_size + id_len + 1;
|
|
|
|
(*c)->line_OTU_table = realloc((*c)->line_OTU_table, ((*c)->line_OTU_table_size) * sizeof(char));
|
|
|
|
memcpy((*c)->line_OTU_table + (*c)->line_OTU_table_size - id_len - 2, "\t", 1);
|
|
|
|
memcpy((*c)->line_OTU_table + (*c)->line_OTU_table_size - id_len - 1, (*(seq)+j)->accession_id, id_len);
|
|
|
|
memcpy((*c)->line_OTU_table + (*c)->line_OTU_table_size - 1, "\0", 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((((*seq)+j)->next == NULL) && (((*seq)+j)->uniqHead == FALSE)) // last sequence
|
|
|
|
{
|
|
|
|
id_len = strlen((*(seq)+j)->accession_id);
|
|
|
|
|
|
|
|
(*c)->line_OTU_table_size = (*c)->line_OTU_table_size + id_len + 1;
|
|
|
|
(*c)->line_OTU_table = realloc((*c)->line_OTU_table, ((*c)->line_OTU_table_size) * sizeof(char));
|
|
|
|
memcpy((*c)->line_OTU_table + (*c)->line_OTU_table_size - id_len - 2, "\t", 1);
|
|
|
|
memcpy((*c)->line_OTU_table + (*c)->line_OTU_table_size - id_len - 1, (*(seq)+j)->accession_id, id_len);
|
|
|
|
memcpy((*c)->line_OTU_table + (*c)->line_OTU_table_size - 1, "\0", 1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Print rows
|
|
|
|
|
|
|
|
for (i=0; i<count; i++)
|
|
|
|
{
|
|
|
|
seq = uniqSeqs+i;
|
|
|
|
if ((*seq)->cluster_center) // center sequence
|
|
|
|
{
|
|
|
|
fprintf(OTUtableFile, (*seq)->line_OTU_table);
|
|
|
|
fprintf(OTUtableFile, "\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
fclose(OTUtableFile);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void printSeq(fastaSeqPtr* seq, fastaSeqPtr* center, double score, FILE* output)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
char* score_n;
|
|
|
|
char* score_v;
|
|
|
|
char* cluster_n;
|
|
|
|
char* cluster_v;
|
|
|
|
char* center_n;
|
|
|
|
char* center_true;
|
|
|
|
char* center_false;
|
|
|
|
int id_size;
|
|
|
|
|
|
|
|
score_n = (char*) malloc(14*sizeof(char));
|
|
|
|
score_v = (char*) malloc(20*sizeof(char));
|
|
|
|
|
|
|
|
strcpy(score_n, "cluster_score");
|
|
|
|
sprintf(score_v,"%f", score);
|
|
|
|
|
|
|
|
id_size = strlen((*center)->accession_id);
|
|
|
|
|
|
|
|
cluster_n = (char*) malloc(8*sizeof(char));
|
|
|
|
cluster_v = (char*) malloc((id_size+1)*sizeof(char));
|
|
|
|
|
|
|
|
strcpy(cluster_n, "cluster");
|
|
|
|
strcpy(cluster_v, (*center)->accession_id);
|
|
|
|
|
|
|
|
center_n = (char*) malloc(15*sizeof(char));
|
|
|
|
strcpy(center_n, "cluster_center");
|
|
|
|
center_true = (char*) malloc(5*sizeof(char));
|
|
|
|
strcpy(center_true, "True");
|
|
|
|
center_false = (char*) malloc(6*sizeof(char));
|
|
|
|
strcpy(center_false, "False");
|
|
|
|
|
|
|
|
(*seq)->header = table_header_add_field((*seq)->header, cluster_n, cluster_v);
|
|
|
|
(*seq)->header = table_header_add_field((*seq)->header, score_n, score_v);
|
|
|
|
if ((*seq)->cluster_center)
|
|
|
|
(*seq)->header = table_header_add_field((*seq)->header, center_n, center_true);
|
|
|
|
else
|
|
|
|
(*seq)->header = table_header_add_field((*seq)->header, center_n, center_false);
|
|
|
|
|
|
|
|
printOnlyHeaderFromTable((*seq)->header, output);
|
|
|
|
printOnlySeqFromFastaSeqPtr((*seq), output);
|
|
|
|
|
|
|
|
if ((*seq)->next != NULL)
|
|
|
|
{
|
|
|
|
for (i=1; ((((*seq)+i)->next != NULL) && (((*seq)+i)->uniqHead == FALSE)); i++)
|
|
|
|
{
|
|
|
|
((*seq)+i)->header = table_header_add_field(((*seq)+i)->header, cluster_n, cluster_v);
|
|
|
|
((*seq)+i)->header = table_header_add_field(((*seq)+i)->header, score_n, score_v);
|
|
|
|
((*seq)+i)->header = table_header_add_field(((*seq)+i)->header, center_n, center_false);
|
|
|
|
|
|
|
|
printOnlyHeaderFromTable(((*seq)+i)->header, output);
|
|
|
|
printOnlySeqFromFastaSeqPtr(((*seq)+i), output);
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((((*seq)+i)->next == NULL) && (((*seq)+i)->uniqHead == FALSE)) // last sequence
|
|
|
|
{
|
|
|
|
((*seq)+i)->header = table_header_add_field(((*seq)+i)->header, cluster_n, cluster_v);
|
|
|
|
((*seq)+i)->header = table_header_add_field(((*seq)+i)->header, score_n, score_v);
|
|
|
|
((*seq)+i)->header = table_header_add_field(((*seq)+i)->header, center_n, center_false);
|
|
|
|
|
|
|
|
printOnlyHeaderFromTable(((*seq)+i)->header, output);
|
|
|
|
printOnlySeqFromFastaSeqPtr(((*seq)+i), output);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void putSeqInCluster(fastaSeqPtr* seq, fastaSeqPtr* center, double score)
|
|
|
|
{
|
|
|
|
(*seq)->center = center;
|
|
|
|
(*seq)->score = score;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2017-10-13 19:47:20 +02:00
|
|
|
int compare(fastaSeqPtr* db, int n, BOOL fastOption, double threshold, BOOL normalize, int reference, BOOL lcsmode,
|
2015-07-16 14:41:51 +02:00
|
|
|
double max_ratio)
|
|
|
|
{
|
|
|
|
double score;
|
|
|
|
double scoremax;
|
|
|
|
double worstscore;
|
|
|
|
BOOL toCluster;
|
|
|
|
static BOOL first=TRUE;
|
|
|
|
int32_t i,j,k;
|
|
|
|
int center;
|
|
|
|
float p;
|
|
|
|
BOOL found;
|
|
|
|
int lmax, lmin;
|
|
|
|
int16_t* address;
|
|
|
|
int16_t* iseq1;
|
|
|
|
int16_t* iseq2;
|
|
|
|
int l1;
|
|
|
|
int l2;
|
|
|
|
char* s1;
|
|
|
|
char* s2;
|
|
|
|
int sizeForSeqs;
|
|
|
|
int LCSmin;
|
|
|
|
|
|
|
|
if (lcsmode || normalize)
|
|
|
|
fprintf(stderr,"Clustering sequences when similarity >= %lf\n", threshold);
|
|
|
|
else
|
|
|
|
fprintf(stderr,"Clustering sequences when distance <= %lf\n", threshold);
|
|
|
|
|
|
|
|
fprintf(stderr,"Aligning and clustering... \n");
|
|
|
|
|
|
|
|
int* centers = (int*) malloc(n * sizeof(int));
|
|
|
|
|
|
|
|
for (i=0; i < n; i++)
|
|
|
|
centers[i] = -1;
|
|
|
|
|
|
|
|
k=0;
|
|
|
|
found = FALSE;
|
|
|
|
|
|
|
|
calculateMaxAndMinLen(db, n, &lmax, &lmin);
|
|
|
|
|
|
|
|
sizeForSeqs = prepareTablesForSumathings(lmax, lmin, threshold, normalize, reference, lcsmode, &address, &iseq1, &iseq2);
|
|
|
|
|
|
|
|
if (lcsmode || normalize)
|
|
|
|
worstscore = 0.0;
|
|
|
|
else
|
|
|
|
worstscore = lmax;
|
|
|
|
|
|
|
|
for (i=0; i < n; i++)
|
|
|
|
{
|
|
|
|
if (i%100 == 0)
|
|
|
|
{
|
|
|
|
p = (i/(float)n)*100;
|
|
|
|
fprintf(stderr,"\rDone : %f %% %d clusters created",p,k);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (first)
|
|
|
|
{
|
|
|
|
first = FALSE;
|
|
|
|
if (normalize && lcsmode)
|
|
|
|
score = 1.0;
|
|
|
|
else if (!lcsmode)
|
|
|
|
score = 0.0;
|
|
|
|
else
|
|
|
|
score = (*(db+i))->length;
|
|
|
|
(*(db+i))->cluster_center = TRUE;
|
|
|
|
putSeqInCluster(db+i, db+i, score);
|
|
|
|
centers[k] = i;
|
|
|
|
k++;
|
|
|
|
}
|
|
|
|
|
|
|
|
else
|
|
|
|
{
|
|
|
|
scoremax = worstscore;
|
|
|
|
center = 0;
|
|
|
|
found = FALSE;
|
|
|
|
toCluster = FALSE;
|
|
|
|
j=0;
|
|
|
|
|
|
|
|
s1 = (*(db+i))->sequence;
|
|
|
|
l1 = (*(db+i))->length;
|
|
|
|
|
|
|
|
while (((found == FALSE) && (centers[j] != -1) && (fastOption == TRUE)) || ((fastOption == FALSE) && (centers[j] != -1)))
|
|
|
|
{
|
|
|
|
score = worstscore;
|
|
|
|
|
|
|
|
if ((double) ((*(db+i))->count) / (double) ((*(db+centers[j]))->count) <= max_ratio)
|
|
|
|
{
|
|
|
|
filters((*(db+i)), (*(db+centers[j])), threshold, normalize, reference, lcsmode, &score, &LCSmin);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (score == -1.0)
|
|
|
|
{
|
|
|
|
s2 = (*(db+centers[j]))->sequence;
|
|
|
|
l2 = (*(db+centers[j]))->length;
|
|
|
|
|
|
|
|
score = alignForSumathings(s1, iseq1, s2, iseq2, l1, l2, normalize, reference, lcsmode, address, sizeForSeqs, LCSmin);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (((score >= threshold) && (lcsmode || normalize) && (score > scoremax)) || ((!lcsmode && !normalize) && (score <= threshold) && (score < scoremax)))
|
|
|
|
{
|
|
|
|
toCluster = TRUE;
|
|
|
|
scoremax = score;
|
|
|
|
center = centers[j];
|
|
|
|
if (fastOption == TRUE)
|
|
|
|
found = TRUE;
|
|
|
|
}
|
|
|
|
j++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (toCluster)
|
|
|
|
{
|
|
|
|
if (!lcsmode && normalize)
|
|
|
|
scoremax = 1.0 - scoremax;
|
|
|
|
(*(db+i))->cluster_center = FALSE;
|
|
|
|
putSeqInCluster(db+i, db+center, scoremax);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (normalize && lcsmode)
|
|
|
|
score = 1.0;
|
|
|
|
else if (!lcsmode)
|
|
|
|
score = 0.0;
|
|
|
|
else
|
|
|
|
score = (*(db+i))->length;
|
|
|
|
(*(db+i))->cluster_center = TRUE;
|
|
|
|
putSeqInCluster(db+i, db+i, score);
|
|
|
|
centers[k] = i;
|
|
|
|
k++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
fprintf(stderr,"\rDone : 100 %% %d clusters created. \n",k);
|
|
|
|
|
|
|
|
free(centers);
|
|
|
|
|
|
|
|
free(iseq1-sizeForSeqs+lmax);
|
|
|
|
free(iseq2-sizeForSeqs+lmax);
|
|
|
|
|
2017-10-13 19:47:20 +02:00
|
|
|
if (normalize && (reference == ALILEN))
|
2015-07-16 14:41:51 +02:00
|
|
|
free(address);
|
|
|
|
|
|
|
|
return(k);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void computeClusterWeights(fastaSeqPtr* uniqSeqs, int n)
|
|
|
|
{
|
|
|
|
int i,j;
|
|
|
|
fastaSeqPtr* seq;
|
|
|
|
fastaSeqPtr* center;
|
|
|
|
char* cluster_weight_n;
|
|
|
|
char* cluster_weight_v;
|
|
|
|
int cluster_weight;
|
|
|
|
|
|
|
|
for (i=0; i<n; i++)
|
|
|
|
{
|
|
|
|
seq = uniqSeqs+i;
|
|
|
|
|
|
|
|
if ((*seq)->cluster_center)
|
|
|
|
(*seq)->cluster_weight = (*seq)->count;
|
|
|
|
else
|
|
|
|
{
|
|
|
|
center = (*seq)->center;
|
|
|
|
((*center)->cluster_weight)+=(*seq)->count;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i=0; i<n; i++)
|
|
|
|
{
|
|
|
|
seq = uniqSeqs+i;
|
|
|
|
|
|
|
|
if ((*seq)->cluster_center)
|
|
|
|
cluster_weight = (*seq)->cluster_weight;
|
|
|
|
else
|
|
|
|
{
|
|
|
|
center = (*seq)->center;
|
|
|
|
cluster_weight = (*center)->cluster_weight;
|
|
|
|
}
|
|
|
|
cluster_weight_n = (char*) malloc(15*sizeof(char));
|
|
|
|
cluster_weight_v = (char*) malloc(20*sizeof(char));
|
|
|
|
strcpy(cluster_weight_n, "cluster_weight");
|
|
|
|
sprintf(cluster_weight_v,"%d", cluster_weight);
|
|
|
|
(*seq)->header = table_header_add_field((*seq)->header, cluster_weight_n, cluster_weight_v);
|
|
|
|
|
|
|
|
if ((*seq)->next != NULL) // not the last sequence
|
|
|
|
{
|
|
|
|
for (j=1; ((((*seq)+j)->next != NULL) && (((*seq)+j)->uniqHead == FALSE)); j++)
|
|
|
|
(*(seq)+j)->header = table_header_add_field((*(seq)+j)->header, cluster_weight_n, cluster_weight_v);
|
|
|
|
|
|
|
|
if ((((*seq)+j)->next == NULL) && (((*seq)+j)->uniqHead == FALSE)) // last sequence
|
|
|
|
(*(seq)+j)->header = table_header_add_field((*(seq)+j)->header, cluster_weight_n, cluster_weight_v);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int main(int argc, char** argv)
|
|
|
|
{
|
|
|
|
|
|
|
|
int32_t carg = 0;
|
|
|
|
int32_t errflag = 0;
|
|
|
|
char* sort;
|
|
|
|
double threshold = 0.97;
|
|
|
|
double max_ratio = 1.0;
|
|
|
|
BOOL lcsmode = TRUE;
|
|
|
|
BOOL fastOption = TRUE;
|
|
|
|
BOOL normalize = TRUE;
|
|
|
|
BOOL reverse = FALSE;
|
|
|
|
BOOL onlyATGC = TRUE;
|
|
|
|
int reference = ALILEN;
|
|
|
|
int nproc = 1;
|
|
|
|
BOOL printBIOM = FALSE;
|
|
|
|
BOOL printOTUtable = FALSE;
|
|
|
|
BOOL printFASTA = TRUE;
|
|
|
|
BOOL printFASTAtofile = FALSE;
|
|
|
|
FILE* FASTA_output = stdout;
|
|
|
|
fastaSeqCount db;
|
|
|
|
int i,n;
|
|
|
|
fastaSeqPtr* uniqSeqs;
|
|
|
|
char* biomFile_name;
|
|
|
|
char* OTUtableFile_name;
|
|
|
|
char* FASTA_file_name;
|
|
|
|
int numberOfCenters;
|
|
|
|
|
|
|
|
|
|
|
|
sort = malloc(1024*sizeof(char));
|
|
|
|
strcpy(sort, "count");
|
|
|
|
|
|
|
|
biomFile_name = malloc(1024*sizeof(char));
|
|
|
|
OTUtableFile_name = malloc(1024*sizeof(char));
|
|
|
|
FASTA_file_name = malloc(1024*sizeof(char));
|
|
|
|
|
|
|
|
|
|
|
|
while ((carg = getopt(argc, argv, "hlLanrdet:p:s:ogB:O:R:fF:")) != -1) {
|
|
|
|
switch (carg) {
|
|
|
|
/* -------------------- */
|
|
|
|
case 'h': /* help */
|
|
|
|
/* -------------------- */
|
|
|
|
PrintHelp();
|
|
|
|
exit(0);
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* -------------------------------------------------- */
|
|
|
|
case 'l': /* Normalize LCS/Error by the shortest sequence length*/
|
|
|
|
/* -------------------------------------------------- */
|
|
|
|
reference=MINLEN;
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* -------------------------------------------------- */
|
|
|
|
case 'L': /* Normalize LCS/Error by the largest sequence length */
|
|
|
|
/* -------------------------------------------------- */
|
|
|
|
reference=MAXLEN;
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* -------------------------------------------------- */
|
|
|
|
case 'a': /* Normalize LCS/Error by the alignment length */
|
|
|
|
/* -------------------------------------------------- */
|
|
|
|
reference=ALILEN;
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* -------------------------------------------------- */
|
|
|
|
case 'n': /* Normalize LCS by the reference length */
|
|
|
|
/* -------------------------------------------------- */
|
|
|
|
normalize=TRUE;
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* -------------------------------------------------- */
|
|
|
|
case 'r': /* No normalization */
|
|
|
|
/* -------------------------------------------------- */
|
|
|
|
normalize=FALSE;
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* -------------------------------------------------- */
|
|
|
|
case 'd': /* Score is expressed in distance */
|
|
|
|
/* -------------------------------------------------- */
|
|
|
|
lcsmode=FALSE;
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* ---------------------------------------------------------------------------------------------------------- */
|
|
|
|
case 'e': /* center with the best score > threshold is chosen, otherwise first center with a score > threshold */
|
|
|
|
/* ---------------------------------------------------------------------------------------------------------- */
|
|
|
|
fastOption=FALSE;
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* ------------------------------------------------------------------- */
|
|
|
|
case 't': /* Clusters only pairs with similarity higher than (threshold) */
|
|
|
|
/* ------------------------------------------------------------------- */
|
|
|
|
sscanf(optarg,"%lf",&threshold);
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
|
|
/* ------------------------------------------------------------------- */
|
|
|
|
case 'R': /* maximum ratio between counts of two sequences connected by an edge */
|
|
|
|
/* ------------------------------------------------------------------- */
|
|
|
|
sscanf(optarg,"%lf",&max_ratio);
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* -------------------------------------------------- */
|
|
|
|
case 'p': /* number of processors to use */
|
|
|
|
/* -------------------------------------------------- */
|
|
|
|
sscanf(optarg,"%d",&nproc);
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* -------------------------------------------------- */
|
|
|
|
case 's': /* Sorting option */
|
|
|
|
/* -------------------------------------------------- */
|
|
|
|
sscanf(optarg, "%s", sort);
|
|
|
|
sortingKey = sort;
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* -------------------------------------------------- */
|
|
|
|
case 'o': /* reverse sorting */
|
|
|
|
/* -------------------------------------------------- */
|
|
|
|
reverse=TRUE;
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* -------------------------------------------------- */
|
|
|
|
case 'g': /* replace n's with a's in sequences */
|
|
|
|
/* -------------------------------------------------- */
|
|
|
|
onlyATGC=FALSE;
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* -------------------------------------------------- */
|
|
|
|
case 'B': /* file name to print results in BIOM format */
|
|
|
|
/* -------------------------------------------------- */
|
|
|
|
sscanf(optarg, "%s", biomFile_name);
|
|
|
|
printBIOM=TRUE;
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* -------------------------------------------------- */
|
|
|
|
case 'O': /* file name to print results in OTU table format */
|
|
|
|
/* -------------------------------------------------- */
|
|
|
|
sscanf(optarg, "%s", OTUtableFile_name);
|
|
|
|
printOTUtable=TRUE;
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* -------------------------------------------------- */
|
|
|
|
case 'f': /* don't print results in FASTA format */
|
|
|
|
/* -------------------------------------------------- */
|
|
|
|
printFASTA=FALSE;
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* ---------------------------------------------- */
|
|
|
|
case 'F': /* file name to print results in FASTA format */
|
|
|
|
/* ---------------------------------------------- */
|
|
|
|
sscanf(optarg, "%s", FASTA_file_name);
|
|
|
|
printFASTAtofile=TRUE;
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
|
|
case '?': /* invalid option */
|
|
|
|
errflag++;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (errflag)
|
|
|
|
ExitUsage(errflag);
|
|
|
|
|
|
|
|
fprintf(stderr,"===========================================================\n");
|
|
|
|
fprintf(stderr," SUMACLUST version %s\n",VERSION);
|
|
|
|
#ifdef __SSE2__
|
|
|
|
fprintf(stderr," Alignment using SSE2 instructions.\n");
|
|
|
|
#else
|
|
|
|
fprintf(stderr," Alignment using standard code, SSE2 unavailable.\n");
|
|
|
|
#endif
|
|
|
|
fprintf(stderr,"===========================================================\n");
|
|
|
|
|
|
|
|
if ((threshold == 0.0) || (normalize && (threshold > 1.0)))
|
|
|
|
{
|
|
|
|
fprintf(stderr, "\nERROR: Please specify a threshold > 0, and < 1 when scores are normalized.\n\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
fprintf(stderr,"Reading dataset...");
|
|
|
|
db = seq_readAllSeq2(argv[optind], TRUE, onlyATGC);
|
2016-02-22 15:56:46 +01:00
|
|
|
|
2015-07-16 14:41:51 +02:00
|
|
|
fprintf(stderr,"\n%d sequences\n",db.count);
|
|
|
|
|
|
|
|
if (db.count == 0)
|
|
|
|
{
|
|
|
|
fprintf(stderr, "\nNo valid sequences. Exiting program.\n\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!onlyATGC)
|
|
|
|
(void)cleanDB(db);
|
|
|
|
|
|
|
|
if (!lcsmode && normalize)
|
|
|
|
threshold = 1.0 - threshold;
|
|
|
|
|
|
|
|
if (threshold > 0)
|
|
|
|
(void)hashDB(db);
|
|
|
|
|
|
|
|
addCounts(&db);
|
|
|
|
|
|
|
|
// first sorting of sequences to have good unique heads
|
|
|
|
|
|
|
|
if ((strcmp(sortingKey, "None") != 0) && (strcmp(sortingKey, "none") != 0))
|
|
|
|
{
|
|
|
|
if (reverse == FALSE)
|
|
|
|
qsort((void*) db.fastaSeqs, db.count, sizeof(fastaSeq), uniqSeqsDoubleSortFunction);
|
|
|
|
else
|
|
|
|
qsort((void*) db.fastaSeqs, db.count, sizeof(fastaSeq), uniqSeqsDoubleReverseSortFunction);
|
|
|
|
}
|
|
|
|
|
|
|
|
// getting the vector of unique seqs
|
|
|
|
uniqSeqs = (fastaSeqPtr*) malloc((db.count)*sizeof(fastaSeqPtr));
|
|
|
|
n = uniqSeqsVector(&db, &uniqSeqs);
|
|
|
|
uniqSeqs = realloc(uniqSeqs, n*sizeof(fastaSeqPtr));
|
|
|
|
|
|
|
|
// putting a flag on the last sequence
|
|
|
|
for (i=0; i<(db.count-1); i++)
|
|
|
|
((db.fastaSeqs)+i)->next = (db.fastaSeqs)+i-1;
|
|
|
|
((db.fastaSeqs)+(db.count)-1)->next = NULL;
|
|
|
|
|
|
|
|
// sorting unique sequences
|
|
|
|
if (strcmp(sortingKey, "count") == 0)
|
|
|
|
{
|
|
|
|
fprintf(stderr,"Sorting sequences by count...\n", n);
|
|
|
|
if (reverse == FALSE)
|
|
|
|
qsort((void*) uniqSeqs, n, sizeof(fastaSeqPtr), sortSeqsWithCounts);
|
|
|
|
else
|
|
|
|
qsort((void*) uniqSeqs, n, sizeof(fastaSeqPtr), reverseSortSeqsWithCounts);
|
|
|
|
}
|
|
|
|
else if ((strcmp(sortingKey, "None") != 0) && (strcmp(sortingKey, "none") != 0))
|
|
|
|
{
|
|
|
|
fprintf(stderr,"Sorting sequences by %s...\n", sortingKey);
|
|
|
|
if (reverse == FALSE)
|
|
|
|
qsort((void*) uniqSeqs, n, sizeof(fastaSeqPtr), sortSeqsP);
|
|
|
|
else
|
|
|
|
qsort((void*) uniqSeqs, n, sizeof(fastaSeqPtr), reverseSortSeqsP);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (max_ratio > 0)
|
|
|
|
fprintf(stderr,"Maximum ratio between the counts of two sequences to connect them: %lf\n", max_ratio);
|
|
|
|
|
|
|
|
// Computing
|
|
|
|
if (nproc==1)
|
|
|
|
numberOfCenters = compare(uniqSeqs, n, fastOption, threshold, normalize, reference, lcsmode, max_ratio);
|
|
|
|
|
|
|
|
else
|
|
|
|
numberOfCenters = mt_compare_sumaclust(uniqSeqs, n, fastOption, threshold, normalize, reference, lcsmode, nproc, max_ratio);
|
|
|
|
|
|
|
|
// Computing cluster weights
|
|
|
|
computeClusterWeights(uniqSeqs, n);
|
|
|
|
|
|
|
|
// Printing results
|
|
|
|
|
|
|
|
// FASTA file
|
|
|
|
if (printFASTA)
|
|
|
|
{
|
|
|
|
if (printFASTAtofile)
|
|
|
|
{
|
|
|
|
FASTA_output = fopen(FASTA_file_name, "w");
|
|
|
|
if (FASTA_output == NULL)
|
|
|
|
fprintf(stderr, "\nCan't open FASTA output file.\n"); //, %s outputFilename);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i=0; i<n; i++)
|
|
|
|
{
|
|
|
|
printSeq(uniqSeqs+i, (*(uniqSeqs+i))->center, (*(uniqSeqs+i))->score, FASTA_output);
|
|
|
|
}
|
|
|
|
fprintf(stderr,"Done.\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
// BIOM file
|
|
|
|
if (printBIOM)
|
|
|
|
{
|
|
|
|
fprintf(stderr,"Printing results in BIOM format...\n");
|
|
|
|
printInBIOMformat(uniqSeqs, n, numberOfCenters, biomFile_name);
|
|
|
|
fprintf(stderr,"Done.\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
// OTU table file
|
|
|
|
if (printOTUtable)
|
|
|
|
{
|
|
|
|
fprintf(stderr,"Printing results in OTU table format...\n");
|
|
|
|
printInOTUtableFormat(uniqSeqs, n, OTUtableFile_name);
|
|
|
|
fprintf(stderr,"Done.\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
// Freeing
|
|
|
|
for (i=0; i < db.count; i++)
|
|
|
|
{
|
|
|
|
free(((db.fastaSeqs)[i]).table);
|
|
|
|
free_header_table(((db.fastaSeqs)[i]).header);
|
|
|
|
}
|
|
|
|
free(db.fastaSeqs);
|
|
|
|
free(sort);
|
|
|
|
free(uniqSeqs);
|
|
|
|
|
|
|
|
return(0);
|
|
|
|
|
|
|
|
}
|