/** * FileName: sumaclust.c * Author: Celine Mercier * Description: star clustering of DNA sequences * **/ #include #include #include #include #include #include #include "sumalibs/libutils/utilities.h" #include "sumalibs/libfasta/sequence.h" #include "sumalibs/libfasta/fasta_header_parser.h" #include "sumalibs/libfasta/fasta_header_handler.h" #include "sumalibs/libfasta/fasta_seq_writer.h" #include "sumalibs/liblcs/upperband.h" #include "sumalibs/liblcs/sse_banded_LCS_alignment.h" #include "mtcompare_sumaclust.h" #include "sumaclust.h" #define VERSION "1.0.36" /* ----------------------------------------------- */ /* printout help */ /* ----------------------------------------------- */ #define PP fprintf(stdout, static void PrintHelp() { PP "------------------------------------------------------------\n"); PP " SUMACLUST Version %s\n", VERSION); PP "------------------------------------------------------------\n"); PP " Synopsis : star clustering of sequences.\n"); PP " Usage: sumaclust [options] \n"); PP "------------------------------------------------------------\n"); PP " Options:\n"); PP " -h : [H]elp - print help\n\n"); PP " -l : Reference sequence length is the shortest. \n\n"); PP " -L : Reference sequence length is the largest. \n\n"); PP " -a : Reference sequence length is the alignment length (default). \n\n"); PP " -n : Score is normalized by reference sequence length (default).\n\n"); PP " -r : Raw score, not normalized. \n\n"); PP " -d : Score is expressed in distance (default : score is expressed in similarity). \n\n"); PP " -t ##.## : Score threshold for clustering. If the score is normalized and expressed in similarity (default),\n"); PP " it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized\n"); PP " and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%.\n"); PP " If the score is not normalized and expressed in similarity, it is the length of the\n"); PP " Longest Common Subsequence. If the score is not normalized and expressed in distance,\n"); PP " it is (reference length - LCS length).\n"); PP " Only sequences with a similarity above ##.## with the center sequence of a cluster\n"); PP " are assigned to that cluster. Default: 0.97.\n\n"); PP " -e : Exact option : A sequence is assigned to the cluster with the center sequence presenting the\n"); PP " highest similarity score > threshold, as opposed to the default 'fast' option where a sequence is\n"); PP " assigned to the first cluster found with a center sequence presenting a score > threshold.\n\n"); PP " -R ## : Maximum ratio between the counts of two sequences so that the less abundant one can be considered\n"); PP " as a variant of the more abundant one. Default: 1.0.\n\n"); PP " -p ## : Multithreading with ## threads using openMP.\n\n"); PP " -s #### : Sorting by ####. Must be 'None' for no sorting, or a key in the fasta header of each sequence,\n"); PP " except for the count that can be computed (default : sorting by count).\n\n"); PP " -o : Sorting is in ascending order (default : descending).\n\n"); PP " -g : n's are replaced with a's (default: sequences with n's are discarded).\n\n"); PP " -B ### : Output of the OTU table in BIOM format is activated, and written to file ###.\n\n"); PP " -O ### : Output of the OTU map (observation map) is activated, and written to file ###.\n\n"); PP " -F ### : Output in FASTA format is written to file ### instead of standard output.\n\n"); PP " -f : Output in FASTA format is deactivated.\n"); PP "\n"); PP "------------------------------------------------------------\n"); PP " Argument : the nucleotide dataset to cluster (or nothing \n"); PP " if the standard input should be used). \n"); PP "------------------------------------------------------------\n"); PP " http://metabarcoding.org/sumaclust\n"); PP "------------------------------------------------------------\n\n"); } #undef PP /* ----------------------------------------------- */ /* printout usage and exit */ /* ----------------------------------------------- */ #define PP fprintf(stderr, static void ExitUsage(stat) int stat; { PP "usage: sumaclust [-l|L|a|n|r|d|e|o|g|f] [-t threshold_value] [-s sorting_key] [-R maximum_ratio] [-p number_of_threads]\n"); PP "[-B file_name_for_BIOM-formatted_output] [-O file_name_for_OTU_table-formatted_output] [-F file_name_for_FASTA-formatted_output] dataset\n"); PP "type \"sumaclust -h\" for help\n"); if (stat) exit(stat); } #undef PP static char* sortingKey="count"; static int sortSeqsP(const void **s1, const void **s2) { int res; double r1; double r2; r1 = atof(getItemFromHeader(sortingKey, ((fastaSeqPtr) *s2)->header)); r2 = atof(getItemFromHeader(sortingKey, ((fastaSeqPtr) *s2)->header)); if (r2 > r1) res = 1; else if (r2 < r1) res = -1; else res = 0; return(res); } static int reverseSortSeqsP(const void **s1, const void **s2) { int res; double r1; double r2; r1 = atof(getItemFromHeader(sortingKey, ((fastaSeqPtr) *s2)->header)); r2 = atof(getItemFromHeader(sortingKey, ((fastaSeqPtr) *s2)->header)); if (r1 > r2) res = 1; else if (r1 < r2) res = -1; else res = 0; return(res); } int uniqSeqsDoubleSortFunction(const void *s1, const void *s2) { int c; char* str_r1; double r1; double r2; c = strcmp(((fastaSeqPtr) s1)->sequence, ((fastaSeqPtr) s2)->sequence); if (c == 0) { str_r1 = getItemFromHeader(sortingKey, ((fastaSeqPtr) s1)->header); if (str_r1 == NULL) { fprintf(stderr, "\nERROR: '%s' not in sequence header(s).\n\n", sortingKey); exit(1); } r1 = atof(str_r1); r2 = atof(getItemFromHeader(sortingKey, ((fastaSeqPtr) s2)->header)); if (r2 > r1) c = 1; else if (r2 < r1) c = -1; else c = 0; } return(c); } int uniqSeqsDoubleReverseSortFunction(const void *s1, const void *s2) { int c; char* str_r1; double r1; double r2; c = strcmp(((fastaSeqPtr) s1)->sequence, ((fastaSeqPtr) s2)->sequence); if (c == 0) { str_r1 = getItemFromHeader(sortingKey, ((fastaSeqPtr) s1)->header); if (str_r1 == NULL) { fprintf(stderr, "\nERROR: '%s' not in sequence header(s).\n\n", sortingKey); exit(1); } r1 = atof(str_r1); r2 = atof(getItemFromHeader(sortingKey, ((fastaSeqPtr) s2)->header)); if (r1 > r2) c = 1; else if (r1 < r2) c = -1; else c = 0; } return(c); } void printInBIOMformat(fastaSeqPtr* uniqSeqs, int count, int numberOfCenters, char* biomFile_name) { int i, j, n; FILE* biomFile; struct tm* tm_info; time_t timer; char buffer_date[20]; fastaSeqPtr* c; fastaSeqPtr* seq; int id_len; int row_number; BOOL first_center = TRUE; int buffer_col_rows; int buffer_col_rows_1; int buffer_col_rows_2; buffer_col_rows = 29; buffer_col_rows_1 = 9; buffer_col_rows_2 = 20; n = 0; biomFile = fopen(biomFile_name, "w"); if (biomFile == NULL) fprintf(stderr, "\nCan't open BIOM output file.\n"); //, %s outputFilename); for (i=0; iaccession_id); j=0; if ((*seq)->cluster_center) // center sequence { n++; (*seq)->cluster_weight_unique_ids = 1; if (first_center) { (*seq)->columns_BIOM_size = id_len + buffer_col_rows; (*seq)->columns_BIOM = (char*) malloc(((*seq)->columns_BIOM_size)*sizeof(char)); strcpy((*seq)->columns_BIOM, "{\"id\": \""); first_center = FALSE; } else { (*seq)->columns_BIOM_size = id_len + buffer_col_rows + 1; (*seq)->columns_BIOM = (char*) malloc(((*seq)->columns_BIOM_size)*sizeof(char)); strcpy((*seq)->columns_BIOM, ",{\"id\": \""); } memcpy((*seq)->columns_BIOM + (*seq)->columns_BIOM_size - id_len - buffer_col_rows_2 - 1, (*seq)->accession_id, id_len); memcpy((*seq)->columns_BIOM + (*seq)->columns_BIOM_size - buffer_col_rows_2 - 1, "\", \"metadata\": null}", buffer_col_rows_2+1); if ((*seq)->next != NULL) // not last sequence { for (j=1; ((((*seq)+j)->next != NULL) && (((*seq)+j)->uniqHead == FALSE)); j++) // identical sequences { id_len = strlen((*(seq)+j)->accession_id); n++; (*seq)->cluster_weight_unique_ids++; (*seq)->columns_BIOM_size = (*seq)->columns_BIOM_size + id_len + buffer_col_rows; (*seq)->columns_BIOM = realloc((*seq)->columns_BIOM, ((*seq)->columns_BIOM_size) * sizeof(char)); memcpy((*seq)->columns_BIOM + (*seq)->columns_BIOM_size - buffer_col_rows - id_len - 1, ",{\"id\": \"", buffer_col_rows_1); memcpy((*seq)->columns_BIOM + (*seq)->columns_BIOM_size - id_len - buffer_col_rows_2 - 1, (*(seq)+j)->accession_id, id_len); memcpy((*seq)->columns_BIOM + (*seq)->columns_BIOM_size - buffer_col_rows_2 - 1, "\", \"metadata\": null}", buffer_col_rows_2+1); } if ((((*seq)+j)->next == NULL) && (((*seq)+j)->uniqHead == FALSE)) // last sequence { id_len = strlen((*(seq)+j)->accession_id); n++; (*seq)->cluster_weight_unique_ids++; (*seq)->columns_BIOM_size = (*seq)->columns_BIOM_size + id_len + buffer_col_rows; (*seq)->columns_BIOM = realloc((*seq)->columns_BIOM, ((*seq)->columns_BIOM_size) * sizeof(char)); memcpy((*seq)->columns_BIOM + (*seq)->columns_BIOM_size - buffer_col_rows - id_len - 1, ",{\"id\": \"", buffer_col_rows_1); memcpy((*seq)->columns_BIOM + (*seq)->columns_BIOM_size - id_len - buffer_col_rows_2 - 1, (*(seq)+j)->accession_id, id_len); memcpy((*seq)->columns_BIOM + (*seq)->columns_BIOM_size - buffer_col_rows_2 - 1, "\", \"metadata\": null}", buffer_col_rows_2+1); } } } else // not a center sequence { n++; c = (*seq)->center; id_len = strlen((*seq)->accession_id); n++; (*c)->cluster_weight_unique_ids++; (*c)->columns_BIOM_size = (*c)->columns_BIOM_size + id_len + buffer_col_rows; (*c)->columns_BIOM = realloc((*c)->columns_BIOM, ((*c)->columns_BIOM_size) * sizeof(char)); memcpy((*c)->columns_BIOM + (*c)->columns_BIOM_size - buffer_col_rows - id_len - 1, ",{\"id\": \"", buffer_col_rows_1); memcpy((*c)->columns_BIOM + (*c)->columns_BIOM_size - id_len - buffer_col_rows_2 - 1, (*seq)->accession_id, id_len); memcpy((*c)->columns_BIOM + (*c)->columns_BIOM_size - buffer_col_rows_2 - 1, "\", \"metadata\": null}", buffer_col_rows_2+1); if ((*seq)->next != NULL) // not last sequence { for (j=1; ((((*seq)+j)->next != NULL) && (((*seq)+j)->uniqHead == FALSE)); j++) // identical sequences { id_len = strlen((*(seq)+j)->accession_id); n++; (*c)->cluster_weight_unique_ids++; (*c)->columns_BIOM_size = (*c)->columns_BIOM_size + id_len + buffer_col_rows; (*c)->columns_BIOM = realloc((*c)->columns_BIOM, ((*c)->columns_BIOM_size) * sizeof(char)); memcpy((*c)->columns_BIOM + (*c)->columns_BIOM_size - buffer_col_rows - id_len - 1, ",{\"id\": \"", buffer_col_rows_1); memcpy((*c)->columns_BIOM + (*c)->columns_BIOM_size - id_len - buffer_col_rows_2 - 1, (*(seq)+j)->accession_id, id_len); memcpy((*c)->columns_BIOM + (*c)->columns_BIOM_size - buffer_col_rows_2 - 1, "\", \"metadata\": null}", buffer_col_rows_2+1); } if ((((*seq)+j)->next == NULL) && (((*seq)+j)->uniqHead == FALSE)) // last sequence { id_len = strlen((*(seq)+j)->accession_id); n++; (*c)->cluster_weight_unique_ids++; (*c)->columns_BIOM_size = (*c)->columns_BIOM_size + id_len + buffer_col_rows; (*c)->columns_BIOM = realloc((*c)->columns_BIOM, ((*c)->columns_BIOM_size) * sizeof(char)); memcpy((*c)->columns_BIOM + (*c)->columns_BIOM_size - buffer_col_rows - id_len - 1, ",{\"id\": \"", buffer_col_rows_1); memcpy((*c)->columns_BIOM + (*c)->columns_BIOM_size - id_len - buffer_col_rows_2 - 1, (*(seq)+j)->accession_id, id_len); memcpy((*c)->columns_BIOM + (*c)->columns_BIOM_size - buffer_col_rows_2 - 1, "\", \"metadata\": null}", buffer_col_rows_2+1); } } } } time(&timer); tm_info = localtime(&timer); strftime(buffer_date, 20, "%Y-%m-%dT%H:%M:%S", tm_info); fprintf(biomFile, "{\"id\": \"None\",\"format\": \"Biological Observation Matrix 1.0.0\"," "\"format_url\": \"http://biom-format.org\",\"type\": \"OTU table\"," "\"generated_by\": \"SUMACLUST %s\",\"date\": \"%s\",\"matrix_type\": \"sparse\"," "\"matrix_element_type\": \"int\",\"shape\": [%d, %d],", VERSION, buffer_date, numberOfCenters, n); // print data row_number = 0; n = 0; fprintf(biomFile, "\"data\": ["); for (i=0; icluster_center) // center sequence { for (j=0; j<(*seq)->cluster_weight_unique_ids; j++) { if ((row_number == (numberOfCenters - 1)) && (j == ((*seq)->cluster_weight_unique_ids - 1))) // last seq to print fprintf(biomFile, "[%d,%d,1]],", row_number, n); else fprintf(biomFile, "[%d,%d,1],", row_number, n); n++; } row_number++; } } // end data // Print rows first_center = TRUE; for (i=0; icluster_center) // center sequence { if (first_center) { fprintf(biomFile, "\"rows\": [{\"id\": \"%s\", \"metadata\": null}", (*seq)->accession_id); first_center = FALSE; } else fprintf(biomFile, ",{\"id\": \"%s\", \"metadata\": null}", (*seq)->accession_id); } } // Print columns fprintf(biomFile, "],\"columns\": ["); for (i=0; icluster_center) // center sequence fprintf(biomFile, (*seq)->columns_BIOM); } fprintf(biomFile, "]}"); fclose(biomFile); } void printInOTUtableFormat(fastaSeqPtr* uniqSeqs, int count, char* OTUtableFile_name) { int i, j; FILE* OTUtableFile; fastaSeqPtr* c; fastaSeqPtr* seq; int id_len; OTUtableFile = fopen(OTUtableFile_name, "w"); if (OTUtableFile == NULL) fprintf(stderr, "\nCan't open OTU table output file.\n"); //, %s outputFilename); for (i=0; iaccession_id); j=0; if ((*seq)->cluster_center) // center sequence { (*seq)->line_OTU_table_size = id_len*2 + 2; (*seq)->line_OTU_table = (char*) malloc(((*seq)->line_OTU_table_size)*sizeof(char)); strcpy((*seq)->line_OTU_table, (*seq)->accession_id); memcpy((*seq)->line_OTU_table + (*seq)->line_OTU_table_size - id_len - 2, "\t", 1); memcpy((*seq)->line_OTU_table + (*seq)->line_OTU_table_size - id_len - 1, (*seq)->accession_id, id_len); memcpy((*seq)->line_OTU_table + (*seq)->line_OTU_table_size - 1, "\0", 1); if ((*seq)->next != NULL) // not last sequence { for (j=1; ((((*seq)+j)->next != NULL) && (((*seq)+j)->uniqHead == FALSE)); j++) // identical sequences { id_len = strlen((*(seq)+j)->accession_id); (*seq)->line_OTU_table_size = (*seq)->line_OTU_table_size + id_len + 1; (*seq)->line_OTU_table = realloc((*seq)->line_OTU_table, ((*seq)->line_OTU_table_size) * sizeof(char)); memcpy((*seq)->line_OTU_table + (*seq)->line_OTU_table_size - id_len - 2, "\t", 1); memcpy((*seq)->line_OTU_table + (*seq)->line_OTU_table_size - id_len - 1, (*(seq)+j)->accession_id, id_len); memcpy((*seq)->line_OTU_table + (*seq)->line_OTU_table_size - 1, "\0", 1); } if ((((*seq)+j)->next == NULL) && (((*seq)+j)->uniqHead == FALSE)) // last sequence { id_len = strlen((*(seq)+j)->accession_id); (*seq)->line_OTU_table_size = (*seq)->line_OTU_table_size + id_len + 1; (*seq)->line_OTU_table = realloc((*seq)->line_OTU_table, ((*seq)->line_OTU_table_size) * sizeof(char)); memcpy((*seq)->line_OTU_table + (*seq)->line_OTU_table_size - id_len - 2, "\t", 1); memcpy((*seq)->line_OTU_table + (*seq)->line_OTU_table_size - id_len - 1, (*(seq)+j)->accession_id, id_len); memcpy((*seq)->line_OTU_table + (*seq)->line_OTU_table_size - 1, "\0", 1); } } } else // not a center sequence { c = (*seq)->center; (*c)->line_OTU_table_size = (*c)->line_OTU_table_size + id_len + 1; (*c)->line_OTU_table = realloc((*c)->line_OTU_table, ((*c)->line_OTU_table_size) * sizeof(char)); memcpy((*c)->line_OTU_table + (*c)->line_OTU_table_size - id_len - 2, "\t", 1); memcpy((*c)->line_OTU_table + (*c)->line_OTU_table_size - id_len - 1, (*seq)->accession_id, id_len); memcpy((*c)->line_OTU_table + (*c)->line_OTU_table_size - 1, "\0", 1); if ((*seq)->next != NULL) // not last sequence { for (j=1; ((((*seq)+j)->next != NULL) && (((*seq)+j)->uniqHead == FALSE)); j++) // identical sequences { id_len = strlen((*(seq)+j)->accession_id); (*c)->line_OTU_table_size = (*c)->line_OTU_table_size + id_len + 1; (*c)->line_OTU_table = realloc((*c)->line_OTU_table, ((*c)->line_OTU_table_size) * sizeof(char)); memcpy((*c)->line_OTU_table + (*c)->line_OTU_table_size - id_len - 2, "\t", 1); memcpy((*c)->line_OTU_table + (*c)->line_OTU_table_size - id_len - 1, (*(seq)+j)->accession_id, id_len); memcpy((*c)->line_OTU_table + (*c)->line_OTU_table_size - 1, "\0", 1); } if ((((*seq)+j)->next == NULL) && (((*seq)+j)->uniqHead == FALSE)) // last sequence { id_len = strlen((*(seq)+j)->accession_id); (*c)->line_OTU_table_size = (*c)->line_OTU_table_size + id_len + 1; (*c)->line_OTU_table = realloc((*c)->line_OTU_table, ((*c)->line_OTU_table_size) * sizeof(char)); memcpy((*c)->line_OTU_table + (*c)->line_OTU_table_size - id_len - 2, "\t", 1); memcpy((*c)->line_OTU_table + (*c)->line_OTU_table_size - id_len - 1, (*(seq)+j)->accession_id, id_len); memcpy((*c)->line_OTU_table + (*c)->line_OTU_table_size - 1, "\0", 1); } } } } // Print rows for (i=0; icluster_center) // center sequence { fprintf(OTUtableFile, (*seq)->line_OTU_table); fprintf(OTUtableFile, "\n"); } } fclose(OTUtableFile); } void printSeq(fastaSeqPtr* seq, fastaSeqPtr* center, double score, FILE* output) { int i; char* score_n; char* score_v; char* cluster_n; char* cluster_v; char* center_n; char* center_true; char* center_false; int id_size; score_n = (char*) malloc(14*sizeof(char)); score_v = (char*) malloc(20*sizeof(char)); strcpy(score_n, "cluster_score"); sprintf(score_v,"%f", score); id_size = strlen((*center)->accession_id); cluster_n = (char*) malloc(8*sizeof(char)); cluster_v = (char*) malloc((id_size+1)*sizeof(char)); strcpy(cluster_n, "cluster"); strcpy(cluster_v, (*center)->accession_id); center_n = (char*) malloc(15*sizeof(char)); strcpy(center_n, "cluster_center"); center_true = (char*) malloc(5*sizeof(char)); strcpy(center_true, "True"); center_false = (char*) malloc(6*sizeof(char)); strcpy(center_false, "False"); (*seq)->header = table_header_add_field((*seq)->header, cluster_n, cluster_v); (*seq)->header = table_header_add_field((*seq)->header, score_n, score_v); if ((*seq)->cluster_center) (*seq)->header = table_header_add_field((*seq)->header, center_n, center_true); else (*seq)->header = table_header_add_field((*seq)->header, center_n, center_false); printOnlyHeaderFromTable((*seq)->header, output); printOnlySeqFromFastaSeqPtr((*seq), output); if ((*seq)->next != NULL) { for (i=1; ((((*seq)+i)->next != NULL) && (((*seq)+i)->uniqHead == FALSE)); i++) { ((*seq)+i)->header = table_header_add_field(((*seq)+i)->header, cluster_n, cluster_v); ((*seq)+i)->header = table_header_add_field(((*seq)+i)->header, score_n, score_v); ((*seq)+i)->header = table_header_add_field(((*seq)+i)->header, center_n, center_false); printOnlyHeaderFromTable(((*seq)+i)->header, output); printOnlySeqFromFastaSeqPtr(((*seq)+i), output); } if ((((*seq)+i)->next == NULL) && (((*seq)+i)->uniqHead == FALSE)) // last sequence { ((*seq)+i)->header = table_header_add_field(((*seq)+i)->header, cluster_n, cluster_v); ((*seq)+i)->header = table_header_add_field(((*seq)+i)->header, score_n, score_v); ((*seq)+i)->header = table_header_add_field(((*seq)+i)->header, center_n, center_false); printOnlyHeaderFromTable(((*seq)+i)->header, output); printOnlySeqFromFastaSeqPtr(((*seq)+i), output); } } } void putSeqInCluster(fastaSeqPtr* seq, fastaSeqPtr* center, double score) { (*seq)->center = center; (*seq)->score = score; } int compare(fastaSeqPtr* db, int n, BOOL fastOption, double threshold, BOOL normalize, int reference, BOOL lcsmode, double max_ratio) { double score; double scoremax; double worstscore; BOOL toCluster; static BOOL first=TRUE; int32_t i,j,k; int center; float p; BOOL found; int lmax, lmin; int16_t* address; int16_t* iseq1; int16_t* iseq2; int l1; int l2; char* s1; char* s2; int sizeForSeqs; int LCSmin; if (lcsmode || normalize) fprintf(stderr,"Clustering sequences when similarity >= %lf\n", threshold); else fprintf(stderr,"Clustering sequences when distance <= %lf\n", threshold); fprintf(stderr,"Aligning and clustering... \n"); int* centers = (int*) malloc(n * sizeof(int)); for (i=0; i < n; i++) centers[i] = -1; k=0; found = FALSE; calculateMaxAndMinLen(db, n, &lmax, &lmin); sizeForSeqs = prepareTablesForSumathings(lmax, lmin, threshold, normalize, reference, lcsmode, &address, &iseq1, &iseq2); if (lcsmode || normalize) worstscore = 0.0; else worstscore = lmax; for (i=0; i < n; i++) { if (i%100 == 0) { p = (i/(float)n)*100; fprintf(stderr,"\rDone : %f %% %d clusters created",p,k); } if (first) { first = FALSE; if (normalize && lcsmode) score = 1.0; else if (!lcsmode) score = 0.0; else score = (*(db+i))->length; (*(db+i))->cluster_center = TRUE; putSeqInCluster(db+i, db+i, score); centers[k] = i; k++; } else { scoremax = worstscore; center = 0; found = FALSE; toCluster = FALSE; j=0; s1 = (*(db+i))->sequence; l1 = (*(db+i))->length; while (((found == FALSE) && (centers[j] != -1) && (fastOption == TRUE)) || ((fastOption == FALSE) && (centers[j] != -1))) { score = worstscore; if ((double) ((*(db+i))->count) / (double) ((*(db+centers[j]))->count) <= max_ratio) { filters((*(db+i)), (*(db+centers[j])), threshold, normalize, reference, lcsmode, &score, &LCSmin); } if (score == -1.0) { s2 = (*(db+centers[j]))->sequence; l2 = (*(db+centers[j]))->length; score = alignForSumathings(s1, iseq1, s2, iseq2, l1, l2, normalize, reference, lcsmode, address, sizeForSeqs, LCSmin); } if (((score >= threshold) && (lcsmode || normalize) && (score > scoremax)) || ((!lcsmode && !normalize) && (score <= threshold) && (score < scoremax))) { toCluster = TRUE; scoremax = score; center = centers[j]; if (fastOption == TRUE) found = TRUE; } j++; } if (toCluster) { if (!lcsmode && normalize) scoremax = 1.0 - scoremax; (*(db+i))->cluster_center = FALSE; putSeqInCluster(db+i, db+center, scoremax); } else { if (normalize && lcsmode) score = 1.0; else if (!lcsmode) score = 0.0; else score = (*(db+i))->length; (*(db+i))->cluster_center = TRUE; putSeqInCluster(db+i, db+i, score); centers[k] = i; k++; } } } fprintf(stderr,"\rDone : 100 %% %d clusters created. \n",k); free(centers); free(iseq1-sizeForSeqs+lmax); free(iseq2-sizeForSeqs+lmax); if (normalize && (reference == ALILEN)) free(address); return(k); } void computeClusterWeights(fastaSeqPtr* uniqSeqs, int n) { int i,j; fastaSeqPtr* seq; fastaSeqPtr* center; char* cluster_weight_n; char* cluster_weight_v; int cluster_weight; for (i=0; icluster_center) (*seq)->cluster_weight = (*seq)->count; else { center = (*seq)->center; ((*center)->cluster_weight)+=(*seq)->count; } } for (i=0; icluster_center) cluster_weight = (*seq)->cluster_weight; else { center = (*seq)->center; cluster_weight = (*center)->cluster_weight; } cluster_weight_n = (char*) malloc(15*sizeof(char)); cluster_weight_v = (char*) malloc(20*sizeof(char)); strcpy(cluster_weight_n, "cluster_weight"); sprintf(cluster_weight_v,"%d", cluster_weight); (*seq)->header = table_header_add_field((*seq)->header, cluster_weight_n, cluster_weight_v); if ((*seq)->next != NULL) // not the last sequence { for (j=1; ((((*seq)+j)->next != NULL) && (((*seq)+j)->uniqHead == FALSE)); j++) (*(seq)+j)->header = table_header_add_field((*(seq)+j)->header, cluster_weight_n, cluster_weight_v); if ((((*seq)+j)->next == NULL) && (((*seq)+j)->uniqHead == FALSE)) // last sequence (*(seq)+j)->header = table_header_add_field((*(seq)+j)->header, cluster_weight_n, cluster_weight_v); } } } int main(int argc, char** argv) { int32_t carg = 0; int32_t errflag = 0; char* sort; double threshold = 0.97; double max_ratio = 1.0; BOOL lcsmode = TRUE; BOOL fastOption = TRUE; BOOL normalize = TRUE; BOOL reverse = FALSE; BOOL onlyATGC = TRUE; int reference = ALILEN; int nproc = 1; BOOL printBIOM = FALSE; BOOL printOTUtable = FALSE; BOOL printFASTA = TRUE; BOOL printFASTAtofile = FALSE; FILE* FASTA_output = stdout; fastaSeqCount db; int i,n; fastaSeqPtr* uniqSeqs; char* biomFile_name; char* OTUtableFile_name; char* FASTA_file_name; int numberOfCenters; sort = malloc(1024*sizeof(char)); strcpy(sort, "count"); biomFile_name = malloc(1024*sizeof(char)); OTUtableFile_name = malloc(1024*sizeof(char)); FASTA_file_name = malloc(1024*sizeof(char)); while ((carg = getopt(argc, argv, "hlLanrdet:p:s:ogB:O:R:fF:")) != -1) { switch (carg) { /* -------------------- */ case 'h': /* help */ /* -------------------- */ PrintHelp(); exit(0); break; /* -------------------------------------------------- */ case 'l': /* Normalize LCS/Error by the shortest sequence length*/ /* -------------------------------------------------- */ reference=MINLEN; break; /* -------------------------------------------------- */ case 'L': /* Normalize LCS/Error by the largest sequence length */ /* -------------------------------------------------- */ reference=MAXLEN; break; /* -------------------------------------------------- */ case 'a': /* Normalize LCS/Error by the alignment length */ /* -------------------------------------------------- */ reference=ALILEN; break; /* -------------------------------------------------- */ case 'n': /* Normalize LCS by the reference length */ /* -------------------------------------------------- */ normalize=TRUE; break; /* -------------------------------------------------- */ case 'r': /* No normalization */ /* -------------------------------------------------- */ normalize=FALSE; break; /* -------------------------------------------------- */ case 'd': /* Score is expressed in distance */ /* -------------------------------------------------- */ lcsmode=FALSE; break; /* ---------------------------------------------------------------------------------------------------------- */ case 'e': /* center with the best score > threshold is chosen, otherwise first center with a score > threshold */ /* ---------------------------------------------------------------------------------------------------------- */ fastOption=FALSE; break; /* ------------------------------------------------------------------- */ case 't': /* Clusters only pairs with similarity higher than (threshold) */ /* ------------------------------------------------------------------- */ sscanf(optarg,"%lf",&threshold); break; /* ------------------------------------------------------------------- */ case 'R': /* maximum ratio between counts of two sequences connected by an edge */ /* ------------------------------------------------------------------- */ sscanf(optarg,"%lf",&max_ratio); break; /* -------------------------------------------------- */ case 'p': /* number of processors to use */ /* -------------------------------------------------- */ sscanf(optarg,"%d",&nproc); break; /* -------------------------------------------------- */ case 's': /* Sorting option */ /* -------------------------------------------------- */ sscanf(optarg, "%s", sort); sortingKey = sort; break; /* -------------------------------------------------- */ case 'o': /* reverse sorting */ /* -------------------------------------------------- */ reverse=TRUE; break; /* -------------------------------------------------- */ case 'g': /* replace n's with a's in sequences */ /* -------------------------------------------------- */ onlyATGC=FALSE; break; /* -------------------------------------------------- */ case 'B': /* file name to print results in BIOM format */ /* -------------------------------------------------- */ sscanf(optarg, "%s", biomFile_name); printBIOM=TRUE; break; /* -------------------------------------------------- */ case 'O': /* file name to print results in OTU table format */ /* -------------------------------------------------- */ sscanf(optarg, "%s", OTUtableFile_name); printOTUtable=TRUE; break; /* -------------------------------------------------- */ case 'f': /* don't print results in FASTA format */ /* -------------------------------------------------- */ printFASTA=FALSE; break; /* ---------------------------------------------- */ case 'F': /* file name to print results in FASTA format */ /* ---------------------------------------------- */ sscanf(optarg, "%s", FASTA_file_name); printFASTAtofile=TRUE; break; case '?': /* invalid option */ errflag++; break; } } if (errflag) ExitUsage(errflag); fprintf(stderr,"===========================================================\n"); fprintf(stderr," SUMACLUST version %s\n",VERSION); #ifdef __SSE2__ fprintf(stderr," Alignment using SSE2 instructions.\n"); #else fprintf(stderr," Alignment using standard code, SSE2 unavailable.\n"); #endif fprintf(stderr,"===========================================================\n"); if ((threshold == 0.0) || (normalize && (threshold > 1.0))) { fprintf(stderr, "\nERROR: Please specify a threshold > 0, and < 1 when scores are normalized.\n\n"); exit(1); } fprintf(stderr,"Reading dataset..."); db = seq_readAllSeq2(argv[optind], TRUE, onlyATGC); fprintf(stderr,"\n%d sequences\n",db.count); if (db.count == 0) { fprintf(stderr, "\nNo valid sequences. Exiting program.\n\n"); exit(1); } if (!onlyATGC) (void)cleanDB(db); if (!lcsmode && normalize) threshold = 1.0 - threshold; if (threshold > 0) (void)hashDB(db); addCounts(&db); // first sorting of sequences to have good unique heads if ((strcmp(sortingKey, "None") != 0) && (strcmp(sortingKey, "none") != 0)) { if (reverse == FALSE) qsort((void*) db.fastaSeqs, db.count, sizeof(fastaSeq), uniqSeqsDoubleSortFunction); else qsort((void*) db.fastaSeqs, db.count, sizeof(fastaSeq), uniqSeqsDoubleReverseSortFunction); } // getting the vector of unique seqs uniqSeqs = (fastaSeqPtr*) malloc((db.count)*sizeof(fastaSeqPtr)); n = uniqSeqsVector(&db, &uniqSeqs); uniqSeqs = realloc(uniqSeqs, n*sizeof(fastaSeqPtr)); // putting a flag on the last sequence for (i=0; i<(db.count-1); i++) ((db.fastaSeqs)+i)->next = (db.fastaSeqs)+i-1; ((db.fastaSeqs)+(db.count)-1)->next = NULL; // sorting unique sequences if (strcmp(sortingKey, "count") == 0) { fprintf(stderr,"Sorting sequences by count...\n", n); if (reverse == FALSE) qsort((void*) uniqSeqs, n, sizeof(fastaSeqPtr), sortSeqsWithCounts); else qsort((void*) uniqSeqs, n, sizeof(fastaSeqPtr), reverseSortSeqsWithCounts); } else if ((strcmp(sortingKey, "None") != 0) && (strcmp(sortingKey, "none") != 0)) { fprintf(stderr,"Sorting sequences by %s...\n", sortingKey); if (reverse == FALSE) qsort((void*) uniqSeqs, n, sizeof(fastaSeqPtr), sortSeqsP); else qsort((void*) uniqSeqs, n, sizeof(fastaSeqPtr), reverseSortSeqsP); } if (max_ratio > 0) fprintf(stderr,"Maximum ratio between the counts of two sequences to connect them: %lf\n", max_ratio); // Computing if (nproc==1) numberOfCenters = compare(uniqSeqs, n, fastOption, threshold, normalize, reference, lcsmode, max_ratio); else numberOfCenters = mt_compare_sumaclust(uniqSeqs, n, fastOption, threshold, normalize, reference, lcsmode, nproc, max_ratio); // Computing cluster weights computeClusterWeights(uniqSeqs, n); // Printing results // FASTA file if (printFASTA) { if (printFASTAtofile) { FASTA_output = fopen(FASTA_file_name, "w"); if (FASTA_output == NULL) fprintf(stderr, "\nCan't open FASTA output file.\n"); //, %s outputFilename); } for (i=0; icenter, (*(uniqSeqs+i))->score, FASTA_output); } fprintf(stderr,"Done.\n"); } // BIOM file if (printBIOM) { fprintf(stderr,"Printing results in BIOM format...\n"); printInBIOMformat(uniqSeqs, n, numberOfCenters, biomFile_name); fprintf(stderr,"Done.\n"); } // OTU table file if (printOTUtable) { fprintf(stderr,"Printing results in OTU table format...\n"); printInOTUtableFormat(uniqSeqs, n, OTUtableFile_name); fprintf(stderr,"Done.\n"); } // Freeing for (i=0; i < db.count; i++) { free(((db.fastaSeqs)[i]).table); free_header_table(((db.fastaSeqs)[i]).header); } free(db.fastaSeqs); free(sort); free(uniqSeqs); return(0); }