/**************************************************************************** * Tags a set of sequences for PCR/sequencing errors identification * ****************************************************************************/ /** * @file obi_clean.c * @author Celine Mercier (celine.mercier@metabarcoding.org) * @date April 9th 2018 * @brief Functions tagging a set of sequences for PCR/sequencing errors identification. */ //#define OMP_SUPPORT // TODO #ifdef OMP_SUPPORT #include #endif #include #include #include #include #include #include "obi_clean.h" #include "obidms.h" #include "obidebug.h" #include "obierrno.h" #include "obitypes.h" #include "obiview.h" #include "sse_banded_LCS_alignment.h" #include "upperband.h" #include "obiblob.h" #define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) /************************************************************************** * * D E C L A R A T I O N O F T H E P R I V A T E F U N C T I O N S * **************************************************************************/ // TODO static int create_output_columns(Obiview_p o_view, Obiview_p i_view, OBIDMS_column_p sample_column, int sample_count); static inline int idxcmp(const void* idx1, const void* idx2); /************************************************************************ * * D E F I N I T I O N O F T H E P R I V A T E F U N C T I O N S * ************************************************************************/ static int create_output_columns(Obiview_p o_view, Obiview_p i_view, OBIDMS_column_p sample_column, int sample_count) { // Status column if (obi_view_add_column(o_view, CLEAN_STATUS_COLUMN_NAME, -1, NULL, OBI_CHAR, 0, sample_count, (sample_column->header)->elements_names, true, false, false, NULL, NULL, -1, CLEAN_STATUS_COLUMN_COMMENTS, true) < 0) { obidebug(1, "\nError creating the %s column", CLEAN_STATUS_COLUMN_NAME); return -1; } // Head column if (obi_view_add_column(o_view, CLEAN_HEAD_COLUMN_NAME, -1, NULL, OBI_BOOL, 0, 1, NULL, false, false, false, NULL, NULL, -1, CLEAN_HEAD_COLUMN_COMMENTS, true) < 0) { obidebug(1, "\nError creating the %s column", CLEAN_HEAD_COLUMN_NAME); return -1; } // Sample count column if (obi_view_add_column(o_view, CLEAN_SAMPLECOUNT_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, CLEAN_SAMPLECOUNT_COLUMN_COMMENTS, true) < 0) { obidebug(1, "\nError creating the %s column", CLEAN_SAMPLECOUNT_COLUMN_NAME); return -1; } // Head count column if (obi_view_add_column(o_view, CLEAN_HEADCOUNT_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, CLEAN_HEADCOUNT_COLUMN_COMMENTS, true) < 0) { obidebug(1, "\nError creating the %s column", CLEAN_HEADCOUNT_COLUMN_NAME); return -1; } // Internal count column if (obi_view_add_column(o_view, CLEAN_INTERNALCOUNT_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, CLEAN_INTERNALCOUNT_COLUMN_COMMENTS, true) < 0) { obidebug(1, "\nError creating the %s column", CLEAN_INTERNALCOUNT_COLUMN_NAME); return -1; } // Singleton count column if (obi_view_add_column(o_view, CLEAN_SINGLETONCOUNT_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, CLEAN_SINGLETONCOUNT_COLUMN_COMMENTS, true) < 0) { obidebug(1, "\nError creating the %s column", CLEAN_SINGLETONCOUNT_COLUMN_NAME); return -1; } return 0; } static inline int idxcmp(const void* idx1, const void* idx2) { if (*(index_t*) idx1 < *(index_t*) idx2) return -1; if (*(index_t*) idx1 > *(index_t*) idx2) return 1; return 0; } /********************************************************************** * * D E F I N I T I O N O F T H E P U B L I C F U N C T I O N S * **********************************************************************/ // TODO comment int obi_clean(const char* dms_name, const char* i_view_name, const char* sample_column_name, const char* o_view_name, const char* o_view_comments, double threshold, double max_ratio, bool heads_only, int thread_count) { float p; index_t i, j; index_t seq_count; index_t* index_array; double score; bool above_threshold; int lcs_length; int ali_length; Kmer_table_p ktable; Obi_blob_p blob1; Obi_blob_p blob2; int lcs_min; int sample_count; int sample; int s1_count; int s2_count; bool head; int head_count; int internal_count; int singleton_count; int ind_sample_count; char status; void** yes_trees; void** no_trees; OBIDMS_p dms = NULL; Obiview_p i_view = NULL; Obiview_p o_view = NULL; OBIDMS_column_p iseq_column = NULL; OBIDMS_column_p sample_column = NULL; OBIDMS_column_p status_column = NULL; OBIDMS_column_p head_column = NULL; OBIDMS_column_p internalcount_column = NULL; OBIDMS_column_p headcount_column = NULL; OBIDMS_column_p singletoncount_column = NULL; OBIDMS_column_p samplecount_column = NULL; // TODO other columns void* no; void* yes; void* key_p; bool normalize = false; int reference = 0; bool similarity_mode = false; char* seq1; char* seq2; // Open DMS dms = obi_dms(dms_name); // Open input view i_view = obi_open_view(dms, i_view_name); if (i_view == NULL) { obidebug(1, "\nError opening the input view to clean"); return -1; } // Open the sequence column if (strcmp((i_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0) iseq_column = obi_view_get_column(i_view, NUC_SEQUENCE_COLUMN); else { obi_set_errno(OBI_CLEAN_ERROR); obidebug(1, "\nError: no sequence column"); return -1; } if (iseq_column == NULL) { obidebug(1, "\nError getting the sequence column"); return -1; } // Open the sample column if there is one if ((strcmp(sample_column_name, "") == 0) || (sample_column_name == NULL)) sample_column = NULL; else { sample_column = obi_view_get_column(i_view, sample_column_name); if (sample_column == NULL) { obidebug(1, "\nError getting the sample column"); return -1; } } // Create the output view o_view = obi_clone_view(dms, i_view, o_view_name, NULL, o_view_comments); if (o_view == NULL) { obidebug(1, "\nError creating the output view"); return -1; } sample_count = (sample_column->header)->nb_elements_per_line; // Create the output columns if (create_output_columns(o_view, i_view, sample_column, sample_count) < 0) // TODO return -1; // Get the created output columns status_column = obi_view_get_column(o_view, CLEAN_STATUS_COLUMN_NAME); head_column = obi_view_get_column(o_view, CLEAN_HEAD_COLUMN_NAME); internalcount_column = obi_view_get_column(o_view, CLEAN_INTERNALCOUNT_COLUMN_NAME); headcount_column = obi_view_get_column(o_view, CLEAN_HEADCOUNT_COLUMN_NAME); singletoncount_column = obi_view_get_column(o_view, CLEAN_SINGLETONCOUNT_COLUMN_NAME); samplecount_column = obi_view_get_column(o_view, CLEAN_SAMPLECOUNT_COLUMN_NAME); // Build kmer tables ktable = hash_seq_column(i_view, iseq_column, 0); if (ktable == NULL) { obi_set_errno(OBI_CLEAN_ERROR); obidebug(1, "\nError building kmer tables before aligning"); return -1; } seq_count = (i_view->infos)->line_count; // Allocate arrays of pointers to binary trees yes_trees = (void**) calloc(seq_count, sizeof(void*)); no_trees = (void**) calloc(seq_count, sizeof(void*)); // Allocate and fill index array for the binary trees to reference (they don't copy the data) index_array = (index_t*) malloc(seq_count * sizeof(index_t)); for (i=0; i < seq_count; i++) index_array[i]=i; // Initialize all sequences to singletons or NA if no sequences in that sample for (i=0; i < (seq_count - 1); i++) { for (sample=0; sample < sample_count; sample++) { if (obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, i, sample) != OBIInt_NA) { if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample, 's') < 0) // TODO return -1; } } } for (sample=0; sample < sample_count; sample++) { for (i=0; i < (seq_count - 1); i++) { if (i%1000 == 0) { p = ((i+(sample*(seq_count)))/(float)(seq_count*sample_count))*100; fprintf(stderr,"\rDone : %f %% ",p); } // Get first sequence blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0); if (blob1 == NULL) { obidebug(1, "\nError retrieving sequences to align"); return -1; } seq1 = obi_get_seq_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0); // Get count for this sample s1_count = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, i, sample); for (j=i+1; j < seq_count; j++) // TODO parallelize this loop? { // Get second sequence blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, j, 0); if (blob2 == NULL) { obidebug(1, "\nError retrieving sequences to align"); return -1; } seq2 = obi_get_seq_with_elt_idx_and_col_p_in_view(i_view, iseq_column, j, 0); // Get count for this sample s2_count = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, j, sample); // Checking ratio if (((s1_count!=OBIInt_NA && s2_count!=OBIInt_NA) && (s1_count>0 && s2_count>0)) && ((((s1_count >= s2_count) && (((double) s2_count / (double) s1_count) <= max_ratio))) || (((s2_count >= s1_count) && (((double) s1_count / (double) s2_count) <= max_ratio))))) { yes=NULL; no=NULL; no = tfind(index_array+j, &(no_trees[i]), idxcmp); if (no == NULL) yes = tfind(index_array+j, &(yes_trees[i]), idxcmp); above_threshold = false; if ((no == NULL) && (yes == NULL)) //never compared { // Check if the sequences are identical in a quick way (same index in the same indexer) if (obi_get_index_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0) == obi_get_index_with_elt_idx_and_col_p_in_view(i_view, iseq_column, j, 0)) above_threshold = true; else // the sequences aren't identical { // kmer filter align_filters(ktable, blob1, blob2, i, j, threshold, normalize, reference, similarity_mode, &score, &lcs_min, false); // Compute alignment score if filter passed if (score == -1.0) score = obiblob_sse_banded_lcs_align(blob1, blob2, threshold, normalize, reference, similarity_mode, &lcs_length, &ali_length); above_threshold = ((score >= 0) && (score <= threshold)); } } if (yes || above_threshold) { if (yes == NULL) // put seqs in their 'yes' trees key_p = tsearch(index_array+j, &(yes_trees[i]), idxcmp); // TODO error checking // label as head or internal if (s1_count >= s2_count) { if (obi_get_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample) == 's') // seq can become head ONLY if it's a singleton obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample, 'h'); // Otherwise it's an internal obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample, 'i'); } else { if (obi_get_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample) == 's') // seq can become head ONLY if it's a singleton obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample, 'h'); obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample, 'i'); } } else if (no == NULL) // put in 'no' tree of both sequences // TODO why just one then??? key_p = tsearch(index_array+j, &(no_trees[i]), idxcmp); // TODO error checking } free(seq2); } free(seq1); } } free_kmer_tables(ktable, seq_count); fprintf(stderr, "\n"); for (i=0; i < (seq_count - 1); i++) { if (i%1000 == 0) { p = (i/(float)(seq_count))*100; fprintf(stderr, "\rAnnotating : %f %% ",p); } head = false; head_count = 0; internal_count = 0; singleton_count = 0; ind_sample_count = 0; for (sample=0; sample < sample_count; sample++) { // Check if head or singleton in at least one sample status = obi_get_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample); if ((!head) && ((status == 'h') || (status == 's'))) head = true; if (status == 's') { singleton_count++; ind_sample_count++; } else if (status == 'i') { internal_count++; ind_sample_count++; } else if (status == 'h') { head_count++; ind_sample_count++; } } if (obi_set_bool_with_elt_idx_and_col_p_in_view(o_view, head_column, i, 0, head) < 0) // TODO return -1; if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, singletoncount_column, i, 0, singleton_count) < 0) // TODO return -1; if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, internalcount_column, i, 0, internal_count) < 0) // TODO return -1; if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, headcount_column, i, 0, head_count) < 0) // TODO return -1; if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, samplecount_column, i, 0, ind_sample_count) < 0) // TODO return -1; } // Close views if (obi_save_and_close_view(i_view) < 0) { obidebug(1, "\nError closing the input view after aligning"); return -1; } if (obi_save_and_close_view(o_view) < 0) { obidebug(1, "\nError closing the output view after aligning"); return -1; } fprintf(stderr,"\rDone : 100 %% "); return 0; }