diff --git a/python/obitools3/commands/clean.pyx b/python/obitools3/commands/clean.pyx new file mode 100644 index 0000000..1cd6195 --- /dev/null +++ b/python/obitools3/commands/clean.pyx @@ -0,0 +1,94 @@ +#cython: language_level=3 + +from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport +from obitools3.dms.dms cimport DMS +from obitools3.dms.capi.obidms cimport OBIDMS_p +from obitools3.dms.view import RollbackException +from obitools3.dms.capi.obiclean cimport obi_clean +from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption +from obitools3.uri.decode import open_uri +from obitools3.apps.config import logger +from obitools3.utils cimport tobytes + +from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS + + +__title__="Tag a set of sequences for PCR and sequencing errors identification" + + +def addOptions(parser): + + addSequenceInputOption(parser) + addMinimalOutputOption(parser) + + group = parser.add_argument_group('obi clean specific options') + + group.add_argument('--distance', '-d', + action="store", dest="clean:distance", + metavar='', + default=1.0, + type=float, + help="Maximum numbers of errors between two variant sequences. Default: 1.") + + group.add_argument('--sample-tag', '-s', + action="store", + dest="clean:sample-tag-name", + metavar="", + type=str, + default="merged_sample", + help="Name of the tag where sample counts are kept.") + + group.add_argument('--ratio', '-r', + action="store", dest="clean:ratio", + metavar='', + default=0.5, + type=float, + help="Maximum ratio between the counts of two sequences so that the less abundant one can be considered" + " a variant of the more abundant one. Default: 0.5.") + + group.add_argument('--heads-only', '-H', + action="store_true", + dest="clean:heads-only", + default=False, + help="Only sequences labeled as heads are kept in the output. Default: False") + + group.add_argument('--cluster-tags', '-C', + action="store_true", + dest="clean:cluster-tags", + default=False, + help="Adds tags for each sequence giving its cluster's head and weight for each sample.") + + +def run(config): + + DMS.obi_atexit() + + logger("info", "obi clean") + + # Open DMS + dms_name = config['obi']['inputURI'].split('/')[0] + dms = open_uri(dms_name)[0] + + # Read the name of the input view + uri_i = config['obi']['inputURI'].split('/') + i_view_name = uri_i[1] + + # Read the name of the output view + uri_o = config['obi']['outputURI'].split('/') + if len(uri_o)==2: + # Check that input and output DMS are the same (predicate, to discuss) + if dms_name != uri_o[0]: + raise Exception("Input and output DMS must be the same") + o_view_name = uri_o[1] + else: + o_view_name = uri_o[0] + + if obi_clean(tobytes(dms_name), tobytes(i_view_name), tobytes(config['clean']['sample-tag-name']), tobytes(o_view_name), b"obiclean", \ + config['clean']['distance'], config['clean']['ratio'], config['clean']['heads-only'], 1) < 0: + raise Exception("Error running obiclean") + + print("\n") + print(repr(dms[o_view_name])) + + dms.close() + diff --git a/python/obitools3/dms/capi/obiclean.pxd b/python/obitools3/dms/capi/obiclean.pxd new file mode 100644 index 0000000..5f80861 --- /dev/null +++ b/python/obitools3/dms/capi/obiclean.pxd @@ -0,0 +1,17 @@ +#cython: language_level=3 + +from obitools3.dms.capi.obidms cimport OBIDMS_p + + +cdef extern from "obi_clean.h" nogil: + + int obi_clean(const char* dms_name, + const char* i_view_name, + const char* sample_column_name, + const char* o_view_name, + const char* o_view_comments, + double threshold, + double max_ratio, + bint heads_only, + int thread_count) + diff --git a/python/obitools3/dms/dms.cfiles b/python/obitools3/dms/dms.cfiles index 43de89d..2d0b4d8 100644 --- a/python/obitools3/dms/dms.cfiles +++ b/python/obitools3/dms/dms.cfiles @@ -8,6 +8,7 @@ ../../../src/linked_list.c ../../../src/murmurhash2.c ../../../src/obi_align.c +../../../src/obi_clean.c ../../../src/obiavl.c ../../../src/obiblob_indexer.c ../../../src/obiblob.c diff --git a/python/obitools3/utils.cfiles b/python/obitools3/utils.cfiles index a775d3c..2a5e3b9 100644 --- a/python/obitools3/utils.cfiles +++ b/python/obitools3/utils.cfiles @@ -8,6 +8,7 @@ ../../src/linked_list.c ../../src/murmurhash2.c ../../src/obi_align.c +../../src/obi_clean.c ../../src/obiavl.c ../../src/obiblob_indexer.c ../../src/obiblob.c diff --git a/src/obi_clean.c b/src/obi_clean.c new file mode 100644 index 0000000..c9ac057 --- /dev/null +++ b/src/obi_clean.c @@ -0,0 +1,463 @@ +/**************************************************************************** + * Tags a set of sequences for PCR/sequencing errors identification * + ****************************************************************************/ + +/** + * @file obi_clean.c + * @author Celine Mercier (celine.mercier@metabarcoding.org) + * @date April 9th 2018 + * @brief Functions tagging a set of sequences for PCR/sequencing errors identification. + */ + +//#define OMP_SUPPORT // TODO +#ifdef OMP_SUPPORT +#include +#endif + +#include +#include +#include +#include +#include + +#include "obi_clean.h" +#include "obidms.h" +#include "obidebug.h" +#include "obierrno.h" +#include "obitypes.h" +#include "obiview.h" +#include "sse_banded_LCS_alignment.h" +#include "upperband.h" +#include "obiblob.h" + + + +#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) + + +/************************************************************************** + * + * D E C L A R A T I O N O F T H E P R I V A T E F U N C T I O N S + * + **************************************************************************/ + +// TODO +static int create_output_columns(Obiview_p o_view, + Obiview_p i_view, + OBIDMS_column_p sample_column, + int sample_count); + + +static inline int idxcmp(const void* idx1, const void* idx2); + + +/************************************************************************ + * + * D E F I N I T I O N O F T H E P R I V A T E F U N C T I O N S + * + ************************************************************************/ + +static int create_output_columns(Obiview_p o_view, + Obiview_p i_view, + OBIDMS_column_p sample_column, + int sample_count) +{ + // Status column + if (obi_view_add_column(o_view, CLEAN_STATUS_COLUMN_NAME, -1, NULL, OBI_CHAR, 0, sample_count, (sample_column->header)->elements_names, true, false, false, NULL, NULL, -1, CLEAN_STATUS_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", CLEAN_STATUS_COLUMN_NAME); + return -1; + } + + // Head column + if (obi_view_add_column(o_view, CLEAN_HEAD_COLUMN_NAME, -1, NULL, OBI_BOOL, 0, 1, NULL, false, false, false, NULL, NULL, -1, CLEAN_HEAD_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", CLEAN_HEAD_COLUMN_NAME); + return -1; + } + + // Sample count column + if (obi_view_add_column(o_view, CLEAN_SAMPLECOUNT_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, CLEAN_SAMPLECOUNT_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", CLEAN_SAMPLECOUNT_COLUMN_NAME); + return -1; + } + + // Head count column + if (obi_view_add_column(o_view, CLEAN_HEADCOUNT_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, CLEAN_HEADCOUNT_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", CLEAN_HEADCOUNT_COLUMN_NAME); + return -1; + } + + // Internal count column + if (obi_view_add_column(o_view, CLEAN_INTERNALCOUNT_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, CLEAN_INTERNALCOUNT_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", CLEAN_INTERNALCOUNT_COLUMN_NAME); + return -1; + } + + // Singleton count column + if (obi_view_add_column(o_view, CLEAN_SINGLETONCOUNT_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, CLEAN_SINGLETONCOUNT_COLUMN_COMMENTS, true) < 0) + { + obidebug(1, "\nError creating the %s column", CLEAN_SINGLETONCOUNT_COLUMN_NAME); + return -1; + } + + return 0; +} + + +static inline int idxcmp(const void* idx1, const void* idx2) +{ + if (*(index_t*) idx1 < *(index_t*) idx2) + return -1; + if (*(index_t*) idx1 > *(index_t*) idx2) + return 1; + return 0; +} + + +/********************************************************************** + * + * D E F I N I T I O N O F T H E P U B L I C F U N C T I O N S + * + **********************************************************************/ + + +// TODO comment +int obi_clean(const char* dms_name, + const char* i_view_name, + const char* sample_column_name, + const char* o_view_name, + const char* o_view_comments, + double threshold, + double max_ratio, + bool heads_only, + int thread_count) +{ + float p; + index_t i, j; + index_t seq_count; + index_t* index_array; + double score; + bool above_threshold; + int lcs_length; + int ali_length; + Kmer_table_p ktable; + Obi_blob_p blob1; + Obi_blob_p blob2; + int lcs_min; + int sample_count; + int sample; + int s1_count; + int s2_count; + bool head; + int head_count; + int internal_count; + int singleton_count; + int ind_sample_count; + char status; + + void** yes_trees; + void** no_trees; + + OBIDMS_p dms = NULL; + Obiview_p i_view = NULL; + Obiview_p o_view = NULL; + OBIDMS_column_p iseq_column = NULL; + OBIDMS_column_p sample_column = NULL; + OBIDMS_column_p status_column = NULL; + OBIDMS_column_p head_column = NULL; + OBIDMS_column_p internalcount_column = NULL; + OBIDMS_column_p headcount_column = NULL; + OBIDMS_column_p singletoncount_column = NULL; + OBIDMS_column_p samplecount_column = NULL; + + // TODO other columns + + void* no; + void* yes; + void* key_p; + + bool normalize = false; + int reference = 0; + bool similarity_mode = false; + + char* seq1; + char* seq2; + + // Open DMS + dms = obi_dms(dms_name); + + // Open input view + i_view = obi_open_view(dms, i_view_name); + if (i_view == NULL) + { + obidebug(1, "\nError opening the input view to clean"); + return -1; + } + + // Open the sequence column + if (strcmp((i_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0) + iseq_column = obi_view_get_column(i_view, NUC_SEQUENCE_COLUMN); + else + { + obi_set_errno(OBI_CLEAN_ERROR); + obidebug(1, "\nError: no sequence column"); + return -1; + } + if (iseq_column == NULL) + { + obidebug(1, "\nError getting the sequence column"); + return -1; + } + + // Open the sample column if there is one + if ((strcmp(sample_column_name, "") == 0) || (sample_column_name == NULL)) + sample_column = NULL; + else + { + sample_column = obi_view_get_column(i_view, sample_column_name); + if (sample_column == NULL) + { + obidebug(1, "\nError getting the sample column"); + return -1; + } + } + + // Create the output view + o_view = obi_clone_view(dms, i_view, o_view_name, NULL, o_view_comments); + if (o_view == NULL) + { + obidebug(1, "\nError creating the output view"); + return -1; + } + + sample_count = (sample_column->header)->nb_elements_per_line; + + // Create the output columns + if (create_output_columns(o_view, i_view, sample_column, sample_count) < 0) + // TODO + return -1; + + // Get the created output columns + status_column = obi_view_get_column(o_view, CLEAN_STATUS_COLUMN_NAME); + head_column = obi_view_get_column(o_view, CLEAN_HEAD_COLUMN_NAME); + internalcount_column = obi_view_get_column(o_view, CLEAN_INTERNALCOUNT_COLUMN_NAME); + headcount_column = obi_view_get_column(o_view, CLEAN_HEADCOUNT_COLUMN_NAME); + singletoncount_column = obi_view_get_column(o_view, CLEAN_SINGLETONCOUNT_COLUMN_NAME); + samplecount_column = obi_view_get_column(o_view, CLEAN_SAMPLECOUNT_COLUMN_NAME); + + // Build kmer tables + ktable = hash_seq_column(i_view, iseq_column, 0); + if (ktable == NULL) + { + obi_set_errno(OBI_CLEAN_ERROR); + obidebug(1, "\nError building kmer tables before aligning"); + return -1; + } + + seq_count = (i_view->infos)->line_count; + + // Allocate arrays of pointers to binary trees + yes_trees = (void**) calloc(seq_count, sizeof(void*)); + no_trees = (void**) calloc(seq_count, sizeof(void*)); + + // Allocate and fill index array for the binary trees to reference (they don't copy the data) + index_array = (index_t*) malloc(seq_count * sizeof(index_t)); + for (i=0; i < seq_count; i++) + index_array[i]=i; + + // Initialize all sequences to singletons or NA if no sequences in that sample + for (i=0; i < (seq_count - 1); i++) + { + for (sample=0; sample < sample_count; sample++) + { + if (obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, i, sample) != OBIInt_NA) + { + if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample, 's') < 0) + // TODO + return -1; + } + } + } + + for (sample=0; sample < sample_count; sample++) + { + for (i=0; i < (seq_count - 1); i++) + { + if (i%1000 == 0) + { + p = ((i+(sample*(seq_count)))/(float)(seq_count*sample_count))*100; + fprintf(stderr,"\rDone : %f %% ",p); + } + + // Get first sequence + blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0); + if (blob1 == NULL) + { + obidebug(1, "\nError retrieving sequences to align"); + return -1; + } + + seq1 = obi_get_seq_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0); + + // Get count for this sample + s1_count = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, i, sample); + + for (j=i+1; j < seq_count; j++) // TODO parallelize this loop? + { + // Get second sequence + blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, j, 0); + if (blob2 == NULL) + { + obidebug(1, "\nError retrieving sequences to align"); + return -1; + } + + seq2 = obi_get_seq_with_elt_idx_and_col_p_in_view(i_view, iseq_column, j, 0); + + // Get count for this sample + s2_count = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, j, sample); + + // Checking ratio + if (((s1_count!=OBIInt_NA && s2_count!=OBIInt_NA) && (s1_count>0 && s2_count>0)) && + ((((s1_count >= s2_count) && (((double) s2_count / (double) s1_count) <= max_ratio))) || + (((s2_count >= s1_count) && (((double) s1_count / (double) s2_count) <= max_ratio))))) + { + yes=NULL; + no=NULL; + no = tfind(index_array+j, &(no_trees[i]), idxcmp); + if (no == NULL) + yes = tfind(index_array+j, &(yes_trees[i]), idxcmp); + + above_threshold = false; + if ((no == NULL) && (yes == NULL)) //never compared + { + // Check if the sequences are identical in a quick way (same index in the same indexer) + if (obi_get_index_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0) == obi_get_index_with_elt_idx_and_col_p_in_view(i_view, iseq_column, j, 0)) + above_threshold = true; + else // the sequences aren't identical + { + // kmer filter + align_filters(ktable, blob1, blob2, i, j, threshold, normalize, reference, similarity_mode, &score, &lcs_min, false); + + // Compute alignment score if filter passed + if (score == -1.0) + score = obiblob_sse_banded_lcs_align(blob1, blob2, threshold, normalize, reference, similarity_mode, &lcs_length, &ali_length); + + above_threshold = ((score >= 0) && (score <= threshold)); + } + } + + if (yes || above_threshold) + { + if (yes == NULL) + // put seqs in their 'yes' trees + key_p = tsearch(index_array+j, &(yes_trees[i]), idxcmp); // TODO error checking + // label as head or internal + if (s1_count >= s2_count) + { + if (obi_get_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample) == 's') // seq can become head ONLY if it's a singleton + obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample, 'h'); + // Otherwise it's an internal + obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample, 'i'); + } + else + { + if (obi_get_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample) == 's') // seq can become head ONLY if it's a singleton + obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample, 'h'); + obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample, 'i'); + } + } + else if (no == NULL) + // put in 'no' tree of both sequences // TODO why just one then??? + key_p = tsearch(index_array+j, &(no_trees[i]), idxcmp); // TODO error checking + } + free(seq2); + } + free(seq1); + } + } + + free_kmer_tables(ktable, seq_count); + + fprintf(stderr, "\n"); + + for (i=0; i < (seq_count - 1); i++) + { + if (i%1000 == 0) + { + p = (i/(float)(seq_count))*100; + fprintf(stderr, "\rAnnotating : %f %% ",p); + } + + head = false; + head_count = 0; + internal_count = 0; + singleton_count = 0; + ind_sample_count = 0; + + for (sample=0; sample < sample_count; sample++) + { + // Check if head or singleton in at least one sample + status = obi_get_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample); + if ((!head) && ((status == 'h') || (status == 's'))) + head = true; + + if (status == 's') + { + singleton_count++; + ind_sample_count++; + } + else if (status == 'i') + { + internal_count++; + ind_sample_count++; + } + else if (status == 'h') + { + head_count++; + ind_sample_count++; + } + + } + + if (obi_set_bool_with_elt_idx_and_col_p_in_view(o_view, head_column, i, 0, head) < 0) + // TODO + return -1; + + if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, singletoncount_column, i, 0, singleton_count) < 0) + // TODO + return -1; + if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, internalcount_column, i, 0, internal_count) < 0) + // TODO + return -1; + if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, headcount_column, i, 0, head_count) < 0) + // TODO + return -1; + if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, samplecount_column, i, 0, ind_sample_count) < 0) + // TODO + return -1; + + } + + // Close views + if (obi_save_and_close_view(i_view) < 0) + { + obidebug(1, "\nError closing the input view after aligning"); + return -1; + } + if (obi_save_and_close_view(o_view) < 0) + { + obidebug(1, "\nError closing the output view after aligning"); + return -1; + } + + fprintf(stderr,"\rDone : 100 %% "); + return 0; +} + + + diff --git a/src/obi_clean.h b/src/obi_clean.h new file mode 100644 index 0000000..029ef0c --- /dev/null +++ b/src/obi_clean.h @@ -0,0 +1,85 @@ +/************************************************************************************************* + * Header file for functions tagging a set of sequences for PCR/sequencing errors identification * + *************************************************************************************************/ + +/** + * @file obi_clean.h + * @author Celine Mercier (celine.mercier@metabarcoding.org) + * @date April 9th 2018 + * @brief Header file for the functions tagging a set of sequences for PCR/sequencing errors identification. + */ + + +#ifndef OBI_CLEAN_H_ +#define OBI_CLEAN_H_ + + +#include +#include +#include + +#include "obidms.h" +#include "obiview.h" +#include "obidmscolumn.h" +#include "obitypes.h" + + +/** + * @brief Names and comments of columns automatically created in the output view when aligning. + * + * @since April 2018 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +#define CLEAN_STATUS_COLUMN_NAME "obiclean_status" +#define CLEAN_HEAD_COLUMN_NAME "obiclean_head" +#define CLEAN_SAMPLECOUNT_COLUMN_NAME "obiclean_samplecount" +#define CLEAN_HEADCOUNT_COLUMN_NAME "obiclean_headcount" +#define CLEAN_INTERNALCOUNT_COLUMN_NAME "obiclean_internalcount" +#define CLEAN_SINGLETONCOUNT_COLUMN_NAME "obiclean_singletoncount" + +#define CLEAN_STATUS_COLUMN_COMMENTS "" +#define CLEAN_HEAD_COLUMN_COMMENTS "" +#define CLEAN_SAMPLECOUNT_COLUMN_COMMENTS "" +#define CLEAN_HEADCOUNT_COLUMN_COMMENTS "" +#define CLEAN_INTERNALCOUNT_COLUMN_COMMENTS "" +#define CLEAN_SINGLETONCOUNT_COLUMN_COMMENTS "" + + +/** + * @brief Tags a set of sequences for PCR/sequencing errors identification + * + * Note: The columns where the results are written are automatically named and created. + * + * @param dms A pointer on an OBIDMS. + * @param i_view_name The name of the input view. + * @param sample_column_name The name of the OBI_STR column in the input view where the sample information is kept. + * NULL or "" (empty string) if there is no sample information. + * @param o_view_name The name of the output view where the results should be written (should not already exist). + * @param o_view_comments The comments that should be associated with the output view. + * @param threshold Similarity threshold expressed as a number of differences. + * Only sequence pairs with a similarity above the threshold are clustered. + * @param max_ratio Maximum ratio between the counts of two sequences so that the less abundant one can be considered + * as a variant of the more abundant one. + * @param heads_only If true, only cluster heads are printed to the output view. + * @param thread_count Number of threads to use (Not available yet) TODO + * + * @returns A value indicating the success of the operation. + * @retval 0 if the operation was successfully completed. + * @retval -1 if an error occurred. + * + * @since April 2017 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +int obi_clean(const char* dms_name, + const char* i_view_name, + const char* sample_column_name, + const char* o_view_name, + const char* o_view_comments, + double threshold, + double max_ratio, + bool heads_only, + int thread_count); + + +#endif /* OBI_CLEAN_H_ */ + diff --git a/src/obierrno.h b/src/obierrno.h index faa7b08..61fe10b 100644 --- a/src/obierrno.h +++ b/src/obierrno.h @@ -122,6 +122,8 @@ extern int obi_errno; */ #define OBI_ELT_IDX_ERROR (31) /** Error setting or getting a value at a non-existent element index or with a non-existent element name */ +#define OBI_CLEAN_ERROR (32) /** Error while cleaning sequences + */ /**@}*/