obi clean: first version

2018-05-18 14:26:54 +02:00
parent f0c147c252
commit 6911bf4d70
7 changed files with 663 additions and 0 deletions
--- a/python/obitools3/commands/clean.pyx
+++ b/python/obitools3/commands/clean.pyx
@ -0,0 +1,94 @@
+#cython: language_level=3
+
+from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
+from obitools3.dms.dms cimport DMS
+from obitools3.dms.capi.obidms cimport OBIDMS_p
+from obitools3.dms.view import RollbackException
+from obitools3.dms.capi.obiclean cimport obi_clean
+from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
+from obitools3.uri.decode import open_uri
+from obitools3.apps.config import logger
+from obitools3.utils cimport tobytes
+
+from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
+
+
+__title__="Tag a set of sequences for PCR and sequencing errors identification"
+
+
+def addOptions(parser):
+
+    addSequenceInputOption(parser)
+    addMinimalOutputOption(parser)
+ 
+    group = parser.add_argument_group('obi clean specific options')
+
+    group.add_argument('--distance', '-d',
+                       action="store", dest="clean:distance",
+                       metavar='<DISTANCE>',
+                       default=1.0,
+                       type=float,
+                       help="Maximum numbers of errors between two variant sequences. Default: 1.")
+
+    group.add_argument('--sample-tag', '-s',
+                       action="store", 
+                       dest="clean:sample-tag-name",
+                       metavar="<SAMPLE TAG NAME>",
+                       type=str,
+                       default="merged_sample",
+                       help="Name of the tag where sample counts are kept.")
+    
+    group.add_argument('--ratio', '-r',
+                       action="store", dest="clean:ratio",
+                       metavar='<RATIO>',
+                       default=0.5,
+                       type=float,
+                       help="Maximum ratio between the counts of two sequences so that the less abundant one can be considered"
+                            " a variant of the more abundant one. Default: 0.5.")
+ 
+    group.add_argument('--heads-only', '-H',
+                       action="store_true", 
+                       dest="clean:heads-only",
+                       default=False,
+                       help="Only sequences labeled as heads are kept in the output. Default: False")
+
+    group.add_argument('--cluster-tags', '-C',
+                       action="store_true", 
+                       dest="clean:cluster-tags",
+                       default=False,
+                       help="Adds tags for each sequence giving its cluster's head and weight for each sample.")
+
+
+def run(config):
+        
+    DMS.obi_atexit()
+    
+    logger("info", "obi clean")
+    
+    # Open DMS
+    dms_name = config['obi']['inputURI'].split('/')[0]
+    dms = open_uri(dms_name)[0]
+    
+    # Read the name of the input view
+    uri_i = config['obi']['inputURI'].split('/')
+    i_view_name = uri_i[1]
+
+    # Read the name of the output view
+    uri_o = config['obi']['outputURI'].split('/')
+    if len(uri_o)==2:
+        # Check that input and output DMS are the same (predicate, to discuss)
+        if dms_name != uri_o[0]:
+            raise Exception("Input and output DMS must be the same")
+        o_view_name = uri_o[1]
+    else:
+        o_view_name = uri_o[0]
+    
+    if obi_clean(tobytes(dms_name), tobytes(i_view_name), tobytes(config['clean']['sample-tag-name']), tobytes(o_view_name), b"obiclean", \
+              config['clean']['distance'], config['clean']['ratio'], config['clean']['heads-only'], 1) < 0:
+        raise Exception("Error running obiclean")
+
+    print("\n")
+    print(repr(dms[o_view_name]))
+
+    dms.close()
+
--- a/python/obitools3/dms/capi/obiclean.pxd
+++ b/python/obitools3/dms/capi/obiclean.pxd
@ -0,0 +1,17 @@
+#cython: language_level=3
+
+from obitools3.dms.capi.obidms       cimport OBIDMS_p
+
+
+cdef extern from "obi_clean.h" nogil:
+
+    int obi_clean(const char* dms_name,
+                  const char* i_view_name,
+                  const char* sample_column_name,
+                  const char* o_view_name,
+                  const char* o_view_comments,
+                  double threshold,
+                  double max_ratio,
+                  bint heads_only,
+                  int thread_count)
+    
--- a/python/obitools3/dms/dms.cfiles
+++ b/python/obitools3/dms/dms.cfiles
@ -8,6 +8,7 @@
 ../../../src/linked_list.c
 ../../../src/murmurhash2.c
 ../../../src/obi_align.c
+../../../src/obi_clean.c
 ../../../src/obiavl.c
 ../../../src/obiblob_indexer.c
 ../../../src/obiblob.c
--- a/python/obitools3/utils.cfiles
+++ b/python/obitools3/utils.cfiles
@ -8,6 +8,7 @@
 ../../src/linked_list.c
 ../../src/murmurhash2.c
 ../../src/obi_align.c
+../../src/obi_clean.c
 ../../src/obiavl.c
 ../../src/obiblob_indexer.c
 ../../src/obiblob.c
--- a/src/obi_clean.c
+++ b/src/obi_clean.c
@ -0,0 +1,463 @@
+/****************************************************************************
+ * Tags a set of sequences for PCR/sequencing errors identification		    *
+ ****************************************************************************/
+
+/**
+ * @file obi_clean.c
+ * @author Celine Mercier (celine.mercier@metabarcoding.org)
+ * @date April 9th 2018
+ * @brief Functions tagging a set of sequences for PCR/sequencing errors identification.
+ */
+
+//#define OMP_SUPPORT // TODO
+#ifdef OMP_SUPPORT
+#include <omp.h>
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <search.h>
+#include <sys/time.h>
+
+#include "obi_clean.h"
+#include "obidms.h"
+#include "obidebug.h"
+#include "obierrno.h"
+#include "obitypes.h"
+#include "obiview.h"
+#include "sse_banded_LCS_alignment.h"
+#include "upperband.h"
+#include "obiblob.h"
+
+
+
+#define DEBUG_LEVEL 0	// TODO has to be defined somewhere else (cython compil flag?)
+
+
+/**************************************************************************
+ *
+ * D E C L A R A T I O N   O F   T H E   P R I V A T E   F U N C T I O N S
+ *
+ **************************************************************************/
+
+// TODO
+static int create_output_columns(Obiview_p o_view,
+								 Obiview_p i_view,
+								 OBIDMS_column_p sample_column,
+								 int sample_count);
+
+
+static inline int idxcmp(const void* idx1, const void* idx2);
+
+
+/************************************************************************
+ *
+ * D E F I N I T I O N   O F   T H E   P R I V A T E   F U N C T I O N S
+ *
+ ************************************************************************/
+
+static int create_output_columns(Obiview_p o_view,
+								 Obiview_p i_view,
+								 OBIDMS_column_p sample_column,
+								 int sample_count)
+{
+	// Status column
+	if (obi_view_add_column(o_view, CLEAN_STATUS_COLUMN_NAME, -1, NULL, OBI_CHAR, 0, sample_count, (sample_column->header)->elements_names, true, false, false, NULL, NULL, -1, CLEAN_STATUS_COLUMN_COMMENTS, true) < 0)
+	{
+		obidebug(1, "\nError creating the %s column", CLEAN_STATUS_COLUMN_NAME);
+		return -1;
+	}
+
+	// Head column
+	if (obi_view_add_column(o_view, CLEAN_HEAD_COLUMN_NAME, -1, NULL, OBI_BOOL, 0, 1, NULL, false, false, false, NULL, NULL, -1, CLEAN_HEAD_COLUMN_COMMENTS, true) < 0)
+	{
+		obidebug(1, "\nError creating the %s column", CLEAN_HEAD_COLUMN_NAME);
+		return -1;
+	}
+
+	// Sample count column
+	if (obi_view_add_column(o_view, CLEAN_SAMPLECOUNT_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, CLEAN_SAMPLECOUNT_COLUMN_COMMENTS, true) < 0)
+	{
+		obidebug(1, "\nError creating the %s column", CLEAN_SAMPLECOUNT_COLUMN_NAME);
+		return -1;
+	}
+
+	// Head count column
+	if (obi_view_add_column(o_view, CLEAN_HEADCOUNT_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, CLEAN_HEADCOUNT_COLUMN_COMMENTS, true) < 0)
+	{
+		obidebug(1, "\nError creating the %s column", CLEAN_HEADCOUNT_COLUMN_NAME);
+		return -1;
+	}
+
+	// Internal count column
+	if (obi_view_add_column(o_view, CLEAN_INTERNALCOUNT_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, CLEAN_INTERNALCOUNT_COLUMN_COMMENTS, true) < 0)
+	{
+		obidebug(1, "\nError creating the %s column", CLEAN_INTERNALCOUNT_COLUMN_NAME);
+		return -1;
+	}
+
+	// Singleton count column
+	if (obi_view_add_column(o_view, CLEAN_SINGLETONCOUNT_COLUMN_NAME, -1, NULL, OBI_INT, 0, 1, NULL, false, false, false, NULL, NULL, -1, CLEAN_SINGLETONCOUNT_COLUMN_COMMENTS, true) < 0)
+	{
+		obidebug(1, "\nError creating the %s column", CLEAN_SINGLETONCOUNT_COLUMN_NAME);
+		return -1;
+	}
+
+	return 0;
+}
+
+
+static inline int idxcmp(const void* idx1, const void* idx2)
+{
+    if (*(index_t*) idx1 < *(index_t*) idx2)
+        return -1;
+    if (*(index_t*) idx1 > *(index_t*) idx2)
+        return 1;
+    return 0;
+}
+
+
+/**********************************************************************
+ *
+ * D E F I N I T I O N   O F   T H E   P U B L I C   F U N C T I O N S
+ *
+ **********************************************************************/
+
+
+// TODO comment
+int obi_clean(const char* dms_name,
+		      const char* i_view_name,
+			  const char* sample_column_name,
+			  const char* o_view_name,
+			  const char* o_view_comments,
+			  double threshold,
+			  double max_ratio,
+			  bool heads_only,
+			  int thread_count)
+{
+	float 			p;
+	index_t         i, j;
+	index_t         seq_count;
+	index_t*  		index_array;
+	double          score;
+	bool            above_threshold;
+	int             lcs_length;
+	int             ali_length;
+	Kmer_table_p    ktable;
+	Obi_blob_p      blob1;
+	Obi_blob_p   	blob2;
+	int				lcs_min;
+	int				sample_count;
+	int 			sample;
+	int             s1_count;
+	int             s2_count;
+	bool            head;
+	int             head_count;
+	int       	    internal_count;
+	int 			singleton_count;
+	int 			ind_sample_count;
+	char  			status;
+
+	void**          yes_trees;
+	void**          no_trees;
+
+	OBIDMS_p        dms = NULL;
+	Obiview_p       i_view = NULL;
+	Obiview_p       o_view = NULL;
+	OBIDMS_column_p iseq_column = NULL;
+	OBIDMS_column_p sample_column = NULL;
+	OBIDMS_column_p status_column = NULL;
+	OBIDMS_column_p head_column = NULL;
+	OBIDMS_column_p internalcount_column = NULL;
+	OBIDMS_column_p headcount_column = NULL;
+	OBIDMS_column_p singletoncount_column = NULL;
+	OBIDMS_column_p samplecount_column = NULL;
+
+	// TODO other columns
+
+	void* no;
+	void* yes;
+	void* key_p;
+
+	bool normalize = false;
+	int reference = 0;
+	bool similarity_mode = false;
+
+	char* seq1;
+	char* seq2;
+
+	// Open DMS
+	dms = obi_dms(dms_name);
+
+	// Open input view
+	i_view = obi_open_view(dms, i_view_name);
+	if (i_view == NULL)
+	{
+		obidebug(1, "\nError opening the input view to clean");
+		return -1;
+	}
+
+	// Open the sequence column
+	if (strcmp((i_view->infos)->view_type, VIEW_TYPE_NUC_SEQS) == 0)
+		iseq_column = obi_view_get_column(i_view, NUC_SEQUENCE_COLUMN);
+	else
+	{
+		obi_set_errno(OBI_CLEAN_ERROR);
+		obidebug(1, "\nError: no sequence column");
+		return -1;
+	}
+	if (iseq_column == NULL)
+	{
+		obidebug(1, "\nError getting the sequence column");
+		return -1;
+	}
+
+	// Open the sample column if there is one
+	if ((strcmp(sample_column_name, "") == 0) || (sample_column_name == NULL))
+		sample_column = NULL;
+	else
+	{
+		sample_column = obi_view_get_column(i_view, sample_column_name);
+		if (sample_column == NULL)
+		{
+			obidebug(1, "\nError getting the sample column");
+			return -1;
+		}
+	}
+
+	// Create the output view
+	o_view = obi_clone_view(dms, i_view, o_view_name, NULL, o_view_comments);
+	if (o_view == NULL)
+	{
+		obidebug(1, "\nError creating the output view");
+		return -1;
+	}
+
+	sample_count = (sample_column->header)->nb_elements_per_line;
+
+	// Create the output columns
+	if (create_output_columns(o_view, i_view, sample_column, sample_count) < 0)
+		// TODO
+		return -1;
+
+	// Get the created output columns
+	status_column = obi_view_get_column(o_view, CLEAN_STATUS_COLUMN_NAME);
+	head_column = obi_view_get_column(o_view, CLEAN_HEAD_COLUMN_NAME);
+	internalcount_column = obi_view_get_column(o_view, CLEAN_INTERNALCOUNT_COLUMN_NAME);
+	headcount_column = obi_view_get_column(o_view, CLEAN_HEADCOUNT_COLUMN_NAME);
+	singletoncount_column = obi_view_get_column(o_view, CLEAN_SINGLETONCOUNT_COLUMN_NAME);
+	samplecount_column = obi_view_get_column(o_view, CLEAN_SAMPLECOUNT_COLUMN_NAME);
+
+	// Build kmer tables
+	ktable = hash_seq_column(i_view, iseq_column, 0);
+	if (ktable == NULL)
+	{
+		obi_set_errno(OBI_CLEAN_ERROR);
+		obidebug(1, "\nError building kmer tables before aligning");
+		return -1;
+	}
+
+	seq_count = (i_view->infos)->line_count;
+
+	// Allocate arrays of pointers to binary trees
+	yes_trees = (void**) calloc(seq_count, sizeof(void*));
+	no_trees = (void**) calloc(seq_count, sizeof(void*));
+
+	// Allocate and fill index array for the binary trees to reference (they don't copy the data)
+	index_array = (index_t*) malloc(seq_count * sizeof(index_t));
+	for (i=0; i < seq_count; i++)
+		index_array[i]=i;
+
+	// Initialize all sequences to singletons or NA if no sequences in that sample
+	for (i=0; i < (seq_count - 1); i++)
+	{
+		for (sample=0; sample < sample_count; sample++)
+		{
+			if (obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, i, sample) != OBIInt_NA)
+			{
+				if (obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample, 's') < 0)
+				// TODO
+					return -1;
+			}
+		}
+	}
+
+	for (sample=0; sample < sample_count; sample++)
+	{
+		for (i=0; i < (seq_count - 1); i++)
+		{
+			if (i%1000 == 0)
+			{
+				p = ((i+(sample*(seq_count)))/(float)(seq_count*sample_count))*100;
+			    fprintf(stderr,"\rDone : %f %%          ",p);
+			}
+
+			// Get first sequence
+			blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0);
+			if (blob1 == NULL)
+			{
+				obidebug(1, "\nError retrieving sequences to align");
+				return -1;
+			}
+
+			seq1 = obi_get_seq_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0);
+
+			// Get count for this sample
+			s1_count = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, i, sample);
+
+			for (j=i+1; j < seq_count; j++)  // TODO parallelize this loop?
+			{
+				// Get second sequence
+				blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(i_view, iseq_column, j, 0);
+				if (blob2 == NULL)
+				{
+					obidebug(1, "\nError retrieving sequences to align");
+					return -1;
+				}
+
+				seq2 = obi_get_seq_with_elt_idx_and_col_p_in_view(i_view, iseq_column, j, 0);
+
+				// Get count for this sample
+				s2_count = obi_get_int_with_elt_idx_and_col_p_in_view(i_view, sample_column, j, sample);
+
+				// Checking ratio
+				if (((s1_count!=OBIInt_NA && s2_count!=OBIInt_NA) && (s1_count>0 && s2_count>0)) &&
+						((((s1_count >= s2_count) && (((double) s2_count / (double) s1_count) <= max_ratio))) ||
+						(((s2_count >= s1_count) && (((double) s1_count / (double) s2_count) <= max_ratio)))))
+				{
+					yes=NULL;
+					no=NULL;
+					no = tfind(index_array+j, &(no_trees[i]), idxcmp);
+					if (no == NULL)
+						yes = tfind(index_array+j, &(yes_trees[i]), idxcmp);
+
+					above_threshold = false;
+					if ((no == NULL) && (yes == NULL))			//never compared
+					{
+						// Check if the sequences are identical in a quick way (same index in the same indexer)
+						if (obi_get_index_with_elt_idx_and_col_p_in_view(i_view, iseq_column, i, 0) == obi_get_index_with_elt_idx_and_col_p_in_view(i_view, iseq_column, j, 0))
+							above_threshold = true;
+						else // the sequences aren't identical
+						{
+							// kmer filter
+							align_filters(ktable, blob1, blob2, i, j, threshold, normalize, reference, similarity_mode, &score, &lcs_min, false);
+
+							// Compute alignment score if filter passed
+							if (score == -1.0)
+								score = obiblob_sse_banded_lcs_align(blob1, blob2, threshold, normalize, reference, similarity_mode, &lcs_length, &ali_length);
+
+							above_threshold = ((score >= 0) && (score <= threshold));
+						}
+					}
+
+					if (yes || above_threshold)
+					{
+						if (yes == NULL)
+							// put seqs in their 'yes' trees
+							key_p = tsearch(index_array+j, &(yes_trees[i]), idxcmp);   // TODO error checking
+						// label as head or internal
+						if (s1_count >= s2_count)
+						{
+							if (obi_get_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample) == 's')	// seq can become head ONLY if it's a singleton
+								obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample, 'h');
+							// Otherwise it's an internal
+							obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample, 'i');
+						}
+						else
+						{
+							if (obi_get_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample) == 's')	// seq can become head ONLY if it's a singleton
+								obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, j, sample, 'h');
+							obi_set_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample, 'i');
+						}
+					}
+					else if (no == NULL)
+					// put in 'no' tree of both sequences  // TODO why just one then???
+						key_p = tsearch(index_array+j, &(no_trees[i]), idxcmp);   // TODO error checking
+				}
+				free(seq2);
+			}
+			free(seq1);
+		}
+	}
+
+	free_kmer_tables(ktable, seq_count);
+
+	fprintf(stderr, "\n");
+
+	for (i=0; i < (seq_count - 1); i++)
+	{
+		if (i%1000 == 0)
+		{
+			p = (i/(float)(seq_count))*100;
+		    fprintf(stderr, "\rAnnotating : %f %%          ",p);
+		}
+
+		head = false;
+		head_count = 0;
+		internal_count = 0;
+		singleton_count = 0;
+		ind_sample_count = 0;
+
+		for (sample=0; sample < sample_count; sample++)
+		{
+			// Check if head or singleton in at least one sample
+			status = obi_get_char_with_elt_idx_and_col_p_in_view(o_view, status_column, i, sample);
+			if ((!head) && ((status == 'h') || (status == 's')))
+				head = true;
+
+			if (status == 's')
+			{
+				singleton_count++;
+				ind_sample_count++;
+			}
+			else if (status == 'i')
+			{
+				internal_count++;
+				ind_sample_count++;
+			}
+			else if (status == 'h')
+			{
+				head_count++;
+				ind_sample_count++;
+			}
+
+		}
+
+		if (obi_set_bool_with_elt_idx_and_col_p_in_view(o_view, head_column, i, 0, head) < 0)
+			// TODO
+			return -1;
+
+		if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, singletoncount_column, i, 0, singleton_count) < 0)
+			// TODO
+			return -1;
+		if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, internalcount_column, i, 0, internal_count) < 0)
+			// TODO
+			return -1;
+		if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, headcount_column, i, 0, head_count) < 0)
+			// TODO
+			return -1;
+		if (obi_set_int_with_elt_idx_and_col_p_in_view(o_view, samplecount_column, i, 0, ind_sample_count) < 0)
+			// TODO
+			return -1;
+
+	}
+
+	// Close views
+	if (obi_save_and_close_view(i_view) < 0)
+	{
+		obidebug(1, "\nError closing the input view after aligning");
+		return -1;
+	}
+	if (obi_save_and_close_view(o_view) < 0)
+	{
+		obidebug(1, "\nError closing the output view after aligning");
+		return -1;
+	}
+
+	fprintf(stderr,"\rDone : 100 %%           ");
+	return 0;
+}
+
+
+
--- a/src/obi_clean.h
+++ b/src/obi_clean.h
@ -0,0 +1,85 @@
+/*************************************************************************************************
+ * Header file for functions tagging a set of sequences for PCR/sequencing errors identification *
+ *************************************************************************************************/
+
+/**
+ * @file obi_clean.h
+ * @author Celine Mercier (celine.mercier@metabarcoding.org)
+ * @date April 9th 2018
+ * @brief Header file for the functions tagging a set of sequences for PCR/sequencing errors identification.
+ */
+
+
+#ifndef OBI_CLEAN_H_
+#define OBI_CLEAN_H_
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+
+#include "obidms.h"
+#include "obiview.h"
+#include "obidmscolumn.h"
+#include "obitypes.h"
+
+
+/**
+ * @brief Names and comments of columns automatically created in the output view when aligning.
+ *
+ * @since April 2018
+ * @author Celine Mercier (celine.mercier@metabarcoding.org)
+ */
+#define CLEAN_STATUS_COLUMN_NAME "obiclean_status"
+#define CLEAN_HEAD_COLUMN_NAME "obiclean_head"
+#define CLEAN_SAMPLECOUNT_COLUMN_NAME "obiclean_samplecount"
+#define CLEAN_HEADCOUNT_COLUMN_NAME "obiclean_headcount"
+#define CLEAN_INTERNALCOUNT_COLUMN_NAME "obiclean_internalcount"
+#define CLEAN_SINGLETONCOUNT_COLUMN_NAME "obiclean_singletoncount"
+
+#define CLEAN_STATUS_COLUMN_COMMENTS ""
+#define CLEAN_HEAD_COLUMN_COMMENTS ""
+#define CLEAN_SAMPLECOUNT_COLUMN_COMMENTS ""
+#define CLEAN_HEADCOUNT_COLUMN_COMMENTS ""
+#define CLEAN_INTERNALCOUNT_COLUMN_COMMENTS ""
+#define CLEAN_SINGLETONCOUNT_COLUMN_COMMENTS ""
+
+
+/**
+ * @brief Tags a set of sequences for PCR/sequencing errors identification
+ *
+ * Note: The columns where the results are written are automatically named and created.
+ *
+ * @param dms A pointer on an OBIDMS.
+ * @param i_view_name The name of the input view.
+ * @param sample_column_name The name of the OBI_STR column in the input view where the sample information is kept.
+ *                           NULL or "" (empty string) if there is no sample information.
+ * @param o_view_name The name of the output view where the results should be written (should not already exist).
+ * @param o_view_comments The comments that should be associated with the output view.
+ * @param threshold Similarity threshold expressed as a number of differences.
+ *                  Only sequence pairs with a similarity above the threshold are clustered.
+ * @param max_ratio Maximum ratio between the counts of two sequences so that the less abundant one can be considered
+ *          	    as a variant of the more abundant one.
+ * @param heads_only If true, only cluster heads are printed to the output view.
+ * @param thread_count Number of threads to use (Not available yet) TODO
+ *
+ * @returns A value indicating the success of the operation.
+ * @retval 0 if the operation was successfully completed.
+ * @retval -1 if an error occurred.
+ *
+ * @since April 2017
+ * @author Celine Mercier (celine.mercier@metabarcoding.org)
+ */
+int obi_clean(const char* dms_name,
+		      const char* i_view_name,
+			  const char* sample_column_name,
+			  const char* o_view_name,
+			  const char* o_view_comments,
+			  double threshold,
+			  double max_ratio,
+			  bool heads_only,
+			  int thread_count);
+
+
+#endif /* OBI_CLEAN_H_ */
+
--- a/src/obierrno.h
+++ b/src/obierrno.h
@ -122,6 +122,8 @@ extern int obi_errno;
 											 */
 #define OBI_ELT_IDX_ERROR           (31)    /** Error setting or getting a value at a non-existent element index or with a non-existent element name
 											 */
+#define OBI_CLEAN_ERROR				(32)    /** Error while cleaning sequences
+ 	 	 	 	 	 	 	 	 	 	 	 */

 /**@}*/