Alignpairedend: added alignment using shifting with best kmer similarity

(low level layer in C and Cython API)
2019-01-21 17:30:46 +01:00
parent 59017c0d6b
commit c7f5b8d980
10 changed files with 934 additions and 89 deletions
--- a/python/obitools3/commands/alignpairedend.pxd
+++ b/python/obitools3/commands/alignpairedend.pxd
@ -0,0 +1,4 @@
 #cython: language_level=3
 cdef object buildAlignment(object direct, object reverse)
--- a/python/obitools3/commands/alignpairedend.pyx
+++ b/python/obitools3/commands/alignpairedend.pyx
@ -2,19 +2,20 @@
 from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
 from obitools3.dms import DMS
 from obitools3.dms.view.view cimport View
 from obitools3.dms.view import RollbackException
 from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
-from obitools3.dms.column.column cimport Column, Column_line
+from obitools3.dms.column.column cimport Column
-from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN
+from obitools3.dms.capi.obiview cimport QUALITY_COLUMN
-from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t, OBI_QUAL
+from obitools3.dms.capi.obitypes cimport OBI_QUAL
 from obitools3.apps.optiongroups import addMinimalInputOption, addMinimalOutputOption
 from obitools3.uri.decode import open_uri
 from obitools3.apps.config import logger
 from obitools3.libalign._qsassemble import QSolexaReverseAssemble
 from obitools3.libalign._qsrassemble import QSolexaRightReverseAssemble
 from obitools3.libalign._solexapairend import buildConsensus, buildJoinedSequence
-from obitools3.dms.obiseq cimport Nuc_Seq_Stored, Nuc_Seq
+from obitools3.dms.obiseq cimport Nuc_Seq
 from obitools3.libalign.shifted_ali cimport Kmer_similarity, Ali_shifted
 import sys
 import os
@ -27,6 +28,7 @@ REVERSE_QUALITY_COLUMN_NAME = b"REVERSE_QUALITY"
 __title__="Aligns paired-ended reads"
 def addOptions(parser):
    addMinimalInputOption(parser)
@ -41,14 +43,6 @@ def addOptions(parser):
                     type=str,
                     help="URI to the reverse reads if they are in a different view than the forward reads")
 # TODO
 #    group.add_argument('--index-file',
 #                     action="store", dest="alignpairedend:index",
 #                     metavar="<FILENAME>",
 #                     default=None,
 #                     type=str,
 #                     help="URI to the index reads")
    group.add_argument('--score-min',
                     action="store", dest="alignpairedend:smin",
                     metavar="#.###",
@ -56,12 +50,22 @@ def addOptions(parser):
                     type=float,
                     help="Minimum score for keeping alignments")
    group.add_argument('-A', '--true-ali',
                       action="store_true", dest="alignpairedend:trueali",
                       default=False,
                       help="Performs gap free end alignment of sequences instead of using kmers to compute alignments (slower).")
    group.add_argument('-k', '--kmer-size',
                       action="store", dest="alignpairedend:kmersize",
                       metavar="#",
                       default=3,
                       type=int,
                       help="K-mer size for kmer comparisons, between 1 and 4 (not when using -A option; default: 3)")
 # TODO declarations, cdef, imports
 la = QSolexaReverseAssemble()
 ra = QSolexaRightReverseAssemble()
-def buildAlignment(object direct, object reverse):
+cdef object buildAlignment(object direct, object reverse):
    if len(direct)==0 or len(reverse)==0:
        return None
@ -84,7 +88,7 @@ def buildAlignment(object direct, object reverse):
    return ali
-def alignmentIterator(entries): 
+def alignmentIterator(entries, aligner): 
    if type(entries) == list:
        two_views = True
@ -104,9 +108,12 @@ def alignmentIterator(entries):
            seqR = Nuc_Seq(seqF.id, seqF[REVERSE_SEQ_COLUMN_NAME], quality=seqF[REVERSE_QUALITY_COLUMN_NAME])
            seqF.pop(REVERSE_SEQ_COLUMN_NAME)
            seqF.pop(REVERSE_QUALITY_COLUMN_NAME)
-        ali = buildAlignment(seqF, seqR)
+        
        ali = aligner(seqF, seqR)
        if ali is None:
            continue
        yield ali
@ -160,11 +167,6 @@ def run(config):
    else:
        entries_len = len(entries)
 #     if "index" in config["alignpairedend"]:    # TODO
 #         index = open_uri(config["alignpairedend"]["index"])
 #     else:
 #         index = None
    # Open the output
    output = open_uri(config['obi']['outputURI'],
                      input=False,
@ -174,21 +176,29 @@ def run(config):
    view = output[1]
-    Column.new_column(view, b"QUALITY", OBI_QUAL)   #TODO output URI quality option
+    Column.new_column(view, QUALITY_COLUMN, OBI_QUAL)   #TODO output URI quality option?
    if 'smin' in config['alignpairedend']:
-        config['alignpairedend']['sminL'] = config['alignpairedend']['smin']
+        smin = config['alignpairedend']['smin']
        config['alignpairedend']['sminR'] = config['alignpairedend']['smin']
        sminL = config['alignpairedend']['sminL']
        sminR = config['alignpairedend']['sminR']
    else:
-        sminL = 0
+        smin = 0
        sminR = 0
    # Initialize the progress bar
    pb = ProgressBar(entries_len, config, seconde=5)
-    ba = alignmentIterator(entries)
+    if config['alignpairedend']['trueali']:
        kmer_ali = False
        aligner = buildAlignment
    else :
        kmer_ali = True
        if type(entries) == list:
            forward = entries[0]
            reverse = entries[1]
            aligner = Kmer_similarity(forward, view2=reverse, kmer_size=config['alignpairedend']['kmersize'])
        else:
            aligner = Kmer_similarity(entries, kmer_size=config['alignpairedend']['kmersize'])
    ba = alignmentIterator(entries, aligner)
    i = 0
    for ali in ba:
@ -197,31 +207,33 @@ def run(config):
        consensus = view[i]
-        if sminL > 0:
+        if not two_views:
-            if ((ali.direction=='left' and ali.score > sminL)
+            seqF = entries[i]
-                or (ali.score > sminR)):
+        else:
-                buildConsensus(ali, consensus)
+            seqF = forward[i]
        if smin > 0:
            if (ali.score > smin) :
                buildConsensus(ali, consensus, seqF)
            else:
                seqF = ali[0].wrapped
                if not two_views:
-                    seq = entries[i]
+                    seqR = Nuc_Seq(seqF.id, seqF[REVERSE_SEQ_COLUMN_NAME], quality = seqF[REVERSE_QUALITY_COLUMN_NAME])
                    seqR = Nuc_Seq(seq.id, seq[REVERSE_SEQ_COLUMN_NAME], quality = seq[REVERSE_QUALITY_COLUMN_NAME])
                else:
                    seqR = reverse[i]
-                buildJoinedSequence(ali, seqR, consensus)
+                buildJoinedSequence(ali, seqR, consensus, forward=seqF)
-            consensus[b"sminL"] = sminL
+            consensus[b"smin"] = smin
            consensus[b"sminR"] = sminR
        else:
-            buildConsensus(ali, consensus)
+            buildConsensus(ali, consensus, seqF)
-# TODO            
+        if kmer_ali :
-#        if "index" in config['alignpairedend'] and config['alignpairedend']['index'] is not None:
+            ali.free()
 #            idx = str(config['alignpairedend']['index'].next())
 #            consensus[b"illumina_index"] = idx
        i+=1
    if kmer_ali :
        aligner.free()
    # Save command config in View and DMS comments
    command_line = " ".join(sys.argv[1:])
    view.write_config(config, "alignpairedend", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
--- a/python/obitools3/dms/capi/kmer_similarity.pxd
+++ b/python/obitools3/dms/capi/kmer_similarity.pxd
@ -0,0 +1,42 @@
 #cython: language_level=3
 from obitools3.dms.capi.obiview cimport Obiview_p
 from obitools3.dms.capi.obidmscolumn cimport OBIDMS_column_p
 from obitools3.dms.capi.obitypes cimport index_t
 from libc.stdint  cimport int32_t, uint8_t
 cdef extern from "kmer_similarity.h" nogil:
    struct Obi_ali_t :
        double   score
        int      consensus_length
        int      overlap_length
        char*    consensus_seq
        uint8_t* consensus_qual
        int      shift
        char*    direction
    ctypedef Obi_ali_t* Obi_ali_p
    void obi_free_shifted_ali(Obi_ali_p ali)
    Obi_ali_p kmer_similarity(Obiview_p view1,
                              OBIDMS_column_p column1,
                              index_t idx1,
                              index_t elt_idx1,
                              Obiview_p view2,
                              OBIDMS_column_p column2,
                              index_t idx2,
                              index_t elt_idx2,
                              uint8_t kmer_size,
                              int32_t* kmer_pos_array,
                              int32_t* kmer_pos_array_height_p,
                              int32_t* shift_array,
                              int32_t* shift_array_height_p,
                              int32_t* shift_count_array, 
                              int32_t* shift_count_array_height_p,
                              bint build_consensus)
--- a/python/obitools3/dms/dms.cfiles
+++ b/python/obitools3/dms/dms.cfiles
@ -6,6 +6,7 @@
 ../../../src/dna_seq_indexer.c
 ../../../src/encode.c
 ../../../src/hashtable.c
 ../../../src/kmer_similarity.c
 ../../../src/linked_list.c
 ../../../src/murmurhash2.c
 ../../../src/obi_lcs.c
--- a/python/obitools3/libalign/_solexapairend.pyx
+++ b/python/obitools3/libalign/_solexapairend.pyx
@ -3,6 +3,18 @@
 from cpython cimport array
 from .solexapairend import iterOnAligment
 from .shifted_ali cimport Ali_shifted
 from obitools3.dms.capi.obiview cimport Obiview_p, QUALITY_COLUMN, \
                                        obi_set_qual_int_with_elt_idx_and_col_p_in_view, \
                                        obi_set_str_with_elt_idx_and_col_p_in_view
 from obitools3.dms.capi.obidmscolumn cimport OBIDMS_column_p
 from obitools3.dms.view.view cimport View
 from obitools3.dms.column.column cimport Column
 from math import log10
@ -146,7 +158,7 @@ cdef class IterOnConsensus:
    seqBDeletion = property(get_seqBDeletion, None, None, "deletereverse's docstring")
-def buildConsensus(ali, seq):
+def buildConsensus(ali, seq, ref_tags=None):
    cdef list quality
    cdef char   aseq[1000]
    cdef int i=0
@ -156,61 +168,82 @@ def buildConsensus(ali, seq):
    cdef double score
    cdef bytes sseq
    cdef double p
    cdef OBIDMS_column_p col_p
    cdef Obiview_p view_p
    cdef View view
    cdef Column column
    quality = []
-    if len(ali[0])>999:   # TODO why?
+    if type(ali) == Ali_shifted:
-        raise AssertionError,"Too long alignemnt"
+        view = seq.view
        view_p = view.pointer()
        column = view[QUALITY_COLUMN]
        col_p = column.pointer()
        # doesn't work because uint8_t* are forced into bytes by cython (nothing read/kept beyond 0 values)
        #obi_set_qual_int_with_elt_idx_and_col_p_in_view(view_p, col_p, seq.index, 0, ali.consensus_qual, ali.consensus_len)
        seq.set(ref_tags.id+b"_CONS", ali.consensus_seq, quality=ali.consensus_qual, tags=ref_tags)
        seq[b'ali_length'] = ali.consensus_len
        seq[b'score_norm']=float(ali.score)/ali.consensus_len
        seq[b'shift']=ali.shift
    else:
        if len(ali[0])>999:   # TODO why?
            raise AssertionError,"Too long alignemnt"
-    ic=IterOnConsensus(ali)
+        ic=IterOnConsensus(ali)
-    for nuc,score in ic:
+        for nuc,score in ic:
-        cnuc=nuc
+            cnuc=nuc
-        quality.append(score)
+            quality.append(score)
-        aseq[i]=cnuc[0]
+            aseq[i]=cnuc[0]
-        i+=1
+            i+=1
-    aseq[i]=0
+        aseq[i]=0
-    sseq=aseq
+        sseq=aseq
-    # Reconvert quality from array of probabilities to array of raw quality values
+        # Reconvert quality from array of probabilities to array of raw quality values
-    i=0
+        i=0
-    for p in quality:
+        for p in quality:
-        quality[i] = min(42, round(-10*log10(p)))
+            quality[i] = min(42, round(-10*log10(p)))
-        i+=1
+            i+=1
-    seq.set(ali[0].wrapped.id+b"_CONS", sseq, quality=quality, tags=ali[0].wrapped)
+        seq.set(ali[0].wrapped.id+b"_CONS", sseq, quality=quality, tags=ali[0].wrapped)
        if hasattr(ali, "counter"):
            seq[b'alignement_id']=ali.counter
        seq[b'seq_a_single']=ic.seqASingle
        seq[b'seq_b_single']=ic.seqBSingle
        seq[b'seq_ab_match']=ic.seqABMatch
        seq[b'seq_a_mismatch']=ic.seqAMismatch
        seq[b'seq_b_mismatch']=ic.seqBMismatch
        seq[b'seq_a_insertion']=ic.seqAInsertion
        seq[b'seq_b_insertion']=ic.seqBInsertion-ic.seqBSingle
        seq[b'seq_a_deletion']=ic.seqADeletion
        seq[b'seq_b_deletion']=ic.seqBDeletion
        seq[b'ali_length']=len(seq)-ic.seqASingle-ic.seqBSingle
        if seq[b'ali_length']>0:
            seq[b'score_norm']=float(ali.score)/seq[b'ali_length']
    if hasattr(ali, "direction"):
-        seq[b'direction']=ali.direction
+        seq[b'ali_direction']=ali.direction
    if hasattr(ali, "counter"):
        seq[b'alignement_id']=ali.counter
    seq[b'seq_a_single']=ic.seqASingle
    seq[b'seq_b_single']=ic.seqBSingle
    seq[b'seq_ab_match']=ic.seqABMatch
    seq[b'seq_a_mismatch']=ic.seqAMismatch
    seq[b'seq_b_mismatch']=ic.seqBMismatch
    seq[b'seq_a_insertion']=ic.seqAInsertion
    seq[b'seq_b_insertion']=ic.seqBInsertion-ic.seqBSingle
    seq[b'seq_a_deletion']=ic.seqADeletion
    seq[b'seq_b_deletion']=ic.seqBDeletion
    seq[b'score']=ali.score
    seq[b'ali_length']=len(seq)-ic.seqASingle-ic.seqBSingle
    if seq[b'ali_length']>0:
        seq[b'score_norm']=float(ali.score)/seq[b'ali_length']
    seq[b'mode']=b'alignment'
    return seq
-def buildJoinedSequence(ali, reverse, seq):
+def buildJoinedSequence(ali, reverse, seq, forward=None):
-    forward = ali[0].wrapped
+    if forward is not None:
        forward = forward
    else:
        forward = ali[0].wrapped
    s = forward.seq + reverse.seq     
    quality = forward.quality
    quality.extend(reverse.quality)
    seq.set(forward.id +b"_PairedEnd", s, definition=forward.definition, quality=quality, tags=forward)
    seq[b"score"]=ali.score
-    seq[b"ali_dir"]=ali.direction
+    seq[b"ali_direction"]=ali.direction
    seq[b"mode"]=b"joined"
    seq[b"pairedend_limit"]=len(forward)    
    return seq
--- a/python/obitools3/libalign/shifted_ali.pxd
+++ b/python/obitools3/libalign/shifted_ali.pxd
@ -0,0 +1,30 @@
 #cython: language_level=3
 from obitools3.dms.capi.kmer_similarity cimport kmer_similarity, Obi_ali_p
 from libc.stdint  cimport int32_t, uint8_t
 from obitools3.dms.capi.obiview cimport Obiview_p
 from obitools3.dms.capi.obidmscolumn cimport OBIDMS_column_p
 cdef class Ali_shifted:
    cdef Obi_ali_p _pointer
    cdef inline Obi_ali_p pointer(self)
    cpdef free(self)
    @staticmethod
    cdef Ali_shifted new_ali(Obi_ali_p ali_p)
 cdef class Kmer_similarity:
    cdef uint8_t         kmer_size
    cdef int32_t*        kmer_pos_array_p
    cdef int32_t         kmer_pos_array_height_a[1]
    cdef int32_t*        shift_array_p
    cdef int32_t         shift_array_height_a[1]
    cdef int32_t*        shift_count_array_p
    cdef int32_t         shift_count_array_height_a[1]
    cdef Obiview_p       view1_p
    cdef OBIDMS_column_p column1_p
    cdef Obiview_p       view2_p
    cdef OBIDMS_column_p column2_p
    cdef bint            build_consensus
    cpdef free(self)
--- a/python/obitools3/libalign/shifted_ali.pyx
+++ b/python/obitools3/libalign/shifted_ali.pyx
@ -0,0 +1,119 @@
 #cython: language_level=3
 from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN
 from obitools3.dms.obiseq cimport Nuc_Seq_Stored
 from obitools3.dms.capi.kmer_similarity cimport kmer_similarity, Obi_ali_p, obi_free_shifted_ali
 from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
 from obitools3.dms.column.column cimport Column
 from libc.stdlib  cimport free
 cdef class Ali_shifted:
    cdef inline Obi_ali_p pointer(self) :
        return <Obi_ali_p>(self._pointer)        
    @staticmethod
    cdef Ali_shifted new_ali(Obi_ali_p ali_p) :   # not __init__ method because need to pass pointer
        ali = Ali_shifted()
        ali._pointer = ali_p
        return ali
    @property
    def score(self):
        return self.pointer().score
    @property
    def consensus_len(self):
        return self.pointer().consensus_length
    @property
    def overlap_len(self):
        return self.pointer().overlap_length
    @property
    def consensus_seq(self):
        return self.pointer().consensus_seq
    @property
    def shift(self):
        return self.pointer().shift
    @property
    def consensus_qual(self):   # must return list because uint8_t* are forced into bytes by cython which won't keep beyond the first 0 value
        qual_list = []
        for i in range(self.consensus_len):
            qual_list.append((self.pointer().consensus_qual)[i])
        return qual_list
    @property
    def direction(self):
        return self.pointer().direction
    cpdef free(self):
        obi_free_shifted_ali(self.pointer())
 cdef class Kmer_similarity:
    def __init__(self, View_NUC_SEQS view1, Column column1=None, View_NUC_SEQS view2=None, Column column2=None, uint8_t kmer_size=3, build_consensus=True) :
        cdef Column default_seq_col
        if kmer_size < 1 or kmer_size > 3:
            raise Exception("Kmer size to compute kmer similarity must be >=1 or <=4")
        self.kmer_pos_array_p = NULL
        self.shift_array_p = NULL
        self.shift_count_array_p = NULL
        self.kmer_pos_array_height_a[0] = 0
        self.shift_array_height_a[0] = 0 
        self.shift_count_array_height_a[0] = 0
        self.kmer_size = kmer_size
        self.build_consensus = build_consensus
        self.view1_p = view1.pointer()
        if column1 is not None:
            self.column1_p = column1.pointer()
        else:
            if type(view1) != View_NUC_SEQS:
                raise Exception("Kmer similarity with no sequence column given must be given a NUC_SEQS view")
            default_seq_col = view1[NUC_SEQUENCE_COLUMN]
            self.column1_p = default_seq_col.pointer()
        if view2 is not None:
            self.view2_p = view2.pointer()
        else:
            view2 = view1
            self.view2_p = self.view1_p
        if column2 is not None:
            self.column2_p = column2.pointer()
        else:
            if type(view2) != View_NUC_SEQS:
                raise Exception("Kmer similarity with no sequence column given must be given a NUC_SEQS view")
            default_seq_col = view2[NUC_SEQUENCE_COLUMN]
            self.column2_p = default_seq_col.pointer()
    def __call__(self, Nuc_Seq_Stored seq1, Nuc_Seq_Stored seq2):
        cdef Obi_ali_p ali_p
        cdef Ali_shifted ali
        ali_p = kmer_similarity(self.view1_p, self.column1_p, seq1.index, 0, \
                               self.view2_p, self.column2_p, seq2.index, 0, \
                               self.kmer_size, \
                               self.kmer_pos_array_p, self.kmer_pos_array_height_a, \
                               self.shift_array_p, self.shift_array_height_a, \
                               self.shift_count_array_p, self.shift_count_array_height_a,
                               self.build_consensus)
        ali = Ali_shifted.new_ali(ali_p)
        return ali
    cpdef free(self):
        if self.kmer_pos_array_p is not NULL:
            free(self.kmer_pos_array_p)
            self.kmer_pos_array_height_a[0] = 0
        if (self.shift_array_p is not NULL) :
            free(self.shift_array_p)
            self.shift_array_height_a[0] = 0
        if (self.shift_count_array_p is not NULL) :
            free(self.shift_count_array_p)
            self.shift_count_array_height_a[0] = 0
--- a/python/obitools3/utils.cfiles
+++ b/python/obitools3/utils.cfiles
@ -6,6 +6,7 @@
 ../../src/dna_seq_indexer.c
 ../../src/encode.c
 ../../src/hashtable.c
 ../../src/kmer_similarity.c
 ../../src/linked_list.c
 ../../src/murmurhash2.c
 ../../src/obi_lcs.c
--- a/src/kmer_similarity.c
+++ b/src/kmer_similarity.c
@ -0,0 +1,483 @@
 /****************************************************************************
 * Kmer similarity computation functions                                      *
 ****************************************************************************/
 /**
 * @file kmer_similarity.c
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 * @date January 7th 2019
 * @brief Kmer similarity computation functions.
 */
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
 #include <inttypes.h>
 #include "utils.h"
 #include "obidebug.h"
 #include "obierrno.h"
 #include "obitypes.h"
 #include "kmer_similarity.h"
 #include "obidmscolumn.h"
 #include "obiview.h"
 #include "encode.h"
 #include "dna_seq_indexer.h"
 #define DEBUG_LEVEL 0	// TODO has to be defined somewhere else (cython compil flag?)
 /**************************************************************************
 *
 * D E C L A R A T I O N   O F   T H E   P R I V A T E   F U N C T I O N S
 *
 **************************************************************************/
 /************************************************************************
 *
 * D E F I N I T I O N   O F   T H E   P R I V A T E   F U N C T I O N S
 *
 ************************************************************************/
 /**********************************************************************
 *
 * D E F I N I T I O N   O F   T H E   P U B L I C   F U N C T I O N S
 *
 **********************************************************************/
 void obi_free_shifted_ali(Obi_ali_p ali)
 {
 	free(ali->consensus_seq);
 	free(ali->consensus_qual);
 	free(ali);
 }
 Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1, index_t elt_idx1,
 							Obiview_p view2, OBIDMS_column_p column2, index_t idx2, index_t elt_idx2,
 							uint8_t kmer_size,
 							int32_t* kmer_pos_array, int32_t* kmer_pos_array_height_p,
 							int32_t* shift_array, int32_t* shift_array_height_p,
 							int32_t* shift_count_array, int32_t* shift_count_array_length_p,
 							bool build_consensus)
 {
 	Obi_ali_p       ali = NULL;
 	int i, j;
 	OBIDMS_column_p qual_col1;
 	OBIDMS_column_p qual_col2;
 	double 			score = 0.0;
 	Obi_blob_p      blob1 = NULL;
 	Obi_blob_p   	blob2 = NULL;
 	Obi_blob_p   	temp_blob = NULL;
 	char*			seq1 = NULL;
 	char*			seq2 = NULL;
 	int32_t         len1;
 	int32_t         len2;
 	int32_t			temp_len;
 	int32_t         pos;
 	int32_t         shift1;
 	int32_t         shift2;
 	const uint8_t*  qual1;
 	const uint8_t*  qual2;
 	int             qual1_len;
 	int				qual2_len;
 	char*           consensus_seq;
 	uint8_t*        consensus_qual;
 	int32_t  		empty_part;
 	int32_t	        nuc_idx;
 	int32_t    		kmer_idx;
 	int32_t	        kmer_count;
 	int32_t         best_shift_idx;
 	int32_t         best_shift;
 	int32_t         max_common_kmers;
 	int32_t         overlap_len;
 	int32_t         consensus_len;
 	int32_t  		cons_shift;
 	uint8_t   		kmer;
 	byte_t          nuc;
 	uint8_t         encoding;
 	int             total_len;
 	int             shift;
 	int32_t  		shift_idx;
 	int32_t         kmer_pos_array_height    = *kmer_pos_array_height_p;
 	int32_t         shift_array_height       = *shift_array_height_p;
 	int32_t         shift_count_array_length = *shift_count_array_length_p;
 	bool			free_blob1 = false;
 	// Check kmer size
 	if ((kmer_size < 1) || (kmer_size > 4))
 	{
 		obi_set_errno(OBI_ALIGN_ERROR);
 		obidebug(1, "\nError when computing the kmer similarity between two sequences: the kmer size must be >= 1 or <= 4");
 		return NULL;
 	}
 	// Get sequence blobs
 	blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(view1, column1, idx1, elt_idx1);
 	if (blob1 == NULL)
 	{
 	 	obidebug(1, "\nError getting the blob of the 1st sequence when computing the kmer similarity between two sequences");
 	 	return NULL;
 	}
 	blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(view2, column2, idx2, elt_idx2);
 	if (blob2 == NULL)
 	{
 	 	obidebug(1, "\nError getting the blob of the 2nd sequence when computing the kmer similarity between two sequences");
 	 	return NULL;
 	}
 	// Choose the shortest sequence to save kmer positions in array
 	len1 = blob1->length_decoded_value;
 	len2 = blob2->length_decoded_value;
 	if (len2 > len1)
 	{
 		temp_len = len1;
 		len1 = len2;
 		len2 = temp_len;
 		temp_blob = blob1;
 		blob1 = blob2;
 		blob2 = temp_blob;
 	}
 	// Force encoding on 2 bits by replacing ambiguous nucleotides by 'a's
 	if (blob1->element_size == 4)
 	{
 		seq1 = obi_blob_to_seq(blob1);
 		for (i=0; i<len1; i++)
 		{
 			if (seq1[i] != 'a' && seq1[i] != 't' && seq1[i] != 'g' && seq1[i] != 'c')
 				seq1[i] = 'a';
 		}
 		blob1 = obi_seq_to_blob(seq1);
 		free_blob1 = true;
 	}
 	if (blob2->element_size == 4)
 	{
 		seq2 = obi_blob_to_seq(blob2);
 		for (i=0; i<len2; i++)
 		{
 			if (seq2[i] != 'a' && seq2[i] != 't' && seq2[i] != 'g' && seq2[i] != 'c')
 				seq2[i] = 'a';
 		}
 	}
 	// Reverse complement the other sequence    // TODO blob to seq to blob for now, could be more efficient
 	if (seq2 == NULL)
 		seq2 = obi_blob_to_seq(blob2);
 	seq2 = reverse_complement_sequence(seq2);
 	blob2 = obi_seq_to_blob(seq2);
 	// Save total length for the shift counts array
 	total_len = len1 + len2 + 1;   // +1 for shift 0
 	// Allocate or reallocate memory for the array of shift counts if necessary
 	if (shift_count_array == NULL)
 	{
 		shift_count_array_length = total_len;
 		shift_count_array = (int32_t*) malloc(shift_count_array_length * sizeof(int32_t));
 		if (shift_count_array == NULL)
 		{
 		 	obi_set_errno(OBI_MALLOC_ERROR);
 		 	obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory");
 		 	return NULL;
 		}
 	}
 	else if (total_len >= shift_count_array_length)
 	{
 		shift_count_array_length = total_len;
 		shift_count_array = (int32_t*) realloc(shift_count_array, shift_count_array_length * sizeof(int32_t));
 		if (shift_count_array == NULL)
 		{
 		 	obi_set_errno(OBI_MALLOC_ERROR);
 		 	obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory");
 		 	return NULL;
 		}
 	}
 	// Allocate or reallocate memory for the array of shifts if necessary
 	if (shift_array == NULL)
 	{
 		shift_array_height = total_len;
 		shift_array = (int32_t*) malloc(ARRAY_LENGTH * shift_array_height * sizeof(int32_t));
 		if (shift_array == NULL)
 		{
 		 	obi_set_errno(OBI_MALLOC_ERROR);
 		 	obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory");
 		 	return NULL;
 		}
 	}
 	else if (len1 >= shift_array_height)
 	{
 		shift_array_height = total_len;
 		shift_array = (int32_t*) realloc(shift_array, ARRAY_LENGTH * shift_array_height * sizeof(int32_t));
 		if (shift_array == NULL)
 		{
 		 	obi_set_errno(OBI_MALLOC_ERROR);
 		 	obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory");
 		 	return NULL;
 		}
 	}
 	// Allocate or reallocate memory for the array of positions if necessary
 	if (kmer_pos_array == NULL)
 	{
 		kmer_pos_array_height = len1;
 		kmer_pos_array = (int32_t*) malloc(ARRAY_LENGTH * kmer_pos_array_height * sizeof(int32_t));
 		if (kmer_pos_array == NULL)
 		{
 		 	obi_set_errno(OBI_MALLOC_ERROR);
 		 	obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory");
 		 	return NULL;
 		}
 	}
 	else if (len1 >= kmer_pos_array_height)
 	{
 		kmer_pos_array_height = len1;
 		kmer_pos_array = (int32_t*) realloc(kmer_pos_array, ARRAY_LENGTH * kmer_pos_array_height * sizeof(int32_t));
 		if (kmer_pos_array == NULL)
 		{
 		 	obi_set_errno(OBI_MALLOC_ERROR);
 		 	obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory");
 		 	return NULL;
 		}
 	}
 	// Initialize all positions to -1
 	for (i=0; i<(ARRAY_LENGTH * kmer_pos_array_height); i++)
 		kmer_pos_array[i] = -1;
 	// Initialize all shifts to the maximum value of an int32_t
 	for (i=0; i<(ARRAY_LENGTH * shift_array_height); i++)
 		shift_array[i] = INT32_MAX;
 	//memset(shift_array, 1, ARRAY_LENGTH * shift_array_height * sizeof(int32_t)); // why doesn't this work?
 	// Initialize all shift counts to 0
 	memset(shift_count_array, 0, shift_count_array_length * sizeof(int32_t));
 	*kmer_pos_array_height_p = kmer_pos_array_height;
 	*shift_array_height_p = shift_array_height;
 	*shift_count_array_length_p = shift_count_array_length;
 	// Fill array with positions of kmers in the shortest sequence
 	encoding = blob1->element_size;
 	kmer_count = len1 - kmer_size + 1;
 	for (kmer_idx=0; kmer_idx < kmer_count; kmer_idx++)
 	{
 		kmer = 0;
 		for (nuc_idx=kmer_idx; nuc_idx<(kmer_idx+kmer_size); nuc_idx++)
 		{
 			nuc = get_nucleotide_from_encoded_seq(blob1->value, nuc_idx, encoding);
 			kmer <<= encoding;
 			kmer |= nuc;
 		}
 		i = 0;
 		while (kmer_pos_array[(kmer*kmer_pos_array_height)+i] != -1)
 			i++;
 		kmer_pos_array[(kmer*kmer_pos_array_height)+i] = kmer_idx;
 	}
 	// Compare positions of kmers between both sequences and store shifts
 	kmer_count = blob2->length_decoded_value - kmer_size + 1;
 	for (kmer_idx=0; kmer_idx < kmer_count; kmer_idx++)
 	{
 		kmer = 0;
 		for (nuc_idx=kmer_idx; nuc_idx<(kmer_idx+kmer_size); nuc_idx++)
 		{
 			nuc = get_nucleotide_from_encoded_seq(blob2->value, nuc_idx, encoding);
 			kmer <<= encoding;
 			kmer |= nuc;
 		}
 		// Get the index at which the new shifts should be stored for that kmer
 		j = 0;
 		while ((shift_array[(kmer*shift_array_height)+j] != INT32_MAX) && (j<shift_array_height))
 			j++;
 		// Store the shift between the kmer in the 1st and the 2nd sequence
 		i = 0;
 		while ((kmer_pos_array[(kmer*kmer_pos_array_height)+i] != -1) && (i<kmer_pos_array_height))
 		{
 			shift_array[(kmer*shift_array_height)+j] = kmer_idx - kmer_pos_array[(kmer*kmer_pos_array_height)+i];
 			i++;
 			j++;
 		}
 	}
 	// Count how many times each shift is represented
 	for (kmer=0;;kmer++)
 	{
 		for (i=0;;i++)
 		{
 			shift = shift_array[(kmer*shift_array_height)+i];
 			if ((shift == INT32_MAX) || (i==shift_array_height))
 				break;
 			shift_idx = shift + len1;
 			shift_count_array[shift_idx]++;
 		}
 		if (kmer == (ARRAY_LENGTH-1))  // stop condition can't be in for line because kmer can't be >= 255
 			break;
 	}
 	// Find the most represented shift
 	best_shift_idx = 0;
 	max_common_kmers = 0;
 	empty_part = (shift_count_array_length-1)/2 - len1;
 	for (i=empty_part; i < (shift_count_array_length - empty_part); i++)  // skipping empty parts of the array
 	{
 		if (shift_count_array[i] > max_common_kmers)
 		{
 			best_shift_idx = i;
 			max_common_kmers = shift_count_array[i];
 		}
 	}
 	best_shift = best_shift_idx - len1;
 	if (best_shift > 0)
 	    overlap_len = len2 - best_shift;
 	else
 	    overlap_len = len1 + best_shift;
 	score = max_common_kmers + kmer_size - 1;
 	ali = (Obi_ali_p) malloc(sizeof(Obi_ali_t));
 	if (ali == NULL)
 	{
 	 	obi_set_errno(OBI_MALLOC_ERROR);
 	 	obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory for the result structure");
 	 	return NULL;
 	}
 	// Save result in Obi_ali structure
 	ali->score = score;
 	ali->consensus_length = 0;
 	ali->overlap_length = overlap_len;
 	ali->shift = best_shift;
 	ali->consensus_seq = NULL;
 	ali->consensus_qual = NULL;
 	// Build the consensus sequence if asked
 	if (build_consensus)
 	{
 		// Get the quality columns
 		qual_col1 = obi_view_get_column(view1, QUALITY_COLUMN);
 		if (qual_col1 == NULL)
 		{
 			obi_set_errno(OBI_ALIGN_ERROR);
 			obidebug(1, "\nError when computing the kmer similarity between two sequences: can't open quality column to build the consensus sequence");
 			return NULL;
 		}
 		qual_col2 = obi_view_get_column(view2, QUALITY_COLUMN);
 		if (qual_col2 == NULL)
 		{
 			obi_set_errno(OBI_ALIGN_ERROR);
 			obidebug(1, "\nError when computing the kmer similarity between two sequences: can't open quality column to build the consensus sequence");
 			return NULL;
 		}
 		// Get the quality arrays
 		qual1 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view1, qual_col1, idx1, 0, &qual1_len);
 		if (qual1 == NULL)
 		{
 			obidebug(1, "\nError getting the blob of the 1st sequence when computing the kmer similarity between two sequences");
 			return NULL;
 		}
 		qual2 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view2, qual_col2, idx2, 0, &qual2_len);
 		if (qual2 == NULL)
 		{
 			obidebug(1, "\nError getting the blob of the 1st sequence when computing the kmer similarity between two sequences");
 			return NULL;
 		}
 		// Decode the first sequence if not already done
 		if (seq1 == NULL)
 			seq1 = obi_blob_to_seq(blob1);
 		if (best_shift > 0)
 		    consensus_len = len1 + len2 - overlap_len;
 		else
 		    consensus_len = overlap_len;
 		// Allocate memory for consensus sequence
 		consensus_seq = (char*) malloc(consensus_len + 1 * sizeof(char)); // TODO keep malloced too maybe
 		if (consensus_seq == NULL)
 		{
 		 	obi_set_errno(OBI_MALLOC_ERROR);
 		 	obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory for the consensus sequence");
 		 	return NULL;
 		}
 		// Allocate memory for consensus quality
 		consensus_qual = (uint8_t*) malloc(consensus_len * sizeof(uint8_t));
 		if (consensus_qual == NULL)
 		{
 		 	obi_set_errno(OBI_MALLOC_ERROR);
 		 	obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory for the consensus quality");
 		 	return NULL;
 		}
 		ali->consensus_length = consensus_len;
 		ali->consensus_seq = consensus_seq;
 		ali->consensus_qual = consensus_qual;
 		// Compute consensus-relative shift for each sequence and store direction
 		if (best_shift >= 0)
 		{
 			shift1 = 0;
 			shift2 = best_shift;
 			strcpy(ali->direction, "left");
 		}
 		else
 		{
 			shift1 = -(best_shift);
 			shift2 = 0;
 			strcpy(ali->direction, "right");
 		}
 		// If positive shift, copy first part of second sequence
 		if (best_shift > 0)
 		{
 			strncpy(consensus_seq, seq2, best_shift);
 			memcpy(consensus_qual, qual2, best_shift*sizeof(uint8_t));
 			cons_shift = best_shift;
 		}
 		else
 			cons_shift = 0;
 		// Build consensus part
 		for (pos=0; pos<overlap_len; pos++)
 		{
 			if (qual1[pos+shift1] >= qual2[pos+shift2])
 				consensus_seq[pos+cons_shift] = seq1[pos+shift1];
 			else
 				consensus_seq[pos+cons_shift] = seq2[pos+shift2];
 			consensus_qual[pos+cons_shift] = round((qual1[pos+shift1] + qual2[pos+shift2])/2);    // TODO maybe use the (p1*(1-p2/3)) formula (but parenthesis bug???)
 		}
 		// If positive shift, copy last part of first sequence
 		if (best_shift > 0)
 		{
 			strncpy(consensus_seq+cons_shift+overlap_len, seq1+overlap_len, len1-overlap_len);
 			memcpy(consensus_qual+cons_shift+overlap_len, qual1+overlap_len, (len1-overlap_len)*sizeof(uint8_t));
 		}
 		consensus_seq[consensus_len] = '\0';
 	}
 	if (seq1 != NULL)
 		free(seq1);
 	if (free_blob1)
 		free(blob1);
 	free(seq2);
 	free(blob2);
 	return ali;
 }
--- a/src/kmer_similarity.h
+++ b/src/kmer_similarity.h
@ -0,0 +1,120 @@
 /****************************************************************************
 * Header file for Kmer similarity computation functions                    *
 ****************************************************************************/
 /**
 * @file kmer_similarity.h
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 * @date January 7th 2019
 * @brief Header file for Kmer similarity computation functions.
 */
 #ifndef KMER_SIMILARITY_H_
 #define KMER_SIMILARITY_H_
 #include <stdio.h>
 #include "obitypes.h"
 #include "obidmscolumn.h"
 #include "obiview.h"
 #define ARRAY_LENGTH (256)
 /**
 * @brief Alignment structure, with informations about the similarity and to rebuild the alignment.
 */
 typedef struct Obi_ali {
 	double   score;	    		/**< Alignment score.
 	 	 	 	 	 	 	 	 */
 	int      consensus_length; 	/**< Length of the final consensus sequence.
 	 	 	 	 	 	 	 	 */
 	int      overlap_length;	/**< Length of the overlap between the aligned sequences.
 	 	 	 	 	 	 	 	 */
 	char*    consensus_seq; 	/**< Consensus sequence built with the computed overlap.
 	 	 	 	 	 	 	 	 */
 	uint8_t* consensus_qual;	/**< Consensus quality built with the computed overlap.
 	 	 	 	        	 	 */
 	int		 shift;   		    /**< Shift chosen to align the sequences (for shifted alignment).
 	 	 	 	 	 	 	 	 */
 	char     direction[6]; 	    /**< Alignement direction (positive/right or negative/left shift) (for shifted alignment).
 	 	 	 	 	 	 	 	 */   // TODO but sequences switched around depending on size..... discuss
 } Obi_ali_t, *Obi_ali_p;
 /**
 * @brief Frees an Obi_ali_p structure and all its elements.
 *
 * @warning The pointer sent becomes unusable.
 *
 * @param ali The pointer on the Obi_ali_p structure to free.
 *
 * @since January 2019
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
 void obi_free_shifted_ali(Obi_ali_p ali);
 /**
 * Function computing the kmer similarity of two sequences stored in views.
 *
 * The similarity is computing this way: the positions of identical kmers in both sequences are
 * compared, and the most represented shift is chosen. The similarity is then calculated as:
 * kmer_similarity = number_of_common_kmers_with_the_chosen_shift + kmer_size - 1
 *
 * @warning Several pointers and structures passed to or returned by the function have to be freed by the caller:
 * 			  - kmer_pos_array
 * 			  - shift_array
 * 			  - shift_count_array
 * 			  - the Obi_ali_p structure returned
 *
 * @param view1 A pointer on the view containing the first sequence.
 * @param column1 A pointer on the column containing the first sequence.
 * @param idx1 The index of the first sequence in view1.
 * @param elt_idx1 The element index of the first sequence in column1.
 * @param view2 A pointer on the view containing the second sequence.
 * @param column2 A pointer on the column containing the second sequence.
 * @param idx2 The index of the second sequence in view2.
 * @param elt_idx2 The element index of the second sequence in column2.
 * @param kmer_size The kmer length to use. Must be >= 1 <= 4.
 * @param kmer_pos_array The array used to store kmer positions. If NULL, allocated by the function.
 *        If needed, reallocated to a bigger size.
 * @param kmer_pos_array_height_p A pointer on an integer corresponding to the size (number of elements)
 *        allocated for kmer_pos_array. Updated by the function as needed.
 * @param shift_array The array used to store kmer shifts. If NULL, allocated by the function.
 *        If needed, reallocated to a bigger size.
 * @param shift_array_height_p A pointer on an integer corresponding to the size (number of elements)
 *        allocated for shift_array. Updated by the function as needed.
 * @param shift_count_array The array used to store shift counts. If NULL, allocated by the function.
 *        If needed, reallocated to a bigger size.
 * @param shift_count_array_height_p A pointer on an integer corresponding to the size (number of elements)
 *        allocated for shift_count_array. Updated by the function as needed.
 * @param build_consensus A boolean indicating whether the function should build the consensus sequence and quality. // TODO option to build consensus without quality?
 *
 * @returns A pointer on an Obi_ali_p structure containing the results.
 * @retval NULL if an error occurred.
 *
 * @since January 2019
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 */
 Obi_ali_p kmer_similarity(Obiview_p view1,
 						  OBIDMS_column_p column1,
 						  index_t idx1,
 						  index_t elt_idx1,
 						  Obiview_p view2,
 						  OBIDMS_column_p column2,
 						  index_t idx2,
 						  index_t elt_idx2,
 						  uint8_t kmer_size,
 						  int32_t* kmer_pos_array,
 						  int32_t* kmer_pos_array_height_p,
 						  int32_t* shift_array,
 						  int32_t* shift_array_height_p,
 						  int32_t* shift_count_array,
 						  int32_t* shift_count_array_height_p,
 						  bool build_consensus);
 #endif /* KMER_SIMILARITY_H_ */
		`@ -0,0 +1,4 @@`
							`#cython: language_level=3`


							`cdef object buildAlignment(object direct, object reverse)`