Alignpairedend: added alignment using shifting with best kmer similarity

(low level layer in C and Cython API)
2019-01-21 17:30:46 +01:00
parent 59017c0d6b
commit c7f5b8d980
10 changed files with 934 additions and 89 deletions
--- a/python/obitools3/commands/alignpairedend.pxd
+++ b/python/obitools3/commands/alignpairedend.pxd
@ -0,0 +1,4 @@
+#cython: language_level=3
+
+
+cdef object buildAlignment(object direct, object reverse)
--- a/python/obitools3/commands/alignpairedend.pyx
+++ b/python/obitools3/commands/alignpairedend.pyx
@ -2,19 +2,20 @@

 from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
 from obitools3.dms import DMS
-from obitools3.dms.view.view cimport View
 from obitools3.dms.view import RollbackException
 from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
-from obitools3.dms.column.column cimport Column, Column_line
-from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN
-from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t, OBI_QUAL
+from obitools3.dms.column.column cimport Column
+from obitools3.dms.capi.obiview cimport QUALITY_COLUMN
+from obitools3.dms.capi.obitypes cimport OBI_QUAL
 from obitools3.apps.optiongroups import addMinimalInputOption, addMinimalOutputOption
 from obitools3.uri.decode import open_uri
 from obitools3.apps.config import logger
 from obitools3.libalign._qsassemble import QSolexaReverseAssemble
 from obitools3.libalign._qsrassemble import QSolexaRightReverseAssemble
 from obitools3.libalign._solexapairend import buildConsensus, buildJoinedSequence
-from obitools3.dms.obiseq cimport Nuc_Seq_Stored, Nuc_Seq
+from obitools3.dms.obiseq cimport Nuc_Seq
+from obitools3.libalign.shifted_ali cimport Kmer_similarity, Ali_shifted
+

 import sys
 import os
@ -27,6 +28,7 @@ REVERSE_QUALITY_COLUMN_NAME = b"REVERSE_QUALITY"
 __title__="Aligns paired-ended reads"


+
 def addOptions(parser):

    addMinimalInputOption(parser)
@ -41,14 +43,6 @@ def addOptions(parser):
                     type=str,
                     help="URI to the reverse reads if they are in a different view than the forward reads")

-# TODO
-#    group.add_argument('--index-file',
-#                     action="store", dest="alignpairedend:index",
-#                     metavar="<FILENAME>",
-#                     default=None,
-#                     type=str,
-#                     help="URI to the index reads")
-
    group.add_argument('--score-min',
                     action="store", dest="alignpairedend:smin",
                     metavar="#.###",
@ -56,12 +50,22 @@ def addOptions(parser):
                     type=float,
                     help="Minimum score for keeping alignments")

+    group.add_argument('-A', '--true-ali',
+                       action="store_true", dest="alignpairedend:trueali",
+                       default=False,
+                       help="Performs gap free end alignment of sequences instead of using kmers to compute alignments (slower).")
+
+    group.add_argument('-k', '--kmer-size',
+                       action="store", dest="alignpairedend:kmersize",
+                       metavar="#",
+                       default=3,
+                       type=int,
+                       help="K-mer size for kmer comparisons, between 1 and 4 (not when using -A option; default: 3)")

-# TODO declarations, cdef, imports

 la = QSolexaReverseAssemble()
 ra = QSolexaRightReverseAssemble()
-def buildAlignment(object direct, object reverse):
+cdef object buildAlignment(object direct, object reverse):
    
    if len(direct)==0 or len(reverse)==0:
        return None
@ -82,9 +86,9 @@ def buildAlignment(object direct, object reverse):
        ali = rali
    
    return ali
+    

-
-def alignmentIterator(entries): 
+def alignmentIterator(entries, aligner): 
    
    if type(entries) == list:
        two_views = True
@ -94,7 +98,7 @@ def alignmentIterator(entries):
    else:
        two_views = False
        entries_len = len(entries)
-       
+    
    for i in range(entries_len):
        if two_views:
            seqF = forward[i]
@ -104,9 +108,12 @@ def alignmentIterator(entries):
            seqR = Nuc_Seq(seqF.id, seqF[REVERSE_SEQ_COLUMN_NAME], quality=seqF[REVERSE_QUALITY_COLUMN_NAME])
            seqF.pop(REVERSE_SEQ_COLUMN_NAME)
            seqF.pop(REVERSE_QUALITY_COLUMN_NAME)
-        ali = buildAlignment(seqF, seqR)
+        
+        ali = aligner(seqF, seqR)
+        
        if ali is None:
            continue
+        
        yield ali


@ -160,11 +167,6 @@ def run(config):
    else:
        entries_len = len(entries)
    
-#     if "index" in config["alignpairedend"]:    # TODO
-#         index = open_uri(config["alignpairedend"]["index"])
-#     else:
-#         index = None
-    
    # Open the output
    output = open_uri(config['obi']['outputURI'],
                      input=False,
@ -174,22 +176,30 @@ def run(config):

    view = output[1]
    
-    Column.new_column(view, b"QUALITY", OBI_QUAL)   #TODO output URI quality option
+    Column.new_column(view, QUALITY_COLUMN, OBI_QUAL)   #TODO output URI quality option?

    if 'smin' in config['alignpairedend']:
-        config['alignpairedend']['sminL'] = config['alignpairedend']['smin']
-        config['alignpairedend']['sminR'] = config['alignpairedend']['smin']
-        sminL = config['alignpairedend']['sminL']
-        sminR = config['alignpairedend']['sminR']
+        smin = config['alignpairedend']['smin']
    else:
-        sminL = 0
-        sminR = 0
+        smin = 0

    # Initialize the progress bar
    pb = ProgressBar(entries_len, config, seconde=5)
    
-    ba = alignmentIterator(entries)
-            
+    if config['alignpairedend']['trueali']:
+        kmer_ali = False
+        aligner = buildAlignment
+    else :
+        kmer_ali = True
+        if type(entries) == list:
+            forward = entries[0]
+            reverse = entries[1]
+            aligner = Kmer_similarity(forward, view2=reverse, kmer_size=config['alignpairedend']['kmersize'])
+        else:
+            aligner = Kmer_similarity(entries, kmer_size=config['alignpairedend']['kmersize'])
+        
+    ba = alignmentIterator(entries, aligner)
+
    i = 0
    for ali in ba:
        
@ -197,31 +207,33 @@ def run(config):
        
        consensus = view[i]
        
-        if sminL > 0:
-            if ((ali.direction=='left' and ali.score > sminL)
-                or (ali.score > sminR)):
-                buildConsensus(ali, consensus)
+        if not two_views:
+            seqF = entries[i]
+        else:
+            seqF = forward[i]
+                
+        if smin > 0:
+            if (ali.score > smin) :
+                buildConsensus(ali, consensus, seqF)
            else:
-                seqF = ali[0].wrapped
                if not two_views:
-                    seq = entries[i]
-                    seqR = Nuc_Seq(seq.id, seq[REVERSE_SEQ_COLUMN_NAME], quality = seq[REVERSE_QUALITY_COLUMN_NAME])
+                    seqR = Nuc_Seq(seqF.id, seqF[REVERSE_SEQ_COLUMN_NAME], quality = seqF[REVERSE_QUALITY_COLUMN_NAME])
                else:
                    seqR = reverse[i]
-                buildJoinedSequence(ali, seqR, consensus)
+                buildJoinedSequence(ali, seqR, consensus, forward=seqF)
                
-            consensus[b"sminL"] = sminL
-            consensus[b"sminR"] = sminR
+            consensus[b"smin"] = smin
        else:
-            buildConsensus(ali, consensus)
-
-# TODO            
-#        if "index" in config['alignpairedend'] and config['alignpairedend']['index'] is not None:
-#            idx = str(config['alignpairedend']['index'].next())
-#            consensus[b"illumina_index"] = idx
+            buildConsensus(ali, consensus, seqF)
+                
+        if kmer_ali :
+            ali.free()
    
        i+=1

+    if kmer_ali :
+        aligner.free()
+
    # Save command config in View and DMS comments
    command_line = " ".join(sys.argv[1:])
    view.write_config(config, "alignpairedend", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
--- a/python/obitools3/dms/capi/kmer_similarity.pxd
+++ b/python/obitools3/dms/capi/kmer_similarity.pxd
@ -0,0 +1,42 @@
+#cython: language_level=3
+
+from obitools3.dms.capi.obiview cimport Obiview_p
+from obitools3.dms.capi.obidmscolumn cimport OBIDMS_column_p
+from obitools3.dms.capi.obitypes cimport index_t
+
+from libc.stdint  cimport int32_t, uint8_t
+
+
+cdef extern from "kmer_similarity.h" nogil:
+
+    struct Obi_ali_t :
+        double   score
+        int      consensus_length
+        int      overlap_length
+        char*    consensus_seq
+        uint8_t* consensus_qual
+        int      shift
+        char*    direction
+
+    ctypedef Obi_ali_t* Obi_ali_p
+
+    
+    void obi_free_shifted_ali(Obi_ali_p ali)
+
+
+    Obi_ali_p kmer_similarity(Obiview_p view1,
+                              OBIDMS_column_p column1,
+                              index_t idx1,
+                              index_t elt_idx1,
+                              Obiview_p view2,
+                              OBIDMS_column_p column2,
+                              index_t idx2,
+                              index_t elt_idx2,
+                              uint8_t kmer_size,
+                              int32_t* kmer_pos_array,
+                              int32_t* kmer_pos_array_height_p,
+                              int32_t* shift_array,
+                              int32_t* shift_array_height_p,
+                              int32_t* shift_count_array, 
+                              int32_t* shift_count_array_height_p,
+                              bint build_consensus)
--- a/python/obitools3/dms/dms.cfiles
+++ b/python/obitools3/dms/dms.cfiles
@ -6,6 +6,7 @@
 ../../../src/dna_seq_indexer.c
 ../../../src/encode.c
 ../../../src/hashtable.c
+../../../src/kmer_similarity.c
 ../../../src/linked_list.c
 ../../../src/murmurhash2.c
 ../../../src/obi_lcs.c
--- a/python/obitools3/libalign/_solexapairend.pyx
+++ b/python/obitools3/libalign/_solexapairend.pyx
@ -3,6 +3,18 @@
 from cpython cimport array

 from .solexapairend import iterOnAligment
+from .shifted_ali cimport Ali_shifted
+
+from obitools3.dms.capi.obiview cimport Obiview_p, QUALITY_COLUMN, \
+                                        obi_set_qual_int_with_elt_idx_and_col_p_in_view, \
+                                        obi_set_str_with_elt_idx_and_col_p_in_view
+                                        
+from obitools3.dms.capi.obidmscolumn cimport OBIDMS_column_p
+
+from obitools3.dms.view.view cimport View
+from obitools3.dms.column.column cimport Column
+
+
 from math import log10


@ -146,7 +158,7 @@ cdef class IterOnConsensus:
    seqBDeletion = property(get_seqBDeletion, None, None, "deletereverse's docstring")


-def buildConsensus(ali, seq):
+def buildConsensus(ali, seq, ref_tags=None):
    cdef list quality
    cdef char   aseq[1000]
    cdef int i=0
@ -156,61 +168,82 @@ def buildConsensus(ali, seq):
    cdef double score
    cdef bytes sseq
    cdef double p
+    cdef OBIDMS_column_p col_p
+    cdef Obiview_p view_p
+    cdef View view
+    cdef Column column
    
    quality = []
+        
+    if type(ali) == Ali_shifted:
+        view = seq.view
+        view_p = view.pointer()
+        column = view[QUALITY_COLUMN]
+        col_p = column.pointer()
+        # doesn't work because uint8_t* are forced into bytes by cython (nothing read/kept beyond 0 values)
+        #obi_set_qual_int_with_elt_idx_and_col_p_in_view(view_p, col_p, seq.index, 0, ali.consensus_qual, ali.consensus_len)
+        seq.set(ref_tags.id+b"_CONS", ali.consensus_seq, quality=ali.consensus_qual, tags=ref_tags)
+        seq[b'ali_length'] = ali.consensus_len
+        seq[b'score_norm']=float(ali.score)/ali.consensus_len
+        seq[b'shift']=ali.shift
+    else:
+        if len(ali[0])>999:   # TODO why?
+            raise AssertionError,"Too long alignemnt"
    
-    if len(ali[0])>999:   # TODO why?
-        raise AssertionError,"Too long alignemnt"
-
-    ic=IterOnConsensus(ali)
-
-    for nuc,score in ic:
-        cnuc=nuc
-        quality.append(score)
-        aseq[i]=cnuc[0]
-        i+=1
-
-    aseq[i]=0
-
-    sseq=aseq
-
-    # Reconvert quality from array of probabilities to array of raw quality values
-    i=0
-    for p in quality:
-        quality[i] = min(42, round(-10*log10(p)))
-        i+=1
+        ic=IterOnConsensus(ali)
    
-    seq.set(ali[0].wrapped.id+b"_CONS", sseq, quality=quality, tags=ali[0].wrapped)
+        for nuc,score in ic:
+            cnuc=nuc
+            quality.append(score)
+            aseq[i]=cnuc[0]
+            i+=1
    
+        aseq[i]=0
+    
+        sseq=aseq
+    
+        # Reconvert quality from array of probabilities to array of raw quality values
+        i=0
+        for p in quality:
+            quality[i] = min(42, round(-10*log10(p)))
+            i+=1
+        
+        seq.set(ali[0].wrapped.id+b"_CONS", sseq, quality=quality, tags=ali[0].wrapped)
+        
+        if hasattr(ali, "counter"):
+            seq[b'alignement_id']=ali.counter
+        seq[b'seq_a_single']=ic.seqASingle
+        seq[b'seq_b_single']=ic.seqBSingle
+        seq[b'seq_ab_match']=ic.seqABMatch
+        seq[b'seq_a_mismatch']=ic.seqAMismatch
+        seq[b'seq_b_mismatch']=ic.seqBMismatch
+        seq[b'seq_a_insertion']=ic.seqAInsertion
+        seq[b'seq_b_insertion']=ic.seqBInsertion-ic.seqBSingle
+        seq[b'seq_a_deletion']=ic.seqADeletion
+        seq[b'seq_b_deletion']=ic.seqBDeletion
+        seq[b'ali_length']=len(seq)-ic.seqASingle-ic.seqBSingle
+        if seq[b'ali_length']>0:
+            seq[b'score_norm']=float(ali.score)/seq[b'ali_length']
+
    if hasattr(ali, "direction"):
-        seq[b'direction']=ali.direction
-    if hasattr(ali, "counter"):
-        seq[b'alignement_id']=ali.counter
-    seq[b'seq_a_single']=ic.seqASingle
-    seq[b'seq_b_single']=ic.seqBSingle
-    seq[b'seq_ab_match']=ic.seqABMatch
-    seq[b'seq_a_mismatch']=ic.seqAMismatch
-    seq[b'seq_b_mismatch']=ic.seqBMismatch
-    seq[b'seq_a_insertion']=ic.seqAInsertion
-    seq[b'seq_b_insertion']=ic.seqBInsertion-ic.seqBSingle
-    seq[b'seq_a_deletion']=ic.seqADeletion
-    seq[b'seq_b_deletion']=ic.seqBDeletion
+        seq[b'ali_direction']=ali.direction
    seq[b'score']=ali.score
-    seq[b'ali_length']=len(seq)-ic.seqASingle-ic.seqBSingle
-    if seq[b'ali_length']>0:
-        seq[b'score_norm']=float(ali.score)/seq[b'ali_length']
    seq[b'mode']=b'alignment'
+    
    return seq


-def buildJoinedSequence(ali, reverse, seq):
-    forward = ali[0].wrapped
+def buildJoinedSequence(ali, reverse, seq, forward=None):
+    if forward is not None:
+        forward = forward
+    else:
+        forward = ali[0].wrapped
    s = forward.seq + reverse.seq     
    quality = forward.quality
    quality.extend(reverse.quality)
    seq.set(forward.id +b"_PairedEnd", s, definition=forward.definition, quality=quality, tags=forward)
    seq[b"score"]=ali.score
-    seq[b"ali_dir"]=ali.direction
+    seq[b"ali_direction"]=ali.direction
    seq[b"mode"]=b"joined"
    seq[b"pairedend_limit"]=len(forward)    
    return seq
--- a/python/obitools3/libalign/shifted_ali.pxd
+++ b/python/obitools3/libalign/shifted_ali.pxd
@ -0,0 +1,30 @@
+#cython: language_level=3
+
+from obitools3.dms.capi.kmer_similarity cimport kmer_similarity, Obi_ali_p
+from libc.stdint  cimport int32_t, uint8_t
+from obitools3.dms.capi.obiview cimport Obiview_p
+from obitools3.dms.capi.obidmscolumn cimport OBIDMS_column_p
+
+
+cdef class Ali_shifted:
+    cdef Obi_ali_p _pointer
+    cdef inline Obi_ali_p pointer(self)
+    cpdef free(self)
+    @staticmethod
+    cdef Ali_shifted new_ali(Obi_ali_p ali_p)
+
+
+cdef class Kmer_similarity:
+    cdef uint8_t         kmer_size
+    cdef int32_t*        kmer_pos_array_p
+    cdef int32_t         kmer_pos_array_height_a[1]
+    cdef int32_t*        shift_array_p
+    cdef int32_t         shift_array_height_a[1]
+    cdef int32_t*        shift_count_array_p
+    cdef int32_t         shift_count_array_height_a[1]
+    cdef Obiview_p       view1_p
+    cdef OBIDMS_column_p column1_p
+    cdef Obiview_p       view2_p
+    cdef OBIDMS_column_p column2_p
+    cdef bint            build_consensus
+    cpdef free(self)
--- a/python/obitools3/libalign/shifted_ali.pyx
+++ b/python/obitools3/libalign/shifted_ali.pyx
@ -0,0 +1,119 @@
+#cython: language_level=3
+
+from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN
+
+from obitools3.dms.obiseq cimport Nuc_Seq_Stored
+from obitools3.dms.capi.kmer_similarity cimport kmer_similarity, Obi_ali_p, obi_free_shifted_ali
+from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
+from obitools3.dms.column.column cimport Column
+
+from libc.stdlib  cimport free
+
+
+cdef class Ali_shifted:
+            
+    cdef inline Obi_ali_p pointer(self) :
+        return <Obi_ali_p>(self._pointer)        
+    
+    @staticmethod
+    cdef Ali_shifted new_ali(Obi_ali_p ali_p) :   # not __init__ method because need to pass pointer
+        ali = Ali_shifted()
+        ali._pointer = ali_p
+        return ali
+
+    @property
+    def score(self):
+        return self.pointer().score
+
+    @property
+    def consensus_len(self):
+        return self.pointer().consensus_length
+
+    @property
+    def overlap_len(self):
+        return self.pointer().overlap_length
+    
+    @property
+    def consensus_seq(self):
+        return self.pointer().consensus_seq
+
+    @property
+    def shift(self):
+        return self.pointer().shift
+    
+    @property
+    def consensus_qual(self):   # must return list because uint8_t* are forced into bytes by cython which won't keep beyond the first 0 value
+        qual_list = []
+        for i in range(self.consensus_len):
+            qual_list.append((self.pointer().consensus_qual)[i])
+        return qual_list
+
+    @property
+    def direction(self):
+        return self.pointer().direction
+
+    cpdef free(self):
+        obi_free_shifted_ali(self.pointer())
+
+
+cdef class Kmer_similarity:
+    def __init__(self, View_NUC_SEQS view1, Column column1=None, View_NUC_SEQS view2=None, Column column2=None, uint8_t kmer_size=3, build_consensus=True) :
+        cdef Column default_seq_col
+        if kmer_size < 1 or kmer_size > 3:
+            raise Exception("Kmer size to compute kmer similarity must be >=1 or <=4")
+        self.kmer_pos_array_p = NULL
+        self.shift_array_p = NULL
+        self.shift_count_array_p = NULL
+        self.kmer_pos_array_height_a[0] = 0
+        self.shift_array_height_a[0] = 0 
+        self.shift_count_array_height_a[0] = 0
+        self.kmer_size = kmer_size
+        self.build_consensus = build_consensus
+        self.view1_p = view1.pointer()
+        if column1 is not None:
+            self.column1_p = column1.pointer()
+        else:
+            if type(view1) != View_NUC_SEQS:
+                raise Exception("Kmer similarity with no sequence column given must be given a NUC_SEQS view")
+            default_seq_col = view1[NUC_SEQUENCE_COLUMN]
+            self.column1_p = default_seq_col.pointer()
+        if view2 is not None:
+            self.view2_p = view2.pointer()
+        else:
+            view2 = view1
+            self.view2_p = self.view1_p
+        if column2 is not None:
+            self.column2_p = column2.pointer()
+        else:
+            if type(view2) != View_NUC_SEQS:
+                raise Exception("Kmer similarity with no sequence column given must be given a NUC_SEQS view")
+            default_seq_col = view2[NUC_SEQUENCE_COLUMN]
+            self.column2_p = default_seq_col.pointer()
+    
+    
+    def __call__(self, Nuc_Seq_Stored seq1, Nuc_Seq_Stored seq2):
+        cdef Obi_ali_p ali_p
+        cdef Ali_shifted ali
+        ali_p = kmer_similarity(self.view1_p, self.column1_p, seq1.index, 0, \
+                               self.view2_p, self.column2_p, seq2.index, 0, \
+                               self.kmer_size, \
+                               self.kmer_pos_array_p, self.kmer_pos_array_height_a, \
+                               self.shift_array_p, self.shift_array_height_a, \
+                               self.shift_count_array_p, self.shift_count_array_height_a,
+                               self.build_consensus)
+        
+        ali = Ali_shifted.new_ali(ali_p)
+        return ali
+    
+    cpdef free(self):
+        if self.kmer_pos_array_p is not NULL:
+            free(self.kmer_pos_array_p)
+            self.kmer_pos_array_height_a[0] = 0
+        if (self.shift_array_p is not NULL) :
+            free(self.shift_array_p)
+            self.shift_array_height_a[0] = 0
+        if (self.shift_count_array_p is not NULL) :
+            free(self.shift_count_array_p)
+            self.shift_count_array_height_a[0] = 0
+        
+
--- a/python/obitools3/utils.cfiles
+++ b/python/obitools3/utils.cfiles
@ -6,6 +6,7 @@
 ../../src/dna_seq_indexer.c
 ../../src/encode.c
 ../../src/hashtable.c
+../../src/kmer_similarity.c
 ../../src/linked_list.c
 ../../src/murmurhash2.c
 ../../src/obi_lcs.c