Fixed the new alignpairedend to work after ngsfilter with the 9879847 possible cases
2019-02-17 18:32:35 +01:00
parent 4ddd1a1c37
commit e026e9ec83
9 changed files with 151 additions and 83 deletions
--- a/python/obitools3/commands/alignpairedend.pyx
+++ b/python/obitools3/commands/alignpairedend.pyx
@@ -15,16 +15,12 @@ from obitools3.libalign._qsrassemble import QSolexaRightReverseAssemble
 from obitools3.libalign._solexapairend import buildConsensus, buildJoinedSequence
 from obitools3.dms.obiseq cimport Nuc_Seq
 from obitools3.libalign.shifted_ali cimport Kmer_similarity, Ali_shifted
-
+from obitools3.commands.ngsfilter import REVERSE_SEQ_COLUMN_NAME, REVERSE_QUALITY_COLUMN_NAME

 import sys
 import os


-REVERSE_SEQ_COLUMN_NAME = b"REVERSE_SEQUENCE"
-REVERSE_QUALITY_COLUMN_NAME = b"REVERSE_QUALITY"
-
-
 __title__="Aligns paired-ended reads"


@@ -106,8 +102,7 @@ def alignmentIterator(entries, aligner):
        else:
            seqF = Nuc_Seq.new_from_stored(entries[i])
            seqR = Nuc_Seq(seqF.id, seqF[REVERSE_SEQ_COLUMN_NAME], quality=seqF[REVERSE_QUALITY_COLUMN_NAME])
-            seqF.pop(REVERSE_SEQ_COLUMN_NAME)
-            seqF.pop(REVERSE_QUALITY_COLUMN_NAME)
+            seqR.index = i
        
        ali = aligner(seqF, seqR)
        
@@ -196,7 +191,7 @@ def run(config):
            reverse = entries[1]
            aligner = Kmer_similarity(forward, view2=reverse, kmer_size=config['alignpairedend']['kmersize'])
        else:
-            aligner = Kmer_similarity(entries, kmer_size=config['alignpairedend']['kmersize'])
+            aligner = Kmer_similarity(entries, column2=entries[REVERSE_SEQ_COLUMN_NAME], qual_column2=entries[REVERSE_QUALITY_COLUMN_NAME], kmer_size=config['alignpairedend']['kmersize'])
        
    ba = alignmentIterator(entries, aligner)

--- a/python/obitools3/dms/capi/kmer_similarity.pxd
+++ b/python/obitools3/dms/capi/kmer_similarity.pxd
@@ -32,6 +32,8 @@ cdef extern from "kmer_similarity.h" nogil:
                              OBIDMS_column_p column2,
                              index_t idx2,
                              index_t elt_idx2,
+                              OBIDMS_column_p qual_col1,
+                              OBIDMS_column_p qual_col2,
                              uint8_t kmer_size,
                              int32_t* kmer_pos_array,
                              int32_t* kmer_pos_array_height_p,
--- a/python/obitools3/dms/obiseq.pxd
+++ b/python/obitools3/dms/obiseq.pxd
@@ -5,6 +5,7 @@ from .view.view cimport Line

 cdef class Seq(dict) :

+    cdef int _index
    cpdef object clone(self)
    cpdef str get_str(self)
    cpdef get_symbol_at(self, int pos)
@@ -22,6 +23,7 @@ cdef class Nuc_Seq(Seq) :
    

 cdef class Seq_Stored(Line) :
+
    cpdef get_symbol_at(self, int pos)
    cpdef get_slice(self, slice slice_to_get)

@@ -31,6 +33,7 @@ cdef class Nuc_Seq_Stored(Seq_Stored) :
    cdef  Nuc_Seq _reverse_complement
    cdef  object  _quality_array
    cdef  bytes   _seq
+    
    cpdef set(self, object id, object seq, object definition=*, object quality=*, int offset=*, object tags=*)
    cpdef set_quality_int(self, list new_qual)
    cpdef set_quality_char(self, object new_qual, int offset=*)
--- a/python/obitools3/dms/obiseq.pyx
+++ b/python/obitools3/dms/obiseq.pyx
@@ -40,6 +40,7 @@ cdef class Seq(dict) :
        self.id = id
        self.seq = seq
        self.definition = definition
+        self._index = -1
        if tags is not None :
            for k in tags:
                k_b = tobytes(k)
@@ -54,6 +55,7 @@ cdef class Seq(dict) :
    def new_from_stored(Seq_Stored seq_to_clone) :
        cdef Seq  new_seq
        new_seq = Seq(seq_to_clone.id, seq_to_clone.seq, definition=seq_to_clone.definition, quality=seq_to_clone.quality, tags=seq_to_clone)
+        new_seq._index = seq_to_clone._index
        return new_seq

    def __contains__(self, object key):
@@ -128,7 +130,16 @@ cdef class Seq(dict) :
        if new_definition is not None:
            new_definition = tobytes(new_definition)
        self[DEFINITION_COLUMN] = new_definition
-        
+
+    # sequence index (for reference in a view eventually) property getter and setter
+    @property
+    def index(self):  # @ReservedAssignment
+        return self._index
+
+    @index.setter
+    def index(self, int idx):  # @DuplicatedSignature
+        self._index = idx
+
        
 cdef class Nuc_Seq(Seq) :

@@ -160,6 +171,7 @@ cdef class Nuc_Seq(Seq) :
    def new_from_stored(Nuc_Seq_Stored seq_to_clone) :
        cdef Nuc_Seq new_seq
        new_seq = Nuc_Seq(seq_to_clone.id, seq_to_clone.seq, definition=seq_to_clone.definition, quality=seq_to_clone.quality, tags=seq_to_clone)
+        new_seq._index = seq_to_clone.index
        return new_seq

    # is_revcomp property getter and setter (boolean indicating whether the sequence was created by reverse complementing another sequence)
--- a/python/obitools3/libalign/_solexapairend.pyx
+++ b/python/obitools3/libalign/_solexapairend.pyx
@@ -13,7 +13,7 @@ from obitools3.dms.capi.obidmscolumn cimport OBIDMS_column_p

 from obitools3.dms.view.view cimport View
 from obitools3.dms.column.column cimport Column
-
+from obitools3.commands.ngsfilter import REVERSE_SEQ_COLUMN_NAME, REVERSE_QUALITY_COLUMN_NAME

 from math import log10

@@ -182,7 +182,7 @@ def buildConsensus(ali, seq, ref_tags=None):
        col_p = column.pointer()
        # doesn't work because uint8_t* are forced into bytes by cython (nothing read/kept beyond 0 values)
        #obi_set_qual_int_with_elt_idx_and_col_p_in_view(view_p, col_p, seq.index, 0, ali.consensus_qual, ali.consensus_len)
-        seq.set(ref_tags.id+b"_CONS", ali.consensus_seq, quality=ali.consensus_qual, tags=ref_tags)
+        seq.set(ref_tags.id+b"_CONS", ali.consensus_seq, quality=ali.consensus_qual)
        seq[b'ali_length'] = ali.consensus_len
        seq[b'score_norm']=float(ali.score)/ali.consensus_len
        seq[b'shift']=ali.shift
@@ -208,7 +208,7 @@ def buildConsensus(ali, seq, ref_tags=None):
            quality[i] = min(42, round(-10*log10(p)))
            i+=1
        
-        seq.set(ali[0].wrapped.id+b"_CONS", sseq, quality=quality, tags=ali[0].wrapped)
+        seq.set(ali[0].wrapped.id+b"_CONS", sseq, quality=quality)
        
        if hasattr(ali, "counter"):
            seq[b'alignement_id']=ali.counter
@@ -224,12 +224,18 @@ def buildConsensus(ali, seq, ref_tags=None):
        seq[b'ali_length']=len(seq)-ic.seqASingle-ic.seqBSingle
        if seq[b'ali_length']>0:
            seq[b'score_norm']=float(ali.score)/seq[b'ali_length']
+        
+        ref_tags = ali[0].wrapped

    if hasattr(ali, "direction"):
        seq[b'ali_direction']=ali.direction
    seq[b'score']=ali.score
    seq[b'mode']=b'alignment'
-    
+
+    for tag in ref_tags:
+        if tag != REVERSE_SEQ_COLUMN_NAME and tag != REVERSE_QUALITY_COLUMN_NAME:
+            seq[tag] = ref_tags[tag]
+
    return seq


@@ -241,10 +247,13 @@ def buildJoinedSequence(ali, reverse, seq, forward=None):
    s = forward.seq + reverse.seq     
    quality = forward.quality
    quality.extend(reverse.quality)
-    seq.set(forward.id +b"_PairedEnd", s, definition=forward.definition, quality=quality, tags=forward)
+    seq.set(forward.id +b"_PairedEnd", s, definition=forward.definition, quality=quality)
    seq[b"score"]=ali.score
    seq[b"ali_direction"]=ali.direction
    seq[b"mode"]=b"joined"
    seq[b"pairedend_limit"]=len(forward)    
+    for tag in forward:
+        if tag != REVERSE_SEQ_COLUMN_NAME and tag != REVERSE_QUALITY_COLUMN_NAME:
+            seq[tag] = forward[tag]
    return seq

--- a/python/obitools3/libalign/shifted_ali.pxd
+++ b/python/obitools3/libalign/shifted_ali.pxd
@@ -24,7 +24,9 @@ cdef class Kmer_similarity:
    cdef int32_t         shift_count_array_height_a[1]
    cdef Obiview_p       view1_p
    cdef OBIDMS_column_p column1_p
+    cdef OBIDMS_column_p qual_col1_p
    cdef Obiview_p       view2_p
    cdef OBIDMS_column_p column2_p
+    cdef OBIDMS_column_p qual_col2_p
    cdef bint            build_consensus
    cpdef free(self)
--- a/python/obitools3/libalign/shifted_ali.pyx
+++ b/python/obitools3/libalign/shifted_ali.pyx
@@ -1,8 +1,7 @@
 #cython: language_level=3

-from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN
+from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN, QUALITY_COLUMN

-from obitools3.dms.obiseq cimport Nuc_Seq_Stored
 from obitools3.dms.capi.kmer_similarity cimport kmer_similarity, Obi_ali_p, obi_free_shifted_ali
 from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
 from obitools3.dms.column.column cimport Column
@@ -52,13 +51,14 @@ cdef class Ali_shifted:
    def direction(self):
        return self.pointer().direction

-    cpdef free(self):
+    cpdef free(self):  # TODO allocated memory could be kept
        obi_free_shifted_ali(self.pointer())


 cdef class Kmer_similarity:
-    def __init__(self, View_NUC_SEQS view1, Column column1=None, View_NUC_SEQS view2=None, Column column2=None, uint8_t kmer_size=3, build_consensus=True) :
+    def __init__(self, View_NUC_SEQS view1, Column column1=None, Column qual_column1=None, View_NUC_SEQS view2=None, Column column2=None, Column qual_column2=None, uint8_t kmer_size=3, build_consensus=True) :
        cdef Column default_seq_col
+        cdef Column default_qual_col
        if kmer_size < 1 or kmer_size > 3:
            raise Exception("Kmer size to compute kmer similarity must be >=1 or <=4")
        self.kmer_pos_array_p = NULL
@@ -89,13 +89,28 @@ cdef class Kmer_similarity:
                raise Exception("Kmer similarity with no sequence column given must be given a NUC_SEQS view")
            default_seq_col = view2[NUC_SEQUENCE_COLUMN]
            self.column2_p = default_seq_col.pointer()
+        if qual_column1 is not None:
+            self.qual_col1_p = qual_column1.pointer()
+        else:
+            if type(view1) != View_NUC_SEQS:
+                raise Exception("Kmer similarity with no quality column given must be given a NUC_SEQS view")
+            default_qual_col = view1[QUALITY_COLUMN]
+            self.qual_col1_p = default_qual_col.pointer()
+        if qual_column2 is not None:
+            self.qual_col2_p = qual_column2.pointer()
+        else:
+            if type(view2) != View_NUC_SEQS:
+                raise Exception("Kmer similarity with no quality column given must be given a NUC_SEQS view")
+            default_qual_col = view2[QUALITY_COLUMN]
+            self.qual_col2_p = default_qual_col.pointer()
+
    
-    
-    def __call__(self, Nuc_Seq_Stored seq1, Nuc_Seq_Stored seq2):
+    def __call__(self, object seq1, object seq2):
        cdef Obi_ali_p ali_p
        cdef Ali_shifted ali
        ali_p = kmer_similarity(self.view1_p, self.column1_p, seq1.index, 0, \
                               self.view2_p, self.column2_p, seq2.index, 0, \
+                               self.qual_col1_p, self.qual_col2_p, \
                               self.kmer_size, \
                               self.kmer_pos_array_p, self.kmer_pos_array_height_a, \
                               self.shift_array_p, self.shift_array_height_a, \