Fixed the new alignpaired end to work after ngsfilter with the 9879847
possible cases
This commit is contained in:
@ -15,16 +15,12 @@ from obitools3.libalign._qsrassemble import QSolexaRightReverseAssemble
|
|||||||
from obitools3.libalign._solexapairend import buildConsensus, buildJoinedSequence
|
from obitools3.libalign._solexapairend import buildConsensus, buildJoinedSequence
|
||||||
from obitools3.dms.obiseq cimport Nuc_Seq
|
from obitools3.dms.obiseq cimport Nuc_Seq
|
||||||
from obitools3.libalign.shifted_ali cimport Kmer_similarity, Ali_shifted
|
from obitools3.libalign.shifted_ali cimport Kmer_similarity, Ali_shifted
|
||||||
|
from obitools3.commands.ngsfilter import REVERSE_SEQ_COLUMN_NAME, REVERSE_QUALITY_COLUMN_NAME
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
|
||||||
REVERSE_SEQ_COLUMN_NAME = b"REVERSE_SEQUENCE"
|
|
||||||
REVERSE_QUALITY_COLUMN_NAME = b"REVERSE_QUALITY"
|
|
||||||
|
|
||||||
|
|
||||||
__title__="Aligns paired-ended reads"
|
__title__="Aligns paired-ended reads"
|
||||||
|
|
||||||
|
|
||||||
@ -106,8 +102,7 @@ def alignmentIterator(entries, aligner):
|
|||||||
else:
|
else:
|
||||||
seqF = Nuc_Seq.new_from_stored(entries[i])
|
seqF = Nuc_Seq.new_from_stored(entries[i])
|
||||||
seqR = Nuc_Seq(seqF.id, seqF[REVERSE_SEQ_COLUMN_NAME], quality=seqF[REVERSE_QUALITY_COLUMN_NAME])
|
seqR = Nuc_Seq(seqF.id, seqF[REVERSE_SEQ_COLUMN_NAME], quality=seqF[REVERSE_QUALITY_COLUMN_NAME])
|
||||||
seqF.pop(REVERSE_SEQ_COLUMN_NAME)
|
seqR.index = i
|
||||||
seqF.pop(REVERSE_QUALITY_COLUMN_NAME)
|
|
||||||
|
|
||||||
ali = aligner(seqF, seqR)
|
ali = aligner(seqF, seqR)
|
||||||
|
|
||||||
@ -196,7 +191,7 @@ def run(config):
|
|||||||
reverse = entries[1]
|
reverse = entries[1]
|
||||||
aligner = Kmer_similarity(forward, view2=reverse, kmer_size=config['alignpairedend']['kmersize'])
|
aligner = Kmer_similarity(forward, view2=reverse, kmer_size=config['alignpairedend']['kmersize'])
|
||||||
else:
|
else:
|
||||||
aligner = Kmer_similarity(entries, kmer_size=config['alignpairedend']['kmersize'])
|
aligner = Kmer_similarity(entries, column2=entries[REVERSE_SEQ_COLUMN_NAME], qual_column2=entries[REVERSE_QUALITY_COLUMN_NAME], kmer_size=config['alignpairedend']['kmersize'])
|
||||||
|
|
||||||
ba = alignmentIterator(entries, aligner)
|
ba = alignmentIterator(entries, aligner)
|
||||||
|
|
||||||
|
@ -32,6 +32,8 @@ cdef extern from "kmer_similarity.h" nogil:
|
|||||||
OBIDMS_column_p column2,
|
OBIDMS_column_p column2,
|
||||||
index_t idx2,
|
index_t idx2,
|
||||||
index_t elt_idx2,
|
index_t elt_idx2,
|
||||||
|
OBIDMS_column_p qual_col1,
|
||||||
|
OBIDMS_column_p qual_col2,
|
||||||
uint8_t kmer_size,
|
uint8_t kmer_size,
|
||||||
int32_t* kmer_pos_array,
|
int32_t* kmer_pos_array,
|
||||||
int32_t* kmer_pos_array_height_p,
|
int32_t* kmer_pos_array_height_p,
|
||||||
|
@ -5,6 +5,7 @@ from .view.view cimport Line
|
|||||||
|
|
||||||
cdef class Seq(dict) :
|
cdef class Seq(dict) :
|
||||||
|
|
||||||
|
cdef int _index
|
||||||
cpdef object clone(self)
|
cpdef object clone(self)
|
||||||
cpdef str get_str(self)
|
cpdef str get_str(self)
|
||||||
cpdef get_symbol_at(self, int pos)
|
cpdef get_symbol_at(self, int pos)
|
||||||
@ -22,6 +23,7 @@ cdef class Nuc_Seq(Seq) :
|
|||||||
|
|
||||||
|
|
||||||
cdef class Seq_Stored(Line) :
|
cdef class Seq_Stored(Line) :
|
||||||
|
|
||||||
cpdef get_symbol_at(self, int pos)
|
cpdef get_symbol_at(self, int pos)
|
||||||
cpdef get_slice(self, slice slice_to_get)
|
cpdef get_slice(self, slice slice_to_get)
|
||||||
|
|
||||||
@ -31,6 +33,7 @@ cdef class Nuc_Seq_Stored(Seq_Stored) :
|
|||||||
cdef Nuc_Seq _reverse_complement
|
cdef Nuc_Seq _reverse_complement
|
||||||
cdef object _quality_array
|
cdef object _quality_array
|
||||||
cdef bytes _seq
|
cdef bytes _seq
|
||||||
|
|
||||||
cpdef set(self, object id, object seq, object definition=*, object quality=*, int offset=*, object tags=*)
|
cpdef set(self, object id, object seq, object definition=*, object quality=*, int offset=*, object tags=*)
|
||||||
cpdef set_quality_int(self, list new_qual)
|
cpdef set_quality_int(self, list new_qual)
|
||||||
cpdef set_quality_char(self, object new_qual, int offset=*)
|
cpdef set_quality_char(self, object new_qual, int offset=*)
|
||||||
|
@ -40,6 +40,7 @@ cdef class Seq(dict) :
|
|||||||
self.id = id
|
self.id = id
|
||||||
self.seq = seq
|
self.seq = seq
|
||||||
self.definition = definition
|
self.definition = definition
|
||||||
|
self._index = -1
|
||||||
if tags is not None :
|
if tags is not None :
|
||||||
for k in tags:
|
for k in tags:
|
||||||
k_b = tobytes(k)
|
k_b = tobytes(k)
|
||||||
@ -54,6 +55,7 @@ cdef class Seq(dict) :
|
|||||||
def new_from_stored(Seq_Stored seq_to_clone) :
|
def new_from_stored(Seq_Stored seq_to_clone) :
|
||||||
cdef Seq new_seq
|
cdef Seq new_seq
|
||||||
new_seq = Seq(seq_to_clone.id, seq_to_clone.seq, definition=seq_to_clone.definition, quality=seq_to_clone.quality, tags=seq_to_clone)
|
new_seq = Seq(seq_to_clone.id, seq_to_clone.seq, definition=seq_to_clone.definition, quality=seq_to_clone.quality, tags=seq_to_clone)
|
||||||
|
new_seq._index = seq_to_clone._index
|
||||||
return new_seq
|
return new_seq
|
||||||
|
|
||||||
def __contains__(self, object key):
|
def __contains__(self, object key):
|
||||||
@ -128,7 +130,16 @@ cdef class Seq(dict) :
|
|||||||
if new_definition is not None:
|
if new_definition is not None:
|
||||||
new_definition = tobytes(new_definition)
|
new_definition = tobytes(new_definition)
|
||||||
self[DEFINITION_COLUMN] = new_definition
|
self[DEFINITION_COLUMN] = new_definition
|
||||||
|
|
||||||
|
# sequence index (for reference in a view eventually) property getter and setter
|
||||||
|
@property
|
||||||
|
def index(self): # @ReservedAssignment
|
||||||
|
return self._index
|
||||||
|
|
||||||
|
@index.setter
|
||||||
|
def index(self, int idx): # @DuplicatedSignature
|
||||||
|
self._index = idx
|
||||||
|
|
||||||
|
|
||||||
cdef class Nuc_Seq(Seq) :
|
cdef class Nuc_Seq(Seq) :
|
||||||
|
|
||||||
@ -160,6 +171,7 @@ cdef class Nuc_Seq(Seq) :
|
|||||||
def new_from_stored(Nuc_Seq_Stored seq_to_clone) :
|
def new_from_stored(Nuc_Seq_Stored seq_to_clone) :
|
||||||
cdef Nuc_Seq new_seq
|
cdef Nuc_Seq new_seq
|
||||||
new_seq = Nuc_Seq(seq_to_clone.id, seq_to_clone.seq, definition=seq_to_clone.definition, quality=seq_to_clone.quality, tags=seq_to_clone)
|
new_seq = Nuc_Seq(seq_to_clone.id, seq_to_clone.seq, definition=seq_to_clone.definition, quality=seq_to_clone.quality, tags=seq_to_clone)
|
||||||
|
new_seq._index = seq_to_clone.index
|
||||||
return new_seq
|
return new_seq
|
||||||
|
|
||||||
# is_revcomp property getter and setter (boolean indicating whether the sequence was created by reverse complementing another sequence)
|
# is_revcomp property getter and setter (boolean indicating whether the sequence was created by reverse complementing another sequence)
|
||||||
|
@ -13,7 +13,7 @@ from obitools3.dms.capi.obidmscolumn cimport OBIDMS_column_p
|
|||||||
|
|
||||||
from obitools3.dms.view.view cimport View
|
from obitools3.dms.view.view cimport View
|
||||||
from obitools3.dms.column.column cimport Column
|
from obitools3.dms.column.column cimport Column
|
||||||
|
from obitools3.commands.ngsfilter import REVERSE_SEQ_COLUMN_NAME, REVERSE_QUALITY_COLUMN_NAME
|
||||||
|
|
||||||
from math import log10
|
from math import log10
|
||||||
|
|
||||||
@ -182,7 +182,7 @@ def buildConsensus(ali, seq, ref_tags=None):
|
|||||||
col_p = column.pointer()
|
col_p = column.pointer()
|
||||||
# doesn't work because uint8_t* are forced into bytes by cython (nothing read/kept beyond 0 values)
|
# doesn't work because uint8_t* are forced into bytes by cython (nothing read/kept beyond 0 values)
|
||||||
#obi_set_qual_int_with_elt_idx_and_col_p_in_view(view_p, col_p, seq.index, 0, ali.consensus_qual, ali.consensus_len)
|
#obi_set_qual_int_with_elt_idx_and_col_p_in_view(view_p, col_p, seq.index, 0, ali.consensus_qual, ali.consensus_len)
|
||||||
seq.set(ref_tags.id+b"_CONS", ali.consensus_seq, quality=ali.consensus_qual, tags=ref_tags)
|
seq.set(ref_tags.id+b"_CONS", ali.consensus_seq, quality=ali.consensus_qual)
|
||||||
seq[b'ali_length'] = ali.consensus_len
|
seq[b'ali_length'] = ali.consensus_len
|
||||||
seq[b'score_norm']=float(ali.score)/ali.consensus_len
|
seq[b'score_norm']=float(ali.score)/ali.consensus_len
|
||||||
seq[b'shift']=ali.shift
|
seq[b'shift']=ali.shift
|
||||||
@ -208,7 +208,7 @@ def buildConsensus(ali, seq, ref_tags=None):
|
|||||||
quality[i] = min(42, round(-10*log10(p)))
|
quality[i] = min(42, round(-10*log10(p)))
|
||||||
i+=1
|
i+=1
|
||||||
|
|
||||||
seq.set(ali[0].wrapped.id+b"_CONS", sseq, quality=quality, tags=ali[0].wrapped)
|
seq.set(ali[0].wrapped.id+b"_CONS", sseq, quality=quality)
|
||||||
|
|
||||||
if hasattr(ali, "counter"):
|
if hasattr(ali, "counter"):
|
||||||
seq[b'alignement_id']=ali.counter
|
seq[b'alignement_id']=ali.counter
|
||||||
@ -224,12 +224,18 @@ def buildConsensus(ali, seq, ref_tags=None):
|
|||||||
seq[b'ali_length']=len(seq)-ic.seqASingle-ic.seqBSingle
|
seq[b'ali_length']=len(seq)-ic.seqASingle-ic.seqBSingle
|
||||||
if seq[b'ali_length']>0:
|
if seq[b'ali_length']>0:
|
||||||
seq[b'score_norm']=float(ali.score)/seq[b'ali_length']
|
seq[b'score_norm']=float(ali.score)/seq[b'ali_length']
|
||||||
|
|
||||||
|
ref_tags = ali[0].wrapped
|
||||||
|
|
||||||
if hasattr(ali, "direction"):
|
if hasattr(ali, "direction"):
|
||||||
seq[b'ali_direction']=ali.direction
|
seq[b'ali_direction']=ali.direction
|
||||||
seq[b'score']=ali.score
|
seq[b'score']=ali.score
|
||||||
seq[b'mode']=b'alignment'
|
seq[b'mode']=b'alignment'
|
||||||
|
|
||||||
|
for tag in ref_tags:
|
||||||
|
if tag != REVERSE_SEQ_COLUMN_NAME and tag != REVERSE_QUALITY_COLUMN_NAME:
|
||||||
|
seq[tag] = ref_tags[tag]
|
||||||
|
|
||||||
return seq
|
return seq
|
||||||
|
|
||||||
|
|
||||||
@ -241,10 +247,13 @@ def buildJoinedSequence(ali, reverse, seq, forward=None):
|
|||||||
s = forward.seq + reverse.seq
|
s = forward.seq + reverse.seq
|
||||||
quality = forward.quality
|
quality = forward.quality
|
||||||
quality.extend(reverse.quality)
|
quality.extend(reverse.quality)
|
||||||
seq.set(forward.id +b"_PairedEnd", s, definition=forward.definition, quality=quality, tags=forward)
|
seq.set(forward.id +b"_PairedEnd", s, definition=forward.definition, quality=quality)
|
||||||
seq[b"score"]=ali.score
|
seq[b"score"]=ali.score
|
||||||
seq[b"ali_direction"]=ali.direction
|
seq[b"ali_direction"]=ali.direction
|
||||||
seq[b"mode"]=b"joined"
|
seq[b"mode"]=b"joined"
|
||||||
seq[b"pairedend_limit"]=len(forward)
|
seq[b"pairedend_limit"]=len(forward)
|
||||||
|
for tag in forward:
|
||||||
|
if tag != REVERSE_SEQ_COLUMN_NAME and tag != REVERSE_QUALITY_COLUMN_NAME:
|
||||||
|
seq[tag] = forward[tag]
|
||||||
return seq
|
return seq
|
||||||
|
|
||||||
|
@ -24,7 +24,9 @@ cdef class Kmer_similarity:
|
|||||||
cdef int32_t shift_count_array_height_a[1]
|
cdef int32_t shift_count_array_height_a[1]
|
||||||
cdef Obiview_p view1_p
|
cdef Obiview_p view1_p
|
||||||
cdef OBIDMS_column_p column1_p
|
cdef OBIDMS_column_p column1_p
|
||||||
|
cdef OBIDMS_column_p qual_col1_p
|
||||||
cdef Obiview_p view2_p
|
cdef Obiview_p view2_p
|
||||||
cdef OBIDMS_column_p column2_p
|
cdef OBIDMS_column_p column2_p
|
||||||
|
cdef OBIDMS_column_p qual_col2_p
|
||||||
cdef bint build_consensus
|
cdef bint build_consensus
|
||||||
cpdef free(self)
|
cpdef free(self)
|
@ -1,8 +1,7 @@
|
|||||||
#cython: language_level=3
|
#cython: language_level=3
|
||||||
|
|
||||||
from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN
|
from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN, QUALITY_COLUMN
|
||||||
|
|
||||||
from obitools3.dms.obiseq cimport Nuc_Seq_Stored
|
|
||||||
from obitools3.dms.capi.kmer_similarity cimport kmer_similarity, Obi_ali_p, obi_free_shifted_ali
|
from obitools3.dms.capi.kmer_similarity cimport kmer_similarity, Obi_ali_p, obi_free_shifted_ali
|
||||||
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
|
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
|
||||||
from obitools3.dms.column.column cimport Column
|
from obitools3.dms.column.column cimport Column
|
||||||
@ -52,13 +51,14 @@ cdef class Ali_shifted:
|
|||||||
def direction(self):
|
def direction(self):
|
||||||
return self.pointer().direction
|
return self.pointer().direction
|
||||||
|
|
||||||
cpdef free(self):
|
cpdef free(self): # TODO allocated memory could be kept
|
||||||
obi_free_shifted_ali(self.pointer())
|
obi_free_shifted_ali(self.pointer())
|
||||||
|
|
||||||
|
|
||||||
cdef class Kmer_similarity:
|
cdef class Kmer_similarity:
|
||||||
def __init__(self, View_NUC_SEQS view1, Column column1=None, View_NUC_SEQS view2=None, Column column2=None, uint8_t kmer_size=3, build_consensus=True) :
|
def __init__(self, View_NUC_SEQS view1, Column column1=None, Column qual_column1=None, View_NUC_SEQS view2=None, Column column2=None, Column qual_column2=None, uint8_t kmer_size=3, build_consensus=True) :
|
||||||
cdef Column default_seq_col
|
cdef Column default_seq_col
|
||||||
|
cdef Column default_qual_col
|
||||||
if kmer_size < 1 or kmer_size > 3:
|
if kmer_size < 1 or kmer_size > 3:
|
||||||
raise Exception("Kmer size to compute kmer similarity must be >=1 or <=4")
|
raise Exception("Kmer size to compute kmer similarity must be >=1 or <=4")
|
||||||
self.kmer_pos_array_p = NULL
|
self.kmer_pos_array_p = NULL
|
||||||
@ -89,13 +89,28 @@ cdef class Kmer_similarity:
|
|||||||
raise Exception("Kmer similarity with no sequence column given must be given a NUC_SEQS view")
|
raise Exception("Kmer similarity with no sequence column given must be given a NUC_SEQS view")
|
||||||
default_seq_col = view2[NUC_SEQUENCE_COLUMN]
|
default_seq_col = view2[NUC_SEQUENCE_COLUMN]
|
||||||
self.column2_p = default_seq_col.pointer()
|
self.column2_p = default_seq_col.pointer()
|
||||||
|
if qual_column1 is not None:
|
||||||
|
self.qual_col1_p = qual_column1.pointer()
|
||||||
|
else:
|
||||||
|
if type(view1) != View_NUC_SEQS:
|
||||||
|
raise Exception("Kmer similarity with no quality column given must be given a NUC_SEQS view")
|
||||||
|
default_qual_col = view1[QUALITY_COLUMN]
|
||||||
|
self.qual_col1_p = default_qual_col.pointer()
|
||||||
|
if qual_column2 is not None:
|
||||||
|
self.qual_col2_p = qual_column2.pointer()
|
||||||
|
else:
|
||||||
|
if type(view2) != View_NUC_SEQS:
|
||||||
|
raise Exception("Kmer similarity with no quality column given must be given a NUC_SEQS view")
|
||||||
|
default_qual_col = view2[QUALITY_COLUMN]
|
||||||
|
self.qual_col2_p = default_qual_col.pointer()
|
||||||
|
|
||||||
|
|
||||||
|
def __call__(self, object seq1, object seq2):
|
||||||
def __call__(self, Nuc_Seq_Stored seq1, Nuc_Seq_Stored seq2):
|
|
||||||
cdef Obi_ali_p ali_p
|
cdef Obi_ali_p ali_p
|
||||||
cdef Ali_shifted ali
|
cdef Ali_shifted ali
|
||||||
ali_p = kmer_similarity(self.view1_p, self.column1_p, seq1.index, 0, \
|
ali_p = kmer_similarity(self.view1_p, self.column1_p, seq1.index, 0, \
|
||||||
self.view2_p, self.column2_p, seq2.index, 0, \
|
self.view2_p, self.column2_p, seq2.index, 0, \
|
||||||
|
self.qual_col1_p, self.qual_col2_p, \
|
||||||
self.kmer_size, \
|
self.kmer_size, \
|
||||||
self.kmer_pos_array_p, self.kmer_pos_array_height_a, \
|
self.kmer_pos_array_p, self.kmer_pos_array_height_a, \
|
||||||
self.shift_array_p, self.shift_array_height_a, \
|
self.shift_array_p, self.shift_array_height_a, \
|
||||||
|
@ -64,19 +64,18 @@ void obi_free_shifted_ali(Obi_ali_p ali)
|
|||||||
|
|
||||||
|
|
||||||
Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1, index_t elt_idx1,
|
Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1, index_t elt_idx1,
|
||||||
Obiview_p view2, OBIDMS_column_p column2, index_t idx2, index_t elt_idx2,
|
Obiview_p view2, OBIDMS_column_p column2, index_t idx2, index_t elt_idx2,
|
||||||
uint8_t kmer_size,
|
OBIDMS_column_p qual_col1, OBIDMS_column_p qual_col2,
|
||||||
int32_t* kmer_pos_array, int32_t* kmer_pos_array_height_p,
|
uint8_t kmer_size,
|
||||||
int32_t* shift_array, int32_t* shift_array_height_p,
|
int32_t* kmer_pos_array, int32_t* kmer_pos_array_height_p,
|
||||||
int32_t* shift_count_array, int32_t* shift_count_array_length_p,
|
int32_t* shift_array, int32_t* shift_array_height_p,
|
||||||
bool build_consensus)
|
int32_t* shift_count_array, int32_t* shift_count_array_length_p,
|
||||||
|
bool build_consensus)
|
||||||
{
|
{
|
||||||
Obi_ali_p ali = NULL;
|
Obi_ali_p ali = NULL;
|
||||||
int i, j;
|
int i, j;
|
||||||
bool switched_seqs;
|
bool switched_seqs;
|
||||||
bool left_ali;
|
bool left_ali;
|
||||||
OBIDMS_column_p qual_col1;
|
|
||||||
OBIDMS_column_p qual_col2;
|
|
||||||
double score = 0.0;
|
double score = 0.0;
|
||||||
Obi_blob_p blob1 = NULL;
|
Obi_blob_p blob1 = NULL;
|
||||||
Obi_blob_p blob2 = NULL;
|
Obi_blob_p blob2 = NULL;
|
||||||
@ -106,6 +105,8 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
|||||||
int32_t overlap_len;
|
int32_t overlap_len;
|
||||||
int32_t consensus_len;
|
int32_t consensus_len;
|
||||||
int32_t cons_shift;
|
int32_t cons_shift;
|
||||||
|
int32_t copy_start;
|
||||||
|
int32_t copy_len;
|
||||||
uint8_t kmer;
|
uint8_t kmer;
|
||||||
byte_t nuc;
|
byte_t nuc;
|
||||||
uint8_t encoding;
|
uint8_t encoding;
|
||||||
@ -116,6 +117,10 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
|||||||
int32_t shift_array_height = *shift_array_height_p;
|
int32_t shift_array_height = *shift_array_height_p;
|
||||||
int32_t shift_count_array_length = *shift_count_array_length_p;
|
int32_t shift_count_array_length = *shift_count_array_length_p;
|
||||||
bool free_blob1 = false;
|
bool free_blob1 = false;
|
||||||
|
bool keep_seq1_start;
|
||||||
|
bool keep_seq2_start;
|
||||||
|
bool keep_seq1_end;
|
||||||
|
bool keep_seq2_end;
|
||||||
|
|
||||||
// Check kmer size
|
// Check kmer size
|
||||||
if ((kmer_size < 1) || (kmer_size > 4))
|
if ((kmer_size < 1) || (kmer_size > 4))
|
||||||
@ -143,7 +148,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
|||||||
switched_seqs = false;
|
switched_seqs = false;
|
||||||
len1 = blob1->length_decoded_value;
|
len1 = blob1->length_decoded_value;
|
||||||
len2 = blob2->length_decoded_value;
|
len2 = blob2->length_decoded_value;
|
||||||
if (len2 > len1)
|
if (len2 < len1)
|
||||||
{
|
{
|
||||||
switched_seqs = true;
|
switched_seqs = true;
|
||||||
temp_len = len1;
|
temp_len = len1;
|
||||||
@ -334,7 +339,8 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
|||||||
// Find the most represented shift
|
// Find the most represented shift
|
||||||
best_shift_idx = 0;
|
best_shift_idx = 0;
|
||||||
max_common_kmers = 0;
|
max_common_kmers = 0;
|
||||||
empty_part = (shift_count_array_length-1)/2 - len1;
|
//empty_part = (shift_count_array_length-1)/2 - len1; //TODO wrong in some cases (len1 shorter than overlap or something like that)
|
||||||
|
empty_part = 0;
|
||||||
for (i=empty_part; i < (shift_count_array_length - empty_part); i++) // skipping empty parts of the array
|
for (i=empty_part; i < (shift_count_array_length - empty_part); i++) // skipping empty parts of the array
|
||||||
{
|
{
|
||||||
if (shift_count_array[i] > max_common_kmers)
|
if (shift_count_array[i] > max_common_kmers)
|
||||||
@ -345,10 +351,44 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
|||||||
}
|
}
|
||||||
best_shift = best_shift_idx - len1;
|
best_shift = best_shift_idx - len1;
|
||||||
|
|
||||||
|
keep_seq1_start = false;
|
||||||
|
keep_seq1_end = false;
|
||||||
|
keep_seq2_start = false;
|
||||||
|
keep_seq2_end = false;
|
||||||
|
|
||||||
|
// The 873863 cases of hell
|
||||||
if (best_shift > 0)
|
if (best_shift > 0)
|
||||||
overlap_len = len2 - best_shift;
|
{
|
||||||
else
|
overlap_len = len2 - best_shift;
|
||||||
|
if (len1 <= overlap_len)
|
||||||
|
{
|
||||||
|
overlap_len = len1;
|
||||||
|
if (! switched_seqs)
|
||||||
|
keep_seq2_end = true;
|
||||||
|
else
|
||||||
|
keep_seq2_start = true;
|
||||||
|
}
|
||||||
|
else if (switched_seqs)
|
||||||
|
{
|
||||||
|
keep_seq2_start = true;
|
||||||
|
keep_seq1_end = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (best_shift < 0)
|
||||||
|
{
|
||||||
overlap_len = len1 + best_shift;
|
overlap_len = len1 + best_shift;
|
||||||
|
if (!switched_seqs)
|
||||||
|
{
|
||||||
|
keep_seq1_start = true;
|
||||||
|
keep_seq2_end = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
overlap_len = len1;
|
||||||
|
if ((!switched_seqs) && (len2 > len1))
|
||||||
|
keep_seq2_end = true;
|
||||||
|
}
|
||||||
|
|
||||||
score = max_common_kmers + kmer_size - 1;
|
score = max_common_kmers + kmer_size - 1;
|
||||||
|
|
||||||
@ -369,7 +409,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
|||||||
ali->shift = abs_shift;
|
ali->shift = abs_shift;
|
||||||
ali->consensus_seq = NULL;
|
ali->consensus_seq = NULL;
|
||||||
ali->consensus_qual = NULL;
|
ali->consensus_qual = NULL;
|
||||||
if (((best_shift < 0) && (!switched_seqs)) || ((best_shift > 0) && switched_seqs))
|
if (((best_shift <= 0) && (!switched_seqs)) || ((best_shift > 0) && switched_seqs))
|
||||||
{
|
{
|
||||||
left_ali = true;
|
left_ali = true;
|
||||||
strcpy(ali->direction, "left");
|
strcpy(ali->direction, "left");
|
||||||
@ -383,33 +423,17 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
|||||||
// Build the consensus sequence if asked
|
// Build the consensus sequence if asked
|
||||||
if (build_consensus)
|
if (build_consensus)
|
||||||
{
|
{
|
||||||
// Get the quality columns
|
|
||||||
qual_col1 = obi_view_get_column(view1, QUALITY_COLUMN);
|
|
||||||
if (qual_col1 == NULL)
|
|
||||||
{
|
|
||||||
obi_set_errno(OBI_ALIGN_ERROR);
|
|
||||||
obidebug(1, "\nError when computing the kmer similarity between two sequences: can't open quality column to build the consensus sequence");
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
qual_col2 = obi_view_get_column(view2, QUALITY_COLUMN);
|
|
||||||
if (qual_col2 == NULL)
|
|
||||||
{
|
|
||||||
obi_set_errno(OBI_ALIGN_ERROR);
|
|
||||||
obidebug(1, "\nError when computing the kmer similarity between two sequences: can't open quality column to build the consensus sequence");
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get the quality arrays
|
// Get the quality arrays
|
||||||
qual1 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view1, qual_col1, idx1, 0, &qual1_len);
|
qual1 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view1, qual_col1, idx1, 0, &qual1_len);
|
||||||
if (qual1 == NULL)
|
if (qual1 == NULL)
|
||||||
{
|
{
|
||||||
obidebug(1, "\nError getting the blob of the 1st sequence when computing the kmer similarity between two sequences");
|
obidebug(1, "\nError getting the quality of the 1st sequence when computing the kmer similarity between two sequences");
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
qual2 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view2, qual_col2, idx2, 0, &qual2_len);
|
qual2 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view2, qual_col2, idx2, 0, &qual2_len);
|
||||||
if (qual2 == NULL)
|
if (qual2 == NULL)
|
||||||
{
|
{
|
||||||
obidebug(1, "\nError getting the blob of the 1st sequence when computing the kmer similarity between two sequences");
|
obidebug(1, "\nError getting the quality of the 2nd sequence when computing the kmer similarity between two sequences");
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -417,10 +441,10 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
|||||||
if (seq1 == NULL)
|
if (seq1 == NULL)
|
||||||
seq1 = obi_blob_to_seq(blob1);
|
seq1 = obi_blob_to_seq(blob1);
|
||||||
|
|
||||||
if (left_ali)
|
if (! switched_seqs)
|
||||||
consensus_len = len1 + len2 - overlap_len;
|
consensus_len = len2 - best_shift;
|
||||||
else
|
else
|
||||||
consensus_len = overlap_len;
|
consensus_len = len1 + best_shift;
|
||||||
|
|
||||||
// Allocate memory for consensus sequence
|
// Allocate memory for consensus sequence
|
||||||
consensus_seq = (char*) malloc(consensus_len + 1 * sizeof(char)); // TODO keep malloced too maybe
|
consensus_seq = (char*) malloc(consensus_len + 1 * sizeof(char)); // TODO keep malloced too maybe
|
||||||
@ -447,30 +471,27 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
|||||||
// Compute consensus-relative shift for each sequence
|
// Compute consensus-relative shift for each sequence
|
||||||
if (best_shift > 0)
|
if (best_shift > 0)
|
||||||
{
|
{
|
||||||
shift1 = best_shift;
|
shift1 = 0;
|
||||||
shift2 = 0;
|
shift2 = best_shift;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
shift1 = 0;
|
shift1 = -(best_shift);
|
||||||
shift2 = -(best_shift);
|
shift2 = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// If left alignment, copy first part of first sequence
|
// Copy first part of first or second sequence depending on cases
|
||||||
if (left_ali)
|
if (keep_seq1_start)
|
||||||
{
|
{
|
||||||
if (switched_seqs)
|
strncpy(consensus_seq, seq1, abs_shift);
|
||||||
{
|
memcpy(consensus_qual, qual1, abs_shift*sizeof(uint8_t));
|
||||||
strncpy(consensus_seq, seq2, abs_shift);
|
cons_shift = abs_shift;
|
||||||
memcpy(consensus_qual, qual2, abs_shift*sizeof(uint8_t));
|
}
|
||||||
cons_shift = abs_shift;
|
else if (keep_seq2_start)
|
||||||
}
|
{
|
||||||
else
|
strncpy(consensus_seq, seq2, abs_shift);
|
||||||
{
|
memcpy(consensus_qual, qual2, abs_shift*sizeof(uint8_t));
|
||||||
strncpy(consensus_seq, seq1, abs_shift);
|
cons_shift = abs_shift;
|
||||||
memcpy(consensus_qual, qual1, abs_shift*sizeof(uint8_t));
|
|
||||||
cons_shift = abs_shift;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
cons_shift = 0;
|
cons_shift = 0;
|
||||||
@ -485,19 +506,26 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1
|
|||||||
consensus_qual[pos+cons_shift] = round((qual1[pos+shift1] + qual2[pos+shift2])/2); // TODO maybe use the (p1*(1-p2/3)) formula (but parenthesis bug???)
|
consensus_qual[pos+cons_shift] = round((qual1[pos+shift1] + qual2[pos+shift2])/2); // TODO maybe use the (p1*(1-p2/3)) formula (but parenthesis bug???)
|
||||||
}
|
}
|
||||||
|
|
||||||
// If left alignment, copy last part of first sequence
|
// Copy last part of first or second sequence depending on cases
|
||||||
if (left_ali)
|
if (keep_seq1_end)
|
||||||
{
|
{
|
||||||
if (switched_seqs)
|
strncpy(consensus_seq+cons_shift+overlap_len, seq1+overlap_len, len1 - overlap_len);
|
||||||
|
memcpy(consensus_qual+cons_shift+overlap_len, qual1+overlap_len, (len1 - overlap_len)*sizeof(uint8_t));
|
||||||
|
}
|
||||||
|
if (keep_seq2_end)
|
||||||
|
{
|
||||||
|
if (best_shift <= 0)
|
||||||
{
|
{
|
||||||
strncpy(consensus_seq+cons_shift+overlap_len, seq1+overlap_len, len1-overlap_len);
|
copy_start = overlap_len;
|
||||||
memcpy(consensus_qual+cons_shift+overlap_len, qual1+overlap_len, (len1-overlap_len)*sizeof(uint8_t));
|
copy_len = len2 - overlap_len;
|
||||||
}
|
}
|
||||||
else
|
if (best_shift > 0)
|
||||||
{
|
{
|
||||||
strncpy(consensus_seq+cons_shift+overlap_len, seq2+overlap_len, len2-overlap_len);
|
copy_start = overlap_len + best_shift;
|
||||||
memcpy(consensus_qual+cons_shift+overlap_len, qual2+overlap_len, (len2-overlap_len)*sizeof(uint8_t));
|
copy_len = len2 - overlap_len - best_shift;
|
||||||
}
|
}
|
||||||
|
strncpy(consensus_seq+cons_shift+overlap_len, seq2+copy_start, copy_len);
|
||||||
|
memcpy(consensus_qual+cons_shift+overlap_len, qual2+copy_start, copy_len*sizeof(uint8_t));
|
||||||
}
|
}
|
||||||
|
|
||||||
consensus_seq[consensus_len] = '\0';
|
consensus_seq[consensus_len] = '\0';
|
||||||
|
@ -33,9 +33,9 @@ typedef struct Obi_ali {
|
|||||||
*/
|
*/
|
||||||
int overlap_length; /**< Length of the overlap between the aligned sequences.
|
int overlap_length; /**< Length of the overlap between the aligned sequences.
|
||||||
*/
|
*/
|
||||||
char* consensus_seq; /**< Consensus sequence built with the computed overlap.
|
char* consensus_seq; /**< Consensus sequence built as to reconstruct a pairedend read.
|
||||||
*/
|
*/
|
||||||
uint8_t* consensus_qual; /**< Consensus quality built with the computed overlap.
|
uint8_t* consensus_qual; /**< Consensus quality built as to reconstruct a pairedend read.
|
||||||
*/
|
*/
|
||||||
int shift; /**< Shift chosen to align the sequences (for shifted alignment).
|
int shift; /**< Shift chosen to align the sequences (for shifted alignment).
|
||||||
*/
|
*/
|
||||||
@ -91,7 +91,7 @@ void obi_free_shifted_ali(Obi_ali_p ali);
|
|||||||
* If needed, reallocated to a bigger size.
|
* If needed, reallocated to a bigger size.
|
||||||
* @param shift_count_array_height_p A pointer on an integer corresponding to the size (number of elements)
|
* @param shift_count_array_height_p A pointer on an integer corresponding to the size (number of elements)
|
||||||
* allocated for shift_count_array. Updated by the function as needed.
|
* allocated for shift_count_array. Updated by the function as needed.
|
||||||
* @param build_consensus A boolean indicating whether the function should build the consensus sequence and quality. // TODO option to build consensus without quality?
|
* @param build_consensus A boolean indicating whether the function should build the consensus sequence and quality as to reconstruct a pairedend read. // TODO option to build consensus without quality?
|
||||||
*
|
*
|
||||||
* @returns A pointer on an Obi_ali_p structure containing the results.
|
* @returns A pointer on an Obi_ali_p structure containing the results.
|
||||||
* @retval NULL if an error occurred.
|
* @retval NULL if an error occurred.
|
||||||
@ -107,6 +107,8 @@ Obi_ali_p kmer_similarity(Obiview_p view1,
|
|||||||
OBIDMS_column_p column2,
|
OBIDMS_column_p column2,
|
||||||
index_t idx2,
|
index_t idx2,
|
||||||
index_t elt_idx2,
|
index_t elt_idx2,
|
||||||
|
OBIDMS_column_p qual_col1,
|
||||||
|
OBIDMS_column_p qual_col2,
|
||||||
uint8_t kmer_size,
|
uint8_t kmer_size,
|
||||||
int32_t* kmer_pos_array,
|
int32_t* kmer_pos_array,
|
||||||
int32_t* kmer_pos_array_height_p,
|
int32_t* kmer_pos_array_height_p,
|
||||||
|
Reference in New Issue
Block a user