diff --git a/python/obitools3/commands/alignpairedend.pyx b/python/obitools3/commands/alignpairedend.pyx index c80a880..0f8f95b 100755 --- a/python/obitools3/commands/alignpairedend.pyx +++ b/python/obitools3/commands/alignpairedend.pyx @@ -15,16 +15,12 @@ from obitools3.libalign._qsrassemble import QSolexaRightReverseAssemble from obitools3.libalign._solexapairend import buildConsensus, buildJoinedSequence from obitools3.dms.obiseq cimport Nuc_Seq from obitools3.libalign.shifted_ali cimport Kmer_similarity, Ali_shifted - +from obitools3.commands.ngsfilter import REVERSE_SEQ_COLUMN_NAME, REVERSE_QUALITY_COLUMN_NAME import sys import os -REVERSE_SEQ_COLUMN_NAME = b"REVERSE_SEQUENCE" -REVERSE_QUALITY_COLUMN_NAME = b"REVERSE_QUALITY" - - __title__="Aligns paired-ended reads" @@ -106,8 +102,7 @@ def alignmentIterator(entries, aligner): else: seqF = Nuc_Seq.new_from_stored(entries[i]) seqR = Nuc_Seq(seqF.id, seqF[REVERSE_SEQ_COLUMN_NAME], quality=seqF[REVERSE_QUALITY_COLUMN_NAME]) - seqF.pop(REVERSE_SEQ_COLUMN_NAME) - seqF.pop(REVERSE_QUALITY_COLUMN_NAME) + seqR.index = i ali = aligner(seqF, seqR) @@ -196,7 +191,7 @@ def run(config): reverse = entries[1] aligner = Kmer_similarity(forward, view2=reverse, kmer_size=config['alignpairedend']['kmersize']) else: - aligner = Kmer_similarity(entries, kmer_size=config['alignpairedend']['kmersize']) + aligner = Kmer_similarity(entries, column2=entries[REVERSE_SEQ_COLUMN_NAME], qual_column2=entries[REVERSE_QUALITY_COLUMN_NAME], kmer_size=config['alignpairedend']['kmersize']) ba = alignmentIterator(entries, aligner) diff --git a/python/obitools3/dms/capi/kmer_similarity.pxd b/python/obitools3/dms/capi/kmer_similarity.pxd index 024046b..232ecb4 100755 --- a/python/obitools3/dms/capi/kmer_similarity.pxd +++ b/python/obitools3/dms/capi/kmer_similarity.pxd @@ -32,6 +32,8 @@ cdef extern from "kmer_similarity.h" nogil: OBIDMS_column_p column2, index_t idx2, index_t elt_idx2, + OBIDMS_column_p qual_col1, + OBIDMS_column_p qual_col2, uint8_t kmer_size, int32_t* kmer_pos_array, int32_t* kmer_pos_array_height_p, diff --git a/python/obitools3/dms/obiseq.pxd b/python/obitools3/dms/obiseq.pxd index c096649..c4c5df4 100755 --- a/python/obitools3/dms/obiseq.pxd +++ b/python/obitools3/dms/obiseq.pxd @@ -5,6 +5,7 @@ from .view.view cimport Line cdef class Seq(dict) : + cdef int _index cpdef object clone(self) cpdef str get_str(self) cpdef get_symbol_at(self, int pos) @@ -22,6 +23,7 @@ cdef class Nuc_Seq(Seq) : cdef class Seq_Stored(Line) : + cpdef get_symbol_at(self, int pos) cpdef get_slice(self, slice slice_to_get) @@ -31,6 +33,7 @@ cdef class Nuc_Seq_Stored(Seq_Stored) : cdef Nuc_Seq _reverse_complement cdef object _quality_array cdef bytes _seq + cpdef set(self, object id, object seq, object definition=*, object quality=*, int offset=*, object tags=*) cpdef set_quality_int(self, list new_qual) cpdef set_quality_char(self, object new_qual, int offset=*) diff --git a/python/obitools3/dms/obiseq.pyx b/python/obitools3/dms/obiseq.pyx index 1669ca4..75970eb 100755 --- a/python/obitools3/dms/obiseq.pyx +++ b/python/obitools3/dms/obiseq.pyx @@ -40,6 +40,7 @@ cdef class Seq(dict) : self.id = id self.seq = seq self.definition = definition + self._index = -1 if tags is not None : for k in tags: k_b = tobytes(k) @@ -54,6 +55,7 @@ cdef class Seq(dict) : def new_from_stored(Seq_Stored seq_to_clone) : cdef Seq new_seq new_seq = Seq(seq_to_clone.id, seq_to_clone.seq, definition=seq_to_clone.definition, quality=seq_to_clone.quality, tags=seq_to_clone) + new_seq._index = seq_to_clone._index return new_seq def __contains__(self, object key): @@ -128,7 +130,16 @@ cdef class Seq(dict) : if new_definition is not None: new_definition = tobytes(new_definition) self[DEFINITION_COLUMN] = new_definition - + + # sequence index (for reference in a view eventually) property getter and setter + @property + def index(self): # @ReservedAssignment + return self._index + + @index.setter + def index(self, int idx): # @DuplicatedSignature + self._index = idx + cdef class Nuc_Seq(Seq) : @@ -160,6 +171,7 @@ cdef class Nuc_Seq(Seq) : def new_from_stored(Nuc_Seq_Stored seq_to_clone) : cdef Nuc_Seq new_seq new_seq = Nuc_Seq(seq_to_clone.id, seq_to_clone.seq, definition=seq_to_clone.definition, quality=seq_to_clone.quality, tags=seq_to_clone) + new_seq._index = seq_to_clone.index return new_seq # is_revcomp property getter and setter (boolean indicating whether the sequence was created by reverse complementing another sequence) diff --git a/python/obitools3/libalign/_solexapairend.pyx b/python/obitools3/libalign/_solexapairend.pyx index 31973d0..634a0cc 100755 --- a/python/obitools3/libalign/_solexapairend.pyx +++ b/python/obitools3/libalign/_solexapairend.pyx @@ -13,7 +13,7 @@ from obitools3.dms.capi.obidmscolumn cimport OBIDMS_column_p from obitools3.dms.view.view cimport View from obitools3.dms.column.column cimport Column - +from obitools3.commands.ngsfilter import REVERSE_SEQ_COLUMN_NAME, REVERSE_QUALITY_COLUMN_NAME from math import log10 @@ -182,7 +182,7 @@ def buildConsensus(ali, seq, ref_tags=None): col_p = column.pointer() # doesn't work because uint8_t* are forced into bytes by cython (nothing read/kept beyond 0 values) #obi_set_qual_int_with_elt_idx_and_col_p_in_view(view_p, col_p, seq.index, 0, ali.consensus_qual, ali.consensus_len) - seq.set(ref_tags.id+b"_CONS", ali.consensus_seq, quality=ali.consensus_qual, tags=ref_tags) + seq.set(ref_tags.id+b"_CONS", ali.consensus_seq, quality=ali.consensus_qual) seq[b'ali_length'] = ali.consensus_len seq[b'score_norm']=float(ali.score)/ali.consensus_len seq[b'shift']=ali.shift @@ -208,7 +208,7 @@ def buildConsensus(ali, seq, ref_tags=None): quality[i] = min(42, round(-10*log10(p))) i+=1 - seq.set(ali[0].wrapped.id+b"_CONS", sseq, quality=quality, tags=ali[0].wrapped) + seq.set(ali[0].wrapped.id+b"_CONS", sseq, quality=quality) if hasattr(ali, "counter"): seq[b'alignement_id']=ali.counter @@ -224,12 +224,18 @@ def buildConsensus(ali, seq, ref_tags=None): seq[b'ali_length']=len(seq)-ic.seqASingle-ic.seqBSingle if seq[b'ali_length']>0: seq[b'score_norm']=float(ali.score)/seq[b'ali_length'] + + ref_tags = ali[0].wrapped if hasattr(ali, "direction"): seq[b'ali_direction']=ali.direction seq[b'score']=ali.score seq[b'mode']=b'alignment' - + + for tag in ref_tags: + if tag != REVERSE_SEQ_COLUMN_NAME and tag != REVERSE_QUALITY_COLUMN_NAME: + seq[tag] = ref_tags[tag] + return seq @@ -241,10 +247,13 @@ def buildJoinedSequence(ali, reverse, seq, forward=None): s = forward.seq + reverse.seq quality = forward.quality quality.extend(reverse.quality) - seq.set(forward.id +b"_PairedEnd", s, definition=forward.definition, quality=quality, tags=forward) + seq.set(forward.id +b"_PairedEnd", s, definition=forward.definition, quality=quality) seq[b"score"]=ali.score seq[b"ali_direction"]=ali.direction seq[b"mode"]=b"joined" seq[b"pairedend_limit"]=len(forward) + for tag in forward: + if tag != REVERSE_SEQ_COLUMN_NAME and tag != REVERSE_QUALITY_COLUMN_NAME: + seq[tag] = forward[tag] return seq diff --git a/python/obitools3/libalign/shifted_ali.pxd b/python/obitools3/libalign/shifted_ali.pxd index 2277c97..4e45bf9 100755 --- a/python/obitools3/libalign/shifted_ali.pxd +++ b/python/obitools3/libalign/shifted_ali.pxd @@ -24,7 +24,9 @@ cdef class Kmer_similarity: cdef int32_t shift_count_array_height_a[1] cdef Obiview_p view1_p cdef OBIDMS_column_p column1_p + cdef OBIDMS_column_p qual_col1_p cdef Obiview_p view2_p cdef OBIDMS_column_p column2_p + cdef OBIDMS_column_p qual_col2_p cdef bint build_consensus cpdef free(self) \ No newline at end of file diff --git a/python/obitools3/libalign/shifted_ali.pyx b/python/obitools3/libalign/shifted_ali.pyx index 3697ef1..8edd2c4 100755 --- a/python/obitools3/libalign/shifted_ali.pyx +++ b/python/obitools3/libalign/shifted_ali.pyx @@ -1,8 +1,7 @@ #cython: language_level=3 -from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN +from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN, QUALITY_COLUMN -from obitools3.dms.obiseq cimport Nuc_Seq_Stored from obitools3.dms.capi.kmer_similarity cimport kmer_similarity, Obi_ali_p, obi_free_shifted_ali from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS from obitools3.dms.column.column cimport Column @@ -52,13 +51,14 @@ cdef class Ali_shifted: def direction(self): return self.pointer().direction - cpdef free(self): + cpdef free(self): # TODO allocated memory could be kept obi_free_shifted_ali(self.pointer()) cdef class Kmer_similarity: - def __init__(self, View_NUC_SEQS view1, Column column1=None, View_NUC_SEQS view2=None, Column column2=None, uint8_t kmer_size=3, build_consensus=True) : + def __init__(self, View_NUC_SEQS view1, Column column1=None, Column qual_column1=None, View_NUC_SEQS view2=None, Column column2=None, Column qual_column2=None, uint8_t kmer_size=3, build_consensus=True) : cdef Column default_seq_col + cdef Column default_qual_col if kmer_size < 1 or kmer_size > 3: raise Exception("Kmer size to compute kmer similarity must be >=1 or <=4") self.kmer_pos_array_p = NULL @@ -89,13 +89,28 @@ cdef class Kmer_similarity: raise Exception("Kmer similarity with no sequence column given must be given a NUC_SEQS view") default_seq_col = view2[NUC_SEQUENCE_COLUMN] self.column2_p = default_seq_col.pointer() + if qual_column1 is not None: + self.qual_col1_p = qual_column1.pointer() + else: + if type(view1) != View_NUC_SEQS: + raise Exception("Kmer similarity with no quality column given must be given a NUC_SEQS view") + default_qual_col = view1[QUALITY_COLUMN] + self.qual_col1_p = default_qual_col.pointer() + if qual_column2 is not None: + self.qual_col2_p = qual_column2.pointer() + else: + if type(view2) != View_NUC_SEQS: + raise Exception("Kmer similarity with no quality column given must be given a NUC_SEQS view") + default_qual_col = view2[QUALITY_COLUMN] + self.qual_col2_p = default_qual_col.pointer() + - - def __call__(self, Nuc_Seq_Stored seq1, Nuc_Seq_Stored seq2): + def __call__(self, object seq1, object seq2): cdef Obi_ali_p ali_p cdef Ali_shifted ali ali_p = kmer_similarity(self.view1_p, self.column1_p, seq1.index, 0, \ self.view2_p, self.column2_p, seq2.index, 0, \ + self.qual_col1_p, self.qual_col2_p, \ self.kmer_size, \ self.kmer_pos_array_p, self.kmer_pos_array_height_a, \ self.shift_array_p, self.shift_array_height_a, \ diff --git a/src/kmer_similarity.c b/src/kmer_similarity.c index 7024da9..4aab869 100755 --- a/src/kmer_similarity.c +++ b/src/kmer_similarity.c @@ -64,19 +64,18 @@ void obi_free_shifted_ali(Obi_ali_p ali) Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1, index_t elt_idx1, - Obiview_p view2, OBIDMS_column_p column2, index_t idx2, index_t elt_idx2, - uint8_t kmer_size, - int32_t* kmer_pos_array, int32_t* kmer_pos_array_height_p, - int32_t* shift_array, int32_t* shift_array_height_p, - int32_t* shift_count_array, int32_t* shift_count_array_length_p, - bool build_consensus) + Obiview_p view2, OBIDMS_column_p column2, index_t idx2, index_t elt_idx2, + OBIDMS_column_p qual_col1, OBIDMS_column_p qual_col2, + uint8_t kmer_size, + int32_t* kmer_pos_array, int32_t* kmer_pos_array_height_p, + int32_t* shift_array, int32_t* shift_array_height_p, + int32_t* shift_count_array, int32_t* shift_count_array_length_p, + bool build_consensus) { Obi_ali_p ali = NULL; int i, j; bool switched_seqs; bool left_ali; - OBIDMS_column_p qual_col1; - OBIDMS_column_p qual_col2; double score = 0.0; Obi_blob_p blob1 = NULL; Obi_blob_p blob2 = NULL; @@ -106,6 +105,8 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 int32_t overlap_len; int32_t consensus_len; int32_t cons_shift; + int32_t copy_start; + int32_t copy_len; uint8_t kmer; byte_t nuc; uint8_t encoding; @@ -116,6 +117,10 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 int32_t shift_array_height = *shift_array_height_p; int32_t shift_count_array_length = *shift_count_array_length_p; bool free_blob1 = false; + bool keep_seq1_start; + bool keep_seq2_start; + bool keep_seq1_end; + bool keep_seq2_end; // Check kmer size if ((kmer_size < 1) || (kmer_size > 4)) @@ -143,7 +148,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 switched_seqs = false; len1 = blob1->length_decoded_value; len2 = blob2->length_decoded_value; - if (len2 > len1) + if (len2 < len1) { switched_seqs = true; temp_len = len1; @@ -334,7 +339,8 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 // Find the most represented shift best_shift_idx = 0; max_common_kmers = 0; - empty_part = (shift_count_array_length-1)/2 - len1; + //empty_part = (shift_count_array_length-1)/2 - len1; //TODO wrong in some cases (len1 shorter than overlap or something like that) + empty_part = 0; for (i=empty_part; i < (shift_count_array_length - empty_part); i++) // skipping empty parts of the array { if (shift_count_array[i] > max_common_kmers) @@ -345,10 +351,44 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 } best_shift = best_shift_idx - len1; + keep_seq1_start = false; + keep_seq1_end = false; + keep_seq2_start = false; + keep_seq2_end = false; + + // The 873863 cases of hell if (best_shift > 0) - overlap_len = len2 - best_shift; - else + { + overlap_len = len2 - best_shift; + if (len1 <= overlap_len) + { + overlap_len = len1; + if (! switched_seqs) + keep_seq2_end = true; + else + keep_seq2_start = true; + } + else if (switched_seqs) + { + keep_seq2_start = true; + keep_seq1_end = true; + } + } + else if (best_shift < 0) + { overlap_len = len1 + best_shift; + if (!switched_seqs) + { + keep_seq1_start = true; + keep_seq2_end = true; + } + } + else + { + overlap_len = len1; + if ((!switched_seqs) && (len2 > len1)) + keep_seq2_end = true; + } score = max_common_kmers + kmer_size - 1; @@ -369,7 +409,7 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 ali->shift = abs_shift; ali->consensus_seq = NULL; ali->consensus_qual = NULL; - if (((best_shift < 0) && (!switched_seqs)) || ((best_shift > 0) && switched_seqs)) + if (((best_shift <= 0) && (!switched_seqs)) || ((best_shift > 0) && switched_seqs)) { left_ali = true; strcpy(ali->direction, "left"); @@ -383,33 +423,17 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 // Build the consensus sequence if asked if (build_consensus) { - // Get the quality columns - qual_col1 = obi_view_get_column(view1, QUALITY_COLUMN); - if (qual_col1 == NULL) - { - obi_set_errno(OBI_ALIGN_ERROR); - obidebug(1, "\nError when computing the kmer similarity between two sequences: can't open quality column to build the consensus sequence"); - return NULL; - } - qual_col2 = obi_view_get_column(view2, QUALITY_COLUMN); - if (qual_col2 == NULL) - { - obi_set_errno(OBI_ALIGN_ERROR); - obidebug(1, "\nError when computing the kmer similarity between two sequences: can't open quality column to build the consensus sequence"); - return NULL; - } - // Get the quality arrays qual1 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view1, qual_col1, idx1, 0, &qual1_len); if (qual1 == NULL) { - obidebug(1, "\nError getting the blob of the 1st sequence when computing the kmer similarity between two sequences"); + obidebug(1, "\nError getting the quality of the 1st sequence when computing the kmer similarity between two sequences"); return NULL; } qual2 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view2, qual_col2, idx2, 0, &qual2_len); if (qual2 == NULL) { - obidebug(1, "\nError getting the blob of the 1st sequence when computing the kmer similarity between two sequences"); + obidebug(1, "\nError getting the quality of the 2nd sequence when computing the kmer similarity between two sequences"); return NULL; } @@ -417,10 +441,10 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 if (seq1 == NULL) seq1 = obi_blob_to_seq(blob1); - if (left_ali) - consensus_len = len1 + len2 - overlap_len; + if (! switched_seqs) + consensus_len = len2 - best_shift; else - consensus_len = overlap_len; + consensus_len = len1 + best_shift; // Allocate memory for consensus sequence consensus_seq = (char*) malloc(consensus_len + 1 * sizeof(char)); // TODO keep malloced too maybe @@ -447,30 +471,27 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 // Compute consensus-relative shift for each sequence if (best_shift > 0) { - shift1 = best_shift; - shift2 = 0; + shift1 = 0; + shift2 = best_shift; } else { - shift1 = 0; - shift2 = -(best_shift); + shift1 = -(best_shift); + shift2 = 0; } - // If left alignment, copy first part of first sequence - if (left_ali) + // Copy first part of first or second sequence depending on cases + if (keep_seq1_start) { - if (switched_seqs) - { - strncpy(consensus_seq, seq2, abs_shift); - memcpy(consensus_qual, qual2, abs_shift*sizeof(uint8_t)); - cons_shift = abs_shift; - } - else - { - strncpy(consensus_seq, seq1, abs_shift); - memcpy(consensus_qual, qual1, abs_shift*sizeof(uint8_t)); - cons_shift = abs_shift; - } + strncpy(consensus_seq, seq1, abs_shift); + memcpy(consensus_qual, qual1, abs_shift*sizeof(uint8_t)); + cons_shift = abs_shift; + } + else if (keep_seq2_start) + { + strncpy(consensus_seq, seq2, abs_shift); + memcpy(consensus_qual, qual2, abs_shift*sizeof(uint8_t)); + cons_shift = abs_shift; } else cons_shift = 0; @@ -485,19 +506,26 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1 consensus_qual[pos+cons_shift] = round((qual1[pos+shift1] + qual2[pos+shift2])/2); // TODO maybe use the (p1*(1-p2/3)) formula (but parenthesis bug???) } - // If left alignment, copy last part of first sequence - if (left_ali) + // Copy last part of first or second sequence depending on cases + if (keep_seq1_end) { - if (switched_seqs) + strncpy(consensus_seq+cons_shift+overlap_len, seq1+overlap_len, len1 - overlap_len); + memcpy(consensus_qual+cons_shift+overlap_len, qual1+overlap_len, (len1 - overlap_len)*sizeof(uint8_t)); + } + if (keep_seq2_end) + { + if (best_shift <= 0) { - strncpy(consensus_seq+cons_shift+overlap_len, seq1+overlap_len, len1-overlap_len); - memcpy(consensus_qual+cons_shift+overlap_len, qual1+overlap_len, (len1-overlap_len)*sizeof(uint8_t)); + copy_start = overlap_len; + copy_len = len2 - overlap_len; } - else + if (best_shift > 0) { - strncpy(consensus_seq+cons_shift+overlap_len, seq2+overlap_len, len2-overlap_len); - memcpy(consensus_qual+cons_shift+overlap_len, qual2+overlap_len, (len2-overlap_len)*sizeof(uint8_t)); + copy_start = overlap_len + best_shift; + copy_len = len2 - overlap_len - best_shift; } + strncpy(consensus_seq+cons_shift+overlap_len, seq2+copy_start, copy_len); + memcpy(consensus_qual+cons_shift+overlap_len, qual2+copy_start, copy_len*sizeof(uint8_t)); } consensus_seq[consensus_len] = '\0'; diff --git a/src/kmer_similarity.h b/src/kmer_similarity.h index c8255c0..8d4cb5f 100755 --- a/src/kmer_similarity.h +++ b/src/kmer_similarity.h @@ -33,9 +33,9 @@ typedef struct Obi_ali { */ int overlap_length; /**< Length of the overlap between the aligned sequences. */ - char* consensus_seq; /**< Consensus sequence built with the computed overlap. + char* consensus_seq; /**< Consensus sequence built as to reconstruct a pairedend read. */ - uint8_t* consensus_qual; /**< Consensus quality built with the computed overlap. + uint8_t* consensus_qual; /**< Consensus quality built as to reconstruct a pairedend read. */ int shift; /**< Shift chosen to align the sequences (for shifted alignment). */ @@ -91,7 +91,7 @@ void obi_free_shifted_ali(Obi_ali_p ali); * If needed, reallocated to a bigger size. * @param shift_count_array_height_p A pointer on an integer corresponding to the size (number of elements) * allocated for shift_count_array. Updated by the function as needed. - * @param build_consensus A boolean indicating whether the function should build the consensus sequence and quality. // TODO option to build consensus without quality? + * @param build_consensus A boolean indicating whether the function should build the consensus sequence and quality as to reconstruct a pairedend read. // TODO option to build consensus without quality? * * @returns A pointer on an Obi_ali_p structure containing the results. * @retval NULL if an error occurred. @@ -107,6 +107,8 @@ Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column2, index_t idx2, index_t elt_idx2, + OBIDMS_column_p qual_col1, + OBIDMS_column_p qual_col2, uint8_t kmer_size, int32_t* kmer_pos_array, int32_t* kmer_pos_array_height_p,