diff --git a/python/obitools3/commands/alignpairedend.pxd b/python/obitools3/commands/alignpairedend.pxd new file mode 100755 index 0000000..862cabf --- /dev/null +++ b/python/obitools3/commands/alignpairedend.pxd @@ -0,0 +1,4 @@ +#cython: language_level=3 + + +cdef object buildAlignment(object direct, object reverse) \ No newline at end of file diff --git a/python/obitools3/commands/alignpairedend.pyx b/python/obitools3/commands/alignpairedend.pyx index 3b5e2d3..c80a880 100755 --- a/python/obitools3/commands/alignpairedend.pyx +++ b/python/obitools3/commands/alignpairedend.pyx @@ -2,19 +2,20 @@ from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport from obitools3.dms import DMS -from obitools3.dms.view.view cimport View from obitools3.dms.view import RollbackException from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS -from obitools3.dms.column.column cimport Column, Column_line -from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN -from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t, OBI_QUAL +from obitools3.dms.column.column cimport Column +from obitools3.dms.capi.obiview cimport QUALITY_COLUMN +from obitools3.dms.capi.obitypes cimport OBI_QUAL from obitools3.apps.optiongroups import addMinimalInputOption, addMinimalOutputOption from obitools3.uri.decode import open_uri from obitools3.apps.config import logger from obitools3.libalign._qsassemble import QSolexaReverseAssemble from obitools3.libalign._qsrassemble import QSolexaRightReverseAssemble from obitools3.libalign._solexapairend import buildConsensus, buildJoinedSequence -from obitools3.dms.obiseq cimport Nuc_Seq_Stored, Nuc_Seq +from obitools3.dms.obiseq cimport Nuc_Seq +from obitools3.libalign.shifted_ali cimport Kmer_similarity, Ali_shifted + import sys import os @@ -27,6 +28,7 @@ REVERSE_QUALITY_COLUMN_NAME = b"REVERSE_QUALITY" __title__="Aligns paired-ended reads" + def addOptions(parser): addMinimalInputOption(parser) @@ -41,14 +43,6 @@ def addOptions(parser): type=str, help="URI to the reverse reads if they are in a different view than the forward reads") -# TODO -# group.add_argument('--index-file', -# action="store", dest="alignpairedend:index", -# metavar="", -# default=None, -# type=str, -# help="URI to the index reads") - group.add_argument('--score-min', action="store", dest="alignpairedend:smin", metavar="#.###", @@ -56,12 +50,22 @@ def addOptions(parser): type=float, help="Minimum score for keeping alignments") + group.add_argument('-A', '--true-ali', + action="store_true", dest="alignpairedend:trueali", + default=False, + help="Performs gap free end alignment of sequences instead of using kmers to compute alignments (slower).") + + group.add_argument('-k', '--kmer-size', + action="store", dest="alignpairedend:kmersize", + metavar="#", + default=3, + type=int, + help="K-mer size for kmer comparisons, between 1 and 4 (not when using -A option; default: 3)") -# TODO declarations, cdef, imports la = QSolexaReverseAssemble() ra = QSolexaRightReverseAssemble() -def buildAlignment(object direct, object reverse): +cdef object buildAlignment(object direct, object reverse): if len(direct)==0 or len(reverse)==0: return None @@ -82,9 +86,9 @@ def buildAlignment(object direct, object reverse): ali = rali return ali + - -def alignmentIterator(entries): +def alignmentIterator(entries, aligner): if type(entries) == list: two_views = True @@ -94,7 +98,7 @@ def alignmentIterator(entries): else: two_views = False entries_len = len(entries) - + for i in range(entries_len): if two_views: seqF = forward[i] @@ -104,9 +108,12 @@ def alignmentIterator(entries): seqR = Nuc_Seq(seqF.id, seqF[REVERSE_SEQ_COLUMN_NAME], quality=seqF[REVERSE_QUALITY_COLUMN_NAME]) seqF.pop(REVERSE_SEQ_COLUMN_NAME) seqF.pop(REVERSE_QUALITY_COLUMN_NAME) - ali = buildAlignment(seqF, seqR) + + ali = aligner(seqF, seqR) + if ali is None: continue + yield ali @@ -160,11 +167,6 @@ def run(config): else: entries_len = len(entries) -# if "index" in config["alignpairedend"]: # TODO -# index = open_uri(config["alignpairedend"]["index"]) -# else: -# index = None - # Open the output output = open_uri(config['obi']['outputURI'], input=False, @@ -174,22 +176,30 @@ def run(config): view = output[1] - Column.new_column(view, b"QUALITY", OBI_QUAL) #TODO output URI quality option + Column.new_column(view, QUALITY_COLUMN, OBI_QUAL) #TODO output URI quality option? if 'smin' in config['alignpairedend']: - config['alignpairedend']['sminL'] = config['alignpairedend']['smin'] - config['alignpairedend']['sminR'] = config['alignpairedend']['smin'] - sminL = config['alignpairedend']['sminL'] - sminR = config['alignpairedend']['sminR'] + smin = config['alignpairedend']['smin'] else: - sminL = 0 - sminR = 0 + smin = 0 # Initialize the progress bar pb = ProgressBar(entries_len, config, seconde=5) - ba = alignmentIterator(entries) - + if config['alignpairedend']['trueali']: + kmer_ali = False + aligner = buildAlignment + else : + kmer_ali = True + if type(entries) == list: + forward = entries[0] + reverse = entries[1] + aligner = Kmer_similarity(forward, view2=reverse, kmer_size=config['alignpairedend']['kmersize']) + else: + aligner = Kmer_similarity(entries, kmer_size=config['alignpairedend']['kmersize']) + + ba = alignmentIterator(entries, aligner) + i = 0 for ali in ba: @@ -197,31 +207,33 @@ def run(config): consensus = view[i] - if sminL > 0: - if ((ali.direction=='left' and ali.score > sminL) - or (ali.score > sminR)): - buildConsensus(ali, consensus) + if not two_views: + seqF = entries[i] + else: + seqF = forward[i] + + if smin > 0: + if (ali.score > smin) : + buildConsensus(ali, consensus, seqF) else: - seqF = ali[0].wrapped if not two_views: - seq = entries[i] - seqR = Nuc_Seq(seq.id, seq[REVERSE_SEQ_COLUMN_NAME], quality = seq[REVERSE_QUALITY_COLUMN_NAME]) + seqR = Nuc_Seq(seqF.id, seqF[REVERSE_SEQ_COLUMN_NAME], quality = seqF[REVERSE_QUALITY_COLUMN_NAME]) else: seqR = reverse[i] - buildJoinedSequence(ali, seqR, consensus) + buildJoinedSequence(ali, seqR, consensus, forward=seqF) - consensus[b"sminL"] = sminL - consensus[b"sminR"] = sminR + consensus[b"smin"] = smin else: - buildConsensus(ali, consensus) - -# TODO -# if "index" in config['alignpairedend'] and config['alignpairedend']['index'] is not None: -# idx = str(config['alignpairedend']['index'].next()) -# consensus[b"illumina_index"] = idx + buildConsensus(ali, consensus, seqF) + + if kmer_ali : + ali.free() i+=1 + if kmer_ali : + aligner.free() + # Save command config in View and DMS comments command_line = " ".join(sys.argv[1:]) view.write_config(config, "alignpairedend", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name) diff --git a/python/obitools3/dms/capi/kmer_similarity.pxd b/python/obitools3/dms/capi/kmer_similarity.pxd new file mode 100755 index 0000000..024046b --- /dev/null +++ b/python/obitools3/dms/capi/kmer_similarity.pxd @@ -0,0 +1,42 @@ +#cython: language_level=3 + +from obitools3.dms.capi.obiview cimport Obiview_p +from obitools3.dms.capi.obidmscolumn cimport OBIDMS_column_p +from obitools3.dms.capi.obitypes cimport index_t + +from libc.stdint cimport int32_t, uint8_t + + +cdef extern from "kmer_similarity.h" nogil: + + struct Obi_ali_t : + double score + int consensus_length + int overlap_length + char* consensus_seq + uint8_t* consensus_qual + int shift + char* direction + + ctypedef Obi_ali_t* Obi_ali_p + + + void obi_free_shifted_ali(Obi_ali_p ali) + + + Obi_ali_p kmer_similarity(Obiview_p view1, + OBIDMS_column_p column1, + index_t idx1, + index_t elt_idx1, + Obiview_p view2, + OBIDMS_column_p column2, + index_t idx2, + index_t elt_idx2, + uint8_t kmer_size, + int32_t* kmer_pos_array, + int32_t* kmer_pos_array_height_p, + int32_t* shift_array, + int32_t* shift_array_height_p, + int32_t* shift_count_array, + int32_t* shift_count_array_height_p, + bint build_consensus) diff --git a/python/obitools3/dms/dms.cfiles b/python/obitools3/dms/dms.cfiles index b407a36..9425abb 100755 --- a/python/obitools3/dms/dms.cfiles +++ b/python/obitools3/dms/dms.cfiles @@ -6,6 +6,7 @@ ../../../src/dna_seq_indexer.c ../../../src/encode.c ../../../src/hashtable.c +../../../src/kmer_similarity.c ../../../src/linked_list.c ../../../src/murmurhash2.c ../../../src/obi_lcs.c diff --git a/python/obitools3/libalign/_solexapairend.pyx b/python/obitools3/libalign/_solexapairend.pyx index 13e57f0..31973d0 100755 --- a/python/obitools3/libalign/_solexapairend.pyx +++ b/python/obitools3/libalign/_solexapairend.pyx @@ -3,6 +3,18 @@ from cpython cimport array from .solexapairend import iterOnAligment +from .shifted_ali cimport Ali_shifted + +from obitools3.dms.capi.obiview cimport Obiview_p, QUALITY_COLUMN, \ + obi_set_qual_int_with_elt_idx_and_col_p_in_view, \ + obi_set_str_with_elt_idx_and_col_p_in_view + +from obitools3.dms.capi.obidmscolumn cimport OBIDMS_column_p + +from obitools3.dms.view.view cimport View +from obitools3.dms.column.column cimport Column + + from math import log10 @@ -146,7 +158,7 @@ cdef class IterOnConsensus: seqBDeletion = property(get_seqBDeletion, None, None, "deletereverse's docstring") -def buildConsensus(ali, seq): +def buildConsensus(ali, seq, ref_tags=None): cdef list quality cdef char aseq[1000] cdef int i=0 @@ -156,61 +168,82 @@ def buildConsensus(ali, seq): cdef double score cdef bytes sseq cdef double p + cdef OBIDMS_column_p col_p + cdef Obiview_p view_p + cdef View view + cdef Column column quality = [] + + if type(ali) == Ali_shifted: + view = seq.view + view_p = view.pointer() + column = view[QUALITY_COLUMN] + col_p = column.pointer() + # doesn't work because uint8_t* are forced into bytes by cython (nothing read/kept beyond 0 values) + #obi_set_qual_int_with_elt_idx_and_col_p_in_view(view_p, col_p, seq.index, 0, ali.consensus_qual, ali.consensus_len) + seq.set(ref_tags.id+b"_CONS", ali.consensus_seq, quality=ali.consensus_qual, tags=ref_tags) + seq[b'ali_length'] = ali.consensus_len + seq[b'score_norm']=float(ali.score)/ali.consensus_len + seq[b'shift']=ali.shift + else: + if len(ali[0])>999: # TODO why? + raise AssertionError,"Too long alignemnt" - if len(ali[0])>999: # TODO why? - raise AssertionError,"Too long alignemnt" - - ic=IterOnConsensus(ali) - - for nuc,score in ic: - cnuc=nuc - quality.append(score) - aseq[i]=cnuc[0] - i+=1 - - aseq[i]=0 - - sseq=aseq - - # Reconvert quality from array of probabilities to array of raw quality values - i=0 - for p in quality: - quality[i] = min(42, round(-10*log10(p))) - i+=1 + ic=IterOnConsensus(ali) - seq.set(ali[0].wrapped.id+b"_CONS", sseq, quality=quality, tags=ali[0].wrapped) + for nuc,score in ic: + cnuc=nuc + quality.append(score) + aseq[i]=cnuc[0] + i+=1 + aseq[i]=0 + + sseq=aseq + + # Reconvert quality from array of probabilities to array of raw quality values + i=0 + for p in quality: + quality[i] = min(42, round(-10*log10(p))) + i+=1 + + seq.set(ali[0].wrapped.id+b"_CONS", sseq, quality=quality, tags=ali[0].wrapped) + + if hasattr(ali, "counter"): + seq[b'alignement_id']=ali.counter + seq[b'seq_a_single']=ic.seqASingle + seq[b'seq_b_single']=ic.seqBSingle + seq[b'seq_ab_match']=ic.seqABMatch + seq[b'seq_a_mismatch']=ic.seqAMismatch + seq[b'seq_b_mismatch']=ic.seqBMismatch + seq[b'seq_a_insertion']=ic.seqAInsertion + seq[b'seq_b_insertion']=ic.seqBInsertion-ic.seqBSingle + seq[b'seq_a_deletion']=ic.seqADeletion + seq[b'seq_b_deletion']=ic.seqBDeletion + seq[b'ali_length']=len(seq)-ic.seqASingle-ic.seqBSingle + if seq[b'ali_length']>0: + seq[b'score_norm']=float(ali.score)/seq[b'ali_length'] + if hasattr(ali, "direction"): - seq[b'direction']=ali.direction - if hasattr(ali, "counter"): - seq[b'alignement_id']=ali.counter - seq[b'seq_a_single']=ic.seqASingle - seq[b'seq_b_single']=ic.seqBSingle - seq[b'seq_ab_match']=ic.seqABMatch - seq[b'seq_a_mismatch']=ic.seqAMismatch - seq[b'seq_b_mismatch']=ic.seqBMismatch - seq[b'seq_a_insertion']=ic.seqAInsertion - seq[b'seq_b_insertion']=ic.seqBInsertion-ic.seqBSingle - seq[b'seq_a_deletion']=ic.seqADeletion - seq[b'seq_b_deletion']=ic.seqBDeletion + seq[b'ali_direction']=ali.direction seq[b'score']=ali.score - seq[b'ali_length']=len(seq)-ic.seqASingle-ic.seqBSingle - if seq[b'ali_length']>0: - seq[b'score_norm']=float(ali.score)/seq[b'ali_length'] seq[b'mode']=b'alignment' + return seq -def buildJoinedSequence(ali, reverse, seq): - forward = ali[0].wrapped +def buildJoinedSequence(ali, reverse, seq, forward=None): + if forward is not None: + forward = forward + else: + forward = ali[0].wrapped s = forward.seq + reverse.seq quality = forward.quality quality.extend(reverse.quality) seq.set(forward.id +b"_PairedEnd", s, definition=forward.definition, quality=quality, tags=forward) seq[b"score"]=ali.score - seq[b"ali_dir"]=ali.direction + seq[b"ali_direction"]=ali.direction seq[b"mode"]=b"joined" seq[b"pairedend_limit"]=len(forward) return seq diff --git a/python/obitools3/libalign/shifted_ali.pxd b/python/obitools3/libalign/shifted_ali.pxd new file mode 100755 index 0000000..2277c97 --- /dev/null +++ b/python/obitools3/libalign/shifted_ali.pxd @@ -0,0 +1,30 @@ +#cython: language_level=3 + +from obitools3.dms.capi.kmer_similarity cimport kmer_similarity, Obi_ali_p +from libc.stdint cimport int32_t, uint8_t +from obitools3.dms.capi.obiview cimport Obiview_p +from obitools3.dms.capi.obidmscolumn cimport OBIDMS_column_p + + +cdef class Ali_shifted: + cdef Obi_ali_p _pointer + cdef inline Obi_ali_p pointer(self) + cpdef free(self) + @staticmethod + cdef Ali_shifted new_ali(Obi_ali_p ali_p) + + +cdef class Kmer_similarity: + cdef uint8_t kmer_size + cdef int32_t* kmer_pos_array_p + cdef int32_t kmer_pos_array_height_a[1] + cdef int32_t* shift_array_p + cdef int32_t shift_array_height_a[1] + cdef int32_t* shift_count_array_p + cdef int32_t shift_count_array_height_a[1] + cdef Obiview_p view1_p + cdef OBIDMS_column_p column1_p + cdef Obiview_p view2_p + cdef OBIDMS_column_p column2_p + cdef bint build_consensus + cpdef free(self) \ No newline at end of file diff --git a/python/obitools3/libalign/shifted_ali.pyx b/python/obitools3/libalign/shifted_ali.pyx new file mode 100755 index 0000000..3697ef1 --- /dev/null +++ b/python/obitools3/libalign/shifted_ali.pyx @@ -0,0 +1,119 @@ +#cython: language_level=3 + +from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN + +from obitools3.dms.obiseq cimport Nuc_Seq_Stored +from obitools3.dms.capi.kmer_similarity cimport kmer_similarity, Obi_ali_p, obi_free_shifted_ali +from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS +from obitools3.dms.column.column cimport Column + +from libc.stdlib cimport free + + +cdef class Ali_shifted: + + cdef inline Obi_ali_p pointer(self) : + return (self._pointer) + + @staticmethod + cdef Ali_shifted new_ali(Obi_ali_p ali_p) : # not __init__ method because need to pass pointer + ali = Ali_shifted() + ali._pointer = ali_p + return ali + + @property + def score(self): + return self.pointer().score + + @property + def consensus_len(self): + return self.pointer().consensus_length + + @property + def overlap_len(self): + return self.pointer().overlap_length + + @property + def consensus_seq(self): + return self.pointer().consensus_seq + + @property + def shift(self): + return self.pointer().shift + + @property + def consensus_qual(self): # must return list because uint8_t* are forced into bytes by cython which won't keep beyond the first 0 value + qual_list = [] + for i in range(self.consensus_len): + qual_list.append((self.pointer().consensus_qual)[i]) + return qual_list + + @property + def direction(self): + return self.pointer().direction + + cpdef free(self): + obi_free_shifted_ali(self.pointer()) + + +cdef class Kmer_similarity: + def __init__(self, View_NUC_SEQS view1, Column column1=None, View_NUC_SEQS view2=None, Column column2=None, uint8_t kmer_size=3, build_consensus=True) : + cdef Column default_seq_col + if kmer_size < 1 or kmer_size > 3: + raise Exception("Kmer size to compute kmer similarity must be >=1 or <=4") + self.kmer_pos_array_p = NULL + self.shift_array_p = NULL + self.shift_count_array_p = NULL + self.kmer_pos_array_height_a[0] = 0 + self.shift_array_height_a[0] = 0 + self.shift_count_array_height_a[0] = 0 + self.kmer_size = kmer_size + self.build_consensus = build_consensus + self.view1_p = view1.pointer() + if column1 is not None: + self.column1_p = column1.pointer() + else: + if type(view1) != View_NUC_SEQS: + raise Exception("Kmer similarity with no sequence column given must be given a NUC_SEQS view") + default_seq_col = view1[NUC_SEQUENCE_COLUMN] + self.column1_p = default_seq_col.pointer() + if view2 is not None: + self.view2_p = view2.pointer() + else: + view2 = view1 + self.view2_p = self.view1_p + if column2 is not None: + self.column2_p = column2.pointer() + else: + if type(view2) != View_NUC_SEQS: + raise Exception("Kmer similarity with no sequence column given must be given a NUC_SEQS view") + default_seq_col = view2[NUC_SEQUENCE_COLUMN] + self.column2_p = default_seq_col.pointer() + + + def __call__(self, Nuc_Seq_Stored seq1, Nuc_Seq_Stored seq2): + cdef Obi_ali_p ali_p + cdef Ali_shifted ali + ali_p = kmer_similarity(self.view1_p, self.column1_p, seq1.index, 0, \ + self.view2_p, self.column2_p, seq2.index, 0, \ + self.kmer_size, \ + self.kmer_pos_array_p, self.kmer_pos_array_height_a, \ + self.shift_array_p, self.shift_array_height_a, \ + self.shift_count_array_p, self.shift_count_array_height_a, + self.build_consensus) + + ali = Ali_shifted.new_ali(ali_p) + return ali + + cpdef free(self): + if self.kmer_pos_array_p is not NULL: + free(self.kmer_pos_array_p) + self.kmer_pos_array_height_a[0] = 0 + if (self.shift_array_p is not NULL) : + free(self.shift_array_p) + self.shift_array_height_a[0] = 0 + if (self.shift_count_array_p is not NULL) : + free(self.shift_count_array_p) + self.shift_count_array_height_a[0] = 0 + + diff --git a/python/obitools3/utils.cfiles b/python/obitools3/utils.cfiles index e3c390f..d7ee383 100755 --- a/python/obitools3/utils.cfiles +++ b/python/obitools3/utils.cfiles @@ -6,6 +6,7 @@ ../../src/dna_seq_indexer.c ../../src/encode.c ../../src/hashtable.c +../../src/kmer_similarity.c ../../src/linked_list.c ../../src/murmurhash2.c ../../src/obi_lcs.c diff --git a/src/kmer_similarity.c b/src/kmer_similarity.c new file mode 100755 index 0000000..c2941bf --- /dev/null +++ b/src/kmer_similarity.c @@ -0,0 +1,483 @@ +/**************************************************************************** + * Kmer similarity computation functions * + ****************************************************************************/ + +/** + * @file kmer_similarity.c + * @author Celine Mercier (celine.mercier@metabarcoding.org) + * @date January 7th 2019 + * @brief Kmer similarity computation functions. + */ + +#include +#include +#include +#include + +#include "utils.h" +#include "obidebug.h" +#include "obierrno.h" +#include "obitypes.h" + +#include "kmer_similarity.h" +#include "obidmscolumn.h" +#include "obiview.h" +#include "encode.h" +#include "dna_seq_indexer.h" + + +#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?) + + + +/************************************************************************** + * + * D E C L A R A T I O N O F T H E P R I V A T E F U N C T I O N S + * + **************************************************************************/ + + + + +/************************************************************************ + * + * D E F I N I T I O N O F T H E P R I V A T E F U N C T I O N S + * + ************************************************************************/ + + + + +/********************************************************************** + * + * D E F I N I T I O N O F T H E P U B L I C F U N C T I O N S + * + **********************************************************************/ + + +void obi_free_shifted_ali(Obi_ali_p ali) +{ + free(ali->consensus_seq); + free(ali->consensus_qual); + free(ali); +} + + +Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1, index_t elt_idx1, + Obiview_p view2, OBIDMS_column_p column2, index_t idx2, index_t elt_idx2, + uint8_t kmer_size, + int32_t* kmer_pos_array, int32_t* kmer_pos_array_height_p, + int32_t* shift_array, int32_t* shift_array_height_p, + int32_t* shift_count_array, int32_t* shift_count_array_length_p, + bool build_consensus) +{ + Obi_ali_p ali = NULL; + int i, j; + OBIDMS_column_p qual_col1; + OBIDMS_column_p qual_col2; + double score = 0.0; + Obi_blob_p blob1 = NULL; + Obi_blob_p blob2 = NULL; + Obi_blob_p temp_blob = NULL; + char* seq1 = NULL; + char* seq2 = NULL; + int32_t len1; + int32_t len2; + int32_t temp_len; + int32_t pos; + int32_t shift1; + int32_t shift2; + const uint8_t* qual1; + const uint8_t* qual2; + int qual1_len; + int qual2_len; + char* consensus_seq; + uint8_t* consensus_qual; + int32_t empty_part; + int32_t nuc_idx; + int32_t kmer_idx; + int32_t kmer_count; + int32_t best_shift_idx; + int32_t best_shift; + int32_t max_common_kmers; + int32_t overlap_len; + int32_t consensus_len; + int32_t cons_shift; + uint8_t kmer; + byte_t nuc; + uint8_t encoding; + int total_len; + int shift; + int32_t shift_idx; + int32_t kmer_pos_array_height = *kmer_pos_array_height_p; + int32_t shift_array_height = *shift_array_height_p; + int32_t shift_count_array_length = *shift_count_array_length_p; + bool free_blob1 = false; + + // Check kmer size + if ((kmer_size < 1) || (kmer_size > 4)) + { + obi_set_errno(OBI_ALIGN_ERROR); + obidebug(1, "\nError when computing the kmer similarity between two sequences: the kmer size must be >= 1 or <= 4"); + return NULL; + } + + // Get sequence blobs + blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(view1, column1, idx1, elt_idx1); + if (blob1 == NULL) + { + obidebug(1, "\nError getting the blob of the 1st sequence when computing the kmer similarity between two sequences"); + return NULL; + } + blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(view2, column2, idx2, elt_idx2); + if (blob2 == NULL) + { + obidebug(1, "\nError getting the blob of the 2nd sequence when computing the kmer similarity between two sequences"); + return NULL; + } + + // Choose the shortest sequence to save kmer positions in array + len1 = blob1->length_decoded_value; + len2 = blob2->length_decoded_value; + if (len2 > len1) + { + temp_len = len1; + len1 = len2; + len2 = temp_len; + temp_blob = blob1; + blob1 = blob2; + blob2 = temp_blob; + } + + // Force encoding on 2 bits by replacing ambiguous nucleotides by 'a's + if (blob1->element_size == 4) + { + seq1 = obi_blob_to_seq(blob1); + for (i=0; ielement_size == 4) + { + seq2 = obi_blob_to_seq(blob2); + for (i=0; i= shift_count_array_length) + { + shift_count_array_length = total_len; + shift_count_array = (int32_t*) realloc(shift_count_array, shift_count_array_length * sizeof(int32_t)); + if (shift_count_array == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory"); + return NULL; + } + } + + // Allocate or reallocate memory for the array of shifts if necessary + if (shift_array == NULL) + { + shift_array_height = total_len; + shift_array = (int32_t*) malloc(ARRAY_LENGTH * shift_array_height * sizeof(int32_t)); + if (shift_array == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory"); + return NULL; + } + } + else if (len1 >= shift_array_height) + { + shift_array_height = total_len; + shift_array = (int32_t*) realloc(shift_array, ARRAY_LENGTH * shift_array_height * sizeof(int32_t)); + if (shift_array == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory"); + return NULL; + } + } + + // Allocate or reallocate memory for the array of positions if necessary + if (kmer_pos_array == NULL) + { + kmer_pos_array_height = len1; + kmer_pos_array = (int32_t*) malloc(ARRAY_LENGTH * kmer_pos_array_height * sizeof(int32_t)); + if (kmer_pos_array == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory"); + return NULL; + } + } + else if (len1 >= kmer_pos_array_height) + { + kmer_pos_array_height = len1; + kmer_pos_array = (int32_t*) realloc(kmer_pos_array, ARRAY_LENGTH * kmer_pos_array_height * sizeof(int32_t)); + if (kmer_pos_array == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory"); + return NULL; + } + } + + // Initialize all positions to -1 + for (i=0; i<(ARRAY_LENGTH * kmer_pos_array_height); i++) + kmer_pos_array[i] = -1; + // Initialize all shifts to the maximum value of an int32_t + for (i=0; i<(ARRAY_LENGTH * shift_array_height); i++) + shift_array[i] = INT32_MAX; + //memset(shift_array, 1, ARRAY_LENGTH * shift_array_height * sizeof(int32_t)); // why doesn't this work? + // Initialize all shift counts to 0 + memset(shift_count_array, 0, shift_count_array_length * sizeof(int32_t)); + + *kmer_pos_array_height_p = kmer_pos_array_height; + *shift_array_height_p = shift_array_height; + *shift_count_array_length_p = shift_count_array_length; + + // Fill array with positions of kmers in the shortest sequence + encoding = blob1->element_size; + kmer_count = len1 - kmer_size + 1; + for (kmer_idx=0; kmer_idx < kmer_count; kmer_idx++) + { + kmer = 0; + for (nuc_idx=kmer_idx; nuc_idx<(kmer_idx+kmer_size); nuc_idx++) + { + nuc = get_nucleotide_from_encoded_seq(blob1->value, nuc_idx, encoding); + kmer <<= encoding; + kmer |= nuc; + } + + i = 0; + while (kmer_pos_array[(kmer*kmer_pos_array_height)+i] != -1) + i++; + kmer_pos_array[(kmer*kmer_pos_array_height)+i] = kmer_idx; + } + + // Compare positions of kmers between both sequences and store shifts + kmer_count = blob2->length_decoded_value - kmer_size + 1; + for (kmer_idx=0; kmer_idx < kmer_count; kmer_idx++) + { + kmer = 0; + for (nuc_idx=kmer_idx; nuc_idx<(kmer_idx+kmer_size); nuc_idx++) + { + nuc = get_nucleotide_from_encoded_seq(blob2->value, nuc_idx, encoding); + kmer <<= encoding; + kmer |= nuc; + } + + // Get the index at which the new shifts should be stored for that kmer + j = 0; + while ((shift_array[(kmer*shift_array_height)+j] != INT32_MAX) && (j= 255 + break; + } + + // Find the most represented shift + best_shift_idx = 0; + max_common_kmers = 0; + empty_part = (shift_count_array_length-1)/2 - len1; + for (i=empty_part; i < (shift_count_array_length - empty_part); i++) // skipping empty parts of the array + { + if (shift_count_array[i] > max_common_kmers) + { + best_shift_idx = i; + max_common_kmers = shift_count_array[i]; + } + } + best_shift = best_shift_idx - len1; + + if (best_shift > 0) + overlap_len = len2 - best_shift; + else + overlap_len = len1 + best_shift; + + score = max_common_kmers + kmer_size - 1; + + ali = (Obi_ali_p) malloc(sizeof(Obi_ali_t)); + if (ali == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory for the result structure"); + return NULL; + } + + // Save result in Obi_ali structure + ali->score = score; + ali->consensus_length = 0; + ali->overlap_length = overlap_len; + ali->shift = best_shift; + ali->consensus_seq = NULL; + ali->consensus_qual = NULL; + + // Build the consensus sequence if asked + if (build_consensus) + { + // Get the quality columns + qual_col1 = obi_view_get_column(view1, QUALITY_COLUMN); + if (qual_col1 == NULL) + { + obi_set_errno(OBI_ALIGN_ERROR); + obidebug(1, "\nError when computing the kmer similarity between two sequences: can't open quality column to build the consensus sequence"); + return NULL; + } + qual_col2 = obi_view_get_column(view2, QUALITY_COLUMN); + if (qual_col2 == NULL) + { + obi_set_errno(OBI_ALIGN_ERROR); + obidebug(1, "\nError when computing the kmer similarity between two sequences: can't open quality column to build the consensus sequence"); + return NULL; + } + + // Get the quality arrays + qual1 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view1, qual_col1, idx1, 0, &qual1_len); + if (qual1 == NULL) + { + obidebug(1, "\nError getting the blob of the 1st sequence when computing the kmer similarity between two sequences"); + return NULL; + } + qual2 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view2, qual_col2, idx2, 0, &qual2_len); + if (qual2 == NULL) + { + obidebug(1, "\nError getting the blob of the 1st sequence when computing the kmer similarity between two sequences"); + return NULL; + } + + // Decode the first sequence if not already done + if (seq1 == NULL) + seq1 = obi_blob_to_seq(blob1); + + if (best_shift > 0) + consensus_len = len1 + len2 - overlap_len; + else + consensus_len = overlap_len; + + // Allocate memory for consensus sequence + consensus_seq = (char*) malloc(consensus_len + 1 * sizeof(char)); // TODO keep malloced too maybe + if (consensus_seq == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory for the consensus sequence"); + return NULL; + } + + // Allocate memory for consensus quality + consensus_qual = (uint8_t*) malloc(consensus_len * sizeof(uint8_t)); + if (consensus_qual == NULL) + { + obi_set_errno(OBI_MALLOC_ERROR); + obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory for the consensus quality"); + return NULL; + } + + ali->consensus_length = consensus_len; + ali->consensus_seq = consensus_seq; + ali->consensus_qual = consensus_qual; + + // Compute consensus-relative shift for each sequence and store direction + if (best_shift >= 0) + { + shift1 = 0; + shift2 = best_shift; + strcpy(ali->direction, "left"); + } + else + { + shift1 = -(best_shift); + shift2 = 0; + strcpy(ali->direction, "right"); + } + + // If positive shift, copy first part of second sequence + if (best_shift > 0) + { + strncpy(consensus_seq, seq2, best_shift); + memcpy(consensus_qual, qual2, best_shift*sizeof(uint8_t)); + cons_shift = best_shift; + } + else + cons_shift = 0; + + // Build consensus part + for (pos=0; pos= qual2[pos+shift2]) + consensus_seq[pos+cons_shift] = seq1[pos+shift1]; + else + consensus_seq[pos+cons_shift] = seq2[pos+shift2]; + consensus_qual[pos+cons_shift] = round((qual1[pos+shift1] + qual2[pos+shift2])/2); // TODO maybe use the (p1*(1-p2/3)) formula (but parenthesis bug???) + } + + // If positive shift, copy last part of first sequence + if (best_shift > 0) + { + strncpy(consensus_seq+cons_shift+overlap_len, seq1+overlap_len, len1-overlap_len); + memcpy(consensus_qual+cons_shift+overlap_len, qual1+overlap_len, (len1-overlap_len)*sizeof(uint8_t)); + } + + consensus_seq[consensus_len] = '\0'; + } + + if (seq1 != NULL) + free(seq1); + if (free_blob1) + free(blob1); + free(seq2); + free(blob2); + + return ali; +} + diff --git a/src/kmer_similarity.h b/src/kmer_similarity.h new file mode 100755 index 0000000..c8255c0 --- /dev/null +++ b/src/kmer_similarity.h @@ -0,0 +1,120 @@ +/**************************************************************************** + * Header file for Kmer similarity computation functions * + ****************************************************************************/ + +/** + * @file kmer_similarity.h + * @author Celine Mercier (celine.mercier@metabarcoding.org) + * @date January 7th 2019 + * @brief Header file for Kmer similarity computation functions. + */ + + +#ifndef KMER_SIMILARITY_H_ +#define KMER_SIMILARITY_H_ + +#include + +#include "obitypes.h" +#include "obidmscolumn.h" +#include "obiview.h" + + +#define ARRAY_LENGTH (256) + + +/** + * @brief Alignment structure, with informations about the similarity and to rebuild the alignment. + */ +typedef struct Obi_ali { + double score; /**< Alignment score. + */ + int consensus_length; /**< Length of the final consensus sequence. + */ + int overlap_length; /**< Length of the overlap between the aligned sequences. + */ + char* consensus_seq; /**< Consensus sequence built with the computed overlap. + */ + uint8_t* consensus_qual; /**< Consensus quality built with the computed overlap. + */ + int shift; /**< Shift chosen to align the sequences (for shifted alignment). + */ + char direction[6]; /**< Alignement direction (positive/right or negative/left shift) (for shifted alignment). + */ // TODO but sequences switched around depending on size..... discuss +} Obi_ali_t, *Obi_ali_p; + + +/** + * @brief Frees an Obi_ali_p structure and all its elements. + * + * @warning The pointer sent becomes unusable. + * + * @param ali The pointer on the Obi_ali_p structure to free. + * + * @since January 2019 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +void obi_free_shifted_ali(Obi_ali_p ali); + + +/** + * Function computing the kmer similarity of two sequences stored in views. + * + * The similarity is computing this way: the positions of identical kmers in both sequences are + * compared, and the most represented shift is chosen. The similarity is then calculated as: + * kmer_similarity = number_of_common_kmers_with_the_chosen_shift + kmer_size - 1 + * + * @warning Several pointers and structures passed to or returned by the function have to be freed by the caller: + * - kmer_pos_array + * - shift_array + * - shift_count_array + * - the Obi_ali_p structure returned + * + * @param view1 A pointer on the view containing the first sequence. + * @param column1 A pointer on the column containing the first sequence. + * @param idx1 The index of the first sequence in view1. + * @param elt_idx1 The element index of the first sequence in column1. + * @param view2 A pointer on the view containing the second sequence. + * @param column2 A pointer on the column containing the second sequence. + * @param idx2 The index of the second sequence in view2. + * @param elt_idx2 The element index of the second sequence in column2. + * @param kmer_size The kmer length to use. Must be >= 1 <= 4. + * @param kmer_pos_array The array used to store kmer positions. If NULL, allocated by the function. + * If needed, reallocated to a bigger size. + * @param kmer_pos_array_height_p A pointer on an integer corresponding to the size (number of elements) + * allocated for kmer_pos_array. Updated by the function as needed. + * @param shift_array The array used to store kmer shifts. If NULL, allocated by the function. + * If needed, reallocated to a bigger size. + * @param shift_array_height_p A pointer on an integer corresponding to the size (number of elements) + * allocated for shift_array. Updated by the function as needed. + * @param shift_count_array The array used to store shift counts. If NULL, allocated by the function. + * If needed, reallocated to a bigger size. + * @param shift_count_array_height_p A pointer on an integer corresponding to the size (number of elements) + * allocated for shift_count_array. Updated by the function as needed. + * @param build_consensus A boolean indicating whether the function should build the consensus sequence and quality. // TODO option to build consensus without quality? + * + * @returns A pointer on an Obi_ali_p structure containing the results. + * @retval NULL if an error occurred. + * + * @since January 2019 + * @author Celine Mercier (celine.mercier@metabarcoding.org) + */ +Obi_ali_p kmer_similarity(Obiview_p view1, + OBIDMS_column_p column1, + index_t idx1, + index_t elt_idx1, + Obiview_p view2, + OBIDMS_column_p column2, + index_t idx2, + index_t elt_idx2, + uint8_t kmer_size, + int32_t* kmer_pos_array, + int32_t* kmer_pos_array_height_p, + int32_t* shift_array, + int32_t* shift_array_height_p, + int32_t* shift_count_array, + int32_t* shift_count_array_height_p, + bool build_consensus); + + +#endif /* KMER_SIMILARITY_H_ */