Alignpairedend: added alignment using shifting with best kmer similarity
(low level layer in C and Cython API)
This commit is contained in:
4
python/obitools3/commands/alignpairedend.pxd
Executable file
4
python/obitools3/commands/alignpairedend.pxd
Executable file
@ -0,0 +1,4 @@
|
|||||||
|
#cython: language_level=3
|
||||||
|
|
||||||
|
|
||||||
|
cdef object buildAlignment(object direct, object reverse)
|
@ -2,19 +2,20 @@
|
|||||||
|
|
||||||
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
|
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
|
||||||
from obitools3.dms import DMS
|
from obitools3.dms import DMS
|
||||||
from obitools3.dms.view.view cimport View
|
|
||||||
from obitools3.dms.view import RollbackException
|
from obitools3.dms.view import RollbackException
|
||||||
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
|
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
|
||||||
from obitools3.dms.column.column cimport Column, Column_line
|
from obitools3.dms.column.column cimport Column
|
||||||
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN
|
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN
|
||||||
from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t, OBI_QUAL
|
from obitools3.dms.capi.obitypes cimport OBI_QUAL
|
||||||
from obitools3.apps.optiongroups import addMinimalInputOption, addMinimalOutputOption
|
from obitools3.apps.optiongroups import addMinimalInputOption, addMinimalOutputOption
|
||||||
from obitools3.uri.decode import open_uri
|
from obitools3.uri.decode import open_uri
|
||||||
from obitools3.apps.config import logger
|
from obitools3.apps.config import logger
|
||||||
from obitools3.libalign._qsassemble import QSolexaReverseAssemble
|
from obitools3.libalign._qsassemble import QSolexaReverseAssemble
|
||||||
from obitools3.libalign._qsrassemble import QSolexaRightReverseAssemble
|
from obitools3.libalign._qsrassemble import QSolexaRightReverseAssemble
|
||||||
from obitools3.libalign._solexapairend import buildConsensus, buildJoinedSequence
|
from obitools3.libalign._solexapairend import buildConsensus, buildJoinedSequence
|
||||||
from obitools3.dms.obiseq cimport Nuc_Seq_Stored, Nuc_Seq
|
from obitools3.dms.obiseq cimport Nuc_Seq
|
||||||
|
from obitools3.libalign.shifted_ali cimport Kmer_similarity, Ali_shifted
|
||||||
|
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
@ -27,6 +28,7 @@ REVERSE_QUALITY_COLUMN_NAME = b"REVERSE_QUALITY"
|
|||||||
__title__="Aligns paired-ended reads"
|
__title__="Aligns paired-ended reads"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def addOptions(parser):
|
def addOptions(parser):
|
||||||
|
|
||||||
addMinimalInputOption(parser)
|
addMinimalInputOption(parser)
|
||||||
@ -41,14 +43,6 @@ def addOptions(parser):
|
|||||||
type=str,
|
type=str,
|
||||||
help="URI to the reverse reads if they are in a different view than the forward reads")
|
help="URI to the reverse reads if they are in a different view than the forward reads")
|
||||||
|
|
||||||
# TODO
|
|
||||||
# group.add_argument('--index-file',
|
|
||||||
# action="store", dest="alignpairedend:index",
|
|
||||||
# metavar="<FILENAME>",
|
|
||||||
# default=None,
|
|
||||||
# type=str,
|
|
||||||
# help="URI to the index reads")
|
|
||||||
|
|
||||||
group.add_argument('--score-min',
|
group.add_argument('--score-min',
|
||||||
action="store", dest="alignpairedend:smin",
|
action="store", dest="alignpairedend:smin",
|
||||||
metavar="#.###",
|
metavar="#.###",
|
||||||
@ -56,12 +50,22 @@ def addOptions(parser):
|
|||||||
type=float,
|
type=float,
|
||||||
help="Minimum score for keeping alignments")
|
help="Minimum score for keeping alignments")
|
||||||
|
|
||||||
|
group.add_argument('-A', '--true-ali',
|
||||||
|
action="store_true", dest="alignpairedend:trueali",
|
||||||
|
default=False,
|
||||||
|
help="Performs gap free end alignment of sequences instead of using kmers to compute alignments (slower).")
|
||||||
|
|
||||||
|
group.add_argument('-k', '--kmer-size',
|
||||||
|
action="store", dest="alignpairedend:kmersize",
|
||||||
|
metavar="#",
|
||||||
|
default=3,
|
||||||
|
type=int,
|
||||||
|
help="K-mer size for kmer comparisons, between 1 and 4 (not when using -A option; default: 3)")
|
||||||
|
|
||||||
# TODO declarations, cdef, imports
|
|
||||||
|
|
||||||
la = QSolexaReverseAssemble()
|
la = QSolexaReverseAssemble()
|
||||||
ra = QSolexaRightReverseAssemble()
|
ra = QSolexaRightReverseAssemble()
|
||||||
def buildAlignment(object direct, object reverse):
|
cdef object buildAlignment(object direct, object reverse):
|
||||||
|
|
||||||
if len(direct)==0 or len(reverse)==0:
|
if len(direct)==0 or len(reverse)==0:
|
||||||
return None
|
return None
|
||||||
@ -84,7 +88,7 @@ def buildAlignment(object direct, object reverse):
|
|||||||
return ali
|
return ali
|
||||||
|
|
||||||
|
|
||||||
def alignmentIterator(entries):
|
def alignmentIterator(entries, aligner):
|
||||||
|
|
||||||
if type(entries) == list:
|
if type(entries) == list:
|
||||||
two_views = True
|
two_views = True
|
||||||
@ -104,9 +108,12 @@ def alignmentIterator(entries):
|
|||||||
seqR = Nuc_Seq(seqF.id, seqF[REVERSE_SEQ_COLUMN_NAME], quality=seqF[REVERSE_QUALITY_COLUMN_NAME])
|
seqR = Nuc_Seq(seqF.id, seqF[REVERSE_SEQ_COLUMN_NAME], quality=seqF[REVERSE_QUALITY_COLUMN_NAME])
|
||||||
seqF.pop(REVERSE_SEQ_COLUMN_NAME)
|
seqF.pop(REVERSE_SEQ_COLUMN_NAME)
|
||||||
seqF.pop(REVERSE_QUALITY_COLUMN_NAME)
|
seqF.pop(REVERSE_QUALITY_COLUMN_NAME)
|
||||||
ali = buildAlignment(seqF, seqR)
|
|
||||||
|
ali = aligner(seqF, seqR)
|
||||||
|
|
||||||
if ali is None:
|
if ali is None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
yield ali
|
yield ali
|
||||||
|
|
||||||
|
|
||||||
@ -160,11 +167,6 @@ def run(config):
|
|||||||
else:
|
else:
|
||||||
entries_len = len(entries)
|
entries_len = len(entries)
|
||||||
|
|
||||||
# if "index" in config["alignpairedend"]: # TODO
|
|
||||||
# index = open_uri(config["alignpairedend"]["index"])
|
|
||||||
# else:
|
|
||||||
# index = None
|
|
||||||
|
|
||||||
# Open the output
|
# Open the output
|
||||||
output = open_uri(config['obi']['outputURI'],
|
output = open_uri(config['obi']['outputURI'],
|
||||||
input=False,
|
input=False,
|
||||||
@ -174,21 +176,29 @@ def run(config):
|
|||||||
|
|
||||||
view = output[1]
|
view = output[1]
|
||||||
|
|
||||||
Column.new_column(view, b"QUALITY", OBI_QUAL) #TODO output URI quality option
|
Column.new_column(view, QUALITY_COLUMN, OBI_QUAL) #TODO output URI quality option?
|
||||||
|
|
||||||
if 'smin' in config['alignpairedend']:
|
if 'smin' in config['alignpairedend']:
|
||||||
config['alignpairedend']['sminL'] = config['alignpairedend']['smin']
|
smin = config['alignpairedend']['smin']
|
||||||
config['alignpairedend']['sminR'] = config['alignpairedend']['smin']
|
|
||||||
sminL = config['alignpairedend']['sminL']
|
|
||||||
sminR = config['alignpairedend']['sminR']
|
|
||||||
else:
|
else:
|
||||||
sminL = 0
|
smin = 0
|
||||||
sminR = 0
|
|
||||||
|
|
||||||
# Initialize the progress bar
|
# Initialize the progress bar
|
||||||
pb = ProgressBar(entries_len, config, seconde=5)
|
pb = ProgressBar(entries_len, config, seconde=5)
|
||||||
|
|
||||||
ba = alignmentIterator(entries)
|
if config['alignpairedend']['trueali']:
|
||||||
|
kmer_ali = False
|
||||||
|
aligner = buildAlignment
|
||||||
|
else :
|
||||||
|
kmer_ali = True
|
||||||
|
if type(entries) == list:
|
||||||
|
forward = entries[0]
|
||||||
|
reverse = entries[1]
|
||||||
|
aligner = Kmer_similarity(forward, view2=reverse, kmer_size=config['alignpairedend']['kmersize'])
|
||||||
|
else:
|
||||||
|
aligner = Kmer_similarity(entries, kmer_size=config['alignpairedend']['kmersize'])
|
||||||
|
|
||||||
|
ba = alignmentIterator(entries, aligner)
|
||||||
|
|
||||||
i = 0
|
i = 0
|
||||||
for ali in ba:
|
for ali in ba:
|
||||||
@ -197,31 +207,33 @@ def run(config):
|
|||||||
|
|
||||||
consensus = view[i]
|
consensus = view[i]
|
||||||
|
|
||||||
if sminL > 0:
|
if not two_views:
|
||||||
if ((ali.direction=='left' and ali.score > sminL)
|
seqF = entries[i]
|
||||||
or (ali.score > sminR)):
|
else:
|
||||||
buildConsensus(ali, consensus)
|
seqF = forward[i]
|
||||||
|
|
||||||
|
if smin > 0:
|
||||||
|
if (ali.score > smin) :
|
||||||
|
buildConsensus(ali, consensus, seqF)
|
||||||
else:
|
else:
|
||||||
seqF = ali[0].wrapped
|
|
||||||
if not two_views:
|
if not two_views:
|
||||||
seq = entries[i]
|
seqR = Nuc_Seq(seqF.id, seqF[REVERSE_SEQ_COLUMN_NAME], quality = seqF[REVERSE_QUALITY_COLUMN_NAME])
|
||||||
seqR = Nuc_Seq(seq.id, seq[REVERSE_SEQ_COLUMN_NAME], quality = seq[REVERSE_QUALITY_COLUMN_NAME])
|
|
||||||
else:
|
else:
|
||||||
seqR = reverse[i]
|
seqR = reverse[i]
|
||||||
buildJoinedSequence(ali, seqR, consensus)
|
buildJoinedSequence(ali, seqR, consensus, forward=seqF)
|
||||||
|
|
||||||
consensus[b"sminL"] = sminL
|
consensus[b"smin"] = smin
|
||||||
consensus[b"sminR"] = sminR
|
|
||||||
else:
|
else:
|
||||||
buildConsensus(ali, consensus)
|
buildConsensus(ali, consensus, seqF)
|
||||||
|
|
||||||
# TODO
|
if kmer_ali :
|
||||||
# if "index" in config['alignpairedend'] and config['alignpairedend']['index'] is not None:
|
ali.free()
|
||||||
# idx = str(config['alignpairedend']['index'].next())
|
|
||||||
# consensus[b"illumina_index"] = idx
|
|
||||||
|
|
||||||
i+=1
|
i+=1
|
||||||
|
|
||||||
|
if kmer_ali :
|
||||||
|
aligner.free()
|
||||||
|
|
||||||
# Save command config in View and DMS comments
|
# Save command config in View and DMS comments
|
||||||
command_line = " ".join(sys.argv[1:])
|
command_line = " ".join(sys.argv[1:])
|
||||||
view.write_config(config, "alignpairedend", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
|
view.write_config(config, "alignpairedend", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
|
||||||
|
42
python/obitools3/dms/capi/kmer_similarity.pxd
Executable file
42
python/obitools3/dms/capi/kmer_similarity.pxd
Executable file
@ -0,0 +1,42 @@
|
|||||||
|
#cython: language_level=3
|
||||||
|
|
||||||
|
from obitools3.dms.capi.obiview cimport Obiview_p
|
||||||
|
from obitools3.dms.capi.obidmscolumn cimport OBIDMS_column_p
|
||||||
|
from obitools3.dms.capi.obitypes cimport index_t
|
||||||
|
|
||||||
|
from libc.stdint cimport int32_t, uint8_t
|
||||||
|
|
||||||
|
|
||||||
|
cdef extern from "kmer_similarity.h" nogil:
|
||||||
|
|
||||||
|
struct Obi_ali_t :
|
||||||
|
double score
|
||||||
|
int consensus_length
|
||||||
|
int overlap_length
|
||||||
|
char* consensus_seq
|
||||||
|
uint8_t* consensus_qual
|
||||||
|
int shift
|
||||||
|
char* direction
|
||||||
|
|
||||||
|
ctypedef Obi_ali_t* Obi_ali_p
|
||||||
|
|
||||||
|
|
||||||
|
void obi_free_shifted_ali(Obi_ali_p ali)
|
||||||
|
|
||||||
|
|
||||||
|
Obi_ali_p kmer_similarity(Obiview_p view1,
|
||||||
|
OBIDMS_column_p column1,
|
||||||
|
index_t idx1,
|
||||||
|
index_t elt_idx1,
|
||||||
|
Obiview_p view2,
|
||||||
|
OBIDMS_column_p column2,
|
||||||
|
index_t idx2,
|
||||||
|
index_t elt_idx2,
|
||||||
|
uint8_t kmer_size,
|
||||||
|
int32_t* kmer_pos_array,
|
||||||
|
int32_t* kmer_pos_array_height_p,
|
||||||
|
int32_t* shift_array,
|
||||||
|
int32_t* shift_array_height_p,
|
||||||
|
int32_t* shift_count_array,
|
||||||
|
int32_t* shift_count_array_height_p,
|
||||||
|
bint build_consensus)
|
@ -6,6 +6,7 @@
|
|||||||
../../../src/dna_seq_indexer.c
|
../../../src/dna_seq_indexer.c
|
||||||
../../../src/encode.c
|
../../../src/encode.c
|
||||||
../../../src/hashtable.c
|
../../../src/hashtable.c
|
||||||
|
../../../src/kmer_similarity.c
|
||||||
../../../src/linked_list.c
|
../../../src/linked_list.c
|
||||||
../../../src/murmurhash2.c
|
../../../src/murmurhash2.c
|
||||||
../../../src/obi_lcs.c
|
../../../src/obi_lcs.c
|
||||||
|
@ -3,6 +3,18 @@
|
|||||||
from cpython cimport array
|
from cpython cimport array
|
||||||
|
|
||||||
from .solexapairend import iterOnAligment
|
from .solexapairend import iterOnAligment
|
||||||
|
from .shifted_ali cimport Ali_shifted
|
||||||
|
|
||||||
|
from obitools3.dms.capi.obiview cimport Obiview_p, QUALITY_COLUMN, \
|
||||||
|
obi_set_qual_int_with_elt_idx_and_col_p_in_view, \
|
||||||
|
obi_set_str_with_elt_idx_and_col_p_in_view
|
||||||
|
|
||||||
|
from obitools3.dms.capi.obidmscolumn cimport OBIDMS_column_p
|
||||||
|
|
||||||
|
from obitools3.dms.view.view cimport View
|
||||||
|
from obitools3.dms.column.column cimport Column
|
||||||
|
|
||||||
|
|
||||||
from math import log10
|
from math import log10
|
||||||
|
|
||||||
|
|
||||||
@ -146,7 +158,7 @@ cdef class IterOnConsensus:
|
|||||||
seqBDeletion = property(get_seqBDeletion, None, None, "deletereverse's docstring")
|
seqBDeletion = property(get_seqBDeletion, None, None, "deletereverse's docstring")
|
||||||
|
|
||||||
|
|
||||||
def buildConsensus(ali, seq):
|
def buildConsensus(ali, seq, ref_tags=None):
|
||||||
cdef list quality
|
cdef list quality
|
||||||
cdef char aseq[1000]
|
cdef char aseq[1000]
|
||||||
cdef int i=0
|
cdef int i=0
|
||||||
@ -156,61 +168,82 @@ def buildConsensus(ali, seq):
|
|||||||
cdef double score
|
cdef double score
|
||||||
cdef bytes sseq
|
cdef bytes sseq
|
||||||
cdef double p
|
cdef double p
|
||||||
|
cdef OBIDMS_column_p col_p
|
||||||
|
cdef Obiview_p view_p
|
||||||
|
cdef View view
|
||||||
|
cdef Column column
|
||||||
|
|
||||||
quality = []
|
quality = []
|
||||||
|
|
||||||
if len(ali[0])>999: # TODO why?
|
if type(ali) == Ali_shifted:
|
||||||
raise AssertionError,"Too long alignemnt"
|
view = seq.view
|
||||||
|
view_p = view.pointer()
|
||||||
|
column = view[QUALITY_COLUMN]
|
||||||
|
col_p = column.pointer()
|
||||||
|
# doesn't work because uint8_t* are forced into bytes by cython (nothing read/kept beyond 0 values)
|
||||||
|
#obi_set_qual_int_with_elt_idx_and_col_p_in_view(view_p, col_p, seq.index, 0, ali.consensus_qual, ali.consensus_len)
|
||||||
|
seq.set(ref_tags.id+b"_CONS", ali.consensus_seq, quality=ali.consensus_qual, tags=ref_tags)
|
||||||
|
seq[b'ali_length'] = ali.consensus_len
|
||||||
|
seq[b'score_norm']=float(ali.score)/ali.consensus_len
|
||||||
|
seq[b'shift']=ali.shift
|
||||||
|
else:
|
||||||
|
if len(ali[0])>999: # TODO why?
|
||||||
|
raise AssertionError,"Too long alignemnt"
|
||||||
|
|
||||||
ic=IterOnConsensus(ali)
|
ic=IterOnConsensus(ali)
|
||||||
|
|
||||||
for nuc,score in ic:
|
for nuc,score in ic:
|
||||||
cnuc=nuc
|
cnuc=nuc
|
||||||
quality.append(score)
|
quality.append(score)
|
||||||
aseq[i]=cnuc[0]
|
aseq[i]=cnuc[0]
|
||||||
i+=1
|
i+=1
|
||||||
|
|
||||||
aseq[i]=0
|
aseq[i]=0
|
||||||
|
|
||||||
sseq=aseq
|
sseq=aseq
|
||||||
|
|
||||||
# Reconvert quality from array of probabilities to array of raw quality values
|
# Reconvert quality from array of probabilities to array of raw quality values
|
||||||
i=0
|
i=0
|
||||||
for p in quality:
|
for p in quality:
|
||||||
quality[i] = min(42, round(-10*log10(p)))
|
quality[i] = min(42, round(-10*log10(p)))
|
||||||
i+=1
|
i+=1
|
||||||
|
|
||||||
seq.set(ali[0].wrapped.id+b"_CONS", sseq, quality=quality, tags=ali[0].wrapped)
|
seq.set(ali[0].wrapped.id+b"_CONS", sseq, quality=quality, tags=ali[0].wrapped)
|
||||||
|
|
||||||
|
if hasattr(ali, "counter"):
|
||||||
|
seq[b'alignement_id']=ali.counter
|
||||||
|
seq[b'seq_a_single']=ic.seqASingle
|
||||||
|
seq[b'seq_b_single']=ic.seqBSingle
|
||||||
|
seq[b'seq_ab_match']=ic.seqABMatch
|
||||||
|
seq[b'seq_a_mismatch']=ic.seqAMismatch
|
||||||
|
seq[b'seq_b_mismatch']=ic.seqBMismatch
|
||||||
|
seq[b'seq_a_insertion']=ic.seqAInsertion
|
||||||
|
seq[b'seq_b_insertion']=ic.seqBInsertion-ic.seqBSingle
|
||||||
|
seq[b'seq_a_deletion']=ic.seqADeletion
|
||||||
|
seq[b'seq_b_deletion']=ic.seqBDeletion
|
||||||
|
seq[b'ali_length']=len(seq)-ic.seqASingle-ic.seqBSingle
|
||||||
|
if seq[b'ali_length']>0:
|
||||||
|
seq[b'score_norm']=float(ali.score)/seq[b'ali_length']
|
||||||
|
|
||||||
if hasattr(ali, "direction"):
|
if hasattr(ali, "direction"):
|
||||||
seq[b'direction']=ali.direction
|
seq[b'ali_direction']=ali.direction
|
||||||
if hasattr(ali, "counter"):
|
|
||||||
seq[b'alignement_id']=ali.counter
|
|
||||||
seq[b'seq_a_single']=ic.seqASingle
|
|
||||||
seq[b'seq_b_single']=ic.seqBSingle
|
|
||||||
seq[b'seq_ab_match']=ic.seqABMatch
|
|
||||||
seq[b'seq_a_mismatch']=ic.seqAMismatch
|
|
||||||
seq[b'seq_b_mismatch']=ic.seqBMismatch
|
|
||||||
seq[b'seq_a_insertion']=ic.seqAInsertion
|
|
||||||
seq[b'seq_b_insertion']=ic.seqBInsertion-ic.seqBSingle
|
|
||||||
seq[b'seq_a_deletion']=ic.seqADeletion
|
|
||||||
seq[b'seq_b_deletion']=ic.seqBDeletion
|
|
||||||
seq[b'score']=ali.score
|
seq[b'score']=ali.score
|
||||||
seq[b'ali_length']=len(seq)-ic.seqASingle-ic.seqBSingle
|
|
||||||
if seq[b'ali_length']>0:
|
|
||||||
seq[b'score_norm']=float(ali.score)/seq[b'ali_length']
|
|
||||||
seq[b'mode']=b'alignment'
|
seq[b'mode']=b'alignment'
|
||||||
|
|
||||||
return seq
|
return seq
|
||||||
|
|
||||||
|
|
||||||
def buildJoinedSequence(ali, reverse, seq):
|
def buildJoinedSequence(ali, reverse, seq, forward=None):
|
||||||
forward = ali[0].wrapped
|
if forward is not None:
|
||||||
|
forward = forward
|
||||||
|
else:
|
||||||
|
forward = ali[0].wrapped
|
||||||
s = forward.seq + reverse.seq
|
s = forward.seq + reverse.seq
|
||||||
quality = forward.quality
|
quality = forward.quality
|
||||||
quality.extend(reverse.quality)
|
quality.extend(reverse.quality)
|
||||||
seq.set(forward.id +b"_PairedEnd", s, definition=forward.definition, quality=quality, tags=forward)
|
seq.set(forward.id +b"_PairedEnd", s, definition=forward.definition, quality=quality, tags=forward)
|
||||||
seq[b"score"]=ali.score
|
seq[b"score"]=ali.score
|
||||||
seq[b"ali_dir"]=ali.direction
|
seq[b"ali_direction"]=ali.direction
|
||||||
seq[b"mode"]=b"joined"
|
seq[b"mode"]=b"joined"
|
||||||
seq[b"pairedend_limit"]=len(forward)
|
seq[b"pairedend_limit"]=len(forward)
|
||||||
return seq
|
return seq
|
||||||
|
30
python/obitools3/libalign/shifted_ali.pxd
Executable file
30
python/obitools3/libalign/shifted_ali.pxd
Executable file
@ -0,0 +1,30 @@
|
|||||||
|
#cython: language_level=3
|
||||||
|
|
||||||
|
from obitools3.dms.capi.kmer_similarity cimport kmer_similarity, Obi_ali_p
|
||||||
|
from libc.stdint cimport int32_t, uint8_t
|
||||||
|
from obitools3.dms.capi.obiview cimport Obiview_p
|
||||||
|
from obitools3.dms.capi.obidmscolumn cimport OBIDMS_column_p
|
||||||
|
|
||||||
|
|
||||||
|
cdef class Ali_shifted:
|
||||||
|
cdef Obi_ali_p _pointer
|
||||||
|
cdef inline Obi_ali_p pointer(self)
|
||||||
|
cpdef free(self)
|
||||||
|
@staticmethod
|
||||||
|
cdef Ali_shifted new_ali(Obi_ali_p ali_p)
|
||||||
|
|
||||||
|
|
||||||
|
cdef class Kmer_similarity:
|
||||||
|
cdef uint8_t kmer_size
|
||||||
|
cdef int32_t* kmer_pos_array_p
|
||||||
|
cdef int32_t kmer_pos_array_height_a[1]
|
||||||
|
cdef int32_t* shift_array_p
|
||||||
|
cdef int32_t shift_array_height_a[1]
|
||||||
|
cdef int32_t* shift_count_array_p
|
||||||
|
cdef int32_t shift_count_array_height_a[1]
|
||||||
|
cdef Obiview_p view1_p
|
||||||
|
cdef OBIDMS_column_p column1_p
|
||||||
|
cdef Obiview_p view2_p
|
||||||
|
cdef OBIDMS_column_p column2_p
|
||||||
|
cdef bint build_consensus
|
||||||
|
cpdef free(self)
|
119
python/obitools3/libalign/shifted_ali.pyx
Executable file
119
python/obitools3/libalign/shifted_ali.pyx
Executable file
@ -0,0 +1,119 @@
|
|||||||
|
#cython: language_level=3
|
||||||
|
|
||||||
|
from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN
|
||||||
|
|
||||||
|
from obitools3.dms.obiseq cimport Nuc_Seq_Stored
|
||||||
|
from obitools3.dms.capi.kmer_similarity cimport kmer_similarity, Obi_ali_p, obi_free_shifted_ali
|
||||||
|
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
|
||||||
|
from obitools3.dms.column.column cimport Column
|
||||||
|
|
||||||
|
from libc.stdlib cimport free
|
||||||
|
|
||||||
|
|
||||||
|
cdef class Ali_shifted:
|
||||||
|
|
||||||
|
cdef inline Obi_ali_p pointer(self) :
|
||||||
|
return <Obi_ali_p>(self._pointer)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
cdef Ali_shifted new_ali(Obi_ali_p ali_p) : # not __init__ method because need to pass pointer
|
||||||
|
ali = Ali_shifted()
|
||||||
|
ali._pointer = ali_p
|
||||||
|
return ali
|
||||||
|
|
||||||
|
@property
|
||||||
|
def score(self):
|
||||||
|
return self.pointer().score
|
||||||
|
|
||||||
|
@property
|
||||||
|
def consensus_len(self):
|
||||||
|
return self.pointer().consensus_length
|
||||||
|
|
||||||
|
@property
|
||||||
|
def overlap_len(self):
|
||||||
|
return self.pointer().overlap_length
|
||||||
|
|
||||||
|
@property
|
||||||
|
def consensus_seq(self):
|
||||||
|
return self.pointer().consensus_seq
|
||||||
|
|
||||||
|
@property
|
||||||
|
def shift(self):
|
||||||
|
return self.pointer().shift
|
||||||
|
|
||||||
|
@property
|
||||||
|
def consensus_qual(self): # must return list because uint8_t* are forced into bytes by cython which won't keep beyond the first 0 value
|
||||||
|
qual_list = []
|
||||||
|
for i in range(self.consensus_len):
|
||||||
|
qual_list.append((self.pointer().consensus_qual)[i])
|
||||||
|
return qual_list
|
||||||
|
|
||||||
|
@property
|
||||||
|
def direction(self):
|
||||||
|
return self.pointer().direction
|
||||||
|
|
||||||
|
cpdef free(self):
|
||||||
|
obi_free_shifted_ali(self.pointer())
|
||||||
|
|
||||||
|
|
||||||
|
cdef class Kmer_similarity:
|
||||||
|
def __init__(self, View_NUC_SEQS view1, Column column1=None, View_NUC_SEQS view2=None, Column column2=None, uint8_t kmer_size=3, build_consensus=True) :
|
||||||
|
cdef Column default_seq_col
|
||||||
|
if kmer_size < 1 or kmer_size > 3:
|
||||||
|
raise Exception("Kmer size to compute kmer similarity must be >=1 or <=4")
|
||||||
|
self.kmer_pos_array_p = NULL
|
||||||
|
self.shift_array_p = NULL
|
||||||
|
self.shift_count_array_p = NULL
|
||||||
|
self.kmer_pos_array_height_a[0] = 0
|
||||||
|
self.shift_array_height_a[0] = 0
|
||||||
|
self.shift_count_array_height_a[0] = 0
|
||||||
|
self.kmer_size = kmer_size
|
||||||
|
self.build_consensus = build_consensus
|
||||||
|
self.view1_p = view1.pointer()
|
||||||
|
if column1 is not None:
|
||||||
|
self.column1_p = column1.pointer()
|
||||||
|
else:
|
||||||
|
if type(view1) != View_NUC_SEQS:
|
||||||
|
raise Exception("Kmer similarity with no sequence column given must be given a NUC_SEQS view")
|
||||||
|
default_seq_col = view1[NUC_SEQUENCE_COLUMN]
|
||||||
|
self.column1_p = default_seq_col.pointer()
|
||||||
|
if view2 is not None:
|
||||||
|
self.view2_p = view2.pointer()
|
||||||
|
else:
|
||||||
|
view2 = view1
|
||||||
|
self.view2_p = self.view1_p
|
||||||
|
if column2 is not None:
|
||||||
|
self.column2_p = column2.pointer()
|
||||||
|
else:
|
||||||
|
if type(view2) != View_NUC_SEQS:
|
||||||
|
raise Exception("Kmer similarity with no sequence column given must be given a NUC_SEQS view")
|
||||||
|
default_seq_col = view2[NUC_SEQUENCE_COLUMN]
|
||||||
|
self.column2_p = default_seq_col.pointer()
|
||||||
|
|
||||||
|
|
||||||
|
def __call__(self, Nuc_Seq_Stored seq1, Nuc_Seq_Stored seq2):
|
||||||
|
cdef Obi_ali_p ali_p
|
||||||
|
cdef Ali_shifted ali
|
||||||
|
ali_p = kmer_similarity(self.view1_p, self.column1_p, seq1.index, 0, \
|
||||||
|
self.view2_p, self.column2_p, seq2.index, 0, \
|
||||||
|
self.kmer_size, \
|
||||||
|
self.kmer_pos_array_p, self.kmer_pos_array_height_a, \
|
||||||
|
self.shift_array_p, self.shift_array_height_a, \
|
||||||
|
self.shift_count_array_p, self.shift_count_array_height_a,
|
||||||
|
self.build_consensus)
|
||||||
|
|
||||||
|
ali = Ali_shifted.new_ali(ali_p)
|
||||||
|
return ali
|
||||||
|
|
||||||
|
cpdef free(self):
|
||||||
|
if self.kmer_pos_array_p is not NULL:
|
||||||
|
free(self.kmer_pos_array_p)
|
||||||
|
self.kmer_pos_array_height_a[0] = 0
|
||||||
|
if (self.shift_array_p is not NULL) :
|
||||||
|
free(self.shift_array_p)
|
||||||
|
self.shift_array_height_a[0] = 0
|
||||||
|
if (self.shift_count_array_p is not NULL) :
|
||||||
|
free(self.shift_count_array_p)
|
||||||
|
self.shift_count_array_height_a[0] = 0
|
||||||
|
|
||||||
|
|
@ -6,6 +6,7 @@
|
|||||||
../../src/dna_seq_indexer.c
|
../../src/dna_seq_indexer.c
|
||||||
../../src/encode.c
|
../../src/encode.c
|
||||||
../../src/hashtable.c
|
../../src/hashtable.c
|
||||||
|
../../src/kmer_similarity.c
|
||||||
../../src/linked_list.c
|
../../src/linked_list.c
|
||||||
../../src/murmurhash2.c
|
../../src/murmurhash2.c
|
||||||
../../src/obi_lcs.c
|
../../src/obi_lcs.c
|
||||||
|
483
src/kmer_similarity.c
Executable file
483
src/kmer_similarity.c
Executable file
@ -0,0 +1,483 @@
|
|||||||
|
/****************************************************************************
|
||||||
|
* Kmer similarity computation functions *
|
||||||
|
****************************************************************************/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @file kmer_similarity.c
|
||||||
|
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||||
|
* @date January 7th 2019
|
||||||
|
* @brief Kmer similarity computation functions.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <inttypes.h>
|
||||||
|
|
||||||
|
#include "utils.h"
|
||||||
|
#include "obidebug.h"
|
||||||
|
#include "obierrno.h"
|
||||||
|
#include "obitypes.h"
|
||||||
|
|
||||||
|
#include "kmer_similarity.h"
|
||||||
|
#include "obidmscolumn.h"
|
||||||
|
#include "obiview.h"
|
||||||
|
#include "encode.h"
|
||||||
|
#include "dna_seq_indexer.h"
|
||||||
|
|
||||||
|
|
||||||
|
#define DEBUG_LEVEL 0 // TODO has to be defined somewhere else (cython compil flag?)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**************************************************************************
|
||||||
|
*
|
||||||
|
* D E C L A R A T I O N O F T H E P R I V A T E F U N C T I O N S
|
||||||
|
*
|
||||||
|
**************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/************************************************************************
|
||||||
|
*
|
||||||
|
* D E F I N I T I O N O F T H E P R I V A T E F U N C T I O N S
|
||||||
|
*
|
||||||
|
************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************
|
||||||
|
*
|
||||||
|
* D E F I N I T I O N O F T H E P U B L I C F U N C T I O N S
|
||||||
|
*
|
||||||
|
**********************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
void obi_free_shifted_ali(Obi_ali_p ali)
|
||||||
|
{
|
||||||
|
free(ali->consensus_seq);
|
||||||
|
free(ali->consensus_qual);
|
||||||
|
free(ali);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Obi_ali_p kmer_similarity(Obiview_p view1, OBIDMS_column_p column1, index_t idx1, index_t elt_idx1,
|
||||||
|
Obiview_p view2, OBIDMS_column_p column2, index_t idx2, index_t elt_idx2,
|
||||||
|
uint8_t kmer_size,
|
||||||
|
int32_t* kmer_pos_array, int32_t* kmer_pos_array_height_p,
|
||||||
|
int32_t* shift_array, int32_t* shift_array_height_p,
|
||||||
|
int32_t* shift_count_array, int32_t* shift_count_array_length_p,
|
||||||
|
bool build_consensus)
|
||||||
|
{
|
||||||
|
Obi_ali_p ali = NULL;
|
||||||
|
int i, j;
|
||||||
|
OBIDMS_column_p qual_col1;
|
||||||
|
OBIDMS_column_p qual_col2;
|
||||||
|
double score = 0.0;
|
||||||
|
Obi_blob_p blob1 = NULL;
|
||||||
|
Obi_blob_p blob2 = NULL;
|
||||||
|
Obi_blob_p temp_blob = NULL;
|
||||||
|
char* seq1 = NULL;
|
||||||
|
char* seq2 = NULL;
|
||||||
|
int32_t len1;
|
||||||
|
int32_t len2;
|
||||||
|
int32_t temp_len;
|
||||||
|
int32_t pos;
|
||||||
|
int32_t shift1;
|
||||||
|
int32_t shift2;
|
||||||
|
const uint8_t* qual1;
|
||||||
|
const uint8_t* qual2;
|
||||||
|
int qual1_len;
|
||||||
|
int qual2_len;
|
||||||
|
char* consensus_seq;
|
||||||
|
uint8_t* consensus_qual;
|
||||||
|
int32_t empty_part;
|
||||||
|
int32_t nuc_idx;
|
||||||
|
int32_t kmer_idx;
|
||||||
|
int32_t kmer_count;
|
||||||
|
int32_t best_shift_idx;
|
||||||
|
int32_t best_shift;
|
||||||
|
int32_t max_common_kmers;
|
||||||
|
int32_t overlap_len;
|
||||||
|
int32_t consensus_len;
|
||||||
|
int32_t cons_shift;
|
||||||
|
uint8_t kmer;
|
||||||
|
byte_t nuc;
|
||||||
|
uint8_t encoding;
|
||||||
|
int total_len;
|
||||||
|
int shift;
|
||||||
|
int32_t shift_idx;
|
||||||
|
int32_t kmer_pos_array_height = *kmer_pos_array_height_p;
|
||||||
|
int32_t shift_array_height = *shift_array_height_p;
|
||||||
|
int32_t shift_count_array_length = *shift_count_array_length_p;
|
||||||
|
bool free_blob1 = false;
|
||||||
|
|
||||||
|
// Check kmer size
|
||||||
|
if ((kmer_size < 1) || (kmer_size > 4))
|
||||||
|
{
|
||||||
|
obi_set_errno(OBI_ALIGN_ERROR);
|
||||||
|
obidebug(1, "\nError when computing the kmer similarity between two sequences: the kmer size must be >= 1 or <= 4");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get sequence blobs
|
||||||
|
blob1 = obi_get_blob_with_elt_idx_and_col_p_in_view(view1, column1, idx1, elt_idx1);
|
||||||
|
if (blob1 == NULL)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError getting the blob of the 1st sequence when computing the kmer similarity between two sequences");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
blob2 = obi_get_blob_with_elt_idx_and_col_p_in_view(view2, column2, idx2, elt_idx2);
|
||||||
|
if (blob2 == NULL)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError getting the blob of the 2nd sequence when computing the kmer similarity between two sequences");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Choose the shortest sequence to save kmer positions in array
|
||||||
|
len1 = blob1->length_decoded_value;
|
||||||
|
len2 = blob2->length_decoded_value;
|
||||||
|
if (len2 > len1)
|
||||||
|
{
|
||||||
|
temp_len = len1;
|
||||||
|
len1 = len2;
|
||||||
|
len2 = temp_len;
|
||||||
|
temp_blob = blob1;
|
||||||
|
blob1 = blob2;
|
||||||
|
blob2 = temp_blob;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Force encoding on 2 bits by replacing ambiguous nucleotides by 'a's
|
||||||
|
if (blob1->element_size == 4)
|
||||||
|
{
|
||||||
|
seq1 = obi_blob_to_seq(blob1);
|
||||||
|
for (i=0; i<len1; i++)
|
||||||
|
{
|
||||||
|
if (seq1[i] != 'a' && seq1[i] != 't' && seq1[i] != 'g' && seq1[i] != 'c')
|
||||||
|
seq1[i] = 'a';
|
||||||
|
}
|
||||||
|
blob1 = obi_seq_to_blob(seq1);
|
||||||
|
free_blob1 = true;
|
||||||
|
}
|
||||||
|
if (blob2->element_size == 4)
|
||||||
|
{
|
||||||
|
seq2 = obi_blob_to_seq(blob2);
|
||||||
|
for (i=0; i<len2; i++)
|
||||||
|
{
|
||||||
|
if (seq2[i] != 'a' && seq2[i] != 't' && seq2[i] != 'g' && seq2[i] != 'c')
|
||||||
|
seq2[i] = 'a';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reverse complement the other sequence // TODO blob to seq to blob for now, could be more efficient
|
||||||
|
if (seq2 == NULL)
|
||||||
|
seq2 = obi_blob_to_seq(blob2);
|
||||||
|
seq2 = reverse_complement_sequence(seq2);
|
||||||
|
blob2 = obi_seq_to_blob(seq2);
|
||||||
|
|
||||||
|
// Save total length for the shift counts array
|
||||||
|
total_len = len1 + len2 + 1; // +1 for shift 0
|
||||||
|
|
||||||
|
// Allocate or reallocate memory for the array of shift counts if necessary
|
||||||
|
if (shift_count_array == NULL)
|
||||||
|
{
|
||||||
|
shift_count_array_length = total_len;
|
||||||
|
shift_count_array = (int32_t*) malloc(shift_count_array_length * sizeof(int32_t));
|
||||||
|
if (shift_count_array == NULL)
|
||||||
|
{
|
||||||
|
obi_set_errno(OBI_MALLOC_ERROR);
|
||||||
|
obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (total_len >= shift_count_array_length)
|
||||||
|
{
|
||||||
|
shift_count_array_length = total_len;
|
||||||
|
shift_count_array = (int32_t*) realloc(shift_count_array, shift_count_array_length * sizeof(int32_t));
|
||||||
|
if (shift_count_array == NULL)
|
||||||
|
{
|
||||||
|
obi_set_errno(OBI_MALLOC_ERROR);
|
||||||
|
obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allocate or reallocate memory for the array of shifts if necessary
|
||||||
|
if (shift_array == NULL)
|
||||||
|
{
|
||||||
|
shift_array_height = total_len;
|
||||||
|
shift_array = (int32_t*) malloc(ARRAY_LENGTH * shift_array_height * sizeof(int32_t));
|
||||||
|
if (shift_array == NULL)
|
||||||
|
{
|
||||||
|
obi_set_errno(OBI_MALLOC_ERROR);
|
||||||
|
obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (len1 >= shift_array_height)
|
||||||
|
{
|
||||||
|
shift_array_height = total_len;
|
||||||
|
shift_array = (int32_t*) realloc(shift_array, ARRAY_LENGTH * shift_array_height * sizeof(int32_t));
|
||||||
|
if (shift_array == NULL)
|
||||||
|
{
|
||||||
|
obi_set_errno(OBI_MALLOC_ERROR);
|
||||||
|
obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allocate or reallocate memory for the array of positions if necessary
|
||||||
|
if (kmer_pos_array == NULL)
|
||||||
|
{
|
||||||
|
kmer_pos_array_height = len1;
|
||||||
|
kmer_pos_array = (int32_t*) malloc(ARRAY_LENGTH * kmer_pos_array_height * sizeof(int32_t));
|
||||||
|
if (kmer_pos_array == NULL)
|
||||||
|
{
|
||||||
|
obi_set_errno(OBI_MALLOC_ERROR);
|
||||||
|
obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (len1 >= kmer_pos_array_height)
|
||||||
|
{
|
||||||
|
kmer_pos_array_height = len1;
|
||||||
|
kmer_pos_array = (int32_t*) realloc(kmer_pos_array, ARRAY_LENGTH * kmer_pos_array_height * sizeof(int32_t));
|
||||||
|
if (kmer_pos_array == NULL)
|
||||||
|
{
|
||||||
|
obi_set_errno(OBI_MALLOC_ERROR);
|
||||||
|
obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initialize all positions to -1
|
||||||
|
for (i=0; i<(ARRAY_LENGTH * kmer_pos_array_height); i++)
|
||||||
|
kmer_pos_array[i] = -1;
|
||||||
|
// Initialize all shifts to the maximum value of an int32_t
|
||||||
|
for (i=0; i<(ARRAY_LENGTH * shift_array_height); i++)
|
||||||
|
shift_array[i] = INT32_MAX;
|
||||||
|
//memset(shift_array, 1, ARRAY_LENGTH * shift_array_height * sizeof(int32_t)); // why doesn't this work?
|
||||||
|
// Initialize all shift counts to 0
|
||||||
|
memset(shift_count_array, 0, shift_count_array_length * sizeof(int32_t));
|
||||||
|
|
||||||
|
*kmer_pos_array_height_p = kmer_pos_array_height;
|
||||||
|
*shift_array_height_p = shift_array_height;
|
||||||
|
*shift_count_array_length_p = shift_count_array_length;
|
||||||
|
|
||||||
|
// Fill array with positions of kmers in the shortest sequence
|
||||||
|
encoding = blob1->element_size;
|
||||||
|
kmer_count = len1 - kmer_size + 1;
|
||||||
|
for (kmer_idx=0; kmer_idx < kmer_count; kmer_idx++)
|
||||||
|
{
|
||||||
|
kmer = 0;
|
||||||
|
for (nuc_idx=kmer_idx; nuc_idx<(kmer_idx+kmer_size); nuc_idx++)
|
||||||
|
{
|
||||||
|
nuc = get_nucleotide_from_encoded_seq(blob1->value, nuc_idx, encoding);
|
||||||
|
kmer <<= encoding;
|
||||||
|
kmer |= nuc;
|
||||||
|
}
|
||||||
|
|
||||||
|
i = 0;
|
||||||
|
while (kmer_pos_array[(kmer*kmer_pos_array_height)+i] != -1)
|
||||||
|
i++;
|
||||||
|
kmer_pos_array[(kmer*kmer_pos_array_height)+i] = kmer_idx;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compare positions of kmers between both sequences and store shifts
|
||||||
|
kmer_count = blob2->length_decoded_value - kmer_size + 1;
|
||||||
|
for (kmer_idx=0; kmer_idx < kmer_count; kmer_idx++)
|
||||||
|
{
|
||||||
|
kmer = 0;
|
||||||
|
for (nuc_idx=kmer_idx; nuc_idx<(kmer_idx+kmer_size); nuc_idx++)
|
||||||
|
{
|
||||||
|
nuc = get_nucleotide_from_encoded_seq(blob2->value, nuc_idx, encoding);
|
||||||
|
kmer <<= encoding;
|
||||||
|
kmer |= nuc;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get the index at which the new shifts should be stored for that kmer
|
||||||
|
j = 0;
|
||||||
|
while ((shift_array[(kmer*shift_array_height)+j] != INT32_MAX) && (j<shift_array_height))
|
||||||
|
j++;
|
||||||
|
// Store the shift between the kmer in the 1st and the 2nd sequence
|
||||||
|
i = 0;
|
||||||
|
while ((kmer_pos_array[(kmer*kmer_pos_array_height)+i] != -1) && (i<kmer_pos_array_height))
|
||||||
|
{
|
||||||
|
shift_array[(kmer*shift_array_height)+j] = kmer_idx - kmer_pos_array[(kmer*kmer_pos_array_height)+i];
|
||||||
|
i++;
|
||||||
|
j++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Count how many times each shift is represented
|
||||||
|
for (kmer=0;;kmer++)
|
||||||
|
{
|
||||||
|
for (i=0;;i++)
|
||||||
|
{
|
||||||
|
shift = shift_array[(kmer*shift_array_height)+i];
|
||||||
|
if ((shift == INT32_MAX) || (i==shift_array_height))
|
||||||
|
break;
|
||||||
|
shift_idx = shift + len1;
|
||||||
|
shift_count_array[shift_idx]++;
|
||||||
|
}
|
||||||
|
if (kmer == (ARRAY_LENGTH-1)) // stop condition can't be in for line because kmer can't be >= 255
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find the most represented shift
|
||||||
|
best_shift_idx = 0;
|
||||||
|
max_common_kmers = 0;
|
||||||
|
empty_part = (shift_count_array_length-1)/2 - len1;
|
||||||
|
for (i=empty_part; i < (shift_count_array_length - empty_part); i++) // skipping empty parts of the array
|
||||||
|
{
|
||||||
|
if (shift_count_array[i] > max_common_kmers)
|
||||||
|
{
|
||||||
|
best_shift_idx = i;
|
||||||
|
max_common_kmers = shift_count_array[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
best_shift = best_shift_idx - len1;
|
||||||
|
|
||||||
|
if (best_shift > 0)
|
||||||
|
overlap_len = len2 - best_shift;
|
||||||
|
else
|
||||||
|
overlap_len = len1 + best_shift;
|
||||||
|
|
||||||
|
score = max_common_kmers + kmer_size - 1;
|
||||||
|
|
||||||
|
ali = (Obi_ali_p) malloc(sizeof(Obi_ali_t));
|
||||||
|
if (ali == NULL)
|
||||||
|
{
|
||||||
|
obi_set_errno(OBI_MALLOC_ERROR);
|
||||||
|
obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory for the result structure");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Save result in Obi_ali structure
|
||||||
|
ali->score = score;
|
||||||
|
ali->consensus_length = 0;
|
||||||
|
ali->overlap_length = overlap_len;
|
||||||
|
ali->shift = best_shift;
|
||||||
|
ali->consensus_seq = NULL;
|
||||||
|
ali->consensus_qual = NULL;
|
||||||
|
|
||||||
|
// Build the consensus sequence if asked
|
||||||
|
if (build_consensus)
|
||||||
|
{
|
||||||
|
// Get the quality columns
|
||||||
|
qual_col1 = obi_view_get_column(view1, QUALITY_COLUMN);
|
||||||
|
if (qual_col1 == NULL)
|
||||||
|
{
|
||||||
|
obi_set_errno(OBI_ALIGN_ERROR);
|
||||||
|
obidebug(1, "\nError when computing the kmer similarity between two sequences: can't open quality column to build the consensus sequence");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
qual_col2 = obi_view_get_column(view2, QUALITY_COLUMN);
|
||||||
|
if (qual_col2 == NULL)
|
||||||
|
{
|
||||||
|
obi_set_errno(OBI_ALIGN_ERROR);
|
||||||
|
obidebug(1, "\nError when computing the kmer similarity between two sequences: can't open quality column to build the consensus sequence");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get the quality arrays
|
||||||
|
qual1 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view1, qual_col1, idx1, 0, &qual1_len);
|
||||||
|
if (qual1 == NULL)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError getting the blob of the 1st sequence when computing the kmer similarity between two sequences");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
qual2 = obi_get_qual_int_with_elt_idx_and_col_p_in_view(view2, qual_col2, idx2, 0, &qual2_len);
|
||||||
|
if (qual2 == NULL)
|
||||||
|
{
|
||||||
|
obidebug(1, "\nError getting the blob of the 1st sequence when computing the kmer similarity between two sequences");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decode the first sequence if not already done
|
||||||
|
if (seq1 == NULL)
|
||||||
|
seq1 = obi_blob_to_seq(blob1);
|
||||||
|
|
||||||
|
if (best_shift > 0)
|
||||||
|
consensus_len = len1 + len2 - overlap_len;
|
||||||
|
else
|
||||||
|
consensus_len = overlap_len;
|
||||||
|
|
||||||
|
// Allocate memory for consensus sequence
|
||||||
|
consensus_seq = (char*) malloc(consensus_len + 1 * sizeof(char)); // TODO keep malloced too maybe
|
||||||
|
if (consensus_seq == NULL)
|
||||||
|
{
|
||||||
|
obi_set_errno(OBI_MALLOC_ERROR);
|
||||||
|
obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory for the consensus sequence");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allocate memory for consensus quality
|
||||||
|
consensus_qual = (uint8_t*) malloc(consensus_len * sizeof(uint8_t));
|
||||||
|
if (consensus_qual == NULL)
|
||||||
|
{
|
||||||
|
obi_set_errno(OBI_MALLOC_ERROR);
|
||||||
|
obidebug(1, "\nError computing the kmer similarity between two sequences: error allocating memory for the consensus quality");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
ali->consensus_length = consensus_len;
|
||||||
|
ali->consensus_seq = consensus_seq;
|
||||||
|
ali->consensus_qual = consensus_qual;
|
||||||
|
|
||||||
|
// Compute consensus-relative shift for each sequence and store direction
|
||||||
|
if (best_shift >= 0)
|
||||||
|
{
|
||||||
|
shift1 = 0;
|
||||||
|
shift2 = best_shift;
|
||||||
|
strcpy(ali->direction, "left");
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
shift1 = -(best_shift);
|
||||||
|
shift2 = 0;
|
||||||
|
strcpy(ali->direction, "right");
|
||||||
|
}
|
||||||
|
|
||||||
|
// If positive shift, copy first part of second sequence
|
||||||
|
if (best_shift > 0)
|
||||||
|
{
|
||||||
|
strncpy(consensus_seq, seq2, best_shift);
|
||||||
|
memcpy(consensus_qual, qual2, best_shift*sizeof(uint8_t));
|
||||||
|
cons_shift = best_shift;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
cons_shift = 0;
|
||||||
|
|
||||||
|
// Build consensus part
|
||||||
|
for (pos=0; pos<overlap_len; pos++)
|
||||||
|
{
|
||||||
|
if (qual1[pos+shift1] >= qual2[pos+shift2])
|
||||||
|
consensus_seq[pos+cons_shift] = seq1[pos+shift1];
|
||||||
|
else
|
||||||
|
consensus_seq[pos+cons_shift] = seq2[pos+shift2];
|
||||||
|
consensus_qual[pos+cons_shift] = round((qual1[pos+shift1] + qual2[pos+shift2])/2); // TODO maybe use the (p1*(1-p2/3)) formula (but parenthesis bug???)
|
||||||
|
}
|
||||||
|
|
||||||
|
// If positive shift, copy last part of first sequence
|
||||||
|
if (best_shift > 0)
|
||||||
|
{
|
||||||
|
strncpy(consensus_seq+cons_shift+overlap_len, seq1+overlap_len, len1-overlap_len);
|
||||||
|
memcpy(consensus_qual+cons_shift+overlap_len, qual1+overlap_len, (len1-overlap_len)*sizeof(uint8_t));
|
||||||
|
}
|
||||||
|
|
||||||
|
consensus_seq[consensus_len] = '\0';
|
||||||
|
}
|
||||||
|
|
||||||
|
if (seq1 != NULL)
|
||||||
|
free(seq1);
|
||||||
|
if (free_blob1)
|
||||||
|
free(blob1);
|
||||||
|
free(seq2);
|
||||||
|
free(blob2);
|
||||||
|
|
||||||
|
return ali;
|
||||||
|
}
|
||||||
|
|
120
src/kmer_similarity.h
Executable file
120
src/kmer_similarity.h
Executable file
@ -0,0 +1,120 @@
|
|||||||
|
/****************************************************************************
|
||||||
|
* Header file for Kmer similarity computation functions *
|
||||||
|
****************************************************************************/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @file kmer_similarity.h
|
||||||
|
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||||
|
* @date January 7th 2019
|
||||||
|
* @brief Header file for Kmer similarity computation functions.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef KMER_SIMILARITY_H_
|
||||||
|
#define KMER_SIMILARITY_H_
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#include "obitypes.h"
|
||||||
|
#include "obidmscolumn.h"
|
||||||
|
#include "obiview.h"
|
||||||
|
|
||||||
|
|
||||||
|
#define ARRAY_LENGTH (256)
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Alignment structure, with informations about the similarity and to rebuild the alignment.
|
||||||
|
*/
|
||||||
|
typedef struct Obi_ali {
|
||||||
|
double score; /**< Alignment score.
|
||||||
|
*/
|
||||||
|
int consensus_length; /**< Length of the final consensus sequence.
|
||||||
|
*/
|
||||||
|
int overlap_length; /**< Length of the overlap between the aligned sequences.
|
||||||
|
*/
|
||||||
|
char* consensus_seq; /**< Consensus sequence built with the computed overlap.
|
||||||
|
*/
|
||||||
|
uint8_t* consensus_qual; /**< Consensus quality built with the computed overlap.
|
||||||
|
*/
|
||||||
|
int shift; /**< Shift chosen to align the sequences (for shifted alignment).
|
||||||
|
*/
|
||||||
|
char direction[6]; /**< Alignement direction (positive/right or negative/left shift) (for shifted alignment).
|
||||||
|
*/ // TODO but sequences switched around depending on size..... discuss
|
||||||
|
} Obi_ali_t, *Obi_ali_p;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Frees an Obi_ali_p structure and all its elements.
|
||||||
|
*
|
||||||
|
* @warning The pointer sent becomes unusable.
|
||||||
|
*
|
||||||
|
* @param ali The pointer on the Obi_ali_p structure to free.
|
||||||
|
*
|
||||||
|
* @since January 2019
|
||||||
|
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||||
|
*/
|
||||||
|
void obi_free_shifted_ali(Obi_ali_p ali);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Function computing the kmer similarity of two sequences stored in views.
|
||||||
|
*
|
||||||
|
* The similarity is computing this way: the positions of identical kmers in both sequences are
|
||||||
|
* compared, and the most represented shift is chosen. The similarity is then calculated as:
|
||||||
|
* kmer_similarity = number_of_common_kmers_with_the_chosen_shift + kmer_size - 1
|
||||||
|
*
|
||||||
|
* @warning Several pointers and structures passed to or returned by the function have to be freed by the caller:
|
||||||
|
* - kmer_pos_array
|
||||||
|
* - shift_array
|
||||||
|
* - shift_count_array
|
||||||
|
* - the Obi_ali_p structure returned
|
||||||
|
*
|
||||||
|
* @param view1 A pointer on the view containing the first sequence.
|
||||||
|
* @param column1 A pointer on the column containing the first sequence.
|
||||||
|
* @param idx1 The index of the first sequence in view1.
|
||||||
|
* @param elt_idx1 The element index of the first sequence in column1.
|
||||||
|
* @param view2 A pointer on the view containing the second sequence.
|
||||||
|
* @param column2 A pointer on the column containing the second sequence.
|
||||||
|
* @param idx2 The index of the second sequence in view2.
|
||||||
|
* @param elt_idx2 The element index of the second sequence in column2.
|
||||||
|
* @param kmer_size The kmer length to use. Must be >= 1 <= 4.
|
||||||
|
* @param kmer_pos_array The array used to store kmer positions. If NULL, allocated by the function.
|
||||||
|
* If needed, reallocated to a bigger size.
|
||||||
|
* @param kmer_pos_array_height_p A pointer on an integer corresponding to the size (number of elements)
|
||||||
|
* allocated for kmer_pos_array. Updated by the function as needed.
|
||||||
|
* @param shift_array The array used to store kmer shifts. If NULL, allocated by the function.
|
||||||
|
* If needed, reallocated to a bigger size.
|
||||||
|
* @param shift_array_height_p A pointer on an integer corresponding to the size (number of elements)
|
||||||
|
* allocated for shift_array. Updated by the function as needed.
|
||||||
|
* @param shift_count_array The array used to store shift counts. If NULL, allocated by the function.
|
||||||
|
* If needed, reallocated to a bigger size.
|
||||||
|
* @param shift_count_array_height_p A pointer on an integer corresponding to the size (number of elements)
|
||||||
|
* allocated for shift_count_array. Updated by the function as needed.
|
||||||
|
* @param build_consensus A boolean indicating whether the function should build the consensus sequence and quality. // TODO option to build consensus without quality?
|
||||||
|
*
|
||||||
|
* @returns A pointer on an Obi_ali_p structure containing the results.
|
||||||
|
* @retval NULL if an error occurred.
|
||||||
|
*
|
||||||
|
* @since January 2019
|
||||||
|
* @author Celine Mercier (celine.mercier@metabarcoding.org)
|
||||||
|
*/
|
||||||
|
Obi_ali_p kmer_similarity(Obiview_p view1,
|
||||||
|
OBIDMS_column_p column1,
|
||||||
|
index_t idx1,
|
||||||
|
index_t elt_idx1,
|
||||||
|
Obiview_p view2,
|
||||||
|
OBIDMS_column_p column2,
|
||||||
|
index_t idx2,
|
||||||
|
index_t elt_idx2,
|
||||||
|
uint8_t kmer_size,
|
||||||
|
int32_t* kmer_pos_array,
|
||||||
|
int32_t* kmer_pos_array_height_p,
|
||||||
|
int32_t* shift_array,
|
||||||
|
int32_t* shift_array_height_p,
|
||||||
|
int32_t* shift_count_array,
|
||||||
|
int32_t* shift_count_array_height_p,
|
||||||
|
bool build_consensus);
|
||||||
|
|
||||||
|
|
||||||
|
#endif /* KMER_SIMILARITY_H_ */
|
Reference in New Issue
Block a user