Cython alignment library

This commit is contained in:
Celine Mercier
2018-02-12 13:28:20 +01:00
parent 156fb04e88
commit f5a00c9322
13 changed files with 1607 additions and 0 deletions

View File

@ -0,0 +1,166 @@
from obitools3.dms.obiseq import Nuc_Seq_Stored, Nuc_Seq
from functools import reduce
import struct
from copy import deepcopy
class GappedPositionException(Exception):
pass
class AlignedSequence(Nuc_Seq): # TODO discuss. can be both Nuc_Seq or Nuc_Seq_Stored....
def __init__(self, seq) :
self.wrapped = seq
self._length=len(seq)
self._gaps=[[self._length,0]]
def clone(self):
seq = AlignedSequence(self.wrapped)
seq._gaps=deepcopy(self._gaps)
seq._length=reduce(lambda x,y:x+y, (z[0]+z[1] for z in self._gaps),0)
return seq
def setGaps(self, value):
'''
Set gap vector to an AlignedSequence.
Gap vector describes the gap positions on a sequence.
It is a gap of couple. The first couple member is the count
of sequence letter, the second one is the gap length.
@param value: a list of length 2 list describing gap positions
@type value: list of couple
'''
assert isinstance(value, list),'Gap vector must be a list'
assert reduce(lambda x,y: x and y,
(isinstance(z, list) and len(z)==2 for z in value),
True),"Value must be a list of length 2 list"
lseq = reduce(lambda x,y:x+y, (z[0] for z in value),0)
assert lseq==len(self.wrapped),"Gap vector incompatible with the sequence"
self._gaps = value
self._length=reduce(lambda x,y:x+y, (z[0]+z[1] for z in value),0)
def getGaps(self):
return tuple(self._gaps)
gaps = property(getGaps, setGaps, None, "Gaps's Docstring")
def _getIndice(self,pos):
i=0
cpos=0
for s,g in self._gaps:
cpos+=s
if cpos>pos:
return i,pos-cpos+s
cpos+=g
if cpos>pos:
return i,-pos+cpos-g-1
i+=1
raise IndexError
def getId(self):
d = self.id or ("%s_ALN" % self.wrapped.id)
return d
def __len__(self):
return self._length
def get_str(self):
return ''.join([x.decode('ascii') for x in self])
def __iter__(self):
def isymb():
cpos=0
for s,g in self._gaps:
for x in range(s):
yield (self.wrapped[cpos+x])
for x in range(g):
yield b"-"
cpos+=s
return isymb()
def _posInWrapped(self,position):
i,s=self._getIndice(position)
if s<0:
raise GappedPositionException
value=self._gaps
p=reduce(lambda x,y:x+y, (z[0] for z in value[:i]),0)+s
return p
def get_symbol_at(self,position):
try:
return self.wrapped.get_symbol_at(self._posInWrapped(position))
except GappedPositionException:
return b"-"
def insertGap(self,position,count=1):
if position==self._length:
idx=len(self._gaps)-1
p=-1
else:
idx,p = self._getIndice(position)
if p >= 0:
self._gaps.insert(idx, [p,count])
self._gaps[idx+1][0]-=p
else:
self._gaps[idx][1]+=count
self._length=reduce(lambda x,y:x+y, (z[0]+z[1] for z in self._gaps),0)
class Alignment(list):
def _assertData(self,data):
# assert isinstance(data, Nuc_Seq_Stored),'You must only add bioseq to an alignement' TODO
if hasattr(self, '_alignlen'):
assert self._alignlen==len(data),'All aligned sequences must have the same length'
else:
self._alignlen=len(data)
return data
def clone(self):
ali = Alignment(x.clone() for x in self)
return ali
def append(self,data):
data = self._assertData(data)
list.append(self,data)
def __setitem__(self,index,data):
data = self._assertData(data)
list.__setitem__(self,index,data)
def getSite(self,key):
if isinstance(key,int):
return [x[key] for x in self]
def insertGap(self,position,count=1):
for s in self:
s.insertGap(position,count)
def isFullGapSite(self,key):
return reduce(lambda x,y: x and y,(z==b"-" for z in self.getSite(key)),True)
def isGappedSite(self,key):
return b"-" in self.getSite(key)
def __str__(self):
l = len(self[0])
rep=""
idmax = max(len(x.id) for x in self)+2
template= "%%-%ds %%-60s" % idmax
for p in range(0,l,60):
for s in self:
rep+= (template % (s.id,s[p:p+60])).strip() + '\n'
rep+="\n"
return rep
def columnIterator(alignment):
lali = len(alignment[0])
for p in range(lali):
c = []
for x in alignment:
c.append(x[p])
yield c

View File

@ -0,0 +1,12 @@
#cython: language_level=3
from ._nws cimport *
cdef class DirectAssemble(NWS):
cdef double ysmax
cdef int ymax
cdef double doAlignment(self) except? 0
cdef class ReverseAssemble(DirectAssemble):
pass

View File

@ -0,0 +1,168 @@
#cython: language_level=3
'''
Created on 6 Nov. 2009
@author: coissac
'''
cdef class DirectAssemble(NWS):
def __init__(self,match=4,mismatch=-6,opengap=-8,extgap=-2):
NWS.__init__(self,match,mismatch,opengap,extgap)
self.ysmax=0
self.ymax=0
cdef double doAlignment(self) except? 0:
cdef int i # vertical index
cdef int j # horizontal index
cdef int idx
cdef int idx0
cdef int idx1
cdef int jump
cdef int delta
cdef double score
cdef double scoremax
cdef int path
if self.needToCompute:
self.allocate()
self.reset()
self.ysmax=0
self.ymax=0
for j in range(1,self.hSeq.length+1):
idx = self.index(j,0)
self.matrix.matrix[idx].score = 0
self.matrix.matrix[idx].path = j
for i in range(1,self.vSeq.length+1):
idx = self.index(0,i)
self.matrix.matrix[idx].score = self._opengap + (self._extgap * (i-1))
self.matrix.matrix[idx].path = -i
idx0=self.index(-1,0)
idx1=self.index(0,1)
for i in range(1,self.vSeq.length+1):
idx0+=1
idx1+=1
for j in range(1,self.hSeq.length+1):
# 1 - came from diagonal
#idx = self.index(j-1,i-1)
idx = idx0
#print("ASSEMBLE computing cell : %d,%d --> %d/%d" % (j,i,self.index(j,i),self.matrix.msize),)
scoremax = self.matrix.matrix[idx].score + \
self.matchScore(j,i)
path = 0
#print("so=%f sd=%f sm=%f" % (self.matrix.matrix[idx].score,self.matchScore(j,i),scoremax),)
# 2 - open horizontal gap
# idx = self.index(j-1,i)
idx = idx1 - 1
score = self.matrix.matrix[idx].score+ \
self._opengap
if score > scoremax :
scoremax = score
path = +1
# 3 - open vertical gap
# idx = self.index(j,i-1)
idx = idx0 + 1
score = self.matrix.matrix[idx].score + \
self._opengap
if score > scoremax :
scoremax = score
path = -1
# 4 - extend horizontal gap
jump = self.matrix.bestHJump[i]
if jump >= 0:
idx = self.index(jump,i)
delta = j-jump
score = self.matrix.matrix[idx].score + \
self._extgap * delta
if score > scoremax :
scoremax = score
path = delta+1
# 5 - extend vertical gap
jump = self.matrix.bestVJump[j]
if jump >= 0:
idx = self.index(j,jump)
delta = i-jump
score = self.matrix.matrix[idx].score + \
self._extgap * delta
if score > scoremax :
scoremax = score
path = -delta-1
# idx = self.index(j,i)
idx = idx1
self.matrix.matrix[idx].score = scoremax
self.matrix.matrix[idx].path = path
if path == -1:
self.matrix.bestVJump[j]=i
elif path == +1 :
self.matrix.bestHJump[i]=j
if j==self.hSeq.length and scoremax > self.ysmax:
self.ysmax=scoremax
self.ymax=i
idx0+=1
idx1+=1
self.sequenceChanged=False
self.scoreChanged=False
return self.ysmax
cdef void backtrack(self):
#cdef list path=[]
cdef int i
cdef int j
cdef int p
self.doAlignment()
i=self.ymax
j=self.hSeq.length
self.path=allocatePath(i,j+1,self.path)
if self.ymax<self.vSeq.length:
self.path.path[self.path.length]=self.ymax-self.vSeq.length
self.path.length+=1
while (i or j):
p=self.matrix.matrix[self.index(j,i)].path
self.path.path[self.path.length]=p
self.path.length+=1
#path.append(p)
if p==0:
i-=1
j-=1
elif p < 0:
i+=p
else:
j-=p
#path.reverse()
#reversePath(self.path)
self.path.hStart=0
self.path.vStart=0
#return 0,0,path
cdef class ReverseAssemble(DirectAssemble):
property seqB:
def __get__(self):
return self.verticalSeq.wrapped
def __set__(self, seq):
self.sequenceChanged=True
self.verticalSeq=seq.reverse_complement
self.vSeq=allocateSequence(self.verticalSeq,self.vSeq)

View File

@ -0,0 +1,92 @@
#cython: language_level=3
cdef import from "stdlib.h":
void* malloc(int size) except NULL
void* realloc(void* chunk,int size) except NULL
void free(void* chunk)
cdef import from "string.h":
void bzero(void *s, size_t n)
void memset(void* chunk,int car,int length)
void memcpy(void* s1, void* s2, int n)
cdef struct AlignCell :
double score
int path
cdef struct AlignMatrix :
AlignCell* matrix
int* bestVJump
int* bestHJump
int msize
int vsize
int hsize
cdef AlignMatrix* allocateMatrix(int hsize, int vsize,AlignMatrix *matrix=?)
cdef void freeMatrix(AlignMatrix* matrix)
cdef void resetMatrix(AlignMatrix* matrix)
cdef struct alignSequence:
long length
long buffsize
bint hasQuality
char* sequence
double* quality
cdef alignSequence* allocateSequence(object bioseq, alignSequence* seq=?) except *
cdef void freeSequence(alignSequence* seq)
cdef struct alignPath:
long length
long buffsize
long vStart
long hStart
long *path
cdef alignPath* allocatePath(long l1,long l2,alignPath* path=?)
cdef void reversePath(alignPath* path)
cdef void freePath(alignPath* path)
cdef int bitCount(int x)
cpdef bint iupacMatch(unsigned char a, unsigned char b)
cpdef double iupacPartialMatch(unsigned char a, unsigned char b)
cpdef unsigned char encodeBase(unsigned char lettre)
cdef class DynamicProgramming:
cdef AlignMatrix* matrix
cdef object horizontalSeq
cdef object verticalSeq
cdef alignSequence* hSeq
cdef alignSequence* vSeq
cdef alignPath* path
cdef double _opengap
cdef double _extgap
cdef object alignment
cdef bint sequenceChanged
cdef bint scoreChanged
cdef int _vlen(self)
cdef int _hlen(self)
cdef int allocate(self) except -1
cdef double doAlignment(self) except? 0
cdef void reset(self)
cdef inline int index(self, int x, int y)
cdef inline bint _needToCompute(self)
cdef void backtrack(self)
cdef void clean(self)

View File

@ -0,0 +1,361 @@
#cython: language_level=3
'''
Created on 14 sept. 2009
@author: coissac
'''
from obitools3.align import AlignedSequence
from obitools3.align import Alignment
from copy import deepcopy
######
#
# Import standard memory management function to improve
# efficiency of the alignment code
#
#
cdef AlignMatrix* allocateMatrix(int hsize, int vsize,AlignMatrix *matrix=NULL):
vsize+=1
hsize+=1
if matrix is NULL:
matrix = <AlignMatrix*>malloc(sizeof(AlignMatrix))
matrix.vsize=0
matrix.hsize=0
matrix.msize=0
matrix.matrix=NULL
matrix.bestVJump=NULL
matrix.bestHJump=NULL
if hsize > matrix.hsize:
matrix.bestVJump = <int*>realloc(matrix.bestVJump,hsize * sizeof(int))
matrix.hsize=hsize
if vsize > matrix.vsize:
matrix.bestHJump = <int*>realloc(matrix.bestHJump,vsize * sizeof(int))
matrix.vsize=vsize
if (hsize * vsize) > matrix.msize:
matrix.msize = hsize * vsize
matrix.matrix = <AlignCell*>realloc(matrix.matrix, matrix.msize * sizeof(AlignCell))
return matrix
cdef void freeMatrix(AlignMatrix* matrix):
if matrix is not NULL:
if matrix.matrix is not NULL:
free(matrix.matrix)
if matrix.bestVJump is not NULL:
free(matrix.bestVJump)
if matrix.bestHJump is not NULL:
free(matrix.bestHJump)
free(matrix)
cdef void resetMatrix(AlignMatrix* matrix):
if matrix is not NULL:
if matrix.matrix is not NULL:
bzero(<void*>matrix.matrix, matrix.msize * sizeof(AlignCell))
if matrix.bestHJump is not NULL:
memset(<void*>matrix.bestHJump,255,matrix.vsize * sizeof(int))
if matrix.bestVJump is not NULL:
memset(<void*>matrix.bestVJump,255,matrix.hsize * sizeof(int))
cdef alignSequence* allocateSequence(object bioseq, alignSequence* seq=NULL) except *:
cdef bytes strseq
cdef int i
if seq is NULL:
seq = <alignSequence*>malloc(sizeof(alignSequence))
seq.length=0
seq.buffsize=0
seq.sequence=NULL
seq.quality=NULL
seq.hasQuality=False
seq.length=len(bioseq)
if seq.length > seq.buffsize:
#seq.sequence = <char*>realloc(seq.sequence,sizeof(char)*seq.length)
#seq.quality = <double*>realloc(seq.quality,sizeof(double)*seq.length)
seq.buffsize = seq.length
# TODO optimisable...
#strseq = bioseq.seq.lower()
#memcpy(seq.sequence,<char*>strseq,seq.length)
seq.sequence = bioseq.seq
return seq
cdef void freeSequence(alignSequence* seq):
if seq is not NULL:
#if seq.sequence is not NULL:
# free(<void*>seq.sequence)
#if seq.quality is not NULL:
# free(<void*>seq.quality)
free(seq)
cdef alignPath* allocatePath(long l1,long l2,alignPath* path=NULL):
cdef long length=l1+l2
if path is NULL:
path = <alignPath*>malloc(sizeof(alignPath))
path.length=0
path.buffsize=0
path.path=NULL
if length > path.buffsize:
path.buffsize=length
path.path=<long*>realloc(path.path,sizeof(long)*length)
path.length=0
path.vStart=0
path.hStart=0
return path
cdef void reversePath(alignPath* path):
cdef long i
cdef long j
j=path.length
for i in range(<long>(path.length/2)): # TODO not sure about the cast
j-=1
path.path[i],path.path[j]=path.path[j],path.path[i]
cdef void freePath(alignPath* path):
if path is not NULL:
if path.path is not NULL:
free(<void*>path.path)
free(<void*>path)
cdef int aascii = ord(b'a')
cdef int _basecode[26]
cdef int bitCount(int x):
cdef int i=0
while(x):
i+=1
x&=x-1
return i
cpdef bint iupacMatch(unsigned char a, unsigned char b):
cdef bint m
if a==42: # * ascii code
a=110 # n ascii code
if b==42: # * ascii code
b=110 # n ascii code
m = _basecode[a - aascii] & _basecode[b - aascii]
return m
cpdef unsigned char encodeBase(unsigned char lettre):
return _basecode[lettre - aascii]
cpdef double iupacPartialMatch(unsigned char a, unsigned char b):
cdef int codeA
cdef int codeB
cdef int good
cdef int all
cdef double partial
if a==42: # * ascii code
a=110 # n ascii code
if b==42: # * ascii code
b=110 # n ascii code
codeA = _basecode[a - aascii]
codeB = _basecode[b - aascii]
good = bitCount(codeA & codeB)
all = bitCount(codeA) * bitCount(codeB)
partial= <double>good / all
return partial
cdef class DynamicProgramming:
def __init__(self,opengap,extgap):
self.sequenceChanged=True
self.scoreChanged=True
self.matrix=NULL
self.hSeq=NULL
self.vSeq=NULL
self.path=NULL
self.horizontalSeq=None
self.verticalSeq=None
self._opengap=opengap
self._extgap=extgap
cdef int _vlen(self):
return self.vSeq.length
cdef int _hlen(self):
return self.hSeq.length
cdef int allocate(self) except -1:
assert self.horizontalSeq is not None,'Sequence A must be set'
assert self.verticalSeq is not None,'Sequence B must be set'
cdef long lenH=self._hlen()
cdef long lenV=self._vlen()
self.matrix=allocateMatrix(lenH,lenV,self.matrix)
return 0
cdef double doAlignment(self) except? 0:
pass
cdef bint _needToCompute(self):
return self.scoreChanged or self.sequenceChanged
cdef void backtrack(self):
pass
property seqA:
def __get__(self):
return self.horizontalSeq
def __set__(self, seq):
self.sequenceChanged = True
self.horizontalSeq = seq
self.hSeq = allocateSequence(self.horizontalSeq, self.hSeq)
property seqB:
def __get__(self):
return self.verticalSeq
def __set__(self, seq):
self.sequenceChanged = True
self.verticalSeq = seq
self.vSeq = allocateSequence(self.verticalSeq, self.vSeq)
property opengap:
def __get__(self):
return self._opengap
def __set__(self,opengap):
self._opengap=opengap
self.scoreChanged=True
property extgap:
def __get__(self):
return self._extgap
def __set__(self,extgap):
self._extgap=extgap
self.scoreChanged=True
property needToCompute:
def __get__(self):
return self.scoreChanged or self.sequenceChanged
property score:
def __get__(self):
return self.doAlignment()
cdef void reset(self):
self.scoreChanged=True
resetMatrix(self.matrix)
cdef inline int index(self, int x, int y):
return (self._hlen()+1) * y + x
cdef void clean(self):
freeMatrix(self.matrix)
freeSequence(self.hSeq)
freeSequence(self.vSeq)
freePath(self.path)
def __dealloc__(self):
self.clean()
def __call__(self):
cdef list hgaps=[]
cdef list vgaps=[]
cdef list b
cdef int hp=0
cdef int vp=0
cdef int lenh=0
cdef int lenv=0
cdef int h,v,p
cdef int i
cdef object ali
cdef double score
if self._needToCompute():
score = self.doAlignment()
self.backtrack()
for i in range(self.path.length-1,-1,-1):
p=self.path.path[i]
if p==0:
hp+=1
vp+=1
lenh+=1
lenv+=1
elif p>0:
hp+=p
lenh+=p
vgaps.append([vp,p])
vp=0
else:
vp-=p
lenv-=p
hgaps.append([hp,-p])
hp=0
if hp:
hgaps.append([hp,0])
if vp:
vgaps.append([vp,0])
if lenh < self._hlen():
hseq=self.horizontalSeq[self.path.hStart:self.path.hStart+lenh]
else:
hseq=self.horizontalSeq
hseq=AlignedSequence(hseq)
hseq.gaps=hgaps
if lenv < self._vlen():
vseq=self.verticalSeq[self.path.vStart:self.path.vStart+lenv]
else:
vseq=self.verticalSeq
vseq=AlignedSequence(vseq)
vseq.gaps=vgaps
ali=Alignment()
ali.append(hseq)
ali.append(vseq)
ali.score=score
self.alignment=ali
ali=self.alignment.clone()
ali.score=self.alignment.score
return ali
# initialize iupac carray
__basecode=[1,14,2,13,0,0,4,11,0,0,12,0,3,15,0,0,0,5,6,8,8,7,9,0,10,0]
for i in range(26):
_basecode[i]=__basecode[i]

View File

@ -0,0 +1,12 @@
#cython: language_level=3
from ._dynamic cimport *
cdef class NWS(DynamicProgramming):
cdef double _match
cdef double _mismatch
cdef double matchScore(self,int h, int v)
cdef double doAlignment(self) except? 0

View File

@ -0,0 +1,161 @@
#cython: language_level=3
'''
Created on 6 Nov. 2009
@author: coissac
'''
cdef class NWS(DynamicProgramming):
def __init__(self,match=4,mismatch=-6,opengap=-8,extgap=-2):
DynamicProgramming.__init__(self,opengap,extgap)
self._match=match
self._mismatch=mismatch
cdef double matchScore(self,int h, int v):
cdef double score
score = iupacPartialMatch(self.hSeq.sequence[h-1],self.vSeq.sequence[v-1])
return score * self._match + (1-score) * self._mismatch
cdef double doAlignment(self) except? 0:
cdef int i # vertical index
cdef int j # horizontal index
cdef int idx
cdef int jump
cdef int delta
cdef double score
cdef double scoremax
cdef int path
if self.needToCompute:
self.allocate()
self.reset()
for j in range(1,self._hlen()+1):
idx = self.index(j,0)
self.matrix.matrix[idx].score = self._opengap + (self._extgap * (j-1))
self.matrix.matrix[idx].path = j
for i in range(1,self._vlen()+1):
idx = self.index(0,i)
self.matrix.matrix[idx].score = self._opengap + (self._extgap * (i-1))
self.matrix.matrix[idx].path = -i
for i in range(1,self._vlen()+1):
for j in range(1,self._hlen()+1):
# 1 - came from diagonal
idx = self.index(j-1,i-1)
# print "computing cell : %d,%d --> %d/%d" % (j,i,self.index(j,i),self.matrix.msize),
scoremax = self.matrix.matrix[idx].score + \
self.matchScore(j,i)
path = 0
# print "so=%f sd=%f sm=%f" % (self.matrix.matrix[idx].score,self.matchScore(j,i),scoremax),
# 2 - open horizontal gap
idx = self.index(j-1,i)
score = self.matrix.matrix[idx].score + \
self._opengap
if score > scoremax :
scoremax = score
path = +1
# 3 - open vertical gap
idx = self.index(j,i-1)
score = self.matrix.matrix[idx].score + \
self._opengap
if score > scoremax :
scoremax = score
path = -1
# 4 - extend horizontal gap
jump = self.matrix.bestHJump[i]
if jump >= 0:
idx = self.index(jump,i)
delta = j-jump
score = self.matrix.matrix[idx].score + \
self._extgap * delta
if score > scoremax :
scoremax = score
path = delta+1
# 5 - extend vertical gap
jump = self.matrix.bestVJump[j]
if jump >= 0:
idx = self.index(j,jump)
delta = i-jump
score = self.matrix.matrix[idx].score + \
self._extgap * delta
if score > scoremax :
scoremax = score
path = -delta-1
idx = self.index(j,i)
self.matrix.matrix[idx].score = scoremax
self.matrix.matrix[idx].path = path
if path == -1:
self.matrix.bestVJump[j]=i
elif path == +1 :
self.matrix.bestHJump[i]=j
self.sequenceChanged=False
self.scoreChanged=False
idx = self.index(self._hlen(),self._vlen())
return self.matrix.matrix[idx].score
cdef void backtrack(self):
#cdef list path=[]
cdef int i
cdef int j
cdef int p
self.doAlignment()
i=self._vlen()
j=self._hlen()
self.path=allocatePath(i,j,self.path)
while (i or j):
p=self.matrix.matrix[self.index(j,i)].path
self.path.path[self.path.length]=p
self.path.length+=1
#path.append(p)
if p==0:
i-=1
j-=1
elif p < 0:
i+=p
else:
j-=p
#path.reverse()
#reversePath(self.path)
self.path.hStart=0
self.path.vStart=0
#return 0,0,path
property match:
def __get__(self):
return self._match
def __set__(self,match):
self._match=match
self.scoreChanged=True
property mismatch:
def __get__(self):
return self._mismatch
def __set__(self,mismatch):
self._mismatch=mismatch
self.scoreChanged=True

View File

@ -0,0 +1,98 @@
#cython: language_level=3
'''
Created on 6 Nov. 2009
@author: coissac
'''
from ._dynamic cimport *
from ._assemble cimport DirectAssemble
import array
cdef class QSolexaDirectAssemble(DirectAssemble):
cdef double* hError
cdef double* vError
cdef object a
def __init__(self,match=4,mismatch=-4,opengap=-8,extgap=-2):
"""
Rapport entre score de match et mismatch:
si mismatch = - match / 3
alors quand scrore temps vers 0 et qu'il est impossible de decider
pas de penalisation (s'=0)
si mismatch < - match / 3 la non decidabilite est penalisee.
"""
DirectAssemble.__init__(self,match,mismatch,opengap,extgap)
cdef double matchScore(self,int h, int v):
cdef double score
cdef double smatch
cdef double smismatch
cdef double hok=1-self.hError[h-1]
cdef double vok=1-self.vError[v-1]
score=iupacPartialMatch(self.hSeq.sequence[h-1],self.vSeq.sequence[v-1])
smatch=((4*hok*vok-hok-vok)*(self._match-self._mismatch)+self._match+2*self._mismatch)/3
smismatch=((hok+vok-4*hok*vok)*(self._match-self._mismatch)+2*self._match+7*self._mismatch)/9
return smatch * score + smismatch * (1. - score)
property seqA:
def __get__(self):
return self.horizontalSeq
def __set__(self, seq):
cdef object oaddresse,olength
assert hasattr(seq, "quality_array"),"You must use sequence with quality indices"
self.sequenceChanged=True
self.horizontalSeq=seq
self.hSeq=allocateSequence(self.horizontalSeq,self.hSeq)
(oaddress,olength)=seq.quality_array.buffer_info()
self.hError=<double*><unsigned long int>oaddress
property seqB:
def __get__(self):
return self.verticalSeq
def __set__(self, seq):
cdef object oaddresse,olength
assert hasattr(seq, "quality_array"),"You must use sequence with quality indices"
self.sequenceChanged=True
self.verticalSeq=seq
self.vSeq=allocateSequence(self.verticalSeq,self.vSeq)
(oaddress,olength)=seq.quality_array.buffer_info()
self.vError=<double*><unsigned long int>oaddress
cdef class QSolexaReverseAssemble(QSolexaDirectAssemble):
cdef double matchScore(self,int h, int v):
cdef double score
cdef double smatch
cdef double smismatch
cdef double hok=1-self.hError[h-1]
cdef double vok=1-self.vError[self.vSeq.length - v]
score=iupacPartialMatch(self.hSeq.sequence[h-1],self.vSeq.sequence[v-1])
smatch=((4*hok*vok-hok-vok)*(self._match-self._mismatch)+self._match+2*self._mismatch)/3
smismatch=((hok+vok-4*hok*vok)*(self._match-self._mismatch)+2*self._match+7*self._mismatch)/9
return smatch * score + smismatch * (1. - score)
property seqB:
def __get__(self):
return self.verticalSeq.wrapped
def __set__(self, seq):
cdef object oaddresse,olength
assert hasattr(seq, "quality_array"),"You must use sequence with quality indices"
self.sequenceChanged=True
self.verticalSeq=seq.reverse_complement
self.vSeq=allocateSequence(self.verticalSeq,self.vSeq)
self.a = array.array('d', list(seq.quality_array))
(oaddress,olength) = self.a.buffer_info()
self.vError = <double*><unsigned long int>oaddress

View File

@ -0,0 +1,96 @@
#cython: language_level=3
'''
Created on 6 Nov. 2009
@author: coissac
'''
from ._dynamic cimport *
from ._rassemble cimport RightDirectAssemble
from copy import deepcopy
import array
cdef class QSolexaRightDirectAssemble(RightDirectAssemble):
cdef double* hError
cdef double* vError
cdef object a
def __init__(self,match=4,mismatch=-4,opengap=-8,extgap=-2):
"""
Rapport entre score de match et mismatch:
si mismatch = - match / 3
alors quand scrore temps vers 0 et qu'il est impossible de decider
pas de penalisation (s'=0)
si mismatch < - match / 3 la non decidabilite est penalisee.
"""
RightDirectAssemble.__init__(self,match,mismatch,opengap,extgap)
cdef double matchScore(self,int h, int v):
cdef double score
cdef double smatch
cdef double smismatch
cdef double hok=1-self.hError[h-1]
cdef double vok=1-self.vError[v-1]
score=iupacPartialMatch(self.hSeq.sequence[h-1],self.vSeq.sequence[v-1])
smatch=((4*hok*vok-hok-vok)*(self._match-self._mismatch)+self._match+2*self._mismatch)/3
smismatch=((hok+vok-4*hok*vok)*(self._match-self._mismatch)+2*self._match+7*self._mismatch)/9
return smatch * score + smismatch * (1. - score)
property seqA:
def __get__(self):
return self.horizontalSeq
def __set__(self, seq):
cdef object oaddresse,olength
assert hasattr(seq, "quality"),"You must use sequence with quality indices"
self.sequenceChanged=True
self.horizontalSeq=seq
self.hSeq=allocateSequence(self.horizontalSeq,self.hSeq)
(oaddress,olength)=seq.quality_array.buffer_info()
self.hError=<double*><unsigned long int>oaddress
property seqB:
def __get__(self):
return self.verticalSeq
def __set__(self, seq):
cdef object oaddresse,olength
assert hasattr(seq, "quality"),"You must use sequence with quality indices"
self.sequenceChanged=True
self.verticalSeq=seq
self.vSeq=allocateSequence(self.verticalSeq,self.vSeq)
(oaddress,olength)=seq.quality_array.buffer_info()
self.vError=<double*><unsigned long int>oaddress
cdef class QSolexaRightReverseAssemble(QSolexaRightDirectAssemble):
cdef double matchScore(self,int h, int v):
cdef double score
cdef double smatch
cdef double smismatch
cdef double hok=1-self.hError[h-1]
cdef double vok=1-self.vError[self.vSeq.length - v]
score=iupacPartialMatch(self.hSeq.sequence[h-1],self.vSeq.sequence[v-1])
smatch=((4*hok*vok-hok-vok)*(self._match-self._mismatch)+self._match+2*self._mismatch)/3
smismatch=((hok+vok-4*hok*vok)*(self._match-self._mismatch)+2*self._match+7*self._mismatch)/9
return smatch * score + smismatch * (1. - score)
property seqB:
def __get__(self):
return self.verticalSeq.wrapped
def __set__(self, seq):
cdef object oaddresse,olength
assert hasattr(seq, "quality"),"You must use sequence with quality indices"
self.sequenceChanged=True
self.verticalSeq=seq.reverse_complement
self.vSeq=allocateSequence(self.verticalSeq,self.vSeq)
self.a = array.array('d', list(seq.quality_array))
(oaddress,olength) = self.a.buffer_info()
self.vError = <double*><unsigned long int>oaddress

View File

@ -0,0 +1,12 @@
#cython: language_level=3
from ._nws cimport *
cdef class RightDirectAssemble(NWS):
cdef double xsmax
cdef int xmax
cdef double doAlignment(self) except? 0
cdef class RightReverseAssemble(RightDirectAssemble):
pass

View File

@ -0,0 +1,158 @@
#cython: language_level=3
'''
Created on 6 Nov. 2009
@author: coissac
'''
from ._rassemble cimport *
cdef class RightDirectAssemble(NWS):
def __init__(self,match=4,mismatch=-6,opengap=-8,extgap=-2):
NWS.__init__(self,match,mismatch,opengap,extgap)
self.xsmax=0
self.xmax=0
cdef double doAlignment(self) except? 0:
cdef int i # vertical index
cdef int j # horizontal index
cdef int idx
cdef int jump
cdef int delta
cdef double score
cdef double scoremax
cdef int path
if self.needToCompute:
self.allocate()
self.reset()
self.xsmax=0
self.xmax=0
for j in range(1,self.hSeq.length+1):
idx = self.index(j,0)
self.matrix.matrix[idx].score = self._opengap + (self._extgap * (j-1))
self.matrix.matrix[idx].path = j
for i in range(1,self.vSeq.length+1):
idx = self.index(0,i)
self.matrix.matrix[idx].score = 0
self.matrix.matrix[idx].path = -i
for i in range(1,self.vSeq.length+1):
for j in range(1,self.hSeq.length+1):
# 1 - came from diagonal
idx = self.index(j-1,i-1)
#print("computing cell : %d,%d --> %d/%d" % (j,i,self.index(j,i),self.matrix.msize),)
scoremax = self.matrix.matrix[idx].score + \
self.matchScore(j,i)
path = 0
#print("so=%f sd=%f sm=%f" % (self.matrix.matrix[idx].score,self.matchScore(j,i),scoremax),)
# 2 - open horizontal gap
idx = self.index(j-1,i)
score = self.matrix.matrix[idx].score+ \
self._opengap
if score > scoremax :
scoremax = score
path = +1
# 3 - open vertical gap
idx = self.index(j,i-1)
score = self.matrix.matrix[idx].score + \
self._opengap
if score > scoremax :
scoremax = score
path = -1
# 4 - extend horizontal gap
jump = self.matrix.bestHJump[i]
if jump >= 0:
idx = self.index(jump,i)
delta = j-jump
score = self.matrix.matrix[idx].score + \
self._extgap * delta
if score > scoremax :
scoremax = score
path = delta+1
# 5 - extend vertical gap
jump = self.matrix.bestVJump[j]
if jump >= 0:
idx = self.index(j,jump)
delta = i-jump
score = self.matrix.matrix[idx].score + \
self._extgap * delta
if score > scoremax :
scoremax = score
path = -delta-1
idx = self.index(j,i)
self.matrix.matrix[idx].score = scoremax
self.matrix.matrix[idx].path = path
if path == -1:
self.matrix.bestVJump[j]=i
elif path == +1 :
self.matrix.bestHJump[i]=j
if i==self.vSeq.length and scoremax > self.xsmax:
self.xsmax=scoremax
self.xmax=j
self.sequenceChanged=False
self.scoreChanged=False
return self.xsmax
cdef void backtrack(self):
cdef list path=[]
cdef int i
cdef int j
cdef int p
self.doAlignment()
j=self.xmax
i=self.vSeq.length
self.path=allocatePath(i,j+1,self.path)
if self.xmax<self.hSeq.length:
self.path.path[self.path.length]=self.hSeq.length-self.xmax
self.path.length+=1
while (i or j):
p=self.matrix.matrix[self.index(j,i)].path
self.path.path[self.path.length]=p
self.path.length+=1
#path.append(p)
if p==0:
i-=1
j-=1
elif p < 0:
i+=p
else:
j-=p
#path.reverse()
self.path.hStart=0
self.path.vStart=0
#reversePath(self.path)
#return 0,0,path
cdef class RightReverseAssemble(RightDirectAssemble):
property seqB:
def __get__(self):
return self.verticalSeq.wrapped
def __set__(self, seq):
self.sequenceChanged=True
self.verticalSeq=seq.reverse_complement
self.vSeq=allocateSequence(self.verticalSeq,self.vSeq)

View File

@ -0,0 +1,217 @@
#cython: language_level=3
from cpython cimport array
from .solexapairend import iterOnAligment
from math import log10
cdef class IterOnConsensus:
cdef object _ali
cdef int __seqASingle
cdef int __seqBSingle
cdef int __seqABMatch
cdef int __seqAMismatch
cdef int __seqBMismatch
cdef int __seqAInsertion
cdef int __seqBInsertion
cdef int __seqADeletion
cdef int __seqBDeletion
cdef object __ioa
cdef bint __firstSeqB
def __cinit__(self,ali):
self._ali=ali
self.__seqASingle=0
self.__seqBSingle=0
self.__seqABMatch=0
self.__seqAMismatch=0
self.__seqBMismatch=0
self.__seqAInsertion=0
self.__seqBInsertion=0
self.__seqADeletion=0
self.__seqBDeletion=0
self.__ioa = iterOnAligment(self._ali)
self.__firstSeqB=False
def get_seqASingle(self):
return self.__seqASingle
def get_seqBSingle(self):
return self.__seqBSingle
def get_seqABMatch(self):
return self.__seqABMatch
def get_seqAMismatch(self):
return self.__seqAMismatch
def get_seqBMismatch(self):
return self.__seqBMismatch
def get_seqAInsertion(self):
return self.__seqAInsertion
def get_seqBInsertion(self):
return self.__seqBInsertion
def get_seqADeletion(self):
return self.__seqADeletion
def get_seqBDeletion(self):
return self.__seqBDeletion
def __next__(self):
cdef bytes snuc0
cdef bytes snuc1
cdef char* nuc0
cdef char* nuc1
cdef char* dash=b"-"
cdef double score0
cdef double score1
cdef double h0
cdef double h1
while(1):
snuc0,score0,snuc1,score1 = self.__ioa.__next__()
nuc0=snuc0
nuc1=snuc1
if nuc0[0]==nuc1[0]:
if nuc1[0]!=dash[0]:
self.__firstSeqB=True
self.__seqABMatch+=1
self.__seqBSingle=0
return (nuc0,score0*score1)
else:
h0 = score0 * (1-score1/3)
h1 = score1 * (1-score0/3)
if h0 < h1:
if nuc0[0]!=dash[0]:
self.__seqBSingle=0
if nuc1[0]==dash[0]:
if self.__firstSeqB:
self.__seqAInsertion+=1
else:
self.__seqASingle+=1
else:
self.__firstSeqB=True
self.__seqAMismatch+=1
if score1==1.0: # Free end gap
return (nuc0,score0)
else:
return (nuc0,h0)
else:
self.__seqADeletion+=1
else:
if nuc1[0]!=dash[0]:
self.__firstSeqB=True
if nuc0[0]==dash[0]:
self.__seqBInsertion+=1
self.__seqBSingle+=1
else:
self.__seqBMismatch+=1
self.__seqBSingle=0
if score0==1.0: # Free end gap
return (nuc1,score1)
else:
return (nuc1,h1)
else:
self.__seqBSingle=0
self.__seqBDeletion+=1
def __iter__(self):
return self
seqASingle = property(get_seqASingle, None, None, "direct's docstring")
seqBSingle = property(get_seqBSingle, None, None, "reverse's docstring")
seqABMatch = property(get_seqABMatch, None, None, "idem's docstring")
seqAMismatch = property(get_seqAMismatch, None, None, "mismatchdirect's docstring")
seqBMismatch = property(get_seqBMismatch, None, None, "mismatchreverse's docstring")
seqAInsertion = property(get_seqAInsertion, None, None, "insertdirect's docstring")
seqBInsertion = property(get_seqBInsertion, None, None, "insertreverse's docstring")
seqADeletion = property(get_seqADeletion, None, None, "deletedirect's docstring")
seqBDeletion = property(get_seqBDeletion, None, None, "deletereverse's docstring")
def buildConsensus(ali, seq):
cdef list quality
cdef char aseq[1000]
cdef int i=0
cdef int j=0
cdef char* cnuc
cdef bytes nuc
cdef double score
cdef bytes sseq
cdef double p
quality = []
if len(ali[0])>999: # TODO why?
raise AssertionError,"Too long alignemnt"
ic=IterOnConsensus(ali)
for nuc,score in ic:
cnuc=nuc
quality.append(score)
aseq[i]=cnuc[0]
i+=1
aseq[i]=0
sseq=aseq
# Reconvert quality from array of probabilities to array of raw quality values
i=0
for p in quality:
quality[i] = min(42, round(-10*log10(p)))
i+=1
seq.set(ali[0].wrapped.id+b"_CONS", sseq, quality=quality, tags=ali[0].wrapped)
if hasattr(ali, "direction"):
seq[b'direction']=ali.direction
if hasattr(ali, "counter"):
seq[b'alignement_id']=ali.counter
seq[b'seq_a_single']=ic.seqASingle
seq[b'seq_b_single']=ic.seqBSingle
seq[b'seq_ab_match']=ic.seqABMatch
seq[b'seq_a_mismatch']=ic.seqAMismatch
seq[b'seq_b_mismatch']=ic.seqBMismatch
seq[b'seq_a_insertion']=ic.seqAInsertion
seq[b'seq_b_insertion']=ic.seqBInsertion-ic.seqBSingle
seq[b'seq_a_deletion']=ic.seqADeletion
seq[b'seq_b_deletion']=ic.seqBDeletion
seq[b'score']=ali.score
seq[b'ali_length']=len(seq)-ic.seqASingle-ic.seqBSingle
if seq[b'ali_length']>0:
seq[b'score_norm']=float(ali.score)/seq[b'ali_length']
seq[b'mode']=b'alignment'
return seq
def buildJoinedSequence(ali, reverse, seq):
forward = ali[0].wrapped
s = forward.seq + reverse.seq
quality = forward.quality
quality.extend(reverse.quality)
seq.set(forward.id +b"_PairedEnd", s, definition=forward.definition, quality=quality, tags=forward)
seq[b"score"]=ali.score
seq[b"ali_dir"]=ali.direction
seq[b"mode"]=b"joined"
seq[b"pairedend_limit"]=len(forward)
return seq

View File

@ -0,0 +1,54 @@
#cython: language_level=3
'''
Created on 17 mai 2010
@author: coissac
'''
from obitools3.align import columnIterator
def iterOnAligment(ali):
pos0=0
pos1=0
begin0=False
end0=False
begin1=False
end1=False
for nuc0,nuc1 in columnIterator(ali):
if nuc0==b"-":
if begin0:
if not end0:
score0 = ( ali[0].wrapped.quality_array[pos0-1]
+ali[0].wrapped.quality_array[pos0]
)/2
else:
score0 = 1.
else:
score0 = 0.
else:
begin0=True
score0 = ali[0].wrapped.quality_array[pos0]
pos0+=1
end0= pos0==len(ali[0].wrapped)
if nuc1==b"-":
if begin1:
if not end1:
score1 = ( ali[1].wrapped.quality_array[pos1-1]
+ali[1].wrapped.quality_array[pos1]
)/2
else:
score1 = 0.
else:
score1 = 1.
else:
begin1=True
score1 = ali[1].wrapped.quality_array[pos1]
pos1+=1
end1= pos1==len(ali[1].wrapped)
result = (nuc0,score0,nuc1,score1)
yield result