From f5a00c9322f5437fad35fbcb223f1f3e4a05c1bc Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Mon, 12 Feb 2018 13:28:20 +0100 Subject: [PATCH] Cython alignment library --- python/obitools3/align/__init__.py | 166 ++++++++++ python/obitools3/align/_assemble.pxd | 12 + python/obitools3/align/_assemble.pyx | 168 ++++++++++ python/obitools3/align/_dynamic.pxd | 92 ++++++ python/obitools3/align/_dynamic.pyx | 361 ++++++++++++++++++++++ python/obitools3/align/_nws.pxd | 12 + python/obitools3/align/_nws.pyx | 161 ++++++++++ python/obitools3/align/_qsassemble.pyx | 98 ++++++ python/obitools3/align/_qsrassemble.pyx | 96 ++++++ python/obitools3/align/_rassemble.pxd | 12 + python/obitools3/align/_rassemble.pyx | 158 ++++++++++ python/obitools3/align/_solexapairend.pyx | 217 +++++++++++++ python/obitools3/align/solexapairend.py | 54 ++++ 13 files changed, 1607 insertions(+) create mode 100644 python/obitools3/align/__init__.py create mode 100644 python/obitools3/align/_assemble.pxd create mode 100644 python/obitools3/align/_assemble.pyx create mode 100644 python/obitools3/align/_dynamic.pxd create mode 100644 python/obitools3/align/_dynamic.pyx create mode 100644 python/obitools3/align/_nws.pxd create mode 100644 python/obitools3/align/_nws.pyx create mode 100644 python/obitools3/align/_qsassemble.pyx create mode 100644 python/obitools3/align/_qsrassemble.pyx create mode 100644 python/obitools3/align/_rassemble.pxd create mode 100644 python/obitools3/align/_rassemble.pyx create mode 100644 python/obitools3/align/_solexapairend.pyx create mode 100644 python/obitools3/align/solexapairend.py diff --git a/python/obitools3/align/__init__.py b/python/obitools3/align/__init__.py new file mode 100644 index 0000000..4a89e80 --- /dev/null +++ b/python/obitools3/align/__init__.py @@ -0,0 +1,166 @@ +from obitools3.dms.obiseq import Nuc_Seq_Stored, Nuc_Seq +from functools import reduce +import struct +from copy import deepcopy + + +class GappedPositionException(Exception): + pass + +class AlignedSequence(Nuc_Seq): # TODO discuss. can be both Nuc_Seq or Nuc_Seq_Stored.... + + def __init__(self, seq) : + self.wrapped = seq + self._length=len(seq) + self._gaps=[[self._length,0]] + + def clone(self): + seq = AlignedSequence(self.wrapped) + seq._gaps=deepcopy(self._gaps) + seq._length=reduce(lambda x,y:x+y, (z[0]+z[1] for z in self._gaps),0) + return seq + + def setGaps(self, value): + ''' + Set gap vector to an AlignedSequence. + + Gap vector describes the gap positions on a sequence. + It is a gap of couple. The first couple member is the count + of sequence letter, the second one is the gap length. + @param value: a list of length 2 list describing gap positions + @type value: list of couple + ''' + assert isinstance(value, list),'Gap vector must be a list' + assert reduce(lambda x,y: x and y, + (isinstance(z, list) and len(z)==2 for z in value), + True),"Value must be a list of length 2 list" + + lseq = reduce(lambda x,y:x+y, (z[0] for z in value),0) + assert lseq==len(self.wrapped),"Gap vector incompatible with the sequence" + self._gaps = value + self._length=reduce(lambda x,y:x+y, (z[0]+z[1] for z in value),0) + + def getGaps(self): + return tuple(self._gaps) + gaps = property(getGaps, setGaps, None, "Gaps's Docstring") + + def _getIndice(self,pos): + i=0 + cpos=0 + for s,g in self._gaps: + cpos+=s + if cpos>pos: + return i,pos-cpos+s + cpos+=g + if cpos>pos: + return i,-pos+cpos-g-1 + i+=1 + raise IndexError + + def getId(self): + d = self.id or ("%s_ALN" % self.wrapped.id) + return d + + def __len__(self): + return self._length + + def get_str(self): + return ''.join([x.decode('ascii') for x in self]) + + def __iter__(self): + def isymb(): + cpos=0 + for s,g in self._gaps: + for x in range(s): + yield (self.wrapped[cpos+x]) + for x in range(g): + yield b"-" + cpos+=s + return isymb() + + def _posInWrapped(self,position): + i,s=self._getIndice(position) + if s<0: + raise GappedPositionException + value=self._gaps + p=reduce(lambda x,y:x+y, (z[0] for z in value[:i]),0)+s + return p + + def get_symbol_at(self,position): + try: + return self.wrapped.get_symbol_at(self._posInWrapped(position)) + except GappedPositionException: + return b"-" + + def insertGap(self,position,count=1): + if position==self._length: + idx=len(self._gaps)-1 + p=-1 + else: + idx,p = self._getIndice(position) + + if p >= 0: + self._gaps.insert(idx, [p,count]) + self._gaps[idx+1][0]-=p + else: + self._gaps[idx][1]+=count + self._length=reduce(lambda x,y:x+y, (z[0]+z[1] for z in self._gaps),0) + + +class Alignment(list): + + def _assertData(self,data): +# assert isinstance(data, Nuc_Seq_Stored),'You must only add bioseq to an alignement' TODO + if hasattr(self, '_alignlen'): + assert self._alignlen==len(data),'All aligned sequences must have the same length' + else: + self._alignlen=len(data) + return data + + def clone(self): + ali = Alignment(x.clone() for x in self) + return ali + + def append(self,data): + data = self._assertData(data) + list.append(self,data) + + def __setitem__(self,index,data): + + data = self._assertData(data) + list.__setitem__(self,index,data) + + def getSite(self,key): + if isinstance(key,int): + return [x[key] for x in self] + + def insertGap(self,position,count=1): + for s in self: + s.insertGap(position,count) + + def isFullGapSite(self,key): + return reduce(lambda x,y: x and y,(z==b"-" for z in self.getSite(key)),True) + + def isGappedSite(self,key): + return b"-" in self.getSite(key) + + def __str__(self): + l = len(self[0]) + rep="" + idmax = max(len(x.id) for x in self)+2 + template= "%%-%ds %%-60s" % idmax + for p in range(0,l,60): + for s in self: + rep+= (template % (s.id,s[p:p+60])).strip() + '\n' + rep+="\n" + return rep + + +def columnIterator(alignment): + lali = len(alignment[0]) + for p in range(lali): + c = [] + for x in alignment: + c.append(x[p]) + yield c + diff --git a/python/obitools3/align/_assemble.pxd b/python/obitools3/align/_assemble.pxd new file mode 100644 index 0000000..309ab05 --- /dev/null +++ b/python/obitools3/align/_assemble.pxd @@ -0,0 +1,12 @@ +#cython: language_level=3 + +from ._nws cimport * + +cdef class DirectAssemble(NWS): + cdef double ysmax + cdef int ymax + + cdef double doAlignment(self) except? 0 + +cdef class ReverseAssemble(DirectAssemble): + pass \ No newline at end of file diff --git a/python/obitools3/align/_assemble.pyx b/python/obitools3/align/_assemble.pyx new file mode 100644 index 0000000..90ecafb --- /dev/null +++ b/python/obitools3/align/_assemble.pyx @@ -0,0 +1,168 @@ +#cython: language_level=3 + +''' +Created on 6 Nov. 2009 + +@author: coissac +''' + + +cdef class DirectAssemble(NWS): + + def __init__(self,match=4,mismatch=-6,opengap=-8,extgap=-2): + NWS.__init__(self,match,mismatch,opengap,extgap) + self.ysmax=0 + self.ymax=0 + + cdef double doAlignment(self) except? 0: + cdef int i # vertical index + cdef int j # horizontal index + cdef int idx + cdef int idx0 + cdef int idx1 + cdef int jump + cdef int delta + cdef double score + cdef double scoremax + cdef int path + + + if self.needToCompute: + self.allocate() + self.reset() + self.ysmax=0 + self.ymax=0 + + for j in range(1,self.hSeq.length+1): + idx = self.index(j,0) + self.matrix.matrix[idx].score = 0 + self.matrix.matrix[idx].path = j + + for i in range(1,self.vSeq.length+1): + idx = self.index(0,i) + self.matrix.matrix[idx].score = self._opengap + (self._extgap * (i-1)) + self.matrix.matrix[idx].path = -i + + idx0=self.index(-1,0) + idx1=self.index(0,1) + for i in range(1,self.vSeq.length+1): + idx0+=1 + idx1+=1 + for j in range(1,self.hSeq.length+1): + + # 1 - came from diagonal + #idx = self.index(j-1,i-1) + idx = idx0 + #print("ASSEMBLE computing cell : %d,%d --> %d/%d" % (j,i,self.index(j,i),self.matrix.msize),) + scoremax = self.matrix.matrix[idx].score + \ + self.matchScore(j,i) + path = 0 + + #print("so=%f sd=%f sm=%f" % (self.matrix.matrix[idx].score,self.matchScore(j,i),scoremax),) + + # 2 - open horizontal gap + # idx = self.index(j-1,i) + idx = idx1 - 1 + score = self.matrix.matrix[idx].score+ \ + self._opengap + if score > scoremax : + scoremax = score + path = +1 + + # 3 - open vertical gap + # idx = self.index(j,i-1) + idx = idx0 + 1 + score = self.matrix.matrix[idx].score + \ + self._opengap + if score > scoremax : + scoremax = score + path = -1 + + # 4 - extend horizontal gap + jump = self.matrix.bestHJump[i] + if jump >= 0: + idx = self.index(jump,i) + delta = j-jump + score = self.matrix.matrix[idx].score + \ + self._extgap * delta + if score > scoremax : + scoremax = score + path = delta+1 + + # 5 - extend vertical gap + jump = self.matrix.bestVJump[j] + if jump >= 0: + idx = self.index(j,jump) + delta = i-jump + score = self.matrix.matrix[idx].score + \ + self._extgap * delta + if score > scoremax : + scoremax = score + path = -delta-1 + + # idx = self.index(j,i) + idx = idx1 + self.matrix.matrix[idx].score = scoremax + self.matrix.matrix[idx].path = path + + if path == -1: + self.matrix.bestVJump[j]=i + elif path == +1 : + self.matrix.bestHJump[i]=j + + if j==self.hSeq.length and scoremax > self.ysmax: + self.ysmax=scoremax + self.ymax=i + idx0+=1 + idx1+=1 + + self.sequenceChanged=False + self.scoreChanged=False + + return self.ysmax + + cdef void backtrack(self): + #cdef list path=[] + cdef int i + cdef int j + cdef int p + + self.doAlignment() + i=self.ymax + j=self.hSeq.length + self.path=allocatePath(i,j+1,self.path) + + if self.ymaxmalloc(sizeof(AlignMatrix)) + matrix.vsize=0 + matrix.hsize=0 + matrix.msize=0 + matrix.matrix=NULL + matrix.bestVJump=NULL + matrix.bestHJump=NULL + + if hsize > matrix.hsize: + matrix.bestVJump = realloc(matrix.bestVJump,hsize * sizeof(int)) + matrix.hsize=hsize + + if vsize > matrix.vsize: + matrix.bestHJump = realloc(matrix.bestHJump,vsize * sizeof(int)) + matrix.vsize=vsize + + if (hsize * vsize) > matrix.msize: + matrix.msize = hsize * vsize + matrix.matrix = realloc(matrix.matrix, matrix.msize * sizeof(AlignCell)) + + return matrix + +cdef void freeMatrix(AlignMatrix* matrix): + if matrix is not NULL: + if matrix.matrix is not NULL: + free(matrix.matrix) + if matrix.bestVJump is not NULL: + free(matrix.bestVJump) + if matrix.bestHJump is not NULL: + free(matrix.bestHJump) + free(matrix) + +cdef void resetMatrix(AlignMatrix* matrix): + if matrix is not NULL: + if matrix.matrix is not NULL: + bzero(matrix.matrix, matrix.msize * sizeof(AlignCell)) + if matrix.bestHJump is not NULL: + memset(matrix.bestHJump,255,matrix.vsize * sizeof(int)) + if matrix.bestVJump is not NULL: + memset(matrix.bestVJump,255,matrix.hsize * sizeof(int)) + + +cdef alignSequence* allocateSequence(object bioseq, alignSequence* seq=NULL) except *: + cdef bytes strseq + cdef int i + + if seq is NULL: + seq = malloc(sizeof(alignSequence)) + seq.length=0 + seq.buffsize=0 + seq.sequence=NULL + seq.quality=NULL + seq.hasQuality=False + + seq.length=len(bioseq) + if seq.length > seq.buffsize: + #seq.sequence = realloc(seq.sequence,sizeof(char)*seq.length) + #seq.quality = realloc(seq.quality,sizeof(double)*seq.length) + seq.buffsize = seq.length + + # TODO optimisable... + #strseq = bioseq.seq.lower() + #memcpy(seq.sequence,strseq,seq.length) + seq.sequence = bioseq.seq + + return seq + +cdef void freeSequence(alignSequence* seq): + if seq is not NULL: + #if seq.sequence is not NULL: + # free(seq.sequence) + #if seq.quality is not NULL: + # free(seq.quality) + free(seq) + +cdef alignPath* allocatePath(long l1,long l2,alignPath* path=NULL): + cdef long length=l1+l2 + + if path is NULL: + path = malloc(sizeof(alignPath)) + path.length=0 + path.buffsize=0 + path.path=NULL + + if length > path.buffsize: + path.buffsize=length + path.path=realloc(path.path,sizeof(long)*length) + + path.length=0 + path.vStart=0 + path.hStart=0 + + return path + +cdef void reversePath(alignPath* path): + cdef long i + cdef long j + + j=path.length + for i in range((path.length/2)): # TODO not sure about the cast + j-=1 + path.path[i],path.path[j]=path.path[j],path.path[i] + +cdef void freePath(alignPath* path): + if path is not NULL: + if path.path is not NULL: + free(path.path) + free(path) + + +cdef int aascii = ord(b'a') +cdef int _basecode[26] + +cdef int bitCount(int x): + cdef int i=0 + while(x): + i+=1 + x&=x-1 + return i + +cpdef bint iupacMatch(unsigned char a, unsigned char b): + cdef bint m + + if a==42: # * ascii code + a=110 # n ascii code + + if b==42: # * ascii code + b=110 # n ascii code + + m = _basecode[a - aascii] & _basecode[b - aascii] + return m + +cpdef unsigned char encodeBase(unsigned char lettre): + return _basecode[lettre - aascii] + +cpdef double iupacPartialMatch(unsigned char a, unsigned char b): + cdef int codeA + cdef int codeB + cdef int good + cdef int all + cdef double partial + + if a==42: # * ascii code + a=110 # n ascii code + + if b==42: # * ascii code + b=110 # n ascii code + + codeA = _basecode[a - aascii] + codeB = _basecode[b - aascii] + good = bitCount(codeA & codeB) + all = bitCount(codeA) * bitCount(codeB) + partial= good / all + + return partial + + +cdef class DynamicProgramming: + + def __init__(self,opengap,extgap): + self.sequenceChanged=True + self.scoreChanged=True + + self.matrix=NULL + self.hSeq=NULL + self.vSeq=NULL + self.path=NULL + + self.horizontalSeq=None + self.verticalSeq=None + + self._opengap=opengap + self._extgap=extgap + + cdef int _vlen(self): + return self.vSeq.length + + cdef int _hlen(self): + return self.hSeq.length + + cdef int allocate(self) except -1: + + assert self.horizontalSeq is not None,'Sequence A must be set' + assert self.verticalSeq is not None,'Sequence B must be set' + + cdef long lenH=self._hlen() + cdef long lenV=self._vlen() + + self.matrix=allocateMatrix(lenH,lenV,self.matrix) + return 0 + + + cdef double doAlignment(self) except? 0: + pass + + cdef bint _needToCompute(self): + return self.scoreChanged or self.sequenceChanged + + cdef void backtrack(self): + pass + + property seqA: + def __get__(self): + return self.horizontalSeq + + def __set__(self, seq): + self.sequenceChanged = True + self.horizontalSeq = seq + self.hSeq = allocateSequence(self.horizontalSeq, self.hSeq) + + property seqB: + def __get__(self): + return self.verticalSeq + + def __set__(self, seq): + self.sequenceChanged = True + self.verticalSeq = seq + self.vSeq = allocateSequence(self.verticalSeq, self.vSeq) + + property opengap: + def __get__(self): + return self._opengap + + def __set__(self,opengap): + self._opengap=opengap + self.scoreChanged=True + + property extgap: + def __get__(self): + return self._extgap + + def __set__(self,extgap): + self._extgap=extgap + self.scoreChanged=True + + property needToCompute: + def __get__(self): + return self.scoreChanged or self.sequenceChanged + + property score: + def __get__(self): + return self.doAlignment() + + cdef void reset(self): + self.scoreChanged=True + resetMatrix(self.matrix) + + cdef inline int index(self, int x, int y): + return (self._hlen()+1) * y + x + + cdef void clean(self): + freeMatrix(self.matrix) + freeSequence(self.hSeq) + freeSequence(self.vSeq) + freePath(self.path) + + def __dealloc__(self): + self.clean() + + def __call__(self): + cdef list hgaps=[] + cdef list vgaps=[] + cdef list b + cdef int hp=0 + cdef int vp=0 + cdef int lenh=0 + cdef int lenv=0 + cdef int h,v,p + cdef int i + cdef object ali + cdef double score + + if self._needToCompute(): + score = self.doAlignment() + self.backtrack() + for i in range(self.path.length-1,-1,-1): + p=self.path.path[i] + if p==0: + hp+=1 + vp+=1 + lenh+=1 + lenv+=1 + elif p>0: + hp+=p + lenh+=p + vgaps.append([vp,p]) + vp=0 + else: + vp-=p + lenv-=p + hgaps.append([hp,-p]) + hp=0 + + if hp: + hgaps.append([hp,0]) + if vp: + vgaps.append([vp,0]) + + if lenh < self._hlen(): + hseq=self.horizontalSeq[self.path.hStart:self.path.hStart+lenh] + else: + hseq=self.horizontalSeq + + hseq=AlignedSequence(hseq) + hseq.gaps=hgaps + + if lenv < self._vlen(): + vseq=self.verticalSeq[self.path.vStart:self.path.vStart+lenv] + else: + vseq=self.verticalSeq + + vseq=AlignedSequence(vseq) + vseq.gaps=vgaps + + ali=Alignment() + ali.append(hseq) + ali.append(vseq) + + ali.score=score + self.alignment=ali + ali=self.alignment.clone() + ali.score=self.alignment.score + return ali + + + + +# initialize iupac carray + +__basecode=[1,14,2,13,0,0,4,11,0,0,12,0,3,15,0,0,0,5,6,8,8,7,9,0,10,0] +for i in range(26): + _basecode[i]=__basecode[i] + + \ No newline at end of file diff --git a/python/obitools3/align/_nws.pxd b/python/obitools3/align/_nws.pxd new file mode 100644 index 0000000..b801167 --- /dev/null +++ b/python/obitools3/align/_nws.pxd @@ -0,0 +1,12 @@ +#cython: language_level=3 + +from ._dynamic cimport * + +cdef class NWS(DynamicProgramming): + cdef double _match + cdef double _mismatch + + cdef double matchScore(self,int h, int v) + cdef double doAlignment(self) except? 0 + + diff --git a/python/obitools3/align/_nws.pyx b/python/obitools3/align/_nws.pyx new file mode 100644 index 0000000..19c26b0 --- /dev/null +++ b/python/obitools3/align/_nws.pyx @@ -0,0 +1,161 @@ +#cython: language_level=3 + +''' +Created on 6 Nov. 2009 + +@author: coissac +''' + + +cdef class NWS(DynamicProgramming): + + def __init__(self,match=4,mismatch=-6,opengap=-8,extgap=-2): + DynamicProgramming.__init__(self,opengap,extgap) + self._match=match + self._mismatch=mismatch + + cdef double matchScore(self,int h, int v): + cdef double score + score = iupacPartialMatch(self.hSeq.sequence[h-1],self.vSeq.sequence[v-1]) + return score * self._match + (1-score) * self._mismatch + + cdef double doAlignment(self) except? 0: + cdef int i # vertical index + cdef int j # horizontal index + cdef int idx + cdef int jump + cdef int delta + cdef double score + cdef double scoremax + cdef int path + + + if self.needToCompute: + self.allocate() + self.reset() + + for j in range(1,self._hlen()+1): + idx = self.index(j,0) + self.matrix.matrix[idx].score = self._opengap + (self._extgap * (j-1)) + self.matrix.matrix[idx].path = j + + for i in range(1,self._vlen()+1): + idx = self.index(0,i) + self.matrix.matrix[idx].score = self._opengap + (self._extgap * (i-1)) + self.matrix.matrix[idx].path = -i + + for i in range(1,self._vlen()+1): + for j in range(1,self._hlen()+1): + + # 1 - came from diagonal + idx = self.index(j-1,i-1) + # print "computing cell : %d,%d --> %d/%d" % (j,i,self.index(j,i),self.matrix.msize), + scoremax = self.matrix.matrix[idx].score + \ + self.matchScore(j,i) + path = 0 + + # print "so=%f sd=%f sm=%f" % (self.matrix.matrix[idx].score,self.matchScore(j,i),scoremax), + + # 2 - open horizontal gap + idx = self.index(j-1,i) + score = self.matrix.matrix[idx].score + \ + self._opengap + if score > scoremax : + scoremax = score + path = +1 + + # 3 - open vertical gap + idx = self.index(j,i-1) + score = self.matrix.matrix[idx].score + \ + self._opengap + if score > scoremax : + scoremax = score + path = -1 + + # 4 - extend horizontal gap + jump = self.matrix.bestHJump[i] + if jump >= 0: + idx = self.index(jump,i) + delta = j-jump + score = self.matrix.matrix[idx].score + \ + self._extgap * delta + if score > scoremax : + scoremax = score + path = delta+1 + + # 5 - extend vertical gap + jump = self.matrix.bestVJump[j] + if jump >= 0: + idx = self.index(j,jump) + delta = i-jump + score = self.matrix.matrix[idx].score + \ + self._extgap * delta + if score > scoremax : + scoremax = score + path = -delta-1 + + idx = self.index(j,i) + self.matrix.matrix[idx].score = scoremax + self.matrix.matrix[idx].path = path + + if path == -1: + self.matrix.bestVJump[j]=i + elif path == +1 : + self.matrix.bestHJump[i]=j + + self.sequenceChanged=False + self.scoreChanged=False + + idx = self.index(self._hlen(),self._vlen()) + return self.matrix.matrix[idx].score + + cdef void backtrack(self): + #cdef list path=[] + cdef int i + cdef int j + cdef int p + + self.doAlignment() + i=self._vlen() + j=self._hlen() + self.path=allocatePath(i,j,self.path) + + while (i or j): + p=self.matrix.matrix[self.index(j,i)].path + self.path.path[self.path.length]=p + self.path.length+=1 + #path.append(p) + if p==0: + i-=1 + j-=1 + elif p < 0: + i+=p + else: + j-=p + + #path.reverse() + #reversePath(self.path) + self.path.hStart=0 + self.path.vStart=0 + + #return 0,0,path + + property match: + def __get__(self): + return self._match + + def __set__(self,match): + self._match=match + self.scoreChanged=True + + property mismatch: + def __get__(self): + return self._mismatch + + def __set__(self,mismatch): + self._mismatch=mismatch + self.scoreChanged=True + + + + diff --git a/python/obitools3/align/_qsassemble.pyx b/python/obitools3/align/_qsassemble.pyx new file mode 100644 index 0000000..39c89b2 --- /dev/null +++ b/python/obitools3/align/_qsassemble.pyx @@ -0,0 +1,98 @@ +#cython: language_level=3 + +''' +Created on 6 Nov. 2009 + +@author: coissac +''' + +from ._dynamic cimport * +from ._assemble cimport DirectAssemble +import array + + +cdef class QSolexaDirectAssemble(DirectAssemble): + + cdef double* hError + cdef double* vError + cdef object a + + + def __init__(self,match=4,mismatch=-4,opengap=-8,extgap=-2): + """ + Rapport entre score de match et mismatch: + si mismatch = - match / 3 + alors quand scrore temps vers 0 et qu'il est impossible de decider + pas de penalisation (s'=0) + si mismatch < - match / 3 la non decidabilite est penalisee. + """ + DirectAssemble.__init__(self,match,mismatch,opengap,extgap) + + + cdef double matchScore(self,int h, int v): + cdef double score + cdef double smatch + cdef double smismatch + cdef double hok=1-self.hError[h-1] + cdef double vok=1-self.vError[v-1] + score=iupacPartialMatch(self.hSeq.sequence[h-1],self.vSeq.sequence[v-1]) + smatch=((4*hok*vok-hok-vok)*(self._match-self._mismatch)+self._match+2*self._mismatch)/3 + smismatch=((hok+vok-4*hok*vok)*(self._match-self._mismatch)+2*self._match+7*self._mismatch)/9 + return smatch * score + smismatch * (1. - score) + + property seqA: + def __get__(self): + return self.horizontalSeq + + def __set__(self, seq): + cdef object oaddresse,olength + assert hasattr(seq, "quality_array"),"You must use sequence with quality indices" + self.sequenceChanged=True + self.horizontalSeq=seq + self.hSeq=allocateSequence(self.horizontalSeq,self.hSeq) + (oaddress,olength)=seq.quality_array.buffer_info() + self.hError=oaddress + + property seqB: + def __get__(self): + return self.verticalSeq + + def __set__(self, seq): + cdef object oaddresse,olength + assert hasattr(seq, "quality_array"),"You must use sequence with quality indices" + self.sequenceChanged=True + self.verticalSeq=seq + self.vSeq=allocateSequence(self.verticalSeq,self.vSeq) + (oaddress,olength)=seq.quality_array.buffer_info() + self.vError=oaddress + + +cdef class QSolexaReverseAssemble(QSolexaDirectAssemble): + + cdef double matchScore(self,int h, int v): + cdef double score + cdef double smatch + cdef double smismatch + cdef double hok=1-self.hError[h-1] + cdef double vok=1-self.vError[self.vSeq.length - v] + score=iupacPartialMatch(self.hSeq.sequence[h-1],self.vSeq.sequence[v-1]) + smatch=((4*hok*vok-hok-vok)*(self._match-self._mismatch)+self._match+2*self._mismatch)/3 + smismatch=((hok+vok-4*hok*vok)*(self._match-self._mismatch)+2*self._match+7*self._mismatch)/9 + return smatch * score + smismatch * (1. - score) + + + property seqB: + + def __get__(self): + return self.verticalSeq.wrapped + + def __set__(self, seq): + cdef object oaddresse,olength + assert hasattr(seq, "quality_array"),"You must use sequence with quality indices" + self.sequenceChanged=True + self.verticalSeq=seq.reverse_complement + self.vSeq=allocateSequence(self.verticalSeq,self.vSeq) + self.a = array.array('d', list(seq.quality_array)) + (oaddress,olength) = self.a.buffer_info() + self.vError = oaddress + diff --git a/python/obitools3/align/_qsrassemble.pyx b/python/obitools3/align/_qsrassemble.pyx new file mode 100644 index 0000000..7cb0ff9 --- /dev/null +++ b/python/obitools3/align/_qsrassemble.pyx @@ -0,0 +1,96 @@ +#cython: language_level=3 + +''' +Created on 6 Nov. 2009 + +@author: coissac +''' + +from ._dynamic cimport * +from ._rassemble cimport RightDirectAssemble +from copy import deepcopy +import array + + +cdef class QSolexaRightDirectAssemble(RightDirectAssemble): + + cdef double* hError + cdef double* vError + cdef object a + + def __init__(self,match=4,mismatch=-4,opengap=-8,extgap=-2): + """ + Rapport entre score de match et mismatch: + si mismatch = - match / 3 + alors quand scrore temps vers 0 et qu'il est impossible de decider + pas de penalisation (s'=0) + si mismatch < - match / 3 la non decidabilite est penalisee. + """ + RightDirectAssemble.__init__(self,match,mismatch,opengap,extgap) + + + cdef double matchScore(self,int h, int v): + cdef double score + cdef double smatch + cdef double smismatch + cdef double hok=1-self.hError[h-1] + cdef double vok=1-self.vError[v-1] + score=iupacPartialMatch(self.hSeq.sequence[h-1],self.vSeq.sequence[v-1]) + smatch=((4*hok*vok-hok-vok)*(self._match-self._mismatch)+self._match+2*self._mismatch)/3 + smismatch=((hok+vok-4*hok*vok)*(self._match-self._mismatch)+2*self._match+7*self._mismatch)/9 + return smatch * score + smismatch * (1. - score) + + property seqA: + def __get__(self): + return self.horizontalSeq + + def __set__(self, seq): + cdef object oaddresse,olength + assert hasattr(seq, "quality"),"You must use sequence with quality indices" + self.sequenceChanged=True + self.horizontalSeq=seq + self.hSeq=allocateSequence(self.horizontalSeq,self.hSeq) + (oaddress,olength)=seq.quality_array.buffer_info() + self.hError=oaddress + + property seqB: + def __get__(self): + return self.verticalSeq + + def __set__(self, seq): + cdef object oaddresse,olength + assert hasattr(seq, "quality"),"You must use sequence with quality indices" + self.sequenceChanged=True + self.verticalSeq=seq + self.vSeq=allocateSequence(self.verticalSeq,self.vSeq) + (oaddress,olength)=seq.quality_array.buffer_info() + self.vError=oaddress + + +cdef class QSolexaRightReverseAssemble(QSolexaRightDirectAssemble): + + cdef double matchScore(self,int h, int v): + cdef double score + cdef double smatch + cdef double smismatch + cdef double hok=1-self.hError[h-1] + cdef double vok=1-self.vError[self.vSeq.length - v] + score=iupacPartialMatch(self.hSeq.sequence[h-1],self.vSeq.sequence[v-1]) + smatch=((4*hok*vok-hok-vok)*(self._match-self._mismatch)+self._match+2*self._mismatch)/3 + smismatch=((hok+vok-4*hok*vok)*(self._match-self._mismatch)+2*self._match+7*self._mismatch)/9 + return smatch * score + smismatch * (1. - score) + + + property seqB: + def __get__(self): + return self.verticalSeq.wrapped + + def __set__(self, seq): + cdef object oaddresse,olength + assert hasattr(seq, "quality"),"You must use sequence with quality indices" + self.sequenceChanged=True + self.verticalSeq=seq.reverse_complement + self.vSeq=allocateSequence(self.verticalSeq,self.vSeq) + self.a = array.array('d', list(seq.quality_array)) + (oaddress,olength) = self.a.buffer_info() + self.vError = oaddress diff --git a/python/obitools3/align/_rassemble.pxd b/python/obitools3/align/_rassemble.pxd new file mode 100644 index 0000000..b65dcf5 --- /dev/null +++ b/python/obitools3/align/_rassemble.pxd @@ -0,0 +1,12 @@ +#cython: language_level=3 + +from ._nws cimport * + +cdef class RightDirectAssemble(NWS): + cdef double xsmax + cdef int xmax + + cdef double doAlignment(self) except? 0 + +cdef class RightReverseAssemble(RightDirectAssemble): + pass \ No newline at end of file diff --git a/python/obitools3/align/_rassemble.pyx b/python/obitools3/align/_rassemble.pyx new file mode 100644 index 0000000..6c5caf1 --- /dev/null +++ b/python/obitools3/align/_rassemble.pyx @@ -0,0 +1,158 @@ +#cython: language_level=3 + +''' +Created on 6 Nov. 2009 + +@author: coissac +''' + +from ._rassemble cimport * + + +cdef class RightDirectAssemble(NWS): + + def __init__(self,match=4,mismatch=-6,opengap=-8,extgap=-2): + NWS.__init__(self,match,mismatch,opengap,extgap) + self.xsmax=0 + self.xmax=0 + + cdef double doAlignment(self) except? 0: + cdef int i # vertical index + cdef int j # horizontal index + cdef int idx + cdef int jump + cdef int delta + cdef double score + cdef double scoremax + cdef int path + + + if self.needToCompute: + self.allocate() + self.reset() + self.xsmax=0 + self.xmax=0 + + for j in range(1,self.hSeq.length+1): + idx = self.index(j,0) + self.matrix.matrix[idx].score = self._opengap + (self._extgap * (j-1)) + self.matrix.matrix[idx].path = j + + for i in range(1,self.vSeq.length+1): + idx = self.index(0,i) + self.matrix.matrix[idx].score = 0 + self.matrix.matrix[idx].path = -i + + for i in range(1,self.vSeq.length+1): + for j in range(1,self.hSeq.length+1): + + # 1 - came from diagonal + idx = self.index(j-1,i-1) + #print("computing cell : %d,%d --> %d/%d" % (j,i,self.index(j,i),self.matrix.msize),) + scoremax = self.matrix.matrix[idx].score + \ + self.matchScore(j,i) + path = 0 + + #print("so=%f sd=%f sm=%f" % (self.matrix.matrix[idx].score,self.matchScore(j,i),scoremax),) + + # 2 - open horizontal gap + idx = self.index(j-1,i) + score = self.matrix.matrix[idx].score+ \ + self._opengap + if score > scoremax : + scoremax = score + path = +1 + + # 3 - open vertical gap + idx = self.index(j,i-1) + score = self.matrix.matrix[idx].score + \ + self._opengap + if score > scoremax : + scoremax = score + path = -1 + + # 4 - extend horizontal gap + jump = self.matrix.bestHJump[i] + if jump >= 0: + idx = self.index(jump,i) + delta = j-jump + score = self.matrix.matrix[idx].score + \ + self._extgap * delta + if score > scoremax : + scoremax = score + path = delta+1 + + # 5 - extend vertical gap + jump = self.matrix.bestVJump[j] + if jump >= 0: + idx = self.index(j,jump) + delta = i-jump + score = self.matrix.matrix[idx].score + \ + self._extgap * delta + if score > scoremax : + scoremax = score + path = -delta-1 + + idx = self.index(j,i) + self.matrix.matrix[idx].score = scoremax + self.matrix.matrix[idx].path = path + + if path == -1: + self.matrix.bestVJump[j]=i + elif path == +1 : + self.matrix.bestHJump[i]=j + + if i==self.vSeq.length and scoremax > self.xsmax: + self.xsmax=scoremax + self.xmax=j + + self.sequenceChanged=False + self.scoreChanged=False + + return self.xsmax + + cdef void backtrack(self): + cdef list path=[] + cdef int i + cdef int j + cdef int p + + self.doAlignment() + j=self.xmax + i=self.vSeq.length + self.path=allocatePath(i,j+1,self.path) + + if self.xmax999: # TODO why? + raise AssertionError,"Too long alignemnt" + + ic=IterOnConsensus(ali) + + for nuc,score in ic: + cnuc=nuc + quality.append(score) + aseq[i]=cnuc[0] + i+=1 + + aseq[i]=0 + + sseq=aseq + + # Reconvert quality from array of probabilities to array of raw quality values + i=0 + for p in quality: + quality[i] = min(42, round(-10*log10(p))) + i+=1 + + seq.set(ali[0].wrapped.id+b"_CONS", sseq, quality=quality, tags=ali[0].wrapped) + + if hasattr(ali, "direction"): + seq[b'direction']=ali.direction + if hasattr(ali, "counter"): + seq[b'alignement_id']=ali.counter + seq[b'seq_a_single']=ic.seqASingle + seq[b'seq_b_single']=ic.seqBSingle + seq[b'seq_ab_match']=ic.seqABMatch + seq[b'seq_a_mismatch']=ic.seqAMismatch + seq[b'seq_b_mismatch']=ic.seqBMismatch + seq[b'seq_a_insertion']=ic.seqAInsertion + seq[b'seq_b_insertion']=ic.seqBInsertion-ic.seqBSingle + seq[b'seq_a_deletion']=ic.seqADeletion + seq[b'seq_b_deletion']=ic.seqBDeletion + seq[b'score']=ali.score + seq[b'ali_length']=len(seq)-ic.seqASingle-ic.seqBSingle + if seq[b'ali_length']>0: + seq[b'score_norm']=float(ali.score)/seq[b'ali_length'] + seq[b'mode']=b'alignment' + return seq + + +def buildJoinedSequence(ali, reverse, seq): + forward = ali[0].wrapped + s = forward.seq + reverse.seq + quality = forward.quality + quality.extend(reverse.quality) + seq.set(forward.id +b"_PairedEnd", s, definition=forward.definition, quality=quality, tags=forward) + seq[b"score"]=ali.score + seq[b"ali_dir"]=ali.direction + seq[b"mode"]=b"joined" + seq[b"pairedend_limit"]=len(forward) + return seq + diff --git a/python/obitools3/align/solexapairend.py b/python/obitools3/align/solexapairend.py new file mode 100644 index 0000000..db9dd53 --- /dev/null +++ b/python/obitools3/align/solexapairend.py @@ -0,0 +1,54 @@ +#cython: language_level=3 + +''' +Created on 17 mai 2010 + +@author: coissac +''' + +from obitools3.align import columnIterator + + +def iterOnAligment(ali): + pos0=0 + pos1=0 + begin0=False + end0=False + begin1=False + end1=False + + for nuc0,nuc1 in columnIterator(ali): + if nuc0==b"-": + if begin0: + if not end0: + score0 = ( ali[0].wrapped.quality_array[pos0-1] + +ali[0].wrapped.quality_array[pos0] + )/2 + else: + score0 = 1. + else: + score0 = 0. + else: + begin0=True + score0 = ali[0].wrapped.quality_array[pos0] + pos0+=1 + end0= pos0==len(ali[0].wrapped) + + if nuc1==b"-": + if begin1: + if not end1: + score1 = ( ali[1].wrapped.quality_array[pos1-1] + +ali[1].wrapped.quality_array[pos1] + )/2 + else: + score1 = 0. + else: + score1 = 1. + else: + begin1=True + score1 = ali[1].wrapped.quality_array[pos1] + pos1+=1 + end1= pos1==len(ali[1].wrapped) + + result = (nuc0,score0,nuc1,score1) + yield result