From 8a0b95c1d60956a68631d94c38365f3c3ec719e6 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Mon, 12 Mar 2018 18:09:22 +0100 Subject: [PATCH] New command: obi ngsfilter --- python/obitools3/align/_freeendgap.pxd | 11 + python/obitools3/align/_freeendgap.pyx | 160 +++++++++ python/obitools3/align/_freeendgapfm.pxd | 7 + python/obitools3/align/_freeendgapfm.pyx | 18 + python/obitools3/align/_rassemble.pyx | 2 - python/obitools3/commands/ngsfilter.pyx | 410 +++++++++++++++++++++++ 6 files changed, 606 insertions(+), 2 deletions(-) create mode 100755 python/obitools3/align/_freeendgap.pxd create mode 100755 python/obitools3/align/_freeendgap.pyx create mode 100755 python/obitools3/align/_freeendgapfm.pxd create mode 100755 python/obitools3/align/_freeendgapfm.pyx create mode 100644 python/obitools3/commands/ngsfilter.pyx diff --git a/python/obitools3/align/_freeendgap.pxd b/python/obitools3/align/_freeendgap.pxd new file mode 100755 index 0000000..e366cb0 --- /dev/null +++ b/python/obitools3/align/_freeendgap.pxd @@ -0,0 +1,11 @@ +#cython: language_level=3 + +from ._nws cimport * + +cdef class FreeEndGap(NWS): + cdef double xsmax + cdef int xmax + + + cdef double doAlignment(self) except? 0 + diff --git a/python/obitools3/align/_freeendgap.pyx b/python/obitools3/align/_freeendgap.pyx new file mode 100755 index 0000000..7a3f2ec --- /dev/null +++ b/python/obitools3/align/_freeendgap.pyx @@ -0,0 +1,160 @@ +#cython: language_level=3 + +''' +Created on 6 Nov. 2009 + +@author: coissac +''' + + +cdef class FreeEndGap(NWS): + + def __init__(self,match=4,mismatch=-6,opengap=-8,extgap=-2): + NWS.__init__(self,match,mismatch,opengap,extgap) + self.xsmax=0 + self.xmax=0 + + cdef double doAlignment(self) except? 0: + cdef int i # vertical index + cdef int j # horizontal index + cdef int idx + cdef int idx0 + cdef int idx1 + cdef int jump + cdef int delta + cdef double score + cdef double scoremax + cdef int path + + assert self.hSeq.length > self.vSeq.length, \ + "Sequence B must be shorter than sequence A" + + + if self.needToCompute: + self.allocate() + self.reset() + self.xsmax=0 + self.xmax=0 + + for j in range(1,self.hSeq.length+1): + idx = self.index(j,0) + self.matrix.matrix[idx].score = 0 + self.matrix.matrix[idx].path = j + + for i in range(1,self.vSeq.length+1): + idx = self.index(0,i) + self.matrix.matrix[idx].score = self._opengap + (self._extgap * (i-1)) + self.matrix.matrix[idx].path = -i + + idx0=self.index(-1,0) + idx1=self.index(0,1) + for i in range(1,self.vSeq.length+1): + idx0+=1 + idx1+=1 + for j in range(1,self.hSeq.length+1): + + # 1 - came from diagonal + #idx = self.index(j-1,i-1) + idx = idx0 + # print "computing cell : %d,%d --> %d/%d" % (j,i,self.index(j,i),self.matrix.msize), + scoremax = self.matrix.matrix[idx].score + \ + self.matchScore(j,i) + path = 0 + + # print "so=%f sd=%f sm=%f" % (self.matrix.matrix[idx].score,self.matchScore(j,i),scoremax), + + # 2 - open horizontal gap + # idx = self.index(j-1,i) + idx = idx1 - 1 + score = self.matrix.matrix[idx].score+ \ + self._opengap + if score > scoremax : + scoremax = score + path = +1 + + # 3 - open vertical gap + # idx = self.index(j,i-1) + idx = idx0 + 1 + score = self.matrix.matrix[idx].score + \ + self._opengap + if score > scoremax : + scoremax = score + path = -1 + + # 4 - extend horizontal gap + jump = self.matrix.bestHJump[i] + if jump >= 0: + idx = self.index(jump,i) + delta = j-jump + score = self.matrix.matrix[idx].score + \ + self._extgap * delta + if score > scoremax : + scoremax = score + path = delta+1 + + # 5 - extend vertical gap + jump = self.matrix.bestVJump[j] + if jump >= 0: + idx = self.index(j,jump) + delta = i-jump + score = self.matrix.matrix[idx].score + \ + self._extgap * delta + if score > scoremax : + scoremax = score + path = -delta-1 + + # idx = self.index(j,i) + idx = idx1 + self.matrix.matrix[idx].score = scoremax + self.matrix.matrix[idx].path = path + + if path == -1: + self.matrix.bestVJump[j]=i + elif path == +1 : + self.matrix.bestHJump[i]=j + + if i==self.vSeq.length and scoremax > self.xsmax: + self.xsmax=scoremax + self.xmax=j + idx0+=1 + idx1+=1 + + self.sequenceChanged=False + self.scoreChanged=False + + return self.xsmax + + cdef void backtrack(self): + #cdef list path=[] + cdef int i + cdef int j + cdef int p + + self.doAlignment() + j=self.xmax + i=self.vSeq.length + self.path=allocatePath(i,j+1,self.path) + + if self.xmax= self.minscore: + score = ali.score + start = ali[1].gaps[0][1] + end = len(ali[1])-ali[1].gaps[-1][1] + if self.taglength is not None: + if self.sequence.revcomp: + if (len(sequence)-end) >= self.taglength: + tag = sequence.clone() + tag = tag[end:end+self.taglength] + tag = tag.reverse_complement.seq + else: + tag=None + else: + if start >= self.taglength: + tag = sequence.clone() + tag = tag[start - self.taglength:start].seq + else: + tag=None + else: + tag=None + + return score,start,end,tag + + return None + + def __str__(self): + return "%s: %s" % ({True:'D',False:'R'}[self.direct],self.raw) + + __repr__=__str__ + + +cdef dict read_info_view(info_view, error=2, verbose=False): + infos = {} + for p in info_view: + forward=Primer(p[b'forward_primer'], + len(p[b'forward_tag']) if p[b'forward_tag']!=b'-' else None, + True, + error=error,verbose=verbose) + + fp = infos.get(forward,{}) + infos[forward]=fp + + reverse=Primer(p[b'reverse_primer'], + len(p[b'reverse_tag']) if p[b'reverse_tag']!=b'-' else None, + False, + error=error,verbose=verbose) + + rp = infos.get(reverse,{}) + infos[reverse]=rp + + cf=forward.reverse_complement() + cr=reverse.reverse_complement() + + dpp=fp.get(cr,{}) + fp[cr]=dpp + + rpp=rp.get(cf,{}) + rp[cf]=rpp + + tags = (p[b'forward_tag'] if p[b'forward_tag']!=b'-' else None, + p[b'reverse_tag'] if p[b'reverse_tag']!=b'-' else None) + + assert tags not in dpp, \ + "Tag pair %s is already used with primer pairs: (%s,%s)" % (str(tags),forward,reverse) + + # Save additional data + special_keys = [b'forward_primer', b'reverse_primer', b'forward_tag', b'reverse_tag'] + data={} + for key in p: + if key not in special_keys: + data[key] = p[key] + + dpp[tags] = data + rpp[tags] = data + + return infos + + +cdef tuple annotate(sequence, infos, verbose=False): + + def sortMatch(m1, m2): + if m1[1] is None and m2[1] is None: + return 0 + + if m1[1] is None: + return 1 + + if m2[1] is None: + return -1 + + return (m1[1][1] > m2[1][2]) - (m1[1][1] < m2[1][2]) + + if hasattr(sequence, "quality_array"): + q = -reduce(lambda x,y:x+y,(math.log10(z) for z in sequence.quality_array),0)/len(sequence.quality_array)*10 + sequence[b'avg_quality']=q + q = -reduce(lambda x,y:x+y,(math.log10(z) for z in sequence.quality_array[0:10]),0) + sequence[b'head_quality']=q + if len(sequence.quality_array[10:-10]) : + q = -reduce(lambda x,y:x+y,(math.log10(z) for z in sequence.quality_array[10:-10]),0)/len(sequence.quality_array[10:-10])*10 + sequence[b'mid_quality']=q + q = -reduce(lambda x,y:x+y,(math.log10(z) for z in sequence.quality_array[-10:]),0) + sequence[b'tail_quality']=q + + directmatch = [(p,p(sequence)) for p in infos] + directmatch.sort(key=cmp_to_key(sortMatch)) + directmatch=directmatch[0] if directmatch[0][1] is not None else None + + if directmatch is None: + sequence[b'error']=b'No primer match' + return False,sequence + + match = sequence[directmatch[1][1]:directmatch[1][2]] + + sequence[b'seq_length_ori']=len(sequence) + + sequence = sequence[directmatch[1][2]:] + + if directmatch[0].direct: + sequence[b'direction']=b'forward' + sequence[b'forward_score']=directmatch[1][0] + sequence[b'forward_primer']=directmatch[0].raw + sequence[b'forward_match']=match.seq + + else: + sequence[b'direction']=b'reverse' + sequence[b'reverse_score']=directmatch[1][0] + sequence[b'reverse_primer']=directmatch[0].raw + sequence[b'reverse_match']=match.seq + + infos = infos[directmatch[0]] + reversematch = [(p,p(sequence)) for p in infos if p is not None] + reversematch.sort(key=cmp_to_key(sortMatch)) + reversematch = reversematch[0] if reversematch[0][1] is not None else None + + if reversematch is None and None not in infos: + if directmatch[0].direct: + message = b'No reverse primer match' + else: + message = b'No direct primer match' + + sequence[b'error']=message + return False,sequence + + if reversematch is None: + sequence[b'status']=b'partial' + + if directmatch[0].direct: + tags=(directmatch[1][3],None) + else: + tags=(None,directmatch[1][3]) + + samples = infos[None] + + else: + sequence[b'status']=b'full' + + match = sequence[reversematch[1][1]:reversematch[1][2]] + match = match.reverse_complement + sequence = sequence[0:reversematch[1][1]] + + if directmatch[0].direct: + tags=(directmatch[1][3],reversematch[1][3]) + sequence[b'reverse_score']=reversematch[1][0] + sequence[b'reverse_primer']=reversematch[0].raw + sequence[b'reverse_match']=match.seq + + else: + tags=(reversematch[1][3],directmatch[1][3]) + sequence[b'forward_score']=reversematch[1][0] + sequence[b'forward_primer']=reversematch[0].raw + sequence[b'forward_match']=match.seq + + if tags[0] is not None: + sequence[b'forward_tag'] = tags[0] + if tags[1] is not None: + sequence[b'reverse_tag'] = tags[1] + + samples = infos[reversematch[0]] + + if not directmatch[0].direct: + sequence=sequence.reverse_complement + + sample=None + + if tags[0] is not None: # Direct tag known + if tags[1] is not None: # Reverse tag known + sample = samples.get(tags, None) + else: # Reverse tag known + s=[samples[x] for x in samples if x[0]==tags[0]] + if len(s)==1: + sample=s[0] + elif len(s)>1: + sequence[b'error']=b'multiple samples match tags' + return False,sequence + else: + sample=None + else: + if tags[1] is not None: # Reverse tag known + s=[samples[x] for x in samples if x[1]==tags[1]] + if len(s)==1: + sample=s[0] + elif len(s)>1: + sequence[b'error']=b'multiple samples match tags' + return False,sequence + else: # Reverse tag known + sample=None + + if sample is None: + sequence[b'error']=b"Cannot assign sequence to a sample" + return False,sequence + + sequence.update(sample) + sequence[b'seq_length']=len(sequence) + + return True, sequence + + +def run(config): + + DMS.obi_atexit() + + logger("info", "obi ngsfilter") + + assert config['ngsfilter']['info_view'] is not None, "Option -t must be specified" + + # Open the input + input = open_uri(config['obi']['inputURI']) + if input is None: + raise Exception("Could not read input view") + if input[2] != View_NUC_SEQS: + raise NotImplementedError('obi ngsfilter only works on NUC_SEQS views') + + # Open the output + output = open_uri(config['obi']['outputURI'], + input=False, + newviewtype=View_NUC_SEQS) + + if output is None: + raise Exception("Could not create output view") + + entries = input[1] + o_view = output[1] + + # Open the the view containing the informations about the tags and the primers + info_input = open_uri(config['ngsfilter']['info_view']) + if info_input is None: + raise Exception("Could not read the view containing the informations about the tags and the primers") + info_view = info_input[1] + + # Open the unidentified view + if config['ngsfilter']['unidentified'] is not None: + unidentified_input = open_uri(config['ngsfilter']['unidentified'], + input=False, + newviewtype=View_NUC_SEQS) + if unidentified_input is None: + raise Exception("Could not open the view containing the unidentified reads") + unidentified = unidentified_input[1] + + # Initialize the progress bar + pb = ProgressBar(len(entries), config, seconde=5) + + # Check and store primers and tags + infos = read_info_view(info_view, error=config['ngsfilter']['error']) + + g = 0 + u = 0 + i = 0 + try: + for iseq in entries: + pb(i) + i+=1 + modseq = Nuc_Seq.new_from_stored(iseq) + good, oseq = annotate(modseq, infos) + if good: + o_view[g].set(oseq.id, oseq.seq, definition=oseq.definition, quality=oseq.quality, tags=oseq) + g+=1 + elif unidentified is not None: + unidentified[u].set(oseq.id, oseq.seq, definition=oseq.definition, quality=oseq.quality, tags=oseq) + u+=1 + except Exception, e: + raise RollbackException("obi ngsfilter error, rollbacking views: "+str(e), o_view, unidentified) + + input[0].close() + output[0].close() + info_input[0].close() + unidentified_input[0].close() + + + +# TODO ?? +# if options.taglist is not None: +#TODO: Patch when no taglists +# else: +# options.direct=options.direct.lower() +# options.reverse=options.reverse.lower() +# primers={options.direct:(options.taglength,{})} +# if options.reverse is not None: +# reverse = options.reverse +# else: +# reverse = '-' +# primers[options.direct][1][reverse]={'-':('-','-',True,None)} +