New command: obi ngsfilter

This commit is contained in:
Celine Mercier
2018-03-12 18:09:22 +01:00
parent dd225a255f
commit 8a0b95c1d6
6 changed files with 606 additions and 2 deletions

View File

@ -0,0 +1,11 @@
#cython: language_level=3
from ._nws cimport *
cdef class FreeEndGap(NWS):
cdef double xsmax
cdef int xmax
cdef double doAlignment(self) except? 0

View File

@ -0,0 +1,160 @@
#cython: language_level=3
'''
Created on 6 Nov. 2009
@author: coissac
'''
cdef class FreeEndGap(NWS):
def __init__(self,match=4,mismatch=-6,opengap=-8,extgap=-2):
NWS.__init__(self,match,mismatch,opengap,extgap)
self.xsmax=0
self.xmax=0
cdef double doAlignment(self) except? 0:
cdef int i # vertical index
cdef int j # horizontal index
cdef int idx
cdef int idx0
cdef int idx1
cdef int jump
cdef int delta
cdef double score
cdef double scoremax
cdef int path
assert self.hSeq.length > self.vSeq.length, \
"Sequence B must be shorter than sequence A"
if self.needToCompute:
self.allocate()
self.reset()
self.xsmax=0
self.xmax=0
for j in range(1,self.hSeq.length+1):
idx = self.index(j,0)
self.matrix.matrix[idx].score = 0
self.matrix.matrix[idx].path = j
for i in range(1,self.vSeq.length+1):
idx = self.index(0,i)
self.matrix.matrix[idx].score = self._opengap + (self._extgap * (i-1))
self.matrix.matrix[idx].path = -i
idx0=self.index(-1,0)
idx1=self.index(0,1)
for i in range(1,self.vSeq.length+1):
idx0+=1
idx1+=1
for j in range(1,self.hSeq.length+1):
# 1 - came from diagonal
#idx = self.index(j-1,i-1)
idx = idx0
# print "computing cell : %d,%d --> %d/%d" % (j,i,self.index(j,i),self.matrix.msize),
scoremax = self.matrix.matrix[idx].score + \
self.matchScore(j,i)
path = 0
# print "so=%f sd=%f sm=%f" % (self.matrix.matrix[idx].score,self.matchScore(j,i),scoremax),
# 2 - open horizontal gap
# idx = self.index(j-1,i)
idx = idx1 - 1
score = self.matrix.matrix[idx].score+ \
self._opengap
if score > scoremax :
scoremax = score
path = +1
# 3 - open vertical gap
# idx = self.index(j,i-1)
idx = idx0 + 1
score = self.matrix.matrix[idx].score + \
self._opengap
if score > scoremax :
scoremax = score
path = -1
# 4 - extend horizontal gap
jump = self.matrix.bestHJump[i]
if jump >= 0:
idx = self.index(jump,i)
delta = j-jump
score = self.matrix.matrix[idx].score + \
self._extgap * delta
if score > scoremax :
scoremax = score
path = delta+1
# 5 - extend vertical gap
jump = self.matrix.bestVJump[j]
if jump >= 0:
idx = self.index(j,jump)
delta = i-jump
score = self.matrix.matrix[idx].score + \
self._extgap * delta
if score > scoremax :
scoremax = score
path = -delta-1
# idx = self.index(j,i)
idx = idx1
self.matrix.matrix[idx].score = scoremax
self.matrix.matrix[idx].path = path
if path == -1:
self.matrix.bestVJump[j]=i
elif path == +1 :
self.matrix.bestHJump[i]=j
if i==self.vSeq.length and scoremax > self.xsmax:
self.xsmax=scoremax
self.xmax=j
idx0+=1
idx1+=1
self.sequenceChanged=False
self.scoreChanged=False
return self.xsmax
cdef void backtrack(self):
#cdef list path=[]
cdef int i
cdef int j
cdef int p
self.doAlignment()
j=self.xmax
i=self.vSeq.length
self.path=allocatePath(i,j+1,self.path)
if self.xmax<self.hSeq.length:
self.path.path[self.path.length]=self.hSeq.length-self.xmax
self.path.length+=1
while (i or j):
p=self.matrix.matrix[self.index(j,i)].path
self.path.path[self.path.length]=p
self.path.length+=1
#path.append(p)
if p==0:
i-=1
j-=1
elif p < 0:
i+=p
else:
j-=p
#path.reverse()
#reversePath(self.path)
self.path.hStart=0
self.path.vStart=0
#return 0,0,path

View File

@ -0,0 +1,7 @@
#cython: language_level=3
from ._freeendgap cimport *
cdef class FreeEndGapFullMatch(FreeEndGap):
cdef double matchScore(self,int h, int v)

View File

@ -0,0 +1,18 @@
#cython: language_level=3
'''
Created on 6 Nov. 2009
@author: coissac
'''
cdef class FreeEndGapFullMatch(FreeEndGap):
cdef double matchScore(self,int h, int v):
cdef double score
if iupacMatch(self.hSeq.sequence[h-1],self.vSeq.sequence[v-1]):
score=self._match
else:
score=self._mismatch
return score

View File

@ -6,8 +6,6 @@ Created on 6 Nov. 2009
@author: coissac
'''
from ._rassemble cimport *
cdef class RightDirectAssemble(NWS):

View File

@ -0,0 +1,410 @@
#cython: language_level=3
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.dms import DMS
from obitools3.dms.view import RollbackException
from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
from obitools3.dms.column.column cimport Column, Column_line
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN
from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t, OBI_QUAL
from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
from obitools3.uri.decode import open_uri
from obitools3.apps.config import logger
from obitools3.align._freeendgapfm import FreeEndGapFullMatch
from obitools3.dms.obiseq cimport Nuc_Seq, Nuc_Seq_Stored
from functools import reduce, cmp_to_key
import math
from curses.ascii import SO
__title__="Assigns sequence records to the corresponding experiment/sample based on DNA tags and primers"
def addOptions(parser):
addSequenceInputOption(parser)
addMinimalOutputOption(parser)
group = parser.add_argument_group('obi ngsfilter specific options')
group.add_argument('-t','--info-view',
action="store", dest="ngsfilter:info_view",
metavar="<URI>",
type=str,
default=None,
help="URI to the view containing the samples definition (with tags, primers, sample names,...)")
group.add_argument('-u','--unidentified',
action="store", dest="ngsfilter:unidentified",
metavar="<URI>",
type=str,
default=None,
help="URI to the view used to store the sequences unassigned to any sample")
group.add_argument('-e','--error',
action="store", dest="ngsfilter:error",
metavar="###",
type=int,
default=2,
help="Number of errors allowed for matching primers [default = 2]")
class Primer:
collection={}
def __init__(self, sequence, taglength, direct=True, error=2, verbose=False):
'''
@param sequence:
@type sequence:
@param direct:
@type direct:
'''
assert sequence not in Primer.collection \
or Primer.collection[sequence]==taglength, \
"Primer %s must always be used with tags of the same length" % sequence
Primer.collection[sequence]=taglength
self.raw=sequence
self.sequence = Nuc_Seq(b"primer", sequence)
self.lseq = len(self.sequence)
self.align=FreeEndGapFullMatch()
self.align.match=4
self.align.mismatch=-2
self.align.opengap=-2
self.align.extgap=-2
self.error=error
self.minscore = (self.lseq-error) * self.align.match + error * self.align.mismatch
self.taglength=taglength
self.align.seqB=self.sequence
self.direct = direct
self.verbose=verbose
def reverse_complement(self):
p = Primer(self.raw,
self.taglength,
not self.direct,verbose=self.verbose,
error=self.error)
p.sequence=p.sequence.reverse_complement
p.align.seqB=p.sequence
return p
def __hash__(self):
return hash(str(self.raw))
def __eq__(self,primer):
return self.raw==primer.raw
def __call__(self,sequence):
if len(sequence) <= self.lseq:
return None
self.align.seqA=sequence
ali=self.align()
if ali.score >= self.minscore:
score = ali.score
start = ali[1].gaps[0][1]
end = len(ali[1])-ali[1].gaps[-1][1]
if self.taglength is not None:
if self.sequence.revcomp:
if (len(sequence)-end) >= self.taglength:
tag = sequence.clone()
tag = tag[end:end+self.taglength]
tag = tag.reverse_complement.seq
else:
tag=None
else:
if start >= self.taglength:
tag = sequence.clone()
tag = tag[start - self.taglength:start].seq
else:
tag=None
else:
tag=None
return score,start,end,tag
return None
def __str__(self):
return "%s: %s" % ({True:'D',False:'R'}[self.direct],self.raw)
__repr__=__str__
cdef dict read_info_view(info_view, error=2, verbose=False):
infos = {}
for p in info_view:
forward=Primer(p[b'forward_primer'],
len(p[b'forward_tag']) if p[b'forward_tag']!=b'-' else None,
True,
error=error,verbose=verbose)
fp = infos.get(forward,{})
infos[forward]=fp
reverse=Primer(p[b'reverse_primer'],
len(p[b'reverse_tag']) if p[b'reverse_tag']!=b'-' else None,
False,
error=error,verbose=verbose)
rp = infos.get(reverse,{})
infos[reverse]=rp
cf=forward.reverse_complement()
cr=reverse.reverse_complement()
dpp=fp.get(cr,{})
fp[cr]=dpp
rpp=rp.get(cf,{})
rp[cf]=rpp
tags = (p[b'forward_tag'] if p[b'forward_tag']!=b'-' else None,
p[b'reverse_tag'] if p[b'reverse_tag']!=b'-' else None)
assert tags not in dpp, \
"Tag pair %s is already used with primer pairs: (%s,%s)" % (str(tags),forward,reverse)
# Save additional data
special_keys = [b'forward_primer', b'reverse_primer', b'forward_tag', b'reverse_tag']
data={}
for key in p:
if key not in special_keys:
data[key] = p[key]
dpp[tags] = data
rpp[tags] = data
return infos
cdef tuple annotate(sequence, infos, verbose=False):
def sortMatch(m1, m2):
if m1[1] is None and m2[1] is None:
return 0
if m1[1] is None:
return 1
if m2[1] is None:
return -1
return (m1[1][1] > m2[1][2]) - (m1[1][1] < m2[1][2])
if hasattr(sequence, "quality_array"):
q = -reduce(lambda x,y:x+y,(math.log10(z) for z in sequence.quality_array),0)/len(sequence.quality_array)*10
sequence[b'avg_quality']=q
q = -reduce(lambda x,y:x+y,(math.log10(z) for z in sequence.quality_array[0:10]),0)
sequence[b'head_quality']=q
if len(sequence.quality_array[10:-10]) :
q = -reduce(lambda x,y:x+y,(math.log10(z) for z in sequence.quality_array[10:-10]),0)/len(sequence.quality_array[10:-10])*10
sequence[b'mid_quality']=q
q = -reduce(lambda x,y:x+y,(math.log10(z) for z in sequence.quality_array[-10:]),0)
sequence[b'tail_quality']=q
directmatch = [(p,p(sequence)) for p in infos]
directmatch.sort(key=cmp_to_key(sortMatch))
directmatch=directmatch[0] if directmatch[0][1] is not None else None
if directmatch is None:
sequence[b'error']=b'No primer match'
return False,sequence
match = sequence[directmatch[1][1]:directmatch[1][2]]
sequence[b'seq_length_ori']=len(sequence)
sequence = sequence[directmatch[1][2]:]
if directmatch[0].direct:
sequence[b'direction']=b'forward'
sequence[b'forward_score']=directmatch[1][0]
sequence[b'forward_primer']=directmatch[0].raw
sequence[b'forward_match']=match.seq
else:
sequence[b'direction']=b'reverse'
sequence[b'reverse_score']=directmatch[1][0]
sequence[b'reverse_primer']=directmatch[0].raw
sequence[b'reverse_match']=match.seq
infos = infos[directmatch[0]]
reversematch = [(p,p(sequence)) for p in infos if p is not None]
reversematch.sort(key=cmp_to_key(sortMatch))
reversematch = reversematch[0] if reversematch[0][1] is not None else None
if reversematch is None and None not in infos:
if directmatch[0].direct:
message = b'No reverse primer match'
else:
message = b'No direct primer match'
sequence[b'error']=message
return False,sequence
if reversematch is None:
sequence[b'status']=b'partial'
if directmatch[0].direct:
tags=(directmatch[1][3],None)
else:
tags=(None,directmatch[1][3])
samples = infos[None]
else:
sequence[b'status']=b'full'
match = sequence[reversematch[1][1]:reversematch[1][2]]
match = match.reverse_complement
sequence = sequence[0:reversematch[1][1]]
if directmatch[0].direct:
tags=(directmatch[1][3],reversematch[1][3])
sequence[b'reverse_score']=reversematch[1][0]
sequence[b'reverse_primer']=reversematch[0].raw
sequence[b'reverse_match']=match.seq
else:
tags=(reversematch[1][3],directmatch[1][3])
sequence[b'forward_score']=reversematch[1][0]
sequence[b'forward_primer']=reversematch[0].raw
sequence[b'forward_match']=match.seq
if tags[0] is not None:
sequence[b'forward_tag'] = tags[0]
if tags[1] is not None:
sequence[b'reverse_tag'] = tags[1]
samples = infos[reversematch[0]]
if not directmatch[0].direct:
sequence=sequence.reverse_complement
sample=None
if tags[0] is not None: # Direct tag known
if tags[1] is not None: # Reverse tag known
sample = samples.get(tags, None)
else: # Reverse tag known
s=[samples[x] for x in samples if x[0]==tags[0]]
if len(s)==1:
sample=s[0]
elif len(s)>1:
sequence[b'error']=b'multiple samples match tags'
return False,sequence
else:
sample=None
else:
if tags[1] is not None: # Reverse tag known
s=[samples[x] for x in samples if x[1]==tags[1]]
if len(s)==1:
sample=s[0]
elif len(s)>1:
sequence[b'error']=b'multiple samples match tags'
return False,sequence
else: # Reverse tag known
sample=None
if sample is None:
sequence[b'error']=b"Cannot assign sequence to a sample"
return False,sequence
sequence.update(sample)
sequence[b'seq_length']=len(sequence)
return True, sequence
def run(config):
DMS.obi_atexit()
logger("info", "obi ngsfilter")
assert config['ngsfilter']['info_view'] is not None, "Option -t must be specified"
# Open the input
input = open_uri(config['obi']['inputURI'])
if input is None:
raise Exception("Could not read input view")
if input[2] != View_NUC_SEQS:
raise NotImplementedError('obi ngsfilter only works on NUC_SEQS views')
# Open the output
output = open_uri(config['obi']['outputURI'],
input=False,
newviewtype=View_NUC_SEQS)
if output is None:
raise Exception("Could not create output view")
entries = input[1]
o_view = output[1]
# Open the the view containing the informations about the tags and the primers
info_input = open_uri(config['ngsfilter']['info_view'])
if info_input is None:
raise Exception("Could not read the view containing the informations about the tags and the primers")
info_view = info_input[1]
# Open the unidentified view
if config['ngsfilter']['unidentified'] is not None:
unidentified_input = open_uri(config['ngsfilter']['unidentified'],
input=False,
newviewtype=View_NUC_SEQS)
if unidentified_input is None:
raise Exception("Could not open the view containing the unidentified reads")
unidentified = unidentified_input[1]
# Initialize the progress bar
pb = ProgressBar(len(entries), config, seconde=5)
# Check and store primers and tags
infos = read_info_view(info_view, error=config['ngsfilter']['error'])
g = 0
u = 0
i = 0
try:
for iseq in entries:
pb(i)
i+=1
modseq = Nuc_Seq.new_from_stored(iseq)
good, oseq = annotate(modseq, infos)
if good:
o_view[g].set(oseq.id, oseq.seq, definition=oseq.definition, quality=oseq.quality, tags=oseq)
g+=1
elif unidentified is not None:
unidentified[u].set(oseq.id, oseq.seq, definition=oseq.definition, quality=oseq.quality, tags=oseq)
u+=1
except Exception, e:
raise RollbackException("obi ngsfilter error, rollbacking views: "+str(e), o_view, unidentified)
input[0].close()
output[0].close()
info_input[0].close()
unidentified_input[0].close()
# TODO ??
# if options.taglist is not None:
#TODO: Patch when no taglists
# else:
# options.direct=options.direct.lower()
# options.reverse=options.reverse.lower()
# primers={options.direct:(options.taglength,{})}
# if options.reverse is not None:
# reverse = options.reverse
# else:
# reverse = '-'
# primers[options.direct][1][reverse]={'-':('-','-',True,None)}