ngsfilter: reworked to use apat library

This commit is contained in:
Celine Mercier
2019-02-06 18:13:54 +01:00
parent 04a3682307
commit 08bcbcd357

View File

@ -9,16 +9,20 @@ from obitools3.apps.optiongroups import addMinimalInputOption, addMinimalOutputO
from obitools3.uri.decode import open_uri
from obitools3.apps.config import logger
from obitools3.libalign._freeendgapfm import FreeEndGapFullMatch
from obitools3.libalign.apat_pattern import Primer_search
from obitools3.dms.obiseq cimport Nuc_Seq
from obitools3.dms.capi.obitypes cimport OBI_SEQ, OBI_QUAL
from obitools3.dms.capi.apat cimport MAX_PATTERN
from obitools3.utils cimport tobytes
from functools import reduce, cmp_to_key
from libc.stdint cimport INT32_MAX
from functools import reduce
import math
import sys
REVERSE_SEQ_COLUMN_NAME = b"REVERSE_SEQUENCE"
REVERSE_QUALITY_COLUMN_NAME = b"REVERSE_QUALITY"
REVERSE_SEQ_COLUMN_NAME = b"REVERSE_SEQUENCE" # used by alignpairedend tool
REVERSE_QUALITY_COLUMN_NAME = b"REVERSE_QUALITY" # used by alignpairedend tool
__title__="Assigns sequence records to the corresponding experiment/sample based on DNA tags and primers"
@ -64,7 +68,7 @@ class Primer:
collection={}
def __init__(self, sequence, taglength, direct=True, error=2, verbose=False):
def __init__(self, sequence, taglength, forward=True, max_errors=2, verbose=False, primer_pair_idx=0, primer_idx=0):
'''
@param sequence:
@ -79,28 +83,29 @@ class Primer:
Primer.collection[sequence]=taglength
self.primer_pair_idx = primer_pair_idx
self.primer_idx = primer_idx
self.is_revcomp = False
self.revcomp = None
self.raw=sequence
self.sequence = Nuc_Seq(b"primer", sequence)
self.lseq = len(self.sequence)
self.align=FreeEndGapFullMatch()
self.align.match=4
self.align.mismatch=-2
self.align.opengap=-2
self.align.extgap=-2
self.error=error
self.minscore = (self.lseq-error) * self.align.match + error * self.align.mismatch
self.max_errors=max_errors
self.taglength=taglength
self.align.seqB=self.sequence
self.direct = direct
self.forward = forward
self.verbose=verbose
def reverse_complement(self):
p = Primer(self.raw,
self.taglength,
not self.direct,verbose=self.verbose,
error=self.error)
self.taglength,
not self.forward,
verbose=self.verbose,
max_errors=self.max_errors,
primer_pair_idx=self.primer_pair_idx,
primer_idx=self.primer_idx)
p.sequence=p.sequence.reverse_complement
p.align.seqB=p.sequence
p.is_revcomp = True
p.revcomp = None
return p
def __hash__(self):
@ -109,51 +114,64 @@ class Primer:
def __eq__(self,primer):
return self.raw==primer.raw
def __call__(self,sequence):
def __call__(self, sequence, same_sequence=False, pattern=0, begin=0):
if len(sequence) <= self.lseq:
return None
self.align.seqA=sequence
ali=self.align()
ali = self.aligner.search_one_primer(sequence.seq,
self.primer_pair_idx,
self.primer_idx,
reverse_comp=self.is_revcomp,
same_sequence=same_sequence,
pattern_ref=pattern,
begin=begin)
if ali.score >= self.minscore:
score = ali.score
start = ali[1].gaps[0][1]
end = len(ali[1])-ali[1].gaps[-1][1]
if ali is None: # no match
return None
errors, start = ali.first_encountered()
if errors <= self.max_errors:
end = start + self.lseq
if self.taglength is not None:
if self.sequence.revcomp:
if self.sequence.is_revcomp:
if (len(sequence)-end) >= self.taglength:
tag = sequence.clone()
tag = tag[end:end+self.taglength]
tag = tag.reverse_complement.seq
tag_start = len(sequence) - end - self.taglength
tag = sequence.reverse_complement[tag_start:tag_start+self.taglength].seq
else:
tag=None
else:
if start >= self.taglength:
tag = sequence.clone()
tag = tag[start - self.taglength:start].seq
tag = tobytes((sequence[start - self.taglength:start].seq).lower()) # turn back to lowercase because apat turned to uppercase
else:
tag=None
else:
tag=None
return score,start,end,tag
return errors,start,end,tag
return None
def __str__(self):
return "%s: %s" % ({True:'D',False:'R'}[self.direct],self.raw)
return "%s: %s" % ({True:'D',False:'R'}[self.forward],self.raw)
__repr__=__str__
cdef dict read_info_view(info_view, error=2, verbose=False, not_aligned=False):
cdef read_info_view(info_view, max_errors=2, verbose=False, not_aligned=False):
infos = {}
primer_list = []
i=0
for p in info_view:
forward=Primer(p[b'forward_primer'],
len(p[b'forward_tag']) if p[b'forward_tag']!=b'-' else None,
True,
error=error,verbose=verbose)
max_errors=max_errors,
verbose=verbose,
primer_pair_idx=i,
primer_idx=0)
fp = infos.get(forward,{})
infos[forward]=fp
@ -161,17 +179,28 @@ cdef dict read_info_view(info_view, error=2, verbose=False, not_aligned=False):
reverse=Primer(p[b'reverse_primer'],
len(p[b'reverse_tag']) if p[b'reverse_tag']!=b'-' else None,
False,
error=error,verbose=verbose)
max_errors=max_errors,
verbose=verbose,
primer_pair_idx=i,
primer_idx=1)
primer_list.append((p[b'forward_primer'], p[b'reverse_primer']))
rp = infos.get(reverse,{})
infos[reverse]=rp
if not_aligned:
dpp=fp.get(reverse,{})
fp[reverse]=dpp
cf=forward
cr=reverse
rpp=rp.get(forward,{})
rp[forward]=rpp
cf.revcomp = forward.reverse_complement()
cr.revcomp = reverse.reverse_complement()
dpp=fp.get(cr,{})
fp[cr]=dpp
rpp=rp.get(cf,{})
rp[cf]=rpp
else:
cf=forward.reverse_complement()
@ -199,22 +228,24 @@ cdef dict read_info_view(info_view, error=2, verbose=False, not_aligned=False):
dpp[tags] = data
rpp[tags] = data
return infos
i+=1
return infos, primer_list
cdef tuple annotate(sequences, infos, verbose=False):
def sortMatch(m1, m2):
if m1[1] is None and m2[1] is None:
return 0
def sortMatch(match):
if match[1] is None:
return INT32_MAX
else:
return match[1][1]
if m1[1] is None:
return 1
if m2[1] is None:
def sortReverseMatch(match):
if match[1] is None:
return -1
return (m1[1][1] > m2[1][2]) - (m1[1][1] < m2[1][2])
else:
return match[1][1]
not_aligned = len(sequences) > 1
sequenceF = sequences[0]
@ -226,8 +257,8 @@ cdef tuple annotate(sequences, infos, verbose=False):
if not_aligned:
sequenceR = sequences[1]
final_sequence[REVERSE_SEQ_COLUMN_NAME] = sequenceR.seq
final_sequence[REVERSE_QUALITY_COLUMN_NAME] = sequenceR.quality
final_sequence[REVERSE_SEQ_COLUMN_NAME] = sequenceR.seq # used by alignpairedend tool
final_sequence[REVERSE_QUALITY_COLUMN_NAME] = sequenceR.quality # used by alignpairedend tool
for seq in sequences:
if hasattr(seq, "quality_array"):
@ -242,28 +273,35 @@ cdef tuple annotate(sequences, infos, verbose=False):
seq[b'tail_quality']=q
# Try direct matching:
directmatch = None
directmatch = []
first_matched_seq = None
second_matched_seq = None
for seq in sequences:
new_seq = True
pattern = 0
for p in infos:
directmatch = (p, p(seq))
if directmatch[1] is not None:
break
if directmatch[1] is not None:
first_matched_seq = seq
if id(first_matched_seq) == id(sequenceF) and not_aligned:
second_matched_seq = sequenceR
else:
second_matched_seq = sequenceF
break
if directmatch[1] is None:
directmatch = None
if pattern == MAX_PATTERN:
new_seq = True
pattern = 0
directmatch.append((p, p(seq, same_sequence=not new_seq, pattern=pattern), seq))
new_seq = False
pattern+=1
# Choose match closer to the start of (one of the) sequence(s)
directmatch = sorted(directmatch, key=sortMatch)
all_direct_matches = directmatch
directmatch = directmatch[0] if directmatch[0][1] is not None else None
if directmatch is None:
final_sequence[b'error']=b'No primer match'
return False, final_sequence
first_matched_seq = directmatch[2]
if id(first_matched_seq) == id(sequenceF) and not_aligned:
second_matched_seq = sequenceR
else:
second_matched_seq = sequenceF
match = first_matched_seq[directmatch[1][1]:directmatch[1][2]]
if not not_aligned:
@ -273,37 +311,65 @@ cdef tuple annotate(sequences, infos, verbose=False):
final_sequence = final_sequence[directmatch[1][2]:]
else:
cut_seq = sequenceR[directmatch[1][2]:]
final_sequence[REVERSE_SEQ_COLUMN_NAME] = cut_seq.seq
final_sequence[REVERSE_QUALITY_COLUMN_NAME] = cut_seq.quality
final_sequence[REVERSE_SEQ_COLUMN_NAME] = cut_seq.seq # used by alignpairedend tool
final_sequence[REVERSE_QUALITY_COLUMN_NAME] = cut_seq.quality # used by alignpairedend tool
if directmatch[0].direct:
if directmatch[0].forward:
final_sequence[b'direction']=b'forward'
final_sequence[b'forward_score']=directmatch[1][0]
final_sequence[b'forward_errors']=directmatch[1][0]
final_sequence[b'forward_primer']=directmatch[0].raw
final_sequence[b'forward_match']=match.seq
else:
final_sequence[b'direction']=b'reverse'
final_sequence[b'reverse_score']=directmatch[1][0]
final_sequence[b'reverse_errors']=directmatch[1][0]
final_sequence[b'reverse_primer']=directmatch[0].raw
final_sequence[b'reverse_match']=match.seq
# Keep only paired reverse primer
infos = infos[directmatch[0]]
# Try reverse matching on the other sequence:
for p in infos:
reversematch = (p, p(second_matched_seq))
# If not aligned, look for other match in already computed match (choose the one that makes the biggest amplicon)
if not_aligned:
i=1
while all_direct_matches[i][1] is None and all_direct_matches[i][0].forward and i<len(all_direct_matches):
i+=1
if i < len(all_direct_matches):
reversematch = all_direct_matches[i]
else:
reversematch = None
if reversematch[1] is None and not_aligned:
# Try matching on the same sequence than the first match
reverse_p = p.reverse_complement()
reversematch = (reverse_p, reverse_p(first_matched_seq))
if reversematch[1] is None:
reversematch = None
# Look for other primer in the other direction on the sequence, or
# If sequences are not already aligned and reverse primer not found in most likely sequence (the one without the forward primer), try matching on the same sequence than the first match (primer in the other direction)
if not not_aligned or (not_aligned and reversematch[1] is None):
if not not_aligned:
sequence_to_match = second_matched_seq
else:
sequence_to_match = first_matched_seq
reversematch = []
# Compute begin
begin=directmatch[1][2]+1 # end of match + 1 on the same sequence
# Try reverse matching on the other sequence:
new_seq = True
pattern = 0
for p in infos:
if pattern == MAX_PATTERN:
new_seq = True
pattern = 0
if not_aligned:
primer=p.revcomp
else:
primer=p
reversematch.append((primer, primer(sequence_to_match, same_sequence=not new_seq, pattern=pattern, begin=begin)))
new_seq = False
pattern+=1
# Choose match closer to the end of the sequence
reversematch = sorted(reversematch, key=sortReverseMatch, reverse=True)
all_reverse_matches = reversematch
reversematch = reversematch[0] if reversematch[0][1] is not None else None
if reversematch is None and None not in infos:
if directmatch[0].direct:
if directmatch[0].forward:
message = b'No reverse primer match'
else:
message = b'No direct primer match'
@ -313,7 +379,7 @@ cdef tuple annotate(sequences, infos, verbose=False):
if reversematch is None:
final_sequence[b'status']=b'partial'
if directmatch[0].direct:
if directmatch[0].forward:
tags=(directmatch[1][3],None)
else:
tags=(None,directmatch[1][3])
@ -330,18 +396,18 @@ cdef tuple annotate(sequences, infos, verbose=False):
final_sequence = final_sequence[0:reversematch[1][1]]
else:
cut_seq = sequenceR[reversematch[1][2]:]
final_sequence[REVERSE_SEQ_COLUMN_NAME] = cut_seq.seq
final_sequence[REVERSE_QUALITY_COLUMN_NAME] = cut_seq.quality
final_sequence[REVERSE_SEQ_COLUMN_NAME] = cut_seq.seq # used by alignpairedend tool
final_sequence[REVERSE_QUALITY_COLUMN_NAME] = cut_seq.quality # used by alignpairedend tool
if directmatch[0].direct:
if directmatch[0].forward:
tags=(directmatch[1][3], reversematch[1][3])
final_sequence[b'reverse_score'] = reversematch[1][0]
final_sequence[b'reverse_errors'] = reversematch[1][0]
final_sequence[b'reverse_primer'] = reversematch[0].raw
final_sequence[b'reverse_match'] = match.seq
else:
tags=(reversematch[1][3], directmatch[1][3])
final_sequence[b'forward_score'] = reversematch[1][0]
final_sequence[b'forward_errors'] = reversematch[1][0]
final_sequence[b'forward_primer'] = reversematch[0].raw
final_sequence[b'forward_match'] = match.seq
@ -352,15 +418,15 @@ cdef tuple annotate(sequences, infos, verbose=False):
samples = infos[reversematch[0]]
if not directmatch[0].direct and not not_aligned: # don't reverse complement if not_aligned
if not directmatch[0].forward and not not_aligned: # don't reverse complement if not_aligned
final_sequence = final_sequence.reverse_complement
sample=None
if tags[0] is not None: # Direct tag known
if tags[0] is not None: # Direct tag known
if tags[1] is not None: # Reverse tag known
sample = samples.get(tags, None)
else: # Reverse tag known
else: # Only direct tag known
s=[samples[x] for x in samples if x[0]==tags[0]]
if len(s)==1:
sample=s[0]
@ -370,14 +436,14 @@ cdef tuple annotate(sequences, infos, verbose=False):
else:
sample=None
else:
if tags[1] is not None: # Reverse tag known
if tags[1] is not None: # Only reverse tag known
s=[samples[x] for x in samples if x[1]==tags[1]]
if len(s)==1:
sample=s[0]
elif len(s)>1:
final_sequence[b'error']=b'multiple samples match tags'
return False, final_sequence
else: # Reverse tag known
else:
sample=None
if sample is None:
@ -478,9 +544,18 @@ def run(config):
pb = ProgressBar(entries_len, config, seconde=5)
# Check and store primers and tags
infos = read_info_view(info_view, error=config['ngsfilter']['error'], verbose=False, not_aligned=not_aligned) # TODO obi verbose option
infos, primer_list = read_info_view(info_view, max_errors=config['ngsfilter']['error'], verbose=False, not_aligned=not_aligned) # TODO obi verbose option
if not_aligned:
aligner = Primer_search(primer_list, config['ngsfilter']['error'])
for p in infos:
p.aligner = aligner
for paired_p in infos[p]:
paired_p.aligner = aligner
if paired_p.revcomp is not None:
paired_p.revcomp.aligner = aligner
if not_aligned: # create columns used by alignpairedend tool
Column.new_column(o_view, REVERSE_SEQ_COLUMN_NAME, OBI_SEQ)
Column.new_column(o_view, REVERSE_QUALITY_COLUMN_NAME, OBI_QUAL, associated_column_name=REVERSE_SEQ_COLUMN_NAME, associated_column_version=o_view[REVERSE_SEQ_COLUMN_NAME].version)
@ -510,7 +585,8 @@ def run(config):
command_line = " ".join(sys.argv[1:])
o_view.write_config(config, "ngsfilter", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
unidentified.write_config(config, "ngsfilter", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
# TODO add comment about unidentified seqs
# Add comment about unidentified seqs
unidentified.comments["info"] = "View containing sequences categorized as unidentified by the ngsfilter command"
output[0].record_command_line(command_line)
print("\n")
@ -520,19 +596,5 @@ def run(config):
output[0].close()
info_input[0].close()
unidentified_input[0].close()
# TODO ??
# if options.taglist is not None:
#TODO: Patch when no taglists
# else:
# options.direct=options.direct.lower()
# options.reverse=options.reverse.lower()
# primers={options.direct:(options.taglength,{})}
# if options.reverse is not None:
# reverse = options.reverse
# else:
# reverse = '-'
# primers[options.direct][1][reverse]={'-':('-','-',True,None)}
aligner.free()