ngsfilter: reworked to use apat library

This commit is contained in:
Celine Mercier
2019-02-06 18:13:54 +01:00
parent 04a3682307
commit 08bcbcd357

View File

@ -9,16 +9,20 @@ from obitools3.apps.optiongroups import addMinimalInputOption, addMinimalOutputO
from obitools3.uri.decode import open_uri from obitools3.uri.decode import open_uri
from obitools3.apps.config import logger from obitools3.apps.config import logger
from obitools3.libalign._freeendgapfm import FreeEndGapFullMatch from obitools3.libalign._freeendgapfm import FreeEndGapFullMatch
from obitools3.libalign.apat_pattern import Primer_search
from obitools3.dms.obiseq cimport Nuc_Seq from obitools3.dms.obiseq cimport Nuc_Seq
from obitools3.dms.capi.obitypes cimport OBI_SEQ, OBI_QUAL from obitools3.dms.capi.obitypes cimport OBI_SEQ, OBI_QUAL
from obitools3.dms.capi.apat cimport MAX_PATTERN
from obitools3.utils cimport tobytes
from functools import reduce, cmp_to_key from libc.stdint cimport INT32_MAX
from functools import reduce
import math import math
import sys import sys
REVERSE_SEQ_COLUMN_NAME = b"REVERSE_SEQUENCE" REVERSE_SEQ_COLUMN_NAME = b"REVERSE_SEQUENCE" # used by alignpairedend tool
REVERSE_QUALITY_COLUMN_NAME = b"REVERSE_QUALITY" REVERSE_QUALITY_COLUMN_NAME = b"REVERSE_QUALITY" # used by alignpairedend tool
__title__="Assigns sequence records to the corresponding experiment/sample based on DNA tags and primers" __title__="Assigns sequence records to the corresponding experiment/sample based on DNA tags and primers"
@ -64,7 +68,7 @@ class Primer:
collection={} collection={}
def __init__(self, sequence, taglength, direct=True, error=2, verbose=False): def __init__(self, sequence, taglength, forward=True, max_errors=2, verbose=False, primer_pair_idx=0, primer_idx=0):
''' '''
@param sequence: @param sequence:
@ -79,28 +83,29 @@ class Primer:
Primer.collection[sequence]=taglength Primer.collection[sequence]=taglength
self.primer_pair_idx = primer_pair_idx
self.primer_idx = primer_idx
self.is_revcomp = False
self.revcomp = None
self.raw=sequence self.raw=sequence
self.sequence = Nuc_Seq(b"primer", sequence) self.sequence = Nuc_Seq(b"primer", sequence)
self.lseq = len(self.sequence) self.lseq = len(self.sequence)
self.align=FreeEndGapFullMatch() self.max_errors=max_errors
self.align.match=4
self.align.mismatch=-2
self.align.opengap=-2
self.align.extgap=-2
self.error=error
self.minscore = (self.lseq-error) * self.align.match + error * self.align.mismatch
self.taglength=taglength self.taglength=taglength
self.align.seqB=self.sequence self.forward = forward
self.direct = direct
self.verbose=verbose self.verbose=verbose
def reverse_complement(self): def reverse_complement(self):
p = Primer(self.raw, p = Primer(self.raw,
self.taglength, self.taglength,
not self.direct,verbose=self.verbose, not self.forward,
error=self.error) verbose=self.verbose,
max_errors=self.max_errors,
primer_pair_idx=self.primer_pair_idx,
primer_idx=self.primer_idx)
p.sequence=p.sequence.reverse_complement p.sequence=p.sequence.reverse_complement
p.align.seqB=p.sequence p.is_revcomp = True
p.revcomp = None
return p return p
def __hash__(self): def __hash__(self):
@ -109,51 +114,64 @@ class Primer:
def __eq__(self,primer): def __eq__(self,primer):
return self.raw==primer.raw return self.raw==primer.raw
def __call__(self,sequence): def __call__(self, sequence, same_sequence=False, pattern=0, begin=0):
if len(sequence) <= self.lseq: if len(sequence) <= self.lseq:
return None return None
self.align.seqA=sequence ali = self.aligner.search_one_primer(sequence.seq,
ali=self.align() self.primer_pair_idx,
self.primer_idx,
reverse_comp=self.is_revcomp,
same_sequence=same_sequence,
pattern_ref=pattern,
begin=begin)
if ali.score >= self.minscore: if ali is None: # no match
score = ali.score return None
start = ali[1].gaps[0][1]
end = len(ali[1])-ali[1].gaps[-1][1] errors, start = ali.first_encountered()
if errors <= self.max_errors:
end = start + self.lseq
if self.taglength is not None: if self.taglength is not None:
if self.sequence.revcomp: if self.sequence.is_revcomp:
if (len(sequence)-end) >= self.taglength: if (len(sequence)-end) >= self.taglength:
tag = sequence.clone() tag_start = len(sequence) - end - self.taglength
tag = tag[end:end+self.taglength] tag = sequence.reverse_complement[tag_start:tag_start+self.taglength].seq
tag = tag.reverse_complement.seq
else: else:
tag=None tag=None
else: else:
if start >= self.taglength: if start >= self.taglength:
tag = sequence.clone() tag = tobytes((sequence[start - self.taglength:start].seq).lower()) # turn back to lowercase because apat turned to uppercase
tag = tag[start - self.taglength:start].seq
else: else:
tag=None tag=None
else: else:
tag=None tag=None
return score,start,end,tag return errors,start,end,tag
return None return None
def __str__(self): def __str__(self):
return "%s: %s" % ({True:'D',False:'R'}[self.direct],self.raw) return "%s: %s" % ({True:'D',False:'R'}[self.forward],self.raw)
__repr__=__str__ __repr__=__str__
cdef dict read_info_view(info_view, error=2, verbose=False, not_aligned=False): cdef read_info_view(info_view, max_errors=2, verbose=False, not_aligned=False):
infos = {} infos = {}
primer_list = []
i=0
for p in info_view: for p in info_view:
forward=Primer(p[b'forward_primer'], forward=Primer(p[b'forward_primer'],
len(p[b'forward_tag']) if p[b'forward_tag']!=b'-' else None, len(p[b'forward_tag']) if p[b'forward_tag']!=b'-' else None,
True, True,
error=error,verbose=verbose) max_errors=max_errors,
verbose=verbose,
primer_pair_idx=i,
primer_idx=0)
fp = infos.get(forward,{}) fp = infos.get(forward,{})
infos[forward]=fp infos[forward]=fp
@ -161,17 +179,28 @@ cdef dict read_info_view(info_view, error=2, verbose=False, not_aligned=False):
reverse=Primer(p[b'reverse_primer'], reverse=Primer(p[b'reverse_primer'],
len(p[b'reverse_tag']) if p[b'reverse_tag']!=b'-' else None, len(p[b'reverse_tag']) if p[b'reverse_tag']!=b'-' else None,
False, False,
error=error,verbose=verbose) max_errors=max_errors,
verbose=verbose,
primer_pair_idx=i,
primer_idx=1)
primer_list.append((p[b'forward_primer'], p[b'reverse_primer']))
rp = infos.get(reverse,{}) rp = infos.get(reverse,{})
infos[reverse]=rp infos[reverse]=rp
if not_aligned: if not_aligned:
dpp=fp.get(reverse,{}) cf=forward
fp[reverse]=dpp cr=reverse
rpp=rp.get(forward,{}) cf.revcomp = forward.reverse_complement()
rp[forward]=rpp cr.revcomp = reverse.reverse_complement()
dpp=fp.get(cr,{})
fp[cr]=dpp
rpp=rp.get(cf,{})
rp[cf]=rpp
else: else:
cf=forward.reverse_complement() cf=forward.reverse_complement()
@ -199,22 +228,24 @@ cdef dict read_info_view(info_view, error=2, verbose=False, not_aligned=False):
dpp[tags] = data dpp[tags] = data
rpp[tags] = data rpp[tags] = data
return infos i+=1
return infos, primer_list
cdef tuple annotate(sequences, infos, verbose=False): cdef tuple annotate(sequences, infos, verbose=False):
def sortMatch(m1, m2): def sortMatch(match):
if m1[1] is None and m2[1] is None: if match[1] is None:
return 0 return INT32_MAX
else:
return match[1][1]
if m1[1] is None: def sortReverseMatch(match):
return 1 if match[1] is None:
if m2[1] is None:
return -1 return -1
else:
return (m1[1][1] > m2[1][2]) - (m1[1][1] < m2[1][2]) return match[1][1]
not_aligned = len(sequences) > 1 not_aligned = len(sequences) > 1
sequenceF = sequences[0] sequenceF = sequences[0]
@ -226,8 +257,8 @@ cdef tuple annotate(sequences, infos, verbose=False):
if not_aligned: if not_aligned:
sequenceR = sequences[1] sequenceR = sequences[1]
final_sequence[REVERSE_SEQ_COLUMN_NAME] = sequenceR.seq final_sequence[REVERSE_SEQ_COLUMN_NAME] = sequenceR.seq # used by alignpairedend tool
final_sequence[REVERSE_QUALITY_COLUMN_NAME] = sequenceR.quality final_sequence[REVERSE_QUALITY_COLUMN_NAME] = sequenceR.quality # used by alignpairedend tool
for seq in sequences: for seq in sequences:
if hasattr(seq, "quality_array"): if hasattr(seq, "quality_array"):
@ -242,28 +273,35 @@ cdef tuple annotate(sequences, infos, verbose=False):
seq[b'tail_quality']=q seq[b'tail_quality']=q
# Try direct matching: # Try direct matching:
directmatch = None directmatch = []
first_matched_seq = None first_matched_seq = None
second_matched_seq = None second_matched_seq = None
for seq in sequences: for seq in sequences:
new_seq = True
pattern = 0
for p in infos: for p in infos:
directmatch = (p, p(seq)) if pattern == MAX_PATTERN:
if directmatch[1] is not None: new_seq = True
break pattern = 0
if directmatch[1] is not None: directmatch.append((p, p(seq, same_sequence=not new_seq, pattern=pattern), seq))
first_matched_seq = seq new_seq = False
if id(first_matched_seq) == id(sequenceF) and not_aligned: pattern+=1
second_matched_seq = sequenceR
else: # Choose match closer to the start of (one of the) sequence(s)
second_matched_seq = sequenceF directmatch = sorted(directmatch, key=sortMatch)
break all_direct_matches = directmatch
if directmatch[1] is None: directmatch = directmatch[0] if directmatch[0][1] is not None else None
directmatch = None
if directmatch is None: if directmatch is None:
final_sequence[b'error']=b'No primer match' final_sequence[b'error']=b'No primer match'
return False, final_sequence return False, final_sequence
first_matched_seq = directmatch[2]
if id(first_matched_seq) == id(sequenceF) and not_aligned:
second_matched_seq = sequenceR
else:
second_matched_seq = sequenceF
match = first_matched_seq[directmatch[1][1]:directmatch[1][2]] match = first_matched_seq[directmatch[1][1]:directmatch[1][2]]
if not not_aligned: if not not_aligned:
@ -273,37 +311,65 @@ cdef tuple annotate(sequences, infos, verbose=False):
final_sequence = final_sequence[directmatch[1][2]:] final_sequence = final_sequence[directmatch[1][2]:]
else: else:
cut_seq = sequenceR[directmatch[1][2]:] cut_seq = sequenceR[directmatch[1][2]:]
final_sequence[REVERSE_SEQ_COLUMN_NAME] = cut_seq.seq final_sequence[REVERSE_SEQ_COLUMN_NAME] = cut_seq.seq # used by alignpairedend tool
final_sequence[REVERSE_QUALITY_COLUMN_NAME] = cut_seq.quality final_sequence[REVERSE_QUALITY_COLUMN_NAME] = cut_seq.quality # used by alignpairedend tool
if directmatch[0].direct: if directmatch[0].forward:
final_sequence[b'direction']=b'forward' final_sequence[b'direction']=b'forward'
final_sequence[b'forward_score']=directmatch[1][0] final_sequence[b'forward_errors']=directmatch[1][0]
final_sequence[b'forward_primer']=directmatch[0].raw final_sequence[b'forward_primer']=directmatch[0].raw
final_sequence[b'forward_match']=match.seq final_sequence[b'forward_match']=match.seq
else: else:
final_sequence[b'direction']=b'reverse' final_sequence[b'direction']=b'reverse'
final_sequence[b'reverse_score']=directmatch[1][0] final_sequence[b'reverse_errors']=directmatch[1][0]
final_sequence[b'reverse_primer']=directmatch[0].raw final_sequence[b'reverse_primer']=directmatch[0].raw
final_sequence[b'reverse_match']=match.seq final_sequence[b'reverse_match']=match.seq
# Keep only paired reverse primer
infos = infos[directmatch[0]] infos = infos[directmatch[0]]
# Try reverse matching on the other sequence: # If not aligned, look for other match in already computed match (choose the one that makes the biggest amplicon)
for p in infos: if not_aligned:
reversematch = (p, p(second_matched_seq)) i=1
while all_direct_matches[i][1] is None and all_direct_matches[i][0].forward and i<len(all_direct_matches):
if reversematch[1] is None and not_aligned: i+=1
# Try matching on the same sequence than the first match if i < len(all_direct_matches):
reverse_p = p.reverse_complement() reversematch = all_direct_matches[i]
reversematch = (reverse_p, reverse_p(first_matched_seq)) else:
if reversematch[1] is None:
reversematch = None reversematch = None
# Look for other primer in the other direction on the sequence, or
# If sequences are not already aligned and reverse primer not found in most likely sequence (the one without the forward primer), try matching on the same sequence than the first match (primer in the other direction)
if not not_aligned or (not_aligned and reversematch[1] is None):
if not not_aligned:
sequence_to_match = second_matched_seq
else:
sequence_to_match = first_matched_seq
reversematch = []
# Compute begin
begin=directmatch[1][2]+1 # end of match + 1 on the same sequence
# Try reverse matching on the other sequence:
new_seq = True
pattern = 0
for p in infos:
if pattern == MAX_PATTERN:
new_seq = True
pattern = 0
if not_aligned:
primer=p.revcomp
else:
primer=p
reversematch.append((primer, primer(sequence_to_match, same_sequence=not new_seq, pattern=pattern, begin=begin)))
new_seq = False
pattern+=1
# Choose match closer to the end of the sequence
reversematch = sorted(reversematch, key=sortReverseMatch, reverse=True)
all_reverse_matches = reversematch
reversematch = reversematch[0] if reversematch[0][1] is not None else None
if reversematch is None and None not in infos: if reversematch is None and None not in infos:
if directmatch[0].direct: if directmatch[0].forward:
message = b'No reverse primer match' message = b'No reverse primer match'
else: else:
message = b'No direct primer match' message = b'No direct primer match'
@ -313,7 +379,7 @@ cdef tuple annotate(sequences, infos, verbose=False):
if reversematch is None: if reversematch is None:
final_sequence[b'status']=b'partial' final_sequence[b'status']=b'partial'
if directmatch[0].direct: if directmatch[0].forward:
tags=(directmatch[1][3],None) tags=(directmatch[1][3],None)
else: else:
tags=(None,directmatch[1][3]) tags=(None,directmatch[1][3])
@ -330,18 +396,18 @@ cdef tuple annotate(sequences, infos, verbose=False):
final_sequence = final_sequence[0:reversematch[1][1]] final_sequence = final_sequence[0:reversematch[1][1]]
else: else:
cut_seq = sequenceR[reversematch[1][2]:] cut_seq = sequenceR[reversematch[1][2]:]
final_sequence[REVERSE_SEQ_COLUMN_NAME] = cut_seq.seq final_sequence[REVERSE_SEQ_COLUMN_NAME] = cut_seq.seq # used by alignpairedend tool
final_sequence[REVERSE_QUALITY_COLUMN_NAME] = cut_seq.quality final_sequence[REVERSE_QUALITY_COLUMN_NAME] = cut_seq.quality # used by alignpairedend tool
if directmatch[0].direct: if directmatch[0].forward:
tags=(directmatch[1][3], reversematch[1][3]) tags=(directmatch[1][3], reversematch[1][3])
final_sequence[b'reverse_score'] = reversematch[1][0] final_sequence[b'reverse_errors'] = reversematch[1][0]
final_sequence[b'reverse_primer'] = reversematch[0].raw final_sequence[b'reverse_primer'] = reversematch[0].raw
final_sequence[b'reverse_match'] = match.seq final_sequence[b'reverse_match'] = match.seq
else: else:
tags=(reversematch[1][3], directmatch[1][3]) tags=(reversematch[1][3], directmatch[1][3])
final_sequence[b'forward_score'] = reversematch[1][0] final_sequence[b'forward_errors'] = reversematch[1][0]
final_sequence[b'forward_primer'] = reversematch[0].raw final_sequence[b'forward_primer'] = reversematch[0].raw
final_sequence[b'forward_match'] = match.seq final_sequence[b'forward_match'] = match.seq
@ -352,7 +418,7 @@ cdef tuple annotate(sequences, infos, verbose=False):
samples = infos[reversematch[0]] samples = infos[reversematch[0]]
if not directmatch[0].direct and not not_aligned: # don't reverse complement if not_aligned if not directmatch[0].forward and not not_aligned: # don't reverse complement if not_aligned
final_sequence = final_sequence.reverse_complement final_sequence = final_sequence.reverse_complement
sample=None sample=None
@ -360,7 +426,7 @@ cdef tuple annotate(sequences, infos, verbose=False):
if tags[0] is not None: # Direct tag known if tags[0] is not None: # Direct tag known
if tags[1] is not None: # Reverse tag known if tags[1] is not None: # Reverse tag known
sample = samples.get(tags, None) sample = samples.get(tags, None)
else: # Reverse tag known else: # Only direct tag known
s=[samples[x] for x in samples if x[0]==tags[0]] s=[samples[x] for x in samples if x[0]==tags[0]]
if len(s)==1: if len(s)==1:
sample=s[0] sample=s[0]
@ -370,14 +436,14 @@ cdef tuple annotate(sequences, infos, verbose=False):
else: else:
sample=None sample=None
else: else:
if tags[1] is not None: # Reverse tag known if tags[1] is not None: # Only reverse tag known
s=[samples[x] for x in samples if x[1]==tags[1]] s=[samples[x] for x in samples if x[1]==tags[1]]
if len(s)==1: if len(s)==1:
sample=s[0] sample=s[0]
elif len(s)>1: elif len(s)>1:
final_sequence[b'error']=b'multiple samples match tags' final_sequence[b'error']=b'multiple samples match tags'
return False, final_sequence return False, final_sequence
else: # Reverse tag known else:
sample=None sample=None
if sample is None: if sample is None:
@ -478,9 +544,18 @@ def run(config):
pb = ProgressBar(entries_len, config, seconde=5) pb = ProgressBar(entries_len, config, seconde=5)
# Check and store primers and tags # Check and store primers and tags
infos = read_info_view(info_view, error=config['ngsfilter']['error'], verbose=False, not_aligned=not_aligned) # TODO obi verbose option infos, primer_list = read_info_view(info_view, max_errors=config['ngsfilter']['error'], verbose=False, not_aligned=not_aligned) # TODO obi verbose option
if not_aligned: aligner = Primer_search(primer_list, config['ngsfilter']['error'])
for p in infos:
p.aligner = aligner
for paired_p in infos[p]:
paired_p.aligner = aligner
if paired_p.revcomp is not None:
paired_p.revcomp.aligner = aligner
if not_aligned: # create columns used by alignpairedend tool
Column.new_column(o_view, REVERSE_SEQ_COLUMN_NAME, OBI_SEQ) Column.new_column(o_view, REVERSE_SEQ_COLUMN_NAME, OBI_SEQ)
Column.new_column(o_view, REVERSE_QUALITY_COLUMN_NAME, OBI_QUAL, associated_column_name=REVERSE_SEQ_COLUMN_NAME, associated_column_version=o_view[REVERSE_SEQ_COLUMN_NAME].version) Column.new_column(o_view, REVERSE_QUALITY_COLUMN_NAME, OBI_QUAL, associated_column_name=REVERSE_SEQ_COLUMN_NAME, associated_column_version=o_view[REVERSE_SEQ_COLUMN_NAME].version)
@ -510,7 +585,8 @@ def run(config):
command_line = " ".join(sys.argv[1:]) command_line = " ".join(sys.argv[1:])
o_view.write_config(config, "ngsfilter", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name) o_view.write_config(config, "ngsfilter", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
unidentified.write_config(config, "ngsfilter", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name) unidentified.write_config(config, "ngsfilter", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
# TODO add comment about unidentified seqs # Add comment about unidentified seqs
unidentified.comments["info"] = "View containing sequences categorized as unidentified by the ngsfilter command"
output[0].record_command_line(command_line) output[0].record_command_line(command_line)
print("\n") print("\n")
@ -520,19 +596,5 @@ def run(config):
output[0].close() output[0].close()
info_input[0].close() info_input[0].close()
unidentified_input[0].close() unidentified_input[0].close()
aligner.free()
# TODO ??
# if options.taglist is not None:
#TODO: Patch when no taglists
# else:
# options.direct=options.direct.lower()
# options.reverse=options.reverse.lower()
# primers={options.direct:(options.taglength,{})}
# if options.reverse is not None:
# reverse = options.reverse
# else:
# reverse = '-'
# primers[options.direct][1][reverse]={'-':('-','-',True,None)}