ngsfilter: reworked to use apat library
This commit is contained in:
@ -9,16 +9,20 @@ from obitools3.apps.optiongroups import addMinimalInputOption, addMinimalOutputO
|
||||
from obitools3.uri.decode import open_uri
|
||||
from obitools3.apps.config import logger
|
||||
from obitools3.libalign._freeendgapfm import FreeEndGapFullMatch
|
||||
from obitools3.libalign.apat_pattern import Primer_search
|
||||
from obitools3.dms.obiseq cimport Nuc_Seq
|
||||
from obitools3.dms.capi.obitypes cimport OBI_SEQ, OBI_QUAL
|
||||
from obitools3.dms.capi.apat cimport MAX_PATTERN
|
||||
from obitools3.utils cimport tobytes
|
||||
|
||||
from functools import reduce, cmp_to_key
|
||||
from libc.stdint cimport INT32_MAX
|
||||
from functools import reduce
|
||||
import math
|
||||
import sys
|
||||
|
||||
|
||||
REVERSE_SEQ_COLUMN_NAME = b"REVERSE_SEQUENCE"
|
||||
REVERSE_QUALITY_COLUMN_NAME = b"REVERSE_QUALITY"
|
||||
REVERSE_SEQ_COLUMN_NAME = b"REVERSE_SEQUENCE" # used by alignpairedend tool
|
||||
REVERSE_QUALITY_COLUMN_NAME = b"REVERSE_QUALITY" # used by alignpairedend tool
|
||||
|
||||
|
||||
__title__="Assigns sequence records to the corresponding experiment/sample based on DNA tags and primers"
|
||||
@ -64,7 +68,7 @@ class Primer:
|
||||
|
||||
collection={}
|
||||
|
||||
def __init__(self, sequence, taglength, direct=True, error=2, verbose=False):
|
||||
def __init__(self, sequence, taglength, forward=True, max_errors=2, verbose=False, primer_pair_idx=0, primer_idx=0):
|
||||
'''
|
||||
|
||||
@param sequence:
|
||||
@ -79,28 +83,29 @@ class Primer:
|
||||
|
||||
Primer.collection[sequence]=taglength
|
||||
|
||||
self.primer_pair_idx = primer_pair_idx
|
||||
self.primer_idx = primer_idx
|
||||
self.is_revcomp = False
|
||||
self.revcomp = None
|
||||
self.raw=sequence
|
||||
self.sequence = Nuc_Seq(b"primer", sequence)
|
||||
self.lseq = len(self.sequence)
|
||||
self.align=FreeEndGapFullMatch()
|
||||
self.align.match=4
|
||||
self.align.mismatch=-2
|
||||
self.align.opengap=-2
|
||||
self.align.extgap=-2
|
||||
self.error=error
|
||||
self.minscore = (self.lseq-error) * self.align.match + error * self.align.mismatch
|
||||
self.max_errors=max_errors
|
||||
self.taglength=taglength
|
||||
self.align.seqB=self.sequence
|
||||
self.direct = direct
|
||||
self.forward = forward
|
||||
self.verbose=verbose
|
||||
|
||||
def reverse_complement(self):
|
||||
p = Primer(self.raw,
|
||||
self.taglength,
|
||||
not self.direct,verbose=self.verbose,
|
||||
error=self.error)
|
||||
self.taglength,
|
||||
not self.forward,
|
||||
verbose=self.verbose,
|
||||
max_errors=self.max_errors,
|
||||
primer_pair_idx=self.primer_pair_idx,
|
||||
primer_idx=self.primer_idx)
|
||||
p.sequence=p.sequence.reverse_complement
|
||||
p.align.seqB=p.sequence
|
||||
p.is_revcomp = True
|
||||
p.revcomp = None
|
||||
return p
|
||||
|
||||
def __hash__(self):
|
||||
@ -109,51 +114,64 @@ class Primer:
|
||||
def __eq__(self,primer):
|
||||
return self.raw==primer.raw
|
||||
|
||||
def __call__(self,sequence):
|
||||
def __call__(self, sequence, same_sequence=False, pattern=0, begin=0):
|
||||
|
||||
if len(sequence) <= self.lseq:
|
||||
return None
|
||||
|
||||
self.align.seqA=sequence
|
||||
ali=self.align()
|
||||
ali = self.aligner.search_one_primer(sequence.seq,
|
||||
self.primer_pair_idx,
|
||||
self.primer_idx,
|
||||
reverse_comp=self.is_revcomp,
|
||||
same_sequence=same_sequence,
|
||||
pattern_ref=pattern,
|
||||
begin=begin)
|
||||
|
||||
if ali.score >= self.minscore:
|
||||
score = ali.score
|
||||
start = ali[1].gaps[0][1]
|
||||
end = len(ali[1])-ali[1].gaps[-1][1]
|
||||
if ali is None: # no match
|
||||
return None
|
||||
|
||||
errors, start = ali.first_encountered()
|
||||
|
||||
if errors <= self.max_errors:
|
||||
end = start + self.lseq
|
||||
if self.taglength is not None:
|
||||
if self.sequence.revcomp:
|
||||
if self.sequence.is_revcomp:
|
||||
if (len(sequence)-end) >= self.taglength:
|
||||
tag = sequence.clone()
|
||||
tag = tag[end:end+self.taglength]
|
||||
tag = tag.reverse_complement.seq
|
||||
tag_start = len(sequence) - end - self.taglength
|
||||
tag = sequence.reverse_complement[tag_start:tag_start+self.taglength].seq
|
||||
else:
|
||||
tag=None
|
||||
else:
|
||||
if start >= self.taglength:
|
||||
tag = sequence.clone()
|
||||
tag = tag[start - self.taglength:start].seq
|
||||
tag = tobytes((sequence[start - self.taglength:start].seq).lower()) # turn back to lowercase because apat turned to uppercase
|
||||
else:
|
||||
tag=None
|
||||
else:
|
||||
tag=None
|
||||
|
||||
return score,start,end,tag
|
||||
return errors,start,end,tag
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def __str__(self):
|
||||
return "%s: %s" % ({True:'D',False:'R'}[self.direct],self.raw)
|
||||
return "%s: %s" % ({True:'D',False:'R'}[self.forward],self.raw)
|
||||
|
||||
__repr__=__str__
|
||||
|
||||
|
||||
cdef dict read_info_view(info_view, error=2, verbose=False, not_aligned=False):
|
||||
cdef read_info_view(info_view, max_errors=2, verbose=False, not_aligned=False):
|
||||
infos = {}
|
||||
primer_list = []
|
||||
i=0
|
||||
for p in info_view:
|
||||
forward=Primer(p[b'forward_primer'],
|
||||
len(p[b'forward_tag']) if p[b'forward_tag']!=b'-' else None,
|
||||
True,
|
||||
error=error,verbose=verbose)
|
||||
max_errors=max_errors,
|
||||
verbose=verbose,
|
||||
primer_pair_idx=i,
|
||||
primer_idx=0)
|
||||
|
||||
fp = infos.get(forward,{})
|
||||
infos[forward]=fp
|
||||
@ -161,17 +179,28 @@ cdef dict read_info_view(info_view, error=2, verbose=False, not_aligned=False):
|
||||
reverse=Primer(p[b'reverse_primer'],
|
||||
len(p[b'reverse_tag']) if p[b'reverse_tag']!=b'-' else None,
|
||||
False,
|
||||
error=error,verbose=verbose)
|
||||
max_errors=max_errors,
|
||||
verbose=verbose,
|
||||
primer_pair_idx=i,
|
||||
primer_idx=1)
|
||||
|
||||
primer_list.append((p[b'forward_primer'], p[b'reverse_primer']))
|
||||
|
||||
rp = infos.get(reverse,{})
|
||||
infos[reverse]=rp
|
||||
|
||||
if not_aligned:
|
||||
dpp=fp.get(reverse,{})
|
||||
fp[reverse]=dpp
|
||||
cf=forward
|
||||
cr=reverse
|
||||
|
||||
rpp=rp.get(forward,{})
|
||||
rp[forward]=rpp
|
||||
cf.revcomp = forward.reverse_complement()
|
||||
cr.revcomp = reverse.reverse_complement()
|
||||
|
||||
dpp=fp.get(cr,{})
|
||||
fp[cr]=dpp
|
||||
|
||||
rpp=rp.get(cf,{})
|
||||
rp[cf]=rpp
|
||||
|
||||
else:
|
||||
cf=forward.reverse_complement()
|
||||
@ -199,22 +228,24 @@ cdef dict read_info_view(info_view, error=2, verbose=False, not_aligned=False):
|
||||
dpp[tags] = data
|
||||
rpp[tags] = data
|
||||
|
||||
return infos
|
||||
i+=1
|
||||
|
||||
return infos, primer_list
|
||||
|
||||
|
||||
cdef tuple annotate(sequences, infos, verbose=False):
|
||||
|
||||
def sortMatch(m1, m2):
|
||||
if m1[1] is None and m2[1] is None:
|
||||
return 0
|
||||
def sortMatch(match):
|
||||
if match[1] is None:
|
||||
return INT32_MAX
|
||||
else:
|
||||
return match[1][1]
|
||||
|
||||
if m1[1] is None:
|
||||
return 1
|
||||
|
||||
if m2[1] is None:
|
||||
def sortReverseMatch(match):
|
||||
if match[1] is None:
|
||||
return -1
|
||||
|
||||
return (m1[1][1] > m2[1][2]) - (m1[1][1] < m2[1][2])
|
||||
else:
|
||||
return match[1][1]
|
||||
|
||||
not_aligned = len(sequences) > 1
|
||||
sequenceF = sequences[0]
|
||||
@ -226,8 +257,8 @@ cdef tuple annotate(sequences, infos, verbose=False):
|
||||
|
||||
if not_aligned:
|
||||
sequenceR = sequences[1]
|
||||
final_sequence[REVERSE_SEQ_COLUMN_NAME] = sequenceR.seq
|
||||
final_sequence[REVERSE_QUALITY_COLUMN_NAME] = sequenceR.quality
|
||||
final_sequence[REVERSE_SEQ_COLUMN_NAME] = sequenceR.seq # used by alignpairedend tool
|
||||
final_sequence[REVERSE_QUALITY_COLUMN_NAME] = sequenceR.quality # used by alignpairedend tool
|
||||
|
||||
for seq in sequences:
|
||||
if hasattr(seq, "quality_array"):
|
||||
@ -242,28 +273,35 @@ cdef tuple annotate(sequences, infos, verbose=False):
|
||||
seq[b'tail_quality']=q
|
||||
|
||||
# Try direct matching:
|
||||
directmatch = None
|
||||
directmatch = []
|
||||
first_matched_seq = None
|
||||
second_matched_seq = None
|
||||
for seq in sequences:
|
||||
new_seq = True
|
||||
pattern = 0
|
||||
for p in infos:
|
||||
directmatch = (p, p(seq))
|
||||
if directmatch[1] is not None:
|
||||
break
|
||||
if directmatch[1] is not None:
|
||||
first_matched_seq = seq
|
||||
if id(first_matched_seq) == id(sequenceF) and not_aligned:
|
||||
second_matched_seq = sequenceR
|
||||
else:
|
||||
second_matched_seq = sequenceF
|
||||
break
|
||||
if directmatch[1] is None:
|
||||
directmatch = None
|
||||
if pattern == MAX_PATTERN:
|
||||
new_seq = True
|
||||
pattern = 0
|
||||
directmatch.append((p, p(seq, same_sequence=not new_seq, pattern=pattern), seq))
|
||||
new_seq = False
|
||||
pattern+=1
|
||||
|
||||
# Choose match closer to the start of (one of the) sequence(s)
|
||||
directmatch = sorted(directmatch, key=sortMatch)
|
||||
all_direct_matches = directmatch
|
||||
directmatch = directmatch[0] if directmatch[0][1] is not None else None
|
||||
|
||||
if directmatch is None:
|
||||
final_sequence[b'error']=b'No primer match'
|
||||
return False, final_sequence
|
||||
|
||||
first_matched_seq = directmatch[2]
|
||||
if id(first_matched_seq) == id(sequenceF) and not_aligned:
|
||||
second_matched_seq = sequenceR
|
||||
else:
|
||||
second_matched_seq = sequenceF
|
||||
|
||||
match = first_matched_seq[directmatch[1][1]:directmatch[1][2]]
|
||||
|
||||
if not not_aligned:
|
||||
@ -273,37 +311,65 @@ cdef tuple annotate(sequences, infos, verbose=False):
|
||||
final_sequence = final_sequence[directmatch[1][2]:]
|
||||
else:
|
||||
cut_seq = sequenceR[directmatch[1][2]:]
|
||||
final_sequence[REVERSE_SEQ_COLUMN_NAME] = cut_seq.seq
|
||||
final_sequence[REVERSE_QUALITY_COLUMN_NAME] = cut_seq.quality
|
||||
final_sequence[REVERSE_SEQ_COLUMN_NAME] = cut_seq.seq # used by alignpairedend tool
|
||||
final_sequence[REVERSE_QUALITY_COLUMN_NAME] = cut_seq.quality # used by alignpairedend tool
|
||||
|
||||
if directmatch[0].direct:
|
||||
if directmatch[0].forward:
|
||||
final_sequence[b'direction']=b'forward'
|
||||
final_sequence[b'forward_score']=directmatch[1][0]
|
||||
final_sequence[b'forward_errors']=directmatch[1][0]
|
||||
final_sequence[b'forward_primer']=directmatch[0].raw
|
||||
final_sequence[b'forward_match']=match.seq
|
||||
|
||||
else:
|
||||
final_sequence[b'direction']=b'reverse'
|
||||
final_sequence[b'reverse_score']=directmatch[1][0]
|
||||
final_sequence[b'reverse_errors']=directmatch[1][0]
|
||||
final_sequence[b'reverse_primer']=directmatch[0].raw
|
||||
final_sequence[b'reverse_match']=match.seq
|
||||
|
||||
# Keep only paired reverse primer
|
||||
infos = infos[directmatch[0]]
|
||||
|
||||
# Try reverse matching on the other sequence:
|
||||
for p in infos:
|
||||
reversematch = (p, p(second_matched_seq))
|
||||
# If not aligned, look for other match in already computed match (choose the one that makes the biggest amplicon)
|
||||
if not_aligned:
|
||||
i=1
|
||||
while all_direct_matches[i][1] is None and all_direct_matches[i][0].forward and i<len(all_direct_matches):
|
||||
i+=1
|
||||
if i < len(all_direct_matches):
|
||||
reversematch = all_direct_matches[i]
|
||||
else:
|
||||
reversematch = None
|
||||
|
||||
if reversematch[1] is None and not_aligned:
|
||||
# Try matching on the same sequence than the first match
|
||||
reverse_p = p.reverse_complement()
|
||||
reversematch = (reverse_p, reverse_p(first_matched_seq))
|
||||
|
||||
if reversematch[1] is None:
|
||||
reversematch = None
|
||||
# Look for other primer in the other direction on the sequence, or
|
||||
# If sequences are not already aligned and reverse primer not found in most likely sequence (the one without the forward primer), try matching on the same sequence than the first match (primer in the other direction)
|
||||
if not not_aligned or (not_aligned and reversematch[1] is None):
|
||||
if not not_aligned:
|
||||
sequence_to_match = second_matched_seq
|
||||
else:
|
||||
sequence_to_match = first_matched_seq
|
||||
reversematch = []
|
||||
# Compute begin
|
||||
begin=directmatch[1][2]+1 # end of match + 1 on the same sequence
|
||||
# Try reverse matching on the other sequence:
|
||||
new_seq = True
|
||||
pattern = 0
|
||||
for p in infos:
|
||||
if pattern == MAX_PATTERN:
|
||||
new_seq = True
|
||||
pattern = 0
|
||||
if not_aligned:
|
||||
primer=p.revcomp
|
||||
else:
|
||||
primer=p
|
||||
reversematch.append((primer, primer(sequence_to_match, same_sequence=not new_seq, pattern=pattern, begin=begin)))
|
||||
new_seq = False
|
||||
pattern+=1
|
||||
# Choose match closer to the end of the sequence
|
||||
reversematch = sorted(reversematch, key=sortReverseMatch, reverse=True)
|
||||
all_reverse_matches = reversematch
|
||||
reversematch = reversematch[0] if reversematch[0][1] is not None else None
|
||||
|
||||
if reversematch is None and None not in infos:
|
||||
if directmatch[0].direct:
|
||||
if directmatch[0].forward:
|
||||
message = b'No reverse primer match'
|
||||
else:
|
||||
message = b'No direct primer match'
|
||||
@ -313,7 +379,7 @@ cdef tuple annotate(sequences, infos, verbose=False):
|
||||
if reversematch is None:
|
||||
final_sequence[b'status']=b'partial'
|
||||
|
||||
if directmatch[0].direct:
|
||||
if directmatch[0].forward:
|
||||
tags=(directmatch[1][3],None)
|
||||
else:
|
||||
tags=(None,directmatch[1][3])
|
||||
@ -330,18 +396,18 @@ cdef tuple annotate(sequences, infos, verbose=False):
|
||||
final_sequence = final_sequence[0:reversematch[1][1]]
|
||||
else:
|
||||
cut_seq = sequenceR[reversematch[1][2]:]
|
||||
final_sequence[REVERSE_SEQ_COLUMN_NAME] = cut_seq.seq
|
||||
final_sequence[REVERSE_QUALITY_COLUMN_NAME] = cut_seq.quality
|
||||
final_sequence[REVERSE_SEQ_COLUMN_NAME] = cut_seq.seq # used by alignpairedend tool
|
||||
final_sequence[REVERSE_QUALITY_COLUMN_NAME] = cut_seq.quality # used by alignpairedend tool
|
||||
|
||||
if directmatch[0].direct:
|
||||
if directmatch[0].forward:
|
||||
tags=(directmatch[1][3], reversematch[1][3])
|
||||
final_sequence[b'reverse_score'] = reversematch[1][0]
|
||||
final_sequence[b'reverse_errors'] = reversematch[1][0]
|
||||
final_sequence[b'reverse_primer'] = reversematch[0].raw
|
||||
final_sequence[b'reverse_match'] = match.seq
|
||||
|
||||
else:
|
||||
tags=(reversematch[1][3], directmatch[1][3])
|
||||
final_sequence[b'forward_score'] = reversematch[1][0]
|
||||
final_sequence[b'forward_errors'] = reversematch[1][0]
|
||||
final_sequence[b'forward_primer'] = reversematch[0].raw
|
||||
final_sequence[b'forward_match'] = match.seq
|
||||
|
||||
@ -352,15 +418,15 @@ cdef tuple annotate(sequences, infos, verbose=False):
|
||||
|
||||
samples = infos[reversematch[0]]
|
||||
|
||||
if not directmatch[0].direct and not not_aligned: # don't reverse complement if not_aligned
|
||||
if not directmatch[0].forward and not not_aligned: # don't reverse complement if not_aligned
|
||||
final_sequence = final_sequence.reverse_complement
|
||||
|
||||
sample=None
|
||||
|
||||
if tags[0] is not None: # Direct tag known
|
||||
if tags[0] is not None: # Direct tag known
|
||||
if tags[1] is not None: # Reverse tag known
|
||||
sample = samples.get(tags, None)
|
||||
else: # Reverse tag known
|
||||
else: # Only direct tag known
|
||||
s=[samples[x] for x in samples if x[0]==tags[0]]
|
||||
if len(s)==1:
|
||||
sample=s[0]
|
||||
@ -370,14 +436,14 @@ cdef tuple annotate(sequences, infos, verbose=False):
|
||||
else:
|
||||
sample=None
|
||||
else:
|
||||
if tags[1] is not None: # Reverse tag known
|
||||
if tags[1] is not None: # Only reverse tag known
|
||||
s=[samples[x] for x in samples if x[1]==tags[1]]
|
||||
if len(s)==1:
|
||||
sample=s[0]
|
||||
elif len(s)>1:
|
||||
final_sequence[b'error']=b'multiple samples match tags'
|
||||
return False, final_sequence
|
||||
else: # Reverse tag known
|
||||
else:
|
||||
sample=None
|
||||
|
||||
if sample is None:
|
||||
@ -478,9 +544,18 @@ def run(config):
|
||||
pb = ProgressBar(entries_len, config, seconde=5)
|
||||
|
||||
# Check and store primers and tags
|
||||
infos = read_info_view(info_view, error=config['ngsfilter']['error'], verbose=False, not_aligned=not_aligned) # TODO obi verbose option
|
||||
infos, primer_list = read_info_view(info_view, max_errors=config['ngsfilter']['error'], verbose=False, not_aligned=not_aligned) # TODO obi verbose option
|
||||
|
||||
if not_aligned:
|
||||
aligner = Primer_search(primer_list, config['ngsfilter']['error'])
|
||||
|
||||
for p in infos:
|
||||
p.aligner = aligner
|
||||
for paired_p in infos[p]:
|
||||
paired_p.aligner = aligner
|
||||
if paired_p.revcomp is not None:
|
||||
paired_p.revcomp.aligner = aligner
|
||||
|
||||
if not_aligned: # create columns used by alignpairedend tool
|
||||
Column.new_column(o_view, REVERSE_SEQ_COLUMN_NAME, OBI_SEQ)
|
||||
Column.new_column(o_view, REVERSE_QUALITY_COLUMN_NAME, OBI_QUAL, associated_column_name=REVERSE_SEQ_COLUMN_NAME, associated_column_version=o_view[REVERSE_SEQ_COLUMN_NAME].version)
|
||||
|
||||
@ -510,7 +585,8 @@ def run(config):
|
||||
command_line = " ".join(sys.argv[1:])
|
||||
o_view.write_config(config, "ngsfilter", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
|
||||
unidentified.write_config(config, "ngsfilter", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
|
||||
# TODO add comment about unidentified seqs
|
||||
# Add comment about unidentified seqs
|
||||
unidentified.comments["info"] = "View containing sequences categorized as unidentified by the ngsfilter command"
|
||||
output[0].record_command_line(command_line)
|
||||
|
||||
print("\n")
|
||||
@ -520,19 +596,5 @@ def run(config):
|
||||
output[0].close()
|
||||
info_input[0].close()
|
||||
unidentified_input[0].close()
|
||||
|
||||
|
||||
|
||||
# TODO ??
|
||||
# if options.taglist is not None:
|
||||
#TODO: Patch when no taglists
|
||||
# else:
|
||||
# options.direct=options.direct.lower()
|
||||
# options.reverse=options.reverse.lower()
|
||||
# primers={options.direct:(options.taglength,{})}
|
||||
# if options.reverse is not None:
|
||||
# reverse = options.reverse
|
||||
# else:
|
||||
# reverse = '-'
|
||||
# primers[options.direct][1][reverse]={'-':('-','-',True,None)}
|
||||
aligner.free()
|
||||
|
||||
|
Reference in New Issue
Block a user