diff --git a/python/obitools3/commands/ngsfilter.pyx b/python/obitools3/commands/ngsfilter.pyx index 1b79f2a..429e679 100755 --- a/python/obitools3/commands/ngsfilter.pyx +++ b/python/obitools3/commands/ngsfilter.pyx @@ -9,16 +9,20 @@ from obitools3.apps.optiongroups import addMinimalInputOption, addMinimalOutputO from obitools3.uri.decode import open_uri from obitools3.apps.config import logger from obitools3.libalign._freeendgapfm import FreeEndGapFullMatch +from obitools3.libalign.apat_pattern import Primer_search from obitools3.dms.obiseq cimport Nuc_Seq from obitools3.dms.capi.obitypes cimport OBI_SEQ, OBI_QUAL +from obitools3.dms.capi.apat cimport MAX_PATTERN +from obitools3.utils cimport tobytes -from functools import reduce, cmp_to_key +from libc.stdint cimport INT32_MAX +from functools import reduce import math import sys -REVERSE_SEQ_COLUMN_NAME = b"REVERSE_SEQUENCE" -REVERSE_QUALITY_COLUMN_NAME = b"REVERSE_QUALITY" +REVERSE_SEQ_COLUMN_NAME = b"REVERSE_SEQUENCE" # used by alignpairedend tool +REVERSE_QUALITY_COLUMN_NAME = b"REVERSE_QUALITY" # used by alignpairedend tool __title__="Assigns sequence records to the corresponding experiment/sample based on DNA tags and primers" @@ -64,7 +68,7 @@ class Primer: collection={} - def __init__(self, sequence, taglength, direct=True, error=2, verbose=False): + def __init__(self, sequence, taglength, forward=True, max_errors=2, verbose=False, primer_pair_idx=0, primer_idx=0): ''' @param sequence: @@ -79,28 +83,29 @@ class Primer: Primer.collection[sequence]=taglength + self.primer_pair_idx = primer_pair_idx + self.primer_idx = primer_idx + self.is_revcomp = False + self.revcomp = None self.raw=sequence self.sequence = Nuc_Seq(b"primer", sequence) self.lseq = len(self.sequence) - self.align=FreeEndGapFullMatch() - self.align.match=4 - self.align.mismatch=-2 - self.align.opengap=-2 - self.align.extgap=-2 - self.error=error - self.minscore = (self.lseq-error) * self.align.match + error * self.align.mismatch + self.max_errors=max_errors self.taglength=taglength - self.align.seqB=self.sequence - self.direct = direct + self.forward = forward self.verbose=verbose def reverse_complement(self): p = Primer(self.raw, - self.taglength, - not self.direct,verbose=self.verbose, - error=self.error) + self.taglength, + not self.forward, + verbose=self.verbose, + max_errors=self.max_errors, + primer_pair_idx=self.primer_pair_idx, + primer_idx=self.primer_idx) p.sequence=p.sequence.reverse_complement - p.align.seqB=p.sequence + p.is_revcomp = True + p.revcomp = None return p def __hash__(self): @@ -109,51 +114,64 @@ class Primer: def __eq__(self,primer): return self.raw==primer.raw - def __call__(self,sequence): + def __call__(self, sequence, same_sequence=False, pattern=0, begin=0): + if len(sequence) <= self.lseq: return None + + ali = self.aligner.search_one_primer(sequence.seq, + self.primer_pair_idx, + self.primer_idx, + reverse_comp=self.is_revcomp, + same_sequence=same_sequence, + pattern_ref=pattern, + begin=begin) - self.align.seqA=sequence - ali=self.align() - - if ali.score >= self.minscore: - score = ali.score - start = ali[1].gaps[0][1] - end = len(ali[1])-ali[1].gaps[-1][1] + if ali is None: # no match + return None + + errors, start = ali.first_encountered() + + if errors <= self.max_errors: + end = start + self.lseq if self.taglength is not None: - if self.sequence.revcomp: + if self.sequence.is_revcomp: if (len(sequence)-end) >= self.taglength: - tag = sequence.clone() - tag = tag[end:end+self.taglength] - tag = tag.reverse_complement.seq + tag_start = len(sequence) - end - self.taglength + tag = sequence.reverse_complement[tag_start:tag_start+self.taglength].seq else: tag=None else: if start >= self.taglength: - tag = sequence.clone() - tag = tag[start - self.taglength:start].seq + tag = tobytes((sequence[start - self.taglength:start].seq).lower()) # turn back to lowercase because apat turned to uppercase else: tag=None else: tag=None - - return score,start,end,tag + + return errors,start,end,tag return None + def __str__(self): - return "%s: %s" % ({True:'D',False:'R'}[self.direct],self.raw) + return "%s: %s" % ({True:'D',False:'R'}[self.forward],self.raw) __repr__=__str__ -cdef dict read_info_view(info_view, error=2, verbose=False, not_aligned=False): +cdef read_info_view(info_view, max_errors=2, verbose=False, not_aligned=False): infos = {} + primer_list = [] + i=0 for p in info_view: forward=Primer(p[b'forward_primer'], len(p[b'forward_tag']) if p[b'forward_tag']!=b'-' else None, True, - error=error,verbose=verbose) + max_errors=max_errors, + verbose=verbose, + primer_pair_idx=i, + primer_idx=0) fp = infos.get(forward,{}) infos[forward]=fp @@ -161,17 +179,28 @@ cdef dict read_info_view(info_view, error=2, verbose=False, not_aligned=False): reverse=Primer(p[b'reverse_primer'], len(p[b'reverse_tag']) if p[b'reverse_tag']!=b'-' else None, False, - error=error,verbose=verbose) + max_errors=max_errors, + verbose=verbose, + primer_pair_idx=i, + primer_idx=1) + primer_list.append((p[b'forward_primer'], p[b'reverse_primer'])) + rp = infos.get(reverse,{}) infos[reverse]=rp if not_aligned: - dpp=fp.get(reverse,{}) - fp[reverse]=dpp + cf=forward + cr=reverse - rpp=rp.get(forward,{}) - rp[forward]=rpp + cf.revcomp = forward.reverse_complement() + cr.revcomp = reverse.reverse_complement() + + dpp=fp.get(cr,{}) + fp[cr]=dpp + + rpp=rp.get(cf,{}) + rp[cf]=rpp else: cf=forward.reverse_complement() @@ -199,22 +228,24 @@ cdef dict read_info_view(info_view, error=2, verbose=False, not_aligned=False): dpp[tags] = data rpp[tags] = data - return infos + i+=1 + + return infos, primer_list cdef tuple annotate(sequences, infos, verbose=False): - def sortMatch(m1, m2): - if m1[1] is None and m2[1] is None: - return 0 - - if m1[1] is None: - return 1 - - if m2[1] is None: + def sortMatch(match): + if match[1] is None: + return INT32_MAX + else: + return match[1][1] + + def sortReverseMatch(match): + if match[1] is None: return -1 - - return (m1[1][1] > m2[1][2]) - (m1[1][1] < m2[1][2]) + else: + return match[1][1] not_aligned = len(sequences) > 1 sequenceF = sequences[0] @@ -226,8 +257,8 @@ cdef tuple annotate(sequences, infos, verbose=False): if not_aligned: sequenceR = sequences[1] - final_sequence[REVERSE_SEQ_COLUMN_NAME] = sequenceR.seq - final_sequence[REVERSE_QUALITY_COLUMN_NAME] = sequenceR.quality + final_sequence[REVERSE_SEQ_COLUMN_NAME] = sequenceR.seq # used by alignpairedend tool + final_sequence[REVERSE_QUALITY_COLUMN_NAME] = sequenceR.quality # used by alignpairedend tool for seq in sequences: if hasattr(seq, "quality_array"): @@ -242,28 +273,35 @@ cdef tuple annotate(sequences, infos, verbose=False): seq[b'tail_quality']=q # Try direct matching: - directmatch = None + directmatch = [] first_matched_seq = None second_matched_seq = None for seq in sequences: + new_seq = True + pattern = 0 for p in infos: - directmatch = (p, p(seq)) - if directmatch[1] is not None: - break - if directmatch[1] is not None: - first_matched_seq = seq - if id(first_matched_seq) == id(sequenceF) and not_aligned: - second_matched_seq = sequenceR - else: - second_matched_seq = sequenceF - break - if directmatch[1] is None: - directmatch = None - + if pattern == MAX_PATTERN: + new_seq = True + pattern = 0 + directmatch.append((p, p(seq, same_sequence=not new_seq, pattern=pattern), seq)) + new_seq = False + pattern+=1 + + # Choose match closer to the start of (one of the) sequence(s) + directmatch = sorted(directmatch, key=sortMatch) + all_direct_matches = directmatch + directmatch = directmatch[0] if directmatch[0][1] is not None else None + if directmatch is None: final_sequence[b'error']=b'No primer match' return False, final_sequence - + + first_matched_seq = directmatch[2] + if id(first_matched_seq) == id(sequenceF) and not_aligned: + second_matched_seq = sequenceR + else: + second_matched_seq = sequenceF + match = first_matched_seq[directmatch[1][1]:directmatch[1][2]] if not not_aligned: @@ -273,37 +311,65 @@ cdef tuple annotate(sequences, infos, verbose=False): final_sequence = final_sequence[directmatch[1][2]:] else: cut_seq = sequenceR[directmatch[1][2]:] - final_sequence[REVERSE_SEQ_COLUMN_NAME] = cut_seq.seq - final_sequence[REVERSE_QUALITY_COLUMN_NAME] = cut_seq.quality + final_sequence[REVERSE_SEQ_COLUMN_NAME] = cut_seq.seq # used by alignpairedend tool + final_sequence[REVERSE_QUALITY_COLUMN_NAME] = cut_seq.quality # used by alignpairedend tool - if directmatch[0].direct: + if directmatch[0].forward: final_sequence[b'direction']=b'forward' - final_sequence[b'forward_score']=directmatch[1][0] + final_sequence[b'forward_errors']=directmatch[1][0] final_sequence[b'forward_primer']=directmatch[0].raw final_sequence[b'forward_match']=match.seq else: final_sequence[b'direction']=b'reverse' - final_sequence[b'reverse_score']=directmatch[1][0] + final_sequence[b'reverse_errors']=directmatch[1][0] final_sequence[b'reverse_primer']=directmatch[0].raw final_sequence[b'reverse_match']=match.seq - infos = infos[directmatch[0]] - - # Try reverse matching on the other sequence: - for p in infos: - reversematch = (p, p(second_matched_seq)) + # Keep only paired reverse primer + infos = infos[directmatch[0]] - if reversematch[1] is None and not_aligned: - # Try matching on the same sequence than the first match - reverse_p = p.reverse_complement() - reversematch = (reverse_p, reverse_p(first_matched_seq)) + # If not aligned, look for other match in already computed match (choose the one that makes the biggest amplicon) + if not_aligned: + i=1 + while all_direct_matches[i][1] is None and all_direct_matches[i][0].forward and i1: final_sequence[b'error']=b'multiple samples match tags' return False, final_sequence - else: # Reverse tag known + else: sample=None if sample is None: @@ -388,7 +454,7 @@ cdef tuple annotate(sequences, infos, verbose=False): if not not_aligned: final_sequence[b'seq_length']=len(final_sequence) - + return True, final_sequence @@ -478,9 +544,18 @@ def run(config): pb = ProgressBar(entries_len, config, seconde=5) # Check and store primers and tags - infos = read_info_view(info_view, error=config['ngsfilter']['error'], verbose=False, not_aligned=not_aligned) # TODO obi verbose option + infos, primer_list = read_info_view(info_view, max_errors=config['ngsfilter']['error'], verbose=False, not_aligned=not_aligned) # TODO obi verbose option - if not_aligned: + aligner = Primer_search(primer_list, config['ngsfilter']['error']) + + for p in infos: + p.aligner = aligner + for paired_p in infos[p]: + paired_p.aligner = aligner + if paired_p.revcomp is not None: + paired_p.revcomp.aligner = aligner + + if not_aligned: # create columns used by alignpairedend tool Column.new_column(o_view, REVERSE_SEQ_COLUMN_NAME, OBI_SEQ) Column.new_column(o_view, REVERSE_QUALITY_COLUMN_NAME, OBI_QUAL, associated_column_name=REVERSE_SEQ_COLUMN_NAME, associated_column_version=o_view[REVERSE_SEQ_COLUMN_NAME].version) @@ -510,7 +585,8 @@ def run(config): command_line = " ".join(sys.argv[1:]) o_view.write_config(config, "ngsfilter", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name) unidentified.write_config(config, "ngsfilter", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name) - # TODO add comment about unidentified seqs + # Add comment about unidentified seqs + unidentified.comments["info"] = "View containing sequences categorized as unidentified by the ngsfilter command" output[0].record_command_line(command_line) print("\n") @@ -520,19 +596,5 @@ def run(config): output[0].close() info_input[0].close() unidentified_input[0].close() - - - -# TODO ?? -# if options.taglist is not None: -#TODO: Patch when no taglists -# else: -# options.direct=options.direct.lower() -# options.reverse=options.reverse.lower() -# primers={options.direct:(options.taglength,{})} -# if options.reverse is not None: -# reverse = options.reverse -# else: -# reverse = '-' -# primers[options.direct][1][reverse]={'-':('-','-',True,None)} + aligner.free()