From e8dc5eb1235d1c94acc55d1406508839c5e6a470 Mon Sep 17 00:00:00 2001 From: Celine Mercier Date: Wed, 8 Aug 2018 19:53:26 +0200 Subject: [PATCH] Commands: ngsfilter and alignpairedend can now be used in whichever order --- python/obitools3/commands/alignpairedend.pyx | 121 +++++--- python/obitools3/commands/ngsfilter.pyx | 293 +++++++++++++------ 2 files changed, 292 insertions(+), 122 deletions(-) diff --git a/python/obitools3/commands/alignpairedend.pyx b/python/obitools3/commands/alignpairedend.pyx index df649dc..eab56f7 100644 --- a/python/obitools3/commands/alignpairedend.pyx +++ b/python/obitools3/commands/alignpairedend.pyx @@ -8,13 +8,17 @@ from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS from obitools3.dms.column.column cimport Column, Column_line from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t, OBI_QUAL -from obitools3.apps.optiongroups import addMinimalOutputOption +from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption from obitools3.uri.decode import open_uri from obitools3.apps.config import logger from obitools3.align._qsassemble import QSolexaReverseAssemble from obitools3.align._qsrassemble import QSolexaRightReverseAssemble from obitools3.align._solexapairend import buildConsensus, buildJoinedSequence -from obitools3.dms.obiseq cimport Nuc_Seq_Stored +from obitools3.dms.obiseq cimport Nuc_Seq_Stored, Nuc_Seq + + +REVERSE_SEQ_COLUMN_NAME = b"REVERSE_SEQUENCE" +REVERSE_QUALITY_COLUMN_NAME = b"REVERSE_QUALITY" __title__="Aligns paired-ended reads" @@ -22,23 +26,17 @@ __title__="Aligns paired-ended reads" def addOptions(parser): + addSequenceInputOption(parser) addMinimalOutputOption(parser) group = parser.add_argument_group('obi alignpairedend specific options') - - group.add_argument('-f', '--forward-reads', - action="store", dest="alignpairedend:forward", - metavar="", - default=None, - type=str, - help="URI to the forward reads") - group.add_argument('-r', '--reverse-reads', + group.add_argument('-R', '--reverse-reads', action="store", dest="alignpairedend:reverse", metavar="", default=None, type=str, - help="URI to the reverse reads") + help="URI to the reverse reads if they are in a different view than the forward reads") # TODO # group.add_argument('--index-file', @@ -60,7 +58,7 @@ def addOptions(parser): la = QSolexaReverseAssemble() ra = QSolexaRightReverseAssemble() -def buildAlignment(Nuc_Seq_Stored direct, Nuc_Seq_Stored reverse): +def buildAlignment(object direct, object reverse): if len(direct)==0 or len(reverse)==0: return None @@ -83,9 +81,27 @@ def buildAlignment(Nuc_Seq_Stored direct, Nuc_Seq_Stored reverse): return ali -def alignmentIterator(View_NUC_SEQS forward, View_NUC_SEQS reverse): - for i in range(len(forward)): - ali = buildAlignment(forward[i], reverse[i]) +def alignmentIterator(entries): + + if type(entries) == list: + two_views = True + forward = entries[0] + reverse = entries[1] + entries_len = len(forward) + else: + two_views = False + entries_len = len(entries) + + for i in range(entries_len): + if two_views: + seqF = forward[i] + seqR = reverse[i] + else: + seqF = Nuc_Seq.new_from_stored(entries[i]) + seqR = Nuc_Seq(seqF.id, seqF[REVERSE_SEQ_COLUMN_NAME], quality=seqF[REVERSE_QUALITY_COLUMN_NAME]) + seqF.pop(REVERSE_SEQ_COLUMN_NAME) + seqF.pop(REVERSE_QUALITY_COLUMN_NAME) + ali = buildAlignment(seqF, seqR) if ali is None: continue yield ali @@ -98,22 +114,49 @@ def run(config): logger("info", "obi alignpairedend") # Open the input - finput = open_uri(config["alignpairedend"]["forward"]) - if finput is None: - raise Exception("Could not open forward reads") - forward = finput[1] - rinput = open_uri(config["alignpairedend"]["reverse"]) - if rinput is None: - raise Exception("Could not open reverse reads") - reverse = rinput[1] - - if len(forward) != len(reverse): - raise Exception("Error: the number of forward and reverse reads are different") - if "index" in config["alignpairedend"]: # TODO - index = open_uri(config["alignpairedend"]["index"]) + two_views = False + forward = None + reverse = None + input = None + + input = open_uri(config['obi']['inputURI']) + if input is None: + raise Exception("Could not open input reads") + if input[2] != View_NUC_SEQS: + raise NotImplementedError('obi alignpairedend only works on NUC_SEQS views') + + if "reverse" in config["alignpairedend"]: + + two_views = True + + forward = input[1] + + rinput = open_uri(config["alignpairedend"]["reverse"]) + if rinput is None: + raise Exception("Could not open reverse reads") + if rinput[2] != View_NUC_SEQS: + raise NotImplementedError('obi alignpairedend only works on NUC_SEQS views') + + reverse = rinput[1] + + if len(forward) != len(reverse): + raise Exception("Error: the number of forward and reverse reads are different") + + entries = [forward, reverse] + else: - index = None + entries = input[1] + + if two_views: + entries_len = len(forward) + else: + entries_len = len(entries) + +# if "index" in config["alignpairedend"]: # TODO +# index = open_uri(config["alignpairedend"]["index"]) +# else: +# index = None # Open the output output = open_uri(config['obi']['outputURI'], @@ -135,9 +178,9 @@ def run(config): sminR = 0 # Initialize the progress bar - pb = ProgressBar(len(forward), config, seconde=5) + pb = ProgressBar(entries_len, config, seconde=5) - ba = alignmentIterator(forward, reverse) + ba = alignmentIterator(entries) i = 0 for ali in ba: @@ -151,7 +194,13 @@ def run(config): or (ali.score > sminR)): buildConsensus(ali, consensus) else: - buildJoinedSequence(ali, reverse[i], consensus) + seqF = ali[0].wrapped + if not two_views: + seq = entries[i] + seqR = Nuc_Seq(seq.id, seq[REVERSE_SEQ_COLUMN_NAME], quality = seq[REVERSE_QUALITY_COLUMN_NAME]) + else: + seqR = reverse[i] + buildJoinedSequence(ali, seqR, consensus) consensus[b"sminL"] = sminL consensus[b"sminR"] = sminR @@ -165,7 +214,11 @@ def run(config): i+=1 - finput[0].close() - rinput[0].close() + print("\n") + print(repr(view)) + + input[0].close() + if two_views: + rinput[0].close() output[0].close() diff --git a/python/obitools3/commands/ngsfilter.pyx b/python/obitools3/commands/ngsfilter.pyx index 6fc320c..3693a99 100644 --- a/python/obitools3/commands/ngsfilter.pyx +++ b/python/obitools3/commands/ngsfilter.pyx @@ -10,10 +10,16 @@ from obitools3.uri.decode import open_uri from obitools3.apps.config import logger from obitools3.align._freeendgapfm import FreeEndGapFullMatch from obitools3.dms.obiseq cimport Nuc_Seq +from obitools3.dms.capi.obitypes cimport OBI_SEQ, OBI_QUAL + from functools import reduce, cmp_to_key import math +REVERSE_SEQ_COLUMN_NAME = b"REVERSE_SEQUENCE" +REVERSE_QUALITY_COLUMN_NAME = b"REVERSE_QUALITY" + + __title__="Assigns sequence records to the corresponding experiment/sample based on DNA tags and primers" @@ -31,6 +37,20 @@ def addOptions(parser): default=None, help="URI to the view containing the samples definition (with tags, primers, sample names,...)") +# group.add_argument('-F', '--forward-reads', +# action="store", dest="ngsfilter:forward", +# metavar="", +# default=None, +# type=str, +# help="URI to the forward reads if the paired-end reads haven't been aligned yet") + + group.add_argument('-R', '--reverse-reads', + action="store", dest="ngsfilter:reverse", + metavar="", + default=None, + type=str, + help="URI to the reverse reads if the paired-end reads haven't been aligned yet") + group.add_argument('-u','--unidentified', action="store", dest="ngsfilter:unidentified", metavar="", @@ -66,7 +86,7 @@ class Primer: Primer.collection[sequence]=taglength self.raw=sequence - self.sequence = Nuc_Seq(b"primer", sequence) + self.sequence = Nuc_Seq(b"primer", sequence) self.lseq = len(self.sequence) self.align=FreeEndGapFullMatch() self.align.match=4 @@ -76,9 +96,7 @@ class Primer: self.error=error self.minscore = (self.lseq-error) * self.align.match + error * self.align.mismatch self.taglength=taglength - self.align.seqB=self.sequence - self.direct = direct self.verbose=verbose @@ -100,9 +118,10 @@ class Primer: def __call__(self,sequence): if len(sequence) <= self.lseq: return None + self.align.seqA=sequence ali=self.align() - + if ali.score >= self.minscore: score = ali.score start = ali[1].gaps[0][1] @@ -134,7 +153,7 @@ class Primer: __repr__=__str__ -cdef dict read_info_view(info_view, error=2, verbose=False): +cdef dict read_info_view(info_view, error=2, verbose=False, not_aligned=False): infos = {} for p in info_view: forward=Primer(p[b'forward_primer'], @@ -144,7 +163,7 @@ cdef dict read_info_view(info_view, error=2, verbose=False): fp = infos.get(forward,{}) infos[forward]=fp - + reverse=Primer(p[b'reverse_primer'], len(p[b'reverse_tag']) if p[b'reverse_tag']!=b'-' else None, False, @@ -153,15 +172,23 @@ cdef dict read_info_view(info_view, error=2, verbose=False): rp = infos.get(reverse,{}) infos[reverse]=rp - cf=forward.reverse_complement() - cr=reverse.reverse_complement() - - dpp=fp.get(cr,{}) - fp[cr]=dpp - - rpp=rp.get(cf,{}) - rp[cf]=rpp + if not_aligned: + dpp=fp.get(reverse,{}) + fp[reverse]=dpp + rpp=rp.get(forward,{}) + rp[forward]=rpp + + else: + cf=forward.reverse_complement() + cr=reverse.reverse_complement() + + dpp=fp.get(cr,{}) + fp[cr]=dpp + + rpp=rp.get(cf,{}) + rp[cf]=rpp + tags = (p[b'forward_tag'] if p[b'forward_tag']!=b'-' else None, p[b'reverse_tag'] if p[b'reverse_tag']!=b'-' else None) @@ -181,7 +208,7 @@ cdef dict read_info_view(info_view, error=2, verbose=False): return infos -cdef tuple annotate(sequence, infos, verbose=False): +cdef tuple annotate(sequences, infos, verbose=False): def sortMatch(m1, m2): if m1[1] is None and m2[1] is None: @@ -195,59 +222,102 @@ cdef tuple annotate(sequence, infos, verbose=False): return (m1[1][1] > m2[1][2]) - (m1[1][1] < m2[1][2]) - if hasattr(sequence, "quality_array"): - q = -reduce(lambda x,y:x+y,(math.log10(z) for z in sequence.quality_array),0)/len(sequence.quality_array)*10 - sequence[b'avg_quality']=q - q = -reduce(lambda x,y:x+y,(math.log10(z) for z in sequence.quality_array[0:10]),0) - sequence[b'head_quality']=q - if len(sequence.quality_array[10:-10]) : - q = -reduce(lambda x,y:x+y,(math.log10(z) for z in sequence.quality_array[10:-10]),0)/len(sequence.quality_array[10:-10])*10 - sequence[b'mid_quality']=q - q = -reduce(lambda x,y:x+y,(math.log10(z) for z in sequence.quality_array[-10:]),0) - sequence[b'tail_quality']=q + not_aligned = len(sequences) > 1 + sequenceF = sequences[0] + sequenceR = None + if not not_aligned: + final_sequence = sequenceF + else: + final_sequence = sequenceF.clone() # TODO maybe not cloning and then deleting quality tags is more efficient + + if not_aligned: + sequenceR = sequences[1] + final_sequence[REVERSE_SEQ_COLUMN_NAME] = sequenceR.seq + final_sequence[REVERSE_QUALITY_COLUMN_NAME] = sequenceR.quality + + for seq in sequences: + if hasattr(seq, "quality_array"): + q = -reduce(lambda x,y:x+y,(math.log10(z) for z in seq.quality_array),0)/len(seq.quality_array)*10 + seq[b'avg_quality']=q + q = -reduce(lambda x,y:x+y,(math.log10(z) for z in seq.quality_array[0:10]),0) + seq[b'head_quality']=q + if len(seq.quality_array[10:-10]) : + q = -reduce(lambda x,y:x+y,(math.log10(z) for z in seq.quality_array[10:-10]),0)/len(seq.quality_array[10:-10])*10 + seq[b'mid_quality']=q + q = -reduce(lambda x,y:x+y,(math.log10(z) for z in seq.quality_array[-10:]),0) + seq[b'tail_quality']=q - directmatch = [(p,p(sequence)) for p in infos] - directmatch.sort(key=cmp_to_key(sortMatch)) - directmatch=directmatch[0] if directmatch[0][1] is not None else None - + # Try direct matching: + directmatch = None + first_matched_seq = None + second_matched_seq = None + for seq in sequences: + for p in infos: + directmatch = (p, p(seq)) + if directmatch[1] is not None: + break + if directmatch[1] is not None: + first_matched_seq = seq + if id(first_matched_seq) == id(sequenceF) and not_aligned: + second_matched_seq = sequenceR + else: + second_matched_seq = sequenceF + break + if directmatch[1] is None: + directmatch = None + if directmatch is None: - sequence[b'error']=b'No primer match' - return False,sequence + final_sequence[b'error']=b'No primer match' + return False, final_sequence + + match = first_matched_seq[directmatch[1][1]:directmatch[1][2]] - match = sequence[directmatch[1][1]:directmatch[1][2]] + if not not_aligned: + final_sequence[b'seq_length_ori']=len(final_sequence) - sequence[b'seq_length_ori']=len(sequence) - - sequence = sequence[directmatch[1][2]:] + if not not_aligned or id(first_matched_seq) == id(sequenceF): + final_sequence = final_sequence[directmatch[1][2]:] + else: + cut_seq = sequenceR[directmatch[1][2]:] + final_sequence[REVERSE_SEQ_COLUMN_NAME] = cut_seq.seq + final_sequence[REVERSE_QUALITY_COLUMN_NAME] = cut_seq.quality if directmatch[0].direct: - sequence[b'direction']=b'forward' - sequence[b'forward_score']=directmatch[1][0] - sequence[b'forward_primer']=directmatch[0].raw - sequence[b'forward_match']=match.seq + final_sequence[b'direction']=b'forward' + final_sequence[b'forward_score']=directmatch[1][0] + final_sequence[b'forward_primer']=directmatch[0].raw + final_sequence[b'forward_match']=match.seq else: - sequence[b'direction']=b'reverse' - sequence[b'reverse_score']=directmatch[1][0] - sequence[b'reverse_primer']=directmatch[0].raw - sequence[b'reverse_match']=match.seq - + final_sequence[b'direction']=b'reverse' + final_sequence[b'reverse_score']=directmatch[1][0] + final_sequence[b'reverse_primer']=directmatch[0].raw + final_sequence[b'reverse_match']=match.seq + infos = infos[directmatch[0]] - reversematch = [(p,p(sequence)) for p in infos if p is not None] - reversematch.sort(key=cmp_to_key(sortMatch)) - reversematch = reversematch[0] if reversematch[0][1] is not None else None + + # Try reverse matching on the other sequence: + for p in infos: + reversematch = (p, p(second_matched_seq)) + + if reversematch[1] is None and not_aligned: + # Try matching on the same sequence than the first match + reverse_p = p.reverse_complement() + reversematch = (reverse_p, reverse_p(first_matched_seq)) + + if reversematch[1] is None: + reversematch = None if reversematch is None and None not in infos: if directmatch[0].direct: message = b'No reverse primer match' else: message = b'No direct primer match' - - sequence[b'error']=message - return False,sequence + final_sequence[b'error']=message + return False, final_sequence if reversematch is None: - sequence[b'status']=b'partial' + final_sequence[b'status']=b'partial' if directmatch[0].direct: tags=(directmatch[1][3],None) @@ -255,48 +325,54 @@ cdef tuple annotate(sequence, infos, verbose=False): tags=(None,directmatch[1][3]) samples = infos[None] - - else: - sequence[b'status']=b'full' - match = sequence[reversematch[1][1]:reversematch[1][2]] + else: + final_sequence[b'status']=b'full' + + match = second_matched_seq[reversematch[1][1]:reversematch[1][2]] match = match.reverse_complement - sequence = sequence[0:reversematch[1][1]] + + if not not_aligned or id(second_matched_seq) == id(sequenceF): + final_sequence = final_sequence[0:reversematch[1][1]] + else: + cut_seq = sequenceR[reversematch[1][2]:] + final_sequence[REVERSE_SEQ_COLUMN_NAME] = cut_seq.seq + final_sequence[REVERSE_QUALITY_COLUMN_NAME] = cut_seq.quality if directmatch[0].direct: - tags=(directmatch[1][3],reversematch[1][3]) - sequence[b'reverse_score']=reversematch[1][0] - sequence[b'reverse_primer']=reversematch[0].raw - sequence[b'reverse_match']=match.seq + tags=(directmatch[1][3], reversematch[1][3]) + final_sequence[b'reverse_score'] = reversematch[1][0] + final_sequence[b'reverse_primer'] = reversematch[0].raw + final_sequence[b'reverse_match'] = match.seq else: - tags=(reversematch[1][3],directmatch[1][3]) - sequence[b'forward_score']=reversematch[1][0] - sequence[b'forward_primer']=reversematch[0].raw - sequence[b'forward_match']=match.seq + tags=(reversematch[1][3], directmatch[1][3]) + final_sequence[b'forward_score'] = reversematch[1][0] + final_sequence[b'forward_primer'] = reversematch[0].raw + final_sequence[b'forward_match'] = match.seq if tags[0] is not None: - sequence[b'forward_tag'] = tags[0] + final_sequence[b'forward_tag'] = tags[0] if tags[1] is not None: - sequence[b'reverse_tag'] = tags[1] + final_sequence[b'reverse_tag'] = tags[1] samples = infos[reversematch[0]] - if not directmatch[0].direct: - sequence=sequence.reverse_complement + if not directmatch[0].direct and not not_aligned: # don't reverse complement if not_aligned + final_sequence = final_sequence.reverse_complement sample=None if tags[0] is not None: # Direct tag known - if tags[1] is not None: # Reverse tag known + if tags[1] is not None: # Reverse tag known sample = samples.get(tags, None) else: # Reverse tag known s=[samples[x] for x in samples if x[0]==tags[0]] if len(s)==1: sample=s[0] elif len(s)>1: - sequence[b'error']=b'multiple samples match tags' - return False,sequence + final_sequence[b'error']=b'multiple samples match tags' + return False, final_sequence else: sample=None else: @@ -305,19 +381,21 @@ cdef tuple annotate(sequence, infos, verbose=False): if len(s)==1: sample=s[0] elif len(s)>1: - sequence[b'error']=b'multiple samples match tags' - return False,sequence + final_sequence[b'error']=b'multiple samples match tags' + return False, final_sequence else: # Reverse tag known sample=None if sample is None: - sequence[b'error']=b"Cannot assign sequence to a sample" - return False,sequence + final_sequence[b'error']=b"Cannot assign sequence to a sample" + return False, final_sequence - sequence.update(sample) - sequence[b'seq_length']=len(sequence) + final_sequence.update(sample) + + if not not_aligned: + final_sequence[b'seq_length']=len(final_sequence) - return True, sequence + return True, final_sequence def run(config): @@ -329,11 +407,43 @@ def run(config): assert config['ngsfilter']['info_view'] is not None, "Option -t must be specified" # Open the input + + forward = None + reverse = None + input = None + not_aligned = False + input = open_uri(config['obi']['inputURI']) if input is None: - raise Exception("Could not read input view") + raise Exception("Could not open input reads") if input[2] != View_NUC_SEQS: raise NotImplementedError('obi ngsfilter only works on NUC_SEQS views') + + if "reverse" in config["ngsfilter"]: + + forward = input[1] + + rinput = open_uri(config["ngsfilter"]["reverse"]) + if rinput is None: + raise Exception("Could not open reverse reads") + if rinput[2] != View_NUC_SEQS: + raise NotImplementedError('obi ngsfilter only works on NUC_SEQS views') + + reverse = rinput[1] + + if len(forward) != len(reverse): + raise Exception("Error: the number of forward and reverse reads are different") + + entries = [forward, reverse] + not_aligned = True + + else: + entries = input[1] + + if not_aligned: + entries_len = len(forward) + else: + entries_len = len(entries) # Open the output output = open_uri(config['obi']['outputURI'], @@ -343,10 +453,9 @@ def run(config): if output is None: raise Exception("Could not create output view") - entries = input[1] o_view = output[1] - # Open the the view containing the informations about the tags and the primers + # Open the view containing the informations about the tags and the primers info_input = open_uri(config['ngsfilter']['info_view']) if info_input is None: raise Exception("Could not read the view containing the informations about the tags and the primers") @@ -364,19 +473,27 @@ def run(config): unidentified = None # Initialize the progress bar - pb = ProgressBar(len(entries), config, seconde=5) + pb = ProgressBar(entries_len, config, seconde=5) # Check and store primers and tags - infos = read_info_view(info_view, error=config['ngsfilter']['error']) + infos = read_info_view(info_view, error=config['ngsfilter']['error'], verbose=False, not_aligned=not_aligned) # TODO obi verbose option + + if not_aligned: + Column.new_column(o_view, REVERSE_SEQ_COLUMN_NAME, OBI_SEQ) + Column.new_column(o_view, REVERSE_QUALITY_COLUMN_NAME, OBI_QUAL, associated_column_name=REVERSE_SEQ_COLUMN_NAME, associated_column_version=o_view[REVERSE_SEQ_COLUMN_NAME].version) + Column.new_column(unidentified, REVERSE_SEQ_COLUMN_NAME, OBI_SEQ) + Column.new_column(unidentified, REVERSE_QUALITY_COLUMN_NAME, OBI_QUAL, associated_column_name=REVERSE_SEQ_COLUMN_NAME, associated_column_version=unidentified[REVERSE_SEQ_COLUMN_NAME].version) + g = 0 u = 0 - i = 0 try: - for iseq in entries: + for i in range(entries_len): pb(i) - i+=1 - modseq = Nuc_Seq.new_from_stored(iseq) + if not_aligned: + modseq = [Nuc_Seq.new_from_stored(forward[i]), Nuc_Seq.new_from_stored(reverse[i])] + else: + modseq = [Nuc_Seq.new_from_stored(entries[i])] good, oseq = annotate(modseq, infos) if good: o_view[g].set(oseq.id, oseq.seq, definition=oseq.definition, quality=oseq.quality, tags=oseq) @@ -386,7 +503,7 @@ def run(config): u+=1 except Exception, e: raise RollbackException("obi ngsfilter error, rollbacking views: "+str(e), o_view, unidentified) - + print("\n") print(repr(o_view))