Commands: ngsfilter and alignpairedend can now be used in whichever

order
2018-08-08 19:53:26 +02:00
parent 3fcf29a76f
commit e8dc5eb123
2 changed files with 292 additions and 122 deletions
--- a/python/obitools3/commands/alignpairedend.pyx
+++ b/python/obitools3/commands/alignpairedend.pyx
@ -8,13 +8,17 @@ from obitools3.dms.view.typed_view.view_NUC_SEQS cimport View_NUC_SEQS
 from obitools3.dms.column.column cimport Column, Column_line
 from obitools3.dms.capi.obiview cimport QUALITY_COLUMN, COUNT_COLUMN, NUC_SEQUENCE_COLUMN, ID_COLUMN
 from obitools3.dms.capi.obitypes cimport OBI_INT, OBI_STR, index_t, OBI_QUAL
-from obitools3.apps.optiongroups import addMinimalOutputOption
+from obitools3.apps.optiongroups import addSequenceInputOption, addMinimalOutputOption
 from obitools3.uri.decode import open_uri
 from obitools3.apps.config import logger
 from obitools3.align._qsassemble import QSolexaReverseAssemble
 from obitools3.align._qsrassemble import QSolexaRightReverseAssemble
 from obitools3.align._solexapairend import buildConsensus, buildJoinedSequence
-from obitools3.dms.obiseq cimport Nuc_Seq_Stored
+from obitools3.dms.obiseq cimport Nuc_Seq_Stored, Nuc_Seq
+
+
+REVERSE_SEQ_COLUMN_NAME = b"REVERSE_SEQUENCE"
+REVERSE_QUALITY_COLUMN_NAME = b"REVERSE_QUALITY"


 __title__="Aligns paired-ended reads"
@ -22,23 +26,17 @@ __title__="Aligns paired-ended reads"

 def addOptions(parser):

+    addSequenceInputOption(parser)
    addMinimalOutputOption(parser)
 
    group = parser.add_argument_group('obi alignpairedend specific options')

-    group.add_argument('-f', '--forward-reads',
-                     action="store", dest="alignpairedend:forward",
-                     metavar="<URI>",
-                     default=None,
-                     type=str,
-                     help="URI to the forward reads")
-
-    group.add_argument('-r', '--reverse-reads',
+    group.add_argument('-R', '--reverse-reads',
                     action="store", dest="alignpairedend:reverse",
                     metavar="<URI>",
                     default=None,
                     type=str,
-                     help="URI to the reverse reads")
+                     help="URI to the reverse reads if they are in a different view than the forward reads")

 # TODO
 #    group.add_argument('--index-file',
@ -60,7 +58,7 @@ def addOptions(parser):

 la = QSolexaReverseAssemble()
 ra = QSolexaRightReverseAssemble()
-def buildAlignment(Nuc_Seq_Stored direct, Nuc_Seq_Stored reverse):
+def buildAlignment(object direct, object reverse):
    
    if len(direct)==0 or len(reverse)==0:
        return None
@ -83,9 +81,27 @@ def buildAlignment(Nuc_Seq_Stored direct, Nuc_Seq_Stored reverse):
    return ali


-def alignmentIterator(View_NUC_SEQS forward, View_NUC_SEQS reverse):    
-    for i in range(len(forward)):
-        ali = buildAlignment(forward[i], reverse[i])
+def alignmentIterator(entries): 
+    
+    if type(entries) == list:
+        two_views = True
+        forward = entries[0]
+        reverse = entries[1]
+        entries_len = len(forward)
+    else:
+        two_views = False
+        entries_len = len(entries)
+       
+    for i in range(entries_len):
+        if two_views:
+            seqF = forward[i]
+            seqR = reverse[i]
+        else:
+            seqF = Nuc_Seq.new_from_stored(entries[i])
+            seqR = Nuc_Seq(seqF.id, seqF[REVERSE_SEQ_COLUMN_NAME], quality=seqF[REVERSE_QUALITY_COLUMN_NAME])
+            seqF.pop(REVERSE_SEQ_COLUMN_NAME)
+            seqF.pop(REVERSE_QUALITY_COLUMN_NAME)
+        ali = buildAlignment(seqF, seqR)
        if ali is None:
            continue
        yield ali
@ -98,22 +114,49 @@ def run(config):
    logger("info", "obi alignpairedend")
    
    # Open the input
-    finput = open_uri(config["alignpairedend"]["forward"])
-    if finput is None:
-        raise Exception("Could not open forward reads")
-    forward = finput[1]
+    
+    two_views = False
+    forward = None
+    reverse = None
+    input = None
+    
+    input = open_uri(config['obi']['inputURI'])
+    if input is None:
+        raise Exception("Could not open input reads")
+    if input[2] != View_NUC_SEQS:
+        raise NotImplementedError('obi alignpairedend only works on NUC_SEQS views')
+
+    if "reverse" in config["alignpairedend"]:
+        
+        two_views = True
+        
+        forward = input[1]        
+        
        rinput = open_uri(config["alignpairedend"]["reverse"])
        if rinput is None:
            raise Exception("Could not open reverse reads")
+        if rinput[2] != View_NUC_SEQS:
+            raise NotImplementedError('obi alignpairedend only works on NUC_SEQS views')
+        
        reverse = rinput[1]
    
        if len(forward) != len(reverse):
            raise Exception("Error: the number of forward and reverse reads are different")
        
-    if "index" in config["alignpairedend"]:    # TODO
-        index = open_uri(config["alignpairedend"]["index"])
+        entries = [forward, reverse]
+    
    else:
-        index = None
+        entries = input[1]
+    
+    if two_views:
+        entries_len = len(forward)
+    else:
+        entries_len = len(entries)
+    
+#     if "index" in config["alignpairedend"]:    # TODO
+#         index = open_uri(config["alignpairedend"]["index"])
+#     else:
+#         index = None
    
    # Open the output
    output = open_uri(config['obi']['outputURI'],
@ -135,9 +178,9 @@ def run(config):
        sminR = 0

    # Initialize the progress bar
-    pb = ProgressBar(len(forward), config, seconde=5)
+    pb = ProgressBar(entries_len, config, seconde=5)
    
-    ba = alignmentIterator(forward, reverse)
+    ba = alignmentIterator(entries)
            
    i = 0
    for ali in ba:
@ -151,7 +194,13 @@ def run(config):
                or (ali.score > sminR)):
                buildConsensus(ali, consensus)
            else:
-                buildJoinedSequence(ali, reverse[i], consensus)
+                seqF = ali[0].wrapped
+                if not two_views:
+                    seq = entries[i]
+                    seqR = Nuc_Seq(seq.id, seq[REVERSE_SEQ_COLUMN_NAME], quality = seq[REVERSE_QUALITY_COLUMN_NAME])
+                else:
+                    seqR = reverse[i]
+                buildJoinedSequence(ali, seqR, consensus)
                
            consensus[b"sminL"] = sminL
            consensus[b"sminR"] = sminR
@ -165,7 +214,11 @@ def run(config):
    
        i+=1
    
-    finput[0].close()
+    print("\n")
+    print(repr(view))
+    
+    input[0].close()
+    if two_views:
        rinput[0].close()
    output[0].close()

--- a/python/obitools3/commands/ngsfilter.pyx
+++ b/python/obitools3/commands/ngsfilter.pyx
@ -10,10 +10,16 @@ from obitools3.uri.decode import open_uri
 from obitools3.apps.config import logger
 from obitools3.align._freeendgapfm import FreeEndGapFullMatch
 from obitools3.dms.obiseq cimport Nuc_Seq
+from obitools3.dms.capi.obitypes cimport OBI_SEQ, OBI_QUAL
+
 from functools import reduce, cmp_to_key
 import math


+REVERSE_SEQ_COLUMN_NAME = b"REVERSE_SEQUENCE"
+REVERSE_QUALITY_COLUMN_NAME = b"REVERSE_QUALITY"
+
+
 __title__="Assigns sequence records to the corresponding experiment/sample based on DNA tags and primers"


@ -31,6 +37,20 @@ def addOptions(parser):
                     default=None,
                     help="URI to the view containing the samples definition (with tags, primers, sample names,...)")

+#     group.add_argument('-F', '--forward-reads',
+#                      action="store", dest="ngsfilter:forward",
+#                      metavar="<URI>",
+#                      default=None,
+#                      type=str,
+#                      help="URI to the forward reads if the paired-end reads haven't been aligned yet")
+
+    group.add_argument('-R', '--reverse-reads',
+                     action="store", dest="ngsfilter:reverse",
+                     metavar="<URI>",
+                     default=None,
+                     type=str,
+                     help="URI to the reverse reads if the paired-end reads haven't been aligned yet")
+
    group.add_argument('-u','--unidentified',
                     action="store", dest="ngsfilter:unidentified",
                     metavar="<URI>",
@ -76,9 +96,7 @@ class Primer:
        self.error=error
        self.minscore = (self.lseq-error) * self.align.match + error * self.align.mismatch
        self.taglength=taglength
-        
        self.align.seqB=self.sequence
-        
        self.direct = direct
        self.verbose=verbose
        
@ -100,6 +118,7 @@ class Primer:
    def __call__(self,sequence):
        if len(sequence) <= self.lseq:
            return None
+        
        self.align.seqA=sequence
        ali=self.align()

@ -134,7 +153,7 @@ class Primer:
    __repr__=__str__


-cdef dict read_info_view(info_view, error=2, verbose=False):
+cdef dict read_info_view(info_view, error=2, verbose=False, not_aligned=False):
    infos = {}
    for p in info_view:
        forward=Primer(p[b'forward_primer'],
@ -153,6 +172,14 @@ cdef dict read_info_view(info_view, error=2, verbose=False):
        rp = infos.get(reverse,{})
        infos[reverse]=rp
        
+        if not_aligned:
+            dpp=fp.get(reverse,{})
+            fp[reverse]=dpp
+            
+            rpp=rp.get(forward,{})
+            rp[forward]=rpp
+        
+        else:
            cf=forward.reverse_complement()
            cr=reverse.reverse_complement()
            
@ -181,7 +208,7 @@ cdef dict read_info_view(info_view, error=2, verbose=False):
    return infos
    

-cdef tuple annotate(sequence, infos, verbose=False):
+cdef tuple annotate(sequences, infos, verbose=False):
        
    def sortMatch(m1, m2):
        if m1[1] is None and m2[1] is None:
@ -195,59 +222,102 @@ cdef tuple annotate(sequence, infos, verbose=False):
        
        return (m1[1][1] > m2[1][2]) - (m1[1][1] < m2[1][2])
    
-    if hasattr(sequence, "quality_array"): 
-        q = -reduce(lambda x,y:x+y,(math.log10(z) for z in sequence.quality_array),0)/len(sequence.quality_array)*10
-        sequence[b'avg_quality']=q
-        q = -reduce(lambda x,y:x+y,(math.log10(z) for z in sequence.quality_array[0:10]),0)
-        sequence[b'head_quality']=q
-        if len(sequence.quality_array[10:-10]) :
-            q = -reduce(lambda x,y:x+y,(math.log10(z) for z in sequence.quality_array[10:-10]),0)/len(sequence.quality_array[10:-10])*10
-            sequence[b'mid_quality']=q
-        q = -reduce(lambda x,y:x+y,(math.log10(z) for z in sequence.quality_array[-10:]),0)
-        sequence[b'tail_quality']=q
+    not_aligned = len(sequences) > 1
+    sequenceF = sequences[0]
+    sequenceR = None
+    if not not_aligned: 
+        final_sequence = sequenceF
+    else:
+        final_sequence = sequenceF.clone()   # TODO maybe not cloning and then deleting quality tags is more efficient

-    directmatch = [(p,p(sequence)) for p in infos]    
-    directmatch.sort(key=cmp_to_key(sortMatch))
-    directmatch=directmatch[0] if directmatch[0][1] is not None else None
+    if not_aligned:
+        sequenceR = sequences[1]
+        final_sequence[REVERSE_SEQ_COLUMN_NAME] = sequenceR.seq
+        final_sequence[REVERSE_QUALITY_COLUMN_NAME] = sequenceR.quality
+
+    for seq in sequences:
+        if hasattr(seq, "quality_array"): 
+            q = -reduce(lambda x,y:x+y,(math.log10(z) for z in seq.quality_array),0)/len(seq.quality_array)*10
+            seq[b'avg_quality']=q
+            q = -reduce(lambda x,y:x+y,(math.log10(z) for z in seq.quality_array[0:10]),0)
+            seq[b'head_quality']=q
+            if len(seq.quality_array[10:-10]) :
+                q = -reduce(lambda x,y:x+y,(math.log10(z) for z in seq.quality_array[10:-10]),0)/len(seq.quality_array[10:-10])*10
+                seq[b'mid_quality']=q
+            q = -reduce(lambda x,y:x+y,(math.log10(z) for z in seq.quality_array[-10:]),0)
+            seq[b'tail_quality']=q
+    
+    # Try direct matching:
+    directmatch = None
+    first_matched_seq = None
+    second_matched_seq = None
+    for seq in sequences:
+        for p in infos:
+            directmatch = (p, p(seq))
+            if directmatch[1] is not None:
+                break
+        if directmatch[1] is not None:
+            first_matched_seq = seq
+            if id(first_matched_seq) == id(sequenceF) and not_aligned:
+                second_matched_seq = sequenceR
+            else:
+                second_matched_seq = sequenceF
+            break
+    if directmatch[1] is None:
+        directmatch = None
   
    if directmatch is None:
-        sequence[b'error']=b'No primer match'
-        return False,sequence
+        final_sequence[b'error']=b'No primer match'
+        return False, final_sequence
        
-    match = sequence[directmatch[1][1]:directmatch[1][2]]
+    match = first_matched_seq[directmatch[1][1]:directmatch[1][2]]
    
-    sequence[b'seq_length_ori']=len(sequence)
+    if not not_aligned:
+        final_sequence[b'seq_length_ori']=len(final_sequence)
    
-    sequence = sequence[directmatch[1][2]:]
+    if not not_aligned or id(first_matched_seq) == id(sequenceF):
+        final_sequence = final_sequence[directmatch[1][2]:]
+    else:
+        cut_seq = sequenceR[directmatch[1][2]:]
+        final_sequence[REVERSE_SEQ_COLUMN_NAME] = cut_seq.seq
+        final_sequence[REVERSE_QUALITY_COLUMN_NAME] = cut_seq.quality
    
    if directmatch[0].direct:
-        sequence[b'direction']=b'forward'
-        sequence[b'forward_score']=directmatch[1][0]
-        sequence[b'forward_primer']=directmatch[0].raw
-        sequence[b'forward_match']=match.seq
+        final_sequence[b'direction']=b'forward'
+        final_sequence[b'forward_score']=directmatch[1][0]
+        final_sequence[b'forward_primer']=directmatch[0].raw
+        final_sequence[b'forward_match']=match.seq
        
    else:
-        sequence[b'direction']=b'reverse'
-        sequence[b'reverse_score']=directmatch[1][0]
-        sequence[b'reverse_primer']=directmatch[0].raw
-        sequence[b'reverse_match']=match.seq
+        final_sequence[b'direction']=b'reverse'
+        final_sequence[b'reverse_score']=directmatch[1][0]
+        final_sequence[b'reverse_primer']=directmatch[0].raw
+        final_sequence[b'reverse_match']=match.seq

    infos = infos[directmatch[0]]
-    reversematch = [(p,p(sequence)) for p in infos if p is not None]
-    reversematch.sort(key=cmp_to_key(sortMatch))
-    reversematch = reversematch[0] if reversematch[0][1] is not None else None
+
+    # Try reverse matching on the other sequence:
+    for p in infos:
+         reversematch = (p, p(second_matched_seq))
+    
+    if reversematch[1] is None and not_aligned:
+        # Try matching on the same sequence than the first match
+        reverse_p = p.reverse_complement()
+        reversematch = (reverse_p, reverse_p(first_matched_seq))
+        
+    if reversematch[1] is None:
+        reversematch = None
    
    if reversematch is None and None not in infos:
        if directmatch[0].direct:
            message = b'No reverse primer match'
        else:
            message = b'No direct primer match'
-                  
-        sequence[b'error']=message
-        return False,sequence
+        final_sequence[b'error']=message
+        return False, final_sequence
    
    if reversematch is None:
-        sequence[b'status']=b'partial'
+        final_sequence[b'status']=b'partial'
        
        if directmatch[0].direct:
            tags=(directmatch[1][3],None)
@ -257,33 +327,39 @@ cdef tuple annotate(sequence, infos, verbose=False):
        samples = infos[None]
        
    else:
-        sequence[b'status']=b'full'
+        final_sequence[b'status']=b'full'
        
-        match = sequence[reversematch[1][1]:reversematch[1][2]]
+        match = second_matched_seq[reversematch[1][1]:reversematch[1][2]]
        match = match.reverse_complement
-        sequence = sequence[0:reversematch[1][1]]
+        
+        if not not_aligned or id(second_matched_seq) == id(sequenceF):
+            final_sequence = final_sequence[0:reversematch[1][1]]
+        else:
+            cut_seq = sequenceR[reversematch[1][2]:]
+            final_sequence[REVERSE_SEQ_COLUMN_NAME] = cut_seq.seq
+            final_sequence[REVERSE_QUALITY_COLUMN_NAME] = cut_seq.quality
        
        if directmatch[0].direct:
            tags=(directmatch[1][3], reversematch[1][3])
-            sequence[b'reverse_score']=reversematch[1][0]
-            sequence[b'reverse_primer']=reversematch[0].raw
-            sequence[b'reverse_match']=match.seq
+            final_sequence[b'reverse_score'] = reversematch[1][0]
+            final_sequence[b'reverse_primer'] = reversematch[0].raw
+            final_sequence[b'reverse_match'] = match.seq
        
        else:
            tags=(reversematch[1][3], directmatch[1][3])
-            sequence[b'forward_score']=reversematch[1][0]
-            sequence[b'forward_primer']=reversematch[0].raw
-            sequence[b'forward_match']=match.seq
+            final_sequence[b'forward_score'] = reversematch[1][0]
+            final_sequence[b'forward_primer'] = reversematch[0].raw
+            final_sequence[b'forward_match'] = match.seq
                
        if tags[0] is not None:
-            sequence[b'forward_tag'] = tags[0]
+            final_sequence[b'forward_tag'] = tags[0]
        if tags[1] is not None:
-            sequence[b'reverse_tag'] = tags[1]
+            final_sequence[b'reverse_tag'] = tags[1]

        samples = infos[reversematch[0]]
        
-    if not directmatch[0].direct:
-        sequence=sequence.reverse_complement
+    if not directmatch[0].direct and not not_aligned:   # don't reverse complement if not_aligned
+        final_sequence = final_sequence.reverse_complement

    sample=None

@ -295,8 +371,8 @@ cdef tuple annotate(sequence, infos, verbose=False):
            if len(s)==1:
                sample=s[0]
            elif len(s)>1:
-                sequence[b'error']=b'multiple samples match tags'
-                return False,sequence
+                final_sequence[b'error']=b'multiple samples match tags'
+                return False, final_sequence
            else:
                sample=None
    else: 
@ -305,19 +381,21 @@ cdef tuple annotate(sequence, infos, verbose=False):
            if len(s)==1:
                sample=s[0]
            elif len(s)>1:
-                sequence[b'error']=b'multiple samples match tags'
-                return False,sequence
+                final_sequence[b'error']=b'multiple samples match tags'
+                return False, final_sequence
            else:                                               # Reverse tag known
                sample=None
    
    if sample is None:
-        sequence[b'error']=b"Cannot assign sequence to a sample"
-        return False,sequence
+        final_sequence[b'error']=b"Cannot assign sequence to a sample"
+        return False, final_sequence
    
-    sequence.update(sample)
-    sequence[b'seq_length']=len(sequence)
+    final_sequence.update(sample)
    
-    return True, sequence
+    if not not_aligned:
+        final_sequence[b'seq_length']=len(final_sequence)
+        
+    return True, final_sequence


 def run(config):
@ -329,12 +407,44 @@ def run(config):
    assert config['ngsfilter']['info_view'] is not None, "Option -t must be specified"

    # Open the input
+    
+    forward = None
+    reverse = None
+    input = None
+    not_aligned = False
+    
    input = open_uri(config['obi']['inputURI'])
    if input is None:
-        raise Exception("Could not read input view")
+        raise Exception("Could not open input reads")
    if input[2] != View_NUC_SEQS:
        raise NotImplementedError('obi ngsfilter only works on NUC_SEQS views')

+    if "reverse" in config["ngsfilter"]:
+        
+        forward = input[1]        
+        
+        rinput = open_uri(config["ngsfilter"]["reverse"])
+        if rinput is None:
+            raise Exception("Could not open reverse reads")
+        if rinput[2] != View_NUC_SEQS:
+            raise NotImplementedError('obi ngsfilter only works on NUC_SEQS views')
+        
+        reverse = rinput[1]
+    
+        if len(forward) != len(reverse):
+            raise Exception("Error: the number of forward and reverse reads are different")
+        
+        entries = [forward, reverse]
+        not_aligned = True
+    
+    else:
+        entries = input[1]
+    
+    if not_aligned:
+        entries_len = len(forward)
+    else:
+        entries_len = len(entries)
+    
    # Open the output
    output = open_uri(config['obi']['outputURI'],
                      input=False,
@ -343,10 +453,9 @@ def run(config):
    if output is None:
        raise Exception("Could not create output view")
    
-    entries = input[1]
    o_view = output[1]
    
-    # Open the the view containing the informations about the tags and the primers
+    # Open the view containing the informations about the tags and the primers
    info_input = open_uri(config['ngsfilter']['info_view'])
    if info_input is None:
        raise Exception("Could not read the view containing the informations about the tags and the primers")
@ -364,19 +473,27 @@ def run(config):
        unidentified = None
        
    # Initialize the progress bar
-    pb = ProgressBar(len(entries), config, seconde=5)
+    pb = ProgressBar(entries_len, config, seconde=5)

    # Check and store primers and tags
-    infos = read_info_view(info_view, error=config['ngsfilter']['error'])
+    infos = read_info_view(info_view, error=config['ngsfilter']['error'], verbose=False, not_aligned=not_aligned)   # TODO obi verbose option
+    
+    if not_aligned:
+        Column.new_column(o_view, REVERSE_SEQ_COLUMN_NAME, OBI_SEQ)
+        Column.new_column(o_view, REVERSE_QUALITY_COLUMN_NAME, OBI_QUAL, associated_column_name=REVERSE_SEQ_COLUMN_NAME, associated_column_version=o_view[REVERSE_SEQ_COLUMN_NAME].version)
+        
+        Column.new_column(unidentified, REVERSE_SEQ_COLUMN_NAME, OBI_SEQ)
+        Column.new_column(unidentified, REVERSE_QUALITY_COLUMN_NAME, OBI_QUAL, associated_column_name=REVERSE_SEQ_COLUMN_NAME, associated_column_version=unidentified[REVERSE_SEQ_COLUMN_NAME].version)
    
    g = 0
    u = 0
-    i = 0
    try:
-        for iseq in entries:
+        for i in range(entries_len):
            pb(i)
-            i+=1
-            modseq = Nuc_Seq.new_from_stored(iseq)
+            if not_aligned:
+                modseq = [Nuc_Seq.new_from_stored(forward[i]), Nuc_Seq.new_from_stored(reverse[i])]
+            else:
+                modseq = [Nuc_Seq.new_from_stored(entries[i])]
            good, oseq = annotate(modseq, infos)
            if good:
                o_view[g].set(oseq.id, oseq.seq, definition=oseq.definition, quality=oseq.quality, tags=oseq)