ngsfilter: fixed a bug with unaligned chimeras (unpaired primers) and

made error annotations more explicit
ngsfilter: made more robust and practical to use with empty tags
2019-12-10 13:43:32 +01:00 · 2019-11-29 15:21:08 +01:00
5 changed files with 61 additions and 44 deletions
--- a/python/obitools3/apps/optiongroups/init.py
+++ b/python/obitools3/apps/optiongroups/init.py
@ -55,7 +55,7 @@ def __addImportInputOption(optionManager):
                     action="store_const", dest="obi:inputformat",
                     default=None,
                     const=b'ngsfilter',
-                     help="Input file is an ngsfilter file")
+                     help="Input file is an ngsfilter file. If not using tags, use ':' or 'None:None' or '-:-' or any combination")

    group.add_argument('--ecopcr-result-input',
                     action="store_const", dest="obi:inputformat",
--- a/python/obitools3/commands/ngsfilter.pyx
+++ b/python/obitools3/commands/ngsfilter.pyx
@ -56,6 +56,11 @@ def addOptions(parser):
                     type=str,
                     default=None,
                     help="URI to the view used to store the sequences unassigned to any sample")
+
+    group.add_argument('--no-tags',
+                     action="store_true", dest="ngsfilter:notags",
+                     default=False,
+                     help="Use this option if your experiment does not use tags to identify samples")
    
    group.add_argument('-e','--error',
                     action="store", dest="ngsfilter:error",
@ -167,7 +172,7 @@ cdef read_info_view(info_view, max_errors=2, verbose=False, not_aligned=False):
    i=0
    for p in info_view:
        forward=Primer(p[b'forward_primer'],
-                       len(p[b'forward_tag']) if p[b'forward_tag']!=b'-' else None,
+                       len(p[b'forward_tag']) if (b'forward_tag' in p and p[b'forward_tag']!=None) else None,
                       True,
                       max_errors=max_errors,
                       verbose=verbose,
@ -178,7 +183,7 @@ cdef read_info_view(info_view, max_errors=2, verbose=False, not_aligned=False):
        infos[forward]=fp
        
        reverse=Primer(p[b'reverse_primer'],
-                       len(p[b'reverse_tag']) if p[b'reverse_tag']!=b'-' else None,
+                       len(p[b'reverse_tag']) if (b'reverse_tag' in p and p[b'reverse_tag']!=None) else None,
                       False,
                       max_errors=max_errors,
                       verbose=verbose,
@ -213,10 +218,11 @@ cdef read_info_view(info_view, max_errors=2, verbose=False, not_aligned=False):
            rpp=rp.get(cf,{})
            rp[cf]=rpp

-        tags = (p[b'forward_tag'] if p[b'forward_tag']!=b'-' else None,
-                p[b'reverse_tag'] if p[b'reverse_tag']!=b'-' else None)
+        tags = (p[b'forward_tag'] if (b'forward_tag' in p and p[b'forward_tag']!=None) else None,
+                p[b'reverse_tag'] if (b'reverse_tag' in p and p[b'reverse_tag']!=None) else None)
        
-        assert tags not in dpp, \
+        if tags != (None, None):
+            assert tags not in dpp, \
               "Tag pair %s is already used with primer pairs: (%s,%s)" % (str(tags),forward,reverse)
        
        # Save additional data
@ -234,7 +240,7 @@ cdef read_info_view(info_view, max_errors=2, verbose=False, not_aligned=False):
    return infos, primer_list
    

-cdef tuple annotate(sequences, infos, verbose=False):
+cdef tuple annotate(sequences, infos, no_tags, verbose=False):
        
    def sortMatch(match):
        if match[1] is None:
@ -329,13 +335,18 @@ cdef tuple annotate(sequences, infos, verbose=False):
        final_sequence[b'reverse_match']=match.seq

    # Keep only paired reverse primer
-    infos = infos[directmatch[0]]    
+    infos = infos[directmatch[0]]
+    rev_prim = list(infos.keys())[0]
    
    # If not aligned, look for other match in already computed matches (choose the one that makes the biggest amplicon)
    if not_aligned:
        i=1
        # TODO comment
-        while i<len(all_direct_matches) and (all_direct_matches[i][1] is None or all_direct_matches[i][0].forward == directmatch[0].forward or all_direct_matches[i][0] == directmatch[0]):
+        while i<len(all_direct_matches) and \
+            (all_direct_matches[i][1] is None or \
+             all_direct_matches[i][0].forward == directmatch[0].forward or \
+             all_direct_matches[i][0] == directmatch[0] or \
+             rev_prim != all_direct_matches[i][0]) :
            i+=1
        if i < len(all_direct_matches):
            reversematch = all_direct_matches[i]
@ -430,35 +441,35 @@ cdef tuple annotate(sequences, infos, verbose=False):
        final_sequence[b'reversed'] = True   # used by the alignpairedend tool (in kmer_similarity.c)

    sample=None
-
-    if tags[0] is not None:                                    # Direct  tag known
-        if tags[1] is not None:                                # Reverse tag known
-            sample = samples.get(tags, None)             
-        else:                                                   # Only direct tag known
-            s=[samples[x] for x in samples if x[0]==tags[0]]
-            if len(s)==1:
-                sample=s[0]
-            elif len(s)>1:
-                final_sequence[b'error']=b'multiple samples match tags'
-                return False, final_sequence
-            else:
-                sample=None
-    else: 
-        if tags[1] is not None:                                 # Only reverse tag known
-            s=[samples[x] for x in samples if x[1]==tags[1]]
-            if len(s)==1:
-                sample=s[0]
-            elif len(s)>1:
-                final_sequence[b'error']=b'multiple samples match tags'
-                return False, final_sequence
-            else:
-                sample=None
+    if not no_tags:
+        if tags[0] is not None:                                    # Direct  tag known
+            if tags[1] is not None:                                # Reverse tag known
+                sample = samples.get(tags, None)             
+            else:                                                   # Only direct tag known
+                s=[samples[x] for x in samples if x[0]==tags[0]]
+                if len(s)==1:
+                    sample=s[0]
+                elif len(s)>1:
+                    final_sequence[b'error']=b'Did not found reverse tag'
+                    return False, final_sequence
+                else:
+                    sample=None
+        else: 
+            if tags[1] is not None:                                 # Only reverse tag known
+                s=[samples[x] for x in samples if x[1]==tags[1]]
+                if len(s)==1:
+                    sample=s[0]
+                elif len(s)>1:
+                    final_sequence[b'error']=b'Did not found forward tag'
+                    return False, final_sequence
+                else:
+                    sample=None
+        
+        if sample is None:
+            final_sequence[b'error']=b"No tags found"
+            return False, final_sequence
    
-    if sample is None:
-        final_sequence[b'error']=b"Cannot assign sequence to a sample"
-        return False, final_sequence
-    
-    final_sequence.update(sample)
+        final_sequence.update(sample)
    
    if not not_aligned:
        final_sequence[b'seq_length']=len(final_sequence)
@ -572,6 +583,7 @@ def run(config):
    
    g = 0
    u = 0
+    no_tags = config['ngsfilter']['notags']
    try:
        for i in range(entries_len):
            PyErr_CheckSignals()
@ -580,7 +592,7 @@ def run(config):
                modseq = [Nuc_Seq.new_from_stored(forward[i]), Nuc_Seq.new_from_stored(reverse[i])]
            else:
                modseq = [Nuc_Seq.new_from_stored(entries[i])]
-            good, oseq = annotate(modseq, infos)
+            good, oseq = annotate(modseq, infos, no_tags)
            if good:
                o_view[g].set(oseq.id, oseq.seq, definition=oseq.definition, quality=oseq.quality, tags=oseq)
                g+=1
@ -596,9 +608,10 @@ def run(config):
    # Save command config in View and DMS comments
    command_line = " ".join(sys.argv[1:])
    o_view.write_config(config, "ngsfilter", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
-    unidentified.write_config(config, "ngsfilter", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
-    # Add comment about unidentified seqs
-    unidentified.comments["info"] = "View containing sequences categorized as unidentified by the ngsfilter command"
+    if unidentified is not None:
+        unidentified.write_config(config, "ngsfilter", command_line, input_dms_name=input_dms_name, input_view_name=input_view_name)
+        # Add comment about unidentified seqs
+        unidentified.comments["info"] = "View containing sequences categorized as unidentified by the ngsfilter command"
    output[0].record_command_line(command_line)
    
    #print("\n\nOutput view:\n````````````", file=sys.stderr)
@ -607,7 +620,8 @@ def run(config):
    input[0].close()
    output[0].close()
    info_input[0].close()
-    unidentified_input[0].close()
+    if unidentified is not None:
+        unidentified_input[0].close()
    aligner.free()

    logger("info", "Done.")
--- a/python/obitools3/parsers/ngsfilter.pyx
+++ b/python/obitools3/parsers/ngsfilter.pyx
@ -57,6 +57,9 @@ def ngsfilterIterator(lineiterator,
        split_line = line.split()
        tags = split_line.pop(2)
        tags = tags.split(b":")
+        for t_idx in range(2):
+            if tags[t_idx]==b"-" or tags[t_idx]==b"None" or tags[t_idx]==b"":
+                tags[t_idx] = nastring
        if len(tags) == 1:          # Forward and reverse tags are the same
            tags.append(tags[0])
        split_line.insert(2, tags[0])
--- a/python/obitools3/version.py
+++ b/python/obitools3/version.py
@ -1,5 +1,5 @@
 major = 3
 minor = 0
-serial= '0-beta1'
+serial= '0-beta2'

 version ="%d.%02d.%s" % (major,minor,serial)
--- a/setup.py
+++ b/setup.py
@ -83,7 +83,7 @@ def findPackage(root,base=None):
  

 PACKAGE     = "OBITools3"
-VERSION     = "3.0.0-beta1"
+VERSION     = "3.0.0-beta2"
 AUTHOR      = 'Celine Mercier'
 EMAIL       = 'celine.mercier@metabarcoding.org'
 URL         = "http://metabarcoding.org/obitools3"
Author	SHA1	Message	Date
Celine Mercier	35ce37c0f7	ngsfilter: fixed a bug with unaligned chimeras (unpaired primers) and made error annotations more explicit	2019-12-10 13:43:32 +01:00
Celine Mercier	53f18316b0	ngsfilter: made more robust and practical to use with empty tags	2019-11-29 15:21:08 +01:00