Switch to version 3.0.1b20

alignpairedend and ngsfilter: ids of original sequences are now kept
import: made ngsfilter file parsing more resilient and switching to
2022-09-21 11:33:03 +12:00 · 2022-09-21 11:32:19 +12:00 · 2022-07-15 16:02:21 +12:00 · 2022-05-17 09:36:33 +12:00 · 2022-05-03 10:55:36 +12:00 · 2022-05-03 10:54:41 +12:00
8 changed files with 88 additions and 30 deletions
--- a/python/obitools3/apps/optiongroups/init.py
+++ b/python/obitools3/apps/optiongroups/init.py
@ -57,6 +57,18 @@ def __addImportInputOption(optionManager):
                     const=b'rdp',
                     help="Input file is in RDP training set fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
    group.add_argument('--unite-input',
                     action="store_const", dest="obi:inputformat",
                     default=None,
                     const=b'unite',
                     help="Input file is in UNITE fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
    group.add_argument('--sintax-input',
                     action="store_const", dest="obi:inputformat",
                     default=None,
                     const=b'sintax',
                     help="Input file is in SINTAX fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
    group.add_argument('--embl-input',
                     action="store_const", dest="obi:inputformat",
                     default=None,
--- a/python/obitools3/commands/alignpairedend.pyx
+++ b/python/obitools3/commands/alignpairedend.pyx
@ -234,6 +234,10 @@ def run(config):
        consensus = o_view[i]
        if two_views:
            consensus[b"R1_parent"] = forward[i].id
            consensus[b"R2_parent"] = reverse[i].id
        if not two_views:
            seqF = entries[i]
        else:
--- a/python/obitools3/commands/import.pyx
+++ b/python/obitools3/commands/import.pyx
@ -2,6 +2,7 @@
 import sys
 import os
 import re
 from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
 from obitools3.dms.view.view cimport View
@ -108,6 +109,8 @@ def run(config):
    cdef   bint        NUC_SEQS_view
    cdef   bint        silva
    cdef   bint        rdp
    cdef   bint        unite
    cdef   bint        sintax
    cdef   int         nb_elts
    cdef   object      d
    cdef   View        view
@ -233,16 +236,27 @@ def run(config):
        def_col = view[DEFINITION_COLUMN]
        seq_col = view[NUC_SEQUENCE_COLUMN]
-    # Prepare taxon scientific name and taxid refs if RDP or SILVA file
+    # Prepare taxon scientific name and taxid refs if RDP/SILVA/UNITE/SINTAX formats
    silva = False
    rdp = False
-    if 'inputformat' in config['obi'] and (config['obi']['inputformat'] == b"silva" or config['obi']['inputformat'] == b"rdp"):
+    unite = False
    sintax=False
    if 'inputformat' in config['obi'] and (config['obi']['inputformat'] == b"silva" or \
                                           config['obi']['inputformat'] == b"rdp" or \
                                           config['obi']['inputformat'] == b"unite" or \
                                           config['obi']['inputformat'] == b"sintax"):
        #if taxo is None:
        #    raise Exception("A taxonomy (as built by 'obi import --taxdump') must be provided for SILVA and RDP files")
-        silva = True
+        if config['obi']['inputformat'] == b"silva":
-        rdp = True
+            silva = True
        elif config['obi']['inputformat'] == b"rdp":
            rdp = True
        elif config['obi']['inputformat'] == b"unite":
            unite = True
        elif config['obi']['inputformat'] == b"sintax":
            sintax = True
        sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR)
        if taxo is not None:
            sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR)
            taxid_col = Column.new_column(view, TAXID_COLUMN, OBI_INT)
    dcols = {}
@ -349,17 +363,39 @@ def run(config):
                if get_quality:
                    qual_col[i] = entry.quality
-                # Parse taxon scientific name if RDP file
+                # Parse taxon scientific name if RDP or Silva or Unite file
-                if (rdp or silva) and taxo is not None:
+                if (rdp or silva or unite or sintax):
-                    sci_names = entry.definition.split(b";")
+                    if rdp or silva:
-                    for sci_name in reversed(sci_names):
+                        sci_names = entry.definition.split(b";")
-                        if sci_name.split()[0] != b'unidentified' and sci_name.split()[0] != b'uncultured' and sci_name.split()[0] != b'metagenome' :
+                        sci_name_col[i] = sci_names[-1]
-                            taxon = taxo.get_taxon_by_name(sci_name)
+                    elif unite:
-                            if taxon is not None:
+                        sci_names = entry.id.split(b'|')[-1].split(b';')
-                                sci_name_col[i] = taxon.name
+                        sci_name_col[i] = re.sub(b'[a-zA-Z]__', b'', sci_names[-1])
-                                taxid_col[i] = taxon.taxid
+                    elif sintax:
-                                #print(taxid_col[i], sci_name_col[i])
+                        reconstructed_line = entry.id+b' '+entry.definition[:-1]
-                                break
+                        splitted_reconstructed_line = reconstructed_line.split(b';')
                        taxa = splitted_reconstructed_line[1].split(b'=')[1]
                        taxa = splitted_reconstructed_line[1].split(b',')
                        sci_names = []
                        for t in taxa:
                            tf = t.split(b':')[1]
                            sci_names.append(tf)
                        sci_name_col[i] = sci_names[-1]
                        id_col[i] = reconstructed_line.split(b';')[0]    
                        def_col[i] = reconstructed_line
                    # Fond taxid if taxonomy provided
                    if taxo is not None :
                        for sci_name in reversed(sci_names):
                            if unite:
                                sci_name = re.sub(b'[a-zA-Z]__', b'', sci_name)
                            if sci_name.split()[0] != b'unidentified' and sci_name.split()[0] != b'uncultured' and sci_name.split()[0] != b'metagenome':
                                taxon = taxo.get_taxon_by_name(sci_name)
                                if taxon is not None:
                                    sci_name_col[i] = taxon.name
                                    taxid_col[i] = taxon.taxid
                                    #print(taxid_col[i], sci_name_col[i])
                                    break
            for tag in entry :
--- a/python/obitools3/commands/ngsfilter.pyx
+++ b/python/obitools3/commands/ngsfilter.pyx
@ -265,6 +265,10 @@ cdef tuple annotate(sequences, infos, no_tags, verbose=False):
    not_aligned = len(sequences) > 1
    sequences[0] = sequences[0].clone()
    if not_aligned:
        sequences[0][b"R1_parent"] = sequences[0].id
        sequences[0][b"R2_parent"] = sequences[1].id
    if not_aligned:
        sequences[1] = sequences[1].clone()
        sequences[0][REVERSE_SEQUENCE_COLUMN] = sequences[1].seq             # used by alignpairedend tool
--- a/python/obitools3/parsers/ngsfilter.pyx
+++ b/python/obitools3/parsers/ngsfilter.pyx
@ -48,23 +48,24 @@ def ngsfilterIterator(lineiterator,
        all_lines.insert(0, firstline)
    # Insert header for column names
-    column_names = [b"experiment", b"sample", b"forward_tag", b"reverse_tag", b"forward_primer", b"reverse_primer",b"additional_info"]
+    column_names = [b"experiment", b"sample", b"forward_tag", b"reverse_tag", b"forward_primer", b"reverse_primer"] #,b"additional_info"]
    header = out_sep.join(column_names)
    new_lines.append(header)
    for line in all_lines:
        split_line = line.split(maxsplit=5)
-        tags = split_line.pop(2)
+        if split_line:
-        tags = tags.split(b":")
+            tags = split_line.pop(2)
-        for t_idx in range(len(tags)):
+            tags = tags.split(b":")
-            if tags[t_idx]==b"-" or tags[t_idx]==b"None" or tags[t_idx]==b"":
+            for t_idx in range(len(tags)):
-                tags[t_idx] = nastring
+                if tags[t_idx]==b"-" or tags[t_idx]==b"None" or tags[t_idx]==b"":
-        if len(tags) == 1:          # Forward and reverse tags are the same
+                    tags[t_idx] = nastring
-            tags.append(tags[0])
+            if len(tags) == 1:          # Forward and reverse tags are the same
-        split_line.insert(2, tags[0])
+                tags.append(tags[0])
-        split_line.insert(3, tags[1])
+            split_line.insert(2, tags[0])
-        new_lines.append(out_sep.join(split_line[0:7]))
+            split_line.insert(3, tags[1])
            new_lines.append(out_sep.join(split_line[0:6]))
    return tabIterator(iter(new_lines),
                       header = True,
--- a/python/obitools3/uri/decode.pyx
+++ b/python/obitools3/uri/decode.pyx
@ -506,7 +506,7 @@ def open_uri(uri,
        if format is not None:
            if seqtype==b"nuc":
                objclass = Nuc_Seq    # Nuc_Seq_Stored? TODO
-                if format==b"fasta" or format==b"silva" or format==b"rdp":
+                if format==b"fasta" or format==b"silva" or format==b"rdp" or format == b"unite" or format == b"sintax":
                    if input:
                        iseq = fastaNucIterator(file, 
                                                skip=skip, 
--- a/python/obitools3/version.py
+++ b/python/obitools3/version.py
@ -1,5 +1,5 @@
 major = 3
 minor = 0
-serial= '1b16'
+serial= '1b20'
 version ="%d.%d.%s" % (major,minor,serial)
--- a/src/.gitignore
+++ b/src/.gitignore
@ -3,3 +3,4 @@
 /cmake_install.cmake
 /libcobitools3.dylib
 /Makefile
 /build/
Author	SHA1	Message	Date
Celine Mercier	7e492578b3	Switch to version 3.0.1b20	2022-09-21 11:33:03 +12:00
Celine Mercier	02e9df3ad1	alignpairedend and ngsfilter: ids of original sequences are now kept	2022-09-21 11:32:19 +12:00
mercierc	55ada80500	import: made ngsfilter file parsing more resilient and switching to version 3.0.1b19	2022-07-15 16:02:21 +12:00
mercierc	ef9d9674b0	obi import: added SINTAX format import and switch to version 3.0.1b18	2022-05-17 09:36:33 +12:00
mercierc	4f39bb2418	switch to version 3.0.1b17	2022-05-03 10:55:36 +12:00
mercierc	0a2b8adb50	import: added import of UNITE fasta format	2022-05-03 10:54:41 +12:00