obi import: added SINTAX format import and switch to version 3.0.1b18

2022-05-17 09:36:33 +12:00
parent 4f39bb2418
commit ef9d9674b0
4 changed files with 32 additions and 5 deletions
--- a/python/obitools3/apps/optiongroups/init.py
+++ b/python/obitools3/apps/optiongroups/init.py
@ -63,6 +63,12 @@ def __addImportInputOption(optionManager):
                     const=b'unite',
                     help="Input file is in UNITE fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")

+    group.add_argument('--sintax-input',
+                     action="store_const", dest="obi:inputformat",
+                     default=None,
+                     const=b'sintax',
+                     help="Input file is in SINTAX fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
+
    group.add_argument('--embl-input',
                     action="store_const", dest="obi:inputformat",
                     default=None,
--- a/python/obitools3/commands/import.pyx
+++ b/python/obitools3/commands/import.pyx
@ -109,6 +109,8 @@ def run(config):
    cdef   bint        NUC_SEQS_view
    cdef   bint        silva
    cdef   bint        rdp
+    cdef   bint        unite
+    cdef   bint        sintax
    cdef   int         nb_elts
    cdef   object      d
    cdef   View        view
@ -234,11 +236,15 @@ def run(config):
        def_col = view[DEFINITION_COLUMN]
        seq_col = view[NUC_SEQUENCE_COLUMN]

-    # Prepare taxon scientific name and taxid refs if RDP or SILVA file
+    # Prepare taxon scientific name and taxid refs if RDP/SILVA/UNITE/SINTAX formats
    silva = False
    rdp = False
    unite = False
-    if 'inputformat' in config['obi'] and (config['obi']['inputformat'] == b"silva" or config['obi']['inputformat'] == b"rdp" or config['obi']['inputformat'] == b"unite"):
+    sintax=False
+    if 'inputformat' in config['obi'] and (config['obi']['inputformat'] == b"silva" or \
+                                           config['obi']['inputformat'] == b"rdp" or \
+                                           config['obi']['inputformat'] == b"unite" or \
+                                           config['obi']['inputformat'] == b"sintax"):
        #if taxo is None:
        #    raise Exception("A taxonomy (as built by 'obi import --taxdump') must be provided for SILVA and RDP files")
        if config['obi']['inputformat'] == b"silva":
@ -247,6 +253,8 @@ def run(config):
            rdp = True
        elif config['obi']['inputformat'] == b"unite":
            unite = True
+        elif config['obi']['inputformat'] == b"sintax":
+            sintax = True
        sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR)
        if taxo is not None:
            taxid_col = Column.new_column(view, TAXID_COLUMN, OBI_INT)
@ -356,13 +364,26 @@ def run(config):
                    qual_col[i] = entry.quality
                    
                # Parse taxon scientific name if RDP, SILVA, UNITE or SINTAX file
-                if (rdp or silva or unite):
+                if (rdp or silva or unite or sintax):
                    if rdp or silva:
                        sci_names = entry.definition.split(b";")
                        sci_name_col[i] = sci_names[-1]
                    elif unite:
                        sci_names = entry.id.split(b'|')[-1].split(b';')
                        sci_name_col[i] = re.sub(b'[a-zA-Z]__', b'', sci_names[-1])
+                    elif sintax:
+                        reconstructed_line = entry.id+b' '+entry.definition[:-1]
+                        splitted_reconstructed_line = reconstructed_line.split(b';')
+                        taxa = splitted_reconstructed_line[1].split(b'=')[1]
+                        taxa = splitted_reconstructed_line[1].split(b',')
+                        sci_names = []
+                        for t in taxa:
+                            tf = t.split(b':')[1]
+                            sci_names.append(tf)
+                        sci_name_col[i] = sci_names[-1]
+                        id_col[i] = reconstructed_line.split(b';')[0]    
+                        def_col[i] = reconstructed_line
+                        
                    # Find taxid if taxonomy provided
                    if taxo is not None :
                        for sci_name in reversed(sci_names):
--- a/python/obitools3/uri/decode.pyx
+++ b/python/obitools3/uri/decode.pyx
@ -506,7 +506,7 @@ def open_uri(uri,
        if format is not None:
            if seqtype==b"nuc":
                objclass = Nuc_Seq    # Nuc_Seq_Stored? TODO
-                if format==b"fasta" or format==b"silva" or format==b"rdp" or format == b"unite":
+                if format==b"fasta" or format==b"silva" or format==b"rdp" or format == b"unite" or format == b"sintax":
                    if input:
                        iseq = fastaNucIterator(file, 
                                                skip=skip, 
--- a/python/obitools3/version.py
+++ b/python/obitools3/version.py
@ -1,5 +1,5 @@
 major = 3
 minor = 0
-serial= '1b17'
+serial= '1b18'

 version ="%d.%d.%s" % (major,minor,serial)