diff --git a/python/obitools3/apps/optiongroups/__init__.py b/python/obitools3/apps/optiongroups/__init__.py index bc0aab5..ac626d8 100755 --- a/python/obitools3/apps/optiongroups/__init__.py +++ b/python/obitools3/apps/optiongroups/__init__.py @@ -57,6 +57,12 @@ def __addImportInputOption(optionManager): const=b'rdp', help="Input file is in RDP training set fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.") + group.add_argument('--unite-input', + action="store_const", dest="obi:inputformat", + default=None, + const=b'unite', + help="Input file is in UNITE fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.") + group.add_argument('--embl-input', action="store_const", dest="obi:inputformat", default=None, diff --git a/python/obitools3/commands/import.pyx b/python/obitools3/commands/import.pyx index 3f3fc69..fde0237 100755 --- a/python/obitools3/commands/import.pyx +++ b/python/obitools3/commands/import.pyx @@ -2,6 +2,7 @@ import sys import os +import re from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport from obitools3.dms.view.view cimport View @@ -236,13 +237,18 @@ def run(config): # Prepare taxon scientific name and taxid refs if RDP or SILVA file silva = False rdp = False - if 'inputformat' in config['obi'] and (config['obi']['inputformat'] == b"silva" or config['obi']['inputformat'] == b"rdp"): + unite = False + if 'inputformat' in config['obi'] and (config['obi']['inputformat'] == b"silva" or config['obi']['inputformat'] == b"rdp" or config['obi']['inputformat'] == b"unite"): #if taxo is None: # raise Exception("A taxonomy (as built by 'obi import --taxdump') must be provided for SILVA and RDP files") - silva = True - rdp = True + if config['obi']['inputformat'] == b"silva": + silva = True + elif config['obi']['inputformat'] == b"rdp": + rdp = True + elif config['obi']['inputformat'] == b"unite": + unite = True + 
sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR) if taxo is not None: - sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR) taxid_col = Column.new_column(view, TAXID_COLUMN, OBI_INT) dcols = {} @@ -349,17 +355,26 @@ def run(config): if get_quality: qual_col[i] = entry.quality - # Parse taxon scientific name if RDP file - if (rdp or silva) and taxo is not None: - sci_names = entry.definition.split(b";") - for sci_name in reversed(sci_names): - if sci_name.split()[0] != b'unidentified' and sci_name.split()[0] != b'uncultured' and sci_name.split()[0] != b'metagenome' : - taxon = taxo.get_taxon_by_name(sci_name) - if taxon is not None: - sci_name_col[i] = taxon.name - taxid_col[i] = taxon.taxid - #print(taxid_col[i], sci_name_col[i]) - break + # Parse taxon scientific name if RDP or Silva or Unite file + if (rdp or silva or unite): + if rdp or silva: + sci_names = entry.definition.split(b";") + sci_name_col[i] = sci_names[-1] + elif unite: + sci_names = entry.id.split(b'|')[-1].split(b';') + sci_name_col[i] = re.sub(b'[a-zA-Z]__', b'', sci_names[-1]) + # Find taxid if taxonomy provided + if taxo is not None : + for sci_name in reversed(sci_names): + if unite: + sci_name = re.sub(b'[a-zA-Z]__', b'', sci_name) + if sci_name.split()[0] != b'unidentified' and sci_name.split()[0] != b'uncultured' and sci_name.split()[0] != b'metagenome': + taxon = taxo.get_taxon_by_name(sci_name) + if taxon is not None: + sci_name_col[i] = taxon.name + taxid_col[i] = taxon.taxid + #print(taxid_col[i], sci_name_col[i]) + break for tag in entry : diff --git a/python/obitools3/uri/decode.pyx b/python/obitools3/uri/decode.pyx index 82ba6dd..86c5970 100644 --- a/python/obitools3/uri/decode.pyx +++ b/python/obitools3/uri/decode.pyx @@ -506,7 +506,7 @@ def open_uri(uri, if format is not None: if seqtype==b"nuc": objclass = Nuc_Seq # Nuc_Seq_Stored? 
TODO - if format==b"fasta" or format==b"silva" or format==b"rdp": + if format==b"fasta" or format==b"silva" or format==b"rdp" or format == b"unite": if input: iseq = fastaNucIterator(file, skip=skip,