import: added import of UNITE fasta format

This commit is contained in:
mercierc
2022-05-03 10:54:41 +12:00
parent f9b99a9397
commit 0a2b8adb50
3 changed files with 37 additions and 16 deletions

View File

@ -57,6 +57,12 @@ def __addImportInputOption(optionManager):
const=b'rdp', const=b'rdp',
help="Input file is in RDP training set fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.") help="Input file is in RDP training set fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
group.add_argument('--unite-input',
action="store_const", dest="obi:inputformat",
default=None,
const=b'unite',
help="Input file is in UNITE fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
group.add_argument('--embl-input', group.add_argument('--embl-input',
action="store_const", dest="obi:inputformat", action="store_const", dest="obi:inputformat",
default=None, default=None,

View File

@ -2,6 +2,7 @@
import sys import sys
import os import os
import re
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.dms.view.view cimport View from obitools3.dms.view.view cimport View
@ -236,13 +237,18 @@ def run(config):
# Prepare taxon scientific name and taxid refs if RDP or SILVA file # Prepare taxon scientific name and taxid refs if RDP or SILVA file
silva = False silva = False
rdp = False rdp = False
if 'inputformat' in config['obi'] and (config['obi']['inputformat'] == b"silva" or config['obi']['inputformat'] == b"rdp"): unite = False
if 'inputformat' in config['obi'] and (config['obi']['inputformat'] == b"silva" or config['obi']['inputformat'] == b"rdp" or config['obi']['inputformat'] == b"unite"):
#if taxo is None: #if taxo is None:
# raise Exception("A taxonomy (as built by 'obi import --taxdump') must be provided for SILVA and RDP files") # raise Exception("A taxonomy (as built by 'obi import --taxdump') must be provided for SILVA and RDP files")
if config['obi']['inputformat'] == b"silva":
silva = True silva = True
elif config['obi']['inputformat'] == b"rdp":
rdp = True rdp = True
if taxo is not None: elif config['obi']['inputformat'] == b"unite":
unite = True
sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR) sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR)
if taxo is not None:
taxid_col = Column.new_column(view, TAXID_COLUMN, OBI_INT) taxid_col = Column.new_column(view, TAXID_COLUMN, OBI_INT)
dcols = {} dcols = {}
@ -349,11 +355,20 @@ def run(config):
if get_quality: if get_quality:
qual_col[i] = entry.quality qual_col[i] = entry.quality
# Parse taxon scientific name if RDP file # Parse taxon scientific name if RDP or Silva or Unite file
if (rdp or silva) and taxo is not None: if (rdp or silva or unite):
if rdp or silva:
sci_names = entry.definition.split(b";") sci_names = entry.definition.split(b";")
sci_name_col[i] = sci_names[-1]
elif unite:
sci_names = entry.id.split(b'|')[-1].split(b';')
sci_name_col[i] = re.sub(b'[a-zA-Z]__', b'', sci_names[-1])
# Fond taxid if taxonomy provided
if taxo is not None :
for sci_name in reversed(sci_names): for sci_name in reversed(sci_names):
if sci_name.split()[0] != b'unidentified' and sci_name.split()[0] != b'uncultured' and sci_name.split()[0] != b'metagenome' : if unite:
sci_name = re.sub(b'[a-zA-Z]__', b'', sci_name)
if sci_name.split()[0] != b'unidentified' and sci_name.split()[0] != b'uncultured' and sci_name.split()[0] != b'metagenome':
taxon = taxo.get_taxon_by_name(sci_name) taxon = taxo.get_taxon_by_name(sci_name)
if taxon is not None: if taxon is not None:
sci_name_col[i] = taxon.name sci_name_col[i] = taxon.name

View File

@ -506,7 +506,7 @@ def open_uri(uri,
if format is not None: if format is not None:
if seqtype==b"nuc": if seqtype==b"nuc":
objclass = Nuc_Seq # Nuc_Seq_Stored? TODO objclass = Nuc_Seq # Nuc_Seq_Stored? TODO
if format==b"fasta" or format==b"silva" or format==b"rdp": if format==b"fasta" or format==b"silva" or format==b"rdp" or format == b"unite":
if input: if input:
iseq = fastaNucIterator(file, iseq = fastaNucIterator(file,
skip=skip, skip=skip,