import: added import of UNITE fasta format
This commit is contained in:
@ -57,6 +57,12 @@ def __addImportInputOption(optionManager):
|
|||||||
const=b'rdp',
|
const=b'rdp',
|
||||||
help="Input file is in RDP training set fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
|
help="Input file is in RDP training set fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
|
||||||
|
|
||||||
|
group.add_argument('--unite-input',
|
||||||
|
action="store_const", dest="obi:inputformat",
|
||||||
|
default=None,
|
||||||
|
const=b'unite',
|
||||||
|
help="Input file is in UNITE fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
|
||||||
|
|
||||||
group.add_argument('--embl-input',
|
group.add_argument('--embl-input',
|
||||||
action="store_const", dest="obi:inputformat",
|
action="store_const", dest="obi:inputformat",
|
||||||
default=None,
|
default=None,
|
||||||
|
@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
|
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
|
||||||
from obitools3.dms.view.view cimport View
|
from obitools3.dms.view.view cimport View
|
||||||
@ -236,13 +237,18 @@ def run(config):
|
|||||||
# Prepare taxon scientific name and taxid refs if RDP or SILVA file
|
# Prepare taxon scientific name and taxid refs if RDP or SILVA file
|
||||||
silva = False
|
silva = False
|
||||||
rdp = False
|
rdp = False
|
||||||
if 'inputformat' in config['obi'] and (config['obi']['inputformat'] == b"silva" or config['obi']['inputformat'] == b"rdp"):
|
unite = False
|
||||||
|
if 'inputformat' in config['obi'] and (config['obi']['inputformat'] == b"silva" or config['obi']['inputformat'] == b"rdp" or config['obi']['inputformat'] == b"unite"):
|
||||||
#if taxo is None:
|
#if taxo is None:
|
||||||
# raise Exception("A taxonomy (as built by 'obi import --taxdump') must be provided for SILVA and RDP files")
|
# raise Exception("A taxonomy (as built by 'obi import --taxdump') must be provided for SILVA and RDP files")
|
||||||
silva = True
|
if config['obi']['inputformat'] == b"silva":
|
||||||
rdp = True
|
silva = True
|
||||||
|
elif config['obi']['inputformat'] == b"rdp":
|
||||||
|
rdp = True
|
||||||
|
elif config['obi']['inputformat'] == b"unite":
|
||||||
|
unite = True
|
||||||
|
sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR)
|
||||||
if taxo is not None:
|
if taxo is not None:
|
||||||
sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR)
|
|
||||||
taxid_col = Column.new_column(view, TAXID_COLUMN, OBI_INT)
|
taxid_col = Column.new_column(view, TAXID_COLUMN, OBI_INT)
|
||||||
|
|
||||||
dcols = {}
|
dcols = {}
|
||||||
@ -349,17 +355,26 @@ def run(config):
|
|||||||
if get_quality:
|
if get_quality:
|
||||||
qual_col[i] = entry.quality
|
qual_col[i] = entry.quality
|
||||||
|
|
||||||
# Parse taxon scientific name if RDP file
|
# Parse taxon scientific name if RDP or Silva or Unite file
|
||||||
if (rdp or silva) and taxo is not None:
|
if (rdp or silva or unite):
|
||||||
sci_names = entry.definition.split(b";")
|
if rdp or silva:
|
||||||
for sci_name in reversed(sci_names):
|
sci_names = entry.definition.split(b";")
|
||||||
if sci_name.split()[0] != b'unidentified' and sci_name.split()[0] != b'uncultured' and sci_name.split()[0] != b'metagenome' :
|
sci_name_col[i] = sci_names[-1]
|
||||||
taxon = taxo.get_taxon_by_name(sci_name)
|
elif unite:
|
||||||
if taxon is not None:
|
sci_names = entry.id.split(b'|')[-1].split(b';')
|
||||||
sci_name_col[i] = taxon.name
|
sci_name_col[i] = re.sub(b'[a-zA-Z]__', b'', sci_names[-1])
|
||||||
taxid_col[i] = taxon.taxid
|
# Fond taxid if taxonomy provided
|
||||||
#print(taxid_col[i], sci_name_col[i])
|
if taxo is not None :
|
||||||
break
|
for sci_name in reversed(sci_names):
|
||||||
|
if unite:
|
||||||
|
sci_name = re.sub(b'[a-zA-Z]__', b'', sci_name)
|
||||||
|
if sci_name.split()[0] != b'unidentified' and sci_name.split()[0] != b'uncultured' and sci_name.split()[0] != b'metagenome':
|
||||||
|
taxon = taxo.get_taxon_by_name(sci_name)
|
||||||
|
if taxon is not None:
|
||||||
|
sci_name_col[i] = taxon.name
|
||||||
|
taxid_col[i] = taxon.taxid
|
||||||
|
#print(taxid_col[i], sci_name_col[i])
|
||||||
|
break
|
||||||
|
|
||||||
for tag in entry :
|
for tag in entry :
|
||||||
|
|
||||||
|
@ -506,7 +506,7 @@ def open_uri(uri,
|
|||||||
if format is not None:
|
if format is not None:
|
||||||
if seqtype==b"nuc":
|
if seqtype==b"nuc":
|
||||||
objclass = Nuc_Seq # Nuc_Seq_Stored? TODO
|
objclass = Nuc_Seq # Nuc_Seq_Stored? TODO
|
||||||
if format==b"fasta" or format==b"silva" or format==b"rdp":
|
if format==b"fasta" or format==b"silva" or format==b"rdp" or format == b"unite":
|
||||||
if input:
|
if input:
|
||||||
iseq = fastaNucIterator(file,
|
iseq = fastaNucIterator(file,
|
||||||
skip=skip,
|
skip=skip,
|
||||||
|
Reference in New Issue
Block a user