obi import: added SINTAX format import and switch to version 3.0.1b18

This commit is contained in:
mercierc
2022-05-17 09:36:33 +12:00
parent 4f39bb2418
commit ef9d9674b0
4 changed files with 32 additions and 5 deletions

View File

@ -63,6 +63,12 @@ def __addImportInputOption(optionManager):
const=b'unite',
help="Input file is in UNITE fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
group.add_argument('--sintax-input',
action="store_const", dest="obi:inputformat",
default=None,
const=b'sintax',
help="Input file is in SINTAX fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
group.add_argument('--embl-input',
action="store_const", dest="obi:inputformat",
default=None,

View File

@ -109,6 +109,8 @@ def run(config):
cdef bint NUC_SEQS_view
cdef bint silva
cdef bint rdp
cdef bint unite
cdef bint sintax
cdef int nb_elts
cdef object d
cdef View view
@ -234,11 +236,15 @@ def run(config):
def_col = view[DEFINITION_COLUMN]
seq_col = view[NUC_SEQUENCE_COLUMN]
# Prepare taxon scientific name and taxid refs if RDP or SILVA file
# Prepare taxon scientific name and taxid refs if RDP/SILVA/UNITE/SINTAX formats
silva = False
rdp = False
unite = False
if 'inputformat' in config['obi'] and (config['obi']['inputformat'] == b"silva" or config['obi']['inputformat'] == b"rdp" or config['obi']['inputformat'] == b"unite"):
sintax=False
if 'inputformat' in config['obi'] and (config['obi']['inputformat'] == b"silva" or \
config['obi']['inputformat'] == b"rdp" or \
config['obi']['inputformat'] == b"unite" or \
config['obi']['inputformat'] == b"sintax"):
#if taxo is None:
# raise Exception("A taxonomy (as built by 'obi import --taxdump') must be provided for SILVA and RDP files")
if config['obi']['inputformat'] == b"silva":
@ -247,6 +253,8 @@ def run(config):
rdp = True
elif config['obi']['inputformat'] == b"unite":
unite = True
elif config['obi']['inputformat'] == b"sintax":
sintax = True
sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR)
if taxo is not None:
taxid_col = Column.new_column(view, TAXID_COLUMN, OBI_INT)
@ -356,13 +364,26 @@ def run(config):
qual_col[i] = entry.quality
# Parse taxon scientific name if RDP or Silva or Unite file
if (rdp or silva or unite):
if (rdp or silva or unite or sintax):
if rdp or silva:
sci_names = entry.definition.split(b";")
sci_name_col[i] = sci_names[-1]
elif unite:
sci_names = entry.id.split(b'|')[-1].split(b';')
sci_name_col[i] = re.sub(b'[a-zA-Z]__', b'', sci_names[-1])
elif sintax:
reconstructed_line = entry.id+b' '+entry.definition[:-1]
splitted_reconstructed_line = reconstructed_line.split(b';')
taxa = splitted_reconstructed_line[1].split(b'=')[1]
taxa = splitted_reconstructed_line[1].split(b',')
sci_names = []
for t in taxa:
tf = t.split(b':')[1]
sci_names.append(tf)
sci_name_col[i] = sci_names[-1]
id_col[i] = reconstructed_line.split(b';')[0]
def_col[i] = reconstructed_line
# Fond taxid if taxonomy provided
if taxo is not None :
for sci_name in reversed(sci_names):

View File

@ -506,7 +506,7 @@ def open_uri(uri,
if format is not None:
if seqtype==b"nuc":
objclass = Nuc_Seq # Nuc_Seq_Stored? TODO
if format==b"fasta" or format==b"silva" or format==b"rdp" or format == b"unite":
if format==b"fasta" or format==b"silva" or format==b"rdp" or format == b"unite" or format == b"sintax":
if input:
iseq = fastaNucIterator(file,
skip=skip,

View File

@ -1,5 +1,5 @@
major = 3
minor = 0
serial= '1b17'
serial= '1b18'
version ="%d.%d.%s" % (major,minor,serial)