Compare commits

..

7 Commits

9 changed files with 92 additions and 30 deletions

View File

@ -57,6 +57,18 @@ def __addImportInputOption(optionManager):
const=b'rdp', const=b'rdp',
help="Input file is in RDP training set fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.") help="Input file is in RDP training set fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
group.add_argument('--unite-input',
action="store_const", dest="obi:inputformat",
default=None,
const=b'unite',
help="Input file is in UNITE fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
group.add_argument('--sintax-input',
action="store_const", dest="obi:inputformat",
default=None,
const=b'sintax',
help="Input file is in SINTAX fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
group.add_argument('--embl-input', group.add_argument('--embl-input',
action="store_const", dest="obi:inputformat", action="store_const", dest="obi:inputformat",
default=None, default=None,

View File

@ -234,6 +234,10 @@ def run(config):
consensus = o_view[i] consensus = o_view[i]
if two_views:
consensus[b"R1_parent"] = forward[i].id
consensus[b"R2_parent"] = reverse[i].id
if not two_views: if not two_views:
seqF = entries[i] seqF = entries[i]
else: else:

View File

@ -16,6 +16,8 @@ from obitools3.dms.capi.obiview cimport NUC_SEQUENCE_COLUMN, \
QUALITY_COLUMN, \ QUALITY_COLUMN, \
COUNT_COLUMN, \ COUNT_COLUMN, \
TAXID_COLUMN TAXID_COLUMN
from obitools3.dms.capi.obitypes cimport OBI_STR
from obitools3.dms.column.column cimport Column
import time import time
import math import math
@ -187,6 +189,8 @@ def sequenceTaggerGenerator(config, taxo=None):
else: else:
scn=None scn=None
seq[rank]=rtaxid seq[rank]=rtaxid
if "%s_name"%rank not in seq.view:
Column.new_column(seq.view, "%s_name"%rank, OBI_STR)
seq["%s_name"%rank]=scn seq["%s_name"%rank]=scn
if add_rank: if add_rank:

View File

@ -2,6 +2,7 @@
import sys import sys
import os import os
import re
from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport from obitools3.apps.progress cimport ProgressBar # @UnresolvedImport
from obitools3.dms.view.view cimport View from obitools3.dms.view.view cimport View
@ -108,6 +109,8 @@ def run(config):
cdef bint NUC_SEQS_view cdef bint NUC_SEQS_view
cdef bint silva cdef bint silva
cdef bint rdp cdef bint rdp
cdef bint unite
cdef bint sintax
cdef int nb_elts cdef int nb_elts
cdef object d cdef object d
cdef View view cdef View view
@ -233,16 +236,27 @@ def run(config):
def_col = view[DEFINITION_COLUMN] def_col = view[DEFINITION_COLUMN]
seq_col = view[NUC_SEQUENCE_COLUMN] seq_col = view[NUC_SEQUENCE_COLUMN]
# Prepare taxon scientific name and taxid refs if RDP or SILVA file # Prepare taxon scientific name and taxid refs if RDP/SILVA/UNITE/SINTAX formats
silva = False silva = False
rdp = False rdp = False
if 'inputformat' in config['obi'] and (config['obi']['inputformat'] == b"silva" or config['obi']['inputformat'] == b"rdp"): unite = False
sintax=False
if 'inputformat' in config['obi'] and (config['obi']['inputformat'] == b"silva" or \
config['obi']['inputformat'] == b"rdp" or \
config['obi']['inputformat'] == b"unite" or \
config['obi']['inputformat'] == b"sintax"):
#if taxo is None: #if taxo is None:
# raise Exception("A taxonomy (as built by 'obi import --taxdump') must be provided for SILVA and RDP files") # raise Exception("A taxonomy (as built by 'obi import --taxdump') must be provided for SILVA and RDP files")
if config['obi']['inputformat'] == b"silva":
silva = True silva = True
elif config['obi']['inputformat'] == b"rdp":
rdp = True rdp = True
if taxo is not None: elif config['obi']['inputformat'] == b"unite":
unite = True
elif config['obi']['inputformat'] == b"sintax":
sintax = True
sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR) sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR)
if taxo is not None:
taxid_col = Column.new_column(view, TAXID_COLUMN, OBI_INT) taxid_col = Column.new_column(view, TAXID_COLUMN, OBI_INT)
dcols = {} dcols = {}
@ -349,10 +363,32 @@ def run(config):
if get_quality: if get_quality:
qual_col[i] = entry.quality qual_col[i] = entry.quality
# Parse taxon scientific name if RDP file # Parse taxon scientific name if RDP or Silva or Unite file
if (rdp or silva) and taxo is not None: if (rdp or silva or unite or sintax):
if rdp or silva:
sci_names = entry.definition.split(b";") sci_names = entry.definition.split(b";")
sci_name_col[i] = sci_names[-1]
elif unite:
sci_names = entry.id.split(b'|')[-1].split(b';')
sci_name_col[i] = re.sub(b'[a-zA-Z]__', b'', sci_names[-1])
elif sintax:
reconstructed_line = entry.id+b' '+entry.definition[:-1]
splitted_reconstructed_line = reconstructed_line.split(b';')
taxa = splitted_reconstructed_line[1].split(b'=')[1]
taxa = splitted_reconstructed_line[1].split(b',')
sci_names = []
for t in taxa:
tf = t.split(b':')[1]
sci_names.append(tf)
sci_name_col[i] = sci_names[-1]
id_col[i] = reconstructed_line.split(b';')[0]
def_col[i] = reconstructed_line
# Fond taxid if taxonomy provided
if taxo is not None :
for sci_name in reversed(sci_names): for sci_name in reversed(sci_names):
if unite:
sci_name = re.sub(b'[a-zA-Z]__', b'', sci_name)
if sci_name.split()[0] != b'unidentified' and sci_name.split()[0] != b'uncultured' and sci_name.split()[0] != b'metagenome': if sci_name.split()[0] != b'unidentified' and sci_name.split()[0] != b'uncultured' and sci_name.split()[0] != b'metagenome':
taxon = taxo.get_taxon_by_name(sci_name) taxon = taxo.get_taxon_by_name(sci_name)
if taxon is not None: if taxon is not None:

View File

@ -265,6 +265,10 @@ cdef tuple annotate(sequences, infos, no_tags, verbose=False):
not_aligned = len(sequences) > 1 not_aligned = len(sequences) > 1
sequences[0] = sequences[0].clone() sequences[0] = sequences[0].clone()
if not_aligned:
sequences[0][b"R1_parent"] = sequences[0].id
sequences[0][b"R2_parent"] = sequences[1].id
if not_aligned: if not_aligned:
sequences[1] = sequences[1].clone() sequences[1] = sequences[1].clone()
sequences[0][REVERSE_SEQUENCE_COLUMN] = sequences[1].seq # used by alignpairedend tool sequences[0][REVERSE_SEQUENCE_COLUMN] = sequences[1].seq # used by alignpairedend tool

View File

@ -48,13 +48,14 @@ def ngsfilterIterator(lineiterator,
all_lines.insert(0, firstline) all_lines.insert(0, firstline)
# Insert header for column names # Insert header for column names
column_names = [b"experiment", b"sample", b"forward_tag", b"reverse_tag", b"forward_primer", b"reverse_primer",b"additional_info"] column_names = [b"experiment", b"sample", b"forward_tag", b"reverse_tag", b"forward_primer", b"reverse_primer"] #,b"additional_info"]
header = out_sep.join(column_names) header = out_sep.join(column_names)
new_lines.append(header) new_lines.append(header)
for line in all_lines: for line in all_lines:
split_line = line.split(maxsplit=5) split_line = line.split(maxsplit=5)
if split_line:
tags = split_line.pop(2) tags = split_line.pop(2)
tags = tags.split(b":") tags = tags.split(b":")
for t_idx in range(len(tags)): for t_idx in range(len(tags)):
@ -64,7 +65,7 @@ def ngsfilterIterator(lineiterator,
tags.append(tags[0]) tags.append(tags[0])
split_line.insert(2, tags[0]) split_line.insert(2, tags[0])
split_line.insert(3, tags[1]) split_line.insert(3, tags[1])
new_lines.append(out_sep.join(split_line[0:7])) new_lines.append(out_sep.join(split_line[0:6]))
return tabIterator(iter(new_lines), return tabIterator(iter(new_lines),
header = True, header = True,

View File

@ -506,7 +506,7 @@ def open_uri(uri,
if format is not None: if format is not None:
if seqtype==b"nuc": if seqtype==b"nuc":
objclass = Nuc_Seq # Nuc_Seq_Stored? TODO objclass = Nuc_Seq # Nuc_Seq_Stored? TODO
if format==b"fasta" or format==b"silva" or format==b"rdp": if format==b"fasta" or format==b"silva" or format==b"rdp" or format == b"unite" or format == b"sintax":
if input: if input:
iseq = fastaNucIterator(file, iseq = fastaNucIterator(file,
skip=skip, skip=skip,

View File

@ -1,5 +1,5 @@
major = 3 major = 3
minor = 0 minor = 0
serial= '1b15' serial= '1b20'
version ="%d.%d.%s" % (major,minor,serial) version ="%d.%d.%s" % (major,minor,serial)

1
src/.gitignore vendored
View File

@ -3,3 +3,4 @@
/cmake_install.cmake /cmake_install.cmake
/libcobitools3.dylib /libcobitools3.dylib
/Makefile /Makefile
/build/