Compare commits
4 Commits
Author | SHA1 | Date | |
---|---|---|---|
7e492578b3 | |||
02e9df3ad1 | |||
55ada80500 | |||
ef9d9674b0 |
@ -63,6 +63,12 @@ def __addImportInputOption(optionManager):
|
||||
const=b'unite',
|
||||
help="Input file is in UNITE fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
|
||||
|
||||
group.add_argument('--sintax-input',
|
||||
action="store_const", dest="obi:inputformat",
|
||||
default=None,
|
||||
const=b'sintax',
|
||||
help="Input file is in SINTAX fasta format. If NCBI taxonomy provided with --taxonomy, taxid and scientific name will be added for each sequence.")
|
||||
|
||||
group.add_argument('--embl-input',
|
||||
action="store_const", dest="obi:inputformat",
|
||||
default=None,
|
||||
|
@ -233,6 +233,10 @@ def run(config):
|
||||
PyErr_CheckSignals()
|
||||
|
||||
consensus = o_view[i]
|
||||
|
||||
if two_views:
|
||||
consensus[b"R1_parent"] = forward[i].id
|
||||
consensus[b"R2_parent"] = reverse[i].id
|
||||
|
||||
if not two_views:
|
||||
seqF = entries[i]
|
||||
|
@ -109,6 +109,8 @@ def run(config):
|
||||
cdef bint NUC_SEQS_view
|
||||
cdef bint silva
|
||||
cdef bint rdp
|
||||
cdef bint unite
|
||||
cdef bint sintax
|
||||
cdef int nb_elts
|
||||
cdef object d
|
||||
cdef View view
|
||||
@ -234,11 +236,15 @@ def run(config):
|
||||
def_col = view[DEFINITION_COLUMN]
|
||||
seq_col = view[NUC_SEQUENCE_COLUMN]
|
||||
|
||||
# Prepare taxon scientific name and taxid refs if RDP or SILVA file
|
||||
# Prepare taxon scientific name and taxid refs if RDP/SILVA/UNITE/SINTAX formats
|
||||
silva = False
|
||||
rdp = False
|
||||
unite = False
|
||||
if 'inputformat' in config['obi'] and (config['obi']['inputformat'] == b"silva" or config['obi']['inputformat'] == b"rdp" or config['obi']['inputformat'] == b"unite"):
|
||||
sintax=False
|
||||
if 'inputformat' in config['obi'] and (config['obi']['inputformat'] == b"silva" or \
|
||||
config['obi']['inputformat'] == b"rdp" or \
|
||||
config['obi']['inputformat'] == b"unite" or \
|
||||
config['obi']['inputformat'] == b"sintax"):
|
||||
#if taxo is None:
|
||||
# raise Exception("A taxonomy (as built by 'obi import --taxdump') must be provided for SILVA and RDP files")
|
||||
if config['obi']['inputformat'] == b"silva":
|
||||
@ -247,6 +253,8 @@ def run(config):
|
||||
rdp = True
|
||||
elif config['obi']['inputformat'] == b"unite":
|
||||
unite = True
|
||||
elif config['obi']['inputformat'] == b"sintax":
|
||||
sintax = True
|
||||
sci_name_col = Column.new_column(view, SCIENTIFIC_NAME_COLUMN, OBI_STR)
|
||||
if taxo is not None:
|
||||
taxid_col = Column.new_column(view, TAXID_COLUMN, OBI_INT)
|
||||
@ -356,13 +364,26 @@ def run(config):
|
||||
qual_col[i] = entry.quality
|
||||
|
||||
# Parse taxon scientific name if RDP or Silva or Unite file
|
||||
if (rdp or silva or unite):
|
||||
if (rdp or silva or unite or sintax):
|
||||
if rdp or silva:
|
||||
sci_names = entry.definition.split(b";")
|
||||
sci_name_col[i] = sci_names[-1]
|
||||
elif unite:
|
||||
sci_names = entry.id.split(b'|')[-1].split(b';')
|
||||
sci_name_col[i] = re.sub(b'[a-zA-Z]__', b'', sci_names[-1])
|
||||
elif sintax:
|
||||
reconstructed_line = entry.id+b' '+entry.definition[:-1]
|
||||
splitted_reconstructed_line = reconstructed_line.split(b';')
|
||||
taxa = splitted_reconstructed_line[1].split(b'=')[1]
|
||||
taxa = splitted_reconstructed_line[1].split(b',')
|
||||
sci_names = []
|
||||
for t in taxa:
|
||||
tf = t.split(b':')[1]
|
||||
sci_names.append(tf)
|
||||
sci_name_col[i] = sci_names[-1]
|
||||
id_col[i] = reconstructed_line.split(b';')[0]
|
||||
def_col[i] = reconstructed_line
|
||||
|
||||
# Fond taxid if taxonomy provided
|
||||
if taxo is not None :
|
||||
for sci_name in reversed(sci_names):
|
||||
|
@ -264,6 +264,10 @@ cdef tuple annotate(sequences, infos, no_tags, verbose=False):
|
||||
|
||||
not_aligned = len(sequences) > 1
|
||||
sequences[0] = sequences[0].clone()
|
||||
|
||||
if not_aligned:
|
||||
sequences[0][b"R1_parent"] = sequences[0].id
|
||||
sequences[0][b"R2_parent"] = sequences[1].id
|
||||
|
||||
if not_aligned:
|
||||
sequences[1] = sequences[1].clone()
|
||||
|
@ -48,24 +48,25 @@ def ngsfilterIterator(lineiterator,
|
||||
all_lines.insert(0, firstline)
|
||||
|
||||
# Insert header for column names
|
||||
column_names = [b"experiment", b"sample", b"forward_tag", b"reverse_tag", b"forward_primer", b"reverse_primer",b"additional_info"]
|
||||
column_names = [b"experiment", b"sample", b"forward_tag", b"reverse_tag", b"forward_primer", b"reverse_primer"] #,b"additional_info"]
|
||||
header = out_sep.join(column_names)
|
||||
|
||||
new_lines.append(header)
|
||||
|
||||
for line in all_lines:
|
||||
split_line = line.split(maxsplit=5)
|
||||
tags = split_line.pop(2)
|
||||
tags = tags.split(b":")
|
||||
for t_idx in range(len(tags)):
|
||||
if tags[t_idx]==b"-" or tags[t_idx]==b"None" or tags[t_idx]==b"":
|
||||
tags[t_idx] = nastring
|
||||
if len(tags) == 1: # Forward and reverse tags are the same
|
||||
tags.append(tags[0])
|
||||
split_line.insert(2, tags[0])
|
||||
split_line.insert(3, tags[1])
|
||||
new_lines.append(out_sep.join(split_line[0:7]))
|
||||
|
||||
if split_line:
|
||||
tags = split_line.pop(2)
|
||||
tags = tags.split(b":")
|
||||
for t_idx in range(len(tags)):
|
||||
if tags[t_idx]==b"-" or tags[t_idx]==b"None" or tags[t_idx]==b"":
|
||||
tags[t_idx] = nastring
|
||||
if len(tags) == 1: # Forward and reverse tags are the same
|
||||
tags.append(tags[0])
|
||||
split_line.insert(2, tags[0])
|
||||
split_line.insert(3, tags[1])
|
||||
new_lines.append(out_sep.join(split_line[0:6]))
|
||||
|
||||
return tabIterator(iter(new_lines),
|
||||
header = True,
|
||||
sep = out_sep,
|
||||
|
@ -506,7 +506,7 @@ def open_uri(uri,
|
||||
if format is not None:
|
||||
if seqtype==b"nuc":
|
||||
objclass = Nuc_Seq # Nuc_Seq_Stored? TODO
|
||||
if format==b"fasta" or format==b"silva" or format==b"rdp" or format == b"unite":
|
||||
if format==b"fasta" or format==b"silva" or format==b"rdp" or format == b"unite" or format == b"sintax":
|
||||
if input:
|
||||
iseq = fastaNucIterator(file,
|
||||
skip=skip,
|
||||
|
@ -1,5 +1,5 @@
|
||||
major = 3
|
||||
minor = 0
|
||||
serial= '1b17'
|
||||
serial= '1b20'
|
||||
|
||||
version ="%d.%d.%s" % (major,minor,serial)
|
||||
|
1
src/.gitignore
vendored
1
src/.gitignore
vendored
@ -3,3 +3,4 @@
|
||||
/cmake_install.cmake
|
||||
/libcobitools3.dylib
|
||||
/Makefile
|
||||
/build/
|
||||
|
Reference in New Issue
Block a user