obi export: added options to export to metabaR compatible format

This commit is contained in:
mercierc
2021-11-11 15:24:12 +13:00
parent 5a8b9dca5d
commit f6353fbf28
5 changed files with 127 additions and 7 deletions

View File

@ -297,6 +297,29 @@ def __addExportOutputOption(optionManager):
const=b'tabular', const=b'tabular',
help="Output file is in tabular format") help="Output file is in tabular format")
group.add_argument('--metabaR-output',
action="store_const", dest="obi:outputformat",
default=None,
const=b'metabaR',
help="Export the files needed by the obifiles_to_metabarlist function of the metabaR package")
group.add_argument('--metabaR-prefix',
action="store", dest="obi:metabarprefix",
type=str,
help="Prefix for the files when using --metabaR-output option")
group.add_argument('--metabaR-ngsfilter',
action="store", dest="obi:metabarngsfilter",
type=str,
default=None,
help="URI to the ngsfilter view when using --metabaR-output option (if not provided, it is not exported)")
group.add_argument('--metabaR-samples',
action="store", dest="obi:metabarsamples",
type=str,
default=None,
help="URI to the sample metadata view when using --metabaR-output option (if not provided, it is built as just a list of the sample names)")
group.add_argument('--only-keys', group.add_argument('--only-keys',
action="append", dest="obi:only_keys", action="append", dest="obi:only_keys",
type=str, type=str,

View File

@ -6,6 +6,9 @@ from obitools3.apps.config import logger
from obitools3.dms import DMS from obitools3.dms import DMS
from obitools3.dms.obiseq import Nuc_Seq from obitools3.dms.obiseq import Nuc_Seq
from obitools3.dms.capi.obiview cimport QUALITY_COLUMN from obitools3.dms.capi.obiview cimport QUALITY_COLUMN
from obitools3.writers.tab import TabWriter
from obitools3.format.tab import TabFormat
from obitools3.utils cimport tobytes, tostr
from obitools3.apps.optiongroups import addMinimalInputOption, \ from obitools3.apps.optiongroups import addMinimalInputOption, \
addExportOutputOption, \ addExportOutputOption, \
@ -76,6 +79,13 @@ def run(config):
else: else:
pb = ProgressBar(withoutskip - skip, config) pb = ProgressBar(withoutskip - skip, config)
if config['obi']['outputformat'] == b'metabaR':
# Check prefix
if "metabarprefix" not in config["obi"]:
raise Exception("Prefix needed when exporting for metabaR (--metabaR-prefix option)")
else:
metabaRprefix = config["obi"]["metabarprefix"]
i=0 i=0
for seq in iview : for seq in iview :
PyErr_CheckSignals() PyErr_CheckSignals()
@ -91,6 +101,81 @@ def run(config):
pb(i, force=True) pb(i, force=True)
print("", file=sys.stderr) print("", file=sys.stderr)
if config['obi']['outputformat'] == b'metabaR':
# Export ngsfilter file if view provided
if 'metabarngsfilter' in config['obi']:
ngsfilter_input = open_uri(config['obi']['metabarngsfilter'])
if ngsfilter_input is None:
raise Exception("Could not read ngsfilter view for metabaR output")
ngsfilter_view = ngsfilter_input[1]
ngsfilter_output = open(config['obi']['metabarprefix']+'.ngsfilter', 'w')
for line in ngsfilter_view:
line_to_print = b""
line_to_print += line[b'experiment']
line_to_print += b"\t"
line_to_print += line[b'sample']
line_to_print += b"\t"
line_to_print += line[b'forward_tag']
line_to_print += b":"
line_to_print += line[b'reverse_tag']
line_to_print += b"\t"
line_to_print += line[b'forward_primer']
line_to_print += b"\t"
line_to_print += line[b'reverse_primer']
line_to_print += b"\t"
line_to_print += line[b'additional_info']
print(tostr(line_to_print), file=ngsfilter_output)
if ngsfilter_input[0] != input[0]:
ngsfilter_input[0].close()
ngsfilter_output.close()
# Export sample metadata
samples_output = open(config['obi']['metabarprefix']+'_samples.csv', 'w')
# Export sample metadata file if view provided
if 'metabarsamples' in config['obi']:
samples_input = open_uri(config['obi']['metabarsamples'])
if samples_input is None:
raise Exception("Could not read sample view for metabaR output")
samples_view = samples_input[1]
# Export with tab formatter
TabWriter(TabFormat(header=True, sep='\t',),
samples_output,
header=True)
if samples_input[0] != input[0]:
samples_input[0].close()
# Else export just sample names from main view
else:
sample_list = []
if 'MERGED_sample' in iview:
sample_list = iview['MERGED_sample'].keys()
elif 'sample' not in iview:
for seq in iview:
sample = seq['sample']
if sample not in sample_list:
sample_list.append(sample)
else:
logger("warning", "Can not read sample list from main view for metabaR sample list export")
print("sample_id", file=samples_output)
for sample in sample_list:
line_to_print = b""
line_to_print += sample
line_to_print += b"\t"
print(tostr(line_to_print), file=samples_output)
samples_output.close()
# TODO save command in input dms? # TODO save command in input dms?
if not BrokenPipeError and not IOError: if not BrokenPipeError and not IOError:

View File

@ -6,4 +6,6 @@ cdef class TabFormat:
cdef bytes NAString cdef bytes NAString
cdef set tags cdef set tags
cdef bytes sep cdef bytes sep
cdef bint NAIntTo0 cdef bint NAIntTo0
cdef bint metabaR
cdef bint ngsfilter

View File

@ -10,13 +10,15 @@ import sys
cdef class TabFormat: cdef class TabFormat:
def __init__(self, list tags=[], header=True, bytes NAString=b"NA", bytes sep=b"\t", bint NAIntTo0=True): def __init__(self, list tags=[], header=True, bytes NAString=b"NA", bytes sep=b"\t", bint NAIntTo0=True, metabaR=False, ngsfilter=False):
self.tags = set(tags) self.tags = set(tags)
self.header = header self.header = header
self.first_line = True self.first_line = True
self.NAString = NAString self.NAString = NAString
self.sep = sep self.sep = sep
self.NAIntTo0 = NAIntTo0 self.NAIntTo0 = NAIntTo0
self.metabaR = metabaR
self.ngsfilter = ngsfilter
@cython.boundscheck(False) @cython.boundscheck(False)
def __call__(self, object data): def __call__(self, object data):
@ -34,13 +36,21 @@ cdef class TabFormat:
if self.header and self.first_line: if self.header and self.first_line:
for k in ktags: for k in ktags:
if k in tags: if k in tags:
if self.metabaR:
if k == b'NUC_SEQ':
ktoprint = b'sequence'
else:
ktoprint = k.lower()
ktoprint = ktoprint.replace(b'merged_', b'')
else:
ktoprint = k
if isinstance(data.view[k], Column_multi_elts): if isinstance(data.view[k], Column_multi_elts):
keys = data.view[k].keys() keys = data.view[k].keys()
keys.sort() keys.sort()
for k2 in keys: for k2 in keys:
line.append(tobytes(k)+b':'+tobytes(k2)) line.append(tobytes(ktoprint)+b':'+tobytes(k2))
else: else:
line.append(tobytes(k)) line.append(tobytes(ktoprint))
r = self.sep.join(value for value in line) r = self.sep.join(value for value in line)
r += b'\n' r += b'\n'
line = [] line = []

View File

@ -48,13 +48,13 @@ def ngsfilterIterator(lineiterator,
all_lines.insert(0, firstline) all_lines.insert(0, firstline)
# Insert header for column names # Insert header for column names
column_names = [b"experiment", b"sample", b"forward_tag", b"reverse_tag", b"forward_primer", b"reverse_primer"] column_names = [b"experiment", b"sample", b"forward_tag", b"reverse_tag", b"forward_primer", b"reverse_primer",b"additional_info"]
header = out_sep.join(column_names) header = out_sep.join(column_names)
new_lines.append(header) new_lines.append(header)
for line in all_lines: for line in all_lines:
split_line = line.split() split_line = line.split(maxsplit=5)
tags = split_line.pop(2) tags = split_line.pop(2)
tags = tags.split(b":") tags = tags.split(b":")
for t_idx in range(len(tags)): for t_idx in range(len(tags)):
@ -64,7 +64,7 @@ def ngsfilterIterator(lineiterator,
tags.append(tags[0]) tags.append(tags[0])
split_line.insert(2, tags[0]) split_line.insert(2, tags[0])
split_line.insert(3, tags[1]) split_line.insert(3, tags[1])
new_lines.append(out_sep.join(split_line[0:6])) new_lines.append(out_sep.join(split_line[0:7]))
return tabIterator(iter(new_lines), return tabIterator(iter(new_lines),
header = True, header = True,